Update to ytdl-commit-cf2dbec

cf2dbec630

Except: [kakao] improve info extraction and detect geo restriction
d8085580f6
This commit is contained in:
pukkandan 2021-02-20 02:14:36 +05:30
parent 5e41dca334
commit bc2ca1bb75
19 changed files with 1013 additions and 395 deletions

View file

@ -12,6 +12,7 @@
from youtube_dlc.extractor import ( from youtube_dlc.extractor import (
YoutubePlaylistIE, YoutubePlaylistIE,
YoutubeTabIE,
YoutubeIE, YoutubeIE,
) )
@ -57,14 +58,22 @@ def test_youtube_toptracks(self):
entries = result['entries'] entries = result['entries']
self.assertEqual(len(entries), 100) self.assertEqual(len(entries), 100)
def test_youtube_flat_playlist_titles(self): def test_youtube_flat_playlist_extraction(self):
dl = FakeYDL() dl = FakeYDL()
dl.params['extract_flat'] = True dl.params['extract_flat'] = True
ie = YoutubePlaylistIE(dl) ie = YoutubeTabIE(dl)
result = ie.extract('https://www.youtube.com/playlist?list=PL-KKIb8rvtMSrAO9YFbeM6UQrAqoFTUWv') result = ie.extract('https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc')
self.assertIsPlaylist(result) self.assertIsPlaylist(result)
for entry in result['entries']: entries = list(result['entries'])
self.assertTrue(entry.get('title')) self.assertTrue(len(entries) == 1)
video = entries[0]
self.assertEqual(video['_type'], 'url_transparent')
self.assertEqual(video['ie_key'], 'Youtube')
self.assertEqual(video['id'], 'BaW_jenozKc')
self.assertEqual(video['url'], 'BaW_jenozKc')
self.assertEqual(video['title'], 'youtube-dl test video "\'/\\ä↭𝕐')
self.assertEqual(video['duration'], 10)
self.assertEqual(video['uploader'], 'Philipp Hagemeister')
if __name__ == '__main__': if __name__ == '__main__':

View file

@ -324,20 +324,42 @@ def _real_extract(self, url):
formats = [] formats = []
for a in video_node.findall('.//asset'): for a in video_node.findall('.//asset'):
file_name = xpath_text(a, './fileName', default=None)
if not file_name:
continue
format_type = a.attrib.get('type')
format_url = url_or_none(file_name)
if format_url:
ext = determine_ext(file_name)
if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
format_url, display_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id=format_type or 'hls', fatal=False))
continue
elif ext == 'f4m':
formats.extend(self._extract_f4m_formats(
update_url_query(format_url, {'hdcore': '3.7.0'}),
display_id, f4m_id=format_type or 'hds', fatal=False))
continue
f = { f = {
'format_id': a.attrib['type'], 'format_id': format_type,
'width': int_or_none(a.find('./frameWidth').text), 'width': int_or_none(xpath_text(a, './frameWidth')),
'height': int_or_none(a.find('./frameHeight').text), 'height': int_or_none(xpath_text(a, './frameHeight')),
'vbr': int_or_none(a.find('./bitrateVideo').text), 'vbr': int_or_none(xpath_text(a, './bitrateVideo')),
'abr': int_or_none(a.find('./bitrateAudio').text), 'abr': int_or_none(xpath_text(a, './bitrateAudio')),
'vcodec': a.find('./codecVideo').text, 'vcodec': xpath_text(a, './codecVideo'),
'tbr': int_or_none(a.find('./totalBitrate').text), 'tbr': int_or_none(xpath_text(a, './totalBitrate')),
} }
if a.find('./serverPrefix').text: server_prefix = xpath_text(a, './serverPrefix', default=None)
f['url'] = a.find('./serverPrefix').text if server_prefix:
f['playpath'] = a.find('./fileName').text f.update({
'url': server_prefix,
'playpath': file_name,
})
else: else:
f['url'] = a.find('./fileName').text if not format_url:
continue
f['url'] = format_url
formats.append(f) formats.append(f)
self._sort_formats(formats) self._sort_formats(formats)

View file

@ -7,19 +7,21 @@
from .gigya import GigyaBaseIE from .gigya import GigyaBaseIE
from ..compat import compat_HTTPError from ..compat import compat_HTTPError
from ..utils import ( from ..utils import (
extract_attributes,
ExtractorError, ExtractorError,
strip_or_none, clean_html,
extract_attributes,
float_or_none, float_or_none,
get_element_by_class,
int_or_none, int_or_none,
merge_dicts, merge_dicts,
str_or_none, str_or_none,
strip_or_none,
url_or_none, url_or_none,
) )
class CanvasIE(InfoExtractor): class CanvasIE(InfoExtractor):
_VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza)/assets/(?P<id>[^/?#&]+)' _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza|dako)/assets/(?P<id>[^/?#&]+)'
_TESTS = [{ _TESTS = [{
'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
'md5': '68993eda72ef62386a15ea2cf3c93107', 'md5': '68993eda72ef62386a15ea2cf3c93107',
@ -332,3 +334,51 @@ def _real_extract(self, url):
'display_id': display_id, 'display_id': display_id,
'season_number': int_or_none(page.get('episode_season')), 'season_number': int_or_none(page.get('episode_season')),
}) })
class DagelijkseKostIE(InfoExtractor):
    # Extractor for recipe videos on dagelijksekost.een.be (VRT's "Dagelijkse
    # Kost" cooking show). The page itself only carries a mediazone asset id,
    # so actual media extraction is delegated to CanvasIE below.
    IE_DESC = 'dagelijksekost.een.be'
    _VALID_URL = r'https?://dagelijksekost\.een\.be/gerechten/(?P<id>[^/?#&]+)'
    _TEST = {
        'url': 'https://dagelijksekost.een.be/gerechten/hachis-parmentier-met-witloof',
        'md5': '30bfffc323009a3e5f689bef6efa2365',
        'info_dict': {
            'id': 'md-ast-27a4d1ff-7d7b-425e-b84f-a4d227f592fa',
            'display_id': 'hachis-parmentier-met-witloof',
            'ext': 'mp4',
            'title': 'Hachis parmentier met witloof',
            'description': 'md5:9960478392d87f63567b5b117688cdc5',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 283.02,
        },
        'expected_warnings': ['is not a supported codec'],
    }

    def _real_extract(self, url):
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        # Prefer the on-page dish title; fall back to the twitter:title meta.
        title = strip_or_none(get_element_by_class(
            'dish-metadata__title', webpage
        ) or self._html_search_meta(
            'twitter:title', webpage))

        description = clean_html(get_element_by_class(
            'dish-description', webpage)
        ) or self._html_search_meta(
            ('description', 'twitter:description', 'og:description'),
            webpage)

        # The mediazone asset id lives in a data-url attribute; the \1
        # backreference matches whichever quote character the page used.
        video_id = self._html_search_regex(
            r'data-url=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id',
            group='id')

        # Hand off to CanvasIE via the mediazone API URL; url_transparent
        # lets the title/description extracted above take precedence over
        # whatever CanvasIE returns.
        return {
            '_type': 'url_transparent',
            'url': 'https://mediazone.vrt.be/api/v1/dako/assets/%s' % video_id,
            'ie_key': CanvasIE.ie_key(),
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'description': description,
        }

View file

@ -1,12 +1,14 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import calendar
import datetime import datetime
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
clean_html, clean_html,
extract_timezone,
int_or_none, int_or_none,
parse_duration, parse_duration,
parse_resolution, parse_resolution,
@ -97,8 +99,9 @@ def _real_extract(self, url):
timestamp = None timestamp = None
data_utc = try_get(informacio, lambda x: x['data_emissio']['utc']) data_utc = try_get(informacio, lambda x: x['data_emissio']['utc'])
try: try:
timestamp = datetime.datetime.strptime( timezone, data_utc = extract_timezone(data_utc)
data_utc, '%Y-%d-%mT%H:%M:%S%z').timestamp() timestamp = calendar.timegm((datetime.datetime.strptime(
data_utc, '%Y-%d-%mT%H:%M:%S') - timezone).timetuple())
except TypeError: except TypeError:
pass pass

View file

@ -1,6 +1,7 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import json
import re import re
from .common import InfoExtractor from .common import InfoExtractor
@ -10,11 +11,13 @@
ExtractorError, ExtractorError,
float_or_none, float_or_none,
int_or_none, int_or_none,
strip_or_none,
unified_timestamp, unified_timestamp,
) )
class DPlayIE(InfoExtractor): class DPlayIE(InfoExtractor):
_PATH_REGEX = r'/(?P<id>[^/]+/[^/?#]+)'
_VALID_URL = r'''(?x)https?:// _VALID_URL = r'''(?x)https?://
(?P<domain> (?P<domain>
(?:www\.)?(?P<host>d (?:www\.)?(?P<host>d
@ -24,7 +27,7 @@ class DPlayIE(InfoExtractor):
) )
)| )|
(?P<subdomain_country>es|it)\.dplay\.com (?P<subdomain_country>es|it)\.dplay\.com
)/[^/]+/(?P<id>[^/]+/[^/?#]+)''' )/[^/]+''' + _PATH_REGEX
_TESTS = [{ _TESTS = [{
# non geo restricted, via secure api, unsigned download hls URL # non geo restricted, via secure api, unsigned download hls URL
@ -151,56 +154,79 @@ class DPlayIE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
    def _process_errors(self, e, geo_countries):
        # Translate a disco-api HTTP error payload into a user-facing
        # ExtractorError. Raises geo-restriction for geoblocked content and a
        # login hint for subscription/token errors; otherwise surfaces the
        # API's own error detail. Always raises — never returns normally.
        info = self._parse_json(e.cause.read().decode('utf-8'), None)
        error = info['errors'][0]
        error_code = error.get('code')
        if error_code == 'access.denied.geoblocked':
            self.raise_geo_restricted(countries=geo_countries)
        elif error_code in ('access.denied.missingpackage', 'invalid.token'):
            raise ExtractorError(
                'This video is only available for registered users. You may want to use --cookies.', expected=True)
        raise ExtractorError(info['errors'][0]['detail'], expected=True)
    def _update_disco_api_headers(self, headers, disco_base, display_id, realm):
        # Fetch an anonymous bearer token for the given realm and add it to
        # the request headers (mutates `headers` in place). Subclasses
        # override this when the service uses a different auth scheme.
        headers['Authorization'] = 'Bearer ' + self._download_json(
            disco_base + 'token', display_id, 'Downloading token',
            query={
                'realm': realm,
            })['data']['attributes']['token']
    def _download_video_playback_info(self, disco_base, video_id, headers):
        # Query the playback-info endpoint and normalize its 'streaming'
        # mapping ({format_id: {...}}) into a list of {'type', 'url'} dicts so
        # callers can iterate uniformly. Subclasses override this for API
        # variants with a different endpoint/shape.
        streaming = self._download_json(
            disco_base + 'playback/videoPlaybackInfo/' + video_id,
            video_id, headers=headers)['data']['attributes']['streaming']
        streaming_list = []
        for format_id, format_dict in streaming.items():
            streaming_list.append({
                'type': format_id,
                'url': format_dict.get('url'),
            })
        return streaming_list
def _get_disco_api_info(self, url, display_id, disco_host, realm, country): def _get_disco_api_info(self, url, display_id, disco_host, realm, country):
geo_countries = [country.upper()] geo_countries = [country.upper()]
self._initialize_geo_bypass({ self._initialize_geo_bypass({
'countries': geo_countries, 'countries': geo_countries,
}) })
disco_base = 'https://%s/' % disco_host disco_base = 'https://%s/' % disco_host
token = self._download_json(
disco_base + 'token', display_id, 'Downloading token',
query={
'realm': realm,
})['data']['attributes']['token']
headers = { headers = {
'Referer': url, 'Referer': url,
'Authorization': 'Bearer ' + token,
} }
video = self._download_json( self._update_disco_api_headers(headers, disco_base, display_id, realm)
disco_base + 'content/videos/' + display_id, display_id, try:
headers=headers, query={ video = self._download_json(
'fields[channel]': 'name', disco_base + 'content/videos/' + display_id, display_id,
'fields[image]': 'height,src,width', headers=headers, query={
'fields[show]': 'name', 'fields[channel]': 'name',
'fields[tag]': 'name', 'fields[image]': 'height,src,width',
'fields[video]': 'description,episodeNumber,name,publishStart,seasonNumber,videoDuration', 'fields[show]': 'name',
'include': 'images,primaryChannel,show,tags' 'fields[tag]': 'name',
}) 'fields[video]': 'description,episodeNumber,name,publishStart,seasonNumber,videoDuration',
'include': 'images,primaryChannel,show,tags'
})
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
self._process_errors(e, geo_countries)
raise
video_id = video['data']['id'] video_id = video['data']['id']
info = video['data']['attributes'] info = video['data']['attributes']
title = info['name'].strip() title = info['name'].strip()
formats = [] formats = []
try: try:
streaming = self._download_json( streaming = self._download_video_playback_info(
disco_base + 'playback/videoPlaybackInfo/' + video_id, disco_base, video_id, headers)
display_id, headers=headers)['data']['attributes']['streaming']
except ExtractorError as e: except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
info = self._parse_json(e.cause.read().decode('utf-8'), display_id) self._process_errors(e, geo_countries)
error = info['errors'][0]
error_code = error.get('code')
if error_code == 'access.denied.geoblocked':
self.raise_geo_restricted(countries=geo_countries)
elif error_code == 'access.denied.missingpackage':
self.raise_login_required()
raise ExtractorError(info['errors'][0]['detail'], expected=True)
raise raise
for format_id, format_dict in streaming.items(): for format_dict in streaming:
if not isinstance(format_dict, dict): if not isinstance(format_dict, dict):
continue continue
format_url = format_dict.get('url') format_url = format_dict.get('url')
if not format_url: if not format_url:
continue continue
format_id = format_dict.get('type')
ext = determine_ext(format_url) ext = determine_ext(format_url)
if format_id == 'dash' or ext == 'mpd': if format_id == 'dash' or ext == 'mpd':
formats.extend(self._extract_mpd_formats( formats.extend(self._extract_mpd_formats(
@ -248,7 +274,7 @@ def _get_disco_api_info(self, url, display_id, disco_host, realm, country):
'id': video_id, 'id': video_id,
'display_id': display_id, 'display_id': display_id,
'title': title, 'title': title,
'description': info.get('description'), 'description': strip_or_none(info.get('description')),
'duration': float_or_none(info.get('videoDuration'), 1000), 'duration': float_or_none(info.get('videoDuration'), 1000),
'timestamp': unified_timestamp(info.get('publishStart')), 'timestamp': unified_timestamp(info.get('publishStart')),
'series': series, 'series': series,
@ -268,3 +294,75 @@ def _real_extract(self, url):
host = 'disco-api.' + domain if domain[0] == 'd' else 'eu2-prod.disco-api.com' host = 'disco-api.' + domain if domain[0] == 'd' else 'eu2-prod.disco-api.com'
return self._get_disco_api_info( return self._get_disco_api_info(
url, display_id, host, 'dplay' + country, country) url, display_id, host, 'dplay' + country, country)
class DiscoveryPlusIE(DPlayIE):
    # discoveryplus.com (US). Reuses the disco-api flow from DPlayIE, but the
    # service authenticates via an x-disco-client header instead of a bearer
    # token and serves playback info from a POST-only v3 endpoint.
    _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/video' + DPlayIE._PATH_REGEX
    _TESTS = [{
        'url': 'https://www.discoveryplus.com/video/property-brothers-forever-home/food-and-family',
        'info_dict': {
            'id': '1140794',
            'display_id': 'property-brothers-forever-home/food-and-family',
            'ext': 'mp4',
            'title': 'Food and Family',
            'description': 'The brothers help a Richmond family expand their single-level home.',
            'duration': 2583.113,
            'timestamp': 1609304400,
            'upload_date': '20201230',
            'creator': 'HGTV',
            'series': 'Property Brothers: Forever Home',
            'season_number': 1,
            'episode_number': 1,
        },
        'skip': 'Available for Premium users',
    }]

    def _update_disco_api_headers(self, headers, disco_base, display_id, realm):
        # No token request needed; the client identification header suffices.
        headers['x-disco-client'] = 'WEB:UNKNOWN:dplus_us:15.0.0'

    def _download_video_playback_info(self, disco_base, video_id, headers):
        # The v3 endpoint expects a JSON POST body and already returns the
        # streaming data in list form, so no re-shaping is required.
        return self._download_json(
            disco_base + 'playback/v3/videoPlaybackInfo',
            video_id, headers=headers, data=json.dumps({
                'deviceInfo': {
                    'adBlocker': False,
                },
                'videoId': video_id,
                'wisteriaProperties': {
                    'platform': 'desktop',
                },
            }).encode('utf-8'))['data']['attributes']['streaming']

    def _real_extract(self, url):
        display_id = self._match_id(url)
        return self._get_disco_api_info(
            url, display_id, 'us1-prod-direct.discoveryplus.com', 'go', 'us')
class HGTVDeIE(DPlayIE):
    # de.hgtv.com — a dplay-backed site; the default DPlayIE token auth and
    # playback endpoint work as-is, only the API host/realm differ.
    _VALID_URL = r'https?://de\.hgtv\.com/sendungen' + DPlayIE._PATH_REGEX
    _TESTS = [{
        'url': 'https://de.hgtv.com/sendungen/tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette/',
        'info_dict': {
            'id': '151205',
            'display_id': 'tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette',
            'ext': 'mp4',
            'title': 'Wer braucht schon eine Toilette',
            'description': 'md5:05b40a27e7aed2c9172de34d459134e2',
            'duration': 1177.024,
            'timestamp': 1595705400,
            'upload_date': '20200725',
            'creator': 'HGTV',
            'series': 'Tiny House - klein, aber oho',
            'season_number': 3,
            'episode_number': 3,
        },
        'params': {
            'format': 'bestvideo',
        },
    }]

    def _real_extract(self, url):
        display_id = self._match_id(url)
        return self._get_disco_api_info(
            url, display_id, 'eu1-prod.disco-api.com', 'hgtv', 'de')

View file

@ -0,0 +1,193 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
int_or_none,
unified_strdate,
xpath_text,
determine_ext,
float_or_none,
ExtractorError,
)
class DreiSatIE(InfoExtractor):
    # Extractor for the 3sat Mediathek. Video metadata and format lists come
    # from ZDF's legacy XML "beitragsDetails" service.
    IE_NAME = '3sat'
    _GEO_COUNTRIES = ['DE']
    _VALID_URL = r'https?://(?:www\.)?3sat\.de/mediathek/(?:(?:index|mediathek)\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)'
    _TESTS = [
        {
            'url': 'http://www.3sat.de/mediathek/index.php?mode=play&obj=45918',
            'md5': 'be37228896d30a88f315b638900a026e',
            'info_dict': {
                'id': '45918',
                'ext': 'mp4',
                'title': 'Waidmannsheil',
                'description': 'md5:cce00ca1d70e21425e72c86a98a56817',
                'uploader': 'SCHWEIZWEIT',
                'uploader_id': '100000210',
                'upload_date': '20140913'
            },
            'params': {
                'skip_download': True,  # m3u8 downloads
            }
        },
        {
            'url': 'http://www.3sat.de/mediathek/mediathek.php?mode=play&obj=51066',
            'only_matching': True,
        },
    ]

    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        # Override of the generic SMIL parser: 3sat's SMIL files describe RTMP
        # streams via paramGroup elements (host/app/protocols) that each
        # <video> node references by id, which the base implementation does
        # not understand.
        param_groups = {}
        for param_group in smil.findall(self._xpath_ns('./head/paramGroup', namespace)):
            group_id = param_group.get(self._xpath_ns(
                'id', 'http://www.w3.org/XML/1998/namespace'))
            params = {}
            for param in param_group:
                params[param.get('name')] = param.get('value')
            param_groups[group_id] = params

        formats = []
        for video in smil.findall(self._xpath_ns('.//video', namespace)):
            src = video.get('src')
            if not src:
                continue
            # Prefer the bitrate embedded in the file name ("_1234k"); fall
            # back to the SMIL system-bitrate attribute (bits/s -> kbit/s).
            bitrate = int_or_none(self._search_regex(r'_(\d+)k', src, 'bitrate', None)) or float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
            group_id = video.get('paramGroup')
            param_group = param_groups[group_id]
            # One format per advertised protocol (e.g. rtmp, rtmpt).
            for proto in param_group['protocols'].split(','):
                formats.append({
                    'url': '%s://%s' % (proto, param_group['host']),
                    'app': param_group['app'],
                    'play_path': src,
                    'ext': 'flv',
                    'format_id': '%s-%d' % (proto, bitrate),
                    'tbr': bitrate,
                })
        self._sort_formats(formats)
        return formats

    def extract_from_xml_url(self, video_id, xml_url):
        # Download and parse the ZDF-style XML service response, returning a
        # complete info dict (formats, thumbnails, metadata) for video_id.
        doc = self._download_xml(
            xml_url, video_id,
            note='Downloading video info',
            errnote='Failed to download video info')

        status_code = xpath_text(doc, './status/statuscode')
        if status_code and status_code != 'ok':
            if status_code == 'notVisibleAnymore':
                message = 'Video %s is not available' % video_id
            else:
                message = '%s returned error: %s' % (self.IE_NAME, status_code)
            raise ExtractorError(message, expected=True)

        title = xpath_text(doc, './/information/title', 'title', True)

        urls = []
        formats = []
        for fnode in doc.findall('.//formitaeten/formitaet'):
            video_url = xpath_text(fnode, 'url')
            if not video_url or video_url in urls:
                continue
            urls.append(video_url)

            # metafilegenerator URLs are placeholders; static_geoloced_online
            # URLs are geo-gated server side — skip both.
            is_available = 'http://www.metafilegenerator' not in video_url
            geoloced = 'static_geoloced_online' in video_url
            if not is_available or geoloced:
                continue

            # basetype encodes codec/container/protocol, e.g.
            # "h264_aac_mp4_http_na_na".
            format_id = fnode.attrib['basetype']
            format_m = re.match(r'''(?x)
                (?P<vcodec>[^_]+)_(?P<acodec>[^_]+)_(?P<container>[^_]+)_
                (?P<proto>[^_]+)_(?P<index>[^_]+)_(?P<indexproto>[^_]+)
            ''', format_id)

            ext = determine_ext(video_url, None) or format_m.group('container')

            if ext == 'meta':
                continue
            elif ext == 'smil':
                formats.extend(self._extract_smil_formats(
                    video_url, video_id, fatal=False))
            elif ext == 'm3u8':
                # the certificates are misconfigured (see
                # https://github.com/ytdl-org/youtube-dl/issues/8665)
                if video_url.startswith('https://'):
                    continue
                formats.extend(self._extract_m3u8_formats(
                    video_url, video_id, 'mp4', 'm3u8_native',
                    m3u8_id=format_id, fatal=False))
            elif ext == 'f4m':
                formats.extend(self._extract_f4m_formats(
                    video_url, video_id, f4m_id=format_id, fatal=False))
            else:
                quality = xpath_text(fnode, './quality')
                if quality:
                    format_id += '-' + quality

                abr = int_or_none(xpath_text(fnode, './audioBitrate'), 1000)
                vbr = int_or_none(xpath_text(fnode, './videoBitrate'), 1000)

                # Derive the missing audio bitrate from total - video when
                # only those two are advertised.
                tbr = int_or_none(self._search_regex(
                    r'_(\d+)k', video_url, 'bitrate', None))
                if tbr and vbr and not abr:
                    abr = tbr - vbr

                formats.append({
                    'format_id': format_id,
                    'url': video_url,
                    'ext': ext,
                    'acodec': format_m.group('acodec'),
                    'vcodec': format_m.group('vcodec'),
                    'abr': abr,
                    'vbr': vbr,
                    'tbr': tbr,
                    'width': int_or_none(xpath_text(fnode, './width')),
                    'height': int_or_none(xpath_text(fnode, './height')),
                    'filesize': int_or_none(xpath_text(fnode, './filesize')),
                    'protocol': format_m.group('proto').lower(),
                })

        # Only raise geo restriction when nothing at all was extracted —
        # otherwise whatever formats survived are returned.
        geolocation = xpath_text(doc, './/details/geolocation')
        if not formats and geolocation and geolocation != 'none':
            self.raise_geo_restricted(countries=self._GEO_COUNTRIES)

        self._sort_formats(formats)

        thumbnails = []
        for node in doc.findall('.//teaserimages/teaserimage'):
            thumbnail_url = node.text
            if not thumbnail_url:
                continue
            thumbnail = {
                'url': thumbnail_url,
            }
            # The key attribute encodes the dimensions as "WIDTHxHEIGHT".
            thumbnail_key = node.get('key')
            if thumbnail_key:
                m = re.match('^([0-9]+)x([0-9]+)$', thumbnail_key)
                if m:
                    thumbnail['width'] = int(m.group(1))
                    thumbnail['height'] = int(m.group(2))
            thumbnails.append(thumbnail)

        upload_date = unified_strdate(xpath_text(doc, './/details/airtime'))

        return {
            'id': video_id,
            'title': title,
            'description': xpath_text(doc, './/information/detail'),
            'duration': int_or_none(xpath_text(doc, './/details/lengthSec')),
            'thumbnails': thumbnails,
            'uploader': xpath_text(doc, './/details/originChannelTitle'),
            'uploader_id': xpath_text(doc, './/details/originChannelId'),
            'upload_date': upload_date,
            'formats': formats,
        }

    def _real_extract(self, url):
        video_id = self._match_id(url)
        details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?id=%s' % video_id
        return self.extract_from_xml_url(video_id, details_url)

View file

@ -182,6 +182,7 @@
CanvasIE, CanvasIE,
CanvasEenIE, CanvasEenIE,
VrtNUIE, VrtNUIE,
DagelijkseKostIE,
) )
from .carambatv import ( from .carambatv import (
CarambaTVIE, CarambaTVIE,
@ -309,7 +310,12 @@
DouyuShowIE, DouyuShowIE,
DouyuTVIE, DouyuTVIE,
) )
from .dplay import DPlayIE from .dplay import (
DPlayIE,
DiscoveryPlusIE,
HGTVDeIE,
)
from .dreisat import DreiSatIE
from .drbonanza import DRBonanzaIE from .drbonanza import DRBonanzaIE
from .drtuber import DrTuberIE from .drtuber import DrTuberIE
from .drtv import ( from .drtv import (
@ -1107,6 +1113,11 @@
VivoIE, VivoIE,
) )
from .showroomlive import ShowRoomLiveIE from .showroomlive import ShowRoomLiveIE
from .simplecast import (
SimplecastIE,
SimplecastEpisodeIE,
SimplecastPodcastIE,
)
from .sina import SinaIE from .sina import SinaIE
from .sixplay import SixPlayIE from .sixplay import SixPlayIE
from .skyit import ( from .skyit import (
@ -1165,11 +1176,6 @@
BellatorIE, BellatorIE,
ParamountNetworkIE, ParamountNetworkIE,
) )
from .storyfire import (
StoryFireIE,
StoryFireUserIE,
StoryFireSeriesIE,
)
from .stitcher import StitcherIE from .stitcher import StitcherIE
from .sport5 import Sport5IE from .sport5 import Sport5IE
from .sportbox import SportBoxIE from .sportbox import SportBoxIE
@ -1193,6 +1199,11 @@
from .srmediathek import SRMediathekIE from .srmediathek import SRMediathekIE
from .stanfordoc import StanfordOpenClassroomIE from .stanfordoc import StanfordOpenClassroomIE
from .steam import SteamIE from .steam import SteamIE
from .storyfire import (
StoryFireIE,
StoryFireUserIE,
StoryFireSeriesIE,
)
from .streamable import StreamableIE from .streamable import StreamableIE
from .streamcloud import StreamcloudIE from .streamcloud import StreamcloudIE
from .streamcz import StreamCZIE from .streamcz import StreamCZIE
@ -1652,6 +1663,7 @@
ZattooLiveIE, ZattooLiveIE,
) )
from .zdf import ZDFIE, ZDFChannelIE from .zdf import ZDFIE, ZDFChannelIE
from .zhihu import ZhihuIE
from .zingmp3 import ZingMp3IE from .zingmp3 import ZingMp3IE
from .zoom import ZoomIE from .zoom import ZoomIE
from .zype import ZypeIE from .zype import ZypeIE

View file

@ -133,6 +133,7 @@
from .rumble import RumbleEmbedIE from .rumble import RumbleEmbedIE
from .arcpublishing import ArcPublishingIE from .arcpublishing import ArcPublishingIE
from .medialaan import MedialaanIE from .medialaan import MedialaanIE
from .simplecast import SimplecastIE
class GenericIE(InfoExtractor): class GenericIE(InfoExtractor):
@ -2240,6 +2241,15 @@ class GenericIE(InfoExtractor):
'duration': 159, 'duration': 159,
}, },
}, },
{
# Simplecast player embed
'url': 'https://www.bio.org/podcast',
'info_dict': {
'id': 'podcast',
'title': 'I AM BIO Podcast | BIO',
},
'playlist_mincount': 52,
},
] ]
def report_following_redirect(self, new_url): def report_following_redirect(self, new_url):
@ -2794,6 +2804,12 @@ def _real_extract(self, url):
return self.playlist_from_matches( return self.playlist_from_matches(
matches, video_id, video_title, getter=unescapeHTML, ie='FunnyOrDie') matches, video_id, video_title, getter=unescapeHTML, ie='FunnyOrDie')
# Look for Simplecast embeds
simplecast_urls = SimplecastIE._extract_urls(webpage)
if simplecast_urls:
return self.playlist_from_matches(
simplecast_urls, video_id, video_title)
# Look for BBC iPlayer embed # Look for BBC iPlayer embed
matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage) matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
if matches: if matches:

View file

@ -2,10 +2,11 @@
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
determine_ext,
ExtractorError, ExtractorError,
determine_ext,
int_or_none, int_or_none,
try_get, try_get,
unescapeHTML,
url_or_none, url_or_none,
) )
@ -14,7 +15,7 @@ class NineGagIE(InfoExtractor):
IE_NAME = '9gag' IE_NAME = '9gag'
_VALID_URL = r'https?://(?:www\.)?9gag\.com/gag/(?P<id>[^/?&#]+)' _VALID_URL = r'https?://(?:www\.)?9gag\.com/gag/(?P<id>[^/?&#]+)'
_TEST = { _TESTS = [{
'url': 'https://9gag.com/gag/ae5Ag7B', 'url': 'https://9gag.com/gag/ae5Ag7B',
'info_dict': { 'info_dict': {
'id': 'ae5Ag7B', 'id': 'ae5Ag7B',
@ -29,7 +30,11 @@ class NineGagIE(InfoExtractor):
'dislike_count': int, 'dislike_count': int,
'comment_count': int, 'comment_count': int,
} }
} }, {
# HTML escaped title
'url': 'https://9gag.com/gag/av5nvyb',
'only_matching': True,
}]
def _real_extract(self, url): def _real_extract(self, url):
post_id = self._match_id(url) post_id = self._match_id(url)
@ -43,7 +48,7 @@ def _real_extract(self, url):
'The given url does not contain a video', 'The given url does not contain a video',
expected=True) expected=True)
title = post['title'] title = unescapeHTML(post['title'])
duration = None duration = None
formats = [] formats = []

View file

@ -0,0 +1,160 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
clean_podcast_url,
int_or_none,
parse_iso8601,
strip_or_none,
try_get,
urlencode_postdata,
)
class SimplecastBaseIE(InfoExtractor):
    # Shared helpers for the Simplecast podcast hosting platform. All data is
    # fetched from the public JSON API at api.simplecast.com.
    _UUID_REGEX = r'[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12}'
    _API_BASE = 'https://api.simplecast.com/'

    def _call_api(self, path_tmpl, video_id):
        # GET an API resource; path_tmpl contains a single %s for the id.
        return self._download_json(
            self._API_BASE + path_tmpl % video_id, video_id)

    def _call_search_api(self, resource, resource_id, resource_url):
        # Resolve a public website URL to its API object via the search
        # endpoint (POST); used when only the human-facing URL is known.
        return self._download_json(
            'https://api.simplecast.com/%ss/search' % resource, resource_id,
            data=urlencode_postdata({'url': resource_url}))

    def _parse_episode(self, episode):
        # Map an API episode object to an info dict. 'id' and 'title' are
        # required; everything else degrades gracefully to None.
        episode_id = episode['id']
        title = episode['title'].strip()
        audio_file = episode.get('audio_file') or {}
        audio_file_url = audio_file.get('url') or episode.get('audio_file_url') or episode['enclosure_url']

        season = episode.get('season') or {}
        season_href = season.get('href')
        season_id = None
        if season_href:
            # The season is only exposed as an API href; extract its UUID.
            # NOTE(review): the dots in "api.simplecast.com" are unescaped and
            # thus match any character — harmless here, but worth tightening.
            season_id = self._search_regex(
                r'https?://api.simplecast.com/seasons/(%s)' % self._UUID_REGEX,
                season_href, 'season id', default=None)

        webpage_url = episode.get('episode_url')
        channel_url = None
        if webpage_url:
            # The channel URL is the scheme+host prefix of the episode page
            # (e.g. https://<show>.simplecast.com).
            channel_url = self._search_regex(
                r'(https?://[^/]+\.simplecast\.com)',
                webpage_url, 'channel url', default=None)

        return {
            'id': episode_id,
            'display_id': episode.get('slug'),
            'title': title,
            'url': clean_podcast_url(audio_file_url),
            'webpage_url': webpage_url,
            'channel_url': channel_url,
            'series': try_get(episode, lambda x: x['podcast']['title']),
            'season_number': int_or_none(season.get('number')),
            'season_id': season_id,
            'thumbnail': episode.get('image_url'),
            'episode_id': episode_id,
            'episode_number': int_or_none(episode.get('number')),
            'description': strip_or_none(episode.get('description')),
            'timestamp': parse_iso8601(episode.get('published_at')),
            'duration': int_or_none(episode.get('duration')),
            'filesize': int_or_none(audio_file.get('size') or episode.get('audio_file_size')),
        }
class SimplecastIE(SimplecastBaseIE):
    # Direct API/player URLs that already carry the episode UUID.
    IE_NAME = 'simplecast'
    _VALID_URL = r'https?://(?:api\.simplecast\.com/episodes|player\.simplecast\.com)/(?P<id>%s)' % SimplecastBaseIE._UUID_REGEX
    # Shared with SimplecastEpisodeIE, which extracts the same episode via a
    # different URL form.
    _COMMON_TEST_INFO = {
        'display_id': 'errant-signal-chris-franklin-new-wave-video-essays',
        'id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876',
        'ext': 'mp3',
        'title': 'Errant Signal - Chris Franklin & New Wave Video Essays',
        'episode_number': 1,
        'episode_id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876',
        'description': 'md5:34752789d3d2702e2d2c975fbd14f357',
        'season_number': 1,
        'season_id': 'e23df0da-bae4-4531-8bbf-71364a88dc13',
        'series': 'The RE:BIND.io Podcast',
        'duration': 5343,
        'timestamp': 1580979475,
        'upload_date': '20200206',
        'webpage_url': r're:^https?://the-re-bind-io-podcast\.simplecast\.com/episodes/errant-signal-chris-franklin-new-wave-video-essays',
        'channel_url': r're:^https?://the-re-bind-io-podcast\.simplecast\.com$',
    }
    _TESTS = [{
        'url': 'https://api.simplecast.com/episodes/b6dc49a2-9404-4853-9aa9-9cfc097be876',
        'md5': '8c93be7be54251bf29ee97464eabe61c',
        'info_dict': _COMMON_TEST_INFO,
    }, {
        'url': 'https://player.simplecast.com/b6dc49a2-9404-4853-9aa9-9cfc097be876',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_urls(webpage):
        # Find embedded Simplecast player iframes in arbitrary HTML; called by
        # the generic extractor.
        return re.findall(
            r'''(?x)<iframe[^>]+src=["\']
                (
                    https?://(?:embed\.simplecast\.com/[0-9a-f]{8}|
                    player\.simplecast\.com/%s
                ))''' % SimplecastBaseIE._UUID_REGEX, webpage)

    def _real_extract(self, url):
        episode_id = self._match_id(url)
        episode = self._call_api('episodes/%s', episode_id)
        return self._parse_episode(episode)
class SimplecastEpisodeIE(SimplecastBaseIE):
    # Human-facing episode pages on <show>.simplecast.com; the slug is
    # resolved to an episode object via the API search endpoint.
    IE_NAME = 'simplecast:episode'
    _VALID_URL = r'https?://(?!api\.)[^/]+\.simplecast\.com/episodes/(?P<id>[^/?&#]+)'
    _TEST = {
        'url': 'https://the-re-bind-io-podcast.simplecast.com/episodes/errant-signal-chris-franklin-new-wave-video-essays',
        'md5': '8c93be7be54251bf29ee97464eabe61c',
        'info_dict': SimplecastIE._COMMON_TEST_INFO,
    }

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # group(0) is the full page URL, group(1) the episode slug.
        episode = self._call_search_api(
            'episode', mobj.group(1), mobj.group(0))
        return self._parse_episode(episode)
class SimplecastPodcastIE(SimplecastBaseIE):
    # Podcast home pages on <show>.simplecast.com — extracted as a playlist of
    # all episodes. The negative lookaheads exclude service subdomains and the
    # per-episode URLs handled by SimplecastEpisodeIE.
    IE_NAME = 'simplecast:podcast'
    _VALID_URL = r'https?://(?!(?:api|cdn|embed|feeds|player)\.)(?P<id>[^/]+)\.simplecast\.com(?!/episodes/[^/?&#]+)'
    _TESTS = [{
        'url': 'https://the-re-bind-io-podcast.simplecast.com',
        'playlist_mincount': 33,
        'info_dict': {
            'id': '07d28d26-7522-42eb-8c53-2bdcfc81c43c',
            'title': 'The RE:BIND.io Podcast',
        },
    }, {
        'url': 'https://the-re-bind-io-podcast.simplecast.com/episodes',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        subdomain = self._match_id(url)
        # Resolve the subdomain to its site/podcast object, then list all
        # episodes of the podcast.
        site = self._call_search_api('site', subdomain, url)
        podcast = site['podcast']
        podcast_id = podcast['id']
        podcast_title = podcast.get('title')

        def entries():
            episodes = self._call_api('podcasts/%s/episodes', podcast_id)
            for episode in (episodes.get('collection') or []):
                info = self._parse_episode(episode)
                # The per-episode payload may lack podcast info; fill the
                # series from the podcast object we already have.
                info['series'] = podcast_title
                yield info

        return self.playlist_result(entries(), podcast_id, podcast_title)

View file

@ -1,255 +1,151 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import itertools import functools
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import (
# HEADRequest,
int_or_none,
OnDemandPagedList,
smuggle_url,
)
class StoryFireIE(InfoExtractor): class StoryFireBaseIE(InfoExtractor):
_VALID_URL = r'(?:(?:https?://(?:www\.)?storyfire\.com/video-details)|(?:https://storyfire.app.link))/(?P<id>[^/\s]+)' _VALID_URL_BASE = r'https?://(?:www\.)?storyfire\.com/'
_TESTS = [{
def _call_api(self, path, video_id, resource, query=None):
return self._download_json(
'https://storyfire.com/app/%s/%s' % (path, video_id), video_id,
'Downloading %s JSON metadata' % resource, query=query)
def _parse_video(self, video):
title = video['title']
vimeo_id = self._search_regex(
r'https?://player\.vimeo\.com/external/(\d+)',
video['vimeoVideoURL'], 'vimeo id')
# video_url = self._request_webpage(
# HEADRequest(video['vimeoVideoURL']), video_id).geturl()
# formats = []
# for v_url, suffix in [(video_url, '_sep'), (video_url.replace('/sep/video/', '/video/'), '')]:
# formats.extend(self._extract_m3u8_formats(
# v_url, video_id, 'mp4', 'm3u8_native',
# m3u8_id='hls' + suffix, fatal=False))
# formats.extend(self._extract_mpd_formats(
# v_url.replace('.m3u8', '.mpd'), video_id,
# mpd_id='dash' + suffix, fatal=False))
# self._sort_formats(formats)
uploader_id = video.get('hostID')
return {
'_type': 'url_transparent',
'id': vimeo_id,
'title': title,
'description': video.get('description'),
'url': smuggle_url(
'https://player.vimeo.com/video/' + vimeo_id, {
'http_headers': {
'Referer': 'https://storyfire.com/',
}
}),
# 'formats': formats,
'thumbnail': video.get('storyImage'),
'view_count': int_or_none(video.get('views')),
'like_count': int_or_none(video.get('likesCount')),
'comment_count': int_or_none(video.get('commentsCount')),
'duration': int_or_none(video.get('videoDuration')),
'timestamp': int_or_none(video.get('publishDate')),
'uploader': video.get('username'),
'uploader_id': uploader_id,
'uploader_url': 'https://storyfire.com/user/%s/video' % uploader_id if uploader_id else None,
'episode_number': int_or_none(video.get('episodeNumber') or video.get('episode_number')),
}
class StoryFireIE(StoryFireBaseIE):
_VALID_URL = StoryFireBaseIE._VALID_URL_BASE + r'video-details/(?P<id>[0-9a-f]{24})'
_TEST = {
'url': 'https://storyfire.com/video-details/5df1d132b6378700117f9181', 'url': 'https://storyfire.com/video-details/5df1d132b6378700117f9181',
'md5': '560953bfca81a69003cfa5e53ac8a920', 'md5': 'caec54b9e4621186d6079c7ec100c1eb',
'info_dict': { 'info_dict': {
'id': '5df1d132b6378700117f9181', 'id': '378954662',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Buzzfeed Teaches You About Memes', 'title': 'Buzzfeed Teaches You About Memes',
'uploader_id': 'ntZAJFECERSgqHSxzonV5K2E89s1', 'uploader_id': 'ntZAJFECERSgqHSxzonV5K2E89s1',
'timestamp': 1576129028, 'timestamp': 1576129028,
'description': 'Mocking Buzzfeed\'s meme lesson. Reuploaded from YouTube because of their new policies', 'description': 'md5:0b4e28021548e144bed69bb7539e62ea',
'uploader': 'whang!', 'uploader': 'whang!',
'upload_date': '20191212', 'upload_date': '20191212',
'duration': 418,
'view_count': int,
'like_count': int,
'comment_count': int,
}, },
'params': {'format': 'bestvideo'} # There are no merged formats in the playlist. 'params': {
}, { 'skip_download': True,
'url': 'https://storyfire.app.link/5GxAvWOQr8', # Alternate URL format, with unrelated short ID
'md5': '7a2dc6d60c4889edfed459c620fe690d',
'info_dict': {
'id': '5f1e11ecd78a57b6c702001d',
'ext': 'm4a',
'title': 'Weird Nintendo Prototype Leaks',
'description': 'A stream taking a look at some weird Nintendo Prototypes with Luigi in Mario 64 and weird Yoshis',
'timestamp': 1595808576,
'upload_date': '20200727',
'uploader': 'whang!',
'uploader_id': 'ntZAJFECERSgqHSxzonV5K2E89s1',
}, },
'params': {'format': 'bestaudio'} # Verifying audio extraction 'expected_warnings': ['Unable to download JSON metadata']
}]
_aformats = {
'audio-medium-audio': {'acodec': 'aac', 'abr': 125, 'preference': -10},
'audio-high-audio': {'acodec': 'aac', 'abr': 254, 'preference': -1},
} }
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id) video = self._call_api(
'generic/video-detail', video_id, 'video')['video']
# Extracting the json blob is mandatory to proceed with extraction. return self._parse_video(video)
jsontext = self._html_search_regex(
r'<script id="__NEXT_DATA__" type="application/json">(.+?)</script>',
webpage, 'json_data')
json = self._parse_json(jsontext, video_id)
# The currentVideo field in the json is mandatory
# because it contains the only link to the m3u playlist
video = json['props']['initialState']['video']['currentVideo']
videourl = video['vimeoVideoURL'] # Video URL is mandatory
# Extract other fields from the json in an error tolerant fashion
# ID may be incorrect (on short URL format), correct it.
parsed_id = video.get('_id')
if parsed_id:
video_id = parsed_id
title = video.get('title')
description = video.get('description')
thumbnail = video.get('storyImage')
views = video.get('views')
likes = video.get('likesCount')
comments = video.get('commentsCount')
duration = video.get('videoDuration')
publishdate = video.get('publishDate') # Apparently epoch time, day only
uploader = video.get('username')
uploader_id = video.get('hostID')
# Construct an uploader URL
uploader_url = None
if uploader_id:
uploader_url = "https://storyfire.com/user/%s/video" % uploader_id
# Collect root playlist to determine formats
formats = self._extract_m3u8_formats(
videourl, video_id, 'mp4', 'm3u8_native')
# Modify formats to fill in missing information about audio codecs
for format in formats:
aformat = self._aformats.get(format['format_id'])
if aformat:
format['acodec'] = aformat['acodec']
format['abr'] = aformat['abr']
format['quality'] = aformat['preference']
format['ext'] = 'm4a'
self._sort_formats(formats)
return {
'id': video_id,
'title': title,
'description': description,
'ext': "mp4",
'url': videourl,
'formats': formats,
'thumbnail': thumbnail,
'view_count': views,
'like_count': likes,
'comment_count': comments,
'duration': duration,
'timestamp': publishdate,
'uploader': uploader,
'uploader_id': uploader_id,
'uploader_url': uploader_url,
}
class StoryFireUserIE(InfoExtractor): class StoryFireUserIE(StoryFireBaseIE):
_VALID_URL = r'https?://(?:www\.)?storyfire\.com/user/(?P<id>[^/\s]+)/video' _VALID_URL = StoryFireBaseIE._VALID_URL_BASE + r'user/(?P<id>[^/]+)/video'
_TESTS = [{ _TEST = {
'url': 'https://storyfire.com/user/ntZAJFECERSgqHSxzonV5K2E89s1/video',
'info_dict': {
'id': 'ntZAJFECERSgqHSxzonV5K2E89s1',
'title': 'whang!',
},
'playlist_mincount': 18
}, {
'url': 'https://storyfire.com/user/UQ986nFxmAWIgnkZQ0ftVhq4nOk2/video', 'url': 'https://storyfire.com/user/UQ986nFxmAWIgnkZQ0ftVhq4nOk2/video',
'info_dict': { 'info_dict': {
'id': 'UQ986nFxmAWIgnkZQ0ftVhq4nOk2', 'id': 'UQ986nFxmAWIgnkZQ0ftVhq4nOk2',
'title': 'McJuggerNuggets',
}, },
'playlist_mincount': 143 'playlist_mincount': 151,
}
_PAGE_SIZE = 20
}] def _fetch_page(self, user_id, page):
videos = self._call_api(
# Generator for fetching playlist items 'publicVideos', user_id, 'page %d' % (page + 1), {
def _enum_videos(self, baseurl, user_id, firstjson): 'skip': page * self._PAGE_SIZE,
totalVideos = int(firstjson['videosCount']) })['videos']
haveVideos = 0 for video in videos:
json = firstjson yield self._parse_video(video)
for page in itertools.count(1):
for video in json['videos']:
id = video['_id']
url = "https://storyfire.com/video-details/%s" % id
haveVideos += 1
yield {
'_type': 'url',
'id': id,
'url': url,
'ie_key': 'StoryFire',
'title': video.get('title'),
'description': video.get('description'),
'view_count': video.get('views'),
'comment_count': video.get('commentsCount'),
'duration': video.get('videoDuration'),
'timestamp': video.get('publishDate'),
}
# Are there more pages we could fetch?
if haveVideos < totalVideos:
pageurl = baseurl + ("%i" % haveVideos)
json = self._download_json(pageurl, user_id,
note='Downloading page %s' % page)
# Are there any videos in the new json?
videos = json.get('videos')
if not videos or len(videos) == 0:
break # no videos
else:
break # We have fetched all the videos, stop
def _real_extract(self, url): def _real_extract(self, url):
user_id = self._match_id(url) user_id = self._match_id(url)
entries = OnDemandPagedList(functools.partial(
baseurl = "https://storyfire.com/app/publicVideos/%s?skip=" % user_id self._fetch_page, user_id), self._PAGE_SIZE)
return self.playlist_result(entries, user_id)
# Download first page to ensure it can be downloaded, and get user information if available.
firstpage = baseurl + "0"
firstjson = self._download_json(firstpage, user_id)
title = None
videos = firstjson.get('videos')
if videos and len(videos):
title = videos[1].get('username')
return {
'_type': 'playlist',
'entries': self._enum_videos(baseurl, user_id, firstjson),
'id': user_id,
'title': title,
}
class StoryFireSeriesIE(InfoExtractor): class StoryFireSeriesIE(StoryFireBaseIE):
_VALID_URL = r'https?://(?:www\.)?storyfire\.com/write/series/stories/(?P<id>[^/\s]+)' _VALID_URL = StoryFireBaseIE._VALID_URL_BASE + r'write/series/stories/(?P<id>[^/?&#]+)'
_TESTS = [{ _TESTS = [{
'url': 'https://storyfire.com/write/series/stories/-Lq6MsuIHLODO6d2dDkr/', 'url': 'https://storyfire.com/write/series/stories/-Lq6MsuIHLODO6d2dDkr/',
'info_dict': { 'info_dict': {
'id': '-Lq6MsuIHLODO6d2dDkr', 'id': '-Lq6MsuIHLODO6d2dDkr',
}, },
'playlist_mincount': 13 'playlist_mincount': 13,
}, { }, {
'url': 'https://storyfire.com/write/series/stories/the_mortal_one/', 'url': 'https://storyfire.com/write/series/stories/the_mortal_one/',
'info_dict': { 'info_dict': {
'id': 'the_mortal_one', 'id': 'the_mortal_one',
}, },
'playlist_count': 0 # This playlist has entries, but no videos. 'playlist_count': 0,
}, {
'url': 'https://storyfire.com/write/series/stories/story_time',
'info_dict': {
'id': 'story_time',
},
'playlist_mincount': 10
}] }]
# Generator for returning playlist items def _extract_videos(self, stories):
# This object is substantially different than the one in the user videos page above for story in stories.values():
def _enum_videos(self, jsonlist): if story.get('hasVideo'):
for video in jsonlist: yield self._parse_video(story)
id = video['_id']
if video.get('hasVideo'): # Boolean element
url = "https://storyfire.com/video-details/%s" % id
yield {
'_type': 'url',
'id': id,
'url': url,
'ie_key': 'StoryFire',
'title': video.get('title'),
'description': video.get('description'),
'view_count': video.get('views'),
'likes_count': video.get('likesCount'),
'comment_count': video.get('commentsCount'),
'duration': video.get('videoDuration'),
'timestamp': video.get('publishDate'),
}
def _real_extract(self, url): def _real_extract(self, url):
list_id = self._match_id(url) series_id = self._match_id(url)
stories = self._call_api(
listurl = "https://storyfire.com/app/seriesStories/%s/list" % list_id 'seriesStories', series_id, 'series stories')
json = self._download_json(listurl, list_id) return self.playlist_result(self._extract_videos(stories), series_id)
return {
'_type': 'playlist',
'entries': self._enum_videos(json),
'id': list_id
}

View file

@ -4,21 +4,22 @@
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str
from ..utils import ( from ..utils import (
determine_ext, determine_ext,
float_or_none, float_or_none,
int_or_none,
parse_age_limit, parse_age_limit,
qualities, qualities,
random_birthday, random_birthday,
try_get,
unified_timestamp, unified_timestamp,
urljoin, urljoin,
) )
class VideoPressIE(InfoExtractor): class VideoPressIE(InfoExtractor):
_VALID_URL = r'https?://videopress\.com/embed/(?P<id>[\da-zA-Z]+)' _ID_REGEX = r'[\da-zA-Z]{8}'
_PATH_REGEX = r'video(?:\.word)?press\.com/embed/'
_VALID_URL = r'https?://%s(?P<id>%s)' % (_PATH_REGEX, _ID_REGEX)
_TESTS = [{ _TESTS = [{
'url': 'https://videopress.com/embed/kUJmAcSf', 'url': 'https://videopress.com/embed/kUJmAcSf',
'md5': '706956a6c875873d51010921310e4bc6', 'md5': '706956a6c875873d51010921310e4bc6',
@ -36,35 +37,36 @@ class VideoPressIE(InfoExtractor):
# 17+, requires birth_* params # 17+, requires birth_* params
'url': 'https://videopress.com/embed/iH3gstfZ', 'url': 'https://videopress.com/embed/iH3gstfZ',
'only_matching': True, 'only_matching': True,
}, {
'url': 'https://video.wordpress.com/embed/kUJmAcSf',
'only_matching': True,
}] }]
@staticmethod @staticmethod
def _extract_urls(webpage): def _extract_urls(webpage):
return re.findall( return re.findall(
r'<iframe[^>]+src=["\']((?:https?://)?videopress\.com/embed/[\da-zA-Z]+)', r'<iframe[^>]+src=["\']((?:https?://)?%s%s)' % (VideoPressIE._PATH_REGEX, VideoPressIE._ID_REGEX),
webpage) webpage)
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
query = random_birthday('birth_year', 'birth_month', 'birth_day') query = random_birthday('birth_year', 'birth_month', 'birth_day')
query['fields'] = 'description,duration,file_url_base,files,height,original,poster,rating,title,upload_date,width'
video = self._download_json( video = self._download_json(
'https://public-api.wordpress.com/rest/v1.1/videos/%s' % video_id, 'https://public-api.wordpress.com/rest/v1.1/videos/%s' % video_id,
video_id, query=query) video_id, query=query)
title = video['title'] title = video['title']
def base_url(scheme): file_url_base = video.get('file_url_base') or {}
return try_get( base_url = file_url_base.get('https') or file_url_base.get('http')
video, lambda x: x['file_url_base'][scheme], compat_str)
base_url = base_url('https') or base_url('http')
QUALITIES = ('std', 'dvd', 'hd') QUALITIES = ('std', 'dvd', 'hd')
quality = qualities(QUALITIES) quality = qualities(QUALITIES)
formats = [] formats = []
for format_id, f in video['files'].items(): for format_id, f in (video.get('files') or {}).items():
if not isinstance(f, dict): if not isinstance(f, dict):
continue continue
for ext, path in f.items(): for ext, path in f.items():
@ -75,12 +77,14 @@ def base_url(scheme):
'ext': determine_ext(path, ext), 'ext': determine_ext(path, ext),
'quality': quality(format_id), 'quality': quality(format_id),
}) })
original_url = try_get(video, lambda x: x['original'], compat_str) original_url = video.get('original')
if original_url: if original_url:
formats.append({ formats.append({
'url': original_url, 'url': original_url,
'format_id': 'original', 'format_id': 'original',
'quality': len(QUALITIES), 'quality': len(QUALITIES),
'width': int_or_none(video.get('width')),
'height': int_or_none(video.get('height')),
}) })
self._sort_formats(formats) self._sort_formats(formats)

View file

@ -22,6 +22,7 @@
parse_iso8601, parse_iso8601,
sanitized_Request, sanitized_Request,
std_headers, std_headers,
try_get,
) )
@ -42,7 +43,7 @@ class VikiBaseIE(InfoExtractor):
_ERRORS = { _ERRORS = {
'geo': 'Sorry, this content is not available in your region.', 'geo': 'Sorry, this content is not available in your region.',
'upcoming': 'Sorry, this content is not yet available.', 'upcoming': 'Sorry, this content is not yet available.',
# 'paywall': 'paywall', 'paywall': 'Sorry, this content is only available to Viki Pass Plus subscribers',
} }
def _prepare_call(self, path, timestamp=None, post_data=None): def _prepare_call(self, path, timestamp=None, post_data=None):
@ -94,11 +95,13 @@ def _raise_error(self, error):
expected=True) expected=True)
def _check_errors(self, data): def _check_errors(self, data):
for reason, status in data.get('blocking', {}).items(): for reason, status in (data.get('blocking') or {}).items():
if status and reason in self._ERRORS: if status and reason in self._ERRORS:
message = self._ERRORS[reason] message = self._ERRORS[reason]
if reason == 'geo': if reason == 'geo':
self.raise_geo_restricted(msg=message) self.raise_geo_restricted(msg=message)
elif reason == 'paywall':
self.raise_login_required(message)
raise ExtractorError('%s said: %s' % ( raise ExtractorError('%s said: %s' % (
self.IE_NAME, message), expected=True) self.IE_NAME, message), expected=True)
@ -143,13 +146,19 @@ class VikiIE(VikiBaseIE):
'info_dict': { 'info_dict': {
'id': '1023585v', 'id': '1023585v',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Heirs Episode 14', 'title': 'Heirs - Episode 14',
'uploader': 'SBS', 'uploader': 'SBS Contents Hub',
'description': 'md5:c4b17b9626dd4b143dcc4d855ba3474e', 'timestamp': 1385047627,
'upload_date': '20131121', 'upload_date': '20131121',
'age_limit': 13, 'age_limit': 13,
'duration': 3570,
'episode_number': 14,
},
'params': {
'format': 'bestvideo',
}, },
'skip': 'Blocked in the US', 'skip': 'Blocked in the US',
'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'],
}, { }, {
# clip # clip
'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference', 'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference',
@ -165,7 +174,8 @@ class VikiIE(VikiBaseIE):
'uploader': 'Arirang TV', 'uploader': 'Arirang TV',
'like_count': int, 'like_count': int,
'age_limit': 0, 'age_limit': 0,
} },
'skip': 'Sorry. There was an error loading this video',
}, { }, {
'url': 'http://www.viki.com/videos/1048879v-ankhon-dekhi', 'url': 'http://www.viki.com/videos/1048879v-ankhon-dekhi',
'info_dict': { 'info_dict': {
@ -183,7 +193,7 @@ class VikiIE(VikiBaseIE):
}, { }, {
# episode # episode
'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1', 'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1',
'md5': '94e0e34fd58f169f40c184f232356cfe', 'md5': '0a53dc252e6e690feccd756861495a8c',
'info_dict': { 'info_dict': {
'id': '44699v', 'id': '44699v',
'ext': 'mp4', 'ext': 'mp4',
@ -195,6 +205,10 @@ class VikiIE(VikiBaseIE):
'uploader': 'group8', 'uploader': 'group8',
'like_count': int, 'like_count': int,
'age_limit': 13, 'age_limit': 13,
'episode_number': 1,
},
'params': {
'format': 'bestvideo',
}, },
'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'], 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'],
}, { }, {
@ -221,7 +235,7 @@ class VikiIE(VikiBaseIE):
}, { }, {
# non-English description # non-English description
'url': 'http://www.viki.com/videos/158036v-love-in-magic', 'url': 'http://www.viki.com/videos/158036v-love-in-magic',
'md5': 'adf9e321a0ae5d0aace349efaaff7691', 'md5': '41faaba0de90483fb4848952af7c7d0d',
'info_dict': { 'info_dict': {
'id': '158036v', 'id': '158036v',
'ext': 'mp4', 'ext': 'mp4',
@ -232,6 +246,10 @@ class VikiIE(VikiBaseIE):
'title': 'Love In Magic', 'title': 'Love In Magic',
'age_limit': 13, 'age_limit': 13,
}, },
'params': {
'format': 'bestvideo',
},
'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'],
}] }]
def _real_extract(self, url): def _real_extract(self, url):
@ -249,22 +267,19 @@ def _real_extract(self, url):
self._check_errors(video) self._check_errors(video)
title = self.dict_selection(video.get('titles', {}), 'en', allow_fallback=False) title = self.dict_selection(video.get('titles', {}), 'en', allow_fallback=False)
episode_number = int_or_none(video.get('number'))
if not title: if not title:
title = 'Episode %d' % video.get('number') if video.get('type') == 'episode' else video.get('id') or video_id title = 'Episode %d' % episode_number if video.get('type') == 'episode' else video.get('id') or video_id
container_titles = video.get('container', {}).get('titles', {}) container_titles = try_get(video, lambda x: x['container']['titles'], dict) or {}
container_title = self.dict_selection(container_titles, 'en') container_title = self.dict_selection(container_titles, 'en')
title = '%s - %s' % (container_title, title) title = '%s - %s' % (container_title, title)
description = self.dict_selection(video.get('descriptions', {}), 'en') description = self.dict_selection(video.get('descriptions', {}), 'en')
duration = int_or_none(video.get('duration')) like_count = int_or_none(try_get(video, lambda x: x['likes']['count']))
timestamp = parse_iso8601(video.get('created_at'))
uploader = video.get('author')
like_count = int_or_none(video.get('likes', {}).get('count'))
age_limit = parse_age_limit(video.get('rating'))
thumbnails = [] thumbnails = []
for thumbnail_id, thumbnail in video.get('images', {}).items(): for thumbnail_id, thumbnail in (video.get('images') or {}).items():
thumbnails.append({ thumbnails.append({
'id': thumbnail_id, 'id': thumbnail_id,
'url': thumbnail.get('url'), 'url': thumbnail.get('url'),
@ -289,7 +304,7 @@ def _real_extract(self, url):
}] }]
except AttributeError: except AttributeError:
# fall-back to the old way if there isn't a streamSubtitles attribute # fall-back to the old way if there isn't a streamSubtitles attribute
for subtitle_lang, _ in video.get('subtitle_completions', {}).items(): for subtitle_lang, _ in (video.get('subtitle_completions') or {}).items():
subtitles[subtitle_lang] = [{ subtitles[subtitle_lang] = [{
'ext': subtitles_format, 'ext': subtitles_format,
'url': self._prepare_call( 'url': self._prepare_call(
@ -300,13 +315,15 @@ def _real_extract(self, url):
'id': video_id, 'id': video_id,
'title': title, 'title': title,
'description': description, 'description': description,
'duration': duration, 'duration': int_or_none(video.get('duration')),
'timestamp': timestamp, 'timestamp': parse_iso8601(video.get('created_at')),
'uploader': uploader, 'uploader': video.get('author'),
'uploader_url': video.get('author_url'),
'like_count': like_count, 'like_count': like_count,
'age_limit': age_limit, 'age_limit': parse_age_limit(video.get('rating')),
'thumbnails': thumbnails, 'thumbnails': thumbnails,
'subtitles': subtitles, 'subtitles': subtitles,
'episode_number': episode_number,
} }
formats = [] formats = []
@ -400,7 +417,7 @@ class VikiChannelIE(VikiBaseIE):
'info_dict': { 'info_dict': {
'id': '50c', 'id': '50c',
'title': 'Boys Over Flowers', 'title': 'Boys Over Flowers',
'description': 'md5:ecd3cff47967fe193cff37c0bec52790', 'description': 'md5:804ce6e7837e1fd527ad2f25420f4d59',
}, },
'playlist_mincount': 71, 'playlist_mincount': 71,
}, { }, {
@ -411,6 +428,7 @@ class VikiChannelIE(VikiBaseIE):
'description': 'md5:05bf5471385aa8b21c18ad450e350525', 'description': 'md5:05bf5471385aa8b21c18ad450e350525',
}, },
'playlist_count': 127, 'playlist_count': 127,
'skip': 'Page not found',
}, { }, {
'url': 'http://www.viki.com/news/24569c-showbiz-korea', 'url': 'http://www.viki.com/news/24569c-showbiz-korea',
'only_matching': True, 'only_matching': True,

View file

@ -221,10 +221,12 @@ def _parse_config(self, config, video_id):
'is_live': is_live, 'is_live': is_live,
} }
def _extract_original_format(self, url, video_id): def _extract_original_format(self, url, video_id, unlisted_hash=None):
query = {'action': 'load_download_config'}
if unlisted_hash:
query['unlisted_hash'] = unlisted_hash
download_data = self._download_json( download_data = self._download_json(
url, video_id, fatal=False, url, video_id, fatal=False, query=query,
query={'action': 'load_download_config'},
headers={'X-Requested-With': 'XMLHttpRequest'}) headers={'X-Requested-With': 'XMLHttpRequest'})
if download_data: if download_data:
source_file = download_data.get('source_file') source_file = download_data.get('source_file')
@ -504,6 +506,11 @@ class VimeoIE(VimeoBaseInfoExtractor):
{ {
'url': 'https://vimeo.com/160743502/abd0e13fb4', 'url': 'https://vimeo.com/160743502/abd0e13fb4',
'only_matching': True, 'only_matching': True,
},
{
# requires passing unlisted_hash(a52724358e) to load_download_config request
'url': 'https://vimeo.com/392479337/a52724358e',
'only_matching': True,
} }
# https://gettingthingsdone.com/workflowmap/ # https://gettingthingsdone.com/workflowmap/
# vimeo embed with check-password page protected by Referer header # vimeo embed with check-password page protected by Referer header
@ -668,7 +675,8 @@ def _real_extract(self, url):
if config.get('view') == 4: if config.get('view') == 4:
config = self._verify_player_video_password(redirect_url, video_id, headers) config = self._verify_player_video_password(redirect_url, video_id, headers)
vod = config.get('video', {}).get('vod', {}) video = config.get('video') or {}
vod = video.get('vod') or {}
def is_rented(): def is_rented():
if '>You rented this title.<' in webpage: if '>You rented this title.<' in webpage:
@ -728,7 +736,7 @@ def is_rented():
formats = [] formats = []
source_format = self._extract_original_format( source_format = self._extract_original_format(
'https://vimeo.com/' + video_id, video_id) 'https://vimeo.com/' + video_id, video_id, video.get('unlisted_hash'))
if source_format: if source_format:
formats.append(source_format) formats.append(source_format)

View file

@ -1,40 +1,55 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import (
compat_parse_qs,
compat_urllib_parse_urlparse,
)
from ..utils import ( from ..utils import (
int_or_none, int_or_none,
month_by_abbreviation,
parse_filesize, parse_filesize,
unified_strdate,
) )
class XboxClipsIE(InfoExtractor): class XboxClipsIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?xboxclips\.com/(?:video\.php\?.*vid=|[^/]+/)(?P<id>[\w-]{36})' _VALID_URL = r'https?://(?:www\.)?(?:xboxclips\.com|gameclips\.io)/(?:video\.php\?.*vid=|[^/]+/)(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})'
_TEST = { _TESTS = [{
'url': 'http://xboxclips.com/video.php?uid=2533274823424419&gamertag=Iabdulelah&vid=074a69a9-5faf-46aa-b93b-9909c1720325', 'url': 'http://xboxclips.com/video.php?uid=2533274823424419&gamertag=Iabdulelah&vid=074a69a9-5faf-46aa-b93b-9909c1720325',
'md5': 'fbe1ec805e920aeb8eced3c3e657df5d', 'md5': 'fbe1ec805e920aeb8eced3c3e657df5d',
'info_dict': { 'info_dict': {
'id': '074a69a9-5faf-46aa-b93b-9909c1720325', 'id': '074a69a9-5faf-46aa-b93b-9909c1720325',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Iabdulelah playing Titanfall', 'title': 'iAbdulElah playing Titanfall',
'filesize_approx': 26800000, 'filesize_approx': 26800000,
'upload_date': '20140807', 'upload_date': '20140807',
'duration': 56, 'duration': 56,
} }
} }, {
'url': 'https://gameclips.io/iAbdulElah/074a69a9-5faf-46aa-b93b-9909c1720325',
'only_matching': True,
}]
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id) if '/video.php' in url:
qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
url = 'https://gameclips.io/%s/%s' % (qs['gamertag'][0], qs['vid'][0])
video_url = self._html_search_regex( webpage = self._download_webpage(url, video_id)
r'>(?:Link|Download): <a[^>]+href="([^"]+)"', webpage, 'video URL') info = self._parse_html5_media_entries(url, webpage, video_id)[0]
title = self._html_search_regex(
r'<title>XboxClips \| ([^<]+)</title>', webpage, 'title') title = self._html_search_meta(['og:title', 'twitter:title'], webpage)
upload_date = unified_strdate(self._html_search_regex( upload_date = None
r'>Recorded: ([^<]+)<', webpage, 'upload date', fatal=False)) mobj = re.search(
r'>Recorded: (\d{2})-(Jan|Feb|Mar|Apr|May|Ju[nl]|Aug|Sep|Oct|Nov|Dec)-(\d{4})',
webpage)
if mobj:
upload_date = '%s%.2d%s' % (mobj.group(3), month_by_abbreviation(mobj.group(2)), mobj.group(1))
filesize = parse_filesize(self._html_search_regex( filesize = parse_filesize(self._html_search_regex(
r'>Size: ([^<]+)<', webpage, 'file size', fatal=False)) r'>Size: ([^<]+)<', webpage, 'file size', fatal=False))
duration = int_or_none(self._html_search_regex( duration = int_or_none(self._html_search_regex(
@ -42,12 +57,12 @@ def _real_extract(self, url):
view_count = int_or_none(self._html_search_regex( view_count = int_or_none(self._html_search_regex(
r'>Views: (\d+)<', webpage, 'view count', fatal=False)) r'>Views: (\d+)<', webpage, 'view count', fatal=False))
return { info.update({
'id': video_id, 'id': video_id,
'url': video_url,
'title': title, 'title': title,
'upload_date': upload_date, 'upload_date': upload_date,
'filesize_approx': filesize, 'filesize_approx': filesize,
'duration': duration, 'duration': duration,
'view_count': view_count, 'view_count': view_count,
} })
return info

View file

@ -1,8 +1,9 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import re
import hashlib import hashlib
import itertools
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str from ..compat import compat_str
@ -209,17 +210,27 @@ def _extract_tracks(self, source, item_id, url, tld):
missing_track_ids = [ missing_track_ids = [
track_id for track_id in track_ids track_id for track_id in track_ids
if track_id not in present_track_ids] if track_id not in present_track_ids]
missing_tracks = self._call_api( # Request missing tracks in chunks to avoid exceeding max HTTP header size,
'track-entries', tld, url, item_id, # see https://github.com/ytdl-org/youtube-dl/issues/27355
'Downloading missing tracks JSON', { _TRACKS_PER_CHUNK = 250
'entries': ','.join(missing_track_ids), for chunk_num in itertools.count(0):
'lang': tld, start = chunk_num * _TRACKS_PER_CHUNK
'external-domain': 'music.yandex.%s' % tld, end = start + _TRACKS_PER_CHUNK
'overembed': 'false', missing_track_ids_req = missing_track_ids[start:end]
'strict': 'true', assert missing_track_ids_req
}) missing_tracks = self._call_api(
if missing_tracks: 'track-entries', tld, url, item_id,
tracks.extend(missing_tracks) 'Downloading missing tracks JSON chunk %d' % (chunk_num + 1), {
'entries': ','.join(missing_track_ids_req),
'lang': tld,
'external-domain': 'music.yandex.%s' % tld,
'overembed': 'false',
'strict': 'true',
})
if missing_tracks:
tracks.extend(missing_tracks)
if end >= len(missing_track_ids):
break
return tracks return tracks

View file

@ -324,7 +324,9 @@ def _extract_video(self, renderer):
r'^([\d,]+)', re.sub(r'\s', '', view_count_text), r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
'view count', default=None)) 'view count', default=None))
uploader = try_get( uploader = try_get(
renderer, lambda x: x['ownerText']['runs'][0]['text'], compat_str) renderer,
(lambda x: x['ownerText']['runs'][0]['text'],
lambda x: x['shortBylineText']['runs'][0]['text']), compat_str)
return { return {
'_type': 'url_transparent', '_type': 'url_transparent',
'ie_key': YoutubeIE.ie_key(), 'ie_key': YoutubeIE.ie_key(),
@ -340,64 +342,70 @@ def _extract_video(self, renderer):
class YoutubeIE(YoutubeBaseInfoExtractor): class YoutubeIE(YoutubeBaseInfoExtractor):
IE_DESC = 'YouTube.com' IE_DESC = 'YouTube.com'
_INVIDIOUS_SITES = (
# invidious-redirect websites
r'(?:www\.)?redirect\.invidious\.io',
r'(?:(?:www|dev)\.)?invidio\.us',
# Invidious instances taken from https://github.com/iv-org/documentation/blob/master/Invidious-Instances.md
r'(?:www\.)?invidious\.pussthecat\.org',
r'(?:www\.)?invidious\.048596\.xyz',
r'(?:www\.)?invidious\.zee\.li',
r'(?:www\.)?vid\.puffyan\.us',
r'(?:(?:www|au)\.)?ytprivate\.com',
r'(?:www\.)?invidious\.namazso\.eu',
r'(?:www\.)?invidious\.ethibox\.fr',
r'(?:www\.)?inv\.skyn3t\.in',
r'(?:www\.)?invidious\.himiko\.cloud',
r'(?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion',
r'(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion',
r'(?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion',
r'(?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion',
# youtube-dl invidious instances list
r'(?:(?:www|no)\.)?invidiou\.sh',
r'(?:(?:www|fi)\.)?invidious\.snopyta\.org',
r'(?:www\.)?invidious\.kabi\.tk',
r'(?:www\.)?invidious\.13ad\.de',
r'(?:www\.)?invidious\.mastodon\.host',
r'(?:www\.)?invidious\.zapashcanon\.fr',
r'(?:www\.)?invidious\.kavin\.rocks',
r'(?:www\.)?invidious\.tube',
r'(?:www\.)?invidiou\.site',
r'(?:www\.)?invidious\.site',
r'(?:www\.)?invidious\.xyz',
r'(?:www\.)?invidious\.nixnet\.xyz',
r'(?:www\.)?invidious\.drycat\.fr',
r'(?:www\.)?tube\.poal\.co',
r'(?:www\.)?tube\.connect\.cafe',
r'(?:www\.)?vid\.wxzm\.sx',
r'(?:www\.)?vid\.mint\.lgbt',
r'(?:www\.)?yewtu\.be',
r'(?:www\.)?yt\.elukerio\.org',
r'(?:www\.)?yt\.lelux\.fi',
r'(?:www\.)?invidious\.ggc-project\.de',
r'(?:www\.)?yt\.maisputain\.ovh',
r'(?:www\.)?invidious\.toot\.koeln',
r'(?:www\.)?invidious\.fdn\.fr',
r'(?:www\.)?watch\.nettohikari\.com',
r'(?:www\.)?kgg2m7yk5aybusll\.onion',
r'(?:www\.)?qklhadlycap4cnod\.onion',
r'(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion',
r'(?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion',
r'(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion',
r'(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion',
r'(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p',
r'(?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion',
)
_VALID_URL = r"""(?x)^ _VALID_URL = r"""(?x)^
( (
(?:https?://|//) # http(s):// or protocol-independent URL (?:https?://|//) # http(s):// or protocol-independent URL
(?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/| (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com|
(?:www\.)?deturl\.com/www\.youtube\.com/| (?:www\.)?deturl\.com/www\.youtube\.com|
(?:www\.)?pwnyoutube\.com/| (?:www\.)?pwnyoutube\.com|
(?:www\.)?hooktube\.com/| (?:www\.)?hooktube\.com|
(?:www\.)?yourepeat\.com/| (?:www\.)?yourepeat\.com|
tube\.majestyc\.net/| tube\.majestyc\.net|
# Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances %(invidious)s|
(?:www\.)?invidious\.pussthecat\.org/| youtube\.googleapis\.com)/ # the various hostnames, with wildcard subdomains
(?:www\.)?invidious\.048596\.xyz/|
(?:www\.)?invidious\.zee\.li/|
(?:www\.)?vid\.puffyan\.us/|
(?:(?:www|au)\.)?ytprivate\.com/|
(?:www\.)?invidious\.namazso\.eu/|
(?:www\.)?invidious\.ethibox\.fr/|
(?:www\.)?inv\.skyn3t\.in/|
(?:www\.)?invidious\.himiko\.cloud/|
(?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion/|
(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion/|
(?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion/|
(?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion/|
(?:(?:www|dev)\.)?invidio\.us/|
(?:(?:www|no)\.)?invidiou\.sh/|
(?:(?:www|fi)\.)?invidious\.snopyta\.org/|
(?:www\.)?invidious\.kabi\.tk/|
(?:www\.)?invidious\.13ad\.de/|
(?:www\.)?invidious\.mastodon\.host/|
(?:www\.)?invidious\.zapashcanon\.fr/|
(?:www\.)?invidious\.kavin\.rocks/|
(?:www\.)?invidious\.tube/|
(?:www\.)?invidiou\.site/|
(?:www\.)?invidious\.site/|
(?:www\.)?invidious\.xyz/|
(?:www\.)?invidious\.nixnet\.xyz/|
(?:www\.)?invidious\.drycat\.fr/|
(?:www\.)?tube\.poal\.co/|
(?:www\.)?tube\.connect\.cafe/|
(?:www\.)?vid\.wxzm\.sx/|
(?:www\.)?vid\.mint\.lgbt/|
(?:www\.)?yewtu\.be/|
(?:www\.)?yt\.elukerio\.org/|
(?:www\.)?yt\.lelux\.fi/|
(?:www\.)?invidious\.ggc-project\.de/|
(?:www\.)?yt\.maisputain\.ovh/|
(?:www\.)?invidious\.toot\.koeln/|
(?:www\.)?invidious\.fdn\.fr/|
(?:www\.)?watch\.nettohikari\.com/|
(?:www\.)?kgg2m7yk5aybusll\.onion/|
(?:www\.)?qklhadlycap4cnod\.onion/|
(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
(?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
(?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/|
youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
(?:.*?\#/)? # handle anchor (#/) redirect urls (?:.*?\#/)? # handle anchor (#/) redirect urls
(?: # the various things that can precede the ID: (?: # the various things that can precede the ID:
(?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/ (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
@ -412,6 +420,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
youtu\.be| # just youtu.be/xxxx youtu\.be| # just youtu.be/xxxx
vid\.plus| # or vid.plus/xxxx vid\.plus| # or vid.plus/xxxx
zwearz\.com/watch| # or zwearz.com/watch/xxxx zwearz\.com/watch| # or zwearz.com/watch/xxxx
%(invidious)s
)/ )/
|(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId= |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
) )
@ -424,7 +433,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
) )
) )
(?(1).+)? # if we found the ID, everything can follow (?(1).+)? # if we found the ID, everything can follow
$""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} $""" % {
'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE,
'invidious': '|'.join(_INVIDIOUS_SITES),
}
_PLAYER_INFO_RE = ( _PLAYER_INFO_RE = (
r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/player', r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/player',
r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$', r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$',
@ -1031,6 +1043,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'url': 'https://invidio.us/watch?v=BaW_jenozKc', 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
'only_matching': True, 'only_matching': True,
}, },
{
'url': 'https://redirect.invidious.io/watch?v=BaW_jenozKc',
'only_matching': True,
},
{
# from https://nitter.pussthecat.org/YouTube/status/1360363141947944964#m
'url': 'https://redirect.invidious.io/Yh0AhrY9GjA',
'only_matching': True,
},
{ {
# DRM protected # DRM protected
'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc', 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
@ -1169,6 +1190,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'skip_download': True, 'skip_download': True,
}, },
}, },
{
# controversial video, only works with bpctr when authenticated with cookies
'url': 'https://www.youtube.com/watch?v=nGC3D_FkCmg',
'only_matching': True,
},
] ]
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
@ -1426,7 +1452,7 @@ def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {}) url, smuggled_data = unsmuggle_url(url, {})
video_id = self._match_id(url) video_id = self._match_id(url)
base_url = self.http_scheme() + '//www.youtube.com/' base_url = self.http_scheme() + '//www.youtube.com/'
webpage_url = base_url + 'watch?v=' + video_id + '&has_verified=1' webpage_url = base_url + 'watch?v=' + video_id + '&has_verified=1&bpctr=9999999999'
webpage = self._download_webpage(webpage_url, video_id, fatal=False) webpage = self._download_webpage(webpage_url, video_id, fatal=False)
player_response = None player_response = None

View file

@ -0,0 +1,69 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import float_or_none, int_or_none
class ZhihuIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?zhihu\.com/zvideo/(?P<id>[0-9]+)'
_TEST = {
'url': 'https://www.zhihu.com/zvideo/1342930761977176064',
'md5': 'c8d4c9cd72dd58e6f9bc9c2c84266464',
'info_dict': {
'id': '1342930761977176064',
'ext': 'mp4',
'title': '写春联也太难了吧!',
'thumbnail': r're:^https?://.*\.jpg',
'uploader': '桥半舫',
'timestamp': 1612959715,
'upload_date': '20210210',
'uploader_id': '244ecb13b0fd7daf92235288c8ca3365',
'duration': 146.333,
'view_count': int,
'like_count': int,
'comment_count': int,
}
}
def _real_extract(self, url):
video_id = self._match_id(url)
zvideo = self._download_json(
'https://www.zhihu.com/api/v4/zvideos/' + video_id, video_id)
title = zvideo['title']
video = zvideo.get('video') or {}
formats = []
for format_id, q in (video.get('playlist') or {}).items():
play_url = q.get('url') or q.get('play_url')
if not play_url:
continue
formats.append({
'asr': int_or_none(q.get('sample_rate')),
'filesize': int_or_none(q.get('size')),
'format_id': format_id,
'fps': int_or_none(q.get('fps')),
'height': int_or_none(q.get('height')),
'tbr': float_or_none(q.get('bitrate')),
'url': play_url,
'width': int_or_none(q.get('width')),
})
self._sort_formats(formats)
author = zvideo.get('author') or {}
url_token = author.get('url_token')
return {
'id': video_id,
'title': title,
'formats': formats,
'thumbnail': video.get('thumbnail') or zvideo.get('image_url'),
'uploader': author.get('name'),
'timestamp': int_or_none(zvideo.get('published_at')),
'uploader_id': author.get('id'),
'uploader_url': 'https://www.zhihu.com/people/' + url_token if url_token else None,
'duration': float_or_none(video.get('duration')),
'view_count': int_or_none(zvideo.get('play_count')),
'like_count': int_or_none(zvideo.get('liked_count')),
'comment_count': int_or_none(zvideo.get('comment_count')),
}

View file

@ -127,10 +127,13 @@ def is_webp(path):
except PostProcessingError as err: except PostProcessingError as err:
self.report_warning('unable to embed using ffprobe & ffmpeg; %s' % error_to_compat_str(err)) self.report_warning('unable to embed using ffprobe & ffmpeg; %s' % error_to_compat_str(err))
if not check_executable('AtomicParsley', ['-v']): atomicparsley = next((
x for x in ['AtomicParsley', 'atomicparsley']
if check_executable(x, ['-v'])), None)
if atomicparsley is None:
raise EmbedThumbnailPPError('AtomicParsley was not found. Please install.') raise EmbedThumbnailPPError('AtomicParsley was not found. Please install.')
cmd = [encodeFilename('AtomicParsley', True), cmd = [encodeFilename(atomicparsley, True),
encodeFilename(filename, True), encodeFilename(filename, True),
encodeArgument('--artwork'), encodeArgument('--artwork'),
encodeFilename(thumbnail_filename, True), encodeFilename(thumbnail_filename, True),