Merge 'ytdl-org/youtube-dl/master' release 2020.11.19

Old Extractors left behind:
	VLivePlaylistIE
	YoutubeSearchURLIE
	YoutubeShowIE
	YoutubeFavouritesIE

If removing old extractors, make corresponding changes in
	docs/supportedsites.md
	youtube_dlc/extractor/extractors.py

Not merged:
	.github/ISSUE_TEMPLATE/1_broken_site.md
	.github/ISSUE_TEMPLATE/2_site_support_request.md
	.github/ISSUE_TEMPLATE/3_site_feature_request.md
	.github/ISSUE_TEMPLATE/4_bug_report.md
	.github/ISSUE_TEMPLATE/5_feature_request.md
	test/test_all_urls.py
	youtube_dlc/version.py
	Changelog
Author: pukkandan
Date:   2020-11-20 00:52:59 +05:30
Parent: 228385340e
Commit: 8bdd16b499

34 changed files with 1828 additions and 1695 deletions

File: devscripts/make_lazy_extractors.py

@@ -61,7 +61,7 @@ def build_lazy_ie(ie, name):
     return s

-# find the correct sorting and add the required base classes so that sublcasses
+# find the correct sorting and add the required base classes so that subclasses
 # can be correctly created
 classes = _ALL_CLASSES[:-1]
 ordered_cls = []

File: docs/supportedsites.md

@@ -59,9 +59,9 @@ # Supported sites
 - **ARD:mediathek**
 - **ARDBetaMediathek**
 - **Arkena**
-- **arte.tv:+7**
-- **arte.tv:embed**
-- **arte.tv:playlist**
+- **ArteTV**
+- **ArteTVEmbed**
+- **ArteTVPlaylist**
 - **AsianCrush**
 - **AsianCrushPlaylist**
 - **AtresPlayer**
@@ -424,6 +424,7 @@ # Supported sites
 - **la7.it**
 - **laola1tv**
 - **laola1tv:embed**
+- **lbry.tv**
 - **LCI**
 - **Lcp**
 - **LcpPlay**
@@ -835,8 +836,6 @@ # Supported sites
 - **SpankBangPlaylist**
 - **Spankwire**
 - **Spiegel**
-- **Spiegel:Article**: Articles on spiegel.de
-- **Spiegeltv**
 - **sport.francetvinfo.fr**
 - **Sport5**
 - **SportBox**
@@ -1147,19 +1146,18 @@ # Supported sites
 - **YourPorn**
 - **YourUpload**
 - **youtube**: YouTube.com
-- **youtube:channel**: YouTube.com channels
 - **youtube:favorites**: YouTube.com favourite videos, ":ytfav" for short (requires authentication)
 - **youtube:history**: Youtube watch history, ":ythistory" for short (requires authentication)
 - **youtube:live**: YouTube.com live streams
 - **youtube:playlist**: YouTube.com playlists
-- **youtube:playlists**: YouTube.com user/channel playlists
 - **youtube:recommended**: YouTube.com recommended videos, ":ytrec" for short (requires authentication)
 - **youtube:search**: YouTube.com searches
 - **youtube:search:date**: YouTube.com searches, newest videos first
 - **youtube:search_url**: YouTube.com search URLs
 - **youtube:show**: YouTube.com (multi-season) shows
 - **youtube:subscriptions**: YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)
-- **youtube:user**: YouTube.com user videos (URL or "ytuser" keyword)
+- **YoutubeYtUser**: YouTube.com user videos (URL or "ytuser" keyword)
+- **youtube:tab**: YouTube.com tab
 - **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication)
 - **Zapiks**
 - **Zaq1**

File: test/test_all_urls.py

@@ -31,15 +31,17 @@ def assertMatch(self, url, ie_list):

     def test_youtube_playlist_matching(self):
         assertPlaylist = lambda url: self.assertMatch(url, ['youtube:playlist'])
+        assertTab = lambda url: self.assertMatch(url, ['youtube:tab'])
         assertPlaylist('ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
         assertPlaylist('UUBABnxM4Ar9ten8Mdjj1j0Q')  # 585
-        assertPlaylist('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')
+        assertPlaylist('PL63F0C78739B09958')
+        assertTab('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')
         assertPlaylist('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
-        assertPlaylist('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
-        assertPlaylist('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012')  # 668
+        assertTab('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
+        assertTab('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012')  # 668
         self.assertFalse('youtube:playlist' in self.matching_ies('PLtS2H6bU1M'))
         # Top tracks
-        assertPlaylist('https://www.youtube.com/playlist?list=MCUS.20142101')
+        assertTab('https://www.youtube.com/playlist?list=MCUS.20142101')

     def test_youtube_matching(self):
         self.assertTrue(YoutubeIE.suitable('PLtS2H6bU1M'))
@@ -50,26 +52,22 @@ def test_youtube_matching(self):
         self.assertMatch('http://www.cleanvideosearch.com/media/action/yt/watch?videoId=8v_4O44sfjM', ['youtube'])

     def test_youtube_channel_matching(self):
-        assertChannel = lambda url: self.assertMatch(url, ['youtube:channel'])
+        assertChannel = lambda url: self.assertMatch(url, ['youtube:tab'])
         assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM')
         assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM?feature=gb_ch_rec')
         assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM/videos')

-    def test_youtube_user_matching(self):
-        self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:user'])
+    # def test_youtube_user_matching(self):
+    #     self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:tab'])

     def test_youtube_feeds(self):
         self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:watchlater'])
         self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:subscriptions'])
         self.assertMatch('https://www.youtube.com/feed/recommended', ['youtube:recommended'])
-        self.assertMatch('https://www.youtube.com/my_favorites', ['youtube:favorites'])

-    def test_youtube_show_matching(self):
-        self.assertMatch('http://www.youtube.com/show/airdisasters', ['youtube:show'])
-
-    def test_youtube_search_matching(self):
-        self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url'])
-        self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url'])
+    # def test_youtube_search_matching(self):
+    #     self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url'])
+    #     self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url'])

     def test_youtube_extract(self):
         assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id)

File: test/test_utils.py

@@ -937,6 +937,28 @@ def test_js_to_json_edgecases(self):
         self.assertEqual(d['x'], 1)
         self.assertEqual(d['y'], 'a')

+        # Just drop ! prefix for now though this results in a wrong value
+        on = js_to_json('''{
+            a: !0,
+            b: !1,
+            c: !!0,
+            d: !!42.42,
+            e: !!![],
+            f: !"abc",
+            g: !"",
+            !42: 42
+        }''')
+        self.assertEqual(json.loads(on), {
+            'a': 0,
+            'b': 1,
+            'c': 0,
+            'd': 42.42,
+            'e': [],
+            'f': "abc",
+            'g': "",
+            '42': 42
+        })
+
         on = js_to_json('["abc", "def",]')
         self.assertEqual(json.loads(on), ['abc', 'def'])
@@ -994,6 +1016,12 @@ def test_js_to_json_edgecases(self):
         on = js_to_json('{42:4.2e1}')
         self.assertEqual(json.loads(on), {'42': 42.0})

+        on = js_to_json('{ "0x40": "0x40" }')
+        self.assertEqual(json.loads(on), {'0x40': '0x40'})
+
+        on = js_to_json('{ "040": "040" }')
+        self.assertEqual(json.loads(on), {'040': '040'})
+
     def test_js_to_json_malformed(self):
         self.assertEqual(js_to_json('42a1'), '42"a1"')
         self.assertEqual(js_to_json('42a-1'), '42"a"-1')
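
A quick standalone illustration of what the new assertions pin down: js_to_json() simply drops the JavaScript '!' prefix (admittedly producing a wrong value, per the comment in the test) and leaves quoted numeric-looking keys alone:

    import json
    from youtube_dlc.utils import js_to_json

    print(json.loads(js_to_json('{a: !0, b: !1}')))       # {'a': 0, 'b': 1}, not True/False
    print(json.loads(js_to_json('{ "0x40": "0x40" }')))   # {'0x40': '0x40'}, quoted keys are not hex-parsed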

File: youtube_dlc/extractor/afreecatv.py

@@ -275,7 +275,7 @@ def _real_extract(self, url):
         video_element = video_xml.findall(compat_xpath('./track/video'))[-1]
         if video_element is None or video_element.text is None:
             raise ExtractorError(
-                'Video %s video does not exist' % video_id, expected=True)
+                'Video %s does not exist' % video_id, expected=True)

         video_url = video_element.text.strip()

File: youtube_dlc/extractor/arte.py

@@ -4,23 +4,57 @@
 import re

 from .common import InfoExtractor
-from ..compat import compat_str
+from ..compat import (
+    compat_str,
+    compat_urlparse,
+)
 from ..utils import (
     ExtractorError,
     int_or_none,
     qualities,
     try_get,
     unified_strdate,
+    url_or_none,
 )

-# There are different sources of video in arte.tv, the extraction process
-# is different for each one. The videos usually expire in 7 days, so we can't
-# add tests.

 class ArteTVBaseIE(InfoExtractor):
-    def _extract_from_json_url(self, json_url, video_id, lang, title=None):
-        info = self._download_json(json_url, video_id)
+    _ARTE_LANGUAGES = 'fr|de|en|es|it|pl'
+    _API_BASE = 'https://api.arte.tv/api/player/v1'
+
+
+class ArteTVIE(ArteTVBaseIE):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
+                            api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
+                        )
+                        /(?P<id>\d{6}-\d{3}-[AF])
+                    ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
+    _TESTS = [{
+        'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
+        'info_dict': {
+            'id': '088501-000-A',
+            'ext': 'mp4',
+            'title': 'Mexico: Stealing Petrol to Survive',
+            'upload_date': '20190628',
+        },
+    }, {
+        'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
+        'only_matching': True,
+    }, {
+        'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        lang = mobj.group('lang') or mobj.group('lang_2')
+
+        info = self._download_json(
+            '%s/config/%s/%s' % (self._API_BASE, lang, video_id), video_id)
         player_info = info['videoJsonPlayer']

         vsr = try_get(player_info, lambda x: x['VSR'], dict)
@@ -37,18 +71,11 @@
         if not upload_date_str:
             upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0]

-        title = (player_info.get('VTI') or title or player_info['VID']).strip()
+        title = (player_info.get('VTI') or player_info['VID']).strip()
         subtitle = player_info.get('VSU', '').strip()
         if subtitle:
             title += ' - %s' % subtitle

-        info_dict = {
-            'id': player_info['VID'],
-            'title': title,
-            'description': player_info.get('VDE'),
-            'upload_date': unified_strdate(upload_date_str),
-            'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
-        }
         qfunc = qualities(['MQ', 'HQ', 'EQ', 'SQ'])

         LANGS = {
@@ -65,6 +92,10 @@
         formats = []
         for format_id, format_dict in vsr.items():
             f = dict(format_dict)
+            format_url = url_or_none(f.get('url'))
+            streamer = f.get('streamer')
+            if not format_url and not streamer:
+                continue
             versionCode = f.get('versionCode')
             l = re.escape(langcode)
@@ -107,6 +138,16 @@
             else:
                 lang_pref = -1

+            media_type = f.get('mediaType')
+            if media_type == 'hls':
+                m3u8_formats = self._extract_m3u8_formats(
+                    format_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id=format_id, fatal=False)
+                for m3u8_format in m3u8_formats:
+                    m3u8_format['language_preference'] = lang_pref
+                formats.extend(m3u8_formats)
+                continue
+
             format = {
                 'format_id': format_id,
                 'preference': -10 if f.get('videoFormat') == 'M3U8' else None,
@@ -118,7 +159,7 @@
                 'quality': qfunc(f.get('quality')),
             }

-            if f.get('mediaType') == 'rtmp':
+            if media_type == 'rtmp':
                 format['url'] = f['streamer']
                 format['play_path'] = 'mp4:' + f['url']
                 format['ext'] = 'flv'
@@ -127,56 +168,50 @@
             formats.append(format)

-        self._check_formats(formats, video_id)
         self._sort_formats(formats)

-        info_dict['formats'] = formats
-        return info_dict
+        return {
+            'id': player_info.get('VID') or video_id,
+            'title': title,
+            'description': player_info.get('VDE'),
+            'upload_date': unified_strdate(upload_date_str),
+            'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
+            'formats': formats,
+        }

-class ArteTVPlus7IE(ArteTVBaseIE):
-    IE_NAME = 'arte.tv:+7'
-    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>fr|de|en|es|it|pl)/videos/(?P<id>\d{6}-\d{3}-[AF])'
+class ArteTVEmbedIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
     _TESTS = [{
-        'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
+        'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
         'info_dict': {
-            'id': '088501-000-A',
+            'id': '100605-013-A',
             'ext': 'mp4',
-            'title': 'Mexico: Stealing Petrol to Survive',
-            'upload_date': '20190628',
+            'title': 'United we Stream November Lockdown Edition #13',
+            'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
+            'upload_date': '20201116',
         },
+    }, {
+        'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
+        'only_matching': True,
     }]

-    def _real_extract(self, url):
-        lang, video_id = re.match(self._VALID_URL, url).groups()
-        return self._extract_from_json_url(
-            'https://api.arte.tv/api/player/v1/config/%s/%s' % (lang, video_id),
-            video_id, lang)
-
-
-class ArteTVEmbedIE(ArteTVPlus7IE):
-    IE_NAME = 'arte.tv:embed'
-    _VALID_URL = r'''(?x)
-        https://www\.arte\.tv
-        /player/v3/index\.php\?json_url=
-        (?P<json_url>
-            https?://api\.arte\.tv/api/player/v1/config/
-            (?P<lang>[^/]+)/(?P<id>\d{6}-\d{3}-[AF])
-        )
-    '''
-    _TESTS = []
+    @staticmethod
+    def _extract_urls(webpage):
+        return [url for _, url in re.findall(
+            r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1',
+            webpage)]

     def _real_extract(self, url):
-        json_url, lang, video_id = re.match(self._VALID_URL, url).groups()
-        return self._extract_from_json_url(json_url, video_id, lang)
+        qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+        json_url = qs['json_url'][0]
+        video_id = ArteTVIE._match_id(json_url)
+        return self.url_result(
+            json_url, ie=ArteTVIE.ie_key(), video_id=video_id)

 class ArteTVPlaylistIE(ArteTVBaseIE):
-    IE_NAME = 'arte.tv:playlist'
-    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>fr|de|en|es|it|pl)/videos/(?P<id>RC-\d{6})'
+    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
     _TESTS = [{
         'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
         'info_dict': {
@@ -185,17 +220,35 @@ class ArteTVPlaylistIE(ArteTVBaseIE):
             'description': 'md5:d322c55011514b3a7241f7fb80d494c2',
         },
         'playlist_mincount': 6,
+    }, {
+        'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
+        'only_matching': True,
     }]

     def _real_extract(self, url):
         lang, playlist_id = re.match(self._VALID_URL, url).groups()
         collection = self._download_json(
-            'https://api.arte.tv/api/player/v1/collectionData/%s/%s?source=videos'
-            % (lang, playlist_id), playlist_id)
+            '%s/collectionData/%s/%s?source=videos'
+            % (self._API_BASE, lang, playlist_id), playlist_id)
+        entries = []
+        for video in collection['videos']:
+            if not isinstance(video, dict):
+                continue
+            video_url = url_or_none(video.get('url')) or url_or_none(video.get('jsonUrl'))
+            if not video_url:
+                continue
+            video_id = video.get('programId')
+            entries.append({
+                '_type': 'url_transparent',
+                'url': video_url,
+                'id': video_id,
+                'title': video.get('title'),
+                'alt_title': video.get('subtitle'),
+                'thumbnail': url_or_none(try_get(video, lambda x: x['mainImage']['url'], compat_str)),
+                'duration': int_or_none(video.get('durationSeconds')),
+                'view_count': int_or_none(video.get('views')),
+                'ie_key': ArteTVIE.ie_key(),
+            })
         title = collection.get('title')
         description = collection.get('shortDescription') or collection.get('teaserText')
-        entries = [
-            self._extract_from_json_url(
-                video['jsonUrl'], video.get('programId') or playlist_id, lang)
-            for video in collection['videos'] if video.get('jsonUrl')]
         return self.playlist_result(entries, playlist_id, title, description)
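
For reference, the new embed flow reduces to pulling json_url out of the player URL's query string and re-dispatching to ArteTVIE; a minimal standalone sketch of the same steps, using the stdlib instead of the compat layer:

    import re
    from urllib.parse import parse_qs, urlparse

    player_url = ('https://www.arte.tv/player/v5/index.php?json_url='
                  'https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A')
    json_url = parse_qs(urlparse(player_url).query)['json_url'][0]
    video_id = re.search(r'\d{6}-\d{3}-[AF]', json_url).group(0)  # same id pattern as _VALID_URL
    print(json_url)  # https://api.arte.tv/api/player/v2/config/de/100605-013-A
    print(video_id)  # 100605-013-A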

File: youtube_dlc/extractor/bandcamp.py

@@ -1,3 +1,4 @@
+# coding: utf-8
 from __future__ import unicode_literals

 import random
@@ -5,10 +6,7 @@
 import time

 from .common import InfoExtractor
-from ..compat import (
-    compat_str,
-    compat_urlparse,
-)
+from ..compat import compat_str
 from ..utils import (
     ExtractorError,
     float_or_none,
@@ -17,71 +15,32 @@
     parse_filesize,
     str_or_none,
     try_get,
-    unescapeHTML,
     update_url_query,
     unified_strdate,
     unified_timestamp,
     url_or_none,
+    urljoin,
 )


-class BandcampBaseIE(InfoExtractor):
-    """Provide base functions for Bandcamp extractors"""
-
-    def _extract_json_from_html_data_attribute(self, webpage, suffix, video_id):
-        json_string = self._html_search_regex(
-            r' data-%s="([^"]*)' % suffix,
-            webpage, '%s json' % suffix, default='{}')
-        return self._parse_json(json_string, video_id)
-
-    def _parse_json_track(self, json):
-        formats = []
-        file_ = json.get('file')
-        if isinstance(file_, dict):
-            for format_id, format_url in file_.items():
-                if not url_or_none(format_url):
-                    continue
-                ext, abr_str = format_id.split('-', 1)
-                formats.append({
-                    'format_id': format_id,
-                    'url': self._proto_relative_url(format_url, 'http:'),
-                    'ext': ext,
-                    'vcodec': 'none',
-                    'acodec': ext,
-                    'abr': int_or_none(abr_str),
-                })
-        return {
-            'duration': float_or_none(json.get('duration')),
-            'id': str_or_none(json.get('track_id') or json.get('id')),
-            'title': json.get('title'),
-            'title_link': json.get('title_link'),
-            'number': int_or_none(json.get('track_num')),
-            'formats': formats
-        }
-
-
-class BandcampIE(BandcampBaseIE):
-    IE_NAME = "Bandcamp:track"
-    _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<title>[^/?#&]+)'
+class BandcampIE(InfoExtractor):
+    _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<id>[^/?#&]+)'
     _TESTS = [{
         'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
         'md5': 'c557841d5e50261777a6585648adf439',
         'info_dict': {
             'id': '1812978515',
             'ext': 'mp3',
-            'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad",
+            'title': "youtube-dl \"'/\\ä↭ - youtube-dl \"'/\\ä↭ - youtube-dl test song \"'/\\ä↭",
             'duration': 9.8485,
-            'uploader': "youtube-dl \"'/\\\u00e4\u21ad",
+            'uploader': 'youtube-dl "\'/\\ä↭',
+            'timestamp': 1354224127,
             'upload_date': '20121129',
-            'timestamp': 1354224127,
         },
         '_skip': 'There is a limit of 200 free downloads / month for the test song'
     }, {
         # free download
         'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
-        'md5': '5d92af55811e47f38962a54c30b07ef0',
         'info_dict': {
             'id': '2650410135',
             'ext': 'aiff',
@@ -120,52 +79,59 @@ class BandcampIE(BandcampBaseIE):
         },
     }]

+    def _extract_data_attr(self, webpage, video_id, attr='tralbum', fatal=True):
+        return self._parse_json(self._html_search_regex(
+            r'data-%s=(["\'])({.+?})\1' % attr, webpage,
+            attr + ' data', group=2), video_id, fatal=fatal)
+
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        title = mobj.group('title')
-        url_track_title = title
+        title = self._match_id(url)
         webpage = self._download_webpage(url, title)
-        thumbnail = self._html_search_meta('og:image', webpage, default=None)
-
-        json_tralbum = self._extract_json_from_html_data_attribute(webpage, "tralbum", url_track_title)
-        json_embed = self._extract_json_from_html_data_attribute(webpage, "embed", url_track_title)
-
-        json_tracks = json_tralbum.get('trackinfo')
-        if not json_tracks:
-            raise ExtractorError('Could not extract track')
-
-        track = self._parse_json_track(json_tracks[0])
-        artist = json_tralbum.get('artist')
-        album_title = json_embed.get('album_title')
-
-        json_album = json_tralbum.get('packages')
-        if json_album:
-            json_album = json_album[0]
-            album_publish_date = json_album.get('album_publish_date')
-            album_release_date = json_album.get('album_release_date')
-        else:
-            album_publish_date = None
-            album_release_date = json_tralbum.get('album_release_date')
-
-        timestamp = unified_timestamp(json_tralbum.get('current', {}).get('publish_date') or album_publish_date)
-        release_date = unified_strdate(album_release_date)
-
-        download_link = self._search_regex(
-            r'freeDownloadPage(?:["\']|&quot;):\s*(["\']|&quot;)(?P<url>(?:(?!\1).)+)\1', webpage,
-            'download link', default=None, group='url')
+        tralbum = self._extract_data_attr(webpage, title)
+        thumbnail = self._og_search_thumbnail(webpage)
+
+        track_id = None
+        track = None
+        track_number = None
+        duration = None
+
+        formats = []
+        track_info = try_get(tralbum, lambda x: x['trackinfo'][0], dict)
+        if track_info:
+            file_ = track_info.get('file')
+            if isinstance(file_, dict):
+                for format_id, format_url in file_.items():
+                    if not url_or_none(format_url):
+                        continue
+                    ext, abr_str = format_id.split('-', 1)
+                    formats.append({
+                        'format_id': format_id,
+                        'url': self._proto_relative_url(format_url, 'http:'),
+                        'ext': ext,
+                        'vcodec': 'none',
+                        'acodec': ext,
+                        'abr': int_or_none(abr_str),
+                    })
+            track = track_info.get('title')
+            track_id = str_or_none(
+                track_info.get('track_id') or track_info.get('id'))
+            track_number = int_or_none(track_info.get('track_num'))
+            duration = float_or_none(track_info.get('duration'))
+
+        embed = self._extract_data_attr(webpage, title, 'embed', False)
+        current = tralbum.get('current') or {}
+        artist = embed.get('artist') or current.get('artist') or tralbum.get('artist')
+        timestamp = unified_timestamp(
+            current.get('publish_date') or tralbum.get('album_publish_date'))

+        download_link = tralbum.get('freeDownloadPage')
         if download_link:
-            track_id = self._search_regex(
-                r'\?id=(?P<id>\d+)&',
-                download_link, 'track id')
+            track_id = compat_str(tralbum['id'])

             download_webpage = self._download_webpage(
                 download_link, track_id, 'Downloading free downloads page')

-            blob = self._parse_json(
-                self._search_regex(
-                    r'data-blob=(["\'])(?P<blob>{.+?})\1', download_webpage,
-                    'blob', group='blob'),
-                track_id, transform_source=unescapeHTML)
+            blob = self._extract_data_attr(download_webpage, track_id, 'blob')

             info = try_get(
                 blob, (lambda x: x['digital_items'][0],
@@ -173,6 +139,8 @@ def _real_extract(self, url):
             if info:
                 downloads = info.get('downloads')
                 if isinstance(downloads, dict):
+                    if not track:
+                        track = info.get('title')
                     if not artist:
                         artist = info.get('artist')
                     if not thumbnail:
@@ -206,7 +174,7 @@ def _real_extract(self, url):
                     retry_url = url_or_none(stat.get('retry_url'))
                     if not retry_url:
                         continue
-                    track['formats'].append({
+                    formats.append({
                         'url': self._proto_relative_url(retry_url, 'http:'),
                         'ext': download_formats.get(format_id),
                         'format_id': format_id,
@@ -215,30 +183,34 @@ def _real_extract(self, url):
                         'vcodec': 'none',
                     })

-        self._sort_formats(track['formats'])
+        self._sort_formats(formats)

-        title = '%s - %s' % (artist, track.get('title')) if artist else track.get('title')
+        title = '%s - %s' % (artist, track) if artist else track
+
+        if not duration:
+            duration = float_or_none(self._html_search_meta(
+                'duration', webpage, default=None))

         return {
-            'album': album_title,
-            'artist': artist,
-            'duration': track['duration'],
-            'formats': track['formats'],
-            'id': track['id'],
-            'release_date': release_date,
-            'thumbnail': thumbnail,
-            'timestamp': timestamp,
+            'id': track_id,
             'title': title,
-            'track': track['title'],
-            'track_id': track['id'],
-            'track_number': track['number'],
-            'uploader': artist
+            'thumbnail': thumbnail,
+            'uploader': artist,
+            'timestamp': timestamp,
+            'release_date': unified_strdate(tralbum.get('album_release_date')),
+            'duration': duration,
+            'track': track,
+            'track_number': track_number,
+            'track_id': track_id,
+            'artist': artist,
+            'album': embed.get('album_title'),
+            'formats': formats,
         }


-class BandcampAlbumIE(BandcampBaseIE):
+class BandcampAlbumIE(BandcampIE):
     IE_NAME = 'Bandcamp:album'
-    _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^/?#&]+))?'
+    _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<id>[^/?#&]+))?'

     _TESTS = [{
         'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
@@ -248,7 +220,10 @@ class BandcampAlbumIE(BandcampBaseIE):
             'info_dict': {
                 'id': '1353101989',
                 'ext': 'mp3',
-                'title': 'Intro',
+                'title': 'Blazo - Intro',
+                'timestamp': 1311756226,
+                'upload_date': '20110727',
+                'uploader': 'Blazo',
             }
         },
         {
@@ -256,7 +231,10 @@ class BandcampAlbumIE(BandcampBaseIE):
             'info_dict': {
                 'id': '38097443',
                 'ext': 'mp3',
-                'title': 'Kero One - Keep It Alive (Blazo remix)',
+                'title': 'Blazo - Kero One - Keep It Alive (Blazo remix)',
+                'timestamp': 1311757238,
+                'upload_date': '20110727',
+                'uploader': 'Blazo',
             }
         },
         ],
@@ -292,6 +270,7 @@ class BandcampAlbumIE(BandcampBaseIE):
             'title': '"Entropy" EP',
             'uploader_id': 'jstrecords',
             'id': 'entropy-ep',
+            'description': 'md5:0ff22959c943622972596062f2f366a5',
         },
         'playlist_mincount': 3,
     }, {
@@ -301,6 +280,7 @@ class BandcampAlbumIE(BandcampBaseIE):
             'id': 'we-are-the-plague',
             'title': 'WE ARE THE PLAGUE',
             'uploader_id': 'insulters',
+            'description': 'md5:b3cf845ee41b2b1141dc7bde9237255f',
         },
         'playlist_count': 2,
     }]
@@ -312,41 +292,34 @@ def suitable(cls, url):
                 else super(BandcampAlbumIE, cls).suitable(url))

     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        uploader_id = mobj.group('subdomain')
-        album_id = mobj.group('album_id')
+        uploader_id, album_id = re.match(self._VALID_URL, url).groups()
         playlist_id = album_id or uploader_id
         webpage = self._download_webpage(url, playlist_id)
-
-        json_tralbum = self._extract_json_from_html_data_attribute(webpage, "tralbum", playlist_id)
-        json_embed = self._extract_json_from_html_data_attribute(webpage, "embed", playlist_id)
-
-        json_tracks = json_tralbum.get('trackinfo')
-        if not json_tracks:
-            raise ExtractorError('Could not extract album tracks')
-
-        album_title = json_embed.get('album_title')
-
+        tralbum = self._extract_data_attr(webpage, playlist_id)
+        track_info = tralbum.get('trackinfo')
+        if not track_info:
+            raise ExtractorError('The page doesn\'t contain any tracks')
         # Only tracks with duration info have songs
-        tracks = [self._parse_json_track(track) for track in json_tracks]
         entries = [
             self.url_result(
-                compat_urlparse.urljoin(url, track['title_link']),
-                ie=BandcampIE.ie_key(), video_id=track['id'],
-                video_title=track['title'])
-            for track in tracks
-            if track.get('duration')]
+                urljoin(url, t['title_link']), BandcampIE.ie_key(),
+                str_or_none(t.get('track_id') or t.get('id')), t.get('title'))
+            for t in track_info
+            if t.get('duration')]
+
+        current = tralbum.get('current') or {}

         return {
             '_type': 'playlist',
             'uploader_id': uploader_id,
             'id': playlist_id,
-            'title': album_title,
-            'entries': entries
+            'title': current.get('title'),
+            'description': current.get('about'),
+            'entries': entries,
         }


-class BandcampWeeklyIE(InfoExtractor):
+class BandcampWeeklyIE(BandcampIE):
     IE_NAME = 'Bandcamp:weekly'
     _VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)'
     _TESTS = [{
@@ -361,29 +334,23 @@ class BandcampWeeklyIE(InfoExtractor):
             'release_date': '20170404',
             'series': 'Bandcamp Weekly',
             'episode': 'Magic Moments',
-            'episode_number': 208,
             'episode_id': '224',
-        }
+        },
+        'params': {
+            'format': 'opus-lo',
+        },
     }, {
         'url': 'https://bandcamp.com/?blah/blah@&show=228',
         'only_matching': True
     }]

     def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
+        show_id = self._match_id(url)
+        webpage = self._download_webpage(url, show_id)

-        blob = self._parse_json(
-            self._search_regex(
-                r'data-blob=(["\'])(?P<blob>{.+?})\1', webpage,
-                'blob', group='blob'),
-            video_id, transform_source=unescapeHTML)
+        blob = self._extract_data_attr(webpage, show_id, 'blob')

-        show = blob['bcw_show']
-
-        # This is desired because any invalid show id redirects to `bandcamp.com`
-        # which happens to expose the latest Bandcamp Weekly episode.
-        show_id = int_or_none(show.get('show_id')) or int_or_none(video_id)
+        show = blob['bcw_data'][show_id]

         formats = []
         for format_id, format_url in show['audio_stream'].items():
@@ -408,20 +375,8 @@ def _real_extract(self, url):
         if subtitle:
             title += ' - %s' % subtitle

-        episode_number = None
-        seq = blob.get('bcw_seq')
-
-        if seq and isinstance(seq, list):
-            try:
-                episode_number = next(
-                    int_or_none(e.get('episode_number'))
-                    for e in seq
-                    if isinstance(e, dict) and int_or_none(e.get('id')) == show_id)
-            except StopIteration:
-                pass
-
         return {
-            'id': video_id,
+            'id': show_id,
             'title': title,
             'description': show.get('desc') or show.get('short_desc'),
             'duration': float_or_none(show.get('audio_duration')),
@@ -429,7 +384,6 @@ def _real_extract(self, url):
             'release_date': unified_strdate(show.get('published_date')),
             'series': 'Bandcamp Weekly',
             'episode': show.get('subtitle'),
-            'episode_number': episode_number,
-            'episode_id': compat_str(video_id),
+            'episode_id': show_id,
             'formats': formats
         }
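
The rewrite centres on _extract_data_attr(), which reads Bandcamp's JSON straight out of HTML data attributes; a toy sketch of the same pattern (the page snippet here is invented, stdlib only):

    import html
    import json
    import re

    # A minimal page in the shape _extract_data_attr() expects; real Bandcamp
    # pages carry the full album/track JSON in the data-tralbum attribute.
    page = '<script data-tralbum="{&quot;artist&quot;:&quot;Blazo&quot;,&quot;id&quot;:1353101989}"></script>'

    raw = re.search(r'data-tralbum=(["\'])({.+?})\1', page).group(2)
    tralbum = json.loads(html.unescape(raw))
    print(tralbum['artist'], tralbum['id'])  # Blazo 1353101989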

File: youtube_dlc/extractor/cnbc.py

@@ -1,6 +1,7 @@
 # coding: utf-8
 from __future__ import unicode_literals

+import re
+
 from .common import InfoExtractor
 from ..utils import smuggle_url
@@ -38,7 +39,7 @@ def _real_extract(self, url):

 class CNBCVideoIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?cnbc\.com/video/(?:[^/]+/)+(?P<id>[^./?#&]+)'
+    _VALID_URL = r'https?://(?:www\.)?cnbc\.com(?P<path>/video/(?:[^/]+/)+(?P<id>[^./?#&]+)\.html)'
     _TEST = {
         'url': 'https://www.cnbc.com/video/2018/07/19/trump-i-dont-necessarily-agree-with-raising-rates.html',
         'info_dict': {
@@ -56,11 +57,15 @@ class CNBCVideoIE(InfoExtractor):
     }

     def _real_extract(self, url):
-        display_id = self._match_id(url)
-        webpage = self._download_webpage(url, display_id)
-        video_id = self._search_regex(
-            r'content_id["\']\s*:\s*["\'](\d+)', webpage, display_id,
-            'video id')
+        path, display_id = re.match(self._VALID_URL, url).groups()
+        video_id = self._download_json(
+            'https://webql-redesign.cnbcfm.com/graphql', display_id, query={
+                'query': '''{
+  page(path: "%s") {
+    vcpsId
+  }
+}''' % path,
+            })['data']['page']['vcpsId']
         return self.url_result(
-            'http://video.cnbc.com/gallery/?video=%s' % video_id,
+            'http://video.cnbc.com/gallery/?video=%d' % video_id,
             CNBCIE.ie_key())
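
The new CNBCVideoIE resolves the numeric vcpsId with a single GraphQL request; roughly equivalent to this sketch (requests is used here purely for illustration, and the path comes from the extractor's own test URL):

    import requests

    path = '/video/2018/07/19/trump-i-dont-necessarily-agree-with-raising-rates.html'
    query = '{ page(path: "%s") { vcpsId } }' % path

    resp = requests.get('https://webql-redesign.cnbcfm.com/graphql',
                        params={'query': query})
    print(resp.json()['data']['page']['vcpsId'])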

File: youtube_dlc/extractor/common.py

@@ -1456,9 +1456,10 @@ def _is_valid_url(self, url, video_id, item='video', headers={}):
         try:
             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
             return True
-        except ExtractorError:
+        except ExtractorError as e:
             self.to_screen(
-                '%s: %s URL is invalid, skipping' % (video_id, item))
+                '%s: %s URL is invalid, skipping: %s'
+                % (video_id, item, error_to_compat_str(e.cause)))
             return False

     def http_scheme(self):

File: youtube_dlc/extractor/condenast.py

@@ -16,6 +16,8 @@
     mimetype2ext,
     orderedSet,
     parse_iso8601,
+    strip_or_none,
+    try_get,
 )
@@ -82,6 +84,7 @@ class CondeNastIE(InfoExtractor):
         'uploader': 'gq',
         'upload_date': '20170321',
         'timestamp': 1490126427,
+        'description': 'How much grimmer would things be if these people were competent?',
     },
 }, {
     # JS embed
@@ -93,7 +96,7 @@ class CondeNastIE(InfoExtractor):
         'title': '3D printed TSA Travel Sentry keys really do open TSA locks',
         'uploader': 'arstechnica',
         'upload_date': '20150916',
-        'timestamp': 1442434955,
+        'timestamp': 1442434920,
     }
 }, {
     'url': 'https://player.cnevids.com/inline/video/59138decb57ac36b83000005.js?target=js-cne-player',
@@ -196,6 +199,13 @@ def _extract_video(self, params):
             })
         self._sort_formats(formats)

+        subtitles = {}
+        for t, caption in video_info.get('captions', {}).items():
+            caption_url = caption.get('src')
+            if not (t in ('vtt', 'srt', 'tml') and caption_url):
+                continue
+            subtitles.setdefault('en', []).append({'url': caption_url})
+
         return {
             'id': video_id,
             'formats': formats,
@@ -208,6 +218,7 @@ def _extract_video(self, params):
             'season': video_info.get('season_title'),
             'timestamp': parse_iso8601(video_info.get('premiere_date')),
             'categories': video_info.get('categories'),
+            'subtitles': subtitles,
         }

     def _real_extract(self, url):
@@ -224,6 +235,14 @@ def _real_extract(self, url):
         if url_type == 'series':
             return self._extract_series(url, webpage)
         else:
+            video = try_get(self._parse_json(self._search_regex(
+                r'__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
+                'preload state', '{}'), display_id),
+                lambda x: x['transformed']['video'])
+            if video:
+                params = {'videoId': video['id']}
+                info = {'description': strip_or_none(video.get('description'))}
+            else:
                 params = self._extract_video_params(webpage, display_id)
                 info = self._search_json_ld(

File: youtube_dlc/extractor/extractors.py

@@ -62,7 +62,7 @@
     ARDMediathekIE,
 )
 from .arte import (
-    ArteTVPlus7IE,
+    ArteTVIE,
     ArteTVEmbedIE,
     ArteTVPlaylistIE,
 )
@@ -542,6 +542,7 @@
     EHFTVIE,
     ITTFIE,
 )
+from .lbry import LBRYIE
 from .lci import LCIIE
 from .lcp import (
     LcpPlayIE,
@@ -1079,8 +1080,7 @@
     SpankBangPlaylistIE,
 )
 from .spankwire import SpankwireIE
-from .spiegel import SpiegelIE, SpiegelArticleIE
-from .spiegeltv import SpiegeltvIE
+from .spiegel import SpiegelIE
 from .spike import (
     BellatorIE,
     ParamountNetworkIE,
@@ -1505,12 +1505,11 @@
 from .yourupload import YourUploadIE
 from .youtube import (
     YoutubeIE,
-    YoutubeChannelIE,
     YoutubeFavouritesIE,
     YoutubeHistoryIE,
     YoutubeLiveIE,
+    YoutubeTabIE,
     YoutubePlaylistIE,
-    YoutubePlaylistsIE,
     YoutubeRecommendedIE,
     YoutubeSearchDateIE,
     YoutubeSearchIE,
@@ -1519,7 +1518,7 @@
     YoutubeSubscriptionsIE,
     YoutubeTruncatedIDIE,
     YoutubeTruncatedURLIE,
-    YoutubeUserIE,
+    YoutubeYtUserIE,
     YoutubeWatchLaterIE,
 )
 from .zapiks import ZapiksIE

File: youtube_dlc/extractor/francetv.py

@@ -17,6 +17,7 @@
     parse_duration,
     try_get,
     url_or_none,
+    urljoin,
 )
 from .dailymotion import DailymotionIE
@@ -128,18 +129,38 @@ def sign(manifest_url, manifest_id):
         is_live = None

-        formats = []
-        for video in info['videos']:
-            if video['statut'] != 'ONLINE':
+        videos = []
+
+        for video in (info.get('videos') or []):
+            if video.get('statut') != 'ONLINE':
                 continue
-            video_url = video['url']
+            if not video.get('url'):
+                continue
+            videos.append(video)
+
+        if not videos:
+            for device_type in ['desktop', 'mobile']:
+                fallback_info = self._download_json(
+                    'https://player.webservices.francetelevisions.fr/v1/videos/%s' % video_id,
+                    video_id, 'Downloading fallback %s video JSON' % device_type, query={
+                        'device_type': device_type,
+                        'browser': 'chrome',
+                    }, fatal=False)
+                if fallback_info and fallback_info.get('video'):
+                    videos.append(fallback_info['video'])
+
+        formats = []
+        for video in videos:
+            video_url = video.get('url')
             if not video_url:
                 continue
             if is_live is None:
                 is_live = (try_get(
-                    video, lambda x: x['plages_ouverture'][0]['direct'],
-                    bool) is True) or '/live.francetv.fr/' in video_url
-            format_id = video['format']
+                    video, lambda x: x['plages_ouverture'][0]['direct'], bool) is True
+                    or video.get('is_live') is True
+                    or '/live.francetv.fr/' in video_url)
+            format_id = video.get('format')
             ext = determine_ext(video_url)
             if ext == 'f4m':
                 if georestricted:
@@ -154,6 +175,9 @@ def sign(manifest_url, manifest_id):
                     sign(video_url, format_id), video_id, 'mp4',
                     entry_protocol='m3u8_native', m3u8_id=format_id,
                     fatal=False))
+            elif ext == 'mpd':
+                formats.extend(self._extract_mpd_formats(
+                    sign(video_url, format_id), video_id, mpd_id=format_id, fatal=False))
             elif video_url.startswith('rtmp'):
                 formats.append({
                     'url': video_url,
@@ -166,6 +190,7 @@ def sign(manifest_url, manifest_id):
                     'url': video_url,
                     'format_id': format_id,
                 })
+
         self._sort_formats(formats)

         title = info['titre']
@@ -185,10 +210,10 @@ def sign(manifest_url, manifest_id):
         return {
             'id': video_id,
             'title': self._live_title(title) if is_live else title,
-            'description': clean_html(info['synopsis']),
-            'thumbnail': compat_urlparse.urljoin('http://pluzz.francetv.fr', info['image']),
-            'duration': int_or_none(info.get('real_duration')) or parse_duration(info['duree']),
-            'timestamp': int_or_none(info['diffusion']['timestamp']),
+            'description': clean_html(info.get('synopsis')),
+            'thumbnail': urljoin('http://pluzz.francetv.fr', info.get('image')),
+            'duration': int_or_none(info.get('real_duration')) or parse_duration(info.get('duree')),
+            'timestamp': int_or_none(try_get(info, lambda x: x['diffusion']['timestamp'])),
             'is_live': is_live,
             'formats': formats,
             'subtitles': subtitles,

File: youtube_dlc/extractor/generic.py

@@ -91,6 +91,7 @@
 from .videa import VideaIE
 from .twentymin import TwentyMinutenIE
 from .ustream import UstreamIE
+from .arte import ArteTVEmbedIE
 from .videopress import VideoPressIE
 from .rutube import RutubeIE
 from .limelight import LimelightBaseIE
@@ -2760,11 +2761,9 @@ def _real_extract(self, url):
             return self.url_result(ustream_url, UstreamIE.ie_key())

         # Look for embedded arte.tv player
-        mobj = re.search(
-            r'<(?:script|iframe) [^>]*?src="(?P<url>http://www\.arte\.tv/(?:playerv2/embed|arte_vp/index)[^"]+)"',
-            webpage)
-        if mobj is not None:
-            return self.url_result(mobj.group('url'), 'ArteTVEmbed')
+        arte_urls = ArteTVEmbedIE._extract_urls(webpage)
+        if arte_urls:
+            return self.playlist_from_matches(arte_urls, video_id, video_title)

         # Look for embedded francetv player
         mobj = re.search(
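
Since the generic extractor now delegates arte.tv detection to ArteTVEmbedIE._extract_urls(), that check is just the regex from arte.py applied to the page; for example:

    import re

    ARTE_EMBED_RE = (r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//'
                     r'(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1')
    webpage = ('<iframe src="https://www.arte.tv/player/v5/index.php?json_url='
               'https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A"></iframe>')
    print([url for _, url in re.findall(ARTE_EMBED_RE, webpage)])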

File: (typo fix in another extractor; file name not shown in this capture)

@@ -150,7 +150,7 @@ def run(self, target, ip, timestamp):
         elif function in other_functions:
             other_functions[function]()
         else:
-            raise ExtractorError('Unknown funcion %s' % function)
+            raise ExtractorError('Unknown function %s' % function)
     return sdk.target

File: youtube_dlc/extractor/lbry.py (new file)

@@ -0,0 +1,88 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    int_or_none,
+    mimetype2ext,
+    try_get,
+)
+
+
+class LBRYIE(InfoExtractor):
+    IE_NAME = 'lbry.tv'
+    _VALID_URL = r'https?://(?:www\.)?(?:lbry\.tv|odysee\.com)/(?P<id>@[0-9a-zA-Z-]+:[0-9a-z]+/[0-9a-zA-Z().-]+:[0-9a-z])'
+    _TESTS = [{
+        # Video
+        'url': 'https://lbry.tv/@Mantega:1/First-day-LBRY:1',
+        'md5': '65bd7ec1f6744ada55da8e4c48a2edf9',
+        'info_dict': {
+            'id': '17f983b61f53091fb8ea58a9c56804e4ff8cff4d',
+            'ext': 'mp4',
+            'title': 'First day in LBRY? Start HERE!',
+            'description': 'md5:f6cb5c704b332d37f5119313c2c98f51',
+            'timestamp': 1595694354,
+            'upload_date': '20200725',
+        }
+    }, {
+        # Audio
+        'url': 'https://lbry.tv/@LBRYFoundation:0/Episode-1:e',
+        'md5': 'c94017d3eba9b49ce085a8fad6b98d00',
+        'info_dict': {
+            'id': 'e7d93d772bd87e2b62d5ab993c1c3ced86ebb396',
+            'ext': 'mp3',
+            'title': 'The LBRY Foundation Community Podcast Episode 1 - Introduction, Streaming on LBRY, Transcoding',
+            'description': 'md5:661ac4f1db09f31728931d7b88807a61',
+            'timestamp': 1591312601,
+            'upload_date': '20200604',
+        }
+    }, {
+        'url': 'https://odysee.com/@BrodieRobertson:5/apple-is-tracking-everything-you-do-on:e',
+        'only_matching': True,
+    }]
+
+    def _call_api_proxy(self, method, display_id, params):
+        return self._download_json(
+            'https://api.lbry.tv/api/v1/proxy', display_id,
+            headers={'Content-Type': 'application/json-rpc'},
+            data=json.dumps({
+                'method': method,
+                'params': params,
+            }).encode())['result']
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url).replace(':', '#')
+        uri = 'lbry://' + display_id
+        result = self._call_api_proxy(
+            'resolve', display_id, {'urls': [uri]})[uri]
+        result_value = result['value']
+        if result_value.get('stream_type') not in ('video', 'audio'):
+            raise ExtractorError('Unsupported URL', expected=True)
+        streaming_url = self._call_api_proxy(
+            'get', display_id, {'uri': uri})['streaming_url']
+        source = result_value.get('source') or {}
+        media = result_value.get('video') or result_value.get('audio') or {}
+        signing_channel = result_value.get('signing_channel') or {}
+        return {
+            'id': result['claim_id'],
+            'title': result_value['title'],
+            'thumbnail': try_get(result_value, lambda x: x['thumbnail']['url'], compat_str),
+            'description': result_value.get('description'),
+            'license': result_value.get('license'),
+            'timestamp': int_or_none(result.get('timestamp')),
+            'tags': result_value.get('tags'),
+            'width': int_or_none(media.get('width')),
+            'height': int_or_none(media.get('height')),
+            'duration': int_or_none(media.get('duration')),
+            'channel': signing_channel.get('name'),
+            'channel_id': signing_channel.get('claim_id'),
+            'ext': determine_ext(source.get('name')) or mimetype2ext(source.get('media_type')),
+            'filesize': int_or_none(source.get('size')),
+            'url': streaming_url,
+        }
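
The proxy wrapper above is plain JSON-RPC over HTTPS; its resolve call is equivalent to this sketch (requests for illustration only; endpoint, headers and payload shape are exactly those in _call_api_proxy):

    import json
    import requests

    uri = 'lbry://@Mantega#1/First-day-LBRY#1'  # display_id with ':' mapped to '#'
    resp = requests.post(
        'https://api.lbry.tv/api/v1/proxy',
        headers={'Content-Type': 'application/json-rpc'},
        data=json.dumps({'method': 'resolve', 'params': {'urls': [uri]}}))
    claim = resp.json()['result'][uri]
    print(claim['claim_id'], claim['value']['title'])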

File: youtube_dlc/extractor/lrt.py

@@ -5,28 +5,26 @@
 from .common import InfoExtractor
 from ..utils import (
-    determine_ext,
-    int_or_none,
-    parse_duration,
-    remove_end,
+    clean_html,
+    merge_dicts,
 )


 class LRTIE(InfoExtractor):
     IE_NAME = 'lrt.lt'
-    _VALID_URL = r'https?://(?:www\.)?lrt\.lt/mediateka/irasas/(?P<id>[0-9]+)'
+    _VALID_URL = r'https?://(?:www\.)?lrt\.lt(?P<path>/mediateka/irasas/(?P<id>[0-9]+))'
     _TESTS = [{
         # m3u8 download
-        'url': 'http://www.lrt.lt/mediateka/irasas/54391/',
-        'md5': 'fe44cf7e4ab3198055f2c598fc175cb0',
+        'url': 'https://www.lrt.lt/mediateka/irasas/2000127261/greita-ir-gardu-sicilijos-ikvepta-klasikiniu-makaronu-su-baklazanais-vakariene',
+        'md5': '85cb2bb530f31d91a9c65b479516ade4',
         'info_dict': {
-            'id': '54391',
+            'id': '2000127261',
             'ext': 'mp4',
-            'title': 'Septynios Kauno dienos',
-            'description': 'md5:24d84534c7dc76581e59f5689462411a',
-            'duration': 1783,
-            'view_count': int,
-            'like_count': int,
+            'title': 'Greita ir gardu: Sicilijos įkvėpta klasikinių makaronų su baklažanais vakarienė',
+            'description': 'md5:ad7d985f51b0dc1489ba2d76d7ed47fa',
+            'duration': 3035,
+            'timestamp': 1604079000,
+            'upload_date': '20201030',
         },
     }, {
         # direct mp3 download
@@ -43,52 +41,35 @@ class LRTIE(InfoExtractor):
         },
     }]

+    def _extract_js_var(self, webpage, var_name, default):
+        return self._search_regex(
+            r'%s\s*=\s*(["\'])((?:(?!\1).)+)\1' % var_name,
+            webpage, var_name.replace('_', ' '), default, group=2)
+
     def _real_extract(self, url):
-        video_id = self._match_id(url)
+        path, video_id = re.match(self._VALID_URL, url).groups()
         webpage = self._download_webpage(url, video_id)

-        title = remove_end(self._og_search_title(webpage), ' - LRT')
+        media_url = self._extract_js_var(webpage, 'main_url', path)
+        media = self._download_json(self._extract_js_var(
+            webpage, 'media_info_url',
+            'https://www.lrt.lt/servisai/stream_url/vod/media_info/'),
+            video_id, query={'url': media_url})
+        jw_data = self._parse_jwplayer_data(
+            media['playlist_item'], video_id, base_url=url)

-        formats = []
-        for _, file_url in re.findall(
-                r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage):
-            ext = determine_ext(file_url)
-            if ext not in ('m3u8', 'mp3'):
-                continue
-            # mp3 served as m3u8 produces stuttered media file
-            if ext == 'm3u8' and '.mp3' in file_url:
-                continue
-            if ext == 'm3u8':
-                formats.extend(self._extract_m3u8_formats(
-                    file_url, video_id, 'mp4', entry_protocol='m3u8_native',
-                    fatal=False))
-            elif ext == 'mp3':
-                formats.append({
-                    'url': file_url,
-                    'vcodec': 'none',
-                })
-        self._sort_formats(formats)
+        json_ld_data = self._search_json_ld(webpage, video_id)

-        thumbnail = self._og_search_thumbnail(webpage)
-        description = self._og_search_description(webpage)
-        duration = parse_duration(self._search_regex(
-            r'var\s+record_len\s*=\s*(["\'])(?P<duration>[0-9]+:[0-9]+:[0-9]+)\1',
-            webpage, 'duration', default=None, group='duration'))
+        tags = []
+        for tag in (media.get('tags') or []):
+            tag_name = tag.get('name')
+            if not tag_name:
+                continue
+            tags.append(tag_name)

-        view_count = int_or_none(self._html_search_regex(
-            r'<div[^>]+class=(["\']).*?record-desc-seen.*?\1[^>]*>(?P<count>.+?)</div>',
-            webpage, 'view count', fatal=False, group='count'))
-        like_count = int_or_none(self._search_regex(
-            r'<span[^>]+id=(["\'])flikesCount.*?\1>(?P<count>\d+)<',
-            webpage, 'like count', fatal=False, group='count'))
+        clean_info = {
+            'description': clean_html(media.get('content')),
+            'tags': tags,
+        }

-        return {
-            'id': video_id,
-            'title': title,
-            'formats': formats,
-            'thumbnail': thumbnail,
-            'description': description,
-            'duration': duration,
-            'view_count': view_count,
-            'like_count': like_count,
-        }
+        return merge_dicts(clean_info, jw_data, json_ld_data)
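
The final merge relies on merge_dicts() giving earlier dicts priority (later values only fill missing keys, or replace empty strings), so the cleaned description wins over whatever the JW player data and JSON-LD carry:

    from youtube_dlc.utils import merge_dicts

    print(merge_dicts({'description': 'clean'}, {'description': 'jw', 'title': 't'}))
    # {'description': 'clean', 'title': 't'}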

File: youtube_dlc/extractor/malltv.py

@@ -1,10 +1,16 @@
 # coding: utf-8
 from __future__ import unicode_literals

-import re
-
 from .common import InfoExtractor
-from ..utils import merge_dicts
+from ..utils import (
+    clean_html,
+    dict_get,
+    float_or_none,
+    int_or_none,
+    merge_dicts,
+    parse_duration,
+    try_get,
+)


 class MallTVIE(InfoExtractor):
@@ -17,7 +23,7 @@ class MallTVIE(InfoExtractor):
         'display_id': '18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice',
         'ext': 'mp4',
         'title': '18 miliard pro neziskovky. Opravdu jsou sportovci nebo Člověk v tísni pijavice?',
-        'description': 'md5:25fc0ec42a72ba602b602c683fa29deb',
+        'description': 'md5:db7d5744a4bd4043d9d98324aa72ab35',
         'duration': 216,
         'timestamp': 1538870400,
         'upload_date': '20181007',
@@ -37,20 +43,46 @@ def _real_extract(self, url):
         webpage = self._download_webpage(
             url, display_id, headers=self.geo_verification_headers())

-        SOURCE_RE = r'(<source[^>]+\bsrc=(?:(["\'])(?:(?!\2).)+|[^\s]+)/(?P<id>[\da-z]+)/index)\b'
+        video = self._parse_json(self._search_regex(
+            r'videoObject\s*=\s*JSON\.parse\(JSON\.stringify\(({.+?})\)\);',
+            webpage, 'video object'), display_id)
+        video_source = video['VideoSource']
         video_id = self._search_regex(
-            SOURCE_RE, webpage, 'video id', group='id')
+            r'/([\da-z]+)/index\b', video_source, 'video id')

-        media = self._parse_html5_media_entries(
-            url, re.sub(SOURCE_RE, r'\1.m3u8', webpage), video_id,
-            m3u8_id='hls', m3u8_entry_protocol='m3u8_native')[0]
+        formats = self._extract_m3u8_formats(
+            video_source + '.m3u8', video_id, 'mp4', 'm3u8_native')
+        self._sort_formats(formats)
+
+        subtitles = {}
+        for s in (video.get('Subtitles') or {}):
+            s_url = s.get('Url')
+            if not s_url:
+                continue
+            subtitles.setdefault(s.get('Language') or 'cz', []).append({
+                'url': s_url,
+            })
+
+        entity_counts = video.get('EntityCounts') or {}
+
+        def get_count(k):
+            v = entity_counts.get(k + 's') or {}
+            return int_or_none(dict_get(v, ('Count', 'StrCount')))

         info = self._search_json_ld(webpage, video_id, default={})

-        return merge_dicts(media, info, {
+        return merge_dicts({
             'id': video_id,
             'display_id': display_id,
-            'title': self._og_search_title(webpage, default=None) or display_id,
-            'description': self._og_search_description(webpage, default=None),
-            'thumbnail': self._og_search_thumbnail(webpage, default=None),
-        })
+            'title': video.get('Title'),
+            'description': clean_html(video.get('Description')),
+            'thumbnail': video.get('ThumbnailUrl'),
+            'formats': formats,
+            'subtitles': subtitles,
+            'duration': int_or_none(video.get('DurationSeconds')) or parse_duration(video.get('Duration')),
+            'view_count': get_count('View'),
+            'like_count': get_count('Like'),
+            'dislike_count': get_count('Dislike'),
+            'average_rating': float_or_none(try_get(video, lambda x: x['EntityRating']['AvarageRate'])),
+            'comment_count': get_count('Comment'),
+        }, info)

File: youtube_dlc/extractor/mgtv.py

@@ -17,9 +17,8 @@
 class MGTVIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?mgtv\.com/(v|b)/(?:[^/]+/)*(?P<id>\d+)\.html'
+    _VALID_URL = r'https?://(?:w(?:ww)?\.)?mgtv\.com/(v|b)/(?:[^/]+/)*(?P<id>\d+)\.html'
     IE_DESC = '芒果TV'
-    _GEO_COUNTRIES = ['CN']

     _TESTS = [{
         'url': 'http://www.mgtv.com/v/1/290525/f/3116640.html',
@@ -34,14 +33,18 @@ class MGTVIE(InfoExtractor):
     }, {
         'url': 'http://www.mgtv.com/b/301817/3826653.html',
         'only_matching': True,
+    }, {
+        'url': 'https://w.mgtv.com/b/301817/3826653.html',
+        'only_matching': True,
     }]

     def _real_extract(self, url):
         video_id = self._match_id(url)
+        tk2 = base64.urlsafe_b64encode(b'did=%s|pno=1030|ver=0.3.0301|clit=%d' % (compat_str(uuid.uuid4()).encode(), time.time()))[::-1]
         try:
             api_data = self._download_json(
                 'https://pcweb.api.mgtv.com/player/video', video_id, query={
-                    'tk2': base64.urlsafe_b64encode(b'did=%s|pno=1030|ver=0.3.0301|clit=%d' % (compat_str(uuid.uuid4()).encode(), time.time()))[::-1],
+                    'tk2': tk2,
                     'video_id': video_id,
                 }, headers=self.geo_verification_headers())['data']
         except ExtractorError as e:
@@ -56,6 +59,7 @@ def _real_extract(self, url):
         stream_data = self._download_json(
             'https://pcweb.api.mgtv.com/player/getSource', video_id, query={
                 'pm2': api_data['atc']['pm2'],
+                'tk2': tk2,
                 'video_id': video_id,
             }, headers=self.geo_verification_headers())['data']
         stream_domain = stream_data['stream_domain'][0]
@@ -403,6 +403,18 @@ class MTVIE(MTVServicesInfoExtractor):
         'only_matching': True,
     }]

+    @staticmethod
+    def extract_child_with_type(parent, t):
+        children = parent['children']
+        return next(c for c in children if c.get('type') == t)
+
+    def _extract_mgid(self, webpage):
+        data = self._parse_json(self._search_regex(
+            r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None)
+        main_container = self.extract_child_with_type(data, 'MainContainer')
+        video_player = self.extract_child_with_type(main_container, 'VideoPlayer')
+        return video_player['props']['media']['video']['config']['uri']
+

 class MTVJapanIE(MTVServicesInfoExtractor):
     IE_NAME = 'mtvjapan'
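The new _extract_mgid walks the page's __DATA__ component tree by node type. The same walk on a toy tree (structure inferred from the code above, values invented):

    def extract_child_with_type(parent, t):
        return next(c for c in parent['children'] if c.get('type') == t)

    data = {
        'children': [{
            'type': 'MainContainer',
            'children': [{
                'type': 'VideoPlayer',
                'props': {'media': {'video': {'config': {'uri': 'mgid:arc:video:mtv.com:xyz'}}}},
            }],
        }],
    }
    main_container = extract_child_with_type(data, 'MainContainer')
    video_player = extract_child_with_type(main_container, 'VideoPlayer')
    assert video_player['props']['media']['video']['config']['uri'].startswith('mgid:')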
@@ -10,7 +10,6 @@
 from ..compat import compat_urllib_parse_unquote
 from ..utils import (
     int_or_none,
-    js_to_json,
     parse_duration,
     smuggle_url,
     try_get,
@@ -394,8 +393,8 @@ def _real_extract(self, url):
         webpage = self._download_webpage(url, video_id)

         data = self._parse_json(self._search_regex(
-            r'window\.__data\s*=\s*({.+});', webpage,
-            'bootstrap json'), video_id, js_to_json)
+            r'<script[^>]+id="__NEXT_DATA__"[^>]*>({.+?})</script>',
+            webpage, 'bootstrap json'), video_id)['props']['initialState']
         video_data = try_get(data, lambda x: x['video']['current'], dict)
         if not video_data:
             video_data = data['article']['content'][0]['primaryMedia']['video']
@@ -82,6 +82,29 @@ class NDRIE(NDRBaseIE):
         'params': {
             'skip_download': True,
         },
+    }, {
+        # with subtitles
+        'url': 'https://www.ndr.de/fernsehen/sendungen/extra_3/extra-3-Satiremagazin-mit-Christian-Ehring,sendung1091858.html',
+        'info_dict': {
+            'id': 'extra18674',
+            'display_id': 'extra-3-Satiremagazin-mit-Christian-Ehring',
+            'ext': 'mp4',
+            'title': 'Extra 3 vom 11.11.2020 mit Christian Ehring',
+            'description': 'md5:42ee53990a715eaaf4dc7f13a3bd56c6',
+            'uploader': 'ndrtv',
+            'upload_date': '20201113',
+            'duration': 1749,
+            'subtitles': {
+                'de': [{
+                    'ext': 'ttml',
+                    'url': r're:^https://www\.ndr\.de.+',
+                }],
+            },
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'expected_warnings': ['Unable to download f4m manifest'],
     }, {
         'url': 'https://www.ndr.de/Fettes-Brot-Ferris-MC-und-Thees-Uhlmann-live-on-stage,festivalsommer116.html',
         'only_matching': True,
@@ -242,6 +265,20 @@ def _real_extract(self, url):
                 'preference': quality_key(thumbnail.get('quality')),
             })

+        subtitles = {}
+        tracks = config.get('tracks')
+        if tracks and isinstance(tracks, list):
+            for track in tracks:
+                if not isinstance(track, dict):
+                    continue
+                track_url = urljoin(url, track.get('src'))
+                if not track_url:
+                    continue
+                subtitles.setdefault(track.get('srclang') or 'de', []).append({
+                    'url': track_url,
+                    'ext': 'ttml',
+                })
+
         return {
             'id': video_id,
             'title': title,
@@ -251,6 +288,7 @@ def _real_extract(self, url):
             'duration': duration,
             'thumbnails': thumbnails,
             'formats': formats,
+            'subtitles': subtitles,
         }
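The subtitles dict maps language codes to lists of track dicts; setdefault keeps multiple tracks per language. Shape of the result, with made-up track data:

    from urllib.parse import urljoin

    def collect_subtitles(base_url, tracks):
        subtitles = {}
        for track in tracks or []:
            src = track.get('src') if isinstance(track, dict) else None
            if not src:
                continue
            subtitles.setdefault(track.get('srclang') or 'de', []).append({
                'url': urljoin(base_url, src),
                'ext': 'ttml',
            })
        return subtitles

    collect_subtitles('https://www.ndr.de/video1.html',
                      [{'src': '/subs/extra3.ttml', 'srclang': 'de'}])
    # {'de': [{'url': 'https://www.ndr.de/subs/extra3.ttml', 'ext': 'ttml'}]}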
@@ -17,7 +17,7 @@
     int_or_none,
     parse_duration,
     strip_or_none,
-    try_get,
+    unescapeHTML,
     unified_strdate,
     unified_timestamp,
     update_url_query,
@@ -30,7 +30,6 @@ class RaiBaseIE(InfoExtractor):
     _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}'
     _GEO_COUNTRIES = ['IT']
     _GEO_BYPASS = False
-    _BASE_URL = 'https://www.raiplay.it'

     def _extract_relinker_info(self, relinker_url, video_id):
         if not re.match(r'https?://', relinker_url):
@@ -123,8 +122,27 @@ def _extract_subtitles(url, subtitle_url):
 class RaiPlayIE(RaiBaseIE):
-    _VALID_URL = r'(?P<url>(?P<base>https?://(?:www\.)?raiplay\.it/.+?-)(?P<id>%s)(?P<ext>\.(?:html|json)))' % RaiBaseIE._UUID_RE
+    _VALID_URL = r'(?P<url>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>%s)\.(?:html|json))' % RaiBaseIE._UUID_RE
     _TESTS = [{
+        'url': 'http://www.raiplay.it/video/2016/10/La-Casa-Bianca-e06118bb-59a9-4636-b914-498e4cfd2c66.html?source=twitter',
+        'md5': '340aa3b7afb54bfd14a8c11786450d76',
+        'info_dict': {
+            'id': 'e06118bb-59a9-4636-b914-498e4cfd2c66',
+            'ext': 'mp4',
+            'title': 'La Casa Bianca',
+            'alt_title': 'S2016 - Puntata del 23/10/2016',
+            'description': 'md5:a09d45890850458077d1f68bb036e0a5',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'uploader': 'Rai 3',
+            'creator': 'Rai 3',
+            'duration': 3278,
+            'timestamp': 1477764300,
+            'upload_date': '20161029',
+            'series': 'La Casa Bianca',
+            'season': '2016',
+        },
+        'skip': 'This content is not available',
+    }, {
         'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html',
         'md5': '8970abf8caf8aef4696e7b1f2adfc696',
         'info_dict': {
@@ -136,6 +154,8 @@ class RaiPlayIE(RaiBaseIE):
             'thumbnail': r're:^https?://.*\.jpg$',
             'uploader': 'Rai Gulp',
             'duration': 6160,
+            'series': 'Report',
+            'season': '2013/14',
         },
         'params': {
             'skip_download': True,
@@ -146,11 +166,10 @@ class RaiPlayIE(RaiBaseIE):
     }]

     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        url, base, video_id, ext = mobj.group('url', 'base', 'id', 'ext')
+        url, video_id = re.match(self._VALID_URL, url).groups()

         media = self._download_json(
-            '%s%s.json' % (base, video_id), video_id, 'Downloading video JSON')
+            url.replace('.html', '.json'), video_id, 'Downloading video JSON')

         title = media['name']
         video = media['video']
@@ -159,34 +178,38 @@ def _real_extract(self, url):
         self._sort_formats(relinker_info['formats'])

         thumbnails = []
-        if 'images' in media:
-            for _, value in media.get('images').items():
-                if value:
-                    thumbnails.append({
-                        'url': urljoin(RaiBaseIE._BASE_URL, value.replace('[RESOLUTION]', '600x400'))
-                    })
+        for _, value in media.get('images', {}).items():
+            if value:
+                thumbnails.append({
+                    'url': urljoin(url, value),
+                })

-        timestamp = unified_timestamp(try_get(
-            media, lambda x: x['availabilities'][0]['start'], compat_str))
+        date_published = media.get('date_published')
+        time_published = media.get('time_published')
+        if date_published and time_published:
+            date_published += ' ' + time_published

         subtitles = self._extract_subtitles(url, video.get('subtitles'))

+        program_info = media.get('program_info') or {}
+        season = media.get('season')
+
         info = {
             'id': video_id,
             'title': self._live_title(title) if relinker_info.get(
                 'is_live') else title,
-            'alt_title': media.get('subtitle'),
+            'alt_title': strip_or_none(media.get('subtitle')),
             'description': media.get('description'),
             'uploader': strip_or_none(media.get('channel')),
-            'creator': strip_or_none(media.get('editor')),
+            'creator': strip_or_none(media.get('editor') or None),
             'duration': parse_duration(video.get('duration')),
-            'timestamp': timestamp,
+            'timestamp': unified_timestamp(date_published),
             'thumbnails': thumbnails,
-            'series': try_get(
-                media, lambda x: x['isPartOf']['name'], compat_str),
-            'season_number': int_or_none(try_get(
-                media, lambda x: x['isPartOf']['numeroStagioni'])),
-            'season': media.get('stagione') or None,
+            'series': program_info.get('name'),
+            'season_number': int_or_none(season),
+            'season': season if (season and not season.isdigit()) else None,
+            'episode': media.get('episode_title'),
+            'episode_number': int_or_none(media.get('episode')),
             'subtitles': subtitles,
         }
@@ -203,7 +226,7 @@ class RaiPlayLiveIE(RaiBaseIE):
             'display_id': 'rainews24',
             'ext': 'mp4',
             'title': 're:^Diretta di Rai News 24 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
-            'description': 'md5:4d00bcf6dc98b27c6ec480de329d1497',
+            'description': 'md5:6eca31500550f9376819f174e5644754',
             'uploader': 'Rai News 24',
             'creator': 'Rai News 24',
             'is_live': True,
@@ -216,32 +239,20 @@ class RaiPlayLiveIE(RaiBaseIE):
     def _real_extract(self, url):
         display_id = self._match_id(url)

-        media = self._download_json(
-            '%s.json' % urljoin(RaiBaseIE._BASE_URL, 'dirette/' + display_id),
-            display_id, 'Downloading channel JSON')
-
-        title = media['name']
-        video = media['video']
-        video_id = media['id'].replace('ContentItem-', '')
-
-        relinker_info = self._extract_relinker_info(video['content_url'], video_id)
-        self._sort_formats(relinker_info['formats'])
-
-        info = {
-            'id': video_id,
-            'display_id': display_id,
-            'title': self._live_title(title) if relinker_info.get(
-                'is_live') else title,
-            'alt_title': media.get('subtitle'),
-            'description': media.get('description'),
-            'uploader': strip_or_none(media.get('channel')),
-            'creator': strip_or_none(media.get('editor')),
-            'duration': parse_duration(video.get('duration')),
-        }
-
-        info.update(relinker_info)
-
-        return info
+        webpage = self._download_webpage(url, display_id)
+
+        video_id = self._search_regex(
+            r'data-uniquename=["\']ContentItem-(%s)' % RaiBaseIE._UUID_RE,
+            webpage, 'content id')
+
+        return {
+            '_type': 'url_transparent',
+            'ie_key': RaiPlayIE.ie_key(),
+            'url': 'http://www.raiplay.it/dirette/ContentItem-%s.html' % video_id,
+            'id': video_id,
+            'display_id': display_id,
+        }
@@ -250,7 +261,7 @@ class RaiPlayPlaylistIE(InfoExtractor):
         'info_dict': {
             'id': 'nondirloalmiocapo',
             'title': 'Non dirlo al mio capo',
-            'description': 'md5:98ab6b98f7f44c2843fd7d6f045f153b',
+            'description': 'md5:9f3d603b2947c1c7abb098f3b14fac86',
         },
         'playlist_mincount': 12,
     }]
@@ -258,22 +269,18 @@ class RaiPlayPlaylistIE(InfoExtractor):
     def _real_extract(self, url):
         playlist_id = self._match_id(url)

-        media = self._download_json(
-            '%s.json' % urljoin(RaiBaseIE._BASE_URL, 'programmi/' + playlist_id),
-            playlist_id, 'Downloading program JSON')
+        webpage = self._download_webpage(url, playlist_id)

-        title = media['name']
-        description = media['program_info']['description']
-
-        content_sets = [s['id'] for b in media['blocks'] for s in b['sets']]
+        title = self._html_search_meta(
+            ('programma', 'nomeProgramma'), webpage, 'title')
+        description = unescapeHTML(self._html_search_meta(
+            ('description', 'og:description'), webpage, 'description'))

         entries = []
-        for cs in content_sets:
-            medias = self._download_json(
-                '%s/%s.json' % (urljoin(RaiBaseIE._BASE_URL, 'programmi/' + playlist_id), cs),
-                cs, 'Downloading content set JSON')
-            for m in medias['items']:
-                video_url = urljoin(url, m['path_id'])
-                entries.append(self.url_result(
-                    video_url, ie=RaiPlayIE.ie_key(),
-                    video_id=RaiPlayIE._match_id(video_url)))
+        for mobj in re.finditer(
+                r'<a\b[^>]+\bhref=(["\'])(?P<path>/raiplay/video/.+?)\1',
+                webpage):
+            video_url = urljoin(url, mobj.group('path'))
+            entries.append(self.url_result(
+                video_url, ie=RaiPlayIE.ie_key(),
+                video_id=RaiPlayIE._match_id(video_url)))
@@ -294,7 +301,8 @@ class RaiIE(RaiBaseIE):
             'thumbnail': r're:^https?://.*\.jpg$',
             'duration': 1758,
             'upload_date': '20140612',
-        }
+        },
+        'skip': 'This content is available only in Italy',
     }, {
         # with ContentItem in many metas
         'url': 'http://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html',
@@ -320,6 +328,19 @@ class RaiIE(RaiBaseIE):
             'duration': 2214,
             'upload_date': '20161103',
         }
+    }, {
+        # drawMediaRaiTV(...)
+        'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html',
+        'md5': '2dd727e61114e1ee9c47f0da6914e178',
+        'info_dict': {
+            'id': '59d69d28-6bb6-409d-a4b5-ed44096560af',
+            'ext': 'mp4',
+            'title': 'Il pacco',
+            'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'upload_date': '20141221',
+        },
+        'skip': 'This content is not available',
     }, {
         # initEdizione('ContentItem-...'
         'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined',
@@ -331,6 +352,18 @@ class RaiIE(RaiBaseIE):
             'upload_date': '20170401',
         },
         'skip': 'Changes daily',
+    }, {
+        # HDS live stream with only relinker URL
+        'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews',
+        'info_dict': {
+            'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc',
+            'ext': 'flv',
+            'title': 'EuroNews',
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'skip': 'This content is available only in Italy',
     }, {
         # HLS live stream with ContentItem in og:url
         'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html',
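RaiPlay's JSON now exposes date_published and time_published separately, so the extractor concatenates them before parsing. The same merge with plain datetime (the field format here is an assumption for illustration):

    from datetime import datetime

    date_published, time_published = '2016-10-29', '19:25'
    if date_published and time_published:
        date_published += ' ' + time_published
    dt = datetime.strptime(date_published, '%Y-%m-%d %H:%M')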
@@ -1,9 +1,15 @@
 # coding: utf-8
 from __future__ import unicode_literals

-import re
-
 from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    float_or_none,
+    int_or_none,
+    unified_timestamp,
+    urlencode_postdata,
+    url_or_none,
+)


 class ServusIE(InfoExtractor):
@@ -12,20 +18,29 @@ class ServusIE(InfoExtractor):
                         (?:www\.)?
                         (?:
                             servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)|
-                            servustv\.com/videos
+                            (?:servustv|pm-wissen)\.com/videos
                         )
                         /(?P<id>[aA]{2}-\w+|\d+-\d+)
                     '''
     _TESTS = [{
         # new URL schema
         'url': 'https://www.servustv.com/videos/aa-1t6vbu5pw1w12/',
-        'md5': '3e1dd16775aa8d5cbef23628cfffc1f4',
+        'md5': '60474d4c21f3eb148838f215c37f02b9',
         'info_dict': {
             'id': 'AA-1T6VBU5PW1W12',
             'ext': 'mp4',
             'title': 'Die Grünen aus Sicht des Volkes',
+            'alt_title': 'Talk im Hangar-7 Voxpops Gruene',
             'description': 'md5:1247204d85783afe3682644398ff2ec4',
             'thumbnail': r're:^https?://.*\.jpg',
+            'duration': 62.442,
+            'timestamp': 1605193976,
+            'upload_date': '20201112',
+            'series': 'Talk im Hangar-7',
+            'season': 'Season 9',
+            'season_number': 9,
+            'episode': 'Episode 31 - September 14',
+            'episode_number': 31,
         }
     }, {
         # old URL schema
@@ -40,30 +55,94 @@ class ServusIE(InfoExtractor):
     }, {
         'url': 'https://www.servus.com/tv/videos/1380889096408-1235196658/',
         'only_matching': True,
+    }, {
+        'url': 'https://www.pm-wissen.com/videos/aa-24mus4g2w2112/',
+        'only_matching': True,
     }]

     def _real_extract(self, url):
         video_id = self._match_id(url).upper()
-        webpage = self._download_webpage(url, video_id)

-        title = self._search_regex(
-            (r'videoLabel\s*=\s*(["\'])(?P<title>(?:(?!\1).)+)\1',
-             r'<h\d+[^>]+\bclass=["\']heading--(?:one|two)["\'][^>]*>(?P<title>[^<]+)'),
-            webpage, 'title', default=None,
-            group='title') or self._og_search_title(webpage)
-        title = re.sub(r'\s*-\s*Servus TV\s*$', '', title)
-        description = self._og_search_description(webpage)
-        thumbnail = self._og_search_thumbnail(webpage)
+        token = self._download_json(
+            'https://auth.redbullmediahouse.com/token', video_id,
+            'Downloading token', data=urlencode_postdata({
+                'grant_type': 'client_credentials',
+            }), headers={
+                'Authorization': 'Basic SVgtMjJYNEhBNFdEM1cxMTpEdDRVSkFLd2ZOMG5IMjB1NGFBWTBmUFpDNlpoQ1EzNA==',
+            })
+        access_token = token['access_token']
+        token_type = token.get('token_type', 'Bearer')

-        formats = self._extract_m3u8_formats(
-            'https://stv.rbmbtnx.net/api/v1/manifests/%s.m3u8' % video_id,
-            video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls')
+        video = self._download_json(
+            'https://sparkle-api.liiift.io/api/v1/stv/channels/international/assets/%s' % video_id,
+            video_id, 'Downloading video JSON', headers={
+                'Authorization': '%s %s' % (token_type, access_token),
+            })
+
+        formats = []
+        thumbnail = None
+        for resource in video['resources']:
+            if not isinstance(resource, dict):
+                continue
+            format_url = url_or_none(resource.get('url'))
+            if not format_url:
+                continue
+            extension = resource.get('extension')
+            type_ = resource.get('type')
+            if extension == 'jpg' or type_ == 'reference_keyframe':
+                thumbnail = format_url
+                continue
+            ext = determine_ext(format_url)
+            if type_ == 'dash' or ext == 'mpd':
+                formats.extend(self._extract_mpd_formats(
+                    format_url, video_id, mpd_id='dash', fatal=False))
+            elif type_ == 'hls' or ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    format_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False))
+            elif extension == 'mp4' or ext == 'mp4':
+                formats.append({
+                    'url': format_url,
+                    'format_id': type_,
+                    'width': int_or_none(resource.get('width')),
+                    'height': int_or_none(resource.get('height')),
+                })
         self._sort_formats(formats)

+        attrs = {}
+        for attribute in video['attributes']:
+            if not isinstance(attribute, dict):
+                continue
+            key = attribute.get('fieldKey')
+            value = attribute.get('fieldValue')
+            if not key or not value:
+                continue
+            attrs[key] = value
+
+        title = attrs.get('title_stv') or video_id
+        alt_title = attrs.get('title')
+        description = attrs.get('long_description') or attrs.get('short_description')
+        series = attrs.get('label')
+        season = attrs.get('season')
+        episode = attrs.get('chapter')
+        duration = float_or_none(attrs.get('duration'), scale=1000)
+        season_number = int_or_none(self._search_regex(
+            r'Season (\d+)', season or '', 'season number', default=None))
+        episode_number = int_or_none(self._search_regex(
+            r'Episode (\d+)', episode or '', 'episode number', default=None))
+
         return {
             'id': video_id,
             'title': title,
+            'alt_title': alt_title,
             'description': description,
             'thumbnail': thumbnail,
+            'duration': duration,
+            'timestamp': unified_timestamp(video.get('lastPublished')),
+            'series': series,
+            'season': season,
+            'season_number': season_number,
+            'episode': episode,
+            'episode_number': episode_number,
             'formats': formats,
         }
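Each asset resource is routed by its type/extension: keyframes become the thumbnail, DASH and HLS manifests are expanded, and bare MP4s become progressive formats. The dispatch in isolation (sample resource list invented):

    def classify(resource):
        url = resource.get('url') or ''
        ext = url.rsplit('.', 1)[-1] if '.' in url else ''
        type_ = resource.get('type')
        if resource.get('extension') == 'jpg' or type_ == 'reference_keyframe':
            return 'thumbnail'
        if type_ == 'dash' or ext == 'mpd':
            return 'dash'
        if type_ == 'hls' or ext == 'm3u8':
            return 'hls'
        if resource.get('extension') == 'mp4' or ext == 'mp4':
            return 'progressive'
        return 'unknown'

    resources = [
        {'url': 'https://cdn.example/a.m3u8', 'type': 'hls'},
        {'url': 'https://cdn.example/a.mpd', 'type': 'dash'},
        {'url': 'https://cdn.example/key.jpg', 'extension': 'jpg'},
    ]
    assert [classify(r) for r in resources] == ['hls', 'dash', 'thumbnail']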
@@ -1,159 +1,54 @@
 # coding: utf-8
 from __future__ import unicode_literals

-import re
-
 from .common import InfoExtractor
-from .nexx import (
-    NexxIE,
-    NexxEmbedIE,
-)
-from .spiegeltv import SpiegeltvIE
-from ..compat import compat_urlparse
-from ..utils import (
-    parse_duration,
-    strip_or_none,
-    unified_timestamp,
-)
+from .jwplatform import JWPlatformIE


 class SpiegelIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<id>[0-9]+)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$'
+    _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}'
+    _VALID_URL = r'https?://(?:www\.)?(?:spiegel|manager-magazin)\.de(?:/[^/]+)+/[^/]*-(?P<id>[0-9]+|%s)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$' % _UUID_RE
     _TESTS = [{
         'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html',
-        'md5': 'b57399839d055fccfeb9a0455c439868',
+        'md5': '50c7948883ec85a3e431a0a44b7ad1d6',
         'info_dict': {
-            'id': '563747',
+            'id': 'II0BUyxY',
+            'display_id': '1259285',
             'ext': 'mp4',
-            'title': 'Vulkanausbruch in Ecuador: Der "Feuerschlund" ist wieder aktiv',
+            'title': 'Vulkan Tungurahua in Ecuador ist wieder aktiv - DER SPIEGEL - Wissenschaft',
             'description': 'md5:8029d8310232196eb235d27575a8b9f4',
-            'duration': 49,
+            'duration': 48.0,
             'upload_date': '20130311',
-            'timestamp': 1362994320,
+            'timestamp': 1362997920,
         },
     }, {
         'url': 'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html',
-        'md5': '5b6c2f4add9d62912ed5fc78a1faed80',
-        'info_dict': {
-            'id': '580988',
-            'ext': 'mp4',
-            'title': 'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers',
-            'description': 'md5:c2322b65e58f385a820c10fa03b2d088',
-            'duration': 983,
-            'upload_date': '20131115',
-            'timestamp': 1384546642,
-        },
-    }, {
-        'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-embed.html',
-        'md5': '97b91083a672d72976faa8433430afb9',
-        'info_dict': {
-            'id': '601883',
-            'ext': 'mp4',
-            'description': 'SPIEGEL ONLINE-Nutzer durften den deutschen Astronauten Alexander Gerst über sein Leben auf der ISS-Station befragen. Hier kommen seine Antworten auf die besten sechs Fragen.',
-            'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"',
-            'upload_date': '20140904',
-            'timestamp': 1409834160,
-        }
-    }, {
-        'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html',
         'only_matching': True,
     }, {
-        # nexx video
+        'url': 'https://www.spiegel.de/video/eifel-zoo-aufregung-um-ausgebrochene-raubtiere-video-99018031.html',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.spiegel.de/panorama/urteile-im-goldmuenzenprozess-haftstrafen-fuer-clanmitglieder-a-aae8df48-43c1-4c61-867d-23f0a2d254b7',
+        'only_matching': True,
+    }, {
         'url': 'http://www.spiegel.de/video/spiegel-tv-magazin-ueber-guellekrise-in-schleswig-holstein-video-99012776.html',
         'only_matching': True,
-    }]
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        metadata_url = 'http://www.spiegel.de/video/metadata/video-%s.json' % video_id
-        handle = self._request_webpage(metadata_url, video_id)
-
-        # 302 to spiegel.tv, like http://www.spiegel.de/video/der-film-zum-wochenende-die-wahrheit-ueber-maenner-video-99003272.html
-        if SpiegeltvIE.suitable(handle.geturl()):
-            return self.url_result(handle.geturl(), 'Spiegeltv')
-
-        video_data = self._parse_json(self._webpage_read_content(
-            handle, metadata_url, video_id), video_id)
-        title = video_data['title']
-        nexx_id = video_data['nexxOmniaId']
-        domain_id = video_data.get('nexxOmniaDomain') or '748'
-
-        return {
-            '_type': 'url_transparent',
-            'id': video_id,
-            'url': 'nexx:%s:%s' % (domain_id, nexx_id),
-            'title': title,
-            'description': strip_or_none(video_data.get('teaser')),
-            'duration': parse_duration(video_data.get('duration')),
-            'timestamp': unified_timestamp(video_data.get('datum')),
-            'ie_key': NexxIE.ie_key(),
-        }
-
-
-class SpiegelArticleIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/(?!video/)[^?#]*?-(?P<id>[0-9]+)\.html'
-    IE_NAME = 'Spiegel:Article'
-    IE_DESC = 'Articles on spiegel.de'
-
-    _TESTS = [{
+    }, {
         'url': 'http://www.spiegel.de/sport/sonst/badminton-wm-die-randsportart-soll-populaerer-werden-a-987092.html',
-        'info_dict': {
-            'id': '1516455',
-            'ext': 'mp4',
-            'title': 'Faszination Badminton: Nennt es bloß nicht Federball',
-            'description': 're:^Patrick Kämnitz gehört.{100,}',
-            'upload_date': '20140825',
-        },
-    }, {
-        'url': 'http://www.spiegel.de/wissenschaft/weltall/astronaut-alexander-gerst-antwortet-spiegel-online-lesern-a-989876.html',
-        'info_dict': {
-        },
-        'playlist_count': 6,
-    }, {
-        # Nexx iFrame embed
-        'url': 'http://www.spiegel.de/sptv/spiegeltv/spiegel-tv-ueber-schnellste-katapult-achterbahn-der-welt-taron-a-1137884.html',
-        'info_dict': {
-            'id': '161464',
-            'ext': 'mp4',
-            'title': 'Nervenkitzel Achterbahn',
-            'alt_title': 'Karussellbauer in Deutschland',
-            'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc',
-            'release_year': 2005,
-            'creator': 'SPIEGEL TV',
-            'thumbnail': r're:^https?://.*\.jpg$',
-            'duration': 2761,
-            'timestamp': 1394021479,
-            'upload_date': '20140305',
-        },
-        'params': {
-            'format': 'bestvideo',
-            'skip_download': True,
-        },
+        'only_matching': True,
     }]

     def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
-
-        # Single video on top of the page
-        video_link = self._search_regex(
-            r'<a href="([^"]+)" onclick="return spOpenVideo\(this,', webpage,
-            'video page URL', default=None)
-        if video_link:
-            video_url = compat_urlparse.urljoin(
-                self.http_scheme() + '//spiegel.de/', video_link)
-            return self.url_result(video_url)
-
-        # Multiple embedded videos
-        embeds = re.findall(
-            r'<div class="vid_holder[0-9]+.*?</div>\s*.*?url\s*=\s*"([^"]+)"',
-            webpage)
-        entries = [
-            self.url_result(compat_urlparse.urljoin(
-                self.http_scheme() + '//spiegel.de/', embed_path))
-            for embed_path in embeds]
-        if embeds:
-            return self.playlist_result(entries)
-
-        return self.playlist_from_matches(
-            NexxEmbedIE._extract_urls(webpage), ie=NexxEmbedIE.ie_key())
+        media_id = self._html_search_regex(
+            r'(&#34;|["\'])mediaId\1\s*:\s*(&#34;|["\'])(?P<id>(?:(?!\2).)+)\2',
+            webpage, 'media id', group='id')
+        return {
+            '_type': 'url_transparent',
+            'id': video_id,
+            'display_id': video_id,
+            'url': 'jwplatform:%s' % media_id,
+            'title': self._og_search_title(webpage, default=None),
+            'ie_key': JWPlatformIE.ie_key(),
+        }
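The mediaId regex tolerates both raw and HTML-entity-encoded quotes, since the pages ship the player config in either form. Exercising it on both encodings (sample snippets invented):

    import re

    MEDIA_ID_RE = r'(&#34;|["\'])mediaId\1\s*:\s*(&#34;|["\'])(?P<id>(?:(?!\2).)+)\2'
    for snippet in ('"mediaId": "II0BUyxY"', '&#34;mediaId&#34;:&#34;II0BUyxY&#34;'):
        assert re.search(MEDIA_ID_RE, snippet).group('id') == 'II0BUyxY'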
@@ -8,8 +8,8 @@
 class TwentyThreeVideoIE(InfoExtractor):
     IE_NAME = '23video'
-    _VALID_URL = r'https?://video\.(?P<domain>twentythree\.net|23video\.com|filmweb\.no)/v\.ihtml/player\.html\?(?P<query>.*?\bphoto(?:_|%5f)id=(?P<id>\d+).*)'
-    _TEST = {
+    _VALID_URL = r'https?://(?P<domain>[^.]+\.(?:twentythree\.net|23video\.com|filmweb\.no))/v\.ihtml/player\.html\?(?P<query>.*?\bphoto(?:_|%5f)id=(?P<id>\d+).*)'
+    _TESTS = [{
         'url': 'https://video.twentythree.net/v.ihtml/player.html?showDescriptions=0&source=site&photo%5fid=20448876&autoPlay=1',
         'md5': '75fcf216303eb1dae9920d651f85ced4',
         'info_dict': {
@@ -21,11 +21,14 @@ class TwentyThreeVideoIE(InfoExtractor):
             'uploader_id': '12258964',
             'uploader': 'Rasmus Bysted',
         }
-    }
+    }, {
+        'url': 'https://bonnier-publications-danmark.23video.com/v.ihtml/player.html?token=f0dc46476e06e13afd5a1f84a29e31e8&source=embed&photo%5fid=36137620',
+        'only_matching': True,
+    }]

     def _real_extract(self, url):
         domain, query, photo_id = re.match(self._VALID_URL, url).groups()
-        base_url = 'https://video.%s' % domain
+        base_url = 'https://%s' % domain
         photo_data = self._download_json(
             base_url + '/api/photo/list?' + query, photo_id, query={
                 'format': 'json',
@@ -2,8 +2,11 @@
 from __future__ import unicode_literals

 from .common import InfoExtractor
-from ..utils import unified_timestamp
-import re
+from ..utils import (
+    dict_get,
+    int_or_none,
+    unified_timestamp,
+)


 class URPlayIE(InfoExtractor):
@@ -14,7 +17,7 @@ class URPlayIE(InfoExtractor):
         'info_dict': {
             'id': '203704',
             'ext': 'mp4',
-            'title': 'Om vetenskap, kritiskt tänkande och motstånd',
+            'title': 'UR Samtiden - Livet, universum och rymdens märkliga musik : Om vetenskap, kritiskt tänkande och motstånd',
             'description': 'md5:5344508a52aa78c1ced6c1b8b9e44e9a',
             'timestamp': 1513292400,
             'upload_date': '20171214',
@@ -26,7 +29,7 @@ class URPlayIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'Tripp, Trapp, Träd : Sovkudde',
             'description': 'md5:b86bffdae04a7e9379d1d7e5947df1d1',
-            'timestamp': 1440093600,
+            'timestamp': 1440086400,
             'upload_date': '20150820',
         },
     }, {
@@ -36,28 +39,27 @@ class URPlayIE(InfoExtractor):

     def _real_extract(self, url):
         video_id = self._match_id(url)
+        url = url.replace('skola.se/Produkter', 'play.se/program')
         webpage = self._download_webpage(url, video_id)
-        urplayer_data = re.sub("&quot;", "\"", self._search_regex(
-            r'components\/Player\/Player\" data-react-props=\"({.+?})\"',
-            webpage, 'urplayer data'))
-        urplayer_data = self._parse_json(urplayer_data, video_id)
-        for i in range(len(urplayer_data['accessibleEpisodes'])):
-            if urplayer_data.get('accessibleEpisodes', {})[i].get('id') == int(video_id):
-                urplayer_data = urplayer_data['accessibleEpisodes'][i]
-                break
+        urplayer_data = self._parse_json(self._html_search_regex(
+            r'data-react-class="components/Player/Player"[^>]+data-react-props="({.+?})"',
+            webpage, 'urplayer data'), video_id)['currentProduct']
+        episode = urplayer_data['title']
         host = self._download_json('http://streaming-loadbalancer.ur.se/loadbalancer.json', video_id)['redirect']

         formats = []
-        urplayer_streams = urplayer_data.get("streamingInfo")
-        for quality in ('sd'), ('hd'):
-            location = (urplayer_streams.get("raw", {}).get(quality, {}).get("location")
-                        or urplayer_streams.get("sweComplete", {}).get(quality, {}).get("location"))
-            if location:
+        urplayer_streams = urplayer_data.get('streamingInfo', {})
+
+        for k, v in urplayer_streams.get('raw', {}).items():
+            if not (k in ('sd', 'hd') and isinstance(v, dict)):
+                continue
+            file_http = v.get('location')
+            if file_http:
                 formats.extend(self._extract_wowza_formats(
-                    'http://%s/%s/playlist.m3u8' % (host, location), video_id,
-                    skip_protocols=['f4m', 'rtmp', 'rtsp']))
+                    'http://%s/%splaylist.m3u8' % (host, file_http),
+                    video_id, skip_protocols=['f4m', 'rtmp', 'rtsp']))
         self._sort_formats(formats)
+
         subtitles = {}
         subs = urplayer_streams.get("sweComplete", {}).get("tt", {}).get("location")
         if subs:
@@ -65,14 +67,37 @@ def _real_extract(self, url):
                 'url': subs,
             })

+        image = urplayer_data.get('image') or {}
+        thumbnails = []
+        for k, v in image.items():
+            t = {
+                'id': k,
+                'url': v,
+            }
+            wh = k.split('x')
+            if len(wh) == 2:
+                t.update({
+                    'width': int_or_none(wh[0]),
+                    'height': int_or_none(wh[1]),
+                })
+            thumbnails.append(t)
+
+        series = urplayer_data.get('series') or {}
+        series_title = dict_get(series, ('seriesTitle', 'title')) or dict_get(urplayer_data, ('seriesTitle', 'mainTitle'))
+
         return {
             'id': video_id,
-            'title': urplayer_data['title'],
-            'description': self._og_search_description(webpage),
-            'thumbnail': urplayer_data.get('image', {}).get('1280x720'),
-            'timestamp': unified_timestamp(self._html_search_meta(('uploadDate', 'schema:uploadDate'),
-                                                                  webpage, 'timestamp')),
-            'series': urplayer_data.get('seriesTitle'),
             'subtitles': subtitles,
+            'title': '%s : %s' % (series_title, episode) if series_title else episode,
+            'description': urplayer_data.get('description'),
+            'thumbnails': thumbnails,
+            'timestamp': unified_timestamp(urplayer_data.get('publishedAt')),
+            'series': series_title,
             'formats': formats,
+            'duration': int_or_none(urplayer_data.get('duration')),
+            'categories': urplayer_data.get('categories'),
+            'tags': urplayer_data.get('keywords'),
+            'season': series.get('label'),
+            'episode': episode,
+            'episode_number': int_or_none(urplayer_data.get('episodeNumber')),
         }
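Image variants are keyed by their pixel size, so width and height fall out of the key itself. The parsing step alone (key names assumed, URL invented):

    def thumbnails_from_image_map(image):
        thumbnails = []
        for k, v in (image or {}).items():
            t = {'id': k, 'url': v}
            wh = k.split('x')
            if len(wh) == 2 and all(p.isdigit() for p in wh):
                t.update({'width': int(wh[0]), 'height': int(wh[1])})
            thumbnails.append(t)
        return thumbnails

    thumbnails_from_image_map({'1280x720': 'https://assets.example/t.jpg'})
    # [{'id': '1280x720', 'url': '...', 'width': 1280, 'height': 720}]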
@@ -1,74 +1,24 @@
 # coding: utf-8
 from __future__ import unicode_literals

-from .adobepass import AdobePassIE
-from ..utils import (
-    NO_DEFAULT,
-    smuggle_url,
-    update_url_query,
-)
+from .nbc import NBCIE


-class USANetworkIE(AdobePassIE):
-    _VALID_URL = r'https?://(?:www\.)?usanetwork\.com/(?:[^/]+/videos|movies)/(?P<id>[^/?#]+)'
-    _TEST = {
-        'url': 'http://www.usanetwork.com/mrrobot/videos/hpe-cybersecurity',
-        'md5': '33c0d2ba381571b414024440d08d57fd',
+class USANetworkIE(NBCIE):
+    _VALID_URL = r'https?(?P<permalink>://(?:www\.)?usanetwork\.com/(?:[^/]+/videos?|movies?)/(?:[^/]+/)?(?P<id>\d+))'
+    _TESTS = [{
+        'url': 'https://www.usanetwork.com/peacock-trailers/video/intelligence-trailer/4185302',
         'info_dict': {
-            'id': '3086229',
+            'id': '4185302',
             'ext': 'mp4',
-            'title': 'HPE Cybersecurity',
-            'description': 'The more we digitize our world, the more vulnerable we are.',
-            'upload_date': '20160818',
-            'timestamp': 1471535460,
-            'uploader': 'NBCU-USA',
+            'title': 'Intelligence (Trailer)',
+            'description': 'A maverick NSA agent enlists the help of a junior systems analyst in a workplace power grab.',
+            'upload_date': '20200715',
+            'timestamp': 1594785600,
+            'uploader': 'NBCU-MPAT',
         },
-    }
-
-    def _real_extract(self, url):
-        display_id = self._match_id(url)
-        webpage = self._download_webpage(url, display_id)
-
-        def _x(name, default=NO_DEFAULT):
-            return self._search_regex(
-                r'data-%s\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % name,
-                webpage, name, default=default, group='value')
-
-        video_id = _x('mpx-guid')
-        title = _x('episode-title')
-        mpx_account_id = _x('mpx-account-id', '2304992029')
-
-        query = {
-            'mbr': 'true',
-        }
-        if _x('is-full-episode', None) == '1':
-            query['manifest'] = 'm3u'
-        if _x('is-entitlement', None) == '1':
-            adobe_pass = {}
-            drupal_settings = self._search_regex(
-                r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);',
-                webpage, 'drupal settings', fatal=False)
-            if drupal_settings:
-                drupal_settings = self._parse_json(drupal_settings, video_id, fatal=False)
-                if drupal_settings:
-                    adobe_pass = drupal_settings.get('adobePass', {})
-            resource = self._get_mvpd_resource(
-                adobe_pass.get('adobePassResourceId', 'usa'),
-                title, video_id, _x('episode-rating', 'TV-14'))
-            query['auth'] = self._extract_mvpd_auth(
-                url, video_id, adobe_pass.get('adobePassRequestorId', 'usa'), resource)
-
-        info = self._search_json_ld(webpage, video_id, default={})
-        info.update({
-            '_type': 'url_transparent',
-            'url': smuggle_url(update_url_query(
-                'http://link.theplatform.com/s/HNK2IC/media/guid/%s/%s' % (mpx_account_id, video_id),
-                query), {'force_smil_url': True}),
-            'id': video_id,
-            'title': title,
-            'series': _x('show-title', None),
-            'episode': title,
-            'ie_key': 'ThePlatform',
-        })
-        return info
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }]
@@ -19,7 +19,7 @@
 class UstreamIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?ustream\.tv/(?P<type>recorded|embed|embed/recorded)/(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:www\.)?(?:ustream\.tv|video\.ibm\.com)/(?P<type>recorded|embed|embed/recorded)/(?P<id>\d+)'
     IE_NAME = 'ustream'
     _TESTS = [{
         'url': 'http://www.ustream.tv/recorded/20274954',
@@ -67,12 +67,15 @@ class UstreamIE(InfoExtractor):
         'params': {
             'skip_download': True,  # m3u8 download
         },
+    }, {
+        'url': 'https://video.ibm.com/embed/recorded/128240221?&autoplay=true&controls=true&volume=100',
+        'only_matching': True,
     }]

     @staticmethod
     def _extract_url(webpage):
         mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage)
+            r'<iframe[^>]+?src=(["\'])(?P<url>http://(?:www\.)?(?:ustream\.tv|video\.ibm\.com)/embed/.+?)\1', webpage)
         if mobj is not None:
             return mobj.group('url')
@@ -946,6 +946,9 @@ def _fetch_page(self, album_id, authorizaion, hashed_pass, page):
     def _real_extract(self, url):
         album_id = self._match_id(url)
-        webpage = self._download_webpage(url, album_id)
-        viewer = self._parse_json(self._search_regex(
-            r'bootstrap_data\s*=\s*({.+?})</script>',
+        viewer = self._download_json(
+            'https://vimeo.com/_rv/viewer', album_id, fatal=False)
+        if not viewer:
+            webpage = self._download_webpage(url, album_id)
+            viewer = self._parse_json(self._search_regex(
+                r'bootstrap_data\s*=\s*({.+?})</script>',
@@ -4,52 +4,48 @@
 import re
 import time
 import itertools
+import json

 from .common import InfoExtractor
 from .naver import NaverBaseIE
-from ..compat import compat_str
+from ..compat import (
+    compat_HTTPError,
+    compat_str,
+)
 from ..utils import (
     ExtractorError,
+    int_or_none,
     merge_dicts,
     try_get,
     urlencode_postdata,
 )


-class VLiveIE(NaverBaseIE):
+class VLiveBaseIE(NaverBaseIE):
+    _APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b'
+
+
+class VLiveIE(VLiveBaseIE):
     IE_NAME = 'vlive'
-    _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/(?:video|post)/(?P<id>(?:\d-)?[0-9]+)'
+    _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/(?:video|embed)/(?P<id>[0-9]+)'
     _NETRC_MACHINE = 'vlive'
     _TESTS = [{
-        'url': 'https://www.vlive.tv/video/1326',
+        'url': 'http://www.vlive.tv/video/1326',
         'md5': 'cc7314812855ce56de70a06a27314983',
         'info_dict': {
             'id': '1326',
             'ext': 'mp4',
-            'title': "[V LIVE] Girl's Day's Broadcast",
+            'title': "Girl's Day's Broadcast",
             'creator': "Girl's Day",
             'view_count': int,
             'uploader_id': 'muploader_a',
         },
-    },
-    {
-        'url': 'https://vlive.tv/post/1-18244258',
-        'md5': 'cc7314812855ce56de70a06a27314983',
-        'info_dict': {
-            'id': '1326',
-            'ext': 'mp4',
-            'title': "[V LIVE] Girl's Day's Broadcast",
-            'creator': "Girl's Day",
-            'view_count': int,
-            'uploader_id': 'muploader_a',
-        },
-    },
-    {
-        'url': 'https://www.vlive.tv/video/16937',
+    }, {
+        'url': 'http://www.vlive.tv/video/16937',
         'info_dict': {
             'id': '16937',
             'ext': 'mp4',
-            'title': '[V LIVE] 첸백시 걍방',
+            'title': '첸백시 걍방',
             'creator': 'EXO',
             'view_count': int,
             'subtitles': 'mincount:12',
@@ -70,12 +66,11 @@ class VLiveIE(NaverBaseIE):
             'subtitles': 'mincount:10',
         },
         'skip': 'This video is only available for CH+ subscribers',
+    }, {
+        'url': 'https://www.vlive.tv/embed/1326',
+        'only_matching': True,
     }]

-    @classmethod
-    def suitable(cls, url):
-        return False if VLivePlaylistIE.suitable(url) else super(VLiveIE, cls).suitable(url)
-
     def _real_initialize(self):
         self._login()

@@ -107,118 +102,82 @@ def is_logged_in():
         if not is_logged_in():
             raise ExtractorError('Unable to log in', expected=True)

+    def _call_api(self, path_template, video_id, fields=None):
+        query = {'appId': self._APP_ID}
+        if fields:
+            query['fields'] = fields
+        return self._download_json(
+            'https://www.vlive.tv/globalv-web/vam-web/' + path_template % video_id, video_id,
+            'Downloading %s JSON metadata' % path_template.split('/')[-1].split('-')[0],
+            headers={'Referer': 'https://www.vlive.tv/'}, query=query)
+
     def _real_extract(self, url):
-        # url may match on a post or a video url with a post_id potentially matching a video_id
-        working_id = self._match_id(url)
-        webpage = self._download_webpage(url, working_id)
+        video_id = self._match_id(url)

-        PARAMS_RE = r'window\.__PRELOADED_STATE__\s*=\s*({.*});?\s*</script>'
-        PARAMS_FIELD = 'params'
-
-        params = self._search_regex(
-            PARAMS_RE, webpage, PARAMS_FIELD, default='', flags=re.DOTALL)
-        params = self._parse_json(params, working_id, fatal=False)
-
-        video_params = try_get(params, lambda x: x["postDetail"]["post"]["officialVideo"], dict)
-
-        if video_params is None:
-            error = try_get(params, lambda x: x["postDetail"]["error"], dict)
-            error_data = try_get(error, lambda x: x["data"], dict)
-            error_video = try_get(error_data, lambda x: x["officialVideo"], dict)
-            error_msg = try_get(error, lambda x: x["message"], compat_str)
-            product_type = try_get(error_data,
-                                   [lambda x: x["officialVideo"]["productType"],
-                                    lambda x: x["board"]["boardType"]],
-                                   compat_str)
-            if error_video is not None:
-                if product_type in ('VLIVE_PLUS', 'VLIVE+'):
-                    self.raise_login_required('This video is only available with V LIVE+.')
-                elif error_msg is not None:
-                    raise ExtractorError('V LIVE reported the following error: %s' % error_msg)
-                else:
-                    raise ExtractorError('Failed to extract video parameters.')
-            elif 'post' in url:
-                raise ExtractorError('Url does not appear to be a video post.', expected=True)
-            else:
-                raise ExtractorError('Failed to extract video parameters.')
-
-        video_id = working_id if 'video' in url else str(video_params["videoSeq"])
-
-        video_type = video_params["type"]
-        if video_type in ('VOD'):
-            encoding_status = video_params["encodingStatus"]
-            if encoding_status == 'COMPLETE':
-                return self._replay(video_id, webpage, params, video_params)
-            else:
-                raise ExtractorError('VOD encoding not yet complete. Please try again later.',
-                                     expected=True)
-        elif video_type in ('LIVE'):
-            video_status = video_params["status"]
-            if video_status in ('RESERVED'):
-                raise ExtractorError('Coming soon!', expected=True)
-            elif video_status in ('ENDED', 'END'):
-                raise ExtractorError('Uploading for replay. Please wait...', expected=True)
-            else:
-                return self._live(video_id, webpage, params)
-        else:
-            raise ExtractorError('Unknown video type %s' % video_type)
+        try:
+            post = self._call_api(
+                'post/v1.0/officialVideoPost-%s', video_id,
+                'author{nickname},channel{channelCode,channelName},officialVideo{commentCount,exposeStatus,likeCount,playCount,playTime,status,title,type,vodId}')
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+                self.raise_login_required(json.loads(e.cause.read().decode())['message'])
+            raise

-    def _get_common_fields(self, webpage, params):
-        title = self._og_search_title(webpage)
-        description = self._html_search_meta(
-            ['og:description', 'description', 'twitter:description'],
-            webpage, 'description', default=None)
-        creator = (try_get(params, lambda x: x["channel"]["channel"]["channelName"], compat_str)
-                   or self._search_regex(r'on (.*) channel', description or '', 'creator', fatal=False))
-        thumbnail = self._og_search_thumbnail(webpage)
-        return {
-            'title': title,
-            'creator': creator,
-            'thumbnail': thumbnail,
-        }
+        video = post['officialVideo']

-    def _live(self, video_id, webpage, params):
-        LIVE_INFO_ENDPOINT = 'https://www.vlive.tv/globalv-web/vam-web/old/v3/live/%s/playInfo' % video_id
-        play_info = self._download_json(LIVE_INFO_ENDPOINT, video_id,
-                                        headers={"referer": "https://www.vlive.tv"})
-
-        streams = try_get(play_info, lambda x: x["result"]["streamList"], list) or []
-
-        formats = []
-        for stream in streams:
-            formats.extend(self._extract_m3u8_formats(
-                stream['serviceUrl'], video_id, 'mp4',
-                fatal=False, live=True))
-        self._sort_formats(formats)
-
-        info = self._get_common_fields(webpage, params)
-        info.update({
-            'title': self._live_title(info['title']),
-            'id': video_id,
-            'formats': formats,
-            'is_live': True,
-        })
-        return info
-
-    def _replay(self, video_id, webpage, params, video_params):
-        long_video_id = video_params["vodId"]
-
-        VOD_KEY_ENDPOINT = 'https://www.vlive.tv/globalv-web/vam-web/video/v1.0/vod/%s/inkey' % video_id
-        key_json = self._download_json(VOD_KEY_ENDPOINT, video_id,
-                                       headers={"referer": "https://www.vlive.tv"})
-        key = key_json["inkey"]
-
-        return merge_dicts(
-            self._get_common_fields(webpage, params),
-            self._extract_video_info(video_id, long_video_id, key))
+        def get_common_fields():
+            channel = post.get('channel') or {}
+            return {
+                'title': video.get('title'),
+                'creator': post.get('author', {}).get('nickname'),
+                'channel': channel.get('channelName'),
+                'channel_id': channel.get('channelCode'),
+                'duration': int_or_none(video.get('playTime')),
+                'view_count': int_or_none(video.get('playCount')),
+                'like_count': int_or_none(video.get('likeCount')),
+                'comment_count': int_or_none(video.get('commentCount')),
+            }
+
+        video_type = video.get('type')
+        if video_type == 'VOD':
+            inkey = self._call_api('video/v1.0/vod/%s/inkey', video_id)['inkey']
+            vod_id = video['vodId']
+            return merge_dicts(
+                get_common_fields(),
+                self._extract_video_info(video_id, vod_id, inkey))
+        elif video_type == 'LIVE':
+            status = video.get('status')
+            if status == 'ON_AIR':
+                stream_url = self._call_api(
+                    'old/v3/live/%s/playInfo',
+                    video_id)['result']['adaptiveStreamUrl']
+                formats = self._extract_m3u8_formats(stream_url, video_id, 'mp4')
+                info = get_common_fields()
+                info.update({
+                    'title': self._live_title(video['title']),
+                    'id': video_id,
+                    'formats': formats,
+                    'is_live': True,
+                })
+                return info
+            elif status == 'ENDED':
+                raise ExtractorError(
+                    'Uploading for replay. Please wait...', expected=True)
+            elif status == 'RESERVED':
+                raise ExtractorError('Coming soon!', expected=True)
+            elif video.get('exposeStatus') == 'CANCEL':
+                raise ExtractorError(
+                    'We are sorry, but the live broadcast has been canceled.',
+                    expected=True)
+            else:
+                raise ExtractorError('Unknown status ' + status)


-class VLiveChannelIE(InfoExtractor):
+class VLiveChannelIE(VLiveBaseIE):
     IE_NAME = 'vlive:channel'
-    _VALID_URL = r'https?://(?:(?:www|m)\.)?(?:channels\.vlive\.tv/|vlive\.tv/channels?/)(?P<id>[0-9A-Z]+)'
+    _VALID_URL = r'https?://(?:channels\.vlive\.tv|(?:(?:www|m)\.)?vlive\.tv/channel)/(?P<id>[0-9A-Z]+)'
     _TESTS = [{
-        'url': 'https://channels.vlive.tv/FCD4B',
+        'url': 'http://channels.vlive.tv/FCD4B',
         'info_dict': {
             'id': 'FCD4B',
             'title': 'MAMAMOO',
@@ -226,63 +185,39 @@ class VLiveChannelIE(InfoExtractor):
         'playlist_mincount': 110
     }, {
         'url': 'https://www.vlive.tv/channel/FCD4B',
-        'info_dict': {
-            'id': 'FCD4B',
-            'title': 'MAMAMOO',
-        },
-        'playlist_mincount': 110
+        'only_matching': True,
     }]

-    _APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b'
+    def _call_api(self, path, channel_key_suffix, channel_value, note, query):
+        q = {
+            'app_id': self._APP_ID,
+            'channel' + channel_key_suffix: channel_value,
+        }
+        q.update(query)
+        return self._download_json(
+            'http://api.vfan.vlive.tv/vproxy/channelplus/' + path,
+            channel_value, note='Downloading ' + note, query=q)['result']

     def _real_extract(self, url):
         channel_code = self._match_id(url)

-        webpage = self._download_webpage(
-            'http://channels.vlive.tv/%s/video' % channel_code, channel_code)
+        channel_seq = self._call_api(
+            'decodeChannelCode', 'Code', channel_code,
+            'decode channel code', {})['channelSeq']

-        app_id = None
-
-        app_js_url = self._search_regex(
-            r'<script[^>]+src=(["\'])(?P<url>http.+?/app\.js.*?)\1',
-            webpage, 'app js', default=None, group='url')
-
-        if app_js_url:
-            app_js = self._download_webpage(
-                app_js_url, channel_code, 'Downloading app JS', fatal=False)
-            if app_js:
-                app_id = self._search_regex(
-                    r'Global\.VFAN_APP_ID\s*=\s*[\'"]([^\'"]+)[\'"]',
-                    app_js, 'app id', default=None)
-
-        app_id = app_id or self._APP_ID
-
-        channel_info = self._download_json(
-            'http://api.vfan.vlive.tv/vproxy/channelplus/decodeChannelCode',
-            channel_code, note='Downloading decode channel code',
-            query={
-                'app_id': app_id,
-                'channelCode': channel_code,
-                '_': int(time.time())
-            })
-
-        channel_seq = channel_info['result']['channelSeq']
         channel_name = None
         entries = []

         for page_num in itertools.count(1):
-            video_list = self._download_json(
-                'http://api.vfan.vlive.tv/vproxy/channelplus/getChannelVideoList',
-                channel_code, note='Downloading channel list page #%d' % page_num,
-                query={
-                    'app_id': app_id,
-                    'channelSeq': channel_seq,
+            video_list = self._call_api(
+                'getChannelVideoList', 'Seq', channel_seq,
+                'channel list page #%d' % page_num, {
                     # Large values of maxNumOfRows (~300 or above) may cause
                     # empty responses (see [1]), e.g. this happens for [2] that
                     # has more than 300 videos.
                     # 1. https://github.com/ytdl-org/youtube-dl/issues/13830
                     # 2. http://channels.vlive.tv/EDBF.
                     'maxNumOfRows': 100,
-                    '_': int(time.time()),
                     'pageNo': page_num
                 }
             )
@@ -290,11 +225,11 @@ def _real_extract(self, url):
             if not channel_name:
                 channel_name = try_get(
                     video_list,
-                    lambda x: x['result']['channelInfo']['channelName'],
+                    lambda x: x['channelInfo']['channelName'],
                     compat_str)

             videos = try_get(
-                video_list, lambda x: x['result']['videoList'], list)
+                video_list, lambda x: x['videoList'], list)
             if not videos:
                 break
@@ -312,7 +247,9 @@ def _real_extract(self, url):
             entries, channel_code, channel_name)


-class VLivePlaylistIE(InfoExtractor):
+# old extractor. Rewrite?
+class VLivePlaylistIE(VLiveBaseIE):
     IE_NAME = 'vlive:playlist'
     _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P<video_id>[0-9]+)/playlist/(?P<id>[0-9]+)'
     _VIDEO_URL_TEMPLATE = 'http://www.vlive.tv/video/%s'
@@ -5,7 +5,6 @@
 from .common import InfoExtractor
 from ..utils import (
-    ExtractorError,
     int_or_none,
     js_to_json,
     orderedSet,
@@ -34,7 +33,7 @@ class XTubeIE(InfoExtractor):
             'title': 'strange erotica',
             'description': 'contains:an ET kind of thing',
             'uploader': 'greenshowers',
-            'duration': 449,
+            'duration': 450,
             'view_count': int,
             'comment_count': int,
             'age_limit': 18,
@@ -74,18 +73,10 @@ def _real_extract(self, url):
         title, thumbnail, duration = [None] * 3

-        json_config_string = self._search_regex(
-            r'playerConf=({.+?}),loaderConf',
-            webpage, 'config', default=None)
-        if not json_config_string:
-            raise ExtractorError("Could not extract video player data")
-
-        json_config_string = json_config_string.replace("!0", "true").replace("!1", "false")
-
-        config = self._parse_json(json_config_string, video_id, transform_source=js_to_json, fatal=False)
-        if not config:
-            raise ExtractorError("Could not extract video player data")
-
-        config = config.get('mainRoll')
-        if isinstance(config, dict):
-            title = config.get('title')
+        config = self._parse_json(self._search_regex(
+            r'playerConf\s*=\s*({.+?})\s*,\s*(?:\n|loaderConf)', webpage, 'config',
+            default='{}'), video_id, transform_source=js_to_json, fatal=False)
+        if config:
+            config = config.get('mainRoll')
+            if isinstance(config, dict):
+                title = config.get('title')
@@ -29,7 +29,6 @@ class YouPornIE(InfoExtractor):
             'upload_date': '20101217',
             'average_rating': int,
             'view_count': int,
-            'comment_count': int,
             'categories': list,
             'tags': list,
             'age_limit': 18,
@@ -48,7 +47,6 @@ class YouPornIE(InfoExtractor):
             'upload_date': '20110418',
             'average_rating': int,
             'view_count': int,
-            'comment_count': int,
             'categories': list,
             'tags': list,
             'age_limit': 18,
@@ -156,7 +154,8 @@ def _real_extract(self, url):
             r'(?s)<div[^>]+class=["\']submitByLink["\'][^>]*>(.+?)</div>',
             webpage, 'uploader', fatal=False)
         upload_date = unified_strdate(self._html_search_regex(
-            [r'Date\s+[Aa]dded:\s*<span>([^<]+)',
+            [r'UPLOADED:\s*<span>([^<]+)',
+             r'Date\s+[Aa]dded:\s*<span>([^<]+)',
              r'(?s)<div[^>]+class=["\']videoInfo(?:Date|Time)["\'][^>]*>(.+?)</div>'],
             webpage, 'upload date', fatal=False))
@@ -171,7 +170,7 @@ def _real_extract(self, url):
             webpage, 'view count', fatal=False, group='count'))
         comment_count = str_to_int(self._search_regex(
             r'>All [Cc]omments? \(([\d,.]+)\)',
-            webpage, 'comment count', fatal=False))
+            webpage, 'comment count', default=None))

File diff suppressed because it is too large

@@ -4085,7 +4085,7 @@ def fix_kv(m):
         v = m.group(0)
         if v in ('true', 'false', 'null'):
             return v
-        elif v.startswith('/*') or v.startswith('//') or v == ',':
+        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
             return ""

         if v[0] in ("'", '"'):
@@ -4095,7 +4095,7 @@ def fix_kv(m):
                 '\\\n': '',
                 '\\x': '\\u00',
             }.get(m.group(0), m.group(0)), v[1:-1])
+        else:
-        for regex, base in INTEGER_TABLE:
-            im = re.match(regex, v)
-            if im:
+            for regex, base in INTEGER_TABLE:
+                im = re.match(regex, v)
+                if im:
@@ -4110,7 +4110,8 @@ def fix_kv(m):
         {comment}|,(?={skip}[\]}}])|
         (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
         \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
-        [0-9]+(?={skip}:)
+        [0-9]+(?={skip}:)|
+        !+
         '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
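The js_to_json changes exist because minified players emit !0/!1 for true/false; the new !+ alternative lets fix_kv swallow the bangs (it returns "" for !-prefixed tokens), so the trailing digit parses as a number instead of aborting. A crude standalone equivalent of that behavior:

    import json
    import re

    def drop_bangs(js):
        # mimic fix_kv returning "" for runs of '!' before a 0/1 literal
        return re.sub(r'!+(?=[01])', '', js)

    assert json.loads(drop_bangs('{"autoplay": !0, "muted": !1}')) == {'autoplay': 0, 'muted': 1}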