[bbc] Improve playlist extraction, refactor, expand support and document

This commit is contained in:
Sergey M․ 2015-07-25 20:21:42 +06:00
parent ff81c4c99c
commit 9afa1770d1
2 changed files with 259 additions and 121 deletions

View file

@ -43,7 +43,10 @@
from .baidu import BaiduVideoIE from .baidu import BaiduVideoIE
from .bambuser import BambuserIE, BambuserChannelIE from .bambuser import BambuserIE, BambuserChannelIE
from .bandcamp import BandcampIE, BandcampAlbumIE from .bandcamp import BandcampIE, BandcampAlbumIE
from .bbc import BBCCoUkIE, BBCNewsIE from .bbc import (
BBCCoUkIE,
BBCIE,
)
from .beeg import BeegIE from .beeg import BeegIE
from .behindkink import BehindKinkIE from .behindkink import BehindKinkIE
from .beatportpro import BeatportProIE from .beatportpro import BeatportProIE

View file

@ -1,15 +1,18 @@
# coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import re
import xml.etree.ElementTree import xml.etree.ElementTree
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
parse_duration, float_or_none,
int_or_none, int_or_none,
parse_duration,
parse_iso8601,
) )
from ..compat import compat_HTTPError from ..compat import compat_HTTPError
import re
class BBCCoUkIE(InfoExtractor): class BBCCoUkIE(InfoExtractor):
@ -17,7 +20,7 @@ class BBCCoUkIE(InfoExtractor):
IE_DESC = 'BBC iPlayer' IE_DESC = 'BBC iPlayer'
_VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P<id>[\da-z]{8})' _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P<id>[\da-z]{8})'
mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s' _MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s'
_TESTS = [ _TESTS = [
{ {
@ -264,16 +267,21 @@ def _get_subtitles(self, media, programme_id):
return subtitles return subtitles
def _download_media_selector(self, programme_id): def _download_media_selector(self, programme_id):
return self._download_media_selector_url(
self._MEDIASELECTOR_URL % programme_id, programme_id)
def _download_media_selector_url(self, url, programme_id=None):
try: try:
media_selection = self._download_xml( media_selection = self._download_xml(
self.mediaselector_url % programme_id, url, programme_id, 'Downloading media selection XML')
programme_id, 'Downloading media selection XML')
except ExtractorError as ee: except ExtractorError as ee:
if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().decode('utf-8')) media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().decode('utf-8'))
else: else:
raise raise
return self._process_media_selector(media_selection, programme_id)
def _process_media_selector(self, media_selection, programme_id):
formats = [] formats = []
subtitles = None subtitles = None
@ -312,10 +320,21 @@ def _download_playlist(self, playlist_id):
raise raise
# fallback to legacy playlist # fallback to legacy playlist
playlist = self._download_xml( return self._process_legacy_playlist(playlist_id)
'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id,
playlist_id, 'Downloading legacy playlist XML')
def _process_legacy_playlist_url(self, url, display_id):
    """Download the legacy playlist XML at *url* and extract media info from it.

    *display_id* is passed both to the download step and to the extraction
    step. The return value is whatever ``_extract_from_legacy_playlist``
    produces for the downloaded document.
    """
    legacy_xml = self._download_legacy_playlist_url(url, display_id)
    return self._extract_from_legacy_playlist(legacy_xml, display_id)
def _process_legacy_playlist(self, playlist_id):
    """Process the legacy iPlayer playlist identified by *playlist_id*.

    Builds the legacy playlist URL for the id and delegates to
    ``_process_legacy_playlist_url``.
    """
    legacy_url = 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id
    return self._process_legacy_playlist_url(legacy_url, playlist_id)
def _download_legacy_playlist_url(self, url, playlist_id=None):
    """Fetch the legacy playlist XML document located at *url*.

    *playlist_id* is optional and is forwarded to ``_download_xml`` together
    with the progress note.
    """
    note = 'Downloading legacy playlist XML'
    return self._download_xml(url, playlist_id, note)
def _extract_from_legacy_playlist(self, playlist, playlist_id):
no_items = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}noItems') no_items = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}noItems')
if no_items is not None: if no_items is not None:
reason = no_items.get('reason') reason = no_items.get('reason')
@ -335,8 +354,23 @@ def _download_playlist(self, playlist_id):
continue continue
title = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}title').text title = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}title').text
description = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}summary').text description = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}summary').text
programme_id = item.get('identifier')
def get_programme_id(item):
    """Return the programme id for a legacy playlist *item*, or None.

    The item's own ``identifier`` / ``group`` attributes are checked first.
    If neither holds a valid id, the item's ``<mediator>`` child (when
    present) is checked the same way.
    """
    def get_from_attributes(element):
        # A valid id is "p" or "b" followed by seven lowercase
        # alphanumeric characters.
        for attr in ('identifier', 'group'):
            value = element.get(attr)
            if value and re.match(r'^[pb][\da-z]{7}$', value):
                return value
        return None

    # Prefer an id carried by the item itself; only fall back to the
    # mediator child when the item has none.
    programme_id = get_from_attributes(item)
    if programme_id is not None:
        return programme_id

    mediator = item.find('./{http://bbc.co.uk/2008/emp/playlist}mediator')
    if mediator is not None:
        return get_from_attributes(mediator)
    return None
programme_id = get_programme_id(item)
duration = int_or_none(item.get('duration')) duration = int_or_none(item.get('duration'))
# TODO: programme_id can be None and media items can be incorporated right inside
# playlist's item (e.g. http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
# as f4m and m3u8
formats, subtitles = self._download_media_selector(programme_id) formats, subtitles = self._download_media_selector(programme_id)
return programme_id, title, description, duration, formats, subtitles return programme_id, title, description, duration, formats, subtitles
@ -383,175 +417,276 @@ def _real_extract(self, url):
} }
class BBCNewsIE(BBCCoUkIE): class BBCIE(BBCCoUkIE):
IE_NAME = 'bbc.com' IE_NAME = 'bbc'
IE_DESC = 'BBC news' IE_DESC = 'BBC'
_VALID_URL = r'https?://(?:www\.)?bbc\.com/.+?/(?P<id>[^/]+)$' _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' # fails with notukerror for some videos
#_MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s'
_MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s'
_TESTS = [{ _TESTS = [{
# article with multiple videos embedded with data-media-meta containing
# playlist.sxml, externalId and no direct video links
'url': 'http://www.bbc.com/news/world-europe-32668511', 'url': 'http://www.bbc.com/news/world-europe-32668511',
'info_dict': { 'info_dict': {
'id': 'world-europe-32668511', 'id': 'world-europe-32668511',
'title': 'Russia stages massive WW2 parade despite Western boycott', 'title': 'Russia stages massive WW2 parade despite Western boycott',
'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
}, },
'playlist_count': 2, 'playlist_count': 2,
}, { }, {
# article with multiple videos embedded with data-media-meta (more videos)
'url': 'http://www.bbc.com/news/business-28299555', 'url': 'http://www.bbc.com/news/business-28299555',
'info_dict': { 'info_dict': {
'id': 'business-28299555', 'id': 'business-28299555',
'title': 'Farnborough Airshow: Video highlights', 'title': 'Farnborough Airshow: Video highlights',
'description': 'BBC reports and video highlights at the Farnborough Airshow.',
}, },
'playlist_count': 9, 'playlist_count': 9,
'skip': 'Save time',
}, { }, {
# single video embedded with mediaAssetPage.init()
'url': 'http://www.bbc.com/news/world-europe-32041533', 'url': 'http://www.bbc.com/news/world-europe-32041533',
'note': 'Video',
'info_dict': { 'info_dict': {
'id': 'p02mprgb', 'id': 'p02mprgb',
'ext': 'mp4', 'ext': 'flv',
'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
'description': 'Germanwings plane crash site in aerial video - Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
'duration': 47, 'duration': 47,
'timestamp': 1427219242,
'upload_date': '20150324', 'upload_date': '20150324',
'uploader': 'BBC News',
}, },
'params': { 'params': {
# rtmp download
'skip_download': True, 'skip_download': True,
} }
}, { }, {
# article with single video embedded with data-media-meta containing
# direct video links (for now these are extracted) and playlist.xml (with
# media items as f4m and m3u8 - currently unsupported)
'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu', 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
'note': 'Video',
'info_dict': { 'info_dict': {
'id': 'NA', 'id': '150615_telabyad_kentin_cogu',
'ext': 'mp4', 'ext': 'mp4',
'title': 'YPG: Tel Abyad\'\u0131n tamam\u0131 kontrol\xfcm\xfczde', 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
'description': 'YPG: Tel Abyad\'\u0131n tamam\u0131 kontrol\xfcm\xfczde',
'duration': 47, 'duration': 47,
'timestamp': 1434397334,
'upload_date': '20150615', 'upload_date': '20150615',
'uploader': 'BBC News',
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
} }
}, { }, {
# single video embedded with mediaAssetPage.init() (regional section)
'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw', 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
'note': 'Video',
'info_dict': { 'info_dict': {
'id': '39275083', 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Honduras militariza sus hospitales por nuevo esc\xe1ndalo de corrupci\xf3n', 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
'description': 'Honduras militariza sus hospitales por nuevo esc\xe1ndalo de corrupci\xf3n',
'duration': 87, 'duration': 87,
'timestamp': 1434713142,
'upload_date': '20150619', 'upload_date': '20150619',
'uploader': 'BBC News',
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
} }
}, {
# single video story with digitalData
'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
'info_dict': {
'id': 'p02q6gc4',
'ext': 'flv',
'title': 'Sri Lankas spicy secret',
'description': 'As a new train line to Jaffna opens up the countrys north, travellers can experience a truly distinct slice of Tamil culture.',
'timestamp': 1437674293,
'upload_date': '20150723',
},
'params': {
# rtmp download
'skip_download': True,
}
}, {
# single video story without digitalData
'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
'info_dict': {
'id': 'p018zqqg',
'ext': 'flv',
'title': 'Hyundai Santa Fe Sport: Rock star',
'description': 'md5:b042a26142c4154a6e472933cf20793d',
'timestamp': 1368473503,
'upload_date': '20130513',
},
'params': {
# rtmp download
'skip_download': True,
}
}, {
# single video with playlist.sxml URL
'url': 'http://www.bbc.com/sport/0/football/33653409',
'info_dict': {
'id': 'p02xycnp',
'ext': 'flv',
'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
'description': 'md5:398fca0e2e701c609d726e034fa1fc89',
'duration': 140,
},
'params': {
# rtmp download
'skip_download': True,
}
}, {
# single video with playlist URL from weather section
'url': 'http://www.bbc.com/weather/features/33601775',
'only_matching': True,
}, {
# custom redirection to www.bbc.com
'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
'only_matching': True,
}] }]
@classmethod
def suitable(cls, url):
    """Tell whether this extractor should handle *url*.

    URLs that ``BBCCoUkIE`` accepts are declined here; every other URL is
    decided by the inherited ``suitable`` check.
    """
    if BBCCoUkIE.suitable(url):
        return False
    return super(BBCIE, cls).suitable(url)
def _extract_from_media_meta(self, media_meta, video_id):
# Direct links to media in media metadata (e.g.
# http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
# TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
source_files = media_meta.get('sourceFiles')
if source_files:
return [{
'url': f['url'],
'format_id': format_id,
'ext': f.get('encoding'),
'tbr': float_or_none(f.get('bitrate'), 1000),
'filesize': int_or_none(f.get('filesize')),
} for format_id, f in source_files.items() if f.get('url')], []
programme_id = media_meta.get('externalId')
if programme_id:
return self._download_media_selector(programme_id)
# Process playlist.sxml as legacy playlist
href = media_meta.get('href')
if href:
playlist = self._download_legacy_playlist_url(href)
_, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id)
return formats, subtitles
return [], []
def _real_extract(self, url): def _real_extract(self, url):
list_id = self._match_id(url) playlist_id = self._match_id(url)
webpage = self._download_webpage(url, list_id)
list_title = self._html_search_regex(r'<title>(.*?)(?:\s*-\s*BBC [^ ]+)?</title>', webpage, 'list title') webpage = self._download_webpage(url, playlist_id)
pubdate = self._html_search_regex(r'"datePublished":\s*"(\d+-\d+-\d+)', webpage, 'date', default=None) timestamp = parse_iso8601(self._search_regex(
if pubdate: [r'"datePublished":\s*"([^"]+)',
pubdate = pubdate.replace('-', '') r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
r'itemprop="datePublished"[^>]+datetime="([^"]+)"'],
ret = [] webpage, 'date', default=None))
jsent = []
# works with bbc.com/news/something-something-123456 articles
jsent = map(
lambda m: self._parse_json(m, list_id),
re.findall(r"data-media-meta='({[^']+})'", webpage)
)
if len(jsent) == 0:
# http://www.bbc.com/news/video_and_audio/international
# and single-video articles
masset = self._html_search_regex(r'mediaAssetPage\.init\(\s*({.+?}), "/', webpage, 'mediaassets', default=None)
if masset:
jmasset = self._parse_json(masset, list_id)
for key, val in jmasset.get('videos', {}).items():
for skey, sval in val.items():
sval['id'] = skey
jsent.append(sval)
if len(jsent) == 0:
# stubbornly generic extractor for {json with "image":{allvideoshavethis},etc}
# in http://www.bbc.com/news/video_and_audio/international
# prone to breaking if entries have sourceFiles list
jsent = map(
lambda m: self._parse_json(m, list_id),
re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage)
)
if len(jsent) == 0:
raise ExtractorError('No video found', expected=True)
for jent in jsent:
programme_id = jent.get('externalId')
xml_url = jent.get('href')
title = jent.get('caption', '')
if title == '':
title = list_title
duration = parse_duration(jent.get('duration'))
description = list_title
if jent.get('caption', '') != '':
description += ' - ' + jent.get('caption')
thumbnail = None
if jent.get('image') is not None:
thumbnail = jent['image'].get('href')
formats = []
subtitles = []
if programme_id:
formats, subtitles = self._download_media_selector(programme_id)
elif jent.get('sourceFiles') is not None:
# mediaselector not used at
# http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu
for key, val in jent['sourceFiles'].items():
formats.append({
'ext': val.get('encoding'),
'url': val.get('url'),
'filesize': int(val.get('filesize')),
'format_id': key
})
elif xml_url:
# Cheap fallback
# http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml
xml = self._download_webpage(xml_url, programme_id, 'Downloading playlist.sxml for externalId (fallback)')
programme_id = self._search_regex(r'<mediator [^>]*identifier="(.+?)"', xml, 'playlist.sxml (externalId fallback)')
formats, subtitles = self._download_media_selector(programme_id)
if len(formats) == 0:
raise ExtractorError('unsupported json media entry.\n ' + str(jent) + '\n')
# single video with playlist.sxml URL (e.g. http://www.bbc.com/sport/0/football/33653409)
playlist = self._search_regex(
r'<param[^>]+name="playlist"[^>]+value="([^"]+)"',
webpage, 'playlist', default=None)
if playlist:
programme_id, title, description, duration, formats, subtitles = \
self._process_legacy_playlist_url(playlist, playlist_id)
self._sort_formats(formats) self._sort_formats(formats)
return {
id = jent.get('id') if programme_id is None else programme_id 'id': programme_id,
if id is None:
id = 'NA'
ret.append({
'id': id,
'uploader': 'BBC News',
'upload_date': pubdate,
'title': title, 'title': title,
'description': description, 'description': description,
'thumbnail': thumbnail,
'duration': duration, 'duration': duration,
'timestamp': timestamp,
'formats': formats,
'subtitles': subtitles,
}
# single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
programme_id = self._search_regex(
[r'data-video-player-vpid="([\da-z]{8})"',
r'<param[^>]+name="externalIdentifier"[^>]+value="([\da-z]{8})"'],
webpage, 'vpid', default=None)
if programme_id:
formats, subtitles = self._download_media_selector(programme_id)
self._sort_formats(formats)
# digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
digital_data = self._parse_json(
self._search_regex(
r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
programme_id, fatal=False)
page_info = digital_data.get('page', {}).get('pageInfo', {})
title = page_info.get('pageName') or self._og_search_title(webpage)
description = page_info.get('description') or self._og_search_description(webpage)
timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
return {
'id': programme_id,
'title': title,
'description': description,
'timestamp': timestamp,
'formats': formats,
'subtitles': subtitles,
}
playlist_title = self._html_search_regex(
r'<title>(.*?)(?:\s*-\s*BBC [^ ]+)?</title>', webpage, 'playlist title')
playlist_description = self._og_search_description(webpage)
# Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
medias = list(filter(None, map(
lambda s: self._parse_json(s, playlist_id, fatal=False),
re.findall(r"data-media-meta='({[^']+})'", webpage))))
if not medias:
# Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
media_asset_page = self._parse_json(
self._search_regex(
r'mediaAssetPage\.init\(\s*({.+?}), "/', webpage, 'media asset'),
playlist_id)
medias = []
for video in media_asset_page.get('videos', {}).values():
medias.extend(video.values())
entries = []
for num, media_meta in enumerate(medias, start=1):
formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
if not formats:
continue
self._sort_formats(formats)
video_id = media_meta.get('externalId')
if not video_id:
video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)
title = media_meta.get('caption')
if not title:
title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)
duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))
images = []
for image in media_meta.get('images', {}).values():
images.extend(image.values())
if 'image' in media_meta:
images.append(media_meta['image'])
thumbnails = [{
'url': image.get('href'),
'width': int_or_none(image.get('width')),
'height': int_or_none(image.get('height')),
} for image in images]
entries.append({
'id': video_id,
'title': title,
'thumbnails': thumbnails,
'duration': duration,
'timestamp': timestamp,
'formats': formats, 'formats': formats,
'subtitles': subtitles, 'subtitles': subtitles,
}) })
if len(ret) > 0: return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
return self.playlist_result(ret, list_id, list_title)
raise ExtractorError('No video found', expected=True)