[spiegel] Test format video URLs for 404 (Closes #4579)

This commit is contained in:
Sergey M․ 2015-01-14 20:27:14 +06:00
parent f2cbc96c3e
commit e92d4a11f5

View file

@@ -4,7 +4,14 @@
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_urlparse from ..compat import (
compat_urlparse,
compat_HTTPError,
)
from ..utils import (
HEADRequest,
ExtractorError,
)
from .spiegeltv import SpiegeltvIE from .spiegeltv import SpiegeltvIE
@@ -60,21 +67,31 @@ def _real_extract(self, url):
xml_url = base_url + video_id + '.xml' xml_url = base_url + video_id + '.xml'
idoc = self._download_xml(xml_url, video_id) idoc = self._download_xml(xml_url, video_id)
formats = [ formats = []
{ for n in list(idoc):
'format_id': n.tag.rpartition('type')[2], if n.tag.startswith('type') and n.tag != 'type6':
'url': base_url + n.find('./filename').text, format_id = n.tag.rpartition('type')[2]
video_url = base_url + n.find('./filename').text
# Test video URLs beforehand as some of them are invalid
try:
self._request_webpage(
HEADRequest(video_url), video_id,
'Checking %s video URL' % format_id)
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
self.report_warning(
'%s video URL is invalid, skipping' % format_id, video_id)
continue
formats.append({
'format_id': format_id,
'url': video_url,
'width': int(n.find('./width').text), 'width': int(n.find('./width').text),
'height': int(n.find('./height').text), 'height': int(n.find('./height').text),
'abr': int(n.find('./audiobitrate').text), 'abr': int(n.find('./audiobitrate').text),
'vbr': int(n.find('./videobitrate').text), 'vbr': int(n.find('./videobitrate').text),
'vcodec': n.find('./codec').text, 'vcodec': n.find('./codec').text,
'acodec': 'MP4A', 'acodec': 'MP4A',
} })
for n in list(idoc)
# Blacklist type 6, it's extremely LQ and not available on the same server
if n.tag.startswith('type') and n.tag != 'type6'
]
duration = float(idoc[0].findall('./duration')[0].text) duration = float(idoc[0].findall('./duration')[0].text)
self._sort_formats(formats) self._sort_formats(formats)