[cinemassacre] Improve formats extraction and modernize

This commit is contained in:
Sergey M․ 2014-05-28 19:38:44 +07:00
parent 15a9f36849
commit d9dd3584e1
1 changed file with 28 additions and 15 deletions

View File

@ -1,10 +1,12 @@
# encoding: utf-8 # encoding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
int_or_none,
) )
@ -13,9 +15,10 @@ class CinemassacreIE(InfoExtractor):
_TESTS = [ _TESTS = [
{ {
'url': 'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/', 'url': 'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/',
'file': '19911.mp4',
'md5': 'fde81fbafaee331785f58cd6c0d46190', 'md5': 'fde81fbafaee331785f58cd6c0d46190',
'info_dict': { 'info_dict': {
'id': '19911',
'ext': 'mp4',
'upload_date': '20121110', 'upload_date': '20121110',
'title': '“Angry Video Game Nerd: The Movie” Trailer', 'title': '“Angry Video Game Nerd: The Movie” Trailer',
'description': 'md5:fb87405fcb42a331742a0dce2708560b', 'description': 'md5:fb87405fcb42a331742a0dce2708560b',
@ -23,9 +26,10 @@ class CinemassacreIE(InfoExtractor):
}, },
{ {
'url': 'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940', 'url': 'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940',
'file': '521be8ef82b16.mp4',
'md5': 'd72f10cd39eac4215048f62ab477a511', 'md5': 'd72f10cd39eac4215048f62ab477a511',
'info_dict': { 'info_dict': {
'id': '521be8ef82b16',
'ext': 'mp4',
'upload_date': '20131002', 'upload_date': '20131002',
'title': 'The Mummys Hand (1940)', 'title': 'The Mummys Hand (1940)',
}, },
@ -50,29 +54,38 @@ class CinemassacreIE(InfoExtractor):
r'<div class="entry-content">(?P<description>.+?)</div>', r'<div class="entry-content">(?P<description>.+?)</div>',
webpage, 'description', flags=re.DOTALL, fatal=False) webpage, 'description', flags=re.DOTALL, fatal=False)
playerdata = self._download_webpage(playerdata_url, video_id) playerdata = self._download_webpage(playerdata_url, video_id, 'Downloading player webpage')
video_thumbnail = self._search_regex(r'image: \'(?P<thumbnail>[^\']+)\'', playerdata, 'thumbnail', fatal=False) video_thumbnail = self._search_regex(
r'image: \'(?P<thumbnail>[^\']+)\'', playerdata, 'thumbnail', fatal=False)
sd_url = self._search_regex(r'file: \'([^\']+)\', label: \'SD\'', playerdata, 'sd_file') sd_url = self._search_regex(r'file: \'([^\']+)\', label: \'SD\'', playerdata, 'sd_file')
videolist_url = self._search_regex(r'file: \'([^\']+\.smil)\'}', playerdata, 'videolist_url') videolist_url = self._search_regex(r'file: \'([^\']+\.smil)\'}', playerdata, 'videolist_url')
videolist = self._download_webpage(videolist_url, video_id) videolist = self._download_xml(videolist_url, video_id, 'Downloading videolist XML')
formats = [] formats = []
baseurl = sd_url[:sd_url.rfind('/')+1] baseurl = sd_url[:sd_url.rfind('/')+1]
for match in re.finditer('<video src="mp4:(?P<file>[^"]+_(?P<format_id>[^"]+)\.[^"]+)" system-bitrate="(?P<br>\d+)"(?: width="(?P<width>\d+)" height="(?P<height>\d+)")?/>', videolist): for video in videolist.findall('.//video'):
src = video.get('src')
if not src:
continue
file_ = src.partition(':')[-1]
width = int_or_none(video.get('width'))
height = int_or_none(video.get('height'))
bitrate = int_or_none(video.get('system-bitrate'))
format = { format = {
'url': baseurl + match.group('file'), 'url': baseurl + file_,
'format_id': match.group('format_id') 'format_id': src.rpartition('.')[0].rpartition('_')[-1],
} }
if match.group('width'): if width or height:
format.update({ format.update({
'tbr': int(match.group('br')) // 1000, 'tbr': bitrate // 1000 if bitrate else None,
'width': int(match.group('width')), 'width': width,
'height': int(match.group('height')) 'height': height,
}) })
else: else:
format.update({ format.update({
'abr': int(match.group('br')) // 1000, 'abr': bitrate // 1000 if bitrate else None,
'vcodec': 'none' 'vcodec': 'none',
}) })
formats.append(format) formats.append(format)
self._sort_formats(formats) self._sort_formats(formats)