add an extractor for tv.sohu.com

This commit is contained in:
huohuarong 2013-08-02 17:58:46 +08:00
parent 6d3a7d03e1
commit 6624a2b07d
2 changed files with 98 additions and 0 deletions

View file

@ -55,6 +55,7 @@
from .ringtv import RingTVIE from .ringtv import RingTVIE
from .roxwel import RoxwelIE from .roxwel import RoxwelIE
from .sina import SinaIE from .sina import SinaIE
from .sohu import SohuIE
from .soundcloud import SoundcloudIE, SoundcloudSetIE from .soundcloud import SoundcloudIE, SoundcloudSetIE
from .spiegel import SpiegelIE from .spiegel import SpiegelIE
from .stanfordoc import StanfordOpenClassroomIE from .stanfordoc import StanfordOpenClassroomIE

View file

@ -0,0 +1,97 @@
# encoding: utf-8
import json
import logging
import re
import time
import urllib2

from .common import InfoExtractor
from ..utils import compat_urllib_request
from ..utils import ExtractorError
class SohuIE(InfoExtractor):
    """Information extractor for video pages on tv.sohu.com.

    A video page embeds a numeric ``vid``.  That vid is looked up on a JSON
    metadata endpoint which lists one vid per available quality together with
    the clip list of the video.  Each clip is then resolved through the
    "allot" host named in the metadata to obtain its final download URL.
    """

    _VALID_URL = r'https?://tv\.sohu\.com/\d+?/n(?P<id>\d+)\.shtml.*?'

    _TEST = {
        u'url': u'http://tv.sohu.com/20130724/n382479172.shtml#super',
        u'file': u'382479172.flv',
        u'md5': u'cc84eed6b6fbf0f2f9a8d3cb9da1939b',
        u'info_dict': {
            u'title': u'The Illest - Far East Movement Riff Raff',
        },
    }

    # JSON metadata endpoint; the numeric vid is appended to it.
    _DATA_URL_BASE = 'http://hot.vrs.sohu.com/vrs_flash.action?vid='

    # Keys of the per-quality vids in the metadata, in the order they are
    # tried: the first one with a non-zero value wins.
    _QUALITIES = ('oriVid', 'superVid', 'highVid', 'norVid')

    def _clearn_html(self, string):
        """Return ``string`` with HTML tags removed and whitespace collapsed.

        Every ``<...>`` tag is replaced by a space, every run of whitespace
        becomes a single space, and the result is stripped at both ends.
        """
        without_tags = re.sub(r'<.+?>', ' ', string)
        return re.sub(r'\s+', ' ', without_tags).strip()

    def _fetch_video_data(self, vid, video_id):
        """Download and decode the JSON metadata for ``vid``.

        ``video_id`` is only used to label the download.
        """
        data_url = '%s%s' % (self._DATA_URL_BASE, vid)
        raw_json = self._download_webpage(
            data_url, video_id,
            note=u'Downloading JSON data for vid %s' % vid)
        return json.loads(raw_json)

    def _select_best_vid(self, data):
        """Return the vid of the first available quality in ``_QUALITIES``.

        Raises ExtractorError when every quality entry is missing or zero.
        """
        for quality in self._QUALITIES:
            candidate = data.get(quality)
            if candidate:
                # The keys end in "Vid"; log only the quality part.
                logging.info('quality definition: %s', quality[:-3])
                return candidate
        raise ExtractorError(u'could not find valid clearest_vid')

    def _real_extract(self, url):
        """Extract all parts of the video at ``url``.

        Returns a list with one info dict (``id``, ``title``, ``url``,
        ``ext``) per clip of the video, in playback order.

        Raises ExtractorError when the page carries no vid, when no quality
        is available, or when a clip cannot be resolved.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        title_re = re.compile(
            r'<h1 id="video-title">\n*?(.+?)\n*?</h1>', re.DOTALL)
        title = self._clearn_html(
            self._search_regex(title_re, webpage, u'video title'))

        vid_match = re.search(r'var vid="(\d+)"', webpage)
        if vid_match is None:
            raise ExtractorError(u'[Sohu] could not get vid')
        vid = vid_match.group(1)

        video_data = self._fetch_video_data(vid, video_id)

        # Get the vid of the highest definition available; its metadata is
        # fetched only if it differs from the one already downloaded.  The
        # page vid is a string while the JSON value need not be, so compare
        # their textual forms.
        best_vid = self._select_best_vid(video_data['data'])
        if u'%s' % best_vid != vid:
            video_data = self._fetch_video_data(best_vid, video_id)

        allot = video_data['allot']
        prot = video_data['prot']
        data = video_data['data']
        clips_url = data['clipsURL']
        su = data['su']
        num_of_parts = data['totalBlocks']

        # NOTE(review): _TEST expects a single "<id>.flv" file, whereas this
        # method yields "<id>_partNN" entries with ext "mp4" -- confirm which
        # of the two is right against the live site.
        files_info = []
        for i in range(num_of_parts):
            middle_url = 'http://%s/?prot=%s&file=%s&new=%s' % (
                allot, prot, clips_url[i], su[i])
            middle_info = self._download_webpage(
                middle_url, video_id,
                note=u'Resolving part %d of %d' % (i + 1, num_of_parts),
            ).split('|')
            # The answer is a '|'-separated record: field 0 is used as the
            # URL prefix and field 3 as the access key.
            if len(middle_info) < 4:
                raise ExtractorError(
                    u'unexpected response while resolving part %d' % (i + 1))
            download_url = '%s%s?key=%s' % (
                middle_info[0], su[i], middle_info[3])
            files_info.append({
                'id': '%s_part%02d' % (video_id, i + 1),
                'title': title,
                'url': download_url,
                'ext': 'mp4',
            })
        return files_info