[ie/sohu] Fix extractor (#7628)

Closes #1667, Closes #7463
Authored by: c-basalt, bashonly
This commit is contained in:
c-basalt 2023-09-16 17:13:04 -04:00 committed by GitHub
parent b4c1c408c6
commit 5be7e97886
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 105 additions and 7 deletions

View file

@ -1795,7 +1795,10 @@
from .slutload import SlutloadIE from .slutload import SlutloadIE
from .smotrim import SmotrimIE from .smotrim import SmotrimIE
from .snotr import SnotrIE from .snotr import SnotrIE
from .sohu import SohuIE from .sohu import (
SohuIE,
SohuVIE,
)
from .sonyliv import ( from .sonyliv import (
SonyLIVIE, SonyLIVIE,
SonyLIVSeriesIE, SonyLIVSeriesIE,

View file

@ -1,3 +1,4 @@
import base64
import re import re
from .common import InfoExtractor from .common import InfoExtractor
@ -8,7 +9,12 @@
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
int_or_none, int_or_none,
float_or_none,
url_or_none,
unified_timestamp,
try_get, try_get,
urljoin,
traverse_obj,
) )
@ -31,13 +37,20 @@ class SohuIE(InfoExtractor):
'id': '409385080', 'id': '409385080',
'ext': 'mp4', 'ext': 'mp4',
'title': '《2015湖南卫视羊年元宵晚会》唐嫣《花好月圆》', 'title': '《2015湖南卫视羊年元宵晚会》唐嫣《花好月圆》',
} },
'skip': 'no longer available',
}, { }, {
'url': 'http://my.tv.sohu.com/us/232799889/78693464.shtml', 'url': 'http://my.tv.sohu.com/us/232799889/78693464.shtml',
'info_dict': { 'info_dict': {
'id': '78693464', 'id': '78693464',
'ext': 'mp4', 'ext': 'mp4',
'title': '【爱范品】第31期MWC见不到的奇葩手机', 'title': '【爱范品】第31期MWC见不到的奇葩手机',
'uploader': '爱范儿视频',
'duration': 213,
'timestamp': 1425519600,
'upload_date': '20150305',
'thumbnail': 'http://e3f49eaa46b57.cdn.sohucs.com//group1/M10/83/FA/MTAuMTAuODguODA=/6_14cbccdde5eg104SysCutcloud_78693464_7_0b.jpg',
'tags': ['爱范儿', '爱范品', 'MWC', '手机'],
} }
}, { }, {
'note': 'Multipart video', 'note': 'Multipart video',
@ -45,6 +58,12 @@ class SohuIE(InfoExtractor):
'info_dict': { 'info_dict': {
'id': '78910339', 'id': '78910339',
'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
'uploader': '小苍cany',
'duration': 744.0,
'timestamp': 1426269360,
'upload_date': '20150313',
'thumbnail': 'http://e3f49eaa46b57.cdn.sohucs.com//group1/M11/89/57/MTAuMTAuODguODA=/6_14cea022a1dg102SysCutcloud_78910339_8_0b.jpg',
'tags': ['小苍MM', '英雄联盟', '实战秘籍'],
}, },
'playlist': [{ 'playlist': [{
'info_dict': { 'info_dict': {
@ -75,6 +94,11 @@ class SohuIE(InfoExtractor):
'id': '78932792', 'id': '78932792',
'ext': 'mp4', 'ext': 'mp4',
'title': 'youtube-dl testing video', 'title': 'youtube-dl testing video',
'duration': 360,
'timestamp': 1426348620,
'upload_date': '20150314',
'thumbnail': 'http://e3f49eaa46b57.cdn.sohucs.com//group1/M02/8A/00/MTAuMTAuODguNzk=/6_14cee1be192g102SysCutcloud_78932792_7_7b.jpg',
'tags': [],
}, },
'params': { 'params': {
'skip_download': True 'skip_download': True
@ -100,7 +124,7 @@ def _fetch_data(vid_id, mytv=False):
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
title = re.sub(r' - 搜狐视频$', '', self._og_search_title(webpage)) title = re.sub(r'( - 高清正版在线观看)? - 搜狐视频$', '', self._og_search_title(webpage))
vid = self._html_search_regex( vid = self._html_search_regex(
r'var vid ?= ?["\'](\d+)["\']', r'var vid ?= ?["\'](\d+)["\']',
@ -132,7 +156,9 @@ def _fetch_data(vid_id, mytv=False):
allot = format_data['allot'] allot = format_data['allot']
data = format_data['data'] data = format_data['data']
clips_url = data['clipsURL'] clip_url = traverse_obj(data, (('clipsURL', 'mp4PlayUrl'), i, {url_or_none}), get_all=False)
if not clip_url:
raise ExtractorError(f'Unable to extract url for clip {i}')
su = data['su'] su = data['su']
video_url = 'newflv.sohu.ccgslb.net' video_url = 'newflv.sohu.ccgslb.net'
@ -142,9 +168,9 @@ def _fetch_data(vid_id, mytv=False):
while 'newflv.sohu.ccgslb.net' in video_url: while 'newflv.sohu.ccgslb.net' in video_url:
params = { params = {
'prot': 9, 'prot': 9,
'file': clips_url[i], 'file': clip_url,
'new': su[i], 'new': su[i],
'prod': 'flash', 'prod': 'h5n',
'rb': 1, 'rb': 1,
} }
@ -193,6 +219,75 @@ def _fetch_data(vid_id, mytv=False):
'entries': playlist, 'entries': playlist,
'id': video_id, 'id': video_id,
'title': title, 'title': title,
'duration': traverse_obj(vid_data, ('data', 'totalDuration', {float_or_none})),
} }
return info if mytv:
publish_time = unified_timestamp(self._search_regex(
r'publishTime:\s*["\'](\d+-\d+-\d+ \d+:\d+)["\']', webpage, 'publish time', fatal=False))
else:
publish_time = traverse_obj(vid_data, ('tv_application_time', {unified_timestamp}))
return {
'timestamp': publish_time - 8 * 3600 if publish_time else None,
**traverse_obj(vid_data, {
'alt_title': ('data', 'subName', {str}),
'uploader': ('wm_data', 'wm_username', {str}),
'thumbnail': ('data', 'coverImg', {url_or_none}),
'tags': ('data', 'tag', {str.split}),
}),
**info,
}
class SohuVIE(InfoExtractor):
    """Extractor for ``tv.sohu.com/v/<id>.html`` links.

    The id in these URLs is the URL-safe base64 encoding of a page path.
    The path is decoded, mapped onto the ``tv`` or ``my.tv`` subdomain and
    the resulting URL is handed over to SohuIE.
    """
    _VALID_URL = r'https?://tv\.sohu\.com/v/(?P<id>[\w=-]+)\.html(?:$|[#?])'

    _TESTS = [{
        'note': 'Multipart video',
        'url': 'https://tv.sohu.com/v/MjAyMzA2MTQvbjYwMTMxNTE5Mi5zaHRtbA==.html',
        'info_dict': {
            'id': '601315192',
            'title': '《淬火丹心》第1集',
            'alt_title': '“点天灯”发生事故',
            'duration': 2701.692,
            'timestamp': 1686758040,
            'upload_date': '20230614',
            'thumbnail': 'http://photocdn.tv.sohu.com/img/20230614/vrsa_hor_1686738763256_454010551.jpg',
        },
        'playlist_mincount': 9,
        'skip': 'Only available in China',
    }, {
        'url': 'https://tv.sohu.com/v/dXMvMjMyNzk5ODg5Lzc4NjkzNDY0LnNodG1s.html',
        'info_dict': {
            'id': '78693464',
            'ext': 'mp4',
            'title': '【爱范品】第31期MWC见不到的奇葩手机',
            'uploader': '爱范儿视频',
            'duration': 213,
            'timestamp': 1425519600,
            'upload_date': '20150305',
            'thumbnail': 'http://e3f49eaa46b57.cdn.sohucs.com//group1/M10/83/FA/MTAuMTAuODguODA=/6_14cbccdde5eg104SysCutcloud_78693464_7_0b.jpg',
            'tags': ['爱范儿', '爱范品', 'MWC', '手机'],
        }
    }, {
        'note': 'Multipart video',
        'url': 'https://tv.sohu.com/v/dXMvMjQyNTYyMTYzLzc4OTEwMzM5LnNodG1s.html?src=pl',
        'info_dict': {
            'id': '78910339',
            'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
            'uploader': '小苍cany',
            'duration': 744.0,
            'timestamp': 1426269360,
            'upload_date': '20150313',
            'thumbnail': 'http://e3f49eaa46b57.cdn.sohucs.com//group1/M11/89/57/MTAuMTAuODguODA=/6_14cea022a1dg102SysCutcloud_78910339_8_0b.jpg',
            'tags': ['小苍MM', '英雄联盟', '实战秘籍'],
        },
        'playlist_mincount': 3,
    }]

    def _real_extract(self, url):
        """Decode the base64 id from *url* and delegate to SohuIE.

        Raises ExtractorError if the id cannot be decoded into a text path.
        """
        encoded_id = self._match_id(url)
        # _VALID_URL also matches ids whose trailing '=' padding was dropped;
        # base64 decoding raises on incorrectly padded input, so restore it.
        padded_id = encoded_id + '=' * (-len(encoded_id) % 4)
        try:
            path = base64.urlsafe_b64decode(padded_id).decode()
        except ValueError as err:
            # binascii.Error and UnicodeDecodeError are both ValueError subclasses
            raise ExtractorError(
                f'Unable to decode video path from id {encoded_id!r}', expected=True) from err
        # Paths shaped like "<digits>/n<digits>.shtml" go to tv.sohu.com,
        # every other path goes to my.tv.sohu.com
        subdomain = 'tv' if re.match(r'\d+/n\d+\.shtml', path) else 'my.tv'
        return self.url_result(urljoin(f'http://{subdomain}.sohu.com/', path), SohuIE)