[vice] improve extraction (closes #23631)

This commit is contained in:
Remita Amine 2020-01-05 16:32:43 +01:00
parent 484637a9cc
commit 44b434e4e3

View file

@ -1,35 +1,50 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import re import functools
import time
import hashlib import hashlib
import json import json
import random import random
import re
import time
from .adobepass import AdobePassIE from .adobepass import AdobePassIE
from .youtube import YoutubeIE
from .common import InfoExtractor from .common import InfoExtractor
from .youtube import YoutubeIE
from ..compat import ( from ..compat import (
compat_HTTPError, compat_HTTPError,
compat_str, compat_str,
) )
from ..utils import ( from ..utils import (
clean_html,
ExtractorError, ExtractorError,
int_or_none, int_or_none,
OnDemandPagedList,
parse_age_limit, parse_age_limit,
str_or_none, str_or_none,
try_get, try_get,
) )
class ViceIE(AdobePassIE): class ViceBaseIE(InfoExtractor):
def _call_api(self, resource, resource_key, resource_id, locale, fields, args=''):
    """Query the VICE GraphQL endpoint and return the ``resource`` payload.

    Builds a one-field GraphQL document, sends it as the ``query`` URL
    parameter to ``https://video.vice.com/api/v1/graphql`` through
    ``self._download_json`` and returns ``response['data'][resource]``.

    Parameters:
        resource:     name of the GraphQL field to query; also the key
                      read back from the response's ``data`` object.
        resource_key: name of the argument that carries ``resource_id``.
        resource_id:  value for ``resource_key``; also handed to
                      ``_download_json`` as its second argument.
        locale:       value for the ``locale`` argument.
        fields:       raw selection-set text placed inside the braces.
        args:         raw text appended after the ``resource_key``
                      argument (callers pass e.g. ``', page: 1'``).

    ``resource``, ``resource_key``, ``fields`` and ``args`` are inserted
    verbatim, so they must already be valid GraphQL. ``locale`` and
    ``resource_id`` are emitted as escaped string literals.

    NOTE(review): a response lacking ``data`` or ``data[resource]``
    surfaces as a plain KeyError/TypeError from the final lookup.
    """
    def graphql_string(value):
        # ``locale`` and ``resource_id`` come straight from the URL. Pasting
        # them between bare double quotes lets a stray `"` or `\` break or
        # alter the query. JSON string escaping is valid inside a GraphQL
        # string literal, and ensure_ascii=False leaves every other
        # character untouched, so ordinary values serialize exactly as
        # before.
        return json.dumps('%s' % value, ensure_ascii=False)

    document = '''{
%s(locale: %s, %s: %s%s) {
%s
}
}''' % (
        resource, graphql_string(locale),
        resource_key, graphql_string(resource_id), args, fields)

    response = self._download_json(
        'https://video.vice.com/api/v1/graphql', resource_id,
        query={'query': document})
    return response['data'][resource]
class ViceIE(ViceBaseIE, AdobePassIE):
IE_NAME = 'vice' IE_NAME = 'vice'
_VALID_URL = r'https?://(?:(?:video|vms)\.vice|(?:www\.)?viceland)\.com/(?P<locale>[^/]+)/(?:video/[^/]+|embed)/(?P<id>[\da-f]+)' _VALID_URL = r'https?://(?:(?:video|vms)\.vice|(?:www\.)?vice(?:land|tv))\.com/(?P<locale>[^/]+)/(?:video/[^/]+|embed)/(?P<id>[\da-f]{24})'
_TESTS = [{ _TESTS = [{
'url': 'https://video.vice.com/en_us/video/pet-cremator/58c69e38a55424f1227dc3f7', 'url': 'https://video.vice.com/en_us/video/pet-cremator/58c69e38a55424f1227dc3f7',
'info_dict': { 'info_dict': {
'id': '5e647f0125e145c9aef2069412c0cbde', 'id': '58c69e38a55424f1227dc3f7',
'ext': 'mp4', 'ext': 'mp4',
'title': '10 Questions You Always Wanted To Ask: Pet Cremator', 'title': '10 Questions You Always Wanted To Ask: Pet Cremator',
'description': 'md5:fe856caacf61fe0e74fab15ce2b07ca5', 'description': 'md5:fe856caacf61fe0e74fab15ce2b07ca5',
@ -43,17 +58,16 @@ class ViceIE(AdobePassIE):
# m3u8 download # m3u8 download
'skip_download': True, 'skip_download': True,
}, },
'add_ie': ['UplynkPreplay'],
}, { }, {
# geo restricted to US # geo restricted to US
'url': 'https://video.vice.com/en_us/video/the-signal-from-tolva/5816510690b70e6c5fd39a56', 'url': 'https://video.vice.com/en_us/video/the-signal-from-tolva/5816510690b70e6c5fd39a56',
'info_dict': { 'info_dict': {
'id': '930c0ad1f47141cc955087eecaddb0e2', 'id': '5816510690b70e6c5fd39a56',
'ext': 'mp4', 'ext': 'mp4',
'uploader': 'waypoint', 'uploader': 'vice',
'title': 'The Signal From Tölva', 'title': 'The Signal From Tölva',
'description': 'md5:3927e3c79f9e8094606a2b3c5b5e55d5', 'description': 'md5:3927e3c79f9e8094606a2b3c5b5e55d5',
'uploader_id': '57f7d621e05ca860fa9ccaf9', 'uploader_id': '57a204088cb727dec794c67b',
'timestamp': 1477941983, 'timestamp': 1477941983,
'upload_date': '20161031', 'upload_date': '20161031',
}, },
@ -61,15 +75,14 @@ class ViceIE(AdobePassIE):
# m3u8 download # m3u8 download
'skip_download': True, 'skip_download': True,
}, },
'add_ie': ['UplynkPreplay'],
}, { }, {
'url': 'https://video.vice.com/alps/video/ulfs-wien-beruchtigste-grafitti-crew-part-1/581b12b60a0e1f4c0fb6ea2f', 'url': 'https://video.vice.com/alps/video/ulfs-wien-beruchtigste-grafitti-crew-part-1/581b12b60a0e1f4c0fb6ea2f',
'info_dict': { 'info_dict': {
'id': '581b12b60a0e1f4c0fb6ea2f', 'id': '581b12b60a0e1f4c0fb6ea2f',
'ext': 'mp4', 'ext': 'mp4',
'title': 'ULFs - Wien berüchtigste Grafitti Crew - Part 1', 'title': 'ULFs - Wien berüchtigste Grafitti Crew - Part 1',
'description': '<p>Zwischen Hinterzimmer-Tattoos und U-Bahnschächten erzählen uns die Ulfs, wie es ist, "süchtig nach Sachbeschädigung" zu sein.</p>', 'description': 'Zwischen Hinterzimmer-Tattoos und U-Bahnschächten erzählen uns die Ulfs, wie es ist, "süchtig nach Sachbeschädigung" zu sein.',
'uploader': 'VICE', 'uploader': 'vice',
'uploader_id': '57a204088cb727dec794c67b', 'uploader_id': '57a204088cb727dec794c67b',
'timestamp': 1485368119, 'timestamp': 1485368119,
'upload_date': '20170125', 'upload_date': '20170125',
@ -78,9 +91,7 @@ class ViceIE(AdobePassIE):
'params': { 'params': {
# AES-encrypted m3u8 # AES-encrypted m3u8
'skip_download': True, 'skip_download': True,
'proxy': '127.0.0.1:8118',
}, },
'add_ie': ['UplynkPreplay'],
}, { }, {
'url': 'https://video.vice.com/en_us/video/pizza-show-trailer/56d8c9a54d286ed92f7f30e4', 'url': 'https://video.vice.com/en_us/video/pizza-show-trailer/56d8c9a54d286ed92f7f30e4',
'only_matching': True, 'only_matching': True,
@ -98,7 +109,7 @@ class ViceIE(AdobePassIE):
@staticmethod @staticmethod
def _extract_urls(webpage): def _extract_urls(webpage):
return re.findall( return re.findall(
r'<iframe\b[^>]+\bsrc=["\']((?:https?:)?//video\.vice\.com/[^/]+/embed/[\da-f]+)', r'<iframe\b[^>]+\bsrc=["\']((?:https?:)?//video\.vice\.com/[^/]+/embed/[\da-f]{24})',
webpage) webpage)
@staticmethod @staticmethod
@ -109,31 +120,16 @@ def _extract_url(webpage):
def _real_extract(self, url): def _real_extract(self, url):
locale, video_id = re.match(self._VALID_URL, url).groups() locale, video_id = re.match(self._VALID_URL, url).groups()
webpage = self._download_webpage( video = self._call_api('videos', 'id', video_id, locale, '''body
'https://video.vice.com/%s/embed/%s' % (locale, video_id), locked
video_id) rating
thumbnail_url
video = self._parse_json( title''')[0]
self._search_regex( title = video['title'].strip()
r'PREFETCH_DATA\s*=\s*({.+?})\s*;\s*\n', webpage,
'app state'), video_id)['video']
video_id = video.get('vms_id') or video.get('id') or video_id
title = video['title']
is_locked = video.get('locked')
rating = video.get('rating') rating = video.get('rating')
thumbnail = video.get('thumbnail_url')
duration = int_or_none(video.get('duration'))
series = try_get(
video, lambda x: x['episode']['season']['show']['title'],
compat_str)
episode_number = try_get(
video, lambda x: x['episode']['episode_number'])
season_number = try_get(
video, lambda x: x['episode']['season']['season_number'])
uploader = None
query = {} query = {}
if is_locked: if video.get('locked'):
resource = self._get_mvpd_resource( resource = self._get_mvpd_resource(
'VICELAND', title, video_id, rating) 'VICELAND', title, video_id, rating)
query['tvetoken'] = self._extract_mvpd_auth( query['tvetoken'] = self._extract_mvpd_auth(
@ -148,12 +144,9 @@ def _real_extract(self, url):
query.update({ query.update({
'exp': exp, 'exp': exp,
'sign': hashlib.sha512(('%s:GET:%d' % (video_id, exp)).encode()).hexdigest(), 'sign': hashlib.sha512(('%s:GET:%d' % (video_id, exp)).encode()).hexdigest(),
'_ad_blocked': None, 'skipadstitching': 1,
'_ad_unit': '',
'_debug': '',
'platform': 'desktop', 'platform': 'desktop',
'rn': random.randint(10000, 100000), 'rn': random.randint(10000, 100000),
'fbprebidtoken': '',
}) })
try: try:
@ -169,85 +162,94 @@ def _real_extract(self, url):
raise raise
video_data = preplay['video'] video_data = preplay['video']
base = video_data['base'] formats = self._extract_m3u8_formats(
uplynk_preplay_url = preplay['preplayURL'] preplay['playURL'], video_id, 'mp4', 'm3u8_native')
episode = video_data.get('episode', {}) self._sort_formats(formats)
channel = video_data.get('channel', {}) episode = video_data.get('episode') or {}
channel = video_data.get('channel') or {}
season = video_data.get('season') or {}
subtitles = {} subtitles = {}
cc_url = preplay.get('ccURL') for subtitle in preplay.get('subtitleURLs', []):
if cc_url: cc_url = subtitle.get('url')
subtitles['en'] = [{ if not cc_url:
continue
language_code = try_get(subtitle, lambda x: x['languages'][0]['language_code'], compat_str) or 'en'
subtitles.setdefault(language_code, []).append({
'url': cc_url, 'url': cc_url,
}] })
return { return {
'_type': 'url_transparent', 'formats': formats,
'url': uplynk_preplay_url,
'id': video_id, 'id': video_id,
'title': title, 'title': title,
'description': base.get('body') or base.get('display_body'), 'description': clean_html(video.get('body')),
'thumbnail': thumbnail, 'thumbnail': video.get('thumbnail_url'),
'duration': int_or_none(video_data.get('video_duration')) or duration, 'duration': int_or_none(video_data.get('video_duration')),
'timestamp': int_or_none(video_data.get('created_at'), 1000), 'timestamp': int_or_none(video_data.get('created_at'), 1000),
'age_limit': parse_age_limit(video_data.get('video_rating')), 'age_limit': parse_age_limit(video_data.get('video_rating') or rating),
'series': video_data.get('show_title') or series, 'series': try_get(video_data, lambda x: x['show']['base']['display_title'], compat_str),
'episode_number': int_or_none(episode.get('episode_number') or episode_number), 'episode_number': int_or_none(episode.get('episode_number')),
'episode_id': str_or_none(episode.get('id') or video_data.get('episode_id')), 'episode_id': str_or_none(episode.get('id') or video_data.get('episode_id')),
'season_number': int_or_none(season_number), 'season_number': int_or_none(season.get('season_number')),
'season_id': str_or_none(episode.get('season_id')), 'season_id': str_or_none(season.get('id') or video_data.get('season_id')),
'uploader': channel.get('base', {}).get('title') or channel.get('name') or uploader, 'uploader': channel.get('name'),
'uploader_id': str_or_none(channel.get('id')), 'uploader_id': str_or_none(channel.get('id')),
'subtitles': subtitles, 'subtitles': subtitles,
'ie_key': 'UplynkPreplay',
} }
class ViceShowIE(InfoExtractor): class ViceShowIE(ViceBaseIE):
IE_NAME = 'vice:show' IE_NAME = 'vice:show'
_VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?show/(?P<id>[^/?#&]+)' _VALID_URL = r'https?://(?:video\.vice|(?:www\.)?vice(?:land|tv))\.com/(?P<locale>[^/]+)/show/(?P<id>[^/?#&]+)'
_PAGE_SIZE = 25
_TEST = { _TESTS = [{
'url': 'https://munchies.vice.com/en/show/fuck-thats-delicious-2', 'url': 'https://video.vice.com/en_us/show/fck-thats-delicious',
'info_dict': { 'info_dict': {
'id': 'fuck-thats-delicious-2', 'id': '57a2040c8cb727dec794c901',
'title': "Fuck, That's Delicious", 'title': 'F*ck, Thats Delicious',
'description': 'Follow the culinary adventures of rapper Action Bronson during his ongoing world tour.', 'description': 'The life and eating habits of raps greatest bon vivant, Action Bronson.',
}, },
'playlist_count': 17, 'playlist_mincount': 64,
} }, {
'url': 'https://www.vicetv.com/en_us/show/fck-thats-delicious',
'only_matching': True,
}]
def _fetch_page(self, locale, show_id, page):
    """Yield a ``url_result`` entry for each video on one page of a show.

    Queries the ``videos`` resource filtered by ``show_id`` through
    ``self._call_api``, requesting ``self._PAGE_SIZE`` items per page,
    and delegates every returned video URL to ``ViceIE``.
    """
    # The API receives ``page + 1`` (presumably the caller counts pages
    # from zero while the API counts from one -- confirm).
    paging_args = ', page: %d, per_page: %d' % (page + 1, self._PAGE_SIZE)
    page_items = self._call_api(
        'videos', 'show_id', show_id, locale, '''body
id
url''', paging_args)
    for item in page_items:
        # ``url`` is required; ``id`` is optional and may be None.
        yield self.url_result(item['url'], ViceIE.ie_key(), item.get('id'))
def _real_extract(self, url): def _real_extract(self, url):
show_id = self._match_id(url) locale, display_id = re.match(self._VALID_URL, url).groups()
webpage = self._download_webpage(url, show_id) show = self._call_api('shows', 'slug', display_id, locale, '''dek
id
title''')[0]
show_id = show['id']
entries = [ entries = OnDemandPagedList(
self.url_result(video_url, ViceIE.ie_key()) functools.partial(self._fetch_page, locale, show_id),
for video_url, _ in re.findall( self._PAGE_SIZE)
r'<h2[^>]+class="article-title"[^>]+data-id="\d+"[^>]*>\s*<a[^>]+href="(%s.*?)"'
% ViceIE._VALID_URL, webpage)]
title = self._search_regex( return self.playlist_result(
r'<title>(.+?)</title>', webpage, 'title', default=None) entries, show_id, show.get('title'), show.get('dek'))
if title:
title = re.sub(r'(.+)\s*\|\s*.+$', r'\1', title).strip()
description = self._html_search_meta(
'description', webpage, 'description')
return self.playlist_result(entries, show_id, title, description)
class ViceArticleIE(InfoExtractor): class ViceArticleIE(ViceBaseIE):
IE_NAME = 'vice:article' IE_NAME = 'vice:article'
_VALID_URL = r'https://www\.vice\.com/[^/]+/article/(?P<id>[^?#]+)' _VALID_URL = r'https://(?:www\.)?vice\.com/(?P<locale>[^/]+)/article/(?:[0-9a-z]{6}/)?(?P<id>[^?#]+)'
_TESTS = [{ _TESTS = [{
'url': 'https://www.vice.com/en_us/article/on-set-with-the-woman-making-mormon-porn-in-utah', 'url': 'https://www.vice.com/en_us/article/on-set-with-the-woman-making-mormon-porn-in-utah',
'info_dict': { 'info_dict': {
'id': '41eae2a47b174a1398357cec55f1f6fc', 'id': '58dc0a3dee202d2a0ccfcbd8',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Mormon War on Porn ', 'title': 'Mormon War on Porn',
'description': 'md5:6394a8398506581d0346b9ab89093fef', 'description': 'md5:1c5d91fe25fa8aa304f9def118b92dbf',
'uploader': 'vice', 'uploader': 'vice',
'uploader_id': '57a204088cb727dec794c67b', 'uploader_id': '57a204088cb727dec794c67b',
'timestamp': 1491883129, 'timestamp': 1491883129,
@ -258,10 +260,10 @@ class ViceArticleIE(InfoExtractor):
# AES-encrypted m3u8 # AES-encrypted m3u8
'skip_download': True, 'skip_download': True,
}, },
'add_ie': ['UplynkPreplay'], 'add_ie': [ViceIE.ie_key()],
}, { }, {
'url': 'https://www.vice.com/en_us/article/how-to-hack-a-car', 'url': 'https://www.vice.com/en_us/article/how-to-hack-a-car',
'md5': '7fe8ebc4fa3323efafc127b82bd821d9', 'md5': '13010ee0bc694ea87ec40724397c2349',
'info_dict': { 'info_dict': {
'id': '3jstaBeXgAs', 'id': '3jstaBeXgAs',
'ext': 'mp4', 'ext': 'mp4',
@ -271,15 +273,15 @@ class ViceArticleIE(InfoExtractor):
'uploader_id': 'MotherboardTV', 'uploader_id': 'MotherboardTV',
'upload_date': '20140529', 'upload_date': '20140529',
}, },
'add_ie': ['Youtube'], 'add_ie': [YoutubeIE.ie_key()],
}, { }, {
'url': 'https://www.vice.com/en_us/article/znm9dx/karley-sciortino-slutever-reloaded', 'url': 'https://www.vice.com/en_us/article/znm9dx/karley-sciortino-slutever-reloaded',
'md5': 'a7ecf64ee4fa19b916c16f4b56184ae2', 'md5': 'a7ecf64ee4fa19b916c16f4b56184ae2',
'info_dict': { 'info_dict': {
'id': 'e2ed435eb67e43efb66e6ef9a6930a88', 'id': '57f41d3556a0a80f54726060',
'ext': 'mp4', 'ext': 'mp4',
'title': "Making The World's First Male Sex Doll", 'title': "Making The World's First Male Sex Doll",
'description': 'md5:916078ef0e032d76343116208b6cc2c4', 'description': 'md5:19b00b215b99961cf869c40fbe9df755',
'uploader': 'vice', 'uploader': 'vice',
'uploader_id': '57a204088cb727dec794c67b', 'uploader_id': '57a204088cb727dec794c67b',
'timestamp': 1476919911, 'timestamp': 1476919911,
@ -288,6 +290,7 @@ class ViceArticleIE(InfoExtractor):
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
'format': 'bestvideo',
}, },
'add_ie': [ViceIE.ie_key()], 'add_ie': [ViceIE.ie_key()],
}, { }, {
@ -299,14 +302,11 @@ class ViceArticleIE(InfoExtractor):
}] }]
def _real_extract(self, url): def _real_extract(self, url):
display_id = self._match_id(url) locale, display_id = re.match(self._VALID_URL, url).groups()
webpage = self._download_webpage(url, display_id) article = self._call_api('articles', 'slug', display_id, locale, '''body
embed_code''')[0]
prefetch_data = self._parse_json(self._search_regex( body = article['body']
r'__APP_STATE\s*=\s*({.+?})(?:\s*\|\|\s*{}\s*)?;\s*\n',
webpage, 'app state'), display_id)['pageData']
body = prefetch_data['body']
def _url_res(video_url, ie_key): def _url_res(video_url, ie_key):
return { return {
@ -316,7 +316,7 @@ def _url_res(video_url, ie_key):
'ie_key': ie_key, 'ie_key': ie_key,
} }
vice_url = ViceIE._extract_url(webpage) vice_url = ViceIE._extract_url(body)
if vice_url: if vice_url:
return _url_res(vice_url, ViceIE.ie_key()) return _url_res(vice_url, ViceIE.ie_key())
@ -332,6 +332,6 @@ def _url_res(video_url, ie_key):
video_url = self._html_search_regex( video_url = self._html_search_regex(
r'data-video-url="([^"]+)"', r'data-video-url="([^"]+)"',
prefetch_data['embed_code'], 'video URL') article['embed_code'], 'video URL')
return _url_res(video_url, ViceIE.ie_key()) return _url_res(video_url, ViceIE.ie_key())