[MainStreaming] Add extractor (#2180)

Closes #1183, https://github.com/ytdl-org/youtube-dl/issues/29615

Authored by: coletdjnz
This commit is contained in:
coletdjnz 2022-01-05 08:48:17 +00:00 committed by GitHub
parent 9f517bb1f3
commit 9c634ef857
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 240 additions and 1 deletions

View file

@ -756,6 +756,7 @@
MailRuMusicIE, MailRuMusicIE,
MailRuMusicSearchIE, MailRuMusicSearchIE,
) )
from .mainstreaming import MainStreamingIE
from .malltv import MallTVIE from .malltv import MallTVIE
from .mangomolo import ( from .mangomolo import (
MangomoloVideoIE, MangomoloVideoIE,

View file

@ -137,6 +137,7 @@
from .wimtv import WimTVIE from .wimtv import WimTVIE
from .tvp import TVPEmbedIE from .tvp import TVPEmbedIE
from .blogger import BloggerIE from .blogger import BloggerIE
from .mainstreaming import MainStreamingIE
from .gfycat import GfycatIE from .gfycat import GfycatIE
@ -2384,6 +2385,19 @@ class GenericIE(InfoExtractor):
'upload_date': '20211113' 'upload_date': '20211113'
} }
}, },
{
# MainStreaming player
'url': 'https://www.lactv.it/2021/10/03/lac-news24-la-settimana-03-10-2021/',
'info_dict': {
'id': 'EUlZfGWkGpOd',
'title': 'La Settimana ',
'description': '03 Ottobre ore 02:00',
'ext': 'mp4',
'live_status': 'not_live',
'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
'duration': 1512
}
},
{ {
# Multiple gfycat iframe embeds # Multiple gfycat iframe embeds
'url': 'https://www.gezip.net/bbs/board.php?bo_table=entertaine&wr_id=613422', 'url': 'https://www.gezip.net/bbs/board.php?bo_table=entertaine&wr_id=613422',
@ -2411,7 +2425,6 @@ class GenericIE(InfoExtractor):
}, },
'playlist_count': 9 'playlist_count': 9
} }
#
] ]
def report_following_redirect(self, new_url): def report_following_redirect(self, new_url):
@ -3600,10 +3613,16 @@ def _real_extract(self, url):
if tvp_urls: if tvp_urls:
return self.playlist_from_matches(tvp_urls, video_id, video_title, ie=TVPEmbedIE.ie_key()) return self.playlist_from_matches(tvp_urls, video_id, video_title, ie=TVPEmbedIE.ie_key())
# Look for MainStreaming embeds
mainstreaming_urls = MainStreamingIE._extract_urls(webpage)
if mainstreaming_urls:
return self.playlist_from_matches(mainstreaming_urls, video_id, video_title, ie=MainStreamingIE.ie_key())
# Look for Gfycat Embeds # Look for Gfycat Embeds
gfycat_urls = GfycatIE._extract_urls(webpage) gfycat_urls = GfycatIE._extract_urls(webpage)
if gfycat_urls: if gfycat_urls:
return self.playlist_from_matches(gfycat_urls, video_id, video_title, ie=GfycatIE.ie_key()) return self.playlist_from_matches(gfycat_urls, video_id, video_title, ie=GfycatIE.ie_key())
# Look for HTML5 media # Look for HTML5 media
entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
if entries: if entries:

View file

@ -0,0 +1,219 @@
# coding: utf-8
import re

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    int_or_none,
    js_to_json,
    parse_duration,
    traverse_obj,
    try_get,
    urljoin
)
class MainStreamingIE(InfoExtractor):
    """Extractor for content served through the MainStreaming player (``*.msvdn.net``).

    Handles the three content types the code below distinguishes by the numeric
    ``contentType`` field: 10 (regular video), 20 (live stream) and 31 (playlist).
    Metadata is read from the undocumented ``/api/v2/`` JSON API on the
    ``webtools`` host, with the player config embedded in the embed page as a
    fallback.
    """

    # The host group deliberately excludes an optional "webtools"/"webtools-"
    # prefix; _get_webtools_host() re-adds it when the webtools host is needed.
    _VALID_URL = r'https?://(?:webtools-?)?(?P<host>[A-Za-z0-9-]*\.msvdn\.net)/(?:embed|amp_embed|content)/(?P<id>\w+)'
    IE_DESC = 'MainStreaming Player'

    _TESTS = [
        {
            # Live stream offline, has alternative content id
            'url': 'https://webtools-e18da6642b684f8aa9ae449862783a56.msvdn.net/embed/53EN6GxbWaJC',
            'info_dict': {
                'id': '53EN6GxbWaJC',
                'title': 'Diretta homepage 2021-12-31 12:00',
                'description': '',
                'live_status': 'was_live',
                'ext': 'mp4',
                'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
            },
            'expected_warnings': [
                'Ignoring alternative content ID: WDAF1KOWUpH3',
                'MainStreaming said: Live event is OFFLINE'
            ],
            'skip': 'live stream offline'
        }, {
            # playlist
            'url': 'https://webtools-e18da6642b684f8aa9ae449862783a56.msvdn.net/embed/WDAF1KOWUpH3',
            'info_dict': {
                'id': 'WDAF1KOWUpH3',
                'title': 'Playlist homepage',
            },
            'playlist_mincount': 2
        }, {
            # livestream
            'url': 'https://webtools-859c1818ed614cc5b0047439470927b0.msvdn.net/embed/tDoFkZD3T1Lw',
            'info_dict': {
                'id': 'tDoFkZD3T1Lw',
                'title': r're:Class CNBC Live \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
                'live_status': 'is_live',
                'ext': 'mp4',
                'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
            },
            'skip': 'live stream'
        }, {
            'url': 'https://webtools-f5842579ff984c1c98d63b8d789673eb.msvdn.net/embed/EUlZfGWkGpOd?autoPlay=false',
            'info_dict': {
                'id': 'EUlZfGWkGpOd',
                'title': 'La Settimana ',
                'description': '03 Ottobre ore 02:00',
                'ext': 'mp4',
                'live_status': 'not_live',
                'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
                'duration': 1512
            }
        }, {
            # video without webtools- prefix
            'url': 'https://f5842579ff984c1c98d63b8d789673eb.msvdn.net/embed/MfuWmzL2lGkA?autoplay=false&T=1635860445',
            'info_dict': {
                'id': 'MfuWmzL2lGkA',
                'title': 'TG Mattina',
                'description': '06 Ottobre ore 08:00',
                'ext': 'mp4',
                'live_status': 'not_live',
                'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
                'duration': 789.04
            }
        }, {
            # always-on livestream with DVR
            'url': 'https://webtools-f5842579ff984c1c98d63b8d789673eb.msvdn.net/embed/HVvPMzy',
            'info_dict': {
                'id': 'HVvPMzy',
                'title': r're:^Diretta LaC News24 \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
                'description': 'canale all news',
                'live_status': 'is_live',
                'ext': 'mp4',
                'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
            },
            'params': {
                'skip_download': True,
            },
        }, {
            # no host
            'url': 'https://webtools.msvdn.net/embed/MfuWmzL2lGkA',
            'only_matching': True
        }, {
            'url': 'https://859c1818ed614cc5b0047439470927b0.msvdn.net/amp_embed/tDoFkZD3T1Lw',
            'only_matching': True
        }, {
            'url': 'https://859c1818ed614cc5b0047439470927b0.msvdn.net/content/tDoFkZD3T1Lw#',
            'only_matching': True
        }
    ]

    @staticmethod
    def _extract_urls(webpage):
        """Return the MainStreaming player URLs embedded in *webpage* via ``<iframe>``.

        Always returns a list; it is empty when the page has no matching iframe.
        """
        # _VALID_URL contributes its own groups, so findall() yields
        # (url, host, id) tuples; only the full URL is of interest here.
        return [
            groups[0] for groups in re.findall(
                r'<iframe[^>]+?src=["\']?(?P<url>%s)["\']?' % MainStreamingIE._VALID_URL, webpage)]

    def _playlist_entries(self, host, playlist_content):
        """Yield one ``url``-type result per playlist item in *playlist_content*.

        *playlist_content* may be ``None`` (treated as empty). Items without a
        ``contentID`` are skipped because no embed URL can be built for them.
        """
        for entry in playlist_content or []:
            content_id = entry.get('contentID')
            if not content_id:
                continue
            yield {
                '_type': 'url',
                'ie_key': MainStreamingIE.ie_key(),
                'id': content_id,
                'duration': int_or_none(traverse_obj(entry, ('duration', 'totalSeconds'))),
                'title': entry.get('title'),
                'url': f'https://{host}/embed/{content_id}'
            }

    @staticmethod
    def _get_webtools_host(host):
        """Return *host* with the ``webtools`` prefix added if it is not already present."""
        if not host.startswith('webtools'):
            # A bare ".msvdn.net" host becomes "webtools.msvdn.net"; any other
            # host is joined with a dash ("webtools-<host>").
            host = 'webtools' + ('-' if not host.startswith('.') else '') + host
        return host

    def _get_webtools_base_url(self, host):
        """Return the scheme-qualified base URL of the webtools host for *host*."""
        return f'{self.http_scheme()}//{self._get_webtools_host(host)}'

    def _call_api(self, host: str, path: str, item_id: str, query=None, note='Downloading API JSON', fatal=False):
        """Download JSON from ``/api/v2/<path>`` on the webtools host (non-fatal by default)."""
        # JSON API, does not appear to be documented
        return self._call_webtools_api(host, '/api/v2/' + path, item_id, query, note, fatal)

    def _call_webtools_api(self, host: str, path: str, item_id: str, query=None, note='Downloading webtools API JSON', fatal=False):
        """Download JSON from *path* on the webtools host (non-fatal by default)."""
        # webtools docs: https://webtools.msvdn.net/
        return self._download_json(
            urljoin(self._get_webtools_base_url(host), path), item_id, query=query, note=note, fatal=fatal)

    def _real_extract(self, url):
        """Extract a video, live stream or playlist from a MainStreaming player URL.

        Raises ExtractorError when neither the API nor the embed page yields
        any content info.
        """
        host, video_id = self._match_valid_url(url).groups()
        content_info = try_get(
            self._call_api(
                host, f'content/{video_id}', video_id, note='Downloading content info API JSON'), lambda x: x['playerContentInfo'])

        # Fallback: read the player config embedded in the embed page
        if not content_info:
            webpage = self._download_webpage(url, video_id)
            player_config = self._parse_json(
                self._search_regex(
                    r'config\s*=\s*({.+?})\s*;', webpage, 'mainstreaming player config',
                    default='{}', flags=re.DOTALL),
                video_id, transform_source=js_to_json, fatal=False) or {}
            content_info = player_config.get('contentInfo')
            if not content_info:
                # Both sources failed; fail with a descriptive error instead of a KeyError
                raise ExtractorError('Unable to extract MainStreaming content info', video_id=video_id)

        # Prefer the values reported by the service over those parsed from the URL
        host = content_info.get('host') or host
        video_id = content_info.get('contentID') or video_id
        title = content_info.get('title')
        description = traverse_obj(content_info, 'longDescription', 'shortDescription', expected_type=str)
        live_status = 'not_live'

        if content_info.get('drmEnabled'):
            self.report_drm(video_id)

        alternative_content_id = content_info.get('alternativeContentID')
        if alternative_content_id:
            self.report_warning(f'Ignoring alternative content ID: {alternative_content_id}')

        content_type = int_or_none(content_info.get('contentType'))

        # Template with a single %s placeholder for the manifest/file name
        format_base_url = None
        formats = []
        subtitles = {}

        # Live content
        if content_type == 20:
            dvr_enabled = traverse_obj(content_info, ('playerSettings', 'dvrEnabled'), expected_type=bool)
            format_base_url = f"https://{host}/live/{content_info['liveSourceID']}/{video_id}/%s{'?DVR' if dvr_enabled else ''}"
            live_status = 'is_live'
            heartbeat = self._call_api(host, f'heartbeat/{video_id}', video_id, note='Checking stream status') or {}
            # Only an explicit False marks the stream as down; a missing
            # heartbeat response leaves the status as 'is_live'.
            if heartbeat.get('heartBeatUp') is False:
                self.raise_no_formats(f'MainStreaming said: {heartbeat.get("responseMessage")}', expected=True)
                live_status = 'was_live'

        # Playlist
        elif content_type == 31:
            return self.playlist_result(
                self._playlist_entries(host, content_info.get('playlistContents')), video_id, title, description)

        # Normal video content?
        elif content_type == 10:
            format_base_url = f'https://{host}/vod/{video_id}/%s'
            # Progressive format
            # Note: in https://webtools.msvdn.net/loader/playerV2.js there is mention of original.mp3 format,
            # however it seems to be the same as original.mp4?
            formats.append({'url': format_base_url % 'original.mp4', 'format_note': 'original', 'quality': 1})
        else:
            self.raise_no_formats(f'Unknown content type {content_type}')

        if format_base_url:
            # Both manifests are optional (fatal=False); merge whatever is available
            m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
                format_base_url % 'playlist.m3u8', video_id=video_id, fatal=False)
            mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(
                format_base_url % 'manifest.mpd', video_id=video_id, fatal=False)

            subtitles = self._merge_subtitles(m3u8_subs, mpd_subs)
            formats.extend(m3u8_formats + mpd_formats)

        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'formats': formats,
            'live_status': live_status,
            'duration': parse_duration(content_info.get('duration')),
            'tags': content_info.get('tags'),
            'subtitles': subtitles,
            'thumbnail': urljoin(self._get_webtools_base_url(host), f'image/{video_id}/poster')
        }