[RTP] Fix extraction and add subtitles (#497)

Authored by: fstirlitz
This commit is contained in:
Felix S 2021-07-14 01:36:18 +02:00 committed by GitHub
parent c843e68588
commit 182b6ae8a6
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -2,10 +2,11 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import js_to_json
determine_ext, import re
js_to_json, import json
) import urllib.parse
import base64
class RTPIE(InfoExtractor): class RTPIE(InfoExtractor):
@ -25,6 +26,22 @@ class RTPIE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
# Matches the JavaScript string-obfuscation idiom
#     atob(decodeURIComponent([ ... ].join("")))
# Group 1 captures the bracketed array literal, whose contents are limited
# to ASCII alphanumerics, '%', ',' and either quote character. The join
# separator must be an empty string, single- or double-quoted.
# The (?x) flag makes whitespace and newlines inside the pattern insignificant.
_RX_OBFUSCATION = re.compile(r'''(?xs)
atob\s*\(\s*decodeURIComponent\s*\(\s*
(\[[0-9A-Za-z%,'"]*\])
\s*\.\s*join\(\s*(?:""|'')\s*\)\s*\)\s*\)
''')
def __unobfuscate(self, data, *, video_id):
    """Turn a JavaScript literal taken from the player page into JSON text.

    When ``data`` is an object literal (it starts with ``{``), every
    ``atob(decodeURIComponent([...].join("")))`` expression matched by
    ``_RX_OBFUSCATION`` is replaced by the JSON string literal it evaluates
    to: the array elements are concatenated, percent-decoded,
    base64-decoded, and the resulting bytes are read as ISO-8859-1.
    The (possibly rewritten) text is then passed through ``js_to_json``.

    :param data: JavaScript source of a string or object literal.
    :param video_id: identifier forwarded to ``_parse_json`` for its
        error reporting (keyword-only).
    :return: the value returned by ``js_to_json`` for the processed text.
    """
    if data.startswith('{'):
        def decode_match(m):
            # The pattern admits single-quoted array elements, which strict
            # JSON parsing rejects; run the array through js_to_json first
            # so both quoting styles are accepted.
            pieces = self._parse_json(m.group(1), video_id, js_to_json)
            encoded = urllib.parse.unquote(''.join(pieces))
            decoded = base64.b64decode(encoded).decode('iso-8859-1')
            # Re-emit as a JSON string literal so it can be spliced back
            # into the surrounding object source.
            return json.dumps(decoded)

        data = self._RX_OBFUSCATION.sub(decode_match, data)
    return js_to_json(data)
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
@ -32,30 +49,46 @@ def _real_extract(self, url):
title = self._html_search_meta( title = self._html_search_meta(
'twitter:title', webpage, display_name='title', fatal=True) 'twitter:title', webpage, display_name='title', fatal=True)
config = self._parse_json(self._search_regex( f, config = self._search_regex(
r'(?s)RTPPlayer\(({.+?})\);', webpage, r'''(?sx)
'player config'), video_id, js_to_json) var\s+f\s*=\s*(?P<f>".*?"|{[^;]+?});\s*
file_url = config['file'] var\s+player1\s+=\s+new\s+RTPPlayer\s*\((?P<config>{(?:(?!\*/).)+?})\);(?!\s*\*/)
ext = determine_ext(file_url) ''', webpage,
if ext == 'm3u8': 'player config', group=('f', 'config'))
file_key = config.get('fileKey')
formats = self._extract_m3u8_formats( f = self._parse_json(
file_url, video_id, 'mp4', 'm3u8_native', f, video_id,
m3u8_id='hls', fatal=file_key) lambda data: self.__unobfuscate(data, video_id=video_id))
if file_key: config = self._parse_json(
formats.append({ config, video_id,
'url': 'https://cdn-ondemand.rtp.pt' + file_key, lambda data: self.__unobfuscate(data, video_id=video_id))
'quality': 1,
}) formats = []
self._sort_formats(formats) if isinstance(f, dict):
f_hls = f.get('hls')
if f_hls is not None:
formats.extend(self._extract_m3u8_formats(
f_hls, video_id, 'mp4', 'm3u8_native', m3u8_id='hls'))
f_dash = f.get('dash')
if f_dash is not None:
formats.extend(self._extract_mpd_formats(f_dash, video_id, mpd_id='dash'))
else: else:
formats = [{ formats.append({
'url': file_url, 'format_id': 'f',
'ext': ext, 'url': f,
}] 'vcodec': 'none' if config.get('mediaType') == 'audio' else None,
if config.get('mediaType') == 'audio': })
for f in formats:
f['vcodec'] = 'none' subtitles = {}
vtt = config.get('vtt')
if vtt is not None:
for lcode, lname, url in vtt:
subtitles.setdefault(lcode, []).append({
'name': lname,
'url': url,
})
return { return {
'id': video_id, 'id': video_id,
@ -63,4 +96,5 @@ def _real_extract(self, url):
'formats': formats, 'formats': formats,
'description': self._html_search_meta(['description', 'twitter:description'], webpage), 'description': self._html_search_meta(['description', 'twitter:description'], webpage),
'thumbnail': config.get('poster') or self._og_search_thumbnail(webpage), 'thumbnail': config.get('poster') or self._og_search_thumbnail(webpage),
'subtitles': subtitles,
} }