[ie/MuseAI] Add extractor (#7614)

Closes #7543
Authored by: bashonly
This commit is contained in:
bashonly 2023-07-20 09:15:21 -05:00 committed by GitHub
parent f4ea501551
commit 65cfa2b057
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 113 additions and 0 deletions

View file

@ -1142,6 +1142,7 @@
) )
from .muenchentv import MuenchenTVIE from .muenchentv import MuenchenTVIE
from .murrtube import MurrtubeIE, MurrtubeUserIE from .murrtube import MurrtubeIE, MurrtubeUserIE
from .museai import MuseAIIE
from .musescore import MuseScoreIE from .musescore import MuseScoreIE
from .musicdex import ( from .musicdex import (
MusicdexSongIE, MusicdexSongIE,

112
yt_dlp/extractor/museai.py Normal file
View file

@ -0,0 +1,112 @@
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
determine_ext,
float_or_none,
int_or_none,
js_to_json,
traverse_obj,
url_or_none,
)
class MuseAIIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?muse\.ai/(?:v|embed)/(?P<id>\w+)'
_TESTS = [{
'url': 'https://muse.ai/embed/YdTWvUW',
'md5': 'f994f9a38be1c3aaf9e37cbd7d76fe7c',
'info_dict': {
'id': 'YdTWvUW',
'ext': 'mp4',
'title': '2023-05-28-Grabien-1941111 (1)',
'description': '',
'uploader': 'Today News Africa',
'uploader_id': 'TodayNewsAfrica',
'upload_date': '20230528',
'timestamp': 1685285044,
'duration': 1291.3,
'view_count': int,
'availability': 'public',
},
}, {
'url': 'https://muse.ai/v/gQ4gGAA-0756',
'md5': '52dbfc78e865e56dc19a1715badc35e8',
'info_dict': {
'id': 'gQ4gGAA',
'ext': 'mp4',
'title': '0756',
'description': 'md5:0ca1483f9aac423e9a96ad00bb3a0785',
'uploader': 'Aerial.ie',
'uploader_id': 'aerial',
'upload_date': '20210306',
'timestamp': 1615072842,
'duration': 21.4,
'view_count': int,
'availability': 'public',
},
}]
_WEBPAGE_TESTS = [{
'url': 'https://muse.ai/docs',
'playlist_mincount': 4,
'info_dict': {
'id': 'docs',
'title': 'muse.ai | docs',
'description': 'md5:6c0293431481582739c82ee8902687fa',
'age_limit': 0,
'thumbnail': 'https://muse.ai/static/imgs/poster-img-docs.png',
},
'params': {'allowed_extractors': ['all', '-html5']},
}]
_EMBED_REGEX = [r'<iframe[^>]*\bsrc=["\'](?P<url>https://muse\.ai/embed/\w+)']
@classmethod
def _extract_embed_urls(cls, url, webpage):
yield from super()._extract_embed_urls(url, webpage)
for embed_id in re.findall(r'<script>[^<]*\bMusePlayer\(\{[^}<]*\bvideo:\s*["\'](\w+)["\']', webpage):
yield f'https://muse.ai/embed/{embed_id}'
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(f'https://muse.ai/embed/{video_id}', video_id)
data = self._search_json(
r'player\.setData\(', webpage, 'player data', video_id, transform_source=js_to_json)
source_url = data['url']
if not url_or_none(source_url):
raise ExtractorError('Unable to extract video URL')
formats = [{
'url': source_url,
'format_id': 'source',
'quality': 1,
**traverse_obj(data, {
'ext': ('filename', {determine_ext}),
'width': ('width', {int_or_none}),
'height': ('height', {int_or_none}),
'filesize': ('size', {int_or_none}),
}),
}]
if source_url.endswith('/data'):
base_url = f'{source_url[:-5]}/videos'
formats.extend(self._extract_m3u8_formats(
f'{base_url}/hls.m3u8', video_id, m3u8_id='hls', fatal=False))
formats.extend(self._extract_mpd_formats(
f'{base_url}/dash.mpd', video_id, mpd_id='dash', fatal=False))
return {
'id': video_id,
'formats': formats,
**traverse_obj(data, {
'title': ('title', {str}),
'description': ('description', {str}),
'duration': ('duration', {float_or_none}),
'timestamp': ('tcreated', {int_or_none}),
'uploader': ('owner_name', {str}),
'uploader_id': ('owner_username', {str}),
'view_count': ('views', {int_or_none}),
'age_limit': ('mature', {lambda x: 18 if x else None}),
'availability': ('visibility', {lambda x: x if x in ('private', 'unlisted') else 'public'}),
}),
}