import re
from collections import defaultdict

from .common import InfoExtractor
from ..utils import (
    clean_html,
    float_or_none,
    int_or_none,
    parse_duration,
    unescapeHTML,
    urljoin,
)


# Extractor for file pages on amvnews.ru
# (``index.php?go=Files&in=view&id=<id>``).  Every "Download ..." anchor on
# the page becomes either a subtitle track or a video format, depending on
# the anchor label and on the tooltip text passed to ``overlib('...')``.
class AMVNewsIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?amvnews\.ru/(?:index\.php)?\?go=Files&in=view&id=(?P<id>[0-9]+)'
    _BASE_URL = 'https://amvnews.ru/'

    # Captures, for every download anchor: the href (``link``), the optional
    # overlib tooltip text (``info``) and the label following "Download"
    # (``name``).  ``info`` is '' when the anchor has no overlib tooltip.
    # NOTE(review): the leading ``<a ... href="...">`` part of this pattern was
    # reconstructed from how the groups are consumed below -- verify it against
    # the live page markup.
    _DOWNLOAD_LINK_RE = (
        r'<a\s[^>]*?href="(?P<link>[^"]+)"[^>]*?'
        r'(?:overlib\(\'(?P<info>[^\']*)\'[^>]*)?>Download *(?P<name>[^<]*)'
    )

    _TESTS = [{
        'url': 'https://amvnews.ru/index.php?go=Files&in=view&id=12345',
        'info_dict': {
            'id': '12345',
            'ext': 'mp4',
            'description': 'md5:3c1391ce952f2125ce615b43081de1d0',
            'title': 'Jadeite | Music: Jai Wolf - Lost',
            'duration': 113,
            'creator': 'Leafa',
            'formats': [
                {
                    'url': 'https://amvnews.ru/index.php?go=Files&file=down&id=12345&alt=4',
                    'ext': 'mp4',
                    'vcodec': 'h264',
                    'acodec': 'aac',
                    'width': 640,
                    'height': 360,
                    'fps': 23.98,
                },
                {
                    'url': 'https://amvnews.ru/index.php?go=Files&file=down&id=12345',
                    'ext': 'mp4',
                    'vcodec': 'h264',
                    'acodec': 'aac',
                    'width': 1920,
                    'height': 1080,
                    'fps': 23.98,
                },
                {
                    'url': 'https://amvnews.ru/index.php?go=Files&file=down&id=12345&alt=1',
                    'ext': 'mp4',
                    'vcodec': 'h264',
                    'acodec': 'aac',
                    'width': 3840,
                    'height': 2160,
                    'fps': 23.98,
                },
            ],
        },
    }]

    def _search_info(self, pattern, info, name):
        """Search the tooltip text *info* for *pattern*, case-insensitively.

        The lookup is non-fatal (``fatal=False``), so a missing field does not
        abort the extraction.
        """
        return self._search_regex(
            pattern, info, name, fatal=False, flags=re.IGNORECASE)

    def _real_extract(self, html_url):
        """Build the info dict for the file page at *html_url*.

        Returns a dict with ``id``, ``title``, ``description``, ``creator``,
        ``url``, ``formats`` and ``subtitles``.  When no format could be
        parsed, ``formats`` is ``None`` and ``url`` holds the default download
        link for the file; otherwise ``url`` is ``None``.
        """
        video_id = self._match_id(html_url)
        webpage = self._download_webpage(html_url, video_id)

        formats = []
        subtitles = defaultdict(list)
        for link, info, name in re.findall(
                self._DOWNLOAD_LINK_RE, webpage, flags=re.IGNORECASE):
            url = urljoin(self._BASE_URL, unescapeHTML(link))
            clean_name = clean_html(name)
            name_lower = clean_name.lower()
            info_lower = info.lower()

            if 'subtitle' in name_lower:
                # there are usually only english and russian subtitles (en, ru)
                # so the first two letters of the label are used as the
                # language key
                subtitles[name_lower[0:2]].append({
                    'url': url,
                    'ext': self._search_regex(
                        r'type: (\w+)', info_lower, 'ext', default='srt'),
                    'name': clean_name,
                })
            elif 'resolution: ' in info_lower:
                formats.append({
                    'url': url,
                    'ext': 'mp4',
                    'format_note': clean_name,
                    'vcodec': self._search_info(r'Codecs: (\w+)', info, 'vcodec'),
                    # The audio codec follows the video codec (and any
                    # parenthesised notes) after a slash.
                    'acodec': self._search_info(
                        r'Codecs: \w+(?:\s*\([^\)]*\))*\/(\w+)', info, 'acodec'),
                    'width': int_or_none(self._search_info(
                        r'Resolution: (\d+)', info, 'width')),
                    'height': int_or_none(self._search_info(
                        r'Resolution: \d+x(\d+)', info, 'height')),
                    'fps': float_or_none(self._search_info(
                        r'Resolution: \d+x\d+\@([\d\.]+)', info, 'fps')),
                    'duration': parse_duration(self._search_info(
                        r'Duration: ([ \w]+)', info, 'duration')),
                })

        title = self._html_extract_title(webpage)
        if title:
            title = title.removeprefix('AMV | Videos | ')

        url = None
        if not formats:
            # use "url" field instead
            formats = None
            url = f'{self._BASE_URL}index.php?go=Files&file=down&id={video_id}'

        return {
            'id': video_id,
            'title': title,
            # NOTE(review): the HTML anchors around the capture groups of the
            # two patterns below are missing from this copy of the file; only
            # the visible remainder is kept.  Restore the original markup
            # anchors before relying on 'description' / 'creator'.
            'description': self._html_search_regex(
                r'\n(.*?)\n', webpage, 'description', fatal=False,
                flags=re.DOTALL | re.IGNORECASE),
            'creator': self._html_search_regex(
                r'(.*?)', webpage, 'creator', fatal=False,
                flags=re.IGNORECASE),
            'url': url,
            'formats': formats,
            'subtitles': subtitles,
        }