From 284b233891b61503efeed2e5e9b93b6a25b04b86 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Sat, 10 Feb 2024 00:15:06 -0500 Subject: [PATCH 1/6] event video --- yt_dlp/extractor/_extractors.py | 8 +- yt_dlp/extractor/microsoftembed.py | 169 ++++++++++++++++- yt_dlp/extractor/microsoftvirtualacademy.py | 189 -------------------- 3 files changed, 172 insertions(+), 194 deletions(-) delete mode 100644 yt_dlp/extractor/microsoftvirtualacademy.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index f8488d304..113609a6b 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1074,11 +1074,11 @@ from .metacritic import MetacriticIE from .mgtv import MGTVIE from .miaopai import MiaoPaiIE from .microsoftstream import MicrosoftStreamIE -from .microsoftvirtualacademy import ( - MicrosoftVirtualAcademyIE, - MicrosoftVirtualAcademyCourseIE, +from .microsoftembed import ( + MicrosoftEmbedIE, + MicrosoftMediusIE, + MicrosoftLearnIE, ) -from .microsoftembed import MicrosoftEmbedIE from .mildom import ( MildomIE, MildomVodIE, diff --git a/yt_dlp/extractor/microsoftembed.py b/yt_dlp/extractor/microsoftembed.py index f71ab3e92..d03578c74 100644 --- a/yt_dlp/extractor/microsoftembed.py +++ b/yt_dlp/extractor/microsoftembed.py @@ -1,5 +1,13 @@ +import re + from .common import InfoExtractor -from ..utils import int_or_none, traverse_obj, unified_timestamp +from ..utils import ( + int_or_none, + parse_iso8601, + traverse_obj, + unified_timestamp, + url_or_none, +) class MicrosoftEmbedIE(InfoExtractor): @@ -63,3 +71,162 @@ class MicrosoftEmbedIE(InfoExtractor): 'subtitles': subtitles, 'thumbnails': thumbnails, } + + +class MicrosoftMediusBaseIE(InfoExtractor): + def _sub_to_dict(self, subtitile_list): + subtitles = {} + for sub in subtitile_list: + subtitles.setdefault(sub['tag'], []).append({k: v for k, v in sub.items() if k != 'tag'}) + return subtitles + + def _extract_ism(self, 
ism_url, video_id): + formats = self._extract_ism_formats(ism_url, video_id) + for format in formats: + if format.get('language') == 'eng' or 'English' in format.get('format_id', ''): + format['language_preference'] = -1 + else: + format['language_preference'] = -10 + return formats + + +class MicrosoftMediusIE(MicrosoftMediusBaseIE): + _VALID_URL = r'https?://medius\.microsoft\.com/[^?#]+/(?P<id>[0-9a-f\-]+)' + + _TESTS = [{ + 'url': 'https://medius.microsoft.com/Embed/video-nc/9640d86c-f513-4889-959e-5dace86e7d2b', + 'info_dict': { + 'id': '9640d86c-f513-4889-959e-5dace86e7d2b', + 'ext': 'ismv', + 'title': 'Rapidly code, test and ship from secure cloud developer environments', + 'description': 'md5:33c8e4facadc438613476eea24165f71', + 'thumbnail': r're:https://mediusimg\.event\.microsoft\.com/video-\d+/thumbnail\.jpg.*', + 'subtitles': 'count:30', + }, + 'params': {'listsubtitles': True}, + }, { + 'url': 'https://medius.microsoft.com/Embed/video-nc/81215af5-c813-4dcd-aede-94f4e1a7daa3', + 'info_dict': { + 'id': '81215af5-c813-4dcd-aede-94f4e1a7daa3', + 'ext': 'ismv', + 'title': 'Microsoft Build opening', + 'description': 'md5:43455096141077a1f23144cab8cec1cb', + 'thumbnail': r're:https://mediusimg\.event\.microsoft\.com/video-\d+/thumbnail\.jpg.*', + 'subtitles': 'count:31', + }, + 'params': {'listsubtitles': True}, + }] + + def _extract_subtitle(self, webpage, video_id): + captions = traverse_obj( + self._search_json(r'const\s+captionsConfiguration\s*=\s*', webpage, 'captions', video_id, default=False), + ('languageList', ..., { + 'url': ('src', {url_or_none}), + 'tag': ('srclang', {str}), + 'name': ('kind', {str}), + })) + + captions = captions or traverse_obj( + re.findall(r'var\s+file\s+=\s+\{[^}]+\'(https://[^\']+\.vtt\?[^\']+)', webpage), + (lambda _, v: url_or_none(v), {lambda x: {'url': x, 'tag': x.split('.vtt?')[0].split('_')[-1]}})) + + return self._sub_to_dict(captions) + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = 
self._download_webpage(url, video_id) + + ism_url = self._search_regex(r'StreamUrl\s*=\s*"([^"]+manifest)"', webpage, 'ism url') + + return { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'formats': self._extract_ism(ism_url, video_id), + 'thumbnail': self._og_search_thumbnail(webpage), + 'subtitles': self._extract_subtitle(webpage, video_id), + } + + +class MicrosoftLearnIE(MicrosoftMediusBaseIE): + _VALID_URL = [ + r'https://learn\.microsoft\.com/[\w\-]+/(events|shows)/[\w\-]+/(?P<id>[^?#/]+)', + r'https://build\.microsoft\.com/[\w\-]+/sessions/(?P<id>[0-9a-f\-]+)', + ] + + _TESTS = [{ + 'url': 'https://learn.microsoft.com/en-us/events/build-2022/ts01-rapidly-code-test-ship-from-secure-cloud-developer-environments', + 'info_dict': { + 'id': '9640d86c-f513-4889-959e-5dace86e7d2b', + 'ext': 'ismv', + 'title': 'Rapidly code, test and ship from secure cloud developer environments - Events', + 'description': 'md5:f26c1a85d41c1cffd27a0279254a25c3', + 'timestamp': 1653408600, + 'upload_date': '20220524', + 'thumbnail': r're:https://mediusimg\.event\.microsoft\.com/video-\d+/thumbnail\.jpg.*', + }, + }, { + 'url': 'https://learn.microsoft.com/en-us/shows/bash-for-beginners/what-is-the-difference-between-a-terminal-and-a-shell-2-of-20-bash-for-beginners/', + 'info_dict': { + 'id': 'd44e1a03-a0e5-45c2-9496-5c9fa08dc94c', + 'ext': 'ismv', + 'title': 'What is the Difference Between a Terminal and a Shell? 
(Part 2 of 20)', + 'description': 'md5:7bbbfb593d21c2cf2babc3715ade6b88', + 'timestamp': 1676339547, + 'upload_date': '20230214', + 'subtitles': 'count:14', + }, + 'params': {'listsubtitles': True}, + }, { + 'url': 'https://build.microsoft.com/en-US/sessions/49e81029-20f0-485b-b641-73b7f9622656?source=sessions', + 'info_dict': { + 'id': '81215af5-c813-4dcd-aede-94f4e1a7daa3', + 'ext': 'ismv', + 'title': 'Microsoft Build: Highlights from 2023', + 'description': 'md5:24fb8410b48256bb42dfca37eb936583', + 'timestamp': 1684857600, + 'upload_date': '20230523', + 'thumbnail': r're:https://mediusimg\.event\.microsoft\.com/video-\d+/thumbnail\.jpg.*', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + metainfo = { + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + } + + video_url = self._search_regex( + r'\d+)\s*\|\s*(?P.+)', title) - return (int(m.group('chapter')), m.group('title')) if m else (None, title) - - -class MicrosoftVirtualAcademyIE(MicrosoftVirtualAcademyBaseIE): - IE_NAME = 'mva' - IE_DESC = 'Microsoft Virtual Academy videos' - _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/[^/?#&]+-)(?P<course_id>\d+)(?::|\?l=)(?P<id>[\da-zA-Z]+_\d+)' % IE_NAME - - _TESTS = [{ - 'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788?l=gfVXISmEB_6804984382', - 'md5': '7826c44fc31678b12ad8db11f6b5abb9', - 'info_dict': { - 'id': 'gfVXISmEB_6804984382', - 'ext': 'mp4', - 'title': 'Course Introduction', - 'formats': 'mincount:3', - 'subtitles': { - 'en': [{ - 'ext': 'ttml', - }], - }, - } - }, { - 'url': 'mva:11788:gfVXISmEB_6804984382', - 'only_matching': True, - }] - - def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) - - mobj = self._match_valid_url(url) - course_id = mobj.group('course_id') - video_id = 
mobj.group('id') - - base_url = smuggled_data.get('base_url') or self._extract_base_url(course_id, video_id) - - settings = self._download_xml( - '%s/content/content_%s/videosettings.xml?v=1' % (base_url, video_id), - video_id, 'Downloading video settings XML') - - _, title = self._extract_chapter_and_title(xpath_text( - settings, './/Title', 'title', fatal=True)) - - formats = [] - - for sources in settings.findall('.//MediaSources'): - sources_type = sources.get('videoType') - for source in sources.findall('./MediaSource'): - video_url = source.text - if not video_url or not video_url.startswith('http'): - continue - if sources_type == 'smoothstreaming': - formats.extend(self._extract_ism_formats( - video_url, video_id, 'mss', fatal=False)) - continue - video_mode = source.get('videoMode') - height = int_or_none(self._search_regex( - r'^(\d+)[pP]$', video_mode or '', 'height', default=None)) - codec = source.get('codec') - acodec, vcodec = [None] * 2 - if codec: - codecs = codec.split(',') - if len(codecs) == 2: - acodec, vcodec = codecs - elif len(codecs) == 1: - vcodec = codecs[0] - formats.append({ - 'url': video_url, - 'format_id': video_mode, - 'height': height, - 'acodec': acodec, - 'vcodec': vcodec, - }) - - subtitles = {} - for source in settings.findall('.//MarkerResourceSource'): - subtitle_url = source.text - if not subtitle_url: - continue - subtitles.setdefault('en', []).append({ - 'url': '%s/%s' % (base_url, subtitle_url), - 'ext': source.get('type'), - }) - - return { - 'id': video_id, - 'title': title, - 'subtitles': subtitles, - 'formats': formats - } - - -class MicrosoftVirtualAcademyCourseIE(MicrosoftVirtualAcademyBaseIE): - IE_NAME = 'mva:course' - IE_DESC = 'Microsoft Virtual Academy courses' - _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/(?P<display_id>[^/?#&]+)-)(?P<id>\d+)' % IE_NAME - - _TESTS = [{ - 'url': 
'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788', - 'info_dict': { - 'id': '11788', - 'title': 'Microsoft Azure Fundamentals: Virtual Machines', - }, - 'playlist_count': 36, - }, { - # with emphasized chapters - 'url': 'https://mva.microsoft.com/en-US/training-courses/developing-windows-10-games-with-construct-2-16335', - 'info_dict': { - 'id': '16335', - 'title': 'Developing Windows 10 Games with Construct 2', - }, - 'playlist_count': 10, - }, { - 'url': 'https://www.microsoftvirtualacademy.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788', - 'only_matching': True, - }, { - 'url': 'mva:course:11788', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return False if MicrosoftVirtualAcademyIE.suitable(url) else super( - MicrosoftVirtualAcademyCourseIE, cls).suitable(url) - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - course_id = mobj.group('id') - display_id = mobj.group('display_id') - - base_url = self._extract_base_url(course_id, display_id) - - manifest = self._download_json( - '%s/imsmanifestlite.json' % base_url, - display_id, 'Downloading course manifest JSON')['manifest'] - - organization = manifest['organizations']['organization'][0] - - entries = [] - for chapter in organization['item']: - chapter_number, chapter_title = self._extract_chapter_and_title(chapter.get('title')) - chapter_id = chapter.get('@identifier') - for item in chapter.get('item', []): - item_id = item.get('@identifier') - if not item_id: - continue - metadata = item.get('resource', {}).get('metadata') or {} - if metadata.get('learningresourcetype') != 'Video': - continue - _, title = self._extract_chapter_and_title(item.get('title')) - duration = parse_duration(metadata.get('duration')) - description = metadata.get('description') - entries.append({ - '_type': 'url_transparent', - 'url': smuggle_url( - 'mva:%s:%s' % (course_id, item_id), {'base_url': 
base_url}), - 'title': title, - 'description': description, - 'duration': duration, - 'chapter': chapter_title, - 'chapter_number': chapter_number, - 'chapter_id': chapter_id, - }) - - title = organization.get('title') or manifest.get('metadata', {}).get('title') - - return self.playlist_result(entries, course_id, title) From d8930151f12a04553f8eef50056d6cab8daf771e Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Sat, 10 Feb 2024 01:51:57 -0500 Subject: [PATCH 2/6] series playlist --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/microsoftembed.py | 137 ++++++++++++++++++++--------- 2 files changed, 97 insertions(+), 41 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 113609a6b..9ee598b8f 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1078,6 +1078,7 @@ from .microsoftembed import ( MicrosoftEmbedIE, MicrosoftMediusIE, MicrosoftLearnIE, + MicrosoftBuildIE, ) from .mildom import ( MildomIE, diff --git a/yt_dlp/extractor/microsoftembed.py b/yt_dlp/extractor/microsoftembed.py index d03578c74..5e640d0fb 100644 --- a/yt_dlp/extractor/microsoftembed.py +++ b/yt_dlp/extractor/microsoftembed.py @@ -149,10 +149,7 @@ class MicrosoftMediusIE(MicrosoftMediusBaseIE): class MicrosoftLearnIE(MicrosoftMediusBaseIE): - _VALID_URL = [ - r'https://learn\.microsoft\.com/[\w\-]+/(events|shows)/[\w\-]+/(?P<id>[^?#/]+)', - r'https://build\.microsoft\.com/[\w\-]+/sessions/(?P<id>[0-9a-f\-]+)', - ] + _VALID_URL = r'https?://learn\.microsoft\.com/(?:[\w\-]+/)?(?P<type>events|shows)/(?P<series>[\w\-]+)(?:/(?P<id>[^?#/]+))?' 
_TESTS = [{ 'url': 'https://learn.microsoft.com/en-us/events/build-2022/ts01-rapidly-code-test-ship-from-secure-cloud-developer-environments', @@ -165,6 +162,14 @@ class MicrosoftLearnIE(MicrosoftMediusBaseIE): 'upload_date': '20220524', 'thumbnail': r're:https://mediusimg\.event\.microsoft\.com/video-\d+/thumbnail\.jpg.*', }, + }, { + 'url': 'https://learn.microsoft.com/en-us/events/build-2022', + 'info_dict': { + 'id': 'build-2022', + 'title': 'Microsoft Build 2022 - Events', + 'description': 'md5:c16b43848027df837b22c6fbac7648d3', + }, + 'playlist_count': 201, }, { 'url': 'https://learn.microsoft.com/en-us/shows/bash-for-beginners/what-is-the-difference-between-a-terminal-and-a-shell-2-of-20-bash-for-beginners/', 'info_dict': { @@ -174,24 +179,34 @@ class MicrosoftLearnIE(MicrosoftMediusBaseIE): 'description': 'md5:7bbbfb593d21c2cf2babc3715ade6b88', 'timestamp': 1676339547, 'upload_date': '20230214', + 'thumbnail': r're:https://learn\.microsoft\.com/video/media/.*\.png', 'subtitles': 'count:14', }, 'params': {'listsubtitles': True}, }, { - 'url': 'https://build.microsoft.com/en-US/sessions/49e81029-20f0-485b-b641-73b7f9622656?source=sessions', + 'url': 'https://learn.microsoft.com/en-us/shows/bash-for-beginners', 'info_dict': { - 'id': '81215af5-c813-4dcd-aede-94f4e1a7daa3', - 'ext': 'ismv', - 'title': 'Microsoft Build: Highlights from 2023', - 'description': 'md5:24fb8410b48256bb42dfca37eb936583', - 'timestamp': 1684857600, - 'upload_date': '20230523', - 'thumbnail': r're:https://mediusimg\.event\.microsoft\.com/video-\d+/thumbnail\.jpg.*', + 'id': 'bash-for-beginners', + 'title': 'Bash for Beginners', + 'description': 'md5:16a91c07222117d1e00912f0dbc02c2c', }, + 'playlist_count': 20, }] + def _entries(self, url_base, video_id): + skip = 0 + while True: + playlist_info = self._download_json(f'{url_base}&$skip={skip}', video_id, f'Downloading entries {skip}') + items = traverse_obj(playlist_info, ( + 'results', ..., 'url', {lambda x: 
self.url_result(f'https://learn.microsoft.com/en-us{x}')})) + yield from items + skip += len(items) + if skip >= playlist_info['count'] or not items: + break + def _real_extract(self, url): - video_id = self._match_id(url) + video_type, series, slug = self._match_valid_url(url).groups() + video_id = slug or series webpage = self._download_webpage(url, video_id) metainfo = { @@ -199,34 +214,74 @@ class MicrosoftLearnIE(MicrosoftMediusBaseIE): 'description': self._og_search_description(webpage), } - video_url = self._search_regex( - r'<meta\s+name="externalVideoUrl"\s+content="([^"]+)"', webpage, 'videoUrl', default=None) - if video_url: - return self.url_result(video_url, url_transparent=True, **metainfo, **{ - 'timestamp': parse_iso8601(self._search_regex( - r'<meta\s+name="startDate"\s+content="([^"]+)"', webpage, 'date', default=None)), - }) + if slug: + if video_type == 'events': + return self.url_result( + self._search_regex(r'<meta\s+name="externalVideoUrl"\s+content="([^"]+)"', webpage, 'videoUrl'), url_transparent=True, **metainfo, **{ + 'timestamp': parse_iso8601(self._search_regex( + r'<meta\s+name="startDate"\s+content="([^"]+)"', webpage, 'date', default=None)), + }) + else: + entry_id = self._search_regex(r'<meta name="entryId" content="([^"]+)"', webpage, 'entryId') + video_info = self._download_json( + f'https://learn.microsoft.com/api/video/public/v1/entries/{entry_id}', video_id) + return { + 'id': entry_id, + 'formats': self._extract_ism(video_info['publicVideo']['adaptiveVideoUrl'], video_id), + 'subtitles': self._sub_to_dict(traverse_obj(video_info, ('publicVideo', 'captions', ..., { + 'tag': ('language', {str}), + 'url': ('url', {url_or_none}), + }))), + **metainfo, + **traverse_obj(video_info, { + 'timestamp': ('createTime', {parse_iso8601}), + 'thumbnails': ('publicVideo', 'thumbnailOtherSizes', ..., {lambda x: {'url': x}}), + }), + } + else: + url_base = f'https://learn.microsoft.com/api/contentbrowser/search/{video_type}/{series}/{"sessions" 
if video_type == "events" else "episodes"}?locale=en-us' + return self.playlist_result(self._entries(url_base, video_id), video_id, **metainfo) - entry_id = self._search_regex( - r'<meta name="entryId" content="([^"]+)"', webpage, 'entryId', default=None) - if entry_id: - video_info = self._download_json( - f'https://learn.microsoft.com/api/video/public/v1/entries/{entry_id}', video_id) - return { - 'id': entry_id, - 'formats': self._extract_ism(video_info['publicVideo']['adaptiveVideoUrl'], video_id), - 'subtitles': self._sub_to_dict(traverse_obj(video_info, ('publicVideo', 'captions', ..., { - 'tag': ('language', {str}), - 'url': ('url', {url_or_none}), - }))), - 'timestamp': traverse_obj(video_info, ('createTime', {parse_iso8601})), - **metainfo, - } +class MicrosoftBuildIE(MicrosoftMediusBaseIE): + _VALID_URL = [ + r'https?://build\.microsoft\.com/[\w\-]+/sessions/(?P<id>[0-9a-f\-]+)', + r'https?://build\.microsoft\.com/[\w\-]+/(?P<id>sessions)/?(?:[?#]|$)', + ] - if self._og_search_url(webpage) == 'https://build.microsoft.com': - video_info = self._download_json( - f'https://api.build.microsoft.com/api/session/en-US-{video_id}', video_id) - return self.url_result(video_info['onDemand'], url_transparent=True, **metainfo, **{ - 'timestamp': traverse_obj(video_info, ('startDateTime', {parse_iso8601})), - }) + _TESTS = [{ + 'url': 'https://build.microsoft.com/en-US/sessions/49e81029-20f0-485b-b641-73b7f9622656?source=sessions', + 'info_dict': { + 'id': '81215af5-c813-4dcd-aede-94f4e1a7daa3', + 'ext': 'ismv', + 'title': 'Microsoft Build opening', + 'description': 'md5:756ab1fb60bdc6923d627803694e9cc5', + 'timestamp': 1684857600, + 'upload_date': '20230523', + 'thumbnail': r're:https://mediusimg\.event\.microsoft\.com/video-\d+/thumbnail\.jpg.*', + }, + }, { + 'url': 'https://build.microsoft.com/en-US/sessions', + 'info_dict': { + 'id': 'sessions', + }, + 'playlist_mincount': 418, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + entries = 
[ + self.url_result(video_info['onDemand'], url_transparent=True, **traverse_obj(video_info, { + 'id': ('sessionId', {str}), + 'title': ('title', {str}), + 'description': ('description', {str}), + 'timestamp': ('startDateTime', {parse_iso8601}), + })) + for video_info in self._download_json( + 'https://api.build.microsoft.com/api/session/all/en-US', video_id, 'Downloading video info') + ] + if video_id == 'sessions': + return self.playlist_result(entries, video_id) + else: + return traverse_obj(entries, (lambda _, v: v['id'] == video_id), get_all=False) From ef958d16a62bb29cb75d26a261eedcfd42e93157 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Sat, 10 Feb 2024 11:47:23 -0500 Subject: [PATCH 3/6] Change to separate query Co-authored-by: Simon Sawicki <accounts@grub4k.xyz> --- yt_dlp/extractor/microsoftembed.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/microsoftembed.py b/yt_dlp/extractor/microsoftembed.py index 5e640d0fb..502fcf8e5 100644 --- a/yt_dlp/extractor/microsoftembed.py +++ b/yt_dlp/extractor/microsoftembed.py @@ -196,7 +196,10 @@ class MicrosoftLearnIE(MicrosoftMediusBaseIE): def _entries(self, url_base, video_id): skip = 0 while True: - playlist_info = self._download_json(f'{url_base}&$skip={skip}', video_id, f'Downloading entries {skip}') + playlist_info = self._download_json(url_base, video_id, f'Downloading entries {skip}', query={ + 'locale': 'en-us', + '$skip': skip, + }) items = traverse_obj(playlist_info, ( 'results', ..., 'url', {lambda x: self.url_result(f'https://learn.microsoft.com/en-us{x}')})) yield from items @@ -239,7 +242,7 @@ class MicrosoftLearnIE(MicrosoftMediusBaseIE): }), } else: - url_base = f'https://learn.microsoft.com/api/contentbrowser/search/{video_type}/{series}/{"sessions" if video_type == "events" else "episodes"}?locale=en-us' + url_base = f'https://learn.microsoft.com/api/contentbrowser/search/{video_type}/{series}/{"sessions" if 
video_type == "events" else "episodes"}' return self.playlist_result(self._entries(url_base, video_id), video_id, **metainfo) From 63654ea6f9d315a0c560a036e6afead39adc18b6 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Sat, 10 Feb 2024 11:51:28 -0500 Subject: [PATCH 4/6] use early exit --- yt_dlp/extractor/microsoftembed.py | 50 +++++++++++++++--------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/yt_dlp/extractor/microsoftembed.py b/yt_dlp/extractor/microsoftembed.py index 502fcf8e5..71b5fd43f 100644 --- a/yt_dlp/extractor/microsoftembed.py +++ b/yt_dlp/extractor/microsoftembed.py @@ -217,34 +217,34 @@ class MicrosoftLearnIE(MicrosoftMediusBaseIE): 'description': self._og_search_description(webpage), } - if slug: - if video_type == 'events': - return self.url_result( - self._search_regex(r'<meta\s+name="externalVideoUrl"\s+content="([^"]+)"', webpage, 'videoUrl'), url_transparent=True, **metainfo, **{ - 'timestamp': parse_iso8601(self._search_regex( - r'<meta\s+name="startDate"\s+content="([^"]+)"', webpage, 'date', default=None)), - }) - else: - entry_id = self._search_regex(r'<meta name="entryId" content="([^"]+)"', webpage, 'entryId') - video_info = self._download_json( - f'https://learn.microsoft.com/api/video/public/v1/entries/{entry_id}', video_id) - return { - 'id': entry_id, - 'formats': self._extract_ism(video_info['publicVideo']['adaptiveVideoUrl'], video_id), - 'subtitles': self._sub_to_dict(traverse_obj(video_info, ('publicVideo', 'captions', ..., { - 'tag': ('language', {str}), - 'url': ('url', {url_or_none}), - }))), - **metainfo, - **traverse_obj(video_info, { - 'timestamp': ('createTime', {parse_iso8601}), - 'thumbnails': ('publicVideo', 'thumbnailOtherSizes', ..., {lambda x: {'url': x}}), - }), - } - else: + if not slug: url_base = f'https://learn.microsoft.com/api/contentbrowser/search/{video_type}/{series}/{"sessions" if video_type == "events" else "episodes"}' return 
self.playlist_result(self._entries(url_base, video_id), video_id, **metainfo) + if video_type == 'events': + return self.url_result( + self._search_regex(r'<meta\s+name="externalVideoUrl"\s+content="([^"]+)"', webpage, 'videoUrl'), url_transparent=True, **metainfo, **{ + 'timestamp': parse_iso8601(self._search_regex( + r'<meta\s+name="startDate"\s+content="([^"]+)"', webpage, 'date', default=None)), + }) + + entry_id = self._search_regex(r'<meta name="entryId" content="([^"]+)"', webpage, 'entryId') + video_info = self._download_json( + f'https://learn.microsoft.com/api/video/public/v1/entries/{entry_id}', video_id) + return { + 'id': entry_id, + 'formats': self._extract_ism(video_info['publicVideo']['adaptiveVideoUrl'], video_id), + 'subtitles': self._sub_to_dict(traverse_obj(video_info, ('publicVideo', 'captions', ..., { + 'tag': ('language', {str}), + 'url': ('url', {url_or_none}), + }))), + **metainfo, + **traverse_obj(video_info, { + 'timestamp': ('createTime', {parse_iso8601}), + 'thumbnails': ('publicVideo', 'thumbnailOtherSizes', ..., {lambda x: {'url': x}}), + }), + } + class MicrosoftBuildIE(MicrosoftMediusBaseIE): _VALID_URL = [ From 9c04204efae34b39ba463b9d353e3fc0bc4e33c4 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Sun, 11 Feb 2024 12:40:22 -0500 Subject: [PATCH 5/6] Update yt_dlp/extractor/microsoftembed.py Co-authored-by: Simon Sawicki <accounts@grub4k.xyz> --- yt_dlp/extractor/microsoftembed.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/microsoftembed.py b/yt_dlp/extractor/microsoftembed.py index 71b5fd43f..e2a19d6c1 100644 --- a/yt_dlp/extractor/microsoftembed.py +++ b/yt_dlp/extractor/microsoftembed.py @@ -74,10 +74,11 @@ class MicrosoftEmbedIE(InfoExtractor): class MicrosoftMediusBaseIE(InfoExtractor): - def _sub_to_dict(self, subtitile_list): + @staticmethod + def _sub_to_dict(self, subtitle_list): subtitles = {} - for sub in subtitile_list: - 
subtitles.setdefault(sub['tag'], []).append({k: v for k, v in sub.items() if k != 'tag'}) + for sub in subtitle_list: + subtitles.setdefault(sub.pop('tag', None) or 'unknown', []).append(sub) return subtitles def _extract_ism(self, ism_url, video_id): From 504c935cde368571552974d1ec4d164808b057ef Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Sun, 25 Feb 2024 23:02:48 -0500 Subject: [PATCH 6/6] cleanup --- yt_dlp/extractor/microsoftembed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/microsoftembed.py b/yt_dlp/extractor/microsoftembed.py index e2a19d6c1..63bc51ea0 100644 --- a/yt_dlp/extractor/microsoftembed.py +++ b/yt_dlp/extractor/microsoftembed.py @@ -75,7 +75,7 @@ class MicrosoftEmbedIE(InfoExtractor): class MicrosoftMediusBaseIE(InfoExtractor): @staticmethod - def _sub_to_dict(self, subtitle_list): + def _sub_to_dict(subtitle_list): subtitles = {} for sub in subtitle_list: subtitles.setdefault(sub.pop('tag', None) or 'unknown', []).append(sub)