From ed4d9a40c145974abb62b36dabd5e4ab3db6aee9 Mon Sep 17 00:00:00 2001
From: kclauhk <78251477+kclauhk@users.noreply.github.com>
Date: Tue, 29 Oct 2024 22:52:24 +0800
Subject: [PATCH] [ie/mediasite] Extract transcripts

---
Note: the hunks were edited by hand after generation; the post-image blob id
on the "index" line below was not recomputed.

 yt_dlp/extractor/mediasite.py | 35 ++++++++++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/yt_dlp/extractor/mediasite.py b/yt_dlp/extractor/mediasite.py
index ad7ab27e28..a7bbaa5344 100644
--- a/yt_dlp/extractor/mediasite.py
+++ b/yt_dlp/extractor/mediasite.py
@@ -5,6 +5,7 @@
 from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
+    determine_ext,
     float_or_none,
     mimetype2ext,
     smuggle_url,
@@ -268,7 +269,37 @@ def _real_extract(self, url):
             formats.extend(stream_formats)
 
         # XXX: Presentation['Presenters']
-        # XXX: Presentation['Transcript']
+        # Caption tracks listed in Presentation['Transcripts']; tracks whose
+        # language name contains 'Auto-Generated' go to automatic captions
+        transcripts = presentation.get('Transcripts') or []
+        captions, subtitles = {}, {}
+        for transcript in transcripts:
+            captions_url = traverse_obj(transcript, ('CaptionsUrl', {str}))
+            if not captions_url:
+                # An entry without a URL cannot be downloaded
+                continue
+            lang_code = traverse_obj(
+                transcript, (('DetailedLanguageCode', 'LanguageCode'), {str}), get_all=False) or 'und'
+            lang_name = traverse_obj(transcript, ('Language', {str}))
+            t = {
+                'url': captions_url,
+                'name': lang_name,
+            }
+            # 'Language' may be absent, so never test membership against None
+            if 'Auto-Generated' in (lang_name or ''):
+                captions.setdefault(lang_code, []).append(t)
+            else:
+                subtitles.setdefault(lang_code, []).append(t)
+        transcript_url = presentation.get('TranscriptUrl')
+        # Transcript files with a 'txt' extension are not offered as subtitles
+        if transcript_url and determine_ext(transcript_url) != 'txt':
+            if len(transcripts) == 1 and captions:
+                # The single listed transcript is auto-generated, so lang_code and
+                # lang_name left over from the loop describe this file as well
+                captions.setdefault(lang_code, []).append({
+                    'url': transcript_url,
+                    'name': lang_name,
+                })
+            else:
+                subtitles.setdefault('und', []).append({'url': transcript_url})
 
         return {
             'id': resource_id,
@@ -277,6 +308,8 @@ def _real_extract(self, url):
             'duration': float_or_none(presentation.get('Duration'), 1000),
             'timestamp': float_or_none(presentation.get('UnixTime'), 1000),
             'formats': formats,
+            'automatic_captions': captions,
+            'subtitles': subtitles,
             'thumbnails': thumbnails,
         }
 