From 9bd85019931927a99b0fe0dc58ac51acca9fbe72 Mon Sep 17 00:00:00 2001 From: Haxy Date: Thu, 20 Jun 2024 22:54:53 +0100 Subject: [PATCH] [ie/youtube] Extract all formats from multi-language m3u8s (#9875) Authored by: clienthax, bashonly Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com> --- yt_dlp/extractor/common.py | 5 +++++ yt_dlp/extractor/youtube.py | 24 +++++++++++++++--------- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index e5efd08b4f..f63bd78258 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -2222,6 +2222,11 @@ def build_stream_name(): 'quality': quality, 'has_drm': has_drm, } + + # YouTube-specific + if yt_audio_content_id := last_stream_inf.get('YT-EXT-AUDIO-CONTENT-ID'): + f['language'] = yt_audio_content_id.split('.')[0] + resolution = last_stream_inf.get('RESOLUTION') if resolution: mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index a89744eb10..ab6201dae6 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3797,6 +3797,8 @@ def _needs_live_processing(self, live_status, duration): def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, live_status, duration): CHUNK_SIZE = 10 << 20 + PREFERRED_LANG_VALUE = 10 + original_language = None itags, stream_ids = collections.defaultdict(set), [] itag_qualities, res_qualities = {}, {0: None} q = qualities([ @@ -3894,10 +3896,12 @@ def build_fragments(f): throttled = True tbr = float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) - language_preference = ( - 10 if audio_track.get('audioIsDefault') and 10 - else -10 if 'descriptive' in (audio_track.get('displayName') or '').lower() and -10 - else -1) + is_default = audio_track.get('audioIsDefault') + is_descriptive = 'descriptive' in (audio_track.get('displayName') or '').lower() + language_code = 
audio_track.get('id', '').split('.')[0] + if language_code and is_default: + original_language = language_code + format_duration = traverse_obj(fmt, ('approxDurationMs', {lambda x: float_or_none(x, 1000)})) # Some formats may have much smaller duration than others (possibly damaged during encoding) # E.g. 2-nOtRESiUc Ref: https://github.com/yt-dlp/yt-dlp/issues/2823 @@ -3924,8 +3928,7 @@ def build_fragments(f): 'filesize': int_or_none(fmt.get('contentLength')), 'format_id': f'{itag}{"-drc" if fmt.get("isDrc") else ""}', 'format_note': join_nonempty( - join_nonempty(audio_track.get('displayName'), - language_preference > 0 and ' (default)', delim=''), + join_nonempty(audio_track.get('displayName'), is_default and ' (default)', delim=''), name, fmt.get('isDrc') and 'DRC', try_get(fmt, lambda x: x['projectionType'].replace('RECTANGULAR', '').lower()), try_get(fmt, lambda x: x['spatialAudioType'].replace('SPATIAL_AUDIO_TYPE_', '').lower()), @@ -3944,9 +3947,8 @@ def build_fragments(f): 'filesize_approx': filesize_from_tbr(tbr, format_duration), 'url': fmt_url, 'width': int_or_none(fmt.get('width')), - 'language': join_nonempty(audio_track.get('id', '').split('.')[0], - 'desc' if language_preference < -1 else '') or None, - 'language_preference': language_preference, + 'language': join_nonempty(language_code, 'desc' if is_descriptive else '') or None, + 'language_preference': PREFERRED_LANG_VALUE if is_default else -10 if is_descriptive else -1, # Strictly de-prioritize broken, damaged and 3gp formats 'preference': -20 if is_broken else -10 if is_damaged else -2 if itag == '17' else None, } @@ -4007,6 +4009,10 @@ def process_manifest_format(f, proto, client_name, itag): elif itag: f['format_id'] = itag + if original_language and f.get('language') == original_language: + f['format_note'] = join_nonempty(f.get('format_note'), '(default)', delim=' ') + f['language_preference'] = PREFERRED_LANG_VALUE + if f.get('source_preference') is None: f['source_preference'] = -1