[extractor/youtube] Support changing extraction language (#4470)

Adds the `--extractor-args youtube:lang=<supported lang code>` extractor argument, which makes all YouTube extractors prefer translated fields (e.g. title and description) in that language when they are available. See the README or the error message for the list of supported language codes.

Closes https://github.com/yt-dlp/yt-dlp/issues/387

Authored by: coletdjnz
This commit is contained in:
coletdjnz 2022-09-09 05:16:46 +00:00 committed by GitHub
parent 0c0b78b273
commit c26f9b991a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 286 additions and 75 deletions

View file

@ -1705,6 +1705,8 @@ #### youtube
* E.g. `all,all,1000,10` will get a maximum of 1000 replies total, with up to 10 replies per thread. `1000,all,100` will get a maximum of 1000 comments, with a maximum of 100 replies total
* `innertube_host`: Innertube API host to use for all API requests; e.g. `studio.youtube.com`, `youtubei.googleapis.com`. Note that cookies exported from one subdomain will not work on others
* `innertube_key`: Innertube API key to use for all API requests
* `lang`: A supported content language code (case-sensitive). Translated metadata (e.g. title and description) in this language is preferred when available. By default, the video's primary-language metadata is preferred, with a fallback to the `en` translation.
* See youtube.py for the list of supported content language codes.
#### youtubetab (YouTube playlists, channels, feeds, etc.)
* `skip`: One or more of `webpage` (skip initial webpage download), `authcheck` (allow the download of playlists requiring authentication when no initial webpage is downloaded. This may cause unwanted behavior, see [#1122](https://github.com/yt-dlp/yt-dlp/pull/1122) for more details)

View file

@ -2,6 +2,7 @@
import calendar
import copy
import datetime
import enum
import hashlib
import itertools
import json
@ -275,6 +276,15 @@ def build_innertube_clients():
build_innertube_clients()
class BadgeType(enum.Enum):
    """Kinds of badge that `_extract_badges` can report for a renderer."""

    # Availability of the item the badge is attached to
    AVAILABILITY_UNLISTED = enum.auto()
    AVAILABILITY_PRIVATE = enum.auto()
    AVAILABILITY_PUBLIC = enum.auto()
    AVAILABILITY_PREMIUM = enum.auto()
    AVAILABILITY_SUBSCRIPTION = enum.auto()  # reported for "members only" badges

    # Live status
    LIVE_NOW = enum.auto()
class YoutubeBaseInfoExtractor(InfoExtractor):
"""Provide base functions for Youtube extractors"""
@ -367,6 +377,36 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
r'(?:www\.)?piped\.privacy\.com\.de',
)
# Extracted from the account/account_menu endpoint.
# XXX: These are the languages supported by the YouTube UI and API,
# which differ slightly from the languages supported for translation in YouTube Studio.
_SUPPORTED_LANG_CODES = [
'af', 'az', 'id', 'ms', 'bs', 'ca', 'cs', 'da', 'de', 'et', 'en-IN', 'en-GB', 'en', 'es',
'es-419', 'es-US', 'eu', 'fil', 'fr', 'fr-CA', 'gl', 'hr', 'zu', 'is', 'it', 'sw', 'lv',
'lt', 'hu', 'nl', 'no', 'uz', 'pl', 'pt-PT', 'pt', 'ro', 'sq', 'sk', 'sl', 'sr-Latn', 'fi',
'sv', 'vi', 'tr', 'be', 'bg', 'ky', 'kk', 'mk', 'mn', 'ru', 'sr', 'uk', 'el', 'hy', 'iw',
'ur', 'ar', 'fa', 'ne', 'mr', 'hi', 'as', 'bn', 'pa', 'gu', 'or', 'ta', 'te', 'kn', 'ml',
'si', 'th', 'lo', 'my', 'ka', 'am', 'km', 'zh-CN', 'zh-TW', 'zh-HK', 'ja', 'ko'
]
@functools.cached_property
def _preferred_lang(self):
    """
    Language code requested through the `lang` extractor argument of the Youtube extractor.

    Evaluates to None when the argument is unset or empty.
    Raises an (expected) ExtractorError when the code is not listed in _SUPPORTED_LANG_CODES.
    A warning is reported for every accepted code other than "en".
    """
    lang_values = self._configuration_arg('lang', ie_key='Youtube', casesense=True, default=[''])
    lang_code = lang_values[0]
    if not lang_code:
        return None

    if lang_code in self._SUPPORTED_LANG_CODES:
        if lang_code != 'en':
            self.report_warning(
                f'Preferring "{lang_code}" translated fields. Note that some metadata extraction may fail or be incorrect.')
        return lang_code

    # The lookup is case-sensitive, so list every accepted spelling in the error
    supported_codes = join_nonempty(*self._SUPPORTED_LANG_CODES, delim=", ")
    raise ExtractorError(
        f'Unsupported language code: {lang_code}. Supported language codes (case-sensitive): {supported_codes}.',
        expected=True)
def _initialize_consent(self):
cookies = self._get_cookies('https://www.youtube.com/')
if cookies.get('__Secure-3PSID'):
@ -391,7 +431,7 @@ def _initialize_pref(self):
pref = dict(urllib.parse.parse_qsl(pref_cookie.value))
except ValueError:
self.report_warning('Failed to parse user PREF cookie' + bug_reports_message())
pref.update({'hl': 'en', 'tz': 'UTC'})
pref.update({'hl': self._preferred_lang or 'en', 'tz': 'UTC'})
self._set_cookie('.youtube.com', name='PREF', value=urllib.parse.urlencode(pref))
def _real_initialize(self):
@ -439,7 +479,7 @@ def _extract_context(self, ytcfg=None, default_client='web'):
(ytcfg, self._get_default_ytcfg(default_client)), 'INNERTUBE_CONTEXT', expected_type=dict)
# Enforce language and tz for extraction
client_context = traverse_obj(context, 'client', expected_type=dict, default={})
client_context.update({'hl': 'en', 'timeZone': 'UTC', 'utcOffsetMinutes': 0})
client_context.update({'hl': self._preferred_lang or 'en', 'timeZone': 'UTC', 'utcOffsetMinutes': 0})
return context
_SAPISID = None
@ -678,13 +718,49 @@ def _extract_and_report_alerts(self, data, *args, **kwargs):
return self._report_alerts(self._extract_alerts(data), *args, **kwargs)
def _extract_badges(self, renderer: dict):
    """
    Collect the badges attached to a renderer.

    Each `metadataBadgeRenderer` under `renderer['badges']` is classified, in order of
    preference, by its icon type, then by its style, and finally by its label text.

    @param renderer: renderer dict that may contain a `badges` list
    @returns         list of dicts of the form {'type': BadgeType}, one per recognised badge
    """
    privacy_icon_map = {
        'PRIVACY_UNLISTED': BadgeType.AVAILABILITY_UNLISTED,
        'PRIVACY_PRIVATE': BadgeType.AVAILABILITY_PRIVATE,
        'PRIVACY_PUBLIC': BadgeType.AVAILABILITY_PUBLIC
    }

    badge_style_map = {
        'BADGE_STYLE_TYPE_MEMBERS_ONLY': BadgeType.AVAILABILITY_SUBSCRIPTION,
        'BADGE_STYLE_TYPE_PREMIUM': BadgeType.AVAILABILITY_PREMIUM,
        'BADGE_STYLE_TYPE_LIVE_NOW': BadgeType.LIVE_NOW
    }

    label_map = {
        'unlisted': BadgeType.AVAILABILITY_UNLISTED,
        'private': BadgeType.AVAILABILITY_PRIVATE,
        'members only': BadgeType.AVAILABILITY_SUBSCRIPTION,
        'live': BadgeType.LIVE_NOW,
        'premium': BadgeType.AVAILABILITY_PREMIUM
    }

    badges = []
    for badge in traverse_obj(renderer, ('badges', ..., 'metadataBadgeRenderer'), default=[]):
        badge_type = (
            privacy_icon_map.get(traverse_obj(badge, ('icon', 'iconType'), expected_type=str))
            or badge_style_map.get(traverse_obj(badge, 'style'))
        )
        if badge_type:
            badges.append({'type': badge_type})
            continue

        # fallback, won't work in some languages
        label = traverse_obj(badge, 'label', expected_type=str, default='').lower()
        for match, label_badge_type in label_map.items():
            if match in label:
                # Record the type matched by the label (badge_type is always falsy here)
                badges.append({'type': label_badge_type})
                # One badge yields at most one entry; stop at the first matching label
                break

    return badges
@staticmethod
def _has_badge(badges, badge_type):
    """Tell whether any entry of `badges` has its 'type' equal to `badge_type`."""
    def is_requested_type(_, badge):
        return badge['type'] == badge_type

    matching_badges = traverse_obj(badges, is_requested_type)
    return bool(matching_badges)
@staticmethod
def _get_text(data, *path_list, max_runs=None):
for path in path_list or [None]:
@ -755,9 +831,9 @@ def extract_relative_time(relative_time_text):
except ValueError:
return None
def _extract_time_text(self, renderer, *path_list):
"""@returns (timestamp, time_text)"""
text = self._get_text(renderer, *path_list) or ''
def _parse_time_text(self, text):
if not text:
return
dt = self.extract_relative_time(text)
timestamp = None
if isinstance(dt, datetime.datetime):
@ -770,9 +846,10 @@ def _extract_time_text(self, renderer, *path_list):
(r'([a-z]+\s*\d{1,2},?\s*20\d{2})', r'(?:.+|^)(?:live|premieres|ed|ing)(?:\s*(?:on|for))?\s*(.+\d)'),
text.lower(), 'time text', default=None)))
if text and timestamp is None:
self.report_warning(f"Cannot parse localized time text '{text}'" + bug_reports_message(), only_once=True)
return timestamp, text
if text and timestamp is None and self._preferred_lang in (None, 'en'):
self.report_warning(
f'Cannot parse localized time text "{text}"', only_once=True)
return timestamp
def _extract_response(self, item_id, query, note='Downloading API JSON', headers=None,
ytcfg=None, check_get_keys=None, ep='browse', fatal=True, api_hostname=None,
@ -848,7 +925,7 @@ def _extract_video(self, renderer):
channel_id = traverse_obj(
renderer, ('shortBylineText', 'runs', ..., 'navigationEndpoint', 'browseEndpoint', 'browseId'),
expected_type=str, get_all=False)
timestamp, time_text = self._extract_time_text(renderer, 'publishedTimeText')
time_text = self._get_text(renderer, 'publishedTimeText') or ''
scheduled_timestamp = str_to_int(traverse_obj(renderer, ('upcomingEventData', 'startTime'), get_all=False))
overlay_style = traverse_obj(
renderer, ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'style'),
@ -874,15 +951,21 @@ def _extract_video(self, renderer):
'uploader': uploader,
'channel_id': channel_id,
'thumbnails': thumbnails,
'upload_date': (strftime_or_none(timestamp, '%Y%m%d')
'upload_date': (strftime_or_none(self._parse_time_text(time_text), '%Y%m%d')
if self._configuration_arg('approximate_date', ie_key='youtubetab')
else None),
'live_status': ('is_upcoming' if scheduled_timestamp is not None
else 'was_live' if 'streamed' in time_text.lower()
else 'is_live' if overlay_style == 'LIVE' or 'live now' in badges
else 'is_live' if overlay_style == 'LIVE' or self._has_badge(badges, BadgeType.LIVE_NOW)
else None),
'release_timestamp': scheduled_timestamp,
'availability': self._availability(needs_premium='premium' in badges, needs_subscription='members only' in badges)
'availability':
'public' if self._has_badge(badges, BadgeType.AVAILABILITY_PUBLIC)
else self._availability(
is_private=self._has_badge(badges, BadgeType.AVAILABILITY_PRIVATE) or None,
needs_premium=self._has_badge(badges, BadgeType.AVAILABILITY_PREMIUM) or None,
needs_subscription=self._has_badge(badges, BadgeType.AVAILABILITY_SUBSCRIPTION) or None,
is_unlisted=self._has_badge(badges, BadgeType.AVAILABILITY_UNLISTED) or None)
}
@ -2306,6 +2389,61 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'tags': [],
'uploader_url': 'http://www.youtube.com/user/nao20010128nao',
}
}, {
# Prefer primary title+description language metadata by default
# Do not prefer translated description if primary is empty
'url': 'https://www.youtube.com/watch?v=el3E4MbxRqQ',
'info_dict': {
'id': 'el3E4MbxRqQ',
'ext': 'mp4',
'title': 'dlp test video 2 - primary sv no desc',
'description': '',
'channel': 'cole-dlp-test-acc',
'tags': [],
'view_count': int,
'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA',
'like_count': int,
'playable_in_embed': True,
'availability': 'unlisted',
'thumbnail': 'https://i.ytimg.com/vi_webp/el3E4MbxRqQ/maxresdefault.webp',
'age_limit': 0,
'duration': 5,
'uploader_id': 'UCiu-3thuViMebBjw_5nWYrA',
'uploader_url': 'http://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA',
'live_status': 'not_live',
'upload_date': '20220908',
'categories': ['People & Blogs'],
'uploader': 'cole-dlp-test-acc',
'channel_id': 'UCiu-3thuViMebBjw_5nWYrA',
},
'params': {'skip_download': True}
}, {
# Extractor argument: prefer translated title+description
'url': 'https://www.youtube.com/watch?v=gHKT4uU8Zng',
'info_dict': {
'id': 'gHKT4uU8Zng',
'ext': 'mp4',
'channel': 'cole-dlp-test-acc',
'tags': [],
'duration': 5,
'live_status': 'not_live',
'channel_id': 'UCiu-3thuViMebBjw_5nWYrA',
'upload_date': '20220728',
'uploader_id': 'UCiu-3thuViMebBjw_5nWYrA',
'view_count': int,
'categories': ['People & Blogs'],
'thumbnail': 'https://i.ytimg.com/vi_webp/gHKT4uU8Zng/maxresdefault.webp',
'title': 'dlp test video title translated (fr)',
'availability': 'public',
'uploader': 'cole-dlp-test-acc',
'age_limit': 0,
'description': 'dlp test video description translated (fr)',
'playable_in_embed': True,
'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA',
'uploader_url': 'http://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA',
},
'params': {'skip_download': True, 'extractor_args': {'youtube': {'lang': ['fr']}}},
'expected_warnings': [r'Preferring "fr" translated fields'],
}, {
'note': '6 channel audio',
'url': 'https://www.youtube.com/watch?v=zgdo7-RRjgo',
@ -2907,8 +3045,10 @@ def _extract_comment(self, comment_renderer, parent=None):
text = self._get_text(comment_renderer, 'contentText')
# note: timestamp is an estimate calculated from the current time and time_text
timestamp, time_text = self._extract_time_text(comment_renderer, 'publishedTimeText')
# Timestamp is an estimate calculated from the current time and time_text
time_text = self._get_text(comment_renderer, 'publishedTimeText') or ''
timestamp = self._parse_time_text(time_text)
author = self._get_text(comment_renderer, 'authorText')
author_id = try_get(comment_renderer,
lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], str)
@ -3554,11 +3694,19 @@ def _real_extract(self, url):
microformats = traverse_obj(
player_responses, (..., 'microformat', 'playerMicroformatRenderer'),
expected_type=dict, default=[])
video_title = (
get_first(video_details, 'title')
or self._get_text(microformats, (..., 'title'))
translated_title = self._get_text(microformats, (..., 'title'))
video_title = (self._preferred_lang and translated_title
or get_first(video_details, 'title') # primary
or translated_title
or search_meta(['og:title', 'twitter:title', 'title']))
video_description = get_first(video_details, 'shortDescription')
translated_description = self._get_text(microformats, (..., 'description'))
original_description = get_first(video_details, 'shortDescription')
video_description = (
self._preferred_lang and translated_description
# If original description is blank, it will be an empty string.
# Do not prefer translated description in this case.
or original_description if original_description is not None else translated_description)
multifeed_metadata_list = get_first(
player_responses,
@ -3988,7 +4136,8 @@ def process_language(container, base_url, lang_code, sub_name, query):
and info.get('live_status') != 'is_upcoming'
and 'no-youtube-prefer-utc-upload-date' not in self.get_param('compat_opts', [])
):
upload_date = strftime_or_none(self._extract_time_text(vpir, 'dateText')[0], '%Y%m%d') or upload_date
upload_date = strftime_or_none(
self._parse_time_text(self._get_text(vpir, 'dateText')), '%Y%m%d') or upload_date
info['upload_date'] = upload_date
for to, frm in fallbacks.items():
@ -4000,33 +4149,25 @@ def process_language(container, base_url, lang_code, sub_name, query):
if v:
info[d_k] = v
is_private = get_first(video_details, 'isPrivate', expected_type=bool)
is_unlisted = get_first(microformats, 'isUnlisted', expected_type=bool)
is_membersonly = None
is_premium = None
if initial_data and is_private is not None:
is_membersonly = False
is_premium = False
contents = try_get(initial_data, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], list) or []
badge_labels = set()
for content in contents:
if not isinstance(content, dict):
continue
badge_labels.update(self._extract_badges(content.get('videoPrimaryInfoRenderer')))
for badge_label in badge_labels:
if badge_label.lower() == 'members only':
is_membersonly = True
elif badge_label.lower() == 'premium':
is_premium = True
elif badge_label.lower() == 'unlisted':
is_unlisted = True
badges = self._extract_badges(traverse_obj(contents, (..., 'videoPrimaryInfoRenderer'), get_all=False))
info['availability'] = self._availability(
is_private = (self._has_badge(badges, BadgeType.AVAILABILITY_PRIVATE)
or get_first(video_details, 'isPrivate', expected_type=bool))
info['availability'] = (
'public' if self._has_badge(badges, BadgeType.AVAILABILITY_PUBLIC)
else self._availability(
is_private=is_private,
needs_premium=is_premium,
needs_subscription=is_membersonly,
needs_premium=(
self._has_badge(badges, BadgeType.AVAILABILITY_PREMIUM)
or False if initial_data and is_private is not None else None),
needs_subscription=(
self._has_badge(badges, BadgeType.AVAILABILITY_SUBSCRIPTION)
or False if initial_data and is_private is not None else None),
needs_auth=info['age_limit'] >= 18,
is_unlisted=None if is_private is None else is_unlisted)
is_unlisted=None if is_private is None else (
self._has_badge(badges, BadgeType.AVAILABILITY_UNLISTED)
or get_first(microformats, 'isUnlisted', expected_type=bool))))
info['__post_extractor'] = self.extract_comments(master_ytcfg, video_id, contents, webpage)
@ -4472,7 +4613,7 @@ def _get_uncropped(url):
playlist_id = item_id
playlist_stats = traverse_obj(primary_sidebar_renderer, 'stats')
last_updated_unix, _ = self._extract_time_text(playlist_stats, 2)
last_updated_unix = self._parse_time_text(self._get_text(playlist_stats, 2))
if title is None:
title = self._get_text(data, ('header', 'hashtagHeaderRenderer', 'hashtag')) or playlist_id
title += format_field(selected_tab, 'title', ' - %s')
@ -4566,31 +4707,37 @@ def _extract_availability(self, data):
Note: Unless YouTube tells us explicitly, we do not assume it is public
@param data: response
"""
is_private = is_unlisted = None
renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer') or {}
badge_labels = self._extract_badges(renderer)
player_header_privacy = traverse_obj(
data, ('header', 'playlistHeaderRenderer', 'privacy'), expected_type=str)
badges = self._extract_badges(renderer)
# Personal playlists, when authenticated, have a dropdown visibility selector instead of a badge
privacy_dropdown_entries = try_get(
renderer, lambda x: x['privacyForm']['dropdownFormFieldRenderer']['dropdown']['dropdownRenderer']['entries'], list) or []
for renderer_dict in privacy_dropdown_entries:
is_selected = try_get(
renderer_dict, lambda x: x['privacyDropdownItemRenderer']['isSelected'], bool) or False
if not is_selected:
continue
label = self._get_text(renderer_dict, ('privacyDropdownItemRenderer', 'label'))
if label:
badge_labels.add(label.lower())
break
privacy_setting_icon = traverse_obj(
renderer, (
'privacyForm', 'dropdownFormFieldRenderer', 'dropdown', 'dropdownRenderer', 'entries',
lambda _, v: v['privacyDropdownItemRenderer']['isSelected'], 'privacyDropdownItemRenderer', 'icon', 'iconType'),
get_all=False, expected_type=str)
for badge_label in badge_labels:
if badge_label == 'unlisted':
is_unlisted = True
elif badge_label == 'private':
is_private = True
elif badge_label == 'public':
is_unlisted = is_private = False
return self._availability(is_private, False, False, False, is_unlisted)
return (
'public' if (
self._has_badge(badges, BadgeType.AVAILABILITY_PUBLIC)
or player_header_privacy == 'PUBLIC'
or privacy_setting_icon == 'PRIVACY_PUBLIC')
else self._availability(
is_private=(
self._has_badge(badges, BadgeType.AVAILABILITY_PRIVATE)
or player_header_privacy == 'PRIVATE' if player_header_privacy is not None
else privacy_setting_icon == 'PRIVACY_PRIVATE' if privacy_setting_icon is not None else None),
is_unlisted=(
self._has_badge(badges, BadgeType.AVAILABILITY_UNLISTED)
or player_header_privacy == 'UNLISTED' if player_header_privacy is not None
else privacy_setting_icon == 'PRIVACY_UNLISTED' if privacy_setting_icon is not None else None),
needs_subscription=self._has_badge(badges, BadgeType.AVAILABILITY_SUBSCRIPTION) or None,
needs_premium=self._has_badge(badges, BadgeType.AVAILABILITY_PREMIUM) or None,
needs_auth=False))
@staticmethod
def _extract_sidebar_info_renderer(data, info_renderer, expected_type=dict):
@ -4866,6 +5013,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'channel_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
'uploader_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
'channel_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
'availability': 'public',
},
'playlist_count': 1,
}, {
@ -4883,6 +5031,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'channel_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
'channel_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
'uploader_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
'availability': 'public',
},
'playlist_count': 0,
}, {
@ -5029,6 +5178,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'channel_id': 'UCEPzS1rYsrkqzSLNp76nrcg',
'channel_url': 'https://www.youtube.com/c/ChRiStIaAn008',
'channel': 'Christiaan008',
'availability': 'public',
},
'playlist_count': 96,
}, {
@ -5047,6 +5197,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'view_count': int,
'description': '',
'channel_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q',
'availability': 'public',
},
'playlist_mincount': 1123,
'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'],
@ -5070,6 +5221,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'channel': 'Interstellar Movie',
'description': '',
'modified_date': r're:\d{8}',
'availability': 'public',
},
'playlist_mincount': 21,
}, {
@ -5088,6 +5240,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'channel_url': 'https://www.youtube.com/channel/UCTYLiWFZy8xtPwxFwX9rV7Q',
'channel_id': 'UCTYLiWFZy8xtPwxFwX9rV7Q',
'modified_date': r're:\d{8}',
'availability': 'public',
},
'playlist_mincount': 200,
'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'],
@ -5107,6 +5260,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'uploader_url': 'https://www.youtube.com/c/blanktv',
'modified_date': r're:\d{8}',
'description': '',
'availability': 'public',
},
'playlist_mincount': 1000,
'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'],
@ -5125,6 +5279,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'channel_id': 'UC9-y-6csu5WGm29I7JiwpnA',
'channel_url': 'https://www.youtube.com/user/Computerphile',
'channel': 'Computerphile',
'availability': 'public',
},
'playlist_mincount': 11,
}, {
@ -5290,6 +5445,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'channel_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg',
'tags': [],
'channel': 'NoCopyrightSounds',
'availability': 'public',
},
'playlist_mincount': 166,
'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'],
@ -5310,6 +5466,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'modified_date': r're:\d{8}',
'uploader_url': 'https://www.youtube.com/channel/UC9ALqqC4aIeG5iDs7i90Bfw',
'description': '',
'availability': 'public',
},
'expected_warnings': [
'The URL does not have a videos tab',
@ -5410,6 +5567,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'channel': 'Royalty Free Music - Topic',
'view_count': int,
'uploader_url': 'https://www.youtube.com/channel/UC9ALqqC4aIeG5iDs7i90Bfw',
'availability': 'public',
},
'expected_warnings': [
'does not have a videos tab',
@ -5443,6 +5601,45 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'uploader_url': 'https://www.youtube.com/channel/UCKcqXmCcyqnhgpA5P0oHH_Q',
},
'playlist_mincount': 2
}, {
'note': 'translated tab name',
'url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA/playlists',
'info_dict': {
'id': 'UCiu-3thuViMebBjw_5nWYrA',
'tags': [],
'uploader_id': 'UCiu-3thuViMebBjw_5nWYrA',
'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA',
'description': '',
'title': 'cole-dlp-test-acc - 再生リスト',
'uploader_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA',
'uploader': 'cole-dlp-test-acc',
'channel_id': 'UCiu-3thuViMebBjw_5nWYrA',
'channel': 'cole-dlp-test-acc',
},
'playlist_mincount': 1,
'params': {'extractor_args': {'youtube': {'lang': ['ja']}}},
'expected_warnings': ['Preferring "ja"'],
}, {
# XXX: this should really check flat playlist entries, but the test suite doesn't support that
'note': 'preferred lang set with playlist with translated video titles',
'url': 'https://www.youtube.com/playlist?list=PLt5yu3-wZAlQAaPZ5Z-rJoTdbT-45Q7c0',
'info_dict': {
'id': 'PLt5yu3-wZAlQAaPZ5Z-rJoTdbT-45Q7c0',
'tags': [],
'view_count': int,
'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA',
'uploader': 'cole-dlp-test-acc',
'uploader_id': 'UCiu-3thuViMebBjw_5nWYrA',
'channel': 'cole-dlp-test-acc',
'channel_id': 'UCiu-3thuViMebBjw_5nWYrA',
'description': 'test',
'uploader_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA',
'title': 'dlp test playlist',
'availability': 'public',
},
'playlist_mincount': 1,
'params': {'extractor_args': {'youtube': {'lang': ['ja']}}},
'expected_warnings': ['Preferring "ja"'],
}]
@classmethod
@ -5527,10 +5724,20 @@ def get_mobj(url):
tabs = traverse_obj(data, ('contents', 'twoColumnBrowseResultsRenderer', 'tabs'), expected_type=list)
if tabs:
selected_tab = self._extract_selected_tab(tabs)
selected_tab_name = selected_tab.get('title', '').lower()
selected_tab_url = urljoin(
url, traverse_obj(selected_tab, ('endpoint', 'commandMetadata', 'webCommandMetadata', 'url')))
translated_tab_name = selected_tab.get('title', '').lower()
# Prefer tab name from tab url as it is always in en,
# but only when preferred lang is set as it may not extract reliably in all cases.
selected_tab_name = (self._preferred_lang in (None, 'en') and translated_tab_name
or selected_tab_url and get_mobj(selected_tab_url)['tab'][1:] # primary
or translated_tab_name)
if selected_tab_name == 'home':
selected_tab_name = 'featured'
requested_tab_name = mobj['tab'][1:]
if 'no-youtube-channel-redirect' not in compat_opts:
if requested_tab_name == 'live': # Live tab should have redirected to the video
raise UserNotLive(video_id=mobj['id'])
@ -5642,6 +5849,7 @@ class YoutubePlaylistIE(InfoExtractor):
'channel': 'milan',
'channel_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
'uploader_url': 'https://www.youtube.com/channel/UCEI1-PVPcYXjB73Hfelbmaw',
'availability': 'public',
},
'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'],
}, {
@ -5660,6 +5868,7 @@ class YoutubePlaylistIE(InfoExtractor):
'uploader_url': 'https://www.youtube.com/c/愛低音的國王',
'channel_id': 'UC21nz3_MesPLqtDqwdvnoxA',
'modified_date': r're:\d{8}',
'availability': 'public',
},
'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'],
}, {
@ -5848,7 +6057,7 @@ def _extract_notification_renderer(self, notification):
title = self._search_regex(
rf'{re.escape(channel or "")}[^:]+: (.+)', notification_title,
'video title', default=None)
upload_date = (strftime_or_none(self._extract_time_text(notification, 'sentTimeText')[0], '%Y%m%d')
upload_date = (strftime_or_none(self._parse_time_text(self._get_text(notification, 'sentTimeText')), '%Y%m%d')
if self._configuration_arg('approximate_date', ie_key=YoutubeTabIE.ie_key())
else None)
return {