From b60419c51aa3eb9872e278e526cc5e62bf484462 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Tue, 2 Feb 2021 21:51:32 +0530 Subject: [PATCH] [youtube] More metadata extraction for channels/playlists --- youtube_dlc/extractor/common.py | 8 ++-- youtube_dlc/extractor/youtube.py | 69 +++++++++++++++++++++++--------- 2 files changed, 53 insertions(+), 24 deletions(-) diff --git a/youtube_dlc/extractor/common.py b/youtube_dlc/extractor/common.py index e13ba5a394..49d99bb557 100644 --- a/youtube_dlc/extractor/common.py +++ b/youtube_dlc/extractor/common.py @@ -336,9 +336,8 @@ class InfoExtractor(object): There must be a key "entries", which is a list, an iterable, or a PagedList object, each element of which is a valid dictionary by this specification. - Additionally, playlists can have "id", "title", "description", "uploader", - "uploader_id", "uploader_url", "duration" attributes with the same semantics - as videos (see above). + Additionally, playlists can have "id", "title", and any other relevant + attributes with the same semantics as videos (see above). 
_type "multi_video" indicates that there are multiple videos that @@ -967,10 +966,11 @@ def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, urls, playlist_id=playlist_id, playlist_title=playlist_title) @staticmethod - def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None): + def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs): """Returns a playlist""" video_info = {'_type': 'playlist', 'entries': entries} + video_info.update(kwargs) if playlist_id: video_info['id'] = playlist_id if playlist_title: diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index 0ba6a299e0..9b71776945 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -31,6 +31,7 @@ clean_html, error_to_compat_str, ExtractorError, + format_field, float_or_none, get_element_by_id, int_or_none, @@ -2675,6 +2676,7 @@ def decrypt_sig(mobj): 'uploader': video_uploader, 'uploader_id': video_uploader_id, 'uploader_url': video_uploader_url, + 'channel': video_uploader, 'channel_id': channel_id, 'channel_url': channel_url, 'upload_date': upload_date, @@ -3402,44 +3404,71 @@ def _extract_uploader(data): uploader['uploader_url'] = urljoin( 'https://www.youtube.com/', try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str)) - return uploader + return {k:v for k, v in uploader.items() if v is not None} def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token): + playlist_id = title = description = channel_url = channel_name = channel_id = None + thumbnails_list = tags = [] + selected_tab = self._extract_selected_tab(tabs) renderer = try_get( data, lambda x: x['metadata']['channelMetadataRenderer'], dict) - playlist_id = title = description = None if renderer: - channel_title = renderer.get('title') or item_id - tab_title = selected_tab.get('title') - title = channel_title or 
item_id - if tab_title: - title += ' - %s' % tab_title - description = renderer.get('description') - playlist_id = renderer.get('externalId') + channel_name = renderer.get('title') + channel_url = renderer.get('channelUrl') + channel_id = renderer.get('externalId') - # this has thumbnails, but there is currently no thumbnail field for playlists - # sidebar.playlistSidebarRenderer has even more data, but its stucture is more complec - renderer = try_get( - data, lambda x: x['microformat']['microformatDataRenderer'], dict) if not renderer: renderer = try_get( data, lambda x: x['metadata']['playlistMetadataRenderer'], dict) if renderer: title = renderer.get('title') description = renderer.get('description') - playlist_id = item_id + playlist_id = channel_id + tags = renderer.get('keywords', '').split() + thumbnails_list = ( + try_get(renderer, lambda x: x['avatar']['thumbnails'], list) + or data['sidebar']['playlistSidebarRenderer']['items'][0]['playlistSidebarPrimaryInfoRenderer']['thumbnailRenderer']['playlistVideoThumbnailRenderer']['thumbnail']['thumbnails'] + or []) + + thumbnails = [] + for t in thumbnails_list: + if not isinstance(t, dict): + continue + thumbnail_url = url_or_none(t.get('url')) + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int_or_none(t.get('width')), + 'height': int_or_none(t.get('height')), + }) if playlist_id is None: playlist_id = item_id if title is None: - title = "Youtube " + playlist_id.title() - playlist = self.playlist_result( + title = playlist_id + title += format_field(selected_tab, 'title', ' - %s') + + metadata = { + 'playlist_id': playlist_id, + 'playlist_title': title, + 'playlist_description': description, + 'uploader': channel_name, + 'uploader_id': channel_id, + 'uploader_url': channel_url, + 'thumbnails': thumbnails, + 'tags': tags, + } + if not channel_id: + metadata.update(self._extract_uploader(data)) + metadata.update({ + 'channel': metadata['uploader'], + 'channel_id': 
metadata['uploader_id'], + 'channel_url': metadata['uploader_url']}) + return self.playlist_result( self._entries(selected_tab, identity_token), - playlist_id=playlist_id, playlist_title=title, - playlist_description=description) - playlist.update(self._extract_uploader(data)) - return playlist + **metadata) def _extract_from_playlist(self, item_id, url, data, playlist): title = playlist.get('title') or try_get(