[youtube:tab] Extract more playlist metadata (#2069)

* Add fields modified_date, modified_timestamp
* Add field playlist_count
* [youtube:tab] Extract view_count, playlist_count, modified_date

Authored by: coletdjnz, pukkandan
This commit is contained in:
coletdjnz 2022-01-07 11:03:02 +00:00 committed by GitHub
parent 97a6b117d9
commit f0d785d3ed
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 65 additions and 26 deletions

View file

@ -1120,8 +1120,10 @@ # OUTPUT TEMPLATE
- `creator` (string): The creator of the video - `creator` (string): The creator of the video
- `timestamp` (numeric): UNIX timestamp of the moment the video became available - `timestamp` (numeric): UNIX timestamp of the moment the video became available
- `upload_date` (string): Video upload date (YYYYMMDD) - `upload_date` (string): Video upload date (YYYYMMDD)
- `release_date` (string): The date (YYYYMMDD) when the video was released
- `release_timestamp` (numeric): UNIX timestamp of the moment the video was released - `release_timestamp` (numeric): UNIX timestamp of the moment the video was released
- `release_date` (string): The date (YYYYMMDD) when the video was released
- `modified_timestamp` (numeric): UNIX timestamp of the moment the video was last modified
- `modified_date` (string): The date (YYYYMMDD) when the video was last modified
- `uploader_id` (string): Nickname or id of the video uploader - `uploader_id` (string): Nickname or id of the video uploader
- `channel` (string): Full name of the channel the video is uploaded on - `channel` (string): Full name of the channel the video is uploaded on
- `channel_id` (string): Id of the channel - `channel_id` (string): Id of the channel
@ -1167,6 +1169,7 @@ # OUTPUT TEMPLATE
- `video_autonumber` (numeric): Number that will be increased with each video - `video_autonumber` (numeric): Number that will be increased with each video
- `n_entries` (numeric): Total number of extracted items in the playlist - `n_entries` (numeric): Total number of extracted items in the playlist
- `playlist` (string): Name or id of the playlist that contains the video - `playlist` (string): Name or id of the playlist that contains the video
- `playlist_count` (numeric): Total number of items in the playlist. May not be known if entire playlist is not extracted
- `playlist_index` (numeric): Index of the video in the playlist padded with leading zeros according to the final index - `playlist_index` (numeric): Index of the video in the playlist padded with leading zeros according to the final index
- `playlist_autonumber` (numeric): Position of the video in the playlist download queue padded with leading zeros according to the total length of the playlist - `playlist_autonumber` (numeric): Position of the video in the playlist download queue padded with leading zeros according to the total length of the playlist
- `playlist_id` (string): Playlist identifier - `playlist_id` (string): Playlist identifier

View file

@ -1636,14 +1636,15 @@ def iter_playlistitems(format):
playlistitems = orderedSet(iter_playlistitems(playlistitems_str)) playlistitems = orderedSet(iter_playlistitems(playlistitems_str))
ie_entries = ie_result['entries'] ie_entries = ie_result['entries']
msg = (
'Downloading %d videos' if not isinstance(ie_entries, list)
else 'Collected %d videos; downloading %%d of them' % len(ie_entries))
if isinstance(ie_entries, list): if isinstance(ie_entries, list):
playlist_count = len(ie_entries)
msg = f'Collected {playlist_count} videos; downloading %d of them'
ie_result['playlist_count'] = ie_result.get('playlist_count') or playlist_count
def get_entry(i): def get_entry(i):
return ie_entries[i - 1] return ie_entries[i - 1]
else: else:
msg = 'Downloading %d videos'
if not isinstance(ie_entries, (PagedList, LazyList)): if not isinstance(ie_entries, (PagedList, LazyList)):
ie_entries = LazyList(ie_entries) ie_entries = LazyList(ie_entries)
@ -1652,7 +1653,7 @@ def get_entry(i):
lambda self, i: ie_entries[i - 1] lambda self, i: ie_entries[i - 1]
)(self, i) )(self, i)
entries = [] entries, broken = [], False
items = playlistitems if playlistitems is not None else itertools.count(playliststart) items = playlistitems if playlistitems is not None else itertools.count(playliststart)
for i in items: for i in items:
if i == 0: if i == 0:
@ -1674,6 +1675,7 @@ def get_entry(i):
if entry is not None: if entry is not None:
self._match_entry(entry, incomplete=True, silent=True) self._match_entry(entry, incomplete=True, silent=True)
except (ExistingVideoReached, RejectedVideoReached): except (ExistingVideoReached, RejectedVideoReached):
broken = True
break break
ie_result['entries'] = entries ie_result['entries'] = entries
@ -1684,6 +1686,9 @@ def get_entry(i):
if entry is not None] if entry is not None]
n_entries = len(entries) n_entries = len(entries)
if not (ie_result.get('playlist_count') or broken or playlistitems or playlistend):
ie_result['playlist_count'] = n_entries
if not playlistitems and (playliststart != 1 or playlistend): if not playlistitems and (playliststart != 1 or playlistend):
playlistitems = list(range(playliststart, playliststart + n_entries)) playlistitems = list(range(playliststart, playliststart + n_entries))
ie_result['requested_entries'] = playlistitems ie_result['requested_entries'] = playlistitems
@ -1733,6 +1738,7 @@ def get_entry(i):
extra = { extra = {
'n_entries': n_entries, 'n_entries': n_entries,
'_last_playlist_index': max(playlistitems) if playlistitems else (playlistend or n_entries), '_last_playlist_index': max(playlistitems) if playlistitems else (playlistend or n_entries),
'playlist_count': ie_result.get('playlist_count'),
'playlist_index': playlist_index, 'playlist_index': playlist_index,
'playlist_autonumber': i, 'playlist_autonumber': i,
'playlist': playlist, 'playlist': playlist,
@ -2331,6 +2337,7 @@ def sanitize_numeric_fields(info):
for ts_key, date_key in ( for ts_key, date_key in (
('timestamp', 'upload_date'), ('timestamp', 'upload_date'),
('release_timestamp', 'release_date'), ('release_timestamp', 'release_date'),
('modified_timestamp', 'modified_date'),
): ):
if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None: if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None:
# Working around out-of-range timestamp values (e.g. negative ones on Windows, # Working around out-of-range timestamp values (e.g. negative ones on Windows,

View file

@ -243,11 +243,16 @@ class InfoExtractor(object):
uploader: Full name of the video uploader. uploader: Full name of the video uploader.
license: License name the video is licensed under. license: License name the video is licensed under.
creator: The creator of the video. creator: The creator of the video.
release_timestamp: UNIX timestamp of the moment the video was released.
release_date: The date (YYYYMMDD) when the video was released.
timestamp: UNIX timestamp of the moment the video was uploaded timestamp: UNIX timestamp of the moment the video was uploaded
upload_date: Video upload date (YYYYMMDD). upload_date: Video upload date (YYYYMMDD).
If not explicitly set, calculated from timestamp. If not explicitly set, calculated from timestamp
release_timestamp: UNIX timestamp of the moment the video was released.
If it is not clear whether to use timestamp or this, use the former
release_date: The date (YYYYMMDD) when the video was released.
If not explicitly set, calculated from release_timestamp
modified_timestamp: UNIX timestamp of the moment the video was last modified.
modified_date: The date (YYYYMMDD) when the video was last modified.
If not explicitly set, calculated from modified_timestamp
uploader_id: Nickname or id of the video uploader. uploader_id: Nickname or id of the video uploader.
uploader_url: Full URL to a personal webpage of the video uploader. uploader_url: Full URL to a personal webpage of the video uploader.
channel: Full name of the channel the video is uploaded on. channel: Full name of the channel the video is uploaded on.
@ -383,6 +388,11 @@ class InfoExtractor(object):
Additionally, playlists can have "id", "title", and any other relevant Additionally, playlists can have "id", "title", and any other relevant
attributes with the same semantics as videos (see above). attributes with the same semantics as videos (see above).
It can also have the following optional fields:
playlist_count: The total number of videos in a playlist. If not given,
YoutubeDL tries to calculate it from "entries"
_type "multi_video" indicates that there are multiple videos that _type "multi_video" indicates that there are multiple videos that
form a single show, for example multiple acts of an opera or TV episode. form a single show, for example multiple acts of an opera or TV episode.

View file

@ -62,6 +62,7 @@
try_get, try_get,
unescapeHTML, unescapeHTML,
unified_strdate, unified_strdate,
unified_timestamp,
unsmuggle_url, unsmuggle_url,
update_url_query, update_url_query,
url_or_none, url_or_none,
@ -667,6 +668,14 @@ def _get_text(data, *path_list, max_runs=None):
if text: if text:
return text return text
# Extract a numeric count from the text that `_get_text` finds in `data` using `path_list`.
# Tries `parse_count` first, then falls back to the leading run of digits/commas.
# NOTE(review): presumably returns an int, or None when nothing parses; confirm against parse_count / str_to_int.
def _get_count(self, data, *path_list):
# Use an empty string when _get_text yields a falsy value, so the calls below always get a str.
count_text = self._get_text(data, *path_list) or ''
count = parse_count(count_text)
if count is None:
# parse_count gave no result: remove all whitespace, then take the digits/commas at the start of the text.
# default=None makes _search_regex hand back None on no match, which is then passed to str_to_int.
count = str_to_int(
self._search_regex(r'^([\d,]+)', re.sub(r'\s', '', count_text), 'count', default=None))
return count
@staticmethod @staticmethod
def _extract_thumbnails(data, *path_list): def _extract_thumbnails(data, *path_list):
""" """
@ -695,12 +704,15 @@ def _extract_thumbnails(data, *path_list):
def extract_relative_time(relative_time_text): def extract_relative_time(relative_time_text):
""" """
Extracts a relative time from string and converts to dt object Extracts a relative time from string and converts to dt object
e.g. 'streamed 6 days ago', '5 seconds ago (edited)' e.g. 'streamed 6 days ago', '5 seconds ago (edited)', 'updated today'
""" """
mobj = re.search(r'(?P<time>\d+)\s*(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?\s*ago', relative_time_text) mobj = re.search(r'(?P<start>today|yesterday|now)|(?P<time>\d+)\s*(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?\s*ago', relative_time_text)
if mobj: if mobj:
start = mobj.group('start')
if start:
return datetime_from_str(start)
try: try:
return datetime_from_str('now-%s%s' % (mobj.group('time'), mobj.group('unit')), precision='auto') return datetime_from_str('now-%s%s' % (mobj.group('time'), mobj.group('unit')))
except ValueError: except ValueError:
return None return None
@ -710,6 +722,13 @@ def _extract_time_text(self, renderer, *path_list):
timestamp = None timestamp = None
if isinstance(dt, datetime.datetime): if isinstance(dt, datetime.datetime):
timestamp = calendar.timegm(dt.timetuple()) timestamp = calendar.timegm(dt.timetuple())
if timestamp is None:
timestamp = (
unified_timestamp(text) or unified_timestamp(
self._search_regex(
(r'(?:.+|^)(?:live|premieres|ed|ing)(?:\s*on)?\s*(.+\d)', r'\w+[\s,\.-]*\w+[\s,\.-]+20\d{2}'), text.lower(), 'time text', default=None)))
if text and timestamp is None: if text and timestamp is None:
self.report_warning('Cannot parse localized time text' + bug_reports_message(), only_once=True) self.report_warning('Cannot parse localized time text' + bug_reports_message(), only_once=True)
return timestamp, text return timestamp, text
@ -794,10 +813,7 @@ def _extract_video(self, renderer):
description = self._get_text(renderer, 'descriptionSnippet') description = self._get_text(renderer, 'descriptionSnippet')
duration = parse_duration(self._get_text( duration = parse_duration(self._get_text(
renderer, 'lengthText', ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'text'))) renderer, 'lengthText', ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'text')))
view_count_text = self._get_text(renderer, 'viewCountText') or '' view_count = self._get_count(renderer, 'viewCountText')
view_count = str_to_int(self._search_regex(
r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
'view count', default=None))
uploader = self._get_text(renderer, 'ownerText', 'shortBylineText') uploader = self._get_text(renderer, 'ownerText', 'shortBylineText')
channel_id = traverse_obj( channel_id = traverse_obj(
@ -2317,8 +2333,8 @@ def extract_header(contents):
_continuation = None _continuation = None
for content in contents: for content in contents:
comments_header_renderer = traverse_obj(content, 'commentsHeaderRenderer') comments_header_renderer = traverse_obj(content, 'commentsHeaderRenderer')
expected_comment_count = parse_count(self._get_text( expected_comment_count = self._get_count(
comments_header_renderer, 'countText', 'commentsCount', max_runs=1)) comments_header_renderer, 'countText', 'commentsCount')
if expected_comment_count: if expected_comment_count:
tracker['est_total'] = expected_comment_count tracker['est_total'] = expected_comment_count
@ -3603,6 +3619,7 @@ def _extract_from_tabs(self, item_id, ytcfg, data, tabs):
tags = [] tags = []
selected_tab = self._extract_selected_tab(tabs) selected_tab = self._extract_selected_tab(tabs)
primary_sidebar_renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer')
renderer = try_get( renderer = try_get(
data, lambda x: x['metadata']['channelMetadataRenderer'], dict) data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
if renderer: if renderer:
@ -3622,17 +3639,18 @@ def _extract_from_tabs(self, item_id, ytcfg, data, tabs):
thumbnails = ( thumbnails = (
self._extract_thumbnails(renderer, 'avatar') self._extract_thumbnails(renderer, 'avatar')
or self._extract_thumbnails( or self._extract_thumbnails(
self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer'), primary_sidebar_renderer, ('thumbnailRenderer', 'playlistVideoThumbnailRenderer', 'thumbnail')))
('thumbnailRenderer', 'playlistVideoThumbnailRenderer', 'thumbnail')))
if playlist_id is None: if playlist_id is None:
playlist_id = item_id playlist_id = item_id
playlist_stats = traverse_obj(primary_sidebar_renderer, 'stats')
last_updated_unix, _ = self._extract_time_text(playlist_stats, 2)
if title is None: if title is None:
title = ( title = self._get_text(data, ('header', 'hashtagHeaderRenderer', 'hashtag')) or playlist_id
try_get(data, lambda x: x['header']['hashtagHeaderRenderer']['hashtag']['simpleText'])
or playlist_id)
title += format_field(selected_tab, 'title', ' - %s') title += format_field(selected_tab, 'title', ' - %s')
title += format_field(selected_tab, 'expandedText', ' - %s') title += format_field(selected_tab, 'expandedText', ' - %s')
metadata = { metadata = {
'playlist_id': playlist_id, 'playlist_id': playlist_id,
'playlist_title': title, 'playlist_title': title,
@ -3642,10 +3660,11 @@ def _extract_from_tabs(self, item_id, ytcfg, data, tabs):
'uploader_url': channel_url, 'uploader_url': channel_url,
'thumbnails': thumbnails, 'thumbnails': thumbnails,
'tags': tags, 'tags': tags,
'view_count': self._get_count(playlist_stats, 1),
'availability': self._extract_availability(data),
'modified_date': strftime_or_none(last_updated_unix, '%Y%m%d'),
'playlist_count': self._get_count(playlist_stats, 0)
} }
availability = self._extract_availability(data)
if availability:
metadata['availability'] = availability
if not channel_id: if not channel_id:
metadata.update(self._extract_uploader(data)) metadata.update(self._extract_uploader(data))
metadata.update({ metadata.update({