mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2024-11-27 10:31:29 +00:00
[youtube:tab] Extract more playlist metadata (#2069)
* Add fields modified_date, modified_timestamp * Add field playlist_count * [youtube:tab] Extract view_count, playlist_count, modified_date Authored by: coletdjnz, pukkandan
This commit is contained in:
parent
97a6b117d9
commit
f0d785d3ed
|
@ -1120,8 +1120,10 @@ # OUTPUT TEMPLATE
|
||||||
- `creator` (string): The creator of the video
|
- `creator` (string): The creator of the video
|
||||||
- `timestamp` (numeric): UNIX timestamp of the moment the video became available
|
- `timestamp` (numeric): UNIX timestamp of the moment the video became available
|
||||||
- `upload_date` (string): Video upload date (YYYYMMDD)
|
- `upload_date` (string): Video upload date (YYYYMMDD)
|
||||||
- `release_date` (string): The date (YYYYMMDD) when the video was released
|
|
||||||
- `release_timestamp` (numeric): UNIX timestamp of the moment the video was released
|
- `release_timestamp` (numeric): UNIX timestamp of the moment the video was released
|
||||||
|
- `release_date` (string): The date (YYYYMMDD) when the video was released
|
||||||
|
- `modified_timestamp` (numeric): UNIX timestamp of the moment the video was last modified
|
||||||
|
- `modified_date` (string): The date (YYYYMMDD) when the video was last modified
|
||||||
- `uploader_id` (string): Nickname or id of the video uploader
|
- `uploader_id` (string): Nickname or id of the video uploader
|
||||||
- `channel` (string): Full name of the channel the video is uploaded on
|
- `channel` (string): Full name of the channel the video is uploaded on
|
||||||
- `channel_id` (string): Id of the channel
|
- `channel_id` (string): Id of the channel
|
||||||
|
@ -1167,6 +1169,7 @@ # OUTPUT TEMPLATE
|
||||||
- `video_autonumber` (numeric): Number that will be increased with each video
|
- `video_autonumber` (numeric): Number that will be increased with each video
|
||||||
- `n_entries` (numeric): Total number of extracted items in the playlist
|
- `n_entries` (numeric): Total number of extracted items in the playlist
|
||||||
- `playlist` (string): Name or id of the playlist that contains the video
|
- `playlist` (string): Name or id of the playlist that contains the video
|
||||||
|
- `playlist_count` (numeric): Total number of items in the playlist. May not be known if entire playlist is not extracted
|
||||||
- `playlist_index` (numeric): Index of the video in the playlist padded with leading zeros according to the final index
|
- `playlist_index` (numeric): Index of the video in the playlist padded with leading zeros according to the final index
|
||||||
- `playlist_autonumber` (numeric): Position of the video in the playlist download queue padded with leading zeros according to the total length of the playlist
|
- `playlist_autonumber` (numeric): Position of the video in the playlist download queue padded with leading zeros according to the total length of the playlist
|
||||||
- `playlist_id` (string): Playlist identifier
|
- `playlist_id` (string): Playlist identifier
|
||||||
|
|
|
@ -1636,14 +1636,15 @@ def iter_playlistitems(format):
|
||||||
playlistitems = orderedSet(iter_playlistitems(playlistitems_str))
|
playlistitems = orderedSet(iter_playlistitems(playlistitems_str))
|
||||||
|
|
||||||
ie_entries = ie_result['entries']
|
ie_entries = ie_result['entries']
|
||||||
msg = (
|
|
||||||
'Downloading %d videos' if not isinstance(ie_entries, list)
|
|
||||||
else 'Collected %d videos; downloading %%d of them' % len(ie_entries))
|
|
||||||
|
|
||||||
if isinstance(ie_entries, list):
|
if isinstance(ie_entries, list):
|
||||||
|
# Count the collected entries themselves: ie_result is the playlist result dict,
# so len(ie_result) would report its number of keys rather than the number of videos.
playlist_count = len(ie_entries)
|
||||||
|
msg = f'Collected {playlist_count} videos; downloading %d of them'
|
||||||
|
ie_result['playlist_count'] = ie_result.get('playlist_count') or playlist_count
|
||||||
|
|
||||||
def get_entry(i):
|
def get_entry(i):
|
||||||
return ie_entries[i - 1]
|
return ie_entries[i - 1]
|
||||||
else:
|
else:
|
||||||
|
msg = 'Downloading %d videos'
|
||||||
if not isinstance(ie_entries, (PagedList, LazyList)):
|
if not isinstance(ie_entries, (PagedList, LazyList)):
|
||||||
ie_entries = LazyList(ie_entries)
|
ie_entries = LazyList(ie_entries)
|
||||||
|
|
||||||
|
@ -1652,7 +1653,7 @@ def get_entry(i):
|
||||||
lambda self, i: ie_entries[i - 1]
|
lambda self, i: ie_entries[i - 1]
|
||||||
)(self, i)
|
)(self, i)
|
||||||
|
|
||||||
entries = []
|
entries, broken = [], False
|
||||||
items = playlistitems if playlistitems is not None else itertools.count(playliststart)
|
items = playlistitems if playlistitems is not None else itertools.count(playliststart)
|
||||||
for i in items:
|
for i in items:
|
||||||
if i == 0:
|
if i == 0:
|
||||||
|
@ -1674,6 +1675,7 @@ def get_entry(i):
|
||||||
if entry is not None:
|
if entry is not None:
|
||||||
self._match_entry(entry, incomplete=True, silent=True)
|
self._match_entry(entry, incomplete=True, silent=True)
|
||||||
except (ExistingVideoReached, RejectedVideoReached):
|
except (ExistingVideoReached, RejectedVideoReached):
|
||||||
|
broken = True
|
||||||
break
|
break
|
||||||
ie_result['entries'] = entries
|
ie_result['entries'] = entries
|
||||||
|
|
||||||
|
@ -1684,6 +1686,9 @@ def get_entry(i):
|
||||||
if entry is not None]
|
if entry is not None]
|
||||||
n_entries = len(entries)
|
n_entries = len(entries)
|
||||||
|
|
||||||
|
if not (ie_result.get('playlist_count') or broken or playlistitems or playlistend):
|
||||||
|
ie_result['playlist_count'] = n_entries
|
||||||
|
|
||||||
if not playlistitems and (playliststart != 1 or playlistend):
|
if not playlistitems and (playliststart != 1 or playlistend):
|
||||||
playlistitems = list(range(playliststart, playliststart + n_entries))
|
playlistitems = list(range(playliststart, playliststart + n_entries))
|
||||||
ie_result['requested_entries'] = playlistitems
|
ie_result['requested_entries'] = playlistitems
|
||||||
|
@ -1733,6 +1738,7 @@ def get_entry(i):
|
||||||
extra = {
|
extra = {
|
||||||
'n_entries': n_entries,
|
'n_entries': n_entries,
|
||||||
'_last_playlist_index': max(playlistitems) if playlistitems else (playlistend or n_entries),
|
'_last_playlist_index': max(playlistitems) if playlistitems else (playlistend or n_entries),
|
||||||
|
'playlist_count': ie_result.get('playlist_count'),
|
||||||
'playlist_index': playlist_index,
|
'playlist_index': playlist_index,
|
||||||
'playlist_autonumber': i,
|
'playlist_autonumber': i,
|
||||||
'playlist': playlist,
|
'playlist': playlist,
|
||||||
|
@ -2331,6 +2337,7 @@ def sanitize_numeric_fields(info):
|
||||||
for ts_key, date_key in (
|
for ts_key, date_key in (
|
||||||
('timestamp', 'upload_date'),
|
('timestamp', 'upload_date'),
|
||||||
('release_timestamp', 'release_date'),
|
('release_timestamp', 'release_date'),
|
||||||
|
('modified_timestamp', 'modified_date'),
|
||||||
):
|
):
|
||||||
if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None:
|
if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None:
|
||||||
# Working around out-of-range timestamp values (e.g. negative ones on Windows,
|
# Working around out-of-range timestamp values (e.g. negative ones on Windows,
|
||||||
|
|
|
@ -243,11 +243,16 @@ class InfoExtractor(object):
|
||||||
uploader: Full name of the video uploader.
|
uploader: Full name of the video uploader.
|
||||||
license: License name the video is licensed under.
|
license: License name the video is licensed under.
|
||||||
creator: The creator of the video.
|
creator: The creator of the video.
|
||||||
release_timestamp: UNIX timestamp of the moment the video was released.
|
|
||||||
release_date: The date (YYYYMMDD) when the video was released.
|
|
||||||
timestamp: UNIX timestamp of the moment the video was uploaded
|
timestamp: UNIX timestamp of the moment the video was uploaded
|
||||||
upload_date: Video upload date (YYYYMMDD).
|
upload_date: Video upload date (YYYYMMDD).
|
||||||
If not explicitly set, calculated from timestamp.
|
If not explicitly set, calculated from timestamp
|
||||||
|
release_timestamp: UNIX timestamp of the moment the video was released.
|
||||||
|
If it is not clear whether to use timestamp or this, use the former
|
||||||
|
release_date: The date (YYYYMMDD) when the video was released.
|
||||||
|
If not explicitly set, calculated from release_timestamp
|
||||||
|
modified_timestamp: UNIX timestamp of the moment the video was last modified.
|
||||||
|
modified_date: The date (YYYYMMDD) when the video was last modified.
|
||||||
|
If not explicitly set, calculated from modified_timestamp
|
||||||
uploader_id: Nickname or id of the video uploader.
|
uploader_id: Nickname or id of the video uploader.
|
||||||
uploader_url: Full URL to a personal webpage of the video uploader.
|
uploader_url: Full URL to a personal webpage of the video uploader.
|
||||||
channel: Full name of the channel the video is uploaded on.
|
channel: Full name of the channel the video is uploaded on.
|
||||||
|
@ -383,6 +388,11 @@ class InfoExtractor(object):
|
||||||
Additionally, playlists can have "id", "title", and any other relevant
|
Additionally, playlists can have "id", "title", and any other relevant
|
||||||
attributes with the same semantics as videos (see above).
|
attributes with the same semantics as videos (see above).
|
||||||
|
|
||||||
|
It can also have the following optional fields:
|
||||||
|
|
||||||
|
playlist_count: The total number of videos in a playlist. If not given,
|
||||||
|
YoutubeDL tries to calculate it from "entries"
|
||||||
|
|
||||||
|
|
||||||
_type "multi_video" indicates that there are multiple videos that
|
_type "multi_video" indicates that there are multiple videos that
|
||||||
form a single show, for example multiple acts of an opera or TV episode.
|
form a single show, for example multiple acts of an opera or TV episode.
|
||||||
|
|
|
@ -62,6 +62,7 @@
|
||||||
try_get,
|
try_get,
|
||||||
unescapeHTML,
|
unescapeHTML,
|
||||||
unified_strdate,
|
unified_strdate,
|
||||||
|
unified_timestamp,
|
||||||
unsmuggle_url,
|
unsmuggle_url,
|
||||||
update_url_query,
|
update_url_query,
|
||||||
url_or_none,
|
url_or_none,
|
||||||
|
@ -667,6 +668,14 @@ def _get_text(data, *path_list, max_runs=None):
|
||||||
if text:
|
if text:
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
def _get_count(self, data, *path_list):
|
||||||
|
count_text = self._get_text(data, *path_list) or ''
|
||||||
|
count = parse_count(count_text)
|
||||||
|
if count is None:
|
||||||
|
count = str_to_int(
|
||||||
|
self._search_regex(r'^([\d,]+)', re.sub(r'\s', '', count_text), 'count', default=None))
|
||||||
|
return count
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _extract_thumbnails(data, *path_list):
|
def _extract_thumbnails(data, *path_list):
|
||||||
"""
|
"""
|
||||||
|
@ -695,12 +704,15 @@ def _extract_thumbnails(data, *path_list):
|
||||||
def extract_relative_time(relative_time_text):
|
def extract_relative_time(relative_time_text):
|
||||||
"""
|
"""
|
||||||
Extracts a relative time from string and converts to dt object
|
Extracts a relative time from string and converts to dt object
|
||||||
e.g. 'streamed 6 days ago', '5 seconds ago (edited)'
|
e.g. 'streamed 6 days ago', '5 seconds ago (edited)', 'updated today'
|
||||||
"""
|
"""
|
||||||
mobj = re.search(r'(?P<time>\d+)\s*(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?\s*ago', relative_time_text)
|
mobj = re.search(r'(?P<start>today|yesterday|now)|(?P<time>\d+)\s*(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?\s*ago', relative_time_text)
|
||||||
if mobj:
|
if mobj:
|
||||||
|
start = mobj.group('start')
|
||||||
|
if start:
|
||||||
|
return datetime_from_str(start)
|
||||||
try:
|
try:
|
||||||
return datetime_from_str('now-%s%s' % (mobj.group('time'), mobj.group('unit')), precision='auto')
|
return datetime_from_str('now-%s%s' % (mobj.group('time'), mobj.group('unit')))
|
||||||
except ValueError:
|
except ValueError:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
@ -710,6 +722,13 @@ def _extract_time_text(self, renderer, *path_list):
|
||||||
timestamp = None
|
timestamp = None
|
||||||
if isinstance(dt, datetime.datetime):
|
if isinstance(dt, datetime.datetime):
|
||||||
timestamp = calendar.timegm(dt.timetuple())
|
timestamp = calendar.timegm(dt.timetuple())
|
||||||
|
|
||||||
|
if timestamp is None:
|
||||||
|
timestamp = (
|
||||||
|
unified_timestamp(text) or unified_timestamp(
|
||||||
|
self._search_regex(
|
||||||
|
(r'(?:.+|^)(?:live|premieres|ed|ing)(?:\s*on)?\s*(.+\d)', r'\w+[\s,\.-]*\w+[\s,\.-]+20\d{2}'), text.lower(), 'time text', default=None)))
|
||||||
|
|
||||||
if text and timestamp is None:
|
if text and timestamp is None:
|
||||||
self.report_warning('Cannot parse localized time text' + bug_reports_message(), only_once=True)
|
self.report_warning('Cannot parse localized time text' + bug_reports_message(), only_once=True)
|
||||||
return timestamp, text
|
return timestamp, text
|
||||||
|
@ -794,10 +813,7 @@ def _extract_video(self, renderer):
|
||||||
description = self._get_text(renderer, 'descriptionSnippet')
|
description = self._get_text(renderer, 'descriptionSnippet')
|
||||||
duration = parse_duration(self._get_text(
|
duration = parse_duration(self._get_text(
|
||||||
renderer, 'lengthText', ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'text')))
|
renderer, 'lengthText', ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'text')))
|
||||||
view_count_text = self._get_text(renderer, 'viewCountText') or ''
|
view_count = self._get_count(renderer, 'viewCountText')
|
||||||
view_count = str_to_int(self._search_regex(
|
|
||||||
r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
|
|
||||||
'view count', default=None))
|
|
||||||
|
|
||||||
uploader = self._get_text(renderer, 'ownerText', 'shortBylineText')
|
uploader = self._get_text(renderer, 'ownerText', 'shortBylineText')
|
||||||
channel_id = traverse_obj(
|
channel_id = traverse_obj(
|
||||||
|
@ -2317,8 +2333,8 @@ def extract_header(contents):
|
||||||
_continuation = None
|
_continuation = None
|
||||||
for content in contents:
|
for content in contents:
|
||||||
comments_header_renderer = traverse_obj(content, 'commentsHeaderRenderer')
|
comments_header_renderer = traverse_obj(content, 'commentsHeaderRenderer')
|
||||||
expected_comment_count = parse_count(self._get_text(
|
expected_comment_count = self._get_count(
|
||||||
comments_header_renderer, 'countText', 'commentsCount', max_runs=1))
|
comments_header_renderer, 'countText', 'commentsCount')
|
||||||
|
|
||||||
if expected_comment_count:
|
if expected_comment_count:
|
||||||
tracker['est_total'] = expected_comment_count
|
tracker['est_total'] = expected_comment_count
|
||||||
|
@ -3603,6 +3619,7 @@ def _extract_from_tabs(self, item_id, ytcfg, data, tabs):
|
||||||
tags = []
|
tags = []
|
||||||
|
|
||||||
selected_tab = self._extract_selected_tab(tabs)
|
selected_tab = self._extract_selected_tab(tabs)
|
||||||
|
primary_sidebar_renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer')
|
||||||
renderer = try_get(
|
renderer = try_get(
|
||||||
data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
|
data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
|
||||||
if renderer:
|
if renderer:
|
||||||
|
@ -3622,17 +3639,18 @@ def _extract_from_tabs(self, item_id, ytcfg, data, tabs):
|
||||||
thumbnails = (
|
thumbnails = (
|
||||||
self._extract_thumbnails(renderer, 'avatar')
|
self._extract_thumbnails(renderer, 'avatar')
|
||||||
or self._extract_thumbnails(
|
or self._extract_thumbnails(
|
||||||
self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer'),
|
primary_sidebar_renderer, ('thumbnailRenderer', 'playlistVideoThumbnailRenderer', 'thumbnail')))
|
||||||
('thumbnailRenderer', 'playlistVideoThumbnailRenderer', 'thumbnail')))
|
|
||||||
|
|
||||||
if playlist_id is None:
|
if playlist_id is None:
|
||||||
playlist_id = item_id
|
playlist_id = item_id
|
||||||
|
|
||||||
|
playlist_stats = traverse_obj(primary_sidebar_renderer, 'stats')
|
||||||
|
last_updated_unix, _ = self._extract_time_text(playlist_stats, 2)
|
||||||
if title is None:
|
if title is None:
|
||||||
title = (
|
title = self._get_text(data, ('header', 'hashtagHeaderRenderer', 'hashtag')) or playlist_id
|
||||||
try_get(data, lambda x: x['header']['hashtagHeaderRenderer']['hashtag']['simpleText'])
|
|
||||||
or playlist_id)
|
|
||||||
title += format_field(selected_tab, 'title', ' - %s')
|
title += format_field(selected_tab, 'title', ' - %s')
|
||||||
title += format_field(selected_tab, 'expandedText', ' - %s')
|
title += format_field(selected_tab, 'expandedText', ' - %s')
|
||||||
|
|
||||||
metadata = {
|
metadata = {
|
||||||
'playlist_id': playlist_id,
|
'playlist_id': playlist_id,
|
||||||
'playlist_title': title,
|
'playlist_title': title,
|
||||||
|
@ -3642,10 +3660,11 @@ def _extract_from_tabs(self, item_id, ytcfg, data, tabs):
|
||||||
'uploader_url': channel_url,
|
'uploader_url': channel_url,
|
||||||
'thumbnails': thumbnails,
|
'thumbnails': thumbnails,
|
||||||
'tags': tags,
|
'tags': tags,
|
||||||
|
'view_count': self._get_count(playlist_stats, 1),
|
||||||
|
'availability': self._extract_availability(data),
|
||||||
|
'modified_date': strftime_or_none(last_updated_unix, '%Y%m%d'),
|
||||||
|
'playlist_count': self._get_count(playlist_stats, 0)
|
||||||
}
|
}
|
||||||
availability = self._extract_availability(data)
|
|
||||||
if availability:
|
|
||||||
metadata['availability'] = availability
|
|
||||||
if not channel_id:
|
if not channel_id:
|
||||||
metadata.update(self._extract_uploader(data))
|
metadata.update(self._extract_uploader(data))
|
||||||
metadata.update({
|
metadata.update({
|
||||||
|
|
Loading…
Reference in a new issue