[ie/youtube] Extract upload date timestamp if available

This commit is contained in:
coletdjnz 2024-05-04 21:13:01 +12:00
parent ac817bc83e
commit 17bcceda78
No known key found for this signature in database
GPG Key ID: 91984263BB39894A
3 changed files with 39 additions and 12 deletions

View File

@ -5,6 +5,7 @@ import os
import sys
import unittest
import warnings
import datetime as dt
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
@ -768,6 +769,8 @@ class TestUtil(unittest.TestCase):
def test_parse_iso8601(self):
self.assertEqual(parse_iso8601('2014-03-23T23:04:26+0100'), 1395612266)
self.assertEqual(parse_iso8601('2014-03-23T23:04:26-07:00'), 1395641066)
self.assertEqual(parse_iso8601('2014-03-23T23:04:26', timezone=dt.timedelta(hours=-7)), 1395641066)
self.assertEqual(parse_iso8601('2014-03-23T22:04:26+0000'), 1395612266)
self.assertEqual(parse_iso8601('2014-03-23T22:04:26Z'), 1395612266)
self.assertEqual(parse_iso8601('2014-03-23T22:04:26.1234Z'), 1395612266)

View File

@ -4552,19 +4552,43 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_id': channel_handle,
'uploader_url': format_field(channel_handle, None, 'https://www.youtube.com/%s', default=None),
})
def get_pacific_tz():
    """Return the current UTC offset of US Pacific time as a ``timedelta``.

    The offset is computed for "now", so it reflects whether daylight saving
    time is in effect at call time rather than at the time being parsed.
    When the time zone database cannot be used, a fixed UTC-7 offset is
    returned instead, which may make the resulting timestamp slightly
    inaccurate.
    """
    fallback = dt.timedelta(hours=-7)
    # Python 3.8 has no zoneinfo module; support for it should be dropped soon
    if sys.version_info < (3, 9):
        return fallback
    from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
    try:
        # Canonical IANA name; 'US/Pacific' is only a backward-compatibility link to it
        pacific = ZoneInfo('America/Los_Angeles')
    except ZoneInfoNotFoundError:
        # No system tz database and no tzdata package (common on Windows)
        return fallback
    return dt.datetime.now(pacific).utcoffset()
# Only use the timestamp if uploadDate includes a time component (second precision).
# If it carries no timezone information, assume it is in Pacific Time (PT).
timestamp = (
parse_iso8601(get_first(microformats, 'uploadDate'), timezone=get_pacific_tz())
or parse_iso8601(search_meta('uploadDate'), timezone=get_pacific_tz())
)
upload_date = (
dt.datetime.fromtimestamp(timestamp, dt.timezone.utc).strftime('%Y%m%d') if timestamp else
(
unified_strdate(get_first(microformats, 'uploadDate'))
or unified_strdate(search_meta('uploadDate'))
))
# In the case we cannot get the timestamp:
# The upload date for scheduled, live and past live streams / premieres in microformats
# may be different from the stream date. Although not in UTC, we will prefer it in this case.
# See: https://github.com/yt-dlp/yt-dlp/pull/2223#issuecomment-1008485139
upload_date = (
unified_strdate(get_first(microformats, 'uploadDate'))
or unified_strdate(search_meta('uploadDate')))
if not upload_date or (
live_status in ('not_live', None)
not timestamp
and live_status in ('not_live', None)
and 'no-youtube-prefer-utc-upload-date' not in self.get_param('compat_opts', [])
):
# this should be in UTC, as configured in the cookie/client context
upload_date = strftime_or_none(
self._parse_time_text(self._get_text(vpir, 'dateText'))) or upload_date
info['upload_date'] = upload_date
info['timestamp'] = timestamp
if upload_date and live_status not in ('is_live', 'post_live', 'is_upcoming'):
# Newly uploaded videos' HLS formats are potentially problematic and need to be checked

View File

@ -1134,7 +1134,7 @@ def is_path_like(f):
return isinstance(f, (str, bytes, os.PathLike))
def extract_timezone(date_str):
def extract_timezone(date_str, default=None):
m = re.search(
r'''(?x)
^.{8,}? # >=8 char non-TZ prefix, if present
@ -1146,22 +1146,23 @@ def extract_timezone(date_str):
(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
$)
''', date_str)
timezone = None
if not m:
m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
if timezone is not None:
date_str = date_str[:-len(m.group('tz'))]
timezone = dt.timedelta(hours=timezone or 0)
timezone = dt.timedelta(hours=timezone)
else:
date_str = date_str[:-len(m.group('tz'))]
if not m.group('sign'):
timezone = dt.timedelta()
else:
if m.group('sign'):
sign = 1 if m.group('sign') == '+' else -1
timezone = dt.timedelta(
hours=sign * int(m.group('hours')),
minutes=sign * int(m.group('minutes')))
return timezone, date_str
return timezone or default or dt.timedelta(), date_str
def parse_iso8601(date_str, delimiter='T', timezone=None):
@ -1172,8 +1173,7 @@ def parse_iso8601(date_str, delimiter='T', timezone=None):
date_str = re.sub(r'\.[0-9]+', '', date_str)
if timezone is None:
timezone, date_str = extract_timezone(date_str)
timezone, date_str = extract_timezone(date_str, timezone)
with contextlib.suppress(ValueError):
date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'