From 17bcceda78f27caae4741bae65a6d59b2e82ae66 Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Sat, 4 May 2024 21:13:01 +1200 Subject: [PATCH 1/6] [ie/youtube] Extract upload date timestamp if available --- test/test_utils.py | 3 +++ yt_dlp/extractor/youtube.py | 32 ++++++++++++++++++++++++++++---- yt_dlp/utils/_utils.py | 16 ++++++++-------- 3 files changed, 39 insertions(+), 12 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 816cf03f6..e1404d17e 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -5,6 +5,7 @@ import os import sys import unittest import warnings +import datetime as dt sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -768,6 +769,8 @@ class TestUtil(unittest.TestCase): def test_parse_iso8601(self): self.assertEqual(parse_iso8601('2014-03-23T23:04:26+0100'), 1395612266) + self.assertEqual(parse_iso8601('2014-03-23T23:04:26-07:00'), 1395641066) + self.assertEqual(parse_iso8601('2014-03-23T23:04:26', timezone=dt.timedelta(hours=-7)), 1395641066) self.assertEqual(parse_iso8601('2014-03-23T22:04:26+0000'), 1395612266) self.assertEqual(parse_iso8601('2014-03-23T22:04:26Z'), 1395612266) self.assertEqual(parse_iso8601('2014-03-23T22:04:26.1234Z'), 1395612266) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index e553fff9f..4070a9c72 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -4552,19 +4552,43 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': channel_handle, 'uploader_url': format_field(channel_handle, None, 'https://www.youtube.com/%s', default=None), }) + + def get_pacific_tz(): + # Python 3.8 should be deprecated soon + # This fallback may make the timestamp slightly inaccurate for 3.8 users. 
+ if sys.version_info < (3, 9): + return dt.timedelta(hours=-7) + from zoneinfo import ZoneInfo + return dt.datetime.now(ZoneInfo('US/Pacific')).utcoffset() + + # We only want timestamp IF it has second precision + # Additionally, if there is no timezone present, we should assume it is in PT. + timestamp = ( + parse_iso8601(get_first(microformats, 'uploadDate'), timezone=get_pacific_tz()) + or parse_iso8601(search_meta('uploadDate'), timezone=get_pacific_tz()) + ) + upload_date = ( + dt.datetime.fromtimestamp(timestamp, dt.timezone.utc).strftime('%Y%m%d') if timestamp else + ( + unified_strdate(get_first(microformats, 'uploadDate')) + or unified_strdate(search_meta('uploadDate')) + )) + + # In the case we cannot get the timestamp: # The upload date for scheduled, live and past live streams / premieres in microformats # may be different from the stream date. Although not in UTC, we will prefer it in this case. # See: https://github.com/yt-dlp/yt-dlp/pull/2223#issuecomment-1008485139 - upload_date = ( - unified_strdate(get_first(microformats, 'uploadDate')) - or unified_strdate(search_meta('uploadDate'))) if not upload_date or ( - live_status in ('not_live', None) + not timestamp + and live_status in ('not_live', None) and 'no-youtube-prefer-utc-upload-date' not in self.get_param('compat_opts', []) ): + # this should be in UTC, as configured in the cookie/client context upload_date = strftime_or_none( self._parse_time_text(self._get_text(vpir, 'dateText'))) or upload_date + info['upload_date'] = upload_date + info['timestamp'] = timestamp if upload_date and live_status not in ('is_live', 'post_live', 'is_upcoming'): # Newly uploaded videos' HLS formats are potentially problematic and need to be checked diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index b63766912..c7c6bdbdd 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -1134,7 +1134,7 @@ def is_path_like(f): return isinstance(f, (str, bytes, os.PathLike)) -def 
extract_timezone(date_str): +def extract_timezone(date_str, default=None): m = re.search( r'''(?x) ^.{8,}? # >=8 char non-TZ prefix, if present @@ -1146,22 +1146,23 @@ def extract_timezone(date_str): (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm $) ''', date_str) + timezone = None + if not m: m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str) timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip()) if timezone is not None: date_str = date_str[:-len(m.group('tz'))] - timezone = dt.timedelta(hours=timezone or 0) + timezone = dt.timedelta(hours=timezone) else: date_str = date_str[:-len(m.group('tz'))] - if not m.group('sign'): - timezone = dt.timedelta() - else: + if m.group('sign'): sign = 1 if m.group('sign') == '+' else -1 timezone = dt.timedelta( hours=sign * int(m.group('hours')), minutes=sign * int(m.group('minutes'))) - return timezone, date_str + + return timezone or default or dt.timedelta(), date_str def parse_iso8601(date_str, delimiter='T', timezone=None): @@ -1172,8 +1173,7 @@ def parse_iso8601(date_str, delimiter='T', timezone=None): date_str = re.sub(r'\.[0-9]+', '', date_str) - if timezone is None: - timezone, date_str = extract_timezone(date_str) + timezone, date_str = extract_timezone(date_str, timezone) with contextlib.suppress(ValueError): date_format = f'%Y-%m-%d{delimiter}%H:%M:%S' From b229dcf1b103dc071de263eb9434e99b1e79ee33 Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Sat, 4 May 2024 21:18:53 +1200 Subject: [PATCH 2/6] add test for override --- test/test_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index e1404d17e..e4c039fb2 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -771,6 +771,8 @@ class TestUtil(unittest.TestCase): self.assertEqual(parse_iso8601('2014-03-23T23:04:26+0100'), 1395612266) self.assertEqual(parse_iso8601('2014-03-23T23:04:26-07:00'), 1395641066) self.assertEqual(parse_iso8601('2014-03-23T23:04:26', timezone=dt.timedelta(hours=-7)), 1395641066) +
# default does not override timezone in date_str + self.assertEqual(parse_iso8601('2014-03-23T23:04:26-07:00', timezone=dt.timedelta(hours=-10)), 1395641066) self.assertEqual(parse_iso8601('2014-03-23T22:04:26+0000'), 1395612266) self.assertEqual(parse_iso8601('2014-03-23T22:04:26Z'), 1395612266) self.assertEqual(parse_iso8601('2014-03-23T22:04:26.1234Z'), 1395612266) From a7e0a8452b908743be05456b47506bdf9d3872d9 Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Sat, 4 May 2024 21:20:42 +1200 Subject: [PATCH 3/6] linter --- test/test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index e4c039fb2..405d93c25 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -772,7 +772,7 @@ class TestUtil(unittest.TestCase): self.assertEqual(parse_iso8601('2014-03-23T23:04:26-07:00'), 1395641066) self.assertEqual(parse_iso8601('2014-03-23T23:04:26', timezone=dt.timedelta(hours=-7)), 1395641066) # default does not override timezone in date_str - self.assertEqual(parse_iso8601('2014-03-23T23:04:26-07:00', timezone=dt.timedelta(hours=-10)), 1395641066) + self.assertEqual(parse_iso8601('2014-03-23T23:04:26-07:00', timezone=dt.timedelta(hours=-10)), 1395641066) self.assertEqual(parse_iso8601('2014-03-23T22:04:26+0000'), 1395612266) self.assertEqual(parse_iso8601('2014-03-23T22:04:26Z'), 1395612266) self.assertEqual(parse_iso8601('2014-03-23T22:04:26.1234Z'), 1395612266) From a1af9ffe272fda8c67f524b9daa17d339ae3f7a7 Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Sat, 4 May 2024 21:46:31 +1200 Subject: [PATCH 4/6] Do not extract timestamp on 3.8 if timezone is not present --- test/test_utils.py | 2 ++ yt_dlp/extractor/youtube.py | 2 +- yt_dlp/utils/_utils.py | 7 +++++-- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 405d93c25..77fadbbea 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -28,6 +28,7 @@ from yt_dlp.utils import ( ExtractorError, 
InAdvancePagedList, LazyList, + NO_DEFAULT, OnDemandPagedList, Popen, age_restricted, @@ -771,6 +772,7 @@ class TestUtil(unittest.TestCase): self.assertEqual(parse_iso8601('2014-03-23T23:04:26+0100'), 1395612266) self.assertEqual(parse_iso8601('2014-03-23T23:04:26-07:00'), 1395641066) self.assertEqual(parse_iso8601('2014-03-23T23:04:26', timezone=dt.timedelta(hours=-7)), 1395641066) + self.assertEqual(parse_iso8601('2014-03-23T23:04:26', timezone=NO_DEFAULT), None) # default does not override timezone in date_str self.assertEqual(parse_iso8601('2014-03-23T23:04:26-07:00', timezone=dt.timedelta(hours=-10)), 1395641066) self.assertEqual(parse_iso8601('2014-03-23T22:04:26+0000'), 1395612266) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 4070a9c72..2e166b356 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -4557,7 +4557,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Python 3.8 should be deprecated soon # This fallback may make the timestamp slightly inaccurate for 3.8 users. 
if sys.version_info < (3, 9): - return dt.timedelta(hours=-7) + return NO_DEFAULT from zoneinfo import ZoneInfo return dt.datetime.now(ZoneInfo('US/Pacific')).utcoffset() diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index c7c6bdbdd..5f458ea45 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -1162,7 +1162,10 @@ def extract_timezone(date_str, default=None): hours=sign * int(m.group('hours')), minutes=sign * int(m.group('minutes'))) - return timezone or default or dt.timedelta(), date_str + if timezone is None and default is not NO_DEFAULT: + timezone = default or dt.timedelta() + + return timezone, date_str def parse_iso8601(date_str, delimiter='T', timezone=None): @@ -1175,7 +1178,7 @@ def parse_iso8601(date_str, delimiter='T', timezone=None): timezone, date_str = extract_timezone(date_str, timezone) - with contextlib.suppress(ValueError): + with contextlib.suppress(ValueError, TypeError): date_format = f'%Y-%m-%d{delimiter}%H:%M:%S' dt_ = dt.datetime.strptime(date_str, date_format) - timezone return calendar.timegm(dt_.timetuple()) From 481ada34375741b17da521655b101aa2668ce9f2 Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Sat, 4 May 2024 22:05:18 +1200 Subject: [PATCH 5/6] Do not extract timestamp if timezone not available --- yt_dlp/extractor/youtube.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 2e166b356..f0c23daee 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -4553,19 +4553,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': format_field(channel_handle, None, 'https://www.youtube.com/%s', default=None), }) - def get_pacific_tz(): - # Python 3.8 should be deprecated soon - # This fallback may make the timestamp slightly inaccurate for 3.8 users. 
- if sys.version_info < (3, 9): - return NO_DEFAULT - from zoneinfo import ZoneInfo - return dt.datetime.now(ZoneInfo('US/Pacific')).utcoffset() - - # We only want timestamp IF it has second precision - # Additionally, if there is no timezone present, we should assume it is in PT. + # We only want timestamp IF it has second precision AND a timezone + # Currently the uploadDate in microformats appears to be in US/Pacific timezone. timestamp = ( - parse_iso8601(get_first(microformats, 'uploadDate'), timezone=get_pacific_tz()) - or parse_iso8601(search_meta('uploadDate'), timezone=get_pacific_tz()) + parse_iso8601(get_first(microformats, 'uploadDate'), timezone=NO_DEFAULT) + or parse_iso8601(search_meta('uploadDate'), timezone=NO_DEFAULT) ) upload_date = ( dt.datetime.fromtimestamp(timestamp, dt.timezone.utc).strftime('%Y%m%d') if timestamp else From 919824f299d8faeead88be828ce2ef97c70efdff Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Sat, 4 May 2024 22:05:57 +1200 Subject: [PATCH 6/6] comment --- yt_dlp/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index f0c23daee..a01dd2eae 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -4553,7 +4553,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': format_field(channel_handle, None, 'https://www.youtube.com/%s', default=None), }) - # We only want timestamp IF it has second precision AND a timezone + # We only want timestamp IF it has time precision AND a timezone # Currently the uploadDate in microformats appears to be in US/Pacific timezone. timestamp = ( parse_iso8601(get_first(microformats, 'uploadDate'), timezone=NO_DEFAULT)