mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2024-12-02 19:36:45 +00:00
[generic] Add support for BOMs (Fixes #4753)
This commit is contained in:
parent
317239b097
commit
61ca9a80b3
|
@ -28,6 +28,7 @@
|
||||||
fix_xml_ampersands,
|
fix_xml_ampersands,
|
||||||
InAdvancePagedList,
|
InAdvancePagedList,
|
||||||
intlist_to_bytes,
|
intlist_to_bytes,
|
||||||
|
is_html,
|
||||||
js_to_json,
|
js_to_json,
|
||||||
limit_length,
|
limit_length,
|
||||||
OnDemandPagedList,
|
OnDemandPagedList,
|
||||||
|
@ -417,5 +418,21 @@ def test_age_restricted(self):
|
||||||
self.assertTrue(age_restricted(18, 14))
|
self.assertTrue(age_restricted(18, 14))
|
||||||
self.assertFalse(age_restricted(18, 18))
|
self.assertFalse(age_restricted(18, 18))
|
||||||
|
|
||||||
|
def test_is_html(self):
    """Exercise is_html() with non-markup bytes and BOM-prefixed markup."""
    # Bytes in front of the tag mean the data does not start with markup.
    self.assertFalse(is_html(b'\x49\x44\x43<html'))

    # Each sample starts with markup, optionally behind a byte order mark.
    markup_samples = (
        # No BOM, followed by a byte that is not valid UTF-8
        b'<!DOCTYPE foo>\xaaa',
        # UTF-8 with BOM
        b'\xef\xbb\xbf<!DOCTYPE foo>\xaaa',
        # UTF-16-LE
        b'\xff\xfe<\x00h\x00t\x00m\x00l\x00>\x00\xe4\x00',
        # UTF-16-BE
        b'\xfe\xff\x00<\x00h\x00t\x00m\x00l\x00>\x00\xe4',
        # UTF-32-BE
        b'\x00\x00\xFE\xFF\x00\x00\x00<\x00\x00\x00h\x00\x00\x00t\x00\x00\x00m\x00\x00\x00l\x00\x00\x00>\x00\x00\x00\xe4',
        # UTF-32-LE
        b'\xFF\xFE\x00\x00<\x00\x00\x00h\x00\x00\x00t\x00\x00\x00m\x00\x00\x00l\x00\x00\x00>\x00\x00\x00\xe4\x00\x00\x00',
    )
    for sample in markup_samples:
        self.assertTrue(is_html(sample))
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|
|
@ -17,6 +17,7 @@
|
||||||
ExtractorError,
|
ExtractorError,
|
||||||
float_or_none,
|
float_or_none,
|
||||||
HEADRequest,
|
HEADRequest,
|
||||||
|
is_html,
|
||||||
orderedSet,
|
orderedSet,
|
||||||
parse_xml,
|
parse_xml,
|
||||||
smuggle_url,
|
smuggle_url,
|
||||||
|
@ -647,7 +648,7 @@ def _real_extract(self, url):
|
||||||
# Maybe it's a direct link to a video?
|
# Maybe it's a direct link to a video?
|
||||||
# Be careful not to download the whole thing!
|
# Be careful not to download the whole thing!
|
||||||
first_bytes = full_response.read(512)
|
first_bytes = full_response.read(512)
|
||||||
if not re.match(r'^\s*<', first_bytes.decode('utf-8', 'replace')):
|
if not is_html(first_bytes):
|
||||||
self._downloader.report_warning(
|
self._downloader.report_warning(
|
||||||
'URL could be a direct video link, returning it as such.')
|
'URL could be a direct video link, returning it as such.')
|
||||||
upload_date = unified_strdate(
|
upload_date = unified_strdate(
|
||||||
|
|
|
@ -1631,3 +1631,23 @@ def age_restricted(content_limit, age_limit):
|
||||||
if content_limit is None:
|
if content_limit is None:
|
||||||
return False # Content available for everyone
|
return False # Content available for everyone
|
||||||
return age_limit < content_limit
|
return age_limit < content_limit
|
||||||
|
|
||||||
|
|
||||||
|
def is_html(first_bytes):
    """Detect whether a file contains HTML by examining its first bytes.

    A leading byte order mark (UTF-8, UTF-16 or UTF-32, either endianness)
    selects the encoding used to decode the data and is stripped, including
    any immediate repetitions of the same mark.  Without a mark the data is
    decoded as UTF-8.  Undecodable bytes are replaced instead of raising.

    first_bytes -- the beginning of the file, as a byte string

    Returns the match object (truthy) if the decoded text starts with
    optional whitespace followed by '<', otherwise None.
    """

    # Order matters: the UTF-32-LE mark begins with the UTF-16-LE mark, so
    # the UTF-32 entries have to be tried before the UTF-16 ones.
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]

    encoding = 'utf-8'
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            encoding = enc
            # A repeated mark would decode to a leading U+FEFF, which \s
            # does not match, so drop every repetition before decoding.
            while first_bytes.startswith(bom):
                first_bytes = first_bytes[len(bom):]
            break

    return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
|
||||||
|
|
Loading…
Reference in a new issue