mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2024-11-30 12:01:28 +00:00
[NBC] Enhance embedURL extraction (closes #2549)
This commit is contained in:
parent
dc1eed93be
commit
0fe2ff78e6
|
@ -53,6 +53,7 @@
|
||||||
unified_strdate,
|
unified_strdate,
|
||||||
unsmuggle_url,
|
unsmuggle_url,
|
||||||
uppercase_escape,
|
uppercase_escape,
|
||||||
|
lowercase_escape,
|
||||||
url_basename,
|
url_basename,
|
||||||
urlencode_postdata,
|
urlencode_postdata,
|
||||||
version_tuple,
|
version_tuple,
|
||||||
|
@ -418,6 +419,10 @@ def test_uppercase_escape(self):
|
||||||
self.assertEqual(uppercase_escape('aä'), 'aä')
|
self.assertEqual(uppercase_escape('aä'), 'aä')
|
||||||
self.assertEqual(uppercase_escape('\\U0001d550'), '𝕐')
|
self.assertEqual(uppercase_escape('\\U0001d550'), '𝕐')
|
||||||
|
|
||||||
|
def test_lowercase_escape(self):
|
||||||
|
self.assertEqual(lowercase_escape('aä'), 'aä')
|
||||||
|
self.assertEqual(lowercase_escape('\\u0026'), '&')
|
||||||
|
|
||||||
def test_limit_length(self):
|
def test_limit_length(self):
|
||||||
self.assertEqual(limit_length(None, 12), None)
|
self.assertEqual(limit_length(None, 12), None)
|
||||||
self.assertEqual(limit_length('foo', 12), 'foo')
|
self.assertEqual(limit_length('foo', 12), 'foo')
|
||||||
|
|
|
@ -10,6 +10,8 @@
|
||||||
from ..utils import (
|
from ..utils import (
|
||||||
ExtractorError,
|
ExtractorError,
|
||||||
find_xpath_attr,
|
find_xpath_attr,
|
||||||
|
lowercase_escape,
|
||||||
|
unescapeHTML,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -46,18 +48,23 @@ class NBCIE(InfoExtractor):
|
||||||
'description': 'md5:0b40f9cbde5b671a7ff62fceccc4f442',
|
'description': 'md5:0b40f9cbde5b671a7ff62fceccc4f442',
|
||||||
},
|
},
|
||||||
'skip': 'Only works from US',
|
'skip': 'Only works from US',
|
||||||
|
},
|
||||||
|
{
|
||||||
|
# This video has expired but with an escaped embedURL
|
||||||
|
'url': 'http://www.nbc.com/parenthood/episode-guide/season-5/just-like-at-home/515',
|
||||||
|
'skip': 'Expired'
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|
||||||
def _real_extract(self, url):
|
def _real_extract(self, url):
|
||||||
video_id = self._match_id(url)
|
video_id = self._match_id(url)
|
||||||
webpage = self._download_webpage(url, video_id)
|
webpage = self._download_webpage(url, video_id)
|
||||||
theplatform_url = self._search_regex(
|
theplatform_url = unescapeHTML(lowercase_escape(self._html_search_regex(
|
||||||
[
|
[
|
||||||
r'(?:class="video-player video-player-full" data-mpx-url|class="player" src)="(.*?)"',
|
r'(?:class="video-player video-player-full" data-mpx-url|class="player" src)="(.*?)"',
|
||||||
r'"embedURL"\s*:\s*"([^"]+)"'
|
r'"embedURL"\s*:\s*"([^"]+)"'
|
||||||
],
|
],
|
||||||
webpage, 'theplatform url').replace('_no_endcard', '')
|
webpage, 'theplatform url').replace('_no_endcard', '').replace('\\/', '/')))
|
||||||
if theplatform_url.startswith('//'):
|
if theplatform_url.startswith('//'):
|
||||||
theplatform_url = 'http:' + theplatform_url
|
theplatform_url = 'http:' + theplatform_url
|
||||||
return self.url_result(theplatform_url)
|
return self.url_result(theplatform_url)
|
||||||
|
|
|
@ -1486,6 +1486,14 @@ def uppercase_escape(s):
|
||||||
s)
|
s)
|
||||||
|
|
||||||
|
|
||||||
|
def lowercase_escape(s):
    """Decode lowercase ``\\uXXXX`` escape sequences found in *s*.

    Every literal backslash-``u`` followed by exactly four hex digits is
    replaced by the character it denotes; all other text is returned
    unchanged.  A high surrogate escape immediately followed by a low
    surrogate escape (the way JSON spells characters outside the BMP, e.g.
    ``\\ud835\\udd50``) is combined into the single code point it encodes
    instead of being turned into two lone surrogates.

    Returns the decoded string.
    """
    unicode_escape = codecs.getdecoder('unicode_escape')

    def _decode(m):
        high, low = m.group(1), m.group(2)
        if low is None:
            # Plain BMP escape (or an unpaired surrogate): decode as-is.
            return unicode_escape(m.group(0))[0]
        # Surrogate pair: rebuild the astral code point and decode it via
        # the 8-digit \U form so the decoder produces one character.
        code_point = (
            0x10000
            + ((int(high, 16) - 0xD800) << 10)
            + (int(low, 16) - 0xDC00))
        return unicode_escape('\\U%08x' % code_point)[0]

    return re.sub(
        # First alternative: high surrogate (D800-DBFF) followed by low
        # surrogate (DC00-DFFF).  Second alternative: any single escape.
        r'\\u([dD][89abAB][0-9a-fA-F]{2})\\u([dD][c-fC-F][0-9a-fA-F]{2})'
        r'|\\u[0-9a-fA-F]{4}',
        _decode,
        s)
|
||||||
|
|
||||||
|
|
||||||
def escape_rfc3986(s):
|
def escape_rfc3986(s):
|
||||||
"""Escape non-ASCII characters as suggested by RFC 3986"""
|
"""Escape non-ASCII characters as suggested by RFC 3986"""
|
||||||
if sys.version_info < (3, 0) and isinstance(s, compat_str):
|
if sys.version_info < (3, 0) and isinstance(s, compat_str):
|
||||||
|
|
Loading…
Reference in a new issue