From d891ff9fd9952b2829a47b508acf40d951546123 Mon Sep 17 00:00:00 2001 From: Witold Baryluk Date: Fri, 23 Mar 2012 06:15:57 +0100 Subject: [PATCH 1/4] Ignore leading spaces (and trailing also) in all URL from url list or command line --- youtube-dl | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube-dl b/youtube-dl index 5224611d2..b466c1570 100755 --- a/youtube-dl +++ b/youtube-dl @@ -4563,6 +4563,7 @@ def _real_main(): except IOError: sys.exit(u'ERROR: batch file could not be read') all_urls = batchurls + args + all_urls = map(lambda url: url.strip(), all_urls) # General configuration cookie_processor = urllib2.HTTPCookieProcessor(jar) From 69d3b2d8249e677c40d529e017cb765f50ba3fa0 Mon Sep 17 00:00:00 2001 From: Witold Baryluk Date: Fri, 23 Mar 2012 06:17:29 +0100 Subject: [PATCH 2/4] Extract original URL from next_url parameter of verify_age page, before actual extract --- youtube-dl | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/youtube-dl b/youtube-dl index b466c1570..d8b33e52c 100755 --- a/youtube-dl +++ b/youtube-dl @@ -1171,7 +1171,9 @@ class InfoExtractor(object): class YoutubeIE(InfoExtractor): """Information extractor for youtube.com.""" - _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$' + _PREFIX = r'(?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)' + _VALID_URL = r'^('+_PREFIX+r'(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$' + _VALID_URL_WITH_AGE = r'^('+_PREFIX+')verify_age\?next_url=([^&]+)(?:.+)?$' _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1' _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en' _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en' @@ -1335,6 +1337,14 @@ class YoutubeIE(InfoExtractor): return def _real_extract(self, url): + # Extract original video URL from URL with age verification, using next_url parameter + mobj = re.match(self._VALID_URL_WITH_AGE, url) + if mobj: + urldecode = lambda x: re.sub(r'%([0-9a-hA-H][0-9a-hA-H])', lambda m: chr(int(m.group(1), 16)), x) + # Keep original domain. We can probably change to www.youtube.com, but it should not hurt so keep it. + # We just make sure we do not have double //, in URL, so we strip starting slash in next_url. + url = mobj.group(1) + re.sub(r'^/', '', urldecode(mobj.group(2))) + # Extract video id from URL mobj = re.match(self._VALID_URL, url) if mobj is None: From 071940680fc19be26e068c4c72d508ca252c1031 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Sat, 24 Mar 2012 01:17:36 +0100 Subject: [PATCH 3/4] Always extract original URL from next_url (#318) --- youtube-dl | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/youtube-dl b/youtube-dl index d8b33e52c..c7a116d9d 100755 --- a/youtube-dl +++ b/youtube-dl @@ -1171,12 +1171,11 @@ class InfoExtractor(object): class YoutubeIE(InfoExtractor): """Information extractor for youtube.com.""" - _PREFIX = r'(?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)' - _VALID_URL = r'^('+_PREFIX+r'(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$' - _VALID_URL_WITH_AGE = r'^('+_PREFIX+')verify_age\?next_url=([^&]+)(?:.+)?$' + _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$' _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1' _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en' _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en' + _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' _NETRC_MACHINE = 'youtube' # Listed in order of quality _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13'] @@ -1337,13 +1336,10 @@ class YoutubeIE(InfoExtractor): return def _real_extract(self, url): - # Extract original video URL from URL with age verification, using next_url parameter - mobj = re.match(self._VALID_URL_WITH_AGE, url) + # Extract original video URL from URL with redirection, like age verification, using next_url parameter + mobj = re.search(self._NEXT_URL_RE, url) if mobj: - urldecode = lambda x: re.sub(r'%([0-9a-hA-H][0-9a-hA-H])', lambda m: chr(int(m.group(1), 16)), x) - # Keep original domain. We can probably change to www.youtube.com, but it should not hurt so keep it. - # We just make sure we do not have double //, in URL, so we strip starting slash in next_url. - url = mobj.group(1) + re.sub(r'^/', '', urldecode(mobj.group(2))) + url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/') # Extract video id from URL mobj = re.match(self._VALID_URL, url) From 74a5ff5f43fcdf4e1de2375b3edabd69ee78ebae Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Sat, 24 Mar 2012 01:23:19 +0100 Subject: [PATCH 4/4] transplant ceba827e9aab563ae7c7190fc236ec1aa358ee59, d891ff9fd9952b2829a47b508acf40d951546123, 69d3b2d8249e677c40d529e017cb765f50ba3fa0, 071940680fc19be26e068c4c72d508ca252c1031 --- youtube-dl | 1 + youtube_dl/__init__.py | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/youtube-dl b/youtube-dl index c7a116d9d..595cce497 100755 --- a/youtube-dl +++ b/youtube-dl @@ -15,6 +15,7 @@ __authors__ = ( 'Kevin Ngo', 'Ori Avtalion', 'shizeeg', + 'Filippo Valsorda', ) __license__ = 'Public Domain' diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 5f874b72f..595cce497 100755 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -1176,6 +1176,7 @@ class YoutubeIE(InfoExtractor): _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1' _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en' _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en' + _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' _NETRC_MACHINE = 'youtube' # Listed in order of quality _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13'] @@ -1336,6 +1337,11 @@ def _real_initialize(self): return def _real_extract(self, url): + # Extract original video URL from URL with redirection, like age verification, using next_url parameter + mobj = re.search(self._NEXT_URL_RE, url) + if mobj: + url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/') + # Extract video id from URL mobj = re.match(self._VALID_URL, url) if mobj is None: @@ -4564,6 +4570,7 @@ def _real_main(): except IOError: sys.exit(u'ERROR: batch file could not be read') all_urls = batchurls + args + all_urls = map(lambda url: url.strip(), all_urls) # General configuration cookie_processor = urllib2.HTTPCookieProcessor(jar)