From ef9f2ba7afe0966b7d65158b663f9fcc11db3fd1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 21 Jan 2014 19:44:47 +0100 Subject: [PATCH 1/6] [mtv] Use unicode_literals --- youtube_dl/extractor/mtv.py | 44 +++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index c4fa16fb6d..e24f226560 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re import xml.etree.ElementTree @@ -36,7 +38,7 @@ def _get_thumbnail_url(self, uri, itemdoc): def _extract_video_formats(self, metadataXml): if '/error_country_block.swf' in metadataXml: - raise ExtractorError(u'This video is not available from your country.', expected=True) + raise ExtractorError('This video is not available from your country.', expected=True) mdoc = xml.etree.ElementTree.fromstring(metadataXml.encode('utf-8')) formats = [] @@ -60,11 +62,11 @@ def _get_video_info(self, itemdoc): self.report_extraction(video_id) mediagen_url = itemdoc.find('%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content'))).attrib['url'] # Remove the templates, like &device={device} - mediagen_url = re.sub(r'&[^=]*?={.*?}(?=(&|$))', u'', mediagen_url) + mediagen_url = re.sub(r'&[^=]*?={.*?}(?=(&|$))', '', mediagen_url) if 'acceptMethods' not in mediagen_url: mediagen_url += '&acceptMethods=fms' mediagen_page = self._download_webpage(mediagen_url, video_id, - u'Downloading video urls') + 'Downloading video urls') description_node = itemdoc.find('description') if description_node is not None: @@ -86,7 +88,7 @@ def _get_videos_info(self, uri): idoc = self._download_xml( self._FEED_URL + '?' 
+ data, video_id, - u'Downloading info', transform_source=fix_xml_ampersands) + 'Downloading info', transform_source=fix_xml_ampersands) return [self._get_video_info(item) for item in idoc.findall('.//item')] @@ -99,25 +101,25 @@ class MTVIE(MTVServicesInfoExtractor): _TESTS = [ { - u'url': u'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml', - u'file': u'853555.mp4', - u'md5': u'850f3f143316b1e71fa56a4edfd6e0f8', - u'info_dict': { - u'title': u'Taylor Swift - "Ours (VH1 Storytellers)"', - u'description': u'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.', + 'url': 'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml', + 'file': '853555.mp4', + 'md5': '850f3f143316b1e71fa56a4edfd6e0f8', + 'info_dict': { + 'title': 'Taylor Swift - "Ours (VH1 Storytellers)"', + 'description': 'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.', }, }, { - u'add_ie': ['Vevo'], - u'url': u'http://www.mtv.com/videos/taylor-swift/916187/everything-has-changed-ft-ed-sheeran.jhtml', - u'file': u'USCJY1331283.mp4', - u'md5': u'73b4e7fcadd88929292fe52c3ced8caf', - u'info_dict': { - u'title': u'Everything Has Changed', - u'upload_date': u'20130606', - u'uploader': u'Taylor Swift', + 'add_ie': ['Vevo'], + 'url': 'http://www.mtv.com/videos/taylor-swift/916187/everything-has-changed-ft-ed-sheeran.jhtml', + 'file': 'USCJY1331283.mp4', + 'md5': '73b4e7fcadd88929292fe52c3ced8caf', + 'info_dict': { + 'title': 'Everything Has Changed', + 'upload_date': '20130606', + 'uploader': 'Taylor Swift', }, - u'skip': u'VEVO is only available in some countries', + 'skip': 'VEVO is only available in some countries', }, ] @@ -136,8 +138,8 @@ def _real_extract(self, url): webpage, re.DOTALL) if m_vevo: vevo_id = m_vevo.group(1); - self.to_screen(u'Vevo video detected: %s' % vevo_id) + self.to_screen('Vevo video detected: %s' % vevo_id) return self.url_result('vevo:%s' % vevo_id, ie='Vevo') - uri = 
self._html_search_regex(r'/uri/(.*?)\?', webpage, u'uri') + uri = self._html_search_regex(r'/uri/(.*?)\?', webpage, 'uri') return self._get_videos_info(uri) From e4f320a4d044b690721016e36972cd547ee787d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 21 Jan 2014 19:57:38 +0100 Subject: [PATCH 2/6] =?UTF-8?q?[mtv]=20Check=20for=20geo-blocked=20videos?= =?UTF-8?q?=20in=20the=20xml=20document,=20not=20in=20the=20xml=E2=80=99s?= =?UTF-8?q?=20string?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allows using the `_download_xml` method --- youtube_dl/extractor/mtv.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index e24f226560..485c1fd7d0 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -1,7 +1,6 @@ from __future__ import unicode_literals import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -36,10 +35,9 @@ def _get_thumbnail_url(self, uri, itemdoc): else: return thumb_node.attrib['url'] - def _extract_video_formats(self, metadataXml): - if '/error_country_block.swf' in metadataXml: + def _extract_video_formats(self, mdoc): + if re.match(r'.*/error_country_block\.swf$', mdoc.find('.//src').text) is not None: raise ExtractorError('This video is not available from your country.', expected=True) - mdoc = xml.etree.ElementTree.fromstring(metadataXml.encode('utf-8')) formats = [] for rendition in mdoc.findall('.//rendition'): @@ -65,8 +63,8 @@ def _get_video_info(self, itemdoc): mediagen_url = re.sub(r'&[^=]*?={.*?}(?=(&|$))', '', mediagen_url) if 'acceptMethods' not in mediagen_url: mediagen_url += '&acceptMethods=fms' - mediagen_page = self._download_webpage(mediagen_url, video_id, - 'Downloading video urls') + mediagen_doc = self._download_xml(mediagen_url, video_id, + 'Downloading video urls') description_node = 
itemdoc.find('description') if description_node is not None: @@ -76,7 +74,7 @@ def _get_video_info(self, itemdoc): return { 'title': itemdoc.find('title').text, - 'formats': self._extract_video_formats(mediagen_page), + 'formats': self._extract_video_formats(mediagen_doc), 'id': video_id, 'thumbnail': self._get_thumbnail_url(uri, itemdoc), 'description': description, From 8d9453b9e852b585cd7d0228c126d36b682af42f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 21 Jan 2014 20:54:47 +0100 Subject: [PATCH 3/6] Add an extractor for spike.com (#2072) Added a generic _real_extract to MTVServicesInfoExtractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/mtv.py | 13 +++++++++++++ youtube_dl/extractor/spike.py | 19 +++++++++++++++++++ 3 files changed, 33 insertions(+) create mode 100644 youtube_dl/extractor/spike.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 7b374f7b91..d37f0a1783 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -171,6 +171,7 @@ from .space import SpaceIE from .spankwire import SpankwireIE from .spiegel import SpiegelIE +from .spike import SpikeIE from .stanfordoc import StanfordOpenClassroomIE from .statigram import StatigramIE from .steam import SteamIE diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 485c1fd7d0..5171155017 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -7,6 +7,8 @@ compat_urllib_parse, ExtractorError, fix_xml_ampersands, + url_basename, + RegexNotFoundError, ) def _media_xml_tag(tag): @@ -89,6 +91,17 @@ def _get_videos_info(self, uri): 'Downloading info', transform_source=fix_xml_ampersands) return [self._get_video_info(item) for item in idoc.findall('.//item')] + def _real_extract(self, url): + title = url_basename(url) + webpage = self._download_webpage(url, title) + try: + # the url is in the format 
http://media.mtvnservices.com/fb/{mgid}.swf + fb_url = self._og_search_video_url(webpage) + mgid = url_basename(fb_url).rpartition('.')[0] + except RegexNotFoundError: + mgid = self._search_regex(r'data-mgid="(.*?)"', webpage, u'mgid') + return self._get_videos_info(mgid) + class MTVIE(MTVServicesInfoExtractor): _VALID_URL = r'''(?x)^https?:// diff --git a/youtube_dl/extractor/spike.py b/youtube_dl/extractor/spike.py new file mode 100644 index 0000000000..71a9aaa24d --- /dev/null +++ b/youtube_dl/extractor/spike.py @@ -0,0 +1,19 @@ +from __future__ import unicode_literals + +from .mtv import MTVServicesInfoExtractor + + +class SpikeIE(MTVServicesInfoExtractor): + _VALID_URL = r'https?://www\.spike\.com/(video-clips|episodes)/.+' + _TEST = { + 'url': 'http://www.spike.com/video-clips/lhtu8m/auction-hunters-can-allen-ride-a-hundred-year-old-motorcycle', + 'md5': '1a9265f32b0c375793d6c4ce45255256', + 'info_dict': { + 'id': 'b9c8221a-4e50-479a-b86d-3333323e38ba', + 'ext': 'mp4', + 'title': 'Can Allen Ride A Hundred Year-Old Motorcycle?', + 'description': 'md5:fbed7e82ed5fad493615b3094a9499cb', + }, + } + + _FEED_URL = 'http://www.spike.com/feeds/mrss/' From bc4ba05fcbb20dfead6796b0878427b51c9f150a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 21 Jan 2014 20:59:31 +0100 Subject: [PATCH 4/6] [mtv] Add an extractor for mtviggy.com (#2072) --- youtube_dl/extractor/__init__.py | 5 ++++- youtube_dl/extractor/mtv.py | 14 ++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index d37f0a1783..4d6aeabdf6 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -119,7 +119,10 @@ from .mixcloud import MixcloudIE from .mpora import MporaIE from .mofosex import MofosexIE -from .mtv import MTVIE +from .mtv import ( + MTVIE, + MTVIggyIE, +) from .muzu import MuzuTVIE from .myspace import MySpaceIE from .myspass 
import MySpassIE diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 5171155017..127fbeb4ed 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -154,3 +154,17 @@ def _real_extract(self, url): uri = self._html_search_regex(r'/uri/(.*?)\?', webpage, 'uri') return self._get_videos_info(uri) + + +class MTVIggyIE(MTVServicesInfoExtractor): + IE_NAME = 'mtviggy.com' + _VALID_URL = r'https?://www\.mtviggy\.com/videos/.+' + _TEST = { + 'url': 'http://www.mtviggy.com/videos/arcade-fire-behind-the-scenes-at-the-biggest-music-experiment-yet/', + 'info_dict': { + 'id': '984696', + 'ext': 'mp4', + 'title': 'Short', + } + } + _FEED_URL = 'http://all.mtvworldverticals.com/feed-xml/' From d614aa40e35825e1cde7c92fc6092d226afe4898 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 21 Jan 2014 21:53:10 +0100 Subject: [PATCH 5/6] [brightcove] Fix check for url in the result MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It may have the ‘formats’ field instead of ‘url’. 
--- youtube_dl/extractor/brightcove.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index b873dc0d44..e13c040f8b 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -230,6 +230,6 @@ def _extract_video_info(self, video_info): else: return ad_info - if 'url' not in info: + if 'url' not in info and not info.get('formats'): raise ExtractorError('Unable to extract video url for %s' % info['id']) return info From 47917f24c499f7949b04a23c35459ca69adae62d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 21 Jan 2014 22:04:46 +0100 Subject: [PATCH 6/6] [brightcove] Fix extraction of embedded videos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There was a leading ‘:’ in the regex. The ‘flashvars’ parameter is not always available. --- youtube_dl/extractor/brightcove.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index e13c040f8b..e1c45d1f0b 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -90,9 +90,12 @@ def _build_brighcove_url(cls, object_str): object_doc = xml.etree.ElementTree.fromstring(object_str) fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars') - flashvars = dict( - (k, v[0]) - for k, v in compat_parse_qs(fv_el.attrib['value']).items()) + if fv_el is not None: + flashvars = dict( + (k, v[0]) + for k, v in compat_parse_qs(fv_el.attrib['value']).items()) + else: + flashvars = {} def find_param(name): if name in flashvars: @@ -131,7 +134,7 @@ def _extract_brightcove_url(cls, webpage): m_brightcove = re.search( r'''(?sx)<object (?: - :[^>]+?class=([\'"])[^>]*?BrightcoveExperience.*?\1 | + [^>]+?class=([\'"])[^>]*?BrightcoveExperience.*?\1 | [^>]*?>\s*<param\s+name="movie"\s+value="https?://[^/]*brightcove\.com/ ).+?</object>''', webpage)