[wsj:article] Add extractor

This commit is contained in:
John Hawkinson 2017-03-25 19:47:48 -04:00 committed by Sergey M․
parent 0254f93b08
commit 3266d08af2
No known key found for this signature in database
GPG key ID: 2C393E0F18A9236D
2 changed files with 29 additions and 4 deletions

View file

@ -1233,7 +1233,10 @@
WrzutaIE, WrzutaIE,
WrzutaPlaylistIE, WrzutaPlaylistIE,
) )
from .wsj import WSJIE from .wsj import (
WSJIE,
WSJArticleIE,
)
from .xbef import XBefIE from .xbef import XBefIE
from .xboxclips import XboxClipsIE from .xboxclips import XboxClipsIE
from .xfileshare import XFileShareIE from .xfileshare import XFileShareIE

View file

@ -10,10 +10,11 @@
class WSJIE(InfoExtractor): class WSJIE(InfoExtractor):
_VALID_URL = r'''(?x)https?:// _VALID_URL = r'''(?x)
(?: (?:
video-api\.wsj\.com/api-video/player/iframe\.html\?guid=| https?://video-api\.wsj\.com/api-video/player/iframe\.html\?guid=|
(?:www\.)?wsj\.com/video/[^/]+/ https?://(?:www\.)?wsj\.com/video/[^/]+/|
wsj:
) )
(?P<id>[a-zA-Z0-9-]+)''' (?P<id>[a-zA-Z0-9-]+)'''
IE_DESC = 'Wall Street Journal' IE_DESC = 'Wall Street Journal'
@ -87,3 +88,24 @@ def _real_extract(self, url):
'title': title, 'title': title,
'categories': info.get('keywords'), 'categories': info.get('keywords'),
} }
class WSJArticleIE(InfoExtractor):
    """Extractor for wsj.com article pages that embed a WSJ video.

    The article page is downloaded, the GUID of the embedded video is read
    from a ``data-src`` attribute in the markup, and the actual extraction
    is delegated to ``WSJIE`` through an internal ``wsj:<guid>`` URL.
    """

    # The id stops at the first '/', '?', '#' or '&' so that a query string
    # or fragment (the test URL below ends in a bare '?') never becomes part
    # of the article id.
    _VALID_URL = r'(?i)https?://(?:www\.)?wsj\.com/articles/(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'https://www.wsj.com/articles/dont-like-china-no-pandas-for-you-1490366939?',
        'info_dict': {
            'id': '4B13FA62-1D8C-45DB-8EA1-4105CB20B362',
            'ext': 'mp4',
            'upload_date': '20170221',
            'uploader_id': 'ralcaraz',
            'title': 'Bao Bao the Panda Leaves for China',
        }
    }]

    def _real_extract(self, url):
        """Resolve an article URL to the WSJ video embedded in its page.

        Returns the result of ``self.url_result`` pointing at
        ``wsj:<video guid>``, to be handled by ``WSJIE``.
        """
        article_id = self._match_id(url)
        webpage = self._download_webpage(url, article_id)
        # Only accept a full 36-character GUID (hex digits and dashes); a
        # looser character class would also match the leading characters of
        # unrelated data-src values such as image file names.
        video_id = self._search_regex(
            r'data-src=["\']([a-fA-F0-9-]{36})', webpage, 'video id')
        return self.url_result('wsj:%s' % video_id, WSJIE.ie_key(), video_id)