[ted] Fix playlist extraction (closes #20844)

This commit is contained in:
biwubo 2019-05-09 18:11:27 +00:00 committed by Sergey M․
parent 4831ef7fe4
commit c2ee6fa66a
No known key found for this signature in database
GPG key ID: 2C393E0F18A9236D

View file

@ -5,8 +5,12 @@
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str from ..compat import (
compat_str,
compat_urlparse
)
from ..utils import ( from ..utils import (
extract_attributes,
float_or_none, float_or_none,
int_or_none, int_or_none,
try_get, try_get,
@ -20,7 +24,7 @@ class TEDIE(InfoExtractor):
(?P<proto>https?://) (?P<proto>https?://)
(?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/ (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
( (
(?P<type_playlist>playlists(?:/\d+)?) # We have a playlist (?P<type_playlist>playlists(?:/(?P<playlist_id>\d+))?) # We have a playlist
| |
((?P<type_talk>talks)) # We have a simple talk ((?P<type_talk>talks)) # We have a simple talk
| |
@ -84,6 +88,7 @@ class TEDIE(InfoExtractor):
'info_dict': { 'info_dict': {
'id': '10', 'id': '10',
'title': 'Who are the hackers?', 'title': 'Who are the hackers?',
'description': 'md5:49a0dbe8fb76d81a0e64b4a80af7f15a'
}, },
'playlist_mincount': 6, 'playlist_mincount': 6,
}, { }, {
@ -150,22 +155,19 @@ def _playlist_videos_info(self, url, name):
webpage = self._download_webpage(url, name, webpage = self._download_webpage(url, name,
'Downloading playlist webpage') 'Downloading playlist webpage')
info = self._extract_info(webpage)
playlist_info = try_get( playlist_entries = []
info, lambda x: x['__INITIAL_DATA__']['playlist'], for entry in re.findall(r'(?s)<[^>]+data-ga-context="playlist"[^>]*>', webpage):
dict) or info['playlist'] attrs = extract_attributes(entry)
entry_url = compat_urlparse.urljoin(url, attrs['href'])
playlist_entries.append(self.url_result(entry_url, self.ie_key()))
playlist_entries = [ final_url = self._og_search_url(webpage)
self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
for talk in try_get(
info, lambda x: x['__INITIAL_DATA__']['talks'],
dict) or info['talks']
]
return self.playlist_result( return self.playlist_result(
playlist_entries, playlist_entries,
playlist_id=compat_str(playlist_info['id']), playlist_id=re.match(self._VALID_URL, final_url, re.VERBOSE).group('playlist_id'),
playlist_title=playlist_info['title']) playlist_title=self._og_search_title(webpage),
playlist_description=self._og_search_description(webpage))
def _talk_info(self, url, video_name): def _talk_info(self, url, video_name):
webpage = self._download_webpage(url, video_name) webpage = self._download_webpage(url, video_name)