[youtube:playlist] Improve flat extraction (closes #21927)

This commit is contained in:
Sergey M․ 2019-08-13 05:02:52 +07:00
parent 3bce4ff7d9
commit 351f37c022
No known key found for this signature in database
GPG key ID: 2C393E0F18A9236D

View file

@ -31,6 +31,7 @@
clean_html, clean_html,
dict_get, dict_get,
error_to_compat_str, error_to_compat_str,
extract_attributes,
ExtractorError, ExtractorError,
float_or_none, float_or_none,
get_element_by_attribute, get_element_by_attribute,
@ -324,17 +325,18 @@ def _process_page(self, content):
for video_id, video_title in self.extract_videos_from_page(content): for video_id, video_title in self.extract_videos_from_page(content):
yield self.url_result(video_id, 'Youtube', video_id, video_title) yield self.url_result(video_id, 'Youtube', video_id, video_title)
def extract_videos_from_page(self, page): def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page):
ids_in_page = [] for mobj in re.finditer(video_re, page):
titles_in_page = []
for mobj in re.finditer(self._VIDEO_RE, page):
# The link with index 0 is not the first video of the playlist (not sure if still actual) # The link with index 0 is not the first video of the playlist (not sure if still actual)
if 'index' in mobj.groupdict() and mobj.group('id') == '0': if 'index' in mobj.groupdict() and mobj.group('id') == '0':
continue continue
video_id = mobj.group('id') video_id = mobj.group('id')
video_title = unescapeHTML(mobj.group('title')) video_title = unescapeHTML(
mobj.group('title')) if 'title' in mobj.groupdict() else None
if video_title: if video_title:
video_title = video_title.strip() video_title = video_title.strip()
if video_title == '► Play all':
video_title = None
try: try:
idx = ids_in_page.index(video_id) idx = ids_in_page.index(video_id)
if video_title and not titles_in_page[idx]: if video_title and not titles_in_page[idx]:
@ -342,6 +344,12 @@ def extract_videos_from_page(self, page):
except ValueError: except ValueError:
ids_in_page.append(video_id) ids_in_page.append(video_id)
titles_in_page.append(video_title) titles_in_page.append(video_title)
def extract_videos_from_page(self, page):
ids_in_page = []
titles_in_page = []
self.extract_videos_from_page_impl(
self._VIDEO_RE, page, ids_in_page, titles_in_page)
return zip(ids_in_page, titles_in_page) return zip(ids_in_page, titles_in_page)
@ -2438,7 +2446,8 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
(%(playlist_id)s) (%(playlist_id)s)
)""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
_TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
_VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:&amp;(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?' _VIDEO_RE_TPL = r'href="\s*/watch\?v=%s(?:&amp;(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?'
_VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})'
IE_NAME = 'youtube:playlist' IE_NAME = 'youtube:playlist'
_TESTS = [{ _TESTS = [{
'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
@ -2603,6 +2612,34 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
def _real_initialize(self): def _real_initialize(self):
self._login() self._login()
def extract_videos_from_page(self, page):
    """Collect (video id, video title) pairs from a playlist page.

    Elements carrying a ``data-video-id`` attribute are read first; their
    ``data-title`` attribute, when present, supplies the title. Afterwards
    the page is scanned again with the regular ``_VIDEO_RE`` and two more
    permissive patterns, each handed to ``extract_videos_from_page_impl``
    together with the lists gathered so far.

    Returns the ids zipped with their titles; a title may be None.
    """
    video_ids = []
    video_titles = []

    # Primary source: opening tags that expose the id as an attribute.
    tag_re = r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)'
    for tag in re.findall(tag_re, page):
        tag_attrs = extract_attributes(tag)
        current_id = tag_attrs['data-video-id']
        current_title = unescapeHTML(tag_attrs.get('data-title'))
        # Only non-empty titles are stripped; None / '' pass through as is.
        current_title = current_title.strip() if current_title else current_title
        video_ids.append(current_id)
        video_titles.append(current_title)

    fallback_patterns = (
        # Fallback with old _VIDEO_RE
        self._VIDEO_RE,
        # Relaxed fallbacks
        r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})',
        r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})',
    )
    for pattern in fallback_patterns:
        self.extract_videos_from_page_impl(
            pattern, page, video_ids, video_titles)

    return zip(video_ids, video_titles)
def _extract_mix(self, playlist_id): def _extract_mix(self, playlist_id):
# The mixes are generated from a single video # The mixes are generated from a single video
# the id of the playlist is just 'RD' + video_id # the id of the playlist is just 'RD' + video_id