[cbc] Improve extraction (closes #16583, closes #16593)

This commit is contained in:
Sergey M․ 2018-06-02 00:35:07 +07:00
parent b995043ab8
commit f20f636596
No known key found for this signature in database
GPG key ID: 2C393E0F18A9236D

View file

@@ -17,6 +17,7 @@
     xpath_element,
     xpath_with_ns,
     find_xpath_attr,
+    orderedSet,
     parse_duration,
     parse_iso8601,
     parse_age_limit,
@@ -136,9 +137,15 @@ def _real_extract(self, url):
         entries = [
             self._extract_player_init(player_init, display_id)
             for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)]
+        media_ids = []
+        for media_id_re in (
+                r'<iframe[^>]+src="[^"]+?mediaId=(\d+)"',
+                r'<div[^>]+\bid=["\']player-(\d+)',
+                r'guid["\']\s*:\s*["\'](\d+)'):
+            media_ids.extend(re.findall(media_id_re, webpage))
         entries.extend([
             self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id)
-            for media_id in re.findall(r'<iframe[^>]+src="[^"]+?mediaId=(\d+)"', webpage)])
+            for media_id in orderedSet(media_ids)])
         return self.playlist_result(
             entries, display_id, strip_or_none(title),
             self._og_search_description(webpage))