Merge branch 'polskie-radio-programme' of https://github.com/JakubAdamWieczorek/youtube-dl

This commit is contained in:
Sergey M․ 2016-09-10 00:42:36 +07:00
commit 1e19ff2984
No known key found for this signature in database
GPG key ID: 2C393E0F18A9236D
2 changed files with 80 additions and 1 deletions

View file

@ -671,7 +671,7 @@
)
from .podomatic import PodomaticIE
from .pokemon import PokemonIE
from .polskieradio import PolskieRadioIE
from .polskieradio import PolskieRadioIE, PolskieRadioProgrammeIE
from .porn91 import Porn91IE
from .porncom import PornComIE
from .pornhd import PornHdIE

View file

@ -7,6 +7,7 @@
from ..compat import (
compat_str,
compat_urllib_parse_unquote,
compat_urlparse
)
from ..utils import (
int_or_none,
@ -15,6 +16,84 @@
)
class PolskieRadioProgrammeIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+(,[^/]+)?/(?P<id>\d+)'
_TESTS = [{
'url': 'http://www.polskieradio.pl/7/5102,HISTORIA-ZYWA',
'info_dict': {
'id': '5102',
'title': 'HISTORIA ŻYWA',
},
'playlist_mincount': 34,
}, {
'url': 'http://www.polskieradio.pl/7/4807',
'info_dict': {
'id': '4807',
'title': 'Vademecum 1050. rocznicy Chrztu Polski'
},
'playlist_mincount': 5
}, {
'url': 'http://www.polskieradio.pl/7/129,Sygnaly-dnia?ref=source',
'only_matching': True
}, {
'url': 'http://www.polskieradio.pl/37,RedakcjaKatolicka/4143,Kierunek-Krakow',
'info_dict': {
'id': '4143',
'title': 'Kierunek Kraków',
},
'playlist_mincount': 61
}, {
'url': 'http://www.polskieradio.pl/7,Jedynka/5102,HISTORIA-ZYWA',
'only_matching': True
}]
def _get_entries_from_page_content(self, base_url, content):
entries = []
articles = re.findall(
r'<article class="ID-(\d+) article">\s+<a href="([^"]+)"( data-layer="[^"]*")? class="[^"]*" title="([^"]+)">',
content)
for article_id, article_url, _, article_title in articles:
resolved_article_url = compat_urlparse.urljoin(base_url, article_url)
entries.append(self.url_result(
resolved_article_url,
ie='PolskieRadio',
video_id=article_id,
video_title=article_title))
return entries
@classmethod
def suitable(cls, url):
return False if PolskieRadioIE.suitable(url) else super(PolskieRadioProgrammeIE, cls).suitable(url)
def _real_extract(self, url):
programme_id = self._match_id(url)
webpage = self._download_webpage(url, programme_id)
title = self._html_search_regex(
r'<a href="[^"]+" id=".*_linkCategory" title="[^"]+">(.+?)</a>',
webpage, 'title', fatal=False)
description = None
entries = self._get_entries_from_page_content(url, webpage)
pages = re.findall(r'<a( href="([^"]+/Strona/)\d+")? id="[^"]+" title="strona&#32;(\d+)"', webpage)
page_count = max(int(page_number) for _, _, page_number in pages) if pages else 1
if page_count > 1:
page_url_root = next(url for _, url, _ in pages if len(url) > 0)
for page_number in range(2, page_count + 1):
page_url = page_url_root + str(page_number)
resolved_page_url = compat_urlparse.urljoin(url, page_url)
page_content = self._download_webpage(
resolved_page_url, programme_id,
note="Downloading page number %d" % page_number)
entries.extend(self._get_entries_from_page_content(url, page_content))
return self.playlist_result(entries, programme_id, title, description)
class PolskieRadioIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+/\d+/Artykul/(?P<id>[0-9]+)'
_TESTS = [{