diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py index e58385c57..ccc2d476d 100644 --- a/youtube_dl/extractor/tagesschau.py +++ b/youtube_dl/extractor/tagesschau.py @@ -4,7 +4,10 @@ import re from .common import InfoExtractor -from ..utils import parse_filesize +from ..utils import ( + determine_ext, + parse_filesize, +) class TagesschauIE(InfoExtractor): @@ -82,37 +85,54 @@ class TagesschauIE(InfoExtractor): 'xxl': {'quality': 5}, } - def _extract_formats(self, download_text): + def _extract_formats(self, download_text, media_kind): links = re.finditer( r'
', download_text) formats = [] for l in links: + link_url = l.group('url') + if not link_url: + continue format_id = self._search_regex( - r'.*/[^/.]+\.([^/]+)\.[^/.]+', l.group('url'), 'format ID') + r'.*/[^/.]+\.([^/]+)\.[^/.]+$', link_url, 'format ID', + default=determine_ext(link_url)) format = { 'format_id': format_id, 'url': l.group('url'), 'format_name': l.group('name'), } - m = re.match( - r'''(?x) - Video:\s*(?PWir bieten dieses Video in folgenden Formaten zum Download an:
\s*' + DOWNLOAD_REGEX = r'(?s)
Wir bieten dieses (?P
' webpage_type = self._og_search_property('type', webpage, default=None) if webpage_type == 'website': # Article entries = [] - for num, (entry_title, download_text) in enumerate(re.findall( + for num, (entry_title, media_kind, download_text) in enumerate(re.findall( r'(?s)
]+class="infotext"[^>]*>.*?(.+?).*?
.*?%s' % DOWNLOAD_REGEX, webpage)): entries.append({ 'id': display_id, 'title': '%s-%d' % (entry_title, num), - 'formats': self._extract_formats(download_text), + 'formats': self._extract_formats(download_text, media_kind), }) return self.playlist_result(entries, display_id, title) else: # Assume single video - download_text = self._search_regex(DOWNLOAD_REGEX, webpage, 'download links') - formats = self._extract_formats(download_text) + download_text = self._search_regex( + DOWNLOAD_REGEX, webpage, 'download links', group='links') + media_kind = self._search_regex( + DOWNLOAD_REGEX, webpage, 'media kind', default='Video', group='links') + formats = self._extract_formats(download_text, media_kind) thumbnail = self._og_search_thumbnail(webpage) description = self._html_search_regex( r'(?s) ',