From ac12e888f9d76c75666822a79c125db8577e5fa5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 5 May 2016 21:02:54 +0600 Subject: [PATCH] [redtube] Extract all formats, duration, upload date and view count (Closes #9397) --- youtube_dl/extractor/redtube.py | 59 +++++++++++++++++++++++++++------ 1 file changed, 49 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index 7ba41ba593..1e532286c7 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -1,7 +1,12 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import ( + ExtractorError, + int_or_none, + str_to_int, + unified_strdate, +) class RedTubeIE(InfoExtractor): @@ -13,6 +18,9 @@ class RedTubeIE(InfoExtractor): 'id': '66418', 'ext': 'mp4', 'title': 'Sucked on a toilet', + 'upload_date': '20120831', + 'duration': 596, + 'view_count': int, 'age_limit': 18, } } @@ -24,12 +32,40 @@ def _real_extract(self, url): if any(s in webpage for s in ['video-deleted-info', '>This video has been removed']): raise ExtractorError('Video %s has been removed' % video_id, expected=True) - video_url = self._html_search_regex( - r'', webpage, 'video URL') - video_title = self._html_search_regex( - r'

(.+?)

', - webpage, 'title') - video_thumbnail = self._og_search_thumbnail(webpage) + title = self._html_search_regex( + (r'

(?P.+?)</h1>', + r'videoTitle\s*:\s*(["\'])(?P<title>)\1'), + webpage, 'title', group='title') + + formats = [] + sources = self._parse_json( + self._search_regex( + r'sources\s*:\s*({.+?})', webpage, 'source', default='{}'), + video_id, fatal=False) + if sources and isinstance(sources, dict): + for format_id, format_url in sources.items(): + if format_url: + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'height': int_or_none(format_id), + }) + else: + video_url = self._html_search_regex( + r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL') + formats.append({'url': video_url}) + self._sort_formats(formats) + + thumbnail = self._og_search_thumbnail(webpage) + upload_date = unified_strdate(self._search_regex( + r'<span[^>]+class="added-time"[^>]*>ADDED ([^<]+)<', + webpage, 'upload date', fatal=False)) + duration = int_or_none(self._search_regex( + r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False)) + view_count = str_to_int(self._search_regex( + r'<span[^>]*>VIEWS</span></td>\s*<td>([\d,.]+)', + webpage, 'view count', fatal=False)) + # No self-labeling, but they describe themselves as # "Home of Videos Porno" @@ -37,9 +73,12 @@ def _real_extract(self, url): return { 'id': video_id, - 'url': video_url, 'ext': 'mp4', - 'title': video_title, - 'thumbnail': video_thumbnail, + 'title': title, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'duration': duration, + 'view_count': view_count, 'age_limit': age_limit, + 'formats': formats, }