From 2bca84e345b3f64f832128dbe24b3384e7ec9751 Mon Sep 17 00:00:00 2001 From: 5moufl Date: Sat, 13 Sep 2014 17:47:19 +0200 Subject: [PATCH 1/2] [BehindKink] Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/behindkink.py | 56 ++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100644 youtube_dl/extractor/behindkink.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 5e7a0a775..5a02bea05 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -25,6 +25,7 @@ from .bandcamp import BandcampIE, BandcampAlbumIE from .bbccouk import BBCCoUkIE from .beeg import BeegIE +from .behindkink import BehindKinkIE from .bilibili import BiliBiliIE from .blinkx import BlinkxIE from .bliptv import BlipTVIE, BlipTVUserIE diff --git a/youtube_dl/extractor/behindkink.py b/youtube_dl/extractor/behindkink.py new file mode 100644 index 000000000..f0a86fda3 --- /dev/null +++ b/youtube_dl/extractor/behindkink.py @@ -0,0 +1,56 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import url_basename + + +class BehindKinkIE(InfoExtractor): + _VALID_URL = r'(?:http://)(?:www\.)?behindkink\.com/(?P<year>[0-9]{4})/(?P<month>[0-9]{2})/(?P<day>[0-9]{2})/(?P<id>[^/?_]+)' + _TEST = { + 'url': 'http://www.behindkink.com/2014/08/14/ab1576-performers-voice-finally-heard-the-bill-is-killed/', + 'md5': '41ad01222b8442089a55528fec43ec01', + 'info_dict': { + 'id': '36370', + 'ext': 'mp4', + 'title': 'AB1576 - PERFORMERS VOICE FINALLY HEARD - THE BILL IS KILLED!', + 'description': 'The adult industry voice was finally heard as Assembly Bill 1576 remained\xa0 in suspense today at the Senate Appropriations Hearing. 
AB1576 was, among other industry damaging issues, a condom mandate...', + 'upload_date': '20140814', + 'thumbnail': 'http://www.behindkink.com/wp-content/uploads/2014/08/36370_AB1576_Win.jpg', + 'age_limit': 18, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('id') + year = mobj.group('year') + month = mobj.group('month') + day = mobj.group('day') + upload_date = year + month + day + + webpage_url = 'http://www.behindkink.com/' + year + '/' + month + '/' + day + '/' + display_id + webpage = self._download_webpage(webpage_url, display_id) + + self.report_extraction(display_id) + video_url = self._search_regex( + r"'file':\s*'([^']+)'", + webpage, 'URL base') + + video_id = url_basename(video_url) + video_id = video_id.split('_')[0] + self.report_extraction(video_id) + + return { + 'id': video_id, + 'url': video_url, + 'ext': 'mp4', + 'title': self._og_search_title(webpage), + 'display_id': display_id, + 'thumbnail': self._og_search_thumbnail(webpage), + 'description': self._og_search_description(webpage), + 'upload_date': upload_date, + 'age_limit': 18, + } From 6d1f2431bd3ac1bd7d59c01747c190f42656f383 Mon Sep 17 00:00:00 2001 From: 5moufl Date: Mon, 15 Sep 2014 15:09:17 +0200 Subject: [PATCH 2/2] [BehindKink] Minor fixes - fix _VALID_URL regex - remove unnecessary variable - remove second call of report_extraction --- youtube_dl/extractor/behindkink.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/behindkink.py b/youtube_dl/extractor/behindkink.py index f0a86fda3..12f13aae9 100644 --- a/youtube_dl/extractor/behindkink.py +++ b/youtube_dl/extractor/behindkink.py @@ -8,7 +8,7 @@ class BehindKinkIE(InfoExtractor): - _VALID_URL = r'(?:http://)(?:www\.)?behindkink\.com/(?P<year>[0-9]{4})/(?P<month>[0-9]{2})/(?P<day>[0-9]{2})/(?P<id>[^/?_]+)' + _VALID_URL = r'http://(?:www\.)?behindkink\.com/(?P<year>[0-9]{4})/(?P<month>[0-9]{2})/(?P<day>[0-9]{2})/(?P<id>[^/#?_]+)' _TEST = { 'url': 
'http://www.behindkink.com/2014/08/14/ab1576-performers-voice-finally-heard-the-bill-is-killed/', 'md5': '41ad01222b8442089a55528fec43ec01', @@ -31,8 +31,7 @@ def _real_extract(self, url): day = mobj.group('day') upload_date = year + month + day - webpage_url = 'http://www.behindkink.com/' + year + '/' + month + '/' + day + '/' + display_id - webpage = self._download_webpage(webpage_url, display_id) + webpage = self._download_webpage(url, display_id) self.report_extraction(display_id) video_url = self._search_regex( @@ -41,7 +40,6 @@ def _real_extract(self, url): video_id = url_basename(video_url) video_id = video_id.split('_')[0] - self.report_extraction(video_id) return { 'id': video_id,