[youporn] Improve formats extraction

2024-11-30 12:01:28 +00:00 · 2017-06-22 00:40:15 +07:00 · 2017-06-22 00:40:15 +07:00 · d4893e764b
parent 97b6e30113
commit d4893e764b
1 changed files with 24 additions and 8 deletions
--- a/youtube_dl/extractor/youporn.py
+++ b/youtube_dl/extractor/youporn.py
@ -3,6 +3,7 @@
 import re
 from .common import InfoExtractor
 from ..compat import compat_str
 from ..utils import (
    int_or_none,
    sanitized_Request,
@ -68,7 +69,7 @@ def _real_extract(self, url):
        webpage = self._download_webpage(request, display_id)
        title = self._search_regex(
-            [r'(?:video_titles|videoTitle|title)\s*[:=]\s*(["\'])(?P<title>(?:(?!\1).)+)\1',
+            [r'(?:video_titles|videoTitle)\s*[:=]\s*(["\'])(?P<title>(?:(?!\1).)+)\1',
             r'<h1[^>]+class=["\']heading\d?["\'][^>]*>(?P<title>[^<]+)<'],
            webpage, 'title', group='title',
            default=None) or self._og_search_title(
@ -77,22 +78,37 @@ def _real_extract(self, url):
        links = []
        # Main source
        definitions = self._parse_json(
            self._search_regex(
                r'mediaDefinition\s*=\s*(\[.+?\]);', webpage,
                'media definitions', default='[]'),
            video_id, fatal=False)
        if definitions:
            for definition in definitions:
                if not isinstance(definition, dict):
                    continue
                video_url = definition.get('videoUrl')
                if isinstance(video_url, compat_str) and video_url:
                    links.append(video_url)
        # Fallback #1, this also contains extra low quality 180p format
        for _, link in re.findall(r'<a[^>]+href=(["\'])(http.+?)\1[^>]+title=["\']Download [Vv]ideo', webpage):
            links.append(link)
        # Fallback #2 (unavailable as at 22.06.2017)
        sources = self._search_regex(
            r'(?s)sources\s*:\s*({.+?})', webpage, 'sources', default=None)
        if sources:
            for _, link in re.findall(r'[^:]+\s*:\s*(["\'])(http.+?)\1', sources):
                links.append(link)
-        # Fallback #1
+        # Fallback #3 (unavailable as at 22.06.2017)
        for _, link in re.findall(
-                r'(?:videoUrl|videoSrc|videoIpadUrl|html5PlayerSrc)\s*[:=]\s*(["\'])(http.+?)\1', webpage):
+                r'(?:videoSrc|videoIpadUrl|html5PlayerSrc)\s*[:=]\s*(["\'])(http.+?)\1', webpage):
            links.append(link)
-        # Fallback #2, this also contains extra low quality 180p format
+        # Fallback #4, encrypted links (unavailable as at 22.06.2017)
        for _, link in re.findall(r'<a[^>]+href=(["\'])(http.+?)\1[^>]+title=["\']Download [Vv]ideo', webpage):
            links.append(link)
        # Fallback #3, encrypted links
        for _, encrypted_link in re.findall(
                r'encryptedQuality\d{3,4}URL\s*=\s*(["\'])([\da-zA-Z+/=]+)\1', webpage):
            links.append(aes_decrypt_text(encrypted_link, title, 32).decode('utf-8'))