Improve URL extraction

This commit is contained in:
Sergey M․ 2018-07-21 19:08:28 +07:00
parent 4ecf300d13
commit 3052a30d42
No known key found for this signature in database
GPG key ID: 2C393E0F18A9236D
47 changed files with 166 additions and 139 deletions

View file

@ -7,6 +7,7 @@
from ..utils import ( from ..utils import (
int_or_none, int_or_none,
strip_or_none, strip_or_none,
url_or_none,
) )
@ -98,7 +99,7 @@ def _real_extract(self, url):
if not video_id: if not video_id:
entries = [] entries = []
for episode in video_data.get('archiveEpisodes', []): for episode in video_data.get('archiveEpisodes', []):
episode_url = episode.get('url') episode_url = url_or_none(episode.get('url'))
if not episode_url: if not episode_url:
continue continue
entries.append(self.url_result( entries.append(self.url_result(

View file

@ -9,6 +9,7 @@
determine_ext, determine_ext,
ExtractorError, ExtractorError,
int_or_none, int_or_none,
url_or_none,
urlencode_postdata, urlencode_postdata,
xpath_text, xpath_text,
) )
@ -304,7 +305,7 @@ def _real_extract(self, url):
file_elements = video_element.findall(compat_xpath('./file')) file_elements = video_element.findall(compat_xpath('./file'))
one = len(file_elements) == 1 one = len(file_elements) == 1
for file_num, file_element in enumerate(file_elements, start=1): for file_num, file_element in enumerate(file_elements, start=1):
file_url = file_element.text file_url = url_or_none(file_element.text)
if not file_url: if not file_url:
continue continue
key = file_element.get('key', '') key = file_element.get('key', '')

View file

@ -3,11 +3,12 @@
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
int_or_none,
parse_iso8601,
mimetype2ext,
determine_ext, determine_ext,
ExtractorError, ExtractorError,
int_or_none,
mimetype2ext,
parse_iso8601,
url_or_none,
) )
@ -35,7 +36,7 @@ def get_media_node(name, default=None):
media_thumbnail = [media_thumbnail] media_thumbnail = [media_thumbnail]
for thumbnail_data in media_thumbnail: for thumbnail_data in media_thumbnail:
thumbnail = thumbnail_data.get('@attributes', {}) thumbnail = thumbnail_data.get('@attributes', {})
thumbnail_url = thumbnail.get('url') thumbnail_url = url_or_none(thumbnail.get('url'))
if not thumbnail_url: if not thumbnail_url:
continue continue
thumbnails.append({ thumbnails.append({
@ -51,7 +52,7 @@ def get_media_node(name, default=None):
media_subtitle = [media_subtitle] media_subtitle = [media_subtitle]
for subtitle_data in media_subtitle: for subtitle_data in media_subtitle:
subtitle = subtitle_data.get('@attributes', {}) subtitle = subtitle_data.get('@attributes', {})
subtitle_href = subtitle.get('href') subtitle_href = url_or_none(subtitle.get('href'))
if not subtitle_href: if not subtitle_href:
continue continue
subtitles.setdefault(subtitle.get('lang') or 'en', []).append({ subtitles.setdefault(subtitle.get('lang') or 'en', []).append({
@ -65,7 +66,7 @@ def get_media_node(name, default=None):
media_content = [media_content] media_content = [media_content]
for media_data in media_content: for media_data in media_content:
media = media_data.get('@attributes', {}) media = media_data.get('@attributes', {})
media_url = media.get('url') media_url = url_or_none(media.get('url'))
if not media_url: if not media_url:
continue continue
ext = mimetype2ext(media.get('type')) or determine_ext(media_url) ext = mimetype2ext(media.get('type')) or determine_ext(media_url)
@ -79,7 +80,7 @@ def get_media_node(name, default=None):
else: else:
formats.append({ formats.append({
'format_id': media_data.get('media-category', {}).get('@attributes', {}).get('label'), 'format_id': media_data.get('media-category', {}).get('@attributes', {}).get('label'),
'url': media['url'], 'url': media_url,
'tbr': int_or_none(media.get('bitrate')), 'tbr': int_or_none(media.get('bitrate')),
'filesize': int_or_none(media.get('fileSize')), 'filesize': int_or_none(media.get('fileSize')),
'ext': ext, 'ext': ext,

View file

@ -8,6 +8,7 @@
determine_ext, determine_ext,
extract_attributes, extract_attributes,
ExtractorError, ExtractorError,
url_or_none,
urlencode_postdata, urlencode_postdata,
urljoin, urljoin,
) )
@ -165,7 +166,7 @@ def extract_info(html, video_id, num=None):
}, fatal=False) }, fatal=False)
if not playlist: if not playlist:
continue continue
stream_url = playlist.get('streamurl') stream_url = url_or_none(playlist.get('streamurl'))
if stream_url: if stream_url:
rtmp = re.search( rtmp = re.search(
r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+/))(?P<playpath>mp[34]:.+)', r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+/))(?P<playpath>mp[34]:.+)',

View file

@ -7,6 +7,7 @@
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
int_or_none, int_or_none,
url_or_none,
) )
@ -77,7 +78,7 @@ def _real_extract(self, url):
formats.extend(self._extract_m3u8_formats( formats.extend(self._extract_m3u8_formats(
m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
for rendition in video_data.get('renditions', []): for rendition in video_data.get('renditions', []):
video_url = rendition.get('url') video_url = url_or_none(rendition.get('url'))
if not video_url: if not video_url:
continue continue
ext = rendition.get('format') ext = rendition.get('format')

View file

@ -4,10 +4,10 @@
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str
from ..utils import ( from ..utils import (
determine_ext, determine_ext,
js_to_json, js_to_json,
url_or_none,
) )
@ -68,8 +68,8 @@ def _real_extract(self, url):
for source in sources: for source in sources:
if not isinstance(source, dict): if not isinstance(source, dict):
continue continue
source_url = source.get('file') source_url = url_or_none(source.get('file'))
if not source_url or not isinstance(source_url, compat_str): if not source_url:
continue continue
ext = determine_ext(source_url) ext = determine_ext(source_url)
if ext == 'm3u8': if ext == 'm3u8':

View file

@ -5,6 +5,7 @@
from ..utils import ( from ..utils import (
int_or_none, int_or_none,
mimetype2ext, mimetype2ext,
url_or_none,
) )
@ -43,7 +44,7 @@ def _real_extract(self, url):
formats = [] formats = []
for item in file_list[0]: for item in file_list[0]:
file_url = item.get('file') file_url = url_or_none(item.get('file'))
if not file_url: if not file_url:
continue continue
ext = mimetype2ext(item.get('type')) ext = mimetype2ext(item.get('type'))

View file

@ -5,7 +5,6 @@
from .common import InfoExtractor from .common import InfoExtractor
from .generic import GenericIE from .generic import GenericIE
from ..compat import compat_str
from ..utils import ( from ..utils import (
determine_ext, determine_ext,
ExtractorError, ExtractorError,
@ -15,6 +14,7 @@
unified_strdate, unified_strdate,
xpath_text, xpath_text,
update_url_query, update_url_query,
url_or_none,
) )
from ..compat import compat_etree_fromstring from ..compat import compat_etree_fromstring
@ -100,7 +100,7 @@ def _extract_formats(self, media_info, video_id):
quality = stream.get('_quality') quality = stream.get('_quality')
server = stream.get('_server') server = stream.get('_server')
for stream_url in stream_urls: for stream_url in stream_urls:
if not isinstance(stream_url, compat_str) or '//' not in stream_url: if not url_or_none(stream_url):
continue continue
ext = determine_ext(stream_url) ext = determine_ext(stream_url)
if quality != 'auto' and ext in ('f4m', 'm3u8'): if quality != 'auto' and ext in ('f4m', 'm3u8'):

View file

@ -19,6 +19,7 @@
unescapeHTML, unescapeHTML,
update_url_query, update_url_query,
unified_strdate, unified_strdate,
url_or_none,
) )
@ -131,8 +132,8 @@ def _real_extract(self, url):
fatal=False) fatal=False)
if not stat: if not stat:
continue continue
retry_url = stat.get('retry_url') retry_url = url_or_none(stat.get('retry_url'))
if not isinstance(retry_url, compat_str): if not retry_url:
continue continue
formats.append({ formats.append({
'url': self._proto_relative_url(retry_url, 'http:'), 'url': self._proto_relative_url(retry_url, 'http:'),
@ -306,7 +307,7 @@ def _real_extract(self, url):
formats = [] formats = []
for format_id, format_url in show['audio_stream'].items(): for format_id, format_url in show['audio_stream'].items():
if not isinstance(format_url, compat_str): if not url_or_none(format_url):
continue continue
for known_ext in KNOWN_EXTENSIONS: for known_ext in KNOWN_EXTENSIONS:
if known_ext in format_id: if known_ext in format_id:

View file

@ -4,8 +4,10 @@
from .common import InfoExtractor from .common import InfoExtractor
from .youtube import YoutubeIE from .youtube import YoutubeIE
from ..compat import compat_str from ..utils import (
from ..utils import int_or_none int_or_none,
url_or_none,
)
class BreakIE(InfoExtractor): class BreakIE(InfoExtractor):
@ -55,8 +57,8 @@ def _real_extract(self, url):
formats = [] formats = []
for video in content: for video in content:
video_url = video.get('url') video_url = url_or_none(video.get('url'))
if not video_url or not isinstance(video_url, compat_str): if not video_url:
continue continue
bitrate = int_or_none(self._search_regex( bitrate = int_or_none(self._search_regex(
r'(\d+)_kbps', video_url, 'tbr', default=None)) r'(\d+)_kbps', video_url, 'tbr', default=None))

View file

@ -2,10 +2,10 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
int_or_none, int_or_none,
url_or_none,
) )
@ -56,8 +56,8 @@ def _real_extract(self, url):
for media in encodings: for media in encodings:
if not isinstance(media, dict): if not isinstance(media, dict):
continue continue
media_url = media.get('location') media_url = url_or_none(media.get('location'))
if not media_url or not isinstance(media_url, compat_str): if not media_url:
continue continue
format_id_list = [format_id] format_id_list = [format_id]

View file

@ -4,13 +4,13 @@
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str
from ..utils import ( from ..utils import (
clean_html, clean_html,
int_or_none, int_or_none,
parse_duration, parse_duration,
parse_iso8601, parse_iso8601,
parse_resolution, parse_resolution,
url_or_none,
) )
@ -53,8 +53,8 @@ def _real_extract(self, url):
media_url = media['media']['url'] media_url = media['media']['url']
if isinstance(media_url, list): if isinstance(media_url, list):
for format_ in media_url: for format_ in media_url:
format_url = format_.get('file') format_url = url_or_none(format_.get('file'))
if not format_url or not isinstance(format_url, compat_str): if not format_url:
continue continue
label = format_.get('label') label = format_.get('label')
f = parse_resolution(label) f = parse_resolution(label)

View file

@ -4,16 +4,14 @@
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import compat_HTTPError
compat_str,
compat_HTTPError,
)
from ..utils import ( from ..utils import (
determine_ext, determine_ext,
float_or_none, float_or_none,
int_or_none, int_or_none,
parse_age_limit, parse_age_limit,
parse_duration, parse_duration,
url_or_none,
ExtractorError ExtractorError
) )
@ -86,8 +84,8 @@ def _real_extract(self, url):
for e in media['MediaURLs']: for e in media['MediaURLs']:
if e.get('UseDRM') is True: if e.get('UseDRM') is True:
continue continue
format_url = e.get('Path') format_url = url_or_none(e.get('Path'))
if not format_url or not isinstance(format_url, compat_str): if not format_url:
continue continue
ext = determine_ext(format_url) ext = determine_ext(format_url)
if ext == 'm3u8': if ext == 'm3u8':
@ -124,8 +122,8 @@ def _real_extract(self, url):
for cc_file in cc_files: for cc_file in cc_files:
if not isinstance(cc_file, dict): if not isinstance(cc_file, dict):
continue continue
cc_url = cc_file.get('Path') cc_url = url_or_none(cc_file.get('Path'))
if not cc_url or not isinstance(cc_url, compat_str): if not cc_url:
continue continue
lang = cc_file.get('Locale') or 'en' lang = cc_file.get('Locale') or 'en'
subtitles.setdefault(lang, []).append({'url': cc_url}) subtitles.setdefault(lang, []).append({'url': cc_url})

View file

@ -7,6 +7,7 @@
float_or_none, float_or_none,
int_or_none, int_or_none,
unified_timestamp, unified_timestamp,
url_or_none,
) )
@ -69,7 +70,7 @@ def _real_extract(self, url):
endpoint = next( endpoint = next(
server['endpoint'] server['endpoint']
for server in servers for server in servers
if isinstance(server.get('endpoint'), compat_str) and if url_or_none(server.get('endpoint')) and
'cloudfront' in server['endpoint']) 'cloudfront' in server['endpoint'])
else: else:
endpoint = 'rtmpe://s2pqqn4u96e4j8.cloudfront.net/cfx/st/' endpoint = 'rtmpe://s2pqqn4u96e4j8.cloudfront.net/cfx/st/'
@ -92,8 +93,8 @@ def _real_extract(self, url):
for image in images: for image in images:
if not isinstance(image, dict): if not isinstance(image, dict):
continue continue
image_url = image.get('url') image_url = url_or_none(image.get('url'))
if not image_url or not isinstance(image_url, compat_str): if not image_url:
continue continue
thumbnails.append({ thumbnails.append({
'url': image_url, 'url': image_url,

View file

@ -3,7 +3,6 @@
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str
from ..utils import ( from ..utils import (
determine_ext, determine_ext,
extract_attributes, extract_attributes,
@ -12,6 +11,7 @@
parse_age_limit, parse_age_limit,
remove_end, remove_end,
unescapeHTML, unescapeHTML,
url_or_none,
) )
@ -69,9 +69,8 @@ def _extract_video_info(self, video, stream, display_id):
captions = stream.get('captions') captions = stream.get('captions')
if isinstance(captions, list): if isinstance(captions, list):
for caption in captions: for caption in captions:
subtitle_url = caption.get('fileUrl') subtitle_url = url_or_none(caption.get('fileUrl'))
if (not subtitle_url or not isinstance(subtitle_url, compat_str) or if not subtitle_url or not subtitle_url.startswith('http'):
not subtitle_url.startswith('http')):
continue continue
lang = caption.get('fileLang', 'en') lang = caption.get('fileLang', 'en')
ext = determine_ext(subtitle_url) ext = determine_ext(subtitle_url)

View file

@ -7,7 +7,6 @@
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import (
compat_HTTPError, compat_HTTPError,
compat_str,
compat_urlparse, compat_urlparse,
) )
from ..utils import ( from ..utils import (
@ -17,6 +16,7 @@
parse_age_limit, parse_age_limit,
parse_duration, parse_duration,
unified_timestamp, unified_timestamp,
url_or_none,
) )
@ -139,8 +139,8 @@ def _get_subtitles(self, video_id):
for sub in subs: for sub in subs:
if not isinstance(sub, dict): if not isinstance(sub, dict):
continue continue
sub_url = sub.get('url') sub_url = url_or_none(sub.get('url'))
if not sub_url or not isinstance(sub_url, compat_str): if not sub_url:
continue continue
subtitles.setdefault( subtitles.setdefault(
sub.get('code') or sub.get('language') or 'en', []).append({ sub.get('code') or sub.get('language') or 'en', []).append({
@ -163,8 +163,8 @@ def _real_extract(self, url):
for format_id, format_dict in download_assets.items(): for format_id, format_dict in download_assets.items():
if not isinstance(format_dict, dict): if not isinstance(format_dict, dict):
continue continue
format_url = format_dict.get('url') format_url = url_or_none(format_dict.get('url'))
if not format_url or not isinstance(format_url, compat_str): if not format_url:
continue continue
formats.append({ formats.append({
'url': format_url, 'url': format_url,

View file

@ -4,14 +4,12 @@
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import compat_HTTPError
compat_HTTPError,
compat_str,
)
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
int_or_none, int_or_none,
unsmuggle_url, unsmuggle_url,
url_or_none,
) )
@ -177,7 +175,7 @@ def _real_extract(self, url):
video_id, 'Downloading mp4 JSON', fatal=False) video_id, 'Downloading mp4 JSON', fatal=False)
if mp4_data: if mp4_data:
for format_id, format_url in mp4_data.get('data', {}).items(): for format_id, format_url in mp4_data.get('data', {}).items():
if not isinstance(format_url, compat_str): if not url_or_none(format_url):
continue continue
height = int_or_none(format_id) height = int_or_none(format_id)
if height is not None and m3u8_formats_dict.get(height): if height is not None and m3u8_formats_dict.get(height):

View file

@ -8,6 +8,7 @@
int_or_none, int_or_none,
try_get, try_get,
unified_timestamp, unified_timestamp,
url_or_none,
) )
@ -34,8 +35,8 @@ def _real_extract(self, url):
entries = [] entries = []
for lesson in lessons: for lesson in lessons:
lesson_url = lesson.get('http_url') lesson_url = url_or_none(lesson.get('http_url'))
if not lesson_url or not isinstance(lesson_url, compat_str): if not lesson_url:
continue continue
lesson_id = lesson.get('id') lesson_id = lesson.get('id')
if lesson_id: if lesson_id:
@ -95,7 +96,8 @@ def _real_extract(self, url):
formats = [] formats = []
for _, format_url in lesson['media_urls'].items(): for _, format_url in lesson['media_urls'].items():
if not format_url or not isinstance(format_url, compat_str): format_url = url_or_none(format_url)
if not format_url:
continue continue
ext = determine_ext(format_url) ext = determine_ext(format_url)
if ext == 'm3u8': if ext == 'm3u8':

View file

@ -11,6 +11,7 @@
int_or_none, int_or_none,
parse_duration, parse_duration,
str_to_int, str_to_int,
url_or_none,
) )
@ -82,8 +83,8 @@ def calc_hash(s):
for format_id, format_dict in formats_dict.items(): for format_id, format_dict in formats_dict.items():
if not isinstance(format_dict, dict): if not isinstance(format_dict, dict):
continue continue
src = format_dict.get('src') src = url_or_none(format_dict.get('src'))
if not isinstance(src, compat_str) or not src.startswith('http'): if not src or not src.startswith('http'):
continue continue
if kind == 'hls': if kind == 'hls':
formats.extend(self._extract_m3u8_formats( formats.extend(self._extract_m3u8_formats(

View file

@ -10,6 +10,7 @@
int_or_none, int_or_none,
qualities, qualities,
unified_strdate, unified_strdate,
url_or_none,
) )
@ -88,8 +89,8 @@ def _real_extract(self, url):
formats = [] formats = []
path = None path = None
for f in item.get('mbr', []): for f in item.get('mbr', []):
src = f.get('src') src = url_or_none(f.get('src'))
if not src or not isinstance(src, compat_str): if not src:
continue continue
tbr = int_or_none(self._search_regex( tbr = int_or_none(self._search_regex(
r'_(\d{3,})\.mp4', src, 'tbr', default=None)) r'_(\d{3,})\.mp4', src, 'tbr', default=None))

View file

@ -16,6 +16,7 @@
int_or_none, int_or_none,
parse_duration, parse_duration,
try_get, try_get,
url_or_none,
) )
from .dailymotion import DailymotionIE from .dailymotion import DailymotionIE
@ -115,14 +116,13 @@ def _extract_video(self, video_id, catalogue=None):
def sign(manifest_url, manifest_id): def sign(manifest_url, manifest_id):
for host in ('hdfauthftv-a.akamaihd.net', 'hdfauth.francetv.fr'): for host in ('hdfauthftv-a.akamaihd.net', 'hdfauth.francetv.fr'):
signed_url = self._download_webpage( signed_url = url_or_none(self._download_webpage(
'https://%s/esi/TA' % host, video_id, 'https://%s/esi/TA' % host, video_id,
'Downloading signed %s manifest URL' % manifest_id, 'Downloading signed %s manifest URL' % manifest_id,
fatal=False, query={ fatal=False, query={
'url': manifest_url, 'url': manifest_url,
}) }))
if (signed_url and isinstance(signed_url, compat_str) and if signed_url:
re.search(r'^(?:https?:)?//', signed_url)):
return signed_url return signed_url
return manifest_url return manifest_url

View file

@ -11,6 +11,7 @@
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
parse_duration, parse_duration,
url_or_none,
urlencode_postdata, urlencode_postdata,
) )
@ -80,7 +81,7 @@ def _extract_chapters(course):
chapters = [] chapters = []
lesson_elements = course.get('lessonElements') lesson_elements = course.get('lessonElements')
if isinstance(lesson_elements, list): if isinstance(lesson_elements, list):
chapters = [e for e in lesson_elements if isinstance(e, compat_str)] chapters = [url_or_none(e) for e in lesson_elements if url_or_none(e)]
return chapters return chapters
@staticmethod @staticmethod

View file

@ -32,6 +32,7 @@
unified_strdate, unified_strdate,
unsmuggle_url, unsmuggle_url,
UnsupportedError, UnsupportedError,
url_or_none,
xpath_text, xpath_text,
) )
from .commonprotocols import RtmpIE from .commonprotocols import RtmpIE
@ -3130,8 +3131,8 @@ def _real_extract(self, url):
sources = [sources] sources = [sources]
formats = [] formats = []
for source in sources: for source in sources:
src = source.get('src') src = url_or_none(source.get('src'))
if not src or not isinstance(src, compat_str): if not src:
continue continue
src = compat_urlparse.urljoin(url, src) src = compat_urlparse.urljoin(url, src)
src_type = source.get('type') src_type = source.get('type')

View file

@ -8,6 +8,7 @@
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
int_or_none, int_or_none,
url_or_none,
urlencode_postdata, urlencode_postdata,
) )
@ -80,8 +81,8 @@ def _real_extract(self, url):
bitrates = rendition.get('bitrates') bitrates = rendition.get('bitrates')
if not isinstance(bitrates, dict): if not isinstance(bitrates, dict):
continue continue
m3u8_url = bitrates.get('hls') m3u8_url = url_or_none(bitrates.get('hls'))
if not isinstance(m3u8_url, compat_str): if not m3u8_url:
continue continue
formats.extend(self._extract_m3u8_formats( formats.extend(self._extract_m3u8_formats(
m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native',
@ -93,9 +94,8 @@ def _real_extract(self, url):
if not isinstance(cc_file, list) or len(cc_file) < 3: if not isinstance(cc_file, list) or len(cc_file) < 3:
continue continue
cc_lang = cc_file[0] cc_lang = cc_file[0]
cc_url = cc_file[2] cc_url = url_or_none(cc_file[2])
if not isinstance(cc_lang, compat_str) or not isinstance( if not isinstance(cc_lang, compat_str) or not cc_url:
cc_url, compat_str):
continue continue
subtitles.setdefault(cc_lang, []).append({ subtitles.setdefault(cc_lang, []).append({
'url': cc_url, 'url': cc_url,

View file

@ -3,12 +3,12 @@
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str
from ..utils import ( from ..utils import (
determine_ext, determine_ext,
mimetype2ext, mimetype2ext,
parse_duration, parse_duration,
qualities, qualities,
url_or_none,
) )
@ -61,8 +61,8 @@ def _real_extract(self, url):
for encoding in video_metadata.get('encodings', []): for encoding in video_metadata.get('encodings', []):
if not encoding or not isinstance(encoding, dict): if not encoding or not isinstance(encoding, dict):
continue continue
video_url = encoding.get('videoUrl') video_url = url_or_none(encoding.get('videoUrl'))
if not video_url or not isinstance(video_url, compat_str): if not video_url:
continue continue
ext = determine_ext(video_url, mimetype2ext(encoding.get('mimeType'))) ext = determine_ext(video_url, mimetype2ext(encoding.get('mimeType')))
if ext == 'm3u8': if ext == 'm3u8':

View file

@ -17,6 +17,7 @@
lowercase_escape, lowercase_escape,
std_headers, std_headers,
try_get, try_get,
url_or_none,
) )
@ -170,7 +171,7 @@ def get_count(key, kind):
node = try_get(edge, lambda x: x['node'], dict) node = try_get(edge, lambda x: x['node'], dict)
if not node: if not node:
continue continue
node_video_url = try_get(node, lambda x: x['video_url'], compat_str) node_video_url = url_or_none(node.get('video_url'))
if not node_video_url: if not node_video_url:
continue continue
entries.append({ entries.append({

View file

@ -20,6 +20,7 @@
merge_dicts, merge_dicts,
parse_duration, parse_duration,
smuggle_url, smuggle_url,
url_or_none,
xpath_with_ns, xpath_with_ns,
xpath_element, xpath_element,
xpath_text, xpath_text,
@ -250,8 +251,8 @@ def extract_subtitle(sub_url):
for sub in subs: for sub in subs:
if not isinstance(sub, dict): if not isinstance(sub, dict):
continue continue
href = sub.get('Href') href = url_or_none(sub.get('Href'))
if isinstance(href, compat_str): if href:
extract_subtitle(href) extract_subtitle(href)
if not info.get('duration'): if not info.get('duration'):
info['duration'] = parse_duration(video_data.get('Duration')) info['duration'] = parse_duration(video_data.get('Duration'))

View file

@ -4,16 +4,14 @@
from .common import InfoExtractor from .common import InfoExtractor
from ..aes import aes_decrypt_text from ..aes import aes_decrypt_text
from ..compat import ( from ..compat import compat_urllib_parse_unquote
compat_str,
compat_urllib_parse_unquote,
)
from ..utils import ( from ..utils import (
determine_ext, determine_ext,
ExtractorError, ExtractorError,
int_or_none, int_or_none,
str_to_int, str_to_int,
strip_or_none, strip_or_none,
url_or_none,
) )
@ -55,7 +53,8 @@ def _extract_info(self, url, fatal=True):
encrypted = False encrypted = False
def extract_format(format_url, height=None): def extract_format(format_url, height=None):
if not isinstance(format_url, compat_str) or not format_url.startswith(('http', '//')): format_url = url_or_none(format_url)
if not format_url or not format_url.startswith(('http', '//')):
return return
if format_url in format_urls: if format_url in format_urls:
return return

View file

@ -2,11 +2,11 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str
from ..utils import ( from ..utils import (
determine_ext, determine_ext,
float_or_none, float_or_none,
int_or_none, int_or_none,
url_or_none,
) )
@ -109,7 +109,8 @@ def _real_extract(self, url):
captions = source.get('captionsAvailableLanguages') captions = source.get('captionsAvailableLanguages')
if isinstance(captions, dict): if isinstance(captions, dict):
for lang, subtitle_url in captions.items(): for lang, subtitle_url in captions.items():
if lang != 'none' and isinstance(subtitle_url, compat_str): subtitle_url = url_or_none(subtitle_url)
if lang != 'none' and subtitle_url:
subtitles.setdefault(lang, []).append({'url': subtitle_url}) subtitles.setdefault(lang, []).append({'url': subtitle_url})
return { return {

View file

@ -15,6 +15,7 @@
mimetype2ext, mimetype2ext,
unescapeHTML, unescapeHTML,
unsmuggle_url, unsmuggle_url,
url_or_none,
urljoin, urljoin,
) )
@ -156,8 +157,8 @@ def _real_extract(self, url):
stream_formats = [] stream_formats = []
for unum, VideoUrl in enumerate(video_urls): for unum, VideoUrl in enumerate(video_urls):
video_url = VideoUrl.get('Location') video_url = url_or_none(VideoUrl.get('Location'))
if not video_url or not isinstance(video_url, compat_str): if not video_url:
continue continue
# XXX: if Stream.get('CanChangeScheme', False), switch scheme to HTTP/HTTPS # XXX: if Stream.get('CanChangeScheme', False), switch scheme to HTTP/HTTPS

View file

@ -10,6 +10,7 @@
parse_resolution, parse_resolution,
try_get, try_get,
unified_timestamp, unified_timestamp,
url_or_none,
urljoin, urljoin,
) )
@ -200,8 +201,8 @@ def _real_extract(self, url):
for file_ in video['files']: for file_ in video['files']:
if not isinstance(file_, dict): if not isinstance(file_, dict):
continue continue
file_url = file_.get('fileUrl') file_url = url_or_none(file_.get('fileUrl'))
if not file_url or not isinstance(file_url, compat_str): if not file_url:
continue continue
file_size = int_or_none(file_.get('size')) file_size = int_or_none(file_.get('size'))
format_id = try_get( format_id = try_get(

View file

@ -3,12 +3,12 @@
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
int_or_none, int_or_none,
str_to_int, str_to_int,
unified_strdate, unified_strdate,
url_or_none,
) )
@ -71,8 +71,8 @@ def _real_extract(self, url):
video_id, fatal=False) video_id, fatal=False)
if medias and isinstance(medias, list): if medias and isinstance(medias, list):
for media in medias: for media in medias:
format_url = media.get('videoUrl') format_url = url_or_none(media.get('videoUrl'))
if not format_url or not isinstance(format_url, compat_str): if not format_url:
continue continue
format_id = media.get('quality') format_id = media.get('quality')
formats.append({ formats.append({

View file

@ -6,6 +6,7 @@
from ..utils import ( from ..utils import (
determine_ext, determine_ext,
int_or_none, int_or_none,
url_or_none,
) )
@ -37,8 +38,8 @@ def _real_extract(self, url):
title = config['title'] title = config['title']
formats = [] formats = []
for video in config['src']: for video in config['src']:
src = video.get('src') src = url_or_none(video.get('src'))
if not src or not isinstance(src, compat_str): if not src:
continue continue
ext = determine_ext(src) ext = determine_ext(src)
if ext == 'm3u8': if ext == 'm3u8':

View file

@ -16,6 +16,7 @@
int_or_none, int_or_none,
try_get, try_get,
unified_timestamp, unified_timestamp,
url_or_none,
) )
@ -176,8 +177,8 @@ def _entries(self, playlist_id, *args, **kwargs):
break break
for result in results: for result in results:
video_url = result.get('video_url') video_url = url_or_none(result.get('video_url'))
if not video_url or not isinstance(video_url, compat_str): if not video_url:
continue continue
entry = self._extract_video(result, require_title=False) entry = self._extract_video(result, require_title=False)
entry.update({ entry.update({

View file

@ -15,6 +15,7 @@
update_url_query, update_url_query,
ExtractorError, ExtractorError,
strip_or_none, strip_or_none,
url_or_none,
) )
@ -154,8 +155,8 @@ def _extract_cvp_info(self, data_src, video_id, path_data={}, ap_data={}):
subtitles = {} subtitles = {}
for source in video_data.findall('closedCaptions/source'): for source in video_data.findall('closedCaptions/source'):
for track in source.findall('track'): for track in source.findall('track'):
track_url = track.get('url') track_url = url_or_none(track.get('url'))
if not isinstance(track_url, compat_str) or track_url.endswith('/big'): if not track_url or track_url.endswith('/big'):
continue continue
lang = track.get('lang') or track.get('label') or 'en' lang = track.get('lang') or track.get('label') or 'en'
subtitles.setdefault(lang, []).append({ subtitles.setdefault(lang, []).append({

View file

@ -4,10 +4,10 @@
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str
from ..utils import ( from ..utils import (
int_or_none, int_or_none,
unescapeHTML, unescapeHTML,
url_or_none,
) )
@ -106,9 +106,8 @@ def _real_extract(self, url):
for stream in self._download_json(data_file, video_id): for stream in self._download_json(data_file, video_id):
if not isinstance(stream, dict): if not isinstance(stream, dict):
continue continue
stream_url = stream.get('url') stream_url = url_or_none(stream.get('url'))
if (stream_url in stream_urls or not stream_url or if stream_url in stream_urls or not stream_url:
not isinstance(stream_url, compat_str)):
continue continue
stream_urls.add(stream_url) stream_urls.add(stream_url)
formats.extend(self._extract_m3u8_formats( formats.extend(self._extract_m3u8_formats(

View file

@ -19,6 +19,7 @@
try_get, try_get,
unsmuggle_url, unsmuggle_url,
update_url_query, update_url_query,
url_or_none,
) )
@ -255,7 +256,8 @@ def _real_extract(self, url):
quality = qualities(['hls', 'medium', 'high']) quality = qualities(['hls', 'medium', 'high'])
formats = [] formats = []
for format_id, video_url in streams.get('streams', {}).items(): for format_id, video_url in streams.get('streams', {}).items():
if not video_url or not isinstance(video_url, compat_str): video_url = url_or_none(video_url)
if not video_url:
continue continue
ext = determine_ext(video_url) ext = determine_ext(video_url)
if ext == 'f4m': if ext == 'f4m':

View file

@ -27,6 +27,7 @@
unified_timestamp, unified_timestamp,
update_url_query, update_url_query,
urlencode_postdata, urlencode_postdata,
url_or_none,
urljoin, urljoin,
) )
@ -663,8 +664,8 @@ def _real_extract(self, url):
for option in status['quality_options']: for option in status['quality_options']:
if not isinstance(option, dict): if not isinstance(option, dict):
continue continue
source = option.get('source') source = url_or_none(option.get('source'))
if not source or not isinstance(source, compat_str): if not source:
continue continue
formats.append({ formats.append({
'url': source, 'url': source,

View file

@ -20,6 +20,7 @@
sanitized_Request, sanitized_Request,
try_get, try_get,
unescapeHTML, unescapeHTML,
url_or_none,
urlencode_postdata, urlencode_postdata,
) )
@ -265,8 +266,8 @@ def extract_formats(source_list):
if not isinstance(source_list, list): if not isinstance(source_list, list):
return return
for source in source_list: for source in source_list:
video_url = source.get('file') or source.get('src') video_url = url_or_none(source.get('file') or source.get('src'))
if not video_url or not isinstance(video_url, compat_str): if not video_url:
continue continue
if source.get('type') == 'application/x-mpegURL' or determine_ext(video_url) == 'm3u8': if source.get('type') == 'application/x-mpegURL' or determine_ext(video_url) == 'm3u8':
formats.extend(self._extract_m3u8_formats( formats.extend(self._extract_m3u8_formats(
@ -293,8 +294,8 @@ def extract_subtitles(track_list):
continue continue
if track.get('kind') != 'captions': if track.get('kind') != 'captions':
continue continue
src = track.get('src') src = url_or_none(track.get('src'))
if not src or not isinstance(src, compat_str): if not src:
continue continue
lang = track.get('language') or track.get( lang = track.get('language') or track.get(
'srclang') or track.get('label') 'srclang') or track.get('label')
@ -314,8 +315,8 @@ def extract_subtitles(track_list):
for cc in captions: for cc in captions:
if not isinstance(cc, dict): if not isinstance(cc, dict):
continue continue
cc_url = cc.get('url') cc_url = url_or_none(cc.get('url'))
if not cc_url or not isinstance(cc_url, compat_str): if not cc_url:
continue continue
lang = try_get(cc, lambda x: x['locale']['locale'], compat_str) lang = try_get(cc, lambda x: x['locale']['locale'], compat_str)
sub_dict = (automatic_captions if cc.get('source') == 'auto' sub_dict = (automatic_captions if cc.get('source') == 'auto'

View file

@ -3,15 +3,13 @@
import itertools import itertools
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import compat_HTTPError
compat_HTTPError,
compat_str,
)
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
int_or_none, int_or_none,
float_or_none, float_or_none,
parse_iso8601, parse_iso8601,
url_or_none,
) )
@ -166,8 +164,8 @@ def _real_extract(self, url):
formats = [] formats = []
for f in video.get('formats', []): for f in video.get('formats', []):
format_url = f.get('uri') format_url = url_or_none(f.get('uri'))
if not format_url or not isinstance(format_url, compat_str): if not format_url:
continue continue
format_type = f.get('type') format_type = f.get('type')
if format_type == 'dash': if format_type == 'dash':

View file

@ -20,6 +20,7 @@
str_to_int, str_to_int,
unescapeHTML, unescapeHTML,
unified_timestamp, unified_timestamp,
url_or_none,
urlencode_postdata, urlencode_postdata,
) )
from .dailymotion import DailymotionIE from .dailymotion import DailymotionIE
@ -423,7 +424,8 @@ def _real_extract(self, url):
formats = [] formats = []
for format_id, format_url in data.items(): for format_id, format_url in data.items():
if not isinstance(format_url, compat_str) or not format_url.startswith(('http', '//', 'rtmp')): format_url = url_or_none(format_url)
if not format_url or not format_url.startswith(('http', '//', 'rtmp')):
continue continue
if (format_id.startswith(('url', 'cache')) or if (format_id.startswith(('url', 'cache')) or
format_id in ('extra_data', 'live_mp4', 'postlive_mp4')): format_id in ('extra_data', 'live_mp4', 'postlive_mp4')):

View file

@ -13,6 +13,7 @@
parse_duration, parse_duration,
try_get, try_get,
unified_strdate, unified_strdate,
url_or_none,
) )
@ -137,7 +138,8 @@ def get_height(s):
else: else:
format_url = format_item format_url = format_item
filesize = None filesize = None
if not isinstance(format_url, compat_str): format_url = url_or_none(format_url)
if not format_url:
continue continue
formats.append({ formats.append({
'format_id': '%s-%s' % (format_id, quality), 'format_id': '%s-%s' % (format_id, quality),
@ -198,7 +200,8 @@ def get_height(s):
default='{}'), default='{}'),
video_id, fatal=False) video_id, fatal=False)
for format_id, format_url in sources.items(): for format_id, format_url in sources.items():
if not isinstance(format_url, compat_str): format_url = url_or_none(format_url)
if not format_url:
continue continue
if format_url in format_urls: if format_url in format_urls:
continue continue

View file

@ -4,12 +4,12 @@
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
int_or_none, int_or_none,
qualities, qualities,
unescapeHTML, unescapeHTML,
url_or_none,
) )
@ -80,9 +80,9 @@ def _real_extract(self, url):
formats = [] formats = []
for format_id in QUALITIES: for format_id in QUALITIES:
is_hd = format_id == 'hd' is_hd = format_id == 'hd'
format_url = playlist.get( format_url = url_or_none(playlist.get(
'file%s' % ('_hd' if is_hd else '')) 'file%s' % ('_hd' if is_hd else '')))
if not format_url or not isinstance(format_url, compat_str): if not format_url:
continue continue
formats.append({ formats.append({
'url': format_url, 'url': format_url,

View file

@ -3,11 +3,11 @@
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str
from ..utils import ( from ..utils import (
determine_ext, determine_ext,
int_or_none, int_or_none,
parse_duration, parse_duration,
url_or_none,
) )
@ -50,8 +50,8 @@ def _real_extract(self, url):
for encoding in encodings: for encoding in encodings:
if not isinstance(encoding, dict): if not isinstance(encoding, dict):
continue continue
format_url = encoding.get('filename') format_url = url_or_none(encoding.get('filename'))
if not isinstance(format_url, compat_str): if not format_url:
continue continue
if determine_ext(format_url) == 'm3u8': if determine_ext(format_url) == 'm3u8':
formats.extend(self._extract_m3u8_formats( formats.extend(self._extract_m3u8_formats(

View file

@ -3,13 +3,13 @@
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str
from ..utils import ( from ..utils import (
int_or_none, int_or_none,
sanitized_Request, sanitized_Request,
str_to_int, str_to_int,
unescapeHTML, unescapeHTML,
unified_strdate, unified_strdate,
url_or_none,
) )
from ..aes import aes_decrypt_text from ..aes import aes_decrypt_text
@ -88,8 +88,8 @@ def _real_extract(self, url):
for definition in definitions: for definition in definitions:
if not isinstance(definition, dict): if not isinstance(definition, dict):
continue continue
video_url = definition.get('videoUrl') video_url = url_or_none(definition.get('videoUrl'))
if isinstance(video_url, compat_str) and video_url: if video_url:
links.append(video_url) links.append(video_url)
# Fallback #1, this also contains extra low quality 180p format # Fallback #1, this also contains extra low quality 180p format

View file

@ -13,6 +13,7 @@
ExtractorError, ExtractorError,
int_or_none, int_or_none,
try_get, try_get,
url_or_none,
urlencode_postdata, urlencode_postdata,
) )
@ -150,8 +151,8 @@ def _extract_formats(self, cid, video_id, record_id=None, is_live=False):
for watch in watch_urls: for watch in watch_urls:
if not isinstance(watch, dict): if not isinstance(watch, dict):
continue continue
watch_url = watch.get('url') watch_url = url_or_none(watch.get('url'))
if not watch_url or not isinstance(watch_url, compat_str): if not watch_url:
continue continue
format_id_list = [stream_type] format_id_list = [stream_type]
maxrate = watch.get('maxrate') maxrate = watch.get('maxrate')

View file

@ -15,6 +15,7 @@
try_get, try_get,
unified_timestamp, unified_timestamp,
update_url_query, update_url_query,
url_or_none,
urljoin, urljoin,
) )
@ -67,8 +68,8 @@ class ZDFIE(ZDFBaseIE):
def _extract_subtitles(src): def _extract_subtitles(src):
subtitles = {} subtitles = {}
for caption in try_get(src, lambda x: x['captions'], list) or []: for caption in try_get(src, lambda x: x['captions'], list) or []:
subtitle_url = caption.get('uri') subtitle_url = url_or_none(caption.get('uri'))
if subtitle_url and isinstance(subtitle_url, compat_str): if subtitle_url:
lang = caption.get('language', 'deu') lang = caption.get('language', 'deu')
subtitles.setdefault(lang, []).append({ subtitles.setdefault(lang, []).append({
'url': subtitle_url, 'url': subtitle_url,
@ -76,8 +77,8 @@ def _extract_subtitles(src):
return subtitles return subtitles
def _extract_format(self, video_id, formats, format_urls, meta): def _extract_format(self, video_id, formats, format_urls, meta):
format_url = meta.get('url') format_url = url_or_none(meta.get('url'))
if not format_url or not isinstance(format_url, compat_str): if not format_url:
return return
if format_url in format_urls: if format_url in format_urls:
return return
@ -152,7 +153,8 @@ def _extract_entry(self, url, player, content, video_id):
content, lambda x: x['teaserImageRef']['layouts'], dict) content, lambda x: x['teaserImageRef']['layouts'], dict)
if layouts: if layouts:
for layout_key, layout_url in layouts.items(): for layout_key, layout_url in layouts.items():
if not isinstance(layout_url, compat_str): layout_url = url_or_none(layout_url)
if not layout_url:
continue continue
thumbnail = { thumbnail = {
'url': layout_url, 'url': layout_url,