support embedded playlist; write sane regex; rework _VALID_URL

Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com>
Co-authored-by: sepro <sepro@sepr0.com>
This commit is contained in:
Mozi 2024-08-30 17:17:04 +00:00
parent c4172f6de6
commit cddd001021

View file

@ -16,6 +16,7 @@
try_get, try_get,
unescapeHTML, unescapeHTML,
unsmuggle_url, unsmuggle_url,
update_url,
url_or_none, url_or_none,
urlencode_postdata, urlencode_postdata,
) )
@ -101,10 +102,15 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
_VALID_URL = r'''(?ix) _VALID_URL = r'''(?ix)
https?:// https?://
(?: (?:
(?:(?:www|touch|geo)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:(?:embed|swf|\#)/)|player(?:/\w+)?\.html\?)?video|swf)| (?:(?:www|touch|geo)\.)?dailymotion\.[a-z]{2,3}|
(?:www\.)?lequipe\.fr/video (?:www\.)?lequipe\.fr
)/
(?:
video/|
swf(?:/(?!video)|/video/)|
player(?:(?:/\w+)?\.html)?(?:\?video=|/)
) )
[/=](?P<id>[^/?_&]+)(?:.+?\bplaylist=(?P<playlist_id>x[0-9a-z]+))? (?P<id>[^/?_&#]+)(?:.+?\bplaylist=(?P<playlist_id>x[0-9a-z]+))?
''' '''
IE_NAME = 'dailymotion' IE_NAME = 'dailymotion'
_EMBED_REGEX = [r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1'] _EMBED_REGEX = [r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1']
@ -220,6 +226,31 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
'url': 'https://geo.dailymotion.com/player/xakln.html?video=x8mjju4&customConfig%5BcustomParams%5D=%2Ffr-fr%2Ftennis%2Fwimbledon-mens-singles%2Farticles-video', 'url': 'https://geo.dailymotion.com/player/xakln.html?video=x8mjju4&customConfig%5BcustomParams%5D=%2Ffr-fr%2Ftennis%2Fwimbledon-mens-singles%2Farticles-video',
'only_matching': True, 'only_matching': True,
}] }]
_WEBPAGE_TESTS = [{
'url': 'https://www.financialounge.com/video/2024/08/01/borse-europee-in-rosso-dopo-la-fed-a-milano-volano-mediobanca-e-tim-edizione-del-1-agosto/',
'info_dict': {
'id': 'x93blhi',
'ext': 'mp4',
'title': 'OnAir - 01/08/24',
'description': '',
'duration': 217,
'timestamp': 1722505658,
'upload_date': '20240801',
'uploader': 'Financialounge',
'uploader_id': 'x2vtgmm',
'age_limit': 0,
'tags': [],
'view_count': int,
'like_count': int,
},
'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
}, {
'url': 'https://www.cycleworld.com/blogs/ask-kevin/ducati-continues-to-evolve-with-v4/',
'info_dict': {
'id': 'x7wdsj',
},
'playlist_mincount': 50,
}]
_GEO_BYPASS = False _GEO_BYPASS = False
_COMMON_MEDIA_FIELDS = '''description _COMMON_MEDIA_FIELDS = '''description
geoblockedCountries { geoblockedCountries {
@ -235,12 +266,21 @@ def _extract_embed_urls(cls, url, webpage):
r'(?s)DM\.player\([^,]+,\s*{.*?video[\'"]?\s*:\s*["\']?(?P<id>[0-9a-zA-Z]+).+?}\s*\);', webpage): r'(?s)DM\.player\([^,]+,\s*{.*?video[\'"]?\s*:\s*["\']?(?P<id>[0-9a-zA-Z]+).+?}\s*\);', webpage):
yield from 'https://www.dailymotion.com/embed/video/' + mobj.group('id') yield from 'https://www.dailymotion.com/embed/video/' + mobj.group('id')
for mobj in re.finditer( for mobj in re.finditer(
r'(?s)<script\s+?[^>]*?\bsrc=(["\'])((?:https?:)?//[^>]+\.dailymotion\.com/player/(?:(?!\1).)+)\1[^>]*?>', webpage): r'(?s)<script [^>]*\bsrc=(["\'])(?:https?:)?//[\w-]+\.dailymotion\.com/player/(?:(?!\1).)+\1[^>]*>', webpage):
attrs = extract_attributes(mobj.group(0)) attrs = extract_attributes(mobj.group(0))
player_url = url_or_none(attrs.get('src')) player_url = url_or_none(attrs.get('src'))
video_id = attrs.get('data-video') if not player_url:
if all((player_url, video_id)): continue
yield f'{player_url.replace(".js", ".html")}?video={video_id}' player_url = player_url.replace('.js', '.html')
if player_url.startswith('//'):
player_url = f'https:{player_url}'
if video_id := attrs.get('data-video'):
query_string = f'video={video_id}'
elif playlist_id := attrs.get('data-playlist'):
query_string = f'playlist={playlist_id}'
else:
continue
yield update_url(player_url, query=query_string)
def _real_extract(self, url): def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url) url, smuggled_data = unsmuggle_url(url)