Compare commits

...

10 Commits

Author SHA1 Message Date
c-basalt 97a4dc8680
Merge 504c935cde into 96da952504 2024-05-05 10:16:20 +05:30
sepro 96da952504
[core] Warn if lack of ffmpeg alters format selection (#9805)
Authored by: seproDev, pukkandan
2024-05-05 00:44:08 +02:00
bashonly bec9a59e8e
[networking] Add `extensions` attribute to `Response` (#9756)
CurlCFFIRH now provides an `impersonate` field in its responses' extensions

Authored by: bashonly
2024-05-04 22:19:42 +00:00
bashonly 036e0d92c6
[ie/patreon] Extract multiple embeds (#9850)
Closes #9848
Authored by: bashonly
2024-05-04 22:11:11 +00:00
c-basalt 504c935cde cleanup 2024-02-25 23:02:48 -05:00
c-basalt 9c04204efa
Update yt_dlp/extractor/microsoftembed.py
Co-authored-by: Simon Sawicki <accounts@grub4k.xyz>
2024-02-11 12:40:22 -05:00
c-basalt 63654ea6f9 use early exit 2024-02-10 11:51:28 -05:00
c-basalt ef958d16a6
Change to separate query
Co-authored-by: Simon Sawicki <accounts@grub4k.xyz>
2024-02-10 11:47:23 -05:00
c-basalt d8930151f1 series playlist 2024-02-10 01:51:57 -05:00
c-basalt 284b233891 event video 2024-02-10 00:15:06 -05:00
8 changed files with 381 additions and 267 deletions

View File

@@ -785,6 +785,25 @@ class TestHTTPImpersonateRequestHandler(TestRequestHandlerBase):
assert res.status == 200 assert res.status == 200
assert std_headers['user-agent'].lower() not in res.read().decode().lower() assert std_headers['user-agent'].lower() not in res.read().decode().lower()
def test_response_extensions(self, handler):
    """Successful responses should expose the resolved impersonate target via extensions."""
    with handler() as rh:
        for target in rh.supported_targets:
            request = Request(
                f'http://127.0.0.1:{self.http_port}/gen_200', extensions={'impersonate': target})
            res = validate_and_send(rh, request)
            # Compare against the handler-resolved target, not the raw requested one.
            assert res.extensions['impersonate'] == rh._get_request_target(request)
def test_http_error_response_extensions(self, handler):
    """Even error responses (raised as HTTPError) should carry the impersonate extension."""
    with handler() as rh:
        for target in rh.supported_targets:
            request = Request(
                f'http://127.0.0.1:{self.http_port}/gen_404', extensions={'impersonate': target})
            try:
                validate_and_send(rh, request)
            except HTTPError as e:
                # The response attached to the error must still expose the resolved target.
                res = e.response
                assert res.extensions['impersonate'] == rh._get_request_target(request)
class TestRequestHandlerMisc: class TestRequestHandlerMisc:
"""Misc generic tests for request handlers, not related to request or validation testing""" """Misc generic tests for request handlers, not related to request or validation testing"""

View File

@ -2136,6 +2136,11 @@ class YoutubeDL:
def _check_formats(self, formats): def _check_formats(self, formats):
for f in formats: for f in formats:
working = f.get('__working')
if working is not None:
if working:
yield f
continue
self.to_screen('[info] Testing format %s' % f['format_id']) self.to_screen('[info] Testing format %s' % f['format_id'])
path = self.get_output_path('temp') path = self.get_output_path('temp')
if not self._ensure_dir_exists(f'{path}/'): if not self._ensure_dir_exists(f'{path}/'):
@ -2152,33 +2157,44 @@ class YoutubeDL:
os.remove(temp_file.name) os.remove(temp_file.name)
except OSError: except OSError:
self.report_warning('Unable to delete temporary file "%s"' % temp_file.name) self.report_warning('Unable to delete temporary file "%s"' % temp_file.name)
f['__working'] = success
if success: if success:
yield f yield f
else: else:
self.to_screen('[info] Unable to download format %s. Skipping...' % f['format_id']) self.to_screen('[info] Unable to download format %s. Skipping...' % f['format_id'])
def _select_formats(self, formats, selector):
return list(selector({
'formats': formats,
'has_merged_format': any('none' not in (f.get('acodec'), f.get('vcodec')) for f in formats),
'incomplete_formats': (all(f.get('vcodec') == 'none' for f in formats) # No formats with video
or all(f.get('acodec') == 'none' for f in formats)), # OR, No formats with audio
}))
def _default_format_spec(self, info_dict, download=True): def _default_format_spec(self, info_dict, download=True):
download = download and not self.params.get('simulate')
prefer_best = download and (
self.params['outtmpl']['default'] == '-'
or info_dict.get('is_live') and not self.params.get('live_from_start'))
def can_merge(): def can_merge():
merger = FFmpegMergerPP(self) merger = FFmpegMergerPP(self)
return merger.available and merger.can_merge() return merger.available and merger.can_merge()
prefer_best = ( if not prefer_best and download and not can_merge():
not self.params.get('simulate') prefer_best = True
and download formats = self._get_formats(info_dict)
and ( evaluate_formats = lambda spec: self._select_formats(formats, self.build_format_selector(spec))
not can_merge() if evaluate_formats('b/bv+ba') != evaluate_formats('bv*+ba/b'):
or info_dict.get('is_live') and not self.params.get('live_from_start') self.report_warning('ffmpeg not found. The downloaded format may not be the best available. '
or self.params['outtmpl']['default'] == '-')) 'Installing ffmpeg is strongly recommended: https://github.com/yt-dlp/yt-dlp#dependencies')
compat = (
prefer_best
or self.params.get('allow_multiple_audio_streams', False)
or 'format-spec' in self.params['compat_opts'])
return ( compat = (self.params.get('allow_multiple_audio_streams')
'best/bestvideo+bestaudio' if prefer_best or 'format-spec' in self.params['compat_opts'])
else 'bestvideo*+bestaudio/best' if not compat
else 'bestvideo+bestaudio/best') return ('best/bestvideo+bestaudio' if prefer_best
else 'bestvideo+bestaudio/best' if compat
else 'bestvideo*+bestaudio/best')
def build_format_selector(self, format_spec): def build_format_selector(self, format_spec):
def syntax_error(note, start): def syntax_error(note, start):
@ -2928,12 +2944,7 @@ class YoutubeDL:
self.write_debug(f'Default format spec: {req_format}') self.write_debug(f'Default format spec: {req_format}')
format_selector = self.build_format_selector(req_format) format_selector = self.build_format_selector(req_format)
formats_to_download = list(format_selector({ formats_to_download = self._select_formats(formats, format_selector)
'formats': formats,
'has_merged_format': any('none' not in (f.get('acodec'), f.get('vcodec')) for f in formats),
'incomplete_formats': (all(f.get('vcodec') == 'none' for f in formats) # No formats with video
or all(f.get('acodec') == 'none' for f in formats)), # OR, No formats with audio
}))
if interactive_format_selection and not formats_to_download: if interactive_format_selection and not formats_to_download:
self.report_error('Requested format is not available', tb=False, is_error=False) self.report_error('Requested format is not available', tb=False, is_error=False)
continue continue

View File

@ -1066,11 +1066,12 @@ from .melonvod import MelonVODIE
from .metacritic import MetacriticIE from .metacritic import MetacriticIE
from .mgtv import MGTVIE from .mgtv import MGTVIE
from .microsoftstream import MicrosoftStreamIE from .microsoftstream import MicrosoftStreamIE
from .microsoftvirtualacademy import ( from .microsoftembed import (
MicrosoftVirtualAcademyIE, MicrosoftEmbedIE,
MicrosoftVirtualAcademyCourseIE, MicrosoftMediusIE,
MicrosoftLearnIE,
MicrosoftBuildIE,
) )
from .microsoftembed import MicrosoftEmbedIE
from .mildom import ( from .mildom import (
MildomIE, MildomIE,
MildomVodIE, MildomVodIE,

View File

@@ -1,5 +1,13 @@
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import int_or_none, traverse_obj, unified_timestamp from ..utils import (
int_or_none,
parse_iso8601,
traverse_obj,
unified_timestamp,
url_or_none,
)
class MicrosoftEmbedIE(InfoExtractor): class MicrosoftEmbedIE(InfoExtractor):
@ -63,3 +71,221 @@ class MicrosoftEmbedIE(InfoExtractor):
'subtitles': subtitles, 'subtitles': subtitles,
'thumbnails': thumbnails, 'thumbnails': thumbnails,
} }
class MicrosoftMediusBaseIE(InfoExtractor):
    """Shared helpers for Microsoft Medius-backed extractors (Medius, Learn, Build)."""

    @staticmethod
    def _sub_to_dict(subtitle_list):
        """Convert a flat list of subtitle dicts into the {lang: [subtitle, ...]} mapping.

        Each entry's 'tag' key is popped and used as the language key; entries
        without a tag are grouped under 'unknown'.
        """
        subtitles = {}
        for sub in subtitle_list:
            subtitles.setdefault(sub.pop('tag', None) or 'unknown', []).append(sub)
        return subtitles

    def _extract_ism(self, ism_url, video_id):
        # Extract ISM (Smooth Streaming) formats, preferring English audio tracks.
        formats = self._extract_ism_formats(ism_url, video_id)
        for format in formats:
            if format.get('language') == 'eng' or 'English' in format.get('format_id', ''):
                format['language_preference'] = -1
            else:
                format['language_preference'] = -10
        return formats
class MicrosoftMediusIE(MicrosoftMediusBaseIE):
    """Extractor for medius.microsoft.com embed pages."""

    _VALID_URL = r'https?://medius\.microsoft\.com/[^?#]+/(?P<id>[0-9a-f\-]+)'

    _TESTS = [{
        'url': 'https://medius.microsoft.com/Embed/video-nc/9640d86c-f513-4889-959e-5dace86e7d2b',
        'info_dict': {
            'id': '9640d86c-f513-4889-959e-5dace86e7d2b',
            'ext': 'ismv',
            'title': 'Rapidly code, test and ship from secure cloud developer environments',
            'description': 'md5:33c8e4facadc438613476eea24165f71',
            'thumbnail': r're:https://mediusimg\.event\.microsoft\.com/video-\d+/thumbnail\.jpg.*',
            'subtitles': 'count:30',
        },
        'params': {'listsubtitles': True},
    }, {
        'url': 'https://medius.microsoft.com/Embed/video-nc/81215af5-c813-4dcd-aede-94f4e1a7daa3',
        'info_dict': {
            'id': '81215af5-c813-4dcd-aede-94f4e1a7daa3',
            'ext': 'ismv',
            'title': 'Microsoft Build opening',
            'description': 'md5:43455096141077a1f23144cab8cec1cb',
            'thumbnail': r're:https://mediusimg\.event\.microsoft\.com/video-\d+/thumbnail\.jpg.*',
            'subtitles': 'count:31',
        },
        'params': {'listsubtitles': True},
    }]

    def _extract_subtitle(self, webpage, video_id):
        """Extract subtitles from the page's caption configuration.

        Tries the JSON `captionsConfiguration` object first; falls back to
        scraping .vtt URLs from an inline `var file = {...}` script block.
        """
        captions = traverse_obj(
            self._search_json(r'const\s+captionsConfiguration\s*=\s*', webpage, 'captions', video_id, default=False),
            ('languageList', ..., {
                'url': ('src', {url_or_none}),
                'tag': ('srclang', {str}),
                'name': ('kind', {str}),
            }))
        # Fallback: derive the language tag from the filename (e.g. ..._en.vtt?...).
        captions = captions or traverse_obj(
            re.findall(r'var\s+file\s+=\s+\{[^}]+\'(https://[^\']+\.vtt\?[^\']+)', webpage),
            (lambda _, v: url_or_none(v), {lambda x: {'url': x, 'tag': x.split('.vtt?')[0].split('_')[-1]}}))
        return self._sub_to_dict(captions)

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # The ISM manifest URL is embedded in the page as a StreamUrl assignment.
        ism_url = self._search_regex(r'StreamUrl\s*=\s*"([^"]+manifest)"', webpage, 'ism url')

        return {
            'id': video_id,
            'title': self._og_search_title(webpage),
            'description': self._og_search_description(webpage),
            'formats': self._extract_ism(ism_url, video_id),
            'thumbnail': self._og_search_thumbnail(webpage),
            'subtitles': self._extract_subtitle(webpage, video_id),
        }
class MicrosoftLearnIE(MicrosoftMediusBaseIE):
    """Extractor for learn.microsoft.com events and shows (single videos and playlists)."""

    _VALID_URL = r'https?://learn\.microsoft\.com/(?:[\w\-]+/)?(?P<type>events|shows)/(?P<series>[\w\-]+)(?:/(?P<id>[^?#/]+))?'

    _TESTS = [{
        'url': 'https://learn.microsoft.com/en-us/events/build-2022/ts01-rapidly-code-test-ship-from-secure-cloud-developer-environments',
        'info_dict': {
            'id': '9640d86c-f513-4889-959e-5dace86e7d2b',
            'ext': 'ismv',
            'title': 'Rapidly code, test and ship from secure cloud developer environments - Events',
            'description': 'md5:f26c1a85d41c1cffd27a0279254a25c3',
            'timestamp': 1653408600,
            'upload_date': '20220524',
            'thumbnail': r're:https://mediusimg\.event\.microsoft\.com/video-\d+/thumbnail\.jpg.*',
        },
    }, {
        'url': 'https://learn.microsoft.com/en-us/events/build-2022',
        'info_dict': {
            'id': 'build-2022',
            'title': 'Microsoft Build 2022 - Events',
            'description': 'md5:c16b43848027df837b22c6fbac7648d3',
        },
        'playlist_count': 201,
    }, {
        'url': 'https://learn.microsoft.com/en-us/shows/bash-for-beginners/what-is-the-difference-between-a-terminal-and-a-shell-2-of-20-bash-for-beginners/',
        'info_dict': {
            'id': 'd44e1a03-a0e5-45c2-9496-5c9fa08dc94c',
            'ext': 'ismv',
            'title': 'What is the Difference Between a Terminal and a Shell? (Part 2 of 20)',
            'description': 'md5:7bbbfb593d21c2cf2babc3715ade6b88',
            'timestamp': 1676339547,
            'upload_date': '20230214',
            'thumbnail': r're:https://learn\.microsoft\.com/video/media/.*\.png',
            'subtitles': 'count:14',
        },
        'params': {'listsubtitles': True},
    }, {
        'url': 'https://learn.microsoft.com/en-us/shows/bash-for-beginners',
        'info_dict': {
            'id': 'bash-for-beginners',
            'title': 'Bash for Beginners',
            'description': 'md5:16a91c07222117d1e00912f0dbc02c2c',
        },
        'playlist_count': 20,
    }]

    def _entries(self, url_base, video_id):
        """Yield url_result entries for a series, paging through the content-browser API."""
        skip = 0
        while True:
            playlist_info = self._download_json(url_base, video_id, f'Downloading entries {skip}', query={
                'locale': 'en-us',
                '$skip': skip,
            })
            items = traverse_obj(playlist_info, (
                'results', ..., 'url', {lambda x: self.url_result(f'https://learn.microsoft.com/en-us{x}')}))
            yield from items
            skip += len(items)
            # Stop when the reported total is reached, or the API returns an empty page.
            if skip >= playlist_info['count'] or not items:
                break

    def _real_extract(self, url):
        video_type, series, slug = self._match_valid_url(url).groups()
        video_id = slug or series
        webpage = self._download_webpage(url, video_id)

        metainfo = {
            'title': self._og_search_title(webpage),
            'description': self._og_search_description(webpage),
        }

        # No slug means the URL points at a whole series -> return a playlist.
        if not slug:
            url_base = f'https://learn.microsoft.com/api/contentbrowser/search/{video_type}/{series}/{"sessions" if video_type == "events" else "episodes"}'
            return self.playlist_result(self._entries(url_base, video_id), video_id, **metainfo)

        # Event pages delegate to an external video URL (a Medius embed).
        if video_type == 'events':
            return self.url_result(
                self._search_regex(r'<meta\s+name="externalVideoUrl"\s+content="([^"]+)"', webpage, 'videoUrl'), url_transparent=True, **metainfo, **{
                    'timestamp': parse_iso8601(self._search_regex(
                        r'<meta\s+name="startDate"\s+content="([^"]+)"', webpage, 'date', default=None)),
                })

        # Show episodes resolve through the public video API via the page's entryId.
        entry_id = self._search_regex(r'<meta name="entryId" content="([^"]+)"', webpage, 'entryId')
        video_info = self._download_json(
            f'https://learn.microsoft.com/api/video/public/v1/entries/{entry_id}', video_id)
        return {
            'id': entry_id,
            'formats': self._extract_ism(video_info['publicVideo']['adaptiveVideoUrl'], video_id),
            'subtitles': self._sub_to_dict(traverse_obj(video_info, ('publicVideo', 'captions', ..., {
                'tag': ('language', {str}),
                'url': ('url', {url_or_none}),
            }))),
            **metainfo,
            **traverse_obj(video_info, {
                'timestamp': ('createTime', {parse_iso8601}),
                'thumbnails': ('publicVideo', 'thumbnailOtherSizes', ..., {lambda x: {'url': x}}),
            }),
        }
class MicrosoftBuildIE(MicrosoftMediusBaseIE):
    """Extractor for build.microsoft.com sessions (single session or full session list)."""

    _VALID_URL = [
        r'https?://build\.microsoft\.com/[\w\-]+/sessions/(?P<id>[0-9a-f\-]+)',
        r'https?://build\.microsoft\.com/[\w\-]+/(?P<id>sessions)/?(?:[?#]|$)',
    ]

    _TESTS = [{
        'url': 'https://build.microsoft.com/en-US/sessions/49e81029-20f0-485b-b641-73b7f9622656?source=sessions',
        'info_dict': {
            'id': '81215af5-c813-4dcd-aede-94f4e1a7daa3',
            'ext': 'ismv',
            'title': 'Microsoft Build opening',
            'description': 'md5:756ab1fb60bdc6923d627803694e9cc5',
            'timestamp': 1684857600,
            'upload_date': '20230523',
            'thumbnail': r're:https://mediusimg\.event\.microsoft\.com/video-\d+/thumbnail\.jpg.*',
        },
    }, {
        'url': 'https://build.microsoft.com/en-US/sessions',
        'info_dict': {
            'id': 'sessions',
        },
        'playlist_mincount': 418,
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)

        # The API returns every session; each entry delegates to its on-demand URL.
        entries = [
            self.url_result(video_info['onDemand'], url_transparent=True, **traverse_obj(video_info, {
                'id': ('sessionId', {str}),
                'title': ('title', {str}),
                'description': ('description', {str}),
                'timestamp': ('startDateTime', {parse_iso8601}),
            }))
            for video_info in self._download_json(
                'https://api.build.microsoft.com/api/session/all/en-US', video_id, 'Downloading video info')
        ]
        if video_id == 'sessions':
            # The bare /sessions URL maps to the whole catalogue.
            return self.playlist_result(entries, video_id)
        else:
            # Otherwise pick the single entry whose session id matches.
            return traverse_obj(entries, (lambda _, v: v['id'] == video_id), get_all=False)

View File

@ -1,189 +0,0 @@
import re
from .common import InfoExtractor
from ..utils import (
int_or_none,
parse_duration,
smuggle_url,
unsmuggle_url,
xpath_text,
)
class MicrosoftVirtualAcademyBaseIE(InfoExtractor):
    """Shared helpers for Microsoft Virtual Academy extractors."""

    def _extract_base_url(self, course_id, display_id):
        # The anonymous products endpoint returns the course's content base URL as JSON.
        return self._download_json(
            'https://api-mlxprod.microsoft.com/services/products/anonymous/%s' % course_id,
            display_id, 'Downloading course base URL')

    def _extract_chapter_and_title(self, title):
        """Split a "NN | Title" string into (chapter_number, title).

        Returns (None, None) for empty input, and (None, title) when the
        title has no leading chapter number.
        """
        if not title:
            return None, None
        m = re.search(r'(?P<chapter>\d+)\s*\|\s*(?P<title>.+)', title)
        return (int(m.group('chapter')), m.group('title')) if m else (None, title)
class MicrosoftVirtualAcademyIE(MicrosoftVirtualAcademyBaseIE):
    """Extractor for individual Microsoft Virtual Academy videos."""

    IE_NAME = 'mva'
    IE_DESC = 'Microsoft Virtual Academy videos'
    _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/[^/?#&]+-)(?P<course_id>\d+)(?::|\?l=)(?P<id>[\da-zA-Z]+_\d+)' % IE_NAME

    _TESTS = [{
        'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788?l=gfVXISmEB_6804984382',
        'md5': '7826c44fc31678b12ad8db11f6b5abb9',
        'info_dict': {
            'id': 'gfVXISmEB_6804984382',
            'ext': 'mp4',
            'title': 'Course Introduction',
            'formats': 'mincount:3',
            'subtitles': {
                'en': [{
                    'ext': 'ttml',
                }],
            },
        }
    }, {
        'url': 'mva:11788:gfVXISmEB_6804984382',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # The course extractor smuggles the base URL so it is not re-fetched per video.
        url, smuggled_data = unsmuggle_url(url, {})

        mobj = self._match_valid_url(url)
        course_id = mobj.group('course_id')
        video_id = mobj.group('id')

        base_url = smuggled_data.get('base_url') or self._extract_base_url(course_id, video_id)

        settings = self._download_xml(
            '%s/content/content_%s/videosettings.xml?v=1' % (base_url, video_id),
            video_id, 'Downloading video settings XML')

        # Strip any leading "NN |" chapter prefix from the title.
        _, title = self._extract_chapter_and_title(xpath_text(
            settings, './/Title', 'title', fatal=True))

        formats = []

        for sources in settings.findall('.//MediaSources'):
            sources_type = sources.get('videoType')
            for source in sources.findall('./MediaSource'):
                video_url = source.text
                if not video_url or not video_url.startswith('http'):
                    continue
                if sources_type == 'smoothstreaming':
                    # Smooth Streaming manifests expand into multiple ISM formats.
                    formats.extend(self._extract_ism_formats(
                        video_url, video_id, 'mss', fatal=False))
                    continue
                video_mode = source.get('videoMode')
                height = int_or_none(self._search_regex(
                    r'^(\d+)[pP]$', video_mode or '', 'height', default=None))
                codec = source.get('codec')
                acodec, vcodec = [None] * 2
                if codec:
                    # "acodec,vcodec" pair, or a single video codec.
                    codecs = codec.split(',')
                    if len(codecs) == 2:
                        acodec, vcodec = codecs
                    elif len(codecs) == 1:
                        vcodec = codecs[0]
                formats.append({
                    'url': video_url,
                    'format_id': video_mode,
                    'height': height,
                    'acodec': acodec,
                    'vcodec': vcodec,
                })

        subtitles = {}
        for source in settings.findall('.//MarkerResourceSource'):
            subtitle_url = source.text
            if not subtitle_url:
                continue
            # Subtitle paths are relative to the course base URL; language is assumed English.
            subtitles.setdefault('en', []).append({
                'url': '%s/%s' % (base_url, subtitle_url),
                'ext': source.get('type'),
            })

        return {
            'id': video_id,
            'title': title,
            'subtitles': subtitles,
            'formats': formats
        }
class MicrosoftVirtualAcademyCourseIE(MicrosoftVirtualAcademyBaseIE):
    """Extractor for whole Microsoft Virtual Academy courses (playlists of mva videos)."""

    IE_NAME = 'mva:course'
    IE_DESC = 'Microsoft Virtual Academy courses'
    _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/(?P<display_id>[^/?#&]+)-)(?P<id>\d+)' % IE_NAME

    _TESTS = [{
        'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788',
        'info_dict': {
            'id': '11788',
            'title': 'Microsoft Azure Fundamentals: Virtual Machines',
        },
        'playlist_count': 36,
    }, {
        # with emphasized chapters
        'url': 'https://mva.microsoft.com/en-US/training-courses/developing-windows-10-games-with-construct-2-16335',
        'info_dict': {
            'id': '16335',
            'title': 'Developing Windows 10 Games with Construct 2',
        },
        'playlist_count': 10,
    }, {
        'url': 'https://www.microsoftvirtualacademy.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788',
        'only_matching': True,
    }, {
        'url': 'mva:course:11788',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Defer to the single-video extractor when its (more specific) pattern matches.
        return False if MicrosoftVirtualAcademyIE.suitable(url) else super(
            MicrosoftVirtualAcademyCourseIE, cls).suitable(url)

    def _real_extract(self, url):
        mobj = self._match_valid_url(url)
        course_id = mobj.group('id')
        display_id = mobj.group('display_id')

        base_url = self._extract_base_url(course_id, display_id)

        manifest = self._download_json(
            '%s/imsmanifestlite.json' % base_url,
            display_id, 'Downloading course manifest JSON')['manifest']

        organization = manifest['organizations']['organization'][0]
        entries = []

        for chapter in organization['item']:
            chapter_number, chapter_title = self._extract_chapter_and_title(chapter.get('title'))
            chapter_id = chapter.get('@identifier')
            for item in chapter.get('item', []):
                item_id = item.get('@identifier')
                if not item_id:
                    continue
                metadata = item.get('resource', {}).get('metadata') or {}
                # Only items marked as Video are playable; skip slides, labs, etc.
                if metadata.get('learningresourcetype') != 'Video':
                    continue
                _, title = self._extract_chapter_and_title(item.get('title'))
                duration = parse_duration(metadata.get('duration'))
                description = metadata.get('description')
                entries.append({
                    '_type': 'url_transparent',
                    # Smuggle base_url so the video extractor avoids re-resolving it.
                    'url': smuggle_url(
                        'mva:%s:%s' % (course_id, item_id), {'base_url': base_url}),
                    'title': title,
                    'description': description,
                    'duration': duration,
                    'chapter': chapter_title,
                    'chapter_number': chapter_number,
                    'chapter_id': chapter_id,
                })

        title = organization.get('title') or manifest.get('metadata', {}).get('title')

        return self.playlist_result(entries, course_id, title)

View File

@ -219,7 +219,29 @@ class PatreonIE(PatreonBaseIE):
'thumbnail': r're:^https?://.+', 'thumbnail': r're:^https?://.+',
}, },
'params': {'skip_download': 'm3u8'}, 'params': {'skip_download': 'm3u8'},
}, {
# multiple attachments/embeds
'url': 'https://www.patreon.com/posts/holy-wars-solos-100601977',
'playlist_count': 3,
'info_dict': {
'id': '100601977',
'title': '"Holy Wars" (Megadeth) Solos Transcription & Lesson/Analysis',
'description': 'md5:d099ab976edfce6de2a65c2b169a88d3',
'uploader': 'Bradley Hall',
'uploader_id': '24401883',
'uploader_url': 'https://www.patreon.com/bradleyhallguitar',
'channel_id': '3193932',
'channel_url': 'https://www.patreon.com/bradleyhallguitar',
'channel_follower_count': int,
'timestamp': 1710777855,
'upload_date': '20240318',
'like_count': int,
'comment_count': int,
'thumbnail': r're:^https?://.+',
},
'skip': 'Patron-only content',
}] }]
_RETURN_TYPE = 'video'
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
@ -234,58 +256,54 @@ class PatreonIE(PatreonBaseIE):
'include': 'audio,user,user_defined_tags,campaign,attachments_media', 'include': 'audio,user,user_defined_tags,campaign,attachments_media',
}) })
attributes = post['data']['attributes'] attributes = post['data']['attributes']
title = attributes['title'].strip() info = traverse_obj(attributes, {
image = attributes.get('image') or {} 'title': ('title', {str.strip}),
info = { 'description': ('content', {clean_html}),
'id': video_id, 'thumbnail': ('image', ('large_url', 'url'), {url_or_none}, any),
'title': title, 'timestamp': ('published_at', {parse_iso8601}),
'description': clean_html(attributes.get('content')), 'like_count': ('like_count', {int_or_none}),
'thumbnail': image.get('large_url') or image.get('url'), 'comment_count': ('comment_count', {int_or_none}),
'timestamp': parse_iso8601(attributes.get('published_at')), })
'like_count': int_or_none(attributes.get('like_count')),
'comment_count': int_or_none(attributes.get('comment_count')),
}
can_view_post = traverse_obj(attributes, 'current_user_can_view')
if can_view_post and info['comment_count']:
info['__post_extractor'] = self.extract_comments(video_id)
for i in post.get('included', []): entries = []
i_type = i.get('type') idx = 0
if i_type == 'media': for include in traverse_obj(post, ('included', lambda _, v: v['type'])):
media_attributes = i.get('attributes') or {} include_type = include['type']
download_url = media_attributes.get('download_url') if include_type == 'media':
media_attributes = traverse_obj(include, ('attributes', {dict})) or {}
download_url = url_or_none(media_attributes.get('download_url'))
ext = mimetype2ext(media_attributes.get('mimetype')) ext = mimetype2ext(media_attributes.get('mimetype'))
# if size_bytes is None, this media file is likely unavailable # if size_bytes is None, this media file is likely unavailable
# See: https://github.com/yt-dlp/yt-dlp/issues/4608 # See: https://github.com/yt-dlp/yt-dlp/issues/4608
size_bytes = int_or_none(media_attributes.get('size_bytes')) size_bytes = int_or_none(media_attributes.get('size_bytes'))
if download_url and ext in KNOWN_EXTENSIONS and size_bytes is not None: if download_url and ext in KNOWN_EXTENSIONS and size_bytes is not None:
# XXX: what happens if there are multiple attachments? idx += 1
return { entries.append({
**info, 'id': f'{video_id}-{idx}',
'ext': ext, 'ext': ext,
'filesize': size_bytes, 'filesize': size_bytes,
'url': download_url, 'url': download_url,
}
elif i_type == 'user':
user_attributes = i.get('attributes')
if user_attributes:
info.update({
'uploader': user_attributes.get('full_name'),
'uploader_id': str_or_none(i.get('id')),
'uploader_url': user_attributes.get('url'),
}) })
elif i_type == 'post_tag': elif include_type == 'user':
info.setdefault('tags', []).append(traverse_obj(i, ('attributes', 'value'))) info.update(traverse_obj(include, {
'uploader': ('attributes', 'full_name', {str}),
'uploader_id': ('id', {str_or_none}),
'uploader_url': ('attributes', 'url', {url_or_none}),
}))
elif i_type == 'campaign': elif include_type == 'post_tag':
info.update({ if post_tag := traverse_obj(include, ('attributes', 'value', {str})):
'channel': traverse_obj(i, ('attributes', 'title')), info.setdefault('tags', []).append(post_tag)
'channel_id': str_or_none(i.get('id')),
'channel_url': traverse_obj(i, ('attributes', 'url')), elif include_type == 'campaign':
'channel_follower_count': int_or_none(traverse_obj(i, ('attributes', 'patron_count'))), info.update(traverse_obj(include, {
}) 'channel': ('attributes', 'title', {str}),
'channel_id': ('id', {str_or_none}),
'channel_url': ('attributes', 'url', {url_or_none}),
'channel_follower_count': ('attributes', 'patron_count', {int_or_none}),
}))
# handle Vimeo embeds # handle Vimeo embeds
if traverse_obj(attributes, ('embed', 'provider')) == 'Vimeo': if traverse_obj(attributes, ('embed', 'provider')) == 'Vimeo':
@ -296,36 +314,50 @@ class PatreonIE(PatreonBaseIE):
v_url, video_id, 'Checking Vimeo embed URL', v_url, video_id, 'Checking Vimeo embed URL',
headers={'Referer': 'https://patreon.com/'}, headers={'Referer': 'https://patreon.com/'},
fatal=False, errnote=False): fatal=False, errnote=False):
return self.url_result( entries.append(self.url_result(
VimeoIE._smuggle_referrer(v_url, 'https://patreon.com/'), VimeoIE._smuggle_referrer(v_url, 'https://patreon.com/'),
VimeoIE, url_transparent=True, **info) VimeoIE, url_transparent=True))
embed_url = traverse_obj(attributes, ('embed', 'url', {url_or_none})) embed_url = traverse_obj(attributes, ('embed', 'url', {url_or_none}))
if embed_url and self._request_webpage(embed_url, video_id, 'Checking embed URL', fatal=False, errnote=False): if embed_url and self._request_webpage(embed_url, video_id, 'Checking embed URL', fatal=False, errnote=False):
return self.url_result(embed_url, **info) entries.append(self.url_result(embed_url))
post_file = traverse_obj(attributes, 'post_file') post_file = traverse_obj(attributes, ('post_file', {dict}))
if post_file: if post_file:
name = post_file.get('name') name = post_file.get('name')
ext = determine_ext(name) ext = determine_ext(name)
if ext in KNOWN_EXTENSIONS: if ext in KNOWN_EXTENSIONS:
return { entries.append({
**info, 'id': video_id,
'ext': ext, 'ext': ext,
'url': post_file['url'], 'url': post_file['url'],
} })
elif name == 'video' or determine_ext(post_file.get('url')) == 'm3u8': elif name == 'video' or determine_ext(post_file.get('url')) == 'm3u8':
formats, subtitles = self._extract_m3u8_formats_and_subtitles(post_file['url'], video_id) formats, subtitles = self._extract_m3u8_formats_and_subtitles(post_file['url'], video_id)
return { entries.append({
**info, 'id': video_id,
'formats': formats, 'formats': formats,
'subtitles': subtitles, 'subtitles': subtitles,
} })
if can_view_post is False: can_view_post = traverse_obj(attributes, 'current_user_can_view')
comments = None
if can_view_post and info.get('comment_count'):
comments = self.extract_comments(video_id)
if not entries and can_view_post is False:
self.raise_no_formats('You do not have access to this post', video_id=video_id, expected=True) self.raise_no_formats('You do not have access to this post', video_id=video_id, expected=True)
else: elif not entries:
self.raise_no_formats('No supported media found in this post', video_id=video_id, expected=True) self.raise_no_formats('No supported media found in this post', video_id=video_id, expected=True)
elif len(entries) == 1:
info.update(entries[0])
else:
for entry in entries:
entry.update(info)
return self.playlist_result(entries, video_id, **info, __post_extractor=comments)
info['id'] = video_id
info['__post_extractor'] = comments
return info return info
def _get_comments(self, post_id): def _get_comments(self, post_id):

View File

@ -132,6 +132,16 @@ class CurlCFFIRH(ImpersonateRequestHandler, InstanceStoreMixin):
extensions.pop('cookiejar', None) extensions.pop('cookiejar', None)
extensions.pop('timeout', None) extensions.pop('timeout', None)
def send(self, request: Request) -> Response:
    """Send the request, tagging the (error or success) response's extensions
    with the impersonate target that was actually used."""
    resolved_target = self._get_request_target(request)
    try:
        response = super().send(request)
    except HTTPError as err:
        # Error responses are surfaced through the exception; tag them too.
        err.response.extensions['impersonate'] = resolved_target
        raise
    else:
        response.extensions['impersonate'] = resolved_target
        return response
def _send(self, request: Request): def _send(self, request: Request):
max_redirects_exceeded = False max_redirects_exceeded = False
session: curl_cffi.requests.Session = self._get_instance( session: curl_cffi.requests.Session = self._get_instance(

View File

@ -497,6 +497,7 @@ class Response(io.IOBase):
@param headers: response headers. @param headers: response headers.
@param status: Response HTTP status code. Default is 200 OK. @param status: Response HTTP status code. Default is 200 OK.
@param reason: HTTP status reason. Will use built-in reasons based on status code if not provided. @param reason: HTTP status reason. Will use built-in reasons based on status code if not provided.
@param extensions: Dictionary of handler-specific response extensions.
""" """
def __init__( def __init__(
@ -505,7 +506,9 @@ class Response(io.IOBase):
url: str, url: str,
headers: Mapping[str, str], headers: Mapping[str, str],
status: int = 200, status: int = 200,
reason: str = None): reason: str = None,
extensions: dict = None
):
self.fp = fp self.fp = fp
self.headers = Message() self.headers = Message()
@ -517,6 +520,7 @@ class Response(io.IOBase):
self.reason = reason or HTTPStatus(status).phrase self.reason = reason or HTTPStatus(status).phrase
except ValueError: except ValueError:
self.reason = None self.reason = None
self.extensions = extensions or {}
def readable(self): def readable(self):
return self.fp.readable() return self.fp.readable()