Differentiate native and external embeds

Add support for quoted embeds
Implement dictionary unpacking
This commit is contained in:
MellowKyler 2024-10-27 18:46:44 -05:00
parent f823ea950c
commit 070ca35cc4

View file

@ -14,7 +14,6 @@ class BlueskyIE(InfoExtractor):
'upload_date': '20240921', 'upload_date': '20240921',
'description': 'OMG WE HAVE VIDEOS NOW', 'description': 'OMG WE HAVE VIDEOS NOW',
'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$', 'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
'alt_title': None,
'uploader': str, 'uploader': str,
'channel': 'blu3blue.bsky.social', 'channel': 'blu3blue.bsky.social',
'uploader_id': 'did:plc:pzdr5ylumf7vmvwasrpr5bf2', 'uploader_id': 'did:plc:pzdr5ylumf7vmvwasrpr5bf2',
@ -27,16 +26,16 @@ class BlueskyIE(InfoExtractor):
'comment_count': int, 'comment_count': int,
'webpage_url': 'https://bsky.app/profile/blu3blue.bsky.social/post/3l4omssdl632g', 'webpage_url': 'https://bsky.app/profile/blu3blue.bsky.social/post/3l4omssdl632g',
'tags': 'count:1', 'tags': 'count:1',
'subtitles': dict, 'comments': 'mincount:29',
'comments': None, # 'count:29' if getcomments
}, },
'params': {'getcomments': True},
}, { }, {
'url': 'https://bsky.app/profile/bsky.app/post/3l3vgf77uco2g', 'url': 'https://bsky.app/profile/bsky.app/post/3l3vgf77uco2g',
'md5': 'b9e344fdbce9f2852c668a97efefb105', 'md5': 'b9e344fdbce9f2852c668a97efefb105',
'info_dict': { 'info_dict': {
'id': '3l3vgf77uco2g', 'id': '3l3vgf77uco2g',
'ext': 'mp4', 'ext': 'mp4',
'title': r're:Bluesky: "Bluesky now has video!', 'title': str,
'upload_date': '20240911', 'upload_date': '20240911',
'description': r're:Bluesky now has video!', 'description': r're:Bluesky now has video!',
'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$', 'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
@ -53,34 +52,9 @@ class BlueskyIE(InfoExtractor):
'comment_count': int, 'comment_count': int,
'webpage_url': 'https://bsky.app/profile/bsky.app/post/3l3vgf77uco2g', 'webpage_url': 'https://bsky.app/profile/bsky.app/post/3l3vgf77uco2g',
'tags': 'count:2', 'tags': 'count:2',
'subtitles': dict, 'subtitles': {
'comments': None, 'en': 'mincount:1',
}, },
}, {
'url': 'https://bsky.app/profile/did:plc:3tndo2mqg2vgpxnpyrxiol6p/post/3l45kdlktfe2o',
'md5': 'a426d7b0fc52bc89fc8f59668be3496e',
'info_dict': {
'id': '3l45kdlktfe2o',
'ext': 'mp4',
'title': str,
'upload_date': '20240914',
'description': r're:alright.\nthis was .. a tiny bit of a pain.',
'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
'alt_title': r're:me making a goofy little test video',
'uploader': str,
'channel': 'clockworkbanana.fun',
'uploader_id': 'did:plc:3tndo2mqg2vgpxnpyrxiol6p',
'channel_id': 'did:plc:3tndo2mqg2vgpxnpyrxiol6p',
'uploader_url': 'https://bsky.app/profile/clockworkbanana.fun',
'channel_url': 'https://bsky.app/profile/did:plc:3tndo2mqg2vgpxnpyrxiol6p',
'timestamp': 1726353835,
'like_count': int,
'repost_count': int,
'comment_count': int,
'webpage_url': 'https://bsky.app/profile/did:plc:3tndo2mqg2vgpxnpyrxiol6p/post/3l45kdlktfe2o',
'tags': 'count:1',
'subtitles': dict,
'comments': None,
}, },
}, { }, {
'url': 'https://bsky.app/profile/souris.moe/post/3l4qhp7bcs52c', 'url': 'https://bsky.app/profile/souris.moe/post/3l4qhp7bcs52c',
@ -92,7 +66,6 @@ class BlueskyIE(InfoExtractor):
'upload_date': '20240922', 'upload_date': '20240922',
'description': '', 'description': '',
'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$', 'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
'alt_title': None,
'uploader': str, 'uploader': str,
'channel': 'souris.moe', 'channel': 'souris.moe',
'uploader_id': 'did:plc:tj7g244gl5v6ai6cm4f4wlqp', 'uploader_id': 'did:plc:tj7g244gl5v6ai6cm4f4wlqp',
@ -106,7 +79,6 @@ class BlueskyIE(InfoExtractor):
'webpage_url': 'https://bsky.app/profile/souris.moe/post/3l4qhp7bcs52c', 'webpage_url': 'https://bsky.app/profile/souris.moe/post/3l4qhp7bcs52c',
'tags': 'count:1', 'tags': 'count:1',
'subtitles': 'count:0', 'subtitles': 'count:0',
'comments': None,
}, },
}, { }, {
'url': 'https://bsky.app/profile/de1.pds.tentacle.expert/post/3l3w4tnezek2e', 'url': 'https://bsky.app/profile/de1.pds.tentacle.expert/post/3l3w4tnezek2e',
@ -118,7 +90,6 @@ class BlueskyIE(InfoExtractor):
'upload_date': '20240911', 'upload_date': '20240911',
'description': '', 'description': '',
'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$', 'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
'alt_title': None,
'uploader': str, 'uploader': str,
'channel': 'de1.pds.tentacle.expert', 'channel': 'de1.pds.tentacle.expert',
'uploader_id': 'did:web:de1.tentacle.expert', 'uploader_id': 'did:web:de1.tentacle.expert',
@ -132,7 +103,86 @@ class BlueskyIE(InfoExtractor):
'webpage_url': 'https://bsky.app/profile/de1.pds.tentacle.expert/post/3l3w4tnezek2e', 'webpage_url': 'https://bsky.app/profile/de1.pds.tentacle.expert/post/3l3w4tnezek2e',
'tags': 'count:1', 'tags': 'count:1',
'subtitles': 'count:0', 'subtitles': 'count:0',
'comments': None, },
}, {
'url': 'https://bsky.app/profile/yunayuispink.bsky.social/post/3l7gqcfes742o',
'md5': 'd4dfae6a3e6e31b130e728b5b84258c4',
'info_dict': {
'id': 'XxK3t_5V3ao',
'ext': 'webm',
'uploader_id': '@yunayuispink',
'live_status': 'not_live',
'view_count': int,
'channel_url': 'https://www.youtube.com/channel/UCPLvXnHa7lTyNoR_dGsU14w',
'thumbnail': 'https://i.ytimg.com/vi_webp/XxK3t_5V3ao/maxresdefault.webp',
'upload_date': '20241026',
'uploader_url': 'https://www.youtube.com/@yunayuispink',
'description': 'md5:7d474e6ab76a88c84eb0f294e18ed828',
'categories': ['Entertainment'],
'tags': [],
'title': '5min vs 5hours drawing',
'duration': 321,
'uploader': 'yunayu',
'channel_follower_count': int,
'channel': 'yunayu',
'playable_in_embed': True,
'timestamp': 1729967784,
'like_count': int,
'channel_id': 'UCPLvXnHa7lTyNoR_dGsU14w',
'availability': 'public',
'age_limit': 0,
'comment_count': int,
},
'add_ie': ['Youtube'],
}, {
'url': 'https://bsky.app/profile/endshark.bsky.social/post/3jzxjkcemae2m',
'md5': 'd5c8fbc8f72b9f6ef160c150c420bb55',
'info_dict': {
'id': '222792849',
'ext': 'mp3',
'track': 'Forward to the End',
'thumbnail': 'https://f4.bcbits.com/img/a2507705510_5.jpg',
'album': 'Hari Nezumi [EP]',
'uploader_id': 'laserbatx',
'uploader': 'LASERBAT',
'duration': 228.571,
'album_artists': ['LASERBAT'],
'timestamp': 1682276040.0,
'uploader_url': 'https://laserbatx.bandcamp.com',
'track_id': '222792849',
'release_date': '20230423',
'upload_date': '20230423',
'release_timestamp': 1682276040.0,
'track_number': 1,
'artists': ['LASERBAT'],
'title': 'LASERBAT - Forward to the End',
},
'add_ie': ['Bandcamp'],
}, {
'url': 'https://bsky.app/profile/dannybhoix.bsky.social/post/3l6oe5mtr2c2j',
'md5': 'b9e344fdbce9f2852c668a97efefb105',
'info_dict': {
'id': '3l6oe5mtr2c2j',
'ext': 'mp4',
'description': 'this looks like a 2012 announcement video. i love it.',
'uploader_url': 'https://bsky.app/profile/dannybhoix.bsky.social',
'uploader': 'Danny',
'title': str,
'repost_count': int,
'comment_count': int,
'channel': 'dannybhoix.bsky.social',
'timestamp': 1729130330,
'uploader_id': 'did:plc:ng7fhshaed7assvhkq7cxxnw',
'upload_date': '20241017',
'channel_url': 'https://bsky.app/profile/did:plc:ng7fhshaed7assvhkq7cxxnw',
'tags': ['en'],
'like_count': int,
'channel_id': 'did:plc:ng7fhshaed7assvhkq7cxxnw',
'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
'alt_title': 'Bluesky video feature announcement',
'subtitles': {
'en': 'mincount:1',
},
}, },
}] }]
@ -148,13 +198,15 @@ def traverse_replies(self, thread_node, root_uri):
author_did = traverse_obj(post, ('author', 'did'), default='') author_did = traverse_obj(post, ('author', 'did'), default='')
yield { yield {
'id': post_uri, 'id': post_uri,
'text': traverse_obj(post, ('record', 'text')), **traverse_obj(post, {
'timestamp': parse_iso8601(traverse_obj(post, ('record', 'createdAt'))), 'text': ('record', 'text'),
'timestamp': ('record', 'createdAt', {parse_iso8601}),
'author': ('author', 'displayName'),
'author_thumbnail': ('author', 'avatar', {url_or_none}),
}),
'parent': 'root' if parent_uri == root_uri else parent_uri, 'parent': 'root' if parent_uri == root_uri else parent_uri,
'like_count': post.get('likeCount'), 'like_count': post.get('likeCount'),
'author': traverse_obj(post, ('author', 'displayName')),
'author_id': author_did, 'author_id': author_did,
'author_thumbnail': traverse_obj(post, ('author', 'avatar'), expected_type=url_or_none),
'author_url': f'https://bsky.app/profile/{author_handle}', 'author_url': f'https://bsky.app/profile/{author_handle}',
'author_is_uploader': author_did in root_uri, 'author_is_uploader': author_did in root_uri,
} }
@ -166,58 +218,100 @@ def traverse_replies(self, thread_node, root_uri):
def _real_extract(self, url):
    """Extract the media attached to, linked from, or quoted by a Bluesky post.

    The post thread is fetched from the public ``getPostThread`` endpoint
    (with reply depth 1000 only when the ``getcomments`` param is set) and
    the post's embed is classified in this order:

    1. external link on the post itself -> delegated via ``url_result``
    2. video on the post itself         -> HLS playlist (+ blob) formats
    3. external link on the quoted post -> delegated via ``url_result``
    4. video on the quoted post         -> HLS playlist (+ blob) formats
    5. anything else                    -> ``raise_no_formats``

    Returns either a ``url_result`` or an info dict for the post.
    """
    handle, video_id = self._match_valid_url(url).groups()
    getcomments = self.get_param('getcomments', False)
    meta = self._download_json(
        'https://public.api.bsky.app/xrpc/app.bsky.feed.getPostThread',
        video_id, headers={'Content-Type': 'application/json'}, query={
            'uri': f'at://{handle}/app.bsky.feed.post/{video_id}',
            'depth': 1000 if getcomments else 0,
            'parentHeight': 1000 if getcomments else 0,
        })['thread']
    # Tolerate a thread node without a 'post' entry: the lookups below then
    # fall through to the "no video" branch instead of failing on None.get()
    post = meta.get('post') or {}
    did = traverse_obj(post, ('author', 'did'))

    # The ('media', None) branch reads the embed nested under 'media' when
    # that key is present and the embed object itself otherwise
    record_embed = traverse_obj(post, ('record', 'embed', ('media', None)), get_all=False)
    post_type = record_embed.get('$type') if record_embed else None
    quoted_post = traverse_obj(post, ('embed', 'record', ('record', None)), get_all=False)
    quoted_type = traverse_obj(quoted_post, ('value', 'embed', ('media', None), '$type'), get_all=False)
    # Both quoted lookups default to {} so the quoted-video branch can read
    # them unconditionally (previously quoted_embed was only bound when a
    # blob cid existed, and quoted_media could be None)
    quoted_embed = traverse_obj(quoted_post, ('value', 'embed', ('media', None)), get_all=False) or {}
    quoted_media = traverse_obj(quoted_post, ('embeds', 0, ('media', None)), get_all=False) or {}

    def extract_formats(playlist_url, blob_owner, blob_cid, blob_embed):
        # Shared by the native and quoted video branches: HLS formats first,
        # then one extra 'blob' format served by com.atproto.sync.getBlob
        formats, subtitles = self._extract_m3u8_formats_and_subtitles(
            playlist_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False,
            note='Downloading m3u8 information', errnote='Unable to download m3u8 information')
        if blob_cid:
            formats.append({
                'format_id': 'blob',
                'url': f'https://bsky.social/xrpc/com.atproto.sync.getBlob?did={blob_owner}&cid={blob_cid}',
                **traverse_obj(blob_embed, {
                    'ext': ('video', 'mimeType', {mimetype2ext}),
                    'width': ('aspectRatio', 'width', {int_or_none}),
                    'height': ('aspectRatio', 'height', {int_or_none}),
                    'filesize': ('video', 'size', {int_or_none}),
                }),
            })
        return formats, subtitles

    if post_type == 'app.bsky.embed.external':
        return self.url_result(traverse_obj(
            post, ('embed', ('media', None), 'external', 'uri'), get_all=False)
            or traverse_obj(record_embed, ('external', 'uri')))
    elif post_type == 'app.bsky.embed.video':
        formats, subs = extract_formats(
            traverse_obj(post, ('embed', ('media', None), 'playlist'), get_all=False),
            did, traverse_obj(record_embed, ('video', 'ref', '$link'), ('video', 'cid')),
            record_embed)
        video_info = {
            'formats': formats,
            'subtitles': subs,
            # Use the same ('media', None) fallback as the playlist lookup so
            # thumbnail/alt are also found when the video is nested under 'media'
            'thumbnail': traverse_obj(
                post, ('embed', ('media', None), 'thumbnail', {url_or_none}), get_all=False),
            'alt_title': traverse_obj(post, ('embed', ('media', None), 'alt'), get_all=False),
        }
    elif quoted_type == 'app.bsky.embed.external':
        return self.url_result(traverse_obj(quoted_media, ('external', 'uri')))
    elif quoted_type == 'app.bsky.embed.video':
        # The blob belongs to the quoted post's author, not to the quoting one
        formats, subs = extract_formats(
            quoted_media.get('playlist'), traverse_obj(quoted_post, ('author', 'did')),
            quoted_media.get('cid'), quoted_embed)
        video_info = {
            'formats': formats,
            'subtitles': subs,
            'thumbnail': url_or_none(quoted_media.get('thumbnail')),
            'alt_title': quoted_embed.get('alt') or quoted_media.get('alt'),
        }
    else:
        self.raise_no_formats('No video could be found in this post', expected=True)
        # NOTE(review): raise_no_formats appears able to return instead of
        # raising when no-format errors are being ignored - confirm. Keep
        # video_info bound so the metadata-only result below still builds.
        video_info = {}

    handle = traverse_obj(post, ('author', 'handle'))
    uploader = traverse_obj(post, ('author', 'displayName')) or handle
    return {
        'id': video_id,
        'title': f'{uploader} on Bluesky',
        **video_info,
        'uploader': uploader,
        'channel': handle,
        'uploader_id': did,
        'channel_id': did,
        'uploader_url': f'https://bsky.app/profile/{handle}',
        'channel_url': f'https://bsky.app/profile/{did}',
        'like_count': post.get('likeCount'),
        'repost_count': post.get('repostCount'),
        'comment_count': post.get('replyCount'),
        'tags': post.get('labels', []) + traverse_obj(post, ('record', 'langs'), default=[]),
        '__post_extractor': self.extract_comments(meta),
        **traverse_obj(post, {
            'timestamp': ('record', 'createdAt', {parse_iso8601}),
            'description': ('record', 'text'),
        }),
    }