From ad2f206b059ada5cc03127c233f60c8df622000d Mon Sep 17 00:00:00 2001 From: MellowKyler Date: Thu, 31 Oct 2024 22:44:49 -0500 Subject: [PATCH] - support non-bsky PLCs (requires service_endpoint retrieval) - support main.bsky.dev links - update tag retrieval logic - add age_limit to info_dict - force getcomments in YouTube test to avoid inconsistent None comment_count --- yt_dlp/extractor/bluesky.py | 59 ++++++++++++++++++++++++++++++++----- 1 file changed, 52 insertions(+), 7 deletions(-) diff --git a/yt_dlp/extractor/bluesky.py b/yt_dlp/extractor/bluesky.py index 1a60943e17..936514351f 100644 --- a/yt_dlp/extractor/bluesky.py +++ b/yt_dlp/extractor/bluesky.py @@ -3,7 +3,7 @@ class BlueskyIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?bsky\.app/profile/(?P[^/]+)/post/(?P[0-9a-zA-Z]+)' + _VALID_URL = r'https?://(?:www\.)?(?:bsky\.app|main\.bsky\.dev)/profile/(?P[^/]+)/post/(?P[0-9a-zA-Z]+)' _TESTS = [{ 'url': 'https://bsky.app/profile/blu3blue.bsky.social/post/3l4omssdl632g', 'md5': '375539c1930ab05d15585ed772ab54fd', @@ -27,6 +27,7 @@ class BlueskyIE(InfoExtractor): 'webpage_url': 'https://bsky.app/profile/blu3blue.bsky.social/post/3l4omssdl632g', 'tags': 'count:1', 'comments': 'mincount:29', + 'age_limit': 0, }, 'params': {'getcomments': True}, }, { @@ -51,13 +52,14 @@ class BlueskyIE(InfoExtractor): 'repost_count': int, 'comment_count': int, 'webpage_url': 'https://bsky.app/profile/bsky.app/post/3l3vgf77uco2g', - 'tags': 'count:2', + 'tags': ['en', 'pt'], 'subtitles': { 'en': 'mincount:1', }, + 'age_limit': 0, }, }, { - 'url': 'https://bsky.app/profile/souris.moe/post/3l4qhp7bcs52c', + 'url': 'https://main.bsky.dev/profile/souris.moe/post/3l4qhp7bcs52c', 'md5': '5f2df8c200b5633eb7fb2c984d29772f', 'info_dict': { 'id': '3l4qhp7bcs52c', @@ -76,9 +78,10 @@ class BlueskyIE(InfoExtractor): 'like_count': int, 'repost_count': int, 'comment_count': int, - 'webpage_url': 'https://bsky.app/profile/souris.moe/post/3l4qhp7bcs52c', + 'webpage_url': 
'https://main.bsky.dev/profile/souris.moe/post/3l4qhp7bcs52c', 'tags': 'count:1', 'subtitles': 'count:0', + 'age_limit': 0, }, }, { 'url': 'https://bsky.app/profile/de1.pds.tentacle.expert/post/3l3w4tnezek2e', @@ -103,6 +106,7 @@ class BlueskyIE(InfoExtractor): 'webpage_url': 'https://bsky.app/profile/de1.pds.tentacle.expert/post/3l3w4tnezek2e', 'tags': 'count:1', 'subtitles': 'count:0', + 'age_limit': 0, }, }, { 'url': 'https://bsky.app/profile/yunayuispink.bsky.social/post/3l7gqcfes742o', @@ -134,6 +138,7 @@ class BlueskyIE(InfoExtractor): 'comment_count': int, }, 'add_ie': ['Youtube'], + 'params': {'getcomments': True}, }, { 'url': 'https://bsky.app/profile/endshark.bsky.social/post/3jzxjkcemae2m', 'md5': 'd5c8fbc8f72b9f6ef160c150c420bb55', @@ -183,6 +188,30 @@ class BlueskyIE(InfoExtractor): 'subtitles': { 'en': 'mincount:1', }, + 'age_limit': 0, + }, + }, { + 'url': 'https://bsky.app/profile/alt.bun.how/post/3l7rdfxhyds2f', + 'md5': '8775118b235cf9fa6b5ad30f95cda75c', + 'info_dict': { + 'id': '3l7rdfxhyds2f', + 'ext': 'mp4', + 'channel_url': 'https://bsky.app/profile/did:plc:7x6rtuenkuvxq3zsvffp2ide', + 'timestamp': 1730332128, + 'channel_id': 'did:plc:7x6rtuenkuvxq3zsvffp2ide', + 'upload_date': '20241030', + 'channel': 'alt.bun.how', + 'uploader_id': 'did:plc:7x6rtuenkuvxq3zsvffp2ide', + 'description': 'crazy that i look like this tbh', + 'comment_count': int, + 'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$', + 'uploader_url': 'https://bsky.app/profile/alt.bun.how', + 'tags': ['en', 'sexual', 'sexual'], + 'like_count': int, + 'title': 'cinnamon on Bluesky', + 'uploader': 'cinnamon', + 'repost_count': int, + 'age_limit': 18, }, }] @@ -216,6 +245,15 @@ def traverse_replies(self, thread_node, root_uri): if parent := thread_node.get('parent'): yield from self.traverse_replies(parent, root_uri)
+    def get_service_endpoint(self, did, video_id):
+        """Return the PDS base URL advertised for `did`, falling back to https://bsky.social"""
+        did_doc = self._download_json(  # fatal=False: a failed request yields no dict, hence `or {}`
+            f'https://resolver.identity.foundation/1.0/identifiers/{did}', video_id, fatal=False) or {}
+        for service in did_doc.get('service') or []:
+            if service.get('type') == 'AtprotoPersonalDataServer':
+                return service.get('serviceEndpoint') or 'https://bsky.social'
+        return 'https://bsky.social'
+
 def _real_extract(self, url): handle, video_id = self._match_valid_url(url).groups() @@ -246,9 +284,10 @@ def _real_extract(self, url): video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False, note='Downloading m3u8 information', errnote='Unable to download m3u8 information') if blob_cid := traverse_obj(record_embed, ('video', 'ref', '$link'), ('video', 'cid')): + endpoint = self.get_service_endpoint(did, video_id) formats.append({ 'format_id': 'blob', - 'url': f'https://bsky.social/xrpc/com.atproto.sync.getBlob?did={did}&cid={blob_cid}', + 'url': f'{endpoint}/xrpc/com.atproto.sync.getBlob?did={did}&cid={blob_cid}', **traverse_obj(record_embed, { 'ext': ('video', 'mimeType', {mimetype2ext}), 'width': ('aspectRatio', 'width', {int_or_none}), @@ -273,9 +312,10 @@ def _real_extract(self, url): if blob_cid := quoted_media.get('cid'): quoted_did = traverse_obj(quoted_post, ('author', 'did')) quoted_embed = traverse_obj(quoted_post, ('value', 'embed', ('media', None)), get_all=False) + endpoint = self.get_service_endpoint(quoted_did, video_id) formats.append({ 'format_id': 'blob', - 'url': f'https://bsky.social/xrpc/com.atproto.sync.getBlob?did={quoted_did}&cid={blob_cid}', + 'url': f'{endpoint}/xrpc/com.atproto.sync.getBlob?did={quoted_did}&cid={blob_cid}', **traverse_obj(quoted_embed, { 'ext': ('video', 'mimeType', {mimetype2ext}), 'width': ('aspectRatio', 'width', {int_or_none}), @@ -295,6 +335,10 @@ def _real_extract(self, url): handle = traverse_obj(post, ('author', 'handle')) uploader = traverse_obj(post, ('author', 'displayName')) or handle + tags = traverse_obj(post, ('record', 'langs'), default=[]) + if label_list := post.get('labels'): + tags.extend(label.get('val') for label in label_list) + return { 'id': video_id, 'title': f'{uploader} on Bluesky', @@ -308,7 +352,8 @@
def _real_extract(self, url): 'like_count': post.get('likeCount'), 'repost_count': post.get('repostCount'), 'comment_count': post.get('replyCount'), - 'tags': post.get('labels', []) + traverse_obj(post, ('record', 'langs'), default=[]), + 'tags': tags, + 'age_limit': 18 if {'sexual', 'porn', 'graphic-media'}.intersection(tags) else 0, '__post_extractor': self.extract_comments(meta), **traverse_obj(post, { 'timestamp': ('record', 'createdAt', {parse_iso8601}),