Merge branch 'develop' into 'production'
Develop See merge request taobojlen/fediverse.space!49
This commit is contained in:
commit
a22d447644
|
@ -69,5 +69,11 @@ def validate_int(integer):
|
||||||
return integer if (isinstance(integer, int) and 0 <= integer < 2147483647) else None
|
return integer if (isinstance(integer, int) and 0 <= integer < 2147483647) else None
|
||||||
|
|
||||||
|
|
||||||
def log(text):
|
def log(obj, text, success=False, error=False):
|
||||||
return "{} - {}".format(datetime.now().isoformat(), text)
|
text = "{} - {}".format(datetime.now().isoformat(), text)
|
||||||
|
if success:
|
||||||
|
text = obj.style.SUCCESS(text)
|
||||||
|
if error:
|
||||||
|
obj.stderr.write(text)
|
||||||
|
else:
|
||||||
|
obj.stdout.write(text)
|
||||||
|
|
|
@ -21,11 +21,11 @@ from scraper.management.commands._util import require_lock, InvalidResponseExcep
|
||||||
|
|
||||||
# TODO: use the /api/v1/server/followers and /api/v1/server/following endpoints in peertube instances
|
# TODO: use the /api/v1/server/followers and /api/v1/server/following endpoints in peertube instances
|
||||||
|
|
||||||
SEED = 'mastodon.social'
|
SEED = 'p.a3.pm'
|
||||||
TIMEOUT = 20 # seconds
|
TIMEOUT = 20 # seconds
|
||||||
NUM_THREADS = 16 # roughly 40MB each
|
NUM_THREADS = 16 # roughly 40MB each
|
||||||
PERSONAL_INSTANCE_THRESHOLD = 10 # instances with < this many users won't be crawled
|
PERSONAL_INSTANCE_THRESHOLD = 10 # instances with < this many users won't be crawled
|
||||||
MAX_STATUSES_PER_PAGE = 100
|
MAX_STATUSES_PER_PAGE = 40
|
||||||
STATUS_SCRAPE_LIMIT = 5000
|
STATUS_SCRAPE_LIMIT = 5000
|
||||||
INSTANCE_SCRAPE_LIMIT = 50 # note: this does not include newly discovered instances! they will always be crawled.
|
INSTANCE_SCRAPE_LIMIT = 50 # note: this does not include newly discovered instances! they will always be crawled.
|
||||||
|
|
||||||
|
@ -47,26 +47,38 @@ class Command(BaseCommand):
|
||||||
dest='all',
|
dest='all',
|
||||||
help="Crawl all instances rather than limiting to stale ones"
|
help="Crawl all instances rather than limiting to stale ones"
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'--verbose',
|
||||||
|
action='store_true',
|
||||||
|
dest='verbose',
|
||||||
|
help="Verbose logging"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'--instance',
|
||||||
|
dest='instance',
|
||||||
|
help="Crawl a single instance"
|
||||||
|
)
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
super().__init__(*args, **kwargs)
|
super().__init__(*args, **kwargs)
|
||||||
|
self.verbose = False
|
||||||
self.scraped_count = 0
|
self.scraped_count = 0
|
||||||
f = open(os.path.join(settings.BASE_DIR, '../whitelist.txt'), 'r')
|
f = open(os.path.join(settings.BASE_DIR, '../whitelist.txt'), 'r')
|
||||||
self.whitelist = seq(f.readlines()).map(lambda i: i.lower().strip()).to_list()
|
self.whitelist = seq(f.readlines()).map(lambda i: i.lower().strip()).to_list()
|
||||||
f.close()
|
f.close()
|
||||||
|
|
||||||
@staticmethod
|
def get_instance_info(self, instance_name: str):
|
||||||
def get_instance_info(instance_name: str):
|
|
||||||
"""Collect info about instance"""
|
"""Collect info about instance"""
|
||||||
url = 'https://' + instance_name + '/api/v1/instance'
|
url = 'https://' + instance_name + '/api/v1/instance'
|
||||||
response = requests.get(url, timeout=TIMEOUT)
|
response = requests.get(url, timeout=TIMEOUT)
|
||||||
json = response.json()
|
json = response.json()
|
||||||
if response.status_code != 200 or get_key(json, ['error']):
|
if response.status_code != 200 or get_key(json, ['error']):
|
||||||
|
if self.verbose:
|
||||||
|
log(self, "Couldn't get instance info for {}: {}".format(instance_name, response), error=True)
|
||||||
raise InvalidResponseException("Could not get info for {}".format(instance_name))
|
raise InvalidResponseException("Could not get info for {}".format(instance_name))
|
||||||
return json
|
return json
|
||||||
|
|
||||||
@staticmethod
|
def get_instance_peers(self, instance_name: str):
|
||||||
def get_instance_peers(instance_name: str):
|
|
||||||
"""Collect connected instances"""
|
"""Collect connected instances"""
|
||||||
# The peers endpoint returns a "list of all domain names known to this instance"
|
# The peers endpoint returns a "list of all domain names known to this instance"
|
||||||
# (https://github.com/tootsuite/mastodon/pull/6125)
|
# (https://github.com/tootsuite/mastodon/pull/6125)
|
||||||
|
@ -74,24 +86,29 @@ class Command(BaseCommand):
|
||||||
response = requests.get(url, timeout=TIMEOUT)
|
response = requests.get(url, timeout=TIMEOUT)
|
||||||
peers = response.json()
|
peers = response.json()
|
||||||
if response.status_code != 200 or not isinstance(peers, list) or get_key(peers, ['error']):
|
if response.status_code != 200 or not isinstance(peers, list) or get_key(peers, ['error']):
|
||||||
|
if self.verbose:
|
||||||
|
log(self, "Couldn't get peers for {}: {}".format(instance_name, response), error=True)
|
||||||
raise InvalidResponseException("Could not get peers for {}".format(instance_name))
|
raise InvalidResponseException("Could not get peers for {}".format(instance_name))
|
||||||
# Get rid of peers that just say "null" and the instance itself
|
# Get rid of peers that just say "null" and the instance itself
|
||||||
# Also make sure to lowercase all instance names and remove duplicates
|
# Also make sure to lowercase all instance names and remove duplicates
|
||||||
return list(set([peer.lower() for peer in peers if peer and peer != instance_name]))
|
return list(set([peer.lower() for peer in peers if peer and peer != instance_name]))
|
||||||
|
|
||||||
@staticmethod
|
def get_statuses(self, instance_name: str):
|
||||||
def get_statuses(instance_name: str):
|
|
||||||
"""Collect all statuses that mention users on other instances"""
|
"""Collect all statuses that mention users on other instances"""
|
||||||
mentions = []
|
mentions = []
|
||||||
datetime_threshold = datetime.now(timezone.utc) - timedelta(days=31)
|
datetime_threshold = datetime.now(timezone.utc) - timedelta(days=31)
|
||||||
statuses_seen = 0
|
statuses_seen = 0
|
||||||
# We'll ask for lots of statuses, but Mastodon never returns more than 40. Some Pleroma instances will ignore
|
# We'll ask for lots of statuses, but Mastodon never returns more than 40. Some Pleroma instances will ignore
|
||||||
# the limit and return 20.
|
# the limit and return 20.
|
||||||
url = 'https://{}/api/v1/timelines/public?local=true&limit={}/'.format(instance_name, MAX_STATUSES_PER_PAGE)
|
url = 'https://{}/api/v1/timelines/public?local=true&limit={}'.format(instance_name, MAX_STATUSES_PER_PAGE)
|
||||||
while True:
|
while True:
|
||||||
|
if self.verbose:
|
||||||
|
log(self, "({} posts seen)\tGetting {}".format(statuses_seen, url))
|
||||||
response = requests.get(url, timeout=TIMEOUT)
|
response = requests.get(url, timeout=TIMEOUT)
|
||||||
statuses = response.json()
|
statuses = response.json()
|
||||||
if response.status_code != 200 or get_key(statuses, ['error']):
|
if response.status_code != 200 or get_key(statuses, ['error']):
|
||||||
|
if self.verbose:
|
||||||
|
log(self, "Couldn't get statuses for {}: {}".format(instance_name, response), error=True)
|
||||||
raise InvalidResponseException("Could not get statuses for {}".format(instance_name))
|
raise InvalidResponseException("Could not get statuses for {}".format(instance_name))
|
||||||
elif len(statuses) == 0:
|
elif len(statuses) == 0:
|
||||||
break
|
break
|
||||||
|
@ -197,7 +214,7 @@ class Command(BaseCommand):
|
||||||
relationship.last_updated = datetime.now()
|
relationship.last_updated = datetime.now()
|
||||||
bulk_update(relationships, update_fields=['mention_count', 'statuses_seen', 'last_updated'])
|
bulk_update(relationships, update_fields=['mention_count', 'statuses_seen', 'last_updated'])
|
||||||
|
|
||||||
self.stdout.write(log("Processed {}: {}".format(data['instance_name'], data['status'])))
|
log(self, "Processed {}: {}".format(data['instance_name'], data['status']))
|
||||||
|
|
||||||
def worker(self, queue: mp.JoinableQueue, existing_instance_ids, scraped_ids):
|
def worker(self, queue: mp.JoinableQueue, existing_instance_ids, scraped_ids):
|
||||||
"""The main worker that processes instances"""
|
"""The main worker that processes instances"""
|
||||||
|
@ -206,12 +223,12 @@ class Command(BaseCommand):
|
||||||
instance = queue.get()
|
instance = queue.get()
|
||||||
if instance.name in scraped_ids:
|
if instance.name in scraped_ids:
|
||||||
# If we hit this branch, it's indicative of a bug
|
# If we hit this branch, it's indicative of a bug
|
||||||
self.stderr.write(log("Skipping {}, already done. This should not have been added to the queue!"
|
log(self, "Skipping {}, already done. This should not have been added to the queue!".format(instance),
|
||||||
.format(instance)))
|
error=True)
|
||||||
queue.task_done()
|
queue.task_done()
|
||||||
else:
|
else:
|
||||||
# Fetch data on instance
|
# Fetch data on instance
|
||||||
self.stdout.write(log("Processing {}".format(instance.name)))
|
log(self, "Processing {}".format(instance.name))
|
||||||
data = self.process_instance(instance)
|
data = self.process_instance(instance)
|
||||||
self.save_data(instance, data, queue, existing_instance_ids)
|
self.save_data(instance, data, queue, existing_instance_ids)
|
||||||
scraped_ids[instance.name] = 1
|
scraped_ids[instance.name] = 1
|
||||||
|
@ -219,7 +236,13 @@ class Command(BaseCommand):
|
||||||
|
|
||||||
def handle(self, *args, **options):
|
def handle(self, *args, **options):
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
if options['all']:
|
|
||||||
|
self.verbose = options['verbose']
|
||||||
|
|
||||||
|
if options['instance']:
|
||||||
|
stale_instance, _ = Instance.objects.get_or_create(name=options['instance'])
|
||||||
|
stale_instances = [stale_instance]
|
||||||
|
elif options['all']:
|
||||||
stale_instances = Instance.objects.all()
|
stale_instances = Instance.objects.all()
|
||||||
else:
|
else:
|
||||||
stale_instances = Instance.objects.filter(last_updated__lte=datetime.now()-timedelta(days=1))
|
stale_instances = Instance.objects.filter(last_updated__lte=datetime.now()-timedelta(days=1))
|
||||||
|
@ -246,5 +269,4 @@ class Command(BaseCommand):
|
||||||
self.scraped_count = len(scraped_ids.keys())
|
self.scraped_count = len(scraped_ids.keys())
|
||||||
|
|
||||||
end_time = time.time()
|
end_time = time.time()
|
||||||
self.stdout.write(self.style.SUCCESS(log("Scraped {} instances in {:.0f}s"
|
log(self, "Scraped {} instances in {:.0f}s".format(self.scraped_count, end_time - start_time), True)
|
||||||
.format(self.scraped_count, end_time - start_time))))
|
|
||||||
|
|
|
@ -1 +0,0 @@
|
||||||
mastodon.social
|
|
|
@ -267,7 +267,7 @@ class SidebarImpl extends React.Component<ISidebarProps, ISidebarState> {
|
||||||
<NonIdealState
|
<NonIdealState
|
||||||
icon={IconNames.BLOCKED_PERSON}
|
icon={IconNames.BLOCKED_PERSON}
|
||||||
title="No data"
|
title="No data"
|
||||||
description="This instance has fewer than 5 users. It was not crawled in order to protect their privacy, but if it's your instance you can opt in."
|
description="This instance has fewer than 10 users. It was not crawled in order to protect their privacy, but if it's your instance you can opt in."
|
||||||
action={<AnchorButton icon={IconNames.CONFIRM} href="https://cursed.technology/@tao" target="_blank">
|
action={<AnchorButton icon={IconNames.CONFIRM} href="https://cursed.technology/@tao" target="_blank">
|
||||||
Message @tao to opt in</AnchorButton>}
|
Message @tao to opt in</AnchorButton>}
|
||||||
/>
|
/>
|
||||||
|
|
Loading…
Reference in a new issue