don't scrape personal instances

Tao Bojlen 2018-09-03 00:36:03 +02:00
parent a176e35ec2
commit 62ed809ee0
2 changed files with 34 additions and 14 deletions

scraper/management/commands/_util.py

@@ -42,11 +42,19 @@ def require_lock(model, lock):
     return require_lock_decorator
 
 
-class InvalidResponseError(Exception):
+class InvalidResponseException(Exception):
     """Used for all responses other than HTTP 200"""
     pass
 
 
+class PersonalInstanceException(Exception):
+    """
+    Used for instances that we don't want to scrape because there are too few users.
+    We don't want information on individuals, but aggregate statistics on instances and how they interact.
+    """
+    pass
+
+
 def get_key(data, keys: list):
     try:
         val = data[keys.pop(0)]

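The personal-instance check added in the second file leans on get_key (whose first lines are visible above) to walk a list of keys through nested dicts without raising, e.g. get_key(data, ['info', 'stats', 'user_count']). A minimal sketch of that idea, assuming the helper returns an empty default when any key is missing (only the opening lines of the real implementation are shown in this diff, so the fallback value here is an assumption):

    def get_key(data, keys: list):
        """Walk `keys` into nested dicts: get_key(d, ['a', 'b']) -> d['a']['b'].

        Sketch only -- the visible fragment shows the first lookup; the
        default returned on a missing key is assumed.
        """
        try:
            val = data[keys.pop(0)]
            while keys:
                val = val[keys.pop(0)]
            return val
        except (KeyError, TypeError, IndexError):
            return ''

This shape also explains the guard `get_key(json, ['error'])` used below: a missing 'error' key yields a falsy default rather than an exception.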
scraper/management/commands/… (the scraper management command)

@@ -14,7 +14,7 @@ from django_bulk_update.helper import bulk_update
 from django.core.management.base import BaseCommand
 from django import db
 from scraper.models import Instance, PeerRelationship
-from scraper.management.commands._util import require_lock, InvalidResponseError, get_key, log, validate_int
+from scraper.management.commands._util import require_lock, InvalidResponseException, get_key, log, validate_int, PersonalInstanceException
 
 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
 # Because the script uses the Mastodon API other platforms like         #
@@ -30,8 +30,9 @@ from scraper.management.commands._util import require_lock, InvalidResponseError
 # TODO: use the /api/v1/server/followers and /api/v1/server/following endpoints in peertube instances
 
 SEED = 'mastodon.social'
-TIMEOUT = 10
+TIMEOUT = 10  # seconds
 NUM_THREADS = 64
+PERSONAL_INSTANCE_THRESHOLD = 5  # instances with fewer than this many users won't be scraped
 
 
 class Command(BaseCommand):
@@ -39,7 +40,7 @@ class Command(BaseCommand):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.scraped_ids = set()
+        self.scraped_count = 0
 
     @staticmethod
     def get_instance_info(instance_name: str):
@@ -48,7 +49,7 @@ class Command(BaseCommand):
         response = requests.get(url, timeout=TIMEOUT)
         json = response.json()
         if response.status_code != 200 or get_key(json, ['error']):
-            raise InvalidResponseError("Could not get info for {}".format(instance_name))
+            raise InvalidResponseException("Could not get info for {}".format(instance_name))
         return json
 
     @staticmethod
@@ -60,7 +61,7 @@ class Command(BaseCommand):
         response = requests.get(url, timeout=TIMEOUT)
         peers = response.json()
         if response.status_code != 200 or not isinstance(peers, list) or get_key(peers, ['error']):
-            raise InvalidResponseError("Could not get peers for {}".format(instance_name))
+            raise InvalidResponseException("Could not get peers for {}".format(instance_name))
         # Get rid of peers that just say "null" and the instance itself
         return [peer for peer in peers if peer and peer != instance_name]
 
@@ -77,7 +78,7 @@ class Command(BaseCommand):
             response = requests.get(url, timeout=TIMEOUT)
             statuses = response.json()
             if response.status_code != 200 or get_key(statuses, ['error']):
-                raise InvalidResponseError("Could not get statuses for {}".format(instance_name))
+                raise InvalidResponseException("Could not get statuses for {}".format(instance_name))
             elif len(statuses) == 0:
                 break
             # Get mentions from this instance
@@ -111,14 +112,23 @@ class Command(BaseCommand):
         try:
             data['instance_name'] = instance.name
             data['info'] = self.get_instance_info(instance.name)
+
+            # Check if this is a personal instance before continuing
+            user_count = get_key(data, ['info', 'stats', 'user_count'])
+            if isinstance(user_count, int) and user_count < PERSONAL_INSTANCE_THRESHOLD:
+                raise PersonalInstanceException
+
             data['peers'] = self.get_instance_peers(instance.name)
             if not data['info'] and not data['peers']:
                 # We got a response from the instance, but it didn't have any of the information we were expecting.
-                raise InvalidResponseError
+                raise InvalidResponseException
             data['mentions'], data['statuses_seen'] = self.get_statuses(instance.name)
             data['status'] = 'success'
             return data
-        except (InvalidResponseError,
+        except (InvalidResponseException,
+                PersonalInstanceException,
                 requests.exceptions.RequestException,
                 json.decoder.JSONDecodeError) as e:
             data['instance_name'] = instance.name
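
Note the boundary the new guard draws: with PERSONAL_INSTANCE_THRESHOLD = 5, an instance reporting four or fewer users is skipped, a five-user instance is still scraped, and an instance that reports no stats at all (user_count missing or not an int) is also still scraped. A hypothetical standalone restatement of the guard, not part of the commit, makes the edge cases explicit:

    def is_personal(user_count, threshold=5):
        # mirrors: isinstance(user_count, int) and user_count < PERSONAL_INSTANCE_THRESHOLD
        return isinstance(user_count, int) and user_count < threshold

    assert is_personal(4)           # fewer than 5 users: skipped for privacy
    assert not is_personal(5)       # at the threshold: still scraped
    assert not is_personal(None)    # stats missing or malformed: still scraped
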
@@ -171,13 +181,13 @@ class Command(BaseCommand):
 
             self.stdout.write(log("Saved {}".format(data['instance_name'])))
 
-    def worker(self, queue: mp.JoinableQueue, existing_instance_ids):
+    def worker(self, queue: mp.JoinableQueue, existing_instance_ids, scraped_ids):
         """The main worker that processes URLs"""
        # https://stackoverflow.com/a/38356519/3697202
         db.connections.close_all()
         while True:
             instance = queue.get()
-            if instance.name in self.scraped_ids:
+            if instance.name in scraped_ids:
                 self.stderr.write(log("Skipping {}, already done. This should not have been added to the queue!"
                                       .format(instance)))
                 queue.task_done()
@@ -186,7 +196,7 @@ class Command(BaseCommand):
                 self.stdout.write(log("Processing {}".format(instance.name)))
                 data = self.process_instance(instance)
                 self.save_data(instance, data, queue, existing_instance_ids)
-                self.scraped_ids.add(instance.name)
+                scraped_ids[instance.name] = 1
                 queue.task_done()
 
     def handle(self, *args, **options):
def handle(self, *args, **options): def handle(self, *args, **options):
@@ -196,6 +206,7 @@ class Command(BaseCommand):
         # Share the list of existing instances amongst all threads (to avoid each thread having to query
         # for it on every instance it scrapes)
         existing_instance_ids = manager.list(list(Instance.objects.values_list('name', flat=True)))
+        scraped_ids = manager.dict()
         queue = mp.JoinableQueue()
         if stale_instances:
             for instance in stale_instances:
@@ -205,9 +216,10 @@ class Command(BaseCommand):
             existing_instance_ids.append(instance.name)
             queue.put(instance)
 
-        pool = mp.Pool(NUM_THREADS, initializer=self.worker, initargs=(queue, existing_instance_ids))
+        pool = mp.Pool(NUM_THREADS, initializer=self.worker, initargs=(queue, existing_instance_ids, scraped_ids))
         queue.join()
+        self.scraped_count = len(scraped_ids.keys())
         end_time = time.time()
         self.stdout.write(self.style.SUCCESS(log("Scraped {} instances in {:.0f}s"
-                                             .format(len(self.scraped_ids), end_time - start_time))))
+                                             .format(self.scraped_count, end_time - start_time))))
 
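
The move from self.scraped_ids (a plain set on the Command object) to manager.dict() is what makes both deduplication and the final count correct: mp.Pool runs workers in separate processes, so each worker gets its own copy of the Command instance, and additions to self.scraped_ids were never visible to the parent or to sibling workers. A Manager-backed dict is proxied across process boundaries; it is used as a set here (hence the placeholder value 1), since the stdlib Manager exposes no shared set type. A minimal self-contained sketch of the pattern, with simplified names that are not the project's code:

    # Sketch: sharing scrape state across pool workers via a Manager dict.
    import multiprocessing as mp

    def worker(queue, scraped_ids):
        # Runs in each pool process; mirrors the commit's initializer-as-worker pattern.
        while True:
            name = queue.get()
            scraped_ids[name] = 1  # write is visible to the parent and other workers
            queue.task_done()

    if __name__ == '__main__':
        manager = mp.Manager()
        scraped_ids = manager.dict()
        queue = mp.JoinableQueue()
        for name in ['a.example', 'b.example']:
            queue.put(name)
        pool = mp.Pool(2, initializer=worker, initargs=(queue, scraped_ids))
        queue.join()  # blocks until every queued item is task_done
        print(len(scraped_ids.keys()))  # 2 -- the parent sees what workers recorded

Had scraped_ids stayed a set on self, the final print would report 0 for the same reason the old "Scraped {} instances" log undercounted.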