From 63d7a3fe95c32012558c1c225b1b9c40182fbdb9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tao=20Bojl=C3=A9n?= <2803708-taobojlen@users.noreply.gitlab.com>
Date: Thu, 28 Feb 2019 18:09:26 +0000
Subject: [PATCH] Clean up crawler

---
 backend/apiv1/views.py                        |  5 ++-
 backend/scraper/management/commands/scrape.py | 37 ++++++++++++++++---
 2 files changed, 35 insertions(+), 7 deletions(-)

diff --git a/backend/apiv1/views.py b/backend/apiv1/views.py
index 9dff54a..9be587f 100644
--- a/backend/apiv1/views.py
+++ b/backend/apiv1/views.py
@@ -32,5 +32,8 @@ class NodeView(viewsets.ReadOnlyModelViewSet):
     """
     Endpoint to get a list of the graph's nodes in a SigmaJS-friendly format.
     """
-    queryset = Instance.objects.filter(status='success')
+    queryset = Instance.objects.filter(status='success')\
+        .filter(x_coord__isnull=False)\
+        .filter(y_coord__isnull=False)\
+        .filter(user_count__isnull=False)
     serializer_class = NodeSerializer
diff --git a/backend/scraper/management/commands/scrape.py b/backend/scraper/management/commands/scrape.py
index 8f30fa5..ddd0fea 100644
--- a/backend/scraper/management/commands/scrape.py
+++ b/backend/scraper/management/commands/scrape.py
@@ -24,13 +24,30 @@ from scraper.management.commands._util import require_lock, InvalidResponseExcep
 SEED = 'mastodon.social'
 TIMEOUT = 20 # seconds
 NUM_THREADS = 16 # roughly 40MB each
-PERSONAL_INSTANCE_THRESHOLD = 5 # instances with < this many users won't be scraped
+PERSONAL_INSTANCE_THRESHOLD = 10 # instances with < this many users won't be crawled
+MAX_STATUSES_PER_PAGE = 100
 STATUS_SCRAPE_LIMIT = 5000
+INSTANCE_SCRAPE_LIMIT = 50 # note: this does not include newly discovered instances! they will always be crawled.
 
 
 class Command(BaseCommand):
     help = "Scrapes the entire fediverse"
 
+    def add_arguments(self, parser):
+        # Named (optional) arguments
+        parser.add_argument(
+            '--unlimited',
+            action='store_true',
+            dest='unlimited',
+            help="Crawl all stale instances rather than limiting to {}".format(INSTANCE_SCRAPE_LIMIT),
+        )
+        parser.add_argument(
+            '--all',
+            action='store_true',
+            dest='all',
+            help="Crawl all instances rather than limiting to stale ones"
+        )
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.scraped_count = 0
@@ -59,7 +76,8 @@ class Command(BaseCommand):
         if response.status_code != 200 or not isinstance(peers, list) or get_key(peers, ['error']):
             raise InvalidResponseException("Could not get peers for {}".format(instance_name))
         # Get rid of peers that just say "null" and the instance itself
-        return [peer for peer in peers if peer and peer != instance_name]
+        # Also make sure to lowercase all instance names; otherwise there'll be some duplicates
+        return [peer.lower() for peer in peers if peer and peer != instance_name]
 
     @staticmethod
     def get_statuses(instance_name: str):
@@ -67,9 +85,9 @@ class Command(BaseCommand):
         mentions = []
         datetime_threshold = datetime.now(timezone.utc) - timedelta(days=31)
         statuses_seen = 0
-        # We'll ask for 1000 statuses, but Mastodon never returns more than 40. Some Pleroma instances will ignore
+        # We'll ask for lots of statuses, but Mastodon never returns more than 40. Some Pleroma instances will ignore
         # the limit and return 20.
-        url = 'https://' + instance_name + '/api/v1/timelines/public?local=true&limit=1000'
+        url = 'https://{}/api/v1/timelines/public?local=true&limit={}'.format(instance_name, MAX_STATUSES_PER_PAGE)
         while True:
             response = requests.get(url, timeout=TIMEOUT)
             statuses = response.json()
@@ -91,7 +109,7 @@ class Command(BaseCommand):
                 break
             # Continuing, so get url for next page
             min_id = earliest_status['id']
-            url = 'https://' + instance_name + '/api/v1/timelines/public?local=true&limit=1000&max_id=' + min_id
+            url = 'https://{}/api/v1/timelines/public?local=true&limit={}&max_id={}'.format(instance_name, MAX_STATUSES_PER_PAGE, min_id)
             time.sleep(2) # Sleep to avoid overloading the instance
 
         mentions_seq = (seq(mentions)
@@ -201,7 +219,14 @@ class Command(BaseCommand):
 
     def handle(self, *args, **options):
         start_time = time.time()
-        stale_instances = Instance.objects.filter(last_updated__lte=datetime.now()-timedelta(days=1))
+        if options['all']:
+            stale_instances = Instance.objects.all()
+        else:
+            stale_instances = Instance.objects.filter(last_updated__lte=datetime.now()-timedelta(days=1))
+
+        if not options['unlimited']:
+            stale_instances = stale_instances[:INSTANCE_SCRAPE_LIMIT]
+
         with mp.Manager() as manager:
             # Share the list of existing instances amongst all threads (to avoid each thread having to query
             # for it on every instance it scrapes)
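For reference, a minimal sketch of how the two new flags could be driven once this patch is applied. It is an assumption: it presumes an already-configured Django environment (e.g. a 'python manage.py shell' session in backend/) and uses the command name 'scrape' implied by the file path above; nothing below is part of the patch itself.

    # Hedged usage sketch; assumes Django settings are already configured.
    from django.core.management import call_command

    # Default run: stale instances only, capped at INSTANCE_SCRAPE_LIMIT (50).
    call_command('scrape')

    # Crawl every stale instance with no cap (same as --unlimited).
    call_command('scrape', unlimited=True)

    # Consider all instances, not just stale ones (same as --all);
    # still capped at 50 unless combined with unlimited=True.
    call_command('scrape', all=True)

The equivalent shell invocations would be 'python manage.py scrape', 'python manage.py scrape --unlimited', and 'python manage.py scrape --all'.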