From 97247fe1304f818f8e1d2ce09959dc2e89a7fc8f Mon Sep 17 00:00:00 2001 From: Tao Bojlen Date: Sun, 26 Aug 2018 03:17:10 +0200 Subject: [PATCH] add scraper --- scraper/management/commands/scrape.py | 137 ++++++++++++++++++ scraper/migrations/0002_auto_20180826_0053.py | 42 ++++++ scraper/migrations/0003_auto_20180826_0057.py | 28 ++++ scraper/migrations/0004_auto_20180826_0100.py | 18 +++ scraper/models.py | 14 +- 5 files changed, 232 insertions(+), 7 deletions(-) create mode 100644 scraper/management/commands/scrape.py create mode 100644 scraper/migrations/0002_auto_20180826_0053.py create mode 100644 scraper/migrations/0003_auto_20180826_0057.py create mode 100644 scraper/migrations/0004_auto_20180826_0100.py diff --git a/scraper/management/commands/scrape.py b/scraper/management/commands/scrape.py new file mode 100644 index 0000000..8d87b1b --- /dev/null +++ b/scraper/management/commands/scrape.py @@ -0,0 +1,137 @@ +""" +This script starts at a seed instance and loads the list of connected +peers. From there, it slowly scrapes the peers of all instances it finds, +gradually mapping the fediverse. +""" +import json +import multiprocessing +import requests +from django.core.management.base import BaseCommand +from scraper.models import Instance, InstanceStats + +# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # +# Because the script uses the Mastodon API other platforms like # +# Pleroma, Peertube, Pixelfed, Funkwhale won't have outgoing peers. # +# # +# The script generates two files: # +# - nodes.csv # +# - edges.csv # +# # +# Change SEED to start from a different instance. # +# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # + +SEED = 'mastodon.social' +THREADS = 100 +TIMEOUT = 10 + + +class InvalidResponseError(Exception): + """Used for all responses other than HTTP 200""" + pass + + +def get_key(data, keys: list): + try: + val = data[keys.pop(0)] + while keys: + val = val[keys.pop(0)] + return val + except KeyError: + return '' + + +def get_instance_info(instance_name: str): + """Collect info about instance""" + url = 'https://' + instance_name + '/api/v1/instance' + response = requests.get(url, timeout=TIMEOUT) + if response.status_code != 200: + raise InvalidResponseError("Could not get info for {}".format(instance_name)) + return response.json() + + +def get_instance_peers(instance_name: str): + """Collect connected instances""" + url = 'https://' + instance_name + '/api/v1/instance/peers' + response = requests.get(url, timeout=TIMEOUT) + if response.status_code != 200: + raise InvalidResponseError("Could not get peers for {}".format(instance_name)) + return response.json() + + +def process_instance(instance_name: str): + """Given an instance, get all the data we're interested in""" + print("Processing {}".format(instance_name)) + data = dict() + try: + data['instance'] = instance_name + data['info'] = get_instance_info(instance_name) + data['peers'] = get_instance_peers(instance_name) + data['status'] = 'success' + print("Processed: {}".format(instance_name)) + return data + except (InvalidResponseError, + requests.exceptions.RequestException, + json.decoder.JSONDecodeError) as e: + data['instance'] = instance_name + data['status'] = type(e).__name__ + print("Failed: {}".format(instance_name)) + return data + + +def save_data(data): + """Save data""" + instance, _ = Instance.objects.get_or_create(name=get_key(data, ['instance'])) + if data['status'] == 'success': + # Save stats + stats = InstanceStats( + instance=instance, + num_peers=get_key(data, ['info', 'stats', 'domain_count']), + num_statuses=get_key(data, ['info', 'stats', 'status_count']), + num_users=get_key(data, ['info', 'stats', 'user_count']), + version=get_key(data, ['info', 'version']), + status=get_key(data, ['status']), + ) + stats.save() + # Save peers + # TODO: optimization opportunity here if we do this in bulk + # Make sure to consider race conditions + # https://stackoverflow.com/q/24502658/3697202 + peers = [Instance.objects.get_or_create(name=n) for n in data['peers']] + instance.peers.add(*[peers]) + else: + stats = InstanceStats( + instance=instance, + status=get_key(data, ['status']) + ) + stats.save() + + +def worker(queue: multiprocessing.JoinableQueue, done_bag: set): + """The main worker that processes URLs""" + while True: + # Get an item from the queue. Block if the queue is empty. + instance = queue.get() + if instance in done_bag: + print("Skipping {}, already done".format(instance)) + queue.task_done() + else: + data = process_instance(instance) + if 'peers' in data: + for peer in [p for p in data['peers'] if p not in done_bag]: + queue.put(peer) + save_data(data) + done_bag.add(instance) + queue.task_done() + + +class Command(BaseCommand): + help = "Scrapes the entire fediverse" + + + def handle(self, *args, **options): + done_bag = set() + queue = multiprocessing.JoinableQueue() + queue.put(SEED) + pool = multiprocessing.Pool(THREADS, initializer=worker, initargs=(queue, done_bag)) + queue.join() + self.stdout.write(self.style.SUCCESS("Successfully scraped the fediverse")) diff --git a/scraper/migrations/0002_auto_20180826_0053.py b/scraper/migrations/0002_auto_20180826_0053.py new file mode 100644 index 0000000..7fe5f9b --- /dev/null +++ b/scraper/migrations/0002_auto_20180826_0053.py @@ -0,0 +1,42 @@ +# Generated by Django 2.1 on 2018-08-26 00:53 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('scraper', '0001_initial'), + ] + + operations = [ + migrations.RemoveField( + model_name='instance', + name='id', + ), + migrations.AlterField( + model_name='instance', + name='name', + field=models.CharField(max_length=200, primary_key=True, serialize=False), + ), + migrations.AlterField( + model_name='instancestats', + name='num_peers', + field=models.IntegerField(blank=True), + ), + migrations.AlterField( + model_name='instancestats', + name='num_statuses', + field=models.IntegerField(blank=True), + ), + migrations.AlterField( + model_name='instancestats', + name='num_users', + field=models.IntegerField(blank=True), + ), + migrations.AlterField( + model_name='instancestats', + name='version', + field=models.CharField(blank=True, max_length=1000), + ), + ] diff --git a/scraper/migrations/0003_auto_20180826_0057.py b/scraper/migrations/0003_auto_20180826_0057.py new file mode 100644 index 0000000..0794ad7 --- /dev/null +++ b/scraper/migrations/0003_auto_20180826_0057.py @@ -0,0 +1,28 @@ +# Generated by Django 2.1 on 2018-08-26 00:57 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('scraper', '0002_auto_20180826_0053'), + ] + + operations = [ + migrations.AlterField( + model_name='instancestats', + name='num_peers', + field=models.IntegerField(blank=True, null=True), + ), + migrations.AlterField( + model_name='instancestats', + name='num_statuses', + field=models.IntegerField(blank=True, null=True), + ), + migrations.AlterField( + model_name='instancestats', + name='num_users', + field=models.IntegerField(blank=True, null=True), + ), + ] diff --git a/scraper/migrations/0004_auto_20180826_0100.py b/scraper/migrations/0004_auto_20180826_0100.py new file mode 100644 index 0000000..f3bbf41 --- /dev/null +++ b/scraper/migrations/0004_auto_20180826_0100.py @@ -0,0 +1,18 @@ +# Generated by Django 2.1 on 2018-08-26 01:00 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('scraper', '0003_auto_20180826_0057'), + ] + + operations = [ + migrations.AlterField( + model_name='instance', + name='peers', + field=models.ManyToManyField(to='scraper.Instance'), + ), + ] diff --git a/scraper/models.py b/scraper/models.py index 21fcc1a..9dfadde 100644 --- a/scraper/models.py +++ b/scraper/models.py @@ -2,18 +2,18 @@ from django.db import models class Instance(models.Model): - name = models.CharField(max_length=200) - peers = models.ManyToManyField('self', related_name='followers', symmetrical=False) + name = models.CharField(max_length=200, primary_key=True) + peers = models.ManyToManyField('self', symmetrical=False) class InstanceStats(models.Model): timestamp = models.DateTimeField(auto_now_add=True) instance = models.ForeignKey( - 'Instance', + Instance, on_delete=models.CASCADE, ) - num_peers = models.IntegerField() - num_statuses = models.IntegerField() - num_users = models.IntegerField() - version = models.CharField(max_length=1000) + num_peers = models.IntegerField(blank=True, null=True) + num_statuses = models.IntegerField(blank=True, null=True) + num_users = models.IntegerField(blank=True, null=True) + version = models.CharField(max_length=1000, blank=True) status = models.CharField(max_length=100)