fix scraper

This commit is contained in:
Tao Bojlen 2018-08-26 20:15:50 +02:00
parent 97247fe130
commit 074e2e1b88
5 changed files with 112 additions and 186 deletions

View file

@ -1,11 +1,12 @@
""" """
This script starts at a seed instance and loads the list of connected This script starts at a seed instance and loads the list of connected
peers. From there, it slowly scrapes the peers of all instances it finds, peers. From there, it scrapes the peers of all instances it finds,
gradually mapping the fediverse. gradually mapping the fediverse.
""" """
import json import json
import multiprocessing import multiprocessing
import requests import requests
import time
from django.core.management.base import BaseCommand from django.core.management.base import BaseCommand
from scraper.models import Instance, InstanceStats from scraper.models import Instance, InstanceStats
@ -20,9 +21,10 @@ from scraper.models import Instance, InstanceStats
# Change SEED to start from a different instance. # # Change SEED to start from a different instance. #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# TODO: use the /api/v1/server/followers and /api/v1/server/following endpoints in peertube instances
SEED = 'mastodon.social' SEED = 'mastodon.social'
THREADS = 100 TIMEOUT = 20
TIMEOUT = 10
class InvalidResponseError(Exception): class InvalidResponseError(Exception):
@ -40,7 +42,16 @@ def get_key(data, keys: list):
return '' return ''
def get_instance_info(instance_name: str): class Command(BaseCommand):
help = "Scrapes the entire fediverse"
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.done_bag = set()
self.existing_instance_ids = []
@staticmethod
def get_instance_info(instance_name: str):
"""Collect info about instance""" """Collect info about instance"""
url = 'https://' + instance_name + '/api/v1/instance' url = 'https://' + instance_name + '/api/v1/instance'
response = requests.get(url, timeout=TIMEOUT) response = requests.get(url, timeout=TIMEOUT)
@ -48,8 +59,8 @@ def get_instance_info(instance_name: str):
raise InvalidResponseError("Could not get info for {}".format(instance_name)) raise InvalidResponseError("Could not get info for {}".format(instance_name))
return response.json() return response.json()
@staticmethod
def get_instance_peers(instance_name: str): def get_instance_peers(instance_name: str):
"""Collect connected instances""" """Collect connected instances"""
url = 'https://' + instance_name + '/api/v1/instance/peers' url = 'https://' + instance_name + '/api/v1/instance/peers'
response = requests.get(url, timeout=TIMEOUT) response = requests.get(url, timeout=TIMEOUT)
@ -57,15 +68,14 @@ def get_instance_peers(instance_name: str):
raise InvalidResponseError("Could not get peers for {}".format(instance_name)) raise InvalidResponseError("Could not get peers for {}".format(instance_name))
return response.json() return response.json()
def process_instance(self, instance_name: str):
def process_instance(instance_name: str):
"""Given an instance, get all the data we're interested in""" """Given an instance, get all the data we're interested in"""
print("Processing {}".format(instance_name)) print("Processing {}".format(instance_name))
data = dict() data = dict()
try: try:
data['instance'] = instance_name data['instance'] = instance_name
data['info'] = get_instance_info(instance_name) data['info'] = self.get_instance_info(instance_name)
data['peers'] = get_instance_peers(instance_name) data['peers'] = self.get_instance_peers(instance_name)
data['status'] = 'success' data['status'] = 'success'
print("Processed: {}".format(instance_name)) print("Processed: {}".format(instance_name))
return data return data
@ -77,8 +87,7 @@ def process_instance(instance_name: str):
print("Failed: {}".format(instance_name)) print("Failed: {}".format(instance_name))
return data return data
def save_data(self, data):
def save_data(data):
"""Save data""" """Save data"""
instance, _ = Instance.objects.get_or_create(name=get_key(data, ['instance'])) instance, _ = Instance.objects.get_or_create(name=get_key(data, ['instance']))
if data['status'] == 'success': if data['status'] == 'success':
@ -93,11 +102,19 @@ def save_data(data):
) )
stats.save() stats.save()
# Save peers # Save peers
# TODO: optimization opportunity here if we do this in bulk # Save the list of instances we already have in the database
# Make sure to consider race conditions existing_peers = Instance.objects.filter(name__in=self.existing_instance_ids)
# https://stackoverflow.com/q/24502658/3697202 print("setting new_peer_ids")
peers = [Instance.objects.get_or_create(name=n) for n in data['peers']] new_peer_ids = [peer for peer in data['peers'] if peer not in self.existing_instance_ids]
instance.peers.add(*[peers]) if new_peer_ids:
print("setting new_peers (ids: {})".format(new_peer_ids))
new_peers = Instance.objects.bulk_create([Instance(name=peer) for peer in new_peer_ids])
print("adding to existing_instance_ids")
self.existing_instance_ids.extend(new_peer_ids)
print("adding new peers")
instance.peers.set(new_peers)
print("adding existing peers")
instance.peers.set(existing_peers)
else: else:
stats = InstanceStats( stats = InstanceStats(
instance=instance, instance=instance,
@ -105,33 +122,31 @@ def save_data(data):
) )
stats.save() stats.save()
def worker(self, queue: multiprocessing.JoinableQueue):
def worker(queue: multiprocessing.JoinableQueue, done_bag: set):
"""The main worker that processes URLs""" """The main worker that processes URLs"""
while True: while True:
# Get an item from the queue. Block if the queue is empty. # Get an item from the queue. Block if the queue is empty.
instance = queue.get() instance = queue.get()
if instance in done_bag: if instance in self.done_bag:
print("Skipping {}, already done".format(instance)) print("Skipping {}, already done".format(instance))
queue.task_done() queue.task_done()
else: else:
data = process_instance(instance) data = self.process_instance(instance)
if 'peers' in data: if 'peers' in data:
for peer in [p for p in data['peers'] if p not in done_bag]: for peer in [p for p in data['peers'] if p not in self.done_bag]:
queue.put(peer) queue.put(peer)
save_data(data) self.save_data(data)
done_bag.add(instance) self.done_bag.add(instance)
queue.task_done() queue.task_done()
class Command(BaseCommand):
help = "Scrapes the entire fediverse"
def handle(self, *args, **options): def handle(self, *args, **options):
done_bag = set() start_time = time.time()
self.existing_instance_ids = Instance.objects.all().values_list('name', flat=True)
print("Existing instances: {}".format(self.existing_instance_ids))
queue = multiprocessing.JoinableQueue() queue = multiprocessing.JoinableQueue()
queue.put(SEED) queue.put(SEED)
pool = multiprocessing.Pool(THREADS, initializer=worker, initargs=(queue, done_bag)) # pool = multiprocessing.Pool(1, initializer=self.worker, initargs=(queue, )) # Disable concurrency (debug)
pool = multiprocessing.Pool(initializer=self.worker, initargs=(queue, ))
queue.join() queue.join()
self.stdout.write(self.style.SUCCESS("Successfully scraped the fediverse")) end_time = time.time()
self.stdout.write(self.style.SUCCESS("Successfully scraped the fediverse in {}s".format(end_time-start_time)))

View file

@ -1,4 +1,4 @@
# Generated by Django 2.1 on 2018-08-26 00:29 # Generated by Django 2.1 on 2018-08-26 17:26
from django.db import migrations, models from django.db import migrations, models
import django.db.models.deletion import django.db.models.deletion
@ -15,9 +15,8 @@ class Migration(migrations.Migration):
migrations.CreateModel( migrations.CreateModel(
name='Instance', name='Instance',
fields=[ fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), ('name', models.CharField(max_length=200, primary_key=True, serialize=False)),
('name', models.CharField(max_length=200)), ('peers', models.ManyToManyField(to='scraper.Instance')),
('peers', models.ManyToManyField(related_name='followers', to='scraper.Instance')),
], ],
), ),
migrations.CreateModel( migrations.CreateModel(
@ -25,10 +24,10 @@ class Migration(migrations.Migration):
fields=[ fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('timestamp', models.DateTimeField(auto_now_add=True)), ('timestamp', models.DateTimeField(auto_now_add=True)),
('num_peers', models.IntegerField()), ('num_peers', models.IntegerField(blank=True, null=True)),
('num_statuses', models.IntegerField()), ('num_statuses', models.IntegerField(blank=True, null=True)),
('num_users', models.IntegerField()), ('num_users', models.IntegerField(blank=True, null=True)),
('version', models.CharField(max_length=1000)), ('version', models.CharField(blank=True, max_length=1000)),
('status', models.CharField(max_length=100)), ('status', models.CharField(max_length=100)),
('instance', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='scraper.Instance')), ('instance', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='scraper.Instance')),
], ],

View file

@ -1,42 +0,0 @@
# Generated by Django 2.1 on 2018-08-26 00:53
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('scraper', '0001_initial'),
]
operations = [
migrations.RemoveField(
model_name='instance',
name='id',
),
migrations.AlterField(
model_name='instance',
name='name',
field=models.CharField(max_length=200, primary_key=True, serialize=False),
),
migrations.AlterField(
model_name='instancestats',
name='num_peers',
field=models.IntegerField(blank=True),
),
migrations.AlterField(
model_name='instancestats',
name='num_statuses',
field=models.IntegerField(blank=True),
),
migrations.AlterField(
model_name='instancestats',
name='num_users',
field=models.IntegerField(blank=True),
),
migrations.AlterField(
model_name='instancestats',
name='version',
field=models.CharField(blank=True, max_length=1000),
),
]

View file

@ -1,28 +0,0 @@
# Generated by Django 2.1 on 2018-08-26 00:57
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('scraper', '0002_auto_20180826_0053'),
]
operations = [
migrations.AlterField(
model_name='instancestats',
name='num_peers',
field=models.IntegerField(blank=True, null=True),
),
migrations.AlterField(
model_name='instancestats',
name='num_statuses',
field=models.IntegerField(blank=True, null=True),
),
migrations.AlterField(
model_name='instancestats',
name='num_users',
field=models.IntegerField(blank=True, null=True),
),
]

View file

@ -1,18 +0,0 @@
# Generated by Django 2.1 on 2018-08-26 01:00
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('scraper', '0003_auto_20180826_0057'),
]
operations = [
migrations.AlterField(
model_name='instance',
name='peers',
field=models.ManyToManyField(to='scraper.Instance'),
),
]