add scraper
This commit is contained in:
parent
2447e8c222
commit
97247fe130
137
scraper/management/commands/scrape.py
Normal file
137
scraper/management/commands/scrape.py
Normal file
|
@ -0,0 +1,137 @@
|
||||||
|
"""
|
||||||
|
This script starts at a seed instance and loads the list of connected
|
||||||
|
peers. From there, it slowly scrapes the peers of all instances it finds,
|
||||||
|
gradually mapping the fediverse.
|
||||||
|
"""
|
||||||
|
import json
|
||||||
|
import multiprocessing
|
||||||
|
import requests
|
||||||
|
from django.core.management.base import BaseCommand
|
||||||
|
from scraper.models import Instance, InstanceStats
|
||||||
|
|
||||||
|
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
|
||||||
|
# Because the script uses the Mastodon API other platforms like #
|
||||||
|
# Pleroma, Peertube, Pixelfed, Funkwhale won't have outgoing peers. #
|
||||||
|
# #
|
||||||
|
# The script generates two files: #
|
||||||
|
# - nodes.csv #
|
||||||
|
# - edges.csv #
|
||||||
|
# #
|
||||||
|
# Change SEED to start from a different instance. #
|
||||||
|
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
|
||||||
|
|
||||||
|
# Instance the crawl starts from; change to map from a different vantage point.
SEED = 'mastodon.social'
# Size of the multiprocessing.Pool — these are worker *processes*,
# not OS threads, despite the name.
THREADS = 100
# Per-request timeout in seconds for HTTP calls to instances.
TIMEOUT = 10
|
||||||
|
|
||||||
|
|
||||||
|
class InvalidResponseError(Exception):
    """Raised when an instance answers with any HTTP status other than 200."""
|
||||||
|
|
||||||
|
|
||||||
|
def get_key(data, keys: list):
    """Walk nested mappings in *data* following *keys* in order.

    Returns the value found at the end of the key path, or '' if any
    step along the way is missing or the current value cannot be
    indexed (e.g. it is already the '' fallback from a previous call).

    Fixes over the original: the caller's *keys* list is no longer
    consumed (the old version emptied it via pop()), an empty key list
    no longer raises IndexError, and a non-indexable intermediate value
    no longer raises TypeError.
    """
    val = data
    try:
        for key in keys:
            val = val[key]
        return val
    except (KeyError, IndexError, TypeError):
        return ''
|
||||||
|
|
||||||
|
|
||||||
|
def get_instance_info(instance_name: str):
    """Collect info about instance.

    Fetches the /api/v1/instance document and returns its parsed JSON.
    Raises InvalidResponseError on any non-200 response.
    """
    endpoint = 'https://' + instance_name + '/api/v1/instance'
    response = requests.get(endpoint, timeout=TIMEOUT)
    if response.status_code == 200:
        return response.json()
    raise InvalidResponseError("Could not get info for {}".format(instance_name))
|
||||||
|
|
||||||
|
|
||||||
|
def get_instance_peers(instance_name: str):
    """Collect connected instances.

    Fetches the /api/v1/instance/peers list and returns its parsed JSON.
    Raises InvalidResponseError on any non-200 response.
    """
    endpoint = 'https://' + instance_name + '/api/v1/instance/peers'
    response = requests.get(endpoint, timeout=TIMEOUT)
    if response.status_code == 200:
        return response.json()
    raise InvalidResponseError("Could not get peers for {}".format(instance_name))
|
||||||
|
|
||||||
|
|
||||||
|
def process_instance(instance_name: str):
    """Given an instance, get all the data we're interested in.

    Returns a dict that always carries 'instance' and 'status'; on
    success it additionally carries 'info' and 'peers'. Network and
    parsing failures are reported via status (the exception class name)
    rather than raised.
    """
    print("Processing {}".format(instance_name))
    data = {'instance': instance_name}
    try:
        data['info'] = get_instance_info(instance_name)
        data['peers'] = get_instance_peers(instance_name)
    except (InvalidResponseError,
            requests.exceptions.RequestException,
            json.decoder.JSONDecodeError) as e:
        # Record the failure kind so save_data can persist a status-only row.
        data['status'] = type(e).__name__
        print("Failed: {}".format(instance_name))
        return data
    data['status'] = 'success'
    print("Processed: {}".format(instance_name))
    return data
|
||||||
|
|
||||||
|
|
||||||
|
def save_data(data):
    """Persist one crawl result.

    Always records an InstanceStats row for the instance; when the
    scrape succeeded, also links the instance to every peer it reported
    (creating peer Instance rows as needed).
    """
    instance, _ = Instance.objects.get_or_create(name=get_key(data, ['instance']))
    if data['status'] == 'success':
        # Save stats
        stats = InstanceStats(
            instance=instance,
            num_peers=get_key(data, ['info', 'stats', 'domain_count']),
            num_statuses=get_key(data, ['info', 'stats', 'status_count']),
            num_users=get_key(data, ['info', 'stats', 'user_count']),
            version=get_key(data, ['info', 'version']),
            status=get_key(data, ['status']),
        )
        stats.save()
        # Save peers
        # TODO: optimization opportunity here if we do this in bulk
        # Make sure to consider race conditions
        # https://stackoverflow.com/q/24502658/3697202
        #
        # Bug fix: get_or_create returns an (object, created) tuple, so we
        # must keep only the object; and the original's add(*[peers])
        # unpacked a one-element wrapper list, passing the whole list as a
        # single argument — add() needs the model instances spread out.
        peers = [
            Instance.objects.get_or_create(name=name)[0]
            for name in data['peers']
        ]
        instance.peers.add(*peers)
    else:
        # Failed scrape: persist a status-only row so the failure is visible.
        stats = InstanceStats(
            instance=instance,
            status=get_key(data, ['status'])
        )
        stats.save()
|
||||||
|
|
||||||
|
|
||||||
|
def worker(queue: multiprocessing.JoinableQueue, done_bag: set):
    """The main worker that processes URLs"""
    # Infinite loop: intended to be run as a Pool initializer so that each
    # pool process spends its whole life pulling instance names off the
    # shared queue.
    while True:
        # Get an item from the queue. Block if the queue is empty.
        instance = queue.get()
        if instance in done_bag:
            print("Skipping {}, already done".format(instance))
            queue.task_done()
        else:
            data = process_instance(instance)
            # 'peers' is only present when process_instance succeeded.
            if 'peers' in data:
                # Peers are enqueued before this instance lands in done_bag,
                # so duplicates can enter the queue; the check above
                # re-filters them on dequeue.
                for peer in [p for p in data['peers'] if p not in done_bag]:
                    queue.put(peer)
            save_data(data)
            # NOTE(review): done_bag is a plain set; under multiprocessing
            # each worker process gets its own copy after fork, so additions
            # here are NOT visible to other workers and instances may be
            # processed more than once — confirm whether a Manager-backed
            # shared structure is needed.
            done_bag.add(instance)
            queue.task_done()
|
||||||
|
|
||||||
|
|
||||||
|
class Command(BaseCommand):
    # Django management command entry point: `manage.py scrape`.
    help = "Scrapes the entire fediverse"

    def handle(self, *args, **options):
        # Set of instances already processed. NOTE(review): a plain set is
        # not shared between worker processes (see worker()); each process
        # will only see its own additions.
        done_bag = set()
        queue = multiprocessing.JoinableQueue()
        # Seed the crawl with a single well-connected instance.
        queue.put(SEED)
        # Each pool process runs worker() forever via the initializer hook;
        # no tasks are ever submitted to the pool itself.
        # NOTE(review): passing a JoinableQueue through initargs relies on
        # fork-based inheritance; verify this works on the target platform
        # (it is not supported under the 'spawn' start method).
        pool = multiprocessing.Pool(THREADS, initializer=worker, initargs=(queue, done_bag))
        # Block until every queued instance has been marked task_done().
        queue.join()
        self.stdout.write(self.style.SUCCESS("Successfully scraped the fediverse"))
|
42
scraper/migrations/0002_auto_20180826_0053.py
Normal file
42
scraper/migrations/0002_auto_20180826_0053.py
Normal file
|
@ -0,0 +1,42 @@
|
||||||
|
# Generated by Django 2.1 on 2018-08-26 00:53
|
||||||
|
|
||||||
|
from django.db import migrations, models
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
    # Auto-generated by Django; do not hand-edit operations.
    # Makes Instance.name the primary key (dropping the surrogate id)
    # and allows blank values on the InstanceStats fields.

    dependencies = [
        ('scraper', '0001_initial'),
    ]

    operations = [
        migrations.RemoveField(
            model_name='instance',
            name='id',
        ),
        migrations.AlterField(
            model_name='instance',
            name='name',
            field=models.CharField(max_length=200, primary_key=True, serialize=False),
        ),
        migrations.AlterField(
            model_name='instancestats',
            name='num_peers',
            field=models.IntegerField(blank=True),
        ),
        migrations.AlterField(
            model_name='instancestats',
            name='num_statuses',
            field=models.IntegerField(blank=True),
        ),
        migrations.AlterField(
            model_name='instancestats',
            name='num_users',
            field=models.IntegerField(blank=True),
        ),
        migrations.AlterField(
            model_name='instancestats',
            name='version',
            field=models.CharField(blank=True, max_length=1000),
        ),
    ]
|
28
scraper/migrations/0003_auto_20180826_0057.py
Normal file
28
scraper/migrations/0003_auto_20180826_0057.py
Normal file
|
@ -0,0 +1,28 @@
|
||||||
|
# Generated by Django 2.1 on 2018-08-26 00:57
|
||||||
|
|
||||||
|
from django.db import migrations, models
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
    # Auto-generated by Django; do not hand-edit operations.
    # Adds null=True to the numeric stats fields so failed scrapes can
    # be recorded without stats (blank=True alone is form-level only).

    dependencies = [
        ('scraper', '0002_auto_20180826_0053'),
    ]

    operations = [
        migrations.AlterField(
            model_name='instancestats',
            name='num_peers',
            field=models.IntegerField(blank=True, null=True),
        ),
        migrations.AlterField(
            model_name='instancestats',
            name='num_statuses',
            field=models.IntegerField(blank=True, null=True),
        ),
        migrations.AlterField(
            model_name='instancestats',
            name='num_users',
            field=models.IntegerField(blank=True, null=True),
        ),
    ]
|
18
scraper/migrations/0004_auto_20180826_0100.py
Normal file
18
scraper/migrations/0004_auto_20180826_0100.py
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
# Generated by Django 2.1 on 2018-08-26 01:00
|
||||||
|
|
||||||
|
from django.db import migrations, models
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
    # Auto-generated by Django; do not hand-edit operations.
    # Alters Instance.peers (the self-referential many-to-many).

    dependencies = [
        ('scraper', '0003_auto_20180826_0057'),
    ]

    operations = [
        migrations.AlterField(
            model_name='instance',
            name='peers',
            field=models.ManyToManyField(to='scraper.Instance'),
        ),
    ]
|
|
@ -2,18 +2,18 @@ from django.db import models
|
||||||
|
|
||||||
|
|
||||||
class Instance(models.Model):
    # The instance's domain name doubles as its identity (primary key).
    name = models.CharField(max_length=200, primary_key=True)
    # Outgoing peer relationships. Not symmetrical: peering is reported
    # one-way by the Mastodon API, so A peering with B does not imply
    # the reverse edge.
    peers = models.ManyToManyField('self', symmetrical=False)
|
||||||
|
|
||||||
|
|
||||||
class InstanceStats(models.Model):
    # One snapshot of an instance's statistics per scrape attempt.
    timestamp = models.DateTimeField(auto_now_add=True)
    instance = models.ForeignKey(
        Instance,
        on_delete=models.CASCADE,
    )
    # Numeric stats are nullable/blank because failed scrapes record a
    # status-only row (see scraper save_data).
    num_peers = models.IntegerField(blank=True, null=True)
    num_statuses = models.IntegerField(blank=True, null=True)
    num_users = models.IntegerField(blank=True, null=True)
    version = models.CharField(max_length=1000, blank=True)
    # Either 'success' or the name of the exception class that aborted
    # the scrape.
    status = models.CharField(max_length=100)
|
||||||
|
|
Loading…
Reference in a new issue