fix scraper
This commit is contained in:
parent
97247fe130
commit
074e2e1b88
|
@ -1,11 +1,12 @@
|
||||||
"""
|
"""
|
||||||
This script starts at a seed instance and loads the list of connected
|
This script starts at a seed instance and loads the list of connected
|
||||||
peers. From there, it slowly scrapes the peers of all instances it finds,
|
peers. From there, it scrapes the peers of all instances it finds,
|
||||||
gradually mapping the fediverse.
|
gradually mapping the fediverse.
|
||||||
"""
|
"""
|
||||||
import json
|
import json
|
||||||
import multiprocessing
|
import multiprocessing
|
||||||
import requests
|
import requests
|
||||||
|
import time
|
||||||
from django.core.management.base import BaseCommand
|
from django.core.management.base import BaseCommand
|
||||||
from scraper.models import Instance, InstanceStats
|
from scraper.models import Instance, InstanceStats
|
||||||
|
|
||||||
|
@ -20,9 +21,10 @@ from scraper.models import Instance, InstanceStats
|
||||||
# Change SEED to start from a different instance. #
|
# Change SEED to start from a different instance. #
|
||||||
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
|
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
|
||||||
|
|
||||||
|
# TODO: use the /api/v1/server/followers and /api/v1/server/following endpoints in peertube instances
|
||||||
|
|
||||||
SEED = 'mastodon.social'
|
SEED = 'mastodon.social'
|
||||||
THREADS = 100
|
TIMEOUT = 20
|
||||||
TIMEOUT = 10
|
|
||||||
|
|
||||||
|
|
||||||
class InvalidResponseError(Exception):
|
class InvalidResponseError(Exception):
|
||||||
|
@ -40,7 +42,16 @@ def get_key(data, keys: list):
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
|
|
||||||
def get_instance_info(instance_name: str):
|
class Command(BaseCommand):
|
||||||
|
help = "Scrapes the entire fediverse"
|
||||||
|
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
super().__init__(*args, **kwargs)
|
||||||
|
self.done_bag = set()
|
||||||
|
self.existing_instance_ids = []
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_instance_info(instance_name: str):
|
||||||
"""Collect info about instance"""
|
"""Collect info about instance"""
|
||||||
url = 'https://' + instance_name + '/api/v1/instance'
|
url = 'https://' + instance_name + '/api/v1/instance'
|
||||||
response = requests.get(url, timeout=TIMEOUT)
|
response = requests.get(url, timeout=TIMEOUT)
|
||||||
|
@ -48,8 +59,8 @@ def get_instance_info(instance_name: str):
|
||||||
raise InvalidResponseError("Could not get info for {}".format(instance_name))
|
raise InvalidResponseError("Could not get info for {}".format(instance_name))
|
||||||
return response.json()
|
return response.json()
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
def get_instance_peers(instance_name: str):
|
def get_instance_peers(instance_name: str):
|
||||||
"""Collect connected instances"""
|
"""Collect connected instances"""
|
||||||
url = 'https://' + instance_name + '/api/v1/instance/peers'
|
url = 'https://' + instance_name + '/api/v1/instance/peers'
|
||||||
response = requests.get(url, timeout=TIMEOUT)
|
response = requests.get(url, timeout=TIMEOUT)
|
||||||
|
@ -57,15 +68,14 @@ def get_instance_peers(instance_name: str):
|
||||||
raise InvalidResponseError("Could not get peers for {}".format(instance_name))
|
raise InvalidResponseError("Could not get peers for {}".format(instance_name))
|
||||||
return response.json()
|
return response.json()
|
||||||
|
|
||||||
|
def process_instance(self, instance_name: str):
|
||||||
def process_instance(instance_name: str):
|
|
||||||
"""Given an instance, get all the data we're interested in"""
|
"""Given an instance, get all the data we're interested in"""
|
||||||
print("Processing {}".format(instance_name))
|
print("Processing {}".format(instance_name))
|
||||||
data = dict()
|
data = dict()
|
||||||
try:
|
try:
|
||||||
data['instance'] = instance_name
|
data['instance'] = instance_name
|
||||||
data['info'] = get_instance_info(instance_name)
|
data['info'] = self.get_instance_info(instance_name)
|
||||||
data['peers'] = get_instance_peers(instance_name)
|
data['peers'] = self.get_instance_peers(instance_name)
|
||||||
data['status'] = 'success'
|
data['status'] = 'success'
|
||||||
print("Processed: {}".format(instance_name))
|
print("Processed: {}".format(instance_name))
|
||||||
return data
|
return data
|
||||||
|
@ -77,8 +87,7 @@ def process_instance(instance_name: str):
|
||||||
print("Failed: {}".format(instance_name))
|
print("Failed: {}".format(instance_name))
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
def save_data(self, data):
|
||||||
def save_data(data):
|
|
||||||
"""Save data"""
|
"""Save data"""
|
||||||
instance, _ = Instance.objects.get_or_create(name=get_key(data, ['instance']))
|
instance, _ = Instance.objects.get_or_create(name=get_key(data, ['instance']))
|
||||||
if data['status'] == 'success':
|
if data['status'] == 'success':
|
||||||
|
@ -93,11 +102,19 @@ def save_data(data):
|
||||||
)
|
)
|
||||||
stats.save()
|
stats.save()
|
||||||
# Save peers
|
# Save peers
|
||||||
# TODO: optimization opportunity here if we do this in bulk
|
# Save the list of instances we already have in the database
|
||||||
# Make sure to consider race conditions
|
existing_peers = Instance.objects.filter(name__in=self.existing_instance_ids)
|
||||||
# https://stackoverflow.com/q/24502658/3697202
|
print("setting new_peer_ids")
|
||||||
peers = [Instance.objects.get_or_create(name=n) for n in data['peers']]
|
new_peer_ids = [peer for peer in data['peers'] if peer not in self.existing_instance_ids]
|
||||||
instance.peers.add(*[peers])
|
if new_peer_ids:
|
||||||
|
print("setting new_peers (ids: {})".format(new_peer_ids))
|
||||||
|
new_peers = Instance.objects.bulk_create([Instance(name=peer) for peer in new_peer_ids])
|
||||||
|
print("adding to existing_instance_ids")
|
||||||
|
self.existing_instance_ids.extend(new_peer_ids)
|
||||||
|
print("adding new peers")
|
||||||
|
instance.peers.set(new_peers)
|
||||||
|
print("adding existing peers")
|
||||||
|
instance.peers.set(existing_peers)
|
||||||
else:
|
else:
|
||||||
stats = InstanceStats(
|
stats = InstanceStats(
|
||||||
instance=instance,
|
instance=instance,
|
||||||
|
@ -105,33 +122,31 @@ def save_data(data):
|
||||||
)
|
)
|
||||||
stats.save()
|
stats.save()
|
||||||
|
|
||||||
|
def worker(self, queue: multiprocessing.JoinableQueue):
|
||||||
def worker(queue: multiprocessing.JoinableQueue, done_bag: set):
|
|
||||||
"""The main worker that processes URLs"""
|
"""The main worker that processes URLs"""
|
||||||
while True:
|
while True:
|
||||||
# Get an item from the queue. Block if the queue is empty.
|
# Get an item from the queue. Block if the queue is empty.
|
||||||
instance = queue.get()
|
instance = queue.get()
|
||||||
if instance in done_bag:
|
if instance in self.done_bag:
|
||||||
print("Skipping {}, already done".format(instance))
|
print("Skipping {}, already done".format(instance))
|
||||||
queue.task_done()
|
queue.task_done()
|
||||||
else:
|
else:
|
||||||
data = process_instance(instance)
|
data = self.process_instance(instance)
|
||||||
if 'peers' in data:
|
if 'peers' in data:
|
||||||
for peer in [p for p in data['peers'] if p not in done_bag]:
|
for peer in [p for p in data['peers'] if p not in self.done_bag]:
|
||||||
queue.put(peer)
|
queue.put(peer)
|
||||||
save_data(data)
|
self.save_data(data)
|
||||||
done_bag.add(instance)
|
self.done_bag.add(instance)
|
||||||
queue.task_done()
|
queue.task_done()
|
||||||
|
|
||||||
|
|
||||||
class Command(BaseCommand):
|
|
||||||
help = "Scrapes the entire fediverse"
|
|
||||||
|
|
||||||
|
|
||||||
def handle(self, *args, **options):
|
def handle(self, *args, **options):
|
||||||
done_bag = set()
|
start_time = time.time()
|
||||||
|
self.existing_instance_ids = Instance.objects.all().values_list('name', flat=True)
|
||||||
|
print("Existing instances: {}".format(self.existing_instance_ids))
|
||||||
queue = multiprocessing.JoinableQueue()
|
queue = multiprocessing.JoinableQueue()
|
||||||
queue.put(SEED)
|
queue.put(SEED)
|
||||||
pool = multiprocessing.Pool(THREADS, initializer=worker, initargs=(queue, done_bag))
|
# pool = multiprocessing.Pool(1, initializer=self.worker, initargs=(queue, )) # Disable concurrency (debug)
|
||||||
|
pool = multiprocessing.Pool(initializer=self.worker, initargs=(queue, ))
|
||||||
queue.join()
|
queue.join()
|
||||||
self.stdout.write(self.style.SUCCESS("Successfully scraped the fediverse"))
|
end_time = time.time()
|
||||||
|
self.stdout.write(self.style.SUCCESS("Successfully scraped the fediverse in {}s".format(end_time-start_time)))
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
# Generated by Django 2.1 on 2018-08-26 00:29
|
# Generated by Django 2.1 on 2018-08-26 17:26
|
||||||
|
|
||||||
from django.db import migrations, models
|
from django.db import migrations, models
|
||||||
import django.db.models.deletion
|
import django.db.models.deletion
|
||||||
|
@ -15,9 +15,8 @@ class Migration(migrations.Migration):
|
||||||
migrations.CreateModel(
|
migrations.CreateModel(
|
||||||
name='Instance',
|
name='Instance',
|
||||||
fields=[
|
fields=[
|
||||||
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
|
('name', models.CharField(max_length=200, primary_key=True, serialize=False)),
|
||||||
('name', models.CharField(max_length=200)),
|
('peers', models.ManyToManyField(to='scraper.Instance')),
|
||||||
('peers', models.ManyToManyField(related_name='followers', to='scraper.Instance')),
|
|
||||||
],
|
],
|
||||||
),
|
),
|
||||||
migrations.CreateModel(
|
migrations.CreateModel(
|
||||||
|
@ -25,10 +24,10 @@ class Migration(migrations.Migration):
|
||||||
fields=[
|
fields=[
|
||||||
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
|
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
|
||||||
('timestamp', models.DateTimeField(auto_now_add=True)),
|
('timestamp', models.DateTimeField(auto_now_add=True)),
|
||||||
('num_peers', models.IntegerField()),
|
('num_peers', models.IntegerField(blank=True, null=True)),
|
||||||
('num_statuses', models.IntegerField()),
|
('num_statuses', models.IntegerField(blank=True, null=True)),
|
||||||
('num_users', models.IntegerField()),
|
('num_users', models.IntegerField(blank=True, null=True)),
|
||||||
('version', models.CharField(max_length=1000)),
|
('version', models.CharField(blank=True, max_length=1000)),
|
||||||
('status', models.CharField(max_length=100)),
|
('status', models.CharField(max_length=100)),
|
||||||
('instance', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='scraper.Instance')),
|
('instance', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='scraper.Instance')),
|
||||||
],
|
],
|
||||||
|
|
|
@ -1,42 +0,0 @@
|
||||||
# Generated by Django 2.1 on 2018-08-26 00:53
|
|
||||||
|
|
||||||
from django.db import migrations, models
|
|
||||||
|
|
||||||
|
|
||||||
class Migration(migrations.Migration):
|
|
||||||
|
|
||||||
dependencies = [
|
|
||||||
('scraper', '0001_initial'),
|
|
||||||
]
|
|
||||||
|
|
||||||
operations = [
|
|
||||||
migrations.RemoveField(
|
|
||||||
model_name='instance',
|
|
||||||
name='id',
|
|
||||||
),
|
|
||||||
migrations.AlterField(
|
|
||||||
model_name='instance',
|
|
||||||
name='name',
|
|
||||||
field=models.CharField(max_length=200, primary_key=True, serialize=False),
|
|
||||||
),
|
|
||||||
migrations.AlterField(
|
|
||||||
model_name='instancestats',
|
|
||||||
name='num_peers',
|
|
||||||
field=models.IntegerField(blank=True),
|
|
||||||
),
|
|
||||||
migrations.AlterField(
|
|
||||||
model_name='instancestats',
|
|
||||||
name='num_statuses',
|
|
||||||
field=models.IntegerField(blank=True),
|
|
||||||
),
|
|
||||||
migrations.AlterField(
|
|
||||||
model_name='instancestats',
|
|
||||||
name='num_users',
|
|
||||||
field=models.IntegerField(blank=True),
|
|
||||||
),
|
|
||||||
migrations.AlterField(
|
|
||||||
model_name='instancestats',
|
|
||||||
name='version',
|
|
||||||
field=models.CharField(blank=True, max_length=1000),
|
|
||||||
),
|
|
||||||
]
|
|
|
@ -1,28 +0,0 @@
|
||||||
# Generated by Django 2.1 on 2018-08-26 00:57
|
|
||||||
|
|
||||||
from django.db import migrations, models
|
|
||||||
|
|
||||||
|
|
||||||
class Migration(migrations.Migration):
|
|
||||||
|
|
||||||
dependencies = [
|
|
||||||
('scraper', '0002_auto_20180826_0053'),
|
|
||||||
]
|
|
||||||
|
|
||||||
operations = [
|
|
||||||
migrations.AlterField(
|
|
||||||
model_name='instancestats',
|
|
||||||
name='num_peers',
|
|
||||||
field=models.IntegerField(blank=True, null=True),
|
|
||||||
),
|
|
||||||
migrations.AlterField(
|
|
||||||
model_name='instancestats',
|
|
||||||
name='num_statuses',
|
|
||||||
field=models.IntegerField(blank=True, null=True),
|
|
||||||
),
|
|
||||||
migrations.AlterField(
|
|
||||||
model_name='instancestats',
|
|
||||||
name='num_users',
|
|
||||||
field=models.IntegerField(blank=True, null=True),
|
|
||||||
),
|
|
||||||
]
|
|
|
@ -1,18 +0,0 @@
|
||||||
# Generated by Django 2.1 on 2018-08-26 01:00
|
|
||||||
|
|
||||||
from django.db import migrations, models
|
|
||||||
|
|
||||||
|
|
||||||
class Migration(migrations.Migration):
|
|
||||||
|
|
||||||
dependencies = [
|
|
||||||
('scraper', '0003_auto_20180826_0057'),
|
|
||||||
]
|
|
||||||
|
|
||||||
operations = [
|
|
||||||
migrations.AlterField(
|
|
||||||
model_name='instance',
|
|
||||||
name='peers',
|
|
||||||
field=models.ManyToManyField(to='scraper.Instance'),
|
|
||||||
),
|
|
||||||
]
|
|
Loading…
Reference in a new issue