Clean up crawler
This commit is contained in:
parent
82b016a6a1
commit
63d7a3fe95
|
@ -32,5 +32,8 @@ class NodeView(viewsets.ReadOnlyModelViewSet):
|
|||
"""
|
||||
Endpoint to get a list of the graph's nodes in a SigmaJS-friendly format.
|
||||
"""
|
||||
queryset = Instance.objects.filter(status='success')
|
||||
queryset = Instance.objects.filter(status='success')\
|
||||
.filter(x_coord__isnull=False)\
|
||||
.filter(y_coord__isnull=False)\
|
||||
.filter(user_count__isnull=False)
|
||||
serializer_class = NodeSerializer
|
||||
|
|
|
@ -24,13 +24,30 @@ from scraper.management.commands._util import require_lock, InvalidResponseExcep
|
|||
SEED = 'mastodon.social'
|
||||
TIMEOUT = 20 # seconds
|
||||
NUM_THREADS = 16 # roughly 40MB each
|
||||
PERSONAL_INSTANCE_THRESHOLD = 5 # instances with < this many users won't be scraped
|
||||
PERSONAL_INSTANCE_THRESHOLD = 10 # instances with < this many users won't be crawled
|
||||
MAX_STATUSES_PER_PAGE = 100
|
||||
STATUS_SCRAPE_LIMIT = 5000
|
||||
INSTANCE_SCRAPE_LIMIT = 50 # note: this does not include newly discovered instances! they will always be crawled.
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
help = "Scrapes the entire fediverse"
|
||||
|
||||
def add_arguments(self, parser):
    """Register the crawler's optional command-line switches on ``parser``.

    Both switches are boolean flags (``store_true``):

    * ``--unlimited`` is stored under ``unlimited``.
    * ``--all`` is stored under ``all``.
    """
    # Named (optional) arguments, declared as (flag, destination, help text).
    boolean_switches = (
        (
            '--unlimited',
            'unlimited',
            "Crawl all stale instances rather than limiting to {}".format(INSTANCE_SCRAPE_LIMIT),
        ),
        (
            '--all',
            'all',
            "Crawl all instances rather than limiting to stale ones",
        ),
    )
    for flag, destination, description in boolean_switches:
        parser.add_argument(
            flag,
            action='store_true',
            dest=destination,
            help=description,
        )
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.scraped_count = 0
|
||||
|
@ -59,7 +76,8 @@ class Command(BaseCommand):
|
|||
if response.status_code != 200 or not isinstance(peers, list) or get_key(peers, ['error']):
|
||||
raise InvalidResponseException("Could not get peers for {}".format(instance_name))
|
||||
# Get rid of peers that just say "null" and the instance itself
|
||||
return [peer for peer in peers if peer and peer != instance_name]
|
||||
# Also make sure to lowercase all instance names; otherwise there'll be some duplicates
|
||||
return [peer.lower() for peer in peers if peer and peer != instance_name]
|
||||
|
||||
@staticmethod
|
||||
def get_statuses(instance_name: str):
|
||||
|
@ -67,9 +85,9 @@ class Command(BaseCommand):
|
|||
mentions = []
|
||||
datetime_threshold = datetime.now(timezone.utc) - timedelta(days=31)
|
||||
statuses_seen = 0
|
||||
# We'll ask for 1000 statuses, but Mastodon never returns more than 40. Some Pleroma instances will ignore
|
||||
# We'll ask for lots of statuses, but Mastodon never returns more than 40. Some Pleroma instances will ignore
|
||||
# the limit and return 20.
|
||||
url = 'https://' + instance_name + '/api/v1/timelines/public?local=true&limit=1000'
|
||||
url = 'https://{}/api/v1/timelines/public?local=true&limit={}/'.format(instance_name, MAX_STATUSES_PER_PAGE)
|
||||
while True:
|
||||
response = requests.get(url, timeout=TIMEOUT)
|
||||
statuses = response.json()
|
||||
|
@ -91,7 +109,7 @@ class Command(BaseCommand):
|
|||
break
|
||||
# Continuing, so get url for next page
|
||||
min_id = earliest_status['id']
|
||||
url = 'https://' + instance_name + '/api/v1/timelines/public?local=true&limit=1000&max_id=' + min_id
|
||||
url = 'https://{}/api/v1/timelines/public?local=true&limit={}&max_id={}'.format(instance_name, MAX_STATUSES_PER_PAGE, min_id)
|
||||
time.sleep(2) # Sleep to avoid overloading the instance
|
||||
|
||||
mentions_seq = (seq(mentions)
|
||||
|
@ -201,7 +219,14 @@ class Command(BaseCommand):
|
|||
|
||||
def handle(self, *args, **options):
|
||||
start_time = time.time()
|
||||
if options['all']:
|
||||
stale_instances = Instance.objects.all()
|
||||
else:
|
||||
stale_instances = Instance.objects.filter(last_updated__lte=datetime.now()-timedelta(days=1))
|
||||
|
||||
if not options['unlimited']:
|
||||
stale_instances = stale_instances[:INSTANCE_SCRAPE_LIMIT]
|
||||
|
||||
with mp.Manager() as manager:
|
||||
# Share the list of existing instances amongst all threads (to avoid each thread having to query
|
||||
# for it on every instance it scrapes)
|
||||
|
|
Loading…
Reference in a new issue