clean up scraper and improve logging

This commit is contained in:
Tao Bojlen 2018-08-27 01:27:14 +02:00
parent cd6d85ef16
commit a8a51bec1c
2 changed files with 22 additions and 22 deletions

View file

@ -37,4 +37,19 @@ def require_lock(model, lock):
return wrapper return wrapper
return require_lock_decorator return require_lock_decorator
class InvalidResponseError(Exception):
    """Raised for any HTTP response whose status code is not 200."""
def get_key(data, keys: list):
    """Safely walk a nested mapping along *keys*.

    Returns data[keys[0]][keys[1]]... , or '' if any key along the path
    is missing or an intermediate value is not subscriptable. An empty
    *keys* list returns *data* unchanged (the original raised IndexError).

    Unlike the original implementation, this does not pop from (and so
    does not destroy) the caller's *keys* list.
    """
    val = data
    try:
        for key in keys:
            val = val[key]
        return val
    except (KeyError, TypeError):
        # Missing key, or tried to index into a non-mapping leaf value.
        return ''

View file

@ -7,10 +7,11 @@ import json
import multiprocessing import multiprocessing
import requests import requests
import time import time
from datetime import datetime
from django.core.management.base import BaseCommand from django.core.management.base import BaseCommand
from django.db import transaction from django.db import transaction
from scraper.models import Instance, InstanceStats from scraper.models import Instance, InstanceStats
from scraper.management.commands._util import require_lock from scraper.management.commands._util import require_lock, InvalidResponseError, get_key
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# Because the script uses the Mastodon API other platforms like # # Because the script uses the Mastodon API other platforms like #
@ -29,21 +30,6 @@ SEED = 'mastodon.social'
TIMEOUT = 20 TIMEOUT = 20
class InvalidResponseError(Exception):
    """Raised for any HTTP response whose status code is not 200."""
def get_key(data, keys: list):
    """Safely walk a nested mapping along *keys*.

    Returns data[keys[0]][keys[1]]... , or '' if any key along the path
    is missing or an intermediate value is not subscriptable. An empty
    *keys* list returns *data* unchanged (the original raised IndexError).

    Unlike the original implementation, this does not pop from (and so
    does not destroy) the caller's *keys* list.
    """
    val = data
    try:
        for key in keys:
            val = val[key]
        return val
    except (KeyError, TypeError):
        # Missing key, or tried to index into a non-mapping leaf value.
        return ''
class Command(BaseCommand): class Command(BaseCommand):
help = "Scrapes the entire fediverse" help = "Scrapes the entire fediverse"
@ -71,21 +57,19 @@ class Command(BaseCommand):
def process_instance(self, instance_name: str): def process_instance(self, instance_name: str):
"""Given an instance, get all the data we're interested in""" """Given an instance, get all the data we're interested in"""
print("Processing {}".format(instance_name)) self.stdout.write("{} - Processing {}".format(datetime.now().isoformat(), instance_name))
data = dict() data = dict()
try: try:
data['instance'] = instance_name data['instance'] = instance_name
data['info'] = self.get_instance_info(instance_name) data['info'] = self.get_instance_info(instance_name)
data['peers'] = self.get_instance_peers(instance_name) data['peers'] = [peer for peer in self.get_instance_peers(instance_name) if peer] # get rid of null peers
data['status'] = 'success' data['status'] = 'success'
print("Processed: {}".format(instance_name))
return data return data
except (InvalidResponseError, except (InvalidResponseError,
requests.exceptions.RequestException, requests.exceptions.RequestException,
json.decoder.JSONDecodeError) as e: json.decoder.JSONDecodeError) as e:
data['instance'] = instance_name data['instance'] = instance_name
data['status'] = type(e).__name__ data['status'] = type(e).__name__
print("Failed: {}".format(instance_name))
return data return data
@transaction.atomic @transaction.atomic
@ -119,6 +103,7 @@ class Command(BaseCommand):
status=get_key(data, ['status']) status=get_key(data, ['status'])
) )
stats.save() stats.save()
self.stdout.write("{} - Saved {}".format(datetime.now().isoformat(), data['instance']))
def worker(self, queue: multiprocessing.JoinableQueue): def worker(self, queue: multiprocessing.JoinableQueue):
"""The main worker that processes URLs""" """The main worker that processes URLs"""
@ -145,4 +130,4 @@ class Command(BaseCommand):
pool = multiprocessing.Pool(initializer=self.worker, initargs=(queue, )) pool = multiprocessing.Pool(initializer=self.worker, initargs=(queue, ))
queue.join() queue.join()
end_time = time.time() end_time = time.time()
self.stdout.write(self.style.SUCCESS("Successfully scraped the fediverse in {}s".format(end_time-start_time))) self.stdout.write(self.style.SUCCESS("Successfully scraped the fediverse in {:.0f}s".format(end_time-start_time)))