clean up scraper and improve logging

This commit is contained in:
Tao Bojlen 2018-08-27 01:27:14 +02:00
parent cd6d85ef16
commit a8a51bec1c
2 changed files with 22 additions and 22 deletions

View file

@ -37,4 +37,19 @@ def require_lock(model, lock):
return wrapper
return require_lock_decorator
return require_lock_decorator
class InvalidResponseError(Exception):
    """Signals a response whose status was anything other than HTTP 200."""
def get_key(data, keys: list):
    """Follow ``keys`` one level at a time into ``data`` and return the value.

    Args:
        data: A subscriptable container (nested dicts and/or lists).
        keys: The path to follow, outermost key first. The list is only
            read; it is never modified, so callers may reuse it.

    Returns:
        The value found at the end of the path, or ``''`` when any step of
        the path cannot be resolved. An empty ``keys`` returns ``data``.
    """
    val = data
    try:
        # Plain iteration instead of keys.pop(0): popping emptied the
        # caller's list as a side effect of a read-only lookup.
        for key in keys:
            val = val[key]
    except (KeyError, IndexError, TypeError):
        # KeyError: missing dict key. IndexError: list too short.
        # TypeError: an intermediate value (e.g. None) is not subscriptable.
        return ''
    return val

View file

@ -7,10 +7,11 @@ import json
import multiprocessing
import requests
import time
from datetime import datetime
from django.core.management.base import BaseCommand
from django.db import transaction
from scraper.models import Instance, InstanceStats
from scraper.management.commands._util import require_lock
from scraper.management.commands._util import require_lock, InvalidResponseError, get_key
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# Because the script uses the Mastodon API other platforms like #
@ -29,21 +30,6 @@ SEED = 'mastodon.social'
TIMEOUT = 20
class InvalidResponseError(Exception):
    """Signals a response whose status was anything other than HTTP 200."""
def get_key(data, keys: list):
    """Follow ``keys`` one level at a time into ``data`` and return the value.

    Args:
        data: A subscriptable container (nested dicts and/or lists).
        keys: The path to follow, outermost key first. The list is only
            read; it is never modified, so callers may reuse it.

    Returns:
        The value found at the end of the path, or ``''`` when any step of
        the path cannot be resolved. An empty ``keys`` returns ``data``.
    """
    val = data
    try:
        # Plain iteration instead of keys.pop(0): popping emptied the
        # caller's list as a side effect of a read-only lookup.
        for key in keys:
            val = val[key]
    except (KeyError, IndexError, TypeError):
        # KeyError: missing dict key. IndexError: list too short.
        # TypeError: an intermediate value (e.g. None) is not subscriptable.
        return ''
    return val
class Command(BaseCommand):
help = "Scrapes the entire fediverse"
@ -71,21 +57,19 @@ class Command(BaseCommand):
# NOTE(review): this span looks like a rendered diff whose +/- markers and
# indentation were stripped, so what are presumably the pre- and post-change
# variants of some lines appear back to back. Confirm against the real file.
def process_instance(self, instance_name: str):
"""Given an instance, get all the data we're interested in"""
# Progress message: a plain print followed by a timestamped
# self.stdout.write for the same event.
print("Processing {}".format(instance_name))
self.stdout.write("{} - Processing {}".format(datetime.now().isoformat(), instance_name))
# Result dict; it is returned on both the success and the failure path.
data = dict()
try:
data['instance'] = instance_name
data['info'] = self.get_instance_info(instance_name)
# 'peers' is assigned twice here; the second assignment keeps only
# truthy entries from get_instance_peers.
data['peers'] = self.get_instance_peers(instance_name)
data['peers'] = [peer for peer in self.get_instance_peers(instance_name) if peer] # get rid of null peers
data['status'] = 'success'
print("Processed: {}".format(instance_name))
return data
# These three failure types are not re-raised; they are reported to the
# caller through data['status'] instead.
except (InvalidResponseError,
requests.exceptions.RequestException,
json.decoder.JSONDecodeError) as e:
data['instance'] = instance_name
# The status is the exception's class name.
data['status'] = type(e).__name__
print("Failed: {}".format(instance_name))
return data
@transaction.atomic
@ -119,6 +103,7 @@ class Command(BaseCommand):
status=get_key(data, ['status'])
)
stats.save()
self.stdout.write("{} - Saved {}".format(datetime.now().isoformat(), data['instance']))
def worker(self, queue: multiprocessing.JoinableQueue):
"""The main worker that processes URLs"""
@ -145,4 +130,4 @@ class Command(BaseCommand):
pool = multiprocessing.Pool(initializer=self.worker, initargs=(queue, ))
queue.join()
end_time = time.time()
self.stdout.write(self.style.SUCCESS("Successfully scraped the fediverse in {}s".format(end_time-start_time)))
self.stdout.write(self.style.SUCCESS("Successfully scraped the fediverse in {:.0f}s".format(end_time-start_time)))