clean up scraper and improve logging

2018-08-27 01:27:14 +02:00 · 2018-08-27 01:27:14 +02:00 · a8a51bec1c
commit a8a51bec1c
parent cd6d85ef16
2 changed files with 22 additions and 22 deletions
--- a/scraper/management/commands/_util.py
+++ b/scraper/management/commands/_util.py
@ -37,4 +37,19 @@ def require_lock(model, lock):

        return wrapper

-    return require_lock_decorator
+    return require_lock_decorator
+
+
+class InvalidResponseError(Exception):
+    """Used for all responses other than HTTP 200"""
+    pass
+
+
+def get_key(data, keys: list):
+    try:
+        val = data[keys.pop(0)]
+        while keys:
+            val = val[keys.pop(0)]
+        return val
+    except KeyError:
+        return ''
--- a/scraper/management/commands/scrape.py
+++ b/scraper/management/commands/scrape.py
@ -7,10 +7,11 @@ import json
 import multiprocessing
 import requests
 import time
+from datetime import datetime
 from django.core.management.base import BaseCommand
 from django.db import transaction
 from scraper.models import Instance, InstanceStats
-from scraper.management.commands._util import require_lock
+from scraper.management.commands._util import require_lock, InvalidResponseError, get_key

 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
 # Because the script uses the Mastodon API other platforms like         #
@ -29,21 +30,6 @@ SEED = 'mastodon.social'
 TIMEOUT = 20


-class InvalidResponseError(Exception):
-    """Used for all responses other than HTTP 200"""
-    pass
-
-
-def get_key(data, keys: list):
-    try:
-        val = data[keys.pop(0)]
-        while keys:
-            val = val[keys.pop(0)]
-        return val
-    except KeyError:
-        return ''
-
-
 class Command(BaseCommand):
    help = "Scrapes the entire fediverse"

@ -71,21 +57,19 @@ class Command(BaseCommand):

    def process_instance(self, instance_name: str):
        """Given an instance, get all the data we're interested in"""
-        print("Processing {}".format(instance_name))
+        self.stdout.write("{} - Processing {}".format(datetime.now().isoformat(), instance_name))
        data = dict()
        try:
            data['instance'] = instance_name
            data['info'] = self.get_instance_info(instance_name)
-            data['peers'] = self.get_instance_peers(instance_name)
+            data['peers'] = [peer for peer in self.get_instance_peers(instance_name) if peer]  # get rid of null peers
            data['status'] = 'success'
-            print("Processed: {}".format(instance_name))
            return data
        except (InvalidResponseError,
                requests.exceptions.RequestException,
                json.decoder.JSONDecodeError) as e:
            data['instance'] = instance_name
            data['status'] = type(e).__name__
-            print("Failed: {}".format(instance_name))
            return data

    @transaction.atomic
@ -119,6 +103,7 @@ class Command(BaseCommand):
                status=get_key(data, ['status'])
            )
            stats.save()
+        self.stdout.write("{} - Saved {}".format(datetime.now().isoformat(), data['instance']))

    def worker(self, queue: multiprocessing.JoinableQueue):
        """The main worker that processes URLs"""
@ -145,4 +130,4 @@ class Command(BaseCommand):
        pool = multiprocessing.Pool(initializer=self.worker, initargs=(queue, ))
        queue.join()
        end_time = time.time()
-        self.stdout.write(self.style.SUCCESS("Successfully scraped the fediverse in {}s".format(end_time-start_time)))
+        self.stdout.write(self.style.SUCCESS("Successfully scraped the fediverse in {:.0f}s".format(end_time-start_time)))