clean up scraper and improve logging
parent cd6d85ef16
commit a8a51bec1c
@@ -37,4 +37,19 @@ def require_lock(model, lock):
         return wrapper
     return require_lock_decorator
+
+
+class InvalidResponseError(Exception):
+    """Used for all responses other than HTTP 200"""
+    pass
+
+
+def get_key(data, keys: list):
+    try:
+        val = data[keys.pop(0)]
+        while keys:
+            val = val[keys.pop(0)]
+        return val
+    except KeyError:
+        return ''
 
 
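The new get_key helper walks a nested dict one key at a time and falls back to an empty string if any key is missing. Note that it consumes the keys list via pop(0), so callers cannot reuse the list afterwards. A quick illustration with a made-up payload:

    info = {'stats': {'user_count': 42}}

    get_key(info, ['stats', 'user_count'])  # -> 42
    get_key(info, ['stats', 'missing'])     # -> '' (KeyError is swallowed)

    keys = ['stats', 'user_count']
    get_key(info, keys)                     # keys is now [] (mutated in place)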
@@ -7,10 +7,11 @@ import json
 import multiprocessing
 import requests
 import time
+from datetime import datetime
 from django.core.management.base import BaseCommand
 from django.db import transaction
 from scraper.models import Instance, InstanceStats
-from scraper.management.commands._util import require_lock
+from scraper.management.commands._util import require_lock, InvalidResponseError, get_key
 
 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
 # Because the script uses the Mastodon API other platforms like         #
@@ -29,21 +30,6 @@ SEED = 'mastodon.social'
 TIMEOUT = 20
 
 
-class InvalidResponseError(Exception):
-    """Used for all responses other than HTTP 200"""
-    pass
-
-
-def get_key(data, keys: list):
-    try:
-        val = data[keys.pop(0)]
-        while keys:
-            val = val[keys.pop(0)]
-        return val
-    except KeyError:
-        return ''
-
-
 class Command(BaseCommand):
     help = "Scrapes the entire fediverse"
 
@@ -71,21 +57,19 @@ class Command(BaseCommand):
 
     def process_instance(self, instance_name: str):
         """Given an instance, get all the data we're interested in"""
-        print("Processing {}".format(instance_name))
+        self.stdout.write("{} - Processing {}".format(datetime.now().isoformat(), instance_name))
         data = dict()
         try:
             data['instance'] = instance_name
             data['info'] = self.get_instance_info(instance_name)
-            data['peers'] = self.get_instance_peers(instance_name)
+            data['peers'] = [peer for peer in self.get_instance_peers(instance_name) if peer]  # get rid of null peers
             data['status'] = 'success'
-            print("Processed: {}".format(instance_name))
             return data
         except (InvalidResponseError,
                 requests.exceptions.RequestException,
                 json.decoder.JSONDecodeError) as e:
             data['instance'] = instance_name
             data['status'] = type(e).__name__
-            print("Failed: {}".format(instance_name))
             return data
 
     @transaction.atomic
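The peers change filters out falsy entries, since the peers endpoint can apparently return nulls (decoded to None by the json module). A minimal illustration of the list comprehension, with invented peer names:

    peers = ['mastodon.social', None, '', 'example.com']
    [peer for peer in peers if peer]  # -> ['mastodon.social', 'example.com']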
@@ -119,6 +103,7 @@ class Command(BaseCommand):
             status=get_key(data, ['status'])
         )
         stats.save()
+        self.stdout.write("{} - Saved {}".format(datetime.now().isoformat(), data['instance']))
 
     def worker(self, queue: multiprocessing.JoinableQueue):
         """The main worker that processes URLs"""
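With the bare print() calls replaced by self.stdout.write() plus an ISO-8601 timestamp, each instance now produces log lines along these lines (timestamps fabricated for illustration):

    2018-07-01T12:00:00.000000 - Processing mastodon.social
    2018-07-01T12:00:02.000000 - Saved mastodon.social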
@@ -145,4 +130,4 @@ class Command(BaseCommand):
         pool = multiprocessing.Pool(initializer=self.worker, initargs=(queue, ))
         queue.join()
         end_time = time.time()
-        self.stdout.write(self.style.SUCCESS("Successfully scraped the fediverse in {}s".format(end_time-start_time)))
+        self.stdout.write(self.style.SUCCESS("Successfully scraped the fediverse in {:.0f}s".format(end_time-start_time)))
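For context on the timing block above: the command uses Pool's initializer so that every pool process immediately enters the worker loop and never returns, while the parent blocks on queue.join(). A minimal standalone sketch of that pattern, assuming Command.worker pairs each queue.get() with task_done() (the diff only shows its signature and docstring, but queue.join() can only return if it does):

    import multiprocessing

    def worker(queue: multiprocessing.JoinableQueue):
        # Runs in each pool process; loops forever pulling instance names.
        while True:
            instance_name = queue.get()
            print('processing', instance_name)  # placeholder for process_instance()
            queue.task_done()                   # lets queue.join() in the parent unblock

    if __name__ == '__main__':
        queue = multiprocessing.JoinableQueue()
        queue.put('mastodon.social')
        pool = multiprocessing.Pool(initializer=worker, initargs=(queue, ))
        queue.join()  # returns once every queued item is marked done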