harden scraper against unexpected API responses

This commit is contained in:
Tao Bojlen 2018-08-29 19:05:55 +02:00
parent 0f2aa3c938
commit df5ccf83ad
2 changed files with 8 additions and 4 deletions

View file

@@ -27,7 +27,7 @@ from scraper.management.commands._util import require_lock, InvalidResponseError
# TODO: use the /api/v1/server/followers and /api/v1/server/following endpoints in peertube instances # TODO: use the /api/v1/server/followers and /api/v1/server/following endpoints in peertube instances
SEED = 'mastodon.social' SEED = 'mastodon.social'
TIMEOUT = 20 TIMEOUT = 1
class Command(BaseCommand): class Command(BaseCommand):
@@ -51,9 +51,10 @@ class Command(BaseCommand):
"""Collect connected instances""" """Collect connected instances"""
url = 'https://' + instance_name + '/api/v1/instance/peers' url = 'https://' + instance_name + '/api/v1/instance/peers'
response = requests.get(url, timeout=TIMEOUT) response = requests.get(url, timeout=TIMEOUT)
if response.status_code != 200: json = response.json()
if response.status_code != 200 or not isinstance(json, list):
raise InvalidResponseError("Could not get peers for {}".format(instance_name)) raise InvalidResponseError("Could not get peers for {}".format(instance_name))
return response.json() return json
def process_instance(self, instance_name: str): def process_instance(self, instance_name: str):
"""Given an instance, get all the data we're interested in""" """Given an instance, get all the data we're interested in"""
@@ -63,6 +64,9 @@ class Command(BaseCommand):
data['instance'] = instance_name data['instance'] = instance_name
data['info'] = self.get_instance_info(instance_name) data['info'] = self.get_instance_info(instance_name)
data['peers'] = [peer for peer in self.get_instance_peers(instance_name) if peer] # get rid of null peers data['peers'] = [peer for peer in self.get_instance_peers(instance_name) if peer] # get rid of null peers
if not data['info'] and not data['peers']:
# We got a response from the instance, but it didn't have any of the information we were expecting.
raise InvalidResponseError
data['status'] = 'success' data['status'] = 'success'
return data return data
except (InvalidResponseError, except (InvalidResponseError,

View file

@@ -18,5 +18,5 @@ class InstanceStats(models.Model):
domain_count = models.IntegerField(blank=True, null=True) domain_count = models.IntegerField(blank=True, null=True)
status_count = models.IntegerField(blank=True, null=True) status_count = models.IntegerField(blank=True, null=True)
user_count = models.IntegerField(blank=True, null=True) user_count = models.IntegerField(blank=True, null=True)
version = models.CharField(max_length=1000, blank=True) version = models.CharField(max_length=1000, blank=True) # In Django CharField is never stored as NULL in the db
status = models.CharField(max_length=100) status = models.CharField(max_length=100)