improve scraping logic

This commit is contained in:
Tao Bror Bojlén 2019-02-20 22:09:21 +00:00
parent a20d8e23e3
commit e33c70fa25
No known key found for this signature in database
GPG key ID: C6EC7AAB905F9E6F

View file

@@ -150,7 +150,7 @@ class Command(BaseCommand):
# Create instances for the peers we haven't seen before and add them to the queue # Create instances for the peers we haven't seen before and add them to the queue
new_instance_ids = [peer_id for peer_id in data['peers'] if peer_id not in existing_instance_ids] new_instance_ids = [peer_id for peer_id in data['peers'] if peer_id not in existing_instance_ids]
# bulk_create doesn't call save(), so the auto_now_add field won't get set automatically # bulk_create doesn't call save(), so the auto_now_add field won't get set automatically
new_instances = [Instance(name=id, first_seen=datetime.now(), last_updated=datetime.now()) new_instances = [Instance(name=id, first_seen=datetime.now(), last_updated=datetime.utcfromtimestamp(0))
for id in new_instance_ids] for id in new_instance_ids]
existing_instance_ids.extend(new_instance_ids) existing_instance_ids.extend(new_instance_ids)
Instance.objects.bulk_create(new_instances) Instance.objects.bulk_create(new_instances)
@@ -180,12 +180,12 @@ class Command(BaseCommand):
self.stdout.write(log("Processed {}: {}".format(data['instance_name'], data['status']))) self.stdout.write(log("Processed {}: {}".format(data['instance_name'], data['status'])))
def worker(self, queue: mp.JoinableQueue, existing_instance_ids, scraped_ids): def worker(self, queue: mp.JoinableQueue, existing_instance_ids, scraped_ids):
"""The main worker that processes URLs""" """The main worker that processes instances"""
# https://stackoverflow.com/a/38356519/3697202 db.connections.close_all() # https://stackoverflow.com/a/38356519/3697202
db.connections.close_all()
while True: while True:
instance = queue.get() instance = queue.get()
if instance.name in scraped_ids: if instance.name in scraped_ids:
# If we hit this branch, it's indicative of a bug
self.stderr.write(log("Skipping {}, already done. This should not have been added to the queue!" self.stderr.write(log("Skipping {}, already done. This should not have been added to the queue!"
.format(instance))) .format(instance)))
queue.task_done() queue.task_done()