From e33c70fa25eb655d1991ca85abdf89cef9a12415 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tao=20Bror=20Bojl=C3=A9n?= Date: Wed, 20 Feb 2019 22:09:21 +0000 Subject: [PATCH] improve scraping logic --- backend/api/scraper/management/commands/scrape.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/backend/api/scraper/management/commands/scrape.py b/backend/api/scraper/management/commands/scrape.py index 81e22ab..92b5ad7 100644 --- a/backend/api/scraper/management/commands/scrape.py +++ b/backend/api/scraper/management/commands/scrape.py @@ -150,7 +150,7 @@ class Command(BaseCommand): # Create instances for the peers we haven't seen before and add them to the queue new_instance_ids = [peer_id for peer_id in data['peers'] if peer_id not in existing_instance_ids] # bulk_create doesn't call save(), so the auto_now_add field won't get set automatically - new_instances = [Instance(name=id, first_seen=datetime.now(), last_updated=datetime.now()) + new_instances = [Instance(name=id, first_seen=datetime.now(), last_updated=datetime.utcfromtimestamp(0)) for id in new_instance_ids] existing_instance_ids.extend(new_instance_ids) Instance.objects.bulk_create(new_instances) @@ -180,12 +180,12 @@ class Command(BaseCommand): self.stdout.write(log("Processed {}: {}".format(data['instance_name'], data['status']))) def worker(self, queue: mp.JoinableQueue, existing_instance_ids, scraped_ids): - """The main worker that processes URLs""" - # https://stackoverflow.com/a/38356519/3697202 - db.connections.close_all() + """The main worker that processes instances""" + db.connections.close_all() # https://stackoverflow.com/a/38356519/3697202 while True: instance = queue.get() if instance.name in scraped_ids: + # If we hit this branch, it's indicative of a bug self.stderr.write(log("Skipping {}, already done. This should not have been added to the queue!" .format(instance))) queue.task_done()