From ea0fae21527ed989a7aa56d6a6be5efa51f8b340 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tao=20Bror=20Bojl=C3=A9n?= Date: Sat, 27 Jul 2019 16:22:39 +0300 Subject: [PATCH] don't add domains to the queue twice --- backend/lib/backend/crawler/crawler.ex | 6 +++++ .../backend/crawler/stale_instance_manager.ex | 22 +++++++++++++++++-- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/backend/lib/backend/crawler/crawler.ex b/backend/lib/backend/crawler/crawler.ex index 392425c..d471707 100644 --- a/backend/lib/backend/crawler/crawler.ex +++ b/backend/lib/backend/crawler/crawler.ex @@ -49,6 +49,8 @@ defmodule Backend.Crawler do # go! |> crawl() |> save() + + Appsignal.increment_counter("crawler.total", 1) end # Adds a new ApiCrawler that run/1 will check. @@ -207,6 +209,8 @@ defmodule Backend.Crawler do CrawlInteraction |> Repo.insert_all(interactions) + + Appsignal.increment_counter("crawler.success", 1) end defp save(%{domain: domain, error: error, allows_crawling?: allows_crawling}) do @@ -232,6 +236,8 @@ defmodule Backend.Crawler do error: error }) end) + + Appsignal.increment_counter("crawler.failure", 1) end defp get_base_domain(domain) do diff --git a/backend/lib/backend/crawler/stale_instance_manager.ex b/backend/lib/backend/crawler/stale_instance_manager.ex index 4306b9c..0792d29 100644 --- a/backend/lib/backend/crawler/stale_instance_manager.ex +++ b/backend/lib/backend/crawler/stale_instance_manager.ex @@ -74,10 +74,28 @@ defmodule Backend.Crawler.StaleInstanceManager do ) |> select([i], i.domain) |> Repo.all() + |> MapSet.new() - Logger.debug("Adding #{length(stale_domains)} stale domains to queue.") + # Don't add a domain that's already in the queue + domains_in_queue = + Honeydew.filter(:crawl_queue, fn job -> + is_pending_crawl_job = match?(%Honeydew.Job{completed_at: nil, task: {:run, [_]}}, job) - stale_domains + if is_pending_crawl_job do + %Honeydew.Job{completed_at: nil, task: {:run, [d]}} = job + MapSet.member?(stale_domains, d) + else + false + end + end) + |> Enum.map(fn %Honeydew.Job{task: {:run, [d]}} -> d end) + |> MapSet.new() + + domains_to_queue = MapSet.difference(stale_domains, domains_in_queue) + + Logger.debug("Adding #{MapSet.size(domains_to_queue)} stale domains to queue.") + + domains_to_queue |> Enum.each(fn domain -> add_to_queue(domain) end) end