Don't add domains to the crawl queue twice

This commit is contained in:
Tao Bror Bojlén 2019-07-27 16:22:39 +03:00
parent 74fc30e8a5
commit ea0fae2152
No known key found for this signature in database
GPG key ID: C6EC7AAB905F9E6F
2 changed files with 26 additions and 2 deletions

View file

@ -49,6 +49,8 @@ defmodule Backend.Crawler do
# go!
|> crawl()
|> save()
Appsignal.increment_counter("crawler.total", 1)
end
# Adds a new ApiCrawler for run/1 to check.
@ -207,6 +209,8 @@ defmodule Backend.Crawler do
CrawlInteraction
|> Repo.insert_all(interactions)
Appsignal.increment_counter("crawler.success", 1)
end
defp save(%{domain: domain, error: error, allows_crawling?: allows_crawling}) do
@ -232,6 +236,8 @@ defmodule Backend.Crawler do
error: error
})
end)
Appsignal.increment_counter("crawler.failure", 1)
end
defp get_base_domain(domain) do

View file

@ -74,10 +74,28 @@ defmodule Backend.Crawler.StaleInstanceManager do
)
|> select([i], i.domain)
|> Repo.all()
|> MapSet.new()
Logger.debug("Adding #{length(stale_domains)} stale domains to queue.")
# Don't add a domain that's already in the queue
domains_in_queue =
Honeydew.filter(:crawl_queue, fn job ->
is_pending_crawl_job = match?(%Honeydew.Job{completed_at: nil, task: {:run, [_]}}, job)
stale_domains
if is_pending_crawl_job do
%Honeydew.Job{completed_at: nil, task: {:run, [d]}} = job
MapSet.member?(stale_domains, d)
else
false
end
end)
|> Enum.map(fn %Honeydew.Job{task: {:run, [d]}} -> d end)
|> MapSet.new()
domains_to_queue = MapSet.difference(stale_domains, domains_in_queue)
Logger.debug("Adding #{MapSet.size(domains_to_queue)} stale domains to queue.")
domains_to_queue
|> Enum.each(fn domain -> add_to_queue(domain) end)
end