Fewer unnecessary crawls
parent c01e324e91
commit cf9ac30b1e
@@ -13,6 +13,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Changed
 
+- Crawl instances that are down or unrecognized less often.
+
 ### Deprecated
 
 ### Removed
@@ -47,6 +47,11 @@ config :logger, :console,
 # Use Jason for JSON parsing in Phoenix
 config :phoenix, :json_library, Jason
 
+config :gollum,
+  refresh_secs: 86_400,
+  lazy_refresh: true,
+  user_agent: "fediverse.space crawler"
+
 config :ex_twilio,
   account_sid: System.get_env("TWILIO_ACCOUNT_SID"),
   auth_token: System.get_env("TWILIO_AUTH_TOKEN")
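The new config :gollum block sets up the Gollum robots.txt cache the crawler consults: entries are kept for 86_400 seconds (24 hours), refreshed lazily on access, and fetched with the crawler's own user agent. A minimal sketch of how a crawl step might check it, assuming Gollum's Gollum.crawlable?/2 API; the domain and path below are illustrative, not taken from this commit:

    # Sketch: ask the Gollum robots.txt cache whether a fetch is allowed.
    # The URL is made up; the user agent matches the config above.
    url = "https://example.social/api/v1/instance"

    case Gollum.crawlable?("fediverse.space crawler", url) do
      # robots.txt explicitly disallows this path for our user agent
      :uncrawlable -> :skip_crawl
      # :crawlable, or no applicable rule
      _ -> :ok_to_crawl
    end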
@@ -60,7 +65,7 @@ config :backend, :crawler,
   status_count_limit: 5000,
   personal_instance_threshold: 10,
   crawl_interval_mins: 60,
-  crawl_workers: 100,
+  crawl_workers: 50,
   blacklist: [
     "gab.best",
     "4chan.icu"
@@ -61,7 +61,7 @@ config :backend, :crawler,
   status_count_limit: 100,
   personal_instance_threshold: 5,
   crawl_interval_mins: 60,
-  crawl_workers: 50,
+  crawl_workers: 10,
   blacklist: [
     "gab.best",
     "4chan.icu"
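Both environments lower crawl_workers (100 to 50 in the first file, 50 to 10 in the second), shrinking the pool of Honeydew workers that service the :crawl_queue and so reducing crawl throughput. A rough sketch of how such a setting is typically wired into Honeydew at startup; the application-env lookup and the worker module name are assumptions, not code from this commit:

    # Sketch: size the Honeydew worker pool from the :crawl_workers setting.
    # Backend.Crawler stands in for whatever module implements the crawl job.
    workers = Application.get_env(:backend, :crawler)[:crawl_workers]

    Honeydew.start_queue(:crawl_queue)
    Honeydew.start_workers(:crawl_queue, Backend.Crawler, num: workers)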
@@ -20,7 +20,6 @@ defmodule Backend.Crawler.StaleInstanceManager do
 
     instance_count =
       Instance
-      |> where([i], not is_nil(i.version))
       |> select([i], count(i.domain))
      |> Repo.one()
 
@@ -49,48 +48,14 @@ defmodule Backend.Crawler.StaleInstanceManager do
     Process.send_after(self(), :queue_stale_domains, 60_000)
   end
 
-  # TODO: crawl instances with a blocking robots.txt less often (daily?)
   defp queue_stale_domains() do
-    interval = -1 * get_config(:crawl_interval_mins)
-
-    # Get domains that have never been crawled and where the last crawl is past the threshold
-    crawls_subquery =
-      Crawl
-      |> select([c], %{
-        instance_domain: c.instance_domain,
-        most_recent_crawl: max(c.inserted_at),
-        crawl_count: count(c.id)
-      })
-      |> where([c], is_nil(c.error))
-      |> group_by([c], c.instance_domain)
-
     stale_domains =
-      Instance
-      |> join(:left, [i], c in subquery(crawls_subquery), on: i.domain == c.instance_domain)
-      |> where(
-        [i, c],
-        (c.most_recent_crawl < datetime_add(^NaiveDateTime.utc_now(), ^interval, "minute") or
-           is_nil(c.crawl_count)) and not i.opt_out
-      )
-      |> select([i], i.domain)
-      |> Repo.all()
-      |> MapSet.new()
+      get_live_domains_to_crawl()
+      |> MapSet.union(get_dead_domains_to_crawl())
+      |> MapSet.union(get_new_domains_to_crawl())
 
     # Don't add a domain that's already in the queue
-    domains_in_queue =
-      Honeydew.filter(:crawl_queue, fn job ->
-        is_pending_crawl_job = match?(%Honeydew.Job{completed_at: nil, task: {:run, [_]}}, job)
-
-        if is_pending_crawl_job do
-          %Honeydew.Job{completed_at: nil, task: {:run, [d]}} = job
-          MapSet.member?(stale_domains, d)
-        else
-          false
-        end
-      end)
-      |> Enum.map(fn %Honeydew.Job{task: {:run, [d]}} -> d end)
-      |> MapSet.new()
+    domains_in_queue = get_domains_in_queue(stale_domains)
 
     domains_to_queue = MapSet.difference(stale_domains, domains_in_queue)
 
     Logger.debug("Adding #{MapSet.size(domains_to_queue)} stale domains to queue.")
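queue_stale_domains/0 now builds its stale set from three independent sources, one per instance state (live but stale, dead, never crawled), and unions them before subtracting whatever is already queued. A toy illustration of that set arithmetic with made-up domains:

    # Toy example of the MapSet plumbing used above; the domains are fabricated.
    live = MapSet.new(["a.example", "b.example"])
    dead = MapSet.new(["c.example"])
    never_crawled = MapSet.new(["d.example"])

    stale = live |> MapSet.union(dead) |> MapSet.union(never_crawled)
    in_queue = MapSet.new(["b.example"])

    MapSet.difference(stale, in_queue)
    # => MapSet containing "a.example", "c.example", "d.example"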
@@ -102,4 +67,132 @@ defmodule Backend.Crawler.StaleInstanceManager do
   defp add_to_queue(domain) do
     {:run, [domain]} |> Honeydew.async(:crawl_queue)
   end
+
+  # Handles instances where the most recent crawl was successful
+  @spec get_live_domains_to_crawl() :: MapSet.t()
+  defp get_live_domains_to_crawl() do
+    interval_mins = -1 * get_config(:crawl_interval_mins)
+
+    most_recent_crawl_subquery =
+      Crawl
+      |> select([c], %{
+        instance_domain: c.instance_domain,
+        inserted_at: max(c.inserted_at)
+      })
+      |> group_by([c], c.instance_domain)
+
+    Instance
+    |> join(:left, [i], most_recent_crawl in subquery(most_recent_crawl_subquery),
+      on: i.domain == most_recent_crawl.instance_domain
+    )
+    # Joining on a timestamp is really gross, but since we're joining on a timestamp in the same table, we should be OK.
+    |> join(:left, [i, most_recent_crawl], crawls in Crawl,
+      on:
+        i.domain == crawls.instance_domain and most_recent_crawl.inserted_at == crawls.inserted_at
+    )
+    |> where(
+      [i, most_recent_crawl, crawls],
+      is_nil(crawls.error) and
+        most_recent_crawl.inserted_at <
+          datetime_add(^NaiveDateTime.utc_now(), ^interval_mins, "minute") and not i.opt_out
+    )
+    |> select([i], i.domain)
+    |> Repo.all()
+    |> MapSet.new()
+  end
+
+  # Handles instances that have never been crawled at all.
+  @spec get_new_domains_to_crawl() :: MapSet.t()
+  defp get_new_domains_to_crawl() do
+    all_crawls_subquery =
+      Crawl
+      |> select([c], %{
+        instance_domain: c.instance_domain,
+        crawl_count: count(c.id)
+      })
+      |> group_by([c], c.instance_domain)
+
+    Instance
+    |> join(:left, [i], c in subquery(all_crawls_subquery), on: i.domain == c.instance_domain)
+    |> where([i, c], (is_nil(c.crawl_count) or c.crawl_count == 0) and not i.opt_out)
+    |> select([i], i.domain)
+    |> Repo.all()
+    |> MapSet.new()
+  end
+
+  # Handles instances where the previous crawl(s) were unsuccessful.
+  # These are crawled with an increasing delay
+  @spec get_dead_domains_to_crawl() :: MapSet.t()
+  defp get_dead_domains_to_crawl() do
+    now = get_now()
+    interval_mins = -1 * get_config(:crawl_interval_mins)
+
+    most_recent_successful_crawl_subquery =
+      Crawl
+      |> select([c], %{
+        instance_domain: c.instance_domain,
+        timestamp: max(c.inserted_at)
+      })
+      |> where([c], is_nil(c.error))
+      |> group_by([c], c.instance_domain)
+
+    Instance
+    |> join(
+      :left,
+      [i],
+      most_recent_successful_crawl in subquery(most_recent_successful_crawl_subquery),
+      on: i.domain == most_recent_successful_crawl.instance_domain
+    )
+    |> join(:left, [i, most_recent_successful_crawl_subquery], crawls in Crawl,
+      on: i.domain == crawls.instance_domain
+    )
+    |> select([i, most_recent_successful_crawl, crawls], %{
+      domain: i.domain,
+      most_recent_crawl: max(crawls.inserted_at),
+      failed_crawls: count(crawls.id)
+    })
+    |> group_by([i, most_recent_successful_crawl, crawls], i.domain)
+    |> where(
+      [i, most_recent_successful_crawl, crawls],
+      crawls.inserted_at > most_recent_successful_crawl.timestamp and not i.opt_out
+    )
+    |> Repo.all()
+    # We now have a list of domains, the # of failed crawls, and the most recent crawl timestamp.
+    # Now we filter down to those that should be crawled now.
+    |> Enum.map(fn %{
+                     domain: domain,
+                     most_recent_crawl: most_recent_crawl,
+                     failed_crawls: failed_crawls
+                   } ->
+      # The interval is never more than 24 hours
+      curr_interval = min(1440, interval_mins * :math.pow(2, failed_crawls))
+      next_crawl = NaiveDateTime.add(most_recent_crawl, curr_interval * 60, :second)
+
+      %{
+        domain: domain,
+        next_crawl: next_crawl
+      }
+    end)
+    |> Enum.filter(fn %{next_crawl: next_crawl} ->
+      NaiveDateTime.compare(now, next_crawl) == :gt
+    end)
+    |> Enum.map(fn %{domain: domain} -> domain end)
+    |> MapSet.new()
+  end
+
+  @spec get_domains_in_queue(MapSet.t()) :: MapSet.t()
+  defp get_domains_in_queue(domains) do
+    Honeydew.filter(:crawl_queue, fn job ->
+      is_pending_crawl_job = match?(%Honeydew.Job{completed_at: nil, task: {:run, [_]}}, job)
+
+      if is_pending_crawl_job do
+        %Honeydew.Job{completed_at: nil, task: {:run, [d]}} = job
+        MapSet.member?(domains, d)
+      else
+        false
+      end
+    end)
+    |> Enum.map(fn %Honeydew.Job{task: {:run, [d]}} -> d end)
+    |> MapSet.new()
+  end
 end
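get_dead_domains_to_crawl/0 applies an exponential backoff: the delay before re-crawling a failing instance doubles with every failed crawl since the last success and is capped at 1440 minutes (24 hours). A small worked example of the intended schedule, assuming a positive 60-minute base interval:

    # Backoff schedule for the formula above, assuming interval_mins = 60.
    interval_mins = 60

    for failed_crawls <- 0..6 do
      min(1440, interval_mins * :math.pow(2, failed_crawls))
    end
    # => [60.0, 120.0, 240.0, 480.0, 960.0, 1440, 1440]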