only save successful crawls in db

Tao Bror Bojlén 2019-08-07 22:41:19 +03:00
parent d9b9081ec3
commit 30c5bbe2b9
No known key found for this signature in database
GPG key ID: C6EC7AAB905F9E6F
10 changed files with 65 additions and 90 deletions

@@ -11,16 +11,13 @@ defmodule Backend.Crawl do
     field :interactions_seen, :integer
     field :statuses_seen, :integer
 
-    # if something went wrong, otherwise null
-    field :error, :string
-
     timestamps()
   end
 
   @doc false
   def changeset(crawl, attrs) do
     crawl
-    |> cast(attrs, [:instance, :statuses_seen, :interactions_seen, :error])
+    |> cast(attrs, [:instance, :statuses_seen, :interactions_seen])
     |> validate_required([:instance])
   end
 end
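Since :error is no longer in the cast list, an :error key in the attrs map is silently discarded by cast/3. A minimal sketch (the attrs values are invented):

    changeset =
      Backend.Crawl.changeset(%Backend.Crawl{}, %{
        instance: "example.social",
        statuses_seen: 20,
        interactions_seen: 5,
        # no longer cast, so it never reaches the changeset
        error: "connection timeout"
      })

    changeset.changes
    #=> %{instance: "example.social", statuses_seen: 20, interactions_seen: 5}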

@@ -120,7 +120,9 @@ defmodule Backend.Crawler do
       status_count: result.status_count,
       type: instance_type,
       base_domain: get_base_domain(domain),
-      next_crawl: NaiveDateTime.add(now, get_config(:crawl_interval_mins) * 60, :second)
+      next_crawl: NaiveDateTime.add(now, get_config(:crawl_interval_mins) * 60, :second),
+      crawl_error: nil,
+      crawl_error_count: 0
     }
 
     Repo.insert!(
@@ -135,7 +137,9 @@ defmodule Backend.Crawler do
            :type,
            :base_domain,
            :updated_at,
-           :next_crawl
+           :next_crawl,
+           :crawl_error,
+           :crawl_error_count
          ]},
       conflict_target: :domain
     )
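For reference, the on_conflict/conflict_target pair above is Ecto's upsert: when an instances row with the same domain already exists, only the listed columns are replaced, so a successful crawl now also clears any stored error state. A minimal sketch of the semantics (domain invented):

    Repo.insert!(
      %Instance{domain: "example.social", crawl_error: nil, crawl_error_count: 0},
      on_conflict: {:replace, [:crawl_error, :crawl_error_count]},
      conflict_target: :domain
    )
    # existing row: only the listed columns are overwritten
    # no existing row: plain insert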
@@ -240,10 +244,15 @@ defmodule Backend.Crawler do
     end
 
     # The "+1" is this error!
-    error_count = get_recent_crawl_error_count(domain) + 1
-    # The crawl interval grows exponentially at first but never goes above 24 hours
+    error_count =
+      Instance
+      |> Repo.get_by!(domain: domain)
+      |> Map.get(:crawl_error_count)
+      |> Kernel.+(1)
+
+    # The crawl interval grows exponentially at first but never goes above 72 hours
     crawl_interval_mins =
-      min(get_config(:crawl_interval_mins) * round(:math.pow(2, error_count)), 1440)
+      min(get_config(:crawl_interval_mins) * round(:math.pow(2, error_count)), 4320)
 
     next_crawl = NaiveDateTime.add(now, crawl_interval_mins * 60, :second)
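To make the backoff concrete, assume get_config(:crawl_interval_mins) returns 30 (an invented base; the real value comes from config):

    base = 30

    for error_count <- [1, 4, 8] do
      min(base * round(:math.pow(2, error_count)), 4320)
    end
    #=> [60, 480, 4320]  (4320 minutes is the 72-hour ceiling)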
@@ -252,16 +261,13 @@ defmodule Backend.Crawler do
         %Instance{
           domain: domain,
           base_domain: get_base_domain(domain),
+          crawl_error: error,
+          crawl_error_count: error_count,
           next_crawl: next_crawl
         },
-        on_conflict: {:replace, [:next_crawl]},
+        on_conflict: {:replace, [:base_domain, :crawl_error, :crawl_error_count, :next_crawl]},
         conflict_target: :domain
       )
-
-      Repo.insert!(%Crawl{
-        instance_domain: domain,
-        error: error
-      })
     end)
 
     Appsignal.increment_counter("crawler.failure", 1)

@@ -108,7 +108,7 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
     # most recent status we have.
     min_timestamp =
       if statuses_seen == 0 do
-        get_last_successful_crawl_timestamp(domain)
+        get_last_crawl_timestamp(domain)
       else
         min_timestamp
       end

@@ -56,8 +56,7 @@ defmodule Backend.Crawler.Crawlers.Misskey do
       |> NaiveDateTime.add(get_config(:status_age_limit_days) * 24 * 3600 * -1, :second)
 
     # Don't get any statuses older than this
-    min_timestamp =
-      max_datetime(get_last_successful_crawl_timestamp(domain), status_datetime_threshold)
+    min_timestamp = max_datetime(get_last_crawl_timestamp(domain), status_datetime_threshold)
 
     {interactions, statuses_seen} = get_interactions(domain, min_timestamp)
     {:ok, {version, description}} = get_version_and_description(domain)
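get_last_crawl_timestamp/1 returns nil for a never-crawled instance, so max_datetime/2 presumably treats nil as the smaller argument. A sketch of that assumed behavior (the real implementation lives in Backend.Util):

    def max_datetime(nil, datetime), do: datetime
    def max_datetime(datetime, nil), do: datetime

    def max_datetime(dt1, dt2) do
      case NaiveDateTime.compare(dt1, dt2) do
        :gt -> dt1
        _ -> dt2
      end
    end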

@@ -15,6 +15,8 @@ defmodule Backend.Instance do
     field :opt_in, :boolean
     field :opt_out, :boolean
     field :next_crawl, :naive_datetime
+    field :crawl_error, :string
+    field :crawl_error_count, :integer
 
     many_to_many :peers, Backend.Instance,
       join_through: Backend.InstancePeer,
@@ -45,7 +47,9 @@ defmodule Backend.Instance do
       :base_domain,
       :opt_in,
       :opt_out,
-      :next_crawl
+      :next_crawl,
+      :crawl_error,
+      :crawl_error_count
     ])
     |> validate_required([:domain])
     |> put_assoc(:peers, attrs.peers)

@@ -43,7 +43,6 @@ defmodule Backend.Scheduler do
         instance_domain: c.instance_domain,
         interactions_seen: sum(c.interactions_seen)
       })
-      |> where([c], is_nil(c.error))
       |> group_by([c], c.instance_domain)
 
     scores =
@@ -100,7 +99,7 @@ defmodule Backend.Scheduler do
       )
       |> where(
         [c, c2],
-        c.inserted_at > c2.earliest_crawl and not is_nil(c.statuses_seen) and is_nil(c.error)
+        c.inserted_at > c2.earliest_crawl and not is_nil(c.statuses_seen)
       )
       |> select([c], %{
         instance_domain: c.instance_domain,
@@ -148,7 +147,6 @@ defmodule Backend.Scheduler do
         instance_domain: c.instance_domain,
         statuses_seen: sum(c.statuses_seen)
       })
-      |> where([c], is_nil(c.error))
       |> group_by([c], c.instance_domain)
 
     interactions =

@@ -78,38 +78,16 @@ defmodule Backend.Util do
   @spec get_last_crawl(String.t()) :: Crawl.t() | nil
   def get_last_crawl(domain) do
-    crawls =
-      Crawl
-      |> select([c], c)
-      |> where([c], c.instance_domain == ^domain)
-      |> order_by(desc: :id)
-      |> limit(1)
-      |> Repo.all()
-
-    case length(crawls) do
-      1 -> hd(crawls)
-      0 -> nil
-    end
+    Crawl
+    |> select([c], c)
+    |> where([c], c.instance_domain == ^domain)
+    |> order_by(desc: :id)
+    |> limit(1)
+    |> Repo.one()
   end
 
-  @spec get_last_successful_crawl(String.t()) :: Crawl.t() | nil
-  def get_last_successful_crawl(domain) do
-    crawls =
-      Crawl
-      |> select([c], c)
-      |> where([c], is_nil(c.error) and c.instance_domain == ^domain)
-      |> order_by(desc: :id)
-      |> limit(1)
-      |> Repo.all()
-
-    case length(crawls) do
-      1 -> hd(crawls)
-      0 -> nil
-    end
-  end
-
-  @spec get_last_successful_crawl_timestamp(String.t()) :: NaiveDateTime.t() | nil
-  def get_last_successful_crawl_timestamp(domain) do
+  @spec get_last_crawl_timestamp(String.t()) :: NaiveDateTime.t() | nil
+  def get_last_crawl_timestamp(domain) do
     crawl = get_last_crawl(domain)
 
     case crawl do
@@ -167,28 +145,4 @@ defmodule Backend.Util do
   def convert_keys_to_atoms(map) do
     map |> Map.new(fn {k, v} -> {String.to_atom(k), v} end)
   end
-
-  # Given a domain, returns the number of n most recent crawls that errored
-  @spec get_recent_crawl_error_count(String.t()) :: integer
-  def get_recent_crawl_error_count(domain) do
-    most_recent_success_crawl_subquery =
-      Crawl
-      |> select([c], %{
-        instance_domain: c.instance_domain,
-        timestamp: max(c.inserted_at)
-      })
-      |> where([c], c.instance_domain == ^domain and is_nil(c.error))
-      |> group_by([c], c.instance_domain)
-
-    Crawl
-    |> join(:left, [c1], c2 in subquery(most_recent_success_crawl_subquery),
-      on: c1.instance_domain == c2.instance_domain
-    )
-    |> where(
-      [c1, c2],
-      c1.instance_domain == ^domain and (c1.inserted_at > c2.timestamp or is_nil(c2.timestamp))
-    )
-    |> select([c1, c2], count(c1.id))
-    |> Repo.one()
-  end
 end
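Call sites only change by the rename; Repo.one/1 returns the single row or nil, which is exactly what the old Repo.all/length/hd combination emulated. A hypothetical session (domains invented):

    iex> Backend.Util.get_last_crawl_timestamp("example.social")
    ~N[2019-08-07 19:41:19]

    iex> Backend.Util.get_last_crawl_timestamp("never-crawled.example")
    nil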

@@ -6,18 +6,6 @@ defmodule BackendWeb.InstanceView do
   def render("show.json", %{instance: instance, crawl: crawl}) do
     user_threshold = get_config(:personal_instance_threshold)
 
-    [status, last_updated] =
-      case crawl do
-        nil ->
-          ["not crawled", nil]
-
-        _ ->
-          case crawl.error do
-            nil -> ["success", crawl.inserted_at]
-            err -> [err, crawl.inserted_at]
-          end
-      end
-
     cond do
       instance.user_count < user_threshold and not instance.opt_in ->
         %{
@@ -25,7 +13,21 @@ defmodule BackendWeb.InstanceView do
           status: "personal instance"
         }
 
+      instance.crawl_error == "robots.txt" ->
+        %{
+          name: instance.domain,
+          status: instance.crawl_error
+        }
+
+      instance.crawl_error != nil and instance.type == nil ->
+        %{
+          name: instance.domain,
+          status: instance.crawl_error
+        }
+
       true ->
+        last_updated = max_datetime(crawl.inserted_at, instance.updated_at)
+
         filtered_peers =
           instance.peers
           |> Enum.filter(fn peer -> not peer.opt_out end)
@@ -48,7 +50,7 @@ defmodule BackendWeb.InstanceView do
           domainCount: length(instance.peers),
          peers: render_many(filtered_peers, InstanceView, "instance.json"),
           lastUpdated: last_updated,
-          status: status,
+          status: "success",
           type: instance.type,
           statusesPerDay: instance.statuses_per_day,
           statusesPerUserPerDay: statuses_per_user_per_day
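With failed crawls no longer stored, error status is read off the instance itself: an instance that errored and was never successfully typed renders a minimal payload, while one with an earlier successful crawl still renders full data with status "success". For the error case the output looks roughly like this (values invented):

    %{
      name: "example.social",
      status: "robots.txt"
    }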

@@ -13,7 +13,6 @@ defmodule Backend.Repo.Migrations.CreateCrawls do
       timestamps()
     end
 
-    # TODO: does this actually make WHERE error IS NULL queries faster? if not, drop it
     create index(:crawls, [:error])
     create index(:crawls, [:inserted_at])
   end

@@ -0,0 +1,16 @@
+defmodule Backend.Repo.Migrations.RemoveCrawlError do
+  use Ecto.Migration
+
+  def change do
+    execute("DELETE FROM crawls WHERE error IS NOT NULL", "")
+
+    alter table(:crawls) do
+      remove :error, :string
+    end
+
+    alter table(:instances) do
+      add :crawl_error, :string
+      add :crawl_error_count, :integer, default: 0, null: false
+    end
+  end
+end
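The two-argument execute/2 keeps change/0 reversible: the first string runs on migrate, the second (here empty) on rollback, so the deleted failed-crawl rows are not restored; the typed remove/add calls are reversible on their own. An explicit up/down sketch of the same intent:

    def up do
      execute("DELETE FROM crawls WHERE error IS NOT NULL")
    end

    def down do
      # nothing to restore; the deleted rows are gone for good
      :ok
    end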