Only save successful crawls in the database

This commit is contained in:
Tao Bror Bojlén 2019-08-07 22:41:19 +03:00
parent d9b9081ec3
commit 30c5bbe2b9
No known key found for this signature in database
GPG key ID: C6EC7AAB905F9E6F
10 changed files with 65 additions and 90 deletions

View file

@ -11,16 +11,13 @@ defmodule Backend.Crawl do
field :interactions_seen, :integer
field :statuses_seen, :integer
# if something went wrong, otherwise null
field :error, :string
timestamps()
end
@doc false
def changeset(crawl, attrs) do
crawl
|> cast(attrs, [:instance, :statuses_seen, :interactions_seen, :error])
|> cast(attrs, [:instance, :statuses_seen, :interactions_seen])
|> validate_required([:instance])
end
end

View file

@ -120,7 +120,9 @@ defmodule Backend.Crawler do
status_count: result.status_count,
type: instance_type,
base_domain: get_base_domain(domain),
next_crawl: NaiveDateTime.add(now, get_config(:crawl_interval_mins) * 60, :second)
next_crawl: NaiveDateTime.add(now, get_config(:crawl_interval_mins) * 60, :second),
crawl_error: nil,
crawl_error_count: 0
}
Repo.insert!(
@ -135,7 +137,9 @@ defmodule Backend.Crawler do
:type,
:base_domain,
:updated_at,
:next_crawl
:next_crawl,
:crawl_error,
:crawl_error_count
]},
conflict_target: :domain
)
@ -240,10 +244,15 @@ defmodule Backend.Crawler do
end
# The "+1" is this error!
error_count = get_recent_crawl_error_count(domain) + 1
# The crawl interval grows exponentially at first but never goes above 24 hours
error_count =
Instance
|> Repo.get_by!(domain: domain)
|> Map.get(:crawl_error_count)
|> Kernel.+(1)
# The crawl interval grows exponentially at first but never goes above 72 hours
crawl_interval_mins =
min(get_config(:crawl_interval_mins) * round(:math.pow(2, error_count)), 1440)
min(get_config(:crawl_interval_mins) * round(:math.pow(2, error_count)), 4320)
next_crawl = NaiveDateTime.add(now, crawl_interval_mins * 60, :second)
@ -252,16 +261,13 @@ defmodule Backend.Crawler do
%Instance{
domain: domain,
base_domain: get_base_domain(domain),
crawl_error: error,
crawl_error_count: error_count,
next_crawl: next_crawl
},
on_conflict: {:replace, [:next_crawl]},
on_conflict: {:replace, [:base_domain, :crawl_error, :crawl_error_count, :next_crawl]},
conflict_target: :domain
)
Repo.insert!(%Crawl{
instance_domain: domain,
error: error
})
end)
Appsignal.increment_counter("crawler.failure", 1)

View file

@ -108,7 +108,7 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
# most recent status we have.
min_timestamp =
if statuses_seen == 0 do
get_last_successful_crawl_timestamp(domain)
get_last_crawl_timestamp(domain)
else
min_timestamp
end

View file

@ -56,8 +56,7 @@ defmodule Backend.Crawler.Crawlers.Misskey do
|> NaiveDateTime.add(get_config(:status_age_limit_days) * 24 * 3600 * -1, :second)
# Don't get any statuses older than this
min_timestamp =
max_datetime(get_last_successful_crawl_timestamp(domain), status_datetime_threshold)
min_timestamp = max_datetime(get_last_crawl_timestamp(domain), status_datetime_threshold)
{interactions, statuses_seen} = get_interactions(domain, min_timestamp)
{:ok, {version, description}} = get_version_and_description(domain)

View file

@ -15,6 +15,8 @@ defmodule Backend.Instance do
field :opt_in, :boolean
field :opt_out, :boolean
field :next_crawl, :naive_datetime
field :crawl_error, :string
field :crawl_error_count, :integer
many_to_many :peers, Backend.Instance,
join_through: Backend.InstancePeer,
@ -45,7 +47,9 @@ defmodule Backend.Instance do
:base_domain,
:opt_in,
:opt_out,
:next_crawl
:next_crawl,
:crawl_error,
:crawl_error_count
])
|> validate_required([:domain])
|> put_assoc(:peers, attrs.peers)

View file

@ -43,7 +43,6 @@ defmodule Backend.Scheduler do
instance_domain: c.instance_domain,
interactions_seen: sum(c.interactions_seen)
})
|> where([c], is_nil(c.error))
|> group_by([c], c.instance_domain)
scores =
@ -100,7 +99,7 @@ defmodule Backend.Scheduler do
)
|> where(
[c, c2],
c.inserted_at > c2.earliest_crawl and not is_nil(c.statuses_seen) and is_nil(c.error)
c.inserted_at > c2.earliest_crawl and not is_nil(c.statuses_seen)
)
|> select([c], %{
instance_domain: c.instance_domain,
@ -148,7 +147,6 @@ defmodule Backend.Scheduler do
instance_domain: c.instance_domain,
statuses_seen: sum(c.statuses_seen)
})
|> where([c], is_nil(c.error))
|> group_by([c], c.instance_domain)
interactions =

View file

@ -78,38 +78,16 @@ defmodule Backend.Util do
@spec get_last_crawl(String.t()) :: Crawl.t() | nil
def get_last_crawl(domain) do
crawls =
Crawl
|> select([c], c)
|> where([c], c.instance_domain == ^domain)
|> order_by(desc: :id)
|> limit(1)
|> Repo.all()
case length(crawls) do
1 -> hd(crawls)
0 -> nil
end
|> Repo.one()
end
@spec get_last_successful_crawl(String.t()) :: Crawl.t() | nil
def get_last_successful_crawl(domain) do
crawls =
Crawl
|> select([c], c)
|> where([c], is_nil(c.error) and c.instance_domain == ^domain)
|> order_by(desc: :id)
|> limit(1)
|> Repo.all()
case length(crawls) do
1 -> hd(crawls)
0 -> nil
end
end
@spec get_last_successful_crawl_timestamp(String.t()) :: NaiveDateTime.t() | nil
def get_last_successful_crawl_timestamp(domain) do
@spec get_last_crawl_timestamp(String.t()) :: NaiveDateTime.t() | nil
def get_last_crawl_timestamp(domain) do
crawl = get_last_crawl(domain)
case crawl do
@ -167,28 +145,4 @@ defmodule Backend.Util do
def convert_keys_to_atoms(map) do
map |> Map.new(fn {k, v} -> {String.to_atom(k), v} end)
end
# Given a domain, returns the number of n most recent crawls that errored
@spec get_recent_crawl_error_count(String.t()) :: integer
def get_recent_crawl_error_count(domain) do
most_recent_success_crawl_subquery =
Crawl
|> select([c], %{
instance_domain: c.instance_domain,
timestamp: max(c.inserted_at)
})
|> where([c], c.instance_domain == ^domain and is_nil(c.error))
|> group_by([c], c.instance_domain)
Crawl
|> join(:left, [c1], c2 in subquery(most_recent_success_crawl_subquery),
on: c1.instance_domain == c2.instance_domain
)
|> where(
[c1, c2],
c1.instance_domain == ^domain and (c1.inserted_at > c2.timestamp or is_nil(c2.timestamp))
)
|> select([c1, c2], count(c1.id))
|> Repo.one()
end
end

View file

@ -6,18 +6,6 @@ defmodule BackendWeb.InstanceView do
def render("show.json", %{instance: instance, crawl: crawl}) do
user_threshold = get_config(:personal_instance_threshold)
[status, last_updated] =
case crawl do
nil ->
["not crawled", nil]
_ ->
case crawl.error do
nil -> ["success", crawl.inserted_at]
err -> [err, crawl.inserted_at]
end
end
cond do
instance.user_count < user_threshold and not instance.opt_in ->
%{
@ -25,7 +13,21 @@ defmodule BackendWeb.InstanceView do
status: "personal instance"
}
instance.crawl_error == "robots.txt" ->
%{
name: instance.domain,
status: instance.crawl_error
}
instance.crawl_error != nil and instance.type == nil ->
%{
name: instance.domain,
status: instance.crawl_error
}
true ->
last_updated = max_datetime(crawl.inserted_at, instance.updated_at)
filtered_peers =
instance.peers
|> Enum.filter(fn peer -> not peer.opt_out end)
@ -48,7 +50,7 @@ defmodule BackendWeb.InstanceView do
domainCount: length(instance.peers),
peers: render_many(filtered_peers, InstanceView, "instance.json"),
lastUpdated: last_updated,
status: status,
status: "success",
type: instance.type,
statusesPerDay: instance.statuses_per_day,
statusesPerUserPerDay: statuses_per_user_per_day

View file

@ -13,7 +13,6 @@ defmodule Backend.Repo.Migrations.CreateCrawls do
timestamps()
end
# TODO: does this actually make WHERE error IS NULL queries faster? if not, drop it
create index(:crawls, [:error])
create index(:crawls, [:inserted_at])
end

View file

@ -0,0 +1,16 @@
# Moves crawl-error tracking off the `crawls` table and onto `instances`:
# failed crawls are no longer stored as rows; instead each instance keeps its
# most recent error and a failure count (used elsewhere in this commit to
# compute the exponential-backoff crawl interval).
defmodule Backend.Repo.Migrations.RemoveCrawlError do
use Ecto.Migration
def change do
# Purge the failed-crawl rows before dropping the column that marked them.
# The second argument of execute/2 is the "down" SQL — "" makes rolling back
# this step a no-op (the deleted rows cannot be restored anyway).
execute("DELETE FROM crawls WHERE error IS NOT NULL", "")
# `crawls` now holds only successful crawls, so the error column is dead.
# The type (:string) is supplied so the remove is reversible on rollback.
alter table(:crawls) do
remove :error, :string
end
# New per-instance error state: the last error message (nil on success) and
# a running count of consecutive failed crawls, defaulting to 0 for all
# existing rows and never NULL.
alter table(:instances) do
add :crawl_error, :string
add :crawl_error_count, :integer, default: 0, null: false
end
end
end