only save successful crawls in db

commit 30c5bbe2b9
parent d9b9081ec3

@@ -11,16 +11,13 @@ defmodule Backend.Crawl do
     field :interactions_seen, :integer
     field :statuses_seen, :integer
 
-    # if something went wrong, otherwise null
-    field :error, :string
-
     timestamps()
   end
 
   @doc false
   def changeset(crawl, attrs) do
     crawl
-    |> cast(attrs, [:instance, :statuses_seen, :interactions_seen, :error])
+    |> cast(attrs, [:instance, :statuses_seen, :interactions_seen])
     |> validate_required([:instance])
   end
 end

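Note on the changeset above: cast/3 only picks up keys named in its permitted
list, so after this change an :error key in the attrs map is silently dropped
rather than written. A sketch with hypothetical attrs:

    # :error is no longer permitted, so it never reaches changeset.changes
    Backend.Crawl.changeset(%Backend.Crawl{}, %{instance: "example.social", error: "timeout"})
    # => changeset whose changes include :instance but not :error
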
@@ -120,7 +120,9 @@ defmodule Backend.Crawler do
       status_count: result.status_count,
       type: instance_type,
       base_domain: get_base_domain(domain),
-      next_crawl: NaiveDateTime.add(now, get_config(:crawl_interval_mins) * 60, :second)
+      next_crawl: NaiveDateTime.add(now, get_config(:crawl_interval_mins) * 60, :second),
+      crawl_error: nil,
+      crawl_error_count: 0
     }
 
     Repo.insert!(

@@ -135,7 +137,9 @@ defmodule Backend.Crawler do
            :type,
            :base_domain,
            :updated_at,
-           :next_crawl
+           :next_crawl,
+           :crawl_error,
+           :crawl_error_count
          ]},
       conflict_target: :domain
     )

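Both new columns also have to be named in the on_conflict replace list: with
Ecto's on_conflict: {:replace, columns}, a collision on the :domain unique
index turns the insert into an UPDATE of just the listed columns, leaving the
rest of the existing row untouched. A minimal sketch of the mechanism (struct
values are placeholders, not taken from this commit):

    # Compiles to INSERT ... ON CONFLICT (domain) DO UPDATE SET <listed cols>
    Repo.insert!(
      %Instance{domain: "example.social", crawl_error: nil, crawl_error_count: 0},
      on_conflict: {:replace, [:crawl_error, :crawl_error_count]},
      conflict_target: :domain
    )
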
@@ -240,10 +244,15 @@ defmodule Backend.Crawler do
     end
 
     # The "+1" is this error!
-    error_count = get_recent_crawl_error_count(domain) + 1
-    # The crawl interval grows exponentially at first but never goes above 24 hours
+    error_count =
+      Instance
+      |> Repo.get_by!(domain: domain)
+      |> Map.get(:crawl_error_count)
+      |> Kernel.+(1)
+
+    # The crawl interval grows exponentially at first but never goes above 72 hours
     crawl_interval_mins =
-      min(get_config(:crawl_interval_mins) * round(:math.pow(2, error_count)), 1440)
+      min(get_config(:crawl_interval_mins) * round(:math.pow(2, error_count)), 4320)
 
     next_crawl = NaiveDateTime.add(now, crawl_interval_mins * 60, :second)
 
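A worked example of the new backoff, assuming a base interval of 30 minutes
(an illustrative value; the real one comes from get_config(:crawl_interval_mins)):

    base = 30

    for error_count <- 1..8 do
      mins = min(base * round(:math.pow(2, error_count)), 4320)
      IO.puts("failure ##{error_count}: next crawl in #{mins} min")
    end

    # failure #1: 60 min, #2: 120, #3: 240, ... #7: 3840
    # failure #8 onwards: 4320 min — the new 72-hour cap (previously 1440 min, i.e. 24 hours)
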
@@ -252,16 +261,13 @@ defmodule Backend.Crawler do
         %Instance{
           domain: domain,
           base_domain: get_base_domain(domain),
+          crawl_error: error,
+          crawl_error_count: error_count,
           next_crawl: next_crawl
         },
-        on_conflict: {:replace, [:next_crawl]},
+        on_conflict: {:replace, [:base_domain, :crawl_error, :crawl_error_count, :next_crawl]},
         conflict_target: :domain
       )
-
-      Repo.insert!(%Crawl{
-        instance_domain: domain,
-        error: error
-      })
     end)
 
     Appsignal.increment_counter("crawler.failure", 1)

@@ -108,7 +108,7 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
     # most recent status we have.
     min_timestamp =
       if statuses_seen == 0 do
-        get_last_successful_crawl_timestamp(domain)
+        get_last_crawl_timestamp(domain)
       else
         min_timestamp
       end

@@ -56,8 +56,7 @@ defmodule Backend.Crawler.Crawlers.Misskey do
       |> NaiveDateTime.add(get_config(:status_age_limit_days) * 24 * 3600 * -1, :second)
 
     # Don't get any statuses older than this
-    min_timestamp =
-      max_datetime(get_last_successful_crawl_timestamp(domain), status_datetime_threshold)
+    min_timestamp = max_datetime(get_last_crawl_timestamp(domain), status_datetime_threshold)
 
     {interactions, statuses_seen} = get_interactions(domain, min_timestamp)
     {:ok, {version, description}} = get_version_and_description(domain)

@@ -15,6 +15,8 @@ defmodule Backend.Instance do
     field :opt_in, :boolean
     field :opt_out, :boolean
     field :next_crawl, :naive_datetime
+    field :crawl_error, :string
+    field :crawl_error_count, :integer
 
     many_to_many :peers, Backend.Instance,
       join_through: Backend.InstancePeer,

@@ -45,7 +47,9 @@ defmodule Backend.Instance do
       :base_domain,
       :opt_in,
       :opt_out,
-      :next_crawl
+      :next_crawl,
+      :crawl_error,
+      :crawl_error_count
     ])
     |> validate_required([:domain])
     |> put_assoc(:peers, attrs.peers)

@@ -43,7 +43,6 @@ defmodule Backend.Scheduler do
         instance_domain: c.instance_domain,
         interactions_seen: sum(c.interactions_seen)
       })
-      |> where([c], is_nil(c.error))
       |> group_by([c], c.instance_domain)
 
     scores =

@@ -100,7 +99,7 @@ defmodule Backend.Scheduler do
       )
       |> where(
         [c, c2],
-        c.inserted_at > c2.earliest_crawl and not is_nil(c.statuses_seen) and is_nil(c.error)
+        c.inserted_at > c2.earliest_crawl and not is_nil(c.statuses_seen)
       )
       |> select([c], %{
         instance_domain: c.instance_domain,

@@ -148,7 +147,6 @@ defmodule Backend.Scheduler do
         instance_domain: c.instance_domain,
         statuses_seen: sum(c.statuses_seen)
       })
-      |> where([c], is_nil(c.error))
      |> group_by([c], c.instance_domain)
 
     interactions =

@@ -78,38 +78,16 @@ defmodule Backend.Util do
 
   @spec get_last_crawl(String.t()) :: Crawl.t() | nil
   def get_last_crawl(domain) do
-    crawls =
-      Crawl
-      |> select([c], c)
-      |> where([c], c.instance_domain == ^domain)
-      |> order_by(desc: :id)
-      |> limit(1)
-      |> Repo.all()
-
-    case length(crawls) do
-      1 -> hd(crawls)
-      0 -> nil
-    end
-  end
-
-  @spec get_last_successful_crawl(String.t()) :: Crawl.t() | nil
-  def get_last_successful_crawl(domain) do
-    crawls =
-      Crawl
-      |> select([c], c)
-      |> where([c], is_nil(c.error) and c.instance_domain == ^domain)
-      |> order_by(desc: :id)
-      |> limit(1)
-      |> Repo.all()
-
-    case length(crawls) do
-      1 -> hd(crawls)
-      0 -> nil
-    end
+    Crawl
+    |> select([c], c)
+    |> where([c], c.instance_domain == ^domain)
+    |> order_by(desc: :id)
+    |> limit(1)
+    |> Repo.one()
   end
 
-  @spec get_last_successful_crawl_timestamp(String.t()) :: NaiveDateTime.t() | nil
-  def get_last_successful_crawl_timestamp(domain) do
+  @spec get_last_crawl_timestamp(String.t()) :: NaiveDateTime.t() | nil
+  def get_last_crawl_timestamp(domain) do
     crawl = get_last_crawl(domain)
 
     case crawl do

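One behavioral note on the simplified get_last_crawl/1: Repo.one/1 returns nil
for an empty result and raises Ecto.MultipleResultsError for more than one row,
so together with limit(1) it is a drop-in replacement for the removed
Repo.all/1 plus case length(crawls) pattern. Hypothetical usage:

    case get_last_crawl("example.social") do
      nil -> :never_crawled      # Repo.one/1 found no rows
      %Crawl{} = crawl -> crawl  # exactly one row, guaranteed by limit(1)
    end
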
@@ -167,28 +145,4 @@ defmodule Backend.Util do
   def convert_keys_to_atoms(map) do
     map |> Map.new(fn {k, v} -> {String.to_atom(k), v} end)
   end
-
-  # Given a domain, returns the number of n most recent crawls that errored
-  @spec get_recent_crawl_error_count(String.t()) :: integer
-  def get_recent_crawl_error_count(domain) do
-    most_recent_success_crawl_subquery =
-      Crawl
-      |> select([c], %{
-        instance_domain: c.instance_domain,
-        timestamp: max(c.inserted_at)
-      })
-      |> where([c], c.instance_domain == ^domain and is_nil(c.error))
-      |> group_by([c], c.instance_domain)
-
-    Crawl
-    |> join(:left, [c1], c2 in subquery(most_recent_success_crawl_subquery),
-      on: c1.instance_domain == c2.instance_domain
-    )
-    |> where(
-      [c1, c2],
-      c1.instance_domain == ^domain and (c1.inserted_at > c2.timestamp or is_nil(c2.timestamp))
-    )
-    |> select([c1, c2], count(c1.id))
-    |> Repo.one()
-  end
 end

@@ -6,18 +6,6 @@ defmodule BackendWeb.InstanceView do
   def render("show.json", %{instance: instance, crawl: crawl}) do
     user_threshold = get_config(:personal_instance_threshold)
 
-    [status, last_updated] =
-      case crawl do
-        nil ->
-          ["not crawled", nil]
-
-        _ ->
-          case crawl.error do
-            nil -> ["success", crawl.inserted_at]
-            err -> [err, crawl.inserted_at]
-          end
-      end
-
     cond do
       instance.user_count < user_threshold and not instance.opt_in ->
         %{

@@ -25,7 +13,21 @@ defmodule BackendWeb.InstanceView do
           status: "personal instance"
         }
 
+      instance.crawl_error == "robots.txt" ->
+        %{
+          name: instance.domain,
+          status: instance.crawl_error
+        }
+
+      instance.crawl_error != nil and instance.type == nil ->
+        %{
+          name: instance.domain,
+          status: instance.crawl_error
+        }
+
       true ->
+        last_updated = max_datetime(crawl.inserted_at, instance.updated_at)
+
         filtered_peers =
           instance.peers
           |> Enum.filter(fn peer -> not peer.opt_out end)

@@ -48,7 +50,7 @@ defmodule BackendWeb.InstanceView do
           domainCount: length(instance.peers),
           peers: render_many(filtered_peers, InstanceView, "instance.json"),
           lastUpdated: last_updated,
-          status: status,
+          status: "success",
           type: instance.type,
           statusesPerDay: instance.statuses_per_day,
           statusesPerUserPerDay: statuses_per_user_per_day

@@ -13,7 +13,6 @@ defmodule Backend.Repo.Migrations.CreateCrawls do
       timestamps()
     end
 
-    # TODO: does this actually make WHERE error IS NULL queries faster? if not, drop it
     create index(:crawls, [:error])
     create index(:crawls, [:inserted_at])
   end

@@ -0,0 +1,16 @@
+defmodule Backend.Repo.Migrations.RemoveCrawlError do
+  use Ecto.Migration
+
+  def change do
+    execute("DELETE FROM crawls WHERE error IS NOT NULL", "")
+
+    alter table(:crawls) do
+      remove :error, :string
+    end
+
+    alter table(:instances) do
+      add :crawl_error, :string
+      add :crawl_error_count, :integer, default: 0, null: false
+    end
+  end
+end
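
A note on execute/2 in the new migration: the first argument runs on migrate
and the second on rollback, so the empty string deliberately makes the DELETE a
no-op when rolling back (the discarded error rows could not be restored anyway).
Passing the type to remove :error, :string is what keeps the alter block
reversible — rollback re-adds the column as a :string. Sketch of the two
directions:

    # migrate:  DELETE FROM crawls WHERE error IS NOT NULL
    # rollback: "" — nothing to run
    execute("DELETE FROM crawls WHERE error IS NOT NULL", "")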