only save successful crawls in db
This commit is contained in:
parent
d9b9081ec3
commit
30c5bbe2b9
|
@ -11,16 +11,13 @@ defmodule Backend.Crawl do
|
|||
field :interactions_seen, :integer
|
||||
field :statuses_seen, :integer
|
||||
|
||||
# error message if the crawl failed; otherwise null
|
||||
field :error, :string
|
||||
|
||||
timestamps()
|
||||
end
|
||||
|
||||
@doc false
|
||||
def changeset(crawl, attrs) do
|
||||
crawl
|
||||
|> cast(attrs, [:instance, :statuses_seen, :interactions_seen, :error])
|
||||
|> cast(attrs, [:instance, :statuses_seen, :interactions_seen])
|
||||
|> validate_required([:instance])
|
||||
end
|
||||
end
|
||||
|
|
|
@ -120,7 +120,9 @@ defmodule Backend.Crawler do
|
|||
status_count: result.status_count,
|
||||
type: instance_type,
|
||||
base_domain: get_base_domain(domain),
|
||||
next_crawl: NaiveDateTime.add(now, get_config(:crawl_interval_mins) * 60, :second)
|
||||
next_crawl: NaiveDateTime.add(now, get_config(:crawl_interval_mins) * 60, :second),
|
||||
crawl_error: nil,
|
||||
crawl_error_count: 0
|
||||
}
|
||||
|
||||
Repo.insert!(
|
||||
|
@ -135,7 +137,9 @@ defmodule Backend.Crawler do
|
|||
:type,
|
||||
:base_domain,
|
||||
:updated_at,
|
||||
:next_crawl
|
||||
:next_crawl,
|
||||
:crawl_error,
|
||||
:crawl_error_count
|
||||
]},
|
||||
conflict_target: :domain
|
||||
)
|
||||
|
@ -240,10 +244,15 @@ defmodule Backend.Crawler do
|
|||
end
|
||||
|
||||
# The "+1" is this error!
|
||||
error_count = get_recent_crawl_error_count(domain) + 1
|
||||
# The crawl interval grows exponentially at first but never goes above 24 hours
|
||||
error_count =
|
||||
Instance
|
||||
|> Repo.get_by!(domain: domain)
|
||||
|> Map.get(:crawl_error_count)
|
||||
|> Kernel.+(1)
|
||||
|
||||
# The crawl interval grows exponentially at first but never goes above 72 hours
|
||||
crawl_interval_mins =
|
||||
min(get_config(:crawl_interval_mins) * round(:math.pow(2, error_count)), 1440)
|
||||
min(get_config(:crawl_interval_mins) * round(:math.pow(2, error_count)), 4320)
|
||||
|
||||
next_crawl = NaiveDateTime.add(now, crawl_interval_mins * 60, :second)
|
||||
|
||||
|
@ -252,16 +261,13 @@ defmodule Backend.Crawler do
|
|||
%Instance{
|
||||
domain: domain,
|
||||
base_domain: get_base_domain(domain),
|
||||
crawl_error: error,
|
||||
crawl_error_count: error_count,
|
||||
next_crawl: next_crawl
|
||||
},
|
||||
on_conflict: {:replace, [:next_crawl]},
|
||||
on_conflict: {:replace, [:base_domain, :crawl_error, :crawl_error_count, :next_crawl]},
|
||||
conflict_target: :domain
|
||||
)
|
||||
|
||||
Repo.insert!(%Crawl{
|
||||
instance_domain: domain,
|
||||
error: error
|
||||
})
|
||||
end)
|
||||
|
||||
Appsignal.increment_counter("crawler.failure", 1)
|
||||
|
|
|
@ -108,7 +108,7 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
|
|||
# most recent status we have.
|
||||
min_timestamp =
|
||||
if statuses_seen == 0 do
|
||||
get_last_successful_crawl_timestamp(domain)
|
||||
get_last_crawl_timestamp(domain)
|
||||
else
|
||||
min_timestamp
|
||||
end
|
||||
|
|
|
@ -56,8 +56,7 @@ defmodule Backend.Crawler.Crawlers.Misskey do
|
|||
|> NaiveDateTime.add(get_config(:status_age_limit_days) * 24 * 3600 * -1, :second)
|
||||
|
||||
# Don't get any statuses older than this
|
||||
min_timestamp =
|
||||
max_datetime(get_last_successful_crawl_timestamp(domain), status_datetime_threshold)
|
||||
min_timestamp = max_datetime(get_last_crawl_timestamp(domain), status_datetime_threshold)
|
||||
|
||||
{interactions, statuses_seen} = get_interactions(domain, min_timestamp)
|
||||
{:ok, {version, description}} = get_version_and_description(domain)
|
||||
|
|
|
@ -15,6 +15,8 @@ defmodule Backend.Instance do
|
|||
field :opt_in, :boolean
|
||||
field :opt_out, :boolean
|
||||
field :next_crawl, :naive_datetime
|
||||
field :crawl_error, :string
|
||||
field :crawl_error_count, :integer
|
||||
|
||||
many_to_many :peers, Backend.Instance,
|
||||
join_through: Backend.InstancePeer,
|
||||
|
@ -45,7 +47,9 @@ defmodule Backend.Instance do
|
|||
:base_domain,
|
||||
:opt_in,
|
||||
:opt_out,
|
||||
:next_crawl
|
||||
:next_crawl,
|
||||
:crawl_error,
|
||||
:crawl_error_count
|
||||
])
|
||||
|> validate_required([:domain])
|
||||
|> put_assoc(:peers, attrs.peers)
|
||||
|
|
|
@ -43,7 +43,6 @@ defmodule Backend.Scheduler do
|
|||
instance_domain: c.instance_domain,
|
||||
interactions_seen: sum(c.interactions_seen)
|
||||
})
|
||||
|> where([c], is_nil(c.error))
|
||||
|> group_by([c], c.instance_domain)
|
||||
|
||||
scores =
|
||||
|
@ -100,7 +99,7 @@ defmodule Backend.Scheduler do
|
|||
)
|
||||
|> where(
|
||||
[c, c2],
|
||||
c.inserted_at > c2.earliest_crawl and not is_nil(c.statuses_seen) and is_nil(c.error)
|
||||
c.inserted_at > c2.earliest_crawl and not is_nil(c.statuses_seen)
|
||||
)
|
||||
|> select([c], %{
|
||||
instance_domain: c.instance_domain,
|
||||
|
@ -148,7 +147,6 @@ defmodule Backend.Scheduler do
|
|||
instance_domain: c.instance_domain,
|
||||
statuses_seen: sum(c.statuses_seen)
|
||||
})
|
||||
|> where([c], is_nil(c.error))
|
||||
|> group_by([c], c.instance_domain)
|
||||
|
||||
interactions =
|
||||
|
|
|
@ -78,38 +78,16 @@ defmodule Backend.Util do
|
|||
|
||||
@spec get_last_crawl(String.t()) :: Crawl.t() | nil
|
||||
def get_last_crawl(domain) do
|
||||
crawls =
|
||||
Crawl
|
||||
|> select([c], c)
|
||||
|> where([c], c.instance_domain == ^domain)
|
||||
|> order_by(desc: :id)
|
||||
|> limit(1)
|
||||
|> Repo.all()
|
||||
|
||||
case length(crawls) do
|
||||
1 -> hd(crawls)
|
||||
0 -> nil
|
||||
end
|
||||
|> Repo.one()
|
||||
end
|
||||
|
||||
@spec get_last_successful_crawl(String.t()) :: Crawl.t() | nil
|
||||
def get_last_successful_crawl(domain) do
|
||||
crawls =
|
||||
Crawl
|
||||
|> select([c], c)
|
||||
|> where([c], is_nil(c.error) and c.instance_domain == ^domain)
|
||||
|> order_by(desc: :id)
|
||||
|> limit(1)
|
||||
|> Repo.all()
|
||||
|
||||
case length(crawls) do
|
||||
1 -> hd(crawls)
|
||||
0 -> nil
|
||||
end
|
||||
end
|
||||
|
||||
@spec get_last_successful_crawl_timestamp(String.t()) :: NaiveDateTime.t() | nil
|
||||
def get_last_successful_crawl_timestamp(domain) do
|
||||
@spec get_last_crawl_timestamp(String.t()) :: NaiveDateTime.t() | nil
|
||||
def get_last_crawl_timestamp(domain) do
|
||||
crawl = get_last_crawl(domain)
|
||||
|
||||
case crawl do
|
||||
|
@ -167,28 +145,4 @@ defmodule Backend.Util do
|
|||
def convert_keys_to_atoms(map) do
|
||||
map |> Map.new(fn {k, v} -> {String.to_atom(k), v} end)
|
||||
end
|
||||
|
||||
# Given a domain, returns the number of crawls recorded since its most recent
# successful crawl (or the number of all its crawls, if none has succeeded)
|
||||
@spec get_recent_crawl_error_count(String.t()) :: integer
|
||||
def get_recent_crawl_error_count(domain) do
|
||||
most_recent_success_crawl_subquery =
|
||||
Crawl
|
||||
|> select([c], %{
|
||||
instance_domain: c.instance_domain,
|
||||
timestamp: max(c.inserted_at)
|
||||
})
|
||||
|> where([c], c.instance_domain == ^domain and is_nil(c.error))
|
||||
|> group_by([c], c.instance_domain)
|
||||
|
||||
Crawl
|
||||
|> join(:left, [c1], c2 in subquery(most_recent_success_crawl_subquery),
|
||||
on: c1.instance_domain == c2.instance_domain
|
||||
)
|
||||
|> where(
|
||||
[c1, c2],
|
||||
c1.instance_domain == ^domain and (c1.inserted_at > c2.timestamp or is_nil(c2.timestamp))
|
||||
)
|
||||
|> select([c1, c2], count(c1.id))
|
||||
|> Repo.one()
|
||||
end
|
||||
end
|
||||
|
|
|
@ -6,18 +6,6 @@ defmodule BackendWeb.InstanceView do
|
|||
def render("show.json", %{instance: instance, crawl: crawl}) do
|
||||
user_threshold = get_config(:personal_instance_threshold)
|
||||
|
||||
[status, last_updated] =
|
||||
case crawl do
|
||||
nil ->
|
||||
["not crawled", nil]
|
||||
|
||||
_ ->
|
||||
case crawl.error do
|
||||
nil -> ["success", crawl.inserted_at]
|
||||
err -> [err, crawl.inserted_at]
|
||||
end
|
||||
end
|
||||
|
||||
cond do
|
||||
instance.user_count < user_threshold and not instance.opt_in ->
|
||||
%{
|
||||
|
@ -25,7 +13,21 @@ defmodule BackendWeb.InstanceView do
|
|||
status: "personal instance"
|
||||
}
|
||||
|
||||
instance.crawl_error == "robots.txt" ->
|
||||
%{
|
||||
name: instance.domain,
|
||||
status: instance.crawl_error
|
||||
}
|
||||
|
||||
instance.crawl_error != nil and instance.type == nil ->
|
||||
%{
|
||||
name: instance.domain,
|
||||
status: instance.crawl_error
|
||||
}
|
||||
|
||||
true ->
|
||||
last_updated = max_datetime(crawl.inserted_at, instance.updated_at)
|
||||
|
||||
filtered_peers =
|
||||
instance.peers
|
||||
|> Enum.filter(fn peer -> not peer.opt_out end)
|
||||
|
@ -48,7 +50,7 @@ defmodule BackendWeb.InstanceView do
|
|||
domainCount: length(instance.peers),
|
||||
peers: render_many(filtered_peers, InstanceView, "instance.json"),
|
||||
lastUpdated: last_updated,
|
||||
status: status,
|
||||
status: "success",
|
||||
type: instance.type,
|
||||
statusesPerDay: instance.statuses_per_day,
|
||||
statusesPerUserPerDay: statuses_per_user_per_day
|
||||
|
|
|
@ -13,7 +13,6 @@ defmodule Backend.Repo.Migrations.CreateCrawls do
|
|||
timestamps()
|
||||
end
|
||||
|
||||
# TODO: does this actually make WHERE error IS NULL queries faster? if not, drop it
|
||||
create index(:crawls, [:error])
|
||||
create index(:crawls, [:inserted_at])
|
||||
end
|
||||
|
|
|
@ -0,0 +1,16 @@
|
|||
defmodule Backend.Repo.Migrations.RemoveCrawlError do
|
||||
use Ecto.Migration
|
||||
|
||||
def change do
|
||||
execute("DELETE FROM crawls WHERE error IS NOT NULL", "")
|
||||
|
||||
alter table(:crawls) do
|
||||
remove :error, :string
|
||||
end
|
||||
|
||||
alter table(:instances) do
|
||||
add :crawl_error, :string
|
||||
add :crawl_error_count, :integer, default: 0, null: false
|
||||
end
|
||||
end
|
||||
end
|
Loading…
Reference in a new issue