From 30c5bbe2b9584ce32013995da8a0e0b507a18495 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tao=20Bror=20Bojl=C3=A9n?= Date: Wed, 7 Aug 2019 22:41:19 +0300 Subject: [PATCH] only save successful crawls in db --- backend/lib/backend/crawl.ex | 5 +- backend/lib/backend/crawler/crawler.ex | 28 +++++---- .../lib/backend/crawler/crawlers/mastodon.ex | 2 +- .../lib/backend/crawler/crawlers/misskey.ex | 3 +- backend/lib/backend/instance.ex | 6 +- backend/lib/backend/scheduler.ex | 4 +- backend/lib/backend/util.ex | 62 +++---------------- .../lib/backend_web/views/instance_view.ex | 28 +++++---- .../20190710155001_create_crawls.exs | 1 - .../20190807182941_remove_crawl_error.exs | 16 +++++ 10 files changed, 65 insertions(+), 90 deletions(-) create mode 100644 backend/priv/repo/migrations/20190807182941_remove_crawl_error.exs diff --git a/backend/lib/backend/crawl.ex b/backend/lib/backend/crawl.ex index bedd4af..63ecd26 100644 --- a/backend/lib/backend/crawl.ex +++ b/backend/lib/backend/crawl.ex @@ -11,16 +11,13 @@ defmodule Backend.Crawl do field :interactions_seen, :integer field :statuses_seen, :integer - # if something went wrong, otherwise null - field :error, :string - timestamps() end @doc false def changeset(crawl, attrs) do crawl - |> cast(attrs, [:instance, :statuses_seen, :interactions_seen, :error]) + |> cast(attrs, [:instance, :statuses_seen, :interactions_seen]) |> validate_required([:instance]) end end diff --git a/backend/lib/backend/crawler/crawler.ex b/backend/lib/backend/crawler/crawler.ex index 0d59a89..31c22da 100644 --- a/backend/lib/backend/crawler/crawler.ex +++ b/backend/lib/backend/crawler/crawler.ex @@ -120,7 +120,9 @@ defmodule Backend.Crawler do status_count: result.status_count, type: instance_type, base_domain: get_base_domain(domain), - next_crawl: NaiveDateTime.add(now, get_config(:crawl_interval_mins) * 60, :second) + next_crawl: NaiveDateTime.add(now, get_config(:crawl_interval_mins) * 60, :second), + crawl_error: nil, + crawl_error_count: 0 } Repo.insert!( @@ -135,7 +137,9 @@ defmodule Backend.Crawler do :type, :base_domain, :updated_at, - :next_crawl + :next_crawl, + :crawl_error, + :crawl_error_count ]}, conflict_target: :domain ) @@ -240,10 +244,15 @@ defmodule Backend.Crawler do end # The "+1" is this error! 
- error_count = get_recent_crawl_error_count(domain) + 1 - # The crawl interval grows exponentially at first but never goes above 24 hours + error_count = + Instance + |> Repo.get_by!(domain: domain) + |> Map.get(:crawl_error_count) + |> Kernel.+(1) + + # The crawl interval grows exponentially at first but never goes above 72 hours crawl_interval_mins = - min(get_config(:crawl_interval_mins) * round(:math.pow(2, error_count)), 1440) + min(get_config(:crawl_interval_mins) * round(:math.pow(2, error_count)), 4320) next_crawl = NaiveDateTime.add(now, crawl_interval_mins * 60, :second) @@ -252,16 +261,13 @@ defmodule Backend.Crawler do %Instance{ domain: domain, base_domain: get_base_domain(domain), + crawl_error: error, + crawl_error_count: error_count, next_crawl: next_crawl }, - on_conflict: {:replace, [:next_crawl]}, + on_conflict: {:replace, [:base_domain, :crawl_error, :crawl_error_count, :next_crawl]}, conflict_target: :domain ) - - Repo.insert!(%Crawl{ - instance_domain: domain, - error: error - }) end) Appsignal.increment_counter("crawler.failure", 1) diff --git a/backend/lib/backend/crawler/crawlers/mastodon.ex b/backend/lib/backend/crawler/crawlers/mastodon.ex index 68e887c..e4af52e 100644 --- a/backend/lib/backend/crawler/crawlers/mastodon.ex +++ b/backend/lib/backend/crawler/crawlers/mastodon.ex @@ -108,7 +108,7 @@ defmodule Backend.Crawler.Crawlers.Mastodon do # most recent status we have. min_timestamp = if statuses_seen == 0 do - get_last_successful_crawl_timestamp(domain) + get_last_crawl_timestamp(domain) else min_timestamp end diff --git a/backend/lib/backend/crawler/crawlers/misskey.ex b/backend/lib/backend/crawler/crawlers/misskey.ex index 0e16977..ccca567 100644 --- a/backend/lib/backend/crawler/crawlers/misskey.ex +++ b/backend/lib/backend/crawler/crawlers/misskey.ex @@ -56,8 +56,7 @@ defmodule Backend.Crawler.Crawlers.Misskey do |> NaiveDateTime.add(get_config(:status_age_limit_days) * 24 * 3600 * -1, :second) # Don't get any statuses older than this - min_timestamp = - max_datetime(get_last_successful_crawl_timestamp(domain), status_datetime_threshold) + min_timestamp = max_datetime(get_last_crawl_timestamp(domain), status_datetime_threshold) {interactions, statuses_seen} = get_interactions(domain, min_timestamp) {:ok, {version, description}} = get_version_and_description(domain) diff --git a/backend/lib/backend/instance.ex b/backend/lib/backend/instance.ex index b2093d8..5cdf7df 100644 --- a/backend/lib/backend/instance.ex +++ b/backend/lib/backend/instance.ex @@ -15,6 +15,8 @@ defmodule Backend.Instance do field :opt_in, :boolean field :opt_out, :boolean field :next_crawl, :naive_datetime + field :crawl_error, :string + field :crawl_error_count, :integer many_to_many :peers, Backend.Instance, join_through: Backend.InstancePeer, @@ -45,7 +47,9 @@ defmodule Backend.Instance do :base_domain, :opt_in, :opt_out, - :next_crawl + :next_crawl, + :crawl_error, + :crawl_error_count ]) |> validate_required([:domain]) |> put_assoc(:peers, attrs.peers) diff --git a/backend/lib/backend/scheduler.ex b/backend/lib/backend/scheduler.ex index 916281a..ef58345 100644 --- a/backend/lib/backend/scheduler.ex +++ b/backend/lib/backend/scheduler.ex @@ -43,7 +43,6 @@ defmodule Backend.Scheduler do instance_domain: c.instance_domain, interactions_seen: sum(c.interactions_seen) }) - |> where([c], is_nil(c.error)) |> group_by([c], c.instance_domain) scores = @@ -100,7 +99,7 @@ defmodule Backend.Scheduler do ) |> where( [c, c2], - c.inserted_at > c2.earliest_crawl and not is_nil(c.statuses_seen) 
and is_nil(c.error) + c.inserted_at > c2.earliest_crawl and not is_nil(c.statuses_seen) ) |> select([c], %{ instance_domain: c.instance_domain, @@ -148,7 +147,6 @@ defmodule Backend.Scheduler do instance_domain: c.instance_domain, statuses_seen: sum(c.statuses_seen) }) - |> where([c], is_nil(c.error)) |> group_by([c], c.instance_domain) interactions = diff --git a/backend/lib/backend/util.ex b/backend/lib/backend/util.ex index b8edab5..d98d34e 100644 --- a/backend/lib/backend/util.ex +++ b/backend/lib/backend/util.ex @@ -78,38 +78,16 @@ defmodule Backend.Util do @spec get_last_crawl(String.t()) :: Crawl.t() | nil def get_last_crawl(domain) do - crawls = - Crawl - |> select([c], c) - |> where([c], c.instance_domain == ^domain) - |> order_by(desc: :id) - |> limit(1) - |> Repo.all() - - case length(crawls) do - 1 -> hd(crawls) - 0 -> nil - end + Crawl + |> select([c], c) + |> where([c], c.instance_domain == ^domain) + |> order_by(desc: :id) + |> limit(1) + |> Repo.one() end - @spec get_last_successful_crawl(String.t()) :: Crawl.t() | nil - def get_last_successful_crawl(domain) do - crawls = - Crawl - |> select([c], c) - |> where([c], is_nil(c.error) and c.instance_domain == ^domain) - |> order_by(desc: :id) - |> limit(1) - |> Repo.all() - - case length(crawls) do - 1 -> hd(crawls) - 0 -> nil - end - end - - @spec get_last_successful_crawl_timestamp(String.t()) :: NaiveDateTime.t() | nil - def get_last_successful_crawl_timestamp(domain) do + @spec get_last_crawl_timestamp(String.t()) :: NaiveDateTime.t() | nil + def get_last_crawl_timestamp(domain) do crawl = get_last_crawl(domain) case crawl do @@ -167,28 +145,4 @@ defmodule Backend.Util do def convert_keys_to_atoms(map) do map |> Map.new(fn {k, v} -> {String.to_atom(k), v} end) end - - # Given a domain, returns the number of n most recent crawls that errored - @spec get_recent_crawl_error_count(String.t()) :: integer - def get_recent_crawl_error_count(domain) do - most_recent_success_crawl_subquery = - Crawl - |> select([c], %{ - instance_domain: c.instance_domain, - timestamp: max(c.inserted_at) - }) - |> where([c], c.instance_domain == ^domain and is_nil(c.error)) - |> group_by([c], c.instance_domain) - - Crawl - |> join(:left, [c1], c2 in subquery(most_recent_success_crawl_subquery), - on: c1.instance_domain == c2.instance_domain - ) - |> where( - [c1, c2], - c1.instance_domain == ^domain and (c1.inserted_at > c2.timestamp or is_nil(c2.timestamp)) - ) - |> select([c1, c2], count(c1.id)) - |> Repo.one() - end end diff --git a/backend/lib/backend_web/views/instance_view.ex b/backend/lib/backend_web/views/instance_view.ex index 91895a2..bb8ef8d 100644 --- a/backend/lib/backend_web/views/instance_view.ex +++ b/backend/lib/backend_web/views/instance_view.ex @@ -6,18 +6,6 @@ defmodule BackendWeb.InstanceView do def render("show.json", %{instance: instance, crawl: crawl}) do user_threshold = get_config(:personal_instance_threshold) - [status, last_updated] = - case crawl do - nil -> - ["not crawled", nil] - - _ -> - case crawl.error do - nil -> ["success", crawl.inserted_at] - err -> [err, crawl.inserted_at] - end - end - cond do instance.user_count < user_threshold and not instance.opt_in -> %{ @@ -25,7 +13,21 @@ defmodule BackendWeb.InstanceView do status: "personal instance" } + instance.crawl_error == "robots.txt" -> + %{ + name: instance.domain, + status: instance.crawl_error + } + + instance.crawl_error != nil and instance.type == nil -> + %{ + name: instance.domain, + status: instance.crawl_error + } + true -> + last_updated = 
max_datetime(crawl.inserted_at, instance.updated_at) + filtered_peers = instance.peers |> Enum.filter(fn peer -> not peer.opt_out end) @@ -48,7 +50,7 @@ defmodule BackendWeb.InstanceView do domainCount: length(instance.peers), peers: render_many(filtered_peers, InstanceView, "instance.json"), lastUpdated: last_updated, - status: status, + status: "success", type: instance.type, statusesPerDay: instance.statuses_per_day, statusesPerUserPerDay: statuses_per_user_per_day diff --git a/backend/priv/repo/migrations/20190710155001_create_crawls.exs b/backend/priv/repo/migrations/20190710155001_create_crawls.exs index 79b321e..581108f 100644 --- a/backend/priv/repo/migrations/20190710155001_create_crawls.exs +++ b/backend/priv/repo/migrations/20190710155001_create_crawls.exs @@ -13,7 +13,6 @@ defmodule Backend.Repo.Migrations.CreateCrawls do timestamps() end - # TODO: does this actually make WHERE error IS NULL queries faster? if not, drop it create index(:crawls, [:error]) create index(:crawls, [:inserted_at]) end diff --git a/backend/priv/repo/migrations/20190807182941_remove_crawl_error.exs b/backend/priv/repo/migrations/20190807182941_remove_crawl_error.exs new file mode 100644 index 0000000..162c55d --- /dev/null +++ b/backend/priv/repo/migrations/20190807182941_remove_crawl_error.exs @@ -0,0 +1,16 @@ +defmodule Backend.Repo.Migrations.RemoveCrawlError do + use Ecto.Migration + + def change do + execute("DELETE FROM crawls WHERE error IS NOT NULL", "") + + alter table(:crawls) do + remove :error, :string + end + + alter table(:instances) do + add :crawl_error, :string + add :crawl_error_count, :integer, default: 0, null: false + end + end +end
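
For reference, the retry arithmetic introduced in crawler.ex above works out as follows. This is a minimal, illustrative sketch rather than code from the patch: BackoffSketch and the 30-minute base interval are assumptions standing in for get_config(:crawl_interval_mins), which the real crawler reads from application config.

# Doubles the crawl interval per consecutive error, capped at 72 hours
# (4320 minutes), mirroring the new calculation in Backend.Crawler.
defmodule BackoffSketch do
  @base_interval_mins 30
  @max_interval_mins 4320

  def next_interval_mins(error_count) do
    min(@base_interval_mins * round(:math.pow(2, error_count)), @max_interval_mins)
  end
end

# With a 30-minute base interval: 1 error -> 60 mins, 4 errors -> 480 mins,
# and from 8 errors onward the interval stays capped at 4320 mins (72 h).
Enum.each([1, 4, 8], fn n ->
  IO.puts("#{n} errors -> #{BackoffSketch.next_interval_mins(n)} mins")
end)

Keeping the error state on instances (crawl_error, crawl_error_count) instead of on crawls means a failed crawl is recorded with a single upsert, and the windowed count that get_recent_crawl_error_count used to run on every failure goes away together with the crawls.error column.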