Only save successful crawls in the database

This commit is contained in:
Tao Bror Bojlén 2019-08-07 22:41:19 +03:00
parent d9b9081ec3
commit 30c5bbe2b9
No known key found for this signature in database
GPG key ID: C6EC7AAB905F9E6F
10 changed files with 65 additions and 90 deletions

View file

@ -11,16 +11,13 @@ defmodule Backend.Crawl do
field :interactions_seen, :integer
field :statuses_seen, :integer
# if something went wrong, otherwise null
field :error, :string
timestamps()
end
@doc false
def changeset(crawl, attrs) do
crawl
|> cast(attrs, [:instance, :statuses_seen, :interactions_seen, :error])
|> cast(attrs, [:instance, :statuses_seen, :interactions_seen])
|> validate_required([:instance])
end
end

View file

@ -120,7 +120,9 @@ defmodule Backend.Crawler do
status_count: result.status_count,
type: instance_type,
base_domain: get_base_domain(domain),
next_crawl: NaiveDateTime.add(now, get_config(:crawl_interval_mins) * 60, :second)
next_crawl: NaiveDateTime.add(now, get_config(:crawl_interval_mins) * 60, :second),
crawl_error: nil,
crawl_error_count: 0
}
Repo.insert!(
@ -135,7 +137,9 @@ defmodule Backend.Crawler do
:type,
:base_domain,
:updated_at,
:next_crawl
:next_crawl,
:crawl_error,
:crawl_error_count
]},
conflict_target: :domain
)
@ -240,10 +244,15 @@ defmodule Backend.Crawler do
end
# The "+1" is this error!
error_count = get_recent_crawl_error_count(domain) + 1
# The crawl interval grows exponentially at first but never goes above 24 hours
error_count =
Instance
|> Repo.get_by!(domain: domain)
|> Map.get(:crawl_error_count)
|> Kernel.+(1)
# The crawl interval grows exponentially at first but never goes above 72 hours
crawl_interval_mins =
min(get_config(:crawl_interval_mins) * round(:math.pow(2, error_count)), 1440)
min(get_config(:crawl_interval_mins) * round(:math.pow(2, error_count)), 4320)
next_crawl = NaiveDateTime.add(now, crawl_interval_mins * 60, :second)
@ -252,16 +261,13 @@ defmodule Backend.Crawler do
%Instance{
domain: domain,
base_domain: get_base_domain(domain),
crawl_error: error,
crawl_error_count: error_count,
next_crawl: next_crawl
},
on_conflict: {:replace, [:next_crawl]},
on_conflict: {:replace, [:base_domain, :crawl_error, :crawl_error_count, :next_crawl]},
conflict_target: :domain
)
Repo.insert!(%Crawl{
instance_domain: domain,
error: error
})
end)
Appsignal.increment_counter("crawler.failure", 1)

View file

@ -108,7 +108,7 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
# most recent status we have.
min_timestamp =
if statuses_seen == 0 do
get_last_successful_crawl_timestamp(domain)
get_last_crawl_timestamp(domain)
else
min_timestamp
end

View file

@ -56,8 +56,7 @@ defmodule Backend.Crawler.Crawlers.Misskey do
|> NaiveDateTime.add(get_config(:status_age_limit_days) * 24 * 3600 * -1, :second)
# Don't get any statuses older than this
min_timestamp =
max_datetime(get_last_successful_crawl_timestamp(domain), status_datetime_threshold)
min_timestamp = max_datetime(get_last_crawl_timestamp(domain), status_datetime_threshold)
{interactions, statuses_seen} = get_interactions(domain, min_timestamp)
{:ok, {version, description}} = get_version_and_description(domain)

View file

@ -15,6 +15,8 @@ defmodule Backend.Instance do
field :opt_in, :boolean
field :opt_out, :boolean
field :next_crawl, :naive_datetime
field :crawl_error, :string
field :crawl_error_count, :integer
many_to_many :peers, Backend.Instance,
join_through: Backend.InstancePeer,
@ -45,7 +47,9 @@ defmodule Backend.Instance do
:base_domain,
:opt_in,
:opt_out,
:next_crawl
:next_crawl,
:crawl_error,
:crawl_error_count
])
|> validate_required([:domain])
|> put_assoc(:peers, attrs.peers)

View file

@ -43,7 +43,6 @@ defmodule Backend.Scheduler do
instance_domain: c.instance_domain,
interactions_seen: sum(c.interactions_seen)
})
|> where([c], is_nil(c.error))
|> group_by([c], c.instance_domain)
scores =
@ -100,7 +99,7 @@ defmodule Backend.Scheduler do
)
|> where(
[c, c2],
c.inserted_at > c2.earliest_crawl and not is_nil(c.statuses_seen) and is_nil(c.error)
c.inserted_at > c2.earliest_crawl and not is_nil(c.statuses_seen)
)
|> select([c], %{
instance_domain: c.instance_domain,
@ -148,7 +147,6 @@ defmodule Backend.Scheduler do
instance_domain: c.instance_domain,
statuses_seen: sum(c.statuses_seen)
})
|> where([c], is_nil(c.error))
|> group_by([c], c.instance_domain)
interactions =

View file

@ -78,38 +78,16 @@ defmodule Backend.Util do
@spec get_last_crawl(String.t()) :: Crawl.t() | nil
def get_last_crawl(domain) do
crawls =
Crawl
|> select([c], c)
|> where([c], c.instance_domain == ^domain)
|> order_by(desc: :id)
|> limit(1)
|> Repo.all()
case length(crawls) do
1 -> hd(crawls)
0 -> nil
end
|> Repo.one()
end
@spec get_last_successful_crawl(String.t()) :: Crawl.t() | nil
def get_last_successful_crawl(domain) do
crawls =
Crawl
|> select([c], c)
|> where([c], is_nil(c.error) and c.instance_domain == ^domain)
|> order_by(desc: :id)
|> limit(1)
|> Repo.all()
case length(crawls) do
1 -> hd(crawls)
0 -> nil
end
end
@spec get_last_successful_crawl_timestamp(String.t()) :: NaiveDateTime.t() | nil
def get_last_successful_crawl_timestamp(domain) do
@spec get_last_crawl_timestamp(String.t()) :: NaiveDateTime.t() | nil
def get_last_crawl_timestamp(domain) do
crawl = get_last_crawl(domain)
case crawl do
@ -167,28 +145,4 @@ defmodule Backend.Util do
def convert_keys_to_atoms(map) do
map |> Map.new(fn {k, v} -> {String.to_atom(k), v} end)
end
# Given a domain, returns the number of n most recent crawls that errored
@spec get_recent_crawl_error_count(String.t()) :: integer
def get_recent_crawl_error_count(domain) do
most_recent_success_crawl_subquery =
Crawl
|> select([c], %{
instance_domain: c.instance_domain,
timestamp: max(c.inserted_at)
})
|> where([c], c.instance_domain == ^domain and is_nil(c.error))
|> group_by([c], c.instance_domain)
Crawl
|> join(:left, [c1], c2 in subquery(most_recent_success_crawl_subquery),
on: c1.instance_domain == c2.instance_domain
)
|> where(
[c1, c2],
c1.instance_domain == ^domain and (c1.inserted_at > c2.timestamp or is_nil(c2.timestamp))
)
|> select([c1, c2], count(c1.id))
|> Repo.one()
end
end

View file

@ -6,18 +6,6 @@ defmodule BackendWeb.InstanceView do
def render("show.json", %{instance: instance, crawl: crawl}) do
user_threshold = get_config(:personal_instance_threshold)
[status, last_updated] =
case crawl do
nil ->
["not crawled", nil]
_ ->
case crawl.error do
nil -> ["success", crawl.inserted_at]
err -> [err, crawl.inserted_at]
end
end
cond do
instance.user_count < user_threshold and not instance.opt_in ->
%{
@ -25,7 +13,21 @@ defmodule BackendWeb.InstanceView do
status: "personal instance"
}
instance.crawl_error == "robots.txt" ->
%{
name: instance.domain,
status: instance.crawl_error
}
instance.crawl_error != nil and instance.type == nil ->
%{
name: instance.domain,
status: instance.crawl_error
}
true ->
last_updated = max_datetime(crawl.inserted_at, instance.updated_at)
filtered_peers =
instance.peers
|> Enum.filter(fn peer -> not peer.opt_out end)
@ -48,7 +50,7 @@ defmodule BackendWeb.InstanceView do
domainCount: length(instance.peers),
peers: render_many(filtered_peers, InstanceView, "instance.json"),
lastUpdated: last_updated,
status: status,
status: "success",
type: instance.type,
statusesPerDay: instance.statuses_per_day,
statusesPerUserPerDay: statuses_per_user_per_day

View file

@ -13,7 +13,6 @@ defmodule Backend.Repo.Migrations.CreateCrawls do
timestamps()
end
# TODO: does this actually make WHERE error IS NULL queries faster? if not, drop it
create index(:crawls, [:error])
create index(:crawls, [:inserted_at])
end

View file

@ -0,0 +1,16 @@
# Moves crawl-error tracking off the `crawls` table and onto `instances`:
# failed crawls are no longer stored as rows; instead each instance keeps its
# most recent error and a failure count (used elsewhere in this commit to
# compute the exponential-backoff crawl interval).
defmodule Backend.Repo.Migrations.RemoveCrawlError do
use Ecto.Migration
def change do
# Purge the failed-crawl rows before dropping the column that marked them.
# The second argument of execute/2 is the "down" SQL — "" makes rolling back
# this step a no-op (the deleted rows cannot be restored anyway).
execute("DELETE FROM crawls WHERE error IS NOT NULL", "")
# `crawls` now holds only successful crawls, so the error column is dead.
# The type (:string) is supplied so the remove is reversible on rollback.
alter table(:crawls) do
remove :error, :string
end
# New per-instance error state: the last error message (nil on success) and
# a running count of consecutive failed crawls, defaulting to 0 for all
# existing rows and never NULL.
alter table(:instances) do
add :crawl_error, :string
add :crawl_error_count, :integer, default: 0, null: false
end
end
end