save crawl error details

Tao Bror Bojlén 2019-07-03 16:36:53 +01:00
parent 4cc24e0889
commit 27dc0c5260
No known key found for this signature in database
GPG Key ID: C6EC7AAB905F9E6F
6 changed files with 255 additions and 214 deletions


@@ -2,18 +2,17 @@ defmodule Backend.Crawler.ApiCrawler do
@moduledoc """
This module is a specification. Crawlers for all instance types must implement its behaviour.
TODO:
Make sure to respect the following configuration values:
* `:status_age_limit_days` specifies that you must only crawl statuses from the most recent N days
* `:status_count_limit` specifies the max number of statuses to crawl in one go
* `:personal_instance_threshold` specifies that instances with fewer than this number of users should not be crawled
* also, profiles with the string "nobot" (case insensitive) in their profile must not be included in any stats.
Make sure to respect the following:
* You must adhere to the following configuration values:
* `:status_age_limit_days` specifies that you must only crawl statuses from the most recent N days
* `:status_count_limit` specifies the max number of statuses to crawl in one go
* `:personal_instance_threshold` specifies that instances with fewer than this number of users should not be crawled
* profiles with the string "nobot" (case insensitive) in their profile must not be included in any stats
* Make sure to check the most recent crawl of the instance so you don't re-crawl old statuses
"""
alias __MODULE__
# {domain_mentioned, datetime}
@type instance_interaction :: {String.t(), DateTime}
# {domain_mentioned, timestamp}
@type instance_interaction :: {String.t(), NaiveDateTime}
defstruct [
:version,
@@ -24,7 +23,7 @@ defmodule Backend.Crawler.ApiCrawler do
:interactions
]
@type crawl_result :: %ApiCrawler{
@type t() :: %__MODULE__{
version: String.t(),
description: String.t(),
user_count: Number,
@@ -40,5 +39,5 @@ defmodule Backend.Crawler.ApiCrawler do
@doc """
Crawl the instance at the given domain.
"""
@callback crawl(String.t()) :: crawl_result()
@callback crawl(String.t()) :: t()
end
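The configuration values named in the moduledoc are read through Backend.Crawler.Util.get_config/1 (see the Util diff below), which looks them up under the :crawler key of the :backend application. A minimal sketch of what that configuration could look like — the keys come from the moduledoc, but the values here are invented for illustration:

# config/config.exs — hypothetical values, not part of this commit
config :backend, :crawler,
  status_age_limit_days: 28,
  status_count_limit: 5_000,
  personal_instance_threshold: 10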


@@ -5,25 +5,33 @@ defmodule Backend.Crawler.Crawler do
alias __MODULE__
alias Backend.Crawler.Crawlers.Mastodon
alias Backend.Crawler.ApiCrawler
alias Backend.{Repo, Instance, Interaction, InstancePeer}
import Ecto.Query
require Logger
defstruct [
# the instance domain (a string)
:instance,
:domain,
# a list of ApiCrawlers that will be attempted
:api_crawlers,
:found_api?,
# an instance of the ApiCrawler struct
:result
:result,
:error
]
@spec run(String.t()) :: any
@type t() :: %__MODULE__{
domain: String.t(),
api_crawlers: [ApiCrawler.t()],
found_api?: boolean,
result: ApiCrawler.t() | nil,
error: String.t() | nil
}
def run(domain) do
Logger.info("Crawling #{domain}...")
HTTPoison.start()
state = %Crawler{instance: domain, api_crawlers: [], found_api?: false, result: nil}
state = %Crawler{domain: domain, api_crawlers: [], found_api?: false, result: nil, error: nil}
state
# register APICrawlers here
@@ -38,135 +46,154 @@ defmodule Backend.Crawler.Crawler do
Map.put(state, :api_crawlers, [api_crawler | crawlers])
end
# Recursive function to check whether `instance` has an API that the head of the api_crawlers list can read.
# Recursive function to check whether `domain` has an API that the head of the api_crawlers list can read.
# If so, crawls it. If not, continues with the tail of the api_crawlers list.
defp crawl(%Crawler{api_crawlers: []} = state, instance) do
Logger.info("Found no compatible API for #{instance}")
defp crawl(%Crawler{api_crawlers: []} = state, domain) do
Logger.info("Found no compatible API for #{domain}")
Map.put(state, :found_api?, false)
end
defp crawl(state, instance) do
crawlers = Map.get(state, :api_crawlers, [])
curr = hd(crawlers)
defp crawl(state, domain) do
curr =
state
|> Map.get(:api_crawlers, [])
|> Kernel.hd()
if curr.is_instance_type?(instance) do
if curr.is_instance_type?(domain) do
Logger.debug("Found #{curr} instance")
state = Map.put(state, :found_api?, true)
Map.put(state, :found_api?, true)
|> Map.put(:result, curr.crawl(instance))
try do
Map.put(state, :result, curr.crawl(domain))
rescue
e in HTTPoison.Error ->
Map.put(state, :error, "HTTPoison error: " <> HTTPoison.Error.message(e))
e in Jason.DecodeError ->
Map.put(state, :error, "Jason DecodeError: " <> Jason.DecodeError.message(e))
e in _ ->
Map.put(state, :error, "Unknown error: " <> inspect(e))
end
else
# Nothing found so check the next APICrawler
Logger.debug("#{instance} is not an instance of #{curr}")
crawl(%Crawler{state | api_crawlers: tl(crawlers)}, instance)
Logger.debug("#{domain} is not an instance of #{curr}")
crawl(
%Crawler{state | api_crawlers: state |> Map.get(:api_crawlers, []) |> Kernel.tl()},
domain
)
end
end
# Save the state (after crawling) to the database.
defp save(%Crawler{instance: domain, result: result} = state) do
if state.found_api? do
now = NaiveDateTime.truncate(NaiveDateTime.utc_now(), :second)
peers_domains = result.peers
defp save(%Crawler{domain: domain, result: result, found_api?: true, error: nil}) do
now = NaiveDateTime.truncate(NaiveDateTime.utc_now(), :second)
peers_domains = result.peers
# We also get a list of the instances mentioned, in case the server doesn't publish a list of peers.
peers_from_mentions =
result.interactions
|> Enum.map(fn i -> Kernel.elem(i, 0) end)
# We also get a list of the instances mentioned, in case the server doesn't publish a list of peers.
peers_from_mentions =
result.interactions
|> Enum.map(fn i -> Kernel.elem(i, 0) end)
## Save the instance we crawled ##
Repo.insert!(
%Instance{
domain: domain,
## Save the instance we crawled ##
Repo.insert!(
%Instance{
domain: domain,
description: result.description,
version: result.version,
user_count: result.user_count,
status_count: result.status_count,
last_crawl_timestamp: now
},
on_conflict: [
set: [
description: result.description,
version: result.version,
user_count: result.user_count,
status_count: result.status_count,
last_crawl_timestamp: now
},
on_conflict: [
set: [
description: result.description,
version: result.version,
user_count: result.user_count,
status_count: result.status_count,
last_crawl_timestamp: now,
updated_at: now
]
],
conflict_target: :domain
)
last_crawl_timestamp: now,
updated_at: now,
error: nil
]
],
conflict_target: :domain
)
# If we discovered new instances from the peers endpoint or from mentions, add them
peers =
peers_domains
|> MapSet.new()
|> (fn set -> MapSet.union(set, MapSet.new(peers_from_mentions)) end).()
|> MapSet.to_list()
|> Enum.map(&%{domain: &1, inserted_at: now, updated_at: now})
# If we discovered new instances from the peers endpoint or from mentions, add them
peers =
peers_domains
|> MapSet.new()
|> (fn set -> MapSet.union(set, MapSet.new(peers_from_mentions)) end).()
|> MapSet.to_list()
|> Enum.map(&%{domain: &1, inserted_at: now, updated_at: now})
Instance
|> Repo.insert_all(peers, on_conflict: :nothing, conflict_target: :domain)
## Save peer relationships ##
Repo.transaction(fn ->
# get current peers (a list of strings)
current_peers =
InstancePeer
|> where(source_domain: ^domain)
|> select([p], p.target_domain)
|> Repo.all()
wanted_peers_set = MapSet.new(peers_domains)
current_peers_set = MapSet.new(current_peers)
# delete the peers we don't want
dont_want = current_peers_set |> MapSet.difference(wanted_peers_set) |> MapSet.to_list()
Instance
|> Repo.insert_all(peers, on_conflict: :nothing, conflict_target: :domain)
## Save peer relationships ##
Repo.transaction(fn ->
# get current peers (a list of strings)
current_peers =
InstancePeer
|> where(source_domain: ^domain)
|> where([p], p.target_domain in ^dont_want)
|> Repo.delete_all([])
|> select([p], p.target_domain)
|> Repo.all()
# insert the ones we don't have yet
new_instance_peers =
wanted_peers_set
|> MapSet.difference(current_peers_set)
|> MapSet.to_list()
|> Enum.map(
&%{
source_domain: domain,
target_domain: &1,
inserted_at: now,
updated_at: now
}
)
wanted_peers_set = MapSet.new(peers_domains)
current_peers_set = MapSet.new(current_peers)
InstancePeer
|> Repo.insert_all(new_instance_peers)
end)
# delete the peers we don't want
dont_want = current_peers_set |> MapSet.difference(wanted_peers_set) |> MapSet.to_list()
## Save interactions ##
interactions =
result.interactions
|> Enum.map(fn {target_domain, timestamp} ->
%{
InstancePeer
|> where(source_domain: ^domain)
|> where([p], p.target_domain in ^dont_want)
|> Repo.delete_all([])
# insert the ones we don't have yet
new_instance_peers =
wanted_peers_set
|> MapSet.difference(current_peers_set)
|> MapSet.to_list()
|> Enum.map(
&%{
source_domain: domain,
target_domain: target_domain,
timestamp: NaiveDateTime.truncate(timestamp, :second),
target_domain: &1,
inserted_at: now,
updated_at: now
}
end)
)
Interaction
|> Repo.insert_all(interactions)
else
Repo.insert!(
%Instance{
domain: domain
},
on_conflict: [set: [domain: :domain]],
conflict_target: :domain
)
end
InstancePeer
|> Repo.insert_all(new_instance_peers)
end)
## Save interactions ##
interactions =
result.interactions
|> Enum.map(fn {target_domain, timestamp} ->
%{
source_domain: domain,
target_domain: target_domain,
timestamp: NaiveDateTime.truncate(timestamp, :second),
inserted_at: now,
updated_at: now
}
end)
Interaction
|> Repo.insert_all(interactions)
end
defp save(%{domain: domain, error: error}) do
Repo.insert!(
%Instance{
domain: domain,
error: error
},
on_conflict: [set: [error: error]],
conflict_target: :domain
)
end
end
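The reworked save/1 relies on multi-clause pattern matching: the first clause matches only a crawl that found an API and captured no error, and persists the full result; anything else falls through to the second clause, which upserts just the domain and the error string. A self-contained sketch of that dispatch shape, with hypothetical names:

defmodule SaveDispatchSketch do
  # Matches only a clean result: an API was found and no exception was rescued.
  def save(%{found_api?: true, error: nil, domain: domain}), do: {:full_save, domain}

  # Fallback: no compatible API, or the crawl raised and the error was captured.
  def save(%{domain: domain, error: error}), do: {:error_save, domain, error}
end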


@@ -34,7 +34,7 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
end
end
@spec crawl_large_instance(String.t(), any()) :: ApiCrawler.crawl_result()
@spec crawl_large_instance(String.t(), any()) :: ApiCrawler.t()
defp crawl_large_instance(domain, instance) do
# servers may not publish peers
peers =
@@ -58,14 +58,87 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
)
end
defp is_http_200?(response) do
case response do
%{status_code: 200} -> true
_ -> false
@spec get_mentions(
String.t(),
String.t() | nil,
Calendar.naive_datetime() | nil,
[ApiCrawler.instance_interaction()],
integer
) :: {[ApiCrawler.instance_interaction()], integer}
defp get_mentions(
domain,
max_id \\ nil,
min_timestamp \\ nil,
mentions \\ [],
statuses_seen \\ 0
) do
# If `statuses_seen == 0`, it's the first call of this function, which means we want to query the database for the
# most recent status we have.
min_timestamp =
if statuses_seen == 0 do
Instance
|> select([:last_crawl_timestamp])
|> Repo.get_by(domain: domain)
|> (fn result ->
if result == nil, do: nil, else: Map.get(result, :last_crawl_timestamp)
end).()
else
min_timestamp
end
endpoint = "https://#{domain}/api/v1/timelines/public?local=true"
endpoint =
if max_id do
endpoint <> "&max_id=#{max_id}"
else
endpoint
end
Logger.debug("Crawling #{endpoint}")
# TODO: handle errors here
statuses =
endpoint
|> HTTPoison.get!([], timeout: 10_000, recv_timeout: 10_000)
|> Map.get(:body)
|> Jason.decode!()
filtered_statuses =
statuses
|> Enum.filter(fn s -> is_after?(s["created_at"], min_timestamp) end)
if length(filtered_statuses) > 0 do
# get statuses that are eligible (i.e. users don't have #nobot in their profile) and have mentions
mentions = mentions ++ statuses_to_mentions(filtered_statuses)
statuses_seen = statuses_seen + length(filtered_statuses)
status_datetime_threshold =
NaiveDateTime.utc_now()
|> NaiveDateTime.add(get_config(:status_age_limit_days) * 24 * 3600 * -1, :second)
oldest_status = Enum.at(filtered_statuses, -1)
oldest_status_datetime =
oldest_status
|> (fn s -> s["created_at"] end).()
|> NaiveDateTime.from_iso8601!()
if NaiveDateTime.compare(oldest_status_datetime, status_datetime_threshold) == :gt and
statuses_seen < get_config(:status_count_limit) and
length(filtered_statuses) == length(statuses) do
get_mentions(domain, oldest_status["id"], min_timestamp, mentions, statuses_seen)
else
{mentions, statuses_seen}
end
else
{mentions, statuses_seen}
end
end
# check if the endpoint works as expected
@spec has_title?(String.t()) :: boolean
defp has_title?(body) do
case Jason.decode(body) do
{:ok, decoded} -> Map.has_key?(decoded, "title")
@@ -115,97 +188,4 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
|> Enum.filter(fn status -> is_eligible?(status) end)
|> Enum.flat_map(fn status -> extract_mentions_from_status(status) end)
end
@spec is_after?(String.t(), NaiveDateTime.t() | nil) :: boolean()
defp is_after?(timestamp, threshold) do
if threshold == nil do
true
else
timestamp
|> NaiveDateTime.from_iso8601!()
# :second is the granularity used in the database
|> NaiveDateTime.truncate(:second)
|> NaiveDateTime.compare(threshold)
|> Kernel.===(:gt)
end
end
@spec get_mentions(
String.t(),
String.t() | nil,
Calendar.naive_datetime() | nil,
[ApiCrawler.instance_interaction()],
integer
) :: {[ApiCrawler.instance_interaction()], integer}
defp get_mentions(
domain,
max_id \\ nil,
min_timestamp \\ nil,
mentions \\ [],
statuses_seen \\ 0
) do
# If `statuses_seen == 0`, it's the first call of this function, which means we want to query the database for the
# most recent status we have.
min_timestamp =
if statuses_seen == 0 do
Instance
|> select([:last_crawl_timestamp])
|> Repo.get_by(domain: domain)
|> (fn result ->
if result == nil, do: nil, else: Map.get(result, :last_crawl_timestamp)
end).()
else
min_timestamp
end
endpoint = "https://#{domain}/api/v1/timelines/public?local=true"
endpoint =
if max_id do
endpoint <> "&max_id=#{max_id}"
else
endpoint
end
Logger.debug("Crawling #{endpoint}")
# TODO: handle errors here
statuses =
endpoint
|> HTTPoison.get!()
|> Map.get(:body)
|> Jason.decode!()
filtered_statuses =
statuses
|> Enum.filter(fn s -> is_after?(s["created_at"], min_timestamp) end)
if length(filtered_statuses) > 0 do
# get statuses that are eligible (i.e. users don't have #nobot in their profile) and have mentions
mentions = mentions ++ statuses_to_mentions(filtered_statuses)
statuses_seen = statuses_seen + length(filtered_statuses)
status_datetime_threshold =
NaiveDateTime.utc_now()
|> NaiveDateTime.add(get_config(:status_age_limit_days) * 24 * 3600 * -1, :second)
oldest_status = Enum.at(filtered_statuses, -1)
oldest_status_datetime =
oldest_status
|> (fn s -> s["created_at"] end).()
|> NaiveDateTime.from_iso8601!()
if NaiveDateTime.compare(oldest_status_datetime, status_datetime_threshold) == :gt and
statuses_seen < get_config(:status_count_limit) and
length(filtered_statuses) == length(statuses) do
get_mentions(domain, oldest_status["id"], min_timestamp, mentions, statuses_seen)
else
{mentions, statuses_seen}
end
else
{mentions, statuses_seen}
end
end
end
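get_mentions/5 above keeps paginating backwards through the local timeline only while three conditions hold: the oldest status on the page is newer than the configured age threshold, the number of statuses seen is still under :status_count_limit, and no status on the page was filtered out (a filtered status means the previous crawl's timestamp has been reached). A condensed restatement of that guard, as a hypothetical helper:

defmodule PaginationSketch do
  # True while it is worth fetching another (older) page of statuses.
  def continue?(oldest_datetime, threshold, statuses_seen, count_limit, filtered, fetched) do
    NaiveDateTime.compare(oldest_datetime, threshold) == :gt and
      statuses_seen < count_limit and
      length(filtered) == length(fetched)
  end
end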


@@ -1,13 +1,38 @@
defmodule Backend.Crawler.Util do
@spec get_config(atom) :: any
def get_config(key) do
Application.get_env(:backend, :crawler)[key]
end
# Gets the domain from a Mastodon/Pleroma account URL
# (e.g. https://mastodon.social/@demouser or https://pleroma.site/users/demouser)
@spec get_domain(String.t()) :: String.t()
def get_domain(url) do
String.slice(url, 8..-1)
|> String.split("/")
|> Enum.at(0)
end
@spec is_http_200?(HTTPoison.Response.t()) :: boolean
def is_http_200?(%{status_code: 200}) do
true
end
def is_http_200?(_) do
false
end
@spec is_after?(String.t(), NaiveDateTime.t() | nil) :: boolean()
def is_after?(timestamp, threshold) do
if threshold == nil do
true
else
timestamp
|> NaiveDateTime.from_iso8601!()
# :second is the granularity used in the database
|> NaiveDateTime.truncate(:second)
|> NaiveDateTime.compare(threshold)
|> Kernel.===(:gt)
end
end
end
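Note that get_domain/1 assumes an https:// URL: String.slice(url, 8..-1) drops exactly the eight characters of the "https://" prefix before splitting on "/". For example:

iex> Backend.Crawler.Util.get_domain("https://mastodon.social/@demouser")
"mastodon.social"
iex> Backend.Crawler.Util.get_domain("https://pleroma.site/users/demouser")
"pleroma.site"

An http:// URL would come out mangled, which seems acceptable here since the crawler itself always builds https:// endpoints.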


@@ -8,6 +8,7 @@ defmodule Backend.Instance do
field :status_count, :integer
field :user_count, :integer
field :version, :string
field :error, :string
# this is distinct from `updated_at` -- it indicates when the last *successful* crawl was. `updated_at` also
# gets updated if the crawl fails.
@@ -37,7 +38,15 @@ defmodule Backend.Instance do
@doc false
def changeset(instance, attrs) do
instance
|> cast(attrs, [:domain, :user_count, :status_count, :description, :version, :updated_at])
|> cast(attrs, [
:domain,
:user_count,
:status_count,
:description,
:version,
:updated_at,
:last_crawl_timestamp
])
|> validate_required([:domain])
|> put_assoc(:peers, attrs.peers)
end
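Since :error is not in the cast list, it can only be written through the crawler's direct Repo.insert! upserts shown earlier, not through this changeset. For illustration, a hypothetical call exercising the newly castable :last_crawl_timestamp:

%Backend.Instance{}
|> Backend.Instance.changeset(%{
  domain: "example.social",
  last_crawl_timestamp: ~N[2019-07-03 15:36:53],
  peers: []
})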


@@ -8,6 +8,7 @@ defmodule Backend.Repo.Migrations.CreateInstances do
add :status_count, :integer
add :description, :text
add :version, :string
add :error, :text
add :last_crawl_timestamp, :naive_datetime
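One caveat: adding the :error column to the original CreateInstances migration only takes effect on databases migrated from scratch. A deployment that has already run this migration would need a follow-up migration instead — hypothetically something like:

defmodule Backend.Repo.Migrations.AddErrorToInstances do
  use Ecto.Migration

  def change do
    alter table(:instances) do
      add :error, :text
    end
  end
end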