parent 9a0bbbb7d9
commit 3320e050c8
@@ -11,10 +11,12 @@ defmodule Backend.Crawler.ApiCrawler do
   * Make sure to check the most recent crawl of the instance so you don't re-crawl old statuses
   """

+  alias Backend.Crawler.Crawlers.Nodeinfo
+
   # {domain_mentioned, count}
   @type instance_interactions :: %{String.t() => integer}

-  @type instance_type :: :mastodon | :pleroma | :gab | :misskey
+  @type instance_type :: :mastodon | :pleroma | :gab | :misskey | :gnusocial

   defstruct [
     :version,
@@ -30,8 +32,8 @@ defmodule Backend.Crawler.ApiCrawler do
   @type t() :: %__MODULE__{
           version: String.t(),
           description: String.t(),
-          user_count: integer,
-          status_count: integer,
+          user_count: integer | nil,
+          status_count: integer | nil,
           peers: [String.t()],
           interactions: instance_interactions,
           statuses_seen: integer,
@@ -40,8 +42,9 @@ defmodule Backend.Crawler.ApiCrawler do

   @doc """
   Check whether the instance at the given domain is of the type that this ApiCrawler implements.
+  Arguments are the instance domain and the nodeinfo results.
   """
-  @callback is_instance_type?(String.t()) :: boolean()
+  @callback is_instance_type?(String.t(), Nodeinfo.t()) :: boolean()

   @doc """
   Check whether the instance allows crawling according to its robots.txt or otherwise.
@@ -50,6 +53,7 @@ defmodule Backend.Crawler.ApiCrawler do

   @doc """
   Crawl the instance at the given domain.
+  Takes two arguments: the domain to crawl and the existing results (from nodeinfo).
   """
-  @callback crawl(String.t()) :: t()
+  @callback crawl(String.t(), Nodeinfo.t()) :: t()
 end
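
Taken together, the ApiCrawler behaviour now threads the nodeinfo results through both callbacks. A minimal sketch of a crawler implementing the updated contract might look as follows (the module name and the :example instance type are hypothetical, not part of this commit):

    defmodule Backend.Crawler.Crawlers.Example do
      alias Backend.Crawler.ApiCrawler

      @behaviour ApiCrawler

      # Nodeinfo results (if any) arrive as the second argument, so a crawler
      # can short-circuit on the instance type nodeinfo already detected.
      @impl ApiCrawler
      def is_instance_type?(_domain, nodeinfo_result) do
        nodeinfo_result != nil and Map.get(nodeinfo_result, :instance_type) == :example
      end

      @impl ApiCrawler
      def allows_crawling?(_domain), do: true

      # crawl/2 receives the existing results and merges its own data into them.
      @impl ApiCrawler
      def crawl(_domain, nodeinfo_result) do
        Map.merge(nodeinfo_result, %{peers: [], interactions: %{}, statuses_seen: 0})
      end
    end
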
@@ -4,7 +4,7 @@ defmodule Backend.Crawler do
   """

   alias __MODULE__
-  alias Backend.Crawler.Crawlers.{Mastodon, Misskey}
+  alias Backend.Crawler.Crawlers.{GnuSocial, Mastodon, Misskey, Nodeinfo}
   alias Backend.Crawler.ApiCrawler
   alias Backend.{Crawl, CrawlInteraction, Repo, Instance, InstancePeer}
   import Ecto.Query
@@ -16,8 +16,8 @@ defmodule Backend.Crawler do
     :domain,
     # a list of ApiCrawlers that will be attempted
     :api_crawlers,
-    :found_api?,
     :allows_crawling?,
+    :found_api?,
     :result,
     :error
   ]
@@ -25,8 +25,8 @@ defmodule Backend.Crawler do
   @type t() :: %__MODULE__{
           domain: String.t(),
           api_crawlers: [ApiCrawler.t()],
-          found_api?: boolean,
           allows_crawling?: boolean,
+          found_api?: boolean,
           result: ApiCrawler.t() | nil,
           error: String.t() | nil
         }
@@ -37,16 +37,18 @@ defmodule Backend.Crawler do
     state = %Crawler{
       domain: domain,
       api_crawlers: [],
-      found_api?: false,
       allows_crawling?: true,
+      found_api?: false,
       result: nil,
       error: nil
     }

     state
-    # register APICrawlers here
+    # These crawlers are run in the order they're registered. Nodeinfo should be the first one.
+    |> register(Nodeinfo)
     |> register(Mastodon)
     |> register(Misskey)
+    |> register(GnuSocial)
     # go!
     |> crawl()
     |> save()
@@ -56,33 +58,47 @@ defmodule Backend.Crawler do

   # Adds a new ApiCrawler that run/1 will check.
   defp register(%Crawler{api_crawlers: crawlers} = state, api_crawler) do
-    Map.put(state, :api_crawlers, [api_crawler | crawlers])
+    Map.put(state, :api_crawlers, crawlers ++ [api_crawler])
   end

   # Recursive function to check whether `domain` has an API that the head of the api_crawlers list can read.
   # If so, crawls it. If not, continues with the tail of the api_crawlers list.
   defp crawl(%Crawler{api_crawlers: [], domain: domain} = state) do
     Logger.debug("Found no compatible API for #{domain}")
-    Map.put(state, :found_api?, false)
+    state
   end

-  defp crawl(%Crawler{domain: domain, api_crawlers: [curr | remaining_crawlers]} = state) do
-    if curr.is_instance_type?(domain) do
+  # Nodeinfo is distinct from other crawlers in that
+  # a) it should always be run first
+  # b) it passes the results on to the next crawlers (e.g. user_count)
+  defp crawl(%Crawler{api_crawlers: [Nodeinfo | remaining_crawlers], domain: domain} = state) do
+    with true <- Nodeinfo.allows_crawling?(domain), {:ok, nodeinfo} <- Nodeinfo.crawl(domain) do
+      Logger.debug("Found nodeinfo for #{domain}.")
+      result = Map.merge(nodeinfo, %{peers: [], interactions: %{}, statuses_seen: 0})
+      crawl(%Crawler{state | result: result, found_api?: true, api_crawlers: remaining_crawlers})
+    else
+      _ ->
+        Logger.debug("Did not find nodeinfo for #{domain}.")
+        crawl(%Crawler{state | api_crawlers: remaining_crawlers})
+    end
+  end
+
+  defp crawl(
+         %Crawler{domain: domain, result: result, api_crawlers: [curr | remaining_crawlers]} =
+           state
+       ) do
+    if curr.is_instance_type?(domain, result) do
       Logger.debug("Found #{curr} instance")
-      state = Map.put(state, :found_api?, true)

       if curr.allows_crawling?(domain) do
         try do
-          %Crawler{state | result: curr.crawl(domain), api_crawlers: []}
+          %Crawler{state | result: curr.crawl(domain, result), found_api?: true}
         rescue
           e in HTTPoison.Error ->
             Map.put(state, :error, "HTTPoison error: " <> HTTPoison.Error.message(e))

           e in Jason.DecodeError ->
             Map.put(state, :error, "Jason DecodeError: " <> Jason.DecodeError.message(e))

           e in _ ->
             Map.put(state, :error, "Unknown error: " <> inspect(e))
         end
       else
         Logger.debug("#{domain} does not allow crawling.")
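
Note that register/2 switching from `[api_crawler | crawlers]` to `crawlers ++ [api_crawler]` is what makes the ordering comment in run/1 hold: prepending would have left Nodeinfo at the tail of the list, while appending keeps it at the head. Roughly:

    # run/1 registers Nodeinfo, Mastodon, Misskey, GnuSocial in that order
    # old (prepend): api_crawlers == [GnuSocial, Misskey, Mastodon, Nodeinfo]
    # new (append):  api_crawlers == [Nodeinfo, Mastodon, Misskey, GnuSocial]
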
@@ -99,9 +115,9 @@ defmodule Backend.Crawler do
   defp save(%Crawler{
          domain: domain,
          result: result,
-         found_api?: true,
         error: nil,
-         allows_crawling?: true
+         allows_crawling?: true,
+         found_api?: true
       }) do
     now = get_now()

@@ -240,7 +256,7 @@ defmodule Backend.Crawler do
       cond do
         not allows_crawling -> "robots.txt"
         error == nil -> "no api found"
-        true -> "unknown error"
+        true -> error
       end

     # The "+1" is this error!
@@ -250,25 +266,25 @@ defmodule Backend.Crawler do
       |> Map.get(:crawl_error_count)
       |> Kernel.+(1)

-    # The crawl interval grows exponentially at first but never goes above 72 hours
+    # The crawl interval grows exponentially at first but never goes above 24 hours
     crawl_interval_mins =
-      min(get_config(:crawl_interval_mins) * round(:math.pow(2, error_count)), 4320)
+      min(get_config(:crawl_interval_mins) * round(:math.pow(2, error_count)), 1440)

     next_crawl = NaiveDateTime.add(now, crawl_interval_mins * 60, :second)

-    Repo.transaction(fn ->
-      Repo.insert!(
-        %Instance{
-          domain: domain,
-          base_domain: get_base_domain(domain),
-          crawl_error: error,
-          crawl_error_count: error_count,
-          next_crawl: next_crawl
-        },
-        on_conflict: {:replace, [:base_domain, :crawl_error, :crawl_error_count, :next_crawl]},
-        conflict_target: :domain
-      )
-    end)
+    Repo.insert!(
+      %Instance{
+        domain: domain,
+        base_domain: get_base_domain(domain),
+        crawl_error: error,
+        crawl_error_count: error_count,
+        next_crawl: next_crawl,
+        updated_at: now
+      },
+      on_conflict:
+        {:replace, [:base_domain, :crawl_error, :crawl_error_count, :next_crawl, :updated_at]},
+      conflict_target: :domain
+    )

     Appsignal.increment_counter("crawler.failure", 1)
   end
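
For reference, this is the retry schedule the new cap produces, assuming a base crawl_interval_mins of 30 (the configured base value is not shown in this diff):

    # interval = min(crawl_interval_mins * 2^error_count, 1440) minutes
    # error_count:  1    2    3    4    5     6+
    # interval:    60  120  240  480  960  1440   (previously capped at 4320, i.e. 72 hours)
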
backend/lib/backend/crawler/crawlers/gnu_social.ex (new file, 178 lines)
@@ -0,0 +1,178 @@
+defmodule Backend.Crawler.Crawlers.GnuSocial do
+  alias Backend.Crawler.ApiCrawler
+  alias Backend.Crawler.Crawlers.Nodeinfo
+  import Backend.Crawler.Util
+  import Backend.Util
+  require Logger
+
+  @behaviour ApiCrawler
+
+  @impl ApiCrawler
+  def is_instance_type?(_domain, nodeinfo_result) do
+    nodeinfo_result != nil and Map.get(nodeinfo_result, :instance_type) == :gnusocial
+  end
+
+  @impl ApiCrawler
+  def allows_crawling?(domain) do
+    [
+      "/api/statuses/public_timeline.json"
+    ]
+    |> Enum.map(fn endpoint -> "https://#{domain}#{endpoint}" end)
+    |> urls_are_crawlable?()
+  end
+
+  @impl ApiCrawler
+  def crawl(domain, nodeinfo_result) do
+    if nodeinfo_result |> Map.get(:user_count) |> is_above_user_threshold?() do
+      crawl_large_instance(domain, nodeinfo_result)
+    else
+      nodeinfo_result
+    end
+  end
+
+  @spec crawl_large_instance(String.t(), Nodeinfo.t()) :: ApiCrawler.t()
+  defp crawl_large_instance(domain, nodeinfo_result) do
+    status_datetime_threshold =
+      NaiveDateTime.utc_now()
+      |> NaiveDateTime.add(get_config(:status_age_limit_days) * 24 * 3600 * -1, :second)
+
+    # Don't get any statuses older than this
+    min_timestamp = max_datetime(get_last_crawl_timestamp(domain), status_datetime_threshold)
+
+    {interactions, statuses_seen} = get_interactions(domain, min_timestamp)
+
+    Map.merge(nodeinfo_result, %{
+      interactions: interactions,
+      statuses_seen: statuses_seen,
+      peers: []
+    })
+  end
+
+  @spec get_interactions(
+          String.t(),
+          NaiveDateTime.t(),
+          String.t() | nil,
+          ApiCrawler.instance_interactions(),
+          integer()
+        ) :: {ApiCrawler.instance_interactions(), integer()}
+  defp get_interactions(
+         domain,
+         min_timestamp,
+         max_id \\ nil,
+         interactions \\ %{},
+         statuses_seen \\ 0
+       ) do
+    endpoint = "https://#{domain}/api/statuses/public_timeline.json"
+
+    endpoint =
+      if max_id != nil do
+        endpoint <> "?max_id=#{max_id}"
+      else
+        endpoint
+      end
+
+    Logger.debug("Crawling #{endpoint}")
+
+    statuses = get_and_decode!(endpoint)
+
+    # Filter to statuses that are in the correct timeframe
+    filtered_statuses =
+      statuses
+      |> Enum.filter(fn s ->
+        s["created_at"]
+        |> parse_timestamp()
+        |> is_after?(min_timestamp)
+      end)
+
+    if length(filtered_statuses) > 0 do
+      # Filter down further to statuses that a) aren't faves and b) aren't from #nobot users
+      eligible_statuses =
+        filtered_statuses |> Enum.filter(fn s -> not is_fave?(s) and not has_nobot?(s) end)
+
+      # get statuses that are eligible (i.e. users don't have #nobot in their profile), have mentions, and are not faves
+      interactions =
+        eligible_statuses
+        |> statuses_to_interactions()
+        |> merge_count_maps(interactions)
+
+      statuses_seen =
+        eligible_statuses
+        |> Kernel.length()
+        |> Kernel.+(statuses_seen)
+
+      oldest_status = Enum.at(filtered_statuses, -1)
+
+      oldest_status_datetime =
+        oldest_status
+        |> Map.get("created_at")
+        |> parse_timestamp()
+
+      if NaiveDateTime.compare(oldest_status_datetime, min_timestamp) == :gt and
+           statuses_seen < get_config(:status_count_limit) and
+           length(filtered_statuses) == length(statuses) do
+        get_interactions(domain, min_timestamp, oldest_status["id"], interactions, statuses_seen)
+      else
+        {interactions, statuses_seen}
+      end
+    else
+      {interactions, statuses_seen}
+    end
+  end
+
+  @spec statuses_to_interactions(any()) :: ApiCrawler.instance_interactions()
+  defp statuses_to_interactions(statuses) do
+    statuses
+    |> Enum.filter(fn status -> is_mention?(status) end)
+    |> Enum.map(fn status -> extract_mentions_from_status(status) end)
+    |> Enum.reduce(%{}, fn map, acc ->
+      Map.merge(acc, map)
+    end)
+  end
+
+  # Checks whether the status contains one or more mentions
+  @spec is_mention?(any()) :: boolean()
+  defp is_mention?(%{"attentions" => []}) do
+    false
+  end
+
+  defp is_mention?(_status) do
+    true
+  end
+
+  @spec is_fave?(any()) :: boolean()
+  defp is_fave?(status) do
+    uri_elements = status |> Map.get("uri") |> String.split(":")
+    Enum.member?(uri_elements, "fave")
+  end
+
+  @spec has_nobot?(any()) :: boolean()
+  defp has_nobot?(status) do
+    case get_in(status, ["user", "description"]) do
+      nil ->
+        false
+
+      description ->
+        description
+        |> String.downcase()
+        |> String.contains?("nobot")
+    end
+  end
+
+  @spec extract_mentions_from_status(any()) :: ApiCrawler.instance_interactions()
+  defp extract_mentions_from_status(status) do
+    status["attentions"]
+    |> Enum.map(fn mention -> get_domain(mention["profileurl"]) end)
+    |> Enum.reduce(%{}, fn domain, acc ->
+      Map.update(acc, domain, 1, &(&1 + 1))
+    end)
+  end
+
+  # Parses the messed-up time format that GNU social uses
+  # Like seriously, it's 2019, why *wouldn't* you use iso8601?
+  @spec parse_timestamp(String.t()) :: NaiveDateTime.t()
+  defp parse_timestamp(timestamp) do
+    timestamp
+    |> Timex.parse!("{WDshort} {Mshort} {0D} {h24}:{0m}:{0s} {0Z} {YYYY}")
+    |> Timex.to_naive_datetime()
+  end
+end
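
The Timex format string in parse_timestamp/1 corresponds to timestamps of the form shown below; the concrete value is illustrative, not taken from the diff:

    iex> Timex.parse!("Tue Jul 30 13:37:00 +0000 2019",
    ...>   "{WDshort} {Mshort} {0D} {h24}:{0m}:{0s} {0Z} {YYYY}")
    ...> |> Timex.to_naive_datetime()
    ~N[2019-07-30 13:37:00]
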
@@ -7,10 +7,15 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
   @behaviour ApiCrawler

   @impl ApiCrawler
-  def is_instance_type?(domain) do
-    case get("https://#{domain}/api/v1/instance") do
-      {:ok, response} -> if is_http_200?(response), do: has_title?(response.body), else: false
-      {:error, _error} -> false
+  def is_instance_type?(domain, result) do
+    # We might already know that this is a Pleroma instance from nodeinfo
+    if result != nil and Map.get(result, :instance_type) == :pleroma do
+      true
+    else
+      case get_and_decode("https://#{domain}/api/v1/instance") do
+        {:ok, %{"title" => _title}} -> true
+        _other -> false
+      end
     end
   end

@@ -26,8 +31,8 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
   end

   @impl ApiCrawler
-  def crawl(domain) do
-    instance = Jason.decode!(get!("https://#{domain}/api/v1/instance").body)
+  def crawl(domain, _current_result) do
+    instance = get_and_decode!("https://#{domain}/api/v1/instance")
     user_count = get_in(instance, ["stats", "user_count"])

     if is_above_user_threshold?(user_count) or has_opted_in?(domain) do
@@ -51,12 +56,7 @@ defmodule Backend.Crawler.Crawlers.Mastodon do

   @spec crawl_large_instance(String.t(), any()) :: ApiCrawler.t()
   defp crawl_large_instance(domain, instance) do
-    # servers may not publish peers
-    peers =
-      case get("https://#{domain}/api/v1/instance/peers") do
-        {:ok, response} -> if is_http_200?(response), do: Jason.decode!(response.body), else: []
-        {:error, _error} -> []
-      end
+    peers = get_peers(domain)

     Logger.debug("Found #{length(peers)} peers.")

@@ -124,15 +124,15 @@ defmodule Backend.Crawler.Crawlers.Mastodon do

     Logger.debug("Crawling #{endpoint}")

-    statuses =
-      endpoint
-      |> get!()
-      |> Map.get(:body)
-      |> Jason.decode!()
+    statuses = get_and_decode!(endpoint)

     filtered_statuses =
       statuses
-      |> Enum.filter(fn s -> is_after?(s["created_at"], min_timestamp) end)
+      |> Enum.filter(fn s ->
+        s["created_at"]
+        |> NaiveDateTime.from_iso8601!()
+        |> is_after?(min_timestamp)
+      end)

     if length(filtered_statuses) > 0 do
       # get statuses that are eligible (i.e. users don't have #nobot in their profile) and have mentions
@@ -166,12 +166,11 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
     end
   end

-  # To check if the endpoint works as expected
-  @spec has_title?(String.t()) :: boolean
-  defp has_title?(body) do
-    case Jason.decode(body) do
-      {:ok, decoded} -> Map.has_key?(decoded, "title")
-      {:error, _error} -> false
+  defp get_peers(domain) do
+    # servers may not publish peers
+    case get_and_decode("https://#{domain}/api/v1/instance/peers") do
+      {:ok, peers} -> peers
+      {:error, _err} -> []
     end
   end

@@ -7,10 +7,15 @@ defmodule Backend.Crawler.Crawlers.Misskey do
   require Logger

   @impl ApiCrawler
-  def is_instance_type?(domain) do
-    case get_version_and_description(domain) do
-      {:ok, _} -> true
-      {:error, _} -> false
+  def is_instance_type?(domain, result) do
+    # We may already know that this is a Misskey instance from nodeinfo
+    if result != nil and Map.get(result, :instance_type) == :misskey do
+      true
+    else
+      case get_version_and_description(domain) do
+        {:ok, _} -> true
+        {:error, _} -> false
+      end
     end
   end

@@ -27,11 +32,9 @@ defmodule Backend.Crawler.Crawlers.Misskey do
   end

   @impl ApiCrawler
-  def crawl(domain) do
-    with {:ok, %{status_code: 200, body: stats_body}} <- post("https://#{domain}/api/stats") do
-      %{"originalUsersCount" => user_count, "originalNotesCount" => status_count} =
-        Jason.decode!(stats_body)
-
+  def crawl(domain, _result) do
+    with {:ok, %{"originalUsersCount" => user_count, "originalNotesCount" => status_count}} <-
+           post_and_decode("https://#{domain}/api/stats") do
       if is_above_user_threshold?(user_count) or has_opted_in?(domain) do
         crawl_large_instance(domain, user_count, status_count)
       else
@@ -107,15 +110,15 @@ defmodule Backend.Crawler.Crawlers.Misskey do

     Logger.debug("Crawling #{endpoint} with untilId=#{until_id}")

-    statuses =
-      endpoint
-      |> post!(Jason.encode!(params))
-      |> Map.get(:body)
-      |> Jason.decode!()
+    statuses = post_and_decode!(endpoint, Jason.encode!(params))

     filtered_statuses =
       statuses
-      |> Enum.filter(fn s -> is_after?(s["createdAt"], min_timestamp) end)
+      |> Enum.filter(fn s ->
+        s["createdAt"]
+        |> NaiveDateTime.from_iso8601!()
+        |> is_after?(min_timestamp)
+      end)

     if length(filtered_statuses) > 0 do
       # get statuses that are eligible (i.e. users don't have #nobot in their profile) and have mentions
@@ -151,35 +154,22 @@ defmodule Backend.Crawler.Crawlers.Misskey do
   end

   @spec get_version_and_description(String.t()) ::
-          {:ok, {String.t(), String.t()}} | {:error, String.t()}
+          {:ok, {String.t(), String.t()}} | {:error, Jason.DecodeError.t() | HTTPoison.Error.t()}
   defp get_version_and_description(domain) do
-    case post("https://#{domain}/api/meta") do
-      {:ok, %{status_code: 200, body: body}} ->
-        case Jason.decode(body) do
-          {:ok, decoded} ->
-            {:ok, {Map.get(decoded, "version"), Map.get(decoded, "description")}}
+    case post_and_decode("https://#{domain}/api/meta") do
+      {:ok, %{"version" => version, "description" => description}} ->
+        {:ok, {version, description}}

-          {:error, _error} ->
-            {:error, "invalid response"}
-        end
-
-      _ ->
-        {:error, "unsuccesful request"}
+      {:error, err} ->
+        {:error, err}
     end
   end

   @spec get_peers(String.t()) :: {:ok, [String.t()]} | {:error, Jason.DecodeError.t()}
   defp get_peers(domain) do
-    case get("https://#{domain}/api/v1/instance/peers") do
-      {:ok, response} ->
-        with %{status_code: 200, body: body} <- response do
-          Jason.decode(body)
-        else
-          _ -> {:ok, []}
-        end
-
-      {:error, _} ->
-        {:ok, []}
+    case get_and_decode("https://#{domain}/api/v1/instance/peers") do
+      {:ok, peers} -> {:ok, peers}
+      {:error, _} -> {:ok, []}
     end
   end

backend/lib/backend/crawler/crawlers/nodeinfo.ex (new file, 117 lines)
@@ -0,0 +1,117 @@
+defmodule Backend.Crawler.Crawlers.Nodeinfo do
+  alias Backend.Crawler.ApiCrawler
+  require Logger
+  import Backend.Util
+  import Backend.Crawler.Util
+
+  @moduledoc """
+  This module is slightly different from the other crawlers.
+  It doesn't implement the ApiCrawler spec because it isn't run as a self-contained crawler.
+  Instead, it's run before all the other crawlers.
+
+  This is to get the user count. Some servers don't publish this in other places (e.g. GNU Social, PeerTube) so we need
+  nodeinfo to know whether it's a personal instance or not.
+  """
+
+  defstruct [
+    :description,
+    :user_count,
+    :status_count,
+    :instance_type,
+    :version
+  ]
+
+  @type t() :: %__MODULE__{
+          description: String.t(),
+          user_count: integer,
+          status_count: integer,
+          instance_type: ApiCrawler.instance_type(),
+          version: String.t()
+        }
+
+  @spec allows_crawling?(String.t()) :: boolean()
+  def allows_crawling?(domain) do
+    [
+      ".well-known/nodeinfo"
+    ]
+    |> Enum.map(fn endpoint -> "https://#{domain}#{endpoint}" end)
+    |> urls_are_crawlable?()
+  end
+
+  @spec crawl(String.t()) :: {:ok, t()} | {:error, nil}
+  def crawl(domain) do
+    with {:ok, nodeinfo_url} <- get_nodeinfo_url(domain),
+         {:ok, nodeinfo} <- get_nodeinfo(nodeinfo_url) do
+      {:ok, nodeinfo}
+    else
+      _other -> {:error, nil}
+    end
+  end
+
+  @spec get_nodeinfo_url(String.t()) ::
+          {:ok, String.t()} | {:error, Jason.DecodeError.t() | HTTPoison.Error.t()}
+  defp get_nodeinfo_url(domain) do
+    case get_and_decode("https://#{domain}/.well-known/nodeinfo") do
+      {:ok, response} -> {:ok, process_nodeinfo_url(response)}
+      {:error, err} -> {:error, err}
+    end
+  end
+
+  @spec process_nodeinfo_url(any()) :: String.t()
+  defp process_nodeinfo_url(response) do
+    response
+    |> Map.get("links")
+    |> Enum.filter(fn %{"rel" => rel} -> is_compatible_nodeinfo_version?(rel) end)
+    |> Kernel.hd()
+    |> Map.get("href")
+  end
+
+  @spec get_nodeinfo(String.t()) ::
+          {:ok, t()} | {:error, Jason.DecodeError.t() | HTTPoison.Error.t()}
+  defp get_nodeinfo(nodeinfo_url) do
+    case get_and_decode(nodeinfo_url) do
+      {:ok, nodeinfo} -> {:ok, process_nodeinfo(nodeinfo)}
+      {:error, err} -> {:error, err}
+    end
+  end
+
+  @spec process_nodeinfo(any()) :: t()
+  defp process_nodeinfo(nodeinfo) do
+    user_count = get_in(nodeinfo, ["usage", "users", "total"])
+
+    if is_above_user_threshold?(user_count) do
+      # Both of these are used, depending on the server implementation
+      description =
+        [
+          get_in(nodeinfo, ["metadata", "description"]),
+          get_in(nodeinfo, ["metadata", "nodeDescription"])
+        ]
+        |> Enum.filter(fn d -> d != nil end)
+        |> Enum.at(0)
+
+      type = nodeinfo |> get_in(["software", "name"]) |> String.downcase() |> String.to_atom()
+
+      %__MODULE__{
+        description: description,
+        user_count: user_count,
+        status_count: get_in(nodeinfo, ["usage", "localPosts"]),
+        instance_type: type,
+        version: get_in(nodeinfo, ["software", "version"])
+      }
+    else
+      %{
+        description: nil,
+        user_count: user_count,
+        status_count: nil,
+        instance_type: nil,
+        version: nil
+      }
+    end
+  end
+
+  @spec is_compatible_nodeinfo_version?(String.t()) :: boolean()
+  defp is_compatible_nodeinfo_version?(schema_url) do
+    version = String.slice(schema_url, (String.length(schema_url) - 3)..-1)
+    Enum.member?(["1.0", "1.1", "2.0"], version)
+  end
+end
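
For context, the document that get_nodeinfo_url/1 fetches from /.well-known/nodeinfo decodes to roughly the following shape (illustrative values; only the schema URL format is standardised):

    %{
      "links" => [
        %{
          "rel" => "http://nodeinfo.diaspora.software/ns/schema/2.0",
          "href" => "https://example.social/nodeinfo/2.0"
        }
      ]
    }

process_nodeinfo_url/1 keeps the links whose "rel" ends in a supported version ("1.0", "1.1" or "2.0") and returns the first matching "href".
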
@@ -54,7 +54,7 @@ defmodule Backend.Crawler.StaleInstanceManager do
     stale_domains =
       Instance
       |> select([i], i.domain)
-      |> where([i], i.next_crawl < ^now)
+      |> where([i], i.next_crawl < ^now and not i.opt_out)
       |> Repo.all()
       |> MapSet.new()

@@ -8,27 +8,19 @@ defmodule Backend.Crawler.Util do
   # (e.g. https://mastodon.social/@demouser or https://pleroma.site/users/demouser)
   @spec get_domain(String.t()) :: String.t()
   def get_domain(url) do
-    String.slice(url, 8..-1)
-    |> String.split("/")
-    |> Enum.at(0)
+    [_match, domain] = Regex.run(~r/https?:\/\/([\w.-]+)\/.*/, url)
+    domain
   end

-  @spec is_http_200?(HTTPoison.Response.t()) :: boolean
-  def is_http_200?(%{status_code: 200}) do
-    true
-  end
-
-  def is_http_200?(_) do
-    false
-  end
-
-  @spec is_after?(String.t(), NaiveDateTime.t() | nil) :: boolean()
+  @doc """
+  Returns true if the first argument is after the second.
+  """
+  @spec is_after?(NaiveDateTime.t(), NaiveDateTime.t() | nil) :: boolean()
   def is_after?(timestamp, threshold) do
     if threshold == nil do
       true
     else
       timestamp
-      |> NaiveDateTime.from_iso8601!()
       # :second is the granularity used in the database
       |> NaiveDateTime.truncate(:second)
       |> NaiveDateTime.compare(threshold)
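
With the regex version, the example profile URLs from the comment above resolve as expected:

    iex> get_domain("https://mastodon.social/@demouser")
    "mastodon.social"
    iex> get_domain("https://pleroma.site/users/demouser")
    "pleroma.site"
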
@@ -36,49 +28,6 @@ defmodule Backend.Crawler.Util do
     end
   end

-  def get(url) do
-    # TODO: add version number to user agent?
-    HTTPoison.get(url, [{"User-Agent", get_config(:user_agent)}],
-      hackney: [pool: :crawler],
-      recv_timeout: 15000,
-      timeout: 15000
-    )
-  end
-
-  @spec get!(binary) :: %{
-          :__struct__ => HTTPoison.AsyncResponse | HTTPoison.Response,
-          optional(:body) => any,
-          optional(:headers) => [any],
-          optional(:id) => reference,
-          optional(:request) => HTTPoison.Request.t(),
-          optional(:request_url) => any,
-          optional(:status_code) => integer
-        }
-  def get!(url) do
-    # TODO: add version number to user agent?
-    HTTPoison.get!(url, [{"User-Agent", get_config(:user_agent)}],
-      hackney: [pool: :crawler],
-      recv_timeout: 15000,
-      timeout: 15000
-    )
-  end
-
-  def post(url, body \\ "") do
-    HTTPoison.post(url, body, [{"User-Agent", get_config(:user_agent)}],
-      hackney: [pool: :crawler],
-      recv_timeout: 15000,
-      timeout: 15000
-    )
-  end
-
-  def post!(url, body \\ "") do
-    HTTPoison.post!(url, body, [{"User-Agent", get_config(:user_agent)}],
-      hackney: [pool: :crawler],
-      recv_timeout: 15000,
-      timeout: 15000
-    )
-  end
-
   @spec urls_are_crawlable?([String.t()]) :: boolean()
   def urls_are_crawlable?(urls) do
     user_agent = get_config(:user_agent)
@@ -145,4 +145,54 @@ defmodule Backend.Util do
   def convert_keys_to_atoms(map) do
     map |> Map.new(fn {k, v} -> {String.to_atom(k), v} end)
   end
+
+  @doc """
+  Gets and decodes a HTTP response.
+  """
+  @spec get_and_decode(String.t()) ::
+          {:ok, any()} | {:error, Jason.DecodeError.t() | HTTPoison.Error.t()}
+  def get_and_decode(url) do
+    case HTTPoison.get(url, [{"User-Agent", get_config(:user_agent)}],
+           hackney: [pool: :crawler],
+           recv_timeout: 15000,
+           timeout: 15000
+         ) do
+      {:ok, %{status_code: 200, body: body}} -> Jason.decode(body)
+      {:ok, _} -> {:error, %HTTPoison.Error{reason: "Non-200 response"}}
+      {:error, err} -> {:error, err}
+    end
+  end
+
+  @spec get_and_decode!(String.t()) :: any()
+  def get_and_decode!(url) do
+    case get_and_decode(url) do
+      {:ok, decoded} -> decoded
+      {:error, error} -> raise error
+    end
+  end
+
+  @doc """
+  POSTS to a HTTP endpoint and decodes the JSON response.
+  """
+  @spec post_and_decode(String.t(), String.t()) ::
+          {:ok, any()} | {:error, Jason.DecodeError.t() | HTTPoison.Error.t()}
+  def post_and_decode(url, body \\ "") do
+    case HTTPoison.post(url, body, [{"User-Agent", get_config(:user_agent)}],
+           hackney: [pool: :crawler],
+           recv_timeout: 15000,
+           timeout: 15000
+         ) do
+      {:ok, %{status_code: 200, body: response_body}} -> Jason.decode(response_body)
+      {:ok, _} -> {:error, %HTTPoison.Error{reason: "Non-200 response"}}
+      {:error, err} -> {:error, err}
+    end
+  end
+
+  @spec post_and_decode!(String.t(), String.t()) :: any()
+  def post_and_decode!(url, body \\ "") do
+    case post_and_decode(url, body) do
+      {:ok, decoded} -> decoded
+      {:error, error} -> raise error
+    end
+  end
 end
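
A sketch of how these helpers are consumed by the crawlers (the URL is illustrative):

    # {:ok, decoded} on a 200 response with valid JSON; otherwise
    # {:error, %HTTPoison.Error{}} or {:error, %Jason.DecodeError{}}.
    case get_and_decode("https://example.social/api/v1/instance") do
      {:ok, instance} -> Map.get(instance, "title")
      {:error, _err} -> nil
    end

    # The bang variants raise instead; Backend.Crawler.crawl/1 rescues
    # HTTPoison.Error and Jason.DecodeError and records them as crawl errors.
    instance = get_and_decode!("https://example.social/api/v1/instance")
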
@@ -14,10 +14,7 @@ defmodule BackendWeb.AdminLoginController do
     # TODO: this assumes mastodon/pleroma API
     cleaned_domain = clean_domain(domain)

-    instance_data =
-      HTTPoison.get!("https://#{cleaned_domain}/api/v1/instance")
-      |> Map.get(:body)
-      |> Jason.decode!()
+    instance_data = get_and_decode!("https://#{cleaned_domain}/api/v1/instance")

     render(conn, "show.json", instance_data: instance_data, cleaned_domain: cleaned_domain)
   end
@@ -25,10 +22,7 @@ defmodule BackendWeb.AdminLoginController do
   def create(conn, %{"domain" => domain, "type" => type}) do
     cleaned_domain = clean_domain(domain)

-    instance_data =
-      HTTPoison.get!("https://#{cleaned_domain}/api/v1/instance")
-      |> Map.get(:body)
-      |> Jason.decode!()
+    instance_data = get_and_decode!("https://#{cleaned_domain}/api/v1/instance")

     error =
       cond do
@@ -3,7 +3,7 @@ import { IconNames } from "@blueprintjs/icons";
 import React from "react";
 import { QUALITATIVE_COLOR_SCHEME } from "../../constants";
 import { typeColorScheme } from "../../types";
-import { capitalize } from "../../util";
+import { getTypeDisplayString } from "../../util";

 interface IInstanceTypeProps {
   type: string;
@@ -15,7 +15,7 @@ interface IInstanceTypeProps {
  */
 const InstanceType: React.FC<IInstanceTypeProps> = ({ type, colorAfterName }) => {
   const idx = typeColorScheme.values.indexOf(type);
-  const name = " " + capitalize(type);
+  const name = " " + getTypeDisplayString(type);
   return (
     <>
       {!!colorAfterName && name}
@@ -4,7 +4,7 @@ import React, { MouseEvent } from "react";
 import styled from "styled-components";
 import { INSTANCE_TYPES } from "../../constants";
 import { getSearchFilterDisplayValue, ISearchFilter } from "../../searchFilters";
-import { capitalize } from "../../util";
+import { getTypeDisplayString } from "../../util";

 const SearchFilterContainer = styled.div`
   margin: 10px 0 0 0;
@@ -30,7 +30,7 @@ const SearchFilters: React.FC<ISearchFiltersProps> = ({ selectedFilters, selectF
   const handleSelectInstanceType = (e: MouseEvent<HTMLElement>) => {
     const field = "type";
     const relation = "eq";
-    const value = e.currentTarget.innerText.toLowerCase();
+    const value = e.currentTarget.innerText.toLowerCase().replace(" ", "");
     const filter: ISearchFilter = {
       displayValue: getSearchFilterDisplayValue(field, relation, value),
       field,
@@ -43,7 +43,7 @@ const SearchFilters: React.FC<ISearchFiltersProps> = ({ selectedFilters, selectF
       <Menu>
         <MenuItem icon={IconNames.SYMBOL_CIRCLE} text="Instance type" disabled={hasInstanceTypeFilter}>
           {INSTANCE_TYPES.map(t => (
-            <MenuItem key={t} text={capitalize(t)} onClick={handleSelectInstanceType} />
+            <MenuItem key={t} text={getTypeDisplayString(t)} onClick={handleSelectInstanceType} />
           ))}
         </MenuItem>
       </Menu>
@@ -40,4 +40,4 @@ export interface IInstanceDomainPath {
 }

 // We could also extract the values from the server response, but this would slow things down...
-export const INSTANCE_TYPES = ["mastodon", "gab", "pleroma", "misskey"];
+export const INSTANCE_TYPES = ["mastodon", "gab", "pleroma", "misskey", "gnusocial"];
@@ -68,3 +68,13 @@ export const getBuckets = (min: number, max: number, steps: number, exponential:
     return range(min, max, bucketSize);
   }
 };
+
+const typeToDisplay = {
+  gnusocial: "GNU Social"
+};
+export const getTypeDisplayString = (key: string) => {
+  if (key in typeToDisplay) {
+    return typeToDisplay[key];
+  }
+  return capitalize(key);
+};