add GNU social crawler

Tao Bror Bojlén 2019-08-09 18:17:29 +03:00
parent 72031c7d3e
commit 509924ed52
No known key found for this signature in database
GPG key ID: C6EC7AAB905F9E6F
7 changed files with 281 additions and 48 deletions


@@ -11,10 +11,12 @@ defmodule Backend.Crawler.ApiCrawler do
* Make sure to check the most recent crawl of the instance so you don't re-crawl old statuses
"""
alias Backend.Crawler.Crawlers.Nodeinfo
# {domain_mentioned, count}
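# e.g. %{"mastodon.social" => 3, "pleroma.site" => 1} (illustrative)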
@type instance_interactions :: %{String.t() => integer}
@type instance_type :: :mastodon | :pleroma | :gab | :misskey
@type instance_type :: :mastodon | :pleroma | :gab | :misskey | :gnusocial
defstruct [
:version,
@@ -40,8 +42,9 @@ defmodule Backend.Crawler.ApiCrawler do
@doc """
Check whether the instance at the given domain is of the type that this ApiCrawler implements.
Arguments are the instance domain and the nodeinfo results.
"""
@callback is_instance_type?(String.t()) :: boolean()
@callback is_instance_type?(String.t(), Nodeinfo.t()) :: boolean()
@doc """
Check whether the instance allows crawling according to its robots.txt or otherwise.
@@ -52,5 +55,5 @@ defmodule Backend.Crawler.ApiCrawler do
Crawl the instance at the given domain.
Takes two arguments: the domain to crawl and the existing results (from nodeinfo).
"""
@callback crawl(String.t(), ApiCrawler.t()) :: t()
@callback crawl(String.t(), Nodeinfo.t()) :: t()
end


@@ -4,7 +4,7 @@ defmodule Backend.Crawler do
"""
alias __MODULE__
alias Backend.Crawler.Crawlers.{Mastodon, Misskey, Nodeinfo}
alias Backend.Crawler.Crawlers.{GnuSocial, Mastodon, Misskey, Nodeinfo}
alias Backend.Crawler.ApiCrawler
alias Backend.{Crawl, CrawlInteraction, Repo, Instance, InstancePeer}
import Ecto.Query
@@ -17,6 +17,7 @@ defmodule Backend.Crawler do
# a list of ApiCrawlers that will be attempted
:api_crawlers,
:allows_crawling?,
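# set to true once any crawler (including nodeinfo) has returned a result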
:found_api?,
:result,
:error
]
@@ -25,6 +26,7 @@ defmodule Backend.Crawler do
domain: String.t(),
api_crawlers: [ApiCrawler.t()],
allows_crawling?: boolean,
found_api?: boolean,
result: ApiCrawler.t() | nil,
error: String.t() | nil
}
@@ -36,6 +38,7 @@ defmodule Backend.Crawler do
domain: domain,
api_crawlers: [],
allows_crawling?: true,
found_api?: false,
result: nil,
error: nil
}
@@ -45,6 +48,7 @@ defmodule Backend.Crawler do
|> register(Nodeinfo)
|> register(Mastodon)
|> register(Misskey)
|> register(GnuSocial)
# go!
|> crawl()
|> save()
@@ -70,7 +74,8 @@ defmodule Backend.Crawler do
defp crawl(%Crawler{api_crawlers: [Nodeinfo | remaining_crawlers], domain: domain} = state) do
with true <- Nodeinfo.allows_crawling?(domain), {:ok, nodeinfo} <- Nodeinfo.crawl(domain) do
Logger.debug("Found nodeinfo for #{domain}.")
crawl(%Crawler{state | result: nodeinfo, api_crawlers: remaining_crawlers})
result = Map.merge(nodeinfo, %{peers: [], interactions: %{}, statuses_seen: 0})
crawl(%Crawler{state | result: result, found_api?: true, api_crawlers: remaining_crawlers})
else
_ ->
Logger.debug("Did not find nodeinfo for #{domain}.")
@@ -82,12 +87,12 @@ defmodule Backend.Crawler do
%Crawler{domain: domain, result: result, api_crawlers: [curr | remaining_crawlers]} =
state
) do
if curr.is_instance_type?(domain) do
if curr.is_instance_type?(domain, result) do
Logger.debug("Found #{curr} instance")
if curr.allows_crawling?(domain) do
try do
%Crawler{state | result: curr.crawl(domain, result)}
%Crawler{state | result: curr.crawl(domain, result), found_api?: true}
rescue
e in HTTPoison.Error ->
Map.put(state, :error, "HTTPoison error: " <> HTTPoison.Error.message(e))
@@ -111,7 +116,8 @@ defmodule Backend.Crawler do
domain: domain,
result: result,
error: nil,
allows_crawling?: true
allows_crawling?: true,
found_api?: true
}) do
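# Only crawls that found an API, were allowed to crawl, and produced no error are saved.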
now = get_now()
@@ -250,7 +256,7 @@ defmodule Backend.Crawler do
cond do
not allows_crawling -> "robots.txt"
error == nil -> "no api found"
true -> "unknown error"
true -> error
end
# The "+1" is this error!


@@ -0,0 +1,178 @@
defmodule Backend.Crawler.Crawlers.GnuSocial do
alias Backend.Crawler.ApiCrawler
alias Backend.Crawler.Crawlers.Nodeinfo
import Backend.Crawler.Util
import Backend.Util
require Logger
@behaviour ApiCrawler
@impl ApiCrawler
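# Nodeinfo's software.name is atomized upstream, so GNU social arrives here as :gnusocial.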
def is_instance_type?(_domain, nodeinfo_result) do
nodeinfo_result != nil and Map.get(nodeinfo_result, :instance_type) == :gnusocial
end
@impl ApiCrawler
def allows_crawling?(domain) do
[
"/api/statuses/public_timeline.json"
]
|> Enum.map(fn endpoint -> "https://#{domain}#{endpoint}" end)
|> urls_are_crawlable?()
end
@impl ApiCrawler
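# Instances below the personal-instance user threshold keep only their nodeinfo stats.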
def crawl(domain, nodeinfo_result) do
if nodeinfo_result |> Map.get(:user_count) |> is_above_user_threshold?() do
crawl_large_instance(domain, nodeinfo_result)
else
nodeinfo_result
end
end
@spec crawl_large_instance(String.t(), Nodeinfo.t()) :: ApiCrawler.t()
defp crawl_large_instance(domain, nodeinfo_result) do
status_datetime_threshold =
NaiveDateTime.utc_now()
|> NaiveDateTime.add(get_config(:status_age_limit_days) * 24 * 3600 * -1, :second)
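# e.g. with status_age_limit_days: 28, this is four weeks ago (illustrative value)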
# Don't get any statuses older than this
min_timestamp = max_datetime(get_last_crawl_timestamp(domain), status_datetime_threshold)
{interactions, statuses_seen} = get_interactions(domain, min_timestamp)
Map.merge(nodeinfo_result, %{
interactions: interactions,
statuses_seen: statuses_seen,
peers: []
})
end
@spec get_interactions(
String.t(),
NaiveDateTime.t(),
String.t() | nil,
ApiCrawler.instance_interactions(),
integer()
) :: {ApiCrawler.instance_interactions(), integer()}
defp get_interactions(
domain,
min_timestamp,
max_id \\ nil,
interactions \\ %{},
statuses_seen \\ 0
) do
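# The Twitter-compatible timeline API paginates via max_id: a request returns
# statuses with IDs at or below that value.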
endpoint = "https://#{domain}/api/statuses/public_timeline.json"
endpoint =
if max_id != nil do
endpoint <> "?max_id=#{max_id}"
else
endpoint
end
Logger.debug("Crawling #{endpoint}")
statuses = get_and_decode!(endpoint)
# Filter to statuses that are in the correct timeframe
filtered_statuses =
statuses
|> Enum.filter(fn s ->
s["created_at"]
|> parse_timestamp()
|> is_after?(min_timestamp)
end)
if length(filtered_statuses) > 0 do
# Filter down further to statuses that a) aren't faves and b) aren't from #nobot users
eligible_statuses =
filtered_statuses |> Enum.filter(fn s -> not is_fave?(s) and not has_nobot?(s) end)
# get statuses that are eligible (i.e. users don't have #nobot in their profile), have mentions, and are not faves
interactions =
eligible_statuses
|> statuses_to_interactions()
|> merge_count_maps(interactions)
statuses_seen =
eligible_statuses
|> Kernel.length()
|> Kernel.+(statuses_seen)
oldest_status = Enum.at(filtered_statuses, -1)
oldest_status_datetime =
oldest_status
|> Map.get("created_at")
|> parse_timestamp()
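# Paginate further only while we're inside the time window, under the status
# cap, and no statuses on this page were dropped by the timestamp filter.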
if NaiveDateTime.compare(oldest_status_datetime, min_timestamp) == :gt and
statuses_seen < get_config(:status_count_limit) and
length(filtered_statuses) == length(statuses) do
get_interactions(domain, min_timestamp, oldest_status["id"], interactions, statuses_seen)
else
{interactions, statuses_seen}
end
else
{interactions, statuses_seen}
end
end
@spec statuses_to_interactions(any()) :: ApiCrawler.instance_interactions()
defp statuses_to_interactions(statuses) do
statuses
|> Enum.filter(fn status -> is_mention?(status) end)
|> Enum.map(fn status -> extract_mentions_from_status(status) end)
|> Enum.reduce(%{}, fn map, acc ->
# merge_count_maps sums per-domain counts; a plain Map.merge would overwrite them
merge_count_maps(acc, map)
end)
end
# Checks whether the status contains one or more mentions
@spec is_mention?(any()) :: boolean()
defp is_mention?(%{"attentions" => []}) do
false
end
defp is_mention?(_status) do
true
end
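# GNU social mixes favorites into the public timeline; their URIs contain a
# ":fave:" segment, e.g. "tag:social.example,2019:fave:1:note:2" (illustrative).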
@spec is_fave?(any()) :: boolean()
defp is_fave?(status) do
uri_elements = status |> Map.get("uri") |> String.split(":")
Enum.member?(uri_elements, "fave")
end
@spec has_nobot?(any()) :: boolean()
defp has_nobot?(status) do
case get_in(status, ["user", "description"]) do
nil ->
false
description ->
description
|> String.downcase()
|> String.contains?("nobot")
end
end
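# "attentions" holds the mentioned users; each entry's "profileurl" is a URL
# like "https://mastodon.social/@demouser" (illustrative).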
@spec extract_mentions_from_status(any()) :: ApiCrawler.instance_interactions()
defp extract_mentions_from_status(status) do
status["attentions"]
|> Enum.map(fn mention -> get_domain(mention["profileurl"]) end)
|> Enum.reduce(%{}, fn domain, acc ->
Map.update(acc, domain, 1, &(&1 + 1))
end)
end
# Parses the messed-up time format that GNU social uses
# Like seriously, it's 2019, why *wouldn't* you use iso8601?
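# e.g. "Fri Aug 09 18:17:29 +0300 2019" (illustrative)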
@spec parse_timestamp(String.t()) :: NaiveDateTime.t()
defp parse_timestamp(timestamp) do
timestamp
|> Timex.parse!("{WDshort} {Mshort} {0D} {h24}:{0m}:{0s} {0Z} {YYYY}")
|> Timex.to_naive_datetime()
end
end


@@ -7,10 +7,15 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
@behaviour ApiCrawler
@impl ApiCrawler
def is_instance_type?(domain) do
case get_and_decode("https://#{domain}/api/v1/instance") do
{:ok, %{"title" => _title}} -> true
_other -> false
def is_instance_type?(domain, result) do
# We might already know that this is a Pleroma instance from nodeinfo
if result != nil and Map.get(result, :instance_type) == :pleroma do
true
else
case get_and_decode("https://#{domain}/api/v1/instance") do
{:ok, %{"title" => _title}} -> true
_other -> false
end
end
end
@@ -123,7 +128,11 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
filtered_statuses =
statuses
|> Enum.filter(fn s -> is_after?(s["created_at"], min_timestamp) end)
|> Enum.filter(fn s ->
s["created_at"]
|> NaiveDateTime.from_iso8601!()
|> is_after?(min_timestamp)
end)
if length(filtered_statuses) > 0 do
# get statuses that are eligible (i.e. users don't have #nobot in their profile) and have mentions


@@ -7,10 +7,15 @@ defmodule Backend.Crawler.Crawlers.Misskey do
require Logger
@impl ApiCrawler
def is_instance_type?(domain) do
case get_version_and_description(domain) do
{:ok, _} -> true
{:error, _} -> false
def is_instance_type?(domain, result) do
# We may already know that this is a Misskey instance from nodeinfo
if result != nil and Map.get(result, :instance_type) == :misskey do
true
else
case get_version_and_description(domain) do
{:ok, _} -> true
{:error, _} -> false
end
end
end
@@ -109,7 +114,11 @@ defmodule Backend.Crawler.Crawlers.Misskey do
filtered_statuses =
statuses
|> Enum.filter(fn s -> is_after?(s["createdAt"], min_timestamp) end)
|> Enum.filter(fn s ->
s["createdAt"]
|> NaiveDateTime.from_iso8601!()
|> is_after?(min_timestamp)
end)
if length(filtered_statuses) > 0 do
# get statuses that are eligible (i.e. users don't have #nobot in their profile) and have mentions


@@ -1,4 +1,5 @@
defmodule Backend.Crawler.Crawlers.Nodeinfo do
alias Backend.Crawler.ApiCrawler
require Logger
import Backend.Util
import Backend.Crawler.Util
@@ -12,6 +13,22 @@ defmodule Backend.Crawler.Crawlers.Nodeinfo do
nodeinfo to know whether it's a personal instance or not.
"""
defstruct [
:description,
:user_count,
:status_count,
:instance_type,
:version
]
@type t() :: %__MODULE__{
description: String.t(),
user_count: integer,
status_count: integer,
instance_type: ApiCrawler.instance_type(),
version: String.t()
}
@spec allows_crawling?(String.t()) :: boolean()
def allows_crawling?(domain) do
[
@@ -21,6 +38,7 @@ defmodule Backend.Crawler.Crawlers.Nodeinfo do
|> urls_are_crawlable?()
end
@spec crawl(String.t()) :: {:ok, t()} | {:error, nil}
def crawl(domain) do
with {:ok, nodeinfo_url} <- get_nodeinfo_url(domain),
{:ok, nodeinfo} <- get_nodeinfo(nodeinfo_url) do
@@ -30,6 +48,8 @@ defmodule Backend.Crawler.Crawlers.Nodeinfo do
end
end
@spec get_nodeinfo_url(String.t()) ::
{:ok, String.t()} | {:error, Jason.DecodeError.t() | HTTPoison.Error.t()}
defp get_nodeinfo_url(domain) do
case get_and_decode("https://#{domain}/.well-known/nodeinfo") do
{:ok, response} -> {:ok, process_nodeinfo_url(response)}
@@ -37,6 +57,7 @@ defmodule Backend.Crawler.Crawlers.Nodeinfo do
end
end
@spec process_nodeinfo_url(any()) :: String.t()
defp process_nodeinfo_url(response) do
response
|> Map.get("links")
@@ -45,6 +66,8 @@ defmodule Backend.Crawler.Crawlers.Nodeinfo do
|> Map.get("href")
end
@spec get_nodeinfo(String.t()) ::
{:ok, t()} | {:error, Jason.DecodeError.t() | HTTPoison.Error.t()}
defp get_nodeinfo(nodeinfo_url) do
case get_and_decode(nodeinfo_url) do
{:ok, nodeinfo} -> {:ok, process_nodeinfo(nodeinfo)}
@@ -52,23 +75,36 @@ defmodule Backend.Crawler.Crawlers.Nodeinfo do
end
end
@spec process_nodeinfo(any()) :: t()
defp process_nodeinfo(nodeinfo) do
# Both of these are used, depending on the server implementation
description =
[
get_in(nodeinfo, ["metadata", "description"]),
get_in(nodeinfo, ["metadata", "nodeDescription"])
]
|> Enum.filter(fn d -> d != nil end)
|> Enum.at(0)
user_count = get_in(nodeinfo, ["usage", "users", "total"])
%{
description: description,
user_count: get_in(nodeinfo, ["usage", "users", "total"]),
status_count: get_in(nodeinfo, ["usage", "localPosts"]),
instance_type: String.to_atom(get_in(nodeinfo, ["software", "name"])),
version: get_in(nodeinfo, ["software", "version"])
}
if is_above_user_threshold?(user_count) do
# Both of these are used, depending on the server implementation
description =
[
get_in(nodeinfo, ["metadata", "description"]),
get_in(nodeinfo, ["metadata", "nodeDescription"])
]
|> Enum.filter(fn d -> d != nil end)
|> Enum.at(0)
%__MODULE__{
description: description,
user_count: user_count,
status_count: get_in(nodeinfo, ["usage", "localPosts"]),
instance_type: String.to_atom(get_in(nodeinfo, ["software", "name"])),
version: get_in(nodeinfo, ["software", "version"])
}
else
# return the struct in both branches so the t() spec holds
%__MODULE__{
description: nil,
user_count: user_count,
status_count: nil,
instance_type: nil,
version: nil
}
end
end
@spec is_compatible_nodeinfo_version?(String.t()) :: boolean()


@@ -8,27 +8,19 @@ defmodule Backend.Crawler.Util do
# (e.g. https://mastodon.social/@demouser or https://pleroma.site/users/demouser)
@spec get_domain(String.t()) :: String.t()
def get_domain(url) do
String.slice(url, 8..-1)
|> String.split("/")
|> Enum.at(0)
[_match, domain] = Regex.run(~r/https?:\/\/([\w.-]+)\/.*/, url)
domain
end
@spec is_http_200?(HTTPoison.Response.t()) :: boolean
def is_http_200?(%{status_code: 200}) do
true
end
def is_http_200?(_) do
false
end
@spec is_after?(String.t(), NaiveDateTime.t() | nil) :: boolean()
@doc """
Returns true if the first argument is after the second.
"""
@spec is_after?(NaiveDateTime.t(), NaiveDateTime.t() | nil) :: boolean()
def is_after?(timestamp, threshold) do
if threshold == nil do
true
else
timestamp
|> NaiveDateTime.from_iso8601!()
# :second is the granularity used in the database
|> NaiveDateTime.truncate(:second)
|> NaiveDateTime.compare(threshold)