Add Misskey crawler

Tao Bojlén 2019-08-06 18:04:30 +00:00
parent cc541b9ddf
commit 75e66affe3
9 changed files with 278 additions and 22 deletions

View file

@@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
+- Added Misskey crawler.
 ### Changed
 ### Deprecated

View file

@ -61,7 +61,7 @@ config :backend, :crawler,
status_count_limit: 100, status_count_limit: 100,
personal_instance_threshold: 5, personal_instance_threshold: 5,
crawl_interval_mins: 60, crawl_interval_mins: 60,
crawl_workers: 1, crawl_workers: 50,
blacklist: [ blacklist: [
"gab.best", "gab.best",
"4chan.icu" "4chan.icu"

View file

@@ -14,7 +14,7 @@ defmodule Backend.Crawler.ApiCrawler do
   # {domain_mentioned, count}
   @type instance_interactions :: %{String.t() => integer}

-  @type instance_type :: :mastodon | :pleroma | :gab
+  @type instance_type :: :mastodon | :pleroma | :gab | :misskey

   defstruct [
     :version,

View file

@@ -4,7 +4,7 @@ defmodule Backend.Crawler do
   """

   alias __MODULE__
-  alias Backend.Crawler.Crawlers.Mastodon
+  alias Backend.Crawler.Crawlers.{Mastodon, Misskey}
   alias Backend.Crawler.ApiCrawler
   alias Backend.{Crawl, CrawlInteraction, Repo, Instance, InstancePeer}
   import Ecto.Query
@@ -46,6 +46,7 @@ defmodule Backend.Crawler do
     state
     # register APICrawlers here
     |> register(Mastodon)
+    |> register(Misskey)
     # go!
     |> crawl()
     |> save()
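Each module passed to register/2 implements the ApiCrawler behaviour, and the pipeline presumably picks the first registered crawler that recognizes a given domain. A hypothetical sketch of the callbacks involved, inferred from the @impl annotations in the crawler modules below (the return types are assumptions; the real module also defines the result struct):

```elixir
# Sketch only; the callback names mirror the @impl annotations in this diff.
defmodule ApiCrawlerSketch do
  @callback is_instance_type?(String.t()) :: boolean()
  @callback allows_crawling?(String.t()) :: boolean()
  @callback crawl(String.t()) :: map()
end
```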

View file

@@ -2,9 +2,7 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
   require Logger
   import Backend.Crawler.Util
   import Backend.Util
-  import Ecto.Query
   alias Backend.Crawler.ApiCrawler
-  alias Backend.{Instance, Repo}

   @behaviour ApiCrawler
@@ -18,31 +16,21 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
   @impl ApiCrawler
   def allows_crawling?(domain) do
-    endpoints = [
+    [
       "/api/v1/instance",
       "/api/v1/instance/peers",
       "/api/v1/timelines/public"
     ]
-
-    user_agent = get_config(:user_agent)
-
-    endpoints
     |> Enum.map(fn endpoint -> "https://#{domain}#{endpoint}" end)
-    |> Enum.all?(fn endpoint -> Gollum.crawlable?(user_agent, endpoint) != :uncrawlable end)
+    |> urls_are_crawlable?()
   end

   @impl ApiCrawler
   def crawl(domain) do
     instance = Jason.decode!(get!("https://#{domain}/api/v1/instance").body)
+    user_count = get_in(instance, ["stats", "user_count"])

-    has_opted_in =
-      case Instance |> select([:opt_in]) |> Repo.get_by(domain: domain) do
-        %{opt_in: true} -> true
-        _ -> false
-      end
-
-    if get_in(instance, ["stats", "user_count"]) > get_config(:personal_instance_threshold) or
-         has_opted_in do
+    if is_above_user_threshold?(user_count) or has_opted_in?(domain) do
       crawl_large_instance(domain, instance)
     else
       Map.merge(
@@ -148,7 +136,11 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
     if length(filtered_statuses) > 0 do
       # get statuses that are eligible (i.e. users don't have #nobot in their profile) and have mentions
-      interactions = Map.merge(interactions, statuses_to_interactions(filtered_statuses))
+      interactions =
+        filtered_statuses
+        |> statuses_to_interactions()
+        |> merge_count_maps(interactions)

       statuses_seen = statuses_seen + length(filtered_statuses)

       status_datetime_threshold =
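The switch away from plain Map.merge/2 fixes a subtle counting bug: when the same instance shows up in interactions from more than one page of statuses, Map.merge/2 keeps only one of the two counts instead of adding them. merge_count_maps/2 itself is not shown in this diff; a minimal sketch of what it presumably does:

```elixir
# Assumed implementation of Backend.Util.merge_count_maps/2 (not part of
# this diff): merge two count maps, summing values on key collisions.
def merge_count_maps(map1, map2) do
  Map.merge(map1, map2, fn _key, count1, count2 -> count1 + count2 end)
end
```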

View file

@@ -0,0 +1,222 @@
defmodule Backend.Crawler.Crawlers.Misskey do
  alias Backend.Crawler.ApiCrawler
  @behaviour ApiCrawler

  import Backend.Crawler.Util
  import Backend.Util
  require Logger

  @impl ApiCrawler
  def is_instance_type?(domain) do
    case get_version_and_description(domain) do
      {:ok, _} -> true
      {:error, _} -> false
    end
  end

  @impl ApiCrawler
  def allows_crawling?(domain) do
    [
      "/api/meta",
      "/api/stats",
      "/api/notes/local-timeline",
      "/api/v1/instance/peers"
    ]
    |> Enum.map(fn endpoint -> "https://#{domain}#{endpoint}" end)
    |> urls_are_crawlable?()
  end

  @impl ApiCrawler
  def crawl(domain) do
    with {:ok, %{status_code: 200, body: stats_body}} <- post("https://#{domain}/api/stats") do
      %{"originalUsersCount" => user_count, "originalNotesCount" => status_count} =
        Jason.decode!(stats_body)

      if is_above_user_threshold?(user_count) or has_opted_in?(domain) do
        crawl_large_instance(domain, user_count, status_count)
      else
        %{
          version: nil,
          description: nil,
          user_count: user_count,
          status_count: nil,
          peers: [],
          interactions: %{},
          statuses_seen: 0,
          instance_type: nil
        }
      end
    end
  end
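Unlike Mastodon's GET-based REST endpoints, Misskey's API takes POST requests with JSON bodies, which is why this commit also adds post/2 and post!/2 helpers to Backend.Crawler.Util (later in this diff). A hypothetical /api/stats response, decoded the way crawl/1 does it (field values invented; the real response carries more fields than the two matched on):

```elixir
stats_body = ~s({"originalUsersCount": 120, "originalNotesCount": 45231})

%{"originalUsersCount" => user_count, "originalNotesCount" => status_count} =
  Jason.decode!(stats_body)

# user_count == 120; status_count == 45231
```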
  @spec crawl_large_instance(String.t(), integer(), integer()) :: ApiCrawler.t()
  defp crawl_large_instance(domain, user_count, status_count) do
    status_datetime_threshold =
      NaiveDateTime.utc_now()
      |> NaiveDateTime.add(get_config(:status_age_limit_days) * 24 * 3600 * -1, :second)

    # Don't get any statuses older than this
    min_timestamp =
      max_datetime(get_last_successful_crawl_timestamp(domain), status_datetime_threshold)

    {interactions, statuses_seen} = get_interactions(domain, min_timestamp)
    {:ok, {version, description}} = get_version_and_description(domain)
    {:ok, peers} = get_peers(domain)

    %{
      instance_type: :misskey,
      # From stats endpoint
      user_count: user_count,
      status_count: status_count,
      # From meta endpoint
      version: version,
      description: description,
      # From timeline
      interactions: interactions,
      statuses_seen: statuses_seen,
      # From peers endpoint
      peers: peers
    }
  end

  @spec get_interactions(
          String.t(),
          NaiveDateTime.t(),
          String.t() | nil,
          ApiCrawler.instance_interactions(),
          integer()
        ) :: {ApiCrawler.instance_interactions(), integer()}
  defp get_interactions(
         domain,
         min_timestamp,
         until_id \\ nil,
         interactions \\ %{},
         statuses_seen \\ 0
       ) do
    endpoint = "https://#{domain}/api/notes/local-timeline"

    params = %{
      limit: 20
    }

    params =
      if until_id != nil do
        Map.put(params, :untilId, until_id)
      else
        params
      end

    Logger.debug("Crawling #{endpoint} with untilId=#{until_id}")

    statuses =
      endpoint
      |> post!(Jason.encode!(params))
      |> Map.get(:body)
      |> Jason.decode!()

    filtered_statuses =
      statuses
      |> Enum.filter(fn s -> is_after?(s["createdAt"], min_timestamp) end)

    if length(filtered_statuses) > 0 do
      # get statuses that are eligible (i.e. users don't have #nobot in their profile) and have mentions
      interactions =
        filtered_statuses
        |> statuses_to_interactions()
        |> merge_count_maps(interactions)

      # Don't count renotes in the # of statuses seen
      statuses_seen =
        filtered_statuses
        |> Enum.filter(&is_original_status?(&1))
        |> Kernel.length()
        |> Kernel.+(statuses_seen)

      oldest_status = Enum.at(filtered_statuses, -1)

      oldest_status_datetime =
        oldest_status
        |> Map.get("createdAt")
        |> NaiveDateTime.from_iso8601!()

      if NaiveDateTime.compare(oldest_status_datetime, min_timestamp) == :gt and
           statuses_seen < get_config(:status_count_limit) and
           length(filtered_statuses) == length(statuses) do
        get_interactions(domain, min_timestamp, oldest_status["id"], interactions, statuses_seen)
      else
        {interactions, statuses_seen}
      end
    else
      {interactions, statuses_seen}
    end
  end
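get_interactions/5 pages backwards through the local timeline: responses are newest-first, so the last element of each page is the oldest, and its id becomes the next untilId. The second request in such a walk might look like this (domain and note id invented):

```elixir
params = %{limit: 20, untilId: "8abc123xyz"}
post!("https://misskey.example/api/notes/local-timeline", Jason.encode!(params))

# The recursion stops once the oldest note on a page is older than
# min_timestamp, the status_count_limit is reached, or some notes on the
# page were filtered out (meaning everything older is out of range too).
```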
  @spec get_version_and_description(String.t()) ::
          {:ok, {String.t(), String.t()}} | {:error, String.t()}
  defp get_version_and_description(domain) do
    case post("https://#{domain}/api/meta") do
      {:ok, %{status_code: 200, body: body}} ->
        case Jason.decode(body) do
          {:ok, decoded} ->
            {:ok, {Map.get(decoded, "version"), Map.get(decoded, "description")}}

          {:error, _error} ->
            {:error, "invalid response"}
        end

      _ ->
        {:error, "unsuccessful request"}
    end
  end

  @spec get_peers(String.t()) :: {:ok, [String.t()]} | {:error, Jason.DecodeError.t()}
  defp get_peers(domain) do
    case get("https://#{domain}/api/v1/instance/peers") do
      {:ok, response} ->
        with %{status_code: 200, body: body} <- response do
          Jason.decode(body)
        else
          _ -> {:ok, []}
        end

      {:error, _} ->
        {:ok, []}
    end
  end

  @spec statuses_to_interactions(any()) :: ApiCrawler.instance_interactions()
  defp statuses_to_interactions(statuses) do
    statuses
    |> Enum.filter(fn status -> is_mention?(status) end)
    |> Enum.map(fn status -> extract_mentions_from_status(status) end)
    |> Enum.reduce(%{}, fn map, acc ->
      Map.merge(acc, map)
    end)
  end

  # Checks whether
  # * it's not a renote (a.k.a. a boost)
  # * the status contains one or more mentions
  @spec is_mention?(any()) :: boolean()
  defp is_mention?(status) do
    has_mentions = Map.get(status, "mentions") != nil
    is_original_status?(status) and has_mentions
  end

  # Checks whether it's not a renote (a.k.a. a boost)
  @spec is_original_status?(any()) :: boolean()
  defp is_original_status?(status) do
    Map.get(status, "renoteId") == nil
  end

  @spec extract_mentions_from_status(any()) :: ApiCrawler.instance_interactions()
  defp extract_mentions_from_status(status) do
    status_content = Map.get(status, "text")

    Regex.scan(~r/@\w+@([\w.-]+)/, status_content)
    |> Enum.map(fn [_match, domain] -> domain end)
    |> Enum.reduce(%{}, fn domain, acc ->
      Map.update(acc, domain, 1, &(&1 + 1))
    end)
  end
end
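The mention regex is worth a worked example: it captures the domain half of every @user@domain handle in a note's text (sample note invented; extract_mentions_from_status/1 is private, so the call here is illustrative):

```elixir
text = "cc @alice@mastodon.example and @bob@misskey.example"

Regex.scan(~r/@\w+@([\w.-]+)/, text)
# => [["@alice@mastodon.example", "mastodon.example"],
#     ["@bob@misskey.example", "misskey.example"]]

extract_mentions_from_status(%{"text" => text})
# => %{"mastodon.example" => 1, "misskey.example" => 1}
```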

View file

@@ -1,6 +1,8 @@
 defmodule Backend.Crawler.Util do
   require Logger
+  alias Backend.{Instance, Repo}
   import Backend.Util
+  import Ecto.Query

   # Gets the domain from a Mastodon/Pleroma account URL
   # (e.g. https://mastodon.social/@demouser or https://pleroma.site/users/demouser)
@@ -60,4 +62,41 @@ defmodule Backend.Crawler.Util do
       timeout: 15000
     )
   end
+
+  def post(url, body \\ "") do
+    HTTPoison.post(url, body, [{"User-Agent", get_config(:user_agent)}],
+      hackney: [pool: :crawler],
+      recv_timeout: 15000,
+      timeout: 15000
+    )
+  end
+
+  def post!(url, body \\ "") do
+    HTTPoison.post!(url, body, [{"User-Agent", get_config(:user_agent)}],
+      hackney: [pool: :crawler],
+      recv_timeout: 15000,
+      timeout: 15000
+    )
+  end
+
+  @spec urls_are_crawlable?([String.t()]) :: boolean()
+  def urls_are_crawlable?(urls) do
+    user_agent = get_config(:user_agent)
+
+    urls
+    |> Enum.all?(fn url -> Gollum.crawlable?(user_agent, url) != :uncrawlable end)
+  end
+
+  @spec has_opted_in?(String.t()) :: boolean()
+  def has_opted_in?(domain) do
+    case Instance |> select([:opt_in]) |> Repo.get_by(domain: domain) do
+      %{opt_in: true} -> true
+      _ -> false
+    end
+  end
+
+  @spec is_above_user_threshold?(integer) :: boolean()
+  def is_above_user_threshold?(user_count) do
+    user_count > get_config(:personal_instance_threshold)
+  end
 end
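A note on the new helpers: Gollum.crawlable?/2 returns :crawlable, :uncrawlable, or :undefined (when no robots.txt is found), so treating everything except :uncrawlable as allowed means instances without a robots.txt are crawled by default. And with personal_instance_threshold: 5 from the config above, the threshold check is strict (hypothetical calls):

```elixir
is_above_user_threshold?(5)
# => false (an instance with exactly 5 users is still treated as personal)

is_above_user_threshold?(6)
# => true
```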

View file

@ -30,7 +30,7 @@ defmodule Backend.Util do
blacklist = blacklist =
case get_config(:blacklist) do case get_config(:blacklist) do
nil -> [] nil -> []
_ -> get_config(:blacklist) other -> other
end end
blacklist blacklist

View file

@@ -40,4 +40,4 @@ export interface IInstanceDomainPath {
 }

 // We could also extract the values from the server response, but this would slow things down...
-export const INSTANCE_TYPES = ["mastodon", "gab", "pleroma"];
+export const INSTANCE_TYPES = ["mastodon", "gab", "pleroma", "misskey"];