index.community/backend/lib/backend/crawler/crawlers/nodeinfo.ex

117 lines
3.5 KiB
Elixir

defmodule Backend.Crawler.Crawlers.Nodeinfo do
@moduledoc """
This module is slightly different from the other crawlers.
It doesn't implement the ApiCrawler spec because it isn't run as a self-contained crawler.
Instead, it's run before all the other crawlers.
This is to get the user count. Some servers don't publish this in other places (e.g. GNU Social, PeerTube) so we need
nodeinfo to know whether it's a personal instance or not.
"""
alias Backend.Crawler.ApiCrawler
require Logger
import Backend.Util
import Backend.Crawler.Util
defstruct [
:description,
:user_count,
:status_count,
:instance_type,
:version
]
@type t() :: %__MODULE__{
description: String.t(),
user_count: integer,
status_count: integer,
instance_type: ApiCrawler.instance_type(),
version: String.t()
}
@spec allows_crawling?(String.t()) :: boolean()
def allows_crawling?(domain) do
[
".well-known/nodeinfo"
]
|> Enum.map(fn endpoint -> "https://#{domain}#{endpoint}" end)
|> urls_are_crawlable?()
end
@spec crawl(String.t()) :: {:ok, t()} | {:error, nil}
def crawl(domain) do
with {:ok, nodeinfo_url} <- get_nodeinfo_url(domain),
{:ok, nodeinfo} <- get_nodeinfo(nodeinfo_url) do
{:ok, nodeinfo}
else
_other -> {:error, nil}
end
end
@spec get_nodeinfo_url(String.t()) ::
{:ok, String.t()} | {:error, Jason.DecodeError.t() | HTTPoison.Error.t()}
defp get_nodeinfo_url(domain) do
case get_and_decode("https://#{domain}/.well-known/nodeinfo") do
{:ok, response} -> {:ok, process_nodeinfo_url(response)}
{:error, err} -> {:error, err}
end
end
@spec process_nodeinfo_url(any()) :: String.t()
defp process_nodeinfo_url(response) do
response
|> Map.get("links")
|> Enum.filter(fn %{"rel" => rel} -> is_compatible_nodeinfo_version?(rel) end)
|> Kernel.hd()
|> Map.get("href")
end
@spec get_nodeinfo(String.t()) ::
{:ok, t()} | {:error, Jason.DecodeError.t() | HTTPoison.Error.t()}
defp get_nodeinfo(nodeinfo_url) do
case get_and_decode(nodeinfo_url) do
{:ok, nodeinfo} -> {:ok, process_nodeinfo(nodeinfo)}
{:error, err} -> {:error, err}
end
end
@spec process_nodeinfo(any()) :: t()
defp process_nodeinfo(nodeinfo) do
user_count = get_in(nodeinfo, ["usage", "users", "total"])
if is_above_user_threshold?(user_count) do
# Both of these are used, depending on the server implementation
description =
[
get_in(nodeinfo, ["metadata", "description"]),
get_in(nodeinfo, ["metadata", "nodeDescription"])
]
|> Enum.filter(fn d -> d != nil end)
|> Enum.at(0)
type = nodeinfo |> get_in(["software", "name"]) |> String.downcase() |> String.to_atom()
%__MODULE__{
description: description,
user_count: user_count,
status_count: get_in(nodeinfo, ["usage", "localPosts"]),
instance_type: type,
version: get_in(nodeinfo, ["software", "version"])
}
else
%{
description: nil,
user_count: user_count,
status_count: nil,
instance_type: nil,
version: nil
}
end
end
@spec is_compatible_nodeinfo_version?(String.t()) :: boolean()
defp is_compatible_nodeinfo_version?(schema_url) do
version = String.slice(schema_url, (String.length(schema_url) - 3)..-1)
Enum.member?(["1.0", "1.1", "2.0"], version)
end
end