refactor and add nodeinfo support
This commit is contained in:
parent
81cc940665
commit
72031c7d3e
|
@ -30,8 +30,8 @@ defmodule Backend.Crawler.ApiCrawler do
|
|||
@type t() :: %__MODULE__{
|
||||
version: String.t(),
|
||||
description: String.t(),
|
||||
user_count: integer,
|
||||
status_count: integer,
|
||||
user_count: integer | nil,
|
||||
status_count: integer | nil,
|
||||
peers: [String.t()],
|
||||
interactions: instance_interactions,
|
||||
statuses_seen: integer,
|
||||
|
@ -50,6 +50,7 @@ defmodule Backend.Crawler.ApiCrawler do
|
|||
|
||||
@doc """
|
||||
Crawl the instance at the given domain.
|
||||
Takes two arguments: the domain to crawl and the existing results (from nodeinfo).
|
||||
"""
|
||||
@callback crawl(String.t()) :: t()
|
||||
@callback crawl(String.t(), ApiCrawler.t()) :: t()
|
||||
end
|
||||
|
|
|
@ -4,7 +4,7 @@ defmodule Backend.Crawler do
|
|||
"""
|
||||
|
||||
alias __MODULE__
|
||||
alias Backend.Crawler.Crawlers.{Mastodon, Misskey}
|
||||
alias Backend.Crawler.Crawlers.{Mastodon, Misskey, Nodeinfo}
|
||||
alias Backend.Crawler.ApiCrawler
|
||||
alias Backend.{Crawl, CrawlInteraction, Repo, Instance, InstancePeer}
|
||||
import Ecto.Query
|
||||
|
@ -16,7 +16,6 @@ defmodule Backend.Crawler do
|
|||
:domain,
|
||||
# a list of ApiCrawlers that will be attempted
|
||||
:api_crawlers,
|
||||
:found_api?,
|
||||
:allows_crawling?,
|
||||
:result,
|
||||
:error
|
||||
|
@ -25,7 +24,6 @@ defmodule Backend.Crawler do
|
|||
@type t() :: %__MODULE__{
|
||||
domain: String.t(),
|
||||
api_crawlers: [ApiCrawler.t()],
|
||||
found_api?: boolean,
|
||||
allows_crawling?: boolean,
|
||||
result: ApiCrawler.t() | nil,
|
||||
error: String.t() | nil
|
||||
|
@ -37,14 +35,14 @@ defmodule Backend.Crawler do
|
|||
state = %Crawler{
|
||||
domain: domain,
|
||||
api_crawlers: [],
|
||||
found_api?: false,
|
||||
allows_crawling?: true,
|
||||
result: nil,
|
||||
error: nil
|
||||
}
|
||||
|
||||
state
|
||||
# register APICrawlers here
|
||||
# These crawlers are run in the order they're registered. Nodeinfo should be the first one.
|
||||
|> register(Nodeinfo)
|
||||
|> register(Mastodon)
|
||||
|> register(Misskey)
|
||||
# go!
|
||||
|
@ -56,33 +54,46 @@ defmodule Backend.Crawler do
|
|||
|
||||
# Adds a new ApiCrawler that run/1 will check.
|
||||
defp register(%Crawler{api_crawlers: crawlers} = state, api_crawler) do
|
||||
Map.put(state, :api_crawlers, [api_crawler | crawlers])
|
||||
Map.put(state, :api_crawlers, crawlers ++ [api_crawler])
|
||||
end
|
||||
|
||||
# Recursive function to check whether `domain` has an API that the head of the api_crawlers list can read.
|
||||
# If so, crawls it. If not, continues with the tail of the api_crawlers list.
|
||||
defp crawl(%Crawler{api_crawlers: [], domain: domain} = state) do
|
||||
Logger.debug("Found no compatible API for #{domain}")
|
||||
Map.put(state, :found_api?, false)
|
||||
state
|
||||
end
|
||||
|
||||
defp crawl(%Crawler{domain: domain, api_crawlers: [curr | remaining_crawlers]} = state) do
|
||||
# Nodeinfo is distinct from other crawlers in that
|
||||
# a) it should always be run first
|
||||
# b) it passes the results on to the next crawlers (e.g. user_count)
|
||||
defp crawl(%Crawler{api_crawlers: [Nodeinfo | remaining_crawlers], domain: domain} = state) do
|
||||
with true <- Nodeinfo.allows_crawling?(domain), {:ok, nodeinfo} <- Nodeinfo.crawl(domain) do
|
||||
Logger.debug("Found nodeinfo for #{domain}.")
|
||||
crawl(%Crawler{state | result: nodeinfo, api_crawlers: remaining_crawlers})
|
||||
else
|
||||
_ ->
|
||||
Logger.debug("Did not find nodeinfo for #{domain}.")
|
||||
crawl(%Crawler{state | api_crawlers: remaining_crawlers})
|
||||
end
|
||||
end
|
||||
|
||||
defp crawl(
|
||||
%Crawler{domain: domain, result: result, api_crawlers: [curr | remaining_crawlers]} =
|
||||
state
|
||||
) do
|
||||
if curr.is_instance_type?(domain) do
|
||||
Logger.debug("Found #{curr} instance")
|
||||
state = Map.put(state, :found_api?, true)
|
||||
|
||||
if curr.allows_crawling?(domain) do
|
||||
try do
|
||||
%Crawler{state | result: curr.crawl(domain), api_crawlers: []}
|
||||
%Crawler{state | result: curr.crawl(domain, result)}
|
||||
rescue
|
||||
e in HTTPoison.Error ->
|
||||
Map.put(state, :error, "HTTPoison error: " <> HTTPoison.Error.message(e))
|
||||
|
||||
e in Jason.DecodeError ->
|
||||
Map.put(state, :error, "Jason DecodeError: " <> Jason.DecodeError.message(e))
|
||||
|
||||
e in _ ->
|
||||
Map.put(state, :error, "Unknown error: " <> inspect(e))
|
||||
end
|
||||
else
|
||||
Logger.debug("#{domain} does not allow crawling.")
|
||||
|
@ -99,7 +110,6 @@ defmodule Backend.Crawler do
|
|||
defp save(%Crawler{
|
||||
domain: domain,
|
||||
result: result,
|
||||
found_api?: true,
|
||||
error: nil,
|
||||
allows_crawling?: true
|
||||
}) do
|
||||
|
@ -256,19 +266,19 @@ defmodule Backend.Crawler do
|
|||
|
||||
next_crawl = NaiveDateTime.add(now, crawl_interval_mins * 60, :second)
|
||||
|
||||
Repo.transaction(fn ->
|
||||
Repo.insert!(
|
||||
%Instance{
|
||||
domain: domain,
|
||||
base_domain: get_base_domain(domain),
|
||||
crawl_error: error,
|
||||
crawl_error_count: error_count,
|
||||
next_crawl: next_crawl
|
||||
},
|
||||
on_conflict: {:replace, [:base_domain, :crawl_error, :crawl_error_count, :next_crawl]},
|
||||
conflict_target: :domain
|
||||
)
|
||||
end)
|
||||
Repo.insert!(
|
||||
%Instance{
|
||||
domain: domain,
|
||||
base_domain: get_base_domain(domain),
|
||||
crawl_error: error,
|
||||
crawl_error_count: error_count,
|
||||
next_crawl: next_crawl,
|
||||
updated_at: now
|
||||
},
|
||||
on_conflict:
|
||||
{:replace, [:base_domain, :crawl_error, :crawl_error_count, :next_crawl, :updated_at]},
|
||||
conflict_target: :domain
|
||||
)
|
||||
|
||||
Appsignal.increment_counter("crawler.failure", 1)
|
||||
end
|
||||
|
|
|
@ -8,9 +8,9 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
|
|||
|
||||
@impl ApiCrawler
|
||||
def is_instance_type?(domain) do
|
||||
case get("https://#{domain}/api/v1/instance") do
|
||||
{:ok, response} -> if is_http_200?(response), do: has_title?(response.body), else: false
|
||||
{:error, _error} -> false
|
||||
case get_and_decode("https://#{domain}/api/v1/instance") do
|
||||
{:ok, %{"title" => _title}} -> true
|
||||
_other -> false
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -26,8 +26,8 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
|
|||
end
|
||||
|
||||
@impl ApiCrawler
|
||||
def crawl(domain) do
|
||||
instance = Jason.decode!(get!("https://#{domain}/api/v1/instance").body)
|
||||
def crawl(domain, _current_result) do
|
||||
instance = get_and_decode!("https://#{domain}/api/v1/instance")
|
||||
user_count = get_in(instance, ["stats", "user_count"])
|
||||
|
||||
if is_above_user_threshold?(user_count) or has_opted_in?(domain) do
|
||||
|
@ -51,12 +51,7 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
|
|||
|
||||
@spec crawl_large_instance(String.t(), any()) :: ApiCrawler.t()
|
||||
defp crawl_large_instance(domain, instance) do
|
||||
# servers may not publish peers
|
||||
peers =
|
||||
case get("https://#{domain}/api/v1/instance/peers") do
|
||||
{:ok, response} -> if is_http_200?(response), do: Jason.decode!(response.body), else: []
|
||||
{:error, _error} -> []
|
||||
end
|
||||
peers = get_peers(domain)
|
||||
|
||||
Logger.debug("Found #{length(peers)} peers.")
|
||||
|
||||
|
@ -124,11 +119,7 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
|
|||
|
||||
Logger.debug("Crawling #{endpoint}")
|
||||
|
||||
statuses =
|
||||
endpoint
|
||||
|> get!()
|
||||
|> Map.get(:body)
|
||||
|> Jason.decode!()
|
||||
statuses = get_and_decode!(endpoint)
|
||||
|
||||
filtered_statuses =
|
||||
statuses
|
||||
|
@ -166,12 +157,11 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
|
|||
end
|
||||
end
|
||||
|
||||
# To check if the endpoint works as expected
|
||||
@spec has_title?(String.t()) :: boolean
|
||||
defp has_title?(body) do
|
||||
case Jason.decode(body) do
|
||||
{:ok, decoded} -> Map.has_key?(decoded, "title")
|
||||
{:error, _error} -> false
|
||||
defp get_peers(domain) do
|
||||
# servers may not publish peers
|
||||
case get_and_decode("https://#{domain}/api/v1/instance/peers") do
|
||||
{:ok, peers} -> peers
|
||||
{:error, _err} -> []
|
||||
end
|
||||
end
|
||||
|
||||
|
|
|
@ -27,11 +27,9 @@ defmodule Backend.Crawler.Crawlers.Misskey do
|
|||
end
|
||||
|
||||
@impl ApiCrawler
|
||||
def crawl(domain) do
|
||||
with {:ok, %{status_code: 200, body: stats_body}} <- post("https://#{domain}/api/stats") do
|
||||
%{"originalUsersCount" => user_count, "originalNotesCount" => status_count} =
|
||||
Jason.decode!(stats_body)
|
||||
|
||||
def crawl(domain, _result) do
|
||||
with {:ok, %{"originalUsersCount" => user_count, "originalNotesCount" => status_count}} <-
|
||||
post_and_decode("https://#{domain}/api/stats") do
|
||||
if is_above_user_threshold?(user_count) or has_opted_in?(domain) do
|
||||
crawl_large_instance(domain, user_count, status_count)
|
||||
else
|
||||
|
@ -107,11 +105,7 @@ defmodule Backend.Crawler.Crawlers.Misskey do
|
|||
|
||||
Logger.debug("Crawling #{endpoint} with untilId=#{until_id}")
|
||||
|
||||
statuses =
|
||||
endpoint
|
||||
|> post!(Jason.encode!(params))
|
||||
|> Map.get(:body)
|
||||
|> Jason.decode!()
|
||||
statuses = post_and_decode!(endpoint, Jason.encode!(params))
|
||||
|
||||
filtered_statuses =
|
||||
statuses
|
||||
|
@ -151,35 +145,22 @@ defmodule Backend.Crawler.Crawlers.Misskey do
|
|||
end
|
||||
|
||||
@spec get_version_and_description(String.t()) ::
|
||||
{:ok, {String.t(), String.t()}} | {:error, String.t()}
|
||||
{:ok, {String.t(), String.t()}} | {:error, Jason.DecodeError.t() | HTTPoison.Error.t()}
|
||||
defp get_version_and_description(domain) do
|
||||
case post("https://#{domain}/api/meta") do
|
||||
{:ok, %{status_code: 200, body: body}} ->
|
||||
case Jason.decode(body) do
|
||||
{:ok, decoded} ->
|
||||
{:ok, {Map.get(decoded, "version"), Map.get(decoded, "description")}}
|
||||
case post_and_decode("https://#{domain}/api/meta") do
|
||||
{:ok, %{"version" => version, "description" => description}} ->
|
||||
{:ok, {version, description}}
|
||||
|
||||
{:error, _error} ->
|
||||
{:error, "invalid response"}
|
||||
end
|
||||
|
||||
_ ->
|
||||
{:error, "unsuccesful request"}
|
||||
{:error, err} ->
|
||||
{:error, err}
|
||||
end
|
||||
end
|
||||
|
||||
@spec get_peers(String.t()) :: {:ok, [String.t()]} | {:error, Jason.DecodeError.t()}
|
||||
defp get_peers(domain) do
|
||||
case get("https://#{domain}/api/v1/instance/peers") do
|
||||
{:ok, response} ->
|
||||
with %{status_code: 200, body: body} <- response do
|
||||
Jason.decode(body)
|
||||
else
|
||||
_ -> {:ok, []}
|
||||
end
|
||||
|
||||
{:error, _} ->
|
||||
{:ok, []}
|
||||
case get_and_decode("https://#{domain}/api/v1/instance/peers") do
|
||||
{:ok, peers} -> {:ok, peers}
|
||||
{:error, _} -> {:ok, []}
|
||||
end
|
||||
end
|
||||
|
||||
|
|
|
@ -0,0 +1,79 @@
|
|||
defmodule Backend.Crawler.Crawlers.Nodeinfo do
|
||||
require Logger
|
||||
import Backend.Util
|
||||
import Backend.Crawler.Util
|
||||
|
||||
@moduledoc """
|
||||
This module is slightly different from the other crawlers.
|
||||
It doesn't implement the ApiCrawler spec because it isn't run as a self-contained crawler.
|
||||
Instead, it's run before all the other crawlers.
|
||||
|
||||
This is to get the user count. Some servers don't publish this in other places (e.g. GNU Social, PeerTube) so we need
|
||||
nodeinfo to know whether it's a personal instance or not.
|
||||
"""
|
||||
|
||||
@spec allows_crawling?(String.t()) :: boolean()
|
||||
def allows_crawling?(domain) do
|
||||
[
|
||||
".well-known/nodeinfo"
|
||||
]
|
||||
|> Enum.map(fn endpoint -> "https://#{domain}#{endpoint}" end)
|
||||
|> urls_are_crawlable?()
|
||||
end
|
||||
|
||||
def crawl(domain) do
|
||||
with {:ok, nodeinfo_url} <- get_nodeinfo_url(domain),
|
||||
{:ok, nodeinfo} <- get_nodeinfo(nodeinfo_url) do
|
||||
{:ok, nodeinfo}
|
||||
else
|
||||
_other -> {:error, nil}
|
||||
end
|
||||
end
|
||||
|
||||
defp get_nodeinfo_url(domain) do
|
||||
case get_and_decode("https://#{domain}/.well-known/nodeinfo") do
|
||||
{:ok, response} -> {:ok, process_nodeinfo_url(response)}
|
||||
{:error, err} -> {:error, err}
|
||||
end
|
||||
end
|
||||
|
||||
defp process_nodeinfo_url(response) do
|
||||
response
|
||||
|> Map.get("links")
|
||||
|> Enum.filter(fn %{"rel" => rel} -> is_compatible_nodeinfo_version?(rel) end)
|
||||
|> Kernel.hd()
|
||||
|> Map.get("href")
|
||||
end
|
||||
|
||||
defp get_nodeinfo(nodeinfo_url) do
|
||||
case get_and_decode(nodeinfo_url) do
|
||||
{:ok, nodeinfo} -> {:ok, process_nodeinfo(nodeinfo)}
|
||||
{:error, err} -> {:error, err}
|
||||
end
|
||||
end
|
||||
|
||||
defp process_nodeinfo(nodeinfo) do
|
||||
# Both of these are used, depending on the server implementation
|
||||
description =
|
||||
[
|
||||
get_in(nodeinfo, ["metadata", "description"]),
|
||||
get_in(nodeinfo, ["metadata", "nodeDescription"])
|
||||
]
|
||||
|> Enum.filter(fn d -> d != nil end)
|
||||
|> Enum.at(0)
|
||||
|
||||
%{
|
||||
description: description,
|
||||
user_count: get_in(nodeinfo, ["usage", "users", "total"]),
|
||||
status_count: get_in(nodeinfo, ["usage", "localPosts"]),
|
||||
instance_type: String.to_atom(get_in(nodeinfo, ["software", "name"])),
|
||||
version: get_in(nodeinfo, ["software", "version"])
|
||||
}
|
||||
end
|
||||
|
||||
@spec is_compatible_nodeinfo_version?(String.t()) :: boolean()
|
||||
defp is_compatible_nodeinfo_version?(schema_url) do
|
||||
version = String.slice(schema_url, (String.length(schema_url) - 3)..-1)
|
||||
Enum.member?(["1.0", "1.1", "2.0"], version)
|
||||
end
|
||||
end
|
|
@ -36,49 +36,6 @@ defmodule Backend.Crawler.Util do
|
|||
end
|
||||
end
|
||||
|
||||
def get(url) do
|
||||
# TODO: add version number to user agent?
|
||||
HTTPoison.get(url, [{"User-Agent", get_config(:user_agent)}],
|
||||
hackney: [pool: :crawler],
|
||||
recv_timeout: 15000,
|
||||
timeout: 15000
|
||||
)
|
||||
end
|
||||
|
||||
@spec get!(binary) :: %{
|
||||
:__struct__ => HTTPoison.AsyncResponse | HTTPoison.Response,
|
||||
optional(:body) => any,
|
||||
optional(:headers) => [any],
|
||||
optional(:id) => reference,
|
||||
optional(:request) => HTTPoison.Request.t(),
|
||||
optional(:request_url) => any,
|
||||
optional(:status_code) => integer
|
||||
}
|
||||
def get!(url) do
|
||||
# TODO: add version number to user agent?
|
||||
HTTPoison.get!(url, [{"User-Agent", get_config(:user_agent)}],
|
||||
hackney: [pool: :crawler],
|
||||
recv_timeout: 15000,
|
||||
timeout: 15000
|
||||
)
|
||||
end
|
||||
|
||||
def post(url, body \\ "") do
|
||||
HTTPoison.post(url, body, [{"User-Agent", get_config(:user_agent)}],
|
||||
hackney: [pool: :crawler],
|
||||
recv_timeout: 15000,
|
||||
timeout: 15000
|
||||
)
|
||||
end
|
||||
|
||||
def post!(url, body \\ "") do
|
||||
HTTPoison.post!(url, body, [{"User-Agent", get_config(:user_agent)}],
|
||||
hackney: [pool: :crawler],
|
||||
recv_timeout: 15000,
|
||||
timeout: 15000
|
||||
)
|
||||
end
|
||||
|
||||
@spec urls_are_crawlable?([String.t()]) :: boolean()
|
||||
def urls_are_crawlable?(urls) do
|
||||
user_agent = get_config(:user_agent)
|
||||
|
|
|
@ -145,4 +145,54 @@ defmodule Backend.Util do
|
|||
def convert_keys_to_atoms(map) do
|
||||
map |> Map.new(fn {k, v} -> {String.to_atom(k), v} end)
|
||||
end
|
||||
|
||||
@doc """
|
||||
Gets and decodes a HTTP response.
|
||||
"""
|
||||
@spec get_and_decode(String.t()) ::
|
||||
{:ok, any()} | {:error, Jason.DecodeError.t() | HTTPoison.Error.t()}
|
||||
def get_and_decode(url) do
|
||||
case HTTPoison.get(url, [{"User-Agent", get_config(:user_agent)}],
|
||||
hackney: [pool: :crawler],
|
||||
recv_timeout: 15000,
|
||||
timeout: 15000
|
||||
) do
|
||||
{:ok, %{status_code: 200, body: body}} -> Jason.decode(body)
|
||||
{:ok, _} -> {:error, %HTTPoison.Error{reason: "Non-200 response"}}
|
||||
{:error, err} -> {:error, err}
|
||||
end
|
||||
end
|
||||
|
||||
@spec get_and_decode!(String.t()) :: any()
|
||||
def get_and_decode!(url) do
|
||||
case get_and_decode(url) do
|
||||
{:ok, decoded} -> decoded
|
||||
{:error, error} -> raise error
|
||||
end
|
||||
end
|
||||
|
||||
@doc """
|
||||
POSTS to a HTTP endpoint and decodes the JSON response.
|
||||
"""
|
||||
@spec post_and_decode(String.t(), String.t()) ::
|
||||
{:ok, any()} | {:error, Jason.DecodeError.t() | HTTPoison.Error.t()}
|
||||
def post_and_decode(url, body \\ "") do
|
||||
case HTTPoison.post(url, body, [{"User-Agent", get_config(:user_agent)}],
|
||||
hackney: [pool: :crawler],
|
||||
recv_timeout: 15000,
|
||||
timeout: 15000
|
||||
) do
|
||||
{:ok, %{status_code: 200, body: response_body}} -> Jason.decode(response_body)
|
||||
{:ok, _} -> {:error, %HTTPoison.Error{reason: "Non-200 response"}}
|
||||
{:error, err} -> {:error, err}
|
||||
end
|
||||
end
|
||||
|
||||
@spec post_and_decode!(String.t(), String.t()) :: any()
|
||||
def post_and_decode!(url, body \\ "") do
|
||||
case post_and_decode(url, body) do
|
||||
{:ok, decoded} -> decoded
|
||||
{:error, error} -> raise error
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -14,10 +14,7 @@ defmodule BackendWeb.AdminLoginController do
|
|||
# TODO: this assumes mastodon/pleroma API
|
||||
cleaned_domain = clean_domain(domain)
|
||||
|
||||
instance_data =
|
||||
HTTPoison.get!("https://#{cleaned_domain}/api/v1/instance")
|
||||
|> Map.get(:body)
|
||||
|> Jason.decode!()
|
||||
instance_data = get_and_decode!("https://#{cleaned_domain}/api/v1/instance")
|
||||
|
||||
render(conn, "show.json", instance_data: instance_data, cleaned_domain: cleaned_domain)
|
||||
end
|
||||
|
@ -25,10 +22,7 @@ defmodule BackendWeb.AdminLoginController do
|
|||
def create(conn, %{"domain" => domain, "type" => type}) do
|
||||
cleaned_domain = clean_domain(domain)
|
||||
|
||||
instance_data =
|
||||
HTTPoison.get!("https://#{cleaned_domain}/api/v1/instance")
|
||||
|> Map.get(:body)
|
||||
|> Jason.decode!()
|
||||
instance_data = get_and_decode!("https://#{cleaned_domain}/api/v1/instance")
|
||||
|
||||
error =
|
||||
cond do
|
||||
|
|
Loading…
Reference in New Issue