refactor and add nodeinfo support

Tao Bror Bojlén 2019-08-09 14:15:22 +03:00
parent 81cc940665
commit 72031c7d3e
No known key found for this signature in database
GPG Key ID: C6EC7AAB905F9E6F
8 changed files with 197 additions and 135 deletions


@@ -30,8 +30,8 @@ defmodule Backend.Crawler.ApiCrawler do
   @type t() :: %__MODULE__{
           version: String.t(),
           description: String.t(),
-          user_count: integer,
-          status_count: integer,
+          user_count: integer | nil,
+          status_count: integer | nil,
           peers: [String.t()],
           interactions: instance_interactions,
           statuses_seen: integer,
@@ -50,6 +50,7 @@
   @doc """
   Crawl the instance at the given domain.
+  Takes two arguments: the domain to crawl and the existing results (from nodeinfo).
   """
-  @callback crawl(String.t()) :: t()
+  @callback crawl(String.t(), ApiCrawler.t()) :: t()
 end
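
Note: with this change, every ApiCrawler implementation receives the results of the
nodeinfo pass as its second argument. A minimal sketch of a conforming crawler; the
module name, return values, and merge step are hypothetical, not part of this commit:

    defmodule Backend.Crawler.Crawlers.ExampleCrawler do
      alias Backend.Crawler.ApiCrawler

      @behaviour ApiCrawler

      @impl ApiCrawler
      def is_instance_type?(_domain), do: true

      @impl ApiCrawler
      def allows_crawling?(_domain), do: true

      @impl ApiCrawler
      def crawl(_domain, nodeinfo_result) do
        # user_count/status_count may be nil now, so keep whatever the
        # nodeinfo pass already found and only add what this API knows.
        Map.merge(nodeinfo_result, %{peers: [], interactions: %{}, statuses_seen: 0})
      end
    end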


@@ -4,7 +4,7 @@ defmodule Backend.Crawler do
   """

   alias __MODULE__
-  alias Backend.Crawler.Crawlers.{Mastodon, Misskey}
+  alias Backend.Crawler.Crawlers.{Mastodon, Misskey, Nodeinfo}
   alias Backend.Crawler.ApiCrawler
   alias Backend.{Crawl, CrawlInteraction, Repo, Instance, InstancePeer}
   import Ecto.Query
@@ -16,7 +16,6 @@ defmodule Backend.Crawler do
     :domain,
     # a list of ApiCrawlers that will be attempted
     :api_crawlers,
-    :found_api?,
     :allows_crawling?,
     :result,
     :error
@@ -25,7 +24,6 @@
   @type t() :: %__MODULE__{
           domain: String.t(),
           api_crawlers: [ApiCrawler.t()],
-          found_api?: boolean,
           allows_crawling?: boolean,
           result: ApiCrawler.t() | nil,
           error: String.t() | nil
@@ -37,14 +35,14 @@
     state = %Crawler{
       domain: domain,
       api_crawlers: [],
-      found_api?: false,
       allows_crawling?: true,
       result: nil,
       error: nil
     }

     state
-    # register APICrawlers here
+    # These crawlers are run in the order they're registered. Nodeinfo should be the first one.
+    |> register(Nodeinfo)
     |> register(Mastodon)
     |> register(Misskey)
     # go!
@@ -56,33 +54,46 @@
   # Adds a new ApiCrawler that run/1 will check.
   defp register(%Crawler{api_crawlers: crawlers} = state, api_crawler) do
-    Map.put(state, :api_crawlers, [api_crawler | crawlers])
+    Map.put(state, :api_crawlers, crawlers ++ [api_crawler])
   end

   # Recursive function to check whether `domain` has an API that the head of the api_crawlers list can read.
   # If so, crawls it. If not, continues with the tail of the api_crawlers list.
   defp crawl(%Crawler{api_crawlers: [], domain: domain} = state) do
     Logger.debug("Found no compatible API for #{domain}")
-    Map.put(state, :found_api?, false)
+    state
   end

-  defp crawl(%Crawler{domain: domain, api_crawlers: [curr | remaining_crawlers]} = state) do
+  # Nodeinfo is distinct from other crawlers in that
+  # a) it should always be run first
+  # b) it passes the results on to the next crawlers (e.g. user_count)
+  defp crawl(%Crawler{api_crawlers: [Nodeinfo | remaining_crawlers], domain: domain} = state) do
+    with true <- Nodeinfo.allows_crawling?(domain), {:ok, nodeinfo} <- Nodeinfo.crawl(domain) do
+      Logger.debug("Found nodeinfo for #{domain}.")
+      crawl(%Crawler{state | result: nodeinfo, api_crawlers: remaining_crawlers})
+    else
+      _ ->
+        Logger.debug("Did not find nodeinfo for #{domain}.")
+        crawl(%Crawler{state | api_crawlers: remaining_crawlers})
+    end
+  end
+
+  defp crawl(
+         %Crawler{domain: domain, result: result, api_crawlers: [curr | remaining_crawlers]} =
+           state
+       ) do
     if curr.is_instance_type?(domain) do
       Logger.debug("Found #{curr} instance")
-      state = Map.put(state, :found_api?, true)

       if curr.allows_crawling?(domain) do
         try do
-          %Crawler{state | result: curr.crawl(domain), api_crawlers: []}
+          %Crawler{state | result: curr.crawl(domain, result)}
         rescue
           e in HTTPoison.Error ->
             Map.put(state, :error, "HTTPoison error: " <> HTTPoison.Error.message(e))

           e in Jason.DecodeError ->
             Map.put(state, :error, "Jason DecodeError: " <> Jason.DecodeError.message(e))

           e in _ ->
             Map.put(state, :error, "Unknown error: " <> inspect(e))
         end
       else
         Logger.debug("#{domain} does not allow crawling.")
@@ -99,7 +110,6 @@
   defp save(%Crawler{
          domain: domain,
          result: result,
-         found_api?: true,
          error: nil,
          allows_crawling?: true
        }) do
@@ -256,19 +266,19 @@
     next_crawl = NaiveDateTime.add(now, crawl_interval_mins * 60, :second)

-    Repo.transaction(fn ->
-      Repo.insert!(
-        %Instance{
-          domain: domain,
-          base_domain: get_base_domain(domain),
-          crawl_error: error,
-          crawl_error_count: error_count,
-          next_crawl: next_crawl
-        },
-        on_conflict: {:replace, [:base_domain, :crawl_error, :crawl_error_count, :next_crawl]},
-        conflict_target: :domain
-      )
-    end)
+    Repo.insert!(
+      %Instance{
+        domain: domain,
+        base_domain: get_base_domain(domain),
+        crawl_error: error,
+        crawl_error_count: error_count,
+        next_crawl: next_crawl,
+        updated_at: now
+      },
+      on_conflict:
+        {:replace, [:base_domain, :crawl_error, :crawl_error_count, :next_crawl, :updated_at]},
+      conflict_target: :domain
+    )

     Appsignal.increment_counter("crawler.failure", 1)
   end
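
Note: the switch from prepending ([api_crawler | crawlers]) to appending
(crawlers ++ [api_crawler]) is what keeps Nodeinfo at the head of the list; with the
old prepend, the last-registered crawler would have run first. An illustrative trace
of the registration order (the comments are mine, not output from the code):

    state
    |> register(Nodeinfo)   # api_crawlers: [Nodeinfo]
    |> register(Mastodon)   # api_crawlers: [Nodeinfo, Mastodon]
    |> register(Misskey)    # api_crawlers: [Nodeinfo, Mastodon, Misskey]

    # crawl/1 then pattern-matches [Nodeinfo | rest] first, seeds `result` with the
    # nodeinfo data, and each later crawler receives it via curr.crawl(domain, result).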


@@ -8,9 +8,9 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
   @impl ApiCrawler
   def is_instance_type?(domain) do
-    case get("https://#{domain}/api/v1/instance") do
-      {:ok, response} -> if is_http_200?(response), do: has_title?(response.body), else: false
-      {:error, _error} -> false
+    case get_and_decode("https://#{domain}/api/v1/instance") do
+      {:ok, %{"title" => _title}} -> true
+      _other -> false
     end
   end
@@ -26,8 +26,8 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
   end

   @impl ApiCrawler
-  def crawl(domain) do
-    instance = Jason.decode!(get!("https://#{domain}/api/v1/instance").body)
+  def crawl(domain, _current_result) do
+    instance = get_and_decode!("https://#{domain}/api/v1/instance")

     user_count = get_in(instance, ["stats", "user_count"])

     if is_above_user_threshold?(user_count) or has_opted_in?(domain) do
@@ -51,12 +51,7 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
   @spec crawl_large_instance(String.t(), any()) :: ApiCrawler.t()
   defp crawl_large_instance(domain, instance) do
-    # servers may not publish peers
-    peers =
-      case get("https://#{domain}/api/v1/instance/peers") do
-        {:ok, response} -> if is_http_200?(response), do: Jason.decode!(response.body), else: []
-        {:error, _error} -> []
-      end
+    peers = get_peers(domain)

     Logger.debug("Found #{length(peers)} peers.")
@@ -124,11 +119,7 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
     Logger.debug("Crawling #{endpoint}")

-    statuses =
-      endpoint
-      |> get!()
-      |> Map.get(:body)
-      |> Jason.decode!()
+    statuses = get_and_decode!(endpoint)

     filtered_statuses =
       statuses
@@ -166,12 +157,11 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
     end
   end

-  # To check if the endpoint works as expected
-  @spec has_title?(String.t()) :: boolean
-  defp has_title?(body) do
-    case Jason.decode(body) do
-      {:ok, decoded} -> Map.has_key?(decoded, "title")
-      {:error, _error} -> false
+  defp get_peers(domain) do
+    # servers may not publish peers
+    case get_and_decode("https://#{domain}/api/v1/instance/peers") do
+      {:ok, peers} -> peers
+      {:error, _err} -> []
     end
   end


@@ -27,11 +27,9 @@ defmodule Backend.Crawler.Crawlers.Misskey do
   end

   @impl ApiCrawler
-  def crawl(domain) do
-    with {:ok, %{status_code: 200, body: stats_body}} <- post("https://#{domain}/api/stats") do
-      %{"originalUsersCount" => user_count, "originalNotesCount" => status_count} =
-        Jason.decode!(stats_body)
-
+  def crawl(domain, _result) do
+    with {:ok, %{"originalUsersCount" => user_count, "originalNotesCount" => status_count}} <-
+           post_and_decode("https://#{domain}/api/stats") do
       if is_above_user_threshold?(user_count) or has_opted_in?(domain) do
         crawl_large_instance(domain, user_count, status_count)
       else
@@ -107,11 +105,7 @@ defmodule Backend.Crawler.Crawlers.Misskey do
     Logger.debug("Crawling #{endpoint} with untilId=#{until_id}")

-    statuses =
-      endpoint
-      |> post!(Jason.encode!(params))
-      |> Map.get(:body)
-      |> Jason.decode!()
+    statuses = post_and_decode!(endpoint, Jason.encode!(params))

     filtered_statuses =
       statuses
@@ -151,35 +145,22 @@ defmodule Backend.Crawler.Crawlers.Misskey do
   end

   @spec get_version_and_description(String.t()) ::
-          {:ok, {String.t(), String.t()}} | {:error, String.t()}
+          {:ok, {String.t(), String.t()}} | {:error, Jason.DecodeError.t() | HTTPoison.Error.t()}
   defp get_version_and_description(domain) do
-    case post("https://#{domain}/api/meta") do
-      {:ok, %{status_code: 200, body: body}} ->
-        case Jason.decode(body) do
-          {:ok, decoded} ->
-            {:ok, {Map.get(decoded, "version"), Map.get(decoded, "description")}}
+    case post_and_decode("https://#{domain}/api/meta") do
+      {:ok, %{"version" => version, "description" => description}} ->
+        {:ok, {version, description}}

-          {:error, _error} ->
-            {:error, "invalid response"}
-        end
-
-      _ ->
-        {:error, "unsuccesful request"}
+      {:error, err} ->
+        {:error, err}
     end
   end

   @spec get_peers(String.t()) :: {:ok, [String.t()]} | {:error, Jason.DecodeError.t()}
   defp get_peers(domain) do
-    case get("https://#{domain}/api/v1/instance/peers") do
-      {:ok, response} ->
-        with %{status_code: 200, body: body} <- response do
-          Jason.decode(body)
-        else
-          _ -> {:ok, []}
-        end
-
-      {:error, _} ->
-        {:ok, []}
+    case get_and_decode("https://#{domain}/api/v1/instance/peers") do
+      {:ok, peers} -> {:ok, peers}
+      {:error, _} -> {:ok, []}
     end
   end


@@ -0,0 +1,79 @@
+defmodule Backend.Crawler.Crawlers.Nodeinfo do
+  require Logger
+  import Backend.Util
+  import Backend.Crawler.Util
+
+  @moduledoc """
+  This module is slightly different from the other crawlers.
+  It doesn't implement the ApiCrawler spec because it isn't run as a self-contained crawler.
+  Instead, it's run before all the other crawlers.
+
+  This is to get the user count. Some servers don't publish this in other places (e.g. GNU Social, PeerTube) so we need
+  nodeinfo to know whether it's a personal instance or not.
+  """
+
+  @spec allows_crawling?(String.t()) :: boolean()
+  def allows_crawling?(domain) do
+    [
+      "/.well-known/nodeinfo"
+    ]
+    |> Enum.map(fn endpoint -> "https://#{domain}#{endpoint}" end)
+    |> urls_are_crawlable?()
+  end
+
+  def crawl(domain) do
+    with {:ok, nodeinfo_url} <- get_nodeinfo_url(domain),
+         {:ok, nodeinfo} <- get_nodeinfo(nodeinfo_url) do
+      {:ok, nodeinfo}
+    else
+      _other -> {:error, nil}
+    end
+  end
+
+  defp get_nodeinfo_url(domain) do
+    case get_and_decode("https://#{domain}/.well-known/nodeinfo") do
+      {:ok, response} -> {:ok, process_nodeinfo_url(response)}
+      {:error, err} -> {:error, err}
+    end
+  end
+
+  defp process_nodeinfo_url(response) do
+    response
+    |> Map.get("links")
+    |> Enum.filter(fn %{"rel" => rel} -> is_compatible_nodeinfo_version?(rel) end)
+    |> Kernel.hd()
+    |> Map.get("href")
+  end
+
+  defp get_nodeinfo(nodeinfo_url) do
+    case get_and_decode(nodeinfo_url) do
+      {:ok, nodeinfo} -> {:ok, process_nodeinfo(nodeinfo)}
+      {:error, err} -> {:error, err}
+    end
+  end
+
+  defp process_nodeinfo(nodeinfo) do
+    # Both of these are used, depending on the server implementation
+    description =
+      [
+        get_in(nodeinfo, ["metadata", "description"]),
+        get_in(nodeinfo, ["metadata", "nodeDescription"])
+      ]
+      |> Enum.filter(fn d -> d != nil end)
+      |> Enum.at(0)
+
+    %{
+      description: description,
+      user_count: get_in(nodeinfo, ["usage", "users", "total"]),
+      status_count: get_in(nodeinfo, ["usage", "localPosts"]),
+      instance_type: String.to_atom(get_in(nodeinfo, ["software", "name"])),
+      version: get_in(nodeinfo, ["software", "version"])
+    }
+  end
+
+  @spec is_compatible_nodeinfo_version?(String.t()) :: boolean()
+  defp is_compatible_nodeinfo_version?(schema_url) do
+    version = String.slice(schema_url, (String.length(schema_url) - 3)..-1)
+    Enum.member?(["1.0", "1.1", "2.0"], version)
+  end
+end
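
Note: for reference, this is roughly how process_nodeinfo/1 maps a nodeinfo 2.0
document onto the crawler's fields. The payload below is an abridged, made-up example:

    nodeinfo = %{
      "software" => %{"name" => "pleroma", "version" => "1.0.0"},
      "usage" => %{"users" => %{"total" => 42}, "localPosts" => 1234},
      "metadata" => %{"nodeDescription" => "a small example instance"}
    }

    process_nodeinfo(nodeinfo)
    # => %{
    #      description: "a small example instance",
    #      user_count: 42,
    #      status_count: 1234,
    #      instance_type: :pleroma,
    #      version: "1.0.0"
    #    }

    # is_compatible_nodeinfo_version?/1 looks only at the last three characters
    # of the schema URL, e.g. ".../ns/schema/2.0" -> "2.0".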


@@ -36,49 +36,6 @@ defmodule Backend.Crawler.Util do
     end
   end

-  def get(url) do
-    # TODO: add version number to user agent?
-    HTTPoison.get(url, [{"User-Agent", get_config(:user_agent)}],
-      hackney: [pool: :crawler],
-      recv_timeout: 15000,
-      timeout: 15000
-    )
-  end
-
-  @spec get!(binary) :: %{
-          :__struct__ => HTTPoison.AsyncResponse | HTTPoison.Response,
-          optional(:body) => any,
-          optional(:headers) => [any],
-          optional(:id) => reference,
-          optional(:request) => HTTPoison.Request.t(),
-          optional(:request_url) => any,
-          optional(:status_code) => integer
-        }
-  def get!(url) do
-    # TODO: add version number to user agent?
-    HTTPoison.get!(url, [{"User-Agent", get_config(:user_agent)}],
-      hackney: [pool: :crawler],
-      recv_timeout: 15000,
-      timeout: 15000
-    )
-  end
-
-  def post(url, body \\ "") do
-    HTTPoison.post(url, body, [{"User-Agent", get_config(:user_agent)}],
-      hackney: [pool: :crawler],
-      recv_timeout: 15000,
-      timeout: 15000
-    )
-  end
-
-  def post!(url, body \\ "") do
-    HTTPoison.post!(url, body, [{"User-Agent", get_config(:user_agent)}],
-      hackney: [pool: :crawler],
-      recv_timeout: 15000,
-      timeout: 15000
-    )
-  end
-
   @spec urls_are_crawlable?([String.t()]) :: boolean()
   def urls_are_crawlable?(urls) do
     user_agent = get_config(:user_agent)


@@ -145,4 +145,54 @@ defmodule Backend.Util do
   def convert_keys_to_atoms(map) do
     map |> Map.new(fn {k, v} -> {String.to_atom(k), v} end)
   end
+
+  @doc """
+  Gets and decodes an HTTP response.
+  """
+  @spec get_and_decode(String.t()) ::
+          {:ok, any()} | {:error, Jason.DecodeError.t() | HTTPoison.Error.t()}
+  def get_and_decode(url) do
+    case HTTPoison.get(url, [{"User-Agent", get_config(:user_agent)}],
+           hackney: [pool: :crawler],
+           recv_timeout: 15000,
+           timeout: 15000
+         ) do
+      {:ok, %{status_code: 200, body: body}} -> Jason.decode(body)
+      {:ok, _} -> {:error, %HTTPoison.Error{reason: "Non-200 response"}}
+      {:error, err} -> {:error, err}
+    end
+  end
+
+  @spec get_and_decode!(String.t()) :: any()
+  def get_and_decode!(url) do
+    case get_and_decode(url) do
+      {:ok, decoded} -> decoded
+      {:error, error} -> raise error
+    end
+  end
+
+  @doc """
+  POSTs to an HTTP endpoint and decodes the JSON response.
+  """
+  @spec post_and_decode(String.t(), String.t()) ::
+          {:ok, any()} | {:error, Jason.DecodeError.t() | HTTPoison.Error.t()}
+  def post_and_decode(url, body \\ "") do
+    case HTTPoison.post(url, body, [{"User-Agent", get_config(:user_agent)}],
+           hackney: [pool: :crawler],
+           recv_timeout: 15000,
+           timeout: 15000
+         ) do
+      {:ok, %{status_code: 200, body: response_body}} -> Jason.decode(response_body)
+      {:ok, _} -> {:error, %HTTPoison.Error{reason: "Non-200 response"}}
+      {:error, err} -> {:error, err}
+    end
+  end
+
+  @spec post_and_decode!(String.t(), String.t()) :: any()
+  def post_and_decode!(url, body \\ "") do
+    case post_and_decode(url, body) do
+      {:ok, decoded} -> decoded
+      {:error, error} -> raise error
+    end
+  end
 end
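
Note: a sketch of how call sites can use the new helpers (the URL is an example).
The tuple variants are for explicit handling; the bang variants raise, which the
crawler's rescue clauses for HTTPoison.Error and Jason.DecodeError turn into a
stored error:

    case get_and_decode("https://example.social/api/v1/instance") do
      {:ok, decoded} -> {:ok, Map.get(decoded, "title")}
      {:error, %HTTPoison.Error{reason: reason}} -> {:error, reason}
      {:error, %Jason.DecodeError{} = err} -> {:error, err}
    end

    # Non-200 responses are folded into %HTTPoison.Error{reason: "Non-200 response"},
    # so callers only ever see the two error structs above.

    instance = get_and_decode!("https://example.social/api/v1/instance")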


@@ -14,10 +14,7 @@ defmodule BackendWeb.AdminLoginController do
     # TODO: this assumes mastodon/pleroma API
     cleaned_domain = clean_domain(domain)

-    instance_data =
-      HTTPoison.get!("https://#{cleaned_domain}/api/v1/instance")
-      |> Map.get(:body)
-      |> Jason.decode!()
+    instance_data = get_and_decode!("https://#{cleaned_domain}/api/v1/instance")

     render(conn, "show.json", instance_data: instance_data, cleaned_domain: cleaned_domain)
   end
@@ -25,10 +22,7 @@
   def create(conn, %{"domain" => domain, "type" => type}) do
     cleaned_domain = clean_domain(domain)

-    instance_data =
-      HTTPoison.get!("https://#{cleaned_domain}/api/v1/instance")
-      |> Map.get(:body)
-      |> Jason.decode!()
+    instance_data = get_and_decode!("https://#{cleaned_domain}/api/v1/instance")

     error =
       cond do