add GNU social crawler

Tao Bror Bojlén 2019-08-09 18:17:29 +03:00
parent 72031c7d3e
commit 509924ed52
No known key found for this signature in database
GPG key ID: C6EC7AAB905F9E6F
7 changed files with 281 additions and 48 deletions


@@ -11,10 +11,12 @@ defmodule Backend.Crawler.ApiCrawler do
* Make sure to check the most recent crawl of the instance so you don't re-crawl old statuses
"""
alias Backend.Crawler.Crawlers.Nodeinfo
# {domain_mentioned, count}
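# e.g. %{"mastodon.social" => 3, "pleroma.site" => 1} (illustrative)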
@type instance_interactions :: %{String.t() => integer}
@type instance_type :: :mastodon | :pleroma | :gab | :misskey
@type instance_type :: :mastodon | :pleroma | :gab | :misskey | :gnusocial
defstruct [
:version,
@@ -40,8 +42,9 @@ defmodule Backend.Crawler.ApiCrawler do
@doc """
Check whether the instance at the given domain is of the type that this ApiCrawler implements.
Arguments are the instance domain and the nodeinfo results.
"""
@callback is_instance_type?(String.t()) :: boolean()
@callback is_instance_type?(String.t(), Nodeinfo.t()) :: boolean()
@doc """
Check whether the instance allows crawling according to its robots.txt or otherwise.
@@ -52,5 +55,5 @@ defmodule Backend.Crawler.ApiCrawler do
Crawl the instance at the given domain.
Takes two arguments: the domain to crawl and the existing results (from nodeinfo).
"""
@callback crawl(String.t(), ApiCrawler.t()) :: t()
@callback crawl(String.t(), Nodeinfo.t()) :: t()
end


@@ -4,7 +4,7 @@ defmodule Backend.Crawler do
"""
alias __MODULE__
alias Backend.Crawler.Crawlers.{Mastodon, Misskey, Nodeinfo}
alias Backend.Crawler.Crawlers.{GnuSocial, Mastodon, Misskey, Nodeinfo}
alias Backend.Crawler.ApiCrawler
alias Backend.{Crawl, CrawlInteraction, Repo, Instance, InstancePeer}
import Ecto.Query
@@ -17,6 +17,7 @@ defmodule Backend.Crawler do
# a list of ApiCrawlers that will be attempted
:api_crawlers,
:allows_crawling?,
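# set to true once any crawler (including nodeinfo) has returned a result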
:found_api?,
:result,
:error
]
@@ -25,6 +26,7 @@ defmodule Backend.Crawler do
domain: String.t(),
api_crawlers: [ApiCrawler.t()],
allows_crawling?: boolean,
found_api?: boolean,
result: ApiCrawler.t() | nil,
error: String.t() | nil
}
@@ -36,6 +38,7 @@ defmodule Backend.Crawler do
domain: domain,
api_crawlers: [],
allows_crawling?: true,
found_api?: false,
result: nil,
error: nil
}
@@ -45,6 +48,7 @@ defmodule Backend.Crawler do
|> register(Nodeinfo)
|> register(Mastodon)
|> register(Misskey)
|> register(GnuSocial)
# go!
|> crawl()
|> save()
@@ -70,7 +74,8 @@ defmodule Backend.Crawler do
defp crawl(%Crawler{api_crawlers: [Nodeinfo | remaining_crawlers], domain: domain} = state) do
with true <- Nodeinfo.allows_crawling?(domain), {:ok, nodeinfo} <- Nodeinfo.crawl(domain) do
Logger.debug("Found nodeinfo for #{domain}.")
crawl(%Crawler{state | result: nodeinfo, api_crawlers: remaining_crawlers})
result = Map.merge(nodeinfo, %{peers: [], interactions: %{}, statuses_seen: 0})
crawl(%Crawler{state | result: result, found_api?: true, api_crawlers: remaining_crawlers})
else
_ ->
Logger.debug("Did not find nodeinfo for #{domain}.")
@@ -82,12 +87,12 @@ defmodule Backend.Crawler do
%Crawler{domain: domain, result: result, api_crawlers: [curr | remaining_crawlers]} =
state
) do
if curr.is_instance_type?(domain) do
if curr.is_instance_type?(domain, result) do
Logger.debug("Found #{curr} instance")
if curr.allows_crawling?(domain) do
try do
%Crawler{state | result: curr.crawl(domain, result)}
%Crawler{state | result: curr.crawl(domain, result), found_api?: true}
rescue
e in HTTPoison.Error ->
Map.put(state, :error, "HTTPoison error: " <> HTTPoison.Error.message(e))
@@ -111,7 +116,8 @@ defmodule Backend.Crawler do
domain: domain,
result: result,
error: nil,
allows_crawling?: true
allows_crawling?: true,
found_api?: true
}) do
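# Only crawls that found an API, were allowed to crawl, and produced no error are saved.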
now = get_now()
@@ -250,7 +256,7 @@ defmodule Backend.Crawler do
cond do
not allows_crawling -> "robots.txt"
error == nil -> "no api found"
true -> "unknown error"
true -> error
end
# The "+1" is this error!


@@ -0,0 +1,178 @@
defmodule Backend.Crawler.Crawlers.GnuSocial do
alias Backend.Crawler.ApiCrawler
alias Backend.Crawler.Crawlers.Nodeinfo
import Backend.Crawler.Util
import Backend.Util
require Logger
@behaviour ApiCrawler
@impl ApiCrawler
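# Nodeinfo's software.name is atomized upstream, so GNU social arrives here as :gnusocial.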
def is_instance_type?(_domain, nodeinfo_result) do
nodeinfo_result != nil and Map.get(nodeinfo_result, :instance_type) == :gnusocial
end
@impl ApiCrawler
def allows_crawling?(domain) do
[
"/api/statuses/public_timeline.json"
]
|> Enum.map(fn endpoint -> "https://#{domain}#{endpoint}" end)
|> urls_are_crawlable?()
end
@impl ApiCrawler
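# Instances below the personal-instance user threshold keep only their nodeinfo stats.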
def crawl(domain, nodeinfo_result) do
if nodeinfo_result |> Map.get(:user_count) |> is_above_user_threshold?() do
crawl_large_instance(domain, nodeinfo_result)
else
nodeinfo_result
end
end
@spec crawl_large_instance(String.t(), Nodeinfo.t()) :: ApiCrawler.t()
defp crawl_large_instance(domain, nodeinfo_result) do
status_datetime_threshold =
NaiveDateTime.utc_now()
|> NaiveDateTime.add(get_config(:status_age_limit_days) * 24 * 3600 * -1, :second)
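# e.g. with status_age_limit_days: 28, this is four weeks ago (illustrative value)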
# Don't get any statuses older than this
min_timestamp = max_datetime(get_last_crawl_timestamp(domain), status_datetime_threshold)
{interactions, statuses_seen} = get_interactions(domain, min_timestamp)
Map.merge(nodeinfo_result, %{
interactions: interactions,
statuses_seen: statuses_seen,
peers: []
})
end
@spec get_interactions(
String.t(),
NaiveDateTime.t(),
String.t() | nil,
ApiCrawler.instance_interactions(),
integer()
) :: {ApiCrawler.instance_interactions(), integer()}
defp get_interactions(
domain,
min_timestamp,
max_id \\ nil,
interactions \\ %{},
statuses_seen \\ 0
) do
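# The Twitter-compatible timeline API paginates via max_id: a request returns
# statuses with IDs at or below that value.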
endpoint = "https://#{domain}/api/statuses/public_timeline.json"
endpoint =
if max_id != nil do
endpoint <> "?max_id=#{max_id}"
else
endpoint
end
Logger.debug("Crawling #{endpoint}")
statuses = get_and_decode!(endpoint)
# Filter to statuses that are in the correct timeframe
filtered_statuses =
statuses
|> Enum.filter(fn s ->
s["created_at"]
|> parse_timestamp()
|> is_after?(min_timestamp)
end)
if length(filtered_statuses) > 0 do
# Filter down further to statuses that a) aren't faves and b) aren't from #nobot users
eligible_statuses =
filtered_statuses |> Enum.filter(fn s -> not is_fave?(s) and not has_nobot?(s) end)
# get statuses that are eligible (i.e. users don't have #nobot in their profile), have mentions, and are not faves
interactions =
eligible_statuses
|> statuses_to_interactions()
|> merge_count_maps(interactions)
statuses_seen =
eligible_statuses
|> Kernel.length()
|> Kernel.+(statuses_seen)
oldest_status = Enum.at(filtered_statuses, -1)
oldest_status_datetime =
oldest_status
|> Map.get("created_at")
|> parse_timestamp()
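# Paginate further only while we're inside the time window, under the status
# cap, and no statuses on this page were dropped by the timestamp filter.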
if NaiveDateTime.compare(oldest_status_datetime, min_timestamp) == :gt and
statuses_seen < get_config(:status_count_limit) and
length(filtered_statuses) == length(statuses) do
get_interactions(domain, min_timestamp, oldest_status["id"], interactions, statuses_seen)
else
{interactions, statuses_seen}
end
else
{interactions, statuses_seen}
end
end
@spec statuses_to_interactions(any()) :: ApiCrawler.instance_interactions()
defp statuses_to_interactions(statuses) do
statuses
|> Enum.filter(fn status -> is_mention?(status) end)
|> Enum.map(fn status -> extract_mentions_from_status(status) end)
|> Enum.reduce(%{}, fn map, acc ->
# merge_count_maps sums per-domain counts; a plain Map.merge would overwrite them
merge_count_maps(acc, map)
end)
end
# Checks whether the status contains one or more mentions
@spec is_mention?(any()) :: boolean()
defp is_mention?(%{"attentions" => []}) do
false
end
defp is_mention?(_status) do
true
end
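# GNU social mixes favorites into the public timeline; their URIs contain a
# ":fave:" segment, e.g. "tag:social.example,2019:fave:1:note:2" (illustrative).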
@spec is_fave?(any()) :: boolean()
defp is_fave?(status) do
uri_elements = status |> Map.get("uri") |> String.split(":")
Enum.member?(uri_elements, "fave")
end
@spec has_nobot?(any()) :: boolean()
defp has_nobot?(status) do
case get_in(status, ["user", "description"]) do
nil ->
false
description ->
description
|> String.downcase()
|> String.contains?("nobot")
end
end
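# "attentions" holds the mentioned users; each entry's "profileurl" is a URL
# like "https://mastodon.social/@demouser" (illustrative).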
@spec extract_mentions_from_status(any()) :: ApiCrawler.instance_interactions()
defp extract_mentions_from_status(status) do
status["attentions"]
|> Enum.map(fn mention -> get_domain(mention["profileurl"]) end)
|> Enum.reduce(%{}, fn domain, acc ->
Map.update(acc, domain, 1, &(&1 + 1))
end)
end
# Parses the messed-up time format that GNU social uses
# Like seriously, it's 2019, why *wouldn't* you use iso8601?
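# e.g. "Fri Aug 09 18:17:29 +0300 2019" (illustrative)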
@spec parse_timestamp(String.t()) :: NaiveDateTime.t()
defp parse_timestamp(timestamp) do
timestamp
|> Timex.parse!("{WDshort} {Mshort} {0D} {h24}:{0m}:{0s} {0Z} {YYYY}")
|> Timex.to_naive_datetime()
end
end


@@ -7,10 +7,15 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
@behaviour ApiCrawler
@impl ApiCrawler
def is_instance_type?(domain) do
case get_and_decode("https://#{domain}/api/v1/instance") do
{:ok, %{"title" => _title}} -> true
_other -> false
def is_instance_type?(domain, result) do
# We might already know that this is a Pleroma instance from nodeinfo
if result != nil and Map.get(result, :instance_type) == :pleroma do
true
else
case get_and_decode("https://#{domain}/api/v1/instance") do
{:ok, %{"title" => _title}} -> true
_other -> false
end
end
end
@@ -123,7 +128,11 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
filtered_statuses =
statuses
|> Enum.filter(fn s -> is_after?(s["created_at"], min_timestamp) end)
|> Enum.filter(fn s ->
s["created_at"]
|> NaiveDateTime.from_iso8601!()
|> is_after?(min_timestamp)
end)
if length(filtered_statuses) > 0 do
# get statuses that are eligible (i.e. users don't have #nobot in their profile) and have mentions


@@ -7,10 +7,15 @@ defmodule Backend.Crawler.Crawlers.Misskey do
require Logger
@impl ApiCrawler
def is_instance_type?(domain) do
case get_version_and_description(domain) do
{:ok, _} -> true
{:error, _} -> false
def is_instance_type?(domain, result) do
# We may already know that this is a Misskey instance from nodeinfo
if result != nil and Map.get(result, :instance_type) == :misskey do
true
else
case get_version_and_description(domain) do
{:ok, _} -> true
{:error, _} -> false
end
end
end
@@ -109,7 +114,11 @@ defmodule Backend.Crawler.Crawlers.Misskey do
filtered_statuses =
statuses
|> Enum.filter(fn s -> is_after?(s["createdAt"], min_timestamp) end)
|> Enum.filter(fn s ->
s["createdAt"]
|> NaiveDateTime.from_iso8601!()
|> is_after?(min_timestamp)
end)
if length(filtered_statuses) > 0 do
# get statuses that are eligible (i.e. users don't have #nobot in their profile) and have mentions


@@ -1,4 +1,5 @@
defmodule Backend.Crawler.Crawlers.Nodeinfo do
alias Backend.Crawler.ApiCrawler
require Logger
import Backend.Util
import Backend.Crawler.Util
@@ -12,6 +13,22 @@ defmodule Backend.Crawler.Crawlers.Nodeinfo do
nodeinfo to know whether it's a personal instance or not.
"""
defstruct [
:description,
:user_count,
:status_count,
:instance_type,
:version
]
@type t() :: %__MODULE__{
description: String.t(),
user_count: integer,
status_count: integer,
instance_type: ApiCrawler.instance_type(),
version: String.t()
}
@spec allows_crawling?(String.t()) :: boolean()
def allows_crawling?(domain) do
[
@@ -21,6 +38,7 @@ defmodule Backend.Crawler.Crawlers.Nodeinfo do
|> urls_are_crawlable?()
end
@spec crawl(String.t()) :: {:ok, t()} | {:error, nil}
def crawl(domain) do
with {:ok, nodeinfo_url} <- get_nodeinfo_url(domain),
{:ok, nodeinfo} <- get_nodeinfo(nodeinfo_url) do
@@ -30,6 +48,8 @@ defmodule Backend.Crawler.Crawlers.Nodeinfo do
end
end
@spec get_nodeinfo_url(String.t()) ::
{:ok, String.t()} | {:error, Jason.DecodeError.t() | HTTPoison.Error.t()}
defp get_nodeinfo_url(domain) do
case get_and_decode("https://#{domain}/.well-known/nodeinfo") do
{:ok, response} -> {:ok, process_nodeinfo_url(response)}
@@ -37,6 +57,7 @@ defmodule Backend.Crawler.Crawlers.Nodeinfo do
end
end
@spec process_nodeinfo_url(any()) :: String.t()
defp process_nodeinfo_url(response) do
response
|> Map.get("links")
@@ -45,6 +66,8 @@ defmodule Backend.Crawler.Crawlers.Nodeinfo do
|> Map.get("href")
end
@spec get_nodeinfo(String.t()) ::
{:ok, t()} | {:error, Jason.DecodeError.t() | HTTPoison.Error.t()}
defp get_nodeinfo(nodeinfo_url) do
case get_and_decode(nodeinfo_url) do
{:ok, nodeinfo} -> {:ok, process_nodeinfo(nodeinfo)}
@@ -52,23 +75,36 @@ defmodule Backend.Crawler.Crawlers.Nodeinfo do
end
end
@spec process_nodeinfo(any()) :: t()
defp process_nodeinfo(nodeinfo) do
# Both of these are used, depending on the server implementation
description =
[
get_in(nodeinfo, ["metadata", "description"]),
get_in(nodeinfo, ["metadata", "nodeDescription"])
]
|> Enum.filter(fn d -> d != nil end)
|> Enum.at(0)
user_count = get_in(nodeinfo, ["usage", "users", "total"])
%{
description: description,
user_count: get_in(nodeinfo, ["usage", "users", "total"]),
status_count: get_in(nodeinfo, ["usage", "localPosts"]),
instance_type: String.to_atom(get_in(nodeinfo, ["software", "name"])),
version: get_in(nodeinfo, ["software", "version"])
}
if is_above_user_threshold?(user_count) do
# Both of these are used, depending on the server implementation
description =
[
get_in(nodeinfo, ["metadata", "description"]),
get_in(nodeinfo, ["metadata", "nodeDescription"])
]
|> Enum.filter(fn d -> d != nil end)
|> Enum.at(0)
%__MODULE__{
description: description,
user_count: user_count,
status_count: get_in(nodeinfo, ["usage", "localPosts"]),
instance_type: String.to_atom(get_in(nodeinfo, ["software", "name"])),
version: get_in(nodeinfo, ["software", "version"])
}
else
# return the struct in both branches so the t() spec holds
%__MODULE__{
description: nil,
user_count: user_count,
status_count: nil,
instance_type: nil,
version: nil
}
end
end
@spec is_compatible_nodeinfo_version?(String.t()) :: boolean()


@@ -8,27 +8,19 @@ defmodule Backend.Crawler.Util do
# (e.g. https://mastodon.social/@demouser or https://pleroma.site/users/demouser)
@spec get_domain(String.t()) :: String.t()
def get_domain(url) do
String.slice(url, 8..-1)
|> String.split("/")
|> Enum.at(0)
[_match, domain] = Regex.run(~r/https?:\/\/([\w.-]+)\/.*/, url)
domain
end
@spec is_http_200?(HTTPoison.Response.t()) :: boolean
def is_http_200?(%{status_code: 200}) do
true
end
def is_http_200?(_) do
false
end
@spec is_after?(String.t(), NaiveDateTime.t() | nil) :: boolean()
@doc """
Returns true if the first argument is after the second.
"""
@spec is_after?(NaiveDateTime.t(), NaiveDateTime.t() | nil) :: boolean()
def is_after?(timestamp, threshold) do
if threshold == nil do
true
else
timestamp
|> NaiveDateTime.from_iso8601!()
# :second is the granularity used in the database
|> NaiveDateTime.truncate(:second)
|> NaiveDateTime.compare(threshold)