index.community/backend/lib/backend/crawler/crawlers/mastodon.ex

194 lines
5.9 KiB
Elixir

defmodule Backend.Crawler.Crawlers.Mastodon do
require Logger
import Backend.Crawler.Util
alias Backend.Crawler.ApiCrawler
@behaviour ApiCrawler
@impl ApiCrawler
def is_instance_type?(domain) do
  # An instance looks like Mastodon when /api/v1/instance answers HTTP 200
  # with a JSON document containing a "title" key.
  case get("https://#{domain}/api/v1/instance") do
    {:ok, response} -> is_http_200?(response) and has_title?(response.body)
    {:error, _error} -> false
  end
end
@impl ApiCrawler
def crawl(domain) do
  instance = Jason.decode!(get!("https://#{domain}/api/v1/instance").body)

  # Only crawl timelines/peers of instances above the personal-instance
  # threshold; small instances just report their basic stats.
  if get_in(instance, ["stats", "user_count"]) > get_config(:personal_instance_threshold) do
    crawl_large_instance(domain, instance)
  else
    instance
    |> Map.take(["version", "description"])
    |> Map.merge(Map.take(instance["stats"], ["user_count", "status_count"]))
    |> Map.new(fn {key, value} -> {String.to_atom(key), value} end)
    |> Map.merge(%{peers: [], interactions: %{}, statuses_seen: 0})
  end
end
# Crawls a non-personal instance: fetches its peer list and samples its local
# public timeline for mentions of other instances.
@spec crawl_large_instance(String.t(), any()) :: ApiCrawler.t()
defp crawl_large_instance(domain, instance) do
  # Servers may not publish peers; treat any failure as "no peers".
  peers =
    case get("https://#{domain}/api/v1/instance/peers") do
      {:ok, response} -> if is_http_200?(response), do: Jason.decode!(response.body), else: []
      {:error, _error} -> []
    end

  Logger.debug("Found #{length(peers)} peers.")

  {interactions, statuses_seen} = get_interactions(domain)

  Logger.debug(
    "#{domain}: found #{interactions |> Map.values() |> Enum.sum()} mentions in #{statuses_seen} statuses."
  )

  Map.merge(
    Map.merge(
      Map.take(instance, ["version", "description"]),
      Map.take(instance["stats"], ["user_count", "status_count"])
    )
    # Keys come from the fixed whitelists above, so String.to_atom/1 is bounded.
    |> Map.new(fn {k, v} -> {String.to_atom(k), v} end),
    %{peers: peers, interactions: interactions, statuses_seen: statuses_seen}
  )
end
# Recursively pages through the instance's local public timeline, accumulating
# a map of mentioned-domain => mention-count plus the number of statuses seen.
@spec get_interactions(
        String.t(),
        String.t() | nil,
        Calendar.naive_datetime() | nil,
        ApiCrawler.instance_interactions(),
        integer
      ) :: {ApiCrawler.instance_interactions(), integer}
defp get_interactions(
       domain,
       max_id \\ nil,
       min_timestamp \\ nil,
       interactions \\ %{},
       statuses_seen \\ 0
     ) do
  # If `statuses_seen == 0`, it's the first call of this function, which means we want to query the database for the
  # most recent status we have.
  min_timestamp =
    if statuses_seen == 0 do
      get_last_successful_crawl_timestamp(domain)
    else
      min_timestamp
    end

  endpoint = "https://#{domain}/api/v1/timelines/public?local=true"

  # `max_id` pages backwards through the timeline.
  endpoint =
    if max_id do
      endpoint <> "&max_id=#{max_id}"
    else
      endpoint
    end

  Logger.debug("Crawling #{endpoint}")

  statuses =
    endpoint
    |> get!()
    |> Map.get(:body)
    |> Jason.decode!()

  # Drop statuses we already saw during a previous crawl.
  filtered_statuses =
    statuses
    |> Enum.filter(fn s -> is_after?(s["created_at"], min_timestamp) end)

  if filtered_statuses != [] do
    # get statuses that are eligible (i.e. users don't have #nobot in their profile) and have mentions.
    # Sum counts for domains already seen on earlier pages — a plain Map.merge/2
    # would overwrite the accumulated count instead of adding to it.
    interactions =
      Map.merge(interactions, statuses_to_interactions(filtered_statuses), fn _domain, a, b ->
        a + b
      end)

    statuses_seen = statuses_seen + length(filtered_statuses)

    # Never crawl statuses older than the configured age limit.
    status_datetime_threshold =
      NaiveDateTime.utc_now()
      |> NaiveDateTime.add(get_config(:status_age_limit_days) * 24 * 3600 * -1, :second)

    oldest_status = Enum.at(filtered_statuses, -1)

    oldest_status_datetime =
      oldest_status
      |> (fn s -> s["created_at"] end).()
      |> NaiveDateTime.from_iso8601!()

    # Recurse only while (a) the oldest status is within the age limit,
    # (b) we're under the status-count limit, and (c) the timestamp filter kept
    # the whole page (if it dropped any, older pages were already crawled).
    if NaiveDateTime.compare(oldest_status_datetime, status_datetime_threshold) == :gt and
         statuses_seen < get_config(:status_count_limit) and
         length(filtered_statuses) == length(statuses) do
      get_interactions(domain, oldest_status["id"], min_timestamp, interactions, statuses_seen)
    else
      {interactions, statuses_seen}
    end
  else
    {interactions, statuses_seen}
  end
end
# To check if the endpoint works as expected: a valid instance document always
# carries a "title" key.
@spec has_title?(String.t()) :: boolean
defp has_title?(body) do
  with {:ok, decoded} <- Jason.decode(body) do
    Map.has_key?(decoded, "title")
  else
    {:error, _error} -> false
  end
end
# Checks whether the status contains one or more mentions: true for any
# "mentions" value other than nil or the empty list.
defp is_mention?(status) do
  status["mentions"] not in [nil, []]
end
# Checks if the author of the status has "nobot" anywhere in their profile
# (bio note or profile metadata fields), case-insensitively.
defp has_nobot?(status) do
  account = status["account"]

  # Concatenate every metadata field's name and value into one searchable string.
  fields =
    Enum.map_join(account["fields"], "", fn %{"name" => name, "value" => value} ->
      name <> value
    end)

  # this also means that any users who mentioned ethnobotany in their profiles will be excluded lol ¯\_(ツ)_/¯
  (account["note"] <> fields)
  |> String.downcase()
  |> String.contains?("nobot")
end
# This checks if the status
# a) contains one or more mentions, and
# b) that the person posting doesn't have "nobot" in their profile.
# Evaluation order matters: the nobot check only runs for statuses with mentions.
defp is_eligible?(status) do
  cond do
    not is_mention?(status) -> false
    has_nobot?(status) -> false
    true -> true
  end
end
# Turns a single status into a map of mentioned-domain => mention-count.
@spec extract_mentions_from_status(any()) :: ApiCrawler.instance_interactions()
defp extract_mentions_from_status(status) do
  Enum.reduce(status["mentions"], %{}, fn mention, counts ->
    mentioned_domain = get_domain(mention["url"])
    Map.update(counts, mentioned_domain, 1, &(&1 + 1))
  end)
end
# Aggregates a page of statuses into a map of mentioned-domain => mention-count,
# considering only eligible statuses (mentions present, author not opted out).
@spec statuses_to_interactions(any()) :: ApiCrawler.instance_interactions()
defp statuses_to_interactions(statuses) do
  statuses
  |> Enum.filter(&is_eligible?/1)
  |> Enum.map(&extract_mentions_from_status/1)
  |> Enum.reduce(%{}, fn map, acc ->
    # Sum per-domain counts; plain Map.merge/2 would overwrite the running
    # count whenever two statuses mention the same domain.
    Map.merge(acc, map, fn _domain, a, b -> a + b end)
  end)
end
end