diff --git a/CHANGELOG.md b/CHANGELOG.md
index e91045e..5a7b303 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- Added Misskey crawler.
+
 ### Changed
 
 ### Deprecated
diff --git a/backend/config/dev.exs b/backend/config/dev.exs
index 2dc65ca..73f88a9 100644
--- a/backend/config/dev.exs
+++ b/backend/config/dev.exs
@@ -61,7 +61,7 @@ config :backend, :crawler,
   status_count_limit: 100,
   personal_instance_threshold: 5,
   crawl_interval_mins: 60,
-  crawl_workers: 1,
+  crawl_workers: 50,
   blacklist: [
     "gab.best",
     "4chan.icu"
diff --git a/backend/lib/backend/crawler/api_crawler.ex b/backend/lib/backend/crawler/api_crawler.ex
index 30f4933..4f07e47 100644
--- a/backend/lib/backend/crawler/api_crawler.ex
+++ b/backend/lib/backend/crawler/api_crawler.ex
@@ -14,7 +14,7 @@ defmodule Backend.Crawler.ApiCrawler do
   # {domain_mentioned, count}
   @type instance_interactions :: %{String.t() => integer}
 
-  @type instance_type :: :mastodon | :pleroma | :gab
+  @type instance_type :: :mastodon | :pleroma | :gab | :misskey
 
   defstruct [
     :version,
diff --git a/backend/lib/backend/crawler/crawler.ex b/backend/lib/backend/crawler/crawler.ex
index ee2590f..f330c85 100644
--- a/backend/lib/backend/crawler/crawler.ex
+++ b/backend/lib/backend/crawler/crawler.ex
@@ -4,7 +4,7 @@ defmodule Backend.Crawler do
   """
 
   alias __MODULE__
-  alias Backend.Crawler.Crawlers.Mastodon
+  alias Backend.Crawler.Crawlers.{Mastodon, Misskey}
   alias Backend.Crawler.ApiCrawler
   alias Backend.{Crawl, CrawlInteraction, Repo, Instance, InstancePeer}
   import Ecto.Query
@@ -46,6 +46,7 @@ defmodule Backend.Crawler do
     state
     # register APICrawlers here
     |> register(Mastodon)
+    |> register(Misskey)
     # go!
     |> crawl()
     |> save()
diff --git a/backend/lib/backend/crawler/crawlers/mastodon.ex b/backend/lib/backend/crawler/crawlers/mastodon.ex
index 5e858f5..68e887c 100644
--- a/backend/lib/backend/crawler/crawlers/mastodon.ex
+++ b/backend/lib/backend/crawler/crawlers/mastodon.ex
@@ -2,9 +2,7 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
   require Logger
   import Backend.Crawler.Util
   import Backend.Util
-  import Ecto.Query
   alias Backend.Crawler.ApiCrawler
-  alias Backend.{Instance, Repo}
 
   @behaviour ApiCrawler
 
@@ -18,31 +16,21 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
 
   @impl ApiCrawler
   def allows_crawling?(domain) do
-    endpoints = [
+    [
       "/api/v1/instance",
       "/api/v1/instance/peers",
       "/api/v1/timelines/public"
     ]
-
-    user_agent = get_config(:user_agent)
-
-    endpoints
     |> Enum.map(fn endpoint -> "https://#{domain}#{endpoint}" end)
-    |> Enum.all?(fn endpoint -> Gollum.crawlable?(user_agent, endpoint) != :uncrawlable end)
+    |> urls_are_crawlable?()
   end
 
   @impl ApiCrawler
   def crawl(domain) do
     instance = Jason.decode!(get!("https://#{domain}/api/v1/instance").body)
+    user_count = get_in(instance, ["stats", "user_count"])
 
-    has_opted_in =
-      case Instance |> select([:opt_in]) |> Repo.get_by(domain: domain) do
-        %{opt_in: true} -> true
-        _ -> false
-      end
-
-    if get_in(instance, ["stats", "user_count"]) > get_config(:personal_instance_threshold) or
-         has_opted_in do
+    if is_above_user_threshold?(user_count) or has_opted_in?(domain) do
       crawl_large_instance(domain, instance)
     else
       Map.merge(
@@ -148,7 +136,11 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
     if length(filtered_statuses) > 0 do
       # get statuses that are eligible (i.e.
       # users don't have #nobot in their profile) and have mentions
-      interactions = Map.merge(interactions, statuses_to_interactions(filtered_statuses))
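+      # merge_count_maps/2 sums the counts for domains present in both maps,
+      # whereas Map.merge/2 keeps only one side's count on conflict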
+      interactions =
+        filtered_statuses
+        |> statuses_to_interactions()
+        |> merge_count_maps(interactions)
+
       statuses_seen = statuses_seen + length(filtered_statuses)
 
       status_datetime_threshold =
diff --git a/backend/lib/backend/crawler/crawlers/misskey.ex b/backend/lib/backend/crawler/crawlers/misskey.ex
new file mode 100644
index 0000000..0e16977
--- /dev/null
+++ b/backend/lib/backend/crawler/crawlers/misskey.ex
@@ -0,0 +1,222 @@
+defmodule Backend.Crawler.Crawlers.Misskey do
+  alias Backend.Crawler.ApiCrawler
+
+  @behaviour ApiCrawler
+  import Backend.Crawler.Util
+  import Backend.Util
+  require Logger
+
+  @impl ApiCrawler
+  def is_instance_type?(domain) do
+    case get_version_and_description(domain) do
+      {:ok, _} -> true
+      {:error, _} -> false
+    end
+  end
+
+  @impl ApiCrawler
+  def allows_crawling?(domain) do
+    [
+      "/api/meta",
+      "/api/stats",
+      "/api/notes/local-timeline",
+      "/api/v1/instance/peers"
+    ]
+    |> Enum.map(fn endpoint -> "https://#{domain}#{endpoint}" end)
+    |> urls_are_crawlable?()
+  end
+
+  @impl ApiCrawler
+  def crawl(domain) do
+    with {:ok, %{status_code: 200, body: stats_body}} <- post("https://#{domain}/api/stats") do
+      %{"originalUsersCount" => user_count, "originalNotesCount" => status_count} =
+        Jason.decode!(stats_body)
+
+      if is_above_user_threshold?(user_count) or has_opted_in?(domain) do
+        crawl_large_instance(domain, user_count, status_count)
+      else
+        %{
+          version: nil,
+          description: nil,
+          user_count: user_count,
+          status_count: nil,
+          peers: [],
+          interactions: %{},
+          statuses_seen: 0,
+          instance_type: nil
+        }
+      end
+    end
+  end
+
+  @spec crawl_large_instance(String.t(), integer(), integer()) :: ApiCrawler.t()
+  defp crawl_large_instance(domain, user_count, status_count) do
+    status_datetime_threshold =
+      NaiveDateTime.utc_now()
+      |> NaiveDateTime.add(get_config(:status_age_limit_days) * 24 * 3600 * -1, :second)
+
+    # Don't get any statuses older than this
+    min_timestamp =
+      max_datetime(get_last_successful_crawl_timestamp(domain), status_datetime_threshold)
+
+    {interactions, statuses_seen} = get_interactions(domain, min_timestamp)
+    {:ok, {version, description}} = get_version_and_description(domain)
+    {:ok, peers} = get_peers(domain)
+
+    %{
+      instance_type: :misskey,
+      # From stats endpoint
+      user_count: user_count,
+      status_count: status_count,
+      # From meta endpoint
+      version: version,
+      description: description,
+      # From timeline
+      interactions: interactions,
+      statuses_seen: statuses_seen,
+      # From peers endpoint
+      peers: peers
+    }
+  end
+
+  @spec get_interactions(
+          String.t(),
+          NaiveDateTime.t(),
+          String.t() | nil,
+          ApiCrawler.instance_interactions(),
+          integer()
+        ) :: {ApiCrawler.instance_interactions(), integer()}
+  defp get_interactions(
+         domain,
+         min_timestamp,
+         until_id \\ nil,
+         interactions \\ %{},
+         statuses_seen \\ 0
+       ) do
+    endpoint = "https://#{domain}/api/notes/local-timeline"
+
+    params = %{
+      limit: 20
+    }
+
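+    # Misskey timelines page backwards: passing the ID of the oldest note seen
+    # so far as untilId makes the server return the next-older page of notes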
+    params =
+      if until_id != nil do
+        Map.put(params, :untilId, until_id)
+      else
+        params
+      end
+
+    Logger.debug("Crawling #{endpoint} with untilId=#{until_id}")
+
+    statuses =
+      endpoint
+      |> post!(Jason.encode!(params))
+      |> Map.get(:body)
+      |> Jason.decode!()
+
+    filtered_statuses =
+      statuses
+      |> Enum.filter(fn s -> is_after?(s["createdAt"], min_timestamp) end)
+
+    if length(filtered_statuses) > 0 do
+      # get statuses that are eligible (i.e.
+      # users don't have #nobot in their profile) and have mentions
+      interactions =
+        filtered_statuses
+        |> statuses_to_interactions()
+        |> merge_count_maps(interactions)
+
+      # Don't count renotes in the # of statuses seen
+      statuses_seen =
+        filtered_statuses
+        |> Enum.filter(&is_original_status?(&1))
+        |> Kernel.length()
+        |> Kernel.+(statuses_seen)
+
+      oldest_status = Enum.at(filtered_statuses, -1)
+
+      oldest_status_datetime =
+        oldest_status
+        |> (fn s -> s["createdAt"] end).()
+        |> NaiveDateTime.from_iso8601!()
+
+      if NaiveDateTime.compare(oldest_status_datetime, min_timestamp) == :gt and
+           statuses_seen < get_config(:status_count_limit) and
+           length(filtered_statuses) == length(statuses) do
+        get_interactions(domain, min_timestamp, oldest_status["id"], interactions, statuses_seen)
+      else
+        {interactions, statuses_seen}
+      end
+    else
+      {interactions, statuses_seen}
+    end
+  end
+
+  @spec get_version_and_description(String.t()) ::
+          {:ok, {String.t(), String.t()}} | {:error, String.t()}
+  defp get_version_and_description(domain) do
+    case post("https://#{domain}/api/meta") do
+      {:ok, %{status_code: 200, body: body}} ->
+        case Jason.decode(body) do
+          {:ok, decoded} ->
+            {:ok, {Map.get(decoded, "version"), Map.get(decoded, "description")}}
+
+          {:error, _error} ->
+            {:error, "invalid response"}
+        end
+
+      _ ->
+        {:error, "unsuccessful request"}
+    end
+  end
+
+  @spec get_peers(String.t()) :: {:ok, [String.t()]} | {:error, Jason.DecodeError.t()}
+  defp get_peers(domain) do
+    case get("https://#{domain}/api/v1/instance/peers") do
+      {:ok, response} ->
+        with %{status_code: 200, body: body} <- response do
+          Jason.decode(body)
+        else
+          _ -> {:ok, []}
+        end
+
+      {:error, _} ->
+        {:ok, []}
+    end
+  end
+
+  @spec statuses_to_interactions(any()) :: ApiCrawler.instance_interactions()
+  defp statuses_to_interactions(statuses) do
+    statuses
+    |> Enum.filter(fn status -> is_mention?(status) end)
+    |> Enum.map(fn status -> extract_mentions_from_status(status) end)
+    |> Enum.reduce(%{}, fn map, acc ->
+      # sum counts so repeat mentions of a domain across statuses aren't lost
+      merge_count_maps(acc, map)
+    end)
+  end
+
+  # Checks whether
+  # * it's not a renote (a.k.a. a boost)
+  # * the status contains one or more mentions
+  @spec is_mention?(any()) :: boolean()
+  defp is_mention?(status) do
+    has_mentions = Map.get(status, "mentions") != nil
+    is_original_status?(status) and has_mentions
+  end
+
+  # Checks whether it's not a renote (a.k.a. a boost)
+  @spec is_original_status?(any()) :: boolean()
+  defp is_original_status?(status) do
+    Map.get(status, "renoteId") == nil
+  end
+
+  @spec extract_mentions_from_status(any()) :: ApiCrawler.instance_interactions()
+  defp extract_mentions_from_status(status) do
+    status_content = Map.get(status, "text")
+
+    Regex.scan(~r/@\w+@([\w.-]+)/, status_content)
+    |> Enum.map(fn [_match, domain] -> domain end)
+    |> Enum.reduce(%{}, fn domain, acc ->
+      Map.update(acc, domain, 1, &(&1 + 1))
+    end)
+  end
+end
diff --git a/backend/lib/backend/crawler/util.ex b/backend/lib/backend/crawler/util.ex
index c5013f1..2351c97 100644
--- a/backend/lib/backend/crawler/util.ex
+++ b/backend/lib/backend/crawler/util.ex
@@ -1,6 +1,8 @@
 defmodule Backend.Crawler.Util do
   require Logger
+  alias Backend.{Instance, Repo}
   import Backend.Util
+  import Ecto.Query
 
   # Gets the domain from a Mastodon/Pleroma account URL
   # (e.g. https://mastodon.social/@demouser or https://pleroma.site/users/demouser)
@@ -60,4 +62,41 @@ defmodule Backend.Crawler.Util do
       timeout: 15000
     )
   end
+
+  def post(url, body \\ "") do
+    HTTPoison.post(url, body, [{"User-Agent", get_config(:user_agent)}],
+      hackney: [pool: :crawler],
+      recv_timeout: 15000,
+      timeout: 15000
+    )
+  end
+
+  def post!(url, body \\ "") do
+    HTTPoison.post!(url, body, [{"User-Agent", get_config(:user_agent)}],
+      hackney: [pool: :crawler],
+      recv_timeout: 15000,
+      timeout: 15000
+    )
+  end
+
+  @spec urls_are_crawlable?([String.t()]) :: boolean()
+  def urls_are_crawlable?(urls) do
+    user_agent = get_config(:user_agent)
+
+    urls
+    |> Enum.all?(fn url -> Gollum.crawlable?(user_agent, url) != :uncrawlable end)
+  end
+
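+  # Instances at or below personal_instance_threshold are only crawled in full
+  # when their admin has set the opt_in flag on the instance record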
+  @spec has_opted_in?(String.t()) :: boolean()
+  def has_opted_in?(domain) do
+    case Instance |> select([:opt_in]) |> Repo.get_by(domain: domain) do
+      %{opt_in: true} -> true
+      _ -> false
+    end
+  end
+
+  @spec is_above_user_threshold?(integer) :: boolean()
+  def is_above_user_threshold?(user_count) do
+    user_count > get_config(:personal_instance_threshold)
+  end
 end
diff --git a/backend/lib/backend/util.ex b/backend/lib/backend/util.ex
index 652d13e..41baa23 100644
--- a/backend/lib/backend/util.ex
+++ b/backend/lib/backend/util.ex
@@ -30,7 +30,7 @@ defmodule Backend.Util do
     blacklist =
       case get_config(:blacklist) do
         nil -> []
-        _ -> get_config(:blacklist)
+        other -> other
       end
 
     blacklist
diff --git a/frontend/src/constants.tsx b/frontend/src/constants.tsx
index f402c2c..90fcb7b 100644
--- a/frontend/src/constants.tsx
+++ b/frontend/src/constants.tsx
@@ -40,4 +40,4 @@ export interface IInstanceDomainPath {
 }
 
 // We could also extract the values from the server response, but this would slow things down...
-export const INSTANCE_TYPES = ["mastodon", "gab", "pleroma"];
+export const INSTANCE_TYPES = ["mastodon", "gab", "pleroma", "misskey"];