Add misskey crawler

parent cc541b9ddf, commit 75e66affe3
@@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ### Added
 
+- Added Misskey crawler.
+
 ### Changed
 
 ### Deprecated

@@ -61,7 +61,7 @@ config :backend, :crawler,
   status_count_limit: 100,
   personal_instance_threshold: 5,
   crawl_interval_mins: 60,
-  crawl_workers: 1,
+  crawl_workers: 50,
   blacklist: [
     "gab.best",
     "4chan.icu"
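
Note: the values above are read through the get_config/1 helper used throughout the crawler code (e.g. get_config(:status_count_limit)). Its implementation is not part of this commit; presumably it is a thin wrapper over Application.get_env/2, roughly:

# Sketch only: assumed implementation of get_config/1 (defined elsewhere, not in this diff).
defp get_config(key) do
  Application.get_env(:backend, :crawler)[key]
end

The bump from crawl_workers: 1 to crawl_workers: 50 presumably lets many instances be crawled in parallel, which matters now that each Misskey instance is crawled with several sequential POST requests.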
@@ -14,7 +14,7 @@ defmodule Backend.Crawler.ApiCrawler do
   # {domain_mentioned, count}
   @type instance_interactions :: %{String.t() => integer}
 
-  @type instance_type :: :mastodon | :pleroma | :gab
+  @type instance_type :: :mastodon | :pleroma | :gab | :misskey
 
   defstruct [
     :version,
@@ -4,7 +4,7 @@ defmodule Backend.Crawler do
   """
 
   alias __MODULE__
-  alias Backend.Crawler.Crawlers.Mastodon
+  alias Backend.Crawler.Crawlers.{Mastodon, Misskey}
  alias Backend.Crawler.ApiCrawler
  alias Backend.{Crawl, CrawlInteraction, Repo, Instance, InstancePeer}
  import Ecto.Query
@@ -46,6 +46,7 @@ defmodule Backend.Crawler do
     state
     # register APICrawlers here
     |> register(Mastodon)
+    |> register(Misskey)
     # go!
     |> crawl()
     |> save()
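
Any module passed to register/2 must implement the ApiCrawler behaviour. Going by the @impl annotations in this commit, that means three callbacks: is_instance_type?/1, allows_crawling?/1 and crawl/1. A minimal sketch of a registrable crawler follows; the Example module and its trivial return values are hypothetical, and the authoritative callback specs live in api_crawler.ex:

defmodule Backend.Crawler.Crawlers.Example do
  alias Backend.Crawler.ApiCrawler

  @behaviour ApiCrawler

  # Does this domain run the server software this crawler understands?
  @impl ApiCrawler
  def is_instance_type?(_domain), do: false

  # Do the instance's robots.txt rules allow us to hit the endpoints we need?
  @impl ApiCrawler
  def allows_crawling?(_domain), do: true

  # Return the same shape of result map that the Mastodon and Misskey crawlers produce.
  @impl ApiCrawler
  def crawl(_domain) do
    %{
      instance_type: nil,
      version: nil,
      description: nil,
      user_count: 0,
      status_count: 0,
      peers: [],
      interactions: %{},
      statuses_seen: 0
    }
  end
end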

@@ -2,9 +2,7 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
   require Logger
   import Backend.Crawler.Util
   import Backend.Util
-  import Ecto.Query
   alias Backend.Crawler.ApiCrawler
-  alias Backend.{Instance, Repo}
 
   @behaviour ApiCrawler
 
@@ -18,31 +16,21 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
 
   @impl ApiCrawler
   def allows_crawling?(domain) do
-    endpoints = [
+    [
       "/api/v1/instance",
       "/api/v1/instance/peers",
       "/api/v1/timelines/public"
     ]
-
-    user_agent = get_config(:user_agent)
-
-    endpoints
     |> Enum.map(fn endpoint -> "https://#{domain}#{endpoint}" end)
-    |> Enum.all?(fn endpoint -> Gollum.crawlable?(user_agent, endpoint) != :uncrawlable end)
+    |> urls_are_crawlable?()
   end
 
   @impl ApiCrawler
   def crawl(domain) do
     instance = Jason.decode!(get!("https://#{domain}/api/v1/instance").body)
+    user_count = get_in(instance, ["stats", "user_count"])
 
-    has_opted_in =
-      case Instance |> select([:opt_in]) |> Repo.get_by(domain: domain) do
-        %{opt_in: true} -> true
-        _ -> false
-      end
-
-    if get_in(instance, ["stats", "user_count"]) > get_config(:personal_instance_threshold) or
-         has_opted_in do
+    if is_above_user_threshold?(user_count) or has_opted_in?(domain) do
       crawl_large_instance(domain, instance)
     else
       Map.merge(
@@ -148,7 +136,11 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
 
     if length(filtered_statuses) > 0 do
       # get statuses that are eligible (i.e. users don't have #nobot in their profile) and have mentions
-      interactions = Map.merge(interactions, statuses_to_interactions(filtered_statuses))
+      interactions =
+        filtered_statuses
+        |> statuses_to_interactions()
+        |> merge_count_maps(interactions)
+
       statuses_seen = statuses_seen + length(filtered_statuses)
 
       status_datetime_threshold =
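
merge_count_maps/2 is not defined in this commit, but from its use here and in the Misskey crawler it evidently combines two %{domain => count} maps by summing the counts. A sketch of the assumed semantics:

# Assumed behavior of merge_count_maps/2 (defined elsewhere, not in this diff).
def merge_count_maps(left, right) do
  Map.merge(left, right, fn _domain, a, b -> a + b end)
end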

backend/lib/backend/crawler/crawlers/misskey.ex (new file, 222 lines)
@@ -0,0 +1,222 @@
+defmodule Backend.Crawler.Crawlers.Misskey do
+  alias Backend.Crawler.ApiCrawler
+
+  @behaviour ApiCrawler
+  import Backend.Crawler.Util
+  import Backend.Util
+  require Logger
+
+  @impl ApiCrawler
+  def is_instance_type?(domain) do
+    case get_version_and_description(domain) do
+      {:ok, _} -> true
+      {:error, _} -> false
+    end
+  end
+
+  @impl ApiCrawler
+  def allows_crawling?(domain) do
+    [
+      "/api/meta",
+      "/api/stats",
+      "/api/notes/local-timeline",
+      "/api/v1/instance/peers"
+    ]
+    |> Enum.map(fn endpoint -> "https://#{domain}#{endpoint}" end)
+    |> urls_are_crawlable?()
+  end
+
+  @impl ApiCrawler
+  def crawl(domain) do
+    with {:ok, %{status_code: 200, body: stats_body}} <- post("https://#{domain}/api/stats") do
+      %{"originalUsersCount" => user_count, "originalNotesCount" => status_count} =
+        Jason.decode!(stats_body)
+
+      if is_above_user_threshold?(user_count) or has_opted_in?(domain) do
+        crawl_large_instance(domain, user_count, status_count)
+      else
+        %{
+          version: nil,
+          description: nil,
+          user_count: user_count,
+          status_count: nil,
+          peers: [],
+          interactions: %{},
+          statuses_seen: 0,
+          instance_type: nil
+        }
+      end
+    end
+  end
+
+  @spec crawl_large_instance(String.t(), integer(), integer()) :: ApiCrawler.t()
+  defp crawl_large_instance(domain, user_count, status_count) do
+    status_datetime_threshold =
+      NaiveDateTime.utc_now()
+      |> NaiveDateTime.add(get_config(:status_age_limit_days) * 24 * 3600 * -1, :second)
+
+    # Don't get any statuses older than this
+    min_timestamp =
+      max_datetime(get_last_successful_crawl_timestamp(domain), status_datetime_threshold)
+
+    {interactions, statuses_seen} = get_interactions(domain, min_timestamp)
+    {:ok, {version, description}} = get_version_and_description(domain)
+    {:ok, peers} = get_peers(domain)
+
+    %{
+      instance_type: :misskey,
+      # From stats endpoint
+      user_count: user_count,
+      status_count: status_count,
+      # From meta endpoint
+      version: version,
+      description: description,
+      # From timeline
+      interactions: interactions,
+      statuses_seen: statuses_seen,
+      # From peers endpoint
+      peers: peers
+    }
+  end
+
+  @spec get_interactions(
+          String.t(),
+          NaiveDateTime.t(),
+          String.t() | nil,
+          ApiCrawler.instance_interactions(),
+          integer()
+        ) :: {ApiCrawler.instance_interactions(), integer()}
+  defp get_interactions(
+         domain,
+         min_timestamp,
+         until_id \\ nil,
+         interactions \\ %{},
+         statuses_seen \\ 0
+       ) do
+    endpoint = "https://#{domain}/api/notes/local-timeline"
+
+    params = %{
+      limit: 20
+    }
+
+    params =
+      if until_id != nil do
+        Map.put(params, :untilId, until_id)
+      else
+        params
+      end
+
+    Logger.debug("Crawling #{endpoint} with untilId=#{until_id}")
+
+    statuses =
+      endpoint
+      |> post!(Jason.encode!(params))
+      |> Map.get(:body)
+      |> Jason.decode!()
+
+    filtered_statuses =
+      statuses
+      |> Enum.filter(fn s -> is_after?(s["createdAt"], min_timestamp) end)
+
+    if length(filtered_statuses) > 0 do
+      # get statuses that are eligible (i.e. users don't have #nobot in their profile) and have mentions
+      interactions =
+        filtered_statuses
+        |> statuses_to_interactions()
+        |> merge_count_maps(interactions)
+
+      # Don't count renotes in the # of statuses seen
+      statuses_seen =
+        filtered_statuses
+        |> Enum.filter(&is_original_status?(&1))
+        |> Kernel.length()
+        |> Kernel.+(statuses_seen)
+
+      oldest_status = Enum.at(filtered_statuses, -1)
+
+      oldest_status_datetime =
+        oldest_status
+        |> (fn s -> s["createdAt"] end).()
+        |> NaiveDateTime.from_iso8601!()
+
+      if NaiveDateTime.compare(oldest_status_datetime, min_timestamp) == :gt and
+           statuses_seen < get_config(:status_count_limit) and
+           length(filtered_statuses) == length(statuses) do
+        get_interactions(domain, min_timestamp, oldest_status["id"], interactions, statuses_seen)
+      else
+        {interactions, statuses_seen}
+      end
+    else
+      {interactions, statuses_seen}
+    end
+  end
+
+  @spec get_version_and_description(String.t()) ::
+          {:ok, {String.t(), String.t()}} | {:error, String.t()}
+  defp get_version_and_description(domain) do
+    case post("https://#{domain}/api/meta") do
+      {:ok, %{status_code: 200, body: body}} ->
+        case Jason.decode(body) do
+          {:ok, decoded} ->
+            {:ok, {Map.get(decoded, "version"), Map.get(decoded, "description")}}
+
+          {:error, _error} ->
+            {:error, "invalid response"}
+        end
+
+      _ ->
{:error, "unsuccesful request"}
+    end
+  end
+
+  @spec get_peers(String.t()) :: {:ok, [String.t()]} | {:error, Jason.DecodeError.t()}
+  defp get_peers(domain) do
+    case get("https://#{domain}/api/v1/instance/peers") do
+      {:ok, response} ->
+        with %{status_code: 200, body: body} <- response do
+          Jason.decode(body)
+        else
+          _ -> {:ok, []}
+        end
+
+      {:error, _} ->
+        {:ok, []}
+    end
+  end
+
+  @spec statuses_to_interactions(any()) :: ApiCrawler.instance_interactions()
+  defp statuses_to_interactions(statuses) do
+    statuses
+    |> Enum.filter(fn status -> is_mention?(status) end)
+    |> Enum.map(fn status -> extract_mentions_from_status(status) end)
+    |> Enum.reduce(%{}, fn map, acc ->
+      Map.merge(acc, map)
+    end)
+  end
+
+  # Checks whether
+  # * it's not a renote (a.k.a. a boost)
+  # * the status contains one or more mentions
+  @spec is_mention?(any()) :: boolean()
+  defp is_mention?(status) do
+    has_mentions = Map.get(status, "mentions") != nil
+    is_original_status?(status) and has_mentions
+  end
+
+  # Checks whether it's not a renote (a.k.a. a boost)
+  @spec is_original_status?(any()) :: boolean()
+  defp is_original_status?(status) do
+    Map.get(status, "renoteId") == nil
+  end
+
+  @spec extract_mentions_from_status(any()) :: ApiCrawler.instance_interactions()
+  defp extract_mentions_from_status(status) do
+    status_content = Map.get(status, "text")
+
+    Regex.scan(~r/@\w+@([\w.-]+)/, status_content)
+    |> Enum.map(fn [_match, domain] -> domain end)
+    |> Enum.reduce(%{}, fn domain, acc ->
+      Map.update(acc, domain, 1, &(&1 + 1))
+    end)
+  end
+end
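
To make the mention extraction at the end of this file concrete, here is a worked example; the note text is invented, but the regex is the one from extract_mentions_from_status/1:

text = "hello @alice@mastodon.social, @bob@misskey.xyz and @alice@mastodon.social again"

Regex.scan(~r/@\w+@([\w.-]+)/, text)
|> Enum.map(fn [_match, domain] -> domain end)
|> Enum.reduce(%{}, fn domain, acc -> Map.update(acc, domain, 1, &(&1 + 1)) end)
# => %{"mastodon.social" => 2, "misskey.xyz" => 1}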
@ -1,6 +1,8 @@
|
||||||
defmodule Backend.Crawler.Util do
|
defmodule Backend.Crawler.Util do
|
||||||
require Logger
|
require Logger
|
||||||
|
alias Backend.Repo
|
||||||
import Backend.Util
|
import Backend.Util
|
||||||
|
import Ecto.Query
|
||||||
|
|
||||||
# Gets the domain from a Mastodon/Pleroma account URL
|
# Gets the domain from a Mastodon/Pleroma account URL
|
||||||
# (e.g. https://mastodon.social/@demouser or https://pleroma.site/users/demouser)
|
# (e.g. https://mastodon.social/@demouser or https://pleroma.site/users/demouser)
|
||||||
|
@@ -60,4 +62,41 @@ defmodule Backend.Crawler.Util do
       timeout: 15000
     )
   end
+
+  def post(url, body \\ "") do
+    HTTPoison.post(url, body, [{"User-Agent", get_config(:user_agent)}],
+      hackney: [pool: :crawler],
+      recv_timeout: 15000,
+      timeout: 15000
+    )
+  end
+
+  def post!(url, body \\ "") do
+    HTTPoison.post!(url, body, [{"User-Agent", get_config(:user_agent)}],
+      hackney: [pool: :crawler],
+      recv_timeout: 15000,
+      timeout: 15000
+    )
+  end
+
+  @spec urls_are_crawlable?([String.t()]) :: boolean()
+  def urls_are_crawlable?(urls) do
+    user_agent = get_config(:user_agent)
+
+    urls
+    |> Enum.all?(fn url -> Gollum.crawlable?(user_agent, url) != :uncrawlable end)
+  end
+
+  @spec has_opted_in?(String.t()) :: boolean()
+  def has_opted_in?(domain) do
+    case Instance |> select([:opt_in]) |> Repo.get_by(domain: domain) do
+      %{opt_in: true} -> true
+      _ -> false
+    end
+  end
+
+  @spec is_above_user_threshold?(integer) :: boolean()
+  def is_above_user_threshold?(user_count) do
+    user_count > get_config(:personal_instance_threshold)
+  end
 end
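
Misskey's API endpoints are POST-based (unlike Mastodon's GET endpoints), which is why the crawler needs these POST counterparts to the existing get/get! helpers. A quick usage sketch, with a placeholder domain:

domain = "misskey.example"

case post("https://#{domain}/api/stats") do
  {:ok, %{status_code: 200, body: body}} ->
    %{"originalUsersCount" => user_count} = Jason.decode!(body)
    Logger.debug("#{domain}: #{user_count} local users")

  _ ->
    Logger.debug("#{domain}: could not fetch stats")
end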

@@ -30,7 +30,7 @@ defmodule Backend.Util do
     blacklist =
       case get_config(:blacklist) do
         nil -> []
-        _ -> get_config(:blacklist)
+        other -> other
       end
 
     blacklist

@@ -40,4 +40,4 @@ export interface IInstanceDomainPath {
 }
 
 // We could also extract the values from the server response, but this would slow things down...
-export const INSTANCE_TYPES = ["mastodon", "gab", "pleroma"];
+export const INSTANCE_TYPES = ["mastodon", "gab", "pleroma", "misskey"];