don't create edges between blocking instances

Tao Bror Bojlén 2019-08-27 14:10:06 +01:00
parent b1d6665bb8
commit 3e52431435
No known key found for this signature in database
GPG Key ID: C6EC7AAB905F9E6F
13 changed files with 252 additions and 131 deletions

View File

@ -63,7 +63,9 @@ config :backend, :crawler,
crawl_workers: 20,
blacklist: [
"gab.best",
"4chan.icu"
"4chan.icu",
"pleroma.site",
"pleroma.online"
],
user_agent: "fediverse.space crawler",
admin_phone: System.get_env("ADMIN_PHONE"),
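For context, the blacklist is presumably consulted before a domain is crawled; a sketch of such a check (the variable and the subdomain rule are illustrative, not part of this diff):

domain = "sub.gab.best"

blacklisted? =
  Application.get_env(:backend, :crawler)[:blacklist]
  |> Enum.any?(fn entry ->
    domain == entry or String.ends_with?(domain, "." <> entry)
  end)

# => true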

View File

@ -62,8 +62,4 @@ config :backend, :crawler,
personal_instance_threshold: 5,
crawl_interval_mins: 60,
crawl_workers: 10,
blacklist: [
"gab.best",
"4chan.icu"
],
frontend_domain: "localhost:3000"

View File

@ -26,25 +26,39 @@ defmodule Backend.Crawler.ApiCrawler do
:peers,
:interactions,
:statuses_seen,
:instance_type
:instance_type,
:blocked_domains
]
@type t() :: %__MODULE__{
version: String.t(),
description: String.t(),
version: String.t() | nil,
description: String.t() | nil,
user_count: integer | nil,
status_count: integer | nil,
peers: [String.t()],
interactions: instance_interactions,
statuses_seen: integer,
instance_type: instance_type
instance_type: instance_type | nil,
blocked_domains: [String.t()]
}
@empty_result %{
version: nil,
description: nil,
user_count: nil,
status_count: nil,
peers: [],
interactions: %{},
statuses_seen: 0,
instance_type: nil,
blocked_domains: []
}
@doc """
Check whether the instance at the given domain is of the type that this ApiCrawler implements.
Arguments are the instance domain and the nodeinfo results.
"""
@callback is_instance_type?(String.t(), Nodeinfo.t()) :: boolean()
@callback is_instance_type?(String.t(), ApiCrawler.t()) :: boolean()
@doc """
Check whether the instance allows crawling according to its robots.txt or otherwise.
@ -56,4 +70,11 @@ defmodule Backend.Crawler.ApiCrawler do
Takes two arguments: the domain to crawl and the existing results (from nodeinfo).
"""
@callback crawl(String.t(), ApiCrawler.t()) :: t()
@doc """
Returns the default, empty state
"""
def get_default do
@empty_result
end
end
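For orientation, a minimal sketch of a crawler implementing this behaviour after the change (the module and the :example instance type are hypothetical):

defmodule Backend.Crawler.Crawlers.Example do
  alias Backend.Crawler.ApiCrawler

  @behaviour ApiCrawler

  @impl ApiCrawler
  def is_instance_type?(_domain, nodeinfo_result) do
    nodeinfo_result.instance_type == :example
  end

  @impl ApiCrawler
  def allows_crawling?(_domain), do: true

  @impl ApiCrawler
  def crawl(_domain, nodeinfo) do
    # Layer the maps: empty defaults first, then nodeinfo's findings,
    # then this crawler's own results, so every key of ApiCrawler.t() is present.
    ApiCrawler.get_default()
    |> Map.merge(nodeinfo)
    |> Map.merge(%{instance_type: :example})
  end
end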

View File

@ -4,7 +4,17 @@ defmodule Backend.Crawler do
"""
alias __MODULE__
alias Backend.{Crawl, CrawlInteraction, Instance, InstancePeer, MostRecentCrawl, Repo}
alias Backend.{
Crawl,
CrawlInteraction,
FederationRestriction,
Instance,
InstancePeer,
MostRecentCrawl,
Repo
}
alias Backend.Crawler.ApiCrawler
alias Backend.Crawler.Crawlers.{Friendica, GnuSocial, Mastodon, Misskey, Nodeinfo}
@ -75,14 +85,24 @@ defmodule Backend.Crawler do
# a) it should always be run first
# b) it passes the results on to the next crawlers (e.g. user_count)
defp crawl(%Crawler{api_crawlers: [Nodeinfo | remaining_crawlers], domain: domain} = state) do
with true <- Nodeinfo.allows_crawling?(domain), {:ok, nodeinfo} <- Nodeinfo.crawl(domain) do
Logger.debug("Found nodeinfo for #{domain}.")
result = Map.merge(nodeinfo, %{peers: [], interactions: %{}, statuses_seen: 0})
crawl(%Crawler{state | result: result, found_api?: true, api_crawlers: remaining_crawlers})
else
_ ->
if Nodeinfo.allows_crawling?(domain) do
nodeinfo = Nodeinfo.crawl(domain, nil)
if nodeinfo != nil do
Logger.debug("Found nodeinfo for #{domain}.")
crawl(%Crawler{
state
| result: nodeinfo,
found_api?: true,
api_crawlers: remaining_crawlers
})
else
Logger.debug("Did not find nodeinfo for #{domain}.")
crawl(%Crawler{state | api_crawlers: remaining_crawlers})
end
else
crawl(%Crawler{state | api_crawlers: remaining_crawlers, allows_crawling?: false})
end
end
@ -165,7 +185,7 @@ defmodule Backend.Crawler do
Elasticsearch.put_document!(Backend.Elasticsearch.Cluster, instance, "instances/_doc")
# Save details of a new crawl
## Save details of a new crawl ##
curr_crawl =
Repo.insert!(%Crawl{
instance_domain: domain,
@ -202,12 +222,13 @@ defmodule Backend.Crawler do
raise "#{domain} has invalid peers: #{Enum.join(invalid_peers, ", ")}"
end
peers =
new_instances =
peers_domains
|> list_union(result.blocked_domains)
|> Enum.map(&%{domain: &1, inserted_at: now, updated_at: now, next_crawl: now})
Instance
|> Repo.insert_all(peers, on_conflict: :nothing, conflict_target: :domain)
|> Repo.insert_all(new_instances, on_conflict: :nothing, conflict_target: :domain)
Repo.transaction(fn ->
## Save peer relationships ##
@ -249,6 +270,56 @@ defmodule Backend.Crawler do
|> Repo.insert_all(new_instance_peers)
end)
## Save federation restrictions ##
Repo.transaction(fn ->
current_restrictions =
FederationRestriction
|> select([fr], {fr.target_domain, fr.type})
|> where(source_domain: ^domain)
|> Repo.all()
wanted_restrictions_set =
result.blocked_domains
|> Enum.map(&{&1, "reject"})
|> MapSet.new()
current_restrictions_set = MapSet.new(current_restrictions)
# Delete the ones we don't want
restrictions_to_delete =
current_restrictions_set
|> MapSet.difference(wanted_restrictions_set)
|> MapSet.to_list()
|> Enum.map(fn {target_domain, _type} -> target_domain end)
if length(restrictions_to_delete) > 0 do
FederationRestriction
|> where(
[fr],
fr.source_domain == ^domain and fr.target_domain in ^restrictions_to_delete
)
|> Repo.delete_all()
end
# Save the new ones
new_restrictions =
wanted_restrictions_set
|> MapSet.difference(current_restrictions_set)
|> MapSet.to_list()
|> Enum.map(fn {target_domain, type} ->
%{
source_domain: domain,
target_domain: target_domain,
type: type,
inserted_at: now,
updated_at: now
}
end)
FederationRestriction
|> Repo.insert_all(new_restrictions)
end)
## Save interactions ##
interactions =
result.interactions
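The reconciliation above is a two-way set difference: restrictions that exist in the database but are no longer wanted get deleted, and wanted restrictions that don't exist yet get inserted. The same pattern in isolation (domains invented):

current = MapSet.new([{"a.example", "reject"}, {"b.example", "reject"}])
wanted = MapSet.new([{"b.example", "reject"}, {"c.example", "reject"}])

MapSet.difference(current, wanted) |> MapSet.to_list()
# => [{"a.example", "reject"}] -- to delete
MapSet.difference(wanted, current) |> MapSet.to_list()
# => [{"c.example", "reject"}] -- to insert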

View File

@ -62,12 +62,11 @@ defmodule Backend.Crawler.Crawlers.Friendica do
end)
if details |> Map.get(:user_count, 0) |> is_above_user_threshold?() do
Map.merge(
%{peers: peers, interactions: %{}, statuses_seen: 0, instance_type: :friendica},
Map.take(details, [:description, :version, :user_count, :status_count])
)
ApiCrawler.get_default()
|> Map.merge(%{peers: peers, instance_type: :friendica})
|> Map.merge(Map.take(details, [:description, :version, :user_count, :status_count]))
else
nodeinfo_result
Map.merge(ApiCrawler.get_default(), nodeinfo_result)
end
end
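This layering works because Map.merge/2 is right-biased (values from the second map win) and Map.take/2 silently drops keys the source map lacks, so stats Friendica didn't report fall through to the nil defaults. For instance:

details = %{description: "a friendica server", version: "2019.06"}

Map.take(details, [:description, :version, :user_count, :status_count])
# => %{description: "a friendica server", version: "2019.06"}
# No :user_count or :status_count here, so the nil defaults survive the final merge.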

View File

@ -3,7 +3,6 @@ defmodule Backend.Crawler.Crawlers.GnuSocial do
Crawler for GNU Social servers.
"""
alias Backend.Crawler.ApiCrawler
alias Backend.Crawler.Crawlers.Nodeinfo
import Backend.Crawler.Util
import Backend.Util
require Logger
@ -32,17 +31,17 @@ defmodule Backend.Crawler.Crawlers.GnuSocial do
end
@impl ApiCrawler
def crawl(domain, nodeinfo_result) do
if nodeinfo_result == nil or
nodeinfo_result |> Map.get(:user_count) |> is_above_user_threshold?() do
crawl_large_instance(domain, nodeinfo_result)
def crawl(domain, nodeinfo) do
if nodeinfo == nil or
nodeinfo |> Map.get(:user_count) |> is_above_user_threshold?() do
Map.merge(nodeinfo, crawl_large_instance(domain))
else
nodeinfo_result
Map.merge(ApiCrawler.get_default(), nodeinfo)
end
end
@spec crawl_large_instance(String.t(), Nodeinfo.t()) :: ApiCrawler.t()
defp crawl_large_instance(domain, nodeinfo_result) do
@spec crawl_large_instance(String.t()) :: ApiCrawler.t()
defp crawl_large_instance(domain) do
status_datetime_threshold =
NaiveDateTime.utc_now()
|> NaiveDateTime.add(get_config(:status_age_limit_days) * 24 * 3600 * -1, :second)
@ -52,24 +51,14 @@ defmodule Backend.Crawler.Crawlers.GnuSocial do
{interactions, statuses_seen} = get_interactions(domain, min_timestamp)
if nodeinfo_result != nil do
Map.merge(nodeinfo_result, %{
interactions: interactions,
statuses_seen: statuses_seen,
peers: []
})
else
Map.merge(
ApiCrawler.get_default(),
%{
version: nil,
description: nil,
user_count: nil,
status_count: nil,
peers: [],
interactions: interactions,
statuses_seen: statuses_seen,
instance_type: :gnusocial
}
end
)
end
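Merge order matters in crawl/2 above: the nodeinfo result already carries the defaults interactions: %{} and statuses_seen: 0, so it has to be the base map, with the freshly crawled values merged over it (values below are invented):

nodeinfo = %{user_count: 120, interactions: %{}, statuses_seen: 0}
crawled = %{interactions: %{"b.example" => 3}, statuses_seen: 50}

Map.merge(nodeinfo, crawled)
# => %{user_count: 120, interactions: %{"b.example" => 3}, statuses_seen: 50}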
@spec get_interactions(

View File

@ -34,26 +34,19 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
end
@impl ApiCrawler
def crawl(domain, _current_result) do
def crawl(domain, nodeinfo) do
instance = get_and_decode!("https://#{domain}/api/v1/instance")
user_count = get_in(instance, ["stats", "user_count"])
if is_above_user_threshold?(user_count) or has_opted_in?(domain) do
crawl_large_instance(domain, instance)
Map.merge(nodeinfo, crawl_large_instance(domain, instance))
else
Map.merge(
Map.take(instance["stats"], ["user_count"])
|> convert_keys_to_atoms(),
%{
instance_type: get_instance_type(instance),
peers: [],
interactions: %{},
statuses_seen: 0,
description: nil,
version: nil,
status_count: nil
}
)
ApiCrawler.get_default()
|> Map.merge(nodeinfo)
|> Map.merge(%{
instance_type: get_instance_type(instance),
user_count: get_in(instance, ["stats", "user_count"])
})
end
end

View File

@ -35,22 +35,18 @@ defmodule Backend.Crawler.Crawlers.Misskey do
end
@impl ApiCrawler
def crawl(domain, _result) do
def crawl(domain, nodeinfo) do
with {:ok, %{"originalUsersCount" => user_count, "originalNotesCount" => status_count}} <-
post_and_decode("https://#{domain}/api/stats") do
if is_above_user_threshold?(user_count) or has_opted_in?(domain) do
crawl_large_instance(domain, user_count, status_count)
Map.merge(nodeinfo, crawl_large_instance(domain, user_count, status_count))
else
%{
instance_type: :misskey,
version: nil,
description: nil,
ApiCrawler.get_default()
|> Map.merge(nodeinfo)
|> Map.merge(%{
user_count: user_count,
status_count: nil,
peers: [],
interactions: %{},
statuses_seen: 0
}
instance_type: :misskey
})
end
end
end

View File

@ -1,34 +1,16 @@
defmodule Backend.Crawler.Crawlers.Nodeinfo do
@moduledoc """
This module is slightly different from the other crawlers.
It doesn't implement the ApiCrawler spec because it isn't run as a self-contained crawler.
Instead, it's run before all the other crawlers.
This is to get the user count. Some servers don't publish this in other places (e.g. GNU Social, PeerTube) so we need
nodeinfo to know whether it's a personal instance or not.
This module is slightly different from the other crawlers. It's run before all the others and its
result is included in theirs.
"""
alias Backend.Crawler.ApiCrawler
require Logger
import Backend.Util
import Backend.Crawler.Util
@behaviour ApiCrawler
defstruct [
:description,
:user_count,
:status_count,
:instance_type,
:version
]
@type t() :: %__MODULE__{
description: String.t(),
user_count: integer,
status_count: integer,
instance_type: ApiCrawler.instance_type(),
version: String.t()
}
@spec allows_crawling?(String.t()) :: boolean()
@impl ApiCrawler
def allows_crawling?(domain) do
[
".well-known/nodeinfo"
@ -37,13 +19,19 @@ defmodule Backend.Crawler.Crawlers.Nodeinfo do
|> urls_are_crawlable?()
end
@spec crawl(String.t()) :: {:ok, t()} | {:error, nil}
def crawl(domain) do
@impl ApiCrawler
def is_instance_type?(_domain, _nodeinfo) do
# This crawler is used slightly differently from the others -- we always check for nodeinfo.
true
end
@impl ApiCrawler
def crawl(domain, _curr_result) do
with {:ok, nodeinfo_url} <- get_nodeinfo_url(domain),
{:ok, nodeinfo} <- get_nodeinfo(nodeinfo_url) do
{:ok, nodeinfo}
nodeinfo
else
_other -> {:error, nil}
_other -> ApiCrawler.get_default()
end
end
@ -65,8 +53,7 @@ defmodule Backend.Crawler.Crawlers.Nodeinfo do
|> Map.get("href")
end
@spec get_nodeinfo(String.t()) ::
{:ok, t()} | {:error, Jason.DecodeError.t() | HTTPoison.Error.t()}
@spec get_nodeinfo(String.t()) :: {:ok, ApiCrawler.t()} | {:error, any()}
defp get_nodeinfo(nodeinfo_url) do
case get_and_decode(nodeinfo_url) do
{:ok, nodeinfo} -> {:ok, process_nodeinfo(nodeinfo)}
@ -74,7 +61,7 @@ defmodule Backend.Crawler.Crawlers.Nodeinfo do
end
end
@spec process_nodeinfo(any()) :: t()
@spec process_nodeinfo(any()) :: ApiCrawler.t()
defp process_nodeinfo(nodeinfo) do
user_count = get_in(nodeinfo, ["usage", "users", "total"])
@ -90,21 +77,33 @@ defmodule Backend.Crawler.Crawlers.Nodeinfo do
type = nodeinfo |> get_in(["software", "name"]) |> String.downcase() |> String.to_atom()
%__MODULE__{
description: description,
user_count: user_count,
status_count: get_in(nodeinfo, ["usage", "localPosts"]),
instance_type: type,
version: get_in(nodeinfo, ["software", "version"])
}
Map.merge(
ApiCrawler.get_default(),
%{
description: description,
user_count: user_count,
status_count: get_in(nodeinfo, ["usage", "localPosts"]),
instance_type: type,
version: get_in(nodeinfo, ["software", "version"]),
blocked_domains:
get_in(nodeinfo, ["metadata", "federation", "mrf_simple", "reject"])
|> (fn b ->
if b == nil do
[]
else
b
end
end).()
|> Enum.map(&clean_domain(&1))
}
)
else
%{
description: nil,
user_count: user_count,
status_count: nil,
instance_type: nil,
version: nil
}
Map.merge(
ApiCrawler.get_default(),
%{
user_count: user_count
}
)
end
end
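The mrf_simple block read here is Pleroma-style federation metadata; the nodeinfo shape this code assumes looks roughly like the following (abridged and illustrative):

nodeinfo = %{
  "software" => %{"name" => "pleroma", "version" => "1.0.7"},
  "metadata" => %{
    "federation" => %{
      "mrf_simple" => %{"reject" => ["spam.example", "abuse.example"]}
    }
  }
}

get_in(nodeinfo, ["metadata", "federation", "mrf_simple", "reject"])
# => ["spam.example", "abuse.example"]
# Instances that publish no such metadata yield nil, hence the || [] fallback.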

View File

@ -0,0 +1,27 @@
defmodule Backend.FederationRestriction do
use Ecto.Schema
import Ecto.Changeset
schema "federation_restrictions" do
belongs_to :source, Backend.Instance,
references: :domain,
type: :string,
foreign_key: :source_domain
belongs_to :target, Backend.Instance,
references: :domain,
type: :string,
foreign_key: :target_domain
field :type, :string
timestamps()
end
@doc false
def changeset(federation_restriction, attrs) do
federation_restriction
|> cast(attrs, [:source_domain, :target_domain, :type])
|> validate_required([:source_domain, :target_domain, :type])
end
end
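The changeset isn't exercised by the crawler (which goes through Repo.insert_all), but used directly it would look like this (domains invented):

%Backend.FederationRestriction{}
|> Backend.FederationRestriction.changeset(%{
  source_domain: "a.example",
  target_domain: "b.example",
  type: "reject"
})
|> Backend.Repo.insert()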

View File

@ -3,10 +3,9 @@ defmodule Backend.Scheduler do
This module runs recurring tasks.
"""
use Appsignal.Instrumentation.Decorators
use Quantum.Scheduler, otp_app: :backend
alias Backend.{Crawl, CrawlInteraction, Edge, Instance, Repo}
alias Backend.{Crawl, CrawlInteraction, Edge, FederationRestriction, Instance, Repo}
alias Backend.Mailer.AdminEmail
import Backend.Util
@ -21,7 +20,6 @@ defmodule Backend.Scheduler do
`unit` must be singular, e.g. "second", "minute", "hour", "month", "year", etc.
"""
@spec prune_crawls(integer, String.t()) :: any
@decorate transaction()
def prune_crawls(amount, unit) do
{deleted_num, _} =
Crawl
@ -39,7 +37,6 @@ defmodule Backend.Scheduler do
Calculates every instance's "insularity score" -- that is, the percentage of mentions that are among users on the
instance, rather than at other instances.
"""
@decorate transaction()
def generate_insularity_scores do
now = get_now()
@ -85,7 +82,6 @@ defmodule Backend.Scheduler do
@doc """
This function calculates the average number of statuses per hour over the last month.
"""
@decorate transaction()
def generate_status_rate do
now = get_now()
# We want the earliest successful crawl so that we can exclude it from the statistics.
@ -143,9 +139,10 @@ defmodule Backend.Scheduler do
@doc """
This function aggregates statistics from the interactions in the database.
It calculates the strength of edges between nodes. Self-edges are not generated.
Edges are only generated if both instances have been successfully crawled.
Edges are only generated if
* both instances have been successfully crawled
* neither instance has blocked the other
"""
@decorate transaction()
def generate_edges do
now = get_now()
@ -177,6 +174,13 @@ defmodule Backend.Scheduler do
})
|> Repo.all(timeout: :infinity)
federation_blocks =
FederationRestriction
|> select([fr], {fr.source_domain, fr.target_domain})
|> where([fr], fr.type == "reject")
|> Repo.all()
|> MapSet.new()
# Get edges and their weights
Repo.transaction(
fn ->
@ -185,7 +189,7 @@ defmodule Backend.Scheduler do
edges =
interactions
|> reduce_mention_count()
|> reduce_mention_count(federation_blocks)
|> Enum.map(fn {{source_domain, target_domain}, {mention_count, statuses_seen}} ->
%{
source_domain: source_domain,
@ -207,7 +211,6 @@ defmodule Backend.Scheduler do
This function checks to see if a lot of instances on the same base domain have been created recently. If so,
notifies the server admin over SMS.
"""
@decorate transaction()
def check_for_spam_instances do
hour_range = 3
@ -254,10 +257,9 @@ defmodule Backend.Scheduler do
end
end
# Takes a list of Interactions
# Takes a list of Interactions and a MapSet of blocks in the form {source_domain, target_domain}
# Returns a map of %{{source, target} => {total_mention_count, total_statuses_seen}}
@decorate transaction_event()
defp reduce_mention_count(interactions) do
defp reduce_mention_count(interactions, federation_blocks) do
Enum.reduce(interactions, %{}, fn
%{
source_domain: source_domain,
@ -278,9 +280,12 @@ defmodule Backend.Scheduler do
statuses_seen = source_statuses_seen + target_statuses_seen
Map.update(acc, key, {mentions, statuses_seen}, fn {curr_mentions, curr_statuses_seen} ->
{curr_mentions + mentions, curr_statuses_seen}
end)
if not MapSet.member?(federation_blocks, {source_domain, target_domain}) and
not MapSet.member?(federation_blocks, {target_domain, source_domain}) do
Map.update(acc, key, {mentions, statuses_seen}, fn {curr_mentions, curr_statuses_seen} ->
{curr_mentions + mentions, curr_statuses_seen}
end)
else
acc
end
end)
end
end
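The symmetric MapSet.member? test above drops an edge when either side rejects the other. In isolation:

federation_blocks = MapSet.new([{"a.example", "b.example"}])

blocked? = fn source, target ->
  MapSet.member?(federation_blocks, {source, target}) or
    MapSet.member?(federation_blocks, {target, source})
end

blocked?.("b.example", "a.example")
# => true -- a block in either direction suppresses the edge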

View File

@ -128,6 +128,7 @@ defmodule Backend.Util do
end
end
@spec clean_domain(String.t()) :: String.t()
def clean_domain(domain) do
cleaned =
domain

View File

@ -0,0 +1,22 @@
defmodule Backend.Repo.Migrations.CreateFederationRestrictions do
use Ecto.Migration
def change do
create table(:federation_restrictions) do
add :source_domain,
references(:instances, column: :domain, type: :string, on_delete: :delete_all),
null: false
add :target_domain,
references(:instances, column: :domain, type: :string, on_delete: :delete_all),
null: false
add :type, :string, null: false
timestamps()
end
create index(:federation_restrictions, [:source_domain])
create index(:federation_restrictions, [:target_domain])
end
end
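Since timestamps() and the implicit primary key add id, inserted_at, and updated_at columns, and Repo.insert_all does not fill timestamps automatically, bulk inserts against this table must set them by hand, as the crawler does above (domains invented):

now = NaiveDateTime.truncate(NaiveDateTime.utc_now(), :second)

Backend.Repo.insert_all(Backend.FederationRestriction, [
  %{
    source_domain: "a.example",
    target_domain: "b.example",
    type: "reject",
    inserted_at: now,
    updated_at: now
  }
])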