fix no edges between blocked instances

This commit is contained in:
Tao Bror Bojlén 2019-08-27 16:02:59 +01:00
parent 693cf2b2d9
commit c2f842263c
No known key found for this signature in database
GPG key ID: C6EC7AAB905F9E6F
3 changed files with 75 additions and 62 deletions

View file

@ -13,6 +13,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Changed ### Changed
- Edges are no longer shown between instances where one blocks the other (based on the federation list in nodeinfo).
- Edges are only shown between timeline-crawlable instance types (Mastodon, Pleroma, Gab, Misskey, and GNU Social)
if there's a mention in each direction. This is to avoid edges between instances where one blocks the other.
### Deprecated ### Deprecated
### Removed ### Removed

View file

@ -58,7 +58,7 @@ config :backend, Backend.Repo,
config :backend, :crawler, config :backend, :crawler,
status_age_limit_days: 28, status_age_limit_days: 28,
status_count_limit: 100, status_count_limit: 1000,
personal_instance_threshold: 5, personal_instance_threshold: 5,
crawl_interval_mins: 60, crawl_interval_mins: 60,
crawl_workers: 10, crawl_workers: 10,

View file

@ -163,16 +163,20 @@ defmodule Backend.Scheduler do
|> join(:inner, [ci], c_target in subquery(crawls_subquery), |> join(:inner, [ci], c_target in subquery(crawls_subquery),
on: ci.target_domain == c_target.instance_domain on: ci.target_domain == c_target.instance_domain
) )
|> where([ci], ci.source_domain != ci.target_domain) |> join(:inner, [ci], i_source in Instance, on: ci.source_domain == i_source.domain)
|> group_by([ci], [ci.source_domain, ci.target_domain]) |> join(:inner, [ci], i_target in Instance, on: ci.target_domain == i_target.domain)
|> select([ci, c_source, c_target], %{ |> select([ci, c_source, c_target, i_source, i_target], %{
source_domain: ci.source_domain, source_domain: ci.source_domain,
target_domain: ci.target_domain, target_domain: ci.target_domain,
mentions: sum(ci.mentions), mentions: sum(ci.mentions),
# we can take min() because every row is the same # we can take min() because every row is the same
source_type: min(i_source.type),
target_type: min(i_target.type),
source_statuses_seen: min(c_source.statuses_seen), source_statuses_seen: min(c_source.statuses_seen),
target_statuses_seen: min(c_target.statuses_seen) target_statuses_seen: min(c_target.statuses_seen)
}) })
|> where([ci], ci.source_domain != ci.target_domain)
|> group_by([ci], [ci.source_domain, ci.target_domain])
|> Repo.all(timeout: :infinity) |> Repo.all(timeout: :infinity)
federation_blocks = federation_blocks =
@ -182,23 +186,10 @@ defmodule Backend.Scheduler do
|> Repo.all() |> Repo.all()
|> MapSet.new() |> MapSet.new()
# Get edges and their weights new_edges =
Repo.transaction(
fn ->
Edge
|> Repo.delete_all(timeout: :infinity)
mentions =
interactions interactions
|> reduce_mention_count(federation_blocks) |> filter_to_eligible_interactions(federation_blocks)
|> combine_mention_directions()
# Filter down to mentions where there are interactions in both directions
filtered_mentions =
mentions
|> Enum.filter(&has_opposite_mention?(&1, mentions))
edges =
filtered_mentions
|> Enum.map(fn {{source_domain, target_domain}, {mention_count, statuses_seen}} -> |> Enum.map(fn {{source_domain, target_domain}, {mention_count, statuses_seen}} ->
%{ %{
source_domain: source_domain, source_domain: source_domain,
@ -209,8 +200,14 @@ defmodule Backend.Scheduler do
} }
end) end)
# Get edges and their weights
Repo.transaction(
fn ->
Edge Edge
|> Repo.insert_all(edges, timeout: :infinity) |> Repo.delete_all(timeout: :infinity)
Edge
|> Repo.insert_all(new_edges, timeout: :infinity)
end, end,
timeout: :infinity timeout: :infinity
) )
@ -266,9 +263,9 @@ defmodule Backend.Scheduler do
end end
end end
# Takes a list of Interactions and a MapSet of blocks in the form {source_domain, target_domain} # Takes a list of Interactions
# Returns a map of %{{source, target} => {total_mention_count, total_statuses_seen}} # Returns a map of %{{source, target} => {total_mention_count, total_statuses_seen}}
defp reduce_mention_count(interactions, federation_blocks) do defp combine_mention_directions(interactions) do
Enum.reduce(interactions, %{}, fn Enum.reduce(interactions, %{}, fn
%{ %{
source_domain: source_domain, source_domain: source_domain,
@ -289,46 +286,58 @@ defmodule Backend.Scheduler do
statuses_seen = source_statuses_seen + target_statuses_seen statuses_seen = source_statuses_seen + target_statuses_seen
maybe_update_map(
acc,
key,
source_domain,
target_domain,
mentions,
statuses_seen,
federation_blocks
)
end)
end
defp maybe_update_map(
acc,
key,
source_domain,
target_domain,
mentions,
statuses_seen,
federation_blocks
) do
if not MapSet.member?(federation_blocks, {source_domain, target_domain}) and
not MapSet.member?(federation_blocks, {target_domain, source_domain}) do
Map.update(acc, key, {mentions, statuses_seen}, fn {curr_mentions, curr_statuses_seen} -> Map.update(acc, key, {mentions, statuses_seen}, fn {curr_mentions, curr_statuses_seen} ->
{curr_mentions + mentions, curr_statuses_seen} {curr_mentions + mentions, curr_statuses_seen}
end) end)
end end)
end end
defp has_opposite_mention?(mention, all_mentions) do defp filter_to_eligible_interactions(interactions, federation_blocks) do
{{source_domain, target_domain}, {mention_count, _statuses_seen}} = mention # A map of {source_domain, target_domain} => mention_count. Used to find out whether a mention in the reverse
other_direction_key = {target_domain, source_domain} # direction has been seen.
mention_directions =
interactions
|> Enum.reduce(%{}, fn %{source_domain: source, target_domain: target, mentions: mentions},
acc ->
Map.put(acc, {source, target}, mentions)
end)
if mention_count > 0 and Map.has_key?(all_mentions, other_direction_key) do interactions
{other_direction_mentions, _other_statuses_seen} = |> Enum.filter(&is_eligible_interaction?(&1, mention_directions, federation_blocks))
Map.get(all_mentions, other_direction_key) end
other_direction_mentions > 0 # Returns true if
# * there's no federation block in either direction between the two instances
# * there are mentions in both directions
defp is_eligible_interaction?(
%{
source_domain: source,
target_domain: target,
mentions: mention_count,
source_type: source_type,
target_type: target_type
},
mention_directions,
federation_blocks
) do
mentions_were_seen = mention_count > 0
opposite_mention_exists =
if is_timeline_crawlable_type?(source_type) and is_timeline_crawlable_type?(target_type) do
Map.has_key?(mention_directions, {target, source}) and
Map.get(mention_directions, {target, source}) > 0
else else
false true
end end
federation_block_exists =
MapSet.member?(federation_blocks, {source, target}) or
MapSet.member?(federation_blocks, {target, source})
mentions_were_seen and opposite_mention_exists and not federation_block_exists
end
defp is_timeline_crawlable_type?(type) do
Enum.member?(["mastodon", "gab", "pleroma", "gnusocial", "misskey"], type)
end end
end end