fix edge generation, serve graph in cytoscape format

This commit is contained in:
Tao Bror Bojlén 2019-07-18 13:21:12 +03:00
parent 82677fcd32
commit 9478017eb0
No known key found for this signature in database
GPG key ID: C6EC7AAB905F9E6F
13 changed files with 166 additions and 130 deletions

View file

@ -80,7 +80,7 @@ defmodule Backend.Crawler do
# Save the state (after crawling) to the database. # Save the state (after crawling) to the database.
defp save(%Crawler{domain: domain, result: result, found_api?: true, error: nil}) do defp save(%Crawler{domain: domain, result: result, found_api?: true, error: nil}) do
now = NaiveDateTime.truncate(NaiveDateTime.utc_now(), :second) now = get_now()
## Update the instance we crawled ## ## Update the instance we crawled ##
Repo.insert!( Repo.insert!(

View file

@ -13,6 +13,8 @@ defmodule Backend.Edge do
type: :string, type: :string,
foreign_key: :target_domain foreign_key: :target_domain
field :weight, :float
timestamps() timestamps()
end end

View file

@ -13,8 +13,6 @@ defmodule Backend.InstancePeer do
type: :string, type: :string,
foreign_key: :target_domain foreign_key: :target_domain
field :weight, :float, default: 0.0
timestamps() timestamps()
end end

View file

@ -5,7 +5,8 @@ defmodule Backend.Scheduler do
use Quantum.Scheduler, otp_app: :backend use Quantum.Scheduler, otp_app: :backend
alias Backend.{Crawl, Edge, Interaction, Instance, Repo} alias Backend.{Crawl, Edge, CrawlInteraction, Instance, Repo}
import Backend.Util
import Ecto.Query import Ecto.Query
require Logger require Logger
@ -29,88 +30,136 @@ defmodule Backend.Scheduler do
Logger.info("Pruned #{deleted_num} old crawls.") Logger.info("Pruned #{deleted_num} old crawls.")
end end
@doc """
Calculates every instance's "insularity score" -- that is, the fraction of the interactions seen while crawling an
instance that are mentions among users on that same instance, rather than mentions of users on other instances.

Only crawls without an error count towards an instance's interaction total. Instances for which that total is
missing (no error-free crawl) or zero are skipped, because no score can be computed for them.

The scores are upserted into the instances table keyed on `domain`; on conflict only `insularity` and `updated_at`
are replaced. Returns the result of `Repo.insert_all/3`.
"""
def generate_insularity_scores() do
  now = get_now()

  # Total number of interactions seen per instance, summed over all error-free crawls.
  crawls_subquery =
    Crawl
    |> select([c], %{
      instance_domain: c.instance_domain,
      interactions_seen: sum(c.interactions_seen)
    })
    |> where([c], is_nil(c.error))
    |> group_by([c], c.instance_domain)

  # One row per instance that mentions itself: its self-mention count and its interaction total.
  self_mentions =
    CrawlInteraction
    |> join(:left, [ci], c in subquery(crawls_subquery),
      on: ci.source_domain == c.instance_domain
    )
    |> where([ci], ci.source_domain == ci.target_domain)
    |> group_by([ci], ci.source_domain)
    |> select([ci, c], %{
      domain: ci.source_domain,
      mentions: sum(ci.mentions),
      # we can take min() because every row is the same
      interactions: min(c.interactions_seen)
    })
    |> Repo.all()

  # Diagnostic only; the function form avoids building the string when debug logging is disabled.
  Logger.debug(fn -> "Insularity inputs: " <> inspect(self_mentions) end)

  scores =
    for %{domain: domain, mentions: mentions, interactions: interactions} <- self_mentions,
        # The left join yields nil when the instance has no error-free crawl, and the total can be 0;
        # dividing by either would raise ArithmeticError and abort the whole job.
        not is_nil(interactions) and interactions != 0 do
      %{
        domain: domain,
        insularity: mentions / interactions,
        inserted_at: now,
        updated_at: now
      }
    end

  Instance
  |> Repo.insert_all(scores,
    on_conflict: {:replace, [:insularity, :updated_at]},
    conflict_target: :domain
  )
end
@doc """ @doc """
This function aggregates statistics from the interactions in the database. This function aggregates statistics from the interactions in the database.
It calculates the strength of edges between nodes. It calculates the strength of edges between nodes.
TODO: generate edge weights. The weight of an edge between two instances will be
(number of mentions of each other) / (total number of statuses crawled).
This requires us to keep track of how many statuses we've seen.
""" """
def generate_edges() do def generate_edges() do
now = get_now()
crawls_subquery =
Crawl
|> select([c], %{
instance_domain: c.instance_domain,
statuses_seen: sum(c.statuses_seen)
})
|> where([c], is_nil(c.error))
|> group_by([c], c.instance_domain)
interactions = interactions =
Interaction CrawlInteraction
|> select([inter], {inter.source_domain, inter.target_domain}) |> join(:left, [ci], c_source in subquery(crawls_subquery),
|> join(:left, [inter], i_source in Instance, on: inter.source_domain == i_source.domain) on: ci.source_domain == c_source.instance_domain
|> join(:left, [inter], i_target in Instance, on: inter.target_domain == i_target.domain)
|> where(
[inter, i_source, i_target],
not is_nil(i_source.last_crawl_timestamp) and not is_nil(i_target.last_crawl_timestamp)
) )
# Repo.all() returns a tuple like {"mastodon.social", "cursed.technology"} |> join(:left, [ci], c_target in subquery(crawls_subquery),
on: ci.target_domain == c_target.instance_domain
)
|> group_by([ci], [ci.source_domain, ci.target_domain])
|> select([ci, c_source, c_target], %{
source_domain: ci.source_domain,
target_domain: ci.target_domain,
mentions: sum(ci.mentions),
# we can take min() because every row is the same
source_statuses_seen: min(c_source.statuses_seen),
target_statuses_seen: min(c_target.statuses_seen)
})
|> Repo.all() |> Repo.all()
# Create a map of %{source_domain => [target_domains]}
|> Enum.group_by(fn tuple -> Kernel.elem(tuple, 0) end, fn tuple ->
Kernel.elem(tuple, 1)
end)
# Calculate insularity score # Get edges and their weights
Repo.transaction(fn -> Repo.transaction(fn ->
interactions Edge
|> Enum.each(fn {source, targets} -> |> Repo.delete_all()
total_mentions = length(targets)
self_mentions = Enum.count(targets, fn t -> t == source end)
insularity = self_mentions / total_mentions
Repo.insert!(
%Instance{
domain: source,
insularity: insularity
},
on_conflict: [set: [insularity: insularity]],
conflict_target: :domain
)
end)
# Get edges
edges = MapSet.new()
interactions
|> Enum.each(fn {source, targets} ->
targets
|> Enum.each(fn target ->
[key_a, key_b] = Enum.sort([source, target])
edge = %Edge{
source_domain: key_a,
target_domain: key_b
}
MapSet.put(edges, edge)
Logger.debug(inspect(edges))
end)
end)
Logger.debug(inspect(edges))
now = NaiveDateTime.truncate(NaiveDateTime.utc_now(), :second)
Repo.delete_all(Edge)
edges = edges =
edges interactions
|> MapSet.to_list() # Get a map of %{{source, target} => {total_mention_count, total_statuses_seen}}
|> Enum.map(fn %{source_domain: source_domain, target_domain: target_domain} -> |> Enum.reduce(%{}, fn
%Edge{ %{
source_domain: source_domain, source_domain: source_domain,
target_domain: target_domain, target_domain: target_domain,
updated_at: now, mentions: mentions,
inserted_at: now source_statuses_seen: source_statuses_seen,
target_statuses_seen: target_statuses_seen
} = x,
acc ->
Logger.info(inspect(x))
key = get_interaction_key(source_domain, target_domain)
# target_statuses_seen might be nil if that instance was never crawled. default to 0.
target_statuses_seen =
case target_statuses_seen do
nil -> 0
_ -> target_statuses_seen
end
statuses_seen = source_statuses_seen + target_statuses_seen
Map.update(acc, key, {mentions, statuses_seen}, fn {curr_mentions, curr_statuses_seen} ->
{curr_mentions + mentions, curr_statuses_seen}
end)
end)
|> Enum.map(fn {{source_domain, target_domain}, {mention_count, statuses_seen}} ->
%{
source_domain: source_domain,
target_domain: target_domain,
weight: mention_count / statuses_seen,
inserted_at: now,
updated_at: now
} }
end) end)
Repo.insert_all(Edge, edges) Edge
|> Repo.insert_all(edges)
end) end)
end end
end end

View file

@ -16,21 +16,28 @@ defmodule BackendWeb.GraphView do
false -> 1 false -> 1
end end
# This is the format that cytoscape.js expects.
%{ %{
id: node.domain, data: %{
label: node.domain, id: node.domain,
size: size, label: node.domain,
x: node.x, size: size
y: node.y },
position: %{
x: node.x,
y: node.y
}
} }
end end
def render("edge.json", %{graph: edge}) do def render("edge.json", %{graph: edge}) do
%{ %{
id: edge.id, data: %{
source: edge.source_domain, id: edge.id,
target: edge.target_domain, source: edge.source_domain,
size: edge.weight target: edge.target_domain,
weight: edge.weight
}
} }
end end
end end

View file

@ -10,6 +10,9 @@ defmodule Backend.Repo.Migrations.CreateInstances do
add :version, :string add :version, :string
add :insularity, :float add :insularity, :float
add :x, :float
add :y, :float
timestamps() timestamps()
end end
@ -19,8 +22,6 @@ defmodule Backend.Repo.Migrations.CreateInstances do
add :source_domain, references(:instances, column: :domain, type: :string) add :source_domain, references(:instances, column: :domain, type: :string)
add :target_domain, references(:instances, column: :domain, type: :string) add :target_domain, references(:instances, column: :domain, type: :string)
add :weight, :float
timestamps() timestamps()
end end

View file

@ -6,6 +6,8 @@ defmodule Backend.Repo.Migrations.CreateEdges do
add :source_domain, references(:instances, column: :domain, type: :string), null: false add :source_domain, references(:instances, column: :domain, type: :string), null: false
add :target_domain, references(:instances, column: :domain, type: :string), null: false add :target_domain, references(:instances, column: :domain, type: :string), null: false
add :weight, :float, null: false
timestamps() timestamps()
end end

View file

@ -1,10 +0,0 @@
defmodule Backend.Repo.Migrations.AddInstanceCoords do
use Ecto.Migration
def change do
alter table(:instances) do
add :x, :float
add :y, :float
end
end
end

View file

@ -69,11 +69,12 @@ class GraphImpl extends React.Component<IGraphProps, IGraphState> {
// Check that all nodes have size & coordinates; otherwise the graph will look messed up // Check that all nodes have size & coordinates; otherwise the graph will look messed up
const lengthBeforeFilter = graph.nodes.length; const lengthBeforeFilter = graph.nodes.length;
graph = { ...graph, nodes: graph.nodes.filter(n => n.size && n.x && n.y) }; graph = { ...graph, nodes: graph.nodes.filter(n => n.data.size && n.position.x && n.position.y) };
if (graph.nodes.length !== lengthBeforeFilter) { if (graph.nodes.length !== lengthBeforeFilter) {
// tslint:disable-next-line:no-console // tslint:disable-next-line:no-console
console.error( console.error(
"Some nodes were missing details: " + graph.nodes.filter(n => !n.size || !n.x || !n.y).map(n => n.label) "Some nodes were missing details: " +
graph.nodes.filter(n => !n.data.size || !n.position.x || !n.position.y).map(n => n.data.label)
); );
this.setState({ didError: true }); this.setState({ didError: true });
} }
@ -125,29 +126,9 @@ class GraphImpl extends React.Component<IGraphProps, IGraphState> {
return; return;
} }
this.cy = cytoscape({ this.cy = cytoscape({
autoungrabify: true, autoungrabify: false,
container: this.cytoscapeDiv.current, container: this.cytoscapeDiv.current,
elements: { elements: graph,
edges: graph.edges.map(edge => ({
data: {
id: edge.id || `${edge.source}${edge.target}`,
source: edge.source,
target: edge.target,
weight: edge.size
},
group: "edges" as "edges"
})),
nodes: graph.nodes.map(node => ({
data: {
id: node.id
},
group: "nodes" as "nodes",
position: {
x: node.x,
y: node.y
}
}))
},
layout: { layout: {
name: "preset" name: "preset"
}, },

View file

@ -83,7 +83,7 @@ class SidebarImpl extends React.Component<ISidebarProps, ISidebarState> {
} else if ( } else if (
this.props.graph && this.props.graph &&
this.props.instanceName && this.props.instanceName &&
this.props.graph.nodes.map(n => n.id).indexOf(this.props.instanceName) < 0 this.props.graph.nodes.map(n => n.data.id).indexOf(this.props.instanceName) < 0
) { ) {
return this.renderQuietInstanceState(); return this.renderQuietInstanceState();
} }
@ -177,13 +177,15 @@ class SidebarImpl extends React.Component<ISidebarProps, ISidebarState> {
if (!this.props.graph || !this.props.instanceName) { if (!this.props.graph || !this.props.instanceName) {
return; return;
} }
const edges = this.props.graph.edges.filter(e => [e.source, e.target].indexOf(this.props.instanceName!) > -1); const edges = this.props.graph.edges.filter(
e => [e.data.source, e.data.target].indexOf(this.props.instanceName!) > -1
);
const neighbors: any[] = []; const neighbors: any[] = [];
edges.forEach(e => { edges.forEach(e => {
if (e.source === this.props.instanceName) { if (e.data.source === this.props.instanceName) {
neighbors.push({ neighbor: e.target, weight: e.size }); neighbors.push({ neighbor: e.data.target, weight: e.data.weight });
} else { } else {
neighbors.push({ neighbor: e.source, weight: e.size }); neighbors.push({ neighbor: e.data.source, weight: e.data.weight });
} }
}); });
const neighborRows = orderBy(neighbors, ["weight"], ["desc"]).map((neighborDetails: any, idx: number) => ( const neighborRows = orderBy(neighbors, ["weight"], ["desc"]).map((neighborDetails: any, idx: number) => (

View file

@ -33,19 +33,24 @@ export interface IInstanceDetails {
} }
interface IGraphNode { interface IGraphNode {
id: string; data: {
label: string; id: string;
x: number; label: string;
y: number; size: number;
size?: number; };
color?: string; position: {
x: number;
y: number;
};
} }
interface IGraphEdge { interface IGraphEdge {
source: string; data: {
target: string; source: string;
id?: string; target: string;
size?: number; id: string;
weight: number;
};
} }
export interface IGraph { export interface IGraph {

View file

@ -24,7 +24,6 @@ import java.io.IOException;
import java.sql.Connection; import java.sql.Connection;
import java.sql.PreparedStatement; import java.sql.PreparedStatement;
import java.sql.SQLException; import java.sql.SQLException;
import java.util.Arrays;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
public class GraphBuilder { public class GraphBuilder {