refactor/elixir backend

Tao Bojlén 2019-07-14 11:47:06 +00:00
parent ab8e2b09d0
commit a37452f138
109 changed files with 5339 additions and 3675 deletions

2
.dokku-monorepo Normal file
View file

@ -0,0 +1,2 @@
backend=backend
gephi=gephi

173
.gitignore vendored
View file

@ -1,93 +1,9 @@
*.csv
.idea/
backend/backend/static/
backend/static/
*.gexf
backend/whitelist.txt
data/
.vscode/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
./lib/
./lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
@ -99,15 +15,84 @@ ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
# The directory Mix will write compiled artifacts to.
/backend/_build/
# If you run "mix test --cover", coverage assets end up here.
/backend/cover/
# The directory Mix downloads your dependencies sources to.
/backend/deps/
# Where 3rd-party dependencies like ExDoc output generated docs.
/backend/doc/
# Ignore .fetch files in case you like to edit your project deps locally.
/backend/.fetch
# If the VM crashes, it generates a dump, let's ignore it too.
erl_crash.dump
# Also ignore archive artifacts (built via "mix archive.build").
*.ez
# Ignore package tarball (built via "mix hex.build").
backend-*.tar
# Since we are building assets from assets/,
# we ignore priv/static. You may want to comment
# this depending on your deployment strategy.
/backend/priv/static/
# Files matching config/*.secret.exs pattern contain sensitive
# data and you should not commit them into version control.
#
# Alternatively, you may comment the line below and commit the
# secrets files as long as you replace their contents by environment
# variables.
/backend/config/*.secret.exs
/backend/.elixir_ls/
*.pot
*.po
# dependencies
/frontend/node_modules
# testing
/frontend/coverage
# production
/frontend/build
# misc
.DS_Store
.env.local
.env.development.local
.env.test.local
.env.production.local
npm-debug.log*
yarn-debug.log*
yarn-error.log*
/gephi/.gradle/
/gephi/build/
/gephi/lib/*
!/gephi/lib/.gitkeep
# 64MB file but I don't have much faith that it'll remain available...
!/gephi/lib/gephi-toolkit-0.9.2.jar
*/.idea/
# Ignore Gradle GUI config
/gephi/gradle-app.setting
# Avoid ignoring Gradle wrapper jar file (.jar files are usually ignored)
!/gephi/gradle-wrapper.jar
# Cache of project
/gephi/.gradletasknamecache
*.javac

9
backend/.dockerignore Normal file
View file

@ -0,0 +1,9 @@
_build/
deps/
.git/
.gitignore
Dockerfile
Makefile
README*
test/
priv/static/

5
backend/.formatter.exs Normal file
View file

@ -0,0 +1,5 @@
[
import_deps: [:ecto, :phoenix],
inputs: ["*.{ex,exs}", "priv/*/seeds.exs", "{config,lib,test}/**/*.{ex,exs}"],
subdirectories: ["priv/*/migrations"]
]

View file

@ -1,12 +1,53 @@
FROM python:3
ENV PYTHONUNBUFFERED 1
RUN apt-get update && \
apt-get install -qqy --no-install-recommends \
postgresql-client-9.6=9.6.10-0+deb9u1
RUN mkdir /code
WORKDIR /code
COPY requirements.txt /code/
RUN pip install -r requirements.txt
COPY . /code/
FROM elixir:1.9.0-alpine as build
# install build dependencies
RUN apk add --update git build-base
# prepare build dir
RUN mkdir /app
WORKDIR /app
# install hex + rebar
RUN mix local.hex --force && \
mix local.rebar --force
# set build ENV
ENV MIX_ENV=prod
# install mix dependencies
COPY mix.exs mix.lock ./
COPY config config
RUN mix deps.get
RUN mix deps.compile
# build assets
# COPY assets assets
# RUN cd assets && npm install && npm run deploy
# RUN mix phx.digest
# build project
COPY priv priv
COPY lib lib
RUN mix compile
# build release
COPY rel rel
RUN mix release
# prepare release image
FROM alpine:3.9 AS app
RUN apk add --update bash openssl
RUN mkdir /app
WORKDIR /app
ENV APP_NAME=backend
COPY --from=build /app/_build/prod/rel/${APP_NAME} ./
RUN chown -R nobody: /app
USER nobody
ENV HOME=/app
# The command to start the backend
CMD trap 'exit' INT; ${HOME}/bin/${APP_NAME} start

33
backend/README.md Normal file
View file

@ -0,0 +1,33 @@
# fediverse.space backend
## Notes
- This project requires Elixir >= 1.9.
- Run with `SKIP_CRAWL=true` to just run the server (useful for working on the API without also crawling)
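As a rough illustration of the `SKIP_CRAWL` note above (the flag is read in `Backend.Application.start/2` later in this commit; the command is just the standard dev-server invocation from further down):

    # serve the API without starting the crawler
    SKIP_CRAWL=true mix phx.server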
## Deployment
Deployment with Docker is handled as per the [Distillery docs](https://hexdocs.pm/distillery/guides/working_with_docker.html).
- To build a new version, run `make build` in this directory.
- To migrate a released version, run `./backend eval "Backend.Release.migrate"`
# Default README
To start your Phoenix server:
- Install dependencies with `mix deps.get`
- Create and migrate your database with `mix ecto.setup`
- Start Phoenix endpoint with `mix phx.server`
Now you can visit [`localhost:4000`](http://localhost:4000) from your browser.
Ready to run in production? Please [check our deployment guides](https://hexdocs.pm/phoenix/deployment.html).
## Learn more
- Official website: http://www.phoenixframework.org/
- Guides: https://hexdocs.pm/phoenix/overview.html
- Docs: https://hexdocs.pm/phoenix
- Mailing list: http://groups.google.com/group/phoenix-talk
- Source: https://github.com/phoenixframework/phoenix

View file

@ -1,8 +0,0 @@
def to_representation(self, instance):
"""
Object instance -> Dict of primitive datatypes.
We use a custom to_representation function to exclude empty fields in the serialized JSON.
"""
ret = super(InstanceListSerializer, self).to_representation(instance)
ret = OrderedDict(list(filter(lambda x: x[1], ret.items())))
return ret

View file

@ -1,5 +0,0 @@
from django.apps import AppConfig
class Apiv1Config(AppConfig):
name = 'apiv1'

View file

@ -1,105 +0,0 @@
from rest_framework import serializers
import math
from collections import OrderedDict
from scraper.models import Instance, Edge
class InstanceListSerializer(serializers.ModelSerializer):
"""
Minimal instance details used in the full list of instances.
"""
class Meta:
model = Instance
fields = ('name', 'user_count')
def to_representation(self, instance):
"""
Object instance -> Dict of primitive datatypes.
We use a custom to_representation function to exclude empty fields in the serialized JSON.
"""
ret = super(InstanceListSerializer, self).to_representation(instance)
ret = OrderedDict(list(filter(lambda x: x[1], ret.items())))
return ret
class InstanceDetailSerializer(serializers.ModelSerializer):
"""
Detailed instance view.
"""
userCount = serializers.SerializerMethodField()
statusCount = serializers.SerializerMethodField()
domainCount = serializers.SerializerMethodField()
lastUpdated = serializers.SerializerMethodField()
peers = InstanceListSerializer(many=True, read_only=True)
def get_userCount(self, obj):
return obj.user_count
def get_statusCount(self, obj):
return obj.status_count
def get_domainCount(self, obj):
return obj.domain_count
def get_lastUpdated(self, obj):
return obj.last_updated
class Meta:
model = Instance
fields = ('name', 'description', 'version', 'userCount',
'statusCount', 'domainCount', 'peers', 'lastUpdated',
'status')
class EdgeSerializer(serializers.ModelSerializer):
"""
Used for displaying the graph.
"""
id = serializers.SerializerMethodField('get_pk')
size = serializers.SerializerMethodField('get_weight')
class Meta:
model = Edge
fields = ('source', 'target', 'id', 'size')
def get_pk(self, obj):
return obj.pk
def get_weight(self, obj):
return obj.weight
class NodeSerializer(serializers.ModelSerializer):
"""
Used for displaying the graph.
"""
id = serializers.SerializerMethodField('get_name')
label = serializers.SerializerMethodField('get_name')
size = serializers.SerializerMethodField()
x = serializers.SerializerMethodField()
y = serializers.SerializerMethodField()
class Meta:
model = Instance
fields = ('id', 'label', 'size', 'x', 'y')
def get_name(self, obj):
return obj.name
def get_size(self, obj):
return math.log(obj.user_count) if (obj.user_count and (obj.user_count > 1)) else 1
def get_x(self, obj):
return obj.x_coord
def get_y(self, obj):
return obj.y_coord
def to_representation(self, instance):
"""
Object instance -> Dict of primitive datatypes.
We use a custom to_representation function to exclude empty fields in the serialized JSON.
"""
ret = super(NodeSerializer, self).to_representation(instance)
ret = OrderedDict(list(filter(lambda x: x[1], ret.items())))
return ret

View file

@ -1,37 +0,0 @@
from rest_framework import viewsets
from scraper.models import Instance, Edge
from apiv1.serializers import InstanceListSerializer, InstanceDetailSerializer, NodeSerializer, EdgeSerializer
class InstanceViewSet(viewsets.ReadOnlyModelViewSet):
"""API endpoint to view stats for, and the peers of, an instance"""
lookup_field = 'name'
lookup_value_regex = '[a-zA-Z0-9-_\.]+'
queryset = Instance.objects.all()
serializer_class = InstanceListSerializer
detail_serializer_class = InstanceDetailSerializer # this serializer also includes stats and a list of peers
def get_serializer_class(self):
if self.action == 'retrieve':
if hasattr(self, 'detail_serializer_class'):
return self.detail_serializer_class
return self.serializer_class
class EdgeView(viewsets.ReadOnlyModelViewSet):
"""
Endpoint to get a list of the graph's edges in a SigmaJS-friendly format.
"""
queryset = Edge.objects.all()
serializer_class = EdgeSerializer
class NodeView(viewsets.ReadOnlyModelViewSet):
"""
Endpoint to get a list of the graph's nodes in a SigmaJS-friendly format.
"""
queryset = Instance.objects.filter(status='success', x_coord__isnull=False, y_coord__isnull=False, user_count__isnull=False)\
.exclude(sources__isnull=True, targets__isnull=True)
serializer_class = NodeSerializer

View file

@ -1,124 +0,0 @@
"""
Django settings for backend project.
Generated by 'django-admin startproject' using Django 2.1.
For more information on this file, see
https://docs.djangoproject.com/en/2.1/topics/settings/
For the full list of settings and their values, see
https://docs.djangoproject.com/en/2.1/ref/settings/
"""
import os
import json
from django.core.exceptions import ImproperlyConfigured
SECRET_KEY = os.getenv("SECRET_KEY")
# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Application definition
INSTALLED_APPS = [
'django.contrib.admin',
'django.contrib.auth',
'django.contrib.contenttypes',
'django.contrib.sessions',
'django.contrib.messages',
'django.contrib.staticfiles',
'rest_framework',
'silk',
'corsheaders',
'scraper.apps.ScraperConfig',
'apiv1.apps.Apiv1Config',
]
MIDDLEWARE = [
'corsheaders.middleware.CorsMiddleware',
'django.middleware.security.SecurityMiddleware',
'django.contrib.sessions.middleware.SessionMiddleware',
'django.middleware.common.CommonMiddleware',
'django.middleware.csrf.CsrfViewMiddleware',
'django.contrib.auth.middleware.AuthenticationMiddleware',
'django.contrib.messages.middleware.MessageMiddleware',
'django.middleware.clickjacking.XFrameOptionsMiddleware',
'silk.middleware.SilkyMiddleware',
]
ROOT_URLCONF = 'backend.urls'
TEMPLATES = [
{
'BACKEND': 'django.template.backends.django.DjangoTemplates',
'DIRS': [os.path.join(BASE_DIR, '../../frontend/build')],
'APP_DIRS': True,
'OPTIONS': {
'context_processors': [
'django.template.context_processors.debug',
'django.template.context_processors.request',
'django.contrib.auth.context_processors.auth',
'django.contrib.messages.context_processors.messages',
],
},
},
]
WSGI_APPLICATION = 'backend.wsgi.application'
# Database
# https://docs.djangoproject.com/en/2.1/ref/settings/#databases
DATABASES = {
'default': {
'ENGINE': 'django.db.backends.postgresql',
'NAME': os.getenv("POSTGRES_DB"),
'USER': os.getenv("POSTGRES_USER"),
'PASSWORD': os.getenv("POSTGRES_PASSWORD"),
'HOST': 'db',
'PORT': 5432,
}
}
# Password validation
# https://docs.djangoproject.com/en/2.1/ref/settings/#auth-password-validators
AUTH_PASSWORD_VALIDATORS = [
{
'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
},
]
# Internationalization
# https://docs.djangoproject.com/en/2.1/topics/i18n/
LANGUAGE_CODE = 'en-us'
TIME_ZONE = 'UTC'
USE_I18N = True
USE_L10N = True
USE_TZ = False
# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/2.1/howto/static-files/
STATIC_URL = '/static/'
STATICFILES_DIRS = []
STATIC_ROOT = os.path.join(BASE_DIR, 'static')

View file

@ -1,7 +0,0 @@
from .base import *
DEBUG = True
ALLOWED_HOSTS = ['localhost']
CORS_ORIGIN_ALLOW_ALL = True

View file

@ -1,10 +0,0 @@
from .base import *
DEBUG = False
ALLOWED_HOSTS = ['backend.fediverse.space']
CORS_ORIGIN_REGEX_WHITELIST = [
r'^(https?:\/\/)?(\w+\.)?(.*)?fediverse-space\.netlify\.com\/?$',
r'^(https?:\/\/)?(\w+\.)?(.*)?fediverse\.space\/?$',
]

View file

@ -1,37 +0,0 @@
"""backend URL Configuration
The `urlpatterns` list routes URLs to views. For more information please see:
https://docs.djangoproject.com/en/2.1/topics/http/urls/
Examples:
Function views
1. Add an import: from my_app import views
2. Add a URL to urlpatterns: path('', views.home, name='home')
Class-based views
1. Add an import: from other_app.views import Home
2. Add a URL to urlpatterns: path('', Home.as_view(), name='home')
Including another URLconf
1. Import the include() function: from django.urls import include, path
2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
"""
from django.urls import path, include
from django.views.generic import TemplateView
from rest_framework import routers
from apiv1 import views
class OptionalTrailingSlashRouter(routers.DefaultRouter):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.trailing_slash = r'/?'
router = OptionalTrailingSlashRouter()
router.register(r'instances', views.InstanceViewSet)
router.register(r'graph/nodes', views.NodeView)
router.register(r'graph/edges', views.EdgeView, base_name='edge')
urlpatterns = [
path('api/v1/', include(router.urls)),
path('silk/', include('silk.urls', namespace='silk')),
]

View file

@ -1,13 +0,0 @@
"""
WSGI config for backend project.
It exposes the WSGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/2.1/howto/deployment/wsgi/
"""
import os
from django.core.wsgi import get_wsgi_application
application = get_wsgi_application()

51
backend/config/config.exs Normal file
View file

@ -0,0 +1,51 @@
# This file is responsible for configuring your application
# and its dependencies with the aid of the Mix.Config module.
#
# This configuration file is loaded before any dependency and
# is restricted to this project.
# General application configuration
import Config
config :backend,
ecto_repos: [Backend.Repo]
# Configures the endpoint
config :backend, BackendWeb.Endpoint,
url: [host: "localhost"],
secret_key_base: "XL4NKGBN9lZMrQbMEI1KJOlwAt8S7younVJl90TdAgzmwyapr3g7BRYSNYvX0sZ9",
render_errors: [view: BackendWeb.ErrorView, accepts: ~w(json)],
pubsub: [name: Backend.PubSub, adapter: Phoenix.PubSub.PG2]
config :backend, Backend.Repo, queue_target: 5000
# Configures Elixir's Logger
config :logger, :console,
format: "$time $metadata[$level] $message\n",
metadata: [:request_id]
# Use Jason for JSON parsing in Phoenix
config :phoenix, :json_library, Jason
config :backend, :crawler,
status_age_limit_days: 28,
status_count_limit: 5000,
personal_instance_threshold: 10,
crawl_interval_mins: 30,
crawl_workers: 50,
blacklist: [
"gab.best"
],
user_agent: "fediverse.space crawler"
config :backend, Backend.Scheduler,
jobs: [
# At midnight every day
{"@daily", {Backend.Scheduler, :prune_crawls, [1, "month"]}},
# 00.15 daily
{"15 0 * * *", {Backend.Scheduler, :generate_edges, []}}
]
# Import environment specific config. This must remain at the bottom
# of this file so it overrides the configuration defined above.
import_config "#{Mix.env()}.exs"
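The `:crawler` keyword list above is plain application config; as a minimal sketch (mirroring `Backend.Util.get_config/1` further down in this commit), a value is read at runtime like so:

    # 30 with the base config above; dev.exs overrides it to 1
    Application.get_env(:backend, :crawler)[:crawl_interval_mins]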

72
backend/config/dev.exs Normal file
View file

@ -0,0 +1,72 @@
import Config
# For development, we disable any cache and enable
# debugging and code reloading.
#
# The watchers configuration can be used to run external
# watchers to your application. For example, we use it
# with webpack to recompile .js and .css sources.
config :backend, BackendWeb.Endpoint,
http: [port: 4000],
debug_errors: true,
code_reloader: true,
check_origin: false,
watchers: []
# ## SSL Support
#
# In order to use HTTPS in development, a self-signed
# certificate can be generated by running the following
# Mix task:
#
# mix phx.gen.cert
#
# Note that this task requires Erlang/OTP 20 or later.
# Run `mix help phx.gen.cert` for more information.
#
# The `http:` config above can be replaced with:
#
# https: [
# port: 4001,
# cipher_suite: :strong,
# keyfile: "priv/cert/selfsigned_key.pem",
# certfile: "priv/cert/selfsigned.pem"
# ],
#
# If desired, both `http:` and `https:` keys can be
# configured to run both http and https servers on
# different ports.
# Do not include metadata nor timestamps in development logs
config :logger, :console, format: "[$level] $message\n"
# Set a higher stacktrace during development. Avoid configuring such
# in production as building large stacktraces may be expensive.
config :phoenix, :stacktrace_depth, 20
# Initialize plugs at runtime for faster development compilation
config :phoenix, :plug_init_mode, :runtime
# Configure your database
config :backend, Backend.Repo,
username: "postgres",
password: "postgres",
database: "backend_dev",
hostname: "localhost",
pool_size: 10
config :backend, :crawler,
status_age_limit_days: 28,
status_count_limit: 100,
personal_instance_threshold: 1,
crawl_interval_mins: 1,
crawl_workers: 10,
blacklist: [
"gab.best"
]
config :backend, Backend.Scheduler,
jobs: [
# Every 15 minutes
{"*/15 * * * *", {Backend.Scheduler, :prune_crawls, [12, "hour"]}}
]

57
backend/config/prod.exs Normal file
View file

@ -0,0 +1,57 @@
import Config
# Do not print debug messages in production
config :logger, level: :info
# ## SSL Support
#
# To get SSL working, you will need to add the `https` key
# to the previous section and set your `:url` port to 443:
#
# config :backend, BackendWeb.Endpoint,
# ...
# url: [host: "example.com", port: 443],
# https: [
# :inet6,
# port: 443,
# cipher_suite: :strong,
# keyfile: System.get_env("SOME_APP_SSL_KEY_PATH"),
# certfile: System.get_env("SOME_APP_SSL_CERT_PATH")
# ]
#
# The `cipher_suite` is set to `:strong` to support only the
# latest and more secure SSL ciphers. This means old browsers
# and clients may not be supported. You can set it to
# `:compatible` for wider support.
#
# `:keyfile` and `:certfile` expect an absolute path to the key
# and cert in disk or a relative path inside priv, for example
# "priv/ssl/server.key". For all supported SSL configuration
# options, see https://hexdocs.pm/plug/Plug.SSL.html#configure/1
#
# We also recommend setting `force_ssl` in your endpoint, ensuring
# no data is ever sent via http, always redirecting to https:
#
# config :backend, BackendWeb.Endpoint,
# force_ssl: [hsts: true]
#
# Check `Plug.SSL` for all available options in `force_ssl`.
# ## Using releases (distillery)
#
# If you are doing OTP releases, you need to instruct Phoenix
# to start the server for all endpoints:
#
# config :phoenix, :serve_endpoints, true
#
# Alternatively, you can configure exactly which server to
# start per endpoint:
#
# config :backend, BackendWeb.Endpoint, server: true
#
# Note you can't rely on `System.get_env/1` when using releases.
# See the releases documentation accordingly.
# Finally import the config/prod.secret.exs which should be versioned
# separately.
# import_config "prod.secret.exs"

View file

@ -0,0 +1,27 @@
# This file is for *runtime configuration in releases* only.
# https://hexdocs.pm/phoenix/releases.html#runtime-configuration
import Config
# For production, don't forget to configure the url host
# to something meaningful, Phoenix uses this information
# when generating URLs.
config :backend, Backend.Repo,
# username: System.get_env("POSTGRES_USER"),
# password: System.get_env("POSTGRES_PASSWORD"),
# database: System.get_env("POSTGRES_DB"),
# hostname: System.get_env("POSTGRES_HOSTNAME"),
# DATABASE_URL is expected to be a full connection string, e.g. ecto://USER:PASS@HOST/DATABASE
url: System.get_env("DATABASE_URL"),
pool_size: String.to_integer(System.get_env("POOL_SIZE") || "10"),
ssl: true
# show_sensitive_data_on_connection_error: true
port = String.to_integer(System.get_env("PORT") || "4000")
config :backend, BackendWeb.Endpoint,
http: [:inet6, port: port],
url: [host: System.get_env("BACKEND_HOSTNAME"), port: port],
root: ".",
secret_key_base: System.get_env("SECRET_KEY_BASE"),
server: true
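Assuming the environment variable names read above, a release could be booted roughly as follows (all values are placeholders, not part of the commit):

    export DATABASE_URL="ecto://postgres:postgres@localhost/backend_prod"
    export SECRET_KEY_BASE="$(mix phx.gen.secret)"
    export BACKEND_HOSTNAME="backend.fediverse.space"
    export PORT=4000
    _build/prod/rel/backend/bin/backend start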

18
backend/config/test.exs Normal file
View file

@ -0,0 +1,18 @@
import Config
# We don't run a server during test. If one is required,
# you can enable the server option below.
config :backend, BackendWeb.Endpoint,
http: [port: 4002],
server: false
# Print only warnings and errors during test
config :logger, level: :warn
# Configure your database
config :backend, Backend.Repo,
username: "postgres",
password: "postgres",
database: "backend_test",
hostname: "localhost",
pool: Ecto.Adapters.SQL.Sandbox

9
backend/lib/backend.ex Normal file
View file

@ -0,0 +1,9 @@
defmodule Backend do
@moduledoc """
Backend keeps the contexts that define your domain
and business logic.
Contexts are also responsible for managing your data, regardless
if it comes from the database, an external API or others.
"""
end

View file

@ -0,0 +1,68 @@
defmodule Backend.Api do
alias Backend.{Crawl, Edge, Instance, Repo}
import Ecto.Query
@spec list_instances() :: [Instance.t()]
def list_instances() do
Instance
|> Repo.all()
end
@spec get_instance!(String.t()) :: Instance.t()
def get_instance!(domain) do
Instance
|> preload(:peers)
|> Repo.get_by!(domain: domain)
end
@doc """
Returns a list of instances that
* have at least one successful crawl
* have a user count (required to give the instance a size on the graph)
"""
@spec list_nodes() :: [Instance.t()]
def list_nodes() do
crawl_subquery =
Crawl
|> select([c], %{
instance_domain: c.instance_domain,
crawl_count: count(c.id)
})
|> where([c], is_nil(c.error))
|> group_by([c], c.instance_domain)
Instance
|> join(:inner, [i], c in subquery(crawl_subquery), on: i.domain == c.instance_domain)
|> where(
[i, c],
c.crawl_count > 0 and not is_nil(i.user_count) and not is_nil(i.x) and not is_nil(i.y)
)
|> select([c], [:domain, :user_count, :x, :y])
|> Repo.all()
end
@spec list_edges() :: [Edge.t()]
def list_edges() do
crawl_subquery =
Crawl
|> select([c], %{
instance_domain: c.instance_domain,
crawl_count: count(c.id)
})
|> where([c], is_nil(c.error))
|> group_by([c], c.instance_domain)
Edge
|> join(:inner, [e], c1 in subquery(crawl_subquery), on: e.source_domain == c1.instance_domain)
|> join(:inner, [e], c2 in subquery(crawl_subquery), on: e.target_domain == c2.instance_domain)
|> join(:inner, [e], i1 in Instance, on: e.source_domain == i1.domain)
|> join(:inner, [e], i2 in Instance, on: e.target_domain == i2.domain)
|> select([e], [:id, :source_domain, :target_domain, :weight])
|> where(
[e, c1, c2, i1, i2],
c1.crawl_count > 0 and c2.crawl_count > 0 and not is_nil(i1.x) and not is_nil(i1.y) and
not is_nil(i2.x) and not is_nil(i2.y) and e.source_domain != e.target_domain
)
|> Repo.all()
end
end

View file

@ -0,0 +1,46 @@
defmodule Backend.Application do
# See https://hexdocs.pm/elixir/Application.html
# for more information on OTP Applications
@moduledoc false
use Application
require Logger
import Backend.Util
def start(_type, _args) do
crawl_worker_count = get_config(:crawl_workers)
children = [
# Start the Ecto repository
Backend.Repo,
# Start the endpoint when the application starts
BackendWeb.Endpoint,
# Crawler children
:hackney_pool.child_spec(:crawler, timeout: 15000, max_connections: crawl_worker_count),
{Task,
fn ->
Honeydew.start_queue(:crawl_queue, failure_mode: Honeydew.FailureMode.Abandon)
Honeydew.start_workers(:crawl_queue, Backend.Crawler, num: crawl_worker_count)
end},
Backend.Scheduler
]
children =
case Enum.member?(["true", 1, "1"], System.get_env("SKIP_CRAWL")) do
true -> children
false -> children ++ [Backend.Crawler.StaleInstanceManager]
end
# See https://hexdocs.pm/elixir/Supervisor.html
# for other strategies and supported options
opts = [strategy: :one_for_one, name: Backend.Supervisor]
Supervisor.start_link(children, opts)
end
# Tell Phoenix to update the endpoint configuration
# whenever the application is updated.
def config_change(changed, _new, removed) do
BackendWeb.Endpoint.config_change(changed, removed)
:ok
end
end

View file

@ -0,0 +1,26 @@
defmodule Backend.Crawl do
use Ecto.Schema
import Ecto.Changeset
schema "crawls" do
belongs_to :instance, Backend.Instance,
references: :domain,
type: :string,
foreign_key: :instance_domain
field :interactions_seen, :integer
field :statuses_seen, :integer
# if something went wrong, otherwise null
field :error, :string
timestamps()
end
@doc false
def changeset(crawl, attrs) do
crawl
|> cast(attrs, [:instance, :statuses_seen, :interactions_seen, :error])
|> validate_required([:instance])
end
end

View file

@ -0,0 +1,29 @@
defmodule Backend.CrawlInteraction do
use Ecto.Schema
import Ecto.Changeset
schema "crawl_interactions" do
belongs_to :crawl, Backend.Crawl
belongs_to :source, Backend.Instance,
references: :domain,
type: :string,
foreign_key: :source_domain
belongs_to :target, Backend.Instance,
references: :domain,
type: :string,
foreign_key: :target_domain
field :mentions, :integer
timestamps()
end
@doc false
def changeset(crawl_interaction, attrs) do
crawl_interaction
|> cast(attrs, [:crawl, :source, :target, :mentions])
|> validate_required([:crawl, :source, :target, :mentions])
end
end

View file

@ -0,0 +1,45 @@
defmodule Backend.Crawler.ApiCrawler do
@moduledoc """
This module is a specification. Crawlers for all instance types must implement its behaviour.
Make sure to respect the following:
* You must adhere to the following configuration values:
* `:status_age_limit_days` specifies that you must only crawl statuses from the most recent N days
* `:status_count_limit` specifies the max number of statuses to crawl in one go
* `:personal_instance_threshold` specifies that instances with fewer than this number of users should not be crawled
* statuses from accounts with the string "nobot" (case insensitive) in their profile must not be included in any stats
* Make sure to check the most recent crawl of the instance so you don't re-crawl old statuses
"""
# {domain_mentioned, count}
@type instance_interactions :: %{String.t() => integer}
defstruct [
:version,
:description,
:user_count,
:status_count,
:peers,
:interactions,
:statuses_seen
]
@type t() :: %__MODULE__{
version: String.t(),
description: String.t(),
user_count: integer,
status_count: integer,
peers: [String.t()],
interactions: instance_interactions,
statuses_seen: integer
}
@doc """
Check whether the instance at the given domain is of the type that this ApiCrawler implements.
"""
@callback is_instance_type?(String.t()) :: boolean()
@doc """
Crawl the instance at the given domain.
"""
@callback crawl(String.t()) :: t()
end
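As a hypothetical illustration of the behaviour above (not part of this commit), a do-nothing crawler would look roughly like this; the real Mastodon implementation follows in the next file:

    defmodule Backend.Crawler.Crawlers.Noop do
      alias Backend.Crawler.ApiCrawler
      @behaviour ApiCrawler

      # Never claims a domain, so this crawler is never selected.
      @impl ApiCrawler
      def is_instance_type?(_domain), do: false

      # Returns an empty result in the shape declared by ApiCrawler.t().
      @impl ApiCrawler
      def crawl(_domain) do
        %ApiCrawler{
          version: "0.0.0",
          description: "",
          user_count: 0,
          status_count: 0,
          peers: [],
          interactions: %{},
          statuses_seen: 0
        }
      end
    end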

View file

@ -0,0 +1,196 @@
defmodule Backend.Crawler do
@moduledoc """
This module crawls instances. Run `run(domain)` to crawl a given domain.
"""
alias __MODULE__
alias Backend.Crawler.Crawlers.Mastodon
alias Backend.Crawler.ApiCrawler
alias Backend.{Crawl, CrawlInteraction, Repo, Instance, InstancePeer}
import Ecto.Query
import Backend.Util
require Logger
defstruct [
# the instance domain (a string)
:domain,
# a list of ApiCrawlers that will be attempted
:api_crawlers,
:found_api?,
:result,
:error
]
@type t() :: %__MODULE__{
domain: String.t(),
api_crawlers: [ApiCrawler.t()],
found_api?: boolean,
result: ApiCrawler.t() | nil,
error: String.t() | nil
}
def run(domain) do
Logger.info("Crawling #{domain}...")
HTTPoison.start()
state = %Crawler{domain: domain, api_crawlers: [], found_api?: false, result: nil, error: nil}
state
# register APICrawlers here
|> register(Mastodon)
# go!
|> crawl()
|> save()
end
# Adds a new ApiCrawler that run/1 will check.
defp register(%Crawler{api_crawlers: crawlers} = state, api_crawler) do
Map.put(state, :api_crawlers, [api_crawler | crawlers])
end
# Recursive function to check whether `domain` has an API that the head of the api_crawlers list can read.
# If so, crawls it. If not, continues with the tail of the api_crawlers list.
defp crawl(%Crawler{api_crawlers: [], domain: domain} = state) do
Logger.debug("Found no compatible API for #{domain}")
Map.put(state, :found_api?, false)
end
defp crawl(%Crawler{domain: domain, api_crawlers: [curr | remaining_crawlers]} = state) do
if curr.is_instance_type?(domain) do
Logger.debug("Found #{curr} instance")
state = Map.put(state, :found_api?, true)
try do
%Crawler{state | result: curr.crawl(domain), api_crawlers: []}
rescue
e in HTTPoison.Error ->
Map.put(state, :error, "HTTPoison error: " <> HTTPoison.Error.message(e))
e in Jason.DecodeError ->
Map.put(state, :error, "Jason DecodeError: " <> Jason.DecodeError.message(e))
e in _ ->
Map.put(state, :error, "Unknown error: " <> inspect(e))
end
else
# Nothing found so check the next APICrawler
Logger.debug("#{domain} is not an instance of #{curr}")
crawl(%Crawler{state | api_crawlers: remaining_crawlers})
end
end
# Save the state (after crawling) to the database.
defp save(%Crawler{domain: domain, result: result, found_api?: true, error: nil}) do
now = NaiveDateTime.truncate(NaiveDateTime.utc_now(), :second)
## Update the instance we crawled ##
Repo.insert!(
%Instance{
domain: domain,
description: result.description,
version: result.version,
user_count: result.user_count,
status_count: result.status_count
},
on_conflict: [
set: [
description: result.description,
version: result.version,
user_count: result.user_count,
status_count: result.status_count,
updated_at: now
]
],
conflict_target: :domain
)
# Save details of a new crawl
curr_crawl =
Repo.insert!(%Crawl{
instance_domain: domain,
interactions_seen:
result.interactions |> Map.values() |> Enum.reduce(0, fn count, acc -> count + acc end),
statuses_seen: result.statuses_seen
})
# We get a list of peers from two places:
# * the official peers endpoint (which may be disabled)
# * the interactions
peers_domains =
result.interactions
|> Map.keys()
|> list_union(result.peers)
|> Enum.filter(fn domain -> not is_blacklisted?(domain) end)
peers =
peers_domains
|> Enum.map(&%{domain: &1, inserted_at: now, updated_at: now})
Instance
|> Repo.insert_all(peers, on_conflict: :nothing, conflict_target: :domain)
Repo.transaction(fn ->
## Save peer relationships ##
# get current peers (a list of strings)
current_peers =
InstancePeer
|> where(source_domain: ^domain)
|> select([p], p.target_domain)
|> Repo.all()
wanted_peers_set = MapSet.new(peers_domains)
current_peers_set = MapSet.new(current_peers)
# delete the peers we don't want
dont_want = current_peers_set |> MapSet.difference(wanted_peers_set) |> MapSet.to_list()
if length(dont_want) > 0 do
InstancePeer
|> where(source_domain: ^domain)
|> where([p], p.target_domain in ^dont_want)
|> Repo.delete_all([])
end
# insert the ones we don't have yet
new_instance_peers =
wanted_peers_set
|> MapSet.difference(current_peers_set)
|> MapSet.to_list()
|> Enum.map(
&%{
source_domain: domain,
target_domain: &1,
inserted_at: now,
updated_at: now
}
)
InstancePeer
|> Repo.insert_all(new_instance_peers)
end)
## Save interactions ##
interactions =
result.interactions
|> Enum.filter(fn {target_domain, _count} -> not is_blacklisted?(target_domain) end)
|> Enum.map(fn {target_domain, count} ->
%{
crawl_id: curr_crawl.id,
source_domain: domain,
target_domain: target_domain,
mentions: count,
inserted_at: now,
updated_at: now
}
end)
CrawlInteraction
|> Repo.insert_all(interactions)
end
defp save(%{domain: domain, error: error}) do
Repo.insert!(%Crawl{
instance_domain: domain,
error: error
})
end
end
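Per the moduledoc at the top of this module, the whole pipeline can be exercised from iex (the domain is only an example):

    iex> Backend.Crawler.run("mastodon.social")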

View file

@ -0,0 +1,193 @@
defmodule Backend.Crawler.Crawlers.Mastodon do
require Logger
import Backend.Crawler.Util
alias Backend.Crawler.ApiCrawler
@behaviour ApiCrawler
@impl ApiCrawler
def is_instance_type?(domain) do
case get("https://#{domain}/api/v1/instance") do
{:ok, response} -> if is_http_200?(response), do: has_title?(response.body), else: false
{:error, _error} -> false
end
end
@impl ApiCrawler
def crawl(domain) do
instance = Jason.decode!(get!("https://#{domain}/api/v1/instance").body)
if get_in(instance, ["stats", "user_count"]) > get_config(:personal_instance_threshold) do
crawl_large_instance(domain, instance)
else
Map.merge(
Map.merge(
Map.take(instance, ["version", "description"]),
Map.take(instance["stats"], ["user_count", "status_count"])
)
|> Map.new(fn {k, v} -> {String.to_atom(k), v} end),
%{peers: [], interactions: %{}, statuses_seen: 0}
)
end
end
@spec crawl_large_instance(String.t(), any()) :: ApiCrawler.t()
defp crawl_large_instance(domain, instance) do
# servers may not publish peers
peers =
case get("https://#{domain}/api/v1/instance/peers") do
{:ok, response} -> if is_http_200?(response), do: Jason.decode!(response.body), else: []
{:error, _error} -> []
end
Logger.debug("Found #{length(peers)} peers.")
{interactions, statuses_seen} = get_interactions(domain)
Logger.debug(
"#{domain}: found #{
interactions |> Map.values() |> Enum.reduce(0, fn count, acc -> count + acc end)
} mentions in #{statuses_seen} statuses."
)
Map.merge(
Map.merge(
Map.take(instance, ["version", "description"]),
Map.take(instance["stats"], ["user_count", "status_count"])
)
|> Map.new(fn {k, v} -> {String.to_atom(k), v} end),
%{peers: peers, interactions: interactions, statuses_seen: statuses_seen}
)
end
@spec get_interactions(
String.t(),
String.t() | nil,
Calendar.naive_datetime() | nil,
ApiCrawler.instance_interactions(),
integer
) :: {ApiCrawler.instance_interactions(), integer}
defp get_interactions(
domain,
max_id \\ nil,
min_timestamp \\ nil,
interactions \\ %{},
statuses_seen \\ 0
) do
# If `statuses_seen == 0`, it's the first call of this function, which means we want to query the database for the
# most recent status we have.
min_timestamp =
if statuses_seen == 0 do
get_last_successful_crawl_timestamp(domain)
else
min_timestamp
end
endpoint = "https://#{domain}/api/v1/timelines/public?local=true"
endpoint =
if max_id do
endpoint <> "&max_id=#{max_id}"
else
endpoint
end
Logger.debug("Crawling #{endpoint}")
statuses =
endpoint
|> get!()
|> Map.get(:body)
|> Jason.decode!()
filtered_statuses =
statuses
|> Enum.filter(fn s -> is_after?(s["created_at"], min_timestamp) end)
if length(filtered_statuses) > 0 do
# get statuses that are eligible (i.e. users don't have #nobot in their profile) and have mentions
interactions = Map.merge(interactions, statuses_to_interactions(filtered_statuses))
statuses_seen = statuses_seen + length(filtered_statuses)
status_datetime_threshold =
NaiveDateTime.utc_now()
|> NaiveDateTime.add(get_config(:status_age_limit_days) * 24 * 3600 * -1, :second)
oldest_status = Enum.at(filtered_statuses, -1)
oldest_status_datetime =
oldest_status
|> (fn s -> s["created_at"] end).()
|> NaiveDateTime.from_iso8601!()
if NaiveDateTime.compare(oldest_status_datetime, status_datetime_threshold) == :gt and
statuses_seen < get_config(:status_count_limit) and
length(filtered_statuses) == length(statuses) do
get_interactions(domain, oldest_status["id"], min_timestamp, interactions, statuses_seen)
else
{interactions, statuses_seen}
end
else
{interactions, statuses_seen}
end
end
# To check if the endpoint works as expected
@spec has_title?(String.t()) :: boolean
defp has_title?(body) do
case Jason.decode(body) do
{:ok, decoded} -> Map.has_key?(decoded, "title")
{:error, _error} -> false
end
end
# Checks whether the status contains one or more mentions
defp is_mention?(status) do
case status["mentions"] do
[] -> false
nil -> false
_ -> true
end
end
# Checks if the author of the status has "nobot" in their profile
defp has_nobot?(status) do
account = status["account"]
fields =
account["fields"]
|> Enum.map(fn %{"name" => name, "value" => value} -> name <> value end)
|> Enum.join("")
# this also means that any users who mentioned ethnobotany in their profiles will be excluded lol ¯\_(ツ)_/¯
(account["note"] <> fields)
|> String.downcase()
|> String.contains?("nobot")
end
# This checks if the status
# a) contains one or more mentions, and
# b) that the person posting doesn't have "nobot" in their profile
defp is_eligible?(status) do
is_mention?(status) and not has_nobot?(status)
end
@spec extract_mentions_from_status(any()) :: ApiCrawler.instance_interactions()
defp extract_mentions_from_status(status) do
status["mentions"]
|> Enum.map(fn mention -> get_domain(mention["url"]) end)
|> Enum.reduce(%{}, fn domain, acc ->
Map.update(acc, domain, 1, &(&1 + 1))
end)
end
@spec statuses_to_interactions(any()) :: ApiCrawler.instance_interactions()
defp statuses_to_interactions(statuses) do
statuses
|> Enum.filter(fn status -> is_eligible?(status) end)
|> Enum.map(fn status -> extract_mentions_from_status(status) end)
|> Enum.reduce(%{}, fn map, acc ->
Map.merge(acc, map)
end)
end
end

View file

@ -0,0 +1,84 @@
defmodule Backend.Crawler.StaleInstanceManager do
use GenServer
alias Backend.{Crawl, Instance, Repo}
import Ecto.Query
import Backend.Util
require Logger
@moduledoc """
This module regularly finds stale instances (i.e. instances that haven't been updated for longer than the crawl
interval) and adds them to the job queue. It runs once a minute.
"""
def start_link(_opts) do
GenServer.start_link(__MODULE__, [], name: __MODULE__)
end
@impl true
def init(_opts) do
instance_count =
Instance
|> where([i], not is_nil(i.version))
|> select([i], count(i.domain))
|> Repo.one()
case instance_count do
# Add m.s. as the seed and schedule the next add
0 ->
add_to_queue("mastodon.social")
schedule_add()
# Start immediately
_ ->
Process.send(self(), :queue_stale_domains, [])
end
{:ok, []}
end
@impl true
def handle_info(:queue_stale_domains, state) do
queue_stale_domains()
schedule_add()
{:noreply, state}
end
defp schedule_add() do
Process.send_after(self(), :queue_stale_domains, 60_000)
end
defp queue_stale_domains() do
interval = -1 * get_config(:crawl_interval_mins)
# Get domains that have never been crawled and where the last crawl is past the threshold
crawls_subquery =
Crawl
|> select([c], %{
instance_domain: c.instance_domain,
most_recent_crawl: max(c.inserted_at),
crawl_count: count(c.id)
})
|> where([c], is_nil(c.error))
|> group_by([c], c.instance_domain)
stale_domains =
Instance
|> join(:left, [i], c in subquery(crawls_subquery), on: i.domain == c.instance_domain)
|> where(
[i, c],
c.most_recent_crawl < datetime_add(^NaiveDateTime.utc_now(), ^interval, "minute") or
is_nil(c.crawl_count)
)
|> select([i], i.domain)
|> Repo.all()
Logger.debug("Adding #{length(stale_domains)} stale domains to queue.")
stale_domains
|> Enum.each(fn domain -> add_to_queue(domain) end)
end
defp add_to_queue(domain) do
{:run, [domain]} |> Honeydew.async(:crawl_queue)
end
end

View file

@ -0,0 +1,63 @@
defmodule Backend.Crawler.Util do
require Logger
import Backend.Util
# Gets the domain from a Mastodon/Pleroma account URL
# (e.g. https://mastodon.social/@demouser or https://pleroma.site/users/demouser)
@spec get_domain(String.t()) :: String.t()
def get_domain(url) do
String.slice(url, 8..-1)
|> String.split("/")
|> Enum.at(0)
end
@spec is_http_200?(HTTPoison.Response.t()) :: boolean
def is_http_200?(%{status_code: 200}) do
true
end
def is_http_200?(_) do
false
end
@spec is_after?(String.t(), NaiveDateTime.t() | nil) :: boolean()
def is_after?(timestamp, threshold) do
if threshold == nil do
true
else
timestamp
|> NaiveDateTime.from_iso8601!()
# :second is the granularity used in the database
|> NaiveDateTime.truncate(:second)
|> NaiveDateTime.compare(threshold)
|> Kernel.===(:gt)
end
end
def get(url) do
# TODO: add version number to user agent?
HTTPoison.get(url, [{"User-Agent", get_config(:user_agent)}],
hackney: [pool: :crawler],
recv_timeout: 15000,
timeout: 15000
)
end
@spec get!(binary) :: %{
:__struct__ => HTTPoison.AsyncResponse | HTTPoison.Response,
optional(:body) => any,
optional(:headers) => [any],
optional(:id) => reference,
optional(:request) => HTTPoison.Request.t(),
optional(:request_url) => any,
optional(:status_code) => integer
}
def get!(url) do
# TODO: add version number to user agent?
HTTPoison.get!(url, [{"User-Agent", get_config(:user_agent)}],
hackney: [pool: :crawler],
recv_timeout: 15000,
timeout: 15000
)
end
end

View file

@ -0,0 +1,25 @@
defmodule Backend.Edge do
use Ecto.Schema
import Ecto.Changeset
schema "edges" do
belongs_to :source, Backend.Instance,
references: :domain,
type: :string,
foreign_key: :source_domain
belongs_to :target, Backend.Instance,
references: :domain,
type: :string,
foreign_key: :target_domain
timestamps()
end
@doc false
def changeset(edge, attrs) do
edge
|> cast(attrs, [:source, :target])
|> validate_required([:source, :target])
end
end

View file

@ -0,0 +1,41 @@
defmodule Backend.Instance do
use Ecto.Schema
import Ecto.Changeset
schema "instances" do
field :domain, :string
field :description, :string
field :user_count, :integer
field :status_count, :integer
field :version, :string
field :insularity, :float
many_to_many :peers, Backend.Instance,
join_through: Backend.InstancePeer,
join_keys: [source_domain: :domain, target_domain: :domain]
# This may look like it's duplicating :peers above, but it allows us to insert peer relationships quickly.
# https://stackoverflow.com/a/56764241/3697202
has_many :instance_peers, Backend.InstancePeer,
foreign_key: :source_domain,
references: :domain
timestamps()
end
@doc false
def changeset(instance, attrs) do
instance
|> cast(attrs, [
:domain,
:description,
:user_count,
:status_count,
:version,
:insularity,
:updated_at
])
|> validate_required([:domain])
|> put_assoc(:peers, attrs.peers)
end
end

View file

@ -0,0 +1,27 @@
defmodule Backend.InstancePeer do
use Ecto.Schema
import Ecto.Changeset
schema "instance_peers" do
belongs_to :source, Backend.Instance,
references: :domain,
type: :string,
foreign_key: :source_domain
belongs_to :target, Backend.Instance,
references: :domain,
type: :string,
foreign_key: :target_domain
field :weight, :float, default: 0.0
timestamps()
end
@doc false
def changeset(instance_peer, attrs) do
instance_peer
|> cast(attrs, [])
|> validate_required([])
end
end

View file

@ -0,0 +1,18 @@
defmodule Backend.Release do
@app :backend
def migrate do
for repo <- repos() do
{:ok, _, _} = Ecto.Migrator.with_repo(repo, &Ecto.Migrator.run(&1, :up, all: true))
end
end
def rollback(repo, version) do
{:ok, _, _} = Ecto.Migrator.with_repo(repo, &Ecto.Migrator.run(&1, :down, to: version))
end
defp repos do
Application.load(@app)
Application.fetch_env!(@app, :ecto_repos)
end
end

View file

@ -0,0 +1,5 @@
defmodule Backend.Repo do
use Ecto.Repo,
otp_app: :backend,
adapter: Ecto.Adapters.Postgres
end

View file

@ -0,0 +1,116 @@
defmodule Backend.Scheduler do
@moduledoc """
This module runs recurring tasks.
"""
use Quantum.Scheduler, otp_app: :backend
alias Backend.{Crawl, Edge, Interaction, Instance, Repo}
import Ecto.Query
require Logger
@doc """
Prunes all crawls that are more than `integer` `unit`s old.
For example, to delete crawls older than one month, call `prune_crawls(1, "month")`.
`unit` must be singular, e.g. "second", "minute", "hour", "month", "year", etc...
"""
@spec prune_crawls(integer, String.t()) :: any
def prune_crawls(amount, unit) do
{deleted_num, _} =
Crawl
|> where(
[i],
i.inserted_at <
datetime_add(^NaiveDateTime.utc_now(), -1 * ^amount, ^unit)
)
|> Repo.delete_all()
Logger.info("Pruned #{deleted_num} old crawls.")
end
@doc """
This function aggregates statistics from the interactions in the database.
It calculates the strength of edges between nodes.
TODO: generate edge weights. The weight of an edge between two instances will be
(number of mentions of each other) / (total number of statuses crawled).
This requires us to keep track of how many statuses we've seen.
"""
def generate_edges() do
interactions =
Interaction
|> select([inter], {inter.source_domain, inter.target_domain})
|> join(:left, [inter], i_source in Instance, on: inter.source_domain == i_source.domain)
|> join(:left, [inter], i_target in Instance, on: inter.target_domain == i_target.domain)
|> where(
[inter, i_source, i_target],
not is_nil(i_source.last_crawl_timestamp) and not is_nil(i_target.last_crawl_timestamp)
)
# Repo.all() returns a list of tuples like {"mastodon.social", "cursed.technology"}
|> Repo.all()
# Create a map of %{source_domain => [target_domains]}
|> Enum.group_by(fn tuple -> Kernel.elem(tuple, 0) end, fn tuple ->
Kernel.elem(tuple, 1)
end)
# Calculate insularity score
Repo.transaction(fn ->
interactions
|> Enum.each(fn {source, targets} ->
total_mentions = length(targets)
self_mentions = Enum.count(targets, fn t -> t == source end)
insularity = self_mentions / total_mentions
Repo.insert!(
%Instance{
domain: source,
insularity: insularity
},
on_conflict: [set: [insularity: insularity]],
conflict_target: :domain
)
end)
# Get edges (domains are sorted so each undirected edge is stored only once)
edges =
interactions
|> Enum.flat_map(fn {source, targets} ->
targets
|> Enum.map(fn target ->
[key_a, key_b] = Enum.sort([source, target])
%Edge{
source_domain: key_a,
target_domain: key_b
}
end)
end)
|> MapSet.new()
Logger.debug(inspect(edges))
now = NaiveDateTime.truncate(NaiveDateTime.utc_now(), :second)
Repo.delete_all(Edge)
edges =
edges
|> MapSet.to_list()
|> Enum.map(fn %{source_domain: source_domain, target_domain: target_domain} ->
%{
source_domain: source_domain,
target_domain: target_domain,
updated_at: now,
inserted_at: now
}
end)
Repo.insert_all(Edge, edges)
end)
end
end

129
backend/lib/backend/util.ex Normal file
View file

@ -0,0 +1,129 @@
defmodule Backend.Util do
import Ecto.Query
alias Backend.{Crawl, Repo}
@doc """
Returns the given key from :backend, :crawler in the config.
"""
@spec get_config(atom) :: any
def get_config(key) do
Application.get_env(:backend, :crawler)[key]
end
@doc """
Takes two lists and returns a list of the union thereof (without duplicates).
"""
def list_union(list_one, list_two) do
list_one
|> MapSet.new()
|> (fn set -> MapSet.union(set, MapSet.new(list_two)) end).()
|> MapSet.to_list()
end
@doc """
Returns `true` if `domain` ends with a blacklisted domain.
If e.g. "masto.host" is blacklisted, allof its subdomains will return `true`.
"""
@spec is_blacklisted?(String.t()) :: boolean
def is_blacklisted?(domain) do
blacklist =
case get_config(:blacklist) do
nil -> []
_ -> get_config(:blacklist)
end
blacklist
|> Enum.any?(fn blacklisted_domain ->
String.ends_with?(domain, blacklisted_domain)
end)
end
@doc """
Returns the key to use for non-directed edges
(really, just the two domains sorted alphabetically)
"""
@spec get_interaction_key(String.t(), String.t()) :: String.t()
def get_interaction_key(source, target) do
[source, target]
|> Enum.sort()
|> List.to_tuple()
end
@doc """
Gets the current UTC time as a NaiveDateTime in a format that can be inserted into the database.
"""
def get_now() do
NaiveDateTime.truncate(NaiveDateTime.utc_now(), :second)
end
@doc """
Returns the later of two NaiveDateTimes.
"""
@spec max_datetime(NaiveDateTime.t() | nil, NaiveDateTime.t() | nil) :: NaiveDateTime.t()
def max_datetime(datetime_one, nil) do
datetime_one
end
def max_datetime(nil, datetime_two) do
datetime_two
end
def max_datetime(datetime_one, datetime_two) do
case NaiveDateTime.compare(datetime_one, datetime_two) do
:gt -> datetime_one
_ -> datetime_two
end
end
@spec get_last_crawl(String.t()) :: Crawl.t() | nil
def get_last_crawl(domain) do
crawls =
Crawl
|> select([c], c)
|> where([c], c.instance_domain == ^domain)
|> order_by(desc: :id)
|> limit(1)
|> Repo.all()
case length(crawls) do
1 -> hd(crawls)
0 -> nil
end
end
@spec get_last_successful_crawl(String.t()) :: Crawl.t() | nil
def get_last_successful_crawl(domain) do
crawls =
Crawl
|> select([c], c)
|> where([c], is_nil(c.error) and c.instance_domain == ^domain)
|> order_by(desc: :id)
|> limit(1)
|> Repo.all()
case length(crawls) do
1 -> hd(crawls)
0 -> nil
end
end
@spec get_last_successful_crawl_timestamp(String.t()) :: NaiveDateTime.t() | nil
def get_last_successful_crawl_timestamp(domain) do
crawl = get_last_successful_crawl(domain)
case crawl do
nil -> nil
_ -> crawl.inserted_at
end
end
@doc """
Takes two maps with numeric values and merges them, adding the values of duplicate keys.
"""
def merge_count_maps(map1, map2) do
map1
|> Enum.reduce(map2, fn {key, val}, acc ->
Map.update(acc, key, val, &(&1 + val))
end)
end
end
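A hypothetical iex session for the pure helpers above (results are illustrative; the ordering returned by `list_union/2` is not guaranteed):

    iex> Backend.Util.list_union(["a.social", "b.social"], ["b.social", "c.social"])
    ["a.social", "b.social", "c.social"]
    iex> Backend.Util.merge_count_maps(%{"a.social" => 2}, %{"a.social" => 1, "b.social" => 3})
    %{"a.social" => 3, "b.social" => 3}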

View file

@ -0,0 +1,66 @@
defmodule BackendWeb do
@moduledoc """
The entrypoint for defining your web interface, such
as controllers, views, channels and so on.
This can be used in your application as:
use BackendWeb, :controller
use BackendWeb, :view
The definitions below will be executed for every view,
controller, etc, so keep them short and clean, focused
on imports, uses and aliases.
Do NOT define functions inside the quoted expressions
below. Instead, define any helper function in modules
and import those modules here.
"""
def controller do
quote do
use Phoenix.Controller, namespace: BackendWeb
import Plug.Conn
import BackendWeb.Gettext
alias BackendWeb.Router.Helpers, as: Routes
end
end
def view do
quote do
use Phoenix.View,
root: "lib/backend_web/templates",
namespace: BackendWeb
# Import convenience functions from controllers
import Phoenix.Controller, only: [get_flash: 1, get_flash: 2, view_module: 1]
import BackendWeb.ErrorHelpers
import BackendWeb.Gettext
alias BackendWeb.Router.Helpers, as: Routes
end
end
def router do
quote do
use Phoenix.Router
import Plug.Conn
import Phoenix.Controller
end
end
def channel do
quote do
use Phoenix.Channel
import BackendWeb.Gettext
end
end
@doc """
When used, dispatch to the appropriate controller/view/etc.
"""
defmacro __using__(which) when is_atom(which) do
apply(__MODULE__, which, [])
end
end

View file

@ -0,0 +1,33 @@
defmodule BackendWeb.UserSocket do
use Phoenix.Socket
## Channels
# channel "room:*", BackendWeb.RoomChannel
# Socket params are passed from the client and can
# be used to verify and authenticate a user. After
# verification, you can put default assigns into
# the socket that will be set for all channels, ie
#
# {:ok, assign(socket, :user_id, verified_user_id)}
#
# To deny connection, return `:error`.
#
# See `Phoenix.Token` documentation for examples in
# performing token verification on connect.
def connect(_params, socket, _connect_info) do
{:ok, socket}
end
# Socket id's are topics that allow you to identify all sockets for a given user:
#
# def id(socket), do: "user_socket:#{socket.assigns.user_id}"
#
# Would allow you to broadcast a "disconnect" event and terminate
# all active sockets and channels for a given user:
#
# BackendWeb.Endpoint.broadcast("user_socket:#{user.id}", "disconnect", %{})
#
# Returning `nil` makes this socket anonymous.
def id(_socket), do: nil
end

View file

@ -0,0 +1,15 @@
defmodule BackendWeb.FallbackController do
@moduledoc """
Translates controller action results into valid `Plug.Conn` responses.
See `Phoenix.Controller.action_fallback/1` for more details.
"""
use BackendWeb, :controller
def call(conn, {:error, :not_found}) do
conn
|> put_status(:not_found)
|> put_view(BackendWeb.ErrorView)
|> render(:"404")
end
end

View file

@ -0,0 +1,13 @@
defmodule BackendWeb.GraphController do
use BackendWeb, :controller
alias Backend.Api
action_fallback BackendWeb.FallbackController
def index(conn, _params) do
nodes = Api.list_nodes()
edges = Api.list_edges()
render(conn, "index.json", nodes: nodes, edges: edges)
end
end

View file

@ -0,0 +1,27 @@
defmodule BackendWeb.InstanceController do
use BackendWeb, :controller
import Backend.Util
alias Backend.Api
action_fallback BackendWeb.FallbackController
def index(conn, _params) do
instances = Api.list_instances()
render(conn, "index.json", instances: instances)
end
def show(conn, %{"id" => domain}) do
instance = Api.get_instance!(domain)
last_crawl = get_last_crawl(domain)
render(conn, "show.json", instance: instance, crawl: last_crawl)
end
# def update(conn, %{"id" => id, "instance" => instance_params}) do
# instance = Api.get_instance!(id)
# with {:ok, %Instance{} = instance} <- Api.update_instance(instance, instance_params) do
# render(conn, "show.json", instance: instance)
# end
# end
end

View file

@ -0,0 +1,51 @@
defmodule BackendWeb.Endpoint do
use Phoenix.Endpoint, otp_app: :backend
socket("/socket", BackendWeb.UserSocket,
websocket: true,
longpoll: false
)
# Serve at "/" the static files from "priv/static" directory.
#
# You should set gzip to true if you are running phx.digest
# when deploying your static files in production.
plug(Plug.Static,
at: "/",
from: :backend,
gzip: false,
only: ~w(css fonts images js favicon.ico robots.txt)
)
# Code reloading can be explicitly enabled under the
# :code_reloader configuration of your endpoint.
if code_reloading? do
plug(Phoenix.CodeReloader)
end
plug(Plug.RequestId)
plug(Plug.Logger)
plug(Plug.Parsers,
parsers: [:urlencoded, :multipart, :json],
pass: ["*/*"],
json_decoder: Phoenix.json_library()
)
plug(Plug.MethodOverride)
plug(Plug.Head)
# The session will be stored in the cookie and signed,
# this means its contents can be read but not tampered with.
# Set :encryption_salt if you would also like to encrypt it.
plug(Plug.Session,
store: :cookie,
key: "_backend_key",
signing_salt: "HJa1j4FI"
)
# TODO
plug(Corsica, origins: "*")
plug(BackendWeb.Router)
end

View file

@ -0,0 +1,24 @@
defmodule BackendWeb.Gettext do
@moduledoc """
A module providing Internationalization with a gettext-based API.
By using [Gettext](https://hexdocs.pm/gettext),
your module gains a set of macros for translations, for example:
import BackendWeb.Gettext
# Simple translation
gettext("Here is the string to translate")
# Plural translation
ngettext("Here is the string to translate",
"Here are the strings to translate",
3)
# Domain-based translation
dgettext("errors", "Here is the error message to translate")
See the [Gettext Docs](https://hexdocs.pm/gettext) for detailed usage.
"""
use Gettext, otp_app: :backend
end

View file

@ -0,0 +1,14 @@
defmodule BackendWeb.Router do
use BackendWeb, :router
pipeline :api do
plug :accepts, ["json"]
end
scope "/api", BackendWeb do
pipe_through :api
resources "/instances", InstanceController, only: [:index, :show]
resources "/graph", GraphController, only: [:index]
end
end
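Given the two resources above and the JSON views later in this commit, the API can be exercised roughly like this against a dev server (port 4000 per config/dev.exs; the domain and elided fields are illustrative):

    curl http://localhost:4000/api/instances
    # [{"name":"mastodon.social"}, ...]
    curl http://localhost:4000/api/instances/mastodon.social
    # {"name":"mastodon.social","description":...,"peers":[...],"status":"success",...}
    curl http://localhost:4000/api/graph
    # {"nodes":[{"id":...,"label":...,"size":...,"x":...,"y":...}],"edges":[{"id":...,"source":...,"target":...,"size":...}]}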

View file

@ -0,0 +1,19 @@
defmodule BackendWeb.ChangesetView do
use BackendWeb, :view
@doc """
Traverses and translates changeset errors.
See `Ecto.Changeset.traverse_errors/2` and
`BackendWeb.ErrorHelpers.translate_error/1` for more details.
"""
def translate_errors(changeset) do
Ecto.Changeset.traverse_errors(changeset, &translate_error/1)
end
def render("error.json", %{changeset: changeset}) do
# When encoded, the changeset returns its errors
# as a JSON object. So we just pass it forward.
%{errors: translate_errors(changeset)}
end
end

View file

@ -0,0 +1,33 @@
defmodule BackendWeb.ErrorHelpers do
@moduledoc """
Conveniences for translating and building error messages.
"""
@doc """
Translates an error message using gettext.
"""
def translate_error({msg, opts}) do
# When using gettext, we typically pass the strings we want
# to translate as a static argument:
#
# # Translate "is invalid" in the "errors" domain
# dgettext("errors", "is invalid")
#
# # Translate the number of files with plural rules
# dngettext("errors", "1 file", "%{count} files", count)
#
# Because the error messages we show in our forms and APIs
# are defined inside Ecto, we need to translate them dynamically.
# This requires us to call the Gettext module passing our gettext
# backend as first argument.
#
# Note we use the "errors" domain, which means translations
# should be written to the errors.po file. The :count option is
# set by Ecto and indicates we should also apply plural rules.
if count = opts[:count] do
Gettext.dngettext(BackendWeb.Gettext, "errors", msg, msg, count, opts)
else
Gettext.dgettext(BackendWeb.Gettext, "errors", msg, opts)
end
end
end

View file

@ -0,0 +1,16 @@
defmodule BackendWeb.ErrorView do
use BackendWeb, :view
# If you want to customize a particular status code
# for a certain format, you may uncomment below.
# def render("500.json", _assigns) do
# %{errors: %{detail: "Internal Server Error"}}
# end
# By default, Phoenix returns the status message from
# the template name. For example, "404.json" becomes
# "Not Found".
def template_not_found(template, _assigns) do
%{errors: %{detail: Phoenix.Controller.status_message_from_template(template)}}
end
end

View file

@ -0,0 +1,36 @@
defmodule BackendWeb.GraphView do
use BackendWeb, :view
alias BackendWeb.GraphView
def render("index.json", %{nodes: nodes, edges: edges}) do
%{
nodes: render_many(nodes, GraphView, "node.json"),
edges: render_many(edges, GraphView, "edge.json")
}
end
def render("node.json", %{graph: node}) do
size =
case node.user_count > 1 do
true -> :math.log(node.user_count)
false -> 1
end
%{
id: node.domain,
label: node.domain,
size: size,
x: node.x,
y: node.y
}
end
def render("edge.json", %{graph: edge}) do
%{
id: edge.id,
source: edge.source_domain,
target: edge.target_domain,
size: edge.weight
}
end
end

View file

@ -0,0 +1,45 @@
defmodule BackendWeb.InstanceView do
use BackendWeb, :view
alias BackendWeb.InstanceView
require Logger
def render("index.json", %{instances: instances}) do
render_many(instances, InstanceView, "instance.json")
end
def render("show.json", %{instance: instance, crawl: crawl}) do
render_one(instance, InstanceView, "instance_detail.json", crawl: crawl)
end
def render("instance.json", %{instance: instance}) do
%{name: instance.domain}
end
def render("instance_detail.json", %{instance: instance, crawl: crawl}) do
Logger.info("keys: #{inspect(instance)}")
[status, last_updated] =
case crawl do
nil ->
["not crawled", nil]
_ ->
case crawl.error do
nil -> ["success", crawl.inserted_at]
err -> [err, crawl.inserted_at]
end
end
%{
name: instance.domain,
description: instance.description,
version: instance.version,
userCount: instance.user_count,
statusCount: instance.status_count,
domainCount: length(instance.peers),
peers: render_many(instance.peers, InstanceView, "instance.json"),
lastUpdated: last_updated,
status: status
}
end
end

View file

@ -0,0 +1,13 @@
defmodule Mix.Tasks.Crawl do
alias Backend.Crawler
use Mix.Task
@shortdoc "Crawl a given instance."
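# Example invocation (any reachable instance domain works): mix crawl mastodon.social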
def run(domain) do
Mix.Task.run("app.start")
# Application.ensure_all_started(:timex)
# Mix.Task.run("loadconfig")
Crawler.run(domain)
end
end

View file

@ -1,15 +0,0 @@
#!/usr/bin/env python
import os
import sys
if __name__ == '__main__':
try:
from django.core.management import execute_from_command_line
except ImportError as exc:
raise ImportError(
"Couldn't import Django. Are you sure it's installed and "
"available on your PYTHONPATH environment variable? Did you "
"forget to activate a virtual environment?"
) from exc
execute_from_command_line(sys.argv)

65
backend/mix.exs Normal file
View file

@ -0,0 +1,65 @@
defmodule Backend.MixProject do
use Mix.Project
def project do
[
app: :backend,
version: "2.0.0-beta.1",
elixir: "~> 1.5",
elixirc_paths: elixirc_paths(Mix.env()),
compilers: [:phoenix, :gettext] ++ Mix.compilers(),
start_permanent: Mix.env() == :prod,
aliases: aliases(),
deps: deps()
]
end
# Configuration for the OTP application.
#
# Type `mix help compile.app` for more information.
def application do
[
mod: {Backend.Application, []},
extra_applications: [:logger, :runtime_tools, :mnesia]
]
end
# Specifies which paths to compile per environment.
defp elixirc_paths(:test), do: ["lib", "test/support"]
defp elixirc_paths(_), do: ["lib"]
# Specifies your project dependencies.
#
# Type `mix help deps` for examples and options.
defp deps do
[
{:phoenix, "~> 1.4.3"},
{:phoenix_pubsub, "~> 1.1"},
{:phoenix_ecto, "~> 4.0"},
{:ecto_sql, "~> 3.0"},
{:postgrex, ">= 0.0.0"},
{:gettext, "~> 0.11"},
{:jason, "~> 1.0"},
{:plug_cowboy, "~> 2.0"},
{:httpoison, "~> 1.5"},
{:timex, "~> 3.5"},
{:honeydew, "~> 1.4.3"},
{:quantum, "~> 2.3"},
{:corsica, "~> 1.1.2"}
]
end
# Aliases are shortcuts or tasks specific to the current project.
# For example, to create, migrate and run the seeds file at once:
#
# $ mix ecto.setup
#
# See the documentation for `Mix` for more info on aliases.
defp aliases do
[
"ecto.setup": ["ecto.create", "ecto.migrate", "run priv/repo/seeds.exs"],
"ecto.reset": ["ecto.drop", "ecto.setup"],
test: ["ecto.create --quiet", "ecto.migrate", "test"]
]
end
end

43
backend/mix.lock Normal file
View file

@ -0,0 +1,43 @@
%{
"artificery": {:hex, :artificery, "0.4.2", "3ded6e29e13113af52811c72f414d1e88f711410cac1b619ab3a2666bbd7efd4", [:mix], [], "hexpm"},
"certifi": {:hex, :certifi, "2.5.1", "867ce347f7c7d78563450a18a6a28a8090331e77fa02380b4a21962a65d36ee5", [:rebar3], [{:parse_trans, "~>3.3", [hex: :parse_trans, repo: "hexpm", optional: false]}], "hexpm"},
"combine": {:hex, :combine, "0.10.0", "eff8224eeb56498a2af13011d142c5e7997a80c8f5b97c499f84c841032e429f", [:mix], [], "hexpm"},
"connection": {:hex, :connection, "1.0.4", "a1cae72211f0eef17705aaededacac3eb30e6625b04a6117c1b2db6ace7d5976", [:mix], [], "hexpm"},
"corsica": {:hex, :corsica, "1.1.2", "5ad8b9dcbeeda4762d78a57c0c8c2f88e1eef8741508517c98cb79e0db1f107d", [:mix], [{:plug, "~> 1.0", [hex: :plug, repo: "hexpm", optional: false]}], "hexpm"},
"cowboy": {:hex, :cowboy, "2.6.3", "99aa50e94e685557cad82e704457336a453d4abcb77839ad22dbe71f311fcc06", [:rebar3], [{:cowlib, "~> 2.7.3", [hex: :cowlib, repo: "hexpm", optional: false]}, {:ranch, "~> 1.7.1", [hex: :ranch, repo: "hexpm", optional: false]}], "hexpm"},
"cowlib": {:hex, :cowlib, "2.7.3", "a7ffcd0917e6d50b4d5fb28e9e2085a0ceb3c97dea310505f7460ff5ed764ce9", [:rebar3], [], "hexpm"},
"crontab": {:hex, :crontab, "1.1.7", "b9219f0bdc8678b94143655a8f229716c5810c0636a4489f98c0956137e53985", [:mix], [{:ecto, "~> 1.0 or ~> 2.0 or ~> 3.0", [hex: :ecto, repo: "hexpm", optional: true]}], "hexpm"},
"db_connection": {:hex, :db_connection, "2.1.0", "122e2f62c4906bf2e49554f1e64db5030c19229aa40935f33088e7d543aa79d0", [:mix], [{:connection, "~> 1.0.2", [hex: :connection, repo: "hexpm", optional: false]}], "hexpm"},
"decimal": {:hex, :decimal, "1.8.0", "ca462e0d885f09a1c5a342dbd7c1dcf27ea63548c65a65e67334f4b61803822e", [:mix], [], "hexpm"},
"distillery": {:hex, :distillery, "2.1.1", "f9332afc2eec8a1a2b86f22429e068ef35f84a93ea1718265e740d90dd367814", [:mix], [{:artificery, "~> 0.2", [hex: :artificery, repo: "hexpm", optional: false]}], "hexpm"},
"ecto": {:hex, :ecto, "3.1.7", "fa21d06ef56cdc2fdaa62574e8c3ba34a2751d44ea34c30bc65f0728421043e5", [:mix], [{:decimal, "~> 1.6", [hex: :decimal, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: true]}], "hexpm"},
"ecto_sql": {:hex, :ecto_sql, "3.1.6", "1e80e30d16138a729c717f73dcb938590bcdb3a4502f3012414d0cbb261045d8", [:mix], [{:db_connection, "~> 2.0", [hex: :db_connection, repo: "hexpm", optional: false]}, {:ecto, "~> 3.1.0", [hex: :ecto, repo: "hexpm", optional: false]}, {:mariaex, "~> 0.9.1", [hex: :mariaex, repo: "hexpm", optional: true]}, {:myxql, "~> 0.2.0", [hex: :myxql, repo: "hexpm", optional: true]}, {:postgrex, "~> 0.14.0 or ~> 0.15.0", [hex: :postgrex, repo: "hexpm", optional: true]}, {:telemetry, "~> 0.4.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm"},
"gen_stage": {:hex, :gen_stage, "0.14.2", "6a2a578a510c5bfca8a45e6b27552f613b41cf584b58210f017088d3d17d0b14", [:mix], [], "hexpm"},
"gen_state_machine": {:hex, :gen_state_machine, "2.0.5", "9ac15ec6e66acac994cc442dcc2c6f9796cf380ec4b08267223014be1c728a95", [:mix], [], "hexpm"},
"gettext": {:hex, :gettext, "0.17.0", "abe21542c831887a2b16f4c94556db9c421ab301aee417b7c4fbde7fbdbe01ec", [:mix], [], "hexpm"},
"hackney": {:hex, :hackney, "1.15.1", "9f8f471c844b8ce395f7b6d8398139e26ddca9ebc171a8b91342ee15a19963f4", [:rebar3], [{:certifi, "2.5.1", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "6.0.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "1.0.1", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~>1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "1.1.4", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}], "hexpm"},
"honeydew": {:hex, :honeydew, "1.4.3", "f2d976aaf8b9b914a635d2d483f1a71d2f6d8651809474dd5db581953cbebb30", [:mix], [{:ecto, "~> 3.0", [hex: :ecto, repo: "hexpm", optional: true]}], "hexpm"},
"httpoison": {:hex, :httpoison, "1.5.1", "0f55b5b673b03c5c327dac7015a67cb571b99b631acc0bc1b0b98dcd6b9f2104", [:mix], [{:hackney, "~> 1.8", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm"},
"idna": {:hex, :idna, "6.0.0", "689c46cbcdf3524c44d5f3dde8001f364cd7608a99556d8fbd8239a5798d4c10", [:rebar3], [{:unicode_util_compat, "0.4.1", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm"},
"jason": {:hex, :jason, "1.1.2", "b03dedea67a99223a2eaf9f1264ce37154564de899fd3d8b9a21b1a6fd64afe7", [:mix], [{:decimal, "~> 1.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm"},
"libring": {:hex, :libring, "1.4.0", "41246ba2f3fbc76b3971f6bce83119dfec1eee17e977a48d8a9cfaaf58c2a8d6", [:mix], [], "hexpm"},
"metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm"},
"mime": {:hex, :mime, "1.3.1", "30ce04ab3175b6ad0bdce0035cba77bba68b813d523d1aac73d9781b4d193cf8", [:mix], [], "hexpm"},
"mimerl": {:hex, :mimerl, "1.2.0", "67e2d3f571088d5cfd3e550c383094b47159f3eee8ffa08e64106cdf5e981be3", [:rebar3], [], "hexpm"},
"parse_trans": {:hex, :parse_trans, "3.3.0", "09765507a3c7590a784615cfd421d101aec25098d50b89d7aa1d66646bc571c1", [:rebar3], [], "hexpm"},
"phoenix": {:hex, :phoenix, "1.4.9", "746d098e10741c334d88143d3c94cab1756435f94387a63441792e66ec0ee974", [:mix], [{:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: true]}, {:phoenix_pubsub, "~> 1.1", [hex: :phoenix_pubsub, repo: "hexpm", optional: false]}, {:plug, "~> 1.8.1 or ~> 1.9", [hex: :plug, repo: "hexpm", optional: false]}, {:plug_cowboy, "~> 1.0 or ~> 2.0", [hex: :plug_cowboy, repo: "hexpm", optional: true]}, {:telemetry, "~> 0.4", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm"},
"phoenix_ecto": {:hex, :phoenix_ecto, "4.0.0", "c43117a136e7399ea04ecaac73f8f23ee0ffe3e07acfcb8062fe5f4c9f0f6531", [:mix], [{:ecto, "~> 3.0", [hex: :ecto, repo: "hexpm", optional: false]}, {:phoenix_html, "~> 2.9", [hex: :phoenix_html, repo: "hexpm", optional: true]}, {:plug, "~> 1.0", [hex: :plug, repo: "hexpm", optional: false]}], "hexpm"},
"phoenix_pubsub": {:hex, :phoenix_pubsub, "1.1.2", "496c303bdf1b2e98a9d26e89af5bba3ab487ba3a3735f74bf1f4064d2a845a3e", [:mix], [], "hexpm"},
"plug": {:hex, :plug, "1.8.2", "0bcce1daa420f189a6491f3940cc77ea7fb1919761175c9c3b59800d897440fc", [:mix], [{:mime, "~> 1.0", [hex: :mime, repo: "hexpm", optional: false]}, {:plug_crypto, "~> 1.0", [hex: :plug_crypto, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4", [hex: :telemetry, repo: "hexpm", optional: true]}], "hexpm"},
"plug_cowboy": {:hex, :plug_cowboy, "2.1.0", "b75768153c3a8a9e8039d4b25bb9b14efbc58e9c4a6e6a270abff1cd30cbe320", [:mix], [{:cowboy, "~> 2.5", [hex: :cowboy, repo: "hexpm", optional: false]}, {:plug, "~> 1.7", [hex: :plug, repo: "hexpm", optional: false]}], "hexpm"},
"plug_crypto": {:hex, :plug_crypto, "1.0.0", "18e49317d3fa343f24620ed22795ec29d4a5e602d52d1513ccea0b07d8ea7d4d", [:mix], [], "hexpm"},
"postgrex": {:hex, :postgrex, "0.14.3", "5754dee2fdf6e9e508cbf49ab138df964278700b764177e8f3871e658b345a1e", [:mix], [{:connection, "~> 1.0", [hex: :connection, repo: "hexpm", optional: false]}, {:db_connection, "~> 2.0", [hex: :db_connection, repo: "hexpm", optional: false]}, {:decimal, "~> 1.5", [hex: :decimal, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: true]}], "hexpm"},
"quantum": {:hex, :quantum, "2.3.4", "72a0e8855e2adc101459eac8454787cb74ab4169de6ca50f670e72142d4960e9", [:mix], [{:calendar, "~> 0.17", [hex: :calendar, repo: "hexpm", optional: true]}, {:crontab, "~> 1.1", [hex: :crontab, repo: "hexpm", optional: false]}, {:gen_stage, "~> 0.12", [hex: :gen_stage, repo: "hexpm", optional: false]}, {:swarm, "~> 3.3", [hex: :swarm, repo: "hexpm", optional: false]}, {:timex, "~> 3.1", [hex: :timex, repo: "hexpm", optional: true]}], "hexpm"},
"ranch": {:hex, :ranch, "1.7.1", "6b1fab51b49196860b733a49c07604465a47bdb78aa10c1c16a3d199f7f8c881", [:rebar3], [], "hexpm"},
"ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.4", "f0eafff810d2041e93f915ef59899c923f4568f4585904d010387ed74988e77b", [:make, :mix, :rebar3], [], "hexpm"},
"swarm": {:hex, :swarm, "3.4.0", "64f8b30055d74640d2186c66354b33b999438692a91be275bb89cdc7e401f448", [:mix], [{:gen_state_machine, "~> 2.0", [hex: :gen_state_machine, repo: "hexpm", optional: false]}, {:libring, "~> 1.0", [hex: :libring, repo: "hexpm", optional: false]}], "hexpm"},
"telemetry": {:hex, :telemetry, "0.4.0", "8339bee3fa8b91cb84d14c2935f8ecf399ccd87301ad6da6b71c09553834b2ab", [:rebar3], [], "hexpm"},
"timex": {:hex, :timex, "3.6.1", "efdf56d0e67a6b956cc57774353b0329c8ab7726766a11547e529357ffdc1d56", [:mix], [{:combine, "~> 0.10", [hex: :combine, repo: "hexpm", optional: false]}, {:gettext, "~> 0.10", [hex: :gettext, repo: "hexpm", optional: false]}, {:tzdata, "~> 0.1.8 or ~> 0.5 or ~> 1.0.0", [hex: :tzdata, repo: "hexpm", optional: false]}], "hexpm"},
"tzdata": {:hex, :tzdata, "1.0.1", "f6027a331af7d837471248e62733c6ebee86a72e57c613aa071ebb1f750fc71a", [:mix], [{:hackney, "~> 1.0", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm"},
"unicode_util_compat": {:hex, :unicode_util_compat, "0.4.1", "d869e4c68901dd9531385bb0c8c40444ebf624e60b6962d95952775cac5e90cd", [:rebar3], [], "hexpm"},
}

View file

@ -0,0 +1,97 @@
## `msgid`s in this file come from POT (.pot) files.
##
## Do not add, change, or remove `msgid`s manually here as
## they're tied to the ones in the corresponding POT file
## (with the same domain).
##
## Use `mix gettext.extract --merge` or `mix gettext.merge`
## to merge POT files into PO files.
msgid ""
msgstr ""
"Language: en\n"
## From Ecto.Changeset.cast/4
msgid "can't be blank"
msgstr ""
## From Ecto.Changeset.unique_constraint/3
msgid "has already been taken"
msgstr ""
## From Ecto.Changeset.put_change/3
msgid "is invalid"
msgstr ""
## From Ecto.Changeset.validate_acceptance/3
msgid "must be accepted"
msgstr ""
## From Ecto.Changeset.validate_format/3
msgid "has invalid format"
msgstr ""
## From Ecto.Changeset.validate_subset/3
msgid "has an invalid entry"
msgstr ""
## From Ecto.Changeset.validate_exclusion/3
msgid "is reserved"
msgstr ""
## From Ecto.Changeset.validate_confirmation/3
msgid "does not match confirmation"
msgstr ""
## From Ecto.Changeset.no_assoc_constraint/3
msgid "is still associated with this entry"
msgstr ""
msgid "are still associated with this entry"
msgstr ""
## From Ecto.Changeset.validate_length/3
msgid "should be %{count} character(s)"
msgid_plural "should be %{count} character(s)"
msgstr[0] ""
msgstr[1] ""
msgid "should have %{count} item(s)"
msgid_plural "should have %{count} item(s)"
msgstr[0] ""
msgstr[1] ""
msgid "should be at least %{count} character(s)"
msgid_plural "should be at least %{count} character(s)"
msgstr[0] ""
msgstr[1] ""
msgid "should have at least %{count} item(s)"
msgid_plural "should have at least %{count} item(s)"
msgstr[0] ""
msgstr[1] ""
msgid "should be at most %{count} character(s)"
msgid_plural "should be at most %{count} character(s)"
msgstr[0] ""
msgstr[1] ""
msgid "should have at most %{count} item(s)"
msgid_plural "should have at most %{count} item(s)"
msgstr[0] ""
msgstr[1] ""
## From Ecto.Changeset.validate_number/3
msgid "must be less than %{number}"
msgstr ""
msgid "must be greater than %{number}"
msgstr ""
msgid "must be less than or equal to %{number}"
msgstr ""
msgid "must be greater than or equal to %{number}"
msgstr ""
msgid "must be equal to %{number}"
msgstr ""

View file

@ -0,0 +1,4 @@
[
import_deps: [:ecto_sql],
inputs: ["*.exs"]
]

View file

@ -0,0 +1,29 @@
defmodule Backend.Repo.Migrations.CreateInstances do
use Ecto.Migration
def change do
create table(:instances) do
add :domain, :string, null: false
add :description, :text
add :user_count, :integer
add :status_count, :integer
add :version, :string
add :insularity, :float
timestamps()
end
create unique_index(:instances, [:domain])
create table(:instance_peers) do
add :source_domain, references(:instances, column: :domain, type: :string)
add :target_domain, references(:instances, column: :domain, type: :string)
add :weight, :float
timestamps()
end
create unique_index(:instance_peers, [:source_domain, :target_domain])
end
end

View file

@ -0,0 +1,15 @@
defmodule Backend.Repo.Migrations.CreateEdges do
use Ecto.Migration
def change do
create table(:edges) do
add :source_domain, references(:instances, column: :domain, type: :string), null: false
add :target_domain, references(:instances, column: :domain, type: :string), null: false
timestamps()
end
create index(:edges, [:source_domain])
create index(:edges, [:target_domain])
end
end

View file

@ -0,0 +1,20 @@
defmodule Backend.Repo.Migrations.CreateCrawls do
use Ecto.Migration
def change do
create table(:crawls) do
add :instance_domain, references(:instances, column: :domain, type: :string), null: false
add :statuses_seen, :integer
add :interactions_seen, :integer
add :error, :text
timestamps()
end
# TODO: does this actually make WHERE error IS NULL queries faster? if not, drop it
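# (If the hot query is `WHERE error IS NULL`, a partial index is a plausible alternative,
#  e.g. `create index(:crawls, [:inserted_at], where: "error IS NULL")` -- untested sketch.)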
create index(:crawls, [:error])
create index(:crawls, [:inserted_at])
end
end

View file

@ -0,0 +1,16 @@
defmodule Backend.Repo.Migrations.CreateCrawlInteractions do
use Ecto.Migration
def change do
create table(:crawl_interactions) do
add :crawl_id, references(:crawls, on_delete: :delete_all), null: false
add :source_domain, references(:instances, column: :domain, type: :string), null: false
add :target_domain, references(:instances, column: :domain, type: :string), null: false
add :mentions, :integer
timestamps()
end
end
end

View file

@ -0,0 +1,10 @@
defmodule Backend.Repo.Migrations.AddInstanceCoords do
use Ecto.Migration
def change do
alter table(:instances) do
add :x, :float
add :y, :float
end
end
end

View file

@ -0,0 +1,11 @@
# Script for populating the database. You can run it as:
#
# mix run priv/repo/seeds.exs
#
# Inside the script, you can read and write to any of your
# repositories directly:
#
# Backend.Repo.insert!(%Backend.SomeSchema{})
#
# We recommend using the bang functions (`insert!`, `update!`
# and so on) as they will fail if something goes wrong.
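#
# For this app that might look like the following (illustrative only -- the schema
# module name is assumed and no seeds are actually defined here):
#
#     Backend.Repo.insert!(%Backend.Api.Instance{domain: "mastodon.social"})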

View file

@ -0,0 +1 @@
+C multi_time_warp

View file

@ -1,28 +0,0 @@
autopep8==1.3.5
certifi==2018.8.24
chardet==3.0.4
dill==0.2.5
Django==2.1.7
django-bulk-update==2.2.0
django-cors-headers==2.4.0
django-letsencrypt==3.0.1
django-silk==3.0.1
djangorestframework==3.8.2
future==0.16.0
gprof2dot==2016.10.13
gunicorn==19.9.0
idna==2.7
Jinja2==2.10
MarkupSafe==1.0
psycopg2-binary==2.7.5
pycodestyle==2.4.0
PyFunctional==1.1.3
Pygments==2.2.0
python-dateutil==2.7.3
pytz==2018.5
requests==2.20.1
six==1.10.0
sqlparse==0.2.4
tabulate==0.7.7
tqdm==4.25.0
urllib3==1.23

View file

@ -1,3 +0,0 @@
from django.contrib import admin
# Register your models here.

View file

@ -1,5 +0,0 @@
from django.apps import AppConfig
class ScraperConfig(AppConfig):
name = 'scraper'

View file

@ -1,84 +0,0 @@
from datetime import datetime
LOCK_MODES = (
'ACCESS SHARE',
'ROW SHARE',
'ROW EXCLUSIVE',
'SHARE UPDATE EXCLUSIVE',
'SHARE',
'SHARE ROW EXCLUSIVE',
'EXCLUSIVE',
'ACCESS EXCLUSIVE',
)
def require_lock(model, lock):
"""
Decorator for PostgreSQL's table-level lock functionality
Example:
@transaction.commit_on_success
@require_lock(MyModel, 'ACCESS EXCLUSIVE')
def myview(request)
...
PostgreSQL's LOCK Documentation:
http://www.postgresql.org/docs/8.3/interactive/sql-lock.html
"""
def require_lock_decorator(view_func):
def wrapper(*args, **kwargs):
if lock not in LOCK_MODES:
raise ValueError('%s is not a PostgreSQL supported lock mode.' % lock)
from django.db import connection
cursor = connection.cursor()
cursor.execute(
'LOCK TABLE %s IN %s MODE' % (model._meta.db_table, lock)
)
return view_func(*args, **kwargs)
return wrapper
return require_lock_decorator
class InvalidResponseException(Exception):
"""Used for all responses other than HTTP 200"""
pass
class PersonalInstanceException(Exception):
"""
Used for instances that we don't want to scrape because there are too few users.
We don't want information on individuals, but aggregate statistics on instances and how they interact.
"""
pass
class BlacklistedDomainException(Exception):
"""
Used for instances whose domain is blacklisted.
"""
pass
def get_key(data, keys: list):
try:
val = data[keys.pop(0)]
while keys:
val = val[keys.pop(0)]
return val
except (KeyError, TypeError):
return ''
def validate_int(integer):
return integer if (isinstance(integer, int) and 0 <= integer < 2147483647) else None
def log(obj, text, success=False, error=False):
text = "{} - {}".format(datetime.now().isoformat(), text)
if success:
text = obj.style.SUCCESS(text)
if error:
obj.stderr.write(text)
else:
obj.stdout.write(text)

View file

@ -1,38 +0,0 @@
import subprocess
from django.core.management.base import BaseCommand
from django.conf import settings
from scraper.models import PeerRelationship, Edge
class Command(BaseCommand):
help = "Takes what's in the database and calls Gephi to create and layout a graph"
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def handle(self, *args, **options):
self.stdout.write("Creating Edges from PeerRelationships...")
# Turn asymmetrical PeerRelationships into symmetrical Edges
relationships = PeerRelationship.objects.filter(source__status='success', target__status='success')
# Loop over once and put 'em into a dict for fast access
relationships = {(r.source_id, r.target_id): r for r in relationships}
edges = []
while relationships:
(source_id, target_id), outgoing = relationships.popitem()
total_statuses = outgoing.statuses_seen
mention_count = outgoing.mention_count
incoming = relationships.pop((target_id, source_id), None)
oldest_data = outgoing.last_updated
if incoming:
total_statuses += (incoming.statuses_seen)
mention_count += (incoming.mention_count)
oldest_data = min(oldest_data, incoming.last_updated)
if mention_count == 0 or total_statuses == 0:
# don't add edges with weight 0
continue
ratio = float(mention_count)/total_statuses
edges.append(Edge(source_id=source_id, target_id=target_id, weight=ratio, last_updated=oldest_data))
Edge.objects.all().delete()
Edge.objects.bulk_create(edges)

View file

@ -1,276 +0,0 @@
"""
This script starts at a seed instance and loads the list of connected
peers. From there, it scrapes the peers of all instances it finds,
gradually mapping the fediverse.
"""
import json
import multiprocessing as mp
import requests
import time
import os
from dateutil.parser import parse as datetime_parser
from datetime import datetime, timedelta, timezone
from functional import seq
from django_bulk_update.helper import bulk_update
from django.core.management.base import BaseCommand
from django import db
from django.conf import settings
from django.utils import timezone
from scraper.models import Instance, PeerRelationship
from scraper.management.commands._util import require_lock, InvalidResponseException, get_key, log, validate_int, PersonalInstanceException, BlacklistedDomainException
# TODO: use the /api/v1/server/followers and /api/v1/server/following endpoints in peertube instances
SEED = 'mastodon.social'
TIMEOUT = 20 # seconds
NUM_THREADS = 16 # roughly 40MB each
PERSONAL_INSTANCE_THRESHOLD = 10 # instances with < this many users won't be crawled
MAX_STATUSES_PER_PAGE = 40
STATUS_SCRAPE_LIMIT = 5000
INSTANCE_SCRAPE_LIMIT = 50 # note: this does not include newly discovered instances! they will always be crawled.
class Command(BaseCommand):
help = "Scrapes the entire fediverse"
def add_arguments(self, parser):
# Named (optional) arguments
parser.add_argument(
'--unlimited',
action='store_true',
dest='unlimited',
help="Crawl all stale instances rather than limiting to {}".format(INSTANCE_SCRAPE_LIMIT),
)
parser.add_argument(
'--all',
action='store_true',
dest='all',
help="Crawl all instances rather than limiting to stale ones"
)
parser.add_argument(
'--verbose',
action='store_true',
dest='verbose',
help="Verbose logging"
)
parser.add_argument(
'--instance',
dest='instance',
help="Crawl a single instance"
)
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.verbose = False
self.scraped_count = 0
f = open(os.path.join(settings.BASE_DIR, '../whitelist.txt'), 'r')
self.whitelist = seq(f.readlines()).map(lambda i: i.lower().strip()).to_list()
f.close()
def get_instance_info(self, instance_name: str):
"""Collect info about instance"""
url = 'https://' + instance_name + '/api/v1/instance'
response = requests.get(url, timeout=TIMEOUT)
json = response.json()
if response.status_code != 200 or get_key(json, ['error']):
if self.verbose:
log(self, "Couldn't get instance info for {}: {}".format(instance_name, response), error=True)
raise InvalidResponseException("Could not get info for {}".format(instance_name))
return json
def get_instance_peers(self, instance_name: str):
"""Collect connected instances"""
# The peers endpoint returns a "list of all domain names known to this instance"
# (https://github.com/tootsuite/mastodon/pull/6125)
url = 'https://' + instance_name + '/api/v1/instance/peers'
response = requests.get(url, timeout=TIMEOUT)
peers = response.json()
if response.status_code != 200 or not isinstance(peers, list) or get_key(peers, ['error']):
if self.verbose:
log(self, "Couldn't get peers for {}: {}".format(instance_name, response), error=True)
raise InvalidResponseException("Could not get peers for {}".format(instance_name))
# Get rid of peers that just say "null" and the instance itself
# Also make sure to lowercase all instance names and remove duplicates
return list(set([peer.lower() for peer in peers if peer and peer != instance_name]))
def get_statuses(self, instance_name: str):
"""Collect all statuses that mention users on other instances"""
mentions = []
datetime_threshold = datetime.now(timezone.utc) - timedelta(days=31)
statuses_seen = 0
# We'll ask for lots of statuses, but Mastodon never returns more than 40. Some Pleroma instances will ignore
# the limit and return 20.
url = 'https://{}/api/v1/timelines/public?local=true&limit={}'.format(instance_name, MAX_STATUSES_PER_PAGE)
while True:
if self.verbose:
log(self, "({} posts seen)\tGetting {}".format(statuses_seen, url))
response = requests.get(url, timeout=TIMEOUT)
statuses = response.json()
if response.status_code != 200 or get_key(statuses, ['error']):
if self.verbose:
log(self, "Couldn't get statuses for {}: {}".format(instance_name, response), error=True)
raise InvalidResponseException("Could not get statuses for {}".format(instance_name))
elif len(statuses) == 0:
break
# Get mentions from this instance
mentions.extend((seq(statuses)
.filter(lambda s: datetime_parser(s['created_at']) > datetime_threshold)
.flat_map(lambda s: s['mentions']))) # map to mentions
# Find out if we should stop here
earliest_status = statuses[-1]
earliest_time_seen = datetime_parser(earliest_status['created_at'])
statuses_seen += len(statuses)
# Mastodon returns max 40 statuses; if we ever see less than that we know there aren't any more
if earliest_time_seen < datetime_threshold or statuses_seen >= STATUS_SCRAPE_LIMIT:
break
# Continuing, so get url for next page
min_id = earliest_status['id']
url = 'https://{}/api/v1/timelines/public?local=true&limit={}&max_id={}'.format(instance_name, MAX_STATUSES_PER_PAGE, min_id)
time.sleep(2) # Sleep to avoid overloading the instance
mentions_seq = (seq(mentions)
.filter(lambda m: not m['acct'].endswith(instance_name) and '@' in m['acct'])
.map(lambda m: m['acct'].split('@')[-1]) # map to instance name
.map(lambda m: (m, 1))
.reduce_by_key(lambda x, y: x+y)) # sequence of tuples (instance, count)
mentions_by_instance = {t[0]: t[1] for t in mentions_seq} # dict of instance -> number of mentions
return mentions_by_instance, statuses_seen
def process_instance(self, instance: Instance):
"""Given an instance, get all the data we're interested in"""
data = dict()
try:
if instance.name.endswith("gab.best"):
raise BlacklistedDomainException
data['instance_name'] = instance.name
data['info'] = self.get_instance_info(instance.name)
# Check if this is a personal instance before continuing
user_count = get_key(data, ['info', 'stats', 'user_count'])
if isinstance(user_count, int)\
and user_count < PERSONAL_INSTANCE_THRESHOLD\
and instance.name not in self.whitelist:
raise PersonalInstanceException
data['peers'] = self.get_instance_peers(instance.name)
if not data['info'] and not data['peers']:
# We got a response from the instance, but it didn't have any of the information we were expecting.
raise InvalidResponseException
data['mentions'], data['statuses_seen'] = self.get_statuses(instance.name)
data['status'] = 'success'
return data
except (InvalidResponseException,
PersonalInstanceException,
BlacklistedDomainException,
requests.exceptions.RequestException,
json.decoder.JSONDecodeError) as e:
data['instance_name'] = instance.name
data['status'] = type(e).__name__
return data
@db.transaction.atomic
@require_lock(Instance, 'ACCESS EXCLUSIVE')
def save_data(self, instance, data, queue, existing_instance_ids):
"""Save data"""
# Validate the ints. Some servers that appear to be fake instances have e.g. negative numbers here.
instance.domain_count = validate_int(get_key(data, ['info', 'stats', 'domain_count']))
instance.status_count = validate_int(get_key(data, ['info', 'stats', 'status_count']))
instance.user_count = validate_int(get_key(data, ['info', 'stats', 'user_count']))
instance.description = get_key(data, ['info', 'description'])
instance.version = get_key(data, ['info', 'version'])
instance.status = get_key(data, ['status'])
instance.last_updated = timezone.now()
instance.save()
if data['status'] == 'success' and data['peers']:
# TODO: handle a peer disappearing
# Create instances for the peers we haven't seen before and add them to the queue
new_instance_ids = [peer_id for peer_id in data['peers'] if peer_id not in existing_instance_ids]
# bulk_create doesn't call save(), so the auto_now_add field won't get set automatically
new_instances = [Instance(name=id, first_seen=datetime.now(), last_updated=datetime.utcfromtimestamp(0))
for id in new_instance_ids]
existing_instance_ids.extend(new_instance_ids)
Instance.objects.bulk_create(new_instances)
for new_instance in new_instances:
queue.put(new_instance)
# Create relationships we haven't seen before
existing_peer_ids = PeerRelationship.objects.filter(source=instance).values_list('target', flat=True)
new_peer_ids = [peer_id for peer_id in data['peers'] if peer_id not in existing_peer_ids]
if new_peer_ids:
# new_peers = Instance.objects.filter(name__in=new_peer_ids)
new_relationships = [PeerRelationship(source=instance, target_id=new_peer, first_seen=datetime.now())
for new_peer in new_peer_ids]
PeerRelationship.objects.bulk_create(new_relationships)
if data['status'] == 'success' and data['mentions']:
# At this point, we can assume that a relationship exists for every peer that's mentioned in statuses
mentions = data['mentions']
relationships = PeerRelationship.objects.filter(source=instance,
target_id__in=list(mentions.keys()))
for relationship in relationships:
relationship.mention_count = mentions[relationship.target_id]
relationship.statuses_seen = data['statuses_seen']
relationship.last_updated = datetime.now()
bulk_update(relationships, update_fields=['mention_count', 'statuses_seen', 'last_updated'])
log(self, "Processed {}: {}".format(data['instance_name'], data['status']))
def worker(self, queue: mp.JoinableQueue, existing_instance_ids, scraped_ids):
"""The main worker that processes instances"""
db.connections.close_all() # https://stackoverflow.com/a/38356519/3697202
while True:
instance = queue.get()
if instance.name in scraped_ids:
# If we hit this branch, it's indicative of a bug
log(self, "Skipping {}, already done. This should not have been added to the queue!".format(instance),
error=True)
queue.task_done()
else:
# Fetch data on instance
log(self, "Processing {}".format(instance.name))
data = self.process_instance(instance)
self.save_data(instance, data, queue, existing_instance_ids)
scraped_ids[instance.name] = 1
queue.task_done()
def handle(self, *args, **options):
start_time = time.time()
self.verbose = options['verbose']
if options['instance']:
stale_instance, _ = Instance.objects.get_or_create(name=options['instance'])
stale_instances = [stale_instance]
elif options['all']:
stale_instances = Instance.objects.all()
else:
stale_instances = Instance.objects.filter(last_updated__lte=datetime.now()-timedelta(days=1))
if not options['unlimited']:
stale_instances = stale_instances[:INSTANCE_SCRAPE_LIMIT]
with mp.Manager() as manager:
# Share the list of existing instances amongst all threads (to avoid each thread having to query
# for it on every instance it scrapes)
existing_instance_ids = manager.list(list(Instance.objects.values_list('name', flat=True)))
scraped_ids = manager.dict()
queue = mp.JoinableQueue()
if stale_instances:
for instance in stale_instances:
queue.put(instance)
elif not Instance.objects.exists():
instance, _ = Instance.objects.get_or_create(name=SEED)
existing_instance_ids.append(instance.name)
queue.put(instance)
pool = mp.Pool(NUM_THREADS, initializer=self.worker, initargs=(queue, existing_instance_ids, scraped_ids))
queue.join()
self.scraped_count = len(scraped_ids.keys())
end_time = time.time()
log(self, "Scraped {} instances in {:.0f}s".format(self.scraped_count, end_time - start_time), True)

View file

@ -1,67 +0,0 @@
# Generated by Django 2.1.7 on 2019-02-21 12:27
from django.db import migrations, models
import django.db.models.deletion
import django.utils.timezone
class Migration(migrations.Migration):
initial = True
dependencies = [
]
operations = [
migrations.CreateModel(
name='Edge',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('weight', models.FloatField(blank=True, null=True)),
('last_updated', models.DateTimeField(default=django.utils.timezone.now)),
],
),
migrations.CreateModel(
name='Instance',
fields=[
('name', models.CharField(max_length=200, primary_key=True, serialize=False)),
('description', models.TextField(blank=True)),
('domain_count', models.IntegerField(blank=True, null=True)),
('status_count', models.IntegerField(blank=True, null=True)),
('user_count', models.IntegerField(blank=True, null=True)),
('version', models.CharField(blank=True, max_length=1000)),
('status', models.CharField(max_length=100)),
('x_coord', models.FloatField(blank=True, null=True)),
('y_coord', models.FloatField(blank=True, null=True)),
('first_seen', models.DateTimeField(auto_now_add=True)),
('last_updated', models.DateTimeField(default=django.utils.timezone.now)),
],
),
migrations.CreateModel(
name='PeerRelationship',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('mention_count', models.IntegerField(default=0)),
('statuses_seen', models.IntegerField(default=0)),
('first_seen', models.DateTimeField(auto_now_add=True)),
('last_updated', models.DateTimeField(default=django.utils.timezone.now)),
('source', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='following_relationship', to='scraper.Instance')),
('target', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='follower_relationships', to='scraper.Instance')),
],
),
migrations.AddField(
model_name='instance',
name='peers',
field=models.ManyToManyField(through='scraper.PeerRelationship', to='scraper.Instance'),
),
migrations.AddField(
model_name='edge',
name='source',
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='+', to='scraper.Instance'),
),
migrations.AddField(
model_name='edge',
name='target',
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='+', to='scraper.Instance'),
),
]

View file

@ -1,24 +0,0 @@
# Generated by Django 2.1.7 on 2019-04-19 13:46
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
dependencies = [
('scraper', '0001_initial'),
]
operations = [
migrations.AlterField(
model_name='edge',
name='source',
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='targets', to='scraper.Instance'),
),
migrations.AlterField(
model_name='edge',
name='target',
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='sources', to='scraper.Instance'),
),
]

View file

@ -1,59 +0,0 @@
from django.db import models
from django.utils import timezone
class Instance(models.Model):
"""
The main model that saves details of an instance and links between them in the peers
property.
Don't change the schema without verifying that the gephi script can still read the data.
"""
# Primary key
name = models.CharField(max_length=200, primary_key=True)
# Details
description = models.TextField(blank=True)
domain_count = models.IntegerField(blank=True, null=True)
status_count = models.IntegerField(blank=True, null=True)
user_count = models.IntegerField(blank=True, null=True)
version = models.CharField(max_length=1000, blank=True) # In Django CharField is never stored as NULL in the db
status = models.CharField(max_length=100)
# Foreign keys
peers = models.ManyToManyField('self', symmetrical=False, through='PeerRelationship')
# Graph
x_coord = models.FloatField(blank=True, null=True)
y_coord = models.FloatField(blank=True, null=True)
# Automatic fields
first_seen = models.DateTimeField(auto_now_add=True)
last_updated = models.DateTimeField(default=timezone.now)
class PeerRelationship(models.Model):
source = models.ForeignKey(Instance, related_name="following_relationship", on_delete=models.CASCADE)
target = models.ForeignKey(Instance, related_name="follower_relationships", on_delete=models.CASCADE)
# Interaction stats
mention_count = models.IntegerField(default=0)
statuses_seen = models.IntegerField(default=0) # because we want mention_count as a ratio
# Metadata
first_seen = models.DateTimeField(auto_now_add=True)
last_updated = models.DateTimeField(default=timezone.now)
class Edge(models.Model):
"""
This class is automatically generated from PeerRelationship using the build_edges command.
It aggregates stats from the asymmetrical PeerRelationship to a symmetrical one that's suitable for serving
to the front-end.
"""
source = models.ForeignKey(Instance, related_name='targets', on_delete=models.CASCADE)
target = models.ForeignKey(Instance, related_name='sources', on_delete=models.CASCADE)
weight = models.FloatField(blank=True, null=True)
# Metadata
last_updated = models.DateTimeField(default=timezone.now)

View file

@ -1,26 +0,0 @@
#! /bin/bash
SLEEP_SECONDS=3
>&2 echo "Checking Postgres status..."
# https://docs.docker.com/compose/startup-order/
export PGPASSWORD=$POSTGRES_PASSWORD
until psql -h db -U "$POSTGRES_USER" -p 5432 -d "$POSTGRES_DB" -c '\q'
do
>&2 echo "Postgres is unavailable - sleeping"
sleep $SLEEP_SECONDS
done
>&2 echo "Postgres is up"
python manage.py collectstatic --noinput
python manage.py migrate --noinput
if [[ $ENVIRONMENT == "development" ]]
then
>&2 echo "Running Django server on port 8000 for development"
python manage.py runserver 0.0.0.0:8000
else
>&2 echo "Running gunicorn server"
gunicorn backend.wsgi -c /config/gunicorn.conf.py
fi

View file

@ -0,0 +1,104 @@
defmodule BackendWeb.GraphControllerTest do
use BackendWeb.ConnCase
alias Backend.Api
alias Backend.Api.Graph
@create_attrs %{
id: "some id",
label: "some label",
size: 120.5,
x: 120.5,
y: 120.5
}
@update_attrs %{
id: "some updated id",
label: "some updated label",
size: 456.7,
x: 456.7,
y: 456.7
}
@invalid_attrs %{id: nil, label: nil, size: nil, x: nil, y: nil}
def fixture(:graph) do
{:ok, graph} = Api.create_graph(@create_attrs)
graph
end
setup %{conn: conn} do
{:ok, conn: put_req_header(conn, "accept", "application/json")}
end
describe "index" do
test "lists all nodes", %{conn: conn} do
conn = get(conn, Routes.graph_path(conn, :index))
assert json_response(conn, 200)["data"] == []
end
end
describe "create graph" do
test "renders graph when data is valid", %{conn: conn} do
conn = post(conn, Routes.graph_path(conn, :create), graph: @create_attrs)
assert %{"id" => id} = json_response(conn, 201)["data"]
conn = get(conn, Routes.graph_path(conn, :show, id))
assert %{
"id" => id,
"id" => "some id",
"label" => "some label",
"size" => 120.5,
"x" => 120.5,
"y" => 120.5
} = json_response(conn, 200)["data"]
end
test "renders errors when data is invalid", %{conn: conn} do
conn = post(conn, Routes.graph_path(conn, :create), graph: @invalid_attrs)
assert json_response(conn, 422)["errors"] != %{}
end
end
describe "update graph" do
setup [:create_graph]
test "renders graph when data is valid", %{conn: conn, graph: %Graph{id: id} = graph} do
conn = put(conn, Routes.graph_path(conn, :update, graph), graph: @update_attrs)
assert %{"id" => ^id} = json_response(conn, 200)["data"]
conn = get(conn, Routes.graph_path(conn, :show, id))
assert %{
"id" => id,
"id" => "some updated id",
"label" => "some updated label",
"size" => 456.7,
"x" => 456.7,
"y" => 456.7
} = json_response(conn, 200)["data"]
end
test "renders errors when data is invalid", %{conn: conn, graph: graph} do
conn = put(conn, Routes.graph_path(conn, :update, graph), graph: @invalid_attrs)
assert json_response(conn, 422)["errors"] != %{}
end
end
describe "delete graph" do
setup [:create_graph]
test "deletes chosen graph", %{conn: conn, graph: graph} do
conn = delete(conn, Routes.graph_path(conn, :delete, graph))
assert response(conn, 204)
assert_error_sent 404, fn ->
get(conn, Routes.graph_path(conn, :show, graph))
end
end
end
defp create_graph(_) do
graph = fixture(:graph)
{:ok, graph: graph}
end
end

View file

@ -0,0 +1,88 @@
defmodule BackendWeb.InstanceControllerTest do
use BackendWeb.ConnCase
alias Backend.Api
alias Backend.Api.Instance
@create_attrs %{
name: "some name"
}
@update_attrs %{
name: "some updated name"
}
@invalid_attrs %{name: nil}
def fixture(:instance) do
{:ok, instance} = Api.create_instance(@create_attrs)
instance
end
setup %{conn: conn} do
{:ok, conn: put_req_header(conn, "accept", "application/json")}
end
describe "index" do
test "lists all instances", %{conn: conn} do
conn = get(conn, Routes.instance_path(conn, :index))
assert json_response(conn, 200)["data"] == []
end
end
describe "create instance" do
test "renders instance when data is valid", %{conn: conn} do
conn = post(conn, Routes.instance_path(conn, :create), instance: @create_attrs)
assert %{"id" => id} = json_response(conn, 201)["data"]
conn = get(conn, Routes.instance_path(conn, :show, id))
assert %{
"id" => id,
"name" => "some name"
} = json_response(conn, 200)["data"]
end
test "renders errors when data is invalid", %{conn: conn} do
conn = post(conn, Routes.instance_path(conn, :create), instance: @invalid_attrs)
assert json_response(conn, 422)["errors"] != %{}
end
end
describe "update instance" do
setup [:create_instance]
test "renders instance when data is valid", %{conn: conn, instance: %Instance{id: id} = instance} do
conn = put(conn, Routes.instance_path(conn, :update, instance), instance: @update_attrs)
assert %{"id" => ^id} = json_response(conn, 200)["data"]
conn = get(conn, Routes.instance_path(conn, :show, id))
assert %{
"id" => id,
"name" => "some updated name"
} = json_response(conn, 200)["data"]
end
test "renders errors when data is invalid", %{conn: conn, instance: instance} do
conn = put(conn, Routes.instance_path(conn, :update, instance), instance: @invalid_attrs)
assert json_response(conn, 422)["errors"] != %{}
end
end
describe "delete instance" do
setup [:create_instance]
test "deletes chosen instance", %{conn: conn, instance: instance} do
conn = delete(conn, Routes.instance_path(conn, :delete, instance))
assert response(conn, 204)
assert_error_sent 404, fn ->
get(conn, Routes.instance_path(conn, :show, instance))
end
end
end
defp create_instance(_) do
instance = fixture(:instance)
{:ok, instance: instance}
end
end

View file

@ -0,0 +1,15 @@
defmodule BackendWeb.ErrorViewTest do
use BackendWeb.ConnCase, async: true
# Bring render/3 and render_to_string/3 for testing custom views
import Phoenix.View
test "renders 404.json" do
assert render(BackendWeb.ErrorView, "404.json", []) == %{errors: %{detail: "Not Found"}}
end
test "renders 500.json" do
assert render(BackendWeb.ErrorView, "500.json", []) ==
%{errors: %{detail: "Internal Server Error"}}
end
end

View file

@ -0,0 +1,37 @@
defmodule BackendWeb.ChannelCase do
@moduledoc """
This module defines the test case to be used by
channel tests.
Such tests rely on `Phoenix.ChannelTest` and also
import other functionality to make it easier
to build common data structures and query the data layer.
Finally, if the test case interacts with the database,
it cannot be async. For this reason, every test runs
inside a transaction which is reset at the beginning
of the test unless the test case is marked as async.
"""
use ExUnit.CaseTemplate
using do
quote do
# Import conveniences for testing with channels
use Phoenix.ChannelTest
# The default endpoint for testing
@endpoint BackendWeb.Endpoint
end
end
setup tags do
:ok = Ecto.Adapters.SQL.Sandbox.checkout(Backend.Repo)
unless tags[:async] do
Ecto.Adapters.SQL.Sandbox.mode(Backend.Repo, {:shared, self()})
end
:ok
end
end

View file

@ -0,0 +1,38 @@
defmodule BackendWeb.ConnCase do
@moduledoc """
This module defines the test case to be used by
tests that require setting up a connection.
Such tests rely on `Phoenix.ConnTest` and also
import other functionality to make it easier
to build common data structures and query the data layer.
Finally, if the test case interacts with the database,
it cannot be async. For this reason, every test runs
inside a transaction which is reset at the beginning
of the test unless the test case is marked as async.
"""
use ExUnit.CaseTemplate
using do
quote do
# Import conveniences for testing with connections
use Phoenix.ConnTest
alias BackendWeb.Router.Helpers, as: Routes
# The default endpoint for testing
@endpoint BackendWeb.Endpoint
end
end
setup tags do
:ok = Ecto.Adapters.SQL.Sandbox.checkout(Backend.Repo)
unless tags[:async] do
Ecto.Adapters.SQL.Sandbox.mode(Backend.Repo, {:shared, self()})
end
{:ok, conn: Phoenix.ConnTest.build_conn()}
end
end

View file

@ -0,0 +1,53 @@
defmodule Backend.DataCase do
@moduledoc """
This module defines the setup for tests requiring
access to the application's data layer.
You may define functions here to be used as helpers in
your tests.
Finally, if the test case interacts with the database,
it cannot be async. For this reason, every test runs
inside a transaction which is reset at the beginning
of the test unless the test case is marked as async.
"""
use ExUnit.CaseTemplate
using do
quote do
alias Backend.Repo
import Ecto
import Ecto.Changeset
import Ecto.Query
import Backend.DataCase
end
end
setup tags do
:ok = Ecto.Adapters.SQL.Sandbox.checkout(Backend.Repo)
unless tags[:async] do
Ecto.Adapters.SQL.Sandbox.mode(Backend.Repo, {:shared, self()})
end
:ok
end
@doc """
A helper that transforms changeset errors into a map of messages.
assert {:error, changeset} = Accounts.create_user(%{password: "short"})
assert "password is too short" in errors_on(changeset).password
assert %{password: ["password is too short"]} = errors_on(changeset)
"""
def errors_on(changeset) do
Ecto.Changeset.traverse_errors(changeset, fn {message, opts} ->
Enum.reduce(opts, message, fn {key, value}, acc ->
String.replace(acc, "%{#{key}}", to_string(value))
end)
end)
end
end

View file

@ -0,0 +1,2 @@
ExUnit.start()
Ecto.Adapters.SQL.Sandbox.mode(Backend.Repo, :manual)

View file

@ -1,13 +0,0 @@
backend.fediverse.space {
tls tao@btao.org
gzip
cors
root /srv
proxy / django:8000 {
transparent
except /static
}
}

View file

@ -1,196 +0,0 @@
#
# Server socket
#
# bind - The socket to bind.
#
# A string of the form: 'HOST', 'HOST:PORT', 'unix:PATH'.
# An IP is a valid HOST.
#
# backlog - The number of pending connections. This refers
# to the number of clients that can be waiting to be
# served. Exceeding this number results in the client
# getting an error when attempting to connect. It should
# only affect servers under significant load.
#
# Must be a positive integer. Generally set in the 64-2048
# range.
#
bind = [':8000']
#
# Worker processes
#
# workers - The number of worker processes that this server
# should keep alive for handling requests.
#
# A positive integer generally in the 2-4 x $(NUM_CORES)
# range. You'll want to vary this a bit to find the best
# for your particular application's work load.
#
# worker_class - The type of workers to use. The default
# sync class should handle most 'normal' types of work
# loads. You'll want to read
# http://docs.gunicorn.org/en/latest/design.html#choosing-a-worker-type
# for information on when you might want to choose one
# of the other worker classes.
#
# A string referring to a Python path to a subclass of
# gunicorn.workers.base.Worker. The default provided values
# can be seen at
# http://docs.gunicorn.org/en/latest/settings.html#worker-class
#
# worker_connections - For the eventlet and gevent worker classes
# this limits the maximum number of simultaneous clients that
# a single process can handle.
#
# A positive integer generally set to around 1000.
#
# timeout - If a worker does not notify the master process in this
# number of seconds it is killed and a new worker is spawned
# to replace it.
#
# Generally set to thirty seconds. Only set this noticeably
# higher if you're sure of the repercussions for sync workers.
# For the non sync workers it just means that the worker
# process is still communicating and is not tied to the length
# of time required to handle a single request.
#
# keepalive - The number of seconds to wait for the next request
# on a Keep-Alive HTTP connection.
#
# A positive integer. Generally set in the 1-5 seconds range.
#
# try:
# # fail 'successfully' if either of these modules aren't installed
# from gevent import monkey
# from psycogreen.gevent import patch_psycopg
# # setting this inside the 'try' ensures that we only
# # activate the gevent worker pool if we have gevent installed
# worker_class = 'gevent'
# workers = 4
# # this ensures forked processes are patched with gevent/gevent-psycopg2
# def do_post_fork(server, worker):
# monkey.patch_all()
# patch_psycopg()
# # you should see this text in your gunicorn logs if it was successful
# worker.log.info("Made Psycopg2 Green")
# post_fork = do_post_fork
# except ImportError:
# pass
workers = 4
# worker_connections = 1000
# timeout = 30
# keepalive = 2
#
# spew - Install a trace function that spews every line of Python
# that is executed when running the server. This is the
# nuclear option.
#
# True or False
#
spew = False
#
# Server mechanics
#
# daemon - Detach the main Gunicorn process from the controlling
# terminal with a standard fork/fork sequence.
#
# True or False
#
# pidfile - The path to a pid file to write
#
# A path string or None to not write a pid file.
#
# user - Switch worker processes to run as this user.
#
# A valid user id (as an integer) or the name of a user that
# can be retrieved with a call to pwd.getpwnam(value) or None
# to not change the worker process user.
#
# group - Switch worker process to run as this group.
#
# A valid group id (as an integer) or the name of a user that
# can be retrieved with a call to pwd.getgrnam(value) or None
# to change the worker processes group.
#
# umask - A mask for file permissions written by Gunicorn. Note that
# this affects unix socket permissions.
#
# A valid value for the os.umask(mode) call or a string
# compatible with int(value, 0) (0 means Python guesses
# the base, so values like "0", "0xFF", "0022" are valid
# for decimal, hex, and octal representations)
#
# tmp_upload_dir - A directory to store temporary request data when
# requests are read. This will most likely be disappearing soon.
#
# A path to a directory where the process owner can write. Or
# None to signal that Python should choose one on its own.
#
daemon = False
pidfile = '/var/gunicorn/.pid'
umask = 0
user = None
group = None
tmp_upload_dir = None
#
# Logging
#
# logfile - The path to a log file to write to.
#
# A path string. "-" means log to stdout.
#
# loglevel - The granularity of log output
#
# A string of "debug", "info", "warning", "error", "critical"
#
errorlog = '-'
loglevel = 'warning'
accesslog = '-'
access_log_format = '%(h)s %(t)s %(m)s %(U)s %(q)s %(H)s %(s)s %(B)s %(f)s %(a)s %(L)s'
#
# Process naming
#
# proc_name - A base to use with setproctitle to change the way
# that Gunicorn processes are reported in the system process
# table. This affects things like 'ps' and 'top'. If you're
# going to be running more than one instance of Gunicorn you'll
# probably want to set a name to tell them apart. This requires
# that you install the setproctitle module.
#
# A string or None to choose a default of something like 'gunicorn'.
#
proc_name = None
#
# Server hooks
#
# post_fork - Called just after a worker has been forked.
#
# A callable that takes a server and worker instance
# as arguments.
#
# pre_fork - Called just prior to forking the worker subprocess.
#
# A callable that accepts the same arguments as after_fork
#
# pre_exec - Called just prior to forking off a secondary
# master process during things like config reloading.
#
# A callable that takes a server instance as the sole argument.
#

View file

@ -1,38 +1,23 @@
version: '3' version: "3"
services: services:
db: db:
restart: always restart: always
networks: networks:
- database_network - database_network
django: phoenix:
restart: always restart: always
volumes: build: ./backend
- ./config/gunicorn.conf.py:/config/gunicorn.conf.py
- gunicorn-socket:/var/gunicorn
- staticfiles:/code/backend/static
networks: networks:
- database_network - database_network
- server_network
environment:
- ENVIRONMENT=production
- DJANGO_SETTINGS_MODULE=backend.settings.production
caddy:
restart: always
image: abiosoft/caddy:0.11.4-no-stats
ports:
- "80:80"
- "443:443"
volumes:
- ./config/Caddyfile:/etc/Caddyfile
- staticfiles:/srv/static
- caddycerts:/etc/caddycerts
networks:
- server_network
depends_on: depends_on:
- django - db
ports:
- "${PORT}:${PORT}"
environment: environment:
- ACME_AGREE - DATABASE_URL
- CADDYPATH=/etc/caddycerts - SECRET_KEY_BASE
- PORT
- BACKEND_HOSTNAME
gephi: gephi:
networks: networks:
- database_network - database_network
@ -41,9 +26,3 @@ services:
networks: networks:
database_network: database_network:
driver: bridge driver: bridge
server_network:
driver: bridge
volumes:
gunicorn-socket:
caddycerts:
staticfiles:

View file

@ -1,38 +1,18 @@
version: '3' version: "3"
services: services:
db: db:
image: postgres image: postgres
environment: environment:
- POSTGRES_USER - DATABASE_URL
- POSTGRES_PASSWORD
- POSTGRES_DB
ports: ports:
- "5432:5432" - "5432:5432"
volumes: volumes:
- pgdata:/var/lib/postgresql/data - pgdata:/var/lib/postgresql/data
django:
environment:
- SECRET_KEY
- POSTGRES_USER
- POSTGRES_PASSWORD
- POSTGRES_DB
- DJANGO_SETTINGS_MODULE
- ENVIRONMENT=development
build: ./backend
command: bash scripts/docker-entrypoint.sh
volumes:
- ./backend:/code
ports:
- "8000:8000"
depends_on:
- db
# This is for running the occasional graph layout task. It's in docker-compose.yml so that it's built at the same time # This is for running the occasional graph layout task. It's in docker-compose.yml so that it's built at the same time
# as everything else, but it should be run regularly with a cron job or similar. # as everything else, but it should be run regularly with a cron job or similar.
gephi: gephi:
environment: environment:
- POSTGRES_USER - DATABASE_URL
- POSTGRES_PASSWORD
- POSTGRES_DB
build: ./gephi build: ./gephi
volumes: volumes:
- gradle-cache:/code/.gradle - gradle-cache:/code/.gradle

View file

@ -1,6 +1,4 @@
SECRET_KEY=a-long-secret-key DATABASE_URL="postgres://postgres:postgres@localhost:5432/backend_dev"
POSTGRES_USER=postgres PORT=4000
POSTGRES_PASSWORD=postgres BACKEND_HOSTNAME=localhost
POSTGRES_DB=fediverse SECRET_KEY_BASE=jLqbBjtQTyZj+1yLwDV8xgZYvZKIBx1MBWbcC2a0mZqB5ivYKQ7GOqNR91g6YnR8
DJANGO_SETTINGS_MODULE=backend.settings.development
ACME_AGREE=true

21
frontend/.gitignore vendored
View file

@ -1,21 +0,0 @@
# See https://help.github.com/ignore-files/ for more about ignoring files.
# dependencies
/node_modules
# testing
/coverage
# production
/build
# misc
.DS_Store
.env.local
.env.development.local
.env.test.local
.env.production.local
npm-debug.log*
yarn-debug.log*
yarn-error.log*

View file

@ -5,7 +5,8 @@
"scripts": { "scripts": {
"start": "NODE_ENV=development react-scripts start", "start": "NODE_ENV=development react-scripts start",
"build": "react-scripts build", "build": "react-scripts build",
"lint": "tslint -p tsconfig.json -c tslint.json \"src/**/*.{ts,tsx}\"", "typecheck": "tsc --noemit",
"lint": "yarn typecheck && tslint -p tsconfig.json -c tslint.json \"src/**/*.{ts,tsx}\"",
"lint:fix": "yarn lint --fix", "lint:fix": "yarn lint --fix",
"pretty": "prettier --write \"src/**/*.{ts,tsx}\"", "pretty": "prettier --write \"src/**/*.{ts,tsx}\"",
"test": "yarn lint && react-scripts test", "test": "yarn lint && react-scripts test",
@ -27,44 +28,47 @@
"printWidth": 120 "printWidth": 120
}, },
"dependencies": { "dependencies": {
"@blueprintjs/core": "^3.4.0", "@blueprintjs/core": "^3.17.1",
"@blueprintjs/icons": "^3.1.0", "@blueprintjs/icons": "^3.9.1",
"@blueprintjs/select": "^3.1.0", "@blueprintjs/select": "^3.9.0",
"classnames": "^2.2.6", "classnames": "^2.2.6",
"cross-fetch": "^3.0.2", "cross-fetch": "^3.0.4",
"lodash": "^4.17.10", "cytoscape": "^3.8.1",
"cytoscape-cola": "^2.3.0",
"lodash": "^4.17.14",
"moment": "^2.22.2", "moment": "^2.22.2",
"normalize.css": "^8.0.0", "normalize.css": "^8.0.0",
"react": "^16.4.2", "react": "^16.4.2",
"react-dom": "^16.4.2", "react-dom": "^16.4.2",
"react-redux": "^7.0.2", "react-redux": "^7.1.0",
"react-router-dom": "^5.0.0", "react-router-dom": "^5.0.1",
"react-scripts": "^2.1.8", "react-scripts": "^3.0.1",
"react-sigma": "^1.2.30", "react-sigma": "^1.2.30",
"react-virtualized": "^9.20.1", "react-virtualized": "^9.21.1",
"redux": "^4.0.0", "redux": "^4.0.4",
"redux-thunk": "^2.3.0", "redux-thunk": "^2.3.0",
"sanitize-html": "^1.18.4", "sanitize-html": "^1.20.1",
"styled-components": "^4.2.0" "styled-components": "^4.3.2"
}, },
"devDependencies": { "devDependencies": {
"@blueprintjs/tslint-config": "^1.8.0", "@blueprintjs/tslint-config": "^1.8.1",
"@types/classnames": "^2.2.6", "@types/classnames": "^2.2.9",
"@types/jest": "^24.0.11", "@types/cytoscape": "^3.4.3",
"@types/lodash": "^4.14.116", "@types/jest": "^24.0.15",
"@types/node": "^11.13.4", "@types/lodash": "^4.14.136",
"@types/react": "^16.8.13", "@types/node": "^12.6.2",
"@types/react": "^16.8.23",
"@types/react-dom": "^16.8.4", "@types/react-dom": "^16.8.4",
"@types/react-redux": "^7.0.6", "@types/react-redux": "^7.1.1",
"@types/react-router-dom": "^4.3.2", "@types/react-router-dom": "^4.3.4",
"@types/react-virtualized": "^9.18.7", "@types/react-virtualized": "^9.21.2",
"@types/sanitize-html": "^1.18.3", "@types/sanitize-html": "^1.20.1",
"@types/styled-components": "4.1.8", "@types/styled-components": "4.1.18",
"husky": "^1.3.1", "husky": "^3.0.0",
"lint-staged": "^8.1.5", "lint-staged": "^9.2.0",
"tslint": "^5.16.0", "tslint": "^5.18.0",
"tslint-eslint-rules": "^5.4.0", "tslint-eslint-rules": "^5.4.0",
"typescript": "^3.0.1" "typescript": "^3.5.3"
}, },
"browserslist": [ "browserslist": [
">0.2%", ">0.2%",

View file

@ -0,0 +1,206 @@
import cytoscape from "cytoscape";
// import cola from "cytoscape-cola";
import * as React from "react";
import { connect } from "react-redux";
import { Dispatch } from "redux";
import styled from "styled-components";
import { DEFAULT_NODE_COLOR, SELECTED_NODE_COLOR } from "../constants";
import { selectAndLoadInstance } from "../redux/actions";
import { IAppState, IGraph } from "../redux/types";
import { ErrorState } from "./ErrorState";
// import { FloatingLayoutSelect } from "./FloatingLayoutSelect";
import { FloatingResetButton } from "./FloatingResetButton";
interface IGraphProps {
graph?: IGraph;
currentInstanceName: string | null;
selectAndLoadInstance: (name: string) => void;
}
interface IGraphState {
layoutAlgorithm: string;
isLayouting: boolean;
didError: boolean;
}
class GraphImpl extends React.Component<IGraphProps, IGraphState> {
private cy?: cytoscape.Core;
// private layout?: cytoscape.Layouts;
private cytoscapeDiv: React.RefObject<HTMLElement>;
public constructor(props: IGraphProps) {
super(props);
this.cytoscapeDiv = React.createRef();
this.state = { layoutAlgorithm: "cola", isLayouting: false, didError: false };
}
public render() {
if (this.state.didError) {
return <ErrorState />;
}
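// Full-viewport container, leaving 50px at the top (presumably for the app's navigation bar).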
const FullDiv = styled.div`
position: absolute;
top: 50px;
bottom: 0;
right: 0;
left: 0;
`;
return (
<div>
<FullDiv id="cytoscape" ref={this.cytoscapeDiv as any} />
{/* <FloatingLayoutSelect
onItemSelect={this.handleLayoutSelect}
currentLayoutKey={this.state.layoutAlgorithm}
startLayout={this.startLayout}
stopLayout={this.stopLayout}
/> */}
<FloatingResetButton onClick={this.resetGraph} />
</div>
);
}
public componentDidMount() {
let { graph } = this.props;
if (!graph) {
this.setState({ didError: true });
return;
}
// Check that all nodes have size & coordinates; otherwise the graph will look messed up
const lengthBeforeFilter = graph.nodes.length;
graph = { ...graph, nodes: graph.nodes.filter(n => n.size && n.x && n.y) };
if (graph.nodes.length !== lengthBeforeFilter) {
// tslint:disable-next-line:no-console
console.error(
"Some nodes were missing details: " + graph.nodes.filter(n => !n.size || !n.x || !n.y).map(n => n.label)
);
this.setState({ didError: true });
}
// cytoscape.use(cola as any);
this.initGraph();
}
public componentDidUpdate() {
this.initGraph();
}
// private handleLayoutSelect = (layout: string) => {
// this.setState({ layoutAlgorithm: layout });
// };
// private startLayout = () => {
// if (!this.cy) {
// return;
// }
// const options = {
// cola: {
// animate: true,
// convergenceThreshold: 0.1,
// edgeLength: (edge: any) => 1 / edge.data("weight"),
// name: "cola"
// },
// cose: {
// animate: false,
// idealEdgeLength: (edge: any) => 1 / edge.data("weight"),
// name: "cose",
// numIter: 100
// }
// };
// this.layout = this.cy.layout(options[this.state.layoutAlgorithm] as any);
// this.layout.run();
// };
// private stopLayout = () => {
// if (!this.layout) {
// return;
// }
// this.layout.stop();
// };
private initGraph = () => {
const { graph } = this.props;
if (this.state.didError || !graph) {
return;
}
this.cy = cytoscape({
autoungrabify: true,
container: this.cytoscapeDiv.current,
elements: {
edges: graph.edges.map(edge => ({
data: {
id: edge.id || `${edge.source}${edge.target}`,
source: edge.source,
target: edge.target,
weight: edge.size
},
group: "edges" as "edges"
})),
nodes: graph.nodes.map(node => ({
data: {
id: node.id
},
group: "nodes" as "nodes",
position: {
x: node.x,
y: node.y
}
}))
},
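// Node positions arrive precomputed (the offline Gephi job does the layout), so the preset layout just applies them.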
layout: {
name: "preset"
},
selectionType: "single",
style: [
{
selector: "node:selected",
style: {
"background-color": SELECTED_NODE_COLOR,
label: "data(id)"
}
},
{
selector: "node",
style: {
"background-color": DEFAULT_NODE_COLOR
}
}
]
});
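// Cytoscape selection drives the sidebar: selecting a node loads that instance, unselecting clears it.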
this.cy.nodes().on("select", e => {
const instanceId = e.target.data("id");
if (instanceId) {
// console.log(`selecting ${instanceId}`);
// console.log(`now selected: ${this.cy && this.cy.$(":selected")}`);
this.props.selectAndLoadInstance(instanceId);
}
});
this.cy.nodes().on("unselect", e => {
const instanceId = e.target.data("id");
if (instanceId) {
// console.log(`unselecting ${instanceId}`);
this.props.selectAndLoadInstance("");
}
});
};
private resetGraph = () => {
if (!this.cy) {
return;
}
this.cy.reset();
};
}
const mapStateToProps = (state: IAppState) => ({
currentInstanceName: state.currentInstance.currentInstanceName,
graph: state.data.graph
});
const mapDispatchToProps = (dispatch: Dispatch) => ({
selectAndLoadInstance: (instanceName: string) => dispatch(selectAndLoadInstance(instanceName) as any)
});
export const CytoscapeGraph = connect(
mapStateToProps,
mapDispatchToProps
)(GraphImpl);
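
The cola layout wiring in GraphImpl is left commented out, so only the preset layout is active. Below is a minimal sketch of how the extension could be enabled. It assumes that the default export of cytoscape-cola is the registration function expected by cytoscape.use (see the typings shim at the end of this diff); runColaLayout is an illustrative helper, not code from this commit.

import cytoscape from "cytoscape";
import cola from "cytoscape-cola";

// Register the extension once, before any layout with name "cola" is requested.
cytoscape.use(cola);

// Illustrative helper mirroring the commented-out cola options in GraphImpl:
// heavier edges pull their endpoints closer together.
const runColaLayout = (cy: cytoscape.Core): cytoscape.Layouts => {
  const layout = cy.layout({
    animate: true,
    convergenceThreshold: 0.1,
    edgeLength: (edge: any) => 1 / edge.data("weight"),
    name: "cola"
  } as any);
  layout.run();
  return layout;
};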

View file

@ -0,0 +1,14 @@
import { Card, Elevation, ICardProps } from "@blueprintjs/core";
import * as React from "react";
import styled from "styled-components";
const FloatingCardElement = styled(Card)`
position: absolute;
bottom: 10px;
left: 10px;
z-index: 20;
`;
const FloatingCard: React.FC<ICardProps> = props => <FloatingCardElement elevation={Elevation.TWO} {...props} />;
export default FloatingCard;

View file

@ -0,0 +1,53 @@
import { Button, H6, MenuItem } from "@blueprintjs/core";
import { IconNames } from "@blueprintjs/icons";
import { ItemRenderer, Select } from "@blueprintjs/select";
import * as React from "react";
import FloatingCard from "./FloatingCard";
interface ILayoutToDisplayName {
[key: string]: string;
}
const layouts: ILayoutToDisplayName = {
cola: "COLA",
cose: "CoSE"
};
const LayoutSelect = Select.ofType<string>();
const LayoutItemRenderer: ItemRenderer<string> = (layout, { handleClick, modifiers }) => (
<MenuItem active={modifiers.active} key={layout} onClick={handleClick} text={layout} />
);
interface IFloatingLayoutSelectProps {
currentLayoutKey: string;
onItemSelect: (layout: string) => void;
startLayout: () => void;
stopLayout: () => void;
}
export const FloatingLayoutSelect: React.FC<IFloatingLayoutSelectProps> = ({
currentLayoutKey,
onItemSelect,
startLayout,
stopLayout
}) => {
return (
<FloatingCard>
<H6>Layout</H6>
<LayoutSelect
items={Object.keys(layouts)}
itemRenderer={LayoutItemRenderer}
filterable={false}
onItemSelect={onItemSelect}
popoverProps={{ minimal: true }}
>
<Button
icon="film"
rightIcon="caret-down"
text={currentLayoutKey ? layouts[currentLayoutKey] : "(No selection)"}
/>
</LayoutSelect>
<br />
<Button icon={IconNames.PLAY} onClick={startLayout} />
<Button icon={IconNames.STOP} onClick={stopLayout} />
</FloatingCard>
);
};

View file

@ -0,0 +1,12 @@
import { Button } from "@blueprintjs/core";
import * as React from "react";
import FloatingCard from "./FloatingCard";
interface IFloatingResetButtonProps {
onClick?: () => any;
}
export const FloatingResetButton: React.FC<IFloatingResetButtonProps> = ({ onClick }) => (
<FloatingCard>
<Button icon="compass" onClick={onClick} />
</FloatingCard>
);

View file

@ -58,7 +58,7 @@ class SidebarImpl extends React.Component<ISidebarProps, ISidebarState> {
className={"fediverse-sidebar-toggle-button" + closedClass} className={"fediverse-sidebar-toggle-button" + closedClass}
minimal={true} minimal={true}
/> />
<Card className={"fediverse-sidebar" + closedClass} elevation={Elevation.THREE}> <Card className={"fediverse-sidebar" + closedClass} elevation={Elevation.TWO}>
{this.renderSidebarContents()} {this.renderSidebarContents()}
</Card> </Card>
</div> </div>

View file

@ -6,8 +6,8 @@ import { NonIdealState, Spinner } from "@blueprintjs/core";
import { fetchGraph, fetchInstances } from "../../redux/actions"; import { fetchGraph, fetchInstances } from "../../redux/actions";
import { IAppState, IGraph, IInstance } from "../../redux/types"; import { IAppState, IGraph, IInstance } from "../../redux/types";
import { CytoscapeGraph } from "../CytoscapeGraph";
import { ErrorState } from "../ErrorState"; import { ErrorState } from "../ErrorState";
import { Graph } from "../Graph";
import { Sidebar } from "../Sidebar"; import { Sidebar } from "../Sidebar";
interface IGraphScreenProps { interface IGraphScreenProps {
@ -48,7 +48,7 @@ class GraphScreenImpl extends React.Component<IGraphScreenProps> {
}; };
private graphState = () => { private graphState = () => {
const content = this.props.graphLoadError ? <ErrorState /> : <Graph />; const content = this.props.graphLoadError ? <ErrorState /> : <CytoscapeGraph />;
return ( return (
<div> <div>
<Sidebar /> <Sidebar />

View file

@ -1,2 +1,5 @@
/* Screen widths less than this will be treated as mobile */ /* Screen widths less than this will be treated as mobile */
export const DESKTOP_WIDTH_THRESHOLD = 800; export const DESKTOP_WIDTH_THRESHOLD = 800;
export const DEFAULT_NODE_COLOR = "#CED9E0";
export const SELECTED_NODE_COLOR = "#48AFF0";

View file

@ -87,13 +87,7 @@ export const selectAndLoadInstance = (instanceName: string) => {
export const fetchGraph = () => { export const fetchGraph = () => {
return (dispatch: Dispatch) => { return (dispatch: Dispatch) => {
dispatch(requestGraph()); dispatch(requestGraph());
return Promise.all([getFromApi("graph/edges"), getFromApi("graph/nodes")]) return getFromApi("graph")
.then(responses => {
return {
edges: responses[0],
nodes: responses[1]
};
})
.then(graph => dispatch(receiveGraph(graph))) .then(graph => dispatch(receiveGraph(graph)))
.catch(e => dispatch(graphLoadFailed())); .catch(e => dispatch(graphLoadFailed()));
}; };
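
The separate requests to graph/nodes and graph/edges are folded into a single graph endpoint here. A minimal sketch of the combined payload the frontend now expects follows. The field names are inferred from how CytoscapeGraph reads nodes and edges, and the /api/graph path is a stand-in for whatever getFromApi("graph") resolves against; neither is confirmed by the backend code in this diff.

import fetch from "cross-fetch";

// Shape inferred from GraphImpl: nodes carry precomputed coordinates and sizes,
// edges carry a size that is used as the layout weight.
interface IGraphResponse {
  nodes: Array<{ id: string; label: string; x: number; y: number; size: number }>;
  edges: Array<{ id?: string; source: string; target: string; size: number }>;
}

// Hypothetical one-shot fetch; the real code goes through getFromApi("graph").
const fetchGraphOnce = (): Promise<IGraphResponse> =>
  fetch("/api/graph").then(response => response.json());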

View file

@ -0,0 +1,3 @@
declare module "cytoscape-cola" {
const prototype: {};
}
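
The shim above only declares cytoscape-cola as a known module. If the commented-out import in CytoscapeGraph is ever re-enabled, a declaration along the lines of the sketch below would give it a typed default export; cytoscape.Ext is assumed to be the extension type exported by @types/cytoscape.

declare module "cytoscape-cola" {
  import cytoscape from "cytoscape";
  // The package exports a registration function that is passed to cytoscape.use().
  const register: cytoscape.Ext;
  export default register;
}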

Some files were not shown because too many files have changed in this diff.