refactor/elixir backend

merge-requests/53/head
Tao Bojlén, 2 years ago
parent commit a37452f138
100 changed files (lines changed in parentheses):

  1. .dokku-monorepo (2)
  2. .gitignore (171)
  3. backend/.dockerignore (9)
  4. backend/.formatter.exs (5)
  5. backend/Dockerfile (65)
  6. backend/README.md (33)
  7. backend/apiv1/__init__.py (0)
  8. backend/apiv1/_util.py (8)
  9. backend/apiv1/apps.py (5)
 10. backend/apiv1/serializers.py (105)
 11. backend/apiv1/views.py (37)
 12. backend/backend/__init__.py (0)
 13. backend/backend/settings/base.py (124)
 14. backend/backend/settings/development.py (7)
 15. backend/backend/settings/production.py (10)
 16. backend/backend/urls.py (37)
 17. backend/backend/wsgi.py (13)
 18. backend/config/config.exs (51)
 19. backend/config/dev.exs (72)
 20. backend/config/prod.exs (57)
 21. backend/config/releases.exs (27)
 22. backend/config/test.exs (18)
 23. backend/lib/backend.ex (9)
 24. backend/lib/backend/api.ex (68)
 25. backend/lib/backend/application.ex (46)
 26. backend/lib/backend/crawl.ex (26)
 27. backend/lib/backend/crawl_interaction.ex (29)
 28. backend/lib/backend/crawler/api_crawler.ex (45)
 29. backend/lib/backend/crawler/crawler.ex (196)
 30. backend/lib/backend/crawler/crawlers/mastodon.ex (193)
 31. backend/lib/backend/crawler/stale_instance_manager.ex (84)
 32. backend/lib/backend/crawler/util.ex (63)
 33. backend/lib/backend/edge.ex (25)
 34. backend/lib/backend/instance.ex (41)
 35. backend/lib/backend/instance_peer.ex (27)
 36. backend/lib/backend/release.ex (18)
 37. backend/lib/backend/repo.ex (5)
 38. backend/lib/backend/scheduler.ex (116)
 39. backend/lib/backend/util.ex (129)
 40. backend/lib/backend_web.ex (66)
 41. backend/lib/backend_web/channels/user_socket.ex (33)
 42. backend/lib/backend_web/controllers/fallback_controller.ex (15)
 43. backend/lib/backend_web/controllers/graph_controller.ex (13)
 44. backend/lib/backend_web/controllers/instance_controller.ex (27)
 45. backend/lib/backend_web/endpoint.ex (51)
 46. backend/lib/backend_web/gettext.ex (24)
 47. backend/lib/backend_web/router.ex (14)
 48. backend/lib/backend_web/views/changeset_view.ex (19)
 49. backend/lib/backend_web/views/error_helpers.ex (33)
 50. backend/lib/backend_web/views/error_view.ex (16)
 51. backend/lib/backend_web/views/graph_view.ex (36)
 52. backend/lib/backend_web/views/instance_view.ex (45)
 53. backend/lib/mix/tasks/crawl.ex (13)
 54. backend/manage.py (15)
 55. backend/mix.exs (65)
 56. backend/mix.lock (43)
 57. backend/priv/gettext/en/LC_MESSAGES/errors.po (97)
 58. backend/priv/repo/migrations/.formatter.exs (4)
 59. backend/priv/repo/migrations/20190624090436_create_instances.exs (29)
 60. backend/priv/repo/migrations/20190710133755_create_edges.exs (15)
 61. backend/priv/repo/migrations/20190710155001_create_crawls.exs (20)
 62. backend/priv/repo/migrations/20190710155112_create_crawl_interactions.exs (16)
 63. backend/priv/repo/migrations/20190712133009_add_instance_coords.exs (10)
 64. backend/priv/repo/seeds.exs (11)
 65. backend/rel/rel/vm.args.eex (1)
 66. backend/requirements.txt (28)
 67. backend/scraper/__init__.py (0)
 68. backend/scraper/admin.py (3)
 69. backend/scraper/apps.py (5)
 70. backend/scraper/management/commands/_util.py (84)
 71. backend/scraper/management/commands/build_edges.py (38)
 72. backend/scraper/management/commands/scrape.py (276)
 73. backend/scraper/migrations/0001_initial.py (67)
 74. backend/scraper/migrations/0002_auto_20190419_1346.py (24)
 75. backend/scraper/migrations/__init__.py (0)
 76. backend/scraper/models.py (59)
 77. backend/scripts/docker-entrypoint.sh (26)
 78. backend/test/backend_web/controllers/graph_controller_test.exs (104)
 79. backend/test/backend_web/controllers/instance_controller_test.exs (88)
 80. backend/test/backend_web/views/error_view_test.exs (15)
 81. backend/test/support/channel_case.ex (37)
 82. backend/test/support/conn_case.ex (38)
 83. backend/test/support/data_case.ex (53)
 84. backend/test/test_helper.exs (2)
 85. config/Caddyfile (13)
 86. config/gunicorn.conf.py (196)
 87. docker-compose.production.yml (41)
 88. docker-compose.yml (26)
 89. example.env (10)
 90. frontend/.gitignore (21)
 91. frontend/package.json (60)
 92. frontend/src/components/CytoscapeGraph.tsx (206)
 93. frontend/src/components/FloatingCard.tsx (14)
 94. frontend/src/components/FloatingLayoutSelect.tsx (53)
 95. frontend/src/components/FloatingResetButton.tsx (12)
 96. frontend/src/components/Sidebar.tsx (2)
 97. frontend/src/components/screens/GraphScreen.tsx (4)
 98. frontend/src/constants.tsx (3)
 99. frontend/src/redux/actions.ts (8)
100. frontend/src/typings/cytoscape-cola.d.ts (3)

.dokku-monorepo (2)

@@ -0,0 +1,2 @@
backend=backend
gephi=gephi

.gitignore (171)

@@ -1,113 +1,98 @@
*.csv
.idea/
backend/backend/static/
backend/static/
*.gexf
backend/whitelist.txt
data/
.vscode/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
./lib/
./lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Environments
.env
.env*
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Django stuff:
*.log
local_settings.py
db.sqlite3
# The directory Mix will write compiled artifacts to.
/backend/_build/
# Flask stuff:
instance/
.webassets-cache
# If you run "mix test --cover", coverage assets end up here.
/backend/cover/
# Scrapy stuff:
.scrapy
# The directory Mix downloads your dependencies sources to.
/backend/deps/
# Sphinx documentation
docs/_build/
# Where 3rd-party dependencies like ExDoc output generated docs.
/backend/doc/
# PyBuilder
target/
# Ignore .fetch files in case you like to edit your project deps locally.
/backend/.fetch
# Jupyter Notebook
.ipynb_checkpoints
# If the VM crashes, it generates a dump, let's ignore it too.
erl_crash.dump
# pyenv
.python-version
# Also ignore archive artifacts (built via "mix archive.build").
*.ez
# celery beat schedule file
celerybeat-schedule
# Ignore package tarball (built via "mix hex.build").
backend-*.tar
# SageMath parsed files
*.sage.py
# Since we are building assets from assets/,
# we ignore priv/static. You may want to comment
# this depending on your deployment strategy.
/backend/priv/static/
# Environments
.env
.env*
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Files matching config/*.secret.exs pattern contain sensitive
# data and you should not commit them into version control.
#
# Alternatively, you may comment the line below and commit the
# secrets files as long as you replace their contents by environment
# variables.
/backend/config/*.secret.exs
/backend/.elixir_ls/
*.pot
*.po
# dependencies
/frontend/node_modules
# testing
/frontend/coverage
# production
/frontend/build
# misc
.DS_Store
.env.local
.env.development.local
.env.test.local
.env.production.local
npm-debug.log*
yarn-debug.log*
yarn-error.log*
/gephi/.gradle/
/gephi/build/
/gephi/lib/*
!/gephi/lib/.gitkeep
# 64MB file but I don't have much faith that it'll remain available...
!/gephi/lib/gephi-toolkit-0.9.2.jar
*/.idea/
# Spyder project settings
.spyderproject
.spyproject
# Ignore Gradle GUI config
/gephi/gradle-app.setting
# Rope project settings
.ropeproject
# Avoid ignoring Gradle wrapper jar file (.jar files are usually ignored)
!/gephi/gradle-wrapper.jar
# mkdocs documentation
/site
# Cache of project
/gephi/.gradletasknamecache
# mypy
.mypy_cache/
*.javac

backend/.dockerignore (9)

@@ -0,0 +1,9 @@
_build/
deps/
.git/
.gitignore
Dockerfile
Makefile
README*
test/
priv/static/

backend/.formatter.exs (5)

@@ -0,0 +1,5 @@
[
import_deps: [:ecto, :phoenix],
inputs: ["*.{ex,exs}", "priv/*/seeds.exs", "{config,lib,test}/**/*.{ex,exs}"],
subdirectories: ["priv/*/migrations"]
]

backend/Dockerfile (65)

@@ -1,12 +1,53 @@
FROM python:3
ENV PYTHONUNBUFFERED 1
RUN apt-get update && \
apt-get install -qqy --no-install-recommends \
postgresql-client-9.6=9.6.10-0+deb9u1
RUN mkdir /code
WORKDIR /code
COPY requirements.txt /code/
RUN pip install -r requirements.txt
COPY . /code/
FROM elixir:1.9.0-alpine as build
# install build dependencies
RUN apk add --update git build-base
# prepare build dir
RUN mkdir /app
WORKDIR /app
# install hex + rebar
RUN mix local.hex --force && \
mix local.rebar --force
# set build ENV
ENV MIX_ENV=prod
# install mix dependencies
COPY mix.exs mix.lock ./
COPY config config
RUN mix deps.get
RUN mix deps.compile
# build assets
# COPY assets assets
# RUN cd assets && npm install && npm run deploy
# RUN mix phx.digest
# build project
COPY priv priv
COPY lib lib
RUN mix compile
# build release
COPY rel rel
RUN mix release
# prepare release image
FROM alpine:3.9 AS app
RUN apk add --update bash openssl
RUN mkdir /app
WORKDIR /app
ENV APP_NAME=backend
COPY --from=build /app/_build/prod/rel/${APP_NAME} ./
RUN chown -R nobody: /app
USER nobody
ENV HOME=/app
# The command to start the backend
CMD trap 'exit' INT; ${HOME}/bin/${APP_NAME} start

backend/README.md (33)

@@ -0,0 +1,33 @@
# fediverse.space backend
## Notes
- This project requires Elixir >= 1.9.
- Run with `SKIP_CRAWL=true` to just run the server (useful for working on the API without also crawling)
## Deployment
Deployment with Docker is handled as per the [Distillery docs](https://hexdocs.pm/distillery/guides/working_with_docker.html).
- To build a new version, run `make build` in this directory.
- To migrate a released version, run `./backend eval "Backend.Release.migrate"`
# Default README
To start your Phoenix server:
- Install dependencies with `mix deps.get`
- Create and migrate your database with `mix ecto.setup`
- Start Phoenix endpoint with `mix phx.server`
Now you can visit [`localhost:4000`](http://localhost:4000) from your browser.
Ready to run in production? Please [check our deployment guides](https://hexdocs.pm/phoenix/deployment.html).
## Learn more
- Official website: http://www.phoenixframework.org/
- Guides: https://hexdocs.pm/phoenix/overview.html
- Docs: https://hexdocs.pm/phoenix
- Mailing list: http://groups.google.com/group/phoenix-talk
- Source: https://github.com/phoenixframework/phoenix

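The README's migration step calls `Backend.Release.migrate`. `backend/lib/backend/release.ex` appears in the file list above (18 lines) but is not shown in this excerpt; as a sketch only, the conventional Elixir 1.9 release-migration module it most likely resembles is:

```elixir
defmodule Backend.Release do
  # Sketch only: the real backend/lib/backend/release.ex is not shown
  # in this diff. This is the standard Phoenix/Ecto release pattern.
  @app :backend

  def migrate do
    for repo <- repos() do
      {:ok, _, _} = Ecto.Migrator.with_repo(repo, &Ecto.Migrator.run(&1, :up, all: true))
    end
  end

  defp repos do
    Application.load(@app)
    Application.fetch_env!(@app, :ecto_repos)
  end
end
```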
backend/apiv1/__init__.py (0)

backend/apiv1/_util.py (8)

@@ -1,8 +0,0 @@
def to_representation(self, instance):
"""
Object instance -> Dict of primitive datatypes.
We use a custom to_representation function to exclude empty fields in the serialized JSON.
"""
ret = super(InstanceListSerializer, self).to_representation(instance)
ret = OrderedDict(list(filter(lambda x: x[1], ret.items())))
return ret

backend/apiv1/apps.py (5)

@@ -1,5 +0,0 @@
from django.apps import AppConfig
class Apiv1Config(AppConfig):
name = 'apiv1'

backend/apiv1/serializers.py (105)

@@ -1,105 +0,0 @@
from rest_framework import serializers
import math
from collections import OrderedDict
from scraper.models import Instance, Edge
class InstanceListSerializer(serializers.ModelSerializer):
"""
Minimal instance details used in the full list of instances.
"""
class Meta:
model = Instance
fields = ('name', 'user_count')
def to_representation(self, instance):
"""
Object instance -> Dict of primitive datatypes.
We use a custom to_representation function to exclude empty fields in the serialized JSON.
"""
ret = super(InstanceListSerializer, self).to_representation(instance)
ret = OrderedDict(list(filter(lambda x: x[1], ret.items())))
return ret
class InstanceDetailSerializer(serializers.ModelSerializer):
"""
Detailed instance view.
"""
userCount = serializers.SerializerMethodField()
statusCount = serializers.SerializerMethodField()
domainCount = serializers.SerializerMethodField()
lastUpdated = serializers.SerializerMethodField()
peers = InstanceListSerializer(many=True, read_only=True)
def get_userCount(self, obj):
return obj.user_count
def get_statusCount(self, obj):
return obj.status_count
def get_domainCount(self, obj):
return obj.domain_count
def get_lastUpdated(self, obj):
return obj.last_updated
class Meta:
model = Instance
fields = ('name', 'description', 'version', 'userCount',
'statusCount', 'domainCount', 'peers', 'lastUpdated',
'status')
class EdgeSerializer(serializers.ModelSerializer):
"""
Used for displaying the graph.
"""
id = serializers.SerializerMethodField('get_pk')
size = serializers.SerializerMethodField('get_weight')
class Meta:
model = Edge
fields = ('source', 'target', 'id', 'size')
def get_pk(self, obj):
return obj.pk
def get_weight(self, obj):
return obj.weight
class NodeSerializer(serializers.ModelSerializer):
"""
Used for displaying the graph.
"""
id = serializers.SerializerMethodField('get_name')
label = serializers.SerializerMethodField('get_name')
size = serializers.SerializerMethodField()
x = serializers.SerializerMethodField()
y = serializers.SerializerMethodField()
class Meta:
model = Instance
fields = ('id', 'label', 'size', 'x', 'y')
def get_name(self, obj):
return obj.name
def get_size(self, obj):
return math.log(obj.user_count) if (obj.user_count and (obj.user_count > 1)) else 1
def get_x(self, obj):
return obj.x_coord
def get_y(self, obj):
return obj.y_coord
def to_representation(self, instance):
"""
Object instance -> Dict of primitive datatypes.
We use a custom to_representation function to exclude empty fields in the serialized JSON.
"""
ret = super(NodeSerializer, self).to_representation(instance)
ret = OrderedDict(list(filter(lambda x: x[1], ret.items())))
return ret

backend/apiv1/views.py (37)

@@ -1,37 +0,0 @@
from rest_framework import viewsets
from scraper.models import Instance, Edge
from apiv1.serializers import InstanceListSerializer, InstanceDetailSerializer, NodeSerializer, EdgeSerializer
class InstanceViewSet(viewsets.ReadOnlyModelViewSet):
"""API endpoint to view stats for, and the peers of, an instance"""
lookup_field = 'name'
lookup_value_regex = '[a-zA-Z0-9-_\.]+'
queryset = Instance.objects.all()
serializer_class = InstanceListSerializer
detail_serializer_class = InstanceDetailSerializer # this serializer also includes stats and a list of peers
def get_serializer_class(self):
if self.action == 'retrieve':
if hasattr(self, 'detail_serializer_class'):
return self.detail_serializer_class
return self.serializer_class
class EdgeView(viewsets.ReadOnlyModelViewSet):
"""
Endpoint to get a list of the graph's edges in a SigmaJS-friendly format.
"""
queryset = Edge.objects.all()
serializer_class = EdgeSerializer
class NodeView(viewsets.ReadOnlyModelViewSet):
"""
Endpoint to get a list of the graph's nodes in a SigmaJS-friendly format.
"""
queryset = Instance.objects.filter(status='success', x_coord__isnull=False, y_coord__isnull=False, user_count__isnull=False)\
.exclude(sources__isnull=True, targets__isnull=True)
serializer_class = NodeSerializer

backend/backend/__init__.py (0)

backend/backend/settings/base.py (124)

@@ -1,124 +0,0 @@
"""
Django settings for backend project.
Generated by 'django-admin startproject' using Django 2.1.
For more information on this file, see
https://docs.djangoproject.com/en/2.1/topics/settings/
For the full list of settings and their values, see
https://docs.djangoproject.com/en/2.1/ref/settings/
"""
import os
import json
from django.core.exceptions import ImproperlyConfigured
SECRET_KEY = os.getenv("SECRET_KEY")
# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Application definition
INSTALLED_APPS = [
'django.contrib.admin',
'django.contrib.auth',
'django.contrib.contenttypes',
'django.contrib.sessions',
'django.contrib.messages',
'django.contrib.staticfiles',
'rest_framework',
'silk',
'corsheaders',
'scraper.apps.ScraperConfig',
'apiv1.apps.Apiv1Config',
]
MIDDLEWARE = [
'corsheaders.middleware.CorsMiddleware',
'django.middleware.security.SecurityMiddleware',
'django.contrib.sessions.middleware.SessionMiddleware',
'django.middleware.common.CommonMiddleware',
'django.middleware.csrf.CsrfViewMiddleware',
'django.contrib.auth.middleware.AuthenticationMiddleware',
'django.contrib.messages.middleware.MessageMiddleware',
'django.middleware.clickjacking.XFrameOptionsMiddleware',
'silk.middleware.SilkyMiddleware',
]
ROOT_URLCONF = 'backend.urls'
TEMPLATES = [
{
'BACKEND': 'django.template.backends.django.DjangoTemplates',
'DIRS': [os.path.join(BASE_DIR, '../../frontend/build')],
'APP_DIRS': True,
'OPTIONS': {
'context_processors': [
'django.template.context_processors.debug',
'django.template.context_processors.request',
'django.contrib.auth.context_processors.auth',
'django.contrib.messages.context_processors.messages',
],
},
},
]
WSGI_APPLICATION = 'backend.wsgi.application'
# Database
# https://docs.djangoproject.com/en/2.1/ref/settings/#databases
DATABASES = {
'default': {
'ENGINE': 'django.db.backends.postgresql',
'NAME': os.getenv("POSTGRES_DB"),
'USER': os.getenv("POSTGRES_USER"),
'PASSWORD': os.getenv("POSTGRES_PASSWORD"),
'HOST': 'db',
'PORT': 5432,
}
}
# Password validation
# https://docs.djangoproject.com/en/2.1/ref/settings/#auth-password-validators
AUTH_PASSWORD_VALIDATORS = [
{
'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
},
]
# Internationalization
# https://docs.djangoproject.com/en/2.1/topics/i18n/
LANGUAGE_CODE = 'en-us'
TIME_ZONE = 'UTC'
USE_I18N = True
USE_L10N = True
USE_TZ = False
# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/2.1/howto/static-files/
STATIC_URL = '/static/'
STATICFILES_DIRS = []
STATIC_ROOT = os.path.join(BASE_DIR, 'static')

backend/backend/settings/development.py (7)

@@ -1,7 +0,0 @@
from .base import *
DEBUG = True
ALLOWED_HOSTS = ['localhost']
CORS_ORIGIN_ALLOW_ALL = True

backend/backend/settings/production.py (10)

@@ -1,10 +0,0 @@
from .base import *
DEBUG = False
ALLOWED_HOSTS = ['backend.fediverse.space']
CORS_ORIGIN_REGEX_WHITELIST = [
r'^(https?:\/\/)?(\w+\.)?(.*)?fediverse-space\.netlify\.com\/?$',
r'^(https?:\/\/)?(\w+\.)?(.*)?fediverse\.space\/?$',
]

backend/backend/urls.py (37)

@@ -1,37 +0,0 @@
"""backend URL Configuration
The `urlpatterns` list routes URLs to views. For more information please see:
https://docs.djangoproject.com/en/2.1/topics/http/urls/
Examples:
Function views
1. Add an import: from my_app import views
2. Add a URL to urlpatterns: path('', views.home, name='home')
Class-based views
1. Add an import: from other_app.views import Home
2. Add a URL to urlpatterns: path('', Home.as_view(), name='home')
Including another URLconf
1. Import the include() function: from django.urls import include, path
2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
"""
from django.urls import path, include
from django.views.generic import TemplateView
from rest_framework import routers
from apiv1 import views
class OptionalTrailingSlashRouter(routers.DefaultRouter):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.trailing_slash = r'/?'
router = OptionalTrailingSlashRouter()
router.register(r'instances', views.InstanceViewSet)
router.register(r'graph/nodes', views.NodeView)
router.register(r'graph/edges', views.EdgeView, base_name='edge')
urlpatterns = [
path('api/v1/', include(router.urls)),
path('silk/', include('silk.urls', namespace='silk')),
]

backend/backend/wsgi.py (13)

@@ -1,13 +0,0 @@
"""
WSGI config for backend project.
It exposes the WSGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/2.1/howto/deployment/wsgi/
"""
import os
from django.core.wsgi import get_wsgi_application
application = get_wsgi_application()

backend/config/config.exs (51)

@@ -0,0 +1,51 @@
# This file is responsible for configuring your application
# and its dependencies with the aid of the Mix.Config module.
#
# This configuration file is loaded before any dependency and
# is restricted to this project.
# General application configuration
import Config
config :backend,
ecto_repos: [Backend.Repo]
# Configures the endpoint
config :backend, BackendWeb.Endpoint,
url: [host: "localhost"],
secret_key_base: "XL4NKGBN9lZMrQbMEI1KJOlwAt8S7younVJl90TdAgzmwyapr3g7BRYSNYvX0sZ9",
render_errors: [view: BackendWeb.ErrorView, accepts: ~w(json)],
pubsub: [name: Backend.PubSub, adapter: Phoenix.PubSub.PG2]
config :backend, Backend.Repo, queue_target: 5000
# Configures Elixir's Logger
config :logger, :console,
format: "$time $metadata[$level] $message\n",
metadata: [:request_id]
# Use Jason for JSON parsing in Phoenix
config :phoenix, :json_library, Jason
config :backend, :crawler,
status_age_limit_days: 28,
status_count_limit: 5000,
personal_instance_threshold: 10,
crawl_interval_mins: 30,
crawl_workers: 50,
blacklist: [
"gab.best"
],
user_agent: "fediverse.space crawler"
config :backend, Backend.Scheduler,
jobs: [
# At midnight every day
{"@daily", {Backend.Scheduler, :prune_crawls, [1, "month"]}},
# 00.15 daily
{"15 0 * * *", {Backend.Scheduler, :generate_edges, []}}
]
# Import environment specific config. This must remain at the bottom
# of this file so it overrides the configuration defined above.
import_config "#{Mix.env()}.exs"

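The `:crawler` block above is plain application config, read at runtime; `application.ex` further down does this via `get_config(:crawl_workers)` from `Backend.Util` (file 39 in the list, not shown in this excerpt). A minimal sketch of such a helper, assuming it only wraps `Application.get_env/2`:

```elixir
defmodule CrawlerConfigSketch do
  # Hypothetical stand-in for the real Backend.Util.get_config/1,
  # which is not included in this excerpt.
  @spec get_config(atom) :: any
  def get_config(key) do
    :backend
    |> Application.get_env(:crawler)
    |> Keyword.get(key)
  end
end
```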
backend/config/dev.exs (72)

@@ -0,0 +1,72 @@
import Config
# For development, we disable any cache and enable
# debugging and code reloading.
#
# The watchers configuration can be used to run external
# watchers to your application. For example, we use it
# with webpack to recompile .js and .css sources.
config :backend, BackendWeb.Endpoint,
http: [port: 4000],
debug_errors: true,
code_reloader: true,
check_origin: false,
watchers: []
# ## SSL Support
#
# In order to use HTTPS in development, a self-signed
# certificate can be generated by running the following
# Mix task:
#
# mix phx.gen.cert
#
# Note that this task requires Erlang/OTP 20 or later.
# Run `mix help phx.gen.cert` for more information.
#
# The `http:` config above can be replaced with:
#
# https: [
# port: 4001,
# cipher_suite: :strong,
# keyfile: "priv/cert/selfsigned_key.pem",
# certfile: "priv/cert/selfsigned.pem"
# ],
#
# If desired, both `http:` and `https:` keys can be
# configured to run both http and https servers on
# different ports.
# Do not include metadata nor timestamps in development logs
config :logger, :console, format: "[$level] $message\n"
# Set a higher stacktrace during development. Avoid configuring such
# in production as building large stacktraces may be expensive.
config :phoenix, :stacktrace_depth, 20
# Initialize plugs at runtime for faster development compilation
config :phoenix, :plug_init_mode, :runtime
# Configure your database
config :backend, Backend.Repo,
username: "postgres",
password: "postgres",
database: "backend_dev",
hostname: "localhost",
pool_size: 10
config :backend, :crawler,
status_age_limit_days: 28,
status_count_limit: 100,
personal_instance_threshold: 1,
crawl_interval_mins: 1,
crawl_workers: 10,
blacklist: [
"gab.best"
]
config :backend, Backend.Scheduler,
jobs: [
# Every 15 minutes
{"*/15 * * * *", {Backend.Scheduler, :prune_crawls, [12, "hour"]}}
]

backend/config/prod.exs (57)

@@ -0,0 +1,57 @@
import Config
# Do not print debug messages in production
config :logger, level: :info
# ## SSL Support
#
# To get SSL working, you will need to add the `https` key
# to the previous section and set your `:url` port to 443:
#
# config :backend, BackendWeb.Endpoint,
# ...
# url: [host: "example.com", port: 443],
# https: [
# :inet6,
# port: 443,
# cipher_suite: :strong,
# keyfile: System.get_env("SOME_APP_SSL_KEY_PATH"),
# certfile: System.get_env("SOME_APP_SSL_CERT_PATH")
# ]
#
# The `cipher_suite` is set to `:strong` to support only the
# latest and more secure SSL ciphers. This means old browsers
# and clients may not be supported. You can set it to
# `:compatible` for wider support.
#
# `:keyfile` and `:certfile` expect an absolute path to the key
# and cert on disk or a relative path inside priv, for example
# "priv/ssl/server.key". For all supported SSL configuration
# options, see https://hexdocs.pm/plug/Plug.SSL.html#configure/1
#
# We also recommend setting `force_ssl` in your endpoint, ensuring
# no data is ever sent via http, always redirecting to https:
#
# config :backend, BackendWeb.Endpoint,
# force_ssl: [hsts: true]
#
# Check `Plug.SSL` for all available options in `force_ssl`.
# ## Using releases (distillery)
#
# If you are doing OTP releases, you need to instruct Phoenix
# to start the server for all endpoints:
#
# config :phoenix, :serve_endpoints, true
#
# Alternatively, you can configure exactly which server to
# start per endpoint:
#
# config :backend, BackendWeb.Endpoint, server: true
#
# Note you can't rely on `System.get_env/1` when using releases.
# See the releases documentation accordingly.
# Finally import the config/prod.secret.exs which should be versioned
# separately.
# import_config "prod.secret.exs"

backend/config/releases.exs (27)

@@ -0,0 +1,27 @@
# This file is for *runtime configuration in releases* only.
# https://hexdocs.pm/phoenix/releases.html#runtime-configuration
import Config
# For production, don't forget to configure the url host
# to something meaningful, Phoenix uses this information
# when generating URLs.
config :backend, Backend.Repo,
# username: System.get_env("POSTGRES_USER"),
# password: System.get_env("POSTGRES_PASSWORD"),
# database: System.get_env("POSTGRES_DB"),
# hostname: System.get_env("POSTGRES_HOSTNAME"),
# DATABASE_URL should be a full connection string, e.g. ecto://USER:PASS@HOST/DATABASE
url: System.get_env("DATABASE_URL"),
pool_size: String.to_integer(System.get_env("POOL_SIZE") || "10"),
ssl: true
# show_sensitive_data_on_connection_error: true
port = String.to_integer(System.get_env("PORT") || "4000")
config :backend, BackendWeb.Endpoint,
http: [:inet6, port: port],
url: [host: System.get_env("BACKEND_HOSTNAME"), port: port],
root: ".",
secret_key_base: System.get_env("SECRET_KEY_BASE"),
server: true

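For reference, the environment this release config expects might look like the following; every value here is invented, and `DATABASE_URL` uses the `ecto://USER:PASS@HOST/DATABASE` format that Ecto accepts:

```elixir
# Illustrative release environment (all values invented):
#   DATABASE_URL=ecto://postgres:postgres@db/backend_prod
#   POOL_SIZE=10
#   PORT=4000
#   BACKEND_HOSTNAME=fediverse.space
#   SECRET_KEY_BASE=<generate with `mix phx.gen.secret`>
```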
backend/config/test.exs (18)

@@ -0,0 +1,18 @@
import Config
# We don't run a server during test. If one is required,
# you can enable the server option below.
config :backend, BackendWeb.Endpoint,
http: [port: 4002],
server: false
# Print only warnings and errors during test
config :logger, level: :warn
# Configure your database
config :backend, Backend.Repo,
username: "postgres",
password: "postgres",
database: "backend_test",
hostname: "localhost",
pool: Ecto.Adapters.SQL.Sandbox

backend/lib/backend.ex (9)

@@ -0,0 +1,9 @@
defmodule Backend do
@moduledoc """
Backend keeps the contexts that define your domain
and business logic.
Contexts are also responsible for managing your data, regardless of
whether it comes from the database, an external API, or elsewhere.
"""
end

backend/lib/backend/api.ex (68)

@@ -0,0 +1,68 @@
defmodule Backend.Api do
alias Backend.{Crawl, Edge, Instance, Repo}
import Ecto.Query
@spec list_instances() :: [Instance.t()]
def list_instances() do
Instance
|> Repo.all()
end
@spec get_instance!(String.t()) :: Instance.t()
def get_instance!(domain) do
Instance
|> preload(:peers)
|> Repo.get_by!(domain: domain)
end
@doc """
Returns a list of instances that
* have at least one successful crawl
* have a user count (required to give the instance a size on the graph)
"""
@spec list_nodes() :: [Instance.t()]
def list_nodes() do
crawl_subquery =
Crawl
|> select([c], %{
instance_domain: c.instance_domain,
crawl_count: count(c.id)
})
|> where([c], is_nil(c.error))
|> group_by([c], c.instance_domain)
Instance
|> join(:inner, [i], c in subquery(crawl_subquery), on: i.domain == c.instance_domain)
|> where(
[i, c],
c.crawl_count > 0 and not is_nil(i.user_count) and not is_nil(i.x) and not is_nil(i.y)
)
|> select([c], [:domain, :user_count, :x, :y])
|> Repo.all()
end
@spec list_edges() :: [Edge.t()]
def list_edges() do
crawl_subquery =
Crawl
|> select([c], %{
instance_domain: c.instance_domain,
crawl_count: count(c.id)
})
|> where([c], is_nil(c.error))
|> group_by([c], c.instance_domain)
Edge
|> join(:inner, [e], c1 in subquery(crawl_subquery), on: e.source_domain == c1.instance_domain)
|> join(:inner, [e], c2 in subquery(crawl_subquery), on: e.target_domain == c2.instance_domain)
|> join(:inner, [e], i1 in Instance, on: e.source_domain == i1.domain)
|> join(:inner, [e], i2 in Instance, on: e.target_domain == i2.domain)
|> select([e], [:id, :source_domain, :target_domain, :weight])
|> where(
[e, c1, c2, i1, i2],
c1.crawl_count > 0 and c2.crawl_count > 0 and not is_nil(i1.x) and not is_nil(i1.y) and
not is_nil(i2.x) and not is_nil(i2.y) and e.source_domain != e.target_domain
)
|> Repo.all()
end
end

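To make the node query concrete: `list_nodes/0` selects only four fields, so the result is a list of sparsely populated `Instance` structs. A hypothetical iex session (domains and numbers invented):

```elixir
iex> Backend.Api.list_nodes()
[
  %Backend.Instance{domain: "a.example", user_count: 120, x: 3.2, y: -1.7},
  %Backend.Instance{domain: "b.example", user_count: 15, x: -0.4, y: 2.1}
]
```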
backend/lib/backend/application.ex (46)

@@ -0,0 +1,46 @@
defmodule Backend.Application do
# See https://hexdocs.pm/elixir/Application.html
# for more information on OTP Applications
@moduledoc false
use Application
require Logger
import Backend.Util
def start(_type, _args) do
crawl_worker_count = get_config(:crawl_workers)
children = [
# Start the Ecto repository
Backend.Repo,
# Start the endpoint when the application starts
BackendWeb.Endpoint,
# Crawler children
:hackney_pool.child_spec(:crawler, timeout: 15000, max_connections: crawl_worker_count),
{Task,
fn ->
Honeydew.start_queue(:crawl_queue, failure_mode: Honeydew.FailureMode.Abandon)
Honeydew.start_workers(:crawl_queue, Backend.Crawler, num: crawl_worker_count)
end},
Backend.Scheduler
]
children =
case Enum.member?(["true", "1"], System.get_env("SKIP_CRAWL")) do
true -> children
false -> children ++ [Backend.Crawler.StaleInstanceManager]
end
# See https://hexdocs.pm/elixir/Supervisor.html
# for other strategies and supported options
opts = [strategy: :one_for_one, name: Backend.Supervisor]
Supervisor.start_link(children, opts)
end
# Tell Phoenix to update the endpoint configuration
# whenever the application is updated.
def config_change(changed, _new, removed) do
BackendWeb.Endpoint.config_change(changed, removed)
:ok
end
end

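With the Honeydew queue and workers started above, a crawl can be enqueued from anywhere in the app. A sketch, assuming the queue name and worker module configured in this file:

```elixir
# Honeydew invokes Backend.Crawler.run("mastodon.social") on a free worker.
Honeydew.async({:run, ["mastodon.social"]}, :crawl_queue)
```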
backend/lib/backend/crawl.ex (26)

@@ -0,0 +1,26 @@
defmodule Backend.Crawl do
use Ecto.Schema
import Ecto.Changeset
schema "crawls" do
belongs_to :instance, Backend.Instance,
references: :domain,
type: :string,
foreign_key: :instance_domain
field :interactions_seen, :integer
field :statuses_seen, :integer
# if something went wrong, otherwise null
field :error, :string
timestamps()
end
@doc false
def changeset(crawl, attrs) do
crawl
|> cast(attrs, [:instance_domain, :statuses_seen, :interactions_seen, :error])
|> validate_required([:instance_domain])
end
end

backend/lib/backend/crawl_interaction.ex (29)

@@ -0,0 +1,29 @@
defmodule Backend.CrawlInteraction do
use Ecto.Schema
import Ecto.Changeset
schema "crawl_interactions" do
belongs_to :crawl, Backend.Crawl
belongs_to :source, Backend.Instance,
references: :domain,
type: :string,
foreign_key: :source_domain
belongs_to :target, Backend.Instance,
references: :domain,
type: :string,
foreign_key: :target_domain
field :mentions, :integer
timestamps()
end
@doc false
def changeset(crawl_interaction, attrs) do
crawl_interaction
|> cast(attrs, [:crawl_id, :source_domain, :target_domain, :mentions])
|> validate_required([:crawl_id, :source_domain, :target_domain, :mentions])
end
end

backend/lib/backend/crawler/api_crawler.ex (45)

@@ -0,0 +1,45 @@
defmodule Backend.Crawler.ApiCrawler do
@moduledoc """
This module is a specification. Crawlers for all instance types must implement its behaviour.
Make sure to respect the following:
* You must adhere to the following configuration values:
* `:status_age_limit_days` specifies that you must only crawl statuses from the most recent N days
* `:status_count_limit` specifies the max number of statuses to crawl in one go
* `:personal_instance_threshold` specifies that instances with fewer than this number of users should not be crawled
* profiles with the string "nobot" (case insensitive) in their profile must not be included in any stats
* Make sure to check the most recent crawl of the instance so you don't re-crawl old statuses
"""
# {domain_mentioned, count}
@type instance_interactions :: %{String.t() => integer}
defstruct [
:version,
:description,
:user_count,
:status_count,
:peers,
:interactions,
:statuses_seen
]
@type t() :: %__MODULE__{
version: String.t(),
description: String.t(),
user_count: integer,
status_count: integer,
peers: [String.t()],
interactions: instance_interactions,
statuses_seen: integer
}
@doc """
Check whether the instance at the given domain is of the type that this ApiCrawler implements.
"""
@callback is_instance_type?(String.t()) :: boolean()
@doc """
Crawl the instance at the given domain.
"""
@callback crawl(String.t()) :: t()
end

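The struct plus the two callbacks are the whole contract. A do-nothing implementation, purely to illustrate what a conforming crawler must provide (this module is not part of the commit):

```elixir
defmodule Backend.Crawler.Crawlers.Noop do
  # Illustrative only: the minimum needed to satisfy the ApiCrawler
  # behaviour. It claims no instance type and reports empty results.
  alias Backend.Crawler.ApiCrawler
  @behaviour ApiCrawler

  @impl ApiCrawler
  def is_instance_type?(_domain), do: false

  @impl ApiCrawler
  def crawl(_domain) do
    %ApiCrawler{
      version: nil,
      description: nil,
      user_count: 0,
      status_count: 0,
      peers: [],
      interactions: %{},
      statuses_seen: 0
    }
  end
end
```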
backend/lib/backend/crawler/crawler.ex (196)

@@ -0,0 +1,196 @@
defmodule Backend.Crawler do
@moduledoc """
This module crawls instances. Run `run(domain)` to crawl a given domain.
"""
alias __MODULE__
alias Backend.Crawler.Crawlers.Mastodon
alias Backend.Crawler.ApiCrawler
alias Backend.{Crawl, CrawlInteraction, Repo, Instance, InstancePeer}
import Ecto.Query
import Backend.Util
require Logger
defstruct [
# the instance domain (a string)
:domain,
# a list of ApiCrawlers that will be attempted
:api_crawlers,
:found_api?,
:result,
:error
]
@type t() :: %__MODULE__{
domain: String.t(),
api_crawlers: [ApiCrawler.t()],
found_api?: boolean,
result: ApiCrawler.t() | nil,
error: String.t() | nil
}
def run(domain) do
Logger.info("Crawling #{domain}...")
HTTPoison.start()
state = %Crawler{domain: domain, api_crawlers: [], found_api?: false, result: nil, error: nil}
state
# register APICrawlers here
|> register(Mastodon)
# go!
|> crawl()
|> save()
end
# Adds a new ApiCrawler that run/1 will check.
defp register(%Crawler{api_crawlers: crawlers} = state, api_crawler) do
Map.put(state, :api_crawlers, [api_crawler | crawlers])
end
# Recursive function to check whether `domain` has an API that the head of the api_crawlers list can read.
# If so, crawls it. If not, continues with the tail of the api_crawlers list.
defp crawl(%Crawler{api_crawlers: [], domain: domain} = state) do
Logger.debug("Found no compatible API for #{domain}")
Map.put(state, :found_api?, false)
end
defp crawl(%Crawler{domain: domain, api_crawlers: [curr | remaining_crawlers]} = state) do
if curr.is_instance_type?(domain) do
Logger.debug("Found #{curr} instance")
state = Map.put(state, :found_api?, true)
try do
%Crawler{state | result: curr.crawl(domain), api_crawlers: []}
rescue
e in HTTPoison.Error ->
Map.put(state, :error, "HTTPoison error: " <> HTTPoison.Error.message(e))
e in Jason.DecodeError ->
Map.put(state, :error, "Jason DecodeError: " <> Jason.DecodeError.message(e))
e ->
Map.put(state, :error, "Unknown error: " <> inspect(e))
end
else
# Nothing found so check the next APICrawler
Logger.debug("#{domain} is not an instance of #{curr}")
crawl(%Crawler{state | api_crawlers: remaining_crawlers})
end
end
# Save the state (after crawling) to the database.
defp save(%Crawler{domain: domain, result: result, found_api?: true, error: nil}) do
now = NaiveDateTime.truncate(NaiveDateTime.utc_now(), :second)
## Update the instance we crawled ##
Repo.insert!(
%Instance{
domain: domain,
description: result.description,
version: result.version,
user_count: result.user_count,
status_count: result.status_count
},
on_conflict: [
set: [
description: result.description,
version: result.version,
user_count: result.user_count,
status_count: result.status_count,
updated_at: now
]
],
conflict_target: :domain
)
# Save details of a new crawl
curr_crawl =
Repo.insert!(%Crawl{
instance_domain: domain,
interactions_seen:
result.interactions |> Map.values() |> Enum.reduce(0, fn count, acc -> count + acc end),
statuses_seen: result.statuses_seen
})
# We get a list of peers from two places:
# * the official peers endpoint (which may be disabled)
# * the interactions
peers_domains =
result.interactions
|> Map.keys()
|> list_union(result.peers)
|> Enum.filter(fn domain -> not is_blacklisted?(domain) end)
peers =
peers_domains
|> Enum.map(&%{domain: &1, inserted_at: now, updated_at: now})
Instance
|> Repo.insert_all(peers, on_conflict: :nothing, conflict_target: :domain)
Repo.transaction(fn ->
## Save peer relationships ##
# get current peers (a list of strings)
current_peers =
InstancePeer
|> where(source_domain: ^domain)
|> select([p], p.target_domain)
|> Repo.all()
wanted_peers_set = MapSet.new(peers_domains)
current_peers_set = MapSet.new(current_peers)
# delete the peers we don't want
dont_want = current_peers_set |> MapSet.difference(wanted_peers_set) |> MapSet.to_list()
if length(dont_want) > 0 do
InstancePeer
|> where(source_domain: ^domain)
|> where([p], p.target_domain in ^dont_want)
|> Repo.delete_all([])
end
# insert the ones we don't have yet
new_instance_peers =
wanted_peers_set
|> MapSet.difference(current_peers_set)
|> MapSet.to_list()
|> Enum.map(
&%{
source_domain: domain,
target_domain: &1,
inserted_at: now,
updated_at: now
}
)
InstancePeer
|> Repo.insert_all(new_instance_peers)
end)
## Save interactions ##
interactions =
result.interactions
|> Enum.filter(fn {target_domain, _count} -> not is_blacklisted?(target_domain) end)
|> Enum.map(fn {target_domain, count} ->
%{
crawl_id: curr_crawl.id,
source_domain: domain,
target_domain: target_domain,
mentions: count,
inserted_at: now,
updated_at: now
}
end)
CrawlInteraction
|> Repo.insert_all(interactions)
end
defp save(%{domain: domain, error: error}) do
Repo.insert!(%Crawl{
instance_domain: domain,
error: error
})
end
end

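`save/1` merges interaction keys with the peers list via `Backend.Util.list_union/2` (util.ex is listed above but not shown in this excerpt). Judging by the call site, a plausible implementation:

```elixir
defmodule UtilSketch do
  # A guess at list_union/2 based on how save/1 uses it; not the real
  # backend/lib/backend/util.ex.
  @spec list_union([String.t()], [String.t()]) :: [String.t()]
  def list_union(list1, list2) do
    list1
    |> MapSet.new()
    |> MapSet.union(MapSet.new(list2))
    |> MapSet.to_list()
  end
end
```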
backend/lib/backend/crawler/crawlers/mastodon.ex (193)

@@ -0,0 +1,193 @@
defmodule Backend.Crawler.Crawlers.Mastodon do
require Logger
import Backend.Crawler.Util
alias Backend.Crawler.ApiCrawler
@behaviour ApiCrawler
@impl ApiCrawler
def is_instance_type?(domain) do
case get("https://#{domain}/api/v1/instance") do
{:ok, response} -> if is_http_200?(response), do: has_title?(response.body), else: false
{:error, _error} -> false
end
end
@impl ApiCrawler
def crawl(domain) do
instance = Jason.decode!(get!("https://#{domain}/api/v1/instance").body)
if get_in(instance, ["stats", "user_count"]) > get_config(:personal_instance_threshold) do
crawl_large_instance(domain, instance)
else
Map.merge(
Map.merge(
Map.take(instance, ["version", "description"]),
Map.take(instance["stats"], ["user_count", "status_count"])
)
|> Map.new(fn {k, v} -> {String.to_atom(k), v} end),
%{peers: [], interactions: %{}, statuses_seen: 0}
)
end
end
@spec crawl_large_instance(String.t(), any()) :: ApiCrawler.t()
defp crawl_large_instance(domain, instance) do
# servers may not publish peers
peers =
case get("https://#{domain}/api/v1/instance/peers") do
{:ok, response} -> if is_http_200?(response), do: Jason.decode!(response.body), else: []
{:error, _error} -> []
end
Logger.debug("Found #{length(peers)} peers.")
{interactions, statuses_seen} = get_interactions(domain)
Logger.debug(
"#{domain}: found #{
interactions |> Map.values() |> Enum.reduce(0, fn count, acc -> count + acc end)
} mentions in #{statuses_seen} statuses."
)
Map.merge(
Map.merge(
Map.take(instance, ["version", "description"]),
Map.take(instance["stats"], ["user_count", "status_count"])
)
|> Map.new(fn {k, v} -> {String.to_atom(k), v} end),
%{peers: peers, interactions: interactions, statuses_seen: statuses_seen}
)
end
@spec get_interactions(
String.t(),
String.t() | nil,
Calendar.naive_datetime() | nil,
ApiCrawler.instance_interactions(),
integer
) :: {ApiCrawler.instance_interactions(), integer}
defp get_interactions(
domain,
max_id \\ nil,
min_timestamp \\ nil,
interactions \\ %{},
statuses_seen \\ 0
) do
# If `statuses_seen == 0`, it's the first call of this function, which means we want to query the database for the
# most recent status we have.
min_timestamp =
if statuses_seen == 0 do
get_last_successful_crawl_timestamp(domain)
else
min_timestamp
end
endpoint = "https://#{domain}/api/v1/timelines/public?local=true"
endpoint =
if max_id do
endpoint <> "&max_id=#{max_id}"
else
endpoint
end
Logger.debug("Crawling #{endpoint}")
statuses =
endpoint
|> get!()
|> Map.get(:body)
|> Jason.decode!()
filtered_statuses =
statuses
|> Enum.filter(fn s -> is_after?(s["created_at"], min_timestamp) end)
if length(filtered_statuses) > 0 do
# get statuses that are eligible (i.e. users don't have #nobot in their profile) and have mentions
interactions =
Map.merge(interactions, statuses_to_interactions(filtered_statuses), fn _k, a, b -> a + b end)
statuses_seen = statuses_seen + length(filtered_statuses)
status_datetime_threshold =
NaiveDateTime.utc_now()
|> NaiveDateTime.add(get_config(:status_age_limit_days) * 24 * 3600 * -1, :second)
oldest_status = Enum.at(filtered_statuses, -1)
oldest_status_datetime =
oldest_status
|> Map.get("created_at")
|> NaiveDateTime.from_iso8601!()
if NaiveDateTime.compare(oldest_status_datetime, status_datetime_threshold) == :gt and
statuses_seen < get_config(:status_count_limit) and
length(filtered_statuses) == length(statuses) do
get_interactions(domain, oldest_status["id"], min_timestamp, interactions, statuses_seen)
else
{interactions, statuses_seen}
end
else
{interactions, statuses_seen}
end
end
# To check if the endpoint works as expected
@spec has_title?(String.t()) :: boolean
defp has_title?(body) do
case Jason.decode(body) do
{:ok, decoded} -> Map.has_key?(decoded, "title")
{:error, _error} -> false
end
end
# Checks whether the status contains one or more mentions
defp is_mention?(status) do
case status["mentions"] do
[] -> false
nil -> false
_ -> true
end
end
# Checks if the author of the status has "nobot" in their profile
defp has_nobot?(status) do
account = status["account"]
fields =
account["fields"]
|> Enum.map(fn %{"name" => name, "value" => value} -> name <> value end)
|> Enum.join("")
# this also means that any users who mentioned ethnobotany in their profiles will be excluded lol ¯\_(ツ)_/¯
(account["note"] <> fields)
|> String.downcase()
|> String.contains?("nobot")
end
# This checks if the status
# a) contains one or more mentions, and
# b) that the person posting doesn't have "nobot" in their profile
defp is_eligible?(status) do
is_mention?(status) and not has_nobot?(status)
end
@spec extract_mentions_from_status(any()) :: ApiCrawler.instance_interactions()
defp extract_mentions_from_status(status) do
status["mentions"]
|> Enum.map(fn mention -> get_domain(mention["url"]) end)
|> Enum.reduce(%{}, fn domain, acc ->
Map.update(acc, domain, 1, &(&1 + 1))
end)
end
@spec statuses_to_interactions(any()) :: ApiCrawler.instance_interactions()
defp statuses_to_interactions(statuses) do
statuses
|> Enum.filter(fn status -> is_eligible?(status) end)
|> Enum.map(fn status -> extract_mentions_from_status(status) end)
|> Enum.reduce(%{}, fn map, acc ->
Map.merge(acc, map, fn _key, count1, count2 -> count1 + count2 end)
end)
end
end

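To make the mention counting concrete, here is `extract_mentions_from_status/1` applied to a made-up status, trimmed to the one field the function reads (`get_domain/1` comes from `Backend.Crawler.Util`, not shown here):

```elixir
status = %{
  "mentions" => [
    %{"url" => "https://a.example/users/x"},
    %{"url" => "https://a.example/users/y"},
    %{"url" => "https://b.example/users/z"}
  ]
}

# extract_mentions_from_status(status)
# => %{"a.example" => 2, "b.example" => 1}
```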
backend/lib/backend/crawler/stale_instance_manager.ex (84)

@@ -0,0 +1,84 @@
defmodule Backend.Crawler.StaleInstanceManager do
use GenServer
alias Backend.{Crawl, Instance, Repo}
import Ecto.Query
import Backend.Util
require Logger
@moduledoc """
This module regularly finds stale instances (i.e. instances that haven't been updated for longer than the crawl
interval) and adds them to the job queue. It runs once a minute.
"""
def start_link(_opts) do
GenServer.start_link(__MODULE__, [], name: __MODULE__)
end
@impl true
def init(_opts) do
instance_count =
Instance
|> where([i], not is_nil(i.version))
|> select([i], count(i.domain))
|> Repo.one()