From 5ca8de5dbeb002c7b84d145345be18980241ba66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tao=20Bojl=C3=A9n?= <2803708-taobojlen@users.noreply.gitlab.com> Date: Fri, 26 Jul 2019 22:30:11 +0000 Subject: [PATCH] add full-text search --- .vscode/settings.json | 3 +- CHANGELOG.md | 4 + backend/Procfile | 2 +- backend/config/config.exs | 14 +++ backend/config/releases.exs | 3 +- backend/lib/backend/api.ex | 68 ++++++++++-- backend/lib/backend/application.ex | 3 +- .../lib/backend/crawler/crawlers/mastodon.ex | 6 +- backend/lib/backend/elasticsearch/cluster.ex | 3 + backend/lib/backend/elasticsearch/store.ex | 16 +++ backend/lib/backend/instance.ex | 15 +++ backend/lib/backend/release.ex | 12 +++ backend/lib/backend/util.ex | 10 ++ .../controllers/search_controller.ex | 6 +- backend/lib/backend_web/views/admin_view.ex | 3 - .../lib/backend_web/views/instance_view.ex | 2 +- backend/lib/backend_web/views/search_view.ex | 16 +-- backend/mix.exs | 3 +- backend/mix.lock | 2 + backend/priv/elasticsearch/instances.json | 53 ++++++++++ docker-compose.yml | 22 ++++ frontend/src/components/atoms/GraphKey.tsx | 2 +- .../src/components/molecules/Cytoscape.tsx | 100 +++++++++++++++--- .../src/components/molecules/SearchResult.tsx | 13 ++- frontend/src/components/organisms/Graph.tsx | 8 +- .../src/components/screens/AdminScreen.tsx | 28 ++--- .../src/components/screens/InstanceScreen.tsx | 6 +- .../src/components/screens/SearchScreen.tsx | 23 +++- frontend/src/constants.tsx | 24 +++-- frontend/src/redux/actions.ts | 11 +- frontend/src/redux/reducers.ts | 6 ++ frontend/src/redux/types.ts | 7 +- .../space/fediverse/graph/GraphBuilder.class | Bin 8408 -> 0 bytes 33 files changed, 410 insertions(+), 84 deletions(-) create mode 100644 backend/lib/backend/elasticsearch/cluster.ex create mode 100644 backend/lib/backend/elasticsearch/store.ex create mode 100644 backend/priv/elasticsearch/instances.json delete mode 100644 gephi/bin/main/space/fediverse/graph/GraphBuilder.class diff --git a/.vscode/settings.json b/.vscode/settings.json index 193d895..b40ffd5 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,3 +1,4 @@ { - "elixirLS.projectDir": "backend/" + "elixirLS.projectDir": "backend/", + "elixirLS.fetchDeps": false } \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index fccd59a..845756c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,10 +10,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Instance administrators can now log in to opt in or out of crawling. +- Added ElasticSearch full-text search over instance domains and descriptions. +- Search results are now highlighted on the graph. +- When you hover a search result, it is now highlighted on the graph. ### Changed - Instances are now crawled hourly instead of every 30 minutes. +- The colors for color coding have been made brighter (more visible against the dark background. ### Deprecated diff --git a/backend/Procfile b/backend/Procfile index bf37e9b..eb5141e 100644 --- a/backend/Procfile +++ b/backend/Procfile @@ -1,2 +1,2 @@ web: /app/bin/backend start -release: /app/bin/backend eval "Backend.Release.migrate" \ No newline at end of file +release: /app/bin/backend eval "Backend.Release.run_all" \ No newline at end of file diff --git a/backend/config/config.exs b/backend/config/config.exs index 2646339..db2fbb8 100644 --- a/backend/config/config.exs +++ b/backend/config/config.exs @@ -19,6 +19,20 @@ config :backend, BackendWeb.Endpoint, config :backend, Backend.Repo, queue_target: 5000 +config :backend, Backend.Elasticsearch.Cluster, + url: "http://localhost:9200", + api: Elasticsearch.API.HTTP, + json_library: Jason, + indexes: %{ + instances: %{ + settings: "priv/elasticsearch/instances.json", + store: Backend.Elasticsearch.Store, + sources: [Backend.Instance], + bulk_page_size: 1000, + bulk_wait_interval: 1_000 + } + } + # Configures Elixir's Logger config :logger, :console, format: "$time $metadata[$level] $message\n", diff --git a/backend/config/releases.exs b/backend/config/releases.exs index d5f50d1..cd7736a 100644 --- a/backend/config/releases.exs +++ b/backend/config/releases.exs @@ -14,7 +14,8 @@ config :backend, Backend.Repo, pool_size: String.to_integer(System.get_env("POOL_SIZE") || "10"), ssl: ssl -# show_sensitive_data_on_connection_error: true +config :backend, Backend.Elasticsearch.Cluster, + url: System.get_env("ELASTICSEARCH_URL") || "http://localhost:9200" port = String.to_integer(System.get_env("PORT") || "4000") diff --git a/backend/lib/backend/api.ex b/backend/lib/backend/api.ex index 07381d3..4dcc230 100644 --- a/backend/lib/backend/api.ex +++ b/backend/lib/backend/api.ex @@ -101,15 +101,67 @@ defmodule Backend.Api do end end - def search_instances(query, cursor_after \\ nil) do - ilike_query = "%#{query}%" + def search_instances(query, from \\ 0) do + page_size = 50 - %{entries: instances, metadata: metadata} = - Instance - |> where([i], ilike(i.domain, ^ilike_query) and not i.opt_out) - |> order_by(asc: :id) - |> Repo.paginate(after: cursor_after, cursor_fields: [:id], limit: 50) + search_response = + Elasticsearch.post(Backend.Elasticsearch.Cluster, "/instances/_search", %{ + "sort" => "_score", + "from" => from, + "size" => page_size, + "query" => %{ + "bool" => %{ + "should" => [ + %{ + "multi_match" => %{ + "query" => query, + "fields" => [ + "description.english" + ] + } + }, + %{ + "wildcard" => %{ + "domain.keyword" => %{ + "value" => query, + "boost" => 100 + } + } + }, + %{ + "wildcard" => %{ + "domain.keyword" => %{ + "value" => "*#{query}*", + "boost" => 1 + } + } + }, + %{ + "match" => %{ + "domain.ngram^0.5" => query + } + } + ] + } + } + }) - %{instances: instances, next: metadata.after} + with {:ok, result} <- search_response do + hits = + get_in(result, ["hits", "hits"]) + |> Enum.map(fn h -> h |> Map.get("_source") |> convert_keys_to_atoms() end) + + next = + if length(hits) < page_size do + nil + else + from + page_size + end + + %{ + hits: hits, + next: next + } + end end end diff --git a/backend/lib/backend/application.ex b/backend/lib/backend/application.ex index 63b3c30..9a42828 100644 --- a/backend/lib/backend/application.ex +++ b/backend/lib/backend/application.ex @@ -21,7 +21,8 @@ defmodule Backend.Application do Honeydew.start_queue(:crawl_queue, failure_mode: Honeydew.FailureMode.Abandon) Honeydew.start_workers(:crawl_queue, Backend.Crawler, num: crawl_worker_count) end}, - Backend.Scheduler + Backend.Scheduler, + Backend.Elasticsearch.Cluster ] children = diff --git a/backend/lib/backend/crawler/crawlers/mastodon.ex b/backend/lib/backend/crawler/crawlers/mastodon.ex index 3c740aa..5e858f5 100644 --- a/backend/lib/backend/crawler/crawlers/mastodon.ex +++ b/backend/lib/backend/crawler/crawlers/mastodon.ex @@ -32,7 +32,6 @@ defmodule Backend.Crawler.Crawlers.Mastodon do end @impl ApiCrawler - # sobelow_skip ["DOS.StringToAtom"] def crawl(domain) do instance = Jason.decode!(get!("https://#{domain}/api/v1/instance").body) @@ -48,7 +47,7 @@ defmodule Backend.Crawler.Crawlers.Mastodon do else Map.merge( Map.take(instance["stats"], ["user_count"]) - |> Map.new(fn {k, v} -> {String.to_atom(k), v} end), + |> convert_keys_to_atoms(), %{ peers: [], interactions: %{}, @@ -63,7 +62,6 @@ defmodule Backend.Crawler.Crawlers.Mastodon do end @spec crawl_large_instance(String.t(), any()) :: ApiCrawler.t() - # sobelow_skip ["DOS.StringToAtom"] defp crawl_large_instance(domain, instance) do # servers may not publish peers peers = @@ -94,7 +92,7 @@ defmodule Backend.Crawler.Crawlers.Mastodon do Map.take(instance, ["version", "description"]), Map.take(instance["stats"], ["user_count", "status_count"]) ) - |> Map.new(fn {k, v} -> {String.to_atom(k), v} end), + |> convert_keys_to_atoms(), %{ peers: peers, interactions: interactions, diff --git a/backend/lib/backend/elasticsearch/cluster.ex b/backend/lib/backend/elasticsearch/cluster.ex new file mode 100644 index 0000000..3f130c9 --- /dev/null +++ b/backend/lib/backend/elasticsearch/cluster.ex @@ -0,0 +1,3 @@ +defmodule Backend.Elasticsearch.Cluster do + use Elasticsearch.Cluster, otp_app: :backend +end diff --git a/backend/lib/backend/elasticsearch/store.ex b/backend/lib/backend/elasticsearch/store.ex new file mode 100644 index 0000000..a3c9b11 --- /dev/null +++ b/backend/lib/backend/elasticsearch/store.ex @@ -0,0 +1,16 @@ +defmodule Backend.Elasticsearch.Store do + @behaviour Elasticsearch.Store + + alias Backend.Repo + + @impl true + def stream(schema) do + Repo.stream(schema) + end + + @impl true + def transaction(fun) do + {:ok, result} = Repo.transaction(fun, timeout: :infinity) + result + end +end diff --git a/backend/lib/backend/instance.ex b/backend/lib/backend/instance.ex index a2d618f..e0d6891 100644 --- a/backend/lib/backend/instance.ex +++ b/backend/lib/backend/instance.ex @@ -46,4 +46,19 @@ defmodule Backend.Instance do |> validate_required([:domain]) |> put_assoc(:peers, attrs.peers) end + + defimpl Elasticsearch.Document, for: Backend.Instance do + def id(instance), do: instance.id + def routing(_), do: false + + def encode(instance) do + # Make sure this corresponds with priv/elasticseach/instances.json + %{ + domain: instance.domain, + description: instance.description, + type: instance.type, + user_count: instance.user_count + } + end + end end diff --git a/backend/lib/backend/release.ex b/backend/lib/backend/release.ex index 3bb1e62..f44a63e 100644 --- a/backend/lib/backend/release.ex +++ b/backend/lib/backend/release.ex @@ -1,12 +1,24 @@ defmodule Backend.Release do @app :backend + alias Elasticsearch.Index + alias Backend.Elasticsearch.Cluster + + def run_all do + migrate() + index() + end + def migrate do for repo <- repos() do {:ok, _, _} = Ecto.Migrator.with_repo(repo, &Ecto.Migrator.run(&1, :up, all: true)) end end + def index do + Index.hot_swap(Cluster, "instances") + end + def rollback(repo, version) do {:ok, _, _} = Ecto.Migrator.with_repo(repo, &Ecto.Migrator.run(&1, :down, to: version)) end diff --git a/backend/lib/backend/util.ex b/backend/lib/backend/util.ex index 4081527..652d13e 100644 --- a/backend/lib/backend/util.ex +++ b/backend/lib/backend/util.ex @@ -157,4 +157,14 @@ defmodule Backend.Util do "#{String.downcase(username)}@#{clean_domain(domain)}" end end + + @doc """ + Converts a map with string keys to a map with atom keys. + Be very careful with this -- only use it on maps where you know the keys! Never run it if the keys can be supplied + by the user. + """ + # sobelow_skip ["DOS.StringToAtom"] + def convert_keys_to_atoms(map) do + map |> Map.new(fn {k, v} -> {String.to_atom(k), v} end) + end end diff --git a/backend/lib/backend_web/controllers/search_controller.ex b/backend/lib/backend_web/controllers/search_controller.ex index e9bbe96..82a9d3a 100644 --- a/backend/lib/backend_web/controllers/search_controller.ex +++ b/backend/lib/backend_web/controllers/search_controller.ex @@ -6,8 +6,8 @@ defmodule BackendWeb.SearchController do def index(conn, params) do query = Map.get(params, "query") - cursor_after = Map.get(params, "after", nil) - %{instances: instances, next: next} = Api.search_instances(query, cursor_after) - render(conn, "index.json", instances: instances, next: next) + from = Map.get(params, "after", "0") |> String.to_integer() + %{hits: hits, next: next} = Api.search_instances(query, from) + render(conn, "index.json", hits: hits, next: next) end end diff --git a/backend/lib/backend_web/views/admin_view.ex b/backend/lib/backend_web/views/admin_view.ex index cd72cb6..319dc2f 100644 --- a/backend/lib/backend_web/views/admin_view.ex +++ b/backend/lib/backend_web/views/admin_view.ex @@ -1,11 +1,8 @@ defmodule BackendWeb.AdminView do use BackendWeb, :view - import Backend.Util require Logger def render("show.json", %{instance: instance}) do - Logger.info(inspect(instance)) - %{ domain: domain, opt_in: opt_in, diff --git a/backend/lib/backend_web/views/instance_view.ex b/backend/lib/backend_web/views/instance_view.ex index 2449c86..8391062 100644 --- a/backend/lib/backend_web/views/instance_view.ex +++ b/backend/lib/backend_web/views/instance_view.ex @@ -20,7 +20,7 @@ defmodule BackendWeb.InstanceView do end cond do - instance.user_count < user_threshold -> + instance.user_count < user_threshold and not instance.opt_in -> %{ name: instance.domain, status: "personal instance" diff --git a/backend/lib/backend_web/views/search_view.ex b/backend/lib/backend_web/views/search_view.ex index 87d23c4..d570cda 100644 --- a/backend/lib/backend_web/views/search_view.ex +++ b/backend/lib/backend_web/views/search_view.ex @@ -3,28 +3,28 @@ defmodule BackendWeb.SearchView do alias BackendWeb.SearchView import Backend.Util - def render("index.json", %{instances: instances, next: next}) do + def render("index.json", %{hits: hits, next: next}) do %{ - results: render_many(instances, SearchView, "instance.json", as: :instance), + results: render_many(hits, SearchView, "instance.json", as: :hit), next: next } end - def render("instance.json", %{instance: instance}) do + def render("instance.json", %{hit: hit}) do threshold = get_config(:personal_instance_threshold) description = - if instance.user_count != nil and instance.user_count < threshold do + if hit.user_count != nil and hit.user_count < threshold do nil else - instance.description + hit.description end %{ - name: instance.domain, + name: hit.domain, description: description, - userCount: instance.user_count, - type: instance.type + userCount: hit.user_count, + type: hit.type } end end diff --git a/backend/mix.exs b/backend/mix.exs index 571063a..af593e0 100644 --- a/backend/mix.exs +++ b/backend/mix.exs @@ -52,7 +52,8 @@ defmodule Backend.MixProject do {:public_suffix, "~> 0.6.0"}, {:idna, "~> 5.1.2", override: true}, {:swoosh, "~> 0.23.3"}, - {:ex_twilio, "~> 0.7.0"} + {:ex_twilio, "~> 0.7.0"}, + {:elasticsearch, "~> 1.0"} ] end diff --git a/backend/mix.lock b/backend/mix.lock index c9005b4..0c17276 100644 --- a/backend/mix.lock +++ b/backend/mix.lock @@ -13,6 +13,7 @@ "distillery": {:hex, :distillery, "2.1.1", "f9332afc2eec8a1a2b86f22429e068ef35f84a93ea1718265e740d90dd367814", [:mix], [{:artificery, "~> 0.2", [hex: :artificery, repo: "hexpm", optional: false]}], "hexpm"}, "ecto": {:hex, :ecto, "3.1.7", "fa21d06ef56cdc2fdaa62574e8c3ba34a2751d44ea34c30bc65f0728421043e5", [:mix], [{:decimal, "~> 1.6", [hex: :decimal, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: true]}], "hexpm"}, "ecto_sql": {:hex, :ecto_sql, "3.1.6", "1e80e30d16138a729c717f73dcb938590bcdb3a4502f3012414d0cbb261045d8", [:mix], [{:db_connection, "~> 2.0", [hex: :db_connection, repo: "hexpm", optional: false]}, {:ecto, "~> 3.1.0", [hex: :ecto, repo: "hexpm", optional: false]}, {:mariaex, "~> 0.9.1", [hex: :mariaex, repo: "hexpm", optional: true]}, {:myxql, "~> 0.2.0", [hex: :myxql, repo: "hexpm", optional: true]}, {:postgrex, "~> 0.14.0 or ~> 0.15.0", [hex: :postgrex, repo: "hexpm", optional: true]}, {:telemetry, "~> 0.4.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm"}, + "elasticsearch": {:hex, :elasticsearch, "1.0.0", "626d3fb8e7554d9c93eb18817ae2a3d22c2a4191cc903c4644b1334469b15374", [:mix], [{:httpoison, ">= 0.0.0", [hex: :httpoison, repo: "hexpm", optional: false]}, {:poison, ">= 0.0.0", [hex: :poison, repo: "hexpm", optional: true]}, {:sigaws, "~> 0.7", [hex: :sigaws, repo: "hexpm", optional: true]}, {:vex, "~> 0.6.0", [hex: :vex, repo: "hexpm", optional: false]}], "hexpm"}, "ex_twilio": {:hex, :ex_twilio, "0.7.0", "d7ce624ef4661311ae28c3e3aa060ecb66a9f4843184d7400c29072f7d3f5a4a", [:mix], [{:httpoison, ">= 0.9.0", [hex: :httpoison, repo: "hexpm", optional: false]}, {:inflex, "~> 1.0", [hex: :inflex, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:joken, "~> 2.0", [hex: :joken, repo: "hexpm", optional: false]}, {:poison, "~> 3.0", [hex: :poison, repo: "hexpm", optional: false]}], "hexpm"}, "gen_stage": {:hex, :gen_stage, "0.14.2", "6a2a578a510c5bfca8a45e6b27552f613b41cf584b58210f017088d3d17d0b14", [:mix], [], "hexpm"}, "gen_state_machine": {:hex, :gen_state_machine, "2.0.5", "9ac15ec6e66acac994cc442dcc2c6f9796cf380ec4b08267223014be1c728a95", [:mix], [], "hexpm"}, @@ -51,4 +52,5 @@ "timex": {:hex, :timex, "3.6.1", "efdf56d0e67a6b956cc57774353b0329c8ab7726766a11547e529357ffdc1d56", [:mix], [{:combine, "~> 0.10", [hex: :combine, repo: "hexpm", optional: false]}, {:gettext, "~> 0.10", [hex: :gettext, repo: "hexpm", optional: false]}, {:tzdata, "~> 0.1.8 or ~> 0.5 or ~> 1.0.0", [hex: :tzdata, repo: "hexpm", optional: false]}], "hexpm"}, "tzdata": {:hex, :tzdata, "1.0.1", "f6027a331af7d837471248e62733c6ebee86a72e57c613aa071ebb1f750fc71a", [:mix], [{:hackney, "~> 1.0", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm"}, "unicode_util_compat": {:hex, :unicode_util_compat, "0.3.1", "a1f612a7b512638634a603c8f401892afbf99b8ce93a45041f8aaca99cadb85e", [:rebar3], [], "hexpm"}, + "vex": {:hex, :vex, "0.6.0", "4e79b396b2ec18cd909eed0450b19108d9631842598d46552dc05031100b7a56", [:mix], [], "hexpm"}, } diff --git a/backend/priv/elasticsearch/instances.json b/backend/priv/elasticsearch/instances.json new file mode 100644 index 0000000..cd985cb --- /dev/null +++ b/backend/priv/elasticsearch/instances.json @@ -0,0 +1,53 @@ +{ + "settings": { + "number_of_shards": 1, + "number_of_replicas": 0, + "analysis": { + "analyzer": { + "ngramAnalyzer": { + "tokenizer": "ngramTokenizer" + } + }, + "tokenizer": { + "ngramTokenizer": { + "type": "ngram", + "min_gram": 5, + "max_gram": 5 + } + } + } + }, + "mappings": { + "_doc": { + "properties": { + "domain": { + "type": "text", + "fields": { + "ngram": { + "type": "text", + "analyzer": "ngramAnalyzer" + }, + "keyword": { + "type": "keyword" + } + } + }, + "description": { + "type": "text", + "fields": { + "english": { + "type": "text", + "analyzer": "english" + } + } + }, + "type": { + "type": "keyword" + }, + "user_count": { + "type": "integer" + } + } + } + } +} diff --git a/docker-compose.yml b/docker-compose.yml index 7aec131..fb39f81 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -10,6 +10,22 @@ services: - pgdata:/var/lib/postgresql/data networks: - database_network + elasticsearch: + image: elasticsearch:6.8.1 + ports: + - "9200:9200" + volumes: + - esdata:/usr/share/elasticsearch/data + networks: + - phoenix_network + - es_network + # Kibana is just for development, really + kibana: + image: kibana:6.8.1 + networks: + - es_network + ports: + - "5601:5601" # This is for running the occasional graph layout task. It's in docker-compose.yml so that it's built at the same time # as everything else, but it should be run regularly with a cron job or similar. gephi: @@ -26,6 +42,7 @@ services: build: ./backend networks: - database_network + - phoenix_network depends_on: - db ports: @@ -37,7 +54,12 @@ services: - BACKEND_HOSTNAME volumes: pgdata: + esdata: gradle-cache: networks: database_network: driver: bridge + phoenix_network: + driver: bridge + es_network: + driver: bridge diff --git a/frontend/src/components/atoms/GraphKey.tsx b/frontend/src/components/atoms/GraphKey.tsx index 4ce0a54..83f6908 100644 --- a/frontend/src/components/atoms/GraphKey.tsx +++ b/frontend/src/components/atoms/GraphKey.tsx @@ -47,7 +47,7 @@ const GraphKey: React.FC = ({ current, colorSchemes, onItemSelec
Key