add most recent crawl table

This commit is contained in:
Tao Bror Bojlén 2019-08-10 16:21:22 +03:00
parent 144a6e842f
commit 2c035892d4
No known key found for this signature in database
GPG key ID: C6EC7AAB905F9E6F
6 changed files with 100 additions and 13 deletions

View file

@ -17,7 +17,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed ### Fixed
- Added missing indices on crawls and crawl_interactions tables. - Added missing indices on `crawls` and `crawl_interactions` tables.
- Added table to store most recent crawl. This speeds up the instance view by a lot!
### Security ### Security

View file

@ -6,7 +6,7 @@ defmodule Backend.Crawler do
alias __MODULE__ alias __MODULE__
alias Backend.Crawler.Crawlers.{GnuSocial, Mastodon, Misskey, Nodeinfo} alias Backend.Crawler.Crawlers.{GnuSocial, Mastodon, Misskey, Nodeinfo}
alias Backend.Crawler.ApiCrawler alias Backend.Crawler.ApiCrawler
alias Backend.{Crawl, CrawlInteraction, Repo, Instance, InstancePeer} alias Backend.{Crawl, CrawlInteraction, MostRecentCrawl, Repo, Instance, InstancePeer}
import Ecto.Query import Ecto.Query
import Backend.Util import Backend.Util
require Logger require Logger
@ -167,10 +167,23 @@ defmodule Backend.Crawler do
Repo.insert!(%Crawl{ Repo.insert!(%Crawl{
instance_domain: domain, instance_domain: domain,
interactions_seen: interactions_seen:
result.interactions |> Map.values() |> Enum.reduce(0, fn count, acc -> count + acc end), result.interactions
|> Map.values()
|> Enum.reduce(0, fn count, acc -> count + acc end),
statuses_seen: result.statuses_seen statuses_seen: result.statuses_seen
}) })
Repo.insert!(
%MostRecentCrawl{
instance_domain: domain,
crawl_id: curr_crawl.id,
inserted_at: now,
updated_at: now
},
on_conflict: {:replace, [:crawl_id, :updated_at]},
conflict_target: :instance_domain
)
# We get a list of peers from two places: # We get a list of peers from two places:
# * the official peers endpoint (which may be disabled) # * the official peers endpoint (which may be disabled)
# * the interactions # * the interactions

View file

@ -49,11 +49,24 @@ defmodule Backend.Crawler.Crawlers.GnuSocial do
{interactions, statuses_seen} = get_interactions(domain, min_timestamp) {interactions, statuses_seen} = get_interactions(domain, min_timestamp)
Map.merge(nodeinfo_result, %{ if nodeinfo_result != nil do
interactions: interactions, Map.merge(nodeinfo_result, %{
statuses_seen: statuses_seen, interactions: interactions,
peers: [] statuses_seen: statuses_seen,
}) peers: []
})
else
%{
version: nil,
description: nil,
user_count: nil,
status_count: nil,
peers: [],
interactions: interactions,
statuses_seen: statuses_seen,
instance_type: :gnusocial
}
end
end end
@spec get_interactions( @spec get_interactions(

View file

@ -0,0 +1,22 @@
defmodule Backend.MostRecentCrawl do
  @moduledoc """
  Ecto schema for the `most_recent_crawl` table.

  Each row links an instance (referenced by its `domain`, stored in
  `instance_domain`) to a crawl (stored in `crawl_id`), so that the latest
  crawl for an instance can be looked up without sorting the `crawls` table.
  """
  use Ecto.Schema
  import Ecto.Changeset

  schema "most_recent_crawl" do
    # Instances are keyed by their string domain rather than an integer id,
    # so the foreign key column and its type are spelled out explicitly.
    belongs_to :instance, Backend.Instance,
      references: :domain,
      type: :string,
      foreign_key: :instance_domain

    belongs_to :crawl, Backend.Crawl

    timestamps()
  end

  @doc false
  def changeset(most_recent_crawl, attrs) do
    # `cast/3` only accepts schema fields; passing the association names
    # (`:instance`, `:crawl`) raises and asks for `cast_assoc/3` instead.
    # The underlying foreign-key columns are what callers actually set.
    most_recent_crawl
    |> cast(attrs, [:instance_domain, :crawl_id])
    |> validate_required([:instance_domain, :crawl_id])
  end
end

View file

@ -1,7 +1,7 @@
defmodule Backend.Util do defmodule Backend.Util do
import Ecto.Query import Ecto.Query
require Logger require Logger
alias Backend.{Crawl, Repo} alias Backend.{Crawl, MostRecentCrawl, Repo}
@doc """ @doc """
Returns the given key from :backend, :crawler in the config. Returns the given key from :backend, :crawler in the config.
@ -78,11 +78,17 @@ defmodule Backend.Util do
@spec get_last_crawl(String.t()) :: Crawl.t() | nil @spec get_last_crawl(String.t()) :: Crawl.t() | nil
def get_last_crawl(domain) do def get_last_crawl(domain) do
most_recent_crawl_subquery =
MostRecentCrawl
|> select([mrc], %{
most_recent_id: mrc.crawl_id
})
|> where([mrc], mrc.instance_domain == ^domain)
Crawl Crawl
|> select([c], c) |> join(:inner, [c], mrc in subquery(most_recent_crawl_subquery),
|> where([c], c.instance_domain == ^domain) on: c.id == mrc.most_recent_id
|> order_by(desc: :id) )
|> limit(1)
|> Repo.one() |> Repo.one()
end end

View file

@ -0,0 +1,32 @@
# Creates the `most_recent_crawl` table (one row per instance domain, enforced
# by a unique index) and backfills it from the existing `crawls` table.
defmodule Backend.Repo.Migrations.AddMostRecentCrawlTable do
use Ecto.Migration
def change do
create table(:most_recent_crawl) do
# Foreign key to instances.domain (a string column, hence the explicit type).
add :instance_domain, references(:instances, column: :domain, type: :string)
# Foreign key to the crawls table.
add :crawl_id, references(:crawls)
timestamps()
end
# At most one row per instance; an upsert can use this column as its
# conflict target.
create unique_index(:most_recent_crawl, [:instance_domain])
# Run the queued create commands now so the table exists before the raw SQL
# below is executed.
flush()
# execute/2: the first statement runs on migrate, the second on rollback.
# The backfill inserts, for every instance_domain present in `crawls`, the
# largest crawl id for that domain.
# NOTE(review): this assumes the highest crawl id is the most recent crawl
# for a domain — confirm ids are assigned monotonically.
execute(
"
INSERT INTO most_recent_crawl (instance_domain, crawl_id, updated_at, inserted_at)
SELECT
c.instance_domain,
MAX(c.id) AS crawl_id,
(SELECT NOW()) AS updated_at,
(SELECT NOW()) AS inserted_at
FROM
crawls c
GROUP BY
c.instance_domain
",
"DELETE FROM most_recent_crawl"
)
end
end