add most recent crawl table

This commit is contained in:
Tao Bror Bojlén 2019-08-10 16:21:22 +03:00
parent 144a6e842f
commit 2c035892d4
No known key found for this signature in database
GPG Key ID: C6EC7AAB905F9E6F
6 changed files with 100 additions and 13 deletions

View File

@ -17,7 +17,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed
- Added missing indices on crawls and crawl_interactions tables.
- Added missing indices on `crawls` and `crawl_interactions` tables.
- Added a table that stores the most recent crawl for each instance. This speeds up the instance view considerably!
### Security

View File

@ -6,7 +6,7 @@ defmodule Backend.Crawler do
alias __MODULE__
alias Backend.Crawler.Crawlers.{GnuSocial, Mastodon, Misskey, Nodeinfo}
alias Backend.Crawler.ApiCrawler
alias Backend.{Crawl, CrawlInteraction, Repo, Instance, InstancePeer}
alias Backend.{Crawl, CrawlInteraction, MostRecentCrawl, Repo, Instance, InstancePeer}
import Ecto.Query
import Backend.Util
require Logger
@ -167,10 +167,23 @@ defmodule Backend.Crawler do
Repo.insert!(%Crawl{
instance_domain: domain,
interactions_seen:
result.interactions |> Map.values() |> Enum.reduce(0, fn count, acc -> count + acc end),
result.interactions
|> Map.values()
|> Enum.reduce(0, fn count, acc -> count + acc end),
statuses_seen: result.statuses_seen
})
Repo.insert!(
%MostRecentCrawl{
instance_domain: domain,
crawl_id: curr_crawl.id,
inserted_at: now,
updated_at: now
},
on_conflict: {:replace, [:crawl_id, :updated_at]},
conflict_target: :instance_domain
)
# We get a list of peers from two places:
# * the official peers endpoint (which may be disabled)
# * the interactions

View File

@ -49,11 +49,24 @@ defmodule Backend.Crawler.Crawlers.GnuSocial do
{interactions, statuses_seen} = get_interactions(domain, min_timestamp)
Map.merge(nodeinfo_result, %{
interactions: interactions,
statuses_seen: statuses_seen,
peers: []
})
if nodeinfo_result != nil do
Map.merge(nodeinfo_result, %{
interactions: interactions,
statuses_seen: statuses_seen,
peers: []
})
else
%{
version: nil,
description: nil,
user_count: nil,
status_count: nil,
peers: [],
interactions: interactions,
statuses_seen: statuses_seen,
instance_type: :gnusocial
}
end
end
@spec get_interactions(

View File

@ -0,0 +1,22 @@
defmodule Backend.MostRecentCrawl do
  @moduledoc """
  Schema for the `most_recent_crawl` table.

  Each row links an instance (through `instance_domain`) to a crawl (through
  `crawl_id`), together with the usual `inserted_at`/`updated_at` timestamps.
  """
  use Ecto.Schema
  import Ecto.Changeset

  schema "most_recent_crawl" do
    # Instances are keyed by their domain string rather than an integer id,
    # so the foreign key is `instance_domain` and references `:domain`.
    belongs_to :instance, Backend.Instance,
      references: :domain,
      type: :string,
      foreign_key: :instance_domain

    belongs_to :crawl, Backend.Crawl

    timestamps()
  end

  @doc false
  def changeset(most_recent_crawl, attrs) do
    # Cast the foreign-key fields, not the association names: passing
    # `:instance` / `:crawl` to `cast/3` raises, because associations have to
    # go through `cast_assoc/3` instead.
    most_recent_crawl
    |> cast(attrs, [:instance_domain, :crawl_id])
    |> validate_required([:instance_domain, :crawl_id])
  end
end

View File

@ -1,7 +1,7 @@
defmodule Backend.Util do
import Ecto.Query
require Logger
alias Backend.{Crawl, Repo}
alias Backend.{Crawl, MostRecentCrawl, Repo}
@doc """
Returns the given key from :backend, :crawler in the config.
@ -78,11 +78,17 @@ defmodule Backend.Util do
@spec get_last_crawl(String.t()) :: Crawl.t() | nil
def get_last_crawl(domain) do
  # Look up the crawl id stored for this domain in the most_recent_crawl table.
  latest_for_domain =
    from(mrc in MostRecentCrawl,
      where: mrc.instance_domain == ^domain,
      select: %{most_recent_id: mrc.crawl_id}
    )

  # Fetch the crawl row whose id matches the one found above.
  # NOTE(review): with the join in place, the domain filter, ordering and limit
  # below look redundant - confirm before dropping them.
  last_crawl_query =
    from(c in Crawl,
      join: mrc in subquery(latest_for_domain),
      on: c.id == mrc.most_recent_id,
      where: c.instance_domain == ^domain,
      order_by: [desc: c.id],
      limit: 1,
      select: c
    )

  Repo.one(last_crawl_query)
end

View File

@ -0,0 +1,32 @@
# Migration that creates the most_recent_crawl table and backfills it from the
# existing crawls table.
defmodule Backend.Repo.Migrations.AddMostRecentCrawlTable do
use Ecto.Migration
def change do
# One row per instance: the instance's domain plus the id of a crawl.
create table(:most_recent_crawl) do
# Foreign key to instances.domain (a string column, not an integer id).
add :instance_domain, references(:instances, column: :domain, type: :string)
# Foreign key to crawls.
add :crawl_id, references(:crawls)
timestamps()
end
# Unique per domain, so each instance can have at most one row.
create unique_index(:most_recent_crawl, [:instance_domain])
# Run the queued table/index creation now so the table exists before the
# backfill statement below is executed.
flush()
# First argument runs on migrate: insert, for every instance_domain in crawls,
# the highest crawl id, with both timestamps set to NOW().
# Second argument runs on rollback: remove every row from the table.
execute(
"
INSERT INTO most_recent_crawl (instance_domain, crawl_id, updated_at, inserted_at)
SELECT
c.instance_domain,
MAX(c.id) AS crawl_id,
(SELECT NOW()) AS updated_at,
(SELECT NOW()) AS inserted_at
FROM
crawls c
GROUP BY
c.instance_domain
",
"DELETE FROM most_recent_crawl"
)
end
end