add most recent crawl table
This commit is contained in:
parent
144a6e842f
commit
2c035892d4
|
@ -17,7 +17,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||||
|
|
||||||
### Fixed
|
### Fixed
|
||||||
|
|
||||||
- Added missing indices on crawls and crawl_interactions tables.
|
- Added missing indices on `crawls` and `crawl_interactions` tables.
|
||||||
|
- Added table to store most recent crawl. This speeds up the instance view by a lot!
|
||||||
|
|
||||||
### Security
|
### Security
|
||||||
|
|
||||||
|
|
|
@ -6,7 +6,7 @@ defmodule Backend.Crawler do
|
||||||
alias __MODULE__
|
alias __MODULE__
|
||||||
alias Backend.Crawler.Crawlers.{GnuSocial, Mastodon, Misskey, Nodeinfo}
|
alias Backend.Crawler.Crawlers.{GnuSocial, Mastodon, Misskey, Nodeinfo}
|
||||||
alias Backend.Crawler.ApiCrawler
|
alias Backend.Crawler.ApiCrawler
|
||||||
alias Backend.{Crawl, CrawlInteraction, Repo, Instance, InstancePeer}
|
alias Backend.{Crawl, CrawlInteraction, MostRecentCrawl, Repo, Instance, InstancePeer}
|
||||||
import Ecto.Query
|
import Ecto.Query
|
||||||
import Backend.Util
|
import Backend.Util
|
||||||
require Logger
|
require Logger
|
||||||
|
@ -167,10 +167,23 @@ defmodule Backend.Crawler do
|
||||||
Repo.insert!(%Crawl{
|
Repo.insert!(%Crawl{
|
||||||
instance_domain: domain,
|
instance_domain: domain,
|
||||||
interactions_seen:
|
interactions_seen:
|
||||||
result.interactions |> Map.values() |> Enum.reduce(0, fn count, acc -> count + acc end),
|
result.interactions
|
||||||
|
|> Map.values()
|
||||||
|
|> Enum.reduce(0, fn count, acc -> count + acc end),
|
||||||
statuses_seen: result.statuses_seen
|
statuses_seen: result.statuses_seen
|
||||||
})
|
})
|
||||||
|
|
||||||
|
Repo.insert!(
|
||||||
|
%MostRecentCrawl{
|
||||||
|
instance_domain: domain,
|
||||||
|
crawl_id: curr_crawl.id,
|
||||||
|
inserted_at: now,
|
||||||
|
updated_at: now
|
||||||
|
},
|
||||||
|
on_conflict: {:replace, [:crawl_id, :updated_at]},
|
||||||
|
conflict_target: :instance_domain
|
||||||
|
)
|
||||||
|
|
||||||
# We get a list of peers from two places:
|
# We get a list of peers from two places:
|
||||||
# * the official peers endpoint (which may be disabled)
|
# * the official peers endpoint (which may be disabled)
|
||||||
# * the interactions
|
# * the interactions
|
||||||
|
|
|
@ -49,11 +49,24 @@ defmodule Backend.Crawler.Crawlers.GnuSocial do
|
||||||
|
|
||||||
{interactions, statuses_seen} = get_interactions(domain, min_timestamp)
|
{interactions, statuses_seen} = get_interactions(domain, min_timestamp)
|
||||||
|
|
||||||
Map.merge(nodeinfo_result, %{
|
if nodeinfo_result != nil do
|
||||||
interactions: interactions,
|
Map.merge(nodeinfo_result, %{
|
||||||
statuses_seen: statuses_seen,
|
interactions: interactions,
|
||||||
peers: []
|
statuses_seen: statuses_seen,
|
||||||
})
|
peers: []
|
||||||
|
})
|
||||||
|
else
|
||||||
|
%{
|
||||||
|
version: nil,
|
||||||
|
description: nil,
|
||||||
|
user_count: nil,
|
||||||
|
status_count: nil,
|
||||||
|
peers: [],
|
||||||
|
interactions: interactions,
|
||||||
|
statuses_seen: statuses_seen,
|
||||||
|
instance_type: :gnusocial
|
||||||
|
}
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
@spec get_interactions(
|
@spec get_interactions(
|
||||||
|
|
22
backend/lib/backend/most_recent_crawl.ex
Normal file
22
backend/lib/backend/most_recent_crawl.ex
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
defmodule Backend.MostRecentCrawl do
  @moduledoc """
  Ecto schema for the `most_recent_crawl` table.

  Each row links an instance (through `instance_domain`, which references the
  instance's `domain` column) to one crawl (through `crawl_id`).
  """

  use Ecto.Schema
  import Ecto.Changeset

  schema "most_recent_crawl" do
    belongs_to :instance, Backend.Instance,
      references: :domain,
      type: :string,
      foreign_key: :instance_domain

    belongs_to :crawl, Backend.Crawl

    timestamps()
  end

  @doc false
  def changeset(most_recent_crawl, attrs) do
    most_recent_crawl
    # cast/4 only accepts plain schema fields; passing the association names
    # (:instance, :crawl) raises, so the foreign-key columns are cast instead.
    |> cast(attrs, [:instance_domain, :crawl_id])
    |> validate_required([:instance_domain, :crawl_id])
  end
end
|
|
@ -1,7 +1,7 @@
|
||||||
defmodule Backend.Util do
|
defmodule Backend.Util do
|
||||||
import Ecto.Query
|
import Ecto.Query
|
||||||
require Logger
|
require Logger
|
||||||
alias Backend.{Crawl, Repo}
|
alias Backend.{Crawl, MostRecentCrawl, Repo}
|
||||||
|
|
||||||
@doc """
|
@doc """
|
||||||
Returns the given key from :backend, :crawler in the config.
|
Returns the given key from :backend, :crawler in the config.
|
||||||
|
@ -78,11 +78,17 @@ defmodule Backend.Util do
|
||||||
|
|
||||||
@spec get_last_crawl(String.t()) :: Crawl.t() | nil
|
@spec get_last_crawl(String.t()) :: Crawl.t() | nil
|
||||||
def get_last_crawl(domain) do
|
def get_last_crawl(domain) do
|
||||||
|
most_recent_crawl_subquery =
|
||||||
|
MostRecentCrawl
|
||||||
|
|> select([mrc], %{
|
||||||
|
most_recent_id: mrc.crawl_id
|
||||||
|
})
|
||||||
|
|> where([mrc], mrc.instance_domain == ^domain)
|
||||||
|
|
||||||
Crawl
|
Crawl
|
||||||
|> select([c], c)
|
|> join(:inner, [c], mrc in subquery(most_recent_crawl_subquery),
|
||||||
|> where([c], c.instance_domain == ^domain)
|
on: c.id == mrc.most_recent_id
|
||||||
|> order_by(desc: :id)
|
)
|
||||||
|> limit(1)
|
|
||||||
|> Repo.one()
|
|> Repo.one()
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,32 @@
|
||||||
|
defmodule Backend.Repo.Migrations.AddMostRecentCrawlTable do
  @moduledoc false

  use Ecto.Migration

  # Run when migrating up: inserts one row per instance domain found in
  # `crawls`, using the highest crawl id for that domain.
  @backfill_sql "
      INSERT INTO most_recent_crawl (instance_domain, crawl_id, updated_at, inserted_at)
      SELECT
        c.instance_domain,
        MAX(c.id) AS crawl_id,
        (SELECT NOW()) AS updated_at,
        (SELECT NOW()) AS inserted_at
      FROM
        crawls c
      GROUP BY
        c.instance_domain
      "

  # Run when rolling back: empties the table.
  @rollback_sql "DELETE FROM most_recent_crawl"

  def change do
    create table(:most_recent_crawl) do
      add :instance_domain, references(:instances, column: :domain, type: :string)
      add :crawl_id, references(:crawls)

      timestamps()
    end

    # One row per instance domain.
    create index(:most_recent_crawl, [:instance_domain], unique: true)

    # Execute the queued table/index creation before the backfill statement.
    flush()

    execute(@backfill_sql, @rollback_sql)
  end
end
|
Loading…
Reference in a new issue