add most recent crawl table
This commit is contained in:
parent
144a6e842f
commit
2c035892d4
|
@ -17,7 +17,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||
|
||||
### Fixed
|
||||
|
||||
- Added missing indices on crawls and crawl_interactions tables.
|
||||
- Added missing indices on `crawls` and `crawl_interactions` tables.
|
||||
- Added table to store most recent crawl. This speeds up the instance view by a lot!
|
||||
|
||||
### Security
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@ defmodule Backend.Crawler do
|
|||
alias __MODULE__
|
||||
alias Backend.Crawler.Crawlers.{GnuSocial, Mastodon, Misskey, Nodeinfo}
|
||||
alias Backend.Crawler.ApiCrawler
|
||||
alias Backend.{Crawl, CrawlInteraction, Repo, Instance, InstancePeer}
|
||||
alias Backend.{Crawl, CrawlInteraction, MostRecentCrawl, Repo, Instance, InstancePeer}
|
||||
import Ecto.Query
|
||||
import Backend.Util
|
||||
require Logger
|
||||
|
@ -167,10 +167,23 @@ defmodule Backend.Crawler do
|
|||
Repo.insert!(%Crawl{
|
||||
instance_domain: domain,
|
||||
interactions_seen:
|
||||
result.interactions |> Map.values() |> Enum.reduce(0, fn count, acc -> count + acc end),
|
||||
result.interactions
|
||||
|> Map.values()
|
||||
|> Enum.reduce(0, fn count, acc -> count + acc end),
|
||||
statuses_seen: result.statuses_seen
|
||||
})
|
||||
|
||||
Repo.insert!(
|
||||
%MostRecentCrawl{
|
||||
instance_domain: domain,
|
||||
crawl_id: curr_crawl.id,
|
||||
inserted_at: now,
|
||||
updated_at: now
|
||||
},
|
||||
on_conflict: {:replace, [:crawl_id, :updated_at]},
|
||||
conflict_target: :instance_domain
|
||||
)
|
||||
|
||||
# We get a list of peers from two places:
|
||||
# * the official peers endpoint (which may be disabled)
|
||||
# * the interactions
|
||||
|
|
|
@ -49,11 +49,24 @@ defmodule Backend.Crawler.Crawlers.GnuSocial do
|
|||
|
||||
{interactions, statuses_seen} = get_interactions(domain, min_timestamp)
|
||||
|
||||
Map.merge(nodeinfo_result, %{
|
||||
interactions: interactions,
|
||||
statuses_seen: statuses_seen,
|
||||
peers: []
|
||||
})
|
||||
if nodeinfo_result != nil do
|
||||
Map.merge(nodeinfo_result, %{
|
||||
interactions: interactions,
|
||||
statuses_seen: statuses_seen,
|
||||
peers: []
|
||||
})
|
||||
else
|
||||
%{
|
||||
version: nil,
|
||||
description: nil,
|
||||
user_count: nil,
|
||||
status_count: nil,
|
||||
peers: [],
|
||||
interactions: interactions,
|
||||
statuses_seen: statuses_seen,
|
||||
instance_type: :gnusocial
|
||||
}
|
||||
end
|
||||
end
|
||||
|
||||
@spec get_interactions(
|
||||
|
|
22
backend/lib/backend/most_recent_crawl.ex
Normal file
22
backend/lib/backend/most_recent_crawl.ex
Normal file
|
@ -0,0 +1,22 @@
|
|||
defmodule Backend.MostRecentCrawl do
  @moduledoc """
  Ecto schema for the `most_recent_crawl` table.

  Each row links an instance (through `instance_domain`) to a crawl
  (through `crawl_id`).
  """

  use Ecto.Schema
  import Ecto.Changeset

  schema "most_recent_crawl" do
    # The instance is referenced by its string `domain` rather than an integer id,
    # so the foreign key column is `instance_domain`.
    belongs_to :instance, Backend.Instance,
      references: :domain,
      type: :string,
      foreign_key: :instance_domain

    belongs_to :crawl, Backend.Crawl

    timestamps()
  end

  @doc false
  def changeset(most_recent_crawl, attrs) do
    # Cast the foreign-key fields, not the association names: `cast/3` raises
    # when it is handed an association such as `:instance` or `:crawl`.
    most_recent_crawl
    |> cast(attrs, [:instance_domain, :crawl_id])
    |> validate_required([:instance_domain, :crawl_id])
  end
end
|
|
@ -1,7 +1,7 @@
|
|||
defmodule Backend.Util do
|
||||
import Ecto.Query
|
||||
require Logger
|
||||
alias Backend.{Crawl, Repo}
|
||||
alias Backend.{Crawl, MostRecentCrawl, Repo}
|
||||
|
||||
@doc """
|
||||
Returns the given key from :backend, :crawler in the config.
|
||||
|
@ -78,11 +78,17 @@ defmodule Backend.Util do
|
|||
|
||||
@spec get_last_crawl(String.t()) :: Crawl.t() | nil
|
||||
def get_last_crawl(domain) do
|
||||
most_recent_crawl_subquery =
|
||||
MostRecentCrawl
|
||||
|> select([mrc], %{
|
||||
most_recent_id: mrc.crawl_id
|
||||
})
|
||||
|> where([mrc], mrc.instance_domain == ^domain)
|
||||
|
||||
Crawl
|
||||
|> select([c], c)
|
||||
|> where([c], c.instance_domain == ^domain)
|
||||
|> order_by(desc: :id)
|
||||
|> limit(1)
|
||||
|> join(:inner, [c], mrc in subquery(most_recent_crawl_subquery),
|
||||
on: c.id == mrc.most_recent_id
|
||||
)
|
||||
|> Repo.one()
|
||||
end
|
||||
|
||||
|
|
|
@ -0,0 +1,32 @@
|
|||
# Migration that creates the `most_recent_crawl` table and backfills it from `crawls`.
defmodule Backend.Repo.Migrations.AddMostRecentCrawlTable do
|
||||
use Ecto.Migration
|
||||
|
||||
def change do
|
||||
# `instance_domain` references instances.domain (a string column); `crawl_id` references crawls.
create table(:most_recent_crawl) do
|
||||
add :instance_domain, references(:instances, column: :domain, type: :string)
|
||||
add :crawl_id, references(:crawls)
|
||||
|
||||
timestamps()
|
||||
end
|
||||
|
||||
# At most one row per instance domain.
create unique_index(:most_recent_crawl, [:instance_domain])
|
||||
|
||||
# Run the queued create commands so the table exists before the backfill below
# (per Ecto.Migration.flush/0 — confirm against the Ecto version in use).
flush()
|
||||
|
||||
# First argument (up): insert, for every instance_domain in `crawls`, its highest crawl id,
# stamping both timestamps with NOW().
execute(
|
||||
"
|
||||
INSERT INTO most_recent_crawl (instance_domain, crawl_id, updated_at, inserted_at)
|
||||
SELECT
|
||||
c.instance_domain,
|
||||
MAX(c.id) AS crawl_id,
|
||||
(SELECT NOW()) AS updated_at,
|
||||
(SELECT NOW()) AS inserted_at
|
||||
FROM
|
||||
crawls c
|
||||
GROUP BY
|
||||
c.instance_domain
|
||||
",
|
||||
# Second argument (down): remove every backfilled row on rollback.
"DELETE FROM most_recent_crawl"
|
||||
)
|
||||
end
|
||||
end
|
Loading…
Reference in a new issue