only crawl new statuses since last crawl

This commit is contained in:
Tao Bror Bojlén 2019-07-01 17:02:15 +01:00
parent 0b7df993b9
commit d94e700e6a
No known key found for this signature in database
GPG Key ID: C6EC7AAB905F9E6F
4 changed files with 16 additions and 10 deletions

View File

@ -74,7 +74,8 @@ defmodule Backend.Crawler.Crawler do
description: result.description,
version: result.version,
user_count: result.user_count,
status_count: result.status_count
status_count: result.status_count,
last_crawl_timestamp: now
},
on_conflict: [
set: [
@ -82,6 +83,7 @@ defmodule Backend.Crawler.Crawler do
version: result.version,
user_count: result.user_count,
status_count: result.status_count,
last_crawl_timestamp: now,
updated_at: now
]
],

View File

@ -2,7 +2,7 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
require Logger
import Backend.Crawler.Util
alias Backend.Crawler.ApiCrawler
alias Backend.Interaction
alias Backend.Instance
alias Backend.Repo
import Ecto.Query
@ -148,14 +148,12 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
# most recent status we have.
min_timestamp =
if statuses_seen == 0 do
Interaction
|> where(source_domain: ^domain)
|> select([:timestamp])
|> order_by(desc: :timestamp)
|> limit(1)
|> Repo.all()
|> List.first()
|> (fn i -> i.timestamp end).()
Instance
|> select([:last_crawl_timestamp])
|> Repo.get_by(domain: domain)
|> (fn result ->
if result == nil, do: nil, else: Map.get(result, :last_crawl_timestamp)
end).()
else
min_timestamp
end

View File

@ -9,6 +9,10 @@ defmodule Backend.Instance do
field :user_count, :integer
field :version, :string
# Distinct from `updated_at`: this records when the last *successful* crawl was, whereas `updated_at`
# is also updated when a crawl fails.
field :last_crawl_timestamp, :naive_datetime
many_to_many :peers, Backend.Instance,
join_through: Backend.InstancePeer,
join_keys: [source: :domain, target: :domain]

View File

@ -9,6 +9,8 @@ defmodule Backend.Repo.Migrations.CreateInstances do
add :description, :text
add :version, :string
add :last_crawl_timestamp, :naive_datetime
timestamps()
end