save crawl error details

Tao Bror Bojlén 2019-07-03 16:36:53 +01:00
parent 4cc24e0889
commit 27dc0c5260
No known key found for this signature in database
GPG Key ID: C6EC7AAB905F9E6F
6 changed files with 255 additions and 214 deletions


@@ -2,18 +2,17 @@ defmodule Backend.Crawler.ApiCrawler do
@moduledoc """
This module is a specification. Crawlers for all instance types must implement its behaviour.
TODO:
Make sure to respect the following configuration values:
* `:status_age_limit_days` specifies that you must only crawl statuses from the most recent N days
* `:status_count_limit` specifies the max number of statuses to crawl in one go
* `:personal_instance_threshold` specifies that instances with fewer than this number of users should not be crawled
* also, profiles with the string "nobot" (case insensitive) in their profile must not be included in any stats.
Make sure to respect the following:
* You must adhere to the following configuration values:
* `:status_age_limit_days` specifies that you must only crawl statuses from the most recent N days
* `:status_count_limit` specifies the max number of statuses to crawl in one go
* `:personal_instance_threshold` specifies that instances with fewer than this number of users should not be crawled
* profiles with the string "nobot" (case insensitive) in their profile must not be included in any stats
* Make sure to check the most recent crawl of the instance so you don't re-crawl old statuses
"""
alias __MODULE__
# {domain_mentioned, datetime}
@type instance_interaction :: {String.t(), DateTime}
# {domain_mentioned, timestamp}
@type instance_interaction :: {String.t(), NaiveDateTime}
defstruct [
:version,
@@ -24,7 +23,7 @@ defmodule Backend.Crawler.ApiCrawler do
:interactions
]
@type crawl_result :: %ApiCrawler{
@type t() :: %__MODULE__{
version: String.t(),
description: String.t(),
user_count: Number,
@@ -40,5 +39,5 @@ defmodule Backend.Crawler.ApiCrawler do
@doc """
Crawl the instance at the given domain.
"""
@callback crawl(String.t()) :: crawl_result()
@callback crawl(String.t()) :: t()
end
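The configuration values named in the moduledoc are read through Backend.Crawler.Util.get_config/1 (see the Util diff below), which looks them up under the :crawler key of the :backend application. A minimal sketch of what that configuration could look like — the keys come from the moduledoc, but the values here are invented for illustration:

# config/config.exs — hypothetical values, not part of this commit
config :backend, :crawler,
  status_age_limit_days: 28,
  status_count_limit: 5_000,
  personal_instance_threshold: 10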


@@ -5,25 +5,33 @@ defmodule Backend.Crawler.Crawler do
alias __MODULE__
alias Backend.Crawler.Crawlers.Mastodon
alias Backend.Crawler.ApiCrawler
alias Backend.{Repo, Instance, Interaction, InstancePeer}
import Ecto.Query
require Logger
defstruct [
# the instance domain (a string)
:instance,
:domain,
# a list of ApiCrawlers that will be attempted
:api_crawlers,
:found_api?,
# an instance of the ApiCrawler struct
:result
:result,
:error
]
@spec run(String.t()) :: any
@type t() :: %__MODULE__{
domain: String.t(),
api_crawlers: [ApiCrawler.t()],
found_api?: boolean,
result: ApiCrawler.t() | nil,
error: String.t() | nil
}
def run(domain) do
Logger.info("Crawling #{domain}...")
HTTPoison.start()
state = %Crawler{instance: domain, api_crawlers: [], found_api?: false, result: nil}
state = %Crawler{domain: domain, api_crawlers: [], found_api?: false, result: nil, error: nil}
state
# register APICrawlers here
@@ -38,135 +46,154 @@ defmodule Backend.Crawler.Crawler do
Map.put(state, :api_crawlers, [api_crawler | crawlers])
end
# Recursive function to check whether `instance` has an API that the head of the api_crawlers list can read.
# Recursive function to check whether `domain` has an API that the head of the api_crawlers list can read.
# If so, crawls it. If not, continues with the tail of the api_crawlers list.
defp crawl(%Crawler{api_crawlers: []} = state, instance) do
Logger.info("Found no compatible API for #{instance}")
defp crawl(%Crawler{api_crawlers: []} = state, domain) do
Logger.info("Found no compatible API for #{domain}")
Map.put(state, :found_api?, false)
end
defp crawl(state, instance) do
crawlers = Map.get(state, :api_crawlers, [])
curr = hd(crawlers)
defp crawl(state, domain) do
curr =
state
|> Map.get(:api_crawlers, [])
|> Kernel.hd()
if curr.is_instance_type?(instance) do
if curr.is_instance_type?(domain) do
Logger.debug("Found #{curr} instance")
state = Map.put(state, :found_api?, true)
Map.put(state, :found_api?, true)
|> Map.put(:result, curr.crawl(instance))
try do
Map.put(state, :result, curr.crawl(domain))
rescue
e in HTTPoison.Error ->
Map.put(state, :error, "HTTPoison error: " <> HTTPoison.Error.message(e))
e in Jason.DecodeError ->
Map.put(state, :error, "Jason DecodeError: " <> Jason.DecodeError.message(e))
e in _ ->
Map.put(state, :error, "Unknown error: " <> inspect(e))
end
else
# Nothing found so check the next APICrawler
Logger.debug("#{instance} is not an instance of #{curr}")
crawl(%Crawler{state | api_crawlers: tl(crawlers)}, instance)
Logger.debug("#{domain} is not an instance of #{curr}")
crawl(
%Crawler{state | api_crawlers: state |> Map.get(:api_crawlers, []) |> Kernel.tl()},
domain
)
end
end
# Save the state (after crawling) to the database.
defp save(%Crawler{instance: domain, result: result} = state) do
if state.found_api? do
now = NaiveDateTime.truncate(NaiveDateTime.utc_now(), :second)
peers_domains = result.peers
defp save(%Crawler{domain: domain, result: result, found_api?: true, error: nil}) do
now = NaiveDateTime.truncate(NaiveDateTime.utc_now(), :second)
peers_domains = result.peers
# We also get a list of the instances mentioned, in case the server doesn't publish a list of peers.
peers_from_mentions =
result.interactions
|> Enum.map(fn i -> Kernel.elem(i, 0) end)
# We also get a list of the instances mentioned, in case the server doesn't publish a list of peers.
peers_from_mentions =
result.interactions
|> Enum.map(fn i -> Kernel.elem(i, 0) end)
## Save the instance we crawled ##
Repo.insert!(
%Instance{
domain: domain,
## Save the instance we crawled ##
Repo.insert!(
%Instance{
domain: domain,
description: result.description,
version: result.version,
user_count: result.user_count,
status_count: result.status_count,
last_crawl_timestamp: now
},
on_conflict: [
set: [
description: result.description,
version: result.version,
user_count: result.user_count,
status_count: result.status_count,
last_crawl_timestamp: now
},
on_conflict: [
set: [
description: result.description,
version: result.version,
user_count: result.user_count,
status_count: result.status_count,
last_crawl_timestamp: now,
updated_at: now
]
],
conflict_target: :domain
)
last_crawl_timestamp: now,
updated_at: now,
error: nil
]
],
conflict_target: :domain
)
# If we discovered new instances from the peers endpoint or from mentions, add them
peers =
peers_domains
|> MapSet.new()
|> (fn set -> MapSet.union(set, MapSet.new(peers_from_mentions)) end).()
|> MapSet.to_list()
|> Enum.map(&%{domain: &1, inserted_at: now, updated_at: now})
# If we discovered new instances from the peers endpoint or from mentions, add them
peers =
peers_domains
|> MapSet.new()
|> (fn set -> MapSet.union(set, MapSet.new(peers_from_mentions)) end).()
|> MapSet.to_list()
|> Enum.map(&%{domain: &1, inserted_at: now, updated_at: now})
Instance
|> Repo.insert_all(peers, on_conflict: :nothing, conflict_target: :domain)
## Save peer relationships ##
Repo.transaction(fn ->
# get current peers (a list of strings)
current_peers =
InstancePeer
|> where(source_domain: ^domain)
|> select([p], p.target_domain)
|> Repo.all()
wanted_peers_set = MapSet.new(peers_domains)
current_peers_set = MapSet.new(current_peers)
# delete the peers we don't want
dont_want = current_peers_set |> MapSet.difference(wanted_peers_set) |> MapSet.to_list()
Instance
|> Repo.insert_all(peers, on_conflict: :nothing, conflict_target: :domain)
## Save peer relationships ##
Repo.transaction(fn ->
# get current peers (a list of strings)
current_peers =
InstancePeer
|> where(source_domain: ^domain)
|> where([p], p.target_domain in ^dont_want)
|> Repo.delete_all([])
|> select([p], p.target_domain)
|> Repo.all()
# insert the ones we don't have yet
new_instance_peers =
wanted_peers_set
|> MapSet.difference(current_peers_set)
|> MapSet.to_list()
|> Enum.map(
&%{
source_domain: domain,
target_domain: &1,
inserted_at: now,
updated_at: now
}
)
wanted_peers_set = MapSet.new(peers_domains)
current_peers_set = MapSet.new(current_peers)
InstancePeer
|> Repo.insert_all(new_instance_peers)
end)
# delete the peers we don't want
dont_want = current_peers_set |> MapSet.difference(wanted_peers_set) |> MapSet.to_list()
## Save interactions ##
interactions =
result.interactions
|> Enum.map(fn {target_domain, timestamp} ->
%{
InstancePeer
|> where(source_domain: ^domain)
|> where([p], p.target_domain in ^dont_want)
|> Repo.delete_all([])
# insert the ones we don't have yet
new_instance_peers =
wanted_peers_set
|> MapSet.difference(current_peers_set)
|> MapSet.to_list()
|> Enum.map(
&%{
source_domain: domain,
target_domain: target_domain,
timestamp: NaiveDateTime.truncate(timestamp, :second),
target_domain: &1,
inserted_at: now,
updated_at: now
}
end)
)
Interaction
|> Repo.insert_all(interactions)
else
Repo.insert!(
%Instance{
domain: domain
},
on_conflict: [set: [domain: :domain]],
conflict_target: :domain
)
end
InstancePeer
|> Repo.insert_all(new_instance_peers)
end)
## Save interactions ##
interactions =
result.interactions
|> Enum.map(fn {target_domain, timestamp} ->
%{
source_domain: domain,
target_domain: target_domain,
timestamp: NaiveDateTime.truncate(timestamp, :second),
inserted_at: now,
updated_at: now
}
end)
Interaction
|> Repo.insert_all(interactions)
end
defp save(%{domain: domain, error: error}) do
Repo.insert!(
%Instance{
domain: domain,
error: error
},
on_conflict: [set: [error: error]],
conflict_target: :domain
)
end
end
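The reworked save/1 relies on multi-clause pattern matching: the first clause matches only a crawl that found an API and captured no error, and persists the full result; anything else falls through to the second clause, which upserts just the domain and the error string. A self-contained sketch of that dispatch shape, with hypothetical names:

defmodule SaveDispatchSketch do
  # Matches only a clean result: an API was found and no exception was rescued.
  def save(%{found_api?: true, error: nil, domain: domain}), do: {:full_save, domain}

  # Fallback: no compatible API, or the crawl raised and the error was captured.
  def save(%{domain: domain, error: error}), do: {:error_save, domain, error}
end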


@@ -34,7 +34,7 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
end
end
@spec crawl_large_instance(String.t(), any()) :: ApiCrawler.crawl_result()
@spec crawl_large_instance(String.t(), any()) :: ApiCrawler.t()
defp crawl_large_instance(domain, instance) do
# servers may not publish peers
peers =
@@ -58,14 +58,87 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
)
end
defp is_http_200?(response) do
case response do
%{status_code: 200} -> true
_ -> false
@spec get_mentions(
String.t(),
String.t() | nil,
Calendar.naive_datetime() | nil,
[ApiCrawler.instance_interaction()],
integer
) :: {[ApiCrawler.instance_interaction()], integer}
defp get_mentions(
domain,
max_id \\ nil,
min_timestamp \\ nil,
mentions \\ [],
statuses_seen \\ 0
) do
# If `statuses_seen == 0`, it's the first call of this function, which means we want to query the database for the
# most recent status we have.
min_timestamp =
if statuses_seen == 0 do
Instance
|> select([:last_crawl_timestamp])
|> Repo.get_by(domain: domain)
|> (fn result ->
if result == nil, do: nil, else: Map.get(result, :last_crawl_timestamp)
end).()
else
min_timestamp
end
endpoint = "https://#{domain}/api/v1/timelines/public?local=true"
endpoint =
if max_id do
endpoint <> "&max_id=#{max_id}"
else
endpoint
end
Logger.debug("Crawling #{endpoint}")
# TODO: handle errors here
statuses =
endpoint
|> HTTPoison.get!([], timeout: 10_000, recv_timeout: 10_000)
|> Map.get(:body)
|> Jason.decode!()
filtered_statuses =
statuses
|> Enum.filter(fn s -> is_after?(s["created_at"], min_timestamp) end)
if length(filtered_statuses) > 0 do
# get statuses that are eligible (i.e. users don't have #nobot in their profile) and have mentions
mentions = mentions ++ statuses_to_mentions(filtered_statuses)
statuses_seen = statuses_seen + length(filtered_statuses)
status_datetime_threshold =
NaiveDateTime.utc_now()
|> NaiveDateTime.add(get_config(:status_age_limit_days) * 24 * 3600 * -1, :second)
oldest_status = Enum.at(filtered_statuses, -1)
oldest_status_datetime =
oldest_status
|> (fn s -> s["created_at"] end).()
|> NaiveDateTime.from_iso8601!()
if NaiveDateTime.compare(oldest_status_datetime, status_datetime_threshold) == :gt and
statuses_seen < get_config(:status_count_limit) and
length(filtered_statuses) == length(statuses) do
get_mentions(domain, oldest_status["id"], min_timestamp, mentions, statuses_seen)
else
{mentions, statuses_seen}
end
else
{mentions, statuses_seen}
end
end
# check if the endpoint works as expected
@spec has_title?(String.t()) :: boolean
defp has_title?(body) do
case Jason.decode(body) do
{:ok, decoded} -> Map.has_key?(decoded, "title")
@@ -115,97 +188,4 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
|> Enum.filter(fn status -> is_eligible?(status) end)
|> Enum.flat_map(fn status -> extract_mentions_from_status(status) end)
end
@spec is_after?(String.t(), NaiveDateTime.t() | nil) :: boolean()
defp is_after?(timestamp, threshold) do
if threshold == nil do
true
else
timestamp
|> NaiveDateTime.from_iso8601!()
# :second is the granularity used in the database
|> NaiveDateTime.truncate(:second)
|> NaiveDateTime.compare(threshold)
|> Kernel.===(:gt)
end
end
@spec get_mentions(
String.t(),
String.t() | nil,
Calendar.naive_datetime() | nil,
[ApiCrawler.instance_interaction()],
integer
) :: {[ApiCrawler.instance_interaction()], integer}
defp get_mentions(
domain,
max_id \\ nil,
min_timestamp \\ nil,
mentions \\ [],
statuses_seen \\ 0
) do
# If `statuses_seen == 0`, it's the first call of this function, which means we want to query the database for the
# most recent status we have.
min_timestamp =
if statuses_seen == 0 do
Instance
|> select([:last_crawl_timestamp])
|> Repo.get_by(domain: domain)
|> (fn result ->
if result == nil, do: nil, else: Map.get(result, :last_crawl_timestamp)
end).()
else
min_timestamp
end
endpoint = "https://#{domain}/api/v1/timelines/public?local=true"
endpoint =
if max_id do
endpoint <> "&max_id=#{max_id}"
else
endpoint
end
Logger.debug("Crawling #{endpoint}")
# TODO: handle errors here
statuses =
endpoint
|> HTTPoison.get!()
|> Map.get(:body)
|> Jason.decode!()
filtered_statuses =
statuses
|> Enum.filter(fn s -> is_after?(s["created_at"], min_timestamp) end)
if length(filtered_statuses) > 0 do
# get statuses that are eligible (i.e. users don't have #nobot in their profile) and have mentions
mentions = mentions ++ statuses_to_mentions(filtered_statuses)
statuses_seen = statuses_seen + length(filtered_statuses)
status_datetime_threshold =
NaiveDateTime.utc_now()
|> NaiveDateTime.add(get_config(:status_age_limit_days) * 24 * 3600 * -1, :second)
oldest_status = Enum.at(filtered_statuses, -1)
oldest_status_datetime =
oldest_status
|> (fn s -> s["created_at"] end).()
|> NaiveDateTime.from_iso8601!()
if NaiveDateTime.compare(oldest_status_datetime, status_datetime_threshold) == :gt and
statuses_seen < get_config(:status_count_limit) and
length(filtered_statuses) == length(statuses) do
get_mentions(domain, oldest_status["id"], min_timestamp, mentions, statuses_seen)
else
{mentions, statuses_seen}
end
else
{mentions, statuses_seen}
end
end
end
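get_mentions/5 above keeps paginating backwards through the local timeline only while three conditions hold: the oldest status on the page is newer than the configured age threshold, the number of statuses seen is still under :status_count_limit, and no status on the page was filtered out (a filtered status means the previous crawl's timestamp has been reached). A condensed restatement of that guard, as a hypothetical helper:

defmodule PaginationSketch do
  # True while it is worth fetching another (older) page of statuses.
  def continue?(oldest_datetime, threshold, statuses_seen, count_limit, filtered, fetched) do
    NaiveDateTime.compare(oldest_datetime, threshold) == :gt and
      statuses_seen < count_limit and
      length(filtered) == length(fetched)
  end
end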


@@ -1,13 +1,38 @@
defmodule Backend.Crawler.Util do
@spec get_config(atom) :: any
def get_config(key) do
Application.get_env(:backend, :crawler)[key]
end
# Gets the domain from a Mastodon/Pleroma account URL
# (e.g. https://mastodon.social/@demouser or https://pleroma.site/users/demouser)
@spec get_domain(String.t()) :: String.t()
def get_domain(url) do
String.slice(url, 8..-1)
|> String.split("/")
|> Enum.at(0)
end
@spec is_http_200?(HTTPoison.Response.t()) :: boolean
def is_http_200?(%{status_code: 200}) do
true
end
def is_http_200?(_) do
false
end
@spec is_after?(String.t(), NaiveDateTime.t() | nil) :: boolean()
def is_after?(timestamp, threshold) do
if threshold == nil do
true
else
timestamp
|> NaiveDateTime.from_iso8601!()
# :second is the granularity used in the database
|> NaiveDateTime.truncate(:second)
|> NaiveDateTime.compare(threshold)
|> Kernel.===(:gt)
end
end
end
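Note that get_domain/1 assumes an https:// URL: String.slice(url, 8..-1) drops exactly the eight characters of the "https://" prefix before splitting on "/". For example:

iex> Backend.Crawler.Util.get_domain("https://mastodon.social/@demouser")
"mastodon.social"
iex> Backend.Crawler.Util.get_domain("https://pleroma.site/users/demouser")
"pleroma.site"

An http:// URL would come out mangled, which seems acceptable here since the crawler itself always builds https:// endpoints.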


@@ -8,6 +8,7 @@ defmodule Backend.Instance do
field :status_count, :integer
field :user_count, :integer
field :version, :string
field :error, :string
# this is distinct from `updated_at` -- it indicates when the last *successful* crawl was. `updated_at` also
# gets updated if the crawl fails.
@@ -37,7 +38,15 @@ defmodule Backend.Instance do
@doc false
def changeset(instance, attrs) do
instance
|> cast(attrs, [:domain, :user_count, :status_count, :description, :version, :updated_at])
|> cast(attrs, [
:domain,
:user_count,
:status_count,
:description,
:version,
:updated_at,
:last_crawl_timestamp
])
|> validate_required([:domain])
|> put_assoc(:peers, attrs.peers)
end
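Since :error is not in the cast list, it can only be written through the crawler's direct Repo.insert! upserts shown earlier, not through this changeset. For illustration, a hypothetical call exercising the newly castable :last_crawl_timestamp:

%Backend.Instance{}
|> Backend.Instance.changeset(%{
  domain: "example.social",
  last_crawl_timestamp: ~N[2019-07-03 15:36:53],
  peers: []
})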


@@ -8,6 +8,7 @@ defmodule Backend.Repo.Migrations.CreateInstances do
add :status_count, :integer
add :description, :text
add :version, :string
add :error, :text
add :last_crawl_timestamp, :naive_datetime
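One caveat: adding the :error column to the original CreateInstances migration only takes effect on databases migrated from scratch. A deployment that has already run this migration would need a follow-up migration instead — hypothetically something like:

defmodule Backend.Repo.Migrations.AddErrorToInstances do
  use Ecto.Migration

  def change do
    alter table(:instances) do
      add :error, :text
    end
  end
end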