2019-07-14 11:47:06 +00:00
|
|
|
defmodule Backend.Crawler do
|
|
|
|
@moduledoc """
|
|
|
|
This module crawls instances. Run `run(domain)` to crawl a given domain.
|
|
|
|
"""
|
|
|
|
|
|
|
|
alias __MODULE__
|
|
|
|
alias Backend.Crawler.Crawlers.Mastodon
|
|
|
|
alias Backend.Crawler.ApiCrawler
|
|
|
|
alias Backend.{Crawl, CrawlInteraction, Repo, Instance, InstancePeer}
|
|
|
|
import Ecto.Query
|
|
|
|
import Backend.Util
|
|
|
|
require Logger
|
|
|
|
|
|
|
|
  defstruct [
    # the instance domain (a string)
    :domain,
    # a list of ApiCrawlers that will be attempted
    :api_crawlers,
    # whether any registered ApiCrawler recognized the domain's API
    :found_api?,
    # whether the instance permits crawling; set to false when a crawler refuses (e.g. robots.txt)
    :allows_crawling?,
    # the ApiCrawler result on a successful crawl; nil otherwise
    :result,
    # a human-readable error message when crawling failed; nil on success
    :error
  ]

  @type t() :: %__MODULE__{
          domain: String.t(),
          api_crawlers: [ApiCrawler.t()],
          found_api?: boolean,
          allows_crawling?: boolean,
          result: ApiCrawler.t() | nil,
          error: String.t() | nil
        }
|
|
|
|
|
|
|
|
def run(domain) do
|
|
|
|
Logger.info("Crawling #{domain}...")
|
|
|
|
HTTPoison.start()
|
2019-07-19 20:00:28 +00:00
|
|
|
|
|
|
|
state = %Crawler{
|
|
|
|
domain: domain,
|
|
|
|
api_crawlers: [],
|
|
|
|
found_api?: false,
|
|
|
|
allows_crawling?: true,
|
|
|
|
result: nil,
|
|
|
|
error: nil
|
|
|
|
}
|
2019-07-14 11:47:06 +00:00
|
|
|
|
|
|
|
state
|
|
|
|
# register APICrawlers here
|
|
|
|
|> register(Mastodon)
|
|
|
|
# go!
|
|
|
|
|> crawl()
|
|
|
|
|> save()
|
|
|
|
end
|
|
|
|
|
|
|
|
# Adds a new ApiCrawler that run/1 will check.
|
|
|
|
defp register(%Crawler{api_crawlers: crawlers} = state, api_crawler) do
|
|
|
|
Map.put(state, :api_crawlers, [api_crawler | crawlers])
|
|
|
|
end
|
|
|
|
|
|
|
|
# Recursive function to check whether `domain` has an API that the head of the api_crawlers list can read.
|
|
|
|
# If so, crawls it. If not, continues with the tail of the api_crawlers list.
|
|
|
|
defp crawl(%Crawler{api_crawlers: [], domain: domain} = state) do
|
|
|
|
Logger.debug("Found no compatible API for #{domain}")
|
|
|
|
Map.put(state, :found_api?, false)
|
|
|
|
end
|
|
|
|
|
|
|
|
defp crawl(%Crawler{domain: domain, api_crawlers: [curr | remaining_crawlers]} = state) do
|
|
|
|
if curr.is_instance_type?(domain) do
|
|
|
|
Logger.debug("Found #{curr} instance")
|
|
|
|
state = Map.put(state, :found_api?, true)
|
|
|
|
|
2019-07-19 20:00:28 +00:00
|
|
|
if curr.allows_crawling?(domain) do
|
|
|
|
try do
|
|
|
|
%Crawler{state | result: curr.crawl(domain), api_crawlers: []}
|
|
|
|
rescue
|
|
|
|
e in HTTPoison.Error ->
|
|
|
|
Map.put(state, :error, "HTTPoison error: " <> HTTPoison.Error.message(e))
|
|
|
|
|
|
|
|
e in Jason.DecodeError ->
|
|
|
|
Map.put(state, :error, "Jason DecodeError: " <> Jason.DecodeError.message(e))
|
|
|
|
|
|
|
|
e in _ ->
|
|
|
|
Map.put(state, :error, "Unknown error: " <> inspect(e))
|
|
|
|
end
|
|
|
|
else
|
|
|
|
Logger.debug("#{domain} does not allow crawling.")
|
|
|
|
Map.put(state, :allows_crawling?, false)
|
2019-07-14 11:47:06 +00:00
|
|
|
end
|
|
|
|
else
|
|
|
|
# Nothing found so check the next APICrawler
|
|
|
|
Logger.debug("#{domain} is not an instance of #{curr}")
|
|
|
|
crawl(%Crawler{state | api_crawlers: remaining_crawlers})
|
|
|
|
end
|
|
|
|
end
|
|
|
|
|
|
|
|
# Save the state (after crawling) to the database.
|
2019-07-19 20:00:28 +00:00
|
|
|
defp save(%Crawler{
|
|
|
|
domain: domain,
|
|
|
|
result: result,
|
|
|
|
found_api?: true,
|
|
|
|
error: nil,
|
|
|
|
allows_crawling?: true
|
|
|
|
}) do
|
2019-07-18 10:21:12 +00:00
|
|
|
now = get_now()
|
2019-07-14 11:47:06 +00:00
|
|
|
|
|
|
|
## Update the instance we crawled ##
|
|
|
|
Repo.insert!(
|
|
|
|
%Instance{
|
|
|
|
domain: domain,
|
|
|
|
description: result.description,
|
|
|
|
version: result.version,
|
|
|
|
user_count: result.user_count,
|
|
|
|
status_count: result.status_count
|
|
|
|
},
|
|
|
|
on_conflict: [
|
|
|
|
set: [
|
|
|
|
description: result.description,
|
|
|
|
version: result.version,
|
|
|
|
user_count: result.user_count,
|
|
|
|
status_count: result.status_count,
|
|
|
|
updated_at: now
|
|
|
|
]
|
|
|
|
],
|
|
|
|
conflict_target: :domain
|
|
|
|
)
|
|
|
|
|
|
|
|
# Save details of a new crawl
|
|
|
|
curr_crawl =
|
|
|
|
Repo.insert!(%Crawl{
|
|
|
|
instance_domain: domain,
|
|
|
|
interactions_seen:
|
|
|
|
result.interactions |> Map.values() |> Enum.reduce(0, fn count, acc -> count + acc end),
|
|
|
|
statuses_seen: result.statuses_seen
|
|
|
|
})
|
|
|
|
|
|
|
|
# We get a list of peers from two places:
|
|
|
|
# * the official peers endpoint (which may be disabled)
|
|
|
|
# * the interactions
|
|
|
|
peers_domains =
|
|
|
|
result.interactions
|
|
|
|
|> Map.keys()
|
|
|
|
|> list_union(result.peers)
|
|
|
|
|> Enum.filter(fn domain -> not is_blacklisted?(domain) end)
|
|
|
|
|
|
|
|
peers =
|
|
|
|
peers_domains
|
|
|
|
|> Enum.map(&%{domain: &1, inserted_at: now, updated_at: now})
|
|
|
|
|
|
|
|
Instance
|
|
|
|
|> Repo.insert_all(peers, on_conflict: :nothing, conflict_target: :domain)
|
|
|
|
|
|
|
|
Repo.transaction(fn ->
|
|
|
|
## Save peer relationships ##
|
|
|
|
# get current peers (a list of strings)
|
|
|
|
current_peers =
|
|
|
|
InstancePeer
|
|
|
|
|> where(source_domain: ^domain)
|
|
|
|
|> select([p], p.target_domain)
|
|
|
|
|> Repo.all()
|
|
|
|
|
|
|
|
wanted_peers_set = MapSet.new(peers_domains)
|
|
|
|
current_peers_set = MapSet.new(current_peers)
|
|
|
|
|
|
|
|
# delete the peers we don't want
|
|
|
|
dont_want = current_peers_set |> MapSet.difference(wanted_peers_set) |> MapSet.to_list()
|
|
|
|
|
|
|
|
if length(dont_want) > 0 do
|
|
|
|
InstancePeer
|
|
|
|
|> where(source_domain: ^domain)
|
|
|
|
|> where([p], p.target_domain in ^dont_want)
|
|
|
|
|> Repo.delete_all([])
|
|
|
|
end
|
|
|
|
|
|
|
|
# insert the ones we don't have yet
|
|
|
|
new_instance_peers =
|
|
|
|
wanted_peers_set
|
|
|
|
|> MapSet.difference(current_peers_set)
|
|
|
|
|> MapSet.to_list()
|
|
|
|
|> Enum.map(
|
|
|
|
&%{
|
|
|
|
source_domain: domain,
|
|
|
|
target_domain: &1,
|
|
|
|
inserted_at: now,
|
|
|
|
updated_at: now
|
|
|
|
}
|
|
|
|
)
|
|
|
|
|
|
|
|
InstancePeer
|
|
|
|
|> Repo.insert_all(new_instance_peers)
|
|
|
|
end)
|
|
|
|
|
|
|
|
## Save interactions ##
|
|
|
|
interactions =
|
|
|
|
result.interactions
|
|
|
|
|> Enum.filter(fn {target_domain, _count} -> not is_blacklisted?(target_domain) end)
|
|
|
|
|> Enum.map(fn {target_domain, count} ->
|
|
|
|
%{
|
|
|
|
crawl_id: curr_crawl.id,
|
|
|
|
source_domain: domain,
|
|
|
|
target_domain: target_domain,
|
|
|
|
mentions: count,
|
|
|
|
inserted_at: now,
|
|
|
|
updated_at: now
|
|
|
|
}
|
|
|
|
end)
|
|
|
|
|
|
|
|
CrawlInteraction
|
|
|
|
|> Repo.insert_all(interactions)
|
|
|
|
end
|
|
|
|
|
2019-07-19 20:00:28 +00:00
|
|
|
defp save(%{domain: domain, error: error, allows_crawling?: allows_crawling}) do
|
|
|
|
error =
|
|
|
|
cond do
|
|
|
|
not allows_crawling -> "robots.txt"
|
|
|
|
error == nil -> "no api found"
|
|
|
|
true -> "unknown error"
|
|
|
|
end
|
2019-07-19 18:19:53 +00:00
|
|
|
|
2019-07-14 11:47:06 +00:00
|
|
|
Repo.insert!(%Crawl{
|
|
|
|
instance_domain: domain,
|
|
|
|
error: error
|
|
|
|
})
|
|
|
|
end
|
|
|
|
end
|