Check robots.txt for permission to crawl

This commit is contained in:
Tao Bojlén 2019-07-19 20:00:28 +00:00
parent 97f8327336
commit 2cae5dccbf
5 changed files with 64 additions and 17 deletions

View file

@ -38,6 +38,12 @@ defmodule Backend.Crawler.ApiCrawler do
Check whether the instance at the given domain is of the type that this ApiCrawler implements.
"""
@callback is_instance_type?(String.t()) :: boolean()
@doc """
Check whether the instance allows crawling according to its robots.txt or otherwise.
"""
@callback allows_crawling?(String.t()) :: boolean()
@doc """
Crawl the instance at the given domain.
"""

View file

@ -17,6 +17,7 @@ defmodule Backend.Crawler do
# a list of ApiCrawlers that will be attempted
:api_crawlers,
:found_api?,
:allows_crawling?,
:result,
:error
]
@ -25,6 +26,7 @@ defmodule Backend.Crawler do
domain: String.t(),
api_crawlers: [ApiCrawler.t()],
found_api?: boolean,
allows_crawling?: boolean,
result: ApiCrawler.t() | nil,
error: String.t() | nil
}
@ -32,7 +34,15 @@ defmodule Backend.Crawler do
def run(domain) do
Logger.info("Crawling #{domain}...")
HTTPoison.start()
state = %Crawler{domain: domain, api_crawlers: [], found_api?: false, result: nil, error: nil}
state = %Crawler{
domain: domain,
api_crawlers: [],
found_api?: false,
allows_crawling?: true,
result: nil,
error: nil
}
state
# register APICrawlers here
@ -59,6 +69,7 @@ defmodule Backend.Crawler do
Logger.debug("Found #{curr} instance")
state = Map.put(state, :found_api?, true)
if curr.allows_crawling?(domain) do
try do
%Crawler{state | result: curr.crawl(domain), api_crawlers: []}
rescue
@ -71,6 +82,10 @@ defmodule Backend.Crawler do
e in _ ->
Map.put(state, :error, "Unknown error: " <> inspect(e))
end
else
Logger.debug("#{domain} does not allow crawling.")
Map.put(state, :allows_crawling?, false)
end
else
# Nothing found so check the next APICrawler
Logger.debug("#{domain} is not an instance of #{curr}")
@ -79,7 +94,13 @@ defmodule Backend.Crawler do
end
# Save the state (after crawling) to the database.
defp save(%Crawler{domain: domain, result: result, found_api?: true, error: nil}) do
defp save(%Crawler{
domain: domain,
result: result,
found_api?: true,
error: nil,
allows_crawling?: true
}) do
now = get_now()
## Update the instance we crawled ##
@ -187,9 +208,12 @@ defmodule Backend.Crawler do
|> Repo.insert_all(interactions)
end
defp save(%{domain: domain, error: error}) do
if error == nil do
error = "no api found"
defp save(%{domain: domain, error: error, allows_crawling?: allows_crawling}) do
error =
cond do
not allows_crawling -> "robots.txt"
error == nil -> "no api found"
true -> "unknown error"
end
Repo.insert!(%Crawl{

View file

@ -14,6 +14,21 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
end
end
@impl ApiCrawler
def allows_crawling?(domain) do
  # The crawl is permitted only if robots.txt does not explicitly mark
  # any of the endpoints we hit as uncrawlable for our user agent.
  # Note: `Gollum.crawlable?/2` returning anything other than
  # `:uncrawlable` (e.g. when robots.txt is missing) counts as allowed.
  user_agent = get_config(:user_agent)

  [
    "/api/v1/instance",
    "/api/v1/instance/peers",
    "/api/v1/timelines/public"
  ]
  |> Enum.all?(fn path ->
    Gollum.crawlable?(user_agent, "https://#{domain}#{path}") != :uncrawlable
  end)
end
@impl ApiCrawler
# sobelow_skip ["DOS.StringToAtom"]
def crawl(domain) do

View file

@ -20,7 +20,7 @@ defmodule Backend.MixProject do
def application do
[
mod: {Backend.Application, []},
extra_applications: [:logger, :runtime_tools, :mnesia]
extra_applications: [:logger, :runtime_tools, :mnesia, :gollum]
]
end
@ -46,7 +46,8 @@ defmodule Backend.MixProject do
{:honeydew, "~> 1.4.3"},
{:quantum, "~> 2.3"},
{:corsica, "~> 1.1.2"},
{:sobelow, "~> 0.8", only: :dev}
{:sobelow, "~> 0.8", only: :dev},
{:gollum, "~> 0.3.2"}
]
end

View file

@ -15,6 +15,7 @@
"gen_stage": {:hex, :gen_stage, "0.14.2", "6a2a578a510c5bfca8a45e6b27552f613b41cf584b58210f017088d3d17d0b14", [:mix], [], "hexpm"},
"gen_state_machine": {:hex, :gen_state_machine, "2.0.5", "9ac15ec6e66acac994cc442dcc2c6f9796cf380ec4b08267223014be1c728a95", [:mix], [], "hexpm"},
"gettext": {:hex, :gettext, "0.17.0", "abe21542c831887a2b16f4c94556db9c421ab301aee417b7c4fbde7fbdbe01ec", [:mix], [], "hexpm"},
"gollum": {:hex, :gollum, "0.3.2", "bd6a17febb98f5c362b5c5c08d9b102738fb875016e1eefec626f5e87c23f20e", [:mix], [{:httpoison, "~> 1.5.1", [hex: :httpoison, repo: "hexpm", optional: false]}], "hexpm"},
"hackney": {:hex, :hackney, "1.15.1", "9f8f471c844b8ce395f7b6d8398139e26ddca9ebc171a8b91342ee15a19963f4", [:rebar3], [{:certifi, "2.5.1", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "6.0.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "1.0.1", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~>1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "1.1.4", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}], "hexpm"},
"honeydew": {:hex, :honeydew, "1.4.3", "f2d976aaf8b9b914a635d2d483f1a71d2f6d8651809474dd5db581953cbebb30", [:mix], [{:ecto, "~> 3.0", [hex: :ecto, repo: "hexpm", optional: true]}], "hexpm"},
"httpoison": {:hex, :httpoison, "1.5.1", "0f55b5b673b03c5c327dac7015a67cb571b99b631acc0bc1b0b98dcd6b9f2104", [:mix], [{:hackney, "~> 1.8", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm"},