check robots.txt for permission to crawl

Tao Bojlén 2019-07-19 20:00:28 +00:00
parent 97f8327336
commit 2cae5dccbf
5 changed files with 64 additions and 17 deletions


@@ -38,6 +38,12 @@ defmodule Backend.Crawler.ApiCrawler do
   Check whether the instance at the given domain is of the type that this ApiCrawler implements.
   """
   @callback is_instance_type?(String.t()) :: boolean()
+
+  @doc """
+  Check whether the instance allows crawling according to its robots.txt or otherwise.
+  """
+  @callback allows_crawling?(String.t()) :: boolean()
+
   @doc """
   Crawl the instance at the given domain.
   """


@@ -17,6 +17,7 @@ defmodule Backend.Crawler do
     # a list of ApiCrawlers that will be attempted
     :api_crawlers,
     :found_api?,
+    :allows_crawling?,
     :result,
     :error
   ]
@@ -25,6 +26,7 @@
           domain: String.t(),
           api_crawlers: [ApiCrawler.t()],
           found_api?: boolean,
+          allows_crawling?: boolean,
           result: ApiCrawler.t() | nil,
           error: String.t() | nil
         }
@@ -32,7 +34,15 @@
   def run(domain) do
     Logger.info("Crawling #{domain}...")
     HTTPoison.start()
-    state = %Crawler{domain: domain, api_crawlers: [], found_api?: false, result: nil, error: nil}
+
+    state = %Crawler{
+      domain: domain,
+      api_crawlers: [],
+      found_api?: false,
+      allows_crawling?: true,
+      result: nil,
+      error: nil
+    }

     state
     # register APICrawlers here
@@ -59,17 +69,22 @@
         Logger.debug("Found #{curr} instance")
         state = Map.put(state, :found_api?, true)

-        try do
-          %Crawler{state | result: curr.crawl(domain), api_crawlers: []}
-        rescue
-          e in HTTPoison.Error ->
-            Map.put(state, :error, "HTTPoison error: " <> HTTPoison.Error.message(e))
-
-          e in Jason.DecodeError ->
-            Map.put(state, :error, "Jason DecodeError: " <> Jason.DecodeError.message(e))
-
-          e in _ ->
-            Map.put(state, :error, "Unknown error: " <> inspect(e))
+        if curr.allows_crawling?(domain) do
+          try do
+            %Crawler{state | result: curr.crawl(domain), api_crawlers: []}
+          rescue
+            e in HTTPoison.Error ->
+              Map.put(state, :error, "HTTPoison error: " <> HTTPoison.Error.message(e))
+
+            e in Jason.DecodeError ->
+              Map.put(state, :error, "Jason DecodeError: " <> Jason.DecodeError.message(e))
+
+            e in _ ->
+              Map.put(state, :error, "Unknown error: " <> inspect(e))
+          end
+        else
+          Logger.debug("#{domain} does not allow crawling.")
+          Map.put(state, :allows_crawling?, false)
         end
       else
         # Nothing found so check the next APICrawler
@@ -79,7 +94,13 @@
   end

   # Save the state (after crawling) to the database.
-  defp save(%Crawler{domain: domain, result: result, found_api?: true, error: nil}) do
+  defp save(%Crawler{
+         domain: domain,
+         result: result,
+         found_api?: true,
+         error: nil,
+         allows_crawling?: true
+       }) do
     now = get_now()

     ## Update the instance we crawled ##
@@ -187,10 +208,13 @@
     |> Repo.insert_all(interactions)
   end

-  defp save(%{domain: domain, error: error}) do
-    if error == nil do
-      error = "no api found"
-    end
+  defp save(%{domain: domain, error: error, allows_crawling?: allows_crawling}) do
+    error =
+      cond do
+        not allows_crawling -> "robots.txt"
+        error == nil -> "no api found"
+        true -> "unknown error"
+      end

     Repo.insert!(%Crawl{
       instance_domain: domain,
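Note that `save/1` dispatches on pattern match: only a fully successful crawl (API found, no error, crawling allowed) hits the first clause, and everything else falls through to the second, so a robots.txt rejection is still persisted as a Crawl row with error set to "robots.txt". A rough sketch of querying such rows (Ecto; assumes the Crawl schema is the Backend.Crawl module aliased in this file, and that Backend.Repo is the repo):

import Ecto.Query

# Domains with at least one crawl rejected by robots.txt.
blocked_domains =
  from(c in Backend.Crawl,
    where: c.error == "robots.txt",
    distinct: true,
    select: c.instance_domain
  )
  |> Backend.Repo.all()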


@@ -14,6 +14,21 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
     end
   end

+  @impl ApiCrawler
+  def allows_crawling?(domain) do
+    endpoints = [
+      "/api/v1/instance",
+      "/api/v1/instance/peers",
+      "/api/v1/timelines/public"
+    ]
+
+    user_agent = get_config(:user_agent)
+
+    endpoints
+    |> Enum.map(fn endpoint -> "https://#{domain}#{endpoint}" end)
+    |> Enum.all?(fn endpoint -> Gollum.crawlable?(user_agent, endpoint) != :uncrawlable end)
+  end
+
   @impl ApiCrawler
   # sobelow_skip ["DOS.StringToAtom"]
   def crawl(domain) do
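Gollum fetches and caches robots.txt per host; per its docs, Gollum.crawlable?/2 returns :crawlable, :uncrawlable, or :undefined (no applicable rule, or no fetchable robots.txt). That is why the pipeline above rejects only on :uncrawlable: a missing or unreachable robots.txt is treated as permission to crawl. A rough iex sketch (the domain and user agent here are illustrative):

iex> Gollum.crawlable?("example-crawler", "https://example.com/api/v1/instance")
:crawlable
iex> Gollum.crawlable?("example-crawler", "https://example.com/admin")
:uncrawlable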


@@ -20,7 +20,7 @@ defmodule Backend.MixProject do
   def application do
     [
       mod: {Backend.Application, []},
-      extra_applications: [:logger, :runtime_tools, :mnesia]
+      extra_applications: [:logger, :runtime_tools, :mnesia, :gollum]
     ]
   end
@@ -46,7 +46,8 @@
       {:honeydew, "~> 1.4.3"},
       {:quantum, "~> 2.3"},
       {:corsica, "~> 1.1.2"},
-      {:sobelow, "~> 0.8", only: :dev}
+      {:sobelow, "~> 0.8", only: :dev},
+      {:gollum, "~> 0.3.2"}
     ]
   end
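Listing :gollum under extra_applications ensures Gollum's OTP application (which supervises its robots.txt cache) starts alongside the backend. Gollum also reads optional application config; a hedged sketch (key names taken from Gollum's README and worth verifying against the 0.3.2 release pinned below, values illustrative):

# config/config.exs (optional; these keys are assumptions, not part of this commit)
config :gollum,
  refresh_secs: 86_400,          # how long a fetched robots.txt stays cached
  user_agent: "example-crawler"  # UA sent when fetching robots.txt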


@@ -15,6 +15,7 @@
   "gen_stage": {:hex, :gen_stage, "0.14.2", "6a2a578a510c5bfca8a45e6b27552f613b41cf584b58210f017088d3d17d0b14", [:mix], [], "hexpm"},
   "gen_state_machine": {:hex, :gen_state_machine, "2.0.5", "9ac15ec6e66acac994cc442dcc2c6f9796cf380ec4b08267223014be1c728a95", [:mix], [], "hexpm"},
   "gettext": {:hex, :gettext, "0.17.0", "abe21542c831887a2b16f4c94556db9c421ab301aee417b7c4fbde7fbdbe01ec", [:mix], [], "hexpm"},
+  "gollum": {:hex, :gollum, "0.3.2", "bd6a17febb98f5c362b5c5c08d9b102738fb875016e1eefec626f5e87c23f20e", [:mix], [{:httpoison, "~> 1.5.1", [hex: :httpoison, repo: "hexpm", optional: false]}], "hexpm"},
   "hackney": {:hex, :hackney, "1.15.1", "9f8f471c844b8ce395f7b6d8398139e26ddca9ebc171a8b91342ee15a19963f4", [:rebar3], [{:certifi, "2.5.1", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "6.0.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "1.0.1", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~>1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "1.1.4", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}], "hexpm"},
   "honeydew": {:hex, :honeydew, "1.4.3", "f2d976aaf8b9b914a635d2d483f1a71d2f6d8651809474dd5db581953cbebb30", [:mix], [{:ecto, "~> 3.0", [hex: :ecto, repo: "hexpm", optional: true]}], "hexpm"},
   "httpoison": {:hex, :httpoison, "1.5.1", "0f55b5b673b03c5c327dac7015a67cb571b99b631acc0bc1b0b98dcd6b9f2104", [:mix], [{:hackney, "~> 1.8", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm"},