check robots.txt for permission to crawl
This commit is contained in:
parent
97f8327336
commit
2cae5dccbf
|
@ -38,6 +38,12 @@ defmodule Backend.Crawler.ApiCrawler do
|
|||
Check whether the instance at the given domain is of the type that this ApiCrawler implements.
|
||||
"""
|
||||
@callback is_instance_type?(String.t()) :: boolean()
|
||||
|
||||
@doc """
|
||||
Check whether the instance at the given domain permits crawling, for example according to
its robots.txt. Returns `true` when crawling is permitted and `false` otherwise.
|
||||
"""
|
||||
@callback allows_crawling?(String.t()) :: boolean()
|
||||
|
||||
@doc """
|
||||
Crawl the instance at the given domain.
|
||||
"""
|
||||
|
|
|
@ -17,6 +17,7 @@ defmodule Backend.Crawler do
|
|||
# a list of ApiCrawlers that will be attempted
|
||||
:api_crawlers,
|
||||
:found_api?,
|
||||
:allows_crawling?,
|
||||
:result,
|
||||
:error
|
||||
]
|
||||
|
@ -25,6 +26,7 @@ defmodule Backend.Crawler do
|
|||
domain: String.t(),
|
||||
api_crawlers: [ApiCrawler.t()],
|
||||
found_api?: boolean,
|
||||
allows_crawling?: boolean,
|
||||
result: ApiCrawler.t() | nil,
|
||||
error: String.t() | nil
|
||||
}
|
||||
|
@ -32,7 +34,15 @@ defmodule Backend.Crawler do
|
|||
def run(domain) do
|
||||
Logger.info("Crawling #{domain}...")
|
||||
HTTPoison.start()
|
||||
state = %Crawler{domain: domain, api_crawlers: [], found_api?: false, result: nil, error: nil}
|
||||
|
||||
state = %Crawler{
|
||||
domain: domain,
|
||||
api_crawlers: [],
|
||||
found_api?: false,
|
||||
allows_crawling?: true,
|
||||
result: nil,
|
||||
error: nil
|
||||
}
|
||||
|
||||
state
|
||||
# register ApiCrawlers here
|
||||
|
@ -59,17 +69,22 @@ defmodule Backend.Crawler do
|
|||
Logger.debug("Found #{curr} instance")
|
||||
state = Map.put(state, :found_api?, true)
|
||||
|
||||
try do
|
||||
%Crawler{state | result: curr.crawl(domain), api_crawlers: []}
|
||||
rescue
|
||||
e in HTTPoison.Error ->
|
||||
Map.put(state, :error, "HTTPoison error: " <> HTTPoison.Error.message(e))
|
||||
if curr.allows_crawling?(domain) do
|
||||
try do
|
||||
%Crawler{state | result: curr.crawl(domain), api_crawlers: []}
|
||||
rescue
|
||||
e in HTTPoison.Error ->
|
||||
Map.put(state, :error, "HTTPoison error: " <> HTTPoison.Error.message(e))
|
||||
|
||||
e in Jason.DecodeError ->
|
||||
Map.put(state, :error, "Jason DecodeError: " <> Jason.DecodeError.message(e))
|
||||
e in Jason.DecodeError ->
|
||||
Map.put(state, :error, "Jason DecodeError: " <> Jason.DecodeError.message(e))
|
||||
|
||||
e in _ ->
|
||||
Map.put(state, :error, "Unknown error: " <> inspect(e))
|
||||
e in _ ->
|
||||
Map.put(state, :error, "Unknown error: " <> inspect(e))
|
||||
end
|
||||
else
|
||||
Logger.debug("#{domain} does not allow crawling.")
|
||||
Map.put(state, :allows_crawling?, false)
|
||||
end
|
||||
else
|
||||
# Nothing found, so check the next ApiCrawler
|
||||
|
@ -79,7 +94,13 @@ defmodule Backend.Crawler do
|
|||
end
|
||||
|
||||
# Save the state (after crawling) to the database.
|
||||
defp save(%Crawler{domain: domain, result: result, found_api?: true, error: nil}) do
|
||||
defp save(%Crawler{
|
||||
domain: domain,
|
||||
result: result,
|
||||
found_api?: true,
|
||||
error: nil,
|
||||
allows_crawling?: true
|
||||
}) do
|
||||
now = get_now()
|
||||
|
||||
## Update the instance we crawled ##
|
||||
|
@ -187,10 +208,13 @@ defmodule Backend.Crawler do
|
|||
|> Repo.insert_all(interactions)
|
||||
end
|
||||
|
||||
defp save(%{domain: domain, error: error}) do
|
||||
if error == nil do
|
||||
error = "no api found"
|
||||
end
|
||||
defp save(%{domain: domain, error: error, allows_crawling?: allows_crawling}) do
|
||||
error =
|
||||
cond do
|
||||
not allows_crawling -> "robots.txt"
|
||||
error == nil -> "no api found"
|
||||
true -> "unknown error"
|
||||
end
|
||||
|
||||
Repo.insert!(%Crawl{
|
||||
instance_domain: domain,
|
||||
|
|
|
@ -14,6 +14,21 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
|
|||
end
|
||||
end
|
||||
|
||||
@impl ApiCrawler
# Reports whether each of the Mastodon API endpoints listed below may be fetched by the
# configured user agent, as judged by Gollum.crawlable?/2. Only an explicit :uncrawlable
# verdict counts as a refusal; any other answer is treated as permission.
def allows_crawling?(domain) do
  agent = get_config(:user_agent)

  # Build the full URL for a path and ask Gollum about it.
  permitted = fn path ->
    Gollum.crawlable?(agent, "https://#{domain}#{path}") != :uncrawlable
  end

  # Enum.all?/2 stops at the first endpoint that is refused.
  Enum.all?(
    ["/api/v1/instance", "/api/v1/instance/peers", "/api/v1/timelines/public"],
    permitted
  )
end
|
||||
|
||||
@impl ApiCrawler
|
||||
# sobelow_skip ["DOS.StringToAtom"]
|
||||
def crawl(domain) do
|
||||
|
|
|
@ -20,7 +20,7 @@ defmodule Backend.MixProject do
|
|||
def application do
|
||||
[
|
||||
mod: {Backend.Application, []},
|
||||
extra_applications: [:logger, :runtime_tools, :mnesia]
|
||||
extra_applications: [:logger, :runtime_tools, :mnesia, :gollum]
|
||||
]
|
||||
end
|
||||
|
||||
|
@ -46,7 +46,8 @@ defmodule Backend.MixProject do
|
|||
{:honeydew, "~> 1.4.3"},
|
||||
{:quantum, "~> 2.3"},
|
||||
{:corsica, "~> 1.1.2"},
|
||||
{:sobelow, "~> 0.8", only: :dev}
|
||||
{:sobelow, "~> 0.8", only: :dev},
|
||||
{:gollum, "~> 0.3.2"}
|
||||
]
|
||||
end
|
||||
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
"gen_stage": {:hex, :gen_stage, "0.14.2", "6a2a578a510c5bfca8a45e6b27552f613b41cf584b58210f017088d3d17d0b14", [:mix], [], "hexpm"},
|
||||
"gen_state_machine": {:hex, :gen_state_machine, "2.0.5", "9ac15ec6e66acac994cc442dcc2c6f9796cf380ec4b08267223014be1c728a95", [:mix], [], "hexpm"},
|
||||
"gettext": {:hex, :gettext, "0.17.0", "abe21542c831887a2b16f4c94556db9c421ab301aee417b7c4fbde7fbdbe01ec", [:mix], [], "hexpm"},
|
||||
"gollum": {:hex, :gollum, "0.3.2", "bd6a17febb98f5c362b5c5c08d9b102738fb875016e1eefec626f5e87c23f20e", [:mix], [{:httpoison, "~> 1.5.1", [hex: :httpoison, repo: "hexpm", optional: false]}], "hexpm"},
|
||||
"hackney": {:hex, :hackney, "1.15.1", "9f8f471c844b8ce395f7b6d8398139e26ddca9ebc171a8b91342ee15a19963f4", [:rebar3], [{:certifi, "2.5.1", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "6.0.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "1.0.1", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~>1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "1.1.4", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}], "hexpm"},
|
||||
"honeydew": {:hex, :honeydew, "1.4.3", "f2d976aaf8b9b914a635d2d483f1a71d2f6d8651809474dd5db581953cbebb30", [:mix], [{:ecto, "~> 3.0", [hex: :ecto, repo: "hexpm", optional: true]}], "hexpm"},
|
||||
"httpoison": {:hex, :httpoison, "1.5.1", "0f55b5b673b03c5c327dac7015a67cb571b99b631acc0bc1b0b98dcd6b9f2104", [:mix], [{:hackney, "~> 1.8", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm"},
|
||||
|
|
Loading…
Reference in a new issue