check robots.txt for permission to crawl
This commit is contained in:
parent
97f8327336
commit
2cae5dccbf
|
@ -38,6 +38,12 @@ defmodule Backend.Crawler.ApiCrawler do
|
||||||
Check whether the instance at the given domain is of the type that this ApiCrawler implements.
|
Check whether the instance at the given domain is of the type that this ApiCrawler implements.
|
||||||
"""
|
"""
|
||||||
@callback is_instance_type?(String.t()) :: boolean()
|
@callback is_instance_type?(String.t()) :: boolean()
|
||||||
|
|
||||||
|
@doc """
|
||||||
|
Check whether the instance allows crawling according to its robots.txt or otherwise.
|
||||||
|
"""
|
||||||
|
@callback allows_crawling?(String.t()) :: boolean()
|
||||||
|
|
||||||
@doc """
|
@doc """
|
||||||
Crawl the instance at the given domain.
|
Crawl the instance at the given domain.
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -17,6 +17,7 @@ defmodule Backend.Crawler do
|
||||||
# a list of ApiCrawlers that will be attempted
|
# a list of ApiCrawlers that will be attempted
|
||||||
:api_crawlers,
|
:api_crawlers,
|
||||||
:found_api?,
|
:found_api?,
|
||||||
|
:allows_crawling?,
|
||||||
:result,
|
:result,
|
||||||
:error
|
:error
|
||||||
]
|
]
|
||||||
|
@ -25,6 +26,7 @@ defmodule Backend.Crawler do
|
||||||
domain: String.t(),
|
domain: String.t(),
|
||||||
api_crawlers: [ApiCrawler.t()],
|
api_crawlers: [ApiCrawler.t()],
|
||||||
found_api?: boolean,
|
found_api?: boolean,
|
||||||
|
allows_crawling?: boolean,
|
||||||
result: ApiCrawler.t() | nil,
|
result: ApiCrawler.t() | nil,
|
||||||
error: String.t() | nil
|
error: String.t() | nil
|
||||||
}
|
}
|
||||||
|
@ -32,7 +34,15 @@ defmodule Backend.Crawler do
|
||||||
def run(domain) do
|
def run(domain) do
|
||||||
Logger.info("Crawling #{domain}...")
|
Logger.info("Crawling #{domain}...")
|
||||||
HTTPoison.start()
|
HTTPoison.start()
|
||||||
state = %Crawler{domain: domain, api_crawlers: [], found_api?: false, result: nil, error: nil}
|
|
||||||
|
state = %Crawler{
|
||||||
|
domain: domain,
|
||||||
|
api_crawlers: [],
|
||||||
|
found_api?: false,
|
||||||
|
allows_crawling?: true,
|
||||||
|
result: nil,
|
||||||
|
error: nil
|
||||||
|
}
|
||||||
|
|
||||||
state
|
state
|
||||||
# register APICrawlers here
|
# register APICrawlers here
|
||||||
|
@ -59,6 +69,7 @@ defmodule Backend.Crawler do
|
||||||
Logger.debug("Found #{curr} instance")
|
Logger.debug("Found #{curr} instance")
|
||||||
state = Map.put(state, :found_api?, true)
|
state = Map.put(state, :found_api?, true)
|
||||||
|
|
||||||
|
if curr.allows_crawling?(domain) do
|
||||||
try do
|
try do
|
||||||
%Crawler{state | result: curr.crawl(domain), api_crawlers: []}
|
%Crawler{state | result: curr.crawl(domain), api_crawlers: []}
|
||||||
rescue
|
rescue
|
||||||
|
@ -71,6 +82,10 @@ defmodule Backend.Crawler do
|
||||||
e in _ ->
|
e in _ ->
|
||||||
Map.put(state, :error, "Unknown error: " <> inspect(e))
|
Map.put(state, :error, "Unknown error: " <> inspect(e))
|
||||||
end
|
end
|
||||||
|
else
|
||||||
|
Logger.debug("#{domain} does not allow crawling.")
|
||||||
|
Map.put(state, :allows_crawling?, false)
|
||||||
|
end
|
||||||
else
|
else
|
||||||
# Nothing found so check the next APICrawler
|
# Nothing found so check the next APICrawler
|
||||||
Logger.debug("#{domain} is not an instance of #{curr}")
|
Logger.debug("#{domain} is not an instance of #{curr}")
|
||||||
|
@ -79,7 +94,13 @@ defmodule Backend.Crawler do
|
||||||
end
|
end
|
||||||
|
|
||||||
# Save the state (after crawling) to the database.
|
# Save the state (after crawling) to the database.
|
||||||
defp save(%Crawler{domain: domain, result: result, found_api?: true, error: nil}) do
|
defp save(%Crawler{
|
||||||
|
domain: domain,
|
||||||
|
result: result,
|
||||||
|
found_api?: true,
|
||||||
|
error: nil,
|
||||||
|
allows_crawling?: true
|
||||||
|
}) do
|
||||||
now = get_now()
|
now = get_now()
|
||||||
|
|
||||||
## Update the instance we crawled ##
|
## Update the instance we crawled ##
|
||||||
|
@ -187,9 +208,12 @@ defmodule Backend.Crawler do
|
||||||
|> Repo.insert_all(interactions)
|
|> Repo.insert_all(interactions)
|
||||||
end
|
end
|
||||||
|
|
||||||
defp save(%{domain: domain, error: error}) do
|
defp save(%{domain: domain, error: error, allows_crawling?: allows_crawling}) do
|
||||||
if error == nil do
|
error =
|
||||||
error = "no api found"
|
cond do
|
||||||
|
not allows_crawling -> "robots.txt"
|
||||||
|
error == nil -> "no api found"
|
||||||
|
true -> "unknown error"
|
||||||
end
|
end
|
||||||
|
|
||||||
Repo.insert!(%Crawl{
|
Repo.insert!(%Crawl{
|
||||||
|
|
|
@ -14,6 +14,21 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@impl ApiCrawler
|
||||||
|
def allows_crawling?(domain) do
|
||||||
|
endpoints = [
|
||||||
|
"/api/v1/instance",
|
||||||
|
"/api/v1/instance/peers",
|
||||||
|
"/api/v1/timelines/public"
|
||||||
|
]
|
||||||
|
|
||||||
|
user_agent = get_config(:user_agent)
|
||||||
|
|
||||||
|
endpoints
|
||||||
|
|> Enum.map(fn endpoint -> "https://#{domain}#{endpoint}" end)
|
||||||
|
|> Enum.all?(fn endpoint -> Gollum.crawlable?(user_agent, endpoint) != :uncrawlable end)
|
||||||
|
end
|
||||||
|
|
||||||
@impl ApiCrawler
|
@impl ApiCrawler
|
||||||
# sobelow_skip ["DOS.StringToAtom"]
|
# sobelow_skip ["DOS.StringToAtom"]
|
||||||
def crawl(domain) do
|
def crawl(domain) do
|
||||||
|
|
|
@ -20,7 +20,7 @@ defmodule Backend.MixProject do
|
||||||
def application do
|
def application do
|
||||||
[
|
[
|
||||||
mod: {Backend.Application, []},
|
mod: {Backend.Application, []},
|
||||||
extra_applications: [:logger, :runtime_tools, :mnesia]
|
extra_applications: [:logger, :runtime_tools, :mnesia, :gollum]
|
||||||
]
|
]
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -46,7 +46,8 @@ defmodule Backend.MixProject do
|
||||||
{:honeydew, "~> 1.4.3"},
|
{:honeydew, "~> 1.4.3"},
|
||||||
{:quantum, "~> 2.3"},
|
{:quantum, "~> 2.3"},
|
||||||
{:corsica, "~> 1.1.2"},
|
{:corsica, "~> 1.1.2"},
|
||||||
{:sobelow, "~> 0.8", only: :dev}
|
{:sobelow, "~> 0.8", only: :dev},
|
||||||
|
{:gollum, "~> 0.3.2"}
|
||||||
]
|
]
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
|
@ -15,6 +15,7 @@
|
||||||
"gen_stage": {:hex, :gen_stage, "0.14.2", "6a2a578a510c5bfca8a45e6b27552f613b41cf584b58210f017088d3d17d0b14", [:mix], [], "hexpm"},
|
"gen_stage": {:hex, :gen_stage, "0.14.2", "6a2a578a510c5bfca8a45e6b27552f613b41cf584b58210f017088d3d17d0b14", [:mix], [], "hexpm"},
|
||||||
"gen_state_machine": {:hex, :gen_state_machine, "2.0.5", "9ac15ec6e66acac994cc442dcc2c6f9796cf380ec4b08267223014be1c728a95", [:mix], [], "hexpm"},
|
"gen_state_machine": {:hex, :gen_state_machine, "2.0.5", "9ac15ec6e66acac994cc442dcc2c6f9796cf380ec4b08267223014be1c728a95", [:mix], [], "hexpm"},
|
||||||
"gettext": {:hex, :gettext, "0.17.0", "abe21542c831887a2b16f4c94556db9c421ab301aee417b7c4fbde7fbdbe01ec", [:mix], [], "hexpm"},
|
"gettext": {:hex, :gettext, "0.17.0", "abe21542c831887a2b16f4c94556db9c421ab301aee417b7c4fbde7fbdbe01ec", [:mix], [], "hexpm"},
|
||||||
|
"gollum": {:hex, :gollum, "0.3.2", "bd6a17febb98f5c362b5c5c08d9b102738fb875016e1eefec626f5e87c23f20e", [:mix], [{:httpoison, "~> 1.5.1", [hex: :httpoison, repo: "hexpm", optional: false]}], "hexpm"},
|
||||||
"hackney": {:hex, :hackney, "1.15.1", "9f8f471c844b8ce395f7b6d8398139e26ddca9ebc171a8b91342ee15a19963f4", [:rebar3], [{:certifi, "2.5.1", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "6.0.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "1.0.1", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~>1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "1.1.4", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}], "hexpm"},
|
"hackney": {:hex, :hackney, "1.15.1", "9f8f471c844b8ce395f7b6d8398139e26ddca9ebc171a8b91342ee15a19963f4", [:rebar3], [{:certifi, "2.5.1", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "6.0.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "1.0.1", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~>1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "1.1.4", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}], "hexpm"},
|
||||||
"honeydew": {:hex, :honeydew, "1.4.3", "f2d976aaf8b9b914a635d2d483f1a71d2f6d8651809474dd5db581953cbebb30", [:mix], [{:ecto, "~> 3.0", [hex: :ecto, repo: "hexpm", optional: true]}], "hexpm"},
|
"honeydew": {:hex, :honeydew, "1.4.3", "f2d976aaf8b9b914a635d2d483f1a71d2f6d8651809474dd5db581953cbebb30", [:mix], [{:ecto, "~> 3.0", [hex: :ecto, repo: "hexpm", optional: true]}], "hexpm"},
|
||||||
"httpoison": {:hex, :httpoison, "1.5.1", "0f55b5b673b03c5c327dac7015a67cb571b99b631acc0bc1b0b98dcd6b9f2104", [:mix], [{:hackney, "~> 1.8", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm"},
|
"httpoison": {:hex, :httpoison, "1.5.1", "0f55b5b673b03c5c327dac7015a67cb571b99b631acc0bc1b0b98dcd6b9f2104", [:mix], [{:hackney, "~> 1.8", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm"},
|
||||||
|
|
Loading…
Reference in a new issue