From 2cae5dccbf73e4f0f98b1406ec9ff3620c6eb97f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tao=20Bojl=C3=A9n?= <2803708-taobojlen@users.noreply.gitlab.com>
Date: Fri, 19 Jul 2019 20:00:28 +0000
Subject: [PATCH] check robots.txt for permission to crawl

---
Review note: the fallback save/1 clause records the crawl error it was given
(e.g. "HTTPoison error: ...") instead of replacing every non-nil error with
the literal "unknown error". The stored value is "robots.txt" when crawling
is disallowed and "no api found" when no error was set.

 backend/lib/backend/crawler/api_crawler.ex    |  6 +++
 backend/lib/backend/crawler/crawler.ex        | 54 +++++++++++++------
 .../lib/backend/crawler/crawlers/mastodon.ex  | 15 ++++++
 backend/mix.exs                               |  5 +-
 backend/mix.lock                              |  1 +
 5 files changed, 64 insertions(+), 17 deletions(-)

diff --git a/backend/lib/backend/crawler/api_crawler.ex b/backend/lib/backend/crawler/api_crawler.ex
index 059914b..1b9f45a 100644
--- a/backend/lib/backend/crawler/api_crawler.ex
+++ b/backend/lib/backend/crawler/api_crawler.ex
@@ -38,6 +38,12 @@ defmodule Backend.Crawler.ApiCrawler do
   Check whether the instance at the given domain is of the type that this ApiCrawler implements.
   """
   @callback is_instance_type?(String.t()) :: boolean()
+
+  @doc """
+  Check whether the instance allows crawling according to its robots.txt or otherwise.
+  """
+  @callback allows_crawling?(String.t()) :: boolean()
+
   @doc """
   Crawl the instance at the given domain.
   """
diff --git a/backend/lib/backend/crawler/crawler.ex b/backend/lib/backend/crawler/crawler.ex
index 846acee..cc26661 100644
--- a/backend/lib/backend/crawler/crawler.ex
+++ b/backend/lib/backend/crawler/crawler.ex
@@ -17,6 +17,7 @@ defmodule Backend.Crawler do
     # a list of ApiCrawlers that will be attempted
     :api_crawlers,
     :found_api?,
+    :allows_crawling?,
     :result,
     :error
   ]
@@ -25,6 +26,7 @@ defmodule Backend.Crawler do
           domain: String.t(),
           api_crawlers: [ApiCrawler.t()],
           found_api?: boolean,
+          allows_crawling?: boolean,
           result: ApiCrawler.t() | nil,
           error: String.t() | nil
         }
@@ -32,7 +34,15 @@ defmodule Backend.Crawler do
   def run(domain) do
     Logger.info("Crawling #{domain}...")
     HTTPoison.start()
-    state = %Crawler{domain: domain, api_crawlers: [], found_api?: false, result: nil, error: nil}
+
+    state = %Crawler{
+      domain: domain,
+      api_crawlers: [],
+      found_api?: false,
+      allows_crawling?: true,
+      result: nil,
+      error: nil
+    }
 
     state
     # register APICrawlers here
@@ -59,17 +69,22 @@ defmodule Backend.Crawler do
       Logger.debug("Found #{curr} instance")
       state = Map.put(state, :found_api?, true)
 
-      try do
-        %Crawler{state | result: curr.crawl(domain), api_crawlers: []}
-      rescue
-        e in HTTPoison.Error ->
-          Map.put(state, :error, "HTTPoison error: " <> HTTPoison.Error.message(e))
+      if curr.allows_crawling?(domain) do
+        try do
+          %Crawler{state | result: curr.crawl(domain), api_crawlers: []}
+        rescue
+          e in HTTPoison.Error ->
+            Map.put(state, :error, "HTTPoison error: " <> HTTPoison.Error.message(e))
 
-        e in Jason.DecodeError ->
-          Map.put(state, :error, "Jason DecodeError: " <> Jason.DecodeError.message(e))
+          e in Jason.DecodeError ->
+            Map.put(state, :error, "Jason DecodeError: " <> Jason.DecodeError.message(e))
 
-        e in _ ->
-          Map.put(state, :error, "Unknown error: " <> inspect(e))
+          e in _ ->
+            Map.put(state, :error, "Unknown error: " <> inspect(e))
+        end
+      else
+        Logger.debug("#{domain} does not allow crawling.")
+        Map.put(state, :allows_crawling?, false)
       end
     else
       # Nothing found so check the next APICrawler
@@ -79,7 +94,13 @@ defmodule Backend.Crawler do
   end
 
   # Save the state (after crawling) to the database.
-  defp save(%Crawler{domain: domain, result: result, found_api?: true, error: nil}) do
+  defp save(%Crawler{
+         domain: domain,
+         result: result,
+         found_api?: true,
+         error: nil,
+         allows_crawling?: true
+       }) do
     now = get_now()
 
     ## Update the instance we crawled ##
@@ -187,10 +208,13 @@ defmodule Backend.Crawler do
     |> Repo.insert_all(interactions)
   end
 
-  defp save(%{domain: domain, error: error}) do
-    if error == nil do
-      error = "no api found"
-    end
+  defp save(%{domain: domain, error: error, allows_crawling?: allows_crawling}) do
+    error =
+      cond do
+        not allows_crawling -> "robots.txt"
+        error == nil -> "no api found"
+        true -> error
+      end
 
     Repo.insert!(%Crawl{
       instance_domain: domain,
diff --git a/backend/lib/backend/crawler/crawlers/mastodon.ex b/backend/lib/backend/crawler/crawlers/mastodon.ex
index f3af190..edd6bcb 100644
--- a/backend/lib/backend/crawler/crawlers/mastodon.ex
+++ b/backend/lib/backend/crawler/crawlers/mastodon.ex
@@ -14,6 +14,21 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
     end
   end
 
+  @impl ApiCrawler
+  def allows_crawling?(domain) do
+    endpoints = [
+      "/api/v1/instance",
+      "/api/v1/instance/peers",
+      "/api/v1/timelines/public"
+    ]
+
+    user_agent = get_config(:user_agent)
+
+    endpoints
+    |> Enum.map(fn endpoint -> "https://#{domain}#{endpoint}" end)
+    |> Enum.all?(fn endpoint -> Gollum.crawlable?(user_agent, endpoint) != :uncrawlable end)
+  end
+
   @impl ApiCrawler
   # sobelow_skip ["DOS.StringToAtom"]
   def crawl(domain) do
diff --git a/backend/mix.exs b/backend/mix.exs
index fc4deec..7e6f334 100644
--- a/backend/mix.exs
+++ b/backend/mix.exs
@@ -20,7 +20,7 @@ defmodule Backend.MixProject do
   def application do
     [
       mod: {Backend.Application, []},
-      extra_applications: [:logger, :runtime_tools, :mnesia]
+      extra_applications: [:logger, :runtime_tools, :mnesia, :gollum]
     ]
   end
 
@@ -46,7 +46,8 @@ defmodule Backend.MixProject do
       {:honeydew, "~> 1.4.3"},
       {:quantum, "~> 2.3"},
       {:corsica, "~> 1.1.2"},
-      {:sobelow, "~> 0.8", only: :dev}
+      {:sobelow, "~> 0.8", only: :dev},
+      {:gollum, "~> 0.3.2"}
     ]
   end
 
diff --git a/backend/mix.lock b/backend/mix.lock
index bc544f0..4c81fc3 100644
--- a/backend/mix.lock
+++ b/backend/mix.lock
@@ -15,6 +15,7 @@
   "gen_stage": {:hex, :gen_stage, "0.14.2", "6a2a578a510c5bfca8a45e6b27552f613b41cf584b58210f017088d3d17d0b14", [:mix], [], "hexpm"},
   "gen_state_machine": {:hex, :gen_state_machine, "2.0.5", "9ac15ec6e66acac994cc442dcc2c6f9796cf380ec4b08267223014be1c728a95", [:mix], [], "hexpm"},
   "gettext": {:hex, :gettext, "0.17.0", "abe21542c831887a2b16f4c94556db9c421ab301aee417b7c4fbde7fbdbe01ec", [:mix], [], "hexpm"},
+  "gollum": {:hex, :gollum, "0.3.2", "bd6a17febb98f5c362b5c5c08d9b102738fb875016e1eefec626f5e87c23f20e", [:mix], [{:httpoison, "~> 1.5.1", [hex: :httpoison, repo: "hexpm", optional: false]}], "hexpm"},
   "hackney": {:hex, :hackney, "1.15.1", "9f8f471c844b8ce395f7b6d8398139e26ddca9ebc171a8b91342ee15a19963f4", [:rebar3], [{:certifi, "2.5.1", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "6.0.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "1.0.1", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~>1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "1.1.4", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}], "hexpm"},
   "honeydew": {:hex, :honeydew, "1.4.3", "f2d976aaf8b9b914a635d2d483f1a71d2f6d8651809474dd5db581953cbebb30", [:mix], [{:ecto, "~> 3.0", [hex: :ecto, repo: "hexpm", optional: true]}], "hexpm"},
   "httpoison": {:hex, :httpoison, "1.5.1", "0f55b5b673b03c5c327dac7015a67cb571b99b631acc0bc1b0b98dcd6b9f2104", [:mix], [{:hackney, "~> 1.8", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm"},