From 1bf600b1ac8ced0abef539a5f8a9808115b5ae6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tao=20Bror=20Bojl=C3=A9n?= Date: Thu, 22 Aug 2019 11:29:39 +0200 Subject: [PATCH] fix domain cleaning --- backend/lib/backend/crawler/crawler.ex | 4 +-- .../lib/backend/crawler/crawlers/friendica.ex | 6 ++--- .../lib/backend/crawler/crawlers/misskey.ex | 2 +- backend/lib/backend/crawler/util.ex | 2 +- backend/lib/backend/util.ex | 27 +++++++------------ 5 files changed, 16 insertions(+), 25 deletions(-) diff --git a/backend/lib/backend/crawler/crawler.ex b/backend/lib/backend/crawler/crawler.ex index 8971470..d5f661a 100644 --- a/backend/lib/backend/crawler/crawler.ex +++ b/backend/lib/backend/crawler/crawler.ex @@ -198,8 +198,8 @@ defmodule Backend.Crawler do |> Enum.map(&clean_domain(&1)) if not Enum.all?(peers_domains, &is_valid_domain?(&1)) do - invalid_peers = Enum.filter(peers_domains, &is_valid_domain?(&1)) - raise "#{domain} has invalid peers: #{inspect(invalid_peers)}" + invalid_peers = Enum.filter(peers_domains, fn d -> not is_valid_domain?(d) end) + raise "#{domain} has invalid peers: #{Enum.join(invalid_peers, ", ")}" end peers = diff --git a/backend/lib/backend/crawler/crawlers/friendica.ex b/backend/lib/backend/crawler/crawlers/friendica.ex index 9985bdc..4a624c9 100644 --- a/backend/lib/backend/crawler/crawlers/friendica.ex +++ b/backend/lib/backend/crawler/crawlers/friendica.ex @@ -77,14 +77,14 @@ defmodule Backend.Crawler.Crawlers.Friendica do defp to_domain(url) do url - |> strip_prefix("http://") - |> strip_prefix("https://") + |> String.replace_prefix("http://", "") + |> String.replace_prefix("https://", "") |> strip_username() end # Sometimes a url at the poco/@server endpoint has the form username@domain.tld, in which case we only want domain.tld defp strip_username(string) do - [_match, _username, domain] = Regex.run(~r/([\w-_]+@)?([\w.-_]+)/, string) + [_match, _username, domain] = Regex.run(~r/([\w\-_]+@)?([\w\.\-_]+)/, string) domain end end diff --git a/backend/lib/backend/crawler/crawlers/misskey.ex b/backend/lib/backend/crawler/crawlers/misskey.ex index 40550ba..191f003 100644 --- a/backend/lib/backend/crawler/crawlers/misskey.ex +++ b/backend/lib/backend/crawler/crawlers/misskey.ex @@ -205,7 +205,7 @@ defmodule Backend.Crawler.Crawlers.Misskey do defp extract_mentions_from_status(status) do status_content = Map.get(status, "text") - Regex.scan(~r/@\w+@([\w._-]+)/, status_content) + Regex.scan(~r/@\w+@([\w\._\-]+)/, status_content) |> Enum.map(fn [_match, domain] -> domain end) |> Enum.reduce(%{}, fn domain, acc -> Map.update(acc, domain, 1, &(&1 + 1)) diff --git a/backend/lib/backend/crawler/util.ex b/backend/lib/backend/crawler/util.ex index d859e14..2274cdd 100644 --- a/backend/lib/backend/crawler/util.ex +++ b/backend/lib/backend/crawler/util.ex @@ -9,7 +9,7 @@ defmodule Backend.Crawler.Util do # (e.g. https://mastodon.social/@demouser or https://pleroma.site/users/demouser) @spec get_domain(String.t()) :: String.t() def get_domain(url) do - [_match, domain] = Regex.run(~r/https?:\/\/([\w.-_]+)\/.*/, url) + [_match, domain] = Regex.run(~r/https?:\/\/([\w\.\-_]+)\/.*/, url) domain end diff --git a/backend/lib/backend/util.ex b/backend/lib/backend/util.ex index c53476a..225dbbd 100644 --- a/backend/lib/backend/util.ex +++ b/backend/lib/backend/util.ex @@ -129,10 +129,14 @@ defmodule Backend.Util do end def clean_domain(domain) do - domain - |> String.replace_prefix("https://", "") - |> String.trim_trailing("/") - |> String.downcase() + cleaned = + domain + |> String.replace_prefix("https://", "") + |> String.trim_trailing("/") + |> String.trim() + |> String.downcase() + + Regex.replace(~r/:\d+/, cleaned, "") end def get_account(username, domain) do @@ -203,21 +207,8 @@ defmodule Backend.Util do end end - @doc """ - Strips `prefix` from `string`. If it doesn't start with that prefix, just returns the string. - """ - @spec strip_prefix(String.t(), String.t()) :: String.t() - def strip_prefix(string, prefix) do - if String.starts_with?(string, prefix) do - prefix_length = String.length(prefix) - String.slice(string, prefix_length..-1) - else - string - end - end - @spec is_valid_domain?(String.t()) :: boolean def is_valid_domain?(domain) do - Regex.match?(~r/[\w.-_]+/, domain) + Regex.match?(~r/^[\w\.\-_]+$/, domain) end end