diff --git a/CHANGELOG.md b/CHANGELOG.md index 2afaf88..6ee4912 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added ON DELETE to `most_recent_crawl` table, such that it can handle previously-crawled but now-dead instances. - You can now login to the admin view by clicking, not just by pressing enter. +- Add handling for weirdly-formatted Friendica peers. ## [2.7.0 - 2018-08-18] diff --git a/backend/lib/backend/crawler/crawlers/friendica.ex b/backend/lib/backend/crawler/crawlers/friendica.ex index 13884ef..9985bdc 100644 --- a/backend/lib/backend/crawler/crawlers/friendica.ex +++ b/backend/lib/backend/crawler/crawlers/friendica.ex @@ -76,13 +76,15 @@ defmodule Backend.Crawler.Crawlers.Friendica do end defp to_domain(url) do - url = - cond do - String.starts_with?(url, "https://") -> String.slice(url, 8..-1) - String.starts_with?(url, "http://") -> String.slice(url, 7..-1) - true -> url - end - url + |> strip_prefix("http://") + |> strip_prefix("https://") + |> strip_username() + end + + # Sometimes a url at the poco/@server endpoint has the form username@domain.tld, in which case we only want domain.tld + defp strip_username(string) do + [_match, _username, domain] = Regex.run(~r/([\w._-]+@)?([\w._-]+)/, string) + domain end end diff --git a/backend/lib/backend/crawler/crawlers/misskey.ex b/backend/lib/backend/crawler/crawlers/misskey.ex index 24513ca..40550ba 100644 --- a/backend/lib/backend/crawler/crawlers/misskey.ex +++ b/backend/lib/backend/crawler/crawlers/misskey.ex @@ -205,7 +205,7 @@ defmodule Backend.Crawler.Crawlers.Misskey do defp extract_mentions_from_status(status) do status_content = Map.get(status, "text") - Regex.scan(~r/@\w+@([\w.-]+)/, status_content) + Regex.scan(~r/@\w+@([\w._-]+)/, status_content) |> Enum.map(fn [_match, domain] -> domain end) |> Enum.reduce(%{}, fn domain, acc -> Map.update(acc, domain, 1, &(&1 + 1)) diff --git 
a/backend/lib/backend/crawler/util.ex b/backend/lib/backend/crawler/util.ex index f013785..d859e14 100644 --- a/backend/lib/backend/crawler/util.ex +++ b/backend/lib/backend/crawler/util.ex @@ -9,7 +9,7 @@ defmodule Backend.Crawler.Util do # (e.g. https://mastodon.social/@demouser or https://pleroma.site/users/demouser) @spec get_domain(String.t()) :: String.t() def get_domain(url) do - [_match, domain] = Regex.run(~r/https?:\/\/([\w.-]+)\/.*/, url) + [_match, domain] = Regex.run(~r/https?:\/\/([\w._-]+)\/.*/, url) domain end diff --git a/backend/lib/backend/util.ex b/backend/lib/backend/util.ex index 7747450..c53476a 100644 --- a/backend/lib/backend/util.ex +++ b/backend/lib/backend/util.ex @@ -202,4 +202,22 @@ defmodule Backend.Util do {:error, error} -> raise error end end + + @doc """ + Strips `prefix` from `string`. If it doesn't start with that prefix, just returns the string. + """ + @spec strip_prefix(String.t(), String.t()) :: String.t() + def strip_prefix(string, prefix) do + if String.starts_with?(string, prefix) do + prefix_length = String.length(prefix) + String.slice(string, prefix_length..-1) + else + string + end + end + + @spec is_valid_domain?(String.t()) :: boolean + def is_valid_domain?(domain) do + Regex.match?(~r/\A[\w._-]+\z/, domain) + end end diff --git a/backend/lib/backend_web/controllers/instance_controller.ex b/backend/lib/backend_web/controllers/instance_controller.ex index 56d5f59..af72d67 100644 --- a/backend/lib/backend_web/controllers/instance_controller.ex +++ b/backend/lib/backend_web/controllers/instance_controller.ex @@ -1,7 +1,5 @@ defmodule BackendWeb.InstanceController do use BackendWeb, :controller - - import Backend.Util alias Graph.Cache action_fallback(BackendWeb.FallbackController)