fix domain cleaning

This commit is contained in:
Tao Bror Bojlén 2019-08-22 11:29:39 +02:00
parent cee52de603
commit 1bf600b1ac
No known key found for this signature in database
GPG key ID: C6EC7AAB905F9E6F
5 changed files with 16 additions and 25 deletions

View file

@ -198,8 +198,8 @@ defmodule Backend.Crawler do
|> Enum.map(&clean_domain(&1)) |> Enum.map(&clean_domain(&1))
if not Enum.all?(peers_domains, &is_valid_domain?(&1)) do if not Enum.all?(peers_domains, &is_valid_domain?(&1)) do
invalid_peers = Enum.filter(peers_domains, &is_valid_domain?(&1)) invalid_peers = Enum.filter(peers_domains, fn d -> not is_valid_domain?(d) end)
raise "#{domain} has invalid peers: #{inspect(invalid_peers)}" raise "#{domain} has invalid peers: #{Enum.join(invalid_peers, ", ")}"
end end
peers = peers =

View file

@ -77,14 +77,14 @@ defmodule Backend.Crawler.Crawlers.Friendica do
defp to_domain(url) do defp to_domain(url) do
url url
|> strip_prefix("http://") |> String.replace_prefix("http://", "")
|> strip_prefix("https://") |> String.replace_prefix("https://", "")
|> strip_username() |> strip_username()
end end
# Sometimes a url at the poco/@server endpoint has the form username@domain.tld, in which case we only want domain.tld # Sometimes a url at the poco/@server endpoint has the form username@domain.tld, in which case we only want domain.tld
defp strip_username(string) do defp strip_username(string) do
[_match, _username, domain] = Regex.run(~r/([\w-_]+@)?([\w.-_]+)/, string) [_match, _username, domain] = Regex.run(~r/([\w\-_]+@)?([\w\.\-_]+)/, string)
domain domain
end end
end end

View file

@ -205,7 +205,7 @@ defmodule Backend.Crawler.Crawlers.Misskey do
defp extract_mentions_from_status(status) do defp extract_mentions_from_status(status) do
status_content = Map.get(status, "text") status_content = Map.get(status, "text")
Regex.scan(~r/@\w+@([\w._-]+)/, status_content) Regex.scan(~r/@\w+@([\w\._\-]+)/, status_content)
|> Enum.map(fn [_match, domain] -> domain end) |> Enum.map(fn [_match, domain] -> domain end)
|> Enum.reduce(%{}, fn domain, acc -> |> Enum.reduce(%{}, fn domain, acc ->
Map.update(acc, domain, 1, &(&1 + 1)) Map.update(acc, domain, 1, &(&1 + 1))

View file

@ -9,7 +9,7 @@ defmodule Backend.Crawler.Util do
# (e.g. https://mastodon.social/@demouser or https://pleroma.site/users/demouser) # (e.g. https://mastodon.social/@demouser or https://pleroma.site/users/demouser)
@spec get_domain(String.t()) :: String.t() @spec get_domain(String.t()) :: String.t()
def get_domain(url) do def get_domain(url) do
[_match, domain] = Regex.run(~r/https?:\/\/([\w.-_]+)\/.*/, url) [_match, domain] = Regex.run(~r/https?:\/\/([\w\.\-_]+)\/.*/, url)
domain domain
end end

View file

@ -129,10 +129,14 @@ defmodule Backend.Util do
end end
def clean_domain(domain) do def clean_domain(domain) do
domain cleaned =
|> String.replace_prefix("https://", "") domain
|> String.trim_trailing("/") |> String.replace_prefix("https://", "")
|> String.downcase() |> String.trim_trailing("/")
|> String.trim()
|> String.downcase()
Regex.replace(~r/:\d+/, cleaned, "")
end end
def get_account(username, domain) do def get_account(username, domain) do
@ -203,21 +207,8 @@ defmodule Backend.Util do
end end
end end
@doc """
Strips `prefix` from `string`. If it doesn't start with that prefix, just returns the string.
"""
@spec strip_prefix(String.t(), String.t()) :: String.t()
def strip_prefix(string, prefix) do
if String.starts_with?(string, prefix) do
prefix_length = String.length(prefix)
String.slice(string, prefix_length..-1)
else
string
end
end
@spec is_valid_domain?(String.t()) :: boolean @spec is_valid_domain?(String.t()) :: boolean
def is_valid_domain?(domain) do def is_valid_domain?(domain) do
Regex.match?(~r/[\w.-_]+/, domain) Regex.match?(~r/^[\w\.\-_]+$/, domain)
end end
end end