fix domain cleaning
This commit is contained in:
parent
cee52de603
commit
1bf600b1ac
|
@ -198,8 +198,8 @@ defmodule Backend.Crawler do
|
|||
|> Enum.map(&clean_domain(&1))
|
||||
|
||||
if not Enum.all?(peers_domains, &is_valid_domain?(&1)) do
|
||||
invalid_peers = Enum.filter(peers_domains, &is_valid_domain?(&1))
|
||||
raise "#{domain} has invalid peers: #{inspect(invalid_peers)}"
|
||||
invalid_peers = Enum.filter(peers_domains, fn d -> not is_valid_domain?(d) end)
|
||||
raise "#{domain} has invalid peers: #{Enum.join(invalid_peers, ", ")}"
|
||||
end
|
||||
|
||||
peers =
|
||||
|
|
|
@ -77,14 +77,14 @@ defmodule Backend.Crawler.Crawlers.Friendica do
|
|||
|
||||
defp to_domain(url) do
|
||||
url
|
||||
|> strip_prefix("http://")
|
||||
|> strip_prefix("https://")
|
||||
|> String.replace_prefix("http://", "")
|
||||
|> String.replace_prefix("https://", "")
|
||||
|> strip_username()
|
||||
end
|
||||
|
||||
# Sometimes a URL at the poco/@server endpoint has the form username@domain.tld, in which case we only want domain.tld.
|
||||
defp strip_username(string) do
|
||||
[_match, _username, domain] = Regex.run(~r/([\w-_]+@)?([\w.-_]+)/, string)
|
||||
[_match, _username, domain] = Regex.run(~r/([\w\-_]+@)?([\w\.\-_]+)/, string)
|
||||
domain
|
||||
end
|
||||
end
|
||||
|
|
|
@ -205,7 +205,7 @@ defmodule Backend.Crawler.Crawlers.Misskey do
|
|||
defp extract_mentions_from_status(status) do
|
||||
status_content = Map.get(status, "text")
|
||||
|
||||
Regex.scan(~r/@\w+@([\w._-]+)/, status_content)
|
||||
Regex.scan(~r/@\w+@([\w\._\-]+)/, status_content)
|
||||
|> Enum.map(fn [_match, domain] -> domain end)
|
||||
|> Enum.reduce(%{}, fn domain, acc ->
|
||||
Map.update(acc, domain, 1, &(&1 + 1))
|
||||
|
|
|
@ -9,7 +9,7 @@ defmodule Backend.Crawler.Util do
|
|||
# (e.g. https://mastodon.social/@demouser or https://pleroma.site/users/demouser)
|
||||
@spec get_domain(String.t()) :: String.t()
|
||||
def get_domain(url) do
|
||||
[_match, domain] = Regex.run(~r/https?:\/\/([\w.-_]+)\/.*/, url)
|
||||
[_match, domain] = Regex.run(~r/https?:\/\/([\w\.\-_]+)\/.*/, url)
|
||||
domain
|
||||
end
|
||||
|
||||
|
|
|
@ -129,10 +129,14 @@ defmodule Backend.Util do
|
|||
end
|
||||
|
||||
def clean_domain(domain) do
|
||||
domain
|
||||
|> String.replace_prefix("https://", "")
|
||||
|> String.trim_trailing("/")
|
||||
|> String.downcase()
|
||||
cleaned =
|
||||
domain
|
||||
|> String.replace_prefix("https://", "")
|
||||
|> String.trim_trailing("/")
|
||||
|> String.trim()
|
||||
|> String.downcase()
|
||||
|
||||
Regex.replace(~r/:\d+/, cleaned, "")
|
||||
end
|
||||
|
||||
def get_account(username, domain) do
|
||||
|
@ -203,21 +207,8 @@ defmodule Backend.Util do
|
|||
end
|
||||
end
|
||||
|
||||
@doc """
|
||||
Strips `prefix` from `string`. If it doesn't start with that prefix, just returns the string.
|
||||
"""
|
||||
@spec strip_prefix(String.t(), String.t()) :: String.t()
|
||||
def strip_prefix(string, prefix) do
|
||||
if String.starts_with?(string, prefix) do
|
||||
prefix_length = String.length(prefix)
|
||||
String.slice(string, prefix_length..-1)
|
||||
else
|
||||
string
|
||||
end
|
||||
end
|
||||
|
||||
@spec is_valid_domain?(String.t()) :: boolean
|
||||
def is_valid_domain?(domain) do
|
||||
Regex.match?(~r/[\w.-_]+/, domain)
|
||||
Regex.match?(~r/^[\w\.\-_]+$/, domain)
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Reference in a new issue