fix domain cleaning
This commit is contained in:
parent
cee52de603
commit
1bf600b1ac
|
@ -198,8 +198,8 @@ defmodule Backend.Crawler do
|
||||||
|> Enum.map(&clean_domain(&1))
|
|> Enum.map(&clean_domain(&1))
|
||||||
|
|
||||||
if not Enum.all?(peers_domains, &is_valid_domain?(&1)) do
|
if not Enum.all?(peers_domains, &is_valid_domain?(&1)) do
|
||||||
invalid_peers = Enum.filter(peers_domains, &is_valid_domain?(&1))
|
invalid_peers = Enum.filter(peers_domains, fn d -> not is_valid_domain?(d) end)
|
||||||
raise "#{domain} has invalid peers: #{inspect(invalid_peers)}"
|
raise "#{domain} has invalid peers: #{Enum.join(invalid_peers, ", ")}"
|
||||||
end
|
end
|
||||||
|
|
||||||
peers =
|
peers =
|
||||||
|
|
|
@ -77,14 +77,14 @@ defmodule Backend.Crawler.Crawlers.Friendica do
|
||||||
|
|
||||||
defp to_domain(url) do
|
defp to_domain(url) do
|
||||||
url
|
url
|
||||||
|> strip_prefix("http://")
|
|> String.replace_prefix("http://", "")
|
||||||
|> strip_prefix("https://")
|
|> String.replace_prefix("https://", "")
|
||||||
|> strip_username()
|
|> strip_username()
|
||||||
end
|
end
|
||||||
|
|
||||||
# Sometimes a url at the poco/@server endpoint has the form username@domain.tld, in which case we only want domain.tld
|
# Sometimes a url at the poco/@server endpoint has the form username@domain.tld, in which case we only want domain.tld
|
||||||
defp strip_username(string) do
|
defp strip_username(string) do
|
||||||
[_match, _username, domain] = Regex.run(~r/([\w-_]+@)?([\w.-_]+)/, string)
|
[_match, _username, domain] = Regex.run(~r/([\w\-_]+@)?([\w\.\-_]+)/, string)
|
||||||
domain
|
domain
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -205,7 +205,7 @@ defmodule Backend.Crawler.Crawlers.Misskey do
|
||||||
defp extract_mentions_from_status(status) do
|
defp extract_mentions_from_status(status) do
|
||||||
status_content = Map.get(status, "text")
|
status_content = Map.get(status, "text")
|
||||||
|
|
||||||
Regex.scan(~r/@\w+@([\w._-]+)/, status_content)
|
Regex.scan(~r/@\w+@([\w\._\-]+)/, status_content)
|
||||||
|> Enum.map(fn [_match, domain] -> domain end)
|
|> Enum.map(fn [_match, domain] -> domain end)
|
||||||
|> Enum.reduce(%{}, fn domain, acc ->
|
|> Enum.reduce(%{}, fn domain, acc ->
|
||||||
Map.update(acc, domain, 1, &(&1 + 1))
|
Map.update(acc, domain, 1, &(&1 + 1))
|
||||||
|
|
|
@ -9,7 +9,7 @@ defmodule Backend.Crawler.Util do
|
||||||
# (e.g. https://mastodon.social/@demouser or https://pleroma.site/users/demouser)
|
# (e.g. https://mastodon.social/@demouser or https://pleroma.site/users/demouser)
|
||||||
@spec get_domain(String.t()) :: String.t()
|
@spec get_domain(String.t()) :: String.t()
|
||||||
def get_domain(url) do
|
def get_domain(url) do
|
||||||
[_match, domain] = Regex.run(~r/https?:\/\/([\w.-_]+)\/.*/, url)
|
[_match, domain] = Regex.run(~r/https?:\/\/([\w\.\-_]+)\/.*/, url)
|
||||||
domain
|
domain
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
|
@ -129,10 +129,14 @@ defmodule Backend.Util do
|
||||||
end
|
end
|
||||||
|
|
||||||
def clean_domain(domain) do
|
def clean_domain(domain) do
|
||||||
|
cleaned =
|
||||||
domain
|
domain
|
||||||
|> String.replace_prefix("https://", "")
|
|> String.replace_prefix("https://", "")
|
||||||
|> String.trim_trailing("/")
|
|> String.trim_trailing("/")
|
||||||
|
|> String.trim()
|
||||||
|> String.downcase()
|
|> String.downcase()
|
||||||
|
|
||||||
|
Regex.replace(~r/:\d+/, cleaned, "")
|
||||||
end
|
end
|
||||||
|
|
||||||
def get_account(username, domain) do
|
def get_account(username, domain) do
|
||||||
|
@ -203,21 +207,8 @@ defmodule Backend.Util do
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
@doc """
|
|
||||||
Strips `prefix` from `string`. If it doesn't start with that prefix, just returns the string.
|
|
||||||
"""
|
|
||||||
@spec strip_prefix(String.t(), String.t()) :: String.t()
|
|
||||||
def strip_prefix(string, prefix) do
|
|
||||||
if String.starts_with?(string, prefix) do
|
|
||||||
prefix_length = String.length(prefix)
|
|
||||||
String.slice(string, prefix_length..-1)
|
|
||||||
else
|
|
||||||
string
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
@spec is_valid_domain?(String.t()) :: boolean
|
@spec is_valid_domain?(String.t()) :: boolean
|
||||||
def is_valid_domain?(domain) do
|
def is_valid_domain?(domain) do
|
||||||
Regex.match?(~r/[\w.-_]+/, domain)
|
Regex.match?(~r/^[\w\.\-_]+$/, domain)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in a new issue