handle weirdly-formatted friendica peers

This commit is contained in:
Tao Bror Bojlén 2019-08-21 22:30:32 +02:00
parent c2124468a7
commit 2f1a654520
No known key found for this signature in database
GPG key ID: C6EC7AAB905F9E6F
6 changed files with 30 additions and 11 deletions

View file

@ -21,6 +21,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added ON DELETE to `most_recent_crawl` table, such that it can handle previously-crawled but now-dead instances. - Added ON DELETE to `most_recent_crawl` table, such that it can handle previously-crawled but now-dead instances.
- You can now login to the admin view by clicking, not just by pressing enter. - You can now login to the admin view by clicking, not just by pressing enter.
- Added handling for weirdly-formatted Friendica peers.
## [2.7.0 - 2018-08-18] ## [2.7.0 - 2018-08-18]

View file

@ -76,13 +76,15 @@ defmodule Backend.Crawler.Crawlers.Friendica do
end end
defp to_domain(url) do defp to_domain(url) do
url = url
cond do |> strip_prefix("http://")
String.starts_with?(url, "https://") -> String.slice(url, 8..-1) |> strip_prefix("https://")
String.starts_with?(url, "http://") -> String.slice(url, 7..-1) |> strip_username()
true -> url
end end
url # Sometimes a url at the poco/@server endpoint has the form username@domain.tld, in which case we only want domain.tld
# Returns only the host part of `string`, dropping a leading `username@` if present.
#
# The hyphen is deliberately placed LAST in each character class. Written as
# `[\w.-_]`, the `.-_` part is parsed as a range from `.` to `_`, which leaves
# out `-` (so `my-site.tld` is cut short to `my`) and lets in `/`, `:` and `@`.
# The username class also accepts dots so that `first.last@domain.tld` is split
# correctly.
#
# Raises `MatchError` when `string` contains no domain-like run of characters.
defp strip_username(string) do
  [_match, _username, domain] = Regex.run(~r/([\w.-]+@)?([\w.-]+)/, string)
  domain
end
end end

View file

@ -205,7 +205,7 @@ defmodule Backend.Crawler.Crawlers.Misskey do
defp extract_mentions_from_status(status) do defp extract_mentions_from_status(status) do
status_content = Map.get(status, "text") status_content = Map.get(status, "text")
Regex.scan(~r/@\w+@([\w.-]+)/, status_content) Regex.scan(~r/@\w+@([\w._-]+)/, status_content)
|> Enum.map(fn [_match, domain] -> domain end) |> Enum.map(fn [_match, domain] -> domain end)
|> Enum.reduce(%{}, fn domain, acc -> |> Enum.reduce(%{}, fn domain, acc ->
Map.update(acc, domain, 1, &(&1 + 1)) Map.update(acc, domain, 1, &(&1 + 1))

View file

@ -9,7 +9,7 @@ defmodule Backend.Crawler.Util do
# (e.g. https://mastodon.social/@demouser or https://pleroma.site/users/demouser) # (e.g. https://mastodon.social/@demouser or https://pleroma.site/users/demouser)
@spec get_domain(String.t()) :: String.t() @spec get_domain(String.t()) :: String.t()
def get_domain(url) do def get_domain(url) do
[_match, domain] = Regex.run(~r/https?:\/\/([\w.-]+)\/.*/, url) [_match, domain] = Regex.run(~r/https?:\/\/([\w.-_]+)\/.*/, url)
domain domain
end end

View file

@ -202,4 +202,22 @@ defmodule Backend.Util do
{:error, error} -> raise error {:error, error} -> raise error
end end
end end
@doc """
Strips `prefix` from the start of `string`.

If `string` does not start with `prefix`, it is returned unchanged. Only a
single leading occurrence is removed.

## Examples

    iex> Backend.Util.strip_prefix("https://example.com", "https://")
    "example.com"

    iex> Backend.Util.strip_prefix("example.com", "https://")
    "example.com"

"""
@spec strip_prefix(String.t(), String.t()) :: String.t()
def strip_prefix(string, prefix) do
  # String.replace_prefix/3 checks and removes the prefix on the same (binary)
  # basis, so the check and the removal cannot disagree the way a byte-wise
  # starts_with? followed by a grapheme-counted slice can.
  String.replace_prefix(string, prefix, "")
end
@doc """
Returns `true` when `domain` consists solely of word characters, dots and hyphens.

The pattern is anchored to the whole string, so input that merely *contains* a
domain-like run (a URL, `user@host`, text with spaces) is rejected, as is the
empty string.
"""
@spec is_valid_domain?(String.t()) :: boolean
def is_valid_domain?(domain) do
  # The hyphen goes last in the class so it is a literal; `.-_` would be a range
  # from `.` to `_` that admits `/`, `:` and `@` and omits `-`.
  Regex.match?(~r/\A[\w.-]+\z/, domain)
end
end end

View file

@ -1,7 +1,5 @@
defmodule BackendWeb.InstanceController do defmodule BackendWeb.InstanceController do
use BackendWeb, :controller use BackendWeb, :controller
import Backend.Util
alias Graph.Cache alias Graph.Cache
action_fallback(BackendWeb.FallbackController) action_fallback(BackendWeb.FallbackController)