make sure crawl results are never overwritten with a blank ApiCrawler

This commit is contained in:
Tao Bror Bojlén 2019-10-15 13:24:37 +01:00
parent 8558f96635
commit 6715d9395f
No known key found for this signature in database
GPG key ID: C6EC7AAB905F9E6F
3 changed files with 6 additions and 2 deletions

View file

@@ -62,9 +62,13 @@ config :backend, :crawler,
   crawl_interval_mins: 60,
   crawl_workers: 20,
   blacklist: [
+    # spam
     "gab.best",
+    # spam
     "4chan.icu",
+    # *really* doesn't want to be listed on fediverse.space
     "pleroma.site",
+    # dummy instances used for pleroma CI
     "pleroma.online"
   ],
   user_agent: "fediverse.space crawler",

View file

@@ -34,7 +34,7 @@ defmodule Backend.Crawler.Crawlers.GnuSocial do
   def crawl(domain, nodeinfo) do
     if nodeinfo == nil or
          nodeinfo |> Map.get(:user_count) |> is_above_user_threshold?() do
-      Map.merge(crawl_large_instance(domain), nodeinfo)
+      Map.merge(nodeinfo, crawl_large_instance(domain))
     else
       Map.merge(ApiCrawler.get_default(), nodeinfo)
     end

View file

@@ -166,7 +166,7 @@ defmodule Backend.Scheduler do
   Edges are only generated if
   * both instances have been succesfully crawled
   * neither of the instances have blocked each other
-  * there are interactions in each direction
+  * there are interactions in each direction (if :require_bidirectional_edges is true in config)
   """
   def generate_edges do
     now = get_now()