From 91f326e19bd693ee3d8e771fd7622f1c70b6356e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tao=20Bror=20Bojl=C3=A9n?=
Date: Thu, 28 Feb 2019 17:51:03 +0000
Subject: [PATCH 1/5] handle missing data better in API

---
 backend/apiv1/views.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/backend/apiv1/views.py b/backend/apiv1/views.py
index 9dff54a..9be587f 100644
--- a/backend/apiv1/views.py
+++ b/backend/apiv1/views.py
@@ -32,5 +32,8 @@ class NodeView(viewsets.ReadOnlyModelViewSet):
     """
     Endpoint to get a list of the graph's nodes in a SigmaJS-friendly format.
     """
-    queryset = Instance.objects.filter(status='success')
+    queryset = Instance.objects.filter(status='success')\
+        .filter(x_coord__isnull=False)\
+        .filter(y_coord__isnull=False)\
+        .filter(user_count__isnull=False)
     serializer_class = NodeSerializer
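For reference, the four chained .filter() calls added in PATCH 1/5 behave the same as a single filter() call, since keyword arguments passed together are ANDed. A minimal sketch, with the import path for Instance assumed (the patch does not show where views.py imports it from):

    # Assumed import location for the Instance model; adjust to match the repo.
    from scraper.models import Instance

    # Equivalent to the chained .filter() calls in PATCH 1/5: every keyword
    # argument must hold, so instances missing x_coord, y_coord, or user_count
    # never reach the serializer.
    queryset = Instance.objects.filter(
        status='success',
        x_coord__isnull=False,
        y_coord__isnull=False,
        user_count__isnull=False,
    )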
From 2e5696962b4c276174d4c7a18e4840dfadd85228 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tao=20Bror=20Bojl=C3=A9n?=
Date: Thu, 28 Feb 2019 18:03:08 +0000
Subject: [PATCH 2/5] only scrape 50 instances in one go

---
 backend/scraper/management/commands/scrape.py | 34 ++++++++++++++++---
 1 file changed, 29 insertions(+), 5 deletions(-)

diff --git a/backend/scraper/management/commands/scrape.py b/backend/scraper/management/commands/scrape.py
index 8f30fa5..981c6c3 100644
--- a/backend/scraper/management/commands/scrape.py
+++ b/backend/scraper/management/commands/scrape.py
@@ -24,13 +24,30 @@ from scraper.management.commands._util import require_lock, InvalidResponseException
 SEED = 'mastodon.social'
 TIMEOUT = 20 # seconds
 NUM_THREADS = 16 # roughly 40MB each
-PERSONAL_INSTANCE_THRESHOLD = 5 # instances with < this many users won't be scraped
+PERSONAL_INSTANCE_THRESHOLD = 5 # instances with < this many users won't be crawled
+MAX_STATUSES_PER_PAGE = 100
 STATUS_SCRAPE_LIMIT = 5000
+INSTANCE_SCRAPE_LIMIT = 50 # note: this does not include newly discovered instances! they will always be crawled.


 class Command(BaseCommand):
     help = "Scrapes the entire fediverse"

+    def add_arguments(self, parser):
+        # Named (optional) arguments
+        parser.add_argument(
+            '--unlimited',
+            action='store_true',
+            dest='unlimited',
+            help="Crawl all stale instances rather than limiting to {}".format(INSTANCE_SCRAPE_LIMIT),
+        )
+        parser.add_argument(
+            '--all',
+            action='store_true',
+            dest='all',
+            help="Crawl all instances rather than limiting to stale ones"
+        )
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.scraped_count = 0
@@ -67,9 +84,9 @@ class Command(BaseCommand):
         mentions = []
         datetime_threshold = datetime.now(timezone.utc) - timedelta(days=31)
         statuses_seen = 0
-        # We'll ask for 1000 statuses, but Mastodon never returns more than 40. Some Pleroma instances will ignore
+        # We'll ask for lots of statuses, but Mastodon never returns more than 40. Some Pleroma instances will ignore
         # the limit and return 20.
-        url = 'https://' + instance_name + '/api/v1/timelines/public?local=true&limit=1000'
+        url = 'https://{}/api/v1/timelines/public?local=true&limit={}'.format(instance_name, MAX_STATUSES_PER_PAGE)
         while True:
             response = requests.get(url, timeout=TIMEOUT)
             statuses = response.json()
@@ -91,7 +108,7 @@ class Command(BaseCommand):
                 break
             # Continuing, so get url for next page
             min_id = earliest_status['id']
-            url = 'https://' + instance_name + '/api/v1/timelines/public?local=true&limit=1000&max_id=' + min_id
+            url = 'https://{}/api/v1/timelines/public?local=true&limit={}&max_id={}'.format(instance_name, MAX_STATUSES_PER_PAGE, min_id)
             time.sleep(2)  # Sleep to avoid overloading the instance

         mentions_seq = (seq(mentions)
@@ -201,7 +218,14 @@ class Command(BaseCommand):

     def handle(self, *args, **options):
         start_time = time.time()
-        stale_instances = Instance.objects.filter(last_updated__lte=datetime.now()-timedelta(days=1))
+        if options['all']:
+            stale_instances = Instance.objects.all()
+        else:
+            stale_instances = Instance.objects.filter(last_updated__lte=datetime.now()-timedelta(days=1))
+
+        if not options['unlimited']:
+            stale_instances = stale_instances[:INSTANCE_SCRAPE_LIMIT]
+
         with mp.Manager() as manager:
             # Share the list of existing instances amongst all threads (to avoid each thread having to query
             # for it on every instance it scrapes)

From dfc115f2756ee604d91fc50617945b09fd38ec1c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tao=20Bror=20Bojl=C3=A9n?=
Date: Thu, 28 Feb 2019 18:05:54 +0000
Subject: [PATCH 3/5] lowercase instance names (fix #48)

---
 backend/scraper/management/commands/scrape.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/backend/scraper/management/commands/scrape.py b/backend/scraper/management/commands/scrape.py
index 981c6c3..373afdd 100644
--- a/backend/scraper/management/commands/scrape.py
+++ b/backend/scraper/management/commands/scrape.py
@@ -76,7 +76,8 @@ class Command(BaseCommand):
         if response.status_code != 200 or not isinstance(peers, list) or get_key(peers, ['error']):
             raise InvalidResponseException("Could not get peers for {}".format(instance_name))
         # Get rid of peers that just say "null" and the instance itself
-        return [peer for peer in peers if peer and peer != instance_name]
+        # Also make sure to lowercase all instance names; otherwise there'll be some duplicates
+        return [peer.lower() for peer in peers if peer and peer != instance_name]

     @staticmethod
     def get_statuses(instance_name: str):

From e52f581067e6f518b89a5ad41886bb93eb28d33d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tao=20Bror=20Bojl=C3=A9n?=
Date: Thu, 28 Feb 2019 18:06:16 +0000
Subject: [PATCH 4/5] bump personal instance threshold to 10

---
 backend/scraper/management/commands/scrape.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/scraper/management/commands/scrape.py b/backend/scraper/management/commands/scrape.py
index 373afdd..ddd0fea 100644
--- a/backend/scraper/management/commands/scrape.py
+++ b/backend/scraper/management/commands/scrape.py
@@ -24,7 +24,7 @@ from scraper.management.commands._util import require_lock, InvalidResponseException
 SEED = 'mastodon.social'
 TIMEOUT = 20 # seconds
 NUM_THREADS = 16 # roughly 40MB each
-PERSONAL_INSTANCE_THRESHOLD = 5 # instances with < this many users won't be crawled
+PERSONAL_INSTANCE_THRESHOLD = 10 # instances with < this many users won't be crawled
 MAX_STATUSES_PER_PAGE = 100
 STATUS_SCRAPE_LIMIT = 5000
 INSTANCE_SCRAPE_LIMIT = 50 # note: this does not include newly discovered instances! they will always be crawled.
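To make the paging scheme from PATCH 2/5 concrete: the crawler walks an instance's local public timeline backwards by passing the oldest status ID seen so far as max_id. A self-contained sketch using the same constants, with the stopping logic simplified relative to scrape.py (the real command also stops at a 31-day age threshold and collects mentions):

    import time
    import requests

    MAX_STATUSES_PER_PAGE = 100  # Mastodon returns at most 40 regardless
    STATUS_SCRAPE_LIMIT = 5000
    TIMEOUT = 20  # seconds

    def fetch_local_statuses(instance_name):
        """Yield statuses from an instance's local public timeline, newest first."""
        url = 'https://{}/api/v1/timelines/public?local=true&limit={}'.format(
            instance_name, MAX_STATUSES_PER_PAGE)
        seen = 0
        while seen < STATUS_SCRAPE_LIMIT:
            statuses = requests.get(url, timeout=TIMEOUT).json()
            if not isinstance(statuses, list) or not statuses:
                break  # error response or no more statuses
            yield from statuses
            seen += len(statuses)
            # Pages are newest-first, so the last status is the earliest one;
            # max_id requests statuses strictly older than it.
            url = 'https://{}/api/v1/timelines/public?local=true&limit={}&max_id={}'.format(
                instance_name, MAX_STATUSES_PER_PAGE, statuses[-1]['id'])
            time.sleep(2)  # don't hammer the instance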
From 3a66e7464ebc3304f127b651baf057fe4b0d0c6e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tao=20Bror=20Bojl=C3=A9n?=
Date: Fri, 1 Mar 2019 14:42:05 +0000
Subject: [PATCH 5/5] fix duplicates in instance peers

---
 backend/scraper/management/commands/scrape.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/backend/scraper/management/commands/scrape.py b/backend/scraper/management/commands/scrape.py
index ddd0fea..e150bba 100644
--- a/backend/scraper/management/commands/scrape.py
+++ b/backend/scraper/management/commands/scrape.py
@@ -76,8 +76,8 @@ class Command(BaseCommand):
         if response.status_code != 200 or not isinstance(peers, list) or get_key(peers, ['error']):
             raise InvalidResponseException("Could not get peers for {}".format(instance_name))
         # Get rid of peers that just say "null" and the instance itself
-        # Also make sure to lowercase all instance names; otherwise there'll be some duplicates
-        return [peer.lower() for peer in peers if peer and peer != instance_name]
+        # Also make sure to lowercase all instance names and remove duplicates
+        return list(set([peer.lower() for peer in peers if peer and peer != instance_name]))

     @staticmethod
     def get_statuses(instance_name: str):
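The list(set([...])) in PATCH 5/5 can also be spelled as a set comprehension; either way, deduplication only works because the names were lowercased first, and the original order is not preserved. A minimal sketch of the same normalization (clean_peers is a hypothetical name, not from the repo):

    def clean_peers(peers, instance_name):
        # Drop null entries and the instance itself, lowercase the rest,
        # and deduplicate via a set comprehension.
        return list({peer.lower() for peer in peers if peer and peer != instance_name})

    peers = ['Mastodon.social', 'mastodon.social', None, 'a.example']
    print(clean_peers(peers, 'a.example'))  # ['mastodon.social']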