From 1770333906d1485e507f7a1d96c10526c829e398 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tao=20Bror=20Bojl=C3=A9n?=
Date: Fri, 8 Mar 2019 15:58:43 +0000
Subject: [PATCH 1/4] remove whitelist from git

---
 backend/whitelist.txt | 1 -
 1 file changed, 1 deletion(-)
 delete mode 100644 backend/whitelist.txt

diff --git a/backend/whitelist.txt b/backend/whitelist.txt
deleted file mode 100644
index 3d0b213..0000000
--- a/backend/whitelist.txt
+++ /dev/null
@@ -1 +0,0 @@
-mastodon.social
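
Note on PATCH 1/4: scrape.py (patched next) still opens ../whitelist.txt
relative to settings.BASE_DIR in its __init__, so with the file out of git,
each deployment has to supply its own copy. A minimal sketch of a loader that
tolerates a missing file (the empty-list fallback and the helper name are
assumptions, not behaviour from this series):

    import os
    from django.conf import settings

    def load_whitelist():
        """Return lowercased, deduplicated whitelist entries; [] if the file is absent."""
        path = os.path.join(settings.BASE_DIR, '../whitelist.txt')
        if not os.path.isfile(path):
            return []  # assumed fallback; the code in this series would raise instead
        with open(path) as f:
            return sorted({line.lower().strip() for line in f if line.strip()})

The set comprehension also deduplicates entries, matching what
get_instance_peers already does for peer lists.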
From 2534b45487821737534da78504c5735d2331f5aa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tao=20Bror=20Bojl=C3=A9n?=
Date: Fri, 8 Mar 2019 15:59:07 +0000
Subject: [PATCH 2/4] remove trailing slash from endpoint crawled

this was causing issues on at least one pleroma instance, for some reason
---
 backend/scraper/management/commands/_util.py  | 10 +++-
 backend/scraper/management/commands/scrape.py | 52 +++++++++++++------
 2 files changed, 45 insertions(+), 17 deletions(-)

diff --git a/backend/scraper/management/commands/_util.py b/backend/scraper/management/commands/_util.py
index 2ec317f..ba30d06 100644
--- a/backend/scraper/management/commands/_util.py
+++ b/backend/scraper/management/commands/_util.py
@@ -69,5 +69,11 @@ def validate_int(integer):
     return integer if (isinstance(integer, int) and 0 <= integer < 2147483647) else None
 
 
-def log(text):
-    return "{} - {}".format(datetime.now().isoformat(), text)
+def log(obj, text, success=False, error=False):
+    text = "{} - {}".format(datetime.now().isoformat(), text)
+    if success:
+        text = obj.style.SUCCESS(text)
+    if error:
+        obj.stderr.write(text)
+    else:
+        obj.stdout.write(text)
diff --git a/backend/scraper/management/commands/scrape.py b/backend/scraper/management/commands/scrape.py
index e150bba..4ef1899 100644
--- a/backend/scraper/management/commands/scrape.py
+++ b/backend/scraper/management/commands/scrape.py
@@ -21,7 +21,7 @@ from scraper.management.commands._util import require_lock, InvalidResponseExcep
 
 # TODO: use the /api/v1/server/followers and /api/v1/server/following endpoints in peertube instances
 
-SEED = 'mastodon.social'
+SEED = 'p.a3.pm'
 TIMEOUT = 20  # seconds
 NUM_THREADS = 16  # roughly 40MB each
 PERSONAL_INSTANCE_THRESHOLD = 10  # instances with < this many users won't be crawled
@@ -47,26 +47,38 @@ class Command(BaseCommand):
             dest='all',
             help="Crawl all instances rather than limiting to stale ones"
         )
+        parser.add_argument(
+            '--verbose',
+            action='store_true',
+            dest='verbose',
+            help="Verbose logging"
+        )
+        parser.add_argument(
+            '--instance',
+            dest='instance',
+            help="Crawl a single instance"
+        )
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
+        self.verbose = False
         self.scraped_count = 0
         f = open(os.path.join(settings.BASE_DIR, '../whitelist.txt'), 'r')
         self.whitelist = seq(f.readlines()).map(lambda i: i.lower().strip()).to_list()
         f.close()
 
-    @staticmethod
-    def get_instance_info(instance_name: str):
+    def get_instance_info(self, instance_name: str):
         """Collect info about instance"""
         url = 'https://' + instance_name + '/api/v1/instance'
         response = requests.get(url, timeout=TIMEOUT)
         json = response.json()
         if response.status_code != 200 or get_key(json, ['error']):
+            if self.verbose:
+                log(self, "Couldn't get instance info for {}: {}".format(instance_name, response), error=True)
             raise InvalidResponseException("Could not get info for {}".format(instance_name))
         return json
 
-    @staticmethod
-    def get_instance_peers(instance_name: str):
+    def get_instance_peers(self, instance_name: str):
         """Collect connected instances"""
         # The peers endpoint returns a "list of all domain names known to this instance"
         # (https://github.com/tootsuite/mastodon/pull/6125)
@@ -74,24 +86,29 @@ class Command(BaseCommand):
         response = requests.get(url, timeout=TIMEOUT)
         peers = response.json()
         if response.status_code != 200 or not isinstance(peers, list) or get_key(peers, ['error']):
+            if self.verbose:
+                log(self, "Couldn't get peers for {}: {}".format(instance_name, response), error=True)
             raise InvalidResponseException("Could not get peers for {}".format(instance_name))
         # Get rid of peers that just say "null" and the instance itself
         # Also make sure to lowercase all instance names and remove duplicates
         return list(set([peer.lower() for peer in peers if peer and peer != instance_name]))
 
-    @staticmethod
-    def get_statuses(instance_name: str):
+    def get_statuses(self, instance_name: str):
         """Collect all statuses that mention users on other instances"""
         mentions = []
         datetime_threshold = datetime.now(timezone.utc) - timedelta(days=31)
         statuses_seen = 0
         # We'll ask for lots of statuses, but Mastodon never returns more than 40. Some Pleroma instances will ignore
         # the limit and return 20.
-        url = 'https://{}/api/v1/timelines/public?local=true&limit={}/'.format(instance_name, MAX_STATUSES_PER_PAGE)
+        url = 'https://{}/api/v1/timelines/public?local=true&limit={}'.format(instance_name, MAX_STATUSES_PER_PAGE)
         while True:
+            if self.verbose:
+                log(self, "({} posts seen)\tGetting {}".format(statuses_seen, url))
             response = requests.get(url, timeout=TIMEOUT)
             statuses = response.json()
             if response.status_code != 200 or get_key(statuses, ['error']):
+                if self.verbose:
+                    log(self, "Couldn't get statuses for {}: {}".format(instance_name, response), error=True)
                 raise InvalidResponseException("Could not get statuses for {}".format(instance_name))
             elif len(statuses) == 0:
                 break
@@ -197,7 +214,7 @@ class Command(BaseCommand):
                 relationship.last_updated = datetime.now()
             bulk_update(relationships, update_fields=['mention_count', 'statuses_seen', 'last_updated'])
 
-        self.stdout.write(log("Processed {}: {}".format(data['instance_name'], data['status'])))
+        log(self, "Processed {}: {}".format(data['instance_name'], data['status']))
 
     def worker(self, queue: mp.JoinableQueue, existing_instance_ids, scraped_ids):
         """The main worker that processes instances"""
@@ -206,12 +223,12 @@
             instance = queue.get()
             if instance.name in scraped_ids:
                 # If we hit this branch, it's indicative of a bug
-                self.stderr.write(log("Skipping {}, already done. This should not have been added to the queue!"
-                                      .format(instance)))
+                log(self, "Skipping {}, already done. This should not have been added to the queue!".format(instance),
+                    error=True)
                 queue.task_done()
             else:
                 # Fetch data on instance
-                self.stdout.write(log("Processing {}".format(instance.name)))
+                log(self, "Processing {}".format(instance.name))
                 data = self.process_instance(instance)
                 self.save_data(instance, data, queue, existing_instance_ids)
                 scraped_ids[instance.name] = 1
@@ -219,7 +236,13 @@
 
     def handle(self, *args, **options):
         start_time = time.time()
-        if options['all']:
+
+        self.verbose = options['verbose']
+
+        if options['instance']:
+            stale_instance, _ = Instance.objects.get_or_create(name=options['instance'])
+            stale_instances = [stale_instance]
+        elif options['all']:
             stale_instances = Instance.objects.all()
         else:
             stale_instances = Instance.objects.filter(last_updated__lte=datetime.now()-timedelta(days=1))
@@ -246,5 +269,4 @@
 
         self.scraped_count = len(scraped_ids.keys())
         end_time = time.time()
-        self.stdout.write(self.style.SUCCESS(log("Scraped {} instances in {:.0f}s"
-                                                 .format(self.scraped_count, end_time - start_time))))
+        log(self, "Scraped {} instances in {:.0f}s".format(self.scraped_count, end_time - start_time), True)
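
Note on PATCH 2/4: the removed trailing slash was appended inside the query
string, so servers received limit=100/ rather than limit=100; presumably that
is what the Pleroma instance mentioned in the commit message choked on. A
one-line illustration (the instance name is a placeholder):

    >>> 'https://{}/api/v1/timelines/public?local=true&limit={}/'.format('example.com', 100)
    'https://example.com/api/v1/timelines/public?local=true&limit=100/'

With the flags this patch adds, a single instance can be recrawled with
verbose logging, e.g.:

    python manage.py scrape --instance mastodon.social --verbose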
From 884810fa09d17cbda9e508d73408e978172247d2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tao=20Bror=20Bojl=C3=A9n?=
Date: Fri, 8 Mar 2019 16:05:42 +0000
Subject: [PATCH 3/4] set status limit to 40 per page

---
 backend/scraper/management/commands/scrape.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/scraper/management/commands/scrape.py b/backend/scraper/management/commands/scrape.py
index 4ef1899..233ec5c 100644
--- a/backend/scraper/management/commands/scrape.py
+++ b/backend/scraper/management/commands/scrape.py
@@ -25,7 +25,7 @@
 SEED = 'p.a3.pm'
 TIMEOUT = 20  # seconds
 NUM_THREADS = 16  # roughly 40MB each
 PERSONAL_INSTANCE_THRESHOLD = 10  # instances with < this many users won't be crawled
-MAX_STATUSES_PER_PAGE = 100
+MAX_STATUSES_PER_PAGE = 40
 STATUS_SCRAPE_LIMIT = 5000
 INSTANCE_SCRAPE_LIMIT = 50  # note: this does not include newly discovered instances! they will always be crawled.

From bb9e224e9c2385ea9ba49b08801e6b08c64d1a4c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tao=20Bror=20Bojl=C3=A9n?=
Date: Fri, 8 Mar 2019 16:06:12 +0000
Subject: [PATCH 4/4] update personal instance help text

---
 frontend/src/components/Sidebar.tsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/frontend/src/components/Sidebar.tsx b/frontend/src/components/Sidebar.tsx
index 2021dac..bc71d93 100644
--- a/frontend/src/components/Sidebar.tsx
+++ b/frontend/src/components/Sidebar.tsx
@@ -267,7 +267,7 @@ class SidebarImpl extends React.Component {
                     Message @tao to opt in}
                 />
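
Note on PATCH 3/4: 40 is the page-size cap Mastodon actually enforces on
/api/v1/timelines/public (per the comment in get_statuses, larger requests are
silently clamped), so the new constant documents real behaviour rather than
changing it. For context, a simplified sketch of the paging loop the limit
feeds into; the max_id strategy and the helper below are illustrative, not
code from this series:

    import requests

    def iter_public_statuses(instance_name, per_page=40, timeout=20):
        """Yield an instance's local public statuses page by page."""
        base = 'https://{}/api/v1/timelines/public?local=true&limit={}'.format(instance_name, per_page)
        url = base
        while True:
            statuses = requests.get(url, timeout=timeout).json()
            if not statuses:
                break
            yield from statuses
            # ask for statuses older than the last one seen (assumed paging strategy)
            url = base + '&max_id=' + statuses[-1]['id']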