From 1770333906d1485e507f7a1d96c10526c829e398 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tao=20Bror=20Bojl=C3=A9n?=
Date: Fri, 8 Mar 2019 15:58:43 +0000
Subject: [PATCH 1/4] remove whitelist from git

---
 backend/whitelist.txt | 1 -
 1 file changed, 1 deletion(-)
 delete mode 100644 backend/whitelist.txt

diff --git a/backend/whitelist.txt b/backend/whitelist.txt
deleted file mode 100644
index 3d0b213..0000000
--- a/backend/whitelist.txt
+++ /dev/null
@@ -1 +0,0 @@
-mastodon.social
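
Note on PATCH 1/4: scrape.py (patched next) still opens ../whitelist.txt
relative to settings.BASE_DIR in its __init__, so with the file out of git,
each deployment has to supply its own copy. A minimal sketch of a loader that
tolerates a missing file (the empty-list fallback and the helper name are
assumptions, not behaviour from this series):

    import os
    from django.conf import settings

    def load_whitelist():
        """Return lowercased, deduplicated whitelist entries; [] if the file is absent."""
        path = os.path.join(settings.BASE_DIR, '../whitelist.txt')
        if not os.path.isfile(path):
            return []  # assumed fallback; the code in this series would raise instead
        with open(path) as f:
            return sorted({line.lower().strip() for line in f if line.strip()})

The set comprehension also deduplicates entries, matching what
get_instance_peers already does for peer lists.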
From 2534b45487821737534da78504c5735d2331f5aa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tao=20Bror=20Bojl=C3=A9n?=
Date: Fri, 8 Mar 2019 15:59:07 +0000
Subject: [PATCH 2/4] remove trailing slash from endpoint crawled

this was causing issues on at least one pleroma instance, for some reason
---
 backend/scraper/management/commands/_util.py  | 10 +++-
 backend/scraper/management/commands/scrape.py | 52 +++++++++++++------
 2 files changed, 45 insertions(+), 17 deletions(-)

diff --git a/backend/scraper/management/commands/_util.py b/backend/scraper/management/commands/_util.py
index 2ec317f..ba30d06 100644
--- a/backend/scraper/management/commands/_util.py
+++ b/backend/scraper/management/commands/_util.py
@@ -69,5 +69,11 @@ def validate_int(integer):
     return integer if (isinstance(integer, int) and 0 <= integer < 2147483647) else None
 
 
-def log(text):
-    return "{} - {}".format(datetime.now().isoformat(), text)
+def log(obj, text, success=False, error=False):
+    text = "{} - {}".format(datetime.now().isoformat(), text)
+    if success:
+        text = obj.style.SUCCESS(text)
+    if error:
+        obj.stderr.write(text)
+    else:
+        obj.stdout.write(text)
diff --git a/backend/scraper/management/commands/scrape.py b/backend/scraper/management/commands/scrape.py
index e150bba..4ef1899 100644
--- a/backend/scraper/management/commands/scrape.py
+++ b/backend/scraper/management/commands/scrape.py
@@ -21,7 +21,7 @@ from scraper.management.commands._util import require_lock, InvalidResponseExcep
 
 # TODO: use the /api/v1/server/followers and /api/v1/server/following endpoints in peertube instances
 
-SEED = 'mastodon.social'
+SEED = 'p.a3.pm'
 TIMEOUT = 20  # seconds
 NUM_THREADS = 16  # roughly 40MB each
 PERSONAL_INSTANCE_THRESHOLD = 10  # instances with < this many users won't be crawled
@@ -47,26 +47,38 @@ class Command(BaseCommand):
             dest='all',
             help="Crawl all instances rather than limiting to stale ones"
         )
+        parser.add_argument(
+            '--verbose',
+            action='store_true',
+            dest='verbose',
+            help="Verbose logging"
+        )
+        parser.add_argument(
+            '--instance',
+            dest='instance',
+            help="Crawl a single instance"
+        )
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
+        self.verbose = False
         self.scraped_count = 0
         f = open(os.path.join(settings.BASE_DIR, '../whitelist.txt'), 'r')
         self.whitelist = seq(f.readlines()).map(lambda i: i.lower().strip()).to_list()
         f.close()
 
-    @staticmethod
-    def get_instance_info(instance_name: str):
+    def get_instance_info(self, instance_name: str):
         """Collect info about instance"""
         url = 'https://' + instance_name + '/api/v1/instance'
         response = requests.get(url, timeout=TIMEOUT)
         json = response.json()
         if response.status_code != 200 or get_key(json, ['error']):
+            if self.verbose:
+                log(self, "Couldn't get instance info for {}: {}".format(instance_name, response), error=True)
             raise InvalidResponseException("Could not get info for {}".format(instance_name))
         return json
 
-    @staticmethod
-    def get_instance_peers(instance_name: str):
+    def get_instance_peers(self, instance_name: str):
         """Collect connected instances"""
         # The peers endpoint returns a "list of all domain names known to this instance"
         # (https://github.com/tootsuite/mastodon/pull/6125)
@@ -74,24 +86,29 @@ class Command(BaseCommand):
         response = requests.get(url, timeout=TIMEOUT)
         peers = response.json()
         if response.status_code != 200 or not isinstance(peers, list) or get_key(peers, ['error']):
+            if self.verbose:
+                log(self, "Couldn't get peers for {}: {}".format(instance_name, response), error=True)
             raise InvalidResponseException("Could not get peers for {}".format(instance_name))
         # Get rid of peers that just say "null" and the instance itself
         # Also make sure to lowercase all instance names and remove duplicates
         return list(set([peer.lower() for peer in peers if peer and peer != instance_name]))
 
-    @staticmethod
-    def get_statuses(instance_name: str):
+    def get_statuses(self, instance_name: str):
         """Collect all statuses that mention users on other instances"""
         mentions = []
         datetime_threshold = datetime.now(timezone.utc) - timedelta(days=31)
         statuses_seen = 0
         # We'll ask for lots of statuses, but Mastodon never returns more than 40. Some Pleroma instances will ignore
         # the limit and return 20.
-        url = 'https://{}/api/v1/timelines/public?local=true&limit={}/'.format(instance_name, MAX_STATUSES_PER_PAGE)
+        url = 'https://{}/api/v1/timelines/public?local=true&limit={}'.format(instance_name, MAX_STATUSES_PER_PAGE)
         while True:
+            if self.verbose:
+                log(self, "({} posts seen)\tGetting {}".format(statuses_seen, url))
             response = requests.get(url, timeout=TIMEOUT)
             statuses = response.json()
             if response.status_code != 200 or get_key(statuses, ['error']):
+                if self.verbose:
+                    log(self, "Couldn't get statuses for {}: {}".format(instance_name, response), error=True)
                 raise InvalidResponseException("Could not get statuses for {}".format(instance_name))
             elif len(statuses) == 0:
                 break
@@ -197,7 +214,7 @@ class Command(BaseCommand):
                 relationship.last_updated = datetime.now()
             bulk_update(relationships, update_fields=['mention_count', 'statuses_seen', 'last_updated'])
 
-        self.stdout.write(log("Processed {}: {}".format(data['instance_name'], data['status'])))
+        log(self, "Processed {}: {}".format(data['instance_name'], data['status']))
 
     def worker(self, queue: mp.JoinableQueue, existing_instance_ids, scraped_ids):
         """The main worker that processes instances"""
@@ -206,12 +223,12 @@
             instance = queue.get()
             if instance.name in scraped_ids:
                 # If we hit this branch, it's indicative of a bug
-                self.stderr.write(log("Skipping {}, already done. This should not have been added to the queue!"
-                                      .format(instance)))
+                log(self, "Skipping {}, already done. This should not have been added to the queue!".format(instance),
+                    error=True)
                 queue.task_done()
             else:
                 # Fetch data on instance
-                self.stdout.write(log("Processing {}".format(instance.name)))
+                log(self, "Processing {}".format(instance.name))
                 data = self.process_instance(instance)
                 self.save_data(instance, data, queue, existing_instance_ids)
                 scraped_ids[instance.name] = 1
@@ -219,7 +236,13 @@
 
     def handle(self, *args, **options):
         start_time = time.time()
-        if options['all']:
+
+        self.verbose = options['verbose']
+
+        if options['instance']:
+            stale_instance, _ = Instance.objects.get_or_create(name=options['instance'])
+            stale_instances = [stale_instance]
+        elif options['all']:
             stale_instances = Instance.objects.all()
         else:
             stale_instances = Instance.objects.filter(last_updated__lte=datetime.now()-timedelta(days=1))
@@ -246,5 +269,4 @@
 
         self.scraped_count = len(scraped_ids.keys())
         end_time = time.time()
-        self.stdout.write(self.style.SUCCESS(log("Scraped {} instances in {:.0f}s"
-                                                 .format(self.scraped_count, end_time - start_time))))
+        log(self, "Scraped {} instances in {:.0f}s".format(self.scraped_count, end_time - start_time), True)
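
Note on PATCH 2/4: the removed trailing slash was appended inside the query
string, so servers received limit=100/ rather than limit=100; presumably that
is what the Pleroma instance mentioned in the commit message choked on. A
one-line illustration (the instance name is a placeholder):

    >>> 'https://{}/api/v1/timelines/public?local=true&limit={}/'.format('example.com', 100)
    'https://example.com/api/v1/timelines/public?local=true&limit=100/'

With the flags this patch adds, a single instance can be recrawled with
verbose logging, e.g.:

    python manage.py scrape --instance mastodon.social --verbose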
From 884810fa09d17cbda9e508d73408e978172247d2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tao=20Bror=20Bojl=C3=A9n?=
Date: Fri, 8 Mar 2019 16:05:42 +0000
Subject: [PATCH 3/4] set status limit to 40 per page

---
 backend/scraper/management/commands/scrape.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/scraper/management/commands/scrape.py b/backend/scraper/management/commands/scrape.py
index 4ef1899..233ec5c 100644
--- a/backend/scraper/management/commands/scrape.py
+++ b/backend/scraper/management/commands/scrape.py
@@ -25,7 +25,7 @@
 SEED = 'p.a3.pm'
 TIMEOUT = 20  # seconds
 NUM_THREADS = 16  # roughly 40MB each
 PERSONAL_INSTANCE_THRESHOLD = 10  # instances with < this many users won't be crawled
-MAX_STATUSES_PER_PAGE = 100
+MAX_STATUSES_PER_PAGE = 40
 STATUS_SCRAPE_LIMIT = 5000
 INSTANCE_SCRAPE_LIMIT = 50  # note: this does not include newly discovered instances! they will always be crawled.

From bb9e224e9c2385ea9ba49b08801e6b08c64d1a4c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tao=20Bror=20Bojl=C3=A9n?=
Date: Fri, 8 Mar 2019 16:06:12 +0000
Subject: [PATCH 4/4] update personal instance help text

---
 frontend/src/components/Sidebar.tsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/frontend/src/components/Sidebar.tsx b/frontend/src/components/Sidebar.tsx
index 2021dac..bc71d93 100644
--- a/frontend/src/components/Sidebar.tsx
+++ b/frontend/src/components/Sidebar.tsx
@@ -267,7 +267,7 @@ class SidebarImpl extends React.Component {
                     Message @tao to opt in}
                 />
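
Note on PATCH 3/4: 40 is the page-size cap Mastodon actually enforces on
/api/v1/timelines/public (per the comment in get_statuses, larger requests are
silently clamped), so the new constant documents real behaviour rather than
changing it. For context, a simplified sketch of the paging loop the limit
feeds into; the max_id strategy and the helper below are illustrative, not
code from this series:

    import requests

    def iter_public_statuses(instance_name, per_page=40, timeout=20):
        """Yield an instance's local public statuses page by page."""
        base = 'https://{}/api/v1/timelines/public?local=true&limit={}'.format(instance_name, per_page)
        url = base
        while True:
            statuses = requests.get(url, timeout=timeout).json()
            if not statuses:
                break
            yield from statuses
            # ask for statuses older than the last one seen (assumed paging strategy)
            url = base + '&max_id=' + statuses[-1]['id']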