From 91f326e19bd693ee3d8e771fd7622f1c70b6356e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tao=20Bror=20Bojl=C3=A9n?=
Date: Thu, 28 Feb 2019 17:51:03 +0000
Subject: [PATCH 1/5] handle missing data better in API

---
 backend/apiv1/views.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/backend/apiv1/views.py b/backend/apiv1/views.py
index 9dff54a..9be587f 100644
--- a/backend/apiv1/views.py
+++ b/backend/apiv1/views.py
@@ -32,5 +32,8 @@ class NodeView(viewsets.ReadOnlyModelViewSet):
     """
     Endpoint to get a list of the graph's nodes in a SigmaJS-friendly format.
     """
-    queryset = Instance.objects.filter(status='success')
+    queryset = Instance.objects.filter(status='success')\
+        .filter(x_coord__isnull=False)\
+        .filter(y_coord__isnull=False)\
+        .filter(user_count__isnull=False)
     serializer_class = NodeSerializer
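For reference, the four chained .filter() calls added in PATCH 1/5 behave the same as a single filter() call, since keyword arguments passed together are ANDed. A minimal sketch, with the import path for Instance assumed (the patch does not show where views.py imports it from):

    # Assumed import location for the Instance model; adjust to match the repo.
    from scraper.models import Instance

    # Equivalent to the chained .filter() calls in PATCH 1/5: every keyword
    # argument must hold, so instances missing x_coord, y_coord, or user_count
    # never reach the serializer.
    queryset = Instance.objects.filter(
        status='success',
        x_coord__isnull=False,
        y_coord__isnull=False,
        user_count__isnull=False,
    )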
From 2e5696962b4c276174d4c7a18e4840dfadd85228 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tao=20Bror=20Bojl=C3=A9n?=
Date: Thu, 28 Feb 2019 18:03:08 +0000
Subject: [PATCH 2/5] only scrape 50 instances in one go

---
 backend/scraper/management/commands/scrape.py | 34 ++++++++++++++++---
 1 file changed, 29 insertions(+), 5 deletions(-)

diff --git a/backend/scraper/management/commands/scrape.py b/backend/scraper/management/commands/scrape.py
index 8f30fa5..981c6c3 100644
--- a/backend/scraper/management/commands/scrape.py
+++ b/backend/scraper/management/commands/scrape.py
@@ -24,13 +24,30 @@ from scraper.management.commands._util import require_lock, InvalidResponseException
 SEED = 'mastodon.social'
 TIMEOUT = 20 # seconds
 NUM_THREADS = 16 # roughly 40MB each
-PERSONAL_INSTANCE_THRESHOLD = 5 # instances with < this many users won't be scraped
+PERSONAL_INSTANCE_THRESHOLD = 5 # instances with < this many users won't be crawled
+MAX_STATUSES_PER_PAGE = 100
 STATUS_SCRAPE_LIMIT = 5000
+INSTANCE_SCRAPE_LIMIT = 50 # note: this does not include newly discovered instances! they will always be crawled.


 class Command(BaseCommand):
     help = "Scrapes the entire fediverse"

+    def add_arguments(self, parser):
+        # Named (optional) arguments
+        parser.add_argument(
+            '--unlimited',
+            action='store_true',
+            dest='unlimited',
+            help="Crawl all stale instances rather than limiting to {}".format(INSTANCE_SCRAPE_LIMIT),
+        )
+        parser.add_argument(
+            '--all',
+            action='store_true',
+            dest='all',
+            help="Crawl all instances rather than limiting to stale ones"
+        )
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.scraped_count = 0
@@ -67,9 +84,9 @@ class Command(BaseCommand):
         mentions = []
         datetime_threshold = datetime.now(timezone.utc) - timedelta(days=31)
         statuses_seen = 0
-        # We'll ask for 1000 statuses, but Mastodon never returns more than 40. Some Pleroma instances will ignore
+        # We'll ask for lots of statuses, but Mastodon never returns more than 40. Some Pleroma instances will ignore
         # the limit and return 20.
-        url = 'https://' + instance_name + '/api/v1/timelines/public?local=true&limit=1000'
+        url = 'https://{}/api/v1/timelines/public?local=true&limit={}'.format(instance_name, MAX_STATUSES_PER_PAGE)
         while True:
             response = requests.get(url, timeout=TIMEOUT)
             statuses = response.json()
@@ -91,7 +108,7 @@ class Command(BaseCommand):
                 break
             # Continuing, so get url for next page
             min_id = earliest_status['id']
-            url = 'https://' + instance_name + '/api/v1/timelines/public?local=true&limit=1000&max_id=' + min_id
+            url = 'https://{}/api/v1/timelines/public?local=true&limit={}&max_id={}'.format(instance_name, MAX_STATUSES_PER_PAGE, min_id)
             time.sleep(2)  # Sleep to avoid overloading the instance

         mentions_seq = (seq(mentions)
@@ -201,7 +218,14 @@ class Command(BaseCommand):

     def handle(self, *args, **options):
         start_time = time.time()
-        stale_instances = Instance.objects.filter(last_updated__lte=datetime.now()-timedelta(days=1))
+        if options['all']:
+            stale_instances = Instance.objects.all()
+        else:
+            stale_instances = Instance.objects.filter(last_updated__lte=datetime.now()-timedelta(days=1))
+
+        if not options['unlimited']:
+            stale_instances = stale_instances[:INSTANCE_SCRAPE_LIMIT]
+
         with mp.Manager() as manager:
             # Share the list of existing instances amongst all threads (to avoid each thread having to query
             # for it on every instance it scrapes)

From dfc115f2756ee604d91fc50617945b09fd38ec1c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tao=20Bror=20Bojl=C3=A9n?=
Date: Thu, 28 Feb 2019 18:05:54 +0000
Subject: [PATCH 3/5] lowercase instance names (fix #48)

---
 backend/scraper/management/commands/scrape.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/backend/scraper/management/commands/scrape.py b/backend/scraper/management/commands/scrape.py
index 981c6c3..373afdd 100644
--- a/backend/scraper/management/commands/scrape.py
+++ b/backend/scraper/management/commands/scrape.py
@@ -76,7 +76,8 @@ class Command(BaseCommand):
         if response.status_code != 200 or not isinstance(peers, list) or get_key(peers, ['error']):
             raise InvalidResponseException("Could not get peers for {}".format(instance_name))
         # Get rid of peers that just say "null" and the instance itself
-        return [peer for peer in peers if peer and peer != instance_name]
+        # Also make sure to lowercase all instance names; otherwise there'll be some duplicates
+        return [peer.lower() for peer in peers if peer and peer != instance_name]

     @staticmethod
     def get_statuses(instance_name: str):

From e52f581067e6f518b89a5ad41886bb93eb28d33d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tao=20Bror=20Bojl=C3=A9n?=
Date: Thu, 28 Feb 2019 18:06:16 +0000
Subject: [PATCH 4/5] bump personal instance threshold to 10

---
 backend/scraper/management/commands/scrape.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/scraper/management/commands/scrape.py b/backend/scraper/management/commands/scrape.py
index 373afdd..ddd0fea 100644
--- a/backend/scraper/management/commands/scrape.py
+++ b/backend/scraper/management/commands/scrape.py
@@ -24,7 +24,7 @@ from scraper.management.commands._util import require_lock, InvalidResponseException
 SEED = 'mastodon.social'
 TIMEOUT = 20 # seconds
 NUM_THREADS = 16 # roughly 40MB each
-PERSONAL_INSTANCE_THRESHOLD = 5 # instances with < this many users won't be crawled
+PERSONAL_INSTANCE_THRESHOLD = 10 # instances with < this many users won't be crawled
 MAX_STATUSES_PER_PAGE = 100
 STATUS_SCRAPE_LIMIT = 5000
 INSTANCE_SCRAPE_LIMIT = 50 # note: this does not include newly discovered instances! they will always be crawled.
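To make the paging scheme from PATCH 2/5 concrete: the crawler walks an instance's local public timeline backwards by passing the oldest status ID seen so far as max_id. A self-contained sketch using the same constants, with the stopping logic simplified relative to scrape.py (the real command also stops at a 31-day age threshold and collects mentions):

    import time
    import requests

    MAX_STATUSES_PER_PAGE = 100  # Mastodon returns at most 40 regardless
    STATUS_SCRAPE_LIMIT = 5000
    TIMEOUT = 20  # seconds

    def fetch_local_statuses(instance_name):
        """Yield statuses from an instance's local public timeline, newest first."""
        url = 'https://{}/api/v1/timelines/public?local=true&limit={}'.format(
            instance_name, MAX_STATUSES_PER_PAGE)
        seen = 0
        while seen < STATUS_SCRAPE_LIMIT:
            statuses = requests.get(url, timeout=TIMEOUT).json()
            if not isinstance(statuses, list) or not statuses:
                break  # error response or no more statuses
            yield from statuses
            seen += len(statuses)
            # Pages are newest-first, so the last status is the earliest one;
            # max_id requests statuses strictly older than it.
            url = 'https://{}/api/v1/timelines/public?local=true&limit={}&max_id={}'.format(
                instance_name, MAX_STATUSES_PER_PAGE, statuses[-1]['id'])
            time.sleep(2)  # don't hammer the instance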
From 3a66e7464ebc3304f127b651baf057fe4b0d0c6e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tao=20Bror=20Bojl=C3=A9n?=
Date: Fri, 1 Mar 2019 14:42:05 +0000
Subject: [PATCH 5/5] fix duplicates in instance peers

---
 backend/scraper/management/commands/scrape.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/backend/scraper/management/commands/scrape.py b/backend/scraper/management/commands/scrape.py
index ddd0fea..e150bba 100644
--- a/backend/scraper/management/commands/scrape.py
+++ b/backend/scraper/management/commands/scrape.py
@@ -76,8 +76,8 @@ class Command(BaseCommand):
         if response.status_code != 200 or not isinstance(peers, list) or get_key(peers, ['error']):
             raise InvalidResponseException("Could not get peers for {}".format(instance_name))
         # Get rid of peers that just say "null" and the instance itself
-        # Also make sure to lowercase all instance names; otherwise there'll be some duplicates
-        return [peer.lower() for peer in peers if peer and peer != instance_name]
+        # Also make sure to lowercase all instance names and remove duplicates
+        return list(set([peer.lower() for peer in peers if peer and peer != instance_name]))

     @staticmethod
     def get_statuses(instance_name: str):
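The list(set([...])) in PATCH 5/5 can also be spelled as a set comprehension; either way, deduplication only works because the names were lowercased first, and the original order is not preserved. A minimal sketch of the same normalization (clean_peers is a hypothetical name, not from the repo):

    def clean_peers(peers, instance_name):
        # Drop null entries and the instance itself, lowercase the rest,
        # and deduplicate via a set comprehension.
        return list({peer.lower() for peer in peers if peer and peer != instance_name})

    peers = ['Mastodon.social', 'mastodon.social', None, 'a.example']
    print(clean_peers(peers, 'a.example'))  # ['mastodon.social']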