add opt-in whitelist for small instances

This commit is contained in:
Tao Bojlen 2018-09-04 19:52:12 +02:00
parent 21b62ea3fd
commit 58d46ff640
3 changed files with 17 additions and 4 deletions

View file

@ -238,7 +238,9 @@ class SidebarImpl extends React.Component<ISidebarProps> {
<NonIdealState
icon={IconNames.BLOCKED_PERSON}
title="No data"
description="This instance has fewer than 5 users and was not crawled."
description="This instance has fewer than 5 users. It was not crawled in order to protect their privacy, but if it's your instance you can opt in."
action={<AnchorButton icon={IconNames.CONFIRM} href="https://sunbeam.city/@tao" target="_blank">
Message @tao to opt in</AnchorButton>}
/>
)
}

View file

@ -7,12 +7,14 @@ import json
import multiprocessing as mp
import requests
import time
import os
from dateutil.parser import parse as datetime_parser
from datetime import datetime, timedelta, timezone
from functional import seq
from django_bulk_update.helper import bulk_update
from django.core.management.base import BaseCommand
from django import db
from django.conf import settings
from scraper.models import Instance, PeerRelationship
from scraper.management.commands._util import require_lock, InvalidResponseException, get_key, log, validate_int, PersonalInstanceException
@ -29,10 +31,10 @@ from scraper.management.commands._util import require_lock, InvalidResponseExcep
# TODO: use the /api/v1/server/followers and /api/v1/server/following endpoints in peertube instances
SEED = 'mastodon.social'
SEED = 'mastodon.ar.al'
TIMEOUT = 20 # seconds
NUM_THREADS = 16 # roughly 40MB each
PERSONAL_INSTANCE_THRESHOLD = 5 # instances with <= this many users won't be scraped
PERSONAL_INSTANCE_THRESHOLD = 5 # instances with < this many users won't be scraped
STATUS_SCRAPE_LIMIT = 5000
@ -42,6 +44,9 @@ class Command(BaseCommand):
def __init__(self, *args, **kwargs):
    """Initialise the command and load the opt-in instance whitelist.

    The whitelist is read from ``whitelist.txt``, resolved relative to
    ``settings.BASE_DIR`` (one directory above it). Each line of the file
    becomes one entry, lower-cased and stripped of surrounding whitespace,
    and the entries are stored as a list on ``self.whitelist``.

    Raises:
        OSError: if the whitelist file cannot be opened or read.
    """
    super().__init__(*args, **kwargs)
    self.scraped_count = 0
    whitelist_path = os.path.join(settings.BASE_DIR, '../whitelist.txt')
    # A context manager guarantees the handle is closed even if reading or
    # normalising a line raises part-way through.
    with open(whitelist_path, 'r') as whitelist_file:
        self.whitelist = [line.lower().strip() for line in whitelist_file]
@staticmethod
def get_instance_info(instance_name: str):
@ -97,6 +102,7 @@ class Command(BaseCommand):
# Continuing, so get url for next page
min_id = earliest_status['id']
url = 'https://' + instance_name + '/api/v1/timelines/public?local=true&limit=1000&max_id=' + min_id
time.sleep(1) # Sleep to avoid overloading the instance
mentions_seq = (seq(mentions)
.filter(lambda m: not m['acct'].endswith(instance_name) and '@' in m['acct'])
@ -116,7 +122,11 @@ class Command(BaseCommand):
# Check if this is a personal instance before continuing
user_count = get_key(data, ['info', 'stats', 'user_count'])
if isinstance(user_count, int) and user_count < PERSONAL_INSTANCE_THRESHOLD:
print(self.whitelist)
print(instance.name)
if isinstance(user_count, int)\
and user_count < PERSONAL_INSTANCE_THRESHOLD\
and instance.name not in self.whitelist:
raise PersonalInstanceException
data['peers'] = self.get_instance_peers(instance.name)

1
whitelist.txt Normal file
View file

@ -0,0 +1 @@
mastodon.social