add opt-in whitelist for small instances

This commit is contained in:
Tao Bojlen 2018-09-04 19:52:12 +02:00
parent 21b62ea3fd
commit 58d46ff640
3 changed files with 17 additions and 4 deletions

View File

@ -238,7 +238,9 @@ class SidebarImpl extends React.Component<ISidebarProps> {
<NonIdealState <NonIdealState
icon={IconNames.BLOCKED_PERSON} icon={IconNames.BLOCKED_PERSON}
title="No data" title="No data"
description="This instance has fewer than 5 users and was not crawled." description="This instance has fewer than 5 users. It was not crawled in order to protect their privacy, but if it's your instance you can opt in."
action={<AnchorButton icon={IconNames.CONFIRM} href="https://sunbeam.city/@tao" target="_blank">
Message @tao to opt in</AnchorButton>}
/> />
) )
} }

View File

@ -7,12 +7,14 @@ import json
import multiprocessing as mp import multiprocessing as mp
import requests import requests
import time import time
import os
from dateutil.parser import parse as datetime_parser from dateutil.parser import parse as datetime_parser
from datetime import datetime, timedelta, timezone from datetime import datetime, timedelta, timezone
from functional import seq from functional import seq
from django_bulk_update.helper import bulk_update from django_bulk_update.helper import bulk_update
from django.core.management.base import BaseCommand from django.core.management.base import BaseCommand
from django import db from django import db
from django.conf import settings
from scraper.models import Instance, PeerRelationship from scraper.models import Instance, PeerRelationship
from scraper.management.commands._util import require_lock, InvalidResponseException, get_key, log, validate_int, PersonalInstanceException from scraper.management.commands._util import require_lock, InvalidResponseException, get_key, log, validate_int, PersonalInstanceException
@ -29,10 +31,10 @@ from scraper.management.commands._util import require_lock, InvalidResponseExcep
# TODO: use the /api/v1/server/followers and /api/v1/server/following endpoints in peertube instances # TODO: use the /api/v1/server/followers and /api/v1/server/following endpoints in peertube instances
SEED = 'mastodon.social' SEED = 'mastodon.ar.al'
TIMEOUT = 20 # seconds TIMEOUT = 20 # seconds
NUM_THREADS = 16 # roughly 40MB each NUM_THREADS = 16 # roughly 40MB each
PERSONAL_INSTANCE_THRESHOLD = 5 # instances with <= this many users won't be scraped PERSONAL_INSTANCE_THRESHOLD = 5 # instances with < this many users won't be scraped
STATUS_SCRAPE_LIMIT = 5000 STATUS_SCRAPE_LIMIT = 5000
@ -42,6 +44,9 @@ class Command(BaseCommand):
def __init__(self, *args, **kwargs):
    """Set up the command and load the opt-in whitelist.

    Resets ``scraped_count`` to 0 and reads ``whitelist.txt`` (located one
    directory above ``settings.BASE_DIR``) into ``self.whitelist`` as a list
    of lower-cased, whitespace-stripped lines, one entry per line of the file.
    """
    super().__init__(*args, **kwargs)
    self.scraped_count = 0
    whitelist_path = os.path.join(settings.BASE_DIR, '../whitelist.txt')
    # Use a context manager so the file handle is released even if reading
    # or normalising a line raises.
    with open(whitelist_path, 'r') as whitelist_file:
        self.whitelist = [line.lower().strip() for line in whitelist_file]
@staticmethod @staticmethod
def get_instance_info(instance_name: str): def get_instance_info(instance_name: str):
@ -97,6 +102,7 @@ class Command(BaseCommand):
# Continuing, so get url for next page # Continuing, so get url for next page
min_id = earliest_status['id'] min_id = earliest_status['id']
url = 'https://' + instance_name + '/api/v1/timelines/public?local=true&limit=1000&max_id=' + min_id url = 'https://' + instance_name + '/api/v1/timelines/public?local=true&limit=1000&max_id=' + min_id
time.sleep(1) # Sleep to avoid overloading the instance
mentions_seq = (seq(mentions) mentions_seq = (seq(mentions)
.filter(lambda m: not m['acct'].endswith(instance_name) and '@' in m['acct']) .filter(lambda m: not m['acct'].endswith(instance_name) and '@' in m['acct'])
@ -116,7 +122,11 @@ class Command(BaseCommand):
# Check if this is a personal instance before continuing # Check if this is a personal instance before continuing
user_count = get_key(data, ['info', 'stats', 'user_count']) user_count = get_key(data, ['info', 'stats', 'user_count'])
if isinstance(user_count, int) and user_count < PERSONAL_INSTANCE_THRESHOLD: print(self.whitelist)
print(instance.name)
if isinstance(user_count, int)\
and user_count < PERSONAL_INSTANCE_THRESHOLD\
and instance.name not in self.whitelist:
raise PersonalInstanceException raise PersonalInstanceException
data['peers'] = self.get_instance_peers(instance.name) data['peers'] = self.get_instance_peers(instance.name)

1
whitelist.txt Normal file
View File

@ -0,0 +1 @@
mastodon.social