add opt-in whitelist for small instances
This commit is contained in:
parent
21b62ea3fd
commit
58d46ff640
|
@ -238,7 +238,9 @@ class SidebarImpl extends React.Component<ISidebarProps> {
|
||||||
<NonIdealState
|
<NonIdealState
|
||||||
icon={IconNames.BLOCKED_PERSON}
|
icon={IconNames.BLOCKED_PERSON}
|
||||||
title="No data"
|
title="No data"
|
||||||
description="This instance has fewer than 5 users and was not crawled."
|
description="This instance has fewer than 5 users. It was not crawled in order to protect their privacy, but if it's your instance you can opt in."
|
||||||
|
action={<AnchorButton icon={IconNames.CONFIRM} href="https://sunbeam.city/@tao" target="_blank">
|
||||||
|
Message @tao to opt in</AnchorButton>}
|
||||||
/>
|
/>
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
|
@ -7,12 +7,14 @@ import json
|
||||||
import multiprocessing as mp
|
import multiprocessing as mp
|
||||||
import requests
|
import requests
|
||||||
import time
|
import time
|
||||||
|
import os
|
||||||
from dateutil.parser import parse as datetime_parser
|
from dateutil.parser import parse as datetime_parser
|
||||||
from datetime import datetime, timedelta, timezone
|
from datetime import datetime, timedelta, timezone
|
||||||
from functional import seq
|
from functional import seq
|
||||||
from django_bulk_update.helper import bulk_update
|
from django_bulk_update.helper import bulk_update
|
||||||
from django.core.management.base import BaseCommand
|
from django.core.management.base import BaseCommand
|
||||||
from django import db
|
from django import db
|
||||||
|
from django.conf import settings
|
||||||
from scraper.models import Instance, PeerRelationship
|
from scraper.models import Instance, PeerRelationship
|
||||||
from scraper.management.commands._util import require_lock, InvalidResponseException, get_key, log, validate_int, PersonalInstanceException
|
from scraper.management.commands._util import require_lock, InvalidResponseException, get_key, log, validate_int, PersonalInstanceException
|
||||||
|
|
||||||
|
@ -29,10 +31,10 @@ from scraper.management.commands._util import require_lock, InvalidResponseExcep
|
||||||
|
|
||||||
# TODO: use the /api/v1/server/followers and /api/v1/server/following endpoints in peertube instances
|
# TODO: use the /api/v1/server/followers and /api/v1/server/following endpoints in peertube instances
|
||||||
|
|
||||||
SEED = 'mastodon.social'
|
SEED = 'mastodon.ar.al'
|
||||||
TIMEOUT = 20 # seconds
|
TIMEOUT = 20 # seconds
|
||||||
NUM_THREADS = 16 # roughly 40MB each
|
NUM_THREADS = 16 # roughly 40MB each
|
||||||
PERSONAL_INSTANCE_THRESHOLD = 5 # instances with <= this many users won't be scraped
|
PERSONAL_INSTANCE_THRESHOLD = 5 # instances with < this many users won't be scraped
|
||||||
STATUS_SCRAPE_LIMIT = 5000
|
STATUS_SCRAPE_LIMIT = 5000
|
||||||
|
|
||||||
|
|
||||||
|
@ -42,6 +44,9 @@ class Command(BaseCommand):
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
super().__init__(*args, **kwargs)
|
super().__init__(*args, **kwargs)
|
||||||
self.scraped_count = 0
|
self.scraped_count = 0
|
||||||
|
f = open(os.path.join(settings.BASE_DIR, '../whitelist.txt'), 'r')
|
||||||
|
self.whitelist = seq(f.readlines()).map(lambda i: i.lower().strip()).to_list()
|
||||||
|
f.close()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_instance_info(instance_name: str):
|
def get_instance_info(instance_name: str):
|
||||||
|
@ -97,6 +102,7 @@ class Command(BaseCommand):
|
||||||
# Continuing, so get url for next page
|
# Continuing, so get url for next page
|
||||||
min_id = earliest_status['id']
|
min_id = earliest_status['id']
|
||||||
url = 'https://' + instance_name + '/api/v1/timelines/public?local=true&limit=1000&max_id=' + min_id
|
url = 'https://' + instance_name + '/api/v1/timelines/public?local=true&limit=1000&max_id=' + min_id
|
||||||
|
time.sleep(1) # Sleep to avoid overloading the instance
|
||||||
|
|
||||||
mentions_seq = (seq(mentions)
|
mentions_seq = (seq(mentions)
|
||||||
.filter(lambda m: not m['acct'].endswith(instance_name) and '@' in m['acct'])
|
.filter(lambda m: not m['acct'].endswith(instance_name) and '@' in m['acct'])
|
||||||
|
@ -116,7 +122,11 @@ class Command(BaseCommand):
|
||||||
|
|
||||||
# Check if this is a personal instance before continuing
|
# Check if this is a personal instance before continuing
|
||||||
user_count = get_key(data, ['info', 'stats', 'user_count'])
|
user_count = get_key(data, ['info', 'stats', 'user_count'])
|
||||||
if isinstance(user_count, int) and user_count < PERSONAL_INSTANCE_THRESHOLD:
|
print(self.whitelist)
|
||||||
|
print(instance.name)
|
||||||
|
if isinstance(user_count, int)\
|
||||||
|
and user_count < PERSONAL_INSTANCE_THRESHOLD\
|
||||||
|
and instance.name not in self.whitelist:
|
||||||
raise PersonalInstanceException
|
raise PersonalInstanceException
|
||||||
|
|
||||||
data['peers'] = self.get_instance_peers(instance.name)
|
data['peers'] = self.get_instance_peers(instance.name)
|
||||||
|
|
1
whitelist.txt
Normal file
1
whitelist.txt
Normal file
|
@ -0,0 +1 @@
|
||||||
|
mastodon.social
|
Loading…
Reference in a new issue