improve handling of cancelled scrape

This commit is contained in:
Tao Bror Bojlén 2019-02-21 10:38:49 +00:00
parent 1b11c70430
commit 1c1f193542
No known key found for this signature in database
GPG key ID: C6EC7AAB905F9E6F
3 changed files with 12 additions and 8 deletions

View file

@ -15,6 +15,7 @@ from django_bulk_update.helper import bulk_update
from django.core.management.base import BaseCommand from django.core.management.base import BaseCommand
from django import db from django import db
from django.conf import settings from django.conf import settings
from django.utils import timezone
from scraper.models import Instance, PeerRelationship from scraper.models import Instance, PeerRelationship
from scraper.management.commands._util import require_lock, InvalidResponseException, get_key, log, validate_int, PersonalInstanceException from scraper.management.commands._util import require_lock, InvalidResponseException, get_key, log, validate_int, PersonalInstanceException
@ -24,7 +25,7 @@ SEED = 'mastodon.social'
TIMEOUT = 20 # seconds TIMEOUT = 20 # seconds
NUM_THREADS = 16 # roughly 40MB each NUM_THREADS = 16 # roughly 40MB each
PERSONAL_INSTANCE_THRESHOLD = 5 # instances with < this many users won't be scraped PERSONAL_INSTANCE_THRESHOLD = 5 # instances with < this many users won't be scraped
STATUS_SCRAPE_LIMIT = 1000 STATUS_SCRAPE_LIMIT = 100
class Command(BaseCommand): class Command(BaseCommand):
@ -144,6 +145,7 @@ class Command(BaseCommand):
instance.description = get_key(data, ['info', 'description']) instance.description = get_key(data, ['info', 'description'])
instance.version = get_key(data, ['info', 'version']) instance.version = get_key(data, ['info', 'version'])
instance.status = get_key(data, ['status']) instance.status = get_key(data, ['status'])
instance.last_updated = timezone.now()
instance.save() instance.save()
if data['status'] == 'success' and data['peers']: if data['status'] == 'success' and data['peers']:
# TODO: handle a peer disappearing # TODO: handle a peer disappearing

View file

@ -1,7 +1,8 @@
# Generated by Django 2.1 on 2018-09-03 14:09 # Generated by Django 2.1.7 on 2019-02-21 10:37
from django.db import migrations, models from django.db import migrations, models
import django.db.models.deletion import django.db.models.deletion
import django.utils.timezone
class Migration(migrations.Migration): class Migration(migrations.Migration):
@ -17,7 +18,7 @@ class Migration(migrations.Migration):
fields=[ fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('weight', models.FloatField(blank=True, null=True)), ('weight', models.FloatField(blank=True, null=True)),
('last_updated', models.DateTimeField()), ('last_updated', models.DateTimeField(default=django.utils.timezone.now)),
], ],
), ),
migrations.CreateModel( migrations.CreateModel(
@ -33,7 +34,7 @@ class Migration(migrations.Migration):
('x_coord', models.FloatField(blank=True, null=True)), ('x_coord', models.FloatField(blank=True, null=True)),
('y_coord', models.FloatField(blank=True, null=True)), ('y_coord', models.FloatField(blank=True, null=True)),
('first_seen', models.DateTimeField(auto_now_add=True)), ('first_seen', models.DateTimeField(auto_now_add=True)),
('last_updated', models.DateTimeField(auto_now=True)), ('last_updated', models.DateTimeField(default=django.utils.timezone.now)),
], ],
), ),
migrations.CreateModel( migrations.CreateModel(
@ -43,7 +44,7 @@ class Migration(migrations.Migration):
('mention_count', models.IntegerField(blank=True, null=True)), ('mention_count', models.IntegerField(blank=True, null=True)),
('statuses_seen', models.IntegerField(blank=True, null=True)), ('statuses_seen', models.IntegerField(blank=True, null=True)),
('first_seen', models.DateTimeField(auto_now_add=True)), ('first_seen', models.DateTimeField(auto_now_add=True)),
('last_updated', models.DateTimeField(auto_now=True)), ('last_updated', models.DateTimeField(default=django.utils.timezone.now)),
('source', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='following_relationship', to='scraper.Instance')), ('source', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='following_relationship', to='scraper.Instance')),
('target', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='follower_relationships', to='scraper.Instance')), ('target', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='follower_relationships', to='scraper.Instance')),
], ],

View file

@ -1,4 +1,5 @@
from django.db import models from django.db import models
from django.utils import timezone
class Instance(models.Model): class Instance(models.Model):
@ -28,7 +29,7 @@ class Instance(models.Model):
# Automatic fields # Automatic fields
first_seen = models.DateTimeField(auto_now_add=True) first_seen = models.DateTimeField(auto_now_add=True)
last_updated = models.DateTimeField(auto_now=True) last_updated = models.DateTimeField(default=timezone.now)
class PeerRelationship(models.Model): class PeerRelationship(models.Model):
@ -41,7 +42,7 @@ class PeerRelationship(models.Model):
# Metadata # Metadata
first_seen = models.DateTimeField(auto_now_add=True) first_seen = models.DateTimeField(auto_now_add=True)
last_updated = models.DateTimeField(auto_now=True) last_updated = models.DateTimeField(default=timezone.now)
class Edge(models.Model): class Edge(models.Model):
@ -55,4 +56,4 @@ class Edge(models.Model):
weight = models.FloatField(blank=True, null=True) weight = models.FloatField(blank=True, null=True)
# Metadata # Metadata
last_updated = models.DateTimeField(blank=False, null=False) last_updated = models.DateTimeField(default=timezone.now)