diff --git a/scraper.py b/courts_scraper/scraper.py
similarity index 85%
rename from scraper.py
rename to courts_scraper/scraper.py
index 9cdc4a8..155a551 100644
--- a/scraper.py
+++ b/courts_scraper/scraper.py
@@ -8,13 +8,11 @@ from bs4 import BeautifulSoup
 
 class Scraper:
     def __init__(
-        self, domains: list[str], proxy: list[str], rps_min: int, rps_max: int
+        self, domains: list[str], rps: int, proxy: list[str] | None = None
     ):
         self.domains = domains
         self.proxy = proxy
-
-        self.rps_min = rps_min
-        self.rps_max = rps_max
+        self.rps = rps
 
         self.parse_tasks = set()
 
diff --git a/main.py b/main.py
index 8f597f2..c03b18d 100644
--- a/main.py
+++ b/main.py
@@ -7,23 +7,21 @@ import sys
 from sys import platform
 
 import argparse
-from argparse import ArgumentParser
+from argparse import ArgumentParser, Namespace
 
 import yaml
 import asyncio
-from scraper import Scraper
+from courts_scraper.scraper import Scraper
 
 
 def init_argparser() -> ArgumentParser:
     argparser = argparse.ArgumentParser(
         description="List fish in aquarium.",
-        argument_default=argparse.SUPPRESS
     )
     argparser.add_argument(
         "--config", "-c",
         help="Path to the config file",
         type=pathlib.Path,
-        default="config.yaml",
     )
     argparser.add_argument(
         "--domains", "-d",
@@ -35,33 +33,11 @@ def init_argparser() -> ArgumentParser:
         help="Path to the proxy file",
         type=pathlib.Path,
     )
-    argparser.add_argument("--rps_min", help="", type=int)
-    argparser.add_argument("--rps_max", help="", type=int)
+    argparser.add_argument("--rps", type=int)
 
     return argparser
 
 
-def load_config() -> dict:
-    argparser = init_argparser()
-    args = vars(argparser.parse_args())
-
-    with open(args["domains"]) as domains_file:
-        domains = domains_file.readlines()
-    args["domains"] = domains
-
-    with open(args["proxy"]) as proxy_file:
-        proxy = proxy_file.readlines()
-    args["proxy"] = proxy
-
-    with open(args["config"]) as config_file:
-        config = yaml.safe_load(config_file)
-    config["settings"].update(args)
-
-    # Remove config path to pass config values to the Scraper
-    config["settings"].pop("config")
-    return config
-
-
 async def main():
     logger.add("project.log")
     logger.info("Starting...")
@@ -71,8 +47,24 @@ async def main():
 
         input()
         sys.exit(1)
 
-    config = load_config()
-    scraper = Scraper(**config["settings"])
+    scraper_settings = dict()
+    argparser = init_argparser()
+    args = argparser.parse_args()
+
+    if args.config is not None:
+        with open(args.config) as config_file:
+            scraper_settings = yaml.safe_load(config_file)
+    else:
+        scraper_settings["rps"] = args.rps
+
+    with open(args.domains) as domains_file:
+        scraper_settings["domains"] = domains_file.readlines()
+
+    if args.proxy is not None:  # Optional argument
+        with open(args.proxy) as proxy_file:
+            scraper_settings["proxy"] = proxy_file.readlines()
+
+    scraper = Scraper(**scraper_settings)
     await scraper.scrape()
 