diff --git a/requirements.txt b/requirements.txt
index 736c648..f84cebd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,4 +2,5 @@ loguru
 pretty_errors
 colorama
 pyyaml
-beautifulsoup4
\ No newline at end of file
+beautifulsoup4
+aiohttp[speedups]
\ No newline at end of file
diff --git a/scraper.py b/scraper.py
index 488525e..9cdc4a8 100644
--- a/scraper.py
+++ b/scraper.py
@@ -1,3 +1,8 @@
+from loguru import logger
+
+import aiohttp
+import asyncio
+
 from bs4 import BeautifulSoup
 
 
@@ -11,5 +16,45 @@ class Scraper:
         self.rps_min = rps_min
         self.rps_max = rps_max
 
+        self.parse_tasks = set()
+
+    async def fetch(self, url: str) -> str:
+        """Download ``url`` and return the response body decoded as text.
+
+        A fresh ``aiohttp.ClientSession`` is opened for every call and closed
+        again before returning.
+        """
+        async with aiohttp.ClientSession() as session:
+            async with session.get(url) as resp:
+                return await resp.text()
+
+    async def parse(self, url: str) -> str:
+        """Fetch ``url``, log that a response arrived and return its body."""
+        resp = await self.fetch(url)
+        logger.debug(f"Received response from {url}!")
+        return resp
+
     async def scrape(self):
-        ...
+        """Request ``https://<domain>`` for every entry of ``self.domains``.
+
+        One task per domain is started and all of them are awaited.  A
+        failing download is logged and does not abort the other downloads.
+        """
+        for domain in self.domains:
+            url = f"https://{domain}"
+            parse_task = asyncio.create_task(self.parse(url))
+            self.parse_tasks.add(parse_task)
+
+        # asyncio.wait() raises ValueError when given an empty collection.
+        if not self.parse_tasks:
+            return
+
+        done, _pending = await asyncio.wait(self.parse_tasks)
+        # Finished tasks are dropped so a later call does not wait on them again.
+        self.parse_tasks.difference_update(done)
+
+        for task in done:
+            # Retrieving the exception marks it as handled; otherwise asyncio
+            # reports "Task exception was never retrieved" at garbage collection.
+            if not task.cancelled() and task.exception() is not None:
+                logger.error(f"Scraping task failed: {task.exception()!r}")