@@ -1,3 +1,8 @@
+from loguru import logger
+
+import aiohttp
+import asyncio
+
+from bs4 import BeautifulSoup
@@ -11,5 +16,22 @@ class Scraper:
+        self.rps_min = rps_min
+        self.rps_max = rps_max
+
+        self.parse_tasks = set()
+
+    async def fetch(self, url: str):
+        async with aiohttp.ClientSession() as session:
+            async with session.get(url) as resp:
+                return await resp.text()
+
+    async def parse(self, url: str) -> str:
+        resp = await self.fetch(url)
+        logger.debug(f"Received response from {url}!")
+        return resp
+
+    async def scrape(self):
+        ...
+        for domain in self.domains:
+            url = f"https://{domain}"
+            parse_task = asyncio.create_task(self.parse(url))
+            self.parse_tasks.add(parse_task)
+
+        done, _pending = await asyncio.wait(self.parse_tasks)
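
As a quick sanity check, here is a minimal sketch of how the class could be driven end to end after this change. The constructor signature is assumed from the attributes set above (a domains iterable plus the rps_min/rps_max bounds), and the domain values are placeholders rather than anything from the diff.

import asyncio

# Hypothetical driver: the Scraper constructor signature (domains, rps_min,
# rps_max) is assumed from the attributes assigned in __init__ above.
async def main() -> None:
    scraper = Scraper(
        domains=["example.com", "example.org"],  # placeholder domains
        rps_min=1,
        rps_max=5,
    )
    # scrape() schedules one parse task per domain, then waits for all of them.
    await scraper.scrape()

if __name__ == "__main__":
    asyncio.run(main())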