Fetch urls concurrently

This commit is contained in:
riot 2022-09-10 20:50:45 +03:00
parent e044b7d9f0
commit ca13a5440d
2 changed files with 25 additions and 2 deletions

View File

@ -2,4 +2,5 @@ loguru
pretty_errors pretty_errors
colorama colorama
pyyaml pyyaml
beautifulsoup4 beautifulsoup4
aiohttp[speedups]

View File

@ -1,3 +1,8 @@
from loguru import logger
import aiohttp
import asyncio
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@ -11,5 +16,22 @@ class Scraper:
self.rps_min = rps_min self.rps_min = rps_min
self.rps_max = rps_max self.rps_max = rps_max
self.parse_tasks = set()
async def fetch(self, url: str):
    """Download ``url`` with an HTTP GET and return the body as text.

    A fresh ``aiohttp.ClientSession`` is opened for this single request
    and closed again before the coroutine returns.

    Args:
        url: Address to request.

    Returns:
        The response body as decoded by ``resp.text()``.
    """
    async with aiohttp.ClientSession() as http:
        pending_request = http.get(url)
        async with pending_request as reply:
            body = await reply.text()
    return body
async def parse(self, url: str) -> str:
    """Fetch ``url`` through ``self.fetch`` and hand back the text unchanged.

    A debug message is logged once the response has arrived.

    Args:
        url: Address to fetch.

    Returns:
        Whatever ``self.fetch(url)`` produced.
    """
    body = await self.fetch(url)
    logger.debug(f"Received response from {url}!")
    return body
async def scrape(self):
    """Fetch the root page of every configured domain concurrently.

    One task running ``self.parse("https://<domain>")`` is created per
    entry in ``self.domains`` and registered in ``self.parse_tasks``. The
    coroutine then waits until every registered task has finished.

    Returns:
        None. A failing task does not raise here; its exception is
        logged instead.
    """
    for domain in self.domains:
        url = f"https://{domain}"
        parse_task = asyncio.create_task(self.parse(url))
        self.parse_tasks.add(parse_task)

    # asyncio.wait() raises ValueError when given an empty collection, so
    # bail out early if no domains were configured.
    if not self.parse_tasks:
        return

    done, _pending = await asyncio.wait(self.parse_tasks)

    # Retrieve each outcome explicitly; otherwise a failed fetch is only
    # reported by asyncio when the task object is garbage collected.
    for task in done:
        if task.cancelled():
            continue
        error = task.exception()
        if error is not None:
            logger.error(f"Scrape task failed: {error!r}")