Fetch urls concurrently

master
LoRiot 3 months ago
parent e044b7d9f0
commit ca13a5440d
  1. requirements.txt (3 lines changed)
  2. scraper.py (24 lines changed)

@ -2,4 +2,5 @@ loguru
pretty_errors
colorama
pyyaml
beautifulsoup4
aiohttp[speedups]

@ -1,3 +1,8 @@
from loguru import logger
import aiohttp
import asyncio
from bs4 import BeautifulSoup
@ -11,5 +16,22 @@ class Scraper:
self.rps_min = rps_min
self.rps_max = rps_max
self.parse_tasks = set()
async def fetch(self, url: str):
    """Download *url* over HTTP and return the response body as text.

    A fresh aiohttp session is opened for the single request and closed
    once the body has been read.
    """
    session = aiohttp.ClientSession()
    async with session:
        response = await session.get(url)
        async with response:
            return await response.text()
async def parse(self, url: str) -> str:
    """Fetch *url*, log that a response arrived, and return its body text."""
    body = await self.fetch(url)
    logger.debug(f"Received response from {url}!")
    return body
async def scrape(self):
    """Spawn one parse task per configured domain and wait for all of them.

    Each domain in ``self.domains`` is turned into an ``https://`` URL and
    scheduled as an ``asyncio`` task; the tasks are tracked in
    ``self.parse_tasks``.

    Returns:
        set[asyncio.Task]: the completed tasks (empty when no domains are
        configured).
    """
    # NOTE(review): self.rps_min / self.rps_max are stored by __init__ but
    # not enforced here yet — presumably a rate-limiting TODO; confirm.
    for domain in self.domains:
        url = f"https://{domain}"
        parse_task = asyncio.create_task(self.parse(url))
        self.parse_tasks.add(parse_task)
    # asyncio.wait() raises ValueError on an empty collection, so bail out
    # early instead of crashing when there is nothing to scrape.
    if not self.parse_tasks:
        return set()
    done, _pending = await asyncio.wait(self.parse_tasks)
    return done