Fetch URLs concurrently
parent e044b7d9f0
commit ca13a5440d
requirements.txt
@@ -2,4 +2,5 @@ loguru
 pretty_errors
 colorama
 pyyaml
 beautifulsoup4
+aiohttp[speedups]
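(The speedups extra installs aiohttp's optional accelerators, such as aiodns for asynchronous DNS resolution and Brotli for compressed responses, which helps a scraper that fans out many requests at once.)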
scraper.py (24 changed lines)
@@ -1,3 +1,8 @@
 from loguru import logger
 
+import aiohttp
+import asyncio
+
 from bs4 import BeautifulSoup
+
+
@@ -11,5 +16,22 @@ class Scraper:
         self.rps_min = rps_min
         self.rps_max = rps_max
 
+        self.parse_tasks = set()
+
+    async def fetch(self, url: str):
+        async with aiohttp.ClientSession() as session:
+            async with session.get(url) as resp:
+                return await resp.text()
+
+    async def parse(self, url: str) -> str:
+        resp = await self.fetch(url)
+        logger.debug(f"Received response from {url}!")
+        return resp
+
     async def scrape(self):
-        ...
+        for domain in self.domains:
+            url = f"https://{domain}"
+            parse_task = asyncio.create_task(self.parse(url))
+            self.parse_tasks.add(parse_task)
+
+        done, _pending = await asyncio.wait(self.parse_tasks)
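A minimal sketch of how the new concurrent scrape() could be driven. The constructor arguments below (domains, rps_min, rps_max) are assumptions inferred from the attributes visible in the diff, not the actual Scraper signature:

import asyncio

from scraper import Scraper

async def main():
    # Hypothetical arguments, inferred from self.domains / self.rps_min /
    # self.rps_max in the diff above; adjust to the real constructor.
    scraper = Scraper(domains=["example.com", "example.org"], rps_min=1, rps_max=5)
    await scraper.scrape()

if __name__ == "__main__":
    asyncio.run(main())

Note that fetch() opens a fresh ClientSession per request; creating one session in scrape() and reusing it across fetches would let aiohttp pool connections, which its documentation recommends.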