Fetch URLs concurrently
parent e044b7d9f0
commit ca13a5440d
@@ -2,4 +2,5 @@ loguru
 pretty_errors
 colorama
 pyyaml
 beautifulsoup4
+aiohttp[speedups]
scraper.py (24 lines changed)
@@ -1,3 +1,8 @@
 from loguru import logger
 
+import aiohttp
+import asyncio
+
 from bs4 import BeautifulSoup
+
+
@@ -11,5 +16,22 @@ class Scraper:
         self.rps_min = rps_min
         self.rps_max = rps_max
 
+        self.parse_tasks = set()
+
+    async def fetch(self, url: str):
+        async with aiohttp.ClientSession() as session:
+            async with session.get(url) as resp:
+                return await resp.text()
+
+    async def parse(self, url: str) -> str:
+        resp = await self.fetch(url)
+        logger.debug(f"Received response from {url}!")
+        return resp
+
     async def scrape(self):
-        ...
+        for domain in self.domains:
+            url = f"https://{domain}"
+            parse_task = asyncio.create_task(self.parse(url))
+            self.parse_tasks.add(parse_task)
+
+        done, _pending = await asyncio.wait(self.parse_tasks)
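
For orientation, a minimal sketch of driving the new coroutine. The diff only shows part of Scraper.__init__, so the domains, rps_min, and rps_max constructor parameters are assumptions inferred from the attributes the committed code reads and stores.

import asyncio

from scraper import Scraper


async def main() -> None:
    # Assumed constructor signature: the diff stores self.rps_min and
    # self.rps_max and iterates self.domains, but __init__ is not shown in full.
    scraper = Scraper(
        domains=["example.com", "example.org"],
        rps_min=1,
        rps_max=5,
    )
    # scrape() creates one parse task per domain, then awaits them all at once.
    await scraper.scrape()


if __name__ == "__main__":
    asyncio.run(main())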
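
One design note on fetch() as committed: it opens a fresh aiohttp.ClientSession for every request, which forfeits connection pooling across the concurrent fetches. The aiohttp documentation recommends sharing one session across requests; a hedged sketch of that alternative (not what this commit does) could look like this, where the PooledFetcher name is hypothetical and only the fetch() body mirrors the committed code.

import aiohttp


class PooledFetcher:
    # Sketch: one shared ClientSession for all fetches, so concurrent
    # requests reuse the same connection pool.
    async def __aenter__(self) -> "PooledFetcher":
        self._session = aiohttp.ClientSession()
        return self

    async def __aexit__(self, *exc) -> None:
        await self._session.close()

    async def fetch(self, url: str) -> str:
        # Reuses the pooled session instead of creating one per request.
        async with self._session.get(url) as resp:
            return await resp.text()

Inside scrape(), this would be entered once (async with PooledFetcher() as fetcher:) before the parse tasks are created.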