This repository has been archived on 2022-09-12. You can view files and clone it, but cannot push or open issues or pull requests.
project305/courts_scraper/scraper.py

36 lines
926 B
Python
Raw Normal View History

2022-09-10 17:50:45 +00:00
from loguru import logger
import aiohttp
import asyncio
2022-09-10 15:42:43 +00:00
from bs4 import BeautifulSoup
class Scraper:
    """Fetch the root page of each configured domain concurrently.

    One asyncio task is created per domain by :meth:`scrape`; every task is
    kept in ``parse_tasks`` so callers can inspect results afterwards.
    """

    def __init__(
        self, domains: list[str], rps: int, proxy: "list[str] | None" = None
    ):
        """Store the scraper configuration.

        Args:
            domains: Host names; each is requested as ``https://<domain>``.
            rps: Stored on the instance; not consulted by any method in
                this class yet.
            proxy: Stored on the instance; not consulted by any method in
                this class yet. Defaults to ``None``.
        """
        self.domains = domains
        self.proxy = proxy
        self.rps = rps

        # Tasks created by scrape(); holding references keeps them alive
        # and lets callers read each task's result or exception later.
        self.parse_tasks = set()

    async def fetch(self, url: str):
        """Issue a GET request to *url* and return the response body as text.

        A new ``aiohttp.ClientSession`` is opened (and closed) for every call.
        """
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as resp:
                return await resp.text()

    async def parse(self, url: str) -> str:
        """Fetch *url*, log that a response arrived, and return its text."""
        resp = await self.fetch(url)
        logger.debug(f"Received response from {url}!")
        return resp

    async def scrape(self):
        """Start one ``parse`` task per domain and wait for all of them.

        Returns ``None``. Results and exceptions stay on the task objects
        in ``parse_tasks``.
        """
        for domain in self.domains:
            url = f"https://{domain}"
            parse_task = asyncio.create_task(self.parse(url))
            self.parse_tasks.add(parse_task)

        # asyncio.wait() raises ValueError when given an empty collection,
        # so an empty domain list (with no earlier tasks) is a no-op.
        if not self.parse_tasks:
            return

        await asyncio.wait(self.parse_tasks)