project305/courts_scraper/scraper.py

from loguru import logger

import aiohttp
import asyncio

from bs4 import BeautifulSoup


class Scraper:
    """Concurrently download the landing page of each configured domain.

    ``scrape()` starts one asyncio task per domain and waits for all of
    them. Every task is kept in ``parse_tasks``, so page bodies can be read
    from the finished tasks via ``task.result()``.
    """

    def __init__(
        self, domains: list[str], rps: int, proxy: list[str] = None
    ):
        """Store the scraper settings.

        Args:
            domains: Host names to scrape; each one is requested as
                ``https://<domain>``.
            rps: Stored as ``self.rps``. NOTE(review): presumably a
                requests-per-second limit, but nothing in this class
                enforces it yet -- confirm intended use.
            proxy: Optional list stored as ``self.proxy`` (``None`` by
                default). NOTE(review): not applied to requests by this
                class yet -- confirm intended use.
        """
        self.domains = domains
        self.proxy = proxy
        self.rps = rps

        # Every task started by scrape(). Holding them here also keeps a
        # strong reference so running tasks are not garbage-collected.
        self.parse_tasks = set()

    async def fetch(self, url: str):
        """Send a GET request to ``url`` and return the body decoded as text.

        The HTTP status is not checked here, so error pages are returned
        like any other body. Exceptions raised by aiohttp propagate to the
        caller.
        """
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as resp:
                return await resp.text()

    async def parse(self, url: str) -> str:
        """Fetch ``url``, log that a response arrived, and return its body."""
        resp = await self.fetch(url)
        logger.debug(f"Received response from {url}!")
        return resp

    async def scrape(self):
        """Fetch all configured domains concurrently and wait for completion.

        One task per domain is created and added to ``parse_tasks``; the
        method returns once every task in that set has finished. Tasks are
        left in ``parse_tasks`` so callers can read their results.

        A failed task does not abort the others: its exception is logged as
        a warning together with the URL, and remains available through the
        task object. With no domains configured the method returns at once.
        """
        started = {}
        for domain in self.domains:
            url = f"https://{domain}"
            parse_task = asyncio.create_task(self.parse(url))
            self.parse_tasks.add(parse_task)
            started[parse_task] = url

        if not self.parse_tasks:
            # asyncio.wait() raises ValueError when given an empty set.
            return

        await asyncio.wait(self.parse_tasks)

        # Retrieve each outcome explicitly so failures are reported with
        # their URL instead of surfacing later as "Task exception was never
        # retrieved" when the task is garbage-collected.
        for task, url in started.items():
            if task.cancelled():
                # exception() would raise CancelledError for these.
                continue
            error = task.exception()
            if error is not None:
                logger.warning(f"Failed to fetch {url}: {error!r}")
Fetch urls concurrently 2022-09-10 17:50:45 +00:00			`from loguru import logger`

			`import aiohttp`
			`import asyncio`

Create Scraper class 2022-09-10 15:42:43 +00:00			`from bs4 import BeautifulSoup`


			`class Scraper:`
			`def __init__(`
Load settings from config or from arguments 2022-09-12 13:18:21 +00:00			`self, domains: list[str], rps: int, proxy: list[str] = None`
Create Scraper class 2022-09-10 15:42:43 +00:00			`):`
			`self.domains = domains`
			`self.proxy = proxy`
Load settings from config or from arguments 2022-09-12 13:18:21 +00:00			`self.rps = rps`
Create Scraper class 2022-09-10 15:42:43 +00:00
Fetch urls concurrently 2022-09-10 17:50:45 +00:00			`self.parse_tasks = set()`

			`async def fetch(self, url: str):`
			`async with aiohttp.ClientSession() as session:`
			`async with session.get(url) as resp:`
			`return await resp.text()`

			`async def parse(self, url: str) -> str:`
			`resp = await self.fetch(url)`
			`logger.debug(f"Received response from {url}!")`
			`return resp`

Create Scraper class 2022-09-10 15:42:43 +00:00			`async def scrape(self):`
Fetch urls concurrently 2022-09-10 17:50:45 +00:00			`for domain in self.domains:`
			`url = f"https://{domain}"`
			`parse_task = asyncio.create_task(self.parse(url))`
			`self.parse_tasks.add(parse_task)`

			`done, _pending = await asyncio.wait(self.parse_tasks)`