diff --git a/courts_scraper/scraper.py b/courts_scraper/scraper.py
index 155a551..66643bb 100644
--- a/courts_scraper/scraper.py
+++ b/courts_scraper/scraper.py
@@ -3,6 +3,7 @@
 from loguru import logger
 import aiohttp
 import asyncio
+from urllib.parse import parse_qs
 from bs4 import BeautifulSoup
@@ -16,20 +17,47 @@ class Scraper:
         self.parse_tasks = set()
 
-    async def fetch(self, url: str):
+    async def fetch(self, url: str, params: dict):
         async with aiohttp.ClientSession() as session:
-            async with session.get(url) as resp:
+            async with session.get(url, params=params) as resp:
                 return await resp.text()
 
-    async def parse(self, url: str) -> str:
-        resp = await self.fetch(url)
-        logger.debug(f"Received response from {url}!")
-        return resp
+    async def parse(self, url: str) -> list[str]:
+        params = {
+            'name': 'sud_delo', 'srv_num': ['2', '2'],
+            'name_op': ['r', 'r'], 'page': ['1'],
+            'vnkod': ['52OS0000', '52OS0000'], 'delo_id': ['1540006'],
+            'case_type': ['0'], 'new': ['0'], 'delo_table': ['u1_case'],
+            'u1_case__ENTRY_DATE1D': ['01.01.2012']
+        }
 
-    async def scrape(self):
+        resp = await self.fetch(url, params)
+        logger.debug(f"Received response from {url}!")
+
+        soup = BeautifulSoup(resp, 'html.parser')
+
+        all_pages = soup.find(id="content").td
+        last_page_link = all_pages.find_all("a")[-1]["href"]
+        last_page = int(parse_qs(last_page_link)["page"][0])
+
+        court_cases = []
+
+        for next_page in range(2, last_page+2):
+            for row in soup.find(id="tablcont").find_all("tr")[1:]:
+                court_cases.append(row)
+
+            if next_page != last_page+1:
+                params["page"] = [next_page]
+                resp = await self.fetch(url, params)
+                soup = BeautifulSoup(resp, 'html.parser')
+
+        return court_cases
+
+    async def scrape(self) -> list:
         for domain in self.domains:
-            url = f"https://{domain}"
+            url = f"https://{domain}/modules.php"
             parse_task = asyncio.create_task(self.parse(url))
             self.parse_tasks.add(parse_task)
-        done, _pending = await asyncio.wait(self.parse_tasks)
+        parsed_data = await asyncio.gather(*self.parse_tasks)
+        return parsed_data
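For context, a minimal sketch of how the reworked `scrape()` entry point might be driven. The diff does not show `__init__`, so the `Scraper(domains=...)` constructor call and the placeholder domain below are assumptions, not part of this change.

```python
import asyncio

from courts_scraper.scraper import Scraper


async def main() -> None:
    # Assumed constructor: the diff only shows that instances carry
    # `self.domains` and `self.parse_tasks`, not how they are initialized.
    scraper = Scraper(domains=["example-court.sudrf.ru"])

    # scrape() now creates one parse() task per domain and gathers them,
    # returning a list with one sublist of <tr> rows per task.
    results = await scraper.scrape()

    # Because the tasks are stored in a set before gathering, the order of
    # the sublists is not guaranteed to match the order of scraper.domains.
    total = sum(len(rows) for rows in results)
    print(f"Collected {total} case rows across {len(results)} result lists")


if __name__ == "__main__":
    asyncio.run(main())
```

Note that switching from `asyncio.wait` to `asyncio.gather(*self.parse_tasks)` is what makes the results available as return values; with `wait`, the results would have had to be pulled out of the completed task objects.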