# project305/courts_scraper/scraper.py

import asyncio
from urllib.parse import parse_qs

import aiohttp
from bs4 import BeautifulSoup
from loguru import logger


class Scraper:
    """Scrapes court case listings from the configured court domains."""

    def __init__(
        self, domains: list[str], rps: int, proxy: list[str] = None
    ):
        self.domains = domains
        self.proxy = proxy
        self.rps = rps
        self.parse_tasks = set()

    async def fetch(self, url: str, params: dict) -> str:
        """Fetch ``url`` with the given query parameters and return the body text."""
        async with aiohttp.ClientSession() as session:
            async with session.get(url, params=params) as resp:
                return await resp.text()

    async def parse(self, url: str) -> list:
        """Collect the case table rows from every result page of ``url``."""
        params = {
            'name': 'sud_delo', 'srv_num': ['2', '2'],
            'name_op': ['r', 'r'], 'page': ['1'],
            'vnkod': ['52OS0000', '52OS0000'], 'delo_id': ['1540006'],
            'case_type': ['0'], 'new': ['0'], 'delo_table': ['u1_case'],
            'u1_case__ENTRY_DATE1D': ['01.01.2012']
        }
        resp = await self.fetch(url, params)
        logger.debug(f"Received response from {url}!")
        soup = BeautifulSoup(resp, 'html.parser')
        # The pager sits in the first cell of the #content block; its last
        # link points at the final result page.
        all_pages = soup.find(id="content").td
        last_page_link = all_pages.find_all("a")[-1]["href"]
        last_page = int(parse_qs(last_page_link)["page"][0])
        court_cases = []
        for next_page in range(2, last_page + 2):
            # Collect the rows of the page currently in ``soup``, skipping the header row.
            for row in soup.find(id="tablcont").find_all("tr")[1:]:
                court_cases.append(row)
            # Fetch the next page unless the page just collected was the last one.
            if next_page != last_page + 1:
                params["page"] = [str(next_page)]
                resp = await self.fetch(url, params)
                soup = BeautifulSoup(resp, 'html.parser')
        return court_cases

    async def scrape(self) -> list:
        """Run ``parse`` for every configured domain concurrently."""
        for domain in self.domains:
            url = f"https://{domain}/modules.php"
            parse_task = asyncio.create_task(self.parse(url))
            self.parse_tasks.add(parse_task)
        # Wait for all per-domain parse tasks and return their results.
        parsed_data = await asyncio.gather(*self.parse_tasks)
        return parsed_data