Implement parsing of all rows on all pages
parent c0d709b7b5
commit d6a44bade5
@@ -3,6 +3,7 @@ from loguru import logger
 import aiohttp
 import asyncio
 
+from urllib.parse import parse_qs
 from bs4 import BeautifulSoup
 
 
@@ -16,20 +17,47 @@ class Scraper:
 
         self.parse_tasks = set()
 
-    async def fetch(self, url: str):
+    async def fetch(self, url: str, params: dict):
         async with aiohttp.ClientSession() as session:
-            async with session.get(url) as resp:
+            async with session.get(url, params=params) as resp:
                 return await resp.text()
 
-    async def parse(self, url: str) -> str:
-        resp = await self.fetch(url)
-        logger.debug(f"Received response from {url}!")
-        return resp
-
-    async def scrape(self):
+    async def parse(self, url: str) -> list[str]:
+        params = {
+            'name': 'sud_delo', 'srv_num': ['2', '2'],
+            'name_op': ['r', 'r'], 'page': ['1'],
+            'vnkod': ['52OS0000', '52OS0000'], 'delo_id': ['1540006'],
+            'case_type': ['0'], 'new': ['0'], 'delo_table': ['u1_case'],
+            'u1_case__ENTRY_DATE1D': ['01.01.2012']
+        }
+
+        resp = await self.fetch(url, params)
+        logger.debug(f"Received response from {url}!")
+
+        soup = BeautifulSoup(resp, 'html.parser')
+
+        all_pages = soup.find(id="content").td
+        last_page_link = all_pages.find_all("a")[-1]["href"]
+        last_page = int(parse_qs(last_page_link)["page"][0])
+
+        court_cases = []
+
+        for next_page in range(2, last_page+2):
+            for row in soup.find(id="tablcont").find_all("tr")[1:]:
+                court_cases.append(row)
+
+            if next_page != last_page+1:
+                params["page"] = [next_page]
+                resp = await self.fetch(url, params)
+                soup = BeautifulSoup(resp, 'html.parser')
+
+        return court_cases
+
+    async def scrape(self) -> list:
         for domain in self.domains:
-            url = f"https://{domain}"
+            url = f"https://{domain}/modules.php"
             parse_task = asyncio.create_task(self.parse(url))
             self.parse_tasks.add(parse_task)
 
-        done, _pending = await asyncio.wait(self.parse_tasks)
+        parsed_data = await asyncio.gather(*self.parse_tasks)
+        return parsed_data
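
Note on the last-page lookup in parse(): it feeds the final pagination link's href straight into parse_qs and reads the "page" value. A minimal sketch of that behaviour, using a hypothetical modules.php-style href (the real link shape is not visible in this diff):

# Sketch only, not part of the commit; the href below is made up but mirrors
# the modules.php query-string links the scraper pages through.
from urllib.parse import parse_qs, urlparse

href = "modules.php?name=sud_delo&srv_num=2&page=14"

# parse_qs(href) still exposes a clean "page" key because everything after the
# first "&" parses as ordinary query parameters; going through urlparse(...).query
# is the stricter variant if the href ever becomes a full URL.
print(int(parse_qs(href)["page"][0]))                  # 14
print(int(parse_qs(urlparse(href).query)["page"][0]))  # 14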
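
The row harvest itself is plain BeautifulSoup: grab the #tablcont table and keep every <tr> except the header. A toy check against a made-up fragment (the real table markup is not in this diff):

# Toy fragment, invented for illustration; only the id="tablcont" hook and the
# header-skipping slice come from the commit.
from bs4 import BeautifulSoup

html = """
<table id="tablcont">
  <tr><th>No.</th><th>Case</th></tr>
  <tr><td>1</td><td>2-123/2012</td></tr>
  <tr><td>2</td><td>2-124/2012</td></tr>
</table>
"""

soup = BeautifulSoup(html, "html.parser")
rows = soup.find(id="tablcont").find_all("tr")[1:]  # [1:] drops the header row
print(len(rows))  # 2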
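
For completeness, a rough sketch of driving the reworked scrape(); the constructor is outside this diff, so the domains keyword and the example domain below are assumptions:

# Usage sketch, not part of the commit. Assumes Scraper.__init__ accepts the
# domain list and sets self.domains plus self.parse_tasks (not shown here);
# Scraper is the class patched above, import path depends on the project layout.
import asyncio

async def main():
    scraper = Scraper(domains=["court.example.org"])  # hypothetical domain
    results = await scraper.scrape()   # one entry per domain, in task order
    for rows in results:               # each entry is the list of <tr> rows
        print(f"collected {len(rows)} rows")

asyncio.run(main())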