Implement parsing of all rows on all pages

riot 2022-09-12 19:46:47 +03:00
parent c0d709b7b5
commit d6a44bade5
1 changed file with 37 additions and 9 deletions


@@ -3,6 +3,7 @@ from loguru import logger
 import aiohttp
 import asyncio
+from urllib.parse import parse_qs
 from bs4 import BeautifulSoup
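
The newly imported parse_qs is used in the second hunk below to read the last page number straight out of the pager link's href. The href is passed to parse_qs whole rather than having its query string split off first; that still works as long as page is not the very first parameter. A quick sketch with a shortened, hypothetical href value:

```python
from urllib.parse import parse_qs

# Hypothetical, shortened pager href of the kind the parser reads below.
href = "modules.php?name=sud_delo&srv_num=2&page=7"

# parse_qs splits on '&' and '='; the path fragment ends up fused into the
# first key ('modules.php?name'), but 'page' is still recovered cleanly.
print(parse_qs(href))                   # {'modules.php?name': ['sud_delo'], 'srv_num': ['2'], 'page': ['7']}
print(int(parse_qs(href)["page"][0]))   # 7
```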
@@ -16,20 +17,47 @@ class Scraper:
         self.parse_tasks = set()
 
-    async def fetch(self, url: str):
+    async def fetch(self, url: str, params: dict):
         async with aiohttp.ClientSession() as session:
-            async with session.get(url) as resp:
+            async with session.get(url, params=params) as resp:
                 return await resp.text()
 
-    async def parse(self, url: str) -> str:
-        resp = await self.fetch(url)
-        logger.debug(f"Received response from {url}!")
-        return resp
+    async def parse(self, url: str) -> list[str]:
+        params = {
+            'name': 'sud_delo', 'srv_num': ['2', '2'],
+            'name_op': ['r', 'r'], 'page': ['1'],
+            'vnkod': ['52OS0000', '52OS0000'], 'delo_id': ['1540006'],
+            'case_type': ['0'], 'new': ['0'], 'delo_table': ['u1_case'],
+            'u1_case__ENTRY_DATE1D': ['01.01.2012']
+        }
+        resp = await self.fetch(url, params)
+        logger.debug(f"Received response from {url}!")
+        soup = BeautifulSoup(resp, 'html.parser')
+        all_pages = soup.find(id="content").td
+        last_page_link = all_pages.find_all("a")[-1]["href"]
+        last_page = int(parse_qs(last_page_link)["page"][0])
+        court_cases = []
+        for next_page in range(2, last_page+2):
+            for row in soup.find(id="tablcont").find_all("tr")[1:]:
+                court_cases.append(row)
+            if next_page != last_page+1:
+                params["page"] = [next_page]
+                resp = await self.fetch(url, params)
+                soup = BeautifulSoup(resp, 'html.parser')
+        return court_cases
 
-    async def scrape(self):
+    async def scrape(self) -> list:
         for domain in self.domains:
-            url = f"https://{domain}"
+            url = f"https://{domain}/modules.php"
             parse_task = asyncio.create_task(self.parse(url))
             self.parse_tasks.add(parse_task)
-        done, _pending = await asyncio.wait(self.parse_tasks)
+        parsed_data = await asyncio.gather(*self.parse_tasks)
+        return parsed_data
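
Since scrape() now gathers and returns the per-domain results instead of merely awaiting the tasks, a caller can consume the parsed rows directly. A minimal driver sketch, assuming the constructor accepts the list of court domains (its signature is not part of this diff):

```python
import asyncio

async def main():
    # Hypothetical constructor call: the domain list is configured outside
    # this diff, so both the argument name and the host are assumptions.
    scraper = Scraper(domains=["example-court.sudrf.ru"])
    results = await scraper.scrape()
    for rows in results:
        # Each item is the list of <tr> rows one domain yielded across all pages.
        print(f"{len(rows)} case rows scraped")

if __name__ == "__main__":
    asyncio.run(main())
```

Because parse_tasks is a set, the order of the gathered results is not guaranteed to match the order of self.domains, so the sketch does not pair them up.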