feat: update LLM model and increase content size limit

refactor: update timeout values in job scraper classes
feat: add spoof config for renderers and vendors
build: update pycache files for config and modules
2025-11-24 13:47:47 +01:00 · 2025-11-24 13:47:47 +01:00 · d025828036
commit d025828036
parent fd4e8c9c05
2 changed files with 5 additions and 5 deletions
--- a/job_scraper2.py
+++ b/job_scraper2.py
@@ -202,7 +202,7 @@ class LinkedInJobScraper:
                await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed)
                # Wait for URL to change or new content
                try:
-                    await page.wait_for_function("() => window.location.href.includes('start=')", timeout=30000)
+                    await page.wait_for_function("() => window.location.href.includes('start=')", timeout=60000)
                except:
                    pass
                current_page += 1
@@ -360,7 +360,7 @@ class LinkedInJobScraper:
                else:
                    # If no pagination and no new jobs from scroll, check by refreshing
                    print("🔄 Refreshing page to check for new results...")
-                    await page.reload(wait_until='networkidle')
+                    await page.reload(wait_until='load')
                    await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)
                    # Check for new jobs after refresh
@@ -439,7 +439,7 @@ class LinkedInJobScraper:
                        try:
                            external_page = await asyncio.wait_for(page_waiter, timeout=5.0)
                            print("  🌐 External job site opened in new tab.")
-                            await external_page.wait_for_load_state("load", timeout=30000)
+                            await external_page.wait_for_load_state("load", timeout=60000)
                            await asyncio.sleep(2 * self.human_speed)
                            await self.engine._human_like_scroll(external_page)
                            await asyncio.sleep(2 * self.human_speed)
--- a/llm_agent.py
+++ b/llm_agent.py
@@ -9,7 +9,7 @@ from config import GEMINI_API_KEY
 class LLMJobRefiner:
    def __init__(self):
        genai.configure(api_key=GEMINI_API_KEY)
-        self.model = genai.GenerativeModel('gemini-pro')
+        self.model = genai.GenerativeModel('gemini-latest-flash')
    async def refine_job_data(self, raw_data: Dict[str, Any], target_field: str) -> Dict[str, Any]:
        """
@@ -29,7 +29,7 @@ class LLMJobRefiner:
        Target Field: {target_field}
        Raw Page Content:
-        {raw_data.get('page_content', '')[:3000]}  # Limit content size
+        {raw_data.get('page_content', '')[:6000]}  # Limit content size
        Instructions:
        1. Extract only the information relevant to the target field: {target_field}