feat: update LLM model and increase content size limit

refactor: update timeout values in job scraper classes

feat: add spoof config for renderers and vendors

build: update pycache files for config and modules
Ofure Ikheloa 2025-11-24 13:47:47 +01:00
parent fd4e8c9c05
commit d025828036
2 changed files with 5 additions and 5 deletions
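
Note: the two changed files below cover the timeout, model and content-size changes; the renderer/vendor spoof config mentioned in the commit message is not visible in these hunks. A minimal sketch of what such a config could look like, assuming WebGL vendor/renderer spoofing applied through a Playwright init script (SPOOF_CONFIG, SPOOF_SCRIPT and apply_spoof are hypothetical names, not taken from this commit):

# Hypothetical sketch of a renderer/vendor spoof config (names assumed, not from this diff).
# Overrides the WebGL UNMASKED_VENDOR/RENDERER strings that fingerprinting scripts read.
SPOOF_CONFIG = {
    "webgl_vendor": "Google Inc. (NVIDIA)",
    "webgl_renderer": "ANGLE (NVIDIA, NVIDIA GeForce GTX 1660 Direct3D11 vs_5_0 ps_5_0, D3D11)",
}

SPOOF_SCRIPT = """
const getParameter = WebGLRenderingContext.prototype.getParameter;
WebGLRenderingContext.prototype.getParameter = function (parameter) {
  if (parameter === 37445) return '%(webgl_vendor)s';   // UNMASKED_VENDOR_WEBGL
  if (parameter === 37446) return '%(webgl_renderer)s'; // UNMASKED_RENDERER_WEBGL
  return getParameter.call(this, parameter);
};
"""

async def apply_spoof(context):
    # Playwright: register the override so it runs before any page script executes.
    await context.add_init_script(SPOOF_SCRIPT % SPOOF_CONFIG)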


@@ -202,7 +202,7 @@ class LinkedInJobScraper:
         await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed)
         # Wait for URL to change or new content
         try:
-            await page.wait_for_function("() => window.location.href.includes('start=')", timeout=30000)
+            await page.wait_for_function("() => window.location.href.includes('start=')", timeout=60000)
         except:
             pass
         current_page += 1
@@ -360,7 +360,7 @@ class LinkedInJobScraper:
         else:
             # If no pagination and no new jobs from scroll, check by refreshing
             print("🔄 Refreshing page to check for new results...")
-            await page.reload(wait_until='networkidle')
+            await page.reload(wait_until='load')
         await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)
         # Check for new jobs after refresh
@@ -439,7 +439,7 @@ class LinkedInJobScraper:
         try:
             external_page = await asyncio.wait_for(page_waiter, timeout=5.0)
             print(" 🌐 External job site opened in new tab.")
-            await external_page.wait_for_load_state("load", timeout=30000)
+            await external_page.wait_for_load_state("load", timeout=60000)
             await asyncio.sleep(2 * self.human_speed)
             await self.engine._human_like_scroll(external_page)
             await asyncio.sleep(2 * self.human_speed)
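
The doubled wait_for_load_state timeout still raises if an external site never finishes loading. A minimal sketch, assuming the standard Playwright async API, of how that wait could be guarded so a slow tab degrades to partial content instead of aborting the job (the helper name and fallback behaviour are illustrative, not from this commit):

# Hedged sketch: guard the longer load wait so a slow external site degrades gracefully.
from playwright.async_api import TimeoutError as PlaywrightTimeoutError

async def wait_for_external_page(external_page, timeout_ms: int = 60000) -> bool:
    try:
        await external_page.wait_for_load_state("load", timeout=timeout_ms)
        return True
    except PlaywrightTimeoutError:
        # Fall back to whatever DOM is already present instead of failing the job.
        print(" ⚠️ External page did not finish loading in time; continuing with partial content.")
        return False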


@@ -9,7 +9,7 @@ from config import GEMINI_API_KEY
 class LLMJobRefiner:
     def __init__(self):
         genai.configure(api_key=GEMINI_API_KEY)
-        self.model = genai.GenerativeModel('gemini-pro')
+        self.model = genai.GenerativeModel('gemini-latest-flash')

     async def refine_job_data(self, raw_data: Dict[str, Any], target_field: str) -> Dict[str, Any]:
         """
@@ -29,7 +29,7 @@ class LLMJobRefiner:
         Target Field: {target_field}

         Raw Page Content:
-        {raw_data.get('page_content', '')[:3000]}  # Limit content size
+        {raw_data.get('page_content', '')[:6000]}  # Limit content size

         Instructions:
         1. Extract only the information relevant to the target field: {target_field}
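
For context on how the model swap and the larger content slice fit together, here is a minimal sketch of a refinement call, assuming the google-generativeai async API; the prompt wording, helper name and CONTENT_LIMIT constant are illustrative, not taken from this commit:

# Hedged sketch of how the truncated page content might feed the model call.
import google.generativeai as genai

CONTENT_LIMIT = 6000  # doubled from 3000 in this commit

async def extract_field(model: genai.GenerativeModel, page_content: str, target_field: str) -> str:
    # Build the prompt from the truncated page content, then ask the model for the target field.
    prompt = (
        f"Target Field: {target_field}\n\n"
        f"Raw Page Content:\n{page_content[:CONTENT_LIMIT]}\n\n"
        f"Instructions:\n1. Extract only the information relevant to the target field: {target_field}"
    )
    response = await model.generate_content_async(prompt)
    return response.text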