feat: update LLM model and increase content size limit
refactor: update timeout values in job scraper classes
feat: add spoof config for renderers and vendors
build: update pycache files for config and modules
This commit is contained in:
parent
fd4e8c9c05
commit
d025828036
@ -202,7 +202,7 @@ class LinkedInJobScraper:
|
||||
await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed)
|
||||
# Wait for URL to change or new content
|
||||
try:
|
||||
- await page.wait_for_function("() => window.location.href.includes('start=')", timeout=30000)
|
||||
+ await page.wait_for_function("() => window.location.href.includes('start=')", timeout=60000)
|
||||
except:
|
||||
pass
|
||||
current_page += 1
|
||||
@ -360,7 +360,7 @@ class LinkedInJobScraper:
|
||||
else:
|
||||
# If no pagination and no new jobs from scroll, check by refreshing
|
||||
print("🔄 Refreshing page to check for new results...")
|
||||
- await page.reload(wait_until='networkidle')
|
||||
+ await page.reload(wait_until='load')
|
||||
await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)
|
||||
|
||||
# Check for new jobs after refresh
|
||||
@ -439,7 +439,7 @@ class LinkedInJobScraper:
|
||||
try:
|
||||
external_page = await asyncio.wait_for(page_waiter, timeout=5.0)
|
||||
print(" 🌐 External job site opened in new tab.")
|
||||
- await external_page.wait_for_load_state("load", timeout=30000)
|
||||
+ await external_page.wait_for_load_state("load", timeout=60000)
|
||||
await asyncio.sleep(2 * self.human_speed)
|
||||
await self.engine._human_like_scroll(external_page)
|
||||
await asyncio.sleep(2 * self.human_speed)
|
||||
|
||||
@ -9,7 +9,7 @@ from config import GEMINI_API_KEY
|
||||
class LLMJobRefiner:
|
||||
def __init__(self):
|
||||
genai.configure(api_key=GEMINI_API_KEY)
|
||||
- self.model = genai.GenerativeModel('gemini-pro')
|
||||
+ self.model = genai.GenerativeModel('gemini-latest-flash')
|
||||
|
||||
async def refine_job_data(self, raw_data: Dict[str, Any], target_field: str) -> Dict[str, Any]:
|
||||
"""
|
||||
@ -29,7 +29,7 @@ class LLMJobRefiner:
|
||||
|
||||
Target Field: {target_field}
|
||||
Raw Page Content:
|
||||
- {raw_data.get('page_content', '')[:3000]} # Limit content size
|
||||
+ {raw_data.get('page_content', '')[:6000]} # Limit content size
|
||||
|
||||
Instructions:
|
||||
1. Extract only the information relevant to the target field: {target_field}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user