feat: update LLM model and increase content size limit
refactor: update timeout values in job scraper classes
feat: add spoof config for renderers and vendors
build: update pycache files for config and modules
This commit is contained in:
parent fd4e8c9c05
commit d025828036
@ -202,7 +202,7 @@ class LinkedInJobScraper:
|
|||||||
await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed)
|
await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed)
|
||||||
# Wait for URL to change or new content
|
# Wait for URL to change or new content
|
||||||
try:
|
try:
|
||||||
await page.wait_for_function("() => window.location.href.includes('start=')", timeout=30000)
|
await page.wait_for_function("() => window.location.href.includes('start=')", timeout=60000)
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
current_page += 1
|
current_page += 1
|
||||||
@ -360,7 +360,7 @@ class LinkedInJobScraper:
|
|||||||
else:
|
else:
|
||||||
# If no pagination and no new jobs from scroll, check by refreshing
|
# If no pagination and no new jobs from scroll, check by refreshing
|
||||||
print("🔄 Refreshing page to check for new results...")
|
print("🔄 Refreshing page to check for new results...")
|
||||||
await page.reload(wait_until='networkidle')
|
await page.reload(wait_until='load')
|
||||||
await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)
|
await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)
|
||||||
|
|
||||||
# Check for new jobs after refresh
|
# Check for new jobs after refresh
|
||||||
@ -439,7 +439,7 @@ class LinkedInJobScraper:
|
|||||||
try:
|
try:
|
||||||
external_page = await asyncio.wait_for(page_waiter, timeout=5.0)
|
external_page = await asyncio.wait_for(page_waiter, timeout=5.0)
|
||||||
print(" 🌐 External job site opened in new tab.")
|
print(" 🌐 External job site opened in new tab.")
|
||||||
await external_page.wait_for_load_state("load", timeout=30000)
|
await external_page.wait_for_load_state("load", timeout=60000)
|
||||||
await asyncio.sleep(2 * self.human_speed)
|
await asyncio.sleep(2 * self.human_speed)
|
||||||
await self.engine._human_like_scroll(external_page)
|
await self.engine._human_like_scroll(external_page)
|
||||||
await asyncio.sleep(2 * self.human_speed)
|
await asyncio.sleep(2 * self.human_speed)
|
||||||
|
|||||||
@ -9,7 +9,7 @@ from config import GEMINI_API_KEY
|
|||||||
class LLMJobRefiner:
|
class LLMJobRefiner:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
genai.configure(api_key=GEMINI_API_KEY)
|
genai.configure(api_key=GEMINI_API_KEY)
|
||||||
self.model = genai.GenerativeModel('gemini-pro')
|
self.model = genai.GenerativeModel('gemini-latest-flash')
|
||||||
|
|
||||||
async def refine_job_data(self, raw_data: Dict[str, Any], target_field: str) -> Dict[str, Any]:
|
async def refine_job_data(self, raw_data: Dict[str, Any], target_field: str) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
@ -29,7 +29,7 @@ class LLMJobRefiner:
|
|||||||
|
|
||||||
Target Field: {target_field}
|
Target Field: {target_field}
|
||||||
Raw Page Content:
|
Raw Page Content:
|
||||||
{raw_data.get('page_content', '')[:3000]} # Limit content size
|
{raw_data.get('page_content', '')[:6000]} # Limit content size
|
||||||
|
|
||||||
Instructions:
|
Instructions:
|
||||||
1. Extract only the information relevant to the target field: {target_field}
|
1. Extract only the information relevant to the target field: {target_field}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user