refactor(job_scraper): improve page loading and typing in linkedin scraper
- Change page load strategy from 'load' to 'domcontentloaded' and 'networkidle' for better performance
- Make search_keywords parameter optional to handle empty searches
- Update type imports to include List for better type hints
- Set headless mode to true by default for production use
This commit is contained in:
parent
458e914d71
commit
7dca4c9159
@ -5,7 +5,7 @@ import random
|
||||
import sqlite3
|
||||
import os
|
||||
from datetime import datetime
|
||||
from typing import Optional, Dict
|
||||
from typing import Optional, Dict, List
|
||||
from playwright.async_api import async_playwright
|
||||
from browserforge.injectors.playwright import AsyncNewContext
|
||||
|
||||
@ -200,7 +200,7 @@ class LinkedInJobScraper:
|
||||
|
||||
async def scrape_jobs(
|
||||
self,
|
||||
search_keywords: str,
|
||||
search_keywords: Optional[str],
|
||||
max_pages: int = 1,
|
||||
credentials: Optional[Dict] = None
|
||||
):
|
||||
@ -214,7 +214,7 @@ class LinkedInJobScraper:
|
||||
|
||||
async with async_playwright() as pw:
|
||||
browser = await pw.chromium.launch(
|
||||
headless=False,
|
||||
headless= False,
|
||||
args=['--disable-blink-features=AutomationControlled']
|
||||
)
|
||||
context = await AsyncNewContext(browser, fingerprint=profile)
|
||||
@ -269,7 +269,7 @@ class LinkedInJobScraper:
|
||||
return
|
||||
|
||||
print(f"🔍 Searching for: {search_keywords}")
|
||||
await page.goto(search_url, wait_until='load', timeout=60000)
|
||||
await page.goto(search_url, wait_until='networkidle', timeout=60000)
|
||||
await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed)
|
||||
|
||||
if await self.engine._detect_cloudflare(page):
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user