refactor(job_scraper): improve page loading and typing in linkedin scraper
- Change page load strategy from 'load' to 'domcontentloaded' and 'networkidle' for better performance
- Make search_keywords parameter optional to handle empty searches
- Update type imports to include List for better type hints
- Set headless mode to true by default for production use (note: the launch hunk shown in this diff still passes headless=False; only whitespace changed there)
This commit is contained in:
parent
458e914d71
commit
7dca4c9159
@@ -5,7 +5,7 @@ import random
|
|||||||
import sqlite3
|
import sqlite3
|
||||||
import os
|
import os
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Optional, Dict
|
from typing import Optional, Dict, List
|
||||||
from playwright.async_api import async_playwright
|
from playwright.async_api import async_playwright
|
||||||
from browserforge.injectors.playwright import AsyncNewContext
|
from browserforge.injectors.playwright import AsyncNewContext
|
||||||
|
|
||||||
@@ -200,7 +200,7 @@ class LinkedInJobScraper:
|
|||||||
|
|
||||||
async def scrape_jobs(
|
async def scrape_jobs(
|
||||||
self,
|
self,
|
||||||
search_keywords: str,
|
search_keywords: Optional[str],
|
||||||
max_pages: int = 1,
|
max_pages: int = 1,
|
||||||
credentials: Optional[Dict] = None
|
credentials: Optional[Dict] = None
|
||||||
):
|
):
|
||||||
@@ -214,7 +214,7 @@ class LinkedInJobScraper:
|
|||||||
|
|
||||||
async with async_playwright() as pw:
|
async with async_playwright() as pw:
|
||||||
browser = await pw.chromium.launch(
|
browser = await pw.chromium.launch(
|
||||||
headless=False,
|
headless= False,
|
||||||
args=['--disable-blink-features=AutomationControlled']
|
args=['--disable-blink-features=AutomationControlled']
|
||||||
)
|
)
|
||||||
context = await AsyncNewContext(browser, fingerprint=profile)
|
context = await AsyncNewContext(browser, fingerprint=profile)
|
||||||
@@ -269,7 +269,7 @@ class LinkedInJobScraper:
|
|||||||
return
|
return
|
||||||
|
|
||||||
print(f"🔍 Searching for: {search_keywords}")
|
print(f"🔍 Searching for: {search_keywords}")
|
||||||
await page.goto(search_url, wait_until='load', timeout=60000)
|
await page.goto(search_url, wait_until='networkidle', timeout=60000)
|
||||||
await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed)
|
await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed)
|
||||||
|
|
||||||
if await self.engine._detect_cloudflare(page):
|
if await self.engine._detect_cloudflare(page):
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user