import asyncio
import logging
import random
import time
from typing import Optional

from playwright.async_api import Page, BrowserContext, Browser

from scraping_engine import FingerprintScrapingEngine

logger = logging.getLogger(__name__)


class StealthyFetcher:
    """Open pages in a Playwright context and try to get past bot-protection pages.

    The fetcher opens a new page in the supplied browser context, navigates to
    the URL, optionally runs the engine's human-behaviour simulation, and then
    checks the HTML for protection markers (Cloudflare / captcha).
    """

    # Substrings (matched against lower-cased HTML) treated as Cloudflare markers.
    _CLOUDFLARE_MARKERS = (
        "#cf-chl",
        "checking your browser",
        "just a moment",
        "cloudflare",
        "ddos protection",
        "turnstile",
    )
    # Substrings (matched against lower-cased HTML) treated as captcha markers.
    _CAPTCHA_MARKERS = ("captcha", "robot", "verify you're human")
    # Words whose presence in the body text counts as "real job content".
    _JOB_KEYWORDS = (
        "job",
        "role",
        "apply",
        "responsibilities",
        "requirements",
        "qualifications",
    )

    def __init__(self, engine: FingerprintScrapingEngine, browser: Browser, context: BrowserContext):
        """Store the collaborators used for fetching.

        Args:
            engine: Scraping engine providing the human-behaviour helpers.
            browser: Playwright browser instance (stored, not used by the
                methods of this class).
            context: Browser context in which new pages are opened.
        """
        self.engine = engine
        self.browser = browser
        self.context = context
        # Retry settings; not read by any method of this class.
        self.max_retries = 5
        self.base_delay = 5

    async def fetch_url(self, url: str, wait_for_selector: Optional[str] = None, timeout=300000) -> Optional[Page]:
        """Open ``url`` in a new page of ``self.context`` and return that page.

        Args:
            url: Address to navigate to.
            wait_for_selector: Accepted for interface compatibility; currently
                not used by this method.
                NOTE(review): confirm whether callers expect it to be honoured.
            timeout: Navigation timeout in milliseconds (Playwright's unit);
                capped at 120000 for the ``goto`` call.

        Returns:
            The open page on success (the caller is responsible for closing
            it), or ``None`` when a protection page was detected, the content
            was not accessible and the protection could not be handled. In the
            ``None`` case the page is closed before returning.

        Raises:
            Exception: Any error from page creation, navigation or the helper
                steps is re-raised after the page has been closed.
        """
        page = None
        try:
            page = await self.context.new_page()

            # Wait only for DOMContentLoaded; navigation timeout is capped at 120 s.
            await page.goto(url, wait_until='domcontentloaded', timeout=min(timeout, 120000))

            # Skip human behavior for Lever (already loads fully without it)
            if "lever.co" not in url:
                await self._apply_human_behavior(page)

            protection_type = await self._detect_protection(page)
            if not protection_type:
                return page

            # The markers are broad substrings, so only treat the page as
            # blocked when it also fails the content-accessibility check.
            if await self._is_content_accessible(page):
                return page

            handled = False
            if protection_type == "cloudflare":
                handled = await self._handle_cloudflare(page)
            elif protection_type == "captcha":
                handled = await self._handle_captcha(page)

            if not handled:
                # The caller only receives None and cannot close the page
                # itself, so close it here to avoid leaking it in the context.
                await self._close_page_quietly(page)
                return None

            return page
        except Exception:
            await self._close_page_quietly(page)
            raise

    async def _close_page_quietly(self, page: Optional[Page]) -> None:
        """Close ``page`` if there is one, logging (not raising) any failure."""
        if page is None:
            return
        try:
            await page.close()
        except Exception:
            # Best-effort cleanup: a failed close must not mask the real outcome.
            logger.debug("Failed to close page", exc_info=True)

    async def _apply_human_behavior(self, page: Page):
        """Run the engine's scroll and interaction helpers with random pauses."""
        await self.engine._human_like_scroll(page)
        await asyncio.sleep(random.uniform(1, 3))
        await self.engine._simulate_human_interaction(page)
        await asyncio.sleep(random.uniform(1, 2))

    async def _detect_protection(self, page: Page) -> Optional[str]:
        """Classify the page by protection markers found in its HTML.

        Returns:
            ``"cloudflare"`` if any Cloudflare marker is present in the
            lower-cased page HTML, otherwise ``"captcha"`` if any captcha
            marker is present, otherwise ``None``.
        """
        content = (await page.content()).lower()
        if any(marker in content for marker in self._CLOUDFLARE_MARKERS):
            return "cloudflare"
        if any(marker in content for marker in self._CAPTCHA_MARKERS):
            return "captcha"
        return None

    async def _is_content_accessible(self, page: Page, wait_for_selector: Optional[str] = None) -> bool:
        """Report whether the page body looks like real job content.

        Args:
            page: Page to inspect.
            wait_for_selector: Accepted for interface compatibility; currently
                not used.

        Returns:
            ``True`` when the body's inner text has at least 100 characters
            (after stripping) and contains one of the job keywords; ``False``
            otherwise, including when the body cannot be read.
        """
        try:
            await page.wait_for_selector("body", timeout=120000)
            body_text = await page.eval_on_selector("body", "el => el.innerText.toLowerCase()")
        except Exception:
            # Only ordinary errors mean "not accessible"; task cancellation and
            # KeyboardInterrupt are BaseException and must propagate.
            logger.debug("Could not read page body", exc_info=True)
            return False

        if len(body_text.strip()) < 100:
            return False
        return any(word in body_text for word in self._JOB_KEYWORDS)

    async def _handle_cloudflare(self, page: Page) -> bool:
        """Wait up to 60 seconds for the protection markers to disappear.

        Each iteration re-checks the page, runs the human-behaviour helpers,
        sleeps for a randomised and slowly growing interval (at most 10 s),
        and occasionally reloads the page.

        Returns:
            ``True`` as soon as no protection marker is detected, ``False``
            if the time budget runs out first.
        """
        max_wait_time = 60
        # Monotonic clock: elapsed time must not be affected by system clock changes.
        start_time = time.monotonic()

        while time.monotonic() - start_time < max_wait_time:
            if not await self._detect_protection(page):
                return True

            await self._apply_human_behavior(page)

            elapsed = time.monotonic() - start_time
            wait_time = min(10, 2 + random.uniform(1, 3) + elapsed * 0.1)
            await asyncio.sleep(wait_time)

            # After the first 15 s, reload when the elapsed time falls in the
            # first 2 s of a 20 s window.
            elapsed = time.monotonic() - start_time
            if elapsed > 15 and elapsed % 20 < 2:
                try:
                    await page.reload(wait_until='domcontentloaded', timeout=120000)
                except Exception:
                    # Best-effort: a failed reload just means we keep waiting.
                    logger.debug("Page reload failed while waiting on protection", exc_info=True)

        return False

    async def _handle_captcha(self, page: Page) -> bool:
        """Captchas are not solved (avoid strategy); always returns ``False``."""
        return False  # Avoid strategy