- Introduced `RedisManager` class in `scraper.py` for centralized Redis operations, including job tracking and caching.
- Enhanced job-scraping logic in `MultiPlatformJobScraper` to support multiple platforms (Ashby, Lever, Greenhouse).
- Updated browser initialization and context management to ensure better resource handling.
- Improved error handling and logging throughout the scraping process.
- Added SSL connection-parameter management in a new `ssl_connection.py` module for RabbitMQ connections.
- Refactored `sender.py` to use `RedisManager` for job deduplication and improved its logging.
- Enhanced CSV-processing logic in `sender.py` with better validation and error handling.
- Updated RabbitMQ connection parameters to support SSL configurations based on environment variables.
97 lines
4.0 KiB
Python
97 lines
4.0 KiB
Python
|
|
import asyncio
|
|
import random
|
|
import time
|
|
from playwright.async_api import Page, BrowserContext, Browser
|
|
from typing import Optional
|
|
from scraping_engine import FingerprintScrapingEngine
|
|
|
|
|
|
class StealthyFetcher:
    """Fetch job pages through a shared Playwright browser/context while
    simulating human behavior and handling bot-protection walls
    (Cloudflare challenges, CAPTCHAs).

    The caller owns the ``Browser``/``BrowserContext``; this class only
    creates pages inside the provided context.
    """

    def __init__(self, engine: FingerprintScrapingEngine, browser: Browser, context: BrowserContext):
        # Engine supplies the human-behavior primitives used below
        # (_human_like_scroll / _simulate_human_interaction).
        self.engine = engine
        self.browser = browser
        self.context = context
        # NOTE(review): max_retries/base_delay are never read in this class —
        # presumably retry policy lives in the caller; confirm before removing.
        self.max_retries = 5
        self.base_delay = 5

    async def fetch_url(self, url: str, wait_for_selector: Optional[str] = None, timeout=300000) -> Optional[Page]:
        """
        Fetch URL using the provided context (caller handles page creation).

        Returns the open ``Page`` on success (the caller owns it and must
        close it), ``None`` when a protection wall could not be bypassed,
        and re-raises any other error after closing the page.
        """
        page = None
        try:
            page = await self.context.new_page()
            # 'domcontentloaded' works reliably for Ashby, Lever, and Greenhouse.
            # Navigation timeout is capped at 60s regardless of the caller's value.
            await page.goto(url, wait_until='domcontentloaded', timeout=min(timeout, 60000))

            # Skip human behavior for Lever (already loads fully without it)
            if "lever.co" not in url:
                await self._apply_human_behavior(page)

            protection_type = await self._detect_protection(page)
            if protection_type:
                # Fix: forward the caller's selector (was accepted but dropped).
                content_accessible = await self._is_content_accessible(page, wait_for_selector)
                if not content_accessible:
                    handled = False
                    if protection_type == "cloudflare":
                        handled = await self._handle_cloudflare(page)
                    elif protection_type == "captcha":
                        handled = await self._handle_captcha(page)
                    if not handled:
                        # Fix: close the page before giving up so it doesn't leak.
                        try:
                            await page.close()
                        except Exception:
                            pass
                        return None

            return page
        except Exception:
            # Best-effort cleanup; never let close() mask the original error.
            try:
                if page:
                    await page.close()
            except Exception:
                pass
            raise

    async def _apply_human_behavior(self, page: Page):
        """Scroll and interact like a human, with randomized pauses."""
        await self.engine._human_like_scroll(page)
        await asyncio.sleep(random.uniform(1, 3))
        await self.engine._simulate_human_interaction(page)
        await asyncio.sleep(random.uniform(1, 2))

    async def _detect_protection(self, page: Page) -> Optional[str]:
        """Classify the current page: 'cloudflare', 'captcha', or None.

        Purely heuristic keyword matching on the lowercased page HTML.
        """
        content = (await page.content()).lower()
        if ("#cf-chl" in content or "checking your browser" in content or
                "just a moment" in content or "cloudflare" in content or
                "ddos protection" in content or "turnstile" in content):
            return "cloudflare"
        elif "captcha" in content or "robot" in content or "verify you're human" in content:
            return "captcha"
        return None

    async def _is_content_accessible(self, page: Page, wait_for_selector: Optional[str] = None) -> bool:
        """Heuristic check that real job content (not a challenge page) is visible.

        Waits for *wait_for_selector* when given (falling back to ``body``),
        then requires a non-trivial amount of body text containing at least
        one job-related keyword.
        """
        try:
            # Fix: honor the caller-supplied selector instead of ignoring it.
            await page.wait_for_selector(wait_for_selector or "body", timeout=60000)
            body_text = await page.eval_on_selector("body", "el => el.innerText.toLowerCase()")
            if len(body_text.strip()) < 100:
                return False
            job_keywords = ['job', 'role', 'apply', 'responsibilities', 'requirements', 'qualifications']
            return any(word in body_text for word in job_keywords)
        except Exception:  # Fix: no bare except — keep KeyboardInterrupt/SystemExit fatal
            return False

    async def _handle_cloudflare(self, page: Page) -> bool:
        """Wait out a Cloudflare challenge for up to 60 seconds.

        Re-checks the page, simulates human activity, backs off with a
        slowly-growing randomized delay, and occasionally reloads after
        the first 15 seconds. Returns True once the challenge clears.
        """
        max_wait_time = 60
        start_time = time.time()
        while time.time() - start_time < max_wait_time:
            if not await self._detect_protection(page):
                return True
            await self._apply_human_behavior(page)
            # Delay grows with elapsed time but is capped at 10s.
            wait_time = min(10, 2 + random.uniform(1, 3) + (time.time() - start_time) * 0.1)
            await asyncio.sleep(wait_time)
            # Roughly every 20s (after the first 15s), try a reload to
            # nudge the challenge along; ignore reload failures.
            if (time.time() - start_time) > 15 and (time.time() - start_time) % 20 < 2:
                try:
                    await page.reload(wait_until='domcontentloaded', timeout=120000)
                except Exception:
                    pass
        return False

    async def _handle_captcha(self, page: Page) -> bool:
        """CAPTCHAs are not solved — the strategy is to avoid them entirely."""
        return False  # Avoid strategy