import asyncio
import random
import time
from typing import Optional

from playwright.async_api import (
    Browser,
    BrowserContext,
    Page,
    TimeoutError as PlaywrightTimeoutError,
)

from scraping_engine import FingerprintScrapingEngine


class StealthyFetcher:
    """Open pages in a Playwright context with retries and protection handling.

    Each fetch attempt opens a new page, navigates to the URL, runs the
    engine's human-behaviour helpers, and then checks the page HTML for
    Cloudflare / captcha markers before handing the page back to the caller.
    """

    def __init__(self, engine: FingerprintScrapingEngine, browser: Browser, context: BrowserContext):
        """Store the collaborators and the retry policy.

        Args:
            engine: Provides ``_human_like_scroll`` and
                ``_simulate_human_interaction``, which are awaited on each page.
            browser: Kept as an attribute; not used by the methods of this class.
            context: Browser context from which new pages are opened.
        """
        self.engine = engine
        self.browser = browser
        self.context = context
        # Maximum number of fetch attempts per URL.
        self.max_retries = 5
        # Base back-off in seconds; doubled after every failed attempt.
        self.base_delay = 5

    async def fetch_url(self, url: str, wait_for_selector: Optional[str] = None) -> Optional[Page]:
        """
        Fetch a URL using stealth techniques, handling Cloudflare and other
        protections intelligently.

        Args:
            url: Address to navigate to.
            wait_for_selector: Optional selector to wait for after navigation.
                A timeout while waiting is logged and does not fail the attempt.

        Returns:
            The open page on success (the caller is responsible for closing
            it), or ``None`` once every attempt has failed.
        """
        for attempt in range(self.max_retries):
            # Reset per attempt so the cleanup below can never touch a page
            # that belongs to (and was already closed by) an earlier attempt.
            page: Optional[Page] = None
            try:
                print(f"Attempt {attempt + 1} to fetch {url}")
                page = await self.context.new_page()
                await page.goto(url, wait_until='load', timeout=120000)

                if wait_for_selector:
                    try:
                        await page.wait_for_selector(wait_for_selector, timeout=40000)
                    except PlaywrightTimeoutError:
                        print(f"Selector {wait_for_selector} not found immediately, continuing...")

                await self._apply_human_behavior(page)

                protection_type = await self._detect_protection(page)
                if protection_type:
                    print(f"🛡️ Protection detected: {protection_type}")
                    content_accessible = await self._is_content_accessible(page, wait_for_selector)
                    if not content_accessible:
                        print("🔒 Content not accessible due to protection.")
                        handled = False
                        if protection_type == "cloudflare":
                            handled = await self._handle_cloudflare(page)
                        elif protection_type == "captcha":
                            handled = await self._handle_captcha(page)
                        if not handled:
                            print("❌ Failed to handle protection.")
                            await self._close_quietly(page)
                            await self._backoff(attempt)
                            continue
                    else:
                        print("✅ Protection present but content is accessible — proceeding.")

                print(f"✅ Successfully fetched {url}")
                return page
            except Exception as e:
                print(f"Attempt {attempt + 1} failed for {url}: {str(e)}")
                await self._close_quietly(page)
                await self._backoff(attempt)

        print(f"❌ Failed to fetch {url} after {self.max_retries} attempts.")
        return None

    @staticmethod
    async def _close_quietly(page: Optional[Page]) -> None:
        """Close ``page`` if there is one, reporting (not raising) close errors.

        A failure while closing must not abort the retry loop in ``fetch_url``.
        """
        if page is None:
            return
        try:
            await page.close()
        except Exception as close_error:
            print(f"⚠️ Could not close page cleanly: {close_error}")

    async def _backoff(self, attempt: int) -> None:
        """Sleep ``base_delay * 2 ** attempt`` seconds before the next attempt.

        Nothing is slept after the final attempt, because no retry follows it.
        """
        if attempt + 1 >= self.max_retries:
            return
        await asyncio.sleep(self.base_delay * (2 ** attempt))

    async def _apply_human_behavior(self, page: Page):
        """Run the engine's scroll and interaction helpers with random pauses."""
        await self.engine._human_like_scroll(page)
        await asyncio.sleep(random.uniform(1, 3))
        await self.engine._simulate_human_interaction(page)
        await asyncio.sleep(random.uniform(1, 2))

    async def _detect_protection(self, page: Page) -> Optional[str]:
        """Classify the page by substring markers in its lower-cased HTML.

        Returns:
            ``"cloudflare"``, ``"captcha"``, or ``None`` when no marker is found.
            Cloudflare markers are checked first.
        """
        # NOTE(review): these are plain substring checks, so a page that merely
        # mentions e.g. "cloudflare" or "robot" is also flagged; fetch_url
        # compensates by checking whether the content is accessible anyway.
        content = (await page.content()).lower()
        cloudflare_markers = (
            "#cf-chl",
            "checking your browser",
            "just a moment",
            "cloudflare",
            "ddos protection",
            "turnstile",
        )
        captcha_markers = ("captcha", "robot", "verify you're human")
        if any(marker in content for marker in cloudflare_markers):
            return "cloudflare"
        if any(marker in content for marker in captcha_markers):
            return "captcha"
        return None

    async def _is_content_accessible(self, page: Page, wait_for_selector: Optional[str] = None) -> bool:
        """Decide whether usable content is present despite a protection marker.

        Returns ``True`` when ``wait_for_selector`` appears within the timeout,
        or otherwise when the stripped body text is longer than 200 characters.
        Any error while reading the body text counts as "not accessible".
        """
        if wait_for_selector:
            try:
                await page.wait_for_selector(wait_for_selector, timeout=40000)
                return True
            except PlaywrightTimeoutError:
                pass
        try:
            body_text = await page.eval_on_selector("body", "el => el.innerText.toLowerCase()")
            return len(body_text.strip()) > 200
        except Exception:
            # Deliberately not a bare ``except``: task cancellation and
            # KeyboardInterrupt must keep propagating.
            return False

    async def _handle_captcha(self, page: Page) -> bool:
        """Apply the 'avoid' strategy: never solve a captcha, always return False."""
        print("🦾 Using 'avoid' strategy for captcha — skipping page.")
        return False

    async def _handle_cloudflare(self, page: Page) -> bool:
        """Wait up to 60 seconds for the Cloudflare markers to disappear.

        While waiting, human-like behaviour is simulated and the page is
        reloaded once 20 seconds have passed since the start or the previous
        reload.

        Returns:
            ``True`` as soon as no protection is detected, ``False`` on timeout.
        """
        max_wait_time = 60
        reload_interval = 20
        # Monotonic clock: elapsed-time arithmetic is immune to wall-clock jumps.
        start_time = time.monotonic()
        last_reload = start_time
        while time.monotonic() - start_time < max_wait_time:
            if not await self._detect_protection(page):
                print("☁️ Cloudflare challenge resolved.")
                return True
            print("☁️ Cloudflare active, waiting...")
            await self._apply_human_behavior(page)
            elapsed = time.monotonic() - start_time
            # The pause grows slowly with elapsed time and is capped at 10 s.
            wait_time = min(10, 2 + random.uniform(1, 3) + elapsed * 0.1)
            await asyncio.sleep(wait_time)
            # Track the last reload explicitly so the reload fires reliably,
            # instead of depending on the loop waking inside a narrow window.
            if time.monotonic() - last_reload >= reload_interval:
                print("🔄 Reloading page during Cloudflare wait...")
                await page.reload(wait_until='load', timeout=120000)
                last_reload = time.monotonic()
        print("⏰ Timeout waiting for Cloudflare resolution.")
        return False