Ofure Ikheloa d7d92ba8bb fix(job_scraper): increase timeout values for page navigation
The previous timeout values were too short for slower network conditions, causing premature timeouts during job scraping. Increased wait_for_function timeout from 30s to 80s and load_state timeout from 30s to 60s to accommodate slower page loads.
2025-11-27 12:28:21 +01:00

125 lines
5.2 KiB
Python

import asyncio
import random
import time
from playwright.async_api import Page, BrowserContext, Browser, TimeoutError as PlaywrightTimeoutError
from typing import Optional
from scraping_engine import FingerprintScrapingEngine
class StealthyFetcher:
    """Open pages in a shared browser context, retrying around anti-bot protections.

    Each fetch attempt opens a new page in ``context``, navigates to the URL,
    delegates "human-like" interaction to ``engine``, and then inspects the
    page markup for Cloudflare / captcha markers. Failed attempts are retried
    with exponential backoff (``base_delay * 2 ** attempt`` seconds).
    """

    def __init__(self, engine: FingerprintScrapingEngine, browser: Browser, context: BrowserContext):
        self.engine = engine
        self.browser = browser
        self.context = context
        # Maximum number of navigation attempts per URL.
        self.max_retries = 5
        # Base of the exponential backoff between attempts, in seconds.
        self.base_delay = 5

    async def fetch_url(self, url: str, wait_for_selector: Optional[str] = None) -> Optional[Page]:
        """
        Fetch a URL using stealth techniques, handling Cloudflare and other protections intelligently.

        Args:
            url: Address to navigate to.
            wait_for_selector: Optional selector that indicates the real
                content has rendered. Its absence after the initial wait is
                not treated as a failure.

        Returns:
            The open page on success (the caller is responsible for closing
            it), or ``None`` once ``max_retries`` attempts have failed.
        """
        for attempt in range(self.max_retries):
            # Reset per attempt so the error path never touches a page that
            # belongs to (and was already closed by) a previous iteration.
            page: Optional[Page] = None
            try:
                print(f"Attempt {attempt + 1} to fetch {url}")
                page = await self.context.new_page()
                await page.goto(url, wait_until='load', timeout=60000)

                if wait_for_selector:
                    try:
                        await page.wait_for_selector(wait_for_selector, timeout=10000)
                    except PlaywrightTimeoutError:
                        print(f"Selector {wait_for_selector} not found immediately, continuing...")

                await self._apply_human_behavior(page)

                protection_type = await self._detect_protection(page)
                if protection_type:
                    print(f"🛡️ Protection detected: {protection_type}")
                    content_accessible = await self._is_content_accessible(page, wait_for_selector)
                    if not content_accessible:
                        print("🔒 Content not accessible due to protection.")
                        handled = False
                        if protection_type == "cloudflare":
                            handled = await self._handle_cloudflare(page)
                        elif protection_type == "captcha":
                            handled = await self._handle_captcha(page)
                        if not handled:
                            print("❌ Failed to handle protection.")
                            await self._close_quietly(page)
                            await self._backoff(attempt)
                            continue
                    else:
                        print("✅ Protection present but content is accessible — proceeding.")

                print(f"✅ Successfully fetched {url}")
                return page
            except Exception as e:
                # Retry boundary: any navigation/interaction error just
                # consumes one attempt.
                print(f"Attempt {attempt + 1} failed for {url}: {str(e)}")
                await self._close_quietly(page)
                await self._backoff(attempt)

        print(f"❌ Failed to fetch {url} after {self.max_retries} attempts.")
        return None

    async def _backoff(self, attempt: int) -> None:
        """Sleep ``base_delay * 2 ** attempt`` seconds, unless no retry will follow."""
        # Waiting after the final attempt only delays the ``None`` result.
        if attempt + 1 < self.max_retries:
            await asyncio.sleep(self.base_delay * (2 ** attempt))

    @staticmethod
    async def _close_quietly(page: Optional[Page]) -> None:
        """Close ``page`` if one was opened; a failing close must not abort the retry loop."""
        if page is None:
            return
        try:
            await page.close()
        except Exception as exc:
            print(f"Failed to close page cleanly: {exc}")

    async def _apply_human_behavior(self, page: Page):
        """Run the engine's scroll and interaction routines with random pauses in between."""
        await self.engine._human_like_scroll(page)
        await asyncio.sleep(random.uniform(1, 3))
        await self.engine._simulate_human_interaction(page)
        await asyncio.sleep(random.uniform(1, 2))

    async def _detect_protection(self, page: Page) -> Optional[str]:
        """Classify the page as ``"cloudflare"``, ``"captcha"`` or ``None`` (no marker found).

        This is a plain substring search over the lower-cased page markup, so
        any occurrence of a marker anywhere in the HTML counts -- including
        generic words such as "cloudflare" or "robot" on otherwise normal
        pages. Callers should combine the result with
        ``_is_content_accessible`` before treating a page as blocked.
        """
        content = (await page.content()).lower()
        cloudflare_markers = (
            "#cf-chl",
            "checking your browser",
            "just a moment",
            "cloudflare",
            "ddos protection",
            "turnstile",
        )
        captcha_markers = ("captcha", "robot", "verify you're human")

        # Cloudflare markers take precedence over captcha markers.
        if any(marker in content for marker in cloudflare_markers):
            return "cloudflare"
        if any(marker in content for marker in captcha_markers):
            return "captcha"
        return None

    async def _is_content_accessible(self, page: Page, wait_for_selector: Optional[str] = None) -> bool:
        """Return True when the page appears to show real content.

        The page counts as accessible if ``wait_for_selector`` shows up within
        5 seconds, or otherwise if the body's inner text is longer than 200
        characters after stripping whitespace.
        """
        if wait_for_selector:
            try:
                await page.wait_for_selector(wait_for_selector, timeout=5000)
                return True
            except PlaywrightTimeoutError:
                pass  # Fall back to the body-text length check below.
        try:
            body_text = await page.eval_on_selector("body", "el => el.innerText.toLowerCase()")
            return len(body_text.strip()) > 200
        except Exception:
            # Best effort: an unreadable body means "not accessible". Not a
            # bare ``except`` so task cancellation still propagates.
            return False

    async def _handle_captcha(self, page: Page) -> bool:
        """Captchas are not solved; always report the page as unhandled."""
        print("🦾 Using 'avoid' strategy for captcha — skipping page.")
        return False

    async def _handle_cloudflare(self, page: Page) -> bool:
        """Wait up to 60 seconds for the protection markers to disappear.

        Between checks the engine's human-behaviour routine runs and the
        coroutine sleeps for a growing, jittered interval (capped at 10 s).
        The page is reloaded roughly every 20 seconds while waiting.

        Returns:
            True once ``_detect_protection`` no longer reports anything,
            False if the time budget runs out first.
        """
        max_wait_time = 60
        reload_interval = 20
        # Monotonic clock: elapsed time must not jump with wall-clock changes.
        start_time = time.monotonic()
        next_reload_at = reload_interval

        while time.monotonic() - start_time < max_wait_time:
            if not await self._detect_protection(page):
                print("☁️ Cloudflare challenge resolved.")
                return True

            print("☁️ Cloudflare active, waiting...")
            await self._apply_human_behavior(page)

            elapsed = time.monotonic() - start_time
            wait_time = min(10, 2 + random.uniform(1, 3) + elapsed * 0.1)
            await asyncio.sleep(wait_time)

            # Reload on a fixed schedule rather than only when the loop
            # happens to wake up inside a narrow time window; skip the reload
            # when the time budget is already spent.
            elapsed = time.monotonic() - start_time
            if next_reload_at <= elapsed < max_wait_time:
                print("🔄 Reloading page during Cloudflare wait...")
                await page.reload(wait_until='load', timeout=30000)
                next_reload_at = (time.monotonic() - start_time) + reload_interval

        print("⏰ Timeout waiting for Cloudflare resolution.")
        return False