Ofure Ikheloa c370de83d5 Refactor scraper and sender modules for improved Redis management and SSL connection handling
- Introduced RedisManager class in scraper.py for centralized Redis operations including job tracking and caching.
- Enhanced job scraping logic in MultiPlatformJobScraper to support multiple platforms (Ashby, Lever, Greenhouse).
- Updated browser initialization and context management to ensure better resource handling.
- Improved error handling and logging throughout the scraping process.
- Added SSL connection parameters management in a new ssl_connection.py module for RabbitMQ connections.
- Refactored sender.py to utilize RedisManager for job deduplication and improved logging mechanisms.
- Enhanced CSV processing logic in sender.py with better validation and error handling.
- Updated connection parameters for RabbitMQ to support SSL configurations based on environment variables.
2025-12-12 13:48:26 +01:00

97 lines
4.0 KiB
Python

import asyncio
import random
import time
from playwright.async_api import Page, BrowserContext, Browser
from typing import Optional
from scraping_engine import FingerprintScrapingEngine
class StealthyFetcher:
    """Open pages in a shared browser context and try to get past bot protection.

    The fetcher relies on the supplied engine for human-like scrolling and
    interaction, and inspects the page HTML for well-known Cloudflare / CAPTCHA
    markers to decide whether the real content is reachable.
    """

    # Lower-case substrings whose presence in the page HTML is treated as a
    # Cloudflare-style interstitial.
    _CLOUDFLARE_MARKERS = (
        "#cf-chl",
        "checking your browser",
        "just a moment",
        "cloudflare",
        "ddos protection",
        "turnstile",
    )
    # Lower-case substrings treated as a CAPTCHA / human-verification wall.
    _CAPTCHA_MARKERS = ("captcha", "robot", "verify you're human")
    # Words expected in the visible text of a genuine job posting.
    _JOB_KEYWORDS = ('job', 'role', 'apply', 'responsibilities', 'requirements', 'qualifications')

    def __init__(self, engine: FingerprintScrapingEngine, browser: Browser, context: BrowserContext):
        """Store the collaborators used for every fetch.

        Args:
            engine: Provides ``_human_like_scroll`` and
                ``_simulate_human_interaction``.
            browser: Browser instance; kept for reference only.
            context: Browser context from which new pages are opened.
        """
        self.engine = engine
        self.browser = browser
        self.context = context
        # Retry settings; not referenced anywhere inside this class.
        self.max_retries = 5
        self.base_delay = 5

    async def fetch_url(self, url: str, wait_for_selector: Optional[str] = None, timeout=300000) -> Optional[Page]:
        """Open ``url`` in a new page of the shared context and return that page.

        Args:
            url: Address to navigate to.
            wait_for_selector: Currently not used by this method.
            timeout: Overall budget in milliseconds; navigation itself is
                capped at 60 000 ms.

        Returns:
            The open page (the caller is responsible for closing it), or
            ``None`` when a protection page was detected and could not be
            bypassed. In the ``None`` case the page has already been closed.

        Raises:
            Exception: Any error raised while opening or inspecting the page
                is re-raised after the page has been closed.
        """
        page = None
        try:
            page = await self.context.new_page()
            # Wait for DOMContentLoaded only; navigation is capped at 60 s.
            await page.goto(url, wait_until='domcontentloaded', timeout=min(timeout, 60000))
            # Skip human behavior for Lever (already loads fully without it)
            if "lever.co" not in url:
                await self._apply_human_behavior(page)
            protection_type = await self._detect_protection(page)
            # A marker alone is not conclusive (e.g. a page that merely mentions
            # "cloudflare"), so only act when the real content is unreachable.
            if protection_type and not await self._is_content_accessible(page):
                handled = False
                if protection_type == "cloudflare":
                    handled = await self._handle_cloudflare(page)
                elif protection_type == "captcha":
                    handled = await self._handle_captcha(page)
                if not handled:
                    # The caller never receives this page, so it must be closed
                    # here or it would stay open in the context forever.
                    await self._close_quietly(page)
                    return None
            return page
        except Exception:
            await self._close_quietly(page)
            raise

    @staticmethod
    async def _close_quietly(page: Optional[Page]) -> None:
        """Close ``page`` if one was opened, ignoring errors raised by the close."""
        if page is None:
            return
        try:
            await page.close()
        except Exception:
            # Best-effort cleanup: a failed close must not mask the real outcome.
            pass

    async def _apply_human_behavior(self, page: Page):
        """Scroll and interact through the engine, with random pauses in between."""
        await self.engine._human_like_scroll(page)
        await asyncio.sleep(random.uniform(1, 3))
        await self.engine._simulate_human_interaction(page)
        await asyncio.sleep(random.uniform(1, 2))

    async def _detect_protection(self, page: Page) -> Optional[str]:
        """Classify the page by scanning its lower-cased HTML for known markers.

        Returns:
            ``"cloudflare"``, ``"captcha"``, or ``None`` when no marker is found.
            Cloudflare markers take precedence over CAPTCHA markers.
        """
        content = (await page.content()).lower()
        if any(marker in content for marker in self._CLOUDFLARE_MARKERS):
            return "cloudflare"
        if any(marker in content for marker in self._CAPTCHA_MARKERS):
            return "captcha"
        return None

    async def _is_content_accessible(self, page: Page, wait_for_selector: Optional[str] = None) -> bool:
        """Report whether the page body looks like a real job posting.

        The body must contain at least 100 characters of visible text and at
        least one job-related keyword. ``wait_for_selector`` is currently not
        used. Any error while querying the page yields ``False``.
        """
        try:
            await page.wait_for_selector("body", timeout=60000)
            body_text = await page.eval_on_selector("body", "el => el.innerText.toLowerCase()")
            if len(body_text.strip()) < 100:
                return False
            return any(word in body_text for word in self._JOB_KEYWORDS)
        except Exception:
            # Deliberately not a bare ``except``: task cancellation and
            # KeyboardInterrupt must propagate instead of being reported as
            # "content not accessible".
            return False

    async def _handle_cloudflare(self, page: Page) -> bool:
        """Wait for a Cloudflare-style challenge to clear.

        Polls for up to 60 seconds, simulating human behavior between checks
        and reloading the page at most once per 20-second window.

        Returns:
            ``True`` once no protection marker is detected, ``False`` if the
            markers are still present when the time budget is exhausted.
        """
        max_wait_time = 60
        reload_interval = 20
        # Monotonic clock: immune to wall-clock adjustments during the wait.
        start_time = time.monotonic()
        last_reload_slot = 0
        while time.monotonic() - start_time < max_wait_time:
            if not await self._detect_protection(page):
                return True
            await self._apply_human_behavior(page)
            # Back off gradually: the pause grows with elapsed time, capped at 10 s.
            elapsed = time.monotonic() - start_time
            wait_time = min(10, 2 + random.uniform(1, 3) + elapsed * 0.1)
            await asyncio.sleep(wait_time)
            # Reload once each time a new 20 s window is entered. Tracking the
            # window index makes this deterministic, unlike testing whether the
            # elapsed time happens to land in a narrow slice of each window.
            elapsed = time.monotonic() - start_time
            reload_slot = int(elapsed // reload_interval)
            if reload_slot > last_reload_slot and elapsed < max_wait_time:
                last_reload_slot = reload_slot
                try:
                    await page.reload(wait_until='domcontentloaded', timeout=120000)
                except Exception:
                    # A failed reload is not fatal; keep polling the current page.
                    pass
        # The challenge may have cleared during the final pause.
        return not await self._detect_protection(page)

    async def _handle_captcha(self, page: Page) -> bool:
        """CAPTCHAs are not solved; always report failure so the page is skipped."""
        return False  # Avoid strategy