fix(job_scraper): increase timeout values for page navigation

The previous timeout values were too short for slower network conditions, causing premature timeouts during job scraping. Increased wait_for_function timeout from 30s to 80s and load_state timeout from 30s to 60s to accommodate slower page loads.
Ofure Ikheloa 2025-11-27 12:28:21 +01:00
parent d025828036
commit d7d92ba8bb
6 changed files with 487 additions and 307 deletions
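For illustration only, a minimal, hypothetical Playwright sketch of where the two timeout budgets described in the commit message would apply; the URL, wait predicate, and helper name are placeholders, not code from this repository:

import asyncio
from playwright.async_api import async_playwright

async def _timeout_demo(url: str) -> None:
    # Hypothetical example: navigation/load-state waits get the longer 60s budget,
    # while a JS-condition wait gets the 80s budget mentioned above.
    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(url, wait_until="load", timeout=60000)
        await page.wait_for_load_state("load", timeout=60000)
        await page.wait_for_function("() => document.readyState === 'complete'", timeout=80000)
        await browser.close()

asyncio.run(_timeout_demo("https://example.com"))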


@@ -8,9 +8,9 @@ from dotenv import load_dotenv
load_dotenv()
# LLM Agent Configuration
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
GEMINI_API_KEY = os.getenv("XAI_API_KEY")
if not GEMINI_API_KEY:
raise ValueError("GEMINI_API_KEY environment variable not set in .env file")
raise ValueError("XAI_API_KEY environment variable not set in .env file")
def load_spoof_config():

fetcher.py (new file)

@@ -0,0 +1,125 @@
import asyncio
import random
import time
from playwright.async_api import Page, BrowserContext, Browser, TimeoutError as PlaywrightTimeoutError
from typing import Optional
from scraping_engine import FingerprintScrapingEngine
class StealthyFetcher:
    def __init__(self, engine: FingerprintScrapingEngine, browser: Browser, context: BrowserContext):
        self.engine = engine
        self.browser = browser
        self.context = context
        self.max_retries = 5
        self.base_delay = 5
    async def fetch_url(self, url: str, wait_for_selector: Optional[str] = None) -> Optional[Page]:
        """
        Fetch a URL using stealth techniques, handling Cloudflare and other protections intelligently.
        """
        for attempt in range(self.max_retries):
            try:
                print(f"Attempt {attempt + 1} to fetch {url}")
                page = await self.context.new_page()
                await page.goto(url, wait_until='load', timeout=60000)
                if wait_for_selector:
                    try:
                        await page.wait_for_selector(wait_for_selector, timeout=10000)
                    except PlaywrightTimeoutError:
                        print(f"Selector {wait_for_selector} not found immediately, continuing...")
                await self._apply_human_behavior(page)
                protection_type = await self._detect_protection(page)
                if protection_type:
                    print(f"🛡️ Protection detected: {protection_type}")
                    content_accessible = await self._is_content_accessible(page, wait_for_selector)
                    if not content_accessible:
                        print("🔒 Content not accessible due to protection.")
                        handled = False
                        if protection_type == "cloudflare":
                            handled = await self._handle_cloudflare(page)
                        elif protection_type == "captcha":
                            handled = await self._handle_captcha(page)
                        if not handled:
                            print("❌ Failed to handle protection.")
                            await page.close()
                            await asyncio.sleep(self.base_delay * (2 ** attempt))
                            continue
                    else:
                        print("✅ Protection present but content is accessible — proceeding.")
                print(f"✅ Successfully fetched {url}")
                return page
            except Exception as e:
                print(f"Attempt {attempt + 1} failed for {url}: {str(e)}")
                if 'page' in locals():
                    await page.close()
                await asyncio.sleep(self.base_delay * (2 ** attempt))
        print(f"❌ Failed to fetch {url} after {self.max_retries} attempts.")
        return None
    async def _apply_human_behavior(self, page: Page):
        await self.engine._human_like_scroll(page)
        await asyncio.sleep(random.uniform(1, 3))
        await self.engine._simulate_human_interaction(page)
        await asyncio.sleep(random.uniform(1, 2))
    async def _detect_protection(self, page: Page) -> Optional[str]:
        content = (await page.content()).lower()
        if (
            "#cf-chl" in content
            or "checking your browser" in content
            or "just a moment" in content
            or "cloudflare" in content
            or "ddos protection" in content
            or "turnstile" in content
        ):
            return "cloudflare"
        elif "captcha" in content or "robot" in content or "verify you're human" in content:
            return "captcha"
        return None
    async def _is_content_accessible(self, page: Page, wait_for_selector: Optional[str] = None) -> bool:
        if wait_for_selector:
            try:
                await page.wait_for_selector(wait_for_selector, timeout=5000)
                return True
            except PlaywrightTimeoutError:
                pass
        try:
            body_text = await page.eval_on_selector("body", "el => el.innerText.toLowerCase()")
            return len(body_text.strip()) > 200
        except:
            return False
    async def _handle_captcha(self, page: Page) -> bool:
        print("🦾 Using 'avoid' strategy for captcha — skipping page.")
        return False
    async def _handle_cloudflare(self, page: Page) -> bool:
        max_wait_time = 60
        start_time = time.time()
        while time.time() - start_time < max_wait_time:
            if not await self._detect_protection(page):
                print("☁️ Cloudflare challenge resolved.")
                return True
            print("☁️ Cloudflare active, waiting...")
            await self._apply_human_behavior(page)
            wait_time = min(10, 2 + random.uniform(1, 3) + (time.time() - start_time) * 0.1)
            await asyncio.sleep(wait_time)
            if (time.time() - start_time) > 15 and (time.time() - start_time) % 20 < 2:
                print("🔄 Reloading page during Cloudflare wait...")
                await page.reload(wait_until='load', timeout=30000)
        print("⏰ Timeout waiting for Cloudflare resolution.")
        return False
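A hedged usage sketch for the new StealthyFetcher class; the engine constructor arguments and the target URL are assumptions made for illustration, not values taken from this repository:

import asyncio
from playwright.async_api import async_playwright
from scraping_engine import FingerprintScrapingEngine
from fetcher import StealthyFetcher

async def _fetch_demo() -> None:
    engine = FingerprintScrapingEngine(seed="demo_seed", target_os="windows")  # assumed kwargs
    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=False)
        context = await browser.new_context()
        fetcher = StealthyFetcher(engine, browser, context)
        # fetch_url retries up to max_retries times with exponential backoff (base_delay * 2 ** attempt).
        page = await fetcher.fetch_url("https://example.com/jobs/view/123", wait_for_selector="h1")
        if page:
            print(await page.title())
            await page.close()
        await browser.close()

asyncio.run(_fetch_demo())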


@@ -1,14 +1,13 @@
import asyncio
import random
import sqlite3
import os
from datetime import datetime
from typing import Optional, Dict, List
from playwright.async_api import async_playwright
from typing import Optional, Dict
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
from browserforge.injectors.playwright import AsyncNewContext
from llm_agent import LLMJobRefiner
import re
from fetcher import StealthyFetcher
class LinkedInJobScraper:
@@ -17,12 +16,12 @@ class LinkedInJobScraper:
engine,
db_path: str = "linkedin_jobs.db",
human_speed: float = 1.0,
target_field: str = "all"
user_request: str = "Extract all standard job details"
):
self.engine = engine
self.db_path = db_path
self.human_speed = human_speed
self.target_field = target_field
self.user_request = user_request
self._init_db()
self.llm_agent = LLMJobRefiner()
@@ -61,7 +60,6 @@ class LinkedInJobScraper:
return False
async def _login(self, page, credentials: Dict) -> bool:
"""Human-realistic LinkedIn login"""
print("🔐 Navigating to LinkedIn login page...")
await page.goto("https://www.linkedin.com/login", timeout=60000)
await asyncio.sleep(random.uniform(2.0, 3.5) * self.human_speed)
@@ -107,45 +105,27 @@
return False
async def _extract_all_page_content(self, page) -> str:
"""Extract all content from the job page"""
await asyncio.sleep(2 * self.human_speed)
# Human-like scrolling to load all content
await self.engine._human_like_scroll(page)
await asyncio.sleep(2 * self.human_speed)
# Get the full page content
page_content = await page.content()
return page_content
def _calculate_keyword_match(self, title: str, keywords: str) -> float:
"""Calculate percentage of keywords matched in title"""
if not title or not keywords:
return 0.0
title_lower = title.lower()
keyword_list = [kw.strip().lower() for kw in keywords.split()]
matches = 0
for keyword in keyword_list:
if keyword in title_lower:
matches += 1
matches = sum(1 for kw in keyword_list if kw in title_lower)
return matches / len(keyword_list) if keyword_list else 0.0
def _extract_location_from_keywords(self, search_keywords: str) -> str:
"""Extract location from search keywords if present"""
location_match = re.search(r'location:\s*([^,]+)', search_keywords, re.IGNORECASE)
if location_match:
return location_match.group(1).strip().lower()
return ""
return location_match.group(1).strip().lower() if location_match else ""
async def _scrape_jobs_from_current_page(self, page, search_keywords: str, seen_job_ids, all_job_links):
"""Scrape job links from the current page that match keywords and location"""
current_links = await page.query_selector_all("a[href*='/jobs/view/']")
new_jobs = 0
# Extract location from search keywords
location_from_keywords = self._extract_location_from_keywords(search_keywords)
for link in current_links:
@@ -155,22 +135,18 @@ class LinkedInJobScraper:
job_id = href.split("/view/")[-1].split("/")[0] if "/view/" in href else href
if job_id and job_id not in seen_job_ids:
# Check if job title matches keywords (at least 70% match)
title_element = await link.query_selector("span.job-title, h3, .job-card-title")
if title_element:
title = await title_element.inner_text()
match_percentage = self._calculate_keyword_match(title, search_keywords)
# Check if location matches (if specified in keywords)
location_match = True
if location_from_keywords:
# Try to get location from the job card
location_element = await link.query_selector("span.job-location, .job-card-location, .location")
if location_element:
location_text = await location_element.inner_text()
location_match = location_from_keywords in location_text.lower()
if match_percentage >= 0.7 and location_match: # At least 70% match and location matches
if match_percentage >= 0.7 and location_match:
seen_job_ids.add(job_id)
all_job_links.append((href, title))
new_jobs += 1
@@ -179,28 +155,22 @@ class LinkedInJobScraper:
elif not location_match:
print(f" ⚠️ Skipping job due to location mismatch: {title[:50]}... (expected: {location_from_keywords})")
else:
# If no title element, still add to check later
seen_job_ids.add(job_id)
all_job_links.append((href, "Unknown Title"))
new_jobs += 1
return new_jobs
async def _handle_pagination(self, page, search_keywords: str, seen_job_ids, all_job_links):
"""Handle pagination by going through pages"""
current_page = 1
while True:
print(f"📄 Processing page {current_page}")
# Collect job links on current page
new_jobs = await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links)
print(f" Found {new_jobs} new job(s) on page {current_page} (total: {len(all_job_links)})")
# Try to go to next page
next_btn = await page.query_selector("button[aria-label='Next']")
if next_btn and await next_btn.is_enabled():
await self._human_click(page, next_btn)
await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed)
# Wait for URL to change or new content
try:
await page.wait_for_function("() => window.location.href.includes('start=')", timeout=60000)
except:
@@ -211,7 +181,6 @@ class LinkedInJobScraper:
break
async def _handle_infinite_scroll(self, page, search_keywords: str, seen_job_ids, all_job_links):
"""Handle infinite scroll to load more jobs"""
last_height = await page.evaluate("document.body.scrollHeight")
no_new_jobs_count = 0
max_no_new = 3
@@ -221,7 +190,6 @@ class LinkedInJobScraper:
await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)
new_jobs_found = await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links)
print(f" Found {new_jobs_found} new job(s) (total: {len(all_job_links)})")
new_height = await page.evaluate("document.body.scrollHeight")
@@ -241,19 +209,15 @@
max_pages: int = 1,
credentials: Optional[Dict] = None
):
# Parse location from keywords if present
location_match = re.search(r'location:\s*([^,]+)', search_keywords, re.IGNORECASE)
location = location_match.group(1).strip() if location_match else ""
# Remove location part from keywords for search
clean_keywords = re.sub(r'location:\s*[^,]+', '', search_keywords, flags=re.IGNORECASE).strip()
encoded_keywords = clean_keywords.replace(" ", "%20")
# Build search URL with location if specified
search_url = f"https://www.linkedin.com/jobs/search/?keywords={encoded_keywords}"
if location:
search_url += f"&location={location.replace(' ', '%20')}"
profile = self.engine._select_profile()
renderer = random.choice(self.engine.common_renderers[self.engine.os])
vendor = random.choice(self.engine.common_vendors)
@@ -261,11 +225,11 @@ class LinkedInJobScraper:
async with async_playwright() as pw:
browser = await pw.chromium.launch(
headless= False,
headless=False,
args=['--disable-blink-features=AutomationControlled']
)
context = await AsyncNewContext(browser, fingerprint=profile)
await context.add_init_script(f"""
Object.defineProperty(navigator, 'hardwareConcurrency', {{ get: () => {profile.navigator.hardwareConcurrency} }});
Object.defineProperty(navigator, 'deviceMemory', {{ get: () => {profile.navigator.deviceMemory} }});
@@ -275,6 +239,9 @@ class LinkedInJobScraper:
page = await context.new_page()
# Create a temporary fetcher for protection checks on main page
temp_fetcher = StealthyFetcher(self.engine, browser, context)
session_loaded = await self.engine.load_session(context)
login_successful = False
@@ -301,118 +268,111 @@
elif not credentials:
print(" No credentials — proceeding as guest.")
login_successful = True
else:
pass
await page.wait_for_load_state("load", timeout=60000)
print("✅ Post-login page fully loaded. Starting search...")
if await self.engine._detect_cloudflare(page):
print("☁️ Cloudflare detected on initial load.")
if not await self.engine._handle_cloudflare(page):
print("❌ Cloudflare could not be resolved.")
await browser.close()
self.engine.report_outcome("cloudflare")
return
# >>> PROTECTION CHECK USING FETCHER LOGIC <<<
protection_type = await temp_fetcher._detect_protection(page)
if protection_type:
print(f"🛡️ Protection detected on initial page: {protection_type}")
content_accessible = await temp_fetcher._is_content_accessible(page)
if not content_accessible:
print("🔒 Content not accessible.")
handled = False
if protection_type == "cloudflare":
handled = await self.engine._handle_cloudflare(page)
elif protection_type == "captcha":
handled = False
if not handled:
await browser.close()
self.engine.report_outcome("protection_block")
return
else:
print("✅ Protection present but content accessible — proceeding.")
print(f"🔍 Searching for: {search_keywords}")
await page.goto(search_url, wait_until='load', timeout=60000)
await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed)
if await self.engine._detect_cloudflare(page):
print("☁️ Cloudflare detected on search page.")
if not await self.engine._handle_cloudflare(page):
await browser.close()
self.engine.report_outcome("cloudflare")
return
# >>> PROTECTION CHECK ON SEARCH PAGE <<<
protection_type = await temp_fetcher._detect_protection(page)
if protection_type:
print(f"🛡️ Protection detected on search page: {protection_type}")
content_accessible = await temp_fetcher._is_content_accessible(page)
if not content_accessible:
print("🔒 Content not accessible.")
handled = False
if protection_type == "cloudflare":
handled = await self.engine._handle_cloudflare(page)
elif protection_type == "captcha":
handled = False
if not handled:
await browser.close()
self.engine.report_outcome("protection_block")
return
else:
print("✅ Protection present but content accessible — proceeding.")
all_job_links = []
seen_job_ids = set()
# First, scrape the initial page
print("🔄 Collecting initial job links...")
initial_jobs = await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links)
print(f" Found {initial_jobs} initial job(s) (total: {len(all_job_links)})")
# Loop until no new jobs are found
iteration = 1
while True:
print(f"🔄 Iteration {iteration}: Checking for new jobs...")
# First try infinite scroll
prev_job_count = len(all_job_links)
await self._handle_infinite_scroll(page, search_keywords, seen_job_ids, all_job_links)
new_jobs_count = len(all_job_links) - prev_job_count
if new_jobs_count > 0:
print(f" Found {new_jobs_count} new jobs via infinite scroll")
iteration += 1
continue # Continue with infinite scroll if new jobs found
# If no new jobs via scroll, check for pagination
continue
pagination_exists = await page.query_selector("button[aria-label='Next']")
if pagination_exists:
print("⏭️ Pagination detected. Processing pages...")
await self._handle_pagination(page, search_keywords, seen_job_ids, all_job_links)
iteration += 1
continue # Continue with pagination if new jobs found
continue
else:
# If no pagination and no new jobs from scroll, check by refreshing
print("🔄 Refreshing page to check for new results...")
await page.reload(wait_until='load')
await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)
# Check for new jobs after refresh
new_jobs_after_refresh = await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links)
if new_jobs_after_refresh > 0:
print(f" Found {new_jobs_after_refresh} new job(s) after refresh")
iteration += 1
continue # Continue if new jobs found after refresh
continue
else:
print("🔚 No new jobs found after refresh. Stopping.")
break
# Limit iterations to prevent infinite loops
if iteration > 10:
print("🔄 Maximum iterations reached. Stopping.")
break
print(f"✅ Collected {len(all_job_links)} unique job links.")
# Process all collected job links
scraped_count = 0
for idx, (href, title) in enumerate(all_job_links):
try:
full_url = href if href.startswith("http") else f"https://www.linkedin.com{href}"
print(f" → Opening job {idx+1}/{len(all_job_links)}: {full_url}")
await page.goto(full_url, wait_until='load', timeout=60000)
await asyncio.sleep(3 * self.human_speed)
is_cloudflare = await self.engine._detect_cloudflare(page)
page_content = await page.content()
has_captcha_text = "captcha" in page_content.lower()
captcha_present = is_cloudflare or has_captcha_text
title_element = await page.query_selector("h1.t-24")
job_data_accessible = title_element is not None
if captcha_present:
if job_data_accessible:
print(" ⚠️ CAPTCHA detected, but job data is accessible. Proceeding in stealth mode...")
await self.engine._avoid_captcha(page)
else:
print(" ⚠️ CAPTCHA detected and job data blocked. Attempting recovery...")
if not await self.engine._solve_captcha_fallback(page):
print(" ❌ CAPTCHA recovery failed. Skipping job.")
continue
title_element = await page.query_selector("h1.t-24")
if not title_element:
print(" ❌ Job data still unavailable after CAPTCHA handling. Skipping.")
continue
if not captcha_present:
await self.engine._avoid_captcha(page)
fetcher = StealthyFetcher(self.engine, browser, context)
job_page = await fetcher.fetch_url(full_url, wait_for_selector="h1.t-24")
if not job_page:
print(f" ❌ Failed to fetch job page {full_url} after retries.")
self.engine.report_outcome("fetch_failure", url=full_url)
continue
apply_btn = None
apply_selectors = [
@@ -422,19 +382,19 @@ class LinkedInJobScraper:
"button:has-text('Easy Apply')"
]
for selector in apply_selectors:
apply_btn = await page.query_selector(selector)
apply_btn = await job_page.query_selector(selector)
if apply_btn:
break
page_data = None
final_url = full_url
final_url = job_page.url
if apply_btn:
print(" → Clicking 'Apply' / 'Easy Apply' button...")
page_waiter = asyncio.create_task(context.wait_for_event("page"))
await self._human_click(page, apply_btn, wait_after=False)
await self._human_click(job_page, apply_btn, wait_after=False)
external_page = None
try:
external_page = await asyncio.wait_for(page_waiter, timeout=5.0)
@@ -443,68 +403,66 @@
await asyncio.sleep(2 * self.human_speed)
await self.engine._human_like_scroll(external_page)
await asyncio.sleep(2 * self.human_speed)
page_data = await self._extract_all_page_content(external_page)
final_url = external_page.url
if not external_page.is_closed():
await external_page.close()
except asyncio.TimeoutError:
print(" 🖥️ No external tab — scraping LinkedIn job page.")
await page.wait_for_timeout(2000)
print(" 🖥️ No external tab — scraping LinkedIn job page directly.")
await job_page.wait_for_timeout(2000)
try:
await page.wait_for_selector("div.jobs-apply-button--fixed, div.jobs-easy-apply-modal", timeout=8000)
except:
await job_page.wait_for_selector("div.jobs-apply-button--fixed, div.jobs-easy-apply-modal", timeout=8000)
except PlaywrightTimeoutError:
pass
await self.engine._human_like_scroll(page)
await self.engine._human_like_scroll(job_page)
await asyncio.sleep(2 * self.human_speed)
page_data = await self._extract_all_page_content(page)
final_url = page.url
page_data = await self._extract_all_page_content(job_page)
else:
print(" ⚠️ No 'Apply' button found — scraping job details directly.")
await self.engine._human_like_scroll(page)
await self.engine._human_like_scroll(job_page)
await asyncio.sleep(2 * self.human_speed)
page_data = await self._extract_all_page_content(page)
final_url = page.url
page_data = await self._extract_all_page_content(job_page)
# Extract job ID from URL
job_id = final_url.split("/")[-2] if "/jobs/view/" in final_url else "unknown"
# Prepare raw data for LLM processing
raw_data = {
"page_content": page_data,
"url": final_url,
"job_id": job_id
"url": job_page.url,
"job_id": job_page.url.split("/")[-2] if "/jobs/view/" in job_page.url else "unknown"
}
# Send raw data to LLM agent for refinement
refined_data = await self.llm_agent.refine_job_data(raw_data, search_keywords)
# Only save if LLM successfully extracted meaningful data
refined_data = await self.llm_agent.refine_job_data(raw_data, self.user_request)
if refined_data and refined_data.get("title", "N/A") != "N/A":
# Save refined data to markdown and database through LLM agent
await self.llm_agent.save_job_data(refined_data, search_keywords)
scraped_count += 1
print(f" ✅ Scraped and refined: {refined_data['title'][:50]}...")
self.engine.report_outcome("success", url=raw_data["url"])
else:
print(f" 🟡 Could not extract meaningful data from: {final_url}")
self.engine.report_outcome("llm_failure", url=raw_data["url"])
await job_page.close()
except Exception as e:
print(f" ⚠️ Failed on job {idx+1}: {str(e)[:100]}")
if 'job_page' in locals() and job_page:
await job_page.close()
continue
finally:
print(" ↩️ Returning to LinkedIn search results...")
await page.goto(search_url, timeout=60000)
await asyncio.sleep(4 * self.human_speed)
await browser.close()
if scraped_count > 0:
self.engine.report_outcome("success")
print(f"✅ Completed! Processed {scraped_count} jobs for '{search_keywords}'.")
print(f"✅ Completed! Processed {scraped_count} jobs for '{search_keywords}' based on request '{self.user_request}'.")
else:
self.engine.report_outcome("captcha")
print("⚠️ No jobs processed successfully.")
print("⚠️ No jobs processed successfully.")


@@ -8,16 +8,17 @@ import asyncio
# Load environment variables
load_dotenv()
async def main():
engine = FingerprintScrapingEngine(
seed="job_scraping_engine",
seed="job_scraping_123",
target_os="windows",
db_path="job_listings.db",
markdown_path="job_listings.md"
)
# Initialize scraper with target field
scraper = LinkedInJobScraper(engine, human_speed=1.6, target_field="Web designer")
scraper = LinkedInJobScraper(engine, human_speed=1.6, user_request="Extract title, company, location, description, requirements, qualifications, nature of job(remote, onsite, hybrid) and salary")
await scraper.scrape_jobs(
search_keywords="Web Designer location:New York",


@@ -1,166 +1,170 @@
import google.generativeai as genai
from typing import Dict, Any
from openai import OpenAI
from typing import Dict, Any, Optional
import asyncio
import sqlite3
import os
from datetime import datetime
from config import GEMINI_API_KEY
import json
import re
from dotenv import load_dotenv
# ✅ Actually load .env
load_dotenv()
class LLMJobRefiner:
def __init__(self):
genai.configure(api_key=GEMINI_API_KEY)
self.model = genai.GenerativeModel('gemini-latest-flash')
async def refine_job_data(self, raw_data: Dict[str, Any], target_field: str) -> Dict[str, Any]:
"""
Refine raw job data using Gemini LLM based on target field
"""
prompt = f"""
You are a job data extraction assistant. Extract the following fields from the job posting:
- title
- company_name
- location
- description
- requirements
- qualifications
- salary_range
- nature_of_work (remote, onsite, or hybrid)
- job_id
Target Field: {target_field}
Raw Page Content:
{raw_data.get('page_content', '')[:6000]} # Limit content size
Instructions:
1. Extract only the information relevant to the target field: {target_field}
2. Clean up any formatting issues in the description
3. Standardize location format (city, state/country)
4. Extract salary range if mentioned in description
5. Determine nature of work (remote, onsite, or hybrid) from work arrangements
6. Ensure all fields are properly formatted
7. If a field cannot be found, use "N/A"
8. Return the refined data in JSON format
Response format (only return the JSON):
{{
"title": "...",
"company_name": "...",
"location": "...",
"description": "...",
"requirements": "...",
"qualifications": "...",
"salary_range": "...",
"nature_of_work": "...",
"job_id": "{raw_data.get('job_id', 'unknown')}",
"url": "{raw_data.get('url', 'N/A')}"
}}
"""
xai_api_key = os.getenv("XAI_API_KEY")
if not xai_api_key:
raise ValueError("XAI_API_KEY not found in environment variables.")
self.client = OpenAI(api_key=xai_api_key, base_url="https://api.x.ai/v1")
self.model = "grok-4-latest"
self.extraction_schema_cache = {}
def generate_content(self, prompt: str, system_message: str = "You are a helpful assistant.") -> str:
"""Synchronous method to call Grok via xAI API."""
try:
response = await asyncio.get_event_loop().run_in_executor(
None,
lambda: self.model.generate_content(prompt)
response = self.client.chat.completions.create(
model=self.model,
messages=[
{"role": "system", "content": system_message},
{"role": "user", "content": prompt}
],
temperature=0.2,
max_tokens=2048,
stream=False
)
# Parse the response and return refined data
refined_data = self._parse_llm_response(response.text)
# If parsing fails, return None
return response.choices[0].message.content or ""
except Exception as e:
print(f"Error in Grok API call: {e}")
return ""
async def refine_job_data(self, raw_data: Dict[str, Any], user_request: str) -> Optional[Dict[str, Any]]:
page_content = raw_data.get('page_content', '')
if not page_content:
return None
schema_key = user_request.lower().strip()
extraction_schema = self.extraction_schema_cache.get(schema_key)
if not extraction_schema:
extraction_schema = await self._generate_extraction_schema(user_request)
if extraction_schema:
self.extraction_schema_cache[schema_key] = extraction_schema
else:
extraction_schema = self._get_default_schema()
prompt = f"""
You are a highly skilled web data extraction assistant. Your task is to analyze the raw HTML content of a job posting page and extract specific information requested by the user.
The user's request is: "{user_request}"
The raw HTML content of the page is provided below (limited in size). The content might be noisy or unstructured.
Your goal is to:
1. Analyze the HTML structure to identify relevant sections.
2. Extract the requested information accurately.
3. Clean up formatting issues.
4. If a field cannot be found, use "N/A".
5. Return ONLY the extracted data in a JSON object based on the following schema:
{json.dumps(extraction_schema, indent=2)}
Raw Page Content (HTML):
{page_content[:6000]}
Respond with the JSON object containing the extracted data.
"""
try:
# ✅ Use self (current instance), NOT a new LLMJobRefiner()
response_text = await asyncio.get_event_loop().run_in_executor(
None,
lambda: self.generate_content(prompt)
)
refined_data = self._parse_llm_response(response_text)
if not refined_data:
return None
refined_data['job_id'] = raw_data.get('job_id', 'unknown')
refined_data['url'] = raw_data.get('url', 'N/A')
return refined_data
except Exception as e:
print(f"LLM refinement failed: {str(e)}")
return None
def _parse_llm_response(self, response_text: str) -> Dict[str, Any]:
async def _generate_extraction_schema(self, user_request: str) -> Optional[Dict[str, str]]:
schema_prompt = f"""
Based on the user's request: "{user_request}", generate a JSON schema for the data they want to extract from a job posting.
The schema should be a dictionary where keys are field names (snake_case) and values are short descriptions.
Include standard fields like title, company_name, location, description, etc., if relevant.
Respond with only the JSON schema.
"""
Parse the LLM response to extract refined job data
"""
import json
import re
# Extract JSON from response (handle markdown code blocks)
json_match = re.search(r'```(?:json)?\s*({.*?})\s*```', response_text, re.DOTALL)
if json_match:
json_str = json_match.group(1)
else:
# If no code block, try to find JSON directly
json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
if json_match:
json_str = json_match.group(0)
else:
return None
try:
# ✅ Use self.generate_content, NOT self.model.generate_content
schema_text = await asyncio.get_event_loop().run_in_executor(
None,
lambda: self.generate_content(schema_prompt)
)
json_match = re.search(r'```(?:json)?\s*({.*?})\s*```', schema_text, re.DOTALL)
if not json_match:
json_match = re.search(r'\{.*\}', schema_text, re.DOTALL)
if not json_match:
return None
json_str = json_match.group(1) if '```' in schema_text else json_match.group(0)
return json.loads(json_str)
except Exception as e:
print(f"Schema generation failed: {str(e)}")
return None
def _get_default_schema(self) -> Dict[str, str]:
return {
"title": "The job title",
"company_name": "The name of the company",
"location": "The location of the job",
"description": "The full job description",
"requirements": "List of job requirements",
"qualifications": "List of required qualifications",
"salary_range": "The salary range mentioned",
"nature_of_work": "Remote, onsite, or hybrid"
}
def _parse_llm_response(self, response_text: str) -> Optional[Dict[str, Any]]:
json_match = re.search(r'```(?:json)?\s*({.*?})\s*```', response_text, re.DOTALL)
if not json_match:
json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
if not json_match:
return None
try:
return json.loads(json_match.group(1) if '```' in response_text else json_match.group(0))
except json.JSONDecodeError:
return None
async def save_job_data(self, job_data: Dict[str, Any], keyword: str):
"""
Save job data to both markdown and database
"""
# Save to database
await self._save_to_db(job_data)
# Save to markdown
await self._save_to_markdown(job_data, keyword)
async def _save_to_db(self, job_data: Dict[str, Any]):
"""
Save job data to database
"""
db_path = "linkedin_jobs.db"
os.makedirs(os.path.dirname(db_path) if os.path.dirname(db_path) else ".", exist_ok=True)
os.makedirs(os.path.dirname(db_path) or ".", exist_ok=True)
with sqlite3.connect(db_path) as conn:
cursor = conn.cursor()
cursor.execute('''
INSERT OR IGNORE INTO jobs
(title, company_name, location, description, requirements,
qualifications, salary_range, nature_of_work, job_id, url)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
''', (
job_data.get("title", "N/A"),
job_data.get("company_name", "N/A"),
job_data.get("location", "N/A"),
job_data.get("description", "N/A"),
job_data.get("requirements", "N/A"),
job_data.get("qualifications", "N/A"),
job_data.get("salary_range", "N/A"),
job_data.get("nature_of_work", "N/A"),
job_data.get("job_id", "N/A"),
job_data.get("url", "N/A")
))
fields = list(job_data.keys())
placeholders = ', '.join(['?' for _ in fields])
columns = ', '.join([f'"{col}"' for col in fields]) # Escape column names
cursor.execute(f"CREATE TABLE IF NOT EXISTS jobs ({columns})")
cursor.execute(f'INSERT INTO jobs ({columns}) VALUES ({placeholders})',
[job_data.get(field, 'N/A') for field in fields])
conn.commit()
async def _save_to_markdown(self, job_data: Dict[str, Any], keyword: str):
"""
Save job data to markdown file
"""
os.makedirs("linkedin_jobs", exist_ok=True)
# Create a single markdown file for all jobs
filename = "linkedin_jobs_scraped.md"
filepath = os.path.join("linkedin_jobs", filename)
filepath = os.path.join("linkedin_jobs", "linkedin_jobs_scraped.md")
write_header = not os.path.exists(filepath) or os.path.getsize(filepath) == 0
with open(filepath, "a", encoding="utf-8") as f:
# Only write header if file is empty
if os.path.getsize(filepath) == 0:
if write_header:
f.write(f"# LinkedIn Jobs - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
f.write(f"## Job: {job_data.get('title', 'N/A')}\n\n")
f.write(f"- **Keyword**: {keyword}\n")
f.write(f"- **Company**: {job_data.get('company_name', 'N/A')}\n")
f.write(f"- **Location**: {job_data.get('location', 'N/A')}\n")
f.write(f"- **Nature of Work**: {job_data.get('nature_of_work', 'N/A')}\n")
f.write(f"- **Salary Range**: {job_data.get('salary_range', 'N/A')}\n")
f.write(f"- **Job ID**: {job_data.get('job_id', 'N/A')}\n")
f.write(f"- **URL**: <{job_data.get('url', 'N/A')}>\n\n")
f.write(f"### Description\n\n{job_data.get('description', 'N/A')}\n\n")
f.write(f"### Requirements\n\n{job_data.get('requirements', 'N/A')}\n\n")
f.write(f"### Qualifications\n\n{job_data.get('qualifications', 'N/A')}\n\n")
f.write("---\n\n")
for key, value in job_data.items():
if key != 'title':
f.write(f"- **{key.replace('_', ' ').title()}**: {value}\n")
f.write("\n---\n\n")


@@ -6,10 +6,12 @@ import hashlib
import random
import os
import json
from typing import List, Optional, Dict
from playwright.async_api import Page
from typing import List, Optional, Dict, Any
from browserforge.fingerprints import FingerprintGenerator
from dotenv import load_dotenv
from config import load_spoof_config
import time
# Load environment variables
load_dotenv()
@@ -45,24 +47,36 @@ class FingerprintScrapingEngine:
browser=('chrome',),
os=(self.os,)
)
self.num_variations = num_variations
# Load spoof config
spoof_config = load_spoof_config()
self.common_renderers = spoof_config["renderers"]
self.common_vendors = spoof_config["vendors"]
# Feedback system
self.feedback_file = f"feedback_{seed}.json"
# Feedback system
self.feedback = self._load_feedback()
# ← NEW: Session persistence paths
self.session_dir = "browser_sessions"
os.makedirs(self.session_dir, exist_ok=True)
self.session_path = os.path.join(self.session_dir, f"{seed}_session.json")
self.session_path = os.path.join(
self.session_dir, f"{seed}_session.json")
def _load_feedback(self):
self.optimization_params = {
"base_delay": 2.0,
"max_concurrent_requests": 4,
"request_timeout": 60000,
"retry_attempts": 3,
"captcha_handling_strategy": "avoid", # or "solve_fallback"
"cloudflare_wait_strategy": "smart_wait", # or "aggressive_reload"
}
self._update_params_from_feedback()
def _load_feedback(self) -> Dict[str, Any]:
if os.path.exists(self.feedback_file):
try:
with open(self.feedback_file, "r") as f:
@@ -70,6 +84,8 @@ class FingerprintScrapingEngine:
data.setdefault("success_rate", 1.0)
data.setdefault("captcha_count", 0)
data.setdefault("cloudflare_count", 0)
data.setdefault("avg_response_time", 10.0) # New metric
data.setdefault("failed_domains", {}) # New metrice
return data
except:
pass
@@ -79,16 +95,69 @@ class FingerprintScrapingEngine:
with open(self.feedback_file, "w") as f:
json.dump(self.feedback, f)
def report_outcome(self, outcome: str):
def report_outcome(self, outcome: str, url: Optional[str] = None, response_time: Optional[float] = None):
if outcome == "success":
self.feedback["success_rate"] = min(1.0, self.feedback["success_rate"] + 0.1)
self.feedback["success_rate"] = min(
1.0, self.feedback["success_rate"] + 0.05) # Smaller increment
else:
self.feedback["success_rate"] = max(0.1, self.feedback["success_rate"] - 0.2)
if outcome == "captcha":
self.feedback["captcha_count"] += 1
elif outcome == "cloudflare":
self.feedback["cloudflare_count"] += 1
self.feedback["success_rate"] = max(
0.05, self.feedback["success_rate"] - 0.1) # Smaller decrement
if outcome == "captcha":
self.feedback["captcha_count"] += 1
# Adapt strategy if many captchas
self.optimization_params["captcha_handling_strategy"] = "solve_fallback"
elif outcome == "cloudflare":
self.feedback["cloudflare_count"] += 1
# Adjust wait strategy based on frequency
if self.feedback["cloudflare_count"] > 5:
self.optimization_params["cloudflare_wait_strategy"] = "aggressive_reload"
# Track domain-specific failures
if url and outcome != "success":
domain = url.split("//")[1].split("/")[0]
if domain not in self.feedback["failed_domains"]:
self.feedback["failed_domains"][domain] = 0
self.feedback["failed_domains"][domain] += 1
# Update average response time
if response_time:
prev_avg = self.feedback.get("avg_response_time", 10.0)
# Simple moving average
self.feedback["avg_response_time"] = (
prev_avg * 0.9) + (response_time * 0.1)
self.save_feedback()
self._update_params_from_feedback() # Update params based on new feedback
def _update_params_from_feedback(self):
"""Adjust optimization parameters based on feedback."""
sr = self.feedback["success_rate"]
cc = self.feedback["captcha_count"]
cf = self.feedback["cloudflare_count"]
avg_rt = self.feedback.get("avg_response_time", 10.0)
# Adjust base delay based on success rate and avg response time
if sr < 0.6:
self.optimization_params["base_delay"] = max(
5.0, self.optimization_params["base_delay"] * 1.2)
elif sr > 0.8:
self.optimization_params["base_delay"] = min(
3.0, self.optimization_params["base_delay"] * 0.9)
# Reduce concurrency if many captchas/cloudflares
if cc > 3 or cf > 3:
self.optimization_params["max_concurrent_requests"] = max(
2, self.optimization_params["max_concurrent_requests"] - 2)
else:
# Reset to default
self.optimization_params["max_concurrent_requests"] = 4
# Increase timeout if avg response time is high
if avg_rt > 20:
self.optimization_params["request_timeout"] = 90000 # 90 seconds
print(f"Optimization Params Updated: {self.optimization_params}")
# ← NEW: Save browser context (cookies + localStorage)
async def save_session(self, context):
@@ -129,7 +198,8 @@ class FingerprintScrapingEngine:
if self.feedback["success_rate"] < 0.5:
concurrency_options = [8, 4]
memory_options = [8]
profile.navigator.hardwareConcurrency = random.choice(concurrency_options)
profile.navigator.hardwareConcurrency = random.choice(
concurrency_options)
profile.navigator.deviceMemory = random.choice(memory_options)
return profile
@@ -244,23 +314,6 @@ class FingerprintScrapingEngine:
await asyncio.sleep(random.uniform(0.2, 1.0))
except:
pass
async def _detect_cloudflare(self, page) -> bool:
content = await page.content()
return (
"#cf-chl" in content or
"checking your browser" in content.lower() or
"just a moment" in content.lower()
)
async def _handle_cloudflare(self, page, max_retries: int = 3):
for i in range(max_retries):
if not await self._detect_cloudflare(page):
return True
print(f"☁️ Cloudflare detected - waiting... (attempt {i+1})")
await asyncio.sleep(8 + random.uniform(2, 5))
await page.wait_for_load_state("load", timeout=60000)
return False
async def _avoid_captcha(self, page) -> bool:
await asyncio.sleep(2 + random.random() * 3)
@@ -268,7 +321,7 @@ class FingerprintScrapingEngine:
await self._simulate_human_interaction(page)
await asyncio.sleep(3 + random.random() * 2)
return True
async def _solve_captcha_fallback(self, page) -> bool:
await asyncio.sleep(15 + random.random() * 10)
captcha_content = await page.content()
@@ -283,3 +336,42 @@ class FingerprintScrapingEngine:
return True
return False
async def _detect_cloudflare(self, page: Page) -> bool:
"""Detect Cloudflare challenges."""
content = await page.content()
return (
"#cf-chl" in content
or "checking your browser" in content.lower()
or "just a moment" in content.lower()
or "turnstile" in content.lower() # Check for Cloudflare Turnstile
)
async def _handle_cloudflare(self, page: Page) -> bool:
"""
Handle Cloudflare challenges, including Turnstile if present.
This is a simplified approach; real-world handling might require more sophisticated logic or external solvers.
"""
max_wait_time = 60 # Total time to wait for Cloudflare to resolve
start_time = time.time()
while time.time() - start_time < max_wait_time:
if not await self._detect_cloudflare(page):
print("Cloudflare challenge resolved.")
return True
print("Cloudflare active, waiting...")
# Simulate more human-like behavior while waiting
await self._simulate_human_interaction(page)
# Wait for a random period, increasing slightly each time
wait_time = min(10, 2 + random.uniform(1, 3) +
(time.time() - start_time) * 0.1)
await asyncio.sleep(wait_time)
# Reload occasionally to trigger potential client-side checks
if (time.time() - start_time) > 15 and (time.time() - start_time) % 20 < 2:
print("Reloading page during Cloudflare wait...")
await page.reload(wait_until='load', timeout=30000)
print("Timeout waiting for Cloudflare resolution.")
return False
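The wait/reload cadence in _handle_cloudflare can be previewed without a browser; a small sketch that reproduces only the timing arithmetic (the elapsed values below are simulated, not measured):

import random

random.seed(0)  # deterministic output for the sketch
for elapsed in (0, 8, 16, 20, 35, 40, 55):
    # The sleep grows with elapsed time but is capped at 10 seconds.
    wait_time = min(10, 2 + random.uniform(1, 3) + elapsed * 0.1)
    # A reload is triggered roughly every 20 seconds once 15 seconds have passed.
    reload_now = elapsed > 15 and elapsed % 20 < 2
    print(f"elapsed={elapsed:>2}s  sleep={wait_time:4.1f}s  reload={reload_now}")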