Compare commits


No commits in common. "160efadbfb1ad06cd955a0ab8389d7e853722f03" and "d7d92ba8bbab4ef66bc33e246891805f729128d7" have entirely different histories.

6 changed files with 166 additions and 330 deletions

View File

@@ -8,9 +8,9 @@ from dotenv import load_dotenv
load_dotenv()
# LLM Agent Configuration
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")
if not DEEPSEEK_API_KEY:
raise ValueError("DEEPSEEK_API_KEY environment variable not set in .env file")
GEMINI_API_KEY = os.getenv("XAI_API_KEY")
if not GEMINI_API_KEY:
raise ValueError("XAI_API_KEY environment variable not set in .env file")
def load_spoof_config():
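The replacement hunk above appears to read XAI_API_KEY into a variable still named GEMINI_API_KEY, which is easy to misread. A minimal sketch of a generic guard that keeps the variable name and the error message in sync (require_env is illustrative, not part of this change):

import os
from dotenv import load_dotenv

load_dotenv()

def require_env(name: str) -> str:
    # Fail fast with a clear message when a key is missing from .env
    value = os.getenv(name)
    if not value:
        raise ValueError(f"{name} environment variable not set in .env file")
    return value

XAI_API_KEY = require_env("XAI_API_KEY")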

View File

@@ -23,11 +23,11 @@ class StealthyFetcher:
print(f"Attempt {attempt + 1} to fetch {url}")
page = await self.context.new_page()
await page.goto(url, wait_until='load', timeout=120000)
await page.goto(url, wait_until='load', timeout=60000)
if wait_for_selector:
try:
await page.wait_for_selector(wait_for_selector, timeout=40000)
await page.wait_for_selector(wait_for_selector, timeout=10000)
except PlaywrightTimeoutError:
print(f"Selector {wait_for_selector} not found immediately, continuing...")
@@ -88,7 +88,7 @@ class StealthyFetcher:
async def _is_content_accessible(self, page: Page, wait_for_selector: Optional[str] = None) -> bool:
if wait_for_selector:
try:
await page.wait_for_selector(wait_for_selector, timeout=40000)
await page.wait_for_selector(wait_for_selector, timeout=5000)
return True
except PlaywrightTimeoutError:
pass
@@ -118,7 +118,7 @@ class StealthyFetcher:
if (time.time() - start_time) > 15 and (time.time() - start_time) % 20 < 2:
print("🔄 Reloading page during Cloudflare wait...")
await page.reload(wait_until='load', timeout=120000)
await page.reload(wait_until='load', timeout=30000)
print("⏰ Timeout waiting for Cloudflare resolution.")
return False
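The fetcher hunks above mainly tighten Playwright timeouts: 60 s navigation, 5-10 s selector waits, 30 s reloads. A standalone sketch of the retry-plus-shorter-timeout pattern, assuming the same 60 s budget (goto_with_retry is illustrative, not in the diff):

from playwright.async_api import Page, TimeoutError as PlaywrightTimeoutError

async def goto_with_retry(page: Page, url: str, attempts: int = 3) -> bool:
    # Retry navigation with the tightened 60 s budget instead of the old 120 s one.
    for attempt in range(attempts):
        try:
            await page.goto(url, wait_until="load", timeout=60000)
            return True
        except PlaywrightTimeoutError:
            print(f"Attempt {attempt + 1} timed out for {url}")
    return False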

View File

@@ -1,12 +1,13 @@
import asyncio
import random
import sqlite3
import os
from typing import Optional, Dict
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
from browserforge.injectors.playwright import AsyncNewContext
from llm_agent import LLMJobRefiner
import re
from fetcher import StealthyFetcher
from datetime import datetime
class LinkedInJobScraper:
@@ -25,8 +26,25 @@ class LinkedInJobScraper:
self.llm_agent = LLMJobRefiner()
def _init_db(self):
# This method is kept for backward compatibility but LLMJobRefiner handles PostgreSQL now
pass
os.makedirs(os.path.dirname(self.db_path) if os.path.dirname(self.db_path) else ".", exist_ok=True)
with sqlite3.connect(self.db_path) as conn:
cursor = conn.cursor()
cursor.execute('''
CREATE TABLE IF NOT EXISTS jobs (
id INTEGER PRIMARY KEY AUTOINCREMENT,
title TEXT,
company_name TEXT,
location TEXT,
description TEXT,
requirements TEXT,
qualifications TEXT,
salary_range TEXT,
nature_of_work TEXT,
job_id TEXT,
url TEXT UNIQUE
)
''')
conn.commit()
async def _human_click(self, page, element, wait_after: bool = True):
if not element:
@@ -43,7 +61,7 @@ class LinkedInJobScraper:
async def _login(self, page, credentials: Dict) -> bool:
print("🔐 Navigating to LinkedIn login page...")
await page.goto("https://www.linkedin.com/login", timeout=120000)
await page.goto("https://www.linkedin.com/login", timeout=60000)
await asyncio.sleep(random.uniform(2.0, 3.5) * self.human_speed)
email_field = await page.query_selector('input[name="session_key"]')
@@ -86,11 +104,7 @@ class LinkedInJobScraper:
print("❌ Login may have failed.")
return False
async def _extract_page_content_for_llm(self, page) -> str:
"""
Extract raw page content as HTML/text for LLM processing
The LLM will handle all extraction logic, not specific selectors
"""
async def _extract_all_page_content(self, page) -> str:
await asyncio.sleep(2 * self.human_speed)
await self.engine._human_like_scroll(page)
await asyncio.sleep(2 * self.human_speed)
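The renamed _extract_all_page_content above is truncated by the hunk. A standalone restatement of the visible flow, where the final return via page.content() is an assumption since the diff does not show the end of the method:

import asyncio

async def extract_all_page_content(engine, page, human_speed: float) -> str:
    # Settle, scroll to trigger lazy-loaded sections, settle again.
    await asyncio.sleep(2 * human_speed)
    await engine._human_like_scroll(page)
    await asyncio.sleep(2 * human_speed)
    return await page.content()  # assumption: the method hands the full rendered HTML to the LLM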
@@ -158,7 +172,7 @@ class LinkedInJobScraper:
await self._human_click(page, next_btn)
await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed)
try:
await page.wait_for_function("() => window.location.href.includes('start=')", timeout=120000)
await page.wait_for_function("() => window.location.href.includes('start=')", timeout=60000)
except:
pass
current_page += 1
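The pagination wait above now gives the "start=" URL change 60 s instead of 120 s and swallows the timeout. A self-contained restatement of that pattern (wait_for_next_results_page is illustrative):

async def wait_for_next_results_page(page) -> bool:
    # Wait for the paginated URL (start=25, start=50, ...) and fall back gracefully on timeout.
    try:
        await page.wait_for_function(
            "() => window.location.href.includes('start=')", timeout=60000
        )
        return True
    except Exception:
        return False  # keep scraping the current page rather than aborting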
@@ -233,7 +247,7 @@ class LinkedInJobScraper:
if session_loaded:
print("🔁 Using saved session — verifying login...")
await page.goto("https://www.linkedin.com/feed/", timeout=120000)
await page.goto("https://www.linkedin.com/feed/", timeout=60000)
if "feed" in page.url and "login" not in page.url:
print("✅ Session still valid.")
login_successful = True
@@ -255,7 +269,7 @@ class LinkedInJobScraper:
print(" No credentials — proceeding as guest.")
login_successful = True
await page.wait_for_load_state("load", timeout=120000)
await page.wait_for_load_state("load", timeout=60000)
print("✅ Post-login page fully loaded. Starting search...")
# >>> PROTECTION CHECK USING FETCHER LOGIC <<<
@@ -278,7 +292,7 @@ class LinkedInJobScraper:
print("✅ Protection present but content accessible — proceeding.")
print(f"🔍 Searching for: {search_keywords}")
await page.goto(search_url, wait_until='load', timeout=120000)
await page.goto(search_url, wait_until='load', timeout=60000)
await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed)
# >>> PROTECTION CHECK ON SEARCH PAGE <<<
@@ -308,7 +322,7 @@ class LinkedInJobScraper:
print(f" Found {initial_jobs} initial job(s) (total: {len(all_job_links)})")
iteration = 1
while True and iteration >= 5:
while True:
print(f"🔄 Iteration {iteration}: Checking for new jobs...")
prev_job_count = len(all_job_links)
@@ -341,6 +355,10 @@ class LinkedInJobScraper:
print("🔚 No new jobs found after refresh. Stopping.")
break
if iteration > 10:
print("🔄 Maximum iterations reached. Stopping.")
break
print(f"✅ Collected {len(all_job_links)} unique job links.")
scraped_count = 0
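The old guard "while True and iteration >= 5" could never loop, because iteration starts at 1; the new code loops unconditionally and adds an explicit cap after 10 iterations. The same control flow, restated as a self-contained sketch (collect_links_bounded and batches are illustrative stand-ins for the live page scrape):

def collect_links_bounded(batches, max_iterations: int = 10) -> set:
    # batches: an iterable of link lists, one per refresh of the results page.
    all_job_links: set = set()
    iteration = 1
    for batch in batches:
        before = len(all_job_links)
        all_job_links.update(batch)
        if len(all_job_links) == before:
            break  # no new jobs found after refresh
        if iteration > max_iterations:
            break  # maximum iterations reached, as in the new guard above
        iteration += 1
    return all_job_links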
@@ -368,9 +386,8 @@ class LinkedInJobScraper:
if apply_btn:
break
final_url = full_url
external_url = None
page_content = None
page_data = None
final_url = job_page.url
if apply_btn:
print(" → Clicking 'Apply' / 'Easy Apply' button...")
@@ -382,61 +399,44 @@ class LinkedInJobScraper:
try:
external_page = await asyncio.wait_for(page_waiter, timeout=5.0)
print(" 🌐 External job site opened in new tab.")
await external_page.wait_for_load_state("load", timeout=120000)
await external_page.wait_for_load_state("load", timeout=60000)
await asyncio.sleep(2 * self.human_speed)
await self.engine._human_like_scroll(external_page)
await asyncio.sleep(2 * self.human_speed)
# Extract raw content from external page for LLM processing
external_url = external_page.url
final_url = external_url
page_content = await self._extract_page_content_for_llm(external_page)
page_data = await self._extract_all_page_content(external_page)
final_url = external_page.url
if not external_page.is_closed():
await external_page.close()
except asyncio.TimeoutError:
print(" 🖥️ No external tab — scraping LinkedIn job page directly.")
await job_page.wait_for_timeout(60000)
await job_page.wait_for_timeout(2000)
try:
await job_page.wait_for_selector("div.jobs-apply-button--fixed, div.jobs-easy-apply-modal", timeout=80000)
await job_page.wait_for_selector("div.jobs-apply-button--fixed, div.jobs-easy-apply-modal", timeout=8000)
except PlaywrightTimeoutError:
pass
await self.engine._human_like_scroll(job_page)
await asyncio.sleep(2 * self.human_speed)
page_content = await self._extract_page_content_for_llm(job_page)
page_data = await self._extract_all_page_content(job_page)
else:
print(" ⚠️ No 'Apply' button found — scraping job details directly.")
await self.engine._human_like_scroll(job_page)
await asyncio.sleep(2 * self.human_speed)
page_content = await self._extract_page_content_for_llm(job_page)
page_data = await self._extract_all_page_content(job_page)
job_id = full_url.split("/")[-2] if "/jobs/view/" in full_url else "unknown"
job_id = final_url.split("/")[-2] if "/jobs/view/" in final_url else "unknown"
raw_data = {
"page_content": page_content,
"url": final_url,
"job_id": job_id,
"search_keywords": search_keywords
"page_content": page_data,
"url": job_page.url,
"job_id": job_page.url.split("/")[-2] if "/jobs/view/" in job_page.url else "unknown"
}
# LLM agent is now fully responsible for extraction and validation
refined_data = await self.llm_agent.refine_job_data(raw_data, self.user_request)
if refined_data and refined_data.get("title", "N/A") != "N/A":
# Ensure compulsory fields are present (fallback if LLM missed them)
compulsory_fields = ['company_name', 'job_id', 'url']
for field in compulsory_fields:
if not refined_data.get(field) or refined_data[field] in ["N/A", "", "Unknown"]:
if field == 'job_id':
refined_data[field] = job_id
elif field == 'url':
refined_data[field] = final_url
elif field == 'company_name':
refined_data[field] = "Unknown Company"
refined_data['scraped_at'] = datetime.now().isoformat()
refined_data['category'] = clean_keywords
await self.llm_agent.save_job_data(refined_data, search_keywords)
scraped_count += 1
print(f" ✅ Scraped and refined: {refined_data['title'][:50]}...")
@@ -455,7 +455,7 @@ class LinkedInJobScraper:
finally:
print(" ↩️ Returning to LinkedIn search results...")
await page.goto(search_url, timeout=120000)
await page.goto(search_url, timeout=60000)
await asyncio.sleep(4 * self.human_speed)
await browser.close()

View File

@@ -4,8 +4,6 @@ from job_scraper2 import LinkedInJobScraper
import os
from dotenv import load_dotenv
import asyncio
import random
import time
# Load environment variables
load_dotenv()
@@ -13,7 +11,7 @@ load_dotenv()
async def main():
engine = FingerprintScrapingEngine(
seed="job_scraping_12",
seed="job_scraping_123",
target_os="windows",
db_path="job_listings.db",
markdown_path="job_listings.md"
@@ -22,50 +20,13 @@ async def main():
# Initialize scraper with target field
scraper = LinkedInJobScraper(engine, human_speed=1.6, user_request="Extract title, company, location, description, requirements, qualifications, nature of job(remote, onsite, hybrid) and salary")
# List of job titles to cycle through
job_titles = [
"Software Engineer",
"Data Scientist",
"Product Manager",
"UX Designer",
"DevOps Engineer",
"Machine Learning Engineer",
"Frontend Developer",
"Backend Developer",
"Full Stack Developer",
"Data Analyst"
]
fixed_location = "New York"
# Keep cycling through all job titles
while True:
# Shuffle job titles to randomize order
random.shuffle(job_titles)
for job_title in job_titles:
search_keywords = f"{job_title} location:{fixed_location}"
print(f"\n{'='*60}")
print(f"Starting scrape for: {search_keywords}")
print(f"{'='*60}")
await scraper.scrape_jobs(
search_keywords=search_keywords,
credentials={
"email": os.getenv("SCRAPING_USERNAME"),
"password": os.getenv("SCRAPING_PASSWORD")
}
)
print(f"\n✅ Completed scraping for: {job_title}")
print(f"⏳ Waiting 2 minutes before next job title...")
# Wait 2 minutes before next job title
time.sleep(120)
print(f"\n✅ Completed full cycle of all job titles")
print(f"🔄 Starting new cycle...")
await scraper.scrape_jobs(
search_keywords="Web Designer location:New York",
credentials={
"email": os.getenv("SCRAPING_USERNAME"),
"password": os.getenv("SCRAPING_PASSWORD")
}
)
if __name__ == "__main__":
asyncio.run(main())
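The rewritten main() above drops the multi-title cycling loop, which had called time.sleep(120) inside an async coroutine and therefore blocked the event loop. If that cycling is ever restored, a non-blocking sketch could look like this (cycle_titles is illustrative, not part of the change):

import asyncio
import os
import random

async def cycle_titles(scraper, job_titles, location: str = "New York"):
    # Non-blocking variant of the removed loop: await asyncio.sleep instead of time.sleep(120).
    while True:
        random.shuffle(job_titles)
        for title in job_titles:
            await scraper.scrape_jobs(
                search_keywords=f"{title} location:{location}",
                credentials={
                    "email": os.getenv("SCRAPING_USERNAME"),
                    "password": os.getenv("SCRAPING_PASSWORD"),
                },
            )
            await asyncio.sleep(120)  # pause between titles without stalling the event loop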

View File

@@ -1,219 +1,131 @@
from openai import OpenAI
from typing import Dict, Any
from typing import Dict, Any, Optional
import asyncio
import psycopg2
import sqlite3
import os
from datetime import datetime
import json
import re
from bs4 import BeautifulSoup
from dotenv import load_dotenv
# Load environment variables from .env
# ✅ Actually load .env
load_dotenv()
class LLMJobRefiner:
def __init__(self):
deepseek_api_key = os.getenv("DEEPSEEK_API_KEY")
if not deepseek_api_key:
raise ValueError("DEEPSEEK_API_KEY not found in .env file.")
xai_api_key = os.getenv("XAI_API_KEY")
if not xai_api_key:
raise ValueError("XAI_API_KEY not found in environment variables.")
# Database credentials from .env
self.db_url = os.getenv("DB_URL")
self.db_username = os.getenv("DB_USERNAME")
self.db_password = os.getenv("DB_PASSWORD")
self.db_host = os.getenv("DB_HOST")
self.db_port = os.getenv("DB_PORT")
self.client = OpenAI(api_key=xai_api_key, base_url="https://api.x.ai/v1")
self.model = "grok-4-latest"
self.extraction_schema_cache = {}
if not self.db_url or not self.db_username or not self.db_password:
raise ValueError("Database credentials not found in .env file.")
# DeepSeek uses OpenAI-compatible API
self.client = OpenAI(
api_key=deepseek_api_key,
base_url="https://api.deepseek.com/v1"
)
self.model = "deepseek-chat"
self._init_db()
def _init_db(self):
"""Initialize PostgreSQL database connection and create table"""
try:
self.db_url = os.getenv("DB_URL")
if self.db_url and "supabase.com" in self.db_url:
conn = psycopg2.connect(
host=self.db_host,
port=self.db_port,
database="postgres",
user=self.db_username,
password=self.db_password
)
else:
conn = psycopg2.connect(
host=self.db_host,
port=self.db_port,
database="postgres",
user=self.db_username,
password=self.db_password
)
cursor = conn.cursor()
cursor.execute('''
CREATE TABLE IF NOT EXISTS jobs (
id SERIAL PRIMARY KEY,
title TEXT,
company_name TEXT,
location TEXT,
description TEXT,
requirements TEXT,
qualifications TEXT,
salary_range TEXT,
nature_of_work TEXT,
job_id TEXT UNIQUE,
url TEXT,
category TEXT,
scraped_at TIMESTAMP,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
''')
# Ensure the uniqueness constraint exists
cursor.execute('''
ALTER TABLE jobs DROP CONSTRAINT IF EXISTS jobs_job_id_key;
ALTER TABLE jobs ADD CONSTRAINT jobs_job_id_key UNIQUE (job_id);
''')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_job_id ON jobs(job_id)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_category ON jobs(category)')
conn.commit()
cursor.close()
conn.close()
print("✅ PostgreSQL database initialized successfully")
except Exception as e:
print(f"❌ Database initialization error: {e}")
raise
def _clean_html_for_llm(self, html_content: str) -> str:
"""Clean HTML to make it more readable for LLM while preserving structure"""
try:
soup = BeautifulSoup(html_content, 'html.parser')
# Remove script and style elements
for script in soup(["script", "style", "nav", "footer", "header"]):
script.decompose()
# Extract text but keep some structure
text = soup.get_text(separator=' ', strip=True)
# Clean up whitespace
text = re.sub(r'\s+', ' ', text)
# Limit length for LLM context
if len(text) > 10000:
text = text[:10000] + "..."
return text
except Exception as e:
print(f"HTML cleaning error: {e}")
# Fallback to raw content if cleaning fails
return html_content[:100000] if len(html_content) > 100000 else html_content
def _generate_content_sync(self, prompt: str) -> str:
"""Synchronous call to DeepSeek API"""
def generate_content(self, prompt: str, system_message: str = "You are a helpful assistant.") -> str:
"""Synchronous method to call Grok via xAI API."""
try:
response = self.client.chat.completions.create(
model=self.model,
messages=[{"role": "user", "content": prompt}],
messages=[
{"role": "system", "content": system_message},
{"role": "user", "content": prompt}
],
temperature=0.2,
max_tokens=2048,
stream=False
)
return response.choices[0].message.content or ""
except Exception as e:
print(f"DeepSeek API error: {e}")
print(f"Error in Grok API call: {e}")
return ""
async def refine_job_data(self, raw_data: Dict[str, Any], target_field: str) -> Dict[str, Any]:
# Clean the raw HTML content for better LLM processing
async def refine_job_data(self, raw_data: Dict[str, Any], user_request: str) -> Optional[Dict[str, Any]]:
page_content = raw_data.get('page_content', '')
cleaned_content = self._clean_html_for_llm(page_content)
if not page_content:
return None
# Get job_id and url from raw data
job_id = raw_data.get('job_id', 'unknown')
url = raw_data.get('url', 'N/A')
schema_key = user_request.lower().strip()
extraction_schema = self.extraction_schema_cache.get(schema_key)
if not extraction_schema:
extraction_schema = await self._generate_extraction_schema(user_request)
if extraction_schema:
self.extraction_schema_cache[schema_key] = extraction_schema
else:
extraction_schema = self._get_default_schema()
prompt = f"""
You are a job posting data extractor with two modes:
You are a highly skilled web data extraction assistant. Your task is to analyze the raw HTML content of a job posting page and extract specific information requested by the user.
The user's request is: "{user_request}"
The raw HTML content of the page is provided below (limited in size). The content might be noisy or unstructured.
Your goal is to:
1. Analyze the HTML structure to identify relevant sections.
2. Extract the requested information accurately.
3. Clean up formatting issues.
4. If a field cannot be found, use "N/A".
5. Return ONLY the extracted data in a JSON object based on the following schema:
{json.dumps(extraction_schema, indent=2)}
Raw Page Content (HTML):
{page_content[:6000]}
PRIMARY MODE (PREFERRED):
- Extract EXACT text as it appears on the page for all fields
- DO NOT summarize, paraphrase, or interpret
- Copy verbatim content including original wording and formatting
FALLBACK MODE (ONLY IF FIELD IS MISSING):
- If a field is NOT explicitly stated anywhere in the content, you MAY infer it using clear contextual clues
- Inference rules:
* company_name: Look for patterns like "at [Company]", "Join [Company]", "[Company] is hiring"
* nature_of_work: Look for "remote", "onsite", "hybrid", "work from home", "office-based"
* location: Extract city/state/country mentions near job title or details
* title: Use the largest/primary heading if no explicit "job title" label exists
REQUIRED FIELDS (must always have a value):
- title: Exact job title or best inference
- company_name: Exact company name or best inference
- job_id: Use provided: {job_id}
- url: Use provided: {url}
OPTIONAL FIELDS (use exact text or "N/A" if not present and not inferable):
- location
- description
- requirements
- qualifications
- salary_range
- nature_of_work
Page Content:
{cleaned_content}
Response format (ONLY return this JSON):
{{
"title": "...",
"company_name": "...",
"location": "...",
"description": "...",
"requirements": "...",
"qualifications": "...",
"salary_range": "...",
"nature_of_work": "...",
"job_id": "{job_id}",
"url": "{url}"
}}
Respond with the JSON object containing the extracted data.
"""
try:
# ✅ Use self (current instance), NOT a new LLMJobRefiner()
response_text = await asyncio.get_event_loop().run_in_executor(
None,
lambda: self._generate_content_sync(prompt)
lambda: self.generate_content(prompt)
)
refined_data = self._parse_llm_response(response_text)
if not refined_data:
return None
# Final validation - ensure required fields are present and meaningful
if refined_data:
required_fields = ['title', 'company_name', 'job_id', 'url']
for field in required_fields:
if not refined_data.get(field) or refined_data[field] in ["N/A", "", "Unknown", "Company", "Job"]:
return None # LLM failed to extract properly
return refined_data
return None
refined_data['job_id'] = raw_data.get('job_id', 'unknown')
refined_data['url'] = raw_data.get('url', 'N/A')
return refined_data
except Exception as e:
print(f"LLM refinement failed: {str(e)}")
return None
def _parse_llm_response(self, response_text: str) -> Dict[str, Any]:
async def _generate_extraction_schema(self, user_request: str) -> Optional[Dict[str, str]]:
schema_prompt = f"""
Based on the user's request: "{user_request}", generate a JSON schema for the data they want to extract from a job posting.
The schema should be a dictionary where keys are field names (snake_case) and values are short descriptions.
Include standard fields like title, company_name, location, description, etc., if relevant.
Respond with only the JSON schema.
"""
try:
# ✅ Use self.generate_content, NOT self.model.generate_content
schema_text = await asyncio.get_event_loop().run_in_executor(
None,
lambda: self.generate_content(schema_prompt)
)
json_match = re.search(r'```(?:json)?\s*({.*?})\s*```', schema_text, re.DOTALL)
if not json_match:
json_match = re.search(r'\{.*\}', schema_text, re.DOTALL)
if not json_match:
return None
json_str = json_match.group(1) if '```' in schema_text else json_match.group(0)
return json.loads(json_str)
except Exception as e:
print(f"Schema generation failed: {str(e)}")
return None
def _get_default_schema(self) -> Dict[str, str]:
return {
"title": "The job title",
"company_name": "The name of the company",
"location": "The location of the job",
"description": "The full job description",
"requirements": "List of job requirements",
"qualifications": "List of required qualifications",
"salary_range": "The salary range mentioned",
"nature_of_work": "Remote, onsite, or hybrid"
}
def _parse_llm_response(self, response_text: str) -> Optional[Dict[str, Any]]:
json_match = re.search(r'```(?:json)?\s*({.*?})\s*```', response_text, re.DOTALL)
if not json_match:
json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
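The hunk cuts _parse_llm_response off after the regex search. Judging by the identical pattern in _generate_extraction_schema above, the remainder presumably loads the matched JSON; a standalone sketch of that completion (parse_llm_json is a stand-in, not the repository's code):

import json
import re

def parse_llm_json(response_text: str):
    # Prefer a fenced ```json block; fall back to the first bare {...} object.
    fenced = re.search(r'```(?:json)?\s*({.*?})\s*```', response_text, re.DOTALL)
    match = fenced or re.search(r'\{.*\}', response_text, re.DOTALL)
    if not match:
        return None
    payload = match.group(1) if fenced else match.group(0)
    try:
        return json.loads(payload)
    except json.JSONDecodeError:
        return None  # malformed JSON from the model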
@@ -230,46 +142,17 @@ class LLMJobRefiner:
await self._save_to_markdown(job_data, keyword)
async def _save_to_db(self, job_data: Dict[str, Any]):
"""Save job data to PostgreSQL database with job_id uniqueness"""
try:
conn = psycopg2.connect(
host=self.db_host,
port=self.db_port,
database="postgres",
user=self.db_username,
password=self.db_password
)
db_path = "linkedin_jobs.db"
os.makedirs(os.path.dirname(db_path) or ".", exist_ok=True)
with sqlite3.connect(db_path) as conn:
cursor = conn.cursor()
cursor.execute('''
INSERT INTO jobs
(title, company_name, location, description, requirements,
qualifications, salary_range, nature_of_work, job_id, url, category, scraped_at)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (job_id) DO NOTHING
''', (
job_data.get("title", "N/A"),
job_data.get("company_name", "N/A"),
job_data.get("location", "N/A"),
job_data.get("description", "N/A"),
job_data.get("requirements", "N/A"),
job_data.get("qualifications", "N/A"),
job_data.get("salary_range", "N/A"),
job_data.get("nature_of_work", "N/A"),
job_data.get("job_id", "N/A"),
job_data.get("url", "N/A"),
job_data.get("category", "N/A"),
job_data.get("scraped_at")
))
fields = list(job_data.keys())
placeholders = ', '.join(['?' for _ in fields])
columns = ', '.join([f'"{col}"' for col in fields]) # Escape column names
cursor.execute(f"CREATE TABLE IF NOT EXISTS jobs ({columns})")
cursor.execute(f'INSERT INTO jobs ({columns}) VALUES ({placeholders})',
[job_data.get(field, 'N/A') for field in fields])
conn.commit()
cursor.close()
conn.close()
print(f" 💾 Saved job to category '{job_data.get('category', 'N/A')}' with job_id: {job_data.get('job_id', 'N/A')}")
except Exception as e:
print(f"❌ Database save error: {e}")
async def _save_to_markdown(self, job_data: Dict[str, Any], keyword: str):
os.makedirs("linkedin_jobs", exist_ok=True)
@@ -281,15 +164,7 @@ class LLMJobRefiner:
f.write(f"# LinkedIn Jobs - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
f.write(f"## Job: {job_data.get('title', 'N/A')}\n\n")
f.write(f"- **Keyword**: {keyword}\n")
f.write(f"- **Company**: {job_data.get('company_name', 'N/A')}\n")
f.write(f"- **Location**: {job_data.get('location', 'N/A')}\n")
f.write(f"- **Nature of Work**: {job_data.get('nature_of_work', 'N/A')}\n")
f.write(f"- **Salary Range**: {job_data.get('salary_range', 'N/A')}\n")
f.write(f"- **Job ID**: {job_data.get('job_id', 'N/A')}\n")
f.write(f"- **Category**: {job_data.get('category', 'N/A')}\n")
f.write(f"- **Scraped At**: {job_data.get('scraped_at', 'N/A')}\n")
f.write(f"- **URL**: <{job_data.get('url', 'N/A')}>\n\n")
f.write(f"### Description\n\n{job_data.get('description', 'N/A')}\n\n")
f.write(f"### Requirements\n\n{job_data.get('requirements', 'N/A')}\n\n")
f.write(f"### Qualifications\n\n{job_data.get('qualifications', 'N/A')}\n\n")
f.write("---\n\n")
for key, value in job_data.items():
if key != 'title':
f.write(f"- **{key.replace('_', ' ').title()}**: {value}\n")
f.write("\n---\n\n")

View File

@@ -69,7 +69,7 @@ class FingerprintScrapingEngine:
self.optimization_params = {
"base_delay": 2.0,
"max_concurrent_requests": 4,
"request_timeout": 120000,
"request_timeout": 60000,
"retry_attempts": 3,
"captcha_handling_strategy": "avoid", # or "solve_fallback"
"cloudflare_wait_strategy": "smart_wait", # or "aggressive_reload"
@@ -155,7 +155,7 @@ class FingerprintScrapingEngine:
# Increase timeout if avg response time is high
if avg_rt > 20:
self.optimization_params["request_timeout"] = 150000 # 90 seconds
self.optimization_params["request_timeout"] = 90000 # 90 seconds
print(f"Optimization Params Updated: {self.optimization_params}")
@@ -371,7 +371,7 @@ class FingerprintScrapingEngine:
# Reload occasionally to trigger potential client-side checks
if (time.time() - start_time) > 15 and (time.time() - start_time) % 20 < 2:
print("Reloading page during Cloudflare wait...")
await page.reload(wait_until='load', timeout=80000)
await page.reload(wait_until='load', timeout=30000)
print("Timeout waiting for Cloudflare resolution.")
return False
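The engine's Cloudflare smart-wait mirrors the fetcher change: periodic reloads now use a 30 s timeout. A standalone sketch of that loop, with an assumed title-based challenge check since the engine's own detection logic is not shown in the diff:

import time

async def wait_out_cloudflare(page, max_wait: float = 120.0) -> bool:
    # Poll until the challenge clears, reloading roughly every 20 s after the first 15 s.
    start_time = time.time()
    while time.time() - start_time < max_wait:
        title = (await page.title()).lower()
        if "just a moment" not in title and "cloudflare" not in title:  # assumed check
            return True
        elapsed = time.time() - start_time
        if elapsed > 15 and elapsed % 20 < 2:
            print("Reloading page during Cloudflare wait...")
            await page.reload(wait_until="load", timeout=30000)
        await page.wait_for_timeout(2000)
    print("Timeout waiting for Cloudflare resolution.")
    return False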