Compare commits

..

2 Commits

Author SHA1 Message Date
160efadbfb modifications to work with postgre and use llm to extract and refine data 2025-12-05 17:00:43 +01:00
4f78a845ae refactor(llm_agent): switch from XAI to DeepSeek API and simplify job refinement
- Replace XAI/Grok integration with DeepSeek's OpenAI-compatible API
- Remove schema generation and caching logic
- Simplify prompt structure and response parsing
- Standardize database schema and markdown output format
- Update config to use DEEPSEEK_API_KEY instead of XAI_API_KEY
- Change default search keyword in linkedin_main.py
2025-12-01 10:25:37 +01:00
6 changed files with 330 additions and 166 deletions

View File

@@ -8,9 +8,9 @@ from dotenv import load_dotenv
load_dotenv()
# LLM Agent Configuration
GEMINI_API_KEY = os.getenv("XAI_API_KEY")
if not GEMINI_API_KEY:
raise ValueError("XAI_API_KEY environment variable not set in .env file")
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")
if not DEEPSEEK_API_KEY:
raise ValueError("DEEPSEEK_API_KEY environment variable not set in .env file")
def load_spoof_config():
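For a quick pre-flight check, a sketch that validates the new key together with the database variables the llm_agent.py changes below rely on (the DB_* names are taken from that diff; the helper itself is an assumption, not part of the commit):

```python
# Sketch: fail fast at startup if required environment variables are missing.
import os
from dotenv import load_dotenv

load_dotenv()

REQUIRED_VARS = ["DEEPSEEK_API_KEY", "DB_URL", "DB_USERNAME", "DB_PASSWORD", "DB_HOST", "DB_PORT"]

missing = [name for name in REQUIRED_VARS if not os.getenv(name)]
if missing:
    raise ValueError(f"Missing environment variables: {', '.join(missing)}")
```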

View File

@@ -23,11 +23,11 @@ class StealthyFetcher:
print(f"Attempt {attempt + 1} to fetch {url}")
page = await self.context.new_page()
await page.goto(url, wait_until='load', timeout=60000)
await page.goto(url, wait_until='load', timeout=120000)
if wait_for_selector:
try:
await page.wait_for_selector(wait_for_selector, timeout=10000)
await page.wait_for_selector(wait_for_selector, timeout=40000)
except PlaywrightTimeoutError:
print(f"Selector {wait_for_selector} not found immediately, continuing...")
@@ -88,7 +88,7 @@ class StealthyFetcher:
async def _is_content_accessible(self, page: Page, wait_for_selector: Optional[str] = None) -> bool:
if wait_for_selector:
try:
await page.wait_for_selector(wait_for_selector, timeout=5000)
await page.wait_for_selector(wait_for_selector, timeout=40000)
return True
except PlaywrightTimeoutError:
pass
@@ -118,7 +118,7 @@ class StealthyFetcher:
if (time.time() - start_time) > 15 and (time.time() - start_time) % 20 < 2:
print("🔄 Reloading page during Cloudflare wait...")
await page.reload(wait_until='load', timeout=30000)
await page.reload(wait_until='load', timeout=120000)
print("⏰ Timeout waiting for Cloudflare resolution.")
return False
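The hunks above raise several hard-coded Playwright timeouts (navigation and reloads to 120 s, selector waits to 40 s). A sketch, assuming nothing beyond what the diff shows, of keeping them in one place so future tuning touches a single constant:

```python
# Sketch: central timeout settings (milliseconds) instead of scattered literals.
# Values mirror the ones introduced in this commit.
from dataclasses import dataclass

@dataclass(frozen=True)
class FetchTimeouts:
    navigation_ms: int = 120_000   # page.goto / wait_for_load_state
    selector_ms: int = 40_000      # page.wait_for_selector
    reload_ms: int = 120_000       # page.reload during Cloudflare waits

TIMEOUTS = FetchTimeouts()
# Hypothetical usage inside StealthyFetcher:
#   await page.goto(url, wait_until="load", timeout=TIMEOUTS.navigation_ms)
```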

View File

@@ -1,13 +1,12 @@
import asyncio
import random
import sqlite3
import os
from typing import Optional, Dict
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
from browserforge.injectors.playwright import AsyncNewContext
from llm_agent import LLMJobRefiner
import re
from fetcher import StealthyFetcher
from datetime import datetime
class LinkedInJobScraper:
@@ -26,25 +25,8 @@ class LinkedInJobScraper:
self.llm_agent = LLMJobRefiner()
def _init_db(self):
os.makedirs(os.path.dirname(self.db_path) if os.path.dirname(self.db_path) else ".", exist_ok=True)
with sqlite3.connect(self.db_path) as conn:
cursor = conn.cursor()
cursor.execute('''
CREATE TABLE IF NOT EXISTS jobs (
id INTEGER PRIMARY KEY AUTOINCREMENT,
title TEXT,
company_name TEXT,
location TEXT,
description TEXT,
requirements TEXT,
qualifications TEXT,
salary_range TEXT,
nature_of_work TEXT,
job_id TEXT,
url TEXT UNIQUE
)
''')
conn.commit()
# This method is kept for backward compatibility but LLMJobRefiner handles PostgreSQL now
pass
async def _human_click(self, page, element, wait_after: bool = True):
if not element:
@@ -61,7 +43,7 @@ class LinkedInJobScraper:
async def _login(self, page, credentials: Dict) -> bool:
print("🔐 Navigating to LinkedIn login page...")
await page.goto("https://www.linkedin.com/login", timeout=60000)
await page.goto("https://www.linkedin.com/login", timeout=120000)
await asyncio.sleep(random.uniform(2.0, 3.5) * self.human_speed)
email_field = await page.query_selector('input[name="session_key"]')
@@ -104,7 +86,11 @@ class LinkedInJobScraper:
print("❌ Login may have failed.")
return False
async def _extract_all_page_content(self, page) -> str:
async def _extract_page_content_for_llm(self, page) -> str:
"""
Extract raw page content as HTML/text for LLM processing
The LLM will handle all extraction logic, not specific selectors
"""
await asyncio.sleep(2 * self.human_speed)
await self.engine._human_like_scroll(page)
await asyncio.sleep(2 * self.human_speed)
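The rest of the method body is not shown in this hunk. A minimal sketch of what a selector-free extractor along these lines could return, assuming Playwright's page.content() as the source (an assumption, not necessarily the repository's implementation):

```python
# Sketch (assumption): hand the LLM the full rendered DOM rather than
# values picked out with CSS selectors.
from playwright.async_api import Page

async def extract_page_content_for_llm(page: Page) -> str:
    return await page.content()  # full rendered HTML; cleaned later before prompting
```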
@@ -172,7 +158,7 @@ class LinkedInJobScraper:
await self._human_click(page, next_btn)
await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed)
try:
await page.wait_for_function("() => window.location.href.includes('start=')", timeout=60000)
await page.wait_for_function("() => window.location.href.includes('start=')", timeout=120000)
except:
pass
current_page += 1
@@ -247,7 +233,7 @@ class LinkedInJobScraper:
if session_loaded:
print("🔁 Using saved session — verifying login...")
await page.goto("https://www.linkedin.com/feed/", timeout=60000)
await page.goto("https://www.linkedin.com/feed/", timeout=120000)
if "feed" in page.url and "login" not in page.url:
print("✅ Session still valid.")
login_successful = True
@@ -269,7 +255,7 @@ class LinkedInJobScraper:
print(" No credentials — proceeding as guest.")
login_successful = True
await page.wait_for_load_state("load", timeout=60000)
await page.wait_for_load_state("load", timeout=120000)
print("✅ Post-login page fully loaded. Starting search...")
# >>> PROTECTION CHECK USING FETCHER LOGIC <<<
@@ -292,7 +278,7 @@ class LinkedInJobScraper:
print("✅ Protection present but content accessible — proceeding.")
print(f"🔍 Searching for: {search_keywords}")
await page.goto(search_url, wait_until='load', timeout=60000)
await page.goto(search_url, wait_until='load', timeout=120000)
await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed)
# >>> PROTECTION CHECK ON SEARCH PAGE <<<
@@ -322,7 +308,7 @@ class LinkedInJobScraper:
print(f" Found {initial_jobs} initial job(s) (total: {len(all_job_links)})")
iteration = 1
while True:
while True and iteration >= 5:
print(f"🔄 Iteration {iteration}: Checking for new jobs...")
prev_job_count = len(all_job_links)
@@ -355,10 +341,6 @@ class LinkedInJobScraper:
print("🔚 No new jobs found after refresh. Stopping.")
break
if iteration > 10:
print("🔄 Maximum iterations reached. Stopping.")
break
print(f"✅ Collected {len(all_job_links)} unique job links.")
scraped_count = 0
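For reference, a minimal sketch of a collect-until-stable loop with an explicit pass limit; fetch_links here is a hypothetical coroutine standing in for the scraper's link-gathering step:

```python
# Sketch: stop when a pass adds nothing new, or after max_passes refreshes.
async def collect_job_links(fetch_links, max_passes: int = 5) -> set[str]:
    links: set[str] = set()
    for _ in range(max_passes):
        before = len(links)
        links.update(await fetch_links())   # hypothetical async provider
        if len(links) == before:            # no new jobs after this refresh
            break
    return links
```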
@@ -386,8 +368,9 @@ class LinkedInJobScraper:
if apply_btn:
break
page_data = None
final_url = job_page.url
final_url = full_url
external_url = None
page_content = None
if apply_btn:
print(" → Clicking 'Apply' / 'Easy Apply' button...")
@@ -399,44 +382,61 @@ class LinkedInJobScraper:
try:
external_page = await asyncio.wait_for(page_waiter, timeout=5.0)
print(" 🌐 External job site opened in new tab.")
await external_page.wait_for_load_state("load", timeout=60000)
await external_page.wait_for_load_state("load", timeout=120000)
await asyncio.sleep(2 * self.human_speed)
await self.engine._human_like_scroll(external_page)
await asyncio.sleep(2 * self.human_speed)
page_data = await self._extract_all_page_content(external_page)
final_url = external_page.url
# Extract raw content from external page for LLM processing
external_url = external_page.url
final_url = external_url
page_content = await self._extract_page_content_for_llm(external_page)
if not external_page.is_closed():
await external_page.close()
except asyncio.TimeoutError:
print(" 🖥️ No external tab — scraping LinkedIn job page directly.")
await job_page.wait_for_timeout(2000)
await job_page.wait_for_timeout(60000)
try:
await job_page.wait_for_selector("div.jobs-apply-button--fixed, div.jobs-easy-apply-modal", timeout=8000)
await job_page.wait_for_selector("div.jobs-apply-button--fixed, div.jobs-easy-apply-modal", timeout=80000)
except PlaywrightTimeoutError:
pass
await self.engine._human_like_scroll(job_page)
await asyncio.sleep(2 * self.human_speed)
page_data = await self._extract_all_page_content(job_page)
page_content = await self._extract_page_content_for_llm(job_page)
else:
print(" ⚠️ No 'Apply' button found — scraping job details directly.")
await self.engine._human_like_scroll(job_page)
await asyncio.sleep(2 * self.human_speed)
page_data = await self._extract_all_page_content(job_page)
page_content = await self._extract_page_content_for_llm(job_page)
job_id = final_url.split("/")[-2] if "/jobs/view/" in final_url else "unknown"
job_id = full_url.split("/")[-2] if "/jobs/view/" in full_url else "unknown"
raw_data = {
"page_content": page_data,
"url": job_page.url,
"job_id": job_page.url.split("/")[-2] if "/jobs/view/" in job_page.url else "unknown"
"page_content": page_content,
"url": final_url,
"job_id": job_id,
"search_keywords": search_keywords
}
# LLM agent is now fully responsible for extraction and validation
refined_data = await self.llm_agent.refine_job_data(raw_data, self.user_request)
if refined_data and refined_data.get("title", "N/A") != "N/A":
# Ensure compulsory fields are present (fallback if LLM missed them)
compulsory_fields = ['company_name', 'job_id', 'url']
for field in compulsory_fields:
if not refined_data.get(field) or refined_data[field] in ["N/A", "", "Unknown"]:
if field == 'job_id':
refined_data[field] = job_id
elif field == 'url':
refined_data[field] = final_url
elif field == 'company_name':
refined_data[field] = "Unknown Company"
refined_data['scraped_at'] = datetime.now().isoformat()
refined_data['category'] = clean_keywords
await self.llm_agent.save_job_data(refined_data, search_keywords)
scraped_count += 1
print(f" ✅ Scraped and refined: {refined_data['title'][:50]}...")
@@ -455,7 +455,7 @@ class LinkedInJobScraper:
finally:
print(" ↩️ Returning to LinkedIn search results...")
await page.goto(search_url, timeout=60000)
await page.goto(search_url, timeout=120000)
await asyncio.sleep(4 * self.human_speed)
await browser.close()
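The external-tab branch above races a page waiter against a short timeout; the same pattern in isolation, with page_waiter standing for an awaitable created before the 'Apply' click (names are illustrative):

```python
# Sketch: prefer a newly opened tab, fall back to the LinkedIn job page.
import asyncio

async def resolve_post_apply_page(page_waiter, job_page, wait_s: float = 5.0):
    try:
        return await asyncio.wait_for(page_waiter, timeout=wait_s)
    except asyncio.TimeoutError:
        return job_page
```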

View File

@@ -4,6 +4,8 @@ from job_scraper2 import LinkedInJobScraper
import os
from dotenv import load_dotenv
import asyncio
import random
import time
# Load environment variables
load_dotenv()
@@ -11,7 +13,7 @@ load_dotenv()
async def main():
engine = FingerprintScrapingEngine(
seed="job_scraping_123",
seed="job_scraping_12",
target_os="windows",
db_path="job_listings.db",
markdown_path="job_listings.md"
@@ -20,13 +22,50 @@ async def main():
# Initialize scraper with target field
scraper = LinkedInJobScraper(engine, human_speed=1.6, user_request="Extract title, company, location, description, requirements, qualifications, nature of job(remote, onsite, hybrid) and salary")
await scraper.scrape_jobs(
search_keywords="Web Designer location:New York",
credentials={
"email": os.getenv("SCRAPING_USERNAME"),
"password": os.getenv("SCRAPING_PASSWORD")
}
)
# List of job titles to cycle through
job_titles = [
"Software Engineer",
"Data Scientist",
"Product Manager",
"UX Designer",
"DevOps Engineer",
"Machine Learning Engineer",
"Frontend Developer",
"Backend Developer",
"Full Stack Developer",
"Data Analyst"
]
fixed_location = "New York"
# Keep cycling through all job titles
while True:
# Shuffle job titles to randomize order
random.shuffle(job_titles)
for job_title in job_titles:
search_keywords = f"{job_title} location:{fixed_location}"
print(f"\n{'='*60}")
print(f"Starting scrape for: {search_keywords}")
print(f"{'='*60}")
await scraper.scrape_jobs(
search_keywords=search_keywords,
credentials={
"email": os.getenv("SCRAPING_USERNAME"),
"password": os.getenv("SCRAPING_PASSWORD")
}
)
print(f"\n✅ Completed scraping for: {job_title}")
print(f"⏳ Waiting 2 minutes before next job title...")
# Wait 2 minutes before next job title
time.sleep(120)
print(f"\n✅ Completed full cycle of all job titles")
print(f"🔄 Starting new cycle...")
if __name__ == "__main__":
asyncio.run(main())
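Because main() runs inside the asyncio event loop, the two-minute pause between titles could also be awaited rather than slept synchronously; a sketch using asyncio.sleep under that assumption:

```python
# Sketch: non-blocking two-minute pause between job titles.
import asyncio

async def pause_between_titles(minutes: float = 2.0) -> None:
    await asyncio.sleep(minutes * 60)

# Inside main():  await pause_between_titles()  instead of time.sleep(120)
```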

View File

@@ -1,131 +1,219 @@
from openai import OpenAI
from typing import Dict, Any, Optional
from typing import Dict, Any
import asyncio
import sqlite3
import psycopg2
import os
from datetime import datetime
import json
import re
from bs4 import BeautifulSoup
from dotenv import load_dotenv
# ✅ Actually load .env
# Load environment variables from .env
load_dotenv()
class LLMJobRefiner:
def __init__(self):
xai_api_key = os.getenv("XAI_API_KEY")
if not xai_api_key:
raise ValueError("XAI_API_KEY not found in environment variables.")
deepseek_api_key = os.getenv("DEEPSEEK_API_KEY")
if not deepseek_api_key:
raise ValueError("DEEPSEEK_API_KEY not found in .env file.")
self.client = OpenAI(api_key=xai_api_key, base_url="https://api.x.ai/v1")
self.model = "grok-4-latest"
self.extraction_schema_cache = {}
# Database credentials from .env
self.db_url = os.getenv("DB_URL")
self.db_username = os.getenv("DB_USERNAME")
self.db_password = os.getenv("DB_PASSWORD")
self.db_host = os.getenv("DB_HOST")
self.db_port = os.getenv("DB_PORT")
def generate_content(self, prompt: str, system_message: str = "You are a helpful assistant.") -> str:
"""Synchronous method to call Grok via xAI API."""
if not self.db_url or not self.db_username or not self.db_password:
raise ValueError("Database credentials not found in .env file.")
# DeepSeek uses OpenAI-compatible API
self.client = OpenAI(
api_key=deepseek_api_key,
base_url="https://api.deepseek.com/v1"
)
self.model = "deepseek-chat"
self._init_db()
def _init_db(self):
"""Initialize PostgreSQL database connection and create table"""
try:
self.db_url = os.getenv("DB_URL")
if self.db_url and "supabase.com" in self.db_url:
conn = psycopg2.connect(
host=self.db_host,
port=self.db_port,
database="postgres",
user=self.db_username,
password=self.db_password
)
else:
conn = psycopg2.connect(
host=self.db_host,
port=self.db_port,
database="postgres",
user=self.db_username,
password=self.db_password
)
cursor = conn.cursor()
cursor.execute('''
CREATE TABLE IF NOT EXISTS jobs (
id SERIAL PRIMARY KEY,
title TEXT,
company_name TEXT,
location TEXT,
description TEXT,
requirements TEXT,
qualifications TEXT,
salary_range TEXT,
nature_of_work TEXT,
job_id TEXT UNIQUE,
url TEXT,
category TEXT,
scraped_at TIMESTAMP,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
''')
# Ensure the uniqueness constraint exists
cursor.execute('''
ALTER TABLE jobs DROP CONSTRAINT IF EXISTS jobs_job_id_key;
ALTER TABLE jobs ADD CONSTRAINT jobs_job_id_key UNIQUE (job_id);
''')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_job_id ON jobs(job_id)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_category ON jobs(category)')
conn.commit()
cursor.close()
conn.close()
print("✅ PostgreSQL database initialized successfully")
except Exception as e:
print(f"❌ Database initialization error: {e}")
raise
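_init_db and _save_to_db both open a psycopg2 connection from the same environment variables; a sketch of a shared helper under the same settings (the fixed "postgres" database name mirrors the code above):

```python
# Sketch: one connection helper reused by _init_db and _save_to_db.
import os
import psycopg2

def connect_jobs_db():
    return psycopg2.connect(
        host=os.getenv("DB_HOST"),
        port=os.getenv("DB_PORT"),
        database="postgres",
        user=os.getenv("DB_USERNAME"),
        password=os.getenv("DB_PASSWORD"),
    )
```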
def _clean_html_for_llm(self, html_content: str) -> str:
"""Clean HTML to make it more readable for LLM while preserving structure"""
try:
soup = BeautifulSoup(html_content, 'html.parser')
# Remove script and style elements
for script in soup(["script", "style", "nav", "footer", "header"]):
script.decompose()
# Extract text but keep some structure
text = soup.get_text(separator=' ', strip=True)
# Clean up whitespace
text = re.sub(r'\s+', ' ', text)
# Limit length for LLM context
if len(text) > 10000:
text = text[:10000] + "..."
return text
except Exception as e:
print(f"HTML cleaning error: {e}")
# Fallback to raw content if cleaning fails
return html_content[:100000] if len(html_content) > 100000 else html_content
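A self-contained illustration of the same cleaning steps on a made-up snippet:

```python
# Illustration only: strip script/style/nav, then flatten whitespace.
import re
from bs4 import BeautifulSoup

html = "<html><nav>menu</nav><body><h1>Data  Analyst</h1><script>x()</script><p>Remote role</p></body></html>"
soup = BeautifulSoup(html, "html.parser")
for tag in soup(["script", "style", "nav", "footer", "header"]):
    tag.decompose()
text = re.sub(r"\s+", " ", soup.get_text(separator=" ", strip=True))
print(text)  # -> "Data Analyst Remote role"
```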
def _generate_content_sync(self, prompt: str) -> str:
"""Synchronous call to DeepSeek API"""
try:
response = self.client.chat.completions.create(
model=self.model,
messages=[
{"role": "system", "content": system_message},
{"role": "user", "content": prompt}
],
messages=[{"role": "user", "content": prompt}],
temperature=0.2,
max_tokens=2048,
stream=False
)
return response.choices[0].message.content or ""
except Exception as e:
print(f"Error in Grok API call: {e}")
print(f"DeepSeek API error: {e}")
return ""
async def refine_job_data(self, raw_data: Dict[str, Any], user_request: str) -> Optional[Dict[str, Any]]:
async def refine_job_data(self, raw_data: Dict[str, Any], target_field: str) -> Dict[str, Any]:
# Clean the raw HTML content for better LLM processing
page_content = raw_data.get('page_content', '')
if not page_content:
return None
cleaned_content = self._clean_html_for_llm(page_content)
schema_key = user_request.lower().strip()
extraction_schema = self.extraction_schema_cache.get(schema_key)
if not extraction_schema:
extraction_schema = await self._generate_extraction_schema(user_request)
if extraction_schema:
self.extraction_schema_cache[schema_key] = extraction_schema
else:
extraction_schema = self._get_default_schema()
# Get job_id and url from raw data
job_id = raw_data.get('job_id', 'unknown')
url = raw_data.get('url', 'N/A')
prompt = f"""
You are a highly skilled web data extraction assistant. Your task is to analyze the raw HTML content of a job posting page and extract specific information requested by the user.
The user's request is: "{user_request}"
The raw HTML content of the page is provided below (limited in size). The content might be noisy or unstructured.
Your goal is to:
1. Analyze the HTML structure to identify relevant sections.
2. Extract the requested information accurately.
3. Clean up formatting issues.
4. If a field cannot be found, use "N/A".
5. Return ONLY the extracted data in a JSON object based on the following schema:
{json.dumps(extraction_schema, indent=2)}
Raw Page Content (HTML):
{page_content[:6000]}
You are a job posting data extractor with two modes:
Respond with the JSON object containing the extracted data.
PRIMARY MODE (PREFERRED):
- Extract EXACT text as it appears on the page for all fields
- DO NOT summarize, paraphrase, or interpret
- Copy verbatim content including original wording and formatting
FALLBACK MODE (ONLY IF FIELD IS MISSING):
- If a field is NOT explicitly stated anywhere in the content, you MAY infer it using clear contextual clues
- Inference rules:
* company_name: Look for patterns like "at [Company]", "Join [Company]", "[Company] is hiring"
* nature_of_work: Look for "remote", "onsite", "hybrid", "work from home", "office-based"
* location: Extract city/state/country mentions near job title or details
* title: Use the largest/primary heading if no explicit "job title" label exists
REQUIRED FIELDS (must always have a value):
- title: Exact job title or best inference
- company_name: Exact company name or best inference
- job_id: Use provided: {job_id}
- url: Use provided: {url}
OPTIONAL FIELDS (use exact text or "N/A" if not present and not inferable):
- location
- description
- requirements
- qualifications
- salary_range
- nature_of_work
Page Content:
{cleaned_content}
Response format (ONLY return this JSON):
{{
"title": "...",
"company_name": "...",
"location": "...",
"description": "...",
"requirements": "...",
"qualifications": "...",
"salary_range": "...",
"nature_of_work": "...",
"job_id": "{job_id}",
"url": "{url}"
}}
"""
try:
# ✅ Use self (current instance), NOT a new LLMJobRefiner()
response_text = await asyncio.get_event_loop().run_in_executor(
None,
lambda: self.generate_content(prompt)
lambda: self._generate_content_sync(prompt)
)
refined_data = self._parse_llm_response(response_text)
if not refined_data:
return None
refined_data['job_id'] = raw_data.get('job_id', 'unknown')
refined_data['url'] = raw_data.get('url', 'N/A')
return refined_data
# Final validation - ensure required fields are present and meaningful
if refined_data:
required_fields = ['title', 'company_name', 'job_id', 'url']
for field in required_fields:
if not refined_data.get(field) or refined_data[field] in ["N/A", "", "Unknown", "Company", "Job"]:
return None # LLM failed to extract properly
return refined_data
return None
except Exception as e:
print(f"LLM refinement failed: {str(e)}")
return None
async def _generate_extraction_schema(self, user_request: str) -> Optional[Dict[str, str]]:
schema_prompt = f"""
Based on the user's request: "{user_request}", generate a JSON schema for the data they want to extract from a job posting.
The schema should be a dictionary where keys are field names (snake_case) and values are short descriptions.
Include standard fields like title, company_name, location, description, etc., if relevant.
Respond with only the JSON schema.
"""
try:
# ✅ Use self.generate_content, NOT self.model.generate_content
schema_text = await asyncio.get_event_loop().run_in_executor(
None,
lambda: self.generate_content(schema_prompt)
)
json_match = re.search(r'```(?:json)?\s*({.*?})\s*```', schema_text, re.DOTALL)
if not json_match:
json_match = re.search(r'\{.*\}', schema_text, re.DOTALL)
if not json_match:
return None
json_str = json_match.group(1) if '```' in schema_text else json_match.group(0)
return json.loads(json_str)
except Exception as e:
print(f"Schema generation failed: {str(e)}")
return None
def _get_default_schema(self) -> Dict[str, str]:
return {
"title": "The job title",
"company_name": "The name of the company",
"location": "The location of the job",
"description": "The full job description",
"requirements": "List of job requirements",
"qualifications": "List of required qualifications",
"salary_range": "The salary range mentioned",
"nature_of_work": "Remote, onsite, or hybrid"
}
def _parse_llm_response(self, response_text: str) -> Optional[Dict[str, Any]]:
def _parse_llm_response(self, response_text: str) -> Dict[str, Any]:
json_match = re.search(r'```(?:json)?\s*({.*?})\s*```', response_text, re.DOTALL)
if not json_match:
json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
@@ -142,17 +230,46 @@ class LLMJobRefiner:
await self._save_to_markdown(job_data, keyword)
async def _save_to_db(self, job_data: Dict[str, Any]):
db_path = "linkedin_jobs.db"
os.makedirs(os.path.dirname(db_path) or ".", exist_ok=True)
with sqlite3.connect(db_path) as conn:
"""Save job data to PostgreSQL database with job_id uniqueness"""
try:
conn = psycopg2.connect(
host=self.db_host,
port=self.db_port,
database="postgres",
user=self.db_username,
password=self.db_password
)
cursor = conn.cursor()
fields = list(job_data.keys())
placeholders = ', '.join(['?' for _ in fields])
columns = ', '.join([f'"{col}"' for col in fields]) # Escape column names
cursor.execute(f"CREATE TABLE IF NOT EXISTS jobs ({columns})")
cursor.execute(f'INSERT INTO jobs ({columns}) VALUES ({placeholders})',
[job_data.get(field, 'N/A') for field in fields])
cursor.execute('''
INSERT INTO jobs
(title, company_name, location, description, requirements,
qualifications, salary_range, nature_of_work, job_id, url, category, scraped_at)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (job_id) DO NOTHING
''', (
job_data.get("title", "N/A"),
job_data.get("company_name", "N/A"),
job_data.get("location", "N/A"),
job_data.get("description", "N/A"),
job_data.get("requirements", "N/A"),
job_data.get("qualifications", "N/A"),
job_data.get("salary_range", "N/A"),
job_data.get("nature_of_work", "N/A"),
job_data.get("job_id", "N/A"),
job_data.get("url", "N/A"),
job_data.get("category", "N/A"),
job_data.get("scraped_at")
))
conn.commit()
cursor.close()
conn.close()
print(f" 💾 Saved job to category '{job_data.get('category', 'N/A')}' with job_id: {job_data.get('job_id', 'N/A')}")
except Exception as e:
print(f"❌ Database save error: {e}")
async def _save_to_markdown(self, job_data: Dict[str, Any], keyword: str):
os.makedirs("linkedin_jobs", exist_ok=True)
@@ -164,7 +281,15 @@ class LLMJobRefiner:
f.write(f"# LinkedIn Jobs - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
f.write(f"## Job: {job_data.get('title', 'N/A')}\n\n")
f.write(f"- **Keyword**: {keyword}\n")
for key, value in job_data.items():
if key != 'title':
f.write(f"- **{key.replace('_', ' ').title()}**: {value}\n")
f.write("\n---\n\n")
f.write(f"- **Company**: {job_data.get('company_name', 'N/A')}\n")
f.write(f"- **Location**: {job_data.get('location', 'N/A')}\n")
f.write(f"- **Nature of Work**: {job_data.get('nature_of_work', 'N/A')}\n")
f.write(f"- **Salary Range**: {job_data.get('salary_range', 'N/A')}\n")
f.write(f"- **Job ID**: {job_data.get('job_id', 'N/A')}\n")
f.write(f"- **Category**: {job_data.get('category', 'N/A')}\n")
f.write(f"- **Scraped At**: {job_data.get('scraped_at', 'N/A')}\n")
f.write(f"- **URL**: <{job_data.get('url', 'N/A')}>\n\n")
f.write(f"### Description\n\n{job_data.get('description', 'N/A')}\n\n")
f.write(f"### Requirements\n\n{job_data.get('requirements', 'N/A')}\n\n")
f.write(f"### Qualifications\n\n{job_data.get('qualifications', 'N/A')}\n\n")
f.write("---\n\n")

View File

@@ -69,7 +69,7 @@ class FingerprintScrapingEngine:
self.optimization_params = {
"base_delay": 2.0,
"max_concurrent_requests": 4,
"request_timeout": 60000,
"request_timeout": 120000,
"retry_attempts": 3,
"captcha_handling_strategy": "avoid", # or "solve_fallback"
"cloudflare_wait_strategy": "smart_wait", # or "aggressive_reload"
@@ -155,7 +155,7 @@ class FingerprintScrapingEngine:
# Increase timeout if avg response time is high
if avg_rt > 20:
self.optimization_params["request_timeout"] = 90000 # 90 seconds
self.optimization_params["request_timeout"] = 150000 # 90 seconds
print(f"Optimization Params Updated: {self.optimization_params}")
@@ -371,7 +371,7 @@ class FingerprintScrapingEngine:
# Reload occasionally to trigger potential client-side checks
if (time.time() - start_time) > 15 and (time.time() - start_time) % 20 < 2:
print("Reloading page during Cloudflare wait...")
await page.reload(wait_until='load', timeout=30000)
await page.reload(wait_until='load', timeout=80000)
print("Timeout waiting for Cloudflare resolution.")
return False
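The reload condition above triggers inside a two-second window roughly every 20 seconds once 15 seconds have elapsed; the same wait-and-reload pattern in isolation, with is_clear as a hypothetical async predicate:

```python
# Sketch: wait for a challenge to clear, reloading roughly every 20 s.
import asyncio
import time

async def wait_out_challenge(page, is_clear, max_wait_s: float = 120.0) -> bool:
    start = time.time()
    while (elapsed := time.time() - start) < max_wait_s:
        if await is_clear(page):              # hypothetical async predicate
            return True
        if elapsed > 15 and elapsed % 20 < 2:
            await page.reload(wait_until="load", timeout=80_000)
        await asyncio.sleep(2)
    return False
```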