- Update fetch timeout in StealthyFetcher for improved reliability.
- Refactor LLMJobRefiner to create and manage the Quelah Jobs table in PostgreSQL.
- Modify RedisManager to track sent job counts for jobs.csv and adjust deduplication logic.
- Implement job URL-based deduplication across scraper and sender.
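RedisManager itself is not part of this file, so the deduplication change is only summarized above. As a rough illustration (the Redis key name, client setup, and helper name below are assumptions, not the project's actual RedisManager API), URL-based deduplication shared by scraper and sender could look like this:

```python
import redis

# Hypothetical sketch: a Redis set keyed by job URL, consulted before sending a job.
# The key "quelah:sent_job_urls" and this helper are illustrative only.
r = redis.Redis(host="localhost", port=6379, decode_responses=True)

def is_new_job_url(job_url: str) -> bool:
    """Record the URL and report whether it was seen for the first time."""
    # SADD returns 1 when the member was newly added, 0 when it already existed.
    return r.sadd("quelah:sent_job_urls", job_url) == 1
```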
from openai import OpenAI
from typing import Any, Dict, Optional
import asyncio
import psycopg2
import os
from datetime import datetime
import json
import re
from bs4 import BeautifulSoup
from dotenv import load_dotenv

# Load environment variables from .env
load_dotenv()

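# Environment variables expected in .env (read below): DEEPSEEK_API_KEY, DB_URL,
# DB_USERNAME, DB_PASSWORD, DB_HOST, DB_PORT. Note that DB_URL is loaded and
# validated but the connections below use DB_HOST/DB_PORT with the fixed
# "postgres" database.
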
class LLMJobRefiner:
    def __init__(self):
        deepseek_api_key = os.getenv("DEEPSEEK_API_KEY")
        if not deepseek_api_key:
            raise ValueError("DEEPSEEK_API_KEY not found in .env file.")

        # Database credentials from .env
        self.db_url = os.getenv("DB_URL")
        self.db_username = os.getenv("DB_USERNAME")
        self.db_password = os.getenv("DB_PASSWORD")
        self.db_host = os.getenv("DB_HOST")
        self.db_port = os.getenv("DB_PORT")

        if not self.db_url or not self.db_username or not self.db_password:
            raise ValueError("Database credentials not found in .env file.")

        # DeepSeek uses an OpenAI-compatible API
        self.client = OpenAI(
            api_key=deepseek_api_key,
            base_url="https://api.deepseek.com/v1"
        )
        self.model = "deepseek-chat"
        self._init_db()

    def _init_db(self):
        """Initialize PostgreSQL database connection and create Quelah Jobs table"""
        try:
            conn = psycopg2.connect(
                host=self.db_host,
                port=self.db_port,
                database="postgres",
                user=self.db_username,
                password=self.db_password
            )
            cursor = conn.cursor()

            # ✅ CREATE NEW TABLE: quelah_jobs (no requirements field)
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS quelah_jobs (
                    id SERIAL PRIMARY KEY,
                    title TEXT,
                    company_name TEXT,
                    location TEXT,
                    description TEXT,
                    qualifications TEXT,
                    salary_range TEXT,
                    nature_of_work TEXT,
                    apply_type TEXT DEFAULT 'signup',
                    job_id TEXT UNIQUE,
                    url TEXT,
                    category TEXT,
                    scraped_at TIMESTAMP,
                    posted_date TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            ''')

            # Ensure uniqueness constraint on job_id
            cursor.execute('''
                ALTER TABLE quelah_jobs DROP CONSTRAINT IF EXISTS quelah_jobs_job_id_key;
                ALTER TABLE quelah_jobs ADD CONSTRAINT quelah_jobs_job_id_key UNIQUE (job_id);
            ''')

            # Create indexes
            cursor.execute('CREATE INDEX IF NOT EXISTS idx_quelah_job_id ON quelah_jobs(job_id)')
            cursor.execute('CREATE INDEX IF NOT EXISTS idx_quelah_category ON quelah_jobs(category)')
            cursor.execute('CREATE INDEX IF NOT EXISTS idx_quelah_posted_date ON quelah_jobs(posted_date)')
            cursor.execute('CREATE INDEX IF NOT EXISTS idx_quelah_apply_type ON quelah_jobs(apply_type)')

            conn.commit()
            cursor.close()
            conn.close()
            print("✅ Quelah Jobs table initialized successfully")
        except Exception as e:
            print(f"❌ Database initialization error: {e}")
            raise

    def _clean_html_for_llm(self, html_content: str) -> str:
        """Clean HTML to make it more readable for the LLM while preserving key job structure"""
        try:
            soup = BeautifulSoup(html_content, 'html.parser')

            # Remove unwanted elements
            for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside', 'noscript']):
                element.decompose()

            # Keep only main content containers
            main_content = None
            candidates = [
                soup.find('main'),
                soup.find('div', class_=re.compile(r'job|posting|content')),
                soup.find('article'),
                soup.body
            ]

            for candidate in candidates:
                if candidate:
                    main_content = candidate
                    break

            if not main_content:
                main_content = soup.body or soup

            # Extract text with some structure
            lines = []
            for elem in main_content.descendants:
                if isinstance(elem, str):
                    text = elem.strip()
                    if text and len(text) > 5:  # Skip short fragments
                        lines.append(text)
                elif elem.name in ['h1', 'h2', 'h3', 'h4', 'p', 'li', 'strong', 'b']:
                    text = elem.get_text().strip()
                    if text:
                        lines.append(text)

            # Join with newlines for better LLM parsing
            cleaned = '\n'.join(lines)

            # Limit length for LLM context
            if len(cleaned) > 10000:
                cleaned = cleaned[:10000] + "..."

            return cleaned
        except Exception as e:
            print(f"HTML cleaning error: {e}")
            return html_content[:100000] if len(html_content) > 100000 else html_content

    def _generate_content_sync(self, prompt: str) -> str:
        """Synchronous call to the DeepSeek API"""
        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.2,
                max_tokens=2048,
                stream=False
            )
            return response.choices[0].message.content or ""
        except Exception as e:
            print(f"DeepSeek API error: {e}")
            return ""

    async def refine_job_data(self, raw_data: Dict[str, Any], target_field: str) -> Optional[Dict[str, Any]]:
        """Refine raw scraped page content into structured job fields via the LLM."""
        page_content = raw_data.get('page_content', '')
        cleaned_content = self._clean_html_for_llm(page_content)
        job_id = raw_data.get('job_id', 'unknown')
        url = raw_data.get('url', 'N/A')
        posted_date = raw_data.get('posted_date', "12/01/25")  # ✅ Fixed date

        # Detect platform from URL (for prompt only)
        platform = "unknown"
        if "ashbyhq.com" in url:
            platform = "ashby"
        elif "lever.co" in url:
            platform = "lever"
        elif "greenhouse.io" in url:
            platform = "greenhouse"

        # Platform-specific instructions
        platform_instructions = ""
        if platform == "ashby":
            platform_instructions = """
For Ashby jobs:
- Title is usually in <h1> or <h2>
- Company name is often in <meta> or header
- Description is in <div class="job-posting"> or <article>
- Look for sections like "About Us", "What you'll do", "Qualifications", "Benefits"
- Location may be in <span> near job title or in metadata
"""
        elif platform == "lever":
            platform_instructions = """
For Lever jobs:
- Title is in <h1> or <h2>
- Company name is in <title> or header
- Description is in <div class="job-description"> or <section>
- Look for headings like "What you'll do", "What you'll need", "Why join us"
- Location is often in <div class="location">
"""
        elif platform == "greenhouse":
            platform_instructions = """
For Greenhouse jobs:
- Title is in <h1> or <h2>
- Company name is in <meta> or header
- Description is in <div class="job-desc"> or <section>
- Look for headings like "Role overview", "What you'll do", "What you bring"
- Location is often in <div class="location">
"""

        prompt = f"""
You are an expert job posting parser. Extract information EXACTLY as it appears in the text. DO NOT summarize, paraphrase, or invent.

CRITICAL INSTRUCTIONS:
{platform_instructions}

FIELD RULES:
- description: MUST include ALL role details, responsibilities, and overview. Never "Not provided" if any job description exists.
- qualifications: MUST include ALL required skills, experience, education, and preferred qualifications. Combine them.
- location: Extract city, state, or remote status if available.
- salary_range: Extract if explicitly mentioned (e.g., "$70,000–$85,000").
- nature_of_work: Extract if mentioned (e.g., "Part-time", "Remote", "On-site").

REQUIRED FIELDS (must have valid values, never "N/A"):
- title, company_name, job_id, url, description

OPTIONAL FIELDS (can be "Not provided" if the information is actually not provided):
- location, salary_range, nature_of_work

⚠️ IMPORTANT: Do NOT include or extract a "requirements" field. Focus only on description and qualifications.

Page Content:
{cleaned_content}

Response format (ONLY return this JSON):
{{
    "title": "...",
    "company_name": "...",
    "location": "...",
    "description": "...",
    "qualifications": "...",
    "salary_range": "...",
    "nature_of_work": "...",
    "job_id": "{job_id}",
    "url": "{url}"
}}
"""

        try:
            response_text = await asyncio.get_running_loop().run_in_executor(
                None,
                lambda: self._generate_content_sync(prompt)
            )
            refined_data = self._parse_llm_response(response_text)

            if not refined_data:
                return None

            # Validate required fields
            required_fields = ['title', 'company_name', 'job_id', 'url', 'description']
            for field in required_fields:
                if not refined_data.get(field) or refined_data[field].strip() in ["N/A", "", "Unknown", "Company", "Job"]:
                    return None

            # Add the fixed posted_date
            refined_data['posted_date'] = posted_date

            return refined_data

        except Exception as e:
            print(f"LLM refinement failed: {str(e)}")
            return None

    def _parse_llm_response(self, response_text: str) -> Optional[Dict[str, Any]]:
        """Extract the JSON object from the LLM response, with or without a code fence."""
        # Prefer a fenced ```json ... ``` block; fall back to the first raw JSON object.
        json_match = re.search(r'```(?:json)?\s*({.*?})\s*```', response_text, re.DOTALL)
        if json_match:
            json_str = json_match.group(1)
        else:
            json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
            if not json_match:
                return None
            json_str = json_match.group(0)

        try:
            return json.loads(json_str)
        except json.JSONDecodeError:
            return None

    async def save_job_data(self, job_data: Dict[str, Any], keyword: str, platform: str = "quelah"):
        """Save ALL jobs to the Quelah Jobs table and to markdown"""
        await self._save_to_db_quelah(job_data)
        await self._save_to_markdown_quelah(job_data, keyword)

    async def _save_to_db_quelah(self, job_data: Dict[str, Any]):
        """Save job data to the Quelah Jobs table"""
        try:
            conn = psycopg2.connect(
                host=self.db_host,
                port=self.db_port,
                database="postgres",
                user=self.db_username,
                password=self.db_password
            )
            cursor = conn.cursor()

            # Set apply_type if not present
            apply_type = job_data.get("apply_type", "signup")

            cursor.execute('''
                INSERT INTO quelah_jobs
                (title, company_name, location, description, qualifications,
                 salary_range, nature_of_work, apply_type, job_id, url, category, scraped_at, posted_date)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                ON CONFLICT (job_id) DO NOTHING
            ''', (
                job_data.get("title", "N/A"),
                job_data.get("company_name", "N/A"),
                job_data.get("location", "N/A"),
                job_data.get("description", "N/A"),
                job_data.get("qualifications", "N/A"),
                job_data.get("salary_range", "N/A"),
                job_data.get("nature_of_work", "N/A"),
                apply_type,
                job_data.get("job_id", "N/A"),
                job_data.get("url", "N/A"),
                job_data.get("category", "N/A"),
                job_data.get("scraped_at"),
                job_data.get("posted_date", "12/01/25")  # Fixed date
            ))

            conn.commit()
            cursor.close()
            conn.close()

            print(f"   💾 Saved to Quelah Jobs | Job ID: {job_data.get('job_id', 'N/A')}")

        except Exception as e:
            print(f"❌ Database save error: {e}")

    async def _save_to_markdown_quelah(self, job_data: Dict[str, Any], keyword: str):
        """Append the job entry to quelah_jobs/quelah_jobs.md"""
        os.makedirs("quelah_jobs", exist_ok=True)
        filepath = os.path.join("quelah_jobs", "quelah_jobs.md")
        write_header = not os.path.exists(filepath) or os.path.getsize(filepath) == 0

        with open(filepath, "a", encoding="utf-8") as f:
            if write_header:
                f.write(f"# Quelah Jobs - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
            f.write(f"## Job: {job_data.get('title', 'N/A')}\n\n")
            f.write(f"- *Keyword*: {keyword}\n")
            f.write(f"- *Company*: {job_data.get('company_name', 'N/A')}\n")
            f.write(f"- *Location*: {job_data.get('location', 'N/A')}\n")
            f.write(f"- *Nature of Work*: {job_data.get('nature_of_work', 'N/A')}\n")
            f.write(f"- *Salary Range*: {job_data.get('salary_range', 'N/A')}\n")
            f.write(f"- *Apply Type*: {job_data.get('apply_type', 'signup')}\n")
            f.write(f"- *Job ID*: {job_data.get('job_id', 'N/A')}\n")
            f.write(f"- *Posted Date*: {job_data.get('posted_date', '12/01/25')}\n")  # Fixed date
            f.write(f"- *Category*: {job_data.get('category', 'N/A')}\n")
            f.write(f"- *Scraped At*: {job_data.get('scraped_at', 'N/A')}\n")
            f.write(f"- *URL*: <{job_data.get('url', 'N/A')}>\n\n")
            f.write(f"### Description\n\n{job_data.get('description', 'N/A')}\n\n")
            # ✅ REMOVED requirements section
            f.write(f"### Qualifications\n\n{job_data.get('qualifications', 'N/A')}\n\n")
            f.write("---\n\n")
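

# --- Hypothetical usage sketch (not part of the pipeline shown in this file) ---
# A minimal illustration of how LLMJobRefiner might be driven by a caller that
# already has the rendered page HTML. Requires DEEPSEEK_API_KEY and the database
# credentials in .env; the raw_data values and the "software engineer" keyword
# below are made-up placeholders, not project data.
if __name__ == "__main__":
    async def _demo():
        refiner = LLMJobRefiner()
        raw_data = {
            "page_content": "<html><body><h1>Example Job</h1></body></html>",
            "job_id": "example-123",
            "url": "https://jobs.ashbyhq.com/example/example-123",
            "posted_date": "12/01/25",
        }
        refined = await refiner.refine_job_data(raw_data, target_field="description")
        if refined:
            await refiner.save_job_data(refined, keyword="software engineer")

    asyncio.run(_demo())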