from openai import OpenAI
from typing import Dict, Any
import asyncio
import psycopg2
import os
from datetime import datetime
import json
import re
from bs4 import BeautifulSoup
from dotenv import load_dotenv

# Load environment variables from .env
load_dotenv()


class LLMJobRefiner:
    def __init__(self):
        deepseek_api_key = os.getenv("DEEPSEEK_API_KEY")
        if not deepseek_api_key:
            raise ValueError("DEEPSEEK_API_KEY not found in .env file.")

        # Database credentials from .env
        self.db_url = os.getenv("DB_URL")
        self.db_username = os.getenv("DB_USERNAME")
        self.db_password = os.getenv("DB_PASSWORD")
        self.db_host = os.getenv("DB_HOST")
        self.db_port = os.getenv("DB_PORT")
        if not self.db_url or not self.db_username or not self.db_password or not self.db_host:
            raise ValueError("Database credentials not found in .env file.")

        # DeepSeek exposes an OpenAI-compatible API, so the OpenAI client
        # works with a custom base_url
        self.client = OpenAI(
            api_key=deepseek_api_key,
            base_url="https://api.deepseek.com/v1"
        )
        self.model = "deepseek-chat"
        self._init_db()

    def _init_db(self):
        """Initialize PostgreSQL database connection and create table"""
        try:
            # The same connection parameters work for both Supabase-hosted
            # and self-hosted PostgreSQL, so a single connect call suffices
            conn = psycopg2.connect(
                host=self.db_host,
                port=self.db_port,
                database="postgres",
                user=self.db_username,
                password=self.db_password
            )

            cursor = conn.cursor()

            # Create table if it doesn't exist
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS jobs (
                    id SERIAL PRIMARY KEY,
                    title TEXT,
                    company_name TEXT,
                    location TEXT,
                    description TEXT,
                    requirements TEXT,
                    qualifications TEXT,
                    salary_range TEXT,
                    nature_of_work TEXT,
                    job_id TEXT UNIQUE,
                    url TEXT,
                    category TEXT,
                    scraped_at TIMESTAMP,
                    posted_date TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            ''')

            # Add apply_type column if it doesn't exist
            cursor.execute('''
                ALTER TABLE jobs
                ADD COLUMN IF NOT EXISTS apply_type TEXT DEFAULT 'signup'
            ''')

            # Ensure the uniqueness constraint exists
            cursor.execute('''
                ALTER TABLE jobs DROP CONSTRAINT IF EXISTS jobs_job_id_key;
                ALTER TABLE jobs ADD CONSTRAINT jobs_job_id_key UNIQUE (job_id);
            ''')

            cursor.execute('CREATE INDEX IF NOT EXISTS idx_job_id ON jobs(job_id)')
            cursor.execute('CREATE INDEX IF NOT EXISTS idx_category ON jobs(category)')
            cursor.execute('CREATE INDEX IF NOT EXISTS idx_posted_date ON jobs(posted_date)')
            cursor.execute('CREATE INDEX IF NOT EXISTS idx_apply_type ON jobs(apply_type)')

            conn.commit()
            cursor.close()
            conn.close()
            print("✅ PostgreSQL database initialized successfully")
        except Exception as e:
            print(f"❌ Database initialization error: {e}")
            raise

    def _clean_html_for_llm(self, html_content: str) -> str:
        """Clean HTML to make it more readable for the LLM while preserving key job structure"""
        try:
            soup = BeautifulSoup(html_content, 'html.parser')

            # Remove unwanted elements
            for element in soup(['script', 'style', 'nav', 'footer', 'header',
                                 'aside', 'noscript']):
                element.decompose()

            # Prefer the main content container (platform-specific markup varies)
            main_content = None
            candidates = [
                soup.find('main'),
                soup.find('div', class_=re.compile(r'job|posting|content')),
                soup.find('article'),
                soup.body
            ]
            for candidate in candidates:
                if candidate:
                    main_content = candidate
                    break

            if not main_content:
                main_content = soup.body or soup

            # Extract text while keeping some structure
            lines = []
            for elem in main_content.descendants:
                if isinstance(elem, str):
                    text = elem.strip()
                    if text and len(text) > 5:  # Skip short fragments
                        lines.append(text)
                elif elem.name in ['h1', 'h2', 'h3', 'h4', 'p', 'li', 'strong', 'b']:
                    text = elem.get_text().strip()
                    if text:
                        lines.append(text)

            # Nested tags yield the same text twice (once via the tag branch,
            # once via its string children); dedupe while preserving order
            lines = list(dict.fromkeys(lines))

            # Join with newlines for better LLM parsing
            cleaned = '\n'.join(lines)

            # Limit length for LLM context
            if len(cleaned) > 10000:
                cleaned = cleaned[:10000] + "..."

            return cleaned
        except Exception as e:
            print(f"HTML cleaning error: {e}")
            # Fall back to (truncated) raw HTML if parsing fails
            return html_content[:100000]
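    # Illustrative sketch of what _clean_html_for_llm produces (hypothetical
    # input, not taken from a real posting):
    #
    #   refiner._clean_html_for_llm(
    #       '<main><h1>Senior Engineer</h1><p>Build scrapers daily.</p></main>'
    #   )
    #
    # returns newline-joined text roughly like:
    #
    #   Senior Engineer
    #   Build scrapers daily.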
    def _generate_content_sync(self, prompt: str) -> str:
        """Synchronous call to the DeepSeek API"""
        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.2,
                max_tokens=2048,
                stream=False
            )
            return response.choices[0].message.content or ""
        except Exception as e:
            print(f"DeepSeek API error: {e}")
            return ""

    async def refine_job_data(self, raw_data: Dict[str, Any], target_field: str) -> Dict[str, Any]:
        page_content = raw_data.get('page_content', '')
        cleaned_content = self._clean_html_for_llm(page_content)

        job_id = raw_data.get('job_id', 'unknown')
        url = raw_data.get('url', 'N/A')
        posted_date = raw_data.get('posted_date', datetime.now().strftime("%m/%d/%y"))

        # Detect platform from URL
        platform = "unknown"
        if "ashbyhq.com" in url:
            platform = "ashby"
        elif "lever.co" in url:
            platform = "lever"
        elif "greenhouse.io" in url:
            platform = "greenhouse"

        # Platform-specific instructions
        platform_instructions = ""
        if platform == "ashby":
            platform_instructions = """
            For Ashby jobs:
            - Title is usually in