Web_scraping_project/llm_agent.py


from openai import OpenAI
from typing import Dict, Any
import asyncio
import psycopg2
import os
from datetime import datetime
import json
import re
from bs4 import BeautifulSoup
from dotenv import load_dotenv

# Load environment variables from .env
load_dotenv()


class LLMJobRefiner:
    def __init__(self):
        deepseek_api_key = os.getenv("DEEPSEEK_API_KEY")
        if not deepseek_api_key:
            raise ValueError("DEEPSEEK_API_KEY not found in .env file.")

        # Database credentials from .env
        self.db_url = os.getenv("DB_URL")
        self.db_username = os.getenv("DB_USERNAME")
        self.db_password = os.getenv("DB_PASSWORD")
        self.db_host = os.getenv("DB_HOST")
        self.db_port = os.getenv("DB_PORT")

        if not self.db_url or not self.db_username or not self.db_password:
            raise ValueError("Database credentials not found in .env file.")

        # DeepSeek uses OpenAI-compatible API
        self.client = OpenAI(
            api_key=deepseek_api_key,
            base_url="https://api.deepseek.com/v1"
        )
        self.model = "deepseek-chat"
        self._init_db()

    def _init_db(self):
        """Initialize PostgreSQL database connection and create Quelah Jobs table"""
        try:
            conn = psycopg2.connect(
                host=self.db_host,
                port=self.db_port,
                database="postgres",
                user=self.db_username,
                password=self.db_password
            )
            cursor = conn.cursor()

            # ✅ CREATE NEW TABLE: quelah_jobs (no requirements field)
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS quelah_jobs (
                    id SERIAL PRIMARY KEY,
                    title TEXT,
                    company_name TEXT,
                    location TEXT,
                    description TEXT,
                    qualifications TEXT,
                    salary_range TEXT,
                    nature_of_work TEXT,
                    apply_type TEXT DEFAULT 'signup',
                    job_id TEXT UNIQUE,
                    url TEXT,
                    category TEXT,
                    scraped_at TIMESTAMP,
                    posted_date TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            ''')

            # Ensure uniqueness constraint
            cursor.execute('''
                ALTER TABLE quelah_jobs DROP CONSTRAINT IF EXISTS quelah_jobs_job_id_key;
                ALTER TABLE quelah_jobs ADD CONSTRAINT quelah_jobs_job_id_key UNIQUE (job_id);
            ''')

            # Create indexes
            cursor.execute('CREATE INDEX IF NOT EXISTS idx_quelah_job_id ON quelah_jobs(job_id)')
            cursor.execute('CREATE INDEX IF NOT EXISTS idx_quelah_category ON quelah_jobs(category)')
            cursor.execute('CREATE INDEX IF NOT EXISTS idx_quelah_posted_date ON quelah_jobs(posted_date)')
            cursor.execute('CREATE INDEX IF NOT EXISTS idx_quelah_apply_type ON quelah_jobs(apply_type)')

            conn.commit()
            cursor.close()
            conn.close()
            print("✅ Quelah Jobs table initialized successfully")
        except Exception as e:
            print(f"❌ Database initialization error: {e}")
            raise

    def _clean_html_for_llm(self, html_content: str) -> str:
        """Clean HTML to make it more readable for LLM while preserving key job structure"""
        try:
            soup = BeautifulSoup(html_content, 'html.parser')

            # Remove unwanted elements
            for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside', 'noscript']):
                element.decompose()

            # Keep only main content containers
            main_content = None
            candidates = [
                soup.find('main'),
                soup.find('div', class_=re.compile(r'job|posting|content')),
                soup.find('article'),
                soup.body
            ]

            for candidate in candidates:
                if candidate:
                    main_content = candidate
                    break

            if not main_content:
                main_content = soup.body or soup

            # Extract text with some structure
            lines = []
            for elem in main_content.descendants:
                if isinstance(elem, str):
                    text = elem.strip()
                    if text and len(text) > 5:  # Skip short fragments
                        lines.append(text)
                elif elem.name in ['h1', 'h2', 'h3', 'h4', 'p', 'li', 'strong', 'b']:
                    text = elem.get_text().strip()
                    if text:
                        lines.append(text)

            # Join with newlines for better LLM parsing
            cleaned = '\n'.join(lines)

            # Limit length for LLM context
            if len(cleaned) > 10000:
                cleaned = cleaned[:10000] + "..."

            return cleaned
        except Exception as e:
            print(f"HTML cleaning error: {e}")
            return html_content[:100000] if len(html_content) > 100000 else html_content

    def _generate_content_sync(self, prompt: str) -> str:
        """Synchronous call to DeepSeek API"""
        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.2,
                max_tokens=2048,
                stream=False
            )
            return response.choices[0].message.content or ""
        except Exception as e:
            print(f"DeepSeek API error: {e}")
            return ""

    async def refine_job_data(self, raw_data: Dict[str, Any], target_field: str) -> Dict[str, Any]:
        page_content = raw_data.get('page_content', '')
        cleaned_content = self._clean_html_for_llm(page_content)
        job_id = raw_data.get('job_id', 'unknown')
        url = raw_data.get('url', 'N/A')
        posted_date = raw_data.get('posted_date', "12/01/25")  # ✅ Fixed date

        # Detect platform from URL (for prompt only)
        platform = "unknown"
        if "ashbyhq.com" in url:
            platform = "ashby"
        elif "lever.co" in url:
            platform = "lever"
        elif "greenhouse.io" in url:
            platform = "greenhouse"

        # Platform-specific instructions
        platform_instructions = ""
        if platform == "ashby":
            platform_instructions = """
        For Ashby jobs:
        - Title is usually in <h1> or <h2>
        - Company name is often in <meta> or header
        - Description is in <div class="job-posting"> or <article>
        - Look for sections like "About Us", "What you'll do", "Qualifications", "Benefits"
        - Location may be in <span> near job title or in metadata
        """
        elif platform == "lever":
            platform_instructions = """
        For Lever jobs:
        - Title is in <h1> or <h2>
        - Company name is in <title> or header
        - Description is in <div class="job-description"> or <section>
        - Look for headings like "What you'll do", "What you'll need", "Why join us"
        - Location is often in <div class="location">
        """
        elif platform == "greenhouse":
            platform_instructions = """
        For Greenhouse jobs:
        - Title is in <h1> or <h2>
        - Company name is in <meta> or header
        - Description is in <div class="job-desc"> or <section>
        - Look for headings like "Role overview", "What you'll do", "What you bring"
        - Location is often in <div class="location">
        """

        prompt = f"""
You are an expert job posting parser. Extract information EXACTLY as it appears in the text. DO NOT summarize, paraphrase, or invent.

CRITICAL INSTRUCTIONS:
{platform_instructions}

FIELD RULES:
- description: MUST include ALL role details, responsibilities, and overview. Never "Not provided" if any job description exists.
- qualifications: MUST include ALL required skills, experience, education, and preferred qualifications. Combine them.
- location: Extract city, state, or remote status if available.
- salary_range: Extract if explicitly mentioned (e.g., "$70,000–$85,000").
- nature_of_work: Extract if mentioned (e.g., "Part-time", "Remote", "On-site").

REQUIRED FIELDS (must have valid values, never "N/A"):
- title, company_name, job_id, url, description

OPTIONAL FIELDS (can be "Not provided" if the information is actually not provided):
- location, salary_range, nature_of_work

⚠️ IMPORTANT: Do NOT include or extract a "requirements" field. Focus only on description and qualifications.

Page Content:
{cleaned_content}

Response format (ONLY return this JSON):
{{
    "title": "...",
    "company_name": "...",
    "location": "...",
    "description": "...",
    "qualifications": "...",
    "salary_range": "...",
    "nature_of_work": "...",
    "job_id": "{job_id}",
    "url": "{url}"
}}
"""

        try:
            response_text = await asyncio.get_event_loop().run_in_executor(
                None,
                lambda: self._generate_content_sync(prompt)
            )
            refined_data = self._parse_llm_response(response_text)

            if not refined_data:
                return None

            # Validate required fields
            required_fields = ['title', 'company_name', 'job_id', 'url', 'description']
            for field in required_fields:
                if not refined_data.get(field) or refined_data[field].strip() in ["N/A", "", "Unknown", "Company", "Job"]:
                    return None

            # Add the fixed posted_date
            refined_data['posted_date'] = posted_date

            return refined_data

        except Exception as e:
            print(f"LLM refinement failed: {str(e)}")
            return None

    def _parse_llm_response(self, response_text: str) -> Dict[str, Any]:
        json_match = re.search(r'```(?:json)?\s*({.*?})\s*```', response_text, re.DOTALL)
        if not json_match:
            json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
        if not json_match:
            return None

        try:
            return json.loads(json_match.group(1) if '```' in response_text else json_match.group(0))
        except json.JSONDecodeError:
            return None

    async def save_job_data(self, job_data: Dict[str, Any], keyword: str, platform: str = "quelah"):
        """Save ALL jobs to Quelah Jobs table and markdown"""
        await self._save_to_db_quelah(job_data)
        await self._save_to_markdown_quelah(job_data, keyword)

    async def _save_to_db_quelah(self, job_data: Dict[str, Any]):
        """Save job data to Quelah Jobs table"""
        try:
            conn = psycopg2.connect(
                host=self.db_host,
                port=self.db_port,
                database="postgres",
                user=self.db_username,
                password=self.db_password
            )
            cursor = conn.cursor()

            # Set apply_type if not present
            apply_type = job_data.get("apply_type", "signup")

            cursor.execute('''
                INSERT INTO quelah_jobs
                (title, company_name, location, description, qualifications,
                 salary_range, nature_of_work, apply_type, job_id, url, category, scraped_at, posted_date)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                ON CONFLICT (job_id) DO NOTHING
            ''', (
                job_data.get("title", "N/A"),
                job_data.get("company_name", "N/A"),
                job_data.get("location", "N/A"),
                job_data.get("description", "N/A"),
                job_data.get("qualifications", "N/A"),
                job_data.get("salary_range", "N/A"),
                job_data.get("nature_of_work", "N/A"),
                apply_type,
                job_data.get("job_id", "N/A"),
                job_data.get("url", "N/A"),
                job_data.get("category", "N/A"),
                job_data.get("scraped_at"),
                job_data.get("posted_date", "12/01/25")  # Fixed date
            ))

            conn.commit()
            cursor.close()
            conn.close()

            print(f"  💾 Saved to Quelah Jobs | Job ID: {job_data.get('job_id', 'N/A')}")

        except Exception as e:
            print(f"❌ Database save error: {e}")

    async def _save_to_markdown_quelah(self, job_data: Dict[str, Any], keyword: str):
        os.makedirs("quelah_jobs", exist_ok=True)
        filepath = os.path.join("quelah_jobs", "quelah_jobs.md")
        write_header = not os.path.exists(filepath) or os.path.getsize(filepath) == 0

        with open(filepath, "a", encoding="utf-8") as f:
            if write_header:
                f.write(f"# Quelah Jobs - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
            f.write(f"## Job: {job_data.get('title', 'N/A')}\n\n")
            f.write(f"- *Keyword*: {keyword}\n")
            f.write(f"- *Company*: {job_data.get('company_name', 'N/A')}\n")
            f.write(f"- *Location*: {job_data.get('location', 'N/A')}\n")
            f.write(f"- *Nature of Work*: {job_data.get('nature_of_work', 'N/A')}\n")
            f.write(f"- *Salary Range*: {job_data.get('salary_range', 'N/A')}\n")
            f.write(f"- *Apply Type*: {job_data.get('apply_type', 'signup')}\n")
            f.write(f"- *Job ID*: {job_data.get('job_id', 'N/A')}\n")
            f.write(f"- *Posted Date*: {job_data.get('posted_date', '12/01/25')}\n")  # Fixed date
            f.write(f"- *Category*: {job_data.get('category', 'N/A')}\n")
            f.write(f"- *Scraped At*: {job_data.get('scraped_at', 'N/A')}\n")
            f.write(f"- *URL*: <{job_data.get('url', 'N/A')}>\n\n")
            f.write(f"### Description\n\n{job_data.get('description', 'N/A')}\n\n")
            # ✅ REMOVED requirements section
            f.write(f"### Qualifications\n\n{job_data.get('qualifications', 'N/A')}\n\n")
            f.write("---\n\n")