Web_scraping_project/llm_agent.py
Ofure Ikheloa 4f78a845ae refactor(llm_agent): switch from XAI to DeepSeek API and simplify job refinement
- Replace XAI/Grok integration with DeepSeek's OpenAI-compatible API
- Remove schema generation and caching logic
- Simplify prompt structure and response parsing
- Standardize database schema and markdown output format
- Update config to use DEEPSEEK_API_KEY instead of XAI_API_KEY
- Change default search keyword in linkedin_main.py
2025-12-01 10:25:37 +01:00


from openai import OpenAI
from typing import Any, Dict, Optional
import asyncio
import sqlite3
import os
from datetime import datetime
import json
import re
from dotenv import load_dotenv
# Load environment variables from .env
load_dotenv()

class LLMJobRefiner:
    def __init__(self):
        deepseek_api_key = os.getenv("DEEPSEEK_API_KEY")
        if not deepseek_api_key:
            raise ValueError("DEEPSEEK_API_KEY not found in .env file.")
        # DeepSeek uses an OpenAI-compatible API
        self.client = OpenAI(
            api_key=deepseek_api_key,
            base_url="https://api.deepseek.com/v1"
        )
        self.model = "deepseek-chat"  # or "deepseek-coder" if preferred
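        # Note (added comment): load_dotenv() above expects a .env file in the
        # working directory containing a line such as:
        #   DEEPSEEK_API_KEY=<your-deepseek-key>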

    def _generate_content_sync(self, prompt: str) -> str:
        """Synchronous call to DeepSeek API"""
        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.2,
                max_tokens=2048,
                stream=False
            )
            return response.choices[0].message.content or ""
        except Exception as e:
            print(f"DeepSeek API error: {e}")
            return ""

    async def refine_job_data(self, raw_data: Dict[str, Any], target_field: str) -> Optional[Dict[str, Any]]:
        prompt = f"""
You are a job data extraction assistant. Extract the following fields from the job posting:
- title
- company_name
- location
- description
- requirements
- qualifications
- salary_range
- nature_of_work (remote, onsite, or hybrid)
- job_id

Target Field: {target_field}

Raw Page Content:
{raw_data.get('page_content', '')}

Instructions:
1. Extract only the information relevant to the target field: {target_field}
2. Clean up any formatting issues in the description
3. Standardize the location format (city, state/country)
4. Extract the salary range if mentioned
5. Determine the nature of work from the stated work arrangements
6. Ensure all fields are properly formatted
7. If a field cannot be found, use "N/A"
8. Return ONLY the refined data in JSON format

Response format (only return the JSON):
{{
  "title": "...",
  "company_name": "...",
  "location": "...",
  "description": "...",
  "requirements": "...",
  "qualifications": "...",
  "salary_range": "...",
  "nature_of_work": "...",
  "job_id": "{raw_data.get('job_id', 'unknown')}",
  "url": "{raw_data.get('url', 'N/A')}"
}}
"""
        try:
            # Run the blocking API call in a worker thread so the event loop
            # stays responsive.
            response_text = await asyncio.get_running_loop().run_in_executor(
                None, self._generate_content_sync, prompt
            )
            refined_data = self._parse_llm_response(response_text)
            return refined_data if refined_data else None
        except Exception as e:
            print(f"LLM refinement failed: {e}")
            return None

    def _parse_llm_response(self, response_text: str) -> Optional[Dict[str, Any]]:
        json_match = re.search(r'```(?:json)?\s*({.*?})\s*```', response_text, re.DOTALL)
        if json_match:
            json_str = json_match.group(1)
        else:
            # Fall back to the first bare {...} object in the text. Tracking
            # which pattern matched avoids an IndexError when the text contains
            # backticks but only this fallback pattern matched.
            json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
            if not json_match:
                return None
            json_str = json_match.group(0)
        try:
            return json.loads(json_str)
        except json.JSONDecodeError:
            return None
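    # Illustrative note (added): the parser above accepts either a fenced reply,
    #   ```json
    #   {"title": "...", "company_name": "...", ...}
    #   ```
    # or a bare JSON object embedded in surrounding prose.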

    async def save_job_data(self, job_data: Dict[str, Any], keyword: str):
        await self._save_to_db(job_data)
        await self._save_to_markdown(job_data, keyword)

    async def _save_to_db(self, job_data: Dict[str, Any]):
        db_path = "linkedin_jobs.db"
        os.makedirs(os.path.dirname(db_path) or ".", exist_ok=True)
        with sqlite3.connect(db_path) as conn:
            cursor = conn.cursor()
            # job_id is declared PRIMARY KEY so that INSERT OR IGNORE below
            # actually deduplicates; without a uniqueness constraint,
            # OR IGNORE has nothing to ignore.
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS jobs (
                    title TEXT, company_name TEXT, location TEXT, description TEXT,
                    requirements TEXT, qualifications TEXT, salary_range TEXT,
                    nature_of_work TEXT, job_id TEXT PRIMARY KEY, url TEXT
                )
            ''')
            cursor.execute('''
                INSERT OR IGNORE INTO jobs
                (title, company_name, location, description, requirements,
                 qualifications, salary_range, nature_of_work, job_id, url)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ''', (
                job_data.get("title", "N/A"),
                job_data.get("company_name", "N/A"),
                job_data.get("location", "N/A"),
                job_data.get("description", "N/A"),
                job_data.get("requirements", "N/A"),
                job_data.get("qualifications", "N/A"),
                job_data.get("salary_range", "N/A"),
                job_data.get("nature_of_work", "N/A"),
                job_data.get("job_id", "N/A"),
                job_data.get("url", "N/A")
            ))
            conn.commit()
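    # Note (added comment): sqlite3 calls are synchronous, so the write above
    # briefly blocks the event loop. At this write volume that is usually fine;
    # a heavier pipeline could offload the call with run_in_executor, as the
    # DeepSeek request does.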

    async def _save_to_markdown(self, job_data: Dict[str, Any], keyword: str):
        os.makedirs("linkedin_jobs", exist_ok=True)
        filepath = os.path.join("linkedin_jobs", "linkedin_jobs_scraped.md")
        write_header = not os.path.exists(filepath) or os.path.getsize(filepath) == 0
        with open(filepath, "a", encoding="utf-8") as f:
            if write_header:
                f.write(f"# LinkedIn Jobs - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
            f.write(f"## Job: {job_data.get('title', 'N/A')}\n\n")
            f.write(f"- **Keyword**: {keyword}\n")
            f.write(f"- **Company**: {job_data.get('company_name', 'N/A')}\n")
            f.write(f"- **Location**: {job_data.get('location', 'N/A')}\n")
            f.write(f"- **Nature of Work**: {job_data.get('nature_of_work', 'N/A')}\n")
            f.write(f"- **Salary Range**: {job_data.get('salary_range', 'N/A')}\n")
            f.write(f"- **Job ID**: {job_data.get('job_id', 'N/A')}\n")
            f.write(f"- **URL**: <{job_data.get('url', 'N/A')}>\n\n")
            f.write(f"### Description\n\n{job_data.get('description', 'N/A')}\n\n")
            f.write(f"### Requirements\n\n{job_data.get('requirements', 'N/A')}\n\n")
            f.write(f"### Qualifications\n\n{job_data.get('qualifications', 'N/A')}\n\n")
            f.write("---\n\n")
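
# --- Usage sketch (added; not part of the original module) ---
# A minimal, hedged example of driving the refiner end to end. The raw_data
# payload and the keyword below are hypothetical stand-ins for whatever the
# scraper in linkedin_main.py actually produces.
if __name__ == "__main__":
    async def _demo():
        refiner = LLMJobRefiner()  # raises ValueError if DEEPSEEK_API_KEY is missing
        raw_data = {
            "page_content": "Acme Corp is hiring a Data Engineer in Berlin...",
            "job_id": "demo-123",
            "url": "https://www.linkedin.com/jobs/view/demo-123",
        }
        refined = await refiner.refine_job_data(raw_data, target_field="data engineer")
        if refined:
            await refiner.save_job_data(refined, keyword="data engineer")

    asyncio.run(_demo())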