from openai import OpenAI
from typing import Any, Dict, Optional
import asyncio
import sqlite3
import os
from datetime import datetime
import json
import re
from dotenv import load_dotenv

# Load environment variables from .env
load_dotenv()


class LLMJobRefiner:
    def __init__(self):
        deepseek_api_key = os.getenv("DEEPSEEK_API_KEY")
        if not deepseek_api_key:
            raise ValueError("DEEPSEEK_API_KEY not found in .env file.")

        # DeepSeek exposes an OpenAI-compatible API, so the standard
        # OpenAI client works with a custom base_url.
        self.client = OpenAI(
            api_key=deepseek_api_key,
            base_url="https://api.deepseek.com/v1"
        )
        self.model = "deepseek-chat"  # or "deepseek-coder" if preferred

    def _generate_content_sync(self, prompt: str) -> str:
        """Synchronous call to the DeepSeek API; returns "" on failure."""
        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.2,
                max_tokens=2048,
                stream=False
            )
            return response.choices[0].message.content or ""
        except Exception as e:
            print(f"DeepSeek API error: {e}")
            return ""

    async def refine_job_data(self, raw_data: Dict[str, Any],
                              target_field: str) -> Optional[Dict[str, Any]]:
        prompt = f"""
You are a job data extraction assistant. Extract the following fields from the job posting:
- title
- company_name
- location
- description
- requirements
- qualifications
- salary_range
- nature_of_work (remote, onsite, or hybrid)
- job_id

Target Field: {target_field}

Raw Page Content:
{raw_data.get('page_content', '')}

Instructions:
1. Extract only the information relevant to the target field: {target_field}
2. Clean up any formatting issues in the description
3. Standardize location format (city, state/country)
4. Extract salary range if mentioned
5. Determine nature of work from work arrangements
6. Ensure all fields are properly formatted
7. If a field cannot be found, use "N/A"
8. Return ONLY the refined data in JSON format

Response format (only return the JSON):
{{
    "title": "...",
    "company_name": "...",
    "location": "...",
    "description": "...",
    "requirements": "...",
    "qualifications": "...",
    "salary_range": "...",
    "nature_of_work": "...",
    "job_id": "{raw_data.get('job_id', 'unknown')}",
    "url": "{raw_data.get('url', 'N/A')}"
}}
"""
        try:
            # Run the blocking API call in a worker thread so the event
            # loop stays responsive (asyncio.to_thread requires Python 3.9+).
            response_text = await asyncio.to_thread(self._generate_content_sync, prompt)
            refined_data = self._parse_llm_response(response_text)
            return refined_data if refined_data else None
        except Exception as e:
            print(f"LLM refinement failed: {e}")
            return None

    def _parse_llm_response(self, response_text: str) -> Optional[Dict[str, Any]]:
        # Prefer JSON inside a fenced code block; fall back to the first
        # bare {...} object. Tracking which pattern matched avoids calling
        # group(1) on a match that has no capture group.
        fenced = re.search(r'```(?:json)?\s*({.*?})\s*```', response_text, re.DOTALL)
        if fenced:
            candidate = fenced.group(1)
        else:
            bare = re.search(r'\{.*\}', response_text, re.DOTALL)
            if not bare:
                return None
            candidate = bare.group(0)
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            return None

    async def save_job_data(self, job_data: Dict[str, Any], keyword: str):
        await self._save_to_db(job_data)
        await self._save_to_markdown(job_data, keyword)

    async def _save_to_db(self, job_data: Dict[str, Any]):
        db_path = "linkedin_jobs.db"
        os.makedirs(os.path.dirname(db_path) or ".", exist_ok=True)
        with sqlite3.connect(db_path) as conn:
            cursor = conn.cursor()
            # job_id is UNIQUE so that INSERT OR IGNORE actually skips
            # duplicate postings; without a uniqueness constraint the
            # IGNORE clause never fires.
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS jobs (
                    title TEXT,
                    company_name TEXT,
                    location TEXT,
                    description TEXT,
                    requirements TEXT,
                    qualifications TEXT,
                    salary_range TEXT,
                    nature_of_work TEXT,
                    job_id TEXT UNIQUE,
                    url TEXT
                )
            ''')
            cursor.execute('''
                INSERT OR IGNORE INTO jobs
                    (title, company_name, location, description, requirements,
                     qualifications, salary_range, nature_of_work, job_id, url)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ''', (
                job_data.get("title", "N/A"),
                job_data.get("company_name", "N/A"),
                job_data.get("location", "N/A"),
                job_data.get("description", "N/A"),
                job_data.get("requirements", "N/A"),
                job_data.get("qualifications", "N/A"),
                job_data.get("salary_range", "N/A"),
                job_data.get("nature_of_work", "N/A"),
                job_data.get("job_id", "N/A"),
                job_data.get("url", "N/A")
            ))
            conn.commit()

    async def _save_to_markdown(self, job_data: Dict[str, Any], keyword: str):
        os.makedirs("linkedin_jobs", exist_ok=True)
        filepath = os.path.join("linkedin_jobs", "linkedin_jobs_scraped.md")
        # Only write the top-level header when the file is new or empty.
        write_header = not os.path.exists(filepath) or os.path.getsize(filepath) == 0
        with open(filepath, "a", encoding="utf-8") as f:
            if write_header:
                f.write(f"# LinkedIn Jobs - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
            f.write(f"## Job: {job_data.get('title', 'N/A')}\n\n")
            f.write(f"- **Keyword**: {keyword}\n")
            f.write(f"- **Company**: {job_data.get('company_name', 'N/A')}\n")
            f.write(f"- **Location**: {job_data.get('location', 'N/A')}\n")
            f.write(f"- **Nature of Work**: {job_data.get('nature_of_work', 'N/A')}\n")
            f.write(f"- **Salary Range**: {job_data.get('salary_range', 'N/A')}\n")
            f.write(f"- **Job ID**: {job_data.get('job_id', 'N/A')}\n")
            f.write(f"- **URL**: <{job_data.get('url', 'N/A')}>\n\n")
            f.write(f"### Description\n\n{job_data.get('description', 'N/A')}\n\n")
            f.write(f"### Requirements\n\n{job_data.get('requirements', 'N/A')}\n\n")
            f.write(f"### Qualifications\n\n{job_data.get('qualifications', 'N/A')}\n\n")
            f.write("---\n\n")
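
# A minimal usage sketch. The raw_data payload below is hypothetical: the
# 'page_content', 'job_id', and 'url' values are illustrative stand-ins for
# whatever your scraper actually collects, not part of the class API.
if __name__ == "__main__":
    async def main():
        refiner = LLMJobRefiner()
        raw_data = {
            "page_content": "Senior Data Engineer at Acme Corp, Berlin, Germany ...",
            "job_id": "1234567890",
            "url": "https://www.linkedin.com/jobs/view/1234567890",
        }
        refined = await refiner.refine_job_data(raw_data, target_field="description")
        if refined:
            await refiner.save_job_data(refined, keyword="data engineer")

    asyncio.run(main())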