- Replace XAI/Grok integration with DeepSeek's OpenAI-compatible API
- Remove schema generation and caching logic
- Simplify prompt structure and response parsing
- Standardize database schema and markdown output format
- Update config to use DEEPSEEK_API_KEY instead of XAI_API_KEY
- Change default search keyword in linkedin_main.py
161 lines
6.5 KiB
Python
161 lines
6.5 KiB
Python
import asyncio
import json
import os
import re
import sqlite3
from datetime import datetime
from typing import Any, Dict, Optional

from dotenv import load_dotenv
from openai import OpenAI
|
|
|
|
# Load environment variables from .env
|
|
load_dotenv()
|
|
|
|
class LLMJobRefiner:
    """Refine scraped job postings with DeepSeek and persist the results.

    The refiner sends the raw page content of a job posting to DeepSeek's
    OpenAI-compatible chat endpoint, parses the JSON object found in the
    reply, and stores the refined record in a SQLite database and in a
    Markdown report.
    """

    # JSON object wrapped in a Markdown code fence (optionally tagged "json").
    _FENCED_JSON_RE = re.compile(r'```(?:json)?\s*({.*?})\s*```', re.DOTALL)
    # Widest "{ ... }" span anywhere in the reply; used when no fence matches.
    _BARE_JSON_RE = re.compile(r'\{.*\}', re.DOTALL)

    # Column order shared by the CREATE TABLE and INSERT statements.
    _JOB_COLUMNS = (
        "title",
        "company_name",
        "location",
        "description",
        "requirements",
        "qualifications",
        "salary_range",
        "nature_of_work",
        "job_id",
        "url",
    )
    # job_id values that do not identify a posting ("unknown" is the prompt's
    # fallback, "N/A" is the storage fallback) and so must not be deduplicated.
    _PLACEHOLDER_JOB_IDS = frozenset({"", "N/A", "unknown"})

    def __init__(self):
        """Create the DeepSeek client.

        Raises:
            ValueError: if ``DEEPSEEK_API_KEY`` is not set in the environment.
        """
        deepseek_api_key = os.getenv("DEEPSEEK_API_KEY")
        if not deepseek_api_key:
            raise ValueError("DEEPSEEK_API_KEY not found in .env file.")

        # DeepSeek uses OpenAI-compatible API
        self.client = OpenAI(
            api_key=deepseek_api_key,
            base_url="https://api.deepseek.com/v1"
        )
        self.model = "deepseek-chat"  # or "deepseek-coder" if preferred

    def _generate_content_sync(self, prompt: str) -> str:
        """Send ``prompt`` to the chat completions endpoint (blocking call).

        Returns:
            The text of the first choice, or ``""`` when the reply has no
            content or the request fails. Failures are printed, not raised,
            so callers can treat an empty string as "no answer".
        """
        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.2,
                max_tokens=2048,
                stream=False
            )
            return response.choices[0].message.content or ""
        except Exception as e:
            print(f"DeepSeek API error: {e}")
            return ""

    @staticmethod
    def _build_prompt(raw_data: Dict[str, Any], target_field: str) -> str:
        """Build the extraction prompt for one scraped job posting."""
        return f"""
        You are a job data extraction assistant. Extract the following fields from the job posting:
        - title
        - company_name
        - location
        - description
        - requirements
        - qualifications
        - salary_range
        - nature_of_work (remote, onsite, or hybrid)
        - job_id

        Target Field: {target_field}
        Raw Page Content:
        {raw_data.get('page_content', '')}

        Instructions:
        1. Extract only the information relevant to the target field: {target_field}
        2. Clean up any formatting issues in the description
        3. Standardize location format (city, state/country)
        4. Extract salary range if mentioned
        5. Determine nature of work from work arrangements
        6. Ensure all fields are properly formatted
        7. If a field cannot be found, use "N/A"
        8. Return ONLY the refined data in JSON format

        Response format (only return the JSON):
        {{
          "title": "...",
          "company_name": "...",
          "location": "...",
          "description": "...",
          "requirements": "...",
          "qualifications": "...",
          "salary_range": "...",
          "nature_of_work": "...",
          "job_id": "{raw_data.get('job_id', 'unknown')}",
          "url": "{raw_data.get('url', 'N/A')}"
        }}
        """

    async def refine_job_data(self, raw_data: Dict[str, Any], target_field: str) -> Optional[Dict[str, Any]]:
        """Ask the model to extract structured fields from a scraped posting.

        Args:
            raw_data: Scraped record; the keys ``page_content``, ``job_id``
                and ``url`` are read when present.
            target_field: Field of work the extraction should focus on.

        Returns:
            The parsed JSON object, or ``None`` when the model returned
            nothing usable or the call failed.
        """
        prompt = self._build_prompt(raw_data, target_field)

        try:
            # The client call is blocking, so run it in the default executor
            # to keep the event loop responsive.
            loop = asyncio.get_running_loop()
            response_text = await loop.run_in_executor(
                None, self._generate_content_sync, prompt
            )
            refined_data = self._parse_llm_response(response_text)
            return refined_data if refined_data else None
        except Exception as e:
            print(f"LLM refinement failed: {str(e)}")
            return None

    def _parse_llm_response(self, response_text: str) -> Optional[Dict[str, Any]]:
        """Extract the JSON object from a model reply.

        A fenced code block is tried first; the widest ``{...}`` span in the
        reply is the fallback. The fallback also covers replies that contain
        a code fence which does not wrap a complete object (for example an
        unclosed fence).

        Returns:
            The decoded object, or ``None`` if no candidate decodes to a
            JSON object.
        """
        if not response_text:
            return None

        candidates = []
        fenced = self._FENCED_JSON_RE.search(response_text)
        if fenced:
            candidates.append(fenced.group(1))
        bare = self._BARE_JSON_RE.search(response_text)
        if bare:
            candidates.append(bare.group(0))

        for candidate in candidates:
            try:
                parsed = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            if isinstance(parsed, dict):
                return parsed
        return None

    async def save_job_data(self, job_data: Dict[str, Any], keyword: str):
        """Persist a refined job to the SQLite database and the Markdown report."""
        await self._save_to_db(job_data)
        await self._save_to_markdown(job_data, keyword)

    @staticmethod
    def _to_db_value(value: Any) -> Any:
        """Convert a decoded JSON value into something sqlite3 can bind.

        Scalars pass through unchanged; sequences are joined one item per
        line and any other value (e.g. a nested object) is stored as JSON
        text, because sqlite3 cannot bind lists or dicts.
        """
        if value is None or isinstance(value, (str, int, float, bytes)):
            return value
        if isinstance(value, (list, tuple)):
            return "\n".join(str(item) for item in value)
        return json.dumps(value, ensure_ascii=False, default=str)

    async def _save_to_db(self, job_data: Dict[str, Any]):
        """Insert the job into the ``jobs`` table of ``linkedin_jobs.db``.

        Missing fields are stored as ``"N/A"``. A job whose ``job_id`` is a
        real identifier (not a placeholder) is skipped when a row with that
        id already exists: the table has no UNIQUE constraint, so
        ``INSERT OR IGNORE`` alone never rejects a duplicate.
        """
        db_path = "linkedin_jobs.db"
        os.makedirs(os.path.dirname(db_path) or ".", exist_ok=True)

        row = tuple(
            self._to_db_value(job_data.get(column, "N/A"))
            for column in self._JOB_COLUMNS
        )
        job_id = row[self._JOB_COLUMNS.index("job_id")]
        has_real_id = (
            job_id is not None
            and str(job_id).strip() not in self._PLACEHOLDER_JOB_IDS
        )

        conn = sqlite3.connect(db_path)
        try:
            # The connection context manager commits on success and rolls
            # back on error; it does not close the connection.
            with conn:
                cursor = conn.cursor()
                cursor.execute('''
                    CREATE TABLE IF NOT EXISTS jobs (
                        title TEXT, company_name TEXT, location TEXT, description TEXT,
                        requirements TEXT, qualifications TEXT, salary_range TEXT,
                        nature_of_work TEXT, job_id TEXT, url TEXT
                    )
                ''')
                if has_real_id:
                    cursor.execute(
                        "SELECT 1 FROM jobs WHERE job_id = ? LIMIT 1", (job_id,)
                    )
                    if cursor.fetchone() is not None:
                        return
                cursor.execute('''
                    INSERT OR IGNORE INTO jobs
                    (title, company_name, location, description, requirements,
                     qualifications, salary_range, nature_of_work, job_id, url)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                ''', row)
        finally:
            conn.close()

    async def _save_to_markdown(self, job_data: Dict[str, Any], keyword: str):
        """Append the job to ``linkedin_jobs/linkedin_jobs_scraped.md``.

        A timestamped title is written first when the report file is new or
        empty. Missing fields are rendered as ``N/A``.
        """
        os.makedirs("linkedin_jobs", exist_ok=True)
        filepath = os.path.join("linkedin_jobs", "linkedin_jobs_scraped.md")
        write_header = not os.path.exists(filepath) or os.path.getsize(filepath) == 0

        with open(filepath, "a", encoding="utf-8") as f:
            if write_header:
                f.write(f"# LinkedIn Jobs - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
            f.write(f"## Job: {job_data.get('title', 'N/A')}\n\n")
            f.write(f"- **Keyword**: {keyword}\n")
            f.write(f"- **Company**: {job_data.get('company_name', 'N/A')}\n")
            f.write(f"- **Location**: {job_data.get('location', 'N/A')}\n")
            f.write(f"- **Nature of Work**: {job_data.get('nature_of_work', 'N/A')}\n")
            f.write(f"- **Salary Range**: {job_data.get('salary_range', 'N/A')}\n")
            f.write(f"- **Job ID**: {job_data.get('job_id', 'N/A')}\n")
            f.write(f"- **URL**: <{job_data.get('url', 'N/A')}>\n\n")
            f.write(f"### Description\n\n{job_data.get('description', 'N/A')}\n\n")
            f.write(f"### Requirements\n\n{job_data.get('requirements', 'N/A')}\n\n")
            f.write(f"### Qualifications\n\n{job_data.get('qualifications', 'N/A')}\n\n")
            f.write("---\n\n")