refactor(llm_agent): switch from XAI to DeepSeek API and simplify job refinement
- Replace XAI/Grok integration with DeepSeek's OpenAI-compatible API
- Remove schema generation and caching logic
- Simplify prompt structure and response parsing
- Standardize database schema and markdown output format
- Update config to use DEEPSEEK_API_KEY instead of XAI_API_KEY
- Change default search keyword in linkedin_main.py
parent d7d92ba8bb · commit 4f78a845ae
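The core of the API swap, as a minimal standalone sketch (assumes the openai>=1.x SDK and a DEEPSEEK_API_KEY entry in .env; the model name and base URL are the ones the new code uses below):

    import os
    from dotenv import load_dotenv
    from openai import OpenAI

    load_dotenv()
    # DeepSeek exposes an OpenAI-compatible endpoint, so only base_url and model change.
    client = OpenAI(api_key=os.getenv("DEEPSEEK_API_KEY"), base_url="https://api.deepseek.com/v1")
    resp = client.chat.completions.create(
        model="deepseek-chat",
        messages=[{"role": "user", "content": "ping"}],
        max_tokens=8,
    )
    print(resp.choices[0].message.content)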
@@ -8,9 +8,9 @@ from dotenv import load_dotenv
 load_dotenv()
 
 # LLM Agent Configuration
-GEMINI_API_KEY = os.getenv("XAI_API_KEY")
-if not GEMINI_API_KEY:
-    raise ValueError("XAI_API_KEY environment variable not set in .env file")
+DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")
+if not DEEPSEEK_API_KEY:
+    raise ValueError("DEEPSEEK_API_KEY environment variable not set in .env file")
 
 
 def load_spoof_config():
@@ -21,7 +21,7 @@ async def main():
     scraper = LinkedInJobScraper(engine, human_speed=1.6, user_request="Extract title, company, location, description, requirements, qualifications, nature of job(remote, onsite, hybrid) and salary")
 
     await scraper.scrape_jobs(
-        search_keywords="Web Designer location:New York",
+        search_keywords="Lecturer location:New York",
        credentials={
             "email": os.getenv("SCRAPING_USERNAME"),
             "password": os.getenv("SCRAPING_PASSWORD")
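Both files now read their secrets from .env; a quick sanity-check sketch (hypothetical, not part of this commit; the three variable names are the ones read above and in llm_agent.py):

    import os
    from dotenv import load_dotenv

    load_dotenv()
    required = ["DEEPSEEK_API_KEY", "SCRAPING_USERNAME", "SCRAPING_PASSWORD"]
    missing = [name for name in required if not os.getenv(name)]
    if missing:
        raise SystemExit(f"Missing .env entries: {', '.join(missing)}")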
llm_agent.py (189 changed lines)
@@ -1,6 +1,5 @@
-
 from openai import OpenAI
-from typing import Dict, Any, Optional
+from typing import Dict, Any
 import asyncio
 import sqlite3
 import os
@@ -9,123 +8,91 @@ import json
 import re
 from dotenv import load_dotenv
 
-# ✅ Actually load .env
+# Load environment variables from .env
 load_dotenv()
 
 class LLMJobRefiner:
     def __init__(self):
-        xai_api_key = os.getenv("XAI_API_KEY")
-        if not xai_api_key:
-            raise ValueError("XAI_API_KEY not found in environment variables.")
+        deepseek_api_key = os.getenv("DEEPSEEK_API_KEY")
+        if not deepseek_api_key:
+            raise ValueError("DEEPSEEK_API_KEY not found in .env file.")
 
-        self.client = OpenAI(api_key=xai_api_key, base_url="https://api.x.ai/v1")
-        self.model = "grok-4-latest"
-        self.extraction_schema_cache = {}
+        # DeepSeek uses OpenAI-compatible API
+        self.client = OpenAI(
+            api_key=deepseek_api_key,
+            base_url="https://api.deepseek.com/v1"
+        )
+        self.model = "deepseek-chat"  # or "deepseek-coder" if preferred
 
-    def generate_content(self, prompt: str, system_message: str = "You are a helpful assistant.") -> str:
-        """Synchronous method to call Grok via xAI API."""
+    def _generate_content_sync(self, prompt: str) -> str:
+        """Synchronous call to DeepSeek API"""
         try:
             response = self.client.chat.completions.create(
                 model=self.model,
-                messages=[
-                    {"role": "system", "content": system_message},
-                    {"role": "user", "content": prompt}
-                ],
+                messages=[{"role": "user", "content": prompt}],
                 temperature=0.2,
                 max_tokens=2048,
                 stream=False
             )
             return response.choices[0].message.content or ""
         except Exception as e:
-            print(f"Error in Grok API call: {e}")
+            print(f"DeepSeek API error: {e}")
             return ""
 
-    async def refine_job_data(self, raw_data: Dict[str, Any], user_request: str) -> Optional[Dict[str, Any]]:
-        page_content = raw_data.get('page_content', '')
-        if not page_content:
-            return None
-
-        schema_key = user_request.lower().strip()
-        extraction_schema = self.extraction_schema_cache.get(schema_key)
-        if not extraction_schema:
-            extraction_schema = await self._generate_extraction_schema(user_request)
-            if extraction_schema:
-                self.extraction_schema_cache[schema_key] = extraction_schema
-            else:
-                extraction_schema = self._get_default_schema()
-
+    async def refine_job_data(self, raw_data: Dict[str, Any], target_field: str) -> Dict[str, Any]:
         prompt = f"""
-You are a highly skilled web data extraction assistant. Your task is to analyze the raw HTML content of a job posting page and extract specific information requested by the user.
-The user's request is: "{user_request}"
-The raw HTML content of the page is provided below (limited in size). The content might be noisy or unstructured.
-Your goal is to:
-1. Analyze the HTML structure to identify relevant sections.
-2. Extract the requested information accurately.
-3. Clean up formatting issues.
-4. If a field cannot be found, use "N/A".
-5. Return ONLY the extracted data in a JSON object based on the following schema:
-{json.dumps(extraction_schema, indent=2)}
-Raw Page Content (HTML):
-{page_content[:6000]}
-
-Respond with the JSON object containing the extracted data.
+You are a job data extraction assistant. Extract the following fields from the job posting:
+- title
+- company_name
+- location
+- description
+- requirements
+- qualifications
+- salary_range
+- nature_of_work (remote, onsite, or hybrid)
+- job_id
+
+Target Field: {target_field}
+Raw Page Content:
+{raw_data.get('page_content', '')}
+
+Instructions:
+1. Extract only the information relevant to the target field: {target_field}
+2. Clean up any formatting issues in the description
+3. Standardize location format (city, state/country)
+4. Extract salary range if mentioned
+5. Determine nature of work from work arrangements
+6. Ensure all fields are properly formatted
+7. If a field cannot be found, use "N/A"
+8. Return ONLY the refined data in JSON format
+
+Response format (only return the JSON):
+{{
+    "title": "...",
+    "company_name": "...",
+    "location": "...",
+    "description": "...",
+    "requirements": "...",
+    "qualifications": "...",
+    "salary_range": "...",
+    "nature_of_work": "...",
+    "job_id": "{raw_data.get('job_id', 'unknown')}",
+    "url": "{raw_data.get('url', 'N/A')}"
+}}
 """
 
         try:
-            # ✅ Use self (current instance), NOT a new LLMJobRefiner()
             response_text = await asyncio.get_event_loop().run_in_executor(
                 None,
-                lambda: self.generate_content(prompt)
+                lambda: self._generate_content_sync(prompt)
             )
             refined_data = self._parse_llm_response(response_text)
-            if not refined_data:
-                return None
-
-            refined_data['job_id'] = raw_data.get('job_id', 'unknown')
-            refined_data['url'] = raw_data.get('url', 'N/A')
-            return refined_data
+            return refined_data if refined_data else None
         except Exception as e:
             print(f"LLM refinement failed: {str(e)}")
             return None
 
-    async def _generate_extraction_schema(self, user_request: str) -> Optional[Dict[str, str]]:
-        schema_prompt = f"""
-Based on the user's request: "{user_request}", generate a JSON schema for the data they want to extract from a job posting.
-The schema should be a dictionary where keys are field names (snake_case) and values are short descriptions.
-Include standard fields like title, company_name, location, description, etc., if relevant.
-Respond with only the JSON schema.
-"""
-        try:
-            # ✅ Use self.generate_content, NOT self.model.generate_content
-            schema_text = await asyncio.get_event_loop().run_in_executor(
-                None,
-                lambda: self.generate_content(schema_prompt)
-            )
-            json_match = re.search(r'```(?:json)?\s*({.*?})\s*```', schema_text, re.DOTALL)
-            if not json_match:
-                json_match = re.search(r'\{.*\}', schema_text, re.DOTALL)
-            if not json_match:
-                return None
-
-            json_str = json_match.group(1) if '```' in schema_text else json_match.group(0)
-            return json.loads(json_str)
-        except Exception as e:
-            print(f"Schema generation failed: {str(e)}")
-            return None
-
-    def _get_default_schema(self) -> Dict[str, str]:
-        return {
-            "title": "The job title",
-            "company_name": "The name of the company",
-            "location": "The location of the job",
-            "description": "The full job description",
-            "requirements": "List of job requirements",
-            "qualifications": "List of required qualifications",
-            "salary_range": "The salary range mentioned",
-            "nature_of_work": "Remote, onsite, or hybrid"
-        }
-
-    def _parse_llm_response(self, response_text: str) -> Optional[Dict[str, Any]]:
+    def _parse_llm_response(self, response_text: str) -> Dict[str, Any]:
         json_match = re.search(r'```(?:json)?\s*({.*?})\s*```', response_text, re.DOTALL)
         if not json_match:
             json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
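A hypothetical driver for the refactored refiner (the raw_data keys match what refine_job_data interpolates into its prompt; all values are placeholders):

    import asyncio
    from llm_agent import LLMJobRefiner

    async def demo():
        refiner = LLMJobRefiner()
        raw_data = {
            "page_content": "<html>...job posting markup...</html>",
            "job_id": "123456",
            "url": "https://www.linkedin.com/jobs/view/123456",
        }
        # Returns the parsed JSON dict, or None if the LLM response can't be parsed.
        refined = await refiner.refine_job_data(raw_data, target_field="salary_range")
        print(refined)

    asyncio.run(demo())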
@@ -146,12 +113,30 @@ class LLMJobRefiner:
         os.makedirs(os.path.dirname(db_path) or ".", exist_ok=True)
         with sqlite3.connect(db_path) as conn:
             cursor = conn.cursor()
-            fields = list(job_data.keys())
-            placeholders = ', '.join(['?' for _ in fields])
-            columns = ', '.join([f'"{col}"' for col in fields])  # Escape column names
-            cursor.execute(f"CREATE TABLE IF NOT EXISTS jobs ({columns})")
-            cursor.execute(f'INSERT INTO jobs ({columns}) VALUES ({placeholders})',
-                           [job_data.get(field, 'N/A') for field in fields])
+            cursor.execute('''
+                CREATE TABLE IF NOT EXISTS jobs (
+                    title TEXT, company_name TEXT, location TEXT, description TEXT,
+                    requirements TEXT, qualifications TEXT, salary_range TEXT,
+                    nature_of_work TEXT, job_id TEXT, url TEXT
+                )
+            ''')
+            cursor.execute('''
+                INSERT OR IGNORE INTO jobs
+                (title, company_name, location, description, requirements,
+                 qualifications, salary_range, nature_of_work, job_id, url)
+                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+            ''', (
+                job_data.get("title", "N/A"),
+                job_data.get("company_name", "N/A"),
+                job_data.get("location", "N/A"),
+                job_data.get("description", "N/A"),
+                job_data.get("requirements", "N/A"),
+                job_data.get("qualifications", "N/A"),
+                job_data.get("salary_range", "N/A"),
+                job_data.get("nature_of_work", "N/A"),
+                job_data.get("job_id", "N/A"),
+                job_data.get("url", "N/A")
+            ))
             conn.commit()
 
     async def _save_to_markdown(self, job_data: Dict[str, Any], keyword: str):
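A read-back sketch against the fixed schema (the "jobs.db" path is an assumption; the table and column names come from the CREATE TABLE above). Note that without a PRIMARY KEY or UNIQUE constraint on the table, INSERT OR IGNORE has no conflict to trigger on and behaves like a plain INSERT:

    import sqlite3

    # Placeholder db_path; use whatever path _save_to_db was given.
    with sqlite3.connect("jobs.db") as conn:
        rows = conn.execute(
            "SELECT title, company_name, location, nature_of_work FROM jobs LIMIT 5"
        ).fetchall()
        for row in rows:
            print(row)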
@@ -164,7 +149,13 @@ class LLMJobRefiner:
             f.write(f"# LinkedIn Jobs - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
             f.write(f"## Job: {job_data.get('title', 'N/A')}\n\n")
             f.write(f"- **Keyword**: {keyword}\n")
-            for key, value in job_data.items():
-                if key != 'title':
-                    f.write(f"- **{key.replace('_', ' ').title()}**: {value}\n")
-            f.write("\n---\n\n")
+            f.write(f"- **Company**: {job_data.get('company_name', 'N/A')}\n")
+            f.write(f"- **Location**: {job_data.get('location', 'N/A')}\n")
+            f.write(f"- **Nature of Work**: {job_data.get('nature_of_work', 'N/A')}\n")
+            f.write(f"- **Salary Range**: {job_data.get('salary_range', 'N/A')}\n")
+            f.write(f"- **Job ID**: {job_data.get('job_id', 'N/A')}\n")
+            f.write(f"- **URL**: <{job_data.get('url', 'N/A')}>\n\n")
+            f.write(f"### Description\n\n{job_data.get('description', 'N/A')}\n\n")
+            f.write(f"### Requirements\n\n{job_data.get('requirements', 'N/A')}\n\n")
+            f.write(f"### Qualifications\n\n{job_data.get('qualifications', 'N/A')}\n\n")
+            f.write("---\n\n")
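For reference, one job rendered by the writer above would look roughly like this (all values are placeholders):

    # LinkedIn Jobs - 2024-01-01 00:00:00

    ## Job: Lecturer

    - **Keyword**: Lecturer location:New York
    - **Company**: Example University
    - **Location**: New York, NY
    - **Nature of Work**: hybrid
    - **Salary Range**: N/A
    - **Job ID**: 123456
    - **URL**: <https://www.linkedin.com/jobs/view/123456>

    ### Description
    ...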