diff --git a/config.py b/config.py
index a352318..1365877 100644
--- a/config.py
+++ b/config.py
@@ -8,9 +8,9 @@ from dotenv import load_dotenv
 load_dotenv()
 
 # LLM Agent Configuration
-GEMINI_API_KEY = os.getenv("XAI_API_KEY")
-if not GEMINI_API_KEY:
-    raise ValueError("XAI_API_KEY environment variable not set in .env file")
+DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")
+if not DEEPSEEK_API_KEY:
+    raise ValueError("DEEPSEEK_API_KEY environment variable not set in .env file")
 
 
 def load_spoof_config():
diff --git a/linkedin_main.py b/linkedin_main.py
index 6986fc4..c97f031 100644
--- a/linkedin_main.py
+++ b/linkedin_main.py
@@ -21,7 +21,7 @@ async def main():
     scraper = LinkedInJobScraper(engine, human_speed=1.6, user_request="Extract title, company, location, description, requirements, qualifications, nature of job(remote, onsite, hybrid) and salary")
 
     await scraper.scrape_jobs(
-        search_keywords="Web Designer location:New York",
+        search_keywords="Lecturer location:New York",
         credentials={
             "email": os.getenv("SCRAPING_USERNAME"),
             "password": os.getenv("SCRAPING_PASSWORD")
diff --git a/llm_agent.py b/llm_agent.py
index 818fc0c..e59b3eb 100644
--- a/llm_agent.py
+++ b/llm_agent.py
@@ -1,6 +1,5 @@
-
 from openai import OpenAI
 from typing import Dict, Any, Optional
 import asyncio
 import sqlite3
 import os
@@ -9,123 +8,91 @@ import json
 import re
 from dotenv import load_dotenv
 
-# ✅ Actually load .env
+# Load environment variables from .env
 load_dotenv()
 
 
 class LLMJobRefiner:
     def __init__(self):
-        xai_api_key = os.getenv("XAI_API_KEY")
-        if not xai_api_key:
-            raise ValueError("XAI_API_KEY not found in environment variables.")
+        deepseek_api_key = os.getenv("DEEPSEEK_API_KEY")
+        if not deepseek_api_key:
+            raise ValueError("DEEPSEEK_API_KEY not found in .env file.")
 
-        self.client = OpenAI(api_key=xai_api_key, base_url="https://api.x.ai/v1")
-        self.model = "grok-4-latest"
-        self.extraction_schema_cache = {}
+        # DeepSeek uses an OpenAI-compatible API
+        self.client = OpenAI(
+            api_key=deepseek_api_key,
+            base_url="https://api.deepseek.com/v1"
+        )
+        self.model = "deepseek-chat"  # or "deepseek-coder" if preferred
 
-    def generate_content(self, prompt: str, system_message: str = "You are a helpful assistant.") -> str:
-        """Synchronous method to call Grok via xAI API."""
+    def _generate_content_sync(self, prompt: str) -> str:
+        """Synchronous call to the DeepSeek API."""
         try:
             response = self.client.chat.completions.create(
                 model=self.model,
-                messages=[
-                    {"role": "system", "content": system_message},
-                    {"role": "user", "content": prompt}
-                ],
+                messages=[{"role": "user", "content": prompt}],
                 temperature=0.2,
                 max_tokens=2048,
                 stream=False
             )
             return response.choices[0].message.content or ""
         except Exception as e:
-            print(f"Error in Grok API call: {e}")
+            print(f"DeepSeek API error: {e}")
             return ""
 
-    async def refine_job_data(self, raw_data: Dict[str, Any], user_request: str) -> Optional[Dict[str, Any]]:
-        page_content = raw_data.get('page_content', '')
-        if not page_content:
-            return None
-
-        schema_key = user_request.lower().strip()
-        extraction_schema = self.extraction_schema_cache.get(schema_key)
-        if not extraction_schema:
-            extraction_schema = await self._generate_extraction_schema(user_request)
-            if extraction_schema:
-                self.extraction_schema_cache[schema_key] = extraction_schema
-            else:
-                extraction_schema = self._get_default_schema()
-
+    async def refine_job_data(self, raw_data: Dict[str, Any], target_field: str) -> Optional[Dict[str, Any]]:
         prompt = f"""
-        You are a highly skilled web data extraction assistant. Your task is to analyze the raw HTML content of a job posting page and extract specific information requested by the user.
-        The user's request is: "{user_request}"
-        The raw HTML content of the page is provided below (limited in size). The content might be noisy or unstructured.
-        Your goal is to:
-        1. Analyze the HTML structure to identify relevant sections.
-        2. Extract the requested information accurately.
-        3. Clean up formatting issues.
-        4. If a field cannot be found, use "N/A".
-        5. Return ONLY the extracted data in a JSON object based on the following schema:
-        {json.dumps(extraction_schema, indent=2)}
-        Raw Page Content (HTML):
-        {page_content[:6000]}
+        You are a job data extraction assistant. Extract the following fields from the job posting:
+        - title
+        - company_name
+        - location
+        - description
+        - requirements
+        - qualifications
+        - salary_range
+        - nature_of_work (remote, onsite, or hybrid)
+        - job_id
 
-        Respond with the JSON object containing the extracted data.
+        Target Field: {target_field}
+        Raw Page Content:
+        {raw_data.get('page_content', '')[:6000]}
+
+        Instructions:
+        1. Extract only the information relevant to the target field: {target_field}
+        2. Clean up any formatting issues in the description
+        3. Standardize location format (city, state/country)
+        4. Extract salary range if mentioned
+        5. Determine nature of work from work arrangements
+        6. Ensure all fields are properly formatted
+        7. If a field cannot be found, use "N/A"
+        8. Return ONLY the refined data in JSON format
+
+        Response format (only return the JSON):
+        {{
+            "title": "...",
+            "company_name": "...",
+            "location": "...",
+            "description": "...",
+            "requirements": "...",
+            "qualifications": "...",
+            "salary_range": "...",
+            "nature_of_work": "...",
+            "job_id": "{raw_data.get('job_id', 'unknown')}",
+            "url": "{raw_data.get('url', 'N/A')}"
+        }}
        """
 
        try:
-            # ✅ Use self (current instance), NOT a new LLMJobRefiner()
            response_text = await asyncio.get_event_loop().run_in_executor(
                None,
-                lambda: self.generate_content(prompt)
+                lambda: self._generate_content_sync(prompt)
            )
 
            refined_data = self._parse_llm_response(response_text)
-            if not refined_data:
-                return None
-
-            refined_data['job_id'] = raw_data.get('job_id', 'unknown')
-            refined_data['url'] = raw_data.get('url', 'N/A')
-            return refined_data
+            return refined_data if refined_data else None
        except Exception as e:
            print(f"LLM refinement failed: {str(e)}")
            return None
 
-    async def _generate_extraction_schema(self, user_request: str) -> Optional[Dict[str, str]]:
-        schema_prompt = f"""
-        Based on the user's request: "{user_request}", generate a JSON schema for the data they want to extract from a job posting.
-        The schema should be a dictionary where keys are field names (snake_case) and values are short descriptions.
-        Include standard fields like title, company_name, location, description, etc., if relevant.
-        Respond with only the JSON schema.
- """ - try: - # ✅ Use self.generate_content, NOT self.model.generate_content - schema_text = await asyncio.get_event_loop().run_in_executor( - None, - lambda: self.generate_content(schema_prompt) - ) - json_match = re.search(r'```(?:json)?\s*({.*?})\s*```', schema_text, re.DOTALL) - if not json_match: - json_match = re.search(r'\{.*\}', schema_text, re.DOTALL) - if not json_match: - return None - - json_str = json_match.group(1) if '```' in schema_text else json_match.group(0) - return json.loads(json_str) - except Exception as e: - print(f"Schema generation failed: {str(e)}") - return None - - def _get_default_schema(self) -> Dict[str, str]: - return { - "title": "The job title", - "company_name": "The name of the company", - "location": "The location of the job", - "description": "The full job description", - "requirements": "List of job requirements", - "qualifications": "List of required qualifications", - "salary_range": "The salary range mentioned", - "nature_of_work": "Remote, onsite, or hybrid" - } - - def _parse_llm_response(self, response_text: str) -> Optional[Dict[str, Any]]: + def _parse_llm_response(self, response_text: str) -> Dict[str, Any]: json_match = re.search(r'```(?:json)?\s*({.*?})\s*```', response_text, re.DOTALL) if not json_match: json_match = re.search(r'\{.*\}', response_text, re.DOTALL) @@ -146,12 +113,30 @@ class LLMJobRefiner: os.makedirs(os.path.dirname(db_path) or ".", exist_ok=True) with sqlite3.connect(db_path) as conn: cursor = conn.cursor() - fields = list(job_data.keys()) - placeholders = ', '.join(['?' for _ in fields]) - columns = ', '.join([f'"{col}"' for col in fields]) # Escape column names - cursor.execute(f"CREATE TABLE IF NOT EXISTS jobs ({columns})") - cursor.execute(f'INSERT INTO jobs ({columns}) VALUES ({placeholders})', - [job_data.get(field, 'N/A') for field in fields]) + cursor.execute(''' + CREATE TABLE IF NOT EXISTS jobs ( + title TEXT, company_name TEXT, location TEXT, description TEXT, + requirements TEXT, qualifications TEXT, salary_range TEXT, + nature_of_work TEXT, job_id TEXT, url TEXT + ) + ''') + cursor.execute(''' + INSERT OR IGNORE INTO jobs + (title, company_name, location, description, requirements, + qualifications, salary_range, nature_of_work, job_id, url) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
+            ''', (
+                job_data.get("title", "N/A"),
+                job_data.get("company_name", "N/A"),
+                job_data.get("location", "N/A"),
+                job_data.get("description", "N/A"),
+                job_data.get("requirements", "N/A"),
+                job_data.get("qualifications", "N/A"),
+                job_data.get("salary_range", "N/A"),
+                job_data.get("nature_of_work", "N/A"),
+                job_data.get("job_id", "N/A"),
+                job_data.get("url", "N/A")
+            ))
             conn.commit()
 
     async def _save_to_markdown(self, job_data: Dict[str, Any], keyword: str):
@@ -164,7 +149,13 @@ class LLMJobRefiner:
             f.write(f"# LinkedIn Jobs - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
         f.write(f"## Job: {job_data.get('title', 'N/A')}\n\n")
         f.write(f"- **Keyword**: {keyword}\n")
-        for key, value in job_data.items():
-            if key != 'title':
-                f.write(f"- **{key.replace('_', ' ').title()}**: {value}\n")
-        f.write("\n---\n\n")
+        f.write(f"- **Company**: {job_data.get('company_name', 'N/A')}\n")
+        f.write(f"- **Location**: {job_data.get('location', 'N/A')}\n")
+        f.write(f"- **Nature of Work**: {job_data.get('nature_of_work', 'N/A')}\n")
+        f.write(f"- **Salary Range**: {job_data.get('salary_range', 'N/A')}\n")
+        f.write(f"- **Job ID**: {job_data.get('job_id', 'N/A')}\n")
+        f.write(f"- **URL**: <{job_data.get('url', 'N/A')}>\n\n")
+        f.write(f"### Description\n\n{job_data.get('description', 'N/A')}\n\n")
+        f.write(f"### Requirements\n\n{job_data.get('requirements', 'N/A')}\n\n")
+        f.write(f"### Qualifications\n\n{job_data.get('qualifications', 'N/A')}\n\n")
+        f.write("---\n\n")
\ No newline at end of file
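
For a quick end-to-end check of the new client wiring, here is a minimal driver sketch. It assumes `llm_agent.py` is importable and `DEEPSEEK_API_KEY` is set in `.env`; the `raw_data` values and the `nature_of_work` target field are illustrative stand-ins, not real scraper output.

```python
# Hypothetical driver for the refactored class; LLMJobRefiner comes from
# llm_agent.py above, everything else (job_id, url, markup) is made up.
import asyncio

from llm_agent import LLMJobRefiner

async def main():
    refiner = LLMJobRefiner()  # raises ValueError if DEEPSEEK_API_KEY is absent
    raw_data = {
        "job_id": "4021337",                       # placeholder id
        "url": "https://example.com/jobs/4021337", # placeholder url
        "page_content": "<html>...scraped job posting markup...</html>",
    }
    # refine_job_data returns None when the API call or JSON parsing fails,
    # so callers should branch on the result rather than assume a dict.
    refined = await refiner.refine_job_data(raw_data, target_field="nature_of_work")
    if refined is None:
        print("refinement failed")
    else:
        print(refined.get("nature_of_work", "N/A"))

if __name__ == "__main__":
    asyncio.run(main())
```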
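`_parse_llm_response` tolerates both fenced and bare JSON replies: it tries the fenced-block regex first and falls back to the first brace-delimited span. A standalone check of that fallback logic, using the same two regexes as the method (the `sample` reply text is made up):

```python
import json
import re

# Made-up model output: the JSON arrives wrapped in a ```json fence.
sample = 'Sure, here it is:\n```json\n{"title": "Lecturer", "nature_of_work": "hybrid"}\n```'

# Same two-step match as _parse_llm_response: fenced JSON first, bare object second.
match = re.search(r'```(?:json)?\s*({.*?})\s*```', sample, re.DOTALL)
if not match:
    match = re.search(r'\{.*\}', sample, re.DOTALL)

json_str = match.group(1) if '```' in sample else match.group(0)
print(json.loads(json_str))  # {'title': 'Lecturer', 'nature_of_work': 'hybrid'}
```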
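Note that `INSERT OR IGNORE` only skips duplicates when the table has a uniqueness constraint to conflict with, which is why the `jobs` schema declares `job_id TEXT UNIQUE`; without it every re-scraped posting would be inserted again. A standalone sketch against an in-memory database (table layout mirrors `_save_to_db`; the row values are made up):

```python
import sqlite3

# In-memory stand-in for jobs.db; schema copied from _save_to_db above.
conn = sqlite3.connect(":memory:")
conn.execute("""
    CREATE TABLE IF NOT EXISTS jobs (
        title TEXT, company_name TEXT, location TEXT, description TEXT,
        requirements TEXT, qualifications TEXT, salary_range TEXT,
        nature_of_work TEXT, job_id TEXT UNIQUE, url TEXT
    )
""")

row = ("Lecturer", "Example University", "New York, NY", "N/A", "N/A",
       "N/A", "N/A", "onsite", "4021337", "https://example.com/jobs/4021337")

# Same row twice, e.g. the scraper revisiting a posting; the second
# insert hits the UNIQUE(job_id) constraint and is silently ignored.
for _ in range(2):
    conn.execute("INSERT OR IGNORE INTO jobs VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", row)

print(conn.execute("SELECT COUNT(*) FROM jobs").fetchone()[0])  # prints 1
conn.close()
```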