Compare commits

..

2 Commits

Author SHA1 Message Date
160efadbfb modifications to work with postgre and use llm to extract and refine data 2025-12-05 17:00:43 +01:00
4f78a845ae refactor(llm_agent): switch from XAI to DeepSeek API and simplify job refinement
- Replace XAI/Grok integration with DeepSeek's OpenAI-compatible API
- Remove schema generation and caching logic
- Simplify prompt structure and response parsing
- Standardize database schema and markdown output format
- Update config to use DEEPSEEK_API_KEY instead of XAI_API_KEY
- Change default search keyword in linkedin_main.py
2025-12-01 10:25:37 +01:00
6 changed files with 330 additions and 166 deletions

View File

@@ -8,9 +8,9 @@ from dotenv import load_dotenv
load_dotenv()
# LLM Agent Configuration
GEMINI_API_KEY = os.getenv("XAI_API_KEY")
if not GEMINI_API_KEY:
raise ValueError("XAI_API_KEY environment variable not set in .env file")
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")
if not DEEPSEEK_API_KEY:
raise ValueError("DEEPSEEK_API_KEY environment variable not set in .env file")
def load_spoof_config():
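For a quick pre-flight check, a sketch that validates the new key together with the database variables the llm_agent.py changes below rely on (the DB_* names are taken from that diff; the helper itself is an assumption, not part of the commit):

```python
# Sketch: fail fast at startup if required environment variables are missing.
import os
from dotenv import load_dotenv

load_dotenv()

REQUIRED_VARS = ["DEEPSEEK_API_KEY", "DB_URL", "DB_USERNAME", "DB_PASSWORD", "DB_HOST", "DB_PORT"]

missing = [name for name in REQUIRED_VARS if not os.getenv(name)]
if missing:
    raise ValueError(f"Missing environment variables: {', '.join(missing)}")
```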

View File

@@ -23,11 +23,11 @@ class StealthyFetcher:
print(f"Attempt {attempt + 1} to fetch {url}")
page = await self.context.new_page()
await page.goto(url, wait_until='load', timeout=60000)
await page.goto(url, wait_until='load', timeout=120000)
if wait_for_selector:
try:
await page.wait_for_selector(wait_for_selector, timeout=10000)
await page.wait_for_selector(wait_for_selector, timeout=40000)
except PlaywrightTimeoutError:
print(f"Selector {wait_for_selector} not found immediately, continuing...")
@@ -88,7 +88,7 @@ class StealthyFetcher:
async def _is_content_accessible(self, page: Page, wait_for_selector: Optional[str] = None) -> bool:
if wait_for_selector:
try:
await page.wait_for_selector(wait_for_selector, timeout=5000)
await page.wait_for_selector(wait_for_selector, timeout=40000)
return True
except PlaywrightTimeoutError:
pass
@@ -118,7 +118,7 @@ class StealthyFetcher:
if (time.time() - start_time) > 15 and (time.time() - start_time) % 20 < 2:
print("🔄 Reloading page during Cloudflare wait...")
await page.reload(wait_until='load', timeout=30000)
await page.reload(wait_until='load', timeout=120000)
print("⏰ Timeout waiting for Cloudflare resolution.")
return False
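The hunks above raise several hard-coded Playwright timeouts (navigation and reloads to 120 s, selector waits to 40 s). A sketch, assuming nothing beyond what the diff shows, of keeping them in one place so future tuning touches a single constant:

```python
# Sketch: central timeout settings (milliseconds) instead of scattered literals.
# Values mirror the ones introduced in this commit.
from dataclasses import dataclass

@dataclass(frozen=True)
class FetchTimeouts:
    navigation_ms: int = 120_000   # page.goto / wait_for_load_state
    selector_ms: int = 40_000      # page.wait_for_selector
    reload_ms: int = 120_000       # page.reload during Cloudflare waits

TIMEOUTS = FetchTimeouts()
# Hypothetical usage inside StealthyFetcher:
#   await page.goto(url, wait_until="load", timeout=TIMEOUTS.navigation_ms)
```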

View File

@@ -1,13 +1,12 @@
import asyncio
import random
import sqlite3
import os
from typing import Optional, Dict
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
from browserforge.injectors.playwright import AsyncNewContext
from llm_agent import LLMJobRefiner
import re
from fetcher import StealthyFetcher
from datetime import datetime
class LinkedInJobScraper:
@@ -26,25 +25,8 @@ class LinkedInJobScraper:
self.llm_agent = LLMJobRefiner()
def _init_db(self):
os.makedirs(os.path.dirname(self.db_path) if os.path.dirname(self.db_path) else ".", exist_ok=True)
with sqlite3.connect(self.db_path) as conn:
cursor = conn.cursor()
cursor.execute('''
CREATE TABLE IF NOT EXISTS jobs (
id INTEGER PRIMARY KEY AUTOINCREMENT,
title TEXT,
company_name TEXT,
location TEXT,
description TEXT,
requirements TEXT,
qualifications TEXT,
salary_range TEXT,
nature_of_work TEXT,
job_id TEXT,
url TEXT UNIQUE
)
''')
conn.commit()
# This method is kept for backward compatibility but LLMJobRefiner handles PostgreSQL now
pass
async def _human_click(self, page, element, wait_after: bool = True):
if not element:
@@ -61,7 +43,7 @@ class LinkedInJobScraper:
async def _login(self, page, credentials: Dict) -> bool:
print("🔐 Navigating to LinkedIn login page...")
await page.goto("https://www.linkedin.com/login", timeout=60000)
await page.goto("https://www.linkedin.com/login", timeout=120000)
await asyncio.sleep(random.uniform(2.0, 3.5) * self.human_speed)
email_field = await page.query_selector('input[name="session_key"]')
@@ -104,7 +86,11 @@ class LinkedInJobScraper:
print("❌ Login may have failed.")
return False
async def _extract_all_page_content(self, page) -> str:
async def _extract_page_content_for_llm(self, page) -> str:
"""
Extract raw page content as HTML/text for LLM processing
The LLM will handle all extraction logic, not specific selectors
"""
await asyncio.sleep(2 * self.human_speed)
await self.engine._human_like_scroll(page)
await asyncio.sleep(2 * self.human_speed)
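The rest of the method body is not shown in this hunk. A minimal sketch of what a selector-free extractor along these lines could return, assuming Playwright's page.content() as the source (an assumption, not necessarily the repository's implementation):

```python
# Sketch (assumption): hand the LLM the full rendered DOM rather than
# values picked out with CSS selectors.
from playwright.async_api import Page

async def extract_page_content_for_llm(page: Page) -> str:
    return await page.content()  # full rendered HTML; cleaned later before prompting
```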
@@ -172,7 +158,7 @@ class LinkedInJobScraper:
await self._human_click(page, next_btn)
await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed)
try:
await page.wait_for_function("() => window.location.href.includes('start=')", timeout=60000)
await page.wait_for_function("() => window.location.href.includes('start=')", timeout=120000)
except:
pass
current_page += 1
@@ -247,7 +233,7 @@ class LinkedInJobScraper:
if session_loaded:
print("🔁 Using saved session — verifying login...")
await page.goto("https://www.linkedin.com/feed/", timeout=60000)
await page.goto("https://www.linkedin.com/feed/", timeout=120000)
if "feed" in page.url and "login" not in page.url:
print("✅ Session still valid.")
login_successful = True
@@ -269,7 +255,7 @@ class LinkedInJobScraper:
print(" No credentials — proceeding as guest.")
login_successful = True
await page.wait_for_load_state("load", timeout=60000)
await page.wait_for_load_state("load", timeout=120000)
print("✅ Post-login page fully loaded. Starting search...")
# >>> PROTECTION CHECK USING FETCHER LOGIC <<<
@@ -292,7 +278,7 @@ class LinkedInJobScraper:
print("✅ Protection present but content accessible — proceeding.")
print(f"🔍 Searching for: {search_keywords}")
await page.goto(search_url, wait_until='load', timeout=60000)
await page.goto(search_url, wait_until='load', timeout=120000)
await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed)
# >>> PROTECTION CHECK ON SEARCH PAGE <<<
@@ -322,7 +308,7 @@ class LinkedInJobScraper:
print(f" Found {initial_jobs} initial job(s) (total: {len(all_job_links)})")
iteration = 1
while True:
while True and iteration >= 5:
print(f"🔄 Iteration {iteration}: Checking for new jobs...")
prev_job_count = len(all_job_links)
@@ -355,10 +341,6 @@ class LinkedInJobScraper:
print("🔚 No new jobs found after refresh. Stopping.")
break
if iteration > 10:
print("🔄 Maximum iterations reached. Stopping.")
break
print(f"✅ Collected {len(all_job_links)} unique job links.")
scraped_count = 0
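For reference, a minimal sketch of a collect-until-stable loop with an explicit pass limit; fetch_links here is a hypothetical coroutine standing in for the scraper's link-gathering step:

```python
# Sketch: stop when a pass adds nothing new, or after max_passes refreshes.
async def collect_job_links(fetch_links, max_passes: int = 5) -> set[str]:
    links: set[str] = set()
    for _ in range(max_passes):
        before = len(links)
        links.update(await fetch_links())   # hypothetical async provider
        if len(links) == before:            # no new jobs after this refresh
            break
    return links
```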
@@ -386,8 +368,9 @@ class LinkedInJobScraper:
if apply_btn:
break
page_data = None
final_url = job_page.url
final_url = full_url
external_url = None
page_content = None
if apply_btn:
print(" → Clicking 'Apply' / 'Easy Apply' button...")
@@ -399,44 +382,61 @@ class LinkedInJobScraper:
try:
external_page = await asyncio.wait_for(page_waiter, timeout=5.0)
print(" 🌐 External job site opened in new tab.")
await external_page.wait_for_load_state("load", timeout=60000)
await external_page.wait_for_load_state("load", timeout=120000)
await asyncio.sleep(2 * self.human_speed)
await self.engine._human_like_scroll(external_page)
await asyncio.sleep(2 * self.human_speed)
page_data = await self._extract_all_page_content(external_page)
final_url = external_page.url
# Extract raw content from external page for LLM processing
external_url = external_page.url
final_url = external_url
page_content = await self._extract_page_content_for_llm(external_page)
if not external_page.is_closed():
await external_page.close()
except asyncio.TimeoutError:
print(" 🖥️ No external tab — scraping LinkedIn job page directly.")
await job_page.wait_for_timeout(2000)
await job_page.wait_for_timeout(60000)
try:
await job_page.wait_for_selector("div.jobs-apply-button--fixed, div.jobs-easy-apply-modal", timeout=8000)
await job_page.wait_for_selector("div.jobs-apply-button--fixed, div.jobs-easy-apply-modal", timeout=80000)
except PlaywrightTimeoutError:
pass
await self.engine._human_like_scroll(job_page)
await asyncio.sleep(2 * self.human_speed)
page_data = await self._extract_all_page_content(job_page)
page_content = await self._extract_page_content_for_llm(job_page)
else:
print(" ⚠️ No 'Apply' button found — scraping job details directly.")
await self.engine._human_like_scroll(job_page)
await asyncio.sleep(2 * self.human_speed)
page_data = await self._extract_all_page_content(job_page)
page_content = await self._extract_page_content_for_llm(job_page)
job_id = final_url.split("/")[-2] if "/jobs/view/" in final_url else "unknown"
job_id = full_url.split("/")[-2] if "/jobs/view/" in full_url else "unknown"
raw_data = {
"page_content": page_data,
"url": job_page.url,
"job_id": job_page.url.split("/")[-2] if "/jobs/view/" in job_page.url else "unknown"
"page_content": page_content,
"url": final_url,
"job_id": job_id,
"search_keywords": search_keywords
}
# LLM agent is now fully responsible for extraction and validation
refined_data = await self.llm_agent.refine_job_data(raw_data, self.user_request)
if refined_data and refined_data.get("title", "N/A") != "N/A":
# Ensure compulsory fields are present (fallback if LLM missed them)
compulsory_fields = ['company_name', 'job_id', 'url']
for field in compulsory_fields:
if not refined_data.get(field) or refined_data[field] in ["N/A", "", "Unknown"]:
if field == 'job_id':
refined_data[field] = job_id
elif field == 'url':
refined_data[field] = final_url
elif field == 'company_name':
refined_data[field] = "Unknown Company"
refined_data['scraped_at'] = datetime.now().isoformat()
refined_data['category'] = clean_keywords
await self.llm_agent.save_job_data(refined_data, search_keywords)
scraped_count += 1
print(f" ✅ Scraped and refined: {refined_data['title'][:50]}...")
@@ -455,7 +455,7 @@ class LinkedInJobScraper:
finally:
print(" ↩️ Returning to LinkedIn search results...")
await page.goto(search_url, timeout=60000)
await page.goto(search_url, timeout=120000)
await asyncio.sleep(4 * self.human_speed)
await browser.close()
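The external-tab branch above races a page waiter against a short timeout; the same pattern in isolation, with page_waiter standing for an awaitable created before the 'Apply' click (names are illustrative):

```python
# Sketch: prefer a newly opened tab, fall back to the LinkedIn job page.
import asyncio

async def resolve_post_apply_page(page_waiter, job_page, wait_s: float = 5.0):
    try:
        return await asyncio.wait_for(page_waiter, timeout=wait_s)
    except asyncio.TimeoutError:
        return job_page
```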

View File

@@ -4,6 +4,8 @@ from job_scraper2 import LinkedInJobScraper
import os
from dotenv import load_dotenv
import asyncio
import random
import time
# Load environment variables
load_dotenv()
@@ -11,7 +13,7 @@ load_dotenv()
async def main():
engine = FingerprintScrapingEngine(
seed="job_scraping_123",
seed="job_scraping_12",
target_os="windows",
db_path="job_listings.db",
markdown_path="job_listings.md"
@@ -20,13 +22,50 @@ async def main():
# Initialize scraper with target field
scraper = LinkedInJobScraper(engine, human_speed=1.6, user_request="Extract title, company, location, description, requirements, qualifications, nature of job(remote, onsite, hybrid) and salary")
await scraper.scrape_jobs(
search_keywords="Web Designer location:New York",
credentials={
"email": os.getenv("SCRAPING_USERNAME"),
"password": os.getenv("SCRAPING_PASSWORD")
}
)
# List of job titles to cycle through
job_titles = [
"Software Engineer",
"Data Scientist",
"Product Manager",
"UX Designer",
"DevOps Engineer",
"Machine Learning Engineer",
"Frontend Developer",
"Backend Developer",
"Full Stack Developer",
"Data Analyst"
]
fixed_location = "New York"
# Keep cycling through all job titles
while True:
# Shuffle job titles to randomize order
random.shuffle(job_titles)
for job_title in job_titles:
search_keywords = f"{job_title} location:{fixed_location}"
print(f"\n{'='*60}")
print(f"Starting scrape for: {search_keywords}")
print(f"{'='*60}")
await scraper.scrape_jobs(
search_keywords=search_keywords,
credentials={
"email": os.getenv("SCRAPING_USERNAME"),
"password": os.getenv("SCRAPING_PASSWORD")
}
)
print(f"\n✅ Completed scraping for: {job_title}")
print(f"⏳ Waiting 2 minutes before next job title...")
# Wait 2 minutes before next job title
time.sleep(120)
print(f"\n✅ Completed full cycle of all job titles")
print(f"🔄 Starting new cycle...")
if __name__ == "__main__":
asyncio.run(main())
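Because main() runs inside the asyncio event loop, the two-minute pause between titles could also be awaited rather than slept synchronously; a sketch using asyncio.sleep under that assumption:

```python
# Sketch: non-blocking two-minute pause between job titles.
import asyncio

async def pause_between_titles(minutes: float = 2.0) -> None:
    await asyncio.sleep(minutes * 60)

# Inside main():  await pause_between_titles()  instead of time.sleep(120)
```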

View File

@@ -1,131 +1,219 @@
from openai import OpenAI
from typing import Dict, Any, Optional
from typing import Dict, Any
import asyncio
import sqlite3
import psycopg2
import os
from datetime import datetime
import json
import re
from bs4 import BeautifulSoup
from dotenv import load_dotenv
# ✅ Actually load .env
# Load environment variables from .env
load_dotenv()
class LLMJobRefiner:
def __init__(self):
xai_api_key = os.getenv("XAI_API_KEY")
if not xai_api_key:
raise ValueError("XAI_API_KEY not found in environment variables.")
deepseek_api_key = os.getenv("DEEPSEEK_API_KEY")
if not deepseek_api_key:
raise ValueError("DEEPSEEK_API_KEY not found in .env file.")
self.client = OpenAI(api_key=xai_api_key, base_url="https://api.x.ai/v1")
self.model = "grok-4-latest"
self.extraction_schema_cache = {}
# Database credentials from .env
self.db_url = os.getenv("DB_URL")
self.db_username = os.getenv("DB_USERNAME")
self.db_password = os.getenv("DB_PASSWORD")
self.db_host = os.getenv("DB_HOST")
self.db_port = os.getenv("DB_PORT")
def generate_content(self, prompt: str, system_message: str = "You are a helpful assistant.") -> str:
"""Synchronous method to call Grok via xAI API."""
if not self.db_url or not self.db_username or not self.db_password:
raise ValueError("Database credentials not found in .env file.")
# DeepSeek uses OpenAI-compatible API
self.client = OpenAI(
api_key=deepseek_api_key,
base_url="https://api.deepseek.com/v1"
)
self.model = "deepseek-chat"
self._init_db()
def _init_db(self):
"""Initialize PostgreSQL database connection and create table"""
try:
self.db_url = os.getenv("DB_URL")
if self.db_url and "supabase.com" in self.db_url:
conn = psycopg2.connect(
host=self.db_host,
port=self.db_port,
database="postgres",
user=self.db_username,
password=self.db_password
)
else:
conn = psycopg2.connect(
host=self.db_host,
port=self.db_port,
database="postgres",
user=self.db_username,
password=self.db_password
)
cursor = conn.cursor()
cursor.execute('''
CREATE TABLE IF NOT EXISTS jobs (
id SERIAL PRIMARY KEY,
title TEXT,
company_name TEXT,
location TEXT,
description TEXT,
requirements TEXT,
qualifications TEXT,
salary_range TEXT,
nature_of_work TEXT,
job_id TEXT UNIQUE,
url TEXT,
category TEXT,
scraped_at TIMESTAMP,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
''')
# Ensure the uniqueness constraint exists
cursor.execute('''
ALTER TABLE jobs DROP CONSTRAINT IF EXISTS jobs_job_id_key;
ALTER TABLE jobs ADD CONSTRAINT jobs_job_id_key UNIQUE (job_id);
''')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_job_id ON jobs(job_id)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_category ON jobs(category)')
conn.commit()
cursor.close()
conn.close()
print("✅ PostgreSQL database initialized successfully")
except Exception as e:
print(f"❌ Database initialization error: {e}")
raise
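_init_db and _save_to_db both open a psycopg2 connection from the same environment variables; a sketch of a shared helper under the same settings (the fixed "postgres" database name mirrors the code above):

```python
# Sketch: one connection helper reused by _init_db and _save_to_db.
import os
import psycopg2

def connect_jobs_db():
    return psycopg2.connect(
        host=os.getenv("DB_HOST"),
        port=os.getenv("DB_PORT"),
        database="postgres",
        user=os.getenv("DB_USERNAME"),
        password=os.getenv("DB_PASSWORD"),
    )
```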
def _clean_html_for_llm(self, html_content: str) -> str:
"""Clean HTML to make it more readable for LLM while preserving structure"""
try:
soup = BeautifulSoup(html_content, 'html.parser')
# Remove script and style elements
for script in soup(["script", "style", "nav", "footer", "header"]):
script.decompose()
# Extract text but keep some structure
text = soup.get_text(separator=' ', strip=True)
# Clean up whitespace
text = re.sub(r'\s+', ' ', text)
# Limit length for LLM context
if len(text) > 10000:
text = text[:10000] + "..."
return text
except Exception as e:
print(f"HTML cleaning error: {e}")
# Fallback to raw content if cleaning fails
return html_content[:100000] if len(html_content) > 100000 else html_content
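A self-contained illustration of the same cleaning steps on a made-up snippet:

```python
# Illustration only: strip script/style/nav, then flatten whitespace.
import re
from bs4 import BeautifulSoup

html = "<html><nav>menu</nav><body><h1>Data  Analyst</h1><script>x()</script><p>Remote role</p></body></html>"
soup = BeautifulSoup(html, "html.parser")
for tag in soup(["script", "style", "nav", "footer", "header"]):
    tag.decompose()
text = re.sub(r"\s+", " ", soup.get_text(separator=" ", strip=True))
print(text)  # -> "Data Analyst Remote role"
```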
def _generate_content_sync(self, prompt: str) -> str:
"""Synchronous call to DeepSeek API"""
try:
response = self.client.chat.completions.create(
model=self.model,
messages=[
{"role": "system", "content": system_message},
{"role": "user", "content": prompt}
],
messages=[{"role": "user", "content": prompt}],
temperature=0.2,
max_tokens=2048,
stream=False
)
return response.choices[0].message.content or ""
except Exception as e:
print(f"Error in Grok API call: {e}")
print(f"DeepSeek API error: {e}")
return ""
async def refine_job_data(self, raw_data: Dict[str, Any], user_request: str) -> Optional[Dict[str, Any]]:
async def refine_job_data(self, raw_data: Dict[str, Any], target_field: str) -> Dict[str, Any]:
# Clean the raw HTML content for better LLM processing
page_content = raw_data.get('page_content', '')
if not page_content:
return None
cleaned_content = self._clean_html_for_llm(page_content)
schema_key = user_request.lower().strip()
extraction_schema = self.extraction_schema_cache.get(schema_key)
if not extraction_schema:
extraction_schema = await self._generate_extraction_schema(user_request)
if extraction_schema:
self.extraction_schema_cache[schema_key] = extraction_schema
else:
extraction_schema = self._get_default_schema()
# Get job_id and url from raw data
job_id = raw_data.get('job_id', 'unknown')
url = raw_data.get('url', 'N/A')
prompt = f"""
You are a highly skilled web data extraction assistant. Your task is to analyze the raw HTML content of a job posting page and extract specific information requested by the user.
The user's request is: "{user_request}"
The raw HTML content of the page is provided below (limited in size). The content might be noisy or unstructured.
Your goal is to:
1. Analyze the HTML structure to identify relevant sections.
2. Extract the requested information accurately.
3. Clean up formatting issues.
4. If a field cannot be found, use "N/A".
5. Return ONLY the extracted data in a JSON object based on the following schema:
{json.dumps(extraction_schema, indent=2)}
Raw Page Content (HTML):
{page_content[:6000]}
You are a job posting data extractor with two modes:
Respond with the JSON object containing the extracted data.
PRIMARY MODE (PREFERRED):
- Extract EXACT text as it appears on the page for all fields
- DO NOT summarize, paraphrase, or interpret
- Copy verbatim content including original wording and formatting
FALLBACK MODE (ONLY IF FIELD IS MISSING):
- If a field is NOT explicitly stated anywhere in the content, you MAY infer it using clear contextual clues
- Inference rules:
* company_name: Look for patterns like "at [Company]", "Join [Company]", "[Company] is hiring"
* nature_of_work: Look for "remote", "onsite", "hybrid", "work from home", "office-based"
* location: Extract city/state/country mentions near job title or details
* title: Use the largest/primary heading if no explicit "job title" label exists
REQUIRED FIELDS (must always have a value):
- title: Exact job title or best inference
- company_name: Exact company name or best inference
- job_id: Use provided: {job_id}
- url: Use provided: {url}
OPTIONAL FIELDS (use exact text or "N/A" if not present and not inferable):
- location
- description
- requirements
- qualifications
- salary_range
- nature_of_work
Page Content:
{cleaned_content}
Response format (ONLY return this JSON):
{{
"title": "...",
"company_name": "...",
"location": "...",
"description": "...",
"requirements": "...",
"qualifications": "...",
"salary_range": "...",
"nature_of_work": "...",
"job_id": "{job_id}",
"url": "{url}"
}}
"""
try:
# ✅ Use self (current instance), NOT a new LLMJobRefiner()
response_text = await asyncio.get_event_loop().run_in_executor(
None,
lambda: self.generate_content(prompt)
lambda: self._generate_content_sync(prompt)
)
refined_data = self._parse_llm_response(response_text)
if not refined_data:
return None
refined_data['job_id'] = raw_data.get('job_id', 'unknown')
refined_data['url'] = raw_data.get('url', 'N/A')
return refined_data
# Final validation - ensure required fields are present and meaningful
if refined_data:
required_fields = ['title', 'company_name', 'job_id', 'url']
for field in required_fields:
if not refined_data.get(field) or refined_data[field] in ["N/A", "", "Unknown", "Company", "Job"]:
return None # LLM failed to extract properly
return refined_data
return None
except Exception as e:
print(f"LLM refinement failed: {str(e)}")
return None
async def _generate_extraction_schema(self, user_request: str) -> Optional[Dict[str, str]]:
schema_prompt = f"""
Based on the user's request: "{user_request}", generate a JSON schema for the data they want to extract from a job posting.
The schema should be a dictionary where keys are field names (snake_case) and values are short descriptions.
Include standard fields like title, company_name, location, description, etc., if relevant.
Respond with only the JSON schema.
"""
try:
# ✅ Use self.generate_content, NOT self.model.generate_content
schema_text = await asyncio.get_event_loop().run_in_executor(
None,
lambda: self.generate_content(schema_prompt)
)
json_match = re.search(r'```(?:json)?\s*({.*?})\s*```', schema_text, re.DOTALL)
if not json_match:
json_match = re.search(r'\{.*\}', schema_text, re.DOTALL)
if not json_match:
return None
json_str = json_match.group(1) if '```' in schema_text else json_match.group(0)
return json.loads(json_str)
except Exception as e:
print(f"Schema generation failed: {str(e)}")
return None
def _get_default_schema(self) -> Dict[str, str]:
return {
"title": "The job title",
"company_name": "The name of the company",
"location": "The location of the job",
"description": "The full job description",
"requirements": "List of job requirements",
"qualifications": "List of required qualifications",
"salary_range": "The salary range mentioned",
"nature_of_work": "Remote, onsite, or hybrid"
}
def _parse_llm_response(self, response_text: str) -> Optional[Dict[str, Any]]:
def _parse_llm_response(self, response_text: str) -> Dict[str, Any]:
json_match = re.search(r'```(?:json)?\s*({.*?})\s*```', response_text, re.DOTALL)
if not json_match:
json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
@@ -142,17 +230,46 @@ class LLMJobRefiner:
await self._save_to_markdown(job_data, keyword)
async def _save_to_db(self, job_data: Dict[str, Any]):
db_path = "linkedin_jobs.db"
os.makedirs(os.path.dirname(db_path) or ".", exist_ok=True)
with sqlite3.connect(db_path) as conn:
"""Save job data to PostgreSQL database with job_id uniqueness"""
try:
conn = psycopg2.connect(
host=self.db_host,
port=self.db_port,
database="postgres",
user=self.db_username,
password=self.db_password
)
cursor = conn.cursor()
fields = list(job_data.keys())
placeholders = ', '.join(['?' for _ in fields])
columns = ', '.join([f'"{col}"' for col in fields]) # Escape column names
cursor.execute(f"CREATE TABLE IF NOT EXISTS jobs ({columns})")
cursor.execute(f'INSERT INTO jobs ({columns}) VALUES ({placeholders})',
[job_data.get(field, 'N/A') for field in fields])
cursor.execute('''
INSERT INTO jobs
(title, company_name, location, description, requirements,
qualifications, salary_range, nature_of_work, job_id, url, category, scraped_at)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (job_id) DO NOTHING
''', (
job_data.get("title", "N/A"),
job_data.get("company_name", "N/A"),
job_data.get("location", "N/A"),
job_data.get("description", "N/A"),
job_data.get("requirements", "N/A"),
job_data.get("qualifications", "N/A"),
job_data.get("salary_range", "N/A"),
job_data.get("nature_of_work", "N/A"),
job_data.get("job_id", "N/A"),
job_data.get("url", "N/A"),
job_data.get("category", "N/A"),
job_data.get("scraped_at")
))
conn.commit()
cursor.close()
conn.close()
print(f" 💾 Saved job to category '{job_data.get('category', 'N/A')}' with job_id: {job_data.get('job_id', 'N/A')}")
except Exception as e:
print(f"❌ Database save error: {e}")
async def _save_to_markdown(self, job_data: Dict[str, Any], keyword: str):
os.makedirs("linkedin_jobs", exist_ok=True)
@@ -164,7 +281,15 @@ class LLMJobRefiner:
f.write(f"# LinkedIn Jobs - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
f.write(f"## Job: {job_data.get('title', 'N/A')}\n\n")
f.write(f"- **Keyword**: {keyword}\n")
for key, value in job_data.items():
if key != 'title':
f.write(f"- **{key.replace('_', ' ').title()}**: {value}\n")
f.write("\n---\n\n")
f.write(f"- **Company**: {job_data.get('company_name', 'N/A')}\n")
f.write(f"- **Location**: {job_data.get('location', 'N/A')}\n")
f.write(f"- **Nature of Work**: {job_data.get('nature_of_work', 'N/A')}\n")
f.write(f"- **Salary Range**: {job_data.get('salary_range', 'N/A')}\n")
f.write(f"- **Job ID**: {job_data.get('job_id', 'N/A')}\n")
f.write(f"- **Category**: {job_data.get('category', 'N/A')}\n")
f.write(f"- **Scraped At**: {job_data.get('scraped_at', 'N/A')}\n")
f.write(f"- **URL**: <{job_data.get('url', 'N/A')}>\n\n")
f.write(f"### Description\n\n{job_data.get('description', 'N/A')}\n\n")
f.write(f"### Requirements\n\n{job_data.get('requirements', 'N/A')}\n\n")
f.write(f"### Qualifications\n\n{job_data.get('qualifications', 'N/A')}\n\n")
f.write("---\n\n")

View File

@@ -69,7 +69,7 @@ class FingerprintScrapingEngine:
self.optimization_params = {
"base_delay": 2.0,
"max_concurrent_requests": 4,
"request_timeout": 60000,
"request_timeout": 120000,
"retry_attempts": 3,
"captcha_handling_strategy": "avoid", # or "solve_fallback"
"cloudflare_wait_strategy": "smart_wait", # or "aggressive_reload"
@@ -155,7 +155,7 @@ class FingerprintScrapingEngine:
# Increase timeout if avg response time is high
if avg_rt > 20:
self.optimization_params["request_timeout"] = 90000 # 90 seconds
self.optimization_params["request_timeout"] = 150000 # 90 seconds
print(f"Optimization Params Updated: {self.optimization_params}")
@@ -371,7 +371,7 @@ class FingerprintScrapingEngine:
# Reload occasionally to trigger potential client-side checks
if (time.time() - start_time) > 15 and (time.time() - start_time) % 20 < 2:
print("Reloading page during Cloudflare wait...")
await page.reload(wait_until='load', timeout=30000)
await page.reload(wait_until='load', timeout=80000)
print("Timeout waiting for Cloudflare resolution.")
return False
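The reload condition above triggers inside a two-second window roughly every 20 seconds once 15 seconds have elapsed; the same wait-and-reload pattern in isolation, with is_clear as a hypothetical async predicate:

```python
# Sketch: wait for a challenge to clear, reloading roughly every 20 s.
import asyncio
import time

async def wait_out_challenge(page, is_clear, max_wait_s: float = 120.0) -> bool:
    start = time.time()
    while (elapsed := time.time() - start) < max_wait_s:
        if await is_clear(page):              # hypothetical async predicate
            return True
        if elapsed > 15 and elapsed % 20 < 2:
            await page.reload(wait_until="load", timeout=80_000)
        await asyncio.sleep(2)
    return False
```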