Compare commits


No commits in common. "160efadbfb1ad06cd955a0ab8389d7e853722f03" and "d7d92ba8bbab4ef66bc33e246891805f729128d7" have entirely different histories.

6 changed files with 166 additions and 330 deletions

View File

@@ -8,9 +8,9 @@ from dotenv import load_dotenv
load_dotenv()
# LLM Agent Configuration
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")
if not DEEPSEEK_API_KEY:
raise ValueError("DEEPSEEK_API_KEY environment variable not set in .env file")
GEMINI_API_KEY = os.getenv("XAI_API_KEY")
if not GEMINI_API_KEY:
raise ValueError("XAI_API_KEY environment variable not set in .env file")
def load_spoof_config():
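The replacement hunk above appears to read XAI_API_KEY into a variable still named GEMINI_API_KEY, which is easy to misread. A minimal sketch of a generic guard that keeps the variable name and the error message in sync (require_env is illustrative, not part of this change):

import os
from dotenv import load_dotenv

load_dotenv()

def require_env(name: str) -> str:
    # Fail fast with a clear message when a key is missing from .env
    value = os.getenv(name)
    if not value:
        raise ValueError(f"{name} environment variable not set in .env file")
    return value

XAI_API_KEY = require_env("XAI_API_KEY")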

View File

@@ -23,11 +23,11 @@ class StealthyFetcher:
print(f"Attempt {attempt + 1} to fetch {url}")
page = await self.context.new_page()
await page.goto(url, wait_until='load', timeout=120000)
await page.goto(url, wait_until='load', timeout=60000)
if wait_for_selector:
try:
await page.wait_for_selector(wait_for_selector, timeout=40000)
await page.wait_for_selector(wait_for_selector, timeout=10000)
except PlaywrightTimeoutError:
print(f"Selector {wait_for_selector} not found immediately, continuing...")
@@ -88,7 +88,7 @@ class StealthyFetcher:
async def _is_content_accessible(self, page: Page, wait_for_selector: Optional[str] = None) -> bool:
if wait_for_selector:
try:
await page.wait_for_selector(wait_for_selector, timeout=40000)
await page.wait_for_selector(wait_for_selector, timeout=5000)
return True
except PlaywrightTimeoutError:
pass
@@ -118,7 +118,7 @@ class StealthyFetcher:
if (time.time() - start_time) > 15 and (time.time() - start_time) % 20 < 2:
print("🔄 Reloading page during Cloudflare wait...")
await page.reload(wait_until='load', timeout=120000)
await page.reload(wait_until='load', timeout=30000)
print("⏰ Timeout waiting for Cloudflare resolution.")
return False
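The fetcher hunks above mainly tighten Playwright timeouts: 60 s navigation, 5-10 s selector waits, 30 s reloads. A standalone sketch of the retry-plus-shorter-timeout pattern, assuming the same 60 s budget (goto_with_retry is illustrative, not in the diff):

from playwright.async_api import Page, TimeoutError as PlaywrightTimeoutError

async def goto_with_retry(page: Page, url: str, attempts: int = 3) -> bool:
    # Retry navigation with the tightened 60 s budget instead of the old 120 s one.
    for attempt in range(attempts):
        try:
            await page.goto(url, wait_until="load", timeout=60000)
            return True
        except PlaywrightTimeoutError:
            print(f"Attempt {attempt + 1} timed out for {url}")
    return False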

View File

@@ -1,12 +1,13 @@
import asyncio
import random
import sqlite3
import os
from typing import Optional, Dict
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
from browserforge.injectors.playwright import AsyncNewContext
from llm_agent import LLMJobRefiner
import re
from fetcher import StealthyFetcher
from datetime import datetime
class LinkedInJobScraper:
@@ -25,8 +26,25 @@ class LinkedInJobScraper:
self.llm_agent = LLMJobRefiner()
def _init_db(self):
# This method is kept for backward compatibility but LLMJobRefiner handles PostgreSQL now
pass
os.makedirs(os.path.dirname(self.db_path) if os.path.dirname(self.db_path) else ".", exist_ok=True)
with sqlite3.connect(self.db_path) as conn:
cursor = conn.cursor()
cursor.execute('''
CREATE TABLE IF NOT EXISTS jobs (
id INTEGER PRIMARY KEY AUTOINCREMENT,
title TEXT,
company_name TEXT,
location TEXT,
description TEXT,
requirements TEXT,
qualifications TEXT,
salary_range TEXT,
nature_of_work TEXT,
job_id TEXT,
url TEXT UNIQUE
)
''')
conn.commit()
async def _human_click(self, page, element, wait_after: bool = True):
if not element:
@@ -43,7 +61,7 @@ class LinkedInJobScraper:
async def _login(self, page, credentials: Dict) -> bool:
print("🔐 Navigating to LinkedIn login page...")
await page.goto("https://www.linkedin.com/login", timeout=120000)
await page.goto("https://www.linkedin.com/login", timeout=60000)
await asyncio.sleep(random.uniform(2.0, 3.5) * self.human_speed)
email_field = await page.query_selector('input[name="session_key"]')
@@ -86,11 +104,7 @@ class LinkedInJobScraper:
print("❌ Login may have failed.")
return False
async def _extract_page_content_for_llm(self, page) -> str:
"""
Extract raw page content as HTML/text for LLM processing
The LLM will handle all extraction logic, not specific selectors
"""
async def _extract_all_page_content(self, page) -> str:
await asyncio.sleep(2 * self.human_speed)
await self.engine._human_like_scroll(page)
await asyncio.sleep(2 * self.human_speed)
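The renamed _extract_all_page_content above is truncated by the hunk. A standalone restatement of the visible flow, where the final return via page.content() is an assumption since the diff does not show the end of the method:

import asyncio

async def extract_all_page_content(engine, page, human_speed: float) -> str:
    # Settle, scroll to trigger lazy-loaded sections, settle again.
    await asyncio.sleep(2 * human_speed)
    await engine._human_like_scroll(page)
    await asyncio.sleep(2 * human_speed)
    return await page.content()  # assumption: the method hands the full rendered HTML to the LLM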
@@ -158,7 +172,7 @@ class LinkedInJobScraper:
await self._human_click(page, next_btn)
await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed)
try:
await page.wait_for_function("() => window.location.href.includes('start=')", timeout=120000)
await page.wait_for_function("() => window.location.href.includes('start=')", timeout=60000)
except:
pass
current_page += 1
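The pagination wait above now gives the "start=" URL change 60 s instead of 120 s and swallows the timeout. A self-contained restatement of that pattern (wait_for_next_results_page is illustrative):

async def wait_for_next_results_page(page) -> bool:
    # Wait for the paginated URL (start=25, start=50, ...) and fall back gracefully on timeout.
    try:
        await page.wait_for_function(
            "() => window.location.href.includes('start=')", timeout=60000
        )
        return True
    except Exception:
        return False  # keep scraping the current page rather than aborting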
@@ -233,7 +247,7 @@ class LinkedInJobScraper:
if session_loaded:
print("🔁 Using saved session — verifying login...")
await page.goto("https://www.linkedin.com/feed/", timeout=120000)
await page.goto("https://www.linkedin.com/feed/", timeout=60000)
if "feed" in page.url and "login" not in page.url:
print("✅ Session still valid.")
login_successful = True
@@ -255,7 +269,7 @@ class LinkedInJobScraper:
print(" No credentials — proceeding as guest.")
login_successful = True
await page.wait_for_load_state("load", timeout=120000)
await page.wait_for_load_state("load", timeout=60000)
print("✅ Post-login page fully loaded. Starting search...")
# >>> PROTECTION CHECK USING FETCHER LOGIC <<<
@@ -278,7 +292,7 @@ class LinkedInJobScraper:
print("✅ Protection present but content accessible — proceeding.")
print(f"🔍 Searching for: {search_keywords}")
await page.goto(search_url, wait_until='load', timeout=120000)
await page.goto(search_url, wait_until='load', timeout=60000)
await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed)
# >>> PROTECTION CHECK ON SEARCH PAGE <<<
@@ -308,7 +322,7 @@ class LinkedInJobScraper:
print(f" Found {initial_jobs} initial job(s) (total: {len(all_job_links)})")
iteration = 1
while True and iteration >= 5:
while True:
print(f"🔄 Iteration {iteration}: Checking for new jobs...")
prev_job_count = len(all_job_links)
@@ -341,6 +355,10 @@ class LinkedInJobScraper:
print("🔚 No new jobs found after refresh. Stopping.")
break
if iteration > 10:
print("🔄 Maximum iterations reached. Stopping.")
break
print(f"✅ Collected {len(all_job_links)} unique job links.")
scraped_count = 0
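The old guard "while True and iteration >= 5" could never loop, because iteration starts at 1; the new code loops unconditionally and adds an explicit cap after 10 iterations. The same control flow, restated as a self-contained sketch (collect_links_bounded and batches are illustrative stand-ins for the live page scrape):

def collect_links_bounded(batches, max_iterations: int = 10) -> set:
    # batches: an iterable of link lists, one per refresh of the results page.
    all_job_links: set = set()
    iteration = 1
    for batch in batches:
        before = len(all_job_links)
        all_job_links.update(batch)
        if len(all_job_links) == before:
            break  # no new jobs found after refresh
        if iteration > max_iterations:
            break  # maximum iterations reached, as in the new guard above
        iteration += 1
    return all_job_links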
@@ -368,9 +386,8 @@ class LinkedInJobScraper:
if apply_btn:
break
final_url = full_url
external_url = None
page_content = None
page_data = None
final_url = job_page.url
if apply_btn:
print(" → Clicking 'Apply' / 'Easy Apply' button...")
@@ -382,61 +399,44 @@ class LinkedInJobScraper:
try:
external_page = await asyncio.wait_for(page_waiter, timeout=5.0)
print(" 🌐 External job site opened in new tab.")
await external_page.wait_for_load_state("load", timeout=120000)
await external_page.wait_for_load_state("load", timeout=60000)
await asyncio.sleep(2 * self.human_speed)
await self.engine._human_like_scroll(external_page)
await asyncio.sleep(2 * self.human_speed)
# Extract raw content from external page for LLM processing
external_url = external_page.url
final_url = external_url
page_content = await self._extract_page_content_for_llm(external_page)
page_data = await self._extract_all_page_content(external_page)
final_url = external_page.url
if not external_page.is_closed():
await external_page.close()
except asyncio.TimeoutError:
print(" 🖥️ No external tab — scraping LinkedIn job page directly.")
await job_page.wait_for_timeout(60000)
await job_page.wait_for_timeout(2000)
try:
await job_page.wait_for_selector("div.jobs-apply-button--fixed, div.jobs-easy-apply-modal", timeout=80000)
await job_page.wait_for_selector("div.jobs-apply-button--fixed, div.jobs-easy-apply-modal", timeout=8000)
except PlaywrightTimeoutError:
pass
await self.engine._human_like_scroll(job_page)
await asyncio.sleep(2 * self.human_speed)
page_content = await self._extract_page_content_for_llm(job_page)
page_data = await self._extract_all_page_content(job_page)
else:
print(" ⚠️ No 'Apply' button found — scraping job details directly.")
await self.engine._human_like_scroll(job_page)
await asyncio.sleep(2 * self.human_speed)
page_content = await self._extract_page_content_for_llm(job_page)
page_data = await self._extract_all_page_content(job_page)
job_id = full_url.split("/")[-2] if "/jobs/view/" in full_url else "unknown"
job_id = final_url.split("/")[-2] if "/jobs/view/" in final_url else "unknown"
raw_data = {
"page_content": page_content,
"url": final_url,
"job_id": job_id,
"search_keywords": search_keywords
"page_content": page_data,
"url": job_page.url,
"job_id": job_page.url.split("/")[-2] if "/jobs/view/" in job_page.url else "unknown"
}
# LLM agent is now fully responsible for extraction and validation
refined_data = await self.llm_agent.refine_job_data(raw_data, self.user_request)
if refined_data and refined_data.get("title", "N/A") != "N/A":
# Ensure compulsory fields are present (fallback if LLM missed them)
compulsory_fields = ['company_name', 'job_id', 'url']
for field in compulsory_fields:
if not refined_data.get(field) or refined_data[field] in ["N/A", "", "Unknown"]:
if field == 'job_id':
refined_data[field] = job_id
elif field == 'url':
refined_data[field] = final_url
elif field == 'company_name':
refined_data[field] = "Unknown Company"
refined_data['scraped_at'] = datetime.now().isoformat()
refined_data['category'] = clean_keywords
await self.llm_agent.save_job_data(refined_data, search_keywords)
scraped_count += 1
print(f" ✅ Scraped and refined: {refined_data['title'][:50]}...")
@@ -455,7 +455,7 @@ class LinkedInJobScraper:
finally:
print(" ↩️ Returning to LinkedIn search results...")
await page.goto(search_url, timeout=120000)
await page.goto(search_url, timeout=60000)
await asyncio.sleep(4 * self.human_speed)
await browser.close()

View File

@@ -4,8 +4,6 @@ from job_scraper2 import LinkedInJobScraper
import os
from dotenv import load_dotenv
import asyncio
import random
import time
# Load environment variables
load_dotenv()
@@ -13,7 +11,7 @@ load_dotenv()
async def main():
engine = FingerprintScrapingEngine(
seed="job_scraping_12",
seed="job_scraping_123",
target_os="windows",
db_path="job_listings.db",
markdown_path="job_listings.md"
@@ -22,50 +20,13 @@ async def main():
# Initialize scraper with target field
scraper = LinkedInJobScraper(engine, human_speed=1.6, user_request="Extract title, company, location, description, requirements, qualifications, nature of job(remote, onsite, hybrid) and salary")
# List of job titles to cycle through
job_titles = [
"Software Engineer",
"Data Scientist",
"Product Manager",
"UX Designer",
"DevOps Engineer",
"Machine Learning Engineer",
"Frontend Developer",
"Backend Developer",
"Full Stack Developer",
"Data Analyst"
]
fixed_location = "New York"
# Keep cycling through all job titles
while True:
# Shuffle job titles to randomize order
random.shuffle(job_titles)
for job_title in job_titles:
search_keywords = f"{job_title} location:{fixed_location}"
print(f"\n{'='*60}")
print(f"Starting scrape for: {search_keywords}")
print(f"{'='*60}")
await scraper.scrape_jobs(
search_keywords=search_keywords,
credentials={
"email": os.getenv("SCRAPING_USERNAME"),
"password": os.getenv("SCRAPING_PASSWORD")
}
)
print(f"\n✅ Completed scraping for: {job_title}")
print(f"⏳ Waiting 2 minutes before next job title...")
# Wait 2 minutes before next job title
time.sleep(120)
print(f"\n✅ Completed full cycle of all job titles")
print(f"🔄 Starting new cycle...")
await scraper.scrape_jobs(
search_keywords="Web Designer location:New York",
credentials={
"email": os.getenv("SCRAPING_USERNAME"),
"password": os.getenv("SCRAPING_PASSWORD")
}
)
if __name__ == "__main__":
asyncio.run(main())
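The rewritten main() above drops the multi-title cycling loop, which had called time.sleep(120) inside an async coroutine and therefore blocked the event loop. If that cycling is ever restored, a non-blocking sketch could look like this (cycle_titles is illustrative, not part of the change):

import asyncio
import os
import random

async def cycle_titles(scraper, job_titles, location: str = "New York"):
    # Non-blocking variant of the removed loop: await asyncio.sleep instead of time.sleep(120).
    while True:
        random.shuffle(job_titles)
        for title in job_titles:
            await scraper.scrape_jobs(
                search_keywords=f"{title} location:{location}",
                credentials={
                    "email": os.getenv("SCRAPING_USERNAME"),
                    "password": os.getenv("SCRAPING_PASSWORD"),
                },
            )
            await asyncio.sleep(120)  # pause between titles without stalling the event loop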

View File

@@ -1,219 +1,131 @@
from openai import OpenAI
from typing import Dict, Any
from typing import Dict, Any, Optional
import asyncio
import psycopg2
import sqlite3
import os
from datetime import datetime
import json
import re
from bs4 import BeautifulSoup
from dotenv import load_dotenv
# Load environment variables from .env
# ✅ Actually load .env
load_dotenv()
class LLMJobRefiner:
def __init__(self):
deepseek_api_key = os.getenv("DEEPSEEK_API_KEY")
if not deepseek_api_key:
raise ValueError("DEEPSEEK_API_KEY not found in .env file.")
xai_api_key = os.getenv("XAI_API_KEY")
if not xai_api_key:
raise ValueError("XAI_API_KEY not found in environment variables.")
# Database credentials from .env
self.db_url = os.getenv("DB_URL")
self.db_username = os.getenv("DB_USERNAME")
self.db_password = os.getenv("DB_PASSWORD")
self.db_host = os.getenv("DB_HOST")
self.db_port = os.getenv("DB_PORT")
self.client = OpenAI(api_key=xai_api_key, base_url="https://api.x.ai/v1")
self.model = "grok-4-latest"
self.extraction_schema_cache = {}
if not self.db_url or not self.db_username or not self.db_password:
raise ValueError("Database credentials not found in .env file.")
# DeepSeek uses OpenAI-compatible API
self.client = OpenAI(
api_key=deepseek_api_key,
base_url="https://api.deepseek.com/v1"
)
self.model = "deepseek-chat"
self._init_db()
def _init_db(self):
"""Initialize PostgreSQL database connection and create table"""
try:
self.db_url = os.getenv("DB_URL")
if self.db_url and "supabase.com" in self.db_url:
conn = psycopg2.connect(
host=self.db_host,
port=self.db_port,
database="postgres",
user=self.db_username,
password=self.db_password
)
else:
conn = psycopg2.connect(
host=self.db_host,
port=self.db_port,
database="postgres",
user=self.db_username,
password=self.db_password
)
cursor = conn.cursor()
cursor.execute('''
CREATE TABLE IF NOT EXISTS jobs (
id SERIAL PRIMARY KEY,
title TEXT,
company_name TEXT,
location TEXT,
description TEXT,
requirements TEXT,
qualifications TEXT,
salary_range TEXT,
nature_of_work TEXT,
job_id TEXT UNIQUE,
url TEXT,
category TEXT,
scraped_at TIMESTAMP,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
''')
# Ensure the uniqueness constraint exists
cursor.execute('''
ALTER TABLE jobs DROP CONSTRAINT IF EXISTS jobs_job_id_key;
ALTER TABLE jobs ADD CONSTRAINT jobs_job_id_key UNIQUE (job_id);
''')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_job_id ON jobs(job_id)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_category ON jobs(category)')
conn.commit()
cursor.close()
conn.close()
print("✅ PostgreSQL database initialized successfully")
except Exception as e:
print(f"❌ Database initialization error: {e}")
raise
def _clean_html_for_llm(self, html_content: str) -> str:
"""Clean HTML to make it more readable for LLM while preserving structure"""
try:
soup = BeautifulSoup(html_content, 'html.parser')
# Remove script and style elements
for script in soup(["script", "style", "nav", "footer", "header"]):
script.decompose()
# Extract text but keep some structure
text = soup.get_text(separator=' ', strip=True)
# Clean up whitespace
text = re.sub(r'\s+', ' ', text)
# Limit length for LLM context
if len(text) > 10000:
text = text[:10000] + "..."
return text
except Exception as e:
print(f"HTML cleaning error: {e}")
# Fallback to raw content if cleaning fails
return html_content[:100000] if len(html_content) > 100000 else html_content
def _generate_content_sync(self, prompt: str) -> str:
"""Synchronous call to DeepSeek API"""
def generate_content(self, prompt: str, system_message: str = "You are a helpful assistant.") -> str:
"""Synchronous method to call Grok via xAI API."""
try:
response = self.client.chat.completions.create(
model=self.model,
messages=[{"role": "user", "content": prompt}],
messages=[
{"role": "system", "content": system_message},
{"role": "user", "content": prompt}
],
temperature=0.2,
max_tokens=2048,
stream=False
)
return response.choices[0].message.content or ""
except Exception as e:
print(f"DeepSeek API error: {e}")
print(f"Error in Grok API call: {e}")
return ""
async def refine_job_data(self, raw_data: Dict[str, Any], target_field: str) -> Dict[str, Any]:
# Clean the raw HTML content for better LLM processing
async def refine_job_data(self, raw_data: Dict[str, Any], user_request: str) -> Optional[Dict[str, Any]]:
page_content = raw_data.get('page_content', '')
cleaned_content = self._clean_html_for_llm(page_content)
if not page_content:
return None
# Get job_id and url from raw data
job_id = raw_data.get('job_id', 'unknown')
url = raw_data.get('url', 'N/A')
schema_key = user_request.lower().strip()
extraction_schema = self.extraction_schema_cache.get(schema_key)
if not extraction_schema:
extraction_schema = await self._generate_extraction_schema(user_request)
if extraction_schema:
self.extraction_schema_cache[schema_key] = extraction_schema
else:
extraction_schema = self._get_default_schema()
prompt = f"""
You are a job posting data extractor with two modes:
You are a highly skilled web data extraction assistant. Your task is to analyze the raw HTML content of a job posting page and extract specific information requested by the user.
The user's request is: "{user_request}"
The raw HTML content of the page is provided below (limited in size). The content might be noisy or unstructured.
Your goal is to:
1. Analyze the HTML structure to identify relevant sections.
2. Extract the requested information accurately.
3. Clean up formatting issues.
4. If a field cannot be found, use "N/A".
5. Return ONLY the extracted data in a JSON object based on the following schema:
{json.dumps(extraction_schema, indent=2)}
Raw Page Content (HTML):
{page_content[:6000]}
PRIMARY MODE (PREFERRED):
- Extract EXACT text as it appears on the page for all fields
- DO NOT summarize, paraphrase, or interpret
- Copy verbatim content including original wording and formatting
FALLBACK MODE (ONLY IF FIELD IS MISSING):
- If a field is NOT explicitly stated anywhere in the content, you MAY infer it using clear contextual clues
- Inference rules:
* company_name: Look for patterns like "at [Company]", "Join [Company]", "[Company] is hiring"
* nature_of_work: Look for "remote", "onsite", "hybrid", "work from home", "office-based"
* location: Extract city/state/country mentions near job title or details
* title: Use the largest/primary heading if no explicit "job title" label exists
REQUIRED FIELDS (must always have a value):
- title: Exact job title or best inference
- company_name: Exact company name or best inference
- job_id: Use provided: {job_id}
- url: Use provided: {url}
OPTIONAL FIELDS (use exact text or "N/A" if not present and not inferable):
- location
- description
- requirements
- qualifications
- salary_range
- nature_of_work
Page Content:
{cleaned_content}
Response format (ONLY return this JSON):
{{
"title": "...",
"company_name": "...",
"location": "...",
"description": "...",
"requirements": "...",
"qualifications": "...",
"salary_range": "...",
"nature_of_work": "...",
"job_id": "{job_id}",
"url": "{url}"
}}
Respond with the JSON object containing the extracted data.
"""
try:
# ✅ Use self (current instance), NOT a new LLMJobRefiner()
response_text = await asyncio.get_event_loop().run_in_executor(
None,
lambda: self._generate_content_sync(prompt)
lambda: self.generate_content(prompt)
)
refined_data = self._parse_llm_response(response_text)
if not refined_data:
return None
# Final validation - ensure required fields are present and meaningful
if refined_data:
required_fields = ['title', 'company_name', 'job_id', 'url']
for field in required_fields:
if not refined_data.get(field) or refined_data[field] in ["N/A", "", "Unknown", "Company", "Job"]:
return None # LLM failed to extract properly
return refined_data
return None
refined_data['job_id'] = raw_data.get('job_id', 'unknown')
refined_data['url'] = raw_data.get('url', 'N/A')
return refined_data
except Exception as e:
print(f"LLM refinement failed: {str(e)}")
return None
def _parse_llm_response(self, response_text: str) -> Dict[str, Any]:
async def _generate_extraction_schema(self, user_request: str) -> Optional[Dict[str, str]]:
schema_prompt = f"""
Based on the user's request: "{user_request}", generate a JSON schema for the data they want to extract from a job posting.
The schema should be a dictionary where keys are field names (snake_case) and values are short descriptions.
Include standard fields like title, company_name, location, description, etc., if relevant.
Respond with only the JSON schema.
"""
try:
# ✅ Use self.generate_content, NOT self.model.generate_content
schema_text = await asyncio.get_event_loop().run_in_executor(
None,
lambda: self.generate_content(schema_prompt)
)
json_match = re.search(r'```(?:json)?\s*({.*?})\s*```', schema_text, re.DOTALL)
if not json_match:
json_match = re.search(r'\{.*\}', schema_text, re.DOTALL)
if not json_match:
return None
json_str = json_match.group(1) if '```' in schema_text else json_match.group(0)
return json.loads(json_str)
except Exception as e:
print(f"Schema generation failed: {str(e)}")
return None
def _get_default_schema(self) -> Dict[str, str]:
return {
"title": "The job title",
"company_name": "The name of the company",
"location": "The location of the job",
"description": "The full job description",
"requirements": "List of job requirements",
"qualifications": "List of required qualifications",
"salary_range": "The salary range mentioned",
"nature_of_work": "Remote, onsite, or hybrid"
}
def _parse_llm_response(self, response_text: str) -> Optional[Dict[str, Any]]:
json_match = re.search(r'```(?:json)?\s*({.*?})\s*```', response_text, re.DOTALL)
if not json_match:
json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
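The hunk cuts _parse_llm_response off after the regex search. Judging by the identical pattern in _generate_extraction_schema above, the remainder presumably loads the matched JSON; a standalone sketch of that completion (parse_llm_json is a stand-in, not the repository's code):

import json
import re

def parse_llm_json(response_text: str):
    # Prefer a fenced ```json block; fall back to the first bare {...} object.
    fenced = re.search(r'```(?:json)?\s*({.*?})\s*```', response_text, re.DOTALL)
    match = fenced or re.search(r'\{.*\}', response_text, re.DOTALL)
    if not match:
        return None
    payload = match.group(1) if fenced else match.group(0)
    try:
        return json.loads(payload)
    except json.JSONDecodeError:
        return None  # malformed JSON from the model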
@@ -230,46 +142,17 @@ class LLMJobRefiner:
await self._save_to_markdown(job_data, keyword)
async def _save_to_db(self, job_data: Dict[str, Any]):
"""Save job data to PostgreSQL database with job_id uniqueness"""
try:
conn = psycopg2.connect(
host=self.db_host,
port=self.db_port,
database="postgres",
user=self.db_username,
password=self.db_password
)
db_path = "linkedin_jobs.db"
os.makedirs(os.path.dirname(db_path) or ".", exist_ok=True)
with sqlite3.connect(db_path) as conn:
cursor = conn.cursor()
cursor.execute('''
INSERT INTO jobs
(title, company_name, location, description, requirements,
qualifications, salary_range, nature_of_work, job_id, url, category, scraped_at)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (job_id) DO NOTHING
''', (
job_data.get("title", "N/A"),
job_data.get("company_name", "N/A"),
job_data.get("location", "N/A"),
job_data.get("description", "N/A"),
job_data.get("requirements", "N/A"),
job_data.get("qualifications", "N/A"),
job_data.get("salary_range", "N/A"),
job_data.get("nature_of_work", "N/A"),
job_data.get("job_id", "N/A"),
job_data.get("url", "N/A"),
job_data.get("category", "N/A"),
job_data.get("scraped_at")
))
fields = list(job_data.keys())
placeholders = ', '.join(['?' for _ in fields])
columns = ', '.join([f'"{col}"' for col in fields]) # Escape column names
cursor.execute(f"CREATE TABLE IF NOT EXISTS jobs ({columns})")
cursor.execute(f'INSERT INTO jobs ({columns}) VALUES ({placeholders})',
[job_data.get(field, 'N/A') for field in fields])
conn.commit()
cursor.close()
conn.close()
print(f" 💾 Saved job to category '{job_data.get('category', 'N/A')}' with job_id: {job_data.get('job_id', 'N/A')}")
except Exception as e:
print(f"❌ Database save error: {e}")
async def _save_to_markdown(self, job_data: Dict[str, Any], keyword: str):
os.makedirs("linkedin_jobs", exist_ok=True)
@@ -281,15 +164,7 @@ class LLMJobRefiner:
f.write(f"# LinkedIn Jobs - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
f.write(f"## Job: {job_data.get('title', 'N/A')}\n\n")
f.write(f"- **Keyword**: {keyword}\n")
f.write(f"- **Company**: {job_data.get('company_name', 'N/A')}\n")
f.write(f"- **Location**: {job_data.get('location', 'N/A')}\n")
f.write(f"- **Nature of Work**: {job_data.get('nature_of_work', 'N/A')}\n")
f.write(f"- **Salary Range**: {job_data.get('salary_range', 'N/A')}\n")
f.write(f"- **Job ID**: {job_data.get('job_id', 'N/A')}\n")
f.write(f"- **Category**: {job_data.get('category', 'N/A')}\n")
f.write(f"- **Scraped At**: {job_data.get('scraped_at', 'N/A')}\n")
f.write(f"- **URL**: <{job_data.get('url', 'N/A')}>\n\n")
f.write(f"### Description\n\n{job_data.get('description', 'N/A')}\n\n")
f.write(f"### Requirements\n\n{job_data.get('requirements', 'N/A')}\n\n")
f.write(f"### Qualifications\n\n{job_data.get('qualifications', 'N/A')}\n\n")
f.write("---\n\n")
for key, value in job_data.items():
if key != 'title':
f.write(f"- **{key.replace('_', ' ').title()}**: {value}\n")
f.write("\n---\n\n")

View File

@@ -69,7 +69,7 @@ class FingerprintScrapingEngine:
self.optimization_params = {
"base_delay": 2.0,
"max_concurrent_requests": 4,
"request_timeout": 120000,
"request_timeout": 60000,
"retry_attempts": 3,
"captcha_handling_strategy": "avoid", # or "solve_fallback"
"cloudflare_wait_strategy": "smart_wait", # or "aggressive_reload"
@@ -155,7 +155,7 @@ class FingerprintScrapingEngine:
# Increase timeout if avg response time is high
if avg_rt > 20:
self.optimization_params["request_timeout"] = 150000 # 90 seconds
self.optimization_params["request_timeout"] = 90000 # 90 seconds
print(f"Optimization Params Updated: {self.optimization_params}")
@@ -371,7 +371,7 @@ class FingerprintScrapingEngine:
# Reload occasionally to trigger potential client-side checks
if (time.time() - start_time) > 15 and (time.time() - start_time) % 20 < 2:
print("Reloading page during Cloudflare wait...")
await page.reload(wait_until='load', timeout=80000)
await page.reload(wait_until='load', timeout=30000)
print("Timeout waiting for Cloudflare resolution.")
return False
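The engine's Cloudflare smart-wait mirrors the fetcher change: periodic reloads now use a 30 s timeout. A standalone sketch of that loop, with an assumed title-based challenge check since the engine's own detection logic is not shown in the diff:

import time

async def wait_out_cloudflare(page, max_wait: float = 120.0) -> bool:
    # Poll until the challenge clears, reloading roughly every 20 s after the first 15 s.
    start_time = time.time()
    while time.time() - start_time < max_wait:
        title = (await page.title()).lower()
        if "just a moment" not in title and "cloudflare" not in title:  # assumed check
            return True
        elapsed = time.time() - start_time
        if elapsed > 15 and elapsed % 20 < 2:
            print("Reloading page during Cloudflare wait...")
            await page.reload(wait_until="load", timeout=30000)
        await page.wait_for_timeout(2000)
    print("Timeout waiting for Cloudflare resolution.")
    return False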