Increase timeout for selector waits and refine job extraction logic in LLMJobRefiner and CryptoJobScraper
parent 38ef08c734
commit 06f8e8b086
@@ -27,7 +27,7 @@ class StealthyFetcher:
        if wait_for_selector:
            try:
-                await page.wait_for_selector(wait_for_selector, timeout=40000)
+                await page.wait_for_selector(wait_for_selector, timeout=120000)
            except PlaywrightTimeoutError:
                print(f"Selector {wait_for_selector} not found immediately, continuing...")
@@ -88,7 +88,7 @@ class StealthyFetcher:
    async def _is_content_accessible(self, page: Page, wait_for_selector: Optional[str] = None) -> bool:
        if wait_for_selector:
            try:
-                await page.wait_for_selector(wait_for_selector, timeout=40000)
+                await page.wait_for_selector(wait_for_selector, timeout=120000)
                return True
            except PlaywrightTimeoutError:
                pass
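For context on the timeout bump above, here is a minimal standalone sketch of the same wait-and-continue pattern with Playwright's async API; the URL and selector below are placeholders, not part of the commit.

```python
import asyncio
from typing import Optional

from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError


async def wait_for_selector_soft(page, selector: Optional[str], timeout_ms: int = 120000) -> bool:
    """Return True if the selector appeared in time, False on timeout (never raises)."""
    if not selector:
        return True
    try:
        await page.wait_for_selector(selector, timeout=timeout_ms)
        return True
    except PlaywrightTimeoutError:
        print(f"Selector {selector} not found within {timeout_ms} ms, continuing...")
        return False


async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        await page.goto("https://example.com")  # placeholder URL
        print("content accessible:", await wait_for_selector_soft(page, "h1"))
        await browser.close()


asyncio.run(main())
```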
llm_agent.py
@@ -21,13 +21,12 @@ class LLMJobRefiner:
            raise ValueError("DEEPSEEK_API_KEY not found in .env file.")

        # Database credentials from .env
-        self.db_url = os.getenv("DB_URL")
        self.db_username = os.getenv("DB_USERNAME")
        self.db_password = os.getenv("DB_PASSWORD")
        self.db_host = os.getenv("DB_HOST")
        self.db_port = os.getenv("DB_PORT")

-        if not self.db_url or not self.db_username or not self.db_password:
+        if not self.db_username or not self.db_password:
            raise ValueError("Database credentials not found in .env file.")

        # DeepSeek uses OpenAI-compatible API
@@ -41,22 +40,12 @@ class LLMJobRefiner:
    def _init_db(self):
        """Initialize PostgreSQL database connection and create table"""
        try:
-            self.db_url = os.getenv("DB_URL")
-            if self.db_url and "supabase.com" in self.db_url:
-                conn = psycopg2.connect(
-                    host=self.db_host,
-                    port=self.db_port,
-                    database="postgres",
-                    user=self.db_username,
-                    password=self.db_password
-                )
-            else:
-                conn = psycopg2.connect(
-                    host=self.db_host,
-                    port=self.db_port,
-                    database="postgres",
-                    user=self.db_username,
-                    password=self.db_password
+            conn = psycopg2.connect(
+                host=self.db_host,
+                port=self.db_port,
+                database="postgres",
+                user=self.db_username,
+                password=self.db_password
            )
            cursor = conn.cursor()

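A minimal sketch of the simplified connection path, assuming the same .env variable names shown in the hunk (DB_HOST, DB_PORT, DB_USERNAME, DB_PASSWORD) and that python-dotenv is installed:

```python
import os

import psycopg2
from dotenv import load_dotenv

load_dotenv()


def connect():
    # Single connection path; the old supabase.com special case built an
    # identical connection anyway, so one branch is enough.
    return psycopg2.connect(
        host=os.getenv("DB_HOST"),
        port=os.getenv("DB_PORT"),
        database="postgres",
        user=os.getenv("DB_USERNAME"),
        password=os.getenv("DB_PASSWORD"),
    )


if __name__ == "__main__":
    conn = connect()
    with conn.cursor() as cur:
        cur.execute("SELECT 1")
        print(cur.fetchone())
    conn.close()
```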
@@ -113,8 +102,8 @@ class LLMJobRefiner:
            text = re.sub(r'\s+', ' ', text)

            # Limit length for LLM context
-            if len(text) > 10000:
-                text = text[:10000] + "..."
+            if len(text) > 100000:
+                text = text[:100000] + "..."

            return text
        except Exception as e:
@@ -128,7 +117,7 @@ class LLMJobRefiner:
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
-            temperature=0.2,
+            temperature=0.1,
            max_tokens=2048,
            stream=False
        )
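The request above goes through an OpenAI-compatible client. A hedged sketch of the same request shape, assuming DeepSeek's documented base URL and the deepseek-chat model name (verify both against the provider docs before relying on them):

```python
import os

from openai import OpenAI

# Assumed endpoint and model name; check DeepSeek's current documentation.
client = OpenAI(api_key=os.getenv("DEEPSEEK_API_KEY"), base_url="https://api.deepseek.com")

response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[{"role": "user", "content": "Reply with the single word OK."}],
    temperature=0.1,   # matches the lower temperature introduced in this commit
    max_tokens=2048,
    stream=False,
)
print(response.choices[0].message.content)
```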
@@ -145,38 +134,52 @@ class LLMJobRefiner:
        posted_date = raw_data.get('posted_date', datetime.now().strftime("%m/%d/%y"))

        prompt = f"""
-        You are a job posting data extractor.
-
-        EXTRACT EXACT TEXT - DO NOT SUMMARIZE, PARAPHRASE, OR INVENT.
-
-        For these critical fields, follow these rules:
-        - description: Extract ALL job description text. If ANY job details exist (duties, responsibilities, overview), include them. Only use "Not provided" if absolutely no description exists.
-        - requirements: Extract ALL requirements text. If ANY requirements exist (skills, experience, education needed), include them. Only use "Not provided" if none exist.
-        - qualifications: Extract ALL qualifications text. If ANY qualifications exist, include them. Only use "Not provided" if none exist.
-
-        REQUIRED FIELDS (must have valid values, never "N/A"):
-        - title, company_name, job_id, url
-
-        OPTIONAL FIELDS (can be "Not provided"):
-        - location, salary_range, nature_of_work
-
-        Page Content:
-        {cleaned_content}
-
-        Response format (ONLY return this JSON):
-        {{
-            "title": "...",
-            "company_name": "...",
-            "location": "...",
-            "description": "...",
-            "requirements": "...",
-            "qualifications": "...",
-            "salary_range": "...",
-            "nature_of_work": "...",
-            "job_id": "{job_id}",
-            "url": "{url}"
-        }}
-        """
+        You are an expert job posting data extractor. Your task is to extract AND infer fields from the provided job posting.
+
+        ### CORE RULES:
+        1. **NEVER invent, summarize, or paraphrase** — extract **exact wording** when available.
+        2. **For critical fields (title, company_name, job_id, url, description):**
+           - These MUST be present and meaningful.
+           - If not explicitly stated, **infer from context** (e.g., page title, headings, "About Us", etc.).
+           - **NEVER return "Not provided" or "N/A" for these fields.**
+        3. **For optional fields (location, salary_range, etc.):**
+           - Extract exact text if present.
+           - If absent but inferable (e.g., "Remote (US)", "Full-time"), **infer it**.
+           - Only return "Not provided" if truly absent and non-inferable.
+
+        ### FIELD DEFINITIONS:
+        - **title**: The job title. Look in <h1>, page title, or bold headings.
+        - **company_name**: Company name. Look in logo alt text, footer, "About [X]", or page title.
+        - **description**: Main job overview, responsibilities, or duties. Combine relevant paragraphs if needed. **Must not be empty.**
+        - **requirements**: Required skills, experience, or qualifications.
+        - **qualifications**: Educational or certification requirements.
+        - **location**: Office location or remote policy.
+        - **salary_range**: Exact compensation info.
+        - **nature_of_work**: Employment type (Full-time, Contract, Internship, etc.).
+
+        ### OUTPUT FORMAT:
+        Return ONLY a valid JSON object with these keys:
+        {{
+            "title": "...",
+            "company_name": "...",
+            "location": "...",
+            "description": "...",
+            "requirements": "...",
+            "qualifications": "...",
+            "salary_range": "...",
+            "nature_of_work": "...",
+            "job_id": "{job_id}",
+            "url": "{url}"
+        }}
+
+        - **Critical fields must NEVER be "Not provided", "N/A", empty, or generic** (e.g., "Company", "Job Title").
+        - **Optional fields may be "Not provided" ONLY if truly absent.**
+        - **Do not include markdown, explanations, or extra text.**
+        - **Use double quotes for JSON.**
+
+        Page Content:
+        {cleaned_content}
+        """

        try:
            response_text = await asyncio.get_event_loop().run_in_executor(
@@ -188,31 +191,23 @@ class LLMJobRefiner:
            if not refined_data:
                return None

-            # Validate required fields
-            required_fields = ['title', 'company_name', 'job_id', 'url']
-            for field in required_fields:
-                if not refined_data.get(field) or refined_data[field].strip() in ["N/A", "", "Unknown", "Company", "Job"]:
-                    return None
-
-            # CRITICAL: Validate content fields - check if they SHOULD exist
-            content_fields = ['description', 'requirements', 'qualifications']
-            cleaned_original = cleaned_content.lower()
-
-            # Simple heuristic: if page contains job-related keywords, content fields should NOT be "Not provided"
-            job_indicators = ['responsibilit', 'duties', 'require', 'qualifi', 'skill', 'experienc', 'educat', 'degree', 'bachelor', 'master']
-            has_job_content = any(indicator in cleaned_original for indicator in job_indicators)
-
-            if has_job_content:
-                for field in content_fields:
-                    value = refined_data.get(field, "").strip()
-                    if value in ["Not provided", "N/A", ""]:
-                        # LLM failed to extract existing content
-                        print(f" ⚠️ LLM returned '{value}' for {field} but job content appears present")
-                        return None
-
-            # Add the posted_date to the refined data
+            # Validate critical fields — reject if missing or placeholder
+            critical_fields = ['title', 'company_name', 'job_id', 'url', 'description']
+            for field in critical_fields:
+                value = refined_data.get(field, "").strip()
+                if not value or value.lower() in ["n/a", "not provided", "unknown", "company", "job", "title", ""]:
+                    print(f" ❌ Critical field '{field}' is invalid: '{value}'")
+                    return None  # This job will NOT be saved — as per requirement
+
+            # Optional fields: allow "Not provided", but ensure they're strings
+            optional_fields = ['location', 'requirements', 'qualifications', 'salary_range', 'nature_of_work']
+            for field in optional_fields:
+                if field not in refined_data:
+                    refined_data[field] = "Not provided"
+                elif not isinstance(refined_data[field], str):
+                    refined_data[field] = str(refined_data[field])
+
            refined_data['posted_date'] = posted_date

            return refined_data

        except Exception as e:
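A standalone sketch of the validation policy introduced above: critical fields must be real values, optional fields are coerced to strings. The sample record is illustrative.

```python
from typing import Any, Dict, Optional

CRITICAL_FIELDS = ["title", "company_name", "job_id", "url", "description"]
OPTIONAL_FIELDS = ["location", "requirements", "qualifications", "salary_range", "nature_of_work"]
PLACEHOLDERS = {"n/a", "not provided", "unknown", "company", "job", "title", ""}


def validate_refined(refined: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    # Reject the whole record if any critical field is missing or a placeholder.
    for field in CRITICAL_FIELDS:
        value = str(refined.get(field, "")).strip()
        if not value or value.lower() in PLACEHOLDERS:
            print(f"Critical field '{field}' is invalid: '{value}'")
            return None
    # Optional fields may be "Not provided", but must exist and be strings.
    for field in OPTIONAL_FIELDS:
        if field not in refined:
            refined[field] = "Not provided"
        elif not isinstance(refined[field], str):
            refined[field] = str(refined[field])
    return refined


# Rejected: "N/A" is a placeholder for a critical field.
print(validate_refined({"title": "Backend Engineer", "company_name": "Acme",
                        "job_id": "123", "url": "https://example.com/jobs/123",
                        "description": "N/A"}))
```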
@@ -220,15 +215,22 @@ class LLMJobRefiner:
            return None

    def _parse_llm_response(self, response_text: str) -> Dict[str, Any]:
+        # Try to extract JSON from markdown code block
        json_match = re.search(r'```(?:json)?\s*({.*?})\s*```', response_text, re.DOTALL)
        if not json_match:
-            json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
+            # Try to find raw JSON object
+            json_match = re.search(r'\{[^{}]*\{[^{}]*\}[^{}]*\}|\{.*\}', response_text, re.DOTALL)
        if not json_match:
            return None

        try:
-            return json.loads(json_match.group(1) if '```' in response_text else json_match.group(0))
-        except json.JSONDecodeError:
+            json_str = json_match.group(1) if '```' in response_text else json_match.group(0)
+            # Clean common issues
+            json_str = re.sub(r'\s+', ' ', json_str)
+            json_str = re.sub(r',\s*([\]}\)])', r'\1', json_str)  # Remove trailing commas
+            return json.loads(json_str)
+        except json.JSONDecodeError as e:
+            print(f"JSON parsing error: {e}")
            return None

    async def save_job_data(self, job_data: Dict[str, Any], keyword: str):
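A simplified standalone version of the parsing helper above; the nested-object fallback regex is reduced to a plain brace match here, and the sample response is illustrative.

```python
import json
import re
from typing import Any, Dict, Optional


def parse_llm_json(response_text: str) -> Optional[Dict[str, Any]]:
    # Prefer a fenced ```json block, then fall back to the first brace-delimited object.
    match = re.search(r'```(?:json)?\s*({.*?})\s*```', response_text, re.DOTALL)
    if not match:
        match = re.search(r'\{.*\}', response_text, re.DOTALL)
    if not match:
        return None
    json_str = match.group(1) if '```' in response_text else match.group(0)
    json_str = re.sub(r'\s+', ' ', json_str)            # collapse whitespace
    json_str = re.sub(r',\s*([\]}])', r'\1', json_str)  # drop trailing commas
    try:
        return json.loads(json_str)
    except json.JSONDecodeError as e:
        print(f"JSON parsing error: {e}")
        return None


print(parse_llm_json('```json\n{"title": "DeFi Analyst", "company_name": "Acme",}\n```'))
```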
@@ -239,11 +241,11 @@ class LLMJobRefiner:
        """Save job data to PostgreSQL database with job_id uniqueness"""
        try:
            conn = psycopg2.connect(
                host=self.db_host,
                port=self.db_port,
                database="postgres",
                user=self.db_username,
                password=self.db_password
            )
            cursor = conn.cursor()

@@ -254,50 +256,50 @@ class LLMJobRefiner:
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                ON CONFLICT (job_id) DO NOTHING
            ''', (
-                job_data.get("title", "N/A"),
-                job_data.get("company_name", "N/A"),
-                job_data.get("location", "N/A"),
-                job_data.get("description", "N/A"),
-                job_data.get("requirements", "N/A"),
-                job_data.get("qualifications", "N/A"),
-                job_data.get("salary_range", "N/A"),
-                job_data.get("nature_of_work", "N/A"),
-                job_data.get("job_id", "N/A"),
+                job_data.get("title", "Not provided"),
+                job_data.get("company_name", "Not provided"),
+                job_data.get("location", "Not provided"),
+                job_data.get("description", "Not provided"),
+                job_data.get("requirements", "Not provided"),
+                job_data.get("qualifications", "Not provided"),
+                job_data.get("salary_range", "Not provided"),
+                job_data.get("nature_of_work", "Not provided"),
+                job_data.get("job_id", "unknown"),
                job_data.get("url", "N/A"),
-                job_data.get("category", "N/A"),
+                job_data.get("category", "all"),
                job_data.get("scraped_at"),
-                job_data.get("posted_date", "N/A")
+                job_data.get("posted_date", datetime.now().strftime("%m/%d/%y"))
            ))

            conn.commit()
            cursor.close()
            conn.close()

-            print(f" 💾 Saved job to category '{job_data.get('category', 'N/A')}' with job_id: {job_data.get('job_id', 'N/A')}")
+            print(f" 💾 Saved job to category '{job_data.get('category', 'all')}' with job_id: {job_data.get('job_id', 'unknown')}")

        except Exception as e:
            print(f"❌ Database save error: {e}")

    async def _save_to_markdown(self, job_data: Dict[str, Any], keyword: str):
-        os.makedirs("linkedin_jobs", exist_ok=True)
-        filepath = os.path.join("linkedin_jobs", "linkedin_jobs_scraped.md")
+        os.makedirs("crypto_jobs", exist_ok=True)
+        filepath = os.path.join("crypto_jobs", "crypto_jobs_scraped.md")
        write_header = not os.path.exists(filepath) or os.path.getsize(filepath) == 0

        with open(filepath, "a", encoding="utf-8") as f:
            if write_header:
-                f.write(f"# LinkedIn Jobs - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
-            f.write(f"## Job: {job_data.get('title', 'N/A')}\n\n")
+                f.write(f"# Crypto Jobs - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
+            f.write(f"## Job: {job_data.get('title', 'Not provided')}\n\n")
            f.write(f"- **Keyword**: {keyword}\n")
-            f.write(f"- **Company**: {job_data.get('company_name', 'N/A')}\n")
-            f.write(f"- **Location**: {job_data.get('location', 'N/A')}\n")
-            f.write(f"- **Nature of Work**: {job_data.get('nature_of_work', 'N/A')}\n")
-            f.write(f"- **Salary Range**: {job_data.get('salary_range', 'N/A')}\n")
-            f.write(f"- **Job ID**: {job_data.get('job_id', 'N/A')}\n")
+            f.write(f"- **Company**: {job_data.get('company_name', 'Not provided')}\n")
+            f.write(f"- **Location**: {job_data.get('location', 'Not provided')}\n")
+            f.write(f"- **Nature of Work**: {job_data.get('nature_of_work', 'Not provided')}\n")
+            f.write(f"- **Salary Range**: {job_data.get('salary_range', 'Not provided')}\n")
+            f.write(f"- **Job ID**: {job_data.get('job_id', 'unknown')}\n")
            f.write(f"- **Posted Date**: {job_data.get('posted_date', 'N/A')}\n")
-            f.write(f"- **Category**: {job_data.get('category', 'N/A')}\n")
+            f.write(f"- **Category**: {job_data.get('category', 'all')}\n")
            f.write(f"- **Scraped At**: {job_data.get('scraped_at', 'N/A')}\n")
            f.write(f"- **URL**: <{job_data.get('url', 'N/A')}>\n\n")
-            f.write(f"### Description\n\n{job_data.get('description', 'N/A')}\n\n")
-            f.write(f"### Requirements\n\n{job_data.get('requirements', 'N/A')}\n\n")
-            f.write(f"### Qualifications\n\n{job_data.get('qualifications', 'N/A')}\n\n")
+            f.write(f"### Description\n\n{job_data.get('description', 'Not provided')}\n\n")
+            f.write(f"### Requirements\n\n{job_data.get('requirements', 'Not provided')}\n\n")
+            f.write(f"### Qualifications\n\n{job_data.get('qualifications', 'Not provided')}\n\n")
            f.write("---\n\n")
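The insert relies on a unique job_id for idempotency. A sketch of that pattern, assuming a table named jobs (the real table name and column list sit outside this hunk) with columns mirroring the job_data keys:

```python
import psycopg2  # connection parameters as in _init_db above

DDL = """
CREATE TABLE IF NOT EXISTS jobs (
    title TEXT, company_name TEXT, location TEXT, description TEXT,
    requirements TEXT, qualifications TEXT, salary_range TEXT, nature_of_work TEXT,
    job_id TEXT UNIQUE, url TEXT, category TEXT, scraped_at TEXT, posted_date TEXT
)
"""

INSERT = """
INSERT INTO jobs (title, company_name, location, description, requirements,
                  qualifications, salary_range, nature_of_work, job_id, url,
                  category, scraped_at, posted_date)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (job_id) DO NOTHING
"""

COLUMNS = ("title", "company_name", "location", "description", "requirements",
           "qualifications", "salary_range", "nature_of_work", "job_id", "url",
           "category", "scraped_at", "posted_date")


def save_job(conn, job: dict) -> None:
    defaults = {"job_id": "unknown", "category": "all"}
    row = [job.get(col, defaults.get(col, "Not provided")) for col in COLUMNS]
    with conn.cursor() as cur:
        cur.execute(DDL)
        cur.execute(INSERT, row)
    conn.commit()  # rows with a duplicate job_id are silently skipped
```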
main.py
@@ -1,3 +1,4 @@
 from scraping_engine import FingerprintScrapingEngine
 from scraper import CryptoJobScraper  # Updated class name
 import os
@@ -20,16 +21,15 @@ async def main():
    scraper = CryptoJobScraper(engine, human_speed=1.3, user_request="Extract title, company, location, description, requirements, qualifications, nature of work, and salary")

    job_titles = [
-        "Blockchain Engineer",
-        "Smart Contract Developer",
-        "DeFi Analyst",
-        "Web3 Developer",
-        "Crypto Researcher",
-        "Solidity Developer",
-        "Protocol Engineer",
-        "Tokenomics Specialist",
-        "Zero-Knowledge Proof Engineer",
-        "Crypto Compliance Officer"
+        "Customer Support",
+        "Design",
+        "Engineering",
+        "Finance",
+        "Marketing",
+        "Operations",
+        "Product",
+        "Sales"
    ]

    while True:
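main.py's loop body is not part of this diff; the following is a hypothetical sketch of how the new category list might drive scrape_jobs (the pause interval and function name are assumptions, not code from the commit):

```python
import asyncio


async def run_forever(scraper, categories, pause_seconds: int = 3600):
    # Hypothetical driver: sweep every category, then wait before the next pass.
    while True:
        for category in categories:
            await scraper.scrape_jobs(search_keywords=category, max_pages=1)
        await asyncio.sleep(pause_seconds)
```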
scraper.py
@@ -6,10 +6,11 @@ from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
 from browserforge.injectors.playwright import AsyncNewContext
 from llm_agent import LLMJobRefiner
 import re
-from fetcher import StealthyFetcher
 from datetime import datetime
 import json
 import redis
+from urllib.parse import urlparse
+import hashlib


 class CryptoJobScraper:
@@ -25,7 +26,29 @@ class CryptoJobScraper:
        self.human_speed = human_speed
        self.user_request = user_request
        self.llm_agent = LLMJobRefiner()
        self.redis_client = redis.Redis(host='localhost', port=6379, db=0, decode_responses=True)
+
+        self.FORBIDDEN_ATS_DOMAINS = [
+            'ashby', 'ashbyhq',
+            'greenhouse', 'boards.greenhouse.io',
+            'gem', 'gem.com',
+            'rippling',
+            'myworkday', 'myworkdayjobs',
+            'smartrecruiters',
+            'workable',
+            'lever', 'jobs.lever.co',
+        ]
+
+        self.INVALID_CONTENT_PHRASES = [
+            "invalid job url",
+            "cookie consent",
+            "privacy policy",
+            "not a valid job",
+            "job not found",
+            "page not found",
+            "The requested job post could not be found. It may have been removed.",
+            "this page does not contain a job description",
+        ]

    async def _human_click(self, page, element, wait_after: bool = True):
        if not element:
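Later hunks call self._add_job_to_redis_cache(), whose body is not shown in this diff. A hypothetical sketch of such a failure cache built on the Redis client configured above (key names and layout are assumptions):

```python
from datetime import datetime

import redis

r = redis.Redis(host="localhost", port=6379, db=0, decode_responses=True)


def cache_failed_job(url: str, job_id: str, reason: str) -> None:
    # Hypothetical layout: one hash per job plus a set for fast membership checks.
    r.hset(f"failed_job:{job_id}", mapping={
        "url": url,
        "reason": reason,
        "failed_at": datetime.now().isoformat(),
    })
    r.sadd("failed_job_ids", job_id)


def already_failed(job_id: str) -> bool:
    return bool(r.sismember("failed_job_ids", job_id))
```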
@@ -55,60 +78,127 @@ class CryptoJobScraper:
        matches = sum(1 for kw in keyword_list if kw in title_lower)
        return matches / len(keyword_list) if keyword_list else 0.0

-    async def _scrape_jobs_from_current_page(self, page, search_keywords: str, seen_job_ids, all_job_links):
-        current_links = await page.query_selector_all("a[href*='/job/']")
-        new_jobs = 0
-
-        for link in current_links:
-            href = await link.get_attribute("href")
-            if not href or not href.startswith("http"):
-                href = "https://cryptocurrencyjobs.co" + href
-            job_id = href.split("/")[-1] if href.endswith("/") else href.split("/")[-1]
-
-            if job_id and job_id not in seen_job_ids:
-                title_element = await link.query_selector("h3, .job-title")
-                title = (await title_element.inner_text()) if title_element else "Unknown Title"
-                match_percentage = self._calculate_keyword_match(title, search_keywords)
-                if match_percentage >= 0.5:  # Lower threshold than LinkedIn
-                    seen_job_ids.add(job_id)
-                    all_job_links.append((href, title))
-                    new_jobs += 1
-                else:
-                    print(f" ⚠️ Skipping job due to low keyword match: {title[:50]}... (match: {match_percentage:.2%})")
-        return new_jobs
-
-    async def _handle_pagination(self, page, search_keywords: str, seen_job_ids, all_job_links):
-        current_page = 1
-        while True:
-            print(f"📄 Processing page {current_page}")
-            new_jobs = await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links)
-            print(f" ➕ Found {new_jobs} new job(s) (total: {len(all_job_links)})")
-
-            next_btn = await page.query_selector('a[rel="next"]')
-            if next_btn:
-                next_url = await next_btn.get_attribute("href")
-                if next_url and not next_url.startswith("http"):
-                    next_url = "https://cryptocurrencyjobs.co" + next_url
-                await page.goto(next_url, timeout=120000)
-                await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)
-                current_page += 1
-            else:
-                print("🔚 No 'Next' page — stopping pagination.")
-                break
-
-    async def _extract_job_posted_date(self, page) -> str:
-        try:
-            date_element = await page.query_selector(".job-posted-date, .job-date, time")
-            if date_element:
-                date_text = await date_element.inner_text()
-                if "Today" in date_text:
-                    return datetime.now().strftime("%m/%d/%y")
-                elif "Yesterday" in date_text:
-                    yesterday = datetime.now().replace(day=datetime.now().day - 1)
-                    return yesterday.strftime("%m/%d/%y")
-                else:
-                    return datetime.now().strftime("%m/%d/%y")
+    async def _extract_job_title_from_card(self, card) -> str:
+        try:
+            title_selectors = [
+                'h3', 'h2', 'h4',
+                'strong', 'span'
+            ]
+            for selector in title_selectors:
+                title_element = await card.query_selector(selector)
+                if title_element:
+                    title_text = await title_element.inner_text()
+                    if title_text and len(title_text.strip()) > 3:
+                        return title_text.strip()
+
+            card_text = await card.inner_text()
+            lines = [line.strip() for line in card_text.split('\n') if line.strip()]
+            if lines:
+                for line in lines:
+                    if len(line) > 5 and not any(skip in line.lower() for skip in ['today', 'featured', 'yesterday', 'company', 'location']):
+                        return line
+            return "Unknown Title"
+        except:
+            return "Unknown Title"
+
+    async def _collect_job_elements_from_page(self, page, search_keywords: str, seen_slugs):
+        job_cards = []
+        job_found = False
+
+        await asyncio.sleep(3 * self.human_speed)
+
+        try:
+            await page.wait_for_selector('a[href^="/"][href*="-"]', timeout=60000)
+            candidates = await page.query_selector_all('a[href^="/"][href*="-"]')
+
+            for link in candidates:
+                href = await link.get_attribute("href") or ""
+                href = href.rstrip('/')
+                if not href or len(href.split('/')) != 3:
+                    continue
+                if '-' not in href.split('/')[-1]:
+                    continue
+                slug = href.split('/')[-1]
+                if len(slug) < 8 or slug.startswith(('login', 'signup', 'about', 'terms')):
+                    continue
+
+                full_url = "https://cryptocurrencyjobs.co" + href if not href.startswith('http') else href
+                if slug in seen_slugs:
+                    continue
+
+                title = await self._extract_job_title_from_card(link)
+                if not title or title == "Unknown Title":
+                    title = slug.replace('-', ' ').title()
+
+                match_percentage = self._calculate_keyword_match(title, search_keywords)
+                if match_percentage >= 0.4 or not search_keywords.strip():
+                    seen_slugs.add(slug)
+                    job_cards.append((full_url, title, link))
+                    job_found = True

+            print(f" ✅ Collected {len(job_cards)} valid job links after filtering ({len(candidates)} raw candidates).")
+
+        except Exception as e:
+            print(f" ⚠️ Error collecting job cards: {e}")
+
+        if not job_found:
+            print(" ❌ No valid job listings passed filters.")
+
+        return job_cards
+
+    async def _handle_pagination_and_collect_all(self, page, search_keywords: str, seen_slugs):
+        all_job_elements = []
+        scroll_attempt = 0
+        max_scrolls = 40
+        prev_count = 0
+
+        while scroll_attempt < max_scrolls:
+            print(f" Scroll attempt {scroll_attempt + 1} | Current total jobs: {len(all_job_elements)}")
+
+            page_elements = await self._collect_job_elements_from_page(page, search_keywords, seen_slugs)
+            all_job_elements.extend(page_elements)
+
+            current_count = len(all_job_elements)
+
+            if current_count == prev_count and scroll_attempt > 3:
+                print(" 🔚 No new jobs after several scrolls → assuming end of list.")
+                break
+
+            prev_count = current_count
+
+            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+            await asyncio.sleep(random.uniform(2.5, 5.5) * self.human_speed)
+
+            try:
+                load_more = await page.query_selector(
+                    'button:has-text("Load more"), button:has-text("More"), div[role="button"]:has-text("Load"), a:has-text("Load more")'
+                )
+                if load_more:
+                    print(" Found 'Load more' button → clicking...")
+                    await self._human_click(page, load_more)
+                    await asyncio.sleep(random.uniform(3.0, 6.0) * self.human_speed)
+            except:
+                pass
+
+            scroll_attempt += 1
+
+        print(f" Finished scrolling → collected {len(all_job_elements)} unique job links.")
+        return all_job_elements
+
+    async def _extract_job_posted_date_from_card(self, card) -> str:
+        try:
+            card_text = await card.inner_text()
+            if "Today" in card_text:
+                return datetime.now().strftime("%m/%d/%y")
+            elif "Yesterday" in card_text:
+                from datetime import timedelta
+                return (datetime.now() - timedelta(days=1)).strftime("%m/%d/%y")
+            else:
+                match = re.search(r'(\d+)d', card_text)
+                if match:
+                    days = int(match.group(1))
+                    from datetime import timedelta
+                    return (datetime.now() - timedelta(days=days)).strftime("%m/%d/%y")
        except:
            pass
        return datetime.now().strftime("%m/%d/%y")
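A standalone version of the card-date heuristic added above ("Today", "Yesterday", or an "Nd" age), with an illustrative card text; anything unrecognized falls back to today's date.

```python
import re
from datetime import datetime, timedelta


def posted_date_from_card_text(card_text: str) -> str:
    try:
        if "Today" in card_text:
            return datetime.now().strftime("%m/%d/%y")
        if "Yesterday" in card_text:
            return (datetime.now() - timedelta(days=1)).strftime("%m/%d/%y")
        match = re.search(r'(\d+)d', card_text)
        if match:
            return (datetime.now() - timedelta(days=int(match.group(1)))).strftime("%m/%d/%y")
    except Exception:
        pass
    return datetime.now().strftime("%m/%d/%y")


print(posted_date_from_card_text("Senior Solidity Engineer · Remote · 3d"))  # sample card text
```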
@@ -126,15 +216,62 @@ class CryptoJobScraper:
        except Exception as e:
            print(f" ❌ Failed to add job to Redis cache: {str(e)}")

+    async def _is_forbidden_ats_url(self, url: str) -> bool:
+        url_lower = url.lower()
+        return any(domain in url_lower for domain in self.FORBIDDEN_ATS_DOMAINS)
+
+    async def _is_invalid_job_page(self, page_content: str) -> bool:
+        content_lower = page_content.lower()
+        return any(phrase in content_lower for phrase in self.INVALID_CONTENT_PHRASES)
+
+    def _extract_job_id_from_url(self, url: str) -> Optional[str]:
+        """
+        Extract job ID from URL. Returns ID if it contains at least one digit.
+        Otherwise, returns None (but does NOT mean skip!).
+        """
+        try:
+            parsed = urlparse(url)
+            path_parts = [p for p in parsed.path.split('/') if p]
+            if not path_parts:
+                return None
+
+            candidate = path_parts[-1]
+            candidate = re.split(r'[?#]', candidate)[0]
+            candidate = re.sub(r'\.html?$', '', candidate)
+
+            if not candidate or not any(c.isdigit() for c in candidate):
+                return None
+
+            # Avoid title-like strings (with spaces or long words + no structure)
+            if re.search(r'[A-Za-z]{6,}\s', candidate):
+                return None
+
+            return candidate
+        except:
+            return None
+
    async def scrape_jobs(
        self,
        search_keywords: Optional[str],
        max_pages: int = 1,
        credentials: Optional[Dict] = None
    ):
-        # cryptocurrencyjobs.co uses URL params differently
-        encoded_keywords = search_keywords.replace(" ", "%20")
-        search_url = f"https://cryptocurrencyjobs.co/?q={encoded_keywords}"
+        query = ""
+        location = ""
+        if search_keywords and search_keywords.strip():
+            parts = search_keywords.split(',', 1)
+            query = parts[0].strip()
+            if len(parts) > 1:
+                location = parts[1].strip()
+
+        clean_query = query.replace(' ', '+')
+        clean_location = location.replace(' ', '+')
+
+        search_url = "https://cryptocurrencyjobs.co/"
+        if clean_query:
+            search_url += f"?query={clean_query}"
+        if clean_location:
+            search_url += f"&location={clean_location}"

        profile = self.engine._select_profile()
        renderer = random.choice(self.engine.common_renderers[self.engine.os])
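A standalone sketch of the job-ID policy from this hunk, plus the hashed fallback used in the next one: keep the last path segment only if it contains a digit, otherwise derive a stable ID from the URL. The title-like-string check is omitted here and the example URLs are illustrative.

```python
import hashlib
import re
from typing import Optional
from urllib.parse import urlparse


def extract_job_id(url: str) -> Optional[str]:
    parsed = urlparse(url)
    parts = [p for p in parsed.path.split("/") if p]
    if not parts:
        return None
    candidate = re.split(r'[?#]', parts[-1])[0]
    candidate = re.sub(r'\.html?$', '', candidate)
    if not candidate or not any(c.isdigit() for c in candidate):
        return None
    return candidate


def job_id_or_hash(url: str) -> str:
    return extract_job_id(url) or "job_" + hashlib.md5(url.encode()).hexdigest()[:12]


print(job_id_or_hash("https://jobs.example.com/postings/4567890"))                       # -> 4567890
print(job_id_or_hash("https://cryptocurrencyjobs.co/engineering/acme-senior-engineer"))  # -> hashed fallback
```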
@@ -156,46 +293,107 @@ class CryptoJobScraper:
            await context.add_init_script(spoof_script)

            page = await context.new_page()
-            # Fetch main search page
-            print(f"🔍 Searching for: {search_keywords}")
-            await page.goto(search_url, wait_until='load', timeout=120000)
+            print(f"🔍 Searching for: {search_keywords or 'all jobs'}")
+            print(f" 🔗 URL: {search_url}")
+            await page.goto(search_url, wait_until='networkidle', timeout=120000)
            await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)

-            all_job_links = []
-            seen_job_ids = set()
-
-            print("🔄 Collecting job links from search results...")
-            await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links)
-            await self._handle_pagination(page, search_keywords, seen_job_ids, all_job_links)
-
-            print(f"✅ Collected {len(all_job_links)} unique job links.")
+            try:
+                await page.wait_for_selector("a[href^='/'][href*='-']", timeout=10000)
+            except:
+                print(" ⚠️ No job links found initially, waiting longer...")
+                await asyncio.sleep(5 * self.human_speed)
+
+            seen_slugs = set()
+            all_job_elements = await self._handle_pagination_and_collect_all(page, search_keywords, seen_slugs)
+            print(f"✅ Collected {len(all_job_elements)} unique job links.")

            scraped_count = 0
-            for idx, (href, title) in enumerate(all_job_links):
+            for idx, (href, title, job_element) in enumerate(all_job_elements):
+                job_detail_page = None
+                apply_page = None
+                skip_job = False
+                final_scrape_url = None
                try:
-                    full_url = href
-                    print(f" → Opening job {idx+1}/{len(all_job_links)}: {full_url}")
-
-                    fetcher = StealthyFetcher(self.engine, browser, context)
-                    job_page = await fetcher.fetch_url(full_url, wait_for_selector="h1")
-                    if not job_page:
-                        print(f" ❌ Failed to fetch job page {full_url}")
-                        await self._add_job_to_redis_cache(full_url, full_url.split("/")[-1], "fetch_failure")
-                        self.engine.report_outcome("fetch_failure", url=full_url)
+                    print(f" → Processing job {idx+1}/{len(all_job_elements)}: {title}")
+
+                    posted_date = await self._extract_job_posted_date_from_card(job_element)
+
+                    job_detail_page = await context.new_page()
+                    await job_detail_page.goto(href, wait_until='networkidle', timeout=60000)
+                    await asyncio.sleep(2 * self.human_speed)
+
+                    # Check for invalid content
+                    page_content = await job_detail_page.content()
+                    if await self._is_invalid_job_page(page_content):
+                        print(" 🚫 Page contains invalid content → skipping.")
+                        await job_detail_page.close()
                        continue

-                    posted_date = await self._extract_job_posted_date(job_page)
-
-                    await self.engine._human_like_scroll(job_page)
-                    await asyncio.sleep(2 * self.human_speed)
-                    page_content = await self._extract_page_content_for_llm(job_page)
-
-                    job_id = full_url.split("/")[-1] if full_url.split("/")[-1] else "unknown"
+                    # Try to click apply
+                    apply_clicked = False
+                    apply_selectors = [
+                        'a[href*="apply"], a:text("Apply"), a:text("Apply Now"), a:text("Apply here")',
+                        'button:text("Apply"), button:has-text("Apply")',
+                        '[data-testid="apply-button"], [aria-label*="apply"], [role="button"]:has-text("Apply")',
+                        'a.btn-apply, .apply-button, .apply-link, a:has-text("Apply")',
+                        'a[rel="noopener"]:has-text("Apply")',
+                    ]
+
+                    for sel in apply_selectors:
+                        apply_elem = await job_detail_page.query_selector(sel)
+                        if apply_elem:
+                            print(f" 🔗 Found Apply element with selector: {sel}")
+                            await self._human_click(job_detail_page, apply_elem, wait_after=True)
+                            apply_clicked = True
+                            break
+
+                    apply_page = job_detail_page
+
+                    if apply_clicked:
+                        await asyncio.sleep(random.uniform(3.0, 6.0) * self.human_speed)
+                        pages = context.pages
+                        new_pages = [p for p in pages if p != job_detail_page and p.url != "about:blank"]
+
+                        if new_pages:
+                            candidate_page = new_pages[-1]
+                            new_url = candidate_page.url.strip()
+                            print(f" New tab opened: {new_url}")
+
+                            if new_url and await self._is_forbidden_ats_url(new_url):
+                                print(" 🚫 New URL is a forbidden ATS → skipping job.")
+                                if candidate_page != job_detail_page:
+                                    await candidate_page.close()
+                                await job_detail_page.close()
+                                skip_job = True
+                            else:
+                                apply_page = candidate_page
+                        else:
+                            print(" No new tab → using original page.")
+
+                    if skip_job:
+                        continue
+
+                    final_scrape_url = apply_page.url
+
+                    # Re-check invalid content on final page
+                    page_content = await self._extract_page_content_for_llm(apply_page)
+                    if await self._is_invalid_job_page(page_content):
+                        print(" 🚫 Final page contains invalid content → skipping.")
+                        if apply_page != job_detail_page:
+                            await apply_page.close()
+                        await job_detail_page.close()
+                        continue
+
+                    # Extract job ID — but do NOT fail if missing
+                    job_id = self._extract_job_id_from_url(final_scrape_url)
+                    if not job_id:
+                        # Fallback: hash the URL to create a stable, unique ID
+                        job_id = "job_" + hashlib.md5(final_scrape_url.encode()).hexdigest()[:12]

                    raw_data = {
                        "page_content": page_content,
-                        "url": full_url,
+                        "url": final_scrape_url,
                        "job_id": job_id,
                        "search_keywords": search_keywords,
                        "posted_date": posted_date
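A sketch of the "did Apply open a new tab?" pattern used above: snapshot context.pages, click, then treat any new non-blank page as the scrape target. Playwright's context.expect_page() is an alternative when a popup is guaranteed to open.

```python
import asyncio
import random


async def click_and_follow(context, page, element, human_speed: float = 1.0):
    """Click an Apply element and return the page that should be scraped afterwards."""
    before = set(context.pages)
    await element.click()
    await asyncio.sleep(random.uniform(3.0, 6.0) * human_speed)
    new_pages = [p for p in context.pages if p not in before and p.url != "about:blank"]
    # Fall back to the original page when the click navigated in place.
    return new_pages[-1] if new_pages else page
```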
@@ -210,44 +408,45 @@ class CryptoJobScraper:
                            if field == 'job_id':
                                refined_data[field] = job_id
                            elif field == 'url':
-                                refined_data[field] = full_url
+                                refined_data[field] = final_scrape_url
                            elif field == 'company_name':
                                refined_data[field] = "Unknown Company"

-                        refined_data['scraped_at'] = datetime.now().isoformat()
-                        refined_data['category'] = search_keywords
-                        refined_data['posted_date'] = posted_date
-                        await self.llm_agent.save_job_data(refined_data, search_keywords)
-                        scraped_count += 1
-                        print(f" ✅ Scraped and refined: {refined_data['title'][:50]}...")
-                        self.engine.report_outcome("success", url=raw_data["url"])
-                    else:
-                        print(f" 🟡 Could not extract meaningful data from: {full_url}")
-                        await self._add_job_to_redis_cache(full_url, job_id, "llm_failure")
-                        self.engine.report_outcome("llm_failure", url=raw_data["url"])
-
-                    await job_page.close()
+                        refined_data['scraped_at'] = datetime.now().isoformat()
+                        refined_data['category'] = search_keywords or "all"
+                        refined_data['posted_date'] = posted_date
+                        await self.llm_agent.save_job_data(refined_data, search_keywords or "all")
+                        scraped_count += 1
+                        print(f" ✅ Scraped: {refined_data['title'][:50]}... (Job ID: {job_id})")
+                        self.engine.report_outcome("success", url=final_scrape_url)
+                    else:
+                        print(f" 🟡 Could not extract meaningful data from: {final_scrape_url}")
+                        await self._add_job_to_redis_cache(final_scrape_url, job_id, "llm_failure")
+                        self.engine.report_outcome("llm_failure", url=final_scrape_url)
+
+                    if apply_page != job_detail_page and not apply_page.is_closed():
+                        await apply_page.close()
+                    if job_detail_page and not job_detail_page.is_closed():
+                        await job_detail_page.close()

                except Exception as e:
                    error_msg = str(e)[:100]
                    print(f" ⚠️ Failed on job {idx+1}: {error_msg}")
-                    job_id = full_url.split("/")[-1] if 'full_url' in locals() else "unknown"
-                    job_url = full_url if 'full_url' in locals() else "unknown"
-                    await self._add_job_to_redis_cache(job_url, job_id, f"exception: {error_msg}")
-                    if 'job_page' in locals() and job_page:
-                        await job_page.close()
+                    job_id_for_log = "unknown"
+                    if 'final_scrape_url' in locals() and final_scrape_url:
+                        job_id_for_log = "job_" + hashlib.md5(final_scrape_url.encode()).hexdigest()[:12]
+                    await self._add_job_to_redis_cache(href, job_id_for_log, f"exception: {error_msg}")
+                    if job_detail_page and not job_detail_page.is_closed():
+                        await job_detail_page.close()
+                    if 'apply_page' in locals() and apply_page and apply_page != job_detail_page and not apply_page.is_closed():
+                        await apply_page.close()
                    continue

-                finally:
-                    print(" ↩️ Returning to search results...")
-                    await page.goto(search_url, timeout=120000)
-                    await asyncio.sleep(4 * self.human_speed)
-
            await browser.close()

            if scraped_count > 0:
                self.engine.report_outcome("success")
-                print(f"✅ Completed! Processed {scraped_count} jobs for '{search_keywords}' based on request '{self.user_request}'.")
+                print(f"✅ Completed! Processed {scraped_count} jobs for '{search_keywords or 'all jobs'}'.")
            else:
                self.engine.report_outcome("scraping_error")
                print("⚠️ No jobs processed successfully.")