import asyncio
import random
from typing import Optional, Dict
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
from browserforge.injectors.playwright import AsyncNewContext
from llm_agent import LLMJobRefiner
import re
from datetime import datetime, timedelta
import json
import redis
from urllib.parse import urlparse
import hashlib


class CryptoJobScraper:
    def __init__(
        self,
        engine,
        db_path: str = "crypto_jobs.db",
        human_speed: float = 1.0,
        user_request: str = "Extract all standard job details"
    ):
        self.engine = engine
        self.db_path = db_path
        self.human_speed = human_speed
        self.user_request = user_request
        self.llm_agent = LLMJobRefiner()
        self.redis_client = redis.Redis(host='localhost', port=6379, db=0, decode_responses=True)

        self.FORBIDDEN_ATS_DOMAINS = [
            'ashby', 'ashbyhq',
            'greenhouse', 'boards.greenhouse.io',
            'gem', 'gem.com',
            'rippling',
            'myworkday', 'myworkdayjobs',
            'smartrecruiters',
            'workable',
            'lever', 'jobs.lever.co',
        ]

        # Phrases are compared against lowercased page content, so keep them lowercase.
        self.INVALID_CONTENT_PHRASES = [
            "invalid job url",
            "cookie consent",
            "privacy policy",
            "not a valid job",
            "job not found",
            "page not found",
            "the requested job post could not be found. it may have been removed.",
            "this page does not contain a job description",
        ]

    async def _human_click(self, page, element, wait_after: bool = True):
        if not element:
            return False
        await element.scroll_into_view_if_needed()
        await asyncio.sleep(random.uniform(0.3, 0.8) * self.human_speed)
        try:
            await element.click()
            if wait_after:
                await asyncio.sleep(random.uniform(2, 4) * self.human_speed)
            return True
        except Exception:
            return False

    async def _extract_page_content_for_llm(self, page) -> str:
        await asyncio.sleep(2 * self.human_speed)
        await self.engine._human_like_scroll(page)
        await asyncio.sleep(2 * self.human_speed)
        page_content = await page.content()
        return page_content

    def _calculate_keyword_match(self, title: str, keywords: str) -> float:
        if not title or not keywords:
            return 0.0
        title_lower = title.lower()
        keyword_list = [kw.strip().lower() for kw in keywords.split()]
        matches = sum(1 for kw in keyword_list if kw in title_lower)
        return matches / len(keyword_list) if keyword_list else 0.0

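    # Illustration for _calculate_keyword_match (hypothetical inputs): with search
    # keywords "senior rust engineer", a card titled "Rust Engineer, Core Protocol"
    # matches 2 of 3 keywords, giving 0.67, which clears the 0.4 threshold used in
    # _collect_job_elements_from_page; a card titled "Marketing Manager" scores 0.0
    # and is dropped.
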
    async def _extract_job_title_from_card(self, card) -> str:
        try:
            title_selectors = [
                'h3', 'h2', 'h4',
                'strong', 'span'
            ]
            for selector in title_selectors:
                title_element = await card.query_selector(selector)
                if title_element:
                    title_text = await title_element.inner_text()
                    if title_text and len(title_text.strip()) > 3:
                        return title_text.strip()

            card_text = await card.inner_text()
            lines = [line.strip() for line in card_text.split('\n') if line.strip()]
            if lines:
                for line in lines:
                    if len(line) > 5 and not any(skip in line.lower() for skip in ['today', 'featured', 'yesterday', 'company', 'location']):
                        return line
            return "Unknown Title"
        except Exception:
            return "Unknown Title"

    async def _collect_job_elements_from_page(self, page, search_keywords: str, seen_slugs):
        job_cards = []
        job_found = False

        await asyncio.sleep(3 * self.human_speed)

        try:
            await page.wait_for_selector('a[href^="/"][href*="-"]', timeout=60000)
            candidates = await page.query_selector_all('a[href^="/"][href*="-"]')

            for link in candidates:
                href = await link.get_attribute("href") or ""
                href = href.rstrip('/')
                if not href or len(href.split('/')) != 3:
                    continue
                if '-' not in href.split('/')[-1]:
                    continue
                slug = href.split('/')[-1]
                if len(slug) < 8 or slug.startswith(('login', 'signup', 'about', 'terms')):
                    continue

                full_url = "https://cryptocurrencyjobs.co" + href if not href.startswith('http') else href
                if slug in seen_slugs:
                    continue

                title = await self._extract_job_title_from_card(link)
                if not title or title == "Unknown Title":
                    title = slug.replace('-', ' ').title()

                match_percentage = self._calculate_keyword_match(title, search_keywords)
                # Guard against search_keywords being None ("all jobs" mode).
                if match_percentage >= 0.4 or not (search_keywords or "").strip():
                    seen_slugs.add(slug)
                    job_cards.append((full_url, title, link))
                    job_found = True

            print(f" ✅ Collected {len(job_cards)} valid job links after filtering ({len(candidates)} raw candidates).")

        except Exception as e:
            print(f" ⚠️ Error collecting job cards: {e}")

        if not job_found:
            print(" ❌ No valid job listings passed filters.")

        return job_cards

    async def _handle_pagination_and_collect_all(self, page, search_keywords: str, seen_slugs):
        all_job_elements = []
        scroll_attempt = 0
        max_scrolls = 40
        prev_count = 0

        while scroll_attempt < max_scrolls:
            print(f" Scroll attempt {scroll_attempt + 1} | Current total jobs: {len(all_job_elements)}")

            page_elements = await self._collect_job_elements_from_page(page, search_keywords, seen_slugs)
            all_job_elements.extend(page_elements)

            current_count = len(all_job_elements)

            if current_count == prev_count and scroll_attempt > 3:
                print(" 🔚 No new jobs after several scrolls → assuming end of list.")
                break

            prev_count = current_count

            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            await asyncio.sleep(random.uniform(2.5, 5.5) * self.human_speed)

            try:
                load_more = await page.query_selector(
                    'button:has-text("Load more"), button:has-text("More"), div[role="button"]:has-text("Load"), a:has-text("Load more")'
                )
                if load_more:
                    print(" Found 'Load more' button → clicking...")
                    await self._human_click(page, load_more)
                    await asyncio.sleep(random.uniform(3.0, 6.0) * self.human_speed)
            except Exception:
                pass

            scroll_attempt += 1

        print(f" Finished scrolling → collected {len(all_job_elements)} unique job links.")
        return all_job_elements

    async def _extract_job_posted_date_from_card(self, card) -> str:
        try:
            card_text = await card.inner_text()
            if "Today" in card_text:
                return datetime.now().strftime("%m/%d/%y")
            elif "Yesterday" in card_text:
                return (datetime.now() - timedelta(days=1)).strftime("%m/%d/%y")
            else:
                match = re.search(r'(\d+)d', card_text)
                if match:
                    days = int(match.group(1))
                    return (datetime.now() - timedelta(days=days)).strftime("%m/%d/%y")
        except Exception:
            pass
        return datetime.now().strftime("%m/%d/%y")

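    # Illustration for _extract_job_posted_date_from_card (hypothetical card text):
    # "Today" maps to the current date, "Yesterday" to one day back, and a relative
    # label such as "3d" is parsed by the (\d+)d pattern into a date three days
    # before today; anything unparseable falls back to today's date.
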
    async def _add_job_to_redis_cache(self, job_url: str, job_id: str, error_type: str):
        try:
            job_data = {
                "job_url": job_url,
                "job_id": job_id,
                "error_type": error_type,
                "timestamp": datetime.now().isoformat()
            }
            self.redis_client.hset("failed_crypto_jobs", job_id, json.dumps(job_data))
            print(f" 📦 Added failed job to Redis cache: {job_id} (Error: {error_type})")
        except Exception as e:
            print(f" ❌ Failed to add job to Redis cache: {str(e)}")

    async def _is_forbidden_ats_url(self, url: str) -> bool:
        url_lower = url.lower()
        return any(domain in url_lower for domain in self.FORBIDDEN_ATS_DOMAINS)

    async def _is_invalid_job_page(self, page_content: str) -> bool:
        content_lower = page_content.lower()
        return any(phrase in content_lower for phrase in self.INVALID_CONTENT_PHRASES)

    def _extract_job_id_from_url(self, url: str) -> Optional[str]:
        """
        Extract job ID from URL. Returns ID if it contains at least one digit.
        Otherwise, returns None (but does NOT mean skip!).
        """
        try:
            parsed = urlparse(url)
            path_parts = [p for p in parsed.path.split('/') if p]
            if not path_parts:
                return None

            candidate = path_parts[-1]
            candidate = re.split(r'[?#]', candidate)[0]
            candidate = re.sub(r'\.html?$', '', candidate)

            if not candidate or not any(c.isdigit() for c in candidate):
                return None

            # Avoid title-like strings (with spaces or long words + no structure)
            if re.search(r'[A-Za-z]{6,}\s', candidate):
                return None

            return candidate
        except Exception:
            return None

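    # Illustration for _extract_job_id_from_url (hypothetical URLs): a final path
    # segment such as "backend-engineer-4821" contains digits and is returned as-is,
    # while "backend-engineer" has no digits, so None is returned and the caller
    # falls back to an MD5-derived "job_..." identifier instead.
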
    async def scrape_jobs(
        self,
        search_keywords: Optional[str],
        max_pages: int = 1,
        credentials: Optional[Dict] = None
    ):
        query = ""
        location = ""
        if search_keywords and search_keywords.strip():
            parts = search_keywords.split(',', 1)
            query = parts[0].strip()
            if len(parts) > 1:
                location = parts[1].strip()

        clean_query = query.replace(' ', '+')
        clean_location = location.replace(' ', '+')

        search_url = "https://cryptocurrencyjobs.co/"
        if clean_query:
            search_url += f"?query={clean_query}"
        if clean_location:
            # Use '?' if no query string has been started yet.
            separator = "&" if "?" in search_url else "?"
            search_url += f"{separator}location={clean_location}"

        profile = self.engine._select_profile()
        renderer = random.choice(self.engine.common_renderers[self.engine.os])
        vendor = random.choice(self.engine.common_vendors)
        spoof_script = self.engine._get_spoof_script(renderer, vendor)

        async with async_playwright() as pw:
            browser = await pw.chromium.launch(
                headless=False,
                args=['--disable-blink-features=AutomationControlled']
            )
            context = await AsyncNewContext(browser, fingerprint=profile)

            await context.add_init_script(f"""
                Object.defineProperty(navigator, 'hardwareConcurrency', {{ get: () => {profile.navigator.hardwareConcurrency} }});
                Object.defineProperty(navigator, 'deviceMemory', {{ get: () => {profile.navigator.deviceMemory} }});
                Object.defineProperty(navigator, 'platform', {{ get: () => '{profile.navigator.platform}' }});
            """)
            await context.add_init_script(spoof_script)

            page = await context.new_page()
            print(f"🔍 Searching for: {search_keywords or 'all jobs'}")
            print(f" 🔗 URL: {search_url}")
            await page.goto(search_url, wait_until='networkidle', timeout=120000)
            await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)

            try:
                await page.wait_for_selector("a[href^='/'][href*='-']", timeout=10000)
            except PlaywrightTimeoutError:
                print(" ⚠️ No job links found initially, waiting longer...")
                await asyncio.sleep(5 * self.human_speed)

            seen_slugs = set()
            all_job_elements = await self._handle_pagination_and_collect_all(page, search_keywords, seen_slugs)
            print(f"✅ Collected {len(all_job_elements)} unique job links.")

            scraped_count = 0
            for idx, (href, title, job_element) in enumerate(all_job_elements):
                job_detail_page = None
                apply_page = None
                skip_job = False
                final_scrape_url = None
                try:
                    print(f" → Processing job {idx+1}/{len(all_job_elements)}: {title}")

                    posted_date = await self._extract_job_posted_date_from_card(job_element)

                    job_detail_page = await context.new_page()
                    await job_detail_page.goto(href, wait_until='networkidle', timeout=60000)
                    await asyncio.sleep(2 * self.human_speed)

                    # Check for invalid content
                    page_content = await job_detail_page.content()
                    if await self._is_invalid_job_page(page_content):
                        print(" 🚫 Page contains invalid content → skipping.")
                        await job_detail_page.close()
                        continue

                    # Try to click apply
                    apply_clicked = False
                    apply_selectors = [
                        'a[href*="apply"], a:text("Apply"), a:text("Apply Now"), a:text("Apply here")',
                        'button:text("Apply"), button:has-text("Apply")',
                        '[data-testid="apply-button"], [aria-label*="apply"], [role="button"]:has-text("Apply")',
                        'a.btn-apply, .apply-button, .apply-link, a:has-text("Apply")',
                        'a[rel="noopener"]:has-text("Apply")',
                    ]

                    for sel in apply_selectors:
                        apply_elem = await job_detail_page.query_selector(sel)
                        if apply_elem:
                            print(f" 🔗 Found Apply element with selector: {sel}")
                            await self._human_click(job_detail_page, apply_elem, wait_after=True)
                            apply_clicked = True
                            break

                    apply_page = job_detail_page

                    if apply_clicked:
                        await asyncio.sleep(random.uniform(3.0, 6.0) * self.human_speed)
                        pages = context.pages
                        # Exclude the main listing page as well, so a same-tab apply
                        # navigation does not get mistaken for a new tab.
                        new_pages = [p for p in pages if p != job_detail_page and p != page and p.url != "about:blank"]

                        if new_pages:
                            candidate_page = new_pages[-1]
                            new_url = candidate_page.url.strip()
                            print(f" New tab opened: {new_url}")

                            if new_url and await self._is_forbidden_ats_url(new_url):
                                print(" 🚫 New URL is a forbidden ATS → skipping job.")
                                if candidate_page != job_detail_page:
                                    await candidate_page.close()
                                await job_detail_page.close()
                                skip_job = True
                            else:
                                apply_page = candidate_page
                        else:
                            print(" No new tab → using original page.")

                    if skip_job:
                        continue

                    final_scrape_url = apply_page.url

                    # Re-check invalid content on final page
                    page_content = await self._extract_page_content_for_llm(apply_page)
                    if await self._is_invalid_job_page(page_content):
                        print(" 🚫 Final page contains invalid content → skipping.")
                        if apply_page != job_detail_page:
                            await apply_page.close()
                        await job_detail_page.close()
                        continue

                    # Extract job ID, but do NOT fail if missing
                    job_id = self._extract_job_id_from_url(final_scrape_url)
                    if not job_id:
                        # Fallback: hash the URL to create a stable, unique ID
                        job_id = "job_" + hashlib.md5(final_scrape_url.encode()).hexdigest()[:12]

                    raw_data = {
                        "page_content": page_content,
                        "url": final_scrape_url,
                        "job_id": job_id,
                        "search_keywords": search_keywords,
                        "posted_date": posted_date
                    }

                    refined_data = await self.llm_agent.refine_job_data(raw_data, self.user_request)

                    if refined_data and refined_data.get("title", "N/A") != "N/A":
                        compulsory_fields = ['company_name', 'job_id', 'url']
                        for field in compulsory_fields:
                            if not refined_data.get(field) or refined_data[field] in ["N/A", "", "Unknown"]:
                                if field == 'job_id':
                                    refined_data[field] = job_id
                                elif field == 'url':
                                    refined_data[field] = final_scrape_url
                                elif field == 'company_name':
                                    refined_data[field] = "Unknown Company"

                        refined_data['scraped_at'] = datetime.now().isoformat()
                        refined_data['category'] = search_keywords or "all"
                        refined_data['posted_date'] = posted_date
                        await self.llm_agent.save_job_data(refined_data, search_keywords or "all")
                        scraped_count += 1
                        print(f" ✅ Scraped: {refined_data['title'][:50]}... (Job ID: {job_id})")
                        self.engine.report_outcome("success", url=final_scrape_url)
                    else:
                        print(f" 🟡 Could not extract meaningful data from: {final_scrape_url}")
                        await self._add_job_to_redis_cache(final_scrape_url, job_id, "llm_failure")
                        self.engine.report_outcome("llm_failure", url=final_scrape_url)

                    if apply_page != job_detail_page and not apply_page.is_closed():
                        await apply_page.close()
                    if job_detail_page and not job_detail_page.is_closed():
                        await job_detail_page.close()

                except Exception as e:
                    error_msg = str(e)[:100]
                    print(f" ⚠️ Failed on job {idx+1}: {error_msg}")
                    job_id_for_log = "unknown"
                    if final_scrape_url:
                        job_id_for_log = "job_" + hashlib.md5(final_scrape_url.encode()).hexdigest()[:12]
                    await self._add_job_to_redis_cache(href, job_id_for_log, f"exception: {error_msg}")
                    if job_detail_page and not job_detail_page.is_closed():
                        await job_detail_page.close()
                    if apply_page and apply_page != job_detail_page and not apply_page.is_closed():
                        await apply_page.close()
                    continue

            await browser.close()

        if scraped_count > 0:
            self.engine.report_outcome("success")
            print(f"✅ Completed! Processed {scraped_count} jobs for '{search_keywords or 'all jobs'}'.")
        else:
            self.engine.report_outcome("scraping_error")
            print("⚠️ No jobs processed successfully.")
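

# Usage sketch (illustrative only): `engine` must supply the fingerprint and
# spoofing hooks referenced above (_select_profile, common_renderers,
# common_vendors, _get_spoof_script, _human_like_scroll, report_outcome), and a
# Redis server must be reachable on localhost:6379. `StealthEngine` and the
# `stealth_engine` module below are hypothetical names, not part of this module.
if __name__ == "__main__":
    from stealth_engine import StealthEngine  # hypothetical engine implementation

    async def _main():
        scraper = CryptoJobScraper(
            engine=StealthEngine(),
            human_speed=1.0,
            user_request="Extract all standard job details",
        )
        # Keywords follow the "query, location" convention parsed in scrape_jobs.
        await scraper.scrape_jobs("solidity engineer, remote", max_pages=1)

    asyncio.run(_main())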