Web_scraping_project/job_scraper2.py

import asyncio
import random
import sqlite3
import os
from datetime import datetime
from typing import Optional, Dict, List
from playwright.async_api import async_playwright
from browserforge.injectors.playwright import AsyncNewContext
from llm_agent import LLMJobRefiner
import re


class LinkedInJobScraper:
    def __init__(
        self,
        engine,
        db_path: str = "linkedin_jobs.db",
        human_speed: float = 1.0,
        target_field: str = "all"
    ):
        self.engine = engine
        self.db_path = db_path
        self.human_speed = human_speed
        self.target_field = target_field
        self._init_db()
        self.llm_agent = LLMJobRefiner()
    def _init_db(self):
        """Create the SQLite database and the jobs table if they do not exist."""
        os.makedirs(os.path.dirname(self.db_path) or ".", exist_ok=True)
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.cursor()
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS jobs (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    title TEXT,
                    company_name TEXT,
                    location TEXT,
                    description TEXT,
                    requirements TEXT,
                    qualifications TEXT,
                    salary_range TEXT,
                    nature_of_work TEXT,
                    job_id TEXT,
                    url TEXT UNIQUE
                )
            ''')
            conn.commit()
    async def _human_click(self, page, element, wait_after: bool = True):
        """Scroll an element into view and click it with human-like pauses."""
        if not element:
            return False
        await element.scroll_into_view_if_needed()
        await asyncio.sleep(random.uniform(0.3, 0.8) * self.human_speed)
        try:
            await element.click()
            if wait_after:
                await asyncio.sleep(random.uniform(2, 4) * self.human_speed)
            return True
        except Exception:
            return False
    async def _login(self, page, credentials: Dict) -> bool:
        """Human-realistic LinkedIn login"""
        print("🔐 Navigating to LinkedIn login page...")
        await page.goto("https://www.linkedin.com/login", timeout=60000)
        await asyncio.sleep(random.uniform(2.0, 3.5) * self.human_speed)

        email_field = await page.query_selector('input[name="session_key"]')
        if not email_field:
            print("❌ Email field not found.")
            return False
        print("✍️ Typing username...")
        await email_field.click()
        await asyncio.sleep(random.uniform(0.4, 0.9) * self.human_speed)
        for char in credentials["email"]:
            await page.keyboard.type(char)
            await asyncio.sleep(random.uniform(0.06, 0.14) * self.human_speed)
        await asyncio.sleep(random.uniform(1.0, 1.8) * self.human_speed)

        password_field = await page.query_selector('input[name="session_password"]')
        if not password_field:
            print("❌ Password field not found.")
            return False
        print("🔒 Typing password...")
        await password_field.click()
        await asyncio.sleep(random.uniform(0.3, 0.7) * self.human_speed)
        for char in credentials["password"]:
            await page.keyboard.type(char)
            await asyncio.sleep(random.uniform(0.08, 0.16) * self.human_speed)
        await asyncio.sleep(random.uniform(0.8, 1.5) * self.human_speed)

        print("✅ Submitting login form...")
        await page.keyboard.press("Enter")

        # Poll the URL for up to ~15 seconds waiting for the post-login redirect.
        for _ in range(15):
            current_url = page.url
            if "/feed" in current_url or "/jobs" in current_url:
                if "login" not in current_url:
                    print("✅ Login successful!")
                    await asyncio.sleep(random.uniform(2.0, 3.0) * self.human_speed)
                    return True
            await asyncio.sleep(1)
        print("❌ Login may have failed.")
        return False
    async def _extract_all_page_content(self, page) -> str:
        """Extract all content from the job page"""
        await asyncio.sleep(2 * self.human_speed)
        # Human-like scrolling to load all content
        await self.engine._human_like_scroll(page)
        await asyncio.sleep(2 * self.human_speed)
        # Get the full page content
        page_content = await page.content()
        return page_content
    def _calculate_keyword_match(self, title: str, keywords: str) -> float:
        """Return the fraction of search keywords that appear in the job title.

        Example: title "Senior Data Engineer" against keywords "data engineer"
        matches 2 of 2 keywords and returns 1.0.
        """
        if not title or not keywords:
            return 0.0
        title_lower = title.lower()
        keyword_list = [kw.strip().lower() for kw in keywords.split()]
        matches = 0
        for keyword in keyword_list:
            if keyword in title_lower:
                matches += 1
        return matches / len(keyword_list) if keyword_list else 0.0
    def _extract_location_from_keywords(self, search_keywords: str) -> str:
        """Extract a location from the search keywords if one is present.

        Example: "python developer location: Lagos" yields "lagos".
        """
        location_match = re.search(r'location:\s*([^,]+)', search_keywords, re.IGNORECASE)
        if location_match:
            return location_match.group(1).strip().lower()
        return ""
    async def _scrape_jobs_from_current_page(self, page, search_keywords: str, seen_job_ids, all_job_links):
        """Scrape job links from the current page that match keywords and location"""
        current_links = await page.query_selector_all("a[href*='/jobs/view/']")
        new_jobs = 0
        # Extract location from search keywords
        location_from_keywords = self._extract_location_from_keywords(search_keywords)
        for link in current_links:
            href = await link.get_attribute("href")
            if href:
                full_url = href if href.startswith("http") else f"https://www.linkedin.com{href}"
                job_id = href.split("/view/")[-1].split("/")[0] if "/view/" in href else href
                if job_id and job_id not in seen_job_ids:
                    # Check if job title matches keywords (at least 70% match)
                    title_element = await link.query_selector("span.job-title, h3, .job-card-title")
                    if title_element:
                        title = await title_element.inner_text()
                        match_percentage = self._calculate_keyword_match(title, search_keywords)
                        # Check if location matches (if specified in keywords)
                        location_match = True
                        if location_from_keywords:
                            # Try to get location from the job card
                            location_element = await link.query_selector("span.job-location, .job-card-location, .location")
                            if location_element:
                                location_text = await location_element.inner_text()
                                location_match = location_from_keywords in location_text.lower()
                        if match_percentage >= 0.7 and location_match:  # At least 70% match and location matches
                            seen_job_ids.add(job_id)
                            all_job_links.append((href, title))
                            new_jobs += 1
                        elif match_percentage < 0.7:
                            print(f" ⚠️ Skipping job due to low keyword match: {title[:50]}... (match: {match_percentage:.2%})")
                        elif not location_match:
                            print(f" ⚠️ Skipping job due to location mismatch: {title[:50]}... (expected: {location_from_keywords})")
                    else:
                        # If no title element, still add to check later
                        seen_job_ids.add(job_id)
                        all_job_links.append((href, "Unknown Title"))
                        new_jobs += 1
        return new_jobs
    async def _handle_pagination(self, page, search_keywords: str, seen_job_ids, all_job_links):
        """Handle pagination by going through pages"""
        current_page = 1
        while True:
            print(f"📄 Processing page {current_page}")
            # Collect job links on current page
            new_jobs = await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links)
            print(f" Found {new_jobs} new job(s) on page {current_page} (total: {len(all_job_links)})")
            # Try to go to next page
            next_btn = await page.query_selector("button[aria-label='Next']")
            if next_btn and await next_btn.is_enabled():
                await self._human_click(page, next_btn)
                await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed)
                # Wait for URL to change or new content
                try:
                    await page.wait_for_function("() => window.location.href.includes('start=')", timeout=60000)
                except Exception:
                    pass
                current_page += 1
            else:
                print("🔚 'Next' button not available — stopping pagination.")
                break
    async def _handle_infinite_scroll(self, page, search_keywords: str, seen_job_ids, all_job_links):
        """Handle infinite scroll to load more jobs"""
        last_height = await page.evaluate("document.body.scrollHeight")
        no_new_jobs_count = 0
        max_no_new = 3
        while no_new_jobs_count < max_no_new:
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)
            new_jobs_found = await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links)
            print(f" Found {new_jobs_found} new job(s) (total: {len(all_job_links)})")
            new_height = await page.evaluate("document.body.scrollHeight")
            if new_height == last_height:
                no_new_jobs_count += 1
            else:
                no_new_jobs_count = 0
                last_height = new_height
            if new_jobs_found == 0 and no_new_jobs_count >= 1:
                print("🔚 No new jobs loaded. Stopping scroll.")
                break
    async def scrape_jobs(
        self,
        search_keywords: Optional[str],
        max_pages: int = 1,
        credentials: Optional[Dict] = None
    ):
        """Search LinkedIn for the given keywords and scrape every matching job posting."""
        search_keywords = search_keywords or ""  # guard against a None value for the Optional parameter

        # Parse location from keywords if present
        location_match = re.search(r'location:\s*([^,]+)', search_keywords, re.IGNORECASE)
        location = location_match.group(1).strip() if location_match else ""
        # Remove location part from keywords for search
        clean_keywords = re.sub(r'location:\s*[^,]+', '', search_keywords, flags=re.IGNORECASE).strip()
        encoded_keywords = clean_keywords.replace(" ", "%20")
        # Build search URL with location if specified
        search_url = f"https://www.linkedin.com/jobs/search/?keywords={encoded_keywords}"
        if location:
            search_url += f"&location={location.replace(' ', '%20')}"

        profile = self.engine._select_profile()
        renderer = random.choice(self.engine.common_renderers[self.engine.os])
        vendor = random.choice(self.engine.common_vendors)
        spoof_script = self.engine._get_spoof_script(renderer, vendor)
        async with async_playwright() as pw:
            browser = await pw.chromium.launch(
                headless=False,
                args=['--disable-blink-features=AutomationControlled']
            )
            context = await AsyncNewContext(browser, fingerprint=profile)
            await context.add_init_script(f"""
                Object.defineProperty(navigator, 'hardwareConcurrency', {{ get: () => {profile.navigator.hardwareConcurrency} }});
                Object.defineProperty(navigator, 'deviceMemory', {{ get: () => {profile.navigator.deviceMemory} }});
                Object.defineProperty(navigator, 'platform', {{ get: () => '{profile.navigator.platform}' }});
            """)
            await context.add_init_script(spoof_script)
            page = await context.new_page()

            session_loaded = await self.engine.load_session(context)
            login_successful = False
            if session_loaded:
                print("🔁 Using saved session — verifying login...")
                await page.goto("https://www.linkedin.com/feed/", timeout=60000)
                if "feed" in page.url and "login" not in page.url:
                    print("✅ Session still valid.")
                    login_successful = True
                else:
                    print("⚠️ Saved session expired — re-authenticating.")
                    session_loaded = False
            if not session_loaded and credentials:
                print("🔐 Performing fresh login...")
                login_successful = await self._login(page, credentials)
                if login_successful:
                    await self.engine.save_session(context)
                else:
                    print("❌ Login failed. Exiting.")
                    await browser.close()
                    self.engine.report_outcome("block")
                    return
            elif not credentials:
                print("No credentials — proceeding as guest.")
                login_successful = True

            await page.wait_for_load_state("load", timeout=60000)
            print("✅ Post-login page fully loaded. Starting search...")
            if await self.engine._detect_cloudflare(page):
                print("☁️ Cloudflare detected on initial load.")
                if not await self.engine._handle_cloudflare(page):
                    print("❌ Cloudflare could not be resolved.")
                    await browser.close()
                    self.engine.report_outcome("cloudflare")
                    return

            print(f"🔍 Searching for: {search_keywords}")
            await page.goto(search_url, wait_until='load', timeout=60000)
            await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed)
            if await self.engine._detect_cloudflare(page):
                print("☁️ Cloudflare detected on search page.")
                if not await self.engine._handle_cloudflare(page):
                    await browser.close()
                    self.engine.report_outcome("cloudflare")
                    return
            all_job_links = []
            seen_job_ids = set()

            # First, scrape the initial page
            print("🔄 Collecting initial job links...")
            initial_jobs = await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links)
            print(f" Found {initial_jobs} initial job(s) (total: {len(all_job_links)})")

            # Loop until no new jobs are found, capped at 10 iterations to prevent infinite loops
            iteration = 1
            while True:
                if iteration > 10:
                    print("🔄 Maximum iterations reached. Stopping.")
                    break
                print(f"🔄 Iteration {iteration}: Checking for new jobs...")

                # First try infinite scroll
                prev_job_count = len(all_job_links)
                await self._handle_infinite_scroll(page, search_keywords, seen_job_ids, all_job_links)
                new_jobs_count = len(all_job_links) - prev_job_count
                if new_jobs_count > 0:
                    print(f" Found {new_jobs_count} new jobs via infinite scroll")
                    iteration += 1
                    continue  # Continue with infinite scroll if new jobs found

                # If no new jobs via scroll, check for pagination
                pagination_exists = await page.query_selector("button[aria-label='Next']")
                if pagination_exists:
                    print("⏭️ Pagination detected. Processing pages...")
                    await self._handle_pagination(page, search_keywords, seen_job_ids, all_job_links)
                    iteration += 1
                    continue  # Continue with pagination if new jobs found
                else:
                    # If no pagination and no new jobs from scroll, check by refreshing
                    print("🔄 Refreshing page to check for new results...")
                    await page.reload(wait_until='load')
                    await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)
                    # Check for new jobs after refresh
                    new_jobs_after_refresh = await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links)
                    if new_jobs_after_refresh > 0:
                        print(f" Found {new_jobs_after_refresh} new job(s) after refresh")
                        iteration += 1
                        continue  # Continue if new jobs found after refresh
                    else:
                        print("🔚 No new jobs found after refresh. Stopping.")
                        break

            print(f"✅ Collected {len(all_job_links)} unique job links.")
            # Process all collected job links
            scraped_count = 0
            for idx, (href, title) in enumerate(all_job_links):
                try:
                    full_url = href if href.startswith("http") else f"https://www.linkedin.com{href}"
                    print(f" → Opening job {idx+1}/{len(all_job_links)}: {full_url}")
                    await page.goto(full_url, wait_until='load', timeout=60000)
                    await asyncio.sleep(3 * self.human_speed)

                    is_cloudflare = await self.engine._detect_cloudflare(page)
                    page_content = await page.content()
                    has_captcha_text = "captcha" in page_content.lower()
                    captcha_present = is_cloudflare or has_captcha_text
                    title_element = await page.query_selector("h1.t-24")
                    job_data_accessible = title_element is not None

                    if captcha_present:
                        if job_data_accessible:
                            print(" ⚠️ CAPTCHA detected, but job data is accessible. Proceeding in stealth mode...")
                            await self.engine._avoid_captcha(page)
                        else:
                            print(" ⚠️ CAPTCHA detected and job data blocked. Attempting recovery...")
                            if not await self.engine._solve_captcha_fallback(page):
                                print(" ❌ CAPTCHA recovery failed. Skipping job.")
                                continue
                            title_element = await page.query_selector("h1.t-24")
                            if not title_element:
                                print(" ❌ Job data still unavailable after CAPTCHA handling. Skipping.")
                                continue
                    if not captcha_present:
                        await self.engine._avoid_captcha(page)

                    apply_btn = None
                    apply_selectors = [
                        "button[aria-label*='Apply']",
                        "button:has-text('Apply')",
                        "a:has-text('Apply')",
                        "button:has-text('Easy Apply')"
                    ]
                    for selector in apply_selectors:
                        apply_btn = await page.query_selector(selector)
                        if apply_btn:
                            break

                    page_data = None
                    final_url = full_url
                    if apply_btn:
                        print(" → Clicking 'Apply' / 'Easy Apply' button...")
                        page_waiter = asyncio.create_task(context.wait_for_event("page"))
                        await self._human_click(page, apply_btn, wait_after=False)
                        external_page = None
                        try:
                            external_page = await asyncio.wait_for(page_waiter, timeout=5.0)
                            print(" 🌐 External job site opened in new tab.")
                            await external_page.wait_for_load_state("load", timeout=60000)
                            await asyncio.sleep(2 * self.human_speed)
                            await self.engine._human_like_scroll(external_page)
                            await asyncio.sleep(2 * self.human_speed)
                            page_data = await self._extract_all_page_content(external_page)
                            final_url = external_page.url
                            if not external_page.is_closed():
                                await external_page.close()
                        except asyncio.TimeoutError:
                            print(" 🖥️ No external tab — scraping LinkedIn job page.")
                            await page.wait_for_timeout(2000)
                            try:
                                await page.wait_for_selector("div.jobs-apply-button--fixed, div.jobs-easy-apply-modal", timeout=8000)
                            except Exception:
                                pass
                            await self.engine._human_like_scroll(page)
                            await asyncio.sleep(2 * self.human_speed)
                            page_data = await self._extract_all_page_content(page)
                            final_url = page.url
                    else:
                        print(" ⚠️ No 'Apply' button found — scraping job details directly.")
                        await self.engine._human_like_scroll(page)
                        await asyncio.sleep(2 * self.human_speed)
                        page_data = await self._extract_all_page_content(page)
                        final_url = page.url

                    # Extract job ID from URL, ignoring any trailing slash or query string
                    if "/jobs/view/" in final_url:
                        job_id = final_url.split("/jobs/view/")[-1].split("?")[0].strip("/").split("/")[0]
                    else:
                        job_id = "unknown"

                    # Prepare raw data for LLM processing
                    raw_data = {
                        "page_content": page_data,
                        "url": final_url,
                        "job_id": job_id
                    }
                    # Send raw data to LLM agent for refinement
                    refined_data = await self.llm_agent.refine_job_data(raw_data, search_keywords)
                    # Only save if LLM successfully extracted meaningful data
                    if refined_data and refined_data.get("title", "N/A") != "N/A":
                        # Save refined data to markdown and database through LLM agent
                        await self.llm_agent.save_job_data(refined_data, search_keywords)
                        scraped_count += 1
                        print(f" ✅ Scraped and refined: {refined_data['title'][:50]}...")
                    else:
                        print(f" 🟡 Could not extract meaningful data from: {final_url}")
                except Exception as e:
                    print(f" ⚠️ Failed on job {idx+1}: {str(e)[:100]}")
                    continue
                finally:
                    print(" ↩️ Returning to LinkedIn search results...")
                    await page.goto(search_url, timeout=60000)
                    await asyncio.sleep(4 * self.human_speed)
            await browser.close()

        if scraped_count > 0:
            self.engine.report_outcome("success")
            print(f"✅ Completed! Processed {scraped_count} jobs for '{search_keywords}'.")
        else:
            self.engine.report_outcome("captcha")
            print("⚠️ No jobs processed successfully.")