Add job_scraper.py

2025-11-20 18:59:46 +00:00 · 2025-11-20 18:59:46 +00:00 · f52868edfa
commit f52868edfa
parent 1a216a1aa8
1 changed files with 308 additions and 0 deletions
--- a/job_scraper.py
+++ b/job_scraper.py
@@ -0,0 +1,308 @@
 import asyncio
 import random
 import sqlite3
 import os
 from datetime import datetime
 from typing import Optional, Dict
 from playwright.async_api import async_playwright
 from browserforge.injectors.playwright import AsyncNewContext
 class LinkedInJobScraper:
    def __init__(
        self,
        engine,
        db_path: str = "linkedin_jobs.db",
        human_speed: float = 1.0
    ):
        self.engine = engine
        self.db_path = db_path
        self.human_speed = human_speed
        self._init_db()
    def _init_db(self):
        os.makedirs(os.path.dirname(self.db_path) if os.path.dirname(self.db_path) else ".", exist_ok=True)
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.cursor()
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS jobs (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    keyword TEXT,
                    title TEXT,
                    company TEXT,
                    location TEXT,
                    salary TEXT,
                    description TEXT,
                    url TEXT UNIQUE,
                    workplace_type TEXT,
                    scraped_at DATETIME DEFAULT CURRENT_TIMESTAMP
                )
            ''')
            conn.commit()
    async def _human_click(self, page, element, wait_after: bool = True):
        if not element:
            return False
        await element.scroll_into_view_if_needed()
        await asyncio.sleep(random.uniform(0.3, 0.8) * self.human_speed)
        try:
            await element.click()
            if wait_after:
                await asyncio.sleep(random.uniform(2, 4) * self.human_speed)
            return True
        except:
            return False
    async def _login(self, page, credentials: Dict) -> bool:
        """Human-realistic LinkedIn login"""
        print("🔐 Navigating to LinkedIn login page...")
        await page.goto("https://www.linkedin.com/login", timeout=60000)
        await asyncio.sleep(random.uniform(2.0, 3.5) * self.human_speed)
        email_field = await page.query_selector('input[name="session_key"]')
        if not email_field:
            print("❌ Email field not found.")
            return False
        print("✍️ Typing username...")
        await email_field.click()
        await asyncio.sleep(random.uniform(0.4, 0.9) * self.human_speed)
        for char in credentials["email"]:
            await page.keyboard.type(char)
            await asyncio.sleep(random.uniform(0.06, 0.14) * self.human_speed)
        await asyncio.sleep(random.uniform(1.0, 1.8) * self.human_speed)
        password_field = await page.query_selector('input[name="session_password"]')
        if not password_field:
            print("❌ Password field not found.")
            return False
        print("🔒 Typing password...")
        await password_field.click()
        await asyncio.sleep(random.uniform(0.3, 0.7) * self.human_speed)
        for char in credentials["password"]:
            await page.keyboard.type(char)
            await asyncio.sleep(random.uniform(0.08, 0.16) * self.human_speed)
        await asyncio.sleep(random.uniform(0.8, 1.5) * self.human_speed)
        print("✅ Submitting login form...")
        await page.keyboard.press("Enter")
        for _ in range(15):
            current_url = page.url
            if "/feed" in current_url or "/jobs" in current_url:
                if "login" not in current_url:
                    print("✅ Login successful!")
                    await asyncio.sleep(random.uniform(2.0, 3.0) * self.human_speed)
                    return True
            await asyncio.sleep(1)
        print("❌ Login may have failed.")
        return False
    async def _extract_job_details(self, page) -> Dict:
        """Extract from ANY job page: LinkedIn Easy Apply OR external site"""
        await asyncio.sleep(2 * self.human_speed)
        async def get_text(selector: str) -> str:
            try:
                el = await page.query_selector(selector)
                if el:
                    text = await el.inner_text()
                    return text.strip() if text else "N/A"
            except:
                pass
            return "N/A"
        # Try multiple strategies for each field
        title = await get_text("h1.t-24")  # LinkedIn
        if title == "N/A":
            title = await get_text("h1, h2")  # External
        company = await get_text("a.app-aware-link[href*='/company/']")  # LinkedIn
        if company == "N/A":
            company = await get_text("div.org, .company, [class*='company']")  # External
        location = await get_text("span[class*='location']")  # LinkedIn
        if location == "N/A":
            location = await get_text(".location, [class*='location']")
        description = await get_text("div[class*='description__text']")  # LinkedIn
        if description == "N/A":
            description = await get_text(".job-desc, .description, main, body")
        # Workplace & salary — LinkedIn only (external may not have)
        workplace = await get_text("span.job-workplace-type") or "N/A"
        salary = await get_text("span.salary") or "N/A"
        return {
            "title": title,
            "company": company,
            "location": location,
            "workplace_type": workplace,
            "salary": salary,
            "description": description,
            "url": page.url
        }
    async def _save_to_markdown(self, job_data: Dict, keyword: str):
        os.makedirs("linkedin_jobs", exist_ok=True)
        clean_keyword = keyword.replace(" ", "_")
        filename = f"linkedin_{clean_keyword}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md"
        filepath = os.path.join("linkedin_jobs", filename)
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(f"# {job_data['title']}\n\n")
            f.write(f"- **Company**: {job_data['company']}\n")
            f.write(f"- **Location**: {job_data['location']}\n")
            f.write(f"- **Workplace**: {job_data['workplace_type']}\n")
            f.write(f"- **Salary**: {job_data['salary']}\n")
            f.write(f"- **URL**: <{job_data['url']}>\n\n")
            f.write(f"## Description\n\n{job_data['description']}\n")
    async def _save_to_db(self, job_data: Dict, keyword: str):
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.cursor()
            cursor.execute('''
                INSERT OR IGNORE INTO jobs 
                (keyword, title, company, location, salary, description, url, workplace_type)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            ''', (
                keyword,
                job_data["title"],
                job_data["company"],
                job_data["location"],
                job_data["salary"],
                job_data["description"],
                job_data["url"],
                job_data["workplace_type"]
            ))
            conn.commit()
    async def scrape_jobs(
        self,
        search_keywords: str,
        max_pages: int = 1,
        credentials: Optional[Dict] = None
    ):
        """Search LinkedIn jobs for a keyword and store every job opened.

        Flow: launch a fingerprinted Chromium, optionally log in, collect job
        links from up to ``max_pages`` result pages, then for each link open
        the job, click its Apply button, extract whatever page is shown, and
        save the result to SQLite and Markdown.

        Args:
            search_keywords: Free-text search query.
            max_pages: Maximum number of result pages to collect links from.
            credentials: Optional dict with ``"email"`` and ``"password"``;
                when omitted the search runs as a guest.

        Returns:
            None. Returns early (after closing the browser) if login fails.

        Note:
            ``self.engine`` must provide ``_select_profile()``,
            ``common_renderers``, ``os``, ``common_vendors`` and
            ``_get_spoof_script()``, which are used below.
        """
        # Local import keeps this method self-contained.
        from urllib.parse import quote

        # Percent-encode every reserved character (not only spaces) so that
        # keywords such as "C++" or "R&D" cannot corrupt the query string.
        encoded_keywords = quote(search_keywords, safe="")
        search_url = f"https://www.linkedin.com/jobs/search/?keywords={encoded_keywords}"
        profile = self.engine._select_profile()
        renderer = random.choice(self.engine.common_renderers[self.engine.os])
        vendor = random.choice(self.engine.common_vendors)
        spoof_script = self.engine._get_spoof_script(renderer, vendor)
        async with async_playwright() as pw:
            browser = await pw.chromium.launch(
                headless=False,
                args=['--disable-blink-features=AutomationControlled']
            )
            # try/finally guarantees the browser is closed even when a
            # navigation or setup step raises.
            try:
                context = await AsyncNewContext(browser, fingerprint=profile)
                await context.add_init_script(f"""
                Object.defineProperty(navigator, 'hardwareConcurrency', {{ get: () => {profile.navigator.hardwareConcurrency} }});
                Object.defineProperty(navigator, 'deviceMemory', {{ get: () => {profile.navigator.deviceMemory} }});
                Object.defineProperty(navigator, 'platform', {{ get: () => '{profile.navigator.platform}' }});
            """)
                await context.add_init_script(spoof_script)
                page = await context.new_page()
                if credentials:
                    print("🔐 Attempting LinkedIn login...")
                    if not await self._login(page, credentials):
                        print("❌ Login failed. Exiting.")
                        return
                else:
                    print("ℹ️ No credentials — proceeding as guest.")
                print(f"🔍 Searching for: {search_keywords}")
                await page.goto(search_url, wait_until='load', timeout=60000)
                await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed)
                scraped_count = 0
                all_job_links = []
                # Set gives O(1) duplicate checks; the list keeps discovery order.
                seen_links = set()
                # Collect job links
                for page_num in range(1, max_pages + 1):
                    print(f"📄 Collecting job links from page {page_num}/{max_pages}")
                    new_on_page = 0
                    # Poll up to 50 times, one second apart, for results to render.
                    for _ in range(50):
                        links = await page.query_selector_all("a[href*='/jobs/view/']")
                        if links:
                            for link in links:
                                href = await link.get_attribute("href")
                                if href and href not in seen_links:
                                    seen_links.add(href)
                                    all_job_links.append(href)
                                    new_on_page += 1
                            break
                        await asyncio.sleep(1)
                    print(f"   ➕ Found {new_on_page} new job links.")
                    if page_num < max_pages:
                        next_btn = await page.query_selector("button[aria-label='Next']")
                        if next_btn and await next_btn.is_enabled():
                            await self._human_click(page, next_btn)
                            await asyncio.sleep(4 * self.human_speed)
                        else:
                            print("🔚 No next page.")
                            break
                # Process each job
                apply_selectors = [
                    "button[aria-label*='Apply']",
                    "button:has-text('Apply')",
                    "a:has-text('Apply')",
                    "button:has-text('Easy Apply')"
                ]
                for idx, href in enumerate(all_job_links):
                    try:
                        full_url = href if href.startswith("http") else f"https://www.linkedin.com{href}"
                        print(f"  → Opening job {idx+1}: {full_url}")
                        await page.goto(full_url, wait_until='load', timeout=60000)
                        await asyncio.sleep(3 * self.human_speed)
                        if not await page.query_selector("h1.t-24"):
                            print("    ⚠️ Invalid job page, skipping.")
                            continue
                        # Find the main "Apply" button: first selector that matches.
                        apply_btn = None
                        for selector in apply_selectors:
                            apply_btn = await page.query_selector(selector)
                            if apply_btn:
                                break
                        if not apply_btn:
                            print("    ⚠️ No 'Apply' button found, skipping.")
                            continue
                        # Click "Apply"
                        print("  → Clicking 'Apply' / 'Easy Apply' button...")
                        await self._human_click(page, apply_btn, wait_after=False)
                        await asyncio.sleep(4 * self.human_speed)  # Wait for next page/form to load
                        # Now scrape WHATEVER page is displayed (Easy Apply form OR external site)
                        job_data = await self._extract_job_details(page)
                        if job_data["title"] == "N/A" and "linkedin.com" in page.url:
                            # On LinkedIn but no title → likely Easy Apply form; use job ID as title.
                            # Take the path segment right after /jobs/view/, with or without
                            # a trailing slash or query string.
                            _, _, tail = full_url.partition("/jobs/view/")
                            job_id = tail.split("/")[0].split("?")[0] or "unknown"
                            job_data["title"] = f"Easy Apply Job - ID {job_id}"
                        await self._save_to_db(job_data, search_keywords)
                        await self._save_to_markdown(job_data, search_keywords)
                        scraped_count += 1
                        domain = "LinkedIn (Easy Apply)" if "linkedin.com" in page.url else "External Site"
                        print(f"  ✅ Scraped ({domain}): {job_data['title'][:50]}...")
                    except Exception as e:
                        print(f"    ⚠️ Failed on job {idx+1}: {str(e)[:100]}")
                    finally:
                        # Return to search results; a failed navigation here must
                        # not abort the remaining jobs.
                        print("  ↩️ Returning to LinkedIn search results...")
                        try:
                            await page.goto(search_url, timeout=60000)
                        except Exception as nav_err:
                            print(f"    ⚠️ Could not return to search results: {str(nav_err)[:100]}")
                        await asyncio.sleep(4 * self.human_speed)
            finally:
                await browser.close()
            print(f"✅ Completed! Scraped {scraped_count} job pages (internal + external) for '{search_keywords}'.")