Delete job_scraper2.py

This commit is contained in:
parent 7e80801f89
commit 0942339426

job_scraper2.py: 468 deletions

@@ -1,468 +0,0 @@
import asyncio
import random
from typing import Optional, Dict
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
from browserforge.injectors.playwright import AsyncNewContext
from llm_agent import LLMJobRefiner
import re
from fetcher import StealthyFetcher
from datetime import datetime


class LinkedInJobScraper:
    def __init__(
        self,
        engine,
        db_path: str = "linkedin_jobs.db",
        human_speed: float = 1.0,
        user_request: str = "Extract all standard job details"
    ):
        self.engine = engine
        self.db_path = db_path
        self.human_speed = human_speed
        self.user_request = user_request
        self._init_db()
        self.llm_agent = LLMJobRefiner()

    def _init_db(self):
        # This method is kept for backward compatibility but LLMJobRefiner handles PostgreSQL now
        pass

    async def _human_click(self, page, element, wait_after: bool = True):
        if not element:
            return False
        await element.scroll_into_view_if_needed()
        await asyncio.sleep(random.uniform(0.3, 0.8) * self.human_speed)
        try:
            await element.click()
            if wait_after:
                await asyncio.sleep(random.uniform(2, 4) * self.human_speed)
            return True
        except Exception:
            return False

    async def _login(self, page, credentials: Dict) -> bool:
        print("🔐 Navigating to LinkedIn login page...")
        await page.goto("https://www.linkedin.com/login", timeout=120000)
        await asyncio.sleep(random.uniform(2.0, 3.5) * self.human_speed)

        email_field = await page.query_selector('input[name="session_key"]')
        if not email_field:
            print("❌ Email field not found.")
            return False

        print("✍️ Typing username...")
        await email_field.click()
        await asyncio.sleep(random.uniform(0.4, 0.9) * self.human_speed)
        for char in credentials["email"]:
            await page.keyboard.type(char)
            await asyncio.sleep(random.uniform(0.06, 0.14) * self.human_speed)
        await asyncio.sleep(random.uniform(1.0, 1.8) * self.human_speed)

        password_field = await page.query_selector('input[name="session_password"]')
        if not password_field:
            print("❌ Password field not found.")
            return False

        print("🔒 Typing password...")
        await password_field.click()
        await asyncio.sleep(random.uniform(0.3, 0.7) * self.human_speed)
        for char in credentials["password"]:
            await page.keyboard.type(char)
            await asyncio.sleep(random.uniform(0.08, 0.16) * self.human_speed)
        await asyncio.sleep(random.uniform(0.8, 1.5) * self.human_speed)

        print("✅ Submitting login form...")
        await page.keyboard.press("Enter")

        for _ in range(15):
            current_url = page.url
            if "/feed" in current_url or "/jobs" in current_url:
                if "login" not in current_url:
                    print("✅ Login successful!")
                    await asyncio.sleep(random.uniform(2.0, 3.0) * self.human_speed)
                    return True
            await asyncio.sleep(1)
        print("❌ Login may have failed.")
        return False
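    # `credentials` is expected to be a dict of the form {"email": "...", "password": "..."}
    # (it is passed through unchanged from scrape_jobs(credentials=...)).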
    async def _extract_page_content_for_llm(self, page) -> str:
        """
        Extract raw page content as HTML/text for LLM processing.
        The LLM will handle all extraction logic, not specific selectors.
        """
        await asyncio.sleep(2 * self.human_speed)
        await self.engine._human_like_scroll(page)
        await asyncio.sleep(2 * self.human_speed)
        page_content = await page.content()
        return page_content

    def _calculate_keyword_match(self, title: str, keywords: str) -> float:
        if not title or not keywords:
            return 0.0
        title_lower = title.lower()
        keyword_list = [kw.strip().lower() for kw in keywords.split()]
        matches = sum(1 for kw in keyword_list if kw in title_lower)
        return matches / len(keyword_list) if keyword_list else 0.0

    def _extract_location_from_keywords(self, search_keywords: str) -> str:
        location_match = re.search(r'location:\s*([^,]+)', search_keywords, re.IGNORECASE)
        return location_match.group(1).strip().lower() if location_match else ""
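    # Illustrative behaviour of the two helpers above (hypothetical inputs):
    #   _calculate_keyword_match("Senior Python Developer", "python developer")  -> 1.0
    #   _calculate_keyword_match("Data Analyst", "python developer")             -> 0.0
    #   _extract_location_from_keywords("python developer location: New York")  -> "new york"
    #   _extract_location_from_keywords("python developer")                     -> ""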
    async def _scrape_jobs_from_current_page(self, page, search_keywords: str, seen_job_ids, all_job_links):
        current_links = await page.query_selector_all("a[href*='/jobs/view/']")
        new_jobs = 0
        location_from_keywords = self._extract_location_from_keywords(search_keywords)

        for link in current_links:
            href = await link.get_attribute("href")
            if href:
                full_url = href if href.startswith("http") else f"https://www.linkedin.com{href}"
                job_id = href.split("/view/")[-1].split("/")[0] if "/view/" in href else href

                if job_id and job_id not in seen_job_ids:
                    title_element = await link.query_selector("span.job-title, h3, .job-card-title")
                    if title_element:
                        title = await title_element.inner_text()
                        match_percentage = self._calculate_keyword_match(title, search_keywords)
                        location_match = True
                        if location_from_keywords:
                            location_element = await link.query_selector("span.job-location, .job-card-location, .location")
                            if location_element:
                                location_text = await location_element.inner_text()
                                location_match = location_from_keywords in location_text.lower()

                        if match_percentage >= 0.7 and location_match:
                            seen_job_ids.add(job_id)
                            all_job_links.append((href, title))
                            new_jobs += 1
                        elif match_percentage < 0.7:
                            print(f" ⚠️ Skipping job due to low keyword match: {title[:50]}... (match: {match_percentage:.2%})")
                        elif not location_match:
                            print(f" ⚠️ Skipping job due to location mismatch: {title[:50]}... (expected: {location_from_keywords})")
                    else:
                        seen_job_ids.add(job_id)
                        all_job_links.append((href, "Unknown Title"))
                        new_jobs += 1
        return new_jobs

    async def _handle_pagination(self, page, search_keywords: str, seen_job_ids, all_job_links):
        current_page = 1
        while True:
            print(f"📄 Processing page {current_page}")
            new_jobs = await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links)
            print(f" ➕ Found {new_jobs} new job(s) on page {current_page} (total: {len(all_job_links)})")

            next_btn = await page.query_selector("button[aria-label='Next']")
            if next_btn and await next_btn.is_enabled():
                await self._human_click(page, next_btn)
                await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed)
                try:
                    await page.wait_for_function("() => window.location.href.includes('start=')", timeout=120000)
                except Exception:
                    pass
                current_page += 1
            else:
                print("🔚 'Next' button not available — stopping pagination.")
                break

    async def _handle_infinite_scroll(self, page, search_keywords: str, seen_job_ids, all_job_links):
        last_height = await page.evaluate("document.body.scrollHeight")
        no_new_jobs_count = 0
        max_no_new = 3

        while no_new_jobs_count < max_no_new:
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)

            new_jobs_found = await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links)
            print(f" ➕ Found {new_jobs_found} new job(s) (total: {len(all_job_links)})")

            new_height = await page.evaluate("document.body.scrollHeight")
            if new_height == last_height:
                no_new_jobs_count += 1
            else:
                no_new_jobs_count = 0
                last_height = new_height

            if new_jobs_found == 0 and no_new_jobs_count >= 1:
                print("🔚 No new jobs loaded. Stopping scroll.")
                break

    async def scrape_jobs(
        self,
        search_keywords: Optional[str],
        max_pages: int = 1,
        credentials: Optional[Dict] = None
    ):
        location_match = re.search(r'location:\s*([^,]+)', search_keywords, re.IGNORECASE)
        location = location_match.group(1).strip() if location_match else ""
        clean_keywords = re.sub(r'location:\s*[^,]+', '', search_keywords, flags=re.IGNORECASE).strip()
        encoded_keywords = clean_keywords.replace(" ", "%20")

        search_url = f"https://www.linkedin.com/jobs/search/?keywords={encoded_keywords}"
        if location:
            search_url += f"&location={location.replace(' ', '%20')}"
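        # Illustrative example (hypothetical input): search_keywords = "data engineer location: Berlin"
        # gives clean_keywords = "data engineer" and
        # search_url = "https://www.linkedin.com/jobs/search/?keywords=data%20engineer&location=Berlin".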
        profile = self.engine._select_profile()
        renderer = random.choice(self.engine.common_renderers[self.engine.os])
        vendor = random.choice(self.engine.common_vendors)
        spoof_script = self.engine._get_spoof_script(renderer, vendor)

        async with async_playwright() as pw:
            browser = await pw.chromium.launch(
                headless=False,
                args=['--disable-blink-features=AutomationControlled']
            )
            context = await AsyncNewContext(browser, fingerprint=profile)

            await context.add_init_script(f"""
                Object.defineProperty(navigator, 'hardwareConcurrency', {{ get: () => {profile.navigator.hardwareConcurrency} }});
                Object.defineProperty(navigator, 'deviceMemory', {{ get: () => {profile.navigator.deviceMemory} }});
                Object.defineProperty(navigator, 'platform', {{ get: () => '{profile.navigator.platform}' }});
            """)
            await context.add_init_script(spoof_script)

            page = await context.new_page()
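            # Illustrative check (optional): with the init scripts above,
            # `await page.evaluate("navigator.hardwareConcurrency")` should report the fingerprint
            # profile's value rather than the host machine's.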
            # Create a temporary fetcher for protection checks on the main page
            temp_fetcher = StealthyFetcher(self.engine, browser, context)

            session_loaded = await self.engine.load_session(context)
            login_successful = False

            if session_loaded:
                print("🔁 Using saved session — verifying login...")
                await page.goto("https://www.linkedin.com/feed/", timeout=120000)
                if "feed" in page.url and "login" not in page.url:
                    print("✅ Session still valid.")
                    login_successful = True
                else:
                    print("⚠️ Saved session expired — re-authenticating.")
                    session_loaded = False

            if not session_loaded and credentials:
                print("🔐 Performing fresh login...")
                login_successful = await self._login(page, credentials)
                if login_successful:
                    await self.engine.save_session(context)
                else:
                    print("❌ Login failed. Exiting.")
                    await browser.close()
                    self.engine.report_outcome("block")
                    return
            elif not credentials:
                print("ℹ️ No credentials — proceeding as guest.")
                login_successful = True

            await page.wait_for_load_state("load", timeout=120000)
            print("✅ Post-login page fully loaded. Starting search...")

            # >>> PROTECTION CHECK USING FETCHER LOGIC <<<
            protection_type = await temp_fetcher._detect_protection(page)
            if protection_type:
                print(f"🛡️ Protection detected on initial page: {protection_type}")
                content_accessible = await temp_fetcher._is_content_accessible(page)
                if not content_accessible:
                    print("🔒 Content not accessible.")
                    handled = False
                    if protection_type == "cloudflare":
                        handled = await self.engine._handle_cloudflare(page)
                    elif protection_type == "captcha":
                        handled = False
                    if not handled:
                        await browser.close()
                        self.engine.report_outcome("protection_block")
                        return
                else:
                    print("✅ Protection present but content accessible — proceeding.")

            print(f"🔍 Searching for: {search_keywords}")
            await page.goto(search_url, wait_until='load', timeout=120000)
            await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed)

            # >>> PROTECTION CHECK ON SEARCH PAGE <<<
            protection_type = await temp_fetcher._detect_protection(page)
            if protection_type:
                print(f"🛡️ Protection detected on search page: {protection_type}")
                content_accessible = await temp_fetcher._is_content_accessible(page)
                if not content_accessible:
                    print("🔒 Content not accessible.")
                    handled = False
                    if protection_type == "cloudflare":
                        handled = await self.engine._handle_cloudflare(page)
                    elif protection_type == "captcha":
                        handled = False
                    if not handled:
                        await browser.close()
                        self.engine.report_outcome("protection_block")
                        return
                else:
                    print("✅ Protection present but content accessible — proceeding.")

            all_job_links = []
            seen_job_ids = set()

            print("🔄 Collecting initial job links...")
            initial_jobs = await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links)
            print(f" ➕ Found {initial_jobs} initial job(s) (total: {len(all_job_links)})")
            iteration = 1
            while iteration <= 5:  # bounded number of collection passes
print(f"🔄 Iteration {iteration}: Checking for new jobs...")
|
||||
|
||||
prev_job_count = len(all_job_links)
|
||||
await self._handle_infinite_scroll(page, search_keywords, seen_job_ids, all_job_links)
|
||||
new_jobs_count = len(all_job_links) - prev_job_count
|
||||
|
||||
if new_jobs_count > 0:
|
||||
print(f" ➕ Found {new_jobs_count} new jobs via infinite scroll")
|
||||
iteration += 1
|
||||
continue
|
||||
|
||||
pagination_exists = await page.query_selector("button[aria-label='Next']")
|
||||
|
||||
if pagination_exists:
|
||||
print("⏭️ Pagination detected. Processing pages...")
|
||||
await self._handle_pagination(page, search_keywords, seen_job_ids, all_job_links)
|
||||
iteration += 1
|
||||
continue
|
||||
else:
|
||||
print("🔄 Refreshing page to check for new results...")
|
||||
await page.reload(wait_until='load')
|
||||
await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)
|
||||
|
||||
new_jobs_after_refresh = await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links)
|
||||
if new_jobs_after_refresh > 0:
|
||||
print(f" ➕ Found {new_jobs_after_refresh} new job(s) after refresh")
|
||||
iteration += 1
|
||||
continue
|
||||
else:
|
||||
print("🔚 No new jobs found after refresh. Stopping.")
|
||||
break
|
||||
|
||||
print(f"✅ Collected {len(all_job_links)} unique job links.")
|
||||
|
||||
scraped_count = 0
|
||||
for idx, (href, title) in enumerate(all_job_links):
|
||||
try:
|
||||
full_url = href if href.startswith("http") else f"https://www.linkedin.com{href}"
|
||||
print(f" → Opening job {idx+1}/{len(all_job_links)}: {full_url}")
|
||||
|
||||
fetcher = StealthyFetcher(self.engine, browser, context)
|
||||
job_page = await fetcher.fetch_url(full_url, wait_for_selector="h1.t-24")
|
||||
if not job_page:
|
||||
print(f" ❌ Failed to fetch job page {full_url} after retries.")
|
||||
self.engine.report_outcome("fetch_failure", url=full_url)
|
||||
continue
|
||||
|
||||
apply_btn = None
|
||||
apply_selectors = [
|
||||
"button[aria-label*='Apply']",
|
||||
"button:has-text('Apply')",
|
||||
"a:has-text('Apply')",
|
||||
"button:has-text('Easy Apply')"
|
||||
]
|
||||
for selector in apply_selectors:
|
||||
apply_btn = await job_page.query_selector(selector)
|
||||
if apply_btn:
|
||||
break
|
||||
|
||||
final_url = full_url
|
||||
external_url = None
|
||||
page_content = None
|
||||
|
||||
if apply_btn:
|
||||
print(" → Clicking 'Apply' / 'Easy Apply' button...")
|
||||
|
||||
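                        # The click either opens the external application site in a new tab or keeps
                        # the flow on LinkedIn (e.g. Easy Apply); racing a context-level "page" event
                        # against a 5-second timeout below decides which path was taken.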
                        page_waiter = asyncio.create_task(context.wait_for_event("page"))
                        await self._human_click(job_page, apply_btn, wait_after=False)

                        external_page = None
                        try:
                            external_page = await asyncio.wait_for(page_waiter, timeout=5.0)
                            print(" 🌐 External job site opened in new tab.")
                            await external_page.wait_for_load_state("load", timeout=120000)
                            await asyncio.sleep(2 * self.human_speed)
                            await self.engine._human_like_scroll(external_page)
                            await asyncio.sleep(2 * self.human_speed)

                            # Extract raw content from external page for LLM processing
                            external_url = external_page.url
                            final_url = external_url
                            page_content = await self._extract_page_content_for_llm(external_page)

                            if not external_page.is_closed():
                                await external_page.close()

                        except asyncio.TimeoutError:
                            print(" 🖥️ No external tab — scraping LinkedIn job page directly.")
                            await job_page.wait_for_timeout(60000)
                            try:
                                await job_page.wait_for_selector("div.jobs-apply-button--fixed, div.jobs-easy-apply-modal", timeout=80000)
                            except PlaywrightTimeoutError:
                                pass
                            await self.engine._human_like_scroll(job_page)
                            await asyncio.sleep(2 * self.human_speed)
                            page_content = await self._extract_page_content_for_llm(job_page)
                    else:
                        print(" ⚠️ No 'Apply' button found — scraping job details directly.")
                        await self.engine._human_like_scroll(job_page)
                        await asyncio.sleep(2 * self.human_speed)
                        page_content = await self._extract_page_content_for_llm(job_page)

                    job_id = full_url.split("/")[-2] if "/jobs/view/" in full_url else "unknown"

                    raw_data = {
                        "page_content": page_content,
                        "url": final_url,
                        "job_id": job_id,
                        "search_keywords": search_keywords
                    }

                    # LLM agent is now fully responsible for extraction and validation
                    refined_data = await self.llm_agent.refine_job_data(raw_data, self.user_request)
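                    # refined_data is expected to include at least 'title', 'company_name', 'job_id'
                    # and 'url'; any compulsory field the LLM missed is backfilled below.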
                    if refined_data and refined_data.get("title", "N/A") != "N/A":
                        # Ensure compulsory fields are present (fallback if LLM missed them)
                        compulsory_fields = ['company_name', 'job_id', 'url']
                        for field in compulsory_fields:
                            if not refined_data.get(field) or refined_data[field] in ["N/A", "", "Unknown"]:
                                if field == 'job_id':
                                    refined_data[field] = job_id
                                elif field == 'url':
                                    refined_data[field] = final_url
                                elif field == 'company_name':
                                    refined_data[field] = "Unknown Company"

                        refined_data['scraped_at'] = datetime.now().isoformat()
                        refined_data['category'] = clean_keywords
                        await self.llm_agent.save_job_data(refined_data, search_keywords)
                        scraped_count += 1
                        print(f" ✅ Scraped and refined: {refined_data['title'][:50]}...")
                        self.engine.report_outcome("success", url=raw_data["url"])
                    else:
                        print(f" 🟡 Could not extract meaningful data from: {final_url}")
                        self.engine.report_outcome("llm_failure", url=raw_data["url"])

                    await job_page.close()

                except Exception as e:
                    print(f" ⚠️ Failed on job {idx+1}: {str(e)[:100]}")
                    if 'job_page' in locals() and job_page:
                        await job_page.close()
                    continue

                finally:
                    print(" ↩️ Returning to LinkedIn search results...")
                    await page.goto(search_url, timeout=120000)
                    await asyncio.sleep(4 * self.human_speed)

            await browser.close()

            if scraped_count > 0:
                self.engine.report_outcome("success")
                print(f"✅ Completed! Processed {scraped_count} jobs for '{search_keywords}' based on request '{self.user_request}'.")
            else:
                self.engine.report_outcome("captcha")
                print("⚠️ No jobs processed successfully.")
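
# Usage sketch (illustrative; the engine instance and credentials below are hypothetical,
# while LLMJobRefiner and StealthyFetcher come from the surrounding project):
#
#   async def main():
#       engine = ...  # an engine providing _select_profile(), load_session(), report_outcome(), etc.
#       scraper = LinkedInJobScraper(engine, human_speed=1.2)
#       await scraper.scrape_jobs(
#           "data engineer location: Berlin",
#           credentials={"email": "user@example.com", "password": "..."},
#       )
#
#   asyncio.run(main())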