import asyncio
import os
import random
import re
import sqlite3
from datetime import datetime
from typing import Dict, List, Optional

from playwright.async_api import async_playwright
from browserforge.injectors.playwright import AsyncNewContext

from llm_agent import LLMJobRefiner


class LinkedInJobScraper:
    def __init__(
        self,
        engine,
        db_path: str = "linkedin_jobs.db",
        human_speed: float = 1.0,
        target_field: str = "all"
    ):
        self.engine = engine
        self.db_path = db_path
        self.human_speed = human_speed
        self.target_field = target_field
        self._init_db()
        self.llm_agent = LLMJobRefiner()

    def _init_db(self):
        """Create the SQLite database and jobs table if they do not exist yet."""
        db_dir = os.path.dirname(self.db_path) or "."
        os.makedirs(db_dir, exist_ok=True)
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.cursor()
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS jobs (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    title TEXT,
                    company_name TEXT,
                    location TEXT,
                    description TEXT,
                    requirements TEXT,
                    qualifications TEXT,
                    salary_range TEXT,
                    nature_of_work TEXT,
                    job_id TEXT,
                    url TEXT UNIQUE
                )
            ''')
            conn.commit()

    async def _human_click(self, page, element, wait_after: bool = True):
        """Scroll an element into view and click it with human-like pauses."""
        if not element:
            return False
        await element.scroll_into_view_if_needed()
        await asyncio.sleep(random.uniform(0.3, 0.8) * self.human_speed)
        try:
            await element.click()
            if wait_after:
                await asyncio.sleep(random.uniform(2, 4) * self.human_speed)
            return True
        except Exception:
            return False

    async def _login(self, page, credentials: Dict) -> bool:
        """Human-realistic LinkedIn login"""
        print("🔐 Navigating to LinkedIn login page...")
        await page.goto("https://www.linkedin.com/login", timeout=60000)
        await asyncio.sleep(random.uniform(2.0, 3.5) * self.human_speed)

        email_field = await page.query_selector('input[name="session_key"]')
        if not email_field:
            print("❌ Email field not found.")
            return False

        print("✍️ Typing username...")
        await email_field.click()
        await asyncio.sleep(random.uniform(0.4, 0.9) * self.human_speed)
        for char in credentials["email"]:
            await page.keyboard.type(char)
            await asyncio.sleep(random.uniform(0.06, 0.14) * self.human_speed)
        await asyncio.sleep(random.uniform(1.0, 1.8) * self.human_speed)

        password_field = await page.query_selector('input[name="session_password"]')
        if not password_field:
            print("❌ Password field not found.")
            return False

        print("🔒 Typing password...")
        await password_field.click()
        await asyncio.sleep(random.uniform(0.3, 0.7) * self.human_speed)
        for char in credentials["password"]:
            await page.keyboard.type(char)
            await asyncio.sleep(random.uniform(0.08, 0.16) * self.human_speed)
        await asyncio.sleep(random.uniform(0.8, 1.5) * self.human_speed)

        print("✅ Submitting login form...")
        await page.keyboard.press("Enter")

        # Poll for up to ~15 seconds for a redirect to the feed or jobs page.
        for _ in range(15):
            current_url = page.url
            if ("/feed" in current_url or "/jobs" in current_url) and "login" not in current_url:
                print("✅ Login successful!")
                await asyncio.sleep(random.uniform(2.0, 3.0) * self.human_speed)
                return True
            await asyncio.sleep(1)

        print("❌ Login may have failed.")
        return False

    async def _extract_all_page_content(self, page) -> str:
        """Extract the full HTML content of the job page after scrolling it."""
        await asyncio.sleep(2 * self.human_speed)
        # Human-like scrolling to load all lazy content
        await self.engine._human_like_scroll(page)
        await asyncio.sleep(2 * self.human_speed)
        # Get the full page content
        page_content = await page.content()
        return page_content
    def _calculate_keyword_match(self, title: str, keywords: str) -> float:
        """Return the fraction (0.0-1.0) of search keywords that appear in the job title."""
        if not title or not keywords:
            return 0.0
        title_lower = title.lower()
        keyword_list = [kw.strip().lower() for kw in keywords.split()]
        matches = 0
        for keyword in keyword_list:
            if keyword in title_lower:
                matches += 1
        return matches / len(keyword_list) if keyword_list else 0.0

    def _extract_location_from_keywords(self, search_keywords: str) -> str:
        """Extract a location from the search keywords if present (e.g. 'location: Berlin')."""
        location_match = re.search(r'location:\s*([^,]+)', search_keywords, re.IGNORECASE)
        if location_match:
            return location_match.group(1).strip().lower()
        return ""

    async def _scrape_jobs_from_current_page(self, page, search_keywords: str, seen_job_ids, all_job_links):
        """Scrape job links from the current page that match the keywords and location."""
        current_links = await page.query_selector_all("a[href*='/jobs/view/']")
        new_jobs = 0

        # Extract location from search keywords
        location_from_keywords = self._extract_location_from_keywords(search_keywords)

        for link in current_links:
            href = await link.get_attribute("href")
            if href:
                job_id = href.split("/view/")[-1].split("/")[0] if "/view/" in href else href
                if job_id and job_id not in seen_job_ids:
                    # Check if the job title matches the keywords (at least 70% match)
                    title_element = await link.query_selector("span.job-title, h3, .job-card-title")
                    if title_element:
                        title = await title_element.inner_text()
                        match_percentage = self._calculate_keyword_match(title, search_keywords)

                        # Check if the location matches (if specified in the keywords)
                        location_match = True
                        if location_from_keywords:
                            # Try to get the location from the job card
                            location_element = await link.query_selector("span.job-location, .job-card-location, .location")
                            if location_element:
                                location_text = await location_element.inner_text()
                                location_match = location_from_keywords in location_text.lower()

                        if match_percentage >= 0.7 and location_match:
                            # At least 70% keyword match and the location matches
                            seen_job_ids.add(job_id)
                            all_job_links.append((href, title))
                            new_jobs += 1
                        elif match_percentage < 0.7:
                            print(f" ⚠️ Skipping job due to low keyword match: {title[:50]}... (match: {match_percentage:.2%})")
                        elif not location_match:
                            print(f" ⚠️ Skipping job due to location mismatch: {title[:50]}... (expected: {location_from_keywords})")
                    else:
                        # If no title element, still add the job so it can be checked later
                        seen_job_ids.add(job_id)
                        all_job_links.append((href, "Unknown Title"))
                        new_jobs += 1

        return new_jobs

    async def _handle_pagination(self, page, search_keywords: str, seen_job_ids, all_job_links):
        """Handle pagination by stepping through result pages."""
        current_page = 1
        while True:
            print(f"📄 Processing page {current_page}")

            # Collect job links on the current page
            new_jobs = await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links)
            print(f" ➕ Found {new_jobs} new job(s) on page {current_page} (total: {len(all_job_links)})")

            # Try to go to the next page
            next_btn = await page.query_selector("button[aria-label='Next']")
            if next_btn and await next_btn.is_enabled():
                await self._human_click(page, next_btn)
                await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed)
                # Wait for the URL to change or new content to load
                try:
                    await page.wait_for_function("() => window.location.href.includes('start=')", timeout=30000)
                except Exception:
                    pass
                current_page += 1
            else:
                print("🔚 'Next' button not available — stopping pagination.")
                break

    async def _handle_infinite_scroll(self, page, search_keywords: str, seen_job_ids, all_job_links):
        """Handle infinite scroll to load more jobs."""
        last_height = await page.evaluate("document.body.scrollHeight")
        no_new_jobs_count = 0
        max_no_new = 3

        while no_new_jobs_count < max_no_new:
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)

            new_jobs_found = await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links)
            print(f" ➕ Found {new_jobs_found} new job(s) (total: {len(all_job_links)})")

            new_height = await page.evaluate("document.body.scrollHeight")
            if new_height == last_height:
                no_new_jobs_count += 1
            else:
                no_new_jobs_count = 0
                last_height = new_height

            if new_jobs_found == 0 and no_new_jobs_count >= 1:
                print("🔚 No new jobs loaded. Stopping scroll.")
                break
    async def scrape_jobs(
        self,
        search_keywords: str,
        max_pages: int = 1,
        credentials: Optional[Dict] = None
    ):
        # Parse the location from the keywords if present
        location_match = re.search(r'location:\s*([^,]+)', search_keywords, re.IGNORECASE)
        location = location_match.group(1).strip() if location_match else ""

        # Remove the location part from the keywords for the search query
        clean_keywords = re.sub(r'location:\s*[^,]+', '', search_keywords, flags=re.IGNORECASE).strip()
        encoded_keywords = clean_keywords.replace(" ", "%20")

        # Build the search URL, adding the location if specified
        search_url = f"https://www.linkedin.com/jobs/search/?keywords={encoded_keywords}"
        if location:
            search_url += f"&location={location.replace(' ', '%20')}"

        # Pick a browser fingerprint profile and matching spoof values
        profile = self.engine._select_profile()
        renderer = random.choice(self.engine.common_renderers[self.engine.os])
        vendor = random.choice(self.engine.common_vendors)
        spoof_script = self.engine._get_spoof_script(renderer, vendor)

        async with async_playwright() as pw:
            browser = await pw.chromium.launch(
                headless=False,
                args=['--disable-blink-features=AutomationControlled']
            )
            context = await AsyncNewContext(browser, fingerprint=profile)
            await context.add_init_script(f"""
                Object.defineProperty(navigator, 'hardwareConcurrency', {{ get: () => {profile.navigator.hardwareConcurrency} }});
                Object.defineProperty(navigator, 'deviceMemory', {{ get: () => {profile.navigator.deviceMemory} }});
                Object.defineProperty(navigator, 'platform', {{ get: () => '{profile.navigator.platform}' }});
            """)
            await context.add_init_script(spoof_script)
            page = await context.new_page()

            session_loaded = await self.engine.load_session(context)
            login_successful = False

            if session_loaded:
                print("🔁 Using saved session — verifying login...")
                await page.goto("https://www.linkedin.com/feed/", timeout=60000)
                if "feed" in page.url and "login" not in page.url:
                    print("✅ Session still valid.")
                    login_successful = True
                else:
                    print("⚠️ Saved session expired — re-authenticating.")
                    session_loaded = False

            if not session_loaded and credentials:
                print("🔐 Performing fresh login...")
                login_successful = await self._login(page, credentials)
                if login_successful:
                    await self.engine.save_session(context)
                else:
                    print("❌ Login failed. Exiting.")
                    await browser.close()
                    self.engine.report_outcome("block")
                    return
            elif not credentials:
                print("ℹ️ No credentials — proceeding as guest.")
                login_successful = True

            await page.wait_for_load_state("load", timeout=60000)
            print("✅ Post-login page fully loaded. Starting search...")
            if await self.engine._detect_cloudflare(page):
                print("☁️ Cloudflare detected on initial load.")
                if not await self.engine._handle_cloudflare(page):
                    print("❌ Cloudflare could not be resolved.")
                    await browser.close()
                    self.engine.report_outcome("cloudflare")
                    return

            print(f"🔍 Searching for: {search_keywords}")
            await page.goto(search_url, wait_until='load', timeout=60000)
            await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed)

            if await self.engine._detect_cloudflare(page):
                print("☁️ Cloudflare detected on search page.")
                if not await self.engine._handle_cloudflare(page):
                    await browser.close()
                    self.engine.report_outcome("cloudflare")
                    return

            all_job_links = []
            seen_job_ids = set()

            # First, scrape the initial page
            print("🔄 Collecting initial job links...")
            initial_jobs = await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links)
            print(f" ➕ Found {initial_jobs} initial job(s) (total: {len(all_job_links)})")

            # Loop until no new jobs are found
            iteration = 1
            while True:
                # Limit iterations to prevent infinite loops
                if iteration > 10:
                    print("🔄 Maximum iterations reached. Stopping.")
                    break

                print(f"🔄 Iteration {iteration}: Checking for new jobs...")

                # First, try infinite scroll
                prev_job_count = len(all_job_links)
                await self._handle_infinite_scroll(page, search_keywords, seen_job_ids, all_job_links)
                new_jobs_count = len(all_job_links) - prev_job_count

                if new_jobs_count > 0:
                    print(f" ➕ Found {new_jobs_count} new jobs via infinite scroll")
                    iteration += 1
                    continue  # Keep scrolling while new jobs appear

                # If no new jobs via scroll, check for pagination
                pagination_exists = await page.query_selector("button[aria-label='Next']")
                if pagination_exists:
                    print("⏭️ Pagination detected. Processing pages...")
                    await self._handle_pagination(page, search_keywords, seen_job_ids, all_job_links)
                    iteration += 1
                    continue  # Re-check after paginating
                else:
                    # If no pagination and no new jobs from scrolling, refresh and re-check
                    print("🔄 Refreshing page to check for new results...")
                    await page.reload(wait_until='networkidle')
                    await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)

                    # Check for new jobs after the refresh
                    new_jobs_after_refresh = await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links)
                    if new_jobs_after_refresh > 0:
                        print(f" ➕ Found {new_jobs_after_refresh} new job(s) after refresh")
                        iteration += 1
                        continue  # Keep going if the refresh surfaced new jobs
                    else:
                        print("🔚 No new jobs found after refresh. Stopping.")
                        break

            print(f"✅ Collected {len(all_job_links)} unique job links.")

            # Process all collected job links
            scraped_count = 0
            for idx, (href, title) in enumerate(all_job_links):
                try:
                    full_url = href if href.startswith("http") else f"https://www.linkedin.com{href}"
                    print(f" → Opening job {idx+1}/{len(all_job_links)}: {full_url}")
                    await page.goto(full_url, wait_until='load', timeout=60000)
                    await asyncio.sleep(3 * self.human_speed)

                    # Detect CAPTCHA / Cloudflare interference before scraping
                    is_cloudflare = await self.engine._detect_cloudflare(page)
                    page_content = await page.content()
                    has_captcha_text = "captcha" in page_content.lower()
                    captcha_present = is_cloudflare or has_captcha_text

                    title_element = await page.query_selector("h1.t-24")
                    job_data_accessible = title_element is not None

                    if captcha_present:
                        if job_data_accessible:
                            print(" ⚠️ CAPTCHA detected, but job data is accessible. Proceeding in stealth mode...")
                            await self.engine._avoid_captcha(page)
                        else:
                            print(" ⚠️ CAPTCHA detected and job data blocked. Attempting recovery...")
                            if not await self.engine._solve_captcha_fallback(page):
                                print(" ❌ CAPTCHA recovery failed. Skipping job.")
                                continue
                            title_element = await page.query_selector("h1.t-24")
                            if not title_element:
                                print(" ❌ Job data still unavailable after CAPTCHA handling. Skipping.")
                                continue

                    if not captcha_present:
                        await self.engine._avoid_captcha(page)

                    # Look for an 'Apply' / 'Easy Apply' button
                    apply_btn = None
                    apply_selectors = [
                        "button[aria-label*='Apply']",
                        "button:has-text('Apply')",
                        "a:has-text('Apply')",
                        "button:has-text('Easy Apply')"
                    ]
                    for selector in apply_selectors:
                        apply_btn = await page.query_selector(selector)
                        if apply_btn:
                            break

                    page_data = None
                    final_url = full_url

                    if apply_btn:
                        print(" → Clicking 'Apply' / 'Easy Apply' button...")
                        page_waiter = asyncio.create_task(context.wait_for_event("page"))
                        await self._human_click(page, apply_btn, wait_after=False)

                        external_page = None
                        try:
                            external_page = await asyncio.wait_for(page_waiter, timeout=5.0)
                            print(" 🌐 External job site opened in new tab.")
                            await external_page.wait_for_load_state("load", timeout=30000)
                            await asyncio.sleep(2 * self.human_speed)
                            await self.engine._human_like_scroll(external_page)
                            await asyncio.sleep(2 * self.human_speed)
                            page_data = await self._extract_all_page_content(external_page)
                            final_url = external_page.url
                            if not external_page.is_closed():
                                await external_page.close()
                        except asyncio.TimeoutError:
                            print(" 🖥️ No external tab — scraping LinkedIn job page.")
                            await page.wait_for_timeout(2000)
                            try:
                                await page.wait_for_selector("div.jobs-apply-button--fixed, div.jobs-easy-apply-modal", timeout=8000)
                            except Exception:
                                pass
                            await self.engine._human_like_scroll(page)
                            await asyncio.sleep(2 * self.human_speed)
                            page_data = await self._extract_all_page_content(page)
                            final_url = page.url
                    else:
                        print(" ⚠️ No 'Apply' button found — scraping job details directly.")
                        await self.engine._human_like_scroll(page)
                        await asyncio.sleep(2 * self.human_speed)
                        page_data = await self._extract_all_page_content(page)
                        final_url = page.url

                    # Extract the job ID from the URL
                    job_id = final_url.split("/")[-2] if "/jobs/view/" in final_url else "unknown"

                    # Prepare raw data for LLM processing
                    raw_data = {
                        "page_content": page_data,
                        "url": final_url,
                        "job_id": job_id
                    }

                    # Send the raw data to the LLM agent for refinement
                    refined_data = await self.llm_agent.refine_job_data(raw_data, search_keywords)

                    # Only save if the LLM successfully extracted meaningful data
                    if refined_data and refined_data.get("title", "N/A") != "N/A":
                        # Save refined data to markdown and the database through the LLM agent
                        await self.llm_agent.save_job_data(refined_data, search_keywords)
                        scraped_count += 1
                        print(f" ✅ Scraped and refined: {refined_data['title'][:50]}...")
                    else:
                        print(f" 🟡 Could not extract meaningful data from: {final_url}")

                except Exception as e:
                    print(f" ⚠️ Failed on job {idx+1}: {str(e)[:100]}")
                    continue
                finally:
                    print(" ↩️ Returning to LinkedIn search results...")
                    await page.goto(search_url, timeout=60000)
                    await asyncio.sleep(4 * self.human_speed)

            await browser.close()

            if scraped_count > 0:
                self.engine.report_outcome("success")
                print(f"✅ Completed! Processed {scraped_count} jobs for '{search_keywords}'.")
            else:
                self.engine.report_outcome("captcha")
                print("⚠️ No jobs processed successfully.")
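
# Illustrative usage sketch (not part of the original module): the engine object is a
# placeholder for whatever stealth/browser engine supplies _select_profile(), load_session(),
# _detect_cloudflare(), _human_like_scroll(), report_outcome(), etc., and the import path,
# keywords, and credentials below are dummy values. Adapt them to your own setup before running.
#
# if __name__ == "__main__":
#     from engine import StealthEngine  # hypothetical module providing the engine hooks used above
#
#     async def main():
#         scraper = LinkedInJobScraper(engine=StealthEngine(), human_speed=1.0)
#         await scraper.scrape_jobs(
#             search_keywords="python developer, location: Berlin",
#             credentials={"email": "you@example.com", "password": "change-me"},
#         )
#
#     asyncio.run(main())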