diff --git a/.env b/.env deleted file mode 100644 index 5a01cb9..0000000 --- a/.env +++ /dev/null @@ -1,8 +0,0 @@ -SCRAPING_USERNAME="anointedsaviour1@gmail.com" -SCRAPING_PASSWORD="PeaceIkheloa2478#" -DEEPSEEK_API_KEY=sk-90ce747579f6469ea88a97e0168b7a34 -DB_URL="jdbc:postgresql://aws-0-us-west-1.pooler.supabase.com:5432/postgres" -DB_USERNAME="postgres.gezjetnnesuwczhzoqll" -DB_PASSWORD="Vx3UbzWzxoVRUqQH" -DB_PORT=5432 -DB_HOST="aws-0-us-west-1.pooler.supabase.com" \ No newline at end of file diff --git a/feedback_job_scraping_12.json b/feedback_job_scraping_12.json deleted file mode 100644 index 29d9a43..0000000 --- a/feedback_job_scraping_12.json +++ /dev/null @@ -1 +0,0 @@ -{"success_rate": 1.0, "captcha_count": 0, "cloudflare_count": 0, "avg_response_time": 10.0, "failed_domains": {}} \ No newline at end of file diff --git a/feedback_job_scraping_123.json b/feedback_job_scraping_123.json deleted file mode 100644 index af763df..0000000 --- a/feedback_job_scraping_123.json +++ /dev/null @@ -1 +0,0 @@ -{"success_rate": 0.9, "captcha_count": 0, "cloudflare_count": 0, "avg_response_time": 10.0, "failed_domains": {}} \ No newline at end of file diff --git a/job_scraper.py b/job_scraper.py deleted file mode 100644 index 8b9b9db..0000000 --- a/job_scraper.py +++ /dev/null @@ -1,491 +0,0 @@ - - -import asyncio -import random -import sqlite3 -import os -from datetime import datetime -from typing import Optional, Dict, List -from playwright.async_api import async_playwright -from browserforge.injectors.playwright import AsyncNewContext - - -class LinkedInJobScraper: - def __init__( - self, - engine, - db_path: str = "linkedin_jobs.db", - human_speed: float = 1.0 - ): - self.engine = engine - self.db_path = db_path - self.human_speed = human_speed - self._init_db() - - def _init_db(self): - os.makedirs(os.path.dirname(self.db_path) if os.path.dirname(self.db_path) else ".", exist_ok=True) - with sqlite3.connect(self.db_path) as conn: - cursor = conn.cursor() - 
cursor.execute(''' - CREATE TABLE IF NOT EXISTS jobs ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - keyword TEXT, - title TEXT, - company TEXT, - location TEXT, - salary TEXT, - description TEXT, - url TEXT UNIQUE, - workplace_type TEXT, - scraped_at DATETIME DEFAULT CURRENT_TIMESTAMP - ) - ''') - conn.commit() - - async def _human_click(self, page, element, wait_after: bool = True): - if not element: - return False - await element.scroll_into_view_if_needed() - await asyncio.sleep(random.uniform(0.3, 0.8) * self.human_speed) - try: - await element.click() - if wait_after: - await asyncio.sleep(random.uniform(2, 4) * self.human_speed) - return True - except: - return False - - async def _login(self, page, credentials: Dict) -> bool: - """Human-realistic LinkedIn login""" - print("šŸ” Navigating to LinkedIn login page...") - await page.goto("https://www.linkedin.com/login", timeout=60000) - await asyncio.sleep(random.uniform(2.0, 3.5) * self.human_speed) - - email_field = await page.query_selector('input[name="session_key"]') - if not email_field: - print("āŒ Email field not found.") - return False - - print("āœļø Typing username...") - await email_field.click() - await asyncio.sleep(random.uniform(0.4, 0.9) * self.human_speed) - for char in credentials["email"]: - await page.keyboard.type(char) - await asyncio.sleep(random.uniform(0.06, 0.14) * self.human_speed) - await asyncio.sleep(random.uniform(1.0, 1.8) * self.human_speed) - - password_field = await page.query_selector('input[name="session_password"]') - if not password_field: - print("āŒ Password field not found.") - return False - - print("šŸ”’ Typing password...") - await password_field.click() - await asyncio.sleep(random.uniform(0.3, 0.7) * self.human_speed) - for char in credentials["password"]: - await page.keyboard.type(char) - await asyncio.sleep(random.uniform(0.08, 0.16) * self.human_speed) - await asyncio.sleep(random.uniform(0.8, 1.5) * self.human_speed) - - print("āœ… Submitting login form...") 
- await page.keyboard.press("Enter") - - for _ in range(15): - current_url = page.url - if "/feed" in current_url or "/jobs" in current_url: - if "login" not in current_url: - print("āœ… Login successful!") - await asyncio.sleep(random.uniform(2.0, 3.0) * self.human_speed) - return True - await asyncio.sleep(1) - print("āŒ Login may have failed.") - return False - - async def _extract_job_details(self, page) -> Dict: - """Extract from ANY job page: LinkedIn Easy Apply OR external site""" - await asyncio.sleep(2 * self.human_speed) - - async def get_text(selector: str) -> str: - try: - el = await page.query_selector(selector) - if el: - text = await el.inner_text() - return text.strip() if text else "N/A" - except: - pass - return "N/A" - - title = await get_text("h1.t-24") - if title == "N/A": - title = await get_text("h1, h2") - - company = await get_text("a.app-aware-link[href*='/company/']") - if company == "N/A": - company = await get_text("div.org, .company, [class*='company']") - - location = await get_text("span[class*='location']") - if location == "N/A": - location = await get_text(".location, [class*='location']") - - description = await get_text("div[class*='description__text']") - if description == "N/A": - description = await get_text(".job-desc, .description, main, body") - - workplace = await get_text("span.job-workplace-type") or "N/A" - salary = await get_text("span.salary") or "N/A" - - return { - "title": title, - "company": company, - "location": location, - "workplace_type": workplace, - "salary": salary, - "description": description, - "url": page.url - } - - async def _save_to_markdown(self, job_data: Dict, keyword: str, verified: bool=True): - """Save to appropriate folder using job ID to avoid duplication""" - folder = "linkedin_jobs" if verified else "linkedin_jobs_unverified" - os.makedirs(folder, exist_ok=True) - - # Extract job ID from URL for LinkedIn jobs - url = job_data.get("url", "") - if "/jobs/view/" in url: - try: - job_id = 
url.split("/view/")[1].split("/")[0] - except: - job_id = "unknown" - else: - # For external jobs, use a hash of the URL (first 12 chars) - import hashlib - job_id = hashlib.md5(url.encode()).hexdigest()[:12] - - clean_keyword = keyword.replace(" ", "_") - filename = f"linkedin_{clean_keyword}_job_{job_id}.md" - filepath = os.path.join(folder, filename) - - # Only save if file doesn't already exist (idempotent) - if os.path.exists(filepath): - print(f" šŸ“ Skipping duplicate Markdown file: {filename}") - return - - with open(filepath, "w", encoding="utf-8") as f: - f.write(f"# {job_data['title']}\n\n") - f.write(f"- **Company**: {job_data['company']}\n") - f.write(f"- **Location**: {job_data['location']}\n") - f.write(f"- **Workplace**: {job_data['workplace_type']}\n") - f.write(f"- **Salary**: {job_data['salary']}\n") - f.write(f"- **URL**: <{url}>\n\n") - f.write(f"## Description\n\n{job_data['description']}\n") - - async def _save_to_db(self, job_data: Dict, keyword: str): - with sqlite3.connect(self.db_path) as conn: - cursor = conn.cursor() - cursor.execute(''' - INSERT OR IGNORE INTO jobs - (keyword, title, company, location, salary, description, url, workplace_type) - VALUES (?, ?, ?, ?, ?, ?, ?, ?) 
- ''', ( - keyword, - job_data["title"], - job_data["company"], - job_data["location"], - job_data["salary"], - job_data["description"], - job_data["url"], - job_data["workplace_type"] - )) - conn.commit() - - async def scrape_jobs( - self, - search_keywords: Optional[str], - max_pages: int = 1, - credentials: Optional[Dict] = None - ): - encoded_keywords = search_keywords.replace(" ", "%20") - search_url = f"https://www.linkedin.com/jobs/search/?keywords={encoded_keywords}" - - profile = self.engine._select_profile() - renderer = random.choice(self.engine.common_renderers[self.engine.os]) - vendor = random.choice(self.engine.common_vendors) - spoof_script = self.engine._get_spoof_script(renderer, vendor) - - async with async_playwright() as pw: - browser = await pw.chromium.launch( - headless= False, - args=['--disable-blink-features=AutomationControlled'] - ) - context = await AsyncNewContext(browser, fingerprint=profile) - - await context.add_init_script(f""" - Object.defineProperty(navigator, 'hardwareConcurrency', {{ get: () => {profile.navigator.hardwareConcurrency} }}); - Object.defineProperty(navigator, 'deviceMemory', {{ get: () => {profile.navigator.deviceMemory} }}); - Object.defineProperty(navigator, 'platform', {{ get: () => '{profile.navigator.platform}' }}); - """) - await context.add_init_script(spoof_script) - - page = await context.new_page() - - session_loaded = await self.engine.load_session(context) - login_successful = False - - if session_loaded: - print("šŸ” Using saved session — verifying login...") - await page.goto("https://www.linkedin.com/feed/", timeout=80000) - if "feed" in page.url and "login" not in page.url: - print("āœ… Session still valid.") - login_successful = True - else: - print("āš ļø Saved session expired — re-authenticating.") - session_loaded = False - - if not session_loaded and credentials: - print("šŸ” Performing fresh login...") - login_successful = await self._login(page, credentials) - if login_successful: - 
await self.engine.save_session(context) - else: - print("āŒ Login failed. Exiting.") - await browser.close() - self.engine.report_outcome("block") - return - elif not credentials: - print("ā„¹ļø No credentials — proceeding as guest.") - login_successful = True - else: - pass - - await page.wait_for_load_state("load", timeout=60000) - print("āœ… Post-login page fully loaded. Starting search...") - - if await self.engine._detect_cloudflare(page): - print("ā˜ļø Cloudflare detected on initial load.") - if not await self.engine._handle_cloudflare(page): - print("āŒ Cloudflare could not be resolved.") - await browser.close() - self.engine.report_outcome("cloudflare") - return - - print(f"šŸ” Searching for: {search_keywords}") - await page.goto(search_url, wait_until='load', timeout=80000) - await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed) - - if await self.engine._detect_cloudflare(page): - print("ā˜ļø Cloudflare detected on search page.") - if not await self.engine._handle_cloudflare(page): - await browser.close() - self.engine.report_outcome("cloudflare") - return - - scraped_count = 0 - all_job_links = [] - seen_job_ids = set() - - # ← NEW: Scroll once to reveal pagination (if any) - print("šŸ”„ Scrolling to bottom to reveal pagination controls...") - await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") - await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed) - - # Check if pagination exists - pagination_exists = await page.query_selector("button[aria-label='Next']") - if pagination_exists: - print("ā­ļø Pagination detected. 
Using page navigation.") - current_page = 1 - while current_page <= max_pages: - print(f"šŸ“„ Processing page {current_page}/{max_pages}") - - # Collect job links on current page - current_links = await page.query_selector_all("a[href*='/jobs/view/']") - new_jobs = 0 - for link in current_links: - href = await link.get_attribute("href") - if href: - job_id = href.split("/view/")[-1].split("/")[0] if "/view/" in href else href - if job_id and job_id not in seen_job_ids: - seen_job_ids.add(job_id) - all_job_links.append(href) - new_jobs += 1 - - print(f" āž• Found {new_jobs} new job(s) on page {current_page} (total: {len(all_job_links)})") - - # Try to go to next page - if current_page < max_pages: - next_btn = await page.query_selector("button[aria-label='Next']") - if next_btn and await next_btn.is_enabled(): - await self._human_click(page, next_btn) - await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed) - # Wait for URL to change or new content - try: - await page.wait_for_function("() => window.location.href.includes('start=')", timeout=60000) - except: - pass - else: - print("šŸ”š 'Next' button not available — stopping pagination.") - break - current_page += 1 - - else: - print("šŸ”„ No pagination found. 
Falling back to infinite scroll...") - last_height = await page.evaluate("document.body.scrollHeight") - no_new_jobs_count = 0 - max_no_new = 3 - - while no_new_jobs_count < max_no_new: - await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") - await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed) - - current_links = await page.query_selector_all("a[href*='/jobs/view/']") - new_jobs_found = 0 - - for link in current_links: - href = await link.get_attribute("href") - if href: - job_id = href.split("/view/")[-1].split("/")[0] if "/view/" in href else href - if job_id and job_id not in seen_job_ids: - seen_job_ids.add(job_id) - all_job_links.append(href) - new_jobs_found += 1 - - print(f" āž• Found {new_jobs_found} new job(s) (total: {len(all_job_links)})") - - new_height = await page.evaluate("document.body.scrollHeight") - if new_height == last_height: - no_new_jobs_count += 1 - else: - no_new_jobs_count = 0 - last_height = new_height - - if new_jobs_found == 0 and no_new_jobs_count >= 1: - print("šŸ”š No new jobs loaded. Stopping scroll.") - break - - print(f"āœ… Collected {len(all_job_links)} unique job links.") - - # ← Rest of job processing loop unchanged - scraped_count = 0 - for idx, href in enumerate(all_job_links): - try: - full_url = href if href.startswith("http") else f"https://www.linkedin.com{href}" - print(f" → Opening job {idx+1}/{len(all_job_links)}: {full_url}") - await page.goto(full_url, wait_until='load', timeout=60000) - await asyncio.sleep(3 * self.human_speed) - - is_cloudflare = await self.engine._detect_cloudflare(page) - page_content = await page.content() - has_captcha_text = "captcha" in page_content.lower() - captcha_present = is_cloudflare or has_captcha_text - - title_element = await page.query_selector("h1.t-24") - job_data_accessible = title_element is not None - - if captcha_present: - if job_data_accessible: - print(" āš ļø CAPTCHA detected, but job data is accessible. 
Proceeding in stealth mode...") - await self.engine._avoid_captcha(page) - else: - print(" āš ļø CAPTCHA detected and job data blocked. Attempting recovery...") - if not await self.engine._solve_captcha_fallback(page): - print(" āŒ CAPTCHA recovery failed. Skipping job.") - continue - title_element = await page.query_selector("h1.t-24") - if not title_element: - print(" āŒ Job data still unavailable after CAPTCHA handling. Skipping.") - continue - - if not captcha_present: - await self.engine._avoid_captcha(page) - - apply_btn = None - apply_selectors = [ - "button[aria-label*='Apply']", - "button:has-text('Apply')", - "a:has-text('Apply')", - "button:has-text('Easy Apply')" - ] - for selector in apply_selectors: - apply_btn = await page.query_selector(selector) - if apply_btn: - break - - job_data = None - final_url = full_url - - if apply_btn: - print(" → Clicking 'Apply' / 'Easy Apply' button...") - - page_waiter = asyncio.create_task(context.wait_for_event("page")) - await self._human_click(page, apply_btn, wait_after=False) - - external_page = None - try: - external_page = await asyncio.wait_for(page_waiter, timeout=5.0) - print(" 🌐 External job site opened in new tab.") - await external_page.wait_for_load_state("load", timeout=60000) - await asyncio.sleep(2 * self.human_speed) - await self.engine._human_like_scroll(external_page) - await asyncio.sleep(2 * self.human_speed) - - job_data = await self._extract_job_details(external_page) - final_url = external_page.url - - if not external_page.is_closed(): - await external_page.close() - - except asyncio.TimeoutError: - print(" šŸ–„ļø No external tab — scraping LinkedIn job page.") - await page.wait_for_timeout(2000) - try: - await page.wait_for_selector("div.jobs-apply-button--fixed, div.jobs-easy-apply-modal", timeout=8000) - except: - pass - await self.engine._human_like_scroll(page) - await asyncio.sleep(2 * self.human_speed) - job_data = await self._extract_job_details(page) - final_url = page.url - 
else: - print(" āš ļø No 'Apply' button found — scraping job details directly.") - await self.engine._human_like_scroll(page) - await asyncio.sleep(2 * self.human_speed) - job_data = await self._extract_job_details(page) - final_url = page.url - - job_data["url"] = final_url - - if job_data["title"] == "N/A" and "linkedin.com" in final_url: - job_id = final_url.split("/")[-2] if "/jobs/view/" in final_url else "unknown" - job_data["title"] = f"Easy Apply Job - ID {job_id}" - - is_meaningful = ( - job_data["title"] != "N/A" or - job_data["company"] != "N/A" or - (job_data["description"] != "N/A" and len(job_data["description"]) > 20) - ) - - if is_meaningful: - await self._save_to_db(job_data, search_keywords) - await self._save_to_markdown(job_data, search_keywords, verified=True) - scraped_count += 1 - print(f" āœ… Scraped (verified): {job_data['title'][:50]}...") - else: - await self._save_to_markdown(job_data, search_keywords, verified=False) - print(f" 🟔 Scraped (unverified): {final_url} — low-quality data") - - except Exception as e: - print(f" āš ļø Failed on job {idx+1}: {str(e)[:100]}") - continue - - finally: - print(" ā†©ļø Returning to LinkedIn search results...") - await page.goto(search_url, timeout=60000) - await asyncio.sleep(4 * self.human_speed) - - await browser.close() - - if scraped_count > 0: - self.engine.report_outcome("success") - print(f"āœ… Completed! 
Saved {scraped_count} verified + additional unverified jobs for '{search_keywords}'.") - else: - self.engine.report_outcome("captcha") - print("āš ļø No verified jobs scraped — check 'linkedin_jobs_unverified' for raw outputs.") diff --git a/job_scraper2.py b/job_scraper2.py deleted file mode 100644 index e7158f5..0000000 --- a/job_scraper2.py +++ /dev/null @@ -1,468 +0,0 @@ -import asyncio -import random -from typing import Optional, Dict -from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError -from browserforge.injectors.playwright import AsyncNewContext -from llm_agent import LLMJobRefiner -import re -from fetcher import StealthyFetcher -from datetime import datetime - - -class LinkedInJobScraper: - def __init__( - self, - engine, - db_path: str = "linkedin_jobs.db", - human_speed: float = 1.0, - user_request: str = "Extract all standard job details" - ): - self.engine = engine - self.db_path = db_path - self.human_speed = human_speed - self.user_request = user_request - self._init_db() - self.llm_agent = LLMJobRefiner() - - def _init_db(self): - # This method is kept for backward compatibility but LLMJobRefiner handles PostgreSQL now - pass - - async def _human_click(self, page, element, wait_after: bool = True): - if not element: - return False - await element.scroll_into_view_if_needed() - await asyncio.sleep(random.uniform(0.3, 0.8) * self.human_speed) - try: - await element.click() - if wait_after: - await asyncio.sleep(random.uniform(2, 4) * self.human_speed) - return True - except: - return False - - async def _login(self, page, credentials: Dict) -> bool: - print("šŸ” Navigating to LinkedIn login page...") - await page.goto("https://www.linkedin.com/login", timeout=120000) - await asyncio.sleep(random.uniform(2.0, 3.5) * self.human_speed) - - email_field = await page.query_selector('input[name="session_key"]') - if not email_field: - print("āŒ Email field not found.") - return False - - print("āœļø Typing 
username...") - await email_field.click() - await asyncio.sleep(random.uniform(0.4, 0.9) * self.human_speed) - for char in credentials["email"]: - await page.keyboard.type(char) - await asyncio.sleep(random.uniform(0.06, 0.14) * self.human_speed) - await asyncio.sleep(random.uniform(1.0, 1.8) * self.human_speed) - - password_field = await page.query_selector('input[name="session_password"]') - if not password_field: - print("āŒ Password field not found.") - return False - - print("šŸ”’ Typing password...") - await password_field.click() - await asyncio.sleep(random.uniform(0.3, 0.7) * self.human_speed) - for char in credentials["password"]: - await page.keyboard.type(char) - await asyncio.sleep(random.uniform(0.08, 0.16) * self.human_speed) - await asyncio.sleep(random.uniform(0.8, 1.5) * self.human_speed) - - print("āœ… Submitting login form...") - await page.keyboard.press("Enter") - - for _ in range(15): - current_url = page.url - if "/feed" in current_url or "/jobs" in current_url: - if "login" not in current_url: - print("āœ… Login successful!") - await asyncio.sleep(random.uniform(2.0, 3.0) * self.human_speed) - return True - await asyncio.sleep(1) - print("āŒ Login may have failed.") - return False - - async def _extract_page_content_for_llm(self, page) -> str: - """ - Extract raw page content as HTML/text for LLM processing - The LLM will handle all extraction logic, not specific selectors - """ - await asyncio.sleep(2 * self.human_speed) - await self.engine._human_like_scroll(page) - await asyncio.sleep(2 * self.human_speed) - page_content = await page.content() - return page_content - - def _calculate_keyword_match(self, title: str, keywords: str) -> float: - if not title or not keywords: - return 0.0 - title_lower = title.lower() - keyword_list = [kw.strip().lower() for kw in keywords.split()] - matches = sum(1 for kw in keyword_list if kw in title_lower) - return matches / len(keyword_list) if keyword_list else 0.0 - - def 
_extract_location_from_keywords(self, search_keywords: str) -> str: - location_match = re.search(r'location:\s*([^,]+)', search_keywords, re.IGNORECASE) - return location_match.group(1).strip().lower() if location_match else "" - - async def _scrape_jobs_from_current_page(self, page, search_keywords: str, seen_job_ids, all_job_links): - current_links = await page.query_selector_all("a[href*='/jobs/view/']") - new_jobs = 0 - location_from_keywords = self._extract_location_from_keywords(search_keywords) - - for link in current_links: - href = await link.get_attribute("href") - if href: - full_url = href if href.startswith("http") else f"https://www.linkedin.com{href}" - job_id = href.split("/view/")[-1].split("/")[0] if "/view/" in href else href - - if job_id and job_id not in seen_job_ids: - title_element = await link.query_selector("span.job-title, h3, .job-card-title") - if title_element: - title = await title_element.inner_text() - match_percentage = self._calculate_keyword_match(title, search_keywords) - location_match = True - if location_from_keywords: - location_element = await link.query_selector("span.job-location, .job-card-location, .location") - if location_element: - location_text = await location_element.inner_text() - location_match = location_from_keywords in location_text.lower() - - if match_percentage >= 0.7 and location_match: - seen_job_ids.add(job_id) - all_job_links.append((href, title)) - new_jobs += 1 - elif match_percentage < 0.7: - print(f" āš ļø Skipping job due to low keyword match: {title[:50]}... (match: {match_percentage:.2%})") - elif not location_match: - print(f" āš ļø Skipping job due to location mismatch: {title[:50]}... 
(expected: {location_from_keywords})") - else: - seen_job_ids.add(job_id) - all_job_links.append((href, "Unknown Title")) - new_jobs += 1 - return new_jobs - - async def _handle_pagination(self, page, search_keywords: str, seen_job_ids, all_job_links): - current_page = 1 - while True: - print(f"šŸ“„ Processing page {current_page}") - new_jobs = await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links) - print(f" āž• Found {new_jobs} new job(s) on page {current_page} (total: {len(all_job_links)})") - - next_btn = await page.query_selector("button[aria-label='Next']") - if next_btn and await next_btn.is_enabled(): - await self._human_click(page, next_btn) - await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed) - try: - await page.wait_for_function("() => window.location.href.includes('start=')", timeout=120000) - except: - pass - current_page += 1 - else: - print("šŸ”š 'Next' button not available — stopping pagination.") - break - - async def _handle_infinite_scroll(self, page, search_keywords: str, seen_job_ids, all_job_links): - last_height = await page.evaluate("document.body.scrollHeight") - no_new_jobs_count = 0 - max_no_new = 3 - - while no_new_jobs_count < max_no_new: - await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") - await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed) - - new_jobs_found = await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links) - print(f" āž• Found {new_jobs_found} new job(s) (total: {len(all_job_links)})") - - new_height = await page.evaluate("document.body.scrollHeight") - if new_height == last_height: - no_new_jobs_count += 1 - else: - no_new_jobs_count = 0 - last_height = new_height - - if new_jobs_found == 0 and no_new_jobs_count >= 1: - print("šŸ”š No new jobs loaded. 
Stopping scroll.") - break - - async def scrape_jobs( - self, - search_keywords: Optional[str], - max_pages: int = 1, - credentials: Optional[Dict] = None - ): - location_match = re.search(r'location:\s*([^,]+)', search_keywords, re.IGNORECASE) - location = location_match.group(1).strip() if location_match else "" - clean_keywords = re.sub(r'location:\s*[^,]+', '', search_keywords, flags=re.IGNORECASE).strip() - encoded_keywords = clean_keywords.replace(" ", "%20") - - search_url = f"https://www.linkedin.com/jobs/search/?keywords={encoded_keywords}" - if location: - search_url += f"&location={location.replace(' ', '%20')}" - - profile = self.engine._select_profile() - renderer = random.choice(self.engine.common_renderers[self.engine.os]) - vendor = random.choice(self.engine.common_vendors) - spoof_script = self.engine._get_spoof_script(renderer, vendor) - - async with async_playwright() as pw: - browser = await pw.chromium.launch( - headless=False, - args=['--disable-blink-features=AutomationControlled'] - ) - context = await AsyncNewContext(browser, fingerprint=profile) - - await context.add_init_script(f""" - Object.defineProperty(navigator, 'hardwareConcurrency', {{ get: () => {profile.navigator.hardwareConcurrency} }}); - Object.defineProperty(navigator, 'deviceMemory', {{ get: () => {profile.navigator.deviceMemory} }}); - Object.defineProperty(navigator, 'platform', {{ get: () => '{profile.navigator.platform}' }}); - """) - await context.add_init_script(spoof_script) - - page = await context.new_page() - - # Create a temporary fetcher for protection checks on main page - temp_fetcher = StealthyFetcher(self.engine, browser, context) - - session_loaded = await self.engine.load_session(context) - login_successful = False - - if session_loaded: - print("šŸ” Using saved session — verifying login...") - await page.goto("https://www.linkedin.com/feed/", timeout=120000) - if "feed" in page.url and "login" not in page.url: - print("āœ… Session still valid.") - 
login_successful = True - else: - print("āš ļø Saved session expired — re-authenticating.") - session_loaded = False - - if not session_loaded and credentials: - print("šŸ” Performing fresh login...") - login_successful = await self._login(page, credentials) - if login_successful: - await self.engine.save_session(context) - else: - print("āŒ Login failed. Exiting.") - await browser.close() - self.engine.report_outcome("block") - return - elif not credentials: - print("ā„¹ļø No credentials — proceeding as guest.") - login_successful = True - - await page.wait_for_load_state("load", timeout=120000) - print("āœ… Post-login page fully loaded. Starting search...") - - # >>> PROTECTION CHECK USING FETCHER LOGIC <<< - protection_type = await temp_fetcher._detect_protection(page) - if protection_type: - print(f"šŸ›”ļø Protection detected on initial page: {protection_type}") - content_accessible = await temp_fetcher._is_content_accessible(page) - if not content_accessible: - print("šŸ”’ Content not accessible.") - handled = False - if protection_type == "cloudflare": - handled = await self.engine._handle_cloudflare(page) - elif protection_type == "captcha": - handled = False - if not handled: - await browser.close() - self.engine.report_outcome("protection_block") - return - else: - print("āœ… Protection present but content accessible — proceeding.") - - print(f"šŸ” Searching for: {search_keywords}") - await page.goto(search_url, wait_until='load', timeout=120000) - await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed) - - # >>> PROTECTION CHECK ON SEARCH PAGE <<< - protection_type = await temp_fetcher._detect_protection(page) - if protection_type: - print(f"šŸ›”ļø Protection detected on search page: {protection_type}") - content_accessible = await temp_fetcher._is_content_accessible(page) - if not content_accessible: - print("šŸ”’ Content not accessible.") - handled = False - if protection_type == "cloudflare": - handled = await 
self.engine._handle_cloudflare(page) - elif protection_type == "captcha": - handled = False - if not handled: - await browser.close() - self.engine.report_outcome("protection_block") - return - else: - print("āœ… Protection present but content accessible — proceeding.") - - all_job_links = [] - seen_job_ids = set() - - print("šŸ”„ Collecting initial job links...") - initial_jobs = await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links) - print(f" āž• Found {initial_jobs} initial job(s) (total: {len(all_job_links)})") - - iteration = 1 - while True and iteration >= 5: - print(f"šŸ”„ Iteration {iteration}: Checking for new jobs...") - - prev_job_count = len(all_job_links) - await self._handle_infinite_scroll(page, search_keywords, seen_job_ids, all_job_links) - new_jobs_count = len(all_job_links) - prev_job_count - - if new_jobs_count > 0: - print(f" āž• Found {new_jobs_count} new jobs via infinite scroll") - iteration += 1 - continue - - pagination_exists = await page.query_selector("button[aria-label='Next']") - - if pagination_exists: - print("ā­ļø Pagination detected. Processing pages...") - await self._handle_pagination(page, search_keywords, seen_job_ids, all_job_links) - iteration += 1 - continue - else: - print("šŸ”„ Refreshing page to check for new results...") - await page.reload(wait_until='load') - await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed) - - new_jobs_after_refresh = await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links) - if new_jobs_after_refresh > 0: - print(f" āž• Found {new_jobs_after_refresh} new job(s) after refresh") - iteration += 1 - continue - else: - print("šŸ”š No new jobs found after refresh. 
Stopping.") - break - - print(f"āœ… Collected {len(all_job_links)} unique job links.") - - scraped_count = 0 - for idx, (href, title) in enumerate(all_job_links): - try: - full_url = href if href.startswith("http") else f"https://www.linkedin.com{href}" - print(f" → Opening job {idx+1}/{len(all_job_links)}: {full_url}") - - fetcher = StealthyFetcher(self.engine, browser, context) - job_page = await fetcher.fetch_url(full_url, wait_for_selector="h1.t-24") - if not job_page: - print(f" āŒ Failed to fetch job page {full_url} after retries.") - self.engine.report_outcome("fetch_failure", url=full_url) - continue - - apply_btn = None - apply_selectors = [ - "button[aria-label*='Apply']", - "button:has-text('Apply')", - "a:has-text('Apply')", - "button:has-text('Easy Apply')" - ] - for selector in apply_selectors: - apply_btn = await job_page.query_selector(selector) - if apply_btn: - break - - final_url = full_url - external_url = None - page_content = None - - if apply_btn: - print(" → Clicking 'Apply' / 'Easy Apply' button...") - - page_waiter = asyncio.create_task(context.wait_for_event("page")) - await self._human_click(job_page, apply_btn, wait_after=False) - - external_page = None - try: - external_page = await asyncio.wait_for(page_waiter, timeout=5.0) - print(" 🌐 External job site opened in new tab.") - await external_page.wait_for_load_state("load", timeout=120000) - await asyncio.sleep(2 * self.human_speed) - await self.engine._human_like_scroll(external_page) - await asyncio.sleep(2 * self.human_speed) - - # Extract raw content from external page for LLM processing - external_url = external_page.url - final_url = external_url - page_content = await self._extract_page_content_for_llm(external_page) - - if not external_page.is_closed(): - await external_page.close() - - except asyncio.TimeoutError: - print(" šŸ–„ļø No external tab — scraping LinkedIn job page directly.") - await job_page.wait_for_timeout(60000) - try: - await 
job_page.wait_for_selector("div.jobs-apply-button--fixed, div.jobs-easy-apply-modal", timeout=80000) - except PlaywrightTimeoutError: - pass - await self.engine._human_like_scroll(job_page) - await asyncio.sleep(2 * self.human_speed) - page_content = await self._extract_page_content_for_llm(job_page) - else: - print(" āš ļø No 'Apply' button found — scraping job details directly.") - await self.engine._human_like_scroll(job_page) - await asyncio.sleep(2 * self.human_speed) - page_content = await self._extract_page_content_for_llm(job_page) - - job_id = full_url.split("/")[-2] if "/jobs/view/" in full_url else "unknown" - - raw_data = { - "page_content": page_content, - "url": final_url, - "job_id": job_id, - "search_keywords": search_keywords - } - - # LLM agent is now fully responsible for extraction and validation - refined_data = await self.llm_agent.refine_job_data(raw_data, self.user_request) - - if refined_data and refined_data.get("title", "N/A") != "N/A": - # Ensure compulsory fields are present (fallback if LLM missed them) - compulsory_fields = ['company_name', 'job_id', 'url'] - for field in compulsory_fields: - if not refined_data.get(field) or refined_data[field] in ["N/A", "", "Unknown"]: - if field == 'job_id': - refined_data[field] = job_id - elif field == 'url': - refined_data[field] = final_url - elif field == 'company_name': - refined_data[field] = "Unknown Company" - - refined_data['scraped_at'] = datetime.now().isoformat() - refined_data['category'] = clean_keywords - await self.llm_agent.save_job_data(refined_data, search_keywords) - scraped_count += 1 - print(f" āœ… Scraped and refined: {refined_data['title'][:50]}...") - self.engine.report_outcome("success", url=raw_data["url"]) - else: - print(f" 🟔 Could not extract meaningful data from: {final_url}") - self.engine.report_outcome("llm_failure", url=raw_data["url"]) - - await job_page.close() - - except Exception as e: - print(f" āš ļø Failed on job {idx+1}: {str(e)[:100]}") - if 
from scraping_engine import FingerprintScrapingEngine
from job_scraper2 import LinkedInJobScraper
import os
from dotenv import load_dotenv
import asyncio
import random
import time

# Load environment variables (SCRAPING_USERNAME / SCRAPING_PASSWORD are read
# from the environment inside main()).
load_dotenv()


async def main():
    """Scrape LinkedIn jobs for a fixed list of titles, cycling forever.

    Builds one ``FingerprintScrapingEngine`` and one ``LinkedInJobScraper``,
    then repeats indefinitely: shuffle the job titles, call
    ``scraper.scrape_jobs`` for each title (search keywords are
    ``"<title> location:New York"``), and wait two minutes between titles.

    Login credentials are taken from the ``SCRAPING_USERNAME`` and
    ``SCRAPING_PASSWORD`` environment variables; either may be ``None`` if
    the variable is unset, and is passed through to the scraper unchanged.

    This coroutine never returns on its own; stop it by interrupting the
    process.
    """
    engine = FingerprintScrapingEngine(
        seed="job_scraping_12",
        target_os="windows",
        db_path="job_listings.db",
        markdown_path="job_listings.md",
    )

    # Initialize scraper with the fields the LLM should extract.
    scraper = LinkedInJobScraper(
        engine,
        human_speed=1.6,
        user_request="Extract title, company, location, description, requirements, qualifications, nature of job(remote, onsite, hybrid) and salary",
    )

    # List of job titles to cycle through.
    job_titles = [
        "Software Engineer",
        "Data Scientist",
        "Product Manager",
        "UX Designer",
        "DevOps Engineer",
        "Machine Learning Engineer",
        "Frontend Developer",
        "Backend Developer",
        "Full Stack Developer",
        "Data Analyst",
    ]

    fixed_location = "New York"

    # The environment lookups do not depend on the loop, so do them once.
    email = os.getenv("SCRAPING_USERNAME")
    password = os.getenv("SCRAPING_PASSWORD")

    separator = "=" * 60

    # Keep cycling through all job titles.
    while True:
        # Shuffle job titles (in place) to randomize the order of each cycle.
        random.shuffle(job_titles)

        for job_title in job_titles:
            search_keywords = f"{job_title} location:{fixed_location}"

            print(f"\n{separator}")
            print(f"Starting scrape for: {search_keywords}")
            print(separator)

            await scraper.scrape_jobs(
                search_keywords=search_keywords,
                # A fresh dict per call, as before, so the callee never
                # receives an object shared between iterations.
                credentials={"email": email, "password": password},
            )

            print(f"\n✅ Completed scraping for: {job_title}")
            print("⏳ Waiting 2 minutes before next job title...")

            # Wait 2 minutes before the next job title. This must be the
            # asyncio sleep: time.sleep() inside a coroutine blocks the whole
            # event loop, stalling anything else scheduled on it.
            await asyncio.sleep(120)

        print("\n✅ Completed full cycle of all job titles")
        print("🔄 Starting new cycle...")


if __name__ == "__main__":
    asyncio.run(main())