diff --git a/config.py b/config.py
new file mode 100644
index 0000000..eac08fa
--- /dev/null
+++ b/config.py
@@ -0,0 +1,39 @@
+
+import os
+import json
+
+
+def load_spoof_config():
+    """Load spoof data from JSON config file. Falls back to defaults if missing."""
+    config_path = os.path.join(os.path.dirname(__file__), "spoof_config.json")
+
+    if os.path.exists(config_path):
+        with open(config_path, "r", encoding="utf-8") as f:
+            return json.load(f)
+    else:
+        # Generate default config file on first run
+        default_config = {
+            "renderers": {
+                "windows": [
+                    "ANGLE (Intel, Intel(R) UHD Graphics 630 (0x00003E9B) Direct3D11 vs_5_0 ps_5_0, D3D11)",
+                    "ANGLE (Intel, Intel(R) UHD Graphics (0x00009A49) Direct3D11 vs_5_0 ps_5_0, D3D11)",
+                    "ANGLE (Intel(R) Iris(TM) Graphics 540 Direct3D11 vs_5_0 ps_5_0)",
+                    "ANGLE (Intel, Intel(R) UHD Graphics 620 (0x00005916) Direct3D11 vs_5_0 ps_5_0, D3D11)",
+                    "ANGLE (Intel, Intel(R) HD Graphics 530 (0x0000191B) Direct3D11 vs_5_0 ps_5_0, D3D11)",
+                    "ANGLE (Intel, Intel(R) UHD Graphics 600 (0x00003180) Direct3D11 vs_5_0 ps_5_0, D3D11)",
+                    "ANGLE (Intel, Intel(R) Iris(R) Xe Graphics (0x00009A49) Direct3D11 vs_5_0 ps_5_0, D3D11)",
+                ],
+                "macos": [
+                    "Intel HD Graphics 530 OpenGL Engine",
+                    "Intel Iris Graphics 6100 OpenGL Engine",
+                    "Intel UHD Graphics 630 OpenGL Engine",
+                    "Intel HD Graphics 4000 OpenGL Engine",
+                    "Intel Iris Pro OpenGL Engine",
+                    "Intel UHD Graphics 617 OpenGL Engine",
+                ]
+            },
+            "vendors": ["Intel Inc.", "Intel", "Intel Corporation"]
+        }
+        with open(config_path, "w", encoding="utf-8") as f:
+            json.dump(default_config, f, indent=2)
+        return default_config
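Reviewer note: load_spoof_config() returns whatever spoof_config.json happens to contain, and FingerprintScrapingEngine (below) indexes straight into spoof_config["renderers"] and spoof_config["vendors"], so a hand-edited file that drops a key only fails later, deep in engine setup. A minimal sanity check is sketched here; validate_spoof_config is a hypothetical helper, not part of this diff.

    # Hypothetical guard, not part of this diff: fail fast on a malformed spoof_config.json.
    from config import load_spoof_config

    def validate_spoof_config(cfg: dict) -> dict:
        renderers = cfg.get("renderers", {})
        missing = {"windows", "macos"} - set(renderers)
        if missing:
            raise ValueError(f"spoof_config.json is missing renderer lists for: {sorted(missing)}")
        if not cfg.get("vendors"):
            raise ValueError("spoof_config.json must list at least one vendor string")
        return cfg

    spoof_config = validate_spoof_config(load_spoof_config())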
"linkedin_jobs" if verified else "linkedin_jobs_unverified" + os.makedirs(folder, exist_ok=True) + # Extract job ID from URL for LinkedIn jobs + url = job_data.get("url", "") + if "/jobs/view/" in url: + try: + job_id = url.split("/view/")[1].split("/")[0] + except: + job_id = "unknown" + else: + # For external jobs, use a hash of the URL (first 12 chars) + import hashlib + job_id = hashlib.md5(url.encode()).hexdigest()[:12] + + clean_keyword = keyword.replace(" ", "_") + filename = f"linkedin_{clean_keyword}_job_{job_id}.md" + filepath = os.path.join(folder, filename) + + # Only save if file doesn't already exist (idempotent) + if os.path.exists(filepath): + print(f" 📝 Skipping duplicate Markdown file: {filename}") + return + with open(filepath, "w", encoding="utf-8") as f: f.write(f"# {job_data['title']}\n\n") f.write(f"- **Company**: {job_data['company']}\n") f.write(f"- **Location**: {job_data['location']}\n") f.write(f"- **Workplace**: {job_data['workplace_type']}\n") f.write(f"- **Salary**: {job_data['salary']}\n") - f.write(f"- **URL**: <{job_data['url']}>\n\n") + f.write(f"- **URL**: <{url}>\n\n") f.write(f"## Description\n\n{job_data['description']}\n") async def _save_to_db(self, job_data: Dict, keyword: str): @@ -208,59 +227,176 @@ class LinkedInJobScraper: await context.add_init_script(spoof_script) page = await context.new_page() - - if credentials: - print("🔐 Attempting LinkedIn login...") - if not await self._login(page, credentials): + + session_loaded = await self.engine.load_session(context) + login_successful = False + + if session_loaded: + print("🔁 Using saved session — verifying login...") + await page.goto("https://www.linkedin.com/feed/", timeout=60000) + if "feed" in page.url and "login" not in page.url: + print("✅ Session still valid.") + login_successful = True + else: + print("âš ī¸ Saved session expired — re-authenticating.") + session_loaded = False + + if not session_loaded and credentials: + print("🔐 Performing fresh login...") + login_successful = await self._login(page, credentials) + if login_successful: + await self.engine.save_session(context) + else: print("❌ Login failed. Exiting.") await browser.close() + self.engine.report_outcome("block") return - else: + elif not credentials: print("â„šī¸ No credentials — proceeding as guest.") + login_successful = True + else: + pass + + await page.wait_for_load_state("load", timeout=60000) + print("✅ Post-login page fully loaded. 
Starting search...") + + if await self.engine._detect_cloudflare(page): + print("â˜ī¸ Cloudflare detected on initial load.") + if not await self.engine._handle_cloudflare(page): + print("❌ Cloudflare could not be resolved.") + await browser.close() + self.engine.report_outcome("cloudflare") + return print(f"🔍 Searching for: {search_keywords}") await page.goto(search_url, wait_until='load', timeout=60000) await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed) + if await self.engine._detect_cloudflare(page): + print("â˜ī¸ Cloudflare detected on search page.") + if not await self.engine._handle_cloudflare(page): + await browser.close() + self.engine.report_outcome("cloudflare") + return + scraped_count = 0 all_job_links = [] + seen_job_ids = set() - # Collect job links - for page_num in range(1, max_pages + 1): - print(f"📄 Collecting job links from page {page_num}/{max_pages}") - for _ in range(50): - links = await page.query_selector_all("a[href*='/jobs/view/']") - if links: - for link in links: - href = await link.get_attribute("href") - if href and href not in all_job_links: + # ← NEW: Scroll once to reveal pagination (if any) + print("🔄 Scrolling to bottom to reveal pagination controls...") + await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed) + + # Check if pagination exists + pagination_exists = await page.query_selector("button[aria-label='Next']") + if pagination_exists: + print("â­ī¸ Pagination detected. Using page navigation.") + current_page = 1 + while current_page <= max_pages: + print(f"📄 Processing page {current_page}/{max_pages}") + + # Collect job links on current page + current_links = await page.query_selector_all("a[href*='/jobs/view/']") + new_jobs = 0 + for link in current_links: + href = await link.get_attribute("href") + if href: + job_id = href.split("/view/")[-1].split("/")[0] if "/view/" in href else href + if job_id and job_id not in seen_job_ids: + seen_job_ids.add(job_id) all_job_links.append(href) - break - await asyncio.sleep(1) - print(f" ➕ Found {len(links) if 'links' in locals() else 0} new job links.") - - if page_num < max_pages: - next_btn = await page.query_selector("button[aria-label='Next']") - if next_btn and await next_btn.is_enabled(): - await self._human_click(page, next_btn) - await asyncio.sleep(4 * self.human_speed) + new_jobs += 1 + + print(f" ➕ Found {new_jobs} new job(s) on page {current_page} (total: {len(all_job_links)})") + + # Try to go to next page + if current_page < max_pages: + next_btn = await page.query_selector("button[aria-label='Next']") + if next_btn and await next_btn.is_enabled(): + await self._human_click(page, next_btn) + await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed) + # Wait for URL to change or new content + try: + await page.wait_for_function("() => window.location.href.includes('start=')", timeout=30000) + except: + pass + else: + print("🔚 'Next' button not available — stopping pagination.") + break + current_page += 1 + + else: + print("🔄 No pagination found. 
Falling back to infinite scroll...") + last_height = await page.evaluate("document.body.scrollHeight") + no_new_jobs_count = 0 + max_no_new = 3 + + while no_new_jobs_count < max_no_new: + await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed) + + current_links = await page.query_selector_all("a[href*='/jobs/view/']") + new_jobs_found = 0 + + for link in current_links: + href = await link.get_attribute("href") + if href: + job_id = href.split("/view/")[-1].split("/")[0] if "/view/" in href else href + if job_id and job_id not in seen_job_ids: + seen_job_ids.add(job_id) + all_job_links.append(href) + new_jobs_found += 1 + + print(f" ➕ Found {new_jobs_found} new job(s) (total: {len(all_job_links)})") + + new_height = await page.evaluate("document.body.scrollHeight") + if new_height == last_height: + no_new_jobs_count += 1 else: - print("🔚 No next page.") + no_new_jobs_count = 0 + last_height = new_height + + if new_jobs_found == 0 and no_new_jobs_count >= 1: + print("🔚 No new jobs loaded. Stopping scroll.") break - # Process each job + print(f"✅ Collected {len(all_job_links)} unique job links.") + + # ← Rest of job processing loop unchanged + scraped_count = 0 for idx, href in enumerate(all_job_links): try: full_url = href if href.startswith("http") else f"https://www.linkedin.com{href}" - print(f" → Opening job {idx+1}: {full_url}") + print(f" → Opening job {idx+1}/{len(all_job_links)}: {full_url}") await page.goto(full_url, wait_until='load', timeout=60000) await asyncio.sleep(3 * self.human_speed) - if not await page.query_selector("h1.t-24"): - print(f" âš ī¸ Invalid job page, skipping.") - continue + is_cloudflare = await self.engine._detect_cloudflare(page) + page_content = await page.content() + has_captcha_text = "captcha" in page_content.lower() + captcha_present = is_cloudflare or has_captcha_text + + title_element = await page.query_selector("h1.t-24") + job_data_accessible = title_element is not None + + if captcha_present: + if job_data_accessible: + print(" âš ī¸ CAPTCHA detected, but job data is accessible. Proceeding in stealth mode...") + await self.engine._avoid_captcha(page) + else: + print(" âš ī¸ CAPTCHA detected and job data blocked. Attempting recovery...") + if not await self.engine._solve_captcha_fallback(page): + print(" ❌ CAPTCHA recovery failed. Skipping job.") + continue + title_element = await page.query_selector("h1.t-24") + if not title_element: + print(" ❌ Job data still unavailable after CAPTCHA handling. 
Skipping.") + continue + + if not captcha_present: + await self.engine._avoid_captcha(page) - # Find and click the main "Apply" button apply_btn = None apply_selectors = [ "button[aria-label*='Apply']", @@ -273,36 +409,83 @@ class LinkedInJobScraper: if apply_btn: break - if not apply_btn: - print(f" âš ī¸ No 'Apply' button found, skipping.") - continue + job_data = None + final_url = full_url - # Click "Apply" - print(f" → Clicking 'Apply' / 'Easy Apply' button...") - await self._human_click(page, apply_btn, wait_after=False) - await asyncio.sleep(4 * self.human_speed) # Wait for next page/form to load + if apply_btn: + print(" → Clicking 'Apply' / 'Easy Apply' button...") + + page_waiter = asyncio.create_task(context.wait_for_event("page")) + await self._human_click(page, apply_btn, wait_after=False) + + external_page = None + try: + external_page = await asyncio.wait_for(page_waiter, timeout=5.0) + print(" 🌐 External job site opened in new tab.") + await external_page.wait_for_load_state("load", timeout=30000) + await asyncio.sleep(2 * self.human_speed) + await self.engine._human_like_scroll(external_page) + await asyncio.sleep(2 * self.human_speed) + + job_data = await self._extract_job_details(external_page) + final_url = external_page.url + + if not external_page.is_closed(): + await external_page.close() + + except asyncio.TimeoutError: + print(" đŸ–Ĩī¸ No external tab — scraping LinkedIn job page.") + await page.wait_for_timeout(2000) + try: + await page.wait_for_selector("div.jobs-apply-button--fixed, div.jobs-easy-apply-modal", timeout=8000) + except: + pass + await self.engine._human_like_scroll(page) + await asyncio.sleep(2 * self.human_speed) + job_data = await self._extract_job_details(page) + final_url = page.url + else: + print(" âš ī¸ No 'Apply' button found — scraping job details directly.") + await self.engine._human_like_scroll(page) + await asyncio.sleep(2 * self.human_speed) + job_data = await self._extract_job_details(page) + final_url = page.url - # Now scrape WHATEVER page is displayed (Easy Apply form OR external site) - job_data = await self._extract_job_details(page) - if job_data["title"] == "N/A" and "linkedin.com" in page.url: - # On LinkedIn but no title → likely Easy Apply form; use job ID as title - job_id = full_url.split("/")[-2] if "/jobs/view/" in full_url else "unknown" - job_data["title"] = f"Easy Apply Job - ID {job_id}" + job_data["url"] = final_url - await self._save_to_db(job_data, search_keywords) - await self._save_to_markdown(job_data, search_keywords) - scraped_count += 1 - domain = "LinkedIn (Easy Apply)" if "linkedin.com" in page.url else "External Site" - print(f" ✅ Scraped ({domain}): {job_data['title'][:50]}...") + if job_data["title"] == "N/A" and "linkedin.com" in final_url: + job_id = final_url.split("/")[-2] if "/jobs/view/" in final_url else "unknown" + job_data["title"] = f"Easy Apply Job - ID {job_id}" + + is_meaningful = ( + job_data["title"] != "N/A" or + job_data["company"] != "N/A" or + (job_data["description"] != "N/A" and len(job_data["description"]) > 20) + ) + + if is_meaningful: + await self._save_to_db(job_data, search_keywords) + await self._save_to_markdown(job_data, search_keywords, verified=True) + scraped_count += 1 + print(f" ✅ Scraped (verified): {job_data['title'][:50]}...") + else: + await self._save_to_markdown(job_data, search_keywords, verified=False) + print(f" 🟡 Scraped (unverified): {final_url} — low-quality data") except Exception as e: print(f" âš ī¸ Failed on job {idx+1}: {str(e)[:100]}") + continue 
diff --git a/linkedin_main.py b/linkedin_main.py
index dc08dd4..5d3e277 100644
--- a/linkedin_main.py
+++ b/linkedin_main.py
@@ -10,13 +10,13 @@ async def main():
         target_os="windows",
         db_path="job_listings.db",
         markdown_path="job_listings.md",
-        search_keywords="Accountant",
+        search_keywords="Data Analyst"
     )
 
-    scraper = LinkedInJobScraper(engine, human_speed=1.2)
+    scraper = LinkedInJobScraper(engine, human_speed=1.6)
 
     await scraper.scrape_jobs(
-        search_keywords="Accountant",  # ← Your search terms
+        search_keywords="Data Analyst",  # ← Your search terms
         max_pages=3,
         credentials={
             "email": os.getenv("SCRAPING_USERNAME"),
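Reviewer note: os.getenv returns None when a variable is unset, and a dict of None values is still truthy, so scrape_jobs would attempt a login with empty fields instead of reaching its guest-mode branch. A guard along these lines (a sketch, not part of this diff) keeps the fallback reachable:

    email = os.getenv("SCRAPING_USERNAME")
    password = os.getenv("SCRAPING_PASSWORD")
    # Only pass credentials when both are configured; otherwise let
    # scrape_jobs fall through to its guest-mode branch.
    credentials = {"email": email, "password": password} if email and password else None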
diff --git a/scraping_engine.py b/scraping_engine.py
index ecc7eed..336d773 100644
--- a/scraping_engine.py
+++ b/scraping_engine.py
@@ -1,12 +1,15 @@
+
 # scraping_engine.py
 import asyncio
 import hashlib
 import random
 import os
+import json
 from typing import List, Optional, Dict
 from browserforge.fingerprints import FingerprintGenerator
 from dotenv import load_dotenv
+from config import load_spoof_config
 
 # Load environment variables
 load_dotenv()
@@ -27,7 +30,6 @@ class FingerprintScrapingEngine:
         if target_os not in ['windows', 'macos']:
             raise ValueError("operating_system must be 'windows' or 'macos'")
 
-        # Load credentials from .env if not provided
         if login_credentials is None:
             username = os.getenv("SCRAPING_USERNAME")
             password = os.getenv("SCRAPING_PASSWORD")
@@ -47,37 +49,99 @@ class FingerprintScrapingEngine:
         )
 
         self.num_variations = num_variations
-        self.common_renderers = {
-            'windows': [
-                "ANGLE (Intel, Intel(R) UHD Graphics 630 (0x00003E9B) Direct3D11 vs_5_0 ps_5_0, D3D11)",
-                "ANGLE (Intel, Intel(R) UHD Graphics (0x00009A49) Direct3D11 vs_5_0 ps_5_0, D3D11)",
-                "ANGLE (Intel(R) Iris(TM) Graphics 540 Direct3D11 vs_5_0 ps_5_0)",
-                "ANGLE (Intel, Intel(R) UHD Graphics 620 (0x00005916) Direct3D11 vs_5_0 ps_5_0, D3D11)",
-                "ANGLE (Intel, Intel(R) HD Graphics 530 (0x0000191B) Direct3D11 vs_5_0 ps_5_0, D3D11)",
-                "ANGLE (Intel, Intel(R) UHD Graphics 600 (0x00003180) Direct3D11 vs_5_0 ps_5_0, D3D11)",
-                "ANGLE (Intel, Intel(R) Iris(R) Xe Graphics (0x00009A49) Direct3D11 vs_5_0 ps_5_0, D3D11)",
-            ],
-            'macos': [
-                "Intel HD Graphics 530 OpenGL Engine",
-                "Intel Iris Graphics 6100 OpenGL Engine",
-                "Intel UHD Graphics 630 OpenGL Engine",
-                "Intel HD Graphics 4000 OpenGL Engine",
-                "Intel Iris Pro OpenGL Engine",
-                "Intel UHD Graphics 617 OpenGL Engine",
-            ]
-        }
-        self.common_vendors = ["Intel Inc.", "Intel", "Intel Corporation"]
+
+        # Load spoof config
+        spoof_config = load_spoof_config()
+        self.common_renderers = spoof_config["renderers"]
+        self.common_vendors = spoof_config["vendors"]
+
+        # Feedback system
+        self.feedback_file = f"feedback_{seed}.json"
+        self.feedback = self._load_feedback()
+
+        # ← NEW: Session persistence paths
+        self.session_dir = "browser_sessions"
+        os.makedirs(self.session_dir, exist_ok=True)
+        self.session_path = os.path.join(self.session_dir, f"{seed}_session.json")
+
+    def _load_feedback(self):
+        if os.path.exists(self.feedback_file):
+            try:
+                with open(self.feedback_file, "r") as f:
+                    data = json.load(f)
+                data.setdefault("success_rate", 1.0)
+                data.setdefault("captcha_count", 0)
+                data.setdefault("cloudflare_count", 0)
+                return data
+            except Exception:
+                pass
+        return {"success_rate": 1.0, "captcha_count": 0, "cloudflare_count": 0}
+
+    def save_feedback(self):
+        with open(self.feedback_file, "w") as f:
+            json.dump(self.feedback, f)
+
+    def report_outcome(self, outcome: str):
+        if outcome == "success":
+            self.feedback["success_rate"] = min(1.0, self.feedback["success_rate"] + 0.1)
+        else:
+            self.feedback["success_rate"] = max(0.1, self.feedback["success_rate"] - 0.2)
+        if outcome == "captcha":
+            self.feedback["captcha_count"] += 1
+        elif outcome == "cloudflare":
+            self.feedback["cloudflare_count"] += 1
+        self.save_feedback()
+
+    # ← NEW: Save browser context (cookies + localStorage)
+    async def save_session(self, context):
+        """Save authenticated session to disk tied to seed"""
+        try:
+            storage = await context.storage_state()
+            with open(self.session_path, "w", encoding="utf-8") as f:
+                json.dump(storage, f, indent=2)
+            print(f"💾 Session saved for seed '{self.seed}'")
+        except Exception as e:
+            print(f"⚠️ Failed to save session: {e}")
+
+    # ← NEW: Load session if exists
+    async def load_session(self, context):
+        """Restore session if available"""
+        if os.path.exists(self.session_path):
+            try:
+                with open(self.session_path, "r", encoding="utf-8") as f:
+                    storage = json.load(f)
+                await context.add_cookies(storage.get("cookies", []))
+                # Note: Playwright doesn't support localStorage restore via API directly,
+                # but cookies are the main auth carrier (e.g., li_at on LinkedIn)
+                print(f"🔁 Reusing session for seed '{self.seed}'")
+                return True
+            except Exception as e:
+                print(f"⚠️ Failed to load session: {e}")
+                # Optionally delete corrupted session
+                if os.path.exists(self.session_path):
+                    os.remove(self.session_path)
+        return False
 
     def _select_profile(self):
         seed_hash = int(hashlib.sha256(self.seed.encode()).hexdigest(), 16)
         random.seed(seed_hash)
         profile = self.fingerprint_generator.generate()
-        profile.navigator.hardwareConcurrency = random.choice([4, 8, 12, 16])
-        profile.navigator.deviceMemory = random.choice([4, 8])
+        concurrency_options = [4, 8, 12, 16]
+        memory_options = [4, 8]
+        if self.feedback["success_rate"] < 0.5:
+            concurrency_options = [8, 4]
+            memory_options = [8]
+        profile.navigator.hardwareConcurrency = random.choice(concurrency_options)
+        profile.navigator.deviceMemory = random.choice(memory_options)
         return profile
 
     def _get_spoof_script(self, renderer: str, vendor: str):
         seed_hash = int(hashlib.sha256(self.seed.encode()).hexdigest(), 16)
+        if self.feedback["captcha_count"] > 2:
+            noise_factor = seed_hash % 100000000 + 100000000
+        else:
+            noise_factor = seed_hash % 100000000
+
         return f"""
         (function() {{
             const originalGetContext = HTMLCanvasElement.prototype.getContext;
@@ -113,7 +177,7 @@ class FingerprintScrapingEngine:
                 if (ctx) {{
                     const imageData = ctx.getImageData(0, 0, this.width, this.height);
                     for (let i = 0; i < imageData.data.length; i += 4) {{
-                        const noise = (Math.sin({seed_hash % 100000000} + i) * 0.5 + 0.5) * 2 - 1;
+                        const noise = (Math.sin({noise_factor} + i) * 0.5 + 0.5) * 2 - 1;
                         imageData.data[i] = Math.min(255, Math.max(0, imageData.data[i] + noise));
                         imageData.data[i+1] = Math.min(255, Math.max(0, imageData.data[i+1] + noise));
                         imageData.data[i+2] = Math.min(255, Math.max(0, imageData.data[i+2] + noise));
@@ -184,7 +248,6 @@ class FingerprintScrapingEngine:
             pass
 
     async def _detect_cloudflare(self, page) -> bool:
-        """Detect Cloudflare challenge pages"""
         content = await page.content()
         return (
             "#cf-chl" in content or
@@ -193,7 +256,6 @@ class FingerprintScrapingEngine:
         )
 
     async def _handle_cloudflare(self, page, max_retries: int = 3):
-        """Wait for Cloudflare to resolve"""
         for i in range(max_retries):
             if not await self._detect_cloudflare(page):
                 return True
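Reviewer note: load_session() restores cookies via context.add_cookies() and notes that localStorage cannot be injected the same way. Playwright can restore the full storage state (cookies and localStorage) if a storage_state() dump is supplied when the context is created, which would mean loading the session before new_context() rather than after. A sketch under that assumption; the wrapper name is illustrative:

    import os

    async def new_context_with_session(browser, engine, **context_kwargs):
        """Create the context from the saved storage-state file when one exists."""
        if os.path.exists(engine.session_path):
            # storage_state accepts a path to a context.storage_state() JSON dump
            # and restores cookies and localStorage together.
            return await browser.new_context(storage_state=engine.session_path, **context_kwargs)
        return await browser.new_context(**context_kwargs)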