feat(scraping): enhance job scraping with session persistence and feedback system

- Add config module for spoof data management
- Implement session persistence to reuse authenticated sessions
- Add feedback system to track success rates and adjust fingerprinting
- Improve job link collection with pagination and scroll detection
- Separate verified/unverified job listings into different folders
- Enhance error handling for CAPTCHA and Cloudflare challenges
Ofure Ikheloa 2025-11-21 16:51:26 +01:00
parent 68495a0a54
commit 458e914d71
4 changed files with 377 additions and 93 deletions

config.py (new file, +39 lines)

@@ -0,0 +1,39 @@
import os
import json
def load_spoof_config():
"""Load spoof data from JSON config file. Falls back to defaults if missing."""
config_path = os.path.join(os.path.dirname(__file__), "spoof_config.json")
if os.path.exists(config_path):
with open(config_path, "r", encoding="utf-8") as f:
return json.load(f)
else:
# Generate default config file on first run
default_config = {
"renderers": {
"windows": [
"ANGLE (Intel, Intel(R) UHD Graphics 630 (0x00003E9B) Direct3D11 vs_5_0 ps_5_0, D3D11)",
"ANGLE (Intel, Intel(R) UHD Graphics (0x00009A49) Direct3D11 vs_5_0 ps_5_0, D3D11)",
"ANGLE (Intel(R) Iris(TM) Graphics 540 Direct3D11 vs_5_0 ps_5_0)",
"ANGLE (Intel, Intel(R) UHD Graphics 620 (0x00005916) Direct3D11 vs_5_0 ps_5_0, D3D11)",
"ANGLE (Intel, Intel(R) HD Graphics 530 (0x0000191B) Direct3D11 vs_5_0 ps_5_0, D3D11)",
"ANGLE (Intel, Intel(R) UHD Graphics 600 (0x00003180) Direct3D11 vs_5_0 ps_5_0, D3D11)",
"ANGLE (Intel, Intel(R) Iris(R) Xe Graphics (0x00009A49) Direct3D11 vs_5_0 ps_5_0, D3D11)",
],
"macos": [
"Intel HD Graphics 530 OpenGL Engine",
"Intel Iris Graphics 6100 OpenGL Engine",
"Intel UHD Graphics 630 OpenGL Engine",
"Intel HD Graphics 4000 OpenGL Engine",
"Intel Iris Pro OpenGL Engine",
"Intel UHD Graphics 617 OpenGL Engine",
]
},
"vendors": ["Intel Inc.", "Intel", "Intel Corporation"]
}
with open(config_path, "w", encoding="utf-8") as f:
json.dump(default_config, f, indent=2)
return default_config
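
For reference, a minimal usage sketch of this loader (scraping_engine.py further down in this diff consumes it the same way; the variable names here are illustrative only):

from config import load_spoof_config

cfg = load_spoof_config()   # first run writes spoof_config.json with the defaults above; later runs read it back
renderer = cfg["renderers"]["windows"][0]   # one of the ANGLE strings listed above
vendor = cfg["vendors"][0]                  # e.g. "Intel Inc."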


@@ -1,4 +1,5 @@
import asyncio
import random
import sqlite3
@@ -114,24 +115,22 @@ class LinkedInJobScraper:
pass
return "N/A"
# Try multiple strategies for each field
title = await get_text("h1.t-24") # LinkedIn
title = await get_text("h1.t-24")
if title == "N/A":
title = await get_text("h1, h2") # External
title = await get_text("h1, h2")
company = await get_text("a.app-aware-link[href*='/company/']") # LinkedIn
company = await get_text("a.app-aware-link[href*='/company/']")
if company == "N/A":
company = await get_text("div.org, .company, [class*='company']") # External
company = await get_text("div.org, .company, [class*='company']")
location = await get_text("span[class*='location']") # LinkedIn
location = await get_text("span[class*='location']")
if location == "N/A":
location = await get_text(".location, [class*='location']")
description = await get_text("div[class*='description__text']") # LinkedIn
description = await get_text("div[class*='description__text']")
if description == "N/A":
description = await get_text(".job-desc, .description, main, body")
# Workplace & salary — LinkedIn only (external may not have)
workplace = await get_text("span.job-workplace-type") or "N/A"
salary = await get_text("span.salary") or "N/A"
@@ -145,19 +144,39 @@ class LinkedInJobScraper:
"url": page.url
}
async def _save_to_markdown(self, job_data: Dict, keyword: str):
os.makedirs("linkedin_jobs", exist_ok=True)
clean_keyword = keyword.replace(" ", "_")
filename = f"linkedin_{clean_keyword}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md"
filepath = os.path.join("linkedin_jobs", filename)
async def _save_to_markdown(self, job_data: Dict, keyword: str, verified: bool=True):
"""Save to appropriate folder using job ID to avoid duplication"""
folder = "linkedin_jobs" if verified else "linkedin_jobs_unverified"
os.makedirs(folder, exist_ok=True)
# Extract job ID from URL for LinkedIn jobs
url = job_data.get("url", "")
if "/jobs/view/" in url:
try:
job_id = url.split("/view/")[1].split("/")[0]
except Exception:
job_id = "unknown"
else:
# For external jobs, use a hash of the URL (first 12 chars)
import hashlib
job_id = hashlib.md5(url.encode()).hexdigest()[:12]
clean_keyword = keyword.replace(" ", "_")
filename = f"linkedin_{clean_keyword}_job_{job_id}.md"
filepath = os.path.join(folder, filename)
# Only save if file doesn't already exist (idempotent)
if os.path.exists(filepath):
print(f" 📝 Skipping duplicate Markdown file: {filename}")
return
with open(filepath, "w", encoding="utf-8") as f:
f.write(f"# {job_data['title']}\n\n")
f.write(f"- **Company**: {job_data['company']}\n")
f.write(f"- **Location**: {job_data['location']}\n")
f.write(f"- **Workplace**: {job_data['workplace_type']}\n")
f.write(f"- **Salary**: {job_data['salary']}\n")
f.write(f"- **URL**: <{job_data['url']}>\n\n")
f.write(f"- **URL**: <{url}>\n\n")
f.write(f"## Description\n\n{job_data['description']}\n")
async def _save_to_db(self, job_data: Dict, keyword: str):
@@ -208,59 +227,176 @@ class LinkedInJobScraper:
await context.add_init_script(spoof_script)
page = await context.new_page()
if credentials:
print("🔐 Attempting LinkedIn login...")
if not await self._login(page, credentials):
session_loaded = await self.engine.load_session(context)
login_successful = False
if session_loaded:
print("🔁 Using saved session — verifying login...")
await page.goto("https://www.linkedin.com/feed/", timeout=60000)
if "feed" in page.url and "login" not in page.url:
print("✅ Session still valid.")
login_successful = True
else:
print("⚠️ Saved session expired — re-authenticating.")
session_loaded = False
if not session_loaded and credentials:
print("🔐 Performing fresh login...")
login_successful = await self._login(page, credentials)
if login_successful:
await self.engine.save_session(context)
else:
print("❌ Login failed. Exiting.")
await browser.close()
self.engine.report_outcome("block")
return
else:
elif not credentials:
print(" No credentials — proceeding as guest.")
login_successful = True
else:
pass
await page.wait_for_load_state("load", timeout=60000)
print("✅ Post-login page fully loaded. Starting search...")
if await self.engine._detect_cloudflare(page):
print("☁️ Cloudflare detected on initial load.")
if not await self.engine._handle_cloudflare(page):
print("❌ Cloudflare could not be resolved.")
await browser.close()
self.engine.report_outcome("cloudflare")
return
print(f"🔍 Searching for: {search_keywords}")
await page.goto(search_url, wait_until='load', timeout=60000)
await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed)
if await self.engine._detect_cloudflare(page):
print("☁️ Cloudflare detected on search page.")
if not await self.engine._handle_cloudflare(page):
await browser.close()
self.engine.report_outcome("cloudflare")
return
scraped_count = 0
all_job_links = []
seen_job_ids = set()
# Collect job links
for page_num in range(1, max_pages + 1):
print(f"📄 Collecting job links from page {page_num}/{max_pages}")
for _ in range(50):
links = await page.query_selector_all("a[href*='/jobs/view/']")
if links:
for link in links:
href = await link.get_attribute("href")
if href and href not in all_job_links:
# ← NEW: Scroll once to reveal pagination (if any)
print("🔄 Scrolling to bottom to reveal pagination controls...")
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)
# Check if pagination exists
pagination_exists = await page.query_selector("button[aria-label='Next']")
if pagination_exists:
print("⏭️ Pagination detected. Using page navigation.")
current_page = 1
while current_page <= max_pages:
print(f"📄 Processing page {current_page}/{max_pages}")
# Collect job links on current page
current_links = await page.query_selector_all("a[href*='/jobs/view/']")
new_jobs = 0
for link in current_links:
href = await link.get_attribute("href")
if href:
job_id = href.split("/view/")[-1].split("/")[0] if "/view/" in href else href
if job_id and job_id not in seen_job_ids:
seen_job_ids.add(job_id)
all_job_links.append(href)
break
await asyncio.sleep(1)
print(f" Found {len(links) if 'links' in locals() else 0} new job links.")
if page_num < max_pages:
next_btn = await page.query_selector("button[aria-label='Next']")
if next_btn and await next_btn.is_enabled():
await self._human_click(page, next_btn)
await asyncio.sleep(4 * self.human_speed)
new_jobs += 1
print(f" Found {new_jobs} new job(s) on page {current_page} (total: {len(all_job_links)})")
# Try to go to next page
if current_page < max_pages:
next_btn = await page.query_selector("button[aria-label='Next']")
if next_btn and await next_btn.is_enabled():
await self._human_click(page, next_btn)
await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed)
# Wait for URL to change or new content
try:
await page.wait_for_function("() => window.location.href.includes('start=')", timeout=30000)
except Exception:
pass
else:
print("🔚 'Next' button not available — stopping pagination.")
break
current_page += 1
else:
print("🔄 No pagination found. Falling back to infinite scroll...")
last_height = await page.evaluate("document.body.scrollHeight")
no_new_jobs_count = 0
max_no_new = 3
while no_new_jobs_count < max_no_new:
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)
current_links = await page.query_selector_all("a[href*='/jobs/view/']")
new_jobs_found = 0
for link in current_links:
href = await link.get_attribute("href")
if href:
job_id = href.split("/view/")[-1].split("/")[0] if "/view/" in href else href
if job_id and job_id not in seen_job_ids:
seen_job_ids.add(job_id)
all_job_links.append(href)
new_jobs_found += 1
print(f" Found {new_jobs_found} new job(s) (total: {len(all_job_links)})")
new_height = await page.evaluate("document.body.scrollHeight")
if new_height == last_height:
no_new_jobs_count += 1
else:
print("🔚 No next page.")
no_new_jobs_count = 0
last_height = new_height
if new_jobs_found == 0 and no_new_jobs_count >= 1:
print("🔚 No new jobs loaded. Stopping scroll.")
break
# Process each job
print(f"✅ Collected {len(all_job_links)} unique job links.")
# ← Rest of job processing loop unchanged
scraped_count = 0
for idx, href in enumerate(all_job_links):
try:
full_url = href if href.startswith("http") else f"https://www.linkedin.com{href}"
print(f" → Opening job {idx+1}: {full_url}")
print(f" → Opening job {idx+1}/{len(all_job_links)}: {full_url}")
await page.goto(full_url, wait_until='load', timeout=60000)
await asyncio.sleep(3 * self.human_speed)
if not await page.query_selector("h1.t-24"):
print(f" ⚠️ Invalid job page, skipping.")
continue
is_cloudflare = await self.engine._detect_cloudflare(page)
page_content = await page.content()
has_captcha_text = "captcha" in page_content.lower()
captcha_present = is_cloudflare or has_captcha_text
title_element = await page.query_selector("h1.t-24")
job_data_accessible = title_element is not None
if captcha_present:
if job_data_accessible:
print(" ⚠️ CAPTCHA detected, but job data is accessible. Proceeding in stealth mode...")
await self.engine._avoid_captcha(page)
else:
print(" ⚠️ CAPTCHA detected and job data blocked. Attempting recovery...")
if not await self.engine._solve_captcha_fallback(page):
print(" ❌ CAPTCHA recovery failed. Skipping job.")
continue
title_element = await page.query_selector("h1.t-24")
if not title_element:
print(" ❌ Job data still unavailable after CAPTCHA handling. Skipping.")
continue
if not captcha_present:
await self.engine._avoid_captcha(page)
# Find and click the main "Apply" button
apply_btn = None
apply_selectors = [
"button[aria-label*='Apply']",
@@ -273,36 +409,83 @@ class LinkedInJobScraper:
if apply_btn:
break
if not apply_btn:
print(f" ⚠️ No 'Apply' button found, skipping.")
continue
job_data = None
final_url = full_url
# Click "Apply"
print(f" → Clicking 'Apply' / 'Easy Apply' button...")
await self._human_click(page, apply_btn, wait_after=False)
await asyncio.sleep(4 * self.human_speed) # Wait for next page/form to load
if apply_btn:
print(" → Clicking 'Apply' / 'Easy Apply' button...")
page_waiter = asyncio.create_task(context.wait_for_event("page"))
await self._human_click(page, apply_btn, wait_after=False)
external_page = None
try:
external_page = await asyncio.wait_for(page_waiter, timeout=5.0)
print(" 🌐 External job site opened in new tab.")
await external_page.wait_for_load_state("load", timeout=30000)
await asyncio.sleep(2 * self.human_speed)
await self.engine._human_like_scroll(external_page)
await asyncio.sleep(2 * self.human_speed)
job_data = await self._extract_job_details(external_page)
final_url = external_page.url
if not external_page.is_closed():
await external_page.close()
except asyncio.TimeoutError:
print(" 🖥️ No external tab — scraping LinkedIn job page.")
await page.wait_for_timeout(2000)
try:
await page.wait_for_selector("div.jobs-apply-button--fixed, div.jobs-easy-apply-modal", timeout=8000)
except Exception:
pass
await self.engine._human_like_scroll(page)
await asyncio.sleep(2 * self.human_speed)
job_data = await self._extract_job_details(page)
final_url = page.url
else:
print(" ⚠️ No 'Apply' button found — scraping job details directly.")
await self.engine._human_like_scroll(page)
await asyncio.sleep(2 * self.human_speed)
job_data = await self._extract_job_details(page)
final_url = page.url
# Now scrape WHATEVER page is displayed (Easy Apply form OR external site)
job_data = await self._extract_job_details(page)
if job_data["title"] == "N/A" and "linkedin.com" in page.url:
# On LinkedIn but no title → likely Easy Apply form; use job ID as title
job_id = full_url.split("/")[-2] if "/jobs/view/" in full_url else "unknown"
job_data["title"] = f"Easy Apply Job - ID {job_id}"
job_data["url"] = final_url
await self._save_to_db(job_data, search_keywords)
await self._save_to_markdown(job_data, search_keywords)
scraped_count += 1
domain = "LinkedIn (Easy Apply)" if "linkedin.com" in page.url else "External Site"
print(f" ✅ Scraped ({domain}): {job_data['title'][:50]}...")
if job_data["title"] == "N/A" and "linkedin.com" in final_url:
job_id = final_url.split("/")[-2] if "/jobs/view/" in final_url else "unknown"
job_data["title"] = f"Easy Apply Job - ID {job_id}"
is_meaningful = (
job_data["title"] != "N/A" or
job_data["company"] != "N/A" or
(job_data["description"] != "N/A" and len(job_data["description"]) > 20)
)
if is_meaningful:
await self._save_to_db(job_data, search_keywords)
await self._save_to_markdown(job_data, search_keywords, verified=True)
scraped_count += 1
print(f" ✅ Scraped (verified): {job_data['title'][:50]}...")
else:
await self._save_to_markdown(job_data, search_keywords, verified=False)
print(f" 🟡 Scraped (unverified): {final_url} — low-quality data")
except Exception as e:
print(f" ⚠️ Failed on job {idx+1}: {str(e)[:100]}")
continue
finally:
# Return to search results
print(" ↩️ Returning to LinkedIn search results...")
await page.goto(search_url, timeout=60000)
await asyncio.sleep(4 * self.human_speed)
await browser.close()
print(f"✅ Completed! Scraped {scraped_count} job pages (internal + external) for '{search_keywords}'.")
if scraped_count > 0:
self.engine.report_outcome("success")
print(f"✅ Completed! Saved {scraped_count} verified + additional unverified jobs for '{search_keywords}'.")
else:
self.engine.report_outcome("captcha")
print("⚠️ No verified jobs scraped — check 'linkedin_jobs_unverified' for raw outputs.")


@@ -10,13 +10,13 @@ async def main():
target_os="windows",
db_path="job_listings.db",
markdown_path="job_listings.md",
search_keywords="Accountant",
search_keywords="Data Anaylst"
)
scraper = LinkedInJobScraper(engine, human_speed=1.2)
scraper = LinkedInJobScraper(engine, human_speed=1.6)
await scraper.scrape_jobs(
search_keywords="Accountant", # ← Your search terms
search_keywords="Data Anaylst", # ← Your search terms
max_pages=3,
credentials={
"email": os.getenv("SCRAPING_USERNAME"),


@@ -1,12 +1,15 @@
# scraping_engine.py
import asyncio
import hashlib
import random
import os
import json
from typing import List, Optional, Dict
from browserforge.fingerprints import FingerprintGenerator
from dotenv import load_dotenv
from config import load_spoof_config
# Load environment variables
load_dotenv()
@@ -27,7 +30,6 @@ class FingerprintScrapingEngine:
if target_os not in ['windows', 'macos']:
raise ValueError("operating_system must be 'windows' or 'macos'")
# Load credentials from .env if not provided
if login_credentials is None:
username = os.getenv("SCRAPING_USERNAME")
password = os.getenv("SCRAPING_PASSWORD")
@@ -47,37 +49,99 @@ class FingerprintScrapingEngine:
)
self.num_variations = num_variations
self.common_renderers = {
'windows': [
"ANGLE (Intel, Intel(R) UHD Graphics 630 (0x00003E9B) Direct3D11 vs_5_0 ps_5_0, D3D11)",
"ANGLE (Intel, Intel(R) UHD Graphics (0x00009A49) Direct3D11 vs_5_0 ps_5_0, D3D11)",
"ANGLE (Intel(R) Iris(TM) Graphics 540 Direct3D11 vs_5_0 ps_5_0)",
"ANGLE (Intel, Intel(R) UHD Graphics 620 (0x00005916) Direct3D11 vs_5_0 ps_5_0, D3D11)",
"ANGLE (Intel, Intel(R) HD Graphics 530 (0x0000191B) Direct3D11 vs_5_0 ps_5_0, D3D11)",
"ANGLE (Intel, Intel(R) UHD Graphics 600 (0x00003180) Direct3D11 vs_5_0 ps_5_0, D3D11)",
"ANGLE (Intel, Intel(R) Iris(R) Xe Graphics (0x00009A49) Direct3D11 vs_5_0 ps_5_0, D3D11)",
],
'macos': [
"Intel HD Graphics 530 OpenGL Engine",
"Intel Iris Graphics 6100 OpenGL Engine",
"Intel UHD Graphics 630 OpenGL Engine",
"Intel HD Graphics 4000 OpenGL Engine",
"Intel Iris Pro OpenGL Engine",
"Intel UHD Graphics 617 OpenGL Engine",
]
}
self.common_vendors = ["Intel Inc.", "Intel", "Intel Corporation"]
# Load spoof config
spoof_config = load_spoof_config()
self.common_renderers = spoof_config["renderers"]
self.common_vendors = spoof_config["vendors"]
# Feedback system
self.feedback_file = f"feedback_{seed}.json"
self.feedback = self._load_feedback()
# ← NEW: Session persistence paths
self.session_dir = "browser_sessions"
os.makedirs(self.session_dir, exist_ok=True)
self.session_path = os.path.join(self.session_dir, f"{seed}_session.json")
def _load_feedback(self):
if os.path.exists(self.feedback_file):
try:
with open(self.feedback_file, "r") as f:
data = json.load(f)
data.setdefault("success_rate", 1.0)
data.setdefault("captcha_count", 0)
data.setdefault("cloudflare_count", 0)
return data
except (json.JSONDecodeError, OSError):
pass
return {"success_rate": 1.0, "captcha_count": 0, "cloudflare_count": 0}
def save_feedback(self):
with open(self.feedback_file, "w") as f:
json.dump(self.feedback, f)
def report_outcome(self, outcome: str):
if outcome == "success":
self.feedback["success_rate"] = min(1.0, self.feedback["success_rate"] + 0.1)
else:
self.feedback["success_rate"] = max(0.1, self.feedback["success_rate"] - 0.2)
if outcome == "captcha":
self.feedback["captcha_count"] += 1
elif outcome == "cloudflare":
self.feedback["cloudflare_count"] += 1
self.save_feedback()
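# Worked example (illustrative, not part of this commit): starting from the default
# success_rate of 1.0, the clamped updates above play out as
#   report_outcome("captcha")     -> success_rate 0.8, captcha_count 1
#   report_outcome("cloudflare")  -> success_rate 0.6, cloudflare_count 1
#   report_outcome("success")     -> success_rate 0.7
# Once success_rate drops below 0.5, _select_profile() narrows its hardware choices,
# and after more than two captchas _get_spoof_script() shifts the canvas noise factor.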
# ← NEW: Save browser context (cookies + localStorage)
async def save_session(self, context):
"""Save authenticated session to disk tied to seed"""
try:
storage = await context.storage_state()
with open(self.session_path, "w", encoding="utf-8") as f:
json.dump(storage, f, indent=2)
print(f"💾 Session saved for seed '{self.seed}'")
except Exception as e:
print(f"⚠️ Failed to save session: {e}")
# ← NEW: Load session if exists
async def load_session(self, context):
"""Restore session if available"""
if os.path.exists(self.session_path):
try:
with open(self.session_path, "r", encoding="utf-8") as f:
storage = json.load(f)
await context.add_cookies(storage.get("cookies", []))
# Note: Playwright doesn't support localStorage restore via API directly,
# but cookies are the main auth carrier (e.g., li_at on LinkedIn)
print(f"🔁 Reusing session for seed '{self.seed}'")
return True
except Exception as e:
print(f"⚠️ Failed to load session: {e}")
# Optionally delete corrupted session
if os.path.exists(self.session_path):
os.remove(self.session_path)
return False
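# Side note (illustrative, not part of this commit): cookies carry the LinkedIn auth
# token, so add_cookies() above is usually enough. If a full restore (cookies plus
# localStorage) is ever needed, Playwright can rebuild it at context-creation time
# from the same file that save_session() writes, e.g.:
#   context = await browser.new_context(storage_state=self.session_path)
# (assumes a freshly launched browser object; shown only as a sketch).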
def _select_profile(self):
seed_hash = int(hashlib.sha256(self.seed.encode()).hexdigest(), 16)
random.seed(seed_hash)
profile = self.fingerprint_generator.generate()
profile.navigator.hardwareConcurrency = random.choice([4, 8, 12, 16])
profile.navigator.deviceMemory = random.choice([4, 8])
concurrency_options = [4, 8, 12, 16]
memory_options = [4, 8]
if self.feedback["success_rate"] < 0.5:
concurrency_options = [8, 4]
memory_options = [8]
profile.navigator.hardwareConcurrency = random.choice(concurrency_options)
profile.navigator.deviceMemory = random.choice(memory_options)
return profile
def _get_spoof_script(self, renderer: str, vendor: str):
seed_hash = int(hashlib.sha256(self.seed.encode()).hexdigest(), 16)
if self.feedback["captcha_count"] > 2:
noise_factor = seed_hash % 100000000 + 100000000
else:
noise_factor = seed_hash % 100000000
return f"""
(function() {{
const originalGetContext = HTMLCanvasElement.prototype.getContext;
@@ -113,7 +177,7 @@ class FingerprintScrapingEngine:
if (ctx) {{
const imageData = ctx.getImageData(0, 0, this.width, this.height);
for (let i = 0; i < imageData.data.length; i += 4) {{
const noise = (Math.sin({seed_hash % 100000000} + i) * 0.5 + 0.5) * 2 - 1;
const noise = (Math.sin({noise_factor} + i) * 0.5 + 0.5) * 2 - 1;
imageData.data[i] = Math.min(255, Math.max(0, imageData.data[i] + noise));
imageData.data[i+1] = Math.min(255, Math.max(0, imageData.data[i+1] + noise));
imageData.data[i+2] = Math.min(255, Math.max(0, imageData.data[i+2] + noise));
@@ -184,7 +248,6 @@ class FingerprintScrapingEngine:
pass
async def _detect_cloudflare(self, page) -> bool:
"""Detect Cloudflare challenge pages"""
content = await page.content()
return (
"#cf-chl" in content or
@@ -193,7 +256,6 @@
)
async def _handle_cloudflare(self, page, max_retries: int = 3):
"""Wait for Cloudflare to resolve"""
for i in range(max_retries):
if not await self._detect_cloudflare(page):
return True