fix(job_scraper): increase timeout values for page navigation
The previous timeout values were too short for slower network conditions, causing premature timeouts during job scraping. Increased wait_for_function timeout from 30s to 80s and load_state timeout from 30s to 60s to accommodate slower page loads.
This commit is contained in:
parent d025828036
commit d7d92ba8bb
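A minimal sketch of the kind of change the message describes, assuming Playwright's async Python API. The helper name is hypothetical and the 80s/60s figures are taken from the commit message above, not from verified call sites; the real call sites live in job_scraper2.py below.

from playwright.async_api import TimeoutError as PlaywrightTimeoutError

async def wait_for_search_results(page):
    # load_state timeout raised to 60s per the commit message
    await page.wait_for_load_state("load", timeout=60_000)
    try:
        # wait_for_function timeout raised to 80s per the commit message;
        # the predicate mirrors the pagination check used in job_scraper2.py
        await page.wait_for_function(
            "() => window.location.href.includes('start=')", timeout=80_000
        )
    except PlaywrightTimeoutError:
        # a slow page no longer aborts the scrape outright
        pass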
@@ -8,9 +8,9 @@ from dotenv import load_dotenv
 load_dotenv()

 # LLM Agent Configuration
-GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
+GEMINI_API_KEY = os.getenv("XAI_API_KEY")
 if not GEMINI_API_KEY:
-    raise ValueError("GEMINI_API_KEY environment variable not set in .env file")
+    raise ValueError("XAI_API_KEY environment variable not set in .env file")


 def load_spoof_config():
fetcher.py (new file, 125 lines)
@@ -0,0 +1,125 @@
import asyncio
import random
import time
from playwright.async_api import Page, BrowserContext, Browser, TimeoutError as PlaywrightTimeoutError
from typing import Optional
from scraping_engine import FingerprintScrapingEngine


class StealthyFetcher:
    def __init__(self, engine: FingerprintScrapingEngine, browser: Browser, context: BrowserContext):
        self.engine = engine
        self.browser = browser
        self.context = context
        self.max_retries = 5
        self.base_delay = 5

    async def fetch_url(self, url: str, wait_for_selector: Optional[str] = None) -> Optional[Page]:
        """
        Fetch a URL using stealth techniques, handling Cloudflare and other protections intelligently.
        """
        for attempt in range(self.max_retries):
            try:
                print(f"Attempt {attempt + 1} to fetch {url}")
                page = await self.context.new_page()

                await page.goto(url, wait_until='load', timeout=60000)

                if wait_for_selector:
                    try:
                        await page.wait_for_selector(wait_for_selector, timeout=10000)
                    except PlaywrightTimeoutError:
                        print(f"Selector {wait_for_selector} not found immediately, continuing...")

                await self._apply_human_behavior(page)

                protection_type = await self._detect_protection(page)
                if protection_type:
                    print(f"🛡️ Protection detected: {protection_type}")
                    content_accessible = await self._is_content_accessible(page, wait_for_selector)
                    if not content_accessible:
                        print("🔒 Content not accessible due to protection.")
                        handled = False
                        if protection_type == "cloudflare":
                            handled = await self._handle_cloudflare(page)
                        elif protection_type == "captcha":
                            handled = await self._handle_captcha(page)
                        if not handled:
                            print("❌ Failed to handle protection.")
                            await page.close()
                            await asyncio.sleep(self.base_delay * (2 ** attempt))
                            continue
                    else:
                        print("✅ Protection present but content is accessible — proceeding.")

                print(f"✅ Successfully fetched {url}")
                return page

            except Exception as e:
                print(f"Attempt {attempt + 1} failed for {url}: {str(e)}")
                if 'page' in locals():
                    await page.close()
                await asyncio.sleep(self.base_delay * (2 ** attempt))

        print(f"❌ Failed to fetch {url} after {self.max_retries} attempts.")
        return None

    async def _apply_human_behavior(self, page: Page):
        await self.engine._human_like_scroll(page)
        await asyncio.sleep(random.uniform(1, 3))
        await self.engine._simulate_human_interaction(page)
        await asyncio.sleep(random.uniform(1, 2))

    async def _detect_protection(self, page: Page) -> Optional[str]:
        content = (await page.content()).lower()
        if (
            "#cf-chl" in content
            or "checking your browser" in content
            or "just a moment" in content
            or "cloudflare" in content
            or "ddos protection" in content
            or "turnstile" in content
        ):
            return "cloudflare"
        elif "captcha" in content or "robot" in content or "verify you're human" in content:
            return "captcha"
        return None

    async def _is_content_accessible(self, page: Page, wait_for_selector: Optional[str] = None) -> bool:
        if wait_for_selector:
            try:
                await page.wait_for_selector(wait_for_selector, timeout=5000)
                return True
            except PlaywrightTimeoutError:
                pass
        try:
            body_text = await page.eval_on_selector("body", "el => el.innerText.toLowerCase()")
            return len(body_text.strip()) > 200
        except:
            return False

    async def _handle_captcha(self, page: Page) -> bool:
        print("🦾 Using 'avoid' strategy for captcha — skipping page.")
        return False

    async def _handle_cloudflare(self, page: Page) -> bool:
        max_wait_time = 60
        start_time = time.time()

        while time.time() - start_time < max_wait_time:
            if not await self._detect_protection(page):
                print("☁️ Cloudflare challenge resolved.")
                return True

            print("☁️ Cloudflare active, waiting...")
            await self._apply_human_behavior(page)
            wait_time = min(10, 2 + random.uniform(1, 3) + (time.time() - start_time) * 0.1)
            await asyncio.sleep(wait_time)

            if (time.time() - start_time) > 15 and (time.time() - start_time) % 20 < 2:
                print("🔄 Reloading page during Cloudflare wait...")
                await page.reload(wait_until='load', timeout=30000)

        print("⏰ Timeout waiting for Cloudflare resolution.")
        return False
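For orientation, a hedged usage sketch of the new class. The engine/browser/context wiring mirrors how job_scraper2.py calls it further down; the URL is illustrative, while the "h1.t-24" selector is the one the scraper actually passes.

async def fetch_one(engine, browser, context):
    # Assumed wiring: engine is a FingerprintScrapingEngine, browser/context come
    # from playwright + browserforge's AsyncNewContext, as in job_scraper2.py.
    fetcher = StealthyFetcher(engine, browser, context)
    job_page = await fetcher.fetch_url(
        "https://www.linkedin.com/jobs/view/123456/",  # hypothetical URL
        wait_for_selector="h1.t-24",
    )
    if job_page:                     # None means every retry failed
        html = await job_page.content()
        await job_page.close()
        return html
    return None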
job_scraper2.py (188 lines changed)
@@ -1,14 +1,13 @@
-
 import asyncio
 import random
 import sqlite3
 import os
-from datetime import datetime
-from typing import Optional, Dict, List
-from playwright.async_api import async_playwright
+from typing import Optional, Dict
+from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
 from browserforge.injectors.playwright import AsyncNewContext
 from llm_agent import LLMJobRefiner
 import re
+from fetcher import StealthyFetcher


 class LinkedInJobScraper:
@@ -17,12 +16,12 @@ class LinkedInJobScraper:
         engine,
         db_path: str = "linkedin_jobs.db",
         human_speed: float = 1.0,
-        target_field: str = "all"
+        user_request: str = "Extract all standard job details"
     ):
         self.engine = engine
         self.db_path = db_path
         self.human_speed = human_speed
-        self.target_field = target_field
+        self.user_request = user_request
         self._init_db()
         self.llm_agent = LLMJobRefiner()

@@ -61,7 +60,6 @@ class LinkedInJobScraper:
         return False

     async def _login(self, page, credentials: Dict) -> bool:
-        """Human-realistic LinkedIn login"""
         print("🔐 Navigating to LinkedIn login page...")
         await page.goto("https://www.linkedin.com/login", timeout=60000)
         await asyncio.sleep(random.uniform(2.0, 3.5) * self.human_speed)
@@ -107,45 +105,27 @@ class LinkedInJobScraper:
         return False

     async def _extract_all_page_content(self, page) -> str:
-        """Extract all content from the job page"""
         await asyncio.sleep(2 * self.human_speed)

-        # Human-like scrolling to load all content
         await self.engine._human_like_scroll(page)
         await asyncio.sleep(2 * self.human_speed)

-        # Get the full page content
         page_content = await page.content()
         return page_content

     def _calculate_keyword_match(self, title: str, keywords: str) -> float:
-        """Calculate percentage of keywords matched in title"""
         if not title or not keywords:
             return 0.0

         title_lower = title.lower()
         keyword_list = [kw.strip().lower() for kw in keywords.split()]
-        matches = 0
-        for keyword in keyword_list:
-            if keyword in title_lower:
-                matches += 1
+        matches = sum(1 for kw in keyword_list if kw in title_lower)

         return matches / len(keyword_list) if keyword_list else 0.0

     def _extract_location_from_keywords(self, search_keywords: str) -> str:
-        """Extract location from search keywords if present"""
         location_match = re.search(r'location:\s*([^,]+)', search_keywords, re.IGNORECASE)
-        if location_match:
-            return location_match.group(1).strip().lower()
-        return ""
+        return location_match.group(1).strip().lower() if location_match else ""

     async def _scrape_jobs_from_current_page(self, page, search_keywords: str, seen_job_ids, all_job_links):
-        """Scrape job links from the current page that match keywords and location"""
         current_links = await page.query_selector_all("a[href*='/jobs/view/']")
         new_jobs = 0

-        # Extract location from search keywords
         location_from_keywords = self._extract_location_from_keywords(search_keywords)

         for link in current_links:
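A short worked example of the 70% keyword threshold used below. The scraper instance and the title strings are hypothetical; the 0.7 cutoff and the word-by-word matching come from the code above.

# keywords "Web Designer" -> ["web", "designer"]
# title "Senior Web Designer (Remote)": both keywords appear, 2 / 2 = 1.0 >= 0.7 -> kept
# title "Graphic Designer": only "designer" appears, 1 / 2 = 0.5 < 0.7 -> skipped
score = scraper._calculate_keyword_match("Senior Web Designer (Remote)", "Web Designer")
assert score == 1.0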
@@ -155,22 +135,18 @@ class LinkedInJobScraper:
            job_id = href.split("/view/")[-1].split("/")[0] if "/view/" in href else href

            if job_id and job_id not in seen_job_ids:
-                # Check if job title matches keywords (at least 70% match)
                title_element = await link.query_selector("span.job-title, h3, .job-card-title")
                if title_element:
                    title = await title_element.inner_text()
                    match_percentage = self._calculate_keyword_match(title, search_keywords)

-                    # Check if location matches (if specified in keywords)
                    location_match = True
                    if location_from_keywords:
-                        # Try to get location from the job card
                        location_element = await link.query_selector("span.job-location, .job-card-location, .location")
                        if location_element:
                            location_text = await location_element.inner_text()
                            location_match = location_from_keywords in location_text.lower()

-                    if match_percentage >= 0.7 and location_match:  # At least 70% match and location matches
+                    if match_percentage >= 0.7 and location_match:
                        seen_job_ids.add(job_id)
                        all_job_links.append((href, title))
                        new_jobs += 1
@@ -179,28 +155,22 @@ class LinkedInJobScraper:
                    elif not location_match:
                        print(f" ⚠️ Skipping job due to location mismatch: {title[:50]}... (expected: {location_from_keywords})")
                else:
-                    # If no title element, still add to check later
                    seen_job_ids.add(job_id)
                    all_job_links.append((href, "Unknown Title"))
                    new_jobs += 1
        return new_jobs

    async def _handle_pagination(self, page, search_keywords: str, seen_job_ids, all_job_links):
-        """Handle pagination by going through pages"""
        current_page = 1
        while True:
            print(f"📄 Processing page {current_page}")

-            # Collect job links on current page
            new_jobs = await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links)
            print(f" ➕ Found {new_jobs} new job(s) on page {current_page} (total: {len(all_job_links)})")

-            # Try to go to next page
            next_btn = await page.query_selector("button[aria-label='Next']")
            if next_btn and await next_btn.is_enabled():
                await self._human_click(page, next_btn)
                await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed)
-                # Wait for URL to change or new content
                try:
                    await page.wait_for_function("() => window.location.href.includes('start=')", timeout=60000)
                except:
@@ -211,7 +181,6 @@ class LinkedInJobScraper:
                break

    async def _handle_infinite_scroll(self, page, search_keywords: str, seen_job_ids, all_job_links):
-        """Handle infinite scroll to load more jobs"""
        last_height = await page.evaluate("document.body.scrollHeight")
        no_new_jobs_count = 0
        max_no_new = 3
@@ -221,7 +190,6 @@ class LinkedInJobScraper:
            await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)

            new_jobs_found = await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links)
-
            print(f" ➕ Found {new_jobs_found} new job(s) (total: {len(all_job_links)})")

            new_height = await page.evaluate("document.body.scrollHeight")
@@ -241,15 +209,11 @@ class LinkedInJobScraper:
        max_pages: int = 1,
        credentials: Optional[Dict] = None
    ):
-        # Parse location from keywords if present
        location_match = re.search(r'location:\s*([^,]+)', search_keywords, re.IGNORECASE)
        location = location_match.group(1).strip() if location_match else ""

-        # Remove location part from keywords for search
        clean_keywords = re.sub(r'location:\s*[^,]+', '', search_keywords, flags=re.IGNORECASE).strip()
        encoded_keywords = clean_keywords.replace(" ", "%20")

-        # Build search URL with location if specified
        search_url = f"https://www.linkedin.com/jobs/search/?keywords={encoded_keywords}"
        if location:
            search_url += f"&location={location.replace(' ', '%20')}"
@@ -261,7 +225,7 @@ class LinkedInJobScraper:

        async with async_playwright() as pw:
            browser = await pw.chromium.launch(
-                headless= False,
+                headless=False,
                args=['--disable-blink-features=AutomationControlled']
            )
            context = await AsyncNewContext(browser, fingerprint=profile)
@@ -275,6 +239,9 @@ class LinkedInJobScraper:

            page = await context.new_page()

+            # Create a temporary fetcher for protection checks on main page
+            temp_fetcher = StealthyFetcher(self.engine, browser, context)
+
            session_loaded = await self.engine.load_session(context)
            login_successful = False

@@ -301,45 +268,63 @@ class LinkedInJobScraper:
            elif not credentials:
                print("ℹ️ No credentials — proceeding as guest.")
                login_successful = True
-            else:
-                pass

            await page.wait_for_load_state("load", timeout=60000)
            print("✅ Post-login page fully loaded. Starting search...")

-            if await self.engine._detect_cloudflare(page):
-                print("☁️ Cloudflare detected on initial load.")
-                if not await self.engine._handle_cloudflare(page):
-                    print("❌ Cloudflare could not be resolved.")
-                    await browser.close()
-                    self.engine.report_outcome("cloudflare")
-                    return
+            # >>> PROTECTION CHECK USING FETCHER LOGIC <<<
+            protection_type = await temp_fetcher._detect_protection(page)
+            if protection_type:
+                print(f"🛡️ Protection detected on initial page: {protection_type}")
+                content_accessible = await temp_fetcher._is_content_accessible(page)
+                if not content_accessible:
+                    print("🔒 Content not accessible.")
+                    handled = False
+                    if protection_type == "cloudflare":
+                        handled = await self.engine._handle_cloudflare(page)
+                    elif protection_type == "captcha":
+                        handled = False
+                    if not handled:
+                        await browser.close()
+                        self.engine.report_outcome("protection_block")
+                        return
+                else:
+                    print("✅ Protection present but content accessible — proceeding.")

            print(f"🔍 Searching for: {search_keywords}")
            await page.goto(search_url, wait_until='load', timeout=60000)
            await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed)

-            if await self.engine._detect_cloudflare(page):
-                print("☁️ Cloudflare detected on search page.")
-                if not await self.engine._handle_cloudflare(page):
-                    await browser.close()
-                    self.engine.report_outcome("cloudflare")
-                    return
+            # >>> PROTECTION CHECK ON SEARCH PAGE <<<
+            protection_type = await temp_fetcher._detect_protection(page)
+            if protection_type:
+                print(f"🛡️ Protection detected on search page: {protection_type}")
+                content_accessible = await temp_fetcher._is_content_accessible(page)
+                if not content_accessible:
+                    print("🔒 Content not accessible.")
+                    handled = False
+                    if protection_type == "cloudflare":
+                        handled = await self.engine._handle_cloudflare(page)
+                    elif protection_type == "captcha":
+                        handled = False
+                    if not handled:
+                        await browser.close()
+                        self.engine.report_outcome("protection_block")
+                        return
+                else:
+                    print("✅ Protection present but content accessible — proceeding.")

            all_job_links = []
            seen_job_ids = set()

-            # First, scrape the initial page
            print("🔄 Collecting initial job links...")
            initial_jobs = await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links)
            print(f" ➕ Found {initial_jobs} initial job(s) (total: {len(all_job_links)})")

-            # Loop until no new jobs are found
            iteration = 1
            while True:
                print(f"🔄 Iteration {iteration}: Checking for new jobs...")

-                # First try infinite scroll
                prev_job_count = len(all_job_links)
                await self._handle_infinite_scroll(page, search_keywords, seen_job_ids, all_job_links)
                new_jobs_count = len(all_job_links) - prev_job_count
@@ -347,72 +332,47 @@ class LinkedInJobScraper:
                if new_jobs_count > 0:
                    print(f" ➕ Found {new_jobs_count} new jobs via infinite scroll")
                    iteration += 1
-                    continue  # Continue with infinite scroll if new jobs found
+                    continue

-                # If no new jobs via scroll, check for pagination
                pagination_exists = await page.query_selector("button[aria-label='Next']")

                if pagination_exists:
                    print("⏭️ Pagination detected. Processing pages...")
                    await self._handle_pagination(page, search_keywords, seen_job_ids, all_job_links)
                    iteration += 1
-                    continue  # Continue with pagination if new jobs found
+                    continue
                else:
-                    # If no pagination and no new jobs from scroll, check by refreshing
                    print("🔄 Refreshing page to check for new results...")
                    await page.reload(wait_until='load')
                    await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)

-                    # Check for new jobs after refresh
                    new_jobs_after_refresh = await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links)
                    if new_jobs_after_refresh > 0:
                        print(f" ➕ Found {new_jobs_after_refresh} new job(s) after refresh")
                        iteration += 1
-                        continue  # Continue if new jobs found after refresh
+                        continue
                    else:
                        print("🔚 No new jobs found after refresh. Stopping.")
                        break

-                # Limit iterations to prevent infinite loops
                if iteration > 10:
                    print("🔄 Maximum iterations reached. Stopping.")
                    break

            print(f"✅ Collected {len(all_job_links)} unique job links.")

-            # Process all collected job links
            scraped_count = 0
            for idx, (href, title) in enumerate(all_job_links):
                try:
                    full_url = href if href.startswith("http") else f"https://www.linkedin.com{href}"
                    print(f" → Opening job {idx+1}/{len(all_job_links)}: {full_url}")
-                    await page.goto(full_url, wait_until='load', timeout=60000)
-                    await asyncio.sleep(3 * self.human_speed)
-
-                    is_cloudflare = await self.engine._detect_cloudflare(page)
-                    page_content = await page.content()
-                    has_captcha_text = "captcha" in page_content.lower()
-                    captcha_present = is_cloudflare or has_captcha_text
-
-                    title_element = await page.query_selector("h1.t-24")
-                    job_data_accessible = title_element is not None
-
-                    if captcha_present:
-                        if job_data_accessible:
-                            print(" ⚠️ CAPTCHA detected, but job data is accessible. Proceeding in stealth mode...")
-                            await self.engine._avoid_captcha(page)
-                        else:
-                            print(" ⚠️ CAPTCHA detected and job data blocked. Attempting recovery...")
-                            if not await self.engine._solve_captcha_fallback(page):
-                                print(" ❌ CAPTCHA recovery failed. Skipping job.")
-                                continue
-                            title_element = await page.query_selector("h1.t-24")
-                            if not title_element:
-                                print(" ❌ Job data still unavailable after CAPTCHA handling. Skipping.")
-                                continue
-
-                    if not captcha_present:
-                        await self.engine._avoid_captcha(page)
+                    fetcher = StealthyFetcher(self.engine, browser, context)
+                    job_page = await fetcher.fetch_url(full_url, wait_for_selector="h1.t-24")
+                    if not job_page:
+                        print(f" ❌ Failed to fetch job page {full_url} after retries.")
+                        self.engine.report_outcome("fetch_failure", url=full_url)
+                        continue

                    apply_btn = None
                    apply_selectors = [
@@ -422,18 +382,18 @@ class LinkedInJobScraper:
                        "button:has-text('Easy Apply')"
                    ]
                    for selector in apply_selectors:
-                        apply_btn = await page.query_selector(selector)
+                        apply_btn = await job_page.query_selector(selector)
                        if apply_btn:
                            break

                    page_data = None
-                    final_url = full_url
+                    final_url = job_page.url

                    if apply_btn:
                        print(" → Clicking 'Apply' / 'Easy Apply' button...")

                        page_waiter = asyncio.create_task(context.wait_for_event("page"))
-                        await self._human_click(page, apply_btn, wait_after=False)
+                        await self._human_click(job_page, apply_btn, wait_after=False)

                        external_page = None
                        try:
@@ -451,48 +411,46 @@ class LinkedInJobScraper:
                            await external_page.close()

                        except asyncio.TimeoutError:
-                            print(" 🖥️ No external tab — scraping LinkedIn job page.")
-                            await page.wait_for_timeout(2000)
+                            print(" 🖥️ No external tab — scraping LinkedIn job page directly.")
+                            await job_page.wait_for_timeout(2000)
                            try:
-                                await page.wait_for_selector("div.jobs-apply-button--fixed, div.jobs-easy-apply-modal", timeout=8000)
-                            except:
+                                await job_page.wait_for_selector("div.jobs-apply-button--fixed, div.jobs-easy-apply-modal", timeout=8000)
+                            except PlaywrightTimeoutError:
                                pass
-                            await self.engine._human_like_scroll(page)
+                            await self.engine._human_like_scroll(job_page)
                            await asyncio.sleep(2 * self.human_speed)
-                            page_data = await self._extract_all_page_content(page)
-                            final_url = page.url
+                            page_data = await self._extract_all_page_content(job_page)
                    else:
                        print(" ⚠️ No 'Apply' button found — scraping job details directly.")
-                        await self.engine._human_like_scroll(page)
+                        await self.engine._human_like_scroll(job_page)
                        await asyncio.sleep(2 * self.human_speed)
-                        page_data = await self._extract_all_page_content(page)
-                        final_url = page.url
+                        page_data = await self._extract_all_page_content(job_page)

-                    # Extract job ID from URL
                    job_id = final_url.split("/")[-2] if "/jobs/view/" in final_url else "unknown"

-                    # Prepare raw data for LLM processing
                    raw_data = {
                        "page_content": page_data,
-                        "url": final_url,
-                        "job_id": job_id
+                        "url": job_page.url,
+                        "job_id": job_page.url.split("/")[-2] if "/jobs/view/" in job_page.url else "unknown"
                    }

-                    # Send raw data to LLM agent for refinement
-                    refined_data = await self.llm_agent.refine_job_data(raw_data, search_keywords)
+                    refined_data = await self.llm_agent.refine_job_data(raw_data, self.user_request)

-                    # Only save if LLM successfully extracted meaningful data
                    if refined_data and refined_data.get("title", "N/A") != "N/A":
-                        # Save refined data to markdown and database through LLM agent
                        await self.llm_agent.save_job_data(refined_data, search_keywords)

                        scraped_count += 1
                        print(f" ✅ Scraped and refined: {refined_data['title'][:50]}...")
+                        self.engine.report_outcome("success", url=raw_data["url"])
                    else:
                        print(f" 🟡 Could not extract meaningful data from: {final_url}")
+                        self.engine.report_outcome("llm_failure", url=raw_data["url"])

+                    await job_page.close()

                except Exception as e:
                    print(f" ⚠️ Failed on job {idx+1}: {str(e)[:100]}")
+                    if 'job_page' in locals() and job_page:
+                        await job_page.close()
                    continue

            finally:
@@ -504,7 +462,7 @@ class LinkedInJobScraper:

            if scraped_count > 0:
                self.engine.report_outcome("success")
-                print(f"✅ Completed! Processed {scraped_count} jobs for '{search_keywords}'.")
+                print(f"✅ Completed! Processed {scraped_count} jobs for '{search_keywords}' based on request '{self.user_request}'.")
            else:
                self.engine.report_outcome("captcha")
                print("⚠️ No jobs processed successfully.")
@@ -8,16 +8,17 @@ import asyncio
 # Load environment variables
 load_dotenv()


 async def main():
     engine = FingerprintScrapingEngine(
-        seed="job_scraping_engine",
+        seed="job_scraping_123",
         target_os="windows",
         db_path="job_listings.db",
         markdown_path="job_listings.md"
     )

     # Initialize scraper with target field
-    scraper = LinkedInJobScraper(engine, human_speed=1.6, target_field="Web designer")
+    scraper = LinkedInJobScraper(engine, human_speed=1.6, user_request="Extract title, company, location, description, requirements, qualifications, nature of job(remote, onsite, hybrid) and salary")

     await scraper.scrape_jobs(
         search_keywords="Web Designer location:New York",
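The hunk above ends inside the scrape_jobs call; the remaining arguments and the script runner are not shown in this diff. A hedged sketch of the usual entry point, assuming nothing beyond the main() coroutine defined above:

# Assumed entry point; not part of the displayed hunk.
if __name__ == "__main__":
    asyncio.run(main())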
llm_agent.py (236 lines changed)
@@ -1,166 +1,170 @@
-import google.generativeai as genai
-from typing import Dict, Any
+from openai import OpenAI
+from typing import Dict, Any, Optional
 import asyncio
 import sqlite3
 import os
 from datetime import datetime
-from config import GEMINI_API_KEY
+import json
+import re
+from dotenv import load_dotenv

+# ✅ Actually load .env
+load_dotenv()

 class LLMJobRefiner:
     def __init__(self):
-        genai.configure(api_key=GEMINI_API_KEY)
-        self.model = genai.GenerativeModel('gemini-latest-flash')
+        xai_api_key = os.getenv("XAI_API_KEY")
+        if not xai_api_key:
+            raise ValueError("XAI_API_KEY not found in environment variables.")
+
+        self.client = OpenAI(api_key=xai_api_key, base_url="https://api.x.ai/v1")
+        self.model = "grok-4-latest"
+        self.extraction_schema_cache = {}
+
+    def generate_content(self, prompt: str, system_message: str = "You are a helpful assistant.") -> str:
+        """Synchronous method to call Grok via xAI API."""
+        try:
+            response = self.client.chat.completions.create(
+                model=self.model,
+                messages=[
+                    {"role": "system", "content": system_message},
+                    {"role": "user", "content": prompt}
+                ],
+                temperature=0.2,
+                max_tokens=2048,
+                stream=False
+            )
+            return response.choices[0].message.content or ""
+        except Exception as e:
+            print(f"Error in Grok API call: {e}")
+            return ""
+
+    async def refine_job_data(self, raw_data: Dict[str, Any], user_request: str) -> Optional[Dict[str, Any]]:
+        page_content = raw_data.get('page_content', '')
+        if not page_content:
+            return None
+
+        schema_key = user_request.lower().strip()
+        extraction_schema = self.extraction_schema_cache.get(schema_key)
+        if not extraction_schema:
+            extraction_schema = await self._generate_extraction_schema(user_request)
+            if extraction_schema:
+                self.extraction_schema_cache[schema_key] = extraction_schema
+            else:
+                extraction_schema = self._get_default_schema()

-    async def refine_job_data(self, raw_data: Dict[str, Any], target_field: str) -> Dict[str, Any]:
-        """
-        Refine raw job data using Gemini LLM based on target field
-        """
         prompt = f"""
-You are a job data extraction assistant. Extract the following fields from the job posting:
-- title
-- company_name
-- location
-- description
-- requirements
-- qualifications
-- salary_range
-- nature_of_work (remote, onsite, or hybrid)
-- job_id
-
-Target Field: {target_field}
-Raw Page Content:
-{raw_data.get('page_content', '')[:6000]}  # Limit content size
-
-Instructions:
-1. Extract only the information relevant to the target field: {target_field}
-2. Clean up any formatting issues in the description
-3. Standardize location format (city, state/country)
-4. Extract salary range if mentioned in description
-5. Determine nature of work (remote, onsite, or hybrid) from work arrangements
-6. Ensure all fields are properly formatted
-7. If a field cannot be found, use "N/A"
-8. Return the refined data in JSON format
-
-Response format (only return the JSON):
-{{
-    "title": "...",
-    "company_name": "...",
-    "location": "...",
-    "description": "...",
-    "requirements": "...",
-    "qualifications": "...",
-    "salary_range": "...",
-    "nature_of_work": "...",
-    "job_id": "{raw_data.get('job_id', 'unknown')}",
-    "url": "{raw_data.get('url', 'N/A')}"
-}}
+You are a highly skilled web data extraction assistant. Your task is to analyze the raw HTML content of a job posting page and extract specific information requested by the user.
+The user's request is: "{user_request}"
+The raw HTML content of the page is provided below (limited in size). The content might be noisy or unstructured.
+Your goal is to:
+1. Analyze the HTML structure to identify relevant sections.
+2. Extract the requested information accurately.
+3. Clean up formatting issues.
+4. If a field cannot be found, use "N/A".
+5. Return ONLY the extracted data in a JSON object based on the following schema:
+{json.dumps(extraction_schema, indent=2)}
+Raw Page Content (HTML):
+{page_content[:6000]}
+
+Respond with the JSON object containing the extracted data.
         """

         try:
-            response = await asyncio.get_event_loop().run_in_executor(
+            # ✅ Use self (current instance), NOT a new LLMJobRefiner()
+            response_text = await asyncio.get_event_loop().run_in_executor(
                 None,
-                lambda: self.model.generate_content(prompt)
+                lambda: self.generate_content(prompt)
             )
-
-            # Parse the response and return refined data
-            refined_data = self._parse_llm_response(response.text)
-
-            # If parsing fails, return None
+            refined_data = self._parse_llm_response(response_text)
             if not refined_data:
                 return None

+            refined_data['job_id'] = raw_data.get('job_id', 'unknown')
+            refined_data['url'] = raw_data.get('url', 'N/A')
             return refined_data

         except Exception as e:
             print(f"LLM refinement failed: {str(e)}")
             return None

-    def _parse_llm_response(self, response_text: str) -> Dict[str, Any]:
-        """
-        Parse the LLM response to extract refined job data
-        """
-        import json
-        import re
-
-        # Extract JSON from response (handle markdown code blocks)
+    async def _generate_extraction_schema(self, user_request: str) -> Optional[Dict[str, str]]:
+        schema_prompt = f"""
+Based on the user's request: "{user_request}", generate a JSON schema for the data they want to extract from a job posting.
+The schema should be a dictionary where keys are field names (snake_case) and values are short descriptions.
+Include standard fields like title, company_name, location, description, etc., if relevant.
+Respond with only the JSON schema.
+        """
+        try:
+            # ✅ Use self.generate_content, NOT self.model.generate_content
+            schema_text = await asyncio.get_event_loop().run_in_executor(
+                None,
+                lambda: self.generate_content(schema_prompt)
+            )
+            json_match = re.search(r'```(?:json)?\s*({.*?})\s*```', schema_text, re.DOTALL)
+            if not json_match:
+                json_match = re.search(r'\{.*\}', schema_text, re.DOTALL)
+                if not json_match:
+                    return None
+
+            json_str = json_match.group(1) if '```' in schema_text else json_match.group(0)
+            return json.loads(json_str)
+        except Exception as e:
+            print(f"Schema generation failed: {str(e)}")
+            return None
+
+    def _get_default_schema(self) -> Dict[str, str]:
+        return {
+            "title": "The job title",
+            "company_name": "The name of the company",
+            "location": "The location of the job",
+            "description": "The full job description",
+            "requirements": "List of job requirements",
+            "qualifications": "List of required qualifications",
+            "salary_range": "The salary range mentioned",
+            "nature_of_work": "Remote, onsite, or hybrid"
+        }
+
+    def _parse_llm_response(self, response_text: str) -> Optional[Dict[str, Any]]:
         json_match = re.search(r'```(?:json)?\s*({.*?})\s*```', response_text, re.DOTALL)
-        if json_match:
-            json_str = json_match.group(1)
-        else:
-            # If no code block, try to find JSON directly
+        if not json_match:
             json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
-            if json_match:
-                json_str = json_match.group(0)
-            else:
+            if not json_match:
                 return None

         try:
-            return json.loads(json_str)
+            return json.loads(json_match.group(1) if '```' in response_text else json_match.group(0))
         except json.JSONDecodeError:
             return None

     async def save_job_data(self, job_data: Dict[str, Any], keyword: str):
-        """
-        Save job data to both markdown and database
-        """
-        # Save to database
         await self._save_to_db(job_data)

-        # Save to markdown
         await self._save_to_markdown(job_data, keyword)

     async def _save_to_db(self, job_data: Dict[str, Any]):
-        """
-        Save job data to database
-        """
         db_path = "linkedin_jobs.db"
-        os.makedirs(os.path.dirname(db_path) if os.path.dirname(db_path) else ".", exist_ok=True)
+        os.makedirs(os.path.dirname(db_path) or ".", exist_ok=True)

         with sqlite3.connect(db_path) as conn:
             cursor = conn.cursor()
-            cursor.execute('''
-                INSERT OR IGNORE INTO jobs
-                (title, company_name, location, description, requirements,
-                 qualifications, salary_range, nature_of_work, job_id, url)
-                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
-            ''', (
-                job_data.get("title", "N/A"),
-                job_data.get("company_name", "N/A"),
-                job_data.get("location", "N/A"),
-                job_data.get("description", "N/A"),
-                job_data.get("requirements", "N/A"),
-                job_data.get("qualifications", "N/A"),
-                job_data.get("salary_range", "N/A"),
-                job_data.get("nature_of_work", "N/A"),
-                job_data.get("job_id", "N/A"),
-                job_data.get("url", "N/A")
-            ))
+            fields = list(job_data.keys())
+            placeholders = ', '.join(['?' for _ in fields])
+            columns = ', '.join([f'"{col}"' for col in fields])  # Escape column names
+            cursor.execute(f"CREATE TABLE IF NOT EXISTS jobs ({columns})")
+            cursor.execute(f'INSERT INTO jobs ({columns}) VALUES ({placeholders})',
+                           [job_data.get(field, 'N/A') for field in fields])
             conn.commit()

     async def _save_to_markdown(self, job_data: Dict[str, Any], keyword: str):
-        """
-        Save job data to markdown file
-        """
         os.makedirs("linkedin_jobs", exist_ok=True)
-        # Create a single markdown file for all jobs
-        filename = "linkedin_jobs_scraped.md"
-        filepath = os.path.join("linkedin_jobs", filename)
+        filepath = os.path.join("linkedin_jobs", "linkedin_jobs_scraped.md")
+        write_header = not os.path.exists(filepath) or os.path.getsize(filepath) == 0

         with open(filepath, "a", encoding="utf-8") as f:
-            # Only write header if file is empty
-            if os.path.getsize(filepath) == 0:
+            if write_header:
                 f.write(f"# LinkedIn Jobs - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")

             f.write(f"## Job: {job_data.get('title', 'N/A')}\n\n")
             f.write(f"- **Keyword**: {keyword}\n")
-            f.write(f"- **Company**: {job_data.get('company_name', 'N/A')}\n")
-            f.write(f"- **Location**: {job_data.get('location', 'N/A')}\n")
-            f.write(f"- **Nature of Work**: {job_data.get('nature_of_work', 'N/A')}\n")
-            f.write(f"- **Salary Range**: {job_data.get('salary_range', 'N/A')}\n")
-            f.write(f"- **Job ID**: {job_data.get('job_id', 'N/A')}\n")
-            f.write(f"- **URL**: <{job_data.get('url', 'N/A')}>\n\n")
-            f.write(f"### Description\n\n{job_data.get('description', 'N/A')}\n\n")
-            f.write(f"### Requirements\n\n{job_data.get('requirements', 'N/A')}\n\n")
-            f.write(f"### Qualifications\n\n{job_data.get('qualifications', 'N/A')}\n\n")
-            f.write("---\n\n")
+            for key, value in job_data.items():
+                if key != 'title':
+                    f.write(f"- **{key.replace('_', ' ').title()}**: {value}\n")
+            f.write("\n---\n\n")
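A hedged illustration of the fenced-JSON parsing that _parse_llm_response performs. The sample model output is invented for the example; constructing LLMJobRefiner requires XAI_API_KEY in the environment.

# Hypothetical model output wrapped in a markdown code fence
sample = 'Here you go:\n```json\n{"title": "Web Designer", "location": "New York"}\n```'
refiner = LLMJobRefiner()  # needs XAI_API_KEY set in .env or the environment
data = refiner._parse_llm_response(sample)
# -> {'title': 'Web Designer', 'location': 'New York'}; returns None when no JSON is found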
@@ -6,10 +6,12 @@ import hashlib
 import random
 import os
 import json
-from typing import List, Optional, Dict
+from playwright.async_api import Page
+from typing import List, Optional, Dict, Any
 from browserforge.fingerprints import FingerprintGenerator
 from dotenv import load_dotenv
 from config import load_spoof_config
+import time

 # Load environment variables
 load_dotenv()
@@ -53,16 +55,28 @@ class FingerprintScrapingEngine:
         self.common_renderers = spoof_config["renderers"]
         self.common_vendors = spoof_config["vendors"]

-        # Feedback system
         self.feedback_file = f"feedback_{seed}.json"

+        # Feedback system
         self.feedback = self._load_feedback()

         # ← NEW: Session persistence paths
         self.session_dir = "browser_sessions"
         os.makedirs(self.session_dir, exist_ok=True)
-        self.session_path = os.path.join(self.session_dir, f"{seed}_session.json")
+        self.session_path = os.path.join(
+            self.session_dir, f"{seed}_session.json")

-    def _load_feedback(self):
+        self.optimization_params = {
+            "base_delay": 2.0,
+            "max_concurrent_requests": 4,
+            "request_timeout": 60000,
+            "retry_attempts": 3,
+            "captcha_handling_strategy": "avoid",  # or "solve_fallback"
+            "cloudflare_wait_strategy": "smart_wait",  # or "aggressive_reload"
+        }
+        self._update_params_from_feedback()
+
+    def _load_feedback(self) -> Dict[str, Any]:
         if os.path.exists(self.feedback_file):
             try:
                 with open(self.feedback_file, "r") as f:
@@ -70,6 +84,8 @@ class FingerprintScrapingEngine:
                    data.setdefault("success_rate", 1.0)
                    data.setdefault("captcha_count", 0)
                    data.setdefault("cloudflare_count", 0)
+                    data.setdefault("avg_response_time", 10.0)  # New metric
+                    data.setdefault("failed_domains", {})  # New metric
                    return data
            except:
                pass
@@ -79,16 +95,69 @@ class FingerprintScrapingEngine:
        with open(self.feedback_file, "w") as f:
            json.dump(self.feedback, f)

-    def report_outcome(self, outcome: str):
+    def report_outcome(self, outcome: str, url: Optional[str] = None, response_time: Optional[float] = None):
        if outcome == "success":
-            self.feedback["success_rate"] = min(1.0, self.feedback["success_rate"] + 0.1)
+            self.feedback["success_rate"] = min(
+                1.0, self.feedback["success_rate"] + 0.05)  # Smaller increment
        else:
-            self.feedback["success_rate"] = max(0.1, self.feedback["success_rate"] - 0.2)
+            self.feedback["success_rate"] = max(
+                0.05, self.feedback["success_rate"] - 0.1)  # Smaller decrement

        if outcome == "captcha":
            self.feedback["captcha_count"] += 1
+            # Adapt strategy if many captchas
+            self.optimization_params["captcha_handling_strategy"] = "solve_fallback"
        elif outcome == "cloudflare":
            self.feedback["cloudflare_count"] += 1
+            # Adjust wait strategy based on frequency
+            if self.feedback["cloudflare_count"] > 5:
+                self.optimization_params["cloudflare_wait_strategy"] = "aggressive_reload"
+
+        # Track domain-specific failures
+        if url and outcome != "success":
+            domain = url.split("//")[1].split("/")[0]
+            if domain not in self.feedback["failed_domains"]:
+                self.feedback["failed_domains"][domain] = 0
+            self.feedback["failed_domains"][domain] += 1
+
+        # Update average response time
+        if response_time:
+            prev_avg = self.feedback.get("avg_response_time", 10.0)
+            # Simple moving average
+            self.feedback["avg_response_time"] = (
+                prev_avg * 0.9) + (response_time * 0.1)

        self.save_feedback()
+        self._update_params_from_feedback()  # Update params based on new feedback
+
+    def _update_params_from_feedback(self):
+        """Adjust optimization parameters based on feedback."""
+        sr = self.feedback["success_rate"]
+        cc = self.feedback["captcha_count"]
+        cf = self.feedback["cloudflare_count"]
+        avg_rt = self.feedback.get("avg_response_time", 10.0)
+
+        # Adjust base delay based on success rate and avg response time
+        if sr < 0.6:
+            self.optimization_params["base_delay"] = max(
+                5.0, self.optimization_params["base_delay"] * 1.2)
+        elif sr > 0.8:
+            self.optimization_params["base_delay"] = min(
+                3.0, self.optimization_params["base_delay"] * 0.9)
+
+        # Reduce concurrency if many captchas/cloudflares
+        if cc > 3 or cf > 3:
+            self.optimization_params["max_concurrent_requests"] = max(
+                2, self.optimization_params["max_concurrent_requests"] - 2)
+        else:
+            # Reset to default
+            self.optimization_params["max_concurrent_requests"] = 4
+
+        # Increase timeout if avg response time is high
+        if avg_rt > 20:
+            self.optimization_params["request_timeout"] = 90000  # 90 seconds
+
+        print(f"Optimization Params Updated: {self.optimization_params}")

    # ← NEW: Save browser context (cookies + localStorage)
    async def save_session(self, context):
@@ -129,7 +198,8 @@ class FingerprintScrapingEngine:
        if self.feedback["success_rate"] < 0.5:
            concurrency_options = [8, 4]
            memory_options = [8]
-        profile.navigator.hardwareConcurrency = random.choice(concurrency_options)
+        profile.navigator.hardwareConcurrency = random.choice(
+            concurrency_options)
        profile.navigator.deviceMemory = random.choice(memory_options)
        return profile

@@ -245,23 +315,6 @@ class FingerprintScrapingEngine:
        except:
            pass

-    async def _detect_cloudflare(self, page) -> bool:
-        content = await page.content()
-        return (
-            "#cf-chl" in content or
-            "checking your browser" in content.lower() or
-            "just a moment" in content.lower()
-        )
-
-    async def _handle_cloudflare(self, page, max_retries: int = 3):
-        for i in range(max_retries):
-            if not await self._detect_cloudflare(page):
-                return True
-            print(f"☁️ Cloudflare detected - waiting... (attempt {i+1})")
-            await asyncio.sleep(8 + random.uniform(2, 5))
-            await page.wait_for_load_state("load", timeout=60000)
-        return False
-
    async def _avoid_captcha(self, page) -> bool:
        await asyncio.sleep(2 + random.random() * 3)
        await self._human_like_scroll(page)
@@ -283,3 +336,42 @@ class FingerprintScrapingEngine:
            return True

        return False
+
+    async def _detect_cloudflare(self, page: Page) -> bool:
+        """Detect Cloudflare challenges."""
+        content = await page.content()
+        return (
+            "#cf-chl" in content
+            or "checking your browser" in content.lower()
+            or "just a moment" in content.lower()
+            or "turnstile" in content.lower()  # Check for Cloudflare Turnstile
+        )
+
+    async def _handle_cloudflare(self, page: Page) -> bool:
+        """
+        Handle Cloudflare challenges, including Turnstile if present.
+        This is a simplified approach; real-world handling might require more sophisticated logic or external solvers.
+        """
+        max_wait_time = 60  # Total time to wait for Cloudflare to resolve
+        start_time = time.time()
+
+        while time.time() - start_time < max_wait_time:
+            if not await self._detect_cloudflare(page):
+                print("Cloudflare challenge resolved.")
+                return True
+
+            print("Cloudflare active, waiting...")
+            # Simulate more human-like behavior while waiting
+            await self._simulate_human_interaction(page)
+            # Wait for a random period, increasing slightly each time
+            wait_time = min(10, 2 + random.uniform(1, 3) +
+                            (time.time() - start_time) * 0.1)
+            await asyncio.sleep(wait_time)
+
+            # Reload occasionally to trigger potential client-side checks
+            if (time.time() - start_time) > 15 and (time.time() - start_time) % 20 < 2:
+                print("Reloading page during Cloudflare wait...")
+                await page.reload(wait_until='load', timeout=30000)
+
+        print("Timeout waiting for Cloudflare resolution.")
+        return False
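A short worked example of the feedback update added above. The engine instance, URL, and timing values are illustrative; the 0.9/0.1 weights and the 20s threshold come from the code.

# Suppose avg_response_time is 10.0s and a fetch just took 30.0s.
# report_outcome updates the average as 10.0 * 0.9 + 30.0 * 0.1 = 12.0;
# request_timeout is only raised to 90s once the average crosses the 20s
# threshold checked in _update_params_from_feedback().
engine.report_outcome("success", url="https://www.linkedin.com/jobs/view/123/", response_time=30.0)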