Modifications to work with PostgreSQL and use an LLM to extract and refine data

This commit is contained in:
Ofure Ikheloa 2025-12-05 17:00:43 +01:00
parent 4f78a845ae
commit 160efadbfb
5 changed files with 275 additions and 102 deletions
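At a high level, the commit replaces the per-class SQLite bookkeeping with a single PostgreSQL-backed `LLMJobRefiner` that both extracts and stores job data. A minimal sketch of the new flow, assuming the class and method names from the diff below (the wrapper function and its arguments are illustrative, not part of the commit):

```python
# Sketch of the new pipeline: raw page content -> LLM extraction -> PostgreSQL.
import asyncio
from llm_agent import LLMJobRefiner

async def process_job(page_html: str, url: str, job_id: str, search_keywords: str) -> None:
    raw_data = {
        "page_content": page_html,        # raw HTML/text scraped from the job page
        "url": url,
        "job_id": job_id,
        "search_keywords": search_keywords,
    }
    refiner = LLMJobRefiner()             # reads DEEPSEEK_API_KEY and DB_* from .env
    refined = await refiner.refine_job_data(raw_data, "Extract title, company, location, ...")
    if refined:                           # None means required fields could not be extracted
        await refiner.save_job_data(refined, search_keywords)  # INSERT ... ON CONFLICT DO NOTHING

# asyncio.run(process_job(html, url, "1234567890", "Software Engineer location:New York"))
```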

View File

@@ -23,11 +23,11 @@ class StealthyFetcher:
print(f"Attempt {attempt + 1} to fetch {url}")
page = await self.context.new_page()
- await page.goto(url, wait_until='load', timeout=60000)
+ await page.goto(url, wait_until='load', timeout=120000)
if wait_for_selector:
try:
- await page.wait_for_selector(wait_for_selector, timeout=10000)
+ await page.wait_for_selector(wait_for_selector, timeout=40000)
except PlaywrightTimeoutError:
print(f"Selector {wait_for_selector} not found immediately, continuing...")
@@ -88,7 +88,7 @@ class StealthyFetcher:
async def _is_content_accessible(self, page: Page, wait_for_selector: Optional[str] = None) -> bool:
if wait_for_selector:
try:
- await page.wait_for_selector(wait_for_selector, timeout=5000)
+ await page.wait_for_selector(wait_for_selector, timeout=40000)
return True
except PlaywrightTimeoutError:
pass
@@ -118,7 +118,7 @@ class StealthyFetcher:
if (time.time() - start_time) > 15 and (time.time() - start_time) % 20 < 2:
print("🔄 Reloading page during Cloudflare wait...")
- await page.reload(wait_until='load', timeout=30000)
+ await page.reload(wait_until='load', timeout=120000)
print("⏰ Timeout waiting for Cloudflare resolution.")
return False

View File

@@ -1,13 +1,12 @@
import asyncio
import random
- import sqlite3
- import os
from typing import Optional, Dict
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
from browserforge.injectors.playwright import AsyncNewContext
from llm_agent import LLMJobRefiner
import re
from fetcher import StealthyFetcher
+ from datetime import datetime
class LinkedInJobScraper:
@@ -26,25 +25,8 @@ class LinkedInJobScraper:
self.llm_agent = LLMJobRefiner()
def _init_db(self):
- os.makedirs(os.path.dirname(self.db_path) if os.path.dirname(self.db_path) else ".", exist_ok=True)
- with sqlite3.connect(self.db_path) as conn:
- cursor = conn.cursor()
- cursor.execute('''
- CREATE TABLE IF NOT EXISTS jobs (
- id INTEGER PRIMARY KEY AUTOINCREMENT,
- title TEXT,
- company_name TEXT,
- location TEXT,
- description TEXT,
- requirements TEXT,
- qualifications TEXT,
- salary_range TEXT,
- nature_of_work TEXT,
- job_id TEXT,
- url TEXT UNIQUE
- )
- ''')
- conn.commit()
+ # This method is kept for backward compatibility but LLMJobRefiner handles PostgreSQL now
+ pass
async def _human_click(self, page, element, wait_after: bool = True):
if not element:
@@ -61,7 +43,7 @@ class LinkedInJobScraper:
async def _login(self, page, credentials: Dict) -> bool:
print("🔐 Navigating to LinkedIn login page...")
- await page.goto("https://www.linkedin.com/login", timeout=60000)
+ await page.goto("https://www.linkedin.com/login", timeout=120000)
await asyncio.sleep(random.uniform(2.0, 3.5) * self.human_speed)
email_field = await page.query_selector('input[name="session_key"]')
@@ -104,7 +86,11 @@ class LinkedInJobScraper:
print("❌ Login may have failed.")
return False
- async def _extract_all_page_content(self, page) -> str:
+ async def _extract_page_content_for_llm(self, page) -> str:
+ """
+ Extract raw page content as HTML/text for LLM processing
+ The LLM will handle all extraction logic, not specific selectors
+ """
await asyncio.sleep(2 * self.human_speed)
await self.engine._human_like_scroll(page)
await asyncio.sleep(2 * self.human_speed)
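The body of `_extract_page_content_for_llm` is not shown in this hunk. As a rough illustration only (not the committed implementation), a selector-free version could simply return the rendered HTML and leave all interpretation to `LLMJobRefiner._clean_html_for_llm` and the model:

```python
# Hypothetical sketch only — the committed method body is outside this hunk.
async def _extract_page_content_for_llm(self, page) -> str:
    await page.wait_for_load_state("load")
    return await page.content()  # full rendered HTML; no job-specific selectors
```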
@@ -172,7 +158,7 @@ class LinkedInJobScraper:
await self._human_click(page, next_btn)
await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed)
try:
- await page.wait_for_function("() => window.location.href.includes('start=')", timeout=60000)
+ await page.wait_for_function("() => window.location.href.includes('start=')", timeout=120000)
except:
pass
current_page += 1
@@ -247,7 +233,7 @@ class LinkedInJobScraper:
if session_loaded:
print("🔁 Using saved session — verifying login...")
- await page.goto("https://www.linkedin.com/feed/", timeout=60000)
+ await page.goto("https://www.linkedin.com/feed/", timeout=120000)
if "feed" in page.url and "login" not in page.url:
print("✅ Session still valid.")
login_successful = True
@@ -269,7 +255,7 @@ class LinkedInJobScraper:
print(" No credentials — proceeding as guest.")
login_successful = True
- await page.wait_for_load_state("load", timeout=60000)
+ await page.wait_for_load_state("load", timeout=120000)
print("✅ Post-login page fully loaded. Starting search...")
# >>> PROTECTION CHECK USING FETCHER LOGIC <<<
@@ -292,7 +278,7 @@ class LinkedInJobScraper:
print("✅ Protection present but content accessible — proceeding.")
print(f"🔍 Searching for: {search_keywords}")
- await page.goto(search_url, wait_until='load', timeout=60000)
+ await page.goto(search_url, wait_until='load', timeout=120000)
await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed)
# >>> PROTECTION CHECK ON SEARCH PAGE <<<
@@ -322,7 +308,7 @@ class LinkedInJobScraper:
print(f" Found {initial_jobs} initial job(s) (total: {len(all_job_links)})")
iteration = 1
- while True:
+ while True and iteration >= 5:
print(f"🔄 Iteration {iteration}: Checking for new jobs...")
prev_job_count = len(all_job_links)
@@ -355,10 +341,6 @@ class LinkedInJobScraper:
print("🔚 No new jobs found after refresh. Stopping.")
break
- if iteration > 10:
- print("🔄 Maximum iterations reached. Stopping.")
- break
print(f"✅ Collected {len(all_job_links)} unique job links.")
scraped_count = 0
@@ -386,8 +368,9 @@ class LinkedInJobScraper:
if apply_btn:
break
- page_data = None
- final_url = job_page.url
+ final_url = full_url
+ external_url = None
+ page_content = None
if apply_btn:
print(" → Clicking 'Apply' / 'Easy Apply' button...")
@ -399,44 +382,61 @@ class LinkedInJobScraper:
try: try:
external_page = await asyncio.wait_for(page_waiter, timeout=5.0) external_page = await asyncio.wait_for(page_waiter, timeout=5.0)
print(" 🌐 External job site opened in new tab.") print(" 🌐 External job site opened in new tab.")
await external_page.wait_for_load_state("load", timeout=60000) await external_page.wait_for_load_state("load", timeout=120000)
await asyncio.sleep(2 * self.human_speed) await asyncio.sleep(2 * self.human_speed)
await self.engine._human_like_scroll(external_page) await self.engine._human_like_scroll(external_page)
await asyncio.sleep(2 * self.human_speed) await asyncio.sleep(2 * self.human_speed)
page_data = await self._extract_all_page_content(external_page) # Extract raw content from external page for LLM processing
final_url = external_page.url external_url = external_page.url
final_url = external_url
page_content = await self._extract_page_content_for_llm(external_page)
if not external_page.is_closed(): if not external_page.is_closed():
await external_page.close() await external_page.close()
except asyncio.TimeoutError: except asyncio.TimeoutError:
print(" 🖥️ No external tab — scraping LinkedIn job page directly.") print(" 🖥️ No external tab — scraping LinkedIn job page directly.")
await job_page.wait_for_timeout(2000) await job_page.wait_for_timeout(60000)
try: try:
await job_page.wait_for_selector("div.jobs-apply-button--fixed, div.jobs-easy-apply-modal", timeout=8000) await job_page.wait_for_selector("div.jobs-apply-button--fixed, div.jobs-easy-apply-modal", timeout=80000)
except PlaywrightTimeoutError: except PlaywrightTimeoutError:
pass pass
await self.engine._human_like_scroll(job_page) await self.engine._human_like_scroll(job_page)
await asyncio.sleep(2 * self.human_speed) await asyncio.sleep(2 * self.human_speed)
page_data = await self._extract_all_page_content(job_page) page_content = await self._extract_page_content_for_llm(job_page)
else: else:
print(" ⚠️ No 'Apply' button found — scraping job details directly.") print(" ⚠️ No 'Apply' button found — scraping job details directly.")
await self.engine._human_like_scroll(job_page) await self.engine._human_like_scroll(job_page)
await asyncio.sleep(2 * self.human_speed) await asyncio.sleep(2 * self.human_speed)
page_data = await self._extract_all_page_content(job_page) page_content = await self._extract_page_content_for_llm(job_page)
job_id = final_url.split("/")[-2] if "/jobs/view/" in final_url else "unknown" job_id = full_url.split("/")[-2] if "/jobs/view/" in full_url else "unknown"
raw_data = { raw_data = {
"page_content": page_data, "page_content": page_content,
"url": job_page.url, "url": final_url,
"job_id": job_page.url.split("/")[-2] if "/jobs/view/" in job_page.url else "unknown" "job_id": job_id,
"search_keywords": search_keywords
} }
# LLM agent is now fully responsible for extraction and validation
refined_data = await self.llm_agent.refine_job_data(raw_data, self.user_request) refined_data = await self.llm_agent.refine_job_data(raw_data, self.user_request)
if refined_data and refined_data.get("title", "N/A") != "N/A": if refined_data and refined_data.get("title", "N/A") != "N/A":
# Ensure compulsory fields are present (fallback if LLM missed them)
compulsory_fields = ['company_name', 'job_id', 'url']
for field in compulsory_fields:
if not refined_data.get(field) or refined_data[field] in ["N/A", "", "Unknown"]:
if field == 'job_id':
refined_data[field] = job_id
elif field == 'url':
refined_data[field] = final_url
elif field == 'company_name':
refined_data[field] = "Unknown Company"
refined_data['scraped_at'] = datetime.now().isoformat()
refined_data['category'] = clean_keywords
await self.llm_agent.save_job_data(refined_data, search_keywords) await self.llm_agent.save_job_data(refined_data, search_keywords)
scraped_count += 1 scraped_count += 1
print(f" ✅ Scraped and refined: {refined_data['title'][:50]}...") print(f" ✅ Scraped and refined: {refined_data['title'][:50]}...")
@@ -455,7 +455,7 @@ class LinkedInJobScraper:
finally:
print(" ↩️ Returning to LinkedIn search results...")
- await page.goto(search_url, timeout=60000)
+ await page.goto(search_url, timeout=120000)
await asyncio.sleep(4 * self.human_speed)
await browser.close()
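For reference, the payload this file now hands to the refiner and the JSON it expects back look roughly like this (all values below are illustrative placeholders; `category` is set from `clean_keywords`, which is defined elsewhere in the file):

```python
# Illustrative shapes only; the field names come from the diff above.
raw_data = {
    "page_content": "<html>...rendered job page...</html>",
    "url": "https://www.linkedin.com/jobs/view/1234567890/",
    "job_id": "1234567890",
    "search_keywords": "Software Engineer location:New York",
}

refined_data = {
    "title": "Software Engineer",
    "company_name": "Example Corp",        # falls back to "Unknown Company" if missing
    "location": "New York, NY",
    "description": "...",
    "requirements": "...",
    "qualifications": "...",
    "salary_range": "N/A",
    "nature_of_work": "hybrid",
    "job_id": "1234567890",
    "url": "https://www.linkedin.com/jobs/view/1234567890/",
    # added by the scraper just before saving:
    "scraped_at": "2025-12-05T17:00:43",
    "category": "Software Engineer",       # value of clean_keywords
}
```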

View File

@@ -4,6 +4,8 @@ from job_scraper2 import LinkedInJobScraper
import os
from dotenv import load_dotenv
import asyncio
+ import random
+ import time
# Load environment variables
load_dotenv()
@@ -11,7 +13,7 @@ load_dotenv()
async def main():
engine = FingerprintScrapingEngine(
- seed="job_scraping_123",
+ seed="job_scraping_12",
target_os="windows",
db_path="job_listings.db",
markdown_path="job_listings.md"
@@ -20,13 +22,50 @@ async def main():
# Initialize scraper with target field
scraper = LinkedInJobScraper(engine, human_speed=1.6, user_request="Extract title, company, location, description, requirements, qualifications, nature of job(remote, onsite, hybrid) and salary")
+ # List of job titles to cycle through
+ job_titles = [
+ "Software Engineer",
+ "Data Scientist",
+ "Product Manager",
+ "UX Designer",
+ "DevOps Engineer",
+ "Machine Learning Engineer",
+ "Frontend Developer",
+ "Backend Developer",
+ "Full Stack Developer",
+ "Data Analyst"
+ ]
+ fixed_location = "New York"
+ # Keep cycling through all job titles
+ while True:
+ # Shuffle job titles to randomize order
+ random.shuffle(job_titles)
+ for job_title in job_titles:
+ search_keywords = f"{job_title} location:{fixed_location}"
+ print(f"\n{'='*60}")
+ print(f"Starting scrape for: {search_keywords}")
+ print(f"{'='*60}")
await scraper.scrape_jobs(
- search_keywords="Lecturer location:New York",
+ search_keywords=search_keywords,
credentials={
"email": os.getenv("SCRAPING_USERNAME"),
"password": os.getenv("SCRAPING_PASSWORD")
}
)
+ print(f"\n✅ Completed scraping for: {job_title}")
+ print(f"⏳ Waiting 2 minutes before next job title...")
+ # Wait 2 minutes before next job title
+ time.sleep(120)
+ print(f"\n✅ Completed full cycle of all job titles")
+ print(f"🔄 Starting new cycle...")
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -1,28 +1,125 @@
from openai import OpenAI
from typing import Dict, Any
import asyncio
- import sqlite3
+ import psycopg2
import os
from datetime import datetime
import json
import re
+ from bs4 import BeautifulSoup
from dotenv import load_dotenv
# Load environment variables from .env
load_dotenv()
class LLMJobRefiner:
def __init__(self):
deepseek_api_key = os.getenv("DEEPSEEK_API_KEY")
if not deepseek_api_key:
raise ValueError("DEEPSEEK_API_KEY not found in .env file.")
+ # Database credentials from .env
+ self.db_url = os.getenv("DB_URL")
+ self.db_username = os.getenv("DB_USERNAME")
+ self.db_password = os.getenv("DB_PASSWORD")
+ self.db_host = os.getenv("DB_HOST")
+ self.db_port = os.getenv("DB_PORT")
+ if not self.db_url or not self.db_username or not self.db_password:
+ raise ValueError("Database credentials not found in .env file.")
# DeepSeek uses OpenAI-compatible API
self.client = OpenAI(
api_key=deepseek_api_key,
base_url="https://api.deepseek.com/v1"
)
- self.model = "deepseek-chat" # or "deepseek-coder" if preferred
+ self.model = "deepseek-chat"
+ self._init_db()
+ def _init_db(self):
+ """Initialize PostgreSQL database connection and create table"""
+ try:
+ self.db_url = os.getenv("DB_URL")
+ if self.db_url and "supabase.com" in self.db_url:
+ conn = psycopg2.connect(
+ host=self.db_host,
+ port=self.db_port,
+ database="postgres",
+ user=self.db_username,
+ password=self.db_password
+ )
+ else:
+ conn = psycopg2.connect(
+ host=self.db_host,
+ port=self.db_port,
+ database="postgres",
+ user=self.db_username,
+ password=self.db_password
+ )
+ cursor = conn.cursor()
+ cursor.execute('''
+ CREATE TABLE IF NOT EXISTS jobs (
+ id SERIAL PRIMARY KEY,
+ title TEXT,
+ company_name TEXT,
+ location TEXT,
+ description TEXT,
+ requirements TEXT,
+ qualifications TEXT,
+ salary_range TEXT,
+ nature_of_work TEXT,
+ job_id TEXT UNIQUE,
+ url TEXT,
+ category TEXT,
+ scraped_at TIMESTAMP,
+ created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+ )
+ ''')
+ # Ensure the uniqueness constraint exists
+ cursor.execute('''
+ ALTER TABLE jobs DROP CONSTRAINT IF EXISTS jobs_job_id_key;
+ ALTER TABLE jobs ADD CONSTRAINT jobs_job_id_key UNIQUE (job_id);
+ ''')
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_job_id ON jobs(job_id)')
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_category ON jobs(category)')
+ conn.commit()
+ cursor.close()
+ conn.close()
+ print("✅ PostgreSQL database initialized successfully")
+ except Exception as e:
+ print(f"❌ Database initialization error: {e}")
+ raise
+ def _clean_html_for_llm(self, html_content: str) -> str:
+ """Clean HTML to make it more readable for LLM while preserving structure"""
+ try:
+ soup = BeautifulSoup(html_content, 'html.parser')
+ # Remove script and style elements
+ for script in soup(["script", "style", "nav", "footer", "header"]):
+ script.decompose()
+ # Extract text but keep some structure
+ text = soup.get_text(separator=' ', strip=True)
+ # Clean up whitespace
+ text = re.sub(r'\s+', ' ', text)
+ # Limit length for LLM context
+ if len(text) > 10000:
+ text = text[:10000] + "..."
+ return text
+ except Exception as e:
+ print(f"HTML cleaning error: {e}")
+ # Fallback to raw content if cleaning fails
+ return html_content[:100000] if len(html_content) > 100000 else html_content
def _generate_content_sync(self, prompt: str) -> str:
"""Synchronous call to DeepSeek API"""
@@ -40,33 +137,47 @@ class LLMJobRefiner:
return ""
async def refine_job_data(self, raw_data: Dict[str, Any], target_field: str) -> Dict[str, Any]:
+ # Clean the raw HTML content for better LLM processing
+ page_content = raw_data.get('page_content', '')
+ cleaned_content = self._clean_html_for_llm(page_content)
+ # Get job_id and url from raw data
+ job_id = raw_data.get('job_id', 'unknown')
+ url = raw_data.get('url', 'N/A')
prompt = f"""
- You are a job data extraction assistant. Extract the following fields from the job posting:
- - title
- - company_name
- - location
- - description
- - requirements
- - qualifications
- - salary_range
- - nature_of_work (remote, onsite, or hybrid)
- - job_id
- Target Field: {target_field}
- Raw Page Content:
- {raw_data.get('page_content', '')}
- Instructions:
- 1. Extract only the information relevant to the target field: {target_field}
- 2. Clean up any formatting issues in the description
- 3. Standardize location format (city, state/country)
- 4. Extract salary range if mentioned
- 5. Determine nature of work from work arrangements
- 6. Ensure all fields are properly formatted
- 7. If a field cannot be found, use "N/A"
- 8. Return ONLY the refined data in JSON format
- Response format (only return the JSON):
+ You are a job posting data extractor with two modes:
+ PRIMARY MODE (PREFERRED):
+ - Extract EXACT text as it appears on the page for all fields
+ - DO NOT summarize, paraphrase, or interpret
+ - Copy verbatim content including original wording and formatting
+ FALLBACK MODE (ONLY IF FIELD IS MISSING):
+ - If a field is NOT explicitly stated anywhere in the content, you MAY infer it using clear contextual clues
+ - Inference rules:
+ * company_name: Look for patterns like "at [Company]", "Join [Company]", "[Company] is hiring"
+ * nature_of_work: Look for "remote", "onsite", "hybrid", "work from home", "office-based"
+ * location: Extract city/state/country mentions near job title or details
+ * title: Use the largest/primary heading if no explicit "job title" label exists
+ REQUIRED FIELDS (must always have a value):
+ - title: Exact job title or best inference
+ - company_name: Exact company name or best inference
+ - job_id: Use provided: {job_id}
+ - url: Use provided: {url}
+ OPTIONAL FIELDS (use exact text or "N/A" if not present and not inferable):
+ - location
+ - description
+ - requirements
+ - qualifications
+ - salary_range
+ - nature_of_work
+ Page Content:
+ {cleaned_content}
+ Response format (ONLY return this JSON):
{{
"title": "...",
"company_name": "...",
@@ -76,8 +187,8 @@ class LLMJobRefiner:
"qualifications": "...",
"salary_range": "...",
"nature_of_work": "...",
- "job_id": "{raw_data.get('job_id', 'unknown')}",
- "url": "{raw_data.get('url', 'N/A')}"
+ "job_id": "{job_id}",
+ "url": "{url}"
}}
"""
@@ -87,7 +198,17 @@ class LLMJobRefiner:
lambda: self._generate_content_sync(prompt)
)
refined_data = self._parse_llm_response(response_text)
- return refined_data if refined_data else None
+ # Final validation - ensure required fields are present and meaningful
+ if refined_data:
+ required_fields = ['title', 'company_name', 'job_id', 'url']
+ for field in required_fields:
+ if not refined_data.get(field) or refined_data[field] in ["N/A", "", "Unknown", "Company", "Job"]:
+ return None # LLM failed to extract properly
+ return refined_data
+ return None
except Exception as e:
print(f"LLM refinement failed: {str(e)}")
return None
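`_parse_llm_response` is referenced here but not changed by this commit, so it is not shown. A minimal sketch of what such a parser typically does (pull the first JSON object out of the model's reply, tolerating surrounding prose or code fences) — an assumption, not the committed implementation:

```python
import json
import re
from typing import Any, Dict, Optional

def parse_llm_response(response_text: str) -> Optional[Dict[str, Any]]:
    """Hypothetical sketch: extract the first JSON object from an LLM reply."""
    if not response_text:
        return None
    match = re.search(r"\{.*\}", response_text, re.DOTALL)  # tolerates fences/prose around the JSON
    if not match:
        return None
    try:
        return json.loads(match.group(0))
    except json.JSONDecodeError:
        return None
```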
@@ -109,22 +230,23 @@ class LLMJobRefiner:
await self._save_to_markdown(job_data, keyword)
async def _save_to_db(self, job_data: Dict[str, Any]):
- db_path = "linkedin_jobs.db"
- os.makedirs(os.path.dirname(db_path) or ".", exist_ok=True)
- with sqlite3.connect(db_path) as conn:
- cursor = conn.cursor()
- cursor.execute('''
- CREATE TABLE IF NOT EXISTS jobs (
- title TEXT, company_name TEXT, location TEXT, description TEXT,
- requirements TEXT, qualifications TEXT, salary_range TEXT,
- nature_of_work TEXT, job_id TEXT, url TEXT
- )
- ''')
- cursor.execute('''
- INSERT OR IGNORE INTO jobs
- (title, company_name, location, description, requirements,
- qualifications, salary_range, nature_of_work, job_id, url)
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+ """Save job data to PostgreSQL database with job_id uniqueness"""
+ try:
+ conn = psycopg2.connect(
+ host=self.db_host,
+ port=self.db_port,
+ database="postgres",
+ user=self.db_username,
+ password=self.db_password
+ )
+ cursor = conn.cursor()
+ cursor.execute('''
+ INSERT INTO jobs
+ (title, company_name, location, description, requirements,
+ qualifications, salary_range, nature_of_work, job_id, url, category, scraped_at)
+ VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
+ ON CONFLICT (job_id) DO NOTHING
''', (
job_data.get("title", "N/A"),
job_data.get("company_name", "N/A"),
@@ -135,9 +257,19 @@ class LLMJobRefiner:
job_data.get("salary_range", "N/A"),
job_data.get("nature_of_work", "N/A"),
job_data.get("job_id", "N/A"),
- job_data.get("url", "N/A")
+ job_data.get("url", "N/A"),
+ job_data.get("category", "N/A"),
+ job_data.get("scraped_at")
))
conn.commit()
+ cursor.close()
+ conn.close()
+ print(f" 💾 Saved job to category '{job_data.get('category', 'N/A')}' with job_id: {job_data.get('job_id', 'N/A')}")
+ except Exception as e:
+ print(f"❌ Database save error: {e}")
async def _save_to_markdown(self, job_data: Dict[str, Any], keyword: str):
os.makedirs("linkedin_jobs", exist_ok=True)
@@ -154,6 +286,8 @@ class LLMJobRefiner:
f.write(f"- **Nature of Work**: {job_data.get('nature_of_work', 'N/A')}\n")
f.write(f"- **Salary Range**: {job_data.get('salary_range', 'N/A')}\n")
f.write(f"- **Job ID**: {job_data.get('job_id', 'N/A')}\n")
+ f.write(f"- **Category**: {job_data.get('category', 'N/A')}\n")
+ f.write(f"- **Scraped At**: {job_data.get('scraped_at', 'N/A')}\n")
f.write(f"- **URL**: <{job_data.get('url', 'N/A')}>\n\n")
f.write(f"### Description\n\n{job_data.get('description', 'N/A')}\n\n")
f.write(f"### Requirements\n\n{job_data.get('requirements', 'N/A')}\n\n")

View File

@@ -69,7 +69,7 @@ class FingerprintScrapingEngine:
self.optimization_params = {
"base_delay": 2.0,
"max_concurrent_requests": 4,
- "request_timeout": 60000,
+ "request_timeout": 120000,
"retry_attempts": 3,
"captcha_handling_strategy": "avoid", # or "solve_fallback"
"cloudflare_wait_strategy": "smart_wait", # or "aggressive_reload"
@@ -155,7 +155,7 @@ class FingerprintScrapingEngine:
# Increase timeout if avg response time is high
if avg_rt > 20:
- self.optimization_params["request_timeout"] = 90000 # 90 seconds
+ self.optimization_params["request_timeout"] = 150000 # 150 seconds
print(f"Optimization Params Updated: {self.optimization_params}")
@@ -371,7 +371,7 @@ class FingerprintScrapingEngine:
# Reload occasionally to trigger potential client-side checks
if (time.time() - start_time) > 15 and (time.time() - start_time) % 20 < 2:
print("Reloading page during Cloudflare wait...")
- await page.reload(wait_until='load', timeout=30000)
+ await page.reload(wait_until='load', timeout=80000)
print("Timeout waiting for Cloudflare resolution.")
return False