Modifications to work with PostgreSQL and use an LLM to extract and refine data
This commit is contained in:
parent 4f78a845ae
commit 160efadbfb
@@ -23,11 +23,11 @@ class StealthyFetcher:
             print(f"Attempt {attempt + 1} to fetch {url}")
             page = await self.context.new_page()

-            await page.goto(url, wait_until='load', timeout=60000)
+            await page.goto(url, wait_until='load', timeout=120000)

             if wait_for_selector:
                 try:
-                    await page.wait_for_selector(wait_for_selector, timeout=10000)
+                    await page.wait_for_selector(wait_for_selector, timeout=40000)
                 except PlaywrightTimeoutError:
                     print(f"Selector {wait_for_selector} not found immediately, continuing...")

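The larger navigation and selector budgets above live inside the fetcher's retry loop. A minimal sketch of that pattern with the timeouts pulled out as parameters (the function and parameter names here are illustrative, not part of the committed code):

from playwright.async_api import TimeoutError as PlaywrightTimeoutError

async def fetch_with_retries(context, url, wait_for_selector=None,
                             attempts=3, goto_timeout_ms=120000, selector_timeout_ms=40000):
    # Illustrative sketch: open a page, navigate with a generous timeout,
    # and tolerate a missing selector instead of failing the whole attempt.
    for attempt in range(attempts):
        print(f"Attempt {attempt + 1} to fetch {url}")
        page = await context.new_page()
        try:
            await page.goto(url, wait_until="load", timeout=goto_timeout_ms)
            if wait_for_selector:
                try:
                    await page.wait_for_selector(wait_for_selector, timeout=selector_timeout_ms)
                except PlaywrightTimeoutError:
                    print(f"Selector {wait_for_selector} not found immediately, continuing...")
            return page
        except PlaywrightTimeoutError:
            await page.close()
    return None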
@@ -88,7 +88,7 @@ class StealthyFetcher:
     async def _is_content_accessible(self, page: Page, wait_for_selector: Optional[str] = None) -> bool:
         if wait_for_selector:
             try:
-                await page.wait_for_selector(wait_for_selector, timeout=5000)
+                await page.wait_for_selector(wait_for_selector, timeout=40000)
                 return True
             except PlaywrightTimeoutError:
                 pass
@@ -118,7 +118,7 @@ class StealthyFetcher:

            if (time.time() - start_time) > 15 and (time.time() - start_time) % 20 < 2:
                print("🔄 Reloading page during Cloudflare wait...")
-               await page.reload(wait_until='load', timeout=30000)
+               await page.reload(wait_until='load', timeout=120000)

        print("⏰ Timeout waiting for Cloudflare resolution.")
        return False
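The condition above reloads the page roughly once every 20 seconds after the first 15 seconds of waiting. A small sketch of that timing idea in isolation; the challenge-cleared check here is a stand-in heuristic, not the project's actual detection logic:

import asyncio
import time

async def wait_out_challenge(page, max_wait_s=120, reload_timeout_ms=120000):
    start_time = time.time()
    while (time.time() - start_time) < max_wait_s:
        elapsed = time.time() - start_time
        # Stand-in heuristic for "challenge cleared"; the real code inspects the page differently.
        if "just a moment" not in (await page.title()).lower():
            return True
        if elapsed > 15 and elapsed % 20 < 2:
            print("Reloading page during Cloudflare wait...")
            await page.reload(wait_until="load", timeout=reload_timeout_ms)
        await asyncio.sleep(2)
    print("Timeout waiting for Cloudflare resolution.")
    return False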
@@ -1,13 +1,12 @@
 import asyncio
 import random
-import sqlite3
-import os
 from typing import Optional, Dict
 from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
 from browserforge.injectors.playwright import AsyncNewContext
 from llm_agent import LLMJobRefiner
 import re
 from fetcher import StealthyFetcher
+from datetime import datetime


 class LinkedInJobScraper:
@@ -26,25 +25,8 @@ class LinkedInJobScraper:
         self.llm_agent = LLMJobRefiner()

     def _init_db(self):
-        os.makedirs(os.path.dirname(self.db_path) if os.path.dirname(self.db_path) else ".", exist_ok=True)
-        with sqlite3.connect(self.db_path) as conn:
-            cursor = conn.cursor()
-            cursor.execute('''
-                CREATE TABLE IF NOT EXISTS jobs (
-                    id INTEGER PRIMARY KEY AUTOINCREMENT,
-                    title TEXT,
-                    company_name TEXT,
-                    location TEXT,
-                    description TEXT,
-                    requirements TEXT,
-                    qualifications TEXT,
-                    salary_range TEXT,
-                    nature_of_work TEXT,
-                    job_id TEXT,
-                    url TEXT UNIQUE
-                )
-            ''')
-            conn.commit()
+        # This method is kept for backward compatibility but LLMJobRefiner handles PostgreSQL now
+        pass

     async def _human_click(self, page, element, wait_after: bool = True):
         if not element:
@@ -61,7 +43,7 @@ class LinkedInJobScraper:

     async def _login(self, page, credentials: Dict) -> bool:
         print("🔐 Navigating to LinkedIn login page...")
-        await page.goto("https://www.linkedin.com/login", timeout=60000)
+        await page.goto("https://www.linkedin.com/login", timeout=120000)
         await asyncio.sleep(random.uniform(2.0, 3.5) * self.human_speed)

         email_field = await page.query_selector('input[name="session_key"]')
@@ -104,7 +86,11 @@ class LinkedInJobScraper:
             print("❌ Login may have failed.")
             return False

-    async def _extract_all_page_content(self, page) -> str:
+    async def _extract_page_content_for_llm(self, page) -> str:
+        """
+        Extract raw page content as HTML/text for LLM processing
+        The LLM will handle all extraction logic, not specific selectors
+        """
         await asyncio.sleep(2 * self.human_speed)
         await self.engine._human_like_scroll(page)
         await asyncio.sleep(2 * self.human_speed)
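The renamed helper hands the whole rendered page to the LLM instead of scraping individual selectors. A simplified stand-in for what such a method can return once scrolling has settled (not the project's exact implementation):

async def extract_page_content_for_llm(page) -> str:
    # Simplified sketch: return the fully rendered HTML and let the
    # downstream cleaning step plus the LLM decide which parts matter.
    await page.wait_for_load_state("load")
    return await page.content()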
@@ -172,7 +158,7 @@ class LinkedInJobScraper:
                 await self._human_click(page, next_btn)
                 await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed)
                 try:
-                    await page.wait_for_function("() => window.location.href.includes('start=')", timeout=60000)
+                    await page.wait_for_function("() => window.location.href.includes('start=')", timeout=120000)
                 except:
                     pass
                 current_page += 1
@@ -247,7 +233,7 @@ class LinkedInJobScraper:

            if session_loaded:
                print("🔁 Using saved session — verifying login...")
-               await page.goto("https://www.linkedin.com/feed/", timeout=60000)
+               await page.goto("https://www.linkedin.com/feed/", timeout=120000)
                if "feed" in page.url and "login" not in page.url:
                    print("✅ Session still valid.")
                    login_successful = True
@@ -269,7 +255,7 @@ class LinkedInJobScraper:
                print("ℹ️ No credentials — proceeding as guest.")
                login_successful = True

-           await page.wait_for_load_state("load", timeout=60000)
+           await page.wait_for_load_state("load", timeout=120000)
            print("✅ Post-login page fully loaded. Starting search...")

            # >>> PROTECTION CHECK USING FETCHER LOGIC <<<
@@ -292,7 +278,7 @@ class LinkedInJobScraper:
                print("✅ Protection present but content accessible — proceeding.")

            print(f"🔍 Searching for: {search_keywords}")
-           await page.goto(search_url, wait_until='load', timeout=60000)
+           await page.goto(search_url, wait_until='load', timeout=120000)
            await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed)

            # >>> PROTECTION CHECK ON SEARCH PAGE <<<
@@ -322,7 +308,7 @@ class LinkedInJobScraper:
            print(f" ➕ Found {initial_jobs} initial job(s) (total: {len(all_job_links)})")

            iteration = 1
-           while True:
+           while True and iteration >= 5:
                print(f"🔄 Iteration {iteration}: Checking for new jobs...")

                prev_job_count = len(all_job_links)
@@ -355,10 +341,6 @@ class LinkedInJobScraper:
                    print("🔚 No new jobs found after refresh. Stopping.")
                    break

-               if iteration > 10:
-                   print("🔄 Maximum iterations reached. Stopping.")
-                   break
-
            print(f"✅ Collected {len(all_job_links)} unique job links.")

            scraped_count = 0
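Two hunks above interact here: the loop condition becomes `while True and iteration >= 5:`, which never runs because `iteration` starts at 1, and the old `iteration > 10` guard is removed. A sketch of the bounded collection loop this appears to aim for, assuming a cap of five passes (the cap and the placeholder set are assumptions, not the committed behaviour):

all_job_links = set()   # collected job URLs (placeholder for the scraper's real set)
iteration = 1
max_iterations = 5      # assumed cap replacing the removed hard limit of 10
while iteration <= max_iterations:
    print(f"🔄 Iteration {iteration}: Checking for new jobs...")
    prev_job_count = len(all_job_links)
    # ... refresh the results page and add any newly visible job links here ...
    if len(all_job_links) == prev_job_count:
        print("🔚 No new jobs found after refresh. Stopping.")
        break
    iteration += 1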
@@ -386,8 +368,9 @@ class LinkedInJobScraper:
                        if apply_btn:
                            break

-                   page_data = None
-                   final_url = job_page.url
+                   final_url = full_url
+                   external_url = None
+                   page_content = None

                    if apply_btn:
                        print(" → Clicking 'Apply' / 'Easy Apply' button...")
@@ -399,44 +382,61 @@ class LinkedInJobScraper:
                        try:
                            external_page = await asyncio.wait_for(page_waiter, timeout=5.0)
                            print(" 🌐 External job site opened in new tab.")
-                           await external_page.wait_for_load_state("load", timeout=60000)
+                           await external_page.wait_for_load_state("load", timeout=120000)
                            await asyncio.sleep(2 * self.human_speed)
                            await self.engine._human_like_scroll(external_page)
                            await asyncio.sleep(2 * self.human_speed)

-                           page_data = await self._extract_all_page_content(external_page)
-                           final_url = external_page.url
+                           # Extract raw content from external page for LLM processing
+                           external_url = external_page.url
+                           final_url = external_url
+                           page_content = await self._extract_page_content_for_llm(external_page)

                            if not external_page.is_closed():
                                await external_page.close()

                        except asyncio.TimeoutError:
                            print(" 🖥️ No external tab — scraping LinkedIn job page directly.")
-                           await job_page.wait_for_timeout(2000)
+                           await job_page.wait_for_timeout(60000)
                            try:
-                               await job_page.wait_for_selector("div.jobs-apply-button--fixed, div.jobs-easy-apply-modal", timeout=8000)
+                               await job_page.wait_for_selector("div.jobs-apply-button--fixed, div.jobs-easy-apply-modal", timeout=80000)
                            except PlaywrightTimeoutError:
                                pass
                            await self.engine._human_like_scroll(job_page)
                            await asyncio.sleep(2 * self.human_speed)
-                           page_data = await self._extract_all_page_content(job_page)
+                           page_content = await self._extract_page_content_for_llm(job_page)
                    else:
                        print(" ⚠️ No 'Apply' button found — scraping job details directly.")
                        await self.engine._human_like_scroll(job_page)
                        await asyncio.sleep(2 * self.human_speed)
-                       page_data = await self._extract_all_page_content(job_page)
+                       page_content = await self._extract_page_content_for_llm(job_page)

-                   job_id = final_url.split("/")[-2] if "/jobs/view/" in final_url else "unknown"
+                   job_id = full_url.split("/")[-2] if "/jobs/view/" in full_url else "unknown"

                    raw_data = {
-                       "page_content": page_data,
-                       "url": job_page.url,
-                       "job_id": job_page.url.split("/")[-2] if "/jobs/view/" in job_page.url else "unknown"
+                       "page_content": page_content,
+                       "url": final_url,
+                       "job_id": job_id,
+                       "search_keywords": search_keywords
                    }

+                   # LLM agent is now fully responsible for extraction and validation
                    refined_data = await self.llm_agent.refine_job_data(raw_data, self.user_request)

                    if refined_data and refined_data.get("title", "N/A") != "N/A":
+                       # Ensure compulsory fields are present (fallback if LLM missed them)
+                       compulsory_fields = ['company_name', 'job_id', 'url']
+                       for field in compulsory_fields:
+                           if not refined_data.get(field) or refined_data[field] in ["N/A", "", "Unknown"]:
+                               if field == 'job_id':
+                                   refined_data[field] = job_id
+                               elif field == 'url':
+                                   refined_data[field] = final_url
+                               elif field == 'company_name':
+                                   refined_data[field] = "Unknown Company"
+
+                       refined_data['scraped_at'] = datetime.now().isoformat()
+                       refined_data['category'] = clean_keywords
                        await self.llm_agent.save_job_data(refined_data, search_keywords)
                        scraped_count += 1
                        print(f" ✅ Scraped and refined: {refined_data['title'][:50]}...")
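For reference, the payload handed to the refiner and the kind of record expected back look roughly like this; every value below is made up for illustration:

raw_data = {
    "page_content": "<html>...rendered job page...</html>",
    "url": "https://www.linkedin.com/jobs/view/1234567890/",
    "job_id": "1234567890",
    "search_keywords": "Data Scientist location:New York",
}

# After refine_job_data() plus the compulsory-field fallback, a saved record
# carries at least these keys (values illustrative):
refined_data = {
    "title": "Data Scientist",
    "company_name": "Example Corp",
    "location": "New York, NY",
    "description": "...",
    "requirements": "...",
    "qualifications": "...",
    "salary_range": "N/A",
    "nature_of_work": "hybrid",
    "job_id": "1234567890",
    "url": "https://www.linkedin.com/jobs/view/1234567890/",
    "scraped_at": "2024-01-01T00:00:00",
    "category": "Data Scientist",
}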
@@ -455,7 +455,7 @@ class LinkedInJobScraper:

                finally:
                    print(" ↩️ Returning to LinkedIn search results...")
-                   await page.goto(search_url, timeout=60000)
+                   await page.goto(search_url, timeout=120000)
                    await asyncio.sleep(4 * self.human_speed)

            await browser.close()
@@ -4,6 +4,8 @@ from job_scraper2 import LinkedInJobScraper
 import os
 from dotenv import load_dotenv
 import asyncio
+import random
+import time

 # Load environment variables
 load_dotenv()
@@ -11,7 +13,7 @@ load_dotenv()

 async def main():
     engine = FingerprintScrapingEngine(
-        seed="job_scraping_123",
+        seed="job_scraping_12",
         target_os="windows",
         db_path="job_listings.db",
         markdown_path="job_listings.md"
@@ -20,13 +22,50 @@ async def main():
     # Initialize scraper with target field
     scraper = LinkedInJobScraper(engine, human_speed=1.6, user_request="Extract title, company, location, description, requirements, qualifications, nature of job(remote, onsite, hybrid) and salary")

-    await scraper.scrape_jobs(
-        search_keywords="Lecturer location:New York",
-        credentials={
-            "email": os.getenv("SCRAPING_USERNAME"),
-            "password": os.getenv("SCRAPING_PASSWORD")
-        }
-    )
+    # List of job titles to cycle through
+    job_titles = [
+        "Software Engineer",
+        "Data Scientist",
+        "Product Manager",
+        "UX Designer",
+        "DevOps Engineer",
+        "Machine Learning Engineer",
+        "Frontend Developer",
+        "Backend Developer",
+        "Full Stack Developer",
+        "Data Analyst"
+    ]
+
+    fixed_location = "New York"
+
+    # Keep cycling through all job titles
+    while True:
+        # Shuffle job titles to randomize order
+        random.shuffle(job_titles)
+
+        for job_title in job_titles:
+            search_keywords = f"{job_title} location:{fixed_location}"
+
+            print(f"\n{'='*60}")
+            print(f"Starting scrape for: {search_keywords}")
+            print(f"{'='*60}")
+
+            await scraper.scrape_jobs(
+                search_keywords=search_keywords,
+                credentials={
+                    "email": os.getenv("SCRAPING_USERNAME"),
+                    "password": os.getenv("SCRAPING_PASSWORD")
+                }
+            )
+
+            print(f"\n✅ Completed scraping for: {job_title}")
+            print(f"⏳ Waiting 2 minutes before next job title...")
+
+            # Wait 2 minutes before next job title
+            time.sleep(120)
+
+        print(f"\n✅ Completed full cycle of all job titles")
+        print(f"🔄 Starting new cycle...")

 if __name__ == "__main__":
     asyncio.run(main())
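One detail worth flagging in the new main loop: `time.sleep(120)` blocks the event loop inside `async def main()`. A sketch of the same cycle with a non-blocking pause, shown as an alternative rather than what the commit ships:

import asyncio
import random

async def cycle_job_titles(scraper, job_titles, fixed_location, credentials, pause_s=120):
    # Same cycling behaviour as main(), but the two-minute pause yields
    # control to the event loop instead of blocking it.
    while True:
        random.shuffle(job_titles)
        for job_title in job_titles:
            search_keywords = f"{job_title} location:{fixed_location}"
            await scraper.scrape_jobs(search_keywords=search_keywords, credentials=credentials)
            await asyncio.sleep(pause_s)
        print("✅ Completed full cycle of all job titles")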
llm_agent.py
@@ -1,28 +1,125 @@

 from openai import OpenAI
 from typing import Dict, Any
 import asyncio
-import sqlite3
+import psycopg2
 import os
 from datetime import datetime
 import json
 import re
+from bs4 import BeautifulSoup
 from dotenv import load_dotenv

 # Load environment variables from .env
 load_dotenv()


 class LLMJobRefiner:
     def __init__(self):
         deepseek_api_key = os.getenv("DEEPSEEK_API_KEY")
         if not deepseek_api_key:
             raise ValueError("DEEPSEEK_API_KEY not found in .env file.")
+
+        # Database credentials from .env
+        self.db_url = os.getenv("DB_URL")
+        self.db_username = os.getenv("DB_USERNAME")
+        self.db_password = os.getenv("DB_PASSWORD")
+        self.db_host = os.getenv("DB_HOST")
+        self.db_port = os.getenv("DB_PORT")
+
+        if not self.db_url or not self.db_username or not self.db_password:
+            raise ValueError("Database credentials not found in .env file.")
+
         # DeepSeek uses OpenAI-compatible API
         self.client = OpenAI(
             api_key=deepseek_api_key,
             base_url="https://api.deepseek.com/v1"
         )
-        self.model = "deepseek-chat"  # or "deepseek-coder" if preferred
+        self.model = "deepseek-chat"
+        self._init_db()
+
+    def _init_db(self):
+        """Initialize PostgreSQL database connection and create table"""
+        try:
+            self.db_url = os.getenv("DB_URL")
+            if self.db_url and "supabase.com" in self.db_url:
+                conn = psycopg2.connect(
+                    host=self.db_host,
+                    port=self.db_port,
+                    database="postgres",
+                    user=self.db_username,
+                    password=self.db_password
+                )
+            else:
+                conn = psycopg2.connect(
+                    host=self.db_host,
+                    port=self.db_port,
+                    database="postgres",
+                    user=self.db_username,
+                    password=self.db_password
+                )
+            cursor = conn.cursor()
+
+            cursor.execute('''
+                CREATE TABLE IF NOT EXISTS jobs (
+                    id SERIAL PRIMARY KEY,
+                    title TEXT,
+                    company_name TEXT,
+                    location TEXT,
+                    description TEXT,
+                    requirements TEXT,
+                    qualifications TEXT,
+                    salary_range TEXT,
+                    nature_of_work TEXT,
+                    job_id TEXT UNIQUE,
+                    url TEXT,
+                    category TEXT,
+                    scraped_at TIMESTAMP,
+                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+                )
+            ''')
+
+            # Ensure the uniqueness constraint exists
+            cursor.execute('''
+                ALTER TABLE jobs DROP CONSTRAINT IF EXISTS jobs_job_id_key;
+                ALTER TABLE jobs ADD CONSTRAINT jobs_job_id_key UNIQUE (job_id);
+            ''')
+
+            cursor.execute('CREATE INDEX IF NOT EXISTS idx_job_id ON jobs(job_id)')
+            cursor.execute('CREATE INDEX IF NOT EXISTS idx_category ON jobs(category)')
+
+            conn.commit()
+            cursor.close()
+            conn.close()
+            print("✅ PostgreSQL database initialized successfully")
+        except Exception as e:
+            print(f"❌ Database initialization error: {e}")
+            raise
+
+    def _clean_html_for_llm(self, html_content: str) -> str:
+        """Clean HTML to make it more readable for LLM while preserving structure"""
+        try:
+            soup = BeautifulSoup(html_content, 'html.parser')
+
+            # Remove script and style elements
+            for script in soup(["script", "style", "nav", "footer", "header"]):
+                script.decompose()
+
+            # Extract text but keep some structure
+            text = soup.get_text(separator=' ', strip=True)
+
+            # Clean up whitespace
+            text = re.sub(r'\s+', ' ', text)
+
+            # Limit length for LLM context
+            if len(text) > 10000:
+                text = text[:10000] + "..."
+
+            return text
+        except Exception as e:
+            print(f"HTML cleaning error: {e}")
+            # Fallback to raw content if cleaning fails
+            return html_content[:100000] if len(html_content) > 100000 else html_content
+
     def _generate_content_sync(self, prompt: str) -> str:
         """Synchronous call to DeepSeek API"""
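The same psycopg2 connection settings are assembled in both `_init_db` and `_save_to_db`, and the two branches of the `supabase.com` check above are currently identical. A small helper along these lines could centralize the connection; `get_connection` is a suggestion, not part of the committed code:

import os
import psycopg2

def get_connection():
    # Hypothetical helper: build the PostgreSQL connection once from the
    # same .env variables (DB_HOST, DB_PORT, DB_USERNAME, DB_PASSWORD).
    return psycopg2.connect(
        host=os.getenv("DB_HOST"),
        port=os.getenv("DB_PORT"),
        database="postgres",
        user=os.getenv("DB_USERNAME"),
        password=os.getenv("DB_PASSWORD"),
    )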
@@ -40,33 +137,47 @@ class LLMJobRefiner:
             return ""

     async def refine_job_data(self, raw_data: Dict[str, Any], target_field: str) -> Dict[str, Any]:
+        # Clean the raw HTML content for better LLM processing
+        page_content = raw_data.get('page_content', '')
+        cleaned_content = self._clean_html_for_llm(page_content)
+
+        # Get job_id and url from raw data
+        job_id = raw_data.get('job_id', 'unknown')
+        url = raw_data.get('url', 'N/A')
+
         prompt = f"""
-You are a job data extraction assistant. Extract the following fields from the job posting:
-- title
-- company_name
+You are a job posting data extractor with two modes:
+
+PRIMARY MODE (PREFERRED):
+- Extract EXACT text as it appears on the page for all fields
+- DO NOT summarize, paraphrase, or interpret
+- Copy verbatim content including original wording and formatting
+
+FALLBACK MODE (ONLY IF FIELD IS MISSING):
+- If a field is NOT explicitly stated anywhere in the content, you MAY infer it using clear contextual clues
+- Inference rules:
+  * company_name: Look for patterns like "at [Company]", "Join [Company]", "[Company] is hiring"
+  * nature_of_work: Look for "remote", "onsite", "hybrid", "work from home", "office-based"
+  * location: Extract city/state/country mentions near job title or details
+  * title: Use the largest/primary heading if no explicit "job title" label exists
+
+REQUIRED FIELDS (must always have a value):
+- title: Exact job title or best inference
+- company_name: Exact company name or best inference
+- job_id: Use provided: {job_id}
+- url: Use provided: {url}
+
+OPTIONAL FIELDS (use exact text or "N/A" if not present and not inferable):
 - location
 - description
 - requirements
 - qualifications
 - salary_range
-- nature_of_work (remote, onsite, or hybrid)
-- job_id
+- nature_of_work

-Target Field: {target_field}
-Raw Page Content:
-{raw_data.get('page_content', '')}
-
-Instructions:
-1. Extract only the information relevant to the target field: {target_field}
-2. Clean up any formatting issues in the description
-3. Standardize location format (city, state/country)
-4. Extract salary range if mentioned
-5. Determine nature of work from work arrangements
-6. Ensure all fields are properly formatted
-7. If a field cannot be found, use "N/A"
-8. Return ONLY the refined data in JSON format
-
-Response format (only return the JSON):
+Page Content:
+{cleaned_content}
+Response format (ONLY return this JSON):
 {{
     "title": "...",
     "company_name": "...",
@@ -76,8 +187,8 @@ class LLMJobRefiner:
     "qualifications": "...",
     "salary_range": "...",
     "nature_of_work": "...",
-    "job_id": "{raw_data.get('job_id', 'unknown')}",
-    "url": "{raw_data.get('url', 'N/A')}"
+    "job_id": "{job_id}",
+    "url": "{url}"
 }}
 """

@@ -87,7 +198,17 @@ class LLMJobRefiner:
                 lambda: self._generate_content_sync(prompt)
             )
             refined_data = self._parse_llm_response(response_text)
-            return refined_data if refined_data else None
+            # Final validation - ensure required fields are present and meaningful
+            if refined_data:
+                required_fields = ['title', 'company_name', 'job_id', 'url']
+                for field in required_fields:
+                    if not refined_data.get(field) or refined_data[field] in ["N/A", "", "Unknown", "Company", "Job"]:
+                        return None  # LLM failed to extract properly
+
+                return refined_data
+            return None

         except Exception as e:
             print(f"LLM refinement failed: {str(e)}")
             return None
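`_parse_llm_response` is not shown in this diff. A typical implementation pulls the first JSON object out of the model's reply, tolerating code fences or stray prose around it; the version below is an assumption about its behaviour, not the committed code:

import json
import re
from typing import Any, Dict, Optional

def parse_llm_response(response_text: str) -> Optional[Dict[str, Any]]:
    # Hypothetical sketch: find the outermost {...} block and parse it.
    match = re.search(r"\{.*\}", response_text, re.DOTALL)
    if not match:
        return None
    try:
        return json.loads(match.group(0))
    except json.JSONDecodeError:
        return None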
@@ -109,22 +230,23 @@ class LLMJobRefiner:
         await self._save_to_markdown(job_data, keyword)

     async def _save_to_db(self, job_data: Dict[str, Any]):
-        db_path = "linkedin_jobs.db"
-        os.makedirs(os.path.dirname(db_path) or ".", exist_ok=True)
-        with sqlite3.connect(db_path) as conn:
+        """Save job data to PostgreSQL database with job_id uniqueness"""
+        try:
+            conn = psycopg2.connect(
+                host=self.db_host,
+                port=self.db_port,
+                database="postgres",
+                user=self.db_username,
+                password=self.db_password
+            )
             cursor = conn.cursor()
+
             cursor.execute('''
-                CREATE TABLE IF NOT EXISTS jobs (
-                    title TEXT, company_name TEXT, location TEXT, description TEXT,
-                    requirements TEXT, qualifications TEXT, salary_range TEXT,
-                    nature_of_work TEXT, job_id TEXT, url TEXT
-                )
-            ''')
-            cursor.execute('''
-                INSERT OR IGNORE INTO jobs
+                INSERT INTO jobs
                 (title, company_name, location, description, requirements,
-                 qualifications, salary_range, nature_of_work, job_id, url)
-                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                 qualifications, salary_range, nature_of_work, job_id, url, category, scraped_at)
+                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
+                ON CONFLICT (job_id) DO NOTHING
             ''', (
                 job_data.get("title", "N/A"),
                 job_data.get("company_name", "N/A"),
@@ -135,9 +257,19 @@ class LLMJobRefiner:
                 job_data.get("salary_range", "N/A"),
                 job_data.get("nature_of_work", "N/A"),
                 job_data.get("job_id", "N/A"),
-                job_data.get("url", "N/A")
+                job_data.get("url", "N/A"),
+                job_data.get("category", "N/A"),
+                job_data.get("scraped_at")
             ))
+
             conn.commit()
+            cursor.close()
+            conn.close()
+
+            print(f" 💾 Saved job to category '{job_data.get('category', 'N/A')}' with job_id: {job_data.get('job_id', 'N/A')}")
+
+        except Exception as e:
+            print(f"❌ Database save error: {e}")

     async def _save_to_markdown(self, job_data: Dict[str, Any], keyword: str):
         os.makedirs("linkedin_jobs", exist_ok=True)
@@ -154,6 +286,8 @@ class LLMJobRefiner:
             f.write(f"- **Nature of Work**: {job_data.get('nature_of_work', 'N/A')}\n")
             f.write(f"- **Salary Range**: {job_data.get('salary_range', 'N/A')}\n")
             f.write(f"- **Job ID**: {job_data.get('job_id', 'N/A')}\n")
+            f.write(f"- **Category**: {job_data.get('category', 'N/A')}\n")
+            f.write(f"- **Scraped At**: {job_data.get('scraped_at', 'N/A')}\n")
             f.write(f"- **URL**: <{job_data.get('url', 'N/A')}>\n\n")
             f.write(f"### Description\n\n{job_data.get('description', 'N/A')}\n\n")
             f.write(f"### Requirements\n\n{job_data.get('requirements', 'N/A')}\n\n")
@@ -69,7 +69,7 @@ class FingerprintScrapingEngine:
         self.optimization_params = {
             "base_delay": 2.0,
             "max_concurrent_requests": 4,
-            "request_timeout": 60000,
+            "request_timeout": 120000,
             "retry_attempts": 3,
             "captcha_handling_strategy": "avoid",  # or "solve_fallback"
             "cloudflare_wait_strategy": "smart_wait",  # or "aggressive_reload"
@@ -155,7 +155,7 @@ class FingerprintScrapingEngine:

        # Increase timeout if avg response time is high
        if avg_rt > 20:
-           self.optimization_params["request_timeout"] = 90000  # 90 seconds
+           self.optimization_params["request_timeout"] = 150000  # 150 seconds

        print(f"Optimization Params Updated: {self.optimization_params}")

@@ -371,7 +371,7 @@ class FingerprintScrapingEngine:
            # Reload occasionally to trigger potential client-side checks
            if (time.time() - start_time) > 15 and (time.time() - start_time) % 20 < 2:
                print("Reloading page during Cloudflare wait...")
-               await page.reload(wait_until='load', timeout=30000)
+               await page.reload(wait_until='load', timeout=80000)

        print("Timeout waiting for Cloudflare resolution.")
        return False