Refactor scraper and sender modules for improved Redis management and SSL connection handling

- Introduced a RedisManager class in scraper.py to centralize Redis operations: job-seen tracking, LLM result caching, and an error cache for failed jobs.
- Renamed AshbyJobScraper to MultiPlatformJobScraper and extended the scraping logic to handle Ashby, Lever, and Greenhouse postings.
- Reworked browser initialization and context management: the browser is launched once and reused, and each job runs in a fresh context that is closed afterwards.
- Improved error handling and logging throughout the scraping process, including distinguishing deleted/missing jobs from other failures.
- Added SSL connection parameter handling for RabbitMQ in a new ssl_connection.py module.
- Refactored sender.py to use a RedisManager for job deduplication and to log to both a file and the console.
- Tightened CSV processing in sender.py with per-row validation (required columns, URL format) and better error handling.
- RabbitMQ connection parameters now support SSL configuration driven by environment variables.
Author: Ofure Ikheloa, 2025-12-12 13:48:26 +01:00
Commit: c370de83d5 (parent 0c447d0f77)
8 changed files with 803 additions and 346 deletions
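
Before the per-file diffs, a minimal sketch of how the refactored pieces are meant to be wired together, assuming the modules introduced or renamed below (ssl_connection.create_ssl_connection_parameters and the RedisManager in scraper.py). The helper name consume_one is hypothetical; start_consumer in scraper.py is the real entry point.

import json
import pika
from ssl_connection import create_ssl_connection_parameters
from scraper import RedisManager

def consume_one(queue_name: str = "job_queue") -> None:
    # Redis connection is configured from the REDIS_* environment variables.
    redis_manager = RedisManager()
    # SSL vs. plain AMQP is decided by RABBITMQ_SSL_ENABLED and related variables.
    params = create_ssl_connection_parameters()
    connection = pika.BlockingConnection(params)
    channel = connection.channel()
    channel.queue_declare(queue=queue_name, durable=True)

    method, _properties, body = channel.basic_get(queue=queue_name, auto_ack=False)
    if method is not None:
        job = json.loads(body)                              # {'job_link': ..., 'company_name': ...}
        job_id = job["job_link"].strip("/").split("/")[-1]
        if redis_manager.is_job_seen(job_id):
            channel.basic_ack(method.delivery_tag)          # duplicate: acknowledge and skip
        else:
            # ... hand the job to MultiPlatformJobScraper.scrape_job(...) here ...
            redis_manager.mark_job_seen(job_id)
            channel.basic_ack(method.delivery_tag)
    connection.close()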

config.ini (new file, 9 lines)

@@ -0,0 +1,9 @@
[rabbitmq]
queue_name = job_queue
[files]
directory = C:\Users\OfuRich\jobs\csv
[logging]
log_file = C:\Users\OfuRich\Documents\ai jobhub\Web_scraping_project\logs\sender.log


@ -6,6 +6,7 @@ from dotenv import load_dotenv
# Load environment variables from .env file # Load environment variables from .env file
load_dotenv() load_dotenv()
directory = "C:/Users/OfuRich/Downloads"
# LLM Agent Configuration # LLM Agent Configuration
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY") DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")

fetcher.py

@ -1,7 +1,8 @@
import asyncio import asyncio
import random import random
import time import time
from playwright.async_api import Page, BrowserContext, Browser, TimeoutError as PlaywrightTimeoutError from playwright.async_api import Page, BrowserContext, Browser
from typing import Optional from typing import Optional
from scraping_engine import FingerprintScrapingEngine from scraping_engine import FingerprintScrapingEngine
@ -14,56 +15,41 @@ class StealthyFetcher:
self.max_retries = 5 self.max_retries = 5
self.base_delay = 5 self.base_delay = 5
async def fetch_url(self, url: str, wait_for_selector: Optional[str] = None) -> Optional[Page]: async def fetch_url(self, url: str, wait_for_selector: Optional[str] = None, timeout=300000) -> Optional[Page]:
""" """
Fetch a URL using stealth techniques, handling Cloudflare and other protections intelligently. Fetch URL using the provided context (caller handles page creation)
""" """
for attempt in range(self.max_retries): page = None
try: try:
print(f"Attempt {attempt + 1} to fetch {url}")
page = await self.context.new_page() page = await self.context.new_page()
# Use networkidle for all platforms - works reliably for Ashby, Lever, and Greenhouse
await page.goto(url, wait_until='domcontentloaded', timeout=min(timeout, 60000))
await page.goto(url, wait_until='load', timeout=120000) # Skip human behavior for Lever (already loads fully without it)
if "lever.co" not in url:
if wait_for_selector:
try:
await page.wait_for_selector(wait_for_selector, timeout=40000)
except PlaywrightTimeoutError:
print(f"Selector {wait_for_selector} not found immediately, continuing...")
await self._apply_human_behavior(page) await self._apply_human_behavior(page)
protection_type = await self._detect_protection(page) protection_type = await self._detect_protection(page)
if protection_type: if protection_type:
print(f"🛡️ Protection detected: {protection_type}") content_accessible = await self._is_content_accessible(page)
content_accessible = await self._is_content_accessible(page, wait_for_selector)
if not content_accessible: if not content_accessible:
print("🔒 Content not accessible due to protection.")
handled = False handled = False
if protection_type == "cloudflare": if protection_type == "cloudflare":
handled = await self._handle_cloudflare(page) handled = await self._handle_cloudflare(page)
elif protection_type == "captcha": elif protection_type == "captcha":
handled = await self._handle_captcha(page) handled = await self._handle_captcha(page)
if not handled: if not handled:
print("❌ Failed to handle protection.")
await page.close()
await asyncio.sleep(self.base_delay * (2 ** attempt))
continue
else:
print("✅ Protection present but content is accessible — proceeding.")
print(f"✅ Successfully fetched {url}")
return page
except Exception as e:
print(f"Attempt {attempt + 1} failed for {url}: {str(e)}")
if 'page' in locals():
await page.close()
await asyncio.sleep(self.base_delay * (2 ** attempt))
print(f"❌ Failed to fetch {url} after {self.max_retries} attempts.")
return None return None
return page
except Exception as e:
try:
if page:
await page.close()
except Exception:
pass
raise
async def _apply_human_behavior(self, page: Page): async def _apply_human_behavior(self, page: Page):
await self.engine._human_like_scroll(page) await self.engine._human_like_scroll(page)
await asyncio.sleep(random.uniform(1, 3)) await asyncio.sleep(random.uniform(1, 3))
@ -72,54 +58,40 @@ class StealthyFetcher:
async def _detect_protection(self, page: Page) -> Optional[str]: async def _detect_protection(self, page: Page) -> Optional[str]:
content = (await page.content()).lower() content = (await page.content()).lower()
if ( if ("#cf-chl" in content or "checking your browser" in content or
"#cf-chl" in content "just a moment" in content or "cloudflare" in content or
or "checking your browser" in content "ddos protection" in content or "turnstile" in content):
or "just a moment" in content
or "cloudflare" in content
or "ddos protection" in content
or "turnstile" in content
):
return "cloudflare" return "cloudflare"
elif "captcha" in content or "robot" in content or "verify you're human" in content: elif "captcha" in content or "robot" in content or "verify you're human" in content:
return "captcha" return "captcha"
return None return None
async def _is_content_accessible(self, page: Page, wait_for_selector: Optional[str] = None) -> bool: async def _is_content_accessible(self, page: Page, wait_for_selector: Optional[str] = None) -> bool:
if wait_for_selector:
try:
await page.wait_for_selector(wait_for_selector, timeout=40000)
return True
except PlaywrightTimeoutError:
pass
try: try:
await page.wait_for_selector("body", timeout=60000)
body_text = await page.eval_on_selector("body", "el => el.innerText.toLowerCase()") body_text = await page.eval_on_selector("body", "el => el.innerText.toLowerCase()")
return len(body_text.strip()) > 200 if len(body_text.strip()) < 100:
except:
return False return False
job_keywords = ['job', 'role', 'apply', 'responsibilities', 'requirements', 'qualifications']
async def _handle_captcha(self, page: Page) -> bool: return any(word in body_text for word in job_keywords)
print("🦾 Using 'avoid' strategy for captcha — skipping page.") except:
return False return False
async def _handle_cloudflare(self, page: Page) -> bool: async def _handle_cloudflare(self, page: Page) -> bool:
max_wait_time = 60 max_wait_time = 60
start_time = time.time() start_time = time.time()
while time.time() - start_time < max_wait_time: while time.time() - start_time < max_wait_time:
if not await self._detect_protection(page): if not await self._detect_protection(page):
print("☁️ Cloudflare challenge resolved.")
return True return True
print("☁️ Cloudflare active, waiting...")
await self._apply_human_behavior(page) await self._apply_human_behavior(page)
wait_time = min(10, 2 + random.uniform(1, 3) + (time.time() - start_time) * 0.1) wait_time = min(10, 2 + random.uniform(1, 3) + (time.time() - start_time) * 0.1)
await asyncio.sleep(wait_time) await asyncio.sleep(wait_time)
if (time.time() - start_time) > 15 and (time.time() - start_time) % 20 < 2: if (time.time() - start_time) > 15 and (time.time() - start_time) % 20 < 2:
print("🔄 Reloading page during Cloudflare wait...") try:
await page.reload(wait_until='load', timeout=120000) await page.reload(wait_until='domcontentloaded', timeout=120000)
except Exception:
print("⏰ Timeout waiting for Cloudflare resolution.") pass
return False return False
async def _handle_captcha(self, page: Page) -> bool:
return False # Avoid strategy
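
The retry loop that used to live inside fetch_url is gone: after this refactor the caller owns page creation, retries, and cleanup (scraper.py wraps scrape_job in tenacity's @retry). A minimal sketch of that caller-side pattern follows; fetch_job_page is a hypothetical helper, not part of the diff.

from tenacity import retry, stop_after_attempt, wait_exponential

@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
async def fetch_job_page(fetcher, url: str):
    # fetch_url now makes a single attempt: it returns the Page on success,
    # returns None when a protection page cannot be bypassed, and raises on
    # other errors so tenacity can retry with exponential backoff.
    page = await fetcher.fetch_url(url, wait_for_selector="h1", timeout=60000)
    if page is None:
        raise RuntimeError(f"Protection could not be bypassed for {url}")
    return page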

llm_agent.py

@ -60,6 +60,7 @@ class LLMJobRefiner:
) )
cursor = conn.cursor() cursor = conn.cursor()
# Create table if it doesn't exist
cursor.execute(''' cursor.execute('''
CREATE TABLE IF NOT EXISTS jobs ( CREATE TABLE IF NOT EXISTS jobs (
id SERIAL PRIMARY KEY, id SERIAL PRIMARY KEY,
@ -80,6 +81,12 @@ class LLMJobRefiner:
) )
''') ''')
# Add apply_type column if it doesn't exist
cursor.execute('''
ALTER TABLE jobs
ADD COLUMN IF NOT EXISTS apply_type TEXT DEFAULT 'signup'
''')
# Ensure the uniqueness constraint exists # Ensure the uniqueness constraint exists
cursor.execute(''' cursor.execute('''
ALTER TABLE jobs DROP CONSTRAINT IF EXISTS jobs_job_id_key; ALTER TABLE jobs DROP CONSTRAINT IF EXISTS jobs_job_id_key;
@ -89,6 +96,7 @@ class LLMJobRefiner:
cursor.execute('CREATE INDEX IF NOT EXISTS idx_job_id ON jobs(job_id)') cursor.execute('CREATE INDEX IF NOT EXISTS idx_job_id ON jobs(job_id)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_category ON jobs(category)') cursor.execute('CREATE INDEX IF NOT EXISTS idx_category ON jobs(category)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_posted_date ON jobs(posted_date)') cursor.execute('CREATE INDEX IF NOT EXISTS idx_posted_date ON jobs(posted_date)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_apply_type ON jobs(apply_type)')
conn.commit() conn.commit()
cursor.close() cursor.close()
@ -99,28 +107,53 @@ class LLMJobRefiner:
raise raise
def _clean_html_for_llm(self, html_content: str) -> str: def _clean_html_for_llm(self, html_content: str) -> str:
"""Clean HTML to make it more readable for LLM while preserving structure""" """Clean HTML to make it more readable for LLM while preserving key job structure"""
try: try:
soup = BeautifulSoup(html_content, 'html.parser') soup = BeautifulSoup(html_content, 'html.parser')
# Remove script and style elements # Remove unwanted elements
for script in soup(["script", "style", "nav", "footer", "header"]): for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside', 'noscript']):
script.decompose() element.decompose()
# Extract text but keep some structure # Keep only main content containers (platform-specific)
text = soup.get_text(separator=' ', strip=True) main_content = None
candidates = [
soup.find('main'),
soup.find('div', class_=re.compile(r'job|posting|content')),
soup.find('article'),
soup.body
]
# Clean up whitespace for candidate in candidates:
text = re.sub(r'\s+', ' ', text) if candidate:
main_content = candidate
break
if not main_content:
main_content = soup.body or soup
# Extract text with some structure
lines = []
for elem in main_content.descendants:
if isinstance(elem, str):
text = elem.strip()
if text and len(text) > 5: # Skip short fragments
lines.append(text)
elif elem.name in ['h1', 'h2', 'h3', 'h4', 'p', 'li', 'strong', 'b']:
text = elem.get_text().strip()
if text:
lines.append(text)
# Join with newlines for better LLM parsing
cleaned = '\n'.join(lines)
# Limit length for LLM context # Limit length for LLM context
if len(text) > 10000: if len(cleaned) > 10000:
text = text[:10000] + "..." cleaned = cleaned[:10000] + "..."
return text return cleaned
except Exception as e: except Exception as e:
print(f"HTML cleaning error: {e}") print(f"HTML cleaning error: {e}")
# Fallback to raw content if cleaning fails
return html_content[:100000] if len(html_content) > 100000 else html_content return html_content[:100000] if len(html_content) > 100000 else html_content
def _generate_content_sync(self, prompt: str) -> str: def _generate_content_sync(self, prompt: str) -> str:
@ -145,22 +178,58 @@ class LLMJobRefiner:
url = raw_data.get('url', 'N/A') url = raw_data.get('url', 'N/A')
posted_date = raw_data.get('posted_date', datetime.now().strftime("%m/%d/%y")) posted_date = raw_data.get('posted_date', datetime.now().strftime("%m/%d/%y"))
# Detect platform from URL
platform = "unknown"
if "ashbyhq.com" in url:
platform = "ashby"
elif "lever.co" in url:
platform = "lever"
elif "greenhouse.io" in url:
platform = "greenhouse"
# Platform-specific instructions
platform_instructions = ""
if platform == "ashby":
platform_instructions = """
For Ashby jobs:
- Title is usually in <h1> or <h2>
- Company name is often in <meta> or header
- Description is in <div class="job-posting"> or <article>
- Look for sections like "About Us", "What you'll do", "Requirements", "Benefits"
- Location may be in <span> near job title or in metadata
"""
elif platform == "lever":
platform_instructions = """
For Lever jobs:
- Title is in <h1> or <h2>
- Company name is in <title> or header
- Description is in <div class="job-description"> or <section>
- Look for headings like "What you'll do", "What you'll need", "Why join us"
- Location is often in <div class="location">
"""
elif platform == "greenhouse":
platform_instructions = """
For Greenhouse jobs:
- Title is in <h1> or <h2>
- Company name is in <meta> or header
- Description is in <div class="job-desc"> or <section>
- Look for headings like "Role overview", "What you'll do", "What you bring"
- Location is often in <div class="location">
"""
prompt = f""" prompt = f"""
You are an expert job posting parser. Extract information EXACTLY as it appears in the text. DO NOT summarize, paraphrase, or invent. You are an expert job posting parser. Extract information EXACTLY as it appears in the text. DO NOT summarize, paraphrase, or invent.
CRITICAL INSTRUCTIONS: CRITICAL INSTRUCTIONS:
- The job is from AMAZON. Look for these exact section headings: {platform_instructions}
- "## Basic Qualifications" extract as "qualifications"
- "## Preferred Qualifications" include this in "qualifications" too
- "## Description" or "About the Role" or "Key job responsibilities" extract as "description"
- "You Will:" or "Job responsibilities" include in "description"
- Requirements are often embedded in qualifications or description
FIELD RULES: FIELD RULES:
- description: MUST include ALL role details, responsibilities, and overview. Never "Not provided" if any job description exists. - description: MUST include ALL role details, responsibilities, and overview. Never "Not provided" if any job description exists.
- qualifications: MUST include ALL content from "Basic Qualifications" and "Preferred Qualifications" sections. Combine them. - qualifications: MUST include ALL required skills, experience, education, and preferred qualifications. Combine them.
- requirements: If no separate "requirements" section, extract required skills/experience from qualifications/description. - requirements: If no separate "requirements" section, extract required skills/experience from qualifications/description.
- For Amazon jobs, company_name = "Amazon". - location: Extract city, state, or remote status if available.
- salary_range: Extract if explicitly mentioned (e.g., "$70,000–$85,000").
- nature_of_work: Extract if mentioned (e.g., "Part-time", "Remote", "On-site").
REQUIRED FIELDS (must have valid values, never "N/A"): REQUIRED FIELDS (must have valid values, never "N/A"):
- title, company_name, job_id, url - title, company_name, job_id, url
@ -201,22 +270,6 @@ FIELD RULES:
if not refined_data.get(field) or refined_data[field].strip() in ["N/A", "", "Unknown", "Company", "Job"]: if not refined_data.get(field) or refined_data[field].strip() in ["N/A", "", "Unknown", "Company", "Job"]:
return None return None
# CRITICAL: Validate content fields - check if they SHOULD exist
content_fields = ['description', 'qualifications']
cleaned_original = cleaned_content.lower()
# Simple heuristic: if page contains job-related keywords, content fields should NOT be "Not provided"
job_indicators = ['responsibilit', 'duties', 'require', 'qualifi', 'skill', 'experienc', 'educat', 'degree', 'bachelor', 'master']
has_job_content = any(indicator in cleaned_original for indicator in job_indicators)
if has_job_content:
for field in content_fields:
value = refined_data.get(field, "").strip()
if value in ["Not provided", "N/A", ""]:
# LLM failed to extract existing content
print(f" ⚠ LLM returned '{value}' for {field} but job content appears present")
return None
# Add the posted_date to the refined data # Add the posted_date to the refined data
refined_data['posted_date'] = posted_date refined_data['posted_date'] = posted_date
@ -254,11 +307,15 @@ FIELD RULES:
) )
cursor = conn.cursor() cursor = conn.cursor()
# Add apply_type to job_data if not present (default to 'signup')
if 'apply_type' not in job_data:
job_data['apply_type'] = 'signup'
cursor.execute(''' cursor.execute('''
INSERT INTO jobs INSERT INTO jobs
(title, company_name, location, description, requirements, (title, company_name, location, description, requirements,
qualifications, salary_range, nature_of_work, job_id, url, category, scraped_at, posted_date) qualifications, salary_range, nature_of_work, apply_type, job_id, url, category, scraped_at, posted_date)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (job_id) DO NOTHING ON CONFLICT (job_id) DO NOTHING
''', ( ''', (
job_data.get("title", "N/A"), job_data.get("title", "N/A"),
@ -269,6 +326,7 @@ FIELD RULES:
job_data.get("qualifications", "N/A"), job_data.get("qualifications", "N/A"),
job_data.get("salary_range", "N/A"), job_data.get("salary_range", "N/A"),
job_data.get("nature_of_work", "N/A"), job_data.get("nature_of_work", "N/A"),
job_data.get("apply_type", "signup"), # Default to signup if not provided
job_data.get("job_id", "N/A"), job_data.get("job_id", "N/A"),
job_data.get("url", "N/A"), job_data.get("url", "N/A"),
job_data.get("category", "N/A"), job_data.get("category", "N/A"),
@ -299,6 +357,7 @@ FIELD RULES:
f.write(f"- *Location*: {job_data.get('location', 'N/A')}\n") f.write(f"- *Location*: {job_data.get('location', 'N/A')}\n")
f.write(f"- *Nature of Work*: {job_data.get('nature_of_work', 'N/A')}\n") f.write(f"- *Nature of Work*: {job_data.get('nature_of_work', 'N/A')}\n")
f.write(f"- *Salary Range*: {job_data.get('salary_range', 'N/A')}\n") f.write(f"- *Salary Range*: {job_data.get('salary_range', 'N/A')}\n")
f.write(f"- *Apply Type*: {job_data.get('apply_type', 'signup')}\n") # Add apply type to markdown
f.write(f"- *Job ID*: {job_data.get('job_id', 'N/A')}\n") f.write(f"- *Job ID*: {job_data.get('job_id', 'N/A')}\n")
f.write(f"- *Posted Date*: {job_data.get('posted_date', 'N/A')}\n") f.write(f"- *Posted Date*: {job_data.get('posted_date', 'N/A')}\n")
f.write(f"- *Category*: {job_data.get('category', 'N/A')}\n") f.write(f"- *Category*: {job_data.get('category', 'N/A')}\n")

scraper.py

@ -5,33 +5,127 @@ import os
import json import json
import time import time
from typing import Optional, Dict from typing import Optional, Dict
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError from playwright.async_api import async_playwright
from browserforge.injectors.playwright import AsyncNewContext from browserforge.injectors.playwright import AsyncNewContext
from llm_agent import LLMJobRefiner from llm_agent import LLMJobRefiner
from fetcher import StealthyFetcher from fetcher import StealthyFetcher
from datetime import datetime from datetime import datetime
import redis
import pika import pika
from tenacity import retry, stop_after_attempt, wait_exponential
import logging import logging
from tenacity import retry, stop_after_attempt, wait_exponential
# Import your engine
from scraping_engine import FingerprintScrapingEngine from scraping_engine import FingerprintScrapingEngine
from dotenv import load_dotenv
from ssl_connection import create_ssl_connection_parameters # Import from ssl.py
import redis
load_dotenv()
# Configure logging # Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') logging.basicConfig(level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# Environment variables # Environment variables
RABBITMQ_HOST = os.getenv("RABBITMQ_HOST") RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "localhost")
RABBITMQ_PORT = os.getenv("RABBITMQ_PORT") RABBITMQ_PORT = int(os.getenv("RABBITMQ_PORT", "5671"))
RABBITMQ_USER = os.getenv("RABBITMQ_USER") RABBITMQ_SSL_ENABLED = os.getenv("RABBITMQ_SSL_ENABLED", "false").lower() == "true"
RABBITMQ_PASS = os.getenv("RABBITMQ_PASS")
REDIS_HOST = os.getenv("REDIS_HOST") # Redis configuration
REDIS_PORT = os.getenv("REDIS_PORT") REDIS_HOST = os.getenv('REDIS_HOST', 'redis-scrape.thejobhub.xyz')
REDIS_PORT = int(os.getenv('REDIS_PORT', '6380'))
REDIS_PASSWORD = os.getenv('REDIS_PASSWORD')
REDIS_SSL_ENABLED = os.getenv('REDIS_SSL_ENABLED', 'true').lower() == 'true'
class AshbyJobScraper: class RedisManager:
"""Manages Redis connection and operations for job tracking and caching."""
def __init__(self):
self.redis_client = None
self._connect()
def _connect(self):
"""Establish connection to Redis server."""
if not REDIS_PASSWORD:
logger.warning("Warning: REDIS_PASSWORD not found in environment.")
try:
self.redis_client = redis.Redis(
host=REDIS_HOST,
port=REDIS_PORT,
password=REDIS_PASSWORD,
ssl=REDIS_SSL_ENABLED,
ssl_cert_reqs=None,
socket_connect_timeout=10,
socket_timeout=30,
retry_on_timeout=True
)
response = self.redis_client.ping()
logger.info(f"Connected to Redis at {REDIS_HOST}:{REDIS_PORT}! Response: {response}")
except Exception as e:
logger.error(f"Failed to connect to Redis: {e}")
self.redis_client = None
def is_job_seen(self, job_id: str) -> bool:
if not self.redis_client:
return False
try:
return bool(self.redis_client.exists(f"job_seen:{job_id}"))
except Exception as e:
logger.error(f"Redis error checking job_seen: {e}")
return False
def mark_job_seen(self, job_id: str):
if not self.redis_client:
return
try:
self.redis_client.setex(f"job_seen:{job_id}", 2592000, "1")
except Exception as e:
logger.error(f"Redis error marking job_seen: {e}")
def get_cached_llm_result(self, job_url: str) -> Optional[Dict]:
if not self.redis_client:
return None
try:
cached_data = self.redis_client.get(f"llm_cache:{job_url}")
if cached_data:
return json.loads(cached_data)
return None
except Exception as e:
logger.error(f"Redis error getting LLM cache: {e}")
return None
def cache_llm_result(self, job_url: str, result: Dict):
if not self.redis_client:
return
try:
self.redis_client.setex(f"llm_cache:{job_url}", 604800, json.dumps(result))
except Exception as e:
logger.error(f"Redis error caching LLM result: {e}")
def add_job_to_error_cache(self, job_url: str, job_id: str, error_type: str):
if not self.redis_client:
return
try:
error_data = {
"job_url": job_url,
"job_id": job_id,
"error_type": error_type,
"timestamp": datetime.now().isoformat()
}
self.redis_client.setex(f"error_cache:{job_id}", 3600, json.dumps(error_data))
except Exception as e:
logger.error(f"Redis error adding to error cache: {e}")
class MultiPlatformJobScraper:
def __init__( def __init__(
self, self,
engine: FingerprintScrapingEngine, engine: FingerprintScrapingEngine,
@ -40,35 +134,78 @@ class AshbyJobScraper:
self.engine = engine self.engine = engine
self.user_request = user_request self.user_request = user_request
self.llm_agent = LLMJobRefiner() self.llm_agent = LLMJobRefiner()
self.redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=0, decode_responses=True)
self.browser = None self.browser = None
self.context = None self.pw = None
self.redis_manager = RedisManager()
async def init_browser(self): async def init_browser(self):
"""Initialize browser once using engine's fingerprint""" if self.browser is not None:
try:
await self.browser.new_page()
await self.close_browser()
except:
await self.close_browser()
if self.browser is None: if self.browser is None:
try:
profile = self.engine._select_profile() profile = self.engine._select_profile()
renderer = random.choice(self.engine.common_renderers[self.engine.os]) renderer = random.choice(self.engine.common_renderers[self.engine.os])
vendor = random.choice(self.engine.common_vendors) vendor = random.choice(self.engine.common_vendors)
spoof_script = self.engine._get_spoof_script(renderer, vendor) spoof_script = self.engine._get_spoof_script(renderer, vendor)
pw = await async_playwright().start() self.pw = await async_playwright().start()
self.browser = await pw.chromium.launch( self.browser = await self.pw.chromium.launch(
headless=True, headless=True,
args=['--disable-blink-features=AutomationControlled'] args=[
'--disable-blink-features=AutomationControlled',
'--no-sandbox',
'--disable-dev-shm-usage',
'--disable-gpu'
]
) )
self.context = await AsyncNewContext(self.browser, fingerprint=profile) logger.info("✅ Browser launched (will reuse for all jobs)")
await self.context.add_init_script(f""" except Exception as e:
logger.error(f"💥 Failed to launch browser: {e}")
raise
async def create_fresh_context(self):
if self.browser is None:
await self.init_browser()
try:
await self.browser.new_page()
except Exception:
logger.warning("Browser appears dead. Reinitializing...")
await self.close_browser()
await self.init_browser()
profile = self.engine._select_profile()
context = await AsyncNewContext(self.browser, fingerprint=profile)
await context.add_init_script(f"""
Object.defineProperty(navigator, 'hardwareConcurrency', {{ get: () => {profile.navigator.hardwareConcurrency} }}); Object.defineProperty(navigator, 'hardwareConcurrency', {{ get: () => {profile.navigator.hardwareConcurrency} }});
Object.defineProperty(navigator, 'deviceMemory', {{ get: () => {profile.navigator.deviceMemory} }}); Object.defineProperty(navigator, 'deviceMemory', {{ get: () => {profile.navigator.deviceMemory} }});
Object.defineProperty(navigator, 'platform', {{ get: () => '{profile.navigator.platform}' }}); Object.defineProperty(navigator, 'platform', {{ get: () => '{profile.navigator.platform}' }});
""") """)
await self.context.add_init_script(spoof_script) spoof_script = self.engine._get_spoof_script(
random.choice(self.engine.common_renderers[self.engine.os]),
random.choice(self.engine.common_vendors)
)
await context.add_init_script(spoof_script)
return context
async def close_browser(self): async def close_browser(self):
if self.browser: if self.browser:
try:
await self.browser.close() await self.browser.close()
except:
pass
self.browser = None self.browser = None
if self.pw:
try:
await self.pw.stop()
except:
pass
self.pw = None
async def _safe_inner_text(self, element): async def _safe_inner_text(self, element):
if not element: if not element:
@ -95,47 +232,50 @@ class AshbyJobScraper:
async def _extract_page_content_for_llm(self, page) -> str: async def _extract_page_content_for_llm(self, page) -> str:
speed = self.engine.optimization_params.get("base_delay", 2.0) speed = self.engine.optimization_params.get("base_delay", 2.0)
await asyncio.sleep(2 * (speed / 2)) await asyncio.sleep(2 * (speed / 2))
if "lever.co" not in page.url:
await self.engine._human_like_scroll(page) await self.engine._human_like_scroll(page)
await asyncio.sleep(2 * (speed / 2)) await asyncio.sleep(2 * (speed / 2))
return await page.content() return await page.content()
async def _is_job_seen(self, job_id: str) -> bool: async def _is_job_seen(self, job_id: str) -> bool:
return self.redis_client.get(f"seen_job:{job_id}") is not None return self.redis_manager.is_job_seen(job_id)
async def _mark_job_seen(self, job_id: str): async def _mark_job_seen(self, job_id: str):
self.redis_client.setex(f"seen_job:{job_id}", 7 * 24 * 3600, "1") self.redis_manager.mark_job_seen(job_id)
async def _get_cached_llm_result(self, job_url: str) -> Optional[Dict]: async def _get_cached_llm_result(self, job_url: str) -> Optional[Dict]:
cached = self.redis_client.get(f"llm_cache:{job_url}") return self.redis_manager.get_cached_llm_result(job_url)
if cached:
return json.loads(cached)
return None
async def _cache_llm_result(self, job_url: str, result: Dict): async def _cache_llm_result(self, job_url: str, result: Dict):
self.redis_client.setex(f"llm_cache:{job_url}", 7 * 24 * 3600, json.dumps(result)) self.redis_manager.cache_llm_result(job_url, result)
async def _add_job_to_redis_cache(self, job_url: str, job_id: str, error_type: str): async def _add_job_to_redis_cache(self, job_url: str, job_id: str, error_type: str):
try: logger.info(f" 📦 Adding failed job to Redis cache: {job_id} (Error: {error_type})")
job_data = { self.redis_manager.add_job_to_error_cache(job_url, job_id, error_type)
"job_url": job_url,
"job_id": job_id,
"error_type": error_type,
"timestamp": datetime.now().isoformat()
}
self.redis_client.hset("failed_jobs", job_id, json.dumps(job_data))
logger.info(f"📦 Added failed job to Redis cache: {job_id} (Error: {error_type})")
except Exception as e:
logger.error(f"❌ Failed to add to Redis: {str(e)}")
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10)) def _get_platform(self, url: str) -> str:
if "ashbyhq.com" in url:
return "ashby"
elif "lever.co" in url:
return "lever"
elif "greenhouse.io" in url:
return "greenhouse"
else:
return "unknown"
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
async def scrape_job( async def scrape_job(
self, self,
job_url: str, job_url: str,
company_name: str, company_name: str,
message_id: str message_id: str
): ):
job_id = job_url.strip("/").split("/")[-1] platform = self._get_platform(job_url)
if platform == "unknown":
logger.info(f"⏭️ Skipping unsupported platform: {job_url}")
return True
job_id = job_url.strip("/").split("/")[-1]
if await self._is_job_seen(job_id): if await self._is_job_seen(job_id):
logger.info(f"⏭️ Skipping already processed job: {job_id}") logger.info(f"⏭️ Skipping already processed job: {job_id}")
return True return True
@ -147,42 +287,65 @@ class AshbyJobScraper:
await self._mark_job_seen(job_id) await self._mark_job_seen(job_id)
return True return True
context = None
page = None page = None
start_time = time.time() start_time = time.time()
try: try:
await self.init_browser() context = await self.create_fresh_context()
page = await self.context.new_page() page = await context.new_page()
# Fetch with timeout from engine config
timeout_ms = self.engine.optimization_params.get("request_timeout", 120000) timeout_ms = self.engine.optimization_params.get("request_timeout", 120000)
temp_fetcher = StealthyFetcher(self.engine, self.browser, self.context) temp_fetcher = StealthyFetcher(self.engine, self.browser, context)
fetch_timeout = 60000 if platform == "lever" else timeout_ms
job_page = await asyncio.wait_for( job_page = await asyncio.wait_for(
temp_fetcher.fetch_url(job_url, wait_for_selector="h1"), temp_fetcher.fetch_url(job_url, wait_for_selector="h1", timeout=fetch_timeout),
timeout=timeout_ms / 1000.0 timeout=fetch_timeout / 1000.0
) )
if not job_page: # Check if job still exists (minimal content validation)
await self._add_job_to_redis_cache(job_url, job_id, "fetch_failure") page_content = await job_page.content()
self.engine.report_outcome("fetch_failure", url=job_url) if len(page_content.strip()) < 500: # Arbitrary threshold for "page exists"
logger.error(f"❌ Job no longer exists (empty/deleted): {job_url}")
await self._add_job_to_redis_cache(job_url, job_id, "job_not_found")
self.engine.report_outcome("job_not_found", url=job_url)
return False return False
# Handle Cloudflare if detected if platform == "ashby":
if await self.engine._detect_cloudflare(job_page): try:
success = await self.engine._handle_cloudflare(job_page) await job_page.wait_for_selector("div[class*='job-posting'], article, main", timeout=60000)
if not success: except Exception:
await self._add_job_to_redis_cache(job_url, job_id, "cloudflare") logger.warning(f"⚠️ Ashby page didn't load properly: {job_url}")
self.engine.report_outcome("cloudflare", url=job_url)
return False return False
elif platform == "lever":
pass
elif platform == "greenhouse":
try:
await job_page.wait_for_selector("div.job-desc, section", timeout=60000)
except Exception:
pass
# 🔑 APPLY TYPE LOGIC
if platform in ["ashby", "lever", "greenhouse"]:
apply_type = 'AI' # Always AI for these platforms
else:
# For other platforms: check if form is accessible without login
apply_btn = await job_page.query_selector("button:has-text('Apply for this job'), button:has-text('Apply now')") apply_btn = await job_page.query_selector("button:has-text('Apply for this job'), button:has-text('Apply now')")
apply_type = 'signup' apply_type = 'signup' # default
if apply_btn: if apply_btn:
await self._human_click(job_page, apply_btn) await self._human_click(job_page, apply_btn)
speed = self.engine.optimization_params.get("base_delay", 2.0) speed = self.engine.optimization_params.get("base_delay", 2.0)
await asyncio.sleep(2 * (speed / 2)) await asyncio.sleep(2 * (speed / 2))
form = await job_page.query_selector("form, div[class*='application-form']") form = await job_page.query_selector("form, div[class*='application-form']")
if form: if form:
# Check for login prompts in form
login_indicators = await job_page.query_selector("input[type='email'], input[type='password'], text='sign in', text='log in'")
if not login_indicators:
apply_type = 'AI' apply_type = 'AI'
else:
apply_type = 'signup'
else:
apply_type = 'signup'
final_url = job_url final_url = job_url
page_content = await self._extract_page_content_for_llm(job_page) page_content = await self._extract_page_content_for_llm(job_page)
@ -196,8 +359,7 @@ class AshbyJobScraper:
"posted_date": posted_date "posted_date": posted_date
} }
# LLM call with timeout llm_timeout = max(60, self.engine.feedback.get("avg_response_time", 10) * 2)
llm_timeout = max(30, self.engine.feedback.get("avg_response_time", 10) * 2)
refined_data = await asyncio.wait_for( refined_data = await asyncio.wait_for(
self.llm_agent.refine_job_data(raw_data, self.user_request), self.llm_agent.refine_job_data(raw_data, self.user_request),
timeout=llm_timeout timeout=llm_timeout
@ -215,11 +377,14 @@ class AshbyJobScraper:
elif field == 'company_name': elif field == 'company_name':
refined_data[field] = company_name refined_data[field] = company_name
refined_data['apply_type'] = apply_type refined_data.update({
refined_data['scraped_at'] = datetime.now().isoformat() 'apply_type': apply_type,
refined_data['category'] = company_name 'scraped_at': datetime.now().isoformat(),
refined_data['posted_date'] = posted_date 'category': company_name,
refined_data['message_id'] = message_id 'posted_date': posted_date,
'message_id': message_id,
'platform': platform
})
await self.llm_agent.save_job_data(refined_data, company_name) await self.llm_agent.save_job_data(refined_data, company_name)
await self._cache_llm_result(job_url, refined_data) await self._cache_llm_result(job_url, refined_data)
@ -227,7 +392,7 @@ class AshbyJobScraper:
response_time = time.time() - start_time response_time = time.time() - start_time
self.engine.report_outcome("success", url=final_url, response_time=response_time) self.engine.report_outcome("success", url=final_url, response_time=response_time)
logger.info(f"✅ Scraped: {refined_data['title'][:50]}...") logger.info(f"✅ Scraped ({platform}): {refined_data['title'][:50]}... (Apply Type: {apply_type})")
success = True success = True
else: else:
logger.warning(f"🟡 LLM failed to refine: {final_url}") logger.warning(f"🟡 LLM failed to refine: {final_url}")
@ -237,18 +402,32 @@ class AshbyJobScraper:
return success return success
except asyncio.TimeoutError: except asyncio.TimeoutError:
logger.error(f"⏰ Timeout processing job: {job_url}") logger.error(f"⏰ Timeout processing job ({platform}): {job_url}")
await self._add_job_to_redis_cache(job_url, job_id, "timeout") await self._add_job_to_redis_cache(job_url, job_id, "timeout")
self.engine.report_outcome("timeout", url=job_url) self.engine.report_outcome("timeout", url=job_url)
return False return False
except Exception as e: except Exception as e:
logger.error(f"💥 Error processing job {job_url}: {str(e)}") error_msg = str(e)
if "NoneType" in error_msg or "disconnected" in error_msg or "Browser" in error_msg:
logger.warning("Browser connection lost. Forcing reinitialization.")
await self.close_browser()
# 🔍 Distinguish job-not-found vs other errors
if "page.goto: net::ERR_ABORTED" in error_msg or "page.goto: net::ERR_FAILED" in error_msg:
logger.error(f"❌ Job no longer exists (404/network error): {job_url}")
await self._add_job_to_redis_cache(job_url, job_id, "job_not_found")
self.engine.report_outcome("job_not_found", url=job_url)
else:
logger.error(f"💥 Error processing job ({platform}) {job_url}: {error_msg}")
await self._add_job_to_redis_cache(job_url, job_id, "exception") await self._add_job_to_redis_cache(job_url, job_id, "exception")
self.engine.report_outcome("exception", url=job_url) self.engine.report_outcome("exception", url=job_url)
return False return False
finally: finally:
if page: if context:
await page.close() try:
await context.close()
except Exception:
pass
# Global metrics # Global metrics
METRICS = { METRICS = {
@ -259,7 +438,8 @@ METRICS = {
"start_time": time.time() "start_time": time.time()
} }
async def process_message_async(scraper: AshbyJobScraper, ch, method, properties, body):
async def process_message_async(scraper: MultiPlatformJobScraper, ch, method, properties, body):
try: try:
job_data = json.loads(body) job_data = json.loads(body)
job_link = job_data['job_link'] job_link = job_data['job_link']
@ -267,7 +447,6 @@ async def process_message_async(scraper: AshbyJobScraper, ch, method, properties
message_id = properties.message_id or f"msg_{int(time.time()*1000)}" message_id = properties.message_id or f"msg_{int(time.time()*1000)}"
logger.info(f"📥 Processing job: {job_link} (ID: {message_id})") logger.info(f"📥 Processing job: {job_link} (ID: {message_id})")
success = await scraper.scrape_job(job_link, company_name, message_id) success = await scraper.scrape_job(job_link, company_name, message_id)
METRICS["processed"] += 1 METRICS["processed"] += 1
@ -275,7 +454,6 @@ async def process_message_async(scraper: AshbyJobScraper, ch, method, properties
METRICS["success"] += 1 METRICS["success"] += 1
else: else:
METRICS["failed"] += 1 METRICS["failed"] += 1
except json.JSONDecodeError: except json.JSONDecodeError:
logger.error("❌ Invalid JSON in message") logger.error("❌ Invalid JSON in message")
METRICS["failed"] += 1 METRICS["failed"] += 1
@ -285,33 +463,31 @@ async def process_message_async(scraper: AshbyJobScraper, ch, method, properties
finally: finally:
ch.basic_ack(delivery_tag=method.delivery_tag) ch.basic_ack(delivery_tag=method.delivery_tag)
def callback_wrapper(scraper: AshbyJobScraper):
def callback_wrapper(scraper: MultiPlatformJobScraper):
def callback(ch, method, properties, body): def callback(ch, method, properties, body):
asyncio.run(process_message_async(scraper, ch, method, properties, body)) asyncio.run(process_message_async(scraper, ch, method, properties, body))
return callback return callback
def start_consumer(): def start_consumer():
# Initialize REAL engine
engine = FingerprintScrapingEngine( engine = FingerprintScrapingEngine(
seed="ashby_scraper", seed="multiplatform_scraper",
target_os="windows", target_os="windows",
num_variations=10 num_variations=10
) )
scraper = AshbyJobScraper(engine) scraper = MultiPlatformJobScraper(engine)
# RabbitMQ connection with retries
connection = None connection = None
for attempt in range(5): for attempt in range(5):
try: try:
credentials = pika.PlainCredentials(RABBITMQ_USER, RABBITMQ_PASS) parameters = create_ssl_connection_parameters()
parameters = pika.ConnectionParameters(
host=RABBITMQ_HOST, if RABBITMQ_SSL_ENABLED:
port=RABBITMQ_PORT, logger.info(f"Connecting to RabbitMQ over SSL at {RABBITMQ_HOST}:{RABBITMQ_PORT}")
virtual_host='/', else:
credentials=credentials, logger.info(f"Connecting to RabbitMQ at {RABBITMQ_HOST}:{RABBITMQ_PORT}")
heartbeat=600,
blocked_connection_timeout=300
)
connection = pika.BlockingConnection(parameters) connection = pika.BlockingConnection(parameters)
break break
except Exception as e: except Exception as e:
@ -327,7 +503,7 @@ def start_consumer():
channel.basic_qos(prefetch_count=1) channel.basic_qos(prefetch_count=1)
channel.basic_consume(queue='job_queue', on_message_callback=callback_wrapper(scraper)) channel.basic_consume(queue='job_queue', on_message_callback=callback_wrapper(scraper))
logger.info('Waiting for messages. To exit press CTRL+C') logger.info('Waiting for messages (Ashby, Lever, Greenhouse). To exit press CTRL+C')
try: try:
channel.start_consuming() channel.start_consuming()
except KeyboardInterrupt: except KeyboardInterrupt:
@ -336,5 +512,6 @@ def start_consumer():
connection.close() connection.close()
asyncio.run(scraper.close_browser()) asyncio.run(scraper.close_browser())
if __name__ == "__main__": if __name__ == "__main__":
start_consumer() start_consumer()

scraping_engine.py

@ -371,7 +371,7 @@ class FingerprintScrapingEngine:
# Reload occasionally to trigger potential client-side checks # Reload occasionally to trigger potential client-side checks
if (time.time() - start_time) > 15 and (time.time() - start_time) % 20 < 2: if (time.time() - start_time) > 15 and (time.time() - start_time) % 20 < 2:
print("Reloading page during Cloudflare wait...") print("Reloading page during Cloudflare wait...")
await page.reload(wait_until='load', timeout=80000) await page.reload(wait_until='domcontentloaded', timeout=120000)
print("Timeout waiting for Cloudflare resolution.") print("Timeout waiting for Cloudflare resolution.")
return False return False

sender.py (259 lines changed)

@ -10,21 +10,74 @@ import uuid
from configparser import ConfigParser from configparser import ConfigParser
import pika import pika
import redis import redis
import ssl
from dotenv import load_dotenv
from datetime import datetime
load_dotenv()
class RedisManager:
"""Manages Redis connection and operations for job deduplication."""
def __init__(self):
self.redis_host = os.getenv('REDIS_HOST', 'redis-scrape.thejobhub.xyz')
self.redis_port = int(os.getenv('REDIS_PORT', '6380'))
self.redis_password = os.getenv('REDIS_PASSWORD')
self.redis_ssl_enabled = os.getenv('REDIS_SSL_ENABLED', 'true').lower() == 'true'
self.redis_client = None
self._connect()
def _connect(self):
if not self.redis_password:
print("Warning: REDIS_PASSWORD not found in environment.")
try:
self.redis_client = redis.Redis(
host=self.redis_host,
port=self.redis_port,
password=self.redis_password,
ssl=self.redis_ssl_enabled,
ssl_cert_reqs=None,
socket_connect_timeout=10,
socket_timeout=30,
decode_responses=True
)
response = self.redis_client.ping()
print(f"Connected to Redis at {self.redis_host}:{self.redis_port}! Response: {response}")
except Exception as e:
print(f"Failed to connect to Redis: {e}")
self.redis_client = None
def is_job_seen(self, job_url):
if not self.redis_client:
return False
try:
return bool(self.redis_client.exists(f"sent_job:{job_url}"))
except Exception:
return False
def mark_job_sent(self, job_url):
if not self.redis_client:
return
try:
self.redis_client.setex(f"sent_job:{job_url}", 7 * 24 * 3600, "1")
except Exception:
pass
class Sender: class Sender:
def __init__(self, config_file='config.ini'): def __init__(self, config_file='config.ini'):
self.config = ConfigParser() self.config = ConfigParser()
self.config.read(config_file) self.config.read(config_file)
# RabbitMQ from env vars with fallbacks
self.rabbitmq_host = os.getenv("RABBITMQ_HOST") self.rabbitmq_host = os.getenv("RABBITMQ_HOST")
self.rabbitmq_port = os.getenv("RABBITMQ_PORT") self.rabbitmq_port = int(os.getenv("RABBITMQ_PORT") or 5672)
self.username = os.getenv("RABBITMQ_USER") self.username = os.getenv("RABBITMQ_USER")
self.password = os.getenv("RABBITMQ_PASS") self.password = os.getenv("RABBITMQ_PASS")
self.queue_name = self.config.get('rabbitmq', 'queue_name', fallback='job_queue') self.queue_name = self.config.get('rabbitmq', 'queue_name', fallback='job_queue')
self.directory = self.config.get('files', 'directory', fallback=os.path.join(os.path.expanduser("~"), "jobs", "csv")) self.directory = self.config.get('files', 'directory', fallback=os.path.join(os.path.expanduser("~"), "jobs", "csv"))
# Cross-platform log path: use user's home directory
default_log_dir = os.path.join(os.path.expanduser("~"), ".web_scraping_project", "logs") default_log_dir = os.path.join(os.path.expanduser("~"), ".web_scraping_project", "logs")
os.makedirs(default_log_dir, exist_ok=True) os.makedirs(default_log_dir, exist_ok=True)
default_log_file = os.path.join(default_log_dir, "sender.log") default_log_file = os.path.join(default_log_dir, "sender.log")
@ -32,20 +85,34 @@ class Sender:
self.virtual_host = self.config.get('rabbitmq', 'virtual_hash', fallback='/') self.virtual_host = self.config.get('rabbitmq', 'virtual_hash', fallback='/')
self.batch_size = 500 self.batch_size = 500
self.retry_attempts = 5 # Increased for robustness self.retry_attempts = 5
self.retry_sleep = 2 self.retry_sleep = 2
self.check_interval = 30 # More frequent polling self.check_interval = 30
# Redis for deduplication self.use_ssl = os.getenv("RABBITMQ_SSL_ENABLED", "false").lower() == "true"
redis_host = os.getenv("REDIS_HOST") if self.rabbitmq_port is None:
redis_port = os.getenv("REDIS_PORT") self.rabbitmq_port = "5671" if self.use_ssl else "5672"
self.redis_client = redis.Redis(host=redis_host, port=redis_port, db=1, decode_responses=True) else:
self.rabbitmq_port = int(self.rabbitmq_port)
self.redis_manager = RedisManager()
# Ensure log directory exists before configuring logging
log_dir = os.path.dirname(self.log_file) log_dir = os.path.dirname(self.log_file)
os.makedirs(log_dir, exist_ok=True) os.makedirs(log_dir, exist_ok=True)
logging.basicConfig(filename=self.log_file, level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
self.logger = logging.getLogger(__name__) self.logger = logging.getLogger('sender')
self.logger.setLevel(logging.INFO)
self.logger.handlers.clear()
file_handler = logging.FileHandler(self.log_file, encoding='utf-8')
file_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
file_handler.setFormatter(file_formatter)
self.logger.addHandler(file_handler)
console_handler = logging.StreamHandler(sys.stdout)
console_formatter = logging.Formatter('%(levelname)s: %(message)s')
console_handler.setFormatter(console_formatter)
self.logger.addHandler(console_handler)
self.connection = None self.connection = None
self.channel = None self.channel = None
@ -54,21 +121,54 @@ class Sender:
signal.signal(signal.SIGTERM, self.graceful_shutdown) signal.signal(signal.SIGTERM, self.graceful_shutdown)
signal.signal(signal.SIGINT, self.graceful_shutdown) signal.signal(signal.SIGINT, self.graceful_shutdown)
def _create_ssl_options(self):
if not self.use_ssl:
return None
context = ssl.create_default_context()
verify_ssl = os.getenv("RABBITMQ_SSL_VERIFY", "false").lower() == "true"
if verify_ssl:
context.check_hostname = True
context.verify_mode = ssl.CERT_REQUIRED
else:
context.check_hostname = False
context.verify_mode = ssl.CERT_NONE
return pika.SSLOptions(context, self.rabbitmq_host)
def connect(self): def connect(self):
try: try:
if not self.rabbitmq_host:
self.logger.error("RABBITMQ_HOST environment variable is not set")
return False
if not self.username:
self.logger.error("RABBITMQ_USER environment variable is not set")
return False
if not self.password:
self.logger.error("RABBITMQ_PASS environment variable is not set")
return False
self.logger.info(f"Attempting to connect with host={self.rabbitmq_host}, port={self.rabbitmq_port}, user={self.username}")
credentials = pika.PlainCredentials(self.username, self.password) credentials = pika.PlainCredentials(self.username, self.password)
parameters = pika.ConnectionParameters( params = {
host=self.rabbitmq_host, 'host': self.rabbitmq_host,
port=self.rabbitmq_port, 'port': self.rabbitmq_port,
virtual_host=self.virtual_host, 'virtual_host': self.virtual_host,
credentials=credentials, 'credentials': credentials,
heartbeat=600, 'heartbeat': 600,
blocked_connection_timeout=300 'blocked_connection_timeout': 300
) }
if self.use_ssl:
params['ssl_options'] = self._create_ssl_options()
self.logger.info(f"Connecting to RabbitMQ over SSL at {self.rabbitmq_host}:{self.rabbitmq_port}")
else:
self.logger.info(f"Connecting to RabbitMQ at {self.rabbitmq_host}:{self.rabbitmq_port}")
parameters = pika.ConnectionParameters(**params)
self.connection = pika.BlockingConnection(parameters) self.connection = pika.BlockingConnection(parameters)
self.channel = self.connection.channel() self.channel = self.connection.channel()
self.channel.queue_declare(queue=self.queue_name, durable=True) self.channel.queue_declare(queue=self.queue_name, durable=True)
self.logger.info("Connected to RabbitMQ") self.logger.info("Connected to RabbitMQ successfully")
return True return True
except Exception as e: except Exception as e:
self.logger.error(f"Failed to connect to RabbitMQ: {str(e)}") self.logger.error(f"Failed to connect to RabbitMQ: {str(e)}")
@ -96,95 +196,154 @@ class Sender:
except Exception as e: except Exception as e:
self.logger.error(f"Failed to send message (attempt {attempt+1}): {str(e)}") self.logger.error(f"Failed to send message (attempt {attempt+1}): {str(e)}")
if attempt < self.retry_attempts - 1: if attempt < self.retry_attempts - 1:
time.sleep(self.retry_sleep * (2 ** attempt)) # Exponential backoff time.sleep(self.retry_sleep * (2 ** attempt))
if not self.reconnect(): if not self.reconnect():
return False return False
return False return False
def is_job_seen(self, job_url): def is_job_seen(self, job_url):
"""Check if job was sent recently (7 days)""" return self.redis_manager.is_job_seen(job_url)
return self.redis_client.get(f"sent_job:{job_url}") is not None
def mark_job_sent(self, job_url): def mark_job_sent(self, job_url):
"""Mark job as sent with 7-day TTL""" self.redis_manager.mark_job_sent(job_url)
self.redis_client.setex(f"sent_job:{job_url}", 7 * 24 * 3600, "1")
def process_csv(self, file_path): def process_csv(self, file_path):
try: try:
with open(file_path, 'r') as csvfile: with open(file_path, 'r', encoding='utf-8') as csvfile:
reader = csv.DictReader(csvfile) reader = csv.DictReader(csvfile)
batch = []
sent_count = 0 sent_count = 0
skipped_count = 0 skipped_count = 0
for row in reader: self.logger.info(f"CSV headers found: {reader.fieldnames}")
# Validate required fields
if 'job_link' not in row or 'company_name' not in row:
self.logger.warning(f"Skipping invalid row in {file_path}: {row}")
continue
job_link = row['job_link'].strip() for row_num, row in enumerate(reader, start=1):
company_name = row['company_name'].strip() # ✅ IMMEDIATE EXIT CHECK
if not self.running:
self.logger.info("Shutdown requested during CSV processing. Exiting...")
return sent_count
if not job_link or not company_name: if 'url' not in row or 'company' not in row:
self.logger.warning(f"Skipping empty row in {file_path}") self.logger.warning(f"Skipping row {row_num}: missing 'url' or 'company' field. Row: {row}")
continue
# Deduplication
if self.is_job_seen(job_link):
skipped_count += 1 skipped_count += 1
continue continue
job_data = { url = row['url'].strip()
'job_link': job_link, company = row['company'].strip()
'company_name': company_name
}
if not url:
self.logger.warning(f"Skipping row {row_num}: empty URL. Company: '{company}'")
skipped_count += 1
continue
if not company:
self.logger.warning(f"Skipping row {row_num}: empty company. URL: {url}")
skipped_count += 1
continue
if not url.startswith(('http://', 'https://')):
self.logger.warning(f"Skipping row {row_num}: invalid URL format. URL: {url}")
skipped_count += 1
continue
if self.is_job_seen(url):
self.logger.info(f"Skipping row {row_num}: job already sent (deduplicated). URL: {url}")
skipped_count += 1
continue
job_data = {'job_link': url, 'company_name': company}
message_id = str(uuid.uuid4()) message_id = str(uuid.uuid4())
message = json.dumps(job_data) message = json.dumps(job_data)
if self.send_message(message, message_id): if self.send_message(message, message_id):
sent_count += 1 sent_count += 1
self.mark_job_sent(job_link) self.mark_job_sent(url)
else: else:
self.logger.error(f"Failed to send job: {job_link}") self.logger.error(f"Failed to send job (row {row_num}): {url}")
skipped_count += 1
if (sent_count + skipped_count) % 100 == 0: if (sent_count + skipped_count) % 100 == 0:
self.logger.info(f"Progress: {sent_count} sent, {skipped_count} skipped from {file_path}") self.logger.info(f"Progress: {sent_count} sent, {skipped_count} skipped from {file_path}")
self.logger.info(f"Completed {file_path}: {sent_count} sent, {skipped_count} skipped") self.logger.info(f"Completed {file_path}: {sent_count} sent, {skipped_count} skipped")
try:
os.rename(file_path, file_path + '.processed') os.rename(file_path, file_path + '.processed')
self.logger.info(f"Processed and renamed {file_path} to {file_path}.processed") self.logger.info(f"Processed and renamed {file_path} to {file_path}.processed")
except Exception as rename_error:
self.logger.error(f"Failed to rename {file_path}: {str(rename_error)}")
marker_file = file_path + '.processed_marker'
with open(marker_file, 'w') as f:
f.write(f"Processed at {datetime.now().isoformat()}")
self.logger.info(f"Created marker file: {marker_file}")
return sent_count return sent_count
except Exception as e: except Exception as e:
self.logger.error(f"Error processing {file_path}: {str(e)}") self.logger.error(f"Error processing {file_path}: {str(e)}")
return 0 return 0
def find_new_csvs(self): def find_new_csvs(self):
if not self.running: # ✅ IMMEDIATE EXIT CHECK
return []
if not os.path.exists(self.directory):
return []
files = [f for f in os.listdir(self.directory) if f.endswith('.csv') and not f.endswith('.processed')] files = [f for f in os.listdir(self.directory) if f.endswith('.csv') and not f.endswith('.processed')]
files.sort() files.sort()
return [os.path.join(self.directory, f) for f in files] return [os.path.join(self.directory, f) for f in files]
def run(self): def run(self):
if not self.connect(): if not self.connect():
self.logger.error("RabbitMQ connection failed, exiting")
sys.exit(1) sys.exit(1)
try:
while self.running: while self.running:
new_files = self.find_new_csvs() new_files = self.find_new_csvs()
if new_files: if new_files:
for file_path in new_files: for file_path in new_files:
if not self.running: # ✅ IMMEDIATE EXIT CHECK
break
self.logger.info(f"Processing {file_path}") self.logger.info(f"Processing {file_path}")
sent = self.process_csv(file_path) sent = self.process_csv(file_path)
self.logger.info(f"Sent {sent} jobs from {file_path}") self.logger.info(f"Sent {sent} jobs from {file_path}")
else: else:
self.logger.info("No new CSV files found") self.logger.info("No new CSV files found")
time.sleep(self.check_interval)
# Replace blocking sleep with interruptible sleep
for _ in range(self.check_interval):
if not self.running:
break
time.sleep(1)
except KeyboardInterrupt:
# This should not normally be reached due to signal handler, but added for safety
pass
finally:
if self.connection and self.connection.is_open: if self.connection and self.connection.is_open:
self.logger.info("Closing RabbitMQ connection...")
self.connection.close() self.connection.close()
def graceful_shutdown(self, signum, frame): def graceful_shutdown(self, signum, frame):
self.logger.info("Received shutdown signal") self.logger.info("Received shutdown signal. Initiating graceful shutdown...")
self.running = False self.running = False # This will break all loops
if __name__ == '__main__': if __name__ == '__main__':
required_vars = ['RABBITMQ_HOST', 'RABBITMQ_PORT', 'RABBITMQ_USER', 'RABBITMQ_PASS']
missing_vars = [var for var in required_vars if not os.getenv(var)]
if missing_vars:
print(f"Missing environment variables: {missing_vars}")
print("Check your .env file and ensure load_dotenv() is working")
sys.exit(1)
sender = Sender() sender = Sender()
print(f"Using directory: {sender.directory}")
print(f"Directory exists: {os.path.exists(sender.directory)}")
if os.path.exists(sender.directory):
print(f"Files: {os.listdir(sender.directory)}")
try:
sender.run() sender.run()
except KeyboardInterrupt:
# Fallback in case signal handler doesn't catch it
sender.logger.info("KeyboardInterrupt caught in main. Exiting.")
sys.exit(0)
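
The refactored process_csv keys on 'url' and 'company' columns (the old 'job_link'/'company_name' names are now only used in the queue payload). An illustrative snippet that writes a CSV in the expected layout; the URLs and file name are placeholders.

import csv

sample_rows = [
    {"url": "https://jobs.ashbyhq.com/example-co/1234", "company": "Example Co"},  # placeholder URL
    {"url": "https://jobs.lever.co/example-co/abcd", "company": "Example Co"},     # placeholder URL
]

with open("jobs_sample.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["url", "company"])
    writer.writeheader()
    # Rows missing either column, with a non-http(s) URL, or already recorded in
    # Redis under sent_job:<url> are skipped by process_csv with a logged reason.
    writer.writerows(sample_rows)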

ssl_connection.py (new file, 80 lines)

@@ -0,0 +1,80 @@
import pika
import ssl
import os
from dotenv import load_dotenv
# Load environment variables
load_dotenv()
def create_ssl_connection_parameters():
"""
Create and return RabbitMQ connection parameters with SSL configuration.
This function handles both SSL and non-SSL connections based on environment variables.
"""
# Load environment variables with fallbacks
rabbitmq_host = os.getenv('RABBITMQ_HOST')
rabbitmq_port = int(os.getenv('RABBITMQ_PORT', '5671'))
rabbitmq_user = os.getenv('RABBITMQ_USER')
rabbitmq_pass = os.getenv('RABBITMQ_PASS', 'ofure-scrape')
rabbitmq_ssl_enabled = os.getenv('RABBITMQ_SSL_ENABLED', 'true').lower() == 'true'
rabbitmq_ssl_verify = os.getenv('RABBITMQ_SSL_VERIFY', 'false').lower() == 'true'
# Validate credentials
if not rabbitmq_pass or rabbitmq_pass == 'YOUR_STRONG_PASSWORD':
print("Warning: Using placeholder or empty password. Please check .env file.")
credentials = pika.PlainCredentials(rabbitmq_user, rabbitmq_pass)
if rabbitmq_ssl_enabled:
# SSL Context
context = ssl.create_default_context()
context.check_hostname = rabbitmq_ssl_verify
context.verify_mode = ssl.CERT_REQUIRED if rabbitmq_ssl_verify else ssl.CERT_NONE
ssl_options = pika.SSLOptions(context, rabbitmq_host)
params = pika.ConnectionParameters(
host=rabbitmq_host,
port=rabbitmq_port,
credentials=credentials,
ssl_options=ssl_options,
heartbeat=600,
blocked_connection_timeout=300,
virtual_host='/'
)
else:
# Non-SSL connection
params = pika.ConnectionParameters(
host=rabbitmq_host,
port=rabbitmq_port if rabbitmq_port != 5671 else 5672, # Default non-SSL port
credentials=credentials,
heartbeat=600,
blocked_connection_timeout=300,
virtual_host='/'
)
return params
def test_connection():
"""
Test function to verify RabbitMQ connection (original functionality preserved).
"""
try:
params = create_ssl_connection_parameters()
connection = pika.BlockingConnection(params)
channel = connection.channel()
print("Connected to Secure RabbitMQ!")
connection.close()
return True
except Exception as e:
import traceback
print(f"Failed to connect: {e!r}")
traceback.print_exc()
return False
# Keep the original test functionality when run directly
if __name__ == "__main__":
test_connection()
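
For reference, these are the environment variables the new SSL paths read; the values below are placeholders, and putting the same keys into the .env file that load_dotenv() loads has the same effect as setting them in-process.

import os

os.environ.update({
    "RABBITMQ_HOST": "rabbitmq.example.com",   # placeholder host
    "RABBITMQ_PORT": "5671",                   # 5671 for SSL, 5672 for plain AMQP
    "RABBITMQ_USER": "scraper",                # placeholder credentials
    "RABBITMQ_PASS": "change-me",
    "RABBITMQ_SSL_ENABLED": "true",
    "RABBITMQ_SSL_VERIFY": "false",            # CERT_NONE unless set to "true"
    "REDIS_HOST": "redis.example.com",         # placeholder host
    "REDIS_PORT": "6380",
    "REDIS_PASSWORD": "change-me",
    "REDIS_SSL_ENABLED": "true",
})

from ssl_connection import test_connection
test_connection()  # prints "Connected to Secure RabbitMQ!" on success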