Enhance job handling in scraper and sender modules:
- Update fetch timeout in StealthyFetcher for improved reliability.
- Refactor LLMJobRefiner to create and manage the Quelah Jobs table in PostgreSQL.
- Modify RedisManager to track sent job counts for jobs.csv and adjust deduplication logic.
- Implement job URL-based deduplication across scraper and sender.
Commit b13d14d26d (parent c370de83d5)
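The core of the deduplication change is that the Redis "seen" marker is now keyed on the job URL rather than the extracted job ID. A minimal standalone sketch of that pattern, using a plain redis-py client (connection details are placeholders) and the same job_seen:{job_url} key and 30-day TTL that appear in the diff below:

    import redis

    r = redis.Redis(host="localhost", port=6379, decode_responses=True)  # placeholder connection

    def is_job_seen(job_url: str) -> bool:
        # Key on the full job URL, matching the scraper's "job_seen:{job_url}" convention
        return bool(r.exists(f"job_seen:{job_url}"))

    def mark_job_seen(job_url: str) -> None:
        # 2592000 seconds = 30 days, the TTL used by RedisManager.mark_job_seen
        r.setex(f"job_seen:{job_url}", 2592000, "1")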
@@ -23,7 +23,7 @@ class StealthyFetcher:
         try:
             page = await self.context.new_page()
             # Use networkidle for all platforms - works reliably for Ashby, Lever, and Greenhouse
-            await page.goto(url, wait_until='domcontentloaded', timeout=min(timeout, 60000))
+            await page.goto(url, wait_until='domcontentloaded', timeout=min(timeout, 120000))

             # Skip human behavior for Lever (already loads fully without it)
             if "lever.co" not in url:
@@ -68,7 +68,7 @@ class StealthyFetcher:

     async def _is_content_accessible(self, page: Page, wait_for_selector: Optional[str] = None) -> bool:
         try:
-            await page.wait_for_selector("body", timeout=60000)
+            await page.wait_for_selector("body", timeout=120000)
             body_text = await page.eval_on_selector("body", "el => el.innerText.toLowerCase()")
             if len(body_text.strip()) < 100:
                 return False
llm_agent.py (151 lines changed)
@@ -39,39 +39,29 @@ class LLMJobRefiner:
         self._init_db()

     def _init_db(self):
-        """Initialize PostgreSQL database connection and create table"""
+        """Initialize PostgreSQL database connection and create Quelah Jobs table"""
         try:
-            self.db_url = os.getenv("DB_URL")
-            if self.db_url and "supabase.com" in self.db_url:
-                conn = psycopg2.connect(
-                    host=self.db_host,
-                    port=self.db_port,
-                    database="postgres",
-                    user=self.db_username,
-                    password=self.db_password
-                )
-            else:
-                conn = psycopg2.connect(
-                    host=self.db_host,
-                    port=self.db_port,
-                    database="postgres",
-                    user=self.db_username,
-                    password=self.db_password
+            conn = psycopg2.connect(
+                host=self.db_host,
+                port=self.db_port,
+                database="postgres",
+                user=self.db_username,
+                password=self.db_password
             )
             cursor = conn.cursor()

-            # Create table if it doesn't exist
+            # ✅ CREATE NEW TABLE: quelah_jobs (no requirements field)
             cursor.execute('''
-                CREATE TABLE IF NOT EXISTS jobs (
+                CREATE TABLE IF NOT EXISTS quelah_jobs (
                     id SERIAL PRIMARY KEY,
                     title TEXT,
                     company_name TEXT,
                     location TEXT,
                     description TEXT,
-                    requirements TEXT,
                     qualifications TEXT,
                     salary_range TEXT,
                     nature_of_work TEXT,
+                    apply_type TEXT DEFAULT 'signup',
                     job_id TEXT UNIQUE,
                     url TEXT,
                     category TEXT,
@@ -81,27 +71,22 @@ class LLMJobRefiner:
                 )
             ''')

-            # Add apply_type column if it doesn't exist
+            # Ensure uniqueness constraint
             cursor.execute('''
-                ALTER TABLE jobs
-                ADD COLUMN IF NOT EXISTS apply_type TEXT DEFAULT 'signup'
+                ALTER TABLE quelah_jobs DROP CONSTRAINT IF EXISTS quelah_jobs_job_id_key;
+                ALTER TABLE quelah_jobs ADD CONSTRAINT quelah_jobs_job_id_key UNIQUE (job_id);
             ''')

-            # Ensure the uniqueness constraint exists
-            cursor.execute('''
-                ALTER TABLE jobs DROP CONSTRAINT IF EXISTS jobs_job_id_key;
-                ALTER TABLE jobs ADD CONSTRAINT jobs_job_id_key UNIQUE (job_id);
-            ''')
-
-            cursor.execute('CREATE INDEX IF NOT EXISTS idx_job_id ON jobs(job_id)')
-            cursor.execute('CREATE INDEX IF NOT EXISTS idx_category ON jobs(category)')
-            cursor.execute('CREATE INDEX IF NOT EXISTS idx_posted_date ON jobs(posted_date)')
-            cursor.execute('CREATE INDEX IF NOT EXISTS idx_apply_type ON jobs(apply_type)')
+            # Create indexes
+            cursor.execute('CREATE INDEX IF NOT EXISTS idx_quelah_job_id ON quelah_jobs(job_id)')
+            cursor.execute('CREATE INDEX IF NOT EXISTS idx_quelah_category ON quelah_jobs(category)')
+            cursor.execute('CREATE INDEX IF NOT EXISTS idx_quelah_posted_date ON quelah_jobs(posted_date)')
+            cursor.execute('CREATE INDEX IF NOT EXISTS idx_quelah_apply_type ON quelah_jobs(apply_type)')

             conn.commit()
             cursor.close()
             conn.close()
-            print("✅ PostgreSQL database initialized successfully")
+            print("✅ Quelah Jobs table initialized successfully")
         except Exception as e:
             print(f"❌ Database initialization error: {e}")
             raise
@@ -111,18 +96,18 @@ class LLMJobRefiner:
         try:
             soup = BeautifulSoup(html_content, 'html.parser')

             # Remove unwanted elements
             for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside', 'noscript']):
                 element.decompose()

-            # Keep only main content containers (platform-specific)
+            # Keep only main content containers
             main_content = None
             candidates = [
                 soup.find('main'),
                 soup.find('div', class_=re.compile(r'job|posting|content')),
                 soup.find('article'),
                 soup.body
             ]

             for candidate in candidates:
                 if candidate:
@@ -132,22 +117,22 @@ class LLMJobRefiner:
             if not main_content:
                 main_content = soup.body or soup

             # Extract text with some structure
             lines = []
             for elem in main_content.descendants:
                 if isinstance(elem, str):
                     text = elem.strip()
                     if text and len(text) > 5:  # Skip short fragments
                         lines.append(text)
                 elif elem.name in ['h1', 'h2', 'h3', 'h4', 'p', 'li', 'strong', 'b']:
                     text = elem.get_text().strip()
                     if text:
                         lines.append(text)

             # Join with newlines for better LLM parsing
             cleaned = '\n'.join(lines)

             # Limit length for LLM context
             if len(cleaned) > 10000:
                 cleaned = cleaned[:10000] + "..."

@@ -176,9 +161,9 @@ class LLMJobRefiner:
         cleaned_content = self._clean_html_for_llm(page_content)
         job_id = raw_data.get('job_id', 'unknown')
         url = raw_data.get('url', 'N/A')
-        posted_date = raw_data.get('posted_date', datetime.now().strftime("%m/%d/%y"))
+        posted_date = raw_data.get('posted_date', "12/01/25")  # ✅ Fixed date

-        # Detect platform from URL
+        # Detect platform from URL (for prompt only)
         platform = "unknown"
         if "ashbyhq.com" in url:
             platform = "ashby"
@@ -187,7 +172,7 @@ class LLMJobRefiner:
         elif "greenhouse.io" in url:
             platform = "greenhouse"

         # Platform-specific instructions
         platform_instructions = ""
         if platform == "ashby":
             platform_instructions = """
@@ -195,7 +180,7 @@ class LLMJobRefiner:
 - Title is usually in <h1> or <h2>
 - Company name is often in <meta> or header
 - Description is in <div class="job-posting"> or <article>
-- Look for sections like "About Us", "What you'll do", "Requirements", "Benefits"
+- Look for sections like "About Us", "What you'll do", "Qualifications", "Benefits"
 - Location may be in <span> near job title or in metadata
 """
         elif platform == "lever":
@@ -226,17 +211,18 @@ CRITICAL INSTRUCTIONS:
 FIELD RULES:
 - description: MUST include ALL role details, responsibilities, and overview. Never "Not provided" if any job description exists.
 - qualifications: MUST include ALL required skills, experience, education, and preferred qualifications. Combine them.
-- requirements: If no separate "requirements" section, extract required skills/experience from qualifications/description.
 - location: Extract city, state, or remote status if available.
 - salary_range: Extract if explicitly mentioned (e.g., "$70,000–$85,000").
 - nature_of_work: Extract if mentioned (e.g., "Part-time", "Remote", "On-site").

 REQUIRED FIELDS (must have valid values, never "N/A"):
-- title, company_name, job_id, url
+- title, company_name, job_id, url, description

-OPTIONAL FIELDS (can be "Not provided"):
+OPTIONAL FIELDS (can be "Not provided" if the information is actually not provided):
 - location, salary_range, nature_of_work

+⚠️ IMPORTANT: Do NOT include or extract a "requirements" field. Focus only on description and qualifications.

 Page Content:
 {cleaned_content}

@@ -258,19 +244,19 @@ Response format (ONLY return this JSON):
             response_text = await asyncio.get_event_loop().run_in_executor(
                 None,
                 lambda: self._generate_content_sync(prompt)
             )
             refined_data = self._parse_llm_response(response_text)

             if not refined_data:
                 return None

             # Validate required fields
-            required_fields = ['title', 'company_name', 'job_id', 'url']
+            required_fields = ['title', 'company_name', 'job_id', 'url', 'description']
             for field in required_fields:
                 if not refined_data.get(field) or refined_data[field].strip() in ["N/A", "", "Unknown", "Company", "Job"]:
                     return None

-            # Add the posted_date to the refined data
+            # Add the fixed posted_date
             refined_data['posted_date'] = posted_date

             return refined_data
@@ -291,79 +277,78 @@ Response format (ONLY return this JSON):
         except json.JSONDecodeError:
             return None

-    async def save_job_data(self, job_data: Dict[str, Any], keyword: str):
-        await self._save_to_db(job_data)
-        await self._save_to_markdown(job_data, keyword)
+    async def save_job_data(self, job_data: Dict[str, Any], keyword: str, platform: str = "quelah"):
+        """Save ALL jobs to Quelah Jobs table and markdown"""
+        await self._save_to_db_quelah(job_data)
+        await self._save_to_markdown_quelah(job_data, keyword)

-    async def _save_to_db(self, job_data: Dict[str, Any]):
-        """Save job data to PostgreSQL database with job_id uniqueness"""
+    async def _save_to_db_quelah(self, job_data: Dict[str, Any]):
+        """Save job data to Quelah Jobs table"""
         try:
             conn = psycopg2.connect(
                 host=self.db_host,
                 port=self.db_port,
                 database="postgres",
                 user=self.db_username,
                 password=self.db_password
             )
             cursor = conn.cursor()

-            # Add apply_type to job_data if not present (default to 'signup')
-            if 'apply_type' not in job_data:
-                job_data['apply_type'] = 'signup'
+            # Set apply_type if not present
+            apply_type = job_data.get("apply_type", "signup")

             cursor.execute('''
-                INSERT INTO jobs
-                (title, company_name, location, description, requirements,
-                qualifications, salary_range, nature_of_work, apply_type, job_id, url, category, scraped_at, posted_date)
-                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
+                INSERT INTO quelah_jobs
+                (title, company_name, location, description, qualifications,
+                salary_range, nature_of_work, apply_type, job_id, url, category, scraped_at, posted_date)
+                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                 ON CONFLICT (job_id) DO NOTHING
             ''', (
                 job_data.get("title", "N/A"),
                 job_data.get("company_name", "N/A"),
                 job_data.get("location", "N/A"),
                 job_data.get("description", "N/A"),
-                job_data.get("requirements", "N/A"),
                 job_data.get("qualifications", "N/A"),
                 job_data.get("salary_range", "N/A"),
                 job_data.get("nature_of_work", "N/A"),
-                job_data.get("apply_type", "signup"),  # Default to signup if not provided
+                apply_type,
                 job_data.get("job_id", "N/A"),
                 job_data.get("url", "N/A"),
                 job_data.get("category", "N/A"),
                 job_data.get("scraped_at"),
-                job_data.get("posted_date", "N/A")
+                job_data.get("posted_date", "12/01/25")  # Fixed date
             ))

             conn.commit()
             cursor.close()
             conn.close()

-            print(f" 💾 Saved job to category '{job_data.get('category', 'N/A')}' with job_id: {job_data.get('job_id', 'N/A')}")
+            print(f" 💾 Saved to Quelah Jobs | Job ID: {job_data.get('job_id', 'N/A')}")

         except Exception as e:
             print(f"❌ Database save error: {e}")

-    async def _save_to_markdown(self, job_data: Dict[str, Any], keyword: str):
-        os.makedirs("linkedin_jobs", exist_ok=True)
-        filepath = os.path.join("linkedin_jobs", "linkedin_jobs_scraped.md")
+    async def _save_to_markdown_quelah(self, job_data: Dict[str, Any], keyword: str):
+        os.makedirs("quelah_jobs", exist_ok=True)
+        filepath = os.path.join("quelah_jobs", "quelah_jobs.md")
         write_header = not os.path.exists(filepath) or os.path.getsize(filepath) == 0

         with open(filepath, "a", encoding="utf-8") as f:
             if write_header:
-                f.write(f"# LinkedIn Jobs - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
+                f.write(f"# Quelah Jobs - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
             f.write(f"## Job: {job_data.get('title', 'N/A')}\n\n")
             f.write(f"- *Keyword*: {keyword}\n")
             f.write(f"- *Company*: {job_data.get('company_name', 'N/A')}\n")
             f.write(f"- *Location*: {job_data.get('location', 'N/A')}\n")
             f.write(f"- *Nature of Work*: {job_data.get('nature_of_work', 'N/A')}\n")
             f.write(f"- *Salary Range*: {job_data.get('salary_range', 'N/A')}\n")
-            f.write(f"- *Apply Type*: {job_data.get('apply_type', 'signup')}\n")  # Add apply type to markdown
+            f.write(f"- *Apply Type*: {job_data.get('apply_type', 'signup')}\n")
             f.write(f"- *Job ID*: {job_data.get('job_id', 'N/A')}\n")
-            f.write(f"- *Posted Date*: {job_data.get('posted_date', 'N/A')}\n")
+            f.write(f"- *Posted Date*: {job_data.get('posted_date', '12/01/25')}\n")  # Fixed date
             f.write(f"- *Category*: {job_data.get('category', 'N/A')}\n")
             f.write(f"- *Scraped At*: {job_data.get('scraped_at', 'N/A')}\n")
             f.write(f"- *URL*: <{job_data.get('url', 'N/A')}>\n\n")
             f.write(f"### Description\n\n{job_data.get('description', 'N/A')}\n\n")
-            f.write(f"### Requirements\n\n{job_data.get('requirements', 'N/A')}\n\n")
+            # ✅ REMOVED requirements section
             f.write(f"### Qualifications\n\n{job_data.get('qualifications', 'N/A')}\n\n")
             f.write("---\n\n")
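Because quelah_jobs.job_id carries a UNIQUE constraint and the insert uses ON CONFLICT (job_id) DO NOTHING, re-saving a job with a known job_id is a no-op at the database level. A minimal sketch of that behaviour (column list trimmed for illustration; connection parameters are placeholders, not the project's real settings):

    import psycopg2

    conn = psycopg2.connect(host="localhost", port=5432, database="postgres",
                            user="postgres", password="secret")  # placeholder credentials
    cur = conn.cursor()
    cur.execute("""
        INSERT INTO quelah_jobs (title, company_name, job_id, url)
        VALUES (%s, %s, %s, %s)
        ON CONFLICT (job_id) DO NOTHING
    """, ("Data Engineer", "Acme", "abc123", "https://example.com/jobs/abc123"))
    print(cur.rowcount)  # 1 on the first insert, 0 when job_id already exists
    conn.commit()
    cur.close()
    conn.close()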
scraper.py (199 lines changed)
@@ -68,22 +68,24 @@ class RedisManager:
             logger.error(f"Failed to connect to Redis: {e}")
             self.redis_client = None

-    def is_job_seen(self, job_id: str) -> bool:
+    def is_job_seen(self, job_url: str) -> bool:
+        """✅ CHANGED: Check by job URL instead of job ID"""
         if not self.redis_client:
             return False

         try:
-            return bool(self.redis_client.exists(f"job_seen:{job_id}"))
+            return bool(self.redis_client.exists(f"job_seen:{job_url}"))
         except Exception as e:
             logger.error(f"Redis error checking job_seen: {e}")
             return False

-    def mark_job_seen(self, job_id: str):
+    def mark_job_seen(self, job_url: str):
+        """✅ CHANGED: Mark by job URL instead of job ID"""
         if not self.redis_client:
             return

         try:
-            self.redis_client.setex(f"job_seen:{job_id}", 2592000, "1")
+            self.redis_client.setex(f"job_seen:{job_url}", 2592000, "1")
         except Exception as e:
             logger.error(f"Redis error marking job_seen: {e}")

@@ -237,11 +239,13 @@ class MultiPlatformJobScraper:
         await asyncio.sleep(2 * (speed / 2))
         return await page.content()

-    async def _is_job_seen(self, job_id: str) -> bool:
-        return self.redis_manager.is_job_seen(job_id)
+    async def _is_job_seen(self, job_url: str) -> bool:
+        """✅ Use job URL for deduplication"""
+        return self.redis_manager.is_job_seen(job_url)

-    async def _mark_job_seen(self, job_id: str):
-        self.redis_manager.mark_job_seen(job_id)
+    async def _mark_job_seen(self, job_url: str):
+        """✅ Use job URL for marking"""
+        self.redis_manager.mark_job_seen(job_url)

     async def _get_cached_llm_result(self, job_url: str) -> Optional[Dict]:
         return self.redis_manager.get_cached_llm_result(job_url)
@@ -263,6 +267,36 @@ class MultiPlatformJobScraper:
         else:
             return "unknown"

+    def _is_job_expired_or_invalid(self, page_content: str) -> bool:
+        """Check if job is expired, removed, or has no description"""
+        content_lower = page_content.lower()
+
+        # Check for JavaScript-only pages
+        if "you need to enable javascript to run this app" in content_lower:
+            return True
+
+        invalid_phrases = [
+            "job no longer available",
+            "position has been filled",
+            "this job has expired",
+            "page not found",
+            "404 error",
+            "job has been closed",
+            "erweima.png",  # Detect spam/ad content
+            "wocao03.com",
+            "github.com/wocao01"
+        ]
+
+        for phrase in invalid_phrases:
+            if phrase in content_lower:
+                return True
+
+        # Check for meaningful description content
+        description_keywords = ['responsibilities', 'requirements', 'description', 'duties', 'role', 'about the']
+        has_description = any(kw in content_lower for kw in description_keywords)
+
+        return not has_description
+
     @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
     async def scrape_job(
         self,
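The new _is_job_expired_or_invalid check is purely textual: it flags JavaScript-only shells, any of the listed "expired/removed" phrases, and pages containing none of the description keywords. A condensed standalone restatement, only to illustrate the intended classification on toy inputs (the snippets below are hypothetical, not real job pages):

    def is_job_expired_or_invalid(page_content: str) -> bool:
        content_lower = page_content.lower()
        if "you need to enable javascript to run this app" in content_lower:
            return True
        invalid_phrases = ["job no longer available", "position has been filled",
                           "this job has expired", "page not found", "404 error",
                           "job has been closed"]
        if any(phrase in content_lower for phrase in invalid_phrases):
            return True
        description_keywords = ['responsibilities', 'requirements', 'description',
                                'duties', 'role', 'about the']
        return not any(kw in content_lower for kw in description_keywords)

    print(is_job_expired_or_invalid("<h1>Page not found</h1>"))                         # True
    print(is_job_expired_or_invalid("<h2>Responsibilities</h2> Build data pipelines"))  # False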
@@ -271,20 +305,21 @@ class MultiPlatformJobScraper:
         message_id: str
     ):
         platform = self._get_platform(job_url)
-        if platform == "unknown":
-            logger.info(f"⏭️ Skipping unsupported platform: {job_url}")
-            return True

+        # ✅ ONLY extract job_id from URL
         job_id = job_url.strip("/").split("/")[-1]
-        if await self._is_job_seen(job_id):
-            logger.info(f"⏭️ Skipping already processed job: {job_id}")
+        # ✅ Check if already processed BY URL (not job_id)
+        if await self._is_job_seen(job_url):
+            logger.info(f"⏭️ Skipping already processed job URL: {job_url}")
             return True

         cached_result = await self._get_cached_llm_result(job_url)
         if cached_result:
             logger.info(f"📦 Using cached LLM result for: {job_url}")
-            await self.llm_agent.save_job_data(cached_result, company_name)
-            await self._mark_job_seen(job_id)
+            # Save to Quelah Jobs - company_name will be overridden by LLM if found
+            await self.llm_agent.save_job_data(cached_result, company_name, "quelah")
+            await self._mark_job_seen(job_url)  # ✅ Mark by URL
             return True

         context = None
@@ -298,64 +333,77 @@ class MultiPlatformJobScraper:
             temp_fetcher = StealthyFetcher(self.engine, self.browser, context)

             fetch_timeout = 60000 if platform == "lever" else timeout_ms
-            job_page = await asyncio.wait_for(
-                temp_fetcher.fetch_url(job_url, wait_for_selector="h1", timeout=fetch_timeout),
-                timeout=fetch_timeout / 1000.0
-            )

-            # Check if job still exists (minimal content validation)
-            page_content = await job_page.content()
-            if len(page_content.strip()) < 500:  # Arbitrary threshold for "page exists"
-                logger.error(f"❌ Job no longer exists (empty/deleted): {job_url}")
-                await self._add_job_to_redis_cache(job_url, job_id, "job_not_found")
-                self.engine.report_outcome("job_not_found", url=job_url)
-                return False

+            # ✅ PLATFORM-SPECIFIC WAIT LOGIC WITH ASHBY FIX
             if platform == "ashby":
-                try:
-                    await job_page.wait_for_selector("div[class*='job-posting'], article, main", timeout=60000)
-                except Exception:
-                    logger.warning(f"⚠️ Ashby page didn't load properly: {job_url}")
-                    return False
-            elif platform == "lever":
-                pass
+                # Ashby requires JS execution - wait for network idle + job content
+                job_page = await asyncio.wait_for(
+                    temp_fetcher.fetch_url(job_url, wait_for_selector=None, timeout=fetch_timeout),
+                    timeout=fetch_timeout / 100.0
+                )
+                if job_page:
+                    # Wait for React hydration (job content to appear)
+                    try:
+                        await job_page.wait_for_function(
+                            "document.querySelector('h1') && document.querySelector('h1').innerText.length > 0",
+                            timeout=120000
+                        )
+                    except Exception:
+                        # Fallback: check if we got valid content
+                        content = await job_page.content()
+                        if "you need to enable javascript" in content.lower():
+                            logger.warning(f"⚠️ Ashby page still shows JS error: {job_url}")
+                            raise Exception("Ashby JS content not loaded")
             elif platform == "greenhouse":
-                try:
-                    await job_page.wait_for_selector("div.job-desc, section", timeout=60000)
-                except Exception:
-                    pass
+                job_page = await asyncio.wait_for(
+                    temp_fetcher.fetch_url(job_url, wait_for_selector="h1, div.job-desc", timeout=fetch_timeout),
+                    timeout=fetch_timeout / 1000.0
+                )
+            else:  # lever & others
+                job_page = await asyncio.wait_for(
+                    temp_fetcher.fetch_url(job_url, wait_for_selector="h1", timeout=fetch_timeout),
+                    timeout=fetch_timeout / 1000.0
+                )

-            # 🔑 APPLY TYPE LOGIC
+            if job_page is None:
+                logger.error(f"❌ Failed to load page for {job_url}")
+                await self._add_job_to_redis_cache(job_url, job_id, "page_load_failed")
+                await self._mark_job_seen(job_url)
+                return True
+
+            page_content = await job_page.content()
+
+            if self._is_job_expired_or_invalid(page_content):
+                logger.warning(f"🗑️ Discarding invalid job: {job_url}")
+                self.engine.report_outcome("job_discarded", url=job_url)
+                await self._mark_job_seen(job_url)  # ✅ Mark by URL
+                return True
+
+            # Apply type logic
             if platform in ["ashby", "lever", "greenhouse"]:
-                apply_type = 'AI'  # Always AI for these platforms
+                apply_type = 'AI'
             else:
-                # For other platforms: check if form is accessible without login
                 apply_btn = await job_page.query_selector("button:has-text('Apply for this job'), button:has-text('Apply now')")
-                apply_type = 'signup'  # default
+                apply_type = 'signup'
                 if apply_btn:
                     await self._human_click(job_page, apply_btn)
                     speed = self.engine.optimization_params.get("base_delay", 2.0)
                     await asyncio.sleep(2 * (speed / 2))
                     form = await job_page.query_selector("form, div[class*='application-form']")
                     if form:
-                        # Check for login prompts in form
                         login_indicators = await job_page.query_selector("input[type='email'], input[type='password'], text='sign in', text='log in'")
                         if not login_indicators:
                             apply_type = 'AI'
-                        else:
-                            apply_type = 'signup'
-                    else:
-                        apply_type = 'signup'

             final_url = job_url
             page_content = await self._extract_page_content_for_llm(job_page)
-            posted_date = datetime.now().strftime("%m/%d/%y")
+            posted_date = "12/01/25"  # Fixed date

             raw_data = {
                 "page_content": page_content,
                 "url": final_url,
                 "job_id": job_id,
-                "search_keywords": company_name,
+                "search_keywords": company_name,  # Only used if LLM can't find company
                 "posted_date": posted_date
             }

@@ -367,15 +415,18 @@ class MultiPlatformJobScraper:

             success = False
             if refined_data and refined_data.get("title", "N/A") != "N/A":
-                compulsory_fields = ['company_name', 'job_id', 'url']
+                # ✅ ONLY job_id, url are guaranteed - everything else from LLM
+                compulsory_fields = ['job_id', 'url']
                 for field in compulsory_fields:
                     if not refined_data.get(field) or refined_data[field] in ["N/A", "", "Unknown"]:
                         if field == 'job_id':
                             refined_data[field] = job_id
                         elif field == 'url':
                             refined_data[field] = final_url
-                        elif field == 'company_name':
-                            refined_data[field] = company_name
+
+                # Company name: prefer LLM extraction, fallback to queue
+                if not refined_data.get('company_name') or refined_data['company_name'] in ["N/A", "", "Unknown"]:
+                    refined_data['company_name'] = company_name

                 refined_data.update({
                     'apply_type': apply_type,
@@ -386,42 +437,51 @@ class MultiPlatformJobScraper:
                     'platform': platform
                 })

-                await self.llm_agent.save_job_data(refined_data, company_name)
+                await self.llm_agent.save_job_data(refined_data, company_name, "quelah")
                 await self._cache_llm_result(job_url, refined_data)
-                await self._mark_job_seen(job_id)
+                await self._mark_job_seen(job_url)  # ✅ Mark by URL

                 response_time = time.time() - start_time
                 self.engine.report_outcome("success", url=final_url, response_time=response_time)
-                logger.info(f"✅ Scraped ({platform}): {refined_data['title'][:50]}... (Apply Type: {apply_type})")
+                logger.info(f"✅ Saved to Quelah Jobs ({platform}): {refined_data['title'][:50]}...")
                 success = True
             else:
                 logger.warning(f"🟡 LLM failed to refine: {final_url}")
                 await self._add_job_to_redis_cache(final_url, job_id, "llm_failure")
+                await self._mark_job_seen(job_url)  # ✅ Mark by URL
                 self.engine.report_outcome("llm_failure", url=final_url)
+                return True

             return success

         except asyncio.TimeoutError:
             logger.error(f"⏰ Timeout processing job ({platform}): {job_url}")
             await self._add_job_to_redis_cache(job_url, job_id, "timeout")
+            await self._mark_job_seen(job_url)  # ✅ Mark by URL
             self.engine.report_outcome("timeout", url=job_url)
-            return False
+            return True

         except Exception as e:
             error_msg = str(e)
             if "NoneType" in error_msg or "disconnected" in error_msg or "Browser" in error_msg:
                 logger.warning("Browser connection lost. Forcing reinitialization.")
                 await self.close_browser()

-            # 🔍 Distinguish job-not-found vs other errors
+            error_type = "exception"
             if "page.goto: net::ERR_ABORTED" in error_msg or "page.goto: net::ERR_FAILED" in error_msg:
-                logger.error(f"❌ Job no longer exists (404/network error): {job_url}")
-                await self._add_job_to_redis_cache(job_url, job_id, "job_not_found")
-                self.engine.report_outcome("job_not_found", url=job_url)
+                error_type = "job_not_found"
             else:
-                logger.error(f"💥 Error processing job ({platform}) {job_url}: {error_msg}")
-                await self._add_job_to_redis_cache(job_url, job_id, "exception")
-                self.engine.report_outcome("exception", url=job_url)
-            return False
+                if "required" in error_msg.lower() or "missing" in error_msg.lower():
+                    error_type = "missing_fields"
+                elif "captcha" in error_msg.lower() or "cloudflare" in error_msg.lower():
+                    error_type = "anti_bot_protection"
+
+            logger.error(f"💥 Error processing job ({platform}) {job_url}: {error_msg}")
+            await self._add_job_to_redis_cache(job_url, job_id, error_type)
+            await self._mark_job_seen(job_url)  # ✅ Mark by URL
+            self.engine.report_outcome(error_type, url=job_url)
+            return True

         finally:
             if context:
                 try:
@@ -447,20 +507,19 @@ async def process_message_async(scraper: MultiPlatformJobScraper, ch, method, pr
         message_id = properties.message_id or f"msg_{int(time.time()*1000)}"

         logger.info(f"📥 Processing job: {job_link} (ID: {message_id})")
-        success = await scraper.scrape_job(job_link, company_name, message_id)
+        _ = await scraper.scrape_job(job_link, company_name, message_id)

         METRICS["processed"] += 1
-        if success:
-            METRICS["success"] += 1
-        else:
-            METRICS["failed"] += 1
     except json.JSONDecodeError:
         logger.error("❌ Invalid JSON in message")
+        ch.basic_ack(delivery_tag=method.delivery_tag)
         METRICS["failed"] += 1
+        return
     except Exception as e:
         logger.error(f"💥 Unexpected error: {str(e)}")
         METRICS["failed"] += 1
     finally:
+        # ✅ CRITICAL: Acknowledge ALL messages
        ch.basic_ack(delivery_tag=method.delivery_tag)

@@ -503,7 +562,7 @@ def start_consumer():
     channel.basic_qos(prefetch_count=1)
     channel.basic_consume(queue='job_queue', on_message_callback=callback_wrapper(scraper))

-    logger.info('Waiting for messages (Ashby, Lever, Greenhouse). To exit press CTRL+C')
+    logger.info('Waiting for messages (All platforms → Quelah Jobs). To exit press CTRL+C')
     try:
         channel.start_consuming()
     except KeyboardInterrupt:
sender.py (53 lines changed)
@@ -21,7 +21,7 @@ class RedisManager:
     """Manages Redis connection and operations for job deduplication."""

     def __init__(self):
-        self.redis_host = os.getenv('REDIS_HOST', 'redis-scrape.thejobhub.xyz')
+        self.redis_host = os.getenv('REDIS_HOST')
         self.redis_port = int(os.getenv('REDIS_PORT', '6380'))
         self.redis_password = os.getenv('REDIS_PASSWORD')
         self.redis_ssl_enabled = os.getenv('REDIS_SSL_ENABLED', 'true').lower() == 'true'
@@ -65,6 +65,26 @@ class RedisManager:
         except Exception:
             pass

+    # NEW: Track total sent jobs for jobs.csv
+    def get_jobs_csv_sent_count(self):
+        if not self.redis_client:
+            return 0
+        try:
+            count = self.redis_client.get("jobs_csv_sent_count")
+            return int(count) if count else 0
+        except Exception:
+            return 0
+
+    def increment_jobs_csv_sent_count(self):
+        if not self.redis_client:
+            return
+        try:
+            self.redis_client.incr("jobs_csv_sent_count")
+            # Set 30-day expiry to avoid stale data
+            self.redis_client.expire("jobs_csv_sent_count", 2592000)
+        except Exception:
+            pass


 class Sender:
     def __init__(self, config_file='config.ini'):
@@ -201,13 +221,21 @@ class Sender:
                 return False
         return False

-    def is_job_seen(self, job_url):
+    def is_job_seen(self, job_url, filename):
+        """Custom dedup logic: disable for jobs.csv until 6000 sent"""
+        if filename == "jobs.csv":
+            sent_count = self.redis_manager.get_jobs_csv_sent_count()
+            if sent_count < 6000:
+                return False  # Always resend
         return self.redis_manager.is_job_seen(job_url)

-    def mark_job_sent(self, job_url):
+    def mark_job_sent(self, job_url, filename):
         self.redis_manager.mark_job_sent(job_url)
+        if filename == "jobs.csv":
+            self.redis_manager.increment_jobs_csv_sent_count()

     def process_csv(self, file_path):
+        filename = os.path.basename(file_path)
         try:
             with open(file_path, 'r', encoding='utf-8') as csvfile:
                 reader = csv.DictReader(csvfile)
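Taken together with the counter methods added to RedisManager above, the effect is that rows from jobs.csv bypass deduplication until roughly 6,000 of them have been sent, after which the normal URL-based check applies. A condensed sketch of the gating logic as standalone functions (the connection and the job_sent: key name are illustrative; the sender's real key lives inside its RedisManager):

    import redis

    r = redis.Redis(decode_responses=True)  # placeholder connection

    def jobs_csv_sent_count() -> int:
        value = r.get("jobs_csv_sent_count")
        return int(value) if value else 0

    def is_job_seen(job_url: str, filename: str) -> bool:
        # Dedup is disabled for jobs.csv until 6000 rows have been sent
        if filename == "jobs.csv" and jobs_csv_sent_count() < 6000:
            return False
        return bool(r.exists(f"job_sent:{job_url}"))  # illustrative key name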
@@ -217,7 +245,6 @@ class Sender:
                 self.logger.info(f"CSV headers found: {reader.fieldnames}")

                 for row_num, row in enumerate(reader, start=1):
-                    # ✅ IMMEDIATE EXIT CHECK
                     if not self.running:
                         self.logger.info("Shutdown requested during CSV processing. Exiting...")
                         return sent_count
@@ -245,7 +272,8 @@ class Sender:
                         skipped_count += 1
                         continue

-                    if self.is_job_seen(url):
+                    # ✅ Modified: Pass filename to is_job_seen
+                    if self.is_job_seen(url, filename):
                         self.logger.info(f"Skipping row {row_num}: job already sent (deduplicated). URL: {url}")
                         skipped_count += 1
                         continue
@@ -256,13 +284,15 @@ class Sender:

                     if self.send_message(message, message_id):
                         sent_count += 1
-                        self.mark_job_sent(url)
+                        # ✅ Modified: Pass filename to mark_job_sent
+                        self.mark_job_sent(url, filename)
                     else:
                         self.logger.error(f"Failed to send job (row {row_num}): {url}")
                         skipped_count += 1

                     if (sent_count + skipped_count) % 100 == 0:
-                        self.logger.info(f"Progress: {sent_count} sent, {skipped_count} skipped from {file_path}")
+                        current_total = self.redis_manager.get_jobs_csv_sent_count() if filename == "jobs.csv" else "N/A"
+                        self.logger.info(f"Progress: {sent_count} sent, {skipped_count} skipped from {file_path} (jobs.csv total: {current_total})")

                 self.logger.info(f"Completed {file_path}: {sent_count} sent, {skipped_count} skipped")

@@ -282,7 +312,7 @@ class Sender:
             return 0

     def find_new_csvs(self):
-        if not self.running:  # ✅ IMMEDIATE EXIT CHECK
+        if not self.running:
             return []
         if not os.path.exists(self.directory):
             return []
@@ -300,7 +330,7 @@ class Sender:
                 new_files = self.find_new_csvs()
                 if new_files:
                     for file_path in new_files:
-                        if not self.running:  # ✅ IMMEDIATE EXIT CHECK
+                        if not self.running:
                             break
                         self.logger.info(f"Processing {file_path}")
                         sent = self.process_csv(file_path)
@@ -308,13 +338,11 @@ class Sender:
                 else:
                     self.logger.info("No new CSV files found")

-                # Replace blocking sleep with interruptible sleep
                 for _ in range(self.check_interval):
                     if not self.running:
                         break
                     time.sleep(1)
         except KeyboardInterrupt:
-            # This should not normally be reached due to signal handler, but added for safety
             pass
         finally:
             if self.connection and self.connection.is_open:
@@ -323,7 +351,7 @@ class Sender:

     def graceful_shutdown(self, signum, frame):
         self.logger.info("Received shutdown signal. Initiating graceful shutdown...")
-        self.running = False  # This will break all loops
+        self.running = False


 if __name__ == '__main__':
@@ -344,6 +372,5 @@ if __name__ == '__main__':
     try:
         sender.run()
     except KeyboardInterrupt:
-        # Fallback in case signal handler doesn't catch it
         sender.logger.info("KeyboardInterrupt caught in main. Exiting.")
         sys.exit(0)