Refactor job tracking to use job ID instead of job URL in RedisManager methods

parent  b13d14d26d
commit  2c5b42b7bd

scraper.py — 238 changed lines
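The net effect of the refactor: deduplication is keyed by the job ID (the last path segment of the posting URL) rather than by the full URL. Below is a minimal sketch of the new keying, assuming a local Redis instance and the redis-py client; the key format, ID extraction, and 30-day TTL (2592000 seconds) mirror the diff, while the connection details and example URL are purely illustrative.

import redis

# Illustrative connection; the real scraper configures RedisManager elsewhere
r = redis.Redis(host="localhost", port=6379, decode_responses=True)

def job_id_from_url(job_url: str) -> str:
    # Same extraction scrape_job uses: last path segment of the posting URL
    return job_url.strip("/").split("/")[-1]

def is_job_seen(job_id: str) -> bool:
    # Key is now job_seen:<job_id> instead of job_seen:<full URL>
    return bool(r.exists(f"job_seen:{job_id}"))

def mark_job_seen(job_id: str) -> None:
    # 30-day TTL, matching the setex call in RedisManager
    r.setex(f"job_seen:{job_id}", 2592000, "1")

# Usage: different URL variants of the same posting collapse to one key
jid = job_id_from_url("https://jobs.lever.co/acme/12345-abcde/")  # hypothetical URL
if not is_job_seen(jid):
    mark_job_seen(jid)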
@@ -68,24 +68,22 @@ class RedisManager:
             logger.error(f"Failed to connect to Redis: {e}")
             self.redis_client = None
 
-    def is_job_seen(self, job_url: str) -> bool:
-        """✅ CHANGED: Check by job URL instead of job ID"""
+    def is_job_seen(self, job_id: str) -> bool:
         if not self.redis_client:
             return False
 
         try:
-            return bool(self.redis_client.exists(f"job_seen:{job_url}"))
+            return bool(self.redis_client.exists(f"job_seen:{job_id}"))
         except Exception as e:
             logger.error(f"Redis error checking job_seen: {e}")
             return False
 
-    def mark_job_seen(self, job_url: str):
-        """✅ CHANGED: Mark by job URL instead of job ID"""
+    def mark_job_seen(self, job_id: str):
         if not self.redis_client:
             return
 
         try:
-            self.redis_client.setex(f"job_seen:{job_url}", 2592000, "1")
+            self.redis_client.setex(f"job_seen:{job_id}", 2592000, "1")
         except Exception as e:
             logger.error(f"Redis error marking job_seen: {e}")
 
@@ -239,13 +237,11 @@ class MultiPlatformJobScraper:
             await asyncio.sleep(2 * (speed / 2))
         return await page.content()
 
-    async def _is_job_seen(self, job_url: str) -> bool:
-        """✅ Use job URL for deduplication"""
-        return self.redis_manager.is_job_seen(job_url)
+    async def _is_job_seen(self, job_id: str) -> bool:
+        return self.redis_manager.is_job_seen(job_id)
 
-    async def _mark_job_seen(self, job_url: str):
-        """✅ Use job URL for marking"""
-        self.redis_manager.mark_job_seen(job_url)
+    async def _mark_job_seen(self, job_id: str):
+        self.redis_manager.mark_job_seen(job_id)
 
     async def _get_cached_llm_result(self, job_url: str) -> Optional[Dict]:
         return self.redis_manager.get_cached_llm_result(job_url)
@@ -267,36 +263,6 @@ class MultiPlatformJobScraper:
         else:
             return "unknown"
 
-    def _is_job_expired_or_invalid(self, page_content: str) -> bool:
-        """Check if job is expired, removed, or has no description"""
-        content_lower = page_content.lower()
-
-        # Check for JavaScript-only pages
-        if "you need to enable javascript to run this app" in content_lower:
-            return True
-
-        invalid_phrases = [
-            "job no longer available",
-            "position has been filled",
-            "this job has expired",
-            "page not found",
-            "404 error",
-            "job has been closed",
-            "erweima.png",  # Detect spam/ad content
-            "wocao03.com",
-            "github.com/wocao01"
-        ]
-
-        for phrase in invalid_phrases:
-            if phrase in content_lower:
-                return True
-
-        # Check for meaningful description content
-        description_keywords = ['responsibilities', 'requirements', 'description', 'duties', 'role', 'about the']
-        has_description = any(kw in content_lower for kw in description_keywords)
-
-        return not has_description
-
     @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
     async def scrape_job(
         self,
@@ -305,21 +271,20 @@ class MultiPlatformJobScraper:
         message_id: str
     ):
         platform = self._get_platform(job_url)
+        if platform == "unknown":
+            logger.info(f"⏭️ Skipping unsupported platform: {job_url}")
+            return True
 
-        # ✅ ONLY extract job_id from URL
         job_id = job_url.strip("/").split("/")[-1]
-        # ✅ Check if already processed BY URL (not job_id)
-        if await self._is_job_seen(job_url):
-            logger.info(f"⏭️ Skipping already processed job URL: {job_url}")
+        if await self._is_job_seen(job_id):
+            logger.info(f"⏭️ Skipping already processed job: {job_id}")
             return True
 
         cached_result = await self._get_cached_llm_result(job_url)
         if cached_result:
             logger.info(f"📦 Using cached LLM result for: {job_url}")
-            # Save to Quelah Jobs - company_name will be overridden by LLM if found
-            await self.llm_agent.save_job_data(cached_result, company_name, "quelah")
-            await self._mark_job_seen(job_url)  # ✅ Mark by URL
+            await self.llm_agent.save_job_data(cached_result, company_name)
+            await self._mark_job_seen(job_id)
             return True
 
         context = None
@@ -333,77 +298,86 @@ class MultiPlatformJobScraper:
             temp_fetcher = StealthyFetcher(self.engine, self.browser, context)
 
             fetch_timeout = 60000 if platform == "lever" else timeout_ms
+            job_page = await asyncio.wait_for(
+                temp_fetcher.fetch_url(job_url, wait_for_selector="h1", timeout=fetch_timeout),
+                timeout=fetch_timeout / 1000.0
+            )
 
-            # ✅ PLATFORM-SPECIFIC WAIT LOGIC WITH ASHBY FIX
-            if platform == "ashby":
-                # Ashby requires JS execution - wait for network idle + job content
-                job_page = await asyncio.wait_for(
-                    temp_fetcher.fetch_url(job_url, wait_for_selector=None, timeout=fetch_timeout),
-                    timeout=fetch_timeout / 100.0
-                )
-                if job_page:
-                    # Wait for React hydration (job content to appear)
-                    try:
-                        await job_page.wait_for_function(
-                            "document.querySelector('h1') && document.querySelector('h1').innerText.length > 0",
-                            timeout=120000
-                        )
-                    except Exception:
-                        # Fallback: check if we got valid content
-                        content = await job_page.content()
-                        if "you need to enable javascript" in content.lower():
-                            logger.warning(f"⚠️ Ashby page still shows JS error: {job_url}")
-                            raise Exception("Ashby JS content not loaded")
-            elif platform == "greenhouse":
-                job_page = await asyncio.wait_for(
-                    temp_fetcher.fetch_url(job_url, wait_for_selector="h1, div.job-desc", timeout=fetch_timeout),
-                    timeout=fetch_timeout / 1000.0
-                )
-            else:  # lever & others
-                job_page = await asyncio.wait_for(
-                    temp_fetcher.fetch_url(job_url, wait_for_selector="h1", timeout=fetch_timeout),
-                    timeout=fetch_timeout / 1000.0
-                )
-
-            if job_page is None:
-                logger.error(f"❌ Failed to load page for {job_url}")
-                await self._add_job_to_redis_cache(job_url, job_id, "page_load_failed")
-                await self._mark_job_seen(job_url)
-                return True
-
+            # Check if job still exists (minimal content validation)
             page_content = await job_page.content()
+            if len(page_content.strip()) < 500:  # Arbitrary threshold for "page exists"
+                logger.error(f"❌ Job no longer exists (empty/deleted): {job_url}")
+                await self._add_job_to_redis_cache(job_url, job_id, "job_not_found")
+                self.engine.report_outcome("job_not_found", url=job_url)
+                return False
 
-            if self._is_job_expired_or_invalid(page_content):
-                logger.warning(f"🗑️ Discarding invalid job: {job_url}")
-                self.engine.report_outcome("job_discarded", url=job_url)
-                await self._mark_job_seen(job_url)  # ✅ Mark by URL
-                return True
+            if platform == "ashby":
+                try:
+                    await job_page.wait_for_selector("div[class*='job-posting'], article, main", timeout=60000)
+                except Exception:
+                    logger.warning(f"⚠️ Ashby page didn't load properly: {job_url}")
+                    return False
+            elif platform == "lever":
+                pass
+            elif platform == "greenhouse":
+                try:
+                    await job_page.wait_for_selector("div.job-desc, section", timeout=60000)
+                except Exception:
+                    pass
 
-            # Apply type logic
+            # Extract page content for initial validation
+            page_content = await self._extract_page_content_for_llm(job_page)
+
+            # Check for job expiration or unavailability indicators
+            page_text_lower = page_content.lower()
+            job_unavailable_indicators = [
+                "job no longer available",
+                "position has been filled",
+                "this job has expired",
+                "job posting has expired",
+                "no longer accepting applications",
+                "position is closed",
+                "job is no longer active",
+                "this position is no longer open"
+            ]
+
+            if any(indicator in page_text_lower for indicator in job_unavailable_indicators):
+                logger.error(f"❌ Job no longer available/expired: {job_url}")
+                await self._add_job_to_redis_cache(job_url, job_id, "job_not_found")
+                self.engine.report_outcome("job_not_found", url=job_url)
+                return False
+
+            # 🔑 APPLY TYPE LOGIC
             if platform in ["ashby", "lever", "greenhouse"]:
-                apply_type = 'AI'
+                apply_type = 'AI'  # Always AI for these platforms
             else:
+                # For other platforms: check if form is accessible without login
                 apply_btn = await job_page.query_selector("button:has-text('Apply for this job'), button:has-text('Apply now')")
-                apply_type = 'signup'
+                apply_type = 'signup'  # default
                 if apply_btn:
                     await self._human_click(job_page, apply_btn)
                     speed = self.engine.optimization_params.get("base_delay", 2.0)
                     await asyncio.sleep(2 * (speed / 2))
                     form = await job_page.query_selector("form, div[class*='application-form']")
                     if form:
+                        # Check for login prompts in form
                        login_indicators = await job_page.query_selector("input[type='email'], input[type='password'], text='sign in', text='log in'")
                        if not login_indicators:
                            apply_type = 'AI'
+                        else:
+                            apply_type = 'signup'
+                    else:
+                        apply_type = 'signup'
 
             final_url = job_url
-            page_content = await self._extract_page_content_for_llm(job_page)
-            posted_date = "12/01/25"  # Fixed date
+            # Hardcode posted_date to Dec 1st 2025
+            posted_date = "12/01/25"
 
             raw_data = {
                 "page_content": page_content,
                 "url": final_url,
                 "job_id": job_id,
-                "search_keywords": company_name,  # Only used if LLM can't find company
+                "search_keywords": company_name,
                 "posted_date": posted_date
             }
 
@@ -415,18 +389,23 @@ class MultiPlatformJobScraper:
 
             success = False
             if refined_data and refined_data.get("title", "N/A") != "N/A":
-                # ✅ ONLY job_id, url are guaranteed - everything else from LLM
-                compulsory_fields = ['job_id', 'url']
+                # Check if description is missing or empty
+                description = refined_data.get("description", "").strip()
+                if not description or description in ["N/A", "Unknown", ""]:
+                    logger.error(f"❌ Job discarded - missing description: {final_url}")
+                    await self._add_job_to_redis_cache(final_url, job_id, "job_not_found")
+                    self.engine.report_outcome("job_not_found", url=final_url)
+                    return False
 
+                compulsory_fields = ['company_name', 'job_id', 'url']
                 for field in compulsory_fields:
                     if not refined_data.get(field) or refined_data[field] in ["N/A", "", "Unknown"]:
                         if field == 'job_id':
                             refined_data[field] = job_id
                         elif field == 'url':
                             refined_data[field] = final_url
-                # Company name: prefer LLM extraction, fallback to queue
-                if not refined_data.get('company_name') or refined_data['company_name'] in ["N/A", "", "Unknown"]:
-                    refined_data['company_name'] = company_name
+                        elif field == 'company_name':
+                            refined_data[field] = company_name
 
                 refined_data.update({
                     'apply_type': apply_type,
@@ -437,51 +416,51 @@ class MultiPlatformJobScraper:
                     'platform': platform
                 })
 
-                await self.llm_agent.save_job_data(refined_data, company_name, "quelah")
+                await self.llm_agent.save_job_data(refined_data, company_name)
                 await self._cache_llm_result(job_url, refined_data)
-                await self._mark_job_seen(job_url)  # ✅ Mark by URL
+                await self._mark_job_seen(job_id)
 
                 response_time = time.time() - start_time
                 self.engine.report_outcome("success", url=final_url, response_time=response_time)
-                logger.info(f"✅ Saved to Quelah Jobs ({platform}): {refined_data['title'][:50]}...")
+                logger.info(f"✅ Scraped ({platform}): {refined_data['title'][:50]}... (Apply Type: {apply_type})")
                 success = True
             else:
                 logger.warning(f"🟡 LLM failed to refine: {final_url}")
                 await self._add_job_to_redis_cache(final_url, job_id, "llm_failure")
-                await self._mark_job_seen(job_url)  # ✅ Mark by URL
                 self.engine.report_outcome("llm_failure", url=final_url)
-                return True
 
             return success
 
         except asyncio.TimeoutError:
             logger.error(f"⏰ Timeout processing job ({platform}): {job_url}")
             await self._add_job_to_redis_cache(job_url, job_id, "timeout")
-            await self._mark_job_seen(job_url)  # ✅ Mark by URL
             self.engine.report_outcome("timeout", url=job_url)
-            return True
+            return False
 
         except Exception as e:
             error_msg = str(e)
             if "NoneType" in error_msg or "disconnected" in error_msg or "Browser" in error_msg:
                 logger.warning("Browser connection lost. Forcing reinitialization.")
                 await self.close_browser()
 
-            error_type = "exception"
+            # 🔍 Distinguish job-not-found vs other errors
             if "page.goto: net::ERR_ABORTED" in error_msg or "page.goto: net::ERR_FAILED" in error_msg:
-                error_type = "job_not_found"
+                logger.error(f"❌ Job no longer exists (404/network error): {job_url}")
+                await self._add_job_to_redis_cache(job_url, job_id, "job_not_found")
+                self.engine.report_outcome("job_not_found", url=job_url)
             else:
-                if "required" in error_msg.lower() or "missing" in error_msg.lower():
-                    error_type = "missing_fields"
-                elif "captcha" in error_msg.lower() or "cloudflare" in error_msg.lower():
-                    error_type = "anti_bot_protection"
-
-            logger.error(f"💥 Error processing job ({platform}) {job_url}: {error_msg}")
-            await self._add_job_to_redis_cache(job_url, job_id, error_type)
-            await self._mark_job_seen(job_url)  # ✅ Mark by URL
-            self.engine.report_outcome(error_type, url=job_url)
-            return True
-
+                # Categorize other errors
+                error_type = "exception"
+                if "timeout" in error_msg.lower():
+                    error_type = "timeout"
+                elif "llm" in error_msg.lower() or "refine" in error_msg.lower():
+                    error_type = "llm_failure"
+                else:
+                    error_type = "scraping_error"
+
+                logger.error(f"💥 Error processing job ({platform}) {job_url}: {error_msg}")
+                await self._add_job_to_redis_cache(job_url, job_id, error_type)
+                self.engine.report_outcome(error_type, url=job_url)
+                return False
         finally:
             if context:
                 try:
@@ -507,19 +486,20 @@ async def process_message_async(scraper: MultiPlatformJobScraper, ch, method, pr
         message_id = properties.message_id or f"msg_{int(time.time()*1000)}"
 
         logger.info(f"📥 Processing job: {job_link} (ID: {message_id})")
-        _ = await scraper.scrape_job(job_link, company_name, message_id)
+        success = await scraper.scrape_job(job_link, company_name, message_id)
 
         METRICS["processed"] += 1
+        if success:
+            METRICS["success"] += 1
+        else:
+            METRICS["failed"] += 1
     except json.JSONDecodeError:
         logger.error("❌ Invalid JSON in message")
-        ch.basic_ack(delivery_tag=method.delivery_tag)
         METRICS["failed"] += 1
-        return
     except Exception as e:
         logger.error(f"💥 Unexpected error: {str(e)}")
         METRICS["failed"] += 1
     finally:
-        # ✅ CRITICAL: Acknowledge ALL messages
         ch.basic_ack(delivery_tag=method.delivery_tag)
 
 
@@ -562,7 +542,7 @@ def start_consumer():
     channel.basic_qos(prefetch_count=1)
     channel.basic_consume(queue='job_queue', on_message_callback=callback_wrapper(scraper))
 
-    logger.info('Waiting for messages (All platforms → Quelah Jobs). To exit press CTRL+C')
+    logger.info('Waiting for messages (Ashby, Lever, Greenhouse). To exit press CTRL+C')
     try:
         channel.start_consuming()
     except KeyboardInterrupt: