Refactor environment variable handling in the scraper; remove default values for the RabbitMQ and Redis configuration. Enhance job validation by checking that all compulsory fields are present before processing.

Ofure Ikheloa 2025-12-15 09:37:52 +01:00
parent 2c5b42b7bd
commit 87c67265f8


@@ -26,12 +26,12 @@ logging.basicConfig(level=logging.INFO,
 logger = logging.getLogger(__name__)
 
 # Environment variables
-RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "localhost")
+RABBITMQ_HOST = os.getenv("RABBITMQ_HOST")
 RABBITMQ_PORT = int(os.getenv("RABBITMQ_PORT", "5671"))
 RABBITMQ_SSL_ENABLED = os.getenv("RABBITMQ_SSL_ENABLED", "false").lower() == "true"
 
 # Redis configuration
-REDIS_HOST = os.getenv('REDIS_HOST', 'redis-scrape.thejobhub.xyz')
+REDIS_HOST = os.getenv('REDIS_HOST')
 REDIS_PORT = int(os.getenv('REDIS_PORT', '6380'))
 REDIS_PASSWORD = os.getenv('REDIS_PASSWORD')
 REDIS_SSL_ENABLED = os.getenv('REDIS_SSL_ENABLED', 'true').lower() == 'true'
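Note (not part of the diff): with the defaults removed, os.getenv() returns None when RABBITMQ_HOST or REDIS_HOST is unset, so a misconfigured deployment would only fail once the first connection is attempted. A minimal fail-fast sketch, assuming a hypothetical _require_env helper:

import os

def _require_env(name: str) -> str:
    # Hypothetical helper: read a mandatory environment variable or stop at startup.
    value = os.getenv(name)
    if not value:
        raise RuntimeError(f"Missing required environment variable: {name}")
    return value

# With the defaults gone, these must now be supplied by the deployment environment.
RABBITMQ_HOST = _require_env("RABBITMQ_HOST")
REDIS_HOST = _require_env("REDIS_HOST")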
@@ -389,24 +389,25 @@ class MultiPlatformJobScraper:
         success = False
         if refined_data and refined_data.get("title", "N/A") != "N/A":
-            # Check if description is missing or empty
-            description = refined_data.get("description", "").strip()
-            if not description or description in ["N/A", "Unknown", ""]:
-                logger.error(f"❌ Job discarded - missing description: {final_url}")
+            # Define all compulsory fields that must be present and valid
+            compulsory_fields = ['company_name', 'job_id', 'url', 'title', 'description']
+
+            # Validate all compulsory fields
+            missing_fields = []
+            for field in compulsory_fields:
+                field_value = refined_data.get(field, "").strip()
+                if not field_value or field_value in ["N/A", "Unknown", ""]:
+                    missing_fields.append(field)
+            # If any compulsory field or description is missing, discard the job
+            if missing_fields:
+                logger.error(f"❌ Job discarded - missing compulsory fields {missing_fields}: {final_url}")
                 await self._add_job_to_redis_cache(final_url, job_id, "job_not_found")
                 self.engine.report_outcome("job_not_found", url=final_url)
                 return False
-            compulsory_fields = ['company_name', 'job_id', 'url']
-            for field in compulsory_fields:
-                if not refined_data.get(field) or refined_data[field] in ["N/A", "", "Unknown"]:
-                    if field == 'job_id':
-                        refined_data[field] = job_id
-                    elif field == 'url':
-                        refined_data[field] = final_url
-                    elif field == 'company_name':
-                        refined_data[field] = company_name
+            # If we get here, all compulsory fields are present and valid
+            # Update with additional metadata
             refined_data.update({
                 'apply_type': apply_type,
                 'scraped_at': datetime.now().isoformat(),