Refactor environment variable handling in scraper; remove default values for the RabbitMQ and Redis host settings (ports and SSL flags retain their defaults). Enhance job validation by checking all compulsory fields (company_name, job_id, url, title, description) before processing.

This commit is contained in:
Ofure Ikheloa 2025-12-15 09:37:52 +01:00
parent 2c5b42b7bd
commit 87c67265f8

View File

@ -26,12 +26,12 @@ logging.basicConfig(level=logging.INFO,
logger = logging.getLogger(__name__)
# Environment variables
RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "localhost")
RABBITMQ_HOST = os.getenv("RABBITMQ_HOST")
RABBITMQ_PORT = int(os.getenv("RABBITMQ_PORT", "5671"))
RABBITMQ_SSL_ENABLED = os.getenv("RABBITMQ_SSL_ENABLED", "false").lower() == "true"
# Redis configuration
REDIS_HOST = os.getenv('REDIS_HOST', 'redis-scrape.thejobhub.xyz')
REDIS_HOST = os.getenv('REDIS_HOST')
REDIS_PORT = int(os.getenv('REDIS_PORT', '6380'))
REDIS_PASSWORD = os.getenv('REDIS_PASSWORD')
REDIS_SSL_ENABLED = os.getenv('REDIS_SSL_ENABLED', 'true').lower() == 'true'
@ -389,24 +389,25 @@ class MultiPlatformJobScraper:
success = False
if refined_data and refined_data.get("title", "N/A") != "N/A":
# Check if description is missing or empty
description = refined_data.get("description", "").strip()
if not description or description in ["N/A", "Unknown", ""]:
logger.error(f"❌ Job discarded - missing description: {final_url}")
# Define all compulsory fields that must be present and valid
compulsory_fields = ['company_name', 'job_id', 'url', 'title', 'description']
# Validate all compulsory fields
missing_fields = []
for field in compulsory_fields:
field_value = refined_data.get(field, "").strip()
if not field_value or field_value in ["N/A", "Unknown", ""]:
missing_fields.append(field)
# If any compulsory field or description is missing, discard the job
if missing_fields:
logger.error(f"❌ Job discarded - missing compulsory fields {missing_fields}: {final_url}")
await self._add_job_to_redis_cache(final_url, job_id, "job_not_found")
self.engine.report_outcome("job_not_found", url=final_url)
return False
compulsory_fields = ['company_name', 'job_id', 'url']
for field in compulsory_fields:
if not refined_data.get(field) or refined_data[field] in ["N/A", "", "Unknown"]:
if field == 'job_id':
refined_data[field] = job_id
elif field == 'url':
refined_data[field] = final_url
elif field == 'company_name':
refined_data[field] = company_name
# If we get here, all compulsory fields are present and valid
# Update with additional metadata
refined_data.update({
'apply_type': apply_type,
'scraped_at': datetime.now().isoformat(),