Refactor environment variable handling in scraper; remove default values for RabbitMQ and Redis configurations. Enhance job validation by checking for all compulsory fields before processing.
parent 2c5b42b7bd
commit 87c67265f8

scraper.py (33 changed lines)
@@ -26,12 +26,12 @@ logging.basicConfig(level=logging.INFO,
 logger = logging.getLogger(__name__)
 
 # Environment variables
-RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "localhost")
+RABBITMQ_HOST = os.getenv("RABBITMQ_HOST")
 RABBITMQ_PORT = int(os.getenv("RABBITMQ_PORT", "5671"))
 RABBITMQ_SSL_ENABLED = os.getenv("RABBITMQ_SSL_ENABLED", "false").lower() == "true"
 
 # Redis configuration
-REDIS_HOST = os.getenv('REDIS_HOST', 'redis-scrape.thejobhub.xyz')
+REDIS_HOST = os.getenv('REDIS_HOST')
 REDIS_PORT = int(os.getenv('REDIS_PORT', '6380'))
 REDIS_PASSWORD = os.getenv('REDIS_PASSWORD')
 REDIS_SSL_ENABLED = os.getenv('REDIS_SSL_ENABLED', 'true').lower() == 'true'
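With the fallbacks removed, os.getenv returns None when RABBITMQ_HOST or REDIS_HOST is unset, so a misconfigured deployment now surfaces instead of silently pointing at the old defaults. A minimal sketch of a startup guard that fails fast in that case (the _require_env helper is illustrative, not part of this commit):

import os
import sys

def _require_env(name: str) -> str:
    """Return a required environment variable, exiting with a clear message if it is unset or empty."""
    value = os.getenv(name)
    if not value:
        sys.exit(f"Missing required environment variable: {name}")
    return value

# Hosts no longer have fallbacks, so resolve them through the guard;
# the port settings keep their defaults as in the diff above.
RABBITMQ_HOST = _require_env("RABBITMQ_HOST")
REDIS_HOST = _require_env("REDIS_HOST")
RABBITMQ_PORT = int(os.getenv("RABBITMQ_PORT", "5671"))
REDIS_PORT = int(os.getenv("REDIS_PORT", "6380"))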
@@ -389,24 +389,25 @@ class MultiPlatformJobScraper:
         success = False
         if refined_data and refined_data.get("title", "N/A") != "N/A":
-            # Check if description is missing or empty
-            description = refined_data.get("description", "").strip()
-            if not description or description in ["N/A", "Unknown", ""]:
-                logger.error(f"❌ Job discarded - missing description: {final_url}")
+            # Define all compulsory fields that must be present and valid
+            compulsory_fields = ['company_name', 'job_id', 'url', 'title', 'description']
+
+            # Validate all compulsory fields
+            missing_fields = []
+            for field in compulsory_fields:
+                field_value = refined_data.get(field, "").strip()
+                if not field_value or field_value in ["N/A", "Unknown", ""]:
+                    missing_fields.append(field)
+
+            # If any compulsory field or description is missing, discard the job
+            if missing_fields:
+                logger.error(f"❌ Job discarded - missing compulsory fields {missing_fields}: {final_url}")
                 await self._add_job_to_redis_cache(final_url, job_id, "job_not_found")
                 self.engine.report_outcome("job_not_found", url=final_url)
                 return False
 
-            compulsory_fields = ['company_name', 'job_id', 'url']
-            for field in compulsory_fields:
-                if not refined_data.get(field) or refined_data[field] in ["N/A", "", "Unknown"]:
-                    if field == 'job_id':
-                        refined_data[field] = job_id
-                    elif field == 'url':
-                        refined_data[field] = final_url
-                    elif field == 'company_name':
-                        refined_data[field] = company_name
 
+            # If we get here, all compulsory fields are present and valid
             # Update with additional metadata
             refined_data.update({
                 'apply_type': apply_type,
                 'scraped_at': datetime.now().isoformat(),
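This hunk replaces the description-only check and the fallback-filling loop with a single pass over every compulsory field; records missing any of them are now discarded rather than patched up. A minimal sketch of the same check factored into a standalone helper, with a usage example (the find_missing_fields name and the sample record are illustrative, not part of the commit):

COMPULSORY_FIELDS = ['company_name', 'job_id', 'url', 'title', 'description']
PLACEHOLDER_VALUES = {"N/A", "Unknown", ""}

def find_missing_fields(refined_data: dict) -> list[str]:
    """Return the compulsory fields that are absent or hold placeholder values."""
    missing = []
    for field in COMPULSORY_FIELDS:
        value = str(refined_data.get(field, "")).strip()
        if not value or value in PLACEHOLDER_VALUES:
            missing.append(field)
    return missing

# Example: a record with no company_name and a blank description is rejected.
sample = {"job_id": "123", "url": "https://example.com/job/123",
          "title": "Data Engineer", "description": "  "}
print(find_missing_fields(sample))  # ['company_name', 'description']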