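Refactor environment variable handling in scraper: remove the hard-coded default values for RABBITMQ_HOST and REDIS_HOST (the port and SSL defaults are unchanged). Enhance job validation by checking that all compulsory fields (company_name, job_id, url, title, description) are present and valid before processing; a job missing any of them is now discarded instead of having job_id, url, or company_name back-filled.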
This commit is contained in:
parent
2c5b42b7bd
commit
87c67265f8
33
scraper.py
33
scraper.py
@ -26,12 +26,12 @@ logging.basicConfig(level=logging.INFO,
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# Environment variables
|
# Environment variables
|
||||||
RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "localhost")
|
RABBITMQ_HOST = os.getenv("RABBITMQ_HOST")
|
||||||
RABBITMQ_PORT = int(os.getenv("RABBITMQ_PORT", "5671"))
|
RABBITMQ_PORT = int(os.getenv("RABBITMQ_PORT", "5671"))
|
||||||
RABBITMQ_SSL_ENABLED = os.getenv("RABBITMQ_SSL_ENABLED", "false").lower() == "true"
|
RABBITMQ_SSL_ENABLED = os.getenv("RABBITMQ_SSL_ENABLED", "false").lower() == "true"
|
||||||
|
|
||||||
# Redis configuration
|
# Redis configuration
|
||||||
REDIS_HOST = os.getenv('REDIS_HOST', 'redis-scrape.thejobhub.xyz')
|
REDIS_HOST = os.getenv('REDIS_HOST')
|
||||||
REDIS_PORT = int(os.getenv('REDIS_PORT', '6380'))
|
REDIS_PORT = int(os.getenv('REDIS_PORT', '6380'))
|
||||||
REDIS_PASSWORD = os.getenv('REDIS_PASSWORD')
|
REDIS_PASSWORD = os.getenv('REDIS_PASSWORD')
|
||||||
REDIS_SSL_ENABLED = os.getenv('REDIS_SSL_ENABLED', 'true').lower() == 'true'
|
REDIS_SSL_ENABLED = os.getenv('REDIS_SSL_ENABLED', 'true').lower() == 'true'
|
||||||
@ -389,24 +389,25 @@ class MultiPlatformJobScraper:
|
|||||||
|
|
||||||
success = False
|
success = False
|
||||||
if refined_data and refined_data.get("title", "N/A") != "N/A":
|
if refined_data and refined_data.get("title", "N/A") != "N/A":
|
||||||
# Check if description is missing or empty
|
# Define all compulsory fields that must be present and valid
|
||||||
description = refined_data.get("description", "").strip()
|
compulsory_fields = ['company_name', 'job_id', 'url', 'title', 'description']
|
||||||
if not description or description in ["N/A", "Unknown", ""]:
|
|
||||||
logger.error(f"❌ Job discarded - missing description: {final_url}")
|
# Validate all compulsory fields
|
||||||
|
missing_fields = []
|
||||||
|
for field in compulsory_fields:
|
||||||
|
field_value = refined_data.get(field, "").strip()
|
||||||
|
if not field_value or field_value in ["N/A", "Unknown", ""]:
|
||||||
|
missing_fields.append(field)
|
||||||
|
|
||||||
|
# If any compulsory field or description is missing, discard the job
|
||||||
|
if missing_fields:
|
||||||
|
logger.error(f"❌ Job discarded - missing compulsory fields {missing_fields}: {final_url}")
|
||||||
await self._add_job_to_redis_cache(final_url, job_id, "job_not_found")
|
await self._add_job_to_redis_cache(final_url, job_id, "job_not_found")
|
||||||
self.engine.report_outcome("job_not_found", url=final_url)
|
self.engine.report_outcome("job_not_found", url=final_url)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
compulsory_fields = ['company_name', 'job_id', 'url']
|
# If we get here, all compulsory fields are present and valid
|
||||||
for field in compulsory_fields:
|
# Update with additional metadata
|
||||||
if not refined_data.get(field) or refined_data[field] in ["N/A", "", "Unknown"]:
|
|
||||||
if field == 'job_id':
|
|
||||||
refined_data[field] = job_id
|
|
||||||
elif field == 'url':
|
|
||||||
refined_data[field] = final_url
|
|
||||||
elif field == 'company_name':
|
|
||||||
refined_data[field] = company_name
|
|
||||||
|
|
||||||
refined_data.update({
|
refined_data.update({
|
||||||
'apply_type': apply_type,
|
'apply_type': apply_type,
|
||||||
'scraped_at': datetime.now().isoformat(),
|
'scraped_at': datetime.now().isoformat(),
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user