diff --git a/scraper.py b/scraper.py
index e5ef051..c3db070 100644
--- a/scraper.py
+++ b/scraper.py
@@ -26,12 +26,12 @@ logging.basicConfig(level=logging.INFO,
 logger = logging.getLogger(__name__)
 
 # Environment variables
-RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "localhost")
+RABBITMQ_HOST = os.getenv("RABBITMQ_HOST")
 RABBITMQ_PORT = int(os.getenv("RABBITMQ_PORT", "5671"))
 RABBITMQ_SSL_ENABLED = os.getenv("RABBITMQ_SSL_ENABLED", "false").lower() == "true"
 
 # Redis configuration
-REDIS_HOST = os.getenv('REDIS_HOST', 'redis-scrape.thejobhub.xyz')
+REDIS_HOST = os.getenv('REDIS_HOST')
 REDIS_PORT = int(os.getenv('REDIS_PORT', '6380'))
 REDIS_PASSWORD = os.getenv('REDIS_PASSWORD')
 REDIS_SSL_ENABLED = os.getenv('REDIS_SSL_ENABLED', 'true').lower() == 'true'
@@ -389,24 +389,25 @@ class MultiPlatformJobScraper:
         success = False
 
         if refined_data and refined_data.get("title", "N/A") != "N/A":
-            # Check if description is missing or empty
-            description = refined_data.get("description", "").strip()
-            if not description or description in ["N/A", "Unknown", ""]:
-                logger.error(f"❌ Job discarded - missing description: {final_url}")
+            # Define all compulsory fields that must be present and valid
+            compulsory_fields = ['company_name', 'job_id', 'url', 'title', 'description']
+
+            # Validate all compulsory fields (str() guards .strip() against None)
+            missing_fields = []
+            for field in compulsory_fields:
+                field_value = str(refined_data.get(field) or "").strip()
+                if not field_value or field_value in ["N/A", "Unknown"]:
+                    missing_fields.append(field)
+
+            # If any compulsory field is missing, discard the job
+            if missing_fields:
+                logger.error(f"❌ Job discarded - missing compulsory fields {missing_fields}: {final_url}")
                 await self._add_job_to_redis_cache(final_url, job_id, "job_not_found")
                 self.engine.report_outcome("job_not_found", url=final_url)
                 return False
 
-            compulsory_fields = ['company_name', 'job_id', 'url']
-            for field in compulsory_fields:
-                if not refined_data.get(field) or refined_data[field] in ["N/A", "", "Unknown"]:
-                    if field == 'job_id':
-                        refined_data[field] = job_id
-                    elif field == 'url':
-                        refined_data[field] = final_url
-                    elif field == 'company_name':
-                        refined_data[field] = company_name
-
+            # If we get here, all compulsory fields are present and valid
+            # Update with additional metadata
             refined_data.update({
                 'apply_type': apply_type,
                 'scraped_at': datetime.now().isoformat(),
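
Note for reviewers: with the hardcoded fallbacks removed, `RABBITMQ_HOST` and `REDIS_HOST` must now be supplied via the environment (`os.getenv` returns `None` when a variable is unset), so a missing value surfaces at connection time instead of silently targeting a stale host. The snippet below is a standalone sketch of the new compulsory-field rule so it can be tested in isolation; the helper name `find_missing_fields` and the sample payload are illustrative, not code from this PR.

```python
# Sketch of the validation rule introduced in the diff above.
# `find_missing_fields` is a hypothetical helper mirroring the PR's logic,
# not part of scraper.py.
COMPULSORY_FIELDS = ['company_name', 'job_id', 'url', 'title', 'description']

def find_missing_fields(refined_data: dict) -> list:
    """Return the compulsory fields that are absent, blank, or placeholder-valued."""
    missing = []
    for field in COMPULSORY_FIELDS:
        # Coerce to str so a None or numeric value cannot crash .strip()
        value = str(refined_data.get(field) or "").strip()
        if not value or value in ["N/A", "Unknown"]:
            missing.append(field)
    return missing

# Example: a placeholder company name and an absent description both fail,
# so a job like this is discarded rather than patched with fallback values.
job = {
    "title": "Backend Engineer",
    "job_id": "123",
    "url": "https://example.com/jobs/123",
    "company_name": "Unknown",
}
assert find_missing_fields(job) == ["company_name", "description"]
```

Unlike the removed code path, which backfilled `job_id`, `url`, and `company_name` from scraper context, any invalid compulsory field now discards the job outright, so partially scraped records no longer reach `refined_data.update(...)`.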