Refactor RedisManager methods for improved error handling and logging; streamline job validation process by ensuring all compulsory fields are checked before processing.
parent 87c67265f8
commit e2e1bc442e

Changed file: scraper.py (73)
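The core of the validation change is easiest to read in isolation. Below is a minimal sketch of the new compulsory-field check, assembled from the new-side lines of the diff that follows; the standalone find_missing_fields helper, the PLACEHOLDER_VALUES constant, and the example record are illustrative only (in scraper.py the same logic runs inline in the job-processing path of MultiPlatformJobScraper):

# Values treated as "missing" even when a field is technically present (from the new code below).
PLACEHOLDER_VALUES = {"", "N/A", "Unknown", "Not provided", "Not available", "Company", "Job"}

def find_missing_fields(refined_data: dict) -> list:
    """Return the compulsory fields that are absent or hold placeholder values."""
    compulsory_fields = ['title', 'company_name', 'description', 'job_id', 'url']
    missing_fields = []
    for field in compulsory_fields:
        field_value = refined_data.get(field, "")
        # Empty values and generic placeholders both count as missing.
        if not field_value or str(field_value).strip() in PLACEHOLDER_VALUES:
            missing_fields.append(field)
    return missing_fields

# Example: a placeholder company name is enough to discard the job up front.
print(find_missing_fields({"title": "Backend Engineer", "company_name": "Company",
                           "description": "Build services.", "job_id": "abc123",
                           "url": "https://example.com/jobs/abc123"}))
# -> ['company_name']

If the list is non-empty, the scraper caches the URL with error_type "missing_compulsory_fields", reports that outcome to the engine, and returns False before any metadata is attached or the job is saved, as the diff below shows.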
--- a/scraper.py
+++ b/scraper.py
@@ -15,28 +15,22 @@ import logging
 from tenacity import retry, stop_after_attempt, wait_exponential
 from scraping_engine import FingerprintScrapingEngine
 from dotenv import load_dotenv
 from ssl_connection import create_ssl_connection_parameters  # Import from ssl.py
 import redis

 load_dotenv()

 # Configure logging
 logging.basicConfig(level=logging.INFO,
                     format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)

 # Environment variables
 RABBITMQ_HOST = os.getenv("RABBITMQ_HOST")
 RABBITMQ_PORT = int(os.getenv("RABBITMQ_PORT", "5671"))
 RABBITMQ_SSL_ENABLED = os.getenv("RABBITMQ_SSL_ENABLED", "false").lower() == "true"

 # Redis configuration
 REDIS_HOST = os.getenv('REDIS_HOST')
 REDIS_PORT = int(os.getenv('REDIS_PORT', '6380'))
 REDIS_PASSWORD = os.getenv('REDIS_PASSWORD')
 REDIS_SSL_ENABLED = os.getenv('REDIS_SSL_ENABLED', 'true').lower() == 'true'


 class RedisManager:
     """Manages Redis connection and operations for job tracking and caching."""

@@ -145,14 +139,12 @@ class MultiPlatformJobScraper:
                 await self.close_browser()
             except:
                 await self.close_browser()

         if self.browser is None:
             try:
                 profile = self.engine._select_profile()
                 renderer = random.choice(self.engine.common_renderers[self.engine.os])
                 vendor = random.choice(self.engine.common_vendors)
                 spoof_script = self.engine._get_spoof_script(renderer, vendor)

                 self.pw = await async_playwright().start()
                 self.browser = await self.pw.chromium.launch(
                     headless=True,
@@ -178,7 +170,6 @@ class MultiPlatformJobScraper:
             logger.warning("Browser appears dead. Reinitializing...")
             await self.close_browser()
             await self.init_browser()

         profile = self.engine._select_profile()
         context = await AsyncNewContext(self.browser, fingerprint=profile)
         await context.add_init_script(f"""
@@ -274,43 +265,36 @@ class MultiPlatformJobScraper:
         if platform == "unknown":
             logger.info(f"⏭️ Skipping unsupported platform: {job_url}")
             return True

         job_id = job_url.strip("/").split("/")[-1]
         if await self._is_job_seen(job_id):
             logger.info(f"⏭️ Skipping already processed job: {job_id}")
             return True

         cached_result = await self._get_cached_llm_result(job_url)
         if cached_result:
             logger.info(f"📦 Using cached LLM result for: {job_url}")
             await self.llm_agent.save_job_data(cached_result, company_name)
             await self._mark_job_seen(job_id)
             return True

         context = None
         page = None
         start_time = time.time()
         try:
             context = await self.create_fresh_context()
             page = await context.new_page()

             timeout_ms = self.engine.optimization_params.get("request_timeout", 120000)
             temp_fetcher = StealthyFetcher(self.engine, self.browser, context)

             fetch_timeout = 60000 if platform == "lever" else timeout_ms
             job_page = await asyncio.wait_for(
                 temp_fetcher.fetch_url(job_url, wait_for_selector="h1", timeout=fetch_timeout),
                 timeout=fetch_timeout / 1000.0
             )

             # Check if job still exists (minimal content validation)
             page_content = await job_page.content()
             if len(page_content.strip()) < 500:  # Arbitrary threshold for "page exists"
                 logger.error(f"❌ Job no longer exists (empty/deleted): {job_url}")
                 await self._add_job_to_redis_cache(job_url, job_id, "job_not_found")
                 self.engine.report_outcome("job_not_found", url=job_url)
                 return False

             if platform == "ashby":
                 try:
                     await job_page.wait_for_selector("div[class*='job-posting'], article, main", timeout=60000)
@@ -324,7 +308,6 @@ class MultiPlatformJobScraper:
                     await job_page.wait_for_selector("div.job-desc, section", timeout=60000)
                 except Exception:
                     pass

             # Extract page content for initial validation
             page_content = await self._extract_page_content_for_llm(job_page)

@@ -346,14 +329,13 @@ class MultiPlatformJobScraper:
                 await self._add_job_to_redis_cache(job_url, job_id, "job_not_found")
                 self.engine.report_outcome("job_not_found", url=job_url)
                 return False

             # 🔑 APPLY TYPE LOGIC
             if platform in ["ashby", "lever", "greenhouse"]:
                 apply_type = 'AI'  # Always AI for these platforms
             else:
                 # For other platforms: check if form is accessible without login
                 apply_btn = await job_page.query_selector("button:has-text('Apply for this job'), button:has-text('Apply now')")
                 apply_type = 'signup'  # default
                 if apply_btn:
                     await self._human_click(job_page, apply_btn)
                     speed = self.engine.optimization_params.get("base_delay", 2.0)
@@ -368,11 +350,9 @@ class MultiPlatformJobScraper:
                         apply_type = 'signup'
                 else:
                     apply_type = 'signup'

             final_url = job_url
             # Hardcode posted_date to Dec 1st 2025
             posted_date = "12/01/25"

             raw_data = {
                 "page_content": page_content,
                 "url": final_url,
@@ -380,34 +360,32 @@ class MultiPlatformJobScraper:
                 "search_keywords": company_name,
                 "posted_date": posted_date
             }

             llm_timeout = max(60, self.engine.feedback.get("avg_response_time", 10) * 2)
             refined_data = await asyncio.wait_for(
                 self.llm_agent.refine_job_data(raw_data, self.user_request),
                 timeout=llm_timeout
             )

             success = False
-            if refined_data and refined_data.get("title", "N/A") != "N/A":
-                # Define all compulsory fields that must be present and valid
-                compulsory_fields = ['company_name', 'job_id', 'url', 'title', 'description']
+            if refined_data:
+                # Define compulsory fields that must be present and valid
+                compulsory_fields = ['title', 'company_name', 'description', 'job_id', 'url']

-                # Validate all compulsory fields
+                # Check if ALL compulsory fields are present and valid BEFORE any processing
                 missing_fields = []
                 for field in compulsory_fields:
-                    field_value = refined_data.get(field, "").strip()
-                    if not field_value or field_value in ["N/A", "Unknown", ""]:
+                    field_value = refined_data.get(field, "")
+                    if not field_value or str(field_value).strip() in ["", "N/A", "Unknown", "Not provided", "Not available", "Company", "Job"]:
                         missing_fields.append(field)

-                # If any compulsory field or description is missing, discard the job
+                # If any compulsory field is missing, discard the job immediately
                 if missing_fields:
-                    logger.error(f"❌ Job discarded - missing compulsory fields {missing_fields}: {final_url}")
-                    await self._add_job_to_redis_cache(final_url, job_id, "job_not_found")
-                    self.engine.report_outcome("job_not_found", url=final_url)
+                    logger.error(f"❌ Job discarded - missing compulsory fields: {', '.join(missing_fields)} : {final_url}")
+                    error_type = "missing_compulsory_fields"
+                    await self._add_job_to_redis_cache(final_url, job_id, error_type)
+                    self.engine.report_outcome(error_type, url=final_url)
                     return False

-                # If we get here, all compulsory fields are present and valid
-                # Update with additional metadata
+                # If we get here, all compulsory fields are valid - now add additional metadata
                 refined_data.update({
                     'apply_type': apply_type,
                     'scraped_at': datetime.now().isoformat(),
@@ -417,6 +395,7 @@ class MultiPlatformJobScraper:
                     'platform': platform
                 })

+                # Save to database and markdown
                 await self.llm_agent.save_job_data(refined_data, company_name)
                 await self._cache_llm_result(job_url, refined_data)
                 await self._mark_job_seen(job_id)
@@ -429,9 +408,7 @@ class MultiPlatformJobScraper:
                 logger.warning(f"🟡 LLM failed to refine: {final_url}")
                 await self._add_job_to_redis_cache(final_url, job_id, "llm_failure")
                 self.engine.report_outcome("llm_failure", url=final_url)

             return success

         except asyncio.TimeoutError:
             logger.error(f"⏰ Timeout processing job ({platform}): {job_url}")
             await self._add_job_to_redis_cache(job_url, job_id, "timeout")
@@ -468,7 +445,6 @@ class MultiPlatformJobScraper:
                     await context.close()
                 except Exception:
                     pass

 # Global metrics
 METRICS = {
     "processed": 0,
@@ -477,18 +453,14 @@ METRICS = {
     "skipped": 0,
     "start_time": time.time()
 }


 async def process_message_async(scraper: MultiPlatformJobScraper, ch, method, properties, body):
     try:
         job_data = json.loads(body)
         job_link = job_data['job_link']
         company_name = job_data['company_name']
         message_id = properties.message_id or f"msg_{int(time.time()*1000)}"

         logger.info(f"📥 Processing job: {job_link} (ID: {message_id})")
         success = await scraper.scrape_job(job_link, company_name, message_id)

         METRICS["processed"] += 1
         if success:
             METRICS["success"] += 1
@@ -502,14 +474,10 @@ async def process_message_async(scraper: MultiPlatformJobScraper, ch, method, pr
         METRICS["failed"] += 1
     finally:
         ch.basic_ack(delivery_tag=method.delivery_tag)


 def callback_wrapper(scraper: MultiPlatformJobScraper):
     def callback(ch, method, properties, body):
         asyncio.run(process_message_async(scraper, ch, method, properties, body))
     return callback


 def start_consumer():
     engine = FingerprintScrapingEngine(
         seed="multiplatform_scraper",
@@ -517,7 +485,6 @@ def start_consumer():
         num_variations=10
     )
     scraper = MultiPlatformJobScraper(engine)

     connection = None
     for attempt in range(5):
         try:
@@ -527,22 +494,18 @@ def start_consumer():
                 logger.info(f"Connecting to RabbitMQ over SSL at {RABBITMQ_HOST}:{RABBITMQ_PORT}")
             else:
                 logger.info(f"Connecting to RabbitMQ at {RABBITMQ_HOST}:{RABBITMQ_PORT}")

             connection = pika.BlockingConnection(parameters)
             break
         except Exception as e:
             logger.error(f"RabbitMQ connection attempt {attempt + 1} failed: {e}")
             time.sleep(2 ** attempt)

     if not connection:
         logger.error("Failed to connect to RabbitMQ after retries")
         return

     channel = connection.channel()
     channel.queue_declare(queue='job_queue', durable=True)
     channel.basic_qos(prefetch_count=1)
     channel.basic_consume(queue='job_queue', on_message_callback=callback_wrapper(scraper))

     logger.info('Waiting for messages (Ashby, Lever, Greenhouse). To exit press CTRL+C')
     try:
         channel.start_consuming()
@@ -551,7 +514,5 @@ def start_consumer():
         channel.stop_consuming()
         connection.close()
         asyncio.run(scraper.close_browser())


 if __name__ == "__main__":
     start_consumer()