Compare commits
No commits in common. "0c447d0f77357e2e00e4103abefe294c28516f80" and "c0c7925be3804e81f7b75b6728c166a3751ec8ea" have entirely different histories.
0c447d0f77
...
c0c7925be3
13
scraper.py
13
scraper.py
@ -23,13 +23,12 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# Environment variables
|
# Environment variables
|
||||||
RABBITMQ_HOST = os.getenv("RABBITMQ_HOST")
|
RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitq.thejobhub.xyz")
|
||||||
RABBITMQ_PORT = os.getenv("RABBITMQ_PORT")
|
RABBITMQ_PORT = int(os.getenv("RABBITMQ_PORT", "5672"))
|
||||||
RABBITMQ_USER = os.getenv("RABBITMQ_USER")
|
RABBITMQ_USER = os.getenv("RABBITMQ_USER", "guest")
|
||||||
RABBITMQ_PASS = os.getenv("RABBITMQ_PASS")
|
RABBITMQ_PASS = os.getenv("RABBITMQ_PASS", "guest")
|
||||||
REDIS_HOST = os.getenv("REDIS_HOST")
|
REDIS_HOST = os.getenv("REDIS_HOST", "redis-scrape.thejobhub.xyz")
|
||||||
REDIS_PORT = os.getenv("REDIS_PORT")
|
REDIS_PORT = int(os.getenv("REDIS_PORT", "6379"))
|
||||||
|
|
||||||
|
|
||||||
class AshbyJobScraper:
|
class AshbyJobScraper:
|
||||||
def __init__(
|
def __init__(
|
||||||
|
|||||||
27
sender.py
27
sender.py
@ -1,4 +1,3 @@
|
|||||||
|
|
||||||
import csv
|
import csv
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
@ -10,6 +9,7 @@ import uuid
|
|||||||
from configparser import ConfigParser
|
from configparser import ConfigParser
|
||||||
import pika
|
import pika
|
||||||
import redis
|
import redis
|
||||||
|
import os
|
||||||
|
|
||||||
class Sender:
|
class Sender:
|
||||||
def __init__(self, config_file='config.ini'):
|
def __init__(self, config_file='config.ini'):
|
||||||
@ -17,19 +17,13 @@ class Sender:
|
|||||||
self.config.read(config_file)
|
self.config.read(config_file)
|
||||||
|
|
||||||
# RabbitMQ from env vars with fallbacks
|
# RabbitMQ from env vars with fallbacks
|
||||||
self.rabbitmq_host = os.getenv("RABBITMQ_HOST")
|
self.rabbitmq_host = os.getenv("RABBITMQ_HOST", self.config.get('rabbitmq', 'url', fallback='rabbitq.thejobhub.xyz'))
|
||||||
self.rabbitmq_port = os.getenv("RABBITMQ_PORT")
|
self.rabbitmq_port = int(os.getenv("RABBITMQ_PORT", self.config.get('rabbitmq', 'port', fallback='5672')))
|
||||||
self.username = os.getenv("RABBITMQ_USER")
|
self.username = os.getenv("RABBITMQ_USER", self.config.get('rabbitmq', 'username', fallback='guest'))
|
||||||
self.password = os.getenv("RABBITMQ_PASS")
|
self.password = os.getenv("RABBITMQ_PASS", self.config.get('rabbitmq', 'password', fallback='guest'))
|
||||||
self.queue_name = self.config.get('rabbitmq', 'queue_name', fallback='job_queue')
|
self.queue_name = self.config.get('rabbitmq', 'queue_name', fallback='job_queue')
|
||||||
self.directory = self.config.get('files', 'directory', fallback=os.path.join(os.path.expanduser("~"), "jobs", "csv"))
|
self.directory = self.config.get('files', 'directory', fallback='/var/jobs/csv')
|
||||||
|
self.log_file = self.config.get('logging', 'log_file', fallback='/var/logs/sender.log')
|
||||||
# Cross-platform log path: use user's home directory
|
|
||||||
default_log_dir = os.path.join(os.path.expanduser("~"), ".web_scraping_project", "logs")
|
|
||||||
os.makedirs(default_log_dir, exist_ok=True)
|
|
||||||
default_log_file = os.path.join(default_log_dir, "sender.log")
|
|
||||||
self.log_file = self.config.get('logging', 'log_file', fallback=default_log_file)
|
|
||||||
|
|
||||||
self.virtual_host = self.config.get('rabbitmq', 'virtual_hash', fallback='/')
|
self.virtual_host = self.config.get('rabbitmq', 'virtual_hash', fallback='/')
|
||||||
self.batch_size = 500
|
self.batch_size = 500
|
||||||
self.retry_attempts = 5 # Increased for robustness
|
self.retry_attempts = 5 # Increased for robustness
|
||||||
@ -37,13 +31,10 @@ class Sender:
|
|||||||
self.check_interval = 30 # More frequent polling
|
self.check_interval = 30 # More frequent polling
|
||||||
|
|
||||||
# Redis for deduplication
|
# Redis for deduplication
|
||||||
redis_host = os.getenv("REDIS_HOST")
|
redis_host = os.getenv("REDIS_HOST", "redis-scrape.thejobhub.xyz")
|
||||||
redis_port = os.getenv("REDIS_PORT")
|
redis_port = int(os.getenv("REDIS_PORT", "6379"))
|
||||||
self.redis_client = redis.Redis(host=redis_host, port=redis_port, db=1, decode_responses=True)
|
self.redis_client = redis.Redis(host=redis_host, port=redis_port, db=1, decode_responses=True)
|
||||||
|
|
||||||
# Ensure log directory exists before configuring logging
|
|
||||||
log_dir = os.path.dirname(self.log_file)
|
|
||||||
os.makedirs(log_dir, exist_ok=True)
|
|
||||||
logging.basicConfig(filename=self.log_file, level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
logging.basicConfig(filename=self.log_file, level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
||||||
self.logger = logging.getLogger(__name__)
|
self.logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user