Increase timeout for selector waits and refine job extraction logic in LLMJobRefiner and CryptoJobScraper
parent 38ef08c734
commit 06f8e8b086
@@ -27,7 +27,7 @@ class StealthyFetcher:
        if wait_for_selector:
            try:
-                await page.wait_for_selector(wait_for_selector, timeout=40000)
+                await page.wait_for_selector(wait_for_selector, timeout=120000)
            except PlaywrightTimeoutError:
                print(f"Selector {wait_for_selector} not found immediately, continuing...")
@@ -88,7 +88,7 @@ class StealthyFetcher:
    async def _is_content_accessible(self, page: Page, wait_for_selector: Optional[str] = None) -> bool:
        if wait_for_selector:
            try:
-                await page.wait_for_selector(wait_for_selector, timeout=40000)
+                await page.wait_for_selector(wait_for_selector, timeout=120000)
                return True
            except PlaywrightTimeoutError:
                pass
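For context on the timeout bump above, here is a minimal standalone sketch of the same wait-and-continue pattern with Playwright's async API; the URL and selector below are placeholders, not part of the commit.

```python
import asyncio
from typing import Optional

from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError


async def wait_for_selector_soft(page, selector: Optional[str], timeout_ms: int = 120000) -> bool:
    """Return True if the selector appeared in time, False on timeout (never raises)."""
    if not selector:
        return True
    try:
        await page.wait_for_selector(selector, timeout=timeout_ms)
        return True
    except PlaywrightTimeoutError:
        print(f"Selector {selector} not found within {timeout_ms} ms, continuing...")
        return False


async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        await page.goto("https://example.com")  # placeholder URL
        print("content accessible:", await wait_for_selector_soft(page, "h1"))
        await browser.close()


asyncio.run(main())
```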
llm_agent.py
@@ -21,13 +21,12 @@ class LLMJobRefiner:
            raise ValueError("DEEPSEEK_API_KEY not found in .env file.")

        # Database credentials from .env
-        self.db_url = os.getenv("DB_URL")
        self.db_username = os.getenv("DB_USERNAME")
        self.db_password = os.getenv("DB_PASSWORD")
        self.db_host = os.getenv("DB_HOST")
        self.db_port = os.getenv("DB_PORT")

-        if not self.db_url or not self.db_username or not self.db_password:
+        if not self.db_username or not self.db_password:
            raise ValueError("Database credentials not found in .env file.")

        # DeepSeek uses OpenAI-compatible API
@@ -41,22 +40,12 @@ class LLMJobRefiner:
    def _init_db(self):
        """Initialize PostgreSQL database connection and create table"""
        try:
-            self.db_url = os.getenv("DB_URL")
-            if self.db_url and "supabase.com" in self.db_url:
-                conn = psycopg2.connect(
-                    host=self.db_host,
-                    port=self.db_port,
-                    database="postgres",
-                    user=self.db_username,
-                    password=self.db_password
-                )
-            else:
-                conn = psycopg2.connect(
-                    host=self.db_host,
-                    port=self.db_port,
-                    database="postgres",
-                    user=self.db_username,
-                    password=self.db_password
+            conn = psycopg2.connect(
+                host=self.db_host,
+                port=self.db_port,
+                database="postgres",
+                user=self.db_username,
+                password=self.db_password
            )
            cursor = conn.cursor()

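A minimal sketch of the simplified connection path, assuming the same .env variable names shown in the hunk (DB_HOST, DB_PORT, DB_USERNAME, DB_PASSWORD) and that python-dotenv is installed:

```python
import os

import psycopg2
from dotenv import load_dotenv

load_dotenv()


def connect():
    # Single connection path; the old supabase.com special case built an
    # identical connection anyway, so one branch is enough.
    return psycopg2.connect(
        host=os.getenv("DB_HOST"),
        port=os.getenv("DB_PORT"),
        database="postgres",
        user=os.getenv("DB_USERNAME"),
        password=os.getenv("DB_PASSWORD"),
    )


if __name__ == "__main__":
    conn = connect()
    with conn.cursor() as cur:
        cur.execute("SELECT 1")
        print(cur.fetchone())
    conn.close()
```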
@@ -113,8 +102,8 @@ class LLMJobRefiner:
            text = re.sub(r'\s+', ' ', text)

            # Limit length for LLM context
-            if len(text) > 10000:
-                text = text[:10000] + "..."
+            if len(text) > 100000:
+                text = text[:100000] + "..."

            return text
        except Exception as e:
@@ -128,7 +117,7 @@ class LLMJobRefiner:
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
-            temperature=0.2,
+            temperature=0.1,
            max_tokens=2048,
            stream=False
        )
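The request above goes through an OpenAI-compatible client. A hedged sketch of the same request shape, assuming DeepSeek's documented base URL and the deepseek-chat model name (verify both against the provider docs before relying on them):

```python
import os

from openai import OpenAI

# Assumed endpoint and model name; check DeepSeek's current documentation.
client = OpenAI(api_key=os.getenv("DEEPSEEK_API_KEY"), base_url="https://api.deepseek.com")

response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[{"role": "user", "content": "Reply with the single word OK."}],
    temperature=0.1,   # matches the lower temperature introduced in this commit
    max_tokens=2048,
    stream=False,
)
print(response.choices[0].message.content)
```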
@@ -145,38 +134,52 @@ class LLMJobRefiner:
        posted_date = raw_data.get('posted_date', datetime.now().strftime("%m/%d/%y"))

        prompt = f"""
-        You are a job posting data extractor.
-
-        EXTRACT EXACT TEXT - DO NOT SUMMARIZE, PARAPHRASE, OR INVENT.
-
-        For these critical fields, follow these rules:
-        - description: Extract ALL job description text. If ANY job details exist (duties, responsibilities, overview), include them. Only use "Not provided" if absolutely no description exists.
-        - requirements: Extract ALL requirements text. If ANY requirements exist (skills, experience, education needed), include them. Only use "Not provided" if none exist.
-        - qualifications: Extract ALL qualifications text. If ANY qualifications exist, include them. Only use "Not provided" if none exist.
-
-        REQUIRED FIELDS (must have valid values, never "N/A"):
-        - title, company_name, job_id, url
-
-        OPTIONAL FIELDS (can be "Not provided"):
-        - location, salary_range, nature_of_work
-
-        Page Content:
-        {cleaned_content}
-
-        Response format (ONLY return this JSON):
-        {{
-            "title": "...",
-            "company_name": "...",
-            "location": "...",
-            "description": "...",
-            "requirements": "...",
-            "qualifications": "...",
-            "salary_range": "...",
-            "nature_of_work": "...",
-            "job_id": "{job_id}",
-            "url": "{url}"
-        }}
-        """
+        You are an expert job posting data extractor. Your task is to extract AND infer fields from the provided job posting.
+
+        ### CORE RULES:
+        1. **NEVER invent, summarize, or paraphrase** — extract **exact wording** when available.
+        2. **For critical fields (title, company_name, job_id, url, description):**
+           - These MUST be present and meaningful.
+           - If not explicitly stated, **infer from context** (e.g., page title, headings, "About Us", etc.).
+           - **NEVER return "Not provided" or "N/A" for these fields.**
+        3. **For optional fields (location, salary_range, etc.):**
+           - Extract exact text if present.
+           - If absent but inferable (e.g., "Remote (US)", "Full-time"), **infer it**.
+           - Only return "Not provided" if truly absent and non-inferable.
+
+        ### FIELD DEFINITIONS:
+        - **title**: The job title. Look in <h1>, page title, or bold headings.
+        - **company_name**: Company name. Look in logo alt text, footer, "About [X]", or page title.
+        - **description**: Main job overview, responsibilities, or duties. Combine relevant paragraphs if needed. **Must not be empty.**
+        - **requirements**: Required skills, experience, or qualifications.
+        - **qualifications**: Educational or certification requirements.
+        - **location**: Office location or remote policy.
+        - **salary_range**: Exact compensation info.
+        - **nature_of_work**: Employment type (Full-time, Contract, Internship, etc.).
+
+        ### OUTPUT FORMAT:
+        Return ONLY a valid JSON object with these keys:
+        {{
+            "title": "...",
+            "company_name": "...",
+            "location": "...",
+            "description": "...",
+            "requirements": "...",
+            "qualifications": "...",
+            "salary_range": "...",
+            "nature_of_work": "...",
+            "job_id": "{job_id}",
+            "url": "{url}"
+        }}
+
+        - **Critical fields must NEVER be "Not provided", "N/A", empty, or generic** (e.g., "Company", "Job Title").
+        - **Optional fields may be "Not provided" ONLY if truly absent.**
+        - **Do not include markdown, explanations, or extra text.**
+        - **Use double quotes for JSON.**
+
+        Page Content:
+        {cleaned_content}
+        """

        try:
            response_text = await asyncio.get_event_loop().run_in_executor(
@@ -188,31 +191,23 @@ class LLMJobRefiner:
            if not refined_data:
                return None

-            # Validate required fields
-            required_fields = ['title', 'company_name', 'job_id', 'url']
-            for field in required_fields:
-                if not refined_data.get(field) or refined_data[field].strip() in ["N/A", "", "Unknown", "Company", "Job"]:
-                    return None
-
-            # CRITICAL: Validate content fields - check if they SHOULD exist
-            content_fields = ['description', 'requirements', 'qualifications']
-            cleaned_original = cleaned_content.lower()
-
-            # Simple heuristic: if page contains job-related keywords, content fields should NOT be "Not provided"
-            job_indicators = ['responsibilit', 'duties', 'require', 'qualifi', 'skill', 'experienc', 'educat', 'degree', 'bachelor', 'master']
-            has_job_content = any(indicator in cleaned_original for indicator in job_indicators)
-
-            if has_job_content:
-                for field in content_fields:
-                    value = refined_data.get(field, "").strip()
-                    if value in ["Not provided", "N/A", ""]:
-                        # LLM failed to extract existing content
-                        print(f" ⚠️ LLM returned '{value}' for {field} but job content appears present")
-                        return None
-
-            # Add the posted_date to the refined data
+            # Validate critical fields — reject if missing or placeholder
+            critical_fields = ['title', 'company_name', 'job_id', 'url', 'description']
+            for field in critical_fields:
+                value = refined_data.get(field, "").strip()
+                if not value or value.lower() in ["n/a", "not provided", "unknown", "company", "job", "title", ""]:
+                    print(f" ❌ Critical field '{field}' is invalid: '{value}'")
+                    return None  # This job will NOT be saved — as per requirement
+
+            # Optional fields: allow "Not provided", but ensure they're strings
+            optional_fields = ['location', 'requirements', 'qualifications', 'salary_range', 'nature_of_work']
+            for field in optional_fields:
+                if field not in refined_data:
+                    refined_data[field] = "Not provided"
+                elif not isinstance(refined_data[field], str):
+                    refined_data[field] = str(refined_data[field])
+
            refined_data['posted_date'] = posted_date

            return refined_data

        except Exception as e:
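A standalone sketch of the validation policy introduced above: critical fields must be real values, optional fields are coerced to strings. The sample record is illustrative.

```python
from typing import Any, Dict, Optional

CRITICAL_FIELDS = ["title", "company_name", "job_id", "url", "description"]
OPTIONAL_FIELDS = ["location", "requirements", "qualifications", "salary_range", "nature_of_work"]
PLACEHOLDERS = {"n/a", "not provided", "unknown", "company", "job", "title", ""}


def validate_refined(refined: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    # Reject the whole record if any critical field is missing or a placeholder.
    for field in CRITICAL_FIELDS:
        value = str(refined.get(field, "")).strip()
        if not value or value.lower() in PLACEHOLDERS:
            print(f"Critical field '{field}' is invalid: '{value}'")
            return None
    # Optional fields may be "Not provided", but must exist and be strings.
    for field in OPTIONAL_FIELDS:
        if field not in refined:
            refined[field] = "Not provided"
        elif not isinstance(refined[field], str):
            refined[field] = str(refined[field])
    return refined


# Rejected: "N/A" is a placeholder for a critical field.
print(validate_refined({"title": "Backend Engineer", "company_name": "Acme",
                        "job_id": "123", "url": "https://example.com/jobs/123",
                        "description": "N/A"}))
```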
@@ -220,15 +215,22 @@ class LLMJobRefiner:
            return None

    def _parse_llm_response(self, response_text: str) -> Dict[str, Any]:
+        # Try to extract JSON from markdown code block
        json_match = re.search(r'```(?:json)?\s*({.*?})\s*```', response_text, re.DOTALL)
        if not json_match:
-            json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
+            # Try to find raw JSON object
+            json_match = re.search(r'\{[^{}]*\{[^{}]*\}[^{}]*\}|\{.*\}', response_text, re.DOTALL)
        if not json_match:
            return None

        try:
-            return json.loads(json_match.group(1) if '```' in response_text else json_match.group(0))
-        except json.JSONDecodeError:
+            json_str = json_match.group(1) if '```' in response_text else json_match.group(0)
+            # Clean common issues
+            json_str = re.sub(r'\s+', ' ', json_str)
+            json_str = re.sub(r',\s*([\]}\)])', r'\1', json_str)  # Remove trailing commas
+            return json.loads(json_str)
+        except json.JSONDecodeError as e:
+            print(f"JSON parsing error: {e}")
            return None

    async def save_job_data(self, job_data: Dict[str, Any], keyword: str):
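A simplified standalone version of the parsing helper above; the nested-object fallback regex is reduced to a plain brace match here, and the sample response is illustrative.

```python
import json
import re
from typing import Any, Dict, Optional


def parse_llm_json(response_text: str) -> Optional[Dict[str, Any]]:
    # Prefer a fenced ```json block, then fall back to the first brace-delimited object.
    match = re.search(r'```(?:json)?\s*({.*?})\s*```', response_text, re.DOTALL)
    if not match:
        match = re.search(r'\{.*\}', response_text, re.DOTALL)
    if not match:
        return None
    json_str = match.group(1) if '```' in response_text else match.group(0)
    json_str = re.sub(r'\s+', ' ', json_str)            # collapse whitespace
    json_str = re.sub(r',\s*([\]}])', r'\1', json_str)  # drop trailing commas
    try:
        return json.loads(json_str)
    except json.JSONDecodeError as e:
        print(f"JSON parsing error: {e}")
        return None


print(parse_llm_json('```json\n{"title": "DeFi Analyst", "company_name": "Acme",}\n```'))
```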
@@ -239,11 +241,11 @@ class LLMJobRefiner:
        """Save job data to PostgreSQL database with job_id uniqueness"""
        try:
            conn = psycopg2.connect(
                host=self.db_host,
                port=self.db_port,
                database="postgres",
                user=self.db_username,
                password=self.db_password
            )
            cursor = conn.cursor()

@@ -254,50 +256,50 @@ class LLMJobRefiner:
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                ON CONFLICT (job_id) DO NOTHING
            ''', (
-                job_data.get("title", "N/A"),
-                job_data.get("company_name", "N/A"),
-                job_data.get("location", "N/A"),
-                job_data.get("description", "N/A"),
-                job_data.get("requirements", "N/A"),
-                job_data.get("qualifications", "N/A"),
-                job_data.get("salary_range", "N/A"),
-                job_data.get("nature_of_work", "N/A"),
-                job_data.get("job_id", "N/A"),
+                job_data.get("title", "Not provided"),
+                job_data.get("company_name", "Not provided"),
+                job_data.get("location", "Not provided"),
+                job_data.get("description", "Not provided"),
+                job_data.get("requirements", "Not provided"),
+                job_data.get("qualifications", "Not provided"),
+                job_data.get("salary_range", "Not provided"),
+                job_data.get("nature_of_work", "Not provided"),
+                job_data.get("job_id", "unknown"),
                job_data.get("url", "N/A"),
-                job_data.get("category", "N/A"),
+                job_data.get("category", "all"),
                job_data.get("scraped_at"),
-                job_data.get("posted_date", "N/A")
+                job_data.get("posted_date", datetime.now().strftime("%m/%d/%y"))
            ))

            conn.commit()
            cursor.close()
            conn.close()

-            print(f" 💾 Saved job to category '{job_data.get('category', 'N/A')}' with job_id: {job_data.get('job_id', 'N/A')}")
+            print(f" 💾 Saved job to category '{job_data.get('category', 'all')}' with job_id: {job_data.get('job_id', 'unknown')}")

        except Exception as e:
            print(f"❌ Database save error: {e}")

    async def _save_to_markdown(self, job_data: Dict[str, Any], keyword: str):
-        os.makedirs("linkedin_jobs", exist_ok=True)
-        filepath = os.path.join("linkedin_jobs", "linkedin_jobs_scraped.md")
+        os.makedirs("crypto_jobs", exist_ok=True)
+        filepath = os.path.join("crypto_jobs", "crypto_jobs_scraped.md")
        write_header = not os.path.exists(filepath) or os.path.getsize(filepath) == 0

        with open(filepath, "a", encoding="utf-8") as f:
            if write_header:
-                f.write(f"# LinkedIn Jobs - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
-            f.write(f"## Job: {job_data.get('title', 'N/A')}\n\n")
+                f.write(f"# Crypto Jobs - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
+            f.write(f"## Job: {job_data.get('title', 'Not provided')}\n\n")
            f.write(f"- **Keyword**: {keyword}\n")
-            f.write(f"- **Company**: {job_data.get('company_name', 'N/A')}\n")
-            f.write(f"- **Location**: {job_data.get('location', 'N/A')}\n")
-            f.write(f"- **Nature of Work**: {job_data.get('nature_of_work', 'N/A')}\n")
-            f.write(f"- **Salary Range**: {job_data.get('salary_range', 'N/A')}\n")
-            f.write(f"- **Job ID**: {job_data.get('job_id', 'N/A')}\n")
+            f.write(f"- **Company**: {job_data.get('company_name', 'Not provided')}\n")
+            f.write(f"- **Location**: {job_data.get('location', 'Not provided')}\n")
+            f.write(f"- **Nature of Work**: {job_data.get('nature_of_work', 'Not provided')}\n")
+            f.write(f"- **Salary Range**: {job_data.get('salary_range', 'Not provided')}\n")
+            f.write(f"- **Job ID**: {job_data.get('job_id', 'unknown')}\n")
            f.write(f"- **Posted Date**: {job_data.get('posted_date', 'N/A')}\n")
-            f.write(f"- **Category**: {job_data.get('category', 'N/A')}\n")
+            f.write(f"- **Category**: {job_data.get('category', 'all')}\n")
            f.write(f"- **Scraped At**: {job_data.get('scraped_at', 'N/A')}\n")
            f.write(f"- **URL**: <{job_data.get('url', 'N/A')}>\n\n")
-            f.write(f"### Description\n\n{job_data.get('description', 'N/A')}\n\n")
-            f.write(f"### Requirements\n\n{job_data.get('requirements', 'N/A')}\n\n")
-            f.write(f"### Qualifications\n\n{job_data.get('qualifications', 'N/A')}\n\n")
+            f.write(f"### Description\n\n{job_data.get('description', 'Not provided')}\n\n")
+            f.write(f"### Requirements\n\n{job_data.get('requirements', 'Not provided')}\n\n")
+            f.write(f"### Qualifications\n\n{job_data.get('qualifications', 'Not provided')}\n\n")
            f.write("---\n\n")
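The insert relies on a unique job_id for idempotency. A sketch of that pattern, assuming a table named jobs (the real table name and column list sit outside this hunk) with columns mirroring the job_data keys:

```python
import psycopg2  # connection parameters as in _init_db above

DDL = """
CREATE TABLE IF NOT EXISTS jobs (
    title TEXT, company_name TEXT, location TEXT, description TEXT,
    requirements TEXT, qualifications TEXT, salary_range TEXT, nature_of_work TEXT,
    job_id TEXT UNIQUE, url TEXT, category TEXT, scraped_at TEXT, posted_date TEXT
)
"""

INSERT = """
INSERT INTO jobs (title, company_name, location, description, requirements,
                  qualifications, salary_range, nature_of_work, job_id, url,
                  category, scraped_at, posted_date)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (job_id) DO NOTHING
"""

COLUMNS = ("title", "company_name", "location", "description", "requirements",
           "qualifications", "salary_range", "nature_of_work", "job_id", "url",
           "category", "scraped_at", "posted_date")


def save_job(conn, job: dict) -> None:
    defaults = {"job_id": "unknown", "category": "all"}
    row = [job.get(col, defaults.get(col, "Not provided")) for col in COLUMNS]
    with conn.cursor() as cur:
        cur.execute(DDL)
        cur.execute(INSERT, row)
    conn.commit()  # rows with a duplicate job_id are silently skipped
```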
main.py
@@ -1,3 +1,4 @@
 from scraping_engine import FingerprintScrapingEngine
 from scraper import CryptoJobScraper  # Updated class name
 import os
@@ -20,16 +21,15 @@ async def main():
    scraper = CryptoJobScraper(engine, human_speed=1.3, user_request="Extract title, company, location, description, requirements, qualifications, nature of work, and salary")

    job_titles = [
-        "Blockchain Engineer",
-        "Smart Contract Developer",
-        "DeFi Analyst",
-        "Web3 Developer",
-        "Crypto Researcher",
-        "Solidity Developer",
-        "Protocol Engineer",
-        "Tokenomics Specialist",
-        "Zero-Knowledge Proof Engineer",
-        "Crypto Compliance Officer"
+        "Customer Support",
+        "Design",
+        "Engineering",
+        "Finance",
+        "Marketing",
+        "Operations",
+        "Product",
+        "Sales"
    ]

    while True:
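main.py's loop body is not part of this diff; the following is a hypothetical sketch of how the new category list might drive scrape_jobs (the pause interval and function name are assumptions, not code from the commit):

```python
import asyncio


async def run_forever(scraper, categories, pause_seconds: int = 3600):
    # Hypothetical driver: sweep every category, then wait before the next pass.
    while True:
        for category in categories:
            await scraper.scrape_jobs(search_keywords=category, max_pages=1)
        await asyncio.sleep(pause_seconds)
```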
scraper.py
@@ -6,10 +6,11 @@ from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
 from browserforge.injectors.playwright import AsyncNewContext
 from llm_agent import LLMJobRefiner
 import re
-from fetcher import StealthyFetcher
 from datetime import datetime
 import json
 import redis
+from urllib.parse import urlparse
+import hashlib


 class CryptoJobScraper:
@@ -25,7 +26,29 @@ class CryptoJobScraper:
        self.human_speed = human_speed
        self.user_request = user_request
        self.llm_agent = LLMJobRefiner()
        self.redis_client = redis.Redis(host='localhost', port=6379, db=0, decode_responses=True)
+
+        self.FORBIDDEN_ATS_DOMAINS = [
+            'ashby', 'ashbyhq',
+            'greenhouse', 'boards.greenhouse.io',
+            'gem', 'gem.com',
+            'rippling',
+            'myworkday', 'myworkdayjobs',
+            'smartrecruiters',
+            'workable',
+            'lever', 'jobs.lever.co',
+        ]
+
+        self.INVALID_CONTENT_PHRASES = [
+            "invalid job url",
+            "cookie consent",
+            "privacy policy",
+            "not a valid job",
+            "job not found",
+            "page not found",
+            "The requested job post could not be found. It may have been removed.",
+            "this page does not contain a job description",
+        ]

    async def _human_click(self, page, element, wait_after: bool = True):
        if not element:
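Later hunks call self._add_job_to_redis_cache(), whose body is not shown in this diff. A hypothetical sketch of such a failure cache built on the Redis client configured above (key names and layout are assumptions):

```python
from datetime import datetime

import redis

r = redis.Redis(host="localhost", port=6379, db=0, decode_responses=True)


def cache_failed_job(url: str, job_id: str, reason: str) -> None:
    # Hypothetical layout: one hash per job plus a set for fast membership checks.
    r.hset(f"failed_job:{job_id}", mapping={
        "url": url,
        "reason": reason,
        "failed_at": datetime.now().isoformat(),
    })
    r.sadd("failed_job_ids", job_id)


def already_failed(job_id: str) -> bool:
    return bool(r.sismember("failed_job_ids", job_id))
```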
@@ -55,60 +78,127 @@ class CryptoJobScraper:
        matches = sum(1 for kw in keyword_list if kw in title_lower)
        return matches / len(keyword_list) if keyword_list else 0.0

-    async def _scrape_jobs_from_current_page(self, page, search_keywords: str, seen_job_ids, all_job_links):
-        current_links = await page.query_selector_all("a[href*='/job/']")
-        new_jobs = 0
-
-        for link in current_links:
-            href = await link.get_attribute("href")
-            if not href or not href.startswith("http"):
-                href = "https://cryptocurrencyjobs.co" + href
-            job_id = href.split("/")[-1] if href.endswith("/") else href.split("/")[-1]
-
-            if job_id and job_id not in seen_job_ids:
-                title_element = await link.query_selector("h3, .job-title")
-                title = (await title_element.inner_text()) if title_element else "Unknown Title"
-                match_percentage = self._calculate_keyword_match(title, search_keywords)
-                if match_percentage >= 0.5:  # Lower threshold than LinkedIn
-                    seen_job_ids.add(job_id)
-                    all_job_links.append((href, title))
-                    new_jobs += 1
-                else:
-                    print(f" ⚠️ Skipping job due to low keyword match: {title[:50]}... (match: {match_percentage:.2%})")
-        return new_jobs
-
-    async def _handle_pagination(self, page, search_keywords: str, seen_job_ids, all_job_links):
-        current_page = 1
-        while True:
-            print(f"📄 Processing page {current_page}")
-            new_jobs = await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links)
-            print(f" ➕ Found {new_jobs} new job(s) (total: {len(all_job_links)})")
-
-            next_btn = await page.query_selector('a[rel="next"]')
-            if next_btn:
-                next_url = await next_btn.get_attribute("href")
-                if next_url and not next_url.startswith("http"):
-                    next_url = "https://cryptocurrencyjobs.co" + next_url
-                await page.goto(next_url, timeout=120000)
-                await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)
-                current_page += 1
-            else:
-                print("🔚 No 'Next' page — stopping pagination.")
-                break
-
-    async def _extract_job_posted_date(self, page) -> str:
-        try:
-            date_element = await page.query_selector(".job-posted-date, .job-date, time")
-            if date_element:
-                date_text = await date_element.inner_text()
-                if "Today" in date_text:
-                    return datetime.now().strftime("%m/%d/%y")
-                elif "Yesterday" in date_text:
-                    yesterday = datetime.now().replace(day=datetime.now().day - 1)
-                    return yesterday.strftime("%m/%d/%y")
-                else:
-                    return datetime.now().strftime("%m/%d/%y")
+    async def _extract_job_title_from_card(self, card) -> str:
+        try:
+            title_selectors = [
+                'h3', 'h2', 'h4',
+                'strong', 'span'
+            ]
+            for selector in title_selectors:
+                title_element = await card.query_selector(selector)
+                if title_element:
+                    title_text = await title_element.inner_text()
+                    if title_text and len(title_text.strip()) > 3:
+                        return title_text.strip()
+
+            card_text = await card.inner_text()
+            lines = [line.strip() for line in card_text.split('\n') if line.strip()]
+            if lines:
+                for line in lines:
+                    if len(line) > 5 and not any(skip in line.lower() for skip in ['today', 'featured', 'yesterday', 'company', 'location']):
+                        return line
+            return "Unknown Title"
+        except:
+            return "Unknown Title"
+
+    async def _collect_job_elements_from_page(self, page, search_keywords: str, seen_slugs):
+        job_cards = []
+        job_found = False
+
+        await asyncio.sleep(3 * self.human_speed)
+
+        try:
+            await page.wait_for_selector('a[href^="/"][href*="-"]', timeout=60000)
+            candidates = await page.query_selector_all('a[href^="/"][href*="-"]')
+
+            for link in candidates:
+                href = await link.get_attribute("href") or ""
+                href = href.rstrip('/')
+                if not href or len(href.split('/')) != 3:
+                    continue
+                if '-' not in href.split('/')[-1]:
+                    continue
+                slug = href.split('/')[-1]
+                if len(slug) < 8 or slug.startswith(('login', 'signup', 'about', 'terms')):
+                    continue
+
+                full_url = "https://cryptocurrencyjobs.co" + href if not href.startswith('http') else href
+                if slug in seen_slugs:
+                    continue
+
+                title = await self._extract_job_title_from_card(link)
+                if not title or title == "Unknown Title":
+                    title = slug.replace('-', ' ').title()
+
+                match_percentage = self._calculate_keyword_match(title, search_keywords)
+                if match_percentage >= 0.4 or not search_keywords.strip():
+                    seen_slugs.add(slug)
+                    job_cards.append((full_url, title, link))
+                    job_found = True

+            print(f" ✅ Collected {len(job_cards)} valid job links after filtering ({len(candidates)} raw candidates).")
+
+        except Exception as e:
+            print(f" ⚠️ Error collecting job cards: {e}")
+
+        if not job_found:
+            print(" ❌ No valid job listings passed filters.")
+
+        return job_cards
+
+    async def _handle_pagination_and_collect_all(self, page, search_keywords: str, seen_slugs):
+        all_job_elements = []
+        scroll_attempt = 0
+        max_scrolls = 40
+        prev_count = 0
+
+        while scroll_attempt < max_scrolls:
+            print(f" Scroll attempt {scroll_attempt + 1} | Current total jobs: {len(all_job_elements)}")
+
+            page_elements = await self._collect_job_elements_from_page(page, search_keywords, seen_slugs)
+            all_job_elements.extend(page_elements)
+
+            current_count = len(all_job_elements)
+
+            if current_count == prev_count and scroll_attempt > 3:
+                print(" 🔚 No new jobs after several scrolls → assuming end of list.")
+                break
+
+            prev_count = current_count
+
+            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+            await asyncio.sleep(random.uniform(2.5, 5.5) * self.human_speed)
+
+            try:
+                load_more = await page.query_selector(
+                    'button:has-text("Load more"), button:has-text("More"), div[role="button"]:has-text("Load"), a:has-text("Load more")'
+                )
+                if load_more:
+                    print(" Found 'Load more' button → clicking...")
+                    await self._human_click(page, load_more)
+                    await asyncio.sleep(random.uniform(3.0, 6.0) * self.human_speed)
+            except:
+                pass
+
+            scroll_attempt += 1
+
+        print(f" Finished scrolling → collected {len(all_job_elements)} unique job links.")
+        return all_job_elements
+
+    async def _extract_job_posted_date_from_card(self, card) -> str:
+        try:
+            card_text = await card.inner_text()
+            if "Today" in card_text:
+                return datetime.now().strftime("%m/%d/%y")
+            elif "Yesterday" in card_text:
+                from datetime import timedelta
+                return (datetime.now() - timedelta(days=1)).strftime("%m/%d/%y")
+            else:
+                match = re.search(r'(\d+)d', card_text)
+                if match:
+                    days = int(match.group(1))
+                    from datetime import timedelta
+                    return (datetime.now() - timedelta(days=days)).strftime("%m/%d/%y")
        except:
            pass
        return datetime.now().strftime("%m/%d/%y")
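A standalone version of the card-date heuristic added above ("Today", "Yesterday", or an "Nd" age), with an illustrative card text; anything unrecognized falls back to today's date.

```python
import re
from datetime import datetime, timedelta


def posted_date_from_card_text(card_text: str) -> str:
    try:
        if "Today" in card_text:
            return datetime.now().strftime("%m/%d/%y")
        if "Yesterday" in card_text:
            return (datetime.now() - timedelta(days=1)).strftime("%m/%d/%y")
        match = re.search(r'(\d+)d', card_text)
        if match:
            return (datetime.now() - timedelta(days=int(match.group(1)))).strftime("%m/%d/%y")
    except Exception:
        pass
    return datetime.now().strftime("%m/%d/%y")


print(posted_date_from_card_text("Senior Solidity Engineer · Remote · 3d"))  # sample card text
```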
@@ -126,15 +216,62 @@ class CryptoJobScraper:
        except Exception as e:
            print(f" ❌ Failed to add job to Redis cache: {str(e)}")

+    async def _is_forbidden_ats_url(self, url: str) -> bool:
+        url_lower = url.lower()
+        return any(domain in url_lower for domain in self.FORBIDDEN_ATS_DOMAINS)
+
+    async def _is_invalid_job_page(self, page_content: str) -> bool:
+        content_lower = page_content.lower()
+        return any(phrase in content_lower for phrase in self.INVALID_CONTENT_PHRASES)
+
+    def _extract_job_id_from_url(self, url: str) -> Optional[str]:
+        """
+        Extract job ID from URL. Returns ID if it contains at least one digit.
+        Otherwise, returns None (but does NOT mean skip!).
+        """
+        try:
+            parsed = urlparse(url)
+            path_parts = [p for p in parsed.path.split('/') if p]
+            if not path_parts:
+                return None
+
+            candidate = path_parts[-1]
+            candidate = re.split(r'[?#]', candidate)[0]
+            candidate = re.sub(r'\.html?$', '', candidate)
+
+            if not candidate or not any(c.isdigit() for c in candidate):
+                return None
+
+            # Avoid title-like strings (with spaces or long words + no structure)
+            if re.search(r'[A-Za-z]{6,}\s', candidate):
+                return None
+
+            return candidate
+        except:
+            return None
+
    async def scrape_jobs(
        self,
        search_keywords: Optional[str],
        max_pages: int = 1,
        credentials: Optional[Dict] = None
    ):
-        # cryptocurrencyjobs.co uses URL params differently
-        encoded_keywords = search_keywords.replace(" ", "%20")
-        search_url = f"https://cryptocurrencyjobs.co/?q={encoded_keywords}"
+        query = ""
+        location = ""
+        if search_keywords and search_keywords.strip():
+            parts = search_keywords.split(',', 1)
+            query = parts[0].strip()
+            if len(parts) > 1:
+                location = parts[1].strip()
+
+        clean_query = query.replace(' ', '+')
+        clean_location = location.replace(' ', '+')
+
+        search_url = "https://cryptocurrencyjobs.co/"
+        if clean_query:
+            search_url += f"?query={clean_query}"
+        if clean_location:
+            search_url += f"&location={clean_location}"

        profile = self.engine._select_profile()
        renderer = random.choice(self.engine.common_renderers[self.engine.os])
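A standalone sketch of the job-ID policy from this hunk, plus the hashed fallback used in the next one: keep the last path segment only if it contains a digit, otherwise derive a stable ID from the URL. The title-like-string check is omitted here and the example URLs are illustrative.

```python
import hashlib
import re
from typing import Optional
from urllib.parse import urlparse


def extract_job_id(url: str) -> Optional[str]:
    parsed = urlparse(url)
    parts = [p for p in parsed.path.split("/") if p]
    if not parts:
        return None
    candidate = re.split(r'[?#]', parts[-1])[0]
    candidate = re.sub(r'\.html?$', '', candidate)
    if not candidate or not any(c.isdigit() for c in candidate):
        return None
    return candidate


def job_id_or_hash(url: str) -> str:
    return extract_job_id(url) or "job_" + hashlib.md5(url.encode()).hexdigest()[:12]


print(job_id_or_hash("https://jobs.example.com/postings/4567890"))                       # -> 4567890
print(job_id_or_hash("https://cryptocurrencyjobs.co/engineering/acme-senior-engineer"))  # -> hashed fallback
```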
@@ -156,46 +293,107 @@ class CryptoJobScraper:
            await context.add_init_script(spoof_script)

            page = await context.new_page()
-            # Fetch main search page
-            print(f"🔍 Searching for: {search_keywords}")
-            await page.goto(search_url, wait_until='load', timeout=120000)
+            print(f"🔍 Searching for: {search_keywords or 'all jobs'}")
+            print(f" 🔗 URL: {search_url}")
+            await page.goto(search_url, wait_until='networkidle', timeout=120000)
            await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)

-            all_job_links = []
-            seen_job_ids = set()
-
-            print("🔄 Collecting job links from search results...")
-            await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links)
-            await self._handle_pagination(page, search_keywords, seen_job_ids, all_job_links)
-
-            print(f"✅ Collected {len(all_job_links)} unique job links.")
+            try:
+                await page.wait_for_selector("a[href^='/'][href*='-']", timeout=10000)
+            except:
+                print(" ⚠️ No job links found initially, waiting longer...")
+                await asyncio.sleep(5 * self.human_speed)
+
+            seen_slugs = set()
+            all_job_elements = await self._handle_pagination_and_collect_all(page, search_keywords, seen_slugs)
+            print(f"✅ Collected {len(all_job_elements)} unique job links.")

            scraped_count = 0
-            for idx, (href, title) in enumerate(all_job_links):
+            for idx, (href, title, job_element) in enumerate(all_job_elements):
+                job_detail_page = None
+                apply_page = None
+                skip_job = False
+                final_scrape_url = None
                try:
-                    full_url = href
-                    print(f" → Opening job {idx+1}/{len(all_job_links)}: {full_url}")
-
-                    fetcher = StealthyFetcher(self.engine, browser, context)
-                    job_page = await fetcher.fetch_url(full_url, wait_for_selector="h1")
-                    if not job_page:
-                        print(f" ❌ Failed to fetch job page {full_url}")
-                        await self._add_job_to_redis_cache(full_url, full_url.split("/")[-1], "fetch_failure")
-                        self.engine.report_outcome("fetch_failure", url=full_url)
+                    print(f" → Processing job {idx+1}/{len(all_job_elements)}: {title}")
+
+                    posted_date = await self._extract_job_posted_date_from_card(job_element)
+
+                    job_detail_page = await context.new_page()
+                    await job_detail_page.goto(href, wait_until='networkidle', timeout=60000)
+                    await asyncio.sleep(2 * self.human_speed)
+
+                    # Check for invalid content
+                    page_content = await job_detail_page.content()
+                    if await self._is_invalid_job_page(page_content):
+                        print(" 🚫 Page contains invalid content → skipping.")
+                        await job_detail_page.close()
                        continue

-                    posted_date = await self._extract_job_posted_date(job_page)
-
-                    await self.engine._human_like_scroll(job_page)
-                    await asyncio.sleep(2 * self.human_speed)
-                    page_content = await self._extract_page_content_for_llm(job_page)
-
-                    job_id = full_url.split("/")[-1] if full_url.split("/")[-1] else "unknown"
+                    # Try to click apply
+                    apply_clicked = False
+                    apply_selectors = [
+                        'a[href*="apply"], a:text("Apply"), a:text("Apply Now"), a:text("Apply here")',
+                        'button:text("Apply"), button:has-text("Apply")',
+                        '[data-testid="apply-button"], [aria-label*="apply"], [role="button"]:has-text("Apply")',
+                        'a.btn-apply, .apply-button, .apply-link, a:has-text("Apply")',
+                        'a[rel="noopener"]:has-text("Apply")',
+                    ]
+
+                    for sel in apply_selectors:
+                        apply_elem = await job_detail_page.query_selector(sel)
+                        if apply_elem:
+                            print(f" 🔗 Found Apply element with selector: {sel}")
+                            await self._human_click(job_detail_page, apply_elem, wait_after=True)
+                            apply_clicked = True
+                            break
+
+                    apply_page = job_detail_page
+
+                    if apply_clicked:
+                        await asyncio.sleep(random.uniform(3.0, 6.0) * self.human_speed)
+                        pages = context.pages
+                        new_pages = [p for p in pages if p != job_detail_page and p.url != "about:blank"]
+
+                        if new_pages:
+                            candidate_page = new_pages[-1]
+                            new_url = candidate_page.url.strip()
+                            print(f" New tab opened: {new_url}")
+
+                            if new_url and await self._is_forbidden_ats_url(new_url):
+                                print(" 🚫 New URL is a forbidden ATS → skipping job.")
+                                if candidate_page != job_detail_page:
+                                    await candidate_page.close()
+                                await job_detail_page.close()
+                                skip_job = True
+                            else:
+                                apply_page = candidate_page
+                        else:
+                            print(" No new tab → using original page.")
+
+                    if skip_job:
+                        continue
+
+                    final_scrape_url = apply_page.url
+
+                    # Re-check invalid content on final page
+                    page_content = await self._extract_page_content_for_llm(apply_page)
+                    if await self._is_invalid_job_page(page_content):
+                        print(" 🚫 Final page contains invalid content → skipping.")
+                        if apply_page != job_detail_page:
+                            await apply_page.close()
+                        await job_detail_page.close()
+                        continue
+
+                    # Extract job ID — but do NOT fail if missing
+                    job_id = self._extract_job_id_from_url(final_scrape_url)
+                    if not job_id:
+                        # Fallback: hash the URL to create a stable, unique ID
+                        job_id = "job_" + hashlib.md5(final_scrape_url.encode()).hexdigest()[:12]

                    raw_data = {
                        "page_content": page_content,
-                        "url": full_url,
+                        "url": final_scrape_url,
                        "job_id": job_id,
                        "search_keywords": search_keywords,
                        "posted_date": posted_date
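A sketch of the "did Apply open a new tab?" pattern used above: snapshot context.pages, click, then treat any new non-blank page as the scrape target. Playwright's context.expect_page() is an alternative when a popup is guaranteed to open.

```python
import asyncio
import random


async def click_and_follow(context, page, element, human_speed: float = 1.0):
    """Click an Apply element and return the page that should be scraped afterwards."""
    before = set(context.pages)
    await element.click()
    await asyncio.sleep(random.uniform(3.0, 6.0) * human_speed)
    new_pages = [p for p in context.pages if p not in before and p.url != "about:blank"]
    # Fall back to the original page when the click navigated in place.
    return new_pages[-1] if new_pages else page
```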
@@ -210,44 +408,45 @@ class CryptoJobScraper:
                            if field == 'job_id':
                                refined_data[field] = job_id
                            elif field == 'url':
-                                refined_data[field] = full_url
+                                refined_data[field] = final_scrape_url
                            elif field == 'company_name':
                                refined_data[field] = "Unknown Company"

-                        refined_data['scraped_at'] = datetime.now().isoformat()
-                        refined_data['category'] = search_keywords
-                        refined_data['posted_date'] = posted_date
-                        await self.llm_agent.save_job_data(refined_data, search_keywords)
-                        scraped_count += 1
-                        print(f" ✅ Scraped and refined: {refined_data['title'][:50]}...")
-                        self.engine.report_outcome("success", url=raw_data["url"])
-                    else:
-                        print(f" 🟡 Could not extract meaningful data from: {full_url}")
-                        await self._add_job_to_redis_cache(full_url, job_id, "llm_failure")
-                        self.engine.report_outcome("llm_failure", url=raw_data["url"])
-
-                    await job_page.close()
+                        refined_data['scraped_at'] = datetime.now().isoformat()
+                        refined_data['category'] = search_keywords or "all"
+                        refined_data['posted_date'] = posted_date
+                        await self.llm_agent.save_job_data(refined_data, search_keywords or "all")
+                        scraped_count += 1
+                        print(f" ✅ Scraped: {refined_data['title'][:50]}... (Job ID: {job_id})")
+                        self.engine.report_outcome("success", url=final_scrape_url)
+                    else:
+                        print(f" 🟡 Could not extract meaningful data from: {final_scrape_url}")
+                        await self._add_job_to_redis_cache(final_scrape_url, job_id, "llm_failure")
+                        self.engine.report_outcome("llm_failure", url=final_scrape_url)
+
+                    if apply_page != job_detail_page and not apply_page.is_closed():
+                        await apply_page.close()
+                    if job_detail_page and not job_detail_page.is_closed():
+                        await job_detail_page.close()

                except Exception as e:
                    error_msg = str(e)[:100]
                    print(f" ⚠️ Failed on job {idx+1}: {error_msg}")
-                    job_id = full_url.split("/")[-1] if 'full_url' in locals() else "unknown"
-                    job_url = full_url if 'full_url' in locals() else "unknown"
-                    await self._add_job_to_redis_cache(job_url, job_id, f"exception: {error_msg}")
-                    if 'job_page' in locals() and job_page:
-                        await job_page.close()
+                    job_id_for_log = "unknown"
+                    if 'final_scrape_url' in locals() and final_scrape_url:
+                        job_id_for_log = "job_" + hashlib.md5(final_scrape_url.encode()).hexdigest()[:12]
+                    await self._add_job_to_redis_cache(href, job_id_for_log, f"exception: {error_msg}")
+                    if job_detail_page and not job_detail_page.is_closed():
+                        await job_detail_page.close()
+                    if 'apply_page' in locals() and apply_page and apply_page != job_detail_page and not apply_page.is_closed():
+                        await apply_page.close()
                    continue

-                finally:
-                    print(" ↩️ Returning to search results...")
-                    await page.goto(search_url, timeout=120000)
-                    await asyncio.sleep(4 * self.human_speed)
-
            await browser.close()

            if scraped_count > 0:
                self.engine.report_outcome("success")
-                print(f"✅ Completed! Processed {scraped_count} jobs for '{search_keywords}' based on request '{self.user_request}'.")
+                print(f"✅ Completed! Processed {scraped_count} jobs for '{search_keywords or 'all jobs'}'.")
            else:
                self.engine.report_outcome("scraping_error")
                print("⚠️ No jobs processed successfully.")