Increase timeout for selector waits and refine job extraction logic in LLMJobRefiner and CryptoJobScraper

Ofure Ikheloa 2025-12-30 12:19:18 +01:00
parent 38ef08c734
commit 06f8e8b086
4 changed files with 425 additions and 224 deletions

fetcher.py

@@ -27,7 +27,7 @@ class StealthyFetcher:
if wait_for_selector:
try:
await page.wait_for_selector(wait_for_selector, timeout=40000)
await page.wait_for_selector(wait_for_selector, timeout=120000)
except PlaywrightTimeoutError:
print(f"Selector {wait_for_selector} not found immediately, continuing...")
@@ -88,7 +88,7 @@ class StealthyFetcher:
async def _is_content_accessible(self, page: Page, wait_for_selector: Optional[str] = None) -> bool:
if wait_for_selector:
try:
await page.wait_for_selector(wait_for_selector, timeout=40000)
await page.wait_for_selector(wait_for_selector, timeout=120000)
return True
except PlaywrightTimeoutError:
pass
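Both call sites above now hard-code the same 120-second ceiling; a minimal sketch of hoisting it into one env-driven constant (SELECTOR_TIMEOUT_MS and wait_for_optional_selector are assumed names, not part of this commit):

import os
from playwright.async_api import Page, TimeoutError as PlaywrightTimeoutError

SELECTOR_TIMEOUT_MS = int(os.getenv("SELECTOR_TIMEOUT_MS", "120000"))

async def wait_for_optional_selector(page: Page, selector: str) -> bool:
    """Wait for a selector, treating a timeout as a soft miss rather than an error."""
    try:
        await page.wait_for_selector(selector, timeout=SELECTOR_TIMEOUT_MS)
        return True
    except PlaywrightTimeoutError:
        return False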

llm_agent.py

@@ -21,13 +21,12 @@ class LLMJobRefiner:
raise ValueError("DEEPSEEK_API_KEY not found in .env file.")
# Database credentials from .env
self.db_url = os.getenv("DB_URL")
self.db_username = os.getenv("DB_USERNAME")
self.db_password = os.getenv("DB_PASSWORD")
self.db_host = os.getenv("DB_HOST")
self.db_port = os.getenv("DB_PORT")
if not self.db_url or not self.db_username or not self.db_password:
if not self.db_username or not self.db_password:
raise ValueError("Database credentials not found in .env file.")
# DeepSeek uses OpenAI-compatible API
@@ -41,16 +40,6 @@ class LLMJobRefiner:
def _init_db(self):
"""Initialize PostgreSQL database connection and create table"""
try:
self.db_url = os.getenv("DB_URL")
if self.db_url and "supabase.com" in self.db_url:
conn = psycopg2.connect(
host=self.db_host,
port=self.db_port,
database="postgres",
user=self.db_username,
password=self.db_password
)
else:
conn = psycopg2.connect(
host=self.db_host,
port=self.db_port,
@@ -113,8 +102,8 @@ class LLMJobRefiner:
text = re.sub(r'\s+', ' ', text)
# Limit length for LLM context
if len(text) > 10000:
text = text[:10000] + "..."
if len(text) > 100000:
text = text[:100000] + "..."
return text
except Exception as e:
@@ -128,7 +117,7 @@ class LLMJobRefiner:
response = self.client.chat.completions.create(
model=self.model,
messages=[{"role": "user", "content": prompt}],
temperature=0.2,
temperature=0.1,
max_tokens=2048,
stream=False
)
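The call above goes through the OpenAI-compatible client noted earlier in this file; a minimal standalone sketch, where the base URL and model name are assumptions rather than values shown in this diff:

import os
from openai import OpenAI

client = OpenAI(api_key=os.getenv("DEEPSEEK_API_KEY"), base_url="https://api.deepseek.com")  # assumed endpoint
response = client.chat.completions.create(
    model="deepseek-chat",  # assumed model name
    messages=[{"role": "user", "content": "Return the word OK."}],
    temperature=0.1,
    max_tokens=16,
)
print(response.choices[0].message.content)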
@@ -145,26 +134,32 @@ class LLMJobRefiner:
posted_date = raw_data.get('posted_date', datetime.now().strftime("%m/%d/%y"))
prompt = f"""
You are a job posting data extractor.
You are an expert job posting data extractor. Your task is to extract AND infer fields from the provided job posting.
EXTRACT EXACT TEXT - DO NOT SUMMARIZE, PARAPHRASE, OR INVENT.
### CORE RULES:
1. **NEVER invent, summarize, or paraphrase**; extract **exact wording** when available.
2. **For critical fields (title, company_name, job_id, url, description):**
- These MUST be present and meaningful.
- If not explicitly stated, **infer from context** (e.g., page title, headings, "About Us", etc.).
- **NEVER return "Not provided" or "N/A" for these fields.**
3. **For optional fields (location, salary_range, etc.):**
- Extract exact text if present.
- If absent but inferable (e.g., "Remote (US)", "Full-time"), **infer it**.
- Only return "Not provided" if truly absent and non-inferable.
For these critical fields, follow these rules:
- description: Extract ALL job description text. If ANY job details exist (duties, responsibilities, overview), include them. Only use "Not provided" if absolutely no description exists.
- requirements: Extract ALL requirements text. If ANY requirements exist (skills, experience, education needed), include them. Only use "Not provided" if none exist.
- qualifications: Extract ALL qualifications text. If ANY qualifications exist, include them. Only use "Not provided" if none exist.
### FIELD DEFINITIONS:
- **title**: The job title. Look in <h1>, page title, or bold headings.
- **company_name**: Company name. Look in logo alt text, footer, "About [X]", or page title.
- **description**: Main job overview, responsibilities, or duties. Combine relevant paragraphs if needed. **Must not be empty.**
- **requirements**: Required skills, experience, or qualifications.
- **qualifications**: Educational or certification requirements.
- **location**: Office location or remote policy.
- **salary_range**: Exact compensation info.
- **nature_of_work**: Employment type (Full-time, Contract, Internship, etc.).
REQUIRED FIELDS (must have valid values, never "N/A"):
- title, company_name, job_id, url
OPTIONAL FIELDS (can be "Not provided"):
- location, salary_range, nature_of_work
Page Content:
{cleaned_content}
Response format (ONLY return this JSON):
{{
### OUTPUT FORMAT:
Return ONLY a valid JSON object with these keys:
{{
"title": "...",
"company_name": "...",
"location": "...",
@@ -175,8 +170,16 @@ class LLMJobRefiner:
"nature_of_work": "...",
"job_id": "{job_id}",
"url": "{url}"
}}
"""
}}
- **Critical fields must NEVER be "Not provided", "N/A", empty, or generic** (e.g., "Company", "Job Title").
- **Optional fields may be "Not provided" ONLY if truly absent.**
- **Do not include markdown, explanations, or extra text.**
- **Use double quotes for JSON.**
Page Content:
{cleaned_content}
"""
try:
response_text = await asyncio.get_event_loop().run_in_executor(
@@ -188,31 +191,23 @@ class LLMJobRefiner:
if not refined_data:
return None
# Validate required fields
required_fields = ['title', 'company_name', 'job_id', 'url']
for field in required_fields:
if not refined_data.get(field) or refined_data[field].strip() in ["N/A", "", "Unknown", "Company", "Job"]:
return None
# CRITICAL: Validate content fields - check if they SHOULD exist
content_fields = ['description', 'requirements', 'qualifications']
cleaned_original = cleaned_content.lower()
# Simple heuristic: if page contains job-related keywords, content fields should NOT be "Not provided"
job_indicators = ['responsibilit', 'duties', 'require', 'qualifi', 'skill', 'experienc', 'educat', 'degree', 'bachelor', 'master']
has_job_content = any(indicator in cleaned_original for indicator in job_indicators)
if has_job_content:
for field in content_fields:
# Validate critical fields — reject if missing or placeholder
critical_fields = ['title', 'company_name', 'job_id', 'url', 'description']
for field in critical_fields:
value = refined_data.get(field, "").strip()
if value in ["Not provided", "N/A", ""]:
# LLM failed to extract existing content
print(f" ⚠️ LLM returned '{value}' for {field} but job content appears present")
return None
if not value or value.lower() in ["n/a", "not provided", "unknown", "company", "job", "title", ""]:
print(f" ❌ Critical field '{field}' is invalid: '{value}'")
return None # This job will NOT be saved — as per requirement
# Optional fields: allow "Not provided", but ensure they're strings
optional_fields = ['location', 'requirements', 'qualifications', 'salary_range', 'nature_of_work']
for field in optional_fields:
if field not in refined_data:
refined_data[field] = "Not provided"
elif not isinstance(refined_data[field], str):
refined_data[field] = str(refined_data[field])
# Add the posted_date to the refined data
refined_data['posted_date'] = posted_date
return refined_data
except Exception as e:
@@ -220,15 +215,22 @@ class LLMJobRefiner:
return None
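Illustrative only, with invented values: how the critical/optional split above treats a refined record.

# Accepted: every critical field is meaningful; absent optional fields are backfilled with "Not provided".
accepted = {
    "title": "Solidity Developer",
    "company_name": "Acme Labs",
    "job_id": "12345",
    "url": "https://example.com/jobs/12345",
    "description": "Design, build, and audit smart contracts.",
}
# Rejected (the refiner returns None and the job is not saved): a critical field holds a placeholder.
rejected = {**accepted, "description": "Not provided"}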
def _parse_llm_response(self, response_text: str) -> Dict[str, Any]:
# Try to extract JSON from markdown code block
json_match = re.search(r'```(?:json)?\s*({.*?})\s*```', response_text, re.DOTALL)
if not json_match:
json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
# Try to find raw JSON object
json_match = re.search(r'\{[^{}]*\{[^{}]*\}[^{}]*\}|\{.*\}', response_text, re.DOTALL)
if not json_match:
return None
try:
return json.loads(json_match.group(1) if '```' in response_text else json_match.group(0))
except json.JSONDecodeError:
json_str = json_match.group(1) if '```' in response_text else json_match.group(0)
# Clean common issues
json_str = re.sub(r'\s+', ' ', json_str)
json_str = re.sub(r',\s*([\]}\)])', r'\1', json_str) # Remove trailing commas
return json.loads(json_str)
except json.JSONDecodeError as e:
print(f"JSON parsing error: {e}")
return None
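A quick usage sketch of the parser above (the instance name refiner is assumed); the fenced reply carries a trailing comma, which the cleanup regex strips before json.loads:

raw_reply = '```json\n{"title": "Protocol Engineer", "company_name": "Acme Labs",}\n```'
parsed = refiner._parse_llm_response(raw_reply)
# -> {'title': 'Protocol Engineer', 'company_name': 'Acme Labs'}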
async def save_job_data(self, job_data: Dict[str, Any], keyword: str):
@@ -254,50 +256,50 @@ class LLMJobRefiner:
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (job_id) DO NOTHING
''', (
job_data.get("title", "N/A"),
job_data.get("company_name", "N/A"),
job_data.get("location", "N/A"),
job_data.get("description", "N/A"),
job_data.get("requirements", "N/A"),
job_data.get("qualifications", "N/A"),
job_data.get("salary_range", "N/A"),
job_data.get("nature_of_work", "N/A"),
job_data.get("job_id", "N/A"),
job_data.get("title", "Not provided"),
job_data.get("company_name", "Not provided"),
job_data.get("location", "Not provided"),
job_data.get("description", "Not provided"),
job_data.get("requirements", "Not provided"),
job_data.get("qualifications", "Not provided"),
job_data.get("salary_range", "Not provided"),
job_data.get("nature_of_work", "Not provided"),
job_data.get("job_id", "unknown"),
job_data.get("url", "N/A"),
job_data.get("category", "N/A"),
job_data.get("category", "all"),
job_data.get("scraped_at"),
job_data.get("posted_date", "N/A")
job_data.get("posted_date", datetime.now().strftime("%m/%d/%y"))
))
conn.commit()
cursor.close()
conn.close()
print(f" 💾 Saved job to category '{job_data.get('category', 'N/A')}' with job_id: {job_data.get('job_id', 'N/A')}")
print(f" 💾 Saved job to category '{job_data.get('category', 'all')}' with job_id: {job_data.get('job_id', 'unknown')}")
except Exception as e:
print(f"❌ Database save error: {e}")
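For reference, a minimal standalone sketch of opening the PostgreSQL connection that _init_db and save_job_data rely on, built from the .env keys read in __init__; python-dotenv and the "postgres" database name are assumptions here.

import os
import psycopg2
from dotenv import load_dotenv

load_dotenv()  # assumes .env is loaded the same way as in the class above
conn = psycopg2.connect(
    host=os.getenv("DB_HOST"),
    port=os.getenv("DB_PORT"),
    database="postgres",  # assumed; the removed Supabase branch used this name
    user=os.getenv("DB_USERNAME"),
    password=os.getenv("DB_PASSWORD"),
)
conn.close()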
async def _save_to_markdown(self, job_data: Dict[str, Any], keyword: str):
os.makedirs("linkedin_jobs", exist_ok=True)
filepath = os.path.join("linkedin_jobs", "linkedin_jobs_scraped.md")
os.makedirs("crypto_jobs", exist_ok=True)
filepath = os.path.join("crypto_jobs", "crypto_jobs_scraped.md")
write_header = not os.path.exists(filepath) or os.path.getsize(filepath) == 0
with open(filepath, "a", encoding="utf-8") as f:
if write_header:
f.write(f"# LinkedIn Jobs - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
f.write(f"## Job: {job_data.get('title', 'N/A')}\n\n")
f.write(f"# Crypto Jobs - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
f.write(f"## Job: {job_data.get('title', 'Not provided')}\n\n")
f.write(f"- **Keyword**: {keyword}\n")
f.write(f"- **Company**: {job_data.get('company_name', 'N/A')}\n")
f.write(f"- **Location**: {job_data.get('location', 'N/A')}\n")
f.write(f"- **Nature of Work**: {job_data.get('nature_of_work', 'N/A')}\n")
f.write(f"- **Salary Range**: {job_data.get('salary_range', 'N/A')}\n")
f.write(f"- **Job ID**: {job_data.get('job_id', 'N/A')}\n")
f.write(f"- **Company**: {job_data.get('company_name', 'Not provided')}\n")
f.write(f"- **Location**: {job_data.get('location', 'Not provided')}\n")
f.write(f"- **Nature of Work**: {job_data.get('nature_of_work', 'Not provided')}\n")
f.write(f"- **Salary Range**: {job_data.get('salary_range', 'Not provided')}\n")
f.write(f"- **Job ID**: {job_data.get('job_id', 'unknown')}\n")
f.write(f"- **Posted Date**: {job_data.get('posted_date', 'N/A')}\n")
f.write(f"- **Category**: {job_data.get('category', 'N/A')}\n")
f.write(f"- **Category**: {job_data.get('category', 'all')}\n")
f.write(f"- **Scraped At**: {job_data.get('scraped_at', 'N/A')}\n")
f.write(f"- **URL**: <{job_data.get('url', 'N/A')}>\n\n")
f.write(f"### Description\n\n{job_data.get('description', 'N/A')}\n\n")
f.write(f"### Requirements\n\n{job_data.get('requirements', 'N/A')}\n\n")
f.write(f"### Qualifications\n\n{job_data.get('qualifications', 'N/A')}\n\n")
f.write(f"### Description\n\n{job_data.get('description', 'Not provided')}\n\n")
f.write(f"### Requirements\n\n{job_data.get('requirements', 'Not provided')}\n\n")
f.write(f"### Qualifications\n\n{job_data.get('qualifications', 'Not provided')}\n\n")
f.write("---\n\n")

main.py

@@ -1,3 +1,4 @@
from scraping_engine import FingerprintScrapingEngine
from scraper import CryptoJobScraper # Updated class name
import os
@@ -20,16 +21,15 @@ async def main():
scraper = CryptoJobScraper(engine, human_speed=1.3, user_request="Extract title, company, location, description, requirements, qualifications, nature of work, and salary")
job_titles = [
"Blockchain Engineer",
"Smart Contract Developer",
"DeFi Analyst",
"Web3 Developer",
"Crypto Researcher",
"Solidity Developer",
"Protocol Engineer",
"Tokenomics Specialist",
"Zero-Knowledge Proof Engineer",
"Crypto Compliance Officer"
"Customer Support",
"Design",
"Engineering",
"Finance",
"Marketing",
"Operations",
"Product",
"Sales"
]
while True:
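# Hedged sketch only; the body of this loop is not shown in the hunk. A plausible shape:
#     for title in job_titles:
#         await scraper.scrape_jobs(search_keywords=title, max_pages=1)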

scraper.py

@@ -6,10 +6,11 @@ from playwright.async_api import async_playwright, TimeoutError as PlaywrightTim
from browserforge.injectors.playwright import AsyncNewContext
from llm_agent import LLMJobRefiner
import re
from fetcher import StealthyFetcher
from datetime import datetime
import json
import redis
from urllib.parse import urlparse
import hashlib
class CryptoJobScraper:
@@ -25,7 +26,29 @@ class CryptoJobScraper:
self.human_speed = human_speed
self.user_request = user_request
self.llm_agent = LLMJobRefiner()
self.redis_client = redis.Redis(host='localhost', port=6379, db=0, decode_responses=True)
self.redis_client = redis.Redis(host='localhost', port=6379, db=0, decode_responses=True)
self.FORBIDDEN_ATS_DOMAINS = [
'ashby', 'ashbyhq',
'greenhouse', 'boards.greenhouse.io',
'gem', 'gem.com',
'rippling',
'myworkday', 'myworkdayjobs',
'smartrecruiters',
'workable',
'lever', 'jobs.lever.co',
]
self.INVALID_CONTENT_PHRASES = [
"invalid job url",
"cookie consent",
"privacy policy",
"not a valid job",
"job not found",
"page not found",
"the requested job post could not be found. it may have been removed.",
"this page does not contain a job description"
]
async def _human_click(self, page, element, wait_after: bool = True):
if not element:
@@ -55,60 +78,127 @@ class CryptoJobScraper:
matches = sum(1 for kw in keyword_list if kw in title_lower)
return matches / len(keyword_list) if keyword_list else 0.0
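Assuming keyword_list is the lower-cased, whitespace-split search string (its construction sits above this hunk), the score behaves like this:

# "Senior Blockchain Engineer" vs "blockchain engineer" -> 2/2 keywords matched -> 1.0
# "Community Manager"          vs "blockchain engineer" -> 0/2 keywords matched -> 0.0
# Cards scoring below 0.4 are skipped later unless the search string is empty.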
async def _scrape_jobs_from_current_page(self, page, search_keywords: str, seen_job_ids, all_job_links):
current_links = await page.query_selector_all("a[href*='/job/']")
new_jobs = 0
async def _extract_job_title_from_card(self, card) -> str:
try:
title_selectors = [
'h3', 'h2', 'h4',
'strong', 'span'
]
for selector in title_selectors:
title_element = await card.query_selector(selector)
if title_element:
title_text = await title_element.inner_text()
if title_text and len(title_text.strip()) > 3:
return title_text.strip()
for link in current_links:
href = await link.get_attribute("href")
if not href or not href.startswith("http"):
href = "https://cryptocurrencyjobs.co" + href
job_id = href.split("/")[-1] if href.endswith("/") else href.split("/")[-1]
card_text = await card.inner_text()
lines = [line.strip() for line in card_text.split('\n') if line.strip()]
if lines:
for line in lines:
if len(line) > 5 and not any(skip in line.lower() for skip in ['today', 'featured', 'yesterday', 'company', 'location']):
return line
return "Unknown Title"
except:
return "Unknown Title"
async def _collect_job_elements_from_page(self, page, search_keywords: str, seen_slugs):
job_cards = []
job_found = False
await asyncio.sleep(3 * self.human_speed)
try:
await page.wait_for_selector('a[href^="/"][href*="-"]', timeout=60000)
candidates = await page.query_selector_all('a[href^="/"][href*="-"]')
for link in candidates:
href = await link.get_attribute("href") or ""
href = href.rstrip('/')
if not href or len(href.split('/')) != 3:
continue
if '-' not in href.split('/')[-1]:
continue
slug = href.split('/')[-1]
if len(slug) < 8 or slug.startswith(('login', 'signup', 'about', 'terms')):
continue
full_url = "https://cryptocurrencyjobs.co" + href if not href.startswith('http') else href
if slug in seen_slugs:
continue
title = await self._extract_job_title_from_card(link)
if not title or title == "Unknown Title":
title = slug.replace('-', ' ').title()
if job_id and job_id not in seen_job_ids:
title_element = await link.query_selector("h3, .job-title")
title = (await title_element.inner_text()) if title_element else "Unknown Title"
match_percentage = self._calculate_keyword_match(title, search_keywords)
if match_percentage >= 0.4 or not search_keywords.strip():
seen_slugs.add(slug)
job_cards.append((full_url, title, link))
job_found = True
if match_percentage >= 0.5: # Lower threshold than LinkedIn
seen_job_ids.add(job_id)
all_job_links.append((href, title))
new_jobs += 1
else:
print(f" ⚠️ Skipping job due to low keyword match: {title[:50]}... (match: {match_percentage:.2%})")
return new_jobs
print(f" ✅ Collected {len(job_cards)} valid job links after filtering ({len(candidates)} raw candidates).")
async def _handle_pagination(self, page, search_keywords: str, seen_job_ids, all_job_links):
current_page = 1
while True:
print(f"📄 Processing page {current_page}")
new_jobs = await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links)
print(f" Found {new_jobs} new job(s) (total: {len(all_job_links)})")
except Exception as e:
print(f" ⚠️ Error collecting job cards: {e}")
next_btn = await page.query_selector('a[rel="next"]')
if next_btn:
next_url = await next_btn.get_attribute("href")
if next_url and not next_url.startswith("http"):
next_url = "https://cryptocurrencyjobs.co" + next_url
await page.goto(next_url, timeout=120000)
await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)
current_page += 1
else:
print("🔚 No 'Next' page — stopping pagination.")
if not job_found:
print(" ❌ No valid job listings passed filters.")
return job_cards
async def _handle_pagination_and_collect_all(self, page, search_keywords: str, seen_slugs):
all_job_elements = []
scroll_attempt = 0
max_scrolls = 40
prev_count = 0
while scroll_attempt < max_scrolls:
print(f" Scroll attempt {scroll_attempt + 1} | Current total jobs: {len(all_job_elements)}")
page_elements = await self._collect_job_elements_from_page(page, search_keywords, seen_slugs)
all_job_elements.extend(page_elements)
current_count = len(all_job_elements)
if current_count == prev_count and scroll_attempt > 3:
print(" 🔚 No new jobs after several scrolls → assuming end of list.")
break
async def _extract_job_posted_date(self, page) -> str:
prev_count = current_count
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
await asyncio.sleep(random.uniform(2.5, 5.5) * self.human_speed)
try:
date_element = await page.query_selector(".job-posted-date, .job-date, time")
if date_element:
date_text = await date_element.inner_text()
if "Today" in date_text:
load_more = await page.query_selector(
'button:has-text("Load more"), button:has-text("More"), div[role="button"]:has-text("Load"), a:has-text("Load more")'
)
if load_more:
print(" Found 'Load more' button → clicking...")
await self._human_click(page, load_more)
await asyncio.sleep(random.uniform(3.0, 6.0) * self.human_speed)
except:
pass
scroll_attempt += 1
print(f" Finished scrolling → collected {len(all_job_elements)} unique job links.")
return all_job_elements
async def _extract_job_posted_date_from_card(self, card) -> str:
try:
card_text = await card.inner_text()
if "Today" in card_text:
return datetime.now().strftime("%m/%d/%y")
elif "Yesterday" in date_text:
yesterday = datetime.now().replace(day=datetime.now().day - 1)
return yesterday.strftime("%m/%d/%y")
elif "Yesterday" in card_text:
from datetime import timedelta
return (datetime.now() - timedelta(days=1)).strftime("%m/%d/%y")
else:
return datetime.now().strftime("%m/%d/%y")
match = re.search(r'(\d+)d', card_text)
if match:
days = int(match.group(1))
from datetime import timedelta
return (datetime.now() - timedelta(days=days)).strftime("%m/%d/%y")
except:
pass
return datetime.now().strftime("%m/%d/%y")
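Illustrative resolutions of the card-date heuristic above, assuming today is 2025-12-30:

# "Today"          -> 12/30/25
# "Yesterday"      -> 12/29/25
# "Posted 3d ago"  -> 12/27/25  (via the r'(\d+)d' pattern)
# anything else    -> falls through to today's date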
@@ -126,15 +216,62 @@ class CryptoJobScraper:
except Exception as e:
print(f" ❌ Failed to add job to Redis cache: {str(e)}")
async def _is_forbidden_ats_url(self, url: str) -> bool:
url_lower = url.lower()
return any(domain in url_lower for domain in self.FORBIDDEN_ATS_DOMAINS)
async def _is_invalid_job_page(self, page_content: str) -> bool:
content_lower = page_content.lower()
return any(phrase in content_lower for phrase in self.INVALID_CONTENT_PHRASES)
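Usage sketch for the two guards above; the URLs and page text are illustrative and scraper is an assumed instance name.

await scraper._is_forbidden_ats_url("https://jobs.lever.co/acme/1234")              # -> True, skip
await scraper._is_forbidden_ats_url("https://cryptocurrencyjobs.co/acme-engineer")  # -> False, proceed
await scraper._is_invalid_job_page("<h1>Page not found</h1>")                       # -> True, skip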
def _extract_job_id_from_url(self, url: str) -> Optional[str]:
"""
Extract job ID from URL. Returns ID if it contains at least one digit.
Otherwise, returns None (but does NOT mean skip!).
"""
try:
parsed = urlparse(url)
path_parts = [p for p in parsed.path.split('/') if p]
if not path_parts:
return None
candidate = path_parts[-1]
candidate = re.split(r'[?#]', candidate)[0]
candidate = re.sub(r'\.html?$', '', candidate)
if not candidate or not any(c.isdigit() for c in candidate):
return None
# Avoid title-like strings (with spaces or long words + no structure)
if re.search(r'[A-Za-z]{6,}\s', candidate):
return None
return candidate
except:
return None
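A hedged example of the ID fallback described above; the URL is invented, and only the hashing pattern comes from this diff.

import hashlib

url = "https://cryptocurrencyjobs.co/engineering/acme-labs-protocol-engineer/"
job_id = scraper._extract_job_id_from_url(url)  # None: the slug contains no digits
if not job_id:
    job_id = "job_" + hashlib.md5(url.encode()).hexdigest()[:12]  # stable fallback ID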
async def scrape_jobs(
self,
search_keywords: Optional[str],
max_pages: int = 1,
credentials: Optional[Dict] = None
):
# cryptocurrencyjobs.co uses URL params differently
encoded_keywords = search_keywords.replace(" ", "%20")
search_url = f"https://cryptocurrencyjobs.co/?q={encoded_keywords}"
query = ""
location = ""
if search_keywords and search_keywords.strip():
parts = search_keywords.split(',', 1)
query = parts[0].strip()
if len(parts) > 1:
location = parts[1].strip()
clean_query = query.replace(' ', '+')
clean_location = location.replace(' ', '+')
search_url = "https://cryptocurrencyjobs.co/"
if clean_query:
search_url += f"?query={clean_query}"
if clean_location:
search_url += f"&location={clean_location}"
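# Example: search_keywords = "Solidity Developer, Remote"
#   -> query = "Solidity Developer", location = "Remote"
#   -> https://cryptocurrencyjobs.co/?query=Solidity+Developer&location=Remote
# (The query/location parameter names are taken from this diff, not re-verified against the live site.)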
profile = self.engine._select_profile()
renderer = random.choice(self.engine.common_renderers[self.engine.os])
@@ -156,46 +293,107 @@ class CryptoJobScraper:
await context.add_init_script(spoof_script)
page = await context.new_page()
# Fetch main search page
print(f"🔍 Searching for: {search_keywords}")
await page.goto(search_url, wait_until='load', timeout=120000)
print(f"🔍 Searching for: {search_keywords or 'all jobs'}")
print(f" 🔗 URL: {search_url}")
await page.goto(search_url, wait_until='networkidle', timeout=120000)
await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)
all_job_links = []
seen_job_ids = set()
try:
await page.wait_for_selector("a[href^='/'][href*='-']", timeout=10000)
except:
print(" ⚠️ No job links found initially, waiting longer...")
await asyncio.sleep(5 * self.human_speed)
print("🔄 Collecting job links from search results...")
await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links)
await self._handle_pagination(page, search_keywords, seen_job_ids, all_job_links)
print(f"✅ Collected {len(all_job_links)} unique job links.")
seen_slugs = set()
all_job_elements = await self._handle_pagination_and_collect_all(page, search_keywords, seen_slugs)
print(f"✅ Collected {len(all_job_elements)} unique job links.")
scraped_count = 0
for idx, (href, title) in enumerate(all_job_links):
for idx, (href, title, job_element) in enumerate(all_job_elements):
job_detail_page = None
apply_page = None
skip_job = False
final_scrape_url = None
try:
full_url = href
print(f" → Opening job {idx+1}/{len(all_job_links)}: {full_url}")
print(f" → Processing job {idx+1}/{len(all_job_elements)}: {title}")
fetcher = StealthyFetcher(self.engine, browser, context)
job_page = await fetcher.fetch_url(full_url, wait_for_selector="h1")
if not job_page:
print(f" ❌ Failed to fetch job page {full_url}")
await self._add_job_to_redis_cache(full_url, full_url.split("/")[-1], "fetch_failure")
self.engine.report_outcome("fetch_failure", url=full_url)
posted_date = await self._extract_job_posted_date_from_card(job_element)
job_detail_page = await context.new_page()
await job_detail_page.goto(href, wait_until='networkidle', timeout=60000)
await asyncio.sleep(2 * self.human_speed)
# Check for invalid content
page_content = await job_detail_page.content()
if await self._is_invalid_job_page(page_content):
print(" 🚫 Page contains invalid content → skipping.")
await job_detail_page.close()
continue
posted_date = await self._extract_job_posted_date(job_page)
# Try to click apply
apply_clicked = False
apply_selectors = [
'a[href*="apply"], a:text("Apply"), a:text("Apply Now"), a:text("Apply here")',
'button:text("Apply"), button:has-text("Apply")',
'[data-testid="apply-button"], [aria-label*="apply"], [role="button"]:has-text("Apply")',
'a.btn-apply, .apply-button, .apply-link, a:has-text("Apply")',
'a[rel="noopener"]:has-text("Apply")',
]
await self.engine._human_like_scroll(job_page)
await asyncio.sleep(2 * self.human_speed)
page_content = await self._extract_page_content_for_llm(job_page)
for sel in apply_selectors:
apply_elem = await job_detail_page.query_selector(sel)
if apply_elem:
print(f" 🔗 Found Apply element with selector: {sel}")
await self._human_click(job_detail_page, apply_elem, wait_after=True)
apply_clicked = True
break
job_id = full_url.split("/")[-1] if full_url.split("/")[-1] else "unknown"
apply_page = job_detail_page
if apply_clicked:
await asyncio.sleep(random.uniform(3.0, 6.0) * self.human_speed)
pages = context.pages
new_pages = [p for p in pages if p != job_detail_page and p.url != "about:blank"]
if new_pages:
candidate_page = new_pages[-1]
new_url = candidate_page.url.strip()
print(f" New tab opened: {new_url}")
if new_url and await self._is_forbidden_ats_url(new_url):
print(" 🚫 New URL is a forbidden ATS → skipping job.")
if candidate_page != job_detail_page:
await candidate_page.close()
await job_detail_page.close()
skip_job = True
else:
apply_page = candidate_page
else:
print(" No new tab → using original page.")
if skip_job:
continue
final_scrape_url = apply_page.url
# Re-check invalid content on final page
page_content = await self._extract_page_content_for_llm(apply_page)
if await self._is_invalid_job_page(page_content):
print(" 🚫 Final page contains invalid content → skipping.")
if apply_page != job_detail_page:
await apply_page.close()
await job_detail_page.close()
continue
# Extract job ID — but do NOT fail if missing
job_id = self._extract_job_id_from_url(final_scrape_url)
if not job_id:
# Fallback: hash the URL to create a stable, unique ID
job_id = "job_" + hashlib.md5(final_scrape_url.encode()).hexdigest()[:12]
raw_data = {
"page_content": page_content,
"url": full_url,
"url": final_scrape_url,
"job_id": job_id,
"search_keywords": search_keywords,
"posted_date": posted_date
@@ -210,44 +408,45 @@ class CryptoJobScraper:
if field == 'job_id':
refined_data[field] = job_id
elif field == 'url':
refined_data[field] = full_url
refined_data[field] = final_scrape_url
elif field == 'company_name':
refined_data[field] = "Unknown Company"
refined_data['scraped_at'] = datetime.now().isoformat()
refined_data['category'] = search_keywords
refined_data['category'] = search_keywords or "all"
refined_data['posted_date'] = posted_date
await self.llm_agent.save_job_data(refined_data, search_keywords)
await self.llm_agent.save_job_data(refined_data, search_keywords or "all")
scraped_count += 1
print(f" ✅ Scraped and refined: {refined_data['title'][:50]}...")
self.engine.report_outcome("success", url=raw_data["url"])
print(f" ✅ Scraped: {refined_data['title'][:50]}... (Job ID: {job_id})")
self.engine.report_outcome("success", url=final_scrape_url)
else:
print(f" 🟡 Could not extract meaningful data from: {full_url}")
await self._add_job_to_redis_cache(full_url, job_id, "llm_failure")
self.engine.report_outcome("llm_failure", url=raw_data["url"])
print(f" 🟡 Could not extract meaningful data from: {final_scrape_url}")
await self._add_job_to_redis_cache(final_scrape_url, job_id, "llm_failure")
self.engine.report_outcome("llm_failure", url=final_scrape_url)
await job_page.close()
if apply_page != job_detail_page and not apply_page.is_closed():
await apply_page.close()
if job_detail_page and not job_detail_page.is_closed():
await job_detail_page.close()
except Exception as e:
error_msg = str(e)[:100]
print(f" ⚠️ Failed on job {idx+1}: {error_msg}")
job_id = full_url.split("/")[-1] if 'full_url' in locals() else "unknown"
job_url = full_url if 'full_url' in locals() else "unknown"
await self._add_job_to_redis_cache(job_url, job_id, f"exception: {error_msg}")
if 'job_page' in locals() and job_page:
await job_page.close()
job_id_for_log = "unknown"
if 'final_scrape_url' in locals() and final_scrape_url:
job_id_for_log = "job_" + hashlib.md5(final_scrape_url.encode()).hexdigest()[:12]
await self._add_job_to_redis_cache(href, job_id_for_log, f"exception: {error_msg}")
if job_detail_page and not job_detail_page.is_closed():
await job_detail_page.close()
if 'apply_page' in locals() and apply_page and apply_page != job_detail_page and not apply_page.is_closed():
await apply_page.close()
continue
finally:
print(" ↩️ Returning to search results...")
await page.goto(search_url, timeout=120000)
await asyncio.sleep(4 * self.human_speed)
await browser.close()
if scraped_count > 0:
self.engine.report_outcome("success")
print(f"✅ Completed! Processed {scraped_count} jobs for '{search_keywords}' based on request '{self.user_request}'.")
print(f"✅ Completed! Processed {scraped_count} jobs for '{search_keywords or 'all jobs'}'.")
else:
self.engine.report_outcome("scraping_error")
print("⚠️ No jobs processed successfully.")