Increase timeout for selector waits and refine job extraction logic in LLMJobRefiner and CryptoJobScraper

Ofure Ikheloa 2025-12-30 12:19:18 +01:00
parent 38ef08c734
commit 06f8e8b086
4 changed files with 425 additions and 224 deletions

View File

@@ -27,7 +27,7 @@ class StealthyFetcher:
         if wait_for_selector:
             try:
-                await page.wait_for_selector(wait_for_selector, timeout=40000)
+                await page.wait_for_selector(wait_for_selector, timeout=120000)
             except PlaywrightTimeoutError:
                 print(f"Selector {wait_for_selector} not found immediately, continuing...")
@@ -88,7 +88,7 @@ class StealthyFetcher:
     async def _is_content_accessible(self, page: Page, wait_for_selector: Optional[str] = None) -> bool:
         if wait_for_selector:
             try:
-                await page.wait_for_selector(wait_for_selector, timeout=40000)
+                await page.wait_for_selector(wait_for_selector, timeout=120000)
                 return True
             except PlaywrightTimeoutError:
                 pass
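
Note: for reference, a minimal standalone sketch of the wait-and-continue pattern these two hunks retune, using Playwright's async API. The helper name and the 120-second ceiling mirror the diff; everything else is illustrative.

    # Wait up to timeout_ms for a selector, but never abort the fetch on timeout.
    from typing import Optional
    from playwright.async_api import Page, TimeoutError as PlaywrightTimeoutError

    async def wait_for_selector_or_continue(page: Page, selector: Optional[str],
                                             timeout_ms: int = 120000) -> bool:
        """Return True if the selector appeared, False if we timed out."""
        if not selector:
            return True
        try:
            await page.wait_for_selector(selector, timeout=timeout_ms)
            return True
        except PlaywrightTimeoutError:
            print(f"Selector {selector} not found within {timeout_ms} ms, continuing...")
            return False
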

View File

@@ -21,13 +21,12 @@ class LLMJobRefiner:
            raise ValueError("DEEPSEEK_API_KEY not found in .env file.")

        # Database credentials from .env
-        self.db_url = os.getenv("DB_URL")
        self.db_username = os.getenv("DB_USERNAME")
        self.db_password = os.getenv("DB_PASSWORD")
        self.db_host = os.getenv("DB_HOST")
        self.db_port = os.getenv("DB_PORT")
-        if not self.db_url or not self.db_username or not self.db_password:
+        if not self.db_username or not self.db_password:
            raise ValueError("Database credentials not found in .env file.")

        # DeepSeek uses OpenAI-compatible API
@@ -41,16 +40,6 @@ class LLMJobRefiner:
    def _init_db(self):
        """Initialize PostgreSQL database connection and create table"""
        try:
-            self.db_url = os.getenv("DB_URL")
-            if self.db_url and "supabase.com" in self.db_url:
-                conn = psycopg2.connect(
-                    host=self.db_host,
-                    port=self.db_port,
-                    database="postgres",
-                    user=self.db_username,
-                    password=self.db_password
-                )
-            else:
            conn = psycopg2.connect(
                host=self.db_host,
                port=self.db_port,
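
Note: a minimal sketch of the simplified connection path, assuming the same DB_HOST/DB_PORT/DB_USERNAME/DB_PASSWORD variables from .env; the DB_NAME variable and its "postgres" default are assumptions for illustration.

    import os
    import psycopg2

    def connect_from_env():
        # Single code path: no Supabase-specific branch, just the env-configured host.
        return psycopg2.connect(
            host=os.getenv("DB_HOST"),
            port=os.getenv("DB_PORT"),
            database=os.getenv("DB_NAME", "postgres"),  # assumed variable/default
            user=os.getenv("DB_USERNAME"),
            password=os.getenv("DB_PASSWORD"),
        )
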
@@ -113,8 +102,8 @@ class LLMJobRefiner:
            text = re.sub(r'\s+', ' ', text)

            # Limit length for LLM context
-            if len(text) > 10000:
-                text = text[:10000] + "..."
+            if len(text) > 100000:
+                text = text[:100000] + "..."
            return text
        except Exception as e:
@@ -128,7 +117,7 @@ class LLMJobRefiner:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
-                temperature=0.2,
+                temperature=0.1,
                max_tokens=2048,
                stream=False
            )
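
Note: a self-contained sketch of the OpenAI-compatible DeepSeek call with the lowered temperature. The base_url and model id shown are assumptions for illustration; the real code reads them from configuration.

    from openai import OpenAI

    client = OpenAI(api_key="YOUR_DEEPSEEK_API_KEY",
                    base_url="https://api.deepseek.com")   # assumed endpoint
    response = client.chat.completions.create(
        model="deepseek-chat",                              # assumed model id
        messages=[{"role": "user", "content": "Extract the job fields as JSON."}],
        temperature=0.1,      # lower temperature -> more deterministic extraction
        max_tokens=2048,
        stream=False,
    )
    print(response.choices[0].message.content)
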
@@ -145,26 +134,32 @@ class LLMJobRefiner:
        posted_date = raw_data.get('posted_date', datetime.now().strftime("%m/%d/%y"))

        prompt = f"""
-You are a job posting data extractor.
-EXTRACT EXACT TEXT - DO NOT SUMMARIZE, PARAPHRASE, OR INVENT.
+You are an expert job posting data extractor. Your task is to extract AND infer fields from the provided job posting.
+
+### CORE RULES:
+1. **NEVER invent, summarize, or paraphrase** - extract **exact wording** when available.
+2. **For critical fields (title, company_name, job_id, url, description):**
+   - These MUST be present and meaningful.
+   - If not explicitly stated, **infer from context** (e.g., page title, headings, "About Us", etc.).
+   - **NEVER return "Not provided" or "N/A" for these fields.**
+3. **For optional fields (location, salary_range, etc.):**
+   - Extract exact text if present.
+   - If absent but inferable (e.g., "Remote (US)", "Full-time"), **infer it**.
+   - Only return "Not provided" if truly absent and non-inferable.

-For these critical fields, follow these rules:
-- description: Extract ALL job description text. If ANY job details exist (duties, responsibilities, overview), include them. Only use "Not provided" if absolutely no description exists.
-- requirements: Extract ALL requirements text. If ANY requirements exist (skills, experience, education needed), include them. Only use "Not provided" if none exist.
-- qualifications: Extract ALL qualifications text. If ANY qualifications exist, include them. Only use "Not provided" if none exist.
+### FIELD DEFINITIONS:
+- **title**: The job title. Look in <h1>, page title, or bold headings.
+- **company_name**: Company name. Look in logo alt text, footer, "About [X]", or page title.
+- **description**: Main job overview, responsibilities, or duties. Combine relevant paragraphs if needed. **Must not be empty.**
+- **requirements**: Required skills, experience, or qualifications.
+- **qualifications**: Educational or certification requirements.
+- **location**: Office location or remote policy.
+- **salary_range**: Exact compensation info.
+- **nature_of_work**: Employment type (Full-time, Contract, Internship, etc.).

-REQUIRED FIELDS (must have valid values, never "N/A"):
-- title, company_name, job_id, url
-
-OPTIONAL FIELDS (can be "Not provided"):
-- location, salary_range, nature_of_work
-
-Page Content:
-{cleaned_content}
-
-Response format (ONLY return this JSON):
-{{
+### OUTPUT FORMAT:
+Return ONLY a valid JSON object with these keys:
+{{
    "title": "...",
    "company_name": "...",
    "location": "...",
@@ -175,8 +170,16 @@ class LLMJobRefiner:
    "nature_of_work": "...",
    "job_id": "{job_id}",
    "url": "{url}"
}}
-"""
+
+- **Critical fields must NEVER be "Not provided", "N/A", empty, or generic** (e.g., "Company", "Job Title").
+- **Optional fields may be "Not provided" ONLY if truly absent.**
+- **Do not include markdown, explanations, or extra text.**
+- **Use double quotes for JSON.**
+
+Page Content:
+{cleaned_content}
+"""

        try:
            response_text = await asyncio.get_event_loop().run_in_executor(
@@ -188,31 +191,23 @@ class LLMJobRefiner:
            if not refined_data:
                return None

-            # Validate required fields
-            required_fields = ['title', 'company_name', 'job_id', 'url']
-            for field in required_fields:
-                if not refined_data.get(field) or refined_data[field].strip() in ["N/A", "", "Unknown", "Company", "Job"]:
-                    return None
-
-            # CRITICAL: Validate content fields - check if they SHOULD exist
-            content_fields = ['description', 'requirements', 'qualifications']
-            cleaned_original = cleaned_content.lower()
-
-            # Simple heuristic: if page contains job-related keywords, content fields should NOT be "Not provided"
-            job_indicators = ['responsibilit', 'duties', 'require', 'qualifi', 'skill', 'experienc', 'educat', 'degree', 'bachelor', 'master']
-            has_job_content = any(indicator in cleaned_original for indicator in job_indicators)
-
-            if has_job_content:
-                for field in content_fields:
-                    value = refined_data.get(field, "").strip()
-                    if value in ["Not provided", "N/A", ""]:
-                        # LLM failed to extract existing content
-                        print(f"   ⚠️ LLM returned '{value}' for {field} but job content appears present")
-                        return None
+            # Validate critical fields — reject if missing or placeholder
+            critical_fields = ['title', 'company_name', 'job_id', 'url', 'description']
+            for field in critical_fields:
+                value = refined_data.get(field, "").strip()
+                if not value or value.lower() in ["n/a", "not provided", "unknown", "company", "job", "title", ""]:
+                    print(f"   ❌ Critical field '{field}' is invalid: '{value}'")
+                    return None  # This job will NOT be saved — as per requirement
+
+            # Optional fields: allow "Not provided", but ensure they're strings
+            optional_fields = ['location', 'requirements', 'qualifications', 'salary_range', 'nature_of_work']
+            for field in optional_fields:
+                if field not in refined_data:
+                    refined_data[field] = "Not provided"
+                elif not isinstance(refined_data[field], str):
+                    refined_data[field] = str(refined_data[field])
+
+            # Add the posted_date to the refined data
            refined_data['posted_date'] = posted_date
            return refined_data

        except Exception as e:
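
Note: a standalone sketch of the stricter critical-field check above, with a quick usage check; the sample payloads are invented.

    PLACEHOLDERS = {"n/a", "not provided", "unknown", "company", "job", "title", ""}
    CRITICAL_FIELDS = ["title", "company_name", "job_id", "url", "description"]

    def has_valid_critical_fields(refined: dict) -> bool:
        """Reject a record if any critical field is missing, empty, or a placeholder."""
        for field in CRITICAL_FIELDS:
            value = str(refined.get(field, "")).strip()
            if not value or value.lower() in PLACEHOLDERS:
                return False
        return True

    print(has_valid_critical_fields({"title": "N/A"}))  # False
    print(has_valid_critical_fields({
        "title": "Solidity Developer", "company_name": "Example Labs",
        "job_id": "12345", "url": "https://example.com/job/12345",
        "description": "Build and audit smart contracts."
    }))  # True
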
@@ -220,15 +215,22 @@ class LLMJobRefiner:
            return None

    def _parse_llm_response(self, response_text: str) -> Dict[str, Any]:
+        # Try to extract JSON from markdown code block
        json_match = re.search(r'```(?:json)?\s*({.*?})\s*```', response_text, re.DOTALL)
        if not json_match:
-            json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
+            # Try to find raw JSON object
+            json_match = re.search(r'\{[^{}]*\{[^{}]*\}[^{}]*\}|\{.*\}', response_text, re.DOTALL)

        if not json_match:
            return None

        try:
-            return json.loads(json_match.group(1) if '```' in response_text else json_match.group(0))
-        except json.JSONDecodeError:
+            json_str = json_match.group(1) if '```' in response_text else json_match.group(0)
+            # Clean common issues
+            json_str = re.sub(r'\s+', ' ', json_str)
+            json_str = re.sub(r',\s*([\]}\)])', r'\1', json_str)  # Remove trailing commas
+            return json.loads(json_str)
+        except json.JSONDecodeError as e:
+            print(f"JSON parsing error: {e}")
            return None

    async def save_job_data(self, job_data: Dict[str, Any], keyword: str):
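
Note: a self-contained sketch of the more forgiving parsing path above, with a quick check; the sample LLM response is invented.

    import json
    import re

    def parse_llm_json(response_text: str):
        # Prefer a fenced ```json block, otherwise fall back to the first raw object.
        match = re.search(r'```(?:json)?\s*({.*?})\s*```', response_text, re.DOTALL)
        if not match:
            match = re.search(r'\{.*\}', response_text, re.DOTALL)
        if not match:
            return None
        json_str = match.group(1) if '```' in response_text else match.group(0)
        json_str = re.sub(r',\s*([\]}])', r'\1', json_str)  # drop trailing commas
        try:
            return json.loads(json_str)
        except json.JSONDecodeError:
            return None

    print(parse_llm_json('```json\n{"title": "Solidity Developer",}\n```'))
    # -> {'title': 'Solidity Developer'}
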
@@ -254,50 +256,50 @@ class LLMJobRefiner:
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                ON CONFLICT (job_id) DO NOTHING
            ''', (
-                job_data.get("title", "N/A"),
-                job_data.get("company_name", "N/A"),
-                job_data.get("location", "N/A"),
-                job_data.get("description", "N/A"),
-                job_data.get("requirements", "N/A"),
-                job_data.get("qualifications", "N/A"),
-                job_data.get("salary_range", "N/A"),
-                job_data.get("nature_of_work", "N/A"),
-                job_data.get("job_id", "N/A"),
+                job_data.get("title", "Not provided"),
+                job_data.get("company_name", "Not provided"),
+                job_data.get("location", "Not provided"),
+                job_data.get("description", "Not provided"),
+                job_data.get("requirements", "Not provided"),
+                job_data.get("qualifications", "Not provided"),
+                job_data.get("salary_range", "Not provided"),
+                job_data.get("nature_of_work", "Not provided"),
+                job_data.get("job_id", "unknown"),
                job_data.get("url", "N/A"),
-                job_data.get("category", "N/A"),
+                job_data.get("category", "all"),
                job_data.get("scraped_at"),
-                job_data.get("posted_date", "N/A")
+                job_data.get("posted_date", datetime.now().strftime("%m/%d/%y"))
            ))

            conn.commit()
            cursor.close()
            conn.close()
-            print(f"   💾 Saved job to category '{job_data.get('category', 'N/A')}' with job_id: {job_data.get('job_id', 'N/A')}")
+            print(f"   💾 Saved job to category '{job_data.get('category', 'all')}' with job_id: {job_data.get('job_id', 'unknown')}")
        except Exception as e:
            print(f"❌ Database save error: {e}")

    async def _save_to_markdown(self, job_data: Dict[str, Any], keyword: str):
-        os.makedirs("linkedin_jobs", exist_ok=True)
-        filepath = os.path.join("linkedin_jobs", "linkedin_jobs_scraped.md")
+        os.makedirs("crypto_jobs", exist_ok=True)
+        filepath = os.path.join("crypto_jobs", "crypto_jobs_scraped.md")
        write_header = not os.path.exists(filepath) or os.path.getsize(filepath) == 0

        with open(filepath, "a", encoding="utf-8") as f:
            if write_header:
-                f.write(f"# LinkedIn Jobs - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
-            f.write(f"## Job: {job_data.get('title', 'N/A')}\n\n")
+                f.write(f"# Crypto Jobs - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
+            f.write(f"## Job: {job_data.get('title', 'Not provided')}\n\n")
            f.write(f"- **Keyword**: {keyword}\n")
-            f.write(f"- **Company**: {job_data.get('company_name', 'N/A')}\n")
-            f.write(f"- **Location**: {job_data.get('location', 'N/A')}\n")
-            f.write(f"- **Nature of Work**: {job_data.get('nature_of_work', 'N/A')}\n")
-            f.write(f"- **Salary Range**: {job_data.get('salary_range', 'N/A')}\n")
-            f.write(f"- **Job ID**: {job_data.get('job_id', 'N/A')}\n")
+            f.write(f"- **Company**: {job_data.get('company_name', 'Not provided')}\n")
+            f.write(f"- **Location**: {job_data.get('location', 'Not provided')}\n")
+            f.write(f"- **Nature of Work**: {job_data.get('nature_of_work', 'Not provided')}\n")
+            f.write(f"- **Salary Range**: {job_data.get('salary_range', 'Not provided')}\n")
+            f.write(f"- **Job ID**: {job_data.get('job_id', 'unknown')}\n")
            f.write(f"- **Posted Date**: {job_data.get('posted_date', 'N/A')}\n")
-            f.write(f"- **Category**: {job_data.get('category', 'N/A')}\n")
+            f.write(f"- **Category**: {job_data.get('category', 'all')}\n")
            f.write(f"- **Scraped At**: {job_data.get('scraped_at', 'N/A')}\n")
            f.write(f"- **URL**: <{job_data.get('url', 'N/A')}>\n\n")
-            f.write(f"### Description\n\n{job_data.get('description', 'N/A')}\n\n")
-            f.write(f"### Requirements\n\n{job_data.get('requirements', 'N/A')}\n\n")
-            f.write(f"### Qualifications\n\n{job_data.get('qualifications', 'N/A')}\n\n")
+            f.write(f"### Description\n\n{job_data.get('description', 'Not provided')}\n\n")
+            f.write(f"### Requirements\n\n{job_data.get('requirements', 'Not provided')}\n\n")
+            f.write(f"### Qualifications\n\n{job_data.get('qualifications', 'Not provided')}\n\n")
            f.write("---\n\n")
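
Note: a small sketch of the dedup behaviour the INSERT relies on: with a UNIQUE or PRIMARY KEY constraint on job_id, replaying the same job is a no-op. Table and column names are assumptions for illustration, not the real schema.

    def save_once(conn, job_id: str, title: str, url: str) -> None:
        """conn is an open psycopg2 connection."""
        with conn, conn.cursor() as cur:   # commits on success, rolls back on error
            cur.execute(
                "INSERT INTO jobs (job_id, title, url) VALUES (%s, %s, %s) "
                "ON CONFLICT (job_id) DO NOTHING",
                (job_id, title, url),
            )
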

main.py
View File

@@ -1,3 +1,4 @@
from scraping_engine import FingerprintScrapingEngine
from scraper import CryptoJobScraper  # Updated class name
import os
@@ -20,16 +21,15 @@ async def main():
    scraper = CryptoJobScraper(engine, human_speed=1.3, user_request="Extract title, company, location, description, requirements, qualifications, nature of work, and salary")

    job_titles = [
-        "Blockchain Engineer",
-        "Smart Contract Developer",
-        "DeFi Analyst",
-        "Web3 Developer",
-        "Crypto Researcher",
-        "Solidity Developer",
-        "Protocol Engineer",
-        "Tokenomics Specialist",
-        "Zero-Knowledge Proof Engineer",
-        "Crypto Compliance Officer"
+        "Customer Support",
+        "Design",
+        "Engineering",
+        "Finance",
+        "Marketing",
+        "Operations",
+        "Product",
+        "Sales"
    ]

    while True:

View File

@@ -6,10 +6,11 @@ from playwright.async_api import async_playwright, TimeoutError as PlaywrightTim
from browserforge.injectors.playwright import AsyncNewContext
from llm_agent import LLMJobRefiner
import re
-from fetcher import StealthyFetcher
from datetime import datetime
import json
import redis
+from urllib.parse import urlparse
+import hashlib


class CryptoJobScraper:
@@ -25,7 +26,29 @@ class CryptoJobScraper:
        self.human_speed = human_speed
        self.user_request = user_request
        self.llm_agent = LLMJobRefiner()
        self.redis_client = redis.Redis(host='localhost', port=6379, db=0, decode_responses=True)
+        self.FORBIDDEN_ATS_DOMAINS = [
+            'ashby', 'ashbyhq',
+            'greenhouse', 'boards.greenhouse.io',
+            'gem', 'gem.com',
+            'rippling',
+            'myworkday', 'myworkdayjobs',
+            'smartrecruiters',
+            'workable',
+            'lever', 'jobs.lever.co',
+        ]
+        self.INVALID_CONTENT_PHRASES = [
+            "invalid job url",
+            "cookie consent",
+            "privacy policy",
+            "not a valid job",
+            "job not found",
+            "page not found",
+            "The requested job post could not be found. It may have been removed.",
+            "this page does not contain a job description"
+        ]

    async def _human_click(self, page, element, wait_after: bool = True):
        if not element:
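
Note: a standalone sketch of how these two lists gate a candidate page, mirroring the _is_forbidden_ats_url / _is_invalid_job_page helpers added further down; the shortened lists and sample inputs are invented.

    # List entries are kept lowercase because the URL and page text are
    # lowercased before the substring match.
    FORBIDDEN_ATS_DOMAINS = ['ashby', 'greenhouse', 'lever', 'myworkday']
    INVALID_CONTENT_PHRASES = ['job not found', 'page not found']

    def is_forbidden_ats_url(url: str) -> bool:
        url_lower = url.lower()
        return any(domain in url_lower for domain in FORBIDDEN_ATS_DOMAINS)

    def is_invalid_job_page(page_content: str) -> bool:
        content_lower = page_content.lower()
        return any(phrase in content_lower for phrase in INVALID_CONTENT_PHRASES)

    print(is_forbidden_ats_url("https://jobs.lever.co/example/123"))   # True -> skip this job
    print(is_invalid_job_page("<h1>Job not found</h1>"))               # True -> skip this page
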
@@ -55,60 +78,127 @@ class CryptoJobScraper:
        matches = sum(1 for kw in keyword_list if kw in title_lower)
        return matches / len(keyword_list) if keyword_list else 0.0

-    async def _scrape_jobs_from_current_page(self, page, search_keywords: str, seen_job_ids, all_job_links):
-        current_links = await page.query_selector_all("a[href*='/job/']")
-        new_jobs = 0
-
-        for link in current_links:
-            href = await link.get_attribute("href")
-            if not href or not href.startswith("http"):
-                href = "https://cryptocurrencyjobs.co" + href
-            job_id = href.split("/")[-1] if href.endswith("/") else href.split("/")[-1]
-
-            if job_id and job_id not in seen_job_ids:
-                title_element = await link.query_selector("h3, .job-title")
-                title = (await title_element.inner_text()) if title_element else "Unknown Title"
-                match_percentage = self._calculate_keyword_match(title, search_keywords)
-
-                if match_percentage >= 0.5:  # Lower threshold than LinkedIn
-                    seen_job_ids.add(job_id)
-                    all_job_links.append((href, title))
-                    new_jobs += 1
-                else:
-                    print(f"   ⚠️ Skipping job due to low keyword match: {title[:50]}... (match: {match_percentage:.2%})")
-
-        return new_jobs
-
-    async def _handle_pagination(self, page, search_keywords: str, seen_job_ids, all_job_links):
-        current_page = 1
-        while True:
-            print(f"📄 Processing page {current_page}")
-            new_jobs = await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links)
-            print(f"   Found {new_jobs} new job(s) (total: {len(all_job_links)})")
-
-            next_btn = await page.query_selector('a[rel="next"]')
-            if next_btn:
-                next_url = await next_btn.get_attribute("href")
-                if next_url and not next_url.startswith("http"):
-                    next_url = "https://cryptocurrencyjobs.co" + next_url
-                await page.goto(next_url, timeout=120000)
-                await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)
-                current_page += 1
-            else:
-                print("🔚 No 'Next' page — stopping pagination.")
-                break
-
-    async def _extract_job_posted_date(self, page) -> str:
-        try:
-            date_element = await page.query_selector(".job-posted-date, .job-date, time")
-            if date_element:
-                date_text = await date_element.inner_text()
-                if "Today" in date_text:
-                    return datetime.now().strftime("%m/%d/%y")
-                elif "Yesterday" in date_text:
-                    yesterday = datetime.now().replace(day=datetime.now().day - 1)
-                    return yesterday.strftime("%m/%d/%y")
-                else:
-                    return datetime.now().strftime("%m/%d/%y")
-        except:
-            pass
-        return datetime.now().strftime("%m/%d/%y")
+    async def _extract_job_title_from_card(self, card) -> str:
+        try:
+            title_selectors = [
+                'h3', 'h2', 'h4',
+                'strong', 'span'
+            ]
+            for selector in title_selectors:
+                title_element = await card.query_selector(selector)
+                if title_element:
+                    title_text = await title_element.inner_text()
+                    if title_text and len(title_text.strip()) > 3:
+                        return title_text.strip()
+
+            card_text = await card.inner_text()
+            lines = [line.strip() for line in card_text.split('\n') if line.strip()]
+            if lines:
+                for line in lines:
+                    if len(line) > 5 and not any(skip in line.lower() for skip in ['today', 'featured', 'yesterday', 'company', 'location']):
+                        return line
+            return "Unknown Title"
+        except:
+            return "Unknown Title"
+
+    async def _collect_job_elements_from_page(self, page, search_keywords: str, seen_slugs):
+        job_cards = []
+        job_found = False
+        await asyncio.sleep(3 * self.human_speed)
+
+        try:
+            await page.wait_for_selector('a[href^="/"][href*="-"]', timeout=60000)
+            candidates = await page.query_selector_all('a[href^="/"][href*="-"]')
+
+            for link in candidates:
+                href = await link.get_attribute("href") or ""
+                href = href.rstrip('/')
+                if not href or len(href.split('/')) != 3:
+                    continue
+                if '-' not in href.split('/')[-1]:
+                    continue
+
+                slug = href.split('/')[-1]
+                if len(slug) < 8 or slug.startswith(('login', 'signup', 'about', 'terms')):
+                    continue
+
+                full_url = "https://cryptocurrencyjobs.co" + href if not href.startswith('http') else href
+                if slug in seen_slugs:
+                    continue
+
+                title = await self._extract_job_title_from_card(link)
+                if not title or title == "Unknown Title":
+                    title = slug.replace('-', ' ').title()
+
+                match_percentage = self._calculate_keyword_match(title, search_keywords)
+                if match_percentage >= 0.4 or not search_keywords.strip():
+                    seen_slugs.add(slug)
+                    job_cards.append((full_url, title, link))
+                    job_found = True
+
+            print(f"   ✅ Collected {len(job_cards)} valid job links after filtering ({len(candidates)} raw candidates).")
+
+        except Exception as e:
+            print(f"   ⚠️ Error collecting job cards: {e}")
+
+        if not job_found:
+            print("   ❌ No valid job listings passed filters.")
+
+        return job_cards
+
+    async def _handle_pagination_and_collect_all(self, page, search_keywords: str, seen_slugs):
+        all_job_elements = []
+        scroll_attempt = 0
+        max_scrolls = 40
+        prev_count = 0
+
+        while scroll_attempt < max_scrolls:
+            print(f"   Scroll attempt {scroll_attempt + 1} | Current total jobs: {len(all_job_elements)}")
+            page_elements = await self._collect_job_elements_from_page(page, search_keywords, seen_slugs)
+            all_job_elements.extend(page_elements)
+
+            current_count = len(all_job_elements)
+            if current_count == prev_count and scroll_attempt > 3:
+                print("   🔚 No new jobs after several scrolls → assuming end of list.")
+                break
+            prev_count = current_count
+
+            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+            await asyncio.sleep(random.uniform(2.5, 5.5) * self.human_speed)
+
+            try:
+                load_more = await page.query_selector(
+                    'button:has-text("Load more"), button:has-text("More"), div[role="button"]:has-text("Load"), a:has-text("Load more")'
+                )
+                if load_more:
+                    print("   Found 'Load more' button → clicking...")
+                    await self._human_click(page, load_more)
+                    await asyncio.sleep(random.uniform(3.0, 6.0) * self.human_speed)
+            except:
+                pass
+
+            scroll_attempt += 1
+
+        print(f"   Finished scrolling → collected {len(all_job_elements)} unique job links.")
+        return all_job_elements
+
+    async def _extract_job_posted_date_from_card(self, card) -> str:
+        try:
+            card_text = await card.inner_text()
+            if "Today" in card_text:
+                return datetime.now().strftime("%m/%d/%y")
+            elif "Yesterday" in card_text:
+                from datetime import timedelta
+                return (datetime.now() - timedelta(days=1)).strftime("%m/%d/%y")
+            else:
+                match = re.search(r'(\d+)d', card_text)
+                if match:
+                    days = int(match.group(1))
+                    from datetime import timedelta
+                    return (datetime.now() - timedelta(days=days)).strftime("%m/%d/%y")
+        except:
+            pass
+        return datetime.now().strftime("%m/%d/%y")
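
Note: a standalone sketch of the relative-date handling above ("Today", "Yesterday", "3d"); the sample card text is invented.

    import re
    from datetime import datetime, timedelta

    def posted_date_from_card_text(card_text: str) -> str:
        today = datetime.now()
        if "Today" in card_text:
            return today.strftime("%m/%d/%y")
        if "Yesterday" in card_text:
            return (today - timedelta(days=1)).strftime("%m/%d/%y")
        match = re.search(r'(\d+)d', card_text)
        if match:
            return (today - timedelta(days=int(match.group(1)))).strftime("%m/%d/%y")
        return today.strftime("%m/%d/%y")   # fall back to today when nothing matches

    print(posted_date_from_card_text("Featured · 3d · Remote"))
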
@@ -126,15 +216,62 @@ class CryptoJobScraper:
        except Exception as e:
            print(f"   ❌ Failed to add job to Redis cache: {str(e)}")

+    async def _is_forbidden_ats_url(self, url: str) -> bool:
+        url_lower = url.lower()
+        return any(domain in url_lower for domain in self.FORBIDDEN_ATS_DOMAINS)
+
+    async def _is_invalid_job_page(self, page_content: str) -> bool:
+        content_lower = page_content.lower()
+        return any(phrase in content_lower for phrase in self.INVALID_CONTENT_PHRASES)
+
+    def _extract_job_id_from_url(self, url: str) -> Optional[str]:
+        """
+        Extract job ID from URL. Returns ID if it contains at least one digit.
+        Otherwise, returns None (but does NOT mean skip!).
+        """
+        try:
+            parsed = urlparse(url)
+            path_parts = [p for p in parsed.path.split('/') if p]
+            if not path_parts:
+                return None
+            candidate = path_parts[-1]
+            candidate = re.split(r'[?#]', candidate)[0]
+            candidate = re.sub(r'\.html?$', '', candidate)
+            if not candidate or not any(c.isdigit() for c in candidate):
+                return None
+            # Avoid title-like strings (with spaces or long words + no structure)
+            if re.search(r'[A-Za-z]{6,}\s', candidate):
+                return None
+            return candidate
+        except:
+            return None
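
Note: a standalone sketch of the ID strategy above: prefer a digit-bearing URL slug, otherwise fall back to a stable hash of the URL (as the scrape loop below does). The sample URLs are invented.

    import hashlib
    import re
    from urllib.parse import urlparse

    def job_id_from_url(url: str) -> str:
        parsed = urlparse(url)
        parts = [p for p in parsed.path.split('/') if p]
        candidate = re.sub(r'\.html?$', '', parts[-1]) if parts else ""
        if candidate and any(c.isdigit() for c in candidate):
            return candidate
        # No usable slug -> derive a stable 12-character ID from the URL itself
        return "job_" + hashlib.md5(url.encode()).hexdigest()[:12]

    print(job_id_from_url("https://example.com/jobs/senior-dev-4821"))  # senior-dev-4821
    print(job_id_from_url("https://example.com/careers/apply"))         # job_ + md5 prefix
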
    async def scrape_jobs(
        self,
        search_keywords: Optional[str],
        max_pages: int = 1,
        credentials: Optional[Dict] = None
    ):
-        # cryptocurrencyjobs.co uses URL params differently
-        encoded_keywords = search_keywords.replace(" ", "%20")
-        search_url = f"https://cryptocurrencyjobs.co/?q={encoded_keywords}"
+        query = ""
+        location = ""
+        if search_keywords and search_keywords.strip():
+            parts = search_keywords.split(',', 1)
+            query = parts[0].strip()
+            if len(parts) > 1:
+                location = parts[1].strip()
+
+        clean_query = query.replace(' ', '+')
+        clean_location = location.replace(' ', '+')
+
+        search_url = "https://cryptocurrencyjobs.co/"
+        if clean_query:
+            search_url += f"?query={clean_query}"
+        if clean_location:
+            search_url += f"&location={clean_location}"

        profile = self.engine._select_profile()
        renderer = random.choice(self.engine.common_renderers[self.engine.os])
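
Note: a sketch of the "keyword[, location]" to query-string handling above; the example input is invented. As written, a location-only input would append "&location=..." without a leading "?", so inputs are assumed to always carry a query term.

    def build_search_url(search_keywords: str) -> str:
        query, location = "", ""
        if search_keywords and search_keywords.strip():
            parts = search_keywords.split(',', 1)
            query = parts[0].strip()
            if len(parts) > 1:
                location = parts[1].strip()
        url = "https://cryptocurrencyjobs.co/"
        if query:
            url += f"?query={query.replace(' ', '+')}"
        if location:
            url += f"&location={location.replace(' ', '+')}"
        return url

    print(build_search_url("Engineering, Remote"))
    # -> https://cryptocurrencyjobs.co/?query=Engineering&location=Remote
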
@@ -156,46 +293,107 @@ class CryptoJobScraper:
            await context.add_init_script(spoof_script)
            page = await context.new_page()

-            # Fetch main search page
-            print(f"🔍 Searching for: {search_keywords}")
-            await page.goto(search_url, wait_until='load', timeout=120000)
+            print(f"🔍 Searching for: {search_keywords or 'all jobs'}")
+            print(f"   🔗 URL: {search_url}")
+            await page.goto(search_url, wait_until='networkidle', timeout=120000)
            await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)

-            all_job_links = []
-            seen_job_ids = set()
-
-            print("🔄 Collecting job links from search results...")
-            await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links)
-            await self._handle_pagination(page, search_keywords, seen_job_ids, all_job_links)
-            print(f"✅ Collected {len(all_job_links)} unique job links.")
+            try:
+                await page.wait_for_selector("a[href^='/'][href*='-']", timeout=10000)
+            except:
+                print("   ⚠️ No job links found initially, waiting longer...")
+                await asyncio.sleep(5 * self.human_speed)
+
+            seen_slugs = set()
+            all_job_elements = await self._handle_pagination_and_collect_all(page, search_keywords, seen_slugs)
+            print(f"✅ Collected {len(all_job_elements)} unique job links.")

            scraped_count = 0
-            for idx, (href, title) in enumerate(all_job_links):
+            for idx, (href, title, job_element) in enumerate(all_job_elements):
+                job_detail_page = None
+                apply_page = None
+                skip_job = False
+                final_scrape_url = None
                try:
-                    full_url = href
-                    print(f"   → Opening job {idx+1}/{len(all_job_links)}: {full_url}")
-
-                    fetcher = StealthyFetcher(self.engine, browser, context)
-                    job_page = await fetcher.fetch_url(full_url, wait_for_selector="h1")
-                    if not job_page:
-                        print(f"   ❌ Failed to fetch job page {full_url}")
-                        await self._add_job_to_redis_cache(full_url, full_url.split("/")[-1], "fetch_failure")
-                        self.engine.report_outcome("fetch_failure", url=full_url)
+                    print(f"   → Processing job {idx+1}/{len(all_job_elements)}: {title}")
+
+                    posted_date = await self._extract_job_posted_date_from_card(job_element)
+
+                    job_detail_page = await context.new_page()
+                    await job_detail_page.goto(href, wait_until='networkidle', timeout=60000)
+                    await asyncio.sleep(2 * self.human_speed)
+
+                    # Check for invalid content
+                    page_content = await job_detail_page.content()
+                    if await self._is_invalid_job_page(page_content):
+                        print("   🚫 Page contains invalid content → skipping.")
+                        await job_detail_page.close()
                        continue

-                    posted_date = await self._extract_job_posted_date(job_page)
-
-                    await self.engine._human_like_scroll(job_page)
-                    await asyncio.sleep(2 * self.human_speed)
-                    page_content = await self._extract_page_content_for_llm(job_page)
-
-                    job_id = full_url.split("/")[-1] if full_url.split("/")[-1] else "unknown"
+                    # Try to click apply
+                    apply_clicked = False
+                    apply_selectors = [
+                        'a[href*="apply"], a:text("Apply"), a:text("Apply Now"), a:text("Apply here")',
+                        'button:text("Apply"), button:has-text("Apply")',
+                        '[data-testid="apply-button"], [aria-label*="apply"], [role="button"]:has-text("Apply")',
+                        'a.btn-apply, .apply-button, .apply-link, a:has-text("Apply")',
+                        'a[rel="noopener"]:has-text("Apply")',
+                    ]
+
+                    for sel in apply_selectors:
+                        apply_elem = await job_detail_page.query_selector(sel)
+                        if apply_elem:
+                            print(f"   🔗 Found Apply element with selector: {sel}")
+                            await self._human_click(job_detail_page, apply_elem, wait_after=True)
+                            apply_clicked = True
+                            break
+
+                    apply_page = job_detail_page
+                    if apply_clicked:
+                        await asyncio.sleep(random.uniform(3.0, 6.0) * self.human_speed)
+                        pages = context.pages
+                        new_pages = [p for p in pages if p != job_detail_page and p.url != "about:blank"]
+                        if new_pages:
+                            candidate_page = new_pages[-1]
+                            new_url = candidate_page.url.strip()
+                            print(f"   New tab opened: {new_url}")
+                            if new_url and await self._is_forbidden_ats_url(new_url):
+                                print("   🚫 New URL is a forbidden ATS → skipping job.")
+                                if candidate_page != job_detail_page:
+                                    await candidate_page.close()
+                                await job_detail_page.close()
+                                skip_job = True
+                            else:
+                                apply_page = candidate_page
+                        else:
+                            print("   No new tab → using original page.")
+
+                    if skip_job:
+                        continue
+
+                    final_scrape_url = apply_page.url
+
+                    # Re-check invalid content on final page
+                    page_content = await self._extract_page_content_for_llm(apply_page)
+                    if await self._is_invalid_job_page(page_content):
+                        print("   🚫 Final page contains invalid content → skipping.")
+                        if apply_page != job_detail_page:
+                            await apply_page.close()
+                        await job_detail_page.close()
+                        continue
+
+                    # Extract job ID — but do NOT fail if missing
+                    job_id = self._extract_job_id_from_url(final_scrape_url)
+                    if not job_id:
+                        # Fallback: hash the URL to create a stable, unique ID
+                        job_id = "job_" + hashlib.md5(final_scrape_url.encode()).hexdigest()[:12]

                    raw_data = {
                        "page_content": page_content,
-                        "url": full_url,
+                        "url": final_scrape_url,
                        "job_id": job_id,
                        "search_keywords": search_keywords,
                        "posted_date": posted_date
@@ -210,44 +408,45 @@ class CryptoJobScraper:
                            if field == 'job_id':
                                refined_data[field] = job_id
                            elif field == 'url':
-                                refined_data[field] = full_url
+                                refined_data[field] = final_scrape_url
                            elif field == 'company_name':
                                refined_data[field] = "Unknown Company"

                        refined_data['scraped_at'] = datetime.now().isoformat()
-                        refined_data['category'] = search_keywords
+                        refined_data['category'] = search_keywords or "all"
                        refined_data['posted_date'] = posted_date

-                        await self.llm_agent.save_job_data(refined_data, search_keywords)
+                        await self.llm_agent.save_job_data(refined_data, search_keywords or "all")
                        scraped_count += 1
-                        print(f"   ✅ Scraped and refined: {refined_data['title'][:50]}...")
-                        self.engine.report_outcome("success", url=raw_data["url"])
+                        print(f"   ✅ Scraped: {refined_data['title'][:50]}... (Job ID: {job_id})")
+                        self.engine.report_outcome("success", url=final_scrape_url)
                    else:
-                        print(f"   🟡 Could not extract meaningful data from: {full_url}")
-                        await self._add_job_to_redis_cache(full_url, job_id, "llm_failure")
-                        self.engine.report_outcome("llm_failure", url=raw_data["url"])
-
-                    await job_page.close()
+                        print(f"   🟡 Could not extract meaningful data from: {final_scrape_url}")
+                        await self._add_job_to_redis_cache(final_scrape_url, job_id, "llm_failure")
+                        self.engine.report_outcome("llm_failure", url=final_scrape_url)
+
+                    if apply_page != job_detail_page and not apply_page.is_closed():
+                        await apply_page.close()
+                    if job_detail_page and not job_detail_page.is_closed():
+                        await job_detail_page.close()

                except Exception as e:
                    error_msg = str(e)[:100]
                    print(f"   ⚠️ Failed on job {idx+1}: {error_msg}")
-                    job_id = full_url.split("/")[-1] if 'full_url' in locals() else "unknown"
-                    job_url = full_url if 'full_url' in locals() else "unknown"
-                    await self._add_job_to_redis_cache(job_url, job_id, f"exception: {error_msg}")
-                    if 'job_page' in locals() and job_page:
-                        await job_page.close()
+                    job_id_for_log = "unknown"
+                    if 'final_scrape_url' in locals() and final_scrape_url:
+                        job_id_for_log = "job_" + hashlib.md5(final_scrape_url.encode()).hexdigest()[:12]
+                    await self._add_job_to_redis_cache(href, job_id_for_log, f"exception: {error_msg}")
+                    if job_detail_page and not job_detail_page.is_closed():
+                        await job_detail_page.close()
+                    if 'apply_page' in locals() and apply_page and apply_page != job_detail_page and not apply_page.is_closed():
+                        await apply_page.close()
                    continue
+                finally:
+                    print("   ↩️ Returning to search results...")
+                    await page.goto(search_url, timeout=120000)
+                    await asyncio.sleep(4 * self.human_speed)

            await browser.close()

            if scraped_count > 0:
                self.engine.report_outcome("success")
-                print(f"✅ Completed! Processed {scraped_count} jobs for '{search_keywords}' based on request '{self.user_request}'.")
+                print(f"✅ Completed! Processed {scraped_count} jobs for '{search_keywords or 'all jobs'}'.")
            else:
                self.engine.report_outcome("scraping_error")
                print("⚠️ No jobs processed successfully.")