"Specifically for scraping job postings from Amazon Jobs." import asyncio import random from typing import Optional, Dict from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError from browserforge.injectors.playwright import AsyncNewContext from llm_agent import LLMJobRefiner import re from fetcher import StealthyFetcher from datetime import datetime import json import redis class AmazonJobScraper: def __init__( self, engine, db_path: str = "amazon_jobs.db", human_speed: float = 1.0, user_request: str = "Extract all standard job details" ): self.engine = engine self.db_path = db_path self.human_speed = human_speed self.user_request = user_request self._init_db() self.llm_agent = LLMJobRefiner() self.redis_client = redis.Redis(host='localhost', port=6379, db=0, decode_responses=True) def _init_db(self): pass # Handled by LLMJobRefiner async def _human_click(self, page, element, wait_after: bool = True): if not element: return False await element.scroll_into_view_if_needed() await asyncio.sleep(random.uniform(0.3, 0.8) * self.human_speed) try: await element.click() if wait_after: await asyncio.sleep(random.uniform(2, 4) * self.human_speed) return True except: return False async def _login(self, page, credentials: Dict) -> bool: # Amazon job pages do NOT require login. # Skip login unless we're scraping internal dashboards (not needed here). return True async def _extract_page_content_for_llm(self, page) -> str: await asyncio.sleep(2 * self.human_speed) await self.engine._human_like_scroll(page) await asyncio.sleep(2 * self.human_speed) return await page.content() def _calculate_keyword_match(self, title: str, keywords: str) -> float: if not title or not keywords: return 0.0 title_lower = title.lower() keyword_list = [kw.strip().lower() for kw in keywords.split()] matches = sum(1 for kw in keyword_list if kw in title_lower) return matches / len(keyword_list) if keyword_list else 0.0 def _extract_location_from_keywords(self, search_keywords: str) -> str: location_match = re.search(r'location:\s*([^,]+)', search_keywords, re.IGNORECASE) return location_match.group(1).strip().lower() if location_match else "" async def _scrape_jobs_from_current_page(self, page, search_keywords: str, seen_job_ids, all_job_links): current_links = await page.query_selector_all("a[href*='/jobs/']") new_jobs = 0 location_from_keywords = self._extract_location_from_keywords(search_keywords) for link in current_links: href = await link.get_attribute("href") if not href or "page=" in href or "search?" in href: continue full_url = href if href.startswith("http") else f"https://www.amazon.jobs{href}" job_id = href.strip("/").split("/")[-1] if href else "unknown" if job_id and job_id not in seen_job_ids: title_element = await link.query_selector("h3") or await link.query_selector(".job-title") title = await title_element.inner_text() if title_element else "Unknown Title" match_percentage = self._calculate_keyword_match(title, search_keywords) location_match = True if location_from_keywords: location_element = await link.query_selector(".location-and-id") if location_element: location_text = await location_element.inner_text() location_match = location_from_keywords in location_text.lower() if match_percentage >= 0.7 and location_match: seen_job_ids.add(job_id) all_job_links.append((href, title)) new_jobs += 1 elif match_percentage < 0.7: print(f" âš ī¸ Skipping job due to low keyword match: {title[:50]}... 
                elif not location_match:
                    print(f"   ⚠ī¸ Skipping job due to location mismatch: {title[:50]}... (expected: {location_from_keywords})")
                else:
                    seen_job_ids.add(job_id)
                    all_job_links.append((href, "Unknown Title"))
                    new_jobs += 1

        return new_jobs

    async def _handle_pagination(self, page, search_keywords: str, seen_job_ids, all_job_links):
        current_page = 1
        while current_page <= 10:  # Amazon limits to ~10 pages publicly
            print(f"📄 Processing page {current_page}")
            new_jobs = await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links)
            print(f"   ➕ Found {new_jobs} new job(s) on page {current_page} (total: {len(all_job_links)})")

            next_btn = await page.query_selector("a[aria-label='Next page']")
            if next_btn:
                next_url = await next_btn.get_attribute("href")
                if next_url:
                    full_next_url = next_url if next_url.startswith("http") else f"https://www.amazon.jobs{next_url}"
                    print(f"   ➡ī¸ Navigating to next page: {full_next_url}")
                    await page.goto(full_next_url, timeout=120000)
                    await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)
                    current_page += 1
                else:
                    break
            else:
                print("🔚 No 'Next' button found — stopping pagination.")
                break

    async def _extract_job_posted_date(self, page) -> str:
        try:
            # Amazon often includes "Posted X days ago" in the job description
            content = await page.content()
            match = re.search(r'Posted\s+(\d+)\s+day[s]?\s+ago', content, re.IGNORECASE)
            if match:
                days_ago = int(match.group(1))
                posted_date = datetime.now() - timedelta(days=days_ago)
                return posted_date.strftime("%m/%d/%y")

            # Fallback: check for an explicit date in the page (rare)
            date_match = re.search(r'(\d{1,2})/(\d{1,2})/(\d{4})', content)
            if date_match:
                month, day, year = date_match.groups()
                return f"{month.zfill(2)}/{day.zfill(2)}/{year[-2:]}"

            # Default to today
            return datetime.now().strftime("%m/%d/%y")
        except Exception as e:
            print(f"   ⚠ī¸ Error extracting Amazon posted date: {str(e)}")
            return datetime.now().strftime("%m/%d/%y")

    async def _add_job_to_redis_cache(self, job_url: str, job_id: str, error_type: str):
        try:
            job_data = {
                "job_url": job_url,
                "job_id": job_id,
                "error_type": error_type,
                "timestamp": datetime.now().isoformat()
            }
            self.redis_client.hset("failed_jobs", job_id, json.dumps(job_data))
            print(f"   đŸ“Ļ Added failed job to Redis cache: {job_id} (Error: {error_type})")
        except Exception as e:
            print(f"   ❌ Failed to add job to Redis cache: {str(e)}")

    async def scrape_jobs(
        self,
        search_keywords: Optional[str],
        max_pages: int = 1,
        credentials: Optional[Dict] = None
    ):
        location_match = re.search(r'location:\s*([^,]+)', search_keywords, re.IGNORECASE)
        location = location_match.group(1).strip() if location_match else ""
        clean_keywords = re.sub(r'location:\s*[^,]+', '', search_keywords, flags=re.IGNORECASE).strip()

        encoded_keywords = clean_keywords.replace(" ", "+")  # Amazon uses + for spaces
        search_url = f"https://www.amazon.jobs/en/search?base_query={encoded_keywords}"
        if location:
            # Amazon filters location via `loc_query`
            search_url += f"&loc_query={location.replace(' ', '+')}"

        profile = self.engine._select_profile()
        renderer = random.choice(self.engine.common_renderers[self.engine.os])
        vendor = random.choice(self.engine.common_vendors)
        spoof_script = self.engine._get_spoof_script(renderer, vendor)

        async with async_playwright() as pw:
            browser = await pw.chromium.launch(
                headless=False,
                args=['--disable-blink-features=AutomationControlled']
            )
            context = await AsyncNewContext(browser, fingerprint=profile)
context.add_init_script(f""" Object.defineProperty(navigator, 'hardwareConcurrency', {{ get: () => {profile.navigator.hardwareConcurrency} }}); Object.defineProperty(navigator, 'deviceMemory', {{ get: () => {profile.navigator.deviceMemory} }}); Object.defineProperty(navigator, 'platform', {{ get: () => '{profile.navigator.platform}' }}); """) await context.add_init_script(spoof_script) page = await context.new_page() temp_fetcher = StealthyFetcher(self.engine, browser, context) print("✅ Bypassing login (Amazon jobs are public)...") login_successful = True await page.wait_for_load_state("load", timeout=120000) # Protection check (same as LinkedIn logic) protection_type = await temp_fetcher._detect_protection(page) if protection_type: print(f"đŸ›Ąī¸ Protection detected on initial page: {protection_type}") content_accessible = await temp_fetcher._is_content_accessible(page) if not content_accessible: handled = await self.engine._handle_cloudflare(page) if protection_type == "cloudflare" else False if not handled: await browser.close() self.engine.report_outcome("protection_block") return else: print("✅ Protection present but content accessible — proceeding.") print(f"🔍 Searching Amazon for: {search_keywords}") await page.goto(search_url, wait_until='load', timeout=120000) await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed) # Protection check on search page protection_type = await temp_fetcher._detect_protection(page) if protection_type: print(f"đŸ›Ąī¸ Protection detected on search page: {protection_type}") content_accessible = await temp_fetcher._is_content_accessible(page) if not content_accessible: handled = await self.engine._handle_cloudflare(page) if protection_type == "cloudflare" else False if not handled: await browser.close() self.engine.report_outcome("protection_block") return else: print("✅ Protection present but content accessible — proceeding.") all_job_links = [] seen_job_ids = set() print("🔄 Collecting initial job links...") initial_jobs = await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links) print(f" ➕ Found {initial_jobs} initial job(s) (total: {len(all_job_links)})") # Amazon uses pagination (not infinite scroll) await self._handle_pagination(page, search_keywords, seen_job_ids, all_job_links) print(f"✅ Collected {len(all_job_links)} unique job links.") scraped_count = 0 for idx, (href, title) in enumerate(all_job_links): try: full_url = href if href.startswith("http") else f"https://www.amazon.jobs{href}" print(f" → Opening job {idx+1}/{len(all_job_links)}: {full_url}") fetcher = StealthyFetcher(self.engine, browser, context) job_page = await fetcher.fetch_url(full_url, wait_for_selector="h1[data-testid='job-title']") if not job_page: print(f" ❌ Failed to fetch job page {full_url} after retries.") job_id = href.strip("/").split("/")[-1] if href else "unknown" await self._add_job_to_redis_cache(full_url, job_id, "fetch_failure") self.engine.report_outcome("fetch_failure", url=full_url) continue posted_date = await self._extract_job_posted_date(job_page) print(f" 📅 Posted date extracted: {posted_date}") apply_btn = await job_page.query_selector("a:has-text('Apply now'), button:has-text('Apply now')") final_url = full_url external_url = None page_content = None if apply_btn: apply_href = await apply_btn.get_attribute("href") if apply_href and apply_href.startswith("http"): print(" 🌐 Detected external apply URL — capturing directly.") external_url = apply_href final_url = external_url # We won't navigate; just pass Amazon 
                            page_content = await self._extract_page_content_for_llm(job_page)
                        else:
                            print("   → Clicking 'Apply now' (may open new tab)...")
                            page_waiter = asyncio.create_task(context.wait_for_event("page"))
                            await self._human_click(job_page, apply_btn, wait_after=False)

                            external_page = None
                            try:
                                external_page = await asyncio.wait_for(page_waiter, timeout=5.0)
                                print("   🌐 External job site opened in new tab.")
                                await external_page.wait_for_load_state("load", timeout=120000)
                                await asyncio.sleep(2 * self.human_speed)
                                await self.engine._human_like_scroll(external_page)
                                external_url = external_page.url
                                final_url = external_url
                                page_content = await self._extract_page_content_for_llm(external_page)
                                if not external_page.is_closed():
                                    await external_page.close()
                            except asyncio.TimeoutError:
                                print("   đŸ–Ĩī¸ No external tab — using Amazon job page.")
                                page_content = await self._extract_page_content_for_llm(job_page)
                    else:
                        print("   ⚠ī¸ No 'Apply now' button — scraping job page directly.")
                        page_content = await self._extract_page_content_for_llm(job_page)

                    job_id = href.strip("/").split("/")[-1] if href else "unknown"
                    raw_data = {
                        "page_content": page_content,
                        "url": final_url,
                        "job_id": job_id,
                        "search_keywords": search_keywords,
                        "posted_date": posted_date
                    }

                    refined_data = await self.llm_agent.refine_job_data(raw_data, self.user_request)

                    if refined_data and refined_data.get("title", "N/A") != "N/A":
                        compulsory_fields = ['company_name', 'job_id', 'url']
                        for field in compulsory_fields:
                            if not refined_data.get(field) or refined_data[field] in ["N/A", "", "Unknown"]:
                                if field == 'job_id':
                                    refined_data[field] = job_id
                                elif field == 'url':
                                    refined_data[field] = final_url
                                elif field == 'company_name':
                                    refined_data[field] = "Amazon"

                        refined_data['scraped_at'] = datetime.now().isoformat()
                        refined_data['category'] = clean_keywords
                        refined_data['posted_date'] = posted_date

                        await self.llm_agent.save_job_data(refined_data, search_keywords)
                        scraped_count += 1
                        print(f"   ✅ Scraped and refined: {refined_data['title'][:50]}...")
                        self.engine.report_outcome("success", url=raw_data["url"])
                    else:
                        print(f"   🟡 Could not extract meaningful data from: {final_url}")
                        await self._add_job_to_redis_cache(final_url, job_id, "llm_failure")
                        self.engine.report_outcome("llm_failure", url=raw_data["url"])

                    await job_page.close()

                except Exception as e:
                    error_msg = str(e)[:100]
                    print(f"   ⚠ī¸ Failed on job {idx+1}: {error_msg}")
                    job_id = (href.strip("/").split("/")[-1] if href else "unknown") if 'href' in locals() else "unknown"
                    job_url = full_url if 'full_url' in locals() else "unknown"
                    await self._add_job_to_redis_cache(job_url, job_id, f"exception: {error_msg}")
                    if 'job_page' in locals() and job_page:
                        await job_page.close()
                    continue
                finally:
                    print("   â†Šī¸ Returning to Amazon search results...")
                    await page.goto(search_url, timeout=120000)
                    await asyncio.sleep(4 * self.human_speed)

            await browser.close()

            if scraped_count > 0:
                self.engine.report_outcome("success")
                print(f"✅ Completed! Processed {scraped_count} jobs for '{search_keywords}' based on request '{self.user_request}'.")
            else:
                self.engine.report_outcome("captcha")
                print("⚠ī¸ No jobs processed successfully.")
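

# Minimal usage sketch, not part of the scraper itself. It assumes a hypothetical
# `ScraperEngine` class (the `engine` module and class name below are placeholders)
# that exposes the helpers referenced above: _select_profile, common_renderers,
# common_vendors, _get_spoof_script, _human_like_scroll, _handle_cloudflare and
# report_outcome, plus the protection checks used by StealthyFetcher. Adapt the
# import and constructor arguments to whatever your engine actually provides.
# The keyword string follows the "keywords, location: <city>" convention parsed
# in scrape_jobs.
if __name__ == "__main__":
    from engine import ScraperEngine  # hypothetical module/class name

    async def _demo():
        engine = ScraperEngine()
        scraper = AmazonJobScraper(
            engine,
            human_speed=1.5,  # values > 1.0 stretch all human-like delays
            user_request="Extract title, team, location and basic qualifications",
        )
        await scraper.scrape_jobs(
            search_keywords="software development engineer, location: Seattle",
            max_pages=5,
        )
        # Inspect any failures recorded by _add_job_to_redis_cache.
        failed = scraper.redis_client.hgetall("failed_jobs")
        print(f"Failed jobs cached in Redis: {len(failed)}")

    asyncio.run(_demo())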