diff --git a/amazon_job_scraper.py b/amazon_job_scraper.py
new file mode 100644
index 0000000..db2968f
--- /dev/null
+++ b/amazon_job_scraper.py
@@ -0,0 +1,235 @@
+
+import asyncio
+import random
+import re
+from typing import Optional, Dict
+from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
+from browserforge.injectors.playwright import AsyncNewContext
+from llm_agent import LLMJobRefiner
+from fetcher import StealthyFetcher
+from datetime import datetime
+
+
+class AmazonJobScraper:
+    def __init__(
+        self,
+        engine,
+        db_path: str = "amazon_jobs.db",
+        human_speed: float = 1.0,
+        user_request: str = "Extract all standard job details"
+    ):
+        self.engine = engine
+        self.db_path = db_path
+        self.human_speed = human_speed
+        self.user_request = user_request
+        self.llm_agent = LLMJobRefiner()
+
+    async def _human_click(self, page, element, wait_after: bool = True):
+        if not element:
+            return False
+        await element.scroll_into_view_if_needed()
+        await asyncio.sleep(random.uniform(0.3, 0.8) * self.human_speed)
+        try:
+            await element.click()
+            if wait_after:
+                await asyncio.sleep(random.uniform(1.0, 2.0) * self.human_speed)
+            return True
+        except Exception:
+            return False
+
+    def _extract_location_from_keywords(self, search_keywords: str) -> str:
+        location_match = re.search(r'location:\s*([^,]+)', search_keywords, re.IGNORECASE)
+        return location_match.group(1).strip() if location_match else ""
+
+    def _build_amazon_search_url(self, keywords: str) -> str:
+        clean_keywords = re.sub(r'location:\s*[^,]+', '', keywords, flags=re.IGNORECASE).strip()
+        location = self._extract_location_from_keywords(keywords)
+
+        base_url = "https://www.amazon.jobs/en/search?"
+        params = []
+
+        if clean_keywords:
+            params.append(f"base_query={clean_keywords.replace(' ', '+')}")
+        if location:
+            params.append(f"loc_query={location.replace(' ', '+')}")
+        params.append("offset=0")
+        params.append("result_limit=10")
+
+        return base_url + "&".join(params)
+
+    async def _extract_page_content_for_llm(self, page) -> str:
+        await asyncio.sleep(2 * self.human_speed)
+        await self.engine._human_like_scroll(page)
+        await asyncio.sleep(2 * self.human_speed)
+        return await page.content()
+
+    async def _scrape_job_links_from_page(self, page, seen_job_ids, all_job_links):
+        job_cards = await page.query_selector_all('div.job-tile a[href^="/en/jobs/"]')
+        new_jobs = 0
+        for card in job_cards:
+            href = await card.get_attribute("href")
+            if not href:
+                continue
+            full_url = f"https://www.amazon.jobs{href}" if href.startswith("/") else href
+            job_id = href.split("/")[-1] if href.split("/")[-1] else "unknown"
+
+            if job_id in seen_job_ids:
+                continue
+
+            title_element = await card.query_selector('h3')
+            title = await title_element.inner_text() if title_element else "Unknown Title"
+
+            seen_job_ids.add(job_id)
+            all_job_links.append((full_url, title))
+            new_jobs += 1
+
+        return new_jobs
+
+    async def _scroll_and_collect_jobs(self, page, seen_job_ids, all_job_links, max_pages=5):
+        offset = 0
+        jobs_per_page = 10
+        for page_num in range(max_pages):
+            print(f"šŸ“„ Fetching Amazon job page {page_num + 1} (offset: {offset})")
+            current_url = page.url
+            if "offset=" in current_url:
+                base_url = current_url.split("offset=")[0]
+                new_url = base_url + f"offset={offset}&result_limit={jobs_per_page}"
+            else:
+                new_url = current_url + f"&offset={offset}&result_limit={jobs_per_page}"
+
+            await page.goto(new_url, wait_until='domcontentloaded', timeout=120000)
+            await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)
+
+            new_jobs = await self._scrape_job_links_from_page(page, seen_job_ids, all_job_links)
+            print(f"   āž• Found {new_jobs} new job(s) on page {page_num + 1} (total: {len(all_job_links)})")
+
+            if new_jobs == 0 and page_num > 0:
+                print("šŸ”š No new jobs — stopping pagination.")
+                break
+
+            offset += jobs_per_page
+
+    async def scrape_jobs(
+        self,
+        search_keywords: Optional[str],
+        max_pages: int = 5,
+        credentials: Optional[Dict] = None  # Not used for Amazon
+    ):
+        search_url = self._build_amazon_search_url(search_keywords)
+        print(f"šŸ” Amazon search URL: {search_url}")
+
+        profile = self.engine._select_profile()
+        renderer = random.choice(self.engine.common_renderers[self.engine.os])
+        vendor = random.choice(self.engine.common_vendors)
+        spoof_script = self.engine._get_spoof_script(renderer, vendor)
+
+        async with async_playwright() as pw:
+            browser = await pw.chromium.launch(
+                headless=False,
+                args=['--disable-blink-features=AutomationControlled']
+            )
+            context = await AsyncNewContext(browser, fingerprint=profile)
+
+            await context.add_init_script(f"""
+                Object.defineProperty(navigator, 'hardwareConcurrency', {{ get: () => {profile.navigator.hardwareConcurrency} }});
+                Object.defineProperty(navigator, 'deviceMemory', {{ get: () => {profile.navigator.deviceMemory} }});
+                Object.defineProperty(navigator, 'platform', {{ get: () => '{profile.navigator.platform}' }});
+            """)
+            await context.add_init_script(spoof_script)
+
+            page = await context.new_page()
+            temp_fetcher = StealthyFetcher(self.engine, browser, context)
+
+            # Amazon doesn't require login
+            print("🌐 Navigating to Amazon Jobs (no login required)...")
+            await page.goto(search_url, wait_until='domcontentloaded', timeout=120000)
+            await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)
+
+            # Protection check
+            protection_type = await temp_fetcher._detect_protection(page)
+            if protection_type:
+                print(f"šŸ›”ļø Protection detected: {protection_type}")
+                content_accessible = await temp_fetcher._is_content_accessible(page)
+                if not content_accessible:
+                    handled = await self.engine._handle_cloudflare(page) if protection_type == "cloudflare" else False
+                    if not handled:
+                        await browser.close()
+                        self.engine.report_outcome("protection_block")
+                        return
+                else:
+                    print("āœ… Protection present but content accessible.")
+
+            all_job_links = []
+            seen_job_ids = set()
+
+            print("šŸ”„ Collecting job links via pagination...")
+            await self._scroll_and_collect_jobs(page, seen_job_ids, all_job_links, max_pages=max_pages)
+
+            print(f"āœ… Collected {len(all_job_links)} unique Amazon job links.")
+
+            scraped_count = 0
+            for idx, (job_url, title) in enumerate(all_job_links):
+                try:
+                    print(f"  → Opening job {idx+1}/{len(all_job_links)}: {job_url}")
+                    fetcher = StealthyFetcher(self.engine, browser, context)
+                    job_page = await fetcher.fetch_url(job_url, wait_for_selector="h1.job-title")
+
+                    if not job_page:
+                        print(f"  āŒ Failed to fetch job page: {job_url}")
+                        self.engine.report_outcome("fetch_failure", url=job_url)
+                        continue
+
+                    # Extract raw HTML for LLM
+                    await self.engine._human_like_scroll(job_page)
+                    await asyncio.sleep(2 * self.human_speed)
+                    page_content = await self._extract_page_content_for_llm(job_page)
+
+                    job_id = job_url.split("/")[-1] if job_url.split("/")[-1] else "unknown"
+
+                    raw_data = {
+                        "page_content": page_content,
+                        "url": job_url,
+                        "job_id": job_id,
+                        "search_keywords": search_keywords
+                    }
+
+                    refined_data = await self.llm_agent.refine_job_data(raw_data, self.user_request)
+
+                    if refined_data and refined_data.get("title", "N/A") != "N/A":
refined_data.get("title", "N/A") != "N/A": + # Ensure compulsory fields + compulsory_fields = ['company_name', 'job_id', 'url'] + for field in compulsory_fields: + if not refined_data.get(field) or refined_data[field] in ["N/A", "", "Unknown"]: + if field == 'job_id': + refined_data[field] = job_id + elif field == 'url': + refined_data[field] = job_url + elif field == 'company_name': + refined_data[field] = "Amazon" + + refined_data['scraped_at'] = datetime.now().isoformat() + refined_data['category'] = re.sub(r'location:\s*[^,]+', '', search_keywords, flags=re.IGNORECASE).strip() + await self.llm_agent.save_job_data(refined_data, search_keywords) + scraped_count += 1 + print(f" āœ… Scraped and refined: {refined_data['title'][:50]}...") + self.engine.report_outcome("success", url=job_url) + else: + print(f" 🟔 LLM could not extract valid data from: {job_url}") + self.engine.report_outcome("llm_failure", url=job_url) + + await job_page.close() + + except Exception as e: + print(f" āš ļø Failed on job {idx+1}: {str(e)[:100]}") + if 'job_page' in locals() and job_page: + await job_page.close() + continue + + await browser.close() + + if scraped_count > 0: + self.engine.report_outcome("success") + print(f"āœ… Completed! Processed {scraped_count} Amazon jobs for '{search_keywords}'.") + else: + self.engine.report_outcome("no_jobs") + print("āš ļø No Amazon jobs processed successfully.") diff --git a/amazon_main.py b/amazon_main.py new file mode 100644 index 0000000..b6f7f07 --- /dev/null +++ b/amazon_main.py @@ -0,0 +1,61 @@ + +from scraping_engine import FingerprintScrapingEngine +from job_scraper2 import AmazonJobScraper # Updated class name +import os +from dotenv import load_dotenv +import asyncio +import random +import time + +load_dotenv() + +async def main(): + engine = FingerprintScrapingEngine( + seed="amazon_job_scraping_12", + target_os="windows", + db_path="amazon_jobs.db", + markdown_path="amazon_jobs.md" + ) + + scraper = AmazonJobScraper( + engine, + human_speed=1.4, + user_request="Extract title, company, location, description, basic qualifications, preferred qualifications, job ID, and job type (full-time, part-time, etc.)" + ) + + job_titles = [ + "Software Development Engineer", + "Data Scientist", + "Product Manager", + "UX Designer", + "Solutions Architect", + "Machine Learning Engineer", + "Frontend Engineer", + "Backend Engineer", + "Full Stack Engineer", + "Data Engineer" + ] + + fixed_location = "United States" # Amazon uses country/region, not city + + while True: + random.shuffle(job_titles) + for job_title in job_titles: + search_keywords = f"{job_title} location:{fixed_location}" + print(f"\n{'='*60}") + print(f"Starting Amazon scrape for: {search_keywords}") + print(f"{'='*60}") + + await scraper.scrape_jobs( + search_keywords=search_keywords, + max_pages=3 # Amazon loads 10 per page; 3 pages = ~30 jobs + ) + + print(f"\nāœ… Completed scraping for: {job_title}") + print(f"ā³ Waiting 90 seconds before next job title...") + time.sleep(90) + + print(f"\nāœ… Completed full cycle. Restarting...") + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file