To adapt your existing **LinkedIn job scraper** to scrape **Amazon job listings** instead, you need to **replace the LinkedIn-specific logic** with Amazon-specific logic, because:

- Amazon jobs are hosted on a **different domain**: `https://www.amazon.jobs`
- The **DOM structure**, job URLs, pagination, selectors, and flow are **entirely different**
- **No login is required** — Amazon job listings are public
- Amazon paginates results with API-style `offset`/`result_limit` URL parameters, not a traditional “Next” button
- Job detail pages are **self-contained** — no external apply redirects like LinkedIn

Below is the **fully modified `job_scraper2.py`** (renamed internally to `AmazonJobScraper`) that scrapes Amazon jobs using the same engine architecture, adapted for Amazon’s site.

> ✅ **You can optionally rename the file to `amazon_job_scraper.py`**; if you do, update the import in `amazon_main.py` accordingly.

---

### ✅ Modified `job_scraper2.py` → **Amazon Job Scraper**

```python
# job_scraper2.py (now for Amazon)
import asyncio
import random
import re
from typing import Optional, Dict
from urllib.parse import quote_plus
from playwright.async_api import async_playwright
from browserforge.injectors.playwright import AsyncNewContext
from llm_agent import LLMJobRefiner
from fetcher import StealthyFetcher
from datetime import datetime


class AmazonJobScraper:
    def __init__(
        self,
        engine,
        db_path: str = "amazon_jobs.db",
        human_speed: float = 1.0,
        user_request: str = "Extract all standard job details"
    ):
        self.engine = engine
        self.db_path = db_path
        self.human_speed = human_speed
        self.user_request = user_request
        self.llm_agent = LLMJobRefiner()

    async def _human_click(self, page, element, wait_after: bool = True):
        if not element:
            return False
        await element.scroll_into_view_if_needed()
        await asyncio.sleep(random.uniform(0.3, 0.8) * self.human_speed)
        try:
            await element.click()
            if wait_after:
                await asyncio.sleep(random.uniform(1.0, 2.0) * self.human_speed)
            return True
        except Exception:
            return False

    def _extract_location_from_keywords(self, search_keywords: str) -> str:
        location_match = re.search(r'location:\s*([^,]+)', search_keywords, re.IGNORECASE)
        return location_match.group(1).strip() if location_match else ""

    def _build_amazon_search_url(self, keywords: str) -> str:
        clean_keywords = re.sub(r'location:\s*[^,]+', '', keywords, flags=re.IGNORECASE).strip()
        location = self._extract_location_from_keywords(keywords)

        base_url = "https://www.amazon.jobs/en/search?"
        params = []

        if clean_keywords:
            params.append(f"base_query={quote_plus(clean_keywords)}")
        if location:
            params.append(f"loc_query={quote_plus(location)}")
        params.append("offset=0")
        params.append("result_limit=10")

        return base_url + "&".join(params)
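    # Example (illustrative, not verified against the live site): for
    # search_keywords = "Data Engineer location:United States" this builds
    # https://www.amazon.jobs/en/search?base_query=Data+Engineer&loc_query=United+States&offset=0&result_limit=10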
    async def _extract_page_content_for_llm(self, page) -> str:
        await asyncio.sleep(2 * self.human_speed)
        await self.engine._human_like_scroll(page)
        await asyncio.sleep(2 * self.human_speed)
        return await page.content()

    async def _scrape_job_links_from_page(self, page, seen_job_ids, all_job_links):
        job_cards = await page.query_selector_all('div.job-tile a[href^="/en/jobs/"]')
        new_jobs = 0
        for card in job_cards:
            href = await card.get_attribute("href")
            if not href:
                continue
            full_url = f"https://www.amazon.jobs{href}" if href.startswith("/") else href
            job_id = href.split("/")[-1] or "unknown"

            if job_id in seen_job_ids:
                continue

            title_element = await card.query_selector('h3')
            title = await title_element.inner_text() if title_element else "Unknown Title"

            seen_job_ids.add(job_id)
            all_job_links.append((full_url, title))
            new_jobs += 1

        return new_jobs

    async def _scroll_and_collect_jobs(self, page, seen_job_ids, all_job_links, max_pages=5):
        offset = 0
        jobs_per_page = 10
        for page_num in range(max_pages):
            print(f"📄 Fetching Amazon job page {page_num + 1} (offset: {offset})")
            current_url = page.url
            if "offset=" in current_url:
                base_url = current_url.split("offset=")[0]
                new_url = base_url + f"offset={offset}&result_limit={jobs_per_page}"
            else:
                new_url = current_url + f"&offset={offset}&result_limit={jobs_per_page}"

            await page.goto(new_url, wait_until='domcontentloaded', timeout=120000)
            await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)

            new_jobs = await self._scrape_job_links_from_page(page, seen_job_ids, all_job_links)
            print(f"  ➕ Found {new_jobs} new job(s) on page {page_num + 1} (total: {len(all_job_links)})")

            if new_jobs == 0 and page_num > 0:
                print("🔚 No new jobs — stopping pagination.")
                break

            offset += jobs_per_page
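    # Pagination sketch: each request advances the offset by result_limit (10),
    # so max_pages=3 fetches offsets 0, 10 and 20 (roughly 30 postings per
    # search term). This assumes Amazon keeps honoring these query parameters.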
    async def scrape_jobs(
        self,
        search_keywords: Optional[str],
        max_pages: int = 5,
        credentials: Optional[Dict] = None  # Not used for Amazon
    ):
        search_url = self._build_amazon_search_url(search_keywords)
        print(f"🔍 Amazon search URL: {search_url}")

        profile = self.engine._select_profile()
        renderer = random.choice(self.engine.common_renderers[self.engine.os])
        vendor = random.choice(self.engine.common_vendors)
        spoof_script = self.engine._get_spoof_script(renderer, vendor)

        async with async_playwright() as pw:
            browser = await pw.chromium.launch(
                headless=False,
                args=['--disable-blink-features=AutomationControlled']
            )
            context = await AsyncNewContext(browser, fingerprint=profile)

            await context.add_init_script(f"""
                Object.defineProperty(navigator, 'hardwareConcurrency', {{ get: () => {profile.navigator.hardwareConcurrency} }});
                Object.defineProperty(navigator, 'deviceMemory', {{ get: () => {profile.navigator.deviceMemory} }});
                Object.defineProperty(navigator, 'platform', {{ get: () => '{profile.navigator.platform}' }});
            """)
            await context.add_init_script(spoof_script)

            page = await context.new_page()
            temp_fetcher = StealthyFetcher(self.engine, browser, context)

            # Amazon doesn't require login
            print("🌐 Navigating to Amazon Jobs (no login required)...")
            await page.goto(search_url, wait_until='domcontentloaded', timeout=120000)
            await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)

            # Protection check
            protection_type = await temp_fetcher._detect_protection(page)
            if protection_type:
                print(f"🛡️ Protection detected: {protection_type}")
                content_accessible = await temp_fetcher._is_content_accessible(page)
                if not content_accessible:
                    handled = await self.engine._handle_cloudflare(page) if protection_type == "cloudflare" else False
                    if not handled:
                        await browser.close()
                        self.engine.report_outcome("protection_block")
                        return
                else:
                    print("✅ Protection present but content accessible.")

            all_job_links = []
            seen_job_ids = set()

            print("🔄 Collecting job links via pagination...")
            await self._scroll_and_collect_jobs(page, seen_job_ids, all_job_links, max_pages=max_pages)

            print(f"✅ Collected {len(all_job_links)} unique Amazon job links.")

            scraped_count = 0
            for idx, (job_url, title) in enumerate(all_job_links):
                job_page = None
                try:
                    print(f"  → Opening job {idx+1}/{len(all_job_links)}: {job_url}")
                    fetcher = StealthyFetcher(self.engine, browser, context)
                    job_page = await fetcher.fetch_url(job_url, wait_for_selector="h1.job-title")

                    if not job_page:
                        print(f"  ❌ Failed to fetch job page: {job_url}")
                        self.engine.report_outcome("fetch_failure", url=job_url)
                        continue

                    # Extract raw HTML for the LLM
                    await self.engine._human_like_scroll(job_page)
                    await asyncio.sleep(2 * self.human_speed)
                    page_content = await self._extract_page_content_for_llm(job_page)

                    job_id = job_url.split("/")[-1] or "unknown"

                    raw_data = {
                        "page_content": page_content,
                        "url": job_url,
                        "job_id": job_id,
                        "search_keywords": search_keywords
                    }

                    refined_data = await self.llm_agent.refine_job_data(raw_data, self.user_request)

                    if refined_data and refined_data.get("title", "N/A") != "N/A":
                        # Ensure compulsory fields
                        compulsory_fields = ['company_name', 'job_id', 'url']
                        for field in compulsory_fields:
                            if not refined_data.get(field) or refined_data[field] in ["N/A", "", "Unknown"]:
                                if field == 'job_id':
                                    refined_data[field] = job_id
                                elif field == 'url':
                                    refined_data[field] = job_url
                                elif field == 'company_name':
                                    refined_data[field] = "Amazon"

                        refined_data['scraped_at'] = datetime.now().isoformat()
                        refined_data['category'] = re.sub(r'location:\s*[^,]+', '', search_keywords, flags=re.IGNORECASE).strip()
                        await self.llm_agent.save_job_data(refined_data, search_keywords)
                        scraped_count += 1
                        print(f"  ✅ Scraped and refined: {refined_data['title'][:50]}...")
                        self.engine.report_outcome("success", url=job_url)
                    else:
                        print(f"  🟡 LLM could not extract valid data from: {job_url}")
                        self.engine.report_outcome("llm_failure", url=job_url)

                    await job_page.close()

                except Exception as e:
                    print(f"  ⚠️ Failed on job {idx+1}: {str(e)[:100]}")
                    if job_page:
                        await job_page.close()
                    continue

            await browser.close()

            if scraped_count > 0:
                self.engine.report_outcome("success")
                print(f"✅ Completed! Processed {scraped_count} Amazon jobs for '{search_keywords}'.")
            else:
                self.engine.report_outcome("no_jobs")
                print("⚠️ No Amazon jobs processed successfully.")
```
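Before wiring this into the full engine, it may be worth spot-checking the Amazon selectors on their own, since `div.job-tile a[href^="/en/jobs/"]` and `h1.job-title` are assumptions that can break whenever Amazon updates its frontend. A minimal, standalone Playwright sketch (no fingerprinting, no LLM; the file name `check_selectors.py` and the hard-coded search URL are just for illustration, using the same URL format the scraper builds) could look like this:

```python
# check_selectors.py: quick sanity check for the assumed Amazon selectors
import asyncio
from playwright.async_api import async_playwright

SEARCH_URL = "https://www.amazon.jobs/en/search?base_query=Data+Engineer&offset=0&result_limit=10"

async def main():
    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(SEARCH_URL, wait_until="domcontentloaded", timeout=60000)
        # Count job links matching the selector used by AmazonJobScraper
        cards = await page.query_selector_all('div.job-tile a[href^="/en/jobs/"]')
        print(f"Found {len(cards)} job links on the first results page")
        for card in cards[:3]:
            print(await card.get_attribute("href"))
        await browser.close()

if __name__ == "__main__":
    asyncio.run(main())
```

If this prints zero links, inspect the live page and adjust the selectors in `_scrape_job_links_from_page` before running the full scraper.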
---

### ✅ Modified `linkedin_main.py` → **amazon_main.py**

```python
# amazon_main.py
from scraping_engine import FingerprintScrapingEngine
from job_scraper2 import AmazonJobScraper  # Updated class name
from dotenv import load_dotenv
import asyncio
import random

load_dotenv()

async def main():
    engine = FingerprintScrapingEngine(
        seed="amazon_job_scraping_12",
        target_os="windows",
        db_path="amazon_jobs.db",
        markdown_path="amazon_jobs.md"
    )

    scraper = AmazonJobScraper(
        engine,
        human_speed=1.4,
        user_request="Extract title, company, location, description, basic qualifications, preferred qualifications, job ID, and job type (full-time, part-time, etc.)"
    )

    job_titles = [
        "Software Development Engineer",
        "Data Scientist",
        "Product Manager",
        "UX Designer",
        "Solutions Architect",
        "Machine Learning Engineer",
        "Frontend Engineer",
        "Backend Engineer",
        "Full Stack Engineer",
        "Data Engineer"
    ]

    fixed_location = "United States"  # Amazon uses country/region, not city

    while True:
        random.shuffle(job_titles)
        for job_title in job_titles:
            search_keywords = f"{job_title} location:{fixed_location}"
            print(f"\n{'='*60}")
            print(f"Starting Amazon scrape for: {search_keywords}")
            print(f"{'='*60}")

            await scraper.scrape_jobs(
                search_keywords=search_keywords,
                max_pages=3  # Amazon loads 10 per page; 3 pages = ~30 jobs
            )

            print(f"\n✅ Completed scraping for: {job_title}")
            print("⏳ Waiting 90 seconds before next job title...")
            await asyncio.sleep(90)  # non-blocking pause, unlike time.sleep()

        print("\n✅ Completed full cycle. Restarting...")

if __name__ == "__main__":
    asyncio.run(main())
```

---

### 🔑 Key Changes Summary

| Feature | LinkedIn | Amazon |
|-------|--------|--------|
| **Login** | Required | ❌ Not needed |
| **Job URL** | `/jobs/view/123` | `/en/jobs/123` |
| **Pagination** | “Next” button or infinite scroll | API-style `offset=0&result_limit=10` |
| **Apply Flow** | Modal or external redirect | All details on-page |
| **Location** | City/state (e.g., "New York") | Country/region (e.g., "United States") |
| **Selectors** | Complex job cards | Simple `div.job-tile a` |

---

### ✅ Next Steps

1. **File names**:
   - `job_scraper2.py` → keep the name (it now contains `AmazonJobScraper`), or rename it to `amazon_job_scraper.py` and update the import in `amazon_main.py`
   - `linkedin_main.py` → `amazon_main.py`

2. **Update `.env`**: LinkedIn credentials are no longer needed, so you can remove them.

3. **Test** with a single job title first before running the full loop.

Let me know if you also want to adjust the `LLMJobRefiner` prompt for Amazon’s job description format!
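If you do want to steer `LLMJobRefiner` toward Amazon's layout without touching the class itself, one low-effort option is to pack Amazon's usual section names into the `user_request` string that `AmazonJobScraper` already forwards to the refiner. A sketch follows; the output field names (`basic_qualifications`, `preferred_qualifications`, etc.) are assumptions about what your downstream schema expects, and `amazon_prompt.py` is just an illustrative file name:

```python
# amazon_prompt.py: an Amazon-specific extraction request for LLMJobRefiner,
# passed through AmazonJobScraper's existing user_request parameter.
AMAZON_USER_REQUEST = (
    "Extract title, company_name, location, job_id, job type, and description. "
    "Map the 'Basic Qualifications' section to basic_qualifications and the "
    "'Preferred Qualifications' section to preferred_qualifications. "
    "If a field is missing from the page, return 'N/A' for it."
)

# In amazon_main.py, pass it instead of the generic request:
# scraper = AmazonJobScraper(engine, human_speed=1.4, user_request=AMAZON_USER_REQUEST)
```

This keeps the prompt tuning in one place and avoids changing `refine_job_data` itself.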