Delete tr.py
This commit is contained in:
parent 370fce0514
commit 9ed5641540
tr.py (348 lines deleted)
@@ -1,348 +0,0 @@
To adapt your existing **LinkedIn job scraper** to scrape **Amazon job listings** instead, you need to **completely replace the LinkedIn-specific logic** with Amazon-specific logic, because:

- Amazon jobs are hosted on a **different domain**: `https://www.amazon.jobs`
- The **DOM structure**, job URLs, pagination, selectors, and flow are **entirely different**
- **No login is required** — Amazon job listings are public
- Amazon uses **infinite scroll with API-style pagination** (offset query parameters, illustrated below), not traditional “Next” buttons
- Job detail pages are **self-contained** — there are no external apply redirects as on LinkedIn
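
For context, here is a rough sketch of what the paginated search URLs look like. The parameter names (`base_query`, `loc_query`, `offset`, `result_limit`) are the ones used by the scraper below; treat the exact values as an assumption, not an official Amazon.jobs API reference.

```python
# Hypothetical illustration of Amazon's offset-based pagination.
# Parameter names mirror the scraper code below; they are an assumption,
# not an official Amazon.jobs API reference.
base = "https://www.amazon.jobs/en/search?base_query=Data+Scientist&loc_query=United+States"

# Each "page" is fetched by advancing `offset` in steps of `result_limit`.
for offset in (0, 10, 20):
    print(f"{base}&offset={offset}&result_limit=10")
```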

Below is the **fully modified `job_scraper2.py`** (renamed internally to `AmazonJobScraper`) that scrapes Amazon jobs using the same engine architecture, but adapted for Amazon’s site.

> ✅ **You should rename the file to `amazon_job_scraper.py`** and update `amazon_main.py` accordingly.

---

### ✅ Modified `job_scraper2.py` → **Amazon Job Scraper**
```python
# job_scraper2.py (now for Amazon)
import asyncio
import random
import re
from typing import Optional, Dict
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
from browserforge.injectors.playwright import AsyncNewContext
from llm_agent import LLMJobRefiner
from fetcher import StealthyFetcher
from datetime import datetime


class AmazonJobScraper:
    def __init__(
        self,
        engine,
        db_path: str = "amazon_jobs.db",
        human_speed: float = 1.0,
        user_request: str = "Extract all standard job details"
    ):
        self.engine = engine
        self.db_path = db_path
        self.human_speed = human_speed
        self.user_request = user_request
        self.llm_agent = LLMJobRefiner()

    async def _human_click(self, page, element, wait_after: bool = True):
        if not element:
            return False
        await element.scroll_into_view_if_needed()
        await asyncio.sleep(random.uniform(0.3, 0.8) * self.human_speed)
        try:
            await element.click()
            if wait_after:
                await asyncio.sleep(random.uniform(1.0, 2.0) * self.human_speed)
            return True
        except Exception:
            return False

    def _extract_location_from_keywords(self, search_keywords: str) -> str:
        location_match = re.search(r'location:\s*([^,]+)', search_keywords, re.IGNORECASE)
        return location_match.group(1).strip() if location_match else ""

    def _build_amazon_search_url(self, keywords: str) -> str:
        clean_keywords = re.sub(r'location:\s*[^,]+', '', keywords, flags=re.IGNORECASE).strip()
        location = self._extract_location_from_keywords(keywords)

        base_url = "https://www.amazon.jobs/en/search?"
        params = []

        if clean_keywords:
            params.append(f"base_query={clean_keywords.replace(' ', '+')}")
        if location:
            params.append(f"loc_query={location.replace(' ', '+')}")
        params.append("offset=0")
        params.append("result_limit=10")

        return base_url + "&".join(params)

    async def _extract_page_content_for_llm(self, page) -> str:
        await asyncio.sleep(2 * self.human_speed)
        await self.engine._human_like_scroll(page)
        await asyncio.sleep(2 * self.human_speed)
        return await page.content()

    async def _scrape_job_links_from_page(self, page, seen_job_ids, all_job_links):
        job_cards = await page.query_selector_all('div.job-tile a[href^="/en/jobs/"]')
        new_jobs = 0
        for card in job_cards:
            href = await card.get_attribute("href")
            if not href:
                continue
            full_url = f"https://www.amazon.jobs{href}" if href.startswith("/") else href
            job_id = href.split("/")[-1] if href.split("/")[-1] else "unknown"

            if job_id in seen_job_ids:
                continue

            title_element = await card.query_selector('h3')
            title = await title_element.inner_text() if title_element else "Unknown Title"

            seen_job_ids.add(job_id)
            all_job_links.append((full_url, title))
            new_jobs += 1

        return new_jobs

    async def _scroll_and_collect_jobs(self, page, seen_job_ids, all_job_links, max_pages=5):
        offset = 0
        jobs_per_page = 10
        for page_num in range(max_pages):
            print(f"📄 Fetching Amazon job page {page_num + 1} (offset: {offset})")
            current_url = page.url
            if "offset=" in current_url:
                base_url = current_url.split("offset=")[0]
                new_url = base_url + f"offset={offset}&result_limit={jobs_per_page}"
            else:
                new_url = current_url + f"&offset={offset}&result_limit={jobs_per_page}"

            await page.goto(new_url, wait_until='domcontentloaded', timeout=120000)
            await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)

            new_jobs = await self._scrape_job_links_from_page(page, seen_job_ids, all_job_links)
            print(f" ➕ Found {new_jobs} new job(s) on page {page_num + 1} (total: {len(all_job_links)})")

            if new_jobs == 0 and page_num > 0:
                print("🔚 No new jobs — stopping pagination.")
                break

            offset += jobs_per_page

    async def scrape_jobs(
        self,
        search_keywords: Optional[str],
        max_pages: int = 5,
        credentials: Optional[Dict] = None  # Not used for Amazon
    ):
        search_url = self._build_amazon_search_url(search_keywords)
        print(f"🔍 Amazon search URL: {search_url}")

        profile = self.engine._select_profile()
        renderer = random.choice(self.engine.common_renderers[self.engine.os])
        vendor = random.choice(self.engine.common_vendors)
        spoof_script = self.engine._get_spoof_script(renderer, vendor)

        async with async_playwright() as pw:
            browser = await pw.chromium.launch(
                headless=False,
                args=['--disable-blink-features=AutomationControlled']
            )
            context = await AsyncNewContext(browser, fingerprint=profile)

            await context.add_init_script(f"""
                Object.defineProperty(navigator, 'hardwareConcurrency', {{ get: () => {profile.navigator.hardwareConcurrency} }});
                Object.defineProperty(navigator, 'deviceMemory', {{ get: () => {profile.navigator.deviceMemory} }});
                Object.defineProperty(navigator, 'platform', {{ get: () => '{profile.navigator.platform}' }});
            """)
            await context.add_init_script(spoof_script)

            page = await context.new_page()
            temp_fetcher = StealthyFetcher(self.engine, browser, context)

            # Amazon doesn't require login
            print("🌐 Navigating to Amazon Jobs (no login required)...")
            await page.goto(search_url, wait_until='domcontentloaded', timeout=120000)
            await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)

            # Protection check
            protection_type = await temp_fetcher._detect_protection(page)
            if protection_type:
                print(f"🛡️ Protection detected: {protection_type}")
                content_accessible = await temp_fetcher._is_content_accessible(page)
                if not content_accessible:
                    handled = await self.engine._handle_cloudflare(page) if protection_type == "cloudflare" else False
                    if not handled:
                        await browser.close()
                        self.engine.report_outcome("protection_block")
                        return
                else:
                    print("✅ Protection present but content accessible.")

            all_job_links = []
            seen_job_ids = set()

            print("🔄 Collecting job links via pagination...")
            await self._scroll_and_collect_jobs(page, seen_job_ids, all_job_links, max_pages=max_pages)

            print(f"✅ Collected {len(all_job_links)} unique Amazon job links.")

            scraped_count = 0
            for idx, (job_url, title) in enumerate(all_job_links):
                try:
                    print(f" → Opening job {idx+1}/{len(all_job_links)}: {job_url}")
                    fetcher = StealthyFetcher(self.engine, browser, context)
                    job_page = await fetcher.fetch_url(job_url, wait_for_selector="h1.job-title")

                    if not job_page:
                        print(f" ❌ Failed to fetch job page: {job_url}")
                        self.engine.report_outcome("fetch_failure", url=job_url)
                        continue

                    # Extract raw HTML for LLM
                    await self.engine._human_like_scroll(job_page)
                    await asyncio.sleep(2 * self.human_speed)
                    page_content = await self._extract_page_content_for_llm(job_page)

                    job_id = job_url.split("/")[-1] if job_url.split("/")[-1] else "unknown"

                    raw_data = {
                        "page_content": page_content,
                        "url": job_url,
                        "job_id": job_id,
                        "search_keywords": search_keywords
                    }

                    refined_data = await self.llm_agent.refine_job_data(raw_data, self.user_request)

                    if refined_data and refined_data.get("title", "N/A") != "N/A":
                        # Ensure compulsory fields
                        compulsory_fields = ['company_name', 'job_id', 'url']
                        for field in compulsory_fields:
                            if not refined_data.get(field) or refined_data[field] in ["N/A", "", "Unknown"]:
                                if field == 'job_id':
                                    refined_data[field] = job_id
                                elif field == 'url':
                                    refined_data[field] = job_url
                                elif field == 'company_name':
                                    refined_data[field] = "Amazon"

                        refined_data['scraped_at'] = datetime.now().isoformat()
                        refined_data['category'] = re.sub(r'location:\s*[^,]+', '', search_keywords, flags=re.IGNORECASE).strip()
                        await self.llm_agent.save_job_data(refined_data, search_keywords)
                        scraped_count += 1
                        print(f" ✅ Scraped and refined: {refined_data['title'][:50]}...")
                        self.engine.report_outcome("success", url=job_url)
                    else:
                        print(f" 🟡 LLM could not extract valid data from: {job_url}")
                        self.engine.report_outcome("llm_failure", url=job_url)

                    await job_page.close()

                except Exception as e:
                    print(f" ⚠️ Failed on job {idx+1}: {str(e)[:100]}")
                    if 'job_page' in locals() and job_page:
                        await job_page.close()
                    continue

            await browser.close()

            if scraped_count > 0:
                self.engine.report_outcome("success")
                print(f"✅ Completed! Processed {scraped_count} Amazon jobs for '{search_keywords}'.")
            else:
                self.engine.report_outcome("no_jobs")
                print("⚠️ No Amazon jobs processed successfully.")
```

---

### ✅ Modified `linkedin_main.py` → **amazon_main.py**
```python
# amazon_main.py
from scraping_engine import FingerprintScrapingEngine
from job_scraper2 import AmazonJobScraper  # Updated class name
import os
from dotenv import load_dotenv
import asyncio
import random

load_dotenv()


async def main():
    engine = FingerprintScrapingEngine(
        seed="amazon_job_scraping_12",
        target_os="windows",
        db_path="amazon_jobs.db",
        markdown_path="amazon_jobs.md"
    )

    scraper = AmazonJobScraper(
        engine,
        human_speed=1.4,
        user_request="Extract title, company, location, description, basic qualifications, preferred qualifications, job ID, and job type (full-time, part-time, etc.)"
    )

    job_titles = [
        "Software Development Engineer",
        "Data Scientist",
        "Product Manager",
        "UX Designer",
        "Solutions Architect",
        "Machine Learning Engineer",
        "Frontend Engineer",
        "Backend Engineer",
        "Full Stack Engineer",
        "Data Engineer"
    ]

    fixed_location = "United States"  # Amazon uses country/region, not city

    while True:
        random.shuffle(job_titles)
        for job_title in job_titles:
            search_keywords = f"{job_title} location:{fixed_location}"
            print(f"\n{'='*60}")
            print(f"Starting Amazon scrape for: {search_keywords}")
            print(f"{'='*60}")

            await scraper.scrape_jobs(
                search_keywords=search_keywords,
                max_pages=3  # Amazon loads 10 per page; 3 pages = ~30 jobs
            )

            print(f"\n✅ Completed scraping for: {job_title}")
            print(f"⏳ Waiting 90 seconds before next job title...")
            # Sleep asynchronously so the wait doesn't block the event loop
            await asyncio.sleep(90)

        print(f"\n✅ Completed full cycle. Restarting...")


if __name__ == "__main__":
    asyncio.run(main())
```

---

### 🔑 Key Changes Summary

| Feature | LinkedIn | Amazon |
|---------|----------|--------|
| **Login** | Required | ❌ Not needed |
| **Job URL** | `/jobs/view/123` | `/en/jobs/123` |
| **Pagination** | “Next” button or infinite scroll | API-style `offset=0&result_limit=10` |
| **Apply Flow** | Modal or external redirect | All details on-page |
| **Location** | City/state (e.g., "New York") | Country/region (e.g., "United States") |
| **Selectors** | Complex job cards | Simple `div.job-tile a` |
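
As a quick illustration of the job URL row: the job ID is just the last path segment of the Amazon URL, which is exactly how `_scrape_job_links_from_page` derives it above. The URL below is a made-up example, not a real posting.

```python
# Hypothetical Amazon job URL (the numeric ID is invented for illustration).
job_url = "https://www.amazon.jobs/en/jobs/1234567"

# Same derivation the scraper uses: the job ID is the last path segment.
job_id = job_url.split("/")[-1] if job_url.split("/")[-1] else "unknown"
print(job_id)  # -> "1234567"
```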

---

### ✅ Next Steps

1. **Rename files**:
   - `job_scraper2.py` → keep the name, but it now contains `AmazonJobScraper`
   - `linkedin_main.py` → `amazon_main.py`

2. **Update `.env`** — LinkedIn credentials are no longer needed, so you can remove them.

3. **Test** with a single job title before running the full loop (a minimal sketch follows below).
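
For that first test, something like the following minimal sketch could work. It reuses the constructor arguments shown in `amazon_main.py` above; the seed value, file name, and the single search string are arbitrary assumptions.

```python
# smoke_test.py: a minimal single-run sketch, assuming the engine and scraper
# constructors shown above; the seed and search string are arbitrary choices.
import asyncio

from scraping_engine import FingerprintScrapingEngine
from job_scraper2 import AmazonJobScraper


async def smoke_test():
    engine = FingerprintScrapingEngine(
        seed="amazon_job_scraping_test",
        target_os="windows",
        db_path="amazon_jobs.db",
        markdown_path="amazon_jobs.md",
    )
    scraper = AmazonJobScraper(engine, human_speed=1.4)
    # One title, one page (~10 jobs): enough to verify selectors and the LLM step.
    await scraper.scrape_jobs("Data Scientist location:United States", max_pages=1)


if __name__ == "__main__":
    asyncio.run(smoke_test())
```
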
Let me know if you also want to adjust the `LLMJobRefiner` prompt for Amazon’s job description format!