Web_scraping_project/amazon_main.py
2025-12-05 17:25:54 +01:00

61 lines
1.8 KiB
Python

# Project-local scraping engine and Amazon-specific scraper.
from scraping_engine import FingerprintScrapingEngine
from job_scraper2 import AmazonJobScraper # Updated class name
import os
from dotenv import load_dotenv
import asyncio
import random
import time
# Load environment variables from a local .env file at import time
# (presumably API keys/credentials used by the engine — confirm in .env).
load_dotenv()
async def main():
    """Continuously scrape Amazon job listings for a rotating set of titles.

    Builds a fingerprinted scraping engine (results persisted to
    ``amazon_jobs.db`` / ``amazon_jobs.md``) and an ``AmazonJobScraper``,
    then loops forever: shuffle the job-title list, scrape up to 3 result
    pages per title for a fixed "United States" location, and pause 90
    seconds between titles.

    Runs until interrupted; never returns.
    """
    engine = FingerprintScrapingEngine(
        seed="amazon_job_scraping_12",
        target_os="windows",
        db_path="amazon_jobs.db",
        markdown_path="amazon_jobs.md"
    )
    scraper = AmazonJobScraper(
        engine,
        human_speed=1.4,
        user_request="Extract title, company, location, description, basic qualifications, preferred qualifications, job ID, and job type (full-time, part-time, etc.)"
    )
    job_titles = [
        "Software Development Engineer",
        "Data Scientist",
        "Product Manager",
        "UX Designer",
        "Solutions Architect",
        "Machine Learning Engineer",
        "Frontend Engineer",
        "Backend Engineer",
        "Full Stack Engineer",
        "Data Engineer",
    ]
    fixed_location = "United States"  # Amazon uses country/region, not city
    while True:
        # Randomize the order each cycle so the request pattern is less
        # predictable to anti-bot systems.
        random.shuffle(job_titles)
        for job_title in job_titles:
            search_keywords = f"{job_title} location:{fixed_location}"
            print(f"\n{'='*60}")
            print(f"Starting Amazon scrape for: {search_keywords}")
            print(f"{'='*60}")
            await scraper.scrape_jobs(
                search_keywords=search_keywords,
                max_pages=3  # Amazon loads 10 per page; 3 pages = ~30 jobs
            )
            print(f"\n✅ Completed scraping for: {job_title}")
            print("⏳ Waiting 90 seconds before next job title...")
            # BUG FIX: time.sleep() would block the whole asyncio event loop
            # for 90 s; asyncio.sleep() yields control while waiting.
            await asyncio.sleep(90)
        print("\n✅ Completed full cycle. Restarting...")


if __name__ == "__main__":
    asyncio.run(main())