diff --git a/amazon_job_scraper.py b/amazon_job_scraper.py
index c73f3dc..442975c 100644
--- a/amazon_job_scraper.py
+++ b/amazon_job_scraper.py
@@ -166,7 +166,7 @@ class AmazonJobScraper:
     async def scrape_jobs(
         self,
         search_keywords: Optional[str],
-        max_pages: int = 1,
+        max_pages: int = 400,
         credentials: Optional[Dict] = None
     ):
         from datetime import timedelta  # needed for date math
diff --git a/amazon_main.py b/amazon_main.py
index 1b0199a..b8c1504 100644
--- a/amazon_main.py
+++ b/amazon_main.py
@@ -47,12 +47,12 @@ async def main():
             await scraper.scrape_jobs(
                 search_keywords=search_keywords,
-                max_pages=3  # Amazon loads 10 per page; 3 pages = ~30 jobs
+                max_pages=400  # Amazon loads 10 per page; 400 pages = ~4,000 jobs
             )

             print(f"\n✅ Completed scraping for: {job_title}")
             print(f"⏳ Waiting 90 seconds before next job title...")
-            time.sleep(90)
+            time.sleep(120)

         print(f"\n✅ Completed full cycle. Restarting...")
diff --git a/llm_agent.py b/llm_agent.py
index be1a0b6..2f23313 100644
--- a/llm_agent.py
+++ b/llm_agent.py
@@ -75,6 +75,7 @@ class LLMJobRefiner:
                 url TEXT,
                 category TEXT,
                 scraped_at TIMESTAMP,
+                posted_date TEXT,
                 created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
             )
         ''')
@@ -87,7 +88,8 @@
         cursor.execute('CREATE INDEX IF NOT EXISTS idx_job_id ON jobs(job_id)')
         cursor.execute('CREATE INDEX IF NOT EXISTS idx_category ON jobs(category)')
-
+        cursor.execute('CREATE INDEX IF NOT EXISTS idx_posted_date ON jobs(posted_date)')
+
         conn.commit()
         cursor.close()
         conn.close()
@@ -137,49 +139,31 @@ class LLMJobRefiner:
         return ""

     async def refine_job_data(self, raw_data: Dict[str, Any], target_field: str) -> Dict[str, Any]:
-        # Clean the raw HTML content for better LLM processing
         page_content = raw_data.get('page_content', '')
         cleaned_content = self._clean_html_for_llm(page_content)
-
-        # Get job_id and url from raw data
         job_id = raw_data.get('job_id', 'unknown')
         url = raw_data.get('url', 'N/A')
+        posted_date = raw_data.get('posted_date', datetime.now().strftime("%m/%d/%y"))

         prompt = f"""
-        You are a job posting data extractor with two modes:
-
-        CRITICAL INSTRUCTIONS FOR TEXT FIELDS:
-        - description: Extract the COMPLETE job description text (all paragraphs, bullet points, everything)
-        - requirements: Extract the COMPLETE requirements section text if present (all details, don't summarize)
-        - qualifications: Extract the COMPLETE qualifications section text if present (all details, don't summarize)
-        - If these sections are not explicitly labeled but exist in the content, extract the relevant portions
-
-        PRIMARY MODE (PREFERRED):
-        - Extract EXACT text as it appears on the page for all fields
-        - DO NOT summarize, paraphrase, or interpret
-        - Copy verbatim content including original wording and formatting
+        You are a job posting data extractor.
+
+        EXTRACT EXACT TEXT - DO NOT SUMMARIZE, PARAPHRASE, OR INVENT.
+
+        For these critical fields, follow these rules:
+        - description: Extract ALL job description text. If ANY job details exist (duties, responsibilities, overview), include them. Only use "Not provided" if absolutely no description exists.
+        - requirements: Extract ALL requirements text. If ANY requirements exist (skills, experience, education needed), include them. Only use "Not provided" if none exist.
+        - qualifications: Extract ALL qualifications text. If ANY qualifications exist, include them. Only use "Not provided" if none exist.
+
+        REQUIRED FIELDS (must have valid values, never "N/A"):
+        - title, company_name, job_id, url
+
+        OPTIONAL FIELDS (can be "Not provided"):
+        - location, salary_range, nature_of_work
-
-        FALLBACK MODE (ONLY IF FIELD IS MISSING):
-        - If a field is NOT explicitly stated anywhere in the content, you MAY infer it using clear contextual clues
-        - Inference rules:
-          * company_name: Look for patterns like "at [Company]", "Join [Company]", "[Company] is hiring"
-          * nature_of_work: Look for "remote", "onsite", "hybrid", "work from home", "office-based"
-          * location: Extract city/state/country mentions near job title or details
-          * title: Use the largest/primary heading if no explicit "job title" label exists
-
-        REQUIRED FIELDS (must always have a value):
-        - title: Exact job title or best inference
-        - company_name: Exact company name or best inference
-        - job_id: Use provided: {job_id}
-        - url: Use provided: {url}
-
-        OPTIONAL FIELDS (use exact text or "N/A" if not present and not inferable):
-        - salary_range
-        - nature_of_work
-
         Page Content: {cleaned_content}
+
+        Response format (ONLY return this JSON):
         {{
             "title": "...",
@@ -202,15 +186,35 @@ class LLMJobRefiner:
             )

             refined_data = self._parse_llm_response(response_text)

-            # Final validation - ensure required fields are present and meaningful
-            if refined_data:
-                required_fields = ['title', 'company_name', 'job_id', 'url']
-                for field in required_fields:
-                    if not refined_data.get(field) or refined_data[field] in ["N/A", "", "Unknown", "Company", "Job"]:
-                        return None  # LLM failed to extract properly
+            if not refined_data:
+                return None
-                return refined_data
-            return None
+            # Validate required fields
+            required_fields = ['title', 'company_name', 'job_id', 'url']
+            for field in required_fields:
+                if not refined_data.get(field) or refined_data[field].strip() in ["N/A", "", "Unknown", "Company", "Job"]:
+                    return None
+
+            # CRITICAL: Validate content fields - check if they SHOULD exist
+            content_fields = ['description', 'requirements', 'qualifications']
+            cleaned_original = cleaned_content.lower()
+
+            # Simple heuristic: if page contains job-related keywords, content fields should NOT be "Not provided"
+            job_indicators = ['responsibilit', 'duties', 'require', 'qualifi', 'skill', 'experienc', 'educat', 'degree', 'bachelor', 'master']
+            has_job_content = any(indicator in cleaned_original for indicator in job_indicators)
+
+            if has_job_content:
+                for field in content_fields:
+                    value = refined_data.get(field, "").strip()
+                    if value in ["Not provided", "N/A", ""]:
+                        # LLM failed to extract existing content
+                        print(f"  ⚠ LLM returned '{value}' for {field} but job content appears present")
+                        return None
+
+            # Add the posted_date to the refined data
+            refined_data['posted_date'] = posted_date
+
+            return refined_data

         except Exception as e:
             print(f"LLM refinement failed: {str(e)}")
@@ -247,8 +251,8 @@ class LLMJobRefiner:
             cursor.execute('''
                 INSERT INTO jobs (title, company_name, location, description, requirements,
-                                  qualifications, salary_range, nature_of_work, job_id, url, category, scraped_at)
-                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
+                                  qualifications, salary_range, nature_of_work, job_id, url, category, scraped_at, posted_date)
+                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                 ON CONFLICT (job_id) DO NOTHING
             ''', (
                 job_data.get("title", "N/A"),
@@ -262,7 +266,8 @@ class LLMJobRefiner:
                 job_data.get("job_id", "N/A"),
                 job_data.get("url", "N/A"),
                 job_data.get("category", "N/A"),
-                job_data.get("scraped_at")
+                job_data.get("scraped_at"),
job_data.get("scraped_at"), + job_data.get("posted_date", "N/A") )) conn.commit() @@ -283,15 +288,16 @@ class LLMJobRefiner: if write_header: f.write(f"# LinkedIn Jobs - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n") f.write(f"## Job: {job_data.get('title', 'N/A')}\n\n") - f.write(f"- **Keyword**: {keyword}\n") - f.write(f"- **Company**: {job_data.get('company_name', 'N/A')}\n") - f.write(f"- **Location**: {job_data.get('location', 'N/A')}\n") - f.write(f"- **Nature of Work**: {job_data.get('nature_of_work', 'N/A')}\n") - f.write(f"- **Salary Range**: {job_data.get('salary_range', 'N/A')}\n") - f.write(f"- **Job ID**: {job_data.get('job_id', 'N/A')}\n") - f.write(f"- **Category**: {job_data.get('category', 'N/A')}\n") - f.write(f"- **Scraped At**: {job_data.get('scraped_at', 'N/A')}\n") - f.write(f"- **URL**: <{job_data.get('url', 'N/A')}>\n\n") + f.write(f"- *Keyword*: {keyword}\n") + f.write(f"- *Company*: {job_data.get('company_name', 'N/A')}\n") + f.write(f"- *Location*: {job_data.get('location', 'N/A')}\n") + f.write(f"- *Nature of Work*: {job_data.get('nature_of_work', 'N/A')}\n") + f.write(f"- *Salary Range*: {job_data.get('salary_range', 'N/A')}\n") + f.write(f"- *Job ID*: {job_data.get('job_id', 'N/A')}\n") + f.write(f"- *Posted Date*: {job_data.get('posted_date', 'N/A')}\n") + f.write(f"- *Category*: {job_data.get('category', 'N/A')}\n") + f.write(f"- *Scraped At*: {job_data.get('scraped_at', 'N/A')}\n") + f.write(f"- *URL*: <{job_data.get('url', 'N/A')}>\n\n") f.write(f"### Description\n\n{job_data.get('description', 'N/A')}\n\n") f.write(f"### Requirements\n\n{job_data.get('requirements', 'N/A')}\n\n") f.write(f"### Qualifications\n\n{job_data.get('qualifications', 'N/A')}\n\n") diff --git a/trim.py b/trim.py new file mode 100644 index 0000000..a0e8ecd --- /dev/null +++ b/trim.py @@ -0,0 +1,31 @@ +# Keep cycling through all job titles + while True: + # Shuffle job titles to randomize order + random.shuffle(job_titles) + + for job_title in job_titles: + search_keywords = f"{job_title} location:{fixed_location}" + + print(f"\n{'='*60}") + print(f"Starting scrape for: {search_keywords}") + print(f"{'='*60}") + + await scraper.scrape_jobs( + search_keywords=search_keywords, + credentials={ + "email": os.getenv("SCRAPING_USERNAME"), + "password": os.getenv("SCRAPING_PASSWORD") + } + ) + + print(f"\n✅ Completed scraping for: {job_title}") + print(f"⏳ Waiting 2 minutes before next job title...") + + # Wait 2 minutes before next job title + time.sleep(120) + + print(f"\n✅ Completed full cycle of all job titles") + print(f"🔄 Starting new cycle...") + +if _name_ == "_main_": + asyncio.run(main()) \ No newline at end of file