Increase max pages to scrape and extend wait time between job title scrapes; add posted date to job data extraction

Ofure Ikheloa 2025-12-09 09:30:44 +01:00
parent cbcffa8cd4
commit e216db35f9
4 changed files with 95 additions and 58 deletions

View File

@ -166,7 +166,7 @@ class AmazonJobScraper:
async def scrape_jobs(
self,
search_keywords: Optional[str],
max_pages: int = 1,
max_pages: int = 400,
credentials: Optional[Dict] = None
):
from datetime import timedelta # needed for date math
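
Not part of this hunk: a minimal sketch of the date math the timedelta import points at, assuming the scraper normalizes relative strings such as "Posted 3 days ago" into the MM/DD/YY format used later in the refiner; parse_relative_posted_date is a hypothetical helper, not code from this commit.

import re
from datetime import datetime, timedelta

def parse_relative_posted_date(text: str) -> str:
    # Hypothetical helper: "Posted 3 days ago" -> MM/DD/YY; "today" or unparseable -> today's date.
    match = re.search(r"(\d+)\s+day", text.lower())
    days_ago = int(match.group(1)) if match else 0
    return (datetime.now() - timedelta(days=days_ago)).strftime("%m/%d/%y")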

View File

@ -47,12 +47,12 @@ async def main():
await scraper.scrape_jobs(
search_keywords=search_keywords,
max_pages=3 # Amazon loads 10 per page; 3 pages = ~30 jobs
max_pages=400 # Amazon loads 10 per page; 400 pages ≈ 4,000 jobs max per keyword
)
print(f"\n✅ Completed scraping for: {job_title}")
print(f"⏳ Waiting 90 seconds before next job title...")
time.sleep(90)
time.sleep(120)
print(f"\n✅ Completed full cycle. Restarting...")

View File

@ -75,6 +75,7 @@ class LLMJobRefiner:
url TEXT,
category TEXT,
scraped_at TIMESTAMP,
posted_date TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
''')
@ -87,6 +88,7 @@ class LLMJobRefiner:
cursor.execute('CREATE INDEX IF NOT EXISTS idx_job_id ON jobs(job_id)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_category ON jobs(category)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_posted_date ON jobs(posted_date)')
conn.commit()
cursor.close()
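
One deployment caveat (editor's assumption, not part of the diff): CREATE TABLE IF NOT EXISTS will not add posted_date to a jobs table that already exists, so a database created before this commit would also need a one-off migration along these lines (PostgreSQL 9.6+ syntax):

# One-off migration for pre-existing databases; new installs get the column from CREATE TABLE.
cursor.execute('ALTER TABLE jobs ADD COLUMN IF NOT EXISTS posted_date TEXT')
conn.commit()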
@ -137,49 +139,31 @@ class LLMJobRefiner:
return ""
async def refine_job_data(self, raw_data: Dict[str, Any], target_field: str) -> Dict[str, Any]:
# Clean the raw HTML content for better LLM processing
page_content = raw_data.get('page_content', '')
cleaned_content = self._clean_html_for_llm(page_content)
# Get job_id and url from raw data
job_id = raw_data.get('job_id', 'unknown')
url = raw_data.get('url', 'N/A')
posted_date = raw_data.get('posted_date', datetime.now().strftime("%m/%d/%y"))
prompt = f"""
You are a job posting data extractor with two modes:
You are a job posting data extractor.
CRITICAL INSTRUCTIONS FOR TEXT FIELDS:
- description: Extract the COMPLETE job description text (all paragraphs, bullet points, everything)
- requirements: Extract the COMPLETE requirements section text if present (all details, don't summarize)
- qualifications: Extract the COMPLETE qualifications section text if present (all details, don't summarize)
- If these sections are not explicitly labeled but exist in the content, extract the relevant portions
EXTRACT EXACT TEXT - DO NOT SUMMARIZE, PARAPHRASE, OR INVENT.
PRIMARY MODE (PREFERRED):
- Extract EXACT text as it appears on the page for all fields
- DO NOT summarize, paraphrase, or interpret
- Copy verbatim content including original wording and formatting
For these critical fields, follow these rules:
- description: Extract ALL job description text. If ANY job details exist (duties, responsibilities, overview), include them. Only use "Not provided" if absolutely no description exists.
- requirements: Extract ALL requirements text. If ANY requirements exist (skills, experience, education needed), include them. Only use "Not provided" if none exist.
- qualifications: Extract ALL qualifications text. If ANY qualifications exist, include them. Only use "Not provided" if none exist.
REQUIRED FIELDS (must have valid values, never "N/A"):
- title, company_name, job_id, url
FALLBACK MODE (ONLY IF FIELD IS MISSING):
- If a field is NOT explicitly stated anywhere in the content, you MAY infer it using clear contextual clues
- Inference rules:
* company_name: Look for patterns like "at [Company]", "Join [Company]", "[Company] is hiring"
* nature_of_work: Look for "remote", "onsite", "hybrid", "work from home", "office-based"
* location: Extract city/state/country mentions near job title or details
* title: Use the largest/primary heading if no explicit "job title" label exists
REQUIRED FIELDS (must always have a value):
- title: Exact job title or best inference
- company_name: Exact company name or best inference
- job_id: Use provided: {job_id}
- url: Use provided: {url}
OPTIONAL FIELDS (use exact text or "N/A" if not present and not inferable):
- salary_range
- nature_of_work
OPTIONAL FIELDS (can be "Not provided"):
- location, salary_range, nature_of_work
Page Content:
{cleaned_content}
Response format (ONLY return this JSON):
{{
"title": "...",
@ -202,15 +186,35 @@ class LLMJobRefiner:
)
refined_data = self._parse_llm_response(response_text)
# Final validation - ensure required fields are present and meaningful
if refined_data:
required_fields = ['title', 'company_name', 'job_id', 'url']
for field in required_fields:
if not refined_data.get(field) or refined_data[field] in ["N/A", "", "Unknown", "Company", "Job"]:
return None # LLM failed to extract properly
if not refined_data:
return None
return refined_data
return None
# Validate required fields
required_fields = ['title', 'company_name', 'job_id', 'url']
for field in required_fields:
if not refined_data.get(field) or refined_data[field].strip() in ["N/A", "", "Unknown", "Company", "Job"]:
return None
# CRITICAL: Validate content fields - check if they SHOULD exist
content_fields = ['description', 'requirements', 'qualifications']
cleaned_original = cleaned_content.lower()
# Simple heuristic: if page contains job-related keywords, content fields should NOT be "Not provided"
job_indicators = ['responsibilit', 'duties', 'require', 'qualifi', 'skill', 'experienc', 'educat', 'degree', 'bachelor', 'master']
has_job_content = any(indicator in cleaned_original for indicator in job_indicators)
if has_job_content:
for field in content_fields:
value = refined_data.get(field, "").strip()
if value in ["Not provided", "N/A", ""]:
# LLM failed to extract existing content
print(f" ⚠ LLM returned '{value}' for {field} but job content appears present")
return None
# Add the posted_date to the refined data
refined_data['posted_date'] = posted_date
return refined_data
except Exception as e:
print(f"LLM refinement failed: {str(e)}")
@ -247,8 +251,8 @@ class LLMJobRefiner:
cursor.execute('''
INSERT INTO jobs
(title, company_name, location, description, requirements,
qualifications, salary_range, nature_of_work, job_id, url, category, scraped_at)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
qualifications, salary_range, nature_of_work, job_id, url, category, scraped_at, posted_date)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (job_id) DO NOTHING
''', (
job_data.get("title", "N/A"),
@ -262,7 +266,8 @@ class LLMJobRefiner:
job_data.get("job_id", "N/A"),
job_data.get("url", "N/A"),
job_data.get("category", "N/A"),
job_data.get("scraped_at")
job_data.get("scraped_at"),
job_data.get("posted_date", "N/A")
))
conn.commit()
@ -283,15 +288,16 @@ class LLMJobRefiner:
if write_header:
f.write(f"# LinkedIn Jobs - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
f.write(f"## Job: {job_data.get('title', 'N/A')}\n\n")
f.write(f"- **Keyword**: {keyword}\n")
f.write(f"- **Company**: {job_data.get('company_name', 'N/A')}\n")
f.write(f"- **Location**: {job_data.get('location', 'N/A')}\n")
f.write(f"- **Nature of Work**: {job_data.get('nature_of_work', 'N/A')}\n")
f.write(f"- **Salary Range**: {job_data.get('salary_range', 'N/A')}\n")
f.write(f"- **Job ID**: {job_data.get('job_id', 'N/A')}\n")
f.write(f"- **Category**: {job_data.get('category', 'N/A')}\n")
f.write(f"- **Scraped At**: {job_data.get('scraped_at', 'N/A')}\n")
f.write(f"- **URL**: <{job_data.get('url', 'N/A')}>\n\n")
f.write(f"- *Keyword*: {keyword}\n")
f.write(f"- *Company*: {job_data.get('company_name', 'N/A')}\n")
f.write(f"- *Location*: {job_data.get('location', 'N/A')}\n")
f.write(f"- *Nature of Work*: {job_data.get('nature_of_work', 'N/A')}\n")
f.write(f"- *Salary Range*: {job_data.get('salary_range', 'N/A')}\n")
f.write(f"- *Job ID*: {job_data.get('job_id', 'N/A')}\n")
f.write(f"- *Posted Date*: {job_data.get('posted_date', 'N/A')}\n")
f.write(f"- *Category*: {job_data.get('category', 'N/A')}\n")
f.write(f"- *Scraped At*: {job_data.get('scraped_at', 'N/A')}\n")
f.write(f"- *URL*: <{job_data.get('url', 'N/A')}>\n\n")
f.write(f"### Description\n\n{job_data.get('description', 'N/A')}\n\n")
f.write(f"### Requirements\n\n{job_data.get('requirements', 'N/A')}\n\n")
f.write(f"### Qualifications\n\n{job_data.get('qualifications', 'N/A')}\n\n")

trim.py (new file, 31 lines)
View File

@ -0,0 +1,31 @@
    # Keep cycling through all job titles
    while True:
        # Shuffle job titles to randomize order
        random.shuffle(job_titles)

        for job_title in job_titles:
            search_keywords = f"{job_title} location:{fixed_location}"

            print(f"\n{'='*60}")
            print(f"Starting scrape for: {search_keywords}")
            print(f"{'='*60}")

            await scraper.scrape_jobs(
                search_keywords=search_keywords,
                credentials={
                    "email": os.getenv("SCRAPING_USERNAME"),
                    "password": os.getenv("SCRAPING_PASSWORD")
                }
            )

            print(f"\n✅ Completed scraping for: {job_title}")
            print(f"⏳ Waiting 2 minutes before next job title...")

            # Wait 2 minutes before next job title
            time.sleep(120)

        print(f"\n✅ Completed full cycle of all job titles")
        print(f"🔄 Starting new cycle...")
if __name__ == "__main__":
    asyncio.run(main())
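
The lines shown reference names that are not defined in this file's visible content (scraper, job_titles, fixed_location, random, time, os, asyncio, main). If trim.py is meant to run on its own, it would need scaffolding roughly like the sketch below; every name and value here is a placeholder for illustration, not code from this repository.

import asyncio
import os
import random
import time

class StubScraper:
    # Placeholder standing in for the repo's real scraper class (not shown in this diff).
    async def scrape_jobs(self, search_keywords, credentials=None):
        print(f"(stub) would scrape: {search_keywords}")

async def main():
    scraper = StubScraper()
    fixed_location = "United States"                    # placeholder value
    job_titles = ["Data Engineer", "Product Manager"]   # placeholder list
    # ...the while True loop from trim.py above goes here, indented inside main()...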