Increase max pages to scrape and extend wait time between job title scrapes; add posted date to job data extraction
Parent: cbcffa8cd4 · Commit: e216db35f9
@@ -166,7 +166,7 @@ class AmazonJobScraper:
     async def scrape_jobs(
         self,
         search_keywords: Optional[str],
-        max_pages: int = 1,
+        max_pages: int = 400,
         credentials: Optional[Dict] = None
     ):
         from datetime import timedelta  # needed for date math
@@ -47,12 +47,12 @@ async def main():

             await scraper.scrape_jobs(
                 search_keywords=search_keywords,
-                max_pages=3  # Amazon loads 10 per page; 3 pages = ~30 jobs
+                max_pages=400  # Amazon loads ~10 jobs per page
             )

             print(f"\n✅ Completed scraping for: {job_title}")
             print(f"⏳ Waiting 90 seconds before next job title...")
-            time.sleep(90)
+            time.sleep(120)

             print(f"\n✅ Completed full cycle. Restarting...")
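Not part of this commit: the pause between job titles is now a fixed 120 seconds, which fires on an exact cadence. If that ever needs to be randomized, a small helper along these lines would do it (the function name and the ±25% jitter are hypothetical, not repository code):

# Hypothetical helper, not in the repository: sleeps for roughly the
# 120-second pause used above, with up to ±25% random jitter.
import random
import time

def polite_pause(base_seconds: float = 120.0, jitter: float = 0.25) -> None:
    delay = base_seconds * (1.0 + random.uniform(-jitter, jitter))
    time.sleep(delay)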
llm_agent.py (106 changed lines)
@@ -75,6 +75,7 @@ class LLMJobRefiner:
                 url TEXT,
                 category TEXT,
                 scraped_at TIMESTAMP,
+                posted_date TEXT,
                 created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
             )
         ''')
@@ -87,6 +88,7 @@ class LLMJobRefiner:

         cursor.execute('CREATE INDEX IF NOT EXISTS idx_job_id ON jobs(job_id)')
         cursor.execute('CREATE INDEX IF NOT EXISTS idx_category ON jobs(category)')
+        cursor.execute('CREATE INDEX IF NOT EXISTS idx_posted_date ON jobs(posted_date)')

         conn.commit()
         cursor.close()
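Both schema changes only touch the CREATE TABLE and CREATE INDEX statements, so a database created by an earlier run would still be missing the column. A minimal one-off migration sketch, assuming the PostgreSQL backend that the %s placeholders and ON CONFLICT clause imply; the DATABASE_URL environment variable and psycopg2 connection style are assumptions, not repository code:

# Hypothetical migration for databases created before this commit.
import os
import psycopg2

conn = psycopg2.connect(os.getenv("DATABASE_URL"))
cur = conn.cursor()
cur.execute("ALTER TABLE jobs ADD COLUMN IF NOT EXISTS posted_date TEXT")
cur.execute("CREATE INDEX IF NOT EXISTS idx_posted_date ON jobs(posted_date)")
conn.commit()
cur.close()
conn.close()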
@@ -137,49 +139,31 @@ class LLMJobRefiner:
             return ""

     async def refine_job_data(self, raw_data: Dict[str, Any], target_field: str) -> Dict[str, Any]:
-        # Clean the raw HTML content for better LLM processing
         page_content = raw_data.get('page_content', '')
         cleaned_content = self._clean_html_for_llm(page_content)

-        # Get job_id and url from raw data
         job_id = raw_data.get('job_id', 'unknown')
         url = raw_data.get('url', 'N/A')
+        posted_date = raw_data.get('posted_date', datetime.now().strftime("%m/%d/%y"))

         prompt = f"""
-You are a job posting data extractor with two modes:
-
-CRITICAL INSTRUCTIONS FOR TEXT FIELDS:
-- description: Extract the COMPLETE job description text (all paragraphs, bullet points, everything)
-- requirements: Extract the COMPLETE requirements section text if present (all details, don't summarize)
-- qualifications: Extract the COMPLETE qualifications section text if present (all details, don't summarize)
-- If these sections are not explicitly labeled but exist in the content, extract the relevant portions
-
-PRIMARY MODE (PREFERRED):
-- Extract EXACT text as it appears on the page for all fields
-- DO NOT summarize, paraphrase, or interpret
-- Copy verbatim content including original wording and formatting
-
-FALLBACK MODE (ONLY IF FIELD IS MISSING):
-- If a field is NOT explicitly stated anywhere in the content, you MAY infer it using clear contextual clues
-- Inference rules:
-  * company_name: Look for patterns like "at [Company]", "Join [Company]", "[Company] is hiring"
-  * nature_of_work: Look for "remote", "onsite", "hybrid", "work from home", "office-based"
-  * location: Extract city/state/country mentions near job title or details
-  * title: Use the largest/primary heading if no explicit "job title" label exists
-
-REQUIRED FIELDS (must always have a value):
-- title: Exact job title or best inference
-- company_name: Exact company name or best inference
-- job_id: Use provided: {job_id}
-- url: Use provided: {url}
-
-OPTIONAL FIELDS (use exact text or "N/A" if not present and not inferable):
-- salary_range
-- nature_of_work
+You are a job posting data extractor.
+
+EXTRACT EXACT TEXT - DO NOT SUMMARIZE, PARAPHRASE, OR INVENT.
+
+For these critical fields, follow these rules:
+- description: Extract ALL job description text. If ANY job details exist (duties, responsibilities, overview), include them. Only use "Not provided" if absolutely no description exists.
+- requirements: Extract ALL requirements text. If ANY requirements exist (skills, experience, education needed), include them. Only use "Not provided" if none exist.
+- qualifications: Extract ALL qualifications text. If ANY qualifications exist, include them. Only use "Not provided" if none exist.
+
+REQUIRED FIELDS (must have valid values, never "N/A"):
+- title, company_name, job_id, url
+
+OPTIONAL FIELDS (can be "Not provided"):
+- location, salary_range, nature_of_work

 Page Content:
 {cleaned_content}

 Response format (ONLY return this JSON):
 {{
     "title": "...",
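The new posted_date value falls back to the scrape date in %m/%d/%y format whenever the scraper did not capture a posted date. A standalone illustration of that fallback (the empty raw_data dict is made up for the example):

# Illustration of the posted_date fallback used above.
from datetime import datetime

raw_data = {}  # pretend the scraper captured no posted date
posted_date = raw_data.get('posted_date', datetime.now().strftime("%m/%d/%y"))
print(posted_date)  # e.g. "05/31/25" on the day the job was scraped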
@@ -202,15 +186,35 @@ class LLMJobRefiner:
             )
             refined_data = self._parse_llm_response(response_text)

-            # Final validation - ensure required fields are present and meaningful
-            if refined_data:
-                required_fields = ['title', 'company_name', 'job_id', 'url']
-                for field in required_fields:
-                    if not refined_data.get(field) or refined_data[field] in ["N/A", "", "Unknown", "Company", "Job"]:
-                        return None  # LLM failed to extract properly
-
-                return refined_data
-            return None
+            if not refined_data:
+                return None
+
+            # Validate required fields
+            required_fields = ['title', 'company_name', 'job_id', 'url']
+            for field in required_fields:
+                if not refined_data.get(field) or refined_data[field].strip() in ["N/A", "", "Unknown", "Company", "Job"]:
+                    return None
+
+            # CRITICAL: Validate content fields - check if they SHOULD exist
+            content_fields = ['description', 'requirements', 'qualifications']
+            cleaned_original = cleaned_content.lower()
+
+            # Simple heuristic: if page contains job-related keywords, content fields should NOT be "Not provided"
+            job_indicators = ['responsibilit', 'duties', 'require', 'qualifi', 'skill', 'experienc', 'educat', 'degree', 'bachelor', 'master']
+            has_job_content = any(indicator in cleaned_original for indicator in job_indicators)
+
+            if has_job_content:
+                for field in content_fields:
+                    value = refined_data.get(field, "").strip()
+                    if value in ["Not provided", "N/A", ""]:
+                        # LLM failed to extract existing content
+                        print(f" ⚠ LLM returned '{value}' for {field} but job content appears present")
+                        return None
+
+            # Add the posted_date to the refined data
+            refined_data['posted_date'] = posted_date
+
+            return refined_data

         except Exception as e:
             print(f"LLM refinement failed: {str(e)}")
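Read in isolation, the added validation is a small predicate: if the cleaned page text contains obvious job-content keywords but the model still answered "Not provided" for description, requirements, or qualifications, the whole result is discarded. A standalone sketch of that check (the function name and sample strings are illustrative, not repository code):

# Standalone sketch of the content-field heuristic added above.
JOB_INDICATORS = ['responsibilit', 'duties', 'require', 'qualifi', 'skill',
                  'experienc', 'educat', 'degree', 'bachelor', 'master']

def looks_incomplete(refined: dict, cleaned_content: str) -> bool:
    text = cleaned_content.lower()
    if not any(ind in text for ind in JOB_INDICATORS):
        return False  # no obvious job content on the page; nothing to enforce
    return any(refined.get(f, "").strip() in ("Not provided", "N/A", "")
               for f in ("description", "requirements", "qualifications"))

# A refusal on a page that clearly lists requirements gets rejected:
print(looks_incomplete({"description": "Not provided"}, "Requirements: 3+ years Python"))  # True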
@@ -247,8 +251,8 @@ class LLMJobRefiner:
             cursor.execute('''
                 INSERT INTO jobs
                 (title, company_name, location, description, requirements,
-                 qualifications, salary_range, nature_of_work, job_id, url, category, scraped_at)
-                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
+                 qualifications, salary_range, nature_of_work, job_id, url, category, scraped_at, posted_date)
+                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                 ON CONFLICT (job_id) DO NOTHING
             ''', (
                 job_data.get("title", "N/A"),
@@ -262,7 +266,8 @@ class LLMJobRefiner:
                 job_data.get("job_id", "N/A"),
                 job_data.get("url", "N/A"),
                 job_data.get("category", "N/A"),
-                job_data.get("scraped_at")
+                job_data.get("scraped_at"),
+                job_data.get("posted_date", "N/A")
             ))

             conn.commit()
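Since posted_date is stored as TEXT in the %m/%d/%y form that refine_job_data defaults to, recency filtering is easiest after parsing the string back into a datetime; a hedged helper, not part of the commit:

# Hypothetical helper for filtering rows by the stored posted_date string.
from datetime import datetime, timedelta

def is_recent(posted_date: str, days: int = 7) -> bool:
    # Best-effort check; returns False for "N/A" or unparseable values.
    try:
        posted = datetime.strptime(posted_date, "%m/%d/%y")
    except ValueError:
        return False
    return datetime.now() - posted <= timedelta(days=days)

print(is_recent(datetime.now().strftime("%m/%d/%y")))  # True
print(is_recent("N/A"))                                # False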
@@ -283,15 +288,16 @@ class LLMJobRefiner:
             if write_header:
                 f.write(f"# LinkedIn Jobs - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
             f.write(f"## Job: {job_data.get('title', 'N/A')}\n\n")
-            f.write(f"- **Keyword**: {keyword}\n")
-            f.write(f"- **Company**: {job_data.get('company_name', 'N/A')}\n")
-            f.write(f"- **Location**: {job_data.get('location', 'N/A')}\n")
-            f.write(f"- **Nature of Work**: {job_data.get('nature_of_work', 'N/A')}\n")
-            f.write(f"- **Salary Range**: {job_data.get('salary_range', 'N/A')}\n")
-            f.write(f"- **Job ID**: {job_data.get('job_id', 'N/A')}\n")
-            f.write(f"- **Category**: {job_data.get('category', 'N/A')}\n")
-            f.write(f"- **Scraped At**: {job_data.get('scraped_at', 'N/A')}\n")
-            f.write(f"- **URL**: <{job_data.get('url', 'N/A')}>\n\n")
+            f.write(f"- *Keyword*: {keyword}\n")
+            f.write(f"- *Company*: {job_data.get('company_name', 'N/A')}\n")
+            f.write(f"- *Location*: {job_data.get('location', 'N/A')}\n")
+            f.write(f"- *Nature of Work*: {job_data.get('nature_of_work', 'N/A')}\n")
+            f.write(f"- *Salary Range*: {job_data.get('salary_range', 'N/A')}\n")
+            f.write(f"- *Job ID*: {job_data.get('job_id', 'N/A')}\n")
+            f.write(f"- *Posted Date*: {job_data.get('posted_date', 'N/A')}\n")
+            f.write(f"- *Category*: {job_data.get('category', 'N/A')}\n")
+            f.write(f"- *Scraped At*: {job_data.get('scraped_at', 'N/A')}\n")
+            f.write(f"- *URL*: <{job_data.get('url', 'N/A')}>\n\n")
             f.write(f"### Description\n\n{job_data.get('description', 'N/A')}\n\n")
             f.write(f"### Requirements\n\n{job_data.get('requirements', 'N/A')}\n\n")
             f.write(f"### Qualifications\n\n{job_data.get('qualifications', 'N/A')}\n\n")
trim.py (new file, 31 lines)
@@ -0,0 +1,31 @@
+    # Keep cycling through all job titles
+    while True:
+        # Shuffle job titles to randomize order
+        random.shuffle(job_titles)
+
+        for job_title in job_titles:
+            search_keywords = f"{job_title} location:{fixed_location}"
+
+            print(f"\n{'='*60}")
+            print(f"Starting scrape for: {search_keywords}")
+            print(f"{'='*60}")
+
+            await scraper.scrape_jobs(
+                search_keywords=search_keywords,
+                credentials={
+                    "email": os.getenv("SCRAPING_USERNAME"),
+                    "password": os.getenv("SCRAPING_PASSWORD")
+                }
+            )
+
+            print(f"\n✅ Completed scraping for: {job_title}")
+            print(f"⏳ Waiting 2 minutes before next job title...")
+
+            # Wait 2 minutes before next job title
+            time.sleep(120)
+
+        print(f"\n✅ Completed full cycle of all job titles")
+        print(f"🔄 Starting new cycle...")
+
+if __name__ == "__main__":
+    asyncio.run(main())