Modify the LLM agent so that it must identify and scrape all provided fields
This commit is contained in:
parent
224b9c3122
commit
8fa59ba69b
74
llm_agent.py
74
llm_agent.py
@ -137,49 +137,30 @@ class LLMJobRefiner:
|
||||
return ""
|
||||
|
||||
async def refine_job_data(self, raw_data: Dict[str, Any], target_field: str) -> Dict[str, Any]:
|
||||
# Clean the raw HTML content for better LLM processing
|
||||
page_content = raw_data.get('page_content', '')
|
||||
cleaned_content = self._clean_html_for_llm(page_content)
|
||||
|
||||
# Get job_id and url from raw data
|
||||
job_id = raw_data.get('job_id', 'unknown')
|
||||
url = raw_data.get('url', 'N/A')
|
||||
|
||||
prompt = f"""
|
||||
You are a job posting data extractor with two modes:
|
||||
You are a job posting data extractor.
|
||||
|
||||
CRITICAL INSTRUCTIONS FOR TEXT FIELDS:
|
||||
- description: Extract the COMPLETE job description text (all paragraphs, bullet points, everything)
|
||||
- requirements: Extract the COMPLETE requirements section text if present (all details, don't summarize)
|
||||
- qualifications: Extract the COMPLETE qualifications section text if present (all details, don't summarize)
|
||||
- If these sections are not explicitly labeled but exist in the content, extract the relevant portions
|
||||
EXTRACT EXACT TEXT - DO NOT SUMMARIZE, PARAPHRASE, OR INVENT.
|
||||
|
||||
PRIMARY MODE (PREFERRED):
|
||||
- Extract EXACT text as it appears on the page for all fields
|
||||
- DO NOT summarize, paraphrase, or interpret
|
||||
- Copy verbatim content including original wording and formatting
|
||||
For these critical fields, follow these rules:
|
||||
- description: Extract ALL job description text. If ANY job details exist (duties, responsibilities, overview), include them. Only use "Not provided" if absolutely no description exists.
|
||||
- requirements: Extract ALL requirements text. If ANY requirements exist (skills, experience, education needed), include them. Only use "Not provided" if none exist.
|
||||
- qualifications: Extract ALL qualifications text. If ANY qualifications exist, include them. Only use "Not provided" if none exist.
|
||||
|
||||
REQUIRED FIELDS (must have valid values, never "N/A"):
|
||||
- title, company_name, job_id, url
|
||||
|
||||
FALLBACK MODE (ONLY IF FIELD IS MISSING):
|
||||
- If a field is NOT explicitly stated anywhere in the content, you MAY infer it using clear contextual clues
|
||||
- Inference rules:
|
||||
* company_name: Look for patterns like "at [Company]", "Join [Company]", "[Company] is hiring"
|
||||
* nature_of_work: Look for "remote", "onsite", "hybrid", "work from home", "office-based"
|
||||
* location: Extract city/state/country mentions near job title or details
|
||||
* title: Use the largest/primary heading if no explicit "job title" label exists
|
||||
|
||||
REQUIRED FIELDS (must always have a value):
|
||||
- title: Exact job title or best inference
|
||||
- company_name: Exact company name or best inference
|
||||
- job_id: Use provided: {job_id}
|
||||
- url: Use provided: {url}
|
||||
|
||||
OPTIONAL FIELDS (use exact text or "N/A" if not present and not inferable):
|
||||
- salary_range
|
||||
- nature_of_work
|
||||
OPTIONAL FIELDS (can be "Not provided"):
|
||||
- location, salary_range, nature_of_work
|
||||
|
||||
Page Content:
|
||||
{cleaned_content}
|
||||
|
||||
Response format (ONLY return this JSON):
|
||||
{{
|
||||
"title": "...",
|
||||
@ -202,15 +183,32 @@ class LLMJobRefiner:
|
||||
)
|
||||
refined_data = self._parse_llm_response(response_text)
|
||||
|
||||
# Final validation - ensure required fields are present and meaningful
|
||||
if refined_data:
|
||||
required_fields = ['title', 'company_name', 'job_id', 'url']
|
||||
for field in required_fields:
|
||||
if not refined_data.get(field) or refined_data[field] in ["N/A", "", "Unknown", "Company", "Job"]:
|
||||
return None # LLM failed to extract properly
|
||||
if not refined_data:
|
||||
return None
|
||||
|
||||
return refined_data
|
||||
return None
|
||||
# Validate required fields
|
||||
required_fields = ['title', 'company_name', 'job_id', 'url']
|
||||
for field in required_fields:
|
||||
if not refined_data.get(field) or refined_data[field].strip() in ["N/A", "", "Unknown", "Company", "Job"]:
|
||||
return None
|
||||
|
||||
# CRITICAL: Validate content fields - check if they SHOULD exist
|
||||
content_fields = ['description', 'requirements', 'qualifications']
|
||||
cleaned_original = cleaned_content.lower()
|
||||
|
||||
# Simple heuristic: if page contains job-related keywords, content fields should NOT be "Not provided"
|
||||
job_indicators = ['responsibilit', 'duties', 'require', 'qualifi', 'skill', 'experienc', 'educat', 'degree', 'bachelor', 'master']
|
||||
has_job_content = any(indicator in cleaned_original for indicator in job_indicators)
|
||||
|
||||
if has_job_content:
|
||||
for field in content_fields:
|
||||
value = refined_data.get(field, "").strip()
|
||||
if value in ["Not provided", "N/A", ""]:
|
||||
# LLM failed to extract existing content
|
||||
print(f" ⚠️ LLM returned '{value}' for {field} but job content appears present")
|
||||
return None
|
||||
|
||||
return refined_data
|
||||
|
||||
except Exception as e:
|
||||
print(f"LLM refinement failed: {str(e)}")
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user