# Web_scraping_project/comparator.py

import csv
import os
from urllib.parse import urlparse

# Define platform mappings: (input_file, companies_file, platform_name)
platforms = [
    ("ashby.csv", "ashbycompanies.csv", "ashby"),
    ("gem.csv", "gemcompanies.csv", "gem"),
    ("greenhouse.csv", "greenhousecompanies.csv", "greenhouse"),
    ("lever.csv", "levercompanies.csv", "lever"),
    ("rippling.csv", "ripplingcompanies.csv", "rippling"),
    ("workable.csv", "workablecompanies.csv", "workable"),
    ("workday.csv", "workdaycompanies.csv", "workday"),
]
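
# The CSVs are assumed to share at least a 'url' column, since that is the only
# field the readers below rely on. A hypothetical row might look like:
#
#   url,title,location
#   https://jobs.lever.co/acme/123,Backend Engineer,Remote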

def normalize_url(platform, url):
    """Normalize a URL to a company identifier based on platform."""
    if not url:
        return None
    try:
        parsed = urlparse(url.lower().strip())
        netloc = parsed.netloc
        path = parsed.path
        if platform == "ashby":
            # https://jobs.ashbyhq.com/company_slug/...
            if "ashbyhq.com" in netloc:
                parts = [p for p in path.split('/') if p]
                return parts[0] if parts else None
        elif platform == "greenhouse":
            # https://boards.greenhouse.io/company_slug/...
            if "greenhouse.io" in netloc:
                parts = [p for p in path.split('/') if p]
                if len(parts) >= 2 and parts[0] == "boards":
                    return parts[1]
                elif len(parts) >= 1:
                    return parts[0]
                return None
        elif platform == "lever":
            # https://jobs.lever.co/company_slug/...
            if "lever.co" in netloc:
                parts = [p for p in path.split('/') if p]
                return parts[0] if parts else None
        elif platform == "workable":
            # https://apply.workable.com/company_slug/...
            if "workable.com" in netloc:
                parts = [p for p in path.split('/') if p]
                # Usually: /company_slug/j/jobid/ → take first non-'j' segment
                for part in parts:
                    if part != 'j' and len(part) > 2:
                        return part
                return parts[0] if parts else None
        elif platform == "workday":
            # https://company.wd5.myworkdayjobs.com/... → company = subdomain
            if "myworkdayjobs.com" in netloc or "wd" in netloc:
                # Extract the subdomain before the main domain
                subdomain = netloc.split('.')[0]
                if subdomain and subdomain not in ['www', 'jobs', 'apply', '']:
                    return subdomain
                # Fallback: look for the company in the path (rare)
                parts = [p for p in path.split('/') if p]
                if parts:
                    return parts[0]
                return None
        elif platform == "gem":
            # https://gem.com/company/... or https://www.gem.com/careers/company/...
            if "gem.com" in netloc:
                parts = [p for p in path.split('/') if p]
                # Often: /company-slug or /careers/company-slug
                for i, part in enumerate(parts):
                    if part in ['company', 'careers', 'jobs']:
                        if i + 1 < len(parts):
                            return parts[i + 1]
                return parts[0] if parts else None
        elif platform == "rippling":
            # Rippling uses a generic domain; hard to extract the company.
            # Best effort: use the full domain + first path segment.
            if "rippling.com" in netloc:
                parts = [p for p in path.split('/') if p]
                if parts:
                    return f"{netloc}/{parts[0]}"
                return netloc
        # Fallback: return the full URL if unrecognized
        return url
    except Exception:
        return url
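
# Illustrative results for a few made-up URLs (hypothetical slugs, shown only
# to document the normalizer's behavior):
#   normalize_url("lever", "https://jobs.lever.co/acme/123")          -> "acme"
#   normalize_url("greenhouse", "https://boards.greenhouse.io/acme")  -> "acme"
#   normalize_url("workday", "https://acme.wd5.myworkdayjobs.com/en") -> "acme"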

def read_company_signatures(filepath, platform):
    """Read and normalize company identifiers from a companies CSV."""
    if not os.path.exists(filepath):
        return set()
    signatures = set()
    with open(filepath, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            # row.get('url') can be None for short rows, so guard before strip()
            url = (row.get('url') or '').strip()
            if url:
                sig = normalize_url(platform, url)
                if sig:
                    signatures.add(sig)
    return signatures
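
# Signatures are kept in a set so the filter step below can do an O(1)
# membership check per row instead of rescanning the companies list.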

def filter_csv_by_signatures(input_file, excluded_signatures, platform):
    """Keep only rows whose normalized URL is NOT in excluded_signatures."""
    if not os.path.exists(input_file):
        return [], None
    kept_rows = []
    with open(input_file, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        fieldnames = reader.fieldnames
        for row in reader:
            url = (row.get('url') or '').strip()
            if not url:
                kept_rows.append(row)  # keep if no URL (shouldn't happen)
                continue
            sig = normalize_url(platform, url)
            if sig not in excluded_signatures:
                kept_rows.append(row)
    return kept_rows, fieldnames

def write_csv(filepath, rows, fieldnames):
    """Write rows to CSV file."""
    with open(filepath, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
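
# Note: main() passes input_file back into write_csv, so each platform CSV is
# filtered in place; keep backups if the original scrape results matter.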

def main():
    for input_file, companies_file, platform in platforms:
        print(f"Processing {input_file} against {companies_file} using '{platform}' normalizer...")
        # Step 1: Load and normalize known company signatures
        known_signatures = read_company_signatures(companies_file, platform)
        print(f" → Loaded {len(known_signatures)} known company signatures from {companies_file}")
        # Step 2: Filter the input file using those signatures
        kept_rows, fieldnames = filter_csv_by_signatures(input_file, known_signatures, platform)
        # Step 3: Write the filtered rows back
        if fieldnames:
            write_csv(input_file, kept_rows, fieldnames)
            print(f" → Kept {len(kept_rows)} new job URLs in {input_file}")
        else:
            if os.path.exists(input_file):
                os.remove(input_file)
            print(f" → {input_file} was empty or invalid; removed.")
    print("\n✅ All platforms processed successfully.")


if __name__ == "__main__":
    main()
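
# The filenames above are relative, so presumably the script is meant to be run
# from the directory that holds the platform CSVs, e.g.:
#
#   cd Web_scraping_project && python comparator.py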