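"""Filter per-platform job CSVs against lists of already-known companies.

For each (jobs CSV, companies CSV, platform) triple defined below, every URL is
normalized to a company identifier; job rows whose company already appears in
the companies CSV are dropped, and the remaining rows are written back to the
jobs CSV in place.
"""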
import csv
import os
from urllib.parse import urlparse

# Define platform mappings: (input_file, companies_file, platform_name)
platforms = [
    ("ashby.csv", "ashbycompanies.csv", "ashby"),
    ("gem.csv", "gemcompanies.csv", "gem"),
    ("greenhouse.csv", "greenhousecompanies.csv", "greenhouse"),
    ("lever.csv", "levercompanies.csv", "lever"),
    ("rippling.csv", "ripplingcompanies.csv", "rippling"),
    ("workable.csv", "workablecompanies.csv", "workable"),
    ("workday.csv", "workdaycompanies.csv", "workday"),
]
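
# Note: every CSV above is expected to have a header row with a 'url' column;
# both the signature loading and the filtering below read that field.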


def normalize_url(platform, url):
    """Normalize a URL to a company identifier based on platform."""
    if not url:
        return None
    try:
        parsed = urlparse(url.lower().strip())
        netloc = parsed.netloc
        path = parsed.path

        if platform == "ashby":
            # https://jobs.ashbyhq.com/company_slug/...
            if "ashbyhq.com" in netloc:
                parts = [p for p in path.split('/') if p]
                return parts[0] if parts else None

        elif platform == "greenhouse":
            # https://boards.greenhouse.io/company_slug/...
            if "greenhouse.io" in netloc:
                parts = [p for p in path.split('/') if p]
                if len(parts) >= 2 and parts[0] == "boards":
                    return parts[1]
                elif len(parts) >= 1:
                    return parts[0]
                return None

        elif platform == "lever":
            # https://jobs.lever.co/company_slug/...
            if "lever.co" in netloc:
                parts = [p for p in path.split('/') if p]
                return parts[0] if parts else None

        elif platform == "workable":
            # https://apply.workable.com/company_slug/...
            if "workable.com" in netloc:
                parts = [p for p in path.split('/') if p]
                # Usually: /company_slug/j/jobid/ → take first non-'j' segment
                for part in parts:
                    if part != 'j' and len(part) > 2:
                        return part
                return parts[0] if parts else None

        elif platform == "workday":
            # https://company.wdN.myworkdayjobs.com/... → company = subdomain
            if "myworkdayjobs.com" in netloc or "wd" in netloc:
                # Extract subdomain before main domain
                subdomain = netloc.split('.')[0]
                if subdomain and subdomain not in ['www', 'jobs', 'apply', '']:
                    return subdomain
                # Fallback: look for company in path (rare)
                parts = [p for p in path.split('/') if p]
                if parts:
                    return parts[0]
                return None

        elif platform == "gem":
            # https://gem.com/company/... or https://www.gem.com/careers/company/...
            if "gem.com" in netloc:
                parts = [p for p in path.split('/') if p]
                # Often: /company-slug or /careers/company-slug
                for i, part in enumerate(parts):
                    if part in ['company', 'careers', 'jobs']:
                        if i + 1 < len(parts):
                            return parts[i + 1]
                return parts[0] if parts else None

        elif platform == "rippling":
            # Rippling uses a generic domain; hard to extract the company.
            # Best effort: use full domain + first path segment.
            if "rippling.com" in netloc:
                parts = [p for p in path.split('/') if p]
                if parts:
                    return f"{netloc}/{parts[0]}"
                return netloc

        # Fallback: return the full URL if unrecognized
        return url

    except Exception:
        return url
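
# Illustrative examples (assuming typical board URL shapes; not exhaustive):
#   normalize_url("lever", "https://jobs.lever.co/acme/1234")              -> "acme"
#   normalize_url("greenhouse", "https://boards.greenhouse.io/acme/jobs/1") -> "acme"
#   normalize_url("workday", "https://acme.wd5.myworkdayjobs.com/careers")  -> "acme"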


def read_company_signatures(filepath, platform):
    """Read and normalize company identifiers from a companies CSV."""
    if not os.path.exists(filepath):
        return set()
    signatures = set()
    with open(filepath, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            # Guard against short rows, which DictReader fills with None.
            url = (row.get('url') or '').strip()
            if url:
                sig = normalize_url(platform, url)
                if sig:
                    signatures.add(sig)
    return signatures


def filter_csv_by_signatures(input_file, excluded_signatures, platform):
    """Keep only rows whose normalized URL is NOT in excluded_signatures."""
    if not os.path.exists(input_file):
        return [], None
    kept_rows = []
    with open(input_file, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        fieldnames = reader.fieldnames
        for row in reader:
            url = (row.get('url') or '').strip()
            if not url:
                kept_rows.append(row)  # keep rows without a URL (shouldn't happen)
                continue
            sig = normalize_url(platform, url)
            if sig not in excluded_signatures:
                kept_rows.append(row)
    return kept_rows, fieldnames


def write_csv(filepath, rows, fieldnames):
    """Write rows to a CSV file."""
    with open(filepath, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


def main():
    for input_file, companies_file, platform in platforms:
        print(f"Processing {input_file} against {companies_file} using '{platform}' normalizer...")

        # Step 1: Load and normalize known company signatures
        known_signatures = read_company_signatures(companies_file, platform)
        print(f" → Loaded {len(known_signatures)} known company signatures from {companies_file}")

        # Step 2: Filter input file using signatures
        kept_rows, fieldnames = filter_csv_by_signatures(input_file, known_signatures, platform)

        # Step 3: Write back filtered data
        if fieldnames:
            write_csv(input_file, kept_rows, fieldnames)
            print(f" → Kept {len(kept_rows)} new job URLs in {input_file}")
        elif os.path.exists(input_file):
            os.remove(input_file)
            print(f" → {input_file} was empty or invalid — removed.")
        else:
            print(f" → {input_file} not found; skipping.")

    print("\n✅ All platforms processed successfully.")


if __name__ == "__main__":
    main()