import csv
import os
from urllib.parse import urlparse

# Platform mappings: (input_file, companies_file, platform_name)
platforms = [
    ("ashby.csv", "ashbycompanies.csv", "ashby"),
    ("gem.csv", "gemcompanies.csv", "gem"),
    ("greenhouse.csv", "greenhousecompanies.csv", "greenhouse"),
    ("lever.csv", "levercompanies.csv", "lever"),
    ("rippling.csv", "ripplingcompanies.csv", "rippling"),
    ("workable.csv", "workablecompanies.csv", "workable"),
    ("workday.csv", "workdaycompanies.csv", "workday"),
]


def normalize_url(platform, url):
    """Normalize a job URL to a company identifier based on platform."""
    if not url:
        return None
    try:
        parsed = urlparse(url.lower().strip())
        netloc = parsed.netloc
        path = parsed.path

        if platform == "ashby":
            # https://jobs.ashbyhq.com/company_slug/...
            if "ashbyhq.com" in netloc:
                parts = [p for p in path.split('/') if p]
                return parts[0] if parts else None
        elif platform == "greenhouse":
            # https://boards.greenhouse.io/company_slug/...
            if "greenhouse.io" in netloc:
                parts = [p for p in path.split('/') if p]
                if len(parts) >= 2 and parts[0] == "boards":
                    return parts[1]
                elif len(parts) >= 1:
                    return parts[0]
                return None
        elif platform == "lever":
            # https://jobs.lever.co/company_slug/...
            if "lever.co" in netloc:
                parts = [p for p in path.split('/') if p]
                return parts[0] if parts else None
        elif platform == "workable":
            # https://apply.workable.com/company_slug/...
            if "workable.com" in netloc:
                parts = [p for p in path.split('/') if p]
                # Usually /company_slug/j/jobid/ → take the first segment
                # that is not the 'j' marker
                for part in parts:
                    if part != 'j' and len(part) > 2:
                        return part
                return parts[0] if parts else None
        elif platform == "workday":
            # https://company.wdN.myworkdayjobs.com/... → company = subdomain
            # Match wdN tenant labels explicitly to avoid substring false
            # positives (e.g. "wd" appearing inside an unrelated domain name).
            is_workday_host = "myworkdayjobs.com" in netloc or any(
                label.startswith("wd") and label[2:].isdigit()
                for label in netloc.split('.')
            )
            if is_workday_host:
                subdomain = netloc.split('.')[0]
                if subdomain and subdomain not in ['www', 'jobs', 'apply', '']:
                    return subdomain
            # Fallback: look for the company in the path (rare)
            parts = [p for p in path.split('/') if p]
            if parts:
                return parts[0]
            return None
        elif platform == "gem":
            # https://gem.com/company/... or https://www.gem.com/careers/company/...
if "gem.com" in netloc: parts = [p for p in path.split('/') if p] # Often: /company-slug or /careers/company-slug for i, part in enumerate(parts): if part in ['company', 'careers', 'jobs']: if i + 1 < len(parts): return parts[i + 1] return parts[0] if parts else None elif platform == "rippling": # Rippling uses generic domain; hard to extract company # Best effort: use full domain + first path segment if "rippling.com" in netloc: parts = [p for p in path.split('/') if p] if parts: return f"{netloc}/{parts[0]}" return netloc # Fallback: return full URL if unrecognized return url except Exception: return url def read_company_signatures(filepath, platform): """Read and normalize company identifiers from companies CSV.""" if not os.path.exists(filepath): return set() signatures = set() with open(filepath, 'r', encoding='utf-8') as f: reader = csv.DictReader(f) for row in reader: url = row.get('url', '').strip() if url: sig = normalize_url(platform, url) if sig: signatures.add(sig) return signatures def filter_csv_by_signatures(input_file, excluded_signatures, platform): """Keep only rows whose normalized URL is NOT in excluded_signatures.""" if not os.path.exists(input_file): return [], None kept_rows = [] with open(input_file, 'r', encoding='utf-8') as f: reader = csv.DictReader(f) fieldnames = reader.fieldnames for row in reader: url = row.get('url', '').strip() if not url: kept_rows.append(row) # keep if no URL (shouldn't happen) continue sig = normalize_url(platform, url) if sig not in excluded_signatures: kept_rows.append(row) return kept_rows, fieldnames def write_csv(filepath, rows, fieldnames): """Write rows to CSV file.""" with open(filepath, 'w', newline='', encoding='utf-8') as f: writer = csv.DictWriter(f, fieldnames=fieldnames) writer.writeheader() writer.writerows(rows) def main(): for input_file, companies_file, platform in platforms: print(f"Processing {input_file} against {companies_file} using '{platform}' normalizer...") # Step 1: Load and normalize known company signatures known_signatures = read_company_signatures(companies_file, platform) print(f" → Loaded {len(known_signatures)} known company signatures from {companies_file}") # Step 2: Filter input file using signatures kept_rows, fieldnames = filter_csv_by_signatures(input_file, known_signatures, platform) # Step 3: Write back filtered data if fieldnames: write_csv(input_file, kept_rows, fieldnames) print(f" → Kept {len(kept_rows)} new job URLs in {input_file}") else: if os.path.exists(input_file): os.remove(input_file) print(f" → {input_file} was empty or invalid — removed.") print("\n✅ All platforms processed successfully.") if __name__ == "__main__": main()