# Web_scraping_project/comparator.py

import csv
import os
from urllib.parse import urlparse

# Define platform mappings: (input_file, companies_file, platform_name)
platforms = [
    ("ashby.csv", "ashbycompanies.csv", "ashby"),
    ("gem.csv", "gemcompanies.csv", "gem"),
    ("greenhouse.csv", "greenhousecompanies.csv", "greenhouse"),
    ("lever.csv", "levercompanies.csv", "lever"),
    ("rippling.csv", "ripplingcompanies.csv", "rippling"),
    ("workable.csv", "workablecompanies.csv", "workable"),
    ("workday.csv", "workdaycompanies.csv", "workday"),
]
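
# The CSVs are assumed to share at least a 'url' column, since that is the only
# field the readers below rely on. A hypothetical row might look like:
#
#   url,title,location
#   https://jobs.lever.co/acme/123,Backend Engineer,Remote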

def normalize_url(platform, url):
    """Normalize a URL to a company identifier based on platform."""
    if not url:
        return None
    try:
        parsed = urlparse(url.lower().strip())
        netloc = parsed.netloc
        path = parsed.path
        if platform == "ashby":
            # https://jobs.ashbyhq.com/company_slug/...
            if "ashbyhq.com" in netloc:
                parts = [p for p in path.split('/') if p]
                return parts[0] if parts else None
        elif platform == "greenhouse":
            # https://boards.greenhouse.io/company_slug/...
            if "greenhouse.io" in netloc:
                parts = [p for p in path.split('/') if p]
                if len(parts) >= 2 and parts[0] == "boards":
                    return parts[1]
                elif len(parts) >= 1:
                    return parts[0]
                return None
        elif platform == "lever":
            # https://jobs.lever.co/company_slug/...
            if "lever.co" in netloc:
                parts = [p for p in path.split('/') if p]
                return parts[0] if parts else None
        elif platform == "workable":
            # https://apply.workable.com/company_slug/...
            if "workable.com" in netloc:
                parts = [p for p in path.split('/') if p]
                # Usually: /company_slug/j/jobid/ → take first non-'j' segment
                for part in parts:
                    if part != 'j' and len(part) > 2:
                        return part
                return parts[0] if parts else None
        elif platform == "workday":
            # https://company.wd5.myworkdayjobs.com/... → company = subdomain
            if "myworkdayjobs.com" in netloc or "wd" in netloc:
                # Extract the subdomain before the main domain
                subdomain = netloc.split('.')[0]
                if subdomain and subdomain not in ['www', 'jobs', 'apply', '']:
                    return subdomain
                # Fallback: look for the company in the path (rare)
                parts = [p for p in path.split('/') if p]
                if parts:
                    return parts[0]
                return None
        elif platform == "gem":
            # https://gem.com/company/... or https://www.gem.com/careers/company/...
            if "gem.com" in netloc:
                parts = [p for p in path.split('/') if p]
                # Often: /company-slug or /careers/company-slug
                for i, part in enumerate(parts):
                    if part in ['company', 'careers', 'jobs']:
                        if i + 1 < len(parts):
                            return parts[i + 1]
                return parts[0] if parts else None
        elif platform == "rippling":
            # Rippling uses a generic domain; hard to extract the company.
            # Best effort: use the full domain + first path segment.
            if "rippling.com" in netloc:
                parts = [p for p in path.split('/') if p]
                if parts:
                    return f"{netloc}/{parts[0]}"
                return netloc
        # Fallback: return the full URL if unrecognized
        return url
    except Exception:
        return url
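
# Illustrative results for a few made-up URLs (hypothetical slugs, shown only
# to document the normalizer's behavior):
#   normalize_url("lever", "https://jobs.lever.co/acme/123")          -> "acme"
#   normalize_url("greenhouse", "https://boards.greenhouse.io/acme")  -> "acme"
#   normalize_url("workday", "https://acme.wd5.myworkdayjobs.com/en") -> "acme"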

def read_company_signatures(filepath, platform):
    """Read and normalize company identifiers from a companies CSV."""
    if not os.path.exists(filepath):
        return set()
    signatures = set()
    with open(filepath, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            # row.get('url') can be None for short rows, so guard before strip()
            url = (row.get('url') or '').strip()
            if url:
                sig = normalize_url(platform, url)
                if sig:
                    signatures.add(sig)
    return signatures
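
# Signatures are kept in a set so the filter step below can do an O(1)
# membership check per row instead of rescanning the companies list.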

def filter_csv_by_signatures(input_file, excluded_signatures, platform):
    """Keep only rows whose normalized URL is NOT in excluded_signatures."""
    if not os.path.exists(input_file):
        return [], None
    kept_rows = []
    with open(input_file, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        fieldnames = reader.fieldnames
        for row in reader:
            url = (row.get('url') or '').strip()
            if not url:
                kept_rows.append(row)  # keep if no URL (shouldn't happen)
                continue
            sig = normalize_url(platform, url)
            if sig not in excluded_signatures:
                kept_rows.append(row)
    return kept_rows, fieldnames

def write_csv(filepath, rows, fieldnames):
    """Write rows to CSV file."""
    with open(filepath, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
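
# Note: main() passes input_file back into write_csv, so each platform CSV is
# filtered in place; keep backups if the original scrape results matter.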

def main():
    for input_file, companies_file, platform in platforms:
        print(f"Processing {input_file} against {companies_file} using '{platform}' normalizer...")
        # Step 1: Load and normalize known company signatures
        known_signatures = read_company_signatures(companies_file, platform)
        print(f" → Loaded {len(known_signatures)} known company signatures from {companies_file}")
        # Step 2: Filter the input file using those signatures
        kept_rows, fieldnames = filter_csv_by_signatures(input_file, known_signatures, platform)
        # Step 3: Write the filtered rows back
        if fieldnames:
            write_csv(input_file, kept_rows, fieldnames)
            print(f" → Kept {len(kept_rows)} new job URLs in {input_file}")
        else:
            if os.path.exists(input_file):
                os.remove(input_file)
            print(f" → {input_file} was empty or invalid; removed.")
    print("\n✅ All platforms processed successfully.")


if __name__ == "__main__":
    main()
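
# The filenames above are relative, so presumably the script is meant to be run
# from the directory that holds the platform CSVs, e.g.:
#
#   cd Web_scraping_project && python comparator.py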