Compare commits
2 commits (38ef08c734 ... b0e90972b1)

| Author | SHA1 | Date |
|---|---|---|
| | b0e90972b1 | |
| | 06f8e8b086 | |
ashby.csv · 10 · Normal file
@@ -0,0 +1,10 @@
url,timestamp
https://jobs.ashbyhq.com/stellar/a8377cf4-280b-4eb3-ac44-a4c9020c2eaf?utm_source=cryptocurrencyjobs.co,2025-12-31T08:32:17.821505
https://jobs.ashbyhq.com/artemisanalytics/5f61b6c6-147c-4707-9003-a9632455b984?utm_source=cryptocurrencyjobs.co,2025-12-31T08:51:57.190172
https://jobs.ashbyhq.com/lightning/2d77b496-ab0d-4e54-bcf8-33260d1bab6b?utm_source=cryptocurrencyjobs.co,2025-12-31T09:07:09.491831
https://jobs.ashbyhq.com/Braiins/cee9cf74-6049-4dab-aae7-96bef0082689?utm_source=cryptocurrencyjobs.co,2025-12-31T09:35:28.137181
https://jobs.ashbyhq.com/blockstream/80ebab98-0039-48bf-86d9-9a2a7962b005?utm_source=cryptocurrencyjobs.co,2025-12-31T10:21:19.253356
https://jobs.ashbyhq.com/dynamic/fde8a9ff-9701-485f-a8d1-e717c170f215?utm_source=cryptocurrencyjobs.co,2025-12-31T10:25:55.141543
https://jobs.ashbyhq.com/ether.fi/6eb1e350-71ce-47f7-a363-3fa3c521dacb?utm_source=cryptocurrencyjobs.co,2025-12-31T10:44:35.913725
https://chainlinklabs.com/open-roles?ashby_jid=112a76d3-4dfd-4eea-828c-41465760b3ef&utm_source=ccj,2025-12-31T10:49:07.453900
https://jobs.ashbyhq.com/stellar/cdad9af1-9e64-4fd4-8e2c-f87389f1dd16?utm_source=cryptocurrencyjobs.co,2025-12-31T11:13:58.119967
ashbycompanies.csv · 1591 · Normal file
File diff suppressed because it is too large
comparator.py · 166 · Normal file
@@ -0,0 +1,166 @@
import csv
import os
from urllib.parse import urlparse

# Define platform mappings: (input_file, companies_file, platform_name)
platforms = [
    ("ashby.csv", "ashbycompanies.csv", "ashby"),
    ("gem.csv", "gemcompanies.csv", "gem"),
    ("greenhouse.csv", "greenhousecompanies.csv", "greenhouse"),
    ("lever.csv", "levercompanies.csv", "lever"),
    ("rippling.csv", "ripplingcompanies.csv", "rippling"),
    ("workable.csv", "workablecompanies.csv", "workable"),
    ("workday.csv", "workdaycompanies.csv", "workday"),
]


def normalize_url(platform, url):
    """Normalize URL to a company identifier based on platform."""
    if not url:
        return None
    try:
        parsed = urlparse(url.lower().strip())
        netloc = parsed.netloc
        path = parsed.path

        if platform == "ashby":
            # https://jobs.ashbyhq.com/company_slug/...
            if "ashbyhq.com" in netloc:
                parts = [p for p in path.split('/') if p]
                return parts[0] if parts else None

        elif platform == "greenhouse":
            # https://boards.greenhouse.io/company_slug/...
            if "greenhouse.io" in netloc:
                parts = [p for p in path.split('/') if p]
                if len(parts) >= 2 and parts[0] == "boards":
                    return parts[1]
                elif len(parts) >= 1:
                    return parts[0]
                return None

        elif platform == "lever":
            # https://jobs.lever.co/company_slug/...
            if "lever.co" in netloc:
                parts = [p for p in path.split('/') if p]
                return parts[0] if parts else None

        elif platform == "workable":
            # https://apply.workable.com/company_slug/...
            if "workable.com" in netloc:
                parts = [p for p in path.split('/') if p]
                # Usually: /company_slug/j/jobid/ → take first non-'j' segment
                for part in parts:
                    if part != 'j' and len(part) > 2:
                        return part
                return parts[0] if parts else None

        elif platform == "workday":
            # https://company.workday.com/... → company = subdomain
            if "myworkdayjobs.com" in netloc or "wd" in netloc:
                # Extract subdomain before main domain
                subdomain = netloc.split('.')[0]
                if subdomain and subdomain not in ['www', 'jobs', 'apply', '']:
                    return subdomain
                # Fallback: look for company in path (rare)
                parts = [p for p in path.split('/') if p]
                if parts:
                    return parts[0]
            return None

        elif platform == "gem":
            # https://gem.com/company/... or https://www.gem.com/careers/company/...
            if "gem.com" in netloc:
                parts = [p for p in path.split('/') if p]
                # Often: /company-slug or /careers/company-slug
                for i, part in enumerate(parts):
                    if part in ['company', 'careers', 'jobs']:
                        if i + 1 < len(parts):
                            return parts[i + 1]
                return parts[0] if parts else None

        elif platform == "rippling":
            # Rippling uses generic domain; hard to extract company
            # Best effort: use full domain + first path segment
            if "rippling.com" in netloc:
                parts = [p for p in path.split('/') if p]
                if parts:
                    return f"{netloc}/{parts[0]}"
                return netloc

        # Fallback: return full URL if unrecognized
        return url

    except Exception:
        return url


def read_company_signatures(filepath, platform):
    """Read and normalize company identifiers from companies CSV."""
    if not os.path.exists(filepath):
        return set()
    signatures = set()
    with open(filepath, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            url = row.get('url', '').strip()
            if url:
                sig = normalize_url(platform, url)
                if sig:
                    signatures.add(sig)
    return signatures


def filter_csv_by_signatures(input_file, excluded_signatures, platform):
    """Keep only rows whose normalized URL is NOT in excluded_signatures."""
    if not os.path.exists(input_file):
        return [], None
    kept_rows = []
    with open(input_file, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        fieldnames = reader.fieldnames
        for row in reader:
            url = row.get('url', '').strip()
            if not url:
                kept_rows.append(row)  # keep if no URL (shouldn't happen)
                continue
            sig = normalize_url(platform, url)
            if sig not in excluded_signatures:
                kept_rows.append(row)
    return kept_rows, fieldnames


def write_csv(filepath, rows, fieldnames):
    """Write rows to CSV file."""
    with open(filepath, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


def main():
    for input_file, companies_file, platform in platforms:
        print(f"Processing {input_file} against {companies_file} using '{platform}' normalizer...")

        # Step 1: Load and normalize known company signatures
        known_signatures = read_company_signatures(companies_file, platform)
        print(f" → Loaded {len(known_signatures)} known company signatures from {companies_file}")

        # Step 2: Filter input file using signatures
        kept_rows, fieldnames = filter_csv_by_signatures(input_file, known_signatures, platform)

        # Step 3: Write back filtered data
        if fieldnames:
            write_csv(input_file, kept_rows, fieldnames)
            print(f" → Kept {len(kept_rows)} new job URLs in {input_file}")
        else:
            if os.path.exists(input_file):
                os.remove(input_file)
            print(f" → {input_file} was empty or invalid — removed.")

    print("\n✅ All platforms processed successfully.")


if __name__ == "__main__":
    main()
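A quick smoke test of the new normalizer (not part of the commit; the expected slugs are read off the URL patterns in the CSVs above, and it assumes comparator.py is importable from the repo root):

# Hypothetical check against URLs taken from ashby.csv, greenhouse.csv, and lever.csv.
from comparator import normalize_url

assert normalize_url("ashby", "https://jobs.ashbyhq.com/stellar/a8377cf4-280b-4eb3-ac44-a4c9020c2eaf") == "stellar"
assert normalize_url("greenhouse", "https://job-boards.eu.greenhouse.io/bcbgroup/jobs/4681083101") == "bcbgroup"
assert normalize_url("lever", "https://jobs.lever.co/waterfall/880fb1b4-2515-4534-9970-53c497c82f12") == "waterfall"
# Hosts the platform branch does not recognize fall through to the full original URL:
url = "https://chainlinklabs.com/open-roles?ashby_jid=112a76d3-4dfd-4eea-828c-41465760b3ef"
assert normalize_url("ashby", url) == url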
fetcher.py
@@ -27,7 +27,7 @@ class StealthyFetcher:
         if wait_for_selector:
             try:
-                await page.wait_for_selector(wait_for_selector, timeout=40000)
+                await page.wait_for_selector(wait_for_selector, timeout=120000)
             except PlaywrightTimeoutError:
                 print(f"Selector {wait_for_selector} not found immediately, continuing...")

@@ -88,7 +88,7 @@ class StealthyFetcher:
     async def _is_content_accessible(self, page: Page, wait_for_selector: Optional[str] = None) -> bool:
         if wait_for_selector:
             try:
-                await page.wait_for_selector(wait_for_selector, timeout=40000)
+                await page.wait_for_selector(wait_for_selector, timeout=120000)
                 return True
             except PlaywrightTimeoutError:
                 pass
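Both hunks make the same change: the selector wait goes from 40 s to 120 s while PlaywrightTimeoutError is still caught, so slow ATS pages get more time without aborting the run. The shared pattern as a standalone sketch (hypothetical helper name, not part of the diff):

from playwright.async_api import Page, TimeoutError as PlaywrightTimeoutError

async def wait_for_selector_safe(page: Page, selector: str, timeout_ms: int = 120000) -> bool:
    # Wait up to timeout_ms for the selector; treat a timeout as a soft failure.
    try:
        await page.wait_for_selector(selector, timeout=timeout_ms)
        return True
    except PlaywrightTimeoutError:
        return False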
gemcompanies.csv · 508 · Normal file
@@ -0,0 +1,508 @@
name,url
10Xconstruction Ai,https://jobs.gem.com/10xconstruction-ai
11X Ai,https://jobs.gem.com/11x-ai
43North,https://jobs.gem.com/43north
8020 Consulting,https://jobs.gem.com/8020-consulting
A16Z Speedrun,https://jobs.gem.com/a16z-speedrun
Aarden Ai,https://jobs.gem.com/aarden-ai
Accel,https://jobs.gem.com/accel
Accelos,https://jobs.gem.com/accelos
Acre,https://jobs.gem.com/acre
Advancelevelllc Com,https://jobs.gem.com/advancelevelllc-com
Agenta Ai,https://jobs.gem.com/agenta-ai
Agentnoon,https://jobs.gem.com/agentnoon
Agora,https://jobs.gem.com/agora
Aionex Xyz,https://jobs.gem.com/aionex-xyz
Aiphrodite Ai,https://jobs.gem.com/aiphrodite-ai
Airframe,https://jobs.gem.com/airframe
Airvet Com,https://jobs.gem.com/airvet-com
Alex And Ani,https://jobs.gem.com/alex-and-ani
Alinia Ai,https://jobs.gem.com/alinia-ai
Alitheon,https://jobs.gem.com/alitheon
Alpharun,https://jobs.gem.com/alpharun
Altzero Xyz,https://jobs.gem.com/altzero-xyz
Amya Agency,https://jobs.gem.com/amya-agency
Andrenam,https://jobs.gem.com/andrenam
Anysphere,https://jobs.gem.com/anysphere
Aoniclife,https://jobs.gem.com/aoniclife
Apartment List,https://jobs.gem.com/apartment-list
Apella,https://jobs.gem.com/apella
Apticore Io,https://jobs.gem.com/apticore-io
Arlo,https://jobs.gem.com/arlo
Ascenda Loyalty,https://jobs.gem.com/ascenda-loyalty
Ascendarc,https://jobs.gem.com/ascendarc
Astroforge Io,https://jobs.gem.com/astroforge-io
Atla Ai Com,https://jobs.gem.com/atla-ai-com
Atomica,https://jobs.gem.com/atomica
Audicus,https://jobs.gem.com/audicus
Aurelian Io,https://jobs.gem.com/aurelian-io
Aureliussystems Us,https://jobs.gem.com/aureliussystems-us
Autopilotbrand Com,https://jobs.gem.com/autopilotbrand-com
Avoca,https://jobs.gem.com/avoca
Avol,https://jobs.gem.com/avol
Axonify,https://jobs.gem.com/axonify
Backops Ai,https://jobs.gem.com/backops-ai
Basalt Health,https://jobs.gem.com/basalt-health
Baxter Aerospace,https://jobs.gem.com/baxter-aerospace
Bead Ai,https://jobs.gem.com/bead-ai
Benbase,https://jobs.gem.com/benbase
Better Auth,https://jobs.gem.com/better-auth
Betterbasket Ai,https://jobs.gem.com/betterbasket-ai
Bigeye,https://jobs.gem.com/bigeye
Bigpanda,https://jobs.gem.com/bigpanda
Bikky,https://jobs.gem.com/bikky
Bilt,https://jobs.gem.com/bilt
Binarly,https://jobs.gem.com/binarly
Biofire,https://jobs.gem.com/biofire
Biorender,https://jobs.gem.com/biorender
Biorender Inc Ats,https://jobs.gem.com/biorender-inc--ats
Birdwood Therapeutics,https://jobs.gem.com/birdwood-therapeutics
Black Ore,https://jobs.gem.com/black-ore
Blaze Ai,https://jobs.gem.com/blaze-ai
Blazetalent,https://jobs.gem.com/blazetalent
Blend Inc,https://jobs.gem.com/blend-inc
Blue J,https://jobs.gem.com/blue-j
Bluejeanfinancial Com,https://jobs.gem.com/bluejeanfinancial-com
Blueonion Ai,https://jobs.gem.com/blueonion-ai
Blueprint,https://jobs.gem.com/blueprint
Bluesky,https://jobs.gem.com/bluesky
Blume Technologies,https://jobs.gem.com/blume-technologies
Bohler ,https://jobs.gem.com/bohler-
Bohler Engineering Gemats,https://jobs.gem.com/bohler-engineering-gemats
Bolna,https://jobs.gem.com/bolna
Bond Partners,https://jobs.gem.com/bond-partners
Boost Robotics,https://jobs.gem.com/boost-robotics
Boredm,https://jobs.gem.com/boredm
Breadcrumb Ai,https://jobs.gem.com/breadcrumb-ai
Breakline Ats,https://jobs.gem.com/breakline-ats
Breakline Education,https://jobs.gem.com/breakline-education
Brewbird,https://jobs.gem.com/brewbird
Buildtrayd Com,https://jobs.gem.com/buildtrayd-com
Bull Moose Xyz,https://jobs.gem.com/bull-moose-xyz
Cadstrom Io,https://jobs.gem.com/cadstrom-io
Caffelabs Com,https://jobs.gem.com/caffelabs-com
Calaveras,https://jobs.gem.com/calaveras
Canals,https://jobs.gem.com/canals
Caplight Com,https://jobs.gem.com/caplight-com
Carbon,https://jobs.gem.com/carbon
Cardnexus,https://jobs.gem.com/cardnexus
Careers,https://jobs.gem.com/careers
Carry,https://jobs.gem.com/carry
Caseflood Ai,https://jobs.gem.com/caseflood-ai
Cellbyte,https://jobs.gem.com/cellbyte
Chartahealth,https://jobs.gem.com/chartahealth
Civrobotics Com,https://jobs.gem.com/civrobotics-com
Clarity,https://jobs.gem.com/clarity
Clearchecks Com Ats,https://jobs.gem.com/clearchecks-com-ats
Clearesthealth Com,https://jobs.gem.com/clearesthealth-com
Cloudanix Com,https://jobs.gem.com/cloudanix-com
Cloudraft,https://jobs.gem.com/cloudraft
Codegen,https://jobs.gem.com/codegen
Codesignal,https://jobs.gem.com/codesignal
Cognna,https://jobs.gem.com/cognna
Cogram,https://jobs.gem.com/cogram
Comfy,https://jobs.gem.com/comfy
Conductorai,https://jobs.gem.com/conductorai
Confida Ai,https://jobs.gem.com/confida-ai
Context Wtf,https://jobs.gem.com/context-wtf
Contour App,https://jobs.gem.com/contour-app
Converge,https://jobs.gem.com/converge
Coupa Software Inc Ats 1,https://jobs.gem.com/coupa-software-inc-ats-1
Cox Exponential,https://jobs.gem.com/cox-exponential
Crabi Robotics Com,https://jobs.gem.com/crabi-robotics-com
Crackenagi,https://jobs.gem.com/crackenagi
Create Talent Group,https://jobs.gem.com/create-talent-group
Createdbyhumans Ai,https://jobs.gem.com/createdbyhumans-ai
Credit Key,https://jobs.gem.com/credit-key
Crosby,https://jobs.gem.com/crosby
Curex Org,https://jobs.gem.com/curex-org
Curiouscardinals Com,https://jobs.gem.com/curiouscardinals-com
Cyvl,https://jobs.gem.com/cyvl
D4M International,https://jobs.gem.com/d4m-international
Dalus,https://jobs.gem.com/dalus
Dash Fi,https://jobs.gem.com/dash-fi
Data Masters,https://jobs.gem.com/data-masters
Datacurve Ai,https://jobs.gem.com/datacurve-ai
Dataday Technology Solutions,https://jobs.gem.com/dataday-technology-solutions
Datagrid,https://jobs.gem.com/datagrid
Dawn Media,https://jobs.gem.com/dawn-media
Daxko,https://jobs.gem.com/daxko
Deep Infra,https://jobs.gem.com/deep-infra
Deliver,https://jobs.gem.com/deliver
Detections Ai,https://jobs.gem.com/detections-ai
Dianahr Ai,https://jobs.gem.com/dianahr-ai
Distributed Spectrum,https://jobs.gem.com/distributed-spectrum
Dlvrlog,https://jobs.gem.com/dlvrlog
Doowii,https://jobs.gem.com/doowii
Dragme,https://jobs.gem.com/dragme
Dragonfly Careers,https://jobs.gem.com/dragonfly-careers
Dropback,https://jobs.gem.com/dropback
Durin,https://jobs.gem.com/durin
Dydx,https://jobs.gem.com/dydx
Eats2Seats,https://jobs.gem.com/eats2seats
Echelon,https://jobs.gem.com/echelon
Ecocart Io,https://jobs.gem.com/ecocart-io
Edgetrace Ai,https://jobs.gem.com/edgetrace-ai
Efference Ai,https://jobs.gem.com/efference-ai
Elite Talent Consulting,https://jobs.gem.com/elite-talent-consulting
Eliza,https://jobs.gem.com/eliza
Elloe Ai,https://jobs.gem.com/elloe-ai
Elo Ai,https://jobs.gem.com/elo-ai
Emerge Career,https://jobs.gem.com/emerge-career
Engineering Codified,https://jobs.gem.com/engineering--codified
Entrusted Contracting,https://jobs.gem.com/entrusted-contracting
Escargot Com,https://jobs.gem.com/escargot-com
Everfit Io,https://jobs.gem.com/everfit-io
Excelity Careers,https://jobs.gem.com/excelity-careers
Exponent,https://jobs.gem.com/exponent
Ezraailabs Tech,https://jobs.gem.com/ezraailabs-tech
Fabric,https://jobs.gem.com/fabric
Fabrichealth,https://jobs.gem.com/fabrichealth
Fancypeople,https://jobs.gem.com/fancypeople
Fanpierlabs Com,https://jobs.gem.com/fanpierlabs-com
Faraday,https://jobs.gem.com/faraday
Fathom Org,https://jobs.gem.com/fathom-org
Felix,https://jobs.gem.com/felix
Ferry Health,https://jobs.gem.com/ferry-health
Fetch Ats,https://jobs.gem.com/fetch-ats
Fifthdoor Com,https://jobs.gem.com/fifthdoor-com
Fireflies,https://jobs.gem.com/fireflies
Firestorm,https://jobs.gem.com/firestorm
Flatfee Corp,https://jobs.gem.com/flatfee-corp
Flint,https://jobs.gem.com/flint
Floot,https://jobs.gem.com/floot
Forgent Ai,https://jobs.gem.com/forgent-ai
Fountainplatform Com,https://jobs.gem.com/fountainplatform-com
Foxbox Digital,https://jobs.gem.com/foxbox-digital
Freestone Grove Partners,https://jobs.gem.com/freestone-grove-partners
Freshbooks,https://jobs.gem.com/freshbooks
Fridayharbor Ai,https://jobs.gem.com/fridayharbor-ai
Fuelfinance,https://jobs.gem.com/fuelfinance
Fulcrumcareers,https://jobs.gem.com/fulcrumcareers
Function Health,https://jobs.gem.com/function-health
Galadyne,https://jobs.gem.com/galadyne
Galaxyventures,https://jobs.gem.com/galaxyventures
Gc Ai,https://jobs.gem.com/gc-ai
Gem,https://jobs.gem.com/gem
Gem Mckesson,https://jobs.gem.com/gem-mckesson
Gem Test Board,https://jobs.gem.com/gem-test-board
Generation Alpha Transistor,https://jobs.gem.com/generation-alpha-transistor
Genspark,https://jobs.gem.com/genspark
Gerra,https://jobs.gem.com/gerra
Getaero Io,https://jobs.gem.com/getaero-io
Getbirdeye Com Au,https://jobs.gem.com/getbirdeye-com-au
Getro,https://jobs.gem.com/getro
Gigaml,https://jobs.gem.com/gigaml
Go Cadre,https://jobs.gem.com/go-cadre
Goatrecruit Com,https://jobs.gem.com/goatrecruit-com
Good Life Companies,https://jobs.gem.com/good-life-companies
Goodbill,https://jobs.gem.com/goodbill
Grailpay Com,https://jobs.gem.com/grailpay-com
Granger Construction,https://jobs.gem.com/granger-construction
Gratia Health,https://jobs.gem.com/gratia-health
Greenlite Ai,https://jobs.gem.com/greenlite-ai
Greenvalleyjobs,https://jobs.gem.com/greenvalleyjobs
Grit,https://jobs.gem.com/grit
Groq,https://jobs.gem.com/groq
Growthbook,https://jobs.gem.com/growthbook
Guardrail Ai,https://jobs.gem.com/guardrail-ai
Guidesage Ai,https://jobs.gem.com/guidesage-ai
Hallow,https://jobs.gem.com/hallow
Happydance Partnership Integration,https://jobs.gem.com/happydance-partnership-integration
Harmonic,https://jobs.gem.com/harmonic
Hash,https://jobs.gem.com/hash
Hayla,https://jobs.gem.com/hayla
Heavy Construction Systems Specialists Llc,https://jobs.gem.com/heavy-construction-systems-specialists-llc
Helix,https://jobs.gem.com/helix
Hellotrade,https://jobs.gem.com/hellotrade
Helm Health,https://jobs.gem.com/helm-health
Hilabs Ie,https://jobs.gem.com/hilabs-ie
Hipeople,https://jobs.gem.com/hipeople
Holacasa Yc W23,https://jobs.gem.com/holacasa-yc-w23
Homeboost,https://jobs.gem.com/homeboost
Hospitable,https://jobs.gem.com/hospitable
Howrecruit Io,https://jobs.gem.com/howrecruit-io
Hubspot,https://jobs.gem.com/hubspot
Hypernatural Ai,https://jobs.gem.com/hypernatural-ai
Inception,https://jobs.gem.com/inception
Index Exchange,https://jobs.gem.com/index-exchange
Infrastructure Modernization Solutions,https://jobs.gem.com/infrastructure-modernization-solutions
Inspiration Commerce Group,https://jobs.gem.com/inspiration-commerce-group
Inspiresemi Com,https://jobs.gem.com/inspiresemi-com
Instrumental Inc ,https://jobs.gem.com/instrumental-inc-
Integral Xyz,https://jobs.gem.com/integral-xyz
Integrationscaptain,https://jobs.gem.com/integrationscaptain
Intelligentresourcing Co,https://jobs.gem.com/intelligentresourcing-co
Interfere Old,https://jobs.gem.com/interfere-old
Invoicebutler Ai,https://jobs.gem.com/invoicebutler-ai
Iris,https://jobs.gem.com/iris
Ironsite Ai,https://jobs.gem.com/ironsite-ai
Itsvaleria Co,https://jobs.gem.com/itsvaleria-co
Jaguaracareers,https://jobs.gem.com/jaguaracareers
Janie,https://jobs.gem.com/janie
Jayla Careers,https://jobs.gem.com/jayla-careers
Jobma,https://jobs.gem.com/jobma
Joinanvil Com,https://jobs.gem.com/joinanvil-com
Joinformal,https://jobs.gem.com/joinformal
Joyful Health,https://jobs.gem.com/joyful-health
Kaikaku,https://jobs.gem.com/kaikaku
Kaironhealth,https://jobs.gem.com/kaironhealth
Kaironhealth Com,https://jobs.gem.com/kaironhealth-com
Kanu Ai,https://jobs.gem.com/kanu-ai
Kcs Hiring,https://jobs.gem.com/kcs-hiring
Keru Ai,https://jobs.gem.com/keru-ai
Key To Web3,https://jobs.gem.com/key-to-web3
Knight Electric Inc ,https://jobs.gem.com/knight-electric-inc-
Kollectiv Ai,https://jobs.gem.com/kollectiv-ai
Kumo Ai,https://jobs.gem.com/kumo-ai
Lantern,https://jobs.gem.com/lantern
Lavapayments Com,https://jobs.gem.com/lavapayments-com
Leap Tools,https://jobs.gem.com/leap-tools
Letsdata,https://jobs.gem.com/letsdata
Letter Ai,https://jobs.gem.com/letter-ai
Level,https://jobs.gem.com/level
Linktree,https://jobs.gem.com/linktree
Little Otter,https://jobs.gem.com/little-otter
Lower Llc,https://jobs.gem.com/lower-llc
Lumalabs Ai,https://jobs.gem.com/lumalabs-ai
Lunajoy,https://jobs.gem.com/lunajoy
Lunch,https://jobs.gem.com/lunch
Lunos Ai,https://jobs.gem.com/lunos-ai
Magnetic,https://jobs.gem.com/magnetic
Manifest,https://jobs.gem.com/manifest
Manifested Com,https://jobs.gem.com/manifested-com
Marble Health,https://jobs.gem.com/marble-health
Mavi,https://jobs.gem.com/mavi
Meetdex Ai,https://jobs.gem.com/meetdex-ai
Megapot,https://jobs.gem.com/megapot
Meineautosdirekt,https://jobs.gem.com/meineautosdirekt
Menten Ai,https://jobs.gem.com/menten-ai
Merge Sandbox,https://jobs.gem.com/merge-sandbox
Metal Ai,https://jobs.gem.com/metal-ai
Microsoft Demo Gem Com,https://jobs.gem.com/microsoft-demo-gem-com
Mimicrobotics Com,https://jobs.gem.com/mimicrobotics-com
Mission,https://jobs.gem.com/mission
Moosehead Talent,https://jobs.gem.com/moosehead-talent
Motion,https://jobs.gem.com/motion
Moxa,https://jobs.gem.com/moxa
Multiplierhq,https://jobs.gem.com/multiplierhq
Multiscale Ai,https://jobs.gem.com/multiscale-ai
Myprize,https://jobs.gem.com/myprize
Myriad Technology,https://jobs.gem.com/myriad-technology
Myrrsgroup,https://jobs.gem.com/myrrsgroup
Nabla Bio,https://jobs.gem.com/nabla-bio
Nacelle,https://jobs.gem.com/nacelle
Nativemsg,https://jobs.gem.com/nativemsg
Nclusion,https://jobs.gem.com/nclusion
Nerve,https://jobs.gem.com/nerve
Newcrew,https://jobs.gem.com/newcrew
Ngram,https://jobs.gem.com/ngram
Nimble,https://jobs.gem.com/nimble
Niva,https://jobs.gem.com/niva
Nominal,https://jobs.gem.com/nominal
Northone,https://jobs.gem.com/northone
Ntop,https://jobs.gem.com/ntop
Nue Ai,https://jobs.gem.com/nue-ai
Nutrislice,https://jobs.gem.com/nutrislice
Nuvo,https://jobs.gem.com/nuvo
Obin Ai,https://jobs.gem.com/obin-ai
Obsidian Systems,https://jobs.gem.com/obsidian-systems
Odo Do,https://jobs.gem.com/odo-do
Omegahhagency Com,https://jobs.gem.com/omegahhagency-com
Ondo Finance,https://jobs.gem.com/ondo-finance
Onesignal,https://jobs.gem.com/onesignal
Onesignal Ats,https://jobs.gem.com/onesignal-ats
Onezyme,https://jobs.gem.com/onezyme
Onfrontiers,https://jobs.gem.com/onfrontiers
Openphone,https://jobs.gem.com/openphone
Openreqstaffing,https://jobs.gem.com/openreqstaffing
Opine,https://jobs.gem.com/opine
Ora So,https://jobs.gem.com/ora-so
Overlay,https://jobs.gem.com/overlay
Overwatch,https://jobs.gem.com/overwatch
Paces,https://jobs.gem.com/paces
Pae,https://jobs.gem.com/pae
Pagebound,https://jobs.gem.com/pagebound
Pally,https://jobs.gem.com/pally
Paramark,https://jobs.gem.com/paramark
Partao,https://jobs.gem.com/partao
Partnerhq,https://jobs.gem.com/partnerhq
Patlytics,https://jobs.gem.com/patlytics
Pave,https://jobs.gem.com/pave
Perceptyx,https://jobs.gem.com/perceptyx
Photalabs Com,https://jobs.gem.com/photalabs-com
Photon,https://jobs.gem.com/photon
Pinnacleconnect Llc,https://jobs.gem.com/pinnacleconnect-llc
Piqenergy Com,https://jobs.gem.com/piqenergy-com
Planet Fans,https://jobs.gem.com/planet-fans
Planned,https://jobs.gem.com/planned
Plixai,https://jobs.gem.com/plixai
Pogo Recruiting,https://jobs.gem.com/pogo-recruiting
Polar,https://jobs.gem.com/polar
Polywork,https://jobs.gem.com/polywork
Pomerium,https://jobs.gem.com/pomerium
Portal Ai,https://jobs.gem.com/portal-ai
Poseidonaero,https://jobs.gem.com/poseidonaero
Prahsys Com,https://jobs.gem.com/prahsys-com
Praxisiq Ai,https://jobs.gem.com/praxisiq-ai
Precision Ai,https://jobs.gem.com/precision-ai
Prodia,https://jobs.gem.com/prodia
Productboard,https://jobs.gem.com/productboard
Productboard Ats,https://jobs.gem.com/productboard-ats
Prohost Ai,https://jobs.gem.com/prohost-ai
Project Method,https://jobs.gem.com/project-method
Promptql,https://jobs.gem.com/promptql
Propel,https://jobs.gem.com/propel
Prospermedical Com,https://jobs.gem.com/prospermedical-com
Protegeai,https://jobs.gem.com/protegeai
Questdb Com,https://jobs.gem.com/questdb-com
Quitwithjones,https://jobs.gem.com/quitwithjones
Quo,https://jobs.gem.com/quo
Rain Aero,https://jobs.gem.com/rain-aero
Raincode Bahrain W L L,https://jobs.gem.com/raincode-bahrain-w-l-l
Raylu Ai,https://jobs.gem.com/raylu-ai
Rctsglobal Com,https://jobs.gem.com/rctsglobal-com
Rditrials,https://jobs.gem.com/rditrials
Rebuild Work,https://jobs.gem.com/rebuild-work
Redcar,https://jobs.gem.com/redcar
Redenvelope Co,https://jobs.gem.com/redenvelope-co
Redo,https://jobs.gem.com/redo
Rektech,https://jobs.gem.com/rektech
Renew,https://jobs.gem.com/renew
Resprop,https://jobs.gem.com/resprop
Retool,https://jobs.gem.com/retool
Revolutionparts,https://jobs.gem.com/revolutionparts
Rex,https://jobs.gem.com/rex
Rf Renovo Management Company Llc,https://jobs.gem.com/rf-renovo-management-company-llc
Riley,https://jobs.gem.com/riley
Rinsed,https://jobs.gem.com/rinsed
Risely Ai,https://jobs.gem.com/risely-ai
Rivia,https://jobs.gem.com/rivia
Roadio Ai,https://jobs.gem.com/roadio-ai
Roamless,https://jobs.gem.com/roamless
Roe Ai,https://jobs.gem.com/roe-ai
Rossibuilders Com,https://jobs.gem.com/rossibuilders-com
Roundhouse Media,https://jobs.gem.com/roundhouse-media
Rove,https://jobs.gem.com/rove
Runsybil,https://jobs.gem.com/runsybil
Sadnaconsulting Com,https://jobs.gem.com/sadnaconsulting-com
Sailorhealth Com,https://jobs.gem.com/sailorhealth-com
Sales Marker,https://jobs.gem.com/sales-marker
Salesqueze Com,https://jobs.gem.com/salesqueze-com
Sandbar Inc,https://jobs.gem.com/sandbar-inc
Sandboxschonfeld Com,https://jobs.gem.com/sandboxschonfeld-com
Sauron Systems,https://jobs.gem.com/sauron-systems
Scope Labs,https://jobs.gem.com/scope-labs
Scowtt Com,https://jobs.gem.com/scowtt-com
Seated,https://jobs.gem.com/seated
Seed2Series Com,https://jobs.gem.com/seed2series-com
Seniorverse,https://jobs.gem.com/seniorverse
Sennder Gmbh,https://jobs.gem.com/sennder-gmbh
Senndertechnologies Gmbh,https://jobs.gem.com/senndertechnologies-gmbh
Sensorum Health,https://jobs.gem.com/sensorum-health
Serv Ai,https://jobs.gem.com/serv-ai
Seven Starling,https://jobs.gem.com/seven-starling
Shef Com,https://jobs.gem.com/shef-com
Shorebird Dev,https://jobs.gem.com/shorebird-dev
Showtime,https://jobs.gem.com/showtime
Signoz,https://jobs.gem.com/signoz
Silkline,https://jobs.gem.com/silkline
Skypilot Co,https://jobs.gem.com/skypilot-co
Slash,https://jobs.gem.com/slash
Sleep Center,https://jobs.gem.com/sleep-center
Smacktechnologies Com,https://jobs.gem.com/smacktechnologies-com
Snout,https://jobs.gem.com/snout
Softup Technologies,https://jobs.gem.com/softup-technologies
Sohar Health,https://jobs.gem.com/sohar-health
Soundhound,https://jobs.gem.com/soundhound
Spawn,https://jobs.gem.com/spawn
Spellbrush,https://jobs.gem.com/spellbrush
Sphere Semi,https://jobs.gem.com/sphere-semi
Ssg,https://jobs.gem.com/ssg
Stack Auth Com,https://jobs.gem.com/stack-auth-com
Startup People Solutions,https://jobs.gem.com/startup-people-solutions
Stealth Startup,https://jobs.gem.com/stealth-startup
Stockapp Com,https://jobs.gem.com/stockapp-com
Stryke,https://jobs.gem.com/stryke
Sunsethq Com,https://jobs.gem.com/sunsethq-com
Super Hi Fi,https://jobs.gem.com/super-hi-fi
Superblocks,https://jobs.gem.com/superblocks
Supersonik Ai,https://jobs.gem.com/supersonik-ai
Supio,https://jobs.gem.com/supio
Suppliercanada Com,https://jobs.gem.com/suppliercanada-com
Switchgrowth Com,https://jobs.gem.com/switchgrowth-com
Symbolica,https://jobs.gem.com/symbolica
Syndesus,https://jobs.gem.com/syndesus
System Two Security,https://jobs.gem.com/system-two-security
Taxgpt Inc ,https://jobs.gem.com/taxgpt-inc-
Taxo Ai,https://jobs.gem.com/taxo-ai
Tektome Com,https://jobs.gem.com/tektome-com
Telora,https://jobs.gem.com/telora
Tensorstax Com,https://jobs.gem.com/tensorstax-com
Tenx Recruiting,https://jobs.gem.com/tenx-recruiting
Terraai Earth,https://jobs.gem.com/terraai-earth
Test Board,https://jobs.gem.com/test-board
The Boring Company,https://jobs.gem.com/the-boring-company
The Brewer Garrett Company,https://jobs.gem.com/the-brewer-garrett-company
The Talent Project Com,https://jobs.gem.com/the-talent-project-com
Theburntapp Com,https://jobs.gem.com/theburntapp-com
Theinterface,https://jobs.gem.com/theinterface
Thejobbridge,https://jobs.gem.com/thejobbridge
Thelma,https://jobs.gem.com/thelma
Theluckyfoundation,https://jobs.gem.com/theluckyfoundation
Thenewclub Fyi,https://jobs.gem.com/thenewclub-fyi
Theseus Us,https://jobs.gem.com/theseus-us
Thinkific,https://jobs.gem.com/thinkific
Third Dimension,https://jobs.gem.com/third-dimension
Thrivory,https://jobs.gem.com/thrivory
Thunder,https://jobs.gem.com/thunder
Thunder Compute,https://jobs.gem.com/thunder-compute
Timetoperform,https://jobs.gem.com/timetoperform
Token Transit,https://jobs.gem.com/token-transit
Toolhouse Ai,https://jobs.gem.com/toolhouse-ai
Torchsystems Com,https://jobs.gem.com/torchsystems-com
Transluce,https://jobs.gem.com/transluce
Trashlab,https://jobs.gem.com/trashlab
Tricentis,https://jobs.gem.com/tricentis
Trilliumhiring Com,https://jobs.gem.com/trilliumhiring-com
Tripworks Com,https://jobs.gem.com/tripworks-com
Tristero,https://jobs.gem.com/tristero
Trojan Trading,https://jobs.gem.com/trojan-trading
Tropic,https://jobs.gem.com/tropic
Trybree Com,https://jobs.gem.com/trybree-com
Tryhelium Com,https://jobs.gem.com/tryhelium-com
Tungsten Dev,https://jobs.gem.com/tungsten-dev
Turbohome,https://jobs.gem.com/turbohome
Twentyfour7 Dev,https://jobs.gem.com/twentyfour7-dev
Unify Ai,https://jobs.gem.com/unify-ai
Untolabs Com,https://jobs.gem.com/untolabs-com
Up Labs,https://jobs.gem.com/up-labs
Useful,https://jobs.gem.com/useful
Usemalleable Com,https://jobs.gem.com/usemalleable-com
Vamo Xyz,https://jobs.gem.com/vamo-xyz
Vanguard Cleaning Systems,https://jobs.gem.com/vanguard-cleaning-systems
Vantaca,https://jobs.gem.com/vantaca
Vantager,https://jobs.gem.com/vantager
Vantara Ai,https://jobs.gem.com/vantara-ai
Vectorworks,https://jobs.gem.com/vectorworks
Vectrasim,https://jobs.gem.com/vectrasim
Veho Technologies,https://jobs.gem.com/veho-technologies
Ventionteams Com,https://jobs.gem.com/ventionteams-com
Venture Guides,https://jobs.gem.com/venture-guides
Vercel Ats Sandbox,https://jobs.gem.com/vercel-ats-sandbox
Vesseltalent Com,https://jobs.gem.com/vesseltalent-com
Voker Ai,https://jobs.gem.com/voker-ai
Voltai Com,https://jobs.gem.com/voltai-com
Wayback Labs,https://jobs.gem.com/wayback-labs
Webflow Ats Sandbox,https://jobs.gem.com/webflow-ats-sandbox
Western Governors University,https://jobs.gem.com/western-governors-university
Whatconverts,https://jobs.gem.com/whatconverts
Wiseroad Recruiting Inc,https://jobs.gem.com/wiseroad-recruiting-inc
Wizecamel,https://jobs.gem.com/wizecamel
Wolfjaw Careers,https://jobs.gem.com/wolfjaw-careers
Wonolo,https://jobs.gem.com/wonolo
Woodsideai,https://jobs.gem.com/woodsideai
Youtrip,https://jobs.gem.com/youtrip
Zefi Ai,https://jobs.gem.com/zefi-ai
Zep,https://jobs.gem.com/zep
Zorrorx,https://jobs.gem.com/zorrorx
greenhouse.csv · 6 · Normal file
@@ -0,0 +1,6 @@
url,timestamp
https://job-boards.eu.greenhouse.io/bcbgroup/jobs/4681083101?gh_src=cryptocurrencyjobs.co,2025-12-31T08:35:23.424931
https://job-boards.greenhouse.io/securitize/jobs/4074121009?gh_src=cryptocurrencyjobs.co,2025-12-31T09:19:17.349713
https://job-boards.eu.greenhouse.io/bcbgroup/jobs/4681102101?gh_src=cryptocurrencyjobs.co,2025-12-31T09:58:36.919216
https://job-boards.greenhouse.io/kiosk/jobs/4427184005?gh_src=cryptocurrencyjobs.co,2025-12-31T10:10:51.176114
https://job-boards.eu.greenhouse.io/bcbgroup/jobs/4681083101?gh_src=cryptocurrencyjobs.co,2025-12-31T11:02:31.869728
greenhousecompanies.csv · 2544 · Normal file
File diff suppressed because it is too large
lever.csv · 7 · Normal file
@@ -0,0 +1,7 @@
url,timestamp
https://jobs.eu.lever.co/kaiko/3f7f3db9-4a6a-4047-8760-bc52c3d03e05?lever-origin=applied&lever-source%5B%5D=cryptocurrencyjobs.co,2025-12-31T09:20:28.542417
https://jobs.lever.co/waterfall/880fb1b4-2515-4534-9970-53c497c82f12?lever-origin=applied&lever-source%5B%5D=cryptocurrencyjobs.co,2025-12-31T10:08:17.316072
https://jobs.lever.co/obol-tech/fcccd493-54e4-425a-b9bd-82fa6f7e6aff?lever-origin=applied&lever-source%5B%5D=cryptocurrencyjobs.co,2025-12-31T10:35:29.164452
https://jobs.eu.lever.co/coinspaid/7605e154-4b1d-45ee-b1d4-35edea13d80b?lever-origin=applied&lever-source%5B%5D=cryptocurrencyjobs.co,2025-12-31T10:51:38.852693
https://jobs.lever.co/vedatechlabs/9c59c96c-2bb0-47b0-88fe-5d5a9fd85997?lever-origin=applied&lever-source%5B%5D=cryptocurrencyjobs.co,2025-12-31T11:02:16.120852
https://jobs.eu.lever.co/kaiko/3f7f3db9-4a6a-4047-8760-bc52c3d03e05?lever-origin=applied&lever-source%5B%5D=cryptocurrencyjobs.co,2025-12-31T11:16:43.218273
levercompanies.csv · 1792 · Normal file
File diff suppressed because it is too large
linkedin.csv · 8 · Normal file
@@ -0,0 +1,8 @@
url,timestamp
https://www.linkedin.com/jobs/view/operations-analyst-at-amber-group-4325538653/?ref=cryptocurrencyjobs.co,2025-12-31T09:20:11.544002
https://www.linkedin.com/jobs/view/hr-operations-intern-sg-at-matrixport-official-4338171692/?ref=cryptocurrencyjobs.co,2025-12-31T09:25:10.499933
https://www.linkedin.com/jobs/view/operations-analyst-at-matrixport-official-4235087267/?ref=cryptocurrencyjobs.co,2025-12-31T09:33:53.104120
https://www.linkedin.com/jobs/view/business-operations-analyst-at-matrixport-official-4215538150/?ref=cryptocurrencyjobs.co,2025-12-31T09:34:24.186519
https://www.linkedin.com/jobs/view/graduate-hiring-business-operations-analyst-wealth-management-at-matrixport-official-4131687672/?ref=cryptocurrencyjobs.co,2025-12-31T09:36:47.038648
https://www.linkedin.com/jobs/view/customer-support-specialist-at-matrixport-official-4323103235/?ref=cryptocurrencyjobs.co,2025-12-31T10:39:57.272414
https://www.linkedin.com/jobs/view/finance-intern-at-amber-group-4248725225/?ref=cryptocurrencyjobs.co,2025-12-31T11:31:03.349275
170
llm_agent.py
170
llm_agent.py
@ -21,13 +21,12 @@ class LLMJobRefiner:
|
|||||||
raise ValueError("DEEPSEEK_API_KEY not found in .env file.")
|
raise ValueError("DEEPSEEK_API_KEY not found in .env file.")
|
||||||
|
|
||||||
# Database credentials from .env
|
# Database credentials from .env
|
||||||
self.db_url = os.getenv("DB_URL")
|
|
||||||
self.db_username = os.getenv("DB_USERNAME")
|
self.db_username = os.getenv("DB_USERNAME")
|
||||||
self.db_password = os.getenv("DB_PASSWORD")
|
self.db_password = os.getenv("DB_PASSWORD")
|
||||||
self.db_host = os.getenv("DB_HOST")
|
self.db_host = os.getenv("DB_HOST")
|
||||||
self.db_port = os.getenv("DB_PORT")
|
self.db_port = os.getenv("DB_PORT")
|
||||||
|
|
||||||
if not self.db_url or not self.db_username or not self.db_password:
|
if not self.db_username or not self.db_password:
|
||||||
raise ValueError("Database credentials not found in .env file.")
|
raise ValueError("Database credentials not found in .env file.")
|
||||||
|
|
||||||
# DeepSeek uses OpenAI-compatible API
|
# DeepSeek uses OpenAI-compatible API
|
||||||
@ -41,16 +40,6 @@ class LLMJobRefiner:
|
|||||||
def _init_db(self):
|
def _init_db(self):
|
||||||
"""Initialize PostgreSQL database connection and create table"""
|
"""Initialize PostgreSQL database connection and create table"""
|
||||||
try:
|
try:
|
||||||
self.db_url = os.getenv("DB_URL")
|
|
||||||
if self.db_url and "supabase.com" in self.db_url:
|
|
||||||
conn = psycopg2.connect(
|
|
||||||
host=self.db_host,
|
|
||||||
port=self.db_port,
|
|
||||||
database="postgres",
|
|
||||||
user=self.db_username,
|
|
||||||
password=self.db_password
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
conn = psycopg2.connect(
|
conn = psycopg2.connect(
|
||||||
host=self.db_host,
|
host=self.db_host,
|
||||||
port=self.db_port,
|
port=self.db_port,
|
||||||
@ -113,8 +102,8 @@ class LLMJobRefiner:
|
|||||||
text = re.sub(r'\s+', ' ', text)
|
text = re.sub(r'\s+', ' ', text)
|
||||||
|
|
||||||
# Limit length for LLM context
|
# Limit length for LLM context
|
||||||
if len(text) > 10000:
|
if len(text) > 100000:
|
||||||
text = text[:10000] + "..."
|
text = text[:100000] + "..."
|
||||||
|
|
||||||
return text
|
return text
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@ -128,7 +117,7 @@ class LLMJobRefiner:
|
|||||||
response = self.client.chat.completions.create(
|
response = self.client.chat.completions.create(
|
||||||
model=self.model,
|
model=self.model,
|
||||||
messages=[{"role": "user", "content": prompt}],
|
messages=[{"role": "user", "content": prompt}],
|
||||||
temperature=0.2,
|
temperature=0.1,
|
||||||
max_tokens=2048,
|
max_tokens=2048,
|
||||||
stream=False
|
stream=False
|
||||||
)
|
)
|
||||||
@ -145,26 +134,32 @@ class LLMJobRefiner:
|
|||||||
posted_date = raw_data.get('posted_date', datetime.now().strftime("%m/%d/%y"))
|
posted_date = raw_data.get('posted_date', datetime.now().strftime("%m/%d/%y"))
|
||||||
|
|
||||||
prompt = f"""
|
prompt = f"""
|
||||||
You are a job posting data extractor.
|
You are an expert job posting data extractor. Your task is to extract AND infer fields from the provided job posting.
|
||||||
|
|
||||||
EXTRACT EXACT TEXT - DO NOT SUMMARIZE, PARAPHRASE, OR INVENT.
|
### CORE RULES:
|
||||||
|
1. **NEVER invent, summarize, or paraphrase** — extract **exact wording** when available.
|
||||||
|
2. **For critical fields (title, company_name, job_id, url, description):**
|
||||||
|
- These MUST be present and meaningful.
|
||||||
|
- If not explicitly stated, **infer from context** (e.g., page title, headings, "About Us", etc.).
|
||||||
|
- **NEVER return "Not provided" or "N/A" for these fields.**
|
||||||
|
3. **For optional fields (location, salary_range, etc.):**
|
||||||
|
- Extract exact text if present.
|
||||||
|
- If absent but inferable (e.g., "Remote (US)", "Full-time"), **infer it**.
|
||||||
|
- Only return "Not provided" if truly absent and non-inferable.
|
||||||
|
|
||||||
For these critical fields, follow these rules:
|
### FIELD DEFINITIONS:
|
||||||
- description: Extract ALL job description text. If ANY job details exist (duties, responsibilities, overview), include them. Only use "Not provided" if absolutely no description exists.
|
- **title**: The job title. Look in <h1>, page title, or bold headings.
|
||||||
- requirements: Extract ALL requirements text. If ANY requirements exist (skills, experience, education needed), include them. Only use "Not provided" if none exist.
|
- **company_name**: Company name. Look in logo alt text, footer, "About [X]", or page title.
|
||||||
- qualifications: Extract ALL qualifications text. If ANY qualifications exist, include them. Only use "Not provided" if none exist.
|
- **description**: Main job overview, responsibilities, or duties. Combine relevant paragraphs if needed. **Must not be empty.**
|
||||||
|
- **requirements**: Required skills, experience, or qualifications.
|
||||||
|
- **qualifications**: Educational or certification requirements.
|
||||||
|
- **location**: Office location or remote policy.
|
||||||
|
- **salary_range**: Exact compensation info.
|
||||||
|
- **nature_of_work**: Employment type (Full-time, Contract, Internship, etc.).
|
||||||
|
|
||||||
REQUIRED FIELDS (must have valid values, never "N/A"):
|
### OUTPUT FORMAT:
|
||||||
- title, company_name, job_id, url
|
Return ONLY a valid JSON object with these keys:
|
||||||
|
{{
|
||||||
OPTIONAL FIELDS (can be "Not provided"):
|
|
||||||
- location, salary_range, nature_of_work
|
|
||||||
|
|
||||||
Page Content:
|
|
||||||
{cleaned_content}
|
|
||||||
|
|
||||||
Response format (ONLY return this JSON):
|
|
||||||
{{
|
|
||||||
"title": "...",
|
"title": "...",
|
||||||
"company_name": "...",
|
"company_name": "...",
|
||||||
"location": "...",
|
"location": "...",
|
||||||
@ -175,8 +170,16 @@ class LLMJobRefiner:
|
|||||||
"nature_of_work": "...",
|
"nature_of_work": "...",
|
||||||
"job_id": "{job_id}",
|
"job_id": "{job_id}",
|
||||||
"url": "{url}"
|
"url": "{url}"
|
||||||
}}
|
}}
|
||||||
"""
|
|
||||||
|
- **Critical fields must NEVER be "Not provided", "N/A", empty, or generic** (e.g., "Company", "Job Title").
|
||||||
|
- **Optional fields may be "Not provided" ONLY if truly absent.**
|
||||||
|
- **Do not include markdown, explanations, or extra text.**
|
||||||
|
- **Use double quotes for JSON.**
|
||||||
|
|
||||||
|
Page Content:
|
||||||
|
{cleaned_content}
|
||||||
|
"""
|
||||||
|
|
||||||
try:
|
try:
|
||||||
response_text = await asyncio.get_event_loop().run_in_executor(
|
response_text = await asyncio.get_event_loop().run_in_executor(
|
||||||
@ -188,31 +191,23 @@ class LLMJobRefiner:
|
|||||||
if not refined_data:
|
if not refined_data:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Validate required fields
|
# Validate critical fields — reject if missing or placeholder
|
||||||
required_fields = ['title', 'company_name', 'job_id', 'url']
|
critical_fields = ['title', 'company_name', 'job_id', 'url', 'description']
|
||||||
for field in required_fields:
|
for field in critical_fields:
|
||||||
if not refined_data.get(field) or refined_data[field].strip() in ["N/A", "", "Unknown", "Company", "Job"]:
|
|
||||||
return None
|
|
||||||
|
|
||||||
# CRITICAL: Validate content fields - check if they SHOULD exist
|
|
||||||
content_fields = ['description', 'requirements', 'qualifications']
|
|
||||||
cleaned_original = cleaned_content.lower()
|
|
||||||
|
|
||||||
# Simple heuristic: if page contains job-related keywords, content fields should NOT be "Not provided"
|
|
||||||
job_indicators = ['responsibilit', 'duties', 'require', 'qualifi', 'skill', 'experienc', 'educat', 'degree', 'bachelor', 'master']
|
|
||||||
has_job_content = any(indicator in cleaned_original for indicator in job_indicators)
|
|
||||||
|
|
||||||
if has_job_content:
|
|
||||||
for field in content_fields:
|
|
||||||
value = refined_data.get(field, "").strip()
|
value = refined_data.get(field, "").strip()
|
||||||
if value in ["Not provided", "N/A", ""]:
|
if not value or value.lower() in ["n/a", "not provided", "unknown", "company", "job", "title", ""]:
|
||||||
# LLM failed to extract existing content
|
print(f" ❌ Critical field '{field}' is invalid: '{value}'")
|
||||||
print(f" ⚠️ LLM returned '{value}' for {field} but job content appears present")
|
return None # This job will NOT be saved — as per requirement
|
||||||
return None
|
|
||||||
|
# Optional fields: allow "Not provided", but ensure they're strings
|
||||||
|
optional_fields = ['location', 'requirements', 'qualifications', 'salary_range', 'nature_of_work']
|
||||||
|
for field in optional_fields:
|
||||||
|
if field not in refined_data:
|
||||||
|
refined_data[field] = "Not provided"
|
||||||
|
elif not isinstance(refined_data[field], str):
|
||||||
|
refined_data[field] = str(refined_data[field])
|
||||||
|
|
||||||
# Add the posted_date to the refined data
|
|
||||||
refined_data['posted_date'] = posted_date
|
refined_data['posted_date'] = posted_date
|
||||||
|
|
||||||
return refined_data
|
return refined_data
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@ -220,15 +215,22 @@ class LLMJobRefiner:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
def _parse_llm_response(self, response_text: str) -> Dict[str, Any]:
|
def _parse_llm_response(self, response_text: str) -> Dict[str, Any]:
|
||||||
|
# Try to extract JSON from markdown code block
|
||||||
json_match = re.search(r'```(?:json)?\s*({.*?})\s*```', response_text, re.DOTALL)
|
json_match = re.search(r'```(?:json)?\s*({.*?})\s*```', response_text, re.DOTALL)
|
||||||
if not json_match:
|
if not json_match:
|
||||||
json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
|
# Try to find raw JSON object
|
||||||
|
json_match = re.search(r'\{[^{}]*\{[^{}]*\}[^{}]*\}|\{.*\}', response_text, re.DOTALL)
|
||||||
if not json_match:
|
if not json_match:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
return json.loads(json_match.group(1) if '```' in response_text else json_match.group(0))
|
json_str = json_match.group(1) if '```' in response_text else json_match.group(0)
|
||||||
except json.JSONDecodeError:
|
# Clean common issues
|
||||||
|
json_str = re.sub(r'\s+', ' ', json_str)
|
||||||
|
json_str = re.sub(r',\s*([\]}\)])', r'\1', json_str) # Remove trailing commas
|
||||||
|
return json.loads(json_str)
|
||||||
|
except json.JSONDecodeError as e:
|
||||||
|
print(f"JSON parsing error: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
async def save_job_data(self, job_data: Dict[str, Any], keyword: str):
|
async def save_job_data(self, job_data: Dict[str, Any], keyword: str):
|
||||||
@ -254,50 +256,50 @@ class LLMJobRefiner:
|
|||||||
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
|
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
|
||||||
ON CONFLICT (job_id) DO NOTHING
|
ON CONFLICT (job_id) DO NOTHING
|
||||||
''', (
|
''', (
|
||||||
job_data.get("title", "N/A"),
|
job_data.get("title", "Not provided"),
|
||||||
job_data.get("company_name", "N/A"),
|
job_data.get("company_name", "Not provided"),
|
||||||
job_data.get("location", "N/A"),
|
job_data.get("location", "Not provided"),
|
||||||
job_data.get("description", "N/A"),
|
job_data.get("description", "Not provided"),
|
||||||
job_data.get("requirements", "N/A"),
|
job_data.get("requirements", "Not provided"),
|
||||||
job_data.get("qualifications", "N/A"),
|
job_data.get("qualifications", "Not provided"),
|
||||||
job_data.get("salary_range", "N/A"),
|
job_data.get("salary_range", "Not provided"),
|
||||||
job_data.get("nature_of_work", "N/A"),
|
job_data.get("nature_of_work", "Not provided"),
|
||||||
job_data.get("job_id", "N/A"),
|
job_data.get("job_id", "unknown"),
|
||||||
job_data.get("url", "N/A"),
|
job_data.get("url", "N/A"),
|
||||||
job_data.get("category", "N/A"),
|
job_data.get("category", "all"),
|
||||||
job_data.get("scraped_at"),
|
job_data.get("scraped_at"),
|
-                job_data.get("posted_date", "N/A")
+                job_data.get("posted_date", datetime.now().strftime("%m/%d/%y"))
             ))
 
             conn.commit()
             cursor.close()
             conn.close()
 
-            print(f" 💾 Saved job to category '{job_data.get('category', 'N/A')}' with job_id: {job_data.get('job_id', 'N/A')}")
+            print(f" 💾 Saved job to category '{job_data.get('category', 'all')}' with job_id: {job_data.get('job_id', 'unknown')}")
 
         except Exception as e:
             print(f"❌ Database save error: {e}")
 
     async def _save_to_markdown(self, job_data: Dict[str, Any], keyword: str):
-        os.makedirs("linkedin_jobs", exist_ok=True)
-        filepath = os.path.join("linkedin_jobs", "linkedin_jobs_scraped.md")
+        os.makedirs("crypto_jobs", exist_ok=True)
+        filepath = os.path.join("crypto_jobs", "crypto_jobs_scraped.md")
         write_header = not os.path.exists(filepath) or os.path.getsize(filepath) == 0
 
         with open(filepath, "a", encoding="utf-8") as f:
             if write_header:
-                f.write(f"# LinkedIn Jobs - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
+                f.write(f"# Crypto Jobs - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
-            f.write(f"## Job: {job_data.get('title', 'N/A')}\n\n")
+            f.write(f"## Job: {job_data.get('title', 'Not provided')}\n\n")
             f.write(f"- **Keyword**: {keyword}\n")
-            f.write(f"- **Company**: {job_data.get('company_name', 'N/A')}\n")
-            f.write(f"- **Location**: {job_data.get('location', 'N/A')}\n")
-            f.write(f"- **Nature of Work**: {job_data.get('nature_of_work', 'N/A')}\n")
-            f.write(f"- **Salary Range**: {job_data.get('salary_range', 'N/A')}\n")
-            f.write(f"- **Job ID**: {job_data.get('job_id', 'N/A')}\n")
+            f.write(f"- **Company**: {job_data.get('company_name', 'Not provided')}\n")
+            f.write(f"- **Location**: {job_data.get('location', 'Not provided')}\n")
+            f.write(f"- **Nature of Work**: {job_data.get('nature_of_work', 'Not provided')}\n")
+            f.write(f"- **Salary Range**: {job_data.get('salary_range', 'Not provided')}\n")
+            f.write(f"- **Job ID**: {job_data.get('job_id', 'unknown')}\n")
             f.write(f"- **Posted Date**: {job_data.get('posted_date', 'N/A')}\n")
-            f.write(f"- **Category**: {job_data.get('category', 'N/A')}\n")
+            f.write(f"- **Category**: {job_data.get('category', 'all')}\n")
             f.write(f"- **Scraped At**: {job_data.get('scraped_at', 'N/A')}\n")
             f.write(f"- **URL**: <{job_data.get('url', 'N/A')}>\n\n")
-            f.write(f"### Description\n\n{job_data.get('description', 'N/A')}\n\n")
-            f.write(f"### Requirements\n\n{job_data.get('requirements', 'N/A')}\n\n")
-            f.write(f"### Qualifications\n\n{job_data.get('qualifications', 'N/A')}\n\n")
+            f.write(f"### Description\n\n{job_data.get('description', 'Not provided')}\n\n")
+            f.write(f"### Requirements\n\n{job_data.get('requirements', 'Not provided')}\n\n")
+            f.write(f"### Qualifications\n\n{job_data.get('qualifications', 'Not provided')}\n\n")
             f.write("---\n\n")
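Note on the markdown log above: the header is written only when the file is missing or empty, so repeated runs keep appending job entries under a single title. A minimal standalone sketch of that append-once-header pattern (the path and entry format here are illustrative, not taken from the diff):

import os
from datetime import datetime

def append_job_entry(filepath: str, title: str) -> None:
    # Emit the document header only on first use (file absent or empty).
    write_header = not os.path.exists(filepath) or os.path.getsize(filepath) == 0
    with open(filepath, "a", encoding="utf-8") as f:
        if write_header:
            f.write(f"# Crypto Jobs - {datetime.now():%Y-%m-%d %H:%M:%S}\n\n")
        f.write(f"## Job: {title}\n\n---\n\n")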
20 main.py
@@ -1,3 +1,4 @@
 from scraping_engine import FingerprintScrapingEngine
 from scraper import CryptoJobScraper  # Updated class name
 import os
@@ -20,16 +21,15 @@ async def main():
     scraper = CryptoJobScraper(engine, human_speed=1.3, user_request="Extract title, company, location, description, requirements, qualifications, nature of work, and salary")
 
     job_titles = [
-        "Blockchain Engineer",
-        "Smart Contract Developer",
-        "DeFi Analyst",
-        "Web3 Developer",
-        "Crypto Researcher",
-        "Solidity Developer",
-        "Protocol Engineer",
-        "Tokenomics Specialist",
-        "Zero-Knowledge Proof Engineer",
-        "Crypto Compliance Officer"
+        "Customer Support",
+        "Design",
+        "Engineering",
+        "Finance",
+        "Marketing",
+        "Operations",
+        "Product",
+        "Sales"
     ]
 
     while True:
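Note: the list now holds the job board's category names (Engineering, Sales, and so on) instead of individual role keywords, so each pass covers a whole category. The surrounding main() is not shown in this hunk; a hypothetical driver that consumes the list could look like:

import asyncio

async def run_categories(scraper, categories):
    # Scrape each site category in turn; scrape_jobs treats the
    # keyword string as "query[, location]".
    for category in categories:
        await scraper.scrape_jobs(search_keywords=category, max_pages=1)

# asyncio.run(run_categories(scraper, ["Engineering", "Sales"]))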
1 rippling.csv (new file)
@@ -0,0 +1 @@
+url,timestamp

1324 ripplingcompanies.csv (new file)
File diff suppressed because it is too large
426 scraper.py
@@ -6,11 +6,13 @@ from playwright.async_api import async_playwright, TimeoutError as PlaywrightTim
 from browserforge.injectors.playwright import AsyncNewContext
 from llm_agent import LLMJobRefiner
 import re
-from fetcher import StealthyFetcher
 from datetime import datetime
 import json
 import redis
+from urllib.parse import urlparse
+import hashlib
+import csv
+import os
 
 class CryptoJobScraper:
     def __init__(
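Note: the dropped StealthyFetcher import matches the removal of fetcher-based page loads later in this diff (job pages are now opened directly on the Playwright context), while urlparse, hashlib, csv, and os support the new ATS-detection and logging helpers. The hashlib import feeds a deterministic fallback job ID; a standalone sketch of that scheme:

import hashlib

def fallback_job_id(url: str) -> str:
    # 12 hex chars derived from the URL, used when no numeric ID
    # can be parsed out of the path (mirrors the diff's "job_" prefix).
    return "job_" + hashlib.md5(url.encode()).hexdigest()[:12]

print(fallback_job_id("https://cryptocurrencyjobs.co/engineering/example-role"))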
@@ -27,6 +29,29 @@ class CryptoJobScraper:
         self.llm_agent = LLMJobRefiner()
         self.redis_client = redis.Redis(host='localhost', port=6379, db=0, decode_responses=True)
 
+        self.FORBIDDEN_ATS_DOMAINS = [
+            'ashby', 'ashbyhq',
+            'greenhouse', 'boards.greenhouse.io',
+            'gem', 'gem.com',
+            'rippling',
+            'myworkday', 'myworkdayjobs',
+            'smartrecruiters',
+            'workable',
+            'lever', 'jobs.lever.co',
+            'linkedin.com'  # ✅ Added LinkedIn
+        ]
+
+        self.INVALID_CONTENT_PHRASES = [
+            "invalid job url",
+            "cookie consent",
+            "privacy policy",
+            "not a valid job",
+            "job not found",
+            "page not found",
+            "The requested job post could not be found. It may have been removed.",
+            "this page does not contain a job description"
+        ]
+
     async def _human_click(self, page, element, wait_after: bool = True):
         if not element:
             return False
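Note: _is_forbidden_ats_url (added later in this diff) matches these entries as plain substrings of the whole URL, so short tokens such as 'gem' or 'lever' can also flag unrelated URLs that merely contain those letters. A quick self-contained check of that behavior:

FORBIDDEN_ATS_DOMAINS = ['ashby', 'gem', 'lever']

def is_forbidden(url: str) -> bool:
    u = url.lower()
    return any(domain in u for domain in FORBIDDEN_ATS_DOMAINS)

print(is_forbidden("https://jobs.ashbyhq.com/stellar/abc"))         # True, intended
print(is_forbidden("https://example.com/project-management-lead"))  # True: 'gem' inside 'management'

Matching on the parsed hostname (urlparse(url).netloc) rather than the full URL string would avoid such false positives.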
@@ -55,60 +80,127 @@ class CryptoJobScraper:
         matches = sum(1 for kw in keyword_list if kw in title_lower)
         return matches / len(keyword_list) if keyword_list else 0.0
 
-    async def _scrape_jobs_from_current_page(self, page, search_keywords: str, seen_job_ids, all_job_links):
-        current_links = await page.query_selector_all("a[href*='/job/']")
-        new_jobs = 0
-
-        for link in current_links:
-            href = await link.get_attribute("href")
-            if not href or not href.startswith("http"):
-                href = "https://cryptocurrencyjobs.co" + href
-            job_id = href.split("/")[-1] if href.endswith("/") else href.split("/")[-1]
-
-            if job_id and job_id not in seen_job_ids:
-                title_element = await link.query_selector("h3, .job-title")
-                title = (await title_element.inner_text()) if title_element else "Unknown Title"
-                match_percentage = self._calculate_keyword_match(title, search_keywords)
-
-                if match_percentage >= 0.5:  # Lower threshold than LinkedIn
-                    seen_job_ids.add(job_id)
-                    all_job_links.append((href, title))
-                    new_jobs += 1
-                else:
-                    print(f" ⚠️ Skipping job due to low keyword match: {title[:50]}... (match: {match_percentage:.2%})")
-        return new_jobs
-
-    async def _handle_pagination(self, page, search_keywords: str, seen_job_ids, all_job_links):
-        current_page = 1
-        while True:
-            print(f"📄 Processing page {current_page}")
-            new_jobs = await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links)
-            print(f" ➕ Found {new_jobs} new job(s) (total: {len(all_job_links)})")
-
-            next_btn = await page.query_selector('a[rel="next"]')
-            if next_btn:
-                next_url = await next_btn.get_attribute("href")
-                if next_url and not next_url.startswith("http"):
-                    next_url = "https://cryptocurrencyjobs.co" + next_url
-                await page.goto(next_url, timeout=120000)
-                await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)
-                current_page += 1
-            else:
-                print("🔚 No 'Next' page — stopping pagination.")
-                break
-
-    async def _extract_job_posted_date(self, page) -> str:
-        try:
-            date_element = await page.query_selector(".job-posted-date, .job-date, time")
-            if date_element:
-                date_text = await date_element.inner_text()
-                if "Today" in date_text:
-                    return datetime.now().strftime("%m/%d/%y")
-                elif "Yesterday" in date_text:
-                    yesterday = datetime.now().replace(day=datetime.now().day - 1)
-                    return yesterday.strftime("%m/%d/%y")
-                else:
-                    return datetime.now().strftime("%m/%d/%y")
-        except:
-            pass
-        return datetime.now().strftime("%m/%d/%y")
+    async def _extract_job_title_from_card(self, card) -> str:
+        try:
+            title_selectors = [
+                'h3', 'h2', 'h4',
+                'strong', 'span'
+            ]
+            for selector in title_selectors:
+                title_element = await card.query_selector(selector)
+                if title_element:
+                    title_text = await title_element.inner_text()
+                    if title_text and len(title_text.strip()) > 3:
+                        return title_text.strip()
+
+            card_text = await card.inner_text()
+            lines = [line.strip() for line in card_text.split('\n') if line.strip()]
+            if lines:
+                for line in lines:
+                    if len(line) > 5 and not any(skip in line.lower() for skip in ['today', 'featured', 'yesterday', 'company', 'location']):
+                        return line
+            return "Unknown Title"
+        except:
+            return "Unknown Title"
+
+    async def _collect_job_elements_from_page(self, page, search_keywords: str, seen_slugs):
+        job_cards = []
+        job_found = False
+
+        await asyncio.sleep(3 * self.human_speed)
+
+        try:
+            await page.wait_for_selector('a[href^="/"][href*="-"]', timeout=60000)
+            candidates = await page.query_selector_all('a[href^="/"][href*="-"]')
+
+            for link in candidates:
+                href = await link.get_attribute("href") or ""
+                href = href.rstrip('/')
+                if not href or len(href.split('/')) != 3:
+                    continue
+                if '-' not in href.split('/')[-1]:
+                    continue
+                slug = href.split('/')[-1]
+                if len(slug) < 8 or slug.startswith(('login', 'signup', 'about', 'terms')):
+                    continue
+
+                full_url = "https://cryptocurrencyjobs.co" + href if not href.startswith('http') else href
+                if slug in seen_slugs:
+                    continue
+
+                title = await self._extract_job_title_from_card(link)
+                if not title or title == "Unknown Title":
+                    title = slug.replace('-', ' ').title()
+
+                match_percentage = self._calculate_keyword_match(title, search_keywords)
+                if match_percentage >= 0.4 or not search_keywords.strip():
+                    seen_slugs.add(slug)
+                    job_cards.append((full_url, title, link))
+                    job_found = True
+
+            print(f" ✅ Collected {len(job_cards)} valid job links after filtering ({len(candidates)} raw candidates).")
+
+        except Exception as e:
+            print(f" ⚠️ Error collecting job cards: {e}")
+
+        if not job_found:
+            print(" ❌ No valid job listings passed filters.")
+
+        return job_cards
+
+    async def _handle_pagination_and_collect_all(self, page, search_keywords: str, seen_slugs):
+        all_job_elements = []
+        scroll_attempt = 0
+        max_scrolls = 40
+        prev_count = 0
+
+        while scroll_attempt < max_scrolls:
+            print(f" Scroll attempt {scroll_attempt + 1} | Current total jobs: {len(all_job_elements)}")
+
+            page_elements = await self._collect_job_elements_from_page(page, search_keywords, seen_slugs)
+            all_job_elements.extend(page_elements)
+
+            current_count = len(all_job_elements)
+
+            if current_count == prev_count and scroll_attempt > 3:
+                print(" 🔚 No new jobs after several scrolls → assuming end of list.")
+                break
+
+            prev_count = current_count
+
+            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+            await asyncio.sleep(random.uniform(2.5, 5.5) * self.human_speed)
+
+            try:
+                load_more = await page.query_selector(
+                    'button:has-text("Load more"), button:has-text("More"), div[role="button"]:has-text("Load"), a:has-text("Load more")'
+                )
+                if load_more:
+                    print(" Found 'Load more' button → clicking...")
+                    await self._human_click(page, load_more)
+                    await asyncio.sleep(random.uniform(3.0, 6.0) * self.human_speed)
+            except:
+                pass
+
+            scroll_attempt += 1
+
+        print(f" Finished scrolling → collected {len(all_job_elements)} unique job links.")
+        return all_job_elements
+
+    async def _extract_job_posted_date_from_card(self, card) -> str:
+        try:
+            card_text = await card.inner_text()
+            if "Today" in card_text:
+                return datetime.now().strftime("%m/%d/%y")
+            elif "Yesterday" in card_text:
+                from datetime import timedelta
+                return (datetime.now() - timedelta(days=1)).strftime("%m/%d/%y")
+            else:
+                match = re.search(r'(\d+)d', card_text)
+                if match:
+                    days = int(match.group(1))
+                    from datetime import timedelta
+                    return (datetime.now() - timedelta(days=days)).strftime("%m/%d/%y")
+        except:
+            pass
+        return datetime.now().strftime("%m/%d/%y")
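Note: the card-based date parser is also a bug fix. The removed _extract_job_posted_date computed yesterday with datetime.now().replace(day=datetime.now().day - 1), which raises ValueError on the first day of a month; the replacement uses timedelta and additionally maps "Nd" tokens to concrete dates. A standalone sketch of the same mapping:

import re
from datetime import datetime, timedelta

def posted_date_from_text(card_text: str) -> str:
    # Relative tokens ("Today", "Yesterday", "3d") -> MM/DD/YY.
    if "Today" in card_text:
        return datetime.now().strftime("%m/%d/%y")
    if "Yesterday" in card_text:
        return (datetime.now() - timedelta(days=1)).strftime("%m/%d/%y")
    match = re.search(r'(\d+)d', card_text)
    if match:
        return (datetime.now() - timedelta(days=int(match.group(1)))).strftime("%m/%d/%y")
    return datetime.now().strftime("%m/%d/%y")

print(posted_date_from_text("Featured · 3d ago"))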
@@ -126,15 +218,103 @@ class CryptoJobScraper:
         except Exception as e:
             print(f" ❌ Failed to add job to Redis cache: {str(e)}")
 
+    async def _is_forbidden_ats_url(self, url: str) -> bool:
+        url_lower = url.lower()
+        return any(domain in url_lower for domain in self.FORBIDDEN_ATS_DOMAINS)
+
+    def _get_ats_platform_name(self, url: str) -> str:
+        """Return canonical ATS name based on URL (e.g., 'ashby', 'greenhouse')"""
+        url_lower = url.lower()
+
+        # Order matters: more specific first
+        if 'boards.greenhouse.io' in url_lower:
+            return 'greenhouse'
+        elif 'jobs.lever.co' in url_lower:
+            return 'lever'
+        elif 'myworkdayjobs' in url_lower or 'myworkday' in url_lower:
+            return 'workday'
+        elif 'linkedin.com' in url_lower:
+            return 'linkedin'
+        elif 'ashbyhq.com' in url_lower or 'ashby' in url_lower:
+            return 'ashby'
+        elif 'gem.com' in url_lower or 'gem' in url_lower:
+            return 'gem'
+        elif 'rippling' in url_lower:
+            return 'rippling'
+        elif 'smartrecruiters' in url_lower:
+            return 'smartrecruiters'
+        elif 'workable' in url_lower:
+            return 'workable'
+        else:
+            # Fallback: extract domain part
+            try:
+                parsed = urlparse(url)
+                domain = parsed.netloc.lower()
+                for forbidden in self.FORBIDDEN_ATS_DOMAINS:
+                    if forbidden in domain:
+                        return forbidden.split('.')[0] if '.' in forbidden else forbidden
+            except:
+                pass
+            return 'forbidden_ats'
+
+    def _log_forbidden_ats_url(self, url: str, platform: str):
+        """Append forbidden URL to {platform}.csv"""
+        filename = f"{platform}.csv"
+        file_exists = os.path.isfile(filename)
+        with open(filename, 'a', newline='', encoding='utf-8') as f:
+            writer = csv.writer(f)
+            if not file_exists:
+                writer.writerow(['url', 'timestamp'])
+            writer.writerow([url, datetime.now().isoformat()])
+        print(f" 📥 Logged forbidden ATS URL to {filename}: {url}")
+
+    async def _is_invalid_job_page(self, page_content: str) -> bool:
+        content_lower = page_content.lower()
+        return any(phrase in content_lower for phrase in self.INVALID_CONTENT_PHRASES)
+
+    def _extract_job_id_from_url(self, url: str) -> Optional[str]:
+        try:
+            parsed = urlparse(url)
+            path_parts = [p for p in parsed.path.split('/') if p]
+            if not path_parts:
+                return None
+
+            candidate = path_parts[-1]
+            candidate = re.split(r'[?#]', candidate)[0]
+            candidate = re.sub(r'\.html?$', '', candidate)
+
+            if not candidate or not any(c.isdigit() for c in candidate):
+                return None
+
+            if re.search(r'[A-Za-z]{6,}\s', candidate):
+                return None
+
+            return candidate
+        except:
+            return None
+
     async def scrape_jobs(
         self,
         search_keywords: Optional[str],
         max_pages: int = 1,
         credentials: Optional[Dict] = None
     ):
-        # cryptocurrencyjobs.co uses URL params differently
-        encoded_keywords = search_keywords.replace(" ", "%20")
-        search_url = f"https://cryptocurrencyjobs.co/?q={encoded_keywords}"
+        query = ""
+        location = ""
+        if search_keywords and search_keywords.strip():
+            parts = search_keywords.split(',', 1)
+            query = parts[0].strip()
+            if len(parts) > 1:
+                location = parts[1].strip()
+
+        clean_query = query.replace(' ', '+')
+        clean_location = location.replace(' ', '+')
+
+        search_url = "https://cryptocurrencyjobs.co/"
+        if clean_query:
+            search_url += f"?query={clean_query}"
+        if clean_location:
+            search_url += f"&location={clean_location}"
 
         profile = self.engine._select_profile()
         renderer = random.choice(self.engine.common_renderers[self.engine.os])
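Note: search keywords are now split on the first comma into a query part and an optional location part. A standalone sketch of the resulting URL construction, mirroring the diff:

def build_search_url(search_keywords: str) -> str:
    query, location = "", ""
    if search_keywords and search_keywords.strip():
        parts = search_keywords.split(',', 1)
        query = parts[0].strip()
        if len(parts) > 1:
            location = parts[1].strip()
    url = "https://cryptocurrencyjobs.co/"
    if query:
        url += f"?query={query.replace(' ', '+')}"
    if location:
        url += f"&location={location.replace(' ', '+')}"
    return url

print(build_search_url("Engineering, Remote"))
# https://cryptocurrencyjobs.co/?query=Engineering&location=Remote

One edge case carried over from the diff: a keyword string that is only a location (e.g. ", Remote") yields "&location=Remote" with no leading "?".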
@@ -156,46 +336,103 @@
         await context.add_init_script(spoof_script)
 
         page = await context.new_page()
-
-        # Fetch main search page
-        print(f"🔍 Searching for: {search_keywords}")
-        await page.goto(search_url, wait_until='load', timeout=120000)
+        print(f"🔍 Searching for: {search_keywords or 'all jobs'}")
+        print(f" 🔗 URL: {search_url}")
+        await page.goto(search_url, wait_until='networkidle', timeout=120000)
         await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)
 
-        all_job_links = []
-        seen_job_ids = set()
-
-        print("🔄 Collecting job links from search results...")
-        await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links)
-        await self._handle_pagination(page, search_keywords, seen_job_ids, all_job_links)
-
-        print(f"✅ Collected {len(all_job_links)} unique job links.")
+        try:
+            await page.wait_for_selector("a[href^='/'][href*='-']", timeout=10000)
+        except:
+            print(" ⚠️ No job links found initially, waiting longer...")
+            await asyncio.sleep(5 * self.human_speed)
+
+        seen_slugs = set()
+        all_job_elements = await self._handle_pagination_and_collect_all(page, search_keywords, seen_slugs)
+        print(f"✅ Collected {len(all_job_elements)} unique job links.")
 
         scraped_count = 0
-        for idx, (href, title) in enumerate(all_job_links):
+        for idx, (href, title, job_element) in enumerate(all_job_elements):
+            job_detail_page = None
+            apply_page = None
+            skip_job = False
+            final_scrape_url = None
             try:
-                full_url = href
-                print(f" → Opening job {idx+1}/{len(all_job_links)}: {full_url}")
-
-                fetcher = StealthyFetcher(self.engine, browser, context)
-                job_page = await fetcher.fetch_url(full_url, wait_for_selector="h1")
-                if not job_page:
-                    print(f" ❌ Failed to fetch job page {full_url}")
-                    await self._add_job_to_redis_cache(full_url, full_url.split("/")[-1], "fetch_failure")
-                    self.engine.report_outcome("fetch_failure", url=full_url)
-                    continue
-
-                posted_date = await self._extract_job_posted_date(job_page)
-
-                await self.engine._human_like_scroll(job_page)
-                await asyncio.sleep(2 * self.human_speed)
-                page_content = await self._extract_page_content_for_llm(job_page)
-
-                job_id = full_url.split("/")[-1] if full_url.split("/")[-1] else "unknown"
+                print(f" → Processing job {idx+1}/{len(all_job_elements)}: {title}")
+
+                posted_date = await self._extract_job_posted_date_from_card(job_element)
+
+                job_detail_page = await context.new_page()
+                await job_detail_page.goto(href, wait_until='networkidle', timeout=60000)
+                await asyncio.sleep(2 * self.human_speed)
+
+                page_content = await job_detail_page.content()
+                if await self._is_invalid_job_page(page_content):
+                    print(" 🚫 Page contains invalid content → skipping.")
+                    await job_detail_page.close()
+                    continue
+
+                apply_clicked = False
+                apply_selectors = [
+                    'a[href*="apply"], a:text("Apply"), a:text("Apply Now"), a:text("Apply here")',
+                    'button:text("Apply"), button:has-text("Apply")',
+                    '[data-testid="apply-button"], [aria-label*="apply"], [role="button"]:has-text("Apply")',
+                    'a.btn-apply, .apply-button, .apply-link, a:has-text("Apply")',
+                    'a[rel="noopener"]:has-text("Apply")',
+                ]
+
+                for sel in apply_selectors:
+                    apply_elem = await job_detail_page.query_selector(sel)
+                    if apply_elem:
+                        print(f" 🔗 Found Apply element with selector: {sel}")
+                        await self._human_click(job_detail_page, apply_elem, wait_after=True)
+                        apply_clicked = True
+                        break
+
+                apply_page = job_detail_page
+
+                if apply_clicked:
+                    await asyncio.sleep(random.uniform(3.0, 6.0) * self.human_speed)
+                    pages = context.pages
+                    new_pages = [p for p in pages if p != job_detail_page and p.url != "about:blank"]
+
+                    if new_pages:
+                        candidate_page = new_pages[-1]
+                        new_url = candidate_page.url.strip()
+                        print(f" New tab opened: {new_url}")
+
+                        if new_url and await self._is_forbidden_ats_url(new_url):
+                            platform = self._get_ats_platform_name(new_url)
+                            self._log_forbidden_ats_url(new_url, platform)
+                            if candidate_page != job_detail_page:
+                                await candidate_page.close()
+                            await job_detail_page.close()
+                            skip_job = True
+                        else:
+                            apply_page = candidate_page
+                    else:
+                        print(" No new tab → using original page.")
+
+                if skip_job:
+                    continue
+
+                final_scrape_url = apply_page.url
+
+                page_content = await self._extract_page_content_for_llm(apply_page)
+                if await self._is_invalid_job_page(page_content):
+                    print(" 🚫 Final page contains invalid content → skipping.")
+                    if apply_page != job_detail_page:
+                        await apply_page.close()
+                    await job_detail_page.close()
+                    continue
+
+                job_id = self._extract_job_id_from_url(final_scrape_url)
+                if not job_id:
+                    job_id = "job_" + hashlib.md5(final_scrape_url.encode()).hexdigest()[:12]
 
                 raw_data = {
                     "page_content": page_content,
-                    "url": full_url,
+                    "url": final_scrape_url,
                     "job_id": job_id,
                     "search_keywords": search_keywords,
                     "posted_date": posted_date
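Note: after clicking Apply, the diff detects a new tab by comparing context.pages against the original page following a fixed sleep. Playwright also exposes an event-based wait, BrowserContext.expect_page(), which avoids the sleep; a sketch of that alternative (not what the diff uses; click_and_capture_tab and its arguments are hypothetical):

async def click_and_capture_tab(context, page, selector: str):
    # Click `selector` on `page`; return the newly opened tab if one
    # appears within the timeout, else fall back to the original page.
    element = await page.query_selector(selector)
    if element is None:
        return page
    try:
        async with context.expect_page(timeout=10_000) as popup_info:
            await element.click()
        return await popup_info.value
    except Exception:
        return page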
@@ -210,44 +447,45 @@
                         if field == 'job_id':
                             refined_data[field] = job_id
                         elif field == 'url':
-                            refined_data[field] = full_url
+                            refined_data[field] = final_scrape_url
                         elif field == 'company_name':
                             refined_data[field] = "Unknown Company"
 
                     refined_data['scraped_at'] = datetime.now().isoformat()
-                    refined_data['category'] = search_keywords
+                    refined_data['category'] = search_keywords or "all"
                     refined_data['posted_date'] = posted_date
-                    await self.llm_agent.save_job_data(refined_data, search_keywords)
+                    await self.llm_agent.save_job_data(refined_data, search_keywords or "all")
                     scraped_count += 1
-                    print(f" ✅ Scraped and refined: {refined_data['title'][:50]}...")
-                    self.engine.report_outcome("success", url=raw_data["url"])
+                    print(f" ✅ Scraped: {refined_data['title'][:50]}... (Job ID: {job_id})")
+                    self.engine.report_outcome("success", url=final_scrape_url)
                 else:
-                    print(f" 🟡 Could not extract meaningful data from: {full_url}")
-                    await self._add_job_to_redis_cache(full_url, job_id, "llm_failure")
-                    self.engine.report_outcome("llm_failure", url=raw_data["url"])
+                    print(f" 🟡 Could not extract meaningful data from: {final_scrape_url}")
+                    await self._add_job_to_redis_cache(final_scrape_url, job_id, "llm_failure")
+                    self.engine.report_outcome("llm_failure", url=final_scrape_url)
 
-                await job_page.close()
+                if apply_page != job_detail_page and not apply_page.is_closed():
+                    await apply_page.close()
+                if job_detail_page and not job_detail_page.is_closed():
+                    await job_detail_page.close()
 
             except Exception as e:
                 error_msg = str(e)[:100]
                 print(f" ⚠️ Failed on job {idx+1}: {error_msg}")
-                job_id = full_url.split("/")[-1] if 'full_url' in locals() else "unknown"
-                job_url = full_url if 'full_url' in locals() else "unknown"
-                await self._add_job_to_redis_cache(job_url, job_id, f"exception: {error_msg}")
-                if 'job_page' in locals() and job_page:
-                    await job_page.close()
+                job_id_for_log = "unknown"
+                if 'final_scrape_url' in locals() and final_scrape_url:
+                    job_id_for_log = "job_" + hashlib.md5(final_scrape_url.encode()).hexdigest()[:12]
+                await self._add_job_to_redis_cache(href, job_id_for_log, f"exception: {error_msg}")
+                if job_detail_page and not job_detail_page.is_closed():
+                    await job_detail_page.close()
+                if 'apply_page' in locals() and apply_page and apply_page != job_detail_page and not apply_page.is_closed():
+                    await apply_page.close()
                 continue
-
-            finally:
-                print(" ↩️ Returning to search results...")
-                await page.goto(search_url, timeout=120000)
-                await asyncio.sleep(4 * self.human_speed)
 
         await browser.close()
 
         if scraped_count > 0:
             self.engine.report_outcome("success")
-            print(f"✅ Completed! Processed {scraped_count} jobs for '{search_keywords}' based on request '{self.user_request}'.")
+            print(f"✅ Completed! Processed {scraped_count} jobs for '{search_keywords or 'all jobs'}'.")
         else:
             self.engine.report_outcome("scraping_error")
             print("⚠️ No jobs processed successfully.")
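Note: the success and exception paths above now repeat the same guarded close sequence (skip pages that are None, already closed, or the same object). If that pattern grows, it could be factored into one helper; a hypothetical consolidation:

async def close_quietly(*pages) -> None:
    # Close distinct, still-open Playwright pages; ignore close errors.
    seen = set()
    for p in pages:
        if p is None or id(p) in seen:
            continue
        seen.add(id(p))
        try:
            if not p.is_closed():
                await p.close()
        except Exception:
            pass

# await close_quietly(apply_page, job_detail_page)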
5 workable.csv (new file)
@@ -0,0 +1,5 @@
+url,timestamp
+https://apply.workable.com/thetie/j/C54DFC9985/?ref=cryptocurrencyjobs.co,2025-12-31T08:24:45.755671
+https://apply.workable.com/thetie/j/C54DFC9985/?ref=cryptocurrencyjobs.co,2025-12-31T09:51:08.343642
+https://apply.workable.com/thetie/j/2745433865/?ref=cryptocurrencyjobs.co,2025-12-31T09:51:28.331543
+https://apply.workable.com/thetie/j/1A6C8F2913/?ref=cryptocurrencyjobs.co,2025-12-31T11:22:54.623723

3548 workablecompanies.csv (new file)
File diff suppressed because it is too large
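Note: the ATS logs are append-only, so the same job URL can recur (the C54DFC9985 posting appears twice in workable.csv above, logged on separate passes). A minimal dedup when consuming a log, assuming only the url and timestamp columns shown:

import csv

def unique_logged_urls(path: str) -> set:
    # Collapse repeated rows down to the distinct job URLs.
    with open(path, newline='', encoding='utf-8') as f:
        return {row['url'] for row in csv.DictReader(f)}

# unique_logged_urls("workable.csv")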
1 workday.csv (new file)
@@ -0,0 +1 @@
+url,timestamp

1045 workdaycompanies.csv (new file)
File diff suppressed because it is too large