Compare commits
No commits in common. "b0e90972b1f5cd9883a38dac4cfb94eb11709c42" and "38ef08c734cdee20dce2cba6dd056783dafc02f6" have entirely different histories.
b0e90972b1 ... 38ef08c734

ashby.csv (10 changed lines)
@@ -1,10 +0,0 @@
-url,timestamp
-https://jobs.ashbyhq.com/stellar/a8377cf4-280b-4eb3-ac44-a4c9020c2eaf?utm_source=cryptocurrencyjobs.co,2025-12-31T08:32:17.821505
-https://jobs.ashbyhq.com/artemisanalytics/5f61b6c6-147c-4707-9003-a9632455b984?utm_source=cryptocurrencyjobs.co,2025-12-31T08:51:57.190172
-https://jobs.ashbyhq.com/lightning/2d77b496-ab0d-4e54-bcf8-33260d1bab6b?utm_source=cryptocurrencyjobs.co,2025-12-31T09:07:09.491831
-https://jobs.ashbyhq.com/Braiins/cee9cf74-6049-4dab-aae7-96bef0082689?utm_source=cryptocurrencyjobs.co,2025-12-31T09:35:28.137181
-https://jobs.ashbyhq.com/blockstream/80ebab98-0039-48bf-86d9-9a2a7962b005?utm_source=cryptocurrencyjobs.co,2025-12-31T10:21:19.253356
-https://jobs.ashbyhq.com/dynamic/fde8a9ff-9701-485f-a8d1-e717c170f215?utm_source=cryptocurrencyjobs.co,2025-12-31T10:25:55.141543
-https://jobs.ashbyhq.com/ether.fi/6eb1e350-71ce-47f7-a363-3fa3c521dacb?utm_source=cryptocurrencyjobs.co,2025-12-31T10:44:35.913725
-https://chainlinklabs.com/open-roles?ashby_jid=112a76d3-4dfd-4eea-828c-41465760b3ef&utm_source=ccj,2025-12-31T10:49:07.453900
-https://jobs.ashbyhq.com/stellar/cdad9af1-9e64-4fd4-8e2c-f87389f1dd16?utm_source=cryptocurrencyjobs.co,2025-12-31T11:13:58.119967
ashbycompanies.csv (1591 changed lines)
File diff suppressed because it is too large
comparator.py (166 changed lines)
@@ -1,166 +0,0 @@
-import csv
-import os
-from urllib.parse import urlparse
-
-# Define platform mappings: (input_file, companies_file, platform_name)
-platforms = [
-    ("ashby.csv", "ashbycompanies.csv", "ashby"),
-    ("gem.csv", "gemcompanies.csv", "gem"),
-    ("greenhouse.csv", "greenhousecompanies.csv", "greenhouse"),
-    ("lever.csv", "levercompanies.csv", "lever"),
-    ("rippling.csv", "ripplingcompanies.csv", "rippling"),
-    ("workable.csv", "workablecompanies.csv", "workable"),
-    ("workday.csv", "workdaycompanies.csv", "workday"),
-]
-
-
-def normalize_url(platform, url):
-    """Normalize URL to a company identifier based on platform."""
-    if not url:
-        return None
-    try:
-        parsed = urlparse(url.lower().strip())
-        netloc = parsed.netloc
-        path = parsed.path
-
-        if platform == "ashby":
-            # https://jobs.ashbyhq.com/company_slug/...
-            if "ashbyhq.com" in netloc:
-                parts = [p for p in path.split('/') if p]
-                return parts[0] if parts else None
-
-        elif platform == "greenhouse":
-            # https://boards.greenhouse.io/company_slug/...
-            if "greenhouse.io" in netloc:
-                parts = [p for p in path.split('/') if p]
-                if len(parts) >= 2 and parts[0] == "boards":
-                    return parts[1]
-                elif len(parts) >= 1:
-                    return parts[0]
-            return None
-
-        elif platform == "lever":
-            # https://jobs.lever.co/company_slug/...
-            if "lever.co" in netloc:
-                parts = [p for p in path.split('/') if p]
-                return parts[0] if parts else None
-
-        elif platform == "workable":
-            # https://apply.workable.com/company_slug/...
-            if "workable.com" in netloc:
-                parts = [p for p in path.split('/') if p]
-                # Usually: /company_slug/j/jobid/ → take first non-'j' segment
-                for part in parts:
-                    if part != 'j' and len(part) > 2:
-                        return part
-                return parts[0] if parts else None
-
-        elif platform == "workday":
-            # https://company.workday.com/... → company = subdomain
-            if "myworkdayjobs.com" in netloc or "wd" in netloc:
-                # Extract subdomain before main domain
-                subdomain = netloc.split('.')[0]
-                if subdomain and subdomain not in ['www', 'jobs', 'apply', '']:
-                    return subdomain
-            # Fallback: look for company in path (rare)
-            parts = [p for p in path.split('/') if p]
-            if parts:
-                return parts[0]
-            return None
-
-        elif platform == "gem":
-            # https://gem.com/company/... or https://www.gem.com/careers/company/...
-            if "gem.com" in netloc:
-                parts = [p for p in path.split('/') if p]
-                # Often: /company-slug or /careers/company-slug
-                for i, part in enumerate(parts):
-                    if part in ['company', 'careers', 'jobs']:
-                        if i + 1 < len(parts):
-                            return parts[i + 1]
-                return parts[0] if parts else None
-
-        elif platform == "rippling":
-            # Rippling uses generic domain; hard to extract company
-            # Best effort: use full domain + first path segment
-            if "rippling.com" in netloc:
-                parts = [p for p in path.split('/') if p]
-                if parts:
-                    return f"{netloc}/{parts[0]}"
-                return netloc
-
-        # Fallback: return full URL if unrecognized
-        return url
-
-    except Exception:
-        return url
-
-
-def read_company_signatures(filepath, platform):
-    """Read and normalize company identifiers from companies CSV."""
-    if not os.path.exists(filepath):
-        return set()
-    signatures = set()
-    with open(filepath, 'r', encoding='utf-8') as f:
-        reader = csv.DictReader(f)
-        for row in reader:
-            url = row.get('url', '').strip()
-            if url:
-                sig = normalize_url(platform, url)
-                if sig:
-                    signatures.add(sig)
-    return signatures
-
-
-def filter_csv_by_signatures(input_file, excluded_signatures, platform):
-    """Keep only rows whose normalized URL is NOT in excluded_signatures."""
-    if not os.path.exists(input_file):
-        return [], None
-    kept_rows = []
-    with open(input_file, 'r', encoding='utf-8') as f:
-        reader = csv.DictReader(f)
-        fieldnames = reader.fieldnames
-        for row in reader:
-            url = row.get('url', '').strip()
-            if not url:
-                kept_rows.append(row)  # keep if no URL (shouldn't happen)
-                continue
-            sig = normalize_url(platform, url)
-            if sig not in excluded_signatures:
-                kept_rows.append(row)
-    return kept_rows, fieldnames
-
-
-def write_csv(filepath, rows, fieldnames):
-    """Write rows to CSV file."""
-    with open(filepath, 'w', newline='', encoding='utf-8') as f:
-        writer = csv.DictWriter(f, fieldnames=fieldnames)
-        writer.writeheader()
-        writer.writerows(rows)
-
-
-def main():
-    for input_file, companies_file, platform in platforms:
-        print(f"Processing {input_file} against {companies_file} using '{platform}' normalizer...")
-
-        # Step 1: Load and normalize known company signatures
-        known_signatures = read_company_signatures(companies_file, platform)
-        print(f"  → Loaded {len(known_signatures)} known company signatures from {companies_file}")
-
-        # Step 2: Filter input file using signatures
-        kept_rows, fieldnames = filter_csv_by_signatures(input_file, known_signatures, platform)
-
-        # Step 3: Write back filtered data
-        if fieldnames:
-            write_csv(input_file, kept_rows, fieldnames)
-            print(f"  → Kept {len(kept_rows)} new job URLs in {input_file}")
-        else:
-            if os.path.exists(input_file):
-                os.remove(input_file)
-                print(f"  → {input_file} was empty or invalid — removed.")
-
-    print("\n✅ All platforms processed successfully.")
-
-
-if __name__ == "__main__":
-    main()
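Side note: the deleted comparator.py above de-duplicates freshly scraped job URLs by reducing each URL to a per-platform company slug and dropping rows whose slug already appears in the corresponding companies CSV. A minimal standalone sketch of the Ashby branch (the helper name ashby_company is ours; the logic and the example URL are taken from the files above):

from urllib.parse import urlparse

def ashby_company(url):
    # Mirrors the normalize_url(platform="ashby", ...) branch above:
    # jobs.ashbyhq.com/<company_slug>/<job_uuid> -> <company_slug>
    parsed = urlparse(url.lower().strip())
    if "ashbyhq.com" not in parsed.netloc:
        return None
    parts = [p for p in parsed.path.split('/') if p]
    return parts[0] if parts else None

# First URL from the deleted ashby.csv above:
print(ashby_company("https://jobs.ashbyhq.com/stellar/a8377cf4-280b-4eb3-ac44-a4c9020c2eaf"))  # -> stellar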
fetcher.py

@@ -27,7 +27,7 @@ class StealthyFetcher:
 
         if wait_for_selector:
             try:
-                await page.wait_for_selector(wait_for_selector, timeout=120000)
+                await page.wait_for_selector(wait_for_selector, timeout=40000)
             except PlaywrightTimeoutError:
                 print(f"Selector {wait_for_selector} not found immediately, continuing...")
 
@@ -88,7 +88,7 @@ class StealthyFetcher:
     async def _is_content_accessible(self, page: Page, wait_for_selector: Optional[str] = None) -> bool:
         if wait_for_selector:
             try:
-                await page.wait_for_selector(wait_for_selector, timeout=120000)
+                await page.wait_for_selector(wait_for_selector, timeout=40000)
                 return True
             except PlaywrightTimeoutError:
                 pass
gemcompanies.csv (508 changed lines)
@@ -1,508 +0,0 @@
-name,url
-10Xconstruction Ai,https://jobs.gem.com/10xconstruction-ai
-11X Ai,https://jobs.gem.com/11x-ai
-43North,https://jobs.gem.com/43north
-8020 Consulting,https://jobs.gem.com/8020-consulting
-A16Z Speedrun,https://jobs.gem.com/a16z-speedrun
-Aarden Ai,https://jobs.gem.com/aarden-ai
-Accel,https://jobs.gem.com/accel
-Accelos,https://jobs.gem.com/accelos
-Acre,https://jobs.gem.com/acre
-Advancelevelllc Com,https://jobs.gem.com/advancelevelllc-com
-Agenta Ai,https://jobs.gem.com/agenta-ai
-Agentnoon,https://jobs.gem.com/agentnoon
-Agora,https://jobs.gem.com/agora
-Aionex Xyz,https://jobs.gem.com/aionex-xyz
-Aiphrodite Ai,https://jobs.gem.com/aiphrodite-ai
-Airframe,https://jobs.gem.com/airframe
-Airvet Com,https://jobs.gem.com/airvet-com
-Alex And Ani,https://jobs.gem.com/alex-and-ani
-Alinia Ai,https://jobs.gem.com/alinia-ai
-Alitheon,https://jobs.gem.com/alitheon
-Alpharun,https://jobs.gem.com/alpharun
-Altzero Xyz,https://jobs.gem.com/altzero-xyz
-Amya Agency,https://jobs.gem.com/amya-agency
-Andrenam,https://jobs.gem.com/andrenam
-Anysphere,https://jobs.gem.com/anysphere
-Aoniclife,https://jobs.gem.com/aoniclife
-Apartment List,https://jobs.gem.com/apartment-list
-Apella,https://jobs.gem.com/apella
-Apticore Io,https://jobs.gem.com/apticore-io
-Arlo,https://jobs.gem.com/arlo
-Ascenda Loyalty,https://jobs.gem.com/ascenda-loyalty
-Ascendarc,https://jobs.gem.com/ascendarc
-Astroforge Io,https://jobs.gem.com/astroforge-io
-Atla Ai Com,https://jobs.gem.com/atla-ai-com
-Atomica,https://jobs.gem.com/atomica
-Audicus,https://jobs.gem.com/audicus
-Aurelian Io,https://jobs.gem.com/aurelian-io
-Aureliussystems Us,https://jobs.gem.com/aureliussystems-us
-Autopilotbrand Com,https://jobs.gem.com/autopilotbrand-com
-Avoca,https://jobs.gem.com/avoca
-Avol,https://jobs.gem.com/avol
-Axonify,https://jobs.gem.com/axonify
-Backops Ai,https://jobs.gem.com/backops-ai
-Basalt Health,https://jobs.gem.com/basalt-health
-Baxter Aerospace,https://jobs.gem.com/baxter-aerospace
-Bead Ai,https://jobs.gem.com/bead-ai
-Benbase,https://jobs.gem.com/benbase
-Better Auth,https://jobs.gem.com/better-auth
-Betterbasket Ai,https://jobs.gem.com/betterbasket-ai
-Bigeye,https://jobs.gem.com/bigeye
-Bigpanda,https://jobs.gem.com/bigpanda
-Bikky,https://jobs.gem.com/bikky
-Bilt,https://jobs.gem.com/bilt
-Binarly,https://jobs.gem.com/binarly
-Biofire,https://jobs.gem.com/biofire
-Biorender,https://jobs.gem.com/biorender
-Biorender Inc Ats,https://jobs.gem.com/biorender-inc--ats
-Birdwood Therapeutics,https://jobs.gem.com/birdwood-therapeutics
-Black Ore,https://jobs.gem.com/black-ore
-Blaze Ai,https://jobs.gem.com/blaze-ai
-Blazetalent,https://jobs.gem.com/blazetalent
-Blend Inc,https://jobs.gem.com/blend-inc
-Blue J,https://jobs.gem.com/blue-j
-Bluejeanfinancial Com,https://jobs.gem.com/bluejeanfinancial-com
-Blueonion Ai,https://jobs.gem.com/blueonion-ai
-Blueprint,https://jobs.gem.com/blueprint
-Bluesky,https://jobs.gem.com/bluesky
-Blume Technologies,https://jobs.gem.com/blume-technologies
-Bohler ,https://jobs.gem.com/bohler-
-Bohler Engineering Gemats,https://jobs.gem.com/bohler-engineering-gemats
-Bolna,https://jobs.gem.com/bolna
-Bond Partners,https://jobs.gem.com/bond-partners
-Boost Robotics,https://jobs.gem.com/boost-robotics
-Boredm,https://jobs.gem.com/boredm
-Breadcrumb Ai,https://jobs.gem.com/breadcrumb-ai
-Breakline Ats,https://jobs.gem.com/breakline-ats
-Breakline Education,https://jobs.gem.com/breakline-education
-Brewbird,https://jobs.gem.com/brewbird
-Buildtrayd Com,https://jobs.gem.com/buildtrayd-com
-Bull Moose Xyz,https://jobs.gem.com/bull-moose-xyz
-Cadstrom Io,https://jobs.gem.com/cadstrom-io
-Caffelabs Com,https://jobs.gem.com/caffelabs-com
-Calaveras,https://jobs.gem.com/calaveras
-Canals,https://jobs.gem.com/canals
-Caplight Com,https://jobs.gem.com/caplight-com
-Carbon,https://jobs.gem.com/carbon
-Cardnexus,https://jobs.gem.com/cardnexus
-Careers,https://jobs.gem.com/careers
-Carry,https://jobs.gem.com/carry
-Caseflood Ai,https://jobs.gem.com/caseflood-ai
-Cellbyte,https://jobs.gem.com/cellbyte
-Chartahealth,https://jobs.gem.com/chartahealth
-Civrobotics Com,https://jobs.gem.com/civrobotics-com
-Clarity,https://jobs.gem.com/clarity
-Clearchecks Com Ats,https://jobs.gem.com/clearchecks-com-ats
-Clearesthealth Com,https://jobs.gem.com/clearesthealth-com
-Cloudanix Com,https://jobs.gem.com/cloudanix-com
-Cloudraft,https://jobs.gem.com/cloudraft
-Codegen,https://jobs.gem.com/codegen
-Codesignal,https://jobs.gem.com/codesignal
-Cognna,https://jobs.gem.com/cognna
-Cogram,https://jobs.gem.com/cogram
-Comfy,https://jobs.gem.com/comfy
-Conductorai,https://jobs.gem.com/conductorai
-Confida Ai,https://jobs.gem.com/confida-ai
-Context Wtf,https://jobs.gem.com/context-wtf
-Contour App,https://jobs.gem.com/contour-app
-Converge,https://jobs.gem.com/converge
-Coupa Software Inc Ats 1,https://jobs.gem.com/coupa-software-inc-ats-1
-Cox Exponential,https://jobs.gem.com/cox-exponential
-Crabi Robotics Com,https://jobs.gem.com/crabi-robotics-com
-Crackenagi,https://jobs.gem.com/crackenagi
-Create Talent Group,https://jobs.gem.com/create-talent-group
-Createdbyhumans Ai,https://jobs.gem.com/createdbyhumans-ai
-Credit Key,https://jobs.gem.com/credit-key
-Crosby,https://jobs.gem.com/crosby
-Curex Org,https://jobs.gem.com/curex-org
-Curiouscardinals Com,https://jobs.gem.com/curiouscardinals-com
-Cyvl,https://jobs.gem.com/cyvl
-D4M International,https://jobs.gem.com/d4m-international
-Dalus,https://jobs.gem.com/dalus
-Dash Fi,https://jobs.gem.com/dash-fi
-Data Masters,https://jobs.gem.com/data-masters
-Datacurve Ai,https://jobs.gem.com/datacurve-ai
-Dataday Technology Solutions,https://jobs.gem.com/dataday-technology-solutions
-Datagrid,https://jobs.gem.com/datagrid
-Dawn Media,https://jobs.gem.com/dawn-media
-Daxko,https://jobs.gem.com/daxko
-Deep Infra,https://jobs.gem.com/deep-infra
-Deliver,https://jobs.gem.com/deliver
-Detections Ai,https://jobs.gem.com/detections-ai
-Dianahr Ai,https://jobs.gem.com/dianahr-ai
-Distributed Spectrum,https://jobs.gem.com/distributed-spectrum
-Dlvrlog,https://jobs.gem.com/dlvrlog
-Doowii,https://jobs.gem.com/doowii
-Dragme,https://jobs.gem.com/dragme
-Dragonfly Careers,https://jobs.gem.com/dragonfly-careers
-Dropback,https://jobs.gem.com/dropback
-Durin,https://jobs.gem.com/durin
-Dydx,https://jobs.gem.com/dydx
-Eats2Seats,https://jobs.gem.com/eats2seats
-Echelon,https://jobs.gem.com/echelon
-Ecocart Io,https://jobs.gem.com/ecocart-io
-Edgetrace Ai,https://jobs.gem.com/edgetrace-ai
-Efference Ai,https://jobs.gem.com/efference-ai
-Elite Talent Consulting,https://jobs.gem.com/elite-talent-consulting
-Eliza,https://jobs.gem.com/eliza
-Elloe Ai,https://jobs.gem.com/elloe-ai
-Elo Ai,https://jobs.gem.com/elo-ai
-Emerge Career,https://jobs.gem.com/emerge-career
-Engineering Codified,https://jobs.gem.com/engineering--codified
-Entrusted Contracting,https://jobs.gem.com/entrusted-contracting
-Escargot Com,https://jobs.gem.com/escargot-com
-Everfit Io,https://jobs.gem.com/everfit-io
-Excelity Careers,https://jobs.gem.com/excelity-careers
-Exponent,https://jobs.gem.com/exponent
-Ezraailabs Tech,https://jobs.gem.com/ezraailabs-tech
-Fabric,https://jobs.gem.com/fabric
-Fabrichealth,https://jobs.gem.com/fabrichealth
-Fancypeople,https://jobs.gem.com/fancypeople
-Fanpierlabs Com,https://jobs.gem.com/fanpierlabs-com
-Faraday,https://jobs.gem.com/faraday
-Fathom Org,https://jobs.gem.com/fathom-org
-Felix,https://jobs.gem.com/felix
-Ferry Health,https://jobs.gem.com/ferry-health
-Fetch Ats,https://jobs.gem.com/fetch-ats
-Fifthdoor Com,https://jobs.gem.com/fifthdoor-com
-Fireflies,https://jobs.gem.com/fireflies
-Firestorm,https://jobs.gem.com/firestorm
-Flatfee Corp,https://jobs.gem.com/flatfee-corp
-Flint,https://jobs.gem.com/flint
-Floot,https://jobs.gem.com/floot
-Forgent Ai,https://jobs.gem.com/forgent-ai
-Fountainplatform Com,https://jobs.gem.com/fountainplatform-com
-Foxbox Digital,https://jobs.gem.com/foxbox-digital
-Freestone Grove Partners,https://jobs.gem.com/freestone-grove-partners
-Freshbooks,https://jobs.gem.com/freshbooks
-Fridayharbor Ai,https://jobs.gem.com/fridayharbor-ai
-Fuelfinance,https://jobs.gem.com/fuelfinance
-Fulcrumcareers,https://jobs.gem.com/fulcrumcareers
-Function Health,https://jobs.gem.com/function-health
-Galadyne,https://jobs.gem.com/galadyne
-Galaxyventures,https://jobs.gem.com/galaxyventures
-Gc Ai,https://jobs.gem.com/gc-ai
-Gem,https://jobs.gem.com/gem
-Gem Mckesson,https://jobs.gem.com/gem-mckesson
-Gem Test Board,https://jobs.gem.com/gem-test-board
-Generation Alpha Transistor,https://jobs.gem.com/generation-alpha-transistor
-Genspark,https://jobs.gem.com/genspark
-Gerra,https://jobs.gem.com/gerra
-Getaero Io,https://jobs.gem.com/getaero-io
-Getbirdeye Com Au,https://jobs.gem.com/getbirdeye-com-au
-Getro,https://jobs.gem.com/getro
-Gigaml,https://jobs.gem.com/gigaml
-Go Cadre,https://jobs.gem.com/go-cadre
-Goatrecruit Com,https://jobs.gem.com/goatrecruit-com
-Good Life Companies,https://jobs.gem.com/good-life-companies
-Goodbill,https://jobs.gem.com/goodbill
-Grailpay Com,https://jobs.gem.com/grailpay-com
-Granger Construction,https://jobs.gem.com/granger-construction
-Gratia Health,https://jobs.gem.com/gratia-health
-Greenlite Ai,https://jobs.gem.com/greenlite-ai
-Greenvalleyjobs,https://jobs.gem.com/greenvalleyjobs
-Grit,https://jobs.gem.com/grit
-Groq,https://jobs.gem.com/groq
-Growthbook,https://jobs.gem.com/growthbook
-Guardrail Ai,https://jobs.gem.com/guardrail-ai
-Guidesage Ai,https://jobs.gem.com/guidesage-ai
-Hallow,https://jobs.gem.com/hallow
-Happydance Partnership Integration,https://jobs.gem.com/happydance-partnership-integration
-Harmonic,https://jobs.gem.com/harmonic
-Hash,https://jobs.gem.com/hash
-Hayla,https://jobs.gem.com/hayla
-Heavy Construction Systems Specialists Llc,https://jobs.gem.com/heavy-construction-systems-specialists-llc
-Helix,https://jobs.gem.com/helix
-Hellotrade,https://jobs.gem.com/hellotrade
-Helm Health,https://jobs.gem.com/helm-health
-Hilabs Ie,https://jobs.gem.com/hilabs-ie
-Hipeople,https://jobs.gem.com/hipeople
-Holacasa Yc W23,https://jobs.gem.com/holacasa-yc-w23
-Homeboost,https://jobs.gem.com/homeboost
-Hospitable,https://jobs.gem.com/hospitable
-Howrecruit Io,https://jobs.gem.com/howrecruit-io
-Hubspot,https://jobs.gem.com/hubspot
-Hypernatural Ai,https://jobs.gem.com/hypernatural-ai
-Inception,https://jobs.gem.com/inception
-Index Exchange,https://jobs.gem.com/index-exchange
-Infrastructure Modernization Solutions,https://jobs.gem.com/infrastructure-modernization-solutions
-Inspiration Commerce Group,https://jobs.gem.com/inspiration-commerce-group
-Inspiresemi Com,https://jobs.gem.com/inspiresemi-com
-Instrumental Inc ,https://jobs.gem.com/instrumental-inc-
-Integral Xyz,https://jobs.gem.com/integral-xyz
-Integrationscaptain,https://jobs.gem.com/integrationscaptain
-Intelligentresourcing Co,https://jobs.gem.com/intelligentresourcing-co
-Interfere Old,https://jobs.gem.com/interfere-old
-Invoicebutler Ai,https://jobs.gem.com/invoicebutler-ai
-Iris,https://jobs.gem.com/iris
-Ironsite Ai,https://jobs.gem.com/ironsite-ai
-Itsvaleria Co,https://jobs.gem.com/itsvaleria-co
-Jaguaracareers,https://jobs.gem.com/jaguaracareers
-Janie,https://jobs.gem.com/janie
-Jayla Careers,https://jobs.gem.com/jayla-careers
-Jobma,https://jobs.gem.com/jobma
-Joinanvil Com,https://jobs.gem.com/joinanvil-com
-Joinformal,https://jobs.gem.com/joinformal
-Joyful Health,https://jobs.gem.com/joyful-health
-Kaikaku,https://jobs.gem.com/kaikaku
-Kaironhealth,https://jobs.gem.com/kaironhealth
-Kaironhealth Com,https://jobs.gem.com/kaironhealth-com
-Kanu Ai,https://jobs.gem.com/kanu-ai
-Kcs Hiring,https://jobs.gem.com/kcs-hiring
-Keru Ai,https://jobs.gem.com/keru-ai
-Key To Web3,https://jobs.gem.com/key-to-web3
-Knight Electric Inc ,https://jobs.gem.com/knight-electric-inc-
-Kollectiv Ai,https://jobs.gem.com/kollectiv-ai
-Kumo Ai,https://jobs.gem.com/kumo-ai
-Lantern,https://jobs.gem.com/lantern
-Lavapayments Com,https://jobs.gem.com/lavapayments-com
-Leap Tools,https://jobs.gem.com/leap-tools
-Letsdata,https://jobs.gem.com/letsdata
-Letter Ai,https://jobs.gem.com/letter-ai
-Level,https://jobs.gem.com/level
-Linktree,https://jobs.gem.com/linktree
-Little Otter,https://jobs.gem.com/little-otter
-Lower Llc,https://jobs.gem.com/lower-llc
-Lumalabs Ai,https://jobs.gem.com/lumalabs-ai
-Lunajoy,https://jobs.gem.com/lunajoy
-Lunch,https://jobs.gem.com/lunch
-Lunos Ai,https://jobs.gem.com/lunos-ai
-Magnetic,https://jobs.gem.com/magnetic
-Manifest,https://jobs.gem.com/manifest
-Manifested Com,https://jobs.gem.com/manifested-com
-Marble Health,https://jobs.gem.com/marble-health
-Mavi,https://jobs.gem.com/mavi
-Meetdex Ai,https://jobs.gem.com/meetdex-ai
-Megapot,https://jobs.gem.com/megapot
-Meineautosdirekt,https://jobs.gem.com/meineautosdirekt
-Menten Ai,https://jobs.gem.com/menten-ai
-Merge Sandbox,https://jobs.gem.com/merge-sandbox
-Metal Ai,https://jobs.gem.com/metal-ai
-Microsoft Demo Gem Com,https://jobs.gem.com/microsoft-demo-gem-com
-Mimicrobotics Com,https://jobs.gem.com/mimicrobotics-com
-Mission,https://jobs.gem.com/mission
-Moosehead Talent,https://jobs.gem.com/moosehead-talent
-Motion,https://jobs.gem.com/motion
-Moxa,https://jobs.gem.com/moxa
-Multiplierhq,https://jobs.gem.com/multiplierhq
-Multiscale Ai,https://jobs.gem.com/multiscale-ai
-Myprize,https://jobs.gem.com/myprize
-Myriad Technology,https://jobs.gem.com/myriad-technology
-Myrrsgroup,https://jobs.gem.com/myrrsgroup
-Nabla Bio,https://jobs.gem.com/nabla-bio
-Nacelle,https://jobs.gem.com/nacelle
-Nativemsg,https://jobs.gem.com/nativemsg
-Nclusion,https://jobs.gem.com/nclusion
-Nerve,https://jobs.gem.com/nerve
-Newcrew,https://jobs.gem.com/newcrew
-Ngram,https://jobs.gem.com/ngram
-Nimble,https://jobs.gem.com/nimble
-Niva,https://jobs.gem.com/niva
-Nominal,https://jobs.gem.com/nominal
-Northone,https://jobs.gem.com/northone
-Ntop,https://jobs.gem.com/ntop
-Nue Ai,https://jobs.gem.com/nue-ai
-Nutrislice,https://jobs.gem.com/nutrislice
-Nuvo,https://jobs.gem.com/nuvo
-Obin Ai,https://jobs.gem.com/obin-ai
-Obsidian Systems,https://jobs.gem.com/obsidian-systems
-Odo Do,https://jobs.gem.com/odo-do
-Omegahhagency Com,https://jobs.gem.com/omegahhagency-com
-Ondo Finance,https://jobs.gem.com/ondo-finance
-Onesignal,https://jobs.gem.com/onesignal
-Onesignal Ats,https://jobs.gem.com/onesignal-ats
-Onezyme,https://jobs.gem.com/onezyme
-Onfrontiers,https://jobs.gem.com/onfrontiers
-Openphone,https://jobs.gem.com/openphone
-Openreqstaffing,https://jobs.gem.com/openreqstaffing
-Opine,https://jobs.gem.com/opine
-Ora So,https://jobs.gem.com/ora-so
-Overlay,https://jobs.gem.com/overlay
-Overwatch,https://jobs.gem.com/overwatch
-Paces,https://jobs.gem.com/paces
-Pae,https://jobs.gem.com/pae
-Pagebound,https://jobs.gem.com/pagebound
-Pally,https://jobs.gem.com/pally
-Paramark,https://jobs.gem.com/paramark
-Partao,https://jobs.gem.com/partao
-Partnerhq,https://jobs.gem.com/partnerhq
-Patlytics,https://jobs.gem.com/patlytics
-Pave,https://jobs.gem.com/pave
-Perceptyx,https://jobs.gem.com/perceptyx
-Photalabs Com,https://jobs.gem.com/photalabs-com
-Photon,https://jobs.gem.com/photon
-Pinnacleconnect Llc,https://jobs.gem.com/pinnacleconnect-llc
-Piqenergy Com,https://jobs.gem.com/piqenergy-com
-Planet Fans,https://jobs.gem.com/planet-fans
-Planned,https://jobs.gem.com/planned
-Plixai,https://jobs.gem.com/plixai
-Pogo Recruiting,https://jobs.gem.com/pogo-recruiting
-Polar,https://jobs.gem.com/polar
-Polywork,https://jobs.gem.com/polywork
-Pomerium,https://jobs.gem.com/pomerium
-Portal Ai,https://jobs.gem.com/portal-ai
-Poseidonaero,https://jobs.gem.com/poseidonaero
-Prahsys Com,https://jobs.gem.com/prahsys-com
-Praxisiq Ai,https://jobs.gem.com/praxisiq-ai
-Precision Ai,https://jobs.gem.com/precision-ai
-Prodia,https://jobs.gem.com/prodia
-Productboard,https://jobs.gem.com/productboard
-Productboard Ats,https://jobs.gem.com/productboard-ats
-Prohost Ai,https://jobs.gem.com/prohost-ai
-Project Method,https://jobs.gem.com/project-method
-Promptql,https://jobs.gem.com/promptql
-Propel,https://jobs.gem.com/propel
-Prospermedical Com,https://jobs.gem.com/prospermedical-com
-Protegeai,https://jobs.gem.com/protegeai
-Questdb Com,https://jobs.gem.com/questdb-com
-Quitwithjones,https://jobs.gem.com/quitwithjones
-Quo,https://jobs.gem.com/quo
-Rain Aero,https://jobs.gem.com/rain-aero
-Raincode Bahrain W L L,https://jobs.gem.com/raincode-bahrain-w-l-l
-Raylu Ai,https://jobs.gem.com/raylu-ai
-Rctsglobal Com,https://jobs.gem.com/rctsglobal-com
-Rditrials,https://jobs.gem.com/rditrials
-Rebuild Work,https://jobs.gem.com/rebuild-work
-Redcar,https://jobs.gem.com/redcar
-Redenvelope Co,https://jobs.gem.com/redenvelope-co
-Redo,https://jobs.gem.com/redo
-Rektech,https://jobs.gem.com/rektech
-Renew,https://jobs.gem.com/renew
-Resprop,https://jobs.gem.com/resprop
-Retool,https://jobs.gem.com/retool
-Revolutionparts,https://jobs.gem.com/revolutionparts
-Rex,https://jobs.gem.com/rex
-Rf Renovo Management Company Llc,https://jobs.gem.com/rf-renovo-management-company-llc
-Riley,https://jobs.gem.com/riley
-Rinsed,https://jobs.gem.com/rinsed
-Risely Ai,https://jobs.gem.com/risely-ai
-Rivia,https://jobs.gem.com/rivia
-Roadio Ai,https://jobs.gem.com/roadio-ai
-Roamless,https://jobs.gem.com/roamless
-Roe Ai,https://jobs.gem.com/roe-ai
-Rossibuilders Com,https://jobs.gem.com/rossibuilders-com
-Roundhouse Media,https://jobs.gem.com/roundhouse-media
-Rove,https://jobs.gem.com/rove
-Runsybil,https://jobs.gem.com/runsybil
-Sadnaconsulting Com,https://jobs.gem.com/sadnaconsulting-com
-Sailorhealth Com,https://jobs.gem.com/sailorhealth-com
-Sales Marker,https://jobs.gem.com/sales-marker
-Salesqueze Com,https://jobs.gem.com/salesqueze-com
-Sandbar Inc,https://jobs.gem.com/sandbar-inc
-Sandboxschonfeld Com,https://jobs.gem.com/sandboxschonfeld-com
-Sauron Systems,https://jobs.gem.com/sauron-systems
-Scope Labs,https://jobs.gem.com/scope-labs
-Scowtt Com,https://jobs.gem.com/scowtt-com
-Seated,https://jobs.gem.com/seated
-Seed2Series Com,https://jobs.gem.com/seed2series-com
-Seniorverse,https://jobs.gem.com/seniorverse
-Sennder Gmbh,https://jobs.gem.com/sennder-gmbh
-Senndertechnologies Gmbh,https://jobs.gem.com/senndertechnologies-gmbh
-Sensorum Health,https://jobs.gem.com/sensorum-health
-Serv Ai,https://jobs.gem.com/serv-ai
-Seven Starling,https://jobs.gem.com/seven-starling
-Shef Com,https://jobs.gem.com/shef-com
-Shorebird Dev,https://jobs.gem.com/shorebird-dev
-Showtime,https://jobs.gem.com/showtime
-Signoz,https://jobs.gem.com/signoz
-Silkline,https://jobs.gem.com/silkline
-Skypilot Co,https://jobs.gem.com/skypilot-co
-Slash,https://jobs.gem.com/slash
-Sleep Center,https://jobs.gem.com/sleep-center
-Smacktechnologies Com,https://jobs.gem.com/smacktechnologies-com
-Snout,https://jobs.gem.com/snout
-Softup Technologies,https://jobs.gem.com/softup-technologies
-Sohar Health,https://jobs.gem.com/sohar-health
-Soundhound,https://jobs.gem.com/soundhound
-Spawn,https://jobs.gem.com/spawn
-Spellbrush,https://jobs.gem.com/spellbrush
-Sphere Semi,https://jobs.gem.com/sphere-semi
-Ssg,https://jobs.gem.com/ssg
-Stack Auth Com,https://jobs.gem.com/stack-auth-com
-Startup People Solutions,https://jobs.gem.com/startup-people-solutions
-Stealth Startup,https://jobs.gem.com/stealth-startup
-Stockapp Com,https://jobs.gem.com/stockapp-com
-Stryke,https://jobs.gem.com/stryke
-Sunsethq Com,https://jobs.gem.com/sunsethq-com
-Super Hi Fi,https://jobs.gem.com/super-hi-fi
-Superblocks,https://jobs.gem.com/superblocks
-Supersonik Ai,https://jobs.gem.com/supersonik-ai
-Supio,https://jobs.gem.com/supio
-Suppliercanada Com,https://jobs.gem.com/suppliercanada-com
-Switchgrowth Com,https://jobs.gem.com/switchgrowth-com
-Symbolica,https://jobs.gem.com/symbolica
-Syndesus,https://jobs.gem.com/syndesus
-System Two Security,https://jobs.gem.com/system-two-security
-Taxgpt Inc ,https://jobs.gem.com/taxgpt-inc-
-Taxo Ai,https://jobs.gem.com/taxo-ai
-Tektome Com,https://jobs.gem.com/tektome-com
-Telora,https://jobs.gem.com/telora
-Tensorstax Com,https://jobs.gem.com/tensorstax-com
-Tenx Recruiting,https://jobs.gem.com/tenx-recruiting
-Terraai Earth,https://jobs.gem.com/terraai-earth
-Test Board,https://jobs.gem.com/test-board
-The Boring Company,https://jobs.gem.com/the-boring-company
-The Brewer Garrett Company,https://jobs.gem.com/the-brewer-garrett-company
-The Talent Project Com,https://jobs.gem.com/the-talent-project-com
-Theburntapp Com,https://jobs.gem.com/theburntapp-com
-Theinterface,https://jobs.gem.com/theinterface
-Thejobbridge,https://jobs.gem.com/thejobbridge
-Thelma,https://jobs.gem.com/thelma
-Theluckyfoundation,https://jobs.gem.com/theluckyfoundation
-Thenewclub Fyi,https://jobs.gem.com/thenewclub-fyi
-Theseus Us,https://jobs.gem.com/theseus-us
-Thinkific,https://jobs.gem.com/thinkific
-Third Dimension,https://jobs.gem.com/third-dimension
-Thrivory,https://jobs.gem.com/thrivory
-Thunder,https://jobs.gem.com/thunder
-Thunder Compute,https://jobs.gem.com/thunder-compute
-Timetoperform,https://jobs.gem.com/timetoperform
-Token Transit,https://jobs.gem.com/token-transit
-Toolhouse Ai,https://jobs.gem.com/toolhouse-ai
-Torchsystems Com,https://jobs.gem.com/torchsystems-com
-Transluce,https://jobs.gem.com/transluce
-Trashlab,https://jobs.gem.com/trashlab
-Tricentis,https://jobs.gem.com/tricentis
-Trilliumhiring Com,https://jobs.gem.com/trilliumhiring-com
-Tripworks Com,https://jobs.gem.com/tripworks-com
-Tristero,https://jobs.gem.com/tristero
-Trojan Trading,https://jobs.gem.com/trojan-trading
-Tropic,https://jobs.gem.com/tropic
-Trybree Com,https://jobs.gem.com/trybree-com
-Tryhelium Com,https://jobs.gem.com/tryhelium-com
-Tungsten Dev,https://jobs.gem.com/tungsten-dev
-Turbohome,https://jobs.gem.com/turbohome
-Twentyfour7 Dev,https://jobs.gem.com/twentyfour7-dev
-Unify Ai,https://jobs.gem.com/unify-ai
-Untolabs Com,https://jobs.gem.com/untolabs-com
-Up Labs,https://jobs.gem.com/up-labs
-Useful,https://jobs.gem.com/useful
-Usemalleable Com,https://jobs.gem.com/usemalleable-com
-Vamo Xyz,https://jobs.gem.com/vamo-xyz
-Vanguard Cleaning Systems,https://jobs.gem.com/vanguard-cleaning-systems
-Vantaca,https://jobs.gem.com/vantaca
-Vantager,https://jobs.gem.com/vantager
-Vantara Ai,https://jobs.gem.com/vantara-ai
-Vectorworks,https://jobs.gem.com/vectorworks
-Vectrasim,https://jobs.gem.com/vectrasim
-Veho Technologies,https://jobs.gem.com/veho-technologies
-Ventionteams Com,https://jobs.gem.com/ventionteams-com
-Venture Guides,https://jobs.gem.com/venture-guides
-Vercel Ats Sandbox,https://jobs.gem.com/vercel-ats-sandbox
-Vesseltalent Com,https://jobs.gem.com/vesseltalent-com
-Voker Ai,https://jobs.gem.com/voker-ai
-Voltai Com,https://jobs.gem.com/voltai-com
-Wayback Labs,https://jobs.gem.com/wayback-labs
-Webflow Ats Sandbox,https://jobs.gem.com/webflow-ats-sandbox
-Western Governors University,https://jobs.gem.com/western-governors-university
-Whatconverts,https://jobs.gem.com/whatconverts
-Wiseroad Recruiting Inc,https://jobs.gem.com/wiseroad-recruiting-inc
-Wizecamel,https://jobs.gem.com/wizecamel
-Wolfjaw Careers,https://jobs.gem.com/wolfjaw-careers
-Wonolo,https://jobs.gem.com/wonolo
-Woodsideai,https://jobs.gem.com/woodsideai
-Youtrip,https://jobs.gem.com/youtrip
-Zefi Ai,https://jobs.gem.com/zefi-ai
-Zep,https://jobs.gem.com/zep
-Zorrorx,https://jobs.gem.com/zorrorx
greenhouse.csv

@@ -1,6 +0,0 @@
-url,timestamp
-https://job-boards.eu.greenhouse.io/bcbgroup/jobs/4681083101?gh_src=cryptocurrencyjobs.co,2025-12-31T08:35:23.424931
-https://job-boards.greenhouse.io/securitize/jobs/4074121009?gh_src=cryptocurrencyjobs.co,2025-12-31T09:19:17.349713
-https://job-boards.eu.greenhouse.io/bcbgroup/jobs/4681102101?gh_src=cryptocurrencyjobs.co,2025-12-31T09:58:36.919216
-https://job-boards.greenhouse.io/kiosk/jobs/4427184005?gh_src=cryptocurrencyjobs.co,2025-12-31T10:10:51.176114
-https://job-boards.eu.greenhouse.io/bcbgroup/jobs/4681083101?gh_src=cryptocurrencyjobs.co,2025-12-31T11:02:31.869728
File diff suppressed because it is too large
lever.csv

@@ -1,7 +0,0 @@
-url,timestamp
-https://jobs.eu.lever.co/kaiko/3f7f3db9-4a6a-4047-8760-bc52c3d03e05?lever-origin=applied&lever-source%5B%5D=cryptocurrencyjobs.co,2025-12-31T09:20:28.542417
-https://jobs.lever.co/waterfall/880fb1b4-2515-4534-9970-53c497c82f12?lever-origin=applied&lever-source%5B%5D=cryptocurrencyjobs.co,2025-12-31T10:08:17.316072
-https://jobs.lever.co/obol-tech/fcccd493-54e4-425a-b9bd-82fa6f7e6aff?lever-origin=applied&lever-source%5B%5D=cryptocurrencyjobs.co,2025-12-31T10:35:29.164452
-https://jobs.eu.lever.co/coinspaid/7605e154-4b1d-45ee-b1d4-35edea13d80b?lever-origin=applied&lever-source%5B%5D=cryptocurrencyjobs.co,2025-12-31T10:51:38.852693
-https://jobs.lever.co/vedatechlabs/9c59c96c-2bb0-47b0-88fe-5d5a9fd85997?lever-origin=applied&lever-source%5B%5D=cryptocurrencyjobs.co,2025-12-31T11:02:16.120852
-https://jobs.eu.lever.co/kaiko/3f7f3db9-4a6a-4047-8760-bc52c3d03e05?lever-origin=applied&lever-source%5B%5D=cryptocurrencyjobs.co,2025-12-31T11:16:43.218273
levercompanies.csv (1792 changed lines)
File diff suppressed because it is too large
@@ -1,8 +0,0 @@
-url,timestamp
-https://www.linkedin.com/jobs/view/operations-analyst-at-amber-group-4325538653/?ref=cryptocurrencyjobs.co,2025-12-31T09:20:11.544002
-https://www.linkedin.com/jobs/view/hr-operations-intern-sg-at-matrixport-official-4338171692/?ref=cryptocurrencyjobs.co,2025-12-31T09:25:10.499933
-https://www.linkedin.com/jobs/view/operations-analyst-at-matrixport-official-4235087267/?ref=cryptocurrencyjobs.co,2025-12-31T09:33:53.104120
-https://www.linkedin.com/jobs/view/business-operations-analyst-at-matrixport-official-4215538150/?ref=cryptocurrencyjobs.co,2025-12-31T09:34:24.186519
-https://www.linkedin.com/jobs/view/graduate-hiring-business-operations-analyst-wealth-management-at-matrixport-official-4131687672/?ref=cryptocurrencyjobs.co,2025-12-31T09:36:47.038648
-https://www.linkedin.com/jobs/view/customer-support-specialist-at-matrixport-official-4323103235/?ref=cryptocurrencyjobs.co,2025-12-31T10:39:57.272414
-https://www.linkedin.com/jobs/view/finance-intern-at-amber-group-4248725225/?ref=cryptocurrencyjobs.co,2025-12-31T11:31:03.349275
llm_agent.py (218 changed lines)
@@ -21,12 +21,13 @@ class LLMJobRefiner:
             raise ValueError("DEEPSEEK_API_KEY not found in .env file.")
 
         # Database credentials from .env
+        self.db_url = os.getenv("DB_URL")
         self.db_username = os.getenv("DB_USERNAME")
         self.db_password = os.getenv("DB_PASSWORD")
         self.db_host = os.getenv("DB_HOST")
         self.db_port = os.getenv("DB_PORT")
 
-        if not self.db_username or not self.db_password:
+        if not self.db_url or not self.db_username or not self.db_password:
             raise ValueError("Database credentials not found in .env file.")
 
         # DeepSeek uses OpenAI-compatible API
@@ -40,12 +41,22 @@ class LLMJobRefiner:
     def _init_db(self):
         """Initialize PostgreSQL database connection and create table"""
         try:
-            conn = psycopg2.connect(
-                host=self.db_host,
-                port=self.db_port,
-                database="postgres",
-                user=self.db_username,
-                password=self.db_password
-            )
+            self.db_url = os.getenv("DB_URL")
+            if self.db_url and "supabase.com" in self.db_url:
+                conn = psycopg2.connect(
+                    host=self.db_host,
+                    port=self.db_port,
+                    database="postgres",
+                    user=self.db_username,
+                    password=self.db_password
+                )
+            else:
+                conn = psycopg2.connect(
+                    host=self.db_host,
+                    port=self.db_port,
+                    database="postgres",
+                    user=self.db_username,
+                    password=self.db_password
+                )
             cursor = conn.cursor()
 
@@ -102,8 +113,8 @@ class LLMJobRefiner:
             text = re.sub(r'\s+', ' ', text)
 
             # Limit length for LLM context
-            if len(text) > 100000:
-                text = text[:100000] + "..."
+            if len(text) > 10000:
+                text = text[:10000] + "..."
 
             return text
         except Exception as e:
@@ -117,7 +128,7 @@ class LLMJobRefiner:
             response = self.client.chat.completions.create(
                 model=self.model,
                 messages=[{"role": "user", "content": prompt}],
-                temperature=0.1,
+                temperature=0.2,
                 max_tokens=2048,
                 stream=False
             )
@@ -134,52 +145,38 @@ class LLMJobRefiner:
         posted_date = raw_data.get('posted_date', datetime.now().strftime("%m/%d/%y"))
 
         prompt = f"""
-You are an expert job posting data extractor. Your task is to extract AND infer fields from the provided job posting.
-
-### CORE RULES:
-1. **NEVER invent, summarize, or paraphrase** — extract **exact wording** when available.
-2. **For critical fields (title, company_name, job_id, url, description):**
-   - These MUST be present and meaningful.
-   - If not explicitly stated, **infer from context** (e.g., page title, headings, "About Us", etc.).
-   - **NEVER return "Not provided" or "N/A" for these fields.**
-3. **For optional fields (location, salary_range, etc.):**
-   - Extract exact text if present.
-   - If absent but inferable (e.g., "Remote (US)", "Full-time"), **infer it**.
-   - Only return "Not provided" if truly absent and non-inferable.
-
-### FIELD DEFINITIONS:
-- **title**: The job title. Look in <h1>, page title, or bold headings.
-- **company_name**: Company name. Look in logo alt text, footer, "About [X]", or page title.
-- **description**: Main job overview, responsibilities, or duties. Combine relevant paragraphs if needed. **Must not be empty.**
-- **requirements**: Required skills, experience, or qualifications.
-- **qualifications**: Educational or certification requirements.
-- **location**: Office location or remote policy.
-- **salary_range**: Exact compensation info.
-- **nature_of_work**: Employment type (Full-time, Contract, Internship, etc.).
-
-### OUTPUT FORMAT:
-Return ONLY a valid JSON object with these keys:
-{{
-    "title": "...",
-    "company_name": "...",
-    "location": "...",
-    "description": "...",
-    "requirements": "...",
-    "qualifications": "...",
-    "salary_range": "...",
-    "nature_of_work": "...",
-    "job_id": "{job_id}",
-    "url": "{url}"
-}}
-
-- **Critical fields must NEVER be "Not provided", "N/A", empty, or generic** (e.g., "Company", "Job Title").
-- **Optional fields may be "Not provided" ONLY if truly absent.**
-- **Do not include markdown, explanations, or extra text.**
-- **Use double quotes for JSON.**
-
-Page Content:
-{cleaned_content}
-"""
+You are a job posting data extractor.
+
+EXTRACT EXACT TEXT - DO NOT SUMMARIZE, PARAPHRASE, OR INVENT.
+
+For these critical fields, follow these rules:
+- description: Extract ALL job description text. If ANY job details exist (duties, responsibilities, overview), include them. Only use "Not provided" if absolutely no description exists.
+- requirements: Extract ALL requirements text. If ANY requirements exist (skills, experience, education needed), include them. Only use "Not provided" if none exist.
+- qualifications: Extract ALL qualifications text. If ANY qualifications exist, include them. Only use "Not provided" if none exist.
+
+REQUIRED FIELDS (must have valid values, never "N/A"):
+- title, company_name, job_id, url
+
+OPTIONAL FIELDS (can be "Not provided"):
+- location, salary_range, nature_of_work
+
+Page Content:
+{cleaned_content}
+
+Response format (ONLY return this JSON):
+{{
+    "title": "...",
+    "company_name": "...",
+    "location": "...",
+    "description": "...",
+    "requirements": "...",
+    "qualifications": "...",
+    "salary_range": "...",
+    "nature_of_work": "...",
+    "job_id": "{job_id}",
+    "url": "{url}"
+}}
+"""
 
         try:
             response_text = await asyncio.get_event_loop().run_in_executor(
@@ -191,23 +188,31 @@
             if not refined_data:
                 return None
 
-            # Validate critical fields — reject if missing or placeholder
-            critical_fields = ['title', 'company_name', 'job_id', 'url', 'description']
-            for field in critical_fields:
-                value = refined_data.get(field, "").strip()
-                if not value or value.lower() in ["n/a", "not provided", "unknown", "company", "job", "title", ""]:
-                    print(f" ❌ Critical field '{field}' is invalid: '{value}'")
-                    return None  # This job will NOT be saved — as per requirement
+            # Validate required fields
+            required_fields = ['title', 'company_name', 'job_id', 'url']
+            for field in required_fields:
+                if not refined_data.get(field) or refined_data[field].strip() in ["N/A", "", "Unknown", "Company", "Job"]:
+                    return None
 
-            # Optional fields: allow "Not provided", but ensure they're strings
-            optional_fields = ['location', 'requirements', 'qualifications', 'salary_range', 'nature_of_work']
-            for field in optional_fields:
-                if field not in refined_data:
-                    refined_data[field] = "Not provided"
-                elif not isinstance(refined_data[field], str):
-                    refined_data[field] = str(refined_data[field])
+            # CRITICAL: Validate content fields - check if they SHOULD exist
+            content_fields = ['description', 'requirements', 'qualifications']
+            cleaned_original = cleaned_content.lower()
+
+            # Simple heuristic: if page contains job-related keywords, content fields should NOT be "Not provided"
+            job_indicators = ['responsibilit', 'duties', 'require', 'qualifi', 'skill', 'experienc', 'educat', 'degree', 'bachelor', 'master']
+            has_job_content = any(indicator in cleaned_original for indicator in job_indicators)
+
+            if has_job_content:
+                for field in content_fields:
+                    value = refined_data.get(field, "").strip()
+                    if value in ["Not provided", "N/A", ""]:
+                        # LLM failed to extract existing content
+                        print(f" ⚠️ LLM returned '{value}' for {field} but job content appears present")
+                        return None
 
             # Add the posted_date to the refined data
             refined_data['posted_date'] = posted_date
 
             return refined_data
 
         except Exception as e:
@@ -215,22 +220,15 @@
             return None
 
     def _parse_llm_response(self, response_text: str) -> Dict[str, Any]:
         # Try to extract JSON from markdown code block
         json_match = re.search(r'```(?:json)?\s*({.*?})\s*```', response_text, re.DOTALL)
         if not json_match:
             # Try to find raw JSON object
-            json_match = re.search(r'\{[^{}]*\{[^{}]*\}[^{}]*\}|\{.*\}', response_text, re.DOTALL)
+            json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
             if not json_match:
                 return None
 
         try:
-            json_str = json_match.group(1) if '```' in response_text else json_match.group(0)
-            # Clean common issues
-            json_str = re.sub(r'\s+', ' ', json_str)
-            json_str = re.sub(r',\s*([\]}\)])', r'\1', json_str)  # Remove trailing commas
-            return json.loads(json_str)
-        except json.JSONDecodeError as e:
-            print(f"JSON parsing error: {e}")
+            return json.loads(json_match.group(1) if '```' in response_text else json_match.group(0))
+        except json.JSONDecodeError:
             return None
 
     async def save_job_data(self, job_data: Dict[str, Any], keyword: str):
@@ -241,11 +239,11 @@
         """Save job data to PostgreSQL database with job_id uniqueness"""
         try:
             conn = psycopg2.connect(
-                host=self.db_host,
-                port=self.db_port,
-                database="postgres",
-                user=self.db_username,
-                password=self.db_password
+                    host=self.db_host,
+                    port=self.db_port,
+                    database="postgres",
+                    user=self.db_username,
+                    password=self.db_password
             )
             cursor = conn.cursor()
 
@@ -256,50 +254,50 @@
                 VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                 ON CONFLICT (job_id) DO NOTHING
             ''', (
-                job_data.get("title", "Not provided"),
-                job_data.get("company_name", "Not provided"),
-                job_data.get("location", "Not provided"),
-                job_data.get("description", "Not provided"),
-                job_data.get("requirements", "Not provided"),
-                job_data.get("qualifications", "Not provided"),
-                job_data.get("salary_range", "Not provided"),
-                job_data.get("nature_of_work", "Not provided"),
-                job_data.get("job_id", "unknown"),
+                job_data.get("title", "N/A"),
+                job_data.get("company_name", "N/A"),
+                job_data.get("location", "N/A"),
+                job_data.get("description", "N/A"),
+                job_data.get("requirements", "N/A"),
+                job_data.get("qualifications", "N/A"),
+                job_data.get("salary_range", "N/A"),
+                job_data.get("nature_of_work", "N/A"),
+                job_data.get("job_id", "N/A"),
                 job_data.get("url", "N/A"),
-                job_data.get("category", "all"),
+                job_data.get("category", "N/A"),
                 job_data.get("scraped_at"),
-                job_data.get("posted_date", datetime.now().strftime("%m/%d/%y"))
+                job_data.get("posted_date", "N/A")
             ))
 
             conn.commit()
             cursor.close()
             conn.close()
 
-            print(f" 💾 Saved job to category '{job_data.get('category', 'all')}' with job_id: {job_data.get('job_id', 'unknown')}")
+            print(f" 💾 Saved job to category '{job_data.get('category', 'N/A')}' with job_id: {job_data.get('job_id', 'N/A')}")
 
         except Exception as e:
             print(f"❌ Database save error: {e}")
 
     async def _save_to_markdown(self, job_data: Dict[str, Any], keyword: str):
-        os.makedirs("crypto_jobs", exist_ok=True)
-        filepath = os.path.join("crypto_jobs", "crypto_jobs_scraped.md")
+        os.makedirs("linkedin_jobs", exist_ok=True)
+        filepath = os.path.join("linkedin_jobs", "linkedin_jobs_scraped.md")
         write_header = not os.path.exists(filepath) or os.path.getsize(filepath) == 0
 
         with open(filepath, "a", encoding="utf-8") as f:
             if write_header:
-                f.write(f"# Crypto Jobs - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
-            f.write(f"## Job: {job_data.get('title', 'Not provided')}\n\n")
+                f.write(f"# LinkedIn Jobs - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
+            f.write(f"## Job: {job_data.get('title', 'N/A')}\n\n")
             f.write(f"- **Keyword**: {keyword}\n")
-            f.write(f"- **Company**: {job_data.get('company_name', 'Not provided')}\n")
-            f.write(f"- **Location**: {job_data.get('location', 'Not provided')}\n")
-            f.write(f"- **Nature of Work**: {job_data.get('nature_of_work', 'Not provided')}\n")
-            f.write(f"- **Salary Range**: {job_data.get('salary_range', 'Not provided')}\n")
-            f.write(f"- **Job ID**: {job_data.get('job_id', 'unknown')}\n")
+            f.write(f"- **Company**: {job_data.get('company_name', 'N/A')}\n")
+            f.write(f"- **Location**: {job_data.get('location', 'N/A')}\n")
+            f.write(f"- **Nature of Work**: {job_data.get('nature_of_work', 'N/A')}\n")
+            f.write(f"- **Salary Range**: {job_data.get('salary_range', 'N/A')}\n")
+            f.write(f"- **Job ID**: {job_data.get('job_id', 'N/A')}\n")
             f.write(f"- **Posted Date**: {job_data.get('posted_date', 'N/A')}\n")
-            f.write(f"- **Category**: {job_data.get('category', 'all')}\n")
+            f.write(f"- **Category**: {job_data.get('category', 'N/A')}\n")
            f.write(f"- **Scraped At**: {job_data.get('scraped_at', 'N/A')}\n")
             f.write(f"- **URL**: <{job_data.get('url', 'N/A')}>\n\n")
-            f.write(f"### Description\n\n{job_data.get('description', 'Not provided')}\n\n")
-            f.write(f"### Requirements\n\n{job_data.get('requirements', 'Not provided')}\n\n")
-            f.write(f"### Qualifications\n\n{job_data.get('qualifications', 'Not provided')}\n\n")
+            f.write(f"### Description\n\n{job_data.get('description', 'N/A')}\n\n")
+            f.write(f"### Requirements\n\n{job_data.get('requirements', 'N/A')}\n\n")
+            f.write(f"### Qualifications\n\n{job_data.get('qualifications', 'N/A')}\n\n")
             f.write("---\n\n")
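For reference, the JSON-extraction change in _parse_llm_response above swaps the one-level-of-nesting alternation for a single greedy brace match. A small standalone check (the reply string is our own demo input, not from the repo):

import json
import re

new_pat = r'\{.*\}'  # the pattern added above: greedy, first '{' to last '}'

reply = 'Sure, here is the data: {"title": "Engineer", "company_name": "Acme"}'
match = re.search(new_pat, reply, re.DOTALL)
print(json.loads(match.group(0)))  # {'title': 'Engineer', 'company_name': 'Acme'}

Because the greedy match spans from the first '{' to the last '}', it can over-capture when a reply contains several objects, which is presumably why the method still tries the fenced code-block pattern first, as shown in the hunk.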
main.py (20 changed lines)
@@ -1,4 +1,3 @@
-
 from scraping_engine import FingerprintScrapingEngine
 from scraper import CryptoJobScraper  # Updated class name
 import os
@@ -21,15 +20,16 @@ async def main():
     scraper = CryptoJobScraper(engine, human_speed=1.3, user_request="Extract title, company, location, description, requirements, qualifications, nature of work, and salary")
 
     job_titles = [
-        "Customer Support",
-        "Design",
-        "Engineering",
-        "Finance",
-        "Marketing",
-        "Operations",
-        "Product",
-        "Sales"
+        "Blockchain Engineer",
+        "Smart Contract Developer",
+        "DeFi Analyst",
+        "Web3 Developer",
+        "Crypto Researcher",
+        "Solidity Developer",
+        "Protocol Engineer",
+        "Tokenomics Specialist",
+        "Zero-Knowledge Proof Engineer",
+        "Crypto Compliance Officer"
     ]
 
     while True:
@@ -1 +0,0 @@
-url,timestamp
File diff suppressed because it is too large
scraper.py (440 changed lines)
@@ -6,13 +6,11 @@ from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
 from browserforge.injectors.playwright import AsyncNewContext
 from llm_agent import LLMJobRefiner
 import re
 from fetcher import StealthyFetcher
 from datetime import datetime
 import json
 import redis
 from urllib.parse import urlparse
 import hashlib
 import csv
 import os
 
 
 class CryptoJobScraper:
     def __init__(
@@ -29,29 +27,6 @@ class CryptoJobScraper:
         self.llm_agent = LLMJobRefiner()
         self.redis_client = redis.Redis(host='localhost', port=6379, db=0, decode_responses=True)
 
-        self.FORBIDDEN_ATS_DOMAINS = [
-            'ashby', 'ashbyhq',
-            'greenhouse', 'boards.greenhouse.io',
-            'gem', 'gem.com',
-            'rippling',
-            'myworkday', 'myworkdayjobs',
-            'smartrecruiters',
-            'workable',
-            'lever', 'jobs.lever.co',
-            'linkedin.com'  # ✅ Added LinkedIn
-        ]
-
-        self.INVALID_CONTENT_PHRASES = [
-            "invalid job url",
-            "cookie consent",
-            "privacy policy",
-            "not a valid job",
-            "job not found",
-            "page not found",
-            "The requested job post could not be found. It may have been removed.",
-            "this page does not contain a job description"
-        ]
-
     async def _human_click(self, page, element, wait_after: bool = True):
         if not element:
             return False
@ -80,127 +55,60 @@ class CryptoJobScraper:
|
||||
matches = sum(1 for kw in keyword_list if kw in title_lower)
|
||||
return matches / len(keyword_list) if keyword_list else 0.0
|
||||
|
||||
async def _extract_job_title_from_card(self, card) -> str:
|
||||
try:
|
||||
title_selectors = [
|
||||
'h3', 'h2', 'h4',
|
||||
'strong', 'span'
|
||||
]
|
||||
for selector in title_selectors:
|
||||
title_element = await card.query_selector(selector)
|
||||
if title_element:
|
||||
title_text = await title_element.inner_text()
|
||||
if title_text and len(title_text.strip()) > 3:
|
||||
return title_text.strip()
|
||||
|
||||
card_text = await card.inner_text()
|
||||
lines = [line.strip() for line in card_text.split('\n') if line.strip()]
|
||||
if lines:
|
||||
for line in lines:
|
||||
if len(line) > 5 and not any(skip in line.lower() for skip in ['today', 'featured', 'yesterday', 'company', 'location']):
|
||||
return line
|
||||
return "Unknown Title"
|
||||
except:
|
||||
return "Unknown Title"
|

    async def _collect_job_elements_from_page(self, page, search_keywords: str, seen_slugs):
        job_cards = []
        job_found = False

        await asyncio.sleep(3 * self.human_speed)

        try:
            await page.wait_for_selector('a[href^="/"][href*="-"]', timeout=60000)
            candidates = await page.query_selector_all('a[href^="/"][href*="-"]')

            for link in candidates:
                href = await link.get_attribute("href") or ""
                href = href.rstrip('/')
                if not href or len(href.split('/')) != 3:
                    continue
                if '-' not in href.split('/')[-1]:
                    continue
                slug = href.split('/')[-1]
                if len(slug) < 8 or slug.startswith(('login', 'signup', 'about', 'terms')):
                    continue

                full_url = "https://cryptocurrencyjobs.co" + href if not href.startswith('http') else href
                if slug in seen_slugs:
                    continue

                title = await self._extract_job_title_from_card(link)
                if not title or title == "Unknown Title":
                    title = slug.replace('-', ' ').title()

    async def _scrape_jobs_from_current_page(self, page, search_keywords: str, seen_job_ids, all_job_links):
        current_links = await page.query_selector_all("a[href*='/job/']")
        new_jobs = 0

        for link in current_links:
            href = await link.get_attribute("href")
            if not href:
                continue
            if not href.startswith("http"):
                href = "https://cryptocurrencyjobs.co" + href
            # Strip a trailing slash so the last path segment is never empty.
            job_id = href.rstrip("/").split("/")[-1]

            if job_id and job_id not in seen_job_ids:
                title_element = await link.query_selector("h3, .job-title")
                title = (await title_element.inner_text()) if title_element else "Unknown Title"
                match_percentage = self._calculate_keyword_match(title, search_keywords)
                if match_percentage >= 0.4 or not search_keywords.strip():
                    seen_slugs.add(slug)
                    job_cards.append((full_url, title, link))
                    job_found = True

                if match_percentage >= 0.5:  # lower threshold than the LinkedIn scraper
                    seen_job_ids.add(job_id)
                    all_job_links.append((href, title))
                    new_jobs += 1
                else:
                    print(f" ⚠️ Skipping job due to low keyword match: {title[:50]}... (match: {match_percentage:.2%})")
        return new_jobs

            print(f" ✅ Collected {len(job_cards)} valid job links after filtering ({len(candidates)} raw candidates).")
    async def _handle_pagination(self, page, search_keywords: str, seen_job_ids, all_job_links):
        current_page = 1
        while True:
            print(f"📄 Processing page {current_page}")
            new_jobs = await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links)
            print(f" ➕ Found {new_jobs} new job(s) (total: {len(all_job_links)})")

        except Exception as e:
            print(f" ⚠️ Error collecting job cards: {e}")

        if not job_found:
            print(" ❌ No valid job listings passed filters.")

        return job_cards

    async def _handle_pagination_and_collect_all(self, page, search_keywords: str, seen_slugs):
        all_job_elements = []
        scroll_attempt = 0
        max_scrolls = 40
        prev_count = 0

        while scroll_attempt < max_scrolls:
            print(f" Scroll attempt {scroll_attempt + 1} | Current total jobs: {len(all_job_elements)}")

            page_elements = await self._collect_job_elements_from_page(page, search_keywords, seen_slugs)
            all_job_elements.extend(page_elements)

            current_count = len(all_job_elements)

            if current_count == prev_count and scroll_attempt > 3:
                print(" 🔚 No new jobs after several scrolls → assuming end of list.")
                next_btn = await page.query_selector('a[rel="next"]')
                if next_btn:
                    next_url = await next_btn.get_attribute("href")
                    if next_url and not next_url.startswith("http"):
                        next_url = "https://cryptocurrencyjobs.co" + next_url
                    await page.goto(next_url, timeout=120000)
                    await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)
                    current_page += 1
                else:
                    print("🔚 No 'Next' page — stopping pagination.")
                    break

            prev_count = current_count

            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            await asyncio.sleep(random.uniform(2.5, 5.5) * self.human_speed)

            try:
                load_more = await page.query_selector(
                    'button:has-text("Load more"), button:has-text("More"), div[role="button"]:has-text("Load"), a:has-text("Load more")'
                )
                if load_more:
                    print(" Found 'Load more' button → clicking...")
                    await self._human_click(page, load_more)
                    await asyncio.sleep(random.uniform(3.0, 6.0) * self.human_speed)
            except Exception:
                pass

            scroll_attempt += 1

        print(f" Finished scrolling → collected {len(all_job_elements)} unique job links.")
        return all_job_elements

    async def _extract_job_posted_date_from_card(self, card) -> str:
    async def _extract_job_posted_date(self, page) -> str:
        try:
            card_text = await card.inner_text()
            if "Today" in card_text:
                return datetime.now().strftime("%m/%d/%y")
            elif "Yesterday" in card_text:
                return (datetime.now() - timedelta(days=1)).strftime("%m/%d/%y")
            else:
                match = re.search(r'(\d+)d', card_text)
                if match:
                    days = int(match.group(1))
                    return (datetime.now() - timedelta(days=days)).strftime("%m/%d/%y")
            date_element = await page.query_selector(".job-posted-date, .job-date, time")
            if date_element:
                date_text = await date_element.inner_text()
                if "Today" in date_text:
                    return datetime.now().strftime("%m/%d/%y")
                elif "Yesterday" in date_text:
                    # timedelta is safe across month boundaries, unlike .replace(day=day - 1)
                    yesterday = datetime.now() - timedelta(days=1)
                    return yesterday.strftime("%m/%d/%y")
                else:
                    return datetime.now().strftime("%m/%d/%y")
        except Exception:
            pass
        return datetime.now().strftime("%m/%d/%y")
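Both versions of the date helper reduce the page's relative labels ("Today", "Yesterday", "3d") to a mm/dd/yy string. A standalone restatement of that mapping, with assumed inputs, makes the behavior easy to spot-check:

import re
from datetime import datetime, timedelta

def relative_to_date(text: str, today: datetime) -> str:
    # Same mapping as the scraper's date helpers, as a pure function.
    if "Today" in text:
        return today.strftime("%m/%d/%y")
    if "Yesterday" in text:
        return (today - timedelta(days=1)).strftime("%m/%d/%y")
    m = re.search(r'(\d+)d', text)  # note: matches any digits followed by 'd'
    if m:
        return (today - timedelta(days=int(m.group(1)))).strftime("%m/%d/%y")
    return today.strftime("%m/%d/%y")  # fallback, as in the scraper

now = datetime(2025, 12, 31)
print(relative_to_date("Posted Today", now))       # 12/31/25
print(relative_to_date("Featured · 3d ago", now))  # 12/28/25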

@@ -218,103 +126,15 @@ class CryptoJobScraper:
        except Exception as e:
            print(f" ❌ Failed to add job to Redis cache: {str(e)}")

    async def _is_forbidden_ats_url(self, url: str) -> bool:
        url_lower = url.lower()
        return any(domain in url_lower for domain in self.FORBIDDEN_ATS_DOMAINS)

    def _get_ats_platform_name(self, url: str) -> str:
        """Return canonical ATS name based on URL (e.g., 'ashby', 'greenhouse')."""
        url_lower = url.lower()

        # Order matters: more specific domains are checked first.
        if 'boards.greenhouse.io' in url_lower:
            return 'greenhouse'
        elif 'jobs.lever.co' in url_lower:
            return 'lever'
        elif 'myworkdayjobs' in url_lower or 'myworkday' in url_lower:
            return 'workday'
        elif 'linkedin.com' in url_lower:
            return 'linkedin'
        elif 'ashbyhq.com' in url_lower or 'ashby' in url_lower:
            return 'ashby'
        elif 'gem.com' in url_lower or 'gem' in url_lower:
            return 'gem'
        elif 'rippling' in url_lower:
            return 'rippling'
        elif 'smartrecruiters' in url_lower:
            return 'smartrecruiters'
        elif 'workable' in url_lower:
            return 'workable'
        else:
            # Fallback: match the forbidden list against the hostname only.
            try:
                parsed = urlparse(url)
                domain = parsed.netloc.lower()
                for forbidden in self.FORBIDDEN_ATS_DOMAINS:
                    if forbidden in domain:
                        return forbidden.split('.')[0] if '.' in forbidden else forbidden
            except Exception:
                pass
            return 'forbidden_ats'
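Since every branch is a plain substring test, declaration order is what routes a boards.greenhouse.io URL to 'greenhouse' rather than a broader match, and short tokens can still over-match. A hypothetical spot check (the URLs and the `__init__` bypass are illustrative only, not from the diff):

scraper = CryptoJobScraper.__new__(CryptoJobScraper)  # skip __init__ just for this sketch
scraper.FORBIDDEN_ATS_DOMAINS = ['ashby', 'gem', 'workable']  # only the fallback reads this

print(scraper._get_ats_platform_name("https://jobs.ashbyhq.com/example/123"))       # ashby
print(scraper._get_ats_platform_name("https://apply.workable.com/acme/j/ABC123"))   # workable
print(scraper._get_ats_platform_name("https://boards.greenhouse.io/acme/jobs/1"))   # greenhouse
print(scraper._get_ats_platform_name("https://www.gemini.com/careers"))             # gem (substring false positive)

The last case matters in practice: such a false positive would then be appended to gem.csv by `_log_forbidden_ats_url`, which is worth keeping in mind when auditing those CSV files.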

    def _log_forbidden_ats_url(self, url: str, platform: str):
        """Append a forbidden URL to {platform}.csv."""
        filename = f"{platform}.csv"
        file_exists = os.path.isfile(filename)
        with open(filename, 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            if not file_exists:
                writer.writerow(['url', 'timestamp'])
            writer.writerow([url, datetime.now().isoformat()])
        print(f" 📥 Logged forbidden ATS URL to {filename}: {url}")

    async def _is_invalid_job_page(self, page_content: str) -> bool:
        content_lower = page_content.lower()
        return any(phrase in content_lower for phrase in self.INVALID_CONTENT_PHRASES)

    def _extract_job_id_from_url(self, url: str) -> Optional[str]:
        try:
            parsed = urlparse(url)
            path_parts = [p for p in parsed.path.split('/') if p]
            if not path_parts:
                return None

            candidate = path_parts[-1]
            candidate = re.split(r'[?#]', candidate)[0]
            candidate = re.sub(r'\.html?$', '', candidate)

            # Require at least one digit so plain text slugs are rejected.
            if not candidate or not any(c.isdigit() for c in candidate):
                return None

            # Reject candidates that look like prose (long word followed by whitespace).
            if re.search(r'[A-Za-z]{6,}\s', candidate):
                return None

            return candidate
        except Exception:
            return None
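A few worked inputs for this heuristic, restated as a standalone helper with illustrative URLs (none of these values come from the diff):

import re
from urllib.parse import urlparse

def extract_job_id(url: str):
    # Same steps as _extract_job_id_from_url, pulled out for illustration.
    parts = [p for p in urlparse(url).path.split('/') if p]
    if not parts:
        return None
    cand = re.sub(r'\.html?$', '', re.split(r'[?#]', parts[-1])[0])
    if not cand or not any(c.isdigit() for c in cand):
        return None  # digit-free slug; callers fall back to a hashed id
    if re.search(r'[A-Za-z]{6,}\s', cand):
        return None
    return cand

print(extract_job_id("https://apply.workable.com/acme/j/C54DFC9985/"))  # C54DFC9985
print(extract_job_id("https://example.com/jobs/senior-rust-engineer"))  # None (no digit)
print(extract_job_id("https://example.com/careers/role-1234.html"))     # role-1234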

    async def scrape_jobs(
        self,
        search_keywords: Optional[str],
        max_pages: int = 1,
        credentials: Optional[Dict] = None
    ):
        query = ""
        location = ""
        if search_keywords and search_keywords.strip():
            parts = search_keywords.split(',', 1)
            query = parts[0].strip()
            if len(parts) > 1:
                location = parts[1].strip()

        clean_query = query.replace(' ', '+')
        clean_location = location.replace(' ', '+')

        search_url = "https://cryptocurrencyjobs.co/"
        if clean_query:
            search_url += f"?query={clean_query}"
        if clean_location:
            search_url += f"&location={clean_location}"
        # cryptocurrencyjobs.co uses URL params differently
        encoded_keywords = search_keywords.replace(" ", "%20")
        search_url = f"https://cryptocurrencyjobs.co/?q={encoded_keywords}"

        profile = self.engine._select_profile()
        renderer = random.choice(self.engine.common_renderers[self.engine.os])
@@ -336,103 +156,46 @@ class CryptoJobScraper:
        await context.add_init_script(spoof_script)

        page = await context.new_page()
        print(f"🔍 Searching for: {search_keywords or 'all jobs'}")
        print(f" 🔗 URL: {search_url}")
        await page.goto(search_url, wait_until='networkidle', timeout=120000)

        # Fetch main search page
        print(f"🔍 Searching for: {search_keywords}")
        await page.goto(search_url, wait_until='load', timeout=120000)
        await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)

        try:
            await page.wait_for_selector("a[href^='/'][href*='-']", timeout=10000)
        except Exception:
            print(" ⚠️ No job links found initially, waiting longer...")
            await asyncio.sleep(5 * self.human_speed)
        all_job_links = []
        seen_job_ids = set()

        seen_slugs = set()
        all_job_elements = await self._handle_pagination_and_collect_all(page, search_keywords, seen_slugs)
        print(f"✅ Collected {len(all_job_elements)} unique job links.")
        print("🔄 Collecting job links from search results...")
        await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links)
        await self._handle_pagination(page, search_keywords, seen_job_ids, all_job_links)

        print(f"✅ Collected {len(all_job_links)} unique job links.")

        scraped_count = 0
        for idx, (href, title, job_element) in enumerate(all_job_elements):
            job_detail_page = None
            apply_page = None
            skip_job = False
            final_scrape_url = None
        for idx, (href, title) in enumerate(all_job_links):
            try:
                print(f" → Processing job {idx+1}/{len(all_job_elements)}: {title}")
                full_url = href
                print(f" → Opening job {idx+1}/{len(all_job_links)}: {full_url}")

                posted_date = await self._extract_job_posted_date_from_card(job_element)
                fetcher = StealthyFetcher(self.engine, browser, context)
                job_page = await fetcher.fetch_url(full_url, wait_for_selector="h1")
                if not job_page:
                    print(f" ❌ Failed to fetch job page {full_url}")
                    await self._add_job_to_redis_cache(full_url, full_url.split("/")[-1], "fetch_failure")
                    self.engine.report_outcome("fetch_failure", url=full_url)
                    continue

                job_detail_page = await context.new_page()
                await job_detail_page.goto(href, wait_until='networkidle', timeout=60000)
                posted_date = await self._extract_job_posted_date(job_page)

                await self.engine._human_like_scroll(job_page)
                await asyncio.sleep(2 * self.human_speed)
                page_content = await self._extract_page_content_for_llm(job_page)

                page_content = await job_detail_page.content()
                if await self._is_invalid_job_page(page_content):
                    print(" 🚫 Page contains invalid content → skipping.")
                    await job_detail_page.close()
                    continue

                apply_clicked = False
                apply_selectors = [
                    'a[href*="apply"], a:text("Apply"), a:text("Apply Now"), a:text("Apply here")',
                    'button:text("Apply"), button:has-text("Apply")',
                    '[data-testid="apply-button"], [aria-label*="apply"], [role="button"]:has-text("Apply")',
                    'a.btn-apply, .apply-button, .apply-link, a:has-text("Apply")',
                    'a[rel="noopener"]:has-text("Apply")',
                ]

                for sel in apply_selectors:
                    apply_elem = await job_detail_page.query_selector(sel)
                    if apply_elem:
                        print(f" 🔗 Found Apply element with selector: {sel}")
                        await self._human_click(job_detail_page, apply_elem, wait_after=True)
                        apply_clicked = True
                        break

                apply_page = job_detail_page

                if apply_clicked:
                    await asyncio.sleep(random.uniform(3.0, 6.0) * self.human_speed)
                    pages = context.pages
                    new_pages = [p for p in pages if p != job_detail_page and p.url != "about:blank"]

                    if new_pages:
                        candidate_page = new_pages[-1]
                        new_url = candidate_page.url.strip()
                        print(f" New tab opened: {new_url}")

                        if new_url and await self._is_forbidden_ats_url(new_url):
                            platform = self._get_ats_platform_name(new_url)
                            self._log_forbidden_ats_url(new_url, platform)
                            if candidate_page != job_detail_page:
                                await candidate_page.close()
                            await job_detail_page.close()
                            skip_job = True
                        else:
                            apply_page = candidate_page
                    else:
                        print(" No new tab → using original page.")

                if skip_job:
                    continue

                final_scrape_url = apply_page.url

                page_content = await self._extract_page_content_for_llm(apply_page)
                if await self._is_invalid_job_page(page_content):
                    print(" 🚫 Final page contains invalid content → skipping.")
                    if apply_page != job_detail_page:
                        await apply_page.close()
                    await job_detail_page.close()
                    continue

                job_id = self._extract_job_id_from_url(final_scrape_url)
                if not job_id:
                    job_id = "job_" + hashlib.md5(final_scrape_url.encode()).hexdigest()[:12]
                # Strip a trailing slash so the last path segment is never empty.
                job_id = full_url.rstrip("/").split("/")[-1] or "unknown"

                raw_data = {
                    "page_content": page_content,
                    "url": final_scrape_url,
                    "url": full_url,
                    "job_id": job_id,
                    "search_keywords": search_keywords,
                    "posted_date": posted_date
@@ -447,45 +210,44 @@ class CryptoJobScraper:
                        if field == 'job_id':
                            refined_data[field] = job_id
                        elif field == 'url':
                            refined_data[field] = final_scrape_url
                            refined_data[field] = full_url
                        elif field == 'company_name':
                            refined_data[field] = "Unknown Company"

                    refined_data['scraped_at'] = datetime.now().isoformat()
                    refined_data['category'] = search_keywords or "all"
                    refined_data['category'] = search_keywords
                    refined_data['posted_date'] = posted_date
                    await self.llm_agent.save_job_data(refined_data, search_keywords or "all")
                    await self.llm_agent.save_job_data(refined_data, search_keywords)
                    scraped_count += 1
                    print(f" ✅ Scraped: {refined_data['title'][:50]}... (Job ID: {job_id})")
                    self.engine.report_outcome("success", url=final_scrape_url)
                    print(f" ✅ Scraped and refined: {refined_data['title'][:50]}...")
                    self.engine.report_outcome("success", url=raw_data["url"])
                else:
                    print(f" 🟡 Could not extract meaningful data from: {final_scrape_url}")
                    await self._add_job_to_redis_cache(final_scrape_url, job_id, "llm_failure")
                    self.engine.report_outcome("llm_failure", url=final_scrape_url)
                    print(f" 🟡 Could not extract meaningful data from: {full_url}")
                    await self._add_job_to_redis_cache(full_url, job_id, "llm_failure")
                    self.engine.report_outcome("llm_failure", url=raw_data["url"])

                if apply_page != job_detail_page and not apply_page.is_closed():
                    await apply_page.close()
                if job_detail_page and not job_detail_page.is_closed():
                    await job_detail_page.close()
                await job_page.close()

            except Exception as e:
                error_msg = str(e)[:100]
                print(f" ⚠️ Failed on job {idx+1}: {error_msg}")
                job_id_for_log = "unknown"
                if 'final_scrape_url' in locals() and final_scrape_url:
                    job_id_for_log = "job_" + hashlib.md5(final_scrape_url.encode()).hexdigest()[:12]
                await self._add_job_to_redis_cache(href, job_id_for_log, f"exception: {error_msg}")
                if job_detail_page and not job_detail_page.is_closed():
                    await job_detail_page.close()
                if 'apply_page' in locals() and apply_page and apply_page != job_detail_page and not apply_page.is_closed():
                    await apply_page.close()
                job_id = full_url.split("/")[-1] if 'full_url' in locals() else "unknown"
                job_url = full_url if 'full_url' in locals() else "unknown"
                await self._add_job_to_redis_cache(job_url, job_id, f"exception: {error_msg}")
                if 'job_page' in locals() and job_page:
                    await job_page.close()
                continue

            finally:
                print(" ↩️ Returning to search results...")
                await page.goto(search_url, timeout=120000)
                await asyncio.sleep(4 * self.human_speed)

        await browser.close()

        if scraped_count > 0:
            self.engine.report_outcome("success")
            print(f"✅ Completed! Processed {scraped_count} jobs for '{search_keywords or 'all jobs'}'.")
            print(f"✅ Completed! Processed {scraped_count} jobs for '{search_keywords}' based on request '{self.user_request}'.")
        else:
            self.engine.report_outcome("scraping_error")
            print("⚠️ No jobs processed successfully.")

@@ -1,5 +0,0 @@
url,timestamp
https://apply.workable.com/thetie/j/C54DFC9985/?ref=cryptocurrencyjobs.co,2025-12-31T08:24:45.755671
https://apply.workable.com/thetie/j/C54DFC9985/?ref=cryptocurrencyjobs.co,2025-12-31T09:51:08.343642
https://apply.workable.com/thetie/j/2745433865/?ref=cryptocurrencyjobs.co,2025-12-31T09:51:28.331543
https://apply.workable.com/thetie/j/1A6C8F2913/?ref=cryptocurrencyjobs.co,2025-12-31T11:22:54.623723
File diff suppressed because it is too large
Load Diff
@@ -1 +0,0 @@
url,timestamp
1045
workdaycompanies.csv
File diff suppressed because it is too large
Load Diff