Compare commits
2 commits (38ef08c734 ... b0e90972b1)

| Author | SHA1 | Date |
|---|---|---|
| | b0e90972b1 | |
| | 06f8e8b086 | |
ashby.csv · 10 · Normal file
@@ -0,0 +1,10 @@
url,timestamp
https://jobs.ashbyhq.com/stellar/a8377cf4-280b-4eb3-ac44-a4c9020c2eaf?utm_source=cryptocurrencyjobs.co,2025-12-31T08:32:17.821505
https://jobs.ashbyhq.com/artemisanalytics/5f61b6c6-147c-4707-9003-a9632455b984?utm_source=cryptocurrencyjobs.co,2025-12-31T08:51:57.190172
https://jobs.ashbyhq.com/lightning/2d77b496-ab0d-4e54-bcf8-33260d1bab6b?utm_source=cryptocurrencyjobs.co,2025-12-31T09:07:09.491831
https://jobs.ashbyhq.com/Braiins/cee9cf74-6049-4dab-aae7-96bef0082689?utm_source=cryptocurrencyjobs.co,2025-12-31T09:35:28.137181
https://jobs.ashbyhq.com/blockstream/80ebab98-0039-48bf-86d9-9a2a7962b005?utm_source=cryptocurrencyjobs.co,2025-12-31T10:21:19.253356
https://jobs.ashbyhq.com/dynamic/fde8a9ff-9701-485f-a8d1-e717c170f215?utm_source=cryptocurrencyjobs.co,2025-12-31T10:25:55.141543
https://jobs.ashbyhq.com/ether.fi/6eb1e350-71ce-47f7-a363-3fa3c521dacb?utm_source=cryptocurrencyjobs.co,2025-12-31T10:44:35.913725
https://chainlinklabs.com/open-roles?ashby_jid=112a76d3-4dfd-4eea-828c-41465760b3ef&utm_source=ccj,2025-12-31T10:49:07.453900
https://jobs.ashbyhq.com/stellar/cdad9af1-9e64-4fd4-8e2c-f87389f1dd16?utm_source=cryptocurrencyjobs.co,2025-12-31T11:13:58.119967
ashbycompanies.csv · 1591 · Normal file
File diff suppressed because it is too large
comparator.py · 166 · Normal file
@@ -0,0 +1,166 @@
import csv
import os
from urllib.parse import urlparse

# Define platform mappings: (input_file, companies_file, platform_name)
platforms = [
    ("ashby.csv", "ashbycompanies.csv", "ashby"),
    ("gem.csv", "gemcompanies.csv", "gem"),
    ("greenhouse.csv", "greenhousecompanies.csv", "greenhouse"),
    ("lever.csv", "levercompanies.csv", "lever"),
    ("rippling.csv", "ripplingcompanies.csv", "rippling"),
    ("workable.csv", "workablecompanies.csv", "workable"),
    ("workday.csv", "workdaycompanies.csv", "workday"),
]


def normalize_url(platform, url):
    """Normalize URL to a company identifier based on platform."""
    if not url:
        return None
    try:
        parsed = urlparse(url.lower().strip())
        netloc = parsed.netloc
        path = parsed.path

        if platform == "ashby":
            # https://jobs.ashbyhq.com/company_slug/...
            if "ashbyhq.com" in netloc:
                parts = [p for p in path.split('/') if p]
                return parts[0] if parts else None

        elif platform == "greenhouse":
            # https://boards.greenhouse.io/company_slug/...
            if "greenhouse.io" in netloc:
                parts = [p for p in path.split('/') if p]
                if len(parts) >= 2 and parts[0] == "boards":
                    return parts[1]
                elif len(parts) >= 1:
                    return parts[0]
                return None

        elif platform == "lever":
            # https://jobs.lever.co/company_slug/...
            if "lever.co" in netloc:
                parts = [p for p in path.split('/') if p]
                return parts[0] if parts else None

        elif platform == "workable":
            # https://apply.workable.com/company_slug/...
            if "workable.com" in netloc:
                parts = [p for p in path.split('/') if p]
                # Usually: /company_slug/j/jobid/ → take first non-'j' segment
                for part in parts:
                    if part != 'j' and len(part) > 2:
                        return part
                return parts[0] if parts else None

        elif platform == "workday":
            # https://company.workday.com/... → company = subdomain
            if "myworkdayjobs.com" in netloc or "wd" in netloc:
                # Extract subdomain before main domain
                subdomain = netloc.split('.')[0]
                if subdomain and subdomain not in ['www', 'jobs', 'apply', '']:
                    return subdomain
                # Fallback: look for company in path (rare)
                parts = [p for p in path.split('/') if p]
                if parts:
                    return parts[0]
            return None

        elif platform == "gem":
            # https://gem.com/company/... or https://www.gem.com/careers/company/...
            if "gem.com" in netloc:
                parts = [p for p in path.split('/') if p]
                # Often: /company-slug or /careers/company-slug
                for i, part in enumerate(parts):
                    if part in ['company', 'careers', 'jobs']:
                        if i + 1 < len(parts):
                            return parts[i + 1]
                return parts[0] if parts else None

        elif platform == "rippling":
            # Rippling uses generic domain; hard to extract company
            # Best effort: use full domain + first path segment
            if "rippling.com" in netloc:
                parts = [p for p in path.split('/') if p]
                if parts:
                    return f"{netloc}/{parts[0]}"
                return netloc

        # Fallback: return full URL if unrecognized
        return url

    except Exception:
        return url


def read_company_signatures(filepath, platform):
    """Read and normalize company identifiers from companies CSV."""
    if not os.path.exists(filepath):
        return set()
    signatures = set()
    with open(filepath, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            url = row.get('url', '').strip()
            if url:
                sig = normalize_url(platform, url)
                if sig:
                    signatures.add(sig)
    return signatures


def filter_csv_by_signatures(input_file, excluded_signatures, platform):
    """Keep only rows whose normalized URL is NOT in excluded_signatures."""
    if not os.path.exists(input_file):
        return [], None
    kept_rows = []
    with open(input_file, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        fieldnames = reader.fieldnames
        for row in reader:
            url = row.get('url', '').strip()
            if not url:
                kept_rows.append(row)  # keep if no URL (shouldn't happen)
                continue
            sig = normalize_url(platform, url)
            if sig not in excluded_signatures:
                kept_rows.append(row)
    return kept_rows, fieldnames


def write_csv(filepath, rows, fieldnames):
    """Write rows to CSV file."""
    with open(filepath, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


def main():
    for input_file, companies_file, platform in platforms:
        print(f"Processing {input_file} against {companies_file} using '{platform}' normalizer...")

        # Step 1: Load and normalize known company signatures
        known_signatures = read_company_signatures(companies_file, platform)
        print(f" → Loaded {len(known_signatures)} known company signatures from {companies_file}")

        # Step 2: Filter input file using signatures
        kept_rows, fieldnames = filter_csv_by_signatures(input_file, known_signatures, platform)

        # Step 3: Write back filtered data
        if fieldnames:
            write_csv(input_file, kept_rows, fieldnames)
            print(f" → Kept {len(kept_rows)} new job URLs in {input_file}")
        else:
            if os.path.exists(input_file):
                os.remove(input_file)
            print(f" → {input_file} was empty or invalid — removed.")

    print("\n✅ All platforms processed successfully.")


if __name__ == "__main__":
    main()
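A quick smoke test of the new normalizer (not part of the commit; the expected slugs are read off the URL patterns in the CSVs above, and it assumes comparator.py is importable from the repo root):

# Hypothetical check against URLs taken from ashby.csv, greenhouse.csv, and lever.csv.
from comparator import normalize_url

assert normalize_url("ashby", "https://jobs.ashbyhq.com/stellar/a8377cf4-280b-4eb3-ac44-a4c9020c2eaf") == "stellar"
assert normalize_url("greenhouse", "https://job-boards.eu.greenhouse.io/bcbgroup/jobs/4681083101") == "bcbgroup"
assert normalize_url("lever", "https://jobs.lever.co/waterfall/880fb1b4-2515-4534-9970-53c497c82f12") == "waterfall"
# Hosts the platform branch does not recognize fall through to the full original URL:
url = "https://chainlinklabs.com/open-roles?ashby_jid=112a76d3-4dfd-4eea-828c-41465760b3ef"
assert normalize_url("ashby", url) == url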
fetcher.py
@@ -27,7 +27,7 @@ class StealthyFetcher:
         if wait_for_selector:
             try:
-                await page.wait_for_selector(wait_for_selector, timeout=40000)
+                await page.wait_for_selector(wait_for_selector, timeout=120000)
             except PlaywrightTimeoutError:
                 print(f"Selector {wait_for_selector} not found immediately, continuing...")

@@ -88,7 +88,7 @@ class StealthyFetcher:
     async def _is_content_accessible(self, page: Page, wait_for_selector: Optional[str] = None) -> bool:
         if wait_for_selector:
             try:
-                await page.wait_for_selector(wait_for_selector, timeout=40000)
+                await page.wait_for_selector(wait_for_selector, timeout=120000)
                 return True
             except PlaywrightTimeoutError:
                 pass
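Both hunks make the same change: the selector wait goes from 40 s to 120 s while PlaywrightTimeoutError is still caught, so slow ATS pages get more time without aborting the run. The shared pattern as a standalone sketch (hypothetical helper name, not part of the diff):

from playwright.async_api import Page, TimeoutError as PlaywrightTimeoutError

async def wait_for_selector_safe(page: Page, selector: str, timeout_ms: int = 120000) -> bool:
    # Wait up to timeout_ms for the selector; treat a timeout as a soft failure.
    try:
        await page.wait_for_selector(selector, timeout=timeout_ms)
        return True
    except PlaywrightTimeoutError:
        return False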
gemcompanies.csv · 508 · Normal file
@@ -0,0 +1,508 @@
name,url
10Xconstruction Ai,https://jobs.gem.com/10xconstruction-ai
11X Ai,https://jobs.gem.com/11x-ai
43North,https://jobs.gem.com/43north
8020 Consulting,https://jobs.gem.com/8020-consulting
A16Z Speedrun,https://jobs.gem.com/a16z-speedrun
Aarden Ai,https://jobs.gem.com/aarden-ai
Accel,https://jobs.gem.com/accel
Accelos,https://jobs.gem.com/accelos
Acre,https://jobs.gem.com/acre
Advancelevelllc Com,https://jobs.gem.com/advancelevelllc-com
Agenta Ai,https://jobs.gem.com/agenta-ai
Agentnoon,https://jobs.gem.com/agentnoon
Agora,https://jobs.gem.com/agora
Aionex Xyz,https://jobs.gem.com/aionex-xyz
Aiphrodite Ai,https://jobs.gem.com/aiphrodite-ai
Airframe,https://jobs.gem.com/airframe
Airvet Com,https://jobs.gem.com/airvet-com
Alex And Ani,https://jobs.gem.com/alex-and-ani
Alinia Ai,https://jobs.gem.com/alinia-ai
Alitheon,https://jobs.gem.com/alitheon
Alpharun,https://jobs.gem.com/alpharun
Altzero Xyz,https://jobs.gem.com/altzero-xyz
Amya Agency,https://jobs.gem.com/amya-agency
Andrenam,https://jobs.gem.com/andrenam
Anysphere,https://jobs.gem.com/anysphere
Aoniclife,https://jobs.gem.com/aoniclife
Apartment List,https://jobs.gem.com/apartment-list
Apella,https://jobs.gem.com/apella
Apticore Io,https://jobs.gem.com/apticore-io
Arlo,https://jobs.gem.com/arlo
Ascenda Loyalty,https://jobs.gem.com/ascenda-loyalty
Ascendarc,https://jobs.gem.com/ascendarc
Astroforge Io,https://jobs.gem.com/astroforge-io
Atla Ai Com,https://jobs.gem.com/atla-ai-com
Atomica,https://jobs.gem.com/atomica
Audicus,https://jobs.gem.com/audicus
Aurelian Io,https://jobs.gem.com/aurelian-io
Aureliussystems Us,https://jobs.gem.com/aureliussystems-us
Autopilotbrand Com,https://jobs.gem.com/autopilotbrand-com
Avoca,https://jobs.gem.com/avoca
Avol,https://jobs.gem.com/avol
Axonify,https://jobs.gem.com/axonify
Backops Ai,https://jobs.gem.com/backops-ai
Basalt Health,https://jobs.gem.com/basalt-health
Baxter Aerospace,https://jobs.gem.com/baxter-aerospace
Bead Ai,https://jobs.gem.com/bead-ai
Benbase,https://jobs.gem.com/benbase
Better Auth,https://jobs.gem.com/better-auth
Betterbasket Ai,https://jobs.gem.com/betterbasket-ai
Bigeye,https://jobs.gem.com/bigeye
Bigpanda,https://jobs.gem.com/bigpanda
Bikky,https://jobs.gem.com/bikky
Bilt,https://jobs.gem.com/bilt
Binarly,https://jobs.gem.com/binarly
Biofire,https://jobs.gem.com/biofire
Biorender,https://jobs.gem.com/biorender
Biorender Inc Ats,https://jobs.gem.com/biorender-inc--ats
Birdwood Therapeutics,https://jobs.gem.com/birdwood-therapeutics
Black Ore,https://jobs.gem.com/black-ore
Blaze Ai,https://jobs.gem.com/blaze-ai
Blazetalent,https://jobs.gem.com/blazetalent
Blend Inc,https://jobs.gem.com/blend-inc
Blue J,https://jobs.gem.com/blue-j
Bluejeanfinancial Com,https://jobs.gem.com/bluejeanfinancial-com
Blueonion Ai,https://jobs.gem.com/blueonion-ai
Blueprint,https://jobs.gem.com/blueprint
Bluesky,https://jobs.gem.com/bluesky
Blume Technologies,https://jobs.gem.com/blume-technologies
Bohler ,https://jobs.gem.com/bohler-
Bohler Engineering Gemats,https://jobs.gem.com/bohler-engineering-gemats
Bolna,https://jobs.gem.com/bolna
Bond Partners,https://jobs.gem.com/bond-partners
Boost Robotics,https://jobs.gem.com/boost-robotics
Boredm,https://jobs.gem.com/boredm
Breadcrumb Ai,https://jobs.gem.com/breadcrumb-ai
Breakline Ats,https://jobs.gem.com/breakline-ats
Breakline Education,https://jobs.gem.com/breakline-education
Brewbird,https://jobs.gem.com/brewbird
Buildtrayd Com,https://jobs.gem.com/buildtrayd-com
Bull Moose Xyz,https://jobs.gem.com/bull-moose-xyz
Cadstrom Io,https://jobs.gem.com/cadstrom-io
Caffelabs Com,https://jobs.gem.com/caffelabs-com
Calaveras,https://jobs.gem.com/calaveras
Canals,https://jobs.gem.com/canals
Caplight Com,https://jobs.gem.com/caplight-com
Carbon,https://jobs.gem.com/carbon
Cardnexus,https://jobs.gem.com/cardnexus
Careers,https://jobs.gem.com/careers
Carry,https://jobs.gem.com/carry
Caseflood Ai,https://jobs.gem.com/caseflood-ai
Cellbyte,https://jobs.gem.com/cellbyte
Chartahealth,https://jobs.gem.com/chartahealth
Civrobotics Com,https://jobs.gem.com/civrobotics-com
Clarity,https://jobs.gem.com/clarity
Clearchecks Com Ats,https://jobs.gem.com/clearchecks-com-ats
Clearesthealth Com,https://jobs.gem.com/clearesthealth-com
Cloudanix Com,https://jobs.gem.com/cloudanix-com
Cloudraft,https://jobs.gem.com/cloudraft
Codegen,https://jobs.gem.com/codegen
Codesignal,https://jobs.gem.com/codesignal
Cognna,https://jobs.gem.com/cognna
Cogram,https://jobs.gem.com/cogram
Comfy,https://jobs.gem.com/comfy
Conductorai,https://jobs.gem.com/conductorai
Confida Ai,https://jobs.gem.com/confida-ai
Context Wtf,https://jobs.gem.com/context-wtf
Contour App,https://jobs.gem.com/contour-app
Converge,https://jobs.gem.com/converge
Coupa Software Inc Ats 1,https://jobs.gem.com/coupa-software-inc-ats-1
Cox Exponential,https://jobs.gem.com/cox-exponential
Crabi Robotics Com,https://jobs.gem.com/crabi-robotics-com
Crackenagi,https://jobs.gem.com/crackenagi
Create Talent Group,https://jobs.gem.com/create-talent-group
Createdbyhumans Ai,https://jobs.gem.com/createdbyhumans-ai
Credit Key,https://jobs.gem.com/credit-key
Crosby,https://jobs.gem.com/crosby
Curex Org,https://jobs.gem.com/curex-org
Curiouscardinals Com,https://jobs.gem.com/curiouscardinals-com
Cyvl,https://jobs.gem.com/cyvl
D4M International,https://jobs.gem.com/d4m-international
Dalus,https://jobs.gem.com/dalus
Dash Fi,https://jobs.gem.com/dash-fi
Data Masters,https://jobs.gem.com/data-masters
Datacurve Ai,https://jobs.gem.com/datacurve-ai
Dataday Technology Solutions,https://jobs.gem.com/dataday-technology-solutions
Datagrid,https://jobs.gem.com/datagrid
Dawn Media,https://jobs.gem.com/dawn-media
Daxko,https://jobs.gem.com/daxko
Deep Infra,https://jobs.gem.com/deep-infra
Deliver,https://jobs.gem.com/deliver
Detections Ai,https://jobs.gem.com/detections-ai
Dianahr Ai,https://jobs.gem.com/dianahr-ai
Distributed Spectrum,https://jobs.gem.com/distributed-spectrum
Dlvrlog,https://jobs.gem.com/dlvrlog
Doowii,https://jobs.gem.com/doowii
Dragme,https://jobs.gem.com/dragme
Dragonfly Careers,https://jobs.gem.com/dragonfly-careers
Dropback,https://jobs.gem.com/dropback
Durin,https://jobs.gem.com/durin
Dydx,https://jobs.gem.com/dydx
Eats2Seats,https://jobs.gem.com/eats2seats
Echelon,https://jobs.gem.com/echelon
Ecocart Io,https://jobs.gem.com/ecocart-io
Edgetrace Ai,https://jobs.gem.com/edgetrace-ai
Efference Ai,https://jobs.gem.com/efference-ai
Elite Talent Consulting,https://jobs.gem.com/elite-talent-consulting
Eliza,https://jobs.gem.com/eliza
Elloe Ai,https://jobs.gem.com/elloe-ai
Elo Ai,https://jobs.gem.com/elo-ai
Emerge Career,https://jobs.gem.com/emerge-career
Engineering Codified,https://jobs.gem.com/engineering--codified
Entrusted Contracting,https://jobs.gem.com/entrusted-contracting
Escargot Com,https://jobs.gem.com/escargot-com
Everfit Io,https://jobs.gem.com/everfit-io
Excelity Careers,https://jobs.gem.com/excelity-careers
Exponent,https://jobs.gem.com/exponent
Ezraailabs Tech,https://jobs.gem.com/ezraailabs-tech
Fabric,https://jobs.gem.com/fabric
Fabrichealth,https://jobs.gem.com/fabrichealth
Fancypeople,https://jobs.gem.com/fancypeople
Fanpierlabs Com,https://jobs.gem.com/fanpierlabs-com
Faraday,https://jobs.gem.com/faraday
Fathom Org,https://jobs.gem.com/fathom-org
Felix,https://jobs.gem.com/felix
Ferry Health,https://jobs.gem.com/ferry-health
Fetch Ats,https://jobs.gem.com/fetch-ats
Fifthdoor Com,https://jobs.gem.com/fifthdoor-com
Fireflies,https://jobs.gem.com/fireflies
Firestorm,https://jobs.gem.com/firestorm
Flatfee Corp,https://jobs.gem.com/flatfee-corp
Flint,https://jobs.gem.com/flint
Floot,https://jobs.gem.com/floot
Forgent Ai,https://jobs.gem.com/forgent-ai
Fountainplatform Com,https://jobs.gem.com/fountainplatform-com
Foxbox Digital,https://jobs.gem.com/foxbox-digital
Freestone Grove Partners,https://jobs.gem.com/freestone-grove-partners
Freshbooks,https://jobs.gem.com/freshbooks
Fridayharbor Ai,https://jobs.gem.com/fridayharbor-ai
Fuelfinance,https://jobs.gem.com/fuelfinance
Fulcrumcareers,https://jobs.gem.com/fulcrumcareers
Function Health,https://jobs.gem.com/function-health
Galadyne,https://jobs.gem.com/galadyne
Galaxyventures,https://jobs.gem.com/galaxyventures
Gc Ai,https://jobs.gem.com/gc-ai
Gem,https://jobs.gem.com/gem
Gem Mckesson,https://jobs.gem.com/gem-mckesson
Gem Test Board,https://jobs.gem.com/gem-test-board
Generation Alpha Transistor,https://jobs.gem.com/generation-alpha-transistor
Genspark,https://jobs.gem.com/genspark
Gerra,https://jobs.gem.com/gerra
Getaero Io,https://jobs.gem.com/getaero-io
Getbirdeye Com Au,https://jobs.gem.com/getbirdeye-com-au
Getro,https://jobs.gem.com/getro
Gigaml,https://jobs.gem.com/gigaml
Go Cadre,https://jobs.gem.com/go-cadre
Goatrecruit Com,https://jobs.gem.com/goatrecruit-com
Good Life Companies,https://jobs.gem.com/good-life-companies
Goodbill,https://jobs.gem.com/goodbill
Grailpay Com,https://jobs.gem.com/grailpay-com
Granger Construction,https://jobs.gem.com/granger-construction
Gratia Health,https://jobs.gem.com/gratia-health
Greenlite Ai,https://jobs.gem.com/greenlite-ai
Greenvalleyjobs,https://jobs.gem.com/greenvalleyjobs
Grit,https://jobs.gem.com/grit
Groq,https://jobs.gem.com/groq
Growthbook,https://jobs.gem.com/growthbook
Guardrail Ai,https://jobs.gem.com/guardrail-ai
Guidesage Ai,https://jobs.gem.com/guidesage-ai
Hallow,https://jobs.gem.com/hallow
Happydance Partnership Integration,https://jobs.gem.com/happydance-partnership-integration
Harmonic,https://jobs.gem.com/harmonic
Hash,https://jobs.gem.com/hash
Hayla,https://jobs.gem.com/hayla
Heavy Construction Systems Specialists Llc,https://jobs.gem.com/heavy-construction-systems-specialists-llc
Helix,https://jobs.gem.com/helix
Hellotrade,https://jobs.gem.com/hellotrade
Helm Health,https://jobs.gem.com/helm-health
Hilabs Ie,https://jobs.gem.com/hilabs-ie
Hipeople,https://jobs.gem.com/hipeople
Holacasa Yc W23,https://jobs.gem.com/holacasa-yc-w23
Homeboost,https://jobs.gem.com/homeboost
Hospitable,https://jobs.gem.com/hospitable
Howrecruit Io,https://jobs.gem.com/howrecruit-io
Hubspot,https://jobs.gem.com/hubspot
Hypernatural Ai,https://jobs.gem.com/hypernatural-ai
Inception,https://jobs.gem.com/inception
Index Exchange,https://jobs.gem.com/index-exchange
Infrastructure Modernization Solutions,https://jobs.gem.com/infrastructure-modernization-solutions
Inspiration Commerce Group,https://jobs.gem.com/inspiration-commerce-group
Inspiresemi Com,https://jobs.gem.com/inspiresemi-com
Instrumental Inc ,https://jobs.gem.com/instrumental-inc-
Integral Xyz,https://jobs.gem.com/integral-xyz
Integrationscaptain,https://jobs.gem.com/integrationscaptain
Intelligentresourcing Co,https://jobs.gem.com/intelligentresourcing-co
Interfere Old,https://jobs.gem.com/interfere-old
Invoicebutler Ai,https://jobs.gem.com/invoicebutler-ai
Iris,https://jobs.gem.com/iris
Ironsite Ai,https://jobs.gem.com/ironsite-ai
Itsvaleria Co,https://jobs.gem.com/itsvaleria-co
Jaguaracareers,https://jobs.gem.com/jaguaracareers
Janie,https://jobs.gem.com/janie
Jayla Careers,https://jobs.gem.com/jayla-careers
Jobma,https://jobs.gem.com/jobma
Joinanvil Com,https://jobs.gem.com/joinanvil-com
Joinformal,https://jobs.gem.com/joinformal
Joyful Health,https://jobs.gem.com/joyful-health
Kaikaku,https://jobs.gem.com/kaikaku
Kaironhealth,https://jobs.gem.com/kaironhealth
Kaironhealth Com,https://jobs.gem.com/kaironhealth-com
Kanu Ai,https://jobs.gem.com/kanu-ai
Kcs Hiring,https://jobs.gem.com/kcs-hiring
Keru Ai,https://jobs.gem.com/keru-ai
Key To Web3,https://jobs.gem.com/key-to-web3
Knight Electric Inc ,https://jobs.gem.com/knight-electric-inc-
Kollectiv Ai,https://jobs.gem.com/kollectiv-ai
Kumo Ai,https://jobs.gem.com/kumo-ai
Lantern,https://jobs.gem.com/lantern
Lavapayments Com,https://jobs.gem.com/lavapayments-com
Leap Tools,https://jobs.gem.com/leap-tools
Letsdata,https://jobs.gem.com/letsdata
Letter Ai,https://jobs.gem.com/letter-ai
Level,https://jobs.gem.com/level
Linktree,https://jobs.gem.com/linktree
Little Otter,https://jobs.gem.com/little-otter
Lower Llc,https://jobs.gem.com/lower-llc
Lumalabs Ai,https://jobs.gem.com/lumalabs-ai
Lunajoy,https://jobs.gem.com/lunajoy
Lunch,https://jobs.gem.com/lunch
Lunos Ai,https://jobs.gem.com/lunos-ai
Magnetic,https://jobs.gem.com/magnetic
Manifest,https://jobs.gem.com/manifest
Manifested Com,https://jobs.gem.com/manifested-com
Marble Health,https://jobs.gem.com/marble-health
Mavi,https://jobs.gem.com/mavi
Meetdex Ai,https://jobs.gem.com/meetdex-ai
Megapot,https://jobs.gem.com/megapot
Meineautosdirekt,https://jobs.gem.com/meineautosdirekt
Menten Ai,https://jobs.gem.com/menten-ai
Merge Sandbox,https://jobs.gem.com/merge-sandbox
Metal Ai,https://jobs.gem.com/metal-ai
Microsoft Demo Gem Com,https://jobs.gem.com/microsoft-demo-gem-com
Mimicrobotics Com,https://jobs.gem.com/mimicrobotics-com
Mission,https://jobs.gem.com/mission
Moosehead Talent,https://jobs.gem.com/moosehead-talent
Motion,https://jobs.gem.com/motion
Moxa,https://jobs.gem.com/moxa
Multiplierhq,https://jobs.gem.com/multiplierhq
Multiscale Ai,https://jobs.gem.com/multiscale-ai
Myprize,https://jobs.gem.com/myprize
Myriad Technology,https://jobs.gem.com/myriad-technology
Myrrsgroup,https://jobs.gem.com/myrrsgroup
Nabla Bio,https://jobs.gem.com/nabla-bio
Nacelle,https://jobs.gem.com/nacelle
Nativemsg,https://jobs.gem.com/nativemsg
Nclusion,https://jobs.gem.com/nclusion
Nerve,https://jobs.gem.com/nerve
Newcrew,https://jobs.gem.com/newcrew
Ngram,https://jobs.gem.com/ngram
Nimble,https://jobs.gem.com/nimble
Niva,https://jobs.gem.com/niva
Nominal,https://jobs.gem.com/nominal
Northone,https://jobs.gem.com/northone
Ntop,https://jobs.gem.com/ntop
Nue Ai,https://jobs.gem.com/nue-ai
Nutrislice,https://jobs.gem.com/nutrislice
Nuvo,https://jobs.gem.com/nuvo
Obin Ai,https://jobs.gem.com/obin-ai
Obsidian Systems,https://jobs.gem.com/obsidian-systems
Odo Do,https://jobs.gem.com/odo-do
Omegahhagency Com,https://jobs.gem.com/omegahhagency-com
Ondo Finance,https://jobs.gem.com/ondo-finance
Onesignal,https://jobs.gem.com/onesignal
Onesignal Ats,https://jobs.gem.com/onesignal-ats
Onezyme,https://jobs.gem.com/onezyme
Onfrontiers,https://jobs.gem.com/onfrontiers
Openphone,https://jobs.gem.com/openphone
Openreqstaffing,https://jobs.gem.com/openreqstaffing
Opine,https://jobs.gem.com/opine
Ora So,https://jobs.gem.com/ora-so
Overlay,https://jobs.gem.com/overlay
Overwatch,https://jobs.gem.com/overwatch
Paces,https://jobs.gem.com/paces
Pae,https://jobs.gem.com/pae
Pagebound,https://jobs.gem.com/pagebound
Pally,https://jobs.gem.com/pally
Paramark,https://jobs.gem.com/paramark
Partao,https://jobs.gem.com/partao
Partnerhq,https://jobs.gem.com/partnerhq
Patlytics,https://jobs.gem.com/patlytics
Pave,https://jobs.gem.com/pave
Perceptyx,https://jobs.gem.com/perceptyx
Photalabs Com,https://jobs.gem.com/photalabs-com
Photon,https://jobs.gem.com/photon
Pinnacleconnect Llc,https://jobs.gem.com/pinnacleconnect-llc
Piqenergy Com,https://jobs.gem.com/piqenergy-com
Planet Fans,https://jobs.gem.com/planet-fans
Planned,https://jobs.gem.com/planned
Plixai,https://jobs.gem.com/plixai
Pogo Recruiting,https://jobs.gem.com/pogo-recruiting
Polar,https://jobs.gem.com/polar
Polywork,https://jobs.gem.com/polywork
Pomerium,https://jobs.gem.com/pomerium
Portal Ai,https://jobs.gem.com/portal-ai
Poseidonaero,https://jobs.gem.com/poseidonaero
Prahsys Com,https://jobs.gem.com/prahsys-com
Praxisiq Ai,https://jobs.gem.com/praxisiq-ai
Precision Ai,https://jobs.gem.com/precision-ai
Prodia,https://jobs.gem.com/prodia
Productboard,https://jobs.gem.com/productboard
Productboard Ats,https://jobs.gem.com/productboard-ats
Prohost Ai,https://jobs.gem.com/prohost-ai
Project Method,https://jobs.gem.com/project-method
Promptql,https://jobs.gem.com/promptql
Propel,https://jobs.gem.com/propel
Prospermedical Com,https://jobs.gem.com/prospermedical-com
Protegeai,https://jobs.gem.com/protegeai
Questdb Com,https://jobs.gem.com/questdb-com
Quitwithjones,https://jobs.gem.com/quitwithjones
Quo,https://jobs.gem.com/quo
Rain Aero,https://jobs.gem.com/rain-aero
Raincode Bahrain W L L,https://jobs.gem.com/raincode-bahrain-w-l-l
Raylu Ai,https://jobs.gem.com/raylu-ai
Rctsglobal Com,https://jobs.gem.com/rctsglobal-com
Rditrials,https://jobs.gem.com/rditrials
Rebuild Work,https://jobs.gem.com/rebuild-work
Redcar,https://jobs.gem.com/redcar
Redenvelope Co,https://jobs.gem.com/redenvelope-co
Redo,https://jobs.gem.com/redo
Rektech,https://jobs.gem.com/rektech
Renew,https://jobs.gem.com/renew
Resprop,https://jobs.gem.com/resprop
Retool,https://jobs.gem.com/retool
Revolutionparts,https://jobs.gem.com/revolutionparts
Rex,https://jobs.gem.com/rex
Rf Renovo Management Company Llc,https://jobs.gem.com/rf-renovo-management-company-llc
Riley,https://jobs.gem.com/riley
Rinsed,https://jobs.gem.com/rinsed
Risely Ai,https://jobs.gem.com/risely-ai
Rivia,https://jobs.gem.com/rivia
Roadio Ai,https://jobs.gem.com/roadio-ai
Roamless,https://jobs.gem.com/roamless
Roe Ai,https://jobs.gem.com/roe-ai
Rossibuilders Com,https://jobs.gem.com/rossibuilders-com
Roundhouse Media,https://jobs.gem.com/roundhouse-media
Rove,https://jobs.gem.com/rove
Runsybil,https://jobs.gem.com/runsybil
Sadnaconsulting Com,https://jobs.gem.com/sadnaconsulting-com
Sailorhealth Com,https://jobs.gem.com/sailorhealth-com
Sales Marker,https://jobs.gem.com/sales-marker
Salesqueze Com,https://jobs.gem.com/salesqueze-com
Sandbar Inc,https://jobs.gem.com/sandbar-inc
Sandboxschonfeld Com,https://jobs.gem.com/sandboxschonfeld-com
Sauron Systems,https://jobs.gem.com/sauron-systems
Scope Labs,https://jobs.gem.com/scope-labs
Scowtt Com,https://jobs.gem.com/scowtt-com
Seated,https://jobs.gem.com/seated
Seed2Series Com,https://jobs.gem.com/seed2series-com
Seniorverse,https://jobs.gem.com/seniorverse
Sennder Gmbh,https://jobs.gem.com/sennder-gmbh
Senndertechnologies Gmbh,https://jobs.gem.com/senndertechnologies-gmbh
Sensorum Health,https://jobs.gem.com/sensorum-health
Serv Ai,https://jobs.gem.com/serv-ai
Seven Starling,https://jobs.gem.com/seven-starling
Shef Com,https://jobs.gem.com/shef-com
Shorebird Dev,https://jobs.gem.com/shorebird-dev
Showtime,https://jobs.gem.com/showtime
Signoz,https://jobs.gem.com/signoz
Silkline,https://jobs.gem.com/silkline
Skypilot Co,https://jobs.gem.com/skypilot-co
Slash,https://jobs.gem.com/slash
Sleep Center,https://jobs.gem.com/sleep-center
Smacktechnologies Com,https://jobs.gem.com/smacktechnologies-com
Snout,https://jobs.gem.com/snout
Softup Technologies,https://jobs.gem.com/softup-technologies
Sohar Health,https://jobs.gem.com/sohar-health
Soundhound,https://jobs.gem.com/soundhound
Spawn,https://jobs.gem.com/spawn
Spellbrush,https://jobs.gem.com/spellbrush
Sphere Semi,https://jobs.gem.com/sphere-semi
Ssg,https://jobs.gem.com/ssg
Stack Auth Com,https://jobs.gem.com/stack-auth-com
Startup People Solutions,https://jobs.gem.com/startup-people-solutions
Stealth Startup,https://jobs.gem.com/stealth-startup
Stockapp Com,https://jobs.gem.com/stockapp-com
Stryke,https://jobs.gem.com/stryke
Sunsethq Com,https://jobs.gem.com/sunsethq-com
Super Hi Fi,https://jobs.gem.com/super-hi-fi
Superblocks,https://jobs.gem.com/superblocks
Supersonik Ai,https://jobs.gem.com/supersonik-ai
Supio,https://jobs.gem.com/supio
Suppliercanada Com,https://jobs.gem.com/suppliercanada-com
Switchgrowth Com,https://jobs.gem.com/switchgrowth-com
Symbolica,https://jobs.gem.com/symbolica
Syndesus,https://jobs.gem.com/syndesus
System Two Security,https://jobs.gem.com/system-two-security
Taxgpt Inc ,https://jobs.gem.com/taxgpt-inc-
Taxo Ai,https://jobs.gem.com/taxo-ai
Tektome Com,https://jobs.gem.com/tektome-com
Telora,https://jobs.gem.com/telora
Tensorstax Com,https://jobs.gem.com/tensorstax-com
Tenx Recruiting,https://jobs.gem.com/tenx-recruiting
Terraai Earth,https://jobs.gem.com/terraai-earth
Test Board,https://jobs.gem.com/test-board
The Boring Company,https://jobs.gem.com/the-boring-company
The Brewer Garrett Company,https://jobs.gem.com/the-brewer-garrett-company
The Talent Project Com,https://jobs.gem.com/the-talent-project-com
Theburntapp Com,https://jobs.gem.com/theburntapp-com
Theinterface,https://jobs.gem.com/theinterface
Thejobbridge,https://jobs.gem.com/thejobbridge
Thelma,https://jobs.gem.com/thelma
Theluckyfoundation,https://jobs.gem.com/theluckyfoundation
Thenewclub Fyi,https://jobs.gem.com/thenewclub-fyi
Theseus Us,https://jobs.gem.com/theseus-us
Thinkific,https://jobs.gem.com/thinkific
Third Dimension,https://jobs.gem.com/third-dimension
Thrivory,https://jobs.gem.com/thrivory
Thunder,https://jobs.gem.com/thunder
Thunder Compute,https://jobs.gem.com/thunder-compute
Timetoperform,https://jobs.gem.com/timetoperform
Token Transit,https://jobs.gem.com/token-transit
Toolhouse Ai,https://jobs.gem.com/toolhouse-ai
Torchsystems Com,https://jobs.gem.com/torchsystems-com
Transluce,https://jobs.gem.com/transluce
Trashlab,https://jobs.gem.com/trashlab
Tricentis,https://jobs.gem.com/tricentis
Trilliumhiring Com,https://jobs.gem.com/trilliumhiring-com
Tripworks Com,https://jobs.gem.com/tripworks-com
Tristero,https://jobs.gem.com/tristero
Trojan Trading,https://jobs.gem.com/trojan-trading
Tropic,https://jobs.gem.com/tropic
Trybree Com,https://jobs.gem.com/trybree-com
Tryhelium Com,https://jobs.gem.com/tryhelium-com
Tungsten Dev,https://jobs.gem.com/tungsten-dev
Turbohome,https://jobs.gem.com/turbohome
Twentyfour7 Dev,https://jobs.gem.com/twentyfour7-dev
Unify Ai,https://jobs.gem.com/unify-ai
Untolabs Com,https://jobs.gem.com/untolabs-com
Up Labs,https://jobs.gem.com/up-labs
Useful,https://jobs.gem.com/useful
Usemalleable Com,https://jobs.gem.com/usemalleable-com
Vamo Xyz,https://jobs.gem.com/vamo-xyz
Vanguard Cleaning Systems,https://jobs.gem.com/vanguard-cleaning-systems
Vantaca,https://jobs.gem.com/vantaca
Vantager,https://jobs.gem.com/vantager
Vantara Ai,https://jobs.gem.com/vantara-ai
Vectorworks,https://jobs.gem.com/vectorworks
Vectrasim,https://jobs.gem.com/vectrasim
Veho Technologies,https://jobs.gem.com/veho-technologies
Ventionteams Com,https://jobs.gem.com/ventionteams-com
Venture Guides,https://jobs.gem.com/venture-guides
Vercel Ats Sandbox,https://jobs.gem.com/vercel-ats-sandbox
Vesseltalent Com,https://jobs.gem.com/vesseltalent-com
Voker Ai,https://jobs.gem.com/voker-ai
Voltai Com,https://jobs.gem.com/voltai-com
Wayback Labs,https://jobs.gem.com/wayback-labs
Webflow Ats Sandbox,https://jobs.gem.com/webflow-ats-sandbox
Western Governors University,https://jobs.gem.com/western-governors-university
Whatconverts,https://jobs.gem.com/whatconverts
Wiseroad Recruiting Inc,https://jobs.gem.com/wiseroad-recruiting-inc
Wizecamel,https://jobs.gem.com/wizecamel
Wolfjaw Careers,https://jobs.gem.com/wolfjaw-careers
Wonolo,https://jobs.gem.com/wonolo
Woodsideai,https://jobs.gem.com/woodsideai
Youtrip,https://jobs.gem.com/youtrip
Zefi Ai,https://jobs.gem.com/zefi-ai
Zep,https://jobs.gem.com/zep
Zorrorx,https://jobs.gem.com/zorrorx
greenhouse.csv · 6 · Normal file
@@ -0,0 +1,6 @@
url,timestamp
https://job-boards.eu.greenhouse.io/bcbgroup/jobs/4681083101?gh_src=cryptocurrencyjobs.co,2025-12-31T08:35:23.424931
https://job-boards.greenhouse.io/securitize/jobs/4074121009?gh_src=cryptocurrencyjobs.co,2025-12-31T09:19:17.349713
https://job-boards.eu.greenhouse.io/bcbgroup/jobs/4681102101?gh_src=cryptocurrencyjobs.co,2025-12-31T09:58:36.919216
https://job-boards.greenhouse.io/kiosk/jobs/4427184005?gh_src=cryptocurrencyjobs.co,2025-12-31T10:10:51.176114
https://job-boards.eu.greenhouse.io/bcbgroup/jobs/4681083101?gh_src=cryptocurrencyjobs.co,2025-12-31T11:02:31.869728
greenhousecompanies.csv · 2544 · Normal file
File diff suppressed because it is too large
lever.csv · 7 · Normal file
@@ -0,0 +1,7 @@
url,timestamp
https://jobs.eu.lever.co/kaiko/3f7f3db9-4a6a-4047-8760-bc52c3d03e05?lever-origin=applied&lever-source%5B%5D=cryptocurrencyjobs.co,2025-12-31T09:20:28.542417
https://jobs.lever.co/waterfall/880fb1b4-2515-4534-9970-53c497c82f12?lever-origin=applied&lever-source%5B%5D=cryptocurrencyjobs.co,2025-12-31T10:08:17.316072
https://jobs.lever.co/obol-tech/fcccd493-54e4-425a-b9bd-82fa6f7e6aff?lever-origin=applied&lever-source%5B%5D=cryptocurrencyjobs.co,2025-12-31T10:35:29.164452
https://jobs.eu.lever.co/coinspaid/7605e154-4b1d-45ee-b1d4-35edea13d80b?lever-origin=applied&lever-source%5B%5D=cryptocurrencyjobs.co,2025-12-31T10:51:38.852693
https://jobs.lever.co/vedatechlabs/9c59c96c-2bb0-47b0-88fe-5d5a9fd85997?lever-origin=applied&lever-source%5B%5D=cryptocurrencyjobs.co,2025-12-31T11:02:16.120852
https://jobs.eu.lever.co/kaiko/3f7f3db9-4a6a-4047-8760-bc52c3d03e05?lever-origin=applied&lever-source%5B%5D=cryptocurrencyjobs.co,2025-12-31T11:16:43.218273
levercompanies.csv · 1792 · Normal file
File diff suppressed because it is too large
linkedin.csv · 8 · Normal file
@@ -0,0 +1,8 @@
url,timestamp
https://www.linkedin.com/jobs/view/operations-analyst-at-amber-group-4325538653/?ref=cryptocurrencyjobs.co,2025-12-31T09:20:11.544002
https://www.linkedin.com/jobs/view/hr-operations-intern-sg-at-matrixport-official-4338171692/?ref=cryptocurrencyjobs.co,2025-12-31T09:25:10.499933
https://www.linkedin.com/jobs/view/operations-analyst-at-matrixport-official-4235087267/?ref=cryptocurrencyjobs.co,2025-12-31T09:33:53.104120
https://www.linkedin.com/jobs/view/business-operations-analyst-at-matrixport-official-4215538150/?ref=cryptocurrencyjobs.co,2025-12-31T09:34:24.186519
https://www.linkedin.com/jobs/view/graduate-hiring-business-operations-analyst-wealth-management-at-matrixport-official-4131687672/?ref=cryptocurrencyjobs.co,2025-12-31T09:36:47.038648
https://www.linkedin.com/jobs/view/customer-support-specialist-at-matrixport-official-4323103235/?ref=cryptocurrencyjobs.co,2025-12-31T10:39:57.272414
https://www.linkedin.com/jobs/view/finance-intern-at-amber-group-4248725225/?ref=cryptocurrencyjobs.co,2025-12-31T11:31:03.349275
170
llm_agent.py
170
llm_agent.py
@ -21,13 +21,12 @@ class LLMJobRefiner:
|
|||||||
raise ValueError("DEEPSEEK_API_KEY not found in .env file.")
|
raise ValueError("DEEPSEEK_API_KEY not found in .env file.")
|
||||||
|
|
||||||
# Database credentials from .env
|
# Database credentials from .env
|
||||||
self.db_url = os.getenv("DB_URL")
|
|
||||||
self.db_username = os.getenv("DB_USERNAME")
|
self.db_username = os.getenv("DB_USERNAME")
|
||||||
self.db_password = os.getenv("DB_PASSWORD")
|
self.db_password = os.getenv("DB_PASSWORD")
|
||||||
self.db_host = os.getenv("DB_HOST")
|
self.db_host = os.getenv("DB_HOST")
|
||||||
self.db_port = os.getenv("DB_PORT")
|
self.db_port = os.getenv("DB_PORT")
|
||||||
|
|
||||||
if not self.db_url or not self.db_username or not self.db_password:
|
if not self.db_username or not self.db_password:
|
||||||
raise ValueError("Database credentials not found in .env file.")
|
raise ValueError("Database credentials not found in .env file.")
|
||||||
|
|
||||||
# DeepSeek uses OpenAI-compatible API
|
# DeepSeek uses OpenAI-compatible API
|
||||||
@ -41,16 +40,6 @@ class LLMJobRefiner:
|
|||||||
def _init_db(self):
|
def _init_db(self):
|
||||||
"""Initialize PostgreSQL database connection and create table"""
|
"""Initialize PostgreSQL database connection and create table"""
|
||||||
try:
|
try:
|
||||||
self.db_url = os.getenv("DB_URL")
|
|
||||||
if self.db_url and "supabase.com" in self.db_url:
|
|
||||||
conn = psycopg2.connect(
|
|
||||||
host=self.db_host,
|
|
||||||
port=self.db_port,
|
|
||||||
database="postgres",
|
|
||||||
user=self.db_username,
|
|
||||||
password=self.db_password
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
conn = psycopg2.connect(
|
conn = psycopg2.connect(
|
||||||
host=self.db_host,
|
host=self.db_host,
|
||||||
port=self.db_port,
|
port=self.db_port,
|
||||||
@ -113,8 +102,8 @@ class LLMJobRefiner:
|
|||||||
text = re.sub(r'\s+', ' ', text)
|
text = re.sub(r'\s+', ' ', text)
|
||||||
|
|
||||||
# Limit length for LLM context
|
# Limit length for LLM context
|
||||||
if len(text) > 10000:
|
if len(text) > 100000:
|
||||||
text = text[:10000] + "..."
|
text = text[:100000] + "..."
|
||||||
|
|
||||||
return text
|
return text
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@ -128,7 +117,7 @@ class LLMJobRefiner:
|
|||||||
response = self.client.chat.completions.create(
|
response = self.client.chat.completions.create(
|
||||||
model=self.model,
|
model=self.model,
|
||||||
messages=[{"role": "user", "content": prompt}],
|
messages=[{"role": "user", "content": prompt}],
|
||||||
temperature=0.2,
|
temperature=0.1,
|
||||||
max_tokens=2048,
|
max_tokens=2048,
|
||||||
stream=False
|
stream=False
|
||||||
)
|
)
|
||||||
@ -145,26 +134,32 @@ class LLMJobRefiner:
|
|||||||
posted_date = raw_data.get('posted_date', datetime.now().strftime("%m/%d/%y"))
|
posted_date = raw_data.get('posted_date', datetime.now().strftime("%m/%d/%y"))
|
||||||
|
|
||||||
prompt = f"""
|
prompt = f"""
|
||||||
You are a job posting data extractor.
|
You are an expert job posting data extractor. Your task is to extract AND infer fields from the provided job posting.
|
||||||
|
|
||||||
EXTRACT EXACT TEXT - DO NOT SUMMARIZE, PARAPHRASE, OR INVENT.
|
### CORE RULES:
|
||||||
|
1. **NEVER invent, summarize, or paraphrase** — extract **exact wording** when available.
|
||||||
|
2. **For critical fields (title, company_name, job_id, url, description):**
|
||||||
|
- These MUST be present and meaningful.
|
||||||
|
- If not explicitly stated, **infer from context** (e.g., page title, headings, "About Us", etc.).
|
||||||
|
- **NEVER return "Not provided" or "N/A" for these fields.**
|
||||||
|
3. **For optional fields (location, salary_range, etc.):**
|
||||||
|
- Extract exact text if present.
|
||||||
|
- If absent but inferable (e.g., "Remote (US)", "Full-time"), **infer it**.
|
||||||
|
- Only return "Not provided" if truly absent and non-inferable.
|
||||||
|
|
||||||
For these critical fields, follow these rules:
|
### FIELD DEFINITIONS:
|
||||||
- description: Extract ALL job description text. If ANY job details exist (duties, responsibilities, overview), include them. Only use "Not provided" if absolutely no description exists.
|
- **title**: The job title. Look in <h1>, page title, or bold headings.
|
||||||
- requirements: Extract ALL requirements text. If ANY requirements exist (skills, experience, education needed), include them. Only use "Not provided" if none exist.
|
- **company_name**: Company name. Look in logo alt text, footer, "About [X]", or page title.
|
||||||
- qualifications: Extract ALL qualifications text. If ANY qualifications exist, include them. Only use "Not provided" if none exist.
|
- **description**: Main job overview, responsibilities, or duties. Combine relevant paragraphs if needed. **Must not be empty.**
|
||||||
|
- **requirements**: Required skills, experience, or qualifications.
|
||||||
|
- **qualifications**: Educational or certification requirements.
|
||||||
|
- **location**: Office location or remote policy.
|
||||||
|
- **salary_range**: Exact compensation info.
|
||||||
|
- **nature_of_work**: Employment type (Full-time, Contract, Internship, etc.).
|
||||||
|
|
||||||
REQUIRED FIELDS (must have valid values, never "N/A"):
|
### OUTPUT FORMAT:
|
||||||
- title, company_name, job_id, url
|
Return ONLY a valid JSON object with these keys:
|
||||||
|
{{
|
||||||
OPTIONAL FIELDS (can be "Not provided"):
|
|
||||||
- location, salary_range, nature_of_work
|
|
||||||
|
|
||||||
Page Content:
|
|
||||||
{cleaned_content}
|
|
||||||
|
|
||||||
Response format (ONLY return this JSON):
|
|
||||||
{{
|
|
||||||
"title": "...",
|
"title": "...",
|
||||||
"company_name": "...",
|
"company_name": "...",
|
||||||
"location": "...",
|
"location": "...",
|
||||||
@ -175,8 +170,16 @@ class LLMJobRefiner:
|
|||||||
"nature_of_work": "...",
|
"nature_of_work": "...",
|
||||||
"job_id": "{job_id}",
|
"job_id": "{job_id}",
|
||||||
"url": "{url}"
|
"url": "{url}"
|
||||||
}}
|
}}
|
||||||
"""
|
|
||||||
|
- **Critical fields must NEVER be "Not provided", "N/A", empty, or generic** (e.g., "Company", "Job Title").
|
||||||
|
- **Optional fields may be "Not provided" ONLY if truly absent.**
|
||||||
|
- **Do not include markdown, explanations, or extra text.**
|
||||||
|
- **Use double quotes for JSON.**
|
||||||
|
|
||||||
|
Page Content:
|
||||||
|
{cleaned_content}
|
||||||
|
"""
|
||||||
|
|
||||||
try:
|
try:
|
||||||
response_text = await asyncio.get_event_loop().run_in_executor(
|
response_text = await asyncio.get_event_loop().run_in_executor(
|
||||||
@ -188,31 +191,23 @@ class LLMJobRefiner:
|
|||||||
if not refined_data:
|
if not refined_data:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Validate required fields
|
# Validate critical fields — reject if missing or placeholder
|
||||||
required_fields = ['title', 'company_name', 'job_id', 'url']
|
critical_fields = ['title', 'company_name', 'job_id', 'url', 'description']
|
||||||
for field in required_fields:
|
for field in critical_fields:
|
||||||
if not refined_data.get(field) or refined_data[field].strip() in ["N/A", "", "Unknown", "Company", "Job"]:
|
|
||||||
return None
|
|
||||||
|
|
||||||
# CRITICAL: Validate content fields - check if they SHOULD exist
|
|
||||||
content_fields = ['description', 'requirements', 'qualifications']
|
|
||||||
cleaned_original = cleaned_content.lower()
|
|
||||||
|
|
||||||
# Simple heuristic: if page contains job-related keywords, content fields should NOT be "Not provided"
|
|
||||||
job_indicators = ['responsibilit', 'duties', 'require', 'qualifi', 'skill', 'experienc', 'educat', 'degree', 'bachelor', 'master']
|
|
||||||
has_job_content = any(indicator in cleaned_original for indicator in job_indicators)
|
|
||||||
|
|
||||||
if has_job_content:
|
|
||||||
for field in content_fields:
|
|
||||||
value = refined_data.get(field, "").strip()
|
value = refined_data.get(field, "").strip()
|
||||||
if value in ["Not provided", "N/A", ""]:
|
if not value or value.lower() in ["n/a", "not provided", "unknown", "company", "job", "title", ""]:
|
||||||
# LLM failed to extract existing content
|
print(f" ❌ Critical field '{field}' is invalid: '{value}'")
|
||||||
print(f" ⚠️ LLM returned '{value}' for {field} but job content appears present")
|
return None # This job will NOT be saved — as per requirement
|
||||||
return None
|
|
||||||
|
# Optional fields: allow "Not provided", but ensure they're strings
|
||||||
|
optional_fields = ['location', 'requirements', 'qualifications', 'salary_range', 'nature_of_work']
|
||||||
|
for field in optional_fields:
|
||||||
|
if field not in refined_data:
|
||||||
|
refined_data[field] = "Not provided"
|
||||||
|
elif not isinstance(refined_data[field], str):
|
||||||
|
refined_data[field] = str(refined_data[field])
|
||||||
|
|
||||||
# Add the posted_date to the refined data
|
|
||||||
refined_data['posted_date'] = posted_date
|
refined_data['posted_date'] = posted_date
|
||||||
|
|
||||||
return refined_data
|
return refined_data
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@ -220,15 +215,22 @@ class LLMJobRefiner:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
def _parse_llm_response(self, response_text: str) -> Dict[str, Any]:
|
def _parse_llm_response(self, response_text: str) -> Dict[str, Any]:
|
||||||
|
# Try to extract JSON from markdown code block
|
||||||
json_match = re.search(r'```(?:json)?\s*({.*?})\s*```', response_text, re.DOTALL)
|
json_match = re.search(r'```(?:json)?\s*({.*?})\s*```', response_text, re.DOTALL)
|
||||||
if not json_match:
|
if not json_match:
|
||||||
json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
|
# Try to find raw JSON object
|
||||||
|
json_match = re.search(r'\{[^{}]*\{[^{}]*\}[^{}]*\}|\{.*\}', response_text, re.DOTALL)
|
||||||
if not json_match:
|
if not json_match:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
return json.loads(json_match.group(1) if '```' in response_text else json_match.group(0))
|
json_str = json_match.group(1) if '```' in response_text else json_match.group(0)
|
||||||
except json.JSONDecodeError:
|
# Clean common issues
|
||||||
|
json_str = re.sub(r'\s+', ' ', json_str)
|
||||||
|
json_str = re.sub(r',\s*([\]}\)])', r'\1', json_str) # Remove trailing commas
|
||||||
|
return json.loads(json_str)
|
||||||
|
except json.JSONDecodeError as e:
|
||||||
|
print(f"JSON parsing error: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
async def save_job_data(self, job_data: Dict[str, Any], keyword: str):
|
async def save_job_data(self, job_data: Dict[str, Any], keyword: str):
|
||||||
@ -254,50 +256,50 @@ class LLMJobRefiner:
|
|||||||
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
|
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
|
||||||
ON CONFLICT (job_id) DO NOTHING
|
ON CONFLICT (job_id) DO NOTHING
|
||||||
''', (
|
''', (
|
||||||
job_data.get("title", "N/A"),
|
job_data.get("title", "Not provided"),
|
||||||
job_data.get("company_name", "N/A"),
|
job_data.get("company_name", "Not provided"),
|
||||||
job_data.get("location", "N/A"),
|
job_data.get("location", "Not provided"),
|
||||||
job_data.get("description", "N/A"),
|
job_data.get("description", "Not provided"),
|
||||||
job_data.get("requirements", "N/A"),
|
job_data.get("requirements", "Not provided"),
|
||||||
job_data.get("qualifications", "N/A"),
|
job_data.get("qualifications", "Not provided"),
|
||||||
job_data.get("salary_range", "N/A"),
|
job_data.get("salary_range", "Not provided"),
|
||||||
job_data.get("nature_of_work", "N/A"),
|
job_data.get("nature_of_work", "Not provided"),
|
||||||
job_data.get("job_id", "N/A"),
|
job_data.get("job_id", "unknown"),
|
||||||
job_data.get("url", "N/A"),
|
job_data.get("url", "N/A"),
|
||||||
job_data.get("category", "N/A"),
|
job_data.get("category", "all"),
|
||||||
job_data.get("scraped_at"),
|
job_data.get("scraped_at"),
|
-                job_data.get("posted_date", "N/A")
+                job_data.get("posted_date", datetime.now().strftime("%m/%d/%y"))
             ))
 
             conn.commit()
             cursor.close()
             conn.close()
 
-            print(f" 💾 Saved job to category '{job_data.get('category', 'N/A')}' with job_id: {job_data.get('job_id', 'N/A')}")
+            print(f" 💾 Saved job to category '{job_data.get('category', 'all')}' with job_id: {job_data.get('job_id', 'unknown')}")
 
         except Exception as e:
             print(f"❌ Database save error: {e}")
 
     async def _save_to_markdown(self, job_data: Dict[str, Any], keyword: str):
-        os.makedirs("linkedin_jobs", exist_ok=True)
-        filepath = os.path.join("linkedin_jobs", "linkedin_jobs_scraped.md")
+        os.makedirs("crypto_jobs", exist_ok=True)
+        filepath = os.path.join("crypto_jobs", "crypto_jobs_scraped.md")
         write_header = not os.path.exists(filepath) or os.path.getsize(filepath) == 0
 
         with open(filepath, "a", encoding="utf-8") as f:
             if write_header:
-                f.write(f"# LinkedIn Jobs - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
+                f.write(f"# Crypto Jobs - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
-            f.write(f"## Job: {job_data.get('title', 'N/A')}\n\n")
+            f.write(f"## Job: {job_data.get('title', 'Not provided')}\n\n")
             f.write(f"- **Keyword**: {keyword}\n")
-            f.write(f"- **Company**: {job_data.get('company_name', 'N/A')}\n")
-            f.write(f"- **Location**: {job_data.get('location', 'N/A')}\n")
-            f.write(f"- **Nature of Work**: {job_data.get('nature_of_work', 'N/A')}\n")
-            f.write(f"- **Salary Range**: {job_data.get('salary_range', 'N/A')}\n")
-            f.write(f"- **Job ID**: {job_data.get('job_id', 'N/A')}\n")
+            f.write(f"- **Company**: {job_data.get('company_name', 'Not provided')}\n")
+            f.write(f"- **Location**: {job_data.get('location', 'Not provided')}\n")
+            f.write(f"- **Nature of Work**: {job_data.get('nature_of_work', 'Not provided')}\n")
+            f.write(f"- **Salary Range**: {job_data.get('salary_range', 'Not provided')}\n")
+            f.write(f"- **Job ID**: {job_data.get('job_id', 'unknown')}\n")
             f.write(f"- **Posted Date**: {job_data.get('posted_date', 'N/A')}\n")
-            f.write(f"- **Category**: {job_data.get('category', 'N/A')}\n")
+            f.write(f"- **Category**: {job_data.get('category', 'all')}\n")
             f.write(f"- **Scraped At**: {job_data.get('scraped_at', 'N/A')}\n")
             f.write(f"- **URL**: <{job_data.get('url', 'N/A')}>\n\n")
-            f.write(f"### Description\n\n{job_data.get('description', 'N/A')}\n\n")
-            f.write(f"### Requirements\n\n{job_data.get('requirements', 'N/A')}\n\n")
-            f.write(f"### Qualifications\n\n{job_data.get('qualifications', 'N/A')}\n\n")
+            f.write(f"### Description\n\n{job_data.get('description', 'Not provided')}\n\n")
+            f.write(f"### Requirements\n\n{job_data.get('requirements', 'Not provided')}\n\n")
+            f.write(f"### Qualifications\n\n{job_data.get('qualifications', 'Not provided')}\n\n")
             f.write("---\n\n")
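Note on the markdown log above: the header is written only when the file is missing or empty, so repeated runs keep appending job entries under a single title. A minimal standalone sketch of that append-once-header pattern (the path and entry format here are illustrative, not taken from the diff):

import os
from datetime import datetime

def append_job_entry(filepath: str, title: str) -> None:
    # Emit the document header only on first use (file absent or empty).
    write_header = not os.path.exists(filepath) or os.path.getsize(filepath) == 0
    with open(filepath, "a", encoding="utf-8") as f:
        if write_header:
            f.write(f"# Crypto Jobs - {datetime.now():%Y-%m-%d %H:%M:%S}\n\n")
        f.write(f"## Job: {title}\n\n---\n\n")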
20 main.py
@@ -1,3 +1,4 @@
 from scraping_engine import FingerprintScrapingEngine
 from scraper import CryptoJobScraper  # Updated class name
 import os
@@ -20,16 +21,15 @@ async def main():
     scraper = CryptoJobScraper(engine, human_speed=1.3, user_request="Extract title, company, location, description, requirements, qualifications, nature of work, and salary")
 
     job_titles = [
-        "Blockchain Engineer",
-        "Smart Contract Developer",
-        "DeFi Analyst",
-        "Web3 Developer",
-        "Crypto Researcher",
-        "Solidity Developer",
-        "Protocol Engineer",
-        "Tokenomics Specialist",
-        "Zero-Knowledge Proof Engineer",
-        "Crypto Compliance Officer"
+        "Customer Support",
+        "Design",
+        "Engineering",
+        "Finance",
+        "Marketing",
+        "Operations",
+        "Product",
+        "Sales"
     ]
 
     while True:
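Note: the list now holds the job board's category names (Engineering, Sales, and so on) instead of individual role keywords, so each pass covers a whole category. The surrounding main() is not shown in this hunk; a hypothetical driver that consumes the list could look like:

import asyncio

async def run_categories(scraper, categories):
    # Scrape each site category in turn; scrape_jobs treats the
    # keyword string as "query[, location]".
    for category in categories:
        await scraper.scrape_jobs(search_keywords=category, max_pages=1)

# asyncio.run(run_categories(scraper, ["Engineering", "Sales"]))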
1 rippling.csv (new file)
@@ -0,0 +1 @@
+url,timestamp

1324 ripplingcompanies.csv (new file)
File diff suppressed because it is too large
426 scraper.py
@@ -6,11 +6,13 @@ from playwright.async_api import async_playwright, TimeoutError as PlaywrightTim
 from browserforge.injectors.playwright import AsyncNewContext
 from llm_agent import LLMJobRefiner
 import re
-from fetcher import StealthyFetcher
 from datetime import datetime
 import json
 import redis
+from urllib.parse import urlparse
+import hashlib
+import csv
+import os
 
 class CryptoJobScraper:
     def __init__(
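Note: the dropped StealthyFetcher import matches the removal of fetcher-based page loads later in this diff (job pages are now opened directly on the Playwright context), while urlparse, hashlib, csv, and os support the new ATS-detection and logging helpers. The hashlib import feeds a deterministic fallback job ID; a standalone sketch of that scheme:

import hashlib

def fallback_job_id(url: str) -> str:
    # 12 hex chars derived from the URL, used when no numeric ID
    # can be parsed out of the path (mirrors the diff's "job_" prefix).
    return "job_" + hashlib.md5(url.encode()).hexdigest()[:12]

print(fallback_job_id("https://cryptocurrencyjobs.co/engineering/example-role"))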
@@ -27,6 +29,29 @@ class CryptoJobScraper:
         self.llm_agent = LLMJobRefiner()
         self.redis_client = redis.Redis(host='localhost', port=6379, db=0, decode_responses=True)
 
+        self.FORBIDDEN_ATS_DOMAINS = [
+            'ashby', 'ashbyhq',
+            'greenhouse', 'boards.greenhouse.io',
+            'gem', 'gem.com',
+            'rippling',
+            'myworkday', 'myworkdayjobs',
+            'smartrecruiters',
+            'workable',
+            'lever', 'jobs.lever.co',
+            'linkedin.com'  # ✅ Added LinkedIn
+        ]
+
+        self.INVALID_CONTENT_PHRASES = [
+            "invalid job url",
+            "cookie consent",
+            "privacy policy",
+            "not a valid job",
+            "job not found",
+            "page not found",
+            "The requested job post could not be found. It may have been removed.",
+            "this page does not contain a job description"
+        ]
+
     async def _human_click(self, page, element, wait_after: bool = True):
         if not element:
             return False
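Note: _is_forbidden_ats_url (added later in this diff) matches these entries as plain substrings of the whole URL, so short tokens such as 'gem' or 'lever' can also flag unrelated URLs that merely contain those letters. A quick self-contained check of that behavior:

FORBIDDEN_ATS_DOMAINS = ['ashby', 'gem', 'lever']

def is_forbidden(url: str) -> bool:
    u = url.lower()
    return any(domain in u for domain in FORBIDDEN_ATS_DOMAINS)

print(is_forbidden("https://jobs.ashbyhq.com/stellar/abc"))         # True, intended
print(is_forbidden("https://example.com/project-management-lead"))  # True: 'gem' inside 'management'

Matching on the parsed hostname (urlparse(url).netloc) rather than the full URL string would avoid such false positives.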
@@ -55,60 +80,127 @@ class CryptoJobScraper:
         matches = sum(1 for kw in keyword_list if kw in title_lower)
         return matches / len(keyword_list) if keyword_list else 0.0
 
-    async def _scrape_jobs_from_current_page(self, page, search_keywords: str, seen_job_ids, all_job_links):
-        current_links = await page.query_selector_all("a[href*='/job/']")
-        new_jobs = 0
-
-        for link in current_links:
-            href = await link.get_attribute("href")
-            if not href or not href.startswith("http"):
-                href = "https://cryptocurrencyjobs.co" + href
-            job_id = href.split("/")[-1] if href.endswith("/") else href.split("/")[-1]
-
-            if job_id and job_id not in seen_job_ids:
-                title_element = await link.query_selector("h3, .job-title")
-                title = (await title_element.inner_text()) if title_element else "Unknown Title"
-                match_percentage = self._calculate_keyword_match(title, search_keywords)
-
-                if match_percentage >= 0.5:  # Lower threshold than LinkedIn
-                    seen_job_ids.add(job_id)
-                    all_job_links.append((href, title))
-                    new_jobs += 1
-                else:
-                    print(f" ⚠️ Skipping job due to low keyword match: {title[:50]}... (match: {match_percentage:.2%})")
-        return new_jobs
-
-    async def _handle_pagination(self, page, search_keywords: str, seen_job_ids, all_job_links):
-        current_page = 1
-        while True:
-            print(f"📄 Processing page {current_page}")
-            new_jobs = await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links)
-            print(f" ➕ Found {new_jobs} new job(s) (total: {len(all_job_links)})")
-
-            next_btn = await page.query_selector('a[rel="next"]')
-            if next_btn:
-                next_url = await next_btn.get_attribute("href")
-                if next_url and not next_url.startswith("http"):
-                    next_url = "https://cryptocurrencyjobs.co" + next_url
-                await page.goto(next_url, timeout=120000)
-                await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)
-                current_page += 1
-            else:
-                print("🔚 No 'Next' page — stopping pagination.")
-                break
-
-    async def _extract_job_posted_date(self, page) -> str:
-        try:
-            date_element = await page.query_selector(".job-posted-date, .job-date, time")
-            if date_element:
-                date_text = await date_element.inner_text()
-                if "Today" in date_text:
-                    return datetime.now().strftime("%m/%d/%y")
-                elif "Yesterday" in date_text:
-                    yesterday = datetime.now().replace(day=datetime.now().day - 1)
-                    return yesterday.strftime("%m/%d/%y")
-                else:
-                    return datetime.now().strftime("%m/%d/%y")
-        except:
-            pass
-        return datetime.now().strftime("%m/%d/%y")
+    async def _extract_job_title_from_card(self, card) -> str:
+        try:
+            title_selectors = [
+                'h3', 'h2', 'h4',
+                'strong', 'span'
+            ]
+            for selector in title_selectors:
+                title_element = await card.query_selector(selector)
+                if title_element:
+                    title_text = await title_element.inner_text()
+                    if title_text and len(title_text.strip()) > 3:
+                        return title_text.strip()
+
+            card_text = await card.inner_text()
+            lines = [line.strip() for line in card_text.split('\n') if line.strip()]
+            if lines:
+                for line in lines:
+                    if len(line) > 5 and not any(skip in line.lower() for skip in ['today', 'featured', 'yesterday', 'company', 'location']):
+                        return line
+            return "Unknown Title"
+        except:
+            return "Unknown Title"
+
+    async def _collect_job_elements_from_page(self, page, search_keywords: str, seen_slugs):
+        job_cards = []
+        job_found = False
+
+        await asyncio.sleep(3 * self.human_speed)
+
+        try:
+            await page.wait_for_selector('a[href^="/"][href*="-"]', timeout=60000)
+            candidates = await page.query_selector_all('a[href^="/"][href*="-"]')
+
+            for link in candidates:
+                href = await link.get_attribute("href") or ""
+                href = href.rstrip('/')
+                if not href or len(href.split('/')) != 3:
+                    continue
+                if '-' not in href.split('/')[-1]:
+                    continue
+                slug = href.split('/')[-1]
+                if len(slug) < 8 or slug.startswith(('login', 'signup', 'about', 'terms')):
+                    continue
+
+                full_url = "https://cryptocurrencyjobs.co" + href if not href.startswith('http') else href
+                if slug in seen_slugs:
+                    continue
+
+                title = await self._extract_job_title_from_card(link)
+                if not title or title == "Unknown Title":
+                    title = slug.replace('-', ' ').title()
+
+                match_percentage = self._calculate_keyword_match(title, search_keywords)
+                if match_percentage >= 0.4 or not search_keywords.strip():
+                    seen_slugs.add(slug)
+                    job_cards.append((full_url, title, link))
+                    job_found = True
+
+            print(f" ✅ Collected {len(job_cards)} valid job links after filtering ({len(candidates)} raw candidates).")
+
+        except Exception as e:
+            print(f" ⚠️ Error collecting job cards: {e}")
+
+        if not job_found:
+            print(" ❌ No valid job listings passed filters.")
+
+        return job_cards
+
+    async def _handle_pagination_and_collect_all(self, page, search_keywords: str, seen_slugs):
+        all_job_elements = []
+        scroll_attempt = 0
+        max_scrolls = 40
+        prev_count = 0
+
+        while scroll_attempt < max_scrolls:
+            print(f" Scroll attempt {scroll_attempt + 1} | Current total jobs: {len(all_job_elements)}")
+
+            page_elements = await self._collect_job_elements_from_page(page, search_keywords, seen_slugs)
+            all_job_elements.extend(page_elements)
+
+            current_count = len(all_job_elements)
+
+            if current_count == prev_count and scroll_attempt > 3:
+                print(" 🔚 No new jobs after several scrolls → assuming end of list.")
+                break
+
+            prev_count = current_count
+
+            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+            await asyncio.sleep(random.uniform(2.5, 5.5) * self.human_speed)
+
+            try:
+                load_more = await page.query_selector(
+                    'button:has-text("Load more"), button:has-text("More"), div[role="button"]:has-text("Load"), a:has-text("Load more")'
+                )
+                if load_more:
+                    print(" Found 'Load more' button → clicking...")
+                    await self._human_click(page, load_more)
+                    await asyncio.sleep(random.uniform(3.0, 6.0) * self.human_speed)
+            except:
+                pass
+
+            scroll_attempt += 1
+
+        print(f" Finished scrolling → collected {len(all_job_elements)} unique job links.")
+        return all_job_elements
+
+    async def _extract_job_posted_date_from_card(self, card) -> str:
+        try:
+            card_text = await card.inner_text()
+            if "Today" in card_text:
+                return datetime.now().strftime("%m/%d/%y")
+            elif "Yesterday" in card_text:
+                from datetime import timedelta
+                return (datetime.now() - timedelta(days=1)).strftime("%m/%d/%y")
+            else:
+                match = re.search(r'(\d+)d', card_text)
+                if match:
+                    days = int(match.group(1))
+                    from datetime import timedelta
+                    return (datetime.now() - timedelta(days=days)).strftime("%m/%d/%y")
+        except:
+            pass
+        return datetime.now().strftime("%m/%d/%y")
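Note: the card-based date parser is also a bug fix. The removed _extract_job_posted_date computed yesterday with datetime.now().replace(day=datetime.now().day - 1), which raises ValueError on the first day of a month; the replacement uses timedelta and additionally maps "Nd" tokens to concrete dates. A standalone sketch of the same mapping:

import re
from datetime import datetime, timedelta

def posted_date_from_text(card_text: str) -> str:
    # Relative tokens ("Today", "Yesterday", "3d") -> MM/DD/YY.
    if "Today" in card_text:
        return datetime.now().strftime("%m/%d/%y")
    if "Yesterday" in card_text:
        return (datetime.now() - timedelta(days=1)).strftime("%m/%d/%y")
    match = re.search(r'(\d+)d', card_text)
    if match:
        return (datetime.now() - timedelta(days=int(match.group(1)))).strftime("%m/%d/%y")
    return datetime.now().strftime("%m/%d/%y")

print(posted_date_from_text("Featured · 3d ago"))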
@@ -126,15 +218,103 @@ class CryptoJobScraper:
         except Exception as e:
             print(f" ❌ Failed to add job to Redis cache: {str(e)}")
 
+    async def _is_forbidden_ats_url(self, url: str) -> bool:
+        url_lower = url.lower()
+        return any(domain in url_lower for domain in self.FORBIDDEN_ATS_DOMAINS)
+
+    def _get_ats_platform_name(self, url: str) -> str:
+        """Return canonical ATS name based on URL (e.g., 'ashby', 'greenhouse')"""
+        url_lower = url.lower()
+
+        # Order matters: more specific first
+        if 'boards.greenhouse.io' in url_lower:
+            return 'greenhouse'
+        elif 'jobs.lever.co' in url_lower:
+            return 'lever'
+        elif 'myworkdayjobs' in url_lower or 'myworkday' in url_lower:
+            return 'workday'
+        elif 'linkedin.com' in url_lower:
+            return 'linkedin'
+        elif 'ashbyhq.com' in url_lower or 'ashby' in url_lower:
+            return 'ashby'
+        elif 'gem.com' in url_lower or 'gem' in url_lower:
+            return 'gem'
+        elif 'rippling' in url_lower:
+            return 'rippling'
+        elif 'smartrecruiters' in url_lower:
+            return 'smartrecruiters'
+        elif 'workable' in url_lower:
+            return 'workable'
+        else:
+            # Fallback: extract domain part
+            try:
+                parsed = urlparse(url)
+                domain = parsed.netloc.lower()
+                for forbidden in self.FORBIDDEN_ATS_DOMAINS:
+                    if forbidden in domain:
+                        return forbidden.split('.')[0] if '.' in forbidden else forbidden
+            except:
+                pass
+            return 'forbidden_ats'
+
+    def _log_forbidden_ats_url(self, url: str, platform: str):
+        """Append forbidden URL to {platform}.csv"""
+        filename = f"{platform}.csv"
+        file_exists = os.path.isfile(filename)
+        with open(filename, 'a', newline='', encoding='utf-8') as f:
+            writer = csv.writer(f)
+            if not file_exists:
+                writer.writerow(['url', 'timestamp'])
+            writer.writerow([url, datetime.now().isoformat()])
+        print(f" 📥 Logged forbidden ATS URL to {filename}: {url}")
+
+    async def _is_invalid_job_page(self, page_content: str) -> bool:
+        content_lower = page_content.lower()
+        return any(phrase in content_lower for phrase in self.INVALID_CONTENT_PHRASES)
+
+    def _extract_job_id_from_url(self, url: str) -> Optional[str]:
+        try:
+            parsed = urlparse(url)
+            path_parts = [p for p in parsed.path.split('/') if p]
+            if not path_parts:
+                return None
+
+            candidate = path_parts[-1]
+            candidate = re.split(r'[?#]', candidate)[0]
+            candidate = re.sub(r'\.html?$', '', candidate)
+
+            if not candidate or not any(c.isdigit() for c in candidate):
+                return None
+
+            if re.search(r'[A-Za-z]{6,}\s', candidate):
+                return None
+
+            return candidate
+        except:
+            return None
+
     async def scrape_jobs(
         self,
         search_keywords: Optional[str],
         max_pages: int = 1,
         credentials: Optional[Dict] = None
     ):
-        # cryptocurrencyjobs.co uses URL params differently
-        encoded_keywords = search_keywords.replace(" ", "%20")
-        search_url = f"https://cryptocurrencyjobs.co/?q={encoded_keywords}"
+        query = ""
+        location = ""
+        if search_keywords and search_keywords.strip():
+            parts = search_keywords.split(',', 1)
+            query = parts[0].strip()
+            if len(parts) > 1:
+                location = parts[1].strip()
+
+        clean_query = query.replace(' ', '+')
+        clean_location = location.replace(' ', '+')
+
+        search_url = "https://cryptocurrencyjobs.co/"
+        if clean_query:
+            search_url += f"?query={clean_query}"
+        if clean_location:
+            search_url += f"&location={clean_location}"
 
         profile = self.engine._select_profile()
         renderer = random.choice(self.engine.common_renderers[self.engine.os])
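Note: search keywords are now split on the first comma into a query part and an optional location part. A standalone sketch of the resulting URL construction, mirroring the diff:

def build_search_url(search_keywords: str) -> str:
    query, location = "", ""
    if search_keywords and search_keywords.strip():
        parts = search_keywords.split(',', 1)
        query = parts[0].strip()
        if len(parts) > 1:
            location = parts[1].strip()
    url = "https://cryptocurrencyjobs.co/"
    if query:
        url += f"?query={query.replace(' ', '+')}"
    if location:
        url += f"&location={location.replace(' ', '+')}"
    return url

print(build_search_url("Engineering, Remote"))
# https://cryptocurrencyjobs.co/?query=Engineering&location=Remote

One edge case carried over from the diff: a keyword string that is only a location (e.g. ", Remote") yields "&location=Remote" with no leading "?".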
@@ -156,46 +336,103 @@
         await context.add_init_script(spoof_script)
 
         page = await context.new_page()
-
-        # Fetch main search page
-        print(f"🔍 Searching for: {search_keywords}")
-        await page.goto(search_url, wait_until='load', timeout=120000)
+        print(f"🔍 Searching for: {search_keywords or 'all jobs'}")
+        print(f" 🔗 URL: {search_url}")
+        await page.goto(search_url, wait_until='networkidle', timeout=120000)
         await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)
 
-        all_job_links = []
-        seen_job_ids = set()
-
-        print("🔄 Collecting job links from search results...")
-        await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links)
-        await self._handle_pagination(page, search_keywords, seen_job_ids, all_job_links)
-
-        print(f"✅ Collected {len(all_job_links)} unique job links.")
+        try:
+            await page.wait_for_selector("a[href^='/'][href*='-']", timeout=10000)
+        except:
+            print(" ⚠️ No job links found initially, waiting longer...")
+            await asyncio.sleep(5 * self.human_speed)
+
+        seen_slugs = set()
+        all_job_elements = await self._handle_pagination_and_collect_all(page, search_keywords, seen_slugs)
+        print(f"✅ Collected {len(all_job_elements)} unique job links.")
 
         scraped_count = 0
-        for idx, (href, title) in enumerate(all_job_links):
+        for idx, (href, title, job_element) in enumerate(all_job_elements):
+            job_detail_page = None
+            apply_page = None
+            skip_job = False
+            final_scrape_url = None
             try:
-                full_url = href
-                print(f" → Opening job {idx+1}/{len(all_job_links)}: {full_url}")
-
-                fetcher = StealthyFetcher(self.engine, browser, context)
-                job_page = await fetcher.fetch_url(full_url, wait_for_selector="h1")
-                if not job_page:
-                    print(f" ❌ Failed to fetch job page {full_url}")
-                    await self._add_job_to_redis_cache(full_url, full_url.split("/")[-1], "fetch_failure")
-                    self.engine.report_outcome("fetch_failure", url=full_url)
-                    continue
-
-                posted_date = await self._extract_job_posted_date(job_page)
-
-                await self.engine._human_like_scroll(job_page)
-                await asyncio.sleep(2 * self.human_speed)
-                page_content = await self._extract_page_content_for_llm(job_page)
-
-                job_id = full_url.split("/")[-1] if full_url.split("/")[-1] else "unknown"
+                print(f" → Processing job {idx+1}/{len(all_job_elements)}: {title}")
+
+                posted_date = await self._extract_job_posted_date_from_card(job_element)
+
+                job_detail_page = await context.new_page()
+                await job_detail_page.goto(href, wait_until='networkidle', timeout=60000)
+                await asyncio.sleep(2 * self.human_speed)
+
+                page_content = await job_detail_page.content()
+                if await self._is_invalid_job_page(page_content):
+                    print(" 🚫 Page contains invalid content → skipping.")
+                    await job_detail_page.close()
+                    continue
+
+                apply_clicked = False
+                apply_selectors = [
+                    'a[href*="apply"], a:text("Apply"), a:text("Apply Now"), a:text("Apply here")',
+                    'button:text("Apply"), button:has-text("Apply")',
+                    '[data-testid="apply-button"], [aria-label*="apply"], [role="button"]:has-text("Apply")',
+                    'a.btn-apply, .apply-button, .apply-link, a:has-text("Apply")',
+                    'a[rel="noopener"]:has-text("Apply")',
+                ]
+
+                for sel in apply_selectors:
+                    apply_elem = await job_detail_page.query_selector(sel)
+                    if apply_elem:
+                        print(f" 🔗 Found Apply element with selector: {sel}")
+                        await self._human_click(job_detail_page, apply_elem, wait_after=True)
+                        apply_clicked = True
+                        break
+
+                apply_page = job_detail_page
+
+                if apply_clicked:
+                    await asyncio.sleep(random.uniform(3.0, 6.0) * self.human_speed)
+                    pages = context.pages
+                    new_pages = [p for p in pages if p != job_detail_page and p.url != "about:blank"]
+
+                    if new_pages:
+                        candidate_page = new_pages[-1]
+                        new_url = candidate_page.url.strip()
+                        print(f" New tab opened: {new_url}")
+
+                        if new_url and await self._is_forbidden_ats_url(new_url):
+                            platform = self._get_ats_platform_name(new_url)
+                            self._log_forbidden_ats_url(new_url, platform)
+                            if candidate_page != job_detail_page:
+                                await candidate_page.close()
+                            await job_detail_page.close()
+                            skip_job = True
+                        else:
+                            apply_page = candidate_page
+                    else:
+                        print(" No new tab → using original page.")
+
+                if skip_job:
+                    continue
+
+                final_scrape_url = apply_page.url
+
+                page_content = await self._extract_page_content_for_llm(apply_page)
+                if await self._is_invalid_job_page(page_content):
+                    print(" 🚫 Final page contains invalid content → skipping.")
+                    if apply_page != job_detail_page:
+                        await apply_page.close()
+                    await job_detail_page.close()
+                    continue
+
+                job_id = self._extract_job_id_from_url(final_scrape_url)
+                if not job_id:
+                    job_id = "job_" + hashlib.md5(final_scrape_url.encode()).hexdigest()[:12]
 
                 raw_data = {
                     "page_content": page_content,
-                    "url": full_url,
+                    "url": final_scrape_url,
                     "job_id": job_id,
                     "search_keywords": search_keywords,
                     "posted_date": posted_date
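Note: after clicking Apply, the diff detects a new tab by comparing context.pages against the original page following a fixed sleep. Playwright also exposes an event-based wait, BrowserContext.expect_page(), which avoids the sleep; a sketch of that alternative (not what the diff uses; click_and_capture_tab and its arguments are hypothetical):

async def click_and_capture_tab(context, page, selector: str):
    # Click `selector` on `page`; return the newly opened tab if one
    # appears within the timeout, else fall back to the original page.
    element = await page.query_selector(selector)
    if element is None:
        return page
    try:
        async with context.expect_page(timeout=10_000) as popup_info:
            await element.click()
        return await popup_info.value
    except Exception:
        return page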
@@ -210,44 +447,45 @@
                         if field == 'job_id':
                             refined_data[field] = job_id
                         elif field == 'url':
-                            refined_data[field] = full_url
+                            refined_data[field] = final_scrape_url
                         elif field == 'company_name':
                             refined_data[field] = "Unknown Company"
 
                     refined_data['scraped_at'] = datetime.now().isoformat()
-                    refined_data['category'] = search_keywords
+                    refined_data['category'] = search_keywords or "all"
                     refined_data['posted_date'] = posted_date
-                    await self.llm_agent.save_job_data(refined_data, search_keywords)
+                    await self.llm_agent.save_job_data(refined_data, search_keywords or "all")
                     scraped_count += 1
-                    print(f" ✅ Scraped and refined: {refined_data['title'][:50]}...")
-                    self.engine.report_outcome("success", url=raw_data["url"])
+                    print(f" ✅ Scraped: {refined_data['title'][:50]}... (Job ID: {job_id})")
+                    self.engine.report_outcome("success", url=final_scrape_url)
                 else:
-                    print(f" 🟡 Could not extract meaningful data from: {full_url}")
-                    await self._add_job_to_redis_cache(full_url, job_id, "llm_failure")
-                    self.engine.report_outcome("llm_failure", url=raw_data["url"])
+                    print(f" 🟡 Could not extract meaningful data from: {final_scrape_url}")
+                    await self._add_job_to_redis_cache(final_scrape_url, job_id, "llm_failure")
+                    self.engine.report_outcome("llm_failure", url=final_scrape_url)
 
-                await job_page.close()
+                if apply_page != job_detail_page and not apply_page.is_closed():
+                    await apply_page.close()
+                if job_detail_page and not job_detail_page.is_closed():
+                    await job_detail_page.close()
 
             except Exception as e:
                 error_msg = str(e)[:100]
                 print(f" ⚠️ Failed on job {idx+1}: {error_msg}")
-                job_id = full_url.split("/")[-1] if 'full_url' in locals() else "unknown"
-                job_url = full_url if 'full_url' in locals() else "unknown"
-                await self._add_job_to_redis_cache(job_url, job_id, f"exception: {error_msg}")
-                if 'job_page' in locals() and job_page:
-                    await job_page.close()
+                job_id_for_log = "unknown"
+                if 'final_scrape_url' in locals() and final_scrape_url:
+                    job_id_for_log = "job_" + hashlib.md5(final_scrape_url.encode()).hexdigest()[:12]
+                await self._add_job_to_redis_cache(href, job_id_for_log, f"exception: {error_msg}")
+                if job_detail_page and not job_detail_page.is_closed():
+                    await job_detail_page.close()
+                if 'apply_page' in locals() and apply_page and apply_page != job_detail_page and not apply_page.is_closed():
+                    await apply_page.close()
                 continue
-
-            finally:
-                print(" ↩️ Returning to search results...")
-                await page.goto(search_url, timeout=120000)
-                await asyncio.sleep(4 * self.human_speed)
 
         await browser.close()
 
         if scraped_count > 0:
             self.engine.report_outcome("success")
-            print(f"✅ Completed! Processed {scraped_count} jobs for '{search_keywords}' based on request '{self.user_request}'.")
+            print(f"✅ Completed! Processed {scraped_count} jobs for '{search_keywords or 'all jobs'}'.")
         else:
             self.engine.report_outcome("scraping_error")
             print("⚠️ No jobs processed successfully.")
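Note: the success and exception paths above now repeat the same guarded close sequence (skip pages that are None, already closed, or the same object). If that pattern grows, it could be factored into one helper; a hypothetical consolidation:

async def close_quietly(*pages) -> None:
    # Close distinct, still-open Playwright pages; ignore close errors.
    seen = set()
    for p in pages:
        if p is None or id(p) in seen:
            continue
        seen.add(id(p))
        try:
            if not p.is_closed():
                await p.close()
        except Exception:
            pass

# await close_quietly(apply_page, job_detail_page)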
5 workable.csv (new file)
@@ -0,0 +1,5 @@
+url,timestamp
+https://apply.workable.com/thetie/j/C54DFC9985/?ref=cryptocurrencyjobs.co,2025-12-31T08:24:45.755671
+https://apply.workable.com/thetie/j/C54DFC9985/?ref=cryptocurrencyjobs.co,2025-12-31T09:51:08.343642
+https://apply.workable.com/thetie/j/2745433865/?ref=cryptocurrencyjobs.co,2025-12-31T09:51:28.331543
+https://apply.workable.com/thetie/j/1A6C8F2913/?ref=cryptocurrencyjobs.co,2025-12-31T11:22:54.623723

3548 workablecompanies.csv (new file)
File diff suppressed because it is too large
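Note: the ATS logs are append-only, so the same job URL can recur (the C54DFC9985 posting appears twice in workable.csv above, logged on separate passes). A minimal dedup when consuming a log, assuming only the url and timestamp columns shown:

import csv

def unique_logged_urls(path: str) -> set:
    # Collapse repeated rows down to the distinct job URLs.
    with open(path, newline='', encoding='utf-8') as f:
        return {row['url'] for row in csv.DictReader(f)}

# unique_logged_urls("workable.csv")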
1 workday.csv (new file)
@@ -0,0 +1 @@
+url,timestamp

1045 workdaycompanies.csv (new file)
File diff suppressed because it is too large