Compare commits
No commits in common. "b0e90972b1f5cd9883a38dac4cfb94eb11709c42" and "38ef08c734cdee20dce2cba6dd056783dafc02f6" have entirely different histories.
b0e90972b1 ... 38ef08c734
ashby.csv (10 lines changed)
@@ -1,10 +0,0 @@
-url,timestamp
-https://jobs.ashbyhq.com/stellar/a8377cf4-280b-4eb3-ac44-a4c9020c2eaf?utm_source=cryptocurrencyjobs.co,2025-12-31T08:32:17.821505
-https://jobs.ashbyhq.com/artemisanalytics/5f61b6c6-147c-4707-9003-a9632455b984?utm_source=cryptocurrencyjobs.co,2025-12-31T08:51:57.190172
-https://jobs.ashbyhq.com/lightning/2d77b496-ab0d-4e54-bcf8-33260d1bab6b?utm_source=cryptocurrencyjobs.co,2025-12-31T09:07:09.491831
-https://jobs.ashbyhq.com/Braiins/cee9cf74-6049-4dab-aae7-96bef0082689?utm_source=cryptocurrencyjobs.co,2025-12-31T09:35:28.137181
-https://jobs.ashbyhq.com/blockstream/80ebab98-0039-48bf-86d9-9a2a7962b005?utm_source=cryptocurrencyjobs.co,2025-12-31T10:21:19.253356
-https://jobs.ashbyhq.com/dynamic/fde8a9ff-9701-485f-a8d1-e717c170f215?utm_source=cryptocurrencyjobs.co,2025-12-31T10:25:55.141543
-https://jobs.ashbyhq.com/ether.fi/6eb1e350-71ce-47f7-a363-3fa3c521dacb?utm_source=cryptocurrencyjobs.co,2025-12-31T10:44:35.913725
-https://chainlinklabs.com/open-roles?ashby_jid=112a76d3-4dfd-4eea-828c-41465760b3ef&utm_source=ccj,2025-12-31T10:49:07.453900
-https://jobs.ashbyhq.com/stellar/cdad9af1-9e64-4fd4-8e2c-f87389f1dd16?utm_source=cryptocurrencyjobs.co,2025-12-31T11:13:58.119967
ashbycompanies.csv (1591 lines changed)
File diff suppressed because it is too large.
comparator.py (166 lines changed)
@@ -1,166 +0,0 @@
-import csv
-import os
-from urllib.parse import urlparse
-
-# Define platform mappings: (input_file, companies_file, platform_name)
-platforms = [
-    ("ashby.csv", "ashbycompanies.csv", "ashby"),
-    ("gem.csv", "gemcompanies.csv", "gem"),
-    ("greenhouse.csv", "greenhousecompanies.csv", "greenhouse"),
-    ("lever.csv", "levercompanies.csv", "lever"),
-    ("rippling.csv", "ripplingcompanies.csv", "rippling"),
-    ("workable.csv", "workablecompanies.csv", "workable"),
-    ("workday.csv", "workdaycompanies.csv", "workday"),
-]
-
-
-def normalize_url(platform, url):
-    """Normalize URL to a company identifier based on platform."""
-    if not url:
-        return None
-    try:
-        parsed = urlparse(url.lower().strip())
-        netloc = parsed.netloc
-        path = parsed.path
-
-        if platform == "ashby":
-            # https://jobs.ashbyhq.com/company_slug/...
-            if "ashbyhq.com" in netloc:
-                parts = [p for p in path.split('/') if p]
-                return parts[0] if parts else None
-
-        elif platform == "greenhouse":
-            # https://boards.greenhouse.io/company_slug/...
-            if "greenhouse.io" in netloc:
-                parts = [p for p in path.split('/') if p]
-                if len(parts) >= 2 and parts[0] == "boards":
-                    return parts[1]
-                elif len(parts) >= 1:
-                    return parts[0]
-            return None
-
-        elif platform == "lever":
-            # https://jobs.lever.co/company_slug/...
-            if "lever.co" in netloc:
-                parts = [p for p in path.split('/') if p]
-                return parts[0] if parts else None
-
-        elif platform == "workable":
-            # https://apply.workable.com/company_slug/...
-            if "workable.com" in netloc:
-                parts = [p for p in path.split('/') if p]
-                # Usually: /company_slug/j/jobid/ → take first non-'j' segment
-                for part in parts:
-                    if part != 'j' and len(part) > 2:
-                        return part
-                return parts[0] if parts else None
-
-        elif platform == "workday":
-            # https://company.workday.com/... → company = subdomain
-            if "myworkdayjobs.com" in netloc or "wd" in netloc:
-                # Extract subdomain before main domain
-                subdomain = netloc.split('.')[0]
-                if subdomain and subdomain not in ['www', 'jobs', 'apply', '']:
-                    return subdomain
-            # Fallback: look for company in path (rare)
-            parts = [p for p in path.split('/') if p]
-            if parts:
-                return parts[0]
-            return None
-
-        elif platform == "gem":
-            # https://gem.com/company/... or https://www.gem.com/careers/company/...
-            if "gem.com" in netloc:
-                parts = [p for p in path.split('/') if p]
-                # Often: /company-slug or /careers/company-slug
-                for i, part in enumerate(parts):
-                    if part in ['company', 'careers', 'jobs']:
-                        if i + 1 < len(parts):
-                            return parts[i + 1]
-                return parts[0] if parts else None
-
-        elif platform == "rippling":
-            # Rippling uses generic domain; hard to extract company
-            # Best effort: use full domain + first path segment
-            if "rippling.com" in netloc:
-                parts = [p for p in path.split('/') if p]
-                if parts:
-                    return f"{netloc}/{parts[0]}"
-                return netloc
-
-        # Fallback: return full URL if unrecognized
-        return url
-
-    except Exception:
-        return url
-
-
-def read_company_signatures(filepath, platform):
-    """Read and normalize company identifiers from companies CSV."""
-    if not os.path.exists(filepath):
-        return set()
-    signatures = set()
-    with open(filepath, 'r', encoding='utf-8') as f:
-        reader = csv.DictReader(f)
-        for row in reader:
-            url = row.get('url', '').strip()
-            if url:
-                sig = normalize_url(platform, url)
-                if sig:
-                    signatures.add(sig)
-    return signatures
-
-
-def filter_csv_by_signatures(input_file, excluded_signatures, platform):
-    """Keep only rows whose normalized URL is NOT in excluded_signatures."""
-    if not os.path.exists(input_file):
-        return [], None
-    kept_rows = []
-    with open(input_file, 'r', encoding='utf-8') as f:
-        reader = csv.DictReader(f)
-        fieldnames = reader.fieldnames
-        for row in reader:
-            url = row.get('url', '').strip()
-            if not url:
-                kept_rows.append(row)  # keep if no URL (shouldn't happen)
-                continue
-            sig = normalize_url(platform, url)
-            if sig not in excluded_signatures:
-                kept_rows.append(row)
-    return kept_rows, fieldnames
-
-
-def write_csv(filepath, rows, fieldnames):
-    """Write rows to CSV file."""
-    with open(filepath, 'w', newline='', encoding='utf-8') as f:
-        writer = csv.DictWriter(f, fieldnames=fieldnames)
-        writer.writeheader()
-        writer.writerows(rows)
-
-
-def main():
-    for input_file, companies_file, platform in platforms:
-        print(f"Processing {input_file} against {companies_file} using '{platform}' normalizer...")
-
-        # Step 1: Load and normalize known company signatures
-        known_signatures = read_company_signatures(companies_file, platform)
-        print(f" → Loaded {len(known_signatures)} known company signatures from {companies_file}")
-
-        # Step 2: Filter input file using signatures
-        kept_rows, fieldnames = filter_csv_by_signatures(input_file, known_signatures, platform)
-
-        # Step 3: Write back filtered data
-        if fieldnames:
-            write_csv(input_file, kept_rows, fieldnames)
-            print(f" → Kept {len(kept_rows)} new job URLs in {input_file}")
-        else:
-            if os.path.exists(input_file):
-                os.remove(input_file)
-                print(f" → {input_file} was empty or invalid — removed.")
-
-    print("\n✅ All platforms processed successfully.")
-
-
-if __name__ == "__main__":
-    main()
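For orientation, a minimal, self-contained sketch of the dedup flow that the deleted comparator.py implemented; only the "ashby" branch of normalize_url is reproduced here, and the sample URLs and signature set are illustrative:

    from urllib.parse import urlparse

    def ashby_signature(url):
        # Mirrors the "ashby" branch of normalize_url above:
        # https://jobs.ashbyhq.com/<company_slug>/<job_id> -> <company_slug>
        parsed = urlparse(url.lower().strip())
        if "ashbyhq.com" in parsed.netloc:
            parts = [p for p in parsed.path.split('/') if p]
            return parts[0] if parts else None
        return url  # unrecognized URLs fall through unchanged

    known_signatures = {"stellar", "lightning"}  # as if loaded from ashbycompanies.csv
    new_urls = [
        "https://jobs.ashbyhq.com/stellar/a8377cf4-280b-4eb3-ac44-a4c9020c2eaf",
        "https://jobs.ashbyhq.com/dynamic/fde8a9ff-9701-485f-a8d1-e717c170f215",
    ]
    kept = [u for u in new_urls if ashby_signature(u) not in known_signatures]
    print(kept)  # only the "dynamic" URL survives the filter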
@@ -27,7 +27,7 @@ class StealthyFetcher:
         if wait_for_selector:
             try:
-                await page.wait_for_selector(wait_for_selector, timeout=120000)
+                await page.wait_for_selector(wait_for_selector, timeout=40000)
             except PlaywrightTimeoutError:
                 print(f"Selector {wait_for_selector} not found immediately, continuing...")

@@ -88,7 +88,7 @@ class StealthyFetcher:
     async def _is_content_accessible(self, page: Page, wait_for_selector: Optional[str] = None) -> bool:
         if wait_for_selector:
             try:
-                await page.wait_for_selector(wait_for_selector, timeout=120000)
+                await page.wait_for_selector(wait_for_selector, timeout=40000)
                 return True
             except PlaywrightTimeoutError:
                 pass
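Both hunks above cut the same wait-and-continue pattern from 120 s to 40 s (Playwright timeouts are given in milliseconds). A minimal, self-contained sketch of that pattern; the URL and selector here are illustrative only:

    import asyncio
    from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError

    async def main():
        async with async_playwright() as pw:
            browser = await pw.chromium.launch()
            page = await browser.new_page()
            await page.goto("https://example.com")
            try:
                # 40000 ms = 40 s (was 120 s before this change)
                await page.wait_for_selector("h1", timeout=40000)
            except PlaywrightTimeoutError:
                pass  # non-fatal: scraping continues even if the selector never shows up
            await browser.close()

    asyncio.run(main())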
gemcompanies.csv (508 lines changed)
@@ -1,508 +0,0 @@
-name,url
-10Xconstruction Ai,https://jobs.gem.com/10xconstruction-ai
-11X Ai,https://jobs.gem.com/11x-ai
-43North,https://jobs.gem.com/43north
-8020 Consulting,https://jobs.gem.com/8020-consulting
-A16Z Speedrun,https://jobs.gem.com/a16z-speedrun
-Aarden Ai,https://jobs.gem.com/aarden-ai
-Accel,https://jobs.gem.com/accel
-Accelos,https://jobs.gem.com/accelos
-Acre,https://jobs.gem.com/acre
-Advancelevelllc Com,https://jobs.gem.com/advancelevelllc-com
-Agenta Ai,https://jobs.gem.com/agenta-ai
-Agentnoon,https://jobs.gem.com/agentnoon
-Agora,https://jobs.gem.com/agora
-Aionex Xyz,https://jobs.gem.com/aionex-xyz
-Aiphrodite Ai,https://jobs.gem.com/aiphrodite-ai
-Airframe,https://jobs.gem.com/airframe
-Airvet Com,https://jobs.gem.com/airvet-com
-Alex And Ani,https://jobs.gem.com/alex-and-ani
-Alinia Ai,https://jobs.gem.com/alinia-ai
-Alitheon,https://jobs.gem.com/alitheon
-Alpharun,https://jobs.gem.com/alpharun
-Altzero Xyz,https://jobs.gem.com/altzero-xyz
-Amya Agency,https://jobs.gem.com/amya-agency
-Andrenam,https://jobs.gem.com/andrenam
-Anysphere,https://jobs.gem.com/anysphere
-Aoniclife,https://jobs.gem.com/aoniclife
-Apartment List,https://jobs.gem.com/apartment-list
-Apella,https://jobs.gem.com/apella
-Apticore Io,https://jobs.gem.com/apticore-io
-Arlo,https://jobs.gem.com/arlo
-Ascenda Loyalty,https://jobs.gem.com/ascenda-loyalty
-Ascendarc,https://jobs.gem.com/ascendarc
-Astroforge Io,https://jobs.gem.com/astroforge-io
-Atla Ai Com,https://jobs.gem.com/atla-ai-com
-Atomica,https://jobs.gem.com/atomica
-Audicus,https://jobs.gem.com/audicus
-Aurelian Io,https://jobs.gem.com/aurelian-io
-Aureliussystems Us,https://jobs.gem.com/aureliussystems-us
-Autopilotbrand Com,https://jobs.gem.com/autopilotbrand-com
-Avoca,https://jobs.gem.com/avoca
-Avol,https://jobs.gem.com/avol
-Axonify,https://jobs.gem.com/axonify
-Backops Ai,https://jobs.gem.com/backops-ai
-Basalt Health,https://jobs.gem.com/basalt-health
-Baxter Aerospace,https://jobs.gem.com/baxter-aerospace
-Bead Ai,https://jobs.gem.com/bead-ai
-Benbase,https://jobs.gem.com/benbase
-Better Auth,https://jobs.gem.com/better-auth
-Betterbasket Ai,https://jobs.gem.com/betterbasket-ai
-Bigeye,https://jobs.gem.com/bigeye
-Bigpanda,https://jobs.gem.com/bigpanda
-Bikky,https://jobs.gem.com/bikky
-Bilt,https://jobs.gem.com/bilt
-Binarly,https://jobs.gem.com/binarly
-Biofire,https://jobs.gem.com/biofire
-Biorender,https://jobs.gem.com/biorender
-Biorender Inc Ats,https://jobs.gem.com/biorender-inc--ats
-Birdwood Therapeutics,https://jobs.gem.com/birdwood-therapeutics
-Black Ore,https://jobs.gem.com/black-ore
-Blaze Ai,https://jobs.gem.com/blaze-ai
-Blazetalent,https://jobs.gem.com/blazetalent
-Blend Inc,https://jobs.gem.com/blend-inc
-Blue J,https://jobs.gem.com/blue-j
-Bluejeanfinancial Com,https://jobs.gem.com/bluejeanfinancial-com
-Blueonion Ai,https://jobs.gem.com/blueonion-ai
-Blueprint,https://jobs.gem.com/blueprint
-Bluesky,https://jobs.gem.com/bluesky
-Blume Technologies,https://jobs.gem.com/blume-technologies
-Bohler ,https://jobs.gem.com/bohler-
-Bohler Engineering Gemats,https://jobs.gem.com/bohler-engineering-gemats
-Bolna,https://jobs.gem.com/bolna
-Bond Partners,https://jobs.gem.com/bond-partners
-Boost Robotics,https://jobs.gem.com/boost-robotics
-Boredm,https://jobs.gem.com/boredm
-Breadcrumb Ai,https://jobs.gem.com/breadcrumb-ai
-Breakline Ats,https://jobs.gem.com/breakline-ats
-Breakline Education,https://jobs.gem.com/breakline-education
-Brewbird,https://jobs.gem.com/brewbird
-Buildtrayd Com,https://jobs.gem.com/buildtrayd-com
-Bull Moose Xyz,https://jobs.gem.com/bull-moose-xyz
-Cadstrom Io,https://jobs.gem.com/cadstrom-io
-Caffelabs Com,https://jobs.gem.com/caffelabs-com
-Calaveras,https://jobs.gem.com/calaveras
-Canals,https://jobs.gem.com/canals
-Caplight Com,https://jobs.gem.com/caplight-com
-Carbon,https://jobs.gem.com/carbon
-Cardnexus,https://jobs.gem.com/cardnexus
-Careers,https://jobs.gem.com/careers
-Carry,https://jobs.gem.com/carry
-Caseflood Ai,https://jobs.gem.com/caseflood-ai
-Cellbyte,https://jobs.gem.com/cellbyte
-Chartahealth,https://jobs.gem.com/chartahealth
-Civrobotics Com,https://jobs.gem.com/civrobotics-com
-Clarity,https://jobs.gem.com/clarity
-Clearchecks Com Ats,https://jobs.gem.com/clearchecks-com-ats
-Clearesthealth Com,https://jobs.gem.com/clearesthealth-com
-Cloudanix Com,https://jobs.gem.com/cloudanix-com
-Cloudraft,https://jobs.gem.com/cloudraft
-Codegen,https://jobs.gem.com/codegen
-Codesignal,https://jobs.gem.com/codesignal
-Cognna,https://jobs.gem.com/cognna
-Cogram,https://jobs.gem.com/cogram
-Comfy,https://jobs.gem.com/comfy
-Conductorai,https://jobs.gem.com/conductorai
-Confida Ai,https://jobs.gem.com/confida-ai
-Context Wtf,https://jobs.gem.com/context-wtf
-Contour App,https://jobs.gem.com/contour-app
-Converge,https://jobs.gem.com/converge
-Coupa Software Inc Ats 1,https://jobs.gem.com/coupa-software-inc-ats-1
-Cox Exponential,https://jobs.gem.com/cox-exponential
-Crabi Robotics Com,https://jobs.gem.com/crabi-robotics-com
-Crackenagi,https://jobs.gem.com/crackenagi
-Create Talent Group,https://jobs.gem.com/create-talent-group
-Createdbyhumans Ai,https://jobs.gem.com/createdbyhumans-ai
-Credit Key,https://jobs.gem.com/credit-key
-Crosby,https://jobs.gem.com/crosby
-Curex Org,https://jobs.gem.com/curex-org
-Curiouscardinals Com,https://jobs.gem.com/curiouscardinals-com
-Cyvl,https://jobs.gem.com/cyvl
-D4M International,https://jobs.gem.com/d4m-international
-Dalus,https://jobs.gem.com/dalus
-Dash Fi,https://jobs.gem.com/dash-fi
-Data Masters,https://jobs.gem.com/data-masters
-Datacurve Ai,https://jobs.gem.com/datacurve-ai
-Dataday Technology Solutions,https://jobs.gem.com/dataday-technology-solutions
-Datagrid,https://jobs.gem.com/datagrid
-Dawn Media,https://jobs.gem.com/dawn-media
-Daxko,https://jobs.gem.com/daxko
-Deep Infra,https://jobs.gem.com/deep-infra
-Deliver,https://jobs.gem.com/deliver
-Detections Ai,https://jobs.gem.com/detections-ai
-Dianahr Ai,https://jobs.gem.com/dianahr-ai
-Distributed Spectrum,https://jobs.gem.com/distributed-spectrum
-Dlvrlog,https://jobs.gem.com/dlvrlog
-Doowii,https://jobs.gem.com/doowii
-Dragme,https://jobs.gem.com/dragme
-Dragonfly Careers,https://jobs.gem.com/dragonfly-careers
-Dropback,https://jobs.gem.com/dropback
-Durin,https://jobs.gem.com/durin
-Dydx,https://jobs.gem.com/dydx
-Eats2Seats,https://jobs.gem.com/eats2seats
-Echelon,https://jobs.gem.com/echelon
-Ecocart Io,https://jobs.gem.com/ecocart-io
-Edgetrace Ai,https://jobs.gem.com/edgetrace-ai
-Efference Ai,https://jobs.gem.com/efference-ai
-Elite Talent Consulting,https://jobs.gem.com/elite-talent-consulting
-Eliza,https://jobs.gem.com/eliza
-Elloe Ai,https://jobs.gem.com/elloe-ai
-Elo Ai,https://jobs.gem.com/elo-ai
-Emerge Career,https://jobs.gem.com/emerge-career
-Engineering Codified,https://jobs.gem.com/engineering--codified
-Entrusted Contracting,https://jobs.gem.com/entrusted-contracting
-Escargot Com,https://jobs.gem.com/escargot-com
-Everfit Io,https://jobs.gem.com/everfit-io
-Excelity Careers,https://jobs.gem.com/excelity-careers
-Exponent,https://jobs.gem.com/exponent
-Ezraailabs Tech,https://jobs.gem.com/ezraailabs-tech
-Fabric,https://jobs.gem.com/fabric
-Fabrichealth,https://jobs.gem.com/fabrichealth
-Fancypeople,https://jobs.gem.com/fancypeople
-Fanpierlabs Com,https://jobs.gem.com/fanpierlabs-com
-Faraday,https://jobs.gem.com/faraday
-Fathom Org,https://jobs.gem.com/fathom-org
-Felix,https://jobs.gem.com/felix
-Ferry Health,https://jobs.gem.com/ferry-health
-Fetch Ats,https://jobs.gem.com/fetch-ats
-Fifthdoor Com,https://jobs.gem.com/fifthdoor-com
-Fireflies,https://jobs.gem.com/fireflies
-Firestorm,https://jobs.gem.com/firestorm
-Flatfee Corp,https://jobs.gem.com/flatfee-corp
-Flint,https://jobs.gem.com/flint
-Floot,https://jobs.gem.com/floot
-Forgent Ai,https://jobs.gem.com/forgent-ai
-Fountainplatform Com,https://jobs.gem.com/fountainplatform-com
-Foxbox Digital,https://jobs.gem.com/foxbox-digital
-Freestone Grove Partners,https://jobs.gem.com/freestone-grove-partners
-Freshbooks,https://jobs.gem.com/freshbooks
-Fridayharbor Ai,https://jobs.gem.com/fridayharbor-ai
-Fuelfinance,https://jobs.gem.com/fuelfinance
-Fulcrumcareers,https://jobs.gem.com/fulcrumcareers
-Function Health,https://jobs.gem.com/function-health
-Galadyne,https://jobs.gem.com/galadyne
-Galaxyventures,https://jobs.gem.com/galaxyventures
-Gc Ai,https://jobs.gem.com/gc-ai
-Gem,https://jobs.gem.com/gem
-Gem Mckesson,https://jobs.gem.com/gem-mckesson
-Gem Test Board,https://jobs.gem.com/gem-test-board
-Generation Alpha Transistor,https://jobs.gem.com/generation-alpha-transistor
-Genspark,https://jobs.gem.com/genspark
-Gerra,https://jobs.gem.com/gerra
-Getaero Io,https://jobs.gem.com/getaero-io
-Getbirdeye Com Au,https://jobs.gem.com/getbirdeye-com-au
-Getro,https://jobs.gem.com/getro
-Gigaml,https://jobs.gem.com/gigaml
-Go Cadre,https://jobs.gem.com/go-cadre
-Goatrecruit Com,https://jobs.gem.com/goatrecruit-com
-Good Life Companies,https://jobs.gem.com/good-life-companies
-Goodbill,https://jobs.gem.com/goodbill
-Grailpay Com,https://jobs.gem.com/grailpay-com
-Granger Construction,https://jobs.gem.com/granger-construction
-Gratia Health,https://jobs.gem.com/gratia-health
-Greenlite Ai,https://jobs.gem.com/greenlite-ai
-Greenvalleyjobs,https://jobs.gem.com/greenvalleyjobs
-Grit,https://jobs.gem.com/grit
-Groq,https://jobs.gem.com/groq
-Growthbook,https://jobs.gem.com/growthbook
-Guardrail Ai,https://jobs.gem.com/guardrail-ai
-Guidesage Ai,https://jobs.gem.com/guidesage-ai
-Hallow,https://jobs.gem.com/hallow
-Happydance Partnership Integration,https://jobs.gem.com/happydance-partnership-integration
-Harmonic,https://jobs.gem.com/harmonic
-Hash,https://jobs.gem.com/hash
-Hayla,https://jobs.gem.com/hayla
-Heavy Construction Systems Specialists Llc,https://jobs.gem.com/heavy-construction-systems-specialists-llc
-Helix,https://jobs.gem.com/helix
-Hellotrade,https://jobs.gem.com/hellotrade
-Helm Health,https://jobs.gem.com/helm-health
-Hilabs Ie,https://jobs.gem.com/hilabs-ie
-Hipeople,https://jobs.gem.com/hipeople
-Holacasa Yc W23,https://jobs.gem.com/holacasa-yc-w23
-Homeboost,https://jobs.gem.com/homeboost
-Hospitable,https://jobs.gem.com/hospitable
-Howrecruit Io,https://jobs.gem.com/howrecruit-io
-Hubspot,https://jobs.gem.com/hubspot
-Hypernatural Ai,https://jobs.gem.com/hypernatural-ai
-Inception,https://jobs.gem.com/inception
-Index Exchange,https://jobs.gem.com/index-exchange
-Infrastructure Modernization Solutions,https://jobs.gem.com/infrastructure-modernization-solutions
-Inspiration Commerce Group,https://jobs.gem.com/inspiration-commerce-group
-Inspiresemi Com,https://jobs.gem.com/inspiresemi-com
-Instrumental Inc ,https://jobs.gem.com/instrumental-inc-
-Integral Xyz,https://jobs.gem.com/integral-xyz
-Integrationscaptain,https://jobs.gem.com/integrationscaptain
-Intelligentresourcing Co,https://jobs.gem.com/intelligentresourcing-co
-Interfere Old,https://jobs.gem.com/interfere-old
-Invoicebutler Ai,https://jobs.gem.com/invoicebutler-ai
-Iris,https://jobs.gem.com/iris
-Ironsite Ai,https://jobs.gem.com/ironsite-ai
-Itsvaleria Co,https://jobs.gem.com/itsvaleria-co
-Jaguaracareers,https://jobs.gem.com/jaguaracareers
-Janie,https://jobs.gem.com/janie
-Jayla Careers,https://jobs.gem.com/jayla-careers
-Jobma,https://jobs.gem.com/jobma
-Joinanvil Com,https://jobs.gem.com/joinanvil-com
-Joinformal,https://jobs.gem.com/joinformal
-Joyful Health,https://jobs.gem.com/joyful-health
-Kaikaku,https://jobs.gem.com/kaikaku
-Kaironhealth,https://jobs.gem.com/kaironhealth
-Kaironhealth Com,https://jobs.gem.com/kaironhealth-com
-Kanu Ai,https://jobs.gem.com/kanu-ai
-Kcs Hiring,https://jobs.gem.com/kcs-hiring
-Keru Ai,https://jobs.gem.com/keru-ai
-Key To Web3,https://jobs.gem.com/key-to-web3
-Knight Electric Inc ,https://jobs.gem.com/knight-electric-inc-
-Kollectiv Ai,https://jobs.gem.com/kollectiv-ai
-Kumo Ai,https://jobs.gem.com/kumo-ai
-Lantern,https://jobs.gem.com/lantern
-Lavapayments Com,https://jobs.gem.com/lavapayments-com
-Leap Tools,https://jobs.gem.com/leap-tools
-Letsdata,https://jobs.gem.com/letsdata
-Letter Ai,https://jobs.gem.com/letter-ai
-Level,https://jobs.gem.com/level
-Linktree,https://jobs.gem.com/linktree
-Little Otter,https://jobs.gem.com/little-otter
-Lower Llc,https://jobs.gem.com/lower-llc
-Lumalabs Ai,https://jobs.gem.com/lumalabs-ai
-Lunajoy,https://jobs.gem.com/lunajoy
-Lunch,https://jobs.gem.com/lunch
-Lunos Ai,https://jobs.gem.com/lunos-ai
-Magnetic,https://jobs.gem.com/magnetic
-Manifest,https://jobs.gem.com/manifest
-Manifested Com,https://jobs.gem.com/manifested-com
-Marble Health,https://jobs.gem.com/marble-health
-Mavi,https://jobs.gem.com/mavi
-Meetdex Ai,https://jobs.gem.com/meetdex-ai
-Megapot,https://jobs.gem.com/megapot
-Meineautosdirekt,https://jobs.gem.com/meineautosdirekt
-Menten Ai,https://jobs.gem.com/menten-ai
-Merge Sandbox,https://jobs.gem.com/merge-sandbox
-Metal Ai,https://jobs.gem.com/metal-ai
-Microsoft Demo Gem Com,https://jobs.gem.com/microsoft-demo-gem-com
-Mimicrobotics Com,https://jobs.gem.com/mimicrobotics-com
-Mission,https://jobs.gem.com/mission
-Moosehead Talent,https://jobs.gem.com/moosehead-talent
-Motion,https://jobs.gem.com/motion
-Moxa,https://jobs.gem.com/moxa
-Multiplierhq,https://jobs.gem.com/multiplierhq
-Multiscale Ai,https://jobs.gem.com/multiscale-ai
-Myprize,https://jobs.gem.com/myprize
-Myriad Technology,https://jobs.gem.com/myriad-technology
-Myrrsgroup,https://jobs.gem.com/myrrsgroup
-Nabla Bio,https://jobs.gem.com/nabla-bio
-Nacelle,https://jobs.gem.com/nacelle
-Nativemsg,https://jobs.gem.com/nativemsg
-Nclusion,https://jobs.gem.com/nclusion
-Nerve,https://jobs.gem.com/nerve
-Newcrew,https://jobs.gem.com/newcrew
-Ngram,https://jobs.gem.com/ngram
-Nimble,https://jobs.gem.com/nimble
-Niva,https://jobs.gem.com/niva
-Nominal,https://jobs.gem.com/nominal
-Northone,https://jobs.gem.com/northone
-Ntop,https://jobs.gem.com/ntop
-Nue Ai,https://jobs.gem.com/nue-ai
-Nutrislice,https://jobs.gem.com/nutrislice
-Nuvo,https://jobs.gem.com/nuvo
-Obin Ai,https://jobs.gem.com/obin-ai
-Obsidian Systems,https://jobs.gem.com/obsidian-systems
-Odo Do,https://jobs.gem.com/odo-do
-Omegahhagency Com,https://jobs.gem.com/omegahhagency-com
-Ondo Finance,https://jobs.gem.com/ondo-finance
-Onesignal,https://jobs.gem.com/onesignal
-Onesignal Ats,https://jobs.gem.com/onesignal-ats
-Onezyme,https://jobs.gem.com/onezyme
-Onfrontiers,https://jobs.gem.com/onfrontiers
-Openphone,https://jobs.gem.com/openphone
-Openreqstaffing,https://jobs.gem.com/openreqstaffing
-Opine,https://jobs.gem.com/opine
-Ora So,https://jobs.gem.com/ora-so
-Overlay,https://jobs.gem.com/overlay
-Overwatch,https://jobs.gem.com/overwatch
-Paces,https://jobs.gem.com/paces
-Pae,https://jobs.gem.com/pae
-Pagebound,https://jobs.gem.com/pagebound
-Pally,https://jobs.gem.com/pally
-Paramark,https://jobs.gem.com/paramark
-Partao,https://jobs.gem.com/partao
-Partnerhq,https://jobs.gem.com/partnerhq
-Patlytics,https://jobs.gem.com/patlytics
-Pave,https://jobs.gem.com/pave
-Perceptyx,https://jobs.gem.com/perceptyx
-Photalabs Com,https://jobs.gem.com/photalabs-com
-Photon,https://jobs.gem.com/photon
-Pinnacleconnect Llc,https://jobs.gem.com/pinnacleconnect-llc
-Piqenergy Com,https://jobs.gem.com/piqenergy-com
-Planet Fans,https://jobs.gem.com/planet-fans
-Planned,https://jobs.gem.com/planned
-Plixai,https://jobs.gem.com/plixai
-Pogo Recruiting,https://jobs.gem.com/pogo-recruiting
-Polar,https://jobs.gem.com/polar
-Polywork,https://jobs.gem.com/polywork
-Pomerium,https://jobs.gem.com/pomerium
-Portal Ai,https://jobs.gem.com/portal-ai
-Poseidonaero,https://jobs.gem.com/poseidonaero
-Prahsys Com,https://jobs.gem.com/prahsys-com
-Praxisiq Ai,https://jobs.gem.com/praxisiq-ai
-Precision Ai,https://jobs.gem.com/precision-ai
-Prodia,https://jobs.gem.com/prodia
-Productboard,https://jobs.gem.com/productboard
-Productboard Ats,https://jobs.gem.com/productboard-ats
-Prohost Ai,https://jobs.gem.com/prohost-ai
-Project Method,https://jobs.gem.com/project-method
-Promptql,https://jobs.gem.com/promptql
-Propel,https://jobs.gem.com/propel
-Prospermedical Com,https://jobs.gem.com/prospermedical-com
-Protegeai,https://jobs.gem.com/protegeai
-Questdb Com,https://jobs.gem.com/questdb-com
-Quitwithjones,https://jobs.gem.com/quitwithjones
-Quo,https://jobs.gem.com/quo
-Rain Aero,https://jobs.gem.com/rain-aero
-Raincode Bahrain W L L,https://jobs.gem.com/raincode-bahrain-w-l-l
-Raylu Ai,https://jobs.gem.com/raylu-ai
-Rctsglobal Com,https://jobs.gem.com/rctsglobal-com
-Rditrials,https://jobs.gem.com/rditrials
-Rebuild Work,https://jobs.gem.com/rebuild-work
-Redcar,https://jobs.gem.com/redcar
-Redenvelope Co,https://jobs.gem.com/redenvelope-co
-Redo,https://jobs.gem.com/redo
-Rektech,https://jobs.gem.com/rektech
-Renew,https://jobs.gem.com/renew
-Resprop,https://jobs.gem.com/resprop
-Retool,https://jobs.gem.com/retool
-Revolutionparts,https://jobs.gem.com/revolutionparts
-Rex,https://jobs.gem.com/rex
-Rf Renovo Management Company Llc,https://jobs.gem.com/rf-renovo-management-company-llc
-Riley,https://jobs.gem.com/riley
-Rinsed,https://jobs.gem.com/rinsed
-Risely Ai,https://jobs.gem.com/risely-ai
-Rivia,https://jobs.gem.com/rivia
-Roadio Ai,https://jobs.gem.com/roadio-ai
-Roamless,https://jobs.gem.com/roamless
-Roe Ai,https://jobs.gem.com/roe-ai
-Rossibuilders Com,https://jobs.gem.com/rossibuilders-com
-Roundhouse Media,https://jobs.gem.com/roundhouse-media
-Rove,https://jobs.gem.com/rove
-Runsybil,https://jobs.gem.com/runsybil
-Sadnaconsulting Com,https://jobs.gem.com/sadnaconsulting-com
-Sailorhealth Com,https://jobs.gem.com/sailorhealth-com
-Sales Marker,https://jobs.gem.com/sales-marker
-Salesqueze Com,https://jobs.gem.com/salesqueze-com
-Sandbar Inc,https://jobs.gem.com/sandbar-inc
-Sandboxschonfeld Com,https://jobs.gem.com/sandboxschonfeld-com
-Sauron Systems,https://jobs.gem.com/sauron-systems
-Scope Labs,https://jobs.gem.com/scope-labs
-Scowtt Com,https://jobs.gem.com/scowtt-com
-Seated,https://jobs.gem.com/seated
-Seed2Series Com,https://jobs.gem.com/seed2series-com
-Seniorverse,https://jobs.gem.com/seniorverse
-Sennder Gmbh,https://jobs.gem.com/sennder-gmbh
-Senndertechnologies Gmbh,https://jobs.gem.com/senndertechnologies-gmbh
-Sensorum Health,https://jobs.gem.com/sensorum-health
-Serv Ai,https://jobs.gem.com/serv-ai
-Seven Starling,https://jobs.gem.com/seven-starling
-Shef Com,https://jobs.gem.com/shef-com
-Shorebird Dev,https://jobs.gem.com/shorebird-dev
-Showtime,https://jobs.gem.com/showtime
-Signoz,https://jobs.gem.com/signoz
-Silkline,https://jobs.gem.com/silkline
-Skypilot Co,https://jobs.gem.com/skypilot-co
-Slash,https://jobs.gem.com/slash
-Sleep Center,https://jobs.gem.com/sleep-center
-Smacktechnologies Com,https://jobs.gem.com/smacktechnologies-com
-Snout,https://jobs.gem.com/snout
-Softup Technologies,https://jobs.gem.com/softup-technologies
-Sohar Health,https://jobs.gem.com/sohar-health
-Soundhound,https://jobs.gem.com/soundhound
-Spawn,https://jobs.gem.com/spawn
-Spellbrush,https://jobs.gem.com/spellbrush
-Sphere Semi,https://jobs.gem.com/sphere-semi
-Ssg,https://jobs.gem.com/ssg
-Stack Auth Com,https://jobs.gem.com/stack-auth-com
-Startup People Solutions,https://jobs.gem.com/startup-people-solutions
-Stealth Startup,https://jobs.gem.com/stealth-startup
-Stockapp Com,https://jobs.gem.com/stockapp-com
-Stryke,https://jobs.gem.com/stryke
-Sunsethq Com,https://jobs.gem.com/sunsethq-com
-Super Hi Fi,https://jobs.gem.com/super-hi-fi
-Superblocks,https://jobs.gem.com/superblocks
-Supersonik Ai,https://jobs.gem.com/supersonik-ai
-Supio,https://jobs.gem.com/supio
-Suppliercanada Com,https://jobs.gem.com/suppliercanada-com
-Switchgrowth Com,https://jobs.gem.com/switchgrowth-com
-Symbolica,https://jobs.gem.com/symbolica
-Syndesus,https://jobs.gem.com/syndesus
-System Two Security,https://jobs.gem.com/system-two-security
-Taxgpt Inc ,https://jobs.gem.com/taxgpt-inc-
-Taxo Ai,https://jobs.gem.com/taxo-ai
-Tektome Com,https://jobs.gem.com/tektome-com
-Telora,https://jobs.gem.com/telora
-Tensorstax Com,https://jobs.gem.com/tensorstax-com
-Tenx Recruiting,https://jobs.gem.com/tenx-recruiting
-Terraai Earth,https://jobs.gem.com/terraai-earth
-Test Board,https://jobs.gem.com/test-board
-The Boring Company,https://jobs.gem.com/the-boring-company
-The Brewer Garrett Company,https://jobs.gem.com/the-brewer-garrett-company
-The Talent Project Com,https://jobs.gem.com/the-talent-project-com
-Theburntapp Com,https://jobs.gem.com/theburntapp-com
-Theinterface,https://jobs.gem.com/theinterface
-Thejobbridge,https://jobs.gem.com/thejobbridge
-Thelma,https://jobs.gem.com/thelma
-Theluckyfoundation,https://jobs.gem.com/theluckyfoundation
-Thenewclub Fyi,https://jobs.gem.com/thenewclub-fyi
-Theseus Us,https://jobs.gem.com/theseus-us
-Thinkific,https://jobs.gem.com/thinkific
-Third Dimension,https://jobs.gem.com/third-dimension
-Thrivory,https://jobs.gem.com/thrivory
-Thunder,https://jobs.gem.com/thunder
-Thunder Compute,https://jobs.gem.com/thunder-compute
-Timetoperform,https://jobs.gem.com/timetoperform
-Token Transit,https://jobs.gem.com/token-transit
-Toolhouse Ai,https://jobs.gem.com/toolhouse-ai
-Torchsystems Com,https://jobs.gem.com/torchsystems-com
-Transluce,https://jobs.gem.com/transluce
-Trashlab,https://jobs.gem.com/trashlab
-Tricentis,https://jobs.gem.com/tricentis
-Trilliumhiring Com,https://jobs.gem.com/trilliumhiring-com
-Tripworks Com,https://jobs.gem.com/tripworks-com
-Tristero,https://jobs.gem.com/tristero
-Trojan Trading,https://jobs.gem.com/trojan-trading
-Tropic,https://jobs.gem.com/tropic
-Trybree Com,https://jobs.gem.com/trybree-com
-Tryhelium Com,https://jobs.gem.com/tryhelium-com
-Tungsten Dev,https://jobs.gem.com/tungsten-dev
-Turbohome,https://jobs.gem.com/turbohome
-Twentyfour7 Dev,https://jobs.gem.com/twentyfour7-dev
-Unify Ai,https://jobs.gem.com/unify-ai
-Untolabs Com,https://jobs.gem.com/untolabs-com
-Up Labs,https://jobs.gem.com/up-labs
-Useful,https://jobs.gem.com/useful
-Usemalleable Com,https://jobs.gem.com/usemalleable-com
-Vamo Xyz,https://jobs.gem.com/vamo-xyz
-Vanguard Cleaning Systems,https://jobs.gem.com/vanguard-cleaning-systems
-Vantaca,https://jobs.gem.com/vantaca
-Vantager,https://jobs.gem.com/vantager
-Vantara Ai,https://jobs.gem.com/vantara-ai
-Vectorworks,https://jobs.gem.com/vectorworks
-Vectrasim,https://jobs.gem.com/vectrasim
-Veho Technologies,https://jobs.gem.com/veho-technologies
-Ventionteams Com,https://jobs.gem.com/ventionteams-com
-Venture Guides,https://jobs.gem.com/venture-guides
-Vercel Ats Sandbox,https://jobs.gem.com/vercel-ats-sandbox
-Vesseltalent Com,https://jobs.gem.com/vesseltalent-com
-Voker Ai,https://jobs.gem.com/voker-ai
-Voltai Com,https://jobs.gem.com/voltai-com
-Wayback Labs,https://jobs.gem.com/wayback-labs
-Webflow Ats Sandbox,https://jobs.gem.com/webflow-ats-sandbox
-Western Governors University,https://jobs.gem.com/western-governors-university
-Whatconverts,https://jobs.gem.com/whatconverts
-Wiseroad Recruiting Inc,https://jobs.gem.com/wiseroad-recruiting-inc
-Wizecamel,https://jobs.gem.com/wizecamel
-Wolfjaw Careers,https://jobs.gem.com/wolfjaw-careers
-Wonolo,https://jobs.gem.com/wonolo
-Woodsideai,https://jobs.gem.com/woodsideai
-Youtrip,https://jobs.gem.com/youtrip
-Zefi Ai,https://jobs.gem.com/zefi-ai
-Zep,https://jobs.gem.com/zep
-Zorrorx,https://jobs.gem.com/zorrorx
@@ -1,6 +0,0 @@
-url,timestamp
-https://job-boards.eu.greenhouse.io/bcbgroup/jobs/4681083101?gh_src=cryptocurrencyjobs.co,2025-12-31T08:35:23.424931
-https://job-boards.greenhouse.io/securitize/jobs/4074121009?gh_src=cryptocurrencyjobs.co,2025-12-31T09:19:17.349713
-https://job-boards.eu.greenhouse.io/bcbgroup/jobs/4681102101?gh_src=cryptocurrencyjobs.co,2025-12-31T09:58:36.919216
-https://job-boards.greenhouse.io/kiosk/jobs/4427184005?gh_src=cryptocurrencyjobs.co,2025-12-31T10:10:51.176114
-https://job-boards.eu.greenhouse.io/bcbgroup/jobs/4681083101?gh_src=cryptocurrencyjobs.co,2025-12-31T11:02:31.869728
File diff suppressed because it is too large.
@@ -1,7 +0,0 @@
-url,timestamp
-https://jobs.eu.lever.co/kaiko/3f7f3db9-4a6a-4047-8760-bc52c3d03e05?lever-origin=applied&lever-source%5B%5D=cryptocurrencyjobs.co,2025-12-31T09:20:28.542417
-https://jobs.lever.co/waterfall/880fb1b4-2515-4534-9970-53c497c82f12?lever-origin=applied&lever-source%5B%5D=cryptocurrencyjobs.co,2025-12-31T10:08:17.316072
-https://jobs.lever.co/obol-tech/fcccd493-54e4-425a-b9bd-82fa6f7e6aff?lever-origin=applied&lever-source%5B%5D=cryptocurrencyjobs.co,2025-12-31T10:35:29.164452
-https://jobs.eu.lever.co/coinspaid/7605e154-4b1d-45ee-b1d4-35edea13d80b?lever-origin=applied&lever-source%5B%5D=cryptocurrencyjobs.co,2025-12-31T10:51:38.852693
-https://jobs.lever.co/vedatechlabs/9c59c96c-2bb0-47b0-88fe-5d5a9fd85997?lever-origin=applied&lever-source%5B%5D=cryptocurrencyjobs.co,2025-12-31T11:02:16.120852
-https://jobs.eu.lever.co/kaiko/3f7f3db9-4a6a-4047-8760-bc52c3d03e05?lever-origin=applied&lever-source%5B%5D=cryptocurrencyjobs.co,2025-12-31T11:16:43.218273
levercompanies.csv (1792 lines changed)
File diff suppressed because it is too large.
@@ -1,8 +0,0 @@
-url,timestamp
-https://www.linkedin.com/jobs/view/operations-analyst-at-amber-group-4325538653/?ref=cryptocurrencyjobs.co,2025-12-31T09:20:11.544002
-https://www.linkedin.com/jobs/view/hr-operations-intern-sg-at-matrixport-official-4338171692/?ref=cryptocurrencyjobs.co,2025-12-31T09:25:10.499933
-https://www.linkedin.com/jobs/view/operations-analyst-at-matrixport-official-4235087267/?ref=cryptocurrencyjobs.co,2025-12-31T09:33:53.104120
-https://www.linkedin.com/jobs/view/business-operations-analyst-at-matrixport-official-4215538150/?ref=cryptocurrencyjobs.co,2025-12-31T09:34:24.186519
-https://www.linkedin.com/jobs/view/graduate-hiring-business-operations-analyst-wealth-management-at-matrixport-official-4131687672/?ref=cryptocurrencyjobs.co,2025-12-31T09:36:47.038648
-https://www.linkedin.com/jobs/view/customer-support-specialist-at-matrixport-official-4323103235/?ref=cryptocurrencyjobs.co,2025-12-31T10:39:57.272414
-https://www.linkedin.com/jobs/view/finance-intern-at-amber-group-4248725225/?ref=cryptocurrencyjobs.co,2025-12-31T11:31:03.349275
llm_agent.py (218 lines changed)
@@ -21,12 +21,13 @@ class LLMJobRefiner:
             raise ValueError("DEEPSEEK_API_KEY not found in .env file.")

         # Database credentials from .env
+        self.db_url = os.getenv("DB_URL")
         self.db_username = os.getenv("DB_USERNAME")
         self.db_password = os.getenv("DB_PASSWORD")
         self.db_host = os.getenv("DB_HOST")
         self.db_port = os.getenv("DB_PORT")

-        if not self.db_username or not self.db_password:
+        if not self.db_url or not self.db_username or not self.db_password:
             raise ValueError("Database credentials not found in .env file.")

         # DeepSeek uses OpenAI-compatible API
@@ -40,12 +41,22 @@ class LLMJobRefiner:
     def _init_db(self):
         """Initialize PostgreSQL database connection and create table"""
         try:
-            conn = psycopg2.connect(
-                host=self.db_host,
-                port=self.db_port,
-                database="postgres",
-                user=self.db_username,
-                password=self.db_password
-            )
+            self.db_url = os.getenv("DB_URL")
+            if self.db_url and "supabase.com" in self.db_url:
+                conn = psycopg2.connect(
+                    host=self.db_host,
+                    port=self.db_port,
+                    database="postgres",
+                    user=self.db_username,
+                    password=self.db_password
+                )
+            else:
+                conn = psycopg2.connect(
+                    host=self.db_host,
+                    port=self.db_port,
+                    database="postgres",
+                    user=self.db_username,
+                    password=self.db_password
+                )
             cursor = conn.cursor()

@@ -102,8 +113,8 @@ class LLMJobRefiner:
             text = re.sub(r'\s+', ' ', text)

             # Limit length for LLM context
-            if len(text) > 100000:
-                text = text[:100000] + "..."
+            if len(text) > 10000:
+                text = text[:10000] + "..."

             return text
         except Exception as e:
@@ -117,7 +128,7 @@ class LLMJobRefiner:
             response = self.client.chat.completions.create(
                 model=self.model,
                 messages=[{"role": "user", "content": prompt}],
-                temperature=0.1,
+                temperature=0.2,
                 max_tokens=2048,
                 stream=False
             )
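The client here is an OpenAI-compatible wrapper, per the "# DeepSeek uses OpenAI-compatible API" comment earlier in this file. A minimal sketch of how such a client is typically constructed; the base URL and model name below are assumptions, since the diff only shows self.client and self.model:

    from openai import OpenAI

    # Assumed endpoint and model for DeepSeek; not taken from this diff.
    client = OpenAI(api_key="sk-...", base_url="https://api.deepseek.com")
    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[{"role": "user", "content": "Extract fields from this job posting..."}],
        temperature=0.2,  # the new value from the hunk above
        max_tokens=2048,
        stream=False,
    )
    print(response.choices[0].message.content)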
@@ -134,52 +145,38 @@ class LLMJobRefiner:
         posted_date = raw_data.get('posted_date', datetime.now().strftime("%m/%d/%y"))

         prompt = f"""
-You are an expert job posting data extractor. Your task is to extract AND infer fields from the provided job posting.
-
-### CORE RULES:
-1. **NEVER invent, summarize, or paraphrase** — extract **exact wording** when available.
-2. **For critical fields (title, company_name, job_id, url, description):**
-   - These MUST be present and meaningful.
-   - If not explicitly stated, **infer from context** (e.g., page title, headings, "About Us", etc.).
-   - **NEVER return "Not provided" or "N/A" for these fields.**
-3. **For optional fields (location, salary_range, etc.):**
-   - Extract exact text if present.
-   - If absent but inferable (e.g., "Remote (US)", "Full-time"), **infer it**.
-   - Only return "Not provided" if truly absent and non-inferable.
-
-### FIELD DEFINITIONS:
-- **title**: The job title. Look in <h1>, page title, or bold headings.
-- **company_name**: Company name. Look in logo alt text, footer, "About [X]", or page title.
-- **description**: Main job overview, responsibilities, or duties. Combine relevant paragraphs if needed. **Must not be empty.**
-- **requirements**: Required skills, experience, or qualifications.
-- **qualifications**: Educational or certification requirements.
-- **location**: Office location or remote policy.
-- **salary_range**: Exact compensation info.
-- **nature_of_work**: Employment type (Full-time, Contract, Internship, etc.).
-
-### OUTPUT FORMAT:
-Return ONLY a valid JSON object with these keys:
-{{
-    "title": "...",
-    "company_name": "...",
-    "location": "...",
-    "description": "...",
-    "requirements": "...",
-    "qualifications": "...",
-    "salary_range": "...",
-    "nature_of_work": "...",
-    "job_id": "{job_id}",
-    "url": "{url}"
-}}
-
-- **Critical fields must NEVER be "Not provided", "N/A", empty, or generic** (e.g., "Company", "Job Title").
-- **Optional fields may be "Not provided" ONLY if truly absent.**
-- **Do not include markdown, explanations, or extra text.**
-- **Use double quotes for JSON.**
-
-Page Content:
-{cleaned_content}
-"""
+You are a job posting data extractor.
+
+EXTRACT EXACT TEXT - DO NOT SUMMARIZE, PARAPHRASE, OR INVENT.
+
+For these critical fields, follow these rules:
+- description: Extract ALL job description text. If ANY job details exist (duties, responsibilities, overview), include them. Only use "Not provided" if absolutely no description exists.
+- requirements: Extract ALL requirements text. If ANY requirements exist (skills, experience, education needed), include them. Only use "Not provided" if none exist.
+- qualifications: Extract ALL qualifications text. If ANY qualifications exist, include them. Only use "Not provided" if none exist.
+
+REQUIRED FIELDS (must have valid values, never "N/A"):
+- title, company_name, job_id, url
+
+OPTIONAL FIELDS (can be "Not provided"):
+- location, salary_range, nature_of_work
+
+Page Content:
+{cleaned_content}
+
+Response format (ONLY return this JSON):
+{{
+    "title": "...",
+    "company_name": "...",
+    "location": "...",
+    "description": "...",
+    "requirements": "...",
+    "qualifications": "...",
+    "salary_range": "...",
+    "nature_of_work": "...",
+    "job_id": "{job_id}",
+    "url": "{url}"
+}}
+"""

         try:
             response_text = await asyncio.get_event_loop().run_in_executor(
@@ -191,23 +188,31 @@ Page Content:
             if not refined_data:
                 return None

-            # Validate critical fields — reject if missing or placeholder
-            critical_fields = ['title', 'company_name', 'job_id', 'url', 'description']
-            for field in critical_fields:
-                value = refined_data.get(field, "").strip()
-                if not value or value.lower() in ["n/a", "not provided", "unknown", "company", "job", "title", ""]:
-                    print(f" ❌ Critical field '{field}' is invalid: '{value}'")
-                    return None  # This job will NOT be saved — as per requirement
-
-            # Optional fields: allow "Not provided", but ensure they're strings
-            optional_fields = ['location', 'requirements', 'qualifications', 'salary_range', 'nature_of_work']
-            for field in optional_fields:
-                if field not in refined_data:
-                    refined_data[field] = "Not provided"
-                elif not isinstance(refined_data[field], str):
-                    refined_data[field] = str(refined_data[field])
-
+            # Validate required fields
+            required_fields = ['title', 'company_name', 'job_id', 'url']
+            for field in required_fields:
+                if not refined_data.get(field) or refined_data[field].strip() in ["N/A", "", "Unknown", "Company", "Job"]:
+                    return None
+
+            # CRITICAL: Validate content fields - check if they SHOULD exist
+            content_fields = ['description', 'requirements', 'qualifications']
+            cleaned_original = cleaned_content.lower()
+
+            # Simple heuristic: if page contains job-related keywords, content fields should NOT be "Not provided"
+            job_indicators = ['responsibilit', 'duties', 'require', 'qualifi', 'skill', 'experienc', 'educat', 'degree', 'bachelor', 'master']
+            has_job_content = any(indicator in cleaned_original for indicator in job_indicators)
+
+            if has_job_content:
+                for field in content_fields:
+                    value = refined_data.get(field, "").strip()
+                    if value in ["Not provided", "N/A", ""]:
+                        # LLM failed to extract existing content
+                        print(f" ⚠️ LLM returned '{value}' for {field} but job content appears present")
+                        return None
+
+            # Add the posted_date to the refined data
             refined_data['posted_date'] = posted_date

             return refined_data

         except Exception as e:
@@ -215,22 +220,15 @@ Page Content:
             return None

     def _parse_llm_response(self, response_text: str) -> Dict[str, Any]:
-        # Try to extract JSON from markdown code block
         json_match = re.search(r'```(?:json)?\s*({.*?})\s*```', response_text, re.DOTALL)
         if not json_match:
-            # Try to find raw JSON object
-            json_match = re.search(r'\{[^{}]*\{[^{}]*\}[^{}]*\}|\{.*\}', response_text, re.DOTALL)
+            json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
         if not json_match:
             return None

         try:
-            json_str = json_match.group(1) if '```' in response_text else json_match.group(0)
-            # Clean common issues
-            json_str = re.sub(r'\s+', ' ', json_str)
-            json_str = re.sub(r',\s*([\]}\)])', r'\1', json_str)  # Remove trailing commas
-            return json.loads(json_str)
-        except json.JSONDecodeError as e:
-            print(f"JSON parsing error: {e}")
+            return json.loads(json_match.group(1) if '```' in response_text else json_match.group(0))
+        except json.JSONDecodeError:
             return None

     async def save_job_data(self, job_data: Dict[str, Any], keyword: str):
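The dropped cleanup step matters for almost-valid LLM output. A small self-contained sketch using the extraction regexes from the hunk above; the sample response is illustrative:

    import json
    import re

    response_text = '```json\n{"title": "Solidity Developer", "url": "x",}\n```'
    m = re.search(r'```(?:json)?\s*({.*?})\s*```', response_text, re.DOTALL)
    json_str = m.group(1)
    # The removed branch stripped trailing commas before parsing:
    json_str = re.sub(r',\s*([\]}\)])', r'\1', json_str)
    print(json.loads(json_str))  # parses; plain json.loads on m.group(1) would raise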
@@ -241,11 +239,11 @@ Page Content:
         """Save job data to PostgreSQL database with job_id uniqueness"""
         try:
             conn = psycopg2.connect(
                 host=self.db_host,
                 port=self.db_port,
                 database="postgres",
                 user=self.db_username,
                 password=self.db_password
             )
             cursor = conn.cursor()

@ -256,50 +254,50 @@ Page Content:
|
|||||||
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
|
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
|
||||||
ON CONFLICT (job_id) DO NOTHING
|
ON CONFLICT (job_id) DO NOTHING
|
||||||
''', (
|
''', (
|
||||||
job_data.get("title", "Not provided"),
|
job_data.get("title", "N/A"),
|
||||||
job_data.get("company_name", "Not provided"),
|
job_data.get("company_name", "N/A"),
|
||||||
job_data.get("location", "Not provided"),
|
job_data.get("location", "N/A"),
|
||||||
job_data.get("description", "Not provided"),
|
job_data.get("description", "N/A"),
|
||||||
job_data.get("requirements", "Not provided"),
|
job_data.get("requirements", "N/A"),
|
||||||
job_data.get("qualifications", "Not provided"),
|
job_data.get("qualifications", "N/A"),
|
||||||
job_data.get("salary_range", "Not provided"),
|
job_data.get("salary_range", "N/A"),
|
||||||
job_data.get("nature_of_work", "Not provided"),
|
job_data.get("nature_of_work", "N/A"),
|
||||||
job_data.get("job_id", "unknown"),
|
job_data.get("job_id", "N/A"),
|
||||||
job_data.get("url", "N/A"),
|
job_data.get("url", "N/A"),
|
||||||
job_data.get("category", "all"),
|
job_data.get("category", "N/A"),
|
||||||
job_data.get("scraped_at"),
|
job_data.get("scraped_at"),
|
||||||
job_data.get("posted_date", datetime.now().strftime("%m/%d/%y"))
|
job_data.get("posted_date", "N/A")
|
||||||
))
|
))
|
||||||
|
|
||||||
conn.commit()
|
conn.commit()
|
||||||
cursor.close()
|
cursor.close()
|
||||||
conn.close()
|
conn.close()
|
||||||
|
|
||||||
print(f" 💾 Saved job to category '{job_data.get('category', 'all')}' with job_id: {job_data.get('job_id', 'unknown')}")
|
print(f" 💾 Saved job to category '{job_data.get('category', 'N/A')}' with job_id: {job_data.get('job_id', 'N/A')}")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"❌ Database save error: {e}")
|
print(f"❌ Database save error: {e}")
|
||||||
|
|
||||||
async def _save_to_markdown(self, job_data: Dict[str, Any], keyword: str):
|
async def _save_to_markdown(self, job_data: Dict[str, Any], keyword: str):
|
||||||
os.makedirs("crypto_jobs", exist_ok=True)
|
os.makedirs("linkedin_jobs", exist_ok=True)
|
||||||
filepath = os.path.join("crypto_jobs", "crypto_jobs_scraped.md")
|
filepath = os.path.join("linkedin_jobs", "linkedin_jobs_scraped.md")
|
||||||
write_header = not os.path.exists(filepath) or os.path.getsize(filepath) == 0
|
write_header = not os.path.exists(filepath) or os.path.getsize(filepath) == 0
|
||||||
|
|
||||||
with open(filepath, "a", encoding="utf-8") as f:
|
with open(filepath, "a", encoding="utf-8") as f:
|
||||||
if write_header:
|
if write_header:
|
||||||
f.write(f"# Crypto Jobs - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
|
f.write(f"# LinkedIn Jobs - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
|
||||||
f.write(f"## Job: {job_data.get('title', 'Not provided')}\n\n")
|
f.write(f"## Job: {job_data.get('title', 'N/A')}\n\n")
|
||||||
f.write(f"- **Keyword**: {keyword}\n")
|
f.write(f"- **Keyword**: {keyword}\n")
|
||||||
f.write(f"- **Company**: {job_data.get('company_name', 'Not provided')}\n")
|
f.write(f"- **Company**: {job_data.get('company_name', 'N/A')}\n")
|
||||||
f.write(f"- **Location**: {job_data.get('location', 'Not provided')}\n")
|
f.write(f"- **Location**: {job_data.get('location', 'N/A')}\n")
|
||||||
f.write(f"- **Nature of Work**: {job_data.get('nature_of_work', 'Not provided')}\n")
|
f.write(f"- **Nature of Work**: {job_data.get('nature_of_work', 'N/A')}\n")
|
||||||
f.write(f"- **Salary Range**: {job_data.get('salary_range', 'Not provided')}\n")
|
f.write(f"- **Salary Range**: {job_data.get('salary_range', 'N/A')}\n")
|
||||||
f.write(f"- **Job ID**: {job_data.get('job_id', 'unknown')}\n")
|
f.write(f"- **Job ID**: {job_data.get('job_id', 'N/A')}\n")
|
||||||
f.write(f"- **Posted Date**: {job_data.get('posted_date', 'N/A')}\n")
|
f.write(f"- **Posted Date**: {job_data.get('posted_date', 'N/A')}\n")
|
||||||
f.write(f"- **Category**: {job_data.get('category', 'all')}\n")
|
f.write(f"- **Category**: {job_data.get('category', 'N/A')}\n")
|
||||||
f.write(f"- **Scraped At**: {job_data.get('scraped_at', 'N/A')}\n")
|
f.write(f"- **Scraped At**: {job_data.get('scraped_at', 'N/A')}\n")
|
||||||
f.write(f"- **URL**: <{job_data.get('url', 'N/A')}>\n\n")
|
f.write(f"- **URL**: <{job_data.get('url', 'N/A')}>\n\n")
|
||||||
f.write(f"### Description\n\n{job_data.get('description', 'Not provided')}\n\n")
|
f.write(f"### Description\n\n{job_data.get('description', 'N/A')}\n\n")
|
||||||
f.write(f"### Requirements\n\n{job_data.get('requirements', 'Not provided')}\n\n")
|
f.write(f"### Requirements\n\n{job_data.get('requirements', 'N/A')}\n\n")
|
||||||
f.write(f"### Qualifications\n\n{job_data.get('qualifications', 'Not provided')}\n\n")
|
f.write(f"### Qualifications\n\n{job_data.get('qualifications', 'N/A')}\n\n")
|
||||||
f.write("---\n\n")
|
f.write("---\n\n")
|
||||||
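Aside: the markdown writer above relies on an append-once-header pattern: the document title is written only when the file is new or empty, and every later call appends another job section. A minimal standalone sketch of that pattern (file name and fields here are illustrative, not the project's exact code):

import os
from datetime import datetime

def append_job_markdown(job: dict, path: str = "jobs/example_scraped.md") -> None:
    # Header goes in only once: when the file does not exist yet or is empty.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    write_header = not os.path.exists(path) or os.path.getsize(path) == 0
    with open(path, "a", encoding="utf-8") as f:
        if write_header:
            f.write(f"# Jobs - {datetime.now():%Y-%m-%d %H:%M:%S}\n\n")
        f.write(f"## Job: {job.get('title', 'N/A')}\n\n")
        f.write(f"- **Company**: {job.get('company_name', 'N/A')}\n\n")
        f.write("---\n\n")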
20
main.py
@@ -1,4 +1,3 @@
-
 from scraping_engine import FingerprintScrapingEngine
 from scraper import CryptoJobScraper  # Updated class name
 import os
@@ -21,15 +20,16 @@ async def main():
     scraper = CryptoJobScraper(engine, human_speed=1.3, user_request="Extract title, company, location, description, requirements, qualifications, nature of work, and salary")

     job_titles = [
-        "Customer Support",
-        "Design",
-        "Engineering",
-        "Finance",
-        "Marketing",
-        "Operations",
-        "Product",
-        "Sales"
+        "Blockchain Engineer",
+        "Smart Contract Developer",
+        "DeFi Analyst",
+        "Web3 Developer",
+        "Crypto Researcher",
+        "Solidity Developer",
+        "Protocol Engineer",
+        "Tokenomics Specialist",
+        "Zero-Knowledge Proof Engineer",
+        "Crypto Compliance Officer"
     ]

     while True:
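The new job_titles feed the search URL builder that appears later in this diff, which replaces spaces with %20 and passes the keywords via the ?q= parameter. A standalone sketch of that step; urllib.parse.quote is used here as a slightly safer stand-in for the bare str.replace in the diff:

from urllib.parse import quote

job_titles = ["Blockchain Engineer", "Smart Contract Developer", "DeFi Analyst"]

for title in job_titles:
    # quote() percent-encodes spaces (and other reserved characters) for the query string.
    search_url = f"https://cryptocurrencyjobs.co/?q={quote(title)}"
    print(search_url)  # e.g. https://cryptocurrencyjobs.co/?q=Blockchain%20Engineer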
@@ -1 +0,0 @@
-url,timestamp

File diff suppressed because it is too large
440
scraper.py
@@ -6,13 +6,11 @@ from playwright.async_api import async_playwright, TimeoutError as PlaywrightTim
 from browserforge.injectors.playwright import AsyncNewContext
 from llm_agent import LLMJobRefiner
 import re
+from fetcher import StealthyFetcher
 from datetime import datetime
 import json
 import redis
-from urllib.parse import urlparse
-import hashlib
-import csv
-import os

 class CryptoJobScraper:
     def __init__(
@@ -29,29 +27,6 @@ class CryptoJobScraper:
         self.llm_agent = LLMJobRefiner()
         self.redis_client = redis.Redis(host='localhost', port=6379, db=0, decode_responses=True)

-        self.FORBIDDEN_ATS_DOMAINS = [
-            'ashby', 'ashbyhq',
-            'greenhouse', 'boards.greenhouse.io',
-            'gem', 'gem.com',
-            'rippling',
-            'myworkday', 'myworkdayjobs',
-            'smartrecruiters',
-            'workable',
-            'lever', 'jobs.lever.co',
-            'linkedin.com'  # ✅ Added LinkedIn
-        ]
-
-        self.INVALID_CONTENT_PHRASES = [
-            "invalid job url",
-            "cookie consent",
-            "privacy policy",
-            "not a valid job",
-            "job not found",
-            "page not found",
-            "The requested job post could not be found. It may have been removed.",
-            "this page does not contain a job description"
-        ]

     async def _human_click(self, page, element, wait_after: bool = True):
         if not element:
             return False
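Both removed lists drove simple substring checks (shown later in this diff as _is_forbidden_ats_url and _is_invalid_job_page). A standalone sketch of that check; note the trade-off the old list carried: short tokens like 'gem' or 'ashby' match any URL that merely contains those letters, not just the actual ATS domains:

FORBIDDEN_ATS_DOMAINS = ['ashbyhq', 'boards.greenhouse.io', 'jobs.lever.co', 'linkedin.com']

def is_forbidden_ats_url(url: str) -> bool:
    # Case-insensitive substring match against the block list.
    url_lower = url.lower()
    return any(domain in url_lower for domain in FORBIDDEN_ATS_DOMAINS)

print(is_forbidden_ats_url("https://jobs.ashbyhq.com/acme/123"))  # True
print(is_forbidden_ats_url("https://example.com/careers/123"))    # False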
@@ -80,127 +55,60 @@ class CryptoJobScraper:
         matches = sum(1 for kw in keyword_list if kw in title_lower)
         return matches / len(keyword_list) if keyword_list else 0.0

-    async def _extract_job_title_from_card(self, card) -> str:
-        try:
-            title_selectors = [
-                'h3', 'h2', 'h4',
-                'strong', 'span'
-            ]
-            for selector in title_selectors:
-                title_element = await card.query_selector(selector)
-                if title_element:
-                    title_text = await title_element.inner_text()
-                    if title_text and len(title_text.strip()) > 3:
-                        return title_text.strip()
-
-            card_text = await card.inner_text()
-            lines = [line.strip() for line in card_text.split('\n') if line.strip()]
-            if lines:
-                for line in lines:
-                    if len(line) > 5 and not any(skip in line.lower() for skip in ['today', 'featured', 'yesterday', 'company', 'location']):
-                        return line
-            return "Unknown Title"
-        except:
-            return "Unknown Title"
-
-    async def _collect_job_elements_from_page(self, page, search_keywords: str, seen_slugs):
-        job_cards = []
-        job_found = False
-
-        await asyncio.sleep(3 * self.human_speed)
-
-        try:
-            await page.wait_for_selector('a[href^="/"][href*="-"]', timeout=60000)
-            candidates = await page.query_selector_all('a[href^="/"][href*="-"]')
-
-            for link in candidates:
-                href = await link.get_attribute("href") or ""
-                href = href.rstrip('/')
-                if not href or len(href.split('/')) != 3:
-                    continue
-                if '-' not in href.split('/')[-1]:
-                    continue
-                slug = href.split('/')[-1]
-                if len(slug) < 8 or slug.startswith(('login', 'signup', 'about', 'terms')):
-                    continue
-
-                full_url = "https://cryptocurrencyjobs.co" + href if not href.startswith('http') else href
-                if slug in seen_slugs:
-                    continue
-
-                title = await self._extract_job_title_from_card(link)
-                if not title or title == "Unknown Title":
-                    title = slug.replace('-', ' ').title()
-
-                match_percentage = self._calculate_keyword_match(title, search_keywords)
-                if match_percentage >= 0.4 or not search_keywords.strip():
-                    seen_slugs.add(slug)
-                    job_cards.append((full_url, title, link))
-                    job_found = True
-
-            print(f" ✅ Collected {len(job_cards)} valid job links after filtering ({len(candidates)} raw candidates).")
-
-        except Exception as e:
-            print(f" ⚠️ Error collecting job cards: {e}")
-
-        if not job_found:
-            print(" ❌ No valid job listings passed filters.")
-
-        return job_cards
-
-    async def _handle_pagination_and_collect_all(self, page, search_keywords: str, seen_slugs):
-        all_job_elements = []
-        scroll_attempt = 0
-        max_scrolls = 40
-        prev_count = 0
-
-        while scroll_attempt < max_scrolls:
-            print(f" Scroll attempt {scroll_attempt + 1} | Current total jobs: {len(all_job_elements)}")
-
-            page_elements = await self._collect_job_elements_from_page(page, search_keywords, seen_slugs)
-            all_job_elements.extend(page_elements)
-
-            current_count = len(all_job_elements)
-
-            if current_count == prev_count and scroll_attempt > 3:
-                print(" 🔚 No new jobs after several scrolls → assuming end of list.")
-                break
-
-            prev_count = current_count
-
-            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
-            await asyncio.sleep(random.uniform(2.5, 5.5) * self.human_speed)
-
-            try:
-                load_more = await page.query_selector(
-                    'button:has-text("Load more"), button:has-text("More"), div[role="button"]:has-text("Load"), a:has-text("Load more")'
-                )
-                if load_more:
-                    print(" Found 'Load more' button → clicking...")
-                    await self._human_click(page, load_more)
-                    await asyncio.sleep(random.uniform(3.0, 6.0) * self.human_speed)
-            except:
-                pass
-
-            scroll_attempt += 1
-
-        print(f" Finished scrolling → collected {len(all_job_elements)} unique job links.")
-        return all_job_elements
-
-    async def _extract_job_posted_date_from_card(self, card) -> str:
-        try:
-            card_text = await card.inner_text()
-            if "Today" in card_text:
-                return datetime.now().strftime("%m/%d/%y")
-            elif "Yesterday" in card_text:
-                from datetime import timedelta
-                return (datetime.now() - timedelta(days=1)).strftime("%m/%d/%y")
-            else:
-                match = re.search(r'(\d+)d', card_text)
-                if match:
-                    days = int(match.group(1))
-                    from datetime import timedelta
-                    return (datetime.now() - timedelta(days=days)).strftime("%m/%d/%y")
+    async def _scrape_jobs_from_current_page(self, page, search_keywords: str, seen_job_ids, all_job_links):
+        current_links = await page.query_selector_all("a[href*='/job/']")
+        new_jobs = 0
+
+        for link in current_links:
+            href = await link.get_attribute("href")
+            if not href or not href.startswith("http"):
+                href = "https://cryptocurrencyjobs.co" + href
+            job_id = href.split("/")[-1] if href.endswith("/") else href.split("/")[-1]
+
+            if job_id and job_id not in seen_job_ids:
+                title_element = await link.query_selector("h3, .job-title")
+                title = (await title_element.inner_text()) if title_element else "Unknown Title"
+
+                match_percentage = self._calculate_keyword_match(title, search_keywords)
+                if match_percentage >= 0.5:  # Lower threshold than LinkedIn
+                    seen_job_ids.add(job_id)
+                    all_job_links.append((href, title))
+                    new_jobs += 1
+                else:
+                    print(f" ⚠️ Skipping job due to low keyword match: {title[:50]}... (match: {match_percentage:.2%})")
+        return new_jobs
+
+    async def _handle_pagination(self, page, search_keywords: str, seen_job_ids, all_job_links):
+        current_page = 1
+        while True:
+            print(f"📄 Processing page {current_page}")
+            new_jobs = await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links)
+            print(f" ➕ Found {new_jobs} new job(s) (total: {len(all_job_links)})")
+
+            next_btn = await page.query_selector('a[rel="next"]')
+            if next_btn:
+                next_url = await next_btn.get_attribute("href")
+                if next_url and not next_url.startswith("http"):
+                    next_url = "https://cryptocurrencyjobs.co" + next_url
+                await page.goto(next_url, timeout=120000)
+                await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)
+                current_page += 1
+            else:
+                print("🔚 No 'Next' page — stopping pagination.")
+                break
+
+    async def _extract_job_posted_date(self, page) -> str:
+        try:
+            date_element = await page.query_selector(".job-posted-date, .job-date, time")
+            if date_element:
+                date_text = await date_element.inner_text()
+                if "Today" in date_text:
+                    return datetime.now().strftime("%m/%d/%y")
+                elif "Yesterday" in date_text:
+                    yesterday = datetime.now().replace(day=datetime.now().day - 1)
+                    return yesterday.strftime("%m/%d/%y")
+                else:
+                    return datetime.now().strftime("%m/%d/%y")
         except:
             pass
         return datetime.now().strftime("%m/%d/%y")
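Both date extractors above map "Today"/"Yesterday"/"Nd" text to a %m/%d/%y string. A standalone sketch of that mapping using timedelta throughout; worth noting that the new branch's datetime.now().replace(day=datetime.now().day - 1) raises ValueError on the first day of a month, which the timedelta form avoids:

import re
from datetime import datetime, timedelta

def parse_posted_date(text: str) -> str:
    # Fall back to today's date when no relative marker is recognized.
    now = datetime.now()
    if "Today" in text:
        return now.strftime("%m/%d/%y")
    if "Yesterday" in text:
        return (now - timedelta(days=1)).strftime("%m/%d/%y")
    match = re.search(r"(\d+)d", text)  # e.g. "3d" means three days ago
    if match:
        return (now - timedelta(days=int(match.group(1)))).strftime("%m/%d/%y")
    return now.strftime("%m/%d/%y")

print(parse_posted_date("Posted 3d ago"))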
@@ -218,103 +126,15 @@ class CryptoJobScraper:
         except Exception as e:
             print(f" ❌ Failed to add job to Redis cache: {str(e)}")

-    async def _is_forbidden_ats_url(self, url: str) -> bool:
-        url_lower = url.lower()
-        return any(domain in url_lower for domain in self.FORBIDDEN_ATS_DOMAINS)
-
-    def _get_ats_platform_name(self, url: str) -> str:
-        """Return canonical ATS name based on URL (e.g., 'ashby', 'greenhouse')"""
-        url_lower = url.lower()
-
-        # Order matters: more specific first
-        if 'boards.greenhouse.io' in url_lower:
-            return 'greenhouse'
-        elif 'jobs.lever.co' in url_lower:
-            return 'lever'
-        elif 'myworkdayjobs' in url_lower or 'myworkday' in url_lower:
-            return 'workday'
-        elif 'linkedin.com' in url_lower:
-            return 'linkedin'
-        elif 'ashbyhq.com' in url_lower or 'ashby' in url_lower:
-            return 'ashby'
-        elif 'gem.com' in url_lower or 'gem' in url_lower:
-            return 'gem'
-        elif 'rippling' in url_lower:
-            return 'rippling'
-        elif 'smartrecruiters' in url_lower:
-            return 'smartrecruiters'
-        elif 'workable' in url_lower:
-            return 'workable'
-        else:
-            # Fallback: extract domain part
-            try:
-                parsed = urlparse(url)
-                domain = parsed.netloc.lower()
-                for forbidden in self.FORBIDDEN_ATS_DOMAINS:
-                    if forbidden in domain:
-                        return forbidden.split('.')[0] if '.' in forbidden else forbidden
-            except:
-                pass
-            return 'forbidden_ats'
-
-    def _log_forbidden_ats_url(self, url: str, platform: str):
-        """Append forbidden URL to {platform}.csv"""
-        filename = f"{platform}.csv"
-        file_exists = os.path.isfile(filename)
-        with open(filename, 'a', newline='', encoding='utf-8') as f:
-            writer = csv.writer(f)
-            if not file_exists:
-                writer.writerow(['url', 'timestamp'])
-            writer.writerow([url, datetime.now().isoformat()])
-        print(f" 📥 Logged forbidden ATS URL to {filename}: {url}")
-
-    async def _is_invalid_job_page(self, page_content: str) -> bool:
-        content_lower = page_content.lower()
-        return any(phrase in content_lower for phrase in self.INVALID_CONTENT_PHRASES)
-
-    def _extract_job_id_from_url(self, url: str) -> Optional[str]:
-        try:
-            parsed = urlparse(url)
-            path_parts = [p for p in parsed.path.split('/') if p]
-            if not path_parts:
-                return None
-
-            candidate = path_parts[-1]
-            candidate = re.split(r'[?#]', candidate)[0]
-            candidate = re.sub(r'\.html?$', '', candidate)
-
-            if not candidate or not any(c.isdigit() for c in candidate):
-                return None
-
-            if re.search(r'[A-Za-z]{6,}\s', candidate):
-                return None
-
-            return candidate
-        except:
-            return None
-
     async def scrape_jobs(
         self,
         search_keywords: Optional[str],
         max_pages: int = 1,
         credentials: Optional[Dict] = None
     ):
-        query = ""
-        location = ""
-        if search_keywords and search_keywords.strip():
-            parts = search_keywords.split(',', 1)
-            query = parts[0].strip()
-            if len(parts) > 1:
-                location = parts[1].strip()
-
-        clean_query = query.replace(' ', '+')
-        clean_location = location.replace(' ', '+')
-
-        search_url = "https://cryptocurrencyjobs.co/"
-        if clean_query:
-            search_url += f"?query={clean_query}"
-        if clean_location:
-            search_url += f"&location={clean_location}"
+        # cryptocurrencyjobs.co uses URL params differently
+        encoded_keywords = search_keywords.replace(" ", "%20")
+        search_url = f"https://cryptocurrencyjobs.co/?q={encoded_keywords}"

         profile = self.engine._select_profile()
         renderer = random.choice(self.engine.common_renderers[self.engine.os])
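The removed _extract_job_id_from_url is a small heuristic: take the last path segment, strip query/fragment leftovers and a .html suffix, and accept the result only if it contains at least one digit. A standalone sketch of that heuristic (the sample URL is made up):

import re
from urllib.parse import urlparse

def extract_job_id(url: str):
    # Last non-empty path segment is the candidate ID.
    parts = [p for p in urlparse(url).path.split("/") if p]
    if not parts:
        return None
    candidate = re.split(r"[?#]", parts[-1])[0]
    candidate = re.sub(r"\.html?$", "", candidate)
    # Require at least one digit so plain words are rejected.
    if not candidate or not any(c.isdigit() for c in candidate):
        return None
    return candidate

print(extract_job_id("https://boards.example.io/acme/jobs/4021530?src=feed"))  # 4021530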
@@ -336,103 +156,46 @@ class CryptoJobScraper:
         await context.add_init_script(spoof_script)

         page = await context.new_page()
-        print(f"🔍 Searching for: {search_keywords or 'all jobs'}")
-        print(f" 🔗 URL: {search_url}")
-        await page.goto(search_url, wait_until='networkidle', timeout=120000)
+
+        # Fetch main search page
+        print(f"🔍 Searching for: {search_keywords}")
+        await page.goto(search_url, wait_until='load', timeout=120000)
         await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)

-        try:
-            await page.wait_for_selector("a[href^='/'][href*='-']", timeout=10000)
-        except:
-            print(" ⚠️ No job links found initially, waiting longer...")
-            await asyncio.sleep(5 * self.human_speed)
-
-        seen_slugs = set()
-        all_job_elements = await self._handle_pagination_and_collect_all(page, search_keywords, seen_slugs)
-        print(f"✅ Collected {len(all_job_elements)} unique job links.")
+        all_job_links = []
+        seen_job_ids = set()
+
+        print("🔄 Collecting job links from search results...")
+        await self._scrape_jobs_from_current_page(page, search_keywords, seen_job_ids, all_job_links)
+        await self._handle_pagination(page, search_keywords, seen_job_ids, all_job_links)
+
+        print(f"✅ Collected {len(all_job_links)} unique job links.")

         scraped_count = 0
-        for idx, (href, title, job_element) in enumerate(all_job_elements):
-            job_detail_page = None
-            apply_page = None
-            skip_job = False
-            final_scrape_url = None
+        for idx, (href, title) in enumerate(all_job_links):
             try:
-                print(f" → Processing job {idx+1}/{len(all_job_elements)}: {title}")
-
-                posted_date = await self._extract_job_posted_date_from_card(job_element)
-
-                job_detail_page = await context.new_page()
-                await job_detail_page.goto(href, wait_until='networkidle', timeout=60000)
-                await asyncio.sleep(2 * self.human_speed)
-
-                page_content = await job_detail_page.content()
-                if await self._is_invalid_job_page(page_content):
-                    print(" 🚫 Page contains invalid content → skipping.")
-                    await job_detail_page.close()
-                    continue
-
-                apply_clicked = False
-                apply_selectors = [
-                    'a[href*="apply"], a:text("Apply"), a:text("Apply Now"), a:text("Apply here")',
-                    'button:text("Apply"), button:has-text("Apply")',
-                    '[data-testid="apply-button"], [aria-label*="apply"], [role="button"]:has-text("Apply")',
-                    'a.btn-apply, .apply-button, .apply-link, a:has-text("Apply")',
-                    'a[rel="noopener"]:has-text("Apply")',
-                ]
-
-                for sel in apply_selectors:
-                    apply_elem = await job_detail_page.query_selector(sel)
-                    if apply_elem:
-                        print(f" 🔗 Found Apply element with selector: {sel}")
-                        await self._human_click(job_detail_page, apply_elem, wait_after=True)
-                        apply_clicked = True
-                        break
-
-                apply_page = job_detail_page
-
-                if apply_clicked:
-                    await asyncio.sleep(random.uniform(3.0, 6.0) * self.human_speed)
-                    pages = context.pages
-                    new_pages = [p for p in pages if p != job_detail_page and p.url != "about:blank"]
-
-                    if new_pages:
-                        candidate_page = new_pages[-1]
-                        new_url = candidate_page.url.strip()
-                        print(f" New tab opened: {new_url}")
-
-                        if new_url and await self._is_forbidden_ats_url(new_url):
-                            platform = self._get_ats_platform_name(new_url)
-                            self._log_forbidden_ats_url(new_url, platform)
-                            if candidate_page != job_detail_page:
-                                await candidate_page.close()
-                            await job_detail_page.close()
-                            skip_job = True
-                        else:
-                            apply_page = candidate_page
-                    else:
-                        print(" No new tab → using original page.")
-
-                if skip_job:
-                    continue
-
-                final_scrape_url = apply_page.url
-
-                page_content = await self._extract_page_content_for_llm(apply_page)
-                if await self._is_invalid_job_page(page_content):
-                    print(" 🚫 Final page contains invalid content → skipping.")
-                    if apply_page != job_detail_page:
-                        await apply_page.close()
-                    await job_detail_page.close()
-                    continue
-
-                job_id = self._extract_job_id_from_url(final_scrape_url)
-                if not job_id:
-                    job_id = "job_" + hashlib.md5(final_scrape_url.encode()).hexdigest()[:12]
+                full_url = href
+                print(f" → Opening job {idx+1}/{len(all_job_links)}: {full_url}")
+
+                fetcher = StealthyFetcher(self.engine, browser, context)
+                job_page = await fetcher.fetch_url(full_url, wait_for_selector="h1")
+                if not job_page:
+                    print(f" ❌ Failed to fetch job page {full_url}")
+                    await self._add_job_to_redis_cache(full_url, full_url.split("/")[-1], "fetch_failure")
+                    self.engine.report_outcome("fetch_failure", url=full_url)
+                    continue
+
+                posted_date = await self._extract_job_posted_date(job_page)
+                await self.engine._human_like_scroll(job_page)
+                await asyncio.sleep(2 * self.human_speed)
+                page_content = await self._extract_page_content_for_llm(job_page)
+
+                job_id = full_url.split("/")[-1] if full_url.split("/")[-1] else "unknown"

                 raw_data = {
                     "page_content": page_content,
-                    "url": final_scrape_url,
+                    "url": full_url,
                     "job_id": job_id,
                     "search_keywords": search_keywords,
                     "posted_date": posted_date
@@ -447,45 +210,44 @@
                     if field == 'job_id':
                         refined_data[field] = job_id
                     elif field == 'url':
-                        refined_data[field] = final_scrape_url
+                        refined_data[field] = full_url
                     elif field == 'company_name':
                         refined_data[field] = "Unknown Company"

                 refined_data['scraped_at'] = datetime.now().isoformat()
-                refined_data['category'] = search_keywords or "all"
+                refined_data['category'] = search_keywords
                 refined_data['posted_date'] = posted_date
-                await self.llm_agent.save_job_data(refined_data, search_keywords or "all")
+                await self.llm_agent.save_job_data(refined_data, search_keywords)
                 scraped_count += 1
-                print(f" ✅ Scraped: {refined_data['title'][:50]}... (Job ID: {job_id})")
-                self.engine.report_outcome("success", url=final_scrape_url)
+                print(f" ✅ Scraped and refined: {refined_data['title'][:50]}...")
+                self.engine.report_outcome("success", url=raw_data["url"])
                 else:
-                    print(f" 🟡 Could not extract meaningful data from: {final_scrape_url}")
-                    await self._add_job_to_redis_cache(final_scrape_url, job_id, "llm_failure")
-                    self.engine.report_outcome("llm_failure", url=final_scrape_url)
+                    print(f" 🟡 Could not extract meaningful data from: {full_url}")
+                    await self._add_job_to_redis_cache(full_url, job_id, "llm_failure")
+                    self.engine.report_outcome("llm_failure", url=raw_data["url"])

-                if apply_page != job_detail_page and not apply_page.is_closed():
-                    await apply_page.close()
-                if job_detail_page and not job_detail_page.is_closed():
-                    await job_detail_page.close()
+                await job_page.close()

             except Exception as e:
                 error_msg = str(e)[:100]
                 print(f" ⚠️ Failed on job {idx+1}: {error_msg}")
-                job_id_for_log = "unknown"
-                if 'final_scrape_url' in locals() and final_scrape_url:
-                    job_id_for_log = "job_" + hashlib.md5(final_scrape_url.encode()).hexdigest()[:12]
-                await self._add_job_to_redis_cache(href, job_id_for_log, f"exception: {error_msg}")
-                if job_detail_page and not job_detail_page.is_closed():
-                    await job_detail_page.close()
-                if 'apply_page' in locals() and apply_page and apply_page != job_detail_page and not apply_page.is_closed():
-                    await apply_page.close()
+                job_id = full_url.split("/")[-1] if 'full_url' in locals() else "unknown"
+                job_url = full_url if 'full_url' in locals() else "unknown"
+                await self._add_job_to_redis_cache(job_url, job_id, f"exception: {error_msg}")
+                if 'job_page' in locals() and job_page:
+                    await job_page.close()
                 continue

+            finally:
+                print(" ↩️ Returning to search results...")
+                await page.goto(search_url, timeout=120000)
+                await asyncio.sleep(4 * self.human_speed)

         await browser.close()

         if scraped_count > 0:
             self.engine.report_outcome("success")
-            print(f"✅ Completed! Processed {scraped_count} jobs for '{search_keywords or 'all jobs'}'.")
+            print(f"✅ Completed! Processed {scraped_count} jobs for '{search_keywords}' based on request '{self.user_request}'.")
         else:
             self.engine.report_outcome("scraping_error")
             print("⚠️ No jobs processed successfully.")
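Failed URLs are recorded through _add_job_to_redis_cache, whose body falls outside the hunks shown here. A hedged sketch of what such a failure cache can look like; the connection parameters match the redis.Redis(...) call earlier in this diff, while the key layout and TTL are illustrative assumptions (running it requires a local Redis server):

from datetime import datetime
import redis

r = redis.Redis(host="localhost", port=6379, db=0, decode_responses=True)

def cache_failed_job(url: str, job_id: str, reason: str) -> None:
    # One hash per job keeps URL, failure reason, and timestamp together.
    key = f"failed_job:{job_id}"  # assumption: key layout is not shown in the diff
    r.hset(key, mapping={
        "url": url,
        "reason": reason,
        "failed_at": datetime.now().isoformat(),
    })
    r.expire(key, 7 * 24 * 3600)  # assumption: drop entries after a week

cache_failed_job("https://example.com/jobs/123", "123", "llm_failure")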
@@ -1,5 +0,0 @@
-url,timestamp
-https://apply.workable.com/thetie/j/C54DFC9985/?ref=cryptocurrencyjobs.co,2025-12-31T08:24:45.755671
-https://apply.workable.com/thetie/j/C54DFC9985/?ref=cryptocurrencyjobs.co,2025-12-31T09:51:08.343642
-https://apply.workable.com/thetie/j/2745433865/?ref=cryptocurrencyjobs.co,2025-12-31T09:51:28.331543
-https://apply.workable.com/thetie/j/1A6C8F2913/?ref=cryptocurrencyjobs.co,2025-12-31T11:22:54.623723
File diff suppressed because it is too large
@@ -1 +0,0 @@
-url,timestamp

1045
workdaycompanies.csv
File diff suppressed because it is too large