Compare commits
26 Commits
main...crypto_age
| Author | SHA1 | Date |
|---|---|---|
|  | 96470487f0 |  |
|  | b0e90972b1 |  |
|  | 06f8e8b086 |  |
|  | 38ef08c734 |  |
|  | 1005dfc041 |  |
|  | 37da7b2c1a |  |
|  | 6cc60844a5 |  |
|  | 5c9b23cc9b |  |
|  | d68018224b |  |
|  | e184e21ec1 |  |
|  | 38915ed5fd |  |
|  | e86e8d68ca |  |
|  | ecbfaaa7f1 |  |
|  | 88525bfc99 |  |
|  | caed8d8056 |  |
|  | e257d03b4c |  |
|  | 2b1387b3e6 |  |
|  | 8fa59ba69b |  |
|  | 91047cfc5c |  |
|  | 224b9c3122 |  |
|  | 160efadbfb |  |
|  | 4f78a845ae |  |
|  | d7d92ba8bb |  |
|  | d025828036 |  |
|  | fd4e8c9c05 |  |
|  | 7dca4c9159 |  |
BIN  __pycache__/config.cpython-313.pyc  Normal file (binary file not shown)
BIN  __pycache__/fetcher.cpython-313.pyc  Normal file (binary file not shown)
BIN  __pycache__/job_scraper2.cpython-313.pyc  Normal file (binary file not shown)
BIN  __pycache__/llm_agent.cpython-313.pyc  Normal file (binary file not shown)
BIN  __pycache__/scraping_engine.cpython-313.pyc  Normal file (binary file not shown)
10  ashby.csv  Normal file
@@ -0,0 +1,10 @@
url,timestamp
https://jobs.ashbyhq.com/stellar/a8377cf4-280b-4eb3-ac44-a4c9020c2eaf?utm_source=cryptocurrencyjobs.co,2025-12-31T08:32:17.821505
https://jobs.ashbyhq.com/artemisanalytics/5f61b6c6-147c-4707-9003-a9632455b984?utm_source=cryptocurrencyjobs.co,2025-12-31T08:51:57.190172
https://jobs.ashbyhq.com/lightning/2d77b496-ab0d-4e54-bcf8-33260d1bab6b?utm_source=cryptocurrencyjobs.co,2025-12-31T09:07:09.491831
https://jobs.ashbyhq.com/Braiins/cee9cf74-6049-4dab-aae7-96bef0082689?utm_source=cryptocurrencyjobs.co,2025-12-31T09:35:28.137181
https://jobs.ashbyhq.com/blockstream/80ebab98-0039-48bf-86d9-9a2a7962b005?utm_source=cryptocurrencyjobs.co,2025-12-31T10:21:19.253356
https://jobs.ashbyhq.com/dynamic/fde8a9ff-9701-485f-a8d1-e717c170f215?utm_source=cryptocurrencyjobs.co,2025-12-31T10:25:55.141543
https://jobs.ashbyhq.com/ether.fi/6eb1e350-71ce-47f7-a363-3fa3c521dacb?utm_source=cryptocurrencyjobs.co,2025-12-31T10:44:35.913725
https://chainlinklabs.com/open-roles?ashby_jid=112a76d3-4dfd-4eea-828c-41465760b3ef&utm_source=ccj,2025-12-31T10:49:07.453900
https://jobs.ashbyhq.com/stellar/cdad9af1-9e64-4fd4-8e2c-f87389f1dd16?utm_source=cryptocurrencyjobs.co,2025-12-31T11:13:58.119967
1591  ashbycompanies.csv  Normal file
File diff suppressed because it is too large.
166  comparator.py  Normal file
@@ -0,0 +1,166 @@
import csv
import os
from urllib.parse import urlparse

# Define platform mappings: (input_file, companies_file, platform_name)
platforms = [
    ("ashby.csv", "ashbycompanies.csv", "ashby"),
    ("gem.csv", "gemcompanies.csv", "gem"),
    ("greenhouse.csv", "greenhousecompanies.csv", "greenhouse"),
    ("lever.csv", "levercompanies.csv", "lever"),
    ("rippling.csv", "ripplingcompanies.csv", "rippling"),
    ("workable.csv", "workablecompanies.csv", "workable"),
    ("workday.csv", "workdaycompanies.csv", "workday"),
]


def normalize_url(platform, url):
    """Normalize URL to a company identifier based on platform."""
    if not url:
        return None
    try:
        parsed = urlparse(url.lower().strip())
        netloc = parsed.netloc
        path = parsed.path

        if platform == "ashby":
            # https://jobs.ashbyhq.com/company_slug/...
            if "ashbyhq.com" in netloc:
                parts = [p for p in path.split('/') if p]
                return parts[0] if parts else None

        elif platform == "greenhouse":
            # https://boards.greenhouse.io/company_slug/...
            if "greenhouse.io" in netloc:
                parts = [p for p in path.split('/') if p]
                if len(parts) >= 2 and parts[0] == "boards":
                    return parts[1]
                elif len(parts) >= 1:
                    return parts[0]
            return None

        elif platform == "lever":
            # https://jobs.lever.co/company_slug/...
            if "lever.co" in netloc:
                parts = [p for p in path.split('/') if p]
                return parts[0] if parts else None

        elif platform == "workable":
            # https://apply.workable.com/company_slug/...
            if "workable.com" in netloc:
                parts = [p for p in path.split('/') if p]
                # Usually: /company_slug/j/jobid/ → take first non-'j' segment
                for part in parts:
                    if part != 'j' and len(part) > 2:
                        return part
                return parts[0] if parts else None

        elif platform == "workday":
            # https://company.workday.com/... → company = subdomain
            if "myworkdayjobs.com" in netloc or "wd" in netloc:
                # Extract subdomain before main domain
                subdomain = netloc.split('.')[0]
                if subdomain and subdomain not in ['www', 'jobs', 'apply', '']:
                    return subdomain
            # Fallback: look for company in path (rare)
            parts = [p for p in path.split('/') if p]
            if parts:
                return parts[0]
            return None

        elif platform == "gem":
            # https://gem.com/company/... or https://www.gem.com/careers/company/...
            if "gem.com" in netloc:
                parts = [p for p in path.split('/') if p]
                # Often: /company-slug or /careers/company-slug
                for i, part in enumerate(parts):
                    if part in ['company', 'careers', 'jobs']:
                        if i + 1 < len(parts):
                            return parts[i + 1]
                return parts[0] if parts else None

        elif platform == "rippling":
            # Rippling uses generic domain; hard to extract company
            # Best effort: use full domain + first path segment
            if "rippling.com" in netloc:
                parts = [p for p in path.split('/') if p]
                if parts:
                    return f"{netloc}/{parts[0]}"
                return netloc

        # Fallback: return full URL if unrecognized
        return url

    except Exception:
        return url


def read_company_signatures(filepath, platform):
    """Read and normalize company identifiers from companies CSV."""
    if not os.path.exists(filepath):
        return set()
    signatures = set()
    with open(filepath, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            url = row.get('url', '').strip()
            if url:
                sig = normalize_url(platform, url)
                if sig:
                    signatures.add(sig)
    return signatures


def filter_csv_by_signatures(input_file, excluded_signatures, platform):
    """Keep only rows whose normalized URL is NOT in excluded_signatures."""
    if not os.path.exists(input_file):
        return [], None
    kept_rows = []
    with open(input_file, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        fieldnames = reader.fieldnames
        for row in reader:
            url = row.get('url', '').strip()
            if not url:
                kept_rows.append(row)  # keep if no URL (shouldn't happen)
                continue
            sig = normalize_url(platform, url)
            if sig not in excluded_signatures:
                kept_rows.append(row)
    return kept_rows, fieldnames


def write_csv(filepath, rows, fieldnames):
    """Write rows to CSV file."""
    with open(filepath, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


def main():
    for input_file, companies_file, platform in platforms:
        print(f"Processing {input_file} against {companies_file} using '{platform}' normalizer...")

        # Step 1: Load and normalize known company signatures
        known_signatures = read_company_signatures(companies_file, platform)
        print(f" → Loaded {len(known_signatures)} known company signatures from {companies_file}")

        # Step 2: Filter input file using signatures
        kept_rows, fieldnames = filter_csv_by_signatures(input_file, known_signatures, platform)

        # Step 3: Write back filtered data
        if fieldnames:
            write_csv(input_file, kept_rows, fieldnames)
            print(f" → Kept {len(kept_rows)} new job URLs in {input_file}")
        else:
            if os.path.exists(input_file):
                os.remove(input_file)
                print(f" → {input_file} was empty or invalid — removed.")

    print("\n✅ All platforms processed successfully.")


if __name__ == "__main__":
    main()
10  config.py
@@ -2,6 +2,16 @@
import os
import json

from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# LLM Agent Configuration
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")
if not DEEPSEEK_API_KEY:
    raise ValueError("DEEPSEEK_API_KEY environment variable not set in .env file")


def load_spoof_config():
    """Load spoof data from JSON config file. Falls back to defaults if missing."""
125  fetcher.py  Normal file
@@ -0,0 +1,125 @@
import asyncio
import random
import time
from playwright.async_api import Page, BrowserContext, Browser, TimeoutError as PlaywrightTimeoutError
from typing import Optional
from scraping_engine import FingerprintScrapingEngine


class StealthyFetcher:
    def __init__(self, engine: FingerprintScrapingEngine, browser: Browser, context: BrowserContext):
        self.engine = engine
        self.browser = browser
        self.context = context
        self.max_retries = 5
        self.base_delay = 5

    async def fetch_url(self, url: str, wait_for_selector: Optional[str] = None) -> Optional[Page]:
        """
        Fetch a URL using stealth techniques, handling Cloudflare and other protections intelligently.
        """
        for attempt in range(self.max_retries):
            try:
                print(f"Attempt {attempt + 1} to fetch {url}")
                page = await self.context.new_page()

                await page.goto(url, wait_until='load', timeout=120000)

                if wait_for_selector:
                    try:
                        await page.wait_for_selector(wait_for_selector, timeout=120000)
                    except PlaywrightTimeoutError:
                        print(f"Selector {wait_for_selector} not found immediately, continuing...")

                await self._apply_human_behavior(page)

                protection_type = await self._detect_protection(page)
                if protection_type:
                    print(f"🛡️ Protection detected: {protection_type}")
                    content_accessible = await self._is_content_accessible(page, wait_for_selector)
                    if not content_accessible:
                        print("🔒 Content not accessible due to protection.")
                        handled = False
                        if protection_type == "cloudflare":
                            handled = await self._handle_cloudflare(page)
                        elif protection_type == "captcha":
                            handled = await self._handle_captcha(page)
                        if not handled:
                            print("❌ Failed to handle protection.")
                            await page.close()
                            await asyncio.sleep(self.base_delay * (2 ** attempt))
                            continue
                    else:
                        print("✅ Protection present but content is accessible — proceeding.")

                print(f"✅ Successfully fetched {url}")
                return page

            except Exception as e:
                print(f"Attempt {attempt + 1} failed for {url}: {str(e)}")
                if 'page' in locals():
                    await page.close()
                await asyncio.sleep(self.base_delay * (2 ** attempt))

        print(f"❌ Failed to fetch {url} after {self.max_retries} attempts.")
        return None

    async def _apply_human_behavior(self, page: Page):
        await self.engine._human_like_scroll(page)
        await asyncio.sleep(random.uniform(1, 3))
        await self.engine._simulate_human_interaction(page)
        await asyncio.sleep(random.uniform(1, 2))

    async def _detect_protection(self, page: Page) -> Optional[str]:
        content = (await page.content()).lower()
        if (
            "#cf-chl" in content
            or "checking your browser" in content
            or "just a moment" in content
            or "cloudflare" in content
            or "ddos protection" in content
            or "turnstile" in content
        ):
            return "cloudflare"
        elif "captcha" in content or "robot" in content or "verify you're human" in content:
            return "captcha"
        return None

    async def _is_content_accessible(self, page: Page, wait_for_selector: Optional[str] = None) -> bool:
        if wait_for_selector:
            try:
                await page.wait_for_selector(wait_for_selector, timeout=120000)
                return True
            except PlaywrightTimeoutError:
                pass
        try:
            body_text = await page.eval_on_selector("body", "el => el.innerText.toLowerCase()")
            return len(body_text.strip()) > 200
        except:
            return False

    async def _handle_captcha(self, page: Page) -> bool:
        print("🦾 Using 'avoid' strategy for captcha — skipping page.")
        return False

    async def _handle_cloudflare(self, page: Page) -> bool:
        max_wait_time = 60
        start_time = time.time()

        while time.time() - start_time < max_wait_time:
            if not await self._detect_protection(page):
                print("☁️ Cloudflare challenge resolved.")
                return True

            print("☁️ Cloudflare active, waiting...")
            await self._apply_human_behavior(page)
            wait_time = min(10, 2 + random.uniform(1, 3) + (time.time() - start_time) * 0.1)
            await asyncio.sleep(wait_time)

            if (time.time() - start_time) > 15 and (time.time() - start_time) % 20 < 2:
                print("🔄 Reloading page during Cloudflare wait...")
                await page.reload(wait_until='load', timeout=120000)

        print("⏰ Timeout waiting for Cloudflare resolution.")
        return False
508  gemcompanies.csv  Normal file
@@ -0,0 +1,508 @@
name,url
10Xconstruction Ai,https://jobs.gem.com/10xconstruction-ai
11X Ai,https://jobs.gem.com/11x-ai
43North,https://jobs.gem.com/43north
8020 Consulting,https://jobs.gem.com/8020-consulting
A16Z Speedrun,https://jobs.gem.com/a16z-speedrun
Aarden Ai,https://jobs.gem.com/aarden-ai
Accel,https://jobs.gem.com/accel
Accelos,https://jobs.gem.com/accelos
Acre,https://jobs.gem.com/acre
Advancelevelllc Com,https://jobs.gem.com/advancelevelllc-com
Agenta Ai,https://jobs.gem.com/agenta-ai
Agentnoon,https://jobs.gem.com/agentnoon
Agora,https://jobs.gem.com/agora
Aionex Xyz,https://jobs.gem.com/aionex-xyz
Aiphrodite Ai,https://jobs.gem.com/aiphrodite-ai
Airframe,https://jobs.gem.com/airframe
Airvet Com,https://jobs.gem.com/airvet-com
Alex And Ani,https://jobs.gem.com/alex-and-ani
Alinia Ai,https://jobs.gem.com/alinia-ai
Alitheon,https://jobs.gem.com/alitheon
Alpharun,https://jobs.gem.com/alpharun
Altzero Xyz,https://jobs.gem.com/altzero-xyz
Amya Agency,https://jobs.gem.com/amya-agency
Andrenam,https://jobs.gem.com/andrenam
Anysphere,https://jobs.gem.com/anysphere
Aoniclife,https://jobs.gem.com/aoniclife
Apartment List,https://jobs.gem.com/apartment-list
Apella,https://jobs.gem.com/apella
Apticore Io,https://jobs.gem.com/apticore-io
Arlo,https://jobs.gem.com/arlo
Ascenda Loyalty,https://jobs.gem.com/ascenda-loyalty
Ascendarc,https://jobs.gem.com/ascendarc
Astroforge Io,https://jobs.gem.com/astroforge-io
Atla Ai Com,https://jobs.gem.com/atla-ai-com
Atomica,https://jobs.gem.com/atomica
Audicus,https://jobs.gem.com/audicus
Aurelian Io,https://jobs.gem.com/aurelian-io
Aureliussystems Us,https://jobs.gem.com/aureliussystems-us
Autopilotbrand Com,https://jobs.gem.com/autopilotbrand-com
Avoca,https://jobs.gem.com/avoca
Avol,https://jobs.gem.com/avol
Axonify,https://jobs.gem.com/axonify
Backops Ai,https://jobs.gem.com/backops-ai
Basalt Health,https://jobs.gem.com/basalt-health
Baxter Aerospace,https://jobs.gem.com/baxter-aerospace
Bead Ai,https://jobs.gem.com/bead-ai
Benbase,https://jobs.gem.com/benbase
Better Auth,https://jobs.gem.com/better-auth
Betterbasket Ai,https://jobs.gem.com/betterbasket-ai
Bigeye,https://jobs.gem.com/bigeye
Bigpanda,https://jobs.gem.com/bigpanda
Bikky,https://jobs.gem.com/bikky
Bilt,https://jobs.gem.com/bilt
Binarly,https://jobs.gem.com/binarly
Biofire,https://jobs.gem.com/biofire
Biorender,https://jobs.gem.com/biorender
Biorender Inc Ats,https://jobs.gem.com/biorender-inc--ats
Birdwood Therapeutics,https://jobs.gem.com/birdwood-therapeutics
Black Ore,https://jobs.gem.com/black-ore
Blaze Ai,https://jobs.gem.com/blaze-ai
Blazetalent,https://jobs.gem.com/blazetalent
Blend Inc,https://jobs.gem.com/blend-inc
Blue J,https://jobs.gem.com/blue-j
Bluejeanfinancial Com,https://jobs.gem.com/bluejeanfinancial-com
Blueonion Ai,https://jobs.gem.com/blueonion-ai
Blueprint,https://jobs.gem.com/blueprint
Bluesky,https://jobs.gem.com/bluesky
Blume Technologies,https://jobs.gem.com/blume-technologies
Bohler ,https://jobs.gem.com/bohler-
Bohler Engineering Gemats,https://jobs.gem.com/bohler-engineering-gemats
Bolna,https://jobs.gem.com/bolna
Bond Partners,https://jobs.gem.com/bond-partners
Boost Robotics,https://jobs.gem.com/boost-robotics
Boredm,https://jobs.gem.com/boredm
Breadcrumb Ai,https://jobs.gem.com/breadcrumb-ai
Breakline Ats,https://jobs.gem.com/breakline-ats
Breakline Education,https://jobs.gem.com/breakline-education
Brewbird,https://jobs.gem.com/brewbird
Buildtrayd Com,https://jobs.gem.com/buildtrayd-com
Bull Moose Xyz,https://jobs.gem.com/bull-moose-xyz
Cadstrom Io,https://jobs.gem.com/cadstrom-io
Caffelabs Com,https://jobs.gem.com/caffelabs-com
Calaveras,https://jobs.gem.com/calaveras
Canals,https://jobs.gem.com/canals
Caplight Com,https://jobs.gem.com/caplight-com
Carbon,https://jobs.gem.com/carbon
Cardnexus,https://jobs.gem.com/cardnexus
Careers,https://jobs.gem.com/careers
Carry,https://jobs.gem.com/carry
Caseflood Ai,https://jobs.gem.com/caseflood-ai
Cellbyte,https://jobs.gem.com/cellbyte
Chartahealth,https://jobs.gem.com/chartahealth
Civrobotics Com,https://jobs.gem.com/civrobotics-com
Clarity,https://jobs.gem.com/clarity
Clearchecks Com Ats,https://jobs.gem.com/clearchecks-com-ats
Clearesthealth Com,https://jobs.gem.com/clearesthealth-com
Cloudanix Com,https://jobs.gem.com/cloudanix-com
Cloudraft,https://jobs.gem.com/cloudraft
Codegen,https://jobs.gem.com/codegen
Codesignal,https://jobs.gem.com/codesignal
Cognna,https://jobs.gem.com/cognna
Cogram,https://jobs.gem.com/cogram
Comfy,https://jobs.gem.com/comfy
Conductorai,https://jobs.gem.com/conductorai
Confida Ai,https://jobs.gem.com/confida-ai
Context Wtf,https://jobs.gem.com/context-wtf
Contour App,https://jobs.gem.com/contour-app
Converge,https://jobs.gem.com/converge
Coupa Software Inc Ats 1,https://jobs.gem.com/coupa-software-inc-ats-1
Cox Exponential,https://jobs.gem.com/cox-exponential
Crabi Robotics Com,https://jobs.gem.com/crabi-robotics-com
Crackenagi,https://jobs.gem.com/crackenagi
Create Talent Group,https://jobs.gem.com/create-talent-group
Createdbyhumans Ai,https://jobs.gem.com/createdbyhumans-ai
Credit Key,https://jobs.gem.com/credit-key
Crosby,https://jobs.gem.com/crosby
Curex Org,https://jobs.gem.com/curex-org
Curiouscardinals Com,https://jobs.gem.com/curiouscardinals-com
Cyvl,https://jobs.gem.com/cyvl
D4M International,https://jobs.gem.com/d4m-international
Dalus,https://jobs.gem.com/dalus
Dash Fi,https://jobs.gem.com/dash-fi
Data Masters,https://jobs.gem.com/data-masters
Datacurve Ai,https://jobs.gem.com/datacurve-ai
Dataday Technology Solutions,https://jobs.gem.com/dataday-technology-solutions
Datagrid,https://jobs.gem.com/datagrid
Dawn Media,https://jobs.gem.com/dawn-media
Daxko,https://jobs.gem.com/daxko
Deep Infra,https://jobs.gem.com/deep-infra
Deliver,https://jobs.gem.com/deliver
Detections Ai,https://jobs.gem.com/detections-ai
Dianahr Ai,https://jobs.gem.com/dianahr-ai
Distributed Spectrum,https://jobs.gem.com/distributed-spectrum
Dlvrlog,https://jobs.gem.com/dlvrlog
Doowii,https://jobs.gem.com/doowii
Dragme,https://jobs.gem.com/dragme
Dragonfly Careers,https://jobs.gem.com/dragonfly-careers
Dropback,https://jobs.gem.com/dropback
Durin,https://jobs.gem.com/durin
Dydx,https://jobs.gem.com/dydx
Eats2Seats,https://jobs.gem.com/eats2seats
Echelon,https://jobs.gem.com/echelon
Ecocart Io,https://jobs.gem.com/ecocart-io
Edgetrace Ai,https://jobs.gem.com/edgetrace-ai
Efference Ai,https://jobs.gem.com/efference-ai
Elite Talent Consulting,https://jobs.gem.com/elite-talent-consulting
Eliza,https://jobs.gem.com/eliza
Elloe Ai,https://jobs.gem.com/elloe-ai
Elo Ai,https://jobs.gem.com/elo-ai
Emerge Career,https://jobs.gem.com/emerge-career
Engineering Codified,https://jobs.gem.com/engineering--codified
Entrusted Contracting,https://jobs.gem.com/entrusted-contracting
Escargot Com,https://jobs.gem.com/escargot-com
Everfit Io,https://jobs.gem.com/everfit-io
Excelity Careers,https://jobs.gem.com/excelity-careers
Exponent,https://jobs.gem.com/exponent
Ezraailabs Tech,https://jobs.gem.com/ezraailabs-tech
Fabric,https://jobs.gem.com/fabric
Fabrichealth,https://jobs.gem.com/fabrichealth
Fancypeople,https://jobs.gem.com/fancypeople
Fanpierlabs Com,https://jobs.gem.com/fanpierlabs-com
Faraday,https://jobs.gem.com/faraday
Fathom Org,https://jobs.gem.com/fathom-org
Felix,https://jobs.gem.com/felix
Ferry Health,https://jobs.gem.com/ferry-health
Fetch Ats,https://jobs.gem.com/fetch-ats
Fifthdoor Com,https://jobs.gem.com/fifthdoor-com
Fireflies,https://jobs.gem.com/fireflies
Firestorm,https://jobs.gem.com/firestorm
Flatfee Corp,https://jobs.gem.com/flatfee-corp
Flint,https://jobs.gem.com/flint
Floot,https://jobs.gem.com/floot
Forgent Ai,https://jobs.gem.com/forgent-ai
Fountainplatform Com,https://jobs.gem.com/fountainplatform-com
Foxbox Digital,https://jobs.gem.com/foxbox-digital
Freestone Grove Partners,https://jobs.gem.com/freestone-grove-partners
Freshbooks,https://jobs.gem.com/freshbooks
Fridayharbor Ai,https://jobs.gem.com/fridayharbor-ai
Fuelfinance,https://jobs.gem.com/fuelfinance
Fulcrumcareers,https://jobs.gem.com/fulcrumcareers
Function Health,https://jobs.gem.com/function-health
Galadyne,https://jobs.gem.com/galadyne
Galaxyventures,https://jobs.gem.com/galaxyventures
Gc Ai,https://jobs.gem.com/gc-ai
Gem,https://jobs.gem.com/gem
Gem Mckesson,https://jobs.gem.com/gem-mckesson
Gem Test Board,https://jobs.gem.com/gem-test-board
Generation Alpha Transistor,https://jobs.gem.com/generation-alpha-transistor
Genspark,https://jobs.gem.com/genspark
Gerra,https://jobs.gem.com/gerra
Getaero Io,https://jobs.gem.com/getaero-io
Getbirdeye Com Au,https://jobs.gem.com/getbirdeye-com-au
Getro,https://jobs.gem.com/getro
Gigaml,https://jobs.gem.com/gigaml
Go Cadre,https://jobs.gem.com/go-cadre
Goatrecruit Com,https://jobs.gem.com/goatrecruit-com
Good Life Companies,https://jobs.gem.com/good-life-companies
Goodbill,https://jobs.gem.com/goodbill
Grailpay Com,https://jobs.gem.com/grailpay-com
Granger Construction,https://jobs.gem.com/granger-construction
Gratia Health,https://jobs.gem.com/gratia-health
Greenlite Ai,https://jobs.gem.com/greenlite-ai
Greenvalleyjobs,https://jobs.gem.com/greenvalleyjobs
Grit,https://jobs.gem.com/grit
Groq,https://jobs.gem.com/groq
Growthbook,https://jobs.gem.com/growthbook
Guardrail Ai,https://jobs.gem.com/guardrail-ai
Guidesage Ai,https://jobs.gem.com/guidesage-ai
Hallow,https://jobs.gem.com/hallow
Happydance Partnership Integration,https://jobs.gem.com/happydance-partnership-integration
Harmonic,https://jobs.gem.com/harmonic
Hash,https://jobs.gem.com/hash
Hayla,https://jobs.gem.com/hayla
Heavy Construction Systems Specialists Llc,https://jobs.gem.com/heavy-construction-systems-specialists-llc
Helix,https://jobs.gem.com/helix
Hellotrade,https://jobs.gem.com/hellotrade
Helm Health,https://jobs.gem.com/helm-health
Hilabs Ie,https://jobs.gem.com/hilabs-ie
Hipeople,https://jobs.gem.com/hipeople
Holacasa Yc W23,https://jobs.gem.com/holacasa-yc-w23
Homeboost,https://jobs.gem.com/homeboost
Hospitable,https://jobs.gem.com/hospitable
Howrecruit Io,https://jobs.gem.com/howrecruit-io
Hubspot,https://jobs.gem.com/hubspot
Hypernatural Ai,https://jobs.gem.com/hypernatural-ai
Inception,https://jobs.gem.com/inception
Index Exchange,https://jobs.gem.com/index-exchange
Infrastructure Modernization Solutions,https://jobs.gem.com/infrastructure-modernization-solutions
Inspiration Commerce Group,https://jobs.gem.com/inspiration-commerce-group
Inspiresemi Com,https://jobs.gem.com/inspiresemi-com
Instrumental Inc ,https://jobs.gem.com/instrumental-inc-
Integral Xyz,https://jobs.gem.com/integral-xyz
Integrationscaptain,https://jobs.gem.com/integrationscaptain
Intelligentresourcing Co,https://jobs.gem.com/intelligentresourcing-co
Interfere Old,https://jobs.gem.com/interfere-old
Invoicebutler Ai,https://jobs.gem.com/invoicebutler-ai
Iris,https://jobs.gem.com/iris
Ironsite Ai,https://jobs.gem.com/ironsite-ai
Itsvaleria Co,https://jobs.gem.com/itsvaleria-co
Jaguaracareers,https://jobs.gem.com/jaguaracareers
Janie,https://jobs.gem.com/janie
Jayla Careers,https://jobs.gem.com/jayla-careers
Jobma,https://jobs.gem.com/jobma
Joinanvil Com,https://jobs.gem.com/joinanvil-com
Joinformal,https://jobs.gem.com/joinformal
Joyful Health,https://jobs.gem.com/joyful-health
Kaikaku,https://jobs.gem.com/kaikaku
Kaironhealth,https://jobs.gem.com/kaironhealth
Kaironhealth Com,https://jobs.gem.com/kaironhealth-com
Kanu Ai,https://jobs.gem.com/kanu-ai
Kcs Hiring,https://jobs.gem.com/kcs-hiring
Keru Ai,https://jobs.gem.com/keru-ai
Key To Web3,https://jobs.gem.com/key-to-web3
Knight Electric Inc ,https://jobs.gem.com/knight-electric-inc-
Kollectiv Ai,https://jobs.gem.com/kollectiv-ai
Kumo Ai,https://jobs.gem.com/kumo-ai
Lantern,https://jobs.gem.com/lantern
Lavapayments Com,https://jobs.gem.com/lavapayments-com
Leap Tools,https://jobs.gem.com/leap-tools
Letsdata,https://jobs.gem.com/letsdata
Letter Ai,https://jobs.gem.com/letter-ai
Level,https://jobs.gem.com/level
Linktree,https://jobs.gem.com/linktree
Little Otter,https://jobs.gem.com/little-otter
Lower Llc,https://jobs.gem.com/lower-llc
Lumalabs Ai,https://jobs.gem.com/lumalabs-ai
Lunajoy,https://jobs.gem.com/lunajoy
Lunch,https://jobs.gem.com/lunch
Lunos Ai,https://jobs.gem.com/lunos-ai
Magnetic,https://jobs.gem.com/magnetic
Manifest,https://jobs.gem.com/manifest
Manifested Com,https://jobs.gem.com/manifested-com
Marble Health,https://jobs.gem.com/marble-health
Mavi,https://jobs.gem.com/mavi
Meetdex Ai,https://jobs.gem.com/meetdex-ai
Megapot,https://jobs.gem.com/megapot
Meineautosdirekt,https://jobs.gem.com/meineautosdirekt
Menten Ai,https://jobs.gem.com/menten-ai
Merge Sandbox,https://jobs.gem.com/merge-sandbox
Metal Ai,https://jobs.gem.com/metal-ai
Microsoft Demo Gem Com,https://jobs.gem.com/microsoft-demo-gem-com
Mimicrobotics Com,https://jobs.gem.com/mimicrobotics-com
Mission,https://jobs.gem.com/mission
Moosehead Talent,https://jobs.gem.com/moosehead-talent
Motion,https://jobs.gem.com/motion
Moxa,https://jobs.gem.com/moxa
Multiplierhq,https://jobs.gem.com/multiplierhq
Multiscale Ai,https://jobs.gem.com/multiscale-ai
Myprize,https://jobs.gem.com/myprize
Myriad Technology,https://jobs.gem.com/myriad-technology
Myrrsgroup,https://jobs.gem.com/myrrsgroup
Nabla Bio,https://jobs.gem.com/nabla-bio
Nacelle,https://jobs.gem.com/nacelle
Nativemsg,https://jobs.gem.com/nativemsg
Nclusion,https://jobs.gem.com/nclusion
Nerve,https://jobs.gem.com/nerve
Newcrew,https://jobs.gem.com/newcrew
Ngram,https://jobs.gem.com/ngram
Nimble,https://jobs.gem.com/nimble
Niva,https://jobs.gem.com/niva
Nominal,https://jobs.gem.com/nominal
Northone,https://jobs.gem.com/northone
Ntop,https://jobs.gem.com/ntop
Nue Ai,https://jobs.gem.com/nue-ai
Nutrislice,https://jobs.gem.com/nutrislice
Nuvo,https://jobs.gem.com/nuvo
Obin Ai,https://jobs.gem.com/obin-ai
Obsidian Systems,https://jobs.gem.com/obsidian-systems
Odo Do,https://jobs.gem.com/odo-do
Omegahhagency Com,https://jobs.gem.com/omegahhagency-com
Ondo Finance,https://jobs.gem.com/ondo-finance
Onesignal,https://jobs.gem.com/onesignal
Onesignal Ats,https://jobs.gem.com/onesignal-ats
Onezyme,https://jobs.gem.com/onezyme
Onfrontiers,https://jobs.gem.com/onfrontiers
Openphone,https://jobs.gem.com/openphone
Openreqstaffing,https://jobs.gem.com/openreqstaffing
Opine,https://jobs.gem.com/opine
Ora So,https://jobs.gem.com/ora-so
Overlay,https://jobs.gem.com/overlay
Overwatch,https://jobs.gem.com/overwatch
Paces,https://jobs.gem.com/paces
Pae,https://jobs.gem.com/pae
Pagebound,https://jobs.gem.com/pagebound
Pally,https://jobs.gem.com/pally
Paramark,https://jobs.gem.com/paramark
Partao,https://jobs.gem.com/partao
Partnerhq,https://jobs.gem.com/partnerhq
Patlytics,https://jobs.gem.com/patlytics
Pave,https://jobs.gem.com/pave
Perceptyx,https://jobs.gem.com/perceptyx
Photalabs Com,https://jobs.gem.com/photalabs-com
Photon,https://jobs.gem.com/photon
Pinnacleconnect Llc,https://jobs.gem.com/pinnacleconnect-llc
Piqenergy Com,https://jobs.gem.com/piqenergy-com
Planet Fans,https://jobs.gem.com/planet-fans
Planned,https://jobs.gem.com/planned
Plixai,https://jobs.gem.com/plixai
Pogo Recruiting,https://jobs.gem.com/pogo-recruiting
Polar,https://jobs.gem.com/polar
Polywork,https://jobs.gem.com/polywork
Pomerium,https://jobs.gem.com/pomerium
Portal Ai,https://jobs.gem.com/portal-ai
Poseidonaero,https://jobs.gem.com/poseidonaero
Prahsys Com,https://jobs.gem.com/prahsys-com
Praxisiq Ai,https://jobs.gem.com/praxisiq-ai
Precision Ai,https://jobs.gem.com/precision-ai
Prodia,https://jobs.gem.com/prodia
Productboard,https://jobs.gem.com/productboard
Productboard Ats,https://jobs.gem.com/productboard-ats
Prohost Ai,https://jobs.gem.com/prohost-ai
Project Method,https://jobs.gem.com/project-method
Promptql,https://jobs.gem.com/promptql
Propel,https://jobs.gem.com/propel
Prospermedical Com,https://jobs.gem.com/prospermedical-com
Protegeai,https://jobs.gem.com/protegeai
Questdb Com,https://jobs.gem.com/questdb-com
Quitwithjones,https://jobs.gem.com/quitwithjones
Quo,https://jobs.gem.com/quo
Rain Aero,https://jobs.gem.com/rain-aero
Raincode Bahrain W L L,https://jobs.gem.com/raincode-bahrain-w-l-l
Raylu Ai,https://jobs.gem.com/raylu-ai
Rctsglobal Com,https://jobs.gem.com/rctsglobal-com
Rditrials,https://jobs.gem.com/rditrials
Rebuild Work,https://jobs.gem.com/rebuild-work
Redcar,https://jobs.gem.com/redcar
Redenvelope Co,https://jobs.gem.com/redenvelope-co
Redo,https://jobs.gem.com/redo
Rektech,https://jobs.gem.com/rektech
Renew,https://jobs.gem.com/renew
Resprop,https://jobs.gem.com/resprop
Retool,https://jobs.gem.com/retool
Revolutionparts,https://jobs.gem.com/revolutionparts
Rex,https://jobs.gem.com/rex
Rf Renovo Management Company Llc,https://jobs.gem.com/rf-renovo-management-company-llc
Riley,https://jobs.gem.com/riley
Rinsed,https://jobs.gem.com/rinsed
Risely Ai,https://jobs.gem.com/risely-ai
Rivia,https://jobs.gem.com/rivia
Roadio Ai,https://jobs.gem.com/roadio-ai
Roamless,https://jobs.gem.com/roamless
Roe Ai,https://jobs.gem.com/roe-ai
Rossibuilders Com,https://jobs.gem.com/rossibuilders-com
Roundhouse Media,https://jobs.gem.com/roundhouse-media
Rove,https://jobs.gem.com/rove
Runsybil,https://jobs.gem.com/runsybil
Sadnaconsulting Com,https://jobs.gem.com/sadnaconsulting-com
Sailorhealth Com,https://jobs.gem.com/sailorhealth-com
Sales Marker,https://jobs.gem.com/sales-marker
Salesqueze Com,https://jobs.gem.com/salesqueze-com
Sandbar Inc,https://jobs.gem.com/sandbar-inc
Sandboxschonfeld Com,https://jobs.gem.com/sandboxschonfeld-com
Sauron Systems,https://jobs.gem.com/sauron-systems
Scope Labs,https://jobs.gem.com/scope-labs
Scowtt Com,https://jobs.gem.com/scowtt-com
Seated,https://jobs.gem.com/seated
Seed2Series Com,https://jobs.gem.com/seed2series-com
Seniorverse,https://jobs.gem.com/seniorverse
Sennder Gmbh,https://jobs.gem.com/sennder-gmbh
Senndertechnologies Gmbh,https://jobs.gem.com/senndertechnologies-gmbh
Sensorum Health,https://jobs.gem.com/sensorum-health
Serv Ai,https://jobs.gem.com/serv-ai
Seven Starling,https://jobs.gem.com/seven-starling
Shef Com,https://jobs.gem.com/shef-com
Shorebird Dev,https://jobs.gem.com/shorebird-dev
Showtime,https://jobs.gem.com/showtime
Signoz,https://jobs.gem.com/signoz
Silkline,https://jobs.gem.com/silkline
Skypilot Co,https://jobs.gem.com/skypilot-co
Slash,https://jobs.gem.com/slash
Sleep Center,https://jobs.gem.com/sleep-center
Smacktechnologies Com,https://jobs.gem.com/smacktechnologies-com
Snout,https://jobs.gem.com/snout
Softup Technologies,https://jobs.gem.com/softup-technologies
Sohar Health,https://jobs.gem.com/sohar-health
Soundhound,https://jobs.gem.com/soundhound
Spawn,https://jobs.gem.com/spawn
Spellbrush,https://jobs.gem.com/spellbrush
Sphere Semi,https://jobs.gem.com/sphere-semi
Ssg,https://jobs.gem.com/ssg
Stack Auth Com,https://jobs.gem.com/stack-auth-com
Startup People Solutions,https://jobs.gem.com/startup-people-solutions
Stealth Startup,https://jobs.gem.com/stealth-startup
Stockapp Com,https://jobs.gem.com/stockapp-com
Stryke,https://jobs.gem.com/stryke
Sunsethq Com,https://jobs.gem.com/sunsethq-com
Super Hi Fi,https://jobs.gem.com/super-hi-fi
Superblocks,https://jobs.gem.com/superblocks
Supersonik Ai,https://jobs.gem.com/supersonik-ai
Supio,https://jobs.gem.com/supio
Suppliercanada Com,https://jobs.gem.com/suppliercanada-com
Switchgrowth Com,https://jobs.gem.com/switchgrowth-com
Symbolica,https://jobs.gem.com/symbolica
Syndesus,https://jobs.gem.com/syndesus
System Two Security,https://jobs.gem.com/system-two-security
Taxgpt Inc ,https://jobs.gem.com/taxgpt-inc-
Taxo Ai,https://jobs.gem.com/taxo-ai
Tektome Com,https://jobs.gem.com/tektome-com
Telora,https://jobs.gem.com/telora
Tensorstax Com,https://jobs.gem.com/tensorstax-com
Tenx Recruiting,https://jobs.gem.com/tenx-recruiting
Terraai Earth,https://jobs.gem.com/terraai-earth
Test Board,https://jobs.gem.com/test-board
The Boring Company,https://jobs.gem.com/the-boring-company
The Brewer Garrett Company,https://jobs.gem.com/the-brewer-garrett-company
The Talent Project Com,https://jobs.gem.com/the-talent-project-com
Theburntapp Com,https://jobs.gem.com/theburntapp-com
Theinterface,https://jobs.gem.com/theinterface
Thejobbridge,https://jobs.gem.com/thejobbridge
Thelma,https://jobs.gem.com/thelma
Theluckyfoundation,https://jobs.gem.com/theluckyfoundation
Thenewclub Fyi,https://jobs.gem.com/thenewclub-fyi
Theseus Us,https://jobs.gem.com/theseus-us
Thinkific,https://jobs.gem.com/thinkific
Third Dimension,https://jobs.gem.com/third-dimension
Thrivory,https://jobs.gem.com/thrivory
Thunder,https://jobs.gem.com/thunder
Thunder Compute,https://jobs.gem.com/thunder-compute
Timetoperform,https://jobs.gem.com/timetoperform
Token Transit,https://jobs.gem.com/token-transit
Toolhouse Ai,https://jobs.gem.com/toolhouse-ai
Torchsystems Com,https://jobs.gem.com/torchsystems-com
Transluce,https://jobs.gem.com/transluce
Trashlab,https://jobs.gem.com/trashlab
Tricentis,https://jobs.gem.com/tricentis
Trilliumhiring Com,https://jobs.gem.com/trilliumhiring-com
Tripworks Com,https://jobs.gem.com/tripworks-com
Tristero,https://jobs.gem.com/tristero
Trojan Trading,https://jobs.gem.com/trojan-trading
Tropic,https://jobs.gem.com/tropic
Trybree Com,https://jobs.gem.com/trybree-com
Tryhelium Com,https://jobs.gem.com/tryhelium-com
Tungsten Dev,https://jobs.gem.com/tungsten-dev
Turbohome,https://jobs.gem.com/turbohome
Twentyfour7 Dev,https://jobs.gem.com/twentyfour7-dev
Unify Ai,https://jobs.gem.com/unify-ai
Untolabs Com,https://jobs.gem.com/untolabs-com
Up Labs,https://jobs.gem.com/up-labs
Useful,https://jobs.gem.com/useful
Usemalleable Com,https://jobs.gem.com/usemalleable-com
Vamo Xyz,https://jobs.gem.com/vamo-xyz
Vanguard Cleaning Systems,https://jobs.gem.com/vanguard-cleaning-systems
Vantaca,https://jobs.gem.com/vantaca
Vantager,https://jobs.gem.com/vantager
Vantara Ai,https://jobs.gem.com/vantara-ai
Vectorworks,https://jobs.gem.com/vectorworks
Vectrasim,https://jobs.gem.com/vectrasim
Veho Technologies,https://jobs.gem.com/veho-technologies
Ventionteams Com,https://jobs.gem.com/ventionteams-com
Venture Guides,https://jobs.gem.com/venture-guides
Vercel Ats Sandbox,https://jobs.gem.com/vercel-ats-sandbox
Vesseltalent Com,https://jobs.gem.com/vesseltalent-com
Voker Ai,https://jobs.gem.com/voker-ai
Voltai Com,https://jobs.gem.com/voltai-com
Wayback Labs,https://jobs.gem.com/wayback-labs
Webflow Ats Sandbox,https://jobs.gem.com/webflow-ats-sandbox
Western Governors University,https://jobs.gem.com/western-governors-university
Whatconverts,https://jobs.gem.com/whatconverts
Wiseroad Recruiting Inc,https://jobs.gem.com/wiseroad-recruiting-inc
Wizecamel,https://jobs.gem.com/wizecamel
Wolfjaw Careers,https://jobs.gem.com/wolfjaw-careers
Wonolo,https://jobs.gem.com/wonolo
Woodsideai,https://jobs.gem.com/woodsideai
Youtrip,https://jobs.gem.com/youtrip
Zefi Ai,https://jobs.gem.com/zefi-ai
Zep,https://jobs.gem.com/zep
Zorrorx,https://jobs.gem.com/zorrorx
6  greenhouse.csv  Normal file
@@ -0,0 +1,6 @@
url,timestamp
https://job-boards.eu.greenhouse.io/bcbgroup/jobs/4681083101?gh_src=cryptocurrencyjobs.co,2025-12-31T08:35:23.424931
https://job-boards.greenhouse.io/securitize/jobs/4074121009?gh_src=cryptocurrencyjobs.co,2025-12-31T09:19:17.349713
https://job-boards.eu.greenhouse.io/bcbgroup/jobs/4681102101?gh_src=cryptocurrencyjobs.co,2025-12-31T09:58:36.919216
https://job-boards.greenhouse.io/kiosk/jobs/4427184005?gh_src=cryptocurrencyjobs.co,2025-12-31T10:10:51.176114
https://job-boards.eu.greenhouse.io/bcbgroup/jobs/4681083101?gh_src=cryptocurrencyjobs.co,2025-12-31T11:02:31.869728
2544  greenhousecompanies.csv  Normal file
File diff suppressed because it is too large.
491  job_scraper.py
@@ -1,491 +0,0 @@
import asyncio
import random
import sqlite3
import os
from datetime import datetime
from typing import Optional, Dict
from playwright.async_api import async_playwright
from browserforge.injectors.playwright import AsyncNewContext


class LinkedInJobScraper:
    def __init__(
        self,
        engine,
        db_path: str = "linkedin_jobs.db",
        human_speed: float = 1.0
    ):
        self.engine = engine
        self.db_path = db_path
        self.human_speed = human_speed
        self._init_db()

    def _init_db(self):
        os.makedirs(os.path.dirname(self.db_path) if os.path.dirname(self.db_path) else ".", exist_ok=True)
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.cursor()
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS jobs (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    keyword TEXT,
                    title TEXT,
                    company TEXT,
                    location TEXT,
                    salary TEXT,
                    description TEXT,
                    url TEXT UNIQUE,
                    workplace_type TEXT,
                    scraped_at DATETIME DEFAULT CURRENT_TIMESTAMP
                )
            ''')
            conn.commit()

    async def _human_click(self, page, element, wait_after: bool = True):
        if not element:
            return False
        await element.scroll_into_view_if_needed()
        await asyncio.sleep(random.uniform(0.3, 0.8) * self.human_speed)
        try:
            await element.click()
            if wait_after:
                await asyncio.sleep(random.uniform(2, 4) * self.human_speed)
            return True
        except:
            return False

    async def _login(self, page, credentials: Dict) -> bool:
        """Human-realistic LinkedIn login"""
        print("🔐 Navigating to LinkedIn login page...")
        await page.goto("https://www.linkedin.com/login", timeout=60000)
        await asyncio.sleep(random.uniform(2.0, 3.5) * self.human_speed)

        email_field = await page.query_selector('input[name="session_key"]')
        if not email_field:
            print("❌ Email field not found.")
            return False

        print("✍️ Typing username...")
        await email_field.click()
        await asyncio.sleep(random.uniform(0.4, 0.9) * self.human_speed)
        for char in credentials["email"]:
            await page.keyboard.type(char)
            await asyncio.sleep(random.uniform(0.06, 0.14) * self.human_speed)
        await asyncio.sleep(random.uniform(1.0, 1.8) * self.human_speed)

        password_field = await page.query_selector('input[name="session_password"]')
        if not password_field:
            print("❌ Password field not found.")
            return False

        print("🔒 Typing password...")
        await password_field.click()
        await asyncio.sleep(random.uniform(0.3, 0.7) * self.human_speed)
        for char in credentials["password"]:
            await page.keyboard.type(char)
            await asyncio.sleep(random.uniform(0.08, 0.16) * self.human_speed)
        await asyncio.sleep(random.uniform(0.8, 1.5) * self.human_speed)

        print("✅ Submitting login form...")
        await page.keyboard.press("Enter")

        for _ in range(15):
            current_url = page.url
            if "/feed" in current_url or "/jobs" in current_url:
                if "login" not in current_url:
                    print("✅ Login successful!")
                    await asyncio.sleep(random.uniform(2.0, 3.0) * self.human_speed)
                    return True
            await asyncio.sleep(1)
        print("❌ Login may have failed.")
        return False

    async def _extract_job_details(self, page) -> Dict:
        """Extract from ANY job page: LinkedIn Easy Apply OR external site"""
        await asyncio.sleep(2 * self.human_speed)

        async def get_text(selector: str) -> str:
            try:
                el = await page.query_selector(selector)
                if el:
                    text = await el.inner_text()
                    return text.strip() if text else "N/A"
            except:
                pass
            return "N/A"

        title = await get_text("h1.t-24")
        if title == "N/A":
            title = await get_text("h1, h2")

        company = await get_text("a.app-aware-link[href*='/company/']")
        if company == "N/A":
            company = await get_text("div.org, .company, [class*='company']")

        location = await get_text("span[class*='location']")
        if location == "N/A":
            location = await get_text(".location, [class*='location']")

        description = await get_text("div[class*='description__text']")
        if description == "N/A":
            description = await get_text(".job-desc, .description, main, body")

        workplace = await get_text("span.job-workplace-type") or "N/A"
        salary = await get_text("span.salary") or "N/A"

        return {
            "title": title,
            "company": company,
            "location": location,
            "workplace_type": workplace,
            "salary": salary,
            "description": description,
            "url": page.url
        }

    async def _save_to_markdown(self, job_data: Dict, keyword: str, verified: bool=True):
        """Save to appropriate folder using job ID to avoid duplication"""
        folder = "linkedin_jobs" if verified else "linkedin_jobs_unverified"
        os.makedirs(folder, exist_ok=True)

        # Extract job ID from URL for LinkedIn jobs
        url = job_data.get("url", "")
        if "/jobs/view/" in url:
            try:
                job_id = url.split("/view/")[1].split("/")[0]
            except:
                job_id = "unknown"
        else:
            # For external jobs, use a hash of the URL (first 12 chars)
            import hashlib
            job_id = hashlib.md5(url.encode()).hexdigest()[:12]

        clean_keyword = keyword.replace(" ", "_")
        filename = f"linkedin_{clean_keyword}_job_{job_id}.md"
        filepath = os.path.join(folder, filename)

        # Only save if file doesn't already exist (idempotent)
        if os.path.exists(filepath):
            print(f" 📝 Skipping duplicate Markdown file: {filename}")
            return

        with open(filepath, "w", encoding="utf-8") as f:
            f.write(f"# {job_data['title']}\n\n")
            f.write(f"- **Company**: {job_data['company']}\n")
            f.write(f"- **Location**: {job_data['location']}\n")
            f.write(f"- **Workplace**: {job_data['workplace_type']}\n")
            f.write(f"- **Salary**: {job_data['salary']}\n")
            f.write(f"- **URL**: <{url}>\n\n")
            f.write(f"## Description\n\n{job_data['description']}\n")

    async def _save_to_db(self, job_data: Dict, keyword: str):
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.cursor()
            cursor.execute('''
                INSERT OR IGNORE INTO jobs
                (keyword, title, company, location, salary, description, url, workplace_type)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            ''', (
                keyword,
                job_data["title"],
                job_data["company"],
                job_data["location"],
                job_data["salary"],
                job_data["description"],
                job_data["url"],
                job_data["workplace_type"]
            ))
            conn.commit()

    async def scrape_jobs(
        self,
        search_keywords: str,
        max_pages: int = 1,
        credentials: Optional[Dict] = None
    ):
        encoded_keywords = search_keywords.replace(" ", "%20")
        search_url = f"https://www.linkedin.com/jobs/search/?keywords={encoded_keywords}"

        profile = self.engine._select_profile()
        renderer = random.choice(self.engine.common_renderers[self.engine.os])
        vendor = random.choice(self.engine.common_vendors)
        spoof_script = self.engine._get_spoof_script(renderer, vendor)

        async with async_playwright() as pw:
            browser = await pw.chromium.launch(
                headless=False,
                args=['--disable-blink-features=AutomationControlled']
            )
            context = await AsyncNewContext(browser, fingerprint=profile)

            await context.add_init_script(f"""
                Object.defineProperty(navigator, 'hardwareConcurrency', {{ get: () => {profile.navigator.hardwareConcurrency} }});
                Object.defineProperty(navigator, 'deviceMemory', {{ get: () => {profile.navigator.deviceMemory} }});
                Object.defineProperty(navigator, 'platform', {{ get: () => '{profile.navigator.platform}' }});
            """)
            await context.add_init_script(spoof_script)

            page = await context.new_page()

            session_loaded = await self.engine.load_session(context)
            login_successful = False

            if session_loaded:
                print("🔁 Using saved session — verifying login...")
                await page.goto("https://www.linkedin.com/feed/", timeout=60000)
                if "feed" in page.url and "login" not in page.url:
                    print("✅ Session still valid.")
                    login_successful = True
                else:
                    print("⚠️ Saved session expired — re-authenticating.")
                    session_loaded = False

            if not session_loaded and credentials:
                print("🔐 Performing fresh login...")
                login_successful = await self._login(page, credentials)
                if login_successful:
                    await self.engine.save_session(context)
                else:
                    print("❌ Login failed. Exiting.")
                    await browser.close()
                    self.engine.report_outcome("block")
                    return
            elif not credentials:
                print("ℹ️ No credentials — proceeding as guest.")
                login_successful = True
            else:
                pass

            await page.wait_for_load_state("load", timeout=60000)
            print("✅ Post-login page fully loaded. Starting search...")

            if await self.engine._detect_cloudflare(page):
                print("☁️ Cloudflare detected on initial load.")
                if not await self.engine._handle_cloudflare(page):
                    print("❌ Cloudflare could not be resolved.")
                    await browser.close()
                    self.engine.report_outcome("cloudflare")
                    return

            print(f"🔍 Searching for: {search_keywords}")
            await page.goto(search_url, wait_until='load', timeout=60000)
            await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed)

            if await self.engine._detect_cloudflare(page):
                print("☁️ Cloudflare detected on search page.")
                if not await self.engine._handle_cloudflare(page):
                    await browser.close()
                    self.engine.report_outcome("cloudflare")
                    return

            scraped_count = 0
            all_job_links = []
            seen_job_ids = set()

            # ← NEW: Scroll once to reveal pagination (if any)
            print("🔄 Scrolling to bottom to reveal pagination controls...")
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)

            # Check if pagination exists
            pagination_exists = await page.query_selector("button[aria-label='Next']")
            if pagination_exists:
                print("⏭️ Pagination detected. Using page navigation.")
                current_page = 1
                while current_page <= max_pages:
                    print(f"📄 Processing page {current_page}/{max_pages}")

                    # Collect job links on current page
                    current_links = await page.query_selector_all("a[href*='/jobs/view/']")
                    new_jobs = 0
                    for link in current_links:
                        href = await link.get_attribute("href")
                        if href:
                            job_id = href.split("/view/")[-1].split("/")[0] if "/view/" in href else href
                            if job_id and job_id not in seen_job_ids:
                                seen_job_ids.add(job_id)
                                all_job_links.append(href)
                                new_jobs += 1

                    print(f" ➕ Found {new_jobs} new job(s) on page {current_page} (total: {len(all_job_links)})")

                    # Try to go to next page
                    if current_page < max_pages:
                        next_btn = await page.query_selector("button[aria-label='Next']")
                        if next_btn and await next_btn.is_enabled():
                            await self._human_click(page, next_btn)
                            await asyncio.sleep(random.uniform(4.0, 6.0) * self.human_speed)
                            # Wait for URL to change or new content
                            try:
                                await page.wait_for_function("() => window.location.href.includes('start=')", timeout=30000)
                            except:
                                pass
                        else:
                            print("🔚 'Next' button not available — stopping pagination.")
                            break
                    current_page += 1

            else:
                print("🔄 No pagination found. Falling back to infinite scroll...")
                last_height = await page.evaluate("document.body.scrollHeight")
                no_new_jobs_count = 0
                max_no_new = 3

                while no_new_jobs_count < max_no_new:
                    await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                    await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)

                    current_links = await page.query_selector_all("a[href*='/jobs/view/']")
                    new_jobs_found = 0

                    for link in current_links:
                        href = await link.get_attribute("href")
                        if href:
                            job_id = href.split("/view/")[-1].split("/")[0] if "/view/" in href else href
                            if job_id and job_id not in seen_job_ids:
                                seen_job_ids.add(job_id)
                                all_job_links.append(href)
                                new_jobs_found += 1

                    print(f" ➕ Found {new_jobs_found} new job(s) (total: {len(all_job_links)})")

                    new_height = await page.evaluate("document.body.scrollHeight")
                    if new_height == last_height:
                        no_new_jobs_count += 1
                    else:
                        no_new_jobs_count = 0
                        last_height = new_height

                    if new_jobs_found == 0 and no_new_jobs_count >= 1:
                        print("🔚 No new jobs loaded. Stopping scroll.")
                        break

            print(f"✅ Collected {len(all_job_links)} unique job links.")

            # ← Rest of job processing loop unchanged
            scraped_count = 0
            for idx, href in enumerate(all_job_links):
                try:
                    full_url = href if href.startswith("http") else f"https://www.linkedin.com{href}"
                    print(f" → Opening job {idx+1}/{len(all_job_links)}: {full_url}")
|
||||||
await page.goto(full_url, wait_until='load', timeout=60000)
|
|
||||||
await asyncio.sleep(3 * self.human_speed)
|
|
||||||
|
|
||||||
is_cloudflare = await self.engine._detect_cloudflare(page)
|
|
||||||
page_content = await page.content()
|
|
||||||
has_captcha_text = "captcha" in page_content.lower()
|
|
||||||
captcha_present = is_cloudflare or has_captcha_text
|
|
||||||
|
|
||||||
title_element = await page.query_selector("h1.t-24")
|
|
||||||
job_data_accessible = title_element is not None
|
|
||||||
|
|
||||||
if captcha_present:
|
|
||||||
if job_data_accessible:
|
|
||||||
print(" ⚠️ CAPTCHA detected, but job data is accessible. Proceeding in stealth mode...")
|
|
||||||
await self.engine._avoid_captcha(page)
|
|
||||||
else:
|
|
||||||
print(" ⚠️ CAPTCHA detected and job data blocked. Attempting recovery...")
|
|
||||||
if not await self.engine._solve_captcha_fallback(page):
|
|
||||||
print(" ❌ CAPTCHA recovery failed. Skipping job.")
|
|
||||||
continue
|
|
||||||
title_element = await page.query_selector("h1.t-24")
|
|
||||||
if not title_element:
|
|
||||||
print(" ❌ Job data still unavailable after CAPTCHA handling. Skipping.")
|
|
||||||
continue
|
|
||||||
|
|
||||||
if not captcha_present:
|
|
||||||
await self.engine._avoid_captcha(page)
|
|
||||||
|
|
||||||
apply_btn = None
|
|
||||||
apply_selectors = [
|
|
||||||
"button[aria-label*='Apply']",
|
|
||||||
"button:has-text('Apply')",
|
|
||||||
"a:has-text('Apply')",
|
|
||||||
"button:has-text('Easy Apply')"
|
|
||||||
]
|
|
||||||
for selector in apply_selectors:
|
|
||||||
apply_btn = await page.query_selector(selector)
|
|
||||||
if apply_btn:
|
|
||||||
break
|
|
||||||
|
|
||||||
job_data = None
|
|
||||||
final_url = full_url
|
|
||||||
|
|
||||||
if apply_btn:
|
|
||||||
print(" → Clicking 'Apply' / 'Easy Apply' button...")
|
|
||||||
|
|
||||||
page_waiter = asyncio.create_task(context.wait_for_event("page"))
|
|
||||||
await self._human_click(page, apply_btn, wait_after=False)
|
|
||||||
|
|
||||||
external_page = None
|
|
||||||
try:
|
|
||||||
external_page = await asyncio.wait_for(page_waiter, timeout=5.0)
|
|
||||||
print(" 🌐 External job site opened in new tab.")
|
|
||||||
await external_page.wait_for_load_state("load", timeout=30000)
|
|
||||||
await asyncio.sleep(2 * self.human_speed)
|
|
||||||
await self.engine._human_like_scroll(external_page)
|
|
||||||
await asyncio.sleep(2 * self.human_speed)
|
|
||||||
|
|
||||||
job_data = await self._extract_job_details(external_page)
|
|
||||||
final_url = external_page.url
|
|
||||||
|
|
||||||
if not external_page.is_closed():
|
|
||||||
await external_page.close()
|
|
||||||
|
|
||||||
except asyncio.TimeoutError:
|
|
||||||
print(" 🖥️ No external tab — scraping LinkedIn job page.")
|
|
||||||
await page.wait_for_timeout(2000)
|
|
||||||
try:
|
|
||||||
await page.wait_for_selector("div.jobs-apply-button--fixed, div.jobs-easy-apply-modal", timeout=8000)
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
await self.engine._human_like_scroll(page)
|
|
||||||
await asyncio.sleep(2 * self.human_speed)
|
|
||||||
job_data = await self._extract_job_details(page)
|
|
||||||
final_url = page.url
|
|
||||||
else:
|
|
||||||
print(" ⚠️ No 'Apply' button found — scraping job details directly.")
|
|
||||||
await self.engine._human_like_scroll(page)
|
|
||||||
await asyncio.sleep(2 * self.human_speed)
|
|
||||||
job_data = await self._extract_job_details(page)
|
|
||||||
final_url = page.url
|
|
||||||
|
|
||||||
job_data["url"] = final_url
|
|
||||||
|
|
||||||
if job_data["title"] == "N/A" and "linkedin.com" in final_url:
|
|
||||||
job_id = final_url.split("/")[-2] if "/jobs/view/" in final_url else "unknown"
|
|
||||||
job_data["title"] = f"Easy Apply Job - ID {job_id}"
|
|
||||||
|
|
||||||
is_meaningful = (
|
|
||||||
job_data["title"] != "N/A" or
|
|
||||||
job_data["company"] != "N/A" or
|
|
||||||
(job_data["description"] != "N/A" and len(job_data["description"]) > 20)
|
|
||||||
)
|
|
||||||
|
|
||||||
if is_meaningful:
|
|
||||||
await self._save_to_db(job_data, search_keywords)
|
|
||||||
await self._save_to_markdown(job_data, search_keywords, verified=True)
|
|
||||||
scraped_count += 1
|
|
||||||
print(f" ✅ Scraped (verified): {job_data['title'][:50]}...")
|
|
||||||
else:
|
|
||||||
await self._save_to_markdown(job_data, search_keywords, verified=False)
|
|
||||||
print(f" 🟡 Scraped (unverified): {final_url} — low-quality data")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(f" ⚠️ Failed on job {idx+1}: {str(e)[:100]}")
|
|
||||||
continue
|
|
||||||
|
|
||||||
finally:
|
|
||||||
print(" ↩️ Returning to LinkedIn search results...")
|
|
||||||
await page.goto(search_url, timeout=60000)
|
|
||||||
await asyncio.sleep(4 * self.human_speed)
|
|
||||||
|
|
||||||
await browser.close()
|
|
||||||
|
|
||||||
if scraped_count > 0:
|
|
||||||
self.engine.report_outcome("success")
|
|
||||||
print(f"✅ Completed! Saved {scraped_count} verified + additional unverified jobs for '{search_keywords}'.")
|
|
||||||
else:
|
|
||||||
self.engine.report_outcome("captcha")
|
|
||||||
print("⚠️ No verified jobs scraped — check 'linkedin_jobs_unverified' for raw outputs.")
|
|
||||||
7
lever.csv
Normal file
@@ -0,0 +1,7 @@
url,timestamp
https://jobs.eu.lever.co/kaiko/3f7f3db9-4a6a-4047-8760-bc52c3d03e05?lever-origin=applied&lever-source%5B%5D=cryptocurrencyjobs.co,2025-12-31T09:20:28.542417
https://jobs.lever.co/waterfall/880fb1b4-2515-4534-9970-53c497c82f12?lever-origin=applied&lever-source%5B%5D=cryptocurrencyjobs.co,2025-12-31T10:08:17.316072
https://jobs.lever.co/obol-tech/fcccd493-54e4-425a-b9bd-82fa6f7e6aff?lever-origin=applied&lever-source%5B%5D=cryptocurrencyjobs.co,2025-12-31T10:35:29.164452
https://jobs.eu.lever.co/coinspaid/7605e154-4b1d-45ee-b1d4-35edea13d80b?lever-origin=applied&lever-source%5B%5D=cryptocurrencyjobs.co,2025-12-31T10:51:38.852693
https://jobs.lever.co/vedatechlabs/9c59c96c-2bb0-47b0-88fe-5d5a9fd85997?lever-origin=applied&lever-source%5B%5D=cryptocurrencyjobs.co,2025-12-31T11:02:16.120852
https://jobs.eu.lever.co/kaiko/3f7f3db9-4a6a-4047-8760-bc52c3d03e05?lever-origin=applied&lever-source%5B%5D=cryptocurrencyjobs.co,2025-12-31T11:16:43.218273
1792
levercompanies.csv
Normal file
File diff suppressed because it is too large
8
linkedin.csv
Normal file
@@ -0,0 +1,8 @@
url,timestamp
https://www.linkedin.com/jobs/view/operations-analyst-at-amber-group-4325538653/?ref=cryptocurrencyjobs.co,2025-12-31T09:20:11.544002
https://www.linkedin.com/jobs/view/hr-operations-intern-sg-at-matrixport-official-4338171692/?ref=cryptocurrencyjobs.co,2025-12-31T09:25:10.499933
https://www.linkedin.com/jobs/view/operations-analyst-at-matrixport-official-4235087267/?ref=cryptocurrencyjobs.co,2025-12-31T09:33:53.104120
https://www.linkedin.com/jobs/view/business-operations-analyst-at-matrixport-official-4215538150/?ref=cryptocurrencyjobs.co,2025-12-31T09:34:24.186519
https://www.linkedin.com/jobs/view/graduate-hiring-business-operations-analyst-wealth-management-at-matrixport-official-4131687672/?ref=cryptocurrencyjobs.co,2025-12-31T09:36:47.038648
https://www.linkedin.com/jobs/view/customer-support-specialist-at-matrixport-official-4323103235/?ref=cryptocurrencyjobs.co,2025-12-31T10:39:57.272414
https://www.linkedin.com/jobs/view/finance-intern-at-amber-group-4248725225/?ref=cryptocurrencyjobs.co,2025-12-31T11:31:03.349275
@@ -1,28 +0,0 @@
from scraping_engine import FingerprintScrapingEngine
from job_scraper import LinkedInJobScraper
import os
import asyncio


async def main():
    engine = FingerprintScrapingEngine(
        seed="job_scraping_engine",
        target_os="windows",
        db_path="job_listings.db",
        markdown_path="job_listings.md",
        search_keywords="Data Anaylst"
    )

    scraper = LinkedInJobScraper(engine, human_speed=1.6)

    await scraper.scrape_jobs(
        search_keywords="Data Anaylst",  # ← Your search terms
        max_pages=3,
        credentials={
            "email": os.getenv("SCRAPING_USERNAME"),
            "password": os.getenv("SCRAPING_PASSWORD")
        }
    )

if __name__ == "__main__":
    asyncio.run(main())
305
llm_agent.py
Normal file
@@ -0,0 +1,305 @@
from openai import OpenAI
from typing import Dict, Any
import asyncio
import psycopg2
import os
from datetime import datetime
import json
import re
from bs4 import BeautifulSoup
from dotenv import load_dotenv

# Load environment variables from .env
load_dotenv()


class LLMJobRefiner:
    def __init__(self):
        deepseek_api_key = os.getenv("DEEPSEEK_API_KEY")
        if not deepseek_api_key:
            raise ValueError("DEEPSEEK_API_KEY not found in .env file.")

        # Database credentials from .env
        self.db_username = os.getenv("DB_USERNAME")
        self.db_password = os.getenv("DB_PASSWORD")
        self.db_host = os.getenv("DB_HOST")
        self.db_port = os.getenv("DB_PORT")

        if not self.db_username or not self.db_password:
            raise ValueError("Database credentials not found in .env file.")

        # DeepSeek uses OpenAI-compatible API
        self.client = OpenAI(
            api_key=deepseek_api_key,
            base_url="https://api.deepseek.com/v1"
        )
        self.model = "deepseek-chat"
        self._init_db()

    def _init_db(self):
        """Initialize PostgreSQL database connection and create table"""
        try:
            conn = psycopg2.connect(
                host=self.db_host,
                port=self.db_port,
                database="postgres",
                user=self.db_username,
                password=self.db_password
            )
            cursor = conn.cursor()

            cursor.execute('''
                CREATE TABLE IF NOT EXISTS crypto_jobs (
                    id SERIAL PRIMARY KEY,
                    title TEXT,
                    company_name TEXT,
                    location TEXT,
                    description TEXT,
                    requirements TEXT,
                    qualifications TEXT,
                    salary_range TEXT,
                    nature_of_work TEXT,
                    job_id TEXT UNIQUE,
                    url TEXT,
                    category TEXT,
                    scraped_at TIMESTAMP,
                    posted_date TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            ''')

            # Ensure the uniqueness constraint exists
            cursor.execute('''
                ALTER TABLE crypto_jobs DROP CONSTRAINT IF EXISTS crypto_jobs_job_id_key;
                ALTER TABLE crypto_jobs ADD CONSTRAINT crypto_jobs_job_id_key UNIQUE (job_id);
            ''')
            cursor.execute('CREATE INDEX IF NOT EXISTS idx_job_id ON crypto_jobs(job_id)')
            cursor.execute('CREATE INDEX IF NOT EXISTS idx_category ON crypto_jobs(category)')
            cursor.execute('CREATE INDEX IF NOT EXISTS idx_posted_date ON crypto_jobs(posted_date)')

            conn.commit()
            cursor.close()
            conn.close()
            print("✅ PostgreSQL database initialized successfully")
        except Exception as e:
            print(f"❌ Database initialization error: {e}")
            raise

    def _clean_html_for_llm(self, html_content: str) -> str:
        """Clean HTML to make it more readable for LLM while preserving structure"""
        try:
            soup = BeautifulSoup(html_content, 'html.parser')

            # Remove script and style elements
            for script in soup(["script", "style", "nav", "footer", "header"]):
                script.decompose()

            # Extract text but keep some structure
            text = soup.get_text(separator=' ', strip=True)

            # Clean up whitespace
            text = re.sub(r'\s+', ' ', text)

            # Limit length for LLM context
            if len(text) > 100000:
                text = text[:100000] + "..."

            return text
        except Exception as e:
            print(f"HTML cleaning error: {e}")
            # Fallback to raw content if cleaning fails
            return html_content[:100000] if len(html_content) > 100000 else html_content

    def _generate_content_sync(self, prompt: str) -> str:
        """Synchronous call to DeepSeek API"""
        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.1,
                max_tokens=2048,
                stream=False
            )
            return response.choices[0].message.content or ""
        except Exception as e:
            print(f"DeepSeek API error: {e}")
            return ""

    async def refine_job_data(self, raw_data: Dict[str, Any], target_field: str) -> Dict[str, Any]:
        page_content = raw_data.get('page_content', '')
        cleaned_content = self._clean_html_for_llm(page_content)
        job_id = raw_data.get('job_id', 'unknown')
        url = raw_data.get('url', 'N/A')
        posted_date = raw_data.get('posted_date', datetime.now().strftime("%m/%d/%y"))

        prompt = f"""
You are an expert job posting data extractor. Your task is to extract AND infer fields from the provided job posting.

### CORE RULES:
1. **NEVER invent, summarize, or paraphrase** — extract **exact wording** when available.
2. **For critical fields (title, company_name, job_id, url, description):**
   - These MUST be present and meaningful.
   - If not explicitly stated, **infer from context** (e.g., page title, headings, "About Us", etc.).
   - **NEVER return "Not provided" or "N/A" for these fields.**
3. **For optional fields (location, salary_range, etc.):**
   - Extract exact text if present.
   - If absent but inferable (e.g., "Remote (US)", "Full-time"), **infer it**.
   - Only return "Not provided" if truly absent and non-inferable.

### FIELD DEFINITIONS:
- **title**: The job title. Look in <h1>, page title, or bold headings.
- **company_name**: Company name. Look in logo alt text, footer, "About [X]", or page title.
- **description**: Main job overview, responsibilities, or duties. Combine relevant paragraphs if needed. **Must not be empty.**
- **requirements**: Required skills, experience, or qualifications.
- **qualifications**: Educational or certification requirements.
- **location**: Office location or remote policy.
- **salary_range**: Exact compensation info.
- **nature_of_work**: Employment type (Full-time, Contract, Internship, etc.).

### OUTPUT FORMAT:
Return ONLY a valid JSON object with these keys:
{{
    "title": "...",
    "company_name": "...",
    "location": "...",
    "description": "...",
    "requirements": "...",
    "qualifications": "...",
    "salary_range": "...",
    "nature_of_work": "...",
    "job_id": "{job_id}",
    "url": "{url}"
}}

- **Critical fields must NEVER be "Not provided", "N/A", empty, or generic** (e.g., "Company", "Job Title").
- **Optional fields may be "Not provided" ONLY if truly absent.**
- **Do not include markdown, explanations, or extra text.**
- **Use double quotes for JSON.**

Page Content:
{cleaned_content}
"""

        try:
            response_text = await asyncio.get_event_loop().run_in_executor(
                None,
                lambda: self._generate_content_sync(prompt)
            )
            refined_data = self._parse_llm_response(response_text)

            if not refined_data:
                return None

            # Validate critical fields — reject if missing or placeholder
            critical_fields = ['title', 'company_name', 'job_id', 'url', 'description']
            for field in critical_fields:
                value = refined_data.get(field, "").strip()
                if not value or value.lower() in ["n/a", "not provided", "unknown", "company", "job", "title", ""]:
                    print(f" ❌ Critical field '{field}' is invalid: '{value}'")
                    return None  # This job will NOT be saved — as per requirement

            # Optional fields: allow "Not provided", but ensure they're strings
            optional_fields = ['location', 'requirements', 'qualifications', 'salary_range', 'nature_of_work']
            for field in optional_fields:
                if field not in refined_data:
                    refined_data[field] = "Not provided"
                elif not isinstance(refined_data[field], str):
                    refined_data[field] = str(refined_data[field])

            refined_data['posted_date'] = posted_date
            return refined_data

        except Exception as e:
            print(f"LLM refinement failed: {str(e)}")
            return None

    def _parse_llm_response(self, response_text: str) -> Dict[str, Any]:
        # Try to extract JSON from markdown code block
        json_match = re.search(r'```(?:json)?\s*({.*?})\s*```', response_text, re.DOTALL)
        if not json_match:
            # Try to find raw JSON object
            json_match = re.search(r'\{[^{}]*\{[^{}]*\}[^{}]*\}|\{.*\}', response_text, re.DOTALL)
            if not json_match:
                return None

        try:
            json_str = json_match.group(1) if '```' in response_text else json_match.group(0)
            # Clean common issues
            json_str = re.sub(r'\s+', ' ', json_str)
            json_str = re.sub(r',\s*([\]}\)])', r'\1', json_str)  # Remove trailing commas
            return json.loads(json_str)
        except json.JSONDecodeError as e:
            print(f"JSON parsing error: {e}")
            return None

    async def save_job_data(self, job_data: Dict[str, Any], keyword: str):
        await self._save_to_db(job_data)
        await self._save_to_markdown(job_data, keyword)

    async def _save_to_db(self, job_data: Dict[str, Any]):
        """Save job data to PostgreSQL database with job_id uniqueness"""
        try:
            conn = psycopg2.connect(
                host=self.db_host,
                port=self.db_port,
                database="postgres",
                user=self.db_username,
                password=self.db_password
            )
            cursor = conn.cursor()

            cursor.execute('''
                INSERT INTO crypto_jobs
                (title, company_name, location, description, requirements,
                 qualifications, salary_range, nature_of_work, job_id, url, category, scraped_at, posted_date)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                ON CONFLICT (job_id) DO NOTHING
            ''', (
                job_data.get("title", "Not provided"),
                job_data.get("company_name", "Not provided"),
                job_data.get("location", "Not provided"),
                job_data.get("description", "Not provided"),
                job_data.get("requirements", "Not provided"),
                job_data.get("qualifications", "Not provided"),
                job_data.get("salary_range", "Not provided"),
                job_data.get("nature_of_work", "Not provided"),
                job_data.get("job_id", "unknown"),
                job_data.get("url", "N/A"),
                job_data.get("category", "all"),
                job_data.get("scraped_at"),
                job_data.get("posted_date", datetime.now().strftime("%m/%d/%y"))
            ))

            conn.commit()
            cursor.close()
            conn.close()

            print(f" 💾 Saved job to category '{job_data.get('category', 'all')}' with job_id: {job_data.get('job_id', 'unknown')}")

        except Exception as e:
            print(f"❌ Database save error: {e}")

    async def _save_to_markdown(self, job_data: Dict[str, Any], keyword: str):
        os.makedirs("crypto_jobs", exist_ok=True)
        filepath = os.path.join("crypto_jobs", "crypto_jobs_scraped.md")
        write_header = not os.path.exists(filepath) or os.path.getsize(filepath) == 0

        with open(filepath, "a", encoding="utf-8") as f:
            if write_header:
                f.write(f"# Crypto Jobs - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
            f.write(f"## Job: {job_data.get('title', 'Not provided')}\n\n")
            f.write(f"- **Keyword**: {keyword}\n")
            f.write(f"- **Company**: {job_data.get('company_name', 'Not provided')}\n")
            f.write(f"- **Location**: {job_data.get('location', 'Not provided')}\n")
            f.write(f"- **Nature of Work**: {job_data.get('nature_of_work', 'Not provided')}\n")
            f.write(f"- **Salary Range**: {job_data.get('salary_range', 'Not provided')}\n")
            f.write(f"- **Job ID**: {job_data.get('job_id', 'unknown')}\n")
            f.write(f"- **Posted Date**: {job_data.get('posted_date', 'N/A')}\n")
            f.write(f"- **Category**: {job_data.get('category', 'all')}\n")
            f.write(f"- **Scraped At**: {job_data.get('scraped_at', 'N/A')}\n")
            f.write(f"- **URL**: <{job_data.get('url', 'N/A')}>\n\n")
            f.write(f"### Description\n\n{job_data.get('description', 'Not provided')}\n\n")
            f.write(f"### Requirements\n\n{job_data.get('requirements', 'Not provided')}\n\n")
            f.write(f"### Qualifications\n\n{job_data.get('qualifications', 'Not provided')}\n\n")
            f.write("---\n\n")
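For context, a minimal, hypothetical usage sketch of LLMJobRefiner outside the scraper loop; the raw_data keys mirror what scraper.py passes in, while the example HTML, URL, and keyword are invented for illustration and are not part of this change set:

# Hypothetical standalone driver for LLMJobRefiner (assumes a populated .env, a reachable PostgreSQL, and DeepSeek access).
import asyncio
from llm_agent import LLMJobRefiner

async def demo():
    refiner = LLMJobRefiner()
    raw_data = {
        "page_content": "<html><h1>Data Analyst</h1><p>Example Corp is hiring a remote analyst...</p></html>",  # invented HTML
        "url": "https://example.com/jobs/data-analyst-123",  # invented URL
        "job_id": "data-analyst-123",
        "posted_date": "12/31/25",
    }
    refined = await refiner.refine_job_data(raw_data, "Extract all standard job details")
    if refined:
        refined["category"] = "Engineering"
        await refiner.save_job_data(refined, "Engineering")

asyncio.run(demo())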
54
main.py
Normal file
@@ -0,0 +1,54 @@
from scraping_engine import FingerprintScrapingEngine
from scraper import CryptoJobScraper  # Updated class name
import os
from dotenv import load_dotenv
import asyncio
import random
import time

load_dotenv()


async def main():
    engine = FingerprintScrapingEngine(
        seed="crypto_scraping_12",
        target_os="windows",
        db_path="crypto_jobs.db",
        markdown_path="crypto_jobs.md"
    )

    scraper = CryptoJobScraper(engine, human_speed=1.3, user_request="Extract title, company, location, description, requirements, qualifications, nature of work, and salary")

    job_titles = [
        "Customer Support",
        "Design",
        "Engineering",
        "Finance",
        "Marketing",
        "Operations",
        "Product",
        "Sales"
    ]

    while True:
        random.shuffle(job_titles)
        for job_title in job_titles:
            search_keywords = job_title  # No location param needed

            print(f"\n{'='*60}")
            print(f"Starting scrape for: {search_keywords}")
            print(f"{'='*60}")

            await scraper.scrape_jobs(search_keywords=search_keywords)

            print(f"\n✅ Completed scraping for: {job_title}")
            print(f"⏳ Waiting 90 seconds before next job title...")
            time.sleep(90)

        print(f"\n✅ Completed full cycle")
        print(f"🔄 Starting new cycle...")

if __name__ == "__main__":
    asyncio.run(main())
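main.py relies on the same .env that llm_agent.py reads; a quick, hypothetical pre-flight check (not part of the repo) could verify those keys before the loop starts:

# Hypothetical pre-flight check for the .env keys used by llm_agent.py (variable names taken from that file).
import os
from dotenv import load_dotenv

load_dotenv()
required = ["DEEPSEEK_API_KEY", "DB_USERNAME", "DB_PASSWORD", "DB_HOST", "DB_PORT"]
missing = [name for name in required if not os.getenv(name)]
if missing:
    raise SystemExit(f"Missing .env keys: {', '.join(missing)}")
print("Environment looks complete; main.py can start.")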
8
requirements.txt
Normal file
@@ -0,0 +1,8 @@
playwright==1.48.0
browserforge==0.1.8
openai==1.40.0
psycopg2-binary==2.9.9
beautifulsoup4==4.12.3
python-dotenv==1.0.1
redis==5.0.8
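A hedged smoke test of the pinned dependencies; the import names below are the conventional ones for these packages, and the script is illustrative rather than part of the repo:

# Hypothetical smoke test: confirm each pinned dependency is importable after `pip install -r requirements.txt`.
import importlib

for module in ["playwright", "browserforge", "openai", "psycopg2", "bs4", "dotenv", "redis"]:
    importlib.import_module(module)
print("All scraper dependencies import cleanly.")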
1
rippling.csv
Normal file
@@ -0,0 +1 @@
url,timestamp
1324
ripplingcompanies.csv
Normal file
File diff suppressed because it is too large
491
scraper.py
Normal file
@@ -0,0 +1,491 @@
|
|||||||
|
|
||||||
|
import asyncio
|
||||||
|
import random
|
||||||
|
from typing import Optional, Dict
|
||||||
|
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
|
||||||
|
from browserforge.injectors.playwright import AsyncNewContext
|
||||||
|
from llm_agent import LLMJobRefiner
|
||||||
|
import re
|
||||||
|
from datetime import datetime
|
||||||
|
import json
|
||||||
|
import redis
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
import hashlib
|
||||||
|
import csv
|
||||||
|
import os
|
||||||
|
|
||||||
|
class CryptoJobScraper:
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
engine,
|
||||||
|
db_path: str = "crypto_jobs.db",
|
||||||
|
human_speed: float = 1.0,
|
||||||
|
user_request: str = "Extract all standard job details"
|
||||||
|
):
|
||||||
|
self.engine = engine
|
||||||
|
self.db_path = db_path
|
||||||
|
self.human_speed = human_speed
|
||||||
|
self.user_request = user_request
|
||||||
|
self.llm_agent = LLMJobRefiner()
|
||||||
|
self.redis_client = redis.Redis(host='localhost', port=6379, db=0, decode_responses=True)
|
||||||
|
|
||||||
|
self.FORBIDDEN_ATS_DOMAINS = [
|
||||||
|
'ashby', 'ashbyhq',
|
||||||
|
'greenhouse', 'boards.greenhouse.io',
|
||||||
|
'gem', 'gem.com',
|
||||||
|
'rippling',
|
||||||
|
'myworkday', 'myworkdayjobs',
|
||||||
|
'smartrecruiters',
|
||||||
|
'workable',
|
||||||
|
'lever', 'jobs.lever.co',
|
||||||
|
'linkedin.com' # ✅ Added LinkedIn
|
||||||
|
]
|
||||||
|
|
||||||
|
self.INVALID_CONTENT_PHRASES = [
|
||||||
|
"invalid job url",
|
||||||
|
"cookie consent",
|
||||||
|
"privacy policy",
|
||||||
|
"not a valid job",
|
||||||
|
"job not found",
|
||||||
|
"page not found",
|
||||||
|
"The requested job post could not be found. It may have been removed.",
|
||||||
|
"this page does not contain a job description"
|
||||||
|
]
|
||||||
|
|
||||||
|
async def _human_click(self, page, element, wait_after: bool = True):
|
||||||
|
if not element:
|
||||||
|
return False
|
||||||
|
await element.scroll_into_view_if_needed()
|
||||||
|
await asyncio.sleep(random.uniform(0.3, 0.8) * self.human_speed)
|
||||||
|
try:
|
||||||
|
await element.click()
|
||||||
|
if wait_after:
|
||||||
|
await asyncio.sleep(random.uniform(2, 4) * self.human_speed)
|
||||||
|
return True
|
||||||
|
except:
|
||||||
|
return False
|
||||||
|
|
||||||
|
async def _extract_page_content_for_llm(self, page) -> str:
|
||||||
|
await asyncio.sleep(2 * self.human_speed)
|
||||||
|
await self.engine._human_like_scroll(page)
|
||||||
|
await asyncio.sleep(2 * self.human_speed)
|
||||||
|
page_content = await page.content()
|
||||||
|
return page_content
|
||||||
|
|
||||||
|
def _calculate_keyword_match(self, title: str, keywords: str) -> float:
|
||||||
|
if not title or not keywords:
|
||||||
|
return 0.0
|
||||||
|
title_lower = title.lower()
|
||||||
|
keyword_list = [kw.strip().lower() for kw in keywords.split()]
|
||||||
|
matches = sum(1 for kw in keyword_list if kw in title_lower)
|
||||||
|
return matches / len(keyword_list) if keyword_list else 0.0
|
||||||
|
|
||||||
|
async def _extract_job_title_from_card(self, card) -> str:
|
||||||
|
try:
|
||||||
|
title_selectors = [
|
||||||
|
'h3', 'h2', 'h4',
|
||||||
|
'strong', 'span'
|
||||||
|
]
|
||||||
|
for selector in title_selectors:
|
||||||
|
title_element = await card.query_selector(selector)
|
||||||
|
if title_element:
|
||||||
|
title_text = await title_element.inner_text()
|
||||||
|
if title_text and len(title_text.strip()) > 3:
|
||||||
|
return title_text.strip()
|
||||||
|
|
||||||
|
card_text = await card.inner_text()
|
||||||
|
lines = [line.strip() for line in card_text.split('\n') if line.strip()]
|
||||||
|
if lines:
|
||||||
|
for line in lines:
|
||||||
|
if len(line) > 5 and not any(skip in line.lower() for skip in ['today', 'featured', 'yesterday', 'company', 'location']):
|
||||||
|
return line
|
||||||
|
return "Unknown Title"
|
||||||
|
except:
|
||||||
|
return "Unknown Title"
|
||||||
|
|
||||||
|
async def _collect_job_elements_from_page(self, page, search_keywords: str, seen_slugs):
|
||||||
|
job_cards = []
|
||||||
|
job_found = False
|
||||||
|
|
||||||
|
await asyncio.sleep(3 * self.human_speed)
|
||||||
|
|
||||||
|
try:
|
||||||
|
await page.wait_for_selector('a[href^="/"][href*="-"]', timeout=60000)
|
||||||
|
candidates = await page.query_selector_all('a[href^="/"][href*="-"]')
|
||||||
|
|
||||||
|
for link in candidates:
|
||||||
|
href = await link.get_attribute("href") or ""
|
||||||
|
href = href.rstrip('/')
|
||||||
|
if not href or len(href.split('/')) != 3:
|
||||||
|
continue
|
||||||
|
if '-' not in href.split('/')[-1]:
|
||||||
|
continue
|
||||||
|
slug = href.split('/')[-1]
|
||||||
|
if len(slug) < 8 or slug.startswith(('login', 'signup', 'about', 'terms')):
|
||||||
|
continue
|
||||||
|
|
||||||
|
full_url = "https://cryptocurrencyjobs.co" + href if not href.startswith('http') else href
|
||||||
|
if slug in seen_slugs:
|
||||||
|
continue
|
||||||
|
|
||||||
|
title = await self._extract_job_title_from_card(link)
|
||||||
|
if not title or title == "Unknown Title":
|
||||||
|
title = slug.replace('-', ' ').title()
|
||||||
|
|
||||||
|
match_percentage = self._calculate_keyword_match(title, search_keywords)
|
||||||
|
if match_percentage >= 0.4 or not search_keywords.strip():
|
||||||
|
seen_slugs.add(slug)
|
||||||
|
job_cards.append((full_url, title, link))
|
||||||
|
job_found = True
|
||||||
|
|
||||||
|
print(f" ✅ Collected {len(job_cards)} valid job links after filtering ({len(candidates)} raw candidates).")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f" ⚠️ Error collecting job cards: {e}")
|
||||||
|
|
||||||
|
if not job_found:
|
||||||
|
print(" ❌ No valid job listings passed filters.")
|
||||||
|
|
||||||
|
return job_cards
|
||||||
|
|
||||||
|
async def _handle_pagination_and_collect_all(self, page, search_keywords: str, seen_slugs):
|
||||||
|
all_job_elements = []
|
||||||
|
scroll_attempt = 0
|
||||||
|
max_scrolls = 40
|
||||||
|
prev_count = 0
|
||||||
|
|
||||||
|
while scroll_attempt < max_scrolls:
|
||||||
|
print(f" Scroll attempt {scroll_attempt + 1} | Current total jobs: {len(all_job_elements)}")
|
||||||
|
|
||||||
|
page_elements = await self._collect_job_elements_from_page(page, search_keywords, seen_slugs)
|
||||||
|
all_job_elements.extend(page_elements)
|
||||||
|
|
||||||
|
current_count = len(all_job_elements)
|
||||||
|
|
||||||
|
if current_count == prev_count and scroll_attempt > 3:
|
||||||
|
print(" 🔚 No new jobs after several scrolls → assuming end of list.")
|
||||||
|
break
|
||||||
|
|
||||||
|
prev_count = current_count
|
||||||
|
|
||||||
|
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||||
|
await asyncio.sleep(random.uniform(2.5, 5.5) * self.human_speed)
|
||||||
|
|
||||||
|
try:
|
||||||
|
load_more = await page.query_selector(
|
||||||
|
'button:has-text("Load more"), button:has-text("More"), div[role="button"]:has-text("Load"), a:has-text("Load more")'
|
||||||
|
)
|
||||||
|
if load_more:
|
||||||
|
print(" Found 'Load more' button → clicking...")
|
||||||
|
await self._human_click(page, load_more)
|
||||||
|
await asyncio.sleep(random.uniform(3.0, 6.0) * self.human_speed)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
scroll_attempt += 1
|
||||||
|
|
||||||
|
print(f" Finished scrolling → collected {len(all_job_elements)} unique job links.")
|
||||||
|
return all_job_elements
|
||||||
|
|
||||||
|
async def _extract_job_posted_date_from_card(self, card) -> str:
|
||||||
|
try:
|
||||||
|
card_text = await card.inner_text()
|
||||||
|
if "Today" in card_text:
|
||||||
|
return datetime.now().strftime("%m/%d/%y")
|
||||||
|
elif "Yesterday" in card_text:
|
||||||
|
from datetime import timedelta
|
||||||
|
return (datetime.now() - timedelta(days=1)).strftime("%m/%d/%y")
|
||||||
|
else:
|
||||||
|
match = re.search(r'(\d+)d', card_text)
|
||||||
|
if match:
|
||||||
|
days = int(match.group(1))
|
||||||
|
from datetime import timedelta
|
||||||
|
return (datetime.now() - timedelta(days=days)).strftime("%m/%d/%y")
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
return datetime.now().strftime("%m/%d/%y")
|
||||||
|
|
||||||
|
async def _add_job_to_redis_cache(self, job_url: str, job_id: str, error_type: str):
|
||||||
|
try:
|
||||||
|
job_data = {
|
||||||
|
"job_url": job_url,
|
||||||
|
"job_id": job_id,
|
||||||
|
"error_type": error_type,
|
||||||
|
"timestamp": datetime.now().isoformat()
|
||||||
|
}
|
||||||
|
self.redis_client.hset("failed_crypto_jobs", job_id, json.dumps(job_data))
|
||||||
|
print(f" 📦 Added failed job to Redis cache: {job_id} (Error: {error_type})")
|
||||||
|
except Exception as e:
|
||||||
|
print(f" ❌ Failed to add job to Redis cache: {str(e)}")
|
||||||
|
|
||||||
|
async def _is_forbidden_ats_url(self, url: str) -> bool:
|
||||||
|
url_lower = url.lower()
|
||||||
|
return any(domain in url_lower for domain in self.FORBIDDEN_ATS_DOMAINS)
|
||||||
|
|
||||||
|
def _get_ats_platform_name(self, url: str) -> str:
|
||||||
|
"""Return canonical ATS name based on URL (e.g., 'ashby', 'greenhouse')"""
|
||||||
|
url_lower = url.lower()
|
||||||
|
|
||||||
|
# Order matters: more specific first
|
||||||
|
if 'boards.greenhouse.io' in url_lower:
|
||||||
|
return 'greenhouse'
|
||||||
|
elif 'jobs.lever.co' in url_lower:
|
||||||
|
return 'lever'
|
||||||
|
elif 'myworkdayjobs' in url_lower or 'myworkday' in url_lower:
|
||||||
|
return 'workday'
|
||||||
|
elif 'linkedin.com' in url_lower:
|
||||||
|
return 'linkedin'
|
||||||
|
elif 'ashbyhq.com' in url_lower or 'ashby' in url_lower:
|
||||||
|
return 'ashby'
|
||||||
|
elif 'gem.com' in url_lower or 'gem' in url_lower:
|
||||||
|
return 'gem'
|
||||||
|
elif 'rippling' in url_lower:
|
||||||
|
return 'rippling'
|
||||||
|
elif 'smartrecruiters' in url_lower:
|
||||||
|
return 'smartrecruiters'
|
||||||
|
elif 'workable' in url_lower:
|
||||||
|
return 'workable'
|
||||||
|
else:
|
||||||
|
# Fallback: extract domain part
|
||||||
|
try:
|
||||||
|
parsed = urlparse(url)
|
||||||
|
domain = parsed.netloc.lower()
|
||||||
|
for forbidden in self.FORBIDDEN_ATS_DOMAINS:
|
||||||
|
if forbidden in domain:
|
||||||
|
return forbidden.split('.')[0] if '.' in forbidden else forbidden
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
return 'forbidden_ats'
|
||||||
|
|
||||||
|
def _log_forbidden_ats_url(self, url: str, platform: str):
|
||||||
|
"""Append forbidden URL to {platform}.csv"""
|
||||||
|
filename = f"{platform}.csv"
|
||||||
|
file_exists = os.path.isfile(filename)
|
||||||
|
with open(filename, 'a', newline='', encoding='utf-8') as f:
|
||||||
|
writer = csv.writer(f)
|
||||||
|
if not file_exists:
|
||||||
|
writer.writerow(['url', 'timestamp'])
|
||||||
|
writer.writerow([url, datetime.now().isoformat()])
|
||||||
|
print(f" 📥 Logged forbidden ATS URL to {filename}: {url}")
|
||||||
|
|
||||||
|
async def _is_invalid_job_page(self, page_content: str) -> bool:
|
||||||
|
content_lower = page_content.lower()
|
||||||
|
return any(phrase in content_lower for phrase in self.INVALID_CONTENT_PHRASES)
|
||||||
|
|
||||||
|
def _extract_job_id_from_url(self, url: str) -> Optional[str]:
|
||||||
|
try:
|
||||||
|
parsed = urlparse(url)
|
||||||
|
path_parts = [p for p in parsed.path.split('/') if p]
|
||||||
|
if not path_parts:
|
||||||
|
return None
|
||||||
|
|
||||||
|
candidate = path_parts[-1]
|
||||||
|
candidate = re.split(r'[?#]', candidate)[0]
|
||||||
|
candidate = re.sub(r'\.html?$', '', candidate)
|
||||||
|
|
||||||
|
if not candidate or not any(c.isdigit() for c in candidate):
|
||||||
|
return None
|
||||||
|
|
||||||
|
if re.search(r'[A-Za-z]{6,}\s', candidate):
|
||||||
|
return None
|
||||||
|
|
||||||
|
return candidate
|
||||||
|
except:
|
||||||
|
return None
|
||||||
|
|
||||||
|
async def scrape_jobs(
|
||||||
|
self,
|
||||||
|
search_keywords: Optional[str],
|
||||||
|
max_pages: int = 1,
|
||||||
|
credentials: Optional[Dict] = None
|
||||||
|
):
|
||||||
|
query = ""
|
||||||
|
location = ""
|
||||||
|
if search_keywords and search_keywords.strip():
|
||||||
|
parts = search_keywords.split(',', 1)
|
||||||
|
query = parts[0].strip()
|
||||||
|
if len(parts) > 1:
|
||||||
|
location = parts[1].strip()
|
||||||
|
|
||||||
|
clean_query = query.replace(' ', '+')
|
||||||
|
clean_location = location.replace(' ', '+')
|
||||||
|
|
||||||
|
search_url = "https://cryptocurrencyjobs.co/"
|
||||||
|
if clean_query:
|
||||||
|
search_url += f"?query={clean_query}"
|
||||||
|
if clean_location:
|
||||||
|
search_url += f"&location={clean_location}"
|
||||||
|
|
||||||
|
profile = self.engine._select_profile()
|
||||||
|
renderer = random.choice(self.engine.common_renderers[self.engine.os])
|
||||||
|
vendor = random.choice(self.engine.common_vendors)
|
||||||
|
spoof_script = self.engine._get_spoof_script(renderer, vendor)
|
||||||
|
|
||||||
|
async with async_playwright() as pw:
|
||||||
|
browser = await pw.chromium.launch(
|
||||||
|
headless=False,
|
||||||
|
args=['--disable-blink-features=AutomationControlled']
|
||||||
|
)
|
||||||
|
context = await AsyncNewContext(browser, fingerprint=profile)
|
||||||
|
|
||||||
|
await context.add_init_script(f"""
|
||||||
|
Object.defineProperty(navigator, 'hardwareConcurrency', {{ get: () => {profile.navigator.hardwareConcurrency} }});
|
||||||
|
Object.defineProperty(navigator, 'deviceMemory', {{ get: () => {profile.navigator.deviceMemory} }});
|
||||||
|
Object.defineProperty(navigator, 'platform', {{ get: () => '{profile.navigator.platform}' }});
|
||||||
|
""")
|
||||||
|
await context.add_init_script(spoof_script)
|
||||||
|
|
||||||
|
page = await context.new_page()
|
||||||
|
print(f"🔍 Searching for: {search_keywords or 'all jobs'}")
|
||||||
|
print(f" 🔗 URL: {search_url}")
|
||||||
|
await page.goto(search_url, wait_until='networkidle', timeout=120000)
|
||||||
|
await asyncio.sleep(random.uniform(3.0, 5.0) * self.human_speed)
|
||||||
|
|
||||||
|
try:
|
||||||
|
await page.wait_for_selector("a[href^='/'][href*='-']", timeout=10000)
|
||||||
|
except:
|
||||||
|
print(" ⚠️ No job links found initially, waiting longer...")
|
||||||
|
await asyncio.sleep(5 * self.human_speed)
|
||||||
|
|
||||||
|
seen_slugs = set()
|
||||||
|
all_job_elements = await self._handle_pagination_and_collect_all(page, search_keywords, seen_slugs)
|
||||||
|
print(f"✅ Collected {len(all_job_elements)} unique job links.")
|
||||||
|
|
||||||
|
scraped_count = 0
|
||||||
|
for idx, (href, title, job_element) in enumerate(all_job_elements):
|
||||||
|
job_detail_page = None
|
||||||
|
apply_page = None
|
||||||
|
skip_job = False
|
||||||
|
final_scrape_url = None
|
||||||
|
try:
|
||||||
|
print(f" → Processing job {idx+1}/{len(all_job_elements)}: {title}")
|
||||||
|
|
||||||
|
posted_date = await self._extract_job_posted_date_from_card(job_element)
|
||||||
|
|
||||||
|
job_detail_page = await context.new_page()
|
||||||
|
await job_detail_page.goto(href, wait_until='networkidle', timeout=60000)
|
||||||
|
await asyncio.sleep(2 * self.human_speed)
|
||||||
|
|
||||||
|
page_content = await job_detail_page.content()
|
||||||
|
if await self._is_invalid_job_page(page_content):
|
||||||
|
print(" 🚫 Page contains invalid content → skipping.")
|
||||||
|
await job_detail_page.close()
|
||||||
|
continue
|
||||||
|
|
||||||
|
apply_clicked = False
|
||||||
|
apply_selectors = [
|
||||||
|
'a[href*="apply"], a:text("Apply"), a:text("Apply Now"), a:text("Apply here")',
|
||||||
|
'button:text("Apply"), button:has-text("Apply")',
|
||||||
|
'[data-testid="apply-button"], [aria-label*="apply"], [role="button"]:has-text("Apply")',
|
||||||
|
'a.btn-apply, .apply-button, .apply-link, a:has-text("Apply")',
|
||||||
|
'a[rel="noopener"]:has-text("Apply")',
|
||||||
|
]
|
||||||
|
|
||||||
|
for sel in apply_selectors:
|
||||||
|
apply_elem = await job_detail_page.query_selector(sel)
|
||||||
|
if apply_elem:
|
||||||
|
print(f" 🔗 Found Apply element with selector: {sel}")
|
||||||
|
await self._human_click(job_detail_page, apply_elem, wait_after=True)
|
||||||
|
apply_clicked = True
|
||||||
|
break
|
||||||
|
|
||||||
|
apply_page = job_detail_page
|
||||||
|
|
||||||
|
if apply_clicked:
|
||||||
|
await asyncio.sleep(random.uniform(3.0, 6.0) * self.human_speed)
|
||||||
|
pages = context.pages
|
||||||
|
new_pages = [p for p in pages if p != job_detail_page and p.url != "about:blank"]
|
||||||
|
|
||||||
|
if new_pages:
|
||||||
|
candidate_page = new_pages[-1]
|
||||||
|
new_url = candidate_page.url.strip()
|
||||||
|
print(f" New tab opened: {new_url}")
|
||||||
|
|
||||||
|
if new_url and await self._is_forbidden_ats_url(new_url):
|
||||||
|
platform = self._get_ats_platform_name(new_url)
|
||||||
|
self._log_forbidden_ats_url(new_url, platform)
|
||||||
|
if candidate_page != job_detail_page:
|
||||||
|
await candidate_page.close()
|
||||||
|
await job_detail_page.close()
|
||||||
|
skip_job = True
|
||||||
|
else:
|
||||||
|
apply_page = candidate_page
|
||||||
|
else:
|
||||||
|
print(" No new tab → using original page.")
|
||||||
|
|
||||||
|
if skip_job:
|
||||||
|
continue
|
||||||
|
|
||||||
|
final_scrape_url = apply_page.url
|
||||||
|
|
||||||
|
page_content = await self._extract_page_content_for_llm(apply_page)
|
||||||
|
if await self._is_invalid_job_page(page_content):
|
||||||
|
print(" 🚫 Final page contains invalid content → skipping.")
|
||||||
|
if apply_page != job_detail_page:
|
||||||
|
await apply_page.close()
|
||||||
|
await job_detail_page.close()
|
||||||
|
continue
|
||||||
|
|
||||||
|
job_id = self._extract_job_id_from_url(final_scrape_url)
|
||||||
|
if not job_id:
|
||||||
|
job_id = "job_" + hashlib.md5(final_scrape_url.encode()).hexdigest()[:12]
|
||||||
|
|
||||||
|
raw_data = {
|
||||||
|
"page_content": page_content,
|
||||||
|
"url": final_scrape_url,
|
||||||
|
"job_id": job_id,
|
||||||
|
"search_keywords": search_keywords,
|
||||||
|
"posted_date": posted_date
|
||||||
|
}
|
||||||
|
|
||||||
|
refined_data = await self.llm_agent.refine_job_data(raw_data, self.user_request)
|
||||||
|
|
||||||
|
if refined_data and refined_data.get("title", "N/A") != "N/A":
|
||||||
|
compulsory_fields = ['company_name', 'job_id', 'url']
|
||||||
|
for field in compulsory_fields:
|
||||||
|
if not refined_data.get(field) or refined_data[field] in ["N/A", "", "Unknown"]:
|
||||||
|
if field == 'job_id':
|
||||||
|
refined_data[field] = job_id
|
||||||
|
elif field == 'url':
|
||||||
|
refined_data[field] = final_scrape_url
|
||||||
|
elif field == 'company_name':
|
||||||
|
refined_data[field] = "Unknown Company"
|
||||||
|
|
||||||
|
refined_data['scraped_at'] = datetime.now().isoformat()
|
||||||
|
refined_data['category'] = search_keywords or "all"
|
||||||
|
refined_data['posted_date'] = posted_date
|
||||||
|
await self.llm_agent.save_job_data(refined_data, search_keywords or "all")
|
||||||
|
scraped_count += 1
|
||||||
|
print(f" ✅ Scraped: {refined_data['title'][:50]}... (Job ID: {job_id})")
|
||||||
|
self.engine.report_outcome("success", url=final_scrape_url)
|
||||||
|
else:
|
||||||
|
print(f" 🟡 Could not extract meaningful data from: {final_scrape_url}")
|
||||||
|
await self._add_job_to_redis_cache(final_scrape_url, job_id, "llm_failure")
|
||||||
|
self.engine.report_outcome("llm_failure", url=final_scrape_url)
|
||||||
|
|
||||||
|
if apply_page != job_detail_page and not apply_page.is_closed():
|
||||||
|
await apply_page.close()
|
||||||
|
if job_detail_page and not job_detail_page.is_closed():
|
||||||
|
await job_detail_page.close()
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
error_msg = str(e)[:100]
|
||||||
|
print(f" ⚠️ Failed on job {idx+1}: {error_msg}")
|
||||||
|
job_id_for_log = "unknown"
|
||||||
|
if 'final_scrape_url' in locals() and final_scrape_url:
|
||||||
|
job_id_for_log = "job_" + hashlib.md5(final_scrape_url.encode()).hexdigest()[:12]
|
||||||
|
await self._add_job_to_redis_cache(href, job_id_for_log, f"exception: {error_msg}")
|
||||||
|
if job_detail_page and not job_detail_page.is_closed():
|
||||||
|
await job_detail_page.close()
|
||||||
|
if 'apply_page' in locals() and apply_page and apply_page != job_detail_page and not apply_page.is_closed():
|
||||||
|
await apply_page.close()
|
||||||
|
continue
|
||||||
|
|
||||||
|
await browser.close()
|
||||||
|
|
||||||
|
if scraped_count > 0:
|
||||||
|
self.engine.report_outcome("success")
|
||||||
|
print(f"✅ Completed! Processed {scraped_count} jobs for '{search_keywords or 'all jobs'}'.")
|
||||||
|
else:
|
||||||
|
self.engine.report_outcome("scraping_error")
|
||||||
|
print("⚠️ No jobs processed successfully.")
|
||||||
@ -6,10 +6,12 @@ import hashlib
|
|||||||
import random
|
import random
|
||||||
import os
|
import os
|
||||||
import json
|
import json
|
||||||
from typing import List, Optional, Dict
|
from playwright.async_api import Page
|
||||||
|
from typing import List, Optional, Dict, Any
|
||||||
from browserforge.fingerprints import FingerprintGenerator
|
from browserforge.fingerprints import FingerprintGenerator
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
from config import load_spoof_config
|
from config import load_spoof_config
|
||||||
|
import time
|
||||||
|
|
||||||
# Load environment variables
|
# Load environment variables
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
@ -24,8 +26,7 @@ class FingerprintScrapingEngine:
|
|||||||
db_path: str = "jobs.db",
|
db_path: str = "jobs.db",
|
||||||
markdown_path: str = "scraped_jobs.md",
|
markdown_path: str = "scraped_jobs.md",
|
||||||
proxies: List[str] = None,
|
proxies: List[str] = None,
|
||||||
login_credentials: Optional[Dict[str, str]] = None,
|
login_credentials: Optional[Dict[str, str]] = None
|
||||||
search_keywords: Optional[str] = None
|
|
||||||
):
|
):
|
||||||
if target_os not in ['windows', 'macos']:
|
if target_os not in ['windows', 'macos']:
|
||||||
raise ValueError("operating_system must be 'windows' or 'macos'")
|
raise ValueError("operating_system must be 'windows' or 'macos'")
|
||||||
@ -42,7 +43,6 @@ class FingerprintScrapingEngine:
|
|||||||
self.markdown_path = markdown_path
|
self.markdown_path = markdown_path
|
||||||
self.proxies = proxies or []
|
self.proxies = proxies or []
|
||||||
self.login_credentials = login_credentials
|
self.login_credentials = login_credentials
|
||||||
self.search_keywords = search_keywords
|
|
||||||
self.fingerprint_generator = FingerprintGenerator(
|
self.fingerprint_generator = FingerprintGenerator(
|
||||||
browser=('chrome',),
|
browser=('chrome',),
|
||||||
os=(self.os,)
|
os=(self.os,)
|
||||||
@ -55,16 +55,28 @@ class FingerprintScrapingEngine:
|
|||||||
self.common_renderers = spoof_config["renderers"]
|
self.common_renderers = spoof_config["renderers"]
|
||||||
self.common_vendors = spoof_config["vendors"]
|
self.common_vendors = spoof_config["vendors"]
|
||||||
|
|
||||||
# Feedback system
|
|
||||||
self.feedback_file = f"feedback_{seed}.json"
|
self.feedback_file = f"feedback_{seed}.json"
|
||||||
|
|
||||||
|
# Feedback system
|
||||||
self.feedback = self._load_feedback()
|
self.feedback = self._load_feedback()
|
||||||
|
|
||||||
# ← NEW: Session persistence paths
|
# ← NEW: Session persistence paths
|
||||||
self.session_dir = "browser_sessions"
|
self.session_dir = "browser_sessions"
|
||||||
os.makedirs(self.session_dir, exist_ok=True)
|
os.makedirs(self.session_dir, exist_ok=True)
|
||||||
self.session_path = os.path.join(self.session_dir, f"{seed}_session.json")
|
self.session_path = os.path.join(
|
||||||
|
self.session_dir, f"{seed}_session.json")
|
||||||
|
|
||||||
def _load_feedback(self):
|
self.optimization_params = {
|
||||||
|
"base_delay": 2.0,
|
||||||
|
"max_concurrent_requests": 4,
|
||||||
|
"request_timeout": 120000,
|
||||||
|
"retry_attempts": 3,
|
||||||
|
"captcha_handling_strategy": "avoid", # or "solve_fallback"
|
||||||
|
"cloudflare_wait_strategy": "smart_wait", # or "aggressive_reload"
|
||||||
|
}
|
||||||
|
self._update_params_from_feedback()
|
||||||
|
|
||||||
|
def _load_feedback(self) -> Dict[str, Any]:
|
||||||
if os.path.exists(self.feedback_file):
|
if os.path.exists(self.feedback_file):
|
||||||
try:
|
try:
|
||||||
with open(self.feedback_file, "r") as f:
|
with open(self.feedback_file, "r") as f:
|
||||||
@ -72,6 +84,8 @@ class FingerprintScrapingEngine:
|
|||||||
data.setdefault("success_rate", 1.0)
|
data.setdefault("success_rate", 1.0)
|
||||||
data.setdefault("captcha_count", 0)
|
data.setdefault("captcha_count", 0)
|
||||||
data.setdefault("cloudflare_count", 0)
|
data.setdefault("cloudflare_count", 0)
|
||||||
|
data.setdefault("avg_response_time", 10.0) # New metric
|
||||||
|
data.setdefault("failed_domains", {}) # New metrice
|
||||||
return data
|
return data
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
@ -81,16 +95,69 @@ class FingerprintScrapingEngine:
|
|||||||
with open(self.feedback_file, "w") as f:
|
with open(self.feedback_file, "w") as f:
|
||||||
json.dump(self.feedback, f)
|
json.dump(self.feedback, f)
|
||||||
|
|
||||||
def report_outcome(self, outcome: str):
|
def report_outcome(self, outcome: str, url: Optional[str] = None, response_time: Optional[float] = None):
|
||||||
if outcome == "success":
|
if outcome == "success":
|
||||||
self.feedback["success_rate"] = min(1.0, self.feedback["success_rate"] + 0.1)
|
self.feedback["success_rate"] = min(
|
||||||
|
1.0, self.feedback["success_rate"] + 0.05) # Smaller increment
|
||||||
else:
|
else:
|
||||||
self.feedback["success_rate"] = max(0.1, self.feedback["success_rate"] - 0.2)
|
self.feedback["success_rate"] = max(
|
||||||
|
0.05, self.feedback["success_rate"] - 0.1) # Smaller decrement
|
||||||
|
|
||||||
if outcome == "captcha":
|
if outcome == "captcha":
|
||||||
self.feedback["captcha_count"] += 1
|
self.feedback["captcha_count"] += 1
|
||||||
|
# Adapt strategy if many captchas
|
||||||
|
self.optimization_params["captcha_handling_strategy"] = "solve_fallback"
|
||||||
elif outcome == "cloudflare":
|
elif outcome == "cloudflare":
|
||||||
self.feedback["cloudflare_count"] += 1
|
self.feedback["cloudflare_count"] += 1
|
||||||
|
# Adjust wait strategy based on frequency
|
||||||
|
if self.feedback["cloudflare_count"] > 5:
|
||||||
|
self.optimization_params["cloudflare_wait_strategy"] = "aggressive_reload"
|
||||||
|
|
||||||
|
# Track domain-specific failures
|
||||||
|
if url and outcome != "success":
|
||||||
|
domain = url.split("//")[1].split("/")[0]
|
||||||
|
if domain not in self.feedback["failed_domains"]:
|
||||||
|
self.feedback["failed_domains"][domain] = 0
|
||||||
|
self.feedback["failed_domains"][domain] += 1
|
||||||
|
|
||||||
|
# Update average response time
|
||||||
|
if response_time:
|
||||||
|
prev_avg = self.feedback.get("avg_response_time", 10.0)
|
||||||
|
# Simple moving average
|
||||||
|
self.feedback["avg_response_time"] = (
|
||||||
|
prev_avg * 0.9) + (response_time * 0.1)
|
||||||
|
|
||||||
self.save_feedback()
|
self.save_feedback()
|
||||||
|
self._update_params_from_feedback() # Update params based on new feedback
|
||||||
|
|
||||||
|
def _update_params_from_feedback(self):
|
||||||
|
"""Adjust optimization parameters based on feedback."""
|
||||||
|
sr = self.feedback["success_rate"]
|
||||||
|
cc = self.feedback["captcha_count"]
|
||||||
|
cf = self.feedback["cloudflare_count"]
|
||||||
|
avg_rt = self.feedback.get("avg_response_time", 10.0)
|
||||||
|
|
||||||
|
# Adjust base delay based on success rate and avg response time
|
||||||
|
if sr < 0.6:
|
||||||
|
self.optimization_params["base_delay"] = max(
|
||||||
|
5.0, self.optimization_params["base_delay"] * 1.2)
|
||||||
|
elif sr > 0.8:
|
||||||
|
self.optimization_params["base_delay"] = min(
|
||||||
|
3.0, self.optimization_params["base_delay"] * 0.9)
|
||||||
|
|
||||||
|
# Reduce concurrency if many captchas/cloudflares
|
||||||
|
if cc > 3 or cf > 3:
|
||||||
|
self.optimization_params["max_concurrent_requests"] = max(
|
||||||
|
2, self.optimization_params["max_concurrent_requests"] - 2)
|
||||||
|
else:
|
||||||
|
# Reset to default
|
||||||
|
self.optimization_params["max_concurrent_requests"] = 4
|
||||||
|
|
||||||
|
# Increase timeout if avg response time is high
|
||||||
|
if avg_rt > 20:
|
||||||
|
self.optimization_params["request_timeout"] = 150000 # 90 seconds
|
||||||
|
|
||||||
|
print(f"Optimization Params Updated: {self.optimization_params}")
|
||||||
|
|
||||||
# ← NEW: Save browser context (cookies + localStorage)
|
# ← NEW: Save browser context (cookies + localStorage)
|
||||||
async def save_session(self, context):
|
async def save_session(self, context):
|
||||||
@@ -131,7 +198,8 @@ class FingerprintScrapingEngine:
         if self.feedback["success_rate"] < 0.5:
             concurrency_options = [8, 4]
             memory_options = [8]
-        profile.navigator.hardwareConcurrency = random.choice(concurrency_options)
+        profile.navigator.hardwareConcurrency = random.choice(
+            concurrency_options)
         profile.navigator.deviceMemory = random.choice(memory_options)
         return profile
 
@@ -247,23 +315,6 @@ class FingerprintScrapingEngine:
             except:
                 pass
 
-    async def _detect_cloudflare(self, page) -> bool:
-        content = await page.content()
-        return (
-            "#cf-chl" in content or
-            "checking your browser" in content.lower() or
-            "just a moment" in content.lower()
-        )
-
-    async def _handle_cloudflare(self, page, max_retries: int = 3):
-        for i in range(max_retries):
-            if not await self._detect_cloudflare(page):
-                return True
-            print(f"☁️ Cloudflare detected - waiting... (attempt {i+1})")
-            await asyncio.sleep(8 + random.uniform(2, 5))
-            await page.wait_for_load_state("load", timeout=60000)
-        return False
-
     async def _avoid_captcha(self, page) -> bool:
         await asyncio.sleep(2 + random.random() * 3)
         await self._human_like_scroll(page)
@@ -285,3 +336,42 @@ class FingerprintScrapingEngine:
                 return True
 
         return False
+
+    async def _detect_cloudflare(self, page: Page) -> bool:
+        """Detect Cloudflare challenges."""
+        content = await page.content()
+        return (
+            "#cf-chl" in content
+            or "checking your browser" in content.lower()
+            or "just a moment" in content.lower()
+            or "turnstile" in content.lower()  # Check for Cloudflare Turnstile
+        )
+
+    async def _handle_cloudflare(self, page: Page) -> bool:
+        """
+        Handle Cloudflare challenges, including Turnstile if present.
+        This is a simplified approach; real-world handling might require more sophisticated logic or external solvers.
+        """
+        max_wait_time = 60  # Total time to wait for Cloudflare to resolve
+        start_time = time.time()
+
+        while time.time() - start_time < max_wait_time:
+            if not await self._detect_cloudflare(page):
+                print("Cloudflare challenge resolved.")
+                return True
+
+            print("Cloudflare active, waiting...")
+            # Simulate more human-like behavior while waiting
+            await self._simulate_human_interaction(page)
+            # Wait for a random period, increasing slightly each time
+            wait_time = min(10, 2 + random.uniform(1, 3) +
+                            (time.time() - start_time) * 0.1)
+            await asyncio.sleep(wait_time)
+
+            # Reload occasionally to trigger potential client-side checks
+            if (time.time() - start_time) > 15 and (time.time() - start_time) % 20 < 2:
+                print("Reloading page during Cloudflare wait...")
+                await page.reload(wait_until='load', timeout=80000)
+
+        print("Timeout waiting for Cloudflare resolution.")
+        return False
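Editor's note: the report_outcome() changes above boil down to two small pieces of arithmetic: the success rate is nudged by a fixed step (+0.05 / -0.1, clamped to [0.05, 1.0]) and the average response time is a 0.9/0.1 exponentially weighted moving average. The snippet below is a minimal standalone sketch of that math only; record() is a hypothetical stand-in and is not part of scraping_engine.py.

from typing import Optional

# Mirrors the constants and dict layout used in the diff above.
feedback = {"success_rate": 0.5, "avg_response_time": 10.0}

def record(outcome: str, response_time: Optional[float] = None) -> None:
    if outcome == "success":
        feedback["success_rate"] = min(1.0, feedback["success_rate"] + 0.05)
    else:
        feedback["success_rate"] = max(0.05, feedback["success_rate"] - 0.1)
    if response_time is not None:
        # Simple moving average, weighted 0.9 old / 0.1 new
        feedback["avg_response_time"] = feedback["avg_response_time"] * 0.9 + response_time * 0.1

if __name__ == "__main__":
    for outcome, rt in [("success", 4.2), ("cloudflare", 31.0), ("success", 5.8)]:
        record(outcome, rt)
        print(feedback)

Running the loop shows how a single Cloudflare hit drags the success rate and response-time average down before later successes pull them back up, which is what drives _update_params_from_feedback().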
26
spoof_config.json
Normal file
@@ -0,0 +1,26 @@
+{
+    "renderers": {
+        "windows": [
+            "ANGLE (Intel, Intel(R) UHD Graphics 630 (0x00003E9B) Direct3D11 vs_5_0 ps_5_0, D3D11)",
+            "ANGLE (Intel, Intel(R) UHD Graphics (0x00009A49) Direct3D11 vs_5_0 ps_5_0, D3D11)",
+            "ANGLE (Intel(R) Iris(TM) Graphics 540 Direct3D11 vs_5_0 ps_5_0)",
+            "ANGLE (Intel, Intel(R) UHD Graphics 620 (0x00005916) Direct3D11 vs_5_0 ps_5_0, D3D11)",
+            "ANGLE (Intel, Intel(R) HD Graphics 530 (0x0000191B) Direct3D11 vs_5_0 ps_5_0, D3D11)",
+            "ANGLE (Intel, Intel(R) UHD Graphics 600 (0x00003180) Direct3D11 vs_5_0 ps_5_0, D3D11)",
+            "ANGLE (Intel, Intel(R) Iris(R) Xe Graphics (0x00009A49) Direct3D11 vs_5_0 ps_5_0, D3D11)"
+        ],
+        "macos": [
+            "Intel HD Graphics 530 OpenGL Engine",
+            "Intel Iris Graphics 6100 OpenGL Engine",
+            "Intel UHD Graphics 630 OpenGL Engine",
+            "Intel HD Graphics 4000 OpenGL Engine",
+            "Intel Iris Pro OpenGL Engine",
+            "Intel UHD Graphics 617 OpenGL Engine"
+        ]
+    },
+    "vendors": [
+        "Intel Inc.",
+        "Intel",
+        "Intel Corporation"
+    ]
+}
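Editor's note: spoof_config.json groups plausible WebGL renderer strings per OS plus a list of vendor strings. The diff does not show which module reads this file, so the loader below is only a hedged sketch of how it could be consumed to pick a vendor/renderer pair.

import json
import random

# Assumption: spoof_config.json is in the working directory; the diff does not
# show where the engine actually loads it.
with open("spoof_config.json") as f:
    spoof = json.load(f)

os_key = "windows"  # or "macos", matching the keys in the file
renderer = random.choice(spoof["renderers"][os_key])
vendor = random.choice(spoof["vendors"])
print(f"Spoofed WebGL vendor: {vendor!r}, renderer: {renderer!r}")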
5
workable.csv
Normal file
@@ -0,0 +1,5 @@
+url,timestamp
+https://apply.workable.com/thetie/j/C54DFC9985/?ref=cryptocurrencyjobs.co,2025-12-31T08:24:45.755671
+https://apply.workable.com/thetie/j/C54DFC9985/?ref=cryptocurrencyjobs.co,2025-12-31T09:51:08.343642
+https://apply.workable.com/thetie/j/2745433865/?ref=cryptocurrencyjobs.co,2025-12-31T09:51:28.331543
+https://apply.workable.com/thetie/j/1A6C8F2913/?ref=cryptocurrencyjobs.co,2025-12-31T11:22:54.623723
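Editor's note: workable.csv logs one url,timestamp row per scraped posting, and the same The Tie listing (C54DFC9985) appears twice. A hypothetical way to use this log as a seen-set before re-fetching is sketched below; this is illustration only, not code from the repository.

import csv

# Build a set of already-scraped URLs from the per-platform log.
with open("workable.csv", newline="") as f:
    seen = {row["url"] for row in csv.DictReader(f)}

candidate = "https://apply.workable.com/thetie/j/C54DFC9985/?ref=cryptocurrencyjobs.co"
if candidate in seen:
    print("Already scraped, skipping.")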
3548
workablecompanies.csv
Normal file
File diff suppressed because it is too large
1
workday.csv
Normal file
@@ -0,0 +1 @@
+url,timestamp
1045
workdaycompanies.csv
Normal file
File diff suppressed because it is too large