Add scraping_engine.py
This commit is contained in:
parent
28d7197378
commit
1a216a1aa8
225
scraping_engine.py
Normal file
225
scraping_engine.py
Normal file
@ -0,0 +1,225 @@
|
|||||||
|
# scraping_engine.py
|
||||||
|
|
||||||
|
import asyncio
import hashlib
import json
import os
import random
from typing import List, Optional, Dict

from browserforge.fingerprints import FingerprintGenerator
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
# Load environment variables
|
||||||
|
# Runs at import time so that the os.getenv("SCRAPING_USERNAME") /
# os.getenv("SCRAPING_PASSWORD") fallback in FingerprintScrapingEngine.__init__
# can see values supplied through a .env file.
load_dotenv()
|
||||||
|
|
||||||
|
|
||||||
|
class FingerprintScrapingEngine:
    """Seeded browser-fingerprint and anti-bot helpers for a scraping session.

    The engine wraps a browserforge ``FingerprintGenerator`` restricted to
    Chrome on the chosen operating system, builds a JavaScript snippet that
    overrides WebGL / canvas / audio / font fingerprinting surfaces, and
    offers coroutines that pace page interaction and wait out Cloudflare or
    CAPTCHA interstitials.

    The ``page`` argument of the coroutines is used through ``evaluate``,
    ``content``, ``query_selector_all``, ``reload`` and
    ``wait_for_load_state`` -- presumably a Playwright async ``Page``
    (TODO confirm against the caller).
    """

    def __init__(
        self,
        seed: str = "default_seed",
        num_variations: int = 10,
        target_os: str = "windows",
        db_path: str = "jobs.db",
        markdown_path: str = "scraped_jobs.md",
        proxies: Optional[List[str]] = None,
        login_credentials: Optional[Dict[str, str]] = None,
        search_keywords: Optional[str] = None
    ):
        """Configure the engine.

        Args:
            seed: Text hashed with SHA-256 to derive every seed-dependent
                value (profile choices, canvas/audio noise offsets).
            num_variations: Stored as ``self.num_variations``; not read by
                any method of this class.
            target_os: ``'windows'`` or ``'macos'``; selects the OS passed to
                the fingerprint generator.
            db_path: Stored as ``self.db_path``; not read by any method of
                this class.
            markdown_path: Stored as ``self.markdown_path``; not read by any
                method of this class.
            proxies: Proxy strings; ``None`` or an empty list becomes ``[]``.
            login_credentials: ``{"username": ..., "password": ...}``. When
                ``None``, the SCRAPING_USERNAME / SCRAPING_PASSWORD
                environment variables are used if both are set.
            search_keywords: Stored as ``self.search_keywords``.

        Raises:
            ValueError: If ``target_os`` is not ``'windows'`` or ``'macos'``.
        """
        if target_os not in ('windows', 'macos'):
            # The message names the actual parameter (it used to say
            # "operating_system", which is not an argument of this class).
            raise ValueError("target_os must be 'windows' or 'macos'")

        # Load credentials from the environment if not provided
        if login_credentials is None:
            username = os.getenv("SCRAPING_USERNAME")
            password = os.getenv("SCRAPING_PASSWORD")
            if username and password:
                login_credentials = {
                    "username": username, "password": password}

        self.seed = seed
        self.os = target_os
        # Previously accepted and silently discarded; keep it available.
        self.db_path = db_path
        self.markdown_path = markdown_path
        self.proxies = proxies or []
        self.login_credentials = login_credentials
        self.search_keywords = search_keywords
        self.fingerprint_generator = FingerprintGenerator(
            browser=('chrome',),
            os=(self.os,)
        )

        self.num_variations = num_variations
        self.common_renderers = {
            'windows': [
                "ANGLE (Intel, Intel(R) UHD Graphics 630 (0x00003E9B) Direct3D11 vs_5_0 ps_5_0, D3D11)",
                "ANGLE (Intel, Intel(R) UHD Graphics (0x00009A49) Direct3D11 vs_5_0 ps_5_0, D3D11)",
                "ANGLE (Intel(R) Iris(TM) Graphics 540 Direct3D11 vs_5_0 ps_5_0)",
                "ANGLE (Intel, Intel(R) UHD Graphics 620 (0x00005916) Direct3D11 vs_5_0 ps_5_0, D3D11)",
                "ANGLE (Intel, Intel(R) HD Graphics 530 (0x0000191B) Direct3D11 vs_5_0 ps_5_0, D3D11)",
                "ANGLE (Intel, Intel(R) UHD Graphics 600 (0x00003180) Direct3D11 vs_5_0 ps_5_0, D3D11)",
                "ANGLE (Intel, Intel(R) Iris(R) Xe Graphics (0x00009A49) Direct3D11 vs_5_0 ps_5_0, D3D11)",
            ],
            'macos': [
                "Intel HD Graphics 530 OpenGL Engine",
                "Intel Iris Graphics 6100 OpenGL Engine",
                "Intel UHD Graphics 630 OpenGL Engine",
                "Intel HD Graphics 4000 OpenGL Engine",
                "Intel Iris Pro OpenGL Engine",
                "Intel UHD Graphics 617 OpenGL Engine",
            ]
        }
        self.common_vendors = ["Intel Inc.", "Intel", "Intel Corporation"]

    def _seed_hash(self) -> int:
        """Return ``self.seed`` hashed with SHA-256 as a non-negative int."""
        return int(hashlib.sha256(self.seed.encode()).hexdigest(), 16)

    def _select_profile(self):
        """Generate a fingerprint profile that is reproducible for ``self.seed``.

        The global ``random`` module is seeded only while the profile is being
        built and its previous state is restored afterwards. Leaving it seeded
        would turn every later "random" delay in this class into the same
        fixed sequence on every run.

        Returns:
            The object produced by ``self.fingerprint_generator.generate()``
            with ``navigator.hardwareConcurrency`` and
            ``navigator.deviceMemory`` overwritten by seeded choices.
        """
        saved_state = random.getstate()
        # Seed the module-level generator (rather than a private Random) so
        # that a generator drawing from the global `random` stays seeded too
        # -- assumption about browserforge internals, TODO confirm.
        random.seed(self._seed_hash())
        try:
            profile = self.fingerprint_generator.generate()
            profile.navigator.hardwareConcurrency = random.choice([4, 8, 12, 16])
            profile.navigator.deviceMemory = random.choice([4, 8])
        finally:
            random.setstate(saved_state)
        return profile

    def _get_spoof_script(self, renderer: str, vendor: str):
        """Build the JavaScript init snippet that masks fingerprint surfaces.

        The snippet overrides ``getContext`` for ``webgl`` /
        ``experimental-webgl`` contexts (vendor, renderer and shader precision
        reporting), ``toDataURL`` (seeded pixel noise), ``AudioContext``
        analyser output, ``queryLocalFonts`` and removes
        ``navigator.webdriver``.

        Args:
            renderer: Value reported for parameter 0x9246
                (UNMASKED_RENDERER_WEBGL).
            vendor: Value reported for parameter 0x9245
                (UNMASKED_VENDOR_WEBGL).

        Returns:
            The script source as a string; identical for identical
            ``self.seed``, ``renderer`` and ``vendor``.
        """
        seed_hash = self._seed_hash()
        canvas_offset = seed_hash % 100000000
        audio_offset = seed_hash % 1000
        # json.dumps emits a quoted, fully escaped literal that is also valid
        # JavaScript, so quotes or backslashes in the inputs cannot terminate
        # the string early and corrupt (or inject into) the script.
        vendor_js = json.dumps(vendor)
        renderer_js = json.dumps(renderer)
        return f"""
        (function() {{
            const originalGetContext = HTMLCanvasElement.prototype.getContext;
            HTMLCanvasElement.prototype.getContext = function(type, attributes) {{
                if (type === 'webgl' || type === 'experimental-webgl') {{
                    const ctx = originalGetContext.call(this, type, attributes);
                    if (ctx) {{
                        const originalGetParameter = ctx.getParameter.bind(ctx);
                        ctx.getParameter = function(pname) {{
                            if (pname === 0x9245) {{ return {vendor_js}; }}
                            if (pname === 0x9246) {{ return {renderer_js}; }}
                            return originalGetParameter(pname);
                        }};
                        const originalGetShaderPrecisionFormat = ctx.getShaderPrecisionFormat.bind(ctx);
                        ctx.getShaderPrecisionFormat = function(shadertype, precisiontype) {{
                            const format = originalGetShaderPrecisionFormat(shadertype, precisiontype);
                            if (precisiontype === ctx.HIGH_FLOAT) {{
                                format.rangeMin = 127; format.rangeMax = 127; format.precision = 23;
                            }}
                            if (precisiontype === ctx.MEDIUM_FLOAT) {{
                                format.rangeMin = 62; format.rangeMax = 62; format.precision = 14;
                            }}
                            return format;
                        }};
                    }}
                    return ctx;
                }}
                return originalGetContext.call(this, type, attributes);
            }};
            const originalToDataURL = HTMLCanvasElement.prototype.toDataURL;
            HTMLCanvasElement.prototype.toDataURL = function(type, encoderOptions) {{
                const ctx = this.getContext('2d');
                if (ctx) {{
                    const imageData = ctx.getImageData(0, 0, this.width, this.height);
                    for (let i = 0; i < imageData.data.length; i += 4) {{
                        const noise = (Math.sin({canvas_offset} + i) * 0.5 + 0.5) * 2 - 1;
                        imageData.data[i] = Math.min(255, Math.max(0, imageData.data[i] + noise));
                        imageData.data[i+1] = Math.min(255, Math.max(0, imageData.data[i+1] + noise));
                        imageData.data[i+2] = Math.min(255, Math.max(0, imageData.data[i+2] + noise));
                    }}
                    ctx.putImageData(imageData, 0, 0);
                }}
                return originalToDataURL.call(this, type, encoderOptions);
            }};
            const originalAudioContext = window.AudioContext || window.webkitAudioContext;
            if (originalAudioContext) {{
                const AudioContextOverride = function() {{
                    const ctx = new originalAudioContext();
                    const originalCreateAnalyser = ctx.createAnalyser;
                    ctx.createAnalyser = function() {{
                        const analyser = originalCreateAnalyser.call(ctx);
                        analyser.getByteFrequencyData = function(array) {{
                            for (let i = 0; i < array.length; i++) {{
                                array[i] = Math.floor(Math.sin(i * 0.1 + {audio_offset}) * 128 + 128);
                            }}
                        }};
                        analyser.getFloatFrequencyData = function(array) {{
                            for (let i = 0; i < array.length; i++) {{
                                array[i] = Math.sin(i * 0.1 + {audio_offset}) * 100 - 100;
                            }}
                        }};
                        return analyser;
                    }};
                    return ctx;
                }};
                window.AudioContext = AudioContextOverride;
                window.webkitAudioContext = AudioContextOverride;
            }}
            const originalQueryFonts = window.queryLocalFonts;
            if (originalQueryFonts) {{
                window.queryLocalFonts = async function() {{
                    return [
                        {{family: "Arial", style: "normal", weight: "400"}},
                        {{family: "Times New Roman", style: "normal", weight: "400"}},
                        {{family: "Courier New", style: "normal", weight: "400"}}
                    ];
                }};
            }}
            // Remove bot indicators
            delete navigator.__proto__.webdriver;
            window.chrome = {{ runtime: {{}} }};
        }})();
        """

    async def _human_like_scroll(self, page):
        """Scroll down the page in irregular steps with irregular pauses.

        The page height is read once before scrolling starts, so content that
        grows while scrolling is only covered up to the initial height.
        """
        scroll_height = await page.evaluate("document.body.scrollHeight")
        scrolled = 0
        while scrolled < scroll_height:
            step = random.randint(50, 300)
            await page.evaluate(f"window.scrollBy(0, {step})")
            await asyncio.sleep(random.uniform(0.1, 0.8))
            scrolled += step
            # Roughly one step in ten is followed by a longer pause.
            if random.random() < 0.1:
                await asyncio.sleep(random.uniform(1, 3))

    async def _simulate_human_interaction(self, page):
        """Occasionally (about 30% of calls) hover a random link/button/input.

        Best effort: failures while querying or hovering are ignored.
        """
        try:
            elements = await page.query_selector_all("a, button, input")
            if elements and random.random() < 0.3:
                target = random.choice(elements)
                await target.hover()
                await asyncio.sleep(random.uniform(0.2, 1.0))
        except Exception:
            # Hovering is cosmetic, so ordinary errors are dropped. A bare
            # `except:` would also swallow asyncio.CancelledError and
            # KeyboardInterrupt, making the task impossible to cancel here.
            pass

    async def _detect_cloudflare(self, page) -> bool:
        """Return True when the page HTML contains a Cloudflare challenge marker.

        Markers: the literal ``#cf-chl`` (case-sensitive), or the phrases
        ``checking your browser`` / ``just a moment`` in any letter case.
        """
        content = await page.content()
        if "#cf-chl" in content:
            return True
        lowered = content.lower()
        return "checking your browser" in lowered or "just a moment" in lowered

    async def _handle_cloudflare(self, page, max_retries: int = 3):
        """Wait for a Cloudflare challenge to clear.

        Before each of up to ``max_retries`` waits the page is checked; if no
        challenge is detected the method returns immediately. After the last
        wait the page is checked once more, so a challenge that clears during
        the final wait (or a clean page with ``max_retries=0``) is reported as
        success instead of failure.

        Returns:
            True if no challenge marker is present at the last check, else
            False.
        """
        for attempt in range(max_retries):
            if not await self._detect_cloudflare(page):
                return True
            print(f"☁️ Cloudflare detected - waiting... (attempt {attempt+1})")
            await asyncio.sleep(8 + random.uniform(2, 5))
            await page.wait_for_load_state("load", timeout=60000)
        return not await self._detect_cloudflare(page)

    async def _avoid_captcha(self, page) -> bool:
        """Pause, scroll and maybe hover so the visit looks less automated.

        Returns:
            Always True.
        """
        await asyncio.sleep(2 + random.random() * 3)
        await self._human_like_scroll(page)
        await self._simulate_human_interaction(page)
        await asyncio.sleep(3 + random.random() * 2)
        return True

    async def _has_captcha_markers(self, page) -> bool:
        """Return True if the page HTML mentions "captcha" or "cloudflare"."""
        lowered = (await page.content()).lower()
        return "captcha" in lowered or "cloudflare" in lowered

    async def _solve_captcha_fallback(self, page) -> bool:
        """Wait out a CAPTCHA page, reloading once if it persists.

        Sleeps 15-25 seconds and checks the page; if markers remain, reloads,
        runs ``_avoid_captcha`` and checks again.

        Returns:
            True if neither "captcha" nor "cloudflare" appears in the page
            HTML at either check, else False.
        """
        await asyncio.sleep(15 + random.random() * 10)
        if not await self._has_captcha_markers(page):
            return True

        print("🔄 Refreshing session to bypass CAPTCHA...")
        await page.reload()
        await self._avoid_captcha(page)
        return not await self._has_captcha_markers(page)
|
||||||
Loading…
x
Reference in New Issue
Block a user