Vibe-coded a CRC reverse-engineering tool on the GPU, because why not.
This commit is contained in:
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
target
|
||||
Cargo.lock
|
||||
9
Cargo.toml
Normal file
9
Cargo.toml
Normal file
@@ -0,0 +1,9 @@
|
||||
[package]
|
||||
name = "crc_cracker"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
wgpu = "0.20"
|
||||
bytemuck = { version = "1", features = ["derive"] }
|
||||
pollster = "0.3"
|
||||
156
src/cracker.rs
Normal file
156
src/cracker.rs
Normal file
@@ -0,0 +1,156 @@
|
||||
use std::ops::RangeInclusive;
|
||||
|
||||
/// One observed (message, checksum) sample that every candidate CRC
/// configuration must reproduce.
///
/// Supplying several samples quickly eliminates coincidental matches.
pub struct TestCase {
    /// Raw message bytes fed into the CRC (may be a fragment of a larger frame).
    pub data: Vec<u8>,
    /// The checksum observed for `data`; compared only under `mask`.
    pub expected: u32,
    /// Bit mask selecting which CRC bits are actually known.
    ///
    /// Use `u32::MAX` (or the width mask) when the full value was captured,
    /// or e.g. `0x00FF` when only the low byte of the CRC field is visible.
    pub mask: u32,
}

impl TestCase {
    /// Build a test case where every bit of the CRC is known.
    pub fn exact(data: Vec<u8>, expected: u32) -> Self {
        Self::masked(data, expected, u32::MAX)
    }

    /// Build a test case where only the bits selected by `mask` are known.
    pub fn masked(data: Vec<u8>, expected: u32, mask: u32) -> Self {
        Self { data, expected, mask }
    }
}
|
||||
|
||||
/// Constraint on a single CRC parameter.
#[derive(Clone, Debug)]
pub enum ValueRange {
    /// Exactly one value to try
    Fixed(u32),
    /// Try every value in this inclusive range
    Range(RangeInclusive<u32>),
}

impl ValueRange {
    /// Smallest value that will be tried.
    ///
    /// For an empty `Range` this still returns `*start()`; `count()` is 0 in
    /// that case, so the value is never actually used by the search.
    pub fn min(&self) -> u32 {
        match self {
            Self::Fixed(v) => *v,
            Self::Range(r) => *r.start(),
        }
    }

    /// Number of distinct values this constraint covers.
    ///
    /// Fixes two arithmetic hazards in the naive `end - start + 1`:
    /// - an empty range (start > end) now returns 0 instead of underflowing;
    /// - a range spanning every `u32` value (`0..=u32::MAX`, count 2^32)
    ///   panics with a clear message instead of silently wrapping to 0 in
    ///   release builds and corrupting the GPU dispatch sizing.
    pub fn count(&self) -> u32 {
        match self {
            Self::Fixed(_) => 1,
            Self::Range(r) if r.is_empty() => 0,
            Self::Range(r) => (*r.end() - *r.start())
                .checked_add(1)
                .expect("range spans every u32 value; count does not fit in u32"),
        }
    }
}
|
||||
|
||||
/// Constraints for the brute-force search.
|
||||
///
|
||||
/// The GPU will try every combination within these bounds, testing each
|
||||
/// candidate against all provided `TestCase`s. Only configurations that
|
||||
/// pass every test case are returned.
|
||||
pub struct Constraints {
|
||||
/// CRC register width in bits — currently 8, 16, or 32.
|
||||
pub width: u8,
|
||||
/// Generator polynomial (without the implicit leading 1).
|
||||
pub poly: ValueRange,
|
||||
/// Initial value loaded into the CRC register.
|
||||
pub init: ValueRange,
|
||||
/// Value XOR'd into the final CRC before output.
|
||||
pub xorout: ValueRange,
|
||||
/// Whether each input byte is bit-reversed before entering the CRC register.
|
||||
///
|
||||
/// - `Some(false)` — MSB-first (textbook CRCs, parallel-bus / memory protocols)
|
||||
/// - `Some(true)` — LSB-first (most serial-line protocols: Ethernet, USB, CAN,
|
||||
/// Bluetooth, Dallas 1-Wire — bits arrive LSB-first on the wire)
|
||||
/// - `None` — unknown; the GPU tries both and returns whichever matches
|
||||
pub refin: Option<bool>,
|
||||
|
||||
/// Whether the final CRC register value is bit-reversed before the xorout step.
|
||||
///
|
||||
/// - `Some(false)` — register used as-is
|
||||
/// - `Some(true)` — register is reflected before XOR-out
|
||||
/// - `None` — unknown; the GPU tries both
|
||||
///
|
||||
/// `refin` and `refout` are almost always equal.
|
||||
/// `(true, true)` covers most real-world CRC-16/32 algorithms (CRC-32/ISO-HDLC,
|
||||
/// CRC-16/ARC, CRC-16/IBM, …).
|
||||
/// `(false, false)` covers most CRC-8 variants and simple checksum algorithms.
|
||||
pub refout: Option<bool>,
|
||||
}
|
||||
|
||||
impl Constraints {
|
||||
pub fn refin_start(&self) -> u32 {
|
||||
match self.refin {
|
||||
Some(true) => 1,
|
||||
_ => 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn refin_count(&self) -> u32 {
|
||||
if self.refin.is_none() { 2 } else { 1 }
|
||||
}
|
||||
|
||||
pub fn refout_start(&self) -> u32 {
|
||||
match self.refout {
|
||||
Some(true) => 1,
|
||||
_ => 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn refout_count(&self) -> u32 {
|
||||
if self.refout.is_none() { 2 } else { 1 }
|
||||
}
|
||||
|
||||
/// Total number of (poly, init, xorout, refin, refout) combinations.
|
||||
pub fn total_combinations(&self) -> u64 {
|
||||
self.poly.count() as u64
|
||||
* self.init.count() as u64
|
||||
* self.xorout.count() as u64
|
||||
* self.refin_count() as u64
|
||||
* self.refout_count() as u64
|
||||
}
|
||||
|
||||
pub fn mask(&self) -> u32 {
|
||||
if self.width == 32 {
|
||||
u32::MAX
|
||||
} else {
|
||||
(1u32 << self.width) - 1
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A matching CRC configuration found during the search.
#[derive(Debug, Clone, Copy)]
pub struct CrcParams {
    pub poly: u32,
    pub init: u32,
    pub xorout: u32,
    pub refin: bool,
    pub refout: bool,
}

impl CrcParams {
    /// Render this configuration as a one-line, human-readable descriptor.
    ///
    /// Hex fields are zero-padded to ceil(width / 4) digits, so a CRC-13
    /// value prints with 4 digits and a CRC-5 value with 2. `refin` is
    /// left-padded to 5 columns so consecutive lines stay aligned.
    pub fn display(&self, width: u8) -> String {
        let digits = (usize::from(width) + 3) / 4;
        format!(
            "CRC-{w}: poly=0x{p:0>d$X} init=0x{i:0>d$X} xorout=0x{x:0>d$X} refin={ri:<5} refout={ro}",
            w = width,
            p = self.poly,
            i = self.init,
            x = self.xorout,
            ri = self.refin,
            ro = self.refout,
            d = digits,
        )
    }
}
|
||||
531
src/gpu.rs
Normal file
531
src/gpu.rs
Normal file
@@ -0,0 +1,531 @@
|
||||
use std::collections::VecDeque;
|
||||
use std::io::Write;
|
||||
|
||||
use bytemuck::{Pod, Zeroable};
|
||||
use wgpu::util::DeviceExt;
|
||||
|
||||
use crate::cracker::{Constraints, CrcParams, TestCase};
|
||||
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
// ─── GPU-side layouts (must match shader.wgsl exactly) ───────────────────────

/// Uniform parameter block handed to the compute shader.
///
/// Field order and types must stay in lockstep with `SearchParams` in
/// `shader.wgsl`; `_pad` keeps the struct size a multiple of 16 bytes for
/// WGSL uniform-buffer layout rules.
#[repr(C)]
#[derive(Clone, Copy, Pod, Zeroable)]
struct SearchParams {
    width: u32,            // CRC register width in bits
    mask: u32,             // (1 << width) - 1, precomputed on the CPU
    poly_min: u32,         // first polynomial value of this batch
    poly_count: u32,       // number of polynomials in this batch
    init_min: u32,
    init_count: u32,
    xorout_min: u32,
    xorout_count: u32,
    refin_start: u32,      // 0 or 1 — first reflect-input variant to try
    refin_count: u32,      // 1 (fixed) or 2 (try both)
    refout_start: u32,
    refout_count: u32,
    threads_per_row: u32,  // dispatch_x * workgroup size, for 2-D→1-D id math
    max_results: u32,      // capacity of the results buffer, in entries
    test_case_count: u32,
    _pad: u32,
}

/// One test case as seen by the shader: a slice into the flattened message
/// buffer plus the expected CRC and its known-bits mask.
#[repr(C)]
#[derive(Clone, Copy, Pod, Zeroable)]
struct GpuTestCase {
    data_offset: u32, // byte offset into the shared data buffer
    data_len: u32,    // message length in bytes
    expected: u32,
    mask: u32,
}

/// One match written by the shader (mirrors `ResultEntry` in shader.wgsl).
#[repr(C)]
#[derive(Clone, Copy, Pod, Zeroable)]
struct ResultEntry {
    poly: u32,
    init: u32,
    xorout: u32,
    flags: u32, // bit 0 = refin, bit 1 = refout
}

const RESULT_SIZE: usize = std::mem::size_of::<ResultEntry>();

// ─── Constants ────────────────────────────────────────────────────────────────

const WORKGROUP_SIZE: u32 = 256;
// wgpu default limit: each dispatch dimension is capped at 65 535 workgroups.
const MAX_DISPATCH_DIM: u32 = 65_535;
// Thread ids are computed in u32 on the GPU, so one batch may not exceed this.
const MAX_BATCH_THREADS: u64 = u32::MAX as u64;

/// Result buffer ceiling per buffer slot.
/// Two slots are allocated, so GPU VRAM usage for results = 2 × this × 16 B.
/// 8 M × 2 × 16 B = 256 MB — comfortably fits in any modern discrete GPU.
const RESULTS_PER_BATCH: u32 = 8_000_000;

// ─── Per-slot GPU buffers ─────────────────────────────────────────────────────

/// One half of the double-buffer: device-local result storage plus the
/// CPU-mappable staging buffers it is drained through.
struct Slot {
    results_buf: wgpu::Buffer,     // STORAGE, written by the shader
    count_buf: wgpu::Buffer,       // atomic match counter (4 bytes)
    results_staging: wgpu::Buffer, // MAP_READ copy of results_buf
    count_staging: wgpu::Buffer,   // MAP_READ copy of count_buf
}
|
||||
|
||||
// ─── GPU state ────────────────────────────────────────────────────────────────

/// Long-lived GPU context: device/queue plus the compiled CRC search
/// pipeline. Create once with [`GpuCracker::new`], then call `run` per search.
pub struct GpuCracker {
    device: wgpu::Device,
    queue: wgpu::Queue,
    pipeline: wgpu::ComputePipeline,
    bind_group_layout: wgpu::BindGroupLayout,
}
||||
|
||||
impl GpuCracker {
|
||||
    /// Initialize wgpu, pick the highest-performance adapter, and compile the
    /// CRC compute pipeline from `shader.wgsl`.
    ///
    /// Panics if no GPU adapter is available or the device cannot be opened.
    pub async fn new() -> Self {
        let instance = wgpu::Instance::default();

        let adapter = instance
            .request_adapter(&wgpu::RequestAdapterOptions {
                power_preference: wgpu::PowerPreference::HighPerformance,
                force_fallback_adapter: false,
                compatible_surface: None, // compute only — no window surface
            })
            .await
            .expect("no GPU adapter found");

        let info = adapter.get_info();
        println!("GPU: {} ({:?})", info.name, info.backend);

        let (device, queue) = adapter
            .request_device(
                &wgpu::DeviceDescriptor {
                    label: Some("crc_cracker"),
                    required_features: wgpu::Features::empty(),
                    required_limits: wgpu::Limits::default(),
                },
                None, // no API call trace
            )
            .await
            .expect("failed to open GPU device");

        let shader = device.create_shader_module(wgpu::ShaderModuleDescriptor {
            label: Some("crc_shader"),
            source: wgpu::ShaderSource::Wgsl(include_str!("shader.wgsl").into()),
        });

        // Bindings must mirror shader.wgsl: 0 = message bytes (read-only),
        // 1 = test cases (read-only), 2 = search params (uniform),
        // 3 = results (read-write), 4 = atomic result counter (read-write).
        let bind_group_layout =
            device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
                label: Some("crc_bgl"),
                entries: &[
                    buf_layout(0, true),
                    buf_layout(1, true),
                    uniform_layout(2),
                    buf_layout(3, false),
                    buf_layout(4, false),
                ],
            });

        let pipeline_layout =
            device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
                label: Some("crc_pipeline_layout"),
                bind_group_layouts: &[&bind_group_layout],
                push_constant_ranges: &[],
            });

        let pipeline =
            device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
                label: Some("crc_pipeline"),
                layout: Some(&pipeline_layout),
                module: &shader,
                entry_point: "main",
                compilation_options: wgpu::PipelineCompilationOptions::default(),
            });

        Self { device, queue, pipeline, bind_group_layout }
    }
|
||||
|
||||
// ── Public API ────────────────────────────────────────────────────────────
|
||||
|
||||
/// Brute-force all matching CRC configurations, writing each to `out`.
|
||||
/// Returns the total match count (exact — no results are ever dropped).
|
||||
///
|
||||
/// Pipeline:
|
||||
/// - Work is split into batches (≤ u32::MAX threads each).
|
||||
/// - Two GPU buffer slots alternate (double-buffer).
|
||||
/// - GPU runs batch N while CPU drains batch N-1 to disk — no idle gaps.
|
||||
/// - If a batch's results overflow the buffer it is split into two
|
||||
/// half-batches and re-queued; no count-only pass needed.
|
||||
pub fn run(
|
||||
&self,
|
||||
test_cases: &[TestCase],
|
||||
constraints: &Constraints,
|
||||
out: &mut impl Write,
|
||||
) -> u64 {
|
||||
assert!(!test_cases.is_empty(), "need at least one test case");
|
||||
|
||||
let total = constraints.total_combinations();
|
||||
let poly_count = constraints.poly.count() as u64;
|
||||
let per_poly = total / poly_count;
|
||||
|
||||
let polys_per_batch = ((MAX_BATCH_THREADS / per_poly).max(1) as u32)
|
||||
.min(constraints.poly.count());
|
||||
|
||||
let num_batches = constraints.poly.count().div_ceil(polys_per_batch);
|
||||
|
||||
println!(
|
||||
"search space: {} combinations \
|
||||
({} poly × {} init × {} xorout × {}× refin × {}× refout) \
|
||||
→ {} batch(es) initially",
|
||||
total,
|
||||
constraints.poly.count(),
|
||||
constraints.init.count(),
|
||||
constraints.xorout.count(),
|
||||
constraints.refin_count(),
|
||||
constraints.refout_count(),
|
||||
num_batches,
|
||||
);
|
||||
|
||||
// Shared input buffers — never change between batches.
|
||||
let (data_buf, tc_buf) = self.upload_test_data(test_cases);
|
||||
|
||||
// Work queue — starts with the initial poly-range slices.
|
||||
// Overflow adds split sub-ranges back here.
|
||||
let mut work: VecDeque<(u32, u32)> = {
|
||||
let poly_min = constraints.poly.min();
|
||||
(0..constraints.poly.count())
|
||||
.step_by(polys_per_batch as usize)
|
||||
.map(|off| {
|
||||
let cnt = polys_per_batch.min(constraints.poly.count() - off);
|
||||
(poly_min + off, cnt)
|
||||
})
|
||||
.collect()
|
||||
};
|
||||
|
||||
// Two GPU buffer slots for double-buffering.
|
||||
let slots = [self.create_slot(), self.create_slot()];
|
||||
let mut active = 0usize; // slot to submit the next batch into
|
||||
|
||||
// Currently in-flight submission: (submission_index, slot, poly_min, poly_count)
|
||||
let mut pending: Option<(wgpu::SubmissionIndex, usize, u32, u32)> = None;
|
||||
let mut total_found = 0u64;
|
||||
|
||||
let mut done_work: u64 = 0;
|
||||
let total_work = total;
|
||||
|
||||
let mut last_update = Instant::now();
|
||||
let update_interval = Duration::from_millis(33); // tweak if needed
|
||||
|
||||
loop {
|
||||
// ── Submit next batch (if any) ────────────────────────────────────
|
||||
// Do this BEFORE draining the pending batch so the GPU stays busy.
|
||||
let new_pending = work.pop_front().map(|(poly_min, poly_count)| {
|
||||
let si = self.submit(
|
||||
&slots[active], &data_buf, &tc_buf,
|
||||
constraints, poly_min, poly_count,
|
||||
test_cases.len() as u32,
|
||||
);
|
||||
let result = (si, active, poly_min, poly_count);
|
||||
active = 1 - active;
|
||||
result
|
||||
});
|
||||
|
||||
// ── Drain the previous batch ──────────────────────────────────────
|
||||
// GPU is now running `new_pending` concurrently with this drain.
|
||||
if let Some((p_si, p_slot, p_poly_min, p_poly_count)) = pending.take() {
|
||||
// Schedule mapping BEFORE polling. The callback fires inside the
|
||||
// poll call below, so get_mapped_range() works right after it.
|
||||
let count_slice = slots[p_slot].count_staging.slice(..4);
|
||||
let results_slice = slots[p_slot].results_staging.slice(..);
|
||||
count_slice.map_async(wgpu::MapMode::Read, |_| {});
|
||||
results_slice.map_async(wgpu::MapMode::Read, |_| {});
|
||||
|
||||
// Wait for this batch. GPU continues running new_pending.
|
||||
self.device.poll(wgpu::Maintain::WaitForSubmissionIndex(p_si));
|
||||
|
||||
let raw_count = {
|
||||
let v = count_slice.get_mapped_range();
|
||||
u32::from_le_bytes(v[..4].try_into().unwrap())
|
||||
};
|
||||
slots[p_slot].count_staging.unmap();
|
||||
|
||||
if raw_count > RESULTS_PER_BATCH {
|
||||
// This batch produced more matches than the buffer holds.
|
||||
// Split it into two half-sized poly ranges and re-queue at
|
||||
// the front so they run before the remaining original batches.
|
||||
slots[p_slot].results_staging.unmap();
|
||||
let half = p_poly_count / 2;
|
||||
if half > 0 {
|
||||
work.push_front((p_poly_min + half, p_poly_count - half));
|
||||
work.push_front((p_poly_min, half));
|
||||
} else {
|
||||
// Single poly value with > RESULTS_PER_BATCH matches.
|
||||
// This means almost every (init, xorout, refin, refout)
|
||||
// combination matches — add more test cases to constrain.
|
||||
let capped = RESULTS_PER_BATCH;
|
||||
eprintln!(
|
||||
"warning: poly={:#X} produced {} results, \
|
||||
buffer capped at {capped}. Add more test cases.",
|
||||
p_poly_min, raw_count,
|
||||
);
|
||||
self.write_results(
|
||||
&slots[p_slot].results_staging, capped, constraints.width, out,
|
||||
);
|
||||
total_found += capped as u64;
|
||||
}
|
||||
} else {
|
||||
self.write_results(
|
||||
&slots[p_slot].results_staging, raw_count, constraints.width, out,
|
||||
);
|
||||
total_found += raw_count as u64;
|
||||
}
|
||||
|
||||
let batch_work = p_poly_count as u64
|
||||
* constraints.init.count() as u64
|
||||
* constraints.xorout.count() as u64
|
||||
* constraints.refin_count() as u64
|
||||
* constraints.refout_count() as u64;
|
||||
|
||||
done_work += batch_work;
|
||||
|
||||
if last_update.elapsed() >= update_interval {
|
||||
let progress = done_work as f64 / total_work as f64;
|
||||
let percent = progress * 100.0;
|
||||
|
||||
// simple text bar (cheap, no allocations)
|
||||
let bar_width = 30;
|
||||
let filled = (progress * bar_width as f64) as usize;
|
||||
|
||||
let mut bar = String::with_capacity(bar_width);
|
||||
for i in 0..bar_width {
|
||||
bar.push(if i < filled { '#' } else { '-' });
|
||||
}
|
||||
|
||||
print!("[{}] {:6.2}% ({}/{})\r", bar, percent, done_work, total_work);
|
||||
std::io::stdout().flush().ok();
|
||||
|
||||
last_update = Instant::now();
|
||||
}
|
||||
}
|
||||
|
||||
// ── Advance state ─────────────────────────────────────────────────
|
||||
match new_pending {
|
||||
Some(info) => pending = Some(info),
|
||||
None if pending.is_none() && work.is_empty() => break,
|
||||
None => { /* no new submission, but still pending or overflow work */ }
|
||||
}
|
||||
}
|
||||
|
||||
println!();
|
||||
|
||||
total_found
|
||||
}
|
||||
|
||||
// ── Internals ─────────────────────────────────────────────────────────────
|
||||
|
||||
fn create_slot(&self) -> Slot {
|
||||
let d = &self.device;
|
||||
let results_size = RESULTS_PER_BATCH as u64 * RESULT_SIZE as u64;
|
||||
Slot {
|
||||
results_buf: d.create_buffer(&wgpu::BufferDescriptor {
|
||||
label: Some("results_buf"),
|
||||
size: results_size,
|
||||
usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC,
|
||||
mapped_at_creation: false,
|
||||
}),
|
||||
count_buf: d.create_buffer(&wgpu::BufferDescriptor {
|
||||
label: Some("count_buf"),
|
||||
size: 4,
|
||||
usage: wgpu::BufferUsages::STORAGE
|
||||
| wgpu::BufferUsages::COPY_SRC
|
||||
| wgpu::BufferUsages::COPY_DST,
|
||||
mapped_at_creation: false,
|
||||
}),
|
||||
results_staging: d.create_buffer(&wgpu::BufferDescriptor {
|
||||
label: Some("results_staging"),
|
||||
size: results_size,
|
||||
usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
|
||||
mapped_at_creation: false,
|
||||
}),
|
||||
count_staging: d.create_buffer(&wgpu::BufferDescriptor {
|
||||
label: Some("count_staging"),
|
||||
size: 4,
|
||||
usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
|
||||
mapped_at_creation: false,
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
fn upload_test_data(
|
||||
&self,
|
||||
test_cases: &[TestCase],
|
||||
) -> (wgpu::Buffer, wgpu::Buffer) {
|
||||
let mut flat: Vec<u8> = Vec::new();
|
||||
let mut gpu: Vec<GpuTestCase> = Vec::new();
|
||||
for tc in test_cases {
|
||||
gpu.push(GpuTestCase {
|
||||
data_offset: flat.len() as u32,
|
||||
data_len: tc.data.len() as u32,
|
||||
expected: tc.expected,
|
||||
mask: tc.mask,
|
||||
});
|
||||
flat.extend_from_slice(&tc.data);
|
||||
}
|
||||
while flat.len() % 4 != 0 { flat.push(0); }
|
||||
|
||||
let d = &self.device;
|
||||
(
|
||||
d.create_buffer_init(&wgpu::util::BufferInitDescriptor {
|
||||
label: Some("data_buf"),
|
||||
contents: &flat,
|
||||
usage: wgpu::BufferUsages::STORAGE,
|
||||
}),
|
||||
d.create_buffer_init(&wgpu::util::BufferInitDescriptor {
|
||||
label: Some("tc_buf"),
|
||||
contents: bytemuck::cast_slice(&gpu),
|
||||
usage: wgpu::BufferUsages::STORAGE,
|
||||
}),
|
||||
)
|
||||
}
|
||||
|
||||
    /// Build and submit one compute pass. Returns immediately — the caller
    /// decides when to wait via `WaitForSubmissionIndex`.
    fn submit(
        &self,
        slot: &Slot,
        data_buf: &wgpu::Buffer,
        tc_buf: &wgpu::Buffer,
        constraints: &Constraints,
        poly_min: u32,
        poly_count: u32,
        tc_count: u32,
    ) -> wgpu::SubmissionIndex {
        // Total threads for this batch. The caller sizes batches so this fits
        // in u32 (≤ MAX_BATCH_THREADS), so the cast below cannot truncate.
        let batch_total = poly_count as u64
            * constraints.init.count() as u64
            * constraints.xorout.count() as u64
            * constraints.refin_count() as u64
            * constraints.refout_count() as u64;
        let total_u32 = batch_total as u32;
        let total_wg = total_u32.div_ceil(WORKGROUP_SIZE);
        // Fold the 1-D workgroup count into a 2-D dispatch to respect the
        // 65 535-per-dimension limit; the shader flattens ids back via
        // `threads_per_row`.
        let dispatch_x = total_wg.min(MAX_DISPATCH_DIM);
        let dispatch_y = total_wg.div_ceil(dispatch_x);

        let sp = SearchParams {
            width: constraints.width as u32,
            mask: constraints.mask(),
            poly_min,
            poly_count,
            init_min: constraints.init.min(),
            init_count: constraints.init.count(),
            xorout_min: constraints.xorout.min(),
            xorout_count: constraints.xorout.count(),
            refin_start: constraints.refin_start(),
            refin_count: constraints.refin_count(),
            refout_start: constraints.refout_start(),
            refout_count: constraints.refout_count(),
            threads_per_row: dispatch_x * WORKGROUP_SIZE,
            max_results: RESULTS_PER_BATCH,
            test_case_count: tc_count,
            _pad: 0,
        };

        let d = &self.device;

        // Small uniform buffer, re-created per batch (params change each time).
        let params_buf = d.create_buffer_init(&wgpu::util::BufferInitDescriptor {
            label: Some("params_buf"),
            contents: bytemuck::bytes_of(&sp),
            usage: wgpu::BufferUsages::UNIFORM,
        });

        let bg = d.create_bind_group(&wgpu::BindGroupDescriptor {
            label: Some("crc_bg"),
            layout: &self.bind_group_layout,
            entries: &[
                wgpu::BindGroupEntry { binding: 0, resource: data_buf.as_entire_binding() },
                wgpu::BindGroupEntry { binding: 1, resource: tc_buf.as_entire_binding() },
                wgpu::BindGroupEntry { binding: 2, resource: params_buf.as_entire_binding() },
                wgpu::BindGroupEntry { binding: 3, resource: slot.results_buf.as_entire_binding() },
                wgpu::BindGroupEntry { binding: 4, resource: slot.count_buf.as_entire_binding() },
            ],
        });

        let mut enc = d.create_command_encoder(
            &wgpu::CommandEncoderDescriptor { label: Some("crc_enc") }
        );

        // Reset the atomic match counter before the pass appends to it.
        enc.clear_buffer(&slot.count_buf, 0, None);

        {
            let mut pass = enc.begin_compute_pass(&wgpu::ComputePassDescriptor {
                label: Some("crc_pass"),
                timestamp_writes: None,
            });
            pass.set_pipeline(&self.pipeline);
            pass.set_bind_group(0, &bg, &[]);
            pass.dispatch_workgroups(dispatch_x, dispatch_y, 1);
        }

        // Readback copies are encoded in the same submission, so they are
        // ordered after the compute pass completes.
        let results_copy_size = RESULTS_PER_BATCH as u64 * RESULT_SIZE as u64;
        enc.copy_buffer_to_buffer(&slot.count_buf, 0, &slot.count_staging, 0, 4);
        enc.copy_buffer_to_buffer(&slot.results_buf, 0, &slot.results_staging, 0, results_copy_size);

        self.queue.submit(std::iter::once(enc.finish()))
    }
|
||||
|
||||
/// Read `count` result entries from a slot's staging buffer (which must
|
||||
/// already be mapped) and write them to `out`.
|
||||
fn write_results(
|
||||
&self,
|
||||
results_staging: &wgpu::Buffer,
|
||||
count: u32,
|
||||
width: u8,
|
||||
out: &mut impl Write,
|
||||
) {
|
||||
if count == 0 {
|
||||
results_staging.unmap();
|
||||
return;
|
||||
}
|
||||
let byte_count = count as usize * RESULT_SIZE;
|
||||
{
|
||||
let view = results_staging.slice(..byte_count as u64).get_mapped_range();
|
||||
for r in bytemuck::cast_slice::<u8, ResultEntry>(&view) {
|
||||
writeln!(out, "{}", CrcParams {
|
||||
poly: r.poly,
|
||||
init: r.init,
|
||||
xorout: r.xorout,
|
||||
refin: (r.flags & 1) != 0,
|
||||
refout: (r.flags & 2) != 0,
|
||||
}.display(width))
|
||||
.expect("write to output failed");
|
||||
}
|
||||
}
|
||||
results_staging.unmap();
|
||||
}
|
||||
}
|
||||
|
||||
// ─── Layout helpers ───────────────────────────────────────────────────────────
|
||||
|
||||
fn buf_layout(binding: u32, read_only: bool) -> wgpu::BindGroupLayoutEntry {
|
||||
wgpu::BindGroupLayoutEntry {
|
||||
binding,
|
||||
visibility: wgpu::ShaderStages::COMPUTE,
|
||||
ty: wgpu::BindingType::Buffer {
|
||||
ty: wgpu::BufferBindingType::Storage { read_only },
|
||||
has_dynamic_offset: false,
|
||||
min_binding_size: None,
|
||||
},
|
||||
count: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Bind-group layout entry for a compute-visible uniform buffer.
fn uniform_layout(binding: u32) -> wgpu::BindGroupLayoutEntry {
    let ty = wgpu::BindingType::Buffer {
        ty: wgpu::BufferBindingType::Uniform,
        has_dynamic_offset: false,
        min_binding_size: None,
    };
    wgpu::BindGroupLayoutEntry {
        binding,
        visibility: wgpu::ShaderStages::COMPUTE,
        ty,
        count: None,
    }
}
|
||||
52
src/main.rs
Normal file
52
src/main.rs
Normal file
@@ -0,0 +1,52 @@
|
||||
mod cracker;
|
||||
mod gpu;
|
||||
|
||||
use std::fs::File;
|
||||
use std::io::BufWriter;
|
||||
|
||||
use cracker::{Constraints, TestCase, ValueRange};
|
||||
use gpu::GpuCracker;
|
||||
|
||||
fn main() {
    // pollster drives the async GPU setup (adapter/device requests) to
    // completion on the current thread — no full async runtime needed.
    pollster::block_on(run());
}
|
||||
|
||||
async fn run() {
|
||||
let cracker = GpuCracker::new().await;
|
||||
|
||||
let mut out = BufWriter::new(
|
||||
File::create("results.txt").expect("cannot create results.txt"),
|
||||
);
|
||||
|
||||
let count = cracker.run(
|
||||
&[
|
||||
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x00], 0x94, 0x00FF),
|
||||
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x01], 0x61, 0x00FF),
|
||||
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x02], 0x8B, 0x00FF),
|
||||
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x03], 0x7E, 0x00FF),
|
||||
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x04], 0xAA, 0x00FF),
|
||||
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x05], 0x5F, 0x00FF),
|
||||
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x06], 0xB5, 0x00FF),
|
||||
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x07], 0x40, 0x00FF),
|
||||
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x08], 0xE8, 0x00FF),
|
||||
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x09], 0x1D, 0x00FF),
|
||||
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x0A], 0xF7, 0x00FF),
|
||||
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x0B], 0x02, 0x00FF),
|
||||
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x0C], 0xD6, 0x00FF),
|
||||
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x0D], 0x23, 0x00FF),
|
||||
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x0E], 0xC9, 0x00FF),
|
||||
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x0F], 0x3C, 0x00FF),
|
||||
],
|
||||
&Constraints {
|
||||
width: 13,
|
||||
poly: ValueRange::Range(0x0000..=0x1FFF),
|
||||
init: ValueRange::Range(0x0000..=0x1FFF),
|
||||
xorout: ValueRange::Fixed(0x0000),
|
||||
refin: None,
|
||||
refout: None,
|
||||
},
|
||||
&mut out,
|
||||
);
|
||||
|
||||
println!("found {count} match(es) — written to results.txt");
|
||||
}
|
||||
131
src/shader.wgsl
Normal file
131
src/shader.wgsl
Normal file
@@ -0,0 +1,131 @@
|
||||
// ─── Structs ─────────────────────────────────────────────────────────────────

// Uniform search parameters. Field order and types must match the Rust
// `SearchParams` struct in gpu.rs byte-for-byte.
struct SearchParams {
    width: u32,            // CRC register width in bits
    mask: u32,             // (1 << width) - 1
    poly_min: u32,         // first polynomial of this batch
    poly_count: u32,
    init_min: u32,
    init_count: u32,
    xorout_min: u32,
    xorout_count: u32,
    refin_start: u32,      // 0 or 1 — first reflect-input variant
    refin_count: u32,      // 1 (fixed) or 2 (try both)
    refout_start: u32,
    refout_count: u32,
    threads_per_row: u32,  // dispatch_x * workgroup size, for 2-D id flattening
    max_results: u32,      // capacity of the results array, in entries
    test_case_count: u32,
    _pad: u32,             // keeps the struct a multiple of 16 bytes
}

// One test case: a byte slice into data_buf plus the expected CRC and its
// known-bits mask. Mirrors `GpuTestCase` in gpu.rs.
struct GpuTestCase {
    data_offset: u32,
    data_len: u32,
    expected: u32,
    mask: u32,
}

// One match appended to `results` by a passing thread. Mirrors gpu.rs.
struct ResultEntry {
    poly: u32,
    init: u32,
    xorout: u32,
    flags: u32, // bit 0 = refin, bit 1 = refout
}

// ─── Bindings ─────────────────────────────────────────────────────────────────

// Message bytes, packed 4-per-u32 little-endian (see load_byte).
@group(0) @binding(0) var<storage, read> data_buf: array<u32>;
@group(0) @binding(1) var<storage, read> test_cases: array<GpuTestCase>;
@group(0) @binding(2) var<uniform> params: SearchParams;
@group(0) @binding(3) var<storage, read_write> results: array<ResultEntry>;
// Atomic append cursor into `results`; may run past max_results on overflow
// (the CPU side detects that and splits the batch).
@group(0) @binding(4) var<storage, read_write> result_count: atomic<u32>;
|
||||
|
||||
// ─── Helpers ──────────────────────────────────────────────────────────────────

// Reverse the low `width` bits of `val` (bit 0 becomes bit width-1).
fn reflect_bits(val: u32, width: u32) -> u32 {
    var out = 0u;
    for (var bit = 0u; bit < width; bit++) {
        out = (out << 1u) | ((val >> bit) & 1u);
    }
    return out;
}
|
||||
|
||||
// Fetch byte `byte_idx` from data_buf, which packs 4 bytes per u32
// little-endian (byte 0 lives in bits 0–7).
fn load_byte(byte_idx: u32) -> u32 {
    return (data_buf[byte_idx >> 2u] >> ((byte_idx & 3u) << 3u)) & 0xFFu;
}
|
||||
|
||||
// ─── CRC computation ──────────────────────────────────────────────────────────

// Bit-by-bit CRC — works for any width 1–32.
// Inner loop is branchless: select() compiles to a conditional move with no
// warp divergence.
//
// Reflected (LSB-first) algorithms are realized in MSB-first form by
// reflecting each input byte (refin) and the final register (refout) — the
// standard equivalence, so a single shift-left loop covers both families.
fn compute_crc(
    data_start: u32,   // byte offset of the message in data_buf
    data_len: u32,     // message length in bytes
    poly: u32,         // generator polynomial, no implicit leading 1
    init: u32,         // initial register value
    xorout: u32,       // XOR applied to the final register
    refin: bool,       // reflect each input byte before use
    refout: bool,      // reflect the final register before xorout
) -> u32 {
    let msb_shift = params.width - 1u;
    let mask = params.mask;
    var crc = init & mask;

    for (var i = 0u; i < data_len; i++) {
        var byte = load_byte(data_start + i);
        if refin { byte = reflect_bits(byte, 8u); }

        // Classic MSB-first update: feed message bits from bit 7 down;
        // XOR in the polynomial whenever the outgoing MSB ≠ incoming bit.
        for (var b = 0u; b < 8u; b++) {
            let feedback = select(0u, poly, ((crc >> msb_shift) ^ ((byte >> (7u - b)) & 1u)) != 0u);
            crc = ((crc << 1u) & mask) ^ feedback;
        }
    }

    if refout { crc = reflect_bits(crc, params.width); }
    return (crc ^ xorout) & mask;
}
|
||||
|
||||
// ─── Entry point ─────────────────────────────────────────────────────────────

// One thread per (poly, init, xorout, refin, refout) candidate.
// The candidate index is decomposed in mixed radix with refout varying
// fastest and poly slowest — the mirror of how gpu.rs sizes batches.
@compute @workgroup_size(256, 1, 1)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
    // Flatten the 2-D dispatch back into a linear thread id.
    let thread_id = gid.y * params.threads_per_row + gid.x;

    // Mixed-radix place values: refout is the least-significant digit.
    let stride_refout = 1u;
    let stride_refin = params.refout_count;
    let stride_xorout = params.refin_count * stride_refin;
    let stride_init = params.xorout_count * stride_xorout;
    let stride_poly = params.init_count * stride_init;

    // Threads beyond the candidate space (dispatch rounding) do nothing.
    if thread_id >= stride_poly * params.poly_count { return; }

    // Peel off one digit per parameter, most-significant first.
    let poly_idx = thread_id / stride_poly;
    let r1 = thread_id % stride_poly;
    let init_idx = r1 / stride_init;
    let r2 = r1 % stride_init;
    let xorout_idx = r2 / stride_xorout;
    let r3 = r2 % stride_xorout;
    let refin_idx = r3 / stride_refin;
    let refout_idx = r3 % stride_refin;

    let poly = (params.poly_min + poly_idx) & params.mask;
    let init = (params.init_min + init_idx) & params.mask;
    let xorout = (params.xorout_min + xorout_idx) & params.mask;
    let refin = (params.refin_start + refin_idx) != 0u;
    let refout = (params.refout_start + refout_idx) != 0u;

    // Reject the candidate on the first test case it fails.
    for (var t = 0u; t < params.test_case_count; t++) {
        let tc = test_cases[t];
        let got = compute_crc(tc.data_offset, tc.data_len, poly, init, xorout, refin, refout);
        if (got & tc.mask) != (tc.expected & tc.mask) { return; }
    }

    // All test cases passed — append this configuration. The counter may
    // exceed max_results; the CPU detects that and splits the batch.
    let idx = atomicAdd(&result_count, 1u);
    if idx < params.max_results {
        results[idx] = ResultEntry(poly, init, xorout, u32(refin) | (u32(refout) << 1u));
    }
}
|
||||
Reference in New Issue
Block a user