vibe coded a CRC reverse-engineering tool on the GPU, because why not

2026-04-12 23:05:19 +02:00
commit 973b4dba18
6 changed files with 881 additions and 0 deletions

.gitignore vendored Normal file

@@ -0,0 +1,2 @@
target
Cargo.lock

Cargo.toml Normal file

@@ -0,0 +1,9 @@
[package]
name = "crc_cracker"
version = "0.1.0"
edition = "2021"
[dependencies]
wgpu = "0.20"
bytemuck = { version = "1", features = ["derive"] }
pollster = "0.3"

src/cracker.rs Normal file

@@ -0,0 +1,156 @@
use std::ops::RangeInclusive;
/// A known input/output pair used to constrain the search.
/// Add multiple test cases to dramatically reduce false positives.
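///
/// Illustrative sketch (the bytes and CRC values below are made up):
///
/// ```ignore
/// // Full CRC observed:
/// let a = TestCase::exact(vec![0x01, 0x02, 0x03], 0x1D0F);
/// // Only the low byte of a 16-bit CRC field was captured:
/// let b = TestCase::masked(vec![0x01, 0x02, 0x03], 0x000F, 0x00FF);
/// ```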
pub struct TestCase {
/// Input bytes (can be a partial slice of a larger message)
pub data: Vec<u8>,
/// The CRC value to match against (only bits set in `mask` are checked)
pub expected: u32,
/// Which bits of the CRC are known.
/// `u32::MAX` (or the width mask) means all bits must match.
/// `0x00FF` means only the low byte must match, useful when you can only
/// observe part of the CRC field.
pub mask: u32,
}
impl TestCase {
/// All bits of the CRC must match.
pub fn exact(data: Vec<u8>, expected: u32) -> Self {
Self { data, expected, mask: u32::MAX }
}
/// Only the bits set in `mask` must match.
pub fn masked(data: Vec<u8>, expected: u32, mask: u32) -> Self {
Self { data, expected, mask }
}
}
/// Constraint on a single CRC parameter.
#[derive(Clone, Debug)]
pub enum ValueRange {
/// Exactly one value to try
Fixed(u32),
/// Try every value in this inclusive range
Range(RangeInclusive<u32>),
}
impl ValueRange {
pub fn min(&self) -> u32 {
match self {
Self::Fixed(v) => *v,
Self::Range(r) => *r.start(),
}
}
/// Number of values to try. Note: a full `0..=u32::MAX` range would
/// overflow this `u32` count.
pub fn count(&self) -> u32 {
match self {
Self::Fixed(_) => 1,
Self::Range(r) => r.end() - r.start() + 1,
}
}
}
/// Constraints for the brute-force search.
///
/// The GPU will try every combination within these bounds, testing each
/// candidate against all provided `TestCase`s. Only configurations that
/// pass every test case are returned.
pub struct Constraints {
/// CRC register width in bits — any width from 1 to 32 works
/// (the example search in main.rs uses a 13-bit CRC).
pub width: u8,
/// Generator polynomial (without the implicit leading 1).
pub poly: ValueRange,
/// Initial value loaded into the CRC register.
pub init: ValueRange,
/// Value XOR'd into the final CRC before output.
pub xorout: ValueRange,
/// Whether each input byte is bit-reversed before entering the CRC register.
///
/// - `Some(false)` — MSB-first (textbook CRCs, CAN, parallel-bus / memory protocols)
/// - `Some(true)` — LSB-first (most serial-line protocols: Ethernet, USB,
/// Bluetooth, Dallas 1-Wire — bits arrive LSB-first on the wire)
/// - `None` — unknown; the GPU tries both and returns whichever matches
pub refin: Option<bool>,
/// Whether the final CRC register value is bit-reversed before the xorout step.
///
/// - `Some(false)` — register used as-is
/// - `Some(true)` — register is reflected before XOR-out
/// - `None` — unknown; the GPU tries both
///
/// `refin` and `refout` are almost always equal.
/// `(true, true)` covers most real-world CRC-16/32 algorithms (CRC-32/ISO-HDLC,
/// CRC-16/ARC (a.k.a. CRC-16/IBM), CRC-16/MODBUS, …).
/// `(false, false)` covers most CRC-8 variants and simple checksum algorithms.
pub refout: Option<bool>,
}
impl Constraints {
pub fn refin_start(&self) -> u32 {
match self.refin {
Some(true) => 1,
_ => 0,
}
}
pub fn refin_count(&self) -> u32 {
if self.refin.is_none() { 2 } else { 1 }
}
pub fn refout_start(&self) -> u32 {
match self.refout {
Some(true) => 1,
_ => 0,
}
}
pub fn refout_count(&self) -> u32 {
if self.refout.is_none() { 2 } else { 1 }
}
/// Total number of (poly, init, xorout, refin, refout) combinations.
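///
/// Worked example, using the search configured in main.rs:
/// 0x2000 polys × 0x2000 inits × 1 xorout × 2 refin × 2 refout
/// = 8192 * 8192 * 1 * 2 * 2 = 268_435_456 combinations.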
pub fn total_combinations(&self) -> u64 {
self.poly.count() as u64
* self.init.count() as u64
* self.xorout.count() as u64
* self.refin_count() as u64
* self.refout_count() as u64
}
pub fn mask(&self) -> u32 {
if self.width == 32 {
u32::MAX
} else {
(1u32 << self.width) - 1
}
}
}
/// A matching CRC configuration found during the search.
#[derive(Debug, Clone, Copy)]
pub struct CrcParams {
pub poly: u32,
pub init: u32,
pub xorout: u32,
pub refin: bool,
pub refout: bool,
}
impl CrcParams {
/// Format as a human-readable CRC descriptor.
pub fn display(&self, width: u8) -> String {
// ceil(width / 4) hex digits so CRC-13 shows 4 digits, CRC-5 shows 2, etc.
let hex_w = ((width + 3) / 4) as usize;
format!(
"CRC-{}: poly=0x{:0>w$X} init=0x{:0>w$X} xorout=0x{:0>w$X} refin={:<5} refout={}",
width,
self.poly,
self.init,
self.xorout,
self.refin,
self.refout,
w = hex_w,
)
}
}

src/gpu.rs Normal file

@@ -0,0 +1,531 @@
use std::collections::VecDeque;
use std::io::Write;
use bytemuck::{Pod, Zeroable};
use wgpu::util::DeviceExt;
use crate::cracker::{Constraints, CrcParams, TestCase};
use std::time::{Duration, Instant};
// ─── GPU-side layouts (must match shader.wgsl exactly) ───────────────────────
#[repr(C)]
#[derive(Clone, Copy, Pod, Zeroable)]
struct SearchParams {
width: u32,
mask: u32,
poly_min: u32,
poly_count: u32,
init_min: u32,
init_count: u32,
xorout_min: u32,
xorout_count: u32,
refin_start: u32,
refin_count: u32,
refout_start: u32,
refout_count: u32,
threads_per_row: u32,
max_results: u32,
test_case_count: u32,
_pad: u32,
}
#[repr(C)]
#[derive(Clone, Copy, Pod, Zeroable)]
struct GpuTestCase {
data_offset: u32,
data_len: u32,
expected: u32,
mask: u32,
}
#[repr(C)]
#[derive(Clone, Copy, Pod, Zeroable)]
struct ResultEntry {
poly: u32,
init: u32,
xorout: u32,
flags: u32,
}
const RESULT_SIZE: usize = std::mem::size_of::<ResultEntry>();
// ─── Constants ────────────────────────────────────────────────────────────────
const WORKGROUP_SIZE: u32 = 256;
const MAX_DISPATCH_DIM: u32 = 65_535;
const MAX_BATCH_THREADS: u64 = u32::MAX as u64;
/// Result buffer ceiling per buffer slot.
/// Two slots are allocated, so GPU VRAM usage for results = 2 × this × 16 B.
/// 8 M × 2 × 16 B = 256 MB — comfortably fits in any modern discrete GPU.
const RESULTS_PER_BATCH: u32 = 8_000_000;
// ─── Per-slot GPU buffers ─────────────────────────────────────────────────────
struct Slot {
results_buf: wgpu::Buffer,
count_buf: wgpu::Buffer,
results_staging: wgpu::Buffer,
count_staging: wgpu::Buffer,
}
// ─── GPU state ────────────────────────────────────────────────────────────────
pub struct GpuCracker {
device: wgpu::Device,
queue: wgpu::Queue,
pipeline: wgpu::ComputePipeline,
bind_group_layout: wgpu::BindGroupLayout,
}
impl GpuCracker {
pub async fn new() -> Self {
let instance = wgpu::Instance::default();
let adapter = instance
.request_adapter(&wgpu::RequestAdapterOptions {
power_preference: wgpu::PowerPreference::HighPerformance,
force_fallback_adapter: false,
compatible_surface: None,
})
.await
.expect("no GPU adapter found");
let info = adapter.get_info();
println!("GPU: {} ({:?})", info.name, info.backend);
let (device, queue) = adapter
.request_device(
&wgpu::DeviceDescriptor {
label: Some("crc_cracker"),
required_features: wgpu::Features::empty(),
required_limits: wgpu::Limits::default(),
},
None,
)
.await
.expect("failed to open GPU device");
let shader = device.create_shader_module(wgpu::ShaderModuleDescriptor {
label: Some("crc_shader"),
source: wgpu::ShaderSource::Wgsl(include_str!("shader.wgsl").into()),
});
let bind_group_layout =
device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
label: Some("crc_bgl"),
entries: &[
buf_layout(0, true),
buf_layout(1, true),
uniform_layout(2),
buf_layout(3, false),
buf_layout(4, false),
],
});
let pipeline_layout =
device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
label: Some("crc_pipeline_layout"),
bind_group_layouts: &[&bind_group_layout],
push_constant_ranges: &[],
});
let pipeline =
device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
label: Some("crc_pipeline"),
layout: Some(&pipeline_layout),
module: &shader,
entry_point: "main",
compilation_options: wgpu::PipelineCompilationOptions::default(),
});
Self { device, queue, pipeline, bind_group_layout }
}
// ── Public API ────────────────────────────────────────────────────────────
/// Brute-force all matching CRC configurations, writing each to `out`.
/// Returns the total match count. No results are dropped, except when a
/// single poly value alone overflows the result buffer; that case prints
/// a warning and caps that poly's output.
///
/// Pipeline:
/// - Work is split into batches (≤ u32::MAX threads each).
/// - Two GPU buffer slots alternate (double-buffer).
/// - GPU runs batch N while CPU drains batch N-1 to disk — no idle gaps.
/// - If a batch's results overflow the buffer it is split into two
/// half-batches and re-queued; no count-only pass needed.
pub fn run(
&self,
test_cases: &[TestCase],
constraints: &Constraints,
out: &mut impl Write,
) -> u64 {
assert!(!test_cases.is_empty(), "need at least one test case");
let total = constraints.total_combinations();
let poly_count = constraints.poly.count() as u64;
let per_poly = total / poly_count;
let polys_per_batch = ((MAX_BATCH_THREADS / per_poly).max(1) as u32)
.min(constraints.poly.count());
let num_batches = constraints.poly.count().div_ceil(polys_per_batch);
println!(
"search space: {} combinations \
({} poly × {} init × {} xorout × {}× refin × {}× refout) \
{} batch(es) initially",
total,
constraints.poly.count(),
constraints.init.count(),
constraints.xorout.count(),
constraints.refin_count(),
constraints.refout_count(),
num_batches,
);
// Shared input buffers — never change between batches.
let (data_buf, tc_buf) = self.upload_test_data(test_cases);
// Work queue — starts with the initial poly-range slices.
// Overflow adds split sub-ranges back here.
let mut work: VecDeque<(u32, u32)> = {
let poly_min = constraints.poly.min();
(0..constraints.poly.count())
.step_by(polys_per_batch as usize)
.map(|off| {
let cnt = polys_per_batch.min(constraints.poly.count() - off);
(poly_min + off, cnt)
})
.collect()
};
// Two GPU buffer slots for double-buffering.
let slots = [self.create_slot(), self.create_slot()];
let mut active = 0usize; // slot to submit the next batch into
// Currently in-flight submission: (submission_index, slot, poly_min, poly_count)
let mut pending: Option<(wgpu::SubmissionIndex, usize, u32, u32)> = None;
let mut total_found = 0u64;
let mut done_work: u64 = 0;
let total_work = total;
let mut last_update = Instant::now();
let update_interval = Duration::from_millis(33); // ~30 progress updates per second
loop {
// ── Submit next batch (if any) ────────────────────────────────────
// Do this BEFORE draining the pending batch so the GPU stays busy.
let new_pending = work.pop_front().map(|(poly_min, poly_count)| {
let si = self.submit(
&slots[active], &data_buf, &tc_buf,
constraints, poly_min, poly_count,
test_cases.len() as u32,
);
let result = (si, active, poly_min, poly_count);
active = 1 - active;
result
});
// ── Drain the previous batch ──────────────────────────────────────
// GPU is now running `new_pending` concurrently with this drain.
if let Some((p_si, p_slot, p_poly_min, p_poly_count)) = pending.take() {
// Schedule mapping BEFORE polling. The callback fires inside the
// poll call below, so get_mapped_range() works right after it.
let count_slice = slots[p_slot].count_staging.slice(..4);
let results_slice = slots[p_slot].results_staging.slice(..);
count_slice.map_async(wgpu::MapMode::Read, |_| {});
results_slice.map_async(wgpu::MapMode::Read, |_| {});
// Wait for this batch. GPU continues running new_pending.
self.device.poll(wgpu::Maintain::WaitForSubmissionIndex(p_si));
let raw_count = {
let v = count_slice.get_mapped_range();
u32::from_le_bytes(v[..4].try_into().unwrap())
};
slots[p_slot].count_staging.unmap();
let mut requeued = false;
if raw_count > RESULTS_PER_BATCH {
// This batch produced more matches than the buffer holds.
// Split it into two half-sized poly ranges and re-queue at
// the front so they run before the remaining original batches.
slots[p_slot].results_staging.unmap();
let half = p_poly_count / 2;
if half > 0 {
work.push_front((p_poly_min + half, p_poly_count - half));
work.push_front((p_poly_min, half));
requeued = true;
} else {
// Single poly value with > RESULTS_PER_BATCH matches.
// This means almost every (init, xorout, refin, refout)
// combination matches — add more test cases to constrain.
let capped = RESULTS_PER_BATCH;
eprintln!(
"warning: poly={:#X} produced {} results, \
buffer capped at {capped}. Add more test cases.",
p_poly_min, raw_count,
);
self.write_results(
&slots[p_slot].results_staging, capped, constraints.width, out,
);
total_found += capped as u64;
}
} else {
self.write_results(
&slots[p_slot].results_staging, raw_count, constraints.width, out,
);
total_found += raw_count as u64;
}
// Count this batch toward progress only if it actually completed;
// re-queued halves are counted when they finish (otherwise the bar
// would double-count split batches and run past 100%).
if !requeued {
let batch_work = p_poly_count as u64
* constraints.init.count() as u64
* constraints.xorout.count() as u64
* constraints.refin_count() as u64
* constraints.refout_count() as u64;
done_work += batch_work;
}
if last_update.elapsed() >= update_interval {
let progress = done_work as f64 / total_work as f64;
let percent = progress * 100.0;
// simple text bar (cheap, no allocations)
let bar_width = 30;
let filled = (progress * bar_width as f64) as usize;
let mut bar = String::with_capacity(bar_width);
for i in 0..bar_width {
bar.push(if i < filled { '#' } else { '-' });
}
print!("[{}] {:6.2}% ({}/{})\r", bar, percent, done_work, total_work);
std::io::stdout().flush().ok();
last_update = Instant::now();
}
}
// ── Advance state ─────────────────────────────────────────────────
match new_pending {
Some(info) => pending = Some(info),
// `pending` was drained above, so once nothing new was submitted
// and the overflow queue is empty, the search is complete.
None if work.is_empty() => break,
None => { /* no new submission; an overflow split queued more work */ }
}
}
println!();
total_found
}
// ── Internals ─────────────────────────────────────────────────────────────
fn create_slot(&self) -> Slot {
let d = &self.device;
let results_size = RESULTS_PER_BATCH as u64 * RESULT_SIZE as u64;
Slot {
results_buf: d.create_buffer(&wgpu::BufferDescriptor {
label: Some("results_buf"),
size: results_size,
usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC,
mapped_at_creation: false,
}),
count_buf: d.create_buffer(&wgpu::BufferDescriptor {
label: Some("count_buf"),
size: 4,
usage: wgpu::BufferUsages::STORAGE
| wgpu::BufferUsages::COPY_SRC
| wgpu::BufferUsages::COPY_DST,
mapped_at_creation: false,
}),
results_staging: d.create_buffer(&wgpu::BufferDescriptor {
label: Some("results_staging"),
size: results_size,
usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
mapped_at_creation: false,
}),
count_staging: d.create_buffer(&wgpu::BufferDescriptor {
label: Some("count_staging"),
size: 4,
usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
mapped_at_creation: false,
}),
}
}
fn upload_test_data(
&self,
test_cases: &[TestCase],
) -> (wgpu::Buffer, wgpu::Buffer) {
let mut flat: Vec<u8> = Vec::new();
let mut gpu: Vec<GpuTestCase> = Vec::new();
for tc in test_cases {
gpu.push(GpuTestCase {
data_offset: flat.len() as u32,
data_len: tc.data.len() as u32,
expected: tc.expected,
mask: tc.mask,
});
flat.extend_from_slice(&tc.data);
}
while flat.len() % 4 != 0 { flat.push(0); }
let d = &self.device;
(
d.create_buffer_init(&wgpu::util::BufferInitDescriptor {
label: Some("data_buf"),
contents: &flat,
usage: wgpu::BufferUsages::STORAGE,
}),
d.create_buffer_init(&wgpu::util::BufferInitDescriptor {
label: Some("tc_buf"),
contents: bytemuck::cast_slice(&gpu),
usage: wgpu::BufferUsages::STORAGE,
}),
)
}
/// Build and submit one compute pass. Returns immediately — the caller
/// decides when to wait via `WaitForSubmissionIndex`.
fn submit(
&self,
slot: &Slot,
data_buf: &wgpu::Buffer,
tc_buf: &wgpu::Buffer,
constraints: &Constraints,
poly_min: u32,
poly_count: u32,
tc_count: u32,
) -> wgpu::SubmissionIndex {
let batch_total = poly_count as u64
* constraints.init.count() as u64
* constraints.xorout.count() as u64
* constraints.refin_count() as u64
* constraints.refout_count() as u64;
let total_u32 = u32::try_from(batch_total)
.expect("batch exceeds u32::MAX GPU threads");
let total_wg = total_u32.div_ceil(WORKGROUP_SIZE);
let dispatch_x = total_wg.min(MAX_DISPATCH_DIM);
let dispatch_y = total_wg.div_ceil(dispatch_x);
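// Worked example: the main.rs search fits in one batch:
// 8192 polys × (8192 init × 1 xorout × 2 refin × 2 refout) = 268_435_456
// threads → total_wg = 1_048_576 → dispatch_x = 65_535, dispatch_y = 17.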
let sp = SearchParams {
width: constraints.width as u32,
mask: constraints.mask(),
poly_min,
poly_count,
init_min: constraints.init.min(),
init_count: constraints.init.count(),
xorout_min: constraints.xorout.min(),
xorout_count: constraints.xorout.count(),
refin_start: constraints.refin_start(),
refin_count: constraints.refin_count(),
refout_start: constraints.refout_start(),
refout_count: constraints.refout_count(),
threads_per_row: dispatch_x * WORKGROUP_SIZE,
max_results: RESULTS_PER_BATCH,
test_case_count: tc_count,
_pad: 0,
};
let d = &self.device;
let params_buf = d.create_buffer_init(&wgpu::util::BufferInitDescriptor {
label: Some("params_buf"),
contents: bytemuck::bytes_of(&sp),
usage: wgpu::BufferUsages::UNIFORM,
});
let bg = d.create_bind_group(&wgpu::BindGroupDescriptor {
label: Some("crc_bg"),
layout: &self.bind_group_layout,
entries: &[
wgpu::BindGroupEntry { binding: 0, resource: data_buf.as_entire_binding() },
wgpu::BindGroupEntry { binding: 1, resource: tc_buf.as_entire_binding() },
wgpu::BindGroupEntry { binding: 2, resource: params_buf.as_entire_binding() },
wgpu::BindGroupEntry { binding: 3, resource: slot.results_buf.as_entire_binding() },
wgpu::BindGroupEntry { binding: 4, resource: slot.count_buf.as_entire_binding() },
],
});
let mut enc = d.create_command_encoder(
&wgpu::CommandEncoderDescriptor { label: Some("crc_enc") }
);
enc.clear_buffer(&slot.count_buf, 0, None);
{
let mut pass = enc.begin_compute_pass(&wgpu::ComputePassDescriptor {
label: Some("crc_pass"),
timestamp_writes: None,
});
pass.set_pipeline(&self.pipeline);
pass.set_bind_group(0, &bg, &[]);
pass.dispatch_workgroups(dispatch_x, dispatch_y, 1);
}
let results_copy_size = RESULTS_PER_BATCH as u64 * RESULT_SIZE as u64;
enc.copy_buffer_to_buffer(&slot.count_buf, 0, &slot.count_staging, 0, 4);
enc.copy_buffer_to_buffer(&slot.results_buf, 0, &slot.results_staging, 0, results_copy_size);
self.queue.submit(std::iter::once(enc.finish()))
}
/// Read `count` result entries from a slot's staging buffer (which must
/// already be mapped) and write them to `out`.
fn write_results(
&self,
results_staging: &wgpu::Buffer,
count: u32,
width: u8,
out: &mut impl Write,
) {
if count == 0 {
results_staging.unmap();
return;
}
let byte_count = count as usize * RESULT_SIZE;
{
let view = results_staging.slice(..byte_count as u64).get_mapped_range();
for r in bytemuck::cast_slice::<u8, ResultEntry>(&view) {
writeln!(out, "{}", CrcParams {
poly: r.poly,
init: r.init,
xorout: r.xorout,
refin: (r.flags & 1) != 0,
refout: (r.flags & 2) != 0,
}.display(width))
.expect("write to output failed");
}
}
results_staging.unmap();
}
}
// ─── Layout helpers ───────────────────────────────────────────────────────────
fn buf_layout(binding: u32, read_only: bool) -> wgpu::BindGroupLayoutEntry {
wgpu::BindGroupLayoutEntry {
binding,
visibility: wgpu::ShaderStages::COMPUTE,
ty: wgpu::BindingType::Buffer {
ty: wgpu::BufferBindingType::Storage { read_only },
has_dynamic_offset: false,
min_binding_size: None,
},
count: None,
}
}
fn uniform_layout(binding: u32) -> wgpu::BindGroupLayoutEntry {
wgpu::BindGroupLayoutEntry {
binding,
visibility: wgpu::ShaderStages::COMPUTE,
ty: wgpu::BindingType::Buffer {
ty: wgpu::BufferBindingType::Uniform,
has_dynamic_offset: false,
min_binding_size: None,
},
count: None,
}
}

src/main.rs Normal file

@@ -0,0 +1,52 @@
mod cracker;
mod gpu;
use std::fs::File;
use std::io::BufWriter;
use cracker::{Constraints, TestCase, ValueRange};
use gpu::GpuCracker;
fn main() {
pollster::block_on(run());
}
async fn run() {
let cracker = GpuCracker::new().await;
let mut out = BufWriter::new(
File::create("results.txt").expect("cannot create results.txt"),
);
let count = cracker.run(
&[
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x00], 0x94, 0x00FF),
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x01], 0x61, 0x00FF),
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x02], 0x8B, 0x00FF),
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x03], 0x7E, 0x00FF),
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x04], 0xAA, 0x00FF),
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x05], 0x5F, 0x00FF),
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x06], 0xB5, 0x00FF),
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x07], 0x40, 0x00FF),
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x08], 0xE8, 0x00FF),
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x09], 0x1D, 0x00FF),
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x0A], 0xF7, 0x00FF),
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x0B], 0x02, 0x00FF),
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x0C], 0xD6, 0x00FF),
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x0D], 0x23, 0x00FF),
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x0E], 0xC9, 0x00FF),
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x0F], 0x3C, 0x00FF),
],
&Constraints {
width: 13,
poly: ValueRange::Range(0x0000..=0x1FFF),
init: ValueRange::Range(0x0000..=0x1FFF),
xorout: ValueRange::Fixed(0x0000),
refin: None,
refout: None,
},
&mut out,
);
println!("found {count} match(es) — written to results.txt");
}
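// CPU-side reference of the same bit-by-bit CRC the shader computes; a
// minimal sketch for spot-checking hits from results.txt. Not part of the
// search path — the name and signature here are ad hoc.
#[allow(dead_code)]
fn crc_reference(data: &[u8], width: u8, poly: u32, init: u32, xorout: u32, refin: bool, refout: bool) -> u32 {
let mask = if width == 32 { u32::MAX } else { (1u32 << width) - 1 };
let msb = 1u32 << (width - 1);
let mut crc = init & mask;
for &b in data {
// Optionally reflect each input byte, as the shader's refin path does.
let byte = if refin { b.reverse_bits() } else { b };
for bit in 0..8 {
let in_bit = ((byte >> (7 - bit)) & 1) != 0;
// Shift left, XOR in the polynomial when (top bit ^ input bit) is set.
let feedback = if ((crc & msb) != 0) != in_bit { poly } else { 0 };
crc = ((crc << 1) & mask) ^ feedback;
}
}
if refout {
// Reflect the register within `width` bits, matching reflect_bits().
crc = crc.reverse_bits() >> (32 - width as u32);
}
(crc ^ xorout) & mask
}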

src/shader.wgsl Normal file

@@ -0,0 +1,131 @@
// ─── Structs ─────────────────────────────────────────────────────────────────
struct SearchParams {
width: u32,
mask: u32,
poly_min: u32,
poly_count: u32,
init_min: u32,
init_count: u32,
xorout_min: u32,
xorout_count: u32,
refin_start: u32,
refin_count: u32,
refout_start: u32,
refout_count: u32,
threads_per_row: u32,
max_results: u32,
test_case_count: u32,
_pad: u32,
}
struct GpuTestCase {
data_offset: u32,
data_len: u32,
expected: u32,
mask: u32,
}
struct ResultEntry {
poly: u32,
init: u32,
xorout: u32,
flags: u32, // bit 0 = refin, bit 1 = refout
}
// ─── Bindings ─────────────────────────────────────────────────────────────────
@group(0) @binding(0) var<storage, read> data_buf: array<u32>;
@group(0) @binding(1) var<storage, read> test_cases: array<GpuTestCase>;
@group(0) @binding(2) var<uniform> params: SearchParams;
@group(0) @binding(3) var<storage, read_write> results: array<ResultEntry>;
@group(0) @binding(4) var<storage, read_write> result_count: atomic<u32>;
// ─── Helpers ──────────────────────────────────────────────────────────────────
fn reflect_bits(val: u32, width: u32) -> u32 {
var r = 0u;
var v = val;
for (var i = 0u; i < width; i++) {
r = (r << 1u) | (v & 1u);
v >>= 1u;
}
return r;
}
fn load_byte(byte_idx: u32) -> u32 {
return (data_buf[byte_idx >> 2u] >> ((byte_idx & 3u) << 3u)) & 0xFFu;
}
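// e.g. byte_idx = 5 → word data_buf[1], shift 8: this matches the
// little-endian byte packing of the flat Vec<u8> that upload_test_data()
// uploads on the Rust side.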
// ─── CRC computation ──────────────────────────────────────────────────────────
// Bit-by-bit CRC — works for any width from 1 to 32.
// Inner loop is branchless: select() compiles to a conditional move with no
// warp divergence.
fn compute_crc(
data_start: u32,
data_len: u32,
poly: u32,
init: u32,
xorout: u32,
refin: bool,
refout: bool,
) -> u32 {
let msb_shift = params.width - 1u;
let mask = params.mask;
var crc = init & mask;
for (var i = 0u; i < data_len; i++) {
var byte = load_byte(data_start + i);
if refin { byte = reflect_bits(byte, 8u); }
for (var b = 0u; b < 8u; b++) {
let feedback = select(0u, poly, ((crc >> msb_shift) ^ ((byte >> (7u - b)) & 1u)) != 0u);
crc = ((crc << 1u) & mask) ^ feedback;
}
}
if refout { crc = reflect_bits(crc, params.width); }
return (crc ^ xorout) & mask;
}
// ─── Entry point ─────────────────────────────────────────────────────────────
@compute @workgroup_size(256, 1, 1)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
let thread_id = gid.y * params.threads_per_row + gid.x;
let stride_refout = 1u;
let stride_refin = params.refout_count;
let stride_xorout = params.refin_count * stride_refin;
let stride_init = params.xorout_count * stride_xorout;
let stride_poly = params.init_count * stride_init;
if thread_id >= stride_poly * params.poly_count { return; }
let poly_idx = thread_id / stride_poly;
let r1 = thread_id % stride_poly;
let init_idx = r1 / stride_init;
let r2 = r1 % stride_init;
let xorout_idx = r2 / stride_xorout;
let r3 = r2 % stride_xorout;
let refin_idx = r3 / stride_refin;
let refout_idx = r3 % stride_refin;
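// Worked example with the main.rs search (init_count = 8192, xorout_count = 1,
// refin_count = 2, refout_count = 2): the strides are refout 1, refin 2,
// xorout 4, init 4, poly 32768, so thread_id 70000 decodes to poly_idx 2,
// init_idx 1116, xorout_idx 0, refin_idx 0, refout_idx 0.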
let poly = (params.poly_min + poly_idx) & params.mask;
let init = (params.init_min + init_idx) & params.mask;
let xorout = (params.xorout_min + xorout_idx) & params.mask;
let refin = (params.refin_start + refin_idx) != 0u;
let refout = (params.refout_start + refout_idx) != 0u;
for (var t = 0u; t < params.test_case_count; t++) {
let tc = test_cases[t];
let got = compute_crc(tc.data_offset, tc.data_len, poly, init, xorout, refin, refout);
if (got & tc.mask) != (tc.expected & tc.mask) { return; }
}
let idx = atomicAdd(&result_count, 1u);
if idx < params.max_results {
results[idx] = ResultEntry(poly, init, xorout, u32(refin) | (u32(refout) << 1u));
}
}