commit 973b4dba184ffdebc4907e4f0465776b1f574eee
Author: Timo
Date:   Sun Apr 12 23:05:19 2026 +0200

    vibe coded a crc reverse engineer tool on gpu, because why not

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f2f9e58
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+target
+Cargo.lock
\ No newline at end of file
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..ac937f6
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,9 @@
+[package]
+name = "crc_cracker"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+wgpu = "0.20"
+bytemuck = { version = "1", features = ["derive"] }
+pollster = "0.3"
diff --git a/src/cracker.rs b/src/cracker.rs
new file mode 100644
index 0000000..ceb3c42
--- /dev/null
+++ b/src/cracker.rs
@@ -0,0 +1,156 @@
+use std::ops::RangeInclusive;
+
+/// A known input/output pair used to constrain the search.
+/// Add multiple test cases to dramatically reduce false positives.
+pub struct TestCase {
+    /// Input bytes (can be a partial slice of a larger message)
+    pub data: Vec<u8>,
+    /// The CRC value to match against (only bits set in `mask` are checked)
+    pub expected: u32,
+    /// Which bits of the CRC are known.
+    /// `u32::MAX` (or the width mask) means all bits must match.
+    /// `0x00FF` means only the low byte must match, useful when you can only
+    /// observe part of the CRC field.
+    pub mask: u32,
+}
+
+impl TestCase {
+    /// All bits of the CRC must match.
+    pub fn exact(data: Vec<u8>, expected: u32) -> Self {
+        Self { data, expected, mask: u32::MAX }
+    }
+
+    /// Only the bits set in `mask` must match.
+    pub fn masked(data: Vec<u8>, expected: u32, mask: u32) -> Self {
+        Self { data, expected, mask }
+    }
+}
+
+/// Constraint on a single CRC parameter.
+#[derive(Clone, Debug)]
+pub enum ValueRange {
+    /// Exactly one value to try
+    Fixed(u32),
+    /// Try every value in this inclusive range
+    Range(RangeInclusive<u32>),
+}
+
+impl ValueRange {
+    /// Smallest value in the constraint.
+    pub fn min(&self) -> u32 {
+        match self {
+            Self::Fixed(v) => *v,
+            Self::Range(r) => *r.start(),
+        }
+    }
+
+    /// Number of candidate values.
+    ///
+    /// NOTE(review): the `+ 1` wraps for the full range `0..=u32::MAX`;
+    /// callers are expected to keep ranges below the full 32-bit span
+    /// (the widest practical search is a 32-bit poly range).
+    pub fn count(&self) -> u32 {
+        match self {
+            Self::Fixed(_) => 1,
+            Self::Range(r) => r.end() - r.start() + 1,
+        }
+    }
+}
+
+/// Constraints for the brute-force search.
+///
+/// The GPU will try every combination within these bounds, testing each
+/// candidate against all provided `TestCase`s. Only configurations that
+/// pass every test case are returned.
+pub struct Constraints {
+    /// CRC register width in bits — currently 8, 16, or 32.
+    pub width: u8,
+    /// Generator polynomial (without the implicit leading 1).
+    pub poly: ValueRange,
+    /// Initial value loaded into the CRC register.
+    pub init: ValueRange,
+    /// Value XOR'd into the final CRC before output.
+    pub xorout: ValueRange,
+    /// Whether each input byte is bit-reversed before entering the CRC register.
+    ///
+    /// - `Some(false)` — MSB-first (textbook CRCs, parallel-bus / memory protocols)
+    /// - `Some(true)` — LSB-first (most serial-line protocols: Ethernet, USB, CAN,
+    ///   Bluetooth, Dallas 1-Wire — bits arrive LSB-first on the wire)
+    /// - `None` — unknown; the GPU tries both and returns whichever matches
+    pub refin: Option<bool>,
+
+    /// Whether the final CRC register value is bit-reversed before the xorout step.
+    ///
+    /// - `Some(false)` — register used as-is
+    /// - `Some(true)` — register is reflected before XOR-out
+    /// - `None` — unknown; the GPU tries both
+    ///
+    /// `refin` and `refout` are almost always equal.
+    /// `(true, true)` covers most real-world CRC-16/32 algorithms (CRC-32/ISO-HDLC,
+    /// CRC-16/ARC, CRC-16/IBM, …).
+    /// `(false, false)` covers most CRC-8 variants and simple checksum algorithms.
+    pub refout: Option<bool>,
+}
+
+impl Constraints {
+    /// First refin value to try (0 = false, 1 = true).
+    pub fn refin_start(&self) -> u32 {
+        match self.refin {
+            Some(true) => 1,
+            _ => 0,
+        }
+    }
+
+    /// How many refin values to try (2 when unknown).
+    pub fn refin_count(&self) -> u32 {
+        if self.refin.is_none() { 2 } else { 1 }
+    }
+
+    /// First refout value to try (0 = false, 1 = true).
+    pub fn refout_start(&self) -> u32 {
+        match self.refout {
+            Some(true) => 1,
+            _ => 0,
+        }
+    }
+
+    /// How many refout values to try (2 when unknown).
+    pub fn refout_count(&self) -> u32 {
+        if self.refout.is_none() { 2 } else { 1 }
+    }
+
+    /// Total number of (poly, init, xorout, refin, refout) combinations.
+    pub fn total_combinations(&self) -> u64 {
+        self.poly.count() as u64
+            * self.init.count() as u64
+            * self.xorout.count() as u64
+            * self.refin_count() as u64
+            * self.refout_count() as u64
+    }
+
+    /// Bit mask covering `width` bits (e.g. width 13 → 0x1FFF).
+    pub fn mask(&self) -> u32 {
+        if self.width == 32 {
+            u32::MAX
+        } else {
+            (1u32 << self.width) - 1
+        }
+    }
+}
+
+/// A matching CRC configuration found during the search.
+#[derive(Debug, Clone, Copy)]
+pub struct CrcParams {
+    pub poly: u32,
+    pub init: u32,
+    pub xorout: u32,
+    pub refin: bool,
+    pub refout: bool,
+}
+
+impl CrcParams {
+    /// Format as a human-readable CRC descriptor.
+    pub fn display(&self, width: u8) -> String {
+        // ceil(width / 4) hex digits so CRC-13 shows 4 digits, CRC-5 shows 2, etc.
+        // Widened via usize::from: `(width + 3) / 4` on u8 would overflow for width > 252.
+        let hex_w = usize::from(width).div_ceil(4);
+        format!(
+            "CRC-{}: poly=0x{:0>w$X} init=0x{:0>w$X} xorout=0x{:0>w$X} refin={:<5} refout={}",
+            width,
+            self.poly,
+            self.init,
+            self.xorout,
+            self.refin,
+            self.refout,
+            w = hex_w,
+        )
+    }
+}
diff --git a/src/gpu.rs b/src/gpu.rs
new file mode 100644
index 0000000..53a04e8
--- /dev/null
+++ b/src/gpu.rs
@@ -0,0 +1,531 @@
+use std::collections::VecDeque;
+use std::io::Write;
+
+use bytemuck::{Pod, Zeroable};
+use wgpu::util::DeviceExt;
+
+use crate::cracker::{Constraints, CrcParams, TestCase};
+
+use std::time::{Duration, Instant};
+
+// ─── GPU-side layouts (must match shader.wgsl exactly) ──────────────────────
+
+#[repr(C)]
+#[derive(Clone, Copy, Pod, Zeroable)]
+struct SearchParams {
+    width: u32,
+    mask: u32,
+    poly_min: u32,
+    poly_count: u32,
+    init_min: u32,
+    init_count: u32,
+    xorout_min: u32,
+    xorout_count: u32,
+    refin_start: u32,
+    refin_count: u32,
+    refout_start: u32,
+    refout_count: u32,
+    threads_per_row: u32,
+    max_results: u32,
+    test_case_count: u32,
+    _pad: u32,
+}
+
+#[repr(C)]
+#[derive(Clone, Copy, Pod, Zeroable)]
+struct GpuTestCase {
+    data_offset: u32,
+    data_len: u32,
+    expected: u32,
+    mask: u32,
+}
+
+#[repr(C)]
+#[derive(Clone, Copy, Pod, Zeroable)]
+struct ResultEntry {
+    poly: u32,
+    init: u32,
+    xorout: u32,
+    flags: u32,
+}
+
+const RESULT_SIZE: usize = std::mem::size_of::<ResultEntry>();
+
+// ─── Constants ──────────────────────────────────────────────────────────────
+
+const WORKGROUP_SIZE: u32 = 256;
+const MAX_DISPATCH_DIM: u32 = 65_535;
+const MAX_BATCH_THREADS: u64 = u32::MAX as u64;
+
+/// Result buffer ceiling per buffer slot.
+/// Two slots are allocated, so GPU VRAM usage for results = 2 × this × 16 B.
+/// 8 M × 2 × 16 B = 256 MB — comfortably fits in any modern discrete GPU.
+const RESULTS_PER_BATCH: u32 = 8_000_000;
+
+// ─── Per-slot GPU buffers ───────────────────────────────────────────────────
+
+struct Slot {
+    results_buf: wgpu::Buffer,
+    count_buf: wgpu::Buffer,
+    results_staging: wgpu::Buffer,
+    count_staging: wgpu::Buffer,
+}
+
+// ─── GPU state ──────────────────────────────────────────────────────────────
+
+pub struct GpuCracker {
+    device: wgpu::Device,
+    queue: wgpu::Queue,
+    pipeline: wgpu::ComputePipeline,
+    bind_group_layout: wgpu::BindGroupLayout,
+}
+
+impl GpuCracker {
+    /// Open the highest-performance adapter and compile the compute pipeline.
+    pub async fn new() -> Self {
+        let instance = wgpu::Instance::default();
+
+        let adapter = instance
+            .request_adapter(&wgpu::RequestAdapterOptions {
+                power_preference: wgpu::PowerPreference::HighPerformance,
+                force_fallback_adapter: false,
+                compatible_surface: None,
+            })
+            .await
+            .expect("no GPU adapter found");
+
+        let info = adapter.get_info();
+        println!("GPU: {} ({:?})", info.name, info.backend);
+
+        let (device, queue) = adapter
+            .request_device(
+                &wgpu::DeviceDescriptor {
+                    label: Some("crc_cracker"),
+                    required_features: wgpu::Features::empty(),
+                    required_limits: wgpu::Limits::default(),
+                },
+                None,
+            )
+            .await
+            .expect("failed to open GPU device");
+
+        let shader = device.create_shader_module(wgpu::ShaderModuleDescriptor {
+            label: Some("crc_shader"),
+            source: wgpu::ShaderSource::Wgsl(include_str!("shader.wgsl").into()),
+        });
+
+        let bind_group_layout =
+            device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
+                label: Some("crc_bgl"),
+                entries: &[
+                    buf_layout(0, true),
+                    buf_layout(1, true),
+                    uniform_layout(2),
+                    buf_layout(3, false),
+                    buf_layout(4, false),
+                ],
+            });
+
+        let pipeline_layout =
+            device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
+                label: Some("crc_pipeline_layout"),
+                bind_group_layouts: &[&bind_group_layout],
+                push_constant_ranges: &[],
+            });
+
+        let pipeline =
+            device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
+                label: Some("crc_pipeline"),
+                layout: Some(&pipeline_layout),
+                module: &shader,
+                entry_point: "main",
+                compilation_options: wgpu::PipelineCompilationOptions::default(),
+            });
+
+        Self { device, queue, pipeline, bind_group_layout }
+    }
+
+    // ── Public API ──────────────────────────────────────────────────────────
+
+    /// Brute-force all matching CRC configurations, writing each to `out`.
+    /// Returns the total match count (exact — no results are ever dropped).
+    ///
+    /// Pipeline:
+    /// - Work is split into batches (≤ u32::MAX threads each).
+    /// - Two GPU buffer slots alternate (double-buffer).
+    /// - GPU runs batch N while CPU drains batch N-1 to disk — no idle gaps.
+    /// - If a batch's results overflow the buffer it is split into two
+    ///   half-batches and re-queued; no count-only pass needed.
+    pub fn run(
+        &self,
+        test_cases: &[TestCase],
+        constraints: &Constraints,
+        out: &mut impl Write,
+    ) -> u64 {
+        assert!(!test_cases.is_empty(), "need at least one test case");
+
+        let total = constraints.total_combinations();
+        let poly_count = constraints.poly.count() as u64;
+        let per_poly = total / poly_count;
+
+        let polys_per_batch = ((MAX_BATCH_THREADS / per_poly).max(1) as u32)
+            .min(constraints.poly.count());
+
+        let num_batches = constraints.poly.count().div_ceil(polys_per_batch);
+
+        println!(
+            "search space: {} combinations \
+             ({} poly × {} init × {} xorout × {}× refin × {}× refout) \
+             → {} batch(es) initially",
+            total,
+            constraints.poly.count(),
+            constraints.init.count(),
+            constraints.xorout.count(),
+            constraints.refin_count(),
+            constraints.refout_count(),
+            num_batches,
+        );
+
+        // Shared input buffers — never change between batches.
+        let (data_buf, tc_buf) = self.upload_test_data(test_cases);
+
+        // Work queue — starts with the initial poly-range slices.
+        // Overflow adds split sub-ranges back here.
+        let mut work: VecDeque<(u32, u32)> = {
+            let poly_min = constraints.poly.min();
+            (0..constraints.poly.count())
+                .step_by(polys_per_batch as usize)
+                .map(|off| {
+                    let cnt = polys_per_batch.min(constraints.poly.count() - off);
+                    (poly_min + off, cnt)
+                })
+                .collect()
+        };
+
+        // Two GPU buffer slots for double-buffering.
+        let slots = [self.create_slot(), self.create_slot()];
+        let mut active = 0usize; // slot to submit the next batch into
+
+        // Currently in-flight submission: (submission_index, slot, poly_min, poly_count)
+        let mut pending: Option<(wgpu::SubmissionIndex, usize, u32, u32)> = None;
+        let mut total_found = 0u64;
+
+        let mut done_work: u64 = 0;
+        let total_work = total;
+
+        let mut last_update = Instant::now();
+        let update_interval = Duration::from_millis(33); // tweak if needed
+
+        loop {
+            // ── Submit next batch (if any) ──────────────────────────────────
+            // Do this BEFORE draining the pending batch so the GPU stays busy.
+            let new_pending = work.pop_front().map(|(poly_min, poly_count)| {
+                let si = self.submit(
+                    &slots[active], &data_buf, &tc_buf,
+                    constraints, poly_min, poly_count,
+                    test_cases.len() as u32,
+                );
+                let result = (si, active, poly_min, poly_count);
+                active = 1 - active;
+                result
+            });
+
+            // ── Drain the previous batch ────────────────────────────────────
+            // GPU is now running `new_pending` concurrently with this drain.
+            if let Some((p_si, p_slot, p_poly_min, p_poly_count)) = pending.take() {
+                // Schedule mapping BEFORE polling. The callback fires inside the
+                // poll call below, so get_mapped_range() works right after it.
+                let count_slice = slots[p_slot].count_staging.slice(..4);
+                let results_slice = slots[p_slot].results_staging.slice(..);
+                count_slice.map_async(wgpu::MapMode::Read, |_| {});
+                results_slice.map_async(wgpu::MapMode::Read, |_| {});
+
+                // Wait for this batch. GPU continues running new_pending.
+                self.device.poll(wgpu::Maintain::WaitForSubmissionIndex(p_si));
+
+                let raw_count = {
+                    let v = count_slice.get_mapped_range();
+                    u32::from_le_bytes(v[..4].try_into().unwrap())
+                };
+                slots[p_slot].count_staging.unmap();
+
+                // BUGFIX: a batch that overflows and is re-queued as two halves
+                // has NOT been processed — counting it here AND again when the
+                // halves drain made `done_work` exceed `total_work` (>100% bar).
+                let mut batch_done = true;
+
+                if raw_count > RESULTS_PER_BATCH {
+                    // This batch produced more matches than the buffer holds.
+                    // Split it into two half-sized poly ranges and re-queue at
+                    // the front so they run before the remaining original batches.
+                    slots[p_slot].results_staging.unmap();
+                    let half = p_poly_count / 2;
+                    if half > 0 {
+                        work.push_front((p_poly_min + half, p_poly_count - half));
+                        work.push_front((p_poly_min, half));
+                        batch_done = false; // halves will be accounted when drained
+                    } else {
+                        // Single poly value with > RESULTS_PER_BATCH matches.
+                        // This means almost every (init, xorout, refin, refout)
+                        // combination matches — add more test cases to constrain.
+                        let capped = RESULTS_PER_BATCH;
+                        eprintln!(
+                            "warning: poly={:#X} produced {} results, \
+                             buffer capped at {capped}. Add more test cases.",
+                            p_poly_min, raw_count,
+                        );
+                        self.write_results(
+                            &slots[p_slot].results_staging, capped, constraints.width, out,
+                        );
+                        total_found += capped as u64;
+                    }
+                } else {
+                    self.write_results(
+                        &slots[p_slot].results_staging, raw_count, constraints.width, out,
+                    );
+                    total_found += raw_count as u64;
+                }
+
+                if batch_done {
+                    let batch_work = p_poly_count as u64
+                        * constraints.init.count() as u64
+                        * constraints.xorout.count() as u64
+                        * constraints.refin_count() as u64
+                        * constraints.refout_count() as u64;
+
+                    done_work += batch_work;
+                }
+
+                if last_update.elapsed() >= update_interval {
+                    let progress = done_work as f64 / total_work as f64;
+                    let percent = progress * 100.0;
+
+                    // simple text bar (cheap, no allocations)
+                    let bar_width = 30;
+                    let filled = (progress * bar_width as f64) as usize;
+
+                    let mut bar = String::with_capacity(bar_width);
+                    for i in 0..bar_width {
+                        bar.push(if i < filled { '#' } else { '-' });
+                    }
+
+                    print!("[{}] {:6.2}% ({}/{})\r", bar, percent, done_work, total_work);
+                    std::io::stdout().flush().ok();
+
+                    last_update = Instant::now();
+                }
+            }
+
+            // ── Advance state ───────────────────────────────────────────────
+            match new_pending {
+                Some(info) => pending = Some(info),
+                None if pending.is_none() && work.is_empty() => break,
+                None => { /* no new submission, but still pending or overflow work */ }
+            }
+        }
+
+        println!();
+
+        total_found
+    }
+
+    // ── Internals ───────────────────────────────────────────────────────────
+
+    /// Allocate one double-buffer slot: storage buffers the shader writes
+    /// plus matching MAP_READ staging buffers for readback.
+    fn create_slot(&self) -> Slot {
+        let d = &self.device;
+        let results_size = RESULTS_PER_BATCH as u64 * RESULT_SIZE as u64;
+        Slot {
+            results_buf: d.create_buffer(&wgpu::BufferDescriptor {
+                label: Some("results_buf"),
+                size: results_size,
+                usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC,
+                mapped_at_creation: false,
+            }),
+            count_buf: d.create_buffer(&wgpu::BufferDescriptor {
+                label: Some("count_buf"),
+                size: 4,
+                usage: wgpu::BufferUsages::STORAGE
+                    | wgpu::BufferUsages::COPY_SRC
+                    | wgpu::BufferUsages::COPY_DST,
+                mapped_at_creation: false,
+            }),
+            results_staging: d.create_buffer(&wgpu::BufferDescriptor {
+                label: Some("results_staging"),
+                size: results_size,
+                usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
+                mapped_at_creation: false,
+            }),
+            count_staging: d.create_buffer(&wgpu::BufferDescriptor {
+                label: Some("count_staging"),
+                size: 4,
+                usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
+                mapped_at_creation: false,
+            }),
+        }
+    }
+
+    /// Flatten all test-case bytes into one storage buffer plus a descriptor
+    /// table of (offset, len, expected, mask) entries.
+    fn upload_test_data(
+        &self,
+        test_cases: &[TestCase],
+    ) -> (wgpu::Buffer, wgpu::Buffer) {
+        let mut flat: Vec<u8> = Vec::new();
+        let mut gpu: Vec<GpuTestCase> = Vec::new();
+        for tc in test_cases {
+            gpu.push(GpuTestCase {
+                data_offset: flat.len() as u32,
+                data_len: tc.data.len() as u32,
+                expected: tc.expected,
+                mask: tc.mask,
+            });
+            flat.extend_from_slice(&tc.data);
+        }
+        // Pad to a whole number of u32 words — the shader reads 32-bit words.
+        while flat.len() % 4 != 0 { flat.push(0); }
+
+        let d = &self.device;
+        (
+            d.create_buffer_init(&wgpu::util::BufferInitDescriptor {
+                label: Some("data_buf"),
+                contents: &flat,
+                usage: wgpu::BufferUsages::STORAGE,
+            }),
+            d.create_buffer_init(&wgpu::util::BufferInitDescriptor {
+                label: Some("tc_buf"),
+                contents: bytemuck::cast_slice(&gpu),
+                usage: wgpu::BufferUsages::STORAGE,
+            }),
+        )
+    }
+
+    /// Build and submit one compute pass. Returns immediately — the caller
+    /// decides when to wait via `WaitForSubmissionIndex`.
+    fn submit(
+        &self,
+        slot: &Slot,
+        data_buf: &wgpu::Buffer,
+        tc_buf: &wgpu::Buffer,
+        constraints: &Constraints,
+        poly_min: u32,
+        poly_count: u32,
+        tc_count: u32,
+    ) -> wgpu::SubmissionIndex {
+        let batch_total = poly_count as u64
+            * constraints.init.count() as u64
+            * constraints.xorout.count() as u64
+            * constraints.refin_count() as u64
+            * constraints.refout_count() as u64;
+        let total_u32 = batch_total as u32;
+        let total_wg = total_u32.div_ceil(WORKGROUP_SIZE);
+        // Fold the 1-D workload into a 2-D dispatch when it exceeds the
+        // 65 535 per-dimension limit; the shader re-linearizes via threads_per_row.
+        let dispatch_x = total_wg.min(MAX_DISPATCH_DIM);
+        let dispatch_y = total_wg.div_ceil(dispatch_x);
+
+        let sp = SearchParams {
+            width: constraints.width as u32,
+            mask: constraints.mask(),
+            poly_min,
+            poly_count,
+            init_min: constraints.init.min(),
+            init_count: constraints.init.count(),
+            xorout_min: constraints.xorout.min(),
+            xorout_count: constraints.xorout.count(),
+            refin_start: constraints.refin_start(),
+            refin_count: constraints.refin_count(),
+            refout_start: constraints.refout_start(),
+            refout_count: constraints.refout_count(),
+            threads_per_row: dispatch_x * WORKGROUP_SIZE,
+            max_results: RESULTS_PER_BATCH,
+            test_case_count: tc_count,
+            _pad: 0,
+        };
+
+        let d = &self.device;
+
+        let params_buf = d.create_buffer_init(&wgpu::util::BufferInitDescriptor {
+            label: Some("params_buf"),
+            contents: bytemuck::bytes_of(&sp),
+            usage: wgpu::BufferUsages::UNIFORM,
+        });
+
+        let bg = d.create_bind_group(&wgpu::BindGroupDescriptor {
+            label: Some("crc_bg"),
+            layout: &self.bind_group_layout,
+            entries: &[
+                wgpu::BindGroupEntry { binding: 0, resource: data_buf.as_entire_binding() },
+                wgpu::BindGroupEntry { binding: 1, resource: tc_buf.as_entire_binding() },
+                wgpu::BindGroupEntry { binding: 2, resource: params_buf.as_entire_binding() },
+                wgpu::BindGroupEntry { binding: 3, resource: slot.results_buf.as_entire_binding() },
+                wgpu::BindGroupEntry { binding: 4, resource: slot.count_buf.as_entire_binding() },
+            ],
+        });
+
+        let mut enc = d.create_command_encoder(
+            &wgpu::CommandEncoderDescriptor { label: Some("crc_enc") }
+        );
+
+        // Reset the atomic match counter before the pass runs.
+        enc.clear_buffer(&slot.count_buf, 0, None);
+
+        {
+            let mut pass = enc.begin_compute_pass(&wgpu::ComputePassDescriptor {
+                label: Some("crc_pass"),
+                timestamp_writes: None,
+            });
+            pass.set_pipeline(&self.pipeline);
+            pass.set_bind_group(0, &bg, &[]);
+            pass.dispatch_workgroups(dispatch_x, dispatch_y, 1);
+        }
+
+        let results_copy_size = RESULTS_PER_BATCH as u64 * RESULT_SIZE as u64;
+        enc.copy_buffer_to_buffer(&slot.count_buf, 0, &slot.count_staging, 0, 4);
+        enc.copy_buffer_to_buffer(&slot.results_buf, 0, &slot.results_staging, 0, results_copy_size);
+
+        self.queue.submit(std::iter::once(enc.finish()))
+    }
+
+    /// Read `count` result entries from a slot's staging buffer (which must
+    /// already be mapped) and write them to `out`.
+    fn write_results(
+        &self,
+        results_staging: &wgpu::Buffer,
+        count: u32,
+        width: u8,
+        out: &mut impl Write,
+    ) {
+        // Buffer arrives mapped (map_async + poll in run()); always unmap on exit.
+        if count == 0 {
+            results_staging.unmap();
+            return;
+        }
+        let byte_count = count as usize * RESULT_SIZE;
+        {
+            let view = results_staging.slice(..byte_count as u64).get_mapped_range();
+            for r in bytemuck::cast_slice::<u8, ResultEntry>(&view) {
+                writeln!(out, "{}", CrcParams {
+                    poly: r.poly,
+                    init: r.init,
+                    xorout: r.xorout,
+                    refin: (r.flags & 1) != 0,
+                    refout: (r.flags & 2) != 0,
+                }.display(width))
+                .expect("write to output failed");
+            }
+        }
+        results_staging.unmap();
+    }
+}
+
+// ─── Layout helpers ─────────────────────────────────────────────────────────
+
+/// Storage-buffer bind-group-layout entry for the compute stage.
+fn buf_layout(binding: u32, read_only: bool) -> wgpu::BindGroupLayoutEntry {
+    wgpu::BindGroupLayoutEntry {
+        binding,
+        visibility: wgpu::ShaderStages::COMPUTE,
+        ty: wgpu::BindingType::Buffer {
+            ty: wgpu::BufferBindingType::Storage { read_only },
+            has_dynamic_offset: false,
+            min_binding_size: None,
+        },
+        count: None,
+    }
+}
+
+/// Uniform-buffer bind-group-layout entry for the compute stage.
+fn uniform_layout(binding: u32) -> wgpu::BindGroupLayoutEntry {
+    wgpu::BindGroupLayoutEntry {
+        binding,
+        visibility: wgpu::ShaderStages::COMPUTE,
+        ty: wgpu::BindingType::Buffer {
+            ty: wgpu::BufferBindingType::Uniform,
+            has_dynamic_offset: false,
+            min_binding_size: None,
+        },
+        count: None,
+    }
+}
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..5994d6a
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,52 @@
+mod cracker;
+mod gpu;
+
+use std::fs::File;
+use std::io::BufWriter;
+
+use cracker::{Constraints, TestCase, ValueRange};
+use gpu::GpuCracker;
+
+fn main() {
+    pollster::block_on(run());
+}
+
+async fn run() {
+    let cracker = GpuCracker::new().await;
+
+    let mut out = BufWriter::new(
+        File::create("results.txt").expect("cannot create results.txt"),
+    );
+
+    let count = cracker.run(
+        &[
+            TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x00], 0x94, 0x00FF),
+            TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x01], 0x61, 0x00FF),
+            TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x02], 0x8B, 0x00FF),
+            TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x03], 0x7E, 0x00FF),
+            TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x04], 0xAA, 0x00FF),
+            TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x05], 0x5F, 0x00FF),
+            TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x06], 0xB5, 0x00FF),
+            TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x07], 0x40, 0x00FF),
+            TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x08], 0xE8, 0x00FF),
+            TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x09], 0x1D, 0x00FF),
+            TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x0A], 0xF7, 0x00FF),
+            TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x0B], 0x02, 0x00FF),
+            TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x0C], 0xD6, 0x00FF),
+            TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x0D], 0x23, 0x00FF),
+            TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x0E], 0xC9, 0x00FF),
+            TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x0F], 0x3C, 0x00FF),
+        ],
+        &Constraints {
+            width: 13,
+            poly: ValueRange::Range(0x0000..=0x1FFF),
+            init: ValueRange::Range(0x0000..=0x1FFF),
+            xorout: ValueRange::Fixed(0x0000),
+            refin: None,
+            refout: None,
+        },
+        &mut out,
+    );
+
+    println!("found {count} match(es) — written to results.txt");
+}
diff --git a/src/shader.wgsl b/src/shader.wgsl
new file mode 100644
index 0000000..00f5e7f
--- /dev/null
+++ b/src/shader.wgsl
@@ -0,0 +1,131 @@
+// ─── Structs ────────────────────────────────────────────────────────────────
+
+struct SearchParams {
+    width: u32,
+    mask: u32,
+    poly_min: u32,
+    poly_count: u32,
+    init_min: u32,
+    init_count: u32,
+    xorout_min: u32,
+    xorout_count: u32,
+    refin_start: u32,
+    refin_count: u32,
+    refout_start: u32,
+    refout_count: u32,
+    threads_per_row: u32,
+    max_results: u32,
+    test_case_count: u32,
+    _pad: u32,
+}
+
+struct GpuTestCase {
+    data_offset: u32,
+    data_len: u32,
+    expected: u32,
+    mask: u32,
+}
+
+struct ResultEntry {
+    poly: u32,
+    init: u32,
+    xorout: u32,
+    flags: u32, // bit 0 = refin, bit 1 = refout
+}
+
+// ─── Bindings ───────────────────────────────────────────────────────────────
+
+@group(0) @binding(0) var<storage, read> data_buf: array<u32>;
+@group(0) @binding(1) var<storage, read> test_cases: array<GpuTestCase>;
+@group(0) @binding(2) var<uniform> params: SearchParams;
+@group(0) @binding(3) var<storage, read_write> results: array<ResultEntry>;
+@group(0) @binding(4) var<storage, read_write> result_count: atomic<u32>;
+
+// ─── Helpers ────────────────────────────────────────────────────────────────
+
+// Reverse the low `width` bits of `val`.
+fn reflect_bits(val: u32, width: u32) -> u32 {
+    var r = 0u;
+    var v = val;
+    for (var i = 0u; i < width; i++) {
+        r = (r << 1u) | (v & 1u);
+        v >>= 1u;
+    }
+    return r;
+}
+
+// Extract one byte from the packed u32 data buffer (little-endian packing).
+fn load_byte(byte_idx: u32) -> u32 {
+    return (data_buf[byte_idx >> 2u] >> ((byte_idx & 3u) << 3u)) & 0xFFu;
+}
+
+// ─── CRC computation ────────────────────────────────────────────────────────
+
+// Bit-by-bit CRC — works for any width 1–32.
+// Inner loop is branchless: select() compiles to a conditional move with no
+// warp divergence.
+fn compute_crc(
+    data_start: u32,
+    data_len: u32,
+    poly: u32,
+    init: u32,
+    xorout: u32,
+    refin: bool,
+    refout: bool,
+) -> u32 {
+    let msb_shift = params.width - 1u;
+    let mask = params.mask;
+    var crc = init & mask;
+
+    for (var i = 0u; i < data_len; i++) {
+        var byte = load_byte(data_start + i);
+        // Reflected-input variants feed each byte LSB-first; reversing the
+        // byte up front lets the same MSB-first register update serve both.
+        if refin { byte = reflect_bits(byte, 8u); }
+
+        for (var b = 0u; b < 8u; b++) {
+            let feedback = select(0u, poly, ((crc >> msb_shift) ^ ((byte >> (7u - b)) & 1u)) != 0u);
+            crc = ((crc << 1u) & mask) ^ feedback;
+        }
+    }
+
+    if refout { crc = reflect_bits(crc, params.width); }
+    return (crc ^ xorout) & mask;
+}
+
+// ─── Entry point ────────────────────────────────────────────────────────────
+
+@compute @workgroup_size(256, 1, 1)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    // Re-linearize the 2-D dispatch back into a flat candidate index.
+    let thread_id = gid.y * params.threads_per_row + gid.x;
+
+    // Mixed-radix decode: refout varies fastest, then refin, xorout, init, poly.
+    let stride_refout = 1u;
+    let stride_refin = params.refout_count;
+    let stride_xorout = params.refin_count * stride_refin;
+    let stride_init = params.xorout_count * stride_xorout;
+    let stride_poly = params.init_count * stride_init;
+
+    if thread_id >= stride_poly * params.poly_count { return; }
+
+    let poly_idx = thread_id / stride_poly;
+    let r1 = thread_id % stride_poly;
+    let init_idx = r1 / stride_init;
+    let r2 = r1 % stride_init;
+    let xorout_idx = r2 / stride_xorout;
+    let r3 = r2 % stride_xorout;
+    let refin_idx = r3 / stride_refin;
+    let refout_idx = r3 % stride_refin;
+
+    let poly = (params.poly_min + poly_idx) & params.mask;
+    let init = (params.init_min + init_idx) & params.mask;
+    let xorout = (params.xorout_min + xorout_idx) & params.mask;
+    let refin = (params.refin_start + refin_idx) != 0u;
+    let refout = (params.refout_start + refout_idx) != 0u;
+
+    // Candidate survives only if it reproduces every observed CRC.
+    for (var t = 0u; t < params.test_case_count; t++) {
+        let tc = test_cases[t];
+        let got = compute_crc(tc.data_offset, tc.data_len, poly, init, xorout, refin, refout);
+        if (got & tc.mask) != (tc.expected & tc.mask) { return; }
+    }
+
+    // Reserve a slot; entries past max_results are counted but not stored,
+    // which the host detects (raw_count > capacity) and handles by splitting.
+    let idx = atomicAdd(&result_count, 1u);
+    if idx < params.max_results {
+        results[idx] = ResultEntry(poly, init, xorout, u32(refin) | (u32(refout) << 1u));
+    }
+}