vibe coded a CRC reverse-engineering tool on the GPU, because why not

2026-04-12 23:05:19 +02:00
commit 973b4dba18
6 changed files with 881 additions and 0 deletions

.gitignore vendored Normal file

@@ -0,0 +1,2 @@
target
Cargo.lock

Cargo.toml Normal file

@@ -0,0 +1,9 @@
[package]
name = "crc_cracker"
version = "0.1.0"
edition = "2021"
[dependencies]
wgpu = "0.20"
bytemuck = { version = "1", features = ["derive"] }
pollster = "0.3"

src/cracker.rs Normal file

@@ -0,0 +1,156 @@
use std::ops::RangeInclusive;
/// A known input/output pair used to constrain the search.
/// Add multiple test cases to dramatically reduce false positives.
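///
/// Illustrative sketch (the bytes and CRC values below are made up):
///
/// ```ignore
/// // Full CRC observed:
/// let a = TestCase::exact(vec![0x01, 0x02, 0x03], 0x1D0F);
/// // Only the low byte of a 16-bit CRC field was captured:
/// let b = TestCase::masked(vec![0x01, 0x02, 0x03], 0x000F, 0x00FF);
/// ```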
pub struct TestCase {
/// Input bytes (can be a partial slice of a larger message)
pub data: Vec<u8>,
/// The CRC value to match against (only bits set in `mask` are checked)
pub expected: u32,
/// Which bits of the CRC are known.
/// `u32::MAX` (or the width mask) means all bits must match.
/// `0x00FF` means only the low byte must match, useful when you can only
/// observe part of the CRC field.
pub mask: u32,
}
impl TestCase {
/// All bits of the CRC must match.
pub fn exact(data: Vec<u8>, expected: u32) -> Self {
Self { data, expected, mask: u32::MAX }
}
/// Only the bits set in `mask` must match.
pub fn masked(data: Vec<u8>, expected: u32, mask: u32) -> Self {
Self { data, expected, mask }
}
}
/// Constraint on a single CRC parameter.
#[derive(Clone, Debug)]
pub enum ValueRange {
/// Exactly one value to try
Fixed(u32),
/// Try every value in this inclusive range
Range(RangeInclusive<u32>),
}
impl ValueRange {
pub fn min(&self) -> u32 {
match self {
Self::Fixed(v) => *v,
Self::Range(r) => *r.start(),
}
}
/// Number of values to try. Note: a full `0..=u32::MAX` range would
/// overflow this `u32` count.
pub fn count(&self) -> u32 {
match self {
Self::Fixed(_) => 1,
Self::Range(r) => r.end() - r.start() + 1,
}
}
}
/// Constraints for the brute-force search.
///
/// The GPU will try every combination within these bounds, testing each
/// candidate against all provided `TestCase`s. Only configurations that
/// pass every test case are returned.
pub struct Constraints {
/// CRC register width in bits — any width from 1 to 32 works
/// (the example search in main.rs uses a 13-bit CRC).
pub width: u8,
/// Generator polynomial (without the implicit leading 1).
pub poly: ValueRange,
/// Initial value loaded into the CRC register.
pub init: ValueRange,
/// Value XOR'd into the final CRC before output.
pub xorout: ValueRange,
/// Whether each input byte is bit-reversed before entering the CRC register.
///
/// - `Some(false)` — MSB-first (textbook CRCs, CAN, parallel-bus / memory protocols)
/// - `Some(true)` — LSB-first (most serial-line protocols: Ethernet, USB,
/// Bluetooth, Dallas 1-Wire — bits arrive LSB-first on the wire)
/// - `None` — unknown; the GPU tries both and returns whichever matches
pub refin: Option<bool>,
/// Whether the final CRC register value is bit-reversed before the xorout step.
///
/// - `Some(false)` — register used as-is
/// - `Some(true)` — register is reflected before XOR-out
/// - `None` — unknown; the GPU tries both
///
/// `refin` and `refout` are almost always equal.
/// `(true, true)` covers most real-world CRC-16/32 algorithms (CRC-32/ISO-HDLC,
/// CRC-16/ARC (a.k.a. CRC-16/IBM), CRC-16/MODBUS, …).
/// `(false, false)` covers most CRC-8 variants and simple checksum algorithms.
pub refout: Option<bool>,
}
impl Constraints {
pub fn refin_start(&self) -> u32 {
match self.refin {
Some(true) => 1,
_ => 0,
}
}
pub fn refin_count(&self) -> u32 {
if self.refin.is_none() { 2 } else { 1 }
}
pub fn refout_start(&self) -> u32 {
match self.refout {
Some(true) => 1,
_ => 0,
}
}
pub fn refout_count(&self) -> u32 {
if self.refout.is_none() { 2 } else { 1 }
}
/// Total number of (poly, init, xorout, refin, refout) combinations.
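///
/// Worked example, using the search configured in main.rs:
/// 0x2000 polys × 0x2000 inits × 1 xorout × 2 refin × 2 refout
/// = 8192 * 8192 * 1 * 2 * 2 = 268_435_456 combinations.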
pub fn total_combinations(&self) -> u64 {
self.poly.count() as u64
* self.init.count() as u64
* self.xorout.count() as u64
* self.refin_count() as u64
* self.refout_count() as u64
}
pub fn mask(&self) -> u32 {
if self.width == 32 {
u32::MAX
} else {
(1u32 << self.width) - 1
}
}
}
/// A matching CRC configuration found during the search.
#[derive(Debug, Clone, Copy)]
pub struct CrcParams {
pub poly: u32,
pub init: u32,
pub xorout: u32,
pub refin: bool,
pub refout: bool,
}
impl CrcParams {
/// Format as a human-readable CRC descriptor.
pub fn display(&self, width: u8) -> String {
// ceil(width / 4) hex digits so CRC-13 shows 4 digits, CRC-5 shows 2, etc.
let hex_w = ((width + 3) / 4) as usize;
format!(
"CRC-{}: poly=0x{:0>w$X} init=0x{:0>w$X} xorout=0x{:0>w$X} refin={:<5} refout={}",
width,
self.poly,
self.init,
self.xorout,
self.refin,
self.refout,
w = hex_w,
)
}
}

src/gpu.rs Normal file

@@ -0,0 +1,531 @@
use std::collections::VecDeque;
use std::io::Write;
use bytemuck::{Pod, Zeroable};
use wgpu::util::DeviceExt;
use crate::cracker::{Constraints, CrcParams, TestCase};
use std::time::{Duration, Instant};
// ─── GPU-side layouts (must match shader.wgsl exactly) ───────────────────────
#[repr(C)]
#[derive(Clone, Copy, Pod, Zeroable)]
struct SearchParams {
width: u32,
mask: u32,
poly_min: u32,
poly_count: u32,
init_min: u32,
init_count: u32,
xorout_min: u32,
xorout_count: u32,
refin_start: u32,
refin_count: u32,
refout_start: u32,
refout_count: u32,
threads_per_row: u32,
max_results: u32,
test_case_count: u32,
_pad: u32,
}
#[repr(C)]
#[derive(Clone, Copy, Pod, Zeroable)]
struct GpuTestCase {
data_offset: u32,
data_len: u32,
expected: u32,
mask: u32,
}
#[repr(C)]
#[derive(Clone, Copy, Pod, Zeroable)]
struct ResultEntry {
poly: u32,
init: u32,
xorout: u32,
flags: u32,
}
const RESULT_SIZE: usize = std::mem::size_of::<ResultEntry>();
// ─── Constants ────────────────────────────────────────────────────────────────
const WORKGROUP_SIZE: u32 = 256;
const MAX_DISPATCH_DIM: u32 = 65_535;
const MAX_BATCH_THREADS: u64 = u32::MAX as u64;
/// Result buffer ceiling per buffer slot.
/// Two slots are allocated, so GPU VRAM usage for results = 2 × this × 16 B.
/// 8 M × 2 × 16 B = 256 MB — comfortably fits in any modern discrete GPU.
const RESULTS_PER_BATCH: u32 = 8_000_000;
// ─── Per-slot GPU buffers ─────────────────────────────────────────────────────
struct Slot {
results_buf: wgpu::Buffer,
count_buf: wgpu::Buffer,
results_staging: wgpu::Buffer,
count_staging: wgpu::Buffer,
}
// ─── GPU state ────────────────────────────────────────────────────────────────
pub struct GpuCracker {
device: wgpu::Device,
queue: wgpu::Queue,
pipeline: wgpu::ComputePipeline,
bind_group_layout: wgpu::BindGroupLayout,
}
impl GpuCracker {
pub async fn new() -> Self {
let instance = wgpu::Instance::default();
let adapter = instance
.request_adapter(&wgpu::RequestAdapterOptions {
power_preference: wgpu::PowerPreference::HighPerformance,
force_fallback_adapter: false,
compatible_surface: None,
})
.await
.expect("no GPU adapter found");
let info = adapter.get_info();
println!("GPU: {} ({:?})", info.name, info.backend);
let (device, queue) = adapter
.request_device(
&wgpu::DeviceDescriptor {
label: Some("crc_cracker"),
required_features: wgpu::Features::empty(),
required_limits: wgpu::Limits::default(),
},
None,
)
.await
.expect("failed to open GPU device");
let shader = device.create_shader_module(wgpu::ShaderModuleDescriptor {
label: Some("crc_shader"),
source: wgpu::ShaderSource::Wgsl(include_str!("shader.wgsl").into()),
});
let bind_group_layout =
device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
label: Some("crc_bgl"),
entries: &[
buf_layout(0, true),
buf_layout(1, true),
uniform_layout(2),
buf_layout(3, false),
buf_layout(4, false),
],
});
let pipeline_layout =
device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
label: Some("crc_pipeline_layout"),
bind_group_layouts: &[&bind_group_layout],
push_constant_ranges: &[],
});
let pipeline =
device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
label: Some("crc_pipeline"),
layout: Some(&pipeline_layout),
module: &shader,
entry_point: "main",
compilation_options: wgpu::PipelineCompilationOptions::default(),
});
Self { device, queue, pipeline, bind_group_layout }
}
// ── Public API ────────────────────────────────────────────────────────────
/// Brute-force all matching CRC configurations, writing each to `out`.
/// Returns the total match count. No results are dropped, except when a
/// single poly value alone overflows the result buffer; that case prints
/// a warning and caps that poly's output.
///
/// Pipeline:
/// - Work is split into batches (≤ u32::MAX threads each).
/// - Two GPU buffer slots alternate (double-buffer).
/// - GPU runs batch N while CPU drains batch N-1 to disk — no idle gaps.
/// - If a batch's results overflow the buffer it is split into two
/// half-batches and re-queued; no count-only pass needed.
pub fn run(
&self,
test_cases: &[TestCase],
constraints: &Constraints,
out: &mut impl Write,
) -> u64 {
assert!(!test_cases.is_empty(), "need at least one test case");
let total = constraints.total_combinations();
let poly_count = constraints.poly.count() as u64;
let per_poly = total / poly_count;
let polys_per_batch = ((MAX_BATCH_THREADS / per_poly).max(1) as u32)
.min(constraints.poly.count());
let num_batches = constraints.poly.count().div_ceil(polys_per_batch);
println!(
"search space: {} combinations \
({} poly × {} init × {} xorout × {}× refin × {}× refout) \
{} batch(es) initially",
total,
constraints.poly.count(),
constraints.init.count(),
constraints.xorout.count(),
constraints.refin_count(),
constraints.refout_count(),
num_batches,
);
// Shared input buffers — never change between batches.
let (data_buf, tc_buf) = self.upload_test_data(test_cases);
// Work queue — starts with the initial poly-range slices.
// Overflow adds split sub-ranges back here.
let mut work: VecDeque<(u32, u32)> = {
let poly_min = constraints.poly.min();
(0..constraints.poly.count())
.step_by(polys_per_batch as usize)
.map(|off| {
let cnt = polys_per_batch.min(constraints.poly.count() - off);
(poly_min + off, cnt)
})
.collect()
};
// Two GPU buffer slots for double-buffering.
let slots = [self.create_slot(), self.create_slot()];
let mut active = 0usize; // slot to submit the next batch into
// Currently in-flight submission: (submission_index, slot, poly_min, poly_count)
let mut pending: Option<(wgpu::SubmissionIndex, usize, u32, u32)> = None;
let mut total_found = 0u64;
let mut done_work: u64 = 0;
let total_work = total;
let mut last_update = Instant::now();
let update_interval = Duration::from_millis(33); // ~30 progress updates per second
loop {
// ── Submit next batch (if any) ────────────────────────────────────
// Do this BEFORE draining the pending batch so the GPU stays busy.
let new_pending = work.pop_front().map(|(poly_min, poly_count)| {
let si = self.submit(
&slots[active], &data_buf, &tc_buf,
constraints, poly_min, poly_count,
test_cases.len() as u32,
);
let result = (si, active, poly_min, poly_count);
active = 1 - active;
result
});
// ── Drain the previous batch ──────────────────────────────────────
// GPU is now running `new_pending` concurrently with this drain.
if let Some((p_si, p_slot, p_poly_min, p_poly_count)) = pending.take() {
// Schedule mapping BEFORE polling. The callback fires inside the
// poll call below, so get_mapped_range() works right after it.
let count_slice = slots[p_slot].count_staging.slice(..4);
let results_slice = slots[p_slot].results_staging.slice(..);
count_slice.map_async(wgpu::MapMode::Read, |_| {});
results_slice.map_async(wgpu::MapMode::Read, |_| {});
// Wait for this batch. GPU continues running new_pending.
self.device.poll(wgpu::Maintain::WaitForSubmissionIndex(p_si));
let raw_count = {
let v = count_slice.get_mapped_range();
u32::from_le_bytes(v[..4].try_into().unwrap())
};
slots[p_slot].count_staging.unmap();
let mut requeued = false;
if raw_count > RESULTS_PER_BATCH {
// This batch produced more matches than the buffer holds.
// Split it into two half-sized poly ranges and re-queue at
// the front so they run before the remaining original batches.
slots[p_slot].results_staging.unmap();
let half = p_poly_count / 2;
if half > 0 {
work.push_front((p_poly_min + half, p_poly_count - half));
work.push_front((p_poly_min, half));
requeued = true;
} else {
// Single poly value with > RESULTS_PER_BATCH matches.
// This means almost every (init, xorout, refin, refout)
// combination matches — add more test cases to constrain.
let capped = RESULTS_PER_BATCH;
eprintln!(
"warning: poly={:#X} produced {} results, \
buffer capped at {capped}. Add more test cases.",
p_poly_min, raw_count,
);
self.write_results(
&slots[p_slot].results_staging, capped, constraints.width, out,
);
total_found += capped as u64;
}
} else {
self.write_results(
&slots[p_slot].results_staging, raw_count, constraints.width, out,
);
total_found += raw_count as u64;
}
// Count this batch toward progress only if it actually completed;
// re-queued halves are counted when they finish (otherwise the bar
// would double-count split batches and run past 100%).
if !requeued {
let batch_work = p_poly_count as u64
* constraints.init.count() as u64
* constraints.xorout.count() as u64
* constraints.refin_count() as u64
* constraints.refout_count() as u64;
done_work += batch_work;
}
if last_update.elapsed() >= update_interval {
let progress = done_work as f64 / total_work as f64;
let percent = progress * 100.0;
// simple text bar (cheap, no allocations)
let bar_width = 30;
let filled = (progress * bar_width as f64) as usize;
let mut bar = String::with_capacity(bar_width);
for i in 0..bar_width {
bar.push(if i < filled { '#' } else { '-' });
}
print!("[{}] {:6.2}% ({}/{})\r", bar, percent, done_work, total_work);
std::io::stdout().flush().ok();
last_update = Instant::now();
}
}
// ── Advance state ─────────────────────────────────────────────────
match new_pending {
Some(info) => pending = Some(info),
// `pending` was drained above, so once nothing new was submitted
// and the overflow queue is empty, the search is complete.
None if work.is_empty() => break,
None => { /* no new submission; an overflow split queued more work */ }
}
}
println!();
total_found
}
// ── Internals ─────────────────────────────────────────────────────────────
fn create_slot(&self) -> Slot {
let d = &self.device;
let results_size = RESULTS_PER_BATCH as u64 * RESULT_SIZE as u64;
Slot {
results_buf: d.create_buffer(&wgpu::BufferDescriptor {
label: Some("results_buf"),
size: results_size,
usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC,
mapped_at_creation: false,
}),
count_buf: d.create_buffer(&wgpu::BufferDescriptor {
label: Some("count_buf"),
size: 4,
usage: wgpu::BufferUsages::STORAGE
| wgpu::BufferUsages::COPY_SRC
| wgpu::BufferUsages::COPY_DST,
mapped_at_creation: false,
}),
results_staging: d.create_buffer(&wgpu::BufferDescriptor {
label: Some("results_staging"),
size: results_size,
usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
mapped_at_creation: false,
}),
count_staging: d.create_buffer(&wgpu::BufferDescriptor {
label: Some("count_staging"),
size: 4,
usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
mapped_at_creation: false,
}),
}
}
fn upload_test_data(
&self,
test_cases: &[TestCase],
) -> (wgpu::Buffer, wgpu::Buffer) {
let mut flat: Vec<u8> = Vec::new();
let mut gpu: Vec<GpuTestCase> = Vec::new();
for tc in test_cases {
gpu.push(GpuTestCase {
data_offset: flat.len() as u32,
data_len: tc.data.len() as u32,
expected: tc.expected,
mask: tc.mask,
});
flat.extend_from_slice(&tc.data);
}
while flat.len() % 4 != 0 { flat.push(0); }
let d = &self.device;
(
d.create_buffer_init(&wgpu::util::BufferInitDescriptor {
label: Some("data_buf"),
contents: &flat,
usage: wgpu::BufferUsages::STORAGE,
}),
d.create_buffer_init(&wgpu::util::BufferInitDescriptor {
label: Some("tc_buf"),
contents: bytemuck::cast_slice(&gpu),
usage: wgpu::BufferUsages::STORAGE,
}),
)
}
/// Build and submit one compute pass. Returns immediately — the caller
/// decides when to wait via `WaitForSubmissionIndex`.
fn submit(
&self,
slot: &Slot,
data_buf: &wgpu::Buffer,
tc_buf: &wgpu::Buffer,
constraints: &Constraints,
poly_min: u32,
poly_count: u32,
tc_count: u32,
) -> wgpu::SubmissionIndex {
let batch_total = poly_count as u64
* constraints.init.count() as u64
* constraints.xorout.count() as u64
* constraints.refin_count() as u64
* constraints.refout_count() as u64;
let total_u32 = u32::try_from(batch_total)
.expect("batch exceeds u32::MAX GPU threads");
let total_wg = total_u32.div_ceil(WORKGROUP_SIZE);
let dispatch_x = total_wg.min(MAX_DISPATCH_DIM);
let dispatch_y = total_wg.div_ceil(dispatch_x);
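// Worked example: the main.rs search fits in one batch:
// 8192 polys × (8192 init × 1 xorout × 2 refin × 2 refout) = 268_435_456
// threads → total_wg = 1_048_576 → dispatch_x = 65_535, dispatch_y = 17.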
let sp = SearchParams {
width: constraints.width as u32,
mask: constraints.mask(),
poly_min,
poly_count,
init_min: constraints.init.min(),
init_count: constraints.init.count(),
xorout_min: constraints.xorout.min(),
xorout_count: constraints.xorout.count(),
refin_start: constraints.refin_start(),
refin_count: constraints.refin_count(),
refout_start: constraints.refout_start(),
refout_count: constraints.refout_count(),
threads_per_row: dispatch_x * WORKGROUP_SIZE,
max_results: RESULTS_PER_BATCH,
test_case_count: tc_count,
_pad: 0,
};
let d = &self.device;
let params_buf = d.create_buffer_init(&wgpu::util::BufferInitDescriptor {
label: Some("params_buf"),
contents: bytemuck::bytes_of(&sp),
usage: wgpu::BufferUsages::UNIFORM,
});
let bg = d.create_bind_group(&wgpu::BindGroupDescriptor {
label: Some("crc_bg"),
layout: &self.bind_group_layout,
entries: &[
wgpu::BindGroupEntry { binding: 0, resource: data_buf.as_entire_binding() },
wgpu::BindGroupEntry { binding: 1, resource: tc_buf.as_entire_binding() },
wgpu::BindGroupEntry { binding: 2, resource: params_buf.as_entire_binding() },
wgpu::BindGroupEntry { binding: 3, resource: slot.results_buf.as_entire_binding() },
wgpu::BindGroupEntry { binding: 4, resource: slot.count_buf.as_entire_binding() },
],
});
let mut enc = d.create_command_encoder(
&wgpu::CommandEncoderDescriptor { label: Some("crc_enc") }
);
enc.clear_buffer(&slot.count_buf, 0, None);
{
let mut pass = enc.begin_compute_pass(&wgpu::ComputePassDescriptor {
label: Some("crc_pass"),
timestamp_writes: None,
});
pass.set_pipeline(&self.pipeline);
pass.set_bind_group(0, &bg, &[]);
pass.dispatch_workgroups(dispatch_x, dispatch_y, 1);
}
let results_copy_size = RESULTS_PER_BATCH as u64 * RESULT_SIZE as u64;
enc.copy_buffer_to_buffer(&slot.count_buf, 0, &slot.count_staging, 0, 4);
enc.copy_buffer_to_buffer(&slot.results_buf, 0, &slot.results_staging, 0, results_copy_size);
self.queue.submit(std::iter::once(enc.finish()))
}
/// Read `count` result entries from a slot's staging buffer (which must
/// already be mapped) and write them to `out`.
fn write_results(
&self,
results_staging: &wgpu::Buffer,
count: u32,
width: u8,
out: &mut impl Write,
) {
if count == 0 {
results_staging.unmap();
return;
}
let byte_count = count as usize * RESULT_SIZE;
{
let view = results_staging.slice(..byte_count as u64).get_mapped_range();
for r in bytemuck::cast_slice::<u8, ResultEntry>(&view) {
writeln!(out, "{}", CrcParams {
poly: r.poly,
init: r.init,
xorout: r.xorout,
refin: (r.flags & 1) != 0,
refout: (r.flags & 2) != 0,
}.display(width))
.expect("write to output failed");
}
}
results_staging.unmap();
}
}
// ─── Layout helpers ───────────────────────────────────────────────────────────
fn buf_layout(binding: u32, read_only: bool) -> wgpu::BindGroupLayoutEntry {
wgpu::BindGroupLayoutEntry {
binding,
visibility: wgpu::ShaderStages::COMPUTE,
ty: wgpu::BindingType::Buffer {
ty: wgpu::BufferBindingType::Storage { read_only },
has_dynamic_offset: false,
min_binding_size: None,
},
count: None,
}
}
fn uniform_layout(binding: u32) -> wgpu::BindGroupLayoutEntry {
wgpu::BindGroupLayoutEntry {
binding,
visibility: wgpu::ShaderStages::COMPUTE,
ty: wgpu::BindingType::Buffer {
ty: wgpu::BufferBindingType::Uniform,
has_dynamic_offset: false,
min_binding_size: None,
},
count: None,
}
}

src/main.rs Normal file

@@ -0,0 +1,52 @@
mod cracker;
mod gpu;
use std::fs::File;
use std::io::BufWriter;
use cracker::{Constraints, TestCase, ValueRange};
use gpu::GpuCracker;
fn main() {
pollster::block_on(run());
}
async fn run() {
let cracker = GpuCracker::new().await;
let mut out = BufWriter::new(
File::create("results.txt").expect("cannot create results.txt"),
);
let count = cracker.run(
&[
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x00], 0x94, 0x00FF),
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x01], 0x61, 0x00FF),
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x02], 0x8B, 0x00FF),
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x03], 0x7E, 0x00FF),
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x04], 0xAA, 0x00FF),
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x05], 0x5F, 0x00FF),
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x06], 0xB5, 0x00FF),
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x07], 0x40, 0x00FF),
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x08], 0xE8, 0x00FF),
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x09], 0x1D, 0x00FF),
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x0A], 0xF7, 0x00FF),
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x0B], 0x02, 0x00FF),
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x0C], 0xD6, 0x00FF),
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x0D], 0x23, 0x00FF),
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x0E], 0xC9, 0x00FF),
TestCase::masked(vec![0x01, 0x23, 0x45, 0x67, 0x89, 0x0F], 0x3C, 0x00FF),
],
&Constraints {
width: 13,
poly: ValueRange::Range(0x0000..=0x1FFF),
init: ValueRange::Range(0x0000..=0x1FFF),
xorout: ValueRange::Fixed(0x0000),
refin: None,
refout: None,
},
&mut out,
);
println!("found {count} match(es) — written to results.txt");
}
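// CPU-side reference of the same bit-by-bit CRC the shader computes; a
// minimal sketch for spot-checking hits from results.txt. Not part of the
// search path — the name and signature here are ad hoc.
#[allow(dead_code)]
fn crc_reference(data: &[u8], width: u8, poly: u32, init: u32, xorout: u32, refin: bool, refout: bool) -> u32 {
let mask = if width == 32 { u32::MAX } else { (1u32 << width) - 1 };
let msb = 1u32 << (width - 1);
let mut crc = init & mask;
for &b in data {
// Optionally reflect each input byte, as the shader's refin path does.
let byte = if refin { b.reverse_bits() } else { b };
for bit in 0..8 {
let in_bit = ((byte >> (7 - bit)) & 1) != 0;
// Shift left, XOR in the polynomial when (top bit ^ input bit) is set.
let feedback = if ((crc & msb) != 0) != in_bit { poly } else { 0 };
crc = ((crc << 1) & mask) ^ feedback;
}
}
if refout {
// Reflect the register within `width` bits, matching reflect_bits().
crc = crc.reverse_bits() >> (32 - width as u32);
}
(crc ^ xorout) & mask
}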

src/shader.wgsl Normal file

@@ -0,0 +1,131 @@
// ─── Structs ─────────────────────────────────────────────────────────────────
struct SearchParams {
width: u32,
mask: u32,
poly_min: u32,
poly_count: u32,
init_min: u32,
init_count: u32,
xorout_min: u32,
xorout_count: u32,
refin_start: u32,
refin_count: u32,
refout_start: u32,
refout_count: u32,
threads_per_row: u32,
max_results: u32,
test_case_count: u32,
_pad: u32,
}
struct GpuTestCase {
data_offset: u32,
data_len: u32,
expected: u32,
mask: u32,
}
struct ResultEntry {
poly: u32,
init: u32,
xorout: u32,
flags: u32, // bit 0 = refin, bit 1 = refout
}
// ─── Bindings ─────────────────────────────────────────────────────────────────
@group(0) @binding(0) var<storage, read> data_buf: array<u32>;
@group(0) @binding(1) var<storage, read> test_cases: array<GpuTestCase>;
@group(0) @binding(2) var<uniform> params: SearchParams;
@group(0) @binding(3) var<storage, read_write> results: array<ResultEntry>;
@group(0) @binding(4) var<storage, read_write> result_count: atomic<u32>;
// ─── Helpers ──────────────────────────────────────────────────────────────────
fn reflect_bits(val: u32, width: u32) -> u32 {
var r = 0u;
var v = val;
for (var i = 0u; i < width; i++) {
r = (r << 1u) | (v & 1u);
v >>= 1u;
}
return r;
}
fn load_byte(byte_idx: u32) -> u32 {
return (data_buf[byte_idx >> 2u] >> ((byte_idx & 3u) << 3u)) & 0xFFu;
}
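// e.g. byte_idx = 5 → word data_buf[1], shift 8: this matches the
// little-endian byte packing of the flat Vec<u8> that upload_test_data()
// uploads on the Rust side.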
// ─── CRC computation ──────────────────────────────────────────────────────────
// Bit-by-bit CRC — works for any width from 1 to 32.
// Inner loop is branchless: select() compiles to a conditional move with no
// warp divergence.
fn compute_crc(
data_start: u32,
data_len: u32,
poly: u32,
init: u32,
xorout: u32,
refin: bool,
refout: bool,
) -> u32 {
let msb_shift = params.width - 1u;
let mask = params.mask;
var crc = init & mask;
for (var i = 0u; i < data_len; i++) {
var byte = load_byte(data_start + i);
if refin { byte = reflect_bits(byte, 8u); }
for (var b = 0u; b < 8u; b++) {
let feedback = select(0u, poly, ((crc >> msb_shift) ^ ((byte >> (7u - b)) & 1u)) != 0u);
crc = ((crc << 1u) & mask) ^ feedback;
}
}
if refout { crc = reflect_bits(crc, params.width); }
return (crc ^ xorout) & mask;
}
// ─── Entry point ─────────────────────────────────────────────────────────────
@compute @workgroup_size(256, 1, 1)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
let thread_id = gid.y * params.threads_per_row + gid.x;
let stride_refout = 1u;
let stride_refin = params.refout_count;
let stride_xorout = params.refin_count * stride_refin;
let stride_init = params.xorout_count * stride_xorout;
let stride_poly = params.init_count * stride_init;
if thread_id >= stride_poly * params.poly_count { return; }
let poly_idx = thread_id / stride_poly;
let r1 = thread_id % stride_poly;
let init_idx = r1 / stride_init;
let r2 = r1 % stride_init;
let xorout_idx = r2 / stride_xorout;
let r3 = r2 % stride_xorout;
let refin_idx = r3 / stride_refin;
let refout_idx = r3 % stride_refin;
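// Worked example with the main.rs search (init_count = 8192, xorout_count = 1,
// refin_count = 2, refout_count = 2): the strides are refout 1, refin 2,
// xorout 4, init 4, poly 32768, so thread_id 70000 decodes to poly_idx 2,
// init_idx 1116, xorout_idx 0, refin_idx 0, refout_idx 0.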
let poly = (params.poly_min + poly_idx) & params.mask;
let init = (params.init_min + init_idx) & params.mask;
let xorout = (params.xorout_min + xorout_idx) & params.mask;
let refin = (params.refin_start + refin_idx) != 0u;
let refout = (params.refout_start + refout_idx) != 0u;
for (var t = 0u; t < params.test_case_count; t++) {
let tc = test_cases[t];
let got = compute_crc(tc.data_offset, tc.data_len, poly, init, xorout, refin, refout);
if (got & tc.mask) != (tc.expected & tc.mask) { return; }
}
let idx = atomicAdd(&result_count, 1u);
if idx < params.max_results {
results[idx] = ResultEntry(poly, init, xorout, u32(refin) | (u32(refout) << 1u));
}
}