yetanotherco · ColoCarletti · May 6, 2026 · May 6, 2026 · May 6, 2026 · May 6, 2026
diff --git a/crypto/math-cuda/build.rs b/crypto/math-cuda/build.rs
@@ -112,4 +112,7 @@ fn main() {
     compile_ptx("ntt.cu", "ntt.ptx", have_nvcc);
     compile_ptx("keccak.cu", "keccak.ptx", have_nvcc);
     compile_ptx("barycentric.cu", "barycentric.ptx", have_nvcc);
+    compile_ptx("inverse.cu", "inverse.ptx", have_nvcc);
+    compile_ptx("deep.cu", "deep.ptx", have_nvcc);
+    compile_ptx("fri.cu", "fri.ptx", have_nvcc);
 }
diff --git a/crypto/math-cuda/kernels/deep.cu b/crypto/math-cuda/kernels/deep.cu
@@ -0,0 +1,117 @@
+// R4 deep composition polynomial evaluations.
+//
+// For each trace-size row i in 0..domain_size, accumulate:
+//   result_i = sum over j of gamma_j * (H_j(x_i) - H_j(z^K)) * inv_h[i]               (H terms)
+//            + sum over j,k of gamma'_{j,k} * (t_j(x_i) - t_j(z*w^k)) * inv_t[k,i]    (trace)
+//
+// where x_i = LDE coset point at stride `blowup_factor` (so the kernel
+// reads LDE column data at `i * blowup_factor`). `j` ranges over
+// num_parts for H-terms and num_total_cols (= num_main + num_aux) for
+// trace terms. `k` ranges over num_eval_points.
+//
+// Buffer layouts (ALL on device):
+//   main_lde    base field, one contiguous column per c: main_lde[c * lde_stride + r]
+//   aux_lde     ext3 de-interleaved: aux_lde[(c*3 + k) * lde_stride + r]
+//   h_lde       ext3 de-interleaved: h_lde[(p*3 + k) * lde_stride + r]
+//   h_ood       num_parts * 3  (ext3 interleaved)
+//   trace_ood   num_total_cols * num_eval_points * 3 (ext3 interleaved,
+//               indexed as (col_idx * num_eval_points + k) * 3 + comp)
+//   gammas_h    num_parts * 3
+//   gammas_tr   num_total_cols * num_eval_points * 3
+//   inv_h       domain_size * 3
+//   inv_t       num_eval_points * domain_size * 3
+//   deep_out    domain_size * 3 (ext3 interleaved; caller reinterprets)
+
+#include "goldilocks.cuh"
+#include "ext3.cuh"
+
+// Reads three consecutive u64 words starting at `p` as one ext3 element
+// (components a, b, c in that order).
+static __device__ __forceinline__ ext3::Fe3 deep_load_fe3(const uint64_t *p) {
+    ext3::Fe3 v = {p[0], p[1], p[2]};
+    return v;
+}
+
+// Reads entry `row` of ext3 column `col` from a de-interleaved buffer in
+// which component k of column c starts at offset (c * 3 + k) * lde_stride.
+static __device__ __forceinline__ ext3::Fe3 deep_load_planar(
+    const uint64_t *buf,
+    uint64_t col,
+    uint64_t lde_stride,
+    uint64_t row) {
+    ext3::Fe3 v = {
+        buf[(col * 3 + 0) * lde_stride + row],
+        buf[(col * 3 + 1) * lde_stride + row],
+        buf[(col * 3 + 2) * lde_stride + row],
+    };
+    return v;
+}
+
+// deep_composition_ext3_row: one thread per output index i in
+// 0..domain_size (1D launch; only the .x grid/block dimensions are used,
+// and threads with i >= domain_size exit immediately).
+//
+// Thread i reads LDE entry `row = i * blowup_factor` of every column and
+// writes one ext3 value to deep_out[i*3 .. i*3+2]:
+//
+//   sum_j   gammas_h[j]     * (h_lde[j][row]  - h_ood[j])        * inv_h[i]
+// + sum_j,k gammas_tr[j,k]  * (trace[j][row]  - trace_ood[j,k])  * inv_t[k,i]
+//
+// Parameters:
+//   main_lde        base-field columns, main_lde[c * lde_stride + row]
+//   aux_lde, h_lde  ext3 columns, de-interleaved:
+//                   buf[(c * 3 + comp) * lde_stride + row]
+//   lde_stride      distance in u64 words between consecutive column planes
+//                   (presumably row < lde_stride; not checked here)
+//   num_main        number of base-field trace columns
+//   num_aux         number of ext3 trace columns; their trace_ood/gammas_tr
+//                   entries follow the main ones (column index num_main + j)
+//   num_parts       number of H columns
+//   num_eval_points number of OOD evaluation points per trace column
+//   h_ood, gammas_h ext3 interleaved, entry j at [j * 3]
+//   trace_ood, gammas_tr
+//                   ext3 interleaved, entry (col, k) at
+//                   [(col * num_eval_points + k) * 3]
+//   inv_h           ext3 interleaved, entry i at [i * 3]
+//   inv_t           ext3 interleaved, entry (k, i) at [(k * domain_size + i) * 3]
+//   deep_out        ext3 interleaved output, domain_size entries
+extern "C" __global__ void deep_composition_ext3_row(
+    const uint64_t *main_lde,
+    const uint64_t *aux_lde,
+    const uint64_t *h_lde,
+    uint64_t lde_stride,
+    uint64_t num_main,
+    uint64_t num_aux,
+    uint64_t num_parts,
+    uint64_t num_eval_points,
+    uint64_t blowup_factor,
+    uint64_t domain_size,
+    const uint64_t *h_ood,
+    const uint64_t *trace_ood,
+    const uint64_t *gammas_h,
+    const uint64_t *gammas_tr,
+    const uint64_t *inv_h,
+    const uint64_t *inv_t,
+    uint64_t *deep_out) {
+    uint64_t i = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= domain_size) return;
+    uint64_t row = i * blowup_factor;
+
+    ext3::Fe3 result = ext3::zero();
+    ext3::Fe3 inv_h_i = deep_load_fe3(inv_h + i * 3);
+
+    // H-terms: every part shares the same denominator inverse inv_h[i].
+    for (uint64_t j = 0; j < num_parts; ++j) {
+        ext3::Fe3 h_val = deep_load_planar(h_lde, j, lde_stride, row);
+        ext3::Fe3 h_ood_j = deep_load_fe3(h_ood + j * 3);
+        ext3::Fe3 num = ext3::sub(h_val, h_ood_j);
+        ext3::Fe3 gamma = deep_load_fe3(gammas_h + j * 3);
+        ext3::Fe3 tmp = ext3::mul(gamma, num);
+        tmp = ext3::mul(tmp, inv_h_i);
+        result = ext3::add(result, tmp);
+    }
+
+    // Main trace terms (base column - ext3 OOD). The column value only
+    // affects component a of the numerator; b and c are the negated OOD
+    // components.
+    for (uint64_t j = 0; j < num_main; ++j) {
+        uint64_t t_val = main_lde[j * lde_stride + row];
+        for (uint64_t k = 0; k < num_eval_points; ++k) {
+            uint64_t idx = (j * num_eval_points + k) * 3;
+            ext3::Fe3 t_ood = deep_load_fe3(trace_ood + idx);
+            ext3::Fe3 num = {
+                goldilocks::sub(t_val, t_ood.a),
+                goldilocks::neg(t_ood.b),
+                goldilocks::neg(t_ood.c),
+            };
+            ext3::Fe3 gamma = deep_load_fe3(gammas_tr + idx);
+            ext3::Fe3 inv_t_ki = deep_load_fe3(inv_t + (k * domain_size + i) * 3);
+            ext3::Fe3 tmp = ext3::mul(gamma, num);
+            tmp = ext3::mul(tmp, inv_t_ki);
+            result = ext3::add(result, tmp);
+        }
+    }
+
+    // Aux trace terms (ext3 column - ext3 OOD). Aux column j uses the
+    // trace_ood / gammas_tr slots of overall column num_main + j.
+    for (uint64_t j = 0; j < num_aux; ++j) {
+        ext3::Fe3 t_val = deep_load_planar(aux_lde, j, lde_stride, row);
+        uint64_t trace_j = num_main + j;
+        for (uint64_t k = 0; k < num_eval_points; ++k) {
+            uint64_t idx = (trace_j * num_eval_points + k) * 3;
+            ext3::Fe3 t_ood = deep_load_fe3(trace_ood + idx);
+            ext3::Fe3 num = ext3::sub(t_val, t_ood);
+            ext3::Fe3 gamma = deep_load_fe3(gammas_tr + idx);
+            ext3::Fe3 inv_t_ki = deep_load_fe3(inv_t + (k * domain_size + i) * 3);
+            ext3::Fe3 tmp = ext3::mul(gamma, num);
+            tmp = ext3::mul(tmp, inv_t_ki);
+            result = ext3::add(result, tmp);
+        }
+    }
+
+    uint64_t out_idx = i * 3;
+    deep_out[out_idx + 0] = result.a;
+    deep_out[out_idx + 1] = result.b;
+    deep_out[out_idx + 2] = result.c;
+}
diff --git a/crypto/math-cuda/kernels/fri.cu b/crypto/math-cuda/kernels/fri.cu
@@ -0,0 +1,59 @@
+// R4 FRI fold + twiddle-update kernels on device. The host orchestrator
+// loops log2(N) times: sample zeta on host, fold on device, keccak leaves
+// + tree on device, D2H the root, transcript-append on host, update
+// twiddles on device.
+//
+// Layout: ext3 evaluations are stored INTERLEAVED as
+// `[a0,b0,c0, a1,b1,c1, ...]`, same layout the deep-poly LDE output
+// already produces. Twiddles are base-field, one u64 per entry.
+
+#include "goldilocks.cuh"
+#include "ext3.cuh"
+
+// fri_fold_ext3: one FRI folding step, one thread per output element
+// (1D launch; threads with index >= n_out do nothing).
+//
+//   out[j] = (lo + hi) + inv_tw[j] * (zeta * (lo - hi))
+//
+// with lo = in[2j] and hi = in[2j+1]. lo, hi and zeta are ext3 elements;
+// inv_tw[j] is a single base-field word applied through ext3::mul_base.
+//
+// `in` holds 2 * n_out ext3 values (6 * n_out u64, interleaved a,b,c) and
+// `out` receives n_out ext3 values (3 * n_out u64, interleaved). The
+// result goes to a separate buffer: thread j stores words 3j..3j+2 of
+// `out`, which are words that thread j/2 loads from `in`, so `out` must
+// not alias `in`.
+extern "C" __global__ void fri_fold_ext3(
+    const uint64_t *in,        // 3 * 2*n_out u64 (ext3 interleaved)
+    uint64_t n_out,            // number of output ext3 elements (= N/2)
+    const uint64_t *inv_tw,    // n_out base-field twiddles
+    const uint64_t *zeta,      // 3 u64 (ext3)
+    uint64_t *out) {           // 3 * n_out u64 (ext3 interleaved)
+    uint64_t tid = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid < n_out) {
+        // The even/odd pair for this output occupies six consecutive words.
+        const uint64_t *pair = in + tid * 6;
+        ext3::Fe3 even = ext3::make(pair[0], pair[1], pair[2]);
+        ext3::Fe3 odd = ext3::make(pair[3], pair[4], pair[5]);
+
+        // zeta * (even - odd), then scaled by the base-field twiddle.
+        ext3::Fe3 challenge = ext3::make(zeta[0], zeta[1], zeta[2]);
+        ext3::Fe3 delta = ext3::sub(even, odd);
+        ext3::Fe3 weighted = ext3::mul(challenge, delta);
+        uint64_t twiddle = inv_tw[tid];
+        ext3::Fe3 scaled = ext3::mul_base(weighted, twiddle);
+
+        ext3::Fe3 total = ext3::add(even, odd);
+        ext3::Fe3 folded = ext3::add(total, scaled);
+
+        uint64_t *dst = out + tid * 3;
+        dst[0] = folded.a;
+        dst[1] = folded.b;
+        dst[2] = folded.c;
+    }
+}
+
+// fri_update_twiddles: new[j] = old[2j]^2 for j in 0..n_out, written in
+// place over `tw` (which must hold at least 2 * n_out entries).
+//
+// Launch: 1D, with at least n_out threads in total (same shape as a
+// one-thread-per-index launch).
+//
+// Why this is not simply `tw[j] = tw[2j]^2` per thread: index j is written
+// by thread j but read by thread j/2 (for even j). Those two threads have
+// no ordering between them, so the read could observe the already-updated
+// value. Instead every index is owned by exactly one thread:
+//   - thread 0 handles index 0 (reads and writes tw[0] only);
+//   - each odd thread j walks the chain j, 2j, 4j, ... while it is < n_out;
+//   - even non-zero threads do nothing (their index belongs to the chain of
+//     its odd part).
+// Within a chain, tw[2k] is read before the next step overwrites it, and
+// chains of different odd j touch disjoint indices, so the update is
+// race-free without any inter-thread synchronization. The cost is uneven
+// work: the longest chain is about log2(n_out) steps.
+extern "C" __global__ void fri_update_twiddles(
+    uint64_t *tw,
+    uint64_t n_out) {
+    uint64_t j = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x;
+    if (j >= n_out) return;
+
+    if (j == 0) {
+        // 2 * 0 == 0: the only self-referencing index.
+        uint64_t old = tw[0];
+        tw[0] = goldilocks::mul(old, old);
+        return;
+    }
+
+    // Even indices are updated by the thread that owns their odd part.
+    if ((j & 1) == 0) return;
+
+    // k < n_out, so 2 * k stays inside the 2 * n_out entries of `tw`.
+    for (uint64_t k = j; k < n_out; k *= 2) {
+        uint64_t old = tw[2 * k];
+        tw[k] = goldilocks::mul(old, old);
+    }
+}