Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
57 commits
Select commit Hold shift + click to select a range
d1a0abf
add first cuda files
ColoCarletti May 6, 2026
79634ff
fmt
ColoCarletti May 6, 2026
ac6fbb5
fix clippy
ColoCarletti May 6, 2026
2ceb3b0
gpu 2nd part
ColoCarletti May 6, 2026
affceb1
feat(cuda): Round 1 GPU LDE+commit dispatch + device-resident handles
ColoCarletti May 6, 2026
01172f2
merge main
ColoCarletti May 19, 2026
c4627e1
Merge branch 'main' into feat/cuda-pr2-r1-gpu-commits
ColoCarletti May 19, 2026
01aa5e4
comments fix
ColoCarletti May 20, 2026
cfc5c19
Merge branch 'main' into feat/cuda-pr2-r1-gpu-commits
MauroToscano May 21, 2026
ea5696f
Update crypto/stark/src/gpu_lde.rs
ColoCarletti May 21, 2026
a8cf265
Update crypto/stark/src/gpu_lde.rs
ColoCarletti May 21, 2026
fb8d31f
Update crypto/stark/src/gpu_lde.rs
ColoCarletti May 21, 2026
a79f2b5
Update crypto/stark/src/gpu_lde.rs
ColoCarletti May 21, 2026
761a2c0
Update crypto/stark/src/gpu_lde.rs
ColoCarletti May 21, 2026
e066e9d
address reviews
ColoCarletti May 21, 2026
7d3d0f0
fix review comments
ColoCarletti May 22, 2026
cf80771
Merge remote-tracking branch 'origin/main' into feat/cuda-pr2-r1-gpu-…
ColoCarletti May 22, 2026
71aba0d
address doc comment suggestions
ColoCarletti May 22, 2026
83d91b8
Merge branch 'main' into feat/cuda-pr2-r1-gpu-commits
ColoCarletti May 22, 2026
34cae4b
fix
ColoCarletti May 22, 2026
f076bf4
Merge branch 'main' into feat/cuda-pr2-r1-gpu-commits
gabrielbosio May 27, 2026
a2cde0f
Pass replay transcript to bus-balance call in verify_vm_minimal
gabrielbosio May 27, 2026
46c305b
Update crypto/math-cuda/src/device.rs
ColoCarletti May 28, 2026
aca3dca
Merge branch 'main' into feat/cuda-pr2-r1-gpu-commits
ColoCarletti May 28, 2026
63d7c00
Update crypto/math-cuda/src/device.rs
ColoCarletti May 29, 2026
eb16c02
Update crypto/math-cuda/src/device.rs
ColoCarletti May 29, 2026
66925b1
Update crypto/math-cuda/src/device.rs
ColoCarletti May 29, 2026
4e6daf3
Update crypto/math-cuda/src/lde.rs
ColoCarletti May 29, 2026
4cd27d9
Update crypto/math-cuda/src/lde.rs
ColoCarletti May 29, 2026
5fe390f
Update crypto/math-cuda/src/lde.rs
ColoCarletti May 29, 2026
5819930
Update crypto/math-cuda/src/lde.rs
ColoCarletti May 29, 2026
33f7c36
Update crypto/math-cuda/src/lde.rs
ColoCarletti May 29, 2026
49d3607
Merge branch 'main' into feat/cuda-pr2-r1-gpu-commits
ColoCarletti May 29, 2026
99cd59c
add pr3 code
ColoCarletti Jun 1, 2026
c52521e
Merge branch 'main' into feat/cuda-pr2-r1-gpu-commits
ColoCarletti Jun 1, 2026
828ee16
fix comments
ColoCarletti Jun 1, 2026
19a36a0
Merge remote-tracking branch 'origin/feat/cuda-pr2-r1-gpu-commits' in…
ColoCarletti Jun 1, 2026
80e1ecb
fix sync stream after D2H in merke.rs
ColoCarletti Jun 1, 2026
3ead022
Merge branch 'main' into feat/cuda-pr3
ColoCarletti Jun 1, 2026
04dd872
fix comments
ColoCarletti Jun 1, 2026
8a67e33
address review feedback
ColoCarletti Jun 1, 2026
1f9394d
Update crypto/math-cuda/src/barycentric.rs
ColoCarletti Jun 1, 2026
b07999c
Update crypto/math-cuda/src/barycentric.rs
ColoCarletti Jun 1, 2026
c575017
fix imports
ColoCarletti Jun 1, 2026
0ffc661
Merge branch 'feat/cuda-pr3' of github.com:yetanotherco/lambda_vm int…
ColoCarletti Jun 1, 2026
0777f1e
Merge branch 'main' into feat/cuda-pr3
ColoCarletti Jun 3, 2026
2c7b0de
cuda integration tests
ColoCarletti Jun 3, 2026
2f1fe2d
address review feedback
ColoCarletti Jun 3, 2026
f254eae
batch invert kernels and parity test
ColoCarletti Jun 3, 2026
84cc04b
DEEP composition kernel
ColoCarletti Jun 3, 2026
0ba7745
fri
ColoCarletti Jun 3, 2026
7046a40
gpu lde
ColoCarletti Jun 3, 2026
065c8f9
gpu_lde
ColoCarletti Jun 3, 2026
7d2810f
fri
ColoCarletti Jun 3, 2026
cc840cd
add tests
ColoCarletti Jun 3, 2026
fac3974
fix
ColoCarletti Jun 3, 2026
bc61a00
Merge branch 'main' into feat/cuda-pr4
ColoCarletti Jun 3, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions crypto/math-cuda/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -112,4 +112,7 @@ fn main() {
compile_ptx("ntt.cu", "ntt.ptx", have_nvcc);
compile_ptx("keccak.cu", "keccak.ptx", have_nvcc);
compile_ptx("barycentric.cu", "barycentric.ptx", have_nvcc);
compile_ptx("inverse.cu", "inverse.ptx", have_nvcc);
compile_ptx("deep.cu", "deep.ptx", have_nvcc);
compile_ptx("fri.cu", "fri.ptx", have_nvcc);
}
117 changes: 117 additions & 0 deletions crypto/math-cuda/kernels/deep.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
// R4 deep composition polynomial evaluations.
//
// For each trace-size row i in 0..domain_size, accumulate:
// result_i = sum over j of gamma_j * (H_j(x_i) - H_j(z^K)) * inv_h[i] (H terms)
// + sum over j,k of gamma'_{j,k} * (t_j(x_i) - t_j(z*w^k)) * inv_t[k,i] (trace)
//
// where x_i = LDE coset point at stride `blowup_factor` (so the kernel
// reads LDE column data at `i * blowup_factor`). `j` ranges over
// num_parts for H-terms and num_total_cols (= num_main + num_aux) for
// trace terms. `k` ranges over num_eval_points.
//
// Buffer layouts (ALL on device):
// main_lde base field, column-major (each column contiguous): main_lde[c * lde_stride + r]
// aux_lde ext3 de-interleaved: aux_lde[(c*3 + k) * lde_stride + r]
// h_lde ext3 de-interleaved: h_lde[(p*3 + k) * lde_stride + r]
// h_ood num_parts * 3 (ext3 interleaved)
// trace_ood num_total_cols * num_eval_points * 3 (ext3 interleaved,
// indexed as (col_idx * num_eval_points + k) * 3 + comp)
// gammas_h num_parts * 3
// gammas_tr num_total_cols * num_eval_points * 3
// inv_h domain_size * 3
// inv_t num_eval_points * domain_size * 3
// deep_out domain_size * 3 (ext3 interleaved; caller reinterprets)

#include "goldilocks.cuh"
#include "ext3.cuh"

// Reads ext3 element number `elem` from an interleaved buffer
// ([a0,b0,c0, a1,b1,c1, ...]), i.e. words buf[3*elem .. 3*elem + 2].
static __device__ __forceinline__ ext3::Fe3 deep_load_interleaved(
    const uint64_t *buf, uint64_t elem) {
    const uint64_t *p = buf + elem * 3;
    ext3::Fe3 v = {p[0], p[1], p[2]};
    return v;
}

// Reads the ext3 value of column `col` at row `row` from a de-interleaved
// buffer addressed as buf[(col * 3 + comp) * stride + row].
static __device__ __forceinline__ ext3::Fe3 deep_load_planar(
    const uint64_t *buf, uint64_t col, uint64_t stride, uint64_t row) {
    ext3::Fe3 v = {
        buf[(col * 3 + 0) * stride + row],
        buf[(col * 3 + 1) * stride + row],
        buf[(col * 3 + 2) * stride + row],
    };
    return v;
}

// Returns acc + (coeff * delta) * inv_den, evaluated in exactly that order.
static __device__ __forceinline__ ext3::Fe3 deep_accumulate(
    ext3::Fe3 acc, ext3::Fe3 coeff, ext3::Fe3 delta, ext3::Fe3 inv_den) {
    ext3::Fe3 term = ext3::mul(coeff, delta);
    term = ext3::mul(term, inv_den);
    return ext3::add(acc, term);
}

// deep_composition_ext3_row: evaluates the DEEP composition sum for one
// output row per thread.
//
// Launch: 1-D indexing (blockIdx.x * blockDim.x + threadIdx.x); threads whose
// index is >= domain_size exit immediately, so the grid only has to cover
// domain_size threads.
//
// For output row i the kernel reads LDE data at row i * blowup_factor and
// accumulates, in this order:
//   1. H terms:     gammas_h[j]    * (h_lde[j]   - h_ood[j])        * inv_h[i]
//   2. main terms:  gammas_tr[j,k] * (main_lde[j] - trace_ood[j,k]) * inv_t[k,i]
//   3. aux terms:   gammas_tr[m,k] * (aux_lde[j]  - trace_ood[m,k]) * inv_t[k,i]
//                   with m = num_main + j
// j runs over num_parts / num_main / num_aux respectively and k over
// num_eval_points. Main columns are base-field values; they are lifted to
// ext3 by subtracting from the first component only and negating the other
// two components of the OOD value.
//
// The ext3 result is written interleaved to deep_out[3*i .. 3*i + 2].
extern "C" __global__ void deep_composition_ext3_row(
    const uint64_t *main_lde,
    const uint64_t *aux_lde,
    const uint64_t *h_lde,
    uint64_t lde_stride,
    uint64_t num_main,
    uint64_t num_aux,
    uint64_t num_parts,
    uint64_t num_eval_points,
    uint64_t blowup_factor,
    uint64_t domain_size,
    const uint64_t *h_ood,
    const uint64_t *trace_ood,
    const uint64_t *gammas_h,
    const uint64_t *gammas_tr,
    const uint64_t *inv_h,
    const uint64_t *inv_t,
    uint64_t *deep_out) {
    const uint64_t tid = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= domain_size) return;

    // LDE columns are sampled every `blowup_factor` rows.
    const uint64_t lde_row = tid * blowup_factor;

    ext3::Fe3 acc = ext3::zero();

    // --- H terms: every part shares the same inverse denominator inv_h[i].
    ext3::Fe3 inv_h_row = deep_load_interleaved(inv_h, tid);
    for (uint64_t part = 0; part < num_parts; ++part) {
        ext3::Fe3 h_here = deep_load_planar(h_lde, part, lde_stride, lde_row);
        ext3::Fe3 h_at_ood = deep_load_interleaved(h_ood, part);
        ext3::Fe3 delta = ext3::sub(h_here, h_at_ood);
        ext3::Fe3 coeff = deep_load_interleaved(gammas_h, part);
        acc = deep_accumulate(acc, coeff, delta, inv_h_row);
    }

    // --- Main trace terms: base-field column value minus ext3 OOD value.
    for (uint64_t col = 0; col < num_main; ++col) {
        uint64_t base_val = main_lde[col * lde_stride + lde_row];
        for (uint64_t pt = 0; pt < num_eval_points; ++pt) {
            const uint64_t slot = col * num_eval_points + pt;
            ext3::Fe3 ood = deep_load_interleaved(trace_ood, slot);
            ext3::Fe3 delta = {
                goldilocks::sub(base_val, ood.a),
                goldilocks::neg(ood.b),
                goldilocks::neg(ood.c),
            };
            ext3::Fe3 coeff = deep_load_interleaved(gammas_tr, slot);
            ext3::Fe3 inv_den = deep_load_interleaved(inv_t, pt * domain_size + tid);
            acc = deep_accumulate(acc, coeff, delta, inv_den);
        }
    }

    // --- Aux trace terms: ext3 column value minus ext3 OOD value. Aux columns
    // follow the main columns in trace_ood / gammas_tr, hence the offset.
    for (uint64_t col = 0; col < num_aux; ++col) {
        ext3::Fe3 ext_val = deep_load_planar(aux_lde, col, lde_stride, lde_row);
        const uint64_t trace_col = num_main + col;
        for (uint64_t pt = 0; pt < num_eval_points; ++pt) {
            const uint64_t slot = trace_col * num_eval_points + pt;
            ext3::Fe3 ood = deep_load_interleaved(trace_ood, slot);
            ext3::Fe3 delta = ext3::sub(ext_val, ood);
            ext3::Fe3 coeff = deep_load_interleaved(gammas_tr, slot);
            ext3::Fe3 inv_den = deep_load_interleaved(inv_t, pt * domain_size + tid);
            acc = deep_accumulate(acc, coeff, delta, inv_den);
        }
    }

    uint64_t *dst = deep_out + tid * 3;
    dst[0] = acc.a;
    dst[1] = acc.b;
    dst[2] = acc.c;
}
59 changes: 59 additions & 0 deletions crypto/math-cuda/kernels/fri.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
// R4 FRI fold + twiddle-update kernels on device. The host orchestrator
// loops log2(N) times: sample zeta on host, fold on device, keccak leaves
// + tree on device, D2H the root, transcript-append on host, update
// twiddles on device.
//
// Layout: ext3 evaluations are stored INTERLEAVED as
// `[a0,b0,c0, a1,b1,c1, ...]`, same layout the deep-poly LDE output
// already produces. Twiddles are base-field, one u64 per entry.

#include "goldilocks.cuh"
#include "ext3.cuh"

// fri_fold_ext3: one FRI folding step, one thread per output element.
//
// For output index j, with lo = in[2j] and hi = in[2j + 1] (both ext3):
//     out[j] = (lo + hi) + inv_tw[j] * (zeta * (lo - hi))
// zeta is an ext3 value; inv_tw[j] is a single base-field word that scales
// the ext3 product through ext3::mul_base.
//
// Sizes, in u64 words: `in` holds 6 * n_out, `inv_tw` at least n_out,
// `zeta` 3, and `out` receives 3 * n_out. ext3 values are interleaved
// as [a, b, c].
//
// Launch: 1-D indexing; threads with index >= n_out do nothing.
//
// NOTE(review): each thread reads in[6j .. 6j+5] and writes out[3j .. 3j+2]
// with no synchronization between threads, so this presumably requires
// `out` not to overlap `in` -- confirm against the host caller.
extern "C" __global__ void fri_fold_ext3(
    const uint64_t *in,       // 3 * 2*n_out u64 (ext3 interleaved)
    uint64_t n_out,           // number of output ext3 elements (= N/2)
    const uint64_t *inv_tw,   // n_out base-field twiddles
    const uint64_t *zeta,     // 3 u64 (ext3)
    uint64_t *out) {          // 3 * n_out u64 (ext3 interleaved)
    const uint64_t tid = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n_out) {
        // The input pair for this output occupies six consecutive words.
        const uint64_t *pair = in + 6 * tid;
        ext3::Fe3 even_val = ext3::make(pair[0], pair[1], pair[2]);
        ext3::Fe3 odd_val = ext3::make(pair[3], pair[4], pair[5]);

        ext3::Fe3 challenge = ext3::make(zeta[0], zeta[1], zeta[2]);
        uint64_t twiddle = inv_tw[tid];

        ext3::Fe3 plus = ext3::add(even_val, odd_val);
        ext3::Fe3 minus = ext3::sub(even_val, odd_val);

        // ext3 * ext3 first, then the base-field scaling, then the final add.
        ext3::Fe3 scaled = ext3::mul(challenge, minus);
        scaled = ext3::mul_base(scaled, twiddle);
        ext3::Fe3 folded = ext3::add(plus, scaled);

        uint64_t *dst = out + 3 * tid;
        dst[0] = folded.a;
        dst[1] = folded.b;
        dst[2] = folded.c;
    }
}

// fri_update_twiddles (update_twiddles_in_place): tw[j] = tw[2j]^2 for every
// j in 0..n_out, computed in place. The kernel reads indices up to
// 2 * (n_out - 1), so `tw` must hold at least that many + 1 entries.
//
// In-place hazard and how it is avoided:
//   Slot 2k is READ when computing output k and WRITTEN when computing
//   output 2k. A sequential loop is safe because k is visited before 2k,
//   but with one thread per output those are two different threads (often
//   in different warps or blocks) and nothing orders the read before the
//   write -- thread k could observe the already-updated tw[2k].
//
//   To stay race-free without a scratch buffer or a grid-wide barrier, the
//   indices are partitioned into chains {m, 2m, 4m, ...} for each odd m,
//   plus the singleton {0}. Every slot a chain reads or writes has the form
//   m * 2^e, so chains never touch each other's slots. One thread walks its
//   chain in increasing order, which reads each slot before overwriting it.
//
// Launch: unchanged -- 1-D indexing with the grid covering at least n_out
// threads. Threads at even non-zero indices exit immediately; the thread at
// odd index m performs about log2(n_out / m) updates.
extern "C" __global__ void fri_update_twiddles(
    uint64_t *tw,
    uint64_t n_out) {
    uint64_t j = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (j >= n_out) return;

    if (j == 0) {
        // Index 0 reads and writes only slot 0: a self-contained chain.
        uint64_t old = tw[0];
        tw[0] = goldilocks::mul(old, old);
        return;
    }

    // Even non-zero outputs are produced by the owner of their odd root.
    if ((j & 1) == 0) return;

    // Walk k = j, 2j, 4j, ... Each step reads tw[2k] and then writes tw[k];
    // the following step overwrites tw[2k] only after it has been consumed.
    for (uint64_t k = j; k < n_out; k <<= 1) {
        uint64_t old = tw[2 * k];
        tw[k] = goldilocks::mul(old, old);
    }
}
Loading
Loading