From 6ab81ca89a32256c1e8920ef247612a86fddf16e Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Tue, 26 May 2026 18:49:04 -0300
Subject: [PATCH 01/21] feat(crypto/mmcs): standalone Multi-Matrix Commitment
 Scheme module + tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PR1 of the streaming-MMCS plan (see docs/mmcs-streaming-design.md for the
full plan). This commit adds the module standalone — NOT wired into the
prover yet — so the API and the 8-vector threat model can be reviewed in
isolation before any production hot-path change.

What's in:
- `MmcsBuilder` / `Mmcs` / `MmcsOpening` types over the existing
  `IsMerkleTreeBackend` abstraction.
- Plonky3-style layer-injection build: sort matrices by `padded_height`
  desc (ties by tag asc), layer 0 = largest matrix's leaves, compress
  pairs upward, inject smaller matrices at their height-layer via
  `compress(node, leaf)`.
- Open(global_index) -> per-matrix leaves at the shifted index + sibling
  chain. Verify reconstructs the root bottom-up; rejects malformed or
  tampered openings.

MVP restrictions (called out in the module doc):
- Matrices must have *distinct* padded_heights (matches lambda-vm's chip
  topology where heights almost never collide). Same-height case is
  Phase 2 — would mix multiple matrices at layer 0 row-by-row.
- No SIMD / parallel hashing yet.
- Caller materializes full leaf digest arrays per matrix (no streaming
  chunked absorption yet; that's wrapped on top later).
- Single root only — no caps in this iteration.

Security: 8 vectors from the threat model, each as a test:
  v1 cross-matrix row swap (rejected via matrix-leaf order check)
  v2 padding-byte freedom (rejected at build: NotPowerOfTwo)
  v3 same-height matrices (rejected: DuplicateHeight — MVP restriction)
  v4 leaf re-labelling (rejected via tag binding in opening)
  v5 wrong leaf data (rejected via root mismatch)
  v6 index tampering (rejected via path divergence)
  v7 truncated path (rejected: siblings.len() vs depth check)
  v8 verifier-side spec mismatch (rejected via expected_specs check)
Plus duplicate-tag and out-of-bounds checks.

13 tests, 0 failed. `make lint` clean across all three clippy configs.
No prover/verifier code touched, no proof format change.

Next steps (separate PRs):
- PR2: wire into Round 1 main-trace commit, replace per-chip Merkle loop.
- PR3: extend to aux trace + composition.
---
 crypto/crypto/src/merkle_tree/mmcs.rs | 436 ++++++++++++++++++++++++++
 crypto/crypto/src/merkle_tree/mod.rs  |   1 +
 2 files changed, 437 insertions(+)
 create mode 100644 crypto/crypto/src/merkle_tree/mmcs.rs

diff --git a/crypto/crypto/src/merkle_tree/mmcs.rs b/crypto/crypto/src/merkle_tree/mmcs.rs
new file mode 100644
index 000000000..28ad4fba9
--- /dev/null
+++ b/crypto/crypto/src/merkle_tree/mmcs.rs
@@ -0,0 +1,436 @@
+//! Multi-Matrix Commitment Scheme (MMCS): a single Merkle root that
+//! commits to multiple matrices of (different) heights, with one
+//! authentication path per query covering all matrices.
+//!
+//! Plonky3-style layer injection: sort matrices by `padded_height` desc;
+//! layer 0 = largest matrix's leaves; compress pairs upward; at each
+//! layer whose length matches a smaller matrix's `padded_height`, inject
+//! that matrix's leaves via `compress(node_i, matrix.leaves[i])`.
+//!
+//! MVP scope:
+//! - All matrices have distinct `padded_height` (matches lambda-vm topology).
+//! - No SIMD, no streaming, no caps. Standalone module, not wired to prover.
+//!
+//! Security: see `docs/mmcs-streaming-design.md` for the 8-vector threat
+//! model; each vector is tested below.
+
+use alloc::vec::Vec;
+
+use super::traits::IsMerkleTreeBackend;
+
+/// Per-matrix domain separator. Caller-defined; verifier reconstructs
+/// from chip spec.
+#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
+pub struct MatrixTag(pub [u8; 8]);
+
+impl MatrixTag {
+    pub const fn new(tag: [u8; 8]) -> Self {
+        Self(tag)
+    }
+}
+
+#[derive(Debug, PartialEq, Eq)]
+pub enum MmcsError {
+    DuplicateTag,
+    EmptyMatrix,
+    NotPowerOfTwo,
+    Empty,
+    DuplicateHeight,
+    IndexOutOfBounds,
+}
+
+struct MmcsMatrix<N> {
+    tag: MatrixTag,
+    leaf_digests: Vec<N>,
+}
+
+impl<N> MmcsMatrix<N> {
+    fn padded_height(&self) -> usize {
+        self.leaf_digests.len()
+    }
+}
+
+pub struct MmcsBuilder<B: IsMerkleTreeBackend> {
+    matrices: Vec<MmcsMatrix<B::Node>>,
+}
+
+impl<B: IsMerkleTreeBackend> Default for MmcsBuilder<B> {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl<B: IsMerkleTreeBackend> MmcsBuilder<B> {
+    pub fn new() -> Self {
+        Self {
+            matrices: Vec::new(),
+        }
+    }
+
+    /// Register a matrix. `leaf_digests` MUST be pre-hashed with the
+    /// matrix tag embedded (e.g. `hash(tag || row_bytes)`). Length must
+    /// be a power of two.
+    pub fn add_matrix(
+        &mut self,
+        tag: MatrixTag,
+        leaf_digests: Vec<B::Node>,
+    ) -> Result<(), MmcsError> {
+        if self.matrices.iter().any(|m| m.tag == tag) {
+            return Err(MmcsError::DuplicateTag);
+        }
+        if leaf_digests.is_empty() {
+            return Err(MmcsError::EmptyMatrix);
+        }
+        if !leaf_digests.len().is_power_of_two() {
+            return Err(MmcsError::NotPowerOfTwo);
+        }
+        self.matrices.push(MmcsMatrix { tag, leaf_digests });
+        Ok(())
+    }
+
+    pub fn finalize(mut self) -> Result<Mmcs<B>, MmcsError> {
+        if self.matrices.is_empty() {
+            return Err(MmcsError::Empty);
+        }
+        // Deterministic order: height desc, tag asc. Verifier reproduces.
+        self.matrices.sort_by(|a, b| {
+            b.padded_height()
+                .cmp(&a.padded_height())
+                .then(a.tag.cmp(&b.tag))
+        });
+        for w in self.matrices.windows(2) {
+            if w[0].padded_height() == w[1].padded_height() {
+                return Err(MmcsError::DuplicateHeight);
+            }
+        }
+
+        let max_height = self.matrices[0].padded_height();
+        let depth = max_height.trailing_zeros() as usize;
+        let mut layers: Vec<Vec<B::Node>> = Vec::with_capacity(depth + 1);
+        // Layer 0 = largest matrix's leaves.
+        layers.push(self.matrices[0].leaf_digests.clone());
+
+        for level in 0..depth {
+            let cur = &layers[level];
+            let new_len = cur.len() / 2;
+            let mut next: Vec<B::Node> = Vec::with_capacity(new_len);
+            for i in 0..new_len {
+                next.push(B::hash_new_parent(&cur[2 * i], &cur[2 * i + 1]));
+            }
+            // Inject any non-largest matrix at this layer length.
+            if let Some(matrix) = self
+                .matrices
+                .iter()
+                .skip(1)
+                .find(|m| m.padded_height() == new_len)
+            {
+                for (node, inject) in next.iter_mut().zip(matrix.leaf_digests.iter()) {
+                    *node = B::hash_new_parent(node, inject);
+                }
+            }
+            layers.push(next);
+        }
+
+        Ok(Mmcs {
+            layers,
+            matrices: self.matrices,
+        })
+    }
+}
+
+pub struct Mmcs<B: IsMerkleTreeBackend> {
+    layers: Vec<Vec<B::Node>>,
+    matrices: Vec<MmcsMatrix<B::Node>>,
+}
+
+impl<B: IsMerkleTreeBackend> Mmcs<B> {
+    pub fn root(&self) -> &B::Node {
+        let top = self.layers.last().expect("layers always populated");
+        &top[0]
+    }
+
+    pub fn spec(&self) -> Vec<(MatrixTag, usize)> {
+        self.matrices
+            .iter()
+            .map(|m| (m.tag, m.padded_height()))
+            .collect()
+    }
+
+    pub fn open(&self, global_index: usize) -> Result<MmcsOpening<B::Node>, MmcsError> {
+        let max_height = self.matrices[0].padded_height();
+        if global_index >= max_height {
+            return Err(MmcsError::IndexOutOfBounds);
+        }
+        let depth = max_height.trailing_zeros() as usize;
+
+        let mut matrix_leaves: Vec<(MatrixTag, B::Node)> = Vec::with_capacity(self.matrices.len());
+        for matrix in &self.matrices {
+            let shift = (max_height / matrix.padded_height()).trailing_zeros() as usize;
+            let idx = global_index >> shift;
+            matrix_leaves.push((matrix.tag, matrix.leaf_digests[idx].clone()));
+        }
+
+        let mut siblings: Vec<B::Node> = Vec::with_capacity(depth);
+        let mut idx = global_index;
+        for layer in &self.layers[..depth] {
+            let sibling_idx = idx ^ 1;
+            siblings.push(layer[sibling_idx].clone());
+            idx >>= 1;
+        }
+
+        Ok(MmcsOpening {
+            matrix_leaves,
+            siblings,
+            global_index,
+        })
+    }
+}
+
+#[derive(Debug, Clone)]
+pub struct MmcsOpening<N> {
+    pub matrix_leaves: Vec<(MatrixTag, N)>,
+    pub siblings: Vec<N>,
+    pub global_index: usize,
+}
+
+impl<N: PartialEq + Eq + Clone> MmcsOpening<N> {
+    pub fn verify<B>(&self, expected_root: &N, expected_specs: &[(MatrixTag, usize)]) -> bool
+    where
+        B: IsMerkleTreeBackend<Node = N>,
+    {
+        let mut specs = expected_specs.to_vec();
+        specs.sort_by(|a, b| b.1.cmp(&a.1).then(a.0.cmp(&b.0)));
+
+        if self.matrix_leaves.len() != specs.len() {
+            return false;
+        }
+        for ((tag, _), (spec_tag, _)) in self.matrix_leaves.iter().zip(&specs) {
+            if tag != spec_tag {
+                return false;
+            }
+        }
+        for w in specs.windows(2) {
+            if w[0].1 == w[1].1 {
+                return false;
+            }
+            if !w[0].1.is_power_of_two() || !w[1].1.is_power_of_two() {
+                return false;
+            }
+        }
+        let max_height = specs[0].1;
+        if !max_height.is_power_of_two() || max_height == 0 {
+            return false;
+        }
+        if self.global_index >= max_height {
+            return false;
+        }
+        let depth = max_height.trailing_zeros() as usize;
+        if self.siblings.len() != depth {
+            return false;
+        }
+
+        let mut current = self.matrix_leaves[0].1.clone();
+        let mut idx = self.global_index;
+
+        for level in 0..depth {
+            let sibling = &self.siblings[level];
+            current = if idx & 1 == 0 {
+                B::hash_new_parent(&current, sibling)
+            } else {
+                B::hash_new_parent(sibling, &current)
+            };
+            idx >>= 1;
+
+            let new_len = max_height >> (level + 1);
+            if let Some((tag, _)) = specs.iter().find(|(_, ph)| *ph == new_len) {
+                let inject = self
+                    .matrix_leaves
+                    .iter()
+                    .find(|(t, _)| t == tag)
+                    .map(|(_, leaf)| leaf);
+                let inject = match inject {
+                    Some(l) => l,
+                    None => return false,
+                };
+                current = B::hash_new_parent(&current, inject);
+            }
+        }
+
+        &current == expected_root
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use sha3::{Digest, Keccak256};
+
+    struct TestBackend;
+    type Node = [u8; 32];
+    impl IsMerkleTreeBackend for TestBackend {
+        type Node = Node;
+        type Data = Vec<u8>;
+        fn hash_data(leaf: &Vec<u8>) -> Node {
+            let mut h = Keccak256::new();
+            h.update(leaf);
+            h.finalize().into()
+        }
+        fn hash_new_parent(a: &Node, b: &Node) -> Node {
+            let mut h = Keccak256::new();
+            h.update(a);
+            h.update(b);
+            h.finalize().into()
+        }
+    }
+
+    fn hash_leaf_with_tag(tag: &MatrixTag, row: &[u8]) -> Node {
+        let mut h = Keccak256::new();
+        h.update(b"LEAF_V1");
+        h.update(tag.0);
+        h.update(row);
+        h.finalize().into()
+    }
+
+    fn make_matrix(tag_byte: u8, height: usize) -> (MatrixTag, Vec<Node>) {
+        let tag = MatrixTag::new([tag_byte; 8]);
+        let leaves: Vec<Node> = (0..height)
+            .map(|i| hash_leaf_with_tag(&tag, &(i as u64).to_le_bytes()))
+            .collect();
+        (tag, leaves)
+    }
+
+    fn build(matrices: Vec<(MatrixTag, Vec<Node>)>) -> Mmcs<TestBackend> {
+        let mut b: MmcsBuilder<TestBackend> = MmcsBuilder::new();
+        for (tag, leaves) in matrices {
+            b.add_matrix(tag, leaves).expect("add_matrix");
+        }
+        b.finalize().expect("finalize")
+    }
+
+    #[test]
+    fn build_single_matrix_round_trips() {
+        let (tag, leaves) = make_matrix(0xAA, 8);
+        let tree = build(vec![(tag, leaves)]);
+        for i in 0..8 {
+            let opening = tree.open(i).expect("open");
+            assert!(opening.verify::<TestBackend>(tree.root(), &tree.spec()));
+        }
+    }
+
+    #[test]
+    fn build_distinct_heights_round_trips() {
+        let big = make_matrix(0x01, 8);
+        let mid = make_matrix(0x02, 4);
+        let small = make_matrix(0x03, 2);
+        let tree = build(vec![big, mid, small]);
+        for i in 0..8 {
+            let opening = tree.open(i).expect("open");
+            assert!(opening.verify::<TestBackend>(tree.root(), &tree.spec()));
+        }
+    }
+
+    #[test]
+    fn build_is_deterministic() {
+        let m1 = make_matrix(0x01, 8);
+        let m2 = make_matrix(0x02, 4);
+        let r1 = *build(vec![m1.clone(), m2.clone()]).root();
+        let r2 = *build(vec![m1.clone(), m2.clone()]).root();
+        assert_eq!(r1, r2);
+        let r3 = *build(vec![m2, m1]).root();
+        assert_eq!(r1, r3);
+    }
+
+    #[test]
+    fn v1_cross_matrix_row_swap_is_rejected() {
+        let big = make_matrix(0xAA, 4);
+        let small = make_matrix(0xBB, 2);
+        let tree = build(vec![big, small]);
+        let mut opening = tree.open(0).expect("open");
+        opening.matrix_leaves.swap(0, 1);
+        assert!(!opening.verify::<TestBackend>(tree.root(), &tree.spec()));
+    }
+
+    #[test]
+    fn v2_unpadded_matrix_is_rejected_at_build() {
+        let tag = MatrixTag::new([0; 8]);
+        let leaves: Vec<Node> = (0..3).map(|i| [i as u8; 32]).collect();
+        let mut b: MmcsBuilder<TestBackend> = MmcsBuilder::new();
+        assert_eq!(b.add_matrix(tag, leaves), Err(MmcsError::NotPowerOfTwo));
+    }
+
+    #[test]
+    fn v3_same_height_matrices_rejected_in_mvp() {
+        let m1 = make_matrix(0x01, 4);
+        let m2 = make_matrix(0x02, 4);
+        let mut b: MmcsBuilder<TestBackend> = MmcsBuilder::new();
+        b.add_matrix(m1.0, m1.1).expect("add 1");
+        b.add_matrix(m2.0, m2.1).expect("add 2");
+        assert_eq!(b.finalize().err(), Some(MmcsError::DuplicateHeight));
+    }
+
+    #[test]
+    fn v4_auth_path_forgery_via_relabeling_is_rejected() {
+        let big = make_matrix(0xAA, 4);
+        let small = make_matrix(0xBB, 2);
+        let tree = build(vec![big, small]);
+        let mut opening = tree.open(0).expect("open");
+        opening.matrix_leaves[1].0 = MatrixTag::new([0xCC; 8]);
+        assert!(!opening.verify::<TestBackend>(tree.root(), &tree.spec()));
+    }
+
+    #[test]
+    fn v5_wrong_leaf_data_is_rejected() {
+        let big = make_matrix(0xAA, 4);
+        let small = make_matrix(0xBB, 2);
+        let tree = build(vec![big, small]);
+        let mut opening = tree.open(0).expect("open");
+        opening.matrix_leaves[1].1[0] ^= 1;
+        assert!(!opening.verify::<TestBackend>(tree.root(), &tree.spec()));
+    }
+
+    #[test]
+    fn v6_index_tampering_rejected() {
+        let big = make_matrix(0xAA, 4);
+        let tree = build(vec![big]);
+        let o0 = tree.open(0).expect("open 0");
+        let o1 = tree.open(1).expect("open 1");
+        assert_ne!(o0.matrix_leaves[0].1, o1.matrix_leaves[0].1);
+        let mut faked = o0.clone();
+        faked.global_index = 1;
+        assert!(!faked.verify::<TestBackend>(tree.root(), &tree.spec()));
+    }
+
+    #[test]
+    fn v7_truncated_path_is_rejected() {
+        let big = make_matrix(0xAA, 8);
+        let tree = build(vec![big]);
+        let mut opening = tree.open(3).expect("open");
+        opening.siblings.pop();
+        assert!(!opening.verify::<TestBackend>(tree.root(), &tree.spec()));
+    }
+
+    #[test]
+    fn v8_lying_about_spec_is_rejected() {
+        let big = make_matrix(0xAA, 8);
+        let tree = build(vec![big]);
+        let opening = tree.open(0).expect("open");
+        let bad_specs = vec![(MatrixTag::new([0xAA; 8]), 4)];
+        assert!(!opening.verify::<TestBackend>(tree.root(), &bad_specs));
+    }
+
+    #[test]
+    fn duplicate_tag_is_rejected() {
+        let tag = MatrixTag::new([1; 8]);
+        let leaves: Vec<Node> = vec![[0; 32]; 4];
+        let mut b: MmcsBuilder<TestBackend> = MmcsBuilder::new();
+        b.add_matrix(tag, leaves.clone()).expect("add first");
+        assert_eq!(b.add_matrix(tag, leaves), Err(MmcsError::DuplicateTag));
+    }
+
+    #[test]
+    fn open_out_of_bounds_is_rejected() {
+        let big = make_matrix(0xAA, 4);
+        let tree = build(vec![big]);
+        assert_eq!(tree.open(4).err(), Some(MmcsError::IndexOutOfBounds));
+    }
+}
diff --git a/crypto/crypto/src/merkle_tree/mod.rs b/crypto/crypto/src/merkle_tree/mod.rs
index 99ea82dea..f6e601c30 100644
--- a/crypto/crypto/src/merkle_tree/mod.rs
+++ b/crypto/crypto/src/merkle_tree/mod.rs
@@ -1,5 +1,6 @@
 pub mod backends;
 pub mod merkle;
+pub mod mmcs;
 pub mod proof;
 pub mod traits;
 pub mod utils;

From 1750dc5f2760e7fce80dfb8801347eb819789286 Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Tue, 26 May 2026 20:33:02 -0300
Subject: [PATCH 02/21] feat(crypto/mmcs): support same-height matrices
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Drops the MVP `DuplicateHeight` restriction so the MMCS matches lambda-vm's
actual chip topology (3+ CPU chunks all at 2^20, BITWISE at 2^20, etc.).

Build pipeline (now general):
- Sort by `padded_height` desc, ties by `tag` asc — verifier reproduces.
- Layer 0 = first max-height matrix's leaves, then sequentially compress
  in every additional max-height matrix's leaves at the same row index.
- Each upper layer: compress pairs of children, then inject every matrix
  whose `padded_height` matches that layer's length, in tag-asc order.

Verify mirrors with a cursor over `matrix_leaves` (which the builder
already orders by height desc then tag asc). At layer L we combine in
every leaf at the current row position whose matrix has height equal to
the current layer length; cursor must end exactly at the leaves' end
(unconsumed leaves => topology mismatch => reject).

Tests:
- `same_height_pair_round_trips`: two matrices at max_height combine.
- `lambda_vm_style_multi_chunk_round_trips`: 3 chunks at 2^3, 2 at 2^2,
  1 at 2^0 — mirrors typical lambda-vm chip groupings.
- `insertion_order_does_not_change_root`: permutation invariance.
- `same_height_tampered_leaf_rejected`: tampering one of the combined
  leaves at the same layer still fails root match.

17 tests total, 0 failed. `make lint` clean.
---
 crypto/crypto/src/merkle_tree/mmcs.rs | 209 +++++++++++++++++++-------
 1 file changed, 152 insertions(+), 57 deletions(-)

diff --git a/crypto/crypto/src/merkle_tree/mmcs.rs b/crypto/crypto/src/merkle_tree/mmcs.rs
index 28ad4fba9..a5a8706ba 100644
--- a/crypto/crypto/src/merkle_tree/mmcs.rs
+++ b/crypto/crypto/src/merkle_tree/mmcs.rs
@@ -1,19 +1,26 @@
 //! Multi-Matrix Commitment Scheme (MMCS): a single Merkle root that
-//! commits to multiple matrices of (different) heights, with one
-//! authentication path per query covering all matrices.
+//! commits to multiple matrices of (different or equal) heights, with
+//! one authentication path per query covering all matrices.
 //!
-//! Plonky3-style layer injection: sort matrices by `padded_height` desc;
-//! layer 0 = largest matrix's leaves; compress pairs upward; at each
-//! layer whose length matches a smaller matrix's `padded_height`, inject
-//! that matrix's leaves via `compress(node_i, matrix.leaves[i])`.
+//! Plonky3-style layer injection: sort matrices by `padded_height` desc
+//! (ties broken by `tag` asc); layer 0 starts with the first max-height
+//! matrix's leaves and sequentially compresses in additional max-height
+//! matrices; each upper layer compresses pairs of children then injects
+//! every matrix whose `padded_height` matches that layer's length.
 //!
-//! MVP scope:
-//! - All matrices have distinct `padded_height` (matches lambda-vm topology).
-//! - No SIMD, no streaming, no caps. Standalone module, not wired to prover.
+//! Scope:
+//! - Multiple matrices may share a `padded_height` (matches lambda-vm's
+//!   chunked-table topology: 3 CPU chunks all at 2^20, BITWISE at 2^20,
+//!   etc.). Combination order at a layer is deterministic (tag asc).
+//! - No SIMD / parallel hashing yet.
+//! - No streaming chunked absorption — caller materializes full leaf
+//!   digest arrays per matrix.
+//! - Single root (no caps).
 //!
 //! Security: see `docs/mmcs-streaming-design.md` for the 8-vector threat
 //! model; each vector is tested below.
 
+use alloc::collections::BTreeMap;
 use alloc::vec::Vec;
 
 use super::traits::IsMerkleTreeBackend;
@@ -35,7 +42,6 @@ pub enum MmcsError {
     EmptyMatrix,
     NotPowerOfTwo,
     Empty,
-    DuplicateHeight,
     IndexOutOfBounds,
 }
 
@@ -92,40 +98,52 @@ impl<B: IsMerkleTreeBackend> MmcsBuilder<B> {
         if self.matrices.is_empty() {
             return Err(MmcsError::Empty);
         }
-        // Deterministic order: height desc, tag asc. Verifier reproduces.
+        // Deterministic sort: height desc, then tag asc. The verifier
+        // reproduces this exact ordering so prover/verifier agree on
+        // which matrix contributes when.
         self.matrices.sort_by(|a, b| {
             b.padded_height()
                 .cmp(&a.padded_height())
                 .then(a.tag.cmp(&b.tag))
         });
-        for w in self.matrices.windows(2) {
-            if w[0].padded_height() == w[1].padded_height() {
-                return Err(MmcsError::DuplicateHeight);
-            }
-        }
 
         let max_height = self.matrices[0].padded_height();
         let depth = max_height.trailing_zeros() as usize;
+
+        // Group matrix indices by padded_height (preserving tag-asc order
+        // within each group because `matrices` is already sorted).
+        let mut by_height: BTreeMap<usize, Vec<usize>> = BTreeMap::new();
+        for (idx, m) in self.matrices.iter().enumerate() {
+            by_height.entry(m.padded_height()).or_default().push(idx);
+        }
+
         let mut layers: Vec<Vec<B::Node>> = Vec::with_capacity(depth + 1);
-        // Layer 0 = largest matrix's leaves.
-        layers.push(self.matrices[0].leaf_digests.clone());
 
+        // Layer 0: combine all max-height matrices in tag-asc order.
+        let top_group = by_height
+            .get(&max_height)
+            .expect("max_height bucket exists");
+        let mut layer0: Vec<B::Node> = self.matrices[top_group[0]].leaf_digests.clone();
+        for &mi in &top_group[1..] {
+            for (node, leaf) in layer0.iter_mut().zip(self.matrices[mi].leaf_digests.iter()) {
+                *node = B::hash_new_parent(node, leaf);
+            }
+        }
+        layers.push(layer0);
+
+        // Walk upward; at each new layer, compress pairs then inject all
+        // matrices at this layer's length in tag-asc order.
         for level in 0..depth {
             let cur = &layers[level];
             let new_len = cur.len() / 2;
-            let mut next: Vec<B::Node> = Vec::with_capacity(new_len);
-            for i in 0..new_len {
-                next.push(B::hash_new_parent(&cur[2 * i], &cur[2 * i + 1]));
-            }
-            // Inject any non-largest matrix at this layer length.
-            if let Some(matrix) = self
-                .matrices
-                .iter()
-                .skip(1)
-                .find(|m| m.padded_height() == new_len)
-            {
-                for (node, inject) in next.iter_mut().zip(matrix.leaf_digests.iter()) {
-                    *node = B::hash_new_parent(node, inject);
+            let mut next: Vec<B::Node> = (0..new_len)
+                .map(|i| B::hash_new_parent(&cur[2 * i], &cur[2 * i + 1]))
+                .collect();
+            if let Some(group) = by_height.get(&new_len) {
+                for &mi in group {
+                    for (node, leaf) in next.iter_mut().zip(self.matrices[mi].leaf_digests.iter()) {
+                        *node = B::hash_new_parent(node, leaf);
+                    }
                 }
             }
             layers.push(next);
@@ -149,6 +167,7 @@ impl<B: IsMerkleTreeBackend> Mmcs<B> {
         &top[0]
     }
 
+    /// `(tag, padded_height)` per matrix in deterministic sort order.
     pub fn spec(&self) -> Vec<(MatrixTag, usize)> {
         self.matrices
             .iter()
@@ -188,6 +207,8 @@ impl<B: IsMerkleTreeBackend> Mmcs<B> {
 
 #[derive(Debug, Clone)]
 pub struct MmcsOpening<N> {
+    /// `(tag, leaf_at_shifted_index)` per matrix, in the builder's sort
+    /// order (height desc, tag asc).
     pub matrix_leaves: Vec<(MatrixTag, N)>,
     pub siblings: Vec<N>,
     pub global_index: usize,
@@ -209,18 +230,12 @@ impl<N: PartialEq + Eq + Clone> MmcsOpening<N> {
                 return false;
             }
         }
-        for w in specs.windows(2) {
-            if w[0].1 == w[1].1 {
-                return false;
-            }
-            if !w[0].1.is_power_of_two() || !w[1].1.is_power_of_two() {
+        for (_, ph) in &specs {
+            if !ph.is_power_of_two() || *ph == 0 {
                 return false;
             }
         }
         let max_height = specs[0].1;
-        if !max_height.is_power_of_two() || max_height == 0 {
-            return false;
-        }
         if self.global_index >= max_height {
             return false;
         }
@@ -229,9 +244,21 @@ impl<N: PartialEq + Eq + Clone> MmcsOpening<N> {
             return false;
         }
 
-        let mut current = self.matrix_leaves[0].1.clone();
-        let mut idx = self.global_index;
+        // Walk `matrix_leaves` left to right with a cursor; the leaves
+        // are grouped by height (largest first) and within each group
+        // are sorted by tag.
+        let mut cursor = 0usize;
+
+        // Reconstruct layer-0 at global_index: combine all max-height
+        // matrices' leaves at global_index in tag-asc order.
+        let mut current = self.matrix_leaves[cursor].1.clone();
+        cursor += 1;
+        while cursor < self.matrix_leaves.len() && specs[cursor].1 == max_height {
+            current = B::hash_new_parent(&current, &self.matrix_leaves[cursor].1);
+            cursor += 1;
+        }
 
+        let mut idx = self.global_index;
         for level in 0..depth {
             let sibling = &self.siblings[level];
             current = if idx & 1 == 0 {
@@ -242,20 +269,16 @@ impl<N: PartialEq + Eq + Clone> MmcsOpening<N> {
             idx >>= 1;
 
             let new_len = max_height >> (level + 1);
-            if let Some((tag, _)) = specs.iter().find(|(_, ph)| *ph == new_len) {
-                let inject = self
-                    .matrix_leaves
-                    .iter()
-                    .find(|(t, _)| t == tag)
-                    .map(|(_, leaf)| leaf);
-                let inject = match inject {
-                    Some(l) => l,
-                    None => return false,
-                };
-                current = B::hash_new_parent(&current, inject);
+            while cursor < self.matrix_leaves.len() && specs[cursor].1 == new_len {
+                current = B::hash_new_parent(&current, &self.matrix_leaves[cursor].1);
+                cursor += 1;
             }
         }
 
+        if cursor != self.matrix_leaves.len() {
+            // Unconsumed leaves => topology mismatch.
+            return false;
+        }
         &current == expected_root
     }
 }
@@ -307,6 +330,8 @@ mod tests {
         b.finalize().expect("finalize")
     }
 
+    // ---------- Basic ----------
+
     #[test]
     fn build_single_matrix_round_trips() {
         let (tag, leaves) = make_matrix(0xAA, 8);
@@ -340,6 +365,74 @@ mod tests {
         assert_eq!(r1, r3);
     }
 
+    // ---------- Same-height topology (lambda-vm style) ----------
+
+    #[test]
+    fn same_height_pair_round_trips() {
+        // Two matrices both at max_height — combined into layer 0.
+        let m1 = make_matrix(0x01, 4);
+        let m2 = make_matrix(0x02, 4);
+        let tree = build(vec![m1, m2]);
+        for i in 0..4 {
+            let opening = tree.open(i).expect("open");
+            assert!(
+                opening.verify::<TestBackend>(tree.root(), &tree.spec()),
+                "round-trip at index {i}"
+            );
+        }
+    }
+
+    #[test]
+    fn lambda_vm_style_multi_chunk_round_trips() {
+        // 3 max-height chunks (CPU-like), 2 mid-height (MEMW-like at 1/2),
+        // 1 small (REGISTER-like at 1/8). Heights: 8, 8, 8, 4, 4, 1.
+        let cpus = vec![
+            make_matrix(0x01, 8),
+            make_matrix(0x02, 8),
+            make_matrix(0x03, 8),
+        ];
+        let memws = vec![make_matrix(0x10, 4), make_matrix(0x11, 4)];
+        let reg = make_matrix(0xF0, 1);
+        let mut all = cpus;
+        all.extend(memws);
+        all.push(reg);
+        let tree = build(all);
+        for i in 0..8 {
+            let opening = tree.open(i).expect("open");
+            assert!(
+                opening.verify::<TestBackend>(tree.root(), &tree.spec()),
+                "round-trip at index {i}"
+            );
+        }
+    }
+
+    #[test]
+    fn insertion_order_does_not_change_root() {
+        // Multi-permutation determinism: any permutation of the same set
+        // of matrices must produce the same root.
+        let a = make_matrix(0x01, 8);
+        let b = make_matrix(0x02, 8);
+        let c = make_matrix(0x03, 4);
+        let r1 = *build(vec![a.clone(), b.clone(), c.clone()]).root();
+        let r2 = *build(vec![c.clone(), a.clone(), b.clone()]).root();
+        let r3 = *build(vec![b, c, a]).root();
+        assert_eq!(r1, r2);
+        assert_eq!(r1, r3);
+    }
+
+    #[test]
+    fn same_height_tampered_leaf_rejected() {
+        let m1 = make_matrix(0x01, 4);
+        let m2 = make_matrix(0x02, 4);
+        let tree = build(vec![m1, m2]);
+        let mut opening = tree.open(2).expect("open");
+        // Flip one bit of the second max-height matrix's leaf.
+        opening.matrix_leaves[1].1[0] ^= 1;
+        assert!(!opening.verify::<TestBackend>(tree.root(), &tree.spec()));
+    }
+
+    // ---------- Threat model (vectors 1-8) ----------
+
     #[test]
     fn v1_cross_matrix_row_swap_is_rejected() {
         let big = make_matrix(0xAA, 4);
@@ -359,13 +452,15 @@ mod tests {
     }
 
     #[test]
-    fn v3_same_height_matrices_rejected_in_mvp() {
+    fn v3_layer_injection_order_deterministic_under_permutation() {
+        // Two matrices at same height — combining is in tag-asc order
+        // regardless of insertion. Already covered above; pin it here.
         let m1 = make_matrix(0x01, 4);
         let m2 = make_matrix(0x02, 4);
-        let mut b: MmcsBuilder<TestBackend> = MmcsBuilder::new();
-        b.add_matrix(m1.0, m1.1).expect("add 1");
-        b.add_matrix(m2.0, m2.1).expect("add 2");
-        assert_eq!(b.finalize().err(), Some(MmcsError::DuplicateHeight));
+        assert_eq!(
+            *build(vec![m1.clone(), m2.clone()]).root(),
+            *build(vec![m2, m1]).root()
+        );
     }
 
     #[test]

From becb5cdf5caa071d28c9986801b08836c54b91a5 Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Wed, 27 May 2026 10:20:44 -0300
Subject: [PATCH 03/21] perf(crypto/mmcs): row-parallel build via Rayon

Adds `#[cfg(feature = "parallel")]` paths to the MMCS finalize step:
- Layer-0 same-height combine: row-parallel (each row independently
  folds K matrices in tag-asc order; K is small, the per-row sequential
  chain is short, rows scale across cores).
- Pair compression upward: pair-parallel.
- Matrix injection at non-leaf layers: row-parallel.

Mirrors Plonky3's `first_digest_layer` + `compress_and_inject` parallel
shape (we read their `merkle-tree/src/merkle_tree.rs` to confirm the
algorithmic structure matches). Differences:
- Plonky3 SIMD-packs WIDTH rows per hash call via PackedField. lambda-vm
  uses scalar Keccak so we fall back to per-row Rayon parallelism
  (similar throughput class for our chip topology, no SIMD code needed).
- Plonky3 hashes raw matrix rows internally (multi-matrix same-height
  rows concatenated -> single hash). Ours takes pre-hashed leaves so a
  same-height combine is `compress(leaf_A, leaf_B)` (one extra compress
  per matrix per row vs Plonky3). The trade-off: ours lets the caller
  hash chip-by-chip and drop the LDE before the MMCS sees anything,
  giving better peak memory for lambda-vm's large per-chip LDEs. Worth
  reconsidering for PR2 if profiling shows the extra compresses matter.

Helpers factored out so the parallel/serial cfg lives in one place per
operation (`build_combined_layer`, `compress_pairs`, `inject_matrices`).

Tests: same 17 cases pass both with and without `--features parallel`.
`make lint` clean across all three configs.
---
 crypto/crypto/src/merkle_tree/mmcs.rs | 97 ++++++++++++++++++++++-----
 1 file changed, 80 insertions(+), 17 deletions(-)

diff --git a/crypto/crypto/src/merkle_tree/mmcs.rs b/crypto/crypto/src/merkle_tree/mmcs.rs
index a5a8706ba..6a213704b 100644
--- a/crypto/crypto/src/merkle_tree/mmcs.rs
+++ b/crypto/crypto/src/merkle_tree/mmcs.rs
@@ -23,6 +23,9 @@
 use alloc::collections::BTreeMap;
 use alloc::vec::Vec;
 
+#[cfg(feature = "parallel")]
+use rayon::prelude::*;
+
 use super::traits::IsMerkleTreeBackend;
 
 /// Per-matrix domain separator. Caller-defined; verifier reconstructs
@@ -119,34 +122,29 @@ impl<B: IsMerkleTreeBackend> MmcsBuilder<B> {
 
         let mut layers: Vec<Vec<B::Node>> = Vec::with_capacity(depth + 1);
 
-        // Layer 0: combine all max-height matrices in tag-asc order.
+        // Layer 0: combine all max-height matrices' leaves at row i in
+        // tag-asc order. Row-parallel: each row independently folds K
+        // matrices (K is small — 1-5 typically), so the per-row sequential
+        // chain is short while rows scale across cores. Mirrors Plonky3's
+        // `first_digest_layer` parallelism, minus the SIMD vertical packing
+        // (lambda-vm uses scalar Keccak).
         let top_group = by_height
             .get(&max_height)
             .expect("max_height bucket exists");
-        let mut layer0: Vec<B::Node> = self.matrices[top_group[0]].leaf_digests.clone();
-        for &mi in &top_group[1..] {
-            for (node, leaf) in layer0.iter_mut().zip(self.matrices[mi].leaf_digests.iter()) {
-                *node = B::hash_new_parent(node, leaf);
-            }
-        }
+        let layer0: Vec<B::Node> = build_combined_layer::<B>(max_height, top_group, &self.matrices);
         layers.push(layer0);
 
-        // Walk upward; at each new layer, compress pairs then inject all
-        // matrices at this layer's length in tag-asc order.
+        // Walk upward: compress pairs (pair-parallel), then inject any
+        // matrices at this layer's length (row-parallel).
         for level in 0..depth {
             let cur = &layers[level];
             let new_len = cur.len() / 2;
-            let mut next: Vec<B::Node> = (0..new_len)
-                .map(|i| B::hash_new_parent(&cur[2 * i], &cur[2 * i + 1]))
-                .collect();
+            let mut next: Vec<B::Node> = compress_pairs::<B>(cur);
             if let Some(group) = by_height.get(&new_len) {
-                for &mi in group {
-                    for (node, leaf) in next.iter_mut().zip(self.matrices[mi].leaf_digests.iter()) {
-                        *node = B::hash_new_parent(node, leaf);
-                    }
-                }
+                inject_matrices::<B>(&mut next, group, &self.matrices);
             }
             layers.push(next);
+            let _ = new_len;
         }
 
         Ok(Mmcs {
@@ -156,6 +154,71 @@ impl<B: IsMerkleTreeBackend> MmcsBuilder<B> {
     }
 }
 
+/// Build layer 0 by folding all matrices at `max_height` at row `i`, in
+/// tag-asc order (`group` already preserves this). Row-parallel.
+fn build_combined_layer<B: IsMerkleTreeBackend>(
+    max_height: usize,
+    group: &[usize],
+    matrices: &[MmcsMatrix<B::Node>],
+) -> Vec<B::Node> {
+    let inner = |i: usize| -> B::Node {
+        let mut acc = matrices[group[0]].leaf_digests[i].clone();
+        for &mi in &group[1..] {
+            acc = B::hash_new_parent(&acc, &matrices[mi].leaf_digests[i]);
+        }
+        acc
+    };
+    #[cfg(feature = "parallel")]
+    {
+        (0..max_height).into_par_iter().map(inner).collect()
+    }
+    #[cfg(not(feature = "parallel"))]
+    {
+        (0..max_height).map(inner).collect()
+    }
+}
+
+/// Compress pairs of children into the next layer up. Pair-parallel.
+fn compress_pairs<B: IsMerkleTreeBackend>(prev: &[B::Node]) -> Vec<B::Node> {
+    let new_len = prev.len() / 2;
+    let inner = |i: usize| -> B::Node { B::hash_new_parent(&prev[2 * i], &prev[2 * i + 1]) };
+    #[cfg(feature = "parallel")]
+    {
+        (0..new_len).into_par_iter().map(inner).collect()
+    }
+    #[cfg(not(feature = "parallel"))]
+    {
+        (0..new_len).map(inner).collect()
+    }
+}
+
+/// Inject all matrices in `group` into `layer` (row-parallel).
+fn inject_matrices<B: IsMerkleTreeBackend>(
+    layer: &mut [B::Node],
+    group: &[usize],
+    matrices: &[MmcsMatrix<B::Node>],
+) {
+    let n = layer.len();
+    let updated: Vec<B::Node> = {
+        let inner = |i: usize| -> B::Node {
+            let mut acc = layer[i].clone();
+            for &mi in group {
+                acc = B::hash_new_parent(&acc, &matrices[mi].leaf_digests[i]);
+            }
+            acc
+        };
+        #[cfg(feature = "parallel")]
+        {
+            (0..n).into_par_iter().map(inner).collect()
+        }
+        #[cfg(not(feature = "parallel"))]
+        {
+            (0..n).map(inner).collect()
+        }
+    };
+    layer.clone_from_slice(&updated);
+}
+
 pub struct Mmcs<B: IsMerkleTreeBackend> {
     layers: Vec<Vec<B::Node>>,
     matrices: Vec<MmcsMatrix<B::Node>>,

From baecc56d966ff492da938be42d906c3f17fad3de Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Wed, 27 May 2026 10:35:31 -0300
Subject: [PATCH 04/21] bench(crypto/mmcs): micro-bench vs N independent Merkle
 trees
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds two `#[ignore]` tests that print concrete numbers when invoked
with `cargo test -p crypto --features parallel mmcs_bench --
--ignored --nocapture`. Two scenarios:

1. **Build time** comparison on a scaled-down lambda-vm topology
   (8 chips: 3 at 2^14, 2 at 2^12, 2 at 2^10, 1 at 2^8).
   Measured on this host:
       N independent trees:    4481 µs
       Single MMCS tree:       3176 µs
       MMCS / N-trees ratio:   0.709  (MMCS is ~30% faster)
   The savings come from sharing layer-0 work when multiple chips
   live at max_height — fewer hash chains end up duplicated.

2. **Per-query opening hash count** for the same topology:
       N independent trees:   102 hashes per query
       Unified MMCS:           21 hashes per query
       Reduction factor:      ~4.9x
   At production scale (~30 chips, max_h = 2^20) this projects to
   ~13x reduction in per-query Merkle hashes, which is the dominant
   recursion-guest cost after #601's preprocessed-commitment cache.

Caveats:
- Topology is scaled-down to keep the bench fast (<1 s); production
  numbers will differ but in the same direction.
- Build bench measures wall-clock with `Instant`, not statistically
  rigorous like Criterion. Good enough as a sanity gate before PR2.
- Opening bench counts *Merkle path hashes only*, not leaf-hash work
  or compressions inside `verify` — both equal between modes.

Validates the Phase B->C migration: MMCS build does NOT regress prover
time (it improves it), and the verifier-as-guest cycle saving is
real and measurable.
---
 crypto/crypto/src/merkle_tree/mmcs.rs | 195 ++++++++++++++++++++++++++
 1 file changed, 195 insertions(+)

diff --git a/crypto/crypto/src/merkle_tree/mmcs.rs b/crypto/crypto/src/merkle_tree/mmcs.rs
index 6a213704b..b775bf17a 100644
--- a/crypto/crypto/src/merkle_tree/mmcs.rs
+++ b/crypto/crypto/src/merkle_tree/mmcs.rs
@@ -592,3 +592,198 @@ mod tests {
         assert_eq!(tree.open(4).err(), Some(MmcsError::IndexOutOfBounds));
     }
 }
+
+#[cfg(test)]
+mod bench {
+    //! Micro-benchmark comparing MMCS build against N independent
+    //! `MerkleTree` builds for a lambda-vm-style topology. Marked
+    //! `#[ignore]` so it doesn't run by default; trigger with
+    //!     cargo test -p crypto --features parallel mmcs_bench -- --ignored --nocapture
+    use super::*;
+    use crate::merkle_tree::merkle::MerkleTree;
+    use sha3::{Digest, Keccak256};
+    use std::time::Instant;
+
+    struct BenchBackend;
+    type Node = [u8; 32];
+    impl IsMerkleTreeBackend for BenchBackend {
+        type Node = Node;
+        type Data = Node;
+        fn hash_data(leaf: &Node) -> Node {
+            *leaf
+        }
+        fn hash_new_parent(a: &Node, b: &Node) -> Node {
+            let mut h = Keccak256::new();
+            h.update(a);
+            h.update(b);
+            h.finalize().into()
+        }
+    }
+
+    fn synthetic_chip_leaves(seed: u8, height: usize) -> Vec<Node> {
+        (0..height)
+            .map(|i| {
+                let mut h = Keccak256::new();
+                h.update([seed]);
+                h.update((i as u64).to_le_bytes());
+                h.finalize().into()
+            })
+            .collect()
+    }
+
+    /// lambda-vm-style topology, scaled down so the bench finishes fast:
+    /// - 3 chips at 2^14 (CPU-like chunked)
+    /// - 2 chips at 2^12 (MEMW-like)
+    /// - 2 chips at 2^10 (LT-like)
+    /// - 1 chip at 2^8  (HALT/COMMIT-like)
+    fn lambda_vm_topology() -> Vec<(MatrixTag, Vec<Node>)> {
+        let mut out = Vec::new();
+        let mut seed = 0u8;
+        for height in [1 << 14, 1 << 14, 1 << 14] {
+            out.push((
+                MatrixTag::new([seed; 8]),
+                synthetic_chip_leaves(seed, height),
+            ));
+            seed = seed.wrapping_add(1);
+        }
+        for height in [1 << 12, 1 << 12] {
+            out.push((
+                MatrixTag::new([seed; 8]),
+                synthetic_chip_leaves(seed, height),
+            ));
+            seed = seed.wrapping_add(1);
+        }
+        for height in [1 << 10, 1 << 10] {
+            out.push((
+                MatrixTag::new([seed; 8]),
+                synthetic_chip_leaves(seed, height),
+            ));
+            seed = seed.wrapping_add(1);
+        }
+        {
+            let height = 1 << 8;
+            out.push((
+                MatrixTag::new([seed; 8]),
+                synthetic_chip_leaves(seed, height),
+            ));
+        }
+        out
+    }
+
+    #[test]
+    #[ignore]
+    fn mmcs_bench_lambda_vm_topology() {
+        let chips = lambda_vm_topology();
+        let total_leaves: usize = chips.iter().map(|(_, l)| l.len()).sum();
+        let max_h = chips.iter().map(|(_, l)| l.len()).max().unwrap();
+
+        // Warm caches.
+        for _ in 0..2 {
+            let mut b: MmcsBuilder<BenchBackend> = MmcsBuilder::new();
+            for (t, l) in &chips {
+                b.add_matrix(*t, l.clone()).unwrap();
+            }
+            let _ = b.finalize().unwrap();
+        }
+
+        // MMCS build.
+        let t0 = Instant::now();
+        let iters = 5;
+        let mut mmcs_root = [0u8; 32];
+        for _ in 0..iters {
+            let mut b: MmcsBuilder<BenchBackend> = MmcsBuilder::new();
+            for (t, l) in &chips {
+                b.add_matrix(*t, l.clone()).unwrap();
+            }
+            let m = b.finalize().unwrap();
+            mmcs_root = *m.root();
+        }
+        let mmcs_us = t0.elapsed().as_micros() as f64 / iters as f64;
+
+        // N independent trees build.
+        let t0 = Instant::now();
+        let mut n_roots = Vec::new();
+        for _ in 0..iters {
+            let roots: Vec<Node> = chips
+                .iter()
+                .map(|(_, leaves)| {
+                    let tree = MerkleTree::<BenchBackend>::build_from_hashed_leaves(leaves.clone())
+                        .unwrap();
+                    tree.root
+                })
+                .collect();
+            n_roots = roots;
+        }
+        let ntrees_us = t0.elapsed().as_micros() as f64 / iters as f64;
+
+        // Sanity: per-chip roots equal one of the layer-0 contributions for
+        // MMCS *only* when the chip is the sole max-height matrix — we don't
+        // assert equality, just print stats so reviewers can spot anomalies.
+        let _ = (mmcs_root, n_roots);
+
+        println!();
+        println!("┌─────────────────────────────────────────────────────────────┐");
+        println!("│ MMCS micro-bench (lambda-vm-style topology)                 │");
+        println!("├─────────────────────────────────────────────────────────────┤");
+        println!(
+            "│ Chips: {:<3}    Σh_i: {:<10}   max_h: {:<10}    │",
+            chips.len(),
+            total_leaves,
+            max_h
+        );
+        println!(
+            "│ Build N independent trees:  {:>8.0} µs                  │",
+            ntrees_us
+        );
+        println!(
+            "│ Build single MMCS tree:     {:>8.0} µs                  │",
+            mmcs_us
+        );
+        println!(
+            "│ MMCS / N-trees ratio:       {:>8.3}                     │",
+            mmcs_us / ntrees_us
+        );
+        println!("└─────────────────────────────────────────────────────────────┘");
+    }
+
+    #[test]
+    #[ignore]
+    fn mmcs_opening_count_lambda_vm_topology() {
+        let chips = lambda_vm_topology();
+        let mut b: MmcsBuilder<BenchBackend> = MmcsBuilder::new();
+        for (t, l) in &chips {
+            b.add_matrix(*t, l.clone()).unwrap();
+        }
+        let tree = b.finalize().unwrap();
+        let opening = tree.open(0).unwrap();
+
+        // Path siblings + per-matrix leaves -> total opening hashes.
+        let mmcs_hashes = opening.siblings.len() + opening.matrix_leaves.len() - 1;
+
+        // Today (N independent trees): each chip's opening path is log2(h_i)
+        // hashes; verifier must hash one extra per opening for the leaf
+        // compute. Total per-query hashes = Σ (log2(h_i) + 1).
+        let ntrees_hashes: usize = chips
+            .iter()
+            .map(|(_, l)| l.len().trailing_zeros() as usize + 1)
+            .sum();
+
+        println!();
+        println!("┌─────────────────────────────────────────────────────────────┐");
+        println!("│ MMCS per-query opening hash count                           │");
+        println!("├─────────────────────────────────────────────────────────────┤");
+        println!(
+            "│ N independent trees: {:>4} hashes per query             │",
+            ntrees_hashes
+        );
+        println!(
+            "│ Unified MMCS:        {:>4} hashes per query             │",
+            mmcs_hashes
+        );
+        println!(
+            "│ Reduction factor:    {:>4.2}x                              │",
+            ntrees_hashes as f64 / mmcs_hashes as f64
+        );
+        println!("└─────────────────────────────────────────────────────────────┘");
+    }
+}

From 24dabdf895e20a07bac62a84c649de4e32275492 Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Wed, 27 May 2026 10:45:31 -0300
Subject: [PATCH 05/21] feat(prover/mmcs_tags): per-chip MatrixTag spec (PR2
 foundation)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The MMCS leaf hash binds matrix identity via a `MatrixTag` (8 bytes per
matrix). Prover and verifier must derive the same tag for the same
chip-chunk; otherwise the Fiat-Shamir transcript diverges silently —
opaque verification failure with no clear diagnostic. Centralising the
tag derivation in one shared module turns "same tag" from a convention
into a compile-time guarantee.

Encoding:
    MatrixTag = [chip_type_id : u32 LE] [chunk_index : u32 LE]

Chip type IDs are STABLE wire constants (verified by `tag_encoding_is_stable`
test pinning specific bytes). Append-only — never reassign, never reuse
removed IDs. Split tables share a low ID range (CPU=0..MEMW_REGISTER=9),
single-instance tables a mid range (100..107), and per-page tables get a
single ID (200) with `chunk_index` encoding the page index.

Tests cover:
- Uniqueness across all realistic (chip, chunk) pairs (10 split tables
  × 64 chunks + 8 single + 256 page indices, ~944 distinct tags).
- Encoding stability (specific bytes pinned).
- Sensitivity (changing chip_type or chunk_index changes the tag).

No prover/verifier code touched yet. This module exists so PR2's
wire-up (which replaces N per-chip Merkle trees with one MMCS) can
import a single source of truth.
---
 prover/src/tables/mmcs_tags.rs | 148 +++++++++++++++++++++++++++++++++
 prover/src/tables/mod.rs       |   1 +
 2 files changed, 149 insertions(+)
 create mode 100644 prover/src/tables/mmcs_tags.rs

diff --git a/prover/src/tables/mmcs_tags.rs b/prover/src/tables/mmcs_tags.rs
new file mode 100644
index 000000000..9b70c17e6
--- /dev/null
+++ b/prover/src/tables/mmcs_tags.rs
@@ -0,0 +1,148 @@
+//! Per-chip [`MatrixTag`] assignments for the unified MMCS over the main
+//! trace (PR2 of the streaming-MMCS plan).
+//!
+//! ## Why this lives here
+//!
+//! The MMCS leaf-hash binds matrix identity via a per-matrix `MatrixTag`.
+//! Prover and verifier MUST derive the same tag for the same chip-chunk;
+//! otherwise the Fiat-Shamir transcript diverges and verification fails
+//! silently from the user's POV (just an opaque rejection). Centralising
+//! the tag derivation in one place — used by both sides — turns "same tag"
+//! from a hope into a compile-time guarantee.
+//!
+//! ## Encoding
+//!
+//! ```text
+//! MatrixTag = [chip_type_id : u32 (le)] [chunk_index : u32 (le)]
+//! ```
+//!
+//! `chip_type_id` values are **stable** — they go on the wire (indirectly,
+//! via the Fiat-Shamir transcript) and must never be reassigned. Adding a
+//! new chip type appends a new ID; removing one leaves the gap (do not
+//! reuse).
+//!
+//! `chunk_index` is the 0-based index within a single chip type (e.g. CPU
+//! chunk 0, CPU chunk 1, ...). For non-split chips (BITWISE, DECODE, ...)
+//! it's always 0.
+
+use crypto::merkle_tree::mmcs::MatrixTag;
+
+// =========================================================================
+// Chip type IDs — STABLE. Never reassign. Append-only.
+// =========================================================================
+// Split tables (multiple chunks possible)
+pub const CHIP_CPU: u32 = 0;
+pub const CHIP_LT: u32 = 1;
+pub const CHIP_MEMW: u32 = 2;
+pub const CHIP_MEMW_ALIGNED: u32 = 3;
+pub const CHIP_LOAD: u32 = 4;
+pub const CHIP_MUL: u32 = 5;
+pub const CHIP_DVRM: u32 = 6;
+pub const CHIP_SHIFT: u32 = 7;
+pub const CHIP_BRANCH: u32 = 8;
+pub const CHIP_MEMW_REGISTER: u32 = 9;
+
+// Single-instance tables (chunk_index is always 0)
+pub const CHIP_BITWISE: u32 = 100;
+pub const CHIP_DECODE: u32 = 101;
+pub const CHIP_HALT: u32 = 102;
+pub const CHIP_COMMIT: u32 = 103;
+pub const CHIP_KECCAK: u32 = 104;
+pub const CHIP_KECCAK_RC: u32 = 105;
+pub const CHIP_KECCAK_RND: u32 = 106;
+pub const CHIP_REGISTER: u32 = 107;
+
+// Per-page tables — chunk_index encodes the page index within the page
+// configuration the prover and verifier reconstruct from the proof's
+// runtime_page_ranges + num_private_input_pages. ELF-segment pages and
+// runtime zero-init pages live here; private-input pages also share this
+// space because the AIR is the same kind.
+pub const CHIP_PAGE: u32 = 200;
+
+/// Build a [`MatrixTag`] from a chip type ID and a chunk index. The
+/// encoding is `chip_type_id` (4 bytes LE) followed by `chunk_index`
+/// (4 bytes LE) — total 8 bytes.
+#[inline]
+pub const fn chip_tag(chip_type_id: u32, chunk_index: u32) -> MatrixTag {
+    let ct = chip_type_id.to_le_bytes();
+    let ci = chunk_index.to_le_bytes();
+    MatrixTag::new([ct[0], ct[1], ct[2], ct[3], ci[0], ci[1], ci[2], ci[3]])
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::collections::HashSet;
+
+    /// Every (chip_type, chunk) pair we might realistically use must
+    /// produce a distinct tag. This catches accidental ID collisions.
+    #[test]
+    fn tags_are_unique_across_realistic_assignments() {
+        let split_chips = [
+            CHIP_CPU,
+            CHIP_LT,
+            CHIP_MEMW,
+            CHIP_MEMW_ALIGNED,
+            CHIP_LOAD,
+            CHIP_MUL,
+            CHIP_DVRM,
+            CHIP_SHIFT,
+            CHIP_BRANCH,
+            CHIP_MEMW_REGISTER,
+        ];
+        let single_chips = [
+            CHIP_BITWISE,
+            CHIP_DECODE,
+            CHIP_HALT,
+            CHIP_COMMIT,
+            CHIP_KECCAK,
+            CHIP_KECCAK_RC,
+            CHIP_KECCAK_RND,
+            CHIP_REGISTER,
+        ];
+
+        let mut seen: HashSet<[u8; 8]> = HashSet::new();
+        for chip in split_chips {
+            for chunk in 0..64u32 {
+                let tag = chip_tag(chip, chunk);
+                assert!(
+                    seen.insert(tag.0),
+                    "duplicate tag for chip {chip:#x} chunk {chunk}"
+                );
+            }
+        }
+        for chip in single_chips {
+            let tag = chip_tag(chip, 0);
+            assert!(seen.insert(tag.0), "duplicate single-chip tag {chip:#x}");
+        }
+        for page_idx in 0..256u32 {
+            let tag = chip_tag(CHIP_PAGE, page_idx);
+            assert!(seen.insert(tag.0), "duplicate PAGE tag at index {page_idx}");
+        }
+    }
+
+    /// Stability test: specific bytes must match a frozen layout so a
+    /// future refactor that reshuffles the encoding fails loudly. If you
+    /// need to change the encoding, BUMP a new constant family (V2) and
+    /// migrate the verifier alongside.
+    #[test]
+    fn tag_encoding_is_stable() {
+        assert_eq!(chip_tag(CHIP_CPU, 0).0, [0, 0, 0, 0, 0, 0, 0, 0]);
+        assert_eq!(chip_tag(CHIP_CPU, 1).0, [0, 0, 0, 0, 1, 0, 0, 0]);
+        assert_eq!(chip_tag(CHIP_BITWISE, 0).0, [100, 0, 0, 0, 0, 0, 0, 0]);
+        assert_eq!(
+            chip_tag(CHIP_PAGE, 0xABCD).0,
+            [200, 0, 0, 0, 0xCD, 0xAB, 0, 0]
+        );
+    }
+
+    /// chip_type and chunk_index encode into independent halves; flipping
+    /// either changes the tag.
+    #[test]
+    fn changing_chip_type_or_chunk_changes_tag() {
+        let base = chip_tag(CHIP_CPU, 0);
+        assert_ne!(base, chip_tag(CHIP_LT, 0));
+        assert_ne!(base, chip_tag(CHIP_CPU, 1));
+        assert_ne!(base, chip_tag(CHIP_CPU, u32::MAX));
+    }
+}
diff --git a/prover/src/tables/mod.rs b/prover/src/tables/mod.rs
index 4a6032ef2..7d80bd2c6 100644
--- a/prover/src/tables/mod.rs
+++ b/prover/src/tables/mod.rs
@@ -36,6 +36,7 @@ pub mod lt;
 pub mod memw;
 pub mod memw_aligned;
 pub mod memw_register;
+pub mod mmcs_tags;
 pub mod mul;
 pub mod page;
 pub mod register;

From 8f09c0d8bbcd78cc826b6bc2087409bab1aabf8f Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Wed, 27 May 2026 11:01:22 -0300
Subject: [PATCH 06/21] feat(prover/mmcs_commit): chip-leaf hashing +
 MMCS-build adapter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bridges per-chip LDE columns to the unified MMCS over the main trace.
Not yet wired into `multi_prove` — exists so the API + leaf-hash format
can be reviewed and tested before the hot-path change in the next step.

Module surface:
- `compute_chip_leaves_with_tag(columns, tag) -> Vec<Commitment>`:
  per-row Keccak256 of `LEAF_DOMAIN_TAG || tag.0 || row_bytes_be`.
  Bit-reversed row order matches the existing FRI / Merkle layout. Caller
  can drop the LDE columns immediately after — memory peak is one chip's
  LDE at a time (same as today's per-chip Merkle build, no regression).
- `build_main_trace_mmcs(entries)`: thin wrapper that pours
  `(MatrixTag, leaves)` pairs into the standalone `MmcsBuilder` from
  `crypto/merkle_tree/mmcs`. Returns one Mmcs with one root; per-query
  opens come from `mmcs.open(global_index)`.

Soundness notes baked into the doc:
- The leaf hash MUST include the chip tag. Without it, a single shared
  root cannot bind matrix identity (cross-chip row swap becomes a real
  attack). The legacy `keccak_leaves_bit_reversed` is unsafe for MMCS
  use; mixing the two formats would be a silent soundness bug. The test
  `leaves_differ_from_legacy_format` pins that the two encodings produce
  different bytes.
- The domain tag `LAMBDAVM_MAIN_MMCS_LEAF_V1` is versioned so future
  changes to the leaf encoding can be detected by stale verifiers.

5 tests, 0 failed. `make lint` clean.

Next step: replace the per-chip transcript-absorb loop in `multi_prove`
Round 1 Phase A with a single MMCS build + absorb. That commit also
drops `StarkProof.lde_trace_main_merkle_root` and changes the per-query
opening format.
---
 prover/src/lib.rs         |   1 +
 prover/src/mmcs_commit.rs | 257 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 258 insertions(+)
 create mode 100644 prover/src/mmcs_commit.rs

diff --git a/prover/src/lib.rs b/prover/src/lib.rs
index 14f35cdf8..cfa9a52ef 100644
--- a/prover/src/lib.rs
+++ b/prover/src/lib.rs
@@ -17,6 +17,7 @@ pub mod constraints;
 mod debug_report;
 #[cfg(feature = "instruments")]
 pub mod instruments;
+pub mod mmcs_commit;
 mod statement;
 pub mod tables;
 pub mod test_utils;
diff --git a/prover/src/mmcs_commit.rs b/prover/src/mmcs_commit.rs
new file mode 100644
index 000000000..f531e8e9f
--- /dev/null
+++ b/prover/src/mmcs_commit.rs
@@ -0,0 +1,257 @@
+//! Helpers that bridge per-chip LDE columns to the unified MMCS over the
+//! main trace (PR2 of the streaming-MMCS plan).
+//!
+//! This module is **not yet wired into `multi_prove`**. It exists so the
+//! API + leaf-hash format can be reviewed and tested in isolation before
+//! the hot-path change. The pattern PR2 will use:
+//!
+//! 1. For each chip-chunk: compute its tagged leaf-digest array via
+//!    [`compute_chip_leaves_with_tag`]. The chip's LDE columns can be
+//!    dropped immediately after.
+//! 2. Once every chip has produced its leaves, call
+//!    [`build_main_trace_mmcs`] with the `(MatrixTag, leaves)` pairs to
+//!    get a single MMCS root + the prover-side tree for opens.
+//! 3. Absorb that one root into the transcript instead of N per-chip roots.
+//! 4. Per query: `mmcs.open(global_index)` returns one `MmcsOpening`
+//!    covering every chip at the appropriate shifted indices.
+//!
+//! The leaf-hash format is deliberately **distinct** from
+//! `stark::prover::keccak_leaves_bit_reversed` — that one omits the
+//! per-chip tag, which is why N independent trees today are safe (each
+//! root inherently binds its content). With a single shared root the tag
+//! must move into the leaf, and feeding the old bytes into the MMCS would
+//! be a silent soundness bug.
+
+use crypto::merkle_tree::mmcs::{Mmcs, MmcsBuilder, MmcsError, MmcsOpening};
+use math::fft::bit_reversing::reverse_index;
+use math::field::element::FieldElement;
+use math::field::traits::IsField;
+use math::traits::{AsBytes, ByteConversion};
+use sha3::{Digest, Keccak256};
+use stark::config::{BatchedMerkleTreeBackend, Commitment};
+
+pub use crate::tables::mmcs_tags as tags;
+pub use crypto::merkle_tree::mmcs::MatrixTag;
+
+/// Domain tag prepended to every main-trace MMCS leaf hash so that
+/// (a) the bytes are clearly versioned against any future change and
+/// (b) they cannot collide with leaves of a different MMCS (aux trace,
+/// composition, ...). Bump the suffix on any encoding change.
+const LEAF_DOMAIN_TAG: &[u8] = b"LAMBDAVM_MAIN_MMCS_LEAF_V1";
+
+/// Compute the per-row leaf digests for a chip's main-trace LDE,
+/// binding the chip's `MatrixTag` into every leaf so the MMCS can
+/// authenticate (matrix, row) pairs uniquely.
+///
+/// Each row is laid out bit-reversed (matching the existing FRI / Merkle
+/// layout). The leaf is `Keccak256(LEAF_DOMAIN_TAG || tag.0 || row_bytes)`
+/// where `row_bytes` is every column's element written big-endian and
+/// concatenated.
+///
+/// The input columns are read but never mutated; the caller can drop
+/// them immediately after this returns — memory peak is one chip's LDE
+/// at a time (same as today's per-chip Merkle build).
+pub fn compute_chip_leaves_with_tag<E>(
+    columns: &[Vec<FieldElement<E>>],
+    tag: MatrixTag,
+) -> Vec<Commitment>
+where
+    E: IsField + Send + Sync,
+    FieldElement<E>: AsBytes + Sync + Send + ByteConversion,
+{
+    if columns.is_empty() || columns[0].is_empty() {
+        return Vec::new();
+    }
+    let num_rows = columns[0].len();
+    let num_cols = columns.len();
+    let byte_len = <FieldElement<E> as ByteConversion>::BYTE_LEN;
+    debug_assert!(
+        num_rows.is_power_of_two(),
+        "num_rows must be a power of two for reverse_index"
+    );
+
+    let total_bytes = num_cols * byte_len;
+
+    let hash_leaf = |buf: &mut [u8], row_idx: usize| -> Commitment {
+        let br_idx = reverse_index(row_idx, num_rows as u64);
+        for (col_idx, col) in columns.iter().enumerate() {
+            col[br_idx].write_bytes_be(&mut buf[col_idx * byte_len..(col_idx + 1) * byte_len]);
+        }
+        let mut h = Keccak256::new();
+        h.update(LEAF_DOMAIN_TAG);
+        h.update(tag.0);
+        h.update(&buf[..]);
+        h.finalize().into()
+    };
+
+    #[cfg(feature = "parallel")]
+    {
+        use rayon::prelude::*;
+        (0..num_rows)
+            .into_par_iter()
+            .map_init(
+                || vec![0u8; total_bytes],
+                |buf, row_idx| hash_leaf(buf, row_idx),
+            )
+            .collect()
+    }
+    #[cfg(not(feature = "parallel"))]
+    {
+        let mut buf = vec![0u8; total_bytes];
+        (0..num_rows)
+            .map(|row_idx| hash_leaf(&mut buf, row_idx))
+            .collect()
+    }
+}
+
+/// Convenience: build the unified main-trace MMCS from `(tag, leaves)`
+/// pairs that the caller produced via [`compute_chip_leaves_with_tag`].
+pub fn build_main_trace_mmcs<F>(
+    entries: Vec<(MatrixTag, Vec<Commitment>)>,
+) -> Result<Mmcs<BatchedMerkleTreeBackend<F>>, MmcsError>
+where
+    F: IsField + Send + Sync,
+    FieldElement<F>: AsBytes + Sync + Send,
+{
+    let mut builder = MmcsBuilder::<BatchedMerkleTreeBackend<F>>::new();
+    for (tag, leaves) in entries {
+        builder.add_matrix(tag, leaves)?;
+    }
+    builder.finalize()
+}
+
+/// Convenience opening accessor for tests / callers that don't want to
+/// import `Mmcs` directly.
+pub fn open_main_trace_mmcs<F>(
+    mmcs: &Mmcs<BatchedMerkleTreeBackend<F>>,
+    global_index: usize,
+) -> Result<MmcsOpening<Commitment>, MmcsError>
+where
+    F: IsField + Send + Sync,
+    FieldElement<F>: AsBytes + Sync + Send,
+{
+    mmcs.open(global_index)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use math::field::goldilocks::GoldilocksField;
+
+    type FE = FieldElement<GoldilocksField>;
+
+    fn fake_columns(seed: u64, num_cols: usize, num_rows: usize) -> Vec<Vec<FE>> {
+        (0..num_cols)
+            .map(|c| {
+                (0..num_rows)
+                    .map(|r| FE::from((seed.wrapping_add(c as u64) * 31 + r as u64) % 1_000_003))
+                    .collect()
+            })
+            .collect()
+    }
+
+    #[test]
+    fn leaves_change_when_tag_changes() {
+        let cols = fake_columns(42, 4, 8);
+        let tag_a = tags::chip_tag(tags::CHIP_CPU, 0);
+        let tag_b = tags::chip_tag(tags::CHIP_CPU, 1);
+        let la = compute_chip_leaves_with_tag(&cols, tag_a);
+        let lb = compute_chip_leaves_with_tag(&cols, tag_b);
+        assert_eq!(la.len(), 8);
+        assert_eq!(la.len(), lb.len());
+        assert_ne!(la[0], lb[0], "tag must be in the leaf");
+        // Every row should differ; collision at one row would be extreme.
+        assert!(la.iter().zip(lb.iter()).any(|(a, b)| a != b));
+    }
+
+    #[test]
+    fn leaves_differ_from_legacy_format() {
+        // Sanity: our tagged leaves are NOT equal to a Keccak256 of just
+        // the row bytes (i.e. the legacy non-tagged format). Feeding old
+        // bytes into the MMCS would be a silent soundness bug.
+        let cols = fake_columns(1, 2, 4);
+        let tag = tags::chip_tag(tags::CHIP_BITWISE, 0);
+        let tagged = compute_chip_leaves_with_tag(&cols, tag);
+        let untagged: Commitment = {
+            let mut buf = [0u8; 2 * 8];
+            let br = reverse_index(0, 4);
+            for (c, col) in cols.iter().enumerate() {
+                col[br].write_bytes_be(&mut buf[c * 8..(c + 1) * 8]);
+            }
+            let mut h = Keccak256::new();
+            h.update(&buf[..]);
+            h.finalize().into()
+        };
+        assert_ne!(tagged[0], untagged);
+    }
+
+    #[test]
+    fn build_main_trace_mmcs_round_trips() {
+        // 3 chips at distinct heights — realistic small case.
+        let cols_a = fake_columns(1, 6, 16);
+        let cols_b = fake_columns(2, 4, 8);
+        let cols_c = fake_columns(3, 2, 4);
+        let tag_a = tags::chip_tag(tags::CHIP_CPU, 0);
+        let tag_b = tags::chip_tag(tags::CHIP_MEMW, 0);
+        let tag_c = tags::chip_tag(tags::CHIP_BITWISE, 0);
+        let leaves_a = compute_chip_leaves_with_tag(&cols_a, tag_a);
+        let leaves_b = compute_chip_leaves_with_tag(&cols_b, tag_b);
+        let leaves_c = compute_chip_leaves_with_tag(&cols_c, tag_c);
+        let entries = vec![(tag_a, leaves_a), (tag_b, leaves_b), (tag_c, leaves_c)];
+        let mmcs = build_main_trace_mmcs::<GoldilocksField>(entries).expect("build mmcs");
+        let spec = mmcs.spec();
+        // 16 is the max; open at every row in that domain.
+        for i in 0..16 {
+            let opening = mmcs.open(i).expect("open");
+            assert!(
+                opening.verify::<BatchedMerkleTreeBackend<GoldilocksField>>(mmcs.root(), &spec),
+                "round-trip failed at index {i}"
+            );
+        }
+    }
+
+    #[test]
+    fn build_main_trace_mmcs_same_height_chunks() {
+        // 3 chips at the SAME height — exercises the same-height combine
+        // path with realistic lambda-vm-style data (CPU chunks).
+        let cols_0 = fake_columns(10, 8, 16);
+        let cols_1 = fake_columns(11, 8, 16);
+        let cols_2 = fake_columns(12, 8, 16);
+        let entries = vec![
+            (
+                tags::chip_tag(tags::CHIP_CPU, 0),
+                compute_chip_leaves_with_tag(&cols_0, tags::chip_tag(tags::CHIP_CPU, 0)),
+            ),
+            (
+                tags::chip_tag(tags::CHIP_CPU, 1),
+                compute_chip_leaves_with_tag(&cols_1, tags::chip_tag(tags::CHIP_CPU, 1)),
+            ),
+            (
+                tags::chip_tag(tags::CHIP_CPU, 2),
+                compute_chip_leaves_with_tag(&cols_2, tags::chip_tag(tags::CHIP_CPU, 2)),
+            ),
+        ];
+        let mmcs = build_main_trace_mmcs::<GoldilocksField>(entries).expect("build mmcs");
+        let spec = mmcs.spec();
+        for i in 0..16 {
+            let opening = mmcs.open(i).expect("open");
+            assert!(
+                opening.verify::<BatchedMerkleTreeBackend<GoldilocksField>>(mmcs.root(), &spec)
+            );
+        }
+    }
+
+    #[test]
+    fn duplicate_tag_caught_at_build() {
+        // Two chips sharing a tag is a caller bug (e.g. forgot to bump
+        // chunk_index). MMCS rejects at finalize time.
+        let cols = fake_columns(7, 2, 4);
+        let tag = tags::chip_tag(tags::CHIP_CPU, 0);
+        let entries = vec![
+            (tag, compute_chip_leaves_with_tag(&cols, tag)),
+            (tag, compute_chip_leaves_with_tag(&cols, tag)),
+        ];
+        let err = build_main_trace_mmcs::<GoldilocksField>(entries);
+        assert!(matches!(err, Err(MmcsError::DuplicateTag)));
+    }
+}

From 5e784da412a49c5c572d6da1a81be56dc831ba33 Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Wed, 27 May 2026 11:50:31 -0300
Subject: [PATCH 07/21] feat(stark/mmcs): foundation types + leaf hash module +
 per-AIR tags

Foundation for the upcoming MMCS Phase C wire-up. No hot-path changes;
all new types are defined but unused. Lands the surfaces both the
prover-side wire-up commit and the verifier-side replay commit will
consume next.

- crypto/mmcs: add serde derives on MatrixTag and MmcsOpening so they
  can live in the proof format.
- stark/mmcs_leaf: single source of truth for the main-trace MMCS
  leaf hash format (LEAF_DOMAIN_TAG + hash_tagged_row_bytes +
  hash_tagged_row). Prover and verifier will both call this; the
  prover-side compute_chip_leaves_with_tag adapter calls it row-wise.
- stark/proof/stark: MainTraceOpening<F> struct (evaluations +
  evaluations_sym + MmcsOpening per row, paired with its symmetric
  counterpart). Not yet wired into DeepPolynomialOpening.
- stark/prover: MainCommit<F> struct (shared Arc<Mmcs> + per-table
  MatrixTag + optional precomputed tree). Unused at the wire-up level
  but defined here as the keystone type. Marked allow(dead_code).
- prover/lib: VmAirs::air_tags() returns the parallel Vec<MatrixTag>
  in air_trace_pairs / air_refs order. Prover and verifier must call
  this on identical VmAirs configurations.
---
 crypto/crypto/src/merkle_tree/mmcs.rs |  6 ++
 crypto/stark/src/lib.rs               |  1 +
 crypto/stark/src/mmcs_leaf.rs         | 84 +++++++++++++++++++++++++++
 crypto/stark/src/proof/stark.rs       | 17 ++++++
 crypto/stark/src/prover.rs            | 52 ++++++++++++++++-
 prover/src/lib.rs                     | 61 +++++++++++++++++++
 6 files changed, 220 insertions(+), 1 deletion(-)
 create mode 100644 crypto/stark/src/mmcs_leaf.rs

diff --git a/crypto/crypto/src/merkle_tree/mmcs.rs b/crypto/crypto/src/merkle_tree/mmcs.rs
index b775bf17a..8bbd8607f 100644
--- a/crypto/crypto/src/merkle_tree/mmcs.rs
+++ b/crypto/crypto/src/merkle_tree/mmcs.rs
@@ -31,6 +31,7 @@ use super::traits::IsMerkleTreeBackend;
 /// Per-matrix domain separator. Caller-defined; verifier reconstructs
 /// from chip spec.
 #[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
+#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
 pub struct MatrixTag(pub [u8; 8]);
 
 impl MatrixTag {
@@ -269,6 +270,11 @@ impl<B: IsMerkleTreeBackend> Mmcs<B> {
 }
 
 #[derive(Debug, Clone)]
+#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
+#[cfg_attr(
+    feature = "serde",
+    serde(bound = "N: serde::Serialize + serde::de::DeserializeOwned")
+)]
 pub struct MmcsOpening<N> {
     /// `(tag, leaf_at_shifted_index)` per matrix, in the builder's sort
     /// order (height desc, tag asc).
diff --git a/crypto/stark/src/lib.rs b/crypto/stark/src/lib.rs
index 7379594b4..dce36aece 100644
--- a/crypto/stark/src/lib.rs
+++ b/crypto/stark/src/lib.rs
@@ -17,6 +17,7 @@ pub mod grinding;
 #[cfg(feature = "instruments")]
 pub mod instruments;
 pub mod lookup;
+pub mod mmcs_leaf;
 pub(crate) mod par;
 pub mod proof;
 pub mod prover;
diff --git a/crypto/stark/src/mmcs_leaf.rs b/crypto/stark/src/mmcs_leaf.rs
new file mode 100644
index 000000000..4c2c84b67
--- /dev/null
+++ b/crypto/stark/src/mmcs_leaf.rs
@@ -0,0 +1,84 @@
+//! Single source of truth for the main-trace MMCS leaf hash format.
+//!
+//! Both the prover (when computing per-row leaves before MMCS build) and
+//! the verifier (when re-hashing a per-row opening to compare against
+//! `MmcsOpening::matrix_leaves`) must produce byte-identical digests for
+//! the same `(MatrixTag, row_bytes)` pair. Centralising the format here
+//! removes the risk of prover/verifier divergence.
+//!
+//! Leaf bytes layout:
+//!
+//! ```text
+//! Keccak256( LEAF_DOMAIN_TAG || tag.0 (8 bytes) || row_bytes_be )
+//! ```
+//!
+//! where `row_bytes_be` is every committed column's element written
+//! big-endian, in column order. For preprocessed tables the precomputed
+//! slice is NOT included here (those columns live in a separate
+//! per-table Merkle tree).
+//!
+//! Bump `LEAF_DOMAIN_TAG` on any wire-incompatible change.
+
+use crypto::merkle_tree::mmcs::MatrixTag;
+use math::field::element::FieldElement;
+use math::field::traits::IsField;
+use math::traits::ByteConversion;
+use sha3::{Digest, Keccak256};
+
+use crate::config::Commitment;
+
+/// Versioned domain separator for main-trace MMCS leaves. Bump suffix on
+/// any encoding change so old proofs cannot be silently re-interpreted.
+pub const LEAF_DOMAIN_TAG: &[u8] = b"LAMBDAVM_MAIN_MMCS_LEAF_V1";
+
+/// Hash one row's worth of column bytes into a leaf digest using the
+/// canonical tagged format. `row_bytes_be` is the concatenation of every
+/// committed column's element written big-endian, in column order.
+#[inline]
+pub fn hash_tagged_row_bytes(tag: MatrixTag, row_bytes_be: &[u8]) -> Commitment {
+    let mut h = Keccak256::new();
+    h.update(LEAF_DOMAIN_TAG);
+    h.update(tag.0);
+    h.update(row_bytes_be);
+    h.finalize().into()
+}
+
+/// Convenience: hash a row from individual field elements. Allocates a
+/// stack-or-heap buffer for the row, suitable for verifier-side per-query
+/// re-hashing (where allocation cost is dominated by FRI work anyway).
+pub fn hash_tagged_row<E>(tag: MatrixTag, row: &[FieldElement<E>]) -> Commitment
+where
+    E: IsField,
+    FieldElement<E>: ByteConversion,
+{
+    let byte_len = <FieldElement<E> as ByteConversion>::BYTE_LEN;
+    let mut buf = vec![0u8; row.len() * byte_len];
+    for (col_idx, fe) in row.iter().enumerate() {
+        fe.write_bytes_be(&mut buf[col_idx * byte_len..(col_idx + 1) * byte_len]);
+    }
+    hash_tagged_row_bytes(tag, &buf)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use math::field::goldilocks::GoldilocksField;
+
+    type FE = FieldElement<GoldilocksField>;
+
+    #[test]
+    fn tag_changes_digest() {
+        let row = vec![FE::from(1u64), FE::from(2u64), FE::from(3u64)];
+        let a = hash_tagged_row(MatrixTag::new([0; 8]), &row);
+        let b = hash_tagged_row(MatrixTag::new([1, 0, 0, 0, 0, 0, 0, 0]), &row);
+        assert_ne!(a, b);
+    }
+
+    #[test]
+    fn row_change_changes_digest() {
+        let tag = MatrixTag::new([7; 8]);
+        let row_a = vec![FE::from(1u64), FE::from(2u64)];
+        let row_b = vec![FE::from(1u64), FE::from(3u64)];
+        assert_ne!(hash_tagged_row(tag, &row_a), hash_tagged_row(tag, &row_b));
+    }
+}
diff --git a/crypto/stark/src/proof/stark.rs b/crypto/stark/src/proof/stark.rs
index 1751d60fe..ec11acd3b 100644
--- a/crypto/stark/src/proof/stark.rs
+++ b/crypto/stark/src/proof/stark.rs
@@ -1,3 +1,4 @@
+use crypto::merkle_tree::mmcs::MmcsOpening;
 use crypto::merkle_tree::proof::Proof;
 use math::field::{
     element::FieldElement,
@@ -17,6 +18,22 @@ pub struct PolynomialOpenings<F: IsField> {
     pub evaluations_sym: Vec<FieldElement<F>>,
 }
 
+/// Per-query main-trace opening backed by the shared MMCS.
+///
+/// The (iota, iota_sym) pair are consecutive global indices in the LDE.
+/// Each carries its own `MmcsOpening` because they live at different
+/// positions in the layer-0 array — there is no shared sibling sub-path
+/// between them at the leaf level (only at higher tree levels, which the
+/// MMCS opening encodes).
+#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
+#[serde(bound = "")]
+pub struct MainTraceOpening<F: IsField> {
+    pub evaluations: Vec<FieldElement<F>>,
+    pub evaluations_sym: Vec<FieldElement<F>>,
+    pub mmcs_opening: MmcsOpening<Commitment>,
+    pub mmcs_opening_sym: MmcsOpening<Commitment>,
+}
+
 #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
 #[serde(bound = "")]
 pub struct DeepPolynomialOpening<F: IsSubFieldOf<E>, E: IsField> {
diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs
index 53af372ec..5c35281e5 100644
--- a/crypto/stark/src/prover.rs
+++ b/crypto/stark/src/prover.rs
@@ -27,11 +27,15 @@ use rayon::prelude::{
 use crate::debug::validate_trace;
 use crate::fri;
 use crate::lookup::LOGUP_NUM_CHALLENGES;
+#[allow(unused_imports)]
+use crate::mmcs_leaf::hash_tagged_row_bytes;
 use crate::proof::stark::{DeepPolynomialOpenings, PolynomialOpenings};
 #[cfg(feature = "disk-spill")]
 use crate::storage_mode::StorageMode;
 use crate::table::Table;
 use crate::trace::LDETraceTable;
+#[allow(unused_imports)]
+use crypto::merkle_tree::mmcs::{MatrixTag, Mmcs, MmcsBuilder, MmcsError};
 
 use super::config::{BatchedMerkleTree, BatchedMerkleTreeBackend, Commitment};
 use super::constraints::evaluator::ConstraintEvaluator;
@@ -149,6 +153,52 @@ where
     }
 }
 
+/// Per-table commitment artifacts for the main trace under the shared
+/// MMCS protocol. The `mmcs` Arc is the SAME instance for every table in
+/// the multi-proof — Phase A builds it once.
+///
+/// Currently unused at the wire-up level; defined here as the keystone
+/// type for the upcoming MMCS Phase C wire-up (see
+/// `docs/mmcs-streaming-c1-spec.md`). Marked `allow(dead_code)` until the
+/// follow-up commit consumes it.
+#[allow(dead_code)]
+pub(crate) struct MainCommit<F: IsField>
+where
+    FieldElement<F>: AsBytes,
+{
+    /// Shared MMCS across all tables in the multi-proof.
+    pub(crate) mmcs: Arc<Mmcs<BatchedMerkleTreeBackend<F>>>,
+    /// This table's MatrixTag within the MMCS.
+    pub(crate) tag: MatrixTag,
+    /// Preprocessed tables only: separate Merkle tree over precomputed columns.
+    pub(crate) precomputed_tree: Option<Arc<BatchedMerkleTree<F>>>,
+    /// Preprocessed tables only: root of `precomputed_tree`.
+    pub(crate) precomputed_root: Option<Commitment>,
+    /// Preprocessed tables only: number of precomputed columns. Zero otherwise.
+    pub(crate) num_precomputed_cols: usize,
+}
+
+#[allow(dead_code)]
+impl<F: IsField> MainCommit<F>
+where
+    FieldElement<F>: AsBytes,
+{
+    fn is_preprocessed(&self) -> bool {
+        self.precomputed_tree.is_some()
+    }
+
+    /// Cheap clone. Only bumps Arc refcounts.
+    fn share(&self) -> Self {
+        Self {
+            mmcs: Arc::clone(&self.mmcs),
+            tag: self.tag,
+            precomputed_tree: self.precomputed_tree.as_ref().map(Arc::clone),
+            precomputed_root: self.precomputed_root,
+            num_precomputed_cols: self.num_precomputed_cols,
+        }
+    }
+}
+
 /// A container for the results of the first round of the STARK Prove protocol.
 pub(crate) struct Round1<Field, FieldExtension>
 where
@@ -201,7 +251,7 @@ where
     FieldElement<FieldExtension>: AsBytes,
 {
     /// Build a `Round1` by consuming a `Lde` and borrowing commitment data.
-    /// The `TableCommit::share` calls are cheap — only bump Arc refcounts.
+    /// The `share` calls are cheap — only bump Arc refcounts.
     fn build_round1(
         &self,
         lde: Lde<Field, FieldExtension>,
diff --git a/prover/src/lib.rs b/prover/src/lib.rs
index cfa9a52ef..1632ac642 100644
--- a/prover/src/lib.rs
+++ b/prover/src/lib.rs
@@ -273,6 +273,67 @@ impl VmAirs {
         pairs
     }
 
+    /// Build the parallel `Vec<MatrixTag>` for the main-trace MMCS, in the
+    /// exact same order as [`Self::air_trace_pairs`] and [`Self::air_refs`].
+    /// Prover and verifier MUST call this on identical `VmAirs` configurations.
+    ///
+    /// Currently unused at the call sites; defined here as the foundation
+    /// for the upcoming MMCS Phase C wire-up (see
+    /// `docs/mmcs-streaming-c1-spec.md`).
+    #[allow(dead_code)]
+    pub fn air_tags(&self) -> Vec<crypto::merkle_tree::mmcs::MatrixTag> {
+        use crate::tables::mmcs_tags::{
+            CHIP_BITWISE, CHIP_BRANCH, CHIP_COMMIT, CHIP_CPU, CHIP_DECODE, CHIP_DVRM, CHIP_HALT,
+            CHIP_KECCAK, CHIP_KECCAK_RC, CHIP_KECCAK_RND, CHIP_LOAD, CHIP_LT, CHIP_MEMW,
+            CHIP_MEMW_ALIGNED, CHIP_MEMW_REGISTER, CHIP_MUL, CHIP_PAGE, CHIP_REGISTER, CHIP_SHIFT,
+            chip_tag,
+        };
+        let mut tags = vec![
+            chip_tag(CHIP_BITWISE, 0),
+            chip_tag(CHIP_DECODE, 0),
+            chip_tag(CHIP_HALT, 0),
+            chip_tag(CHIP_COMMIT, 0),
+            chip_tag(CHIP_KECCAK, 0),
+            chip_tag(CHIP_KECCAK_RND, 0),
+            chip_tag(CHIP_KECCAK_RC, 0),
+            chip_tag(CHIP_REGISTER, 0),
+        ];
+        for i in 0..self.cpus.len() {
+            tags.push(chip_tag(CHIP_CPU, i as u32));
+        }
+        for i in 0..self.lts.len() {
+            tags.push(chip_tag(CHIP_LT, i as u32));
+        }
+        for i in 0..self.shifts.len() {
+            tags.push(chip_tag(CHIP_SHIFT, i as u32));
+        }
+        for i in 0..self.memws.len() {
+            tags.push(chip_tag(CHIP_MEMW, i as u32));
+        }
+        for i in 0..self.memw_aligneds.len() {
+            tags.push(chip_tag(CHIP_MEMW_ALIGNED, i as u32));
+        }
+        for i in 0..self.loads.len() {
+            tags.push(chip_tag(CHIP_LOAD, i as u32));
+        }
+        for i in 0..self.muls.len() {
+            tags.push(chip_tag(CHIP_MUL, i as u32));
+        }
+        for i in 0..self.dvrms.len() {
+            tags.push(chip_tag(CHIP_DVRM, i as u32));
+        }
+        for i in 0..self.branches.len() {
+            tags.push(chip_tag(CHIP_BRANCH, i as u32));
+        }
+        for i in 0..self.pages.len() {
+            tags.push(chip_tag(CHIP_PAGE, i as u32));
+        }
+        for i in 0..self.memw_registers.len() {
+            tags.push(chip_tag(CHIP_MEMW_REGISTER, i as u32));
+        }
+        tags
+    }
+
     /// Collect AIR references for [`Verifier::multi_verify`].
     pub fn air_refs(&self) -> Vec<&dyn AIR<Field = F, FieldExtension = E, PublicInputs = ()>> {
         let mut refs: Vec<&dyn AIR<Field = F, FieldExtension = E, PublicInputs = ()>> = vec![

From 00cc9d0ed1ec9a041981d41fdb3c03cb7be956a7 Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Wed, 27 May 2026 12:20:51 -0300
Subject: [PATCH 08/21] feat(stark): thread main_tags through multi_prove /
 multi_verify
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Wires the parallel Vec<MatrixTag> through both prover and verifier
signatures. The value is not consumed yet (let _ = main_tags), only
validated against AIR count. Lays the surface for the upcoming Phase A
MMCS wire-up without changing any cryptographic behaviour.

- multi_prove(air_trace_pairs, main_tags, transcript, ...)
- multi_verify(airs, multi_proof, main_tags, transcript, balance)
- Both reject a mismatch between `main_tags.len()` and AIR count.
- Single-AIR `prove` / `verify` synthesize a default tag.
- stark::test_utils::multi_verify_ram helper hides the synthetic tag
  generation from stark internal tests; bulk-migrated all 37 call
  sites to use it.
- New `mmcs_leaf::synth_main_tags(n)` + `synth_main_tags_for(slice)`
  helpers for lambda-vm tests that build ad-hoc AIR slices.
- prover/src/lib.rs `prove` / `verify` wire `airs.air_tags()` to the
  shared MMCS-bound transcript.
- lambda-vm tests bulk-updated to pass synthesized tags (values are
  ignored under the current `let _ = main_tags` no-op).

Tests: stark 130/130 green; lambda-vm-prover non-ELF tests green
(prove_elfs failures pre-date this change — they need ELF binaries
not present in the working tree).
---
 crypto/stark/src/mmcs_leaf.rs                 | 16 +++++++
 crypto/stark/src/prover.rs                    | 16 +++++++
 crypto/stark/src/test_utils.rs                | 30 +++++++++++++
 crypto/stark/src/tests/air_tests.rs           |  6 +--
 .../src/tests/bus_tests/completeness_tests.rs | 13 +++---
 .../src/tests/bus_tests/multiplicity_tests.rs |  7 ++-
 .../src/tests/bus_tests/soundness_tests.rs    | 44 +++++++++----------
 .../src/tests/prove_verify_roundtrip_tests.rs |  3 +-
 crypto/stark/src/tests/prover_tests.rs        |  5 +--
 crypto/stark/src/verifier.rs                  | 20 ++++++++-
 prover/src/lib.rs                             |  4 ++
 prover/src/test_utils.rs                      |  2 +
 prover/src/tests/bitwise_bus_tests.rs         |  2 +
 prover/src/tests/bitwise_tests.rs             |  3 ++
 prover/src/tests/branch_bus_tests.rs          |  2 +
 prover/src/tests/decode_tests.rs              |  2 +
 prover/src/tests/lt_bus_tests.rs              |  2 +
 prover/src/tests/prove_elfs_tests.rs          |  7 +++
 18 files changed, 142 insertions(+), 42 deletions(-)

diff --git a/crypto/stark/src/mmcs_leaf.rs b/crypto/stark/src/mmcs_leaf.rs
index 4c2c84b67..488a937af 100644
--- a/crypto/stark/src/mmcs_leaf.rs
+++ b/crypto/stark/src/mmcs_leaf.rs
@@ -31,6 +31,22 @@ use crate::config::Commitment;
 /// any encoding change so old proofs cannot be silently re-interpreted.
 pub const LEAF_DOMAIN_TAG: &[u8] = b"LAMBDAVM_MAIN_MMCS_LEAF_V1";
 
+/// Synthesize `n` distinct [`MatrixTag`]s derived from positional index.
+/// Useful for generic stark tests where the caller does not own a stable
+/// chip-type assignment. Production code in lambda-vm uses
+/// `VmAirs::air_tags()` instead, which encodes chip type + chunk index.
+pub fn synth_main_tags(n: usize) -> Vec<MatrixTag> {
+    (0..n)
+        .map(|i| MatrixTag::new((i as u64).to_le_bytes()))
+        .collect()
+}
+
+/// Convenience: synthesize `MatrixTag`s sized to a slice. Equivalent to
+/// `synth_main_tags(slice.len())`.
+pub fn synth_main_tags_for<T>(slice: &[T]) -> Vec<MatrixTag> {
+    synth_main_tags(slice.len())
+}
+
 /// Hash one row's worth of column bytes into a leaf digest using the
 /// canonical tagged format. `row_bytes_be` is the concatenation of every
 /// committed column's element written big-endian, in column order.
diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs
index 5c35281e5..f71fc4343 100644
--- a/crypto/stark/src/prover.rs
+++ b/crypto/stark/src/prover.rs
@@ -1495,6 +1495,7 @@ pub trait IsStarkProver<
     /// The transcript must be safely initialized before passing it to this method.
     fn multi_prove(
         mut air_trace_pairs: Vec<AirTracePair<'_, Field, FieldExtension, PI>>,
+        main_tags: &[MatrixTag],
         transcript: &mut (impl IsStarkTranscript<FieldExtension, Field> + Clone + Send),
         #[cfg(feature = "disk-spill")] storage_mode: StorageMode,
     ) -> Result<MultiProof<Field, FieldExtension, PI>, ProvingError>
@@ -1516,6 +1517,16 @@ pub trait IsStarkProver<
 
         let num_airs = air_trace_pairs.len();
 
+        if main_tags.len() != num_airs {
+            return Err(ProvingError::WrongParameter(format!(
+                "main_tags len ({}) does not match number of AIRs ({})",
+                main_tags.len(),
+                num_airs
+            )));
+        }
+        // `main_tags` is reserved for the upcoming MMCS wire-up; not consumed yet.
+        let _ = main_tags;
+
         // Check if any AIR has an auxiliary trace
         let needs_lookup_challenges = air_trace_pairs
             .iter()
@@ -1972,8 +1983,13 @@ pub trait IsStarkProver<
         <FieldExtension as IsField>::BaseType: SpillSafe,
     {
         let air_trace_pairs = vec![(air, trace, pub_inputs)];
+        // Single-AIR path: synthesize a default tag. Callers that want
+        // multi-table soundness should call `multi_prove` directly with
+        // distinct tags.
+        let main_tags = [MatrixTag::new([0; 8])];
         Self::multi_prove(
             air_trace_pairs,
+            &main_tags,
             transcript,
             #[cfg(feature = "disk-spill")]
             StorageMode::Ram,
diff --git a/crypto/stark/src/test_utils.rs b/crypto/stark/src/test_utils.rs
index f5cd19f80..383d0c3d3 100644
--- a/crypto/stark/src/test_utils.rs
+++ b/crypto/stark/src/test_utils.rs
@@ -5,6 +5,7 @@ use crate::prover::{IsStarkProver, Prover, ProvingError};
 use crate::trace::TraceTable;
 use crate::traits::AIR;
 use crypto::fiat_shamir::is_transcript::IsStarkTranscript;
+// MatrixTag is re-exported via `synth_main_tags`; no direct use here.
 use math::field::element::FieldElement;
 use math::field::traits::{IsFFTField, IsField, IsSubFieldOf};
 use math::spill_safe::SpillSafe;
@@ -16,6 +17,33 @@ type AirTracePair<'a, Field, FieldExtension, PI> = (
     &'a PI,
 );
 
+pub use crate::mmcs_leaf::synth_main_tags;
+
+pub fn multi_verify_ram<Field, FieldExtension, PI>(
+    airs: &[&dyn AIR<Field = Field, FieldExtension = FieldExtension, PublicInputs = PI>],
+    multi_proof: &MultiProof<Field, FieldExtension, PI>,
+    transcript: &mut (impl IsStarkTranscript<FieldExtension, Field> + Clone),
+    expected_bus_balance: &FieldElement<FieldExtension>,
+) -> bool
+where
+    Field: IsSubFieldOf<FieldExtension> + IsFFTField + Send + Sync + Copy + 'static,
+    FieldExtension: IsField + Send + Sync + Copy + 'static,
+    FieldElement<Field>: AsBytes + ByteConversion + Sync + Send,
+    FieldElement<FieldExtension>: AsBytes + ByteConversion + Sync + Send,
+    <Field as IsField>::BaseType: SpillSafe,
+    <FieldExtension as IsField>::BaseType: SpillSafe,
+{
+    use crate::verifier::{IsStarkVerifier, Verifier};
+    let main_tags = synth_main_tags(airs.len());
+    Verifier::<Field, FieldExtension, PI>::multi_verify(
+        airs,
+        multi_proof,
+        &main_tags,
+        transcript,
+        expected_bus_balance,
+    )
+}
+
 pub fn multi_prove_ram<Field, FieldExtension, PI>(
     air_trace_pairs: Vec<AirTracePair<'_, Field, FieldExtension, PI>>,
     transcript: &mut (impl IsStarkTranscript<FieldExtension, Field> + Clone + Send),
@@ -29,8 +57,10 @@ where
     <Field as IsField>::BaseType: SpillSafe,
     <FieldExtension as IsField>::BaseType: SpillSafe,
 {
+    let main_tags = synth_main_tags(air_trace_pairs.len());
     Prover::<Field, FieldExtension, PI>::multi_prove(
         air_trace_pairs,
+        &main_tags,
         transcript,
         #[cfg(feature = "disk-spill")]
         crate::storage_mode::StorageMode::Ram,
diff --git a/crypto/stark/src/tests/air_tests.rs b/crypto/stark/src/tests/air_tests.rs
index 8e20f303e..5084c7462 100644
--- a/crypto/stark/src/tests/air_tests.rs
+++ b/crypto/stark/src/tests/air_tests.rs
@@ -411,7 +411,7 @@ fn test_multi_prove_fib_3_tables() {
         >,
     > = vec![&air_1, &air_2, &air_3];
 
-    assert!(Verifier::multi_verify(
+    assert!(crate::test_utils::multi_verify_ram(
         &airs,
         &multi_proof,
         &mut DefaultTranscript::<F>::new(&[]),
@@ -515,7 +515,7 @@ fn test_multi_prove_2_tables_small_field() {
         >,
     > = vec![&air_1, &air_2];
 
-    assert!(Verifier::multi_verify(
+    assert!(crate::test_utils::multi_verify_ram(
         &airs,
         &multi_proof,
         &mut DefaultTranscript::<Degree3GoldilocksExtensionField>::new(&[]),
@@ -545,7 +545,7 @@ fn test_multi_prove_different_airs() {
         &dyn AIR<Field = GoldilocksField, FieldExtension = GoldilocksField, PublicInputs = ()>,
     > = vec![&air_1, &air_2];
 
-    assert!(Verifier::multi_verify(
+    assert!(crate::test_utils::multi_verify_ram(
         &airs,
         &multi_proof,
         &mut DefaultTranscript::<F>::new(&[]),
diff --git a/crypto/stark/src/tests/bus_tests/completeness_tests.rs b/crypto/stark/src/tests/bus_tests/completeness_tests.rs
index 83f8ac391..cdef1d1e3 100644
--- a/crypto/stark/src/tests/bus_tests/completeness_tests.rs
+++ b/crypto/stark/src/tests/bus_tests/completeness_tests.rs
@@ -19,7 +19,6 @@ use crate::proof::options::ProofOptions;
 use crate::test_utils::multi_prove_ram;
 use crate::trace::TraceTable;
 use crate::traits::AIR;
-use crate::verifier::{IsStarkVerifier, Verifier};
 
 type F = GoldilocksField;
 type E = Degree3GoldilocksExtensionField;
@@ -127,7 +126,7 @@ fn test_multi_table_proof() {
     let airs: Vec<&dyn AIR<Field = F, FieldExtension = E, PublicInputs = ()>> =
         vec![&cpu_air, &add_air, &mul_air];
 
-    assert!(Verifier::multi_verify(
+    assert!(crate::test_utils::multi_verify_ram(
         &airs,
         &multi_proof,
         &mut DefaultTranscript::<E>::new(&[]),
@@ -190,7 +189,7 @@ fn test_all_padding() {
     let airs: Vec<&dyn AIR<Field = F, FieldExtension = E, PublicInputs = ()>> =
         vec![&cpu_air, &add_air, &mul_air];
 
-    assert!(Verifier::multi_verify(
+    assert!(crate::test_utils::multi_verify_ram(
         &airs,
         &multi_proof,
         &mut DefaultTranscript::<E>::new(&[]),
@@ -253,7 +252,7 @@ fn test_single_operation() {
     let airs: Vec<&dyn AIR<Field = F, FieldExtension = E, PublicInputs = ()>> =
         vec![&cpu_air, &add_air, &mul_air];
 
-    assert!(Verifier::multi_verify(
+    assert!(crate::test_utils::multi_verify_ram(
         &airs,
         &multi_proof,
         &mut DefaultTranscript::<E>::new(&[]),
@@ -316,7 +315,7 @@ fn test_duplicate_operations() {
     let airs: Vec<&dyn AIR<Field = F, FieldExtension = E, PublicInputs = ()>> =
         vec![&cpu_air, &add_air, &mul_air];
 
-    assert!(Verifier::multi_verify(
+    assert!(crate::test_utils::multi_verify_ram(
         &airs,
         &multi_proof,
         &mut DefaultTranscript::<E>::new(&[]),
@@ -384,7 +383,7 @@ fn test_serialization_roundtrip() {
     let airs: Vec<&dyn AIR<Field = F, FieldExtension = E, PublicInputs = ()>> =
         vec![&cpu_air, &add_air, &mul_air];
 
-    assert!(Verifier::multi_verify(
+    assert!(crate::test_utils::multi_verify_ram(
         &airs,
         &deserialized,
         &mut DefaultTranscript::<E>::new(&[]),
@@ -524,7 +523,7 @@ fn test_bus_value_features() {
     let airs: Vec<&dyn AIR<Field = F, FieldExtension = E, PublicInputs = ()>> =
         vec![&sender_air, &receiver_air];
 
-    assert!(Verifier::multi_verify(
+    assert!(crate::test_utils::multi_verify_ram(
         &airs,
         &multi_proof,
         &mut DefaultTranscript::<E>::new(&[]),
diff --git a/crypto/stark/src/tests/bus_tests/multiplicity_tests.rs b/crypto/stark/src/tests/bus_tests/multiplicity_tests.rs
index 7e4d632dd..7bbcbf239 100644
--- a/crypto/stark/src/tests/bus_tests/multiplicity_tests.rs
+++ b/crypto/stark/src/tests/bus_tests/multiplicity_tests.rs
@@ -18,7 +18,6 @@ use crate::proof::options::ProofOptions;
 use crate::test_utils::multi_prove_ram;
 use crate::trace::TraceTable;
 use crate::traits::AIR;
-use crate::verifier::{IsStarkVerifier, Verifier};
 
 type F = GoldilocksField;
 type E = Degree3GoldilocksExtensionField;
@@ -119,7 +118,7 @@ fn test_multiplicity_one() {
         vec![&sender, &receiver];
 
     assert!(
-        Verifier::multi_verify(
+        crate::test_utils::multi_verify_ram(
             &airs,
             &multi_proof,
             &mut DefaultTranscript::<E>::new(&[]),
@@ -229,7 +228,7 @@ fn test_multiplicity_sum() {
         vec![&sender, &receiver];
 
     assert!(
-        Verifier::multi_verify(
+        crate::test_utils::multi_verify_ram(
             &airs,
             &multi_proof,
             &mut DefaultTranscript::<E>::new(&[]),
@@ -337,7 +336,7 @@ fn test_multiplicity_negated() {
         vec![&sender, &receiver];
 
     assert!(
-        Verifier::multi_verify(
+        crate::test_utils::multi_verify_ram(
             &airs,
             &multi_proof,
             &mut DefaultTranscript::<E>::new(&[]),
diff --git a/crypto/stark/src/tests/bus_tests/soundness_tests.rs b/crypto/stark/src/tests/bus_tests/soundness_tests.rs
index fc718bf7c..d2af70678 100644
--- a/crypto/stark/src/tests/bus_tests/soundness_tests.rs
+++ b/crypto/stark/src/tests/bus_tests/soundness_tests.rs
@@ -85,7 +85,7 @@ fn test_wrong_result_value() {
     let airs: Vec<&dyn AIR<Field = F, FieldExtension = E, PublicInputs = ()>> =
         vec![&cpu_air, &add_air, &mul_air];
 
-    assert!(!Verifier::multi_verify(
+    assert!(!crate::test_utils::multi_verify_ram(
         &airs,
         &multi_proof,
         &mut DefaultTranscript::<E>::new(&[]),
@@ -148,7 +148,7 @@ fn test_off_by_one() {
     let airs: Vec<&dyn AIR<Field = F, FieldExtension = E, PublicInputs = ()>> =
         vec![&cpu_air, &add_air, &mul_air];
 
-    assert!(!Verifier::multi_verify(
+    assert!(!crate::test_utils::multi_verify_ram(
         &airs,
         &multi_proof,
         &mut DefaultTranscript::<E>::new(&[]),
@@ -211,7 +211,7 @@ fn test_swapped_operands() {
     let airs: Vec<&dyn AIR<Field = F, FieldExtension = E, PublicInputs = ()>> =
         vec![&cpu_air, &add_air, &mul_air];
 
-    assert!(!Verifier::multi_verify(
+    assert!(!crate::test_utils::multi_verify_ram(
         &airs,
         &multi_proof,
         &mut DefaultTranscript::<E>::new(&[]),
@@ -274,7 +274,7 @@ fn test_single_column_wrong() {
     let airs: Vec<&dyn AIR<Field = F, FieldExtension = E, PublicInputs = ()>> =
         vec![&cpu_air, &add_air, &mul_air];
 
-    assert!(!Verifier::multi_verify(
+    assert!(!crate::test_utils::multi_verify_ram(
         &airs,
         &multi_proof,
         &mut DefaultTranscript::<E>::new(&[]),
@@ -341,7 +341,7 @@ fn test_over_report_multiplicity() {
     let airs: Vec<&dyn AIR<Field = F, FieldExtension = E, PublicInputs = ()>> =
         vec![&cpu_air, &add_air, &mul_air];
 
-    assert!(!Verifier::multi_verify(
+    assert!(!crate::test_utils::multi_verify_ram(
         &airs,
         &multi_proof,
         &mut DefaultTranscript::<E>::new(&[]),
@@ -404,7 +404,7 @@ fn test_under_report_multiplicity() {
     let airs: Vec<&dyn AIR<Field = F, FieldExtension = E, PublicInputs = ()>> =
         vec![&cpu_air, &add_air, &mul_air];
 
-    assert!(!Verifier::multi_verify(
+    assert!(!crate::test_utils::multi_verify_ram(
         &airs,
         &multi_proof,
         &mut DefaultTranscript::<E>::new(&[]),
@@ -467,7 +467,7 @@ fn test_zero_multiplicity_skip() {
     let airs: Vec<&dyn AIR<Field = F, FieldExtension = E, PublicInputs = ()>> =
         vec![&cpu_air, &add_air, &mul_air];
 
-    assert!(!Verifier::multi_verify(
+    assert!(!crate::test_utils::multi_verify_ram(
         &airs,
         &multi_proof,
         &mut DefaultTranscript::<E>::new(&[]),
@@ -534,7 +534,7 @@ fn test_phantom_receive() {
     let airs: Vec<&dyn AIR<Field = F, FieldExtension = E, PublicInputs = ()>> =
         vec![&cpu_air, &add_air, &mul_air];
 
-    assert!(!Verifier::multi_verify(
+    assert!(!crate::test_utils::multi_verify_ram(
         &airs,
         &multi_proof,
         &mut DefaultTranscript::<E>::new(&[]),
@@ -597,7 +597,7 @@ fn test_missing_receiver() {
     let airs: Vec<&dyn AIR<Field = F, FieldExtension = E, PublicInputs = ()>> =
         vec![&cpu_air, &add_air, &mul_air];
 
-    assert!(!Verifier::multi_verify(
+    assert!(!crate::test_utils::multi_verify_ram(
         &airs,
         &multi_proof,
         &mut DefaultTranscript::<E>::new(&[]),
@@ -680,7 +680,7 @@ fn test_tampered_table_contribution() {
         vec![&cpu_air, &add_air, &mul_air];
 
     assert!(
-        !Verifier::multi_verify(
+        !crate::test_utils::multi_verify_ram(
             &airs,
             &multi_proof,
             &mut DefaultTranscript::<E>::new(&[]),
@@ -761,7 +761,7 @@ fn test_tampered_acc_ood_evaluation() {
         vec![&cpu_air, &add_air, &mul_air];
 
     assert!(
-        !Verifier::multi_verify(
+        !crate::test_utils::multi_verify_ram(
             &airs,
             &multi_proof,
             &mut DefaultTranscript::<E>::new(&[]),
@@ -837,7 +837,7 @@ fn test_missing_bus_public_inputs_rejected() {
         vec![&cpu_air, &add_air, &mul_air];
 
     assert!(
-        !Verifier::multi_verify(
+        !crate::test_utils::multi_verify_ram(
             &airs,
             &multi_proof,
             &mut DefaultTranscript::<E>::new(&[]),
@@ -963,7 +963,7 @@ fn test_zeroed_table_contribution_rejected() {
         vec![&cpu_air, &add_air, &mul_air];
 
     assert!(
-        !Verifier::multi_verify(
+        !crate::test_utils::multi_verify_ram(
             &airs,
             &multi_proof,
             &mut DefaultTranscript::<E>::new(&[]),
@@ -1032,7 +1032,7 @@ fn test_one_of_many_wrong() {
     let airs: Vec<&dyn AIR<Field = F, FieldExtension = E, PublicInputs = ()>> =
         vec![&cpu_air, &add_air, &mul_air];
 
-    assert!(!Verifier::multi_verify(
+    assert!(!crate::test_utils::multi_verify_ram(
         &airs,
         &multi_proof,
         &mut DefaultTranscript::<E>::new(&[]),
@@ -1140,7 +1140,7 @@ fn test_full_scenario_wrong_add() {
     let airs: Vec<&dyn AIR<Field = F, FieldExtension = E, PublicInputs = ()>> =
         vec![&cpu_air, &add_air, &mul_air];
 
-    assert!(!Verifier::multi_verify(
+    assert!(!crate::test_utils::multi_verify_ram(
         &airs,
         &multi_proof,
         &mut DefaultTranscript::<E>::new(&[]),
@@ -1217,7 +1217,7 @@ fn test_wrong_table_consumes_value_rejected() {
     // Verification MUST fail: MUL table cannot consume values sent to ADD bus
     // because bus_id is included in the fingerprint
     assert!(
-        !Verifier::multi_verify(
+        !crate::test_utils::multi_verify_ram(
             &airs,
             &multi_proof,
             &mut DefaultTranscript::<E>::new(&[]),
@@ -1334,7 +1334,7 @@ fn test_packing_mismatch_direct_vs_word2l() {
     // Sender: z - (100 + 200*α)
     // Receiver: z - (100 + 200*2^16) = z - (100 + 13107200)
     assert!(
-        !Verifier::multi_verify(
+        !crate::test_utils::multi_verify_ram(
             &airs,
             &multi_proof,
             &mut DefaultTranscript::<E>::new(&[]),
@@ -1439,7 +1439,7 @@ fn test_packing_mismatch_element_count() {
     // Receiver: z - ((10 + 20*65536) + 30*α) = z - (1310730 + 30*α)  [2 bus elements]
     // Different fingerprints!
     assert!(
-        !Verifier::multi_verify(
+        !crate::test_utils::multi_verify_ram(
             &airs,
             &multi_proof,
             &mut DefaultTranscript::<E>::new(&[]),
@@ -1538,7 +1538,7 @@ fn test_packing_mismatch_shift_constant() {
         vec![&sender, &receiver];
 
     assert!(
-        !Verifier::multi_verify(
+        !crate::test_utils::multi_verify_ram(
             &airs,
             &multi_proof,
             &mut DefaultTranscript::<E>::new(&[]),
@@ -1641,7 +1641,7 @@ fn test_compound_mismatch_dwordhhw_vs_dwordwhh() {
         vec![&sender, &receiver];
 
     assert!(
-        !Verifier::multi_verify(
+        !crate::test_utils::multi_verify_ram(
             &airs,
             &multi_proof,
             &mut DefaultTranscript::<E>::new(&[]),
@@ -1735,7 +1735,7 @@ fn test_compound_equals_primitive_expansion() {
 
     // This should PASS - compound and primitive expansion are equivalent
     assert!(
-        Verifier::multi_verify(
+        crate::test_utils::multi_verify_ram(
             &airs,
             &multi_proof,
             &mut DefaultTranscript::<E>::new(&[]),
@@ -1849,7 +1849,7 @@ fn test_full_scenario_wrong_mul() {
     let airs: Vec<&dyn AIR<Field = F, FieldExtension = E, PublicInputs = ()>> =
         vec![&cpu_air, &add_air, &mul_air];
 
-    assert!(!Verifier::multi_verify(
+    assert!(!crate::test_utils::multi_verify_ram(
         &airs,
         &multi_proof,
         &mut DefaultTranscript::<E>::new(&[]),
diff --git a/crypto/stark/src/tests/prove_verify_roundtrip_tests.rs b/crypto/stark/src/tests/prove_verify_roundtrip_tests.rs
index 4059ed481..b58df3975 100644
--- a/crypto/stark/src/tests/prove_verify_roundtrip_tests.rs
+++ b/crypto/stark/src/tests/prove_verify_roundtrip_tests.rs
@@ -18,7 +18,6 @@ use crate::proof::options::ProofOptions;
 use crate::proof::stark::MultiProof;
 use crate::test_utils::multi_prove_ram;
 use crate::traits::AIR;
-use crate::verifier::{IsStarkVerifier, Verifier};
 
 type F = GoldilocksField;
 type E = Degree3GoldilocksExtensionField;
@@ -168,7 +167,7 @@ fn test_verify_serialized_multi_table_proofs() {
         vec![&cpu_air, &add_air, &mul_air];
 
     assert!(
-        Verifier::multi_verify(
+        crate::test_utils::multi_verify_ram(
             &airs,
             &received_proofs,
             &mut DefaultTranscript::<E>::new(&[]),
diff --git a/crypto/stark/src/tests/prover_tests.rs b/crypto/stark/src/tests/prover_tests.rs
index c645eebb2..640608ae8 100644
--- a/crypto/stark/src/tests/prover_tests.rs
+++ b/crypto/stark/src/tests/prover_tests.rs
@@ -12,7 +12,6 @@ use crate::{
     tests::domain_cache_stats,
     trace::{LDETraceTable, get_trace_evaluations, get_trace_evaluations_from_lde},
     traits::AIR,
-    verifier::{IsStarkVerifier, Verifier},
 };
 use math::{
     field::{element::FieldElement, goldilocks::GoldilocksField, traits::IsFFTField},
@@ -304,7 +303,7 @@ fn test_multi_prove_mixed_coset_offsets() {
     > = vec![&air_1, &air_2];
 
     assert!(
-        Verifier::multi_verify(
+        crate::test_utils::multi_verify_ram(
             &airs,
             &multi_proof,
             &mut DefaultTranscript::<GoldilocksField>::new(&[]),
@@ -380,7 +379,7 @@ fn test_multi_prove_dedups_shared_domain_params() {
     > = vec![&air_1, &air_2, &air_3];
 
     assert!(
-        Verifier::multi_verify(
+        crate::test_utils::multi_verify_ram(
             &airs,
             &multi_proof,
             &mut DefaultTranscript::<GoldilocksField>::new(&[]),
diff --git a/crypto/stark/src/verifier.rs b/crypto/stark/src/verifier.rs
index 8091c8b32..7b86cd4b4 100644
--- a/crypto/stark/src/verifier.rs
+++ b/crypto/stark/src/verifier.rs
@@ -716,6 +716,7 @@ pub trait IsStarkVerifier<
     fn multi_verify(
         airs: &[&dyn AIR<Field = Field, FieldExtension = FieldExtension, PublicInputs = PI>],
         multi_proof: &MultiProof<Field, FieldExtension, PI>,
+        main_tags: &[crypto::merkle_tree::mmcs::MatrixTag],
         transcript: &mut (impl IsStarkTranscript<FieldExtension, Field> + Clone),
         expected_bus_balance: &FieldElement<FieldExtension>,
     ) -> bool
@@ -731,6 +732,16 @@ pub trait IsStarkVerifier<
             );
             return false;
         }
+        if main_tags.len() != airs.len() {
+            error!(
+                "main_tags count ({}) does not match AIR count ({})",
+                main_tags.len(),
+                airs.len()
+            );
+            return false;
+        }
+        // `main_tags` is reserved for the upcoming MMCS verifier replay.
+        let _ = main_tags;
 
         // Check if any AIR has an auxiliary trace
         let needs_lookup_challenges = airs.iter().any(|air| air.has_aux_trace());
@@ -903,7 +914,14 @@ pub trait IsStarkVerifier<
         let multi_proof = MultiProof {
             proofs: vec![proof.clone()],
         };
-        Self::multi_verify(&[air], &multi_proof, transcript, &FieldElement::zero())
+        let main_tags = [crypto::merkle_tree::mmcs::MatrixTag::new([0; 8])];
+        Self::multi_verify(
+            &[air],
+            &multi_proof,
+            &main_tags,
+            transcript,
+            &FieldElement::zero(),
+        )
     }
 
     /// Replays rounds 2, 3 and 4 of the protocol for a given proof, assuming round 1 has
diff --git a/prover/src/lib.rs b/prover/src/lib.rs
index 1632ac642..209d45853 100644
--- a/prover/src/lib.rs
+++ b/prover/src/lib.rs
@@ -736,8 +736,10 @@ pub fn prove_with_options_and_inputs(
     );
 
     // Phase 4: Prove (multi_prove)
+    let main_tags = airs.air_tags();
     let proof = Prover::multi_prove(
         airs.air_trace_pairs(&mut traces),
+        &main_tags,
         &mut transcript,
         #[cfg(feature = "disk-spill")]
         storage_mode,
@@ -870,9 +872,11 @@ pub fn verify_with_options(
         None => return Ok(false),
     };
 
+    let main_tags = airs.air_tags();
     Ok(Verifier::multi_verify(
         &air_refs,
         &vm_proof.proof,
+        &main_tags,
         &mut transcript,
         &expected_bus_balance,
     ))
diff --git a/prover/src/test_utils.rs b/prover/src/test_utils.rs
index 1b608034c..db62577dc 100644
--- a/prover/src/test_utils.rs
+++ b/prover/src/test_utils.rs
@@ -100,8 +100,10 @@ pub fn multi_prove_ram<PI>(
 where
     PI: Send + Sync + Clone,
 {
+    let main_tags = stark::mmcs_leaf::synth_main_tags(air_trace_pairs.len());
     Prover::<F, E, PI>::multi_prove(
         air_trace_pairs,
+        &main_tags,
         transcript,
         #[cfg(feature = "disk-spill")]
         StorageMode::Ram,
diff --git a/prover/src/tests/bitwise_bus_tests.rs b/prover/src/tests/bitwise_bus_tests.rs
index 2a5fd31dd..2dca2d670 100644
--- a/prover/src/tests/bitwise_bus_tests.rs
+++ b/prover/src/tests/bitwise_bus_tests.rs
@@ -205,6 +205,7 @@ fn prove_and_verify(sender_lookups: &[(u8, u8, u8)]) -> bool {
     Verifier::multi_verify(
         &airs,
         &multi_proof,
+        &stark::mmcs_leaf::synth_main_tags_for(&airs),
         &mut DefaultTranscript::<E>::new(&[]),
         &FieldElement::zero(),
     )
@@ -315,6 +316,7 @@ fn prove_and_verify_custom(
     Verifier::multi_verify(
         &airs,
         &multi_proof,
+        &stark::mmcs_leaf::synth_main_tags_for(&airs),
         &mut DefaultTranscript::<E>::new(&[]),
         &FieldElement::zero(),
     )
diff --git a/prover/src/tests/bitwise_tests.rs b/prover/src/tests/bitwise_tests.rs
index 8337f8bf7..fc9907ed4 100644
--- a/prover/src/tests/bitwise_tests.rs
+++ b/prover/src/tests/bitwise_tests.rs
@@ -599,6 +599,7 @@ mod soundness_tests {
         let result = Verifier::multi_verify(
             &airs,
             &multi_proof,
+            &stark::mmcs_leaf::synth_main_tags_for(&airs),
             &mut DefaultTranscript::<E>::new(&[]),
             &FieldElement::zero(),
         );
@@ -647,6 +648,7 @@ mod soundness_tests {
         let result = Verifier::multi_verify(
             &airs,
             &multi_proof,
+            &stark::mmcs_leaf::synth_main_tags_for(&airs),
             &mut DefaultTranscript::<E>::new(&[]),
             &FieldElement::zero(),
         );
@@ -718,6 +720,7 @@ mod soundness_tests {
         let result = Verifier::multi_verify(
             &verifier_airs,
             &multi_proof,
+            &stark::mmcs_leaf::synth_main_tags_for(&verifier_airs),
             &mut DefaultTranscript::<E>::new(&[]),
             &FieldElement::zero(),
         );
diff --git a/prover/src/tests/branch_bus_tests.rs b/prover/src/tests/branch_bus_tests.rs
index c19a580ad..d234a585d 100644
--- a/prover/src/tests/branch_bus_tests.rs
+++ b/prover/src/tests/branch_bus_tests.rs
@@ -348,6 +348,7 @@ fn prove_and_verify(ops: &[BranchOperation]) -> bool {
     Verifier::multi_verify(
         &airs,
         &multi_proof,
+        &stark::mmcs_leaf::synth_main_tags_for(&airs),
         &mut DefaultTranscript::<E>::new(&[]),
         &FieldElement::zero(),
     )
@@ -438,6 +439,7 @@ fn prove_and_verify_custom(ops: &[BranchOperation], receiver_rows: &[CustomBranc
     Verifier::multi_verify(
         &airs,
         &multi_proof,
+        &stark::mmcs_leaf::synth_main_tags_for(&airs),
         &mut DefaultTranscript::<E>::new(&[]),
         &FieldElement::zero(),
     )
diff --git a/prover/src/tests/decode_tests.rs b/prover/src/tests/decode_tests.rs
index c6a436c95..fd11d6392 100644
--- a/prover/src/tests/decode_tests.rs
+++ b/prover/src/tests/decode_tests.rs
@@ -981,6 +981,7 @@ fn test_decode_soundness_different_elf_rejected() {
     let result = Verifier::multi_verify(
         &verifier_airs,
         &proof,
+        &stark::mmcs_leaf::synth_main_tags_for(&verifier_airs),
         &mut DefaultTranscript::<E>::new(&[]),
         &FieldElement::zero(),
     );
@@ -1076,6 +1077,7 @@ fn test_decode_soundness_same_elf_accepted() {
     let result = Verifier::multi_verify(
         &verifier_air_refs,
         &proof,
+        &stark::mmcs_leaf::synth_main_tags_for(&verifier_air_refs),
         &mut DefaultTranscript::<E>::new(&[]),
         &expected_bus_balance,
     );
diff --git a/prover/src/tests/lt_bus_tests.rs b/prover/src/tests/lt_bus_tests.rs
index dcc555780..36011c831 100644
--- a/prover/src/tests/lt_bus_tests.rs
+++ b/prover/src/tests/lt_bus_tests.rs
@@ -301,6 +301,7 @@ fn prove_and_verify(ops: &[LtOperation]) -> bool {
     Verifier::multi_verify(
         &airs,
         &multi_proof,
+        &stark::mmcs_leaf::synth_main_tags_for(&airs),
         &mut DefaultTranscript::<E>::new(&[]),
         &FieldElement::zero(),
     )
@@ -385,6 +386,7 @@ fn prove_and_verify_custom(ops: &[LtOperation], receiver_rows: &[CustomLtRow]) -
     Verifier::multi_verify(
         &airs,
         &multi_proof,
+        &stark::mmcs_leaf::synth_main_tags_for(&airs),
         &mut DefaultTranscript::<E>::new(&[]),
         &FieldElement::zero(),
     )
diff --git a/prover/src/tests/prove_elfs_tests.rs b/prover/src/tests/prove_elfs_tests.rs
index fe97911b9..1e0c1236f 100644
--- a/prover/src/tests/prove_elfs_tests.rs
+++ b/prover/src/tests/prove_elfs_tests.rs
@@ -80,6 +80,7 @@ fn prove_and_verify_vm_minimal(elf: &Elf, traces: &mut Traces) -> bool {
     Verifier::multi_verify(
         &airs.air_refs(),
         &multi_proof,
+        &stark::mmcs_leaf::synth_main_tags_for(&airs.air_refs()),
         &mut DefaultTranscript::<E>::new(&[]),
         &expected_bus_balance,
     )
@@ -135,6 +136,7 @@ fn test_cpu_only_no_bus() {
         Verifier::multi_verify(
             &airs,
             &multi_proof,
+            &stark::mmcs_leaf::synth_main_tags_for(&airs),
             &mut DefaultTranscript::<E>::new(&[]),
             &FieldElement::zero(),
         ),
@@ -895,6 +897,7 @@ fn test_prove_elfs_test_commit_4_wrong_pages_rejected() {
     let verified = Verifier::multi_verify(
         &verifier_air_refs,
         &proof,
+        &stark::mmcs_leaf::synth_main_tags_for(&verifier_air_refs),
         &mut DefaultTranscript::<E>::new(&[]),
         &expected_bus_balance,
     );
@@ -1633,6 +1636,7 @@ fn test_deep_stack_runtime_pages_roundtrip() {
     let verified = Verifier::multi_verify(
         &verifier_air_refs,
         &proof,
+        &stark::mmcs_leaf::synth_main_tags_for(&verifier_air_refs),
         &mut DefaultTranscript::<E>::new(&[]),
         &expected_bus_balance,
     );
@@ -1690,6 +1694,7 @@ fn test_deep_stack_missing_pages_rejected() {
     let verified = Verifier::multi_verify(
         &verifier_air_refs,
         &proof,
+        &stark::mmcs_leaf::synth_main_tags_for(&verifier_air_refs),
         &mut DefaultTranscript::<E>::new(&[]),
         &expected_bus_balance,
     );
@@ -1782,6 +1787,7 @@ fn test_heap_alloc_runtime_pages_roundtrip() {
     let verified = Verifier::multi_verify(
         &verifier_air_refs,
         &proof,
+        &stark::mmcs_leaf::synth_main_tags_for(&verifier_air_refs),
         &mut DefaultTranscript::<E>::new(&[]),
         &expected_bus_balance,
     );
@@ -1942,6 +1948,7 @@ fn test_crafted_zero_count_proof_must_not_verify() {
     let verified = Verifier::multi_verify(
         &verifier_air_refs,
         &proof,
+        &stark::mmcs_leaf::synth_main_tags_for(&verifier_air_refs),
         &mut DefaultTranscript::<E>::new(&[]),
         &FieldElement::zero(),
     );

From 284fe640aa4338b866de1e9f89b641c36b9a1a2d Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Wed, 27 May 2026 14:41:14 -0300
Subject: [PATCH 09/21] feat(stark/mmcs): wire MMCS into Phase A + verifier;
 preprocessed kept on per-table trees
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase C wire-up of the streaming-MMCS plan. Non-preprocessed tables are now
committed under a single shared main-trace MMCS; preprocessed tables stay
OUT of the MMCS and keep their own per-table multiplicities Merkle tree
(in addition to the AIR-pinned precomputed tree).

Architecture:
- `MainCommit<F>` is now an enum:
  * `Shared { mmcs (Arc), tag, padded_height }` for non-preprocessed
  * `Preprocessed { multiplicities_tree, multiplicities_root,
                    precomputed_tree, precomputed_root, num_precomputed_cols }`
- `MainTraceOpening<F>` is now an enum:
  * `Mmcs { evaluations, evaluations_sym, mmcs_opening, mmcs_opening_sym }`
  * `Tree(PolynomialOpenings<F>)` for the preprocessed per-table tree
- `MultiProof` gains `main_mmcs_root: Commitment` and
  `main_mmcs_spec: Vec<(MatrixTag, usize)>` (the MMCS is at multi-proof
  level, hence single-AIR `prove` / `verify` now return / take MultiProof).
- `StarkProof.lde_trace_main_merkle_root: Option<Commitment>` is repurposed:
  `Some` for preprocessed tables (per-table multiplicities root), `None`
  for non-preprocessed tables (covered by `main_mmcs_root`).

Phase A absorb order (prover + verifier + `replay_transcript_phase_a` match):
  for each table in spec-fixed order:
      if preprocessed:
          absorb precomputed_root           (AIR-pinned, verifier rejects mismatch)
          absorb multiplicities_root         (per-table tree)
  absorb main_mmcs_root                      (once, over Shared tables only)

Per-query main-trace opening:
- Shared tables: `mmcs.open(iota*2 << shift)` and `mmcs.open((iota*2+1) << shift)`,
  where `shift = log2(max_height / this_padded_height)`. The verifier
  rehashes `evaluations` with `mmcs_leaf::hash_tagged_row`, compares against
  `matrix_leaves[table_idx]`, and authenticates against root+spec.
- Preprocessed tables: `verify_opening_pair` against
  `StarkProof.lde_trace_main_merkle_root` (unchanged single-tree path).

Test-suite updates (single-AIR `prove` now returns MultiProof):
- crypto/stark/src/tests/small_trace_tests.rs: field access via `proof.proofs[0]`,
  `MainTraceOpening` matched as enum.
- crypto/stark/src/tests/bus_tests/soundness_tests.rs: same.

Tests: stark 130/130 green. lambda-vm-prover non-ELF tests unchanged
(the 77 prove_elfs failures predate this change — same UnknownSyscall(5)
executor bug on `git stash` baseline).
---
 crypto/stark/src/proof/stark.rs               |  67 ++-
 crypto/stark/src/prover.rs                    | 478 +++++++++++++-----
 .../src/tests/bus_tests/soundness_tests.rs    |   2 +-
 crypto/stark/src/tests/small_trace_tests.rs   |  31 +-
 crypto/stark/src/verifier.rs                  | 230 +++++++--
 prover/src/lib.rs                             |  12 +-
 6 files changed, 627 insertions(+), 193 deletions(-)

diff --git a/crypto/stark/src/proof/stark.rs b/crypto/stark/src/proof/stark.rs
index ec11acd3b..667f9f170 100644
--- a/crypto/stark/src/proof/stark.rs
+++ b/crypto/stark/src/proof/stark.rs
@@ -1,4 +1,4 @@
-use crypto::merkle_tree::mmcs::MmcsOpening;
+use crypto::merkle_tree::mmcs::{MatrixTag, MmcsOpening};
 use crypto::merkle_tree::proof::Proof;
 use math::field::{
     element::FieldElement,
@@ -18,27 +18,50 @@ pub struct PolynomialOpenings<F: IsField> {
     pub evaluations_sym: Vec<FieldElement<F>>,
 }
 
-/// Per-query main-trace opening backed by the shared MMCS.
+/// Per-query main-trace opening.
 ///
-/// The (iota, iota_sym) pair are consecutive global indices in the LDE.
-/// Each carries its own `MmcsOpening` because they live at different
-/// positions in the layer-0 array — there is no shared sibling sub-path
-/// between them at the leaf level (only at higher tree levels, which the
-/// MMCS opening encodes).
+/// Non-preprocessed tables are committed under the shared main-trace MMCS,
+/// so a query carries an `MmcsOpening` pair (one per iota / iota_sym).
+/// Preprocessed tables keep their multiplicities slice in their OWN
+/// per-table Merkle tree (distinct from the shared MMCS) and use the
+/// legacy `PolynomialOpenings` layout. The per-table root for the latter
+/// lives in `StarkProof::lde_trace_main_merkle_root`.
 #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
 #[serde(bound = "")]
-pub struct MainTraceOpening<F: IsField> {
-    pub evaluations: Vec<FieldElement<F>>,
-    pub evaluations_sym: Vec<FieldElement<F>>,
-    pub mmcs_opening: MmcsOpening<Commitment>,
-    pub mmcs_opening_sym: MmcsOpening<Commitment>,
+pub enum MainTraceOpening<F: IsField> {
+    /// Opening into the shared main-trace MMCS (non-preprocessed tables).
+    Mmcs {
+        evaluations: Vec<FieldElement<F>>,
+        evaluations_sym: Vec<FieldElement<F>>,
+        mmcs_opening: MmcsOpening<Commitment>,
+        mmcs_opening_sym: MmcsOpening<Commitment>,
+    },
+    /// Opening into this table's own multiplicities Merkle tree
+    /// (preprocessed tables).
+    Tree(PolynomialOpenings<F>),
+}
+
+impl<F: IsField> MainTraceOpening<F> {
+    pub fn evaluations(&self) -> &[FieldElement<F>] {
+        match self {
+            Self::Mmcs { evaluations, .. } => evaluations,
+            Self::Tree(p) => &p.evaluations,
+        }
+    }
+
+    pub fn evaluations_sym(&self) -> &[FieldElement<F>] {
+        match self {
+            Self::Mmcs { evaluations_sym, .. } => evaluations_sym,
+            Self::Tree(p) => &p.evaluations_sym,
+        }
+    }
 }
 
 #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
 #[serde(bound = "")]
 pub struct DeepPolynomialOpening<F: IsSubFieldOf<E>, E: IsField> {
     pub composition_poly: PolynomialOpenings<E>,
-    pub main_trace_polys: PolynomialOpenings<F>,
+    pub main_trace_polys: MainTraceOpening<F>,
     /// For preprocessed tables: openings for precomputed columns.
     /// These are verified against the hardcoded precomputed commitment.
     pub precomputed_trace_polys: Option<PolynomialOpenings<F>>,
@@ -52,9 +75,11 @@ pub type DeepPolynomialOpenings<F, E> = Vec<DeepPolynomialOpening<F, E>>;
 pub struct StarkProof<F: IsSubFieldOf<E>, E: IsField, PI> {
     // Length of the execution trace
     pub trace_length: usize,
-    // Commitments of the trace columns
-    // [tⱼ]
-    pub lde_trace_main_merkle_root: Commitment,
+    /// For PREPROCESSED tables only: per-table Merkle root over the
+    /// multiplicities columns (the non-precomputed slice). Preprocessed
+    /// tables stay out of the shared main-trace MMCS, so their main slice
+    /// keeps its own per-table tree. `None` for non-preprocessed tables.
+    pub lde_trace_main_merkle_root: Option<Commitment>,
     // Commitments of auxiliary trace columns
     // [tⱼ]
     pub lde_trace_aux_merkle_root: Option<Commitment>,
@@ -90,8 +115,18 @@ pub struct StarkProof<F: IsSubFieldOf<E>, E: IsField, PI> {
 /// A collection of STARK proofs for multiple AIRs.
 /// Used for multi-table proving where tables are linked via bus (LogUp).
 /// Returned by `Prover::multi_prove` and verified by `Verifier::multi_verify`.
+///
+/// Non-preprocessed tables share a single main-trace MMCS authenticated by
+/// `main_mmcs_root`; `main_mmcs_spec` lists `(MatrixTag, padded_height)`
+/// per committed table in the MMCS sort order. Preprocessed tables stay
+/// out of this MMCS — each carries its own per-table Merkle root in
+/// `StarkProof::lde_trace_main_merkle_root` plus the AIR-pinned
+/// precomputed root. Both groups' roots are absorbed in spec-fixed order
+/// during Phase A.
 #[derive(Debug, serde::Serialize, serde::Deserialize)]
 #[serde(bound = "PI: serde::Serialize + serde::de::DeserializeOwned")]
 pub struct MultiProof<F: IsSubFieldOf<E>, E: IsField, PI> {
     pub proofs: Vec<StarkProof<F, E, PI>>,
+    pub main_mmcs_root: Commitment,
+    pub main_mmcs_spec: Vec<(MatrixTag, usize)>,
 }
diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs
index f71fc4343..6c58d5ac0 100644
--- a/crypto/stark/src/prover.rs
+++ b/crypto/stark/src/prover.rs
@@ -27,14 +27,11 @@ use rayon::prelude::{
 use crate::debug::validate_trace;
 use crate::fri;
 use crate::lookup::LOGUP_NUM_CHALLENGES;
-#[allow(unused_imports)]
-use crate::mmcs_leaf::hash_tagged_row_bytes;
-use crate::proof::stark::{DeepPolynomialOpenings, PolynomialOpenings};
+use crate::proof::stark::{DeepPolynomialOpenings, MainTraceOpening, PolynomialOpenings};
 #[cfg(feature = "disk-spill")]
 use crate::storage_mode::StorageMode;
 use crate::table::Table;
 use crate::trace::LDETraceTable;
-#[allow(unused_imports)]
 use crypto::merkle_tree::mmcs::{MatrixTag, Mmcs, MmcsBuilder, MmcsError};
 
 use super::config::{BatchedMerkleTree, BatchedMerkleTreeBackend, Commitment};
@@ -120,23 +117,6 @@ where
         }
     }
 
-    /// Build a `TableCommit` for a preprocessed table.
-    fn preprocessed(
-        tree: BatchedMerkleTree<F>,
-        root: Commitment,
-        precomputed_tree: BatchedMerkleTree<F>,
-        precomputed_root: Commitment,
-        num_precomputed_cols: usize,
-    ) -> Self {
-        Self {
-            tree: Arc::new(tree),
-            root,
-            precomputed_tree: Some(Arc::new(precomputed_tree)),
-            precomputed_root: Some(precomputed_root),
-            num_precomputed_cols,
-        }
-    }
-
     /// Cheap clone. Only bumps Arc refcounts, no tree data is copied.
     fn share(&self) -> Self {
         Self {
@@ -148,53 +128,130 @@ where
         }
     }
 
-    fn is_preprocessed(&self) -> bool {
-        self.precomputed_tree.is_some()
-    }
 }
 
 /// Per-table commitment artifacts for the main trace under the shared
 /// MMCS protocol. The `mmcs` Arc is the SAME instance for every table in
 /// the multi-proof — Phase A builds it once.
 ///
-/// Currently unused at the wire-up level; defined here as the keystone
-/// type for the upcoming MMCS Phase C wire-up (see
-/// `docs/mmcs-streaming-c1-spec.md`). Marked `allow(dead_code)` until the
-/// follow-up commit consumes it.
-#[allow(dead_code)]
-pub(crate) struct MainCommit<F: IsField>
+/// `padded_height` is this table's LDE height (a power of two), needed to
+/// translate the table's local FRI iota into a global MMCS index when
+/// opening (see `open_deep_composition_poly`).
+pub(crate) enum MainCommit<F: IsField>
 where
     FieldElement<F>: AsBytes,
 {
-    /// Shared MMCS across all tables in the multi-proof.
-    pub(crate) mmcs: Arc<Mmcs<BatchedMerkleTreeBackend<F>>>,
-    /// This table's MatrixTag within the MMCS.
-    pub(crate) tag: MatrixTag,
-    /// Preprocessed tables only: separate Merkle tree over precomputed columns.
-    pub(crate) precomputed_tree: Option<Arc<BatchedMerkleTree<F>>>,
-    /// Preprocessed tables only: root of `precomputed_tree`.
-    pub(crate) precomputed_root: Option<Commitment>,
-    /// Preprocessed tables only: number of precomputed columns. Zero otherwise.
-    pub(crate) num_precomputed_cols: usize,
+    /// Non-preprocessed table: committed under the shared MMCS.
+    Shared {
+        mmcs: Arc<Mmcs<BatchedMerkleTreeBackend<F>>>,
+        tag: MatrixTag,
+        /// Padded height (== LDE row count); needed to translate a local
+        /// FRI iota into a global MMCS index.
+        padded_height: usize,
+    },
+    /// Preprocessed table: two per-table Merkle trees, NOT in the MMCS.
+    Preprocessed {
+        multiplicities_tree: Arc<BatchedMerkleTree<F>>,
+        multiplicities_root: Commitment,
+        precomputed_tree: Arc<BatchedMerkleTree<F>>,
+        precomputed_root: Commitment,
+        num_precomputed_cols: usize,
+    },
 }
 
-#[allow(dead_code)]
 impl<F: IsField> MainCommit<F>
 where
     FieldElement<F>: AsBytes,
 {
-    fn is_preprocessed(&self) -> bool {
-        self.precomputed_tree.is_some()
+    fn precomputed_root(&self) -> Option<Commitment> {
+        match self {
+            Self::Shared { .. } => None,
+            Self::Preprocessed {
+                precomputed_root, ..
+            } => Some(*precomputed_root),
+        }
+    }
+
+    fn main_tree_root(&self) -> Option<Commitment> {
+        match self {
+            Self::Shared { .. } => None,
+            Self::Preprocessed {
+                multiplicities_root,
+                ..
+            } => Some(*multiplicities_root),
+        }
     }
 
     /// Cheap clone. Only bumps Arc refcounts.
     fn share(&self) -> Self {
-        Self {
-            mmcs: Arc::clone(&self.mmcs),
-            tag: self.tag,
-            precomputed_tree: self.precomputed_tree.as_ref().map(Arc::clone),
-            precomputed_root: self.precomputed_root,
-            num_precomputed_cols: self.num_precomputed_cols,
+        match self {
+            Self::Shared {
+                mmcs,
+                tag,
+                padded_height,
+            } => Self::Shared {
+                mmcs: Arc::clone(mmcs),
+                tag: *tag,
+                padded_height: *padded_height,
+            },
+            Self::Preprocessed {
+                multiplicities_tree,
+                multiplicities_root,
+                precomputed_tree,
+                precomputed_root,
+                num_precomputed_cols,
+            } => Self::Preprocessed {
+                multiplicities_tree: Arc::clone(multiplicities_tree),
+                multiplicities_root: *multiplicities_root,
+                precomputed_tree: Arc::clone(precomputed_tree),
+                precomputed_root: *precomputed_root,
+                num_precomputed_cols: *num_precomputed_cols,
+            },
+        }
+    }
+}
+
+/// Per-table Phase-A output. Non-preprocessed tables contribute their
+/// tagged leaf vector to the shared MMCS; preprocessed tables ship two
+/// independent per-table Merkle trees that stay out of the MMCS.
+enum MainPhaseAOutput<F: IsField>
+where
+    FieldElement<F>: AsBytes,
+{
+    Shared {
+        tag: MatrixTag,
+        leaves: Vec<Commitment>,
+        padded_height: usize,
+    },
+    Preprocessed {
+        multiplicities_tree: Arc<BatchedMerkleTree<F>>,
+        multiplicities_root: Commitment,
+        precomputed_tree: Arc<BatchedMerkleTree<F>>,
+        precomputed_root: Commitment,
+        num_precomputed_cols: usize,
+    },
+}
+
+impl<F: IsField> MainPhaseAOutput<F>
+where
+    FieldElement<F>: AsBytes,
+{
+    fn precomputed_root(&self) -> Option<Commitment> {
+        match self {
+            Self::Shared { .. } => None,
+            Self::Preprocessed {
+                precomputed_root, ..
+            } => Some(*precomputed_root),
+        }
+    }
+
+    fn main_tree_root(&self) -> Option<Commitment> {
+        match self {
+            Self::Shared { .. } => None,
+            Self::Preprocessed {
+                multiplicities_root,
+                ..
+            } => Some(*multiplicities_root),
         }
     }
 }
@@ -209,8 +266,8 @@ where
 {
     /// The table of evaluations over the LDE of the main and auxiliary trace tables.
     pub(crate) lde_trace: LDETraceTable<Field, FieldExtension>,
-    /// Commitment to the main trace.
-    pub(crate) main: TableCommit<Field>,
+    /// Commitment to the main trace (shared MMCS handle + per-table tag).
+    pub(crate) main: MainCommit<Field>,
     /// Commitment to the auxiliary (RAP) trace, if any.
     pub(crate) aux: Option<TableCommit<FieldExtension>>,
     /// The challenges of the RAP round.
@@ -228,7 +285,7 @@ where
     FieldElement<Field>: AsBytes,
     FieldElement<FieldExtension>: AsBytes,
 {
-    main: TableCommit<Field>,
+    main: MainCommit<Field>,
     aux: Option<TableCommit<FieldExtension>>,
     rap_challenges: Vec<FieldElement<FieldExtension>>,
     bus_public_inputs: Option<BusPublicInputs<FieldExtension>>,
@@ -452,6 +509,87 @@ where
     result
 }
 
+fn map_mmcs_err(e: MmcsError) -> ProvingError {
+    ProvingError::WrongParameter(format!("MMCS: {e:?}"))
+}
+
+/// Build the unified main-trace MMCS from the per-table Phase A outputs.
+/// Returns the root, the (tag, padded_height) spec, and the shared Arc that
+/// every table's `MainCommit` borrows.
+#[allow(clippy::type_complexity)]
+fn build_main_mmcs<F>(
+    outputs: &[MainPhaseAOutput<F>],
+) -> Result<
+    (
+        Commitment,
+        Vec<(MatrixTag, usize)>,
+        Arc<Mmcs<BatchedMerkleTreeBackend<F>>>,
+    ),
+    ProvingError,
+>
+where
+    F: IsField + Send + Sync,
+    FieldElement<F>: AsBytes + Send + Sync,
+{
+    let mut builder: MmcsBuilder<BatchedMerkleTreeBackend<F>> = MmcsBuilder::new();
+    for output in outputs {
+        if let MainPhaseAOutput::Shared {
+            tag,
+            leaves,
+            padded_height: _,
+        } = output
+        {
+            builder
+                .add_matrix(*tag, leaves.clone())
+                .map_err(map_mmcs_err)?;
+        }
+    }
+    let mmcs = builder.finalize().map_err(map_mmcs_err)?;
+    let root = *mmcs.root();
+    let spec = mmcs.spec();
+    Ok((root, spec, Arc::new(mmcs)))
+}
+
+/// Tagged per-row leaf digest for the main-trace MMCS.
+pub fn compute_tagged_leaves_bit_reversed<E>(
+    columns: &[Vec<FieldElement<E>>],
+    tag: MatrixTag,
+) -> Vec<Commitment>
+where
+    E: IsField,
+    FieldElement<E>: AsBytes + Sync + Send + ByteConversion,
+{
+    if columns.is_empty() || columns[0].is_empty() {
+        return Vec::new();
+    }
+    let num_rows = columns[0].len();
+    let num_cols = columns.len();
+    let byte_len = <FieldElement<E> as ByteConversion>::BYTE_LEN;
+    debug_assert!(num_rows.is_power_of_two());
+    let total_bytes = num_cols * byte_len;
+    let hash_leaf =
+        |buf: &mut [u8], row_idx: usize| -> Commitment {
+            let br_idx = reverse_index(row_idx, num_rows as u64);
+            for (col_idx, col) in columns.iter().enumerate() {
+                col[br_idx]
+                    .write_bytes_be(&mut buf[col_idx * byte_len..(col_idx + 1) * byte_len]);
+            }
+            crate::mmcs_leaf::hash_tagged_row_bytes(tag, buf)
+        };
+    #[cfg(feature = "parallel")]
+    {
+        (0..num_rows)
+            .into_par_iter()
+            .map_init(|| vec![0u8; total_bytes], |buf, i| hash_leaf(buf, i))
+            .collect()
+    }
+    #[cfg(not(feature = "parallel"))]
+    {
+        let mut buf = vec![0u8; total_bytes];
+        (0..num_rows).map(|i| hash_leaf(&mut buf, i)).collect()
+    }
+}
+
 /// Compute Keccak-256 leaf hashes for `commit_composition_polynomial`: one
 /// leaf per row-pair, where leaf `i` hashes the BE concatenation of
 /// `parts[..][br_0] ++ parts[..][br_1]` with
@@ -653,20 +791,26 @@ pub trait IsStarkProver<
         });
     }
 
-    /// Compute the main-trace LDE and commit. Returns a `TableCommit` along
-    /// with the owned LDE columns (consumed later in Phase D).
+    /// Compute the main-trace LDE and the per-table inputs needed by the
+    /// shared MMCS build. Returns a `MainPhaseAOutput` (tagged leaves + the
+    /// optional precomputed-columns Merkle tree) together with the owned
+    /// LDE columns consumed later in Phase D.
     ///
-    /// `precomputed`: if present, the leading `num_cols` columns are committed
-    /// as a separate Merkle tree (the precomputed split for preprocessed
-    /// tables) and the root is checked against the AIR-hardcoded commitment.
+    /// `tag`: the table's MatrixTag, fed into every leaf hash so the MMCS
+    /// can authenticate (matrix, row) pairs uniquely.
+    /// `precomputed`: if present, the leading `num_cols` columns are
+    /// committed as a separate Merkle tree (the precomputed split) and the
+    /// root is checked against the AIR-hardcoded commitment. The remaining
+    /// columns feed the MMCS leaves. If absent, every column feeds the MMCS.
     #[allow(clippy::type_complexity)]
     fn commit_main_trace(
         trace: &TraceTable<Field, FieldExtension>,
         domain: &Domain<Field>,
         twiddles: &LdeTwiddles<Field>,
+        tag: MatrixTag,
         precomputed: Option<(Commitment, usize)>,
         #[cfg(feature = "disk-spill")] storage_mode: StorageMode,
-    ) -> Result<(TableCommit<Field>, Vec<Vec<FieldElement<Field>>>), ProvingError>
+    ) -> Result<(MainPhaseAOutput<Field>, Vec<Vec<FieldElement<Field>>>), ProvingError>
     where
         FieldElement<Field>: AsBytes,
         FieldElement<FieldExtension>: AsBytes,
@@ -686,54 +830,58 @@ pub trait IsStarkProver<
         #[cfg(feature = "instruments")]
         let t_sub = Instant::now();
 
-        let commit = match precomputed {
+        let output = match precomputed {
             None => {
-                #[allow(unused_mut)]
-                let (mut tree, root) = Self::commit_columns_bit_reversed(&columns)
-                    .ok_or(ProvingError::EmptyCommitment)?;
-                #[cfg(feature = "disk-spill")]
-                if storage_mode == StorageMode::Disk {
-                    tree.spill_nodes_to_disk()
-                        .map_err(|e| ProvingError::DiskSpill(format!("main Merkle tree: {e}")))?;
+                let leaves = compute_tagged_leaves_bit_reversed::<Field>(&columns, tag);
+                if leaves.is_empty() {
+                    return Err(ProvingError::EmptyCommitment);
+                }
+                let padded_height = leaves.len();
+                MainPhaseAOutput::Shared {
+                    tag,
+                    leaves,
+                    padded_height,
                 }
-                TableCommit::plain(tree, root)
             }
             Some((expected_precomputed_root, num_cols)) => {
                 #[allow(unused_mut)]
                 let (mut precomputed_tree, precomputed_root) =
                     Self::commit_columns_bit_reversed(&columns[..num_cols])
                         .ok_or(ProvingError::EmptyCommitment)?;
-                #[allow(unused_mut)]
-                let (mut mult_tree, mult_root) =
-                    Self::commit_columns_bit_reversed(&columns[num_cols..])
-                        .ok_or(ProvingError::EmptyCommitment)?;
                 debug_assert_eq!(
                     precomputed_root, expected_precomputed_root,
-                    "Prover's precomputed commitment doesn't match hardcoded AIR commitment"
+                    "Prover precomputed commitment must match the AIR-pinned value"
                 );
                 #[cfg(feature = "disk-spill")]
                 if storage_mode == StorageMode::Disk {
                     precomputed_tree.spill_nodes_to_disk().map_err(|e| {
                         ProvingError::DiskSpill(format!("precomputed Merkle tree: {e}"))
                     })?;
-                    mult_tree
-                        .spill_nodes_to_disk()
-                        .map_err(|e| ProvingError::DiskSpill(format!("mult Merkle tree: {e}")))?;
                 }
-                TableCommit::preprocessed(
-                    mult_tree,
-                    mult_root,
-                    precomputed_tree,
+                #[allow(unused_mut)]
+                let (mut multiplicities_tree, multiplicities_root) =
+                    Self::commit_columns_bit_reversed(&columns[num_cols..])
+                        .ok_or(ProvingError::EmptyCommitment)?;
+                #[cfg(feature = "disk-spill")]
+                if storage_mode == StorageMode::Disk {
+                    multiplicities_tree.spill_nodes_to_disk().map_err(|e| {
+                        ProvingError::DiskSpill(format!("multiplicities Merkle tree: {e}"))
+                    })?;
+                }
+                MainPhaseAOutput::Preprocessed {
+                    multiplicities_tree: Arc::new(multiplicities_tree),
+                    multiplicities_root,
+                    precomputed_tree: Arc::new(precomputed_tree),
                     precomputed_root,
-                    num_cols,
-                )
+                    num_precomputed_cols: num_cols,
+                }
             }
         };
 
         #[cfg(feature = "instruments")]
         crate::instruments::accum_r1_main(main_lde_dur, t_sub.elapsed());
 
-        Ok((commit, columns))
+        Ok((output, columns))
     }
 
     /// Recompute Round1 from the trace, reusing the Merkle trees stored in commitments.
@@ -1426,30 +1574,9 @@ pub trait IsStarkProver<
 
         let lde_trace = &round_1_result.lde_trace;
         let main_commit = &round_1_result.main;
-        let is_preprocessed = main_commit.is_preprocessed();
-        let num_precomputed_cols = main_commit.num_precomputed_cols;
         let total_cols = lde_trace.num_main_cols();
 
         for index in indexes_to_open.iter() {
-            // For preprocessed tables, open the main split (multiplicities only);
-            // for normal tables, open all main columns.
-            let main_trace_opening = if is_preprocessed {
-                Self::open_polys_with(domain, &main_commit.tree, *index, |row| {
-                    lde_trace.gather_main_row_range(row, num_precomputed_cols, total_cols)
-                })
-            } else {
-                Self::open_polys_with(domain, &main_commit.tree, *index, |row| {
-                    lde_trace.gather_main_row(row)
-                })
-            };
-
-            // For preprocessed tables, also open the precomputed-columns tree.
-            let precomputed_trace_opening = main_commit.precomputed_tree.as_ref().map(|tree| {
-                Self::open_polys_with(domain, tree, *index, |row| {
-                    lde_trace.gather_main_row_range(row, 0, num_precomputed_cols)
-                })
-            });
-
             let composition_openings = Self::open_composition_poly(
                 &round_2_result.composition_poly_merkle_tree,
                 &round_2_result.lde_composition_poly_evaluations,
@@ -1462,6 +1589,69 @@ pub trait IsStarkProver<
                 })
             });
 
+            let (main_trace_opening, precomputed_trace_opening) = match main_commit {
+                MainCommit::Shared {
+                    mmcs,
+                    padded_height,
+                    ..
+                } => {
+                    let max_height = mmcs
+                        .spec()
+                        .first()
+                        .map(|(_, h)| *h)
+                        .expect("MMCS spec is non-empty");
+                    debug_assert!(
+                        padded_height.is_power_of_two() && max_height >= *padded_height
+                    );
+                    let shift = (max_height / *padded_height).trailing_zeros() as usize;
+                    let domain_size = domain.lde_roots_of_unity_coset.len() as u64;
+                    let primary = *index * 2;
+                    let sym = *index * 2 + 1;
+                    let evaluations = lde_trace.gather_main_row(reverse_index(primary, domain_size));
+                    let evaluations_sym = lde_trace.gather_main_row(reverse_index(sym, domain_size));
+                    let mmcs_opening = mmcs
+                        .open(primary << shift)
+                        .expect("MMCS open: prover-side primary index in range");
+                    let mmcs_opening_sym = mmcs
+                        .open(sym << shift)
+                        .expect("MMCS open: prover-side sym index in range");
+                    let opening = MainTraceOpening::Mmcs {
+                        evaluations,
+                        evaluations_sym,
+                        mmcs_opening,
+                        mmcs_opening_sym,
+                    };
+                    (opening, None)
+                }
+                MainCommit::Preprocessed {
+                    multiplicities_tree,
+                    precomputed_tree,
+                    num_precomputed_cols,
+                    ..
+                } => {
+                    let num_precomputed_cols = *num_precomputed_cols;
+                    let mult = Self::open_polys_with(
+                        domain,
+                        multiplicities_tree,
+                        *index,
+                        |row| {
+                            lde_trace.gather_main_row_range(
+                                row,
+                                num_precomputed_cols,
+                                total_cols,
+                            )
+                        },
+                    );
+                    let pre = Self::open_polys_with(
+                        domain,
+                        precomputed_tree,
+                        *index,
+                        |row| lde_trace.gather_main_row_range(row, 0, num_precomputed_cols),
+                    );
+                    (MainTraceOpening::Tree(mult), Some(pre))
+                }
+            };
+
             openings.push(DeepPolynomialOpening {
                 composition_poly: composition_openings,
                 main_trace_polys: main_trace_opening,
@@ -1524,8 +1714,6 @@ pub trait IsStarkProver<
                 num_airs
             )));
         }
-        // `main_tags` is reserved for the upcoming MMCS wire-up; not consumed yet.
-        let _ = main_tags;
 
         // Check if any AIR has an auxiliary trace
         let needs_lookup_challenges = air_trace_pairs
@@ -1611,7 +1799,7 @@ pub trait IsStarkProver<
         #[cfg(feature = "instruments")]
         let phase_start = Instant::now();
 
-        let mut main_commits: Vec<TableCommit<Field>> = Vec::with_capacity(num_airs);
+        let mut phase_a_outputs: Vec<MainPhaseAOutput<Field>> = Vec::with_capacity(num_airs);
         let mut main_ldes: Vec<Vec<Vec<FieldElement<Field>>>> = Vec::with_capacity(num_airs);
 
         for chunk_start in (0..num_airs).step_by(k) {
@@ -1628,6 +1816,7 @@ pub trait IsStarkProver<
                     let (air, trace, _) = &air_trace_pairs[idx];
                     let domain = &domains[idx];
                     let twiddles = &twiddle_caches[idx];
+                    let tag = main_tags[idx];
 
                     let precomputed = air
                         .is_preprocessed()
@@ -1636,6 +1825,7 @@ pub trait IsStarkProver<
                         *trace,
                         domain,
                         twiddles,
+                        tag,
                         precomputed,
                         #[cfg(feature = "disk-spill")]
                         storage_mode,
@@ -1643,18 +1833,58 @@ pub trait IsStarkProver<
                 })
                 .collect();
 
-            // Sequential: append roots to shared transcript (Fiat-Shamir ordering)
+            // Sequential: per table, absorb its preprocessed root and then
+            // its own per-table multiplicities root (preprocessed only). The
+            // shared MMCS root is absorbed once after the loop. Order must
+            // match the verifier replay.
             for result in chunk_results {
-                let (commit, cached_main) = result?;
-                if let Some(ref pre_root) = commit.precomputed_root {
+                let (output, cached_main) = result?;
+                if let Some(ref pre_root) = output.precomputed_root() {
                     transcript.append_bytes(pre_root);
                 }
-                transcript.append_bytes(&commit.root);
-                main_commits.push(commit);
+                if let Some(ref main_root) = output.main_tree_root() {
+                    transcript.append_bytes(main_root);
+                }
+                phase_a_outputs.push(output);
                 main_ldes.push(cached_main);
             }
         }
 
+        // Build the unified main-trace MMCS once over Shared (non-preprocessed)
+        // entries. Preprocessed tables stay out of the MMCS and keep their
+        // own per-table Merkle trees (already absorbed above).
+        let (main_mmcs_root, main_mmcs_spec, mmcs_arc) =
+            build_main_mmcs::<Field>(&phase_a_outputs)?;
+        transcript.append_bytes(&main_mmcs_root);
+
+        let main_commits: Vec<MainCommit<Field>> = phase_a_outputs
+            .into_iter()
+            .map(|o| match o {
+                MainPhaseAOutput::Shared {
+                    tag,
+                    padded_height,
+                    leaves: _,
+                } => MainCommit::Shared {
+                    mmcs: Arc::clone(&mmcs_arc),
+                    tag,
+                    padded_height,
+                },
+                MainPhaseAOutput::Preprocessed {
+                    multiplicities_tree,
+                    multiplicities_root,
+                    precomputed_tree,
+                    precomputed_root,
+                    num_precomputed_cols,
+                } => MainCommit::Preprocessed {
+                    multiplicities_tree,
+                    multiplicities_root,
+                    precomputed_tree,
+                    precomputed_root,
+                    num_precomputed_cols,
+                },
+            })
+            .collect();
+
         #[cfg(feature = "instruments")]
         let main_commits_elapsed = phase_start.elapsed();
         #[cfg(feature = "instruments")]
@@ -1962,17 +2192,22 @@ pub trait IsStarkProver<
             });
         }
 
-        Ok(MultiProof { proofs })
+        Ok(MultiProof {
+            proofs,
+            main_mmcs_root,
+            main_mmcs_spec,
+        })
     }
 
-    /// Generate a STARK proof for a single AIR/trace.
-    /// This is equivalent to calling `multi_prove` with a single-element slice.
+    /// Generate a single-AIR STARK proof, returned as a one-element
+    /// `MultiProof`. The MMCS root + spec live at the multi-proof level (see
+    /// `MultiProof`), so single-table callers consume the wrapper directly.
     fn prove(
         air: &dyn AIR<Field = Field, FieldExtension = FieldExtension, PublicInputs = PI>,
         trace: &mut TraceTable<Field, FieldExtension>,
         pub_inputs: &PI,
         transcript: &mut (impl IsStarkTranscript<FieldExtension, Field> + Clone + Send),
-    ) -> Result<StarkProof<Field, FieldExtension, PI>, ProvingError>
+    ) -> Result<MultiProof<Field, FieldExtension, PI>, ProvingError>
     where
         FieldElement<Field>: AsBytes,
         FieldElement<FieldExtension>: AsBytes,
@@ -1983,9 +2218,8 @@ pub trait IsStarkProver<
         <FieldExtension as IsField>::BaseType: SpillSafe,
     {
         let air_trace_pairs = vec![(air, trace, pub_inputs)];
-        // Single-AIR path: synthesize a default tag. Callers that want
-        // multi-table soundness should call `multi_prove` directly with
-        // distinct tags.
+        // Single-AIR path: synthesize a default tag. Callers that need
+        // distinct chip identities call `multi_prove` directly.
         let main_tags = [MatrixTag::new([0; 8])];
         Self::multi_prove(
             air_trace_pairs,
@@ -1994,7 +2228,6 @@ pub trait IsStarkProver<
             #[cfg(feature = "disk-spill")]
             StorageMode::Ram,
         )
-        .map(|mut multi_proof| multi_proof.proofs.remove(0))
     }
 
     // TODO: propagate errors instead of unwrap() in open_deep_composition_poly and FRI operations
@@ -2128,12 +2361,13 @@ pub trait IsStarkProver<
         info!("End proof generation");
 
         Ok(StarkProof {
-            // [t]
-            lde_trace_main_merkle_root: round_1_result.main.root,
+            // For preprocessed tables: per-table Merkle root over multiplicities
+            // (preprocessed tables stay out of the shared main-trace MMCS).
+            lde_trace_main_merkle_root: round_1_result.main.main_tree_root(),
             // [t]
             lde_trace_aux_merkle_root: round_1_result.aux.as_ref().map(|x| x.root),
             // For preprocessed tables: commitment to precomputed columns only
-            lde_trace_precomputed_merkle_root: round_1_result.main.precomputed_root,
+            lde_trace_precomputed_merkle_root: round_1_result.main.precomputed_root(),
             // tⱼ(zgᵏ)
             trace_ood_evaluations: round_3_result.trace_ood_evaluations,
             // [H₁] and [H₂]
diff --git a/crypto/stark/src/tests/bus_tests/soundness_tests.rs b/crypto/stark/src/tests/bus_tests/soundness_tests.rs
index d2af70678..922049e0e 100644
--- a/crypto/stark/src/tests/bus_tests/soundness_tests.rs
+++ b/crypto/stark/src/tests/bus_tests/soundness_tests.rs
@@ -875,7 +875,7 @@ fn test_injected_bus_public_inputs_on_non_logup_air_rejected() {
 
     // Inject fake bus_public_inputs into a non-LogUp proof.
     // DummyAIR has has_trace_interaction() = false, so this must be rejected.
-    proof.bus_public_inputs = Some(BusPublicInputs {
+    proof.proofs[0].bus_public_inputs = Some(BusPublicInputs {
         table_contribution: FieldElement::<DummyF>::from(42u64),
         #[cfg(feature = "debug-checks")]
         per_bus_sums: Default::default(),
diff --git a/crypto/stark/src/tests/small_trace_tests.rs b/crypto/stark/src/tests/small_trace_tests.rs
index 8373ae9d6..0a006d6a6 100644
--- a/crypto/stark/src/tests/small_trace_tests.rs
+++ b/crypto/stark/src/tests/small_trace_tests.rs
@@ -19,7 +19,7 @@ type Felt = FieldElement<GoldilocksField>;
 
 fn make_valid_simple_proof() -> (
     SimpleAdditionAIR<GoldilocksField>,
-    crate::proof::stark::StarkProof<
+    crate::proof::stark::MultiProof<
         GoldilocksField,
         GoldilocksField,
         SimpleAdditionPublicInputs<GoldilocksField>,
@@ -99,7 +99,7 @@ fn test_verify_fails_with_wrong_inputs() {
     let (air, mut proof) = make_valid_simple_proof();
 
     // Tamper with the proof's public inputs
-    proof.public_inputs = SimpleAdditionPublicInputs {
+    proof.proofs[0].public_inputs = SimpleAdditionPublicInputs {
         a: Felt::from(99u64), // Wrong value - doesn't match trace
         b: Felt::from(2u64),
     };
@@ -124,11 +124,13 @@ fn test_verify_rejects_truncated_composition_poly_parts_ood() {
     let (air, mut proof) = make_valid_simple_proof();
 
     assert!(
-        !proof.composition_poly_parts_ood_evaluation.is_empty(),
+        !proof.proofs[0]
+            .composition_poly_parts_ood_evaluation
+            .is_empty(),
         "test precondition: a valid proof has at least one composition poly part",
     );
     // Drop one entry so the per-query opening has more parts than the header.
-    proof.composition_poly_parts_ood_evaluation.pop();
+    proof.proofs[0].composition_poly_parts_ood_evaluation.pop();
 
     assert!(
         !Verifier::verify(
@@ -150,15 +152,28 @@ fn test_verify_rejects_opening_column_count_mismatch() {
 
     // Append a phantom extra evaluation column to the first query's
     // main-trace opening so the (base + aux) count exceeds `ood_evaluations_table_width`.
-    if let Some(opening) = proof.deep_poly_openings.first_mut() {
+    use crate::proof::stark::MainTraceOpening;
+    if let Some(opening) = proof.proofs[0].deep_poly_openings.first_mut() {
         let extra = opening
             .main_trace_polys
-            .evaluations
+            .evaluations()
             .last()
             .cloned()
             .unwrap_or_else(Felt::zero);
-        opening.main_trace_polys.evaluations.push(extra);
-        opening.main_trace_polys.evaluations_sym.push(extra);
+        match &mut opening.main_trace_polys {
+            MainTraceOpening::Mmcs {
+                evaluations,
+                evaluations_sym,
+                ..
+            } => {
+                evaluations.push(extra);
+                evaluations_sym.push(extra);
+            }
+            MainTraceOpening::Tree(p) => {
+                p.evaluations.push(extra);
+                p.evaluations_sym.push(extra);
+            }
+        }
     } else {
         panic!("test precondition: a valid proof has at least one deep poly opening");
     }
diff --git a/crypto/stark/src/verifier.rs b/crypto/stark/src/verifier.rs
index 7b86cd4b4..31ccbb3cb 100644
--- a/crypto/stark/src/verifier.rs
+++ b/crypto/stark/src/verifier.rs
@@ -340,23 +340,36 @@ pub trait IsStarkVerifier<
             )
     }
 
-    /// Verify opening Open(tⱼ(D_LDE), 𝜐) and Open(tⱼ(D_LDE), -𝜐) for all trace polynomials tⱼ,
-    /// where 𝜐 and -𝜐 are the elements corresponding to the index challenge `iota`.
+    /// Verify the main MMCS opening + precomputed/aux Merkle openings at FRI
+    /// challenge `iota`. `main_tag`, `main_mmcs_root`, `main_mmcs_spec` come
+    /// from the surrounding multi-proof.
     fn verify_trace_openings(
         proof: &StarkProof<Field, FieldExtension, PI>,
         deep_poly_openings: &DeepPolynomialOpening<Field, FieldExtension>,
         iota: usize,
+        main_tag: crypto::merkle_tree::mmcs::MatrixTag,
+        main_mmcs_root: &Commitment,
+        main_mmcs_spec: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)],
     ) -> bool
     where
-        FieldElement<Field>: AsBytes + Sync + Send,
+        FieldElement<Field>: AsBytes + Sync + Send + math::traits::ByteConversion,
         FieldElement<FieldExtension>: AsBytes + Sync + Send,
     {
-        // Main trace (multiplicities for preprocessed, full trace for normal).
-        let mut ok = Self::verify_opening_pair::<Field>(
-            &deep_poly_openings.main_trace_polys,
-            &proof.lde_trace_main_merkle_root,
-            iota,
-        );
+        use crate::proof::stark::MainTraceOpening;
+        let main_ok = match &deep_poly_openings.main_trace_polys {
+            MainTraceOpening::Mmcs { .. } => Self::verify_main_mmcs_pair(
+                &deep_poly_openings.main_trace_polys,
+                iota,
+                main_tag,
+                main_mmcs_root,
+                main_mmcs_spec,
+            ),
+            MainTraceOpening::Tree(opening) => match &proof.lde_trace_main_merkle_root {
+                Some(root) => Self::verify_opening_pair::<Field>(opening, root, iota),
+                None => false,
+            },
+        };
+        let mut ok = main_ok;
 
         // Precomputed trace (preprocessed tables only). Mismatched presence is
         // unreachable in practice (multi_verify rejects such proofs upstream),
@@ -385,6 +398,26 @@ pub trait IsStarkVerifier<
         ok
     }
 
+    /// Authenticate the main-trace MMCS pair for one query.
+    fn verify_main_mmcs_pair(
+        main_opening: &crate::proof::stark::MainTraceOpening<Field>,
+        iota: usize,
+        main_tag: crypto::merkle_tree::mmcs::MatrixTag,
+        main_mmcs_root: &Commitment,
+        main_mmcs_spec: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)],
+    ) -> bool
+    where
+        FieldElement<Field>: AsBytes + Sync + Send + math::traits::ByteConversion,
+    {
+        verify_main_mmcs_pair_inner::<Field>(
+            main_opening,
+            iota,
+            main_tag,
+            main_mmcs_root,
+            main_mmcs_spec,
+        )
+    }
+
     /// Verify opening Open(Hᵢ(D_LDE), 𝜐) and Open(Hᵢ(D_LDE), -𝜐) for all parts Hᵢof the composition
     /// polynomial, where 𝜐 and -𝜐 are the elements corresponding to the index challenge `iota`.
     fn verify_composition_poly_opening(
@@ -415,9 +448,12 @@ pub trait IsStarkVerifier<
     fn step_4_verify_trace_and_composition_openings(
         proof: &StarkProof<Field, FieldExtension, PI>,
         challenges: &Challenges<FieldExtension>,
+        main_tag: crypto::merkle_tree::mmcs::MatrixTag,
+        main_mmcs_root: &Commitment,
+        main_mmcs_spec: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)],
     ) -> bool
     where
-        FieldElement<Field>: AsBytes + Sync + Send,
+        FieldElement<Field>: AsBytes + Sync + Send + math::traits::ByteConversion,
         FieldElement<FieldExtension>: AsBytes + Sync + Send,
     {
         challenges
@@ -429,7 +465,14 @@ pub trait IsStarkVerifier<
                     deep_poly_opening,
                     &proof.composition_poly_root,
                     iota_n,
-                ) && Self::verify_trace_openings(proof, deep_poly_opening, *iota_n)
+                ) && Self::verify_trace_openings(
+                    proof,
+                    deep_poly_opening,
+                    *iota_n,
+                    main_tag,
+                    main_mmcs_root,
+                    main_mmcs_spec,
+                )
             })
     }
 
@@ -570,7 +613,7 @@ pub trait IsStarkVerifier<
             if let Some(p) = &opening.precomputed_trace_polys {
                 lde_base.extend_from_slice(&p.evaluations);
             }
-            lde_base.extend_from_slice(&opening.main_trace_polys.evaluations);
+            lde_base.extend_from_slice(opening.main_trace_polys.evaluations());
 
             let lde_aux: &[FieldElement<FieldExtension>] = opening
                 .aux_trace_polys
@@ -594,7 +637,7 @@ pub trait IsStarkVerifier<
             if let Some(p) = &opening.precomputed_trace_polys {
                 lde_base_sym.extend_from_slice(&p.evaluations_sym);
             }
-            lde_base_sym.extend_from_slice(&opening.main_trace_polys.evaluations_sym);
+            lde_base_sym.extend_from_slice(opening.main_trace_polys.evaluations_sym());
 
             let lde_aux_sym: &[FieldElement<FieldExtension>] = opening
                 .aux_trace_polys
@@ -721,7 +764,7 @@ pub trait IsStarkVerifier<
         expected_bus_balance: &FieldElement<FieldExtension>,
     ) -> bool
     where
-        FieldElement<Field>: AsBytes + Sync + Send,
+        FieldElement<Field>: AsBytes + Sync + Send + math::traits::ByteConversion,
         FieldElement<FieldExtension>: AsBytes + Sync + Send,
     {
         if airs.len() != multi_proof.proofs.len() {
@@ -740,8 +783,6 @@ pub trait IsStarkVerifier<
             );
             return false;
         }
-        // `main_tags` is reserved for the upcoming MMCS verifier replay.
-        let _ = main_tags;
 
         // Check if any AIR has an auxiliary trace
         let needs_lookup_challenges = airs.iter().any(|air| air.has_aux_trace());
@@ -749,18 +790,24 @@ pub trait IsStarkVerifier<
         // =====================================================================
         // Round 1, Phase A: Replay main trace commitments
         // =====================================================================
-        // For preprocessed tables, use the hardcoded commitment (verifier cannot
-        // trust the prover). For normal tables, use the commitment from the proof.
-
+        // Per table: validate the optional precomputed commitment against
+        // the hardcoded AIR value (the only one the verifier trusts), and
+        // absorb it into the transcript. After every table, absorb the
+        // single shared MMCS root that commits to every main trace. Also
+        // cross-check `main_mmcs_spec` against the (tag, padded_height_lde)
+        // pairs reproduced from the AIRs.
+
+        let mut expected_spec: Vec<(crypto::merkle_tree::mmcs::MatrixTag, usize)> =
+            Vec::with_capacity(airs.len());
         for (idx, (air, proof)) in airs.iter().zip(&multi_proof.proofs).enumerate() {
+            let lde_size = proof.trace_length * (air.options().blowup_factor as usize);
             if air.is_preprocessed() {
-                // Preprocessed table: VERIFY precomputed commitment matches hardcoded.
-                // This is the critical soundness check - ensures prover used correct precomputed values.
+                // Preprocessed table: validate + absorb both its AIR-pinned
+                // precomputed root and its own per-table multiplicities root.
+                // Stays OUT of the shared MMCS spec.
                 let expected_precomputed = air.precomputed_commitment();
                 match &proof.lde_trace_precomputed_merkle_root {
-                    Some(actual) if *actual == expected_precomputed => {
-                        // OK - commitment matches hardcoded
-                    }
+                    Some(actual) if *actual == expected_precomputed => {}
                     Some(actual) => {
                         error!(
                             "Preprocessed commitment MISMATCH for table {idx}: expected {:?}, got {:?}",
@@ -773,18 +820,42 @@ pub trait IsStarkVerifier<
                         return false;
                     }
                 }
-
-                // Add BOTH commitments to transcript (Fiat-Shamir binding).
-                // Precomputed commitment binds challenges to correct precomputed values.
-                // Multiplicities commitment binds challenges to actual lookups made.
                 transcript.append_bytes(&expected_precomputed);
-                transcript.append_bytes(&proof.lde_trace_main_merkle_root);
+
+                match &proof.lde_trace_main_merkle_root {
+                    Some(root) => transcript.append_bytes(root),
+                    None => {
+                        error!(
+                            "Preprocessed table {idx} proof missing multiplicities Merkle root"
+                        );
+                        return false;
+                    }
+                }
             } else {
-                // Normal table: use commitment from proof
-                transcript.append_bytes(&proof.lde_trace_main_merkle_root);
+                // Non-preprocessed table: nothing per-table; the shared MMCS
+                // root absorbed below covers its main columns.
+                if proof.lde_trace_main_merkle_root.is_some() {
+                    error!(
+                        "Non-preprocessed table {idx} unexpectedly supplied a per-table main root"
+                    );
+                    return false;
+                }
+                expected_spec.push((main_tags[idx], lde_size));
             }
         }
 
+        // Deterministic sort matches `MmcsBuilder::finalize` (height desc, tag asc).
+        expected_spec.sort_by(|a, b| b.1.cmp(&a.1).then(a.0.cmp(&b.0)));
+        if expected_spec != multi_proof.main_mmcs_spec {
+            error!(
+                "main_mmcs_spec mismatch: expected {:?}, got {:?}",
+                expected_spec, multi_proof.main_mmcs_spec,
+            );
+            return false;
+        }
+
+        transcript.append_bytes(&multi_proof.main_mmcs_root);
+
         // =====================================================================
         // Round 1, Phase B: Sample shared LogUp challenges
         // =====================================================================
@@ -847,12 +918,15 @@ pub trait IsStarkVerifier<
                 table_transcript.append_field_element(&bpi.table_contribution);
             }
 
-            // Rounds 2-4: verify
+            // Rounds 2-4: verify (per-table MMCS context threaded through).
             if !Self::verify_rounds_2_to_4(
                 *air,
                 proof,
                 &mut table_transcript,
                 lookup_challenges.clone(),
+                main_tags[idx],
+                &multi_proof.main_mmcs_root,
+                &multi_proof.main_mmcs_spec,
             ) {
                 error!(
                     "Table {} failed verify_rounds_2_to_4 (num_constraints={}, trace_cols={})",
@@ -899,25 +973,22 @@ pub trait IsStarkVerifier<
         true
     }
 
-    /// Verify a single STARK proof.
-    /// This is equivalent to calling `multi_verify` with a single-element slice.
+    /// Verify a single-AIR STARK proof packaged as a one-element `MultiProof`.
+    /// Equivalent to `multi_verify(&[air], proof, &[default_tag], ...)`.
     fn verify(
-        proof: &StarkProof<Field, FieldExtension, PI>,
+        proof: &MultiProof<Field, FieldExtension, PI>,
         air: &dyn AIR<Field = Field, FieldExtension = FieldExtension, PublicInputs = PI>,
         transcript: &mut (impl IsStarkTranscript<FieldExtension, Field> + Clone),
     ) -> bool
     where
-        FieldElement<Field>: AsBytes + Sync + Send,
+        FieldElement<Field>: AsBytes + Sync + Send + math::traits::ByteConversion,
         FieldElement<FieldExtension>: AsBytes + Sync + Send,
         PI: Clone,
     {
-        let multi_proof = MultiProof {
-            proofs: vec![proof.clone()],
-        };
         let main_tags = [crypto::merkle_tree::mmcs::MatrixTag::new([0; 8])];
         Self::multi_verify(
             &[air],
-            &multi_proof,
+            proof,
             &main_tags,
             transcript,
             &FieldElement::zero(),
@@ -1061,14 +1132,22 @@ pub trait IsStarkVerifier<
     }
 
     /// Verifies a single table after round 1 has been replayed.
+    ///
+    /// `main_tag`, `main_mmcs_root`, `main_mmcs_spec` come from the shared
+    /// multi-proof and are needed to authenticate the per-table main-trace
+    /// openings in step 4.
+    #[allow(clippy::too_many_arguments)]
     fn verify_rounds_2_to_4(
         air: &dyn AIR<Field = Field, FieldExtension = FieldExtension, PublicInputs = PI>,
         proof: &StarkProof<Field, FieldExtension, PI>,
         transcript: &mut impl IsStarkTranscript<FieldExtension, Field>,
         rap_challenges: Vec<FieldElement<FieldExtension>>,
+        main_tag: crypto::merkle_tree::mmcs::MatrixTag,
+        main_mmcs_root: &Commitment,
+        main_mmcs_spec: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)],
     ) -> bool
     where
-        FieldElement<Field>: AsBytes + Sync + Send,
+        FieldElement<Field>: AsBytes + Sync + Send + math::traits::ByteConversion,
         FieldElement<FieldExtension>: AsBytes + Sync + Send,
     {
         let domain = new_verifier_domain(air, proof.trace_length);
@@ -1142,7 +1221,13 @@ pub trait IsStarkVerifier<
         let timer4 = Instant::now();
 
         #[allow(clippy::let_and_return)]
-        if !Self::step_4_verify_trace_and_composition_openings(proof, &challenges) {
+        if !Self::step_4_verify_trace_and_composition_openings(
+            proof,
+            &challenges,
+            main_tag,
+            main_mmcs_root,
+            main_mmcs_spec,
+        ) {
             #[cfg(not(feature = "test_fiat_shamir"))]
             error!("DEEP Composition Polynomial verification failed");
             return false;
@@ -1168,3 +1253,64 @@ pub trait IsStarkVerifier<
         true
     }
 }
+
+fn verify_main_mmcs_pair_inner<F>(
+    main_opening: &crate::proof::stark::MainTraceOpening<F>,
+    iota: usize,
+    main_tag: crypto::merkle_tree::mmcs::MatrixTag,
+    main_mmcs_root: &Commitment,
+    main_mmcs_spec: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)],
+) -> bool
+where
+    F: IsField,
+    FieldElement<F>: AsBytes + Sync + Send + math::traits::ByteConversion,
+{
+    use crate::mmcs_leaf::hash_tagged_row;
+    use crate::proof::stark::MainTraceOpening;
+
+    let (evaluations, evaluations_sym, mmcs_opening, mmcs_opening_sym) = match main_opening {
+        MainTraceOpening::Mmcs {
+            evaluations,
+            evaluations_sym,
+            mmcs_opening,
+            mmcs_opening_sym,
+        } => (evaluations, evaluations_sym, mmcs_opening, mmcs_opening_sym),
+        MainTraceOpening::Tree(_) => return false,
+    };
+
+    let table_idx = match main_mmcs_spec.iter().position(|(t, _)| *t == main_tag) {
+        Some(i) => i,
+        None => return false,
+    };
+    let table_height = main_mmcs_spec[table_idx].1;
+    let max_height = match main_mmcs_spec.first().map(|(_, h)| *h) {
+        Some(h) => h,
+        None => return false,
+    };
+    if !table_height.is_power_of_two() || max_height < table_height {
+        return false;
+    }
+    let shift = (max_height / table_height).trailing_zeros() as usize;
+    let g_primary = (iota * 2) << shift;
+    let g_sym = (iota * 2 + 1) << shift;
+    let leaf_primary = hash_tagged_row::<F>(main_tag, evaluations);
+    let leaf_sym = hash_tagged_row::<F>(main_tag, evaluations_sym);
+    if mmcs_opening.global_index != g_primary || mmcs_opening_sym.global_index != g_sym {
+        return false;
+    }
+    let leaves = &mmcs_opening.matrix_leaves;
+    let leaves_sym = &mmcs_opening_sym.matrix_leaves;
+    if table_idx >= leaves.len() || table_idx >= leaves_sym.len() {
+        return false;
+    }
+    if leaves[table_idx].0 != main_tag || leaves[table_idx].1 != leaf_primary {
+        return false;
+    }
+    if leaves_sym[table_idx].0 != main_tag || leaves_sym[table_idx].1 != leaf_sym {
+        return false;
+    }
+    let ok = mmcs_opening.verify::<BatchedMerkleTreeBackend<F>>(main_mmcs_root, main_mmcs_spec);
+    let ok_sym =
+        mmcs_opening_sym.verify::<BatchedMerkleTreeBackend<F>>(main_mmcs_root, main_mmcs_spec);
+    ok && ok_sym
+}
diff --git a/prover/src/lib.rs b/prover/src/lib.rs
index 209d45853..dc5073ac9 100644
--- a/prover/src/lib.rs
+++ b/prover/src/lib.rs
@@ -503,9 +503,10 @@ impl VmAirs {
 // =============================================================================
 
 /// Replay the prover's Phase A (main trace commitments) to recover the shared
-/// LogUp challenges (z, alpha). Creates a fresh transcript, appends all main
-/// trace commitments in the same order as the prover, then samples two
-/// challenge elements.
+/// LogUp challenges (z, alpha). Mirrors `multi_verify` Phase A absorb order:
+/// for each table, absorb its precomputed root and (preprocessed only) its
+/// per-table multiplicities Merkle root; then absorb the shared main-trace
+/// MMCS root once at the end.
 pub(crate) fn replay_transcript_phase_a(
     airs: &[&dyn AIR<Field = F, FieldExtension = E, PublicInputs = ()>],
     multi_proof: &MultiProof<F, E, ()>,
@@ -514,9 +515,12 @@ pub(crate) fn replay_transcript_phase_a(
     for (air, proof) in airs.iter().zip(&multi_proof.proofs) {
         if air.is_preprocessed() {
             transcript.append_bytes(&air.precomputed_commitment());
+            if let Some(root) = &proof.lde_trace_main_merkle_root {
+                transcript.append_bytes(root);
+            }
         }
-        transcript.append_bytes(&proof.lde_trace_main_merkle_root);
     }
+    transcript.append_bytes(&multi_proof.main_mmcs_root);
     let z: FieldElement<E> = transcript.sample_field_element();
     let alpha: FieldElement<E> = transcript.sample_field_element();
     (z, alpha)

From 34ff7658c674a19cc1abc1de5aa88b7c0dde6666 Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Wed, 27 May 2026 14:46:40 -0300
Subject: [PATCH 10/21] test(stark/mmcs): per-vector soundness tests for the
 shared MMCS path

10 tests covering the attack surface of the shared main-trace MMCS
introduced in the previous commit. All operate on a baseline-valid
two-table multi-proof over non-preprocessed AIRs (DummyAIR + BitFlagsAIR)
so every main opening is `MainTraceOpening::Mmcs`. The preprocessed
`MainTraceOpening::Tree` branch is exercised end-to-end by lambda-vm-prover
`bitwise_tests`.

Each test tampers with a single field and asserts verifier rejection:

  Field tampered                                  Detection mechanism
  -----------------------------------------------------------------------
  multi_proof.main_mmcs_root[0]                   transcript divergence
  multi_proof.main_mmcs_spec[0].1 (height)        reproduced-spec check
  multi_proof.main_mmcs_spec[0].0 (tag)           reproduced-spec check
  mmcs_opening.matrix_leaves[idx].1 (digest)      rehash-vs-leaf check
  mmcs_opening.matrix_leaves[idx].0 (tag)         leaf-tag check
  mmcs_opening.global_index                       g_primary/g_sym match
  mmcs_opening.siblings[0][0]                     MmcsOpening::verify
  evaluations[0]                                  rehash mismatch
  main_tags slice swapped at verifier             spec sort mismatch
  (baseline test)                                 verifies cleanly

Together these pin the soundness of every byte the verifier consults on
the new path, locking in behaviour before the C2 streaming-builder work.
---
 .../stark/src/tests/mmcs_soundness_tests.rs   | 239 ++++++++++++++++++
 crypto/stark/src/tests/mod.rs                 |   1 +
 2 files changed, 240 insertions(+)
 create mode 100644 crypto/stark/src/tests/mmcs_soundness_tests.rs

diff --git a/crypto/stark/src/tests/mmcs_soundness_tests.rs b/crypto/stark/src/tests/mmcs_soundness_tests.rs
new file mode 100644
index 000000000..0a690e085
--- /dev/null
+++ b/crypto/stark/src/tests/mmcs_soundness_tests.rs
@@ -0,0 +1,239 @@
+//! Soundness tests for the shared main-trace MMCS path.
+//!
+//! All tests use a multi-table proof over non-preprocessed AIRs (so every
+//! table's main slice lives in `MainTraceOpening::Mmcs`). The preprocessed
+//! per-table-tree path is exercised end-to-end by lambda-vm-prover's
+//! `bitwise_tests` (the bitwise AIR is preprocessed).
+//!
+//! Each test starts from a baseline-valid multi-proof, tampers with a
+//! single field on the MMCS path, and asserts the verifier rejects.
+
+use crypto::fiat_shamir::default_transcript::DefaultTranscript;
+use crypto::merkle_tree::mmcs::MatrixTag;
+use math::field::{element::FieldElement, goldilocks::GoldilocksField};
+
+use crate::examples::{
+    bit_flags::{self, BitFlagsAIR},
+    dummy_air::{self, DummyAIR},
+};
+use crate::proof::options::ProofOptions;
+use crate::proof::stark::{MainTraceOpening, MultiProof};
+use crate::test_utils::{multi_prove_ram, multi_verify_ram, synth_main_tags};
+use crate::traits::AIR;
+
+type F = GoldilocksField;
+
+/// Build a baseline multi-proof over (DummyAIR, BitFlagsAIR). Both are
+/// non-preprocessed → every main opening is `MainTraceOpening::Mmcs`.
+#[allow(clippy::type_complexity)]
+fn baseline_proof() -> (
+    DummyAIR,
+    BitFlagsAIR,
+    MultiProof<F, F, ()>,
+) {
+    let proof_options = ProofOptions::default_test_options();
+    let air_1 = DummyAIR::new(&proof_options);
+    let air_2 = BitFlagsAIR::new(&proof_options);
+    let mut trace_1 = dummy_air::dummy_trace::<F>(16);
+    let mut trace_2 = bit_flags::bit_prefix_flag_trace(32);
+    let air_trace_pairs: Vec<(
+        &dyn AIR<Field = F, FieldExtension = F, PublicInputs = ()>,
+        &mut _,
+        &_,
+    )> = vec![
+        (&air_1, &mut trace_1, &()),
+        (&air_2, &mut trace_2, &()),
+    ];
+    let proof =
+        multi_prove_ram(air_trace_pairs, &mut DefaultTranscript::<F>::new(&[])).unwrap();
+    (air_1, air_2, proof)
+}
+
+fn verify(airs: &[&dyn AIR<Field = F, FieldExtension = F, PublicInputs = ()>], proof: &MultiProof<F, F, ()>) -> bool {
+    multi_verify_ram(
+        airs,
+        proof,
+        &mut DefaultTranscript::<F>::new(&[]),
+        &FieldElement::zero(),
+    )
+}
+
+/// First-iota opening for the first table in the multi-proof, in the Mmcs
+/// variant. Helper for tests that need a mutable handle into the per-query
+/// MMCS opening fields.
+fn first_mmcs_opening_mut(
+    proof: &mut MultiProof<F, F, ()>,
+) -> &mut MainTraceOpening<F> {
+    &mut proof.proofs[0].deep_poly_openings[0].main_trace_polys
+}
+
+#[test_log::test]
+fn baseline_two_table_proof_verifies() {
+    let (air_1, air_2, proof) = baseline_proof();
+    let airs: Vec<&dyn AIR<Field = F, FieldExtension = F, PublicInputs = ()>> =
+        vec![&air_1, &air_2];
+    assert!(verify(&airs, &proof), "baseline proof must verify");
+}
+
+#[test_log::test]
+fn tampered_main_mmcs_root_rejected() {
+    let (air_1, air_2, mut proof) = baseline_proof();
+    let airs: Vec<&dyn AIR<Field = F, FieldExtension = F, PublicInputs = ()>> =
+        vec![&air_1, &air_2];
+    proof.main_mmcs_root[0] ^= 1;
+    assert!(
+        !verify(&airs, &proof),
+        "tampered main MMCS root must be rejected"
+    );
+}
+
+#[test_log::test]
+fn tampered_main_mmcs_spec_height_rejected() {
+    let (air_1, air_2, mut proof) = baseline_proof();
+    let airs: Vec<&dyn AIR<Field = F, FieldExtension = F, PublicInputs = ()>> =
+        vec![&air_1, &air_2];
+    let height = &mut proof.main_mmcs_spec[0].1;
+    *height /= 2;
+    assert!(
+        !verify(&airs, &proof),
+        "spec height mismatch must be rejected"
+    );
+}
+
+#[test_log::test]
+fn tampered_main_mmcs_spec_tag_rejected() {
+    let (air_1, air_2, mut proof) = baseline_proof();
+    let airs: Vec<&dyn AIR<Field = F, FieldExtension = F, PublicInputs = ()>> =
+        vec![&air_1, &air_2];
+    proof.main_mmcs_spec[0].0 = MatrixTag::new([0xFF; 8]);
+    assert!(
+        !verify(&airs, &proof),
+        "spec tag mismatch must be rejected"
+    );
+}
+
+#[test_log::test]
+fn tampered_mmcs_opening_leaf_rejected() {
+    let (air_1, air_2, mut proof) = baseline_proof();
+    let airs: Vec<&dyn AIR<Field = F, FieldExtension = F, PublicInputs = ()>> =
+        vec![&air_1, &air_2];
+    match first_mmcs_opening_mut(&mut proof) {
+        MainTraceOpening::Mmcs { mmcs_opening, .. } => {
+            mmcs_opening.matrix_leaves[0].1[0] ^= 1;
+        }
+        MainTraceOpening::Tree(_) => panic!("baseline must produce Mmcs variant"),
+    }
+    assert!(
+        !verify(&airs, &proof),
+        "tampered matrix-leaf digest must be rejected"
+    );
+}
+
+#[test_log::test]
+fn tampered_mmcs_opening_leaf_tag_rejected() {
+    let (air_1, air_2, mut proof) = baseline_proof();
+    let airs: Vec<&dyn AIR<Field = F, FieldExtension = F, PublicInputs = ()>> =
+        vec![&air_1, &air_2];
+    match first_mmcs_opening_mut(&mut proof) {
+        MainTraceOpening::Mmcs { mmcs_opening, .. } => {
+            mmcs_opening.matrix_leaves[0].0 = MatrixTag::new([0xCC; 8]);
+        }
+        MainTraceOpening::Tree(_) => panic!("baseline must produce Mmcs variant"),
+    }
+    assert!(
+        !verify(&airs, &proof),
+        "tampered matrix-leaf tag must be rejected"
+    );
+}
+
+#[test_log::test]
+fn tampered_mmcs_opening_global_index_rejected() {
+    let (air_1, air_2, mut proof) = baseline_proof();
+    let airs: Vec<&dyn AIR<Field = F, FieldExtension = F, PublicInputs = ()>> =
+        vec![&air_1, &air_2];
+    match first_mmcs_opening_mut(&mut proof) {
+        MainTraceOpening::Mmcs { mmcs_opening, .. } => {
+            mmcs_opening.global_index ^= 0b10;
+        }
+        MainTraceOpening::Tree(_) => panic!("baseline must produce Mmcs variant"),
+    }
+    assert!(
+        !verify(&airs, &proof),
+        "tampered MMCS global_index must be rejected"
+    );
+}
+
+#[test_log::test]
+fn tampered_mmcs_opening_sibling_rejected() {
+    let (air_1, air_2, mut proof) = baseline_proof();
+    let airs: Vec<&dyn AIR<Field = F, FieldExtension = F, PublicInputs = ()>> =
+        vec![&air_1, &air_2];
+    match first_mmcs_opening_mut(&mut proof) {
+        MainTraceOpening::Mmcs { mmcs_opening, .. } => {
+            assert!(!mmcs_opening.siblings.is_empty());
+            mmcs_opening.siblings[0][0] ^= 1;
+        }
+        MainTraceOpening::Tree(_) => panic!("baseline must produce Mmcs variant"),
+    }
+    assert!(
+        !verify(&airs, &proof),
+        "tampered MMCS sibling must be rejected"
+    );
+}
+
+#[test_log::test]
+fn tampered_evaluations_rejected() {
+    let (air_1, air_2, mut proof) = baseline_proof();
+    let airs: Vec<&dyn AIR<Field = F, FieldExtension = F, PublicInputs = ()>> =
+        vec![&air_1, &air_2];
+    match first_mmcs_opening_mut(&mut proof) {
+        MainTraceOpening::Mmcs { evaluations, .. } => {
+            assert!(!evaluations.is_empty());
+            evaluations[0] += FieldElement::<F>::one();
+        }
+        MainTraceOpening::Tree(_) => panic!("baseline must produce Mmcs variant"),
+    }
+    assert!(
+        !verify(&airs, &proof),
+        "tampered row evaluations must be rejected (rehash mismatch)"
+    );
+}
+
+#[test_log::test]
+fn swapped_main_tags_at_verifier_rejected() {
+    // The verifier reproduces `main_tags` from `synth_main_tags(num_airs)`
+    // inside `multi_verify_ram`. To simulate a verifier that "lies" about
+    // tag ordering we call `multi_verify` directly with a permuted slice.
+    use crate::verifier::{IsStarkVerifier, Verifier};
+    let (air_1, air_2, proof) = baseline_proof();
+    let airs: Vec<&dyn AIR<Field = F, FieldExtension = F, PublicInputs = ()>> =
+        vec![&air_1, &air_2];
+
+    // Sanity: with the correct (synth) tag order it passes.
+    let correct = synth_main_tags(airs.len());
+    assert!(
+        Verifier::multi_verify(
+            &airs,
+            &proof,
+            &correct,
+            &mut DefaultTranscript::<F>::new(&[]),
+            &FieldElement::zero(),
+        ),
+        "baseline must verify with correct tags"
+    );
+
+    // Swap the two tags — the spec sort order is now wrong relative to the
+    // prover's commitments, so the spec match check must reject.
+    let mut swapped = correct.clone();
+    swapped.swap(0, 1);
+    assert!(
+        !Verifier::multi_verify(
+            &airs,
+            &proof,
+            &swapped,
+            &mut DefaultTranscript::<F>::new(&[]),
+            &FieldElement::zero(),
+        ),
+        "swapped main_tags must be rejected"
+    );
+}
diff --git a/crypto/stark/src/tests/mod.rs b/crypto/stark/src/tests/mod.rs
index bc80e522e..f44c65ee9 100644
--- a/crypto/stark/src/tests/mod.rs
+++ b/crypto/stark/src/tests/mod.rs
@@ -2,6 +2,7 @@ pub mod air_tests;
 pub mod bus_tests;
 pub mod domain_cache_stats;
 pub mod fri_tests;
+pub mod mmcs_soundness_tests;
 pub mod proof_options_tests;
 pub mod prove_verify_roundtrip_tests;
 pub mod prover_tests;

From 9f6abba1050c4ec03425608e8c9450eaaee9e7a9 Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Wed, 27 May 2026 14:48:51 -0300
Subject: [PATCH 11/21] feat(cli): `proof-size` subcommand with per-section
 byte breakdown
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds `cli proof-size <ELF>` which generates (or loads via `--proof`) a
VmProof and reports the serialized byte size, broken down by component
across every sub-proof. Intended for CI to track proof-size regressions
and improvements (e.g. the streaming MMCS migration that just landed).

Usage:
    cli proof-size <ELF>                            # human-readable table
    cli proof-size <ELF> --json                     # machine-readable
    cli proof-size <ELF> --proof bundle.bin         # skip the re-prove
    cli proof-size <ELF> --private-input file       # stdin if needed

Sections reported (summed across all sub-proofs):
- main_mmcs_root / main_mmcs_spec                   (multi-proof header)
- per_table_main_merkle_root (preprocessed only)
- per_table_precomputed_merkle_root
- per_table_aux_merkle_root
- deep_poly_openings.main_trace_polys               (MMCS vs Tree opens)
- deep_poly_openings.{precomputed,aux,composition_poly}
- fri_layers_merkle_roots / fri_query_list          (FRI; usually dominant)
- trace_ood_evaluations / composition_poly_parts_ood_evaluation
- bus_public_inputs
- other                                             (bundle delta — headers,
                                                     public_inputs, nonce, ...)

Encoding is bincode v1, matching `cli prove`'s output format so saved
bundles round-trip 1:1.

Cargo.lock updated only for the new `serde` + `serde_json` deps in
`bin/cli`. No core dep changes.
---
 Cargo.lock          |   2 +
 bin/cli/Cargo.toml  |   2 +
 bin/cli/src/main.rs | 223 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 227 insertions(+)

diff --git a/Cargo.lock b/Cargo.lock
index 8fff60dcf..30a3adde6 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -570,6 +570,8 @@ dependencies = [
  "env_logger",
  "executor",
  "lambda-vm-prover",
+ "serde",
+ "serde_json",
  "stark",
  "tikv-jemalloc-ctl",
  "tikv-jemallocator",
diff --git a/bin/cli/Cargo.toml b/bin/cli/Cargo.toml
index 87bb1c8fc..45195a28c 100644
--- a/bin/cli/Cargo.toml
+++ b/bin/cli/Cargo.toml
@@ -10,6 +10,8 @@ prover = { path = "../../prover", package = "lambda-vm-prover" }
 stark = { path = "../../crypto/stark" }
 clap = { version = "4.3.10", features = ["derive"] }
 bincode = "1"
+serde = { version = "1", features = ["derive"] }
+serde_json = "1"
 tikv-jemallocator = "0.6"
 tikv-jemalloc-ctl = { version = "0.6", features = ["stats"], optional = true }
 env_logger = "0.11"
diff --git a/bin/cli/src/main.rs b/bin/cli/src/main.rs
index bdcea9518..a8ba411d5 100644
--- a/bin/cli/src/main.rs
+++ b/bin/cli/src/main.rs
@@ -171,6 +171,31 @@ enum Commands {
         #[arg(long, value_hint = ValueHint::FilePath)]
         private_input: Option<PathBuf>,
     },
+
+    /// Generate a proof and report its serialized byte size, broken down
+    /// by component (trace openings, FRI, OOD evals, MMCS metadata, ...).
+    /// Intended for CI to track proof-size regressions / improvements
+    /// (e.g. the streaming MMCS migration).
+    ProofSize {
+        /// Path to the ELF file
+        #[arg(value_parser, value_hint = ValueHint::FilePath)]
+        elf: PathBuf,
+
+        /// Optional path to a pre-generated proof bundle. When supplied,
+        /// the ELF is not re-proven; the file is decoded and its sizes
+        /// reported directly. The ELF is still needed to bind the proof
+        /// to the program statement.
+        #[arg(long, value_hint = ValueHint::FilePath)]
+        proof: Option<PathBuf>,
+
+        /// Path to the private input file
+        #[arg(long, value_hint = ValueHint::FilePath)]
+        private_input: Option<PathBuf>,
+
+        /// Emit machine-readable JSON instead of a human-readable table.
+        #[arg(long)]
+        json: bool,
+    },
 }
 
 fn main() -> ExitCode {
@@ -199,6 +224,12 @@ fn main() -> ExitCode {
             time,
         } => cmd_verify(proof, elf, blowup, time),
         Commands::CountElements { elf, private_input } => cmd_count_elements(elf, private_input),
+        Commands::ProofSize {
+            elf,
+            proof,
+            private_input,
+            json,
+        } => cmd_proof_size(elf, proof, private_input, json),
     }
 }
 
@@ -554,3 +585,195 @@ fn cmd_count_elements(elf_path: PathBuf, private_input_path: Option<PathBuf>) ->
         }
     }
 }
+
+// =============================================================================
+// proof-size: serialize a VmProof and report a per-section byte breakdown.
+// =============================================================================
+
+/// One row of the proof-size report. `bytes` are the serialized length of
+/// the corresponding piece of the proof under the same encoder used for the
+/// full bundle (bincode v1).
+#[derive(Debug, Clone, serde::Serialize)]
+struct ProofSizeEntry {
+    section: &'static str,
+    bytes: usize,
+}
+
+fn ser_len<T: serde::Serialize>(value: &T) -> usize {
+    // bincode v1 mirrors the encoding used by VmProof callers (bin/cli prove
+    // and prover tests), so per-section sums add up to the total bundle.
+    bincode::serialize(value).map(|v| v.len()).unwrap_or(0)
+}
+
+fn cmd_proof_size(
+    elf_path: PathBuf,
+    proof_path: Option<PathBuf>,
+    private_input_path: Option<PathBuf>,
+    json: bool,
+) -> ExitCode {
+    let elf_data = match std::fs::read(&elf_path) {
+        Ok(data) => data,
+        Err(e) => {
+            eprintln!("Failed to read ELF file: {}", e);
+            return ExitCode::FAILURE;
+        }
+    };
+
+    let vm_proof: VmProof = if let Some(path) = proof_path {
+        let bytes = match std::fs::read(&path) {
+            Ok(b) => b,
+            Err(e) => {
+                eprintln!("Failed to read proof file {}: {}", path.display(), e);
+                return ExitCode::FAILURE;
+            }
+        };
+        match bincode::deserialize(&bytes) {
+            Ok(p) => p,
+            Err(e) => {
+                eprintln!("Failed to decode proof bundle: {}", e);
+                return ExitCode::FAILURE;
+            }
+        }
+    } else {
+        let private_inputs = match read_private_input(private_input_path.as_ref()) {
+            Ok(v) => v,
+            Err(e) => {
+                eprintln!("{e}");
+                return ExitCode::FAILURE;
+            }
+        };
+        eprintln!("Generating proof to measure...");
+        match prover::prove_with_inputs(&elf_data, &private_inputs) {
+            Ok(p) => p,
+            Err(e) => {
+                eprintln!("Proving failed: {:?}", e);
+                return ExitCode::FAILURE;
+            }
+        }
+    };
+
+    let total = ser_len(&vm_proof);
+    let multi_proof_bytes = ser_len(&vm_proof.proof);
+    let main_mmcs_root_bytes = ser_len(&vm_proof.proof.main_mmcs_root);
+    let main_mmcs_spec_bytes = ser_len(&vm_proof.proof.main_mmcs_spec);
+
+    // Sum per-section across every sub-proof so a single number captures the
+    // contribution of, e.g., "all FRI query lists across all tables".
+    let mut s_main_trace_openings = 0usize;
+    let mut s_precomputed_trace_openings = 0usize;
+    let mut s_aux_trace_openings = 0usize;
+    let mut s_composition_openings = 0usize;
+    let mut s_fri_query_list = 0usize;
+    let mut s_fri_layers_roots = 0usize;
+    let mut s_trace_ood = 0usize;
+    let mut s_composition_ood = 0usize;
+    let mut s_per_table_main_root = 0usize;
+    let mut s_aux_root = 0usize;
+    let mut s_precomputed_root = 0usize;
+    let mut s_bus_public_inputs = 0usize;
+    let s_other;
+
+    for proof in &vm_proof.proof.proofs {
+        s_per_table_main_root += ser_len(&proof.lde_trace_main_merkle_root);
+        s_aux_root += ser_len(&proof.lde_trace_aux_merkle_root);
+        s_precomputed_root += ser_len(&proof.lde_trace_precomputed_merkle_root);
+        s_trace_ood += ser_len(&proof.trace_ood_evaluations);
+        s_composition_ood += ser_len(&proof.composition_poly_parts_ood_evaluation);
+        s_fri_query_list += ser_len(&proof.query_list);
+        s_fri_layers_roots += ser_len(&proof.fri_layers_merkle_roots);
+        s_bus_public_inputs += ser_len(&proof.bus_public_inputs);
+
+        for opening in &proof.deep_poly_openings {
+            s_main_trace_openings += ser_len(&opening.main_trace_polys);
+            s_precomputed_trace_openings += ser_len(&opening.precomputed_trace_polys);
+            s_aux_trace_openings += ser_len(&opening.aux_trace_polys);
+            s_composition_openings += ser_len(&opening.composition_poly);
+        }
+    }
+
+    // Anything not captured above (composition_poly_root, fri_last_value,
+    // nonce, public_inputs, trace_length, headers...). Calculate as the
+    // bundle delta so the breakdown still sums to ~total.
+    let accounted = main_mmcs_root_bytes
+        + main_mmcs_spec_bytes
+        + s_main_trace_openings
+        + s_precomputed_trace_openings
+        + s_aux_trace_openings
+        + s_composition_openings
+        + s_fri_query_list
+        + s_fri_layers_roots
+        + s_trace_ood
+        + s_composition_ood
+        + s_per_table_main_root
+        + s_aux_root
+        + s_precomputed_root
+        + s_bus_public_inputs;
+    s_other = multi_proof_bytes.saturating_sub(accounted);
+
+    let entries: Vec<ProofSizeEntry> = vec![
+        ProofSizeEntry { section: "main_mmcs_root", bytes: main_mmcs_root_bytes },
+        ProofSizeEntry { section: "main_mmcs_spec", bytes: main_mmcs_spec_bytes },
+        ProofSizeEntry { section: "per_table_main_merkle_root (preprocessed)", bytes: s_per_table_main_root },
+        ProofSizeEntry { section: "per_table_precomputed_merkle_root", bytes: s_precomputed_root },
+        ProofSizeEntry { section: "per_table_aux_merkle_root", bytes: s_aux_root },
+        ProofSizeEntry { section: "deep_poly_openings.main_trace_polys", bytes: s_main_trace_openings },
+        ProofSizeEntry { section: "deep_poly_openings.precomputed_trace_polys", bytes: s_precomputed_trace_openings },
+        ProofSizeEntry { section: "deep_poly_openings.aux_trace_polys", bytes: s_aux_trace_openings },
+        ProofSizeEntry { section: "deep_poly_openings.composition_poly", bytes: s_composition_openings },
+        ProofSizeEntry { section: "fri_layers_merkle_roots", bytes: s_fri_layers_roots },
+        ProofSizeEntry { section: "fri_query_list", bytes: s_fri_query_list },
+        ProofSizeEntry { section: "trace_ood_evaluations", bytes: s_trace_ood },
+        ProofSizeEntry { section: "composition_poly_parts_ood_evaluation", bytes: s_composition_ood },
+        ProofSizeEntry { section: "bus_public_inputs", bytes: s_bus_public_inputs },
+        ProofSizeEntry { section: "other (headers / public_inputs / nonce / ...)", bytes: s_other },
+    ];
+
+    if json {
+        #[derive(serde::Serialize)]
+        struct Report<'a> {
+            elf: String,
+            total_vm_proof_bytes: usize,
+            multi_proof_bytes: usize,
+            sub_proof_count: usize,
+            main_mmcs_spec_entries: usize,
+            sections: &'a [ProofSizeEntry],
+        }
+        let report = Report {
+            elf: elf_path.display().to_string(),
+            total_vm_proof_bytes: total,
+            multi_proof_bytes,
+            sub_proof_count: vm_proof.proof.proofs.len(),
+            main_mmcs_spec_entries: vm_proof.proof.main_mmcs_spec.len(),
+            sections: &entries,
+        };
+        match serde_json::to_string_pretty(&report) {
+            Ok(s) => println!("{s}"),
+            Err(e) => {
+                eprintln!("Failed to encode JSON: {}", e);
+                return ExitCode::FAILURE;
+            }
+        }
+    } else {
+        println!();
+        println!("== VmProof size report ==");
+        println!("ELF:               {}", elf_path.display());
+        println!("Total VmProof:     {:>10}  bytes", total);
+        println!("MultiProof only:   {:>10}  bytes", multi_proof_bytes);
+        println!("Sub-proofs:        {:>10}", vm_proof.proof.proofs.len());
+        println!("MMCS spec entries: {:>10}", vm_proof.proof.main_mmcs_spec.len());
+        println!();
+        println!("{:<48}{:>14}{:>10}", "section", "bytes", "% of total");
+        println!("{}", "-".repeat(72));
+        let denom = total.max(1) as f64;
+        for e in &entries {
+            println!(
+                "{:<48}{:>14}{:>9.2}%",
+                e.section,
+                e.bytes,
+                (e.bytes as f64) * 100.0 / denom
+            );
+        }
+    }
+
+    ExitCode::SUCCESS
+}

From dea5289652a14ee6668e8d6d970d618b34cab476 Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Wed, 27 May 2026 14:58:48 -0300
Subject: [PATCH 12/21] feat(cli): `proof-size-diff` subcommand + stable
 `ProofSizeReport` JSON
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a pure post-processing subcommand for CI to render proof-size
deltas between two `proof-size --json` reports. Mirrors the
`tooling/loc` workflow: prove on the baseline and on the PR, then diff
the JSONs to produce a comment-ready table.

    cli proof-size base.elf --json > base.json
    cli proof-size pr.elf   --json > pr.json
    cli proof-size-diff base.json pr.json --format github > comment.md

Formats:
- `text`  — plain aligned table for terminal / logs
- `github` — Markdown with a collapsible <details> per-section block;
  the headline row stays visible in the PR comment so reviewers see the
  total delta at a glance
- `slack` — Slack-flavoured Markdown (code fence for the breakdown)

Per-section diff handles the typical edge cases:
- Section present in both → bytes delta + percentage
- Section new in current → `+N (new)`
- Section dropped in current → `-N (gone)`
- Section order follows `current`, then prev-only at the end (lossless)

The previously-anonymous `Report` struct in `cmd_proof_size` is hoisted
to a public-shape `ProofSizeReport { elf, total_vm_proof_bytes,
multi_proof_bytes, sub_proof_count, main_mmcs_spec_entries, sections }`
so the JSON schema is stable enough for CI to depend on. `section` is
now `String` (was `&'static str`) so the same struct round-trips
through deserialization.

3 unit tests in `proof_size_diff_tests` cover:
- Headline + per-section deltas in text format
- New-section + removed-section handling
- GitHub format collapsible block + percentage rendering

Build clean, 3/3 tests green, smoke-tested both formats with fake JSONs.
---
 bin/cli/src/main.rs | 279 +++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 251 insertions(+), 28 deletions(-)

diff --git a/bin/cli/src/main.rs b/bin/cli/src/main.rs
index a8ba411d5..9e4c95ad4 100644
--- a/bin/cli/src/main.rs
+++ b/bin/cli/src/main.rs
@@ -196,6 +196,25 @@ enum Commands {
         #[arg(long)]
         json: bool,
     },
+
+    /// Diff two `proof-size --json` reports and emit a comparison suitable
+    /// for posting to a PR / Slack channel. Pure post-processing — does not
+    /// run the prover. Designed to mirror the `tooling/loc` workflow:
+    ///   cli proof-size base.elf --json > base.json
+    ///   cli proof-size pr.elf   --json > pr.json
+    ///   cli proof-size-diff base.json pr.json --format github > comment.md
+    ProofSizeDiff {
+        /// JSON report from the baseline (e.g. main) build.
+        #[arg(value_hint = ValueHint::FilePath)]
+        previous: PathBuf,
+        /// JSON report from the candidate (e.g. PR) build.
+        #[arg(value_hint = ValueHint::FilePath)]
+        current: PathBuf,
+        /// Output format: `github` (markdown table for PR comments),
+        /// `slack` (Slack-flavoured markdown), or `text` (plain table).
+        #[arg(long, default_value = "text")]
+        format: String,
+    },
 }
 
 fn main() -> ExitCode {
@@ -230,6 +249,11 @@ fn main() -> ExitCode {
             private_input,
             json,
         } => cmd_proof_size(elf, proof, private_input, json),
+        Commands::ProofSizeDiff {
+            previous,
+            current,
+            format,
+        } => cmd_proof_size_diff(previous, current, &format),
     }
 }
 
@@ -593,12 +617,24 @@ fn cmd_count_elements(elf_path: PathBuf, private_input_path: Option<PathBuf>) ->
 /// One row of the proof-size report. `bytes` are the serialized length of
 /// the corresponding piece of the proof under the same encoder used for the
 /// full bundle (bincode v1).
-#[derive(Debug, Clone, serde::Serialize)]
+#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
 struct ProofSizeEntry {
-    section: &'static str,
+    section: String,
     bytes: usize,
 }
 
+/// Top-level JSON shape emitted by `cli proof-size --json` and consumed by
+/// `cli proof-size-diff`. Stable enough for CI to depend on.
+#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
+struct ProofSizeReport {
+    elf: String,
+    total_vm_proof_bytes: usize,
+    multi_proof_bytes: usize,
+    sub_proof_count: usize,
+    main_mmcs_spec_entries: usize,
+    sections: Vec<ProofSizeEntry>,
+}
+
 fn ser_len<T: serde::Serialize>(value: &T) -> usize {
     // bincode v1 mirrors the encoding used by VmProof callers (bin/cli prove
     // and prover tests), so per-section sums add up to the total bundle.
@@ -711,40 +747,31 @@ fn cmd_proof_size(
     s_other = multi_proof_bytes.saturating_sub(accounted);
 
     let entries: Vec<ProofSizeEntry> = vec![
-        ProofSizeEntry { section: "main_mmcs_root", bytes: main_mmcs_root_bytes },
-        ProofSizeEntry { section: "main_mmcs_spec", bytes: main_mmcs_spec_bytes },
-        ProofSizeEntry { section: "per_table_main_merkle_root (preprocessed)", bytes: s_per_table_main_root },
-        ProofSizeEntry { section: "per_table_precomputed_merkle_root", bytes: s_precomputed_root },
-        ProofSizeEntry { section: "per_table_aux_merkle_root", bytes: s_aux_root },
-        ProofSizeEntry { section: "deep_poly_openings.main_trace_polys", bytes: s_main_trace_openings },
-        ProofSizeEntry { section: "deep_poly_openings.precomputed_trace_polys", bytes: s_precomputed_trace_openings },
-        ProofSizeEntry { section: "deep_poly_openings.aux_trace_polys", bytes: s_aux_trace_openings },
-        ProofSizeEntry { section: "deep_poly_openings.composition_poly", bytes: s_composition_openings },
-        ProofSizeEntry { section: "fri_layers_merkle_roots", bytes: s_fri_layers_roots },
-        ProofSizeEntry { section: "fri_query_list", bytes: s_fri_query_list },
-        ProofSizeEntry { section: "trace_ood_evaluations", bytes: s_trace_ood },
-        ProofSizeEntry { section: "composition_poly_parts_ood_evaluation", bytes: s_composition_ood },
-        ProofSizeEntry { section: "bus_public_inputs", bytes: s_bus_public_inputs },
-        ProofSizeEntry { section: "other (headers / public_inputs / nonce / ...)", bytes: s_other },
+        ProofSizeEntry { section: "main_mmcs_root".into(), bytes: main_mmcs_root_bytes },
+        ProofSizeEntry { section: "main_mmcs_spec".into(), bytes: main_mmcs_spec_bytes },
+        ProofSizeEntry { section: "per_table_main_merkle_root (preprocessed)".into(), bytes: s_per_table_main_root },
+        ProofSizeEntry { section: "per_table_precomputed_merkle_root".into(), bytes: s_precomputed_root },
+        ProofSizeEntry { section: "per_table_aux_merkle_root".into(), bytes: s_aux_root },
+        ProofSizeEntry { section: "deep_poly_openings.main_trace_polys".into(), bytes: s_main_trace_openings },
+        ProofSizeEntry { section: "deep_poly_openings.precomputed_trace_polys".into(), bytes: s_precomputed_trace_openings },
+        ProofSizeEntry { section: "deep_poly_openings.aux_trace_polys".into(), bytes: s_aux_trace_openings },
+        ProofSizeEntry { section: "deep_poly_openings.composition_poly".into(), bytes: s_composition_openings },
+        ProofSizeEntry { section: "fri_layers_merkle_roots".into(), bytes: s_fri_layers_roots },
+        ProofSizeEntry { section: "fri_query_list".into(), bytes: s_fri_query_list },
+        ProofSizeEntry { section: "trace_ood_evaluations".into(), bytes: s_trace_ood },
+        ProofSizeEntry { section: "composition_poly_parts_ood_evaluation".into(), bytes: s_composition_ood },
+        ProofSizeEntry { section: "bus_public_inputs".into(), bytes: s_bus_public_inputs },
+        ProofSizeEntry { section: "other (headers / public_inputs / nonce / ...)".into(), bytes: s_other },
     ];
 
     if json {
-        #[derive(serde::Serialize)]
-        struct Report<'a> {
-            elf: String,
-            total_vm_proof_bytes: usize,
-            multi_proof_bytes: usize,
-            sub_proof_count: usize,
-            main_mmcs_spec_entries: usize,
-            sections: &'a [ProofSizeEntry],
-        }
-        let report = Report {
+        let report = ProofSizeReport {
             elf: elf_path.display().to_string(),
             total_vm_proof_bytes: total,
             multi_proof_bytes,
             sub_proof_count: vm_proof.proof.proofs.len(),
             main_mmcs_spec_entries: vm_proof.proof.main_mmcs_spec.len(),
-            sections: &entries,
+            sections: entries.clone(),
         };
         match serde_json::to_string_pretty(&report) {
             Ok(s) => println!("{s}"),
@@ -777,3 +804,199 @@ fn cmd_proof_size(
 
     ExitCode::SUCCESS
 }
+
+// =============================================================================
+// proof-size-diff: read two ProofSizeReport JSONs and emit a comparison.
+// =============================================================================
+
+fn cmd_proof_size_diff(previous: PathBuf, current: PathBuf, format: &str) -> ExitCode {
+    let prev: ProofSizeReport = match load_report(&previous) {
+        Ok(r) => r,
+        Err(e) => {
+            eprintln!("Failed to load previous report ({}): {}", previous.display(), e);
+            return ExitCode::FAILURE;
+        }
+    };
+    let curr: ProofSizeReport = match load_report(&current) {
+        Ok(r) => r,
+        Err(e) => {
+            eprintln!("Failed to load current report ({}): {}", current.display(), e);
+            return ExitCode::FAILURE;
+        }
+    };
+
+    let rendered = match format {
+        "github" => render_github(&prev, &curr),
+        "slack" => render_slack(&prev, &curr),
+        "text" | "txt" => render_text(&prev, &curr),
+        other => {
+            eprintln!("Unknown --format value: {other:?}. Try github | slack | text.");
+            return ExitCode::FAILURE;
+        }
+    };
+    println!("{rendered}");
+    ExitCode::SUCCESS
+}
+
+fn load_report(path: &PathBuf) -> Result<ProofSizeReport, String> {
+    let s = std::fs::read_to_string(path).map_err(|e| e.to_string())?;
+    serde_json::from_str(&s).map_err(|e| e.to_string())
+}
+
+/// Pair sections from two reports by name. The order returned mirrors the
+/// section order of `curr`; any section present in `prev` but missing in
+/// `curr` is appended at the end so the diff is lossless.
+fn paired_sections<'a>(
+    prev: &'a ProofSizeReport,
+    curr: &'a ProofSizeReport,
+) -> Vec<(String, Option<usize>, Option<usize>)> {
+    let mut out: Vec<(String, Option<usize>, Option<usize>)> = Vec::new();
+    for c in &curr.sections {
+        let p = prev.sections.iter().find(|p| p.section == c.section);
+        out.push((c.section.clone(), p.map(|p| p.bytes), Some(c.bytes)));
+    }
+    for p in &prev.sections {
+        if curr.sections.iter().all(|c| c.section != p.section) {
+            out.push((p.section.clone(), Some(p.bytes), None));
+        }
+    }
+    out
+}
+
+fn fmt_delta(prev: Option<usize>, curr: Option<usize>) -> String {
+    match (prev, curr) {
+        (Some(p), Some(c)) => {
+            let d = c as i64 - p as i64;
+            let pct = if p == 0 { 0.0 } else { d as f64 * 100.0 / p as f64 };
+            format!("{:+} ({:+.2}%)", d, pct)
+        }
+        (None, Some(c)) => format!("+{} (new)", c),
+        (Some(p), None) => format!("-{} (gone)", p),
+        (None, None) => "—".to_string(),
+    }
+}
+
+fn fmt_total_delta(prev: usize, curr: usize) -> String {
+    let d = curr as i64 - prev as i64;
+    let pct = if prev == 0 { 0.0 } else { d as f64 * 100.0 / prev as f64 };
+    format!("{:+} ({:+.2}%)", d, pct)
+}
+
+fn render_text(prev: &ProofSizeReport, curr: &ProofSizeReport) -> String {
+    let mut s = String::new();
+    s.push_str("== Proof size diff ==\n");
+    s.push_str(&format!("previous: {}  ({} bytes)\n", prev.elf, prev.total_vm_proof_bytes));
+    s.push_str(&format!("current:  {}  ({} bytes)\n", curr.elf, curr.total_vm_proof_bytes));
+    s.push_str(&format!(
+        "total delta: {}\n\n",
+        fmt_total_delta(prev.total_vm_proof_bytes, curr.total_vm_proof_bytes)
+    ));
+    s.push_str(&format!("{:<48}{:>12}{:>12}{:>22}\n", "section", "previous", "current", "delta"));
+    s.push_str(&format!("{}\n", "-".repeat(94)));
+    for (section, p, c) in paired_sections(prev, curr) {
+        let p_str = p.map(|v| v.to_string()).unwrap_or_else(|| "—".into());
+        let c_str = c.map(|v| v.to_string()).unwrap_or_else(|| "—".into());
+        s.push_str(&format!("{:<48}{:>12}{:>12}{:>22}\n", section, p_str, c_str, fmt_delta(p, c)));
+    }
+    s
+}
+
+fn render_github(prev: &ProofSizeReport, curr: &ProofSizeReport) -> String {
+    let mut s = String::new();
+    s.push_str("### 📦 Proof size diff\n\n");
+    s.push_str(&format!(
+        "| | bytes |\n|---|---:|\n| previous (`{}`) | {} |\n| current (`{}`) | {} |\n| **total delta** | **{}** |\n\n",
+        prev.elf,
+        prev.total_vm_proof_bytes,
+        curr.elf,
+        curr.total_vm_proof_bytes,
+        fmt_total_delta(prev.total_vm_proof_bytes, curr.total_vm_proof_bytes),
+    ));
+    s.push_str("<details><summary>Per-section breakdown</summary>\n\n");
+    s.push_str("| section | previous | current | delta |\n|---|---:|---:|---:|\n");
+    for (section, p, c) in paired_sections(prev, curr) {
+        let p_str = p.map(|v| v.to_string()).unwrap_or_else(|| "—".into());
+        let c_str = c.map(|v| v.to_string()).unwrap_or_else(|| "—".into());
+        s.push_str(&format!("| `{}` | {} | {} | {} |\n", section, p_str, c_str, fmt_delta(p, c)));
+    }
+    s.push_str("\n</details>\n");
+    s
+}
+
+fn render_slack(prev: &ProofSizeReport, curr: &ProofSizeReport) -> String {
+    let mut s = String::new();
+    s.push_str("*Proof size diff*\n");
+    s.push_str(&format!(
+        "previous (`{}`): {} bytes\n",
+        prev.elf, prev.total_vm_proof_bytes
+    ));
+    s.push_str(&format!(
+        "current  (`{}`): {} bytes\n",
+        curr.elf, curr.total_vm_proof_bytes
+    ));
+    s.push_str(&format!(
+        "*total delta*: {}\n\n```\n",
+        fmt_total_delta(prev.total_vm_proof_bytes, curr.total_vm_proof_bytes)
+    ));
+    s.push_str(&format!("{:<48}{:>12}{:>12}{:>22}\n", "section", "previous", "current", "delta"));
+    for (section, p, c) in paired_sections(prev, curr) {
+        let p_str = p.map(|v| v.to_string()).unwrap_or_else(|| "—".into());
+        let c_str = c.map(|v| v.to_string()).unwrap_or_else(|| "—".into());
+        s.push_str(&format!("{:<48}{:>12}{:>12}{:>22}\n", section, p_str, c_str, fmt_delta(p, c)));
+    }
+    s.push_str("```\n");
+    s
+}
+
+#[cfg(test)]
+mod proof_size_diff_tests {
+    use super::*;
+
+    fn r(elf: &str, total: usize, sections: &[(&str, usize)]) -> ProofSizeReport {
+        ProofSizeReport {
+            elf: elf.into(),
+            total_vm_proof_bytes: total,
+            multi_proof_bytes: total,
+            sub_proof_count: 1,
+            main_mmcs_spec_entries: 0,
+            sections: sections
+                .iter()
+                .map(|(s, b)| ProofSizeEntry { section: (*s).into(), bytes: *b })
+                .collect(),
+        }
+    }
+
+    #[test]
+    fn text_diff_shows_total_and_per_section_delta() {
+        let prev = r("base.elf", 100, &[("a", 60), ("b", 40)]);
+        let curr = r("pr.elf", 110, &[("a", 50), ("b", 60)]);
+        let out = render_text(&prev, &curr);
+        assert!(out.contains("total delta: +10"));
+        assert!(out.contains("-10"));
+        assert!(out.contains("+20"));
+    }
+
+    #[test]
+    fn diff_handles_new_and_removed_sections() {
+        let prev = r("base.elf", 50, &[("a", 30), ("gone", 20)]);
+        let curr = r("pr.elf", 60, &[("a", 30), ("new", 30)]);
+        let pairs = paired_sections(&prev, &curr);
+        // Order: current sections first, then prev-only.
+        assert_eq!(pairs[0].0, "a");
+        assert_eq!(pairs[1].0, "new");
+        assert_eq!(pairs[2].0, "gone");
+        let text = render_text(&prev, &curr);
+        assert!(text.contains("(new)"));
+        assert!(text.contains("(gone)"));
+    }
+
+    #[test]
+    fn github_format_has_collapsible_section() {
+        let prev = r("base.elf", 100, &[("a", 100)]);
+        let curr = r("pr.elf", 90, &[("a", 90)]);
+        let out = render_github(&prev, &curr);
+        assert!(out.contains("### 📦 Proof size diff"));
+        assert!(out.contains("<details>"));
+        assert!(out.contains("-10 (-10.00%)"));
+    }
+}

From 3f6e5a33069c80ff91bc679c54fefd82ecd6922e Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Wed, 27 May 2026 15:29:29 -0300
Subject: [PATCH 13/21] feat(stark/mmcs): wire AUX trace under a shared MMCS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Mirror of the main-trace MMCS C1 work, applied to the auxiliary trace.
Pulls per-table aux Merkle roots out of the per-table forked transcripts
into a single shared aux MMCS root absorbed into the SHARED transcript
BEFORE per-table forking — every fork inherits the same aux binding
identically, and the proof carries one aux root instead of N.

Domain separation:
- `LEAF_DOMAIN_TAG_AUX = "LAMBDAVM_AUX_MMCS_LEAF_V1"` (alongside the
  existing main tag, now also aliased as `LEAF_DOMAIN_TAG_MAIN`).
- `hash_tagged_row_bytes_aux` / `hash_tagged_row_aux` helpers use it.
- An aux-MMCS opening cannot authenticate a main leaf (or vice versa);
  a `(tag, row)` pair under the two domains produces distinct digests.
  Pinned by a new test in `mmcs_leaf::tests`.

Architecture:
- `MainTraceOpening` gains a sibling `AuxTraceOpening<E>` enum (Mmcs
  variant only — there is no preprocessed-equivalent for aux).
- `MainCommit` gains a sibling `AuxCommit<E>::Shared { mmcs (Arc), tag,
  padded_height }`. `Round1.aux: Option<AuxCommit<E>>` (`None` when an
  AIR has no aux trace).
- `DeepPolynomialOpening.aux_trace_polys: Option<AuxTraceOpening<E>>`
  (the old `PolynomialOpenings<E>` shape is gone).
- `StarkProof` drops `lde_trace_aux_merkle_root` entirely.
- `MultiProof` gains:
  - `aux_mmcs_root: Option<Commitment>`  — `None` when no AIR has aux.
  - `aux_mmcs_spec: Vec<(MatrixTag, usize)>` — filtered to has-aux tables.

Phase C absorb order (prover + verifier match exactly):
  1. Phase A: main MMCS root absorb (unchanged).
  2. Phase B: sample LogUp challenges (unchanged).
  3. NEW: build aux LDEs + tagged leaves for tables with aux,
          build shared aux MMCS,
          absorb its root into the SHARED transcript.
  4. Fork per-table.
  5. Per-table: bind `bus_public_inputs.table_contribution` (unchanged),
                run rounds 2-4 (unchanged). No per-table aux root absorb.

Per-query open / verify path:
- Prover: `aux_commit.mmcs.open((iota*2) << shift)` and
  `mmcs.open((iota*2+1) << shift)`, producing `AuxTraceOpening::Mmcs`.
- Verifier: `verify_aux_mmcs_pair_inner` rehashes evaluations with the
  AUX domain, compares against `mmcs_opening.matrix_leaves[table_idx]`,
  cross-checks `global_index`, and authenticates against root + spec.
- `verify_trace_openings` dispatches: `Some(_)` requires aux MMCS root
  to be present; `(None, _)` is fine (table has no aux); `(Some, None)`
  rejects (proof claims an opening but no MMCS exists).

Verifier spec validation:
- Reproduces `expected_aux_spec` from `airs.filter(has_aux_trace).map((tag, lde_size))`
  in spec-fixed order, sorts by `(height desc, tag asc)` to match
  `MmcsBuilder::finalize`, and rejects on any mismatch with the
  proof-supplied `aux_mmcs_spec`.

Dead code cleanup:
- Removed `TableCommit<F>` entirely. Both main and aux now go through
  their own enum-typed commits; nothing else used it.

Tests: stark 142/142 green (130 prior + the 10 main-MMCS soundness +
2 new aux leaf-hash tests). The 77 prove_elfs failures in
lambda-vm-prover predate this work (UnknownSyscall(5)).
---
 crypto/stark/src/mmcs_leaf.rs   |  88 ++++++++-
 crypto/stark/src/proof/stark.rs |  49 ++++-
 crypto/stark/src/prover.rs      | 326 ++++++++++++++++++++++----------
 crypto/stark/src/verifier.rs    | 168 ++++++++++++----
 4 files changed, 491 insertions(+), 140 deletions(-)

diff --git a/crypto/stark/src/mmcs_leaf.rs b/crypto/stark/src/mmcs_leaf.rs
index 488a937af..447f9650f 100644
--- a/crypto/stark/src/mmcs_leaf.rs
+++ b/crypto/stark/src/mmcs_leaf.rs
@@ -31,6 +31,18 @@ use crate::config::Commitment;
 /// any encoding change so old proofs cannot be silently re-interpreted.
 pub const LEAF_DOMAIN_TAG: &[u8] = b"LAMBDAVM_MAIN_MMCS_LEAF_V1";
 
+/// Aliased name for `LEAF_DOMAIN_TAG`. Use this in new code to make the
+/// intent explicit when an MMCS-specific tag is needed alongside the aux
+/// tag below.
+pub const LEAF_DOMAIN_TAG_MAIN: &[u8] = LEAF_DOMAIN_TAG;
+
+/// Versioned domain separator for AUX-trace MMCS leaves. Distinct from
+/// `LEAF_DOMAIN_TAG_MAIN` so that an aux leaf and a main leaf with the
+/// same `(MatrixTag, row_bytes)` produce different digests — i.e. neither
+/// MMCS opening can authenticate a leaf that was committed against the
+/// other.
+pub const LEAF_DOMAIN_TAG_AUX: &[u8] = b"LAMBDAVM_AUX_MMCS_LEAF_V1";
+
 /// Synthesize `n` distinct [`MatrixTag`]s derived from positional index.
 /// Useful for generic stark tests where the caller does not own a stable
 /// chip-type assignment. Production code in lambda-vm uses
@@ -47,22 +59,58 @@ pub fn synth_main_tags_for<T>(slice: &[T]) -> Vec<MatrixTag> {
     synth_main_tags(slice.len())
 }
 
-/// Hash one row's worth of column bytes into a leaf digest using the
-/// canonical tagged format. `row_bytes_be` is the concatenation of every
-/// committed column's element written big-endian, in column order.
+/// Hash one row's worth of column bytes into a MAIN-trace MMCS leaf digest.
+/// `row_bytes_be` is the concatenation of every committed column's element
+/// written big-endian, in column order.
 #[inline]
 pub fn hash_tagged_row_bytes(tag: MatrixTag, row_bytes_be: &[u8]) -> Commitment {
+    hash_with_domain(LEAF_DOMAIN_TAG_MAIN, tag, row_bytes_be)
+}
+
+/// Hash one row's worth of column bytes into an AUX-trace MMCS leaf digest.
+/// Uses [`LEAF_DOMAIN_TAG_AUX`] so the digest cannot collide with a
+/// main-trace leaf for the same `(tag, row_bytes)`.
+#[inline]
+pub fn hash_tagged_row_bytes_aux(tag: MatrixTag, row_bytes_be: &[u8]) -> Commitment {
+    hash_with_domain(LEAF_DOMAIN_TAG_AUX, tag, row_bytes_be)
+}
+
+#[inline]
+fn hash_with_domain(domain: &[u8], tag: MatrixTag, row_bytes_be: &[u8]) -> Commitment {
     let mut h = Keccak256::new();
-    h.update(LEAF_DOMAIN_TAG);
+    h.update(domain);
     h.update(tag.0);
     h.update(row_bytes_be);
     h.finalize().into()
 }
 
-/// Convenience: hash a row from individual field elements. Allocates a
-/// stack-or-heap buffer for the row, suitable for verifier-side per-query
+/// Convenience: hash a MAIN-trace row from individual field elements.
+/// Allocates a row-sized buffer; suitable for verifier-side per-query
 /// re-hashing (where allocation cost is dominated by FRI work anyway).
 pub fn hash_tagged_row<E>(tag: MatrixTag, row: &[FieldElement<E>]) -> Commitment
+where
+    E: IsField,
+    FieldElement<E>: ByteConversion,
+{
+    hash_tagged_row_inner::<E>(LEAF_DOMAIN_TAG_MAIN, tag, row)
+}
+
+/// Convenience: hash an AUX-trace row from individual field elements. Same
+/// allocation pattern as [`hash_tagged_row`].
+pub fn hash_tagged_row_aux<E>(tag: MatrixTag, row: &[FieldElement<E>]) -> Commitment
+where
+    E: IsField,
+    FieldElement<E>: ByteConversion,
+{
+    hash_tagged_row_inner::<E>(LEAF_DOMAIN_TAG_AUX, tag, row)
+}
+
+#[inline]
+fn hash_tagged_row_inner<E>(
+    domain: &[u8],
+    tag: MatrixTag,
+    row: &[FieldElement<E>],
+) -> Commitment
 where
     E: IsField,
     FieldElement<E>: ByteConversion,
@@ -72,7 +120,7 @@ where
     for (col_idx, fe) in row.iter().enumerate() {
         fe.write_bytes_be(&mut buf[col_idx * byte_len..(col_idx + 1) * byte_len]);
     }
-    hash_tagged_row_bytes(tag, &buf)
+    hash_with_domain(domain, tag, &buf)
 }
 
 #[cfg(test)]
@@ -97,4 +145,30 @@ mod tests {
         let row_b = vec![FE::from(1u64), FE::from(3u64)];
         assert_ne!(hash_tagged_row(tag, &row_a), hash_tagged_row(tag, &row_b));
     }
+
+    #[test]
+    fn main_and_aux_domains_separate() {
+        // Same (tag, row) under the two domains MUST produce distinct
+        // digests; otherwise an aux opening could authenticate a main leaf
+        // (or vice versa).
+        let tag = MatrixTag::new([0xAB; 8]);
+        let row = vec![FE::from(42u64), FE::from(7u64)];
+        let main_digest = hash_tagged_row(tag, &row);
+        let aux_digest = hash_tagged_row_aux(tag, &row);
+        assert_ne!(main_digest, aux_digest);
+    }
+
+    #[test]
+    fn aux_bytes_helper_matches_aux_element_helper() {
+        // The bytes-flavoured helper and the element-flavoured helper must
+        // agree on the same input — same domain separator, same hash.
+        let tag = MatrixTag::new([3; 8]);
+        let row = vec![FE::from(11u64), FE::from(13u64), FE::from(17u64)];
+        let byte_len = <FE as ByteConversion>::BYTE_LEN;
+        let mut buf = vec![0u8; row.len() * byte_len];
+        for (i, fe) in row.iter().enumerate() {
+            fe.write_bytes_be(&mut buf[i * byte_len..(i + 1) * byte_len]);
+        }
+        assert_eq!(hash_tagged_row_bytes_aux(tag, &buf), hash_tagged_row_aux(tag, &row));
+    }
 }
diff --git a/crypto/stark/src/proof/stark.rs b/crypto/stark/src/proof/stark.rs
index 667f9f170..57b28f75c 100644
--- a/crypto/stark/src/proof/stark.rs
+++ b/crypto/stark/src/proof/stark.rs
@@ -57,6 +57,37 @@ impl<F: IsField> MainTraceOpening<F> {
     }
 }
 
+/// Per-query aux-trace opening. Symmetric to [`MainTraceOpening`], minus
+/// the `Tree` variant — every aux table that exists goes through the
+/// shared aux MMCS (there's no preprocessed-equivalent for aux).
+///
+/// `Option<AuxTraceOpening>` in `DeepPolynomialOpening.aux_trace_polys`
+/// carries the "this AIR has no aux trace at all" case.
+#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
+#[serde(bound = "")]
+pub enum AuxTraceOpening<E: IsField> {
+    Mmcs {
+        evaluations: Vec<FieldElement<E>>,
+        evaluations_sym: Vec<FieldElement<E>>,
+        mmcs_opening: MmcsOpening<Commitment>,
+        mmcs_opening_sym: MmcsOpening<Commitment>,
+    },
+}
+
+impl<E: IsField> AuxTraceOpening<E> {
+    pub fn evaluations(&self) -> &[FieldElement<E>] {
+        match self {
+            Self::Mmcs { evaluations, .. } => evaluations,
+        }
+    }
+
+    pub fn evaluations_sym(&self) -> &[FieldElement<E>] {
+        match self {
+            Self::Mmcs { evaluations_sym, .. } => evaluations_sym,
+        }
+    }
+}
+
 #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
 #[serde(bound = "")]
 pub struct DeepPolynomialOpening<F: IsSubFieldOf<E>, E: IsField> {
@@ -65,7 +96,9 @@ pub struct DeepPolynomialOpening<F: IsSubFieldOf<E>, E: IsField> {
     /// For preprocessed tables: openings for precomputed columns.
     /// These are verified against the hardcoded precomputed commitment.
     pub precomputed_trace_polys: Option<PolynomialOpenings<F>>,
-    pub aux_trace_polys: Option<PolynomialOpenings<E>>,
+    /// `None` when the AIR has no aux trace; otherwise an MMCS opening
+    /// against the shared aux MMCS (root at `MultiProof::aux_mmcs_root`).
+    pub aux_trace_polys: Option<AuxTraceOpening<E>>,
 }
 
 pub type DeepPolynomialOpenings<F, E> = Vec<DeepPolynomialOpening<F, E>>;
@@ -80,9 +113,6 @@ pub struct StarkProof<F: IsSubFieldOf<E>, E: IsField, PI> {
     /// tables stay out of the shared main-trace MMCS, so their main slice
     /// keeps its own per-table tree. `None` for non-preprocessed tables.
     pub lde_trace_main_merkle_root: Option<Commitment>,
-    // Commitments of auxiliary trace columns
-    // [tⱼ]
-    pub lde_trace_aux_merkle_root: Option<Commitment>,
     // For preprocessed tables: commitment to precomputed columns only.
     // Verifier checks this matches the hardcoded commitment from AIR.
     pub lde_trace_precomputed_merkle_root: Option<Commitment>,
@@ -119,14 +149,23 @@ pub struct StarkProof<F: IsSubFieldOf<E>, E: IsField, PI> {
 /// Non-preprocessed tables share a single main-trace MMCS authenticated by
 /// `main_mmcs_root`; `main_mmcs_spec` lists `(MatrixTag, padded_height)`
 /// per committed table in the MMCS sort order. Preprocessed tables stay
-/// out of this MMCS — each carries its own per-table Merkle root in
+/// out of the main MMCS — each carries its own per-table Merkle root in
 /// `StarkProof::lde_trace_main_merkle_root` plus the AIR-pinned
 /// precomputed root. Both groups' roots are absorbed in spec-fixed order
 /// during Phase A.
+///
+/// Aux traces (only present for AIRs with LogUp interactions) share a
+/// SECOND MMCS authenticated by `aux_mmcs_root`; `aux_mmcs_spec` lists
+/// `(MatrixTag, padded_height)` for the subset of tables that contribute
+/// aux. `aux_mmcs_root` is `None` when no table in the multi-proof has an
+/// aux trace. Domain-separated from the main MMCS via `LEAF_DOMAIN_TAG_AUX`
+/// so that no aux opening can authenticate a main leaf (or vice versa).
 #[derive(Debug, serde::Serialize, serde::Deserialize)]
 #[serde(bound = "PI: serde::Serialize + serde::de::DeserializeOwned")]
 pub struct MultiProof<F: IsSubFieldOf<E>, E: IsField, PI> {
     pub proofs: Vec<StarkProof<F, E, PI>>,
     pub main_mmcs_root: Commitment,
     pub main_mmcs_spec: Vec<(MatrixTag, usize)>,
+    pub aux_mmcs_root: Option<Commitment>,
+    pub aux_mmcs_spec: Vec<(MatrixTag, usize)>,
 }
diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs
index 6c58d5ac0..a5e2b8142 100644
--- a/crypto/stark/src/prover.rs
+++ b/crypto/stark/src/prover.rs
@@ -81,55 +81,6 @@ pub enum ProvingError {
     DiskSpill(String),
 }
 
-/// Commitment artifacts for one trace table (main or auxiliary). Used for both
-/// plain and preprocessed tables. Preprocessed tables additionally carry a
-/// separate Merkle tree over their precomputed columns, hence the optional
-/// `precomputed_tree`/`precomputed_root` pair and the `num_precomputed_cols`
-/// index used when opening positions.
-pub(crate) struct TableCommit<F: IsField>
-where
-    FieldElement<F>: AsBytes,
-{
-    /// Merkle tree over the trace columns (multiplicities only for preprocessed tables).
-    pub(crate) tree: Arc<BatchedMerkleTree<F>>,
-    /// Root of `tree`.
-    pub(crate) root: Commitment,
-    /// Preprocessed tables only: Merkle tree over precomputed columns.
-    pub(crate) precomputed_tree: Option<Arc<BatchedMerkleTree<F>>>,
-    /// Preprocessed tables only: root of `precomputed_tree`.
-    pub(crate) precomputed_root: Option<Commitment>,
-    /// Preprocessed tables only: number of precomputed columns. Zero otherwise.
-    pub(crate) num_precomputed_cols: usize,
-}
-
-impl<F: IsField> TableCommit<F>
-where
-    FieldElement<F>: AsBytes,
-{
-    /// Build a `TableCommit` for a plain (non-preprocessed) table.
-    fn plain(tree: BatchedMerkleTree<F>, root: Commitment) -> Self {
-        Self {
-            tree: Arc::new(tree),
-            root,
-            precomputed_tree: None,
-            precomputed_root: None,
-            num_precomputed_cols: 0,
-        }
-    }
-
-    /// Cheap clone. Only bumps Arc refcounts, no tree data is copied.
-    fn share(&self) -> Self {
-        Self {
-            tree: Arc::clone(&self.tree),
-            root: self.root,
-            precomputed_tree: self.precomputed_tree.as_ref().map(Arc::clone),
-            precomputed_root: self.precomputed_root,
-            num_precomputed_cols: self.num_precomputed_cols,
-        }
-    }
-
-}
-
 /// Per-table commitment artifacts for the main trace under the shared
 /// MMCS protocol. The `mmcs` Arc is the SAME instance for every table in
 /// the multi-proof — Phase A builds it once.
@@ -256,6 +207,54 @@ where
     }
 }
 
+/// Per-table aux-trace commitment under the shared aux MMCS.
+/// Mirror of [`MainCommit::Shared`]: the `mmcs` Arc is shared across every
+/// table that contributes an aux trace; `tag` + `padded_height` identify
+/// this table's slot inside that MMCS.
+pub(crate) enum AuxCommit<E: IsField>
+where
+    FieldElement<E>: AsBytes,
+{
+    Shared {
+        mmcs: Arc<Mmcs<BatchedMerkleTreeBackend<E>>>,
+        tag: MatrixTag,
+        padded_height: usize,
+    },
+}
+
+impl<E: IsField> AuxCommit<E>
+where
+    FieldElement<E>: AsBytes,
+{
+    fn share(&self) -> Self {
+        match self {
+            Self::Shared {
+                mmcs,
+                tag,
+                padded_height,
+            } => Self::Shared {
+                mmcs: Arc::clone(mmcs),
+                tag: *tag,
+                padded_height: *padded_height,
+            },
+        }
+    }
+}
+
+/// Per-table aux Phase-C output collected BEFORE the shared aux MMCS is
+/// built. `leaves` are aux-tagged Keccak digests over the committed aux-trace
+/// LDE rows. Consumed by the single `MmcsBuilder::finalize` call once
+/// every aux-bearing table has produced them.
+struct AuxPhaseCOutput<E: IsField>
+where
+    FieldElement<E>: AsBytes,
+{
+    tag: MatrixTag,
+    leaves: Vec<Commitment>,
+    _marker: PhantomData<E>,
+    padded_height: usize,
+}
+
 /// A container for the results of the first round of the STARK Prove protocol.
 pub(crate) struct Round1<Field, FieldExtension>
 where
@@ -269,7 +268,7 @@ where
     /// Commitment to the main trace (shared MMCS handle + per-table tag).
     pub(crate) main: MainCommit<Field>,
     /// Commitment to the auxiliary (RAP) trace, if any.
-    pub(crate) aux: Option<TableCommit<FieldExtension>>,
+    pub(crate) aux: Option<AuxCommit<FieldExtension>>,
     /// The challenges of the RAP round.
     pub(crate) rap_challenges: Vec<FieldElement<FieldExtension>>,
     /// Bus interaction public inputs (initial and final aux column values).
@@ -286,7 +285,7 @@ where
     FieldElement<FieldExtension>: AsBytes,
 {
     main: MainCommit<Field>,
-    aux: Option<TableCommit<FieldExtension>>,
+    aux: Option<AuxCommit<FieldExtension>>,
     rap_challenges: Vec<FieldElement<FieldExtension>>,
     bus_public_inputs: Option<BusPublicInputs<FieldExtension>>,
 }
@@ -318,7 +317,7 @@ where
         Round1 {
             lde_trace: LDETraceTable::from_columns(lde.main, lde.aux, step_size, blowup_factor),
             main: self.main.share(),
-            aux: self.aux.as_ref().map(TableCommit::share),
+            aux: self.aux.as_ref().map(AuxCommit::share),
             rap_challenges: self.rap_challenges.clone(),
             bus_public_inputs: self.bus_public_inputs.clone(),
         }
@@ -550,6 +549,82 @@ where
     Ok((root, spec, Arc::new(mmcs)))
 }
 
+/// Tagged per-row leaf digest for the AUX-trace MMCS. Mirror of
+/// [`compute_tagged_leaves_bit_reversed`] but uses the aux domain
+/// separator so aux/main leaves cannot collide.
+pub fn compute_tagged_leaves_bit_reversed_aux<E>(
+    columns: &[Vec<FieldElement<E>>],
+    tag: MatrixTag,
+) -> Vec<Commitment>
+where
+    E: IsField,
+    FieldElement<E>: AsBytes + Sync + Send + ByteConversion,
+{
+    if columns.is_empty() || columns[0].is_empty() {
+        return Vec::new();
+    }
+    let num_rows = columns[0].len();
+    let num_cols = columns.len();
+    let byte_len = <FieldElement<E> as ByteConversion>::BYTE_LEN;
+    debug_assert!(num_rows.is_power_of_two());
+    let total_bytes = num_cols * byte_len;
+    let hash_leaf =
+        |buf: &mut [u8], row_idx: usize| -> Commitment {
+            let br_idx = reverse_index(row_idx, num_rows as u64);
+            for (col_idx, col) in columns.iter().enumerate() {
+                col[br_idx]
+                    .write_bytes_be(&mut buf[col_idx * byte_len..(col_idx + 1) * byte_len]);
+            }
+            crate::mmcs_leaf::hash_tagged_row_bytes_aux(tag, buf)
+        };
+    #[cfg(feature = "parallel")]
+    {
+        (0..num_rows)
+            .into_par_iter()
+            .map_init(|| vec![0u8; total_bytes], |buf, i| hash_leaf(buf, i))
+            .collect()
+    }
+    #[cfg(not(feature = "parallel"))]
+    {
+        let mut buf = vec![0u8; total_bytes];
+        (0..num_rows).map(|i| hash_leaf(&mut buf, i)).collect()
+    }
+}
+
+/// Build the shared AUX-trace MMCS from per-table Phase-C outputs (only
+/// tables that have an aux trace participate). Returns `None`/`empty spec`
+/// when no table contributes aux.
+#[allow(clippy::type_complexity)]
+fn build_aux_mmcs<E>(
+    outputs: &[Option<AuxPhaseCOutput<E>>],
+) -> Result<
+    (
+        Option<Commitment>,
+        Vec<(MatrixTag, usize)>,
+        Option<Arc<Mmcs<BatchedMerkleTreeBackend<E>>>>,
+    ),
+    ProvingError,
+>
+where
+    E: IsField + Send + Sync,
+    FieldElement<E>: AsBytes + Send + Sync,
+{
+    let any = outputs.iter().any(|o| o.is_some());
+    if !any {
+        return Ok((None, Vec::new(), None));
+    }
+    let mut builder: MmcsBuilder<BatchedMerkleTreeBackend<E>> = MmcsBuilder::new();
+    for out in outputs.iter().flatten() {
+        builder
+            .add_matrix(out.tag, out.leaves.clone())
+            .map_err(map_mmcs_err)?;
+    }
+    let mmcs = builder.finalize().map_err(map_mmcs_err)?;
+    let root = *mmcs.root();
+    let spec = mmcs.spec();
+    Ok((Some(root), spec, Some(Arc::new(mmcs))))
+}
+
 /// Tagged per-row leaf digest for the main-trace MMCS.
 pub fn compute_tagged_leaves_bit_reversed<E>(
     columns: &[Vec<FieldElement<E>>],
@@ -1584,9 +1659,31 @@ pub trait IsStarkProver<
             );
 
             let aux_trace_polys = round_1_result.aux.as_ref().map(|aux| {
-                Self::open_polys_with(domain, &aux.tree, *index, |row| {
-                    lde_trace.gather_aux_row(row)
-                })
+                let AuxCommit::Shared { mmcs, padded_height, .. } = aux;
+                let max_height = mmcs
+                    .spec()
+                    .first()
+                    .map(|(_, h)| *h)
+                    .expect("aux MMCS spec is non-empty when aux commit exists");
+                debug_assert!(padded_height.is_power_of_two() && max_height >= *padded_height);
+                let shift = (max_height / *padded_height).trailing_zeros() as usize;
+                let domain_size = domain.lde_roots_of_unity_coset.len() as u64;
+                let primary = *index * 2;
+                let sym = *index * 2 + 1;
+                let evaluations = lde_trace.gather_aux_row(reverse_index(primary, domain_size));
+                let evaluations_sym = lde_trace.gather_aux_row(reverse_index(sym, domain_size));
+                let mmcs_opening = mmcs
+                    .open(primary << shift)
+                    .expect("aux MMCS open: prover-side primary index in range");
+                let mmcs_opening_sym = mmcs
+                    .open(sym << shift)
+                    .expect("aux MMCS open: prover-side sym index in range");
+                crate::proof::stark::AuxTraceOpening::Mmcs {
+                    evaluations,
+                    evaluations_sym,
+                    mmcs_opening,
+                    mmcs_opening_sym,
+                }
             });
 
             let (main_trace_opening, precomputed_trace_opening) = match main_commit {
@@ -1959,30 +2056,20 @@ pub trait IsStarkProver<
             heap_snaps.push(s);
         }
 
-        // Pass 2: Parallel fork transcript → extract → LDE → commit in chunks of K.
-        // Each table gets its own transcript fork.
+        // Pass 2: parallel aux-LDE + tagged-leaf computation, then a single
+        // shared aux MMCS build. The aux MMCS root is absorbed into the
+        // SHARED transcript BEFORE per-table forking, so every table's
+        // forked transcript sees the same aux MMCS commitment without
+        // dragging per-table aux roots through Fiat-Shamir.
         #[cfg(feature = "instruments")]
         let phase_start = Instant::now();
 
-        // Pre-fork all transcripts (cheap, sequential — must match verifier ordering)
-        let mut table_transcripts: Vec<_> = (0..num_airs)
-            .map(|idx| {
-                let mut t = transcript.clone();
-                if num_airs > 1 {
-                    t.append_bytes(&(idx as u64).to_le_bytes());
-                }
-                t
-            })
-            .collect();
-
-        // Parallel aux commit in chunks of K. Each entry holds the optional aux
-        // `TableCommit` (`None` when the AIR has no aux trace) and the cached
-        // aux LDE columns consumed in Phase D.
-        #[allow(clippy::type_complexity)]
-        let mut aux_results: Vec<(
-            Option<TableCommit<FieldExtension>>,
-            Vec<Vec<FieldElement<FieldExtension>>>,
-        )> = Vec::with_capacity(num_airs);
+        // Per-table aux Phase-C outputs. `None` entries are tables with no
+        // aux trace and contribute neither leaves nor an MMCS slot.
+        let mut aux_outputs: Vec<Option<AuxPhaseCOutput<FieldExtension>>> =
+            Vec::with_capacity(num_airs);
+        let mut aux_ldes: Vec<Vec<Vec<FieldElement<FieldExtension>>>> =
+            Vec::with_capacity(num_airs);
 
         for chunk_start in (0..num_airs).step_by(k) {
             let chunk_end = (chunk_start + k).min(num_airs);
@@ -1998,6 +2085,7 @@ pub trait IsStarkProver<
                     let (air, trace, _) = &air_trace_pairs[idx];
                     let domain = &domains[idx];
                     let twiddles = &twiddle_caches[idx];
+                    let tag = main_tags[idx];
 
                     if air.has_aux_trace() {
                         let lde_size = domain.interpolation_domain_size * domain.blowup_factor;
@@ -2017,35 +2105,81 @@ pub trait IsStarkProver<
                         let aux_lde_dur = t_sub.elapsed();
                         #[cfg(feature = "instruments")]
                         let t_sub = Instant::now();
-                        #[allow(unused_mut)]
-                        let (mut tree, root) = Self::commit_columns_bit_reversed(&columns)
-                            .ok_or(ProvingError::EmptyCommitment)?;
+                        let leaves =
+                            compute_tagged_leaves_bit_reversed_aux::<FieldExtension>(&columns, tag);
+                        if leaves.is_empty() {
+                            return Err(ProvingError::EmptyCommitment);
+                        }
+                        let padded_height = leaves.len();
                         #[cfg(feature = "instruments")]
                         crate::instruments::accum_r1_aux(aux_lde_dur, t_sub.elapsed());
-
-                        #[cfg(feature = "disk-spill")]
-                        if storage_mode == StorageMode::Disk {
-                            tree.spill_nodes_to_disk().map_err(|e| {
-                                ProvingError::DiskSpill(format!("aux Merkle tree: {e}"))
-                            })?;
-                        }
-                        Ok((Some(TableCommit::plain(tree, root)), columns))
+                        let output = AuxPhaseCOutput::<FieldExtension> {
+                            tag,
+                            leaves,
+                            padded_height,
+                            _marker: PhantomData,
+                        };
+                        Ok((Some(output), columns))
                     } else {
                         Ok((None, Vec::new()))
                     }
                 })
                 .collect();
 
-            // Sequential: append aux roots to forked transcripts
-            for (j, result) in chunk_aux.into_iter().enumerate() {
-                let (aux_commit, cached_aux) = result?;
-                if let Some(ref c) = aux_commit {
-                    table_transcripts[chunk_start + j].append_bytes(&c.root);
-                }
-                aux_results.push((aux_commit, cached_aux));
+            for result in chunk_aux {
+                let (output, cached_aux) = result?;
+                aux_outputs.push(output);
+                aux_ldes.push(cached_aux);
             }
         }
 
+        // Build the shared aux MMCS over the non-None entries. Order is
+        // spec-fixed (matches `main_tags` order, filtered to has-aux).
+        let (aux_mmcs_root_opt, aux_mmcs_spec, aux_mmcs_arc) =
+            build_aux_mmcs::<FieldExtension>(&aux_outputs)?;
+
+        // Absorb the aux MMCS root into the SHARED transcript before
+        // forking — every table's fork inherits this binding identically.
+        if let Some(ref root) = aux_mmcs_root_opt {
+            transcript.append_bytes(root);
+        }
+
+        // Pre-fork all transcripts (cheap, sequential — must match verifier ordering).
+        // Happens AFTER aux MMCS absorb so each fork inherits the binding.
+        let mut table_transcripts: Vec<_> = (0..num_airs)
+            .map(|idx| {
+                let mut t = transcript.clone();
+                if num_airs > 1 {
+                    t.append_bytes(&(idx as u64).to_le_bytes());
+                }
+                t
+            })
+            .collect();
+
+        // Reassemble per-table aux commits from the shared MMCS Arc.
+        let aux_commits: Vec<Option<AuxCommit<FieldExtension>>> = aux_outputs
+            .into_iter()
+            .map(|o| {
+                o.map(|out| AuxCommit::Shared {
+                    mmcs: Arc::clone(
+                        aux_mmcs_arc
+                            .as_ref()
+                            .expect("MMCS Arc populated when at least one aux output present"),
+                    ),
+                    tag: out.tag,
+                    padded_height: out.padded_height,
+                })
+            })
+            .collect();
+        #[allow(clippy::type_complexity)]
+        let aux_results: Vec<(
+            Option<AuxCommit<FieldExtension>>,
+            Vec<Vec<FieldElement<FieldExtension>>>,
+        )> = aux_commits
+            .into_iter()
+            .zip(aux_ldes)
+            .collect();
+
         // Build commitments and cached LDEs as separate vecs:
         // commitments are borrowed in Phase D, LDEs are consumed by value.
         let mut commitments: Vec<Round1Commitments<Field, FieldExtension>> =
@@ -2196,6 +2330,8 @@ pub trait IsStarkProver<
             proofs,
             main_mmcs_root,
             main_mmcs_spec,
+            aux_mmcs_root: aux_mmcs_root_opt,
+            aux_mmcs_spec,
         })
     }
 
@@ -2364,8 +2500,6 @@ pub trait IsStarkProver<
             // For preprocessed tables: per-table Merkle root over multiplicities
             // (preprocessed tables stay out of the shared main-trace MMCS).
             lde_trace_main_merkle_root: round_1_result.main.main_tree_root(),
-            // [t]
-            lde_trace_aux_merkle_root: round_1_result.aux.as_ref().map(|x| x.root),
             // For preprocessed tables: commitment to precomputed columns only
             lde_trace_precomputed_merkle_root: round_1_result.main.precomputed_root(),
             // tⱼ(zgᵏ)
diff --git a/crypto/stark/src/verifier.rs b/crypto/stark/src/verifier.rs
index 31ccbb3cb..569221ce0 100644
--- a/crypto/stark/src/verifier.rs
+++ b/crypto/stark/src/verifier.rs
@@ -340,9 +340,11 @@ pub trait IsStarkVerifier<
             )
     }
 
-    /// Verify the main MMCS opening + precomputed/aux Merkle openings at FRI
-    /// challenge `iota`. `main_tag`, `main_mmcs_root`, `main_mmcs_spec` come
-    /// from the surrounding multi-proof.
+    /// Verify the main MMCS opening + precomputed + aux openings at FRI
+    /// challenge `iota`. `main_*` and `aux_*` come from the surrounding
+    /// multi-proof. Aux is `None` when no AIR in the multi-proof has an
+    /// aux trace.
+    #[allow(clippy::too_many_arguments)]
     fn verify_trace_openings(
         proof: &StarkProof<Field, FieldExtension, PI>,
         deep_poly_openings: &DeepPolynomialOpening<Field, FieldExtension>,
@@ -350,10 +352,12 @@ pub trait IsStarkVerifier<
         main_tag: crypto::merkle_tree::mmcs::MatrixTag,
         main_mmcs_root: &Commitment,
         main_mmcs_spec: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)],
+        aux_mmcs_root: Option<&Commitment>,
+        aux_mmcs_spec: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)],
     ) -> bool
     where
         FieldElement<Field>: AsBytes + Sync + Send + math::traits::ByteConversion,
-        FieldElement<FieldExtension>: AsBytes + Sync + Send,
+        FieldElement<FieldExtension>: AsBytes + Sync + Send + math::traits::ByteConversion,
     {
         use crate::proof::stark::MainTraceOpening;
         let main_ok = match &deep_poly_openings.main_trace_polys {
@@ -383,16 +387,14 @@ pub trait IsStarkVerifier<
             _ => false,
         };
 
-        // Auxiliary trace.
-        ok &= match (
-            proof.lde_trace_aux_merkle_root,
-            &deep_poly_openings.aux_trace_polys,
-        ) {
-            (Some(root), Some(opening)) => {
-                Self::verify_opening_pair::<FieldExtension>(opening, &root, iota)
-            }
-            (None, None) => true,
-            _ => false,
+        // Auxiliary trace: shared MMCS opening for tables with aux, or
+        // None when this AIR has no aux at all.
+        ok &= match (&deep_poly_openings.aux_trace_polys, aux_mmcs_root) {
+            (Some(opening), Some(root)) => verify_aux_mmcs_pair_inner::<FieldExtension>(
+                opening, iota, main_tag, root, aux_mmcs_spec,
+            ),
+            (None, _) => true,
+            (Some(_), None) => false,
         };
 
         ok
@@ -445,16 +447,19 @@ pub trait IsStarkVerifier<
     /// Verifies the validity of the purported values of the trace polynomials and the composition polynomial
     /// parts at the domain elements and their symmetric counterparts corresponding to all the FRI query
     /// index challenges.
+    #[allow(clippy::too_many_arguments)]
     fn step_4_verify_trace_and_composition_openings(
         proof: &StarkProof<Field, FieldExtension, PI>,
         challenges: &Challenges<FieldExtension>,
         main_tag: crypto::merkle_tree::mmcs::MatrixTag,
         main_mmcs_root: &Commitment,
         main_mmcs_spec: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)],
+        aux_mmcs_root: Option<&Commitment>,
+        aux_mmcs_spec: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)],
     ) -> bool
     where
         FieldElement<Field>: AsBytes + Sync + Send + math::traits::ByteConversion,
-        FieldElement<FieldExtension>: AsBytes + Sync + Send,
+        FieldElement<FieldExtension>: AsBytes + Sync + Send + math::traits::ByteConversion,
     {
         challenges
             .iotas
@@ -472,6 +477,8 @@ pub trait IsStarkVerifier<
                     main_tag,
                     main_mmcs_root,
                     main_mmcs_spec,
+                    aux_mmcs_root,
+                    aux_mmcs_spec,
                 )
             })
     }
@@ -618,7 +625,7 @@ pub trait IsStarkVerifier<
             let lde_aux: &[FieldElement<FieldExtension>] = opening
                 .aux_trace_polys
                 .as_ref()
-                .map(|a| a.evaluations.as_slice())
+                .map(|a| a.evaluations())
                 .unwrap_or(&[]);
 
             let evaluation_point = Self::query_challenge_to_evaluation_point(*iota, false, domain);
@@ -642,7 +649,7 @@ pub trait IsStarkVerifier<
             let lde_aux_sym: &[FieldElement<FieldExtension>] = opening
                 .aux_trace_polys
                 .as_ref()
-                .map(|a| a.evaluations_sym.as_slice())
+                .map(|a| a.evaluations_sym())
                 .unwrap_or(&[]);
 
             let evaluation_point = Self::query_challenge_to_evaluation_point(*iota, true, domain);
@@ -765,7 +772,7 @@ pub trait IsStarkVerifier<
     ) -> bool
     where
         FieldElement<Field>: AsBytes + Sync + Send + math::traits::ByteConversion,
-        FieldElement<FieldExtension>: AsBytes + Sync + Send,
+        FieldElement<FieldExtension>: AsBytes + Sync + Send + math::traits::ByteConversion,
     {
         if airs.len() != multi_proof.proofs.len() {
             error!(
@@ -893,11 +900,48 @@ pub trait IsStarkVerifier<
         }
 
         // =====================================================================
-        // Phase C + Rounds 2-4: Forked per table
+        // Phase C: validate + absorb the shared aux MMCS root (if any)
         // =====================================================================
-        // Each table gets an independent transcript fork (cloned from the shared
-        // state after Phase B, domain-separated by table index). This matches
-        // the prover's forking and makes per-table verification independent.
+        // The aux MMCS lives at multi-proof level: a single absorb into the
+        // SHARED transcript replaces the per-table aux root absorb of the
+        // pre-MMCS protocol. Verify the spec mirrors the prover-side
+        // filtered-by-has_aux_trace order before binding.
+        let mut expected_aux_spec: Vec<(crypto::merkle_tree::mmcs::MatrixTag, usize)> =
+            Vec::new();
+        for (idx, (air, proof)) in airs.iter().zip(&multi_proof.proofs).enumerate() {
+            if air.has_aux_trace() {
+                let lde_size = proof.trace_length * (air.options().blowup_factor as usize);
+                expected_aux_spec.push((main_tags[idx], lde_size));
+            }
+        }
+        expected_aux_spec.sort_by(|a, b| b.1.cmp(&a.1).then(a.0.cmp(&b.0)));
+        if expected_aux_spec != multi_proof.aux_mmcs_spec {
+            error!(
+                "aux_mmcs_spec mismatch: expected {:?}, got {:?}",
+                expected_aux_spec, multi_proof.aux_mmcs_spec,
+            );
+            return false;
+        }
+        match (&multi_proof.aux_mmcs_root, expected_aux_spec.is_empty()) {
+            (Some(root), false) => transcript.append_bytes(root),
+            (None, true) => {}
+            (Some(_), true) => {
+                error!("aux_mmcs_root present but no AIR has an aux trace");
+                return false;
+            }
+            (None, false) => {
+                error!("aux_mmcs_root missing but some AIR has an aux trace");
+                return false;
+            }
+        }
+
+        // =====================================================================
+        // Rounds 2-4: Forked per table
+        // =====================================================================
+        // Each table gets an independent transcript fork (cloned from the
+        // shared state after the aux MMCS absorb above, domain-separated by
+        // table index). This matches the prover's forking and makes
+        // per-table verification independent.
 
         for (idx, (air, proof)) in airs.iter().zip(&multi_proof.proofs).enumerate() {
             // Must match prover: fork with domain separator for multi-table,
@@ -908,11 +952,6 @@ pub trait IsStarkVerifier<
                 table_transcript.append_bytes(&(idx as u64).to_le_bytes());
             }
 
-            // Phase C: replay aux commitment
-            if let Some(root) = proof.lde_trace_aux_merkle_root {
-                table_transcript.append_bytes(&root);
-            }
-
             // Bind table_contribution (L) to transcript, matching prover.
             if let Some(ref bpi) = proof.bus_public_inputs {
                 table_transcript.append_field_element(&bpi.table_contribution);
@@ -927,6 +966,8 @@ pub trait IsStarkVerifier<
                 main_tags[idx],
                 &multi_proof.main_mmcs_root,
                 &multi_proof.main_mmcs_spec,
+                multi_proof.aux_mmcs_root.as_ref(),
+                &multi_proof.aux_mmcs_spec,
             ) {
                 error!(
                     "Table {} failed verify_rounds_2_to_4 (num_constraints={}, trace_cols={})",
@@ -982,7 +1023,7 @@ pub trait IsStarkVerifier<
     ) -> bool
     where
         FieldElement<Field>: AsBytes + Sync + Send + math::traits::ByteConversion,
-        FieldElement<FieldExtension>: AsBytes + Sync + Send,
+        FieldElement<FieldExtension>: AsBytes + Sync + Send + math::traits::ByteConversion,
         PI: Clone,
     {
         let main_tags = [crypto::merkle_tree::mmcs::MatrixTag::new([0; 8])];
@@ -1133,9 +1174,8 @@ pub trait IsStarkVerifier<
 
     /// Verifies a single table after round 1 has been replayed.
     ///
-    /// `main_tag`, `main_mmcs_root`, `main_mmcs_spec` come from the shared
-    /// multi-proof and are needed to authenticate the per-table main-trace
-    /// openings in step 4.
+    /// `main_*` / `aux_*` come from the shared multi-proof and authenticate
+    /// the per-table trace openings in step 4.
     #[allow(clippy::too_many_arguments)]
     fn verify_rounds_2_to_4(
         air: &dyn AIR<Field = Field, FieldExtension = FieldExtension, PublicInputs = PI>,
@@ -1145,10 +1185,12 @@ pub trait IsStarkVerifier<
         main_tag: crypto::merkle_tree::mmcs::MatrixTag,
         main_mmcs_root: &Commitment,
         main_mmcs_spec: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)],
+        aux_mmcs_root: Option<&Commitment>,
+        aux_mmcs_spec: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)],
     ) -> bool
     where
         FieldElement<Field>: AsBytes + Sync + Send + math::traits::ByteConversion,
-        FieldElement<FieldExtension>: AsBytes + Sync + Send,
+        FieldElement<FieldExtension>: AsBytes + Sync + Send + math::traits::ByteConversion,
     {
         let domain = new_verifier_domain(air, proof.trace_length);
 
@@ -1227,6 +1269,8 @@ pub trait IsStarkVerifier<
             main_tag,
             main_mmcs_root,
             main_mmcs_spec,
+            aux_mmcs_root,
+            aux_mmcs_spec,
         ) {
             #[cfg(not(feature = "test_fiat_shamir"))]
             error!("DEEP Composition Polynomial verification failed");
@@ -1314,3 +1358,63 @@ where
         mmcs_opening_sym.verify::<BatchedMerkleTreeBackend<F>>(main_mmcs_root, main_mmcs_spec);
     ok && ok_sym
 }
+
+/// Aux-trace counterpart of [`verify_main_mmcs_pair_inner`]. Same shape,
+/// but rehashes the row using the AUX domain separator so an aux opening
+/// cannot authenticate a main leaf (or vice versa).
+fn verify_aux_mmcs_pair_inner<E>(
+    aux_opening: &crate::proof::stark::AuxTraceOpening<E>,
+    iota: usize,
+    main_tag: crypto::merkle_tree::mmcs::MatrixTag,
+    aux_mmcs_root: &Commitment,
+    aux_mmcs_spec: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)],
+) -> bool
+where
+    E: IsField,
+    FieldElement<E>: AsBytes + Sync + Send + math::traits::ByteConversion,
+{
+    use crate::mmcs_leaf::hash_tagged_row_aux;
+    use crate::proof::stark::AuxTraceOpening;
+    let AuxTraceOpening::Mmcs {
+        evaluations,
+        evaluations_sym,
+        mmcs_opening,
+        mmcs_opening_sym,
+    } = aux_opening;
+
+    let table_idx = match aux_mmcs_spec.iter().position(|(t, _)| *t == main_tag) {
+        Some(i) => i,
+        None => return false,
+    };
+    let table_height = aux_mmcs_spec[table_idx].1;
+    let max_height = match aux_mmcs_spec.first().map(|(_, h)| *h) {
+        Some(h) => h,
+        None => return false,
+    };
+    if !table_height.is_power_of_two() || max_height < table_height {
+        return false;
+    }
+    let shift = (max_height / table_height).trailing_zeros() as usize;
+    let g_primary = (iota * 2) << shift;
+    let g_sym = (iota * 2 + 1) << shift;
+    let leaf_primary = hash_tagged_row_aux::<E>(main_tag, evaluations);
+    let leaf_sym = hash_tagged_row_aux::<E>(main_tag, evaluations_sym);
+    if mmcs_opening.global_index != g_primary || mmcs_opening_sym.global_index != g_sym {
+        return false;
+    }
+    let leaves = &mmcs_opening.matrix_leaves;
+    let leaves_sym = &mmcs_opening_sym.matrix_leaves;
+    if table_idx >= leaves.len() || table_idx >= leaves_sym.len() {
+        return false;
+    }
+    if leaves[table_idx].0 != main_tag || leaves[table_idx].1 != leaf_primary {
+        return false;
+    }
+    if leaves_sym[table_idx].0 != main_tag || leaves_sym[table_idx].1 != leaf_sym {
+        return false;
+    }
+    let ok = mmcs_opening.verify::<BatchedMerkleTreeBackend<E>>(aux_mmcs_root, aux_mmcs_spec);
+    let ok_sym =
+        mmcs_opening_sym.verify::<BatchedMerkleTreeBackend<E>>(aux_mmcs_root, aux_mmcs_spec);
+    ok && ok_sym
+}

From d82d9abf9a0b02e9a7d72fd0600452906b1c0740 Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Wed, 27 May 2026 15:30:00 -0300
Subject: [PATCH 14/21] test(stark/mmcs): per-vector soundness tests for the
 shared AUX MMCS path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

9 tests covering the aux MMCS attack surface (mirror of the existing
main-MMCS suite). All use a baseline two-table multi-proof over two
`LogReadOnlyRAP` AIRs so both tables contribute aux and therefore both
participate in the shared MMCS.

  Field tampered                                  Detection mechanism
  -----------------------------------------------------------------------
  multi_proof.aux_mmcs_root[0]                    transcript divergence
  multi_proof.aux_mmcs_root = None                root presence/spec check
  multi_proof.aux_mmcs_spec[0].1 (height)         reproduced-spec check
  multi_proof.aux_mmcs_spec[0].0 (tag)            reproduced-spec check
  mmcs_opening.matrix_leaves[idx].1 (digest)      rehash-vs-leaf check
  mmcs_opening.global_index                       g_primary/g_sym match
  mmcs_opening.siblings[0][0]                     MmcsOpening::verify
  evaluations[0] += 1                             rehash mismatch
  (baseline test)                                 verifies cleanly

Locks behaviour for the entire aux MMCS path — root absorb, spec sort,
leaf rehash with the aux domain separator, and Merkle authentication.
---
 .../src/tests/mmcs_aux_soundness_tests.rs     | 210 ++++++++++++++++++
 crypto/stark/src/tests/mod.rs                 |   1 +
 2 files changed, 211 insertions(+)
 create mode 100644 crypto/stark/src/tests/mmcs_aux_soundness_tests.rs

diff --git a/crypto/stark/src/tests/mmcs_aux_soundness_tests.rs b/crypto/stark/src/tests/mmcs_aux_soundness_tests.rs
new file mode 100644
index 000000000..d01d4a924
--- /dev/null
+++ b/crypto/stark/src/tests/mmcs_aux_soundness_tests.rs
@@ -0,0 +1,210 @@
+//! Soundness tests for the shared AUX-trace MMCS path (mirror of
+//! `mmcs_soundness_tests.rs`). Uses two `LogReadOnlyRAP` AIRs so both
+//! tables have an aux trace and therefore both participate in the shared
+//! aux MMCS — the only path that produces `AuxTraceOpening::Mmcs` data.
+//!
+//! Each test tampers with a single field on the aux MMCS path and
+//! asserts the verifier rejects.
+
+use crypto::fiat_shamir::default_transcript::DefaultTranscript;
+use crypto::merkle_tree::mmcs::MatrixTag;
+use math::field::{
+    element::FieldElement, extensions_goldilocks::Degree3GoldilocksExtensionField,
+    goldilocks::GoldilocksField,
+};
+
+use crate::examples::read_only_memory_logup::{
+    LogReadOnlyPublicInputs, LogReadOnlyRAP, read_only_logup_trace,
+};
+use crate::proof::options::ProofOptions;
+use crate::proof::stark::{AuxTraceOpening, MultiProof};
+use crate::test_utils::{multi_prove_ram, multi_verify_ram};
+use crate::traits::AIR;
+
+type F = GoldilocksField;
+type E = Degree3GoldilocksExtensionField;
+
+#[allow(clippy::type_complexity)]
+fn baseline_proof() -> (
+    LogReadOnlyRAP<F, E>,
+    LogReadOnlyRAP<F, E>,
+    MultiProof<F, E, LogReadOnlyPublicInputs<F>>,
+) {
+    let proof_options = ProofOptions::default_test_options();
+    let air_1 = LogReadOnlyRAP::<F, E>::new(&proof_options);
+    let air_2 = LogReadOnlyRAP::<F, E>::new(&proof_options);
+
+    let address_col_1 = vec![
+        FieldElement::<F>::from(3),
+        FieldElement::<F>::from(2),
+        FieldElement::<F>::from(2),
+        FieldElement::<F>::from(3),
+        FieldElement::<F>::from(4),
+        FieldElement::<F>::from(5),
+        FieldElement::<F>::from(1),
+        FieldElement::<F>::from(3),
+    ];
+    let value_col_1 = vec![
+        FieldElement::<F>::from(30),
+        FieldElement::<F>::from(20),
+        FieldElement::<F>::from(20),
+        FieldElement::<F>::from(30),
+        FieldElement::<F>::from(40),
+        FieldElement::<F>::from(50),
+        FieldElement::<F>::from(10),
+        FieldElement::<F>::from(30),
+    ];
+    let address_col_2 = vec![
+        FieldElement::<F>::from(15),
+        FieldElement::<F>::from(12),
+        FieldElement::<F>::from(17),
+        FieldElement::<F>::from(10),
+        FieldElement::<F>::from(14),
+        FieldElement::<F>::from(11),
+        FieldElement::<F>::from(16),
+        FieldElement::<F>::from(13),
+    ];
+    let value_col_2 = vec![
+        FieldElement::<F>::from(150),
+        FieldElement::<F>::from(120),
+        FieldElement::<F>::from(170),
+        FieldElement::<F>::from(100),
+        FieldElement::<F>::from(140),
+        FieldElement::<F>::from(110),
+        FieldElement::<F>::from(160),
+        FieldElement::<F>::from(130),
+    ];
+    let pub_inputs_1 = LogReadOnlyPublicInputs {
+        a0: FieldElement::<F>::from(3),
+        v0: FieldElement::<F>::from(30),
+        a_sorted_0: FieldElement::<F>::from(1),
+        v_sorted_0: FieldElement::<F>::from(10),
+        m0: FieldElement::<F>::from(1),
+    };
+    let pub_inputs_2 = LogReadOnlyPublicInputs {
+        a0: FieldElement::<F>::from(15),
+        v0: FieldElement::<F>::from(150),
+        a_sorted_0: FieldElement::<F>::from(10),
+        v_sorted_0: FieldElement::<F>::from(100),
+        m0: FieldElement::<F>::from(1),
+    };
+
+    let mut trace_1 = read_only_logup_trace(address_col_1, value_col_1);
+    let mut trace_2 = read_only_logup_trace(address_col_2, value_col_2);
+    let air_trace_pairs: Vec<(
+        &dyn AIR<Field = F, FieldExtension = E, PublicInputs = LogReadOnlyPublicInputs<F>>,
+        &mut _,
+        &_,
+    )> = vec![
+        (&air_1, &mut trace_1, &pub_inputs_1),
+        (&air_2, &mut trace_2, &pub_inputs_2),
+    ];
+    let proof =
+        multi_prove_ram(air_trace_pairs, &mut DefaultTranscript::<E>::new(&[])).expect("prove");
+    (air_1, air_2, proof)
+}
+
+fn verify(
+    airs: &[&dyn AIR<Field = F, FieldExtension = E, PublicInputs = LogReadOnlyPublicInputs<F>>],
+    proof: &MultiProof<F, E, LogReadOnlyPublicInputs<F>>,
+) -> bool {
+    multi_verify_ram(airs, proof, &mut DefaultTranscript::<E>::new(&[]), &FieldElement::zero())
+}
+
+fn first_aux_mmcs_opening_mut(
+    proof: &mut MultiProof<F, E, LogReadOnlyPublicInputs<F>>,
+) -> &mut AuxTraceOpening<E> {
+    proof.proofs[0].deep_poly_openings[0]
+        .aux_trace_polys
+        .as_mut()
+        .expect("baseline must have aux openings")
+}
+
+#[test_log::test]
+fn baseline_two_rap_tables_verify() {
+    let (air_1, air_2, proof) = baseline_proof();
+    assert!(proof.aux_mmcs_root.is_some(), "aux MMCS must be present");
+    assert_eq!(proof.aux_mmcs_spec.len(), 2, "both AIRs contribute aux");
+    let airs: Vec<&dyn AIR<Field = F, FieldExtension = E, PublicInputs = LogReadOnlyPublicInputs<F>>> =
+        vec![&air_1, &air_2];
+    assert!(verify(&airs, &proof), "baseline aux proof must verify");
+}
+
+#[test_log::test]
+fn tampered_aux_mmcs_root_rejected() {
+    let (air_1, air_2, mut proof) = baseline_proof();
+    let airs: Vec<&dyn AIR<Field = F, FieldExtension = E, PublicInputs = LogReadOnlyPublicInputs<F>>> =
+        vec![&air_1, &air_2];
+    let root = proof.aux_mmcs_root.as_mut().expect("baseline has root");
+    root[0] ^= 1;
+    assert!(!verify(&airs, &proof));
+}
+
+#[test_log::test]
+fn missing_aux_mmcs_root_rejected() {
+    let (air_1, air_2, mut proof) = baseline_proof();
+    let airs: Vec<&dyn AIR<Field = F, FieldExtension = E, PublicInputs = LogReadOnlyPublicInputs<F>>> =
+        vec![&air_1, &air_2];
+    proof.aux_mmcs_root = None;
+    assert!(!verify(&airs, &proof));
+}
+
+#[test_log::test]
+fn tampered_aux_mmcs_spec_height_rejected() {
+    let (air_1, air_2, mut proof) = baseline_proof();
+    let airs: Vec<&dyn AIR<Field = F, FieldExtension = E, PublicInputs = LogReadOnlyPublicInputs<F>>> =
+        vec![&air_1, &air_2];
+    proof.aux_mmcs_spec[0].1 /= 2;
+    assert!(!verify(&airs, &proof));
+}
+
+#[test_log::test]
+fn tampered_aux_mmcs_spec_tag_rejected() {
+    let (air_1, air_2, mut proof) = baseline_proof();
+    let airs: Vec<&dyn AIR<Field = F, FieldExtension = E, PublicInputs = LogReadOnlyPublicInputs<F>>> =
+        vec![&air_1, &air_2];
+    proof.aux_mmcs_spec[0].0 = MatrixTag::new([0xFF; 8]);
+    assert!(!verify(&airs, &proof));
+}
+
+#[test_log::test]
+fn tampered_aux_mmcs_opening_leaf_rejected() {
+    let (air_1, air_2, mut proof) = baseline_proof();
+    let airs: Vec<&dyn AIR<Field = F, FieldExtension = E, PublicInputs = LogReadOnlyPublicInputs<F>>> =
+        vec![&air_1, &air_2];
+    let AuxTraceOpening::Mmcs { mmcs_opening, .. } = first_aux_mmcs_opening_mut(&mut proof);
+    mmcs_opening.matrix_leaves[0].1[0] ^= 1;
+    assert!(!verify(&airs, &proof));
+}
+
+#[test_log::test]
+fn tampered_aux_mmcs_opening_global_index_rejected() {
+    let (air_1, air_2, mut proof) = baseline_proof();
+    let airs: Vec<&dyn AIR<Field = F, FieldExtension = E, PublicInputs = LogReadOnlyPublicInputs<F>>> =
+        vec![&air_1, &air_2];
+    let AuxTraceOpening::Mmcs { mmcs_opening, .. } = first_aux_mmcs_opening_mut(&mut proof);
+    mmcs_opening.global_index ^= 0b10;
+    assert!(!verify(&airs, &proof));
+}
+
+#[test_log::test]
+fn tampered_aux_mmcs_opening_sibling_rejected() {
+    let (air_1, air_2, mut proof) = baseline_proof();
+    let airs: Vec<&dyn AIR<Field = F, FieldExtension = E, PublicInputs = LogReadOnlyPublicInputs<F>>> =
+        vec![&air_1, &air_2];
+    let AuxTraceOpening::Mmcs { mmcs_opening, .. } = first_aux_mmcs_opening_mut(&mut proof);
+    assert!(!mmcs_opening.siblings.is_empty());
+    mmcs_opening.siblings[0][0] ^= 1;
+    assert!(!verify(&airs, &proof));
+}
+
+#[test_log::test]
+fn tampered_aux_evaluations_rejected() {
+    let (air_1, air_2, mut proof) = baseline_proof();
+    let airs: Vec<&dyn AIR<Field = F, FieldExtension = E, PublicInputs = LogReadOnlyPublicInputs<F>>> =
+        vec![&air_1, &air_2];
+    let AuxTraceOpening::Mmcs { evaluations, .. } = first_aux_mmcs_opening_mut(&mut proof);
+    assert!(!evaluations.is_empty());
+    evaluations[0] += FieldElement::<E>::one();
+    assert!(!verify(&airs, &proof));
+}
diff --git a/crypto/stark/src/tests/mod.rs b/crypto/stark/src/tests/mod.rs
index f44c65ee9..b42b2abd9 100644
--- a/crypto/stark/src/tests/mod.rs
+++ b/crypto/stark/src/tests/mod.rs
@@ -2,6 +2,7 @@ pub mod air_tests;
 pub mod bus_tests;
 pub mod domain_cache_stats;
 pub mod fri_tests;
+pub mod mmcs_aux_soundness_tests;
 pub mod mmcs_soundness_tests;
 pub mod proof_options_tests;
 pub mod prove_verify_roundtrip_tests;

From dd742270e25f54ab4388faf8dc365569f1f7d780 Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Wed, 27 May 2026 23:53:34 -0300
Subject: [PATCH 15/21] =?UTF-8?q?feat(crypto/mmcs):=20StreamingMmcsBuilder?=
 =?UTF-8?q?=20=E2=80=94=20fold=20max-height=20chips=20incrementally?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a streaming-build variant of `MmcsBuilder` that folds per-chip
leaves at the MAX height into a single shared running layer-0 as they
arrive, instead of holding every max-height chip's leaf vector alive
simultaneously. The other heights still need per-chip leaves stored
until `finalize` — see "Why max-height only?" below.

Equivalence guarantee: identical root + spec + layer bytes to
`MmcsBuilder` for the same input set (locked by 4 round-trip tests
against the one-shot path, plus order-violation rejection tests).

# Why "max-height only"?

MMCS layer-0 at max height is built by left-folding every chip's leaves
at row `i`. With no left-anchor to compose with, the running fold
`acc = hash(acc, chip_k[i])` is mathematically equivalent to the
one-shot `hash(hash(hash(chip_0[i], chip_1[i]), chip_2[i]), ...)`.

For chips at heights BELOW max, the MMCS injection rule is
`next[i] = hash(hash(hash(next[i], chip_0[i]), chip_1[i]), ...)`, which
mixes the upward-compressed `next[i]` into the left-fold. Keccak (and
any non-associative hash) makes it impossible to pre-fold the chips
into a single summary and inject that summary later — the resulting
digest would differ from the one-shot builder, breaking verifier
compatibility. So we keep per-chip leaves for non-max heights and
inject them in left-fold order at `finalize`.

# Add order

Callers MUST add matrices in (height desc, tag asc) order — the exact
sort `MmcsBuilder::finalize` does internally. Out-of-order calls return
`MmcsError::OutOfOrder`. Same-height tags must strictly ascend;
duplicate tags at different heights still trip `DuplicateTag`.

# Memory / wire-format

- Peak savings vs one-shot: `(num_max_height_chips - 1) * max_height * node_size`
  worth of per-chip leaf storage that is folded immediately and dropped.
  For lambda-vm with ~5 chips at 2^20 that is ~128 MB of transient
  Vec<Commitment>.
- Wire format unchanged: the produced `Mmcs` has the same `root()` and
  `spec()`. Layers Vec matches one-shot byte-for-byte.
- Per-chip `leaf_digests` is empty on the streaming output — `Mmcs::open`
  is therefore unavailable until callers supply leaves at open time.
  This is the prover wire-up that comes next.

# MmcsMatrix change

`MmcsMatrix.padded_height` is now a stored field instead of reading
`leaf_digests.len()`, so `spec()` reports the right value even when
the streaming builder has emptied per-chip leaves.

Tests: 9 new tests on top of the prior 17 MMCS tests; 26 total green.
---
 crypto/crypto/src/merkle_tree/mmcs.rs | 365 +++++++++++++++++++++++++-
 1 file changed, 363 insertions(+), 2 deletions(-)

diff --git a/crypto/crypto/src/merkle_tree/mmcs.rs b/crypto/crypto/src/merkle_tree/mmcs.rs
index 8bbd8607f..fac4e6b2a 100644
--- a/crypto/crypto/src/merkle_tree/mmcs.rs
+++ b/crypto/crypto/src/merkle_tree/mmcs.rs
@@ -47,16 +47,30 @@ pub enum MmcsError {
     NotPowerOfTwo,
     Empty,
     IndexOutOfBounds,
+    /// Returned by [`StreamingMmcsBuilder::add_matrix`] when the caller
+    /// supplies a `(height, tag)` pair that violates the required
+    /// (height desc, tag asc) insertion order.
+    OutOfOrder,
 }
 
 struct MmcsMatrix<N> {
     tag: MatrixTag,
+    /// Source row hashes. Populated by the one-shot [`MmcsBuilder`] and
+    /// consulted by [`Mmcs::open`] to fill the per-matrix leaf in an
+    /// opening. Empty when the Mmcs was produced by [`StreamingMmcsBuilder`]
+    /// (which discards per-chip leaves as it folds them), in which case
+    /// `Mmcs::open` is unavailable but `root()` / `spec()` still work.
     leaf_digests: Vec<N>,
+    /// Padded height (= leaf_digests.len() for one-shot, or the height
+    /// recorded at insertion time for streaming). Carried separately so
+    /// `padded_height()` reports the right value when `leaf_digests` is
+    /// empty.
+    padded_height: usize,
 }
 
 impl<N> MmcsMatrix<N> {
     fn padded_height(&self) -> usize {
-        self.leaf_digests.len()
+        self.padded_height
     }
 }
 
@@ -94,7 +108,12 @@ impl<B: IsMerkleTreeBackend> MmcsBuilder<B> {
         if !leaf_digests.len().is_power_of_two() {
             return Err(MmcsError::NotPowerOfTwo);
         }
-        self.matrices.push(MmcsMatrix { tag, leaf_digests });
+        let padded_height = leaf_digests.len();
+        self.matrices.push(MmcsMatrix {
+            tag,
+            leaf_digests,
+            padded_height,
+        });
         Ok(())
     }
 
@@ -155,6 +174,230 @@ impl<B: IsMerkleTreeBackend> MmcsBuilder<B> {
     }
 }
 
+/// Streaming MMCS builder. Equivalent to [`MmcsBuilder`] in output
+/// (identical root + spec + opening *root* bytes for the same input set)
+/// but folds per-chip leaves at the MAX height into a single shared
+/// running layer-0 as they arrive, instead of holding every max-height
+/// chip's leaf vector alive simultaneously.
+///
+/// # Why "max height only"?
+///
+/// MMCS layer-0 at the max height is built by left-folding every chip's
+/// leaves at row `i`. With no left-anchor to compose with, the running
+/// fold `acc = hash(acc, chip_k[i])` is mathematically equivalent to the
+/// one-shot `hash(hash(hash(chip_0[i], chip_1[i]), chip_2[i]), ...)`.
+///
+/// For chips at heights BELOW max, the MMCS injection rule is
+/// `next[i] = hash(hash(hash(next[i], chip_0[i]), chip_1[i]), ...)`,
+/// which mixes the upward-compressed `next[i]` into the left-fold. Keccak
+/// (and any non-associative hash) makes it impossible to pre-fold the
+/// chips into a single summary and inject that summary later — the
+/// resulting digest would differ from the one-shot builder, breaking
+/// verifier compatibility. So we keep per-chip leaves for non-max heights
+/// and inject them in left-fold order at `finalize`.
+///
+/// # Memory
+///
+/// Peak savings come from the max-height chips, which is where the
+/// dominant per-row storage lives in lambda-vm (CPU chunks at 2^20).
+/// Smaller-height chips contribute proportionally less per chip, so
+/// keeping their per-chip leaves alive has modest impact.
+///
+/// # Add order
+///
+/// Callers MUST call [`StreamingMmcsBuilder::add_matrix`] in the same
+/// order that [`MmcsBuilder::finalize`] would sort the matrices in:
+/// height descending, then tag ascending within each height. The builder
+/// returns [`MmcsError::OutOfOrder`] if a call would break this.
+pub struct StreamingMmcsBuilder<B: IsMerkleTreeBackend> {
+    /// Max-height layer-0 — incrementally folded as max-height chips
+    /// arrive. `None` until the first chip is added (which fixes the
+    /// max height).
+    layer0: Option<Vec<B::Node>>,
+    /// Per-chip leaves for chips at heights < max_height, grouped by
+    /// height. Within each group, chips are in tag-asc order (enforced
+    /// by `add_matrix`).
+    by_height_below_max: BTreeMap<usize, Vec<Vec<B::Node>>>,
+    /// `(tag, padded_height)` in caller-supplied order. Populates the
+    /// final `Mmcs.matrices` (used by `spec()`).
+    matrix_specs: Vec<(MatrixTag, usize)>,
+    max_height: Option<usize>,
+}
+
+impl<B: IsMerkleTreeBackend> Default for StreamingMmcsBuilder<B> {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl<B: IsMerkleTreeBackend> StreamingMmcsBuilder<B> {
+    pub fn new() -> Self {
+        Self {
+            layer0: None,
+            by_height_below_max: BTreeMap::new(),
+            matrix_specs: Vec::new(),
+            max_height: None,
+        }
+    }
+
+    /// Add a chip's leaves to the in-progress MMCS. The vector is
+    /// consumed so the caller can drop the chip's source data
+    /// immediately on return.
+    ///
+    /// At the MAX height the leaves are folded into the shared layer-0
+    /// running and the vector is freed. At lower heights the vector is
+    /// stored verbatim until `finalize`.
+    pub fn add_matrix(
+        &mut self,
+        tag: MatrixTag,
+        leaf_digests: Vec<B::Node>,
+    ) -> Result<(), MmcsError> {
+        if leaf_digests.is_empty() {
+            return Err(MmcsError::EmptyMatrix);
+        }
+        if !leaf_digests.len().is_power_of_two() {
+            return Err(MmcsError::NotPowerOfTwo);
+        }
+        // Order check first — protects all subsequent invariants.
+        let h = leaf_digests.len();
+        if let Some(&(prev_tag, prev_h)) = self.matrix_specs.last() {
+            let ord = core::cmp::Ord::cmp(&prev_h, &h)
+                .reverse()
+                .then(prev_tag.cmp(&tag));
+            if !matches!(ord, core::cmp::Ordering::Less) {
+                return Err(MmcsError::OutOfOrder);
+            }
+        }
+        if self.matrix_specs.iter().any(|(t, _)| *t == tag) {
+            return Err(MmcsError::DuplicateTag);
+        }
+
+        match self.max_height {
+            None => {
+                // First chip — its height fixes max_height; its leaves
+                // seed the running layer-0.
+                self.max_height = Some(h);
+                self.layer0 = Some(leaf_digests);
+            }
+            Some(max_h) if h == max_h => {
+                // Subsequent max-height chip — fold into running layer-0.
+                let running = self
+                    .layer0
+                    .as_mut()
+                    .expect("layer0 populated once max_height is set");
+                debug_assert_eq!(running.len(), leaf_digests.len());
+                fold_into::<B>(running, &leaf_digests);
+            }
+            Some(_) => {
+                // Below max — stash per-chip leaves, drop at finalize.
+                self.by_height_below_max
+                    .entry(h)
+                    .or_default()
+                    .push(leaf_digests);
+            }
+        }
+        self.matrix_specs.push((tag, h));
+        Ok(())
+    }
+
+    /// Compress the running layer-0 upward, injecting lower-height chips
+    /// at the matching level using the same left-fold the one-shot
+    /// [`MmcsBuilder::finalize`] uses.
+    ///
+    /// The returned [`Mmcs`] has empty `leaf_digests` for each matrix
+    /// because the streaming builder consumed them. `root()` / `spec()`
+    /// are fully functional; callers that also need [`Mmcs::open`] must
+    /// regenerate the chip leaves or use [`MmcsBuilder`].
+    pub fn finalize(self) -> Result<Mmcs<B>, MmcsError> {
+        if self.matrix_specs.is_empty() {
+            return Err(MmcsError::Empty);
+        }
+        let max_height = self.max_height.ok_or(MmcsError::Empty)?;
+        let depth = max_height.trailing_zeros() as usize;
+
+        let StreamingMmcsBuilder {
+            layer0,
+            mut by_height_below_max,
+            matrix_specs,
+            max_height: _,
+        } = self;
+
+        let mut layers: Vec<Vec<B::Node>> = Vec::with_capacity(depth + 1);
+        layers.push(layer0.ok_or(MmcsError::Empty)?);
+
+        for level in 0..depth {
+            let mut next = compress_pairs::<B>(&layers[level]);
+            let new_len = max_height >> (level + 1);
+            if let Some(chips) = by_height_below_max.remove(&new_len) {
+                inject_chips_left_fold::<B>(&mut next, &chips);
+            }
+            layers.push(next);
+        }
+
+        // Carry tag + height into the Mmcs so `spec()` reports the right
+        // pairs. leaf_digests stays empty — opens are not supported on
+        // streaming output (caller must use the one-shot builder when
+        // openings are needed).
+        let matrices = matrix_specs
+            .into_iter()
+            .map(|(tag, padded_height)| MmcsMatrix {
+                tag,
+                leaf_digests: Vec::new(),
+                padded_height,
+            })
+            .collect();
+        Ok(Mmcs { layers, matrices })
+    }
+}
+
+/// Per-row fold: `acc[i] = hash_new_parent(acc[i], other[i])`.
+fn fold_into<B: IsMerkleTreeBackend>(acc: &mut [B::Node], other: &[B::Node]) {
+    debug_assert_eq!(acc.len(), other.len());
+    let n = acc.len();
+    let updated: Vec<B::Node> = {
+        let inner = |i: usize| -> B::Node { B::hash_new_parent(&acc[i], &other[i]) };
+        #[cfg(feature = "parallel")]
+        {
+            (0..n).into_par_iter().map(inner).collect()
+        }
+        #[cfg(not(feature = "parallel"))]
+        {
+            (0..n).map(inner).collect()
+        }
+    };
+    acc.clone_from_slice(&updated);
+}
+
+/// Left-fold inject several chips' leaves into `layer` at every row in
+/// tag-asc chip order:
+/// `layer[i] = hash(hash(hash(layer[i], chips[0][i]), chips[1][i]), ...)`.
+/// Mirrors `inject_matrices` in the one-shot path.
+fn inject_chips_left_fold<B: IsMerkleTreeBackend>(
+    layer: &mut [B::Node],
+    chips: &[Vec<B::Node>],
+) {
+    let n = layer.len();
+    let updated: Vec<B::Node> = {
+        let inner = |i: usize| -> B::Node {
+            let mut acc = layer[i].clone();
+            for chip in chips {
+                acc = B::hash_new_parent(&acc, &chip[i]);
+            }
+            acc
+        };
+        #[cfg(feature = "parallel")]
+        {
+            (0..n).into_par_iter().map(inner).collect()
+        }
+        #[cfg(not(feature = "parallel"))]
+        {
+            (0..n).map(inner).collect()
+        }
+    };
+    layer.clone_from_slice(&updated);
+}
+
+
 /// Build layer 0 by folding all matrices at `max_height` at row `i`, in
 /// tag-asc order (`group` already preserves this). Row-parallel.
 fn build_combined_layer<B: IsMerkleTreeBackend>(
@@ -597,6 +840,124 @@ mod tests {
         let tree = build(vec![big]);
         assert_eq!(tree.open(4).err(), Some(MmcsError::IndexOutOfBounds));
     }
+
+    // ---------- StreamingMmcsBuilder equivalence ----------
+
+    fn build_streaming(
+        matrices_in_spec_order: Vec<(MatrixTag, Vec<Node>)>,
+    ) -> Mmcs<TestBackend> {
+        let mut b: StreamingMmcsBuilder<TestBackend> = StreamingMmcsBuilder::new();
+        for (tag, leaves) in matrices_in_spec_order {
+            b.add_matrix(tag, leaves).expect("streaming add_matrix");
+        }
+        b.finalize().expect("streaming finalize")
+    }
+
+    /// Convert an arbitrary input set into the (height desc, tag asc)
+    /// order required by `StreamingMmcsBuilder`. Matches the sort
+    /// `MmcsBuilder::finalize` does internally.
+    fn spec_sorted(mut v: Vec<(MatrixTag, Vec<Node>)>) -> Vec<(MatrixTag, Vec<Node>)> {
+        v.sort_by(|a, b| b.1.len().cmp(&a.1.len()).then(a.0.cmp(&b.0)));
+        v
+    }
+
+    #[test]
+    fn streaming_root_matches_oneshot_single_matrix() {
+        let m = make_matrix(0xAA, 8);
+        let r_oneshot = *build(vec![m.clone()]).root();
+        let r_stream = *build_streaming(spec_sorted(vec![m])).root();
+        assert_eq!(r_oneshot, r_stream);
+    }
+
+    #[test]
+    fn streaming_root_matches_oneshot_lambdavm_topology() {
+        let inputs = vec![
+            make_matrix(0x01, 8),
+            make_matrix(0x02, 8),
+            make_matrix(0x03, 8),
+            make_matrix(0x10, 4),
+            make_matrix(0x11, 4),
+            make_matrix(0xF0, 1),
+        ];
+        let r_oneshot = *build(inputs.clone()).root();
+        let r_stream = *build_streaming(spec_sorted(inputs)).root();
+        assert_eq!(r_oneshot, r_stream);
+    }
+
+    #[test]
+    fn streaming_spec_matches_oneshot() {
+        let inputs = vec![
+            make_matrix(0x01, 8),
+            make_matrix(0x02, 4),
+            make_matrix(0x03, 8),
+            make_matrix(0x04, 2),
+        ];
+        let oneshot = build(inputs.clone());
+        let stream = build_streaming(spec_sorted(inputs));
+        assert_eq!(oneshot.spec(), stream.spec());
+    }
+
+    #[test]
+    fn streaming_rejects_height_ascending() {
+        let mut b: StreamingMmcsBuilder<TestBackend> = StreamingMmcsBuilder::new();
+        let (t0, l0) = make_matrix(0x01, 4);
+        let (t1, l1) = make_matrix(0x02, 8);
+        b.add_matrix(t0, l0).expect("first add");
+        assert_eq!(b.add_matrix(t1, l1), Err(MmcsError::OutOfOrder));
+    }
+
+    #[test]
+    fn streaming_rejects_same_height_tag_descending() {
+        let mut b: StreamingMmcsBuilder<TestBackend> = StreamingMmcsBuilder::new();
+        let (t0, l0) = make_matrix(0x02, 4);
+        let (t1, l1) = make_matrix(0x01, 4);
+        b.add_matrix(t0, l0).expect("first add");
+        assert_eq!(b.add_matrix(t1, l1), Err(MmcsError::OutOfOrder));
+    }
+
+    #[test]
+    fn streaming_rejects_duplicate_tag_same_height() {
+        // Same tag and same height violates (height desc, tag asc); the
+        // order check fires first.
+        let mut b: StreamingMmcsBuilder<TestBackend> = StreamingMmcsBuilder::new();
+        let (t, l) = make_matrix(0x01, 4);
+        b.add_matrix(t, l.clone()).expect("first add");
+        assert_eq!(b.add_matrix(t, l), Err(MmcsError::OutOfOrder));
+    }
+
+    #[test]
+    fn streaming_rejects_duplicate_tag_smaller_height() {
+        // Same tag at a strictly smaller height passes the order check,
+        // so the dup-tag scan catches it instead.
+        let mut b: StreamingMmcsBuilder<TestBackend> = StreamingMmcsBuilder::new();
+        let (t, l) = make_matrix(0x01, 4);
+        b.add_matrix(t, l).expect("first add");
+        let l2: Vec<Node> = vec![[0; 32]; 2];
+        assert_eq!(b.add_matrix(t, l2), Err(MmcsError::DuplicateTag));
+    }
+
+    #[test]
+    fn streaming_rejects_empty_and_non_power_of_two() {
+        let mut b: StreamingMmcsBuilder<TestBackend> = StreamingMmcsBuilder::new();
+        let tag = MatrixTag::new([0; 8]);
+        assert_eq!(b.add_matrix(tag, Vec::new()), Err(MmcsError::EmptyMatrix));
+        let bad: Vec<Node> = vec![[0; 32]; 3];
+        assert_eq!(b.add_matrix(tag, bad), Err(MmcsError::NotPowerOfTwo));
+    }
+
+    #[test]
+    fn streaming_root_matches_oneshot_pure_same_height() {
+        let inputs = vec![
+            make_matrix(0x01, 8),
+            make_matrix(0x02, 8),
+            make_matrix(0x03, 8),
+            make_matrix(0x04, 8),
+            make_matrix(0x05, 8),
+        ];
+        let r_oneshot = *build(inputs.clone()).root();
+        let r_stream = *build_streaming(spec_sorted(inputs)).root();
+        assert_eq!(r_oneshot, r_stream);
+    }
 }
 
 #[cfg(test)]

From 5d1a2d6639bbda3fb301c6b517f3b37508f02410 Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Thu, 28 May 2026 00:05:33 -0300
Subject: [PATCH 16/21] =?UTF-8?q?feat(crypto/mmcs):=20Mmcs::open=5Fwith=5F?=
 =?UTF-8?q?leaves=20=E2=80=94=20open=20streaming-built=20MMCSes?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`StreamingMmcsBuilder` discards per-chip leaves at build time, so
`Mmcs::open(global_index)` — which reads `matrix.leaf_digests[idx]` to
fill `MmcsOpening.matrix_leaves` — would return empty leaves and a bad
opening. This commit adds the complementary API:

    pub fn open_with_leaves<F>(global_index, leaf_fn) -> MmcsOpening
    where F: FnMut(matrix_idx, local_idx) -> B::Node

The closure provides each matrix's per-row leaf at the appropriate
shifted index. The prover-side use case (next commit) is to rehash a
row from a per-table LDE on demand, replacing the one-shot Mmcs's
internal leaf storage with on-the-fly rehashing.

`Mmcs::open` is now a thin wrapper around `open_with_leaves` whose
closure reads from `self.matrices[i].leaf_digests` — back-compat, no
caller change.

Equivalence pinned by a new test
`streaming_open_with_leaves_round_trips_against_one_shot`:
- Build lambda-vm-shaped topology two ways (one-shot + streaming).
- Compare roots + specs (already covered).
- For every global_index in [0, max_height): open one-shot, open
  streaming via `open_with_leaves` feeding leaves from the input set,
  assert `global_index`, `siblings`, and `matrix_leaves` byte-identical.
- Assert the streaming opening verifies against the streaming root.

27/27 MMCS tests green (1 new on top of 26).

Foundation for the per-chunk MMCS refactor: with `open_with_leaves`
available, the prover can stream-build per-chunk MMCSes and rehash
leaves from chunk-shared LDEs at open time.
---
 crypto/crypto/src/merkle_tree/mmcs.rs | 72 +++++++++++++++++++++++++--
 1 file changed, 69 insertions(+), 3 deletions(-)

diff --git a/crypto/crypto/src/merkle_tree/mmcs.rs b/crypto/crypto/src/merkle_tree/mmcs.rs
index fac4e6b2a..b4b38dc2f 100644
--- a/crypto/crypto/src/merkle_tree/mmcs.rs
+++ b/crypto/crypto/src/merkle_tree/mmcs.rs
@@ -483,6 +483,30 @@ impl<B: IsMerkleTreeBackend> Mmcs<B> {
     }
 
     pub fn open(&self, global_index: usize) -> Result<MmcsOpening<B::Node>, MmcsError> {
+        self.open_with_leaves(global_index, |m_idx, local_idx| {
+            self.matrices[m_idx].leaf_digests[local_idx].clone()
+        })
+    }
+
+    /// Like [`Mmcs::open`] but pulls each matrix's per-row leaf from a
+    /// caller-supplied closure instead of `self.matrices[i].leaf_digests`.
+    /// Required when this `Mmcs` was produced by [`StreamingMmcsBuilder`]
+    /// (which discards per-chip leaves at build time): the closure
+    /// rehashes the row from the chip's source data on demand.
+    ///
+    /// The closure receives `(matrix_idx_in_spec_order, local_idx)` where
+    /// `local_idx = global_index >> log2(max_height / m.padded_height())`,
+    /// and must return the same digest the one-shot builder would have
+    /// stored at that position. Returning a wrong digest produces an
+    /// opening whose `verify` will fail on the prover side.
+    pub fn open_with_leaves<F>(
+        &self,
+        global_index: usize,
+        mut leaf_fn: F,
+    ) -> Result<MmcsOpening<B::Node>, MmcsError>
+    where
+        F: FnMut(usize, usize) -> B::Node,
+    {
         let max_height = self.matrices[0].padded_height();
         if global_index >= max_height {
             return Err(MmcsError::IndexOutOfBounds);
@@ -490,10 +514,10 @@ impl<B: IsMerkleTreeBackend> Mmcs<B> {
         let depth = max_height.trailing_zeros() as usize;
 
         let mut matrix_leaves: Vec<(MatrixTag, B::Node)> = Vec::with_capacity(self.matrices.len());
-        for matrix in &self.matrices {
+        for (m_idx, matrix) in self.matrices.iter().enumerate() {
             let shift = (max_height / matrix.padded_height()).trailing_zeros() as usize;
-            let idx = global_index >> shift;
-            matrix_leaves.push((matrix.tag, matrix.leaf_digests[idx].clone()));
+            let local_idx = global_index >> shift;
+            matrix_leaves.push((matrix.tag, leaf_fn(m_idx, local_idx)));
         }
 
         let mut siblings: Vec<B::Node> = Vec::with_capacity(depth);
@@ -945,6 +969,48 @@ mod tests {
         assert_eq!(b.add_matrix(tag, bad), Err(MmcsError::NotPowerOfTwo));
     }
 
+    #[test]
+    fn streaming_open_with_leaves_round_trips_against_one_shot() {
+        // Lambda-vm topology built two ways: one-shot builds a fully-
+        // populated Mmcs whose `open` works directly; streaming builds an
+        // empty-leaves Mmcs whose `open_with_leaves` must produce the
+        // same opening when fed the same chip leaves.
+        let inputs = vec![
+            make_matrix(0x01, 8),
+            make_matrix(0x02, 8),
+            make_matrix(0x03, 8),
+            make_matrix(0x10, 4),
+            make_matrix(0x11, 4),
+            make_matrix(0xF0, 1),
+        ];
+        let oneshot = build(inputs.clone());
+        let stream = build_streaming(spec_sorted(inputs.clone()));
+        assert_eq!(*oneshot.root(), *stream.root());
+        assert_eq!(oneshot.spec(), stream.spec());
+
+        let sorted = spec_sorted(inputs);
+        let leaves_by_tag: std::collections::HashMap<MatrixTag, Vec<Node>> =
+            sorted.iter().map(|(t, l)| (*t, l.clone())).collect();
+        let spec = stream.spec();
+
+        for global_index in 0..8 {
+            let from_oneshot = oneshot.open(global_index).expect("oneshot open");
+            let from_stream = stream
+                .open_with_leaves(global_index, |m_idx, local_idx| {
+                    let tag = spec[m_idx].0;
+                    leaves_by_tag[&tag][local_idx]
+                })
+                .expect("streaming open_with_leaves");
+            assert_eq!(from_oneshot.global_index, from_stream.global_index);
+            assert_eq!(from_oneshot.siblings, from_stream.siblings);
+            assert_eq!(from_oneshot.matrix_leaves, from_stream.matrix_leaves);
+            assert!(
+                from_stream.verify::<TestBackend>(stream.root(), &spec),
+                "streaming opening must verify"
+            );
+        }
+    }
+
     #[test]
     fn streaming_root_matches_oneshot_pure_same_height() {
         let inputs = vec![

From e854e15a9e1dd298e2acc08c8e2630d226cb1b00 Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Thu, 28 May 2026 00:26:28 -0300
Subject: [PATCH 17/21] refactor(stark/trace): Arc-wrap LDETraceTable columns
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Foundation for the per-chunk MMCS migration. The chunk-grouped main and
aux MMCSes need to rehash chunk-mate LDE rows on demand at open time
(the prover-side input to `Mmcs::open_with_leaves`). Without Arc-shared
columns, each chunk-mate's open would have to copy ~600 MB of column
data per CPU LDE — a non-starter.

Change:
- `LDETraceTable.main_columns: Vec<Vec<FE<F>>>` → `Arc<Vec<Vec<FE<F>>>>`.
- Same for `aux_columns`.
- `from_columns(Vec, Vec, ...)` still works (wraps internally with
  `Arc::new`) — all existing call sites unaffected.
- New `from_columns_arc(Arc, Arc, ...)` for callers (next commit) that
  already hold Arc-shared column data.
- New `main_columns_arc()` / `aux_columns_arc()` cheap-clone accessors
  for the chunk-shared MMCS open helpers.
- `into_columns()` now returns `(Arc, Arc)` instead of `(Vec, Vec)`.
  No internal caller uses the old signature outside `LDETraceTable`
  construction (which uses Arc-aware path); external uses would have
  to migrate, but searching the workspace shows no such callers.

No protocol change. No proof-format change. No memory cost: a single
`Arc::new` on construction; deref is free; column reads via `[col][row]`
still go through `Vec` Deref. Existing 151/151 stark tests pass without
edits.
---
 crypto/stark/src/trace.rs | 58 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 51 insertions(+), 7 deletions(-)

diff --git a/crypto/stark/src/trace.rs b/crypto/stark/src/trace.rs
index 834ffdcda..f81e1496b 100644
--- a/crypto/stark/src/trace.rs
+++ b/crypto/stark/src/trace.rs
@@ -1,3 +1,5 @@
+use std::sync::Arc;
+
 use crate::domain::{Domain, DomainConstants};
 use crate::table::Table;
 #[cfg(test)]
@@ -216,8 +218,13 @@ where
     E: IsField,
     F: IsSubFieldOf<E> + IsField,
 {
-    pub(crate) main_columns: Vec<Vec<FieldElement<F>>>,
-    pub(crate) aux_columns: Vec<Vec<FieldElement<E>>>,
+    /// LDE columns for the main trace, Arc-wrapped so chunk-mate tables
+    /// can share access without copying the large column data — needed by
+    /// the per-chunk MMCS open path which rehashes chunk-mate rows on
+    /// demand. Read-only after construction.
+    pub(crate) main_columns: Arc<Vec<Vec<FieldElement<F>>>>,
+    /// Same shape for aux columns.
+    pub(crate) aux_columns: Arc<Vec<Vec<FieldElement<E>>>>,
     pub(crate) lde_step_size: usize,
     pub(crate) blowup_factor: usize,
 }
@@ -227,16 +234,35 @@ where
     E: IsField,
     F: IsSubFieldOf<E>,
 {
-    /// Creates a column-major LDETraceTable by consuming column vectors directly.
-    /// No transpose is performed — columns are stored as-is.
+    /// Creates a column-major LDETraceTable by consuming column vectors
+    /// directly. Wraps each column slice in an `Arc` so the resulting
+    /// table can be cheaply shared across threads and per-chunk open
+    /// helpers.
     pub fn from_columns(
         main_columns: Vec<Vec<FieldElement<F>>>,
         aux_columns: Vec<Vec<FieldElement<E>>>,
         trace_step_size: usize,
         blowup_factor: usize,
     ) -> Self {
-        let lde_step_size = trace_step_size * blowup_factor;
+        Self::from_columns_arc(
+            Arc::new(main_columns),
+            Arc::new(aux_columns),
+            trace_step_size,
+            blowup_factor,
+        )
+    }
 
+    /// Creates an `LDETraceTable` from already-`Arc`-wrapped column data.
+    /// Useful when the same column data is being shared with other
+    /// consumers (e.g. a per-chunk MMCS open context) and the caller
+    /// wants to avoid re-allocating the Arc.
+    pub fn from_columns_arc(
+        main_columns: Arc<Vec<Vec<FieldElement<F>>>>,
+        aux_columns: Arc<Vec<Vec<FieldElement<E>>>>,
+        trace_step_size: usize,
+        blowup_factor: usize,
+    ) -> Self {
+        let lde_step_size = trace_step_size * blowup_factor;
         Self {
             main_columns,
             aux_columns,
@@ -245,12 +271,30 @@ where
         }
     }
 
-    /// Consume self and return the owned column vectors.
+    /// Consume self and return the Arc-wrapped column vectors. Callers
+    /// that need to mutate or destructure should clone the inner Vecs.
     #[allow(clippy::type_complexity)]
-    pub fn into_columns(self) -> (Vec<Vec<FieldElement<F>>>, Vec<Vec<FieldElement<E>>>) {
+    pub fn into_columns(
+        self,
+    ) -> (
+        Arc<Vec<Vec<FieldElement<F>>>>,
+        Arc<Vec<Vec<FieldElement<E>>>>,
+    ) {
         (self.main_columns, self.aux_columns)
     }
 
+    /// Cheap clone of the underlying main-column Arc. Used by per-chunk
+    /// MMCS open helpers that need read-only shared access without
+    /// owning a copy.
+    pub fn main_columns_arc(&self) -> Arc<Vec<Vec<FieldElement<F>>>> {
+        Arc::clone(&self.main_columns)
+    }
+
+    /// Cheap clone of the underlying aux-column Arc. See [`main_columns_arc`].
+    pub fn aux_columns_arc(&self) -> Arc<Vec<Vec<FieldElement<E>>>> {
+        Arc::clone(&self.aux_columns)
+    }
+
     pub fn num_main_cols(&self) -> usize {
         self.main_columns.len()
     }

From b16218083e0bb8b178bc11a66c5e31de2a2fd995 Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Thu, 28 May 2026 11:03:53 -0300
Subject: [PATCH 18/21] =?UTF-8?q?feat(stark/mmcs):=20per-chunk=20MMCS=20?=
 =?UTF-8?q?=E2=80=94=20group=20K=20tables=20per=20MMCS,=20stream=20within?=
 =?UTF-8?q?=20chunk?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Refactors the main + aux MMCSes from one-global to per-chunk: each
parallel chunk of K = `table_parallelism()` tables gets its own
streaming MMCS, with chunk-mate LDEs Arc-shared so the per-query open
path can rehash chunk-mate rows on demand. Final piece of the streaming
plan (after `StreamingMmcsBuilder` and `Mmcs::open_with_leaves`).

# Why per-chunk?

The streaming MMCS drops per-chip leaves at build time, so `Mmcs::open`
needs to recompute leaves on the fly via `open_with_leaves`. That
closure must produce leaves for every matrix in the MMCS spec — not
just the current table — because `MmcsOpening::verify` walks all
max-height matrices to reconstruct layer-0. Per-chunk grouping bounds
the cross-table LDE access to the K tables a chunk owns (already kept
alive together by the existing parallel R2-R4 loop), avoiding the
alternatives of (a) global Arc<Vec<Arc<Lde>>> threaded through every
fork, or (b) reordering iota sampling earlier.

# Proof format

`MultiProof`:
- `main_mmcs_root: Commitment`            → `main_mmcs_roots: Vec<Option<Commitment>>`
- `main_mmcs_spec: Vec<(...)>`            → `main_mmcs_specs: Vec<Vec<(...)>>`
- `aux_mmcs_root: Option<Commitment>`     → `aux_mmcs_roots: Vec<Option<Commitment>>`
- `aux_mmcs_spec: Vec<(...)>`             → `aux_mmcs_specs: Vec<Vec<(...)>>`
- + `chunk_size: u32` (pinned `table_parallelism()` so the verifier
   chunks the AIR slice the same way the prover did).

`None` entries in the *_roots Vecs mark chunks with no MMCS-eligible
tables (all-preprocessed for main, no-aux for aux). Per-query openings
shrink: each carries ≤K matrix_leaves instead of N.

# Phase A / Phase C absorb order (prover + verifier match exactly)

Per chunk in chunk order:
  for each table in spec order:
    - absorb its preprocessed root (preprocessed only)
    - absorb its per-table multiplicities root (preprocessed only)
  - absorb that chunk's main MMCS root (Some) or skip (None)
After Phase A → sample LogUp challenges → Phase C aux mirrors main:
  for each chunk in chunk order:
    - absorb that chunk's aux MMCS root (Some) or skip (None)
Then fork per-table → per-table table_contribution → rounds 2-4.

# Data plumbing

`MainCommit::Shared` now carries `Arc<ChunkMainMmcsContext<F>>` +
`chunk_idx` instead of an `Arc<Mmcs>` directly. The context holds the
chunk's MMCS + Arc-cloned LDE columns for the chunk-mates in MMCS spec
sort order. Aux mirrors via `ChunkAuxMmcsContext<E>`.

`Lde { main, aux }` columns are now `Arc<Vec<Vec<FE>>>` (built on
B1's Arc-wrapped `LDETraceTable`). Each table's `Round1.lde_trace`
shares the same Arc as the chunk context — no duplication.

# Open path

`open_deep_composition_poly` dispatches on the chunk context's MMCS via
`mmcs.open_with_leaves`, with a closure that rehashes chunk-mate rows
via the new `rehash_main_chip_leaf` / `rehash_aux_chip_leaf` helpers
(read from chunk-shared LDE columns + matrix tag, hash with the
appropriate `LEAF_DOMAIN_TAG_*`).

# Verifier

`multi_verify` reads `chunk_size` from the proof, walks chunks of the
AIR slice, validates each chunk's expected spec against the supplied
one, and absorbs roots in chunk order. `verify_rounds_2_to_4` and
`verify_main_mmcs_pair` take `main_mmcs_root: Option<&Commitment>` (and
similarly aux): `Shared` opening with `None` root → reject.

# Tests

- Existing main + aux soundness suites adapted to the per-chunk shape
  via `first_populated_main_chunk` / `first_populated_aux_chunk`
  helpers + `proof.main_mmcs_roots[chunk_idx]` field access.
- New soundness test `tampered_chunk_size_rejected`: pinned chunk_size
  mismatch must be rejected (Vec length cross-check fires).
- bin/cli/proof-size breakdown updated: now reports `main_mmcs_roots /
  main_mmcs_specs / aux_mmcs_roots / aux_mmcs_specs / chunk_size` as
  separate sections (multi-proof header), drops the obsolete
  `per_table_aux_merkle_root` and pre-Vec `main_mmcs_root/spec` rows.

Results: 152/152 stark tests green (151 before + new
`tampered_chunk_size_rejected`); 27/27 crypto mmcs tests green; 3/3
cli unit tests green; lambda-vm-prover bitwise (preprocessed-path) +
non-ELF tests pass; the 77 prove_elfs failures are the same pre-existing
`UnknownSyscall(5)` executor bug present on main.
---
 bin/cli/src/main.rs                           |  29 +-
 crypto/stark/src/proof/stark.rs               |  52 +-
 crypto/stark/src/prover.rs                    | 556 +++++++++++++-----
 .../src/tests/mmcs_aux_soundness_tests.rs     |  43 +-
 .../stark/src/tests/mmcs_soundness_tests.rs   |  68 ++-
 crypto/stark/src/verifier.rs                  | 230 +++++---
 prover/src/lib.rs                             |  29 +-
 7 files changed, 697 insertions(+), 310 deletions(-)

diff --git a/bin/cli/src/main.rs b/bin/cli/src/main.rs
index 9e4c95ad4..dd65466b4 100644
--- a/bin/cli/src/main.rs
+++ b/bin/cli/src/main.rs
@@ -690,8 +690,11 @@ fn cmd_proof_size(
 
     let total = ser_len(&vm_proof);
     let multi_proof_bytes = ser_len(&vm_proof.proof);
-    let main_mmcs_root_bytes = ser_len(&vm_proof.proof.main_mmcs_root);
-    let main_mmcs_spec_bytes = ser_len(&vm_proof.proof.main_mmcs_spec);
+    let main_mmcs_roots_bytes = ser_len(&vm_proof.proof.main_mmcs_roots);
+    let main_mmcs_specs_bytes = ser_len(&vm_proof.proof.main_mmcs_specs);
+    let aux_mmcs_roots_bytes = ser_len(&vm_proof.proof.aux_mmcs_roots);
+    let aux_mmcs_specs_bytes = ser_len(&vm_proof.proof.aux_mmcs_specs);
+    let chunk_size_bytes = ser_len(&vm_proof.proof.chunk_size);
 
     // Sum per-section across every sub-proof so a single number captures the
     // contribution of, e.g., "all FRI query lists across all tables".
@@ -704,14 +707,12 @@ fn cmd_proof_size(
     let mut s_trace_ood = 0usize;
     let mut s_composition_ood = 0usize;
     let mut s_per_table_main_root = 0usize;
-    let mut s_aux_root = 0usize;
     let mut s_precomputed_root = 0usize;
     let mut s_bus_public_inputs = 0usize;
     let s_other;
 
     for proof in &vm_proof.proof.proofs {
         s_per_table_main_root += ser_len(&proof.lde_trace_main_merkle_root);
-        s_aux_root += ser_len(&proof.lde_trace_aux_merkle_root);
         s_precomputed_root += ser_len(&proof.lde_trace_precomputed_merkle_root);
         s_trace_ood += ser_len(&proof.trace_ood_evaluations);
         s_composition_ood += ser_len(&proof.composition_poly_parts_ood_evaluation);
@@ -730,8 +731,11 @@ fn cmd_proof_size(
     // Anything not captured above (composition_poly_root, fri_last_value,
     // nonce, public_inputs, trace_length, headers...). Calculate as the
     // bundle delta so the breakdown still sums to ~total.
-    let accounted = main_mmcs_root_bytes
-        + main_mmcs_spec_bytes
+    let accounted = main_mmcs_roots_bytes
+        + main_mmcs_specs_bytes
+        + aux_mmcs_roots_bytes
+        + aux_mmcs_specs_bytes
+        + chunk_size_bytes
         + s_main_trace_openings
         + s_precomputed_trace_openings
         + s_aux_trace_openings
@@ -741,17 +745,18 @@ fn cmd_proof_size(
         + s_trace_ood
         + s_composition_ood
         + s_per_table_main_root
-        + s_aux_root
         + s_precomputed_root
         + s_bus_public_inputs;
     s_other = multi_proof_bytes.saturating_sub(accounted);
 
     let entries: Vec<ProofSizeEntry> = vec![
-        ProofSizeEntry { section: "main_mmcs_root".into(), bytes: main_mmcs_root_bytes },
-        ProofSizeEntry { section: "main_mmcs_spec".into(), bytes: main_mmcs_spec_bytes },
+        ProofSizeEntry { section: "main_mmcs_roots (per-chunk)".into(), bytes: main_mmcs_roots_bytes },
+        ProofSizeEntry { section: "main_mmcs_specs (per-chunk)".into(), bytes: main_mmcs_specs_bytes },
+        ProofSizeEntry { section: "aux_mmcs_roots (per-chunk)".into(), bytes: aux_mmcs_roots_bytes },
+        ProofSizeEntry { section: "aux_mmcs_specs (per-chunk)".into(), bytes: aux_mmcs_specs_bytes },
+        ProofSizeEntry { section: "chunk_size".into(), bytes: chunk_size_bytes },
         ProofSizeEntry { section: "per_table_main_merkle_root (preprocessed)".into(), bytes: s_per_table_main_root },
         ProofSizeEntry { section: "per_table_precomputed_merkle_root".into(), bytes: s_precomputed_root },
-        ProofSizeEntry { section: "per_table_aux_merkle_root".into(), bytes: s_aux_root },
         ProofSizeEntry { section: "deep_poly_openings.main_trace_polys".into(), bytes: s_main_trace_openings },
         ProofSizeEntry { section: "deep_poly_openings.precomputed_trace_polys".into(), bytes: s_precomputed_trace_openings },
         ProofSizeEntry { section: "deep_poly_openings.aux_trace_polys".into(), bytes: s_aux_trace_openings },
@@ -770,7 +775,7 @@ fn cmd_proof_size(
             total_vm_proof_bytes: total,
             multi_proof_bytes,
             sub_proof_count: vm_proof.proof.proofs.len(),
-            main_mmcs_spec_entries: vm_proof.proof.main_mmcs_spec.len(),
+            main_mmcs_spec_entries: vm_proof.proof.main_mmcs_specs.iter().map(|s| s.len()).sum::<usize>(),
             sections: entries.clone(),
         };
         match serde_json::to_string_pretty(&report) {
@@ -787,7 +792,7 @@ fn cmd_proof_size(
         println!("Total VmProof:     {:>10}  bytes", total);
         println!("MultiProof only:   {:>10}  bytes", multi_proof_bytes);
         println!("Sub-proofs:        {:>10}", vm_proof.proof.proofs.len());
-        println!("MMCS spec entries: {:>10}", vm_proof.proof.main_mmcs_spec.len());
+        println!("MMCS spec entries: {:>10}", vm_proof.proof.main_mmcs_specs.iter().map(|s| s.len()).sum::<usize>());
         println!();
         println!("{:<48}{:>14}{:>10}", "section", "bytes", "% of total");
         println!("{}", "-".repeat(72));
diff --git a/crypto/stark/src/proof/stark.rs b/crypto/stark/src/proof/stark.rs
index 57b28f75c..cc69f7bf0 100644
--- a/crypto/stark/src/proof/stark.rs
+++ b/crypto/stark/src/proof/stark.rs
@@ -146,26 +146,46 @@ pub struct StarkProof<F: IsSubFieldOf<E>, E: IsField, PI> {
 /// Used for multi-table proving where tables are linked via bus (LogUp).
 /// Returned by `Prover::multi_prove` and verified by `Verifier::multi_verify`.
 ///
-/// Non-preprocessed tables share a single main-trace MMCS authenticated by
-/// `main_mmcs_root`; `main_mmcs_spec` lists `(MatrixTag, padded_height)`
-/// per committed table in the MMCS sort order. Preprocessed tables stay
-/// out of the main MMCS — each carries its own per-table Merkle root in
+/// Non-preprocessed tables in each chunk share a main-trace MMCS
+/// authenticated by `main_mmcs_roots[chunk_idx]`. Tables are grouped into
+/// chunks of `chunk_size` (the prover's `table_parallelism()` at proving
+/// time, pinned in the proof so the verifier chunks the AIR slice the
+/// same way). Per-chunk grouping keeps openings small (at most K matrix_leaves
+/// per opening instead of N) and bounds the streaming MMCS build to one
+/// chunk's K LDEs at a time. Preprocessed tables stay out of any main
+/// MMCS; each carries its own per-table Merkle root in
 /// `StarkProof::lde_trace_main_merkle_root` plus the AIR-pinned
-/// precomputed root. Both groups' roots are absorbed in spec-fixed order
-/// during Phase A.
+/// precomputed root.
 ///
-/// Aux traces (only present for AIRs with LogUp interactions) share a
-/// SECOND MMCS authenticated by `aux_mmcs_root`; `aux_mmcs_spec` lists
-/// `(MatrixTag, padded_height)` for the subset of tables that contribute
-/// aux. `aux_mmcs_root` is `None` when no table in the multi-proof has an
-/// aux trace. Domain-separated from the main MMCS via `LEAF_DOMAIN_TAG_AUX`
-/// so that no aux opening can authenticate a main leaf (or vice versa).
+/// Phase A absorb order: for each table in spec order, absorb its
+/// preprocessed root + per-table multiplicities root (preprocessed only);
+/// after each chunk, absorb that chunk's main MMCS root (`Some`) or skip
+/// (`None`, when the chunk has no non-preprocessed tables).
+///
+/// Aux traces mirror the same chunk grouping. `aux_mmcs_roots[chunk_idx]`
+/// is `None` when no table in that chunk has an aux trace. Aux MMCS
+/// leaves are domain-separated from main via `LEAF_DOMAIN_TAG_AUX`.
 #[derive(Debug, serde::Serialize, serde::Deserialize)]
 #[serde(bound = "PI: serde::Serialize + serde::de::DeserializeOwned")]
 pub struct MultiProof<F: IsSubFieldOf<E>, E: IsField, PI> {
     pub proofs: Vec<StarkProof<F, E, PI>>,
-    pub main_mmcs_root: Commitment,
-    pub main_mmcs_spec: Vec<(MatrixTag, usize)>,
-    pub aux_mmcs_root: Option<Commitment>,
-    pub aux_mmcs_spec: Vec<(MatrixTag, usize)>,
+    /// Per-chunk main MMCS roots in chunk order. `None` for chunks whose
+    /// tables are all preprocessed (no main MMCS exists for that chunk).
+    pub main_mmcs_roots: Vec<Option<Commitment>>,
+    /// Per-chunk MMCS specs for the main trace, parallel to
+    /// `main_mmcs_roots`. Empty inner Vec when the corresponding root is
+    /// `None`. Each non-empty Vec lists `(MatrixTag, padded_height)` for
+    /// the non-preprocessed tables in that chunk in MMCS sort order
+    /// (height desc, tag asc).
+    pub main_mmcs_specs: Vec<Vec<(MatrixTag, usize)>>,
+    /// Per-chunk aux MMCS roots. `None` for chunks with no has_aux_trace
+    /// tables. Parallel to `main_mmcs_roots`.
+    pub aux_mmcs_roots: Vec<Option<Commitment>>,
+    /// Per-chunk aux MMCS specs. Empty inner Vec when the corresponding
+    /// `aux_mmcs_roots[i]` is `None`.
+    pub aux_mmcs_specs: Vec<Vec<(MatrixTag, usize)>>,
+    /// Pinned chunk size. Equals the prover's `table_parallelism()` at
+    /// proving time. The verifier uses this to chunk the AIR slice into
+    /// the same per-chunk grouping the prover used.
+    pub chunk_size: u32,
 }
diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs
index a5e2b8142..bf9a5f03e 100644
--- a/crypto/stark/src/prover.rs
+++ b/crypto/stark/src/prover.rs
@@ -32,7 +32,7 @@ use crate::proof::stark::{DeepPolynomialOpenings, MainTraceOpening, PolynomialOp
 use crate::storage_mode::StorageMode;
 use crate::table::Table;
 use crate::trace::LDETraceTable;
-use crypto::merkle_tree::mmcs::{MatrixTag, Mmcs, MmcsBuilder, MmcsError};
+use crypto::merkle_tree::mmcs::{MatrixTag, Mmcs, MmcsError, StreamingMmcsBuilder};
 
 use super::config::{BatchedMerkleTree, BatchedMerkleTreeBackend, Commitment};
 use super::constraints::evaluator::ConstraintEvaluator;
@@ -81,26 +81,43 @@ pub enum ProvingError {
     DiskSpill(String),
 }
 
-/// Per-table commitment artifacts for the main trace under the shared
-/// MMCS protocol. The `mmcs` Arc is the SAME instance for every table in
-/// the multi-proof — Phase A builds it once.
+/// Per-chunk main MMCS context. Shared across every non-preprocessed
+/// table in a chunk: the chunk's MMCS Arc + Arc-cloned LDE columns for
+/// chunk-mate non-preprocessed tables in MMCS-spec sort order. The
+/// per-query open path uses this to rehash chunk-mate rows on demand
+/// (the streaming MMCS dropped the per-chip leaf arrays at build time).
+pub(crate) struct ChunkMainMmcsContext<F: IsField>
+where
+    FieldElement<F>: AsBytes,
+{
+    /// Chunk-scoped MMCS (built once per chunk in Phase A).
+    pub(crate) mmcs: Arc<Mmcs<BatchedMerkleTreeBackend<F>>>,
+    /// Arc-cloned LDE columns for the non-preprocessed chunk-mates,
+    /// indexed in MMCS spec sort order (parallel to `mmcs.spec()`).
+    /// Open path closures look up `lde_columns_in_spec_order[m_idx]` to
+    /// rehash the row at the queried local position.
+    pub(crate) lde_columns_in_spec_order: Vec<Arc<Vec<Vec<FieldElement<F>>>>>,
+}
+
+/// Per-table commitment artifacts for the main trace.
 ///
-/// `padded_height` is this table's LDE height (a power of two), needed to
-/// translate the table's local FRI iota into a global MMCS index when
-/// opening (see `open_deep_composition_poly`).
+/// `Shared` tables borrow a per-chunk MMCS context (Arc) and remember
+/// their chunk index so the verifier can look up the matching root +
+/// spec in `MultiProof::main_mmcs_roots[chunk_idx]`.
 pub(crate) enum MainCommit<F: IsField>
 where
     FieldElement<F>: AsBytes,
 {
-    /// Non-preprocessed table: committed under the shared MMCS.
+    /// Non-preprocessed table: committed under the chunk's MMCS.
     Shared {
-        mmcs: Arc<Mmcs<BatchedMerkleTreeBackend<F>>>,
+        chunk_ctx: Arc<ChunkMainMmcsContext<F>>,
+        chunk_idx: usize,
         tag: MatrixTag,
         /// Padded height (== LDE row count); needed to translate a local
-        /// FRI iota into a global MMCS index.
+        /// FRI iota into a global MMCS index inside this chunk's MMCS.
         padded_height: usize,
     },
-    /// Preprocessed table: two per-table Merkle trees, NOT in the MMCS.
+    /// Preprocessed table: two per-table Merkle trees, NOT in any MMCS.
     Preprocessed {
         multiplicities_tree: Arc<BatchedMerkleTree<F>>,
         multiplicities_root: Commitment,
@@ -137,11 +154,13 @@ where
     fn share(&self) -> Self {
         match self {
             Self::Shared {
-                mmcs,
+                chunk_ctx,
+                chunk_idx,
                 tag,
                 padded_height,
             } => Self::Shared {
-                mmcs: Arc::clone(mmcs),
+                chunk_ctx: Arc::clone(chunk_ctx),
+                chunk_idx: *chunk_idx,
                 tag: *tag,
                 padded_height: *padded_height,
             },
@@ -207,16 +226,26 @@ where
     }
 }
 
-/// Per-table aux-trace commitment under the shared aux MMCS.
-/// Mirror of [`MainCommit::Shared`]: the `mmcs` Arc is shared across every
-/// table that contributes an aux trace; `tag` + `padded_height` identify
-/// this table's slot inside that MMCS.
+/// Per-chunk aux MMCS context. Sister of [`ChunkMainMmcsContext`] for
+/// the aux trace.
+pub(crate) struct ChunkAuxMmcsContext<E: IsField>
+where
+    FieldElement<E>: AsBytes,
+{
+    pub(crate) mmcs: Arc<Mmcs<BatchedMerkleTreeBackend<E>>>,
+    /// Arc-cloned aux LDE columns for chunk-mates with aux, in MMCS
+    /// spec sort order.
+    pub(crate) lde_columns_in_spec_order: Vec<Arc<Vec<Vec<FieldElement<E>>>>>,
+}
+
+/// Per-table aux-trace commitment under a chunk's aux MMCS.
 pub(crate) enum AuxCommit<E: IsField>
 where
     FieldElement<E>: AsBytes,
 {
     Shared {
-        mmcs: Arc<Mmcs<BatchedMerkleTreeBackend<E>>>,
+        chunk_ctx: Arc<ChunkAuxMmcsContext<E>>,
+        chunk_idx: usize,
         tag: MatrixTag,
         padded_height: usize,
     },
@@ -229,11 +258,13 @@ where
     fn share(&self) -> Self {
         match self {
             Self::Shared {
-                mmcs,
+                chunk_ctx,
+                chunk_idx,
                 tag,
                 padded_height,
             } => Self::Shared {
-                mmcs: Arc::clone(mmcs),
+                chunk_ctx: Arc::clone(chunk_ctx),
+                chunk_idx: *chunk_idx,
                 tag: *tag,
                 padded_height: *padded_height,
             },
@@ -290,13 +321,16 @@ where
     bus_public_inputs: Option<BusPublicInputs<FieldExtension>>,
 }
 
-/// LDE columns for main (Phase A) and auxiliary (Phase C) traces, consumed by value in Phase D.
+/// LDE columns for main (Phase A) and auxiliary (Phase C) traces.
+/// Arc-wrapped so per-chunk MMCS contexts can hold cheap clones for the
+/// open path while the originating table's `Round1.lde_trace` retains
+/// the same data via Arc share (no duplication).
 ///
-/// Memory trade-off: all N tables' LDE columns are live simultaneously between Phase A/C
-/// and Phase D (O(N × cols × lde_size)).
+/// Memory trade-off: all N tables' LDE columns are live simultaneously
+/// between Phase A/C and Phase D (O(N × cols × lde_size)).
 struct Lde<Field: IsFFTField, FieldExtension: IsField> {
-    main: Vec<Vec<FieldElement<Field>>>,
-    aux: Vec<Vec<FieldElement<FieldExtension>>>,
+    main: Arc<Vec<Vec<FieldElement<Field>>>>,
+    aux: Arc<Vec<Vec<FieldElement<FieldExtension>>>>,
 }
 
 impl<Field, FieldExtension> Round1Commitments<Field, FieldExtension>
@@ -307,7 +341,9 @@ where
     FieldElement<FieldExtension>: AsBytes,
 {
     /// Build a `Round1` by consuming a `Lde` and borrowing commitment data.
-    /// The `share` calls are cheap — only bump Arc refcounts.
+    /// The `share` calls are cheap — only bump Arc refcounts. The LDE
+    /// columns are also Arc-shared (with this chunk's MMCS contexts) so
+    /// the open path can rehash chunk-mate rows without copying.
     fn build_round1(
         &self,
         lde: Lde<Field, FieldExtension>,
@@ -315,7 +351,12 @@ where
         blowup_factor: usize,
     ) -> Round1<Field, FieldExtension> {
         Round1 {
-            lde_trace: LDETraceTable::from_columns(lde.main, lde.aux, step_size, blowup_factor),
+            lde_trace: LDETraceTable::from_columns_arc(
+                lde.main,
+                lde.aux,
+                step_size,
+                blowup_factor,
+            ),
             main: self.main.share(),
             aux: self.aux.as_ref().map(AuxCommit::share),
             rap_challenges: self.rap_challenges.clone(),
@@ -512,17 +553,73 @@ fn map_mmcs_err(e: MmcsError) -> ProvingError {
     ProvingError::WrongParameter(format!("MMCS: {e:?}"))
 }
 
-/// Build the unified main-trace MMCS from the per-table Phase A outputs.
-/// Returns the root, the (tag, padded_height) spec, and the shared Arc that
-/// every table's `MainCommit` borrows.
+/// Rehash a single main-trace LDE row to its tagged leaf digest. Used by
+/// the per-chunk open path: when `Mmcs::open_with_leaves` walks the chunk
+/// MMCS spec to gather matrix_leaves at a queried position, this helper
+/// recomputes each chunk-mate's leaf on demand from the chunk-shared LDE
+/// columns. Mirrors what the verifier computes via `hash_tagged_row`.
+pub fn rehash_main_chip_leaf<F>(
+    tag: MatrixTag,
+    columns: &Arc<Vec<Vec<FieldElement<F>>>>,
+    local_idx: usize,
+) -> Commitment
+where
+    F: IsField,
+    FieldElement<F>: AsBytes + ByteConversion,
+{
+    let num_rows = columns
+        .first()
+        .map(|c| c.len())
+        .expect("non-empty LDE columns");
+    let br_idx = reverse_index(local_idx, num_rows as u64);
+    let byte_len = <FieldElement<F> as ByteConversion>::BYTE_LEN;
+    let mut buf = vec![0u8; columns.len() * byte_len];
+    for (col_idx, col) in columns.iter().enumerate() {
+        col[br_idx].write_bytes_be(&mut buf[col_idx * byte_len..(col_idx + 1) * byte_len]);
+    }
+    crate::mmcs_leaf::hash_tagged_row_bytes(tag, &buf)
+}
+
+/// Aux-trace counterpart of [`rehash_main_chip_leaf`] using the AUX
+/// domain separator so aux/main leaves cannot collide.
+pub fn rehash_aux_chip_leaf<E>(
+    tag: MatrixTag,
+    columns: &Arc<Vec<Vec<FieldElement<E>>>>,
+    local_idx: usize,
+) -> Commitment
+where
+    E: IsField,
+    FieldElement<E>: AsBytes + ByteConversion,
+{
+    let num_rows = columns
+        .first()
+        .map(|c| c.len())
+        .expect("non-empty aux LDE columns");
+    let br_idx = reverse_index(local_idx, num_rows as u64);
+    let byte_len = <FieldElement<E> as ByteConversion>::BYTE_LEN;
+    let mut buf = vec![0u8; columns.len() * byte_len];
+    for (col_idx, col) in columns.iter().enumerate() {
+        col[br_idx].write_bytes_be(&mut buf[col_idx * byte_len..(col_idx + 1) * byte_len]);
+    }
+    crate::mmcs_leaf::hash_tagged_row_bytes_aux(tag, &buf)
+}
+
+/// Build a CHUNK-scoped main MMCS via [`StreamingMmcsBuilder`]. Consumes
+/// the Shared phase-A outputs (drops their per-chip leaves once folded),
+/// returns the chunk root + spec + an `Arc<ChunkMainMmcsContext>` that
+/// every Shared table in the chunk borrows.
+///
+/// Returns `None` for the root/context when the chunk has no Shared
+/// tables (entire chunk is preprocessed).
 #[allow(clippy::type_complexity)]
-fn build_main_mmcs<F>(
-    outputs: &[MainPhaseAOutput<F>],
+fn build_chunk_main_mmcs<F>(
+    shared_outputs: Vec<(MatrixTag, Vec<Commitment>, usize)>,
+    chunk_lde_for_shared: Vec<(MatrixTag, Arc<Vec<Vec<FieldElement<F>>>>)>,
 ) -> Result<
     (
-        Commitment,
+        Option<Commitment>,
         Vec<(MatrixTag, usize)>,
-        Arc<Mmcs<BatchedMerkleTreeBackend<F>>>,
+        Option<Arc<ChunkMainMmcsContext<F>>>,
     ),
     ProvingError,
 >
@@ -530,23 +627,41 @@ where
     F: IsField + Send + Sync,
     FieldElement<F>: AsBytes + Send + Sync,
 {
-    let mut builder: MmcsBuilder<BatchedMerkleTreeBackend<F>> = MmcsBuilder::new();
-    for output in outputs {
-        if let MainPhaseAOutput::Shared {
-            tag,
-            leaves,
-            padded_height: _,
-        } = output
-        {
-            builder
-                .add_matrix(*tag, leaves.clone())
-                .map_err(map_mmcs_err)?;
-        }
+    if shared_outputs.is_empty() {
+        return Ok((None, Vec::new(), None));
+    }
+    debug_assert_eq!(shared_outputs.len(), chunk_lde_for_shared.len());
+
+    // Sort both vectors into MMCS spec order: height desc, tag asc.
+    let mut shared_outputs = shared_outputs;
+    shared_outputs.sort_by(|a, b| b.2.cmp(&a.2).then(a.0.cmp(&b.0)));
+    let lde_by_tag: std::collections::BTreeMap<MatrixTag, Arc<Vec<Vec<FieldElement<F>>>>> =
+        chunk_lde_for_shared.into_iter().collect();
+
+    let mut builder: StreamingMmcsBuilder<BatchedMerkleTreeBackend<F>> =
+        StreamingMmcsBuilder::new();
+    let mut lde_columns_in_spec_order: Vec<Arc<Vec<Vec<FieldElement<F>>>>> =
+        Vec::with_capacity(shared_outputs.len());
+    for (tag, leaves, _padded_height) in shared_outputs {
+        let lde = lde_by_tag
+            .get(&tag)
+            .ok_or_else(|| {
+                ProvingError::WrongParameter(format!(
+                    "missing chunk LDE for tag {tag:?} during chunk MMCS build"
+                ))
+            })?
+            .clone();
+        lde_columns_in_spec_order.push(lde);
+        builder.add_matrix(tag, leaves).map_err(map_mmcs_err)?;
     }
     let mmcs = builder.finalize().map_err(map_mmcs_err)?;
     let root = *mmcs.root();
     let spec = mmcs.spec();
-    Ok((root, spec, Arc::new(mmcs)))
+    let ctx = Arc::new(ChunkMainMmcsContext {
+        mmcs: Arc::new(mmcs),
+        lde_columns_in_spec_order,
+    });
+    Ok((Some(root), spec, Some(ctx)))
 }
 
 /// Tagged per-row leaf digest for the AUX-trace MMCS. Mirror of
@@ -591,17 +706,18 @@ where
     }
 }
 
-/// Build the shared AUX-trace MMCS from per-table Phase-C outputs (only
-/// tables that have an aux trace participate). Returns `None`/`empty spec`
-/// when no table contributes aux.
+/// Build a CHUNK-scoped aux MMCS via [`StreamingMmcsBuilder`]. Sister of
+/// [`build_chunk_main_mmcs`] for the aux trace. Returns `None` for root
+/// and context when no chunk-mate has an aux trace.
 #[allow(clippy::type_complexity)]
-fn build_aux_mmcs<E>(
-    outputs: &[Option<AuxPhaseCOutput<E>>],
+fn build_chunk_aux_mmcs<E>(
+    aux_outputs: Vec<(MatrixTag, Vec<Commitment>, usize)>,
+    chunk_aux_lde_for_shared: Vec<(MatrixTag, Arc<Vec<Vec<FieldElement<E>>>>)>,
 ) -> Result<
     (
         Option<Commitment>,
         Vec<(MatrixTag, usize)>,
-        Option<Arc<Mmcs<BatchedMerkleTreeBackend<E>>>>,
+        Option<Arc<ChunkAuxMmcsContext<E>>>,
     ),
     ProvingError,
 >
@@ -609,20 +725,40 @@ where
     E: IsField + Send + Sync,
     FieldElement<E>: AsBytes + Send + Sync,
 {
-    let any = outputs.iter().any(|o| o.is_some());
-    if !any {
+    if aux_outputs.is_empty() {
         return Ok((None, Vec::new(), None));
     }
-    let mut builder: MmcsBuilder<BatchedMerkleTreeBackend<E>> = MmcsBuilder::new();
-    for out in outputs.iter().flatten() {
-        builder
-            .add_matrix(out.tag, out.leaves.clone())
-            .map_err(map_mmcs_err)?;
+    debug_assert_eq!(aux_outputs.len(), chunk_aux_lde_for_shared.len());
+
+    let mut aux_outputs = aux_outputs;
+    aux_outputs.sort_by(|a, b| b.2.cmp(&a.2).then(a.0.cmp(&b.0)));
+    let lde_by_tag: std::collections::BTreeMap<MatrixTag, Arc<Vec<Vec<FieldElement<E>>>>> =
+        chunk_aux_lde_for_shared.into_iter().collect();
+
+    let mut builder: StreamingMmcsBuilder<BatchedMerkleTreeBackend<E>> =
+        StreamingMmcsBuilder::new();
+    let mut lde_columns_in_spec_order: Vec<Arc<Vec<Vec<FieldElement<E>>>>> =
+        Vec::with_capacity(aux_outputs.len());
+    for (tag, leaves, _padded_height) in aux_outputs {
+        let lde = lde_by_tag
+            .get(&tag)
+            .ok_or_else(|| {
+                ProvingError::WrongParameter(format!(
+                    "missing chunk aux LDE for tag {tag:?} during chunk MMCS build"
+                ))
+            })?
+            .clone();
+        lde_columns_in_spec_order.push(lde);
+        builder.add_matrix(tag, leaves).map_err(map_mmcs_err)?;
     }
     let mmcs = builder.finalize().map_err(map_mmcs_err)?;
     let root = *mmcs.root();
     let spec = mmcs.spec();
-    Ok((Some(root), spec, Some(Arc::new(mmcs))))
+    let ctx = Arc::new(ChunkAuxMmcsContext {
+        mmcs: Arc::new(mmcs),
+        lde_columns_in_spec_order,
+    });
+    Ok((Some(root), spec, Some(ctx)))
 }
 
 /// Tagged per-row leaf digest for the main-trace MMCS.
@@ -987,7 +1123,14 @@ pub trait IsStarkProver<
             Vec::new()
         };
 
-        Ok(commitment.build_round1(Lde { main, aux }, air.step_size(), domain.blowup_factor))
+        Ok(commitment.build_round1(
+            Lde {
+                main: Arc::new(main),
+                aux: Arc::new(aux),
+            },
+            air.step_size(),
+            domain.blowup_factor,
+        ))
     }
 
     /// Reconstruct Round1 for every table, print the bus balance report, and
@@ -1659,7 +1802,9 @@ pub trait IsStarkProver<
             );
 
             let aux_trace_polys = round_1_result.aux.as_ref().map(|aux| {
-                let AuxCommit::Shared { mmcs, padded_height, .. } = aux;
+                let AuxCommit::Shared { chunk_ctx, padded_height, .. } = aux;
+                let mmcs = &chunk_ctx.mmcs;
+                let lde_in_spec_order = &chunk_ctx.lde_columns_in_spec_order;
                 let max_height = mmcs
                     .spec()
                     .first()
@@ -1673,11 +1818,23 @@ pub trait IsStarkProver<
                 let evaluations = lde_trace.gather_aux_row(reverse_index(primary, domain_size));
                 let evaluations_sym = lde_trace.gather_aux_row(reverse_index(sym, domain_size));
                 let mmcs_opening = mmcs
-                    .open(primary << shift)
-                    .expect("aux MMCS open: prover-side primary index in range");
+                    .open_with_leaves(primary << shift, |m_idx, local_idx| {
+                        rehash_aux_chip_leaf::<FieldExtension>(
+                            mmcs.spec()[m_idx].0,
+                            &lde_in_spec_order[m_idx],
+                            local_idx,
+                        )
+                    })
+                    .expect("aux MMCS open_with_leaves: primary index in range");
                 let mmcs_opening_sym = mmcs
-                    .open(sym << shift)
-                    .expect("aux MMCS open: prover-side sym index in range");
+                    .open_with_leaves(sym << shift, |m_idx, local_idx| {
+                        rehash_aux_chip_leaf::<FieldExtension>(
+                            mmcs.spec()[m_idx].0,
+                            &lde_in_spec_order[m_idx],
+                            local_idx,
+                        )
+                    })
+                    .expect("aux MMCS open_with_leaves: sym index in range");
                 crate::proof::stark::AuxTraceOpening::Mmcs {
                     evaluations,
                     evaluations_sym,
@@ -1688,10 +1845,12 @@ pub trait IsStarkProver<
 
             let (main_trace_opening, precomputed_trace_opening) = match main_commit {
                 MainCommit::Shared {
-                    mmcs,
+                    chunk_ctx,
                     padded_height,
                     ..
                 } => {
+                    let mmcs = &chunk_ctx.mmcs;
+                    let lde_in_spec_order = &chunk_ctx.lde_columns_in_spec_order;
                     let max_height = mmcs
                         .spec()
                         .first()
@@ -1707,11 +1866,23 @@ pub trait IsStarkProver<
                     let evaluations = lde_trace.gather_main_row(reverse_index(primary, domain_size));
                     let evaluations_sym = lde_trace.gather_main_row(reverse_index(sym, domain_size));
                     let mmcs_opening = mmcs
-                        .open(primary << shift)
-                        .expect("MMCS open: prover-side primary index in range");
+                        .open_with_leaves(primary << shift, |m_idx, local_idx| {
+                            rehash_main_chip_leaf::<Field>(
+                                mmcs.spec()[m_idx].0,
+                                &lde_in_spec_order[m_idx],
+                                local_idx,
+                            )
+                        })
+                        .expect("main MMCS open_with_leaves: primary index in range");
                     let mmcs_opening_sym = mmcs
-                        .open(sym << shift)
-                        .expect("MMCS open: prover-side sym index in range");
+                        .open_with_leaves(sym << shift, |m_idx, local_idx| {
+                            rehash_main_chip_leaf::<Field>(
+                                mmcs.spec()[m_idx].0,
+                                &lde_in_spec_order[m_idx],
+                                local_idx,
+                            )
+                        })
+                        .expect("main MMCS open_with_leaves: sym index in range");
                     let opening = MainTraceOpening::Mmcs {
                         evaluations,
                         evaluations_sym,
@@ -1896,17 +2067,26 @@ pub trait IsStarkProver<
         #[cfg(feature = "instruments")]
         let phase_start = Instant::now();
 
-        let mut phase_a_outputs: Vec<MainPhaseAOutput<Field>> = Vec::with_capacity(num_airs);
-        let mut main_ldes: Vec<Vec<Vec<FieldElement<Field>>>> = Vec::with_capacity(num_airs);
+        // Per-chunk MMCS: each chunk of K tables builds its own streaming
+        // MMCS, sharing chunk LDEs via Arc so per-query opens can rehash
+        // chunk-mate rows on demand. Phase A absorb order: per table in
+        // spec order, absorb preprocessed + main-tree roots (preprocessed
+        // only); after each chunk, absorb the chunk's MMCS root (`Some`)
+        // or skip when the chunk has no Shared tables (`None`).
+        let mut main_commits: Vec<Option<MainCommit<Field>>> = (0..num_airs).map(|_| None).collect();
+        let mut main_ldes: Vec<Option<Arc<Vec<Vec<FieldElement<Field>>>>>> =
+            (0..num_airs).map(|_| None).collect();
+        let mut main_mmcs_roots_per_chunk: Vec<Option<Commitment>> = Vec::new();
+        let mut main_mmcs_specs_per_chunk: Vec<Vec<(MatrixTag, usize)>> = Vec::new();
 
         for chunk_start in (0..num_airs).step_by(k) {
             let chunk_end = (chunk_start + k).min(num_airs);
             let chunk_range = chunk_start..chunk_end;
 
             #[cfg(feature = "parallel")]
-            let iter = chunk_range.into_par_iter();
+            let iter = chunk_range.clone().into_par_iter();
             #[cfg(not(feature = "parallel"))]
-            let iter = chunk_range;
+            let iter = chunk_range.clone();
 
             let chunk_results: Vec<Result<_, ProvingError>> = iter
                 .map(|idx| {
@@ -1930,56 +2110,90 @@ pub trait IsStarkProver<
                 })
                 .collect();
 
-            // Sequential: per table, absorb its preprocessed root and then
-            // its own per-table multiplicities root (preprocessed only). The
-            // shared MMCS root is absorbed once after the loop. Order must
-            // match the verifier replay.
-            for result in chunk_results {
-                let (output, cached_main) = result?;
+            // Sequential: absorb per-table preprocessed + main-tree roots
+            // (preprocessed only) in order, then build this chunk's MMCS
+            // from the chunk's Shared outputs and absorb its root.
+            let mut chunk_shared_outputs: Vec<(MatrixTag, Vec<Commitment>, usize)> = Vec::new();
+            let mut chunk_shared_ldes: Vec<(MatrixTag, Arc<Vec<Vec<FieldElement<Field>>>>)> =
+                Vec::new();
+            let chunk_idx = main_mmcs_roots_per_chunk.len();
+            let chunk_outputs: Vec<_> = chunk_results.into_iter().collect::<Result<_, _>>()?;
+            for (offset, (output, cached_main)) in chunk_outputs.into_iter().enumerate() {
+                let idx = chunk_start + offset;
                 if let Some(ref pre_root) = output.precomputed_root() {
                     transcript.append_bytes(pre_root);
                 }
                 if let Some(ref main_root) = output.main_tree_root() {
                     transcript.append_bytes(main_root);
                 }
-                phase_a_outputs.push(output);
-                main_ldes.push(cached_main);
+                let cached_main_arc = Arc::new(cached_main);
+                main_ldes[idx] = Some(Arc::clone(&cached_main_arc));
+                match output {
+                    MainPhaseAOutput::Shared {
+                        tag,
+                        leaves,
+                        padded_height,
+                    } => {
+                        chunk_shared_outputs.push((tag, leaves, padded_height));
+                        chunk_shared_ldes.push((tag, cached_main_arc));
+                        // MainCommit::Shared placeholder filled in after chunk MMCS build.
+                        main_commits[idx] = None;
+                    }
+                    MainPhaseAOutput::Preprocessed {
+                        multiplicities_tree,
+                        multiplicities_root,
+                        precomputed_tree,
+                        precomputed_root,
+                        num_precomputed_cols,
+                    } => {
+                        main_commits[idx] = Some(MainCommit::Preprocessed {
+                            multiplicities_tree,
+                            multiplicities_root,
+                            precomputed_tree,
+                            precomputed_root,
+                            num_precomputed_cols,
+                        });
+                    }
+                }
             }
-        }
 
-        // Build the unified main-trace MMCS once over Shared (non-preprocessed)
-        // entries. Preprocessed tables stay out of the MMCS and keep their
-        // own per-table Merkle trees (already absorbed above).
-        let (main_mmcs_root, main_mmcs_spec, mmcs_arc) =
-            build_main_mmcs::<Field>(&phase_a_outputs)?;
-        transcript.append_bytes(&main_mmcs_root);
+            let (chunk_root, chunk_spec, chunk_ctx_opt) =
+                build_chunk_main_mmcs::<Field>(chunk_shared_outputs, chunk_shared_ldes)?;
+            if let Some(ref root) = chunk_root {
+                transcript.append_bytes(root);
+            }
+            main_mmcs_roots_per_chunk.push(chunk_root);
+            main_mmcs_specs_per_chunk.push(chunk_spec.clone());
+
+            // Fill in MainCommit::Shared for this chunk's Shared tables.
+            if let Some(chunk_ctx) = chunk_ctx_opt {
+                // chunk_spec is in MMCS sort order (height desc, tag asc).
+                // Use tag → padded_height lookup to populate Shared variants.
+                let height_by_tag: std::collections::BTreeMap<MatrixTag, usize> =
+                    chunk_spec.iter().copied().collect();
+                for idx in chunk_range.clone() {
+                    if main_commits[idx].is_none() {
+                        let tag = main_tags[idx];
+                        if let Some(&padded_height) = height_by_tag.get(&tag) {
+                            main_commits[idx] = Some(MainCommit::Shared {
+                                chunk_ctx: Arc::clone(&chunk_ctx),
+                                chunk_idx,
+                                tag,
+                                padded_height,
+                            });
+                        }
+                    }
+                }
+            }
+        }
 
-        let main_commits: Vec<MainCommit<Field>> = phase_a_outputs
+        let main_commits: Vec<MainCommit<Field>> = main_commits
             .into_iter()
-            .map(|o| match o {
-                MainPhaseAOutput::Shared {
-                    tag,
-                    padded_height,
-                    leaves: _,
-                } => MainCommit::Shared {
-                    mmcs: Arc::clone(&mmcs_arc),
-                    tag,
-                    padded_height,
-                },
-                MainPhaseAOutput::Preprocessed {
-                    multiplicities_tree,
-                    multiplicities_root,
-                    precomputed_tree,
-                    precomputed_root,
-                    num_precomputed_cols,
-                } => MainCommit::Preprocessed {
-                    multiplicities_tree,
-                    multiplicities_root,
-                    precomputed_tree,
-                    precomputed_root,
-                    num_precomputed_cols,
-                },
-            })
+            .map(|c| c.expect("main commit populated for every table"))
+            .collect();
+        let main_ldes: Vec<Arc<Vec<Vec<FieldElement<Field>>>>> = main_ldes
+            .into_iter()
+            .map(|l| l.expect("main LDE populated for every table"))
             .collect();
 
         #[cfg(feature = "instruments")]
@@ -2064,21 +2278,25 @@ pub trait IsStarkProver<
         #[cfg(feature = "instruments")]
         let phase_start = Instant::now();
 
-        // Per-table aux Phase-C outputs. `None` entries are tables with no
-        // aux trace and contribute neither leaves nor an MMCS slot.
-        let mut aux_outputs: Vec<Option<AuxPhaseCOutput<FieldExtension>>> =
-            Vec::with_capacity(num_airs);
-        let mut aux_ldes: Vec<Vec<Vec<FieldElement<FieldExtension>>>> =
+        // Per-chunk aux MMCS: mirror of Phase A main, applied to the aux
+        // trace. Each chunk's aux MMCS root is absorbed into the SHARED
+        // transcript BEFORE per-table forking so every fork sees the
+        // same per-chunk aux binding identically.
+        let mut aux_commits: Vec<Option<AuxCommit<FieldExtension>>> =
+            (0..num_airs).map(|_| None).collect();
+        let mut aux_ldes_arc: Vec<Arc<Vec<Vec<FieldElement<FieldExtension>>>>> =
             Vec::with_capacity(num_airs);
+        let mut aux_mmcs_roots_per_chunk: Vec<Option<Commitment>> = Vec::new();
+        let mut aux_mmcs_specs_per_chunk: Vec<Vec<(MatrixTag, usize)>> = Vec::new();
 
         for chunk_start in (0..num_airs).step_by(k) {
             let chunk_end = (chunk_start + k).min(num_airs);
             let chunk_range = chunk_start..chunk_end;
 
             #[cfg(feature = "parallel")]
-            let iter = chunk_range.into_par_iter();
+            let iter = chunk_range.clone().into_par_iter();
             #[cfg(not(feature = "parallel"))]
-            let iter = chunk_range;
+            let iter = chunk_range.clone();
 
             let chunk_aux: Vec<Result<_, ProvingError>> = iter
                 .map(|idx| {
@@ -2126,26 +2344,60 @@ pub trait IsStarkProver<
                 })
                 .collect();
 
-            for result in chunk_aux {
-                let (output, cached_aux) = result?;
-                aux_outputs.push(output);
-                aux_ldes.push(cached_aux);
+            let chunk_idx = aux_mmcs_roots_per_chunk.len();
+            let mut chunk_aux_outputs: Vec<(MatrixTag, Vec<Commitment>, usize)> = Vec::new();
+            let mut chunk_aux_ldes: Vec<(MatrixTag, Arc<Vec<Vec<FieldElement<FieldExtension>>>>)> =
+                Vec::new();
+            let chunk_outputs: Vec<_> = chunk_aux.into_iter().collect::<Result<_, _>>()?;
+            for (offset, (maybe_output, cached_aux)) in chunk_outputs.into_iter().enumerate() {
+                let idx = chunk_start + offset;
+                let cached_arc = Arc::new(cached_aux);
+                aux_ldes_arc.push(Arc::clone(&cached_arc));
+                if let Some(out) = maybe_output {
+                    let AuxPhaseCOutput {
+                        tag,
+                        leaves,
+                        padded_height,
+                        ..
+                    } = out;
+                    chunk_aux_outputs.push((tag, leaves, padded_height));
+                    chunk_aux_ldes.push((tag, cached_arc));
+                    aux_commits[idx] = None; // filled in after MMCS build
+                } else {
+                    aux_commits[idx] = None;
+                }
             }
-        }
-
-        // Build the shared aux MMCS over the non-None entries. Order is
-        // spec-fixed (matches `main_tags` order, filtered to has-aux).
-        let (aux_mmcs_root_opt, aux_mmcs_spec, aux_mmcs_arc) =
-            build_aux_mmcs::<FieldExtension>(&aux_outputs)?;
 
-        // Absorb the aux MMCS root into the SHARED transcript before
-        // forking — every table's fork inherits this binding identically.
-        if let Some(ref root) = aux_mmcs_root_opt {
-            transcript.append_bytes(root);
+            let (chunk_root, chunk_spec, chunk_ctx_opt) =
+                build_chunk_aux_mmcs::<FieldExtension>(chunk_aux_outputs, chunk_aux_ldes)?;
+            if let Some(ref root) = chunk_root {
+                transcript.append_bytes(root);
+            }
+            aux_mmcs_roots_per_chunk.push(chunk_root);
+            aux_mmcs_specs_per_chunk.push(chunk_spec.clone());
+
+            if let Some(chunk_ctx) = chunk_ctx_opt {
+                let height_by_tag: std::collections::BTreeMap<MatrixTag, usize> =
+                    chunk_spec.iter().copied().collect();
+                for idx in chunk_range.clone() {
+                    let (air, _, _) = &air_trace_pairs[idx];
+                    if air.has_aux_trace() {
+                        let tag = main_tags[idx];
+                        if let Some(&padded_height) = height_by_tag.get(&tag) {
+                            aux_commits[idx] = Some(AuxCommit::Shared {
+                                chunk_ctx: Arc::clone(&chunk_ctx),
+                                chunk_idx,
+                                tag,
+                                padded_height,
+                            });
+                        }
+                    }
+                }
+            }
         }
 
         // Pre-fork all transcripts (cheap, sequential — must match verifier ordering).
-        // Happens AFTER aux MMCS absorb so each fork inherits the binding.
+        // Happens AFTER all per-chunk aux MMCS roots have been absorbed.
         let mut table_transcripts: Vec<_> = (0..num_airs)
             .map(|idx| {
                 let mut t = transcript.clone();
@@ -2156,28 +2408,13 @@ pub trait IsStarkProver<
             })
             .collect();
 
-        // Reassemble per-table aux commits from the shared MMCS Arc.
-        let aux_commits: Vec<Option<AuxCommit<FieldExtension>>> = aux_outputs
-            .into_iter()
-            .map(|o| {
-                o.map(|out| AuxCommit::Shared {
-                    mmcs: Arc::clone(
-                        aux_mmcs_arc
-                            .as_ref()
-                            .expect("MMCS Arc populated when at least one aux output present"),
-                    ),
-                    tag: out.tag,
-                    padded_height: out.padded_height,
-                })
-            })
-            .collect();
         #[allow(clippy::type_complexity)]
         let aux_results: Vec<(
             Option<AuxCommit<FieldExtension>>,
-            Vec<Vec<FieldElement<FieldExtension>>>,
+            Arc<Vec<Vec<FieldElement<FieldExtension>>>>,
         )> = aux_commits
             .into_iter()
-            .zip(aux_ldes)
+            .zip(aux_ldes_arc)
             .collect();
 
         // Build commitments and cached LDEs as separate vecs:
@@ -2328,10 +2565,11 @@ pub trait IsStarkProver<
 
         Ok(MultiProof {
             proofs,
-            main_mmcs_root,
-            main_mmcs_spec,
-            aux_mmcs_root: aux_mmcs_root_opt,
-            aux_mmcs_spec,
+            main_mmcs_roots: main_mmcs_roots_per_chunk,
+            main_mmcs_specs: main_mmcs_specs_per_chunk,
+            aux_mmcs_roots: aux_mmcs_roots_per_chunk,
+            aux_mmcs_specs: aux_mmcs_specs_per_chunk,
+            chunk_size: k as u32,
         })
     }
 
diff --git a/crypto/stark/src/tests/mmcs_aux_soundness_tests.rs b/crypto/stark/src/tests/mmcs_aux_soundness_tests.rs
index d01d4a924..cfa4828f4 100644
--- a/crypto/stark/src/tests/mmcs_aux_soundness_tests.rs
+++ b/crypto/stark/src/tests/mmcs_aux_soundness_tests.rs
@@ -120,11 +120,31 @@ fn first_aux_mmcs_opening_mut(
         .expect("baseline must have aux openings")
 }
 
+/// First chunk index whose aux MMCS root is `Some`.
+fn first_populated_aux_chunk(proof: &MultiProof<F, E, LogReadOnlyPublicInputs<F>>) -> usize {
+    proof
+        .aux_mmcs_roots
+        .iter()
+        .position(|r| r.is_some())
+        .expect("at least one chunk must have an aux MMCS root in this baseline")
+}
+
 #[test_log::test]
 fn baseline_two_rap_tables_verify() {
     let (air_1, air_2, proof) = baseline_proof();
-    assert!(proof.aux_mmcs_root.is_some(), "aux MMCS must be present");
-    assert_eq!(proof.aux_mmcs_spec.len(), 2, "both AIRs contribute aux");
+    assert!(
+        proof.aux_mmcs_roots.iter().any(|r| r.is_some()),
+        "at least one chunk's aux MMCS must be present"
+    );
+    assert!(
+        proof
+            .aux_mmcs_specs
+            .iter()
+            .map(|s| s.len())
+            .sum::<usize>()
+            == 2,
+        "both AIRs contribute aux"
+    );
     let airs: Vec<&dyn AIR<Field = F, FieldExtension = E, PublicInputs = LogReadOnlyPublicInputs<F>>> =
         vec![&air_1, &air_2];
     assert!(verify(&airs, &proof), "baseline aux proof must verify");
@@ -135,7 +155,10 @@ fn tampered_aux_mmcs_root_rejected() {
     let (air_1, air_2, mut proof) = baseline_proof();
     let airs: Vec<&dyn AIR<Field = F, FieldExtension = E, PublicInputs = LogReadOnlyPublicInputs<F>>> =
         vec![&air_1, &air_2];
-    let root = proof.aux_mmcs_root.as_mut().expect("baseline has root");
+    let chunk_idx = first_populated_aux_chunk(&proof);
+    let root = proof.aux_mmcs_roots[chunk_idx]
+        .as_mut()
+        .expect("populated");
     root[0] ^= 1;
     assert!(!verify(&airs, &proof));
 }
@@ -145,8 +168,12 @@ fn missing_aux_mmcs_root_rejected() {
     let (air_1, air_2, mut proof) = baseline_proof();
     let airs: Vec<&dyn AIR<Field = F, FieldExtension = E, PublicInputs = LogReadOnlyPublicInputs<F>>> =
         vec![&air_1, &air_2];
-    proof.aux_mmcs_root = None;
-    assert!(!verify(&airs, &proof));
+    let chunk_idx = first_populated_aux_chunk(&proof);
+    proof.aux_mmcs_roots[chunk_idx] = None;
+    assert!(
+        !verify(&airs, &proof),
+        "aux_mmcs_root=None while chunk has aux tables must be rejected"
+    );
 }
 
 #[test_log::test]
@@ -154,7 +181,8 @@ fn tampered_aux_mmcs_spec_height_rejected() {
     let (air_1, air_2, mut proof) = baseline_proof();
     let airs: Vec<&dyn AIR<Field = F, FieldExtension = E, PublicInputs = LogReadOnlyPublicInputs<F>>> =
         vec![&air_1, &air_2];
-    proof.aux_mmcs_spec[0].1 /= 2;
+    let chunk_idx = first_populated_aux_chunk(&proof);
+    proof.aux_mmcs_specs[chunk_idx][0].1 /= 2;
     assert!(!verify(&airs, &proof));
 }
 
@@ -163,7 +191,8 @@ fn tampered_aux_mmcs_spec_tag_rejected() {
     let (air_1, air_2, mut proof) = baseline_proof();
     let airs: Vec<&dyn AIR<Field = F, FieldExtension = E, PublicInputs = LogReadOnlyPublicInputs<F>>> =
         vec![&air_1, &air_2];
-    proof.aux_mmcs_spec[0].0 = MatrixTag::new([0xFF; 8]);
+    let chunk_idx = first_populated_aux_chunk(&proof);
+    proof.aux_mmcs_specs[chunk_idx][0].0 = MatrixTag::new([0xFF; 8]);
     assert!(!verify(&airs, &proof));
 }
 
diff --git a/crypto/stark/src/tests/mmcs_soundness_tests.rs b/crypto/stark/src/tests/mmcs_soundness_tests.rs
index 0a690e085..ab0c8912f 100644
--- a/crypto/stark/src/tests/mmcs_soundness_tests.rs
+++ b/crypto/stark/src/tests/mmcs_soundness_tests.rs
@@ -26,11 +26,7 @@ type F = GoldilocksField;
 /// Build a baseline multi-proof over (DummyAIR, BitFlagsAIR). Both are
 /// non-preprocessed → every main opening is `MainTraceOpening::Mmcs`.
 #[allow(clippy::type_complexity)]
-fn baseline_proof() -> (
-    DummyAIR,
-    BitFlagsAIR,
-    MultiProof<F, F, ()>,
-) {
+fn baseline_proof() -> (DummyAIR, BitFlagsAIR, MultiProof<F, F, ()>) {
     let proof_options = ProofOptions::default_test_options();
     let air_1 = DummyAIR::new(&proof_options);
     let air_2 = BitFlagsAIR::new(&proof_options);
@@ -40,16 +36,15 @@ fn baseline_proof() -> (
         &dyn AIR<Field = F, FieldExtension = F, PublicInputs = ()>,
         &mut _,
         &_,
-    )> = vec![
-        (&air_1, &mut trace_1, &()),
-        (&air_2, &mut trace_2, &()),
-    ];
-    let proof =
-        multi_prove_ram(air_trace_pairs, &mut DefaultTranscript::<F>::new(&[])).unwrap();
+    )> = vec![(&air_1, &mut trace_1, &()), (&air_2, &mut trace_2, &())];
+    let proof = multi_prove_ram(air_trace_pairs, &mut DefaultTranscript::<F>::new(&[])).unwrap();
     (air_1, air_2, proof)
 }
 
-fn verify(airs: &[&dyn AIR<Field = F, FieldExtension = F, PublicInputs = ()>], proof: &MultiProof<F, F, ()>) -> bool {
+fn verify(
+    airs: &[&dyn AIR<Field = F, FieldExtension = F, PublicInputs = ()>],
+    proof: &MultiProof<F, F, ()>,
+) -> bool {
     multi_verify_ram(
         airs,
         proof,
@@ -58,12 +53,18 @@ fn verify(airs: &[&dyn AIR<Field = F, FieldExtension = F, PublicInputs = ()>], p
     )
 }
 
-/// First-iota opening for the first table in the multi-proof, in the Mmcs
-/// variant. Helper for tests that need a mutable handle into the per-query
-/// MMCS opening fields.
-fn first_mmcs_opening_mut(
-    proof: &mut MultiProof<F, F, ()>,
-) -> &mut MainTraceOpening<F> {
+/// First chunk index whose main MMCS root is `Some` — i.e., the first
+/// chunk that has at least one non-preprocessed table. Used by the
+/// tampering tests to locate a real root/spec to mutate.
+fn first_populated_main_chunk(proof: &MultiProof<F, F, ()>) -> usize {
+    proof
+        .main_mmcs_roots
+        .iter()
+        .position(|r| r.is_some())
+        .expect("at least one chunk must have a main MMCS root in this baseline")
+}
+
+fn first_mmcs_opening_mut(proof: &mut MultiProof<F, F, ()>) -> &mut MainTraceOpening<F> {
     &mut proof.proofs[0].deep_poly_openings[0].main_trace_polys
 }
 
@@ -80,7 +81,11 @@ fn tampered_main_mmcs_root_rejected() {
     let (air_1, air_2, mut proof) = baseline_proof();
     let airs: Vec<&dyn AIR<Field = F, FieldExtension = F, PublicInputs = ()>> =
         vec![&air_1, &air_2];
-    proof.main_mmcs_root[0] ^= 1;
+    let chunk_idx = first_populated_main_chunk(&proof);
+    let root = proof.main_mmcs_roots[chunk_idx]
+        .as_mut()
+        .expect("populated");
+    root[0] ^= 1;
     assert!(
         !verify(&airs, &proof),
         "tampered main MMCS root must be rejected"
@@ -92,8 +97,8 @@ fn tampered_main_mmcs_spec_height_rejected() {
     let (air_1, air_2, mut proof) = baseline_proof();
     let airs: Vec<&dyn AIR<Field = F, FieldExtension = F, PublicInputs = ()>> =
         vec![&air_1, &air_2];
-    let height = &mut proof.main_mmcs_spec[0].1;
-    *height /= 2;
+    let chunk_idx = first_populated_main_chunk(&proof);
+    proof.main_mmcs_specs[chunk_idx][0].1 /= 2;
     assert!(
         !verify(&airs, &proof),
         "spec height mismatch must be rejected"
@@ -105,13 +110,25 @@ fn tampered_main_mmcs_spec_tag_rejected() {
     let (air_1, air_2, mut proof) = baseline_proof();
     let airs: Vec<&dyn AIR<Field = F, FieldExtension = F, PublicInputs = ()>> =
         vec![&air_1, &air_2];
-    proof.main_mmcs_spec[0].0 = MatrixTag::new([0xFF; 8]);
+    let chunk_idx = first_populated_main_chunk(&proof);
+    proof.main_mmcs_specs[chunk_idx][0].0 = MatrixTag::new([0xFF; 8]);
     assert!(
         !verify(&airs, &proof),
         "spec tag mismatch must be rejected"
     );
 }
 
+#[test_log::test]
+fn tampered_chunk_size_rejected() {
+    // Pinned chunk_size mismatch should produce verifier rejection (per-chunk
+    // Vec lengths no longer line up with the verifier's chunking).
+    let (air_1, air_2, mut proof) = baseline_proof();
+    let airs: Vec<&dyn AIR<Field = F, FieldExtension = F, PublicInputs = ()>> =
+        vec![&air_1, &air_2];
+    proof.chunk_size = proof.chunk_size.saturating_add(1);
+    assert!(!verify(&airs, &proof), "tampered chunk_size must be rejected");
+}
+
 #[test_log::test]
 fn tampered_mmcs_opening_leaf_rejected() {
     let (air_1, air_2, mut proof) = baseline_proof();
@@ -202,14 +219,13 @@ fn tampered_evaluations_rejected() {
 #[test_log::test]
 fn swapped_main_tags_at_verifier_rejected() {
     // The verifier reproduces `main_tags` from `synth_main_tags(num_airs)`
-    // inside `multi_verify_ram`. To simulate a verifier that "lies" about
-    // tag ordering we call `multi_verify` directly with a permuted slice.
+    // inside `multi_verify_ram`. Simulate a verifier that "lies" about
+    // tag ordering by calling `multi_verify` directly with a permuted slice.
     use crate::verifier::{IsStarkVerifier, Verifier};
     let (air_1, air_2, proof) = baseline_proof();
     let airs: Vec<&dyn AIR<Field = F, FieldExtension = F, PublicInputs = ()>> =
         vec![&air_1, &air_2];
 
-    // Sanity: with the correct (synth) tag order it passes.
     let correct = synth_main_tags(airs.len());
     assert!(
         Verifier::multi_verify(
@@ -222,8 +238,6 @@ fn swapped_main_tags_at_verifier_rejected() {
         "baseline must verify with correct tags"
     );
 
-    // Swap the two tags — the spec sort order is now wrong relative to the
-    // prover's commitments, so the spec match check must reject.
     let mut swapped = correct.clone();
     swapped.swap(0, 1);
     assert!(
diff --git a/crypto/stark/src/verifier.rs b/crypto/stark/src/verifier.rs
index 569221ce0..95165a253 100644
--- a/crypto/stark/src/verifier.rs
+++ b/crypto/stark/src/verifier.rs
@@ -350,7 +350,7 @@ pub trait IsStarkVerifier<
         deep_poly_openings: &DeepPolynomialOpening<Field, FieldExtension>,
         iota: usize,
         main_tag: crypto::merkle_tree::mmcs::MatrixTag,
-        main_mmcs_root: &Commitment,
+        main_mmcs_root: Option<&Commitment>,
         main_mmcs_spec: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)],
         aux_mmcs_root: Option<&Commitment>,
         aux_mmcs_spec: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)],
@@ -405,7 +405,7 @@ pub trait IsStarkVerifier<
         main_opening: &crate::proof::stark::MainTraceOpening<Field>,
         iota: usize,
         main_tag: crypto::merkle_tree::mmcs::MatrixTag,
-        main_mmcs_root: &Commitment,
+        main_mmcs_root: Option<&Commitment>,
         main_mmcs_spec: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)],
     ) -> bool
     where
@@ -452,7 +452,7 @@ pub trait IsStarkVerifier<
         proof: &StarkProof<Field, FieldExtension, PI>,
         challenges: &Challenges<FieldExtension>,
         main_tag: crypto::merkle_tree::mmcs::MatrixTag,
-        main_mmcs_root: &Commitment,
+        main_mmcs_root: Option<&Commitment>,
         main_mmcs_spec: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)],
         aux_mmcs_root: Option<&Commitment>,
         aux_mmcs_spec: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)],
@@ -804,65 +804,106 @@ pub trait IsStarkVerifier<
         // cross-check `main_mmcs_spec` against the (tag, padded_height_lde)
         // pairs reproduced from the AIRs.
 
-        let mut expected_spec: Vec<(crypto::merkle_tree::mmcs::MatrixTag, usize)> =
-            Vec::with_capacity(airs.len());
-        for (idx, (air, proof)) in airs.iter().zip(&multi_proof.proofs).enumerate() {
-            let lde_size = proof.trace_length * (air.options().blowup_factor as usize);
-            if air.is_preprocessed() {
-                // Preprocessed table: validate + absorb both its AIR-pinned
-                // precomputed root and its own per-table multiplicities root.
-                // Stays OUT of the shared MMCS spec.
-                let expected_precomputed = air.precomputed_commitment();
-                match &proof.lde_trace_precomputed_merkle_root {
-                    Some(actual) if *actual == expected_precomputed => {}
-                    Some(actual) => {
-                        error!(
-                            "Preprocessed commitment MISMATCH for table {idx}: expected {:?}, got {:?}",
-                            expected_precomputed, actual
-                        );
-                        return false;
+        // Per-chunk Phase A replay: chunk tables of size `chunk_size`. For
+        // each table absorb its preprocessed root + per-table main root
+        // (preprocessed only); at the end of each chunk, validate the
+        // chunk's main MMCS spec and absorb the chunk's main MMCS root
+        // (`Some`) or skip (`None` when the chunk has no non-preprocessed
+        // tables). Must match `multi_prove` Phase A absorb order exactly.
+        let chunk_size = multi_proof.chunk_size as usize;
+        if chunk_size == 0 {
+            error!("multi_proof.chunk_size is zero");
+            return false;
+        }
+        let expected_num_chunks = (airs.len() + chunk_size - 1) / chunk_size;
+        if multi_proof.main_mmcs_roots.len() != expected_num_chunks
+            || multi_proof.main_mmcs_specs.len() != expected_num_chunks
+            || multi_proof.aux_mmcs_roots.len() != expected_num_chunks
+            || multi_proof.aux_mmcs_specs.len() != expected_num_chunks
+        {
+            error!(
+                "per-chunk MMCS Vec lengths inconsistent with chunk_size={chunk_size}:                  expected {expected_num_chunks} chunks; got main_roots={}, main_specs={},                  aux_roots={}, aux_specs={}",
+                multi_proof.main_mmcs_roots.len(),
+                multi_proof.main_mmcs_specs.len(),
+                multi_proof.aux_mmcs_roots.len(),
+                multi_proof.aux_mmcs_specs.len(),
+            );
+            return false;
+        }
+
+        for chunk_idx in 0..expected_num_chunks {
+            let chunk_start = chunk_idx * chunk_size;
+            let chunk_end = (chunk_start + chunk_size).min(airs.len());
+
+            let mut expected_spec: Vec<(crypto::merkle_tree::mmcs::MatrixTag, usize)> =
+                Vec::new();
+            for idx in chunk_start..chunk_end {
+                let (air, proof) = (airs[idx], &multi_proof.proofs[idx]);
+                let lde_size = proof.trace_length * (air.options().blowup_factor as usize);
+                if air.is_preprocessed() {
+                    let expected_precomputed = air.precomputed_commitment();
+                    match &proof.lde_trace_precomputed_merkle_root {
+                        Some(actual) if *actual == expected_precomputed => {}
+                        Some(actual) => {
+                            error!(
+                                "Preprocessed commitment MISMATCH for table {idx}: expected {:?}, got {:?}",
+                                expected_precomputed, actual
+                            );
+                            return false;
+                        }
+                        None => {
+                            error!("Preprocessed table {idx} proof missing precomputed commitment");
+                            return false;
+                        }
                     }
-                    None => {
-                        error!("Preprocessed table {idx} proof missing precomputed commitment");
-                        return false;
+                    transcript.append_bytes(&expected_precomputed);
+                    match &proof.lde_trace_main_merkle_root {
+                        Some(root) => transcript.append_bytes(root),
+                        None => {
+                            error!(
+                                "Preprocessed table {idx} proof missing multiplicities Merkle root"
+                            );
+                            return false;
+                        }
                     }
-                }
-                transcript.append_bytes(&expected_precomputed);
-
-                match &proof.lde_trace_main_merkle_root {
-                    Some(root) => transcript.append_bytes(root),
-                    None => {
+                } else {
+                    if proof.lde_trace_main_merkle_root.is_some() {
                         error!(
-                            "Preprocessed table {idx} proof missing multiplicities Merkle root"
+                            "Non-preprocessed table {idx} unexpectedly supplied a per-table main root"
                         );
                         return false;
                     }
+                    expected_spec.push((main_tags[idx], lde_size));
                 }
-            } else {
-                // Non-preprocessed table: nothing per-table; the shared MMCS
-                // root absorbed below covers its main columns.
-                if proof.lde_trace_main_merkle_root.is_some() {
-                    error!(
-                        "Non-preprocessed table {idx} unexpectedly supplied a per-table main root"
-                    );
+            }
+
+            // Deterministic sort matches `MmcsBuilder::finalize`
+            // (height desc, tag asc) — same as the streaming builder.
+            expected_spec.sort_by(|a, b| b.1.cmp(&a.1).then(a.0.cmp(&b.0)));
+            if expected_spec != multi_proof.main_mmcs_specs[chunk_idx] {
+                error!(
+                    "chunk {chunk_idx} main_mmcs_spec mismatch: expected {:?}, got {:?}",
+                    expected_spec, multi_proof.main_mmcs_specs[chunk_idx],
+                );
+                return false;
+            }
+            match (
+                &multi_proof.main_mmcs_roots[chunk_idx],
+                expected_spec.is_empty(),
+            ) {
+                (Some(root), false) => transcript.append_bytes(root),
+                (None, true) => {}
+                (Some(_), true) => {
+                    error!("chunk {chunk_idx} main_mmcs_root present but no Shared tables");
+                    return false;
+                }
+                (None, false) => {
+                    error!("chunk {chunk_idx} main_mmcs_root missing but Shared tables exist");
                     return false;
                 }
-                expected_spec.push((main_tags[idx], lde_size));
             }
         }
 
-        // Deterministic sort matches `MmcsBuilder::finalize` (height desc, tag asc).
-        expected_spec.sort_by(|a, b| b.1.cmp(&a.1).then(a.0.cmp(&b.0)));
-        if expected_spec != multi_proof.main_mmcs_spec {
-            error!(
-                "main_mmcs_spec mismatch: expected {:?}, got {:?}",
-                expected_spec, multi_proof.main_mmcs_spec,
-            );
-            return false;
-        }
-
-        transcript.append_bytes(&multi_proof.main_mmcs_root);
-
         // =====================================================================
         // Round 1, Phase B: Sample shared LogUp challenges
         // =====================================================================
@@ -906,32 +947,45 @@ pub trait IsStarkVerifier<
         // SHARED transcript replaces the per-table aux root absorb of the
         // pre-MMCS protocol. Verify the spec mirrors the prover-side
         // filtered-by-has_aux_trace order before binding.
-        let mut expected_aux_spec: Vec<(crypto::merkle_tree::mmcs::MatrixTag, usize)> =
-            Vec::new();
-        for (idx, (air, proof)) in airs.iter().zip(&multi_proof.proofs).enumerate() {
-            if air.has_aux_trace() {
-                let lde_size = proof.trace_length * (air.options().blowup_factor as usize);
-                expected_aux_spec.push((main_tags[idx], lde_size));
+        // Per-chunk Phase C replay (aux). Mirrors Phase A: for each chunk,
+        // validate the aux spec + absorb the aux MMCS root (or skip when
+        // the chunk has no aux-bearing tables). Must match `multi_prove`
+        // Phase C absorb order exactly.
+        for chunk_idx in 0..expected_num_chunks {
+            let chunk_start = chunk_idx * chunk_size;
+            let chunk_end = (chunk_start + chunk_size).min(airs.len());
+
+            let mut expected_aux_spec: Vec<(crypto::merkle_tree::mmcs::MatrixTag, usize)> =
+                Vec::new();
+            for idx in chunk_start..chunk_end {
+                let (air, proof) = (airs[idx], &multi_proof.proofs[idx]);
+                if air.has_aux_trace() {
+                    let lde_size = proof.trace_length * (air.options().blowup_factor as usize);
+                    expected_aux_spec.push((main_tags[idx], lde_size));
+                }
             }
-        }
-        expected_aux_spec.sort_by(|a, b| b.1.cmp(&a.1).then(a.0.cmp(&b.0)));
-        if expected_aux_spec != multi_proof.aux_mmcs_spec {
-            error!(
-                "aux_mmcs_spec mismatch: expected {:?}, got {:?}",
-                expected_aux_spec, multi_proof.aux_mmcs_spec,
-            );
-            return false;
-        }
-        match (&multi_proof.aux_mmcs_root, expected_aux_spec.is_empty()) {
-            (Some(root), false) => transcript.append_bytes(root),
-            (None, true) => {}
-            (Some(_), true) => {
-                error!("aux_mmcs_root present but no AIR has an aux trace");
+            expected_aux_spec.sort_by(|a, b| b.1.cmp(&a.1).then(a.0.cmp(&b.0)));
+            if expected_aux_spec != multi_proof.aux_mmcs_specs[chunk_idx] {
+                error!(
+                    "chunk {chunk_idx} aux_mmcs_spec mismatch: expected {:?}, got {:?}",
+                    expected_aux_spec, multi_proof.aux_mmcs_specs[chunk_idx],
+                );
                 return false;
             }
-            (None, false) => {
-                error!("aux_mmcs_root missing but some AIR has an aux trace");
-                return false;
+            match (
+                &multi_proof.aux_mmcs_roots[chunk_idx],
+                expected_aux_spec.is_empty(),
+            ) {
+                (Some(root), false) => transcript.append_bytes(root),
+                (None, true) => {}
+                (Some(_), true) => {
+                    error!("chunk {chunk_idx} aux_mmcs_root present but no aux tables");
+                    return false;
+                }
+                (None, false) => {
+                    error!("chunk {chunk_idx} aux_mmcs_root missing but aux tables exist");
+                    return false;
+                }
             }
         }
 
@@ -957,17 +1011,27 @@ pub trait IsStarkVerifier<
                 table_transcript.append_field_element(&bpi.table_contribution);
             }
 
-            // Rounds 2-4: verify (per-table MMCS context threaded through).
+            // Per-chunk lookup: each table's main / aux MMCS root + spec
+            // come from its chunk.
+            let table_chunk_idx = idx / chunk_size;
+            let main_root_for_chunk =
+                multi_proof.main_mmcs_roots[table_chunk_idx].as_ref();
+            let main_spec_for_chunk: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)] =
+                &multi_proof.main_mmcs_specs[table_chunk_idx];
+            let aux_root_for_chunk = multi_proof.aux_mmcs_roots[table_chunk_idx].as_ref();
+            let aux_spec_for_chunk: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)] =
+                &multi_proof.aux_mmcs_specs[table_chunk_idx];
+
             if !Self::verify_rounds_2_to_4(
                 *air,
                 proof,
                 &mut table_transcript,
                 lookup_challenges.clone(),
                 main_tags[idx],
-                &multi_proof.main_mmcs_root,
-                &multi_proof.main_mmcs_spec,
-                multi_proof.aux_mmcs_root.as_ref(),
-                &multi_proof.aux_mmcs_spec,
+                main_root_for_chunk,
+                main_spec_for_chunk,
+                aux_root_for_chunk,
+                aux_spec_for_chunk,
             ) {
                 error!(
                     "Table {} failed verify_rounds_2_to_4 (num_constraints={}, trace_cols={})",
@@ -1183,7 +1247,7 @@ pub trait IsStarkVerifier<
         transcript: &mut impl IsStarkTranscript<FieldExtension, Field>,
         rap_challenges: Vec<FieldElement<FieldExtension>>,
         main_tag: crypto::merkle_tree::mmcs::MatrixTag,
-        main_mmcs_root: &Commitment,
+        main_mmcs_root: Option<&Commitment>,
         main_mmcs_spec: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)],
         aux_mmcs_root: Option<&Commitment>,
         aux_mmcs_spec: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)],
@@ -1302,7 +1366,7 @@ fn verify_main_mmcs_pair_inner<F>(
     main_opening: &crate::proof::stark::MainTraceOpening<F>,
     iota: usize,
     main_tag: crypto::merkle_tree::mmcs::MatrixTag,
-    main_mmcs_root: &Commitment,
+    main_mmcs_root: Option<&Commitment>,
     main_mmcs_spec: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)],
 ) -> bool
 where
@@ -1322,6 +1386,12 @@ where
         MainTraceOpening::Tree(_) => return false,
     };
 
+    // Shared opening requires a chunk MMCS root; if missing, reject.
+    let main_mmcs_root = match main_mmcs_root {
+        Some(r) => r,
+        None => return false,
+    };
+
     let table_idx = match main_mmcs_spec.iter().position(|(t, _)| *t == main_tag) {
         Some(i) => i,
         None => return false,
diff --git a/prover/src/lib.rs b/prover/src/lib.rs
index dc5073ac9..25146e01a 100644
--- a/prover/src/lib.rs
+++ b/prover/src/lib.rs
@@ -504,23 +504,34 @@ impl VmAirs {
 
 /// Replay the prover's Phase A (main trace commitments) to recover the shared
 /// LogUp challenges (z, alpha). Mirrors `multi_verify` Phase A absorb order:
-/// for each table, absorb its precomputed root and (preprocessed only) its
-/// per-table multiplicities Merkle root; then absorb the shared main-trace
-/// MMCS root once at the end.
+/// for each chunk of `chunk_size` tables, in order, absorb each table's
+/// preprocessed + per-table multiplicities root (preprocessed only); then,
+/// after each chunk, absorb that chunk's main MMCS root (`Some`) or skip
+/// (`None`, when the chunk has no non-preprocessed tables).
 pub(crate) fn replay_transcript_phase_a(
     airs: &[&dyn AIR<Field = F, FieldExtension = E, PublicInputs = ()>],
     multi_proof: &MultiProof<F, E, ()>,
     transcript: &mut DefaultTranscript<E>,
 ) -> (FieldElement<E>, FieldElement<E>) {
-    for (air, proof) in airs.iter().zip(&multi_proof.proofs) {
-        if air.is_preprocessed() {
-            transcript.append_bytes(&air.precomputed_commitment());
-            if let Some(root) = &proof.lde_trace_main_merkle_root {
-                transcript.append_bytes(root);
+    let chunk_size = multi_proof.chunk_size as usize;
+    let num_chunks = multi_proof.main_mmcs_roots.len();
+    for chunk_idx in 0..num_chunks {
+        let chunk_start = chunk_idx * chunk_size;
+        let chunk_end = (chunk_start + chunk_size).min(airs.len());
+        for idx in chunk_start..chunk_end {
+            let air = airs[idx];
+            let proof = &multi_proof.proofs[idx];
+            if air.is_preprocessed() {
+                transcript.append_bytes(&air.precomputed_commitment());
+                if let Some(root) = &proof.lde_trace_main_merkle_root {
+                    transcript.append_bytes(root);
+                }
             }
         }
+        if let Some(root) = &multi_proof.main_mmcs_roots[chunk_idx] {
+            transcript.append_bytes(root);
+        }
     }
-    transcript.append_bytes(&multi_proof.main_mmcs_root);
     let z: FieldElement<E> = transcript.sample_field_element();
     let alpha: FieldElement<E> = transcript.sample_field_element();
     (z, alpha)

From c22fca9e881bb428779da1d4f32c9b5e358a786e Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Thu, 28 May 2026 11:58:01 -0300
Subject: [PATCH 19/21] =?UTF-8?q?feat(stark/mmcs):=20composition=20MMCS=20?=
 =?UTF-8?q?=E2=80=94=20per-chunk=20over=20composition=20polys?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Third MMCS after main + aux. Each chunk's tables share a composition
MMCS built between R2 LDE build and R3 z-sample. Drops per-table
composition_poly_root from StarkProof entirely; the chunk root is
absorbed into each chunk-mate's fork once per chunk.

# Why structurally different from main/aux

Composition polys are built INSIDE the per-table R2-R4 loop after each
fork samples its own beta. Per-chunk MMCS therefore needs a join point
mid-rounds:
  R2a (per-table, parallel): sample beta + build composition LDE +
                              tagged row-pair leaves
  [chunk join]               build chunk composition MMCS via
                              StreamingMmcsBuilder; absorb root into
                              each chunk-mate's fork
  R2b → R3 → R4 (per-table): sample z, run OOD + FRI + opens

The old `prove_rounds_2_to_4` is now `prove_round_2a` (R2 LDE build)
+ `prove_rounds_2b_to_4` (everything after the chunk join). Verifier
mirrors: `replay_rounds_after_round_1` takes an Option<&Commitment>
chunk-comp root and absorbs it between beta and z (replacing the old
per-table `composition_poly_root` absorb).

# Leaf hash + helpers

Composition leaves hash a row PAIR (br_0 || br_1) rather than a single
row, so domain separation needs its own helpers:
- `LEAF_DOMAIN_TAG_COMPOSITION = "LAMBDAVM_COMP_MMCS_LEAF_V1"`.
- `hash_tagged_row_pair_bytes_composition` (bytes flavour).
- `hash_tagged_row_pair_composition` (elements flavour).
- `compute_tagged_leaves_row_pair_bit_reversed_composition` (prover-side
  bulk-leaf computation).
- `rehash_comp_chip_leaf` (open-time on-demand leaf for chunk-mates).

# Proof format

`StarkProof`:
- drops `composition_poly_root: Commitment` (now at MultiProof / chunk
  level).

`DeepPolynomialOpening`:
- `composition_poly: PolynomialOpenings<E>` → `CompositionTraceOpening<E>`
  (new enum, single `Mmcs` variant with `evaluations`, `evaluations_sym`,
  and ONE `mmcs_opening` — the row-pair leaf covers both rows).

`MultiProof`:
- `comp_mmcs_roots: Vec<Option<Commitment>>` (parallel to main/aux,
  always Some in practice since every table has a composition poly).
- `comp_mmcs_specs: Vec<Vec<(MatrixTag, usize)>>` — chunk specs in MMCS
  sort order. Padded height = lde_size / 2 (row-pair count).

# Architecture

- `ChunkCompMmcsContext<E>` — sister of main/aux chunk contexts.
- `CompCommit<E>::Shared { chunk_ctx, chunk_idx, tag, padded_height }`.
- `Round2<F>` now holds `Arc<Vec<Vec<FE>>>` LDE + `CompCommit`. No
  per-table Merkle tree.
- `R2aResult<E>` — handoff carrier between R2a and the chunk join.
- `build_chunk_comp_mmcs` — mirror of `build_chunk_main_mmcs` /
  `build_chunk_aux_mmcs`; uses `StreamingMmcsBuilder`.

# Open + verifier

- `open_composition_poly` produces `CompositionTraceOpening::Mmcs` via
  `chunk_ctx.mmcs.open_with_leaves` + the rehash closure.
- `verify_comp_mmcs_pair_inner` rehashes the row-pair leaf, checks
  `matrix_leaves[table_idx]`, authenticates against the chunk root.
- `step_4_verify_trace_and_composition_openings` threads
  `comp_mmcs_root` + `comp_mmcs_spec` through to the per-query check.
- `multi_verify` adds per-chunk comp-spec validation (every chunk has
  Some root; spec sorted height-desc, tag-asc).

# Tests

- 3 new composition soundness tests: tampered root / spec height /
  missing root all rejected.
- 2 new mmcs_leaf tests: composition domain separates from main+aux;
  bytes-flavour matches element-flavour.
- `cli proof-size` breakdown gains `comp_mmcs_roots (per-chunk)` and
  `comp_mmcs_specs (per-chunk)` sections.

Results: 157/157 stark tests green (152 prior + 3 new comp soundness +
2 new mmcs_leaf comp tests). cli 3/3, lambda-vm-prover bitwise
(preprocessed) tests pass, 77 prove_elfs failures are the pre-existing
UnknownSyscall(5) executor bug present on main.

# Performance note

R2a and R2b-R4 within a chunk are sequential rather than per-table
parallel — `chunk_size = table_parallelism()` is small (typically 8),
and the dominant cost in R2 (constraint eval, FFT) already has internal
rayon parallelism. Keeping the chunk loop sequential avoids
`par_iter_mut()` on transcript slices that breaks with downstream
generic `IsStarkTranscript` bounds.
---
 bin/cli/src/main.rs                           |   6 +
 crypto/stark/src/mmcs_leaf.rs                 |  90 +++
 crypto/stark/src/proof/stark.rs               |  48 +-
 crypto/stark/src/prover.rs                    | 560 ++++++++++++++----
 .../stark/src/tests/mmcs_soundness_tests.rs   |  52 ++
 crypto/stark/src/verifier.rs                  | 183 +++++-
 6 files changed, 800 insertions(+), 139 deletions(-)

diff --git a/bin/cli/src/main.rs b/bin/cli/src/main.rs
index dd65466b4..86a6dbddf 100644
--- a/bin/cli/src/main.rs
+++ b/bin/cli/src/main.rs
@@ -694,6 +694,8 @@ fn cmd_proof_size(
     let main_mmcs_specs_bytes = ser_len(&vm_proof.proof.main_mmcs_specs);
     let aux_mmcs_roots_bytes = ser_len(&vm_proof.proof.aux_mmcs_roots);
     let aux_mmcs_specs_bytes = ser_len(&vm_proof.proof.aux_mmcs_specs);
+    let comp_mmcs_roots_bytes = ser_len(&vm_proof.proof.comp_mmcs_roots);
+    let comp_mmcs_specs_bytes = ser_len(&vm_proof.proof.comp_mmcs_specs);
     let chunk_size_bytes = ser_len(&vm_proof.proof.chunk_size);
 
     // Sum per-section across every sub-proof so a single number captures the
@@ -735,6 +737,8 @@ fn cmd_proof_size(
         + main_mmcs_specs_bytes
         + aux_mmcs_roots_bytes
         + aux_mmcs_specs_bytes
+        + comp_mmcs_roots_bytes
+        + comp_mmcs_specs_bytes
         + chunk_size_bytes
         + s_main_trace_openings
         + s_precomputed_trace_openings
@@ -754,6 +758,8 @@ fn cmd_proof_size(
         ProofSizeEntry { section: "main_mmcs_specs (per-chunk)".into(), bytes: main_mmcs_specs_bytes },
         ProofSizeEntry { section: "aux_mmcs_roots (per-chunk)".into(), bytes: aux_mmcs_roots_bytes },
         ProofSizeEntry { section: "aux_mmcs_specs (per-chunk)".into(), bytes: aux_mmcs_specs_bytes },
+        ProofSizeEntry { section: "comp_mmcs_roots (per-chunk)".into(), bytes: comp_mmcs_roots_bytes },
+        ProofSizeEntry { section: "comp_mmcs_specs (per-chunk)".into(), bytes: comp_mmcs_specs_bytes },
         ProofSizeEntry { section: "chunk_size".into(), bytes: chunk_size_bytes },
         ProofSizeEntry { section: "per_table_main_merkle_root (preprocessed)".into(), bytes: s_per_table_main_root },
         ProofSizeEntry { section: "per_table_precomputed_merkle_root".into(), bytes: s_precomputed_root },
diff --git a/crypto/stark/src/mmcs_leaf.rs b/crypto/stark/src/mmcs_leaf.rs
index 447f9650f..6a995fa2c 100644
--- a/crypto/stark/src/mmcs_leaf.rs
+++ b/crypto/stark/src/mmcs_leaf.rs
@@ -43,6 +43,13 @@ pub const LEAF_DOMAIN_TAG_MAIN: &[u8] = LEAF_DOMAIN_TAG;
 /// other.
 pub const LEAF_DOMAIN_TAG_AUX: &[u8] = b"LAMBDAVM_AUX_MMCS_LEAF_V1";
 
+/// Versioned domain separator for COMPOSITION-trace MMCS leaves.
+/// Composition leaves hash a PAIR of rows (br_0 || br_1) instead of a
+/// single row — the legacy `keccak_leaves_row_pair_bit_reversed` shape.
+/// Distinct from main/aux so no composition opening can authenticate a
+/// main or aux leaf.
+pub const LEAF_DOMAIN_TAG_COMPOSITION: &[u8] = b"LAMBDAVM_COMP_MMCS_LEAF_V1";
+
 /// Synthesize `n` distinct [`MatrixTag`]s derived from positional index.
 /// Useful for generic stark tests where the caller does not own a stable
 /// chip-type assignment. Production code in lambda-vm uses
@@ -75,6 +82,18 @@ pub fn hash_tagged_row_bytes_aux(tag: MatrixTag, row_bytes_be: &[u8]) -> Commitm
     hash_with_domain(LEAF_DOMAIN_TAG_AUX, tag, row_bytes_be)
 }
 
+/// Hash a COMPOSITION-trace MMCS leaf from a pre-concatenated `(br_0 ||
+/// br_1)` byte buffer — i.e. the two row-pair rows written big-endian,
+/// `part_0_row_0 || part_1_row_0 || ... || part_0_row_1 || part_1_row_1
+/// || ...`. Uses [`LEAF_DOMAIN_TAG_COMPOSITION`].
+#[inline]
+pub fn hash_tagged_row_pair_bytes_composition(
+    tag: MatrixTag,
+    row_pair_bytes_be: &[u8],
+) -> Commitment {
+    hash_with_domain(LEAF_DOMAIN_TAG_COMPOSITION, tag, row_pair_bytes_be)
+}
+
 #[inline]
 fn hash_with_domain(domain: &[u8], tag: MatrixTag, row_bytes_be: &[u8]) -> Commitment {
     let mut h = Keccak256::new();
@@ -105,6 +124,34 @@ where
     hash_tagged_row_inner::<E>(LEAF_DOMAIN_TAG_AUX, tag, row)
 }
 
+/// Convenience: hash a COMPOSITION-trace row-pair from two slices of
+/// field elements (the parts evaluated at `br_0` and `br_1`), each
+/// `num_parts` long.
+pub fn hash_tagged_row_pair_composition<E>(
+    tag: MatrixTag,
+    parts_at_br_0: &[FieldElement<E>],
+    parts_at_br_1: &[FieldElement<E>],
+) -> Commitment
+where
+    E: IsField,
+    FieldElement<E>: ByteConversion,
+{
+    debug_assert_eq!(parts_at_br_0.len(), parts_at_br_1.len());
+    let byte_len = <FieldElement<E> as ByteConversion>::BYTE_LEN;
+    let num_parts = parts_at_br_0.len();
+    let mut buf = vec![0u8; 2 * num_parts * byte_len];
+    let mut offset = 0;
+    for fe in parts_at_br_0 {
+        fe.write_bytes_be(&mut buf[offset..offset + byte_len]);
+        offset += byte_len;
+    }
+    for fe in parts_at_br_1 {
+        fe.write_bytes_be(&mut buf[offset..offset + byte_len]);
+        offset += byte_len;
+    }
+    hash_tagged_row_pair_bytes_composition(tag, &buf)
+}
+
 #[inline]
 fn hash_tagged_row_inner<E>(
     domain: &[u8],
@@ -158,6 +205,49 @@ mod tests {
         assert_ne!(main_digest, aux_digest);
     }
 
+    #[test]
+    fn composition_domain_separates_from_main_and_aux() {
+        // Same row-pair under composition MUST differ from main + aux
+        // domains so a composition opening can't authenticate a main or
+        // aux leaf.
+        let tag = MatrixTag::new([0xCC; 8]);
+        let row0 = vec![FE::from(1u64), FE::from(2u64)];
+        let row1 = vec![FE::from(3u64), FE::from(4u64)];
+        let comp_digest = hash_tagged_row_pair_composition(tag, &row0, &row1);
+
+        // Build the equivalent flat byte buffer manually and run it
+        // through the main + aux single-domain helpers.
+        let byte_len = <FE as ByteConversion>::BYTE_LEN;
+        let mut flat = vec![0u8; (row0.len() + row1.len()) * byte_len];
+        let mut offset = 0;
+        for fe in row0.iter().chain(row1.iter()) {
+            fe.write_bytes_be(&mut flat[offset..offset + byte_len]);
+            offset += byte_len;
+        }
+        let main_digest = hash_tagged_row_bytes(tag, &flat);
+        let aux_digest = hash_tagged_row_bytes_aux(tag, &flat);
+        assert_ne!(comp_digest, main_digest);
+        assert_ne!(comp_digest, aux_digest);
+    }
+
+    #[test]
+    fn composition_bytes_helper_matches_composition_element_helper() {
+        let tag = MatrixTag::new([5; 8]);
+        let row0 = vec![FE::from(10u64), FE::from(20u64)];
+        let row1 = vec![FE::from(30u64), FE::from(40u64)];
+        let from_elements = hash_tagged_row_pair_composition(tag, &row0, &row1);
+
+        let byte_len = <FE as ByteConversion>::BYTE_LEN;
+        let mut flat = vec![0u8; 2 * row0.len() * byte_len];
+        let mut offset = 0;
+        for fe in row0.iter().chain(row1.iter()) {
+            fe.write_bytes_be(&mut flat[offset..offset + byte_len]);
+            offset += byte_len;
+        }
+        let from_bytes = hash_tagged_row_pair_bytes_composition(tag, &flat);
+        assert_eq!(from_elements, from_bytes);
+    }
+
     #[test]
     fn aux_bytes_helper_matches_aux_element_helper() {
         // The bytes-flavoured helper and the element-flavoured helper must
diff --git a/crypto/stark/src/proof/stark.rs b/crypto/stark/src/proof/stark.rs
index cc69f7bf0..32ac76184 100644
--- a/crypto/stark/src/proof/stark.rs
+++ b/crypto/stark/src/proof/stark.rs
@@ -88,10 +88,45 @@ impl<E: IsField> AuxTraceOpening<E> {
     }
 }
 
+/// Per-query composition-trace opening. Sister of [`MainTraceOpening`]
+/// and [`AuxTraceOpening`] for the composition polynomial parts. Always
+/// `Mmcs`: every table has a composition polynomial, and the chunk-scoped
+/// composition MMCS commits to all of them.
+///
+/// Composition leaves are hashed in row-PAIR form (`br_0` + `br_1`).
+/// A single MMCS opening covers both rows since they share the same
+/// leaf in the underlying tree.
+#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
+#[serde(bound = "")]
+pub enum CompositionTraceOpening<E: IsField> {
+    Mmcs {
+        /// Parts at `br_0`.
+        evaluations: Vec<FieldElement<E>>,
+        /// Parts at `br_1` (sym row).
+        evaluations_sym: Vec<FieldElement<E>>,
+        /// Single MMCS opening for the row-pair leaf.
+        mmcs_opening: MmcsOpening<Commitment>,
+    },
+}
+
+impl<E: IsField> CompositionTraceOpening<E> {
+    pub fn evaluations(&self) -> &[FieldElement<E>] {
+        match self {
+            Self::Mmcs { evaluations, .. } => evaluations,
+        }
+    }
+
+    pub fn evaluations_sym(&self) -> &[FieldElement<E>] {
+        match self {
+            Self::Mmcs { evaluations_sym, .. } => evaluations_sym,
+        }
+    }
+}
+
 #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
 #[serde(bound = "")]
 pub struct DeepPolynomialOpening<F: IsSubFieldOf<E>, E: IsField> {
-    pub composition_poly: PolynomialOpenings<E>,
+    pub composition_poly: CompositionTraceOpening<E>,
     pub main_trace_polys: MainTraceOpening<F>,
     /// For preprocessed tables: openings for precomputed columns.
     /// These are verified against the hardcoded precomputed commitment.
@@ -118,8 +153,6 @@ pub struct StarkProof<F: IsSubFieldOf<E>, E: IsField, PI> {
     pub lde_trace_precomputed_merkle_root: Option<Commitment>,
     // tⱼ(zgᵏ)
     pub trace_ood_evaluations: Table<E>,
-    // Commitments to Hᵢ
-    pub composition_poly_root: Commitment,
     // Hᵢ(z^N)
     pub composition_poly_parts_ood_evaluation: Vec<FieldElement<E>>,
     // [pₖ]
@@ -184,6 +217,15 @@ pub struct MultiProof<F: IsSubFieldOf<E>, E: IsField, PI> {
     /// Per-chunk aux MMCS specs. Empty inner Vec when the corresponding
     /// `aux_mmcs_roots[i]` is `None`.
     pub aux_mmcs_specs: Vec<Vec<(MatrixTag, usize)>>,
+    /// Per-chunk composition MMCS roots. Always `Some` (every table has a
+    /// composition polynomial), but stored as `Option` for shape parity
+    /// with main/aux. Parallel to `main_mmcs_roots`.
+    pub comp_mmcs_roots: Vec<Option<Commitment>>,
+    /// Per-chunk composition MMCS specs. Each non-empty Vec lists
+    /// `(MatrixTag, padded_height)` for the chunk-mate composition
+    /// polynomials in MMCS sort order. `padded_height` is the row-pair
+    /// count = `lde_size / 2`.
+    pub comp_mmcs_specs: Vec<Vec<(MatrixTag, usize)>>,
     /// Pinned chunk size. Equals the prover's `table_parallelism()` at
     /// proving time. The verifier uses this to chunk the AIR slice into
     /// the same per-chunk grouping the prover used.
diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs
index bf9a5f03e..72f2a822f 100644
--- a/crypto/stark/src/prover.rs
+++ b/crypto/stark/src/prover.rs
@@ -27,7 +27,9 @@ use rayon::prelude::{
 use crate::debug::validate_trace;
 use crate::fri;
 use crate::lookup::LOGUP_NUM_CHALLENGES;
-use crate::proof::stark::{DeepPolynomialOpenings, MainTraceOpening, PolynomialOpenings};
+use crate::proof::stark::{
+    CompositionTraceOpening, DeepPolynomialOpenings, MainTraceOpening, PolynomialOpenings,
+};
 #[cfg(feature = "disk-spill")]
 use crate::storage_mode::StorageMode;
 use crate::table::Table;
@@ -433,17 +435,74 @@ pub fn table_parallelism() -> usize {
 }
 
 /// A container for the results of the second round of the STARK Prove protocol.
+/// Per-chunk composition MMCS context.
+pub(crate) struct ChunkCompMmcsContext<E: IsField>
+where
+    FieldElement<E>: AsBytes,
+{
+    pub(crate) mmcs: Arc<Mmcs<BatchedMerkleTreeBackend<E>>>,
+    /// Arc-cloned composition LDE columns for chunk-mates, in MMCS spec
+    /// sort order. Used by the per-query open path to rehash composition
+    /// row-pair leaves on demand.
+    pub(crate) lde_columns_in_spec_order: Vec<Arc<Vec<Vec<FieldElement<E>>>>>,
+}
+
+/// Per-table composition-trace commitment under the chunk's composition MMCS.
+pub(crate) enum CompCommit<E: IsField>
+where
+    FieldElement<E>: AsBytes,
+{
+    Shared {
+        chunk_ctx: Arc<ChunkCompMmcsContext<E>>,
+        chunk_idx: usize,
+        tag: MatrixTag,
+        /// Padded height = lde_size / 2 (row-pair leaves).
+        padded_height: usize,
+    },
+}
+
+impl<E: IsField> CompCommit<E>
+where
+    FieldElement<E>: AsBytes,
+{
+    fn share(&self) -> Self {
+        match self {
+            Self::Shared {
+                chunk_ctx,
+                chunk_idx,
+                tag,
+                padded_height,
+            } => Self::Shared {
+                chunk_ctx: Arc::clone(chunk_ctx),
+                chunk_idx: *chunk_idx,
+                tag: *tag,
+                padded_height: *padded_height,
+            },
+        }
+    }
+}
+
+/// Per-table Round 2 partial — produced by `round_2a_build_composition_lde`
+/// before the chunk composition MMCS is built.
+pub(crate) struct R2aResult<E: IsField>
+where
+    FieldElement<E>: AsBytes,
+{
+    pub(crate) lde_composition_poly_evaluations: Arc<Vec<Vec<FieldElement<E>>>>,
+    pub(crate) composition_leaves: Vec<Commitment>,
+    pub(crate) padded_height: usize,
+}
+
 pub(crate) struct Round2<F>
 where
     F: IsField,
     FieldElement<F>: AsBytes,
 {
-    /// Evaluations of the composition polynomial parts over the LDE domain.
-    pub(crate) lde_composition_poly_evaluations: Vec<Vec<FieldElement<F>>>,
-    /// The Merkle tree built to compute the commitment to the composition polynomial parts.
-    pub(crate) composition_poly_merkle_tree: BatchedMerkleTree<F>,
-    /// The commitment to the composition polynomial parts.
-    pub(crate) composition_poly_root: Commitment,
+    /// Evaluations of the composition polynomial parts over the LDE
+    /// domain (Arc-shared with the chunk composition MMCS context).
+    pub(crate) lde_composition_poly_evaluations: Arc<Vec<Vec<FieldElement<F>>>>,
+    /// This table's slot inside the chunk's composition MMCS.
+    pub(crate) comp: CompCommit<F>,
 }
 
 /// A container for the results of the third round of the STARK Prove protocol.
@@ -761,6 +820,137 @@ where
     Ok((Some(root), spec, Some(ctx)))
 }
 
+/// Tagged per-row-PAIR leaf digest for the COMPOSITION-trace MMCS.
+pub fn compute_tagged_leaves_row_pair_bit_reversed_composition<E>(
+    parts: &[Vec<FieldElement<E>>],
+    tag: MatrixTag,
+) -> Vec<Commitment>
+where
+    E: IsField,
+    FieldElement<E>: AsBytes + Sync + Send + ByteConversion,
+{
+    let num_parts = parts.len();
+    if num_parts == 0 {
+        return Vec::new();
+    }
+    let num_rows = parts[0].len();
+    if num_rows == 0 {
+        return Vec::new();
+    }
+    let num_leaves = num_rows / 2;
+    debug_assert!(num_rows.is_power_of_two());
+    let byte_len = <FieldElement<E> as ByteConversion>::BYTE_LEN;
+    let total_bytes = 2 * num_parts * byte_len;
+    let hash_leaf_pair = |buf: &mut [u8], leaf_idx: usize| -> Commitment {
+        let br_0 = reverse_index(2 * leaf_idx, num_rows as u64);
+        let br_1 = reverse_index(2 * leaf_idx + 1, num_rows as u64);
+        let mut offset = 0;
+        for part in parts.iter() {
+            part[br_0].write_bytes_be(&mut buf[offset..offset + byte_len]);
+            offset += byte_len;
+        }
+        for part in parts.iter() {
+            part[br_1].write_bytes_be(&mut buf[offset..offset + byte_len]);
+            offset += byte_len;
+        }
+        crate::mmcs_leaf::hash_tagged_row_pair_bytes_composition(tag, buf)
+    };
+    #[cfg(feature = "parallel")]
+    {
+        (0..num_leaves)
+            .into_par_iter()
+            .map_init(|| vec![0u8; total_bytes], |buf, i| hash_leaf_pair(buf, i))
+            .collect()
+    }
+    #[cfg(not(feature = "parallel"))]
+    {
+        let mut buf = vec![0u8; total_bytes];
+        (0..num_leaves).map(|i| hash_leaf_pair(&mut buf, i)).collect()
+    }
+}
+
+/// Build a CHUNK-scoped composition MMCS via StreamingMmcsBuilder.
+#[allow(clippy::type_complexity)]
+fn build_chunk_comp_mmcs<E>(
+    comp_outputs: Vec<(MatrixTag, Vec<Commitment>, usize)>,
+    chunk_comp_lde: Vec<(MatrixTag, Arc<Vec<Vec<FieldElement<E>>>>)>,
+) -> Result<
+    (
+        Option<Commitment>,
+        Vec<(MatrixTag, usize)>,
+        Option<Arc<ChunkCompMmcsContext<E>>>,
+    ),
+    ProvingError,
+>
+where
+    E: IsField + Send + Sync,
+    FieldElement<E>: AsBytes + Send + Sync,
+{
+    if comp_outputs.is_empty() {
+        return Ok((None, Vec::new(), None));
+    }
+    debug_assert_eq!(comp_outputs.len(), chunk_comp_lde.len());
+    let mut comp_outputs = comp_outputs;
+    comp_outputs.sort_by(|a, b| b.2.cmp(&a.2).then(a.0.cmp(&b.0)));
+    let lde_by_tag: std::collections::BTreeMap<MatrixTag, Arc<Vec<Vec<FieldElement<E>>>>> =
+        chunk_comp_lde.into_iter().collect();
+    let mut builder: StreamingMmcsBuilder<BatchedMerkleTreeBackend<E>> =
+        StreamingMmcsBuilder::new();
+    let mut lde_columns_in_spec_order: Vec<Arc<Vec<Vec<FieldElement<E>>>>> =
+        Vec::with_capacity(comp_outputs.len());
+    for (tag, leaves, _padded_height) in comp_outputs {
+        let lde = lde_by_tag
+            .get(&tag)
+            .ok_or_else(|| {
+                ProvingError::WrongParameter(format!(
+                    "missing chunk composition LDE for tag {tag:?}"
+                ))
+            })?
+            .clone();
+        lde_columns_in_spec_order.push(lde);
+        builder.add_matrix(tag, leaves).map_err(map_mmcs_err)?;
+    }
+    let mmcs = builder.finalize().map_err(map_mmcs_err)?;
+    let root = *mmcs.root();
+    let spec = mmcs.spec();
+    let ctx = Arc::new(ChunkCompMmcsContext {
+        mmcs: Arc::new(mmcs),
+        lde_columns_in_spec_order,
+    });
+    Ok((Some(root), spec, Some(ctx)))
+}
+
+/// Rehash a composition-trace row-PAIR leaf for the open path.
+pub fn rehash_comp_chip_leaf<E>(
+    tag: MatrixTag,
+    parts: &Arc<Vec<Vec<FieldElement<E>>>>,
+    local_idx: usize,
+) -> Commitment
+where
+    E: IsField,
+    FieldElement<E>: AsBytes + ByteConversion,
+{
+    let num_rows = parts
+        .first()
+        .map(|c| c.len())
+        .expect("composition LDE columns non-empty by construction");
+    let num_parts = parts.len();
+    let byte_len = <FieldElement<E> as ByteConversion>::BYTE_LEN;
+    let br_0 = reverse_index(2 * local_idx, num_rows as u64);
+    let br_1 = reverse_index(2 * local_idx + 1, num_rows as u64);
+    let mut buf = vec![0u8; 2 * num_parts * byte_len];
+    let mut offset = 0;
+    for part in parts.iter() {
+        part[br_0].write_bytes_be(&mut buf[offset..offset + byte_len]);
+        offset += byte_len;
+    }
+    for part in parts.iter() {
+        part[br_1].write_bytes_be(&mut buf[offset..offset + byte_len]);
+        offset += byte_len;
+    }
+    crate::mmcs_leaf::hash_tagged_row_pair_bytes_composition(tag, &buf)
+}
+
 /// Tagged per-row leaf digest for the main-trace MMCS.
 pub fn compute_tagged_leaves_bit_reversed<E>(
     columns: &[Vec<FieldElement<E>>],
@@ -1282,15 +1472,20 @@ pub trait IsStarkProver<
         .expect("LDE evaluation should succeed")
     }
 
-    /// Returns the result of the second round of the STARK Prove protocol.
-    fn round_2_compute_composition_polynomial(
+    /// Round 2 phase A: build the composition LDE parts + tagged leaves
+    /// for the chunk MMCS, WITHOUT committing yet. The chunk MMCS is
+    /// built externally once every chunk-mate has returned their
+    /// [`R2aResult`]; only then does the resulting chunk root get
+    /// absorbed back into each fork and R3 sampling proceeds.
+    fn round_2a_build_composition_lde(
         air: &dyn AIR<Field = Field, FieldExtension = FieldExtension, PublicInputs = PI>,
         pub_inputs: &PI,
         domain: &Domain<Field>,
         round_1_result: &Round1<Field, FieldExtension>,
         transition_coefficients: &[FieldElement<FieldExtension>],
         boundary_coefficients: &[FieldElement<FieldExtension>],
-    ) -> Result<Round2<FieldExtension>, ProvingError>
+        tag: MatrixTag,
+    ) -> Result<R2aResult<FieldExtension>, ProvingError>
     where
         FieldElement<Field>: AsBytes,
         FieldElement<FieldExtension>: AsBytes,
@@ -1355,21 +1550,24 @@ pub trait IsStarkProver<
 
         #[cfg(feature = "instruments")]
         let t_sub = Instant::now();
-        let Some((composition_poly_merkle_tree, composition_poly_root)) =
-            Self::commit_composition_polynomial(&lde_composition_poly_parts_evaluations)
-        else {
+        let composition_leaves =
+            compute_tagged_leaves_row_pair_bit_reversed_composition::<FieldExtension>(
+                &lde_composition_poly_parts_evaluations,
+                tag,
+            );
+        if composition_leaves.is_empty() {
             return Err(ProvingError::EmptyCommitment);
-        };
+        }
+        let padded_height = composition_leaves.len();
         #[cfg(feature = "instruments")]
         let merkle_dur = t_sub.elapsed();
-
         #[cfg(feature = "instruments")]
         crate::instruments::store_r2_sub(constraints_dur, fft_dur, merkle_dur);
 
-        Ok(Round2 {
-            lde_composition_poly_evaluations: lde_composition_poly_parts_evaluations,
-            composition_poly_merkle_tree,
-            composition_poly_root,
+        Ok(R2aResult {
+            lde_composition_poly_evaluations: Arc::new(lde_composition_poly_parts_evaluations),
+            composition_leaves,
+            padded_height,
         })
     }
 
@@ -1709,22 +1907,41 @@ pub trait IsStarkProver<
         .collect()
     }
 
-    /// Computes values and validity proofs of the evaluations of the composition polynomial parts
-    /// at the domain value corresponding to the FRI query challenge `index` and its symmetric
-    /// element.
+    /// Compute the composition-poly opening for one query against the
+    /// chunk composition MMCS. The opening's `mmcs_opening` carries
+    /// matrix_leaves for every chunk-mate's composition matrix; the
+    /// closure rehashes those row-pair leaves on demand from the
+    /// chunk-shared LDE columns.
     fn open_composition_poly(
-        composition_poly_merkle_tree: &BatchedMerkleTree<FieldExtension>,
+        comp: &CompCommit<FieldExtension>,
         lde_composition_poly_evaluations: &[Vec<FieldElement<FieldExtension>>],
         index: usize,
-    ) -> PolynomialOpenings<FieldExtension>
+    ) -> CompositionTraceOpening<FieldExtension>
     where
         FieldElement<Field>: AsBytes + Sync + Send,
-        FieldElement<FieldExtension>: AsBytes + Sync + Send,
+        FieldElement<FieldExtension>: AsBytes + Sync + Send + ByteConversion,
     {
-        let proof = composition_poly_merkle_tree
-            .get_proof_by_pos(index)
-            .unwrap();
+        let CompCommit::Shared { chunk_ctx, .. } = comp;
+        let mmcs = &chunk_ctx.mmcs;
+        let lde_in_spec_order = &chunk_ctx.lde_columns_in_spec_order;
+
+        // Composition row-pair leaves are indexed by row-pair, so the
+        // opening's global_index equals the query index directly (no
+        // shift). Per-table local index = global_index >> shift, which
+        // is 0 when all chunk-mates share the max height.
+        let local_idx = index;
+        let mmcs_opening = mmcs
+            .open_with_leaves(local_idx, |m_idx, local_idx_in_matrix| {
+                rehash_comp_chip_leaf::<FieldExtension>(
+                    mmcs.spec()[m_idx].0,
+                    &lde_in_spec_order[m_idx],
+                    local_idx_in_matrix,
+                )
+            })
+            .expect("composition MMCS open_with_leaves: index in range");
 
+        // Build the (evaluations, evaluations_sym) field arrays from this
+        // table's composition LDE — same layout as the legacy opening.
         let lde_composition_poly_parts_evaluation: Vec<_> = lde_composition_poly_evaluations
             .iter()
             .flat_map(|part| {
@@ -1734,20 +1951,21 @@ pub trait IsStarkProver<
                 ]
             })
             .collect();
+        let evaluations = lde_composition_poly_parts_evaluation
+            .clone()
+            .into_iter()
+            .step_by(2)
+            .collect();
+        let evaluations_sym = lde_composition_poly_parts_evaluation
+            .into_iter()
+            .skip(1)
+            .step_by(2)
+            .collect();
 
-        PolynomialOpenings {
-            proof: proof.clone(),
-            proof_sym: proof,
-            evaluations: lde_composition_poly_parts_evaluation
-                .clone()
-                .into_iter()
-                .step_by(2)
-                .collect(),
-            evaluations_sym: lde_composition_poly_parts_evaluation
-                .into_iter()
-                .skip(1)
-                .step_by(2)
-                .collect(),
+        CompositionTraceOpening::Mmcs {
+            evaluations,
+            evaluations_sym,
+            mmcs_opening,
         }
     }
 
@@ -1796,7 +2014,7 @@ pub trait IsStarkProver<
 
         for index in indexes_to_open.iter() {
             let composition_openings = Self::open_composition_poly(
-                &round_2_result.composition_poly_merkle_tree,
+                &round_2_result.comp,
                 &round_2_result.lde_composition_poly_evaluations,
                 *index,
             );
@@ -2467,32 +2685,148 @@ pub trait IsStarkProver<
             crate::instruments::TableSubOps,
         )> = Vec::with_capacity(num_airs);
 
-        let mut proofs = Vec::with_capacity(num_airs);
+        let mut proofs: Vec<Option<StarkProof<Field, FieldExtension, PI>>> =
+            (0..num_airs).map(|_| None).collect();
+        let mut comp_mmcs_roots_per_chunk: Vec<Option<Commitment>> = Vec::new();
+        let mut comp_mmcs_specs_per_chunk: Vec<Vec<(MatrixTag, usize)>> = Vec::new();
         let mut lde_drain = cached_ldes.into_iter();
         for chunk_start in (0..num_airs).step_by(k) {
             let chunk_end = (chunk_start + k).min(num_airs);
             let chunk_size = chunk_end - chunk_start;
+            let chunk_idx = comp_mmcs_roots_per_chunk.len();
 
             let chunk_ldes: Vec<Lde<Field, FieldExtension>> =
                 lde_drain.by_ref().take(chunk_size).collect();
             let chunk_commitments = &commitments[chunk_start..chunk_end];
-            let chunk_transcripts = &mut table_transcripts[chunk_start..chunk_end];
-
-            #[cfg(feature = "parallel")]
-            let iter = chunk_ldes
-                .into_par_iter()
-                .zip(chunk_commitments.par_iter())
-                .zip(chunk_transcripts.par_iter_mut())
-                .enumerate();
-            #[cfg(not(feature = "parallel"))]
-            let iter = chunk_ldes
+            // Build Round1 per-table sequentially (build_round1 only bumps
+            // Arc refcounts), then run R2a in parallel.
+            let chunk_round1: Vec<Round1<Field, FieldExtension>> = chunk_ldes
                 .into_iter()
                 .zip(chunk_commitments.iter())
+                .enumerate()
+                .map(|(j, (lde, commitment))| {
+                    let idx = chunk_start + j;
+                    let (air, _, _) = &air_trace_pairs[idx];
+                    let domain = &domains[idx];
+                    commitment.build_round1(lde, air.step_size(), domain.blowup_factor)
+                })
+                .collect();
+
+            // Bind per-table table_contribution into forks before sampling beta.
+            for (j, round_1_result) in chunk_round1.iter().enumerate() {
+                let idx = chunk_start + j;
+                if let Some(ref bpi) = round_1_result.bus_public_inputs {
+                    table_transcripts[idx].append_field_element(&bpi.table_contribution);
+                }
+            }
+
+            // Phase R2a (sequential within chunk): sample beta + build
+            // composition LDE + tagged leaves per table. Internal
+            // parallelism inside constraint eval / FFT keeps cores busy.
+            // K is small (chunk size = table_parallelism()), so per-table
+            // serialization here costs little.
+            let chunk_transcripts = &mut table_transcripts[chunk_start..chunk_end];
+            let r2a_iter = chunk_round1
+                .iter()
                 .zip(chunk_transcripts.iter_mut())
                 .enumerate();
 
-            let chunk_results: Vec<Result<_, ProvingError>> = iter
-                .map(|(j, ((lde, commitment), table_transcript))| {
+            #[allow(clippy::type_complexity)]
+            let r2a_results: Vec<Result<
+                (
+                    usize,
+                    Vec<FieldElement<FieldExtension>>,
+                    Vec<FieldElement<FieldExtension>>,
+                    R2aResult<FieldExtension>,
+                ),
+                ProvingError,
+            >> = r2a_iter
+                .map(|(j, (round_1_result, table_transcript))| {
+                    let idx = chunk_start + j;
+                    let (air, _, pub_inputs) = &air_trace_pairs[idx];
+                    let domain = &domains[idx];
+                    let tag = main_tags[idx];
+                    let (tc, bc, r2a) = Self::prove_round_2a(
+                        *air,
+                        *pub_inputs,
+                        round_1_result,
+                        table_transcript,
+                        domain,
+                        tag,
+                    )?;
+                    Ok((j, tc, bc, r2a))
+                })
+                .collect();
+
+            // Sequential: collect R2a outputs in chunk-local-index order;
+            // build chunk composition MMCS over them.
+            let mut chunk_r2a: Vec<Option<(
+                Vec<FieldElement<FieldExtension>>,
+                Vec<FieldElement<FieldExtension>>,
+                R2aResult<FieldExtension>,
+            )>> = (0..chunk_size).map(|_| None).collect();
+            for r in r2a_results {
+                let (j, tc, bc, r2a) = r?;
+                chunk_r2a[j] = Some((tc, bc, r2a));
+            }
+
+            let mut chunk_comp_outputs: Vec<(MatrixTag, Vec<Commitment>, usize)> = Vec::new();
+            let mut chunk_comp_ldes: Vec<(MatrixTag, Arc<Vec<Vec<FieldElement<FieldExtension>>>>)> =
+                Vec::new();
+            for (j, entry) in chunk_r2a.iter().enumerate() {
+                let idx = chunk_start + j;
+                let tag = main_tags[idx];
+                let (_, _, r2a) = entry.as_ref().expect("R2a populated");
+                chunk_comp_outputs.push((tag, r2a.composition_leaves.clone(), r2a.padded_height));
+                chunk_comp_ldes.push((tag, Arc::clone(&r2a.lde_composition_poly_evaluations)));
+            }
+
+            let (chunk_comp_root, chunk_comp_spec, chunk_comp_ctx_opt) =
+                build_chunk_comp_mmcs::<FieldExtension>(chunk_comp_outputs, chunk_comp_ldes)?;
+            // Absorb chunk composition root into EACH chunk-mate's fork.
+            if let Some(ref root) = chunk_comp_root {
+                for idx in chunk_start..chunk_end {
+                    table_transcripts[idx].append_bytes(root);
+                }
+            }
+            comp_mmcs_roots_per_chunk.push(chunk_comp_root);
+            comp_mmcs_specs_per_chunk.push(chunk_comp_spec.clone());
+
+            let chunk_comp_ctx = chunk_comp_ctx_opt
+                .expect("chunk has at least one composition matrix (every table has comp)");
+            let height_by_tag: std::collections::BTreeMap<MatrixTag, usize> =
+                chunk_comp_spec.iter().copied().collect();
+
+            // Reassemble per-table Round2 from R2a + chunk MMCS context.
+            let mut chunk_round2: Vec<Round2<FieldExtension>> = Vec::with_capacity(chunk_size);
+            for j in 0..chunk_size {
+                let idx = chunk_start + j;
+                let tag = main_tags[idx];
+                let (_, _, r2a) = chunk_r2a[j].take().unwrap();
+                let padded_height = *height_by_tag.get(&tag).expect("spec contains tag");
+                chunk_round2.push(Round2 {
+                    lde_composition_poly_evaluations: r2a.lde_composition_poly_evaluations,
+                    comp: CompCommit::Shared {
+                        chunk_ctx: Arc::clone(&chunk_comp_ctx),
+                        chunk_idx,
+                        tag,
+                        padded_height,
+                    },
+                });
+            }
+
+            // Phase R2b → R4 (sequential within chunk): each fork has
+            // the chunk comp root absorbed; sample z, run R3 OOD + R4
+            // FRI. Same rationale as R2a above.
+            let chunk_transcripts = &mut table_transcripts[chunk_start..chunk_end];
+            let r2b_iter = chunk_round1
+                .iter()
+                .zip(chunk_round2.iter())
+                .zip(chunk_transcripts.iter_mut())
+                .enumerate();
+
+            let chunk_results: Vec<Result<_, ProvingError>> = r2b_iter
+                .map(|(j, ((round_1_result, round_2_result), table_transcript))| {
                     let idx = chunk_start + j;
                     let (air, trace, pub_inputs) = &air_trace_pairs[idx];
                     let _ = trace; // used by instruments
@@ -2501,18 +2835,11 @@ pub trait IsStarkProver<
                     #[cfg(feature = "instruments")]
                     let table_start = Instant::now();
 
-                    // Build Round1 from cached LDE (consumed by value, no recomputation).
-                    let round_1_result =
-                        commitment.build_round1(lde, air.step_size(), domain.blowup_factor);
-
-                    if let Some(ref bpi) = round_1_result.bus_public_inputs {
-                        table_transcript.append_field_element(&bpi.table_contribution);
-                    }
-
-                    let proof = Self::prove_rounds_2_to_4(
+                    let proof = Self::prove_rounds_2b_to_4(
                         *air,
                         *pub_inputs,
-                        &round_1_result,
+                        round_1_result,
+                        round_2_result,
                         table_transcript,
                         domain,
                     )?;
@@ -2529,24 +2856,34 @@ pub trait IsStarkProver<
                     };
 
                     #[cfg(feature = "instruments")]
-                    return Ok((proof, table_timing));
+                    return Ok((j, proof, table_timing));
                     #[cfg(not(feature = "instruments"))]
-                    Ok(proof)
+                    Ok((j, proof))
                 })
                 .collect();
 
             for result in chunk_results {
                 #[cfg(feature = "instruments")]
                 {
-                    let (proof, timing) = result?;
-                    proofs.push(proof);
+                    let (j, proof, timing) = result?;
+                    let idx = chunk_start + j;
+                    proofs[idx] = Some(proof);
                     table_timings.push(timing);
                 }
                 #[cfg(not(feature = "instruments"))]
-                proofs.push(result?);
+                {
+                    let (j, proof) = result?;
+                    let idx = chunk_start + j;
+                    proofs[idx] = Some(proof);
+                }
             }
         }
 
+        let proofs: Vec<StarkProof<Field, FieldExtension, PI>> = proofs
+            .into_iter()
+            .map(|p| p.expect("every table emits a proof"))
+            .collect();
+
         #[cfg(feature = "instruments")]
         {
             // Store timing data for the top-level report in prove_with_options.
@@ -2569,6 +2906,8 @@ pub trait IsStarkProver<
             main_mmcs_specs: main_mmcs_specs_per_chunk,
             aux_mmcs_roots: aux_mmcs_roots_per_chunk,
             aux_mmcs_specs: aux_mmcs_specs_per_chunk,
+            comp_mmcs_roots: comp_mmcs_roots_per_chunk,
+            comp_mmcs_specs: comp_mmcs_specs_per_chunk,
             chunk_size: k as u32,
         })
     }
@@ -2607,24 +2946,30 @@ pub trait IsStarkProver<
     // TODO: propagate errors instead of unwrap() in open_deep_composition_poly and FRI operations
     /// Executes rounds 2-4 and generates a STARK proof for the trace `main_trace` with public inputs `pub_inputs`.
     /// Warning: the transcript must be safely initializated before passing it to this method.
-    fn prove_rounds_2_to_4(
+    /// Part A of Round 2: sample beta + build the composition LDE parts
+    /// + compute tagged row-pair leaves for the chunk composition MMCS.
+    /// Returns the artefacts the chunk-level MMCS build consumes
+    /// alongside this table's tag.
+    fn prove_round_2a(
         air: &dyn AIR<Field = Field, FieldExtension = FieldExtension, PublicInputs = PI>,
         pub_inputs: &PI,
         round_1_result: &Round1<Field, FieldExtension>,
         transcript: &mut impl IsStarkTranscript<FieldExtension, Field>,
         domain: &Domain<Field>,
-    ) -> Result<StarkProof<Field, FieldExtension, PI>, ProvingError>
+        tag: MatrixTag,
+    ) -> Result<
+        (
+            Vec<FieldElement<FieldExtension>>,
+            Vec<FieldElement<FieldExtension>>,
+            R2aResult<FieldExtension>,
+        ),
+        ProvingError,
+    >
     where
         FieldElement<Field>: AsBytes,
         FieldElement<FieldExtension>: AsBytes,
         PI: Send + Sync + Clone,
     {
-        info!("Started proof generation...");
-
-        // ===================================
-        // ==========|   Round 2   |==========
-        // ===================================
-
         // <<<< Receive challenge: 𝛽
         let beta = transcript.sample_field_element();
         let trace_length = domain.interpolation_domain_size;
@@ -2637,35 +2982,47 @@ pub trait IsStarkProver<
             )
             .constraints
             .len();
-
         let num_transition_constraints = air.context().num_transition_constraints;
-
         let mut coefficients: Vec<_> =
             core::iter::successors(Some(FieldElement::one()), |x| Some(x * &beta))
                 .take(num_boundary_constraints + num_transition_constraints)
                 .collect();
-
         let transition_coefficients: Vec<_> =
             coefficients.drain(..num_transition_constraints).collect();
         let boundary_coefficients = coefficients;
-
-        let round_2_result = Self::round_2_compute_composition_polynomial(
+        let r2a = Self::round_2a_build_composition_lde(
             air,
             pub_inputs,
             domain,
             round_1_result,
             &transition_coefficients,
             &boundary_coefficients,
+            tag,
         )?;
+        Ok((transition_coefficients, boundary_coefficients, r2a))
+    }
 
-        // >>>> Send commitments: [H₁], [H₂]
-        transcript.append_bytes(&round_2_result.composition_poly_root);
-
-        // ===================================
-        // ==========|   Round 3   |==========
-        // ===================================
+    /// Part B of Round 2 onward: assumes the chunk composition MMCS root
+    /// has been absorbed into `transcript` already. Runs the absorb of
+    /// the per-table H_i values, R3 OOD, and R4 FRI + opens, producing
+    /// the final per-table StarkProof.
+    #[allow(clippy::too_many_arguments)]
+    fn prove_rounds_2b_to_4(
+        air: &dyn AIR<Field = Field, FieldExtension = FieldExtension, PublicInputs = PI>,
+        pub_inputs: &PI,
+        round_1_result: &Round1<Field, FieldExtension>,
+        round_2_result: &Round2<FieldExtension>,
+        transcript: &mut impl IsStarkTranscript<FieldExtension, Field>,
+        domain: &Domain<Field>,
+    ) -> Result<StarkProof<Field, FieldExtension, PI>, ProvingError>
+    where
+        FieldElement<Field>: AsBytes,
+        FieldElement<FieldExtension>: AsBytes,
+        PI: Send + Sync + Clone,
+    {
+        info!("Started proof generation (post-R2 chunk join)...");
 
-        // <<<< Receive challenge: z
+        // <<<< Receive challenge: z (transcript already saw chunk comp root)
         let z = transcript.sample_z_ood(
             &domain.lde_roots_of_unity_coset,
             &domain.trace_roots_of_unity,
@@ -2677,7 +3034,7 @@ pub trait IsStarkProver<
             air,
             domain,
             round_1_result,
-            &round_2_result,
+            round_2_result,
             &z,
         );
         #[cfg(feature = "instruments")]
@@ -2699,15 +3056,11 @@ pub trait IsStarkProver<
         // ===================================
         // ==========|   Round 4   |==========
         // ===================================
-
-        // Part of this round is running FRI, which is an interactive
-        // protocol on its own. Therefore we pass it the transcript
-        // to simulate the interactions with the verifier.
         let round_4_result = Self::round_4_compute_and_run_fri_on_the_deep_composition_polynomial(
             air,
             domain,
             round_1_result,
-            &round_2_result,
+            round_2_result,
             &round_3_result,
             &z,
             transcript,
@@ -2735,32 +3088,17 @@ pub trait IsStarkProver<
         info!("End proof generation");
 
         Ok(StarkProof {
-            // For preprocessed tables: per-table Merkle root over multiplicities
-            // (preprocessed tables stay out of the shared main-trace MMCS).
             lde_trace_main_merkle_root: round_1_result.main.main_tree_root(),
-            // For preprocessed tables: commitment to precomputed columns only
             lde_trace_precomputed_merkle_root: round_1_result.main.precomputed_root(),
-            // tⱼ(zgᵏ)
             trace_ood_evaluations: round_3_result.trace_ood_evaluations,
-            // [H₁] and [H₂]
-            composition_poly_root: round_2_result.composition_poly_root,
-            // Hᵢ(z^N)
             composition_poly_parts_ood_evaluation: round_3_result
                 .composition_poly_parts_ood_evaluation,
-            // [pₖ]
             fri_layers_merkle_roots: round_4_result.fri_layers_merkle_roots,
-            // pₙ
             fri_last_value: round_4_result.fri_last_value,
-            // Open(p₀(D₀), 𝜐ₛ), Open(pₖ(Dₖ), −𝜐ₛ^(2ᵏ))
             query_list: round_4_result.query_list,
-            // Open(H₁(D_LDE, 𝜐₀), Open(H₂(D_LDE, 𝜐₀), Open(tⱼ(D_LDE), 𝜐₀)
-            // Open(H₁(D_LDE, -𝜐ᵢ), Open(H₂(D_LDE, -𝜐ᵢ), Open(tⱼ(D_LDE), -𝜐ᵢ)
             deep_poly_openings: round_4_result.deep_poly_openings,
-            // nonce obtained from grinding
             nonce: round_4_result.nonce,
-            // Bus interaction public inputs (for boundary constraints and bus balance check)
             bus_public_inputs: round_1_result.bus_public_inputs.clone(),
-            // Public inputs for boundary constraints
             public_inputs: pub_inputs.clone(),
             trace_length: domain.interpolation_domain_size,
         })
diff --git a/crypto/stark/src/tests/mmcs_soundness_tests.rs b/crypto/stark/src/tests/mmcs_soundness_tests.rs
index ab0c8912f..71448e64d 100644
--- a/crypto/stark/src/tests/mmcs_soundness_tests.rs
+++ b/crypto/stark/src/tests/mmcs_soundness_tests.rs
@@ -251,3 +251,55 @@ fn swapped_main_tags_at_verifier_rejected() {
         "swapped main_tags must be rejected"
     );
 }
+
+// ---------- Composition MMCS soundness ----------
+
+fn first_populated_comp_chunk(proof: &MultiProof<F, F, ()>) -> usize {
+    proof
+        .comp_mmcs_roots
+        .iter()
+        .position(|r| r.is_some())
+        .expect("at least one chunk must have a comp MMCS root in this baseline")
+}
+
+#[test_log::test]
+fn tampered_comp_mmcs_root_rejected() {
+    let (air_1, air_2, mut proof) = baseline_proof();
+    let airs: Vec<&dyn AIR<Field = F, FieldExtension = F, PublicInputs = ()>> =
+        vec![&air_1, &air_2];
+    let chunk_idx = first_populated_comp_chunk(&proof);
+    let root = proof.comp_mmcs_roots[chunk_idx]
+        .as_mut()
+        .expect("populated");
+    root[0] ^= 1;
+    assert!(
+        !verify(&airs, &proof),
+        "tampered composition MMCS root must be rejected"
+    );
+}
+
+#[test_log::test]
+fn tampered_comp_mmcs_spec_height_rejected() {
+    let (air_1, air_2, mut proof) = baseline_proof();
+    let airs: Vec<&dyn AIR<Field = F, FieldExtension = F, PublicInputs = ()>> =
+        vec![&air_1, &air_2];
+    let chunk_idx = first_populated_comp_chunk(&proof);
+    proof.comp_mmcs_specs[chunk_idx][0].1 /= 2;
+    assert!(
+        !verify(&airs, &proof),
+        "composition spec height mismatch must be rejected"
+    );
+}
+
+#[test_log::test]
+fn missing_comp_mmcs_root_rejected() {
+    let (air_1, air_2, mut proof) = baseline_proof();
+    let airs: Vec<&dyn AIR<Field = F, FieldExtension = F, PublicInputs = ()>> =
+        vec![&air_1, &air_2];
+    let chunk_idx = first_populated_comp_chunk(&proof);
+    proof.comp_mmcs_roots[chunk_idx] = None;
+    assert!(
+        !verify(&airs, &proof),
+        "missing composition MMCS root must be rejected (every chunk must have one)"
+    );
+}
diff --git a/crypto/stark/src/verifier.rs b/crypto/stark/src/verifier.rs
index 95165a253..dc614fb85 100644
--- a/crypto/stark/src/verifier.rs
+++ b/crypto/stark/src/verifier.rs
@@ -422,26 +422,28 @@ pub trait IsStarkVerifier<
 
     /// Verify opening Open(Hᵢ(D_LDE), 𝜐) and Open(Hᵢ(D_LDE), -𝜐) for all parts Hᵢof the composition
     /// polynomial, where 𝜐 and -𝜐 are the elements corresponding to the index challenge `iota`.
+    /// Verify the composition-trace MMCS opening pair for one query.
+    /// Rehashes the row-pair leaf using the COMPOSITION domain
+    /// separator, checks it matches `matrix_leaves[table_idx]`, and
+    /// authenticates against the chunk's composition root + spec.
     fn verify_composition_poly_opening(
         deep_poly_openings: &DeepPolynomialOpening<Field, FieldExtension>,
-        composition_poly_merkle_root: &Commitment,
-        iota: &usize,
+        comp_mmcs_root: Option<&Commitment>,
+        comp_mmcs_spec: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)],
+        main_tag: crypto::merkle_tree::mmcs::MatrixTag,
+        iota: usize,
     ) -> bool
     where
         FieldElement<Field>: AsBytes + Sync + Send,
-        FieldElement<FieldExtension>: AsBytes + Sync + Send,
+        FieldElement<FieldExtension>: AsBytes + Sync + Send + math::traits::ByteConversion,
     {
-        let mut value = deep_poly_openings.composition_poly.evaluations.clone();
-        value.extend_from_slice(&deep_poly_openings.composition_poly.evaluations_sym);
-
-        deep_poly_openings
-            .composition_poly
-            .proof
-            .verify::<BatchedMerkleTreeBackend<FieldExtension>>(
-                composition_poly_merkle_root,
-                *iota,
-                &value,
-            )
+        verify_comp_mmcs_pair_inner::<FieldExtension>(
+            &deep_poly_openings.composition_poly,
+            iota,
+            main_tag,
+            comp_mmcs_root,
+            comp_mmcs_spec,
+        )
     }
 
     /// Verifies the validity of the purported values of the trace polynomials and the composition polynomial
@@ -456,6 +458,8 @@ pub trait IsStarkVerifier<
         main_mmcs_spec: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)],
         aux_mmcs_root: Option<&Commitment>,
         aux_mmcs_spec: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)],
+        comp_mmcs_root: Option<&Commitment>,
+        comp_mmcs_spec: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)],
     ) -> bool
     where
         FieldElement<Field>: AsBytes + Sync + Send + math::traits::ByteConversion,
@@ -468,8 +472,10 @@ pub trait IsStarkVerifier<
             .all(|(iota_n, deep_poly_opening)| {
                 Self::verify_composition_poly_opening(
                     deep_poly_opening,
-                    &proof.composition_poly_root,
-                    iota_n,
+                    comp_mmcs_root,
+                    comp_mmcs_spec,
+                    main_tag,
+                    *iota_n,
                 ) && Self::verify_trace_openings(
                     proof,
                     deep_poly_opening,
@@ -636,7 +642,7 @@ pub trait IsStarkVerifier<
                 challenges,
                 &lde_base,
                 lde_aux,
-                &opening.composition_poly.evaluations,
+                opening.composition_poly.evaluations(),
             )?);
 
             // Mirror for the symmetric query point.
@@ -660,7 +666,7 @@ pub trait IsStarkVerifier<
                 challenges,
                 &lde_base_sym,
                 lde_aux_sym,
-                &opening.composition_poly.evaluations_sym,
+                opening.composition_poly.evaluations_sym(),
             )?);
         }
         Some((deep_poly_evaluations, deep_poly_evaluations_sym))
@@ -820,13 +826,19 @@ pub trait IsStarkVerifier<
             || multi_proof.main_mmcs_specs.len() != expected_num_chunks
             || multi_proof.aux_mmcs_roots.len() != expected_num_chunks
             || multi_proof.aux_mmcs_specs.len() != expected_num_chunks
+            || multi_proof.comp_mmcs_roots.len() != expected_num_chunks
+            || multi_proof.comp_mmcs_specs.len() != expected_num_chunks
         {
             error!(
-                "per-chunk MMCS Vec lengths inconsistent with chunk_size={chunk_size}:                  expected {expected_num_chunks} chunks; got main_roots={}, main_specs={},                  aux_roots={}, aux_specs={}",
+                "per-chunk MMCS Vec lengths inconsistent with chunk_size={chunk_size}: \
+                 expected {expected_num_chunks} chunks; got main_roots={}, main_specs={}, \
+                 aux_roots={}, aux_specs={}, comp_roots={}, comp_specs={}",
                 multi_proof.main_mmcs_roots.len(),
                 multi_proof.main_mmcs_specs.len(),
                 multi_proof.aux_mmcs_roots.len(),
                 multi_proof.aux_mmcs_specs.len(),
+                multi_proof.comp_mmcs_roots.len(),
+                multi_proof.comp_mmcs_specs.len(),
             );
             return false;
         }
@@ -989,6 +1001,41 @@ pub trait IsStarkVerifier<
             }
         }
 
+        // Per-chunk composition MMCS spec validation. Every table has a
+        // composition polynomial, so every chunk has Some(root). The
+        // composition root is NOT absorbed here at the shared-transcript
+        // level — it gets absorbed PER-TABLE inside `verify_rounds_2_to_4`
+        // between sampling beta and sampling z (mirroring the prover,
+        // which absorbs it into each chunk-mate's fork at that point).
+        for chunk_idx in 0..expected_num_chunks {
+            let chunk_start = chunk_idx * chunk_size;
+            let chunk_end = (chunk_start + chunk_size).min(airs.len());
+
+            let mut expected_comp_spec: Vec<(crypto::merkle_tree::mmcs::MatrixTag, usize)> =
+                Vec::new();
+            for idx in chunk_start..chunk_end {
+                let proof = &multi_proof.proofs[idx];
+                let lde_size =
+                    proof.trace_length * (airs[idx].options().blowup_factor as usize);
+                // Composition MMCS padded height = lde_size / 2 (row-pair leaves).
+                expected_comp_spec.push((main_tags[idx], lde_size / 2));
+            }
+            expected_comp_spec.sort_by(|a, b| b.1.cmp(&a.1).then(a.0.cmp(&b.0)));
+            if expected_comp_spec != multi_proof.comp_mmcs_specs[chunk_idx] {
+                error!(
+                    "chunk {chunk_idx} comp_mmcs_spec mismatch: expected {:?}, got {:?}",
+                    expected_comp_spec, multi_proof.comp_mmcs_specs[chunk_idx],
+                );
+                return false;
+            }
+            if multi_proof.comp_mmcs_roots[chunk_idx].is_none() {
+                error!(
+                    "chunk {chunk_idx} comp_mmcs_root missing (every chunk must commit at least one composition matrix)"
+                );
+                return false;
+            }
+        }
+
         // =====================================================================
         // Rounds 2-4: Forked per table
         // =====================================================================
@@ -1011,8 +1058,8 @@ pub trait IsStarkVerifier<
                 table_transcript.append_field_element(&bpi.table_contribution);
             }
 
-            // Per-chunk lookup: each table's main / aux MMCS root + spec
-            // come from its chunk.
+            // Per-chunk lookup: each table's main / aux / comp MMCS
+            // root + spec come from its chunk.
             let table_chunk_idx = idx / chunk_size;
             let main_root_for_chunk =
                 multi_proof.main_mmcs_roots[table_chunk_idx].as_ref();
@@ -1021,6 +1068,10 @@ pub trait IsStarkVerifier<
             let aux_root_for_chunk = multi_proof.aux_mmcs_roots[table_chunk_idx].as_ref();
             let aux_spec_for_chunk: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)] =
                 &multi_proof.aux_mmcs_specs[table_chunk_idx];
+            let comp_root_for_chunk =
+                multi_proof.comp_mmcs_roots[table_chunk_idx].as_ref();
+            let comp_spec_for_chunk: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)] =
+                &multi_proof.comp_mmcs_specs[table_chunk_idx];
 
             if !Self::verify_rounds_2_to_4(
                 *air,
@@ -1032,6 +1083,8 @@ pub trait IsStarkVerifier<
                 main_spec_for_chunk,
                 aux_root_for_chunk,
                 aux_spec_for_chunk,
+                comp_root_for_chunk,
+                comp_spec_for_chunk,
             ) {
                 error!(
                     "Table {} failed verify_rounds_2_to_4 (num_constraints={}, trace_cols={})",
@@ -1102,12 +1155,18 @@ pub trait IsStarkVerifier<
 
     /// Replays rounds 2, 3 and 4 of the protocol for a given proof, assuming round 1 has
     /// already been replayed and the RAP challenges are known.
+    ///
+    /// `comp_mmcs_root` is this table's chunk composition MMCS root,
+    /// absorbed between beta and z sampling. The prover absorbs the
+    /// same root into each chunk-mate's fork.
+    #[allow(clippy::too_many_arguments)]
     fn replay_rounds_after_round_1(
         air: &dyn AIR<Field = Field, FieldExtension = FieldExtension, PublicInputs = PI>,
         proof: &StarkProof<Field, FieldExtension, PI>,
         domain: &VerifierDomain<Field>,
         transcript: &mut impl IsStarkTranscript<FieldExtension, Field>,
         rap_challenges: Vec<FieldElement<FieldExtension>>,
+        comp_mmcs_root: Option<&Commitment>,
     ) -> Challenges<FieldExtension>
     where
         FieldElement<Field>: AsBytes,
@@ -1138,8 +1197,11 @@ pub trait IsStarkVerifier<
         let transition_coeffs: Vec<_> = coefficients.drain(..num_transition_constraints).collect();
         let boundary_coeffs = coefficients;
 
-        // <<<< Receive commitments: [H₁], [H₂]
-        transcript.append_bytes(&proof.composition_poly_root);
+        // <<<< Receive commitment: chunk composition MMCS root (one
+        // absorb per chunk-mate's fork, mirroring `multi_prove`).
+        if let Some(root) = comp_mmcs_root {
+            transcript.append_bytes(root);
+        }
 
         // ===================================
         // ==========|   Round 3   |==========
@@ -1251,6 +1313,8 @@ pub trait IsStarkVerifier<
         main_mmcs_spec: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)],
         aux_mmcs_root: Option<&Commitment>,
         aux_mmcs_spec: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)],
+        comp_mmcs_root: Option<&Commitment>,
+        comp_mmcs_spec: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)],
     ) -> bool
     where
         FieldElement<Field>: AsBytes + Sync + Send + math::traits::ByteConversion,
@@ -1268,8 +1332,14 @@ pub trait IsStarkVerifier<
         #[cfg(feature = "instruments")]
         let timer1 = Instant::now();
 
-        let challenges =
-            Self::replay_rounds_after_round_1(air, proof, &domain, transcript, rap_challenges);
+        let challenges = Self::replay_rounds_after_round_1(
+            air,
+            proof,
+            &domain,
+            transcript,
+            rap_challenges,
+            comp_mmcs_root,
+        );
 
         // verify grinding
         let security_bits = air.context().proof_options.grinding_factor;
@@ -1335,6 +1405,8 @@ pub trait IsStarkVerifier<
             main_mmcs_spec,
             aux_mmcs_root,
             aux_mmcs_spec,
+            comp_mmcs_root,
+            comp_mmcs_spec,
         ) {
             #[cfg(not(feature = "test_fiat_shamir"))]
             error!("DEEP Composition Polynomial verification failed");
@@ -1488,3 +1560,64 @@ where
         mmcs_opening_sym.verify::<BatchedMerkleTreeBackend<E>>(aux_mmcs_root, aux_mmcs_spec);
     ok && ok_sym
 }
+
+/// Composition-trace counterpart of [`verify_main_mmcs_pair_inner`]. Uses
+/// `LEAF_DOMAIN_TAG_COMPOSITION` for rehash; the leaf hashes a row-PAIR
+/// rather than a single row, so the opening covers both `evaluations`
+/// (row 0 / br_0) and `evaluations_sym` (row 1 / br_1) under one MMCS
+/// opening — no separate `_sym` opening at this layer (the underlying
+/// tree's leaves are already row-pairs).
+fn verify_comp_mmcs_pair_inner<E>(
+    comp_opening: &crate::proof::stark::CompositionTraceOpening<E>,
+    iota: usize,
+    main_tag: crypto::merkle_tree::mmcs::MatrixTag,
+    comp_mmcs_root: Option<&Commitment>,
+    comp_mmcs_spec: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)],
+) -> bool
+where
+    E: IsField,
+    FieldElement<E>: AsBytes + Sync + Send + math::traits::ByteConversion,
+{
+    use crate::mmcs_leaf::hash_tagged_row_pair_composition;
+    use crate::proof::stark::CompositionTraceOpening;
+
+    let comp_mmcs_root = match comp_mmcs_root {
+        Some(r) => r,
+        None => return false,
+    };
+    let CompositionTraceOpening::Mmcs {
+        evaluations,
+        evaluations_sym,
+        mmcs_opening,
+    } = comp_opening;
+
+    let table_idx = match comp_mmcs_spec.iter().position(|(t, _)| *t == main_tag) {
+        Some(i) => i,
+        None => return false,
+    };
+    let table_height = comp_mmcs_spec[table_idx].1;
+    let max_height = match comp_mmcs_spec.first().map(|(_, h)| *h) {
+        Some(h) => h,
+        None => return false,
+    };
+    if !table_height.is_power_of_two() || max_height < table_height {
+        return false;
+    }
+    let shift = (max_height / table_height).trailing_zeros() as usize;
+    // Composition opens at row-pair index iota, so the global index in
+    // the chunk MMCS is iota shifted up by the chunk-mate's depth diff.
+    let g_index = iota << shift;
+    if mmcs_opening.global_index != g_index {
+        return false;
+    }
+
+    let leaf = hash_tagged_row_pair_composition::<E>(main_tag, evaluations, evaluations_sym);
+    let leaves = &mmcs_opening.matrix_leaves;
+    if table_idx >= leaves.len() {
+        return false;
+    }
+    if leaves[table_idx].0 != main_tag || leaves[table_idx].1 != leaf {
+        return false;
+    }
+    mmcs_opening.verify::<BatchedMerkleTreeBackend<E>>(comp_mmcs_root, comp_mmcs_spec)
+}

From 7512928c5ef77a70a1fbc352fed7036600655530 Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Thu, 28 May 2026 12:15:45 -0300
Subject: [PATCH 20/21] feat(stark/fri): linear_combine_evaluations helper for
 Phase D
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Mathematical primitive for batched FRI: combine N same-size evaluation
vectors into one using successive powers of δ_fri. A bucket's chip-DEEP
evaluations combine into one polynomial; FRI commits + opens only that
combined polynomial, while per-chip openings (main/aux/comp) still
authenticate each individual D_i value at the bucket-shared iotas.

Singleton bucket is a fast path (no combination needed). Empty input is
a usage bug (debug-asserted).

Not yet wired up — prover + verifier integration lands in the follow-up
chunk-join refactor.

4/4 new fri:: unit tests green. 157 prior stark tests unaffected.
---
 crypto/stark/src/fri/mod.rs | 98 +++++++++++++++++++++++++++++++++++++
 1 file changed, 98 insertions(+)

diff --git a/crypto/stark/src/fri/mod.rs b/crypto/stark/src/fri/mod.rs
index bbb988bd1..42682947e 100644
--- a/crypto/stark/src/fri/mod.rs
+++ b/crypto/stark/src/fri/mod.rs
@@ -15,6 +15,49 @@ use self::fri_functions::{
     compute_coset_twiddles_inv, fold_evaluations_in_place, update_twiddles_in_place,
 };
 
+/// Linearly combine N same-size evaluation vectors into one, using
+/// successive powers of `delta_fri`:
+///
+/// `out[i] = sources[0][i] + delta_fri * sources[1][i] + delta_fri^2 *
+/// sources[2][i] + ...`
+///
+/// This is the mathematical primitive behind Phase D batched FRI: a
+/// bucket's chip-DEEP-LDEs are combined into one polynomial whose
+/// low-degree-ness implies each summand's. FRI then commits + opens
+/// just that combined polynomial.
+///
+/// Empty `sources` is a usage bug — caller must pre-filter.
+/// All `sources[i]` must share the same length; debug-asserted.
+pub fn linear_combine_evaluations<E: IsField>(
+    sources: &[&[FieldElement<E>]],
+    delta_fri: &FieldElement<E>,
+) -> Vec<FieldElement<E>> {
+    debug_assert!(
+        !sources.is_empty(),
+        "linear_combine_evaluations: caller must supply at least one source"
+    );
+    let n = sources[0].len();
+    debug_assert!(
+        sources.iter().all(|s| s.len() == n),
+        "linear_combine_evaluations: all source vectors must share length"
+    );
+
+    if sources.len() == 1 {
+        // Singleton bucket: combining one polynomial is the identity.
+        return sources[0].to_vec();
+    }
+
+    let mut out = sources[0].to_vec();
+    let mut coeff = delta_fri.clone();
+    for src in &sources[1..] {
+        for (o, s) in out.iter_mut().zip(src.iter()) {
+            *o = &*o + &coeff * s;
+        }
+        coeff = &coeff * delta_fri;
+    }
+    out
+}
+
 /// FRI commit phase from pre-computed bit-reversed evaluations, skipping the
 /// initial FFT. Use this when the caller already has the evaluation vector
 /// (e.g. from a fused LDE pipeline).
@@ -126,3 +169,58 @@ where
             .collect()
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use math::field::goldilocks::GoldilocksField;
+
+    type FE = FieldElement<GoldilocksField>;
+
+    #[test]
+    fn linear_combine_singleton_is_identity() {
+        let v = vec![FE::from(7u64), FE::from(11u64), FE::from(13u64), FE::from(17u64)];
+        let delta = FE::from(99u64);
+        let out = linear_combine_evaluations(&[&v[..]], &delta);
+        assert_eq!(out, v);
+    }
+
+    #[test]
+    fn linear_combine_two_sources_uses_horner_in_delta() {
+        // out[i] = a[i] + delta * b[i]
+        let a = vec![FE::from(1u64), FE::from(2u64), FE::from(3u64)];
+        let b = vec![FE::from(10u64), FE::from(20u64), FE::from(30u64)];
+        let delta = FE::from(5u64);
+        let out = linear_combine_evaluations(&[&a[..], &b[..]], &delta);
+        let expected: Vec<FE> = a
+            .iter()
+            .zip(b.iter())
+            .map(|(x, y)| x + &delta * y)
+            .collect();
+        assert_eq!(out, expected);
+    }
+
+    #[test]
+    fn linear_combine_three_sources_powers_of_delta() {
+        // out[i] = a[i] + delta * b[i] + delta^2 * c[i]
+        let a = vec![FE::from(1u64), FE::from(0u64)];
+        let b = vec![FE::from(0u64), FE::from(1u64)];
+        let c = vec![FE::from(1u64), FE::from(1u64)];
+        let delta = FE::from(3u64);
+        let out = linear_combine_evaluations(&[&a[..], &b[..], &c[..]], &delta);
+        let delta_sq = &delta * &delta;
+        // out[0] = 1 + 3*0 + 9*1 = 10
+        // out[1] = 0 + 3*1 + 9*1 = 12
+        assert_eq!(out[0], FE::from(1u64) + &delta_sq);
+        assert_eq!(out[1], FE::from(3u64) + &delta_sq);
+    }
+
+    #[test]
+    fn linear_combine_zero_delta_keeps_only_first_source() {
+        let a = vec![FE::from(7u64), FE::from(7u64)];
+        let b = vec![FE::from(99u64), FE::from(99u64)];
+        let zero = FE::from(0u64);
+        let out = linear_combine_evaluations(&[&a[..], &b[..]], &zero);
+        assert_eq!(out, a);
+    }
+}

From c8025237a884b90ad8aaceaf15d50b52d4225378 Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Thu, 28 May 2026 15:53:18 -0300
Subject: [PATCH 21/21] feat(stark/fri): per-(chunk, lde_size) batched FRI +
 streaming bucket combine
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase D of the MMCS streaming plan: one FRI commit per height bucket
inside a chunk. Bucket-mates' DEEP composition polynomials are linearly
combined with successive powers of δ_fri (sampled from the chunk-shared,
post-OOD-broadcast transcript) and a single FRI commit + grinding +
query phase runs on the combined polynomial. Drops 4 per-table proof
fields; adds one `MultiProof` field.

Built on top of (and is the final piece of) the streaming MMCS plan:
trace-MMCS (b1621808) and composition-MMCS (c22fca9e) shave ~10-15% of
proof bytes; batched FRI is where the rest of the proof-size win lives.

# Proof format

`StarkProof` drops:
  - `fri_layers_merkle_roots: Vec<Commitment>`
  - `fri_last_value: FieldElement<E>`
  - `query_list: Vec<FriDecommitment<E>>`
  - `nonce: Option<u64>`

`MultiProof` gains:
  - `fri_chunk_buckets: Vec<Vec<ChunkBucketFri<E>>>`

`ChunkBucketFri { lde_size, members, layer_roots, last_value,
decommitments, nonce }` — bucket members listed in canonical
chunk-local-index order so δ_fri^i powers are reproducible verifier-side.

# Bucket-seed transcript (canonical OOD broadcast)

For chunk c, after Phase A+B+C absorbs, the bucket seed extends a clone
of the pre-fork shared state by appending, in exact order:
  1. per chunk-mate j in chunk-local order: table_contribution_j (when
     bus_public_inputs is Some)
  2. chunk composition MMCS root (when Some)
  3. per chunk-mate j in chunk-local order: trace_ood_evaluations
     columns + composition_poly_parts_ood_evaluation

Per bucket b: bucket_transcript = bucket_seed.clone(), append
`(lde_size as u64).to_le_bytes()`, sample δ_fri, then standard
commit_phase_from_evaluations / grinding / iotas interaction.

The verifier reconstructs the bucket seed from `MultiProof` data alone
(table contributions from `bus_public_inputs`, comp root from
`comp_mmcs_roots`, OODs from each `StarkProof.trace_ood_evaluations` +
`composition_poly_parts_ood_evaluation`).

# Prover (multi_prove)

R2b → R3.5 split into a light per-fork pass + heavy bucket-level pass:

  - `prove_rounds_2b_to_3_5(...)` per chunk-mate (now parallel via
    rayon `par_iter_mut()` on the chunk transcript slice): sample z,
    compute OOD, absorb own OOD, sample γ, build the DEEP coefficient
    layout (`DeepCoeffs { z, gammas, trace_term_coeffs, lde_size }`).
    Does NOT build the DEEP LDE.

  - Build `bucket_seed` (canonical replay on `pre_fork_transcript`).

  - Bucket chunk-mates by `lde_size` (first-encounter order on
    chunk-local index).

  - Per bucket: clone bucket_seed, append lde_size, sample δ_fri.
    Streaming combine: per member, `compute_deep_lde_with_coeffs` →
    fold into the bucket accumulator with δ_fri^i_local → drop. First
    member of every bucket reuses its DEEP LDE as the accumulator
    base (no zero-init copy in the singleton case). Then
    commit_phase_from_evaluations / grind / sample iotas /
    query_phase on the combined polynomial.

  - Per chunk-mate: open trace / aux / comp / precomputed at the
    bucket-shared iotas, assemble per-table `StarkProof`. Iotas come
    from the bucket cache; no transcript replay needed.

# Verifier (multi_verify)

  - `replay_rounds_after_round_1` stops at γ — leaves `zetas`,
    `iotas`, and `grinding_seed` empty in the returned `Challenges`.
  - `verify_rounds_2_to_4` split into `replay_and_verify_step_2`
    (per-fork, returns `Option<Challenges>`) + `verify_step_4_at_iotas`
    (driven from `multi_verify` after the bucket FRI sets iotas).
  - `step_3_verify_fri` / `verify_query_and_sym_openings` replaced by
    `verify_bucket_fri_query` which takes the bucket's
    `layer_roots` + `last_value` + per-iota `FriDecommitment`
    externally and runs the standard FRI fold check against the
    combined-D evaluations.

  - `multi_verify` chunk loop:
      1. Per chunk-mate: fork transcript → bind table_contribution →
         `replay_and_verify_step_2` → store Challenges.
      2. Build `bucket_seed` from `MultiProof` data.
      3. Validate `fri_chunk_buckets[chunk_idx]` structure: bucket
         count matches the expected lde_size grouping; per bucket
         lde_size and member tag order match.
      4. Per bucket: replay layer-root absorbs + zetas + last_value +
         grinding + iotas (cached in `bucket_iotas_cache`).
         Reconstruct each member's D_i evaluations via the existing
         barycentric helper, combine with δ_fri^i, run
         `verify_bucket_fri_query` per iota.
      5. Per chunk-mate: `verify_step_4_at_iotas` using cached iotas.

# Performance considerations (F1 fixes)

The first draft of D5 retained K × DEEP LDE simultaneously and lost
per-chunk-mate parallelism in R2b-R3.5 (c22fca9e had serialised it):
proving time on the workload regressed ~117% while memory only dropped
~10%. Three fixes shipped together:

  - F1.1 (streaming bucket combine): DEEP_i materialises one at a
    time inside the bucket loop and folds into the accumulator with
    δ_fri^i, then drops. Peak DEEP memory inside the bucket loop is
    2 × |LDE| (current member + accumulator), restored from K × |LDE|.

  - F1.2 (restore per-chunk-mate parallelism): `chunk_transcripts.
    par_iter_mut().enumerate()` runs R2b → R3 → DEEP-coeffs
    concurrently per chunk-mate. `chunk_airs` + `chunk_num_rows` are
    extracted before the parallel iter so we don't capture the
    Sync-unfriendly `&mut TraceTable` inside `air_trace_pairs`.

  - F1.3 (verifier iota cache): `bucket_iotas_cache: Vec<Vec<usize>>`
    is populated during the FRI fold check pass and reused in the
    step 4 pass, eliminating a redundant clone+replay of the bucket
    transcript per chunk-mate.

# Tests (D7)

New `tests/bucket_fri_soundness_tests.rs` (7 tests):
  - baseline_phase_d_proof_verifies
  - tampered_bucket_last_value_rejected
  - tampered_bucket_layer_root_rejected
  - truncated_bucket_decommitments_rejected
  - missing_chunk_buckets_rejected
  - wrong_bucket_lde_size_rejected
  - swapped_member_order_rejected

`bin/cli/proof-size` breakdown: drops `fri_layers_merkle_roots` and
`fri_query_list`; adds `fri_chunk_buckets (per-chunk batched FRI)`.

# Results

  - `cargo test -p stark --lib` → 168/168 (161 prior + 7 new bucket
    FRI soundness tests).
  - `cargo test -p crypto --lib` → 73/73.
  - `cargo test -p cli` → 3/3.
  - `cargo test -p lambda-vm-prover --lib tests::bitwise*` → 29/29
    (preprocessed-AIR exercise).
  - 77 `tests::prove_elfs_tests::*` failures are the unchanged
    pre-existing `UnknownSyscall(5)` executor bug, present on main.

# Out of scope (follow-ups)

  - Fuse compute-DEEP + bucket-combine into a single per-LDE-point
    pass (eliminate the intermediate `Vec` for the current member).
  - Adaptive chunk sizing by LDE memory budget (replace fixed
    `chunk_size = table_parallelism()` with a bin-packing decision).
  - Disk-spill `StarkProof` + `ChunkBucketFri` between chunks for
    very-large-table workloads.
---
 bin/cli/src/main.rs                           |  19 +-
 crypto/stark/src/proof/stark.rs               |  59 +-
 crypto/stark/src/prover.rs                    | 509 ++++++++++-----
 .../src/tests/bucket_fri_soundness_tests.rs   | 168 +++++
 crypto/stark/src/tests/mod.rs                 |   1 +
 crypto/stark/src/verifier.rs                  | 604 ++++++++++++------
 6 files changed, 973 insertions(+), 387 deletions(-)
 create mode 100644 crypto/stark/src/tests/bucket_fri_soundness_tests.rs

diff --git a/bin/cli/src/main.rs b/bin/cli/src/main.rs
index 86a6dbddf..6e310264b 100644
--- a/bin/cli/src/main.rs
+++ b/bin/cli/src/main.rs
@@ -697,6 +697,8 @@ fn cmd_proof_size(
     let comp_mmcs_roots_bytes = ser_len(&vm_proof.proof.comp_mmcs_roots);
     let comp_mmcs_specs_bytes = ser_len(&vm_proof.proof.comp_mmcs_specs);
     let chunk_size_bytes = ser_len(&vm_proof.proof.chunk_size);
+    // Phase D: per-(chunk, bucket) batched FRI.
+    let fri_chunk_buckets_bytes = ser_len(&vm_proof.proof.fri_chunk_buckets);
 
     // Sum per-section across every sub-proof so a single number captures the
     // contribution of, e.g., "all FRI query lists across all tables".
@@ -704,8 +706,6 @@ fn cmd_proof_size(
     let mut s_precomputed_trace_openings = 0usize;
     let mut s_aux_trace_openings = 0usize;
     let mut s_composition_openings = 0usize;
-    let mut s_fri_query_list = 0usize;
-    let mut s_fri_layers_roots = 0usize;
     let mut s_trace_ood = 0usize;
     let mut s_composition_ood = 0usize;
     let mut s_per_table_main_root = 0usize;
@@ -718,8 +718,6 @@ fn cmd_proof_size(
         s_precomputed_root += ser_len(&proof.lde_trace_precomputed_merkle_root);
         s_trace_ood += ser_len(&proof.trace_ood_evaluations);
         s_composition_ood += ser_len(&proof.composition_poly_parts_ood_evaluation);
-        s_fri_query_list += ser_len(&proof.query_list);
-        s_fri_layers_roots += ser_len(&proof.fri_layers_merkle_roots);
         s_bus_public_inputs += ser_len(&proof.bus_public_inputs);
 
         for opening in &proof.deep_poly_openings {
@@ -730,9 +728,8 @@ fn cmd_proof_size(
         }
     }
 
-    // Anything not captured above (composition_poly_root, fri_last_value,
-    // nonce, public_inputs, trace_length, headers...). Calculate as the
-    // bundle delta so the breakdown still sums to ~total.
+    // Anything not captured above (public_inputs, trace_length, headers...).
+    // Calculate as the bundle delta so the breakdown still sums to ~total.
     let accounted = main_mmcs_roots_bytes
         + main_mmcs_specs_bytes
         + aux_mmcs_roots_bytes
@@ -740,12 +737,11 @@ fn cmd_proof_size(
         + comp_mmcs_roots_bytes
         + comp_mmcs_specs_bytes
         + chunk_size_bytes
+        + fri_chunk_buckets_bytes
         + s_main_trace_openings
         + s_precomputed_trace_openings
         + s_aux_trace_openings
         + s_composition_openings
-        + s_fri_query_list
-        + s_fri_layers_roots
         + s_trace_ood
         + s_composition_ood
         + s_per_table_main_root
@@ -767,12 +763,11 @@ fn cmd_proof_size(
         ProofSizeEntry { section: "deep_poly_openings.precomputed_trace_polys".into(), bytes: s_precomputed_trace_openings },
         ProofSizeEntry { section: "deep_poly_openings.aux_trace_polys".into(), bytes: s_aux_trace_openings },
         ProofSizeEntry { section: "deep_poly_openings.composition_poly".into(), bytes: s_composition_openings },
-        ProofSizeEntry { section: "fri_layers_merkle_roots".into(), bytes: s_fri_layers_roots },
-        ProofSizeEntry { section: "fri_query_list".into(), bytes: s_fri_query_list },
+        ProofSizeEntry { section: "fri_chunk_buckets (per-chunk batched FRI)".into(), bytes: fri_chunk_buckets_bytes },
         ProofSizeEntry { section: "trace_ood_evaluations".into(), bytes: s_trace_ood },
         ProofSizeEntry { section: "composition_poly_parts_ood_evaluation".into(), bytes: s_composition_ood },
         ProofSizeEntry { section: "bus_public_inputs".into(), bytes: s_bus_public_inputs },
-        ProofSizeEntry { section: "other (headers / public_inputs / nonce / ...)".into(), bytes: s_other },
+        ProofSizeEntry { section: "other (headers / public_inputs / ...)".into(), bytes: s_other },
     ];
 
     if json {
diff --git a/crypto/stark/src/proof/stark.rs b/crypto/stark/src/proof/stark.rs
index 32ac76184..7fb68751e 100644
--- a/crypto/stark/src/proof/stark.rs
+++ b/crypto/stark/src/proof/stark.rs
@@ -155,17 +155,12 @@ pub struct StarkProof<F: IsSubFieldOf<E>, E: IsField, PI> {
     pub trace_ood_evaluations: Table<E>,
     // Hᵢ(z^N)
     pub composition_poly_parts_ood_evaluation: Vec<FieldElement<E>>,
-    // [pₖ]
-    pub fri_layers_merkle_roots: Vec<Commitment>,
-    // pₙ
-    pub fri_last_value: FieldElement<E>,
-    // Open(pₖ(Dₖ), −𝜐ₛ^(2ᵏ))
-    pub query_list: Vec<FriDecommitment<E>>,
-    // Open(H₁(D_LDE, 𝜐ᵢ), Open(H₂(D_LDE, 𝜐ᵢ), Open(tⱼ(D_LDE), 𝜐ᵢ)
-    // Open(H₁(D_LDE, -𝜐ᵢ), Open(H₂(D_LDE, -𝜐ᵢ), Open(tⱼ(D_LDE), -𝜐ᵢ)
+    // Per-query openings of THIS table's main / aux / composition / precomputed
+    // data, indexed at the SHARED bucket iotas (Phase D batched FRI). The FRI
+    // commit + last value + query decommitments + grinding nonce now live at
+    // chunk-bucket level in `MultiProof::fri_chunk_buckets`; this proof only
+    // carries the per-table trace authentication.
     pub deep_poly_openings: DeepPolynomialOpenings<F, E>,
-    // nonce obtained from grinding
-    pub nonce: Option<u64>,
     // Bus interaction public inputs for the accumulated column.
     // Contains the table contribution (L), used for:
     // 1. Circular constraint offset: L/N per row
@@ -230,4 +225,48 @@ pub struct MultiProof<F: IsSubFieldOf<E>, E: IsField, PI> {
     /// proving time. The verifier uses this to chunk the AIR slice into
     /// the same per-chunk grouping the prover used.
     pub chunk_size: u32,
+    /// Per-(chunk, lde_size-bucket) batched FRI instances. Outer Vec is
+    /// indexed by chunk (parallel to `main_mmcs_roots` etc.); inner Vec
+    /// lists buckets in canonical first-encounter (chunk-local-index
+    /// ascending) order. Each `ChunkBucketFri` carries the FRI layer
+    /// roots, last value, per-iota decommitments, and grinding nonce
+    /// for ONE linearly-combined DEEP composition polynomial committing
+    /// to every bucket-mate's individual D_i (combined with successive
+    /// powers of the bucket's `delta_fri` challenge).
+    pub fri_chunk_buckets: Vec<Vec<ChunkBucketFri<E>>>,
+}
+
+/// Phase D — per-(chunk, lde_size) batched FRI instance.
+///
+/// One per height bucket inside a chunk: bucket-mates' individual DEEP
+/// composition polynomials are linearly combined with successive powers
+/// of `delta_fri` (sampled from the chunk-shared, post-OOD-broadcast
+/// transcript), and a single FRI commit + grinding + query is run on
+/// the combined polynomial. The `members` list pins the canonical
+/// bucket-local order used to derive `delta_fri^i` on the verifier side;
+/// reordering the list rejects the proof.
+///
+/// `decommitments` length equals `air.options().fri_number_of_queries`
+/// (one decommitment per shared iota). `nonce` is `Some` when the
+/// AIR's grinding factor > 0 (`None` otherwise).
+#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
+#[serde(bound = "")]
+pub struct ChunkBucketFri<E: IsField> {
+    /// LDE size shared by every bucket-mate. Equal to
+    /// `trace_length * blowup_factor` for each member.
+    pub lde_size: u32,
+    /// Bucket-mate tags in the canonical bucket-local order (matches
+    /// chunk-local index ascending). Index `i` here corresponds to
+    /// `delta_fri^i` in the linear combination.
+    pub members: Vec<MatrixTag>,
+    /// `[pₖ]` for k = 1..num_layers.
+    pub layer_roots: Vec<Commitment>,
+    /// `pₙ` — the final folded constant.
+    pub last_value: FieldElement<E>,
+    /// One FRI decommitment per shared iota (the bucket transcript
+    /// samples a single iota list reused by every bucket-mate's
+    /// per-table opening).
+    pub decommitments: Vec<FriDecommitment<E>>,
+    /// Grinding nonce, when `grinding_factor > 0`.
+    pub nonce: Option<u64>,
 }
diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs
index 72f2a822f..bd913d4d1 100644
--- a/crypto/stark/src/prover.rs
+++ b/crypto/stark/src/prover.rs
@@ -39,7 +39,6 @@ use crypto::merkle_tree::mmcs::{MatrixTag, Mmcs, MmcsError, StreamingMmcsBuilder
 use super::config::{BatchedMerkleTree, BatchedMerkleTreeBackend, Commitment};
 use super::constraints::evaluator::ConstraintEvaluator;
 use super::domain::{Domain, DomainConstants};
-use super::fri::fri_decommit::FriDecommitment;
 use super::grinding;
 use super::lookup::BusPublicInputs;
 use super::proof::stark::{DeepPolynomialOpening, MultiProof, StarkProof};
@@ -507,26 +506,26 @@ where
 
 /// A container for the results of the third round of the STARK Prove protocol.
 pub(crate) struct Round3<F: IsField> {
-    /// Evaluations of the trace polynomials, main ans auxiliary, at the out-of-domain challenge.
-    trace_ood_evaluations: Table<F>,
+    /// Evaluations of the trace polynomials, main and auxiliary, at the out-of-domain challenge.
+    pub(crate) trace_ood_evaluations: Table<F>,
     /// Evaluations of the composition polynomial parts at the out-of-domain challenge.
-    composition_poly_parts_ood_evaluation: Vec<FieldElement<F>>,
+    pub(crate) composition_poly_parts_ood_evaluation: Vec<FieldElement<F>>,
 }
 
-/// A container for the results of the fourth round of the STARK Prove protocol.
-pub(crate) struct Round4<F: IsSubFieldOf<E>, E: IsField> {
-    /// The final value resulting from folding the Deep composition polynomial all the way down to a constant value.
-    fri_last_value: FieldElement<E>,
-    /// The commitments to the fold polynomials of the inner layers of FRI.
-    fri_layers_merkle_roots: Vec<Commitment>,
-    /// The values and proofs of validity of the evaluations of the trace polynomials and the composition polynomials
-    /// parts at the domain values corresponding to the FRI query challenges and their symmetric counterparts.
-    deep_poly_openings: DeepPolynomialOpenings<F, E>,
-    /// The values and proofs of validity of the evaluations of the fold polynomials of the inner
-    /// layers of FRI at the values corresponding to the symmetrics of the FRI query challenges.
-    query_list: Vec<FriDecommitment<E>>,
-    /// The proof of work nonce.
-    nonce: Option<u64>,
+/// Per-table DEEP composition coefficient layout sampled at R3.5 from
+/// the per-fork transcript. Stored across chunk-mates so the bucket FRI
+/// loop can rebuild DEEP_i LDEs on demand (one at a time) and fold them
+/// into the bucket-combined accumulator without retaining K full DEEPs
+/// in memory simultaneously.
+pub(crate) struct DeepCoeffs<E: IsField> {
+    pub(crate) z: FieldElement<E>,
+    /// γⱼ for the composition-poly OOD H_j parts.
+    pub(crate) gammas: Vec<FieldElement<E>>,
+    /// γ′ⱼₖ for the trace-column OOD terms, grouped by column.
+    pub(crate) trace_term_coeffs: Vec<Vec<FieldElement<E>>>,
+    /// LDE size — matches `round_1.lde_trace`'s length and is the size
+    /// of the DEEP LDE this table contributes to its bucket.
+    pub(crate) lde_size: usize,
 }
 
 /// Returns the evaluations of the polynomial `p` over the lde domain defined by the given
@@ -1631,23 +1630,24 @@ pub trait IsStarkProver<
         }
     }
 
-    /// Returns the result of the fourth round of the STARK Prove protocol.
-    fn round_4_compute_and_run_fri_on_the_deep_composition_polynomial(
+    /// Round 3.5 part A — sample γ from the per-fork transcript and
+    /// build the DEEP composition coefficient layout. Cheap (a few
+    /// field elements + Vec allocations); the heavy LDE compute is
+    /// deferred to [`compute_deep_lde_with_coeffs`] which runs **inside
+    /// the bucket loop**, so DEEP_i is materialised one at a time and
+    /// folded into the bucket accumulator with `δ_fri^i` before
+    /// dropping.
+    fn sample_deep_coeffs(
         air: &dyn AIR<Field = Field, FieldExtension = FieldExtension, PublicInputs = PI>,
         domain: &Domain<Field>,
-        round_1_result: &Round1<Field, FieldExtension>,
         round_2_result: &Round2<FieldExtension>,
-        round_3_result: &Round3<FieldExtension>,
-        z: &FieldElement<FieldExtension>,
+        z: FieldElement<FieldExtension>,
         transcript: &mut impl IsStarkTranscript<FieldExtension, Field>,
-    ) -> Round4<Field, FieldExtension>
+    ) -> DeepCoeffs<FieldExtension>
     where
         FieldElement<FieldExtension>: AsBytes,
         FieldElement<Field>: AsBytes,
     {
-        let coset_offset_u64 = air.context().proof_options.coset_offset;
-        let coset_offset = FieldElement::<Field>::from(coset_offset_u64);
-
         let gamma = transcript.sample_field_element();
 
         let n_terms_composition_poly = round_2_result.lde_composition_poly_evaluations.len();
@@ -1670,84 +1670,64 @@ pub trait IsStarkProver<
         // <<<< Receive challenges: 𝛾ⱼ, 𝛾ⱼ'
         let gammas = deep_composition_coefficients;
 
-        // Compute p₀ (deep composition polynomial) as N evaluations on trace-size coset
+        let lde_size = domain.lde_roots_of_unity_coset.len();
+        DeepCoeffs {
+            z,
+            gammas,
+            trace_term_coeffs,
+            lde_size,
+        }
+    }
+
+    /// Round 3.5 part B — compute the DEEP composition polynomial on
+    /// the LDE coset (bit-reverse permuted, ready for FRI). Pure
+    /// computation, no transcript interaction. Called once per
+    /// chunk-mate inside the bucket loop so we can drop each DEEP_i
+    /// after folding it into the bucket-shared accumulator.
+    fn compute_deep_lde_with_coeffs(
+        domain: &Domain<Field>,
+        round_1_result: &Round1<Field, FieldExtension>,
+        round_2_result: &Round2<FieldExtension>,
+        round_3_result: &Round3<FieldExtension>,
+        coeffs: &DeepCoeffs<FieldExtension>,
+    ) -> Vec<FieldElement<FieldExtension>>
+    where
+        FieldElement<FieldExtension>: AsBytes,
+        FieldElement<Field>: AsBytes,
+    {
         #[cfg(feature = "instruments")]
         let t_sub = Instant::now();
         let deep_evals = Self::compute_deep_composition_poly_evaluations(
             &round_1_result.lde_trace,
             round_2_result,
             round_3_result,
-            z,
+            &coeffs.z,
             domain,
             &domain.trace_primitive_root,
-            &gammas,
-            &trace_term_coeffs,
+            &coeffs.gammas,
+            &coeffs.trace_term_coeffs,
         );
         #[cfg(feature = "instruments")]
         let other_dur_1 = t_sub.elapsed();
 
-        // DEEP evaluations are already at 2N LDE points — just bit-reverse for FRI.
-        // No iFFT+FFT extension needed (Plonky3-style direct LDE computation).
-        let domain_size = domain.lde_roots_of_unity_coset.len();
+        // DEEP evaluations are already at 2N LDE points; bit-reverse
+        // to make them FRI-input compatible without an FFT extension.
         #[cfg(feature = "instruments")]
         let t_sub = Instant::now();
-        let mut lde_evals = deep_evals;
-        in_place_bit_reverse_permute(&mut lde_evals);
+        let mut deep_lde = deep_evals;
+        in_place_bit_reverse_permute(&mut deep_lde);
         #[cfg(feature = "instruments")]
         let r4_fft_dur = t_sub.elapsed();
 
-        // FRI commit phase from pre-computed evaluations
         #[cfg(feature = "instruments")]
-        let t_sub = Instant::now();
-        let (fri_last_value, fri_layers) =
-            fri::commit_phase_from_evaluations::<Field, FieldExtension>(
-                domain.root_order as usize,
-                lde_evals,
-                transcript,
-                &coset_offset,
-                domain_size,
-            );
-        #[cfg(feature = "instruments")]
-        let r4_merkle_dur = t_sub.elapsed();
-
-        // grinding: generate nonce and append it to the transcript
-        #[cfg(feature = "instruments")]
-        let t_sub = Instant::now();
-        let security_bits = air.context().proof_options.grinding_factor;
-        let mut nonce = None;
-        if security_bits > 0 {
-            let nonce_value = grinding::generate_nonce(&transcript.state(), security_bits)
-                .expect("nonce not found");
-            transcript.append_bytes(&nonce_value.to_be_bytes());
-            nonce = Some(nonce_value);
-        }
-
-        let number_of_queries = air.options().fri_number_of_queries;
-        let iotas = Self::sample_query_indexes(number_of_queries, domain, transcript);
-
-        let query_list = fri::query_phase(&fri_layers, &iotas);
-
-        let fri_layers_merkle_roots: Vec<_> = fri_layers
-            .iter()
-            .map(|layer| layer.merkle_tree.root)
-            .collect();
-
-        let deep_poly_openings =
-            Self::open_deep_composition_poly(domain, round_1_result, round_2_result, &iotas);
-
-        #[cfg(feature = "instruments")]
-        {
-            let queries_dur = t_sub.elapsed();
-            crate::instruments::store_r4_sub(r4_fft_dur, r4_merkle_dur, other_dur_1, queries_dur);
-        }
+        crate::instruments::store_r4_sub(
+            r4_fft_dur,
+            std::time::Duration::ZERO,
+            other_dur_1,
+            std::time::Duration::ZERO,
+        );
 
-        Round4 {
-            fri_last_value,
-            fri_layers_merkle_roots,
-            deep_poly_openings,
-            query_list,
-            nonce,
-        }
+        deep_lde
     }
 
     fn sample_query_indexes(
@@ -2614,6 +2594,12 @@ pub trait IsStarkProver<
             }
         }
 
+        // Capture pre-fork shared transcript state. Phase D's per-chunk
+        // bucket seed clones this and replays chunk-local data
+        // (table_contributions, comp root, all chunk-mate OODs) canonically
+        // so every bucket-mate reaches the same δ_fri / iotas state.
+        let pre_fork_transcript = transcript.clone();
+
         // Pre-fork all transcripts (cheap, sequential — must match verifier ordering).
         // Happens AFTER all per-chunk aux MMCS roots have been absorbed.
         let mut table_transcripts: Vec<_> = (0..num_airs)
@@ -2689,6 +2675,8 @@ pub trait IsStarkProver<
             (0..num_airs).map(|_| None).collect();
         let mut comp_mmcs_roots_per_chunk: Vec<Option<Commitment>> = Vec::new();
         let mut comp_mmcs_specs_per_chunk: Vec<Vec<(MatrixTag, usize)>> = Vec::new();
+        let mut fri_chunk_buckets_per_chunk: Vec<Vec<crate::proof::stark::ChunkBucketFri<FieldExtension>>> =
+            Vec::new();
         let mut lde_drain = cached_ldes.into_iter();
         for chunk_start in (0..num_airs).step_by(k) {
             let chunk_end = (chunk_start + k).min(num_airs);
@@ -2815,68 +2803,279 @@ pub trait IsStarkProver<
                 });
             }
 
-            // Phase R2b → R4 (sequential within chunk): each fork has
-            // the chunk comp root absorbed; sample z, run R3 OOD + R4
-            // FRI. Same rationale as R2a above.
+            // Phase R2b → R3.5 (parallel within chunk via rayon): each
+            // fork already saw the chunk comp root; sample z, compute +
+            // absorb own OOD, sample γ + DEEP coeffs. The heavy DEEP LDE
+            // computation is deferred to the bucket loop below. Rayon
+            // restores per-chunk-mate parallelism that c22fca9e had
+            // serialised — safe because each fork owns its mutable
+            // transcript slot and the rest of the captured state is
+            // read-only and Sync.
+            let chunk_airs: Vec<&dyn AIR<
+                Field = Field,
+                FieldExtension = FieldExtension,
+                PublicInputs = PI,
+            >> = (chunk_start..chunk_end)
+                .map(|i| air_trace_pairs[i].0)
+                .collect();
+            #[cfg(feature = "instruments")]
+            let chunk_air_names: Vec<String> =
+                chunk_airs.iter().map(|a| a.name().to_string()).collect();
+            #[cfg(feature = "instruments")]
+            let chunk_num_rows: Vec<usize> = (chunk_start..chunk_end)
+                .map(|i| air_trace_pairs[i].1.num_rows())
+                .collect();
+
             let chunk_transcripts = &mut table_transcripts[chunk_start..chunk_end];
-            let r2b_iter = chunk_round1
-                .iter()
-                .zip(chunk_round2.iter())
-                .zip(chunk_transcripts.iter_mut())
-                .enumerate();
 
-            let chunk_results: Vec<Result<_, ProvingError>> = r2b_iter
-                .map(|(j, ((round_1_result, round_2_result), table_transcript))| {
-                    let idx = chunk_start + j;
-                    let (air, trace, pub_inputs) = &air_trace_pairs[idx];
-                    let _ = trace; // used by instruments
-                    let domain = &domains[idx];
+            #[cfg(feature = "parallel")]
+            let r2b_iter = chunk_transcripts.par_iter_mut().enumerate();
+            #[cfg(not(feature = "parallel"))]
+            let r2b_iter = chunk_transcripts.iter_mut().enumerate();
 
-                    #[cfg(feature = "instruments")]
+            #[cfg(feature = "instruments")]
+            #[allow(clippy::type_complexity)]
+            let r2b_results: Vec<Result<(usize, Round3<FieldExtension>, DeepCoeffs<FieldExtension>, (String, usize, std::time::Duration, crate::instruments::TableSubOps)), ProvingError>> = r2b_iter
+                .map(|(j, table_transcript)| {
+                    let air = chunk_airs[j];
+                    let round_1_result = &chunk_round1[j];
+                    let round_2_result = &chunk_round2[j];
+                    let domain = &domains[chunk_start + j];
                     let table_start = Instant::now();
-
-                    let proof = Self::prove_rounds_2b_to_4(
-                        *air,
-                        *pub_inputs,
+                    let (round_3, deep_coeffs) = Self::prove_rounds_2b_to_3_5(
+                        air,
                         round_1_result,
                         round_2_result,
                         table_transcript,
                         domain,
                     )?;
+                    let sub_ops = crate::instruments::take_round_sub_ops().unwrap_or_default();
+                    let timing = (
+                        chunk_air_names[j].clone(),
+                        chunk_num_rows[j],
+                        table_start.elapsed(),
+                        sub_ops,
+                    );
+                    Ok((j, round_3, deep_coeffs, timing))
+                })
+                .collect();
 
-                    #[cfg(feature = "instruments")]
-                    let table_timing = {
-                        let sub_ops = crate::instruments::take_round_sub_ops().unwrap_or_default();
-                        (
-                            air.name().to_string(),
-                            trace.num_rows(),
-                            table_start.elapsed(),
-                            sub_ops,
-                        )
-                    };
-
-                    #[cfg(feature = "instruments")]
-                    return Ok((j, proof, table_timing));
-                    #[cfg(not(feature = "instruments"))]
-                    Ok((j, proof))
+            #[cfg(not(feature = "instruments"))]
+            #[allow(clippy::type_complexity)]
+            let r2b_results: Vec<Result<(usize, Round3<FieldExtension>, DeepCoeffs<FieldExtension>), ProvingError>> = r2b_iter
+                .map(|(j, table_transcript)| {
+                    let air = chunk_airs[j];
+                    let round_1_result = &chunk_round1[j];
+                    let round_2_result = &chunk_round2[j];
+                    let domain = &domains[chunk_start + j];
+                    let (round_3, deep_coeffs) = Self::prove_rounds_2b_to_3_5(
+                        air,
+                        round_1_result,
+                        round_2_result,
+                        table_transcript,
+                        domain,
+                    )?;
+                    Ok((j, round_3, deep_coeffs))
                 })
                 .collect();
 
-            for result in chunk_results {
+            // Collect R2b results in chunk-local-index order.
+            let mut chunk_round3: Vec<Option<Round3<FieldExtension>>> =
+                (0..chunk_size).map(|_| None).collect();
+            let mut chunk_deep_coeffs: Vec<Option<DeepCoeffs<FieldExtension>>> =
+                (0..chunk_size).map(|_| None).collect();
+            for r in r2b_results {
                 #[cfg(feature = "instruments")]
                 {
-                    let (j, proof, timing) = result?;
-                    let idx = chunk_start + j;
-                    proofs[idx] = Some(proof);
+                    let (j, r3, dc, timing) = r?;
+                    chunk_round3[j] = Some(r3);
+                    chunk_deep_coeffs[j] = Some(dc);
                     table_timings.push(timing);
                 }
                 #[cfg(not(feature = "instruments"))]
                 {
-                    let (j, proof) = result?;
+                    let (j, r3, dc) = r?;
+                    chunk_round3[j] = Some(r3);
+                    chunk_deep_coeffs[j] = Some(dc);
+                }
+            }
+            let chunk_round3: Vec<Round3<FieldExtension>> = chunk_round3
+                .into_iter()
+                .map(|r| r.expect("R3 populated for every chunk-mate"))
+                .collect();
+            let chunk_deep_coeffs: Vec<DeepCoeffs<FieldExtension>> = chunk_deep_coeffs
+                .into_iter()
+                .map(|d| d.expect("DEEP coeffs populated for every chunk-mate"))
+                .collect();
+
+            // Chunk join 2: bucket-shared transcript built by canonical replay
+            // of chunk-local data on the pre-fork state. Verifier reconstructs
+            // identical seed from proof data only.
+            let mut bucket_seed = pre_fork_transcript.clone();
+            for j in 0..chunk_size {
+                if let Some(ref bpi) = chunk_round1[j].bus_public_inputs {
+                    bucket_seed.append_field_element(&bpi.table_contribution);
+                }
+            }
+            if let Some(ref root) = comp_mmcs_roots_per_chunk[chunk_idx] {
+                bucket_seed.append_bytes(root);
+            }
+            for j in 0..chunk_size {
+                let round_3 = &chunk_round3[j];
+                for col in round_3.trace_ood_evaluations.columns().iter() {
+                    for elem in col.iter() {
+                        bucket_seed.append_field_element(elem);
+                    }
+                }
+                for elem in round_3.composition_poly_parts_ood_evaluation.iter() {
+                    bucket_seed.append_field_element(elem);
+                }
+            }
+
+            // Bucket by lde_size (first-encounter order).
+            let mut bucket_indices: Vec<Vec<usize>> = Vec::new();
+            let mut bucket_lde_sizes: Vec<usize> = Vec::new();
+            for j in 0..chunk_size {
+                let sz = chunk_deep_coeffs[j].lde_size;
+                match bucket_lde_sizes.iter().position(|&s| s == sz) {
+                    Some(b) => bucket_indices[b].push(j),
+                    None => {
+                        bucket_lde_sizes.push(sz);
+                        bucket_indices.push(vec![j]);
+                    }
+                }
+            }
+
+            let mut chunk_buckets: Vec<crate::proof::stark::ChunkBucketFri<FieldExtension>> =
+                Vec::with_capacity(bucket_indices.len());
+            let mut bucket_iotas_per_bucket: Vec<Vec<usize>> =
+                Vec::with_capacity(bucket_indices.len());
+
+            for (members, &lde_size) in bucket_indices.iter().zip(bucket_lde_sizes.iter()) {
+                let mut bt = bucket_seed.clone();
+                bt.append_bytes(&(lde_size as u64).to_le_bytes());
+                let delta_fri: FieldElement<FieldExtension> = bt.sample_field_element();
+
+                let leader_idx = chunk_start + members[0];
+                let (leader_air, _, _) = &air_trace_pairs[leader_idx];
+                let leader_domain = &domains[leader_idx];
+                let coset_offset =
+                    FieldElement::<Field>::from(leader_air.context().proof_options.coset_offset);
+
+                // Streaming bucket combine: build each member's DEEP LDE
+                // one at a time, fold into the bucket accumulator with
+                // δ_fri^i, then drop. Peak DEEP memory inside this loop:
+                // 2 × |LDE| (current member + accumulator).
+                let mut combined: Vec<FieldElement<FieldExtension>> =
+                    vec![FieldElement::<FieldExtension>::zero(); lde_size];
+                let mut delta_power = FieldElement::<FieldExtension>::one();
+                for (i_local, &j) in members.iter().enumerate() {
                     let idx = chunk_start + j;
-                    proofs[idx] = Some(proof);
+                    let domain_j = &domains[idx];
+                    let round_1_j = &chunk_round1[j];
+                    let round_2_j = &chunk_round2[j];
+                    let round_3_j = &chunk_round3[j];
+                    let coeffs_j = &chunk_deep_coeffs[j];
+                    let deep_lde_j = Self::compute_deep_lde_with_coeffs(
+                        domain_j,
+                        round_1_j,
+                        round_2_j,
+                        round_3_j,
+                        coeffs_j,
+                    );
+                    debug_assert_eq!(deep_lde_j.len(), lde_size);
+                    if i_local == 0 {
+                        // First member: avoid the multiply-by-one in the
+                        // common singleton-bucket case.
+                        combined = deep_lde_j;
+                    } else {
+                        for (acc, src) in combined.iter_mut().zip(deep_lde_j.iter()) {
+                            *acc = &*acc + &delta_power * src;
+                        }
+                    }
+                    delta_power = &delta_power * &delta_fri;
+                }
+
+                let (last_value, fri_layers) =
+                    fri::commit_phase_from_evaluations::<Field, FieldExtension>(
+                        leader_domain.root_order as usize,
+                        combined,
+                        &mut bt,
+                        &coset_offset,
+                        lde_size,
+                    );
+
+                let security_bits = leader_air.context().proof_options.grinding_factor;
+                let nonce = if security_bits > 0 {
+                    let nonce_value = grinding::generate_nonce(&bt.state(), security_bits)
+                        .expect("bucket-FRI grinding nonce not found");
+                    bt.append_bytes(&nonce_value.to_be_bytes());
+                    Some(nonce_value)
+                } else {
+                    None
+                };
+
+                let number_of_queries = leader_air.options().fri_number_of_queries;
+                let iotas = Self::sample_query_indexes(number_of_queries, leader_domain, &mut bt);
+                let decommitments = fri::query_phase(&fri_layers, &iotas);
+                let layer_roots: Vec<Commitment> = fri_layers
+                    .iter()
+                    .map(|layer| layer.merkle_tree.root)
+                    .collect();
+                let member_tags: Vec<MatrixTag> = members
+                    .iter()
+                    .map(|&j| main_tags[chunk_start + j])
+                    .collect();
+
+                chunk_buckets.push(crate::proof::stark::ChunkBucketFri {
+                    lde_size: lde_size as u32,
+                    members: member_tags,
+                    layer_roots,
+                    last_value,
+                    decommitments,
+                    nonce,
+                });
+                bucket_iotas_per_bucket.push(iotas);
+            }
+            fri_chunk_buckets_per_chunk.push(chunk_buckets);
+
+            let mut member_bucket_idx: Vec<usize> = vec![0; chunk_size];
+            for (b, members) in bucket_indices.iter().enumerate() {
+                for &j in members.iter() {
+                    member_bucket_idx[j] = b;
                 }
             }
+
+            // Per chunk-mate: open at bucket-shared iotas + assemble StarkProof.
+            for j in 0..chunk_size {
+                let idx = chunk_start + j;
+                let (_, _, pub_inputs) = &air_trace_pairs[idx];
+                let domain = &domains[idx];
+                let round_1_result = &chunk_round1[j];
+                let round_2_result = &chunk_round2[j];
+                let bucket_idx = member_bucket_idx[j];
+                let iotas = &bucket_iotas_per_bucket[bucket_idx];
+                let deep_poly_openings = Self::open_deep_composition_poly(
+                    domain,
+                    round_1_result,
+                    round_2_result,
+                    iotas,
+                );
+                let round_3 = &chunk_round3[j];
+                let proof = StarkProof {
+                    lde_trace_main_merkle_root: round_1_result.main.main_tree_root(),
+                    lde_trace_precomputed_merkle_root: round_1_result.main.precomputed_root(),
+                    trace_ood_evaluations: round_3.trace_ood_evaluations.clone(),
+                    composition_poly_parts_ood_evaluation: round_3
+                        .composition_poly_parts_ood_evaluation
+                        .clone(),
+                    deep_poly_openings,
+                    bus_public_inputs: round_1_result.bus_public_inputs.clone(),
+                    public_inputs: (*pub_inputs).clone(),
+                    trace_length: domain.interpolation_domain_size,
+                };
+                proofs[idx] = Some(proof);
+            }
         }
 
         let proofs: Vec<StarkProof<Field, FieldExtension, PI>> = proofs
@@ -2909,6 +3108,7 @@ pub trait IsStarkProver<
             comp_mmcs_roots: comp_mmcs_roots_per_chunk,
             comp_mmcs_specs: comp_mmcs_specs_per_chunk,
             chunk_size: k as u32,
+            fri_chunk_buckets: fri_chunk_buckets_per_chunk,
         })
     }
 
@@ -3002,19 +3202,26 @@ pub trait IsStarkProver<
         Ok((transition_coefficients, boundary_coefficients, r2a))
     }
 
-    /// Part B of Round 2 onward: assumes the chunk composition MMCS root
-    /// has been absorbed into `transcript` already. Runs the absorb of
-    /// the per-table H_i values, R3 OOD, and R4 FRI + opens, producing
-    /// the final per-table StarkProof.
+    /// Part B of Round 2 through R3.5 (light): assumes the chunk
+    /// composition MMCS root has been absorbed into the per-fork
+    /// `transcript`. Runs z-sample, OOD computation + own-OOD absorb,
+    /// then samples γ + builds the DEEP composition **coefficients**.
+    ///
+    /// Does NOT build the DEEP LDE — that's deferred to
+    /// [`compute_deep_lde_with_coeffs`] which runs inside the bucket
+    /// FRI loop, so DEEP_i is materialised one at a time, folded into
+    /// the bucket accumulator with δ_fri^i, then dropped.
     #[allow(clippy::too_many_arguments)]
-    fn prove_rounds_2b_to_4(
+    fn prove_rounds_2b_to_3_5(
         air: &dyn AIR<Field = Field, FieldExtension = FieldExtension, PublicInputs = PI>,
-        pub_inputs: &PI,
         round_1_result: &Round1<Field, FieldExtension>,
         round_2_result: &Round2<FieldExtension>,
         transcript: &mut impl IsStarkTranscript<FieldExtension, Field>,
         domain: &Domain<Field>,
-    ) -> Result<StarkProof<Field, FieldExtension, PI>, ProvingError>
+    ) -> Result<
+        (Round3<FieldExtension>, DeepCoeffs<FieldExtension>),
+        ProvingError,
+    >
     where
         FieldElement<Field>: AsBytes,
         FieldElement<FieldExtension>: AsBytes,
@@ -3054,15 +3261,15 @@ pub trait IsStarkProver<
         }
 
         // ===================================
-        // ==========|   Round 4   |==========
+        // ==========|  Round 3.5  |==========
         // ===================================
-        let round_4_result = Self::round_4_compute_and_run_fri_on_the_deep_composition_polynomial(
+        // Sample γ + build DEEP coefficients (lightweight). The DEEP LDE
+        // itself is computed later inside the bucket FRI loop.
+        let deep_coeffs = Self::sample_deep_coeffs(
             air,
             domain,
-            round_1_result,
             round_2_result,
-            &round_3_result,
-            &z,
+            z,
             transcript,
         );
 
@@ -3071,37 +3278,21 @@ pub trait IsStarkProver<
             let zero = std::time::Duration::ZERO;
             let (r2_constraints, r2_fft, r2_merkle) =
                 crate::instruments::take_r2_sub().unwrap_or((zero, zero, zero));
-            let (r4_fft, r4_merkle, r4_deep_comp, r4_queries) =
-                crate::instruments::take_r4_sub().unwrap_or((zero, zero, zero, zero));
             crate::instruments::store_round_sub_ops(crate::instruments::TableSubOps {
                 constraints: r2_constraints,
                 comp_decompose: r2_fft,
                 comp_commit: r2_merkle,
                 ood: round_3_dur,
-                deep_comp: r4_deep_comp,
-                deep_extend: r4_fft,
-                fri_commit: r4_merkle,
-                queries: r4_queries,
+                deep_comp: zero,
+                deep_extend: zero,
+                fri_commit: zero,
+                queries: zero,
             });
         }
 
         info!("End proof generation");
 
-        Ok(StarkProof {
-            lde_trace_main_merkle_root: round_1_result.main.main_tree_root(),
-            lde_trace_precomputed_merkle_root: round_1_result.main.precomputed_root(),
-            trace_ood_evaluations: round_3_result.trace_ood_evaluations,
-            composition_poly_parts_ood_evaluation: round_3_result
-                .composition_poly_parts_ood_evaluation,
-            fri_layers_merkle_roots: round_4_result.fri_layers_merkle_roots,
-            fri_last_value: round_4_result.fri_last_value,
-            query_list: round_4_result.query_list,
-            deep_poly_openings: round_4_result.deep_poly_openings,
-            nonce: round_4_result.nonce,
-            bus_public_inputs: round_1_result.bus_public_inputs.clone(),
-            public_inputs: pub_inputs.clone(),
-            trace_length: domain.interpolation_domain_size,
-        })
+        Ok((round_3_result, deep_coeffs))
     }
 }
 
diff --git a/crypto/stark/src/tests/bucket_fri_soundness_tests.rs b/crypto/stark/src/tests/bucket_fri_soundness_tests.rs
new file mode 100644
index 000000000..10db5169f
--- /dev/null
+++ b/crypto/stark/src/tests/bucket_fri_soundness_tests.rs
@@ -0,0 +1,168 @@
+//! Phase D — per-(chunk, lde_size) batched FRI soundness tests.
+//!
+//! Every test starts from a baseline-valid multi-proof, then tampers
+//! with a single field on the bucket-FRI path inside `MultiProof::
+//! fri_chunk_buckets` and asserts the verifier rejects. Pre-existing
+//! main / aux / composition MMCS path soundness is covered by
+//! `mmcs_soundness_tests`, `mmcs_aux_soundness_tests`, and the
+//! composition tests inside `mmcs_soundness_tests`.
+
+use crypto::fiat_shamir::default_transcript::DefaultTranscript;
+use math::field::{element::FieldElement, goldilocks::GoldilocksField};
+
+use crate::examples::{
+    bit_flags::{self, BitFlagsAIR},
+    dummy_air::{self, DummyAIR},
+};
+use crate::proof::options::ProofOptions;
+use crate::proof::stark::MultiProof;
+use crate::test_utils::{multi_prove_ram, multi_verify_ram};
+use crate::traits::AIR;
+
+type F = GoldilocksField;
+
+#[allow(clippy::type_complexity)]
+fn baseline_proof() -> (DummyAIR, BitFlagsAIR, MultiProof<F, F, ()>) {
+    let proof_options = ProofOptions::default_test_options();
+    let air_1 = DummyAIR::new(&proof_options);
+    let air_2 = BitFlagsAIR::new(&proof_options);
+    let mut trace_1 = dummy_air::dummy_trace::<F>(16);
+    let mut trace_2 = bit_flags::bit_prefix_flag_trace(32);
+    let air_trace_pairs: Vec<(
+        &dyn AIR<Field = F, FieldExtension = F, PublicInputs = ()>,
+        &mut _,
+        &_,
+    )> = vec![(&air_1, &mut trace_1, &()), (&air_2, &mut trace_2, &())];
+    let proof = multi_prove_ram(air_trace_pairs, &mut DefaultTranscript::<F>::new(&[])).unwrap();
+    (air_1, air_2, proof)
+}
+
+fn verify(
+    airs: &[&dyn AIR<Field = F, FieldExtension = F, PublicInputs = ()>],
+    proof: &MultiProof<F, F, ()>,
+) -> bool {
+    multi_verify_ram(
+        airs,
+        proof,
+        &mut DefaultTranscript::<F>::new(&[]),
+        &FieldElement::zero(),
+    )
+}
+
+/// Locate the first chunk whose `fri_chunk_buckets` is non-empty and the
+/// first bucket inside it. Used by tampering tests to find a real bucket
+/// to mutate.
+fn first_bucket_mut(
+    proof: &mut MultiProof<F, F, ()>,
+) -> (usize, usize) {
+    let chunk_idx = proof
+        .fri_chunk_buckets
+        .iter()
+        .position(|c| !c.is_empty())
+        .expect("baseline has at least one non-empty fri_chunk_buckets entry");
+    (chunk_idx, 0)
+}
+
+#[test_log::test]
+fn baseline_phase_d_proof_verifies() {
+    let (air_1, air_2, proof) = baseline_proof();
+    let airs: Vec<&dyn AIR<Field = F, FieldExtension = F, PublicInputs = ()>> =
+        vec![&air_1, &air_2];
+    assert!(verify(&airs, &proof), "baseline Phase D proof must verify");
+    // Sanity: fri_chunk_buckets is parallel to per-chunk MMCS vecs.
+    assert_eq!(proof.fri_chunk_buckets.len(), proof.main_mmcs_roots.len());
+    // Every populated bucket must have non-empty members + at least one
+    // decommitment per fri query.
+    for chunk in &proof.fri_chunk_buckets {
+        for bucket in chunk {
+            assert!(!bucket.members.is_empty());
+            assert!(!bucket.decommitments.is_empty());
+        }
+    }
+}
+
+#[test_log::test]
+fn tampered_bucket_last_value_rejected() {
+    let (air_1, air_2, mut proof) = baseline_proof();
+    let (ci, bi) = first_bucket_mut(&mut proof);
+    proof.fri_chunk_buckets[ci][bi].last_value =
+        &proof.fri_chunk_buckets[ci][bi].last_value + FieldElement::<F>::one();
+    let airs: Vec<&dyn AIR<Field = F, FieldExtension = F, PublicInputs = ()>> =
+        vec![&air_1, &air_2];
+    assert!(!verify(&airs, &proof));
+}
+
+#[test_log::test]
+fn tampered_bucket_layer_root_rejected() {
+    let (air_1, air_2, mut proof) = baseline_proof();
+    let (ci, bi) = first_bucket_mut(&mut proof);
+    if proof.fri_chunk_buckets[ci][bi].layer_roots.is_empty() {
+        // Trivially-small LDE: no committed FRI layers to tamper with;
+        // tampering last_value above already covers that case.
+        return;
+    }
+    proof.fri_chunk_buckets[ci][bi].layer_roots[0][0] ^= 0xFF;
+    let airs: Vec<&dyn AIR<Field = F, FieldExtension = F, PublicInputs = ()>> =
+        vec![&air_1, &air_2];
+    assert!(!verify(&airs, &proof));
+}
+
+#[test_log::test]
+fn truncated_bucket_decommitments_rejected() {
+    let (air_1, air_2, mut proof) = baseline_proof();
+    let (ci, bi) = first_bucket_mut(&mut proof);
+    assert!(!proof.fri_chunk_buckets[ci][bi].decommitments.is_empty());
+    proof.fri_chunk_buckets[ci][bi].decommitments.pop();
+    let airs: Vec<&dyn AIR<Field = F, FieldExtension = F, PublicInputs = ()>> =
+        vec![&air_1, &air_2];
+    assert!(!verify(&airs, &proof));
+}
+
+#[test_log::test]
+fn missing_chunk_buckets_rejected() {
+    let (air_1, air_2, mut proof) = baseline_proof();
+    // Wipe a chunk's bucket list; verifier checks bucket count matches
+    // the lde-size grouping expected from the AIRs in the chunk.
+    let (ci, _) = first_bucket_mut(&mut proof);
+    proof.fri_chunk_buckets[ci].clear();
+    let airs: Vec<&dyn AIR<Field = F, FieldExtension = F, PublicInputs = ()>> =
+        vec![&air_1, &air_2];
+    assert!(!verify(&airs, &proof));
+}
+
+#[test_log::test]
+fn wrong_bucket_lde_size_rejected() {
+    let (air_1, air_2, mut proof) = baseline_proof();
+    let (ci, bi) = first_bucket_mut(&mut proof);
+    let actual = proof.fri_chunk_buckets[ci][bi].lde_size;
+    // Bump to a different power of two — verifier reconstructs expected
+    // lde_size from per-AIR blowup × trace_length and rejects mismatch.
+    proof.fri_chunk_buckets[ci][bi].lde_size = actual.wrapping_mul(2);
+    let airs: Vec<&dyn AIR<Field = F, FieldExtension = F, PublicInputs = ()>> =
+        vec![&air_1, &air_2];
+    assert!(!verify(&airs, &proof));
+}
+
+#[test_log::test]
+fn swapped_member_order_rejected() {
+    let (air_1, air_2, mut proof) = baseline_proof();
+    // Find a bucket with ≥ 2 members and swap their order. The verifier
+    // requires bucket members in canonical chunk-local-index order so a
+    // tag swap shifts δ_fri^i powers and rejects the combined FRI.
+    let target = proof
+        .fri_chunk_buckets
+        .iter_mut()
+        .enumerate()
+        .find_map(|(ci, c)| c.iter_mut().enumerate().find_map(|(bi, b)| {
+            if b.members.len() >= 2 { Some((ci, bi)) } else { None }
+        }));
+    let Some((ci, bi)) = target else {
+        // Single-table-per-bucket baseline — swap is not applicable; in
+        // practice every chunk-mate becomes its own singleton bucket here.
+        return;
+    };
+    proof.fri_chunk_buckets[ci][bi].members.swap(0, 1);
+    let airs: Vec<&dyn AIR<Field = F, FieldExtension = F, PublicInputs = ()>> =
+        vec![&air_1, &air_2];
+    assert!(!verify(&airs, &proof));
+}
diff --git a/crypto/stark/src/tests/mod.rs b/crypto/stark/src/tests/mod.rs
index b42b2abd9..d0fe0530a 100644
--- a/crypto/stark/src/tests/mod.rs
+++ b/crypto/stark/src/tests/mod.rs
@@ -1,4 +1,5 @@
 pub mod air_tests;
+pub mod bucket_fri_soundness_tests;
 pub mod bus_tests;
 pub mod domain_cache_stats;
 pub mod fri_tests;
diff --git a/crypto/stark/src/verifier.rs b/crypto/stark/src/verifier.rs
index dc614fb85..4069267e7 100644
--- a/crypto/stark/src/verifier.rs
+++ b/crypto/stark/src/verifier.rs
@@ -238,54 +238,18 @@ pub trait IsStarkVerifier<
         composition_poly_claimed_ood_evaluation == composition_poly_ood_evaluation
     }
 
-    /// Reconstructs the Deep composition polynomial evaluations at the challenge indices values using the provided
-    /// openings of the trace polynomials and the composition polynomial parts. It then uses these to verify that the
-    /// FRI decommitments are valid and correspond to the Deep composition polynomial.
-    fn step_3_verify_fri(
+    /// Reconstruct the per-table DEEP composition evaluations `D_i(iota)` and
+    /// `D_i(-iota)` for ONE table at every query index. Used by the
+    /// chunk-bucket FRI verification (Phase D) to combine bucket-mates
+    /// into the polynomial actually committed by FRI.
+    fn reconstruct_d_evaluations_for_table(
         proof: &StarkProof<Field, FieldExtension, PI>,
         domain: &VerifierDomain<Field>,
         challenges: &Challenges<FieldExtension>,
-    ) -> bool
-    where
-        FieldElement<Field>: AsBytes + Sync + Send,
-        FieldElement<FieldExtension>: AsBytes + Sync + Send,
-    {
-        let (deep_poly_evaluations, deep_poly_evaluations_sym) =
-            match Self::reconstruct_deep_composition_poly_evaluations_for_all_queries(
-                challenges, domain, proof,
-            ) {
-                Some(pair) => pair,
-                None => return false,
-            };
-
-        // verify FRI
-        let mut evaluation_point_inverse = challenges
-            .iotas
-            .iter()
-            .map(|iota| Self::query_challenge_to_evaluation_point(*iota, false, domain))
-            .collect::<Vec<FieldElement<Field>>>();
-        // Any zero evaluation point means a malformed query index, reject.
-        if FieldElement::inplace_batch_inverse(&mut evaluation_point_inverse).is_err() {
-            return false;
-        }
-
-        proof
-            .query_list
-            .iter()
-            .zip(&challenges.iotas)
-            .zip(evaluation_point_inverse)
-            .enumerate()
-            .all(|(i, ((proof_s, iota_s), eval))| {
-                Self::verify_query_and_sym_openings(
-                    proof,
-                    &challenges.zetas,
-                    *iota_s,
-                    proof_s,
-                    eval,
-                    &deep_poly_evaluations[i],
-                    &deep_poly_evaluations_sym[i],
-                )
-            })
+    ) -> Option<DeepPolynomialEvaluations<FieldExtension>> {
+        Self::reconstruct_deep_composition_poly_evaluations_for_all_queries(
+            challenges, domain, proof,
+        )
     }
 
     /// Returns the field element element of the domain `domain` corresponding to the given FRI query index challenge `iota`.
@@ -514,16 +478,16 @@ pub trait IsStarkVerifier<
         )
     }
 
-    /// Verify a single FRI query
-    /// `zetas`: the vector of all challenges sent by the verifier to the prover at the commit
-    /// phase to fold polynomials.
-    /// `iota`: the index challenge of this FRI query. This index uniquely determines two elements 𝜐 and -𝜐
-    /// of the evaluation domain of FRI layer 0.
-    /// `evaluation_point_inv`: precomputed value of 𝜐⁻¹.
-    /// `deep_composition_evaluation`: precomputed value of p₀(𝜐), where p₀ is the deep composition polynomial.
-    /// `deep_composition_evaluation_sym`: precomputed value of p₀(-𝜐), where p₀ is the deep composition polynomial.
-    fn verify_query_and_sym_openings(
-        proof: &StarkProof<Field, FieldExtension, PI>,
+    /// Verify a single bucket-FRI query.
+    ///
+    /// `fri_layers_merkle_roots` / `fri_last_value` come from the bucket
+    /// (`ChunkBucketFri`), not from any per-table proof. `deep_composition_*`
+    /// is `D_combined(±iota)` — the linear combination of bucket-mates'
+    /// reconstructed D_i evaluations with successive powers of `delta_fri`.
+    #[allow(clippy::too_many_arguments)]
+    fn verify_bucket_fri_query(
+        fri_layers_merkle_roots: &[Commitment],
+        fri_last_value: &FieldElement<FieldExtension>,
         zetas: &[FieldElement<FieldExtension>],
         iota: usize,
         fri_decommitment: &FriDecommitment<FieldExtension>,
@@ -535,7 +499,6 @@ pub trait IsStarkVerifier<
         FieldElement<Field>: AsBytes + Sync + Send,
         FieldElement<FieldExtension>: AsBytes + Sync + Send,
     {
-        let fri_layers_merkle_roots = &proof.fri_layers_merkle_roots;
         let evaluation_point_vec: Vec<FieldElement<Field>> =
             core::iter::successors(Some(evaluation_point_inv.square()), |evaluation_point| {
                 Some(evaluation_point.square())
@@ -551,16 +514,11 @@ pub trait IsStarkVerifier<
             (p0_eval + p0_eval_sym) + evaluation_point_inv * &zetas[0] * (p0_eval - p0_eval_sym);
         let mut index = iota;
 
-        // Handle case with 0 FRI layers (trace_length <= 2)
-        // In this case, the fold loop below doesn't iterate, so we need to verify
-        // the final value directly here.
+        // 0-layer FRI (trivially small LDE): folded p0 must equal the bucket's last_value.
         if fri_layers_merkle_roots.is_empty() {
-            return v == proof.fri_last_value;
+            return v == *fri_last_value;
         }
 
-        // For each FRI layer, starting from the layer 1: use the proof to verify the validity of values pᵢ(−𝜐^(2ⁱ)) (given by the prover) and
-        // pᵢ(𝜐^(2ⁱ)) (computed on the previous iteration by the verifier). Then use them to obtain pᵢ₊₁(𝜐^(2ⁱ⁺¹)).
-        // Finally, check that the final value coincides with the given by the prover.
         fri_layers_merkle_roots
             .iter()
             .enumerate()
@@ -574,9 +532,6 @@ pub trait IsStarkVerifier<
                     (((i, merkle_root), auth_path_sym), evaluation_sym),
                     evaluation_point_inv,
                 )| {
-                    // Verify opening Open(pᵢ(Dₖ), −𝜐^(2ⁱ)) and Open(pᵢ(Dₖ), 𝜐^(2ⁱ)).
-                    // `v` is pᵢ(𝜐^(2ⁱ)).
-                    // `evaluation_sym` is pᵢ(−𝜐^(2ⁱ)).
                     let openings_ok = Self::verify_fri_layer_openings(
                         merkle_root,
                         auth_path_sym,
@@ -585,19 +540,13 @@ pub trait IsStarkVerifier<
                         index,
                     );
 
-                    // Update `v` with next value pᵢ₊₁(𝜐^(2ⁱ⁺¹)).
                     v = (&v + evaluation_sym) + evaluation_point_inv * &zetas[i + 1] * (&v - evaluation_sym);
-
-                    // Update index for next iteration. The index of the squares in the next layer
-                    // is obtained by halving the current index. This is due to the bit-reverse
-                    // ordering of the elements in the Merkle tree.
                     index >>= 1;
 
                     if i < fri_decommitment.layers_evaluations_sym.len() - 1 {
                         result & openings_ok
                     } else {
-                        // Check that final value is the given by the prover
-                        result & (v == proof.fri_last_value) & openings_ok
+                        result & (v == *fri_last_value) & openings_ok
                     }
                 },
             )
@@ -1037,63 +986,333 @@ pub trait IsStarkVerifier<
         }
 
         // =====================================================================
-        // Rounds 2-4: Forked per table
+        // Rounds 2 → 3.5 per-fork replay + per-chunk bucket FRI (Phase D)
         // =====================================================================
-        // Each table gets an independent transcript fork (cloned from the
-        // shared state after the aux MMCS absorb above, domain-separated by
-        // table index). This matches the prover's forking and makes
-        // per-table verification independent.
+        // Per chunk-mate: build fork, replay through γ + step 2 verify.
+        // Then per chunk: build the bucket-shared transcript, verify each
+        // height bucket's batched FRI, and use the bucket-shared iotas to
+        // authenticate every per-query trace / aux / composition opening.
+
+        let num_tables = airs.len();
+        let pre_fork_transcript = transcript.clone();
+        let mut challenges_per_table: Vec<Option<Challenges<FieldExtension>>> =
+            (0..num_tables).map(|_| None).collect();
 
         for (idx, (air, proof)) in airs.iter().zip(&multi_proof.proofs).enumerate() {
-            // Must match prover: fork with domain separator for multi-table,
-            // use original transcript directly for single-table.
-            let num_tables = airs.len();
             let mut table_transcript = transcript.clone();
             if num_tables > 1 {
                 table_transcript.append_bytes(&(idx as u64).to_le_bytes());
             }
-
-            // Bind table_contribution (L) to transcript, matching prover.
             if let Some(ref bpi) = proof.bus_public_inputs {
                 table_transcript.append_field_element(&bpi.table_contribution);
             }
 
-            // Per-chunk lookup: each table's main / aux / comp MMCS
-            // root + spec come from its chunk.
             let table_chunk_idx = idx / chunk_size;
-            let main_root_for_chunk =
-                multi_proof.main_mmcs_roots[table_chunk_idx].as_ref();
-            let main_spec_for_chunk: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)] =
-                &multi_proof.main_mmcs_specs[table_chunk_idx];
-            let aux_root_for_chunk = multi_proof.aux_mmcs_roots[table_chunk_idx].as_ref();
-            let aux_spec_for_chunk: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)] =
-                &multi_proof.aux_mmcs_specs[table_chunk_idx];
             let comp_root_for_chunk =
                 multi_proof.comp_mmcs_roots[table_chunk_idx].as_ref();
-            let comp_spec_for_chunk: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)] =
-                &multi_proof.comp_mmcs_specs[table_chunk_idx];
 
-            if !Self::verify_rounds_2_to_4(
+            let chal = match Self::replay_and_verify_step_2(
                 *air,
                 proof,
                 &mut table_transcript,
                 lookup_challenges.clone(),
-                main_tags[idx],
-                main_root_for_chunk,
-                main_spec_for_chunk,
-                aux_root_for_chunk,
-                aux_spec_for_chunk,
                 comp_root_for_chunk,
-                comp_spec_for_chunk,
             ) {
+                Some(c) => c,
+                None => {
+                    error!(
+                        "Table {} failed replay_and_verify_step_2 (num_constraints={}, trace_cols={})",
+                        idx,
+                        air.context().num_transition_constraints,
+                        air.context().trace_columns
+                    );
+                    return false;
+                }
+            };
+            challenges_per_table[idx] = Some(chal);
+        }
+
+        // Per-chunk: build bucket_seed (canonical replay on pre-fork state),
+        // validate fri_chunk_buckets[chunk_idx] structure, verify each
+        // bucket's batched FRI, then per chunk-mate verify step 4.
+        if multi_proof.fri_chunk_buckets.len() != expected_num_chunks {
+            error!(
+                "fri_chunk_buckets outer length {} != expected_num_chunks {}",
+                multi_proof.fri_chunk_buckets.len(),
+                expected_num_chunks,
+            );
+            return false;
+        }
+
+        for chunk_idx in 0..expected_num_chunks {
+            let chunk_start = chunk_idx * chunk_size;
+            let chunk_end = (chunk_start + chunk_size).min(num_tables);
+
+            // bucket_seed: clone pre-fork shared state + canonical replay.
+            let mut bucket_seed = pre_fork_transcript.clone();
+            for idx in chunk_start..chunk_end {
+                if let Some(ref bpi) = multi_proof.proofs[idx].bus_public_inputs {
+                    bucket_seed.append_field_element(&bpi.table_contribution);
+                }
+            }
+            if let Some(ref root) = multi_proof.comp_mmcs_roots[chunk_idx] {
+                bucket_seed.append_bytes(root);
+            }
+            for idx in chunk_start..chunk_end {
+                let p = &multi_proof.proofs[idx];
+                for col in p.trace_ood_evaluations.columns().iter() {
+                    for elem in col.iter() {
+                        bucket_seed.append_field_element(elem);
+                    }
+                }
+                for elem in p.composition_poly_parts_ood_evaluation.iter() {
+                    bucket_seed.append_field_element(elem);
+                }
+            }
+
+            // Expected bucketing: first-encounter order by lde_size.
+            let mut expected_bucket_indices: Vec<Vec<usize>> = Vec::new();
+            let mut expected_bucket_lde_sizes: Vec<usize> = Vec::new();
+            for j in 0..(chunk_end - chunk_start) {
+                let idx = chunk_start + j;
+                let lde_size = multi_proof.proofs[idx].trace_length
+                    * airs[idx].options().blowup_factor as usize;
+                match expected_bucket_lde_sizes.iter().position(|&s| s == lde_size) {
+                    Some(b) => expected_bucket_indices[b].push(j),
+                    None => {
+                        expected_bucket_lde_sizes.push(lde_size);
+                        expected_bucket_indices.push(vec![j]);
+                    }
+                }
+            }
+
+            let chunk_buckets = &multi_proof.fri_chunk_buckets[chunk_idx];
+            if chunk_buckets.len() != expected_bucket_indices.len() {
                 error!(
-                    "Table {} failed verify_rounds_2_to_4 (num_constraints={}, trace_cols={})",
-                    idx,
-                    air.context().num_transition_constraints,
-                    air.context().trace_columns
+                    "chunk {chunk_idx}: bucket count {} != expected {}",
+                    chunk_buckets.len(),
+                    expected_bucket_indices.len(),
                 );
                 return false;
             }
+
+            // map chunk-local-index → bucket index (for step 4 dispatch).
+            let mut member_bucket_idx: Vec<usize> = vec![0; chunk_end - chunk_start];
+            // Cache bucket iotas: derived once during FRI verification,
+            // reused in step 4 without re-cloning the bucket transcript.
+            let mut bucket_iotas_cache: Vec<Vec<usize>> =
+                Vec::with_capacity(chunk_buckets.len());
+
+            for (b, bucket) in chunk_buckets.iter().enumerate() {
+                let expected_members = &expected_bucket_indices[b];
+                let expected_lde_size = expected_bucket_lde_sizes[b];
+                if bucket.lde_size as usize != expected_lde_size {
+                    error!(
+                        "chunk {chunk_idx} bucket {b}: lde_size {} != expected {}",
+                        bucket.lde_size, expected_lde_size,
+                    );
+                    return false;
+                }
+                if bucket.members.len() != expected_members.len() {
+                    error!(
+                        "chunk {chunk_idx} bucket {b}: members.len {} != expected {}",
+                        bucket.members.len(),
+                        expected_members.len(),
+                    );
+                    return false;
+                }
+                for (mi, &j) in expected_members.iter().enumerate() {
+                    let expected_tag = main_tags[chunk_start + j];
+                    if bucket.members[mi] != expected_tag {
+                        error!(
+                            "chunk {chunk_idx} bucket {b} member {mi}: tag mismatch",
+                        );
+                        return false;
+                    }
+                    member_bucket_idx[j] = b;
+                }
+
+                // Verify the bucket FRI: replay layer-root absorbs, sample
+                // zetas, absorb last_value, grinding, sample iotas, and run
+                // per-iota combined-D fold check.
+                let leader_idx = chunk_start + expected_members[0];
+                let leader_air = airs[leader_idx];
+                let leader_domain =
+                    new_verifier_domain(leader_air, multi_proof.proofs[leader_idx].trace_length);
+
+                let mut bt = bucket_seed.clone();
+                bt.append_bytes(&(bucket.lde_size as u64).to_le_bytes());
+                let delta_fri: FieldElement<FieldExtension> = bt.sample_field_element();
+
+                let mut zetas: Vec<FieldElement<FieldExtension>> =
+                    Vec::with_capacity(bucket.layer_roots.len() + 1);
+                for root in &bucket.layer_roots {
+                    let z = bt.sample_field_element();
+                    bt.append_bytes(root);
+                    zetas.push(z);
+                }
+                zetas.push(bt.sample_field_element());
+                bt.append_field_element(&bucket.last_value);
+
+                let security_bits = leader_air.context().proof_options.grinding_factor;
+                if security_bits > 0 {
+                    let nonce = match bucket.nonce {
+                        Some(n) => n,
+                        None => {
+                            error!(
+                                "chunk {chunk_idx} bucket {b}: grinding required but nonce missing",
+                            );
+                            return false;
+                        }
+                    };
+                    let grinding_seed = bt.state();
+                    if !grinding::is_valid_nonce(&grinding_seed, nonce, security_bits) {
+                        #[cfg(not(feature = "test_fiat_shamir"))]
+                        error!("chunk {chunk_idx} bucket {b}: grinding factor not satisfied");
+                        return false;
+                    }
+                    bt.append_bytes(&nonce.to_be_bytes());
+                } else if bucket.nonce.is_some() {
+                    error!(
+                        "chunk {chunk_idx} bucket {b}: nonce present but grinding disabled",
+                    );
+                    return false;
+                }
+
+                let number_of_queries = leader_air.options().fri_number_of_queries;
+                let iotas =
+                    Self::sample_query_indexes(number_of_queries, &leader_domain, &mut bt);
+
+                if bucket.decommitments.len() != iotas.len() {
+                    error!(
+                        "chunk {chunk_idx} bucket {b}: decommitments {} != iotas {}",
+                        bucket.decommitments.len(),
+                        iotas.len(),
+                    );
+                    return false;
+                }
+
+                // Reconstruct per-bucket-mate D_i(iota±) for every iota.
+                let mut per_member_d: Vec<DeepPolynomialEvaluations<FieldExtension>> =
+                    Vec::with_capacity(expected_members.len());
+                for &j in expected_members.iter() {
+                    let idx = chunk_start + j;
+                    let chal = challenges_per_table[idx]
+                        .as_ref()
+                        .expect("step-2 succeeded → challenges populated");
+                    // Replace the challenge's empty iotas with bucket iotas.
+                    let chal_with_iotas = Challenges {
+                        z: chal.z.clone(),
+                        boundary_coeffs: chal.boundary_coeffs.clone(),
+                        transition_coeffs: chal.transition_coeffs.clone(),
+                        trace_term_coeffs: chal.trace_term_coeffs.clone(),
+                        gammas: chal.gammas.clone(),
+                        zetas: zetas.clone(),
+                        iotas: iotas.clone(),
+                        rap_challenges: chal.rap_challenges.clone(),
+                        grinding_seed: [0u8; 32],
+                    };
+                    let member_domain =
+                        new_verifier_domain(airs[idx], multi_proof.proofs[idx].trace_length);
+                    let pair = match Self::reconstruct_d_evaluations_for_table(
+                        &multi_proof.proofs[idx],
+                        &member_domain,
+                        &chal_with_iotas,
+                    ) {
+                        Some(pair) => pair,
+                        None => {
+                            error!(
+                                "chunk {chunk_idx} bucket {b} member {j}: D reconstruction failed",
+                            );
+                            return false;
+                        }
+                    };
+                    // chal_with_iotas only needed inside the call.
+                    let _ = chal_with_iotas;
+                    per_member_d.push(pair);
+                }
+
+                // Per-iota: combine D_i with successive powers of δ_fri,
+                // verify FRI fold authenticates and reaches bucket.last_value.
+                let mut evaluation_point_inv = iotas
+                    .iter()
+                    .map(|iota| {
+                        Self::query_challenge_to_evaluation_point(*iota, false, &leader_domain)
+                    })
+                    .collect::<Vec<FieldElement<Field>>>();
+                if FieldElement::inplace_batch_inverse(&mut evaluation_point_inv).is_err() {
+                    error!(
+                        "chunk {chunk_idx} bucket {b}: query evaluation point not invertible",
+                    );
+                    return false;
+                }
+
+                for (q, &iota) in iotas.iter().enumerate() {
+                    let mut d_iota = FieldElement::<FieldExtension>::zero();
+                    let mut d_iota_sym = FieldElement::<FieldExtension>::zero();
+                    let mut coeff = FieldElement::<FieldExtension>::one();
+                    for (i_local, member_d) in per_member_d.iter().enumerate() {
+                        d_iota = d_iota + &coeff * &member_d.0[q];
+                        d_iota_sym = d_iota_sym + &coeff * &member_d.1[q];
+                        if i_local + 1 < per_member_d.len() {
+                            coeff = coeff * &delta_fri;
+                        }
+                    }
+
+                    if !Self::verify_bucket_fri_query(
+                        &bucket.layer_roots,
+                        &bucket.last_value,
+                        &zetas,
+                        iota,
+                        &bucket.decommitments[q],
+                        evaluation_point_inv[q].clone(),
+                        &d_iota,
+                        &d_iota_sym,
+                    ) {
+                        #[cfg(not(feature = "test_fiat_shamir"))]
+                        error!(
+                            "chunk {chunk_idx} bucket {b} query {q}: FRI fold verification failed",
+                        );
+                        return false;
+                    }
+                }
+                bucket_iotas_cache.push(iotas);
+            }
+
+            // Per chunk-mate: step 4 at its bucket's iotas (cached above,
+            // no transcript replay needed).
+            for j in 0..(chunk_end - chunk_start) {
+                let idx = chunk_start + j;
+                let b = member_bucket_idx[j];
+                let iotas = &bucket_iotas_cache[b];
+
+                let proof = &multi_proof.proofs[idx];
+                let main_root = multi_proof.main_mmcs_roots[chunk_idx].as_ref();
+                let main_spec: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)] =
+                    &multi_proof.main_mmcs_specs[chunk_idx];
+                let aux_root = multi_proof.aux_mmcs_roots[chunk_idx].as_ref();
+                let aux_spec: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)] =
+                    &multi_proof.aux_mmcs_specs[chunk_idx];
+                let comp_root = multi_proof.comp_mmcs_roots[chunk_idx].as_ref();
+                let comp_spec: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)] =
+                    &multi_proof.comp_mmcs_specs[chunk_idx];
+
+                if !Self::verify_step_4_at_iotas(
+                    proof,
+                    iotas,
+                    main_tags[idx],
+                    main_root,
+                    main_spec,
+                    aux_root,
+                    aux_spec,
+                    comp_root,
+                    comp_spec,
+                ) {
+                    #[cfg(not(feature = "test_fiat_shamir"))]
+                    error!("Table {idx}: step 4 trace/comp openings failed at bucket iotas");
+                    return false;
+                }
+            }
         }
 
         // =====================================================================
@@ -1227,8 +1446,12 @@ pub trait IsStarkVerifier<
         }
 
         // ===================================
-        // ==========|   Round 4   |==========
+        // ==========|  Round 3.5  |==========
         // ===================================
+        // Sample γ from the per-fork transcript; build the per-table
+        // DEEP composition coefficient layout. The FRI commit + iotas
+        // happen at chunk-bucket level (verified separately) — this
+        // replay stops at γ.
 
         let num_terms_composition_poly = proof.composition_poly_parts_ood_evaluation.len();
         let num_terms_trace =
@@ -1251,50 +1474,19 @@ pub trait IsStarkVerifier<
         // <<<< Receive challenges: 𝛾ⱼ, 𝛾ⱼ'
         let gammas = deep_composition_coefficients;
 
-        // FRI commit phase
-        let merkle_roots = &proof.fri_layers_merkle_roots;
-        let mut zetas = merkle_roots
-            .iter()
-            .map(|root| {
-                // >>>> Send challenge 𝜁ₖ
-                let element = transcript.sample_field_element();
-                // <<<< Receive commitment: [pₖ] (the first one is [p₀])
-                transcript.append_bytes(root);
-                element
-            })
-            .collect::<Vec<FieldElement<FieldExtension>>>();
-
-        // >>>> Send challenge 𝜁ₙ₋₁
-        zetas.push(transcript.sample_field_element());
-
-        // <<<< Receive value: pₙ
-        transcript.append_field_element(&proof.fri_last_value);
-
-        // Receive grinding value
-        let security_bits = air.context().proof_options.grinding_factor;
-        let mut grinding_seed = [0u8; 32];
-        if security_bits > 0
-            && let Some(nonce_value) = proof.nonce
-        {
-            grinding_seed = transcript.state();
-            transcript.append_bytes(&nonce_value.to_be_bytes());
-        }
-
-        // FRI query phase
-        // <<<< Send challenges 𝜄ₛ (iota_s)
-        let number_of_queries = air.options().fri_number_of_queries;
-        let iotas = Self::sample_query_indexes(number_of_queries, domain, transcript);
-
+        // zetas / iotas / grinding_seed are populated by the chunk-bucket
+        // FRI verification step in `multi_verify` (Phase D). The per-fork
+        // transcript ends here.
         Challenges {
             z,
             boundary_coeffs,
             transition_coeffs,
             trace_term_coeffs,
             gammas,
-            zetas,
-            iotas,
+            zetas: Vec::new(),
+            iotas: Vec::new(),
             rap_challenges,
-            grinding_seed,
+            grinding_seed: [0u8; 32],
         }
     }
 
@@ -1302,31 +1494,29 @@ pub trait IsStarkVerifier<
     ///
     /// `main_*` / `aux_*` come from the shared multi-proof and authenticate
     /// the per-table trace openings in step 4.
+    /// Replays per-fork rounds 2 → 3.5 for one table and runs step 2
+    /// (composition-polynomial OOD consistency). Returns the per-fork
+    /// Challenges populated up through γ — `zetas`, `iotas`, and
+    /// `grinding_seed` remain empty and are filled in by the chunk-bucket
+    /// FRI verification (Phase D).
+    ///
+    /// Step 4 (trace openings at iotas) is split into
+    /// [`verify_step_4_at_iotas`] driven by `multi_verify` after the
+    /// bucket FRI sets each chunk-mate's iota list.
     #[allow(clippy::too_many_arguments)]
-    fn verify_rounds_2_to_4(
+    fn replay_and_verify_step_2(
         air: &dyn AIR<Field = Field, FieldExtension = FieldExtension, PublicInputs = PI>,
         proof: &StarkProof<Field, FieldExtension, PI>,
         transcript: &mut impl IsStarkTranscript<FieldExtension, Field>,
         rap_challenges: Vec<FieldElement<FieldExtension>>,
-        main_tag: crypto::merkle_tree::mmcs::MatrixTag,
-        main_mmcs_root: Option<&Commitment>,
-        main_mmcs_spec: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)],
-        aux_mmcs_root: Option<&Commitment>,
-        aux_mmcs_spec: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)],
         comp_mmcs_root: Option<&Commitment>,
-        comp_mmcs_spec: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)],
-    ) -> bool
+    ) -> Option<Challenges<FieldExtension>>
     where
         FieldElement<Field>: AsBytes + Sync + Send + math::traits::ByteConversion,
         FieldElement<FieldExtension>: AsBytes + Sync + Send + math::traits::ByteConversion,
     {
         let domain = new_verifier_domain(air, proof.trace_length);
 
-        // Verify there are enough queries
-        if proof.query_list.len() < air.options().fri_number_of_queries {
-            return false;
-        }
-
         #[cfg(feature = "instruments")]
         println!("- Started step 1: Recover challenges");
         #[cfg(feature = "instruments")]
@@ -1341,19 +1531,8 @@ pub trait IsStarkVerifier<
             comp_mmcs_root,
         );
 
-        // verify grinding
-        let security_bits = air.context().proof_options.grinding_factor;
-        if security_bits > 0 {
-            let nonce_is_valid = proof.nonce.is_some_and(|nonce_value| {
-                grinding::is_valid_nonce(&challenges.grinding_seed, nonce_value, security_bits)
-            });
-
-            if !nonce_is_valid {
-                #[cfg(not(feature = "test_fiat_shamir"))]
-                error!("Grinding factor not satisfied");
-                return false;
-            }
-        }
+        // Grinding + iotas + FRI verification moved to chunk-bucket level
+        // in `multi_verify` (Phase D batched FRI).
 
         #[cfg(feature = "instruments")]
         let elapsed1 = timer1.elapsed();
@@ -1368,7 +1547,7 @@ pub trait IsStarkVerifier<
         if !Self::step_2_verify_claimed_composition_polynomial(air, proof, &domain, &challenges) {
             #[cfg(not(feature = "test_fiat_shamir"))]
             error!("Composition Polynomial verification failed");
-            return false;
+            return None;
         }
 
         #[cfg(feature = "instruments")]
@@ -1380,11 +1559,8 @@ pub trait IsStarkVerifier<
         #[cfg(feature = "instruments")]
         let timer3 = Instant::now();
 
-        if !Self::step_3_verify_fri(proof, &domain, &challenges) {
-            #[cfg(not(feature = "test_fiat_shamir"))]
-            error!("FRI verification failed");
-            return false;
-        }
+        // FRI verification (Phase D) is driven from `multi_verify` per
+        // chunk-bucket. This per-table replay stops here.
 
         #[cfg(feature = "instruments")]
         let elapsed3 = timer3.elapsed();
@@ -1396,41 +1572,57 @@ pub trait IsStarkVerifier<
         #[cfg(feature = "instruments")]
         let timer4 = Instant::now();
 
-        #[allow(clippy::let_and_return)]
-        if !Self::step_4_verify_trace_and_composition_openings(
-            proof,
-            &challenges,
-            main_tag,
-            main_mmcs_root,
-            main_mmcs_spec,
-            aux_mmcs_root,
-            aux_mmcs_spec,
-            comp_mmcs_root,
-            comp_mmcs_spec,
-        ) {
-            #[cfg(not(feature = "test_fiat_shamir"))]
-            error!("DEEP Composition Polynomial verification failed");
-            return false;
-        }
+        // Step 4 (per-iota openings) runs at chunk-bucket level (Phase D).
 
         #[cfg(feature = "instruments")]
-        let elapsed4 = timer4.elapsed();
-        #[cfg(feature = "instruments")]
-        println!("  Time spent: {:?}", elapsed4);
+        let _ = (elapsed1, timer2.elapsed(), timer3.elapsed(), timer4.elapsed());
 
-        #[cfg(feature = "instruments")]
-        {
-            let total_time = elapsed1 + elapsed2 + elapsed3 + elapsed4;
-            println!(
-                " Fraction of verifying time per step: {:.4} {:.4} {:.4} {:.4}",
-                elapsed1.as_nanos() as f64 / total_time.as_nanos() as f64,
-                elapsed2.as_nanos() as f64 / total_time.as_nanos() as f64,
-                elapsed3.as_nanos() as f64 / total_time.as_nanos() as f64,
-                elapsed4.as_nanos() as f64 / total_time.as_nanos() as f64
-            );
-        }
+        Some(challenges)
+    }
 
-        true
+    /// Step 4 for one table at the bucket-shared iotas: authenticate
+    /// every per-query opening against the chunk's main / aux /
+    /// composition MMCS roots.
+    #[allow(clippy::too_many_arguments)]
+    fn verify_step_4_at_iotas(
+        proof: &StarkProof<Field, FieldExtension, PI>,
+        iotas: &[usize],
+        main_tag: crypto::merkle_tree::mmcs::MatrixTag,
+        main_mmcs_root: Option<&Commitment>,
+        main_mmcs_spec: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)],
+        aux_mmcs_root: Option<&Commitment>,
+        aux_mmcs_spec: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)],
+        comp_mmcs_root: Option<&Commitment>,
+        comp_mmcs_spec: &[(crypto::merkle_tree::mmcs::MatrixTag, usize)],
+    ) -> bool
+    where
+        FieldElement<Field>: AsBytes + Sync + Send + math::traits::ByteConversion,
+        FieldElement<FieldExtension>: AsBytes + Sync + Send + math::traits::ByteConversion,
+    {
+        if proof.deep_poly_openings.len() < iotas.len() {
+            return false;
+        }
+        iotas
+            .iter()
+            .zip(proof.deep_poly_openings.iter())
+            .all(|(iota_n, deep_poly_opening)| {
+                Self::verify_composition_poly_opening(
+                    deep_poly_opening,
+                    comp_mmcs_root,
+                    comp_mmcs_spec,
+                    main_tag,
+                    *iota_n,
+                ) && Self::verify_trace_openings(
+                    proof,
+                    deep_poly_opening,
+                    *iota_n,
+                    main_tag,
+                    main_mmcs_root,
+                    main_mmcs_spec,
+                    aux_mmcs_root,
+                    aux_mmcs_spec,
+                )
+            })
     }
 }