diff --git a/parquet-benchmarks/pom.xml b/parquet-benchmarks/pom.xml index d5a288b677..8bf1d7f821 100644 --- a/parquet-benchmarks/pom.xml +++ b/parquet-benchmarks/pom.xml @@ -89,11 +89,50 @@ + + + + non-aarch64 + + + !aarch64 + + + + + jitpack.io + https://jitpack.io + Jitpack.io repository + + + + + com.github.rdblue + brotli-codec + ${brotli-codec.version} + runtime + + + + + org.apache.maven.plugins maven-compiler-plugin + + + + org.openjdk.jmh + jmh-generator-annprocess + ${jmh.version} + + + + org.openjdk.jmh.generators.BenchmarkProcessor + + org.apache.maven.plugins @@ -112,6 +151,12 @@ org.openjdk.jmh.Main + + META-INF/BenchmarkList + + + META-INF/CompilerHints + diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BenchmarkEncodingUtils.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BenchmarkEncodingUtils.java new file mode 100644 index 0000000000..c79dedce28 --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BenchmarkEncodingUtils.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.benchmarks; + +import java.io.IOException; +import org.apache.parquet.column.page.DictionaryPage; +import org.apache.parquet.column.values.dictionary.DictionaryValuesWriter; + +/** + * Shared helpers for encode/decode micro-benchmarks. + */ +final class BenchmarkEncodingUtils { + + private BenchmarkEncodingUtils() {} + + /** + * Container for the two artefacts produced by a dictionary-encoded page: + * the encoded dictionary indices ({@link #dictData}) and the dictionary + * page itself ({@link #dictPage}). The dictionary page may be {@code null} + * if the writer fell back to plain encoding (for example, when the + * dictionary exceeded its configured maximum size). + */ + static final class EncodedDictionary { + final byte[] dictData; + final DictionaryPage dictPage; + + EncodedDictionary(byte[] dictData, DictionaryPage dictPage) { + this.dictData = dictData; + this.dictPage = dictPage; + } + + boolean fellBackToPlain() { + return dictPage == null; + } + } + + /** + * Drains a {@link DictionaryValuesWriter} into an {@link EncodedDictionary}. + * + *

+ * <p>The writer's data bytes (the RLE-encoded indices) and the dictionary
+ * page are returned separately so both pieces can be measured or fed to a
+ * decoder symmetrically. The dictionary page buffer is copied so it remains
+ * valid after the writer's allocator is released.
+ *
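+ * <p>Typical usage, as an illustrative sketch (see {@code BinaryEncodingBenchmark}
+ * for a concrete caller; the writer construction is elided here):
+ * <pre>{@code
+ * DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter writer = ...;
+ * for (Binary value : values) {
+ *   writer.writeBytes(value);
+ * }
+ * EncodedDictionary encoded = BenchmarkEncodingUtils.drainDictionary(writer);
+ * if (!encoded.fellBackToPlain()) {
+ *   Dictionary dict = new PlainValuesDictionary.PlainBinaryDictionary(encoded.dictPage);
+ * }
+ * }</pre>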

The writer is closed via {@code toDictPageAndClose()}; callers must not + * call {@link DictionaryValuesWriter#close()} again afterwards. + */ + static EncodedDictionary drainDictionary(DictionaryValuesWriter writer) throws IOException { + byte[] dictData = writer.getBytes().toByteArray(); + DictionaryPage rawPage = writer.toDictPageAndClose(); + DictionaryPage dictPage = rawPage == null ? null : rawPage.copy(); + return new EncodedDictionary(dictData, dictPage); + } +} diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BinaryEncodingBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BinaryEncodingBenchmark.java new file mode 100644 index 0000000000..e6646458d8 --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BinaryEncodingBenchmark.java @@ -0,0 +1,233 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.benchmarks; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.concurrent.TimeUnit; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.bytes.HeapByteBufferAllocator; +import org.apache.parquet.column.Dictionary; +import org.apache.parquet.column.Encoding; +import org.apache.parquet.column.page.DictionaryPage; +import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.column.values.deltalengthbytearray.DeltaLengthByteArrayValuesReader; +import org.apache.parquet.column.values.deltalengthbytearray.DeltaLengthByteArrayValuesWriter; +import org.apache.parquet.column.values.deltastrings.DeltaByteArrayReader; +import org.apache.parquet.column.values.deltastrings.DeltaByteArrayWriter; +import org.apache.parquet.column.values.dictionary.DictionaryValuesReader; +import org.apache.parquet.column.values.dictionary.DictionaryValuesWriter; +import org.apache.parquet.column.values.dictionary.PlainValuesDictionary; +import org.apache.parquet.column.values.plain.BinaryPlainValuesReader; +import org.apache.parquet.column.values.plain.PlainValuesWriter; +import org.apache.parquet.io.api.Binary; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OperationsPerInvocation; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +/** + * Encoding-level and 
decoding-level micro-benchmarks for BINARY values.
+ * Compares PLAIN, DELTA_BYTE_ARRAY, DELTA_LENGTH_BYTE_ARRAY, and DICTIONARY encodings
+ * across different string lengths and cardinality patterns.
+ *

+ * <p>Each benchmark invocation processes {@value #VALUE_COUNT} values. Throughput is
+ * reported per-value using {@link OperationsPerInvocation}.
+ *
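+ * <p>Decode benchmarks all share one shape, sketched here with the PLAIN reader
+ * (this mirrors {@code decodePlain} below):
+ * <pre>{@code
+ * BinaryPlainValuesReader reader = new BinaryPlainValuesReader();
+ * reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(plainEncoded)));
+ * for (int i = 0; i < VALUE_COUNT; i++) {
+ *   bh.consume(reader.readBytes());
+ * }
+ * }</pre>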

The dictionary encode/decode benchmarks intentionally measure the full path: + * the encoder produces both the RLE-encoded indices and a {@link DictionaryPage}; + * the decoder consumes the indices through a {@link DictionaryValuesReader} backed + * by the same dictionary. If the dictionary exceeds {@link #MAX_DICT_BYTE_SIZE} + * (which can happen for high-cardinality, long-string parameter combinations) the + * writer falls back to plain encoding and dictionary decoding for that combination + * is skipped. + */ +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(TimeUnit.SECONDS) +@Fork(1) +@Warmup(iterations = 3, time = 1) +@Measurement(iterations = 5, time = 1) +@State(Scope.Thread) +public class BinaryEncodingBenchmark { + + static final int VALUE_COUNT = 100_000; + private static final int INIT_SLAB_SIZE = 64 * 1024; + private static final int PAGE_SIZE = 4 * 1024 * 1024; + private static final int MAX_DICT_BYTE_SIZE = 4 * 1024 * 1024; + + @Param({"10", "100", "1000"}) + public int stringLength; + + /** LOW = 100 distinct values; HIGH = all unique. */ + @Param({"LOW", "HIGH"}) + public String cardinality; + + private Binary[] data; + private byte[] plainEncoded; + private byte[] deltaLengthEncoded; + private byte[] deltaStringsEncoded; + private byte[] dictEncoded; + private DictionaryPage dictPage; + private Dictionary binaryDictionary; + private boolean dictionaryAvailable; + + @Setup(Level.Trial) + public void setup() throws IOException { + int distinct = "LOW".equals(cardinality) ? TestDataFactory.LOW_CARDINALITY_DISTINCT : 0; + data = TestDataFactory.generateBinaryData(VALUE_COUNT, stringLength, distinct, TestDataFactory.DEFAULT_SEED); + + // Pre-encode data for decode benchmarks + plainEncoded = encodeBinaryWith(newPlainWriter()); + deltaLengthEncoded = encodeBinaryWith(newDeltaLengthWriter()); + deltaStringsEncoded = encodeBinaryWith(newDeltaStringsWriter()); + + DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter dictWriter = newDictWriter(); + for (Binary v : data) { + dictWriter.writeBytes(v); + } + BenchmarkEncodingUtils.EncodedDictionary encoded = BenchmarkEncodingUtils.drainDictionary(dictWriter); + dictEncoded = encoded.dictData; + dictPage = encoded.dictPage; + dictionaryAvailable = !encoded.fellBackToPlain(); + if (dictionaryAvailable) { + binaryDictionary = new PlainValuesDictionary.PlainBinaryDictionary(dictPage); + } + } + + private byte[] encodeBinaryWith(ValuesWriter writer) throws IOException { + for (Binary v : data) { + writer.writeBytes(v); + } + byte[] bytes = writer.getBytes().toByteArray(); + writer.close(); + return bytes; + } + + private BenchmarkEncodingUtils.EncodedDictionary encodeDictionaryWith( + DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter writer) throws IOException { + for (Binary v : data) { + writer.writeBytes(v); + } + return BenchmarkEncodingUtils.drainDictionary(writer); + } + + // ---- Writer factories ---- + + private static PlainValuesWriter newPlainWriter() { + return new PlainValuesWriter(INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + } + + private static DeltaLengthByteArrayValuesWriter newDeltaLengthWriter() { + return new DeltaLengthByteArrayValuesWriter(INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + } + + private static DeltaByteArrayWriter newDeltaStringsWriter() { + return new DeltaByteArrayWriter(INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + } + + private static DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter newDictWriter() { + return new 
DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter( + MAX_DICT_BYTE_SIZE, Encoding.PLAIN_DICTIONARY, Encoding.PLAIN, new HeapByteBufferAllocator()); + } + + // ---- Encode benchmarks ---- + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodePlain() throws IOException { + return encodeBinaryWith(newPlainWriter()); + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeDeltaLengthByteArray() throws IOException { + return encodeBinaryWith(newDeltaLengthWriter()); + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeDeltaByteArray() throws IOException { + return encodeBinaryWith(newDeltaStringsWriter()); + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void encodeDictionary(Blackhole bh) throws IOException { + BenchmarkEncodingUtils.EncodedDictionary encoded = encodeDictionaryWith(newDictWriter()); + bh.consume(encoded.dictData); + bh.consume(encoded.dictPage); + } + + // ---- Decode benchmarks ---- + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodePlain(Blackhole bh) throws IOException { + BinaryPlainValuesReader reader = new BinaryPlainValuesReader(); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(plainEncoded))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(reader.readBytes()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeDeltaLengthByteArray(Blackhole bh) throws IOException { + DeltaLengthByteArrayValuesReader reader = new DeltaLengthByteArrayValuesReader(); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(deltaLengthEncoded))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(reader.readBytes()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeDeltaByteArray(Blackhole bh) throws IOException { + DeltaByteArrayReader reader = new DeltaByteArrayReader(); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(deltaStringsEncoded))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(reader.readBytes()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeDictionary(Blackhole bh) throws IOException { + if (!dictionaryAvailable) { + // Dictionary fell back to plain encoding (e.g. high-cardinality long strings + // exceeding MAX_DICT_BYTE_SIZE). Skip to keep the benchmark meaningful. + return; + } + DictionaryValuesReader reader = new DictionaryValuesReader(binaryDictionary); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(dictEncoded))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(reader.readBytes()); + } + } +} diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BlackHoleOutputFile.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BlackHoleOutputFile.java new file mode 100644 index 0000000000..690ddc2bbe --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BlackHoleOutputFile.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.benchmarks; + +import java.io.IOException; +import org.apache.parquet.io.OutputFile; +import org.apache.parquet.io.PositionOutputStream; + +/** + * A no-op {@link OutputFile} that discards all written data. + * Useful for isolating CPU/encoding cost from filesystem I/O in write benchmarks. + */ +public final class BlackHoleOutputFile implements OutputFile { + + public static final BlackHoleOutputFile INSTANCE = new BlackHoleOutputFile(); + + private BlackHoleOutputFile() {} + + @Override + public boolean supportsBlockSize() { + return false; + } + + @Override + public long defaultBlockSize() { + return -1L; + } + + @Override + public PositionOutputStream createOrOverwrite(long blockSizeHint) { + return create(blockSizeHint); + } + + @Override + public PositionOutputStream create(long blockSizeHint) { + return new PositionOutputStream() { + private long pos; + + @Override + public long getPos() throws IOException { + return pos; + } + + @Override + public void write(int b) throws IOException { + ++pos; + } + + @Override + public void write(byte[] b, int off, int len) throws IOException { + pos += len; + } + }; + } + + @Override + public String getPath() { + return "/dev/null"; + } +} diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BooleanEncodingBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BooleanEncodingBenchmark.java new file mode 100644 index 0000000000..7c3452652d --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BooleanEncodingBenchmark.java @@ -0,0 +1,222 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.benchmarks; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.bytes.HeapByteBufferAllocator; +import org.apache.parquet.column.values.ValuesReader; +import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.column.values.plain.BooleanPlainValuesReader; +import org.apache.parquet.column.values.plain.BooleanPlainValuesWriter; +import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridValuesReader; +import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridValuesWriter; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OperationsPerInvocation; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +/** + * Characterization benchmarks for BOOLEAN encoding in Parquet. + * + *

+ * <p>BOOLEAN columns use two distinct encoding paths:
+ *
+ * <ul>
+ *   <li>V1 (PLAIN): {@link BooleanPlainValuesWriter} delegates to
+ *       {@code ByteBitPackingValuesWriter(bitWidth=1)}. Always bit-packs.</li>
+ *   <li>V2 (RLE): {@link RunLengthBitPackingHybridValuesWriter} with
+ *       {@code bitWidth=1}. Uses the RLE/bit-packing hybrid, which can
+ *       run-length encode long runs of identical values.</li>
+ * </ul>
+ *
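+ * <p>The two writers are constructed as follows (sketch; these are the same
+ * constructors used in the benchmark bodies below):
+ * <pre>{@code
+ * ValuesWriter v1 = new BooleanPlainValuesWriter(); // PLAIN: always bit-packs
+ * ValuesWriter v2 = new RunLengthBitPackingHybridValuesWriter(
+ *     1, INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); // RLE hybrid, bitWidth = 1
+ * }</pre>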

+ * <p>The {@code dataPattern} parameter exercises RLE's best cases (ALL_TRUE,
+ * ALL_FALSE), worst case (ALTERNATING), and realistic distributions (RANDOM,
+ * MOSTLY_TRUE_99, MOSTLY_FALSE_99).
+ *

Each invocation processes {@value #VALUE_COUNT} values; throughput is + * reported per-value via {@link OperationsPerInvocation}. + */ +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(TimeUnit.SECONDS) +@Fork(1) +@Warmup(iterations = 3, time = 1) +@Measurement(iterations = 5, time = 1) +@State(Scope.Thread) +public class BooleanEncodingBenchmark { + + static final int VALUE_COUNT = 100_000; + private static final int INIT_SLAB_SIZE = 64 * 1024; + private static final int PAGE_SIZE = 4 * 1024 * 1024; + + @Param({"ALL_TRUE", "ALL_FALSE", "ALTERNATING", "RANDOM", "MOSTLY_TRUE_99", "MOSTLY_FALSE_99"}) + public String dataPattern; + + private boolean[] data; + private byte[] v1Page; + private byte[] v2Page; + + // Pre-allocated batch destination array + private boolean[] boolDest; + + @Setup(Level.Trial) + public void setup() throws IOException { + data = generateData(dataPattern); + v1Page = encodeV1(data); + v2Page = encodeV2(data); + boolDest = new boolean[VALUE_COUNT]; + } + + private static boolean[] generateData(String pattern) { + boolean[] d = new boolean[VALUE_COUNT]; + Random rng = new Random(42); + switch (pattern) { + case "ALL_TRUE": + for (int i = 0; i < VALUE_COUNT; i++) d[i] = true; + break; + case "ALL_FALSE": + // already false + break; + case "ALTERNATING": + for (int i = 0; i < VALUE_COUNT; i++) d[i] = (i & 1) == 0; + break; + case "RANDOM": + for (int i = 0; i < VALUE_COUNT; i++) d[i] = rng.nextBoolean(); + break; + case "MOSTLY_TRUE_99": + for (int i = 0; i < VALUE_COUNT; i++) d[i] = rng.nextInt(100) != 0; + break; + case "MOSTLY_FALSE_99": + for (int i = 0; i < VALUE_COUNT; i++) d[i] = rng.nextInt(100) == 0; + break; + default: + throw new IllegalArgumentException("Unknown pattern: " + pattern); + } + return d; + } + + private static byte[] encodeV1(boolean[] values) throws IOException { + ValuesWriter w = new BooleanPlainValuesWriter(); + for (boolean v : values) { + w.writeBoolean(v); + } + byte[] bytes = w.getBytes().toByteArray(); + w.close(); + return bytes; + } + + private static byte[] encodeV2(boolean[] values) throws IOException { + ValuesWriter w = new RunLengthBitPackingHybridValuesWriter( + 1, INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + for (boolean v : values) { + w.writeBoolean(v); + } + byte[] bytes = w.getBytes().toByteArray(); + w.close(); + return bytes; + } + + // ---- Encode benchmarks ---- + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodePlainV1() throws IOException { + return encodeV1(data); + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeRleV2() throws IOException { + return encodeV2(data); + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodePlainV1Batch() throws IOException { + ValuesWriter w = new BooleanPlainValuesWriter(); + w.writeBooleans(data, 0, data.length); + byte[] bytes = w.getBytes().toByteArray(); + w.close(); + return bytes; + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeRleV2Batch() throws IOException { + ValuesWriter w = new RunLengthBitPackingHybridValuesWriter( + 1, INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + w.writeBooleans(data, 0, data.length); + byte[] bytes = w.getBytes().toByteArray(); + w.close(); + return bytes; + } + + // ---- Decode benchmarks ---- + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodePlainV1(Blackhole bh) throws IOException { + ValuesReader r = new BooleanPlainValuesReader(); + r.initFromPage(VALUE_COUNT, 
ByteBufferInputStream.wrap(ByteBuffer.wrap(v1Page))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(r.readBoolean()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeRleV2(Blackhole bh) throws IOException { + ValuesReader r = new RunLengthBitPackingHybridValuesReader(1); + r.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(v2Page))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(r.readBoolean()); + } + } + + // ---- Batch decode benchmarks ---- + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodePlainV1Batch(Blackhole bh) throws IOException { + ValuesReader r = new BooleanPlainValuesReader(); + r.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(v1Page))); + r.readBooleans(boolDest, 0, VALUE_COUNT); + bh.consume(boolDest); + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeRleV2Batch(Blackhole bh) throws IOException { + ValuesReader r = new RunLengthBitPackingHybridValuesReader(1); + r.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(v2Page))); + r.readBooleans(boolDest, 0, VALUE_COUNT); + bh.consume(boolDest); + } +} diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ByteStreamSplitDecodingBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ByteStreamSplitDecodingBenchmark.java new file mode 100644 index 0000000000..81fda8c186 --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ByteStreamSplitDecodingBenchmark.java @@ -0,0 +1,217 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.benchmarks; + +import java.io.IOException; +import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.bytes.HeapByteBufferAllocator; +import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.column.values.bytestreamsplit.ByteStreamSplitValuesReader; +import org.apache.parquet.column.values.bytestreamsplit.ByteStreamSplitValuesReaderForDouble; +import org.apache.parquet.column.values.bytestreamsplit.ByteStreamSplitValuesReaderForFloat; +import org.apache.parquet.column.values.bytestreamsplit.ByteStreamSplitValuesReaderForInteger; +import org.apache.parquet.column.values.bytestreamsplit.ByteStreamSplitValuesReaderForLong; +import org.apache.parquet.column.values.bytestreamsplit.ByteStreamSplitValuesWriter; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OperationsPerInvocation; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +/** + * Decoding-level micro-benchmarks for the BYTE_STREAM_SPLIT encoding across the four + * primitive widths supported by Parquet ({@code FLOAT}, {@code DOUBLE}, {@code INT32}, + * {@code INT64}). + * + *
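+ * <p>Each width pairs a split writer with its matching reader; the FLOAT round trip,
+ * as a sketch of what setup and the decode benchmarks do:
+ * <pre>{@code
+ * ValuesWriter w = new ByteStreamSplitValuesWriter.FloatByteStreamSplitValuesWriter(
+ *     INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator());
+ * w.writeFloat(1.0f);
+ * byte[] page = w.getBytes().toByteArray();
+ * ByteStreamSplitValuesReaderForFloat r = new ByteStreamSplitValuesReaderForFloat();
+ * r.initFromPage(1, ByteBufferInputStream.wrap(ByteBuffer.wrap(page)));
+ * float value = r.readFloat();
+ * }</pre>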

Each invocation decodes {@value #VALUE_COUNT} values; throughput is reported + * per-value via {@link OperationsPerInvocation}. The cost includes both + * {@code initFromPage} (which eagerly transposes the entire page) and the per-value + * read calls. Page transposition is the part this benchmark is primarily designed + * to exercise. + */ +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(TimeUnit.SECONDS) +@Fork(1) +@Warmup(iterations = 3, time = 1) +@Measurement(iterations = 5, time = 1) +@State(Scope.Thread) +public class ByteStreamSplitDecodingBenchmark { + + static final int VALUE_COUNT = 100_000; + private static final int INIT_SLAB_SIZE = 64 * 1024; + private static final int PAGE_SIZE = 4 * 1024 * 1024; + + private byte[] floatPage; + private byte[] doublePage; + private byte[] intPage; + private byte[] longPage; + + // Pre-allocated batch destination arrays (avoid per-invocation allocation artifact) + private float[] floatDest; + private double[] doubleDest; + private int[] intDest; + private long[] longDest; + + @Setup(Level.Trial) + public void setup() throws IOException { + Random random = new Random(42); + int[] intData = new int[VALUE_COUNT]; + long[] longData = new long[VALUE_COUNT]; + float[] floatData = new float[VALUE_COUNT]; + double[] doubleData = new double[VALUE_COUNT]; + for (int i = 0; i < VALUE_COUNT; i++) { + intData[i] = random.nextInt(); + longData[i] = random.nextLong(); + floatData[i] = random.nextFloat(); + doubleData[i] = random.nextDouble(); + } + + { + ValuesWriter w = new ByteStreamSplitValuesWriter.FloatByteStreamSplitValuesWriter( + INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + for (float v : floatData) { + w.writeFloat(v); + } + floatPage = w.getBytes().toByteArray(); + w.close(); + } + { + ValuesWriter w = new ByteStreamSplitValuesWriter.DoubleByteStreamSplitValuesWriter( + INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + for (double v : doubleData) { + w.writeDouble(v); + } + doublePage = w.getBytes().toByteArray(); + w.close(); + } + { + ValuesWriter w = new ByteStreamSplitValuesWriter.IntegerByteStreamSplitValuesWriter( + INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + for (int v : intData) { + w.writeInteger(v); + } + intPage = w.getBytes().toByteArray(); + w.close(); + } + { + ValuesWriter w = new ByteStreamSplitValuesWriter.LongByteStreamSplitValuesWriter( + INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + for (long v : longData) { + w.writeLong(v); + } + longPage = w.getBytes().toByteArray(); + w.close(); + } + + floatDest = new float[VALUE_COUNT]; + doubleDest = new double[VALUE_COUNT]; + intDest = new int[VALUE_COUNT]; + longDest = new long[VALUE_COUNT]; + } + + private static void init(ByteStreamSplitValuesReader r, byte[] page) throws IOException { + r.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(java.nio.ByteBuffer.wrap(page))); + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeFloat(Blackhole bh) throws IOException { + ByteStreamSplitValuesReaderForFloat r = new ByteStreamSplitValuesReaderForFloat(); + init(r, floatPage); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(r.readFloat()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeDouble(Blackhole bh) throws IOException { + ByteStreamSplitValuesReaderForDouble r = new ByteStreamSplitValuesReaderForDouble(); + init(r, doublePage); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(r.readDouble()); + } + } + + @Benchmark + 
@OperationsPerInvocation(VALUE_COUNT) + public void decodeInt(Blackhole bh) throws IOException { + ByteStreamSplitValuesReaderForInteger r = new ByteStreamSplitValuesReaderForInteger(); + init(r, intPage); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(r.readInteger()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeLong(Blackhole bh) throws IOException { + ByteStreamSplitValuesReaderForLong r = new ByteStreamSplitValuesReaderForLong(); + init(r, longPage); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(r.readLong()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeFloatBatch(Blackhole bh) throws IOException { + ByteStreamSplitValuesReaderForFloat r = new ByteStreamSplitValuesReaderForFloat(); + init(r, floatPage); + r.readFloats(floatDest, 0, VALUE_COUNT); + bh.consume(floatDest); + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeDoubleBatch(Blackhole bh) throws IOException { + ByteStreamSplitValuesReaderForDouble r = new ByteStreamSplitValuesReaderForDouble(); + init(r, doublePage); + r.readDoubles(doubleDest, 0, VALUE_COUNT); + bh.consume(doubleDest); + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeIntBatch(Blackhole bh) throws IOException { + ByteStreamSplitValuesReaderForInteger r = new ByteStreamSplitValuesReaderForInteger(); + init(r, intPage); + r.readIntegers(intDest, 0, VALUE_COUNT); + bh.consume(intDest); + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeLongBatch(Blackhole bh) throws IOException { + ByteStreamSplitValuesReaderForLong r = new ByteStreamSplitValuesReaderForLong(); + init(r, longPage); + r.readLongs(longDest, 0, VALUE_COUNT); + bh.consume(longDest); + } +} diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ByteStreamSplitEncodingBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ByteStreamSplitEncodingBenchmark.java new file mode 100644 index 0000000000..37ec9df812 --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ByteStreamSplitEncodingBenchmark.java @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.benchmarks; + +import java.io.IOException; +import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.parquet.bytes.HeapByteBufferAllocator; +import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.column.values.bytestreamsplit.ByteStreamSplitValuesWriter; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OperationsPerInvocation; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; + +/** + * Encoding-level micro-benchmarks for the BYTE_STREAM_SPLIT encoding across the four + * primitive widths supported by Parquet ({@code FLOAT}, {@code DOUBLE}, {@code INT32}, + * {@code INT64}). + * + *

Each invocation encodes {@value #VALUE_COUNT} values; throughput is reported + * per-value via {@link OperationsPerInvocation}. + */ +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(TimeUnit.SECONDS) +@Fork(1) +@Warmup(iterations = 3, time = 1) +@Measurement(iterations = 5, time = 1) +@State(Scope.Thread) +public class ByteStreamSplitEncodingBenchmark { + + static final int VALUE_COUNT = 100_000; + private static final int INIT_SLAB_SIZE = 64 * 1024; + private static final int PAGE_SIZE = 4 * 1024 * 1024; + + private int[] intData; + private long[] longData; + private float[] floatData; + private double[] doubleData; + + @Setup(Level.Trial) + public void setup() { + Random random = new Random(42); + intData = new int[VALUE_COUNT]; + longData = new long[VALUE_COUNT]; + floatData = new float[VALUE_COUNT]; + doubleData = new double[VALUE_COUNT]; + for (int i = 0; i < VALUE_COUNT; i++) { + intData[i] = random.nextInt(); + longData[i] = random.nextLong(); + floatData[i] = random.nextFloat(); + doubleData[i] = random.nextDouble(); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeFloat() throws IOException { + ValuesWriter w = new ByteStreamSplitValuesWriter.FloatByteStreamSplitValuesWriter( + INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + for (float v : floatData) { + w.writeFloat(v); + } + byte[] bytes = w.getBytes().toByteArray(); + w.close(); + return bytes; + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeDouble() throws IOException { + ValuesWriter w = new ByteStreamSplitValuesWriter.DoubleByteStreamSplitValuesWriter( + INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + for (double v : doubleData) { + w.writeDouble(v); + } + byte[] bytes = w.getBytes().toByteArray(); + w.close(); + return bytes; + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeInt() throws IOException { + ValuesWriter w = new ByteStreamSplitValuesWriter.IntegerByteStreamSplitValuesWriter( + INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + for (int v : intData) { + w.writeInteger(v); + } + byte[] bytes = w.getBytes().toByteArray(); + w.close(); + return bytes; + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeLong() throws IOException { + ValuesWriter w = new ByteStreamSplitValuesWriter.LongByteStreamSplitValuesWriter( + INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + for (long v : longData) { + w.writeLong(v); + } + byte[] bytes = w.getBytes().toByteArray(); + w.close(); + return bytes; + } +} diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/CompressionBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/CompressionBenchmark.java new file mode 100644 index 0000000000..11e9fe6d6a --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/CompressionBenchmark.java @@ -0,0 +1,157 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.benchmarks; + +import java.io.IOException; +import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.bytes.BytesInput; +import org.apache.parquet.bytes.DirectByteBufferAllocator; +import org.apache.parquet.compression.CompressionCodecFactory; +import org.apache.parquet.hadoop.CodecFactory; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.TearDown; +import org.openjdk.jmh.annotations.Warmup; + +/** + * Isolated JMH benchmarks for raw Parquet compression and decompression throughput. + * + *

+ * <p>Measures the performance of {@link CompressionCodecFactory.BytesInputCompressor}
+ * and {@link CompressionCodecFactory.BytesInputDecompressor} for each supported codec,
+ * using the direct-memory {@link CodecFactory} path (same as actual Parquet file I/O).
+ * Input data is generated to approximate realistic Parquet page content (a mix of
+ * sequential, repeated, and random byte patterns).
+ *
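+ * <p>The codec hot path measured here, in sketch form (mirrors {@code setup()} and
+ * the two benchmark methods; ZSTD stands in for the {@code codec} parameter):
+ * <pre>{@code
+ * CodecFactory factory = CodecFactory.createDirectCodecFactory(
+ *     new Configuration(), DirectByteBufferAllocator.getInstance(), pageSize);
+ * CompressionCodecFactory.BytesInputCompressor compressor =
+ *     factory.getCompressor(CompressionCodecName.ZSTD);
+ * BytesInput compressed = compressor.compress(BytesInput.from(uncompressedData));
+ * byte[] restored = factory.getDecompressor(CompressionCodecName.ZSTD)
+ *     .decompress(BytesInput.from(compressed.toByteArray()), uncompressedData.length)
+ *     .toByteArray();
+ * }</pre>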

This benchmark isolates the codec hot path from file I/O, encoding, and other + * Parquet overhead, making it ideal for measuring compression-specific optimizations. + */ +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(TimeUnit.SECONDS) +@Fork(1) +@Warmup(iterations = 2, time = 1) +@Measurement(iterations = 3, time = 2) +@State(Scope.Thread) +public class CompressionBenchmark { + + @Param({"SNAPPY", "ZSTD", "LZ4_RAW", "GZIP", "BROTLI"}) + public String codec; + + @Param({"65536", "131072", "262144", "1048576"}) + public int pageSize; + + private byte[] uncompressedData; + private byte[] compressedData; + private int decompressedSize; + + private CompressionCodecFactory.BytesInputCompressor compressor; + private CompressionCodecFactory.BytesInputDecompressor decompressor; + private CodecFactory factory; + + @Setup(Level.Trial) + public void setup() throws IOException { + uncompressedData = generatePageData(pageSize, 42L); + decompressedSize = uncompressedData.length; + + Configuration conf = new Configuration(); + factory = CodecFactory.createDirectCodecFactory(conf, DirectByteBufferAllocator.getInstance(), pageSize); + CompressionCodecName codecName = CompressionCodecName.valueOf(codec); + + compressor = factory.getCompressor(codecName); + decompressor = factory.getDecompressor(codecName); + + // Pre-compress for decompression benchmark; copy to a stable byte array + // since the compressor may reuse its internal buffer. + BytesInput compressed = compressor.compress(BytesInput.from(uncompressedData)); + compressedData = compressed.toByteArray(); + } + + @TearDown(Level.Trial) + public void tearDown() { + factory.release(); + } + + @Benchmark + public BytesInput compress() throws IOException { + return compressor.compress(BytesInput.from(uncompressedData)); + } + + @Benchmark + public byte[] decompress() throws IOException { + // Force materialization of the decompressed data. Without this, codecs using + // the stream-based HeapBytesDecompressor (e.g. GZIP) would return a lazy + // StreamBytesInput, deferring the actual work. toByteArray() is essentially + // free for our optimized implementations (returns the existing byte[]). + return decompressor + .decompress(BytesInput.from(compressedData), decompressedSize) + .toByteArray(); + } + + /** + * Generates byte data that approximates realistic Parquet page content. + * Mixes sequential runs, repeated values, low-range random, and full random + * to produce a realistic compression ratio (~2-4x for fast codecs). 
+ */ + static byte[] generatePageData(int size, long seed) { + Random random = new Random(seed); + byte[] data = new byte[size]; + int i = 0; + while (i < size) { + int patternType = random.nextInt(4); + int chunkSize = Math.min(random.nextInt(256) + 64, size - i); + switch (patternType) { + case 0: // Sequential bytes (highly compressible) + for (int j = 0; j < chunkSize && i < size; j++) { + data[i++] = (byte) (j & 0xFF); + } + break; + case 1: // Repeated value (highly compressible) + byte val = (byte) random.nextInt(256); + for (int j = 0; j < chunkSize && i < size; j++) { + data[i++] = val; + } + break; + case 2: // Small range random (moderately compressible) + for (int j = 0; j < chunkSize && i < size; j++) { + data[i++] = (byte) random.nextInt(16); + } + break; + case 3: // Full random (low compressibility) + byte[] randomChunk = new byte[chunkSize]; + random.nextBytes(randomChunk); + int toCopy = Math.min(chunkSize, size - i); + System.arraycopy(randomChunk, 0, data, i, toCopy); + i += toCopy; + break; + } + } + return data; + } +} diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ConcurrentReadWriteBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ConcurrentReadWriteBenchmark.java new file mode 100644 index 0000000000..de94b422cf --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ConcurrentReadWriteBenchmark.java @@ -0,0 +1,144 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.benchmarks; + +import java.io.File; +import java.io.IOException; +import java.util.concurrent.TimeUnit; +import org.apache.parquet.example.data.Group; +import org.apache.parquet.hadoop.ParquetFileWriter; +import org.apache.parquet.hadoop.ParquetReader; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.hadoop.api.ReadSupport; +import org.apache.parquet.hadoop.example.ExampleParquetWriter; +import org.apache.parquet.hadoop.example.GroupReadSupport; +import org.apache.parquet.io.InputFile; +import org.apache.parquet.io.LocalInputFile; +import org.apache.parquet.io.LocalOutputFile; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.TearDown; +import org.openjdk.jmh.annotations.Threads; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +/** + * Multi-threaded benchmarks measuring independent read and write throughput under + * concurrency. Uses {@code @Threads(4)} by default (overridable via JMH {@code -t} flag). + * + *

+ * <p>This benchmark does not assert correctness; it measures the cost of each thread
+ * writing a full file to a stateless sink or reading a shared pre-generated file.
+ * The set of rows used by {@link #concurrentWrite(Blackhole)} is built once during
+ * setup and shared (read-only) across all threads, so the timed section measures
+ * the encoder/serializer pipeline rather than per-row data construction.
+ *

+ * <ul>
+ *   <li>{@link #concurrentWrite(Blackhole)} - each thread independently writes the
+ *       shared pre-generated rows to a {@link BlackHoleOutputFile} (stateless sink)</li>
+ *   <li>{@link #concurrentRead(Blackhole)} - each thread independently reads the same
+ *       pre-generated Parquet file</li>
+ * </ul>
+ *
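+ * <p>Each {@code concurrentWrite} invocation builds an independent writer against the
+ * stateless sink (sketch; the real method also sets {@code OVERWRITE} mode):
+ * <pre>{@code
+ * try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(BlackHoleOutputFile.INSTANCE)
+ *     .withType(TestDataFactory.FILE_BENCHMARK_SCHEMA)
+ *     .build()) {
+ *   for (Group row : rows) {
+ *     writer.write(row);
+ *   }
+ * }
+ * }</pre>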

{@link Mode#SingleShotTime} is used because each invocation does enough work + * (a full file write or read of {@value TestDataFactory#DEFAULT_ROW_COUNT} rows) + * that JIT amortization across invocations is unnecessary. + */ +@BenchmarkMode(Mode.SingleShotTime) +@OutputTimeUnit(TimeUnit.MILLISECONDS) +@Fork(1) +@Warmup(iterations = 2, batchSize = 1) +@Measurement(iterations = 5, batchSize = 1) +@Threads(4) +@State(Scope.Benchmark) +public class ConcurrentReadWriteBenchmark { + + private File tempFile; + private Group[] rows; + + @Setup(Level.Trial) + public void setup() throws IOException { + rows = TestDataFactory.generateRows( + TestDataFactory.newGroupFactory(), TestDataFactory.DEFAULT_ROW_COUNT, TestDataFactory.DEFAULT_SEED); + + // Generate a shared file for concurrent reads + tempFile = File.createTempFile("parquet-concurrent-bench-", ".parquet"); + tempFile.deleteOnExit(); + tempFile.delete(); + + try (ParquetWriter writer = ExampleParquetWriter.builder(new LocalOutputFile(tempFile.toPath())) + .withWriteMode(ParquetFileWriter.Mode.OVERWRITE) + .withType(TestDataFactory.FILE_BENCHMARK_SCHEMA) + .build()) { + for (Group row : rows) { + writer.write(row); + } + } + } + + @TearDown(Level.Trial) + public void tearDown() { + if (tempFile != null && tempFile.exists()) { + tempFile.delete(); + } + } + + /** + * Each thread writes the shared pre-generated rows independently to the + * stateless {@link BlackHoleOutputFile} sink. + */ + @Benchmark + public void concurrentWrite(Blackhole bh) throws IOException { + try (ParquetWriter writer = ExampleParquetWriter.builder(BlackHoleOutputFile.INSTANCE) + .withWriteMode(ParquetFileWriter.Mode.OVERWRITE) + .withType(TestDataFactory.FILE_BENCHMARK_SCHEMA) + .build()) { + for (Group row : rows) { + writer.write(row); + } + } + bh.consume(rows); + } + + /** + * Each thread reads the full pre-generated file independently. + */ + @Benchmark + public void concurrentRead(Blackhole bh) throws IOException { + InputFile inputFile = new LocalInputFile(tempFile.toPath()); + try (ParquetReader reader = new ParquetReader.Builder(inputFile) { + @Override + protected ReadSupport getReadSupport() { + return new GroupReadSupport(); + } + }.build()) { + Group group; + while ((group = reader.read()) != null) { + bh.consume(group); + } + } + } +} diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/DictionaryEncodingBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/DictionaryEncodingBenchmark.java new file mode 100644 index 0000000000..327d6beb0f --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/DictionaryEncodingBenchmark.java @@ -0,0 +1,326 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.benchmarks; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.concurrent.TimeUnit; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.bytes.HeapByteBufferAllocator; +import org.apache.parquet.column.Dictionary; +import org.apache.parquet.column.Encoding; +import org.apache.parquet.column.page.DictionaryPage; +import org.apache.parquet.column.values.dictionary.DictionaryValuesReader; +import org.apache.parquet.column.values.dictionary.DictionaryValuesWriter; +import org.apache.parquet.column.values.dictionary.PlainValuesDictionary; +import org.apache.parquet.io.api.Binary; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OperationsPerInvocation; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +/** + * Dictionary encoding/decoding benchmarks for LONG, FLOAT, DOUBLE, and + * FIXED_LEN_BYTE_ARRAY types — complementing the INT32 dictionary coverage + * already in {@link IntEncodingBenchmark}. + * + *

+ * <p>Each type's encode benchmark measures the full dictionary-build path
+ * (type-specific hash map + id append). Decode benchmarks measure the
+ * {@link DictionaryValuesReader} lookup path, both per-value and batch.
+ *
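+ * <p>The LONG path, for example, builds its dictionary as follows (sketch of
+ * {@code setupLongDict} and {@code encodeLong}):
+ * <pre>{@code
+ * DictionaryValuesWriter.PlainLongDictionaryValuesWriter w =
+ *     new DictionaryValuesWriter.PlainLongDictionaryValuesWriter(
+ *         MAX_DICT_BYTE_SIZE, Encoding.PLAIN_DICTIONARY, Encoding.PLAIN,
+ *         new HeapByteBufferAllocator());
+ * for (long v : longData) {
+ *   w.writeLong(v);
+ * }
+ * BenchmarkEncodingUtils.EncodedDictionary enc = BenchmarkEncodingUtils.drainDictionary(w);
+ * }</pre>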

The {@code dataPattern} parameter controls cardinality to exercise + * both the dictionary-hits-only path (LOW_CARDINALITY) and the path + * where every value is unique (HIGH_CARDINALITY, which may trigger + * dictionary fallback for large value counts). + */ +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(TimeUnit.SECONDS) +@Fork(1) +@Warmup(iterations = 3, time = 1) +@Measurement(iterations = 5, time = 1) +@State(Scope.Thread) +public class DictionaryEncodingBenchmark { + + static final int VALUE_COUNT = 100_000; + private static final int MAX_DICT_BYTE_SIZE = 4 * 1024 * 1024; + private static final int INIT_SLAB_SIZE = 64 * 1024; + private static final int PAGE_SIZE = 1024 * 1024; + + @Param({"LOW_CARDINALITY", "HIGH_CARDINALITY"}) + public String dataPattern; + + // ---- Data arrays ---- + private long[] longData; + private float[] floatData; + private double[] doubleData; + private Binary[] flbaData; + + // ---- Pre-encoded dictionary pages for decode benchmarks ---- + private byte[] longDictDataEncoded; + private Dictionary longDictionary; + private boolean longDictAvailable; + + private byte[] floatDictDataEncoded; + private Dictionary floatDictionary; + private boolean floatDictAvailable; + + private byte[] doubleDictDataEncoded; + private Dictionary doubleDictionary; + private boolean doubleDictAvailable; + + private byte[] flbaDictDataEncoded; + private Dictionary flbaDictionary; + private boolean flbaDictAvailable; + + // Fixed length for FLBA tests (16 = UUID-sized) + private static final int FLBA_LENGTH = 16; + + @Setup(Level.Trial) + public void setup() throws IOException { + int distinct = "LOW_CARDINALITY".equals(dataPattern) + ? TestDataFactory.LOW_CARDINALITY_DISTINCT + : 0; // 0 = all unique for HIGH_CARDINALITY + + long seed = TestDataFactory.DEFAULT_SEED; + + // Generate data + if (distinct > 0) { + longData = TestDataFactory.generateLowCardinalityLongs(VALUE_COUNT, distinct, seed); + floatData = TestDataFactory.generateLowCardinalityFloats(VALUE_COUNT, distinct, seed); + doubleData = TestDataFactory.generateLowCardinalityDoubles(VALUE_COUNT, distinct, seed); + flbaData = TestDataFactory.generateFixedLenByteArrays(VALUE_COUNT, FLBA_LENGTH, distinct, seed); + } else { + longData = TestDataFactory.generateRandomLongs(VALUE_COUNT, seed); + floatData = TestDataFactory.generateRandomFloats(VALUE_COUNT, seed); + doubleData = TestDataFactory.generateRandomDoubles(VALUE_COUNT, seed); + flbaData = TestDataFactory.generateFixedLenByteArrays(VALUE_COUNT, FLBA_LENGTH, 0, seed); + } + + // Pre-encode for decode benchmarks + setupLongDict(); + setupFloatDict(); + setupDoubleDict(); + setupFlbaDict(); + } + + private void setupLongDict() throws IOException { + DictionaryValuesWriter.PlainLongDictionaryValuesWriter w = new DictionaryValuesWriter.PlainLongDictionaryValuesWriter( + MAX_DICT_BYTE_SIZE, Encoding.PLAIN_DICTIONARY, Encoding.PLAIN, new HeapByteBufferAllocator()); + for (long v : longData) { + w.writeLong(v); + } + BenchmarkEncodingUtils.EncodedDictionary enc = BenchmarkEncodingUtils.drainDictionary(w); + longDictDataEncoded = enc.dictData; + longDictAvailable = !enc.fellBackToPlain(); + if (longDictAvailable) { + longDictionary = new PlainValuesDictionary.PlainLongDictionary(enc.dictPage); + } + } + + private void setupFloatDict() throws IOException { + DictionaryValuesWriter.PlainFloatDictionaryValuesWriter w = new DictionaryValuesWriter.PlainFloatDictionaryValuesWriter( + MAX_DICT_BYTE_SIZE, Encoding.PLAIN_DICTIONARY, Encoding.PLAIN, new HeapByteBufferAllocator()); + for 
(float v : floatData) { + w.writeFloat(v); + } + BenchmarkEncodingUtils.EncodedDictionary enc = BenchmarkEncodingUtils.drainDictionary(w); + floatDictDataEncoded = enc.dictData; + floatDictAvailable = !enc.fellBackToPlain(); + if (floatDictAvailable) { + floatDictionary = new PlainValuesDictionary.PlainFloatDictionary(enc.dictPage); + } + } + + private void setupDoubleDict() throws IOException { + DictionaryValuesWriter.PlainDoubleDictionaryValuesWriter w = new DictionaryValuesWriter.PlainDoubleDictionaryValuesWriter( + MAX_DICT_BYTE_SIZE, Encoding.PLAIN_DICTIONARY, Encoding.PLAIN, new HeapByteBufferAllocator()); + for (double v : doubleData) { + w.writeDouble(v); + } + BenchmarkEncodingUtils.EncodedDictionary enc = BenchmarkEncodingUtils.drainDictionary(w); + doubleDictDataEncoded = enc.dictData; + doubleDictAvailable = !enc.fellBackToPlain(); + if (doubleDictAvailable) { + doubleDictionary = new PlainValuesDictionary.PlainDoubleDictionary(enc.dictPage); + } + } + + private void setupFlbaDict() throws IOException { + DictionaryValuesWriter.PlainFixedLenArrayDictionaryValuesWriter w = + new DictionaryValuesWriter.PlainFixedLenArrayDictionaryValuesWriter( + MAX_DICT_BYTE_SIZE, FLBA_LENGTH, Encoding.PLAIN_DICTIONARY, Encoding.PLAIN, + new HeapByteBufferAllocator()); + for (Binary v : flbaData) { + w.writeBytes(v); + } + BenchmarkEncodingUtils.EncodedDictionary enc = BenchmarkEncodingUtils.drainDictionary(w); + flbaDictDataEncoded = enc.dictData; + flbaDictAvailable = !enc.fellBackToPlain(); + if (flbaDictAvailable) { + flbaDictionary = new PlainValuesDictionary.PlainBinaryDictionary(enc.dictPage, FLBA_LENGTH); + } + } + + // ==== ENCODE BENCHMARKS ==== + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void encodeLong(Blackhole bh) throws IOException { + DictionaryValuesWriter.PlainLongDictionaryValuesWriter w = new DictionaryValuesWriter.PlainLongDictionaryValuesWriter( + MAX_DICT_BYTE_SIZE, Encoding.PLAIN_DICTIONARY, Encoding.PLAIN, new HeapByteBufferAllocator()); + for (long v : longData) { + w.writeLong(v); + } + BenchmarkEncodingUtils.EncodedDictionary enc = BenchmarkEncodingUtils.drainDictionary(w); + bh.consume(enc.dictData); + bh.consume(enc.dictPage); + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void encodeFloat(Blackhole bh) throws IOException { + DictionaryValuesWriter.PlainFloatDictionaryValuesWriter w = new DictionaryValuesWriter.PlainFloatDictionaryValuesWriter( + MAX_DICT_BYTE_SIZE, Encoding.PLAIN_DICTIONARY, Encoding.PLAIN, new HeapByteBufferAllocator()); + for (float v : floatData) { + w.writeFloat(v); + } + BenchmarkEncodingUtils.EncodedDictionary enc = BenchmarkEncodingUtils.drainDictionary(w); + bh.consume(enc.dictData); + bh.consume(enc.dictPage); + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void encodeDouble(Blackhole bh) throws IOException { + DictionaryValuesWriter.PlainDoubleDictionaryValuesWriter w = new DictionaryValuesWriter.PlainDoubleDictionaryValuesWriter( + MAX_DICT_BYTE_SIZE, Encoding.PLAIN_DICTIONARY, Encoding.PLAIN, new HeapByteBufferAllocator()); + for (double v : doubleData) { + w.writeDouble(v); + } + BenchmarkEncodingUtils.EncodedDictionary enc = BenchmarkEncodingUtils.drainDictionary(w); + bh.consume(enc.dictData); + bh.consume(enc.dictPage); + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void encodeFlba(Blackhole bh) throws IOException { + DictionaryValuesWriter.PlainFixedLenArrayDictionaryValuesWriter w = + new 
DictionaryValuesWriter.PlainFixedLenArrayDictionaryValuesWriter( + MAX_DICT_BYTE_SIZE, FLBA_LENGTH, Encoding.PLAIN_DICTIONARY, Encoding.PLAIN, + new HeapByteBufferAllocator()); + for (Binary v : flbaData) { + w.writeBytes(v); + } + BenchmarkEncodingUtils.EncodedDictionary enc = BenchmarkEncodingUtils.drainDictionary(w); + bh.consume(enc.dictData); + bh.consume(enc.dictPage); + } + + // ==== DECODE BENCHMARKS (per-value) ==== + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeLong(Blackhole bh) throws IOException { + if (!longDictAvailable) return; + DictionaryValuesReader r = new DictionaryValuesReader(longDictionary); + r.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(longDictDataEncoded))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(r.readLong()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeFloat(Blackhole bh) throws IOException { + if (!floatDictAvailable) return; + DictionaryValuesReader r = new DictionaryValuesReader(floatDictionary); + r.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(floatDictDataEncoded))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(r.readFloat()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeDouble(Blackhole bh) throws IOException { + if (!doubleDictAvailable) return; + DictionaryValuesReader r = new DictionaryValuesReader(doubleDictionary); + r.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(doubleDictDataEncoded))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(r.readDouble()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeFlba(Blackhole bh) throws IOException { + if (!flbaDictAvailable) return; + DictionaryValuesReader r = new DictionaryValuesReader(flbaDictionary); + r.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(flbaDictDataEncoded))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(r.readBytes()); + } + } + + // ==== DECODE BENCHMARKS (batch) ==== + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public long[] decodeLongBatch() throws IOException { + if (!longDictAvailable) return new long[0]; + DictionaryValuesReader r = new DictionaryValuesReader(longDictionary); + r.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(longDictDataEncoded))); + long[] dest = new long[VALUE_COUNT]; + r.readLongs(dest, 0, VALUE_COUNT); + return dest; + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public float[] decodeFloatBatch() throws IOException { + if (!floatDictAvailable) return new float[0]; + DictionaryValuesReader r = new DictionaryValuesReader(floatDictionary); + r.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(floatDictDataEncoded))); + float[] dest = new float[VALUE_COUNT]; + r.readFloats(dest, 0, VALUE_COUNT); + return dest; + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public double[] decodeDoubleBatch() throws IOException { + if (!doubleDictAvailable) return new double[0]; + DictionaryValuesReader r = new DictionaryValuesReader(doubleDictionary); + r.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(doubleDictDataEncoded))); + double[] dest = new double[VALUE_COUNT]; + r.readDoubles(dest, 0, VALUE_COUNT); + return dest; + } +} diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileReadBenchmark.java 
b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileReadBenchmark.java new file mode 100644 index 0000000000..a2da10eb38 --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileReadBenchmark.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.benchmarks; + +import java.io.File; +import java.io.IOException; +import java.util.concurrent.TimeUnit; +import org.apache.parquet.column.ParquetProperties.WriterVersion; +import org.apache.parquet.example.data.Group; +import org.apache.parquet.hadoop.ParquetFileWriter; +import org.apache.parquet.hadoop.ParquetReader; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.hadoop.api.ReadSupport; +import org.apache.parquet.hadoop.example.ExampleParquetWriter; +import org.apache.parquet.hadoop.example.GroupReadSupport; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.apache.parquet.io.InputFile; +import org.apache.parquet.io.LocalInputFile; +import org.apache.parquet.io.LocalOutputFile; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.TearDown; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +/** + * File-level read benchmarks measuring end-to-end Parquet read throughput through the + * example {@link Group} API. A temporary file is generated once during setup from + * pre-generated rows using {@link LocalOutputFile}, then read repeatedly during the + * benchmark. + * + *

Parameterized across compression codec and writer version. The footer parse (performed + * when the reader opens the {@link LocalInputFile}) is included in the timed section, so the result + * reflects the full open-and-read cost a typical caller would observe. + * + *

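<p>For reference, a typical invocation pinning one parameter combination looks like the + * following (the jar name is illustrative; it depends on how this module's shaded jar is named): + * <pre>{@code + * java -jar parquet-benchmarks.jar FileReadBenchmark -p codec=ZSTD -p writerVersion=PARQUET_2_0 + * }</pre> + * + *
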
{@link Mode#SingleShotTime} is used because each invocation does enough work + * (a full read of {@value TestDataFactory#DEFAULT_ROW_COUNT} rows) that JIT + * amortization across invocations is unnecessary. Ten measurement iterations + * provide stable statistics for SS mode. + */ +@BenchmarkMode(Mode.SingleShotTime) +@Fork(1) +@Warmup(iterations = 5, batchSize = 1) +@Measurement(iterations = 10, batchSize = 1) +@OutputTimeUnit(TimeUnit.MILLISECONDS) +@State(Scope.Benchmark) +public class FileReadBenchmark { + + @Param({"UNCOMPRESSED", "SNAPPY", "ZSTD", "GZIP", "LZ4_RAW"}) + public String codec; + + @Param({"PARQUET_1_0", "PARQUET_2_0"}) + public String writerVersion; + + private File tempFile; + + @Setup(Level.Trial) + public void setup() throws IOException { + tempFile = File.createTempFile("parquet-read-bench-", ".parquet"); + tempFile.deleteOnExit(); + tempFile.delete(); // remove so the writer can create it + + Group[] rows = TestDataFactory.generateRows( + TestDataFactory.newGroupFactory(), TestDataFactory.DEFAULT_ROW_COUNT, TestDataFactory.DEFAULT_SEED); + try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(new LocalOutputFile(tempFile.toPath())) + .withWriteMode(ParquetFileWriter.Mode.OVERWRITE) + .withType(TestDataFactory.FILE_BENCHMARK_SCHEMA) + .withCompressionCodec(CompressionCodecName.valueOf(codec)) + .withWriterVersion(WriterVersion.valueOf(writerVersion)) + .withDictionaryEncoding(true) + .build()) { + for (Group row : rows) { + writer.write(row); + } + } + } + + @TearDown(Level.Trial) + public void tearDown() { + if (tempFile != null && tempFile.exists()) { + tempFile.delete(); + } + } + + @Benchmark + public void readFile(Blackhole bh) throws IOException { + InputFile inputFile = new LocalInputFile(tempFile.toPath()); + try (ParquetReader<Group> reader = new ParquetReader.Builder<Group>(inputFile) { + @Override + protected ReadSupport<Group> getReadSupport() { + return new GroupReadSupport(); + } + }.build()) { + Group group; + while ((group = reader.read()) != null) { + bh.consume(group); + } + } + } +} diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileWriteBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileWriteBenchmark.java new file mode 100644 index 0000000000..4fa5bf238a --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileWriteBenchmark.java @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.benchmarks; + +import java.io.IOException; +import java.util.concurrent.TimeUnit; +import org.apache.parquet.column.ParquetProperties.WriterVersion; +import org.apache.parquet.example.data.Group; +import org.apache.parquet.hadoop.ParquetFileWriter; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.hadoop.example.ExampleParquetWriter; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; + +/** + * File-level write benchmarks measuring end-to-end Parquet write throughput through the + * example {@link Group} API. Row contents are pre-generated during setup so compression + * and writer settings dominate the timed section, while writes still flow through the + * full Parquet writer path. + * + *

Writes are sent to a {@link BlackHoleOutputFile} to isolate CPU and encoding cost + * from filesystem I/O. Parameterized across compression codec, writer version, and + * dictionary encoding. + * + *

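<p>The essence of such a sink is a position-tracking stream that discards its bytes; a minimal + * sketch of the idea (not necessarily the exact {@code BlackHoleOutputFile} implementation): + * <pre>{@code + * PositionOutputStream out = new PositionOutputStream() { + *   private long pos; + *   public long getPos() { return pos; } + *   public void write(int b) { pos++; } + *   public void write(byte[] b, int off, int len) { pos += len; } + * }; + * }</pre> + * + *
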
{@link Mode#SingleShotTime} is used because each invocation does enough work + * (a full write of {@value TestDataFactory#DEFAULT_ROW_COUNT} rows) that JIT + * amortization across invocations is unnecessary. Ten measurement iterations + * provide stable statistics for SS mode. + */ +@BenchmarkMode(Mode.SingleShotTime) +@Fork(1) +@Warmup(iterations = 5, batchSize = 1) +@Measurement(iterations = 10, batchSize = 1) +@OutputTimeUnit(TimeUnit.MILLISECONDS) +@State(Scope.Benchmark) +public class FileWriteBenchmark { + + @Param({"UNCOMPRESSED", "SNAPPY", "ZSTD", "GZIP", "LZ4_RAW"}) + public String codec; + + @Param({"PARQUET_1_0", "PARQUET_2_0"}) + public String writerVersion; + + @Param({"true", "false"}) + public String dictionary; + + private Group[] rows; + + @Setup(Level.Trial) + public void setup() { + rows = TestDataFactory.generateRows( + TestDataFactory.newGroupFactory(), TestDataFactory.DEFAULT_ROW_COUNT, TestDataFactory.DEFAULT_SEED); + } + + @Benchmark + public void writeFile() throws IOException { + try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(BlackHoleOutputFile.INSTANCE) + .withWriteMode(ParquetFileWriter.Mode.OVERWRITE) + .withType(TestDataFactory.FILE_BENCHMARK_SCHEMA) + .withCompressionCodec(CompressionCodecName.valueOf(codec)) + .withWriterVersion(WriterVersion.valueOf(writerVersion)) + .withDictionaryEncoding(Boolean.parseBoolean(dictionary)) + .build()) { + for (Group row : rows) { + writer.write(row); + } + } + } +} diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FixedLenByteArrayEncodingBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FixedLenByteArrayEncodingBenchmark.java new file mode 100644 index 0000000000..bd4ba406bb --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FixedLenByteArrayEncodingBenchmark.java @@ -0,0 +1,268 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.benchmarks; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.concurrent.TimeUnit; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.bytes.HeapByteBufferAllocator; +import org.apache.parquet.column.Dictionary; +import org.apache.parquet.column.Encoding; +import org.apache.parquet.column.page.DictionaryPage; +import org.apache.parquet.column.values.ValuesReader; +import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.column.values.bytestreamsplit.ByteStreamSplitValuesReaderForFLBA; +import org.apache.parquet.column.values.bytestreamsplit.ByteStreamSplitValuesWriter; +import org.apache.parquet.column.values.deltastrings.DeltaByteArrayReader; +import org.apache.parquet.column.values.deltastrings.DeltaByteArrayWriter; +import org.apache.parquet.column.values.dictionary.DictionaryValuesReader; +import org.apache.parquet.column.values.dictionary.DictionaryValuesWriter; +import org.apache.parquet.column.values.dictionary.PlainValuesDictionary; +import org.apache.parquet.column.values.plain.FixedLenByteArrayPlainValuesReader; +import org.apache.parquet.column.values.plain.FixedLenByteArrayPlainValuesWriter; +import org.apache.parquet.io.api.Binary; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OperationsPerInvocation; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +/** + * Encoding-level micro-benchmarks for FIXED_LEN_BYTE_ARRAY (FLBA) values across + * all supported encodings: PLAIN, DELTA_BYTE_ARRAY, BYTE_STREAM_SPLIT, and DICTIONARY. + * + *

Each benchmark invocation processes {@value #VALUE_COUNT} values; throughput is + * reported per-value via {@link OperationsPerInvocation}. + * + *

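<p>For example, if one invocation (100 000 values) completes in 1 ms, JMH reports on the order + * of 10^8 ops/s, where an op is a single value rather than a whole invocation. + * + *
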
<p>The {@code fixedLength} parameter exercises key FLBA sizes: + * <ul> + *   <li>2 = FLOAT16</li> + *   <li>12 = INT96 (legacy timestamps)</li> + *   <li>16 = UUID</li> + * </ul> + * + *

<p>The {@code dataPattern} parameter controls cardinality: + * <ul> + *   <li>RANDOM = all unique values</li> + *   <li>LOW_CARDINALITY = 100 distinct values (favors dictionary and delta)</li> + * </ul>
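<p>For instance, a UUID-sized value for the {@code fixedLength = 16} case can be built with + * {@code Binary.fromConstantByteArray(new byte[16])}; the {@code TestDataFactory} generators used + * in {@link #setup()} are assumed to emit values of exactly {@code fixedLength} bytes.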
+ */ +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(TimeUnit.SECONDS) +@Fork(1) +@Warmup(iterations = 3, time = 1) +@Measurement(iterations = 5, time = 1) +@State(Scope.Thread) +public class FixedLenByteArrayEncodingBenchmark { + + static final int VALUE_COUNT = 100_000; + private static final int INIT_SLAB_SIZE = 64 * 1024; + private static final int PAGE_SIZE = 4 * 1024 * 1024; + private static final int MAX_DICT_BYTE_SIZE = 4 * 1024 * 1024; + + @Param({"2", "12", "16"}) + public int fixedLength; + + @Param({"RANDOM", "LOW_CARDINALITY"}) + public String dataPattern; + + private Binary[] data; + + // Pre-encoded pages for decode benchmarks + private byte[] plainEncoded; + private byte[] deltaEncoded; + private byte[] bssEncoded; + private byte[] dictDataEncoded; + private Dictionary flbaDictionary; + private boolean dictAvailable; + + @Setup(Level.Trial) + public void setup() throws IOException { + int distinct = "LOW_CARDINALITY".equals(dataPattern) + ? TestDataFactory.LOW_CARDINALITY_DISTINCT + : 0; + data = TestDataFactory.generateFixedLenByteArrays( + VALUE_COUNT, fixedLength, distinct, TestDataFactory.DEFAULT_SEED); + + // Pre-encode for decode benchmarks + plainEncoded = encodeWith(newPlainWriter()); + deltaEncoded = encodeWith(newDeltaWriter()); + bssEncoded = encodeWith(newBssWriter()); + setupDict(); + } + + private byte[] encodeWith(ValuesWriter writer) throws IOException { + for (Binary v : data) { + writer.writeBytes(v); + } + byte[] bytes = writer.getBytes().toByteArray(); + writer.close(); + return bytes; + } + + private void setupDict() throws IOException { + DictionaryValuesWriter.PlainFixedLenArrayDictionaryValuesWriter w = + new DictionaryValuesWriter.PlainFixedLenArrayDictionaryValuesWriter( + MAX_DICT_BYTE_SIZE, fixedLength, Encoding.PLAIN_DICTIONARY, Encoding.PLAIN, + new HeapByteBufferAllocator()); + for (Binary v : data) { + w.writeBytes(v); + } + BenchmarkEncodingUtils.EncodedDictionary enc = BenchmarkEncodingUtils.drainDictionary(w); + dictDataEncoded = enc.dictData; + dictAvailable = !enc.fellBackToPlain(); + if (dictAvailable) { + flbaDictionary = new PlainValuesDictionary.PlainBinaryDictionary(enc.dictPage, fixedLength); + } + } + + // ---- Writer factories ---- + + private FixedLenByteArrayPlainValuesWriter newPlainWriter() { + return new FixedLenByteArrayPlainValuesWriter( + fixedLength, INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + } + + private DeltaByteArrayWriter newDeltaWriter() { + return new DeltaByteArrayWriter(INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + } + + private ByteStreamSplitValuesWriter.FixedLenByteArrayByteStreamSplitValuesWriter newBssWriter() { + return new ByteStreamSplitValuesWriter.FixedLenByteArrayByteStreamSplitValuesWriter( + fixedLength, INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + } + + // ==== ENCODE BENCHMARKS ==== + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodePlain() throws IOException { + return encodeWith(newPlainWriter()); + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodePlainBatch() throws IOException { + FixedLenByteArrayPlainValuesWriter writer = newPlainWriter(); + writer.writeBinaries(data, 0, data.length); + byte[] bytes = writer.getBytes().toByteArray(); + writer.close(); + return bytes; + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeDelta() throws IOException { + return encodeWith(newDeltaWriter()); + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] 
encodeBss() throws IOException { + return encodeWith(newBssWriter()); + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void encodeDictionary(Blackhole bh) throws IOException { + DictionaryValuesWriter.PlainFixedLenArrayDictionaryValuesWriter w = + new DictionaryValuesWriter.PlainFixedLenArrayDictionaryValuesWriter( + MAX_DICT_BYTE_SIZE, fixedLength, Encoding.PLAIN_DICTIONARY, Encoding.PLAIN, + new HeapByteBufferAllocator()); + for (Binary v : data) { + w.writeBytes(v); + } + BenchmarkEncodingUtils.EncodedDictionary enc = BenchmarkEncodingUtils.drainDictionary(w); + bh.consume(enc.dictData); + bh.consume(enc.dictPage); + } + + // ==== DECODE BENCHMARKS ==== + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodePlain(Blackhole bh) throws IOException { + FixedLenByteArrayPlainValuesReader reader = new FixedLenByteArrayPlainValuesReader(fixedLength); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(plainEncoded))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(reader.readBytes()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodePlainBatch(Blackhole bh) throws IOException { + FixedLenByteArrayPlainValuesReader reader = new FixedLenByteArrayPlainValuesReader(fixedLength); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(plainEncoded))); + Binary[] batch = new Binary[VALUE_COUNT]; + reader.readBinaries(batch, 0, VALUE_COUNT); + bh.consume(batch); + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeDelta(Blackhole bh) throws IOException { + DeltaByteArrayReader reader = new DeltaByteArrayReader(); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(deltaEncoded))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(reader.readBytes()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeBss(Blackhole bh) throws IOException { + ByteStreamSplitValuesReaderForFLBA reader = new ByteStreamSplitValuesReaderForFLBA(fixedLength); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(bssEncoded))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(reader.readBytes()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeBssBatch(Blackhole bh) throws IOException { + ByteStreamSplitValuesReaderForFLBA reader = new ByteStreamSplitValuesReaderForFLBA(fixedLength); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(bssEncoded))); + Binary[] batch = new Binary[VALUE_COUNT]; + reader.readBinaries(batch, 0, VALUE_COUNT); + bh.consume(batch); + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeDictionary(Blackhole bh) throws IOException { + if (!dictAvailable) return; + DictionaryValuesReader reader = new DictionaryValuesReader(flbaDictionary); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(dictDataEncoded))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(reader.readBytes()); + } + } +} diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/IntEncodingBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/IntEncodingBenchmark.java new file mode 100644 index 0000000000..71ff35f674 --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/IntEncodingBenchmark.java @@ -0,0 +1,258 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor 
license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.benchmarks; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.concurrent.TimeUnit; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.bytes.HeapByteBufferAllocator; +import org.apache.parquet.column.Dictionary; +import org.apache.parquet.column.Encoding; +import org.apache.parquet.column.page.DictionaryPage; +import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.column.values.bytestreamsplit.ByteStreamSplitValuesReaderForInteger; +import org.apache.parquet.column.values.bytestreamsplit.ByteStreamSplitValuesWriter; +import org.apache.parquet.column.values.delta.DeltaBinaryPackingValuesReader; +import org.apache.parquet.column.values.delta.DeltaBinaryPackingValuesWriterForInteger; +import org.apache.parquet.column.values.dictionary.DictionaryValuesReader; +import org.apache.parquet.column.values.dictionary.DictionaryValuesWriter; +import org.apache.parquet.column.values.dictionary.PlainValuesDictionary; +import org.apache.parquet.column.values.plain.PlainValuesReader; +import org.apache.parquet.column.values.plain.PlainValuesWriter; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OperationsPerInvocation; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +/** + * Encoding-level and decoding-level micro-benchmarks for INT32 values. + * Compares PLAIN, DELTA_BINARY_PACKED, BYTE_STREAM_SPLIT, and DICTIONARY encodings + * across different data distribution patterns. Synthetic dictionary-id RLE decode is + * benchmarked separately in {@link RleDictionaryIndexDecodingBenchmark} so the results + * here stay comparable at the full-value level. + * + *

Each benchmark invocation processes {@value #VALUE_COUNT} values. Throughput is + * reported per-value using {@link OperationsPerInvocation}. + * + *

BYTE_STREAM_SPLIT is included for completeness even though it is rarely a good + * choice for integer data; it exists here to compare the full set of encodings the + * Parquet writer can emit for INT32. + * + *

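<p>For intuition about the layout: BYTE_STREAM_SPLIT places byte k of every value into stream k + * of the page, so the INT32 values 0x11223344 and 0x55667788 (little-endian, as in PLAIN) produce + * the four concatenated streams {44, 88}, {33, 77}, {22, 66}, {11, 55}. + * + *
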
The dictionary encode/decode benchmarks measure the full path: the encoder + * produces both the RLE-encoded indices and a {@link DictionaryPage}; the decoder + * consumes the indices through a {@link DictionaryValuesReader} backed by the same + * dictionary. + */ +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(TimeUnit.SECONDS) +@Fork(1) +@Warmup(iterations = 3, time = 1) +@Measurement(iterations = 5, time = 1) +@State(Scope.Thread) +public class IntEncodingBenchmark { + + static final int VALUE_COUNT = 100_000; + private static final int INIT_SLAB_SIZE = 64 * 1024; + private static final int PAGE_SIZE = 1024 * 1024; + private static final int MAX_DICT_BYTE_SIZE = 1024 * 1024; + + @Param({"SEQUENTIAL", "RANDOM", "LOW_CARDINALITY", "HIGH_CARDINALITY"}) + public String dataPattern; + + private int[] data; + private byte[] plainEncoded; + private byte[] deltaEncoded; + private byte[] bssEncoded; + private byte[] dictDataEncoded; + private DictionaryPage dictPage; + private Dictionary intDictionary; + private boolean dictionaryAvailable; + + @Setup(Level.Trial) + public void setup() throws IOException { + switch (dataPattern) { + case "SEQUENTIAL": + data = TestDataFactory.generateSequentialInts(VALUE_COUNT); + break; + case "RANDOM": + data = TestDataFactory.generateRandomInts(VALUE_COUNT, TestDataFactory.DEFAULT_SEED); + break; + case "LOW_CARDINALITY": + data = TestDataFactory.generateLowCardinalityInts( + VALUE_COUNT, TestDataFactory.LOW_CARDINALITY_DISTINCT, TestDataFactory.DEFAULT_SEED); + break; + case "HIGH_CARDINALITY": + data = TestDataFactory.generateHighCardinalityInts(VALUE_COUNT, TestDataFactory.DEFAULT_SEED); + break; + default: + throw new IllegalArgumentException("Unknown data pattern: " + dataPattern); + } + + // Pre-encode data for decode benchmarks + plainEncoded = encodeWith(newPlainWriter()); + deltaEncoded = encodeWith(newDeltaWriter()); + bssEncoded = encodeWith(newBssWriter()); + + // Pre-encode dictionary data for decode benchmark + DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter dictWriter = newDictWriter(); + for (int v : data) { + dictWriter.writeInteger(v); + } + BenchmarkEncodingUtils.EncodedDictionary encoded = BenchmarkEncodingUtils.drainDictionary(dictWriter); + dictDataEncoded = encoded.dictData; + dictPage = encoded.dictPage; + dictionaryAvailable = !encoded.fellBackToPlain(); + if (dictionaryAvailable) { + intDictionary = new PlainValuesDictionary.PlainIntegerDictionary(dictPage); + } + } + + private byte[] encodeWith(ValuesWriter writer) throws IOException { + for (int v : data) { + writer.writeInteger(v); + } + byte[] bytes = writer.getBytes().toByteArray(); + writer.close(); + return bytes; + } + + private BenchmarkEncodingUtils.EncodedDictionary encodeDictionaryWith( + DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter writer) throws IOException { + for (int v : data) { + writer.writeInteger(v); + } + return BenchmarkEncodingUtils.drainDictionary(writer); + } + + // ---- Writer factories ---- + + private static PlainValuesWriter newPlainWriter() { + return new PlainValuesWriter(INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + } + + private static DeltaBinaryPackingValuesWriterForInteger newDeltaWriter() { + return new DeltaBinaryPackingValuesWriterForInteger(INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + } + + private static ByteStreamSplitValuesWriter.IntegerByteStreamSplitValuesWriter newBssWriter() { + return new ByteStreamSplitValuesWriter.IntegerByteStreamSplitValuesWriter( + INIT_SLAB_SIZE, PAGE_SIZE, 
new HeapByteBufferAllocator()); + } + + private static DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter newDictWriter() { + return new DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter( + MAX_DICT_BYTE_SIZE, Encoding.PLAIN_DICTIONARY, Encoding.PLAIN, new HeapByteBufferAllocator()); + } + + // ---- Encode benchmarks ---- + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodePlain() throws IOException { + return encodeWith(newPlainWriter()); + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodePlainBatch() throws IOException { + PlainValuesWriter writer = newPlainWriter(); + writer.writeIntegers(data, 0, data.length); + byte[] bytes = writer.getBytes().toByteArray(); + writer.close(); + return bytes; + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeDelta() throws IOException { + return encodeWith(newDeltaWriter()); + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeByteStreamSplit() throws IOException { + return encodeWith(newBssWriter()); + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void encodeDictionary(Blackhole bh) throws IOException { + BenchmarkEncodingUtils.EncodedDictionary encoded = encodeDictionaryWith(newDictWriter()); + bh.consume(encoded.dictData); + bh.consume(encoded.dictPage); + } + + // ---- Decode benchmarks ---- + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodePlain(Blackhole bh) throws IOException { + PlainValuesReader.IntegerPlainValuesReader reader = new PlainValuesReader.IntegerPlainValuesReader(); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(plainEncoded))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(reader.readInteger()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeDelta(Blackhole bh) throws IOException { + DeltaBinaryPackingValuesReader reader = new DeltaBinaryPackingValuesReader(); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(deltaEncoded))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(reader.readInteger()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeByteStreamSplit(Blackhole bh) throws IOException { + ByteStreamSplitValuesReaderForInteger reader = new ByteStreamSplitValuesReaderForInteger(); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(bssEncoded))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(reader.readInteger()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeDictionary(Blackhole bh) throws IOException { + if (!dictionaryAvailable) { + // Dictionary fell back to plain encoding (e.g. very large unique-value sets + // exceeding MAX_DICT_BYTE_SIZE). Skip to keep the benchmark meaningful. 
+ return; + } + DictionaryValuesReader reader = new DictionaryValuesReader(intDictionary); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(dictDataEncoded))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(reader.readInteger()); + } + } +} diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/PlainDecodingBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/PlainDecodingBenchmark.java new file mode 100644 index 0000000000..eea9f01986 --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/PlainDecodingBenchmark.java @@ -0,0 +1,204 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.benchmarks; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.bytes.HeapByteBufferAllocator; +import org.apache.parquet.column.values.plain.PlainValuesReader; +import org.apache.parquet.column.values.plain.PlainValuesWriter; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OperationsPerInvocation; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +/** + * Decoding micro-benchmarks for the PLAIN encoding across the four numeric primitive + * types: {@code INT32}, {@code INT64}, {@code FLOAT}, {@code DOUBLE}. + * + *

Each invocation decodes {@value #VALUE_COUNT} values. Per-value methods measure + * scalar read throughput; batch methods measure bulk array-fill throughput using + * {@link PlainValuesReader}'s bulk {@code ByteBuffer} view reads. + * + *

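<p>The bulk-read technique under test amounts to a typed view copy; a minimal sketch of the idea, + * given a {@code ByteBuffer page} of PLAIN-encoded bytes (the reader's actual implementation may + * differ): + * <pre>{@code + * IntBuffer view = page.order(ByteOrder.LITTLE_ENDIAN).asIntBuffer(); + * view.get(dest, 0, count); // one bulk copy instead of count scalar reads + * }</pre> + * + *
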
INT32 per-value and batch decode are also available in {@link IntEncodingBenchmark} + * alongside other INT32 encodings. This benchmark focuses on the PLAIN encoding path + * for all four types to validate the bulk view buffer optimization uniformly. + */ +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(TimeUnit.SECONDS) +@Fork(1) +@Warmup(iterations = 3, time = 1) +@Measurement(iterations = 5, time = 1) +@State(Scope.Thread) +public class PlainDecodingBenchmark { + + static final int VALUE_COUNT = 100_000; + private static final int INIT_SLAB_SIZE = 64 * 1024; + private static final int PAGE_SIZE = 1024 * 1024; + + private byte[] intPage; + private byte[] longPage; + private byte[] floatPage; + private byte[] doublePage; + + // Pre-allocated destination arrays to avoid per-invocation allocation noise + private int[] intDest; + private long[] longDest; + private float[] floatDest; + private double[] doubleDest; + + @Setup(Level.Trial) + public void setup() throws IOException { + Random r = new Random(42); + + // Pre-allocate destination arrays + intDest = new int[VALUE_COUNT]; + longDest = new long[VALUE_COUNT]; + floatDest = new float[VALUE_COUNT]; + doubleDest = new double[VALUE_COUNT]; + + // Encode INT32 + PlainValuesWriter w = new PlainValuesWriter(INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + for (int i = 0; i < VALUE_COUNT; i++) { + w.writeInteger(r.nextInt()); + } + intPage = w.getBytes().toByteArray(); + w.close(); + + // Encode INT64 + w = new PlainValuesWriter(INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + for (int i = 0; i < VALUE_COUNT; i++) { + w.writeLong(r.nextLong()); + } + longPage = w.getBytes().toByteArray(); + w.close(); + + // Encode FLOAT + w = new PlainValuesWriter(INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + for (int i = 0; i < VALUE_COUNT; i++) { + w.writeFloat(r.nextFloat()); + } + floatPage = w.getBytes().toByteArray(); + w.close(); + + // Encode DOUBLE + w = new PlainValuesWriter(INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + for (int i = 0; i < VALUE_COUNT; i++) { + w.writeDouble(r.nextDouble()); + } + doublePage = w.getBytes().toByteArray(); + w.close(); + } + + // ---- INT32 ---- + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeInt(Blackhole bh) throws IOException { + PlainValuesReader.IntegerPlainValuesReader reader = new PlainValuesReader.IntegerPlainValuesReader(); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(intPage))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(reader.readInteger()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public int[] decodeIntBatch() throws IOException { + PlainValuesReader.IntegerPlainValuesReader reader = new PlainValuesReader.IntegerPlainValuesReader(); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(intPage))); + reader.readIntegers(intDest, 0, VALUE_COUNT); + return intDest; + } + + // ---- INT64 ---- + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeLong(Blackhole bh) throws IOException { + PlainValuesReader.LongPlainValuesReader reader = new PlainValuesReader.LongPlainValuesReader(); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(longPage))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(reader.readLong()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public long[] decodeLongBatch() throws IOException { + PlainValuesReader.LongPlainValuesReader reader = new 
PlainValuesReader.LongPlainValuesReader(); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(longPage))); + reader.readLongs(longDest, 0, VALUE_COUNT); + return longDest; + } + + // ---- FLOAT ---- + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeFloat(Blackhole bh) throws IOException { + PlainValuesReader.FloatPlainValuesReader reader = new PlainValuesReader.FloatPlainValuesReader(); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(floatPage))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(reader.readFloat()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public float[] decodeFloatBatch() throws IOException { + PlainValuesReader.FloatPlainValuesReader reader = new PlainValuesReader.FloatPlainValuesReader(); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(floatPage))); + reader.readFloats(floatDest, 0, VALUE_COUNT); + return floatDest; + } + + // ---- DOUBLE ---- + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeDouble(Blackhole bh) throws IOException { + PlainValuesReader.DoublePlainValuesReader reader = new PlainValuesReader.DoublePlainValuesReader(); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(doublePage))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(reader.readDouble()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public double[] decodeDoubleBatch() throws IOException { + PlainValuesReader.DoublePlainValuesReader reader = new PlainValuesReader.DoublePlainValuesReader(); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(doublePage))); + reader.readDoubles(doubleDest, 0, VALUE_COUNT); + return doubleDest; + } +} diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/PlainEncodingBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/PlainEncodingBenchmark.java new file mode 100644 index 0000000000..bd85eb22b2 --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/PlainEncodingBenchmark.java @@ -0,0 +1,179 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.benchmarks; + +import java.io.IOException; +import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.parquet.bytes.HeapByteBufferAllocator; +import org.apache.parquet.column.values.plain.PlainValuesWriter; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OperationsPerInvocation; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; + +/** + * Encoding micro-benchmarks for the PLAIN encoding across the four numeric primitive + * types: {@code INT32}, {@code INT64}, {@code FLOAT}, {@code DOUBLE}. + * + *

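<p>The batch write path measured below amounts to a typed view transfer into the output slab; a + * sketch of the idea, given a {@code ByteBuffer slab} (the writer's actual implementation may differ): + * <pre>{@code + * slab.order(ByteOrder.LITTLE_ENDIAN).asDoubleBuffer().put(data, 0, count); + * slab.position(slab.position() + count * Double.BYTES); // the view keeps its own position + * }</pre> + * + *
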
Compares per-value scalar writes vs bulk batch writes using + * {@link PlainValuesWriter}'s {@code writeIntegers}, {@code writeLongs}, + * {@code writeFloats}, {@code writeDoubles} methods backed by bulk + * {@code ByteBuffer} view transfers in {@code CapacityByteArrayOutputStream}. + */ +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(TimeUnit.SECONDS) +@Fork(1) +@Warmup(iterations = 3, time = 1) +@Measurement(iterations = 5, time = 1) +@State(Scope.Thread) +public class PlainEncodingBenchmark { + + static final int VALUE_COUNT = 100_000; + private static final int INIT_SLAB_SIZE = 64 * 1024; + private static final int PAGE_SIZE = 1024 * 1024; + + private int[] intData; + private long[] longData; + private float[] floatData; + private double[] doubleData; + + @Setup(Level.Trial) + public void setup() { + Random r = new Random(42); + intData = new int[VALUE_COUNT]; + longData = new long[VALUE_COUNT]; + floatData = new float[VALUE_COUNT]; + doubleData = new double[VALUE_COUNT]; + for (int i = 0; i < VALUE_COUNT; i++) { + intData[i] = r.nextInt(); + longData[i] = r.nextLong(); + floatData[i] = r.nextFloat(); + doubleData[i] = r.nextDouble(); + } + } + + private static PlainValuesWriter newWriter() { + return new PlainValuesWriter(INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + } + + // ---- INT32 ---- + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeInt() throws IOException { + PlainValuesWriter w = newWriter(); + for (int v : intData) { + w.writeInteger(v); + } + byte[] bytes = w.getBytes().toByteArray(); + w.close(); + return bytes; + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeIntBatch() throws IOException { + PlainValuesWriter w = newWriter(); + w.writeIntegers(intData, 0, VALUE_COUNT); + byte[] bytes = w.getBytes().toByteArray(); + w.close(); + return bytes; + } + + // ---- INT64 ---- + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeLong() throws IOException { + PlainValuesWriter w = newWriter(); + for (long v : longData) { + w.writeLong(v); + } + byte[] bytes = w.getBytes().toByteArray(); + w.close(); + return bytes; + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeLongBatch() throws IOException { + PlainValuesWriter w = newWriter(); + w.writeLongs(longData, 0, VALUE_COUNT); + byte[] bytes = w.getBytes().toByteArray(); + w.close(); + return bytes; + } + + // ---- FLOAT ---- + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeFloat() throws IOException { + PlainValuesWriter w = newWriter(); + for (float v : floatData) { + w.writeFloat(v); + } + byte[] bytes = w.getBytes().toByteArray(); + w.close(); + return bytes; + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeFloatBatch() throws IOException { + PlainValuesWriter w = newWriter(); + w.writeFloats(floatData, 0, VALUE_COUNT); + byte[] bytes = w.getBytes().toByteArray(); + w.close(); + return bytes; + } + + // ---- DOUBLE ---- + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeDouble() throws IOException { + PlainValuesWriter w = newWriter(); + for (double v : doubleData) { + w.writeDouble(v); + } + byte[] bytes = w.getBytes().toByteArray(); + w.close(); + return bytes; + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeDoubleBatch() throws IOException { + PlainValuesWriter w = newWriter(); + w.writeDoubles(doubleData, 0, VALUE_COUNT); + byte[] bytes = w.getBytes().toByteArray(); + w.close(); 
+ return bytes; + } +} diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/RleDictionaryIndexDecodingBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/RleDictionaryIndexDecodingBenchmark.java new file mode 100644 index 0000000000..f60349c2a0 --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/RleDictionaryIndexDecodingBenchmark.java @@ -0,0 +1,181 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.benchmarks; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.concurrent.TimeUnit; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.bytes.HeapByteBufferAllocator; +import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridDecoder; +import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridEncoder; +import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridValuesReader; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OperationsPerInvocation; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +/** + * Encoding and decoding micro-benchmarks for synthetic dictionary-id pages using + * {@link RunLengthBitPackingHybridEncoder} and {@link RunLengthBitPackingHybridDecoder}. + * This isolates the RLE/bit-packing hybrid codec paths and is intentionally + * separate from {@link IntEncodingBenchmark}, which measures full INT32 value + * encode/decode paths. + * + *

The encode benchmark measures the RLE encoder's {@code pack32Values} fast path + * and bit-packing throughput. The decode benchmark measures the corresponding + * {@code unpack32Values} fast path and RLE run expansion. + * + *

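<p>For a feel of the format at {@code BIT_WIDTH = 10}: an RLE run of 100 identical ids costs a + * short varint header plus two value bytes, while a bit-packed run of 32 distinct ids costs a + * one-byte header plus 32 * 10 / 8 = 40 payload bytes. + * + *
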
Per-invocation overhead (encoder/decoder construction and {@link ByteBufferInputStream} + * wrapping) is amortized over {@value #VALUE_COUNT} reads via + * {@link OperationsPerInvocation}. + */ +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(TimeUnit.SECONDS) +@Fork(1) +@Warmup(iterations = 3, time = 1) +@Measurement(iterations = 5, time = 1) +@State(Scope.Thread) +public class RleDictionaryIndexDecodingBenchmark { + + static final int VALUE_COUNT = 100_000; + private static final int INIT_SLAB_SIZE = 64 * 1024; + private static final int PAGE_SIZE = 1024 * 1024; + private static final int BIT_WIDTH = 10; + private static final int MAX_ID = 1 << BIT_WIDTH; + + static { + if (TestDataFactory.LOW_CARDINALITY_DISTINCT > MAX_ID) { + throw new IllegalStateException("LOW_CARDINALITY_DISTINCT (" + TestDataFactory.LOW_CARDINALITY_DISTINCT + + ") must fit within BIT_WIDTH=" + BIT_WIDTH + " (MAX_ID=" + MAX_ID + ")"); + } + } + + @Param({"SEQUENTIAL", "RANDOM", "LOW_CARDINALITY"}) + public String indexPattern; + + private byte[] encoded; + + private int[] ids; + + // encoded with 4-byte LE length prefix, as expected by ValuesReader.initFromPage() + private byte[] encodedWithLengthPrefix; + + @Setup(Level.Trial) + public void setup() throws IOException { + ids = generateDictionaryIds(); + try (RunLengthBitPackingHybridEncoder encoder = new RunLengthBitPackingHybridEncoder( + BIT_WIDTH, INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator())) { + for (int id : ids) { + encoder.writeInt(id); + } + encoded = encoder.toBytes().toByteArray(); + } + + // Prepend 4-byte LE length for ValuesReader.initFromPage() format + encodedWithLengthPrefix = new byte[4 + encoded.length]; + ByteBuffer.wrap(encodedWithLengthPrefix).order(ByteOrder.LITTLE_ENDIAN).putInt(encoded.length); + System.arraycopy(encoded, 0, encodedWithLengthPrefix, 4, encoded.length); + } + + private int[] generateDictionaryIds() { + switch (indexPattern) { + case "SEQUENTIAL": + int[] sequential = new int[VALUE_COUNT]; + for (int i = 0; i < VALUE_COUNT; i++) { + sequential[i] = i % MAX_ID; + } + return sequential; + case "RANDOM": + return TestDataFactory.generateLowCardinalityInts(VALUE_COUNT, MAX_ID, TestDataFactory.DEFAULT_SEED); + case "LOW_CARDINALITY": + return TestDataFactory.generateLowCardinalityInts( + VALUE_COUNT, TestDataFactory.LOW_CARDINALITY_DISTINCT, TestDataFactory.DEFAULT_SEED); + default: + throw new IllegalArgumentException("Unknown index pattern: " + indexPattern); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeDictionaryIds() throws IOException { + try (RunLengthBitPackingHybridEncoder encoder = new RunLengthBitPackingHybridEncoder( + BIT_WIDTH, INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator())) { + for (int id : ids) { + encoder.writeInt(id); + } + return encoder.toBytes().toByteArray(); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeDictionaryIds(Blackhole bh) throws IOException { + RunLengthBitPackingHybridDecoder decoder = + new RunLengthBitPackingHybridDecoder(BIT_WIDTH, ByteBufferInputStream.wrap(ByteBuffer.wrap(encoded))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(decoder.readInt()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public int[] decodeDictionaryIdsBatch() throws IOException { + RunLengthBitPackingHybridDecoder decoder = + new RunLengthBitPackingHybridDecoder(BIT_WIDTH, ByteBufferInputStream.wrap(ByteBuffer.wrap(encoded))); + int[] result = new int[VALUE_COUNT]; + decoder.readInts(result, 
0, VALUE_COUNT); + return result; + } + + // ---- ValuesReader-level benchmarks ---- + // These go through the RunLengthBitPackingHybridValuesReader wrapper, + // which is the path used by ColumnReader in production. + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeValuesReader(Blackhole bh) throws IOException { + RunLengthBitPackingHybridValuesReader reader = new RunLengthBitPackingHybridValuesReader(BIT_WIDTH); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(encodedWithLengthPrefix))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(reader.readInteger()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public int[] decodeValuesReaderBatch() throws IOException { + RunLengthBitPackingHybridValuesReader reader = new RunLengthBitPackingHybridValuesReader(BIT_WIDTH); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(encodedWithLengthPrefix))); + int[] result = new int[VALUE_COUNT]; + reader.readIntegers(result, 0, VALUE_COUNT); + return result; + } +} diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/RowGroupFlushBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/RowGroupFlushBenchmark.java new file mode 100644 index 0000000000..9bc5cab0a8 --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/RowGroupFlushBenchmark.java @@ -0,0 +1,191 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.benchmarks; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Random; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; +import org.apache.parquet.bytes.ByteBufferAllocator; +import org.apache.parquet.bytes.HeapByteBufferAllocator; +import org.apache.parquet.column.ParquetProperties.WriterVersion; +import org.apache.parquet.example.data.Group; +import org.apache.parquet.example.data.simple.SimpleGroupFactory; +import org.apache.parquet.hadoop.ParquetFileWriter; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.hadoop.example.ExampleParquetWriter; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import org.apache.parquet.schema.Types; +import org.openjdk.jmh.annotations.AuxCounters; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; + +/** + * Benchmark measuring row group flush performance and peak buffer memory. + * + *

Uses a wide schema (20 BINARY columns, 200 bytes each) to produce + * substantial per-column page buffers. A {@link PeakTrackingAllocator} + * wraps the heap allocator to precisely track the peak bytes outstanding + * across all parquet-managed ByteBuffers (independent of JVM GC behavior). + * + *
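+ * <p>At {@code ROW_COUNT} = 100,000 rows this is roughly 20 columns × 200 bytes
+ * × 100,000 ≈ 400 MB of raw value data, so a single benchmark invocation should
+ * flush many row groups even at the larger 64 MiB setting.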

The key metric is {@code peakAllocatorMB}: with the interleaved flush + * optimization, each column's pages are finalized, written, and released + * before the next column is processed, so peak buffer memory is roughly + * 1/N of the total row group size (N = number of columns). + * + *
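+ * <p>For example, at the 64 MiB {@code rowGroupSize} setting with 20 columns,
+ * interleaved flushing should report {@code peakAllocatorMB} near 64 / 20 ≈ 3.2 MB,
+ * whereas finalizing every column before releasing any buffer would report close
+ * to the full 64 MB.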

Writes to {@link BlackHoleOutputFile} to isolate flush cost from + * filesystem I/O. + */ +@BenchmarkMode({Mode.AverageTime}) +@Fork( + value = 1, + jvmArgs = {"-Xms512m", "-Xmx1g"}) +@Warmup(iterations = 3) +@Measurement(iterations = 5) +@OutputTimeUnit(TimeUnit.MILLISECONDS) +@State(Scope.Thread) +public class RowGroupFlushBenchmark { + + private static final int COLUMN_COUNT = 20; + private static final int BINARY_VALUE_LENGTH = 200; + private static final int ROW_COUNT = 100_000; + + /** Row group sizes: 8MB and 64MB. */ + @Param({"8388608", "67108864"}) + public int rowGroupSize; + + /** Wide schema: 20 required BINARY columns. */ + private static final MessageType WIDE_SCHEMA; + + static { + Types.MessageTypeBuilder builder = Types.buildMessage(); + for (int c = 0; c < COLUMN_COUNT; c++) { + builder.required(PrimitiveTypeName.BINARY).named("col_" + c); + } + WIDE_SCHEMA = builder.named("wide_record"); + } + + /** Pre-generated column values (one unique value per column). */ + private Binary[] columnValues; + + @Setup(Level.Trial) + public void setup() { + Random random = new Random(42); + columnValues = new Binary[COLUMN_COUNT]; + for (int c = 0; c < COLUMN_COUNT; c++) { + byte[] value = new byte[BINARY_VALUE_LENGTH]; + random.nextBytes(value); + columnValues[c] = Binary.fromConstantByteArray(value); + } + } + + /** + * Auxiliary counters reported alongside timing. JMH collects these after + * each iteration. + */ + @AuxCounters(AuxCounters.Type.EVENTS) + @State(Scope.Thread) + public static class MemoryCounters { + /** Peak bytes outstanding in the parquet ByteBufferAllocator. */ + public long peakAllocatorBytes; + + /** Convenience: peak in MB (peakAllocatorBytes / 1048576). */ + public double peakAllocatorMB; + + @Setup(Level.Iteration) + public void reset() { + peakAllocatorBytes = 0; + peakAllocatorMB = 0; + } + } + + /** + * ByteBufferAllocator wrapper that tracks current and peak allocated bytes. + * Thread-safe (uses AtomicLong) although the write path is single-threaded. 
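+ * <p>Peak tracking uses the usual lock-free fold: each allocation adds its size
+ * to the current counter and then merges the observed total into the peak via
+ * {@code accumulateAndGet(current, Math::max)}, so no allocate/release
+ * interleaving can lose a momentary peak.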
+ */ + static class PeakTrackingAllocator implements ByteBufferAllocator { + private final ByteBufferAllocator delegate = new HeapByteBufferAllocator(); + private final AtomicLong currentBytes = new AtomicLong(); + private final AtomicLong peakBytes = new AtomicLong(); + + @Override + public ByteBuffer allocate(int size) { + ByteBuffer buf = delegate.allocate(size); + long current = currentBytes.addAndGet(buf.capacity()); + peakBytes.accumulateAndGet(current, Math::max); + return buf; + } + + @Override + public void release(ByteBuffer buf) { + currentBytes.addAndGet(-buf.capacity()); + delegate.release(buf); + } + + @Override + public boolean isDirect() { + return delegate.isDirect(); + } + + long getPeakBytes() { + return peakBytes.get(); + } + } + + @Benchmark + public void writeWithFlush(MemoryCounters counters) throws IOException { + PeakTrackingAllocator allocator = new PeakTrackingAllocator(); + SimpleGroupFactory factory = new SimpleGroupFactory(WIDE_SCHEMA); + + try (ParquetWriter writer = ExampleParquetWriter.builder(BlackHoleOutputFile.INSTANCE) + .withWriteMode(ParquetFileWriter.Mode.OVERWRITE) + .withType(WIDE_SCHEMA) + .withCompressionCodec(CompressionCodecName.UNCOMPRESSED) + .withWriterVersion(WriterVersion.PARQUET_1_0) + .withRowGroupSize(rowGroupSize) + .withDictionaryEncoding(false) + .withAllocator(allocator) + .build()) { + for (int i = 0; i < ROW_COUNT; i++) { + Group group = factory.newGroup(); + for (int c = 0; c < COLUMN_COUNT; c++) { + group.append("col_" + c, columnValues[c]); + } + writer.write(group); + } + } + + counters.peakAllocatorBytes = allocator.getPeakBytes(); + counters.peakAllocatorMB = allocator.getPeakBytes() / (1024.0 * 1024.0); + } +} diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/TestDataFactory.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/TestDataFactory.java new file mode 100644 index 0000000000..93cc714730 --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/TestDataFactory.java @@ -0,0 +1,377 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.benchmarks; + +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BOOLEAN; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64; + +import java.nio.charset.StandardCharsets; +import java.util.Random; +import org.apache.parquet.example.data.Group; +import org.apache.parquet.example.data.simple.SimpleGroupFactory; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Types; + +/** + * Utility class for generating test schemas and data for benchmarks. + */ +public final class TestDataFactory { + + /** Default number of rows for file-level benchmarks. */ + public static final int DEFAULT_ROW_COUNT = 100_000; + + /** Number of distinct values for low-cardinality data patterns. */ + public static final int LOW_CARDINALITY_DISTINCT = 100; + + /** Default RNG seed used across benchmarks for deterministic data. */ + public static final long DEFAULT_SEED = 42L; + + /** A standard multi-type schema used by file-level benchmarks. */ + public static final MessageType FILE_BENCHMARK_SCHEMA = Types.buildMessage() + .required(INT32) + .named("int32_field") + .required(INT64) + .named("int64_field") + .required(FLOAT) + .named("float_field") + .required(DOUBLE) + .named("double_field") + .required(BOOLEAN) + .named("boolean_field") + .required(BINARY) + .named("binary_field") + .named("benchmark_record"); + + private TestDataFactory() {} + + /** + * Creates a {@link SimpleGroupFactory} for the standard benchmark schema. + */ + public static SimpleGroupFactory newGroupFactory() { + return new SimpleGroupFactory(FILE_BENCHMARK_SCHEMA); + } + + /** + * Generates a single row of benchmark data. + * + * @param factory the group factory + * @param index the row index (used for deterministic data) + * @param random the random source + * @return a populated Group + */ + public static Group generateRow(SimpleGroupFactory factory, int index, Random random) { + return factory.newGroup() + .append("int32_field", index) + .append("int64_field", (long) index * 100) + .append("float_field", random.nextFloat()) + .append("double_field", random.nextDouble()) + .append("boolean_field", index % 2 == 0) + .append("binary_field", "value_" + (index % 1000)); + } + + /** + * Generates a deterministic set of rows for file-level benchmarks. + */ + public static Group[] generateRows(SimpleGroupFactory factory, int rowCount, long seed) { + Group[] rows = new Group[rowCount]; + Random random = new Random(seed); + for (int i = 0; i < rowCount; i++) { + rows[i] = generateRow(factory, i, random); + } + return rows; + } + + // ---- Integer data generation for encoding benchmarks ---- + + /** + * Generates sequential integers: 0, 1, 2, ... + */ + public static int[] generateSequentialInts(int count) { + int[] data = new int[count]; + for (int i = 0; i < count; i++) { + data[i] = i; + } + return data; + } + + /** + * Generates uniformly random integers using the given seed. + */ + public static int[] generateRandomInts(int count, long seed) { + return generateRandomInts(count, new Random(seed)); + } + + /** + * Generates uniformly random integers. + * + *

Note: prefer {@link #generateRandomInts(int, long)} when call ordering between + * generators in the same setup must not influence the produced data. + */ + public static int[] generateRandomInts(int count, Random random) { + int[] data = new int[count]; + for (int i = 0; i < count; i++) { + data[i] = random.nextInt(); + } + return data; + } + + /** + * Generates low-cardinality integers (values drawn from a small set) using the given seed. + */ + public static int[] generateLowCardinalityInts(int count, int distinctValues, long seed) { + return generateLowCardinalityInts(count, distinctValues, new Random(seed)); + } + + /** + * Generates low-cardinality integers (values drawn from a small set). + */ + public static int[] generateLowCardinalityInts(int count, int distinctValues, Random random) { + int[] data = new int[count]; + for (int i = 0; i < count; i++) { + data[i] = random.nextInt(distinctValues); + } + return data; + } + + /** + * Generates high-cardinality integers (all unique in randomized order) using the given seed. + */ + public static int[] generateHighCardinalityInts(int count, long seed) { + return generateHighCardinalityInts(count, new Random(seed)); + } + + /** + * Generates high-cardinality integers (all unique in randomized order). + */ + public static int[] generateHighCardinalityInts(int count, Random random) { + int[] data = generateSequentialInts(count); + for (int i = count - 1; i > 0; i--) { + int swapIndex = random.nextInt(i + 1); + int tmp = data[i]; + data[i] = data[swapIndex]; + data[swapIndex] = tmp; + } + return data; + } + + // ---- Long data generation for encoding benchmarks ---- + + /** + * Generates sequential longs: 0, 1, 2, ... + */ + public static long[] generateSequentialLongs(int count) { + long[] data = new long[count]; + for (int i = 0; i < count; i++) { + data[i] = i; + } + return data; + } + + /** + * Generates uniformly random longs using the given seed. + */ + public static long[] generateRandomLongs(int count, long seed) { + Random random = new Random(seed); + long[] data = new long[count]; + for (int i = 0; i < count; i++) { + data[i] = random.nextLong(); + } + return data; + } + + /** + * Generates low-cardinality longs (values drawn from a small set). + */ + public static long[] generateLowCardinalityLongs(int count, int distinctValues, long seed) { + Random random = new Random(seed); + long[] palette = new long[distinctValues]; + for (int i = 0; i < distinctValues; i++) { + palette[i] = random.nextLong(); + } + long[] data = new long[count]; + for (int i = 0; i < count; i++) { + data[i] = palette[random.nextInt(distinctValues)]; + } + return data; + } + + /** + * Generates high-cardinality longs (all unique, shuffled). + */ + public static long[] generateHighCardinalityLongs(int count, long seed) { + Random random = new Random(seed); + long[] data = generateSequentialLongs(count); + for (int i = count - 1; i > 0; i--) { + int swapIndex = random.nextInt(i + 1); + long tmp = data[i]; + data[i] = data[swapIndex]; + data[swapIndex] = tmp; + } + return data; + } + + // ---- Float data generation for encoding benchmarks ---- + + /** + * Generates uniformly random floats using the given seed. + */ + public static float[] generateRandomFloats(int count, long seed) { + Random random = new Random(seed); + float[] data = new float[count]; + for (int i = 0; i < count; i++) { + data[i] = random.nextFloat() * 1000.0f; + } + return data; + } + + /** + * Generates low-cardinality floats (values drawn from a small set). 
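To make the ordering caveat from the {@code generateRandomInts} note above concrete, the following sketch (a hypothetical test, not part of this patch) shows why the seed overloads are order-independent while the shared-{@code Random} overloads are not:

```java
import java.util.Arrays;
import java.util.Random;
import org.apache.parquet.benchmarks.TestDataFactory;

public class SeedOrderingDemo {
  public static void main(String[] args) {
    // Seed overloads: each array is a pure function of (count, seed), so
    // reordering the two calls cannot change either result.
    int[] a1 = TestDataFactory.generateRandomInts(8, 42L);
    int[] b1 = TestDataFactory.generateLowCardinalityInts(8, 4, 43L);

    // Shared-Random overloads: the second call starts wherever the first
    // left the RNG, so swapping the calls changes the produced data.
    Random shared = new Random(42L);
    int[] a2 = TestDataFactory.generateRandomInts(8, shared);
    int[] b2 = TestDataFactory.generateLowCardinalityInts(8, 4, shared);

    System.out.println(Arrays.equals(a1, a2)); // true: same seed, first consumer
    System.out.println(Arrays.equals(b1, b2)); // false (almost surely): b2 starts from a2's RNG state
  }
}
```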
+ */ + public static float[] generateLowCardinalityFloats(int count, int distinctValues, long seed) { + Random random = new Random(seed); + float[] palette = new float[distinctValues]; + for (int i = 0; i < distinctValues; i++) { + palette[i] = random.nextFloat() * 1000.0f; + } + float[] data = new float[count]; + for (int i = 0; i < count; i++) { + data[i] = palette[random.nextInt(distinctValues)]; + } + return data; + } + + // ---- Double data generation for encoding benchmarks ---- + + /** + * Generates uniformly random doubles using the given seed. + */ + public static double[] generateRandomDoubles(int count, long seed) { + Random random = new Random(seed); + double[] data = new double[count]; + for (int i = 0; i < count; i++) { + data[i] = random.nextDouble() * 1000.0; + } + return data; + } + + /** + * Generates low-cardinality doubles (values drawn from a small set). + */ + public static double[] generateLowCardinalityDoubles(int count, int distinctValues, long seed) { + Random random = new Random(seed); + double[] palette = new double[distinctValues]; + for (int i = 0; i < distinctValues; i++) { + palette[i] = random.nextDouble() * 1000.0; + } + double[] data = new double[count]; + for (int i = 0; i < count; i++) { + data[i] = palette[random.nextInt(distinctValues)]; + } + return data; + } + + // ---- Fixed-length byte array data generation for encoding benchmarks ---- + + /** + * Generates fixed-length byte arrays with the specified cardinality. + * + * @param count number of values + * @param length byte length of each value + * @param distinct number of distinct values (0 means all unique) + * @param seed RNG seed + */ + public static Binary[] generateFixedLenByteArrays(int count, int length, int distinct, long seed) { + Random random = new Random(seed); + if (distinct > 0) { + Binary[] palette = new Binary[distinct]; + for (int i = 0; i < distinct; i++) { + byte[] bytes = new byte[length]; + random.nextBytes(bytes); + palette[i] = Binary.fromConstantByteArray(bytes); + } + Binary[] data = new Binary[count]; + for (int i = 0; i < count; i++) { + data[i] = palette[random.nextInt(distinct)]; + } + return data; + } else { + Binary[] data = new Binary[count]; + for (int i = 0; i < count; i++) { + byte[] bytes = new byte[length]; + random.nextBytes(bytes); + data[i] = Binary.fromConstantByteArray(bytes); + } + return data; + } + } + + // ---- Binary data generation for encoding benchmarks ---- + + /** + * Generates binary strings of the given length with the specified cardinality, using + * a deterministic seed. + */ + public static Binary[] generateBinaryData(int count, int stringLength, int distinct, long seed) { + return generateBinaryData(count, stringLength, distinct, new Random(seed)); + } + + /** + * Generates binary strings of the given length with the specified cardinality. 
+ * + * @param count number of values + * @param stringLength length of each string + * @param distinct number of distinct values (0 means all unique) + * @param random random source + * @return array of Binary values + */ + public static Binary[] generateBinaryData(int count, int stringLength, int distinct, Random random) { + Binary[] data = new Binary[count]; + if (distinct > 0) { + // Pre-generate the distinct values + Binary[] dictionary = new Binary[distinct]; + for (int i = 0; i < distinct; i++) { + dictionary[i] = Binary.fromConstantByteArray( + randomString(stringLength, random).getBytes(StandardCharsets.UTF_8)); + } + for (int i = 0; i < count; i++) { + data[i] = dictionary[random.nextInt(distinct)]; + } + } else { + // All unique + for (int i = 0; i < count; i++) { + data[i] = Binary.fromConstantByteArray( + randomString(stringLength, random).getBytes(StandardCharsets.UTF_8)); + } + } + return data; + } + + private static String randomString(int length, Random random) { + StringBuilder sb = new StringBuilder(length); + for (int i = 0; i < length; i++) { + sb.append((char) ('a' + random.nextInt(26))); + } + return sb.toString(); + } +} diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/ValuesReader.java b/parquet-column/src/main/java/org/apache/parquet/column/values/ValuesReader.java index 1713acc012..114936d153 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/ValuesReader.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/ValuesReader.java @@ -185,6 +185,88 @@ public long readLong() { throw new UnsupportedOperationException(); } + // ---- Batch read methods ---- + // Default implementations loop over the per-value methods. + // Subclasses should override with bulk/memcpy-style implementations. + + /** + * Reads {@code count} integers into {@code dest} starting at {@code offset}. + * + * @param dest destination array + * @param offset start index in dest + * @param count number of values to read + */ + public void readIntegers(int[] dest, int offset, int count) { + for (int i = 0; i < count; i++) { + dest[offset + i] = readInteger(); + } + } + + /** + * Reads {@code count} longs into {@code dest} starting at {@code offset}. + * + * @param dest destination array + * @param offset start index in dest + * @param count number of values to read + */ + public void readLongs(long[] dest, int offset, int count) { + for (int i = 0; i < count; i++) { + dest[offset + i] = readLong(); + } + } + + /** + * Reads {@code count} floats into {@code dest} starting at {@code offset}. + * + * @param dest destination array + * @param offset start index in dest + * @param count number of values to read + */ + public void readFloats(float[] dest, int offset, int count) { + for (int i = 0; i < count; i++) { + dest[offset + i] = readFloat(); + } + } + + /** + * Reads {@code count} doubles into {@code dest} starting at {@code offset}. + * + * @param dest destination array + * @param offset start index in dest + * @param count number of values to read + */ + public void readDoubles(double[] dest, int offset, int count) { + for (int i = 0; i < count; i++) { + dest[offset + i] = readDouble(); + } + } + + /** + * Reads {@code count} booleans into {@code dest} starting at {@code offset}. 
+ * + * @param dest destination array + * @param offset start index in dest + * @param count number of values to read + */ + public void readBooleans(boolean[] dest, int offset, int count) { + for (int i = 0; i < count; i++) { + dest[offset + i] = readBoolean(); + } + } + + /** + * Reads {@code count} Binary values into {@code dest} starting at {@code offset}. + * + * @param dest destination array + * @param offset start index in dest + * @param count number of values to read + */ + public void readBinaries(Binary[] dest, int offset, int count) { + for (int i = 0; i < count; i++) { + dest[offset + i] = readBytes(); + } + } + /** * Skips the next value in the page */ diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/ValuesWriter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/ValuesWriter.java index ecea4a7520..bbe9230397 100755 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/ValuesWriter.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/ValuesWriter.java @@ -98,6 +98,19 @@ public void writeBoolean(boolean v) { throw new UnsupportedOperationException(getClass().getName()); } + /** + * Writes a batch of boolean values. Subclasses may override for optimized bulk encoding. + * + * @param values the boolean array to read from + * @param offset the start position in the array + * @param length the number of values to write + */ + public void writeBooleans(boolean[] values, int offset, int length) { + for (int i = offset; i < offset + length; i++) { + writeBoolean(values[i]); + } + } + /** * @param v the value to encode */ @@ -105,6 +118,19 @@ public void writeBytes(Binary v) { throw new UnsupportedOperationException(getClass().getName()); } + /** + * Writes a batch of Binary values. Subclasses may override for optimized bulk encoding. + * + * @param values the Binary array to read from + * @param offset the start position in the array + * @param length the number of values to write + */ + public void writeBinaries(Binary[] values, int offset, int length) { + for (int i = offset; i < offset + length; i++) { + writeBytes(values[i]); + } + } + /** * @param v the value to encode */ @@ -112,6 +138,19 @@ public void writeInteger(int v) { throw new UnsupportedOperationException(getClass().getName()); } + /** + * Writes a batch of int values. Subclasses may override for optimized bulk encoding. + * + * @param values the int array to read from + * @param offset the start position in the array + * @param length the number of values to write + */ + public void writeIntegers(int[] values, int offset, int length) { + for (int i = offset; i < offset + length; i++) { + writeInteger(values[i]); + } + } + /** * @param v the value to encode */ @@ -119,6 +158,19 @@ public void writeLong(long v) { throw new UnsupportedOperationException(getClass().getName()); } + /** + * Writes a batch of long values. Subclasses may override for optimized bulk encoding. + * + * @param values the long array to read from + * @param offset the start position in the array + * @param length the number of values to write + */ + public void writeLongs(long[] values, int offset, int length) { + for (int i = offset; i < offset + length; i++) { + writeLong(values[i]); + } + } + /** * @param v the value to encode */ @@ -126,6 +178,19 @@ public void writeDouble(double v) { throw new UnsupportedOperationException(getClass().getName()); } + /** + * Writes a batch of double values. Subclasses may override for optimized bulk encoding. 
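+ * <p>Callers are expected to batch at page granularity: for example,
+ * {@code writer.writeDoubles(buf, 0, n)} replaces {@code n} virtual
+ * {@code writeDouble} calls with one, which is what lets overriding writers
+ * substitute a single bulk copy.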
+ * + * @param values the double array to read from + * @param offset the start position in the array + * @param length the number of values to write + */ + public void writeDoubles(double[] values, int offset, int length) { + for (int i = offset; i < offset + length; i++) { + writeDouble(values[i]); + } + } + /** * @param v the value to encode */ @@ -133,5 +198,18 @@ public void writeFloat(float v) { throw new UnsupportedOperationException(getClass().getName()); } + /** + * Writes a batch of float values. Subclasses may override for optimized bulk encoding. + * + * @param values the float array to read from + * @param offset the start position in the array + * @param length the number of values to write + */ + public void writeFloats(float[] values, int offset, int length) { + for (int i = offset; i < offset + length; i++) { + writeFloat(values[i]); + } + } + public abstract String memUsageString(String prefix); } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReader.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReader.java index c8ab3043bd..a500e1401e 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReader.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReader.java @@ -49,17 +49,131 @@ protected int nextElementByteOffset() { return offset; } - // Decode an entire data page + /** + * Advances the stream position by {@code count} elements and returns the byte offset + * of the first element. Used by batch read methods in subclasses. + */ + protected int advanceByteOffset(int count) { + if (indexInStream + count > valuesCount) { + throw new ParquetDecodingException("Byte-stream data was already exhausted."); + } + int offset = indexInStream * elementSizeInBytes; + indexInStream += count; + return offset; + } + + // Decode an entire data page by transposing from stream-split layout to interleaved layout. private byte[] decodeData(ByteBuffer encoded, int valuesCount) { - assert encoded.limit() == valuesCount * elementSizeInBytes; - byte[] decoded = new byte[encoded.limit()]; - int destByteIndex = 0; - for (int srcValueIndex = 0; srcValueIndex < valuesCount; ++srcValueIndex) { - for (int stream = 0; stream < elementSizeInBytes; ++stream, ++destByteIndex) { - decoded[destByteIndex] = encoded.get(srcValueIndex + stream * valuesCount); + int totalBytes = valuesCount * elementSizeInBytes; + assert encoded.remaining() >= totalBytes; + + // Bulk access: use the backing array directly if available, otherwise copy once. + byte[] src; + int srcBase; + if (encoded.hasArray()) { + src = encoded.array(); + srcBase = encoded.arrayOffset() + encoded.position(); + } else { + src = new byte[totalBytes]; + encoded.get(src); + srcBase = 0; + } + + byte[] decoded = new byte[totalBytes]; + + // Specialized single-pass loops for common element sizes. 
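+ // Hoisting each stream base into a local (s0, s1, ...) reduces the inner-loop
+ // index math to simple increments and gives the JIT straight-line code to
+ // unroll; the unrolled sizes cover INT32/FLOAT (4), INT64/DOUBLE (8), and
+ // common fixed-length binary widths (2, 12, 16).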
+ if (elementSizeInBytes == 2) { + int s0 = srcBase, s1 = srcBase + valuesCount; + for (int i = 0; i < valuesCount; ++i) { + int di = i * 2; + decoded[di] = src[s0 + i]; + decoded[di + 1] = src[s1 + i]; + } + } else if (elementSizeInBytes == 4) { + int s0 = srcBase, s1 = srcBase + valuesCount, s2 = srcBase + 2 * valuesCount, + s3 = srcBase + 3 * valuesCount; + for (int i = 0; i < valuesCount; ++i) { + int di = i * 4; + decoded[di] = src[s0 + i]; + decoded[di + 1] = src[s1 + i]; + decoded[di + 2] = src[s2 + i]; + decoded[di + 3] = src[s3 + i]; + } + } else if (elementSizeInBytes == 8) { + int s0 = srcBase, s1 = srcBase + valuesCount, s2 = srcBase + 2 * valuesCount, + s3 = srcBase + 3 * valuesCount, s4 = srcBase + 4 * valuesCount, + s5 = srcBase + 5 * valuesCount, s6 = srcBase + 6 * valuesCount, + s7 = srcBase + 7 * valuesCount; + for (int i = 0; i < valuesCount; ++i) { + int di = i * 8; + decoded[di] = src[s0 + i]; + decoded[di + 1] = src[s1 + i]; + decoded[di + 2] = src[s2 + i]; + decoded[di + 3] = src[s3 + i]; + decoded[di + 4] = src[s4 + i]; + decoded[di + 5] = src[s5 + i]; + decoded[di + 6] = src[s6 + i]; + decoded[di + 7] = src[s7 + i]; + } + } else if (elementSizeInBytes == 12) { + int s0 = srcBase, s1 = srcBase + valuesCount, s2 = srcBase + 2 * valuesCount, + s3 = srcBase + 3 * valuesCount, s4 = srcBase + 4 * valuesCount, + s5 = srcBase + 5 * valuesCount, s6 = srcBase + 6 * valuesCount, + s7 = srcBase + 7 * valuesCount, s8 = srcBase + 8 * valuesCount, + s9 = srcBase + 9 * valuesCount, s10 = srcBase + 10 * valuesCount, + s11 = srcBase + 11 * valuesCount; + for (int i = 0; i < valuesCount; ++i) { + int di = i * 12; + decoded[di] = src[s0 + i]; + decoded[di + 1] = src[s1 + i]; + decoded[di + 2] = src[s2 + i]; + decoded[di + 3] = src[s3 + i]; + decoded[di + 4] = src[s4 + i]; + decoded[di + 5] = src[s5 + i]; + decoded[di + 6] = src[s6 + i]; + decoded[di + 7] = src[s7 + i]; + decoded[di + 8] = src[s8 + i]; + decoded[di + 9] = src[s9 + i]; + decoded[di + 10] = src[s10 + i]; + decoded[di + 11] = src[s11 + i]; + } + } else if (elementSizeInBytes == 16) { + int s0 = srcBase, s1 = srcBase + valuesCount, s2 = srcBase + 2 * valuesCount, + s3 = srcBase + 3 * valuesCount, s4 = srcBase + 4 * valuesCount, + s5 = srcBase + 5 * valuesCount, s6 = srcBase + 6 * valuesCount, + s7 = srcBase + 7 * valuesCount, s8 = srcBase + 8 * valuesCount, + s9 = srcBase + 9 * valuesCount, s10 = srcBase + 10 * valuesCount, + s11 = srcBase + 11 * valuesCount, s12 = srcBase + 12 * valuesCount, + s13 = srcBase + 13 * valuesCount, s14 = srcBase + 14 * valuesCount, + s15 = srcBase + 15 * valuesCount; + for (int i = 0; i < valuesCount; ++i) { + int di = i * 16; + decoded[di] = src[s0 + i]; + decoded[di + 1] = src[s1 + i]; + decoded[di + 2] = src[s2 + i]; + decoded[di + 3] = src[s3 + i]; + decoded[di + 4] = src[s4 + i]; + decoded[di + 5] = src[s5 + i]; + decoded[di + 6] = src[s6 + i]; + decoded[di + 7] = src[s7 + i]; + decoded[di + 8] = src[s8 + i]; + decoded[di + 9] = src[s9 + i]; + decoded[di + 10] = src[s10 + i]; + decoded[di + 11] = src[s11 + i]; + decoded[di + 12] = src[s12 + i]; + decoded[di + 13] = src[s13 + i]; + decoded[di + 14] = src[s14 + i]; + decoded[di + 15] = src[s15 + i]; + } + } else { + // Generic fallback for arbitrary element sizes + for (int stream = 0; stream < elementSizeInBytes; ++stream) { + int srcOffset = srcBase + stream * valuesCount; + for (int i = 0; i < valuesCount; ++i) { + decoded[i * elementSizeInBytes + stream] = src[srcOffset + i]; + } } } - assert destByteIndex == decoded.length; return 
decoded; } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForDouble.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForDouble.java index e725dc9fce..0917cd3902 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForDouble.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForDouble.java @@ -27,4 +27,11 @@ public ByteStreamSplitValuesReaderForDouble() { public double readDouble() { return decodedDataBuffer.getDouble(nextElementByteOffset()); } + + @Override + public void readDoubles(double[] dest, int offset, int count) { + int byteOffset = advanceByteOffset(count); + decodedDataBuffer.position(byteOffset); + decodedDataBuffer.asDoubleBuffer().get(dest, offset, count); + } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForFLBA.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForFLBA.java index d8613dd8b9..b026a7d76e 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForFLBA.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForFLBA.java @@ -30,4 +30,17 @@ public ByteStreamSplitValuesReaderForFLBA(int length) { public Binary readBytes() { return Binary.fromConstantByteBuffer(decodedDataBuffer, nextElementByteOffset(), elementSizeInBytes); } + + /** + * Batch read: advances the stream by {@code count} elements in a single bounds check, + * then creates Binary views at sequential offsets — eliminating per-value bounds checking. 
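The primitive batch overrides in these subclasses all rely on the same NIO idiom: advance the logical stream index once, then let a typed buffer view perform one bulk copy. A self-contained sketch of the idiom (illustrative only, plain JDK):

```java
import java.nio.ByteBuffer;
import java.util.Arrays;

public class BulkGetDemo {
  public static void main(String[] args) {
    // A decoded page holding 4 doubles back to back.
    ByteBuffer page = ByteBuffer.allocate(4 * Double.BYTES);
    for (int i = 0; i < 4; i++) {
      page.putDouble(i * Double.BYTES, i + 0.5); // absolute puts leave position at 0
    }

    // Batch read: set the position once, then one bulk get() on the typed
    // view copies the whole range instead of one virtual call per element.
    double[] dest = new double[4];
    page.position(0);
    page.asDoubleBuffer().get(dest, 0, dest.length);

    System.out.println(Arrays.toString(dest)); // [0.5, 1.5, 2.5, 3.5]
  }
}
```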
+ */ + @Override + public void readBinaries(Binary[] dest, int offset, int count) { + int byteOffset = advanceByteOffset(count); + for (int i = 0; i < count; i++) { + dest[offset + i] = Binary.fromConstantByteBuffer(decodedDataBuffer, byteOffset, elementSizeInBytes); + byteOffset += elementSizeInBytes; + } + } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForFloat.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForFloat.java index cecb7925d8..bb28ef0ac2 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForFloat.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForFloat.java @@ -27,4 +27,11 @@ public ByteStreamSplitValuesReaderForFloat() { public float readFloat() { return decodedDataBuffer.getFloat(nextElementByteOffset()); } + + @Override + public void readFloats(float[] dest, int offset, int count) { + int byteOffset = advanceByteOffset(count); + decodedDataBuffer.position(byteOffset); + decodedDataBuffer.asFloatBuffer().get(dest, offset, count); + } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForInteger.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForInteger.java index 57f9bfdf03..e71079d2f6 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForInteger.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForInteger.java @@ -27,4 +27,11 @@ public ByteStreamSplitValuesReaderForInteger() { public int readInteger() { return decodedDataBuffer.getInt(nextElementByteOffset()); } + + @Override + public void readIntegers(int[] dest, int offset, int count) { + int byteOffset = advanceByteOffset(count); + decodedDataBuffer.position(byteOffset); + decodedDataBuffer.asIntBuffer().get(dest, offset, count); + } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForLong.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForLong.java index c7711d8919..f73c46e972 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForLong.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForLong.java @@ -27,4 +27,11 @@ public ByteStreamSplitValuesReaderForLong() { public long readLong() { return decodedDataBuffer.getLong(nextElementByteOffset()); } + + @Override + public void readLongs(long[] dest, int offset, int count) { + int byteOffset = advanceByteOffset(count); + decodedDataBuffer.position(byteOffset); + decodedDataBuffer.asLongBuffer().get(dest, offset, count); + } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesWriter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesWriter.java index c197a4fd6f..e62126ed4d 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesWriter.java +++ 
b/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesWriter.java @@ -29,9 +29,15 @@ public abstract class ByteStreamSplitValuesWriter extends ValuesWriter { + /** + * Batch size for buffered scatter writes. Values are accumulated in a batch buffer + * and flushed as bulk {@code write(byte[], off, len)} calls to each stream. + */ + private static final int BATCH_SIZE = 64; + protected final int numStreams; protected final int elementSizeInBytes; - private final CapacityByteArrayOutputStream[] byteStreams; + protected final CapacityByteArrayOutputStream[] byteStreams; public ByteStreamSplitValuesWriter( int elementSizeInBytes, int initialCapacity, int pageSize, ByteBufferAllocator allocator) { @@ -176,6 +182,8 @@ public String memUsageString(String prefix) { public static class FixedLenByteArrayByteStreamSplitValuesWriter extends ByteStreamSplitValuesWriter { private final int length; + private byte[][] batchBufs; // [stream][batchIndex] scratch buffers + private int flbaBatchCount; public FixedLenByteArrayByteStreamSplitValuesWriter( int length, int initialCapacity, int pageSize, ByteBufferAllocator allocator) { @@ -187,7 +195,69 @@ public FixedLenByteArrayByteStreamSplitValuesWriter( public final void writeBytes(Binary v) { assert (v.length() == length) : ("Fixed Binary size " + v.length() + " does not match field type length " + length); - super.scatterBytes(v.getBytesUnsafe()); + if (batchBufs == null) { + batchBufs = new byte[length][BATCH_SIZE]; + } + byte[] bytes = v.getBytesUnsafe(); + for (int stream = 0; stream < length; stream++) { + batchBufs[stream][flbaBatchCount] = bytes[stream]; + } + flbaBatchCount++; + if (flbaBatchCount == BATCH_SIZE) { + flushFlbaBatch(); + } + } + + @Override + public void writeBinaries(Binary[] values, int offset, int len) { + if (batchBufs == null) { + batchBufs = new byte[length][BATCH_SIZE]; + } + for (int i = offset; i < offset + len; i++) { + Binary v = values[i]; + assert (v.length() == length) + : ("Fixed Binary size " + v.length() + " does not match field type length " + length); + byte[] bytes = v.getBytesUnsafe(); + for (int stream = 0; stream < length; stream++) { + batchBufs[stream][flbaBatchCount] = bytes[stream]; + } + flbaBatchCount++; + if (flbaBatchCount == BATCH_SIZE) { + flushFlbaBatch(); + } + } + } + + private void flushFlbaBatch() { + if (flbaBatchCount == 0) return; + final int count = flbaBatchCount; + for (int stream = 0; stream < length; stream++) { + byteStreams[stream].write(batchBufs[stream], 0, count); + } + flbaBatchCount = 0; + } + + @Override + public BytesInput getBytes() { + flushFlbaBatch(); + return super.getBytes(); + } + + @Override + public void reset() { + flbaBatchCount = 0; + super.reset(); + } + + @Override + public void close() { + flbaBatchCount = 0; + super.close(); + } + + @Override + public long getBufferedSize() { + return super.getBufferedSize() + (long) flbaBatchCount * length; } @Override diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/delta/DeltaBinaryPackingValuesReader.java b/parquet-column/src/main/java/org/apache/parquet/column/values/delta/DeltaBinaryPackingValuesReader.java index 259ebc09c0..6726614460 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/delta/DeltaBinaryPackingValuesReader.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/delta/DeltaBinaryPackingValuesReader.java @@ -112,6 +112,22 @@ public long readLong() { return valuesBuffer[valuesRead++]; } + @Override + 
public void readIntegers(int[] dest, int offset, int count) { + checkRead(); + for (int i = 0; i < count; i++) { + dest[offset + i] = (int) valuesBuffer[valuesRead + i]; + } + valuesRead += count; + } + + @Override + public void readLongs(long[] dest, int offset, int count) { + checkRead(); + System.arraycopy(valuesBuffer, valuesRead, dest, offset, count); + valuesRead += count; + } + private void checkRead() { if (valuesRead >= totalValueCount) { throw new ParquetDecodingException("no more value to read, total value count is " + totalValueCount); diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/DictionaryValuesReader.java b/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/DictionaryValuesReader.java index 53fafc55dc..db344c3e63 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/DictionaryValuesReader.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/DictionaryValuesReader.java @@ -117,6 +117,59 @@ public long readLong() { } } + @Override + public void readIntegers(int[] dest, int offset, int count) { + try { + // Batch-decode dictionary IDs, then batch-lookup + int[] ids = new int[count]; + decoder.readInts(ids, 0, count); + for (int i = 0; i < count; i++) { + dest[offset + i] = dictionary.decodeToInt(ids[i]); + } + } catch (IOException e) { + throw new ParquetDecodingException(e); + } + } + + @Override + public void readLongs(long[] dest, int offset, int count) { + try { + int[] ids = new int[count]; + decoder.readInts(ids, 0, count); + for (int i = 0; i < count; i++) { + dest[offset + i] = dictionary.decodeToLong(ids[i]); + } + } catch (IOException e) { + throw new ParquetDecodingException(e); + } + } + + @Override + public void readFloats(float[] dest, int offset, int count) { + try { + int[] ids = new int[count]; + decoder.readInts(ids, 0, count); + for (int i = 0; i < count; i++) { + dest[offset + i] = dictionary.decodeToFloat(ids[i]); + } + } catch (IOException e) { + throw new ParquetDecodingException(e); + } + } + + @Override + public void readDoubles(double[] dest, int offset, int count) { + try { + int[] ids = new int[count]; + decoder.readInts(ids, 0, count); + for (int i = 0; i < count; i++) { + dest[offset + i] = dictionary.decodeToDouble(ids[i]); + } + } catch (IOException e) { + throw new ParquetDecodingException(e); + } + } + @Override public void skip() { try { diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/plain/BooleanPlainValuesReader.java b/parquet-column/src/main/java/org/apache/parquet/column/values/plain/BooleanPlainValuesReader.java index 22ca2d567c..3843e3b6f0 100755 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/plain/BooleanPlainValuesReader.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/plain/BooleanPlainValuesReader.java @@ -18,57 +18,122 @@ */ package org.apache.parquet.column.values.plain; -import static org.apache.parquet.column.values.bitpacking.Packer.LITTLE_ENDIAN; - import java.io.IOException; +import java.nio.ByteBuffer; import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.bytes.BytesUtils; import org.apache.parquet.column.values.ValuesReader; -import org.apache.parquet.column.values.bitpacking.ByteBitPackingValuesReader; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** - * encodes boolean for the plain encoding: one bit at a time (0 = false) + * Decodes PLAIN-encoded booleans: one bit per value, packed 8 per 
byte, little-endian + * bit order (bit 0 of each byte is the first value). + * + *

Direct bit extraction from the page ByteBuffer avoids the overhead of the generic + * bit-packing machinery ({@code ByteBitPackingValuesReader}) and intermediate + * {@code int[8]} buffers. + * + *
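+ * <p>For example, the byte {@code 0x05} (binary {@code 00000101}) decodes to
+ * {@code true, false, true, false, false, false, false, false}, since bit 0
+ * carries the first value.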

The batch path uses a static 256-entry lookup table that maps each byte value to + * its 8 pre-decoded booleans. This enables {@code System.arraycopy} of 8 booleans per + * byte (a single 64-bit memory operation in HotSpot) instead of 8 individual + * comparison+store operations. */ public class BooleanPlainValuesReader extends ValuesReader { private static final Logger LOG = LoggerFactory.getLogger(BooleanPlainValuesReader.class); - private ByteBitPackingValuesReader in = new ByteBitPackingValuesReader(1, LITTLE_ENDIAN); - /** - * {@inheritDoc} - * - * @see org.apache.parquet.column.values.ValuesReader#readBoolean() + * Lookup table: BYTE_TO_BOOLS[b] contains the 8 boolean values for byte value b, + * in little-endian bit order (bit 0 = index 0). */ + private static final boolean[][] BYTE_TO_BOOLS = new boolean[256][8]; + + static { + for (int b = 0; b < 256; b++) { + for (int bit = 0; bit < 8; bit++) { + BYTE_TO_BOOLS[b][bit] = ((b >>> bit) & 1) != 0; + } + } + } + + private byte[] pageData; + private int pageOffset; + private int bitIndex; + + @Override + public void initFromPage(int valueCount, ByteBufferInputStream stream) throws IOException { + LOG.debug("init from page at offset {} for length {}", stream.position(), stream.available()); + int effectiveBitLength = valueCount; // bitWidth = 1 + int length = BytesUtils.paddedByteCountFromBits(effectiveBitLength); + length = Math.min(length, stream.available()); + ByteBuffer buf = stream.slice(length); + + // Bulk access: use backing array directly if available, otherwise copy once. + if (buf.hasArray()) { + pageData = buf.array(); + pageOffset = buf.arrayOffset() + buf.position(); + } else { + pageData = new byte[length]; + buf.get(pageData); + pageOffset = 0; + } + bitIndex = 0; + updateNextOffset(length); + } + @Override public boolean readBoolean() { - return in.readInteger() == 0 ? 
false : true; + int byteIdx = pageOffset + (bitIndex >>> 3); + int bitPos = bitIndex & 7; + bitIndex++; + return ((pageData[byteIdx] >>> bitPos) & 1) != 0; } - /** - * {@inheritDoc} - * - * @see org.apache.parquet.column.values.ValuesReader#skip() - */ @Override - public void skip() { - in.readInteger(); + public void readBooleans(boolean[] dest, int offset, int count) { + int i = 0; + + // Handle partial byte at current position + int bitPos = bitIndex & 7; + if (bitPos != 0) { + int byteIdx = pageOffset + (bitIndex >>> 3); + byte b = pageData[byteIdx]; + while (bitPos < 8 && i < count) { + dest[offset + i] = ((b >>> bitPos) & 1) != 0; + bitPos++; + i++; + } + } + + // Process full bytes: 8 booleans per byte via lookup table + arraycopy + int byteIdx = pageOffset + ((bitIndex + i) >>> 3); + while (i + 8 <= count) { + System.arraycopy(BYTE_TO_BOOLS[pageData[byteIdx] & 0xFF], 0, dest, offset + i, 8); + byteIdx++; + i += 8; + } + + // Handle remaining bits in the last partial byte + if (i < count) { + byte b = pageData[byteIdx]; + int bp = 0; + while (i < count) { + dest[offset + i] = ((b >>> bp) & 1) != 0; + bp++; + i++; + } + } + + bitIndex += count; } - /** - * {@inheritDoc} - * - * @see org.apache.parquet.column.values.ValuesReader#initFromPage(int, ByteBufferInputStream) - */ @Override - public void initFromPage(int valueCount, ByteBufferInputStream stream) throws IOException { - LOG.debug("init from page at offset {} for length {}", stream.position(), stream.available()); - this.in.initFromPage(valueCount, stream); + public void skip() { + bitIndex++; } - @Deprecated @Override - public int getNextOffset() { - return in.getNextOffset(); + public void skip(int n) { + bitIndex += n; } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/plain/BooleanPlainValuesWriter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/plain/BooleanPlainValuesWriter.java index 7f80ec150a..ae3b43c63b 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/plain/BooleanPlainValuesWriter.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/plain/BooleanPlainValuesWriter.java @@ -19,52 +19,118 @@ package org.apache.parquet.column.values.plain; import static org.apache.parquet.column.Encoding.PLAIN; -import static org.apache.parquet.column.values.bitpacking.Packer.LITTLE_ENDIAN; import org.apache.parquet.bytes.BytesInput; +import org.apache.parquet.bytes.CapacityByteArrayOutputStream; import org.apache.parquet.column.Encoding; import org.apache.parquet.column.values.ValuesWriter; -import org.apache.parquet.column.values.bitpacking.ByteBitPackingValuesWriter; /** - * An implementation of the PLAIN encoding + * An implementation of the PLAIN encoding for BOOLEAN values. + * + *

Packs booleans directly into bytes (8 per byte, LSB first) without + * going through the generic int-based bit-packing encoder. */ public class BooleanPlainValuesWriter extends ValuesWriter { - private ByteBitPackingValuesWriter bitPackingWriter; + private static final int INITIAL_SLAB_SIZE = 1024; + private static final int MAX_CAPACITY = 64 * 1024; + + private final CapacityByteArrayOutputStream baos; + private int currentByte; + private int bitsWritten; public BooleanPlainValuesWriter() { - bitPackingWriter = new ByteBitPackingValuesWriter(1, LITTLE_ENDIAN); + this.baos = new CapacityByteArrayOutputStream(INITIAL_SLAB_SIZE, MAX_CAPACITY); + this.currentByte = 0; + this.bitsWritten = 0; } @Override public final void writeBoolean(boolean v) { - bitPackingWriter.writeInteger(v ? 1 : 0); + currentByte |= ((v ? 1 : 0) << bitsWritten); + bitsWritten++; + if (bitsWritten == 8) { + baos.write(currentByte); + currentByte = 0; + bitsWritten = 0; + } + } + + @Override + public void writeBooleans(boolean[] values, int offset, int length) { + int pos = offset; + int end = offset + length; + + // Fill current partial byte + while (bitsWritten > 0 && bitsWritten < 8 && pos < end) { + if (values[pos]) { + currentByte |= (1 << bitsWritten); + } + bitsWritten++; + pos++; + if (bitsWritten == 8) { + baos.write(currentByte); + currentByte = 0; + bitsWritten = 0; + } + } + + // Process 8 values at a time — pack directly into a byte + while (pos + 8 <= end) { + int b = 0; + if (values[pos]) b |= 0x01; + if (values[pos + 1]) b |= 0x02; + if (values[pos + 2]) b |= 0x04; + if (values[pos + 3]) b |= 0x08; + if (values[pos + 4]) b |= 0x10; + if (values[pos + 5]) b |= 0x20; + if (values[pos + 6]) b |= 0x40; + if (values[pos + 7]) b |= 0x80; + baos.write(b); + pos += 8; + } + + // Handle remaining values (< 8) + while (pos < end) { + if (values[pos]) { + currentByte |= (1 << bitsWritten); + } + bitsWritten++; + pos++; + } } @Override public long getBufferedSize() { - return bitPackingWriter.getBufferedSize(); + return baos.size() + (bitsWritten > 0 ? 
1 : 0); } @Override public BytesInput getBytes() { - return bitPackingWriter.getBytes(); + if (bitsWritten > 0) { + baos.write(currentByte); + currentByte = 0; + bitsWritten = 0; + } + return BytesInput.from(baos); } @Override public void reset() { - bitPackingWriter.reset(); + baos.reset(); + currentByte = 0; + bitsWritten = 0; } @Override public void close() { - bitPackingWriter.close(); + baos.close(); } @Override public long getAllocatedSize() { - return bitPackingWriter.getAllocatedSize(); + return baos.getCapacity(); } @Override @@ -74,6 +140,6 @@ public Encoding getEncoding() { @Override public String memUsageString(String prefix) { - return bitPackingWriter.memUsageString(prefix); + return String.format("%s BooleanPlainValuesWriter %d bytes", prefix, getAllocatedSize()); } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/plain/FixedLenByteArrayPlainValuesReader.java b/parquet-column/src/main/java/org/apache/parquet/column/values/plain/FixedLenByteArrayPlainValuesReader.java index adfc488924..6200ae4477 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/plain/FixedLenByteArrayPlainValuesReader.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/plain/FixedLenByteArrayPlainValuesReader.java @@ -19,6 +19,7 @@ package org.apache.parquet.column.values.plain; import java.io.IOException; +import java.nio.ByteBuffer; import org.apache.parquet.bytes.ByteBufferInputStream; import org.apache.parquet.column.values.ValuesReader; import org.apache.parquet.io.ParquetDecodingException; @@ -62,6 +63,25 @@ public void skip(int n) { } } + /** + * Batch read: slices the entire block of {@code count * length} bytes in one call, + * then creates Binary views at fixed offsets within the single ByteBuffer — eliminating + * per-value slice overhead. + */ + @Override + public void readBinaries(Binary[] dest, int offset, int count) { + try { + int totalBytes = count * length; + ByteBuffer block = in.slice(totalBytes); + int baseOffset = block.position(); + for (int i = 0; i < count; i++) { + dest[offset + i] = Binary.fromConstantByteBuffer(block, baseOffset + i * length, length); + } + } catch (IOException | RuntimeException e) { + throw new ParquetDecodingException("could not read bytes at offset " + in.position(), e); + } + } + @Override public void initFromPage(int valueCount, ByteBufferInputStream stream) throws IOException { LOG.debug("init from page at offset {} for length {}", stream.position(), stream.available()); diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/plain/FixedLenByteArrayPlainValuesWriter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/plain/FixedLenByteArrayPlainValuesWriter.java index dec4d1be1b..9d8c7e464b 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/plain/FixedLenByteArrayPlainValuesWriter.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/plain/FixedLenByteArrayPlainValuesWriter.java @@ -62,6 +62,42 @@ public final void writeBytes(Binary v) { } } + /** + * Batch write: copies Binary values into a temporary buffer and writes them in a single + * bulk {@code write()} call to the output stream, amortizing stream overhead across + * the entire batch. 
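+ * <p>The 1024-value chunk bounds the scratch buffer at {@code 1024 * length}
+ * bytes (16 KiB for a 16-byte fixed-length type), regardless of how large the
+ * caller's batch is.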
+ */ + @Override + public void writeBinaries(Binary[] values, int offset, int length) { + final int fixedLen = this.length; + // Process in chunks to avoid excessive temp allocation + final int CHUNK = 1024; + byte[] buf = new byte[Math.min(length, CHUNK) * fixedLen]; + try { + int remaining = length; + int srcIdx = offset; + while (remaining > 0) { + int batch = Math.min(remaining, CHUNK); + int bufPos = 0; + for (int i = 0; i < batch; i++) { + Binary v = values[srcIdx++]; + if (v.length() != fixedLen) { + throw new IllegalArgumentException( + "Fixed Binary size " + v.length() + " does not match field type length " + fixedLen); + } + // Copy bytes from the Binary's backing store into the batch buffer + byte[] bytes = v.getBytesUnsafe(); + System.arraycopy(bytes, 0, buf, bufPos, fixedLen); + bufPos += fixedLen; + } + arrayOut.write(buf, 0, bufPos); + remaining -= batch; + } + } catch (RuntimeException e) { + throw new ParquetEncodingException("could not write fixed bytes", e); + } + } + @Override public long getBufferedSize() { return arrayOut.size(); diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/plain/PlainValuesReader.java b/parquet-column/src/main/java/org/apache/parquet/column/values/plain/PlainValuesReader.java index a0c7af7394..a3d0d06923 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/plain/PlainValuesReader.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/plain/PlainValuesReader.java @@ -71,6 +71,17 @@ public double readDouble() { throw new ParquetDecodingException("could not read double", e); } } + + @Override + public void readDoubles(double[] dest, int offset, int count) { + try { + for (int i = 0; i < count; i++) { + dest[offset + i] = in.readDouble(); + } + } catch (IOException e) { + throw new ParquetDecodingException("could not read doubles", e); + } + } } public static class FloatPlainValuesReader extends PlainValuesReader { @@ -92,6 +103,17 @@ public float readFloat() { throw new ParquetDecodingException("could not read float", e); } } + + @Override + public void readFloats(float[] dest, int offset, int count) { + try { + for (int i = 0; i < count; i++) { + dest[offset + i] = in.readFloat(); + } + } catch (IOException e) { + throw new ParquetDecodingException("could not read floats", e); + } + } } public static class IntegerPlainValuesReader extends PlainValuesReader { @@ -113,6 +135,17 @@ public int readInteger() { throw new ParquetDecodingException("could not read int", e); } } + + @Override + public void readIntegers(int[] dest, int offset, int count) { + try { + for (int i = 0; i < count; i++) { + dest[offset + i] = in.readInt(); + } + } catch (IOException e) { + throw new ParquetDecodingException("could not read ints", e); + } + } } public static class LongPlainValuesReader extends PlainValuesReader { @@ -134,5 +167,16 @@ public long readLong() { throw new ParquetDecodingException("could not read long", e); } } + + @Override + public void readLongs(long[] dest, int offset, int count) { + try { + for (int i = 0; i < count; i++) { + dest[offset + i] = in.readLong(); + } + } catch (IOException e) { + throw new ParquetDecodingException("could not read longs", e); + } + } } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/plain/PlainValuesWriter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/plain/PlainValuesWriter.java index c7069bc092..0802f46d2a 100644 --- 
a/parquet-column/src/main/java/org/apache/parquet/column/values/plain/PlainValuesWriter.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/plain/PlainValuesWriter.java @@ -94,6 +94,26 @@ public final void writeDouble(double v) { } } + @Override + public final void writeIntegers(int[] values, int offset, int length) { + arrayOut.writeInts(values, offset, length); + } + + @Override + public final void writeLongs(long[] values, int offset, int length) { + arrayOut.writeLongs(values, offset, length); + } + + @Override + public final void writeFloats(float[] values, int offset, int length) { + arrayOut.writeFloats(values, offset, length); + } + + @Override + public final void writeDoubles(double[] values, int offset, int length) { + arrayOut.writeDoubles(values, offset, length); + } + @Override public void writeByte(int value) { try { diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridDecoder.java b/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridDecoder.java index e55b276b29..f2dd50d623 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridDecoder.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridDecoder.java @@ -48,6 +48,8 @@ private static enum MODE { private int currentCount; private int currentValue; private int[] currentBuffer; + // Saved packed bytes for bitWidth=1 boolean optimization (lookup table decode) + private byte[] packedBytesBuffer; public RunLengthBitPackingHybridDecoder(int bitWidth, InputStream in) { LOG.debug("decoding bitWidth {}", bitWidth); @@ -77,6 +79,121 @@ public int readInt() throws IOException { return result; } + /** + * Reads {@code count} int values into {@code dest} starting at {@code offset}. + * This avoids per-value virtual dispatch overhead by batching across RLE runs + * and packed groups. + * + * @param dest destination array + * @param offset start index in dest + * @param count number of values to read + */ + public void readInts(int[] dest, int offset, int count) throws IOException { + int remaining = count; + int pos = offset; + while (remaining > 0) { + if (currentCount == 0) { + readNext(); + } + int batchSize = Math.min(remaining, currentCount); + switch (mode) { + case RLE: + java.util.Arrays.fill(dest, pos, pos + batchSize, currentValue); + break; + case PACKED: + int startIdx = currentBuffer.length - currentCount; + System.arraycopy(currentBuffer, startIdx, dest, pos, batchSize); + break; + default: + throw new ParquetDecodingException("not a valid mode " + mode); + } + currentCount -= batchSize; + remaining -= batchSize; + pos += batchSize; + } + } + + /** + * Lookup table for bitWidth=1: maps each byte to its 8 boolean values. + * Used by {@link #readBooleans} PACKED path to bypass the int[] intermediate. + */ + private static final boolean[][] BYTE_TO_BOOLS = new boolean[256][8]; + + static { + for (int b = 0; b < 256; b++) { + for (int bit = 0; bit < 8; bit++) { + BYTE_TO_BOOLS[b][bit] = ((b >>> bit) & 1) != 0; + } + } + } + + /** + * Reads {@code count} boolean values into {@code dest} starting at {@code offset}. + * For RLE runs, uses {@code Arrays.fill} with a single boolean value. + * For packed groups, uses a lookup table to decode 8 booleans per byte directly + * from the raw packed bytes, bypassing the int[] intermediate buffer. 
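As a concrete illustration of how {@code readInts} batches across runs, here is a round-trip sketch; it assumes the encoder's four-argument {@code (bitWidth, initialCapacity, pageSize, allocator)} constructor, so adjust if the actual signature differs:

```java
import java.io.ByteArrayInputStream;
import java.io.IOException;
import org.apache.parquet.bytes.HeapByteBufferAllocator;
import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridDecoder;
import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridEncoder;

public class RleBatchReadDemo {
  public static void main(String[] args) throws IOException {
    int bitWidth = 3;
    RunLengthBitPackingHybridEncoder encoder = new RunLengthBitPackingHybridEncoder(
        bitWidth, 1024, 64 * 1024, new HeapByteBufferAllocator());
    for (int i = 0; i < 1000; i++) {
      encoder.writeInt(5); // long repeated run, encoded as one RLE run
    }
    for (int i = 0; i < 16; i++) {
      encoder.writeInt(i & 7); // varied tail, encoded as bit-packed groups
    }
    byte[] page = encoder.toBytes().toByteArray();

    RunLengthBitPackingHybridDecoder decoder =
        new RunLengthBitPackingHybridDecoder(bitWidth, new ByteArrayInputStream(page));
    int[] out = new int[1016];
    // One Arrays.fill for the 1000-value run plus one arraycopy for the
    // packed tail, instead of 1016 readInt() calls.
    decoder.readInts(out, 0, out.length);
  }
}
```

Note also that the packed path of {@code readBooleans} above does bit math that only holds for {@code bitWidth == 1}; its callers are presumably limited to the boolean reader, where that is guaranteed.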
+ * + * @param dest destination array + * @param offset start index in dest + * @param count number of values to read + */ + public void readBooleans(boolean[] dest, int offset, int count) throws IOException { + int remaining = count; + int pos = offset; + while (remaining > 0) { + if (currentCount == 0) { + readNext(); + } + int batchSize = Math.min(remaining, currentCount); + switch (mode) { + case RLE: + java.util.Arrays.fill(dest, pos, pos + batchSize, currentValue != 0); + break; + case PACKED: + // For bitWidth=1, read directly from packedBytesBuffer via lookup table + int bitOff = currentBuffer.length - currentCount; + int written = 0; + + // Handle partial byte alignment + int bitPos = bitOff & 7; + if (bitPos != 0) { + int byteIdx = bitOff >>> 3; + byte b = packedBytesBuffer[byteIdx]; + while (bitPos < 8 && written < batchSize) { + dest[pos + written] = ((b >>> bitPos) & 1) != 0; + bitPos++; + written++; + } + } + + // Process full bytes via lookup table + int byteIdx = (bitOff + written) >>> 3; + while (written + 8 <= batchSize) { + System.arraycopy(BYTE_TO_BOOLS[packedBytesBuffer[byteIdx] & 0xFF], 0, dest, pos + written, 8); + byteIdx++; + written += 8; + } + + // Handle remaining bits + if (written < batchSize) { + byte b = packedBytesBuffer[byteIdx]; + int bp = 0; + while (written < batchSize) { + dest[pos + written] = ((b >>> bp) & 1) != 0; + bp++; + written++; + } + } + break; + default: + throw new ParquetDecodingException("not a valid mode " + mode); + } + currentCount -= batchSize; + remaining -= batchSize; + pos += batchSize; + } + } + private void readNext() throws IOException { Preconditions.checkArgument(in.available() > 0, "Reading past RLE/BitPacking stream."); final int header = BytesUtils.readUnsignedVarInt(in); @@ -97,6 +214,7 @@ private void readNext() throws IOException { int bytesToRead = (int) Math.ceil(currentCount * bitWidth / 8.0); bytesToRead = Math.min(bytesToRead, in.available()); new DataInputStream(in).readFully(bytes, 0, bytesToRead); + packedBytesBuffer = bytes; for (int valueIndex = 0, byteIndex = 0; valueIndex < currentCount; valueIndex += 8, byteIndex += bitWidth) { diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridEncoder.java b/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridEncoder.java index e33824bff1..fc83e85963 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridEncoder.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridEncoder.java @@ -272,6 +272,71 @@ public BytesInput toBytes() throws IOException { return BytesInput.from(baos); } + /** + * Batch-encodes boolean values (bitWidth must be 1). Pre-scans for runs to emit + * RLE runs directly and packs remaining groups into bit-packed runs, bypassing + * the per-value state machine. + * + *
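The PACKED branch of `readBooleans` above splits an arbitrary bit offset into three phases: a partial head byte, a run of whole bytes, and a tail. A self-contained sketch of the same walk, using plain shifts instead of the lookup table for clarity (the method and array names here are hypothetical):

```java
import java.util.Arrays;

public class UnalignedBitsSketch {
  // Decode `count` booleans from `packed` starting at absolute bit position bitOff.
  static void decode(byte[] packed, int bitOff, boolean[] dest, int pos, int count) {
    int written = 0;
    int bitPos = bitOff & 7;                  // offset within the first byte
    if (bitPos != 0) {                        // head: finish the partial byte
      byte b = packed[bitOff >>> 3];
      while (bitPos < 8 && written < count) {
        dest[pos + written++] = ((b >>> bitPos++) & 1) != 0;
      }
    }
    int byteIdx = (bitOff + written) >>> 3;   // now byte-aligned
    while (written + 8 <= count) {            // middle: 8 values per byte
      byte b = packed[byteIdx++];
      for (int i = 0; i < 8; i++) {
        dest[pos + written++] = ((b >>> i) & 1) != 0;
      }
    }
    for (int i = 0; written < count; i++) {   // tail: leftover bits
      dest[pos + written++] = ((packed[byteIdx] >>> i) & 1) != 0;
    }
  }

  public static void main(String[] args) {
    boolean[] out = new boolean[13];
    decode(new byte[] {(byte) 0xFF, 0x01}, 3, out, 0, 13);
    System.out.println(Arrays.toString(out)); // 6 x true, then 7 x false
  }
}
```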

This method may only be called when the encoder is in its initial state + * (no values have been written via {@link #writeInt}). If called after scalar + * writes, behavior is undefined. + * + * @param values the boolean array + * @param offset start position in the array + * @param length number of values to encode + */ + public void writeBooleans(boolean[] values, int offset, int length) throws IOException { + Preconditions.checkArgument(bitWidth == 1, "writeBooleans requires bitWidth == 1"); + + int pos = offset; + int end = offset + length; + + while (pos < end) { + // Scan for run of consecutive identical values + boolean val = values[pos]; + int runStart = pos; + pos++; + while (pos < end && values[pos] == val) { + pos++; + } + int runLen = pos - runStart; + int intVal = val ? 1 : 0; + + // If we have a pending partial buffer, fill it first from this run + if (numBufferedValues > 0 && runLen >= 8) { + int fill = 8 - numBufferedValues; + for (int i = 0; i < fill; i++) { + bufferedValues[numBufferedValues] = intVal; + numBufferedValues++; + } + writeOrAppendBitPackedRun(); + runLen -= fill; + } + + if (runLen >= 8) { + // Buffer is empty now, emit RLE run for the remaining + endPreviousBitPackedRun(); + BytesUtils.writeUnsignedVarInt(runLen << 1, baos); + BytesUtils.writeIntLittleEndianPaddedOnBitWidth(baos, intVal, bitWidth); + } else { + // Buffer values for bit-packing + for (int i = 0; i < runLen; i++) { + bufferedValues[numBufferedValues] = intVal; + numBufferedValues++; + if (numBufferedValues == 8) { + writeOrAppendBitPackedRun(); + } + } + } + } + + // Update state so toBytes() handles the tail correctly + repeatCount = 0; + if (numBufferedValues > 0) { + previousValue = bufferedValues[numBufferedValues - 1]; + } + } + /** * Reset this encoder for re-use */ diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridValuesReader.java b/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridValuesReader.java index 0bd5a18d2b..9ee70add6d 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridValuesReader.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridValuesReader.java @@ -54,6 +54,24 @@ public int readInteger() { } } + @Override + public void readIntegers(int[] dest, int offset, int count) { + try { + decoder.readInts(dest, offset, count); + } catch (IOException e) { + throw new ParquetDecodingException(e); + } + } + + @Override + public void readBooleans(boolean[] dest, int offset, int count) { + try { + decoder.readBooleans(dest, offset, count); + } catch (IOException e) { + throw new ParquetDecodingException(e); + } + } + @Override public boolean readBoolean() { return readInteger() == 0 ? false : true; diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridValuesWriter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridValuesWriter.java index e869b0f2a3..b6609b1d43 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridValuesWriter.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridValuesWriter.java @@ -52,6 +52,15 @@ public void writeBoolean(boolean v) { writeInteger(v ? 
1 : 0); } + @Override + public void writeBooleans(boolean[] values, int offset, int length) { + try { + encoder.writeBooleans(values, offset, length); + } catch (IOException e) { + throw new ParquetEncodingException(e); + } + } + @Override public long getBufferedSize() { return encoder.getBufferedSize(); diff --git a/parquet-common/src/main/java/org/apache/parquet/bytes/CapacityByteArrayOutputStream.java b/parquet-common/src/main/java/org/apache/parquet/bytes/CapacityByteArrayOutputStream.java index d3d8b1b6de..7dbe22a6b3 100644 --- a/parquet-common/src/main/java/org/apache/parquet/bytes/CapacityByteArrayOutputStream.java +++ b/parquet-common/src/main/java/org/apache/parquet/bytes/CapacityByteArrayOutputStream.java @@ -27,6 +27,7 @@ import java.io.IOException; import java.io.OutputStream; import java.nio.ByteBuffer; +import java.nio.ByteOrder; import java.util.ArrayList; import java.util.List; import org.apache.parquet.OutputStreamCloseException; @@ -201,6 +202,7 @@ private void addSlab(int minimumSize) { LOG.debug("used {} slabs, adding new slab of size {}", slabs.size(), nextSlabSize); this.currentSlab = allocator.allocate(nextSlabSize); + this.currentSlab.order(ByteOrder.LITTLE_ENDIAN); this.slabs.add(currentSlab); this.bytesAllocated = Math.addExact(this.bytesAllocated, nextSlabSize); } @@ -232,6 +234,114 @@ public void write(byte b[], int off, int len) { bytesUsed = Math.addExact(bytesUsed, len); } + /** + * Writes multiple int values in little-endian byte order using bulk {@code IntBuffer} transfer. + * Amortizes capacity checks across the entire batch and leverages platform-optimized bulk put. + * + * @param values source array + * @param offset start index in values + * @param length number of ints to write + */ + public void writeInts(int[] values, int offset, int length) { + int bytesNeeded = length * 4; + if (bytesNeeded <= currentSlab.remaining()) { + currentSlab.asIntBuffer().put(values, offset, length); + currentSlab.position(currentSlab.position() + bytesNeeded); + } else { + // Fill current slab, then continue into a new one + int fits = currentSlab.remaining() / 4; + if (fits > 0) { + currentSlab.asIntBuffer().put(values, offset, fits); + currentSlab.position(currentSlab.position() + fits * 4); + } + int remaining = length - fits; + addSlab(remaining * 4); + currentSlab.asIntBuffer().put(values, offset + fits, remaining); + currentSlab.position(currentSlab.position() + remaining * 4); + } + bytesUsed = Math.addExact(bytesUsed, bytesNeeded); + } + + /** + * Writes multiple long values in little-endian byte order using bulk {@code LongBuffer} transfer. + * + * @param values source array + * @param offset start index in values + * @param length number of longs to write + */ + public void writeLongs(long[] values, int offset, int length) { + int bytesNeeded = length * 8; + if (bytesNeeded <= currentSlab.remaining()) { + currentSlab.asLongBuffer().put(values, offset, length); + currentSlab.position(currentSlab.position() + bytesNeeded); + } else { + int fits = currentSlab.remaining() / 8; + if (fits > 0) { + currentSlab.asLongBuffer().put(values, offset, fits); + currentSlab.position(currentSlab.position() + fits * 8); + } + int remaining = length - fits; + addSlab(remaining * 8); + currentSlab.asLongBuffer().put(values, offset + fits, remaining); + currentSlab.position(currentSlab.position() + remaining * 8); + } + bytesUsed = Math.addExact(bytesUsed, bytesNeeded); + } + + /** + * Writes multiple float values in little-endian byte order using bulk {@code FloatBuffer} transfer. 
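Because `addSlab` now switches every slab to little-endian order, the bulk `FloatBuffer`/`DoubleBuffer` views write exactly the bytes the scalar `Float.floatToIntBits` path would. A standalone check of that equivalence (not from the patch):

```java
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.Arrays;

public class LeFloatCheck {
  public static void main(String[] args) {
    float v = 3.14f;

    ByteBuffer viaView = ByteBuffer.allocate(4).order(ByteOrder.LITTLE_ENDIAN);
    viaView.asFloatBuffer().put(v);           // bulk path: the view inherits LE order

    ByteBuffer viaBits = ByteBuffer.allocate(4).order(ByteOrder.LITTLE_ENDIAN);
    viaBits.putInt(Float.floatToIntBits(v));  // scalar path used by writeFloat

    System.out.println(Arrays.equals(viaView.array(), viaBits.array())); // true
  }
}
```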
+ * The slab's LE byte order ensures correct IEEE 754 encoding without explicit + * {@code Float.floatToIntBits()} conversion. + * + * @param values source array + * @param offset start index in values + * @param length number of floats to write + */ + public void writeFloats(float[] values, int offset, int length) { + int bytesNeeded = length * 4; + if (bytesNeeded <= currentSlab.remaining()) { + currentSlab.asFloatBuffer().put(values, offset, length); + currentSlab.position(currentSlab.position() + bytesNeeded); + } else { + int fits = currentSlab.remaining() / 4; + if (fits > 0) { + currentSlab.asFloatBuffer().put(values, offset, fits); + currentSlab.position(currentSlab.position() + fits * 4); + } + int remaining = length - fits; + addSlab(remaining * 4); + currentSlab.asFloatBuffer().put(values, offset + fits, remaining); + currentSlab.position(currentSlab.position() + remaining * 4); + } + bytesUsed = Math.addExact(bytesUsed, bytesNeeded); + } + + /** + * Writes multiple double values in little-endian byte order using bulk {@code DoubleBuffer} transfer. + * + * @param values source array + * @param offset start index in values + * @param length number of doubles to write + */ + public void writeDoubles(double[] values, int offset, int length) { + int bytesNeeded = length * 8; + if (bytesNeeded <= currentSlab.remaining()) { + currentSlab.asDoubleBuffer().put(values, offset, length); + currentSlab.position(currentSlab.position() + bytesNeeded); + } else { + int fits = currentSlab.remaining() / 8; + if (fits > 0) { + currentSlab.asDoubleBuffer().put(values, offset, fits); + currentSlab.position(currentSlab.position() + fits * 8); + } + int remaining = length - fits; + addSlab(remaining * 8); + currentSlab.asDoubleBuffer().put(values, offset + fits, remaining); + currentSlab.position(currentSlab.position() + remaining * 8); + } + bytesUsed = Math.addExact(bytesUsed, bytesNeeded); + } + private void writeToOutput(OutputStream out, ByteBuffer buf, int len) throws IOException { if (buf.hasArray()) { out.write(buf.array(), buf.arrayOffset(), len); diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/DirectCodecFactory.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/DirectCodecFactory.java index b2b5233eeb..c96c071e07 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/DirectCodecFactory.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/DirectCodecFactory.java @@ -21,6 +21,7 @@ import com.github.luben.zstd.ZstdCompressCtx; import com.github.luben.zstd.ZstdDecompressCtx; import java.io.IOException; +import java.lang.reflect.Constructor; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.nio.ByteBuffer; @@ -61,6 +62,14 @@ class DirectCodecFactory extends CodecFactory implements AutoCloseable { private static final Method DECOMPRESS_METHOD; private static final Method CREATE_DIRECT_DECOMPRESSOR_METHOD; + // Brotli JNI bypass via reflection (brotli-codec is a runtime-only dependency) + private static final boolean BROTLI_NATIVE_AVAILABLE; + private static final Method BROTLI_DECOMPRESS_METHOD; // BrotliDeCompressor.deCompress(ByteBuffer, ByteBuffer) + private static final Method BROTLI_COMPRESS_METHOD; // BrotliCompressor.compress(Parameter, ByteBuffer, ByteBuffer) + private static final Constructor BROTLI_DECOMPRESSOR_CTOR; // BrotliDeCompressor() + private static final Constructor BROTLI_COMPRESSOR_CTOR; // BrotliCompressor() + private static final Object BROTLI_COMPRESS_PARAMETER; // 
Brotli.Parameter instance (quality=1) + static { Class tempClass = null; Method tempCreateMethod = null; @@ -76,6 +85,46 @@ class DirectCodecFactory extends CodecFactory implements AutoCloseable { DIRECT_DECOMPRESSION_CODEC_CLASS = tempClass; CREATE_DIRECT_DECOMPRESSOR_METHOD = tempCreateMethod; DECOMPRESS_METHOD = tempDecompressMethod; + + // Initialize Brotli JNI bypass via reflection + boolean brotliLoaded = false; + Method brotliDecompress = null; + Method brotliCompress = null; + Constructor brotliDecompressorCtor = null; + Constructor brotliCompressorCtor = null; + Object brotliParam = null; + try { + // Load native library + Class loaderClass = Class.forName("org.meteogroup.jbrotli.libloader.BrotliLibraryLoader"); + loaderClass.getMethod("loadBrotli").invoke(null); + + // BrotliDeCompressor: no-arg ctor + deCompress(ByteBuffer, ByteBuffer) -> int + Class decompClass = Class.forName("org.meteogroup.jbrotli.BrotliDeCompressor"); + brotliDecompressorCtor = decompClass.getConstructor(); + brotliDecompress = decompClass.getMethod("deCompress", ByteBuffer.class, ByteBuffer.class); + + // BrotliCompressor: no-arg ctor + compress(Parameter, ByteBuffer, ByteBuffer) -> int + Class compClass = Class.forName("org.meteogroup.jbrotli.BrotliCompressor"); + Class paramClass = Class.forName("org.meteogroup.jbrotli.Brotli$Parameter"); + Class modeClass = Class.forName("org.meteogroup.jbrotli.Brotli$Mode"); + brotliCompressorCtor = compClass.getConstructor(); + brotliCompress = compClass.getMethod("compress", paramClass, ByteBuffer.class, ByteBuffer.class); + + // Create Parameter(Mode.GENERIC, quality=1, lgwin=22, lgblock=0) + Object genericMode = modeClass.getField("GENERIC").get(null); + Constructor paramCtor = paramClass.getConstructor(modeClass, int.class, int.class, int.class); + brotliParam = paramCtor.newInstance(genericMode, 1, 22, 0); + + brotliLoaded = true; + } catch (Throwable t) { + LOG.debug("Brotli native library not available, falling back to Hadoop codec", t); + } + BROTLI_NATIVE_AVAILABLE = brotliLoaded; + BROTLI_DECOMPRESS_METHOD = brotliDecompress; + BROTLI_COMPRESS_METHOD = brotliCompress; + BROTLI_DECOMPRESSOR_CTOR = brotliDecompressorCtor; + BROTLI_COMPRESSOR_CTOR = brotliCompressorCtor; + BROTLI_COMPRESS_PARAMETER = brotliParam; } /** @@ -103,8 +152,13 @@ protected BytesCompressor createCompressor(final CompressionCodecName codecName) return new SnappyCompressor(); case ZSTD: return new ZstdCompressor(); - // todo: create class similar to the SnappyCompressor for zlib and exclude it as - // snappy is above since it also generates allocateDirect calls. 
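For readers unfamiliar with jbrotli, the reflective plumbing above resolves to the following direct calls when brotli-codec is on the classpath. This is a comment sketch rather than compilable code, since the dependency is runtime-only; the names are taken verbatim from the `Class.forName`/`getMethod` strings in the initializer.

```java
// Equivalent direct calls, for reference:
//
//   org.meteogroup.jbrotli.libloader.BrotliLibraryLoader.loadBrotli();
//
//   BrotliDeCompressor decompressor = new BrotliDeCompressor();
//   int written = decompressor.deCompress(inputBuf, outputBuf);
//
//   Brotli.Parameter param = new Brotli.Parameter(Brotli.Mode.GENERIC, 1, 22, 0);
//   BrotliCompressor compressor = new BrotliCompressor();
//   int size = compressor.compress(param, inputBuf, outputBuf);
//
// Keeping these behind reflection lets parquet-hadoop compile without the
// runtime-only dependency and degrade gracefully when the JNI library fails
// to load (the pom guards the dependency behind a non-aarch64 profile for
// the same reason).
```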
+ case LZ4_RAW: + return new Lz4RawCompressor(); + case BROTLI: + if (BROTLI_NATIVE_AVAILABLE) { + return new BrotliDirectCompressor(); + } + return super.createCompressor(codecName); default: return super.createCompressor(codecName); } @@ -117,6 +171,16 @@ protected BytesDecompressor createDecompressor(final CompressionCodecName codecN return new SnappyDecompressor(); case ZSTD: return new ZstdDecompressor(); + case LZ4_RAW: + return new Lz4RawDecompressor(); + case BROTLI: + if (BROTLI_NATIVE_AVAILABLE) { + return new BrotliDirectDecompressor(); + } + // fall through to the default Hadoop codec path + case GZIP: + case UNCOMPRESSED: + return super.createDecompressor(codecName); default: CompressionCodec codec = getCodec(codecName); if (codec == null) { @@ -437,6 +501,133 @@ void closeCompressor() { } } + /** + * Direct-memory LZ4_RAW decompressor using airlift's LZ4 decompressor with + * direct ByteBuffers, avoiding the reflection-based {@link FullDirectDecompressor}. + */ + private class Lz4RawDecompressor extends BaseDecompressor { + private final io.airlift.compress.lz4.Lz4Decompressor decompressor = + new io.airlift.compress.lz4.Lz4Decompressor(); + + @Override + int decompress(ByteBuffer input, ByteBuffer output) { + decompressor.decompress(input, output); + return output.position(); + } + + @Override + void closeDecompressor() { + // no-op + } + } + + /** + * Direct-memory LZ4_RAW compressor using airlift's LZ4 compressor with + * direct ByteBuffers, avoiding the stream-based heap path. + */ + private class Lz4RawCompressor extends BaseCompressor { + private final io.airlift.compress.lz4.Lz4Compressor compressor = new io.airlift.compress.lz4.Lz4Compressor(); + + @Override + public CompressionCodecName getCodecName() { + return CompressionCodecName.LZ4_RAW; + } + + @Override + int maxCompressedSize(int size) { + return compressor.maxCompressedLength(size); + } + + @Override + int compress(ByteBuffer input, ByteBuffer output) { + compressor.compress(input, output); + return output.position(); + } + + @Override + void closeCompressor() { + // no-op + } + } + + /** + * Direct-memory Brotli decompressor using jbrotli's native JNI bindings via reflection, + * bypassing the Hadoop BrotliCodec/stream wrapper overhead. + */ + private class BrotliDirectDecompressor extends BaseDecompressor { + private final Object decompressor; + + BrotliDirectDecompressor() { + try { + this.decompressor = BROTLI_DECOMPRESSOR_CTOR.newInstance(); + } catch (ReflectiveOperationException e) { + throw new DirectCodecPool.ParquetCompressionCodecException("Failed to create Brotli decompressor", e); + } + } + + @Override + int decompress(ByteBuffer input, ByteBuffer output) throws IOException { + try { + return (int) BROTLI_DECOMPRESS_METHOD.invoke(decompressor, input, output); + } catch (InvocationTargetException e) { + throw new IOException("Brotli decompression failed", e.getCause()); + } catch (IllegalAccessException e) { + throw new IOException("Brotli decompression failed", e); + } + } + + @Override + void closeDecompressor() { + // no-op: BrotliDeCompressor has no resources to release + } + }
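As a sanity check of the airlift calls the `Lz4RawCompressor`/`Lz4RawDecompressor` pair makes, here is a minimal self-contained round-trip using direct buffers, the way the factory does. It assumes only the aircompressor dependency; buffer flips are the caller's responsibility, as in `BaseCompressor`/`BaseDecompressor`.

```java
import io.airlift.compress.lz4.Lz4Compressor;
import io.airlift.compress.lz4.Lz4Decompressor;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;

public class Lz4RawRoundTrip {
  public static void main(String[] args) {
    byte[] data = "hello hello hello hello".getBytes(StandardCharsets.UTF_8);

    Lz4Compressor compressor = new Lz4Compressor();
    ByteBuffer input = ByteBuffer.allocateDirect(data.length);
    input.put(data).flip();
    ByteBuffer compressed =
        ByteBuffer.allocateDirect(compressor.maxCompressedLength(data.length));
    compressor.compress(input, compressed);   // advances compressed.position()
    compressed.flip();

    ByteBuffer output = ByteBuffer.allocateDirect(data.length);
    new Lz4Decompressor().decompress(compressed, output);
    output.flip();
    System.out.println(output.remaining() == data.length); // true
  }
}
```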
+ + /** + * Direct-memory Brotli compressor using jbrotli's native JNI bindings via reflection, + * bypassing the Hadoop BrotliCodec/stream wrapper overhead. + * Uses quality=1 by default (fast compression, matching Hadoop's BrotliCompressor default). + */ + private class BrotliDirectCompressor extends BaseCompressor { + private final Object compressor; + + BrotliDirectCompressor() { + try { + this.compressor = BROTLI_COMPRESSOR_CTOR.newInstance(); + } catch (ReflectiveOperationException e) { + throw new DirectCodecPool.ParquetCompressionCodecException("Failed to create Brotli compressor", e); + } + } + + @Override + public CompressionCodecName getCodecName() { + return CompressionCodecName.BROTLI; + } + + @Override + int maxCompressedSize(int size) { + // Reserve input size + (input size >> 2) + 1 KiB of headroom for small inputs. + // This is deliberately looser than brotli's documented worst-case expansion, + // so the scratch buffer can never be too small. + return size + (size >> 2) + 1024; + } + + @Override + int compress(ByteBuffer input, ByteBuffer output) throws IOException { + try { + return (int) BROTLI_COMPRESS_METHOD.invoke(compressor, BROTLI_COMPRESS_PARAMETER, input, output); + } catch (InvocationTargetException e) { + throw new IOException("Brotli compression failed", e.getCause()); + } catch (IllegalAccessException e) { + throw new IOException("Brotli compression failed", e); + } + } + + @Override + void closeCompressor() { + // no-op: BrotliCompressor has no resources to release + } + } + /** * @deprecated Use {@link CodecFactory#NO_OP_COMPRESSOR} instead */
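To make the headroom of that bound concrete, a quick back-of-the-envelope check (plain Java, nothing from the patch):

```java
public class BrotliBoundCheck {
  public static void main(String[] args) {
    int size = 1 << 20;                     // a 1 MiB input page
    int bound = size + (size >> 2) + 1024;  // same formula as maxCompressedSize
    System.out.println(bound);              // 1311744 bytes: ~25% headroom + 1 KiB
  }
}
```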