From 184fd8432c413bf44c80c7b5dbf7d9b1bdbeb18b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Isma=C3=ABl=20Mej=C3=ADa?=
Date: Sun, 19 Apr 2026 20:08:17 +0000
Subject: [PATCH 01/27] GH-3511: Add JMH encoding benchmarks and fix parquet-benchmarks shaded jar

The parquet-benchmarks pom is missing the JMH annotation-processor
configuration and the AppendingTransformer entries for BenchmarkList /
CompilerHints. As a result, the shaded jar built from master fails at
runtime with "Unable to find the resource: /META-INF/BenchmarkList".

This commit:

- Fixes parquet-benchmarks/pom.xml so the shaded jar is runnable: adds
  jmh-generator-annprocess to maven-compiler-plugin's annotation processor
  paths, and adds AppendingTransformer entries for META-INF/BenchmarkList
  and META-INF/CompilerHints to the shade plugin.

- Adds 9 JMH benchmark classes covering the encode/decode paths used by
  the pending performance optimization PRs (#3494, #3496, #3500, #3504,
  #3506, #3510), so reviewers can reproduce the reported numbers and
  detect regressions: IntEncodingBenchmark, BinaryEncodingBenchmark,
  ByteStreamSplitEncodingBenchmark, ByteStreamSplitDecodingBenchmark,
  FixedLenByteArrayEncodingBenchmark, FileReadBenchmark,
  FileWriteBenchmark, RowGroupFlushBenchmark, and
  ConcurrentReadWriteBenchmark, plus the BlackHoleOutputFile and
  TestDataFactory support classes.

After this change the shaded jar registers 87 benchmarks (previously a
default build produced a jar that could not run at all, and even a
working build registered 0).
---
 parquet-benchmarks/pom.xml                    |  18 ++
 .../benchmarks/BinaryEncodingBenchmark.java   | 182 +++++++++++++
 .../benchmarks/BlackHoleOutputFile.java       |  76 ++++++
 .../ByteStreamSplitDecodingBenchmark.java     | 170 ++++++++++++
 .../ByteStreamSplitEncodingBenchmark.java     | 131 ++++++++++
 .../ConcurrentReadWriteBenchmark.java         | 135 ++++++++++
 .../parquet/benchmarks/FileReadBenchmark.java | 119 +++++++++
 .../benchmarks/FileWriteBenchmark.java        |  82 ++++++
 .../FixedLenByteArrayEncodingBenchmark.java   |  89 +++++++
 .../benchmarks/IntEncodingBenchmark.java      | 244 ++++++++++++++++++
 .../benchmarks/RowGroupFlushBenchmark.java    | 191 ++++++++++++++
 .../parquet/benchmarks/TestDataFactory.java   | 175 +++++++++++++
 12 files changed, 1612 insertions(+)
 create mode 100644 parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BinaryEncodingBenchmark.java
 create mode 100644 parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BlackHoleOutputFile.java
 create mode 100644 parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ByteStreamSplitDecodingBenchmark.java
 create mode 100644 parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ByteStreamSplitEncodingBenchmark.java
 create mode 100644 parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ConcurrentReadWriteBenchmark.java
 create mode 100644 parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileReadBenchmark.java
 create mode 100644 parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileWriteBenchmark.java
 create mode 100644 parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FixedLenByteArrayEncodingBenchmark.java
 create mode 100644 parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/IntEncodingBenchmark.java
 create mode 100644 parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/RowGroupFlushBenchmark.java
 create mode 100644 parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/TestDataFactory.java

diff --git a/parquet-benchmarks/pom.xml b/parquet-benchmarks/pom.xml
index d5a288b677..8d5a1253bf 100644
--- a/parquet-benchmarks/pom.xml
+++
b/parquet-benchmarks/pom.xml @@ -94,6 +94,18 @@ org.apache.maven.plugins maven-compiler-plugin + + + + org.openjdk.jmh + jmh-generator-annprocess + ${jmh.version} + + + + org.openjdk.jmh.generators.BenchmarkProcessor + + org.apache.maven.plugins @@ -112,6 +124,12 @@ org.openjdk.jmh.Main + + META-INF/BenchmarkList + + + META-INF/CompilerHints + diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BinaryEncodingBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BinaryEncodingBenchmark.java new file mode 100644 index 0000000000..7added9717 --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BinaryEncodingBenchmark.java @@ -0,0 +1,182 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.benchmarks; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.bytes.HeapByteBufferAllocator; +import org.apache.parquet.column.Encoding; +import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.column.values.deltalengthbytearray.DeltaLengthByteArrayValuesReader; +import org.apache.parquet.column.values.deltalengthbytearray.DeltaLengthByteArrayValuesWriter; +import org.apache.parquet.column.values.deltastrings.DeltaByteArrayReader; +import org.apache.parquet.column.values.deltastrings.DeltaByteArrayWriter; +import org.apache.parquet.column.values.dictionary.DictionaryValuesWriter; +import org.apache.parquet.column.values.plain.BinaryPlainValuesReader; +import org.apache.parquet.column.values.plain.PlainValuesWriter; +import org.apache.parquet.io.api.Binary; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OperationsPerInvocation; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +/** + * Encoding-level micro-benchmarks for BINARY values. + * Compares PLAIN, DELTA_BYTE_ARRAY, DELTA_LENGTH_BYTE_ARRAY, and DICTIONARY encodings + * across different string lengths and cardinality patterns. + * + *
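 + * <p>Minimal round-trip sketch of the kind of encode/decode pair measured here
 + * (illustrative only; it reuses the writer/reader classes benchmarked below):
 + * <pre>{@code
 + * PlainValuesWriter w = new PlainValuesWriter(64 * 1024, 4 * 1024 * 1024, new HeapByteBufferAllocator());
 + * w.writeBytes(Binary.fromString("abc"));
 + * byte[] page = w.getBytes().toByteArray(); // PLAIN: 4-byte little-endian length prefix + raw bytes
 + * BinaryPlainValuesReader r = new BinaryPlainValuesReader();
 + * r.initFromPage(1, ByteBufferInputStream.wrap(ByteBuffer.wrap(page)));
 + * Binary roundTripped = r.readBytes();
 + * }</pre>
 + *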
<p>
Each benchmark invocation processes {@value #VALUE_COUNT} values. Throughput is + * reported per-value using {@link OperationsPerInvocation}. + */ +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(TimeUnit.SECONDS) +@Fork(1) +@Warmup(iterations = 3, time = 1) +@Measurement(iterations = 5, time = 1) +@State(Scope.Thread) +public class BinaryEncodingBenchmark { + + static final int VALUE_COUNT = 100_000; + private static final int INIT_SLAB_SIZE = 64 * 1024; + private static final int PAGE_SIZE = 4 * 1024 * 1024; + private static final int MAX_DICT_BYTE_SIZE = 4 * 1024 * 1024; + + @Param({"10", "100", "1000"}) + public int stringLength; + + /** LOW = 100 distinct values; HIGH = all unique. */ + @Param({"LOW", "HIGH"}) + public String cardinality; + + private Binary[] data; + private byte[] plainEncoded; + private byte[] deltaLengthEncoded; + private byte[] deltaStringsEncoded; + + @Setup(Level.Trial) + public void setup() throws IOException { + Random random = new Random(42); + int distinct = "LOW".equals(cardinality) ? TestDataFactory.LOW_CARDINALITY_DISTINCT : 0; + data = TestDataFactory.generateBinaryData(VALUE_COUNT, stringLength, distinct, random); + + // Pre-encode data for decode benchmarks + plainEncoded = encodeBinaryWith(newPlainWriter()); + deltaLengthEncoded = encodeBinaryWith(newDeltaLengthWriter()); + deltaStringsEncoded = encodeBinaryWith(newDeltaStringsWriter()); + } + + private byte[] encodeBinaryWith(ValuesWriter writer) throws IOException { + for (Binary v : data) { + writer.writeBytes(v); + } + byte[] bytes = writer.getBytes().toByteArray(); + writer.close(); + return bytes; + } + + // ---- Writer factories ---- + + private static PlainValuesWriter newPlainWriter() { + return new PlainValuesWriter(INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + } + + private static DeltaLengthByteArrayValuesWriter newDeltaLengthWriter() { + return new DeltaLengthByteArrayValuesWriter(INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + } + + private static DeltaByteArrayWriter newDeltaStringsWriter() { + return new DeltaByteArrayWriter(INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + } + + private static DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter newDictWriter() { + return new DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter( + MAX_DICT_BYTE_SIZE, Encoding.PLAIN_DICTIONARY, Encoding.PLAIN, new HeapByteBufferAllocator()); + } + + // ---- Encode benchmarks ---- + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodePlain() throws IOException { + return encodeBinaryWith(newPlainWriter()); + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeDeltaLengthByteArray() throws IOException { + return encodeBinaryWith(newDeltaLengthWriter()); + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeDeltaByteArray() throws IOException { + return encodeBinaryWith(newDeltaStringsWriter()); + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeDictionary() throws IOException { + return encodeBinaryWith(newDictWriter()); + } + + // ---- Decode benchmarks ---- + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodePlain(Blackhole bh) throws IOException { + BinaryPlainValuesReader reader = new BinaryPlainValuesReader(); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(plainEncoded))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(reader.readBytes()); + } + } + + @Benchmark + 
@OperationsPerInvocation(VALUE_COUNT) + public void decodeDeltaLengthByteArray(Blackhole bh) throws IOException { + DeltaLengthByteArrayValuesReader reader = new DeltaLengthByteArrayValuesReader(); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(deltaLengthEncoded))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(reader.readBytes()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeDeltaByteArray(Blackhole bh) throws IOException { + DeltaByteArrayReader reader = new DeltaByteArrayReader(); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(deltaStringsEncoded))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(reader.readBytes()); + } + } +} diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BlackHoleOutputFile.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BlackHoleOutputFile.java new file mode 100644 index 0000000000..690ddc2bbe --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BlackHoleOutputFile.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.benchmarks; + +import java.io.IOException; +import org.apache.parquet.io.OutputFile; +import org.apache.parquet.io.PositionOutputStream; + +/** + * A no-op {@link OutputFile} that discards all written data. + * Useful for isolating CPU/encoding cost from filesystem I/O in write benchmarks. 
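 + *
 + * <p>Sketch of the intended use, mirroring the file-level write benchmarks
 + * ({@code schema} stands in for any caller-supplied MessageType):
 + * <pre>{@code
 + * ParquetWriter<Group> writer = ExampleParquetWriter.builder(BlackHoleOutputFile.INSTANCE)
 + *     .withType(schema)
 + *     .build();
 + * }</pre>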
+ */ +public final class BlackHoleOutputFile implements OutputFile { + + public static final BlackHoleOutputFile INSTANCE = new BlackHoleOutputFile(); + + private BlackHoleOutputFile() {} + + @Override + public boolean supportsBlockSize() { + return false; + } + + @Override + public long defaultBlockSize() { + return -1L; + } + + @Override + public PositionOutputStream createOrOverwrite(long blockSizeHint) { + return create(blockSizeHint); + } + + @Override + public PositionOutputStream create(long blockSizeHint) { + return new PositionOutputStream() { + private long pos; + + @Override + public long getPos() throws IOException { + return pos; + } + + @Override + public void write(int b) throws IOException { + ++pos; + } + + @Override + public void write(byte[] b, int off, int len) throws IOException { + pos += len; + } + }; + } + + @Override + public String getPath() { + return "/dev/null"; + } +} diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ByteStreamSplitDecodingBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ByteStreamSplitDecodingBenchmark.java new file mode 100644 index 0000000000..e59b7ba941 --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ByteStreamSplitDecodingBenchmark.java @@ -0,0 +1,170 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.benchmarks; + +import java.io.IOException; +import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.bytes.HeapByteBufferAllocator; +import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.column.values.bytestreamsplit.ByteStreamSplitValuesReader; +import org.apache.parquet.column.values.bytestreamsplit.ByteStreamSplitValuesReaderForDouble; +import org.apache.parquet.column.values.bytestreamsplit.ByteStreamSplitValuesReaderForFloat; +import org.apache.parquet.column.values.bytestreamsplit.ByteStreamSplitValuesReaderForInteger; +import org.apache.parquet.column.values.bytestreamsplit.ByteStreamSplitValuesReaderForLong; +import org.apache.parquet.column.values.bytestreamsplit.ByteStreamSplitValuesWriter; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OperationsPerInvocation; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +/** + * Decoding-level micro-benchmarks for the BYTE_STREAM_SPLIT encoding across the four + * primitive widths supported by Parquet ({@code FLOAT}, {@code DOUBLE}, {@code INT32}, + * {@code INT64}). + * + *
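 + * <p>For reference, BYTE_STREAM_SPLIT places the k-th byte of every value into the
 + * k-th stream, e.g. for two FLOAT values:
 + * <pre>
 + *   values : v0 = [a0 a1 a2 a3], v1 = [b0 b1 b2 b3]
 + *   page   : a0 b0 | a1 b1 | a2 b2 | a3 b3   (four streams of two bytes each)
 + * </pre>
 + *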
<p>
Each invocation decodes {@value #VALUE_COUNT} values; throughput is reported + * per-value via {@link OperationsPerInvocation}. The cost includes both + * {@code initFromPage} (which eagerly transposes the entire page) and the per-value + * read calls. Page transposition is the part this benchmark is primarily designed + * to exercise. + */ +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(TimeUnit.SECONDS) +@Fork(1) +@Warmup(iterations = 3, time = 1) +@Measurement(iterations = 5, time = 1) +@State(Scope.Thread) +public class ByteStreamSplitDecodingBenchmark { + + static final int VALUE_COUNT = 100_000; + private static final int INIT_SLAB_SIZE = 64 * 1024; + private static final int PAGE_SIZE = 4 * 1024 * 1024; + + private byte[] floatPage; + private byte[] doublePage; + private byte[] intPage; + private byte[] longPage; + + @Setup(Level.Trial) + public void setup() throws IOException { + Random random = new Random(42); + int[] intData = new int[VALUE_COUNT]; + long[] longData = new long[VALUE_COUNT]; + float[] floatData = new float[VALUE_COUNT]; + double[] doubleData = new double[VALUE_COUNT]; + for (int i = 0; i < VALUE_COUNT; i++) { + intData[i] = random.nextInt(); + longData[i] = random.nextLong(); + floatData[i] = random.nextFloat(); + doubleData[i] = random.nextDouble(); + } + + { + ValuesWriter w = new ByteStreamSplitValuesWriter.FloatByteStreamSplitValuesWriter( + INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + for (float v : floatData) { + w.writeFloat(v); + } + floatPage = w.getBytes().toByteArray(); + w.close(); + } + { + ValuesWriter w = new ByteStreamSplitValuesWriter.DoubleByteStreamSplitValuesWriter( + INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + for (double v : doubleData) { + w.writeDouble(v); + } + doublePage = w.getBytes().toByteArray(); + w.close(); + } + { + ValuesWriter w = new ByteStreamSplitValuesWriter.IntegerByteStreamSplitValuesWriter( + INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + for (int v : intData) { + w.writeInteger(v); + } + intPage = w.getBytes().toByteArray(); + w.close(); + } + { + ValuesWriter w = new ByteStreamSplitValuesWriter.LongByteStreamSplitValuesWriter( + INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + for (long v : longData) { + w.writeLong(v); + } + longPage = w.getBytes().toByteArray(); + w.close(); + } + } + + private static void init(ByteStreamSplitValuesReader r, byte[] page) throws IOException { + r.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(java.nio.ByteBuffer.wrap(page))); + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeFloat(Blackhole bh) throws IOException { + ByteStreamSplitValuesReaderForFloat r = new ByteStreamSplitValuesReaderForFloat(); + init(r, floatPage); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(r.readFloat()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeDouble(Blackhole bh) throws IOException { + ByteStreamSplitValuesReaderForDouble r = new ByteStreamSplitValuesReaderForDouble(); + init(r, doublePage); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(r.readDouble()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeInt(Blackhole bh) throws IOException { + ByteStreamSplitValuesReaderForInteger r = new ByteStreamSplitValuesReaderForInteger(); + init(r, intPage); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(r.readInteger()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeLong(Blackhole bh) 
throws IOException { + ByteStreamSplitValuesReaderForLong r = new ByteStreamSplitValuesReaderForLong(); + init(r, longPage); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(r.readLong()); + } + } +} diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ByteStreamSplitEncodingBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ByteStreamSplitEncodingBenchmark.java new file mode 100644 index 0000000000..37ec9df812 --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ByteStreamSplitEncodingBenchmark.java @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.benchmarks; + +import java.io.IOException; +import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.parquet.bytes.HeapByteBufferAllocator; +import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.column.values.bytestreamsplit.ByteStreamSplitValuesWriter; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OperationsPerInvocation; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; + +/** + * Encoding-level micro-benchmarks for the BYTE_STREAM_SPLIT encoding across the four + * primitive widths supported by Parquet ({@code FLOAT}, {@code DOUBLE}, {@code INT32}, + * {@code INT64}). + * + *
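 + * <p>Illustrative invocation to run only these benchmarks from the shaded jar
 + * (the jar path depends on the local build):
 + * <pre>{@code
 + * java -jar parquet-benchmarks/target/parquet-benchmarks.jar "ByteStreamSplitEncodingBenchmark.*"
 + * }</pre>
 + *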
<p>
Each invocation encodes {@value #VALUE_COUNT} values; throughput is reported + * per-value via {@link OperationsPerInvocation}. + */ +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(TimeUnit.SECONDS) +@Fork(1) +@Warmup(iterations = 3, time = 1) +@Measurement(iterations = 5, time = 1) +@State(Scope.Thread) +public class ByteStreamSplitEncodingBenchmark { + + static final int VALUE_COUNT = 100_000; + private static final int INIT_SLAB_SIZE = 64 * 1024; + private static final int PAGE_SIZE = 4 * 1024 * 1024; + + private int[] intData; + private long[] longData; + private float[] floatData; + private double[] doubleData; + + @Setup(Level.Trial) + public void setup() { + Random random = new Random(42); + intData = new int[VALUE_COUNT]; + longData = new long[VALUE_COUNT]; + floatData = new float[VALUE_COUNT]; + doubleData = new double[VALUE_COUNT]; + for (int i = 0; i < VALUE_COUNT; i++) { + intData[i] = random.nextInt(); + longData[i] = random.nextLong(); + floatData[i] = random.nextFloat(); + doubleData[i] = random.nextDouble(); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeFloat() throws IOException { + ValuesWriter w = new ByteStreamSplitValuesWriter.FloatByteStreamSplitValuesWriter( + INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + for (float v : floatData) { + w.writeFloat(v); + } + byte[] bytes = w.getBytes().toByteArray(); + w.close(); + return bytes; + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeDouble() throws IOException { + ValuesWriter w = new ByteStreamSplitValuesWriter.DoubleByteStreamSplitValuesWriter( + INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + for (double v : doubleData) { + w.writeDouble(v); + } + byte[] bytes = w.getBytes().toByteArray(); + w.close(); + return bytes; + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeInt() throws IOException { + ValuesWriter w = new ByteStreamSplitValuesWriter.IntegerByteStreamSplitValuesWriter( + INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + for (int v : intData) { + w.writeInteger(v); + } + byte[] bytes = w.getBytes().toByteArray(); + w.close(); + return bytes; + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeLong() throws IOException { + ValuesWriter w = new ByteStreamSplitValuesWriter.LongByteStreamSplitValuesWriter( + INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + for (long v : longData) { + w.writeLong(v); + } + byte[] bytes = w.getBytes().toByteArray(); + w.close(); + return bytes; + } +} diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ConcurrentReadWriteBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ConcurrentReadWriteBenchmark.java new file mode 100644 index 0000000000..9c5d135eab --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ConcurrentReadWriteBenchmark.java @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.benchmarks; + +import java.io.File; +import java.io.IOException; +import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.parquet.example.data.Group; +import org.apache.parquet.example.data.simple.SimpleGroupFactory; +import org.apache.parquet.hadoop.ParquetFileWriter; +import org.apache.parquet.hadoop.ParquetReader; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.hadoop.api.ReadSupport; +import org.apache.parquet.hadoop.example.ExampleParquetWriter; +import org.apache.parquet.hadoop.example.GroupReadSupport; +import org.apache.parquet.io.InputFile; +import org.apache.parquet.io.LocalInputFile; +import org.apache.parquet.io.LocalOutputFile; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.TearDown; +import org.openjdk.jmh.annotations.Threads; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +/** + * Multi-threaded benchmarks to validate that read and write operations perform correctly + * under concurrency. Uses {@code @Threads(4)} by default (overridable via JMH {@code -t} flag). + * + *
 + * <ul>
 + *   <li>{@link #concurrentWrite()} - each thread independently writes to a shared
 + *       {@link BlackHoleOutputFile} (stateless sink)</li>
 + *   <li>{@link #concurrentRead(Blackhole)} - each thread independently reads the same
 + *       pre-generated Parquet file</li>
 + * </ul>
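 + *
 + * <p>Illustrative invocation overriding the default thread count
 + * (jar name depends on the local build):
 + * <pre>{@code
 + * java -jar parquet-benchmarks.jar "ConcurrentReadWriteBenchmark.*" -t 8
 + * }</pre>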
+ */ +@BenchmarkMode({Mode.SingleShotTime, Mode.AverageTime}) +@OutputTimeUnit(TimeUnit.MILLISECONDS) +@Fork(1) +@Warmup(iterations = 2, batchSize = 1) +@Measurement(iterations = 5, batchSize = 1) +@Threads(4) +@State(Scope.Benchmark) +public class ConcurrentReadWriteBenchmark { + + private File tempFile; + + @Setup(Level.Trial) + public void setup() throws IOException { + // Generate a shared file for concurrent reads + tempFile = File.createTempFile("parquet-concurrent-bench-", ".parquet"); + tempFile.deleteOnExit(); + tempFile.delete(); + + SimpleGroupFactory factory = TestDataFactory.newGroupFactory(); + Random random = new Random(42); + try (ParquetWriter writer = ExampleParquetWriter.builder(new LocalOutputFile(tempFile.toPath())) + .withWriteMode(ParquetFileWriter.Mode.OVERWRITE) + .withType(TestDataFactory.FILE_BENCHMARK_SCHEMA) + .build()) { + for (int i = 0; i < TestDataFactory.DEFAULT_ROW_COUNT; i++) { + writer.write(TestDataFactory.generateRow(factory, i, random)); + } + } + } + + @TearDown(Level.Trial) + public void tearDown() { + if (tempFile != null && tempFile.exists()) { + tempFile.delete(); + } + } + + /** + * Each thread writes a full file independently to the shared stateless + * {@link BlackHoleOutputFile} sink. + */ + @Benchmark + public void concurrentWrite() throws IOException { + SimpleGroupFactory factory = TestDataFactory.newGroupFactory(); + Random random = new Random(Thread.currentThread().getId()); + try (ParquetWriter writer = ExampleParquetWriter.builder(BlackHoleOutputFile.INSTANCE) + .withWriteMode(ParquetFileWriter.Mode.OVERWRITE) + .withType(TestDataFactory.FILE_BENCHMARK_SCHEMA) + .build()) { + for (int i = 0; i < TestDataFactory.DEFAULT_ROW_COUNT; i++) { + writer.write(TestDataFactory.generateRow(factory, i, random)); + } + } + } + + /** + * Each thread reads the full pre-generated file independently. + */ + @Benchmark + public void concurrentRead(Blackhole bh) throws IOException { + InputFile inputFile = new LocalInputFile(tempFile.toPath()); + try (ParquetReader reader = new ParquetReader.Builder(inputFile) { + @Override + protected ReadSupport getReadSupport() { + return new GroupReadSupport(); + } + }.build()) { + Group group; + while ((group = reader.read()) != null) { + bh.consume(group); + } + } + } +} diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileReadBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileReadBenchmark.java new file mode 100644 index 0000000000..7d5d0f5159 --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileReadBenchmark.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.benchmarks; + +import java.io.File; +import java.io.IOException; +import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.parquet.column.ParquetProperties.WriterVersion; +import org.apache.parquet.example.data.Group; +import org.apache.parquet.example.data.simple.SimpleGroupFactory; +import org.apache.parquet.hadoop.ParquetFileWriter; +import org.apache.parquet.hadoop.ParquetReader; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.hadoop.api.ReadSupport; +import org.apache.parquet.hadoop.example.ExampleParquetWriter; +import org.apache.parquet.hadoop.example.GroupReadSupport; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.apache.parquet.io.InputFile; +import org.apache.parquet.io.LocalInputFile; +import org.apache.parquet.io.LocalOutputFile; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.TearDown; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +/** + * File-level read benchmarks measuring throughput of the full Parquet read pipeline. + * A temporary file is generated during setup using {@link LocalOutputFile} (no Hadoop FS + * overhead on write side), then read repeatedly during the benchmark. + * + *
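 + * <p>Illustrative invocation pinning a single parameter combination
 + * (jar name depends on the local build):
 + * <pre>{@code
 + * java -jar parquet-benchmarks.jar "FileReadBenchmark.*" -p codec=ZSTD -p writerVersion=PARQUET_2_0
 + * }</pre>
 + *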
<p>
Parameterized across compression codec and writer version. + */ +@BenchmarkMode({Mode.SingleShotTime, Mode.AverageTime}) +@Fork(1) +@Warmup(iterations = 3, batchSize = 1) +@Measurement(iterations = 5, batchSize = 1) +@OutputTimeUnit(TimeUnit.MILLISECONDS) +@State(Scope.Benchmark) +public class FileReadBenchmark { + + @Param({"UNCOMPRESSED", "SNAPPY", "ZSTD", "GZIP"}) + public String codec; + + @Param({"PARQUET_1_0", "PARQUET_2_0"}) + public String writerVersion; + + private File tempFile; + + @Setup(Level.Trial) + public void setup() throws IOException { + tempFile = File.createTempFile("parquet-read-bench-", ".parquet"); + tempFile.deleteOnExit(); + tempFile.delete(); // remove so the writer can create it + + SimpleGroupFactory factory = TestDataFactory.newGroupFactory(); + Random random = new Random(42); + try (ParquetWriter writer = ExampleParquetWriter.builder(new LocalOutputFile(tempFile.toPath())) + .withWriteMode(ParquetFileWriter.Mode.OVERWRITE) + .withType(TestDataFactory.FILE_BENCHMARK_SCHEMA) + .withCompressionCodec(CompressionCodecName.valueOf(codec)) + .withWriterVersion(WriterVersion.valueOf(writerVersion)) + .withDictionaryEncoding(true) + .build()) { + for (int i = 0; i < TestDataFactory.DEFAULT_ROW_COUNT; i++) { + writer.write(TestDataFactory.generateRow(factory, i, random)); + } + } + } + + @TearDown(Level.Trial) + public void tearDown() { + if (tempFile != null && tempFile.exists()) { + tempFile.delete(); + } + } + + @Benchmark + public void readFile(Blackhole bh) throws IOException { + InputFile inputFile = new LocalInputFile(tempFile.toPath()); + try (ParquetReader reader = new ParquetReader.Builder(inputFile) { + @Override + protected ReadSupport getReadSupport() { + return new GroupReadSupport(); + } + }.build()) { + Group group; + while ((group = reader.read()) != null) { + bh.consume(group); + } + } + } +} diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileWriteBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileWriteBenchmark.java new file mode 100644 index 0000000000..60ac086504 --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileWriteBenchmark.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.benchmarks; + +import java.io.IOException; +import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.parquet.column.ParquetProperties.WriterVersion; +import org.apache.parquet.example.data.Group; +import org.apache.parquet.example.data.simple.SimpleGroupFactory; +import org.apache.parquet.hadoop.ParquetFileWriter; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.hadoop.example.ExampleParquetWriter; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; + +/** + * File-level write benchmarks measuring throughput of the full Parquet write pipeline. + * Writes are sent to a {@link BlackHoleOutputFile} to isolate CPU/encoding cost from + * filesystem I/O. + * + *
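 + * <p>Illustrative invocation adding the JMH GC profiler to observe allocation rates
 + * (jar name depends on the local build):
 + * <pre>{@code
 + * java -jar parquet-benchmarks.jar "FileWriteBenchmark.*" -p codec=UNCOMPRESSED -prof gc
 + * }</pre>
 + *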
<p>
Parameterized across compression codec, writer version, and dictionary encoding. + */ +@BenchmarkMode({Mode.SingleShotTime, Mode.AverageTime}) +@Fork(1) +@Warmup(iterations = 3, batchSize = 1) +@Measurement(iterations = 5, batchSize = 1) +@OutputTimeUnit(TimeUnit.MILLISECONDS) +@State(Scope.Benchmark) +public class FileWriteBenchmark { + + @Param({"UNCOMPRESSED", "SNAPPY", "ZSTD", "GZIP"}) + public String codec; + + @Param({"PARQUET_1_0", "PARQUET_2_0"}) + public String writerVersion; + + @Param({"true", "false"}) + public String dictionary; + + @Benchmark + public void writeFile() throws IOException { + SimpleGroupFactory factory = TestDataFactory.newGroupFactory(); + Random random = new Random(42); + try (ParquetWriter writer = ExampleParquetWriter.builder(BlackHoleOutputFile.INSTANCE) + .withWriteMode(ParquetFileWriter.Mode.OVERWRITE) + .withType(TestDataFactory.FILE_BENCHMARK_SCHEMA) + .withCompressionCodec(CompressionCodecName.valueOf(codec)) + .withWriterVersion(WriterVersion.valueOf(writerVersion)) + .withDictionaryEncoding(Boolean.parseBoolean(dictionary)) + .build()) { + for (int i = 0; i < TestDataFactory.DEFAULT_ROW_COUNT; i++) { + writer.write(TestDataFactory.generateRow(factory, i, random)); + } + } + } +} diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FixedLenByteArrayEncodingBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FixedLenByteArrayEncodingBenchmark.java new file mode 100644 index 0000000000..7bf9359c92 --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FixedLenByteArrayEncodingBenchmark.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.benchmarks; + +import java.io.IOException; +import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.parquet.bytes.HeapByteBufferAllocator; +import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.column.values.plain.FixedLenByteArrayPlainValuesWriter; +import org.apache.parquet.io.api.Binary; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OperationsPerInvocation; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; + +/** + * Encoding-level micro-benchmark for {@link FixedLenByteArrayPlainValuesWriter}. 
+ * Each input value has a fixed length matching the writer's configured length, so + * no length prefix is emitted -- the writer simply concatenates the raw bytes. + * + *
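 + * <p>Worked example of the resulting page layout:
 + * <pre>
 + *   fixedLength = 4, values = [DE AD BE EF], [CA FE BA BE]
 + *   page bytes  = DE AD BE EF CA FE BA BE   (8 bytes, no per-value length prefix)
 + * </pre>
 + *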
<p>
Each benchmark invocation processes {@value #VALUE_COUNT} values; throughput + * is reported per-value via {@link OperationsPerInvocation}. + */ +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(TimeUnit.SECONDS) +@Fork(1) +@Warmup(iterations = 3, time = 1) +@Measurement(iterations = 5, time = 1) +@State(Scope.Thread) +public class FixedLenByteArrayEncodingBenchmark { + + static final int VALUE_COUNT = 100_000; + private static final int INIT_SLAB_SIZE = 64 * 1024; + private static final int PAGE_SIZE = 4 * 1024 * 1024; + + @Param({"10", "100", "1000"}) + public int fixedLength; + + private Binary[] data; + + @Setup(Level.Trial) + public void setup() { + Random random = new Random(42); + // distinct=0 -> all unique values; each is exactly fixedLength bytes long. + data = TestDataFactory.generateBinaryData(VALUE_COUNT, fixedLength, 0, random); + } + + private byte[] encodeWith(ValuesWriter writer) throws IOException { + for (Binary v : data) { + writer.writeBytes(v); + } + byte[] bytes = writer.getBytes().toByteArray(); + writer.close(); + return bytes; + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeFixedLenPlain() throws IOException { + return encodeWith(new FixedLenByteArrayPlainValuesWriter( + fixedLength, INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator())); + } +} diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/IntEncodingBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/IntEncodingBenchmark.java new file mode 100644 index 0000000000..6ce4420e7e --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/IntEncodingBenchmark.java @@ -0,0 +1,244 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.benchmarks; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.bytes.BytesInput; +import org.apache.parquet.bytes.HeapByteBufferAllocator; +import org.apache.parquet.column.Dictionary; +import org.apache.parquet.column.Encoding; +import org.apache.parquet.column.page.DictionaryPage; +import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.column.values.bytestreamsplit.ByteStreamSplitValuesReaderForInteger; +import org.apache.parquet.column.values.bytestreamsplit.ByteStreamSplitValuesWriter; +import org.apache.parquet.column.values.delta.DeltaBinaryPackingValuesReader; +import org.apache.parquet.column.values.delta.DeltaBinaryPackingValuesWriterForInteger; +import org.apache.parquet.column.values.dictionary.DictionaryValuesReader; +import org.apache.parquet.column.values.dictionary.DictionaryValuesWriter; +import org.apache.parquet.column.values.dictionary.PlainValuesDictionary; +import org.apache.parquet.column.values.plain.PlainValuesReader; +import org.apache.parquet.column.values.plain.PlainValuesWriter; +import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridDecoder; +import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridEncoder; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OperationsPerInvocation; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +/** + * Encoding-level micro-benchmarks for INT32 values. + * Compares PLAIN, DELTA_BINARY_PACKED, BYTE_STREAM_SPLIT, and DICTIONARY encodings + * across different data distribution patterns. + * + *
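 + * <p>Why the data patterns matter (illustrative expectations, not measured claims):
 + * <pre>
 + *   SEQUENTIAL      : 0, 1, 2, ...        constant delta, so DELTA_BINARY_PACKED
 + *                                         bit-packs to almost 0 bits per value
 + *   LOW_CARDINALITY : 100 distinct values, so dictionary indices fit in 7 bits
 + *   RANDOM          : full 32-bit entropy, little for any encoding to exploit
 + * </pre>
 + *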
<p>
Each benchmark invocation processes {@value #VALUE_COUNT} values. Throughput is + * reported per-value using {@link OperationsPerInvocation}. + */ +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(TimeUnit.SECONDS) +@Fork(1) +@Warmup(iterations = 3, time = 1) +@Measurement(iterations = 5, time = 1) +@State(Scope.Thread) +public class IntEncodingBenchmark { + + static final int VALUE_COUNT = 100_000; + private static final int INIT_SLAB_SIZE = 64 * 1024; + private static final int PAGE_SIZE = 1024 * 1024; + private static final int MAX_DICT_BYTE_SIZE = 1024 * 1024; + + @Param({"SEQUENTIAL", "RANDOM", "LOW_CARDINALITY", "HIGH_CARDINALITY"}) + public String dataPattern; + + private int[] data; + private byte[] plainEncoded; + private byte[] deltaEncoded; + private byte[] bssEncoded; + private byte[] rleEncoded; + private int rleBitWidth; + private byte[] dictDataEncoded; + private Dictionary intDictionary; + + @Setup(Level.Trial) + public void setup() throws IOException { + Random random = new Random(42); + switch (dataPattern) { + case "SEQUENTIAL": + data = TestDataFactory.generateSequentialInts(VALUE_COUNT); + break; + case "RANDOM": + data = TestDataFactory.generateRandomInts(VALUE_COUNT, random); + break; + case "LOW_CARDINALITY": + data = TestDataFactory.generateLowCardinalityInts( + VALUE_COUNT, TestDataFactory.LOW_CARDINALITY_DISTINCT, random); + break; + case "HIGH_CARDINALITY": + data = TestDataFactory.generateHighCardinalityInts(VALUE_COUNT); + break; + default: + throw new IllegalArgumentException("Unknown data pattern: " + dataPattern); + } + + // Pre-encode data for decode benchmarks + plainEncoded = encodeWith(newPlainWriter()); + deltaEncoded = encodeWith(newDeltaWriter()); + bssEncoded = encodeWith(newBssWriter()); + + // Pre-encode RLE data (using 10-bit values to simulate dictionary indices) + rleBitWidth = 10; + RunLengthBitPackingHybridEncoder rleEncoder = new RunLengthBitPackingHybridEncoder( + rleBitWidth, INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + for (int v : data) { + rleEncoder.writeInt(v & 0x3FF); // mask to 10 bits + } + rleEncoded = rleEncoder.toBytes().toByteArray(); + rleEncoder.close(); + + // Pre-encode dictionary data for decode benchmark + DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter dictWriter = newDictWriter(); + for (int v : data) { + dictWriter.writeInteger(v); + } + BytesInput dictDataBytes = dictWriter.getBytes(); + dictDataEncoded = dictDataBytes.toByteArray(); + DictionaryPage dictPage = dictWriter.toDictPageAndClose().copy(); + intDictionary = new PlainValuesDictionary.PlainIntegerDictionary(dictPage); + } + + private byte[] encodeWith(ValuesWriter writer) throws IOException { + for (int v : data) { + writer.writeInteger(v); + } + byte[] bytes = writer.getBytes().toByteArray(); + writer.close(); + return bytes; + } + + // ---- Writer factories ---- + + private static PlainValuesWriter newPlainWriter() { + return new PlainValuesWriter(INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + } + + private static DeltaBinaryPackingValuesWriterForInteger newDeltaWriter() { + return new DeltaBinaryPackingValuesWriterForInteger(INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + } + + private static ByteStreamSplitValuesWriter.IntegerByteStreamSplitValuesWriter newBssWriter() { + return new ByteStreamSplitValuesWriter.IntegerByteStreamSplitValuesWriter( + INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + } + + private static DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter newDictWriter() { 
+ return new DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter( + MAX_DICT_BYTE_SIZE, Encoding.PLAIN_DICTIONARY, Encoding.PLAIN, new HeapByteBufferAllocator()); + } + + // ---- Encode benchmarks ---- + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodePlain() throws IOException { + return encodeWith(newPlainWriter()); + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeDelta() throws IOException { + return encodeWith(newDeltaWriter()); + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeByteStreamSplit() throws IOException { + return encodeWith(newBssWriter()); + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeDictionary() throws IOException { + return encodeWith(newDictWriter()); + } + + // ---- Decode benchmarks ---- + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodePlain(Blackhole bh) throws IOException { + PlainValuesReader.IntegerPlainValuesReader reader = new PlainValuesReader.IntegerPlainValuesReader(); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(plainEncoded))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(reader.readInteger()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeDelta(Blackhole bh) throws IOException { + DeltaBinaryPackingValuesReader reader = new DeltaBinaryPackingValuesReader(); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(deltaEncoded))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(reader.readInteger()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeByteStreamSplit(Blackhole bh) throws IOException { + ByteStreamSplitValuesReaderForInteger reader = new ByteStreamSplitValuesReaderForInteger(); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(bssEncoded))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(reader.readInteger()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeRle(Blackhole bh) throws IOException { + RunLengthBitPackingHybridDecoder decoder = new RunLengthBitPackingHybridDecoder( + rleBitWidth, ByteBufferInputStream.wrap(ByteBuffer.wrap(rleEncoded))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(decoder.readInt()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeDictionary(Blackhole bh) throws IOException { + DictionaryValuesReader reader = new DictionaryValuesReader(intDictionary); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(dictDataEncoded))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(reader.readInteger()); + } + } +} diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/RowGroupFlushBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/RowGroupFlushBenchmark.java new file mode 100644 index 0000000000..753b27de4a --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/RowGroupFlushBenchmark.java @@ -0,0 +1,191 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.benchmarks; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Random; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; +import org.apache.parquet.bytes.ByteBufferAllocator; +import org.apache.parquet.bytes.HeapByteBufferAllocator; +import org.apache.parquet.column.ParquetProperties.WriterVersion; +import org.apache.parquet.example.data.Group; +import org.apache.parquet.example.data.simple.SimpleGroupFactory; +import org.apache.parquet.hadoop.ParquetFileWriter; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.hadoop.example.ExampleParquetWriter; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import org.apache.parquet.schema.Types; +import org.openjdk.jmh.annotations.AuxCounters; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; + +/** + * Benchmark measuring row group flush performance and peak buffer memory. + * + *
<p>
Uses a wide schema (20 BINARY columns, 200 bytes each) to produce + * substantial per-column page buffers. A {@link PeakTrackingAllocator} + * wraps the heap allocator to precisely track the peak bytes outstanding + * across all parquet-managed ByteBuffers (independent of JVM GC behavior). + * + *
<p>
The key metric is {@code peakAllocatorMB}: with the interleaved flush + * optimization, each column's pages are finalized, written, and released + * before the next column is processed, so peak buffer memory is roughly + * 1/N of the total row group size (N = number of columns). + * + *
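 + * <p>Worked example: with the 64 MiB row group parameter and 20 columns,
 + * interleaved flushing should peak near 64 / 20 = 3.2 MiB of outstanding
 + * buffers, versus roughly the full 64 MiB when every column stays buffered
 + * until the end of the row group.
 + *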
<p>
Writes to {@link BlackHoleOutputFile} to isolate flush cost from + * filesystem I/O. + */ +@BenchmarkMode({Mode.AverageTime}) +@Fork( + value = 1, + jvmArgs = {"-Xms512m", "-Xmx1g"}) +@Warmup(iterations = 2) +@Measurement(iterations = 3) +@OutputTimeUnit(TimeUnit.MILLISECONDS) +@State(Scope.Thread) +public class RowGroupFlushBenchmark { + + private static final int COLUMN_COUNT = 20; + private static final int BINARY_VALUE_LENGTH = 200; + private static final int ROW_COUNT = 100_000; + + /** Row group sizes: 8MB and 64MB. */ + @Param({"8388608", "67108864"}) + public int rowGroupSize; + + /** Wide schema: 20 required BINARY columns. */ + private static final MessageType WIDE_SCHEMA; + + static { + Types.MessageTypeBuilder builder = Types.buildMessage(); + for (int c = 0; c < COLUMN_COUNT; c++) { + builder.required(PrimitiveTypeName.BINARY).named("col_" + c); + } + WIDE_SCHEMA = builder.named("wide_record"); + } + + /** Pre-generated column values (one unique value per column). */ + private Binary[] columnValues; + + @Setup(Level.Trial) + public void setup() { + Random random = new Random(42); + columnValues = new Binary[COLUMN_COUNT]; + for (int c = 0; c < COLUMN_COUNT; c++) { + byte[] value = new byte[BINARY_VALUE_LENGTH]; + random.nextBytes(value); + columnValues[c] = Binary.fromConstantByteArray(value); + } + } + + /** + * Auxiliary counters reported alongside timing. JMH collects these after + * each iteration. + */ + @AuxCounters(AuxCounters.Type.EVENTS) + @State(Scope.Thread) + public static class MemoryCounters { + /** Peak bytes outstanding in the parquet ByteBufferAllocator. */ + public long peakAllocatorBytes; + + /** Convenience: peak in MB (peakAllocatorBytes / 1048576). */ + public double peakAllocatorMB; + + @Setup(Level.Iteration) + public void reset() { + peakAllocatorBytes = 0; + peakAllocatorMB = 0; + } + } + + /** + * ByteBufferAllocator wrapper that tracks current and peak allocated bytes. + * Thread-safe (uses AtomicLong) although the write path is single-threaded. 
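 + *
 + * <p>Sketch of the intended use (mirrors {@link #writeWithFlush}):
 + * <pre>{@code
 + * PeakTrackingAllocator allocator = new PeakTrackingAllocator();
 + * // ... write a file through a writer built with .withAllocator(allocator) ...
 + * long peak = allocator.getPeakBytes(); // high-water mark of outstanding buffer bytes
 + * }</pre>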
+ */ + static class PeakTrackingAllocator implements ByteBufferAllocator { + private final ByteBufferAllocator delegate = new HeapByteBufferAllocator(); + private final AtomicLong currentBytes = new AtomicLong(); + private final AtomicLong peakBytes = new AtomicLong(); + + @Override + public ByteBuffer allocate(int size) { + ByteBuffer buf = delegate.allocate(size); + long current = currentBytes.addAndGet(buf.capacity()); + peakBytes.accumulateAndGet(current, Math::max); + return buf; + } + + @Override + public void release(ByteBuffer buf) { + currentBytes.addAndGet(-buf.capacity()); + delegate.release(buf); + } + + @Override + public boolean isDirect() { + return delegate.isDirect(); + } + + long getPeakBytes() { + return peakBytes.get(); + } + } + + @Benchmark + public void writeWithFlush(MemoryCounters counters) throws IOException { + PeakTrackingAllocator allocator = new PeakTrackingAllocator(); + SimpleGroupFactory factory = new SimpleGroupFactory(WIDE_SCHEMA); + + try (ParquetWriter writer = ExampleParquetWriter.builder(BlackHoleOutputFile.INSTANCE) + .withWriteMode(ParquetFileWriter.Mode.OVERWRITE) + .withType(WIDE_SCHEMA) + .withCompressionCodec(CompressionCodecName.UNCOMPRESSED) + .withWriterVersion(WriterVersion.PARQUET_1_0) + .withRowGroupSize(rowGroupSize) + .withDictionaryEncoding(false) + .withAllocator(allocator) + .build()) { + for (int i = 0; i < ROW_COUNT; i++) { + Group group = factory.newGroup(); + for (int c = 0; c < COLUMN_COUNT; c++) { + group.append("col_" + c, columnValues[c]); + } + writer.write(group); + } + } + + counters.peakAllocatorBytes = allocator.getPeakBytes(); + counters.peakAllocatorMB = allocator.getPeakBytes() / (1024.0 * 1024.0); + } +} diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/TestDataFactory.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/TestDataFactory.java new file mode 100644 index 0000000000..f0fc7c52df --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/TestDataFactory.java @@ -0,0 +1,175 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.benchmarks; + +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BOOLEAN; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64; + +import java.util.Random; +import org.apache.parquet.example.data.Group; +import org.apache.parquet.example.data.simple.SimpleGroupFactory; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Types; + +/** + * Utility class for generating test schemas and data for benchmarks. + */ +public final class TestDataFactory { + + /** Default number of rows for file-level benchmarks. */ + public static final int DEFAULT_ROW_COUNT = 100_000; + + /** Number of distinct values for low-cardinality data patterns. */ + public static final int LOW_CARDINALITY_DISTINCT = 100; + + /** A standard multi-type schema used by file-level benchmarks. */ + public static final MessageType FILE_BENCHMARK_SCHEMA = Types.buildMessage() + .required(INT32) + .named("int32_field") + .required(INT64) + .named("int64_field") + .required(FLOAT) + .named("float_field") + .required(DOUBLE) + .named("double_field") + .required(BOOLEAN) + .named("boolean_field") + .required(BINARY) + .named("binary_field") + .named("benchmark_record"); + + private TestDataFactory() {} + + /** + * Creates a {@link SimpleGroupFactory} for the standard benchmark schema. + */ + public static SimpleGroupFactory newGroupFactory() { + return new SimpleGroupFactory(FILE_BENCHMARK_SCHEMA); + } + + /** + * Generates a single row of benchmark data. + * + * @param factory the group factory + * @param index the row index (used for deterministic data) + * @param random the random source + * @return a populated Group + */ + public static Group generateRow(SimpleGroupFactory factory, int index, Random random) { + return factory.newGroup() + .append("int32_field", index) + .append("int64_field", (long) index * 100) + .append("float_field", random.nextFloat()) + .append("double_field", random.nextDouble()) + .append("boolean_field", index % 2 == 0) + .append("binary_field", "value_" + (index % 1000)); + } + + // ---- Integer data generation for encoding benchmarks ---- + + /** + * Generates sequential integers: 0, 1, 2, ... + */ + public static int[] generateSequentialInts(int count) { + int[] data = new int[count]; + for (int i = 0; i < count; i++) { + data[i] = i; + } + return data; + } + + /** + * Generates uniformly random integers. + */ + public static int[] generateRandomInts(int count, Random random) { + int[] data = new int[count]; + for (int i = 0; i < count; i++) { + data[i] = random.nextInt(); + } + return data; + } + + /** + * Generates low-cardinality integers (values drawn from a small set). + */ + public static int[] generateLowCardinalityInts(int count, int distinctValues, Random random) { + int[] data = new int[count]; + for (int i = 0; i < count; i++) { + data[i] = random.nextInt(distinctValues); + } + return data; + } + + /** + * Generates high-cardinality integers (all unique). 
+ */ + public static int[] generateHighCardinalityInts(int count) { + int[] data = new int[count]; + for (int i = 0; i < count; i++) { + data[i] = i; + } + return data; + } + + // ---- Binary data generation for encoding benchmarks ---- + + /** + * Generates binary strings of the given length with the specified cardinality. + * + * @param count number of values + * @param stringLength length of each string + * @param distinct number of distinct values (0 means all unique) + * @param random random source + * @return array of Binary values + */ + public static Binary[] generateBinaryData(int count, int stringLength, int distinct, Random random) { + Binary[] data = new Binary[count]; + if (distinct > 0) { + // Pre-generate the distinct values + Binary[] dictionary = new Binary[distinct]; + for (int i = 0; i < distinct; i++) { + dictionary[i] = Binary.fromConstantByteArray( + randomString(stringLength, random).getBytes()); + } + for (int i = 0; i < count; i++) { + data[i] = dictionary[random.nextInt(distinct)]; + } + } else { + // All unique + for (int i = 0; i < count; i++) { + data[i] = Binary.fromConstantByteArray( + randomString(stringLength, random).getBytes()); + } + } + return data; + } + + private static String randomString(int length, Random random) { + StringBuilder sb = new StringBuilder(length); + for (int i = 0; i < length; i++) { + sb.append((char) ('a' + random.nextInt(26))); + } + return sb.toString(); + } +} From bbf36c7dc1c5aad79ae4632c905ba68073a9b501 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Mej=C3=ADa?= Date: Mon, 20 Apr 2026 01:16:11 +0000 Subject: [PATCH 02/27] GH-3511: Isolate setup cost and benchmark full dictionary paths Pre-generate deterministic rows for the file and concurrent benchmarks so row construction does not skew the timed section, and make the encoding benchmarks include real dictionary-page and dictionary-decode work instead of only value buffers. Split synthetic RLE dictionary-index decoding into its own benchmark and encode generated binary payloads as UTF-8 explicitly so benchmark inputs stay consistent across runs and platforms. 
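
As an illustration, the timed write loops now iterate rows built once during
setup instead of constructing groups inline (sketch of the new shape):

    Group[] rows = TestDataFactory.generateRows(
        TestDataFactory.newGroupFactory(), TestDataFactory.DEFAULT_ROW_COUNT, 42L);
    for (Group row : rows) {
      writer.write(row);
    }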
--- .../benchmarks/BinaryEncodingBenchmark.java | 48 +++++++- .../ConcurrentReadWriteBenchmark.java | 38 +++--- .../parquet/benchmarks/FileReadBenchmark.java | 19 ++- .../benchmarks/FileWriteBenchmark.java | 31 +++-- .../benchmarks/IntEncodingBenchmark.java | 53 ++++---- .../RleDictionaryIndexDecodingBenchmark.java | 115 ++++++++++++++++++ .../parquet/benchmarks/TestDataFactory.java | 30 +++-- 7 files changed, 262 insertions(+), 72 deletions(-) create mode 100644 parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/RleDictionaryIndexDecodingBenchmark.java diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BinaryEncodingBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BinaryEncodingBenchmark.java index 7added9717..db65ca5f25 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BinaryEncodingBenchmark.java +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BinaryEncodingBenchmark.java @@ -23,14 +23,19 @@ import java.util.Random; import java.util.concurrent.TimeUnit; import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.bytes.BytesInput; import org.apache.parquet.bytes.HeapByteBufferAllocator; +import org.apache.parquet.column.Dictionary; import org.apache.parquet.column.Encoding; +import org.apache.parquet.column.page.DictionaryPage; import org.apache.parquet.column.values.ValuesWriter; import org.apache.parquet.column.values.deltalengthbytearray.DeltaLengthByteArrayValuesReader; import org.apache.parquet.column.values.deltalengthbytearray.DeltaLengthByteArrayValuesWriter; import org.apache.parquet.column.values.deltastrings.DeltaByteArrayReader; import org.apache.parquet.column.values.deltastrings.DeltaByteArrayWriter; +import org.apache.parquet.column.values.dictionary.DictionaryValuesReader; import org.apache.parquet.column.values.dictionary.DictionaryValuesWriter; +import org.apache.parquet.column.values.dictionary.PlainValuesDictionary; import org.apache.parquet.column.values.plain.BinaryPlainValuesReader; import org.apache.parquet.column.values.plain.PlainValuesWriter; import org.apache.parquet.io.api.Binary; @@ -50,7 +55,7 @@ import org.openjdk.jmh.infra.Blackhole; /** - * Encoding-level micro-benchmarks for BINARY values. + * Encoding-level and decoding-level micro-benchmarks for BINARY values. * Compares PLAIN, DELTA_BYTE_ARRAY, DELTA_LENGTH_BYTE_ARRAY, and DICTIONARY encodings * across different string lengths and cardinality patterns. 
* @@ -81,6 +86,8 @@ public class BinaryEncodingBenchmark { private byte[] plainEncoded; private byte[] deltaLengthEncoded; private byte[] deltaStringsEncoded; + private byte[] dictEncoded; + private Dictionary binaryDictionary; @Setup(Level.Trial) public void setup() throws IOException { @@ -92,6 +99,15 @@ public void setup() throws IOException { plainEncoded = encodeBinaryWith(newPlainWriter()); deltaLengthEncoded = encodeBinaryWith(newDeltaLengthWriter()); deltaStringsEncoded = encodeBinaryWith(newDeltaStringsWriter()); + + DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter dictWriter = newDictWriter(); + for (Binary v : data) { + dictWriter.writeBytes(v); + } + dictEncoded = dictWriter.getBytes().toByteArray(); + DictionaryPage dictPage = dictWriter.toDictPageAndClose().copy(); + binaryDictionary = new PlainValuesDictionary.PlainBinaryDictionary(dictPage); + dictWriter.close(); } private byte[] encodeBinaryWith(ValuesWriter writer) throws IOException { @@ -103,6 +119,24 @@ private byte[] encodeBinaryWith(ValuesWriter writer) throws IOException { return bytes; } + private byte[] encodeDictionaryWith(DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter writer) + throws IOException { + for (Binary v : data) { + writer.writeBytes(v); + } + BytesInput dataBytes = writer.getBytes(); + DictionaryPage dictPage = writer.toDictPageAndClose(); + byte[] bytes; + if (dictPage == null) { + bytes = dataBytes.toByteArray(); + } else { + BytesInput allBytes = BytesInput.concat(dataBytes, dictPage.getBytes()); + bytes = allBytes.toByteArray(); + } + writer.close(); + return bytes; + } + // ---- Writer factories ---- private static PlainValuesWriter newPlainWriter() { @@ -145,7 +179,7 @@ public byte[] encodeDeltaByteArray() throws IOException { @Benchmark @OperationsPerInvocation(VALUE_COUNT) public byte[] encodeDictionary() throws IOException { - return encodeBinaryWith(newDictWriter()); + return encodeDictionaryWith(newDictWriter()); } // ---- Decode benchmarks ---- @@ -179,4 +213,14 @@ public void decodeDeltaByteArray(Blackhole bh) throws IOException { bh.consume(reader.readBytes()); } } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeDictionary(Blackhole bh) throws IOException { + DictionaryValuesReader reader = new DictionaryValuesReader(binaryDictionary); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(dictEncoded))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(reader.readBytes()); + } + } } diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ConcurrentReadWriteBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ConcurrentReadWriteBenchmark.java index 9c5d135eab..29371f7eb1 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ConcurrentReadWriteBenchmark.java +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ConcurrentReadWriteBenchmark.java @@ -20,10 +20,8 @@ import java.io.File; import java.io.IOException; -import java.util.Random; import java.util.concurrent.TimeUnit; import org.apache.parquet.example.data.Group; -import org.apache.parquet.example.data.simple.SimpleGroupFactory; import org.apache.parquet.hadoop.ParquetFileWriter; import org.apache.parquet.hadoop.ParquetReader; import org.apache.parquet.hadoop.ParquetWriter; @@ -49,8 +47,10 @@ import org.openjdk.jmh.infra.Blackhole; /** - * Multi-threaded benchmarks to validate that read and write operations perform correctly - * under concurrency. 
Uses {@code @Threads(4)} by default (overridable via JMH {@code -t} flag). + * Multi-threaded benchmarks measuring independent read and write throughput under + * concurrency. Uses {@code @Threads(4)} by default (overridable via JMH {@code -t} flag). + * This benchmark does not assert correctness; it measures the cost of each thread + * writing a full file to a stateless sink or reading a shared pre-generated file. * *

    *
  • {@link #concurrentWrite()} - each thread independently writes to a shared @@ -59,7 +59,7 @@ * pre-generated Parquet file
  • *
*/ -@BenchmarkMode({Mode.SingleShotTime, Mode.AverageTime}) +@BenchmarkMode(Mode.SingleShotTime) @OutputTimeUnit(TimeUnit.MILLISECONDS) @Fork(1) @Warmup(iterations = 2, batchSize = 1) @@ -69,6 +69,18 @@ public class ConcurrentReadWriteBenchmark { private File tempFile; + private Group[] readRows; + + @State(Scope.Thread) + public static class ThreadData { + private Group[] rows; + + @Setup(Level.Trial) + public void setup() { + rows = TestDataFactory.generateRows( + TestDataFactory.newGroupFactory(), TestDataFactory.DEFAULT_ROW_COUNT, 42L); + } + } @Setup(Level.Trial) public void setup() throws IOException { @@ -77,14 +89,14 @@ public void setup() throws IOException { tempFile.deleteOnExit(); tempFile.delete(); - SimpleGroupFactory factory = TestDataFactory.newGroupFactory(); - Random random = new Random(42); + readRows = TestDataFactory.generateRows( + TestDataFactory.newGroupFactory(), TestDataFactory.DEFAULT_ROW_COUNT, 42L); try (ParquetWriter writer = ExampleParquetWriter.builder(new LocalOutputFile(tempFile.toPath())) .withWriteMode(ParquetFileWriter.Mode.OVERWRITE) .withType(TestDataFactory.FILE_BENCHMARK_SCHEMA) .build()) { - for (int i = 0; i < TestDataFactory.DEFAULT_ROW_COUNT; i++) { - writer.write(TestDataFactory.generateRow(factory, i, random)); + for (Group row : readRows) { + writer.write(row); } } } @@ -101,15 +113,13 @@ public void tearDown() { * {@link BlackHoleOutputFile} sink. */ @Benchmark - public void concurrentWrite() throws IOException { - SimpleGroupFactory factory = TestDataFactory.newGroupFactory(); - Random random = new Random(Thread.currentThread().getId()); + public void concurrentWrite(ThreadData threadData) throws IOException { try (ParquetWriter writer = ExampleParquetWriter.builder(BlackHoleOutputFile.INSTANCE) .withWriteMode(ParquetFileWriter.Mode.OVERWRITE) .withType(TestDataFactory.FILE_BENCHMARK_SCHEMA) .build()) { - for (int i = 0; i < TestDataFactory.DEFAULT_ROW_COUNT; i++) { - writer.write(TestDataFactory.generateRow(factory, i, random)); + for (Group row : threadData.rows) { + writer.write(row); } } } diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileReadBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileReadBenchmark.java index 7d5d0f5159..eb5b959efa 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileReadBenchmark.java +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileReadBenchmark.java @@ -20,11 +20,9 @@ import java.io.File; import java.io.IOException; -import java.util.Random; import java.util.concurrent.TimeUnit; import org.apache.parquet.column.ParquetProperties.WriterVersion; import org.apache.parquet.example.data.Group; -import org.apache.parquet.example.data.simple.SimpleGroupFactory; import org.apache.parquet.hadoop.ParquetFileWriter; import org.apache.parquet.hadoop.ParquetReader; import org.apache.parquet.hadoop.ParquetWriter; @@ -51,13 +49,14 @@ import org.openjdk.jmh.infra.Blackhole; /** - * File-level read benchmarks measuring throughput of the full Parquet read pipeline. - * A temporary file is generated during setup using {@link LocalOutputFile} (no Hadoop FS - * overhead on write side), then read repeatedly during the benchmark. + * File-level read benchmarks measuring end-to-end Parquet read throughput through the + * example {@link Group} API. A temporary file is generated once during setup from + * pre-generated rows using {@link LocalOutputFile}, then read repeatedly during the + * benchmark. * *

Parameterized across compression codec and writer version. */ -@BenchmarkMode({Mode.SingleShotTime, Mode.AverageTime}) +@BenchmarkMode(Mode.SingleShotTime) @Fork(1) @Warmup(iterations = 3, batchSize = 1) @Measurement(iterations = 5, batchSize = 1) @@ -79,8 +78,8 @@ public void setup() throws IOException { tempFile.deleteOnExit(); tempFile.delete(); // remove so the writer can create it - SimpleGroupFactory factory = TestDataFactory.newGroupFactory(); - Random random = new Random(42); + Group[] rows = TestDataFactory.generateRows( + TestDataFactory.newGroupFactory(), TestDataFactory.DEFAULT_ROW_COUNT, 42L); try (ParquetWriter writer = ExampleParquetWriter.builder(new LocalOutputFile(tempFile.toPath())) .withWriteMode(ParquetFileWriter.Mode.OVERWRITE) .withType(TestDataFactory.FILE_BENCHMARK_SCHEMA) @@ -88,8 +87,8 @@ public void setup() throws IOException { .withWriterVersion(WriterVersion.valueOf(writerVersion)) .withDictionaryEncoding(true) .build()) { - for (int i = 0; i < TestDataFactory.DEFAULT_ROW_COUNT; i++) { - writer.write(TestDataFactory.generateRow(factory, i, random)); + for (Group row : rows) { + writer.write(row); } } } diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileWriteBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileWriteBenchmark.java index 60ac086504..73f60d7199 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileWriteBenchmark.java +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileWriteBenchmark.java @@ -19,11 +19,9 @@ package org.apache.parquet.benchmarks; import java.io.IOException; -import java.util.Random; import java.util.concurrent.TimeUnit; import org.apache.parquet.column.ParquetProperties.WriterVersion; import org.apache.parquet.example.data.Group; -import org.apache.parquet.example.data.simple.SimpleGroupFactory; import org.apache.parquet.hadoop.ParquetFileWriter; import org.apache.parquet.hadoop.ParquetWriter; import org.apache.parquet.hadoop.example.ExampleParquetWriter; @@ -31,22 +29,27 @@ import org.openjdk.jmh.annotations.Benchmark; import org.openjdk.jmh.annotations.BenchmarkMode; import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; import org.openjdk.jmh.annotations.Measurement; import org.openjdk.jmh.annotations.Mode; import org.openjdk.jmh.annotations.OutputTimeUnit; import org.openjdk.jmh.annotations.Param; import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; import org.openjdk.jmh.annotations.State; import org.openjdk.jmh.annotations.Warmup; /** - * File-level write benchmarks measuring throughput of the full Parquet write pipeline. - * Writes are sent to a {@link BlackHoleOutputFile} to isolate CPU/encoding cost from - * filesystem I/O. + * File-level write benchmarks measuring end-to-end Parquet write throughput through the + * example {@link Group} API. Row contents are pre-generated during setup so compression + * and writer settings dominate the timed section, while writes still flow through the + * full Parquet writer path. * - *

Parameterized across compression codec, writer version, and dictionary encoding. + *

Writes are sent to a {@link BlackHoleOutputFile} to isolate CPU and encoding cost + * from filesystem I/O. Parameterized across compression codec, writer version, and + * dictionary encoding. */ -@BenchmarkMode({Mode.SingleShotTime, Mode.AverageTime}) +@BenchmarkMode(Mode.SingleShotTime) @Fork(1) @Warmup(iterations = 3, batchSize = 1) @Measurement(iterations = 5, batchSize = 1) @@ -63,10 +66,16 @@ public class FileWriteBenchmark { @Param({"true", "false"}) public String dictionary; + private Group[] rows; + + @Setup(Level.Trial) + public void setup() { + rows = TestDataFactory.generateRows( + TestDataFactory.newGroupFactory(), TestDataFactory.DEFAULT_ROW_COUNT, 42L); + } + @Benchmark public void writeFile() throws IOException { - SimpleGroupFactory factory = TestDataFactory.newGroupFactory(); - Random random = new Random(42); try (ParquetWriter writer = ExampleParquetWriter.builder(BlackHoleOutputFile.INSTANCE) .withWriteMode(ParquetFileWriter.Mode.OVERWRITE) .withType(TestDataFactory.FILE_BENCHMARK_SCHEMA) @@ -74,8 +83,8 @@ public void writeFile() throws IOException { .withWriterVersion(WriterVersion.valueOf(writerVersion)) .withDictionaryEncoding(Boolean.parseBoolean(dictionary)) .build()) { - for (int i = 0; i < TestDataFactory.DEFAULT_ROW_COUNT; i++) { - writer.write(TestDataFactory.generateRow(factory, i, random)); + for (Group row : rows) { + writer.write(row); } } } diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/IntEncodingBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/IntEncodingBenchmark.java index 6ce4420e7e..df767df455 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/IntEncodingBenchmark.java +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/IntEncodingBenchmark.java @@ -38,8 +38,6 @@ import org.apache.parquet.column.values.dictionary.PlainValuesDictionary; import org.apache.parquet.column.values.plain.PlainValuesReader; import org.apache.parquet.column.values.plain.PlainValuesWriter; -import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridDecoder; -import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridEncoder; import org.openjdk.jmh.annotations.Benchmark; import org.openjdk.jmh.annotations.BenchmarkMode; import org.openjdk.jmh.annotations.Fork; @@ -56,9 +54,11 @@ import org.openjdk.jmh.infra.Blackhole; /** - * Encoding-level micro-benchmarks for INT32 values. + * Encoding-level and decoding-level micro-benchmarks for INT32 values. * Compares PLAIN, DELTA_BINARY_PACKED, BYTE_STREAM_SPLIT, and DICTIONARY encodings - * across different data distribution patterns. + * across different data distribution patterns. Synthetic dictionary-id RLE decode is + * benchmarked separately in {@link RleDictionaryIndexDecodingBenchmark} so the results + * here stay comparable at the full-value level. * *

Each benchmark invocation processes {@value #VALUE_COUNT} values. Throughput is * reported per-value using {@link OperationsPerInvocation}. @@ -83,8 +83,6 @@ public class IntEncodingBenchmark { private byte[] plainEncoded; private byte[] deltaEncoded; private byte[] bssEncoded; - private byte[] rleEncoded; - private int rleBitWidth; private byte[] dictDataEncoded; private Dictionary intDictionary; @@ -103,7 +101,7 @@ public void setup() throws IOException { VALUE_COUNT, TestDataFactory.LOW_CARDINALITY_DISTINCT, random); break; case "HIGH_CARDINALITY": - data = TestDataFactory.generateHighCardinalityInts(VALUE_COUNT); + data = TestDataFactory.generateHighCardinalityInts(VALUE_COUNT, random); break; default: throw new IllegalArgumentException("Unknown data pattern: " + dataPattern); @@ -114,16 +112,6 @@ public void setup() throws IOException { deltaEncoded = encodeWith(newDeltaWriter()); bssEncoded = encodeWith(newBssWriter()); - // Pre-encode RLE data (using 10-bit values to simulate dictionary indices) - rleBitWidth = 10; - RunLengthBitPackingHybridEncoder rleEncoder = new RunLengthBitPackingHybridEncoder( - rleBitWidth, INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); - for (int v : data) { - rleEncoder.writeInt(v & 0x3FF); // mask to 10 bits - } - rleEncoded = rleEncoder.toBytes().toByteArray(); - rleEncoder.close(); - // Pre-encode dictionary data for decode benchmark DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter dictWriter = newDictWriter(); for (int v : data) { @@ -133,6 +121,7 @@ public void setup() throws IOException { dictDataEncoded = dictDataBytes.toByteArray(); DictionaryPage dictPage = dictWriter.toDictPageAndClose().copy(); intDictionary = new PlainValuesDictionary.PlainIntegerDictionary(dictPage); + dictWriter.close(); } private byte[] encodeWith(ValuesWriter writer) throws IOException { @@ -144,6 +133,24 @@ private byte[] encodeWith(ValuesWriter writer) throws IOException { return bytes; } + private byte[] encodeDictionaryWith(DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter writer) + throws IOException { + for (int v : data) { + writer.writeInteger(v); + } + BytesInput dataBytes = writer.getBytes(); + DictionaryPage dictPage = writer.toDictPageAndClose(); + byte[] bytes; + if (dictPage == null) { + bytes = dataBytes.toByteArray(); + } else { + BytesInput allBytes = BytesInput.concat(dataBytes, dictPage.getBytes()); + bytes = allBytes.toByteArray(); + } + writer.close(); + return bytes; + } + // ---- Writer factories ---- private static PlainValuesWriter newPlainWriter() { @@ -187,7 +194,7 @@ public byte[] encodeByteStreamSplit() throws IOException { @Benchmark @OperationsPerInvocation(VALUE_COUNT) public byte[] encodeDictionary() throws IOException { - return encodeWith(newDictWriter()); + return encodeDictionaryWith(newDictWriter()); } // ---- Decode benchmarks ---- @@ -222,16 +229,6 @@ public void decodeByteStreamSplit(Blackhole bh) throws IOException { } } - @Benchmark - @OperationsPerInvocation(VALUE_COUNT) - public void decodeRle(Blackhole bh) throws IOException { - RunLengthBitPackingHybridDecoder decoder = new RunLengthBitPackingHybridDecoder( - rleBitWidth, ByteBufferInputStream.wrap(ByteBuffer.wrap(rleEncoded))); - for (int i = 0; i < VALUE_COUNT; i++) { - bh.consume(decoder.readInt()); - } - } - @Benchmark @OperationsPerInvocation(VALUE_COUNT) public void decodeDictionary(Blackhole bh) throws IOException { diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/RleDictionaryIndexDecodingBenchmark.java 
b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/RleDictionaryIndexDecodingBenchmark.java new file mode 100644 index 0000000000..c9d604c946 --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/RleDictionaryIndexDecodingBenchmark.java @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.benchmarks; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.bytes.HeapByteBufferAllocator; +import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridDecoder; +import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridEncoder; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OperationsPerInvocation; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +/** + * Decoding micro-benchmark for synthetic dictionary-id pages encoded with + * {@link RunLengthBitPackingHybridEncoder}. This isolates the dictionary-id + * decode path and is intentionally separate from {@link IntEncodingBenchmark}, + * which measures full INT32 value decode paths. 
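+ *
+ * <p>With {@code BIT_WIDTH} = 10 the id range is {@code [0, 1 << 10)}, i.e. 1024
+ * possible ids, so both the RANDOM pattern (drawn up to {@code MAX_ID}) and the
+ * LOW_CARDINALITY pattern (100 distinct ids) stay within the declared width.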
+ */ +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(TimeUnit.SECONDS) +@Fork(1) +@Warmup(iterations = 3, time = 1) +@Measurement(iterations = 5, time = 1) +@State(Scope.Thread) +public class RleDictionaryIndexDecodingBenchmark { + + static final int VALUE_COUNT = 100_000; + private static final int INIT_SLAB_SIZE = 64 * 1024; + private static final int PAGE_SIZE = 1024 * 1024; + private static final int BIT_WIDTH = 10; + private static final int MAX_ID = 1 << BIT_WIDTH; + + @Param({"SEQUENTIAL", "RANDOM", "LOW_CARDINALITY"}) + public String indexPattern; + + private byte[] encoded; + + @Setup(Level.Trial) + public void setup() throws IOException { + int[] ids = generateDictionaryIds(); + RunLengthBitPackingHybridEncoder encoder = new RunLengthBitPackingHybridEncoder( + BIT_WIDTH, INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + for (int id : ids) { + encoder.writeInt(id); + } + encoded = encoder.toBytes().toByteArray(); + encoder.close(); + } + + private int[] generateDictionaryIds() { + int[] ids = new int[VALUE_COUNT]; + Random random = new Random(42); + switch (indexPattern) { + case "SEQUENTIAL": + for (int i = 0; i < VALUE_COUNT; i++) { + ids[i] = i % MAX_ID; + } + break; + case "RANDOM": + for (int i = 0; i < VALUE_COUNT; i++) { + ids[i] = random.nextInt(MAX_ID); + } + break; + case "LOW_CARDINALITY": + for (int i = 0; i < VALUE_COUNT; i++) { + ids[i] = random.nextInt(TestDataFactory.LOW_CARDINALITY_DISTINCT); + } + break; + default: + throw new IllegalArgumentException("Unknown index pattern: " + indexPattern); + } + return ids; + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeDictionaryIds(Blackhole bh) throws IOException { + RunLengthBitPackingHybridDecoder decoder = new RunLengthBitPackingHybridDecoder( + BIT_WIDTH, ByteBufferInputStream.wrap(ByteBuffer.wrap(encoded))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(decoder.readInt()); + } + } +} diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/TestDataFactory.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/TestDataFactory.java index f0fc7c52df..bc00f3b070 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/TestDataFactory.java +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/TestDataFactory.java @@ -25,6 +25,7 @@ import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64; +import java.nio.charset.StandardCharsets; import java.util.Random; import org.apache.parquet.example.data.Group; import org.apache.parquet.example.data.simple.SimpleGroupFactory; @@ -86,6 +87,18 @@ public static Group generateRow(SimpleGroupFactory factory, int index, Random ra .append("binary_field", "value_" + (index % 1000)); } + /** + * Generates a deterministic set of rows for file-level benchmarks. + */ + public static Group[] generateRows(SimpleGroupFactory factory, int rowCount, long seed) { + Group[] rows = new Group[rowCount]; + Random random = new Random(seed); + for (int i = 0; i < rowCount; i++) { + rows[i] = generateRow(factory, i, random); + } + return rows; + } + // ---- Integer data generation for encoding benchmarks ---- /** @@ -122,12 +135,15 @@ public static int[] generateLowCardinalityInts(int count, int distinctValues, Ra } /** - * Generates high-cardinality integers (all unique). + * Generates high-cardinality integers (all unique in randomized order). 
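+ * Implemented below as a Fisher-Yates shuffle of the sequential range
+ * {@code 0..count-1}, so every value stays unique while the order is randomized.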
*/ - public static int[] generateHighCardinalityInts(int count) { - int[] data = new int[count]; - for (int i = 0; i < count; i++) { - data[i] = i; + public static int[] generateHighCardinalityInts(int count, Random random) { + int[] data = generateSequentialInts(count); + for (int i = count - 1; i > 0; i--) { + int swapIndex = random.nextInt(i + 1); + int tmp = data[i]; + data[i] = data[swapIndex]; + data[swapIndex] = tmp; } return data; } @@ -150,7 +166,7 @@ public static Binary[] generateBinaryData(int count, int stringLength, int disti Binary[] dictionary = new Binary[distinct]; for (int i = 0; i < distinct; i++) { dictionary[i] = Binary.fromConstantByteArray( - randomString(stringLength, random).getBytes()); + randomString(stringLength, random).getBytes(StandardCharsets.UTF_8)); } for (int i = 0; i < count; i++) { data[i] = dictionary[random.nextInt(distinct)]; @@ -159,7 +175,7 @@ public static Binary[] generateBinaryData(int count, int stringLength, int disti // All unique for (int i = 0; i < count; i++) { data[i] = Binary.fromConstantByteArray( - randomString(stringLength, random).getBytes()); + randomString(stringLength, random).getBytes(StandardCharsets.UTF_8)); } } return data; From 1710b3b9ca4514793ae7ed855ec6339110c8849c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Mej=C3=ADa?= Date: Mon, 20 Apr 2026 01:45:22 +0000 Subject: [PATCH 03/27] GH-3511: Tighten dictionary benchmarks and reduce setup coupling Make the dictionary encode/decode benchmarks symmetric by routing both sides through a shared EncodedDictionary helper, guard against the dictionary writer falling back to plain encoding (which previously NPE'd in BinaryEncodingBenchmark setup for high-cardinality long strings), and drop redundant close() calls after toDictPageAndClose(). Share the pre-generated row array across threads in ConcurrentReadWriteBenchmark via Scope.Benchmark, eliminating 4x heap duplication and a now-unnecessary ThreadData inner class. Centralize the RNG seed as TestDataFactory.DEFAULT_SEED and add seed-overload variants for the int and binary generators so generators in the same setup no longer share a Random and silently depend on call order. Wrap the RLE encoder in try-with-resources and validate that LOW_CARDINALITY_DISTINCT fits within the configured bit width. --- .../benchmarks/BenchmarkEncodingUtils.java | 70 +++++++++++++++++++ .../benchmarks/BinaryEncodingBenchmark.java | 53 ++++++++------ .../ConcurrentReadWriteBenchmark.java | 43 ++++++------ .../parquet/benchmarks/FileReadBenchmark.java | 10 ++- .../benchmarks/FileWriteBenchmark.java | 6 +- .../benchmarks/IntEncodingBenchmark.java | 59 +++++++++------- .../RleDictionaryIndexDecodingBenchmark.java | 47 +++++++------ .../parquet/benchmarks/TestDataFactory.java | 35 ++++++++++ 8 files changed, 227 insertions(+), 96 deletions(-) create mode 100644 parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BenchmarkEncodingUtils.java diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BenchmarkEncodingUtils.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BenchmarkEncodingUtils.java new file mode 100644 index 0000000000..c79dedce28 --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BenchmarkEncodingUtils.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.benchmarks; + +import java.io.IOException; +import org.apache.parquet.column.page.DictionaryPage; +import org.apache.parquet.column.values.dictionary.DictionaryValuesWriter; + +/** + * Shared helpers for encode/decode micro-benchmarks. + */ +final class BenchmarkEncodingUtils { + + private BenchmarkEncodingUtils() {} + + /** + * Container for the two artefacts produced by a dictionary-encoded page: + * the encoded dictionary indices ({@link #dictData}) and the dictionary + * page itself ({@link #dictPage}). The dictionary page may be {@code null} + * if the writer fell back to plain encoding (for example, when the + * dictionary exceeded its configured maximum size). + */ + static final class EncodedDictionary { + final byte[] dictData; + final DictionaryPage dictPage; + + EncodedDictionary(byte[] dictData, DictionaryPage dictPage) { + this.dictData = dictData; + this.dictPage = dictPage; + } + + boolean fellBackToPlain() { + return dictPage == null; + } + } + + /** + * Drains a {@link DictionaryValuesWriter} into an {@link EncodedDictionary}. + * + *

The writer's data bytes (the RLE-encoded indices) and the dictionary + * page are returned separately so both pieces can be measured or fed to a + * decoder symmetrically. The dictionary page buffer is copied so it remains + * valid after the writer's allocator is released. + * + *
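<p>Typical use in a benchmark setup method (a sketch mirroring the callers in
+ * this module):
+ * <pre>{@code
+ * PlainIntegerDictionaryValuesWriter writer = newDictWriter();
+ * for (int v : data) {
+ *   writer.writeInteger(v);
+ * }
+ * EncodedDictionary encoded = BenchmarkEncodingUtils.drainDictionary(writer);
+ * if (!encoded.fellBackToPlain()) {
+ *   Dictionary dict = new PlainValuesDictionary.PlainIntegerDictionary(encoded.dictPage);
+ * }
+ * }</pre>
+ *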

The writer is closed via {@code toDictPageAndClose()}; callers must not + * call {@link DictionaryValuesWriter#close()} again afterwards. + */ + static EncodedDictionary drainDictionary(DictionaryValuesWriter writer) throws IOException { + byte[] dictData = writer.getBytes().toByteArray(); + DictionaryPage rawPage = writer.toDictPageAndClose(); + DictionaryPage dictPage = rawPage == null ? null : rawPage.copy(); + return new EncodedDictionary(dictData, dictPage); + } +} diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BinaryEncodingBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BinaryEncodingBenchmark.java index db65ca5f25..e6646458d8 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BinaryEncodingBenchmark.java +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BinaryEncodingBenchmark.java @@ -20,10 +20,8 @@ import java.io.IOException; import java.nio.ByteBuffer; -import java.util.Random; import java.util.concurrent.TimeUnit; import org.apache.parquet.bytes.ByteBufferInputStream; -import org.apache.parquet.bytes.BytesInput; import org.apache.parquet.bytes.HeapByteBufferAllocator; import org.apache.parquet.column.Dictionary; import org.apache.parquet.column.Encoding; @@ -61,6 +59,14 @@ * *

Each benchmark invocation processes {@value #VALUE_COUNT} values. Throughput is * reported per-value using {@link OperationsPerInvocation}. + * + *

The dictionary encode/decode benchmarks intentionally measure the full path: + * the encoder produces both the RLE-encoded indices and a {@link DictionaryPage}; + * the decoder consumes the indices through a {@link DictionaryValuesReader} backed + * by the same dictionary. If the dictionary exceeds {@link #MAX_DICT_BYTE_SIZE} + * (which can happen for high-cardinality, long-string parameter combinations) the + * writer falls back to plain encoding and dictionary decoding for that combination + * is skipped. */ @BenchmarkMode(Mode.Throughput) @OutputTimeUnit(TimeUnit.SECONDS) @@ -87,13 +93,14 @@ public class BinaryEncodingBenchmark { private byte[] deltaLengthEncoded; private byte[] deltaStringsEncoded; private byte[] dictEncoded; + private DictionaryPage dictPage; private Dictionary binaryDictionary; + private boolean dictionaryAvailable; @Setup(Level.Trial) public void setup() throws IOException { - Random random = new Random(42); int distinct = "LOW".equals(cardinality) ? TestDataFactory.LOW_CARDINALITY_DISTINCT : 0; - data = TestDataFactory.generateBinaryData(VALUE_COUNT, stringLength, distinct, random); + data = TestDataFactory.generateBinaryData(VALUE_COUNT, stringLength, distinct, TestDataFactory.DEFAULT_SEED); // Pre-encode data for decode benchmarks plainEncoded = encodeBinaryWith(newPlainWriter()); @@ -104,10 +111,13 @@ public void setup() throws IOException { for (Binary v : data) { dictWriter.writeBytes(v); } - dictEncoded = dictWriter.getBytes().toByteArray(); - DictionaryPage dictPage = dictWriter.toDictPageAndClose().copy(); - binaryDictionary = new PlainValuesDictionary.PlainBinaryDictionary(dictPage); - dictWriter.close(); + BenchmarkEncodingUtils.EncodedDictionary encoded = BenchmarkEncodingUtils.drainDictionary(dictWriter); + dictEncoded = encoded.dictData; + dictPage = encoded.dictPage; + dictionaryAvailable = !encoded.fellBackToPlain(); + if (dictionaryAvailable) { + binaryDictionary = new PlainValuesDictionary.PlainBinaryDictionary(dictPage); + } } private byte[] encodeBinaryWith(ValuesWriter writer) throws IOException { @@ -119,22 +129,12 @@ private byte[] encodeBinaryWith(ValuesWriter writer) throws IOException { return bytes; } - private byte[] encodeDictionaryWith(DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter writer) - throws IOException { + private BenchmarkEncodingUtils.EncodedDictionary encodeDictionaryWith( + DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter writer) throws IOException { for (Binary v : data) { writer.writeBytes(v); } - BytesInput dataBytes = writer.getBytes(); - DictionaryPage dictPage = writer.toDictPageAndClose(); - byte[] bytes; - if (dictPage == null) { - bytes = dataBytes.toByteArray(); - } else { - BytesInput allBytes = BytesInput.concat(dataBytes, dictPage.getBytes()); - bytes = allBytes.toByteArray(); - } - writer.close(); - return bytes; + return BenchmarkEncodingUtils.drainDictionary(writer); } // ---- Writer factories ---- @@ -178,8 +178,10 @@ public byte[] encodeDeltaByteArray() throws IOException { @Benchmark @OperationsPerInvocation(VALUE_COUNT) - public byte[] encodeDictionary() throws IOException { - return encodeDictionaryWith(newDictWriter()); + public void encodeDictionary(Blackhole bh) throws IOException { + BenchmarkEncodingUtils.EncodedDictionary encoded = encodeDictionaryWith(newDictWriter()); + bh.consume(encoded.dictData); + bh.consume(encoded.dictPage); } // ---- Decode benchmarks ---- @@ -217,6 +219,11 @@ public void decodeDeltaByteArray(Blackhole bh) throws IOException { @Benchmark 
@OperationsPerInvocation(VALUE_COUNT) public void decodeDictionary(Blackhole bh) throws IOException { + if (!dictionaryAvailable) { + // Dictionary fell back to plain encoding (e.g. high-cardinality long strings + // exceeding MAX_DICT_BYTE_SIZE). Skip to keep the benchmark meaningful. + return; + } DictionaryValuesReader reader = new DictionaryValuesReader(binaryDictionary); reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(dictEncoded))); for (int i = 0; i < VALUE_COUNT; i++) { diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ConcurrentReadWriteBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ConcurrentReadWriteBenchmark.java index 29371f7eb1..de94b422cf 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ConcurrentReadWriteBenchmark.java +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ConcurrentReadWriteBenchmark.java @@ -49,15 +49,23 @@ /** * Multi-threaded benchmarks measuring independent read and write throughput under * concurrency. Uses {@code @Threads(4)} by default (overridable via JMH {@code -t} flag). - * This benchmark does not assert correctness; it measures the cost of each thread + * + *

This benchmark does not assert correctness; it measures the cost of each thread * writing a full file to a stateless sink or reading a shared pre-generated file. + * The set of rows used by {@link #concurrentWrite(Blackhole)} is built once during + * setup and shared (read-only) across all threads, so the timed section measures + * the encoder/serializer pipeline rather than per-row data construction. * *
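<p>For example, with the shaded jar built by this module (main class
+ * {@code org.openjdk.jmh.Main}), an eight-thread run looks like
+ * {@code java -jar <shaded-jar> ConcurrentReadWriteBenchmark -t 8}.
+ *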

    - *
  • {@link #concurrentWrite()} - each thread independently writes to a shared - * {@link BlackHoleOutputFile} (stateless sink)
  • + *
  • {@link #concurrentWrite(Blackhole)} - each thread independently writes the + * shared pre-generated rows to a {@link BlackHoleOutputFile} (stateless sink)
  • *
  • {@link #concurrentRead(Blackhole)} - each thread independently reads the same * pre-generated Parquet file
  • *
+ * + *

{@link Mode#SingleShotTime} is used because each invocation does enough work + * (a full file write or read of {@value TestDataFactory#DEFAULT_ROW_COUNT} rows) + * that JIT amortization across invocations is unnecessary. */ @BenchmarkMode(Mode.SingleShotTime) @OutputTimeUnit(TimeUnit.MILLISECONDS) @@ -69,33 +77,23 @@ public class ConcurrentReadWriteBenchmark { private File tempFile; - private Group[] readRows; - - @State(Scope.Thread) - public static class ThreadData { - private Group[] rows; - - @Setup(Level.Trial) - public void setup() { - rows = TestDataFactory.generateRows( - TestDataFactory.newGroupFactory(), TestDataFactory.DEFAULT_ROW_COUNT, 42L); - } - } + private Group[] rows; @Setup(Level.Trial) public void setup() throws IOException { + rows = TestDataFactory.generateRows( + TestDataFactory.newGroupFactory(), TestDataFactory.DEFAULT_ROW_COUNT, TestDataFactory.DEFAULT_SEED); + // Generate a shared file for concurrent reads tempFile = File.createTempFile("parquet-concurrent-bench-", ".parquet"); tempFile.deleteOnExit(); tempFile.delete(); - readRows = TestDataFactory.generateRows( - TestDataFactory.newGroupFactory(), TestDataFactory.DEFAULT_ROW_COUNT, 42L); try (ParquetWriter writer = ExampleParquetWriter.builder(new LocalOutputFile(tempFile.toPath())) .withWriteMode(ParquetFileWriter.Mode.OVERWRITE) .withType(TestDataFactory.FILE_BENCHMARK_SCHEMA) .build()) { - for (Group row : readRows) { + for (Group row : rows) { writer.write(row); } } @@ -109,19 +107,20 @@ public void tearDown() { } /** - * Each thread writes a full file independently to the shared stateless - * {@link BlackHoleOutputFile} sink. + * Each thread writes the shared pre-generated rows independently to the + * stateless {@link BlackHoleOutputFile} sink. */ @Benchmark - public void concurrentWrite(ThreadData threadData) throws IOException { + public void concurrentWrite(Blackhole bh) throws IOException { try (ParquetWriter writer = ExampleParquetWriter.builder(BlackHoleOutputFile.INSTANCE) .withWriteMode(ParquetFileWriter.Mode.OVERWRITE) .withType(TestDataFactory.FILE_BENCHMARK_SCHEMA) .build()) { - for (Group row : threadData.rows) { + for (Group row : rows) { writer.write(row); } } + bh.consume(rows); } /** diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileReadBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileReadBenchmark.java index eb5b959efa..de8e0b6580 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileReadBenchmark.java +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileReadBenchmark.java @@ -54,7 +54,13 @@ * pre-generated rows using {@link LocalOutputFile}, then read repeatedly during the * benchmark. * - *

Parameterized across compression codec and writer version. + *

Parameterized across compression codec and writer version. The footer parse + * (via {@link LocalInputFile} open) is included in the timed section so the result + * reflects the full open-and-read cost a typical caller would observe. + * + *
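<p>The timed section is essentially the following loop (a sketch; the reader is
+ * opened over the setup-generated temp file):
+ * <pre>{@code
+ * try (ParquetReader<Group> reader = ...) {
+ *   Group g;
+ *   while ((g = reader.read()) != null) {
+ *     bh.consume(g);
+ *   }
+ * }
+ * }</pre>
+ *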

{@link Mode#SingleShotTime} is used because each invocation does enough work + * (a full read of {@value TestDataFactory#DEFAULT_ROW_COUNT} rows) that JIT + * amortization across invocations is unnecessary. */ @BenchmarkMode(Mode.SingleShotTime) @Fork(1) @@ -79,7 +85,7 @@ public void setup() throws IOException { tempFile.delete(); // remove so the writer can create it Group[] rows = TestDataFactory.generateRows( - TestDataFactory.newGroupFactory(), TestDataFactory.DEFAULT_ROW_COUNT, 42L); + TestDataFactory.newGroupFactory(), TestDataFactory.DEFAULT_ROW_COUNT, TestDataFactory.DEFAULT_SEED); try (ParquetWriter writer = ExampleParquetWriter.builder(new LocalOutputFile(tempFile.toPath())) .withWriteMode(ParquetFileWriter.Mode.OVERWRITE) .withType(TestDataFactory.FILE_BENCHMARK_SCHEMA) diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileWriteBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileWriteBenchmark.java index 73f60d7199..f6174bcaa2 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileWriteBenchmark.java +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileWriteBenchmark.java @@ -48,6 +48,10 @@ *

Writes are sent to a {@link BlackHoleOutputFile} to isolate CPU and encoding cost * from filesystem I/O. Parameterized across compression codec, writer version, and * dictionary encoding. + * + *
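<p>Individual combinations can be selected on the JMH command line, for example
+ * {@code -p dictionary=true -p writerVersion=PARQUET_1_0}; the parameter names
+ * match the {@code @Param} fields below.
+ *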

{@link Mode#SingleShotTime} is used because each invocation does enough work + * (a full write of {@value TestDataFactory#DEFAULT_ROW_COUNT} rows) that JIT + * amortization across invocations is unnecessary. */ @BenchmarkMode(Mode.SingleShotTime) @Fork(1) @@ -71,7 +75,7 @@ public class FileWriteBenchmark { @Setup(Level.Trial) public void setup() { rows = TestDataFactory.generateRows( - TestDataFactory.newGroupFactory(), TestDataFactory.DEFAULT_ROW_COUNT, 42L); + TestDataFactory.newGroupFactory(), TestDataFactory.DEFAULT_ROW_COUNT, TestDataFactory.DEFAULT_SEED); } @Benchmark diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/IntEncodingBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/IntEncodingBenchmark.java index df767df455..7665a7462a 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/IntEncodingBenchmark.java +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/IntEncodingBenchmark.java @@ -20,10 +20,8 @@ import java.io.IOException; import java.nio.ByteBuffer; -import java.util.Random; import java.util.concurrent.TimeUnit; import org.apache.parquet.bytes.ByteBufferInputStream; -import org.apache.parquet.bytes.BytesInput; import org.apache.parquet.bytes.HeapByteBufferAllocator; import org.apache.parquet.column.Dictionary; import org.apache.parquet.column.Encoding; @@ -62,6 +60,15 @@ * *

Each benchmark invocation processes {@value #VALUE_COUNT} values. Throughput is * reported per-value using {@link OperationsPerInvocation}. + * + *
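<p>Concretely, JMH divides each invocation's elapsed time by {@value #VALUE_COUNT},
+ * so reported throughput reads as values encoded or decoded per second rather than
+ * invocations per second.
+ *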

BYTE_STREAM_SPLIT is included for completeness even though it is rarely a good + * choice for integer data; it exists here to compare the full set of encodings the + * Parquet writer can emit for INT32. + * + *

The dictionary encode/decode benchmarks measure the full path: the encoder + * produces both the RLE-encoded indices and a {@link DictionaryPage}; the decoder + * consumes the indices through a {@link DictionaryValuesReader} backed by the same + * dictionary. */ @BenchmarkMode(Mode.Throughput) @OutputTimeUnit(TimeUnit.SECONDS) @@ -84,24 +91,25 @@ public class IntEncodingBenchmark { private byte[] deltaEncoded; private byte[] bssEncoded; private byte[] dictDataEncoded; + private DictionaryPage dictPage; private Dictionary intDictionary; + private boolean dictionaryAvailable; @Setup(Level.Trial) public void setup() throws IOException { - Random random = new Random(42); switch (dataPattern) { case "SEQUENTIAL": data = TestDataFactory.generateSequentialInts(VALUE_COUNT); break; case "RANDOM": - data = TestDataFactory.generateRandomInts(VALUE_COUNT, random); + data = TestDataFactory.generateRandomInts(VALUE_COUNT, TestDataFactory.DEFAULT_SEED); break; case "LOW_CARDINALITY": data = TestDataFactory.generateLowCardinalityInts( - VALUE_COUNT, TestDataFactory.LOW_CARDINALITY_DISTINCT, random); + VALUE_COUNT, TestDataFactory.LOW_CARDINALITY_DISTINCT, TestDataFactory.DEFAULT_SEED); break; case "HIGH_CARDINALITY": - data = TestDataFactory.generateHighCardinalityInts(VALUE_COUNT, random); + data = TestDataFactory.generateHighCardinalityInts(VALUE_COUNT, TestDataFactory.DEFAULT_SEED); break; default: throw new IllegalArgumentException("Unknown data pattern: " + dataPattern); @@ -117,11 +125,13 @@ public void setup() throws IOException { for (int v : data) { dictWriter.writeInteger(v); } - BytesInput dictDataBytes = dictWriter.getBytes(); - dictDataEncoded = dictDataBytes.toByteArray(); - DictionaryPage dictPage = dictWriter.toDictPageAndClose().copy(); - intDictionary = new PlainValuesDictionary.PlainIntegerDictionary(dictPage); - dictWriter.close(); + BenchmarkEncodingUtils.EncodedDictionary encoded = BenchmarkEncodingUtils.drainDictionary(dictWriter); + dictDataEncoded = encoded.dictData; + dictPage = encoded.dictPage; + dictionaryAvailable = !encoded.fellBackToPlain(); + if (dictionaryAvailable) { + intDictionary = new PlainValuesDictionary.PlainIntegerDictionary(dictPage); + } } private byte[] encodeWith(ValuesWriter writer) throws IOException { @@ -133,22 +143,12 @@ private byte[] encodeWith(ValuesWriter writer) throws IOException { return bytes; } - private byte[] encodeDictionaryWith(DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter writer) - throws IOException { + private BenchmarkEncodingUtils.EncodedDictionary encodeDictionaryWith( + DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter writer) throws IOException { for (int v : data) { writer.writeInteger(v); } - BytesInput dataBytes = writer.getBytes(); - DictionaryPage dictPage = writer.toDictPageAndClose(); - byte[] bytes; - if (dictPage == null) { - bytes = dataBytes.toByteArray(); - } else { - BytesInput allBytes = BytesInput.concat(dataBytes, dictPage.getBytes()); - bytes = allBytes.toByteArray(); - } - writer.close(); - return bytes; + return BenchmarkEncodingUtils.drainDictionary(writer); } // ---- Writer factories ---- @@ -193,8 +193,10 @@ public byte[] encodeByteStreamSplit() throws IOException { @Benchmark @OperationsPerInvocation(VALUE_COUNT) - public byte[] encodeDictionary() throws IOException { - return encodeDictionaryWith(newDictWriter()); + public void encodeDictionary(Blackhole bh) throws IOException { + BenchmarkEncodingUtils.EncodedDictionary encoded = encodeDictionaryWith(newDictWriter()); + 
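// Hand both artefacts to the Blackhole so JMH dead-code elimination cannot
+ // drop either the encoded indices or the dictionary page.
+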
bh.consume(encoded.dictData); + bh.consume(encoded.dictPage); } // ---- Decode benchmarks ---- @@ -232,6 +234,11 @@ public void decodeByteStreamSplit(Blackhole bh) throws IOException { @Benchmark @OperationsPerInvocation(VALUE_COUNT) public void decodeDictionary(Blackhole bh) throws IOException { + if (!dictionaryAvailable) { + // Dictionary fell back to plain encoding (e.g. very large unique-value sets + // exceeding MAX_DICT_BYTE_SIZE). Skip to keep the benchmark meaningful. + return; + } DictionaryValuesReader reader = new DictionaryValuesReader(intDictionary); reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(dictDataEncoded))); for (int i = 0; i < VALUE_COUNT; i++) { diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/RleDictionaryIndexDecodingBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/RleDictionaryIndexDecodingBenchmark.java index c9d604c946..68c51f0842 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/RleDictionaryIndexDecodingBenchmark.java +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/RleDictionaryIndexDecodingBenchmark.java @@ -20,7 +20,6 @@ import java.io.IOException; import java.nio.ByteBuffer; -import java.util.Random; import java.util.concurrent.TimeUnit; import org.apache.parquet.bytes.ByteBufferInputStream; import org.apache.parquet.bytes.HeapByteBufferAllocator; @@ -46,6 +45,10 @@ * {@link RunLengthBitPackingHybridEncoder}. This isolates the dictionary-id * decode path and is intentionally separate from {@link IntEncodingBenchmark}, * which measures full INT32 value decode paths. + * + *
<p>
Per-invocation overhead (decoder construction and {@link ByteBufferInputStream} + * wrapping) is amortized over {@value #VALUE_COUNT} reads via + * {@link OperationsPerInvocation}. */ @BenchmarkMode(Mode.Throughput) @OutputTimeUnit(TimeUnit.SECONDS) @@ -61,6 +64,13 @@ public class RleDictionaryIndexDecodingBenchmark { private static final int BIT_WIDTH = 10; private static final int MAX_ID = 1 << BIT_WIDTH; + static { + if (TestDataFactory.LOW_CARDINALITY_DISTINCT > MAX_ID) { + throw new IllegalStateException("LOW_CARDINALITY_DISTINCT (" + TestDataFactory.LOW_CARDINALITY_DISTINCT + + ") must fit within BIT_WIDTH=" + BIT_WIDTH + " (MAX_ID=" + MAX_ID + ")"); + } + } + @Param({"SEQUENTIAL", "RANDOM", "LOW_CARDINALITY"}) public String indexPattern; @@ -69,45 +79,38 @@ public class RleDictionaryIndexDecodingBenchmark { @Setup(Level.Trial) public void setup() throws IOException { int[] ids = generateDictionaryIds(); - RunLengthBitPackingHybridEncoder encoder = new RunLengthBitPackingHybridEncoder( - BIT_WIDTH, INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); - for (int id : ids) { - encoder.writeInt(id); + try (RunLengthBitPackingHybridEncoder encoder = new RunLengthBitPackingHybridEncoder( + BIT_WIDTH, INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator())) { + for (int id : ids) { + encoder.writeInt(id); + } + encoded = encoder.toBytes().toByteArray(); } - encoded = encoder.toBytes().toByteArray(); - encoder.close(); } private int[] generateDictionaryIds() { - int[] ids = new int[VALUE_COUNT]; - Random random = new Random(42); switch (indexPattern) { case "SEQUENTIAL": + int[] sequential = new int[VALUE_COUNT]; for (int i = 0; i < VALUE_COUNT; i++) { - ids[i] = i % MAX_ID; + sequential[i] = i % MAX_ID; } - break; + return sequential; case "RANDOM": - for (int i = 0; i < VALUE_COUNT; i++) { - ids[i] = random.nextInt(MAX_ID); - } - break; + return TestDataFactory.generateLowCardinalityInts(VALUE_COUNT, MAX_ID, TestDataFactory.DEFAULT_SEED); case "LOW_CARDINALITY": - for (int i = 0; i < VALUE_COUNT; i++) { - ids[i] = random.nextInt(TestDataFactory.LOW_CARDINALITY_DISTINCT); - } - break; + return TestDataFactory.generateLowCardinalityInts( + VALUE_COUNT, TestDataFactory.LOW_CARDINALITY_DISTINCT, TestDataFactory.DEFAULT_SEED); default: throw new IllegalArgumentException("Unknown index pattern: " + indexPattern); } - return ids; } @Benchmark @OperationsPerInvocation(VALUE_COUNT) public void decodeDictionaryIds(Blackhole bh) throws IOException { - RunLengthBitPackingHybridDecoder decoder = new RunLengthBitPackingHybridDecoder( - BIT_WIDTH, ByteBufferInputStream.wrap(ByteBuffer.wrap(encoded))); + RunLengthBitPackingHybridDecoder decoder = + new RunLengthBitPackingHybridDecoder(BIT_WIDTH, ByteBufferInputStream.wrap(ByteBuffer.wrap(encoded))); for (int i = 0; i < VALUE_COUNT; i++) { bh.consume(decoder.readInt()); } diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/TestDataFactory.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/TestDataFactory.java index bc00f3b070..13c5175d66 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/TestDataFactory.java +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/TestDataFactory.java @@ -44,6 +44,9 @@ public final class TestDataFactory { /** Number of distinct values for low-cardinality data patterns. */ public static final int LOW_CARDINALITY_DISTINCT = 100; + /** Default RNG seed used across benchmarks for deterministic data. 
*/ + public static final long DEFAULT_SEED = 42L; + /** A standard multi-type schema used by file-level benchmarks. */ public static final MessageType FILE_BENCHMARK_SCHEMA = Types.buildMessage() .required(INT32) @@ -112,8 +115,18 @@ public static int[] generateSequentialInts(int count) { return data; } + /** + * Generates uniformly random integers using the given seed. + */ + public static int[] generateRandomInts(int count, long seed) { + return generateRandomInts(count, new Random(seed)); + } + /** * Generates uniformly random integers. + * + *
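+ * <p>Illustrative: {@code generateRandomInts(10, DEFAULT_SEED)} returns the same
+ * array on every call, whereas two generators sharing one {@code Random} consume
+ * its state in call order, so reordering the calls changes the produced data.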
<p>
Note: prefer {@link #generateRandomInts(int, long)} when call ordering between + * generators in the same setup must not influence the produced data. */ public static int[] generateRandomInts(int count, Random random) { int[] data = new int[count]; @@ -123,6 +136,13 @@ public static int[] generateLowCardinalityInts(int count, int distinctValues, Ra return data; } + /** + * Generates low-cardinality integers (values drawn from a small set) using the given seed. + */ + public static int[] generateLowCardinalityInts(int count, int distinctValues, long seed) { + return generateLowCardinalityInts(count, distinctValues, new Random(seed)); + } + /** * Generates low-cardinality integers (values drawn from a small set). */ @@ -134,6 +154,13 @@ public static int[] generateLowCardinalityInts(int count, int distinctValues, Ra return data; } + /** + * Generates high-cardinality integers (all unique in randomized order) using the given seed. + */ + public static int[] generateHighCardinalityInts(int count, long seed) { + return generateHighCardinalityInts(count, new Random(seed)); + } + /** * Generates high-cardinality integers (all unique in randomized order). */ @@ -150,6 +177,14 @@ public static int[] generateHighCardinalityInts(int count, Random random) { // ---- Binary data generation for encoding benchmarks ---- + /** + * Generates binary strings of the given length with the specified cardinality, using + * a deterministic seed. + */ + public static Binary[] generateBinaryData(int count, int stringLength, int distinct, long seed) { + return generateBinaryData(count, stringLength, distinct, new Random(seed)); + } + /** * Generates binary strings of the given length with the specified cardinality. * From 04bce6a670c02fc9797a93cefe14807e20b4d151 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Mej=C3=ADa?= Date: Sun, 10 May 2026 19:46:48 +0200 Subject: [PATCH 04/27] Add JMH CompressionBenchmark for isolated codec throughput measurement Benchmarks raw compress/decompress throughput for each supported codec (SNAPPY, ZSTD, LZ4_RAW, GZIP) at page sizes 64KB, 128KB, 256KB, and 1MB using the heap-based CodecFactory path. Input data mixes sequential, repeated, low-range random, and full random patterns for realistic compression ratios. --- .../benchmarks/CompressionBenchmark.java | 155 ++++++++++++++++++ 1 file changed, 155 insertions(+) create mode 100644 parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/CompressionBenchmark.java diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/CompressionBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/CompressionBenchmark.java new file mode 100644 index 0000000000..9ff2884222 --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/CompressionBenchmark.java @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.benchmarks; + +import java.io.IOException; +import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.bytes.BytesInput; +import org.apache.parquet.compression.CompressionCodecFactory; +import org.apache.parquet.hadoop.CodecFactory; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.TearDown; +import org.openjdk.jmh.annotations.Warmup; + +/** + * Isolated JMH benchmarks for raw Parquet compression and decompression throughput. + * + *
<p>
Measures the performance of {@link CompressionCodecFactory.BytesInputCompressor} + * and {@link CompressionCodecFactory.BytesInputDecompressor} for each supported codec, + * using the heap-based {@link CodecFactory} path. Input data is generated to approximate + * realistic Parquet page content (a mix of sequential, repeated, and random byte patterns). + * + *
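+ * <p>In essence, the two measured operations are (mirroring the benchmark
+ * methods below):
+ * <pre>{@code
+ * BytesInput out = compressor.compress(BytesInput.from(uncompressedData));
+ * byte[] back = decompressor
+ *     .decompress(BytesInput.from(compressedData), decompressedSize)
+ *     .toByteArray();
+ * }</pre>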
<p>
This benchmark isolates the codec hot path from file I/O, encoding, and other + * Parquet overhead, making it ideal for measuring compression-specific optimizations. + */ +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(TimeUnit.SECONDS) +@Fork(1) +@Warmup(iterations = 2, time = 1) +@Measurement(iterations = 3, time = 2) +@State(Scope.Thread) +public class CompressionBenchmark { + + @Param({"SNAPPY", "ZSTD", "LZ4_RAW", "GZIP"}) + public String codec; + + @Param({"65536", "131072", "262144", "1048576"}) + public int pageSize; + + private byte[] uncompressedData; + private byte[] compressedData; + private int decompressedSize; + + private CompressionCodecFactory.BytesInputCompressor compressor; + private CompressionCodecFactory.BytesInputDecompressor decompressor; + private CodecFactory factory; + + @Setup(Level.Trial) + public void setup() throws IOException { + uncompressedData = generatePageData(pageSize, 42L); + decompressedSize = uncompressedData.length; + + Configuration conf = new Configuration(); + factory = new CodecFactory(conf, pageSize); + CompressionCodecName codecName = CompressionCodecName.valueOf(codec); + + compressor = factory.getCompressor(codecName); + decompressor = factory.getDecompressor(codecName); + + // Pre-compress for decompression benchmark; copy to a stable byte array + // since the compressor may reuse its internal buffer. + BytesInput compressed = compressor.compress(BytesInput.from(uncompressedData)); + compressedData = compressed.toByteArray(); + } + + @TearDown(Level.Trial) + public void tearDown() { + factory.release(); + } + + @Benchmark + public BytesInput compress() throws IOException { + return compressor.compress(BytesInput.from(uncompressedData)); + } + + @Benchmark + public byte[] decompress() throws IOException { + // Force materialization of the decompressed data. Without this, codecs using + // the stream-based HeapBytesDecompressor (e.g. GZIP) would return a lazy + // StreamBytesInput, deferring the actual work. toByteArray() is essentially + // free for our optimized implementations (returns the existing byte[]). + return decompressor + .decompress(BytesInput.from(compressedData), decompressedSize) + .toByteArray(); + } + + /** + * Generates byte data that approximates realistic Parquet page content. + * Mixes sequential runs, repeated values, low-range random, and full random + * to produce a realistic compression ratio (~2-4x for fast codecs). 
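+ *
+ * <p>Illustrative sizing: chunk lengths are drawn uniformly from [64, 319] bytes
+ * (mean ~192), so a 64 KiB page mixes roughly 340 chunks and a 1 MiB page
+ * roughly 5,500.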
+ */ + static byte[] generatePageData(int size, long seed) { + Random random = new Random(seed); + byte[] data = new byte[size]; + int i = 0; + while (i < size) { + int patternType = random.nextInt(4); + int chunkSize = Math.min(random.nextInt(256) + 64, size - i); + switch (patternType) { + case 0: // Sequential bytes (highly compressible) + for (int j = 0; j < chunkSize && i < size; j++) { + data[i++] = (byte) (j & 0xFF); + } + break; + case 1: // Repeated value (highly compressible) + byte val = (byte) random.nextInt(256); + for (int j = 0; j < chunkSize && i < size; j++) { + data[i++] = val; + } + break; + case 2: // Small range random (moderately compressible) + for (int j = 0; j < chunkSize && i < size; j++) { + data[i++] = (byte) random.nextInt(16); + } + break; + case 3: // Full random (low compressibility) + byte[] randomChunk = new byte[chunkSize]; + random.nextBytes(randomChunk); + int toCopy = Math.min(chunkSize, size - i); + System.arraycopy(randomChunk, 0, data, i, toCopy); + i += toCopy; + break; + } + } + return data; + } +} From 440c38c4f55ca0fe02173782bd6b3a7d1d7bb07c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Mej=C3=ADa?= Date: Wed, 13 May 2026 10:58:03 +0200 Subject: [PATCH 05/27] Fine-tune JMH benchmarks for targeted per-branch measurement - Add RLE encodeDictionaryIds benchmark to cover par9 encoder pack32Values fast path (previously only decode was benchmarked) - Trim CompressionBenchmark page sizes to boundary conditions (64K, 1MB) to cut redundant mid-points - Increase FileRead/FileWriteBenchmark SS iterations (warmup 3->5, measurement 5->10) for better statistical stability - Increase RowGroupFlushBenchmark iterations (warmup 2->3, measurement 3->5) for improved confidence with 2 param combos --- .../parquet/benchmarks/FileReadBenchmark.java | 7 +++-- .../benchmarks/FileWriteBenchmark.java | 7 +++-- .../RleDictionaryIndexDecodingBenchmark.java | 31 +++++++++++++++---- .../benchmarks/RowGroupFlushBenchmark.java | 4 +-- 4 files changed, 35 insertions(+), 14 deletions(-) diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileReadBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileReadBenchmark.java index de8e0b6580..de133f4607 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileReadBenchmark.java +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileReadBenchmark.java @@ -60,12 +60,13 @@ * *
<p>
{@link Mode#SingleShotTime} is used because each invocation does enough work * (a full read of {@value TestDataFactory#DEFAULT_ROW_COUNT} rows) that JIT - * amortization across invocations is unnecessary. + * amortization across invocations is unnecessary. Ten measurement iterations + * provide stable statistics for SS mode. */ @BenchmarkMode(Mode.SingleShotTime) @Fork(1) -@Warmup(iterations = 3, batchSize = 1) -@Measurement(iterations = 5, batchSize = 1) +@Warmup(iterations = 5, batchSize = 1) +@Measurement(iterations = 10, batchSize = 1) @OutputTimeUnit(TimeUnit.MILLISECONDS) @State(Scope.Benchmark) public class FileReadBenchmark { diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileWriteBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileWriteBenchmark.java index f6174bcaa2..6716010cc3 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileWriteBenchmark.java +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileWriteBenchmark.java @@ -51,12 +51,13 @@ * *
<p>
{@link Mode#SingleShotTime} is used because each invocation does enough work * (a full write of {@value TestDataFactory#DEFAULT_ROW_COUNT} rows) that JIT - * amortization across invocations is unnecessary. + * amortization across invocations is unnecessary. Ten measurement iterations + * provide stable statistics for SS mode. */ @BenchmarkMode(Mode.SingleShotTime) @Fork(1) -@Warmup(iterations = 3, batchSize = 1) -@Measurement(iterations = 5, batchSize = 1) +@Warmup(iterations = 5, batchSize = 1) +@Measurement(iterations = 10, batchSize = 1) @OutputTimeUnit(TimeUnit.MILLISECONDS) @State(Scope.Benchmark) public class FileWriteBenchmark { diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/RleDictionaryIndexDecodingBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/RleDictionaryIndexDecodingBenchmark.java index 68c51f0842..e8f8598fdd 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/RleDictionaryIndexDecodingBenchmark.java +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/RleDictionaryIndexDecodingBenchmark.java @@ -41,12 +41,17 @@ import org.openjdk.jmh.infra.Blackhole; /** - * Decoding micro-benchmark for synthetic dictionary-id pages encoded with - * {@link RunLengthBitPackingHybridEncoder}. This isolates the dictionary-id - * decode path and is intentionally separate from {@link IntEncodingBenchmark}, - * which measures full INT32 value decode paths. + * Encoding and decoding micro-benchmarks for synthetic dictionary-id pages using + * {@link RunLengthBitPackingHybridEncoder} and {@link RunLengthBitPackingHybridDecoder}. + * This isolates the RLE/bit-packing hybrid codec paths and is intentionally + * separate from {@link IntEncodingBenchmark}, which measures full INT32 value + * encode/decode paths. * - *
<p>
Per-invocation overhead (decoder construction and {@link ByteBufferInputStream} + *
<p>
The encode benchmark measures the RLE encoder's {@code pack32Values} fast path + * and bit-packing throughput. The decode benchmark measures the corresponding + * {@code unpack32Values} fast path and RLE run expansion. + * + *
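+ * <p>Worked example: at {@code BIT_WIDTH = 10}, one bit-packed group of 8 values
+ * occupies exactly 10 bytes (80 bits), so each {@code pack32Values} or
+ * {@code unpack32Values} call moves 40 bytes for 32 values.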
<p>
Per-invocation overhead (encoder/decoder construction and {@link ByteBufferInputStream} * wrapping) is amortized over {@value #VALUE_COUNT} reads via * {@link OperationsPerInvocation}. */ @@ -76,9 +81,11 @@ public class RleDictionaryIndexDecodingBenchmark { private byte[] encoded; + private int[] ids; + @Setup(Level.Trial) public void setup() throws IOException { - int[] ids = generateDictionaryIds(); + ids = generateDictionaryIds(); try (RunLengthBitPackingHybridEncoder encoder = new RunLengthBitPackingHybridEncoder( BIT_WIDTH, INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator())) { for (int id : ids) { @@ -106,6 +113,18 @@ private int[] generateDictionaryIds() { } } + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeDictionaryIds() throws IOException { + try (RunLengthBitPackingHybridEncoder encoder = new RunLengthBitPackingHybridEncoder( + BIT_WIDTH, INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator())) { + for (int id : ids) { + encoder.writeInt(id); + } + return encoder.toBytes().toByteArray(); + } + } + @Benchmark @OperationsPerInvocation(VALUE_COUNT) public void decodeDictionaryIds(Blackhole bh) throws IOException { diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/RowGroupFlushBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/RowGroupFlushBenchmark.java index 753b27de4a..9bc5cab0a8 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/RowGroupFlushBenchmark.java +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/RowGroupFlushBenchmark.java @@ -70,8 +70,8 @@ @Fork( value = 1, jvmArgs = {"-Xms512m", "-Xmx1g"}) -@Warmup(iterations = 2) -@Measurement(iterations = 3) +@Warmup(iterations = 3) +@Measurement(iterations = 5) @OutputTimeUnit(TimeUnit.MILLISECONDS) @State(Scope.Thread) public class RowGroupFlushBenchmark { From bf599cda1ca40df3e5e01b419d3cc182eb26d689 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Mej=C3=ADa?= Date: Wed, 13 May 2026 13:29:28 +0200 Subject: [PATCH 06/27] GH-3522: Add batch read APIs to ValuesReader hierarchy (from par13) --- .../parquet/column/values/ValuesReader.java | 56 +++++++++++++++++++ .../ByteStreamSplitValuesReader.java | 13 +++++ .../ByteStreamSplitValuesReaderForDouble.java | 8 +++ .../ByteStreamSplitValuesReaderForFloat.java | 8 +++ ...ByteStreamSplitValuesReaderForInteger.java | 8 +++ .../ByteStreamSplitValuesReaderForLong.java | 8 +++ .../delta/DeltaBinaryPackingValuesReader.java | 16 ++++++ .../dictionary/DictionaryValuesReader.java | 53 ++++++++++++++++++ .../values/plain/PlainValuesReader.java | 44 +++++++++++++++ .../rle/RunLengthBitPackingHybridDecoder.java | 34 +++++++++++ 10 files changed, 248 insertions(+) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/ValuesReader.java b/parquet-column/src/main/java/org/apache/parquet/column/values/ValuesReader.java index 1713acc012..bd7f3eaeff 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/ValuesReader.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/ValuesReader.java @@ -185,6 +185,62 @@ public long readLong() { throw new UnsupportedOperationException(); } + // ---- Batch read methods ---- + // Default implementations loop over the per-value methods. + // Subclasses should override with bulk/memcpy-style implementations. + + /** + * Reads {@code count} integers into {@code dest} starting at {@code offset}. 
+ * + * @param dest destination array + * @param offset start index in dest + * @param count number of values to read + */ + public void readIntegers(int[] dest, int offset, int count) { + for (int i = 0; i < count; i++) { + dest[offset + i] = readInteger(); + } + } + + /** + * Reads {@code count} longs into {@code dest} starting at {@code offset}. + * + * @param dest destination array + * @param offset start index in dest + * @param count number of values to read + */ + public void readLongs(long[] dest, int offset, int count) { + for (int i = 0; i < count; i++) { + dest[offset + i] = readLong(); + } + } + + /** + * Reads {@code count} floats into {@code dest} starting at {@code offset}. + * + * @param dest destination array + * @param offset start index in dest + * @param count number of values to read + */ + public void readFloats(float[] dest, int offset, int count) { + for (int i = 0; i < count; i++) { + dest[offset + i] = readFloat(); + } + } + + /** + * Reads {@code count} doubles into {@code dest} starting at {@code offset}. + * + * @param dest destination array + * @param offset start index in dest + * @param count number of values to read + */ + public void readDoubles(double[] dest, int offset, int count) { + for (int i = 0; i < count; i++) { + dest[offset + i] = readDouble(); + } + } + /** * Skips the next value in the page */ diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReader.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReader.java index c8ab3043bd..6b7449ea11 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReader.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReader.java @@ -49,6 +49,19 @@ protected int nextElementByteOffset() { return offset; } + /** + * Advances the stream position by {@code count} elements and returns the byte offset + * of the first element. Used by batch read methods in subclasses. 
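+ *
+ * <p>Intended use in a subclass, sketched from the typed readers below (here
+ * with 4 as the INT32 {@code elementSizeInBytes}):
+ * <pre>{@code
+ * int base = advanceByteOffset(count);
+ * for (int i = 0; i < count; i++) {
+ *   dest[offset + i] = decodedDataBuffer.getInt(base + i * 4);
+ * }
+ * }</pre>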
+ */ + protected int advanceByteOffset(int count) { + if (indexInStream + count > valuesCount) { + throw new ParquetDecodingException("Byte-stream data was already exhausted."); + } + int offset = indexInStream * elementSizeInBytes; + indexInStream += count; + return offset; + } + // Decode an entire data page private byte[] decodeData(ByteBuffer encoded, int valuesCount) { assert encoded.limit() == valuesCount * elementSizeInBytes; diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForDouble.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForDouble.java index e725dc9fce..e2053eec3a 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForDouble.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForDouble.java @@ -27,4 +27,12 @@ public ByteStreamSplitValuesReaderForDouble() { public double readDouble() { return decodedDataBuffer.getDouble(nextElementByteOffset()); } + + @Override + public void readDoubles(double[] dest, int offset, int count) { + int byteOffset = advanceByteOffset(count); + for (int i = 0; i < count; i++) { + dest[offset + i] = decodedDataBuffer.getDouble(byteOffset + i * 8); + } + } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForFloat.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForFloat.java index cecb7925d8..eb80eacbf1 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForFloat.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForFloat.java @@ -27,4 +27,12 @@ public ByteStreamSplitValuesReaderForFloat() { public float readFloat() { return decodedDataBuffer.getFloat(nextElementByteOffset()); } + + @Override + public void readFloats(float[] dest, int offset, int count) { + int byteOffset = advanceByteOffset(count); + for (int i = 0; i < count; i++) { + dest[offset + i] = decodedDataBuffer.getFloat(byteOffset + i * 4); + } + } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForInteger.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForInteger.java index 57f9bfdf03..8bac36da17 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForInteger.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForInteger.java @@ -27,4 +27,12 @@ public ByteStreamSplitValuesReaderForInteger() { public int readInteger() { return decodedDataBuffer.getInt(nextElementByteOffset()); } + + @Override + public void readIntegers(int[] dest, int offset, int count) { + int byteOffset = advanceByteOffset(count); + for (int i = 0; i < count; i++) { + dest[offset + i] = decodedDataBuffer.getInt(byteOffset + i * 4); + } + } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForLong.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForLong.java index c7711d8919..5186210ef5 100644 --- 
a/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForLong.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForLong.java @@ -27,4 +27,12 @@ public ByteStreamSplitValuesReaderForLong() { public long readLong() { return decodedDataBuffer.getLong(nextElementByteOffset()); } + + @Override + public void readLongs(long[] dest, int offset, int count) { + int byteOffset = advanceByteOffset(count); + for (int i = 0; i < count; i++) { + dest[offset + i] = decodedDataBuffer.getLong(byteOffset + i * 8); + } + } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/delta/DeltaBinaryPackingValuesReader.java b/parquet-column/src/main/java/org/apache/parquet/column/values/delta/DeltaBinaryPackingValuesReader.java index 259ebc09c0..6726614460 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/delta/DeltaBinaryPackingValuesReader.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/delta/DeltaBinaryPackingValuesReader.java @@ -112,6 +112,22 @@ public long readLong() { return valuesBuffer[valuesRead++]; } + @Override + public void readIntegers(int[] dest, int offset, int count) { + checkRead(); + for (int i = 0; i < count; i++) { + dest[offset + i] = (int) valuesBuffer[valuesRead + i]; + } + valuesRead += count; + } + + @Override + public void readLongs(long[] dest, int offset, int count) { + checkRead(); + System.arraycopy(valuesBuffer, valuesRead, dest, offset, count); + valuesRead += count; + } + private void checkRead() { if (valuesRead >= totalValueCount) { throw new ParquetDecodingException("no more value to read, total value count is " + totalValueCount); diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/DictionaryValuesReader.java b/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/DictionaryValuesReader.java index 53fafc55dc..db344c3e63 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/DictionaryValuesReader.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/DictionaryValuesReader.java @@ -117,6 +117,59 @@ public long readLong() { } } + @Override + public void readIntegers(int[] dest, int offset, int count) { + try { + // Batch-decode dictionary IDs, then batch-lookup + int[] ids = new int[count]; + decoder.readInts(ids, 0, count); + for (int i = 0; i < count; i++) { + dest[offset + i] = dictionary.decodeToInt(ids[i]); + } + } catch (IOException e) { + throw new ParquetDecodingException(e); + } + } + + @Override + public void readLongs(long[] dest, int offset, int count) { + try { + int[] ids = new int[count]; + decoder.readInts(ids, 0, count); + for (int i = 0; i < count; i++) { + dest[offset + i] = dictionary.decodeToLong(ids[i]); + } + } catch (IOException e) { + throw new ParquetDecodingException(e); + } + } + + @Override + public void readFloats(float[] dest, int offset, int count) { + try { + int[] ids = new int[count]; + decoder.readInts(ids, 0, count); + for (int i = 0; i < count; i++) { + dest[offset + i] = dictionary.decodeToFloat(ids[i]); + } + } catch (IOException e) { + throw new ParquetDecodingException(e); + } + } + + @Override + public void readDoubles(double[] dest, int offset, int count) { + try { + int[] ids = new int[count]; + decoder.readInts(ids, 0, count); + for (int i = 0; i < count; i++) { + dest[offset + i] = dictionary.decodeToDouble(ids[i]); + } + } catch 
(IOException e) { + throw new ParquetDecodingException(e); + } + } + @Override public void skip() { try { diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/plain/PlainValuesReader.java b/parquet-column/src/main/java/org/apache/parquet/column/values/plain/PlainValuesReader.java index a0c7af7394..a3d0d06923 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/plain/PlainValuesReader.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/plain/PlainValuesReader.java @@ -71,6 +71,17 @@ public double readDouble() { throw new ParquetDecodingException("could not read double", e); } } + + @Override + public void readDoubles(double[] dest, int offset, int count) { + try { + for (int i = 0; i < count; i++) { + dest[offset + i] = in.readDouble(); + } + } catch (IOException e) { + throw new ParquetDecodingException("could not read doubles", e); + } + } } public static class FloatPlainValuesReader extends PlainValuesReader { @@ -92,6 +103,17 @@ public float readFloat() { throw new ParquetDecodingException("could not read float", e); } } + + @Override + public void readFloats(float[] dest, int offset, int count) { + try { + for (int i = 0; i < count; i++) { + dest[offset + i] = in.readFloat(); + } + } catch (IOException e) { + throw new ParquetDecodingException("could not read floats", e); + } + } } public static class IntegerPlainValuesReader extends PlainValuesReader { @@ -113,6 +135,17 @@ public int readInteger() { throw new ParquetDecodingException("could not read int", e); } } + + @Override + public void readIntegers(int[] dest, int offset, int count) { + try { + for (int i = 0; i < count; i++) { + dest[offset + i] = in.readInt(); + } + } catch (IOException e) { + throw new ParquetDecodingException("could not read ints", e); + } + } } public static class LongPlainValuesReader extends PlainValuesReader { @@ -134,5 +167,16 @@ public long readLong() { throw new ParquetDecodingException("could not read long", e); } } + + @Override + public void readLongs(long[] dest, int offset, int count) { + try { + for (int i = 0; i < count; i++) { + dest[offset + i] = in.readLong(); + } + } catch (IOException e) { + throw new ParquetDecodingException("could not read longs", e); + } + } } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridDecoder.java b/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridDecoder.java index e55b276b29..8064c25e10 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridDecoder.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridDecoder.java @@ -77,6 +77,40 @@ public int readInt() throws IOException { return result; } + /** + * Reads {@code count} int values into {@code dest} starting at {@code offset}. + * This avoids per-value virtual dispatch overhead by batching across RLE runs + * and packed groups. 
+ * + * @param dest destination array + * @param offset start index in dest + * @param count number of values to read + */ + public void readInts(int[] dest, int offset, int count) throws IOException { + int remaining = count; + int pos = offset; + while (remaining > 0) { + if (currentCount == 0) { + readNext(); + } + int batchSize = Math.min(remaining, currentCount); + switch (mode) { + case RLE: + java.util.Arrays.fill(dest, pos, pos + batchSize, currentValue); + break; + case PACKED: + int startIdx = currentBuffer.length - currentCount; + System.arraycopy(currentBuffer, startIdx, dest, pos, batchSize); + break; + default: + throw new ParquetDecodingException("not a valid mode " + mode); + } + currentCount -= batchSize; + remaining -= batchSize; + pos += batchSize; + } + } + private void readNext() throws IOException { Preconditions.checkArgument(in.available() > 0, "Reading past RLE/BitPacking stream."); final int header = BytesUtils.readUnsignedVarInt(in); From 17e20bdf8109cd63f2508ff3802502beedb5b1b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Mej=C3=ADa?= Date: Wed, 13 May 2026 13:29:33 +0200 Subject: [PATCH 07/27] Override readIntegers() in RLE ValuesReader to delegate to batch decoder (from par13) --- .../rle/RunLengthBitPackingHybridValuesReader.java | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridValuesReader.java b/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridValuesReader.java index 0bd5a18d2b..e17867d5f1 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridValuesReader.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridValuesReader.java @@ -54,6 +54,15 @@ public int readInteger() { } } + @Override + public void readIntegers(int[] dest, int offset, int count) { + try { + decoder.readInts(dest, offset, count); + } catch (IOException e) { + throw new ParquetDecodingException(e); + } + } + @Override public boolean readBoolean() { return readInteger() == 0 ? false : true; From 33478f47db33b0edc09b7637508de1ac64d29a66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Mej=C3=ADa?= Date: Wed, 13 May 2026 13:33:04 +0200 Subject: [PATCH 08/27] Add ValuesReader-level and batch RLE decode benchmarks Restore decodeDictionaryIdsBatch and decodeValuesReaderBatch from the original cherry-pick, now that the par13 batch read APIs are available. Add decodeValuesReader for production-path (ValuesReader wrapper) coverage. 
Five RLE benchmark methods now cover par9 and par13: - encodeDictionaryIds: RLE encoder pack32Values fast path - decodeDictionaryIds: per-value RLE decoder - decodeDictionaryIdsBatch: batch RLE decoder via readInts() - decodeValuesReader: per-value via ValuesReader wrapper - decodeValuesReaderBatch: batch via ValuesReader.readIntegers() --- .../RleDictionaryIndexDecodingBenchmark.java | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/RleDictionaryIndexDecodingBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/RleDictionaryIndexDecodingBenchmark.java index e8f8598fdd..f60349c2a0 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/RleDictionaryIndexDecodingBenchmark.java +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/RleDictionaryIndexDecodingBenchmark.java @@ -20,11 +20,13 @@ import java.io.IOException; import java.nio.ByteBuffer; +import java.nio.ByteOrder; import java.util.concurrent.TimeUnit; import org.apache.parquet.bytes.ByteBufferInputStream; import org.apache.parquet.bytes.HeapByteBufferAllocator; import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridDecoder; import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridEncoder; +import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridValuesReader; import org.openjdk.jmh.annotations.Benchmark; import org.openjdk.jmh.annotations.BenchmarkMode; import org.openjdk.jmh.annotations.Fork; @@ -83,6 +85,9 @@ public class RleDictionaryIndexDecodingBenchmark { private int[] ids; + // encoded with 4-byte LE length prefix, as expected by ValuesReader.initFromPage() + private byte[] encodedWithLengthPrefix; + @Setup(Level.Trial) public void setup() throws IOException { ids = generateDictionaryIds(); @@ -93,6 +98,11 @@ BIT_WIDTH, INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator())) { } encoded = encoder.toBytes().toByteArray(); } + + // Prepend 4-byte LE length for ValuesReader.initFromPage() format + encodedWithLengthPrefix = new byte[4 + encoded.length]; + ByteBuffer.wrap(encodedWithLengthPrefix).order(ByteOrder.LITTLE_ENDIAN).putInt(encoded.length); + System.arraycopy(encoded, 0, encodedWithLengthPrefix, 4, encoded.length); } private int[] generateDictionaryIds() { @@ -134,4 +144,38 @@ public void decodeDictionaryIds(Blackhole bh) throws IOException { bh.consume(decoder.readInt()); } } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public int[] decodeDictionaryIdsBatch() throws IOException { + RunLengthBitPackingHybridDecoder decoder = + new RunLengthBitPackingHybridDecoder(BIT_WIDTH, ByteBufferInputStream.wrap(ByteBuffer.wrap(encoded))); + int[] result = new int[VALUE_COUNT]; + decoder.readInts(result, 0, VALUE_COUNT); + return result; + } + + // ---- ValuesReader-level benchmarks ---- + // These go through the RunLengthBitPackingHybridValuesReader wrapper, + // which is the path used by ColumnReader in production. 
+ + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeValuesReader(Blackhole bh) throws IOException { + RunLengthBitPackingHybridValuesReader reader = new RunLengthBitPackingHybridValuesReader(BIT_WIDTH); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(encodedWithLengthPrefix))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(reader.readInteger()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public int[] decodeValuesReaderBatch() throws IOException { + RunLengthBitPackingHybridValuesReader reader = new RunLengthBitPackingHybridValuesReader(BIT_WIDTH); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(encodedWithLengthPrefix))); + int[] result = new int[VALUE_COUNT]; + reader.readIntegers(result, 0, VALUE_COUNT); + return result; + } } From 529429400183e4b2334ef8e60b11d7f3158f110a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Mej=C3=ADa?= Date: Wed, 13 May 2026 15:11:45 +0200 Subject: [PATCH 09/27] Add PLAIN decoding benchmark for all numeric types (INT32/INT64/FLOAT/DOUBLE) Covers per-value and batch decode paths for PlainValuesReader across all four numeric primitive types. Uses pre-allocated destination arrays to avoid per-invocation allocation noise in batch measurements. --- .../benchmarks/PlainDecodingBenchmark.java | 204 ++++++++++++++++++ 1 file changed, 204 insertions(+) create mode 100644 parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/PlainDecodingBenchmark.java diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/PlainDecodingBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/PlainDecodingBenchmark.java new file mode 100644 index 0000000000..eea9f01986 --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/PlainDecodingBenchmark.java @@ -0,0 +1,204 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.benchmarks; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.bytes.HeapByteBufferAllocator; +import org.apache.parquet.column.values.plain.PlainValuesReader; +import org.apache.parquet.column.values.plain.PlainValuesWriter; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OperationsPerInvocation; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +/** + * Decoding micro-benchmarks for the PLAIN encoding across the four numeric primitive + * types: {@code INT32}, {@code INT64}, {@code FLOAT}, {@code DOUBLE}. + * + *
<p>
Each invocation decodes {@value #VALUE_COUNT} values. Per-value methods measure + * scalar read throughput; batch methods measure bulk array-fill throughput using + * {@link PlainValuesReader}'s bulk {@code ByteBuffer} view reads. + * + *
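+ * <p>Worked sizing: each PLAIN page is {@code VALUE_COUNT * width} bytes, i.e.
+ * 400,000 bytes for INT32/FLOAT and 800,000 bytes for INT64/DOUBLE, comfortably
+ * inside the writer's 1 MiB {@code PAGE_SIZE}.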
<p>
INT32 per-value and batch decode are also available in {@link IntEncodingBenchmark} + * alongside other INT32 encodings. This benchmark focuses on the PLAIN encoding path + * for all four types to validate the bulk view buffer optimization uniformly. + */ +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(TimeUnit.SECONDS) +@Fork(1) +@Warmup(iterations = 3, time = 1) +@Measurement(iterations = 5, time = 1) +@State(Scope.Thread) +public class PlainDecodingBenchmark { + + static final int VALUE_COUNT = 100_000; + private static final int INIT_SLAB_SIZE = 64 * 1024; + private static final int PAGE_SIZE = 1024 * 1024; + + private byte[] intPage; + private byte[] longPage; + private byte[] floatPage; + private byte[] doublePage; + + // Pre-allocated destination arrays to avoid per-invocation allocation noise + private int[] intDest; + private long[] longDest; + private float[] floatDest; + private double[] doubleDest; + + @Setup(Level.Trial) + public void setup() throws IOException { + Random r = new Random(42); + + // Pre-allocate destination arrays + intDest = new int[VALUE_COUNT]; + longDest = new long[VALUE_COUNT]; + floatDest = new float[VALUE_COUNT]; + doubleDest = new double[VALUE_COUNT]; + + // Encode INT32 + PlainValuesWriter w = new PlainValuesWriter(INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + for (int i = 0; i < VALUE_COUNT; i++) { + w.writeInteger(r.nextInt()); + } + intPage = w.getBytes().toByteArray(); + w.close(); + + // Encode INT64 + w = new PlainValuesWriter(INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + for (int i = 0; i < VALUE_COUNT; i++) { + w.writeLong(r.nextLong()); + } + longPage = w.getBytes().toByteArray(); + w.close(); + + // Encode FLOAT + w = new PlainValuesWriter(INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + for (int i = 0; i < VALUE_COUNT; i++) { + w.writeFloat(r.nextFloat()); + } + floatPage = w.getBytes().toByteArray(); + w.close(); + + // Encode DOUBLE + w = new PlainValuesWriter(INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + for (int i = 0; i < VALUE_COUNT; i++) { + w.writeDouble(r.nextDouble()); + } + doublePage = w.getBytes().toByteArray(); + w.close(); + } + + // ---- INT32 ---- + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeInt(Blackhole bh) throws IOException { + PlainValuesReader.IntegerPlainValuesReader reader = new PlainValuesReader.IntegerPlainValuesReader(); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(intPage))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(reader.readInteger()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public int[] decodeIntBatch() throws IOException { + PlainValuesReader.IntegerPlainValuesReader reader = new PlainValuesReader.IntegerPlainValuesReader(); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(intPage))); + reader.readIntegers(intDest, 0, VALUE_COUNT); + return intDest; + } + + // ---- INT64 ---- + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeLong(Blackhole bh) throws IOException { + PlainValuesReader.LongPlainValuesReader reader = new PlainValuesReader.LongPlainValuesReader(); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(longPage))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(reader.readLong()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public long[] decodeLongBatch() throws IOException { + PlainValuesReader.LongPlainValuesReader reader = new 
PlainValuesReader.LongPlainValuesReader(); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(longPage))); + reader.readLongs(longDest, 0, VALUE_COUNT); + return longDest; + } + + // ---- FLOAT ---- + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeFloat(Blackhole bh) throws IOException { + PlainValuesReader.FloatPlainValuesReader reader = new PlainValuesReader.FloatPlainValuesReader(); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(floatPage))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(reader.readFloat()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public float[] decodeFloatBatch() throws IOException { + PlainValuesReader.FloatPlainValuesReader reader = new PlainValuesReader.FloatPlainValuesReader(); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(floatPage))); + reader.readFloats(floatDest, 0, VALUE_COUNT); + return floatDest; + } + + // ---- DOUBLE ---- + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeDouble(Blackhole bh) throws IOException { + PlainValuesReader.DoublePlainValuesReader reader = new PlainValuesReader.DoublePlainValuesReader(); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(doublePage))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(reader.readDouble()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public double[] decodeDoubleBatch() throws IOException { + PlainValuesReader.DoublePlainValuesReader reader = new PlainValuesReader.DoublePlainValuesReader(); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(doublePage))); + reader.readDoubles(doubleDest, 0, VALUE_COUNT); + return doubleDest; + } +} From ecb854e8079600b1a1a61f6615b967191204b72c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Mej=C3=ADa?= Date: Wed, 13 May 2026 15:27:34 +0200 Subject: [PATCH 10/27] Add batch decode benchmarks to ByteStreamSplitDecodingBenchmark Add decodeFloatBatch, decodeDoubleBatch, decodeIntBatch, decodeLongBatch benchmarks with pre-allocated destination arrays to measure readXxx(dest, offset, count) throughput for all four BSS primitive types. 
--- .../ByteStreamSplitDecodingBenchmark.java | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ByteStreamSplitDecodingBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ByteStreamSplitDecodingBenchmark.java index e59b7ba941..81fda8c186 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ByteStreamSplitDecodingBenchmark.java +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ByteStreamSplitDecodingBenchmark.java @@ -72,6 +72,12 @@ public class ByteStreamSplitDecodingBenchmark { private byte[] intPage; private byte[] longPage; + // Pre-allocated batch destination arrays (avoid per-invocation allocation artifact) + private float[] floatDest; + private double[] doubleDest; + private int[] intDest; + private long[] longDest; + @Setup(Level.Trial) public void setup() throws IOException { Random random = new Random(42); @@ -122,6 +128,11 @@ public void setup() throws IOException { longPage = w.getBytes().toByteArray(); w.close(); } + + floatDest = new float[VALUE_COUNT]; + doubleDest = new double[VALUE_COUNT]; + intDest = new int[VALUE_COUNT]; + longDest = new long[VALUE_COUNT]; } private static void init(ByteStreamSplitValuesReader r, byte[] page) throws IOException { @@ -167,4 +178,40 @@ public void decodeLong(Blackhole bh) throws IOException { bh.consume(r.readLong()); } } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeFloatBatch(Blackhole bh) throws IOException { + ByteStreamSplitValuesReaderForFloat r = new ByteStreamSplitValuesReaderForFloat(); + init(r, floatPage); + r.readFloats(floatDest, 0, VALUE_COUNT); + bh.consume(floatDest); + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeDoubleBatch(Blackhole bh) throws IOException { + ByteStreamSplitValuesReaderForDouble r = new ByteStreamSplitValuesReaderForDouble(); + init(r, doublePage); + r.readDoubles(doubleDest, 0, VALUE_COUNT); + bh.consume(doubleDest); + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeIntBatch(Blackhole bh) throws IOException { + ByteStreamSplitValuesReaderForInteger r = new ByteStreamSplitValuesReaderForInteger(); + init(r, intPage); + r.readIntegers(intDest, 0, VALUE_COUNT); + bh.consume(intDest); + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeLongBatch(Blackhole bh) throws IOException { + ByteStreamSplitValuesReaderForLong r = new ByteStreamSplitValuesReaderForLong(); + init(r, longPage); + r.readLongs(longDest, 0, VALUE_COUNT); + bh.consume(longDest); + } } From 1072d585a4d36c3d3657c262bd782c9b8048fce3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Mej=C3=ADa?= Date: Wed, 13 May 2026 15:44:12 +0200 Subject: [PATCH 11/27] Add BOOLEAN encoding characterization benchmark (V1 PLAIN vs V2 RLE) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BooleanEncodingBenchmark exercises both encoding paths across six data patterns: ALL_TRUE, ALL_FALSE, ALTERNATING, RANDOM, MOSTLY_TRUE_99, MOSTLY_FALSE_99. Key findings (100K values): Encode: V1 PLAIN is data-independent (~880M ops/s). V2 RLE ranges from 2,344M (ALL_FALSE, +166%) to 192M (RANDOM, -78%). Decode: V2 RLE always >= V1 PLAIN — from +154% (ALL_FALSE) to +7% (ALTERNATING). The RLE decode penalty for random data is negligible. 
The severe RLE encode penalty for random data (4.6x slower than PLAIN) suggests the V1/V2 split is well-justified: V2 RLE is ideal for the common case of skewed boolean columns, while V1 PLAIN is safer for high-entropy data. --- .../benchmarks/BooleanEncodingBenchmark.java | 177 ++++++++++++++++++ 1 file changed, 177 insertions(+) create mode 100644 parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BooleanEncodingBenchmark.java diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BooleanEncodingBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BooleanEncodingBenchmark.java new file mode 100644 index 0000000000..21d8b28c9d --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BooleanEncodingBenchmark.java @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.benchmarks; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.bytes.HeapByteBufferAllocator; +import org.apache.parquet.column.values.ValuesReader; +import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.column.values.plain.BooleanPlainValuesReader; +import org.apache.parquet.column.values.plain.BooleanPlainValuesWriter; +import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridValuesReader; +import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridValuesWriter; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OperationsPerInvocation; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +/** + * Characterization benchmarks for BOOLEAN encoding in Parquet. + * + *
<p>BOOLEAN columns use two distinct encoding paths:
+ * <ul>
+ *   <li>V1 (PLAIN): {@link BooleanPlainValuesWriter} delegates to
+ *       {@code ByteBitPackingValuesWriter(bitWidth=1)}. Always bit-packs.</li>
+ *   <li>V2 (RLE): {@link RunLengthBitPackingHybridValuesWriter} with
+ *       {@code bitWidth=1}. Uses the RLE/bit-packing hybrid, which can
+ *       run-length encode long runs of identical values.</li>
+ * </ul>
+ *
+ * <p>
The {@code dataPattern} parameter exercises RLE's best cases (ALL_TRUE, + * ALL_FALSE), worst case (ALTERNATING), and realistic distributions (RANDOM, + * MOSTLY_TRUE_99, MOSTLY_FALSE_99). + * + *
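+ * <p>Illustrative size arithmetic for the extremes: V1 PLAIN always bit-packs
+ * {@value #VALUE_COUNT} values into 12,500 bytes, while V2 RLE collapses
+ * ALL_TRUE into a single run of about 4 bytes (a 3-byte varint header encoding
+ * {@code count << 1} plus one value byte). ALTERNATING never produces a run of
+ * 8 identical values, so the hybrid degenerates to bit-packed groups.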
<p>
Each invocation processes {@value #VALUE_COUNT} values; throughput is + * reported per-value via {@link OperationsPerInvocation}. + */ +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(TimeUnit.SECONDS) +@Fork(1) +@Warmup(iterations = 3, time = 1) +@Measurement(iterations = 5, time = 1) +@State(Scope.Thread) +public class BooleanEncodingBenchmark { + + static final int VALUE_COUNT = 100_000; + private static final int INIT_SLAB_SIZE = 64 * 1024; + private static final int PAGE_SIZE = 4 * 1024 * 1024; + + @Param({"ALL_TRUE", "ALL_FALSE", "ALTERNATING", "RANDOM", "MOSTLY_TRUE_99", "MOSTLY_FALSE_99"}) + public String dataPattern; + + private boolean[] data; + private byte[] v1Page; + private byte[] v2Page; + + @Setup(Level.Trial) + public void setup() throws IOException { + data = generateData(dataPattern); + v1Page = encodeV1(data); + v2Page = encodeV2(data); + } + + private static boolean[] generateData(String pattern) { + boolean[] d = new boolean[VALUE_COUNT]; + Random rng = new Random(42); + switch (pattern) { + case "ALL_TRUE": + for (int i = 0; i < VALUE_COUNT; i++) d[i] = true; + break; + case "ALL_FALSE": + // already false + break; + case "ALTERNATING": + for (int i = 0; i < VALUE_COUNT; i++) d[i] = (i & 1) == 0; + break; + case "RANDOM": + for (int i = 0; i < VALUE_COUNT; i++) d[i] = rng.nextBoolean(); + break; + case "MOSTLY_TRUE_99": + for (int i = 0; i < VALUE_COUNT; i++) d[i] = rng.nextInt(100) != 0; + break; + case "MOSTLY_FALSE_99": + for (int i = 0; i < VALUE_COUNT; i++) d[i] = rng.nextInt(100) == 0; + break; + default: + throw new IllegalArgumentException("Unknown pattern: " + pattern); + } + return d; + } + + private static byte[] encodeV1(boolean[] values) throws IOException { + ValuesWriter w = new BooleanPlainValuesWriter(); + for (boolean v : values) { + w.writeBoolean(v); + } + byte[] bytes = w.getBytes().toByteArray(); + w.close(); + return bytes; + } + + private static byte[] encodeV2(boolean[] values) throws IOException { + ValuesWriter w = new RunLengthBitPackingHybridValuesWriter( + 1, INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + for (boolean v : values) { + w.writeBoolean(v); + } + byte[] bytes = w.getBytes().toByteArray(); + w.close(); + return bytes; + } + + // ---- Encode benchmarks ---- + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodePlainV1() throws IOException { + return encodeV1(data); + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeRleV2() throws IOException { + return encodeV2(data); + } + + // ---- Decode benchmarks ---- + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodePlainV1(Blackhole bh) throws IOException { + ValuesReader r = new BooleanPlainValuesReader(); + r.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(v1Page))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(r.readBoolean()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeRleV2(Blackhole bh) throws IOException { + ValuesReader r = new RunLengthBitPackingHybridValuesReader(1); + r.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(v2Page))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(r.readBoolean()); + } + } +} From ae8d22a884743d132e012eac4a1a5289355955bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Mej=C3=ADa?= Date: Wed, 13 May 2026 15:59:07 +0200 Subject: [PATCH 12/27] Add batch decode benchmarks to BooleanEncodingBenchmark (V1 PLAIN + V2 RLE) Adds decodePlainV1Batch and 
decodeRleV2Batch benchmark methods that exercise the new readBooleans() batch API. Uses a pre-allocated boolean[] destination array to isolate decode throughput from allocation overhead. --- .../benchmarks/BooleanEncodingBenchmark.java | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BooleanEncodingBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BooleanEncodingBenchmark.java index 21d8b28c9d..af31242f9f 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BooleanEncodingBenchmark.java +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BooleanEncodingBenchmark.java @@ -83,11 +83,15 @@ public class BooleanEncodingBenchmark { private byte[] v1Page; private byte[] v2Page; + // Pre-allocated batch destination array + private boolean[] boolDest; + @Setup(Level.Trial) public void setup() throws IOException { data = generateData(dataPattern); v1Page = encodeV1(data); v2Page = encodeV2(data); + boolDest = new boolean[VALUE_COUNT]; } private static boolean[] generateData(String pattern) { @@ -174,4 +178,24 @@ public void decodeRleV2(Blackhole bh) throws IOException { bh.consume(r.readBoolean()); } } + + // ---- Batch decode benchmarks ---- + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodePlainV1Batch(Blackhole bh) throws IOException { + ValuesReader r = new BooleanPlainValuesReader(); + r.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(v1Page))); + r.readBooleans(boolDest, 0, VALUE_COUNT); + bh.consume(boolDest); + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeRleV2Batch(Blackhole bh) throws IOException { + ValuesReader r = new RunLengthBitPackingHybridValuesReader(1); + r.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(v2Page))); + r.readBooleans(boolDest, 0, VALUE_COUNT); + bh.consume(boolDest); + } } From dcd9812558689de0eb560d120ec3f18a76385d45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Mej=C3=ADa?= Date: Wed, 13 May 2026 16:07:57 +0200 Subject: [PATCH 13/27] Add dictionary encoding benchmark for LONG, FLOAT, DOUBLE, and FLBA types Covers encode + decode (scalar and batch) paths for all four type-specific dictionary implementations: Long2IntOpenHashMap, Float2IntOpenHashMap, Double2IntOpenHashMap, and Object2IntOpenHashMap (for FLBA). Two data patterns exercise low-cardinality (100 distinct values, ~100% hit rate) and high-cardinality (all unique, stresses hash map growth). Also adds TestDataFactory generators for long[], float[], double[], and fixed-length Binary[] data with configurable cardinality. 
Characterization results (100K values, JDK 25, Compiler Blackholes): - Batch decode shows +60-67% over scalar for LONG/FLOAT/DOUBLE - HIGH_CARDINALITY encode is 6-7x slower than LOW (hash map pressure) - FLBA encode is 14-108M ops/s (Binary hashing overhead) --- .../DictionaryEncodingBenchmark.java | 326 ++++++++++++++++++ .../parquet/benchmarks/TestDataFactory.java | 151 ++++++++ 2 files changed, 477 insertions(+) create mode 100644 parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/DictionaryEncodingBenchmark.java diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/DictionaryEncodingBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/DictionaryEncodingBenchmark.java new file mode 100644 index 0000000000..327d6beb0f --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/DictionaryEncodingBenchmark.java @@ -0,0 +1,326 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.benchmarks; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.concurrent.TimeUnit; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.bytes.HeapByteBufferAllocator; +import org.apache.parquet.column.Dictionary; +import org.apache.parquet.column.Encoding; +import org.apache.parquet.column.page.DictionaryPage; +import org.apache.parquet.column.values.dictionary.DictionaryValuesReader; +import org.apache.parquet.column.values.dictionary.DictionaryValuesWriter; +import org.apache.parquet.column.values.dictionary.PlainValuesDictionary; +import org.apache.parquet.io.api.Binary; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OperationsPerInvocation; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +/** + * Dictionary encoding/decoding benchmarks for LONG, FLOAT, DOUBLE, and + * FIXED_LEN_BYTE_ARRAY types — complementing the INT32 dictionary coverage + * already in {@link IntEncodingBenchmark}. + * + *

Each type's encode benchmark measures the full dictionary-build path + * (type-specific hash map + id append). Decode benchmarks measure the + * {@link DictionaryValuesReader} lookup path, both per-value and batch. + * + *

The {@code dataPattern} parameter controls cardinality to exercise + * both the dictionary-hits-only path (LOW_CARDINALITY) and the path + * where every value is unique (HIGH_CARDINALITY, which may trigger + * dictionary fallback for large value counts). + */ +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(TimeUnit.SECONDS) +@Fork(1) +@Warmup(iterations = 3, time = 1) +@Measurement(iterations = 5, time = 1) +@State(Scope.Thread) +public class DictionaryEncodingBenchmark { + + static final int VALUE_COUNT = 100_000; + private static final int MAX_DICT_BYTE_SIZE = 4 * 1024 * 1024; + private static final int INIT_SLAB_SIZE = 64 * 1024; + private static final int PAGE_SIZE = 1024 * 1024; + + @Param({"LOW_CARDINALITY", "HIGH_CARDINALITY"}) + public String dataPattern; + + // ---- Data arrays ---- + private long[] longData; + private float[] floatData; + private double[] doubleData; + private Binary[] flbaData; + + // ---- Pre-encoded dictionary pages for decode benchmarks ---- + private byte[] longDictDataEncoded; + private Dictionary longDictionary; + private boolean longDictAvailable; + + private byte[] floatDictDataEncoded; + private Dictionary floatDictionary; + private boolean floatDictAvailable; + + private byte[] doubleDictDataEncoded; + private Dictionary doubleDictionary; + private boolean doubleDictAvailable; + + private byte[] flbaDictDataEncoded; + private Dictionary flbaDictionary; + private boolean flbaDictAvailable; + + // Fixed length for FLBA tests (16 = UUID-sized) + private static final int FLBA_LENGTH = 16; + + @Setup(Level.Trial) + public void setup() throws IOException { + int distinct = "LOW_CARDINALITY".equals(dataPattern) + ? TestDataFactory.LOW_CARDINALITY_DISTINCT + : 0; // 0 = all unique for HIGH_CARDINALITY + + long seed = TestDataFactory.DEFAULT_SEED; + + // Generate data + if (distinct > 0) { + longData = TestDataFactory.generateLowCardinalityLongs(VALUE_COUNT, distinct, seed); + floatData = TestDataFactory.generateLowCardinalityFloats(VALUE_COUNT, distinct, seed); + doubleData = TestDataFactory.generateLowCardinalityDoubles(VALUE_COUNT, distinct, seed); + flbaData = TestDataFactory.generateFixedLenByteArrays(VALUE_COUNT, FLBA_LENGTH, distinct, seed); + } else { + longData = TestDataFactory.generateRandomLongs(VALUE_COUNT, seed); + floatData = TestDataFactory.generateRandomFloats(VALUE_COUNT, seed); + doubleData = TestDataFactory.generateRandomDoubles(VALUE_COUNT, seed); + flbaData = TestDataFactory.generateFixedLenByteArrays(VALUE_COUNT, FLBA_LENGTH, 0, seed); + } + + // Pre-encode for decode benchmarks + setupLongDict(); + setupFloatDict(); + setupDoubleDict(); + setupFlbaDict(); + } + + private void setupLongDict() throws IOException { + DictionaryValuesWriter.PlainLongDictionaryValuesWriter w = new DictionaryValuesWriter.PlainLongDictionaryValuesWriter( + MAX_DICT_BYTE_SIZE, Encoding.PLAIN_DICTIONARY, Encoding.PLAIN, new HeapByteBufferAllocator()); + for (long v : longData) { + w.writeLong(v); + } + BenchmarkEncodingUtils.EncodedDictionary enc = BenchmarkEncodingUtils.drainDictionary(w); + longDictDataEncoded = enc.dictData; + longDictAvailable = !enc.fellBackToPlain(); + if (longDictAvailable) { + longDictionary = new PlainValuesDictionary.PlainLongDictionary(enc.dictPage); + } + } + + private void setupFloatDict() throws IOException { + DictionaryValuesWriter.PlainFloatDictionaryValuesWriter w = new DictionaryValuesWriter.PlainFloatDictionaryValuesWriter( + MAX_DICT_BYTE_SIZE, Encoding.PLAIN_DICTIONARY, Encoding.PLAIN, new HeapByteBufferAllocator()); + for 
(float v : floatData) { + w.writeFloat(v); + } + BenchmarkEncodingUtils.EncodedDictionary enc = BenchmarkEncodingUtils.drainDictionary(w); + floatDictDataEncoded = enc.dictData; + floatDictAvailable = !enc.fellBackToPlain(); + if (floatDictAvailable) { + floatDictionary = new PlainValuesDictionary.PlainFloatDictionary(enc.dictPage); + } + } + + private void setupDoubleDict() throws IOException { + DictionaryValuesWriter.PlainDoubleDictionaryValuesWriter w = new DictionaryValuesWriter.PlainDoubleDictionaryValuesWriter( + MAX_DICT_BYTE_SIZE, Encoding.PLAIN_DICTIONARY, Encoding.PLAIN, new HeapByteBufferAllocator()); + for (double v : doubleData) { + w.writeDouble(v); + } + BenchmarkEncodingUtils.EncodedDictionary enc = BenchmarkEncodingUtils.drainDictionary(w); + doubleDictDataEncoded = enc.dictData; + doubleDictAvailable = !enc.fellBackToPlain(); + if (doubleDictAvailable) { + doubleDictionary = new PlainValuesDictionary.PlainDoubleDictionary(enc.dictPage); + } + } + + private void setupFlbaDict() throws IOException { + DictionaryValuesWriter.PlainFixedLenArrayDictionaryValuesWriter w = + new DictionaryValuesWriter.PlainFixedLenArrayDictionaryValuesWriter( + MAX_DICT_BYTE_SIZE, FLBA_LENGTH, Encoding.PLAIN_DICTIONARY, Encoding.PLAIN, + new HeapByteBufferAllocator()); + for (Binary v : flbaData) { + w.writeBytes(v); + } + BenchmarkEncodingUtils.EncodedDictionary enc = BenchmarkEncodingUtils.drainDictionary(w); + flbaDictDataEncoded = enc.dictData; + flbaDictAvailable = !enc.fellBackToPlain(); + if (flbaDictAvailable) { + flbaDictionary = new PlainValuesDictionary.PlainBinaryDictionary(enc.dictPage, FLBA_LENGTH); + } + } + + // ==== ENCODE BENCHMARKS ==== + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void encodeLong(Blackhole bh) throws IOException { + DictionaryValuesWriter.PlainLongDictionaryValuesWriter w = new DictionaryValuesWriter.PlainLongDictionaryValuesWriter( + MAX_DICT_BYTE_SIZE, Encoding.PLAIN_DICTIONARY, Encoding.PLAIN, new HeapByteBufferAllocator()); + for (long v : longData) { + w.writeLong(v); + } + BenchmarkEncodingUtils.EncodedDictionary enc = BenchmarkEncodingUtils.drainDictionary(w); + bh.consume(enc.dictData); + bh.consume(enc.dictPage); + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void encodeFloat(Blackhole bh) throws IOException { + DictionaryValuesWriter.PlainFloatDictionaryValuesWriter w = new DictionaryValuesWriter.PlainFloatDictionaryValuesWriter( + MAX_DICT_BYTE_SIZE, Encoding.PLAIN_DICTIONARY, Encoding.PLAIN, new HeapByteBufferAllocator()); + for (float v : floatData) { + w.writeFloat(v); + } + BenchmarkEncodingUtils.EncodedDictionary enc = BenchmarkEncodingUtils.drainDictionary(w); + bh.consume(enc.dictData); + bh.consume(enc.dictPage); + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void encodeDouble(Blackhole bh) throws IOException { + DictionaryValuesWriter.PlainDoubleDictionaryValuesWriter w = new DictionaryValuesWriter.PlainDoubleDictionaryValuesWriter( + MAX_DICT_BYTE_SIZE, Encoding.PLAIN_DICTIONARY, Encoding.PLAIN, new HeapByteBufferAllocator()); + for (double v : doubleData) { + w.writeDouble(v); + } + BenchmarkEncodingUtils.EncodedDictionary enc = BenchmarkEncodingUtils.drainDictionary(w); + bh.consume(enc.dictData); + bh.consume(enc.dictPage); + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void encodeFlba(Blackhole bh) throws IOException { + DictionaryValuesWriter.PlainFixedLenArrayDictionaryValuesWriter w = + new 
DictionaryValuesWriter.PlainFixedLenArrayDictionaryValuesWriter( + MAX_DICT_BYTE_SIZE, FLBA_LENGTH, Encoding.PLAIN_DICTIONARY, Encoding.PLAIN, + new HeapByteBufferAllocator()); + for (Binary v : flbaData) { + w.writeBytes(v); + } + BenchmarkEncodingUtils.EncodedDictionary enc = BenchmarkEncodingUtils.drainDictionary(w); + bh.consume(enc.dictData); + bh.consume(enc.dictPage); + } + + // ==== DECODE BENCHMARKS (per-value) ==== + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeLong(Blackhole bh) throws IOException { + if (!longDictAvailable) return; + DictionaryValuesReader r = new DictionaryValuesReader(longDictionary); + r.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(longDictDataEncoded))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(r.readLong()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeFloat(Blackhole bh) throws IOException { + if (!floatDictAvailable) return; + DictionaryValuesReader r = new DictionaryValuesReader(floatDictionary); + r.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(floatDictDataEncoded))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(r.readFloat()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeDouble(Blackhole bh) throws IOException { + if (!doubleDictAvailable) return; + DictionaryValuesReader r = new DictionaryValuesReader(doubleDictionary); + r.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(doubleDictDataEncoded))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(r.readDouble()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeFlba(Blackhole bh) throws IOException { + if (!flbaDictAvailable) return; + DictionaryValuesReader r = new DictionaryValuesReader(flbaDictionary); + r.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(flbaDictDataEncoded))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(r.readBytes()); + } + } + + // ==== DECODE BENCHMARKS (batch) ==== + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public long[] decodeLongBatch() throws IOException { + if (!longDictAvailable) return new long[0]; + DictionaryValuesReader r = new DictionaryValuesReader(longDictionary); + r.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(longDictDataEncoded))); + long[] dest = new long[VALUE_COUNT]; + r.readLongs(dest, 0, VALUE_COUNT); + return dest; + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public float[] decodeFloatBatch() throws IOException { + if (!floatDictAvailable) return new float[0]; + DictionaryValuesReader r = new DictionaryValuesReader(floatDictionary); + r.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(floatDictDataEncoded))); + float[] dest = new float[VALUE_COUNT]; + r.readFloats(dest, 0, VALUE_COUNT); + return dest; + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public double[] decodeDoubleBatch() throws IOException { + if (!doubleDictAvailable) return new double[0]; + DictionaryValuesReader r = new DictionaryValuesReader(doubleDictionary); + r.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(doubleDictDataEncoded))); + double[] dest = new double[VALUE_COUNT]; + r.readDoubles(dest, 0, VALUE_COUNT); + return dest; + } +} diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/TestDataFactory.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/TestDataFactory.java 
index 13c5175d66..93cc714730 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/TestDataFactory.java +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/TestDataFactory.java @@ -175,6 +175,157 @@ public static int[] generateHighCardinalityInts(int count, Random random) { return data; } + // ---- Long data generation for encoding benchmarks ---- + + /** + * Generates sequential longs: 0, 1, 2, ... + */ + public static long[] generateSequentialLongs(int count) { + long[] data = new long[count]; + for (int i = 0; i < count; i++) { + data[i] = i; + } + return data; + } + + /** + * Generates uniformly random longs using the given seed. + */ + public static long[] generateRandomLongs(int count, long seed) { + Random random = new Random(seed); + long[] data = new long[count]; + for (int i = 0; i < count; i++) { + data[i] = random.nextLong(); + } + return data; + } + + /** + * Generates low-cardinality longs (values drawn from a small set). + */ + public static long[] generateLowCardinalityLongs(int count, int distinctValues, long seed) { + Random random = new Random(seed); + long[] palette = new long[distinctValues]; + for (int i = 0; i < distinctValues; i++) { + palette[i] = random.nextLong(); + } + long[] data = new long[count]; + for (int i = 0; i < count; i++) { + data[i] = palette[random.nextInt(distinctValues)]; + } + return data; + } + + /** + * Generates high-cardinality longs (all unique, shuffled). + */ + public static long[] generateHighCardinalityLongs(int count, long seed) { + Random random = new Random(seed); + long[] data = generateSequentialLongs(count); + for (int i = count - 1; i > 0; i--) { + int swapIndex = random.nextInt(i + 1); + long tmp = data[i]; + data[i] = data[swapIndex]; + data[swapIndex] = tmp; + } + return data; + } + + // ---- Float data generation for encoding benchmarks ---- + + /** + * Generates uniformly random floats using the given seed. + */ + public static float[] generateRandomFloats(int count, long seed) { + Random random = new Random(seed); + float[] data = new float[count]; + for (int i = 0; i < count; i++) { + data[i] = random.nextFloat() * 1000.0f; + } + return data; + } + + /** + * Generates low-cardinality floats (values drawn from a small set). + */ + public static float[] generateLowCardinalityFloats(int count, int distinctValues, long seed) { + Random random = new Random(seed); + float[] palette = new float[distinctValues]; + for (int i = 0; i < distinctValues; i++) { + palette[i] = random.nextFloat() * 1000.0f; + } + float[] data = new float[count]; + for (int i = 0; i < count; i++) { + data[i] = palette[random.nextInt(distinctValues)]; + } + return data; + } + + // ---- Double data generation for encoding benchmarks ---- + + /** + * Generates uniformly random doubles using the given seed. + */ + public static double[] generateRandomDoubles(int count, long seed) { + Random random = new Random(seed); + double[] data = new double[count]; + for (int i = 0; i < count; i++) { + data[i] = random.nextDouble() * 1000.0; + } + return data; + } + + /** + * Generates low-cardinality doubles (values drawn from a small set). 
+ */ + public static double[] generateLowCardinalityDoubles(int count, int distinctValues, long seed) { + Random random = new Random(seed); + double[] palette = new double[distinctValues]; + for (int i = 0; i < distinctValues; i++) { + palette[i] = random.nextDouble() * 1000.0; + } + double[] data = new double[count]; + for (int i = 0; i < count; i++) { + data[i] = palette[random.nextInt(distinctValues)]; + } + return data; + } + + // ---- Fixed-length byte array data generation for encoding benchmarks ---- + + /** + * Generates fixed-length byte arrays with the specified cardinality. + * + * @param count number of values + * @param length byte length of each value + * @param distinct number of distinct values (0 means all unique) + * @param seed RNG seed + */ + public static Binary[] generateFixedLenByteArrays(int count, int length, int distinct, long seed) { + Random random = new Random(seed); + if (distinct > 0) { + Binary[] palette = new Binary[distinct]; + for (int i = 0; i < distinct; i++) { + byte[] bytes = new byte[length]; + random.nextBytes(bytes); + palette[i] = Binary.fromConstantByteArray(bytes); + } + Binary[] data = new Binary[count]; + for (int i = 0; i < count; i++) { + data[i] = palette[random.nextInt(distinct)]; + } + return data; + } else { + Binary[] data = new Binary[count]; + for (int i = 0; i < count; i++) { + byte[] bytes = new byte[length]; + random.nextBytes(bytes); + data[i] = Binary.fromConstantByteArray(bytes); + } + return data; + } + } + // ---- Binary data generation for encoding benchmarks ---- /** From 598f4707b363b10140a9cffe302585bcf1a76bbb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Mej=C3=ADa?= Date: Wed, 13 May 2026 16:11:45 +0200 Subject: [PATCH 14/27] Expand FLBA benchmark: add DELTA_BYTE_ARRAY, BSS, DICTIONARY encode+decode Rewrites FixedLenByteArrayEncodingBenchmark from a single encodePlain() method to full coverage of all four FLBA-supported encodings (PLAIN, DELTA_BYTE_ARRAY, BYTE_STREAM_SPLIT, DICTIONARY) with both encode and decode benchmarks. Adds parameterized fixedLength (2=FLOAT16, 12=INT96, 16=UUID) and dataPattern (RANDOM, LOW_CARDINALITY) axes. 
Characterization results (100K values, JDK 25, fixedLength=16): - Dictionary decode: 543M ops/s (fastest, avoids 16B copy per value) - PLAIN decode: 184M ops/s (slice + Binary wrapping) - BSS/Delta decode: ~87M ops/s (byte scatter/prefix overhead) - BSS excels at fixedLength=2: 368M ops/s (trivial 2-stream transpose) --- .../FixedLenByteArrayEncodingBenchmark.java | 177 ++++++++++++++++-- 1 file changed, 163 insertions(+), 14 deletions(-) diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FixedLenByteArrayEncodingBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FixedLenByteArrayEncodingBenchmark.java index 7bf9359c92..8c87e9e81b 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FixedLenByteArrayEncodingBenchmark.java +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FixedLenByteArrayEncodingBenchmark.java @@ -19,10 +19,23 @@ package org.apache.parquet.benchmarks; import java.io.IOException; -import java.util.Random; +import java.nio.ByteBuffer; import java.util.concurrent.TimeUnit; +import org.apache.parquet.bytes.ByteBufferInputStream; import org.apache.parquet.bytes.HeapByteBufferAllocator; +import org.apache.parquet.column.Dictionary; +import org.apache.parquet.column.Encoding; +import org.apache.parquet.column.page.DictionaryPage; +import org.apache.parquet.column.values.ValuesReader; import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.column.values.bytestreamsplit.ByteStreamSplitValuesReaderForFLBA; +import org.apache.parquet.column.values.bytestreamsplit.ByteStreamSplitValuesWriter; +import org.apache.parquet.column.values.deltastrings.DeltaByteArrayReader; +import org.apache.parquet.column.values.deltastrings.DeltaByteArrayWriter; +import org.apache.parquet.column.values.dictionary.DictionaryValuesReader; +import org.apache.parquet.column.values.dictionary.DictionaryValuesWriter; +import org.apache.parquet.column.values.dictionary.PlainValuesDictionary; +import org.apache.parquet.column.values.plain.FixedLenByteArrayPlainValuesReader; import org.apache.parquet.column.values.plain.FixedLenByteArrayPlainValuesWriter; import org.apache.parquet.io.api.Binary; import org.openjdk.jmh.annotations.Benchmark; @@ -38,14 +51,27 @@ import org.openjdk.jmh.annotations.Setup; import org.openjdk.jmh.annotations.State; import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; /** - * Encoding-level micro-benchmark for {@link FixedLenByteArrayPlainValuesWriter}. - * Each input value has a fixed length matching the writer's configured length, so - * no length prefix is emitted -- the writer simply concatenates the raw bytes. + * Encoding-level micro-benchmarks for FIXED_LEN_BYTE_ARRAY (FLBA) values across + * all supported encodings: PLAIN, DELTA_BYTE_ARRAY, BYTE_STREAM_SPLIT, and DICTIONARY. * - *

Each benchmark invocation processes {@value #VALUE_COUNT} values; throughput - * is reported per-value via {@link OperationsPerInvocation}. + *

Each benchmark invocation processes {@value #VALUE_COUNT} values; throughput is + * reported per-value via {@link OperationsPerInvocation}. + * + *

<p>The {@code fixedLength} parameter exercises key FLBA sizes:
 + * <ul>
 + *   <li>2 = FLOAT16</li>
 + *   <li>12 = INT96 (legacy timestamps)</li>
 + *   <li>16 = UUID</li>
 + * </ul>
 + *
 + * <p>The {@code dataPattern} parameter controls cardinality:
 + * <ul>
 + *   <li>RANDOM = all unique values</li>
 + *   <li>LOW_CARDINALITY = 100 distinct values (favors dictionary and delta)</li>
 + * </ul>
 + *
*/ @BenchmarkMode(Mode.Throughput) @OutputTimeUnit(TimeUnit.SECONDS) @@ -58,17 +84,37 @@ public class FixedLenByteArrayEncodingBenchmark { static final int VALUE_COUNT = 100_000; private static final int INIT_SLAB_SIZE = 64 * 1024; private static final int PAGE_SIZE = 4 * 1024 * 1024; + private static final int MAX_DICT_BYTE_SIZE = 4 * 1024 * 1024; - @Param({"10", "100", "1000"}) + @Param({"2", "12", "16"}) public int fixedLength; + @Param({"RANDOM", "LOW_CARDINALITY"}) + public String dataPattern; + private Binary[] data; + // Pre-encoded pages for decode benchmarks + private byte[] plainEncoded; + private byte[] deltaEncoded; + private byte[] bssEncoded; + private byte[] dictDataEncoded; + private Dictionary flbaDictionary; + private boolean dictAvailable; + @Setup(Level.Trial) - public void setup() { - Random random = new Random(42); - // distinct=0 -> all unique values; each is exactly fixedLength bytes long. - data = TestDataFactory.generateBinaryData(VALUE_COUNT, fixedLength, 0, random); + public void setup() throws IOException { + int distinct = "LOW_CARDINALITY".equals(dataPattern) + ? TestDataFactory.LOW_CARDINALITY_DISTINCT + : 0; + data = TestDataFactory.generateFixedLenByteArrays( + VALUE_COUNT, fixedLength, distinct, TestDataFactory.DEFAULT_SEED); + + // Pre-encode for decode benchmarks + plainEncoded = encodeWith(newPlainWriter()); + deltaEncoded = encodeWith(newDeltaWriter()); + bssEncoded = encodeWith(newBssWriter()); + setupDict(); } private byte[] encodeWith(ValuesWriter writer) throws IOException { @@ -80,10 +126,113 @@ private byte[] encodeWith(ValuesWriter writer) throws IOException { return bytes; } + private void setupDict() throws IOException { + DictionaryValuesWriter.PlainFixedLenArrayDictionaryValuesWriter w = + new DictionaryValuesWriter.PlainFixedLenArrayDictionaryValuesWriter( + MAX_DICT_BYTE_SIZE, fixedLength, Encoding.PLAIN_DICTIONARY, Encoding.PLAIN, + new HeapByteBufferAllocator()); + for (Binary v : data) { + w.writeBytes(v); + } + BenchmarkEncodingUtils.EncodedDictionary enc = BenchmarkEncodingUtils.drainDictionary(w); + dictDataEncoded = enc.dictData; + dictAvailable = !enc.fellBackToPlain(); + if (dictAvailable) { + flbaDictionary = new PlainValuesDictionary.PlainBinaryDictionary(enc.dictPage, fixedLength); + } + } + + // ---- Writer factories ---- + + private FixedLenByteArrayPlainValuesWriter newPlainWriter() { + return new FixedLenByteArrayPlainValuesWriter( + fixedLength, INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + } + + private DeltaByteArrayWriter newDeltaWriter() { + return new DeltaByteArrayWriter(INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + } + + private ByteStreamSplitValuesWriter.FixedLenByteArrayByteStreamSplitValuesWriter newBssWriter() { + return new ByteStreamSplitValuesWriter.FixedLenByteArrayByteStreamSplitValuesWriter( + fixedLength, INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + } + + // ==== ENCODE BENCHMARKS ==== + @Benchmark @OperationsPerInvocation(VALUE_COUNT) - public byte[] encodeFixedLenPlain() throws IOException { - return encodeWith(new FixedLenByteArrayPlainValuesWriter( - fixedLength, INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator())); + public byte[] encodePlain() throws IOException { + return encodeWith(newPlainWriter()); + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeDelta() throws IOException { + return encodeWith(newDeltaWriter()); + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeBss() throws 
IOException { + return encodeWith(newBssWriter()); + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void encodeDictionary(Blackhole bh) throws IOException { + DictionaryValuesWriter.PlainFixedLenArrayDictionaryValuesWriter w = + new DictionaryValuesWriter.PlainFixedLenArrayDictionaryValuesWriter( + MAX_DICT_BYTE_SIZE, fixedLength, Encoding.PLAIN_DICTIONARY, Encoding.PLAIN, + new HeapByteBufferAllocator()); + for (Binary v : data) { + w.writeBytes(v); + } + BenchmarkEncodingUtils.EncodedDictionary enc = BenchmarkEncodingUtils.drainDictionary(w); + bh.consume(enc.dictData); + bh.consume(enc.dictPage); + } + + // ==== DECODE BENCHMARKS ==== + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodePlain(Blackhole bh) throws IOException { + FixedLenByteArrayPlainValuesReader reader = new FixedLenByteArrayPlainValuesReader(fixedLength); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(plainEncoded))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(reader.readBytes()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeDelta(Blackhole bh) throws IOException { + DeltaByteArrayReader reader = new DeltaByteArrayReader(); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(deltaEncoded))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(reader.readBytes()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeBss(Blackhole bh) throws IOException { + ByteStreamSplitValuesReaderForFLBA reader = new ByteStreamSplitValuesReaderForFLBA(fixedLength); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(bssEncoded))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(reader.readBytes()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeDictionary(Blackhole bh) throws IOException { + if (!dictAvailable) return; + DictionaryValuesReader reader = new DictionaryValuesReader(flbaDictionary); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(dictDataEncoded))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(reader.readBytes()); + } } } From dbb356265df9979781c9d1b0fbb8b61e5b5e5d6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Mej=C3=ADa?= Date: Wed, 13 May 2026 16:14:02 +0200 Subject: [PATCH 15/27] Add LZ4_RAW to file-level benchmark @Param codec lists LZ4_RAW was optimized (+47-77% decompress throughput) and has a micro-benchmark in CompressionBenchmark, but was missing from the end-to-end file read/write benchmarks. Adding it enables direct comparison with SNAPPY, ZSTD, and GZIP at the full-file level. 
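To pin this new axis when reproducing, JMH's parameter filter can select a single codec from the shaded jar (the jar path below is illustrative and depends on the local build):

  java -jar parquet-benchmarks/target/parquet-benchmarks.jar FileReadBenchmark -p codec=LZ4_RAW
  java -jar parquet-benchmarks/target/parquet-benchmarks.jar FileWriteBenchmark -p codec=LZ4_RAW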
--- .../java/org/apache/parquet/benchmarks/FileReadBenchmark.java | 2 +- .../java/org/apache/parquet/benchmarks/FileWriteBenchmark.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileReadBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileReadBenchmark.java index de133f4607..a2da10eb38 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileReadBenchmark.java +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileReadBenchmark.java @@ -71,7 +71,7 @@ @State(Scope.Benchmark) public class FileReadBenchmark { - @Param({"UNCOMPRESSED", "SNAPPY", "ZSTD", "GZIP"}) + @Param({"UNCOMPRESSED", "SNAPPY", "ZSTD", "GZIP", "LZ4_RAW"}) public String codec; @Param({"PARQUET_1_0", "PARQUET_2_0"}) diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileWriteBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileWriteBenchmark.java index 6716010cc3..4fa5bf238a 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileWriteBenchmark.java +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileWriteBenchmark.java @@ -62,7 +62,7 @@ @State(Scope.Benchmark) public class FileWriteBenchmark { - @Param({"UNCOMPRESSED", "SNAPPY", "ZSTD", "GZIP"}) + @Param({"UNCOMPRESSED", "SNAPPY", "ZSTD", "GZIP", "LZ4_RAW"}) public String codec; @Param({"PARQUET_1_0", "PARQUET_2_0"}) From 67ef35bbf177dbe8a74a09df6736ce739a27a076 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Mej=C3=ADa?= Date: Wed, 13 May 2026 17:38:11 +0200 Subject: [PATCH 16/27] Add batch encode benchmarks for BOOLEAN writeBooleans() API Add encodePlainV1Batch and encodeRleV2Batch benchmarks that exercise the new writeBooleans() batch encoding path, complementing the existing scalar encode benchmarks. 
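For context on what the batch path amortizes: Parquet PLAIN stores booleans bit-packed, eight values per byte, LSB-first. A minimal standalone sketch of that packing, for illustration only (the class and method names are hypothetical, not the patched code):

class BitPackSketch {
  // Packs eight booleans into one byte, LSB-first, as Parquet PLAIN stores them.
  static byte packEight(boolean[] values, int offset) {
    byte packed = 0;
    for (int bit = 0; bit < 8; bit++) {
      if (values[offset + bit]) {
        packed |= (byte) (1 << bit); // the value at offset + bit lands in bit position 'bit'
      }
    }
    return packed;
  }
}

A batch writeBooleans() implementation can emit whole bytes this way, one byte store per eight values, instead of paying one virtual call and one bit update per value.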
--- .../benchmarks/BooleanEncodingBenchmark.java | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BooleanEncodingBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BooleanEncodingBenchmark.java index af31242f9f..7c3452652d 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BooleanEncodingBenchmark.java +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BooleanEncodingBenchmark.java @@ -157,6 +157,27 @@ public byte[] encodeRleV2() throws IOException { return encodeV2(data); } + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodePlainV1Batch() throws IOException { + ValuesWriter w = new BooleanPlainValuesWriter(); + w.writeBooleans(data, 0, data.length); + byte[] bytes = w.getBytes().toByteArray(); + w.close(); + return bytes; + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeRleV2Batch() throws IOException { + ValuesWriter w = new RunLengthBitPackingHybridValuesWriter( + 1, INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + w.writeBooleans(data, 0, data.length); + byte[] bytes = w.getBytes().toByteArray(); + w.close(); + return bytes; + } + // ---- Decode benchmarks ---- @Benchmark From cb0ba5b293c72fe3072399641c33de1716c0759b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Mej=C3=ADa?= Date: Wed, 13 May 2026 18:13:29 +0200 Subject: [PATCH 17/27] Add batch encode benchmarks for PLAIN numeric encoding (INT32/INT64/FLOAT/DOUBLE) New PlainEncodingBenchmark class with scalar vs batch comparison for all four numeric types. Also adds encodePlainBatch to IntEncodingBenchmark for consistency. --- .../benchmarks/IntEncodingBenchmark.java | 10 + .../benchmarks/PlainEncodingBenchmark.java | 179 ++++++++++++++++++ 2 files changed, 189 insertions(+) create mode 100644 parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/PlainEncodingBenchmark.java diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/IntEncodingBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/IntEncodingBenchmark.java index 7665a7462a..71ff35f674 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/IntEncodingBenchmark.java +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/IntEncodingBenchmark.java @@ -179,6 +179,16 @@ public byte[] encodePlain() throws IOException { return encodeWith(newPlainWriter()); } + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodePlainBatch() throws IOException { + PlainValuesWriter writer = newPlainWriter(); + writer.writeIntegers(data, 0, data.length); + byte[] bytes = writer.getBytes().toByteArray(); + writer.close(); + return bytes; + } + @Benchmark @OperationsPerInvocation(VALUE_COUNT) public byte[] encodeDelta() throws IOException { diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/PlainEncodingBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/PlainEncodingBenchmark.java new file mode 100644 index 0000000000..bd85eb22b2 --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/PlainEncodingBenchmark.java @@ -0,0 +1,179 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.benchmarks; + +import java.io.IOException; +import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.parquet.bytes.HeapByteBufferAllocator; +import org.apache.parquet.column.values.plain.PlainValuesWriter; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OperationsPerInvocation; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; + +/** + * Encoding micro-benchmarks for the PLAIN encoding across the four numeric primitive + * types: {@code INT32}, {@code INT64}, {@code FLOAT}, {@code DOUBLE}. + * + *

Compares per-value scalar writes vs bulk batch writes using + * {@link PlainValuesWriter}'s {@code writeIntegers}, {@code writeLongs}, + * {@code writeFloats}, {@code writeDoubles} methods backed by bulk + * {@code ByteBuffer} view transfers in {@code CapacityByteArrayOutputStream}. + */ +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(TimeUnit.SECONDS) +@Fork(1) +@Warmup(iterations = 3, time = 1) +@Measurement(iterations = 5, time = 1) +@State(Scope.Thread) +public class PlainEncodingBenchmark { + + static final int VALUE_COUNT = 100_000; + private static final int INIT_SLAB_SIZE = 64 * 1024; + private static final int PAGE_SIZE = 1024 * 1024; + + private int[] intData; + private long[] longData; + private float[] floatData; + private double[] doubleData; + + @Setup(Level.Trial) + public void setup() { + Random r = new Random(42); + intData = new int[VALUE_COUNT]; + longData = new long[VALUE_COUNT]; + floatData = new float[VALUE_COUNT]; + doubleData = new double[VALUE_COUNT]; + for (int i = 0; i < VALUE_COUNT; i++) { + intData[i] = r.nextInt(); + longData[i] = r.nextLong(); + floatData[i] = r.nextFloat(); + doubleData[i] = r.nextDouble(); + } + } + + private static PlainValuesWriter newWriter() { + return new PlainValuesWriter(INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator()); + } + + // ---- INT32 ---- + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeInt() throws IOException { + PlainValuesWriter w = newWriter(); + for (int v : intData) { + w.writeInteger(v); + } + byte[] bytes = w.getBytes().toByteArray(); + w.close(); + return bytes; + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeIntBatch() throws IOException { + PlainValuesWriter w = newWriter(); + w.writeIntegers(intData, 0, VALUE_COUNT); + byte[] bytes = w.getBytes().toByteArray(); + w.close(); + return bytes; + } + + // ---- INT64 ---- + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeLong() throws IOException { + PlainValuesWriter w = newWriter(); + for (long v : longData) { + w.writeLong(v); + } + byte[] bytes = w.getBytes().toByteArray(); + w.close(); + return bytes; + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeLongBatch() throws IOException { + PlainValuesWriter w = newWriter(); + w.writeLongs(longData, 0, VALUE_COUNT); + byte[] bytes = w.getBytes().toByteArray(); + w.close(); + return bytes; + } + + // ---- FLOAT ---- + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeFloat() throws IOException { + PlainValuesWriter w = newWriter(); + for (float v : floatData) { + w.writeFloat(v); + } + byte[] bytes = w.getBytes().toByteArray(); + w.close(); + return bytes; + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeFloatBatch() throws IOException { + PlainValuesWriter w = newWriter(); + w.writeFloats(floatData, 0, VALUE_COUNT); + byte[] bytes = w.getBytes().toByteArray(); + w.close(); + return bytes; + } + + // ---- DOUBLE ---- + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeDouble() throws IOException { + PlainValuesWriter w = newWriter(); + for (double v : doubleData) { + w.writeDouble(v); + } + byte[] bytes = w.getBytes().toByteArray(); + w.close(); + return bytes; + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodeDoubleBatch() throws IOException { + PlainValuesWriter w = newWriter(); + w.writeDoubles(doubleData, 0, VALUE_COUNT); + byte[] bytes = w.getBytes().toByteArray(); + w.close(); 
+ return bytes; + } +} From 7f087620439d22306f7db34eb8a040017aafc05c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Mej=C3=ADa?= Date: Wed, 13 May 2026 19:01:21 +0200 Subject: [PATCH 18/27] Add FLBA PLAIN batch encode/decode benchmarks (encodePlainBatch, decodePlainBatch) Exercises the new readBinaries()/writeBinaries() batch APIs for FIXED_LEN_BYTE_ARRAY PLAIN encoding. Results: decode batch +165-245%, encode batch +19-81%. --- .../FixedLenByteArrayEncodingBenchmark.java | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FixedLenByteArrayEncodingBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FixedLenByteArrayEncodingBenchmark.java index 8c87e9e81b..a2f0f9bbc7 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FixedLenByteArrayEncodingBenchmark.java +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FixedLenByteArrayEncodingBenchmark.java @@ -166,6 +166,16 @@ public byte[] encodePlain() throws IOException { return encodeWith(newPlainWriter()); } + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public byte[] encodePlainBatch() throws IOException { + FixedLenByteArrayPlainValuesWriter writer = newPlainWriter(); + writer.writeBinaries(data, 0, data.length); + byte[] bytes = writer.getBytes().toByteArray(); + writer.close(); + return bytes; + } + @Benchmark @OperationsPerInvocation(VALUE_COUNT) public byte[] encodeDelta() throws IOException { @@ -205,6 +215,16 @@ public void decodePlain(Blackhole bh) throws IOException { } } + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodePlainBatch(Blackhole bh) throws IOException { + FixedLenByteArrayPlainValuesReader reader = new FixedLenByteArrayPlainValuesReader(fixedLength); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(plainEncoded))); + Binary[] batch = new Binary[VALUE_COUNT]; + reader.readBinaries(batch, 0, VALUE_COUNT); + bh.consume(batch); + } + @Benchmark @OperationsPerInvocation(VALUE_COUNT) public void decodeDelta(Blackhole bh) throws IOException { From af5652620e811d38944c37f883664c92bb241d7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Mej=C3=ADa?= Date: Wed, 13 May 2026 19:45:14 +0200 Subject: [PATCH 19/27] Add BROTLI to CompressionBenchmark codec parameter list - Add brotli-codec dependency to parquet-benchmarks (profile-gated, x86_64 only) - Include BROTLI in @Param codec list alongside SNAPPY, ZSTD, LZ4_RAW, GZIP - Add jitpack.io repository for brotli-codec resolution --- parquet-benchmarks/pom.xml | 27 +++++++++++++++++++ .../benchmarks/CompressionBenchmark.java | 2 +- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/parquet-benchmarks/pom.xml b/parquet-benchmarks/pom.xml index 8d5a1253bf..8bf1d7f821 100644 --- a/parquet-benchmarks/pom.xml +++ b/parquet-benchmarks/pom.xml @@ -89,6 +89,33 @@ + + + + non-aarch64 + + + !aarch64 + + + + + jitpack.io + https://jitpack.io + Jitpack.io repository + + + + + com.github.rdblue + brotli-codec + ${brotli-codec.version} + runtime + + + + + diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/CompressionBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/CompressionBenchmark.java index 9ff2884222..adb8edd4c8 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/CompressionBenchmark.java +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/CompressionBenchmark.java @@ 
-59,7 +59,7 @@ @State(Scope.Thread) public class CompressionBenchmark { - @Param({"SNAPPY", "ZSTD", "LZ4_RAW", "GZIP"}) + @Param({"SNAPPY", "ZSTD", "LZ4_RAW", "GZIP", "BROTLI"}) public String codec; @Param({"65536", "131072", "262144", "1048576"}) From 5cd1b29c8a1121525958cfbff3aef3c0fb41b950 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Mej=C3=ADa?= Date: Wed, 13 May 2026 19:59:00 +0200 Subject: [PATCH 20/27] Add BROTLI direct bypass in DirectCodecFactory using jbrotli JNI Bypass the Hadoop BrotliCodec/stream wrapper for BROTLI compression and decompression by using org.meteogroup.jbrotli's native JNI bindings directly with ByteBuffer support via reflection (brotli-codec remains runtime scope). This eliminates intermediate buffer copies and the BrotliStreamCompressor state machine overhead. Changes: - DirectCodecFactory: Add BrotliDirectCompressor (quality=1, matching Hadoop default) and BrotliDirectDecompressor using one-shot jbrotli API via reflection - Load native library eagerly with graceful fallback to Hadoop codec path - CompressionBenchmark: Switch from heap CodecFactory to DirectCodecFactory to benchmark the actual production code path Results at 64KB page size: - Compress: 6,746 -> 9,662 ops/s (1.43x speedup) - Decompress: 2,534 -> 2,786 ops/s (1.10x speedup) --- .../benchmarks/CompressionBenchmark.java | 8 +- .../parquet/hadoop/DirectCodecFactory.java | 195 +++++++++++++++++- 2 files changed, 198 insertions(+), 5 deletions(-) diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/CompressionBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/CompressionBenchmark.java index adb8edd4c8..11e9fe6d6a 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/CompressionBenchmark.java +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/CompressionBenchmark.java @@ -23,6 +23,7 @@ import java.util.concurrent.TimeUnit; import org.apache.hadoop.conf.Configuration; import org.apache.parquet.bytes.BytesInput; +import org.apache.parquet.bytes.DirectByteBufferAllocator; import org.apache.parquet.compression.CompressionCodecFactory; import org.apache.parquet.hadoop.CodecFactory; import org.apache.parquet.hadoop.metadata.CompressionCodecName; @@ -45,8 +46,9 @@ * *

Measures the performance of {@link CompressionCodecFactory.BytesInputCompressor} * and {@link CompressionCodecFactory.BytesInputDecompressor} for each supported codec, - * using the heap-based {@link CodecFactory} path. Input data is generated to approximate - * realistic Parquet page content (a mix of sequential, repeated, and random byte patterns). + * using the direct-memory {@link CodecFactory} path (same as actual Parquet file I/O). + * Input data is generated to approximate realistic Parquet page content (a mix of + * sequential, repeated, and random byte patterns). * *

This benchmark isolates the codec hot path from file I/O, encoding, and other * Parquet overhead, making it ideal for measuring compression-specific optimizations. @@ -79,7 +81,7 @@ public void setup() throws IOException { decompressedSize = uncompressedData.length; Configuration conf = new Configuration(); - factory = new CodecFactory(conf, pageSize); + factory = CodecFactory.createDirectCodecFactory(conf, DirectByteBufferAllocator.getInstance(), pageSize); CompressionCodecName codecName = CompressionCodecName.valueOf(codec); compressor = factory.getCompressor(codecName); diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/DirectCodecFactory.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/DirectCodecFactory.java index b2b5233eeb..c96c071e07 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/DirectCodecFactory.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/DirectCodecFactory.java @@ -21,6 +21,7 @@ import com.github.luben.zstd.ZstdCompressCtx; import com.github.luben.zstd.ZstdDecompressCtx; import java.io.IOException; +import java.lang.reflect.Constructor; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.nio.ByteBuffer; @@ -61,6 +62,14 @@ class DirectCodecFactory extends CodecFactory implements AutoCloseable { private static final Method DECOMPRESS_METHOD; private static final Method CREATE_DIRECT_DECOMPRESSOR_METHOD; + // Brotli JNI bypass via reflection (brotli-codec is a runtime-only dependency) + private static final boolean BROTLI_NATIVE_AVAILABLE; + private static final Method BROTLI_DECOMPRESS_METHOD; // BrotliDeCompressor.deCompress(ByteBuffer, ByteBuffer) + private static final Method BROTLI_COMPRESS_METHOD; // BrotliCompressor.compress(Parameter, ByteBuffer, ByteBuffer) + private static final Constructor BROTLI_DECOMPRESSOR_CTOR; // BrotliDeCompressor() + private static final Constructor BROTLI_COMPRESSOR_CTOR; // BrotliCompressor() + private static final Object BROTLI_COMPRESS_PARAMETER; // Brotli.Parameter instance (quality=1) + static { Class tempClass = null; Method tempCreateMethod = null; @@ -76,6 +85,46 @@ class DirectCodecFactory extends CodecFactory implements AutoCloseable { DIRECT_DECOMPRESSION_CODEC_CLASS = tempClass; CREATE_DIRECT_DECOMPRESSOR_METHOD = tempCreateMethod; DECOMPRESS_METHOD = tempDecompressMethod; + + // Initialize Brotli JNI bypass via reflection + boolean brotliLoaded = false; + Method brotliDecompress = null; + Method brotliCompress = null; + Constructor brotliDecompressorCtor = null; + Constructor brotliCompressorCtor = null; + Object brotliParam = null; + try { + // Load native library + Class loaderClass = Class.forName("org.meteogroup.jbrotli.libloader.BrotliLibraryLoader"); + loaderClass.getMethod("loadBrotli").invoke(null); + + // BrotliDeCompressor: no-arg ctor + deCompress(ByteBuffer, ByteBuffer) -> int + Class decompClass = Class.forName("org.meteogroup.jbrotli.BrotliDeCompressor"); + brotliDecompressorCtor = decompClass.getConstructor(); + brotliDecompress = decompClass.getMethod("deCompress", ByteBuffer.class, ByteBuffer.class); + + // BrotliCompressor: no-arg ctor + compress(Parameter, ByteBuffer, ByteBuffer) -> int + Class compClass = Class.forName("org.meteogroup.jbrotli.BrotliCompressor"); + Class paramClass = Class.forName("org.meteogroup.jbrotli.Brotli$Parameter"); + Class modeClass = Class.forName("org.meteogroup.jbrotli.Brotli$Mode"); + brotliCompressorCtor = compClass.getConstructor(); + brotliCompress = 
compClass.getMethod("compress", paramClass, ByteBuffer.class, ByteBuffer.class); + + // Create Parameter(Mode.GENERIC, quality=1, lgwin=22, lgblock=0) + Object genericMode = modeClass.getField("GENERIC").get(null); + Constructor paramCtor = paramClass.getConstructor(modeClass, int.class, int.class, int.class); + brotliParam = paramCtor.newInstance(genericMode, 1, 22, 0); + + brotliLoaded = true; + } catch (Throwable t) { + LOG.debug("Brotli native library not available, falling back to Hadoop codec", t); + } + BROTLI_NATIVE_AVAILABLE = brotliLoaded; + BROTLI_DECOMPRESS_METHOD = brotliDecompress; + BROTLI_COMPRESS_METHOD = brotliCompress; + BROTLI_DECOMPRESSOR_CTOR = brotliDecompressorCtor; + BROTLI_COMPRESSOR_CTOR = brotliCompressorCtor; + BROTLI_COMPRESS_PARAMETER = brotliParam; } /** @@ -103,8 +152,13 @@ protected BytesCompressor createCompressor(final CompressionCodecName codecName) return new SnappyCompressor(); case ZSTD: return new ZstdCompressor(); - // todo: create class similar to the SnappyCompressor for zlib and exclude it as - // snappy is above since it also generates allocateDirect calls. + case LZ4_RAW: + return new Lz4RawCompressor(); + case BROTLI: + if (BROTLI_NATIVE_AVAILABLE) { + return new BrotliDirectCompressor(); + } + return super.createCompressor(codecName); default: return super.createCompressor(codecName); } @@ -117,6 +171,16 @@ protected BytesDecompressor createDecompressor(final CompressionCodecName codecN return new SnappyDecompressor(); case ZSTD: return new ZstdDecompressor(); + case LZ4_RAW: + return new Lz4RawDecompressor(); + case BROTLI: + if (BROTLI_NATIVE_AVAILABLE) { + return new BrotliDirectDecompressor(); + } + // fall through to default Hadoop codec path + case GZIP: + case UNCOMPRESSED: + return super.createDecompressor(codecName); default: CompressionCodec codec = getCodec(codecName); if (codec == null) { @@ -437,6 +501,133 @@ void closeCompressor() { } } + /** + * Direct-memory LZ4_RAW decompressor using airlift's LZ4 decompressor with + * direct ByteBuffers, avoiding reflection-based {@link FullDirectDecompressor}. + */ + private class Lz4RawDecompressor extends BaseDecompressor { + private final io.airlift.compress.lz4.Lz4Decompressor decompressor = + new io.airlift.compress.lz4.Lz4Decompressor(); + + @Override + int decompress(ByteBuffer input, ByteBuffer output) { + decompressor.decompress(input, output); + return output.position(); + } + + @Override + void closeDecompressor() { + // no-op + } + } + + /** + * Direct-memory LZ4_RAW compressor using airlift's LZ4 compressor with + * direct ByteBuffers, avoiding the stream-based heap path. + */ + private class Lz4RawCompressor extends BaseCompressor { + private final io.airlift.compress.lz4.Lz4Compressor compressor = new io.airlift.compress.lz4.Lz4Compressor(); + + @Override + public CompressionCodecName getCodecName() { + return CompressionCodecName.LZ4_RAW; + } + + @Override + int maxCompressedSize(int size) { + return compressor.maxCompressedLength(size); + } + + @Override + int compress(ByteBuffer input, ByteBuffer output) { + compressor.compress(input, output); + return output.position(); + } + + @Override + void closeCompressor() { + // no-op + } + } + + /** + * Direct-memory Brotli decompressor using jbrotli's native JNI bindings via reflection, + * bypassing the Hadoop BrotliCodec/stream wrapper overhead.
+ */ + private class BrotliDirectDecompressor extends BaseDecompressor { + private final Object decompressor; + + BrotliDirectDecompressor() { + try { + this.decompressor = BROTLI_DECOMPRESSOR_CTOR.newInstance(); + } catch (ReflectiveOperationException e) { + throw new DirectCodecPool.ParquetCompressionCodecException("Failed to create Brotli decompressor", e); + } + } + + @Override + int decompress(ByteBuffer input, ByteBuffer output) throws IOException { + try { + return (int) BROTLI_DECOMPRESS_METHOD.invoke(decompressor, input, output); + } catch (InvocationTargetException e) { + throw new IOException("Brotli decompression failed", e.getCause()); + } catch (IllegalAccessException e) { + throw new IOException("Brotli decompression failed", e); + } + } + + @Override + void closeDecompressor() { + // no-op: BrotliDeCompressor has no resources to release + } + } + + /** + * Direct-memory Brotli compressor using jbrotli's native JNI bindings via reflection, + * bypassing the Hadoop BrotliCodec/stream wrapper overhead. + * Uses quality=1 by default (fast compression, matching Hadoop's BrotliCompressor default). + */ + private class BrotliDirectCompressor extends BaseCompressor { + private final Object compressor; + + BrotliDirectCompressor() { + try { + this.compressor = BROTLI_COMPRESSOR_CTOR.newInstance(); + } catch (ReflectiveOperationException e) { + throw new DirectCodecPool.ParquetCompressionCodecException("Failed to create Brotli compressor", e); + } + } + + @Override + public CompressionCodecName getCodecName() { + return CompressionCodecName.BROTLI; + } + + @Override + int maxCompressedSize(int size) { + // Brotli worst case: input size + (input size >> 2) + 1K overhead for small inputs + // This is a conservative upper bound matching the Brotli spec + return size + (size >> 2) + 1024; + } + + @Override + int compress(ByteBuffer input, ByteBuffer output) throws IOException { + try { + return (int) BROTLI_COMPRESS_METHOD.invoke(compressor, BROTLI_COMPRESS_PARAMETER, input, output); + } catch (InvocationTargetException e) { + throw new IOException("Brotli compression failed", e.getCause()); + } catch (IllegalAccessException e) { + throw new IOException("Brotli compression failed", e); + } + } + + @Override + void closeCompressor() { + // no-op: BrotliCompressor has no resources to release + } + } + /** * @deprecated Use {@link CodecFactory#NO_OP_COMPRESSOR} instead */ From c55905cea4d8db2b15a78399e2f2189c4854e66b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Mej=C3=ADa?= Date: Wed, 13 May 2026 15:27:45 +0200 Subject: [PATCH 21/27] Use bulk ByteBuffer view reads in BSS batch decode for all numeric types Replace per-value getXxx(offset) loops with position()+asXxxBuffer().get() bulk copy in readFloats/readDoubles/readIntegers/readLongs. The decoded data buffer is a contiguous heap byte[] in LE order, making view buffer bulk reads a single memcpy via Unsafe.copyMemory. Benchmark results (100K values, BSS FLOAT batch): Before: ~1,228M ops/s After: ~1,442M ops/s (+17%) INT32/INT64/DOUBLE show negligible change because BSS invocation cost is dominated by page transposition in initFromPage, not the read loop. 
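A standalone sketch of the view-buffer pattern (class name hypothetical; assumes a little-endian buffer, matching the LE order of the decoded data noted above). The key detail is that asFloatBuffer() creates a view starting at the buffer's current position, so position(byteOffset) must be set before creating the view:

import java.nio.ByteBuffer;
import java.nio.ByteOrder;

class ViewBufferBulkReadSketch {
  public static void main(String[] args) {
    ByteBuffer buf = ByteBuffer.allocate(8 * Float.BYTES).order(ByteOrder.LITTLE_ENDIAN);
    for (int i = 0; i < 8; i++) {
      buf.putFloat(i * 1.5f); // sample little-endian float data
    }
    float[] dest = new float[8];
    buf.position(0);                     // the float view starts at the current byte position
    buf.asFloatBuffer().get(dest, 0, 8); // one bulk transfer instead of eight getFloat(offset) calls
  }
}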
--- .../ByteStreamSplitValuesReaderForDouble.java | 5 ++--- .../bytestreamsplit/ByteStreamSplitValuesReaderForFloat.java | 5 ++--- .../ByteStreamSplitValuesReaderForInteger.java | 5 ++--- .../bytestreamsplit/ByteStreamSplitValuesReaderForLong.java | 5 ++--- 4 files changed, 8 insertions(+), 12 deletions(-) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForDouble.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForDouble.java index e2053eec3a..0917cd3902 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForDouble.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForDouble.java @@ -31,8 +31,7 @@ public double readDouble() { @Override public void readDoubles(double[] dest, int offset, int count) { int byteOffset = advanceByteOffset(count); - for (int i = 0; i < count; i++) { - dest[offset + i] = decodedDataBuffer.getDouble(byteOffset + i * 8); - } + decodedDataBuffer.position(byteOffset); + decodedDataBuffer.asDoubleBuffer().get(dest, offset, count); } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForFloat.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForFloat.java index eb80eacbf1..bb28ef0ac2 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForFloat.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForFloat.java @@ -31,8 +31,7 @@ public float readFloat() { @Override public void readFloats(float[] dest, int offset, int count) { int byteOffset = advanceByteOffset(count); - for (int i = 0; i < count; i++) { - dest[offset + i] = decodedDataBuffer.getFloat(byteOffset + i * 4); - } + decodedDataBuffer.position(byteOffset); + decodedDataBuffer.asFloatBuffer().get(dest, offset, count); } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForInteger.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForInteger.java index 8bac36da17..e71079d2f6 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForInteger.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForInteger.java @@ -31,8 +31,7 @@ public int readInteger() { @Override public void readIntegers(int[] dest, int offset, int count) { int byteOffset = advanceByteOffset(count); - for (int i = 0; i < count; i++) { - dest[offset + i] = decodedDataBuffer.getInt(byteOffset + i * 4); - } + decodedDataBuffer.position(byteOffset); + decodedDataBuffer.asIntBuffer().get(dest, offset, count); } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForLong.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForLong.java index 5186210ef5..f73c46e972 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForLong.java +++ 
b/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForLong.java @@ -31,8 +31,7 @@ public long readLong() { @Override public void readLongs(long[] dest, int offset, int count) { int byteOffset = advanceByteOffset(count); - for (int i = 0; i < count; i++) { - dest[offset + i] = decodedDataBuffer.getLong(byteOffset + i * 8); - } + decodedDataBuffer.position(byteOffset); + decodedDataBuffer.asLongBuffer().get(dest, offset, count); } } From 957d9091e46ce4a101a7f37d4fc8638ec510dd4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Mej=C3=ADa?= Date: Wed, 13 May 2026 15:59:01 +0200 Subject: [PATCH 22/27] Optimize BOOLEAN decoding: direct bit extraction and batch readBooleans() Replace ByteBitPackingValuesReader delegation in BooleanPlainValuesReader with direct bit extraction from the page byte[]. The scalar path uses a single array access + shift + mask instead of the 8-element int[] buffer and packer dispatch. The batch path (readBooleans) unrolls 8 booleans per byte with constant masks. For RLE (V2), add a native readBooleans() method that uses Arrays.fill for RLE runs (constant-time for uniform data) and direct int-to-boolean conversion for packed groups, avoiding the intermediate int[] allocation of the readInts() path. Benchmark results (1M values, JDK 25, Compiler Blackholes): - V1 PLAIN scalar: ~620M -> ~1,528-1,618M ops/s (+150%) - V1 PLAIN batch: ALL_TRUE/FALSE ~5,000M (+680%), RANDOM 2,757M (+337%) - V2 RLE batch: ALL_TRUE/FALSE ~190B (fill), RANDOM 1,335M (+93%) --- .../parquet/column/values/ValuesReader.java | 13 ++ .../plain/BooleanPlainValuesReader.java | 112 +++++++++++++----- .../rle/RunLengthBitPackingHybridDecoder.java | 36 ++++++ ...RunLengthBitPackingHybridValuesReader.java | 9 ++ 4 files changed, 141 insertions(+), 29 deletions(-) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/ValuesReader.java b/parquet-column/src/main/java/org/apache/parquet/column/values/ValuesReader.java index bd7f3eaeff..1511925957 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/ValuesReader.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/ValuesReader.java @@ -241,6 +241,19 @@ public void readDoubles(double[] dest, int offset, int count) { } } + /** + * Reads {@code count} booleans into {@code dest} starting at {@code offset}. 
+ * + * @param dest destination array + * @param offset start index in dest + * @param count number of values to read + */ + public void readBooleans(boolean[] dest, int offset, int count) { + for (int i = 0; i < count; i++) { + dest[offset + i] = readBoolean(); + } + } + /** * Skips the next value in the page */ diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/plain/BooleanPlainValuesReader.java b/parquet-column/src/main/java/org/apache/parquet/column/values/plain/BooleanPlainValuesReader.java index 22ca2d567c..17a80e36c9 100755 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/plain/BooleanPlainValuesReader.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/plain/BooleanPlainValuesReader.java @@ -18,57 +18,111 @@ */ package org.apache.parquet.column.values.plain; -import static org.apache.parquet.column.values.bitpacking.Packer.LITTLE_ENDIAN; - import java.io.IOException; +import java.nio.ByteBuffer; import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.bytes.BytesUtils; import org.apache.parquet.column.values.ValuesReader; -import org.apache.parquet.column.values.bitpacking.ByteBitPackingValuesReader; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** - * encodes boolean for the plain encoding: one bit at a time (0 = false) + * Decodes PLAIN-encoded booleans: one bit per value, packed 8 per byte, little-endian + * bit order (bit 0 of each byte is the first value). + * + *
<p>
Direct bit extraction from the page ByteBuffer avoids the overhead of the generic + * bit-packing machinery ({@code ByteBitPackingValuesReader}) and intermediate + * {@code int[8]} buffers. */ public class BooleanPlainValuesReader extends ValuesReader { private static final Logger LOG = LoggerFactory.getLogger(BooleanPlainValuesReader.class); - private ByteBitPackingValuesReader in = new ByteBitPackingValuesReader(1, LITTLE_ENDIAN); + private byte[] pageData; + private int pageOffset; + private int bitIndex; + + @Override + public void initFromPage(int valueCount, ByteBufferInputStream stream) throws IOException { + LOG.debug("init from page at offset {} for length {}", stream.position(), stream.available()); + int effectiveBitLength = valueCount; // bitWidth = 1 + int length = BytesUtils.paddedByteCountFromBits(effectiveBitLength); + length = Math.min(length, stream.available()); + ByteBuffer buf = stream.slice(length); + + // Bulk access: use backing array directly if available, otherwise copy once. + if (buf.hasArray()) { + pageData = buf.array(); + pageOffset = buf.arrayOffset() + buf.position(); + } else { + pageData = new byte[length]; + buf.get(pageData); + pageOffset = 0; + } + bitIndex = 0; + updateNextOffset(length); + } - /** - * {@inheritDoc} - * - * @see org.apache.parquet.column.values.ValuesReader#readBoolean() - */ @Override public boolean readBoolean() { - return in.readInteger() == 0 ? false : true; + int byteIdx = pageOffset + (bitIndex >>> 3); + int bitPos = bitIndex & 7; + bitIndex++; + return ((pageData[byteIdx] >>> bitPos) & 1) != 0; } - /** - * {@inheritDoc} - * - * @see org.apache.parquet.column.values.ValuesReader#skip() - */ @Override - public void skip() { - in.readInteger(); + public void readBooleans(boolean[] dest, int offset, int count) { + int i = 0; + + // Handle partial byte at current position + int bitPos = bitIndex & 7; + if (bitPos != 0) { + int byteIdx = pageOffset + (bitIndex >>> 3); + byte b = pageData[byteIdx]; + while (bitPos < 8 && i < count) { + dest[offset + i] = ((b >>> bitPos) & 1) != 0; + bitPos++; + i++; + } + } + + // Process full bytes: 8 booleans per byte + int byteIdx = pageOffset + ((bitIndex + i) >>> 3); + while (i + 8 <= count) { + byte b = pageData[byteIdx]; + dest[offset + i] = (b & 1) != 0; + dest[offset + i + 1] = (b & 2) != 0; + dest[offset + i + 2] = (b & 4) != 0; + dest[offset + i + 3] = (b & 8) != 0; + dest[offset + i + 4] = (b & 16) != 0; + dest[offset + i + 5] = (b & 32) != 0; + dest[offset + i + 6] = (b & 64) != 0; + dest[offset + i + 7] = (b & 128) != 0; + byteIdx++; + i += 8; + } + + // Handle remaining bits in the last partial byte + if (i < count) { + byte b = pageData[byteIdx]; + int bp = 0; + while (i < count) { + dest[offset + i] = ((b >>> bp) & 1) != 0; + bp++; + i++; + } + } + + bitIndex += count; } - /** - * {@inheritDoc} - * - * @see org.apache.parquet.column.values.ValuesReader#initFromPage(int, ByteBufferInputStream) - */ @Override - public void initFromPage(int valueCount, ByteBufferInputStream stream) throws IOException { - LOG.debug("init from page at offset {} for length {}", stream.position(), stream.available()); - this.in.initFromPage(valueCount, stream); + public void skip() { + bitIndex++; } - @Deprecated @Override - public int getNextOffset() { - return in.getNextOffset(); + public void skip(int n) { + bitIndex += n; } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridDecoder.java 
b/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridDecoder.java index 8064c25e10..a78d3821ce 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridDecoder.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridDecoder.java @@ -111,6 +111,42 @@ public void readInts(int[] dest, int offset, int count) throws IOException { } } + /** + * Reads {@code count} boolean values into {@code dest} starting at {@code offset}. + * For RLE runs, uses {@code Arrays.fill} with a single boolean value. + * For packed groups, converts each int to boolean. + * + * @param dest destination array + * @param offset start index in dest + * @param count number of values to read + */ + public void readBooleans(boolean[] dest, int offset, int count) throws IOException { + int remaining = count; + int pos = offset; + while (remaining > 0) { + if (currentCount == 0) { + readNext(); + } + int batchSize = Math.min(remaining, currentCount); + switch (mode) { + case RLE: + java.util.Arrays.fill(dest, pos, pos + batchSize, currentValue != 0); + break; + case PACKED: + int startIdx = currentBuffer.length - currentCount; + for (int i = 0; i < batchSize; i++) { + dest[pos + i] = currentBuffer[startIdx + i] != 0; + } + break; + default: + throw new ParquetDecodingException("not a valid mode " + mode); + } + currentCount -= batchSize; + remaining -= batchSize; + pos += batchSize; + } + } + private void readNext() throws IOException { Preconditions.checkArgument(in.available() > 0, "Reading past RLE/BitPacking stream."); final int header = BytesUtils.readUnsignedVarInt(in); diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridValuesReader.java b/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridValuesReader.java index e17867d5f1..9ee70add6d 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridValuesReader.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridValuesReader.java @@ -63,6 +63,15 @@ public void readIntegers(int[] dest, int offset, int count) { } } + @Override + public void readBooleans(boolean[] dest, int offset, int count) { + try { + decoder.readBooleans(dest, offset, count); + } catch (IOException e) { + throw new ParquetDecodingException(e); + } + } + @Override public boolean readBoolean() { return readInteger() == 0 ? false : true; From f36eac3ec48058be8a81667a71c7af70b854bd4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Mej=C3=ADa?= Date: Wed, 13 May 2026 16:37:47 +0200 Subject: [PATCH 23/27] Use lookup table for boolean batch decode (PLAIN + RLE PACKED) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the per-bit unrolled extraction loop with a static boolean[256][8] lookup table + System.arraycopy. Each byte maps to its 8 pre-decoded booleans, and the 8-byte copy is emitted by HotSpot as a single 64-bit load/store pair — the boolean equivalent of asIntBuffer().get() for ints. For RLE PACKED groups (bitWidth=1), bypass the int[] intermediate and read directly from the raw packed bytes via the same lookup table. 
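In outline, the lookup-table decode works like this (a self-contained sketch of the technique, not the patched classes themselves):

    // Sketch: each packed byte expands to its 8 pre-decoded booleans,
    // copied out with a single System.arraycopy per byte.
    public class BoolLookupSketch {
      private static final boolean[][] BYTE_TO_BOOLS = new boolean[256][8];

      static {
        for (int b = 0; b < 256; b++) {
          for (int bit = 0; bit < 8; bit++) {
            BYTE_TO_BOOLS[b][bit] = ((b >>> bit) & 1) != 0; // little-endian bit order
          }
        }
      }

      // Decodes count booleans; count is assumed byte-aligned here for brevity.
      static void decode(byte[] packed, boolean[] dest, int count) {
        for (int i = 0; i < count; i += 8) {
          System.arraycopy(BYTE_TO_BOOLS[packed[i >>> 3] & 0xFF], 0, dest, i, 8);
        }
      }

      public static void main(String[] args) {
        boolean[] out = new boolean[8];
        decode(new byte[] {0b0000_0101}, out, 8); // bits 0 and 2 set
        System.out.println(java.util.Arrays.toString(out)); // [true, false, true, false, ...]
      }
    }
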
This makes batch decode throughput independent of data pattern: - V1 PLAIN batch RANDOM: 2,757M -> 5,047M ops/s (+83%) - V2 RLE batch RANDOM: 1,335M -> 1,618M ops/s (+21%) - V2 RLE batch MOSTLY_TRUE_99: 3,205M -> 3,745M ops/s (+17%) - Uniform patterns (ALL_TRUE/FALSE): unchanged (still Arrays.fill) --- .../plain/BooleanPlainValuesReader.java | 31 ++++++---- .../rle/RunLengthBitPackingHybridDecoder.java | 56 +++++++++++++++++-- 2 files changed, 73 insertions(+), 14 deletions(-) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/plain/BooleanPlainValuesReader.java b/parquet-column/src/main/java/org/apache/parquet/column/values/plain/BooleanPlainValuesReader.java index 17a80e36c9..3843e3b6f0 100755 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/plain/BooleanPlainValuesReader.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/plain/BooleanPlainValuesReader.java @@ -33,10 +33,29 @@ *
<p>
Direct bit extraction from the page ByteBuffer avoids the overhead of the generic * bit-packing machinery ({@code ByteBitPackingValuesReader}) and intermediate * {@code int[8]} buffers. + * + *
<p>
The batch path uses a static 256-entry lookup table that maps each byte value to + * its 8 pre-decoded booleans. This enables {@code System.arraycopy} of 8 booleans per + * byte (a single 64-bit memory operation in HotSpot) instead of 8 individual + * comparison+store operations. */ public class BooleanPlainValuesReader extends ValuesReader { private static final Logger LOG = LoggerFactory.getLogger(BooleanPlainValuesReader.class); + /** + * Lookup table: BYTE_TO_BOOLS[b] contains the 8 boolean values for byte value b, + * in little-endian bit order (bit 0 = index 0). + */ + private static final boolean[][] BYTE_TO_BOOLS = new boolean[256][8]; + + static { + for (int b = 0; b < 256; b++) { + for (int bit = 0; bit < 8; bit++) { + BYTE_TO_BOOLS[b][bit] = ((b >>> bit) & 1) != 0; + } + } + } + private byte[] pageData; private int pageOffset; private int bitIndex; @@ -86,18 +105,10 @@ public void readBooleans(boolean[] dest, int offset, int count) { } } - // Process full bytes: 8 booleans per byte + // Process full bytes: 8 booleans per byte via lookup table + arraycopy int byteIdx = pageOffset + ((bitIndex + i) >>> 3); while (i + 8 <= count) { - byte b = pageData[byteIdx]; - dest[offset + i] = (b & 1) != 0; - dest[offset + i + 1] = (b & 2) != 0; - dest[offset + i + 2] = (b & 4) != 0; - dest[offset + i + 3] = (b & 8) != 0; - dest[offset + i + 4] = (b & 16) != 0; - dest[offset + i + 5] = (b & 32) != 0; - dest[offset + i + 6] = (b & 64) != 0; - dest[offset + i + 7] = (b & 128) != 0; + System.arraycopy(BYTE_TO_BOOLS[pageData[byteIdx] & 0xFF], 0, dest, offset + i, 8); byteIdx++; i += 8; } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridDecoder.java b/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridDecoder.java index a78d3821ce..f2dd50d623 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridDecoder.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridDecoder.java @@ -48,6 +48,8 @@ private static enum MODE { private int currentCount; private int currentValue; private int[] currentBuffer; + // Saved packed bytes for bitWidth=1 boolean optimization (lookup table decode) + private byte[] packedBytesBuffer; public RunLengthBitPackingHybridDecoder(int bitWidth, InputStream in) { LOG.debug("decoding bitWidth {}", bitWidth); @@ -111,10 +113,25 @@ public void readInts(int[] dest, int offset, int count) throws IOException { } } + /** + * Lookup table for bitWidth=1: maps each byte to its 8 boolean values. + * Used by {@link #readBooleans} PACKED path to bypass the int[] intermediate. + */ + private static final boolean[][] BYTE_TO_BOOLS = new boolean[256][8]; + + static { + for (int b = 0; b < 256; b++) { + for (int bit = 0; bit < 8; bit++) { + BYTE_TO_BOOLS[b][bit] = ((b >>> bit) & 1) != 0; + } + } + } + /** * Reads {@code count} boolean values into {@code dest} starting at {@code offset}. * For RLE runs, uses {@code Arrays.fill} with a single boolean value. - * For packed groups, converts each int to boolean. + * For packed groups, uses a lookup table to decode 8 booleans per byte directly + * from the raw packed bytes, bypassing the int[] intermediate buffer. 
* * @param dest destination array * @param offset start index in dest @@ -133,9 +150,39 @@ public void readBooleans(boolean[] dest, int offset, int count) throws IOExcepti java.util.Arrays.fill(dest, pos, pos + batchSize, currentValue != 0); break; case PACKED: - int startIdx = currentBuffer.length - currentCount; - for (int i = 0; i < batchSize; i++) { - dest[pos + i] = currentBuffer[startIdx + i] != 0; + // For bitWidth=1, read directly from packedBytesBuffer via lookup table + int bitOff = currentBuffer.length - currentCount; + int written = 0; + + // Handle partial byte alignment + int bitPos = bitOff & 7; + if (bitPos != 0) { + int byteIdx = bitOff >>> 3; + byte b = packedBytesBuffer[byteIdx]; + while (bitPos < 8 && written < batchSize) { + dest[pos + written] = ((b >>> bitPos) & 1) != 0; + bitPos++; + written++; + } + } + + // Process full bytes via lookup table + int byteIdx = (bitOff + written) >>> 3; + while (written + 8 <= batchSize) { + System.arraycopy(BYTE_TO_BOOLS[packedBytesBuffer[byteIdx] & 0xFF], 0, dest, pos + written, 8); + byteIdx++; + written += 8; + } + + // Handle remaining bits + if (written < batchSize) { + byte b = packedBytesBuffer[byteIdx]; + int bp = 0; + while (written < batchSize) { + dest[pos + written] = ((b >>> bp) & 1) != 0; + bp++; + written++; + } } break; default: @@ -167,6 +214,7 @@ private void readNext() throws IOException { int bytesToRead = (int) Math.ceil(currentCount * bitWidth / 8.0); bytesToRead = Math.min(bytesToRead, in.available()); new DataInputStream(in).readFully(bytes, 0, bytesToRead); + packedBytesBuffer = bytes; for (int valueIndex = 0, byteIndex = 0; valueIndex < currentCount; valueIndex += 8, byteIndex += bitWidth) { From 650331440a60ccad420f1fc1ea9132d9dc7f62c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Mej=C3=ADa?= Date: Wed, 13 May 2026 17:38:06 +0200 Subject: [PATCH 24/27] Optimize BOOLEAN encoding: batch writeBooleans() with direct byte packing Refactor BooleanPlainValuesWriter to pack bits directly into bytes instead of delegating through ByteBitPackingValuesWriter and the generic int[8]-based ByteBasedBitPackingEncoder. Add batch writeBooleans() API to ValuesWriter with optimized overrides: - PLAIN: processes 8 booleans at a time into single bytes with OR/shift, eliminating the per-value method call chain and int[] intermediate. - RLE: pre-scans for runs >= 8 to emit RLE directly, fills partial bit-packed groups from run boundaries to avoid spurious padding. PLAIN scalar improves +69% (890M -> 1,500M ops/s) from the refactoring. PLAIN batch: +184% over old scalar (2,528M for RANDOM). RLE batch: +278% for ALL_FALSE, +95% for MOSTLY_*, +36% for ALTERNATING. --- .../parquet/column/values/ValuesWriter.java | 13 +++ .../plain/BooleanPlainValuesWriter.java | 90 ++++++++++++++++--- .../rle/RunLengthBitPackingHybridEncoder.java | 65 ++++++++++++++ ...RunLengthBitPackingHybridValuesWriter.java | 9 ++ 4 files changed, 165 insertions(+), 12 deletions(-) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/ValuesWriter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/ValuesWriter.java index ecea4a7520..a50bfe8a3b 100755 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/ValuesWriter.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/ValuesWriter.java @@ -98,6 +98,19 @@ public void writeBoolean(boolean v) { throw new UnsupportedOperationException(getClass().getName()); } + /** + * Writes a batch of boolean values. 
Subclasses may override for optimized bulk encoding. + * + * @param values the boolean array to read from + * @param offset the start position in the array + * @param length the number of values to write + */ + public void writeBooleans(boolean[] values, int offset, int length) { + for (int i = offset; i < offset + length; i++) { + writeBoolean(values[i]); + } + } + /** * @param v the value to encode */ diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/plain/BooleanPlainValuesWriter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/plain/BooleanPlainValuesWriter.java index 7f80ec150a..ae3b43c63b 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/plain/BooleanPlainValuesWriter.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/plain/BooleanPlainValuesWriter.java @@ -19,52 +19,118 @@ package org.apache.parquet.column.values.plain; import static org.apache.parquet.column.Encoding.PLAIN; -import static org.apache.parquet.column.values.bitpacking.Packer.LITTLE_ENDIAN; import org.apache.parquet.bytes.BytesInput; +import org.apache.parquet.bytes.CapacityByteArrayOutputStream; import org.apache.parquet.column.Encoding; import org.apache.parquet.column.values.ValuesWriter; -import org.apache.parquet.column.values.bitpacking.ByteBitPackingValuesWriter; /** - * An implementation of the PLAIN encoding + * An implementation of the PLAIN encoding for BOOLEAN values. + * + *
<p>
Packs booleans directly into bytes (8 per byte, LSB first) without + * going through the generic int-based bit-packing encoder. */ public class BooleanPlainValuesWriter extends ValuesWriter { - private ByteBitPackingValuesWriter bitPackingWriter; + private static final int INITIAL_SLAB_SIZE = 1024; + private static final int MAX_CAPACITY = 64 * 1024; + + private final CapacityByteArrayOutputStream baos; + private int currentByte; + private int bitsWritten; public BooleanPlainValuesWriter() { - bitPackingWriter = new ByteBitPackingValuesWriter(1, LITTLE_ENDIAN); + this.baos = new CapacityByteArrayOutputStream(INITIAL_SLAB_SIZE, MAX_CAPACITY); + this.currentByte = 0; + this.bitsWritten = 0; } @Override public final void writeBoolean(boolean v) { - bitPackingWriter.writeInteger(v ? 1 : 0); + currentByte |= ((v ? 1 : 0) << bitsWritten); + bitsWritten++; + if (bitsWritten == 8) { + baos.write(currentByte); + currentByte = 0; + bitsWritten = 0; + } + } + + @Override + public void writeBooleans(boolean[] values, int offset, int length) { + int pos = offset; + int end = offset + length; + + // Fill current partial byte + while (bitsWritten > 0 && bitsWritten < 8 && pos < end) { + if (values[pos]) { + currentByte |= (1 << bitsWritten); + } + bitsWritten++; + pos++; + if (bitsWritten == 8) { + baos.write(currentByte); + currentByte = 0; + bitsWritten = 0; + } + } + + // Process 8 values at a time — pack directly into a byte + while (pos + 8 <= end) { + int b = 0; + if (values[pos]) b |= 0x01; + if (values[pos + 1]) b |= 0x02; + if (values[pos + 2]) b |= 0x04; + if (values[pos + 3]) b |= 0x08; + if (values[pos + 4]) b |= 0x10; + if (values[pos + 5]) b |= 0x20; + if (values[pos + 6]) b |= 0x40; + if (values[pos + 7]) b |= 0x80; + baos.write(b); + pos += 8; + } + + // Handle remaining values (< 8) + while (pos < end) { + if (values[pos]) { + currentByte |= (1 << bitsWritten); + } + bitsWritten++; + pos++; + } } @Override public long getBufferedSize() { - return bitPackingWriter.getBufferedSize(); + return baos.size() + (bitsWritten > 0 ? 1 : 0); } @Override public BytesInput getBytes() { - return bitPackingWriter.getBytes(); + if (bitsWritten > 0) { + baos.write(currentByte); + currentByte = 0; + bitsWritten = 0; + } + return BytesInput.from(baos); } @Override public void reset() { - bitPackingWriter.reset(); + baos.reset(); + currentByte = 0; + bitsWritten = 0; } @Override public void close() { - bitPackingWriter.close(); + baos.close(); } @Override public long getAllocatedSize() { - return bitPackingWriter.getAllocatedSize(); + return baos.getCapacity(); } @Override @@ -74,6 +140,6 @@ public Encoding getEncoding() { @Override public String memUsageString(String prefix) { - return bitPackingWriter.memUsageString(prefix); + return String.format("%s BooleanPlainValuesWriter %d bytes", prefix, getAllocatedSize()); } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridEncoder.java b/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridEncoder.java index e33824bff1..fc83e85963 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridEncoder.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridEncoder.java @@ -272,6 +272,71 @@ public BytesInput toBytes() throws IOException { return BytesInput.from(baos); } + /** + * Batch-encodes boolean values (bitWidth must be 1). 
Pre-scans for runs to emit + * RLE runs directly and packs remaining groups into bit-packed runs, bypassing + * the per-value state machine. + * + *
<p>
This method may only be called when the encoder is in its initial state + * (no values have been written via {@link #writeInt}). If called after scalar + * writes, behavior is undefined. + * + * @param values the boolean array + * @param offset start position in the array + * @param length number of values to encode + */ + public void writeBooleans(boolean[] values, int offset, int length) throws IOException { + Preconditions.checkArgument(bitWidth == 1, "writeBooleans requires bitWidth == 1"); + + int pos = offset; + int end = offset + length; + + while (pos < end) { + // Scan for run of consecutive identical values + boolean val = values[pos]; + int runStart = pos; + pos++; + while (pos < end && values[pos] == val) { + pos++; + } + int runLen = pos - runStart; + int intVal = val ? 1 : 0; + + // If we have a pending partial buffer, fill it first from this run + if (numBufferedValues > 0 && runLen >= 8) { + int fill = 8 - numBufferedValues; + for (int i = 0; i < fill; i++) { + bufferedValues[numBufferedValues] = intVal; + numBufferedValues++; + } + writeOrAppendBitPackedRun(); + runLen -= fill; + } + + if (runLen >= 8) { + // Buffer is empty now, emit RLE run for the remaining + endPreviousBitPackedRun(); + BytesUtils.writeUnsignedVarInt(runLen << 1, baos); + BytesUtils.writeIntLittleEndianPaddedOnBitWidth(baos, intVal, bitWidth); + } else { + // Buffer values for bit-packing + for (int i = 0; i < runLen; i++) { + bufferedValues[numBufferedValues] = intVal; + numBufferedValues++; + if (numBufferedValues == 8) { + writeOrAppendBitPackedRun(); + } + } + } + } + + // Update state so toBytes() handles the tail correctly + repeatCount = 0; + if (numBufferedValues > 0) { + previousValue = bufferedValues[numBufferedValues - 1]; + } + } + /** * Reset this encoder for re-use */ diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridValuesWriter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridValuesWriter.java index e869b0f2a3..b6609b1d43 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridValuesWriter.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridValuesWriter.java @@ -52,6 +52,15 @@ public void writeBoolean(boolean v) { writeInteger(v ? 1 : 0); } + @Override + public void writeBooleans(boolean[] values, int offset, int length) { + try { + encoder.writeBooleans(values, offset, length); + } catch (IOException e) { + throw new ParquetEncodingException(e); + } + } + @Override public long getBufferedSize() { return encoder.getBufferedSize(); From 7449f7c72d0e3e702ab900b79d0bb4cb5d18b7ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Mej=C3=ADa?= Date: Wed, 13 May 2026 18:13:23 +0200 Subject: [PATCH 25/27] Optimize PLAIN encoding: batch writeIntegers/writeLongs/writeFloats/writeDoubles with bulk ByteBuffer view transfers Add bulk write methods to CapacityByteArrayOutputStream (writeInts, writeLongs, writeFloats, writeDoubles) that use IntBuffer/LongBuffer/FloatBuffer/DoubleBuffer view puts to transfer entire arrays in one operation, amortizing capacity checks across the batch. Add corresponding batch APIs to ValuesWriter (with scalar default) and optimized overrides in PlainValuesWriter. 
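A self-contained sketch of the bulk view transfer (names are illustrative; the real methods live in CapacityByteArrayOutputStream and additionally handle slab overflow):

    import java.nio.ByteBuffer;
    import java.nio.ByteOrder;

    // Sketch: IntBuffer.put(int[], off, len) transfers the whole batch into
    // the little-endian slab at once. The view buffer has its own independent
    // position, so the slab's byte position must be advanced manually.
    public class BulkViewWriteSketch {
      public static void main(String[] args) {
        int[] values = {1, 2, 3, 4};
        ByteBuffer slab = ByteBuffer.allocate(64).order(ByteOrder.LITTLE_ENDIAN);
        slab.asIntBuffer().put(values, 0, values.length); // one bulk transfer
        slab.position(slab.position() + values.length * 4); // keep the slab in sync
        System.out.println("bytes used: " + slab.position()); // 16
      }
    }
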
Performance improvement (100K values, JDK 25): INT32: 566M -> 2,809M ops/s (+396%) FLOAT: 540M -> 2,818M ops/s (+422%) INT64: 479M -> 1,306M ops/s (+173%) DOUBLE: 442M -> 1,275M ops/s (+189%) --- .../parquet/column/values/ValuesWriter.java | 52 +++++++++ .../values/plain/PlainValuesWriter.java | 20 ++++ .../bytes/CapacityByteArrayOutputStream.java | 110 ++++++++++++++++++ 3 files changed, 182 insertions(+) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/ValuesWriter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/ValuesWriter.java index a50bfe8a3b..ce16c9f59d 100755 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/ValuesWriter.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/ValuesWriter.java @@ -125,6 +125,19 @@ public void writeInteger(int v) { throw new UnsupportedOperationException(getClass().getName()); } + /** + * Writes a batch of int values. Subclasses may override for optimized bulk encoding. + * + * @param values the int array to read from + * @param offset the start position in the array + * @param length the number of values to write + */ + public void writeIntegers(int[] values, int offset, int length) { + for (int i = offset; i < offset + length; i++) { + writeInteger(values[i]); + } + } + /** * @param v the value to encode */ @@ -132,6 +145,19 @@ public void writeLong(long v) { throw new UnsupportedOperationException(getClass().getName()); } + /** + * Writes a batch of long values. Subclasses may override for optimized bulk encoding. + * + * @param values the long array to read from + * @param offset the start position in the array + * @param length the number of values to write + */ + public void writeLongs(long[] values, int offset, int length) { + for (int i = offset; i < offset + length; i++) { + writeLong(values[i]); + } + } + /** * @param v the value to encode */ @@ -139,6 +165,19 @@ public void writeDouble(double v) { throw new UnsupportedOperationException(getClass().getName()); } + /** + * Writes a batch of double values. Subclasses may override for optimized bulk encoding. + * + * @param values the double array to read from + * @param offset the start position in the array + * @param length the number of values to write + */ + public void writeDoubles(double[] values, int offset, int length) { + for (int i = offset; i < offset + length; i++) { + writeDouble(values[i]); + } + } + /** * @param v the value to encode */ @@ -146,5 +185,18 @@ public void writeFloat(float v) { throw new UnsupportedOperationException(getClass().getName()); } + /** + * Writes a batch of float values. Subclasses may override for optimized bulk encoding. 
+ * + * @param values the float array to read from + * @param offset the start position in the array + * @param length the number of values to write + */ + public void writeFloats(float[] values, int offset, int length) { + for (int i = offset; i < offset + length; i++) { + writeFloat(values[i]); + } + } + public abstract String memUsageString(String prefix); } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/plain/PlainValuesWriter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/plain/PlainValuesWriter.java index c7069bc092..0802f46d2a 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/plain/PlainValuesWriter.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/plain/PlainValuesWriter.java @@ -94,6 +94,26 @@ public final void writeDouble(double v) { } } + @Override + public final void writeIntegers(int[] values, int offset, int length) { + arrayOut.writeInts(values, offset, length); + } + + @Override + public final void writeLongs(long[] values, int offset, int length) { + arrayOut.writeLongs(values, offset, length); + } + + @Override + public final void writeFloats(float[] values, int offset, int length) { + arrayOut.writeFloats(values, offset, length); + } + + @Override + public final void writeDoubles(double[] values, int offset, int length) { + arrayOut.writeDoubles(values, offset, length); + } + @Override public void writeByte(int value) { try { diff --git a/parquet-common/src/main/java/org/apache/parquet/bytes/CapacityByteArrayOutputStream.java b/parquet-common/src/main/java/org/apache/parquet/bytes/CapacityByteArrayOutputStream.java index d3d8b1b6de..7dbe22a6b3 100644 --- a/parquet-common/src/main/java/org/apache/parquet/bytes/CapacityByteArrayOutputStream.java +++ b/parquet-common/src/main/java/org/apache/parquet/bytes/CapacityByteArrayOutputStream.java @@ -27,6 +27,7 @@ import java.io.IOException; import java.io.OutputStream; import java.nio.ByteBuffer; +import java.nio.ByteOrder; import java.util.ArrayList; import java.util.List; import org.apache.parquet.OutputStreamCloseException; @@ -201,6 +202,7 @@ private void addSlab(int minimumSize) { LOG.debug("used {} slabs, adding new slab of size {}", slabs.size(), nextSlabSize); this.currentSlab = allocator.allocate(nextSlabSize); + this.currentSlab.order(ByteOrder.LITTLE_ENDIAN); this.slabs.add(currentSlab); this.bytesAllocated = Math.addExact(this.bytesAllocated, nextSlabSize); } @@ -232,6 +234,114 @@ public void write(byte b[], int off, int len) { bytesUsed = Math.addExact(bytesUsed, len); } + /** + * Writes multiple int values in little-endian byte order using bulk {@code IntBuffer} transfer. + * Amortizes capacity checks across the entire batch and leverages platform-optimized bulk put. 
+ * + * @param values source array + * @param offset start index in values + * @param length number of ints to write + */ + public void writeInts(int[] values, int offset, int length) { + int bytesNeeded = length * 4; + if (bytesNeeded <= currentSlab.remaining()) { + currentSlab.asIntBuffer().put(values, offset, length); + currentSlab.position(currentSlab.position() + bytesNeeded); + } else { + // Fill current slab, then continue into a new one + int fits = currentSlab.remaining() / 4; + if (fits > 0) { + currentSlab.asIntBuffer().put(values, offset, fits); + currentSlab.position(currentSlab.position() + fits * 4); + } + int remaining = length - fits; + addSlab(remaining * 4); + currentSlab.asIntBuffer().put(values, offset + fits, remaining); + currentSlab.position(currentSlab.position() + remaining * 4); + } + bytesUsed = Math.addExact(bytesUsed, bytesNeeded); + } + + /** + * Writes multiple long values in little-endian byte order using bulk {@code LongBuffer} transfer. + * + * @param values source array + * @param offset start index in values + * @param length number of longs to write + */ + public void writeLongs(long[] values, int offset, int length) { + int bytesNeeded = length * 8; + if (bytesNeeded <= currentSlab.remaining()) { + currentSlab.asLongBuffer().put(values, offset, length); + currentSlab.position(currentSlab.position() + bytesNeeded); + } else { + int fits = currentSlab.remaining() / 8; + if (fits > 0) { + currentSlab.asLongBuffer().put(values, offset, fits); + currentSlab.position(currentSlab.position() + fits * 8); + } + int remaining = length - fits; + addSlab(remaining * 8); + currentSlab.asLongBuffer().put(values, offset + fits, remaining); + currentSlab.position(currentSlab.position() + remaining * 8); + } + bytesUsed = Math.addExact(bytesUsed, bytesNeeded); + } + + /** + * Writes multiple float values in little-endian byte order using bulk {@code FloatBuffer} transfer. + * The slab's LE byte order ensures correct IEEE 754 encoding without explicit + * {@code Float.floatToIntBits()} conversion. + * + * @param values source array + * @param offset start index in values + * @param length number of floats to write + */ + public void writeFloats(float[] values, int offset, int length) { + int bytesNeeded = length * 4; + if (bytesNeeded <= currentSlab.remaining()) { + currentSlab.asFloatBuffer().put(values, offset, length); + currentSlab.position(currentSlab.position() + bytesNeeded); + } else { + int fits = currentSlab.remaining() / 4; + if (fits > 0) { + currentSlab.asFloatBuffer().put(values, offset, fits); + currentSlab.position(currentSlab.position() + fits * 4); + } + int remaining = length - fits; + addSlab(remaining * 4); + currentSlab.asFloatBuffer().put(values, offset + fits, remaining); + currentSlab.position(currentSlab.position() + remaining * 4); + } + bytesUsed = Math.addExact(bytesUsed, bytesNeeded); + } + + /** + * Writes multiple double values in little-endian byte order using bulk {@code DoubleBuffer} transfer. 
+ * + * @param values source array + * @param offset start index in values + * @param length number of doubles to write + */ + public void writeDoubles(double[] values, int offset, int length) { + int bytesNeeded = length * 8; + if (bytesNeeded <= currentSlab.remaining()) { + currentSlab.asDoubleBuffer().put(values, offset, length); + currentSlab.position(currentSlab.position() + bytesNeeded); + } else { + int fits = currentSlab.remaining() / 8; + if (fits > 0) { + currentSlab.asDoubleBuffer().put(values, offset, fits); + currentSlab.position(currentSlab.position() + fits * 8); + } + int remaining = length - fits; + addSlab(remaining * 8); + currentSlab.asDoubleBuffer().put(values, offset + fits, remaining); + currentSlab.position(currentSlab.position() + remaining * 8); + } + bytesUsed = Math.addExact(bytesUsed, bytesNeeded); + } + private void writeToOutput(OutputStream out, ByteBuffer buf, int len) throws IOException { if (buf.hasArray()) { out.write(buf.array(), buf.arrayOffset(), len); From e8e256d034232b6dcb61b67fc7cf4107a8cb7a9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Mej=C3=ADa?= Date: Wed, 13 May 2026 19:02:04 +0200 Subject: [PATCH 26/27] Add batch read/write APIs for fixed-length byte arrays - ValuesReader.readBinaries() / ValuesWriter.writeBinaries() default impls - FixedLenByteArrayPlainValuesReader: bulk slice() with fixed-offset Binary views - FixedLenByteArrayPlainValuesWriter: chunked bulk write() amortizing stream overhead - ByteStreamSplitValuesReader: optimized array-based decode with unrolled loops for element sizes 2, 4, 8, 12, 16 - ByteStreamSplitValuesReaderForFLBA: batch readBinaries() with single advanceByteOffset - FixedLenByteArrayEncodingBenchmark: full FLBA benchmark suite with batch variants - Add TestDataFactory and BenchmarkEncodingUtils helper classes - Fix JMH annotation processor config in pom.xml for Maven Compiler 3.14+ --- .../FixedLenByteArrayEncodingBenchmark.java | 10 ++ .../parquet/column/values/ValuesReader.java | 13 ++ .../parquet/column/values/ValuesWriter.java | 13 ++ .../ByteStreamSplitValuesReader.java | 117 ++++++++++++++++-- .../ByteStreamSplitValuesReaderForFLBA.java | 13 ++ .../FixedLenByteArrayPlainValuesReader.java | 20 +++ .../FixedLenByteArrayPlainValuesWriter.java | 36 ++++++ 7 files changed, 214 insertions(+), 8 deletions(-) diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FixedLenByteArrayEncodingBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FixedLenByteArrayEncodingBenchmark.java index a2f0f9bbc7..bd4ba406bb 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FixedLenByteArrayEncodingBenchmark.java +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FixedLenByteArrayEncodingBenchmark.java @@ -245,6 +245,16 @@ public void decodeBss(Blackhole bh) throws IOException { } } + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeBssBatch(Blackhole bh) throws IOException { + ByteStreamSplitValuesReaderForFLBA reader = new ByteStreamSplitValuesReaderForFLBA(fixedLength); + reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(bssEncoded))); + Binary[] batch = new Binary[VALUE_COUNT]; + reader.readBinaries(batch, 0, VALUE_COUNT); + bh.consume(batch); + } + @Benchmark @OperationsPerInvocation(VALUE_COUNT) public void decodeDictionary(Blackhole bh) throws IOException { diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/ValuesReader.java 
b/parquet-column/src/main/java/org/apache/parquet/column/values/ValuesReader.java index 1511925957..114936d153 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/ValuesReader.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/ValuesReader.java @@ -254,6 +254,19 @@ public void readBooleans(boolean[] dest, int offset, int count) { } } + /** + * Reads {@code count} Binary values into {@code dest} starting at {@code offset}. + * + * @param dest destination array + * @param offset start index in dest + * @param count number of values to read + */ + public void readBinaries(Binary[] dest, int offset, int count) { + for (int i = 0; i < count; i++) { + dest[offset + i] = readBytes(); + } + } + /** * Skips the next value in the page */ diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/ValuesWriter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/ValuesWriter.java index ce16c9f59d..bbe9230397 100755 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/ValuesWriter.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/ValuesWriter.java @@ -118,6 +118,19 @@ public void writeBytes(Binary v) { throw new UnsupportedOperationException(getClass().getName()); } + /** + * Writes a batch of Binary values. Subclasses may override for optimized bulk encoding. + * + * @param values the Binary array to read from + * @param offset the start position in the array + * @param length the number of values to write + */ + public void writeBinaries(Binary[] values, int offset, int length) { + for (int i = offset; i < offset + length; i++) { + writeBytes(values[i]); + } + } + /** * @param v the value to encode */ diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReader.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReader.java index 6b7449ea11..a500e1401e 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReader.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReader.java @@ -62,17 +62,118 @@ protected int advanceByteOffset(int count) { return offset; } - // Decode an entire data page + // Decode an entire data page by transposing from stream-split layout to interleaved layout. private byte[] decodeData(ByteBuffer encoded, int valuesCount) { - assert encoded.limit() == valuesCount * elementSizeInBytes; - byte[] decoded = new byte[encoded.limit()]; - int destByteIndex = 0; - for (int srcValueIndex = 0; srcValueIndex < valuesCount; ++srcValueIndex) { - for (int stream = 0; stream < elementSizeInBytes; ++stream, ++destByteIndex) { - decoded[destByteIndex] = encoded.get(srcValueIndex + stream * valuesCount); + int totalBytes = valuesCount * elementSizeInBytes; + assert encoded.remaining() >= totalBytes; + + // Bulk access: use the backing array directly if available, otherwise copy once. + byte[] src; + int srcBase; + if (encoded.hasArray()) { + src = encoded.array(); + srcBase = encoded.arrayOffset() + encoded.position(); + } else { + src = new byte[totalBytes]; + encoded.get(src); + srcBase = 0; + } + + byte[] decoded = new byte[totalBytes]; + + // Specialized single-pass loops for common element sizes. 
+ if (elementSizeInBytes == 2) { + int s0 = srcBase, s1 = srcBase + valuesCount; + for (int i = 0; i < valuesCount; ++i) { + int di = i * 2; + decoded[di] = src[s0 + i]; + decoded[di + 1] = src[s1 + i]; + } + } else if (elementSizeInBytes == 4) { + int s0 = srcBase, s1 = srcBase + valuesCount, s2 = srcBase + 2 * valuesCount, + s3 = srcBase + 3 * valuesCount; + for (int i = 0; i < valuesCount; ++i) { + int di = i * 4; + decoded[di] = src[s0 + i]; + decoded[di + 1] = src[s1 + i]; + decoded[di + 2] = src[s2 + i]; + decoded[di + 3] = src[s3 + i]; + } + } else if (elementSizeInBytes == 8) { + int s0 = srcBase, s1 = srcBase + valuesCount, s2 = srcBase + 2 * valuesCount, + s3 = srcBase + 3 * valuesCount, s4 = srcBase + 4 * valuesCount, + s5 = srcBase + 5 * valuesCount, s6 = srcBase + 6 * valuesCount, + s7 = srcBase + 7 * valuesCount; + for (int i = 0; i < valuesCount; ++i) { + int di = i * 8; + decoded[di] = src[s0 + i]; + decoded[di + 1] = src[s1 + i]; + decoded[di + 2] = src[s2 + i]; + decoded[di + 3] = src[s3 + i]; + decoded[di + 4] = src[s4 + i]; + decoded[di + 5] = src[s5 + i]; + decoded[di + 6] = src[s6 + i]; + decoded[di + 7] = src[s7 + i]; + } + } else if (elementSizeInBytes == 12) { + int s0 = srcBase, s1 = srcBase + valuesCount, s2 = srcBase + 2 * valuesCount, + s3 = srcBase + 3 * valuesCount, s4 = srcBase + 4 * valuesCount, + s5 = srcBase + 5 * valuesCount, s6 = srcBase + 6 * valuesCount, + s7 = srcBase + 7 * valuesCount, s8 = srcBase + 8 * valuesCount, + s9 = srcBase + 9 * valuesCount, s10 = srcBase + 10 * valuesCount, + s11 = srcBase + 11 * valuesCount; + for (int i = 0; i < valuesCount; ++i) { + int di = i * 12; + decoded[di] = src[s0 + i]; + decoded[di + 1] = src[s1 + i]; + decoded[di + 2] = src[s2 + i]; + decoded[di + 3] = src[s3 + i]; + decoded[di + 4] = src[s4 + i]; + decoded[di + 5] = src[s5 + i]; + decoded[di + 6] = src[s6 + i]; + decoded[di + 7] = src[s7 + i]; + decoded[di + 8] = src[s8 + i]; + decoded[di + 9] = src[s9 + i]; + decoded[di + 10] = src[s10 + i]; + decoded[di + 11] = src[s11 + i]; + } + } else if (elementSizeInBytes == 16) { + int s0 = srcBase, s1 = srcBase + valuesCount, s2 = srcBase + 2 * valuesCount, + s3 = srcBase + 3 * valuesCount, s4 = srcBase + 4 * valuesCount, + s5 = srcBase + 5 * valuesCount, s6 = srcBase + 6 * valuesCount, + s7 = srcBase + 7 * valuesCount, s8 = srcBase + 8 * valuesCount, + s9 = srcBase + 9 * valuesCount, s10 = srcBase + 10 * valuesCount, + s11 = srcBase + 11 * valuesCount, s12 = srcBase + 12 * valuesCount, + s13 = srcBase + 13 * valuesCount, s14 = srcBase + 14 * valuesCount, + s15 = srcBase + 15 * valuesCount; + for (int i = 0; i < valuesCount; ++i) { + int di = i * 16; + decoded[di] = src[s0 + i]; + decoded[di + 1] = src[s1 + i]; + decoded[di + 2] = src[s2 + i]; + decoded[di + 3] = src[s3 + i]; + decoded[di + 4] = src[s4 + i]; + decoded[di + 5] = src[s5 + i]; + decoded[di + 6] = src[s6 + i]; + decoded[di + 7] = src[s7 + i]; + decoded[di + 8] = src[s8 + i]; + decoded[di + 9] = src[s9 + i]; + decoded[di + 10] = src[s10 + i]; + decoded[di + 11] = src[s11 + i]; + decoded[di + 12] = src[s12 + i]; + decoded[di + 13] = src[s13 + i]; + decoded[di + 14] = src[s14 + i]; + decoded[di + 15] = src[s15 + i]; + } + } else { + // Generic fallback for arbitrary element sizes + for (int stream = 0; stream < elementSizeInBytes; ++stream) { + int srcOffset = srcBase + stream * valuesCount; + for (int i = 0; i < valuesCount; ++i) { + decoded[i * elementSizeInBytes + stream] = src[srcOffset + i]; + } } } - assert destByteIndex == decoded.length; return 
decoded; } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForFLBA.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForFLBA.java index d8613dd8b9..b026a7d76e 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForFLBA.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderForFLBA.java @@ -30,4 +30,17 @@ public ByteStreamSplitValuesReaderForFLBA(int length) { public Binary readBytes() { return Binary.fromConstantByteBuffer(decodedDataBuffer, nextElementByteOffset(), elementSizeInBytes); } + + /** + * Batch read: advances the stream by {@code count} elements in a single bounds check, + * then creates Binary views at sequential offsets — eliminating per-value bounds checking. + */ + @Override + public void readBinaries(Binary[] dest, int offset, int count) { + int byteOffset = advanceByteOffset(count); + for (int i = 0; i < count; i++) { + dest[offset + i] = Binary.fromConstantByteBuffer(decodedDataBuffer, byteOffset, elementSizeInBytes); + byteOffset += elementSizeInBytes; + } + } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/plain/FixedLenByteArrayPlainValuesReader.java b/parquet-column/src/main/java/org/apache/parquet/column/values/plain/FixedLenByteArrayPlainValuesReader.java index adfc488924..6200ae4477 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/plain/FixedLenByteArrayPlainValuesReader.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/plain/FixedLenByteArrayPlainValuesReader.java @@ -19,6 +19,7 @@ package org.apache.parquet.column.values.plain; import java.io.IOException; +import java.nio.ByteBuffer; import org.apache.parquet.bytes.ByteBufferInputStream; import org.apache.parquet.column.values.ValuesReader; import org.apache.parquet.io.ParquetDecodingException; @@ -62,6 +63,25 @@ public void skip(int n) { } } + /** + * Batch read: slices the entire block of {@code count * length} bytes in one call, + * then creates Binary views at fixed offsets within the single ByteBuffer — eliminating + * per-value slice overhead. 
+ */ + @Override + public void readBinaries(Binary[] dest, int offset, int count) { + try { + int totalBytes = count * length; + ByteBuffer block = in.slice(totalBytes); + int baseOffset = block.position(); + for (int i = 0; i < count; i++) { + dest[offset + i] = Binary.fromConstantByteBuffer(block, baseOffset + i * length, length); + } + } catch (IOException | RuntimeException e) { + throw new ParquetDecodingException("could not read bytes at offset " + in.position(), e); + } + } + @Override public void initFromPage(int valueCount, ByteBufferInputStream stream) throws IOException { LOG.debug("init from page at offset {} for length {}", stream.position(), stream.available()); diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/plain/FixedLenByteArrayPlainValuesWriter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/plain/FixedLenByteArrayPlainValuesWriter.java index dec4d1be1b..9d8c7e464b 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/plain/FixedLenByteArrayPlainValuesWriter.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/plain/FixedLenByteArrayPlainValuesWriter.java @@ -62,6 +62,42 @@ public final void writeBytes(Binary v) { } } + /** + * Batch write: copies Binary values into a temporary buffer and writes them in a single + * bulk {@code write()} call to the output stream, amortizing stream overhead across + * the entire batch. + */ + @Override + public void writeBinaries(Binary[] values, int offset, int length) { + final int fixedLen = this.length; + // Process in chunks to avoid excessive temp allocation + final int CHUNK = 1024; + byte[] buf = new byte[Math.min(length, CHUNK) * fixedLen]; + try { + int remaining = length; + int srcIdx = offset; + while (remaining > 0) { + int batch = Math.min(remaining, CHUNK); + int bufPos = 0; + for (int i = 0; i < batch; i++) { + Binary v = values[srcIdx++]; + if (v.length() != fixedLen) { + throw new IllegalArgumentException( + "Fixed Binary size " + v.length() + " does not match field type length " + fixedLen); + } + // Copy bytes from the Binary's backing store into the batch buffer + byte[] bytes = v.getBytesUnsafe(); + System.arraycopy(bytes, 0, buf, bufPos, fixedLen); + bufPos += fixedLen; + } + arrayOut.write(buf, 0, bufPos); + remaining -= batch; + } + } catch (RuntimeException e) { + throw new ParquetEncodingException("could not write fixed bytes", e); + } + } + @Override public long getBufferedSize() { return arrayOut.size(); From 165bf498e03de49bc820895fde950d237d4de645 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Mej=C3=ADa?= Date: Wed, 13 May 2026 19:03:31 +0200 Subject: [PATCH 27/27] Optimize FLBA BSS encoding: batched scatter replacing per-byte stream writes Replace per-value scatterBytes() in FixedLenByteArrayByteStreamSplitValuesWriter with a BATCH_SIZE=64 buffered scatter pattern: - Accumulate byte values into per-stream batch buffers - Flush as bulk write(byte[], 0, count) to each stream - Eliminates N*elementSize individual stream.write(byte) calls per batch - Adds writeBinaries() batch override for FLBA BSS writer Performance improvement: FLBA size=2 +85%, size=16 +160% (vs per-byte scatter). 
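The staging pattern in isolation (a sketch; java.io.ByteArrayOutputStream stands in for the per-stream CapacityByteArrayOutputStream buffers):

    import java.io.ByteArrayOutputStream;

    // Sketch: bytes are staged per stream in small batch buffers and flushed
    // with one bulk write per stream, rather than one write(byte) call per
    // byte of every value.
    public class BatchedScatterSketch {
      static final int BATCH_SIZE = 64;

      public static void main(String[] args) {
        int elementSize = 4; // FLBA length
        byte[][] batchBufs = new byte[elementSize][BATCH_SIZE];
        ByteArrayOutputStream[] streams = new ByteArrayOutputStream[elementSize];
        for (int s = 0; s < elementSize; s++) {
          streams[s] = new ByteArrayOutputStream();
        }
        byte[][] values = {{1, 2, 3, 4}, {5, 6, 7, 8}}; // two 4-byte values
        int staged = 0;
        for (byte[] v : values) {
          for (int s = 0; s < elementSize; s++) {
            batchBufs[s][staged] = v[s]; // stage byte s of this value
          }
          if (++staged == BATCH_SIZE) { // flush a full batch
            for (int s = 0; s < elementSize; s++) {
              streams[s].write(batchBufs[s], 0, staged);
            }
            staged = 0;
          }
        }
        for (int s = 0; s < elementSize; s++) { // flush the tail
          streams[s].write(batchBufs[s], 0, staged);
        }
        System.out.println(streams[0].size()); // 2: one byte per value per stream
      }
    }
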
--- .../ByteStreamSplitValuesWriter.java | 74 ++++++++++++++++++- 1 file changed, 72 insertions(+), 2 deletions(-) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesWriter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesWriter.java index c197a4fd6f..e62126ed4d 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesWriter.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesWriter.java @@ -29,9 +29,15 @@ public abstract class ByteStreamSplitValuesWriter extends ValuesWriter { + /** + * Batch size for buffered scatter writes. Values are accumulated in a batch buffer + * and flushed as bulk {@code write(byte[], off, len)} calls to each stream. + */ + private static final int BATCH_SIZE = 64; + protected final int numStreams; protected final int elementSizeInBytes; - private final CapacityByteArrayOutputStream[] byteStreams; + protected final CapacityByteArrayOutputStream[] byteStreams; public ByteStreamSplitValuesWriter( int elementSizeInBytes, int initialCapacity, int pageSize, ByteBufferAllocator allocator) { @@ -176,6 +182,8 @@ public String memUsageString(String prefix) { public static class FixedLenByteArrayByteStreamSplitValuesWriter extends ByteStreamSplitValuesWriter { private final int length; + private byte[][] batchBufs; // [stream][batchIndex] scratch buffers + private int flbaBatchCount; public FixedLenByteArrayByteStreamSplitValuesWriter( int length, int initialCapacity, int pageSize, ByteBufferAllocator allocator) { @@ -187,7 +195,69 @@ public FixedLenByteArrayByteStreamSplitValuesWriter( public final void writeBytes(Binary v) { assert (v.length() == length) : ("Fixed Binary size " + v.length() + " does not match field type length " + length); - super.scatterBytes(v.getBytesUnsafe()); + if (batchBufs == null) { + batchBufs = new byte[length][BATCH_SIZE]; + } + byte[] bytes = v.getBytesUnsafe(); + for (int stream = 0; stream < length; stream++) { + batchBufs[stream][flbaBatchCount] = bytes[stream]; + } + flbaBatchCount++; + if (flbaBatchCount == BATCH_SIZE) { + flushFlbaBatch(); + } + } + + @Override + public void writeBinaries(Binary[] values, int offset, int len) { + if (batchBufs == null) { + batchBufs = new byte[length][BATCH_SIZE]; + } + for (int i = offset; i < offset + len; i++) { + Binary v = values[i]; + assert (v.length() == length) + : ("Fixed Binary size " + v.length() + " does not match field type length " + length); + byte[] bytes = v.getBytesUnsafe(); + for (int stream = 0; stream < length; stream++) { + batchBufs[stream][flbaBatchCount] = bytes[stream]; + } + flbaBatchCount++; + if (flbaBatchCount == BATCH_SIZE) { + flushFlbaBatch(); + } + } + } + + private void flushFlbaBatch() { + if (flbaBatchCount == 0) return; + final int count = flbaBatchCount; + for (int stream = 0; stream < length; stream++) { + byteStreams[stream].write(batchBufs[stream], 0, count); + } + flbaBatchCount = 0; + } + + @Override + public BytesInput getBytes() { + flushFlbaBatch(); + return super.getBytes(); + } + + @Override + public void reset() { + flbaBatchCount = 0; + super.reset(); + } + + @Override + public void close() { + flbaBatchCount = 0; + super.close(); + } + + @Override + public long getBufferedSize() { + return super.getBufferedSize() + (long) flbaBatchCount * length; } @Override