From 32b4bd09d84330420aa33d4c0212a7e7b07f97d4 Mon Sep 17 00:00:00 2001 From: g Date: Wed, 6 May 2026 12:15:52 +0200 Subject: [PATCH 1/3] fixed parquet metadata error --- .../DeltaBinaryPackedColumnChunkBuilder.php | 8 ++++ .../PlainFlatColumnChunkBuilder.php | 8 ++++ .../RLEDictionaryChunkBuilder.php | 8 ++++ .../Flow/Parquet/Writer/StatisticsCounter.php | 10 +++++ .../PlainFlatColumnChunkBuilderTest.php | 34 +++++++++++++++++ .../Unit/Writer/StatisticsCounterTest.php | 37 +++++++++++++++++++ 6 files changed, 105 insertions(+) diff --git a/src/lib/parquet/src/Flow/Parquet/Writer/ColumnChunkBuilder/DeltaBinaryPackedColumnChunkBuilder.php b/src/lib/parquet/src/Flow/Parquet/Writer/ColumnChunkBuilder/DeltaBinaryPackedColumnChunkBuilder.php index cc485defe..45304b843 100644 --- a/src/lib/parquet/src/Flow/Parquet/Writer/ColumnChunkBuilder/DeltaBinaryPackedColumnChunkBuilder.php +++ b/src/lib/parquet/src/Flow/Parquet/Writer/ColumnChunkBuilder/DeltaBinaryPackedColumnChunkBuilder.php @@ -73,9 +73,12 @@ public function addColumn(WriteFlatColumnValues $columnValues) : void $maxDefinitionLevel = $this->column->maxDefinitionsLevel(); + $nullsInBatch = 0; + foreach ($defLevels as $definitionLevel) { if ($definitionLevel < $maxDefinitionLevel) { $this->nullCount++; + $nullsInBatch++; } else { $this->nonNullValuesCount++; } @@ -83,6 +86,11 @@ public function addColumn(WriteFlatColumnValues $columnValues) : void $this->valueStorage->addValues($this->column, $columnValues->values()); $this->pageStatistics->addBatch($columnValues->values()); + + if ($nullsInBatch > 0) { + $this->pageStatistics->addNulls($nullsInBatch); + } + $this->rowsCount += $columnValues->rowsCount(); } diff --git a/src/lib/parquet/src/Flow/Parquet/Writer/ColumnChunkBuilder/PlainFlatColumnChunkBuilder.php b/src/lib/parquet/src/Flow/Parquet/Writer/ColumnChunkBuilder/PlainFlatColumnChunkBuilder.php index 3ab01a0cf..796e6c55a 100644 --- a/src/lib/parquet/src/Flow/Parquet/Writer/ColumnChunkBuilder/PlainFlatColumnChunkBuilder.php +++ b/src/lib/parquet/src/Flow/Parquet/Writer/ColumnChunkBuilder/PlainFlatColumnChunkBuilder.php @@ -71,9 +71,12 @@ public function addColumn(WriteFlatColumnValues $columnValues) : void $maxDefinitionLevel = $this->column->maxDefinitionsLevel(); + $nullsInBatch = 0; + foreach ($defLevels as $definitionLevel) { if ($definitionLevel < $maxDefinitionLevel) { $this->nullCount++; + $nullsInBatch++; } else { $this->nonNullValuesCount++; } @@ -81,6 +84,11 @@ public function addColumn(WriteFlatColumnValues $columnValues) : void $this->valueStorage->addValues($this->column, $columnValues->values()); $this->pageStatistics->addBatch($columnValues->values()); + + if ($nullsInBatch > 0) { + $this->pageStatistics->addNulls($nullsInBatch); + } + $this->rowsCount += $columnValues->rowsCount(); } diff --git a/src/lib/parquet/src/Flow/Parquet/Writer/ColumnChunkBuilder/RLEDictionaryChunkBuilder.php b/src/lib/parquet/src/Flow/Parquet/Writer/ColumnChunkBuilder/RLEDictionaryChunkBuilder.php index 0e24284cf..e2b4a1371 100644 --- a/src/lib/parquet/src/Flow/Parquet/Writer/ColumnChunkBuilder/RLEDictionaryChunkBuilder.php +++ b/src/lib/parquet/src/Flow/Parquet/Writer/ColumnChunkBuilder/RLEDictionaryChunkBuilder.php @@ -70,14 +70,22 @@ public function addColumn(WriteFlatColumnValues $columnValues) : void $maxDefinitionLevel = $this->column->maxDefinitionsLevel(); + $nullsInBatch = 0; + foreach ($defLevels as $definitionLevel) { if ($definitionLevel < $maxDefinitionLevel) { $this->nullCount++; + $nullsInBatch++; } } array_push($this->pageValues, ...$columnValues->values()); $this->pageStatistics->addBatch($columnValues->values()); + + if ($nullsInBatch > 0) { + $this->pageStatistics->addNulls($nullsInBatch); + } + $this->rowsCount += $columnValues->rowsCount(); } diff --git a/src/lib/parquet/src/Flow/Parquet/Writer/StatisticsCounter.php b/src/lib/parquet/src/Flow/Parquet/Writer/StatisticsCounter.php index f836b6b0a..ebd565ada 100644 --- a/src/lib/parquet/src/Flow/Parquet/Writer/StatisticsCounter.php +++ b/src/lib/parquet/src/Flow/Parquet/Writer/StatisticsCounter.php @@ -105,6 +105,16 @@ public function addBatch(array $values) : void } } + public function addNulls(int $count) : void + { + if ($count < 0) { + throw new InvalidArgumentException('Null count cannot be negative.'); + } + + $this->nullCount += $count; + $this->valuesCount += $count; + } + public function max() : mixed { return $this->max; diff --git a/src/lib/parquet/tests/Flow/Parquet/Tests/Unit/Writer/ColumnChunkBuilder/PlainFlatColumnChunkBuilderTest.php b/src/lib/parquet/tests/Flow/Parquet/Tests/Unit/Writer/ColumnChunkBuilder/PlainFlatColumnChunkBuilderTest.php index 429d42ed2..521efef1a 100644 --- a/src/lib/parquet/tests/Flow/Parquet/Tests/Unit/Writer/ColumnChunkBuilder/PlainFlatColumnChunkBuilderTest.php +++ b/src/lib/parquet/tests/Flow/Parquet/Tests/Unit/Writer/ColumnChunkBuilder/PlainFlatColumnChunkBuilderTest.php @@ -776,4 +776,38 @@ public function test_workflow_with_all_null_values() : void self::assertCount(1, $containers); self::assertSame(5, $containers[0]->columnChunk->valuesCount()); } + + public function test_statistics_track_null_count_for_all_null_chunk() : void + { + $column = FlatColumn::string('all_null'); + $options = new Options(); + $compression = Compressions::UNCOMPRESSED; + $builder = new PlainFlatColumnChunkBuilder($column, $options, $compression); + + $builder->addColumn(new WriteFlatColumnValues($column, [0, 0, 0], [0, 0, 0], [])); + + $statistics = $builder->flush(0)[0]->columnChunk->statistics(); + + self::assertNotNull($statistics); + self::assertSame(3, $statistics->nullCount()); + self::assertNull($statistics->min($column)); + self::assertNull($statistics->max($column)); + } + + public function test_statistics_track_null_count_for_mixed_chunk() : void + { + $column = FlatColumn::string('mixed'); + $options = new Options(); + $compression = Compressions::UNCOMPRESSED; + $builder = new PlainFlatColumnChunkBuilder($column, $options, $compression); + + $builder->addColumn(new WriteFlatColumnValues($column, [0, 0, 0], [1, 0, 1], ['x', 'z'])); + + $statistics = $builder->flush(0)[0]->columnChunk->statistics(); + + self::assertNotNull($statistics); + self::assertSame(1, $statistics->nullCount()); + self::assertSame('x', $statistics->min($column)); + self::assertSame('z', $statistics->max($column)); + } } diff --git a/src/lib/parquet/tests/Flow/Parquet/Tests/Unit/Writer/StatisticsCounterTest.php b/src/lib/parquet/tests/Flow/Parquet/Tests/Unit/Writer/StatisticsCounterTest.php index 5873cdf63..9aae531f9 100644 --- a/src/lib/parquet/tests/Flow/Parquet/Tests/Unit/Writer/StatisticsCounterTest.php +++ b/src/lib/parquet/tests/Flow/Parquet/Tests/Unit/Writer/StatisticsCounterTest.php @@ -130,6 +130,43 @@ public function test_add_null_value() : void self::assertNull($statistics->max()); } + public function test_add_nulls_increments_null_and_values_count() : void + { + $column = FlatColumn::string('test_column'); + $statistics = new StatisticsCounter($column); + + $statistics->add('hello'); + $statistics->addNulls(3); + + self::assertSame(3, $statistics->nullCount()); + self::assertSame(4, $statistics->valuesCount()); + self::assertSame(1, $statistics->notNullCount()); + self::assertSame('hello', $statistics->min()); + self::assertSame('hello', $statistics->max()); + } + + public function test_add_nulls_with_zero_is_noop() : void + { + $column = FlatColumn::string('test_column'); + $statistics = new StatisticsCounter($column); + + $statistics->addNulls(0); + + self::assertSame(0, $statistics->nullCount()); + self::assertSame(0, $statistics->valuesCount()); + } + + public function test_add_nulls_with_negative_throws() : void + { + $column = FlatColumn::string('test_column'); + $statistics = new StatisticsCounter($column); + + $this->expectException(InvalidArgumentException::class); + $this->expectExceptionMessage('Null count cannot be negative.'); + + $statistics->addNulls(-1); + } + public function test_add_object_value() : void { $column = FlatColumn::string('test_column'); From 95a2dac9fb5c04534499e3ef034c326784e9b7e2 Mon Sep 17 00:00:00 2001 From: g Date: Fri, 8 May 2026 10:38:33 +0200 Subject: [PATCH 2/3] fix statics --- .../PlainFlatColumnChunkBuilderTest.php | 68 +++++++++---------- .../Unit/Writer/StatisticsCounterTest.php | 16 ++--- 2 files changed, 42 insertions(+), 42 deletions(-) diff --git a/src/lib/parquet/tests/Flow/Parquet/Tests/Unit/Writer/ColumnChunkBuilder/PlainFlatColumnChunkBuilderTest.php b/src/lib/parquet/tests/Flow/Parquet/Tests/Unit/Writer/ColumnChunkBuilder/PlainFlatColumnChunkBuilderTest.php index 521efef1a..c0f9003f2 100644 --- a/src/lib/parquet/tests/Flow/Parquet/Tests/Unit/Writer/ColumnChunkBuilder/PlainFlatColumnChunkBuilderTest.php +++ b/src/lib/parquet/tests/Flow/Parquet/Tests/Unit/Writer/ColumnChunkBuilder/PlainFlatColumnChunkBuilderTest.php @@ -637,6 +637,40 @@ public function test_statistics_are_generated() : void self::assertNotNull($statistics); } + public function test_statistics_track_null_count_for_all_null_chunk() : void + { + $column = FlatColumn::string('all_null'); + $options = new Options(); + $compression = Compressions::UNCOMPRESSED; + $builder = new PlainFlatColumnChunkBuilder($column, $options, $compression); + + $builder->addColumn(new WriteFlatColumnValues($column, [0, 0, 0], [0, 0, 0], [])); + + $statistics = $builder->flush(0)[0]->columnChunk->statistics(); + + self::assertNotNull($statistics); + self::assertSame(3, $statistics->nullCount()); + self::assertNull($statistics->min($column)); + self::assertNull($statistics->max($column)); + } + + public function test_statistics_track_null_count_for_mixed_chunk() : void + { + $column = FlatColumn::string('mixed'); + $options = new Options(); + $compression = Compressions::UNCOMPRESSED; + $builder = new PlainFlatColumnChunkBuilder($column, $options, $compression); + + $builder->addColumn(new WriteFlatColumnValues($column, [0, 0, 0], [1, 0, 1], ['x', 'z'])); + + $statistics = $builder->flush(0)[0]->columnChunk->statistics(); + + self::assertNotNull($statistics); + self::assertSame(1, $statistics->nullCount()); + self::assertSame('x', $statistics->min($column)); + self::assertSame('z', $statistics->max($column)); + } + public function test_uncompressed_size_accumulates_multiple_pages() : void { $column = new FlatColumn('test_col', PhysicalType::INT32); @@ -776,38 +810,4 @@ public function test_workflow_with_all_null_values() : void self::assertCount(1, $containers); self::assertSame(5, $containers[0]->columnChunk->valuesCount()); } - - public function test_statistics_track_null_count_for_all_null_chunk() : void - { - $column = FlatColumn::string('all_null'); - $options = new Options(); - $compression = Compressions::UNCOMPRESSED; - $builder = new PlainFlatColumnChunkBuilder($column, $options, $compression); - - $builder->addColumn(new WriteFlatColumnValues($column, [0, 0, 0], [0, 0, 0], [])); - - $statistics = $builder->flush(0)[0]->columnChunk->statistics(); - - self::assertNotNull($statistics); - self::assertSame(3, $statistics->nullCount()); - self::assertNull($statistics->min($column)); - self::assertNull($statistics->max($column)); - } - - public function test_statistics_track_null_count_for_mixed_chunk() : void - { - $column = FlatColumn::string('mixed'); - $options = new Options(); - $compression = Compressions::UNCOMPRESSED; - $builder = new PlainFlatColumnChunkBuilder($column, $options, $compression); - - $builder->addColumn(new WriteFlatColumnValues($column, [0, 0, 0], [1, 0, 1], ['x', 'z'])); - - $statistics = $builder->flush(0)[0]->columnChunk->statistics(); - - self::assertNotNull($statistics); - self::assertSame(1, $statistics->nullCount()); - self::assertSame('x', $statistics->min($column)); - self::assertSame('z', $statistics->max($column)); - } } diff --git a/src/lib/parquet/tests/Flow/Parquet/Tests/Unit/Writer/StatisticsCounterTest.php b/src/lib/parquet/tests/Flow/Parquet/Tests/Unit/Writer/StatisticsCounterTest.php index 9aae531f9..bcd202adc 100644 --- a/src/lib/parquet/tests/Flow/Parquet/Tests/Unit/Writer/StatisticsCounterTest.php +++ b/src/lib/parquet/tests/Flow/Parquet/Tests/Unit/Writer/StatisticsCounterTest.php @@ -145,26 +145,26 @@ public function test_add_nulls_increments_null_and_values_count() : void self::assertSame('hello', $statistics->max()); } - public function test_add_nulls_with_zero_is_noop() : void + public function test_add_nulls_with_negative_throws() : void { $column = FlatColumn::string('test_column'); $statistics = new StatisticsCounter($column); - $statistics->addNulls(0); + $this->expectException(InvalidArgumentException::class); + $this->expectExceptionMessage('Null count cannot be negative.'); - self::assertSame(0, $statistics->nullCount()); - self::assertSame(0, $statistics->valuesCount()); + $statistics->addNulls(-1); } - public function test_add_nulls_with_negative_throws() : void + public function test_add_nulls_with_zero_is_noop() : void { $column = FlatColumn::string('test_column'); $statistics = new StatisticsCounter($column); - $this->expectException(InvalidArgumentException::class); - $this->expectExceptionMessage('Null count cannot be negative.'); + $statistics->addNulls(0); - $statistics->addNulls(-1); + self::assertSame(0, $statistics->nullCount()); + self::assertSame(0, $statistics->valuesCount()); } public function test_add_object_value() : void From 0f5f3393a224fe7b5c42dabb200c6e6ef686d368 Mon Sep 17 00:00:00 2001 From: g Date: Fri, 8 May 2026 10:51:58 +0200 Subject: [PATCH 3/3] added smoke test to test null counts after writing and reading a parquet file --- .../Tests/Integration/IO/WriterTest.php | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/src/lib/parquet/tests/Flow/Parquet/Tests/Integration/IO/WriterTest.php b/src/lib/parquet/tests/Flow/Parquet/Tests/Integration/IO/WriterTest.php index b71195017..29959f98a 100644 --- a/src/lib/parquet/tests/Flow/Parquet/Tests/Integration/IO/WriterTest.php +++ b/src/lib/parquet/tests/Flow/Parquet/Tests/Integration/IO/WriterTest.php @@ -141,6 +141,38 @@ public function test_writing_column_statistics(ParquetEngine $engine) : void \unlink($path); } + #[DataProvider('engine_provider')] + public function test_writing_column_statistics_with_null_values(ParquetEngine $engine) : void + { + $schema = Schema::with( + FlatColumn::string('all_null'), + FlatColumn::string('all_string'), + FlatColumn::string('mixed'), + ); + + $rows = [ + ['all_null' => null, 'all_string' => 'a', 'mixed' => 'x'], + ['all_null' => null, 'all_string' => 'b', 'mixed' => null], + ['all_null' => null, 'all_string' => 'c', 'mixed' => 'z'], + ]; + + $path = __DIR__ . '/var/test-writer-parquet-null-stats-' . generate_random_string() . '.parquet'; + + (new Writer(engine: $engine))->write($path, $schema, $rows); + + $chunks = []; + + foreach ((new Reader(engine: $engine))->read($path)->metadata()->columnChunks() as $chunk) { + $chunks[$chunk->flatPath()] = $chunk; + } + + static::assertSame(3, $chunks['all_null']->statistics()->nullCount()); + static::assertSame(0, $chunks['all_string']->statistics()->nullCount()); + static::assertSame(1, $chunks['mixed']->statistics()->nullCount()); + + \unlink($path); + } + public function test_writing_data_page_v2_statistics() : void { $options = Options::default()->set(Option::WRITER_VERSION, 2);