# Patch summary: record null counts in Parquet page/column statistics.
#
# Production changes
#   * DeltaBinaryPackedColumnChunkBuilder::addColumn(),
#     PlainFlatColumnChunkBuilder::addColumn(),
#     RLEDictionaryChunkBuilder::addColumn():
#     while walking the definition levels, every level below
#     $this->column->maxDefinitionsLevel() is now also tallied in a local
#     $nullsInBatch counter (in addition to the existing $this->nullCount++).
#     After $this->pageStatistics->addBatch($columnValues->values()) the tally
#     is forwarded through $this->pageStatistics->addNulls($nullsInBatch), but
#     only when it is greater than zero.
#   * StatisticsCounter::addNulls(int $count): new public method. Throws
#     InvalidArgumentException('Null count cannot be negative.') for a negative
#     argument, otherwise adds $count to both $this->nullCount and
#     $this->valuesCount. It does not touch min/max.
#
# Test changes
#   * WriterTest::test_writing_column_statistics_with_null_values(): writes
#     three rows with columns all_null / all_string / mixed and asserts the
#     column chunk statistics read back report nullCount() of 3 / 0 / 1.
#   * PlainFlatColumnChunkBuilderTest: two cases (all-null chunk, mixed chunk)
#     asserting nullCount(), min() and max() of the flushed chunk statistics.
#     In both cases the values array passed to WriteFlatColumnValues holds only
#     the non-null entries ([] and ['x', 'z']), i.e. nulls are expressed by the
#     definition levels alone.
#   * StatisticsCounterTest: addNulls() after add('hello'), addNulls(-1)
#     throwing, and addNulls(0) leaving both counters at 0.
#
# NOTE(review): builder-level unit tests are added here only for
#   PlainFlatColumnChunkBuilder; whether the DeltaBinaryPacked and
#   RLEDictionary builders are exercised by the integration test depends on
#   writer defaults not visible in this patch - TODO confirm.
# NOTE(review): the new integration test calls \unlink($path) after the
#   assertions, so a failing assertion leaves the temp file behind. This
#   mirrors the neighbouring test visible in the hunk context.
# NOTE(review): this patch was received with its newlines collapsed. Line
#   breaks, blank lines and indentation below were reconstructed from the hunk
#   header line counts and the usual 4-space PHP indent - verify the context
#   lines against the target files before applying.
diff --git a/src/lib/parquet/src/Flow/Parquet/Writer/ColumnChunkBuilder/DeltaBinaryPackedColumnChunkBuilder.php b/src/lib/parquet/src/Flow/Parquet/Writer/ColumnChunkBuilder/DeltaBinaryPackedColumnChunkBuilder.php
index cc485defe..45304b843 100644
--- a/src/lib/parquet/src/Flow/Parquet/Writer/ColumnChunkBuilder/DeltaBinaryPackedColumnChunkBuilder.php
+++ b/src/lib/parquet/src/Flow/Parquet/Writer/ColumnChunkBuilder/DeltaBinaryPackedColumnChunkBuilder.php
@@ -73,9 +73,12 @@ public function addColumn(WriteFlatColumnValues $columnValues) : void
 
         $maxDefinitionLevel = $this->column->maxDefinitionsLevel();
 
+        $nullsInBatch = 0;
+
         foreach ($defLevels as $definitionLevel) {
             if ($definitionLevel < $maxDefinitionLevel) {
                 $this->nullCount++;
+                $nullsInBatch++;
             } else {
                 $this->nonNullValuesCount++;
             }
@@ -83,6 +86,11 @@ public function addColumn(WriteFlatColumnValues $columnValues) : void
 
         $this->valueStorage->addValues($this->column, $columnValues->values());
         $this->pageStatistics->addBatch($columnValues->values());
+
+        if ($nullsInBatch > 0) {
+            $this->pageStatistics->addNulls($nullsInBatch);
+        }
+
         $this->rowsCount += $columnValues->rowsCount();
     }
 
diff --git a/src/lib/parquet/src/Flow/Parquet/Writer/ColumnChunkBuilder/PlainFlatColumnChunkBuilder.php b/src/lib/parquet/src/Flow/Parquet/Writer/ColumnChunkBuilder/PlainFlatColumnChunkBuilder.php
index 3ab01a0cf..796e6c55a 100644
--- a/src/lib/parquet/src/Flow/Parquet/Writer/ColumnChunkBuilder/PlainFlatColumnChunkBuilder.php
+++ b/src/lib/parquet/src/Flow/Parquet/Writer/ColumnChunkBuilder/PlainFlatColumnChunkBuilder.php
@@ -71,9 +71,12 @@ public function addColumn(WriteFlatColumnValues $columnValues) : void
 
         $maxDefinitionLevel = $this->column->maxDefinitionsLevel();
 
+        $nullsInBatch = 0;
+
         foreach ($defLevels as $definitionLevel) {
             if ($definitionLevel < $maxDefinitionLevel) {
                 $this->nullCount++;
+                $nullsInBatch++;
             } else {
                 $this->nonNullValuesCount++;
             }
@@ -81,6 +84,11 @@ public function addColumn(WriteFlatColumnValues $columnValues) : void
 
         $this->valueStorage->addValues($this->column, $columnValues->values());
         $this->pageStatistics->addBatch($columnValues->values());
+
+        if ($nullsInBatch > 0) {
+            $this->pageStatistics->addNulls($nullsInBatch);
+        }
+
         $this->rowsCount += $columnValues->rowsCount();
     }
 
diff --git a/src/lib/parquet/src/Flow/Parquet/Writer/ColumnChunkBuilder/RLEDictionaryChunkBuilder.php b/src/lib/parquet/src/Flow/Parquet/Writer/ColumnChunkBuilder/RLEDictionaryChunkBuilder.php
index 0e24284cf..e2b4a1371 100644
--- a/src/lib/parquet/src/Flow/Parquet/Writer/ColumnChunkBuilder/RLEDictionaryChunkBuilder.php
+++ b/src/lib/parquet/src/Flow/Parquet/Writer/ColumnChunkBuilder/RLEDictionaryChunkBuilder.php
@@ -70,14 +70,22 @@ public function addColumn(WriteFlatColumnValues $columnValues) : void
 
         $maxDefinitionLevel = $this->column->maxDefinitionsLevel();
 
+        $nullsInBatch = 0;
+
         foreach ($defLevels as $definitionLevel) {
             if ($definitionLevel < $maxDefinitionLevel) {
                 $this->nullCount++;
+                $nullsInBatch++;
             }
         }
 
         array_push($this->pageValues, ...$columnValues->values());
         $this->pageStatistics->addBatch($columnValues->values());
+
+        if ($nullsInBatch > 0) {
+            $this->pageStatistics->addNulls($nullsInBatch);
+        }
+
         $this->rowsCount += $columnValues->rowsCount();
     }
 
diff --git a/src/lib/parquet/src/Flow/Parquet/Writer/StatisticsCounter.php b/src/lib/parquet/src/Flow/Parquet/Writer/StatisticsCounter.php
index f836b6b0a..ebd565ada 100644
--- a/src/lib/parquet/src/Flow/Parquet/Writer/StatisticsCounter.php
+++ b/src/lib/parquet/src/Flow/Parquet/Writer/StatisticsCounter.php
@@ -105,6 +105,16 @@ public function addBatch(array $values) : void
         }
     }
 
+    public function addNulls(int $count) : void
+    {
+        if ($count < 0) {
+            throw new InvalidArgumentException('Null count cannot be negative.');
+        }
+
+        $this->nullCount += $count;
+        $this->valuesCount += $count;
+    }
+
     public function max() : mixed
     {
         return $this->max;
diff --git a/src/lib/parquet/tests/Flow/Parquet/Tests/Integration/IO/WriterTest.php b/src/lib/parquet/tests/Flow/Parquet/Tests/Integration/IO/WriterTest.php
index b71195017..29959f98a 100644
--- a/src/lib/parquet/tests/Flow/Parquet/Tests/Integration/IO/WriterTest.php
+++ b/src/lib/parquet/tests/Flow/Parquet/Tests/Integration/IO/WriterTest.php
@@ -141,6 +141,38 @@ public function test_writing_column_statistics(ParquetEngine $engine) : void
         \unlink($path);
     }
 
+    #[DataProvider('engine_provider')]
+    public function test_writing_column_statistics_with_null_values(ParquetEngine $engine) : void
+    {
+        $schema = Schema::with(
+            FlatColumn::string('all_null'),
+            FlatColumn::string('all_string'),
+            FlatColumn::string('mixed'),
+        );
+
+        $rows = [
+            ['all_null' => null, 'all_string' => 'a', 'mixed' => 'x'],
+            ['all_null' => null, 'all_string' => 'b', 'mixed' => null],
+            ['all_null' => null, 'all_string' => 'c', 'mixed' => 'z'],
+        ];
+
+        $path = __DIR__ . '/var/test-writer-parquet-null-stats-' . generate_random_string() . '.parquet';
+
+        (new Writer(engine: $engine))->write($path, $schema, $rows);
+
+        $chunks = [];
+
+        foreach ((new Reader(engine: $engine))->read($path)->metadata()->columnChunks() as $chunk) {
+            $chunks[$chunk->flatPath()] = $chunk;
+        }
+
+        static::assertSame(3, $chunks['all_null']->statistics()->nullCount());
+        static::assertSame(0, $chunks['all_string']->statistics()->nullCount());
+        static::assertSame(1, $chunks['mixed']->statistics()->nullCount());
+
+        \unlink($path);
+    }
+
     public function test_writing_data_page_v2_statistics() : void
     {
         $options = Options::default()->set(Option::WRITER_VERSION, 2);
diff --git a/src/lib/parquet/tests/Flow/Parquet/Tests/Unit/Writer/ColumnChunkBuilder/PlainFlatColumnChunkBuilderTest.php b/src/lib/parquet/tests/Flow/Parquet/Tests/Unit/Writer/ColumnChunkBuilder/PlainFlatColumnChunkBuilderTest.php
index 429d42ed2..c0f9003f2 100644
--- a/src/lib/parquet/tests/Flow/Parquet/Tests/Unit/Writer/ColumnChunkBuilder/PlainFlatColumnChunkBuilderTest.php
+++ b/src/lib/parquet/tests/Flow/Parquet/Tests/Unit/Writer/ColumnChunkBuilder/PlainFlatColumnChunkBuilderTest.php
@@ -637,6 +637,40 @@ public function test_statistics_are_generated() : void
         self::assertNotNull($statistics);
     }
 
+    public function test_statistics_track_null_count_for_all_null_chunk() : void
+    {
+        $column = FlatColumn::string('all_null');
+        $options = new Options();
+        $compression = Compressions::UNCOMPRESSED;
+        $builder = new PlainFlatColumnChunkBuilder($column, $options, $compression);
+
+        $builder->addColumn(new WriteFlatColumnValues($column, [0, 0, 0], [0, 0, 0], []));
+
+        $statistics = $builder->flush(0)[0]->columnChunk->statistics();
+
+        self::assertNotNull($statistics);
+        self::assertSame(3, $statistics->nullCount());
+        self::assertNull($statistics->min($column));
+        self::assertNull($statistics->max($column));
+    }
+
+    public function test_statistics_track_null_count_for_mixed_chunk() : void
+    {
+        $column = FlatColumn::string('mixed');
+        $options = new Options();
+        $compression = Compressions::UNCOMPRESSED;
+        $builder = new PlainFlatColumnChunkBuilder($column, $options, $compression);
+
+        $builder->addColumn(new WriteFlatColumnValues($column, [0, 0, 0], [1, 0, 1], ['x', 'z']));
+
+        $statistics = $builder->flush(0)[0]->columnChunk->statistics();
+
+        self::assertNotNull($statistics);
+        self::assertSame(1, $statistics->nullCount());
+        self::assertSame('x', $statistics->min($column));
+        self::assertSame('z', $statistics->max($column));
+    }
+
     public function test_uncompressed_size_accumulates_multiple_pages() : void
     {
         $column = new FlatColumn('test_col', PhysicalType::INT32);
diff --git a/src/lib/parquet/tests/Flow/Parquet/Tests/Unit/Writer/StatisticsCounterTest.php b/src/lib/parquet/tests/Flow/Parquet/Tests/Unit/Writer/StatisticsCounterTest.php
index 5873cdf63..bcd202adc 100644
--- a/src/lib/parquet/tests/Flow/Parquet/Tests/Unit/Writer/StatisticsCounterTest.php
+++ b/src/lib/parquet/tests/Flow/Parquet/Tests/Unit/Writer/StatisticsCounterTest.php
@@ -130,6 +130,43 @@ public function test_add_null_value() : void
         self::assertNull($statistics->max());
     }
 
+    public function test_add_nulls_increments_null_and_values_count() : void
+    {
+        $column = FlatColumn::string('test_column');
+        $statistics = new StatisticsCounter($column);
+
+        $statistics->add('hello');
+        $statistics->addNulls(3);
+
+        self::assertSame(3, $statistics->nullCount());
+        self::assertSame(4, $statistics->valuesCount());
+        self::assertSame(1, $statistics->notNullCount());
+        self::assertSame('hello', $statistics->min());
+        self::assertSame('hello', $statistics->max());
+    }
+
+    public function test_add_nulls_with_negative_throws() : void
+    {
+        $column = FlatColumn::string('test_column');
+        $statistics = new StatisticsCounter($column);
+
+        $this->expectException(InvalidArgumentException::class);
+        $this->expectExceptionMessage('Null count cannot be negative.');
+
+        $statistics->addNulls(-1);
+    }
+
+    public function test_add_nulls_with_zero_is_noop() : void
+    {
+        $column = FlatColumn::string('test_column');
+        $statistics = new StatisticsCounter($column);
+
+        $statistics->addNulls(0);
+
+        self::assertSame(0, $statistics->nullCount());
+        self::assertSame(0, $statistics->valuesCount());
+    }
+
     public function test_add_object_value() : void
     {
         $column = FlatColumn::string('test_column');