Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -73,16 +73,24 @@ public function addColumn(WriteFlatColumnValues $columnValues) : void

$maxDefinitionLevel = $this->column->maxDefinitionsLevel();

$nullsInBatch = 0;

foreach ($defLevels as $definitionLevel) {
if ($definitionLevel < $maxDefinitionLevel) {
$this->nullCount++;
$nullsInBatch++;
} else {
$this->nonNullValuesCount++;
}
}

$this->valueStorage->addValues($this->column, $columnValues->values());
$this->pageStatistics->addBatch($columnValues->values());

if ($nullsInBatch > 0) {
$this->pageStatistics->addNulls($nullsInBatch);
}

$this->rowsCount += $columnValues->rowsCount();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,16 +71,24 @@ public function addColumn(WriteFlatColumnValues $columnValues) : void

$maxDefinitionLevel = $this->column->maxDefinitionsLevel();

$nullsInBatch = 0;

foreach ($defLevels as $definitionLevel) {
if ($definitionLevel < $maxDefinitionLevel) {
$this->nullCount++;
$nullsInBatch++;
} else {
$this->nonNullValuesCount++;
}
}

$this->valueStorage->addValues($this->column, $columnValues->values());
$this->pageStatistics->addBatch($columnValues->values());

if ($nullsInBatch > 0) {
$this->pageStatistics->addNulls($nullsInBatch);
}

$this->rowsCount += $columnValues->rowsCount();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,14 +70,22 @@ public function addColumn(WriteFlatColumnValues $columnValues) : void

$maxDefinitionLevel = $this->column->maxDefinitionsLevel();

$nullsInBatch = 0;

foreach ($defLevels as $definitionLevel) {
if ($definitionLevel < $maxDefinitionLevel) {
$this->nullCount++;
$nullsInBatch++;
}
}

array_push($this->pageValues, ...$columnValues->values());
$this->pageStatistics->addBatch($columnValues->values());

if ($nullsInBatch > 0) {
$this->pageStatistics->addNulls($nullsInBatch);
}

$this->rowsCount += $columnValues->rowsCount();
}

Expand Down
10 changes: 10 additions & 0 deletions src/lib/parquet/src/Flow/Parquet/Writer/StatisticsCounter.php
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,16 @@ public function addBatch(array $values) : void
}
}

/**
 * Registers a batch of null entries in a single call.
 *
 * Both the null counter and the total values counter grow by `$count`,
 * so every recorded null is also counted as a value.
 *
 * @param int $count number of nulls to record; zero leaves both counters unchanged
 *
 * @throws InvalidArgumentException when `$count` is negative
 */
public function addNulls(int $count) : void
{
    if (0 > $count) {
        throw new InvalidArgumentException('Null count cannot be negative.');
    }

    $this->valuesCount += $count;
    $this->nullCount += $count;
}

public function max() : mixed
{
return $this->max;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,38 @@ public function test_writing_column_statistics(ParquetEngine $engine) : void
\unlink($path);
}

/**
 * Writes three string columns (every row null, no row null, one row null)
 * and checks the null count reported by each column chunk's statistics
 * after reading the file metadata back.
 */
#[DataProvider('engine_provider')]
public function test_writing_column_statistics_with_null_values(ParquetEngine $engine) : void
{
    $schema = Schema::with(
        FlatColumn::string('all_null'),
        FlatColumn::string('all_string'),
        FlatColumn::string('mixed'),
    );

    $rows = [
        ['all_null' => null, 'all_string' => 'a', 'mixed' => 'x'],
        ['all_null' => null, 'all_string' => 'b', 'mixed' => null],
        ['all_null' => null, 'all_string' => 'c', 'mixed' => 'z'],
    ];

    $path = __DIR__ . '/var/test-writer-parquet-null-stats-' . generate_random_string() . '.parquet';

    try {
        (new Writer(engine: $engine))->write($path, $schema, $rows);

        // Index the chunks by flat path so each column can be asserted by name.
        $chunks = [];

        foreach ((new Reader(engine: $engine))->read($path)->metadata()->columnChunks() as $chunk) {
            $chunks[$chunk->flatPath()] = $chunk;
        }

        static::assertSame(3, $chunks['all_null']->statistics()->nullCount());
        static::assertSame(0, $chunks['all_string']->statistics()->nullCount());
        static::assertSame(1, $chunks['mixed']->statistics()->nullCount());
    } finally {
        // Clean up even when the write or an assertion fails, so failed runs
        // do not leave stray parquet files behind in the var directory.
        if (\file_exists($path)) {
            \unlink($path);
        }
    }
}

public function test_writing_data_page_v2_statistics() : void
{
$options = Options::default()->set(Option::WRITER_VERSION, 2);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -637,6 +637,40 @@ public function test_statistics_are_generated() : void
self::assertNotNull($statistics);
}

/**
 * A chunk fed three entries and no values must report a null count of
 * three and expose neither a minimum nor a maximum.
 */
public function test_statistics_track_null_count_for_all_null_chunk() : void
{
    $column = FlatColumn::string('all_null');
    $builder = new PlainFlatColumnChunkBuilder($column, new Options(), Compressions::UNCOMPRESSED);

    // Both level arrays are all zeros and the values list is empty.
    $builder->addColumn(new WriteFlatColumnValues($column, [0, 0, 0], [0, 0, 0], []));

    $flushed = $builder->flush(0);
    $statistics = $flushed[0]->columnChunk->statistics();

    self::assertNotNull($statistics);
    self::assertSame(3, $statistics->nullCount());
    self::assertNull($statistics->min($column));
    self::assertNull($statistics->max($column));
}

/**
 * A chunk fed three entries, of which two carry values ('x' and 'z'),
 * must report a null count of one with 'x' as minimum and 'z' as maximum.
 */
public function test_statistics_track_null_count_for_mixed_chunk() : void
{
    $column = FlatColumn::string('mixed');
    $builder = new PlainFlatColumnChunkBuilder($column, new Options(), Compressions::UNCOMPRESSED);

    // The middle entry has level 0 in the third argument and no matching value.
    $builder->addColumn(new WriteFlatColumnValues($column, [0, 0, 0], [1, 0, 1], ['x', 'z']));

    $flushed = $builder->flush(0);
    $statistics = $flushed[0]->columnChunk->statistics();

    self::assertNotNull($statistics);
    self::assertSame(1, $statistics->nullCount());
    self::assertSame('x', $statistics->min($column));
    self::assertSame('z', $statistics->max($column));
}

public function test_uncompressed_size_accumulates_multiple_pages() : void
{
$column = new FlatColumn('test_col', PhysicalType::INT32);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,43 @@ public function test_add_null_value() : void
self::assertNull($statistics->max());
}

/**
 * After one real value and three bulk-added nulls the counter must report
 * 3 nulls, 4 values in total, 1 non-null value, and min/max equal to the
 * single real value.
 */
public function test_add_nulls_increments_null_and_values_count() : void
{
    $counter = new StatisticsCounter(FlatColumn::string('test_column'));

    $counter->add('hello');
    $counter->addNulls(3);

    self::assertSame(3, $counter->nullCount());
    self::assertSame(4, $counter->valuesCount());
    self::assertSame(1, $counter->notNullCount());
    self::assertSame('hello', $counter->min());
    self::assertSame('hello', $counter->max());
}

/**
 * Passing a negative count to addNulls() must be rejected with an
 * InvalidArgumentException carrying the documented message.
 */
public function test_add_nulls_with_negative_throws() : void
{
    $counter = new StatisticsCounter(FlatColumn::string('test_column'));

    // Expectations are registered before the call that should throw.
    $this->expectException(InvalidArgumentException::class);
    $this->expectExceptionMessage('Null count cannot be negative.');

    $counter->addNulls(-1);
}

/**
 * Adding zero nulls to a fresh counter must leave both the null count
 * and the values count at zero.
 */
public function test_add_nulls_with_zero_is_noop() : void
{
    $counter = new StatisticsCounter(FlatColumn::string('test_column'));

    $counter->addNulls(0);

    self::assertSame(0, $counter->nullCount());
    self::assertSame(0, $counter->valuesCount());
}

public function test_add_object_value() : void
{
$column = FlatColumn::string('test_column');
Expand Down
Loading