From f8acc50a160db6f2ef6d2785d9ed67c943ef90af Mon Sep 17 00:00:00 2001 From: JunRuiLee Date: Sat, 4 Jul 2026 16:56:47 +0800 Subject: [PATCH 1/4] feat(core): shared exact residual predicate evaluator (arrow::residual) Consolidate the two near-duplicate Arrow-batch predicate evaluators (one private to the Parquet reader, one in the Vortex reader) into a shared arrow::residual module, and make it faithful to Paimon's Datum comparison semantics: - Complete leaf dispatch incl. startsWith/endsWith/contains/like/between, so those operators are evaluated exactly wherever the residual runs (the Vortex copy previously deferred them). - Resolve each predicate column by name against the batch schema (a reader may emit columns in file-schema order, e.g. ORC ProjectionMask::named_roots), with a debug_assert guarding a predicate column missing from the scanned batch. - Binary/VarBinary comparisons (ordering and Between) use Java signed-byte order (java_bytes_cmp) to match Datum::Bytes (0xFF < 0x00), not Arrow's unsigned comparison. Between delegates to the shared comparison path so it cannot diverge. - Decimal value comparisons are evaluated row-wise via datum_cmp, exact across scales (e.g. d > 1.05 on a DECIMAL(_,1) column == d >= 1.1). - Casts a decoded column up to the predicate's declared type before comparison, so a promoted INT->BIGINT read with an out-of-range literal yields no rows rather than an error. - An unconvertible literal errors rather than fail-open to all-rows. - Parquet keeps its RowFilter fast path and adds a residual backstop when a predicate is not fully enforced (Or/Not/unsupported leaves); Vortex reuses the shared helpers. Narrows the FormatFileReader contract to per-format exactness. --- crates/paimon/src/arrow/format/mod.rs | 15 +- crates/paimon/src/arrow/format/parquet.rs | 810 ++++++----- crates/paimon/src/arrow/format/vortex.rs | 509 +------ crates/paimon/src/arrow/mod.rs | 1 + crates/paimon/src/arrow/residual.rs | 1590 +++++++++++++++++++++ crates/paimon/src/spec/mod.rs | 1 + crates/paimon/src/spec/predicate.rs | 2 +- 7 files changed, 2031 insertions(+), 897 deletions(-) create mode 100644 crates/paimon/src/arrow/residual.rs diff --git a/crates/paimon/src/arrow/format/mod.rs b/crates/paimon/src/arrow/format/mod.rs index cec1be93..b3e33c4a 100644 --- a/crates/paimon/src/arrow/format/mod.rs +++ b/crates/paimon/src/arrow/format/mod.rs @@ -54,8 +54,19 @@ pub(crate) struct FilePredicates { /// - Row range selection #[async_trait] pub(crate) trait FormatFileReader: Send + Sync { - /// Read a single data file, returning a stream of RecordBatches - /// containing only the projected columns (using names from the file's schema). + /// Read a single data file, returning a stream of RecordBatches containing + /// at least the projected columns (using names from the file's schema). A + /// reader MAY include extra columns it needed to scan (e.g. predicate columns + /// for residual filtering); the caller (`DataFileReader`) projects to the + /// requested output by name, so extra columns are harmless. + /// + /// Predicate exactness is per-format, NOT a blanket guarantee: + /// - Parquet, ORC, Avro, Row, and Vortex apply the predicate **exactly** — + /// each emitted batch contains only rows matching the pushed-down predicate + /// (native pushdown for pruning + a row-level residual pass for the rest). + /// - Blob does not evaluate predicates at all; Mosaic applies only + /// stats-level (row-group) pruning. For those, non-matching rows may + /// survive and the caller must not assume exactness. /// /// `row_selection` is a pre-merged list of 0-based inclusive row ranges /// (DV + row_ranges already combined by the caller). diff --git a/crates/paimon/src/arrow/format/parquet.rs b/crates/paimon/src/arrow/format/parquet.rs index 982f518a..25a4d339 100644 --- a/crates/paimon/src/arrow/format/parquet.rs +++ b/crates/paimon/src/arrow/format/parquet.rs @@ -21,20 +21,7 @@ use crate::io::{FileRead, OutputFile}; use crate::spec::{DataField, DataType, Datum, Predicate, PredicateOperator}; use crate::table::{ArrowRecordBatchStream, RowRange}; use crate::Error; -use arrow_array::{ - Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Datum as ArrowDatum, Decimal128Array, - Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, RecordBatch, Scalar, - StringArray, -}; -use arrow_ord::cmp::{ - eq as arrow_eq, gt as arrow_gt, gt_eq as arrow_gt_eq, lt as arrow_lt, lt_eq as arrow_lt_eq, - neq as arrow_neq, -}; -use arrow_schema::ArrowError; -use arrow_string::like::{ - contains as arrow_contains, ends_with as arrow_ends_with, like as arrow_like, - starts_with as arrow_starts_with, -}; +use arrow_array::{BooleanArray, RecordBatch}; use async_trait::async_trait; use bytes::Bytes; use futures::future::BoxFuture; @@ -176,8 +163,33 @@ impl FormatFileReader for ParquetFormatReader { .await?; let parquet_schema = batch_stream_builder.parquet_schema().clone(); + + // Determine whether the Arrow row filter enforces every pushed-down + // predicate exactly. It only accepts top-level supported leaves; `Or`, + // `Not`, and unsupported leaves fall through to conservative pruning + // only, which is NOT exact. When any predicate is not fully enforced we + // add a row-level residual backstop over the decoded batch. + // + // Fast path (the common case, including all-AND-of-simple-leaves): every + // predicate is fully enforced, so we decode exactly `read_fields`, skip + // the residual pass entirely, and return the stream as before — zero + // added overhead. + let all_enforced = preds + .iter() + .all(|p| predicate_fully_enforced_by_row_filter(&parquet_schema, p, file_fields)); + + // Residual branch must decode the predicate columns too, or the residual + // pass could not see a predicate on a non-projected column (Gap A). The + // projection MASK here (which columns to DECODE) is distinct from + // `DataFileReader`'s by-name output projection applied downstream. + let scan_fields: Vec = if all_enforced { + read_fields.to_vec() + } else { + crate::arrow::residual::widen_scan_fields(read_fields, predicates) + }; + let root_schema = parquet_schema.root_schema(); - let root_indices: Vec = read_fields + let root_indices: Vec = scan_fields .iter() .filter_map(|f| { root_schema @@ -223,7 +235,32 @@ impl FormatFileReader for ParquetFormatReader { } let batch_stream = batch_stream_builder.build()?; - Ok(batch_stream.map(|r| r.map_err(Error::from)).boxed()) + + if all_enforced { + // Fast path: the row filter enforced every predicate exactly during + // decode. Return the stream directly — no residual pass. + return Ok(batch_stream.map(|r| r.map_err(Error::from)).boxed()); + } + + // Residual backstop: at least one predicate (`Or`/`Not`/unsupported leaf) + // was not fully enforced by the row filter, so apply the exact residual + // filter per batch over the widened `scan_fields`. The row filter (for + // whatever leaves it accepted) still ran during decode for I/O pruning; + // double-filtering an accepted leaf is idempotent. `DataFileReader` + // projects the filtered batch to `read_fields` by name. + let residual_predicates = FilePredicates { + predicates: preds.to_vec(), + file_fields: file_fields.to_vec(), + }; + let stream = batch_stream.map(move |result| { + let batch = result.map_err(Error::from)?; + crate::arrow::residual::filter_record_batch_by_predicates( + batch, + &residual_predicates, + &scan_fields, + ) + }); + Ok(stream.boxed()) } } @@ -271,19 +308,16 @@ fn build_parquet_arrow_predicate( else { return Ok(None); }; - if !predicate_supported_for_parquet_row_filter(*op) { - return Ok(None); - } - - let Some(file_field) = file_fields.get(*index) else { - return Ok(None); - }; - let Some(root_index) = parquet_root_index(parquet_schema, file_field.name()) else { - return Ok(None); - }; - if !parquet_row_filter_literals_supported(*op, literals, file_field.data_type())? { + // Gate on the shared acceptance test so this builder and + // `predicate_fully_enforced_by_row_filter` cannot disagree about which + // leaves the row filter enforces. + if !parquet_leaf_row_filter_accepted(parquet_schema, *index, *op, literals, file_fields)? { return Ok(None); } + // Accepted: the field and its root column are guaranteed present. + let file_field = &file_fields[*index]; + let root_index = parquet_root_index(parquet_schema, file_field.name()) + .expect("root index resolvable for accepted leaf"); let projection = ProjectionMask::roots(parquet_schema, [root_index]); let op = *op; @@ -295,11 +329,64 @@ fn build_parquet_arrow_predicate( let Some(column) = batch.columns().first() else { return Ok(BooleanArray::new_null(batch.num_rows())); }; - evaluate_exact_leaf_predicate(column, &data_type, op, &literals) + crate::arrow::residual::evaluate_exact_leaf_predicate(column, &data_type, op, &literals) }, )))) } +/// `true` iff [`build_parquet_arrow_predicate`] would return `Some` for +/// `predicate` — i.e. it is a `Leaf` with a supported operator, resolvable +/// column, and a literal set the Arrow row filter can enforce. Such a predicate +/// is applied exactly during decode by the [`RowFilter`], so it needs no +/// residual backstop. +/// +/// Shares the leaf-acceptance test with `build_parquet_arrow_predicate` (via +/// `parquet_leaf_row_filter_accepted`) so the two cannot drift: whatever the +/// builder accepts, this reports as fully enforced, and vice versa. Any +/// non-`Leaf` node (`Or`, `Not`, `And`, ...) is NOT fully enforced. +fn predicate_fully_enforced_by_row_filter( + parquet_schema: &parquet::schema::types::SchemaDescriptor, + predicate: &Predicate, + file_fields: &[DataField], +) -> bool { + let Predicate::Leaf { + index, + op, + literals, + .. + } = predicate + else { + return false; + }; + parquet_leaf_row_filter_accepted(parquet_schema, *index, *op, literals, file_fields) + .unwrap_or(false) +} + +/// Shared leaf-acceptance test for the Parquet Arrow row filter: a leaf is +/// accepted iff its operator is supported, its column resolves to a root parquet +/// column, and its literals are representable for the Arrow filter. +/// +/// Returning an error only propagates a genuine failure from literal conversion; +/// an unsupported (but well-formed) leaf yields `Ok(false)`. +fn parquet_leaf_row_filter_accepted( + parquet_schema: &parquet::schema::types::SchemaDescriptor, + index: usize, + op: PredicateOperator, + literals: &[Datum], + file_fields: &[DataField], +) -> crate::Result { + if !predicate_supported_for_parquet_row_filter(op) { + return Ok(false); + } + let Some(file_field) = file_fields.get(index) else { + return Ok(false); + }; + if parquet_root_index(parquet_schema, file_field.name()).is_none() { + return Ok(false); + } + parquet_row_filter_literals_supported(op, literals, file_field.data_type()) +} + fn predicate_supported_for_parquet_row_filter(op: PredicateOperator) -> bool { matches!( op, @@ -338,11 +425,16 @@ fn parquet_row_filter_literals_supported( let Some(literal) = literals.first() else { return Ok(false); }; - Ok(literal_scalar_for_parquet_filter(literal, file_data_type)?.is_some()) + Ok( + crate::arrow::residual::literal_scalar_for_arrow_filter(literal, file_data_type)? + .is_some(), + ) } PredicateOperator::In | PredicateOperator::NotIn => { for literal in literals { - if literal_scalar_for_parquet_filter(literal, file_data_type)?.is_none() { + if crate::arrow::residual::literal_scalar_for_arrow_filter(literal, file_data_type)? + .is_none() + { return Ok(false); } } @@ -361,14 +453,19 @@ fn parquet_row_filter_literals_supported( let Some(literal) = literals.first() else { return Ok(false); }; - Ok(literal_scalar_for_parquet_filter(literal, file_data_type)?.is_some()) + Ok( + crate::arrow::residual::literal_scalar_for_arrow_filter(literal, file_data_type)? + .is_some(), + ) } PredicateOperator::Between | PredicateOperator::NotBetween => { if literals.len() != 2 { return Ok(false); } for literal in literals { - if literal_scalar_for_parquet_filter(literal, file_data_type)?.is_none() { + if crate::arrow::residual::literal_scalar_for_arrow_filter(literal, file_data_type)? + .is_none() + { return Ok(false); } } @@ -388,231 +485,6 @@ fn parquet_root_index( .position(|field| field.name() == root_name) } -// --------------------------------------------------------------------------- -// Predicate evaluation helpers -// --------------------------------------------------------------------------- - -fn evaluate_exact_leaf_predicate( - array: &ArrayRef, - data_type: &DataType, - op: PredicateOperator, - literals: &[Datum], -) -> Result { - match op { - PredicateOperator::IsNull => Ok(boolean_mask_from_predicate(array.len(), |row_index| { - array.is_null(row_index) - })), - PredicateOperator::IsNotNull => Ok(boolean_mask_from_predicate(array.len(), |row_index| { - array.is_valid(row_index) - })), - PredicateOperator::In | PredicateOperator::NotIn => { - evaluate_set_membership_predicate(array, data_type, op, literals) - } - PredicateOperator::Eq - | PredicateOperator::NotEq - | PredicateOperator::Lt - | PredicateOperator::LtEq - | PredicateOperator::Gt - | PredicateOperator::GtEq - | PredicateOperator::StartsWith - | PredicateOperator::EndsWith - | PredicateOperator::Contains - | PredicateOperator::Like => { - let Some(literal) = literals.first() else { - return Ok(BooleanArray::from(vec![true; array.len()])); - }; - let Some(scalar) = literal_scalar_for_parquet_filter(literal, data_type) - .map_err(|e| ArrowError::ComputeError(e.to_string()))? - else { - return Ok(BooleanArray::from(vec![true; array.len()])); - }; - let result = evaluate_column_predicate(array, &scalar, op)?; - Ok(sanitize_filter_mask(result)) - } - PredicateOperator::Between | PredicateOperator::NotBetween => { - evaluate_between_predicate(array, data_type, op, literals) - } - } -} - -/// `Between` / `NotBetween` translate to `gt_eq(col, low) & lt_eq(col, high)` -/// (and its negation). `arrow_ord::cmp` produces nullable masks: any null -/// row makes the comparison null, so a fully-built `Between` mask preserves -/// nulls. `NotBetween` then negates valid rows and leaves nulls null — -/// matching SQL three-valued logic; `sanitize_filter_mask` collapses nulls -/// into `false` to match the predicate evaluator's "NULL → false" rule. -fn evaluate_between_predicate( - array: &ArrayRef, - data_type: &DataType, - op: PredicateOperator, - literals: &[Datum], -) -> Result { - let (Some(low), Some(high)) = (literals.first(), literals.get(1)) else { - return Ok(BooleanArray::from(vec![true; array.len()])); - }; - let Some(low_scalar) = literal_scalar_for_parquet_filter(low, data_type) - .map_err(|e| ArrowError::ComputeError(e.to_string()))? - else { - return Ok(BooleanArray::from(vec![true; array.len()])); - }; - let Some(high_scalar) = literal_scalar_for_parquet_filter(high, data_type) - .map_err(|e| ArrowError::ComputeError(e.to_string()))? - else { - return Ok(BooleanArray::from(vec![true; array.len()])); - }; - let lo_mask = arrow_gt_eq(array, &low_scalar)?; - let hi_mask = arrow_lt_eq(array, &high_scalar)?; - let between = arrow_arith::boolean::and_kleene(&lo_mask, &hi_mask)?; - let result = match op { - PredicateOperator::Between => between, - PredicateOperator::NotBetween => arrow_arith::boolean::not(&between)?, - _ => unreachable!(), - }; - Ok(sanitize_filter_mask(result)) -} - -fn evaluate_set_membership_predicate( - array: &ArrayRef, - data_type: &DataType, - op: PredicateOperator, - literals: &[Datum], -) -> Result { - if literals.is_empty() { - return Ok(match op { - PredicateOperator::In => BooleanArray::from(vec![false; array.len()]), - PredicateOperator::NotIn => { - boolean_mask_from_predicate(array.len(), |row_index| array.is_valid(row_index)) - } - _ => unreachable!(), - }); - } - - let mut combined = match op { - PredicateOperator::In => BooleanArray::from(vec![false; array.len()]), - PredicateOperator::NotIn => { - boolean_mask_from_predicate(array.len(), |row_index| array.is_valid(row_index)) - } - _ => unreachable!(), - }; - - for literal in literals { - let Some(scalar) = literal_scalar_for_parquet_filter(literal, data_type) - .map_err(|e| ArrowError::ComputeError(e.to_string()))? - else { - return Ok(BooleanArray::from(vec![true; array.len()])); - }; - let comparison_op = match op { - PredicateOperator::In => PredicateOperator::Eq, - PredicateOperator::NotIn => PredicateOperator::NotEq, - _ => unreachable!(), - }; - let mask = sanitize_filter_mask(evaluate_column_predicate(array, &scalar, comparison_op)?); - combined = combine_filter_masks(&combined, &mask, matches!(op, PredicateOperator::In)); - } - - Ok(combined) -} - -fn evaluate_column_predicate( - column: &ArrayRef, - scalar: &Scalar, - op: PredicateOperator, -) -> Result { - match op { - PredicateOperator::Eq => arrow_eq(column, scalar), - PredicateOperator::NotEq => arrow_neq(column, scalar), - PredicateOperator::Lt => arrow_lt(column, scalar), - PredicateOperator::LtEq => arrow_lt_eq(column, scalar), - PredicateOperator::Gt => arrow_gt(column, scalar), - PredicateOperator::GtEq => arrow_gt_eq(column, scalar), - PredicateOperator::StartsWith - | PredicateOperator::EndsWith - | PredicateOperator::Contains - | PredicateOperator::Like => { - let pattern = pattern_scalar_for_string_kernel(scalar, column.data_type())?; - match op { - PredicateOperator::StartsWith => arrow_starts_with(column, &pattern), - PredicateOperator::EndsWith => arrow_ends_with(column, &pattern), - PredicateOperator::Contains => arrow_contains(column, &pattern), - PredicateOperator::Like => arrow_like(column, &pattern), - _ => unreachable!(), - } - } - PredicateOperator::IsNull - | PredicateOperator::IsNotNull - | PredicateOperator::In - | PredicateOperator::NotIn - | PredicateOperator::Between - | PredicateOperator::NotBetween => Ok(BooleanArray::new_null(column.len())), - } -} - -/// `arrow_string::like::*` kernels reject mismatched string types — Utf8 column -/// against Utf8 pattern is fine, but a LargeUtf8 / Utf8View column needs a -/// pattern of the same flavour. The shared scalar built upstream is always -/// `StringArray` (Utf8); promote it to match the column when needed. -fn pattern_scalar_for_string_kernel( - scalar: &Scalar, - column_type: &arrow_schema::DataType, -) -> Result, ArrowError> { - use arrow_array::{LargeStringArray, StringArray, StringViewArray}; - use arrow_schema::DataType as ArrowDataType; - - let arr = scalar.get().0; - let value = arr - .as_any() - .downcast_ref::() - .and_then(|s| (s.len() == 1 && s.is_valid(0)).then(|| s.value(0).to_string())); - let Some(value) = value else { - return Ok(scalar.clone()); - }; - Ok(match column_type { - ArrowDataType::Utf8 => Scalar::new(Arc::new(StringArray::from(vec![value])) as ArrayRef), - ArrowDataType::LargeUtf8 => { - Scalar::new(Arc::new(LargeStringArray::from(vec![value])) as ArrayRef) - } - ArrowDataType::Utf8View => { - Scalar::new(Arc::new(StringViewArray::from(vec![value])) as ArrayRef) - } - ArrowDataType::Dictionary(_, value_type) if value_type.as_ref() == &ArrowDataType::Utf8 => { - Scalar::new(Arc::new(StringArray::from(vec![value])) as ArrayRef) - } - other => { - return Err(ArrowError::InvalidArgumentError(format!( - "string predicate against non-string column type {other:?}" - ))) - } - }) -} - -fn sanitize_filter_mask(mask: BooleanArray) -> BooleanArray { - if mask.null_count() == 0 { - return mask; - } - - boolean_mask_from_predicate(mask.len(), |row_index| { - mask.is_valid(row_index) && mask.value(row_index) - }) -} - -fn combine_filter_masks(left: &BooleanArray, right: &BooleanArray, use_or: bool) -> BooleanArray { - debug_assert_eq!(left.len(), right.len()); - boolean_mask_from_predicate(left.len(), |row_index| { - if use_or { - left.value(row_index) || right.value(row_index) - } else { - left.value(row_index) && right.value(row_index) - } - }) -} - -fn boolean_mask_from_predicate( - len: usize, - mut predicate: impl FnMut(usize) -> bool, -) -> BooleanArray { - BooleanArray::from((0..len).map(&mut predicate).collect::>()) -} - // --------------------------------------------------------------------------- // Row-group statistics pruning // --------------------------------------------------------------------------- @@ -1127,134 +999,6 @@ fn exact_parquet_value<'a, T>( } } -// --------------------------------------------------------------------------- -// Literal → Arrow scalar conversion -// --------------------------------------------------------------------------- - -fn literal_scalar_for_parquet_filter( - literal: &Datum, - file_data_type: &DataType, -) -> crate::Result>> { - let array: ArrayRef = match file_data_type { - DataType::Boolean(_) => match literal { - Datum::Bool(value) => Arc::new(BooleanArray::new_scalar(*value).into_inner()), - _ => return Ok(None), - }, - DataType::TinyInt(_) => { - match integer_literal(literal).and_then(|value| i8::try_from(value).ok()) { - Some(value) => Arc::new(Int8Array::new_scalar(value).into_inner()), - None => return Ok(None), - } - } - DataType::SmallInt(_) => { - match integer_literal(literal).and_then(|value| i16::try_from(value).ok()) { - Some(value) => Arc::new(Int16Array::new_scalar(value).into_inner()), - None => return Ok(None), - } - } - DataType::Int(_) => { - match integer_literal(literal).and_then(|value| i32::try_from(value).ok()) { - Some(value) => Arc::new(Int32Array::new_scalar(value).into_inner()), - None => return Ok(None), - } - } - DataType::BigInt(_) => { - match integer_literal(literal).and_then(|value| i64::try_from(value).ok()) { - Some(value) => Arc::new(Int64Array::new_scalar(value).into_inner()), - None => return Ok(None), - } - } - DataType::Float(_) => match float32_literal(literal) { - Some(value) => Arc::new(Float32Array::new_scalar(value).into_inner()), - None => return Ok(None), - }, - DataType::Double(_) => match float64_literal(literal) { - Some(value) => Arc::new(Float64Array::new_scalar(value).into_inner()), - None => return Ok(None), - }, - DataType::Char(_) | DataType::VarChar(_) => match literal { - Datum::String(value) => Arc::new(StringArray::new_scalar(value.as_str()).into_inner()), - _ => return Ok(None), - }, - DataType::Binary(_) | DataType::VarBinary(_) => match literal { - Datum::Bytes(value) => Arc::new(BinaryArray::new_scalar(value.as_slice()).into_inner()), - _ => return Ok(None), - }, - DataType::Date(_) => match literal { - Datum::Date(value) => Arc::new(Date32Array::new_scalar(*value).into_inner()), - _ => return Ok(None), - }, - DataType::Decimal(decimal) => match literal { - Datum::Decimal { - unscaled, - precision, - scale, - } if *precision <= decimal.precision() && *scale == decimal.scale() => { - let precision = - u8::try_from(decimal.precision()).map_err(|_| Error::Unsupported { - message: "Decimal precision exceeds Arrow decimal128 range".to_string(), - })?; - let scale = - i8::try_from(decimal.scale() as i32).map_err(|_| Error::Unsupported { - message: "Decimal scale exceeds Arrow decimal128 range".to_string(), - })?; - Arc::new( - Decimal128Array::new_scalar(*unscaled) - .into_inner() - .with_precision_and_scale(precision, scale) - .map_err(|e| Error::UnexpectedError { - message: format!( - "Failed to build decimal scalar for parquet row filter: {e}" - ), - source: Some(Box::new(e)), - })?, - ) - } - _ => return Ok(None), - }, - DataType::Time(_) - | DataType::Timestamp(_) - | DataType::LocalZonedTimestamp(_) - | DataType::Blob(_) - | DataType::Array(_) - | DataType::Map(_) - | DataType::Multiset(_) - | DataType::Row(_) - | DataType::Vector(_) => return Ok(None), - }; - - Ok(Some(Scalar::new(array))) -} - -fn integer_literal(literal: &Datum) -> Option { - match literal { - Datum::TinyInt(value) => Some(i128::from(*value)), - Datum::SmallInt(value) => Some(i128::from(*value)), - Datum::Int(value) => Some(i128::from(*value)), - Datum::Long(value) => Some(i128::from(*value)), - _ => None, - } -} - -fn float32_literal(literal: &Datum) -> Option { - match literal { - Datum::Float(value) => Some(*value), - Datum::Double(value) => { - let casted = *value as f32; - ((casted as f64) == *value).then_some(casted) - } - _ => None, - } -} - -fn float64_literal(literal: &Datum) -> Option { - match literal { - Datum::Float(value) => Some(f64::from(*value)), - Datum::Double(value) => Some(*value), - _ => None, - } -} - // --------------------------------------------------------------------------- // Row selection helpers (DV, row ranges) // --------------------------------------------------------------------------- @@ -1609,15 +1353,15 @@ fn split_ranges_for_concurrency(merged: Vec>, concurrency: usize) -> #[cfg(test)] mod tests { use super::build_parquet_row_filter; - use super::ParquetFormatWriter; use super::{ - AsyncArrowWriter, PageIndexPolicy, ParquetMetaDataReader, Predicate, PredicateOperator, - RowSelection, + AsyncArrowWriter, Bytes, PageIndexPolicy, ParquetMetaDataReader, Predicate, + PredicateOperator, RowSelection, }; - use crate::arrow::format::FormatFileWriter; + use super::{FilePredicates, ParquetFormatReader, ParquetFormatWriter}; + use crate::arrow::format::{FormatFileReader, FormatFileWriter}; use crate::io::FileIOBuilder; use crate::spec::{DataField, DataType, Datum, IntType, PredicateBuilder}; - use arrow_array::{Int32Array, RecordBatch}; + use arrow_array::{Array, Int32Array, RecordBatch}; use arrow_schema::{DataType as ArrowDataType, Field as ArrowField, Schema as ArrowSchema}; use parquet::schema::{parser::parse_message_type, types::SchemaDescriptor}; use std::sync::Arc; @@ -1676,7 +1420,7 @@ mod tests { ) -> arrow_array::BooleanArray { use crate::spec::VarCharType; let dt = DataType::VarChar(VarCharType::default()); - super::evaluate_exact_leaf_predicate( + crate::arrow::residual::evaluate_exact_leaf_predicate( &column, &dt, op, @@ -1764,8 +1508,13 @@ mod tests { high: i32, ) -> arrow_array::BooleanArray { let dt = DataType::Int(IntType::new()); - super::evaluate_exact_leaf_predicate(&column, &dt, op, &[Datum::Int(low), Datum::Int(high)]) - .expect("BETWEEN should evaluate") + crate::arrow::residual::evaluate_exact_leaf_predicate( + &column, + &dt, + op, + &[Datum::Int(low), Datum::Int(high)], + ) + .expect("BETWEEN should evaluate") } #[test] @@ -2333,6 +2082,281 @@ mod tests { ); } + // ----------------------------------------------------------------------- + // Residual backstop for compound / unsupported-leaf predicates (Gap B) + // ----------------------------------------------------------------------- + + /// Write a single-row-group parquet file with columns `(id, name, age)`: + /// `id=[1..=5]`, `name=[a,b,c,d,e]`, `age=[10,20,30,40,50]`. One row group so + /// row-group / page pruning cannot exclude the non-matching rows — the only + /// way to return exact rows is the row-level residual filter. + async fn write_id_name_age_parquet() -> Vec { + let schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("id", ArrowDataType::Int32, true), + ArrowField::new("name", ArrowDataType::Utf8, true), + ArrowField::new("age", ArrowDataType::Int32, true), + ])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])), + Arc::new(arrow_array::StringArray::from(vec![ + "a", "b", "c", "d", "e", + ])), + Arc::new(Int32Array::from(vec![10, 20, 30, 40, 50])), + ], + ) + .unwrap(); + + // Single row group covering every row so no row-group pruning is possible. + let props = parquet::file::properties::WriterProperties::builder() + .set_max_row_group_row_count(Some(5)) + .build(); + let mut buf: Vec = Vec::new(); + { + let mut writer = + AsyncArrowWriter::try_new(&mut buf, schema.clone(), Some(props)).unwrap(); + writer.write(&batch).await.unwrap(); + writer.close().await.unwrap(); + } + buf + } + + fn residual_test_field(index: i32, name: &str, data_type: DataType) -> DataField { + DataField::new(index, name.to_string(), data_type) + } + + fn residual_leaf( + column: &str, + index: usize, + data_type: DataType, + op: PredicateOperator, + literals: Vec, + ) -> Predicate { + Predicate::Leaf { + column: column.to_string(), + index, + data_type, + op, + literals, + } + } + + /// File-level schema `(id, name, age)` used to resolve predicate indices. + fn id_name_age_file_fields() -> Vec { + use crate::spec::VarCharType; + vec![ + residual_test_field(0, "id", DataType::Int(IntType::new())), + residual_test_field(1, "name", DataType::VarChar(VarCharType::string_type())), + residual_test_field(2, "age", DataType::Int(IntType::new())), + ] + } + + /// Read `[name]` from the `(id, name, age)` parquet file under `predicates` + /// and collect the surviving `name` values in row order. The reader-level + /// batch may include extra (predicate) columns — we look `name` up by name. + async fn read_names_under_predicate( + read_fields: &[DataField], + predicate: Predicate, + ) -> Vec { + use crate::io::FileIOBuilder; + use futures::StreamExt; + + let bytes = write_id_name_age_parquet().await; + + let file_io = FileIOBuilder::new("memory").build().unwrap(); + let path = "memory:/test_parquet_residual_backstop.parquet"; + let output = file_io.new_output(path).unwrap(); + output.write(Bytes::from(bytes)).await.unwrap(); + + let input = file_io.new_input(path).unwrap(); + let file_size = input.metadata().await.unwrap().size; + let reader_input = input.reader().await.unwrap(); + + let predicates = FilePredicates { + predicates: vec![predicate], + file_fields: id_name_age_file_fields(), + }; + + let reader = ParquetFormatReader; + let mut stream = reader + .read_batch_stream( + Box::new(reader_input), + file_size, + read_fields, + Some(&predicates), + None, + None, + ) + .await + .unwrap(); + + let mut names: Vec = Vec::new(); + while let Some(result) = stream.next().await { + let batch = result.unwrap(); + let name_idx = batch.schema().index_of("name").unwrap(); + let col = batch + .column(name_idx) + .as_any() + .downcast_ref::() + .unwrap(); + names.extend((0..col.len()).map(|i| col.value(i).to_string())); + } + names + } + + #[tokio::test] + async fn test_parquet_or_predicate_returns_exact_rows() { + use crate::spec::VarCharType; + // Read only [name]; predicate `age > 25 OR name = 'a'`. + // age>25 -> c,d,e (rows 3,4,5); name='a' -> a (row 1). Union: a,c,d,e. + // Or is not accepted by the row filter, so before the fix it falls + // through to pruning only (single row group -> all 5 rows returned). + let read_fields = vec![residual_test_field( + 1, + "name", + DataType::VarChar(VarCharType::string_type()), + )]; + let predicate = Predicate::Or(vec![ + residual_leaf( + "age", + 2, + DataType::Int(IntType::new()), + PredicateOperator::Gt, + vec![Datum::Int(25)], + ), + residual_leaf( + "name", + 1, + DataType::VarChar(VarCharType::string_type()), + PredicateOperator::Eq, + vec![Datum::String("a".to_string())], + ), + ]); + let names = read_names_under_predicate(&read_fields, predicate).await; + assert_eq!(names, vec!["a", "c", "d", "e"]); + } + + #[tokio::test] + async fn test_parquet_not_predicate_returns_exact_complement() { + use crate::spec::VarCharType; + // `NOT (age > 25)` -> age <= 25 -> a,b (rows 1,2). + let read_fields = vec![residual_test_field( + 1, + "name", + DataType::VarChar(VarCharType::string_type()), + )]; + let predicate = Predicate::Not(Box::new(residual_leaf( + "age", + 2, + DataType::Int(IntType::new()), + PredicateOperator::Gt, + vec![Datum::Int(25)], + ))); + let names = read_names_under_predicate(&read_fields, predicate).await; + assert_eq!(names, vec!["a", "b"]); + } + + #[tokio::test] + async fn test_parquet_and_of_supported_leaves_takes_fast_path() { + use crate::spec::VarCharType; + // Fast-path guard: a top-level AND of two supported leaves stays on the + // row-filter fast path (no residual pass) and still returns exact rows. + // `age > 15 AND age < 45` -> b,c,d (rows 2,3,4). + let read_fields = vec![residual_test_field( + 1, + "name", + DataType::VarChar(VarCharType::string_type()), + )]; + + // Both leaves must be fully enforced by the row filter -> fast path taken. + let parquet_schema = SchemaDescriptor::new(Arc::new( + parse_message_type( + " + message test_schema { + OPTIONAL INT32 id; + OPTIONAL BYTE_ARRAY name (UTF8); + OPTIONAL INT32 age; + } + ", + ) + .expect("test schema should parse"), + )); + let file_fields = id_name_age_file_fields(); + let leaf_gt = residual_leaf( + "age", + 2, + DataType::Int(IntType::new()), + PredicateOperator::Gt, + vec![Datum::Int(15)], + ); + let leaf_lt = residual_leaf( + "age", + 2, + DataType::Int(IntType::new()), + PredicateOperator::Lt, + vec![Datum::Int(45)], + ); + assert!(super::predicate_fully_enforced_by_row_filter( + &parquet_schema, + &leaf_gt, + &file_fields + )); + assert!(super::predicate_fully_enforced_by_row_filter( + &parquet_schema, + &leaf_lt, + &file_fields + )); + // An Or is NOT fully enforced -> residual pass would engage. + let or_pred = Predicate::Or(vec![leaf_gt.clone(), leaf_lt.clone()]); + assert!(!super::predicate_fully_enforced_by_row_filter( + &parquet_schema, + &or_pred, + &file_fields + )); + + // Both leaves reach the reader split into the top-level predicate list + // (split_and flattens AND), so we pass them as two separate predicates. + use crate::io::FileIOBuilder; + use futures::StreamExt; + let bytes = write_id_name_age_parquet().await; + let file_io = FileIOBuilder::new("memory").build().unwrap(); + let path = "memory:/test_parquet_fast_path_and.parquet"; + let output = file_io.new_output(path).unwrap(); + output.write(Bytes::from(bytes)).await.unwrap(); + let input = file_io.new_input(path).unwrap(); + let file_size = input.metadata().await.unwrap().size; + let reader_input = input.reader().await.unwrap(); + let predicates = FilePredicates { + predicates: vec![leaf_gt, leaf_lt], + file_fields, + }; + let reader = ParquetFormatReader; + let mut stream = reader + .read_batch_stream( + Box::new(reader_input), + file_size, + &read_fields, + Some(&predicates), + None, + None, + ) + .await + .unwrap(); + let mut names: Vec = Vec::new(); + while let Some(result) = stream.next().await { + let batch = result.unwrap(); + let name_idx = batch.schema().index_of("name").unwrap(); + let col = batch + .column(name_idx) + .as_any() + .downcast_ref::() + .unwrap(); + names.extend((0..col.len()).map(|i| col.value(i).to_string())); + } + assert_eq!(names, vec!["b", "c", "d"]); + } + #[test] fn test_page_boundaries_valid() { use parquet::file::page_index::offset_index::PageLocation; diff --git a/crates/paimon/src/arrow/format/vortex.rs b/crates/paimon/src/arrow/format/vortex.rs index 0a9717da..12250a76 100644 --- a/crates/paimon/src/arrow/format/vortex.rs +++ b/crates/paimon/src/arrow/format/vortex.rs @@ -16,21 +16,15 @@ // under the License. use super::{FilePredicates, FormatFileReader, FormatFileWriter}; +use crate::arrow::residual::{ + filter_record_batch_by_predicates, same_data_field, widen_scan_fields, +}; use crate::io::{FileRead, OutputFile}; -use crate::spec::{DataField, DataType, Datum, Predicate, PredicateOperator}; +use crate::spec::{DataField, Predicate}; use crate::table::{ArrowRecordBatchStream, RowRange}; use crate::Error; -use arrow_array::{ - Array, ArrayRef as ArrowArrayRef, BinaryArray, BooleanArray, Date32Array, Decimal128Array, - Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, RecordBatch, Scalar, - StringArray, Time32MillisecondArray, TimestampMicrosecondArray, TimestampMillisecondArray, - TimestampNanosecondArray, -}; -use arrow_ord::cmp::{ - eq as arrow_eq, gt as arrow_gt, gt_eq as arrow_gt_eq, lt as arrow_lt, lt_eq as arrow_lt_eq, - neq as arrow_neq, -}; -use arrow_schema::{ArrowError, DataType as ArrowDataType, Field, SchemaRef}; +use arrow_array::RecordBatch; +use arrow_schema::{DataType as ArrowDataType, Field, SchemaRef}; use async_trait::async_trait; use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, OnceLock}; @@ -84,7 +78,7 @@ impl FormatFileReader for VortexFormatReader { predicates: fp.predicates.clone(), file_fields: fp.file_fields.clone(), }); - let scan_fields = build_vortex_scan_fields(&read_fields, predicates.as_ref()); + let scan_fields = widen_scan_fields(&read_fields, predicates.as_ref()); let scan_schema = crate::arrow::build_target_arrow_schema(&scan_fields)?; let _permit = acquire_vortex_io_permit().await?; @@ -296,53 +290,6 @@ fn as_single_row_range(ranges: &[RowRange], total_rows: u64) -> Option, -) -> Vec { - let mut fields = read_fields.to_vec(); - - if let Some(fp) = predicates { - let mut predicate_indices = Vec::new(); - for predicate in &fp.predicates { - collect_predicate_field_indices(predicate, &mut predicate_indices); - } - for index in predicate_indices { - if let Some(field) = fp.file_fields.get(index) { - push_unique_scan_field(&mut fields, field); - } - } - } - - fields -} - -fn collect_predicate_field_indices(predicate: &Predicate, indices: &mut Vec) { - match predicate { - Predicate::Leaf { index, .. } => indices.push(*index), - Predicate::And(children) | Predicate::Or(children) => { - for child in children { - collect_predicate_field_indices(child, indices); - } - } - Predicate::Not(inner) => collect_predicate_field_indices(inner, indices), - Predicate::AlwaysTrue | Predicate::AlwaysFalse => {} - } -} - -fn push_unique_scan_field(fields: &mut Vec, field: &DataField) { - if !fields - .iter() - .any(|existing| same_data_field(existing, field)) - { - fields.push(field.clone()); - } -} - -fn same_data_field(left: &DataField, right: &DataField) -> bool { - left.id() == right.id() || left.name() == right.name() -} - fn filter_and_project_batch( batch: RecordBatch, target_schema: &SchemaRef, @@ -399,446 +346,6 @@ fn projection_indices( .collect() } -fn filter_record_batch_by_predicates( - batch: RecordBatch, - predicates: &FilePredicates, - scan_fields: &[DataField], -) -> crate::Result { - let Some(mask) = evaluate_predicates_mask( - &batch, - &predicates.predicates, - &predicates.file_fields, - scan_fields, - )? - else { - return Ok(batch); - }; - - arrow_select::filter::filter_record_batch(&batch, &mask).map_err(|e| Error::DataInvalid { - message: format!("Failed to filter Vortex RecordBatch: {e}"), - source: Some(Box::new(e)), - }) -} - -fn evaluate_predicates_mask( - batch: &RecordBatch, - predicates: &[Predicate], - file_fields: &[DataField], - scan_fields: &[DataField], -) -> crate::Result> { - let mut combined = None; - for predicate in predicates { - let Some(mask) = evaluate_predicate_mask(batch, predicate, file_fields, scan_fields)? - else { - continue; - }; - combined = Some(match combined { - Some(existing) => combine_filter_masks(&existing, &mask, false), - None => mask, - }); - } - Ok(combined) -} - -fn evaluate_predicate_mask( - batch: &RecordBatch, - predicate: &Predicate, - file_fields: &[DataField], - scan_fields: &[DataField], -) -> crate::Result> { - match predicate { - Predicate::AlwaysTrue => Ok(Some(BooleanArray::from(vec![true; batch.num_rows()]))), - Predicate::AlwaysFalse => Ok(Some(BooleanArray::from(vec![false; batch.num_rows()]))), - Predicate::And(children) => { - let mut combined = None; - for child in children { - let Some(mask) = evaluate_predicate_mask(batch, child, file_fields, scan_fields)? - else { - continue; - }; - combined = Some(match combined { - Some(existing) => combine_filter_masks(&existing, &mask, false), - None => mask, - }); - } - Ok(combined) - } - Predicate::Or(children) => { - let mut combined = BooleanArray::from(vec![false; batch.num_rows()]); - for child in children { - let Some(mask) = evaluate_predicate_mask(batch, child, file_fields, scan_fields)? - else { - return Ok(None); - }; - combined = combine_filter_masks(&combined, &mask, true); - } - Ok(Some(combined)) - } - Predicate::Not(inner) => { - let Some(mask) = evaluate_predicate_mask(batch, inner, file_fields, scan_fields)? - else { - return Ok(None); - }; - Ok(Some(boolean_mask_from_predicate(mask.len(), |row_index| { - !mask.value(row_index) - }))) - } - Predicate::Leaf { - index, - op, - literals, - .. - } => { - let Some(file_field) = file_fields.get(*index) else { - return Ok(None); - }; - let Some(scan_index) = scan_fields - .iter() - .position(|scan_field| same_data_field(scan_field, file_field)) - else { - return Ok(None); - }; - evaluate_arrow_leaf_predicate( - batch.column(scan_index), - file_field.data_type(), - *op, - literals, - ) - } - } -} - -fn evaluate_arrow_leaf_predicate( - array: &ArrowArrayRef, - data_type: &DataType, - op: PredicateOperator, - literals: &[Datum], -) -> crate::Result> { - match op { - PredicateOperator::IsNull => Ok(Some(boolean_mask_from_predicate( - array.len(), - |row_index| array.is_null(row_index), - ))), - PredicateOperator::IsNotNull => Ok(Some(boolean_mask_from_predicate( - array.len(), - |row_index| array.is_valid(row_index), - ))), - PredicateOperator::In | PredicateOperator::NotIn => { - evaluate_set_membership_predicate(array, data_type, op, literals) - } - PredicateOperator::Eq - | PredicateOperator::NotEq - | PredicateOperator::Lt - | PredicateOperator::LtEq - | PredicateOperator::Gt - | PredicateOperator::GtEq => { - let Some(literal) = literals.first() else { - return Ok(None); - }; - let Some(scalar) = literal_scalar_for_arrow_filter(literal, data_type)? else { - return Ok(None); - }; - let mask = - evaluate_column_predicate(array, &scalar, op).map_err(|e| Error::DataInvalid { - message: format!("Failed to evaluate Vortex predicate: {e}"), - source: Some(Box::new(e)), - })?; - Ok(Some(sanitize_filter_mask(mask))) - } - // Vortex's scalar comparators don't cover substring/range predicates, so - // fall open: returning None defers them to the outer stats-prune + arrow - // row-filter path, which evaluates them with full NULL (Kleene) semantics. - PredicateOperator::StartsWith - | PredicateOperator::EndsWith - | PredicateOperator::Contains - | PredicateOperator::Like - | PredicateOperator::Between - | PredicateOperator::NotBetween => Ok(None), - } -} - -fn evaluate_set_membership_predicate( - array: &ArrowArrayRef, - data_type: &DataType, - op: PredicateOperator, - literals: &[Datum], -) -> crate::Result> { - if literals.is_empty() { - return Ok(Some(match op { - PredicateOperator::In => BooleanArray::from(vec![false; array.len()]), - PredicateOperator::NotIn => { - boolean_mask_from_predicate(array.len(), |row_index| array.is_valid(row_index)) - } - _ => unreachable!(), - })); - } - - let mut combined = match op { - PredicateOperator::In => BooleanArray::from(vec![false; array.len()]), - PredicateOperator::NotIn => { - boolean_mask_from_predicate(array.len(), |row_index| array.is_valid(row_index)) - } - _ => unreachable!(), - }; - - for literal in literals { - let Some(scalar) = literal_scalar_for_arrow_filter(literal, data_type)? else { - return Ok(None); - }; - let comparison_op = match op { - PredicateOperator::In => PredicateOperator::Eq, - PredicateOperator::NotIn => PredicateOperator::NotEq, - _ => unreachable!(), - }; - let mask = evaluate_column_predicate(array, &scalar, comparison_op).map_err(|e| { - Error::DataInvalid { - message: format!("Failed to evaluate Vortex set predicate: {e}"), - source: Some(Box::new(e)), - } - })?; - let mask = sanitize_filter_mask(mask); - combined = combine_filter_masks(&combined, &mask, matches!(op, PredicateOperator::In)); - } - - Ok(Some(combined)) -} - -fn evaluate_column_predicate( - column: &ArrowArrayRef, - scalar: &Scalar, - op: PredicateOperator, -) -> Result { - match op { - PredicateOperator::Eq => arrow_eq(column, scalar), - PredicateOperator::NotEq => arrow_neq(column, scalar), - PredicateOperator::Lt => arrow_lt(column, scalar), - PredicateOperator::LtEq => arrow_lt_eq(column, scalar), - PredicateOperator::Gt => arrow_gt(column, scalar), - PredicateOperator::GtEq => arrow_gt_eq(column, scalar), - PredicateOperator::IsNull - | PredicateOperator::IsNotNull - | PredicateOperator::In - | PredicateOperator::NotIn - // String/range ops never reach the scalar comparator: the leaf - // dispatcher falls open (returns None) for them, so the value here is - // unused. Listed only to keep the match exhaustive. - | PredicateOperator::StartsWith - | PredicateOperator::EndsWith - | PredicateOperator::Contains - | PredicateOperator::Like - | PredicateOperator::Between - | PredicateOperator::NotBetween => Ok(BooleanArray::new_null(column.len())), - } -} - -fn sanitize_filter_mask(mask: BooleanArray) -> BooleanArray { - if mask.null_count() == 0 { - return mask; - } - - boolean_mask_from_predicate(mask.len(), |row_index| { - mask.is_valid(row_index) && mask.value(row_index) - }) -} - -fn combine_filter_masks(left: &BooleanArray, right: &BooleanArray, use_or: bool) -> BooleanArray { - debug_assert_eq!(left.len(), right.len()); - boolean_mask_from_predicate(left.len(), |row_index| { - if use_or { - left.value(row_index) || right.value(row_index) - } else { - left.value(row_index) && right.value(row_index) - } - }) -} - -fn boolean_mask_from_predicate( - len: usize, - mut predicate: impl FnMut(usize) -> bool, -) -> BooleanArray { - BooleanArray::from((0..len).map(&mut predicate).collect::>()) -} - -fn literal_scalar_for_arrow_filter( - literal: &Datum, - file_data_type: &DataType, -) -> crate::Result>> { - let array: ArrowArrayRef = match file_data_type { - DataType::Boolean(_) => match literal { - Datum::Bool(value) => Arc::new(BooleanArray::new_scalar(*value).into_inner()), - _ => return Ok(None), - }, - DataType::TinyInt(_) => { - match integer_literal(literal).and_then(|value| i8::try_from(value).ok()) { - Some(value) => Arc::new(Int8Array::new_scalar(value).into_inner()), - None => return Ok(None), - } - } - DataType::SmallInt(_) => { - match integer_literal(literal).and_then(|value| i16::try_from(value).ok()) { - Some(value) => Arc::new(Int16Array::new_scalar(value).into_inner()), - None => return Ok(None), - } - } - DataType::Int(_) => { - match integer_literal(literal).and_then(|value| i32::try_from(value).ok()) { - Some(value) => Arc::new(Int32Array::new_scalar(value).into_inner()), - None => return Ok(None), - } - } - DataType::BigInt(_) => { - match integer_literal(literal).and_then(|value| i64::try_from(value).ok()) { - Some(value) => Arc::new(Int64Array::new_scalar(value).into_inner()), - None => return Ok(None), - } - } - DataType::Float(_) => match float32_literal(literal) { - Some(value) => Arc::new(Float32Array::new_scalar(value).into_inner()), - None => return Ok(None), - }, - DataType::Double(_) => match float64_literal(literal) { - Some(value) => Arc::new(Float64Array::new_scalar(value).into_inner()), - None => return Ok(None), - }, - DataType::Char(_) | DataType::VarChar(_) => match literal { - Datum::String(value) => Arc::new(StringArray::new_scalar(value.as_str()).into_inner()), - _ => return Ok(None), - }, - DataType::Binary(_) | DataType::VarBinary(_) | DataType::Blob(_) => match literal { - Datum::Bytes(value) => Arc::new(BinaryArray::new_scalar(value.as_slice()).into_inner()), - _ => return Ok(None), - }, - DataType::Date(_) => match literal { - Datum::Date(value) => Arc::new(Date32Array::new_scalar(*value).into_inner()), - _ => return Ok(None), - }, - DataType::Time(_) => match literal { - Datum::Time(value) => Arc::new(Time32MillisecondArray::new_scalar(*value).into_inner()), - _ => return Ok(None), - }, - DataType::Timestamp(ts) => match literal { - Datum::Timestamp { millis, nanos } => { - let Some(array) = timestamp_scalar(*millis, *nanos, ts.precision(), None)? else { - return Ok(None); - }; - array - } - _ => return Ok(None), - }, - DataType::LocalZonedTimestamp(ts) => match literal { - Datum::LocalZonedTimestamp { millis, nanos } => { - let Some(array) = timestamp_scalar(*millis, *nanos, ts.precision(), Some("UTC"))? - else { - return Ok(None); - }; - array - } - _ => return Ok(None), - }, - DataType::Decimal(decimal) => match literal { - Datum::Decimal { - unscaled, - precision, - scale, - } if *precision <= decimal.precision() && *scale == decimal.scale() => { - let precision = - u8::try_from(decimal.precision()).map_err(|_| Error::Unsupported { - message: "Decimal precision exceeds Arrow decimal128 range".to_string(), - })?; - let scale = - i8::try_from(decimal.scale() as i32).map_err(|_| Error::Unsupported { - message: "Decimal scale exceeds Arrow decimal128 range".to_string(), - })?; - Arc::new( - Decimal128Array::new_scalar(*unscaled) - .into_inner() - .with_precision_and_scale(precision, scale) - .map_err(|e| Error::UnexpectedError { - message: format!( - "Failed to build decimal scalar for Vortex row filter: {e}" - ), - source: Some(Box::new(e)), - })?, - ) - } - _ => return Ok(None), - }, - DataType::Array(_) - | DataType::Map(_) - | DataType::Multiset(_) - | DataType::Row(_) - | DataType::Vector(_) => { - return Ok(None); - } - }; - - Ok(Some(Scalar::new(array))) -} - -fn timestamp_scalar( - millis: i64, - nanos: i32, - precision: u32, - timezone: Option<&'static str>, -) -> crate::Result> { - let array: ArrowArrayRef = match precision { - 0..=3 => { - let array = TimestampMillisecondArray::new_scalar(millis).into_inner(); - match timezone { - Some(tz) => Arc::new(array.with_timezone(tz)), - None => Arc::new(array), - } - } - 4..=6 => { - let value = millis * 1_000 + (nanos as i64) / 1_000; - let array = TimestampMicrosecondArray::new_scalar(value).into_inner(); - match timezone { - Some(tz) => Arc::new(array.with_timezone(tz)), - None => Arc::new(array), - } - } - 7..=9 => { - let value = millis * 1_000_000 + (nanos as i64); - let array = TimestampNanosecondArray::new_scalar(value).into_inner(); - match timezone { - Some(tz) => Arc::new(array.with_timezone(tz)), - None => Arc::new(array), - } - } - _ => return Ok(None), - }; - Ok(Some(array)) -} - -fn integer_literal(literal: &Datum) -> Option { - match literal { - Datum::TinyInt(value) => Some(i128::from(*value)), - Datum::SmallInt(value) => Some(i128::from(*value)), - Datum::Int(value) => Some(i128::from(*value)), - Datum::Long(value) => Some(i128::from(*value)), - _ => None, - } -} - -fn float32_literal(literal: &Datum) -> Option { - match literal { - Datum::Float(value) => Some(*value), - Datum::Double(value) => { - let casted = *value as f32; - ((casted as f64) == *value).then_some(casted) - } - _ => None, - } -} - -fn float64_literal(literal: &Datum) -> Option { - match literal { - Datum::Float(value) => Some(f64::from(*value)), - Datum::Double(value) => Some(*value), - _ => None, - } -} - /// Convert paimon `RowRange`s (inclusive [from, to]) to a Vortex `Selection`. fn row_ranges_to_selection(ranges: &[RowRange], total_rows: u64) -> Selection { let mut bitmap = roaring::RoaringTreemap::new(); @@ -1050,7 +557,7 @@ mod tests { use super::*; use crate::arrow::format::FormatFileWriter; use crate::io::FileIOBuilder; - use crate::spec::{DataField, DataType, VarCharType}; + use crate::spec::{DataField, DataType, Datum, VarCharType}; use arrow_array::{Int32Array, StringArray}; use arrow_schema::{DataType as ArrowDataType, Field as ArrowField, Schema as ArrowSchema}; use bytes::Bytes; diff --git a/crates/paimon/src/arrow/mod.rs b/crates/paimon/src/arrow/mod.rs index b897c8ca..07c70a22 100644 --- a/crates/paimon/src/arrow/mod.rs +++ b/crates/paimon/src/arrow/mod.rs @@ -17,6 +17,7 @@ pub(crate) mod filtering; pub(crate) mod format; +pub(crate) mod residual; pub(crate) mod schema_evolution; use crate::spec::{ diff --git a/crates/paimon/src/arrow/residual.rs b/crates/paimon/src/arrow/residual.rs new file mode 100644 index 00000000..ed57b3b4 --- /dev/null +++ b/crates/paimon/src/arrow/residual.rs @@ -0,0 +1,1590 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Shared, exact Arrow-batch residual predicate evaluator. +//! +//! This module holds the single source of truth for evaluating a +//! [`FilePredicates`] set against an already-decoded Arrow [`RecordBatch`], +//! producing a boolean row mask (or filtering the batch directly). It replaces +//! two near-duplicate copies that previously lived in the Parquet and Vortex +//! format readers: +//! +//! - The Parquet copy implemented the full leaf operator set (comparison, +//! set-membership, and the string / range operators `StartsWith` / `EndsWith` +//! / `Contains` / `Like` / `Between` / `NotBetween`). +//! - The Vortex copy carried the compound batch walker (And/Or/Not plus the +//! `Leaf` arm that maps a `file_field` to the corresponding batch column via +//! [`same_data_field`]) and a broader literal-to-scalar conversion (Time / +//! Timestamp / LocalZonedTimestamp / Blob), but deferred the string / range +//! leaves. +//! +//! The consolidation keeps the complete leaf dispatch (from Parquet), the +//! compound walker + `filter_record_batch_by_predicates` (from Vortex), and the +//! broader `literal_scalar_for_arrow_filter` (from Vortex). As a result the +//! string / range leaves are now evaluated everywhere the shared walker runs — +//! they are no longer silently deferred. +//! +//! `NULL` rows collapse to `false` via [`sanitize_filter_mask`], matching the +//! evaluator's residual-filter convention everywhere. +//! +//! The module is always compiled (independent of the `vortex` feature), so it +//! must not reference any vortex-specific types. + +use crate::arrow::format::FilePredicates; +use crate::spec::{DataField, DataType, Datum, Predicate, PredicateOperator}; +use crate::Error; +use arrow_array::{ + Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Datum as ArrowDatum, Decimal128Array, + Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, RecordBatch, Scalar, + StringArray, Time32MillisecondArray, TimestampMicrosecondArray, TimestampMillisecondArray, + TimestampNanosecondArray, +}; +use arrow_ord::cmp::{ + eq as arrow_eq, gt as arrow_gt, gt_eq as arrow_gt_eq, lt as arrow_lt, lt_eq as arrow_lt_eq, + neq as arrow_neq, +}; +use arrow_schema::ArrowError; +use arrow_string::like::{ + contains as arrow_contains, ends_with as arrow_ends_with, like as arrow_like, + starts_with as arrow_starts_with, +}; +use std::sync::Arc; + +/// Filter a [`RecordBatch`] to exactly the rows satisfying `predicates`. +/// +/// `scan_fields` describes the columns actually present in `batch` (in order); +/// `predicates.file_fields` describes the full file schema the predicate indices +/// point into. Each leaf's `file_field` is resolved to a `batch` column via +/// [`same_data_field`]. When no predicate produces a mask (e.g. empty predicate +/// list), the batch is returned unchanged. +pub(crate) fn filter_record_batch_by_predicates( + batch: RecordBatch, + predicates: &FilePredicates, + scan_fields: &[DataField], +) -> crate::Result { + let Some(mask) = evaluate_predicates_mask( + &batch, + &predicates.predicates, + &predicates.file_fields, + scan_fields, + )? + else { + return Ok(batch); + }; + + arrow_select::filter::filter_record_batch(&batch, &mask).map_err(|e| Error::DataInvalid { + message: format!("Failed to filter RecordBatch by predicates: {e}"), + source: Some(Box::new(e)), + }) +} + +/// Evaluate the conjunction of `predicates` against `batch`, returning the +/// combined boolean row mask, or `None` when no predicate contributed a mask +/// (identity — keep every row). +pub(crate) fn evaluate_predicates_mask( + batch: &RecordBatch, + predicates: &[Predicate], + file_fields: &[DataField], + scan_fields: &[DataField], +) -> crate::Result> { + let mut combined = None; + for predicate in predicates { + let Some(mask) = evaluate_predicate_mask(batch, predicate, file_fields, scan_fields)? + else { + continue; + }; + combined = Some(match combined { + Some(existing) => combine_filter_masks(&existing, &mask, false), + None => mask, + }); + } + Ok(combined) +} + +fn evaluate_predicate_mask( + batch: &RecordBatch, + predicate: &Predicate, + file_fields: &[DataField], + scan_fields: &[DataField], +) -> crate::Result> { + match predicate { + Predicate::AlwaysTrue => Ok(Some(BooleanArray::from(vec![true; batch.num_rows()]))), + Predicate::AlwaysFalse => Ok(Some(BooleanArray::from(vec![false; batch.num_rows()]))), + Predicate::And(children) => { + let mut combined = None; + for child in children { + let Some(mask) = evaluate_predicate_mask(batch, child, file_fields, scan_fields)? + else { + continue; + }; + combined = Some(match combined { + Some(existing) => combine_filter_masks(&existing, &mask, false), + None => mask, + }); + } + Ok(combined) + } + Predicate::Or(children) => { + let mut combined = BooleanArray::from(vec![false; batch.num_rows()]); + for child in children { + let Some(mask) = evaluate_predicate_mask(batch, child, file_fields, scan_fields)? + else { + return Ok(None); + }; + combined = combine_filter_masks(&combined, &mask, true); + } + Ok(Some(combined)) + } + Predicate::Not(inner) => { + let Some(mask) = evaluate_predicate_mask(batch, inner, file_fields, scan_fields)? + else { + return Ok(None); + }; + Ok(Some(boolean_mask_from_predicate(mask.len(), |row_index| { + !mask.value(row_index) + }))) + } + Predicate::Leaf { + index, + op, + literals, + data_type: predicate_data_type, + .. + } => { + let Some(file_field) = file_fields.get(*index) else { + return Ok(None); + }; + // Resolve the predicate column in the batch by NAME against the batch's + // own schema. We must not index by the column's position in + // `scan_fields`: a reader's emitted batch may order columns by its file + // schema (e.g. ORC `ProjectionMask::named_roots`), not by `scan_fields` + // order, so positional indexing can select the wrong column (and + // compare mismatched types). `scan_fields` is used only to detect the + // Gap-A "predicate column not scanned" bug below. + let column = batch + .schema() + .index_of(file_field.name()) + .ok() + .map(|batch_index| batch.column(batch_index)); + let Some(column) = column else { + // The predicate column exists in the file schema but is absent + // from the batch actually scanned — this is the Gap-A bug (a + // reader that did not widen its scan to include predicate columns + // before filtering). It must never happen. Fail loudly in + // debug/test builds; degrade to a skip (rather than panic) in + // release. `scan_fields` is unused for resolution now (we look up + // by name in the batch), so touch it here only to keep the guard + // message informative. + let _ = scan_fields; + debug_assert!( + false, + "residual predicate column '{}' exists in file_fields but is missing from the scanned batch; the reader must widen its scan to include predicate columns", + file_field.name() + ); + return Ok(None); + }; + // Evaluate against the predicate's declared type (the table type), + // not the file column's type. Under schema evolution these differ + // (e.g. an old INT file column for a promoted BIGINT table column): + // building the literal in the narrower file type could fail for an + // out-of-range value, incorrectly erroring when the exact answer is + // simply "no rows". Promotion is always widening, so cast the decoded + // column up to the predicate type first — then the literal is always + // representable and the comparison is exact. + let predicate_arrow_type = crate::arrow::paimon_type_to_arrow(predicate_data_type)?; + let mask = if column.data_type() == &predicate_arrow_type { + evaluate_exact_leaf_predicate(column, file_field.data_type(), *op, literals) + } else { + let cast_column = arrow_cast::cast(column, &predicate_arrow_type).map_err(|e| { + Error::DataInvalid { + message: format!( + "Failed to cast residual column '{}' from {:?} to {:?}: {e}", + file_field.name(), + column.data_type(), + predicate_arrow_type + ), + source: Some(Box::new(e)), + } + })?; + evaluate_exact_leaf_predicate(&cast_column, predicate_data_type, *op, literals) + } + .map_err(|e| Error::DataInvalid { + message: format!("Failed to evaluate residual predicate: {e}"), + source: Some(Box::new(e)), + })?; + Ok(Some(mask)) + } + } +} + +/// Two [`DataField`]s refer to the same logical column when their IDs match, or +/// (for schemas without stable IDs) their names match. +pub(crate) fn same_data_field(left: &DataField, right: &DataField) -> bool { + left.id() == right.id() || left.name() == right.name() +} + +/// Widen `read_fields` to include every column referenced by `predicates` that +/// is not already projected, so a reader scans `read_fields ∪ predicate columns` +/// and the residual filter can see every predicate column. +/// +/// Deduped by [`same_data_field`]: a predicate column already present in +/// `read_fields` is not added twice. When `predicates` is `None`, `read_fields` +/// is returned unchanged. +pub(crate) fn widen_scan_fields( + read_fields: &[DataField], + predicates: Option<&FilePredicates>, +) -> Vec { + let mut fields = read_fields.to_vec(); + + if let Some(fp) = predicates { + let mut predicate_indices = Vec::new(); + for predicate in &fp.predicates { + collect_predicate_field_indices(predicate, &mut predicate_indices); + } + for index in predicate_indices { + if let Some(field) = fp.file_fields.get(index) { + push_unique_scan_field(&mut fields, field); + } + } + } + + fields +} + +pub(crate) fn collect_predicate_field_indices(predicate: &Predicate, indices: &mut Vec) { + match predicate { + Predicate::Leaf { index, .. } => indices.push(*index), + Predicate::And(children) | Predicate::Or(children) => { + for child in children { + collect_predicate_field_indices(child, indices); + } + } + Predicate::Not(inner) => collect_predicate_field_indices(inner, indices), + Predicate::AlwaysTrue | Predicate::AlwaysFalse => {} + } +} + +pub(crate) fn push_unique_scan_field(fields: &mut Vec, field: &DataField) { + if !fields + .iter() + .any(|existing| same_data_field(existing, field)) + { + fields.push(field.clone()); + } +} + +/// Error for a leaf predicate whose literal(s) cannot be converted to an Arrow +/// scalar for the column's type (e.g. a decimal literal whose scale differs from +/// the column, or a malformed leaf with too few literals). The residual pass is +/// the last line of exactness, so it must error rather than pass all rows. +fn unconvertible_literal_error(op: PredicateOperator, data_type: &DataType) -> ArrowError { + ArrowError::ComputeError(format!( + "residual predicate operator {op:?} has a literal that cannot be evaluated exactly against column type {data_type:?}" + )) +} + +/// Evaluate a single leaf predicate against a decoded column, producing an +/// exact boolean row mask with the `NULL` → `false` convention applied. +/// This is the *complete* leaf dispatch: comparison, null checks, +/// set-membership, the string operators (`StartsWith` / `EndsWith` / `Contains` +/// / `Like`), and the range operators (`Between` / `NotBetween`). +pub(crate) fn evaluate_exact_leaf_predicate( + array: &ArrayRef, + data_type: &DataType, + op: PredicateOperator, + literals: &[Datum], +) -> Result { + // Decimals are compared by mathematical value across scales (Paimon + // `datum_cmp`/`decimal_cmp`). Arrow scalar comparison requires the literal to + // be representable at the column scale, which fails for a finer-scale literal + // (e.g. `d > 1.05` on a DECIMAL(_,1) column). Route all decimal value + // comparisons through a row-wise `datum_cmp` path that is exact for any scale + // combination. IsNull/IsNotNull carry no literal and stay on the generic path. + if matches!(array.data_type(), arrow_schema::DataType::Decimal128(_, _)) + && !matches!(op, PredicateOperator::IsNull | PredicateOperator::IsNotNull) + { + return evaluate_decimal_leaf(array, op, literals); + } + match op { + PredicateOperator::IsNull => Ok(boolean_mask_from_predicate(array.len(), |row_index| { + array.is_null(row_index) + })), + PredicateOperator::IsNotNull => Ok(boolean_mask_from_predicate(array.len(), |row_index| { + array.is_valid(row_index) + })), + PredicateOperator::In | PredicateOperator::NotIn => { + evaluate_set_membership_predicate(array, data_type, op, literals) + } + PredicateOperator::Eq + | PredicateOperator::NotEq + | PredicateOperator::Lt + | PredicateOperator::LtEq + | PredicateOperator::Gt + | PredicateOperator::GtEq + | PredicateOperator::StartsWith + | PredicateOperator::EndsWith + | PredicateOperator::Contains + | PredicateOperator::Like => { + let Some(literal) = literals.first() else { + return Err(unconvertible_literal_error(op, data_type)); + }; + let Some(scalar) = literal_scalar_for_arrow_filter(literal, data_type) + .map_err(|e| ArrowError::ComputeError(e.to_string()))? + else { + // The literal cannot be converted to an Arrow scalar for this + // column type (e.g. a decimal literal whose scale differs from + // the column). Erroring is required for correctness: returning + // all-true here would silently pass every row (a wrong-read), + // and this residual pass is the only place the predicate is + // enforced when the row filter did not accept the leaf. + return Err(unconvertible_literal_error(op, data_type)); + }; + let result = evaluate_column_predicate(array, &scalar, op)?; + Ok(sanitize_filter_mask(result)) + } + PredicateOperator::Between | PredicateOperator::NotBetween => { + evaluate_between_predicate(array, data_type, op, literals) + } + } +} + +/// Evaluate a value comparison on a Decimal128 column using Paimon's by-value +/// decimal semantics (`datum_cmp`), which normalizes across scales. This is +/// exact for any (column scale, literal scale) combination and any comparison +/// operator, including finer-scale literals that cannot be represented at the +/// column scale (e.g. `d > 1.05` on a DECIMAL(_,1) column is exactly `d >= 1.1`). +/// NULL rows stay NULL in the mask (collapsed to `false` by the caller). +fn evaluate_decimal_leaf( + array: &ArrayRef, + op: PredicateOperator, + literals: &[Datum], +) -> Result { + use std::cmp::Ordering; + + let decimals = array + .as_any() + .downcast_ref::() + .ok_or_else(|| { + ArrowError::ComputeError("decimal predicate expects a Decimal128 column".to_string()) + })?; + let col_scale = match array.data_type() { + arrow_schema::DataType::Decimal128(_, s) => u32::try_from(*s).map_err(|_| { + ArrowError::ComputeError("negative decimal scale is not supported".to_string()) + })?, + _ => unreachable!("guarded by caller"), + }; + + // Compare one column value (as a Datum::Decimal at the column scale) against a + // literal Datum. `precision` is irrelevant to `datum_cmp` (it compares by + // value), so any value is fine. `None` means the cross-scale normalization + // overflowed i128 — surface it rather than silently drop rows. + let cmp = |v: i128, lit: &Datum| -> Result { + let cell = Datum::Decimal { + unscaled: v, + precision: 38, + scale: col_scale, + }; + crate::spec::datum_cmp(&cell, lit).ok_or_else(|| { + ArrowError::ComputeError( + "decimal comparison overflowed while normalizing scales".to_string(), + ) + }) + }; + + let require_decimal = |lit: &Datum| -> Result<(), ArrowError> { + match lit { + Datum::Decimal { .. } => Ok(()), + _ => Err(ArrowError::ComputeError( + "decimal column compared against a non-decimal literal".to_string(), + )), + } + }; + + let mut builder = Vec::with_capacity(decimals.len()); + for i in 0..decimals.len() { + if decimals.is_null(i) { + builder.push(None); + continue; + } + let v = decimals.value(i); + let keep = match op { + PredicateOperator::Eq + | PredicateOperator::NotEq + | PredicateOperator::Lt + | PredicateOperator::LtEq + | PredicateOperator::Gt + | PredicateOperator::GtEq => { + let lit = literals + .first() + .ok_or_else(|| ArrowError::ComputeError("missing literal".to_string()))?; + require_decimal(lit)?; + let ord = cmp(v, lit)?; + match op { + PredicateOperator::Eq => ord == Ordering::Equal, + PredicateOperator::NotEq => ord != Ordering::Equal, + PredicateOperator::Lt => ord == Ordering::Less, + PredicateOperator::LtEq => ord != Ordering::Greater, + PredicateOperator::Gt => ord == Ordering::Greater, + PredicateOperator::GtEq => ord != Ordering::Less, + _ => unreachable!(), + } + } + PredicateOperator::In | PredicateOperator::NotIn => { + let mut any_eq = false; + for lit in literals { + require_decimal(lit)?; + if cmp(v, lit)? == Ordering::Equal { + any_eq = true; + break; + } + } + if matches!(op, PredicateOperator::In) { + any_eq + } else { + !any_eq + } + } + PredicateOperator::Between | PredicateOperator::NotBetween => { + let (Some(low), Some(high)) = (literals.first(), literals.get(1)) else { + return Err(ArrowError::ComputeError( + "between requires two literals".to_string(), + )); + }; + require_decimal(low)?; + require_decimal(high)?; + let in_range = cmp(v, low)? != Ordering::Less && cmp(v, high)? != Ordering::Greater; + if matches!(op, PredicateOperator::Between) { + in_range + } else { + !in_range + } + } + _ => { + return Err(ArrowError::ComputeError(format!( + "operator {op:?} is not a decimal value comparison" + ))) + } + }; + builder.push(Some(keep)); + } + Ok(BooleanArray::from(builder)) +} + +/// `Between` / `NotBetween` translate to `gt_eq(col, low) & lt_eq(col, high)` +/// (and its negation). `arrow_ord::cmp` produces nullable masks: any null +/// row makes the comparison null, so a fully-built `Between` mask preserves +/// nulls. `NotBetween` then negates valid rows and leaves nulls null — +/// matching SQL three-valued logic; `sanitize_filter_mask` collapses nulls +/// into `false` to match the predicate evaluator's "NULL → false" rule. +fn evaluate_between_predicate( + array: &ArrayRef, + data_type: &DataType, + op: PredicateOperator, + literals: &[Datum], +) -> Result { + let (Some(low), Some(high)) = (literals.first(), literals.get(1)) else { + return Err(unconvertible_literal_error(op, data_type)); + }; + let Some(low_scalar) = literal_scalar_for_arrow_filter(low, data_type) + .map_err(|e| ArrowError::ComputeError(e.to_string()))? + else { + return Err(unconvertible_literal_error(op, data_type)); + }; + let Some(high_scalar) = literal_scalar_for_arrow_filter(high, data_type) + .map_err(|e| ArrowError::ComputeError(e.to_string()))? + else { + return Err(unconvertible_literal_error(op, data_type)); + }; + // Delegate the two bound comparisons to `evaluate_column_predicate` rather + // than calling `arrow_gt_eq`/`arrow_lt_eq` directly, so Between inherits the + // type-faithful comparison paths (e.g. signed-byte order for Binary). Using + // Arrow's kernels here directly would reintroduce unsigned binary ordering. + let lo_mask = evaluate_column_predicate(array, &low_scalar, PredicateOperator::GtEq)?; + let hi_mask = evaluate_column_predicate(array, &high_scalar, PredicateOperator::LtEq)?; + let between = arrow_arith::boolean::and_kleene(&lo_mask, &hi_mask)?; + let result = match op { + PredicateOperator::Between => between, + PredicateOperator::NotBetween => arrow_arith::boolean::not(&between)?, + _ => unreachable!(), + }; + Ok(sanitize_filter_mask(result)) +} + +fn evaluate_set_membership_predicate( + array: &ArrayRef, + data_type: &DataType, + op: PredicateOperator, + literals: &[Datum], +) -> Result { + if literals.is_empty() { + return Ok(match op { + PredicateOperator::In => BooleanArray::from(vec![false; array.len()]), + PredicateOperator::NotIn => { + boolean_mask_from_predicate(array.len(), |row_index| array.is_valid(row_index)) + } + _ => unreachable!(), + }); + } + + let mut combined = match op { + PredicateOperator::In => BooleanArray::from(vec![false; array.len()]), + PredicateOperator::NotIn => { + boolean_mask_from_predicate(array.len(), |row_index| array.is_valid(row_index)) + } + _ => unreachable!(), + }; + + for literal in literals { + let Some(scalar) = literal_scalar_for_arrow_filter(literal, data_type) + .map_err(|e| ArrowError::ComputeError(e.to_string()))? + else { + return Err(unconvertible_literal_error(op, data_type)); + }; + let comparison_op = match op { + PredicateOperator::In => PredicateOperator::Eq, + PredicateOperator::NotIn => PredicateOperator::NotEq, + _ => unreachable!(), + }; + let mask = sanitize_filter_mask(evaluate_column_predicate(array, &scalar, comparison_op)?); + combined = combine_filter_masks(&combined, &mask, matches!(op, PredicateOperator::In)); + } + + Ok(combined) +} + +fn evaluate_column_predicate( + column: &ArrayRef, + scalar: &Scalar, + op: PredicateOperator, +) -> Result { + // Binary ordering must match Paimon's Datum::Bytes semantics (Java signed-byte + // order, 0xFF < 0x00), which Arrow's unsigned byte comparison does not. Route + // ordering ops on Binary/VarBinary columns through the signed comparator. + // Eq/NotEq are order-independent, so Arrow's kernels are correct for them. + if matches!( + column.data_type(), + arrow_schema::DataType::Binary | arrow_schema::DataType::LargeBinary + ) && matches!( + op, + PredicateOperator::Lt + | PredicateOperator::LtEq + | PredicateOperator::Gt + | PredicateOperator::GtEq + ) { + return evaluate_binary_ordering_predicate(column, scalar, op); + } + match op { + PredicateOperator::Eq => arrow_eq(column, scalar), + PredicateOperator::NotEq => arrow_neq(column, scalar), + PredicateOperator::Lt => arrow_lt(column, scalar), + PredicateOperator::LtEq => arrow_lt_eq(column, scalar), + PredicateOperator::Gt => arrow_gt(column, scalar), + PredicateOperator::GtEq => arrow_gt_eq(column, scalar), + PredicateOperator::StartsWith + | PredicateOperator::EndsWith + | PredicateOperator::Contains + | PredicateOperator::Like => { + let pattern = pattern_scalar_for_string_kernel(scalar, column.data_type())?; + match op { + PredicateOperator::StartsWith => arrow_starts_with(column, &pattern), + PredicateOperator::EndsWith => arrow_ends_with(column, &pattern), + PredicateOperator::Contains => arrow_contains(column, &pattern), + PredicateOperator::Like => arrow_like(column, &pattern), + _ => unreachable!(), + } + } + PredicateOperator::IsNull + | PredicateOperator::IsNotNull + | PredicateOperator::In + | PredicateOperator::NotIn + | PredicateOperator::Between + | PredicateOperator::NotBetween => Ok(BooleanArray::new_null(column.len())), + } +} + +/// Evaluate an ordering predicate (`Lt`/`LtEq`/`Gt`/`GtEq`) on a Binary column +/// using Paimon's Java signed-byte order, matching `Datum::Bytes` semantics. +/// `scalar` is a single-element Binary array (the literal). NULL column rows +/// produce NULL in the mask (later collapsed to `false` by `sanitize_filter_mask`). +fn evaluate_binary_ordering_predicate( + column: &ArrayRef, + scalar: &Scalar, + op: PredicateOperator, +) -> Result { + use arrow_array::cast::AsArray; + use std::cmp::Ordering; + + // The scalar wraps a length-1 Binary array holding the literal bytes. + let (scalar_array, _) = scalar.get(); + let literal: &[u8] = if let Some(a) = scalar_array.as_binary_opt::() { + a.value(0) + } else if let Some(a) = scalar_array.as_binary_opt::() { + a.value(0) + } else { + return Err(ArrowError::ComputeError( + "binary ordering predicate expects a Binary literal".to_string(), + )); + }; + + // Row-wise comparison via the shared signed-byte comparator (single source of + // truth with `Datum` ordering). + let compare = |bytes: &[u8]| -> bool { + let ord = crate::spec::java_bytes_cmp(bytes, literal); + match op { + PredicateOperator::Lt => ord == Ordering::Less, + PredicateOperator::LtEq => ord != Ordering::Greater, + PredicateOperator::Gt => ord == Ordering::Greater, + PredicateOperator::GtEq => ord != Ordering::Less, + _ => unreachable!("only ordering ops reach here"), + } + }; + + let mask: BooleanArray = if let Some(a) = column.as_binary_opt::() { + a.iter().map(|v| v.map(compare)).collect() + } else if let Some(a) = column.as_binary_opt::() { + a.iter().map(|v| v.map(compare)).collect() + } else { + return Err(ArrowError::ComputeError( + "binary ordering predicate expects a Binary column".to_string(), + )); + }; + Ok(mask) +} + +/// `arrow_string::like::*` kernels reject mismatched string types — Utf8 column +/// against Utf8 pattern is fine, but a LargeUtf8 / Utf8View column needs a +/// pattern of the same flavour. The shared scalar built upstream is always +/// `StringArray` (Utf8); promote it to match the column when needed. +fn pattern_scalar_for_string_kernel( + scalar: &Scalar, + column_type: &arrow_schema::DataType, +) -> Result, ArrowError> { + use arrow_array::{LargeStringArray, StringArray, StringViewArray}; + use arrow_schema::DataType as ArrowDataType; + + let arr = scalar.get().0; + let value = arr + .as_any() + .downcast_ref::() + .and_then(|s| (s.len() == 1 && s.is_valid(0)).then(|| s.value(0).to_string())); + let Some(value) = value else { + return Ok(scalar.clone()); + }; + Ok(match column_type { + ArrowDataType::Utf8 => Scalar::new(Arc::new(StringArray::from(vec![value])) as ArrayRef), + ArrowDataType::LargeUtf8 => { + Scalar::new(Arc::new(LargeStringArray::from(vec![value])) as ArrayRef) + } + ArrowDataType::Utf8View => { + Scalar::new(Arc::new(StringViewArray::from(vec![value])) as ArrayRef) + } + ArrowDataType::Dictionary(_, value_type) if value_type.as_ref() == &ArrowDataType::Utf8 => { + Scalar::new(Arc::new(StringArray::from(vec![value])) as ArrayRef) + } + other => { + return Err(ArrowError::InvalidArgumentError(format!( + "string predicate against non-string column type {other:?}" + ))) + } + }) +} + +/// Collapse `NULL` mask entries to `false`, matching the residual-filter +/// convention. A mask with no nulls is returned unchanged. +pub(crate) fn sanitize_filter_mask(mask: BooleanArray) -> BooleanArray { + if mask.null_count() == 0 { + return mask; + } + + boolean_mask_from_predicate(mask.len(), |row_index| { + mask.is_valid(row_index) && mask.value(row_index) + }) +} + +fn combine_filter_masks(left: &BooleanArray, right: &BooleanArray, use_or: bool) -> BooleanArray { + debug_assert_eq!(left.len(), right.len()); + boolean_mask_from_predicate(left.len(), |row_index| { + if use_or { + left.value(row_index) || right.value(row_index) + } else { + left.value(row_index) && right.value(row_index) + } + }) +} + +fn boolean_mask_from_predicate( + len: usize, + mut predicate: impl FnMut(usize) -> bool, +) -> BooleanArray { + BooleanArray::from((0..len).map(&mut predicate).collect::>()) +} + +// --------------------------------------------------------------------------- +// Literal → Arrow scalar conversion +// --------------------------------------------------------------------------- + +/// Convert a paimon [`Datum`] literal into an Arrow scalar matching +/// `file_data_type`, or `None` when the literal / type pair is unsupported for +/// filtering (in which case the caller falls open, keeping the row). +pub(crate) fn literal_scalar_for_arrow_filter( + literal: &Datum, + file_data_type: &DataType, +) -> crate::Result>> { + let array: ArrayRef = match file_data_type { + DataType::Boolean(_) => match literal { + Datum::Bool(value) => Arc::new(BooleanArray::new_scalar(*value).into_inner()), + _ => return Ok(None), + }, + DataType::TinyInt(_) => { + match integer_literal(literal).and_then(|value| i8::try_from(value).ok()) { + Some(value) => Arc::new(Int8Array::new_scalar(value).into_inner()), + None => return Ok(None), + } + } + DataType::SmallInt(_) => { + match integer_literal(literal).and_then(|value| i16::try_from(value).ok()) { + Some(value) => Arc::new(Int16Array::new_scalar(value).into_inner()), + None => return Ok(None), + } + } + DataType::Int(_) => { + match integer_literal(literal).and_then(|value| i32::try_from(value).ok()) { + Some(value) => Arc::new(Int32Array::new_scalar(value).into_inner()), + None => return Ok(None), + } + } + DataType::BigInt(_) => { + match integer_literal(literal).and_then(|value| i64::try_from(value).ok()) { + Some(value) => Arc::new(Int64Array::new_scalar(value).into_inner()), + None => return Ok(None), + } + } + DataType::Float(_) => match float32_literal(literal) { + Some(value) => Arc::new(Float32Array::new_scalar(value).into_inner()), + None => return Ok(None), + }, + DataType::Double(_) => match float64_literal(literal) { + Some(value) => Arc::new(Float64Array::new_scalar(value).into_inner()), + None => return Ok(None), + }, + DataType::Char(_) | DataType::VarChar(_) => match literal { + Datum::String(value) => Arc::new(StringArray::new_scalar(value.as_str()).into_inner()), + _ => return Ok(None), + }, + DataType::Binary(_) | DataType::VarBinary(_) | DataType::Blob(_) => match literal { + Datum::Bytes(value) => Arc::new(BinaryArray::new_scalar(value.as_slice()).into_inner()), + _ => return Ok(None), + }, + DataType::Date(_) => match literal { + Datum::Date(value) => Arc::new(Date32Array::new_scalar(*value).into_inner()), + _ => return Ok(None), + }, + DataType::Time(_) => match literal { + Datum::Time(value) => Arc::new(Time32MillisecondArray::new_scalar(*value).into_inner()), + _ => return Ok(None), + }, + DataType::Timestamp(ts) => match literal { + Datum::Timestamp { millis, nanos } => { + let Some(array) = timestamp_scalar(*millis, *nanos, ts.precision(), None)? else { + return Ok(None); + }; + array + } + _ => return Ok(None), + }, + DataType::LocalZonedTimestamp(ts) => match literal { + Datum::LocalZonedTimestamp { millis, nanos } => { + let Some(array) = timestamp_scalar(*millis, *nanos, ts.precision(), Some("UTC"))? + else { + return Ok(None); + }; + array + } + _ => return Ok(None), + }, + DataType::Decimal(_) => { + // Decimals never reach here: `evaluate_exact_leaf_predicate` routes + // decimal columns to `evaluate_decimal_leaf`, which compares by value + // across scales (matching Paimon `datum_cmp`). Returning `None` is a + // defensive guard — if a decimal ever reached the Arrow-scalar path it + // would surface as an unconvertible-literal error rather than a silent + // wrong-read. + return Ok(None); + } + DataType::Array(_) + | DataType::Map(_) + | DataType::Multiset(_) + | DataType::Row(_) + | DataType::Vector(_) => { + return Ok(None); + } + }; + + Ok(Some(Scalar::new(array))) +} + +fn timestamp_scalar( + millis: i64, + nanos: i32, + precision: u32, + timezone: Option<&'static str>, +) -> crate::Result> { + let array: ArrayRef = match precision { + 0..=3 => { + let array = TimestampMillisecondArray::new_scalar(millis).into_inner(); + match timezone { + Some(tz) => Arc::new(array.with_timezone(tz)), + None => Arc::new(array), + } + } + 4..=6 => { + let value = millis * 1_000 + (nanos as i64) / 1_000; + let array = TimestampMicrosecondArray::new_scalar(value).into_inner(); + match timezone { + Some(tz) => Arc::new(array.with_timezone(tz)), + None => Arc::new(array), + } + } + 7..=9 => { + let value = millis * 1_000_000 + (nanos as i64); + let array = TimestampNanosecondArray::new_scalar(value).into_inner(); + match timezone { + Some(tz) => Arc::new(array.with_timezone(tz)), + None => Arc::new(array), + } + } + _ => return Ok(None), + }; + Ok(Some(array)) +} + +fn integer_literal(literal: &Datum) -> Option { + match literal { + Datum::TinyInt(value) => Some(i128::from(*value)), + Datum::SmallInt(value) => Some(i128::from(*value)), + Datum::Int(value) => Some(i128::from(*value)), + Datum::Long(value) => Some(i128::from(*value)), + _ => None, + } +} + +fn float32_literal(literal: &Datum) -> Option { + match literal { + Datum::Float(value) => Some(*value), + Datum::Double(value) => { + let casted = *value as f32; + ((casted as f64) == *value).then_some(casted) + } + _ => None, + } +} + +fn float64_literal(literal: &Datum) -> Option { + match literal { + Datum::Float(value) => Some(f64::from(*value)), + Datum::Double(value) => Some(*value), + _ => None, + } +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +#[cfg(test)] +mod tests { + use super::*; + use crate::spec::{IntType, VarCharType}; + use arrow_array::{Int32Array, StringArray}; + use arrow_schema::{DataType as ArrowDataType, Field as ArrowField, Schema as ArrowSchema}; + use std::sync::Arc; + + fn int_field(id: i32, name: &str) -> DataField { + DataField::new(id, name.to_string(), DataType::Int(IntType::new())) + } + + fn str_field(id: i32, name: &str) -> DataField { + DataField::new( + id, + name.to_string(), + DataType::VarChar(VarCharType::string_type()), + ) + } + + fn leaf( + index: usize, + data_type: DataType, + op: PredicateOperator, + literals: Vec, + ) -> Predicate { + Predicate::Leaf { + column: format!("c{index}"), + index, + data_type, + op, + literals, + } + } + + fn file_predicates(predicates: Vec, file_fields: Vec) -> FilePredicates { + FilePredicates { + predicates, + file_fields, + } + } + + fn int_batch(name: &str, values: Vec>) -> RecordBatch { + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + name, + ArrowDataType::Int32, + true, + )])); + RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(values))]).unwrap() + } + + fn str_batch(name: &str, values: Vec>) -> RecordBatch { + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + name, + ArrowDataType::Utf8, + true, + )])); + RecordBatch::try_new(schema, vec![Arc::new(StringArray::from(values))]).unwrap() + } + + fn int_values(batch: &RecordBatch) -> Vec { + batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap() + .values() + .to_vec() + } + + fn str_values(batch: &RecordBatch) -> Vec { + let col = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + (0..col.len()).map(|i| col.value(i).to_string()).collect() + } + + #[test] + fn test_gt_filters_exactly() { + let f = int_field(0, "age"); + let b = int_batch( + "age", + vec![Some(10), Some(20), Some(30), Some(40), Some(50)], + ); + let pred = leaf( + 0, + DataType::Int(IntType::new()), + PredicateOperator::Gt, + vec![Datum::Int(25)], + ); + let fp = file_predicates(vec![pred], vec![f.clone()]); + let out = filter_record_batch_by_predicates(b, &fp, &[f]).unwrap(); + assert_eq!(out.num_rows(), 3); + assert_eq!(int_values(&out), vec![30, 40, 50]); + } + + #[test] + fn test_starts_with_exact() { + let f = str_field(0, "name"); + let b = str_batch( + "name", + vec![ + Some("apple"), + Some("banana"), + Some("apricot"), + Some("cherry"), + ], + ); + let pred = leaf( + 0, + DataType::VarChar(VarCharType::string_type()), + PredicateOperator::StartsWith, + vec![Datum::String("ap".to_string())], + ); + let fp = file_predicates(vec![pred], vec![f.clone()]); + let out = filter_record_batch_by_predicates(b, &fp, &[f]).unwrap(); + assert_eq!(str_values(&out), vec!["apple", "apricot"]); + } + + #[test] + fn test_ends_with_exact() { + let f = str_field(0, "name"); + let b = str_batch( + "name", + vec![ + Some("apple"), + Some("banana"), + Some("apricot"), + Some("cherry"), + ], + ); + let pred = leaf( + 0, + DataType::VarChar(VarCharType::string_type()), + PredicateOperator::EndsWith, + vec![Datum::String("y".to_string())], + ); + let fp = file_predicates(vec![pred], vec![f.clone()]); + let out = filter_record_batch_by_predicates(b, &fp, &[f]).unwrap(); + assert_eq!(str_values(&out), vec!["cherry"]); + } + + #[test] + fn test_contains_exact() { + let f = str_field(0, "name"); + let b = str_batch( + "name", + vec![ + Some("apple"), + Some("banana"), + Some("apricot"), + Some("cherry"), + ], + ); + let pred = leaf( + 0, + DataType::VarChar(VarCharType::string_type()), + PredicateOperator::Contains, + vec![Datum::String("err".to_string())], + ); + let fp = file_predicates(vec![pred], vec![f.clone()]); + let out = filter_record_batch_by_predicates(b, &fp, &[f]).unwrap(); + assert_eq!(str_values(&out), vec!["cherry"]); + } + + #[test] + fn test_like_exact() { + let f = str_field(0, "name"); + let b = str_batch( + "name", + vec![ + Some("apple"), + Some("banana"), + Some("apricot"), + Some("cherry"), + ], + ); + let pred = leaf( + 0, + DataType::VarChar(VarCharType::string_type()), + PredicateOperator::Like, + vec![Datum::String("a%".to_string())], + ); + let fp = file_predicates(vec![pred], vec![f.clone()]); + let out = filter_record_batch_by_predicates(b, &fp, &[f]).unwrap(); + assert_eq!(str_values(&out), vec!["apple", "apricot"]); + } + + #[test] + fn test_between_inclusive_exact() { + let f = int_field(0, "age"); + let b = int_batch( + "age", + vec![Some(10), Some(20), Some(30), Some(40), Some(50)], + ); + let pred = leaf( + 0, + DataType::Int(IntType::new()), + PredicateOperator::Between, + vec![Datum::Int(20), Datum::Int(40)], + ); + let fp = file_predicates(vec![pred], vec![f.clone()]); + let out = filter_record_batch_by_predicates(b, &fp, &[f]).unwrap(); + // Inclusive boundaries → [20, 30, 40]. + assert_eq!(int_values(&out), vec![20, 30, 40]); + } + + #[test] + fn test_and_composes() { + // age > 15 AND age < 45 -> [20, 30, 40] + let f = int_field(0, "age"); + let b = int_batch( + "age", + vec![Some(10), Some(20), Some(30), Some(40), Some(50)], + ); + let pred = Predicate::And(vec![ + leaf( + 0, + DataType::Int(IntType::new()), + PredicateOperator::Gt, + vec![Datum::Int(15)], + ), + leaf( + 0, + DataType::Int(IntType::new()), + PredicateOperator::Lt, + vec![Datum::Int(45)], + ), + ]); + let fp = file_predicates(vec![pred], vec![f.clone()]); + let out = filter_record_batch_by_predicates(b, &fp, &[f]).unwrap(); + assert_eq!(int_values(&out), vec![20, 30, 40]); + } + + #[test] + fn test_or_composes() { + // age < 15 OR age > 45 -> [10, 50] + let f = int_field(0, "age"); + let b = int_batch( + "age", + vec![Some(10), Some(20), Some(30), Some(40), Some(50)], + ); + let pred = Predicate::Or(vec![ + leaf( + 0, + DataType::Int(IntType::new()), + PredicateOperator::Lt, + vec![Datum::Int(15)], + ), + leaf( + 0, + DataType::Int(IntType::new()), + PredicateOperator::Gt, + vec![Datum::Int(45)], + ), + ]); + let fp = file_predicates(vec![pred], vec![f.clone()]); + let out = filter_record_batch_by_predicates(b, &fp, &[f]).unwrap(); + assert_eq!(int_values(&out), vec![10, 50]); + } + + #[test] + fn test_not_composes() { + // NOT(age > 25) -> [10, 20] + let f = int_field(0, "age"); + let b = int_batch( + "age", + vec![Some(10), Some(20), Some(30), Some(40), Some(50)], + ); + let pred = Predicate::Not(Box::new(leaf( + 0, + DataType::Int(IntType::new()), + PredicateOperator::Gt, + vec![Datum::Int(25)], + ))); + let fp = file_predicates(vec![pred], vec![f.clone()]); + let out = filter_record_batch_by_predicates(b, &fp, &[f]).unwrap(); + assert_eq!(int_values(&out), vec![10, 20]); + } + + #[test] + fn test_null_row_excluded() { + // age = [10, null, 30]; age > 5 -> [10, 30] (null → false). + let f = int_field(0, "age"); + let b = int_batch("age", vec![Some(10), None, Some(30)]); + let pred = leaf( + 0, + DataType::Int(IntType::new()), + PredicateOperator::Gt, + vec![Datum::Int(5)], + ); + let fp = file_predicates(vec![pred], vec![f.clone()]); + let out = filter_record_batch_by_predicates(b, &fp, &[f]).unwrap(); + assert_eq!(int_values(&out), vec![10, 30]); + } + + fn name_age_batch(names: Vec<&str>, ages: Vec>) -> RecordBatch { + let schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("name", ArrowDataType::Utf8, true), + ArrowField::new("age", ArrowDataType::Int32, true), + ])); + RecordBatch::try_new( + schema, + vec![ + Arc::new(StringArray::from(names)), + Arc::new(Int32Array::from(ages)), + ], + ) + .unwrap() + } + + #[test] + fn test_widen_scan_fields_adds_predicate_columns_deduped() { + let name = str_field(1, "name"); + let age = int_field(2, "age"); + + // read_fields=[name]; predicate on age -> widened contains name AND age. + let pred = leaf( + 1, + DataType::Int(IntType::new()), + PredicateOperator::Gt, + vec![Datum::Int(25)], + ); + let fp = file_predicates(vec![pred], vec![name.clone(), age.clone()]); + let widened = widen_scan_fields(std::slice::from_ref(&name), Some(&fp)); + assert_eq!(widened.len(), 2); + assert_eq!(widened[0].name(), "name"); + assert_eq!(widened[1].name(), "age"); + + // If the predicate column is already in read_fields, no duplicate. + let pred_on_name = leaf( + 0, + DataType::VarChar(VarCharType::string_type()), + PredicateOperator::StartsWith, + vec![Datum::String("a".to_string())], + ); + let fp2 = file_predicates(vec![pred_on_name], vec![name.clone()]); + let widened2 = widen_scan_fields(std::slice::from_ref(&name), Some(&fp2)); + assert_eq!(widened2.len(), 1); + assert_eq!(widened2[0].name(), "name"); + + // No predicates -> read_fields unchanged. + let widened3 = widen_scan_fields(std::slice::from_ref(&name), None); + assert_eq!(widened3.len(), 1); + } + + #[test] + fn test_filter_on_batch_containing_predicate_col_is_exact() { + // batch has [name, age]; scan_fields=[name,age]; predicate age>25. + let name = str_field(1, "name"); + let age = int_field(2, "age"); + let batch = name_age_batch( + vec!["a", "b", "c", "d", "e"], + vec![Some(10), Some(20), Some(30), Some(40), Some(50)], + ); + let pred = leaf( + 1, + DataType::Int(IntType::new()), + PredicateOperator::Gt, + vec![Datum::Int(25)], + ); + let fp = file_predicates(vec![pred], vec![name.clone(), age.clone()]); + let scan_fields = vec![name, age]; + let out = filter_record_batch_by_predicates(batch, &fp, &scan_fields).unwrap(); + // Only matching rows kept, and the batch still has both columns. + assert_eq!(out.num_columns(), 2); + assert_eq!(out.num_rows(), 3); + let names = out + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!( + (0..names.len()) + .map(|i| names.value(i).to_string()) + .collect::>(), + vec!["c", "d", "e"] + ); + } + + #[test] + #[cfg(debug_assertions)] + #[should_panic] + fn test_missing_predicate_column_is_caught_not_silently_skipped() { + // batch has [name] only; predicate age>25 with age present in file_fields. + // The debug_assert must fire (Gap A would otherwise silently skip). + let name = str_field(1, "name"); + let age = int_field(2, "age"); + let batch = str_batch("name", vec![Some("a"), Some("b"), Some("c")]); + let pred = leaf( + 1, + DataType::Int(IntType::new()), + PredicateOperator::Gt, + vec![Datum::Int(25)], + ); + // file_fields has age; scan_fields (present in batch) is [name] only. + let fp = file_predicates(vec![pred], vec![name.clone(), age]); + let scan_fields = vec![name]; + let _ = filter_record_batch_by_predicates(batch, &fp, &scan_fields); + } + + #[test] + fn test_filter_when_batch_column_order_differs_from_scan_fields() { + // Regression: a reader (e.g. ORC `ProjectionMask::named_roots`) may emit + // batch columns in file-schema order, NOT scan_fields order. The residual + // evaluator must resolve the predicate column by NAME in the batch, not by + // its position in scan_fields — otherwise it compares the wrong column and + // fails with a type mismatch (e.g. "Utf8 == Int32"). + let name = str_field(1, "name"); + let age = int_field(2, "age"); + // scan_fields order is [name, age] (index of age == 1)... + let scan_fields = vec![name.clone(), age.clone()]; + // ...but the batch emits columns in the OPPOSITE order [age, name]. + let schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("age", ArrowDataType::Int32, true), + ArrowField::new("name", ArrowDataType::Utf8, true), + ])); + let batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(Int32Array::from(vec![Some(10), Some(20), Some(30)])), + Arc::new(StringArray::from(vec![Some("a"), Some("b"), Some("c")])), + ], + ) + .unwrap(); + // predicate age > 15; age is file_fields index 1. Positional indexing into + // the batch by scan_fields position (1) would hit the Utf8 `name` column. + let pred = leaf( + 1, + DataType::Int(IntType::new()), + PredicateOperator::Gt, + vec![Datum::Int(15)], + ); + let fp = file_predicates(vec![pred], vec![name, age]); + let out = filter_record_batch_by_predicates(batch, &fp, &scan_fields).unwrap(); + assert_eq!(out.num_rows(), 2); // age in {20, 30} + // age column is still at batch position 0. + let ages = out.column(0).as_any().downcast_ref::().unwrap(); + assert_eq!( + (0..ages.len()).map(|i| ages.value(i)).collect::>(), + vec![20, 30] + ); + } + + #[test] + fn test_unconvertible_literal_errors_not_fail_open() { + // A leaf whose literal cannot be converted to an Arrow scalar for the + // column type (here: an Int column compared to a Bool literal) must + // ERROR, not silently return all rows. The residual pass is the only + // enforcement point when the row filter rejected the leaf, so fail-open + // would be a silent wrong-read. + let age = int_field(0, "age"); + let b = int_batch("age", vec![Some(10), Some(20), Some(30)]); + let pred = leaf( + 0, + DataType::Int(IntType::new()), + PredicateOperator::Eq, + vec![Datum::Bool(true)], + ); + let fp = file_predicates(vec![pred], vec![age.clone()]); + let result = filter_record_batch_by_predicates(b, &fp, std::slice::from_ref(&age)); + assert!( + result.is_err(), + "unconvertible literal must error, not fail-open to all-rows" + ); + } + + #[test] + fn test_promoted_column_out_of_range_literal_yields_no_rows_not_error() { + // Schema evolution: table column promoted INT -> BIGINT, old file column + // is still INT. A predicate value beyond i32 range must yield "no rows" + // for this file, NOT an error. The residual casts the INT column up to + // the predicate's BIGINT type so the literal is representable. + use crate::spec::BigIntType; + let file_age = int_field(0, "age"); // file column: INT + let batch = int_batch("age", vec![Some(10), Some(20), Some(30)]); + let pred = leaf( + 0, + DataType::BigInt(BigIntType::new()), + PredicateOperator::Eq, + vec![Datum::Long(i64::from(i32::MAX) + 1)], + ); + let fp = file_predicates(vec![pred], vec![file_age.clone()]); + let out = + filter_record_batch_by_predicates(batch, &fp, std::slice::from_ref(&file_age)).unwrap(); + assert_eq!(out.num_rows(), 0, "no INT value can equal i32::MAX+1"); + } + + #[test] + fn test_promoted_column_in_range_literal_filters_exactly() { + use crate::spec::BigIntType; + let file_age = int_field(0, "age"); + let batch = int_batch("age", vec![Some(10), Some(20), Some(30)]); + let pred = leaf( + 0, + DataType::BigInt(BigIntType::new()), + PredicateOperator::Gt, + vec![Datum::Long(15)], + ); + let fp = file_predicates(vec![pred], vec![file_age.clone()]); + let out = + filter_record_batch_by_predicates(batch, &fp, std::slice::from_ref(&file_age)).unwrap(); + assert_eq!(out.num_rows(), 2, "age in {{20, 30}}"); + } + + #[test] + fn test_no_predicates_is_identity() { + let f = int_field(0, "age"); + let b = int_batch("age", vec![Some(10), Some(20), Some(30)]); + // evaluate_predicates_mask returns None with an empty predicate list. + let mask = + evaluate_predicates_mask(&b, &[], std::slice::from_ref(&f), std::slice::from_ref(&f)) + .unwrap(); + assert!(mask.is_none()); + // filter_record_batch_by_predicates leaves the batch unchanged. + let fp = file_predicates(vec![], vec![f.clone()]); + let out = filter_record_batch_by_predicates(b, &fp, &[f]).unwrap(); + assert_eq!(out.num_rows(), 3); + assert_eq!(int_values(&out), vec![10, 20, 30]); + } + + #[test] + fn test_binary_ordering_uses_java_signed_byte_order() { + // Paimon Datum::Bytes orders by signed byte (0xFF < 0x00). Arrow's + // unsigned comparison would order 0xFF as the largest. Verify the + // residual matches Paimon: filter `col > 0x00` must EXCLUDE 0xFF. + use crate::spec::BinaryType; + let col = DataField::new( + 0, + "b".to_string(), + DataType::Binary(BinaryType::new(1).unwrap()), + ); + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "b", + ArrowDataType::Binary, + true, + )])); + let values: Vec> = vec![Some(&[0x00]), Some(&[0x01]), Some(&[0xFF])]; + let batch = + RecordBatch::try_new(schema, vec![Arc::new(BinaryArray::from(values))]).unwrap(); + // col > 0x00 : signed order -> only 0x01 (0xFF is negative, < 0x00). + let pred = leaf( + 0, + DataType::Binary(BinaryType::new(1).unwrap()), + PredicateOperator::Gt, + vec![Datum::Bytes(vec![0x00])], + ); + let fp = file_predicates(vec![pred], vec![col.clone()]); + let out = + filter_record_batch_by_predicates(batch, &fp, std::slice::from_ref(&col)).unwrap(); + let out_col = out + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let got: Vec<&[u8]> = (0..out_col.len()).map(|i| out_col.value(i)).collect(); + assert_eq!( + got, + vec![&[0x01u8][..]], + "0xFF must be excluded (signed < 0x00)" + ); + } + + #[test] + fn test_binary_between_uses_java_signed_byte_order() { + // Between must inherit the signed-byte order too (regression: it called + // Arrow's unsigned gt_eq/lt_eq directly). `b BETWEEN 0xFF AND 0x01` is, + // under signed order, the range [-1, 1] -> keeps 0xFF(-1), 0x00(0), + // 0x01(1) and excludes 0x7F(127). Under Arrow's unsigned order it would be + // [255, 1] = empty, so this distinguishes the two. + use crate::spec::BinaryType; + let col = DataField::new( + 0, + "b".to_string(), + DataType::Binary(BinaryType::new(1).unwrap()), + ); + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "b", + ArrowDataType::Binary, + true, + )])); + let values: Vec> = + vec![Some(&[0xFF]), Some(&[0x00]), Some(&[0x01]), Some(&[0x7F])]; + let batch = + RecordBatch::try_new(schema, vec![Arc::new(BinaryArray::from(values))]).unwrap(); + let pred = leaf( + 0, + DataType::Binary(BinaryType::new(1).unwrap()), + PredicateOperator::Between, + vec![Datum::Bytes(vec![0xFF]), Datum::Bytes(vec![0x01])], + ); + let fp = file_predicates(vec![pred], vec![col.clone()]); + let out = + filter_record_batch_by_predicates(batch, &fp, std::slice::from_ref(&col)).unwrap(); + let out_col = out + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let got: Vec<&[u8]> = (0..out_col.len()).map(|i| out_col.value(i)).collect(); + assert_eq!( + got, + vec![&[0xFFu8][..], &[0x00u8][..], &[0x01u8][..]], + "signed range [-1, 1] keeps 0xFF/0x00/0x01, excludes 0x7F" + ); + } + + #[test] + fn test_decimal_cross_scale_predicate_matches_by_value() { + // Paimon compares decimals by value across scales. A literal 1.0 (scale 1) + // against a DECIMAL(_, 2) column must not error and must match 1.00. + use crate::spec::DecimalType; + let col = DataField::new( + 0, + "d".to_string(), + DataType::Decimal(DecimalType::with_nullable(true, 10, 2).unwrap()), + ); + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "d", + ArrowDataType::Decimal128(10, 2), + true, + )])); + // values: 1.00, 2.00, 0.50 (unscaled at scale 2) + let arr = arrow_array::Decimal128Array::from(vec![Some(100), Some(200), Some(50)]) + .with_precision_and_scale(10, 2) + .unwrap(); + let batch = RecordBatch::try_new(schema, vec![Arc::new(arr)]).unwrap(); + // predicate: d = 1.0 (literal unscaled=10, scale=1) -> must match 1.00. + let pred = leaf( + 0, + DataType::Decimal(DecimalType::with_nullable(true, 10, 2).unwrap()), + PredicateOperator::Eq, + vec![Datum::Decimal { + unscaled: 10, + precision: 10, + scale: 1, + }], + ); + let fp = file_predicates(vec![pred], vec![col.clone()]); + let out = + filter_record_batch_by_predicates(batch, &fp, std::slice::from_ref(&col)).unwrap(); + assert_eq!(out.num_rows(), 1, "1.0 must match 1.00 across scales"); + } + + #[test] + fn test_decimal_finer_scale_literal_is_exact_not_error() { + // Reviewer's case: DECIMAL(_,1) column filtered by `d > 1.05` (literal + // scale 2, not representable at scale 1). Must be exact (== `d >= 1.1`), + // never a read error. Column values: 1.0, 1.1, 1.2 (unscaled at scale 1). + use crate::spec::DecimalType; + let col = DataField::new( + 0, + "d".to_string(), + DataType::Decimal(DecimalType::with_nullable(true, 10, 1).unwrap()), + ); + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "d", + ArrowDataType::Decimal128(10, 1), + true, + )])); + let arr = arrow_array::Decimal128Array::from(vec![Some(10), Some(11), Some(12)]) + .with_precision_and_scale(10, 1) + .unwrap(); + let batch = RecordBatch::try_new(schema, vec![Arc::new(arr)]).unwrap(); + // d > 1.05 (unscaled 105, scale 2) -> d in {1.1, 1.2}. + let pred = leaf( + 0, + DataType::Decimal(DecimalType::with_nullable(true, 10, 1).unwrap()), + PredicateOperator::Gt, + vec![Datum::Decimal { + unscaled: 105, + precision: 10, + scale: 2, + }], + ); + let fp = file_predicates(vec![pred], vec![col.clone()]); + let out = + filter_record_batch_by_predicates(batch, &fp, std::slice::from_ref(&col)).unwrap(); + assert_eq!(out.num_rows(), 2, "d > 1.05 == d >= 1.1 -> {{1.1, 1.2}}"); + } +} diff --git a/crates/paimon/src/spec/mod.rs b/crates/paimon/src/spec/mod.rs index 765bb767..79bdc7c4 100644 --- a/crates/paimon/src/spec/mod.rs +++ b/crates/paimon/src/spec/mod.rs @@ -90,6 +90,7 @@ mod predicate; pub(crate) use predicate::datum_cmp; pub(crate) use predicate::eval_row; pub(crate) use predicate::extract_datum; +pub(crate) use predicate::java_bytes_cmp; pub use predicate::{ field_idx_to_partition_idx, Datum, Predicate, PredicateBuilder, PredicateOperator, }; diff --git a/crates/paimon/src/spec/predicate.rs b/crates/paimon/src/spec/predicate.rs index a419f77f..399e88a7 100644 --- a/crates/paimon/src/spec/predicate.rs +++ b/crates/paimon/src/spec/predicate.rs @@ -184,7 +184,7 @@ fn decimal_cmp(ua: i128, sa: u32, ub: i128, sb: u32) -> Option { /// Match Java `CompareUtils.compare(byte[], byte[])`, which compares signed /// bytes lexicographically. -fn java_bytes_cmp(a: &[u8], b: &[u8]) -> Ordering { +pub(crate) fn java_bytes_cmp(a: &[u8], b: &[u8]) -> Ordering { for (&lhs, &rhs) in a.iter().zip(b.iter()) { let cmp = (lhs as i8).cmp(&(rhs as i8)); if cmp != Ordering::Equal { From 8cac0c32e3962560a2723b9143e92a3b6c2aa04f Mon Sep 17 00:00:00 2001 From: JunRuiLee Date: Sat, 4 Jul 2026 16:56:47 +0800 Subject: [PATCH 2/4] fix(core): apply exact residual filtering in ORC/Avro/Row readers ORC pruned by stripe stats only; Avro and Row ignored predicates entirely, so a filtered read silently returned non-matching rows. Each reader now scans the predicate columns and applies the shared residual filter; DataFileReader projects to the requested output by name. Row keeps its full physical (positional) read. --- crates/paimon/src/arrow/format/avro.rs | 253 ++++++++++++++++- crates/paimon/src/arrow/format/orc.rs | 372 ++++++++++++++++++------- crates/paimon/src/arrow/format/row.rs | 159 ++++++++++- 3 files changed, 671 insertions(+), 113 deletions(-) diff --git a/crates/paimon/src/arrow/format/avro.rs b/crates/paimon/src/arrow/format/avro.rs index bbf32c2e..73dda9c6 100644 --- a/crates/paimon/src/arrow/format/avro.rs +++ b/crates/paimon/src/arrow/format/avro.rs @@ -46,7 +46,7 @@ impl FormatFileReader for AvroFormatReader { reader: Box, file_size: u64, read_fields: &[DataField], - _predicates: Option<&FilePredicates>, + predicates: Option<&FilePredicates>, batch_size: Option, row_selection: Option>, ) -> crate::Result { @@ -54,9 +54,18 @@ impl FormatFileReader for AvroFormatReader { // This is fine for typical Paimon data files but may be problematic for very large files. let file_bytes = reader.read(0..file_size).await?; - let read_fields = read_fields.to_vec(); - let target_schema = build_target_arrow_schema(&read_fields)?; + // Widen the decoded schema to include any predicate columns that are not + // in the projection, so residual filtering can see them. DataFileReader + // projects the returned batch to `read_fields` by name, dropping extras. + let scan_fields = crate::arrow::residual::widen_scan_fields(read_fields, predicates); + let target_schema = build_target_arrow_schema(&scan_fields)?; let batch_size = batch_size.unwrap_or(DEFAULT_BATCH_SIZE); + // Own the predicates so the returned 'static stream does not borrow the + // caller's `&FilePredicates` (FilePredicates is not `Clone`; rebuild it). + let predicates = predicates.map(|fp| FilePredicates { + predicates: fp.predicates.clone(), + file_fields: fp.file_fields.clone(), + }); // Collect Avro records directly as apache_avro::Value, avoiding intermediate conversion. let all_records: Vec = Reader::new(&file_bytes[..]) @@ -87,7 +96,15 @@ impl FormatFileReader for AvroFormatReader { Ok(try_stream! { for chunk in records.chunks(batch_size) { - let batch = records_to_batch(chunk, &read_fields, &target_schema)?; + let batch = records_to_batch(chunk, &scan_fields, &target_schema)?; + let batch = match predicates.as_ref() { + Some(fp) => crate::arrow::residual::filter_record_batch_by_predicates( + batch, + fp, + &scan_fields, + )?, + None => batch, + }; yield batch; } } @@ -943,4 +960,232 @@ mod tests { let batch = records_to_batch(&records, &fields, &schema).unwrap(); assert_eq!(batch.num_rows(), 0); } + + // ----------------------------------------------------------------------- + // Exact residual predicate filtering on read + // ----------------------------------------------------------------------- + + use crate::spec::{Datum, Predicate, PredicateOperator}; + + fn leaf( + index: usize, + data_type: DataType, + op: PredicateOperator, + literals: Vec, + ) -> Predicate { + Predicate::Leaf { + column: format!("c{index}"), + index, + data_type, + op, + literals, + } + } + + /// Write the given records into an in-memory Avro OCF file, returning the + /// encoded bytes. The Avro schema mirrors `age: long, name: string`. + fn write_avro_age_name(rows: &[(i64, &str)]) -> Vec { + use apache_avro::{Codec, Schema, Writer}; + + let schema = Schema::parse_str( + r#"{"type": "record", "name": "row", "fields": [ + {"name": "age", "type": "long"}, + {"name": "name", "type": "string"} + ]}"#, + ) + .unwrap(); + let mut writer = Writer::with_codec(&schema, Vec::new(), Codec::Null); + for (age, name) in rows { + let mut record = apache_avro::types::Record::new(&schema).unwrap(); + record.put("age", *age); + record.put("name", *name); + writer.append(record).unwrap(); + } + writer.into_inner().unwrap() + } + + #[tokio::test] + async fn avro_reader_applies_exact_residual_filter_int() { + use crate::btree::test_util::BytesFileRead; + use crate::spec::{BigIntType, VarCharType}; + use futures::TryStreamExt; + + let bytes = write_avro_age_name(&[(10, "a"), (20, "b"), (30, "c"), (40, "d"), (50, "e")]); + + let age = DataField::new(0, "age".to_string(), DataType::BigInt(BigIntType::new())); + let name = DataField::new( + 1, + "name".to_string(), + DataType::VarChar(VarCharType::new(50).unwrap()), + ); + let read_fields = vec![age.clone(), name.clone()]; + let file_fields = vec![age.clone(), name.clone()]; + + // age > 25 → [30, 40, 50]. + let predicates = FilePredicates { + predicates: vec![leaf( + 0, + DataType::BigInt(BigIntType::new()), + PredicateOperator::Gt, + vec![Datum::Long(25)], + )], + file_fields, + }; + + let batches = AvroFormatReader + .read_batch_stream( + Box::new(BytesFileRead(bytes.clone().into())), + bytes.len() as u64, + &read_fields, + Some(&predicates), + None, + None, + ) + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + + let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + assert_eq!(total_rows, 3); + let ages: Vec = batches + .iter() + .flat_map(|b| { + b.column(0) + .as_any() + .downcast_ref::() + .unwrap() + .values() + .to_vec() + }) + .collect(); + assert_eq!(ages, vec![30, 40, 50]); + } + + #[tokio::test] + async fn avro_reader_filters_on_non_projected_predicate_column() { + use crate::btree::test_util::BytesFileRead; + use crate::spec::{BigIntType, VarCharType}; + use futures::TryStreamExt; + + let bytes = write_avro_age_name(&[(10, "a"), (20, "b"), (30, "c"), (40, "d"), (50, "e")]); + + let age = DataField::new(0, "age".to_string(), DataType::BigInt(BigIntType::new())); + let name = DataField::new( + 1, + "name".to_string(), + DataType::VarChar(VarCharType::new(50).unwrap()), + ); + // Project ONLY `name`; the predicate is on `age`, which is NOT projected. + let read_fields = vec![name.clone()]; + let file_fields = vec![age.clone(), name.clone()]; + + // age > 25 → rows c, d, e (age is a BigInt/long, so literal is Datum::Long). + let predicates = FilePredicates { + predicates: vec![leaf( + 0, + DataType::BigInt(BigIntType::new()), + PredicateOperator::Gt, + vec![Datum::Long(25)], + )], + file_fields, + }; + + let batches = AvroFormatReader + .read_batch_stream( + Box::new(BytesFileRead(bytes.clone().into())), + bytes.len() as u64, + &read_fields, + Some(&predicates), + None, + None, + ) + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + + // Assert on the FILTERED rows/values, not an exact column set (the batch + // may contain the extra `age` column — DataFileReader projects it away). + let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + assert_eq!(total_rows, 3); + let names: Vec = batches + .iter() + .flat_map(|b| { + let col = b + .column_by_name("name") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + (0..col.len()) + .map(|i| col.value(i).to_string()) + .collect::>() + }) + .collect(); + assert_eq!(names, vec!["c", "d", "e"]); + } + + #[tokio::test] + async fn avro_reader_applies_exact_residual_filter_like() { + use crate::btree::test_util::BytesFileRead; + use crate::spec::{BigIntType, VarCharType}; + use futures::TryStreamExt; + + let bytes = write_avro_age_name(&[ + (10, "apple"), + (20, "banana"), + (30, "apricot"), + (40, "cherry"), + ]); + + let age = DataField::new(0, "age".to_string(), DataType::BigInt(BigIntType::new())); + let name = DataField::new( + 1, + "name".to_string(), + DataType::VarChar(VarCharType::new(50).unwrap()), + ); + let read_fields = vec![age.clone(), name.clone()]; + + // name LIKE 'a%' → ["apple", "apricot"]. + let predicates = FilePredicates { + predicates: vec![leaf( + 1, + DataType::VarChar(VarCharType::new(50).unwrap()), + PredicateOperator::Like, + vec![Datum::String("a%".to_string())], + )], + file_fields: vec![age, name], + }; + + let batches = AvroFormatReader + .read_batch_stream( + Box::new(BytesFileRead(bytes.clone().into())), + bytes.len() as u64, + &read_fields, + Some(&predicates), + None, + None, + ) + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + + let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + assert_eq!(total_rows, 2); + let names: Vec = batches + .iter() + .flat_map(|b| { + let col = b.column(1).as_any().downcast_ref::().unwrap(); + (0..col.len()) + .map(|i| col.value(i).to_string()) + .collect::>() + }) + .collect(); + assert_eq!(names, vec!["apple", "apricot"]); + } } diff --git a/crates/paimon/src/arrow/format/orc.rs b/crates/paimon/src/arrow/format/orc.rs index 68d986c3..81556b14 100644 --- a/crates/paimon/src/arrow/format/orc.rs +++ b/crates/paimon/src/arrow/format/orc.rs @@ -20,7 +20,6 @@ use crate::io::FileRead; use crate::spec::{DataField, DataType, Datum, Predicate, PredicateOperator}; use crate::table::{ArrowRecordBatchStream, RowRange}; use crate::Error; -use arrow_array::RecordBatch; use async_trait::async_trait; use bytes::Bytes; use futures::{future::BoxFuture, StreamExt}; @@ -53,12 +52,15 @@ impl FormatFileReader for OrcFormatReader { source: Some(Box::new(e)), })?; - let mut projected_names: Vec = - read_fields.iter().map(|f| f.name().to_string()).collect(); + // Widen the scan to include predicate columns so the residual filter can + // see every column it references, even when a predicate column is not part + // of the requested projection. DataFileReader projects the returned batch + // to the requested output by name afterwards, so the extra columns are + // harmless. + let scan_fields = crate::arrow::residual::widen_scan_fields(read_fields, predicates); + let projected_names: Vec = + scan_fields.iter().map(|f| f.name().to_string()).collect(); let orc_predicate = build_orc_predicate(predicates); - if let Some(ref predicate) = orc_predicate { - collect_orc_predicate_columns(predicate, &mut projected_names); - } let projection = ProjectionMask::named_roots(builder.file_metadata().root_data_type(), &projected_names); @@ -84,15 +86,38 @@ impl FormatFileReader for OrcFormatReader { } let stream = builder.build_async(); - let requested_names: Vec = - read_fields.iter().map(|f| f.name().to_string()).collect(); + // Stripe/row-group pruning above only skips whole stripes whose stats cannot + // match; non-matching rows inside a selected stripe survive. Apply the exact + // residual filter on each emitted batch so the reader returns exactly the rows + // matching the pushed-down predicate. The batch is widened with predicate + // columns; DataFileReader projects to the requested output by name afterwards. + // Own the predicate context (scan_fields + cloned FilePredicates) for the + // 'static stream. + let residual: Option<(FilePredicates, Vec)> = predicates.map(|fp| { + ( + FilePredicates { + predicates: fp.predicates.clone(), + file_fields: fp.file_fields.clone(), + }, + scan_fields, + ) + }); Ok(stream .map(move |r| { let batch = r.map_err(|e| Error::UnexpectedError { message: format!("ORC read error: {e}"), source: Some(Box::new(e)), })?; - project_orc_batch_to_requested_fields(batch, &requested_names) + match &residual { + Some((fp, scan_fields)) => { + crate::arrow::residual::filter_record_batch_by_predicates( + batch, + fp, + scan_fields, + ) + } + None => Ok(batch), + } }) .boxed()) } @@ -302,59 +327,6 @@ fn datum_to_orc_value(datum: &Datum, data_type: &DataType) -> Option, -) { - collect_orc_predicate_columns_inner(predicate, projected_names); -} - -fn collect_orc_predicate_columns_inner( - predicate: &orc_rust::predicate::Predicate, - projected_names: &mut Vec, -) { - match predicate { - orc_rust::predicate::Predicate::Comparison { column, .. } - | orc_rust::predicate::Predicate::IsNull { column } - | orc_rust::predicate::Predicate::IsNotNull { column } => { - if !projected_names.iter().any(|name| name == column) { - projected_names.push(column.clone()); - } - } - orc_rust::predicate::Predicate::And(children) - | orc_rust::predicate::Predicate::Or(children) => { - for child in children { - collect_orc_predicate_columns_inner(child, projected_names); - } - } - orc_rust::predicate::Predicate::Not(child) => { - collect_orc_predicate_columns_inner(child, projected_names); - } - } -} - -fn project_orc_batch_to_requested_fields( - batch: RecordBatch, - requested_names: &[String], -) -> crate::Result { - let indices: Vec = requested_names - .iter() - .map(|name| { - batch - .schema() - .index_of(name) - .map_err(|e| Error::UnexpectedError { - message: format!("ORC batch is missing requested column '{name}': {e}"), - source: Some(Box::new(e)), - }) - }) - .collect::>()?; - batch.project(&indices).map_err(|e| Error::UnexpectedError { - message: format!("Failed to project ORC batch: {e}"), - source: Some(Box::new(e)), - }) -} - // --------------------------------------------------------------------------- // Row ranges → orc_rust::RowSelection // --------------------------------------------------------------------------- @@ -428,7 +400,7 @@ impl AsyncChunkReader for OrcFileReader { #[cfg(test)] mod tests { use super::*; - use arrow_array::{Int32Array, StringArray}; + use arrow_array::{Array, Int32Array, RecordBatch, StringArray}; use arrow_schema::{DataType as ArrowDataType, Field, Schema}; use orc_rust::row_selection::RowSelector; use std::sync::Arc; @@ -731,63 +703,253 @@ mod tests { ); } - #[test] - fn test_collect_orc_predicate_columns_adds_filter_columns_once() { - let predicate = orc_rust::predicate::Predicate::and(vec![ - orc_rust::predicate::Predicate::eq( - "category", - PredicateValue::Utf8(Some("a".to_string())), - ), - orc_rust::predicate::Predicate::gt("score", PredicateValue::Int32(Some(1))), - ]); - let mut projected_names = vec!["name".to_string()]; + /// Encode a single-stripe ORC file (all rows in one stripe) into memory bytes. + fn write_single_stripe_orc(schema: Arc, batch: &RecordBatch) -> Vec { + let mut buf: Vec = Vec::new(); + let mut writer = orc_rust::ArrowWriterBuilder::new(&mut buf, schema) + // Large stripe byte size keeps all rows in a single stripe so stripe + // stats cannot exclude the non-matching rows. + .with_stripe_byte_size(64 * 1024 * 1024) + .try_build() + .unwrap(); + writer.write(batch).unwrap(); + writer.close().unwrap(); + buf + } - collect_orc_predicate_columns(&predicate, &mut projected_names); - collect_orc_predicate_columns(&predicate, &mut projected_names); + #[tokio::test] + async fn test_orc_read_applies_exact_residual_filter() { + use crate::io::FileIOBuilder; + use crate::spec::{IntType, VarCharType}; - assert_eq!(projected_names, vec!["name", "category", "score"]); + let arrow_schema = Arc::new(Schema::new(vec![ + Field::new("age", ArrowDataType::Int32, true), + Field::new("name", ArrowDataType::Utf8, true), + ])); + let batch = RecordBatch::try_new( + arrow_schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![10, 20, 30, 40, 50])), + Arc::new(StringArray::from(vec![ + "apple", "banana", "apricot", "cherry", "avocado", + ])), + ], + ) + .unwrap(); + + let orc_bytes = write_single_stripe_orc(arrow_schema, &batch); + + let file_io = FileIOBuilder::new("memory").build().unwrap(); + let path = "memory:/test_orc_residual_filter.orc"; + let output = file_io.new_output(path).unwrap(); + output.write(Bytes::from(orc_bytes)).await.unwrap(); + + let input = file_io.new_input(path).unwrap(); + let file_size = input.metadata().await.unwrap().size; + + let read_fields = vec![ + field(0, "age", DataType::Int(IntType::new())), + field(1, "name", DataType::VarChar(VarCharType::string_type())), + ]; + + // age > 25 -> [30, 40, 50]; single stripe means stripe pruning cannot drop + // the non-matching rows, so the exact residual filter must do it. + let predicates = file_predicates( + vec![leaf(0, PredicateOperator::Gt, vec![Datum::Int(25)])], + read_fields.clone(), + ); + + let reader_input = input.reader().await.unwrap(); + let reader = OrcFormatReader; + let mut stream = reader + .read_batch_stream( + Box::new(reader_input), + file_size, + &read_fields, + Some(&predicates), + None, + None, + ) + .await + .unwrap(); + + let mut ages: Vec = Vec::new(); + while let Some(result) = stream.next().await { + let batch = result.unwrap(); + let col = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + ages.extend(col.values().iter().copied()); + } + + assert_eq!(ages, vec![30, 40, 50]); } - #[test] - fn test_project_orc_batch_to_requested_fields() { - let schema = Arc::new(Schema::new(vec![ - Field::new("name", ArrowDataType::Utf8, true), + /// Encode a single-stripe ORC file with three columns (id, name, age). + fn write_single_stripe_orc_three_cols(schema: Arc, batch: &RecordBatch) -> Vec { + write_single_stripe_orc(schema, batch) + } + + #[tokio::test] + async fn test_orc_read_filters_on_non_projected_predicate_column() { + // Gap A: read only [name] but filter on the non-projected [age] column. + // The reader must scan the predicate column, filter exactly, and still + // return the matching rows. (DataFileReader later projects to [name] by + // name; the reader-level batch may keep the extra `age` column.) + use crate::io::FileIOBuilder; + use crate::spec::{IntType, VarCharType}; + + let arrow_schema = Arc::new(Schema::new(vec![ Field::new("id", ArrowDataType::Int32, true), + Field::new("name", ArrowDataType::Utf8, true), + Field::new("age", ArrowDataType::Int32, true), ])); let batch = RecordBatch::try_new( - schema, + arrow_schema.clone(), vec![ - Arc::new(StringArray::from(vec!["a", "b"])), - Arc::new(Int32Array::from(vec![1, 2])), + Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])), + Arc::new(StringArray::from(vec!["a", "b", "c", "d", "e"])), + Arc::new(Int32Array::from(vec![10, 20, 30, 40, 50])), ], ) .unwrap(); - let projected = project_orc_batch_to_requested_fields(batch, &["name".to_string()]) - .expect("project ORC batch"); + let orc_bytes = write_single_stripe_orc_three_cols(arrow_schema, &batch); - assert_eq!(projected.num_columns(), 1); - assert_eq!(projected.schema().field(0).name(), "name"); - } + let file_io = FileIOBuilder::new("memory").build().unwrap(); + let path = "memory:/test_orc_non_projected_predicate.orc"; + let output = file_io.new_output(path).unwrap(); + output.write(Bytes::from(orc_bytes)).await.unwrap(); - #[test] - fn test_project_orc_batch_to_requested_fields_errors_on_missing_column() { - let schema = Arc::new(Schema::new(vec![Field::new( + let input = file_io.new_input(path).unwrap(); + let file_size = input.metadata().await.unwrap().size; + + // Read only [name] (NOT age). + let read_fields = vec![field( + 1, "name", - ArrowDataType::Utf8, - true, - )])); - let batch = - RecordBatch::try_new(schema, vec![Arc::new(StringArray::from(vec!["a"]))]).unwrap(); - - let error = project_orc_batch_to_requested_fields(batch, &["id".to_string()]) - .expect_err("missing requested column should error"); - - assert!( - error - .to_string() - .contains("ORC batch is missing requested column 'id'"), - "unexpected error: {error}" + DataType::VarChar(VarCharType::string_type()), + )]; + + // File-level schema for predicate resolution: (id, name, age). + let file_fields = vec![ + field(0, "id", DataType::Int(IntType::new())), + field(1, "name", DataType::VarChar(VarCharType::string_type())), + field(2, "age", DataType::Int(IntType::new())), + ]; + + // age > 25 -> rows c, d, e. `age` is not in read_fields, so before the fix + // the residual evaluator can't see it and silently returns all 5 names. + let predicates = file_predicates( + vec![leaf(2, PredicateOperator::Gt, vec![Datum::Int(25)])], + file_fields, ); + + let reader_input = input.reader().await.unwrap(); + let reader = OrcFormatReader; + let mut stream = reader + .read_batch_stream( + Box::new(reader_input), + file_size, + &read_fields, + Some(&predicates), + None, + None, + ) + .await + .unwrap(); + + let mut names: Vec = Vec::new(); + while let Some(result) = stream.next().await { + let batch = result.unwrap(); + let name_idx = batch.schema().index_of("name").unwrap(); + let col = batch + .column(name_idx) + .as_any() + .downcast_ref::() + .unwrap(); + names.extend((0..col.len()).map(|i| col.value(i).to_string())); + } + + // Assert on FILTERED ROWS/values, not on an exact column set. + assert_eq!(names, vec!["c", "d", "e"]); + } + + #[tokio::test] + async fn test_orc_read_applies_exact_residual_filter_like() { + use crate::io::FileIOBuilder; + use crate::spec::{IntType, VarCharType}; + + let arrow_schema = Arc::new(Schema::new(vec![ + Field::new("age", ArrowDataType::Int32, true), + Field::new("name", ArrowDataType::Utf8, true), + ])); + let batch = RecordBatch::try_new( + arrow_schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![10, 20, 30, 40, 50])), + Arc::new(StringArray::from(vec![ + "apple", "banana", "apricot", "cherry", "avocado", + ])), + ], + ) + .unwrap(); + + let orc_bytes = write_single_stripe_orc(arrow_schema, &batch); + + let file_io = FileIOBuilder::new("memory").build().unwrap(); + let path = "memory:/test_orc_residual_filter_like.orc"; + let output = file_io.new_output(path).unwrap(); + output.write(Bytes::from(orc_bytes)).await.unwrap(); + + let input = file_io.new_input(path).unwrap(); + let file_size = input.metadata().await.unwrap().size; + + let read_fields = vec![ + field(0, "age", DataType::Int(IntType::new())), + field(1, "name", DataType::VarChar(VarCharType::string_type())), + ]; + + // name like 'a%' -> apple, apricot, avocado (3 rows). `Like` is not pushed + // into ORC stripe pruning, so this exercises the residual filter directly. + let predicates = file_predicates( + vec![Predicate::Leaf { + column: "name".to_string(), + index: 1, + data_type: DataType::VarChar(VarCharType::string_type()), + op: PredicateOperator::Like, + literals: vec![Datum::String("a%".to_string())], + }], + read_fields.clone(), + ); + + let reader_input = input.reader().await.unwrap(); + let reader = OrcFormatReader; + let mut stream = reader + .read_batch_stream( + Box::new(reader_input), + file_size, + &read_fields, + Some(&predicates), + None, + None, + ) + .await + .unwrap(); + + let mut names: Vec = Vec::new(); + while let Some(result) = stream.next().await { + let batch = result.unwrap(); + let col = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + names.extend((0..col.len()).map(|i| col.value(i).to_string())); + } + + assert_eq!(names, vec!["apple", "apricot", "avocado"]); } } diff --git a/crates/paimon/src/arrow/format/row.rs b/crates/paimon/src/arrow/format/row.rs index 0ffcfd9e..e92fe377 100644 --- a/crates/paimon/src/arrow/format/row.rs +++ b/crates/paimon/src/arrow/format/row.rs @@ -226,7 +226,7 @@ impl FormatFileReader for RowFormatReader { reader: Box, file_size: u64, read_fields: &[DataField], - _predicates: Option<&FilePredicates>, + predicates: Option<&FilePredicates>, batch_size: Option, row_selection: Option>, ) -> crate::Result { @@ -295,6 +295,10 @@ impl FormatFileReader for RowFormatReader { let batch_size = batch_size.unwrap_or(DEFAULT_BATCH_SIZE); let blocks_to_read = blocks_to_read(&index, total_rows, row_selection.as_deref()); + let predicates = predicates.map(|fp| FilePredicates { + predicates: fp.predicates.clone(), + file_fields: fp.file_fields.clone(), + }); Ok(try_stream! { let mut blocks = futures::stream::iter(blocks_to_read.into_iter().map(|block_idx| { read_row_block(reader.as_ref(), &index, block_idx) @@ -316,6 +320,12 @@ impl FormatFileReader for RowFormatReader { for chunk in selected.chunks(batch_size) { let batch = decode_row_block(&data, &row_type, schema.clone(), chunk)?; + let batch = match predicates.as_ref() { + Some(fp) => crate::arrow::residual::filter_record_batch_by_predicates( + batch, fp, &row_type, + )?, + None => batch, + }; yield batch; } } @@ -2185,9 +2195,9 @@ mod tests { use crate::btree::test_util::BytesFileRead; use crate::io::FileIOBuilder; use crate::spec::{ - ArrayType, BigIntType, BooleanType, DataType, DateType, DecimalType, DoubleType, FloatType, - IntType, MapType, MultisetType, RowType, TimeType, TimestampType, VarBinaryType, - VarCharType, + ArrayType, BigIntType, BooleanType, DataType, DateType, Datum, DecimalType, DoubleType, + FloatType, IntType, MapType, MultisetType, Predicate, PredicateOperator, RowType, TimeType, + TimestampType, VarBinaryType, VarCharType, }; use futures::TryStreamExt; use std::ops::Range; @@ -2249,6 +2259,147 @@ mod tests { assert!(message.contains("first block row start must be 0")); } + #[tokio::test] + async fn row_reader_applies_exact_residual_filter() { + let file_io = FileIOBuilder::new("memory").build().unwrap(); + let path = "memory:/row-residual/data-1.row"; + let output = file_io.new_output(path).unwrap(); + let fields = vec![DataField::new( + 0, + "age".to_string(), + DataType::Int(IntType::new()), + )]; + let schema = build_target_arrow_schema(&fields).unwrap(); + let mut writer = RowFormatWriter::new(&output, schema.clone(), fields.clone(), 1) + .await + .unwrap(); + let batch = RecordBatch::try_new( + schema, + vec![Arc::new(Int32Array::from(vec![10, 20, 30, 40, 50]))], + ) + .unwrap(); + writer.write(&batch).await.unwrap(); + Box::new(writer).close().await.unwrap(); + + let predicates = FilePredicates { + predicates: vec![Predicate::Leaf { + column: "age".to_string(), + index: 0, + data_type: DataType::Int(IntType::new()), + op: PredicateOperator::Gt, + literals: vec![Datum::Int(25)], + }], + file_fields: fields.clone(), + }; + + let input = file_io.new_input(path).unwrap(); + let bytes = input.read().await.unwrap(); + let batches = RowFormatReader + .read_batch_stream( + Box::new(BytesFileRead(bytes.clone())), + bytes.len() as u64, + &fields, + Some(&predicates), + Some(8), + None, + ) + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + + let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + assert_eq!(total_rows, 3); + let values: Vec = batches + .iter() + .flat_map(|b| { + let col = b.column(0).as_any().downcast_ref::().unwrap(); + (0..col.len()).map(|i| col.value(i)).collect::>() + }) + .collect(); + assert_eq!(values, vec![30, 40, 50]); + } + + #[tokio::test] + async fn row_reader_filters_on_full_physical_batch_predicate_on_non_first_column() { + // Regression for Gap A (Row): the reader is handed the FULL physical + // schema `(id, name, age)` (exactly as DataFileReader passes for `.row`, + // see data_file_reader.rs:186-190) and a predicate on `age`. It must apply + // the residual filter on the full batch, keeping only the rows where + // age > 25, so the `name` column ends up as [c, d, e]. Without the filter + // the reader returns all 5 rows. + let file_io = FileIOBuilder::new("memory").build().unwrap(); + let path = "memory:/row-residual/data-full-schema.row"; + let output = file_io.new_output(path).unwrap(); + let fields = vec![ + DataField::new(0, "id".to_string(), DataType::Int(IntType::new())), + DataField::new( + 1, + "name".to_string(), + DataType::VarChar(VarCharType::string_type()), + ), + DataField::new(2, "age".to_string(), DataType::Int(IntType::new())), + ]; + let schema = build_target_arrow_schema(&fields).unwrap(); + let mut writer = RowFormatWriter::new(&output, schema.clone(), fields.clone(), 1) + .await + .unwrap(); + let batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])), + Arc::new(StringArray::from(vec!["a", "b", "c", "d", "e"])), + Arc::new(Int32Array::from(vec![10, 20, 30, 40, 50])), + ], + ) + .unwrap(); + writer.write(&batch).await.unwrap(); + Box::new(writer).close().await.unwrap(); + + let predicates = FilePredicates { + predicates: vec![Predicate::Leaf { + column: "age".to_string(), + index: 2, + data_type: DataType::Int(IntType::new()), + op: PredicateOperator::Gt, + literals: vec![Datum::Int(25)], + }], + file_fields: fields.clone(), + }; + + let input = file_io.new_input(path).unwrap(); + let bytes = input.read().await.unwrap(); + // read_fields is the FULL physical schema, as DataFileReader passes for `.row`. + let batches = RowFormatReader + .read_batch_stream( + Box::new(BytesFileRead(bytes.clone())), + bytes.len() as u64, + &fields, + Some(&predicates), + Some(8), + None, + ) + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + + let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + assert_eq!(total_rows, 3); + let names: Vec = batches + .iter() + .flat_map(|b| { + let col = b.column(1).as_any().downcast_ref::().unwrap(); + (0..col.len()) + .map(|i| col.value(i).to_string()) + .collect::>() + }) + .collect(); + assert_eq!(names, vec!["c", "d", "e"]); + } + #[tokio::test] async fn row_writer_reader_roundtrip_primitives_and_selection() { let file_io = FileIOBuilder::new("memory").build().unwrap(); From 74bbe5b45f7e1388a427ec46ed20e5dc60ce2843 Mon Sep 17 00:00:00 2001 From: JunRuiLee Date: Sat, 4 Jul 2026 16:56:47 +0800 Subject: [PATCH 3/4] fix(core): keep full predicate on the public read path for exact residual ReadBuilder/TableRead pruned And/Or/Not before the reader, so the residual pass never saw compound predicates on the public path and an Or/Not filter returned all rows. Stop pruning: native pushdown and stats already skip compound nodes they cannot use, and the residual pass enforces the full predicate exactly. Removes the now-unused reader_pruning_predicates. --- crates/paimon/src/arrow/filtering.rs | 35 ------------ crates/paimon/src/table/read_builder.rs | 72 ++++++++++++++++++++++++- crates/paimon/src/table/table_read.rs | 7 ++- 3 files changed, 75 insertions(+), 39 deletions(-) diff --git a/crates/paimon/src/arrow/filtering.rs b/crates/paimon/src/arrow/filtering.rs index 1f450bdc..16170317 100644 --- a/crates/paimon/src/arrow/filtering.rs +++ b/crates/paimon/src/arrow/filtering.rs @@ -19,13 +19,6 @@ use crate::arrow::schema_evolution::create_index_mapping; pub(crate) use crate::predicate_stats::{predicates_may_match_with_schema, StatsAccessor}; use crate::spec::{DataField, Predicate, PredicateOperator}; -pub(crate) fn reader_pruning_predicates(data_predicates: Vec) -> Vec { - data_predicates - .into_iter() - .filter(predicate_supported_for_reader_pruning) - .collect() -} - /// Remap predicates from table-level indices to file-level indices. /// Predicates referencing fields not present in the file are resolved based on /// NULL semantics: the missing column is treated as all-NULL, so `IS NULL` @@ -127,34 +120,6 @@ pub(crate) fn build_field_mapping( ) } -fn predicate_supported_for_reader_pruning(predicate: &Predicate) -> bool { - match predicate { - Predicate::AlwaysFalse => true, - Predicate::Leaf { op, .. } => { - matches!( - op, - PredicateOperator::IsNull - | PredicateOperator::IsNotNull - | PredicateOperator::Eq - | PredicateOperator::NotEq - | PredicateOperator::Lt - | PredicateOperator::LtEq - | PredicateOperator::Gt - | PredicateOperator::GtEq - | PredicateOperator::In - | PredicateOperator::NotIn - | PredicateOperator::StartsWith - | PredicateOperator::EndsWith - | PredicateOperator::Contains - | PredicateOperator::Like - | PredicateOperator::Between - | PredicateOperator::NotBetween - ) - } - Predicate::AlwaysTrue | Predicate::And(_) | Predicate::Or(_) | Predicate::Not(_) => false, - } -} - fn identity_field_mapping(num_fields: usize) -> Vec> { (0..num_fields).map(Some).collect() } diff --git a/crates/paimon/src/table/read_builder.rs b/crates/paimon/src/table/read_builder.rs index dbf22cac..5a67d3ec 100644 --- a/crates/paimon/src/table/read_builder.rs +++ b/crates/paimon/src/table/read_builder.rs @@ -24,7 +24,6 @@ use super::bucket_filter::{extract_predicate_for_keys, split_partition_and_data_ use super::partition_filter::PartitionFilter; use super::table_read::TableRead; use super::{Table, TableScan}; -use crate::arrow::filtering::reader_pruning_predicates; use crate::spec::{CoreOptions, DataField, Predicate}; use crate::table::source::RowRange; use crate::{Error, Result}; @@ -233,10 +232,13 @@ impl<'a> ReadBuilder<'a> { Some(projected) => self.resolve_projected_fields(projected)?, }; + // Pass the FULL data predicate through (including `And`/`Or`/`Not`). + // Pushdown/stats skip compound nodes; the residual pass enforces the full + // predicate exactly. Pruning here would drop compound predicates. Ok(TableRead::new( self.table, read_type, - reader_pruning_predicates(self.filter.data_predicates.clone()), + self.filter.data_predicates.clone(), )) } @@ -661,6 +663,72 @@ mod tests { assert_eq!(collect_int_column(&batches, "id"), vec![2, 3, 4]); } + /// Real-path regression: an `Or` predicate must be applied exactly through the + /// public ReadBuilder/TableRead path. The data predicate set is no longer + /// pruned before the reader, so the residual pass receives the full `Or` and + /// filters exactly. Single row group so stats pruning cannot exclude anything. + #[tokio::test] + async fn test_new_read_applies_or_predicate_exactly_via_public_path() { + let tempdir = tempdir().unwrap(); + let table_path = local_file_path(tempdir.path()); + let bucket_dir = tempdir.path().join("bucket-0"); + fs::create_dir_all(&bucket_dir).unwrap(); + + let parquet_path = bucket_dir.join("data.parquet"); + write_int_parquet_file( + &parquet_path, + vec![("id", vec![1, 2, 3, 4]), ("value", vec![5, 20, 30, 40])], + None, + ); + let file_size = fs::metadata(&parquet_path).unwrap().len() as i64; + + let file_io = FileIOBuilder::new("file").build().unwrap(); + let table_schema = TableSchema::new( + 0, + &Schema::builder() + .column("id", DataType::Int(IntType::new())) + .column("value", DataType::Int(IntType::new())) + .build() + .unwrap(), + ); + let table = Table::new( + file_io, + Identifier::new("default", "t"), + table_path, + table_schema, + None, + ); + + let split = DataSplitBuilder::new() + .with_snapshot(1) + .with_partition(BinaryRow::new(0)) + .with_bucket(0) + .with_bucket_path(local_file_path(&bucket_dir)) + .with_total_buckets(1) + .with_data_files(vec![test_data_file("data.parquet", 4, file_size)]) + .build() + .unwrap(); + + // id = 1 OR value = 40 -> rows {id=1} and {id=4}. + let pb = PredicateBuilder::new(table.schema().fields()); + let predicate = crate::spec::Predicate::or(vec![ + pb.equal("id", crate::spec::Datum::Int(1)).unwrap(), + pb.equal("value", crate::spec::Datum::Int(40)).unwrap(), + ]); + + let mut builder = table.new_read_builder(); + builder.with_projection(&["id"]).with_filter(predicate); + let read = builder.new_read().unwrap(); + let batches = read + .to_arrow(&[split]) + .unwrap() + .try_collect::>() + .await + .unwrap(); + + assert_eq!(collect_int_column(&batches, "id"), vec![1, 4]); + } + #[tokio::test] async fn test_reader_pruning_ignores_partition_conjuncts() { let tempdir = tempdir().unwrap(); diff --git a/crates/paimon/src/table/table_read.rs b/crates/paimon/src/table/table_read.rs index 6f71455c..83cef607 100644 --- a/crates/paimon/src/table/table_read.rs +++ b/crates/paimon/src/table/table_read.rs @@ -20,7 +20,6 @@ use super::data_file_reader::DataFileReader; use super::kv_file_reader::{KeyValueFileReader, KeyValueReadConfig}; use super::read_builder::split_scan_predicates; use super::{ArrowRecordBatchStream, Table}; -use crate::arrow::filtering::reader_pruning_predicates; use crate::spec::{CoreOptions, DataField, MergeEngine, Predicate}; use crate::DataSplit; @@ -66,7 +65,11 @@ impl<'a> TableRead<'a> { /// Set a filter predicate for conservative read-side pruning. pub fn with_filter(mut self, filter: Predicate) -> Self { let (_, data_predicates) = split_scan_predicates(self.table, filter); - self.data_predicates = reader_pruning_predicates(data_predicates); + // Keep the FULL data predicate (including `And`/`Or`/`Not`). Native + // pushdown / stats pruning skip compound nodes they cannot use, and the + // residual pass applies the full predicate exactly. Pruning here would + // drop compound predicates before the residual could enforce them. + self.data_predicates = data_predicates; self } From 509fce15ddd13485a48da2b5ff7dcbfecdaaad61 Mon Sep 17 00:00:00 2001 From: JunRuiLee Date: Sat, 4 Jul 2026 16:56:47 +0800 Subject: [PATCH 4/4] fix(core): guard _ROW_ID projection against a row-filtering predicate _ROW_ID is materialized positionally from each batch's row count, which desyncs once the residual filter drops rows. Reject projecting _ROW_ID together with a predicate that can actually filter rows (a constant AlwaysTrue is allowed). The guard lives in read_single_file_stream, the true _ROW_ID generation site. --- crates/paimon/src/table/data_file_reader.rs | 225 +++++++++++++++++++- 1 file changed, 224 insertions(+), 1 deletion(-) diff --git a/crates/paimon/src/table/data_file_reader.rs b/crates/paimon/src/table/data_file_reader.rs index 60ecdf47..c5d6f54e 100644 --- a/crates/paimon/src/table/data_file_reader.rs +++ b/crates/paimon/src/table/data_file_reader.rs @@ -69,6 +69,32 @@ impl DataFileReader { self } + /// Reject projecting `_ROW_ID` alongside a data predicate. `_ROW_ID` is + /// assigned positionally from post-filter batch row counts, so a residual + /// filter that drops rows would desync it. (`_ROW_ID` predicates travel via + /// `row_ranges`, not `predicates`, so they do not trip this.) + fn reject_row_id_with_predicate( + read_type: &[DataField], + predicates: &[Predicate], + ) -> crate::Result<()> { + let projects_row_id = read_type + .iter() + .any(|field| field.name() == ROW_ID_FIELD_NAME); + // Only predicates that can actually drop rows desync positional `_ROW_ID`. + // A constant `AlwaysTrue` keeps every row in order and is harmless, so it + // must not trip the guard. + let has_row_filtering_predicate = predicates + .iter() + .any(|p| !matches!(p, Predicate::AlwaysTrue)); + if projects_row_id && has_row_filtering_predicate { + return Err(crate::Error::Unsupported { + message: "reading _ROW_ID together with a data predicate is not supported yet" + .to_string(), + }); + } + Ok(()) + } + /// Take a stream of DataSplits and read every data file in each split. /// Returns a stream of Arrow RecordBatches from all files. /// @@ -147,6 +173,18 @@ impl DataFileReader { dv: Option>, row_ranges: Option>, ) -> crate::Result { + // Guard at the true risk site: `_ROW_ID` is materialized positionally from + // each batch's row count (see `row_id_column_for_batch`), assuming the + // reader emits rows in original file order and count. The format readers + // apply an exact residual filter that drops non-matching rows *before* + // `_ROW_ID` is assigned here, which would desync the ids. So projecting + // `_ROW_ID` together with a data predicate is unsupported — fail loudly + // rather than return wrong ids. Placed here (not only in `read()`) because + // `read_single_file_stream` is also called directly by the KV and + // data-evolution readers; both strip/omit `_ROW_ID` from the read_type + // they pass, so this guard does not affect them. + Self::reject_row_id_with_predicate(&self.read_type, &self.predicates)?; + let read_type = self.read_type.clone(); let table_fields = self.table_fields.clone(); let predicates = self.predicates.clone(); @@ -556,7 +594,9 @@ mod row_tests { use crate::arrow::format::create_format_writer; use crate::io::FileIOBuilder; use crate::spec::stats::BinaryTableStats; - use crate::spec::{BinaryRow, DataFileMeta, DataType, IntType, VarCharType}; + use crate::spec::{ + BinaryRow, DataFileMeta, DataType, Datum, IntType, Predicate, PredicateBuilder, VarCharType, + }; use crate::table::source::DataSplitBuilder; use arrow_array::{Int32Array, StringArray}; use futures::TryStreamExt; @@ -657,6 +697,189 @@ mod row_tests { .unwrap(); assert_eq!(scores.values(), &[10, 20, 30]); } + + /// End-to-end guard: a non-partition predicate on a `.row` file must be + /// applied exactly by the time batches leave `DataFileReader`. The Row + /// format has no pushdown, so without a residual filter this would return + /// every row; the residual filter makes it exact. Guards against a + /// regression if the per-reader wiring is later refactored. + #[tokio::test] + async fn row_read_applies_exact_residual_filter_end_to_end() { + let fields = vec![ + field(0, "id", DataType::Int(IntType::new())), + field(1, "age", DataType::Int(IntType::new())), + ]; + let schema = build_target_arrow_schema(&fields).unwrap(); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])), + Arc::new(Int32Array::from(vec![10, 20, 30, 40, 50])), + ], + ) + .unwrap(); + + let file_io = FileIOBuilder::new("memory").build().unwrap(); + let table_path = "memory:/row_residual"; + let bucket_path = format!("{table_path}/bucket-0"); + let file_name = "part-0.row"; + let file_path = format!("{bucket_path}/{file_name}"); + let output = file_io.new_output(&file_path).unwrap(); + let mut writer = create_format_writer(&output, schema, "zstd", 1, None, None) + .await + .unwrap(); + writer.write(&batch).await.unwrap(); + let file_size = writer.close().await.unwrap() as i64; + + let schema_id = 1; + let split = DataSplitBuilder::new() + .with_snapshot(1) + .with_partition(BinaryRow::new(0)) + .with_bucket(0) + .with_bucket_path(bucket_path) + .with_total_buckets(1) + .with_data_files(vec![data_file(file_name, file_size, 5, schema_id)]) + .build() + .unwrap(); + + // age > 25 -> only [30, 40, 50] must survive. + let predicate: Predicate = PredicateBuilder::new(&fields) + .greater_than("age", Datum::Int(25)) + .unwrap(); + let read_type = vec![fields[1].clone()]; + let reader = DataFileReader::new( + file_io.clone(), + SchemaManager::new(file_io, table_path.to_string()), + schema_id, + fields, + read_type, + vec![predicate], + ); + + let batches = reader + .read(&[split]) + .unwrap() + .try_collect::>() + .await + .unwrap(); + + let ages: Vec = batches + .iter() + .flat_map(|b| { + b.column(0) + .as_any() + .downcast_ref::() + .unwrap() + .values() + .to_vec() + }) + .collect(); + assert_eq!(ages, vec![30, 40, 50]); + } + + /// Guard: projecting `_ROW_ID` together with a data predicate must fail + /// loudly rather than assign wrong row ids. `_ROW_ID` is materialized + /// positionally from post-filter batch row counts, so the readers' residual + /// filter dropping rows would desync it. See the guard in `read()`. + #[tokio::test] + async fn read_rejects_row_id_projection_with_data_predicate() { + // Write a real .row file so read() reaches read_single_file_stream (where + // the guard lives). Project _ROW_ID alongside a data predicate → Unsupported. + let fields = vec![ + field(0, "id", DataType::Int(IntType::new())), + field(1, "age", DataType::Int(IntType::new())), + ]; + let schema = build_target_arrow_schema(&fields).unwrap(); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![1, 2, 3])), + Arc::new(Int32Array::from(vec![10, 20, 30])), + ], + ) + .unwrap(); + + let file_io = FileIOBuilder::new("memory").build().unwrap(); + let table_path = "memory:/row_id_guard"; + let bucket_path = format!("{table_path}/bucket-0"); + let file_name = "part-0.row"; + let output = file_io + .new_output(&format!("{bucket_path}/{file_name}")) + .unwrap(); + let mut writer = create_format_writer(&output, schema, "zstd", 1, None, None) + .await + .unwrap(); + writer.write(&batch).await.unwrap(); + let file_size = writer.close().await.unwrap() as i64; + + let split = DataSplitBuilder::new() + .with_snapshot(1) + .with_partition(BinaryRow::new(0)) + .with_bucket(0) + .with_bucket_path(bucket_path) + .with_total_buckets(1) + .with_data_files(vec![data_file(file_name, file_size, 3, 1)]) + .build() + .unwrap(); + + let row_id = DataField::new( + crate::spec::ROW_ID_FIELD_ID, + ROW_ID_FIELD_NAME.to_string(), + DataType::BigInt(crate::spec::BigIntType::new()), + ); + // read_type projects _ROW_ID alongside a real column; predicate on age. + let read_type = vec![fields[1].clone(), row_id]; + let predicate: Predicate = PredicateBuilder::new(&fields) + .greater_than("age", Datum::Int(25)) + .unwrap(); + + let reader = DataFileReader::new( + file_io.clone(), + SchemaManager::new(file_io, table_path.to_string()), + 1, + fields, + read_type, + vec![predicate], + ); + + // The guard is inside read_single_file_stream, reached while consuming the + // stream, so the error surfaces on collect. + let result = reader.read(&[split]).unwrap().try_collect::>().await; + let err = match result { + Ok(_) => panic!("must reject _ROW_ID + predicate"), + Err(err) => err, + }; + assert!( + matches!(&err, crate::Error::Unsupported { message } if message.contains("_ROW_ID")), + "expected Unsupported mentioning _ROW_ID, got: {err:?}" + ); + } + + #[test] + fn reject_row_id_guard_allows_constant_always_true_predicate() { + // A constant AlwaysTrue keeps every row in order, so it cannot desync + // positional _ROW_ID and must NOT trip the guard. + let row_id = DataField::new( + crate::spec::ROW_ID_FIELD_ID, + ROW_ID_FIELD_NAME.to_string(), + DataType::BigInt(crate::spec::BigIntType::new()), + ); + let read_type = vec![row_id]; + // AlwaysTrue alone -> allowed. + assert!( + DataFileReader::reject_row_id_with_predicate(&read_type, &[Predicate::AlwaysTrue]) + .is_ok(), + "AlwaysTrue must not trip the _ROW_ID guard" + ); + // A real filtering predicate -> rejected. + let filtering = PredicateBuilder::new(&[field(0, "age", DataType::Int(IntType::new()))]) + .greater_than("age", Datum::Int(1)) + .unwrap(); + assert!( + DataFileReader::reject_row_id_with_predicate(&read_type, &[filtering]).is_err(), + "a row-filtering predicate must trip the _ROW_ID guard" + ); + } } #[cfg(all(test, feature = "mosaic"))]