Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/visitor/jsonvectorassembler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ function* binaryToString(vector: Vector<Binary> | Vector<LargeBinary> | Vector<F

/** @ignore */
function* bigNumsToStrings(values: BigUint64Array | BigInt64Array | Uint32Array | Int32Array | IntArray, stride: number) {
const u32s = new Uint32Array(values.buffer);
const u32s = new Uint32Array(values.buffer, values.byteOffset, values.byteLength / Uint32Array.BYTES_PER_ELEMENT);
for (let i = -1, n = u32s.length / stride; ++i < n;) {
yield `${BN.new(u32s.subarray((i + 0) * stride, (i + 1) * stride), false)}`;
}
Expand Down
116 changes: 116 additions & 0 deletions test/unit/ipc/writer/json-offset-views-tests.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// The bug: bigNumsToStrings() in src/visitor/jsonvectorassembler.ts used to
// do `new Uint32Array(values.buffer)` and ignore values.byteOffset/byteLength.
// When the underlying typed array was a subarray view with byteOffset > 0
// (which happens whenever Data was constructed against a shared/padded buffer
// or after slicing through tableFromIPC), the helper would read the wrong
// 64-bit words and emit incorrect JSON for OFFSET/DATA fields.
//
// For each affected type, we take a freshly-generated Vector (byteOffset === 0),
// rewrap every backing typed array in a subarray view at byteOffset > 0, and
// assert the JSONVectorAssembler produces the same JSON in both cases.
//
// These are focused unit tests of bigNumsToStrings' buffer-windowing contract.
// They pinpoint exactly which type / which callsite breaks if a regression
// lands, and they don't depend on tableFromIPC continuing to produce offset
// views. End-to-end coverage of the same flow on a broader type matrix lives
// in json-writer-tests.ts, which catches the same regression at the
// user-facing API level.

import {
Data, DataType, Field, RecordBatch, Schema, Struct,
} from 'apache-arrow';
import { JSONVectorAssembler } from 'apache-arrow/visitor/jsonvectorassembler';
import * as generate from '../../../generate-test-data.js';

/**
* Return a copy of `arr` whose data lives at a nonzero byteOffset inside a
* larger underlying buffer. Equivalent to the state of a typed array that
* came out of `tableFromIPC` — same logical contents, shifted memory layout.
*/
function padBuffer<T extends ArrayBufferView>(arr: T | undefined): T | undefined {
if (!arr) return arr;
const Ctor = arr.constructor as new (n: number) => any;
const padding = 4;
const padded = new Ctor((arr as any).length + padding * 2);
padded.set(arr, padding);
return padded.subarray(padding, padding + (arr as any).length) as T;
}

/**
* Clone a Data so every backing typed array sits at byteOffset > 0.
* Recurses into children so nested types stay consistent.
*/
function withOffsetViews<T extends DataType>(d: Data<T>): Data<T> {
const newBuffers = Array.from(d.buffers as unknown as Array<ArrayBufferView | undefined>, padBuffer);
const newChildren = d.children.map((c) => withOffsetViews(c));
return d.clone(d.type, d.offset, d.length, d.nullCount, newBuffers as any, newChildren);
}

/**
* Wrap a Data in a single-field RecordBatch so we can hand it to assemble().
*/
function batchOf<T extends DataType>(type: T, data: Data<T>): RecordBatch<any> {
const field = new Field('x', type, true);
const schema = new Schema([field]);
const structType = new Struct([field]);
const root = data.clone(structType as any, 0, data.length, 0, [undefined, undefined, undefined, undefined] as any, [data]);
return new RecordBatch(schema, root as Data<Struct>);
}

// Every affected callsite of bigNumsToStrings, one entry per type. Adding
// a new affected type is a one-line addition here.
const affectedTypeCases: Array<{ name: string; gen: () => generate.GeneratedVector<any> }> = [
{ name: 'Int64', gen: () => generate.int64() },
{ name: 'Uint64', gen: () => generate.uint64() },
{ name: 'DateMillisecond', gen: () => generate.dateMillisecond() },
{ name: 'TimestampSecond', gen: () => generate.timestampSecond() },
{ name: 'TimestampMillisecond', gen: () => generate.timestampMillisecond() },
{ name: 'TimestampMicrosecond', gen: () => generate.timestampMicrosecond() },
{ name: 'TimestampNanosecond', gen: () => generate.timestampNanosecond() },
{ name: 'TimeMicrosecond', gen: () => generate.timeMicrosecond() },
{ name: 'TimeNanosecond', gen: () => generate.timeNanosecond() },
{ name: 'DurationSecond', gen: () => generate.durationSecond() },
{ name: 'DurationMillisecond', gen: () => generate.durationMillisecond() },
{ name: 'DurationMicrosecond', gen: () => generate.durationMicrosecond() },
{ name: 'DurationNanosecond', gen: () => generate.durationNanosecond() },
{ name: 'Decimal', gen: () => generate.decimal() },
{ name: 'LargeUtf8', gen: () => generate.largeUtf8() },
{ name: 'LargeBinary', gen: () => generate.largeBinary() },
];

describe('JSONVectorAssembler offset-view safety', () => {
for (const { name, gen } of affectedTypeCases) {
test(name, () => {
const { vector } = gen();
const fresh = vector.data[0];
const viewed = withOffsetViews(fresh);

// Sanity: the rewrap really did shift at least one buffer
// past byteOffset 0, so this run actually exercises the bug
// precondition.
const buffers = Array.from(viewed.buffers as unknown as Array<ArrayBufferView | undefined>);
expect(buffers.some((b) => b && b.byteOffset > 0)).toBe(true);

const [[jsonFresh]] = JSONVectorAssembler.assemble(batchOf(fresh.type, fresh));
const [[jsonViewed]] = JSONVectorAssembler.assemble(batchOf(fresh.type, viewed));
expect(jsonViewed).toEqual(jsonFresh);
});
}
});
14 changes: 10 additions & 4 deletions test/unit/ipc/writer/json-writer-tests.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,9 @@ import {
ArrowJSONLike,
RecordBatchJSONWriter,
RecordBatchReader,
Table
Table,
tableFromIPC,
tableToIPC,
} from 'apache-arrow';

describe('RecordBatchJSONWriter', () => {
Expand All @@ -38,12 +40,16 @@ describe('RecordBatchJSONWriter', () => {

function testJSONWriter(table: Table, name: string) {
describe(`should write the Arrow IPC JSON format (${name})`, () => {
test(`Table`, validateTable.bind(0, table));
test(`Table`, validateJSONWriter.bind(0, table));
});
}

async function validateTable(source: Table) {
const writer = RecordBatchJSONWriter.writeAll(source);
async function validateJSONWriter(source: Table) {
// Route through binary IPC so the JSON writer sees realistic typed-array
// views with byteOffset > 0 (as `tableFromIPC` produces), not just fresh
// allocations from the generator.
const loaded = tableFromIPC(tableToIPC(source, 'stream'));
const writer = RecordBatchJSONWriter.writeAll(loaded);
const string = await writer.toString();
const json = JSON.parse(string) as ArrowJSONLike;
const result = new Table(RecordBatchReader.from(json));
Expand Down
Loading