diff --git a/src/visitor/jsonvectorassembler.ts b/src/visitor/jsonvectorassembler.ts index 2f4973ad..a48e5ba9 100644 --- a/src/visitor/jsonvectorassembler.ts +++ b/src/visitor/jsonvectorassembler.ts @@ -201,7 +201,7 @@ function* binaryToString(vector: Vector | Vector | Vector 0 +// (which happens whenever Data was constructed against a shared/padded buffer +// or after slicing through tableFromIPC), the helper would read the wrong +// 64-bit words and emit incorrect JSON for OFFSET/DATA fields. +// +// For each affected type, we take a freshly-generated Vector (byteOffset === 0), +// rewrap every backing typed array in a subarray view at byteOffset > 0, and +// assert the JSONVectorAssembler produces the same JSON in both cases. +// +// These are focused unit tests of bigNumsToStrings' buffer-windowing contract. +// They pinpoint exactly which type / which callsite breaks if a regression +// lands, and they don't depend on tableFromIPC continuing to produce offset +// views. End-to-end coverage of the same flow on a broader type matrix lives +// in json-writer-tests.ts, which catches the same regression at the +// user-facing API level. + +import { + Data, DataType, Field, RecordBatch, Schema, Struct, +} from 'apache-arrow'; +import { JSONVectorAssembler } from 'apache-arrow/visitor/jsonvectorassembler'; +import * as generate from '../../../generate-test-data.js'; + +/** + * Return a copy of `arr` whose data lives at a nonzero byteOffset inside a + * larger underlying buffer. Equivalent to the state of a typed array that + * came out of `tableFromIPC` — same logical contents, shifted memory layout. + */ +function padBuffer(arr: T | undefined): T | undefined { + if (!arr) return arr; + const Ctor = arr.constructor as new (n: number) => any; + const padding = 4; + const padded = new Ctor((arr as any).length + padding * 2); + padded.set(arr, padding); + return padded.subarray(padding, padding + (arr as any).length) as T; +} + +/** + * Clone a Data so every backing typed array sits at byteOffset > 0. + * Recurses into children so nested types stay consistent. + */ +function withOffsetViews(d: Data): Data { + const newBuffers = Array.from(d.buffers as unknown as Array, padBuffer); + const newChildren = d.children.map((c) => withOffsetViews(c)); + return d.clone(d.type, d.offset, d.length, d.nullCount, newBuffers as any, newChildren); +} + +/** + * Wrap a Data in a single-field RecordBatch so we can hand it to assemble(). + */ +function batchOf(type: T, data: Data): RecordBatch { + const field = new Field('x', type, true); + const schema = new Schema([field]); + const structType = new Struct([field]); + const root = data.clone(structType as any, 0, data.length, 0, [undefined, undefined, undefined, undefined] as any, [data]); + return new RecordBatch(schema, root as Data); +} + +// Every affected callsite of bigNumsToStrings, one entry per type. Adding +// a new affected type is a one-line addition here. +const affectedTypeCases: Array<{ name: string; gen: () => generate.GeneratedVector }> = [ + { name: 'Int64', gen: () => generate.int64() }, + { name: 'Uint64', gen: () => generate.uint64() }, + { name: 'DateMillisecond', gen: () => generate.dateMillisecond() }, + { name: 'TimestampSecond', gen: () => generate.timestampSecond() }, + { name: 'TimestampMillisecond', gen: () => generate.timestampMillisecond() }, + { name: 'TimestampMicrosecond', gen: () => generate.timestampMicrosecond() }, + { name: 'TimestampNanosecond', gen: () => generate.timestampNanosecond() }, + { name: 'TimeMicrosecond', gen: () => generate.timeMicrosecond() }, + { name: 'TimeNanosecond', gen: () => generate.timeNanosecond() }, + { name: 'DurationSecond', gen: () => generate.durationSecond() }, + { name: 'DurationMillisecond', gen: () => generate.durationMillisecond() }, + { name: 'DurationMicrosecond', gen: () => generate.durationMicrosecond() }, + { name: 'DurationNanosecond', gen: () => generate.durationNanosecond() }, + { name: 'Decimal', gen: () => generate.decimal() }, + { name: 'LargeUtf8', gen: () => generate.largeUtf8() }, + { name: 'LargeBinary', gen: () => generate.largeBinary() }, +]; + +describe('JSONVectorAssembler offset-view safety', () => { + for (const { name, gen } of affectedTypeCases) { + test(name, () => { + const { vector } = gen(); + const fresh = vector.data[0]; + const viewed = withOffsetViews(fresh); + + // Sanity: the rewrap really did shift at least one buffer + // past byteOffset 0, so this run actually exercises the bug + // precondition. + const buffers = Array.from(viewed.buffers as unknown as Array); + expect(buffers.some((b) => b && b.byteOffset > 0)).toBe(true); + + const [[jsonFresh]] = JSONVectorAssembler.assemble(batchOf(fresh.type, fresh)); + const [[jsonViewed]] = JSONVectorAssembler.assemble(batchOf(fresh.type, viewed)); + expect(jsonViewed).toEqual(jsonFresh); + }); + } +}); diff --git a/test/unit/ipc/writer/json-writer-tests.ts b/test/unit/ipc/writer/json-writer-tests.ts index 29b66515..c7d48ee5 100644 --- a/test/unit/ipc/writer/json-writer-tests.ts +++ b/test/unit/ipc/writer/json-writer-tests.ts @@ -24,7 +24,9 @@ import { ArrowJSONLike, RecordBatchJSONWriter, RecordBatchReader, - Table + Table, + tableFromIPC, + tableToIPC, } from 'apache-arrow'; describe('RecordBatchJSONWriter', () => { @@ -38,12 +40,16 @@ describe('RecordBatchJSONWriter', () => { function testJSONWriter(table: Table, name: string) { describe(`should write the Arrow IPC JSON format (${name})`, () => { - test(`Table`, validateTable.bind(0, table)); + test(`Table`, validateJSONWriter.bind(0, table)); }); } -async function validateTable(source: Table) { - const writer = RecordBatchJSONWriter.writeAll(source); +async function validateJSONWriter(source: Table) { + // Route through binary IPC so the JSON writer sees realistic typed-array + // views with byteOffset > 0 (as `tableFromIPC` produces), not just fresh + // allocations from the generator. + const loaded = tableFromIPC(tableToIPC(source, 'stream')); + const writer = RecordBatchJSONWriter.writeAll(loaded); const string = await writer.toString(); const json = JSON.parse(string) as ArrowJSONLike; const result = new Table(RecordBatchReader.from(json));