doublesync/FastCDC.js at main · FizzyFlow/doublesync · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import { GEAR } from './GearHash.js';

/**
 * Content-defined chunker using the FastCDC algorithm (2016, Xia et al.).
 *
 * Splits `data` into variable-size chunks at boundaries determined by content,
 * not by absolute offset. This is the key property: inserting or deleting bytes
 * only changes the chunks immediately surrounding the edit — every other chunk
 * stays bit-identical, so it can be deduplicated.
 *
 * Boundary detection runs a 32-bit Gear hash byte-by-byte. We declare a chunk
 * boundary at byte `i` when the low bits of the running hash satisfy a mask.
 * Two masks are used (FastCDC's "normalized chunking"):
 *
 *   - `maskS` (strict, more 1-bits → harder to satisfy) for [minSize, avgSize)
 *   - `maskL` (lenient, fewer 1-bits → easier to satisfy) for [avgSize, maxSize)
 *
 * Result: chunk size distribution is sharply centered around `avgSize` with hard
 * clamps at `minSize` and `maxSize`. Without normalization, chunk sizes would
 * follow a geometric distribution with a long fat tail.
 *
 * @example
 * const cdc = new FastCDC({ avgSize: 8192 });
 * for (const { offset, length } of cdc.chunks(data)) {
 *     handleChunk(data.subarray(offset, offset + length));
 * }
 */
export default class FastCDC {
    /**
     * @param {Object} [params]
     * @param {number} [params.avgSize=8192]   - target average chunk size in bytes (must be a power of 2)
     * @param {number} [params.minSize=avgSize/4] - lower clamp; chunks never smaller than this
     * @param {number} [params.maxSize=avgSize*8] - upper clamp; chunks never larger than this
     */
    constructor(params = {}) {
        const avgSize = params.avgSize ?? 8192;
        if (!Number.isInteger(avgSize) || avgSize <= 0 || (avgSize & (avgSize - 1)) !== 0) {
            throw new Error(`FastCDC: avgSize must be a positive power of 2, got ${avgSize}`);
        }

        const minSize = params.minSize ?? Math.max(64, avgSize >>> 2);
        const maxSize = params.maxSize ?? avgSize * 8;
        if (minSize <= 0 || maxSize < avgSize || minSize > avgSize) {
            throw new Error(`FastCDC: require 0 < minSize <= avgSize <= maxSize; got ${minSize}/${avgSize}/${maxSize}`);
        }

        /** @type {number} */
        this.avgSize = avgSize;
        /** @type {number} */
        this.minSize = minSize;
        /** @type {number} */
        this.maxSize = maxSize;

        // bits = log2(avgSize). Build two masks bracketing the target boundary probability:
        //   - maskS = (bits+2) low bits set  → P(hit per byte) = 1/(4·avgSize)  → strict
        //   - maskL = (bits-2) low bits set  → P(hit per byte) = 4/avgSize       → lenient
        // Low bits are used because Gear's shift-add scrambles them most freely;
        // high bits accumulate carry chains and lose entropy after many bytes.
        const bits = Math.log2(avgSize) | 0;
        /** @type {number} - strict boundary mask (used in the first half of a chunk) */
        this.maskS = FastCDC._lowBitMask(bits + 2);
        /** @type {number} - lenient boundary mask (used in the second half of a chunk) */
        this.maskL = FastCDC._lowBitMask(bits - 2);
    }

    /**
     * Build a 32-bit mask with the lowest `count` bits set.
     *
     * @param {number} count - 1..32
     * @returns {number}
     */
    static _lowBitMask(count) {
        if (count <= 0) return 0;
        if (count >= 32) return 0xFFFFFFFF >>> 0;
        return ((1 << count) - 1) >>> 0;
    }

    /**
     * Iterate chunk boundaries over `data`. Each yielded chunk is `{ offset, length }`
     * pointing into `data` — callers can extract bytes via `data.subarray(...)` without
     * copying.
     *
     * @param {Uint8Array} data
     * @yields {{offset: number, length: number}}
     */
    *chunks(data) {
        if (!(data instanceof Uint8Array)) throw new Error('FastCDC.chunks: data must be Uint8Array');

        let i = 0;
        const N = data.length;
        while (i < N) {
            const length = this._nextBoundary(data, i, N);
            yield { offset: i, length };
            i += length;
        }
    }

    /**
     * Find the next chunk boundary starting at `start`, returning the chunk's length.
     * Implements FastCDC's normalized boundary search:
     *
     *   - bytes [start, start+minSize)         → skip (don't even hash; respect minSize)
     *   - bytes [start+minSize, start+avgSize) → boundary if `h & maskS == 0` (strict)
     *   - bytes [start+avgSize, start+maxSize) → boundary if `h & maskL == 0` (lenient)
     *   - at start+maxSize                     → force boundary
     *
     * @param {Uint8Array} data
     * @param {number} start
     * @param {number} N - data.length
     * @returns {number} chunk length (≥ 1)
     */
    _nextBoundary(data, start, N) {
        const remaining = N - start;
        if (remaining <= this.minSize) return remaining; // tail smaller than minSize → one final chunk

        const limit = Math.min(start + this.maxSize, N);
        const strictEnd = Math.min(start + this.avgSize, limit);

        let h = 0;
        let i = start + this.minSize;

        // strict region
        for (; i < strictEnd; i++) {
            h = (((h << 1) >>> 0) + GEAR[data[i]]) >>> 0;
            if ((h & this.maskS) === 0) return i - start + 1;
        }

        // lenient region
        for (; i < limit; i++) {
            h = (((h << 1) >>> 0) + GEAR[data[i]]) >>> 0;
            if ((h & this.maskL) === 0) return i - start + 1;
        }

        return limit - start; // forced boundary at maxSize (or end of buffer)
    }

    /**
     * Convenience: split `data` once and return all chunks as a regular array.
     * For very large inputs prefer `chunks(data)` to avoid building the array.
     *
     * @param {Uint8Array} data
     * @returns {Array<{offset: number, length: number}>}
     */
    splitAll(data) {
        return [...this.chunks(data)];
    }
}