-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathCDCSync.js
More file actions
170 lines (159 loc) · 6.59 KB
/
Copy pathCDCSync.js
File metadata and controls
170 lines (159 loc) · 6.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
import FastCDC from './FastCDC.js';
import CDCStore from './CDCStore.js';
import CDCManifest from './CDCManifest.js';
/**
* Content-defined delta sync — dedup-friendly building block used by `DoubleSync`.
*
* High-level protocol between a *receiver* (has old data + a CAS of past chunks) and a
* *sender* (has new data):
*
* 1. Sender splits new data into content-defined chunks, hashes each one, and emits a
* manifest: an ordered list of `(hash, length)` entries describing the new bytes.
*
* 2. Receiver compares manifest hashes against its local `CDCStore`. The missing
* hashes are the only bytes that actually need to cross the wire.
*
* 3. Sender transmits just the missing chunks. Receiver inserts them into its store.
*
* 4. Receiver reconstructs the new data by concatenating store chunks in manifest order.
*
* The killer property: dedup is automatic and cross-file. If the same 8KB chunk appears
* in 50 different files, it's stored once and transferred once. Inserting one byte at the
* start of a file only changes the chunk surrounding the insertion — every other chunk
* keeps its identity, so its hash matches and no bytes are transferred for it.
*
* @example
* const cdc = new CDCSync({ avgSize: 8192 });
*
* // Sender side
* const { manifest, chunks } = cdc.split(newData);
*
* // Receiver side
* const receiverStore = new CDCStore();
* // ... (receiver has accumulated chunks over time)
* const missing = cdc.missingChunks({ manifest, store: receiverStore });
*
* // Sender packages the missing chunks; receiver inserts them into its store:
* for (const { hash, bytes } of senderProvidesMissing(missing, chunks)) {
* receiverStore.putWithHash(hash, bytes);
* }
*
* // Reconstruct
* const rebuilt = cdc.reconstruct({ manifest, store: receiverStore });
*/
export default class CDCSync {
  /**
   * Create a sync helper backed by a `FastCDC` chunker.
   *
   * @param {Object} [params] - forwarded unchanged to the `FastCDC` constructor
   * @param {number} [params.avgSize=8192] - target average chunk size in bytes (power of 2)
   * @param {number} [params.minSize] - lower clamp; defaults to avgSize/4
   * @param {number} [params.maxSize] - upper clamp; defaults to avgSize*8
   */
  constructor(params = {}) {
    /** @type {FastCDC} */
    this.cdc = new FastCDC(params);
  }

  /** @returns {number} the `avgSize` reported by the underlying chunker */
  get avgSize() { return this.cdc.avgSize; }

  /** @returns {number} the `minSize` reported by the underlying chunker */
  get minSize() { return this.cdc.minSize; }

  /** @returns {number} the `maxSize` reported by the underlying chunker */
  get maxSize() { return this.cdc.maxSize; }

  /**
   * Split `data` into content-defined chunks, hash each chunk, and produce a manifest
   * along with the raw chunks themselves.
   *
   * The returned `chunks` array is in manifest order — `chunks[i]` corresponds to
   * the `i`th manifest entry. `bytes` is a zero-copy `Uint8Array` view into `data`,
   * so mutating `data` afterwards also changes the returned chunk bytes.
   *
   * @param {Uint8Array} data
   * @returns {{manifest: Uint8Array, chunks: Array<{hash: Uint8Array, bytes: Uint8Array}>}}
   * @throws {Error} if `data` is not a `Uint8Array`
   */
  split(data) {
    if (!(data instanceof Uint8Array)) throw new Error('CDCSync.split: data must be Uint8Array');
    const chunks = [];
    const entries = [];
    for (const { offset, length } of this.cdc.chunks(data)) {
      // subarray() shares the caller's buffer instead of copying it.
      const bytes = data.subarray(offset, offset + length);
      const hash = CDCStore.hashOf(bytes);
      // The same hash object is referenced by both the manifest entry and the chunk record.
      entries.push({ hash, length });
      chunks.push({ hash, bytes });
    }
    const manifest = CDCManifest.encode({ entries });
    return { manifest, chunks };
  }

  /**
   * Convenience: split `data`, insert every chunk into `store`, and return the manifest.
   * Use this when the sender is also the local store for those chunks (e.g. you're
   * snapshotting your own data over time and want to dedup against past snapshots).
   *
   * @param {Object} params
   * @param {Uint8Array} params.data
   * @param {CDCStore} params.store - receives every chunk via `putWithHash(hash, bytes)`
   * @returns {Uint8Array} the manifest document
   * @throws {Error} if `data` is not a `Uint8Array`
   */
  splitInto({ data, store }) {
    const { manifest, chunks } = this.split(data);
    for (const { hash, bytes } of chunks) {
      store.putWithHash(hash, bytes);
    }
    return manifest;
  }

  /**
   * Inspect a manifest and return the list of chunk hashes the receiver's store does
   * NOT yet have — i.e. the bytes that need to actually cross the wire.
   *
   * Order matches manifest order. Duplicates within the manifest are deduplicated in
   * the returned list (the same chunk only needs to be transferred once even if it
   * appears multiple times in the file).
   *
   * @param {Object} params
   * @param {Uint8Array|CDCManifest} params.manifest - encoded manifest or an already-built `CDCManifest`
   * @param {CDCStore} params.store
   * @returns {Array<{hash: Uint8Array, length: number}>} each `hash` is a fresh copy
   */
  missingChunks({ manifest, store }) {
    const m = manifest instanceof CDCManifest ? manifest : new CDCManifest(manifest);
    /** @type {Set<string>} hex keys of hashes already reported as missing */
    const seen = new Set();
    /** @type {Array<{hash: Uint8Array, length: number}>} */
    const missing = [];
    for (const { hash, length } of m.entries()) {
      const key = CDCStore.hashToHex(hash);
      if (seen.has(key)) continue;
      if (store.has(hash)) continue;
      seen.add(key);
      // slice() hands the caller its own copy of the hash bytes.
      missing.push({ hash: hash.slice(), length });
    }
    return missing;
  }

  /**
   * Reconstruct the original bytes by concatenating `store` chunks in manifest order.
   *
   * @param {Object} params
   * @param {Uint8Array|CDCManifest} params.manifest - encoded manifest or an already-built `CDCManifest`
   * @param {CDCStore} params.store - must contain every chunk referenced by the manifest
   * @returns {Uint8Array} a newly allocated buffer of the manifest's declared total length
   * @throws {Error} if a referenced chunk is missing, its actual length differs from
   * the manifest entry (corruption check), or the rebuilt size falls short of the
   * manifest's declared total length
   * @throws {RangeError} if the manifest entries add up to more than the declared
   * total length
   */
  reconstruct({ manifest, store }) {
    const m = manifest instanceof CDCManifest ? manifest : new CDCManifest(manifest);
    const out = new Uint8Array(Number(m.totalLength));
    let cur = 0;
    for (const { hash, length } of m.entries()) {
      const bytes = store.get(hash);
      if (!bytes) {
        throw new Error(`CDCSync.reconstruct: missing chunk ${CDCStore.hashToHex(hash)}`);
      }
      if (bytes.length !== length) {
        throw new Error(
          `CDCSync.reconstruct: chunk ${CDCStore.hashToHex(hash)} length mismatch (` +
          `manifest=${length}, store=${bytes.length})`,
        );
      }
      // Without this guard an inconsistent manifest makes out.set() throw a bare
      // "offset is out of bounds" RangeError that gives no hint about the cause.
      if (cur + length > out.length) {
        throw new RangeError(
          `CDCSync.reconstruct: chunk ${CDCStore.hashToHex(hash)} ends at byte ${cur + length}, ` +
          `past the manifest's declared total length ${out.length}`,
        );
      }
      out.set(bytes, cur);
      cur += length;
    }
    // Entries that sum to less than the declared total are also treated as corruption.
    if (cur !== out.length) {
      throw new Error(`CDCSync.reconstruct: rebuilt ${cur} bytes, expected ${out.length}`);
    }
    return out;
  }
}