1 | /* SPDX-License-Identifier: GPL-2.0-only */ |
2 | /* |
3 | -*- linux-c -*- |
4 | drbd_receiver.c |
5 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. |
6 | |
7 | Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. |
8 | Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. |
9 | Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. |
10 | |
11 | */ |
12 | |
13 | #ifndef _DRBD_VLI_H |
14 | #define _DRBD_VLI_H |
15 | |
16 | /* |
17 | * At a granularity of 4KiB storage represented per bit, |
18 | * and stroage sizes of several TiB, |
19 | * and possibly small-bandwidth replication, |
20 | * the bitmap transfer time can take much too long, |
21 | * if transmitted in plain text. |
22 | * |
23 | * We try to reduce the transferred bitmap information |
24 | * by encoding runlengths of bit polarity. |
25 | * |
26 | * We never actually need to encode a "zero" (runlengths are positive). |
27 | * But then we have to store the value of the first bit. |
28 | * The first bit of information thus shall encode if the first runlength |
29 | * gives the number of set or unset bits. |
30 | * |
31 | * We assume that large areas are either completely set or unset, |
32 | * which gives good compression with any runlength method, |
33 | * even when encoding the runlength as fixed size 32bit/64bit integers. |
34 | * |
35 | * Still, there may be areas where the polarity flips every few bits, |
36 | * and encoding the runlength sequence of those areas with fix size |
37 | * integers would be much worse than plaintext. |
38 | * |
39 | * We want to encode small runlength values with minimum code length, |
40 | * while still being able to encode a Huge run of all zeros. |
41 | * |
42 | * Thus we need a Variable Length Integer encoding, VLI. |
43 | * |
44 | * For some cases, we produce more code bits than plaintext input. |
45 | * We need to send incompressible chunks as plaintext, skip over them |
46 | * and then see if the next chunk compresses better. |
47 | * |
48 | * We don't care too much about "excellent" compression ratio for large |
49 | * runlengths (all set/all clear): whether we achieve a factor of 100 |
50 | * or 1000 is not that much of an issue. |
51 | * We do not want to waste too much on short runlengths in the "noisy" |
52 | * parts of the bitmap, though. |
53 | * |
54 | * There are endless variants of VLI, we experimented with: |
55 | * * simple byte-based |
56 | * * various bit based with different code word length. |
57 | * |
58 | * To avoid yet an other configuration parameter (choice of bitmap compression |
59 | * algorithm) which was difficult to explain and tune, we just chose the one |
60 | * variant that turned out best in all test cases. |
61 | * Based on real world usage patterns, with device sizes ranging from a few GiB |
62 | * to several TiB, file server/mailserver/webserver/mysql/postgress, |
63 | * mostly idle to really busy, the all time winner (though sometimes only |
64 | * marginally better) is: |
65 | */ |
66 | |
67 | /* |
68 | * encoding is "visualised" as |
69 | * __little endian__ bitstream, least significant bit first (left most) |
70 | * |
71 | * this particular encoding is chosen so that the prefix code |
72 | * starts as unary encoding the level, then modified so that |
73 | * 10 levels can be described in 8bit, with minimal overhead |
74 | * for the smaller levels. |
75 | * |
76 | * Number of data bits follow fibonacci sequence, with the exception of the |
77 | * last level (+1 data bit, so it makes 64bit total). The only worse code when |
78 | * encoding bit polarity runlength is 1 plain bits => 2 code bits. |
79 | prefix data bits max val NÂș data bits |
80 | 0 x 0x2 1 |
81 | 10 x 0x4 1 |
82 | 110 xx 0x8 2 |
83 | 1110 xxx 0x10 3 |
84 | 11110 xxx xx 0x30 5 |
85 | 111110 xx xxxxxx 0x130 8 |
86 | 11111100 xxxxxxxx xxxxx 0x2130 13 |
87 | 11111110 xxxxxxxx xxxxxxxx xxxxx 0x202130 21 |
88 | 11111101 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xx 0x400202130 34 |
89 | 11111111 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56 |
90 | * maximum encodable value: 0x100000400202130 == 2**56 + some */ |
91 | |
92 | /* compression "table": |
93 | transmitted x 0.29 |
94 | as plaintext x ........................ |
95 | x ........................ |
96 | x ........................ |
97 | x 0.59 0.21........................ |
98 | x ........................................................ |
99 | x .. c ................................................... |
100 | x 0.44.. o ................................................... |
101 | x .......... d ................................................... |
102 | x .......... e ................................................... |
103 | X............. ................................................... |
104 | x.............. b ................................................... |
105 | 2.0x............... i ................................................... |
106 | #X................ t ................................................... |
107 | #................. s ........................... plain bits .......... |
108 | -+----------------------------------------------------------------------- |
109 | 1 16 32 64 |
110 | */ |
111 | |
112 | /* LEVEL: (total bits, prefix bits, prefix value), |
113 | * sorted ascending by number of total bits. |
114 | * The rest of the code table is calculated at compiletime from this. */ |
115 | |
116 | /* fibonacci data 1, 1, ... */ |
117 | #define VLI_L_1_1() do { \ |
118 | LEVEL( 2, 1, 0x00); \ |
119 | LEVEL( 3, 2, 0x01); \ |
120 | LEVEL( 5, 3, 0x03); \ |
121 | LEVEL( 7, 4, 0x07); \ |
122 | LEVEL(10, 5, 0x0f); \ |
123 | LEVEL(14, 6, 0x1f); \ |
124 | LEVEL(21, 8, 0x3f); \ |
125 | LEVEL(29, 8, 0x7f); \ |
126 | LEVEL(42, 8, 0xbf); \ |
127 | LEVEL(64, 8, 0xff); \ |
128 | } while (0) |
129 | |
130 | /* finds a suitable level to decode the least significant part of in. |
131 | * returns number of bits consumed. |
132 | * |
133 | * BUG() for bad input, as that would mean a buggy code table. */ |
134 | static inline int vli_decode_bits(u64 *out, const u64 in) |
135 | { |
136 | u64 adj = 1; |
137 | |
138 | #define LEVEL(t,b,v) \ |
139 | do { \ |
140 | if ((in & ((1 << b) -1)) == v) { \ |
141 | *out = ((in & ((~0ULL) >> (64-t))) >> b) + adj; \ |
142 | return t; \ |
143 | } \ |
144 | adj += 1ULL << (t - b); \ |
145 | } while (0) |
146 | |
147 | VLI_L_1_1(); |
148 | |
149 | /* NOT REACHED, if VLI_LEVELS code table is defined properly */ |
150 | BUG(); |
151 | #undef LEVEL |
152 | } |
153 | |
154 | /* return number of code bits needed, |
155 | * or negative error number */ |
156 | static inline int __vli_encode_bits(u64 *out, const u64 in) |
157 | { |
158 | u64 max = 0; |
159 | u64 adj = 1; |
160 | |
161 | if (in == 0) |
162 | return -EINVAL; |
163 | |
164 | #define LEVEL(t,b,v) do { \ |
165 | max += 1ULL << (t - b); \ |
166 | if (in <= max) { \ |
167 | if (out) \ |
168 | *out = ((in - adj) << b) | v; \ |
169 | return t; \ |
170 | } \ |
171 | adj = max + 1; \ |
172 | } while (0) |
173 | |
174 | VLI_L_1_1(); |
175 | |
176 | return -EOVERFLOW; |
177 | #undef LEVEL |
178 | } |
179 | |
180 | #undef VLI_L_1_1 |
181 | |
182 | /* code from here down is independend of actually used bit code */ |
183 | |
184 | /* |
185 | * Code length is determined by some unique (e.g. unary) prefix. |
186 | * This encodes arbitrary bit length, not whole bytes: we have a bit-stream, |
187 | * not a byte stream. |
188 | */ |
189 | |
190 | /* for the bitstream, we need a cursor */ |
191 | struct bitstream_cursor { |
192 | /* the current byte */ |
193 | u8 *b; |
194 | /* the current bit within *b, nomalized: 0..7 */ |
195 | unsigned int bit; |
196 | }; |
197 | |
198 | /* initialize cursor to point to first bit of stream */ |
199 | static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s) |
200 | { |
201 | cur->b = s; |
202 | cur->bit = 0; |
203 | } |
204 | |
205 | /* advance cursor by that many bits; maximum expected input value: 64, |
206 | * but depending on VLI implementation, it may be more. */ |
207 | static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits) |
208 | { |
209 | bits += cur->bit; |
210 | cur->b = cur->b + (bits >> 3); |
211 | cur->bit = bits & 7; |
212 | } |
213 | |
214 | /* the bitstream itself knows its length */ |
215 | struct bitstream { |
216 | struct bitstream_cursor cur; |
217 | unsigned char *buf; |
218 | size_t buf_len; /* in bytes */ |
219 | |
220 | /* for input stream: |
221 | * number of trailing 0 bits for padding |
222 | * total number of valid bits in stream: buf_len * 8 - pad_bits */ |
223 | unsigned int pad_bits; |
224 | }; |
225 | |
226 | static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits) |
227 | { |
228 | bs->buf = s; |
229 | bs->buf_len = len; |
230 | bs->pad_bits = pad_bits; |
231 | bitstream_cursor_reset(cur: &bs->cur, s: bs->buf); |
232 | } |
233 | |
234 | static inline void bitstream_rewind(struct bitstream *bs) |
235 | { |
236 | bitstream_cursor_reset(cur: &bs->cur, s: bs->buf); |
237 | memset(bs->buf, 0, bs->buf_len); |
238 | } |
239 | |
240 | /* Put (at most 64) least significant bits of val into bitstream, and advance cursor. |
241 | * Ignores "pad_bits". |
242 | * Returns zero if bits == 0 (nothing to do). |
243 | * Returns number of bits used if successful. |
244 | * |
245 | * If there is not enough room left in bitstream, |
246 | * leaves bitstream unchanged and returns -ENOBUFS. |
247 | */ |
248 | static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits) |
249 | { |
250 | unsigned char *b = bs->cur.b; |
251 | unsigned int tmp; |
252 | |
253 | if (bits == 0) |
254 | return 0; |
255 | |
256 | if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len) |
257 | return -ENOBUFS; |
258 | |
259 | /* paranoia: strip off hi bits; they should not be set anyways. */ |
260 | if (bits < 64) |
261 | val &= ~0ULL >> (64 - bits); |
262 | |
263 | *b++ |= (val & 0xff) << bs->cur.bit; |
264 | |
265 | for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8) |
266 | *b++ |= (val >> tmp) & 0xff; |
267 | |
268 | bitstream_cursor_advance(cur: &bs->cur, bits); |
269 | return bits; |
270 | } |
271 | |
272 | /* Fetch (at most 64) bits from bitstream into *out, and advance cursor. |
273 | * |
274 | * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged. |
275 | * |
276 | * If there are less than the requested number of valid bits left in the |
277 | * bitstream, still fetches all available bits. |
278 | * |
279 | * Returns number of actually fetched bits. |
280 | */ |
281 | static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits) |
282 | { |
283 | u64 val; |
284 | unsigned int n; |
285 | |
286 | if (bits > 64) |
287 | return -EINVAL; |
288 | |
289 | if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len) |
290 | bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3) |
291 | - bs->cur.bit - bs->pad_bits; |
292 | |
293 | if (bits == 0) { |
294 | *out = 0; |
295 | return 0; |
296 | } |
297 | |
298 | /* get the high bits */ |
299 | val = 0; |
300 | n = (bs->cur.bit + bits + 7) >> 3; |
301 | /* n may be at most 9, if cur.bit + bits > 64 */ |
302 | /* which means this copies at most 8 byte */ |
303 | if (n) { |
304 | memcpy(&val, bs->cur.b+1, n - 1); |
305 | val = le64_to_cpu(val) << (8 - bs->cur.bit); |
306 | } |
307 | |
308 | /* we still need the low bits */ |
309 | val |= bs->cur.b[0] >> bs->cur.bit; |
310 | |
311 | /* and mask out bits we don't want */ |
312 | val &= ~0ULL >> (64 - bits); |
313 | |
314 | bitstream_cursor_advance(cur: &bs->cur, bits); |
315 | *out = val; |
316 | |
317 | return bits; |
318 | } |
319 | |
320 | /* encodes @in as vli into @bs; |
321 | |
322 | * return values |
323 | * > 0: number of bits successfully stored in bitstream |
324 | * -ENOBUFS @bs is full |
325 | * -EINVAL input zero (invalid) |
326 | * -EOVERFLOW input too large for this vli code (invalid) |
327 | */ |
328 | static inline int vli_encode_bits(struct bitstream *bs, u64 in) |
329 | { |
330 | u64 code; |
331 | int bits = __vli_encode_bits(out: &code, in); |
332 | |
333 | if (bits <= 0) |
334 | return bits; |
335 | |
336 | return bitstream_put_bits(bs, val: code, bits); |
337 | } |
338 | |
339 | #endif |
340 | |