1/* SPDX-License-Identifier: GPL-2.0-only */
2/*
3-*- linux-c -*-
4 drbd_receiver.c
5 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
6
7 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
8 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
9 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
10
11 */
12
13#ifndef _DRBD_VLI_H
14#define _DRBD_VLI_H
15
16/*
17 * At a granularity of 4KiB storage represented per bit,
18 * and stroage sizes of several TiB,
19 * and possibly small-bandwidth replication,
20 * the bitmap transfer time can take much too long,
21 * if transmitted in plain text.
22 *
23 * We try to reduce the transferred bitmap information
24 * by encoding runlengths of bit polarity.
25 *
26 * We never actually need to encode a "zero" (runlengths are positive).
27 * But then we have to store the value of the first bit.
28 * The first bit of information thus shall encode if the first runlength
29 * gives the number of set or unset bits.
30 *
31 * We assume that large areas are either completely set or unset,
32 * which gives good compression with any runlength method,
33 * even when encoding the runlength as fixed size 32bit/64bit integers.
34 *
35 * Still, there may be areas where the polarity flips every few bits,
36 * and encoding the runlength sequence of those areas with fix size
37 * integers would be much worse than plaintext.
38 *
39 * We want to encode small runlength values with minimum code length,
40 * while still being able to encode a Huge run of all zeros.
41 *
42 * Thus we need a Variable Length Integer encoding, VLI.
43 *
44 * For some cases, we produce more code bits than plaintext input.
45 * We need to send incompressible chunks as plaintext, skip over them
46 * and then see if the next chunk compresses better.
47 *
48 * We don't care too much about "excellent" compression ratio for large
49 * runlengths (all set/all clear): whether we achieve a factor of 100
50 * or 1000 is not that much of an issue.
51 * We do not want to waste too much on short runlengths in the "noisy"
52 * parts of the bitmap, though.
53 *
54 * There are endless variants of VLI, we experimented with:
55 * * simple byte-based
56 * * various bit based with different code word length.
57 *
58 * To avoid yet an other configuration parameter (choice of bitmap compression
59 * algorithm) which was difficult to explain and tune, we just chose the one
60 * variant that turned out best in all test cases.
61 * Based on real world usage patterns, with device sizes ranging from a few GiB
62 * to several TiB, file server/mailserver/webserver/mysql/postgress,
63 * mostly idle to really busy, the all time winner (though sometimes only
64 * marginally better) is:
65 */
66
67/*
68 * encoding is "visualised" as
69 * __little endian__ bitstream, least significant bit first (left most)
70 *
71 * this particular encoding is chosen so that the prefix code
72 * starts as unary encoding the level, then modified so that
73 * 10 levels can be described in 8bit, with minimal overhead
74 * for the smaller levels.
75 *
76 * Number of data bits follow fibonacci sequence, with the exception of the
77 * last level (+1 data bit, so it makes 64bit total). The only worse code when
78 * encoding bit polarity runlength is 1 plain bits => 2 code bits.
79prefix data bits max val NÂș data bits
800 x 0x2 1
8110 x 0x4 1
82110 xx 0x8 2
831110 xxx 0x10 3
8411110 xxx xx 0x30 5
85111110 xx xxxxxx 0x130 8
8611111100 xxxxxxxx xxxxx 0x2130 13
8711111110 xxxxxxxx xxxxxxxx xxxxx 0x202130 21
8811111101 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xx 0x400202130 34
8911111111 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56
90 * maximum encodable value: 0x100000400202130 == 2**56 + some */
91
92/* compression "table":
93 transmitted x 0.29
94 as plaintext x ........................
95 x ........................
96 x ........................
97 x 0.59 0.21........................
98 x ........................................................
99 x .. c ...................................................
100 x 0.44.. o ...................................................
101 x .......... d ...................................................
102 x .......... e ...................................................
103 X............. ...................................................
104 x.............. b ...................................................
1052.0x............... i ...................................................
106 #X................ t ...................................................
107 #................. s ........................... plain bits ..........
108-+-----------------------------------------------------------------------
109 1 16 32 64
110*/
111
112/* LEVEL: (total bits, prefix bits, prefix value),
113 * sorted ascending by number of total bits.
114 * The rest of the code table is calculated at compiletime from this. */
115
116/* fibonacci data 1, 1, ... */
117#define VLI_L_1_1() do { \
118 LEVEL( 2, 1, 0x00); \
119 LEVEL( 3, 2, 0x01); \
120 LEVEL( 5, 3, 0x03); \
121 LEVEL( 7, 4, 0x07); \
122 LEVEL(10, 5, 0x0f); \
123 LEVEL(14, 6, 0x1f); \
124 LEVEL(21, 8, 0x3f); \
125 LEVEL(29, 8, 0x7f); \
126 LEVEL(42, 8, 0xbf); \
127 LEVEL(64, 8, 0xff); \
128 } while (0)
129
130/* finds a suitable level to decode the least significant part of in.
131 * returns number of bits consumed.
132 *
133 * BUG() for bad input, as that would mean a buggy code table. */
134static inline int vli_decode_bits(u64 *out, const u64 in)
135{
136 u64 adj = 1;
137
138#define LEVEL(t,b,v) \
139 do { \
140 if ((in & ((1 << b) -1)) == v) { \
141 *out = ((in & ((~0ULL) >> (64-t))) >> b) + adj; \
142 return t; \
143 } \
144 adj += 1ULL << (t - b); \
145 } while (0)
146
147 VLI_L_1_1();
148
149 /* NOT REACHED, if VLI_LEVELS code table is defined properly */
150 BUG();
151#undef LEVEL
152}
153
154/* return number of code bits needed,
155 * or negative error number */
156static inline int __vli_encode_bits(u64 *out, const u64 in)
157{
158 u64 max = 0;
159 u64 adj = 1;
160
161 if (in == 0)
162 return -EINVAL;
163
164#define LEVEL(t,b,v) do { \
165 max += 1ULL << (t - b); \
166 if (in <= max) { \
167 if (out) \
168 *out = ((in - adj) << b) | v; \
169 return t; \
170 } \
171 adj = max + 1; \
172 } while (0)
173
174 VLI_L_1_1();
175
176 return -EOVERFLOW;
177#undef LEVEL
178}
179
180#undef VLI_L_1_1
181
182/* code from here down is independend of actually used bit code */
183
184/*
185 * Code length is determined by some unique (e.g. unary) prefix.
186 * This encodes arbitrary bit length, not whole bytes: we have a bit-stream,
187 * not a byte stream.
188 */
189
190/* for the bitstream, we need a cursor */
191struct bitstream_cursor {
192 /* the current byte */
193 u8 *b;
194 /* the current bit within *b, nomalized: 0..7 */
195 unsigned int bit;
196};
197
198/* initialize cursor to point to first bit of stream */
199static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s)
200{
201 cur->b = s;
202 cur->bit = 0;
203}
204
205/* advance cursor by that many bits; maximum expected input value: 64,
206 * but depending on VLI implementation, it may be more. */
207static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits)
208{
209 bits += cur->bit;
210 cur->b = cur->b + (bits >> 3);
211 cur->bit = bits & 7;
212}
213
214/* the bitstream itself knows its length */
215struct bitstream {
216 struct bitstream_cursor cur;
217 unsigned char *buf;
218 size_t buf_len; /* in bytes */
219
220 /* for input stream:
221 * number of trailing 0 bits for padding
222 * total number of valid bits in stream: buf_len * 8 - pad_bits */
223 unsigned int pad_bits;
224};
225
226static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits)
227{
228 bs->buf = s;
229 bs->buf_len = len;
230 bs->pad_bits = pad_bits;
231 bitstream_cursor_reset(cur: &bs->cur, s: bs->buf);
232}
233
234static inline void bitstream_rewind(struct bitstream *bs)
235{
236 bitstream_cursor_reset(cur: &bs->cur, s: bs->buf);
237 memset(bs->buf, 0, bs->buf_len);
238}
239
240/* Put (at most 64) least significant bits of val into bitstream, and advance cursor.
241 * Ignores "pad_bits".
242 * Returns zero if bits == 0 (nothing to do).
243 * Returns number of bits used if successful.
244 *
245 * If there is not enough room left in bitstream,
246 * leaves bitstream unchanged and returns -ENOBUFS.
247 */
248static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits)
249{
250 unsigned char *b = bs->cur.b;
251 unsigned int tmp;
252
253 if (bits == 0)
254 return 0;
255
256 if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len)
257 return -ENOBUFS;
258
259 /* paranoia: strip off hi bits; they should not be set anyways. */
260 if (bits < 64)
261 val &= ~0ULL >> (64 - bits);
262
263 *b++ |= (val & 0xff) << bs->cur.bit;
264
265 for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8)
266 *b++ |= (val >> tmp) & 0xff;
267
268 bitstream_cursor_advance(cur: &bs->cur, bits);
269 return bits;
270}
271
272/* Fetch (at most 64) bits from bitstream into *out, and advance cursor.
273 *
274 * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged.
275 *
276 * If there are less than the requested number of valid bits left in the
277 * bitstream, still fetches all available bits.
278 *
279 * Returns number of actually fetched bits.
280 */
281static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits)
282{
283 u64 val;
284 unsigned int n;
285
286 if (bits > 64)
287 return -EINVAL;
288
289 if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len)
290 bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3)
291 - bs->cur.bit - bs->pad_bits;
292
293 if (bits == 0) {
294 *out = 0;
295 return 0;
296 }
297
298 /* get the high bits */
299 val = 0;
300 n = (bs->cur.bit + bits + 7) >> 3;
301 /* n may be at most 9, if cur.bit + bits > 64 */
302 /* which means this copies at most 8 byte */
303 if (n) {
304 memcpy(&val, bs->cur.b+1, n - 1);
305 val = le64_to_cpu(val) << (8 - bs->cur.bit);
306 }
307
308 /* we still need the low bits */
309 val |= bs->cur.b[0] >> bs->cur.bit;
310
311 /* and mask out bits we don't want */
312 val &= ~0ULL >> (64 - bits);
313
314 bitstream_cursor_advance(cur: &bs->cur, bits);
315 *out = val;
316
317 return bits;
318}
319
320/* encodes @in as vli into @bs;
321
322 * return values
323 * > 0: number of bits successfully stored in bitstream
324 * -ENOBUFS @bs is full
325 * -EINVAL input zero (invalid)
326 * -EOVERFLOW input too large for this vli code (invalid)
327 */
328static inline int vli_encode_bits(struct bitstream *bs, u64 in)
329{
330 u64 code;
331 int bits = __vli_encode_bits(out: &code, in);
332
333 if (bits <= 0)
334 return bits;
335
336 return bitstream_put_bits(bs, val: code, bits);
337}
338
339#endif
340

source code of linux/drivers/block/drbd/drbd_vli.h