1/* 2-*- linux-c -*- 3 drbd_receiver.c 4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg. 5 6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. 7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. 8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. 9 10 drbd is free software; you can redistribute it and/or modify 11 it under the terms of the GNU General Public License as published by 12 the Free Software Foundation; either version 2, or (at your option) 13 any later version. 14 15 drbd is distributed in the hope that it will be useful, 16 but WITHOUT ANY WARRANTY; without even the implied warranty of 17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 GNU General Public License for more details. 19 20 You should have received a copy of the GNU General Public License 21 along with drbd; see the file COPYING. If not, write to 22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 23 */ 24 25#ifndef _DRBD_VLI_H 26#define _DRBD_VLI_H 27 28/* 29 * At a granularity of 4KiB storage represented per bit, 30 * and stroage sizes of several TiB, 31 * and possibly small-bandwidth replication, 32 * the bitmap transfer time can take much too long, 33 * if transmitted in plain text. 34 * 35 * We try to reduce the transferred bitmap information 36 * by encoding runlengths of bit polarity. 37 * 38 * We never actually need to encode a "zero" (runlengths are positive). 39 * But then we have to store the value of the first bit. 40 * The first bit of information thus shall encode if the first runlength 41 * gives the number of set or unset bits. 42 * 43 * We assume that large areas are either completely set or unset, 44 * which gives good compression with any runlength method, 45 * even when encoding the runlength as fixed size 32bit/64bit integers. 46 * 47 * Still, there may be areas where the polarity flips every few bits, 48 * and encoding the runlength sequence of those areas with fix size 49 * integers would be much worse than plaintext. 50 * 51 * We want to encode small runlength values with minimum code length, 52 * while still being able to encode a Huge run of all zeros. 53 * 54 * Thus we need a Variable Length Integer encoding, VLI. 55 * 56 * For some cases, we produce more code bits than plaintext input. 57 * We need to send incompressible chunks as plaintext, skip over them 58 * and then see if the next chunk compresses better. 59 * 60 * We don't care too much about "excellent" compression ratio for large 61 * runlengths (all set/all clear): whether we achieve a factor of 100 62 * or 1000 is not that much of an issue. 63 * We do not want to waste too much on short runlengths in the "noisy" 64 * parts of the bitmap, though. 65 * 66 * There are endless variants of VLI, we experimented with: 67 * * simple byte-based 68 * * various bit based with different code word length. 69 * 70 * To avoid yet an other configuration parameter (choice of bitmap compression 71 * algorithm) which was difficult to explain and tune, we just chose the one 72 * variant that turned out best in all test cases. 73 * Based on real world usage patterns, with device sizes ranging from a few GiB 74 * to several TiB, file server/mailserver/webserver/mysql/postgress, 75 * mostly idle to really busy, the all time winner (though sometimes only 76 * marginally better) is: 77 */ 78 79/* 80 * encoding is "visualised" as 81 * __little endian__ bitstream, least significant bit first (left most) 82 * 83 * this particular encoding is chosen so that the prefix code 84 * starts as unary encoding the level, then modified so that 85 * 10 levels can be described in 8bit, with minimal overhead 86 * for the smaller levels. 87 * 88 * Number of data bits follow fibonacci sequence, with the exception of the 89 * last level (+1 data bit, so it makes 64bit total). The only worse code when 90 * encoding bit polarity runlength is 1 plain bits => 2 code bits. 91prefix data bits max val NÂș data bits 920 x 0x2 1 9310 x 0x4 1 94110 xx 0x8 2 951110 xxx 0x10 3 9611110 xxx xx 0x30 5 97111110 xx xxxxxx 0x130 8 9811111100 xxxxxxxx xxxxx 0x2130 13 9911111110 xxxxxxxx xxxxxxxx xxxxx 0x202130 21 10011111101 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xx 0x400202130 34 10111111111 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56 102 * maximum encodable value: 0x100000400202130 == 2**56 + some */ 103 104/* compression "table": 105 transmitted x 0.29 106 as plaintext x ........................ 107 x ........................ 108 x ........................ 109 x 0.59 0.21........................ 110 x ........................................................ 111 x .. c ................................................... 112 x 0.44.. o ................................................... 113 x .......... d ................................................... 114 x .......... e ................................................... 115 X............. ................................................... 116 x.............. b ................................................... 1172.0x............... i ................................................... 118 #X................ t ................................................... 119 #................. s ........................... plain bits .......... 120-+----------------------------------------------------------------------- 121 1 16 32 64 122*/ 123 124/* LEVEL: (total bits, prefix bits, prefix value), 125 * sorted ascending by number of total bits. 126 * The rest of the code table is calculated at compiletime from this. */ 127 128/* fibonacci data 1, 1, ... */ 129#define VLI_L_1_1() do { \ 130 LEVEL( 2, 1, 0x00); \ 131 LEVEL( 3, 2, 0x01); \ 132 LEVEL( 5, 3, 0x03); \ 133 LEVEL( 7, 4, 0x07); \ 134 LEVEL(10, 5, 0x0f); \ 135 LEVEL(14, 6, 0x1f); \ 136 LEVEL(21, 8, 0x3f); \ 137 LEVEL(29, 8, 0x7f); \ 138 LEVEL(42, 8, 0xbf); \ 139 LEVEL(64, 8, 0xff); \ 140 } while (0) 141 142/* finds a suitable level to decode the least significant part of in. 143 * returns number of bits consumed. 144 * 145 * BUG() for bad input, as that would mean a buggy code table. */ 146static inline int vli_decode_bits(u64 *out, const u64 in) 147{ 148 u64 adj = 1; 149 150#define LEVEL(t,b,v) \ 151 do { \ 152 if ((in & ((1 << b) -1)) == v) { \ 153 *out = ((in & ((~0ULL) >> (64-t))) >> b) + adj; \ 154 return t; \ 155 } \ 156 adj += 1ULL << (t - b); \ 157 } while (0) 158 159 VLI_L_1_1(); 160 161 /* NOT REACHED, if VLI_LEVELS code table is defined properly */ 162 BUG(); 163#undef LEVEL 164} 165 166/* return number of code bits needed, 167 * or negative error number */ 168static inline int __vli_encode_bits(u64 *out, const u64 in) 169{ 170 u64 max = 0; 171 u64 adj = 1; 172 173 if (in == 0) 174 return -EINVAL; 175 176#define LEVEL(t,b,v) do { \ 177 max += 1ULL << (t - b); \ 178 if (in <= max) { \ 179 if (out) \ 180 *out = ((in - adj) << b) | v; \ 181 return t; \ 182 } \ 183 adj = max + 1; \ 184 } while (0) 185 186 VLI_L_1_1(); 187 188 return -EOVERFLOW; 189#undef LEVEL 190} 191 192#undef VLI_L_1_1 193 194/* code from here down is independend of actually used bit code */ 195 196/* 197 * Code length is determined by some unique (e.g. unary) prefix. 198 * This encodes arbitrary bit length, not whole bytes: we have a bit-stream, 199 * not a byte stream. 200 */ 201 202/* for the bitstream, we need a cursor */ 203struct bitstream_cursor { 204 /* the current byte */ 205 u8 *b; 206 /* the current bit within *b, nomalized: 0..7 */ 207 unsigned int bit; 208}; 209 210/* initialize cursor to point to first bit of stream */ 211static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s) 212{ 213 cur->b = s; 214 cur->bit = 0; 215} 216 217/* advance cursor by that many bits; maximum expected input value: 64, 218 * but depending on VLI implementation, it may be more. */ 219static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits) 220{ 221 bits += cur->bit; 222 cur->b = cur->b + (bits >> 3); 223 cur->bit = bits & 7; 224} 225 226/* the bitstream itself knows its length */ 227struct bitstream { 228 struct bitstream_cursor cur; 229 unsigned char *buf; 230 size_t buf_len; /* in bytes */ 231 232 /* for input stream: 233 * number of trailing 0 bits for padding 234 * total number of valid bits in stream: buf_len * 8 - pad_bits */ 235 unsigned int pad_bits; 236}; 237 238static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits) 239{ 240 bs->buf = s; 241 bs->buf_len = len; 242 bs->pad_bits = pad_bits; 243 bitstream_cursor_reset(&bs->cur, bs->buf); 244} 245 246static inline void bitstream_rewind(struct bitstream *bs) 247{ 248 bitstream_cursor_reset(&bs->cur, bs->buf); 249 memset(bs->buf, 0, bs->buf_len); 250} 251 252/* Put (at most 64) least significant bits of val into bitstream, and advance cursor. 253 * Ignores "pad_bits". 254 * Returns zero if bits == 0 (nothing to do). 255 * Returns number of bits used if successful. 256 * 257 * If there is not enough room left in bitstream, 258 * leaves bitstream unchanged and returns -ENOBUFS. 259 */ 260static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits) 261{ 262 unsigned char *b = bs->cur.b; 263 unsigned int tmp; 264 265 if (bits == 0) 266 return 0; 267 268 if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len) 269 return -ENOBUFS; 270 271 /* paranoia: strip off hi bits; they should not be set anyways. */ 272 if (bits < 64) 273 val &= ~0ULL >> (64 - bits); 274 275 *b++ |= (val & 0xff) << bs->cur.bit; 276 277 for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8) 278 *b++ |= (val >> tmp) & 0xff; 279 280 bitstream_cursor_advance(&bs->cur, bits); 281 return bits; 282} 283 284/* Fetch (at most 64) bits from bitstream into *out, and advance cursor. 285 * 286 * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged. 287 * 288 * If there are less than the requested number of valid bits left in the 289 * bitstream, still fetches all available bits. 290 * 291 * Returns number of actually fetched bits. 292 */ 293static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits) 294{ 295 u64 val; 296 unsigned int n; 297 298 if (bits > 64) 299 return -EINVAL; 300 301 if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len) 302 bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3) 303 - bs->cur.bit - bs->pad_bits; 304 305 if (bits == 0) { 306 *out = 0; 307 return 0; 308 } 309 310 /* get the high bits */ 311 val = 0; 312 n = (bs->cur.bit + bits + 7) >> 3; 313 /* n may be at most 9, if cur.bit + bits > 64 */ 314 /* which means this copies at most 8 byte */ 315 if (n) { 316 memcpy(&val, bs->cur.b+1, n - 1); 317 val = le64_to_cpu(val) << (8 - bs->cur.bit); 318 } 319 320 /* we still need the low bits */ 321 val |= bs->cur.b[0] >> bs->cur.bit; 322 323 /* and mask out bits we don't want */ 324 val &= ~0ULL >> (64 - bits); 325 326 bitstream_cursor_advance(&bs->cur, bits); 327 *out = val; 328 329 return bits; 330} 331 332/* encodes @in as vli into @bs; 333 334 * return values 335 * > 0: number of bits successfully stored in bitstream 336 * -ENOBUFS @bs is full 337 * -EINVAL input zero (invalid) 338 * -EOVERFLOW input too large for this vli code (invalid) 339 */ 340static inline int vli_encode_bits(struct bitstream *bs, u64 in) 341{ 342 u64 code = code; 343 int bits = __vli_encode_bits(&code, in); 344 345 if (bits <= 0) 346 return bits; 347 348 return bitstream_put_bits(bs, code, bits); 349} 350 351#endif 352