LXR linux/drivers/block/drbd/drbd

   1/*
   2-*- linux-c -*-
   3   drbd_receiver.c
   4   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
   5
   6   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   7   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   8   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
   9
  10   drbd is free software; you can redistribute it and/or modify
  11   it under the terms of the GNU General Public License as published by
  12   the Free Software Foundation; either version 2, or (at your option)
  13   any later version.
  14
  15   drbd is distributed in the hope that it will be useful,
  16   but WITHOUT ANY WARRANTY; without even the implied warranty of
  17   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18   GNU General Public License for more details.
  19
  20   You should have received a copy of the GNU General Public License
  21   along with drbd; see the file COPYING.  If not, write to
  22   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  23 */
  24
  25#ifndef _DRBD_VLI_H
  26#define _DRBD_VLI_H
  27
  28/*
  29 * At a granularity of 4KiB storage represented per bit,
  30 * and stroage sizes of several TiB,
  31 * and possibly small-bandwidth replication,
  32 * the bitmap transfer time can take much too long,
  33 * if transmitted in plain text.
  34 *
  35 * We try to reduce the transferred bitmap information
  36 * by encoding runlengths of bit polarity.
  37 *
  38 * We never actually need to encode a "zero" (runlengths are positive).
  39 * But then we have to store the value of the first bit.
  40 * The first bit of information thus shall encode if the first runlength
  41 * gives the number of set or unset bits.
  42 *
  43 * We assume that large areas are either completely set or unset,
  44 * which gives good compression with any runlength method,
  45 * even when encoding the runlength as fixed size 32bit/64bit integers.
  46 *
  47 * Still, there may be areas where the polarity flips every few bits,
  48 * and encoding the runlength sequence of those areas with fix size
  49 * integers would be much worse than plaintext.
  50 *
  51 * We want to encode small runlength values with minimum code length,
  52 * while still being able to encode a Huge run of all zeros.
  53 *
  54 * Thus we need a Variable Length Integer encoding, VLI.
  55 *
  56 * For some cases, we produce more code bits than plaintext input.
  57 * We need to send incompressible chunks as plaintext, skip over them
  58 * and then see if the next chunk compresses better.
  59 *
  60 * We don't care too much about "excellent" compression ratio for large
  61 * runlengths (all set/all clear): whether we achieve a factor of 100
  62 * or 1000 is not that much of an issue.
  63 * We do not want to waste too much on short runlengths in the "noisy"
  64 * parts of the bitmap, though.
  65 *
  66 * There are endless variants of VLI, we experimented with:
  67 *  * simple byte-based
  68 *  * various bit based with different code word length.
  69 *
  70 * To avoid yet an other configuration parameter (choice of bitmap compression
  71 * algorithm) which was difficult to explain and tune, we just chose the one
  72 * variant that turned out best in all test cases.
  73 * Based on real world usage patterns, with device sizes ranging from a few GiB
  74 * to several TiB, file server/mailserver/webserver/mysql/postgress,
  75 * mostly idle to really busy, the all time winner (though sometimes only
  76 * marginally better) is:
  77 */
  78
  79/*
  80 * encoding is "visualised" as
  81 * __little endian__ bitstream, least significant bit first (left most)
  82 *
  83 * this particular encoding is chosen so that the prefix code
  84 * starts as unary encoding the level, then modified so that
  85 * 10 levels can be described in 8bit, with minimal overhead
  86 * for the smaller levels.
  87 *
  88 * Number of data bits follow fibonacci sequence, with the exception of the
  89 * last level (+1 data bit, so it makes 64bit total).  The only worse code when
  90 * encoding bit polarity runlength is 1 plain bits => 2 code bits.
  91prefix    data bits                                    max val  Nº data bits
  920 x                                                         0x2            1
  9310 x                                                        0x4            1
  94110 xx                                                      0x8            2
  951110 xxx                                                   0x10            3
  9611110 xxx xx                                               0x30            5
  97111110 xx xxxxxx                                          0x130            8
  9811111100  xxxxxxxx xxxxx                                 0x2130           13
  9911111110  xxxxxxxx xxxxxxxx xxxxx                      0x202130           21
 10011111101  xxxxxxxx xxxxxxxx xxxxxxxx  xxxxxxxx xx   0x400202130           34
 10111111111  xxxxxxxx xxxxxxxx xxxxxxxx  xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56
 102 * maximum encodable value: 0x100000400202130 == 2**56 + some */
 103
 104/* compression "table":
 105 transmitted   x                                0.29
 106 as plaintext x                                  ........................
 107             x                                   ........................
 108            x                                    ........................
 109           x    0.59                         0.21........................
 110          x      ........................................................
 111         x       .. c ...................................................
 112        x    0.44.. o ...................................................
 113       x .......... d ...................................................
 114      x  .......... e ...................................................
 115     X.............   ...................................................
 116    x.............. b ...................................................
 1172.0x............... i ...................................................
 118 #X................ t ...................................................
 119 #................. s ...........................  plain bits  ..........
 120-+-----------------------------------------------------------------------
 121 1             16              32                              64
 122*/
 123
 124/* LEVEL: (total bits, prefix bits, prefix value),
 125 * sorted ascending by number of total bits.
 126 * The rest of the code table is calculated at compiletime from this. */
 127
 128/* fibonacci data 1, 1, ... */
 129#define VLI_L_1_1() do { \
 130        LEVEL( 2, 1, 0x00); \
 131        LEVEL( 3, 2, 0x01); \
 132        LEVEL( 5, 3, 0x03); \
 133        LEVEL( 7, 4, 0x07); \
 134        LEVEL(10, 5, 0x0f); \
 135        LEVEL(14, 6, 0x1f); \
 136        LEVEL(21, 8, 0x3f); \
 137        LEVEL(29, 8, 0x7f); \
 138        LEVEL(42, 8, 0xbf); \
 139        LEVEL(64, 8, 0xff); \
 140        } while (0)
 141
 142/* finds a suitable level to decode the least significant part of in.
 143 * returns number of bits consumed.
 144 *
 145 * BUG() for bad input, as that would mean a buggy code table. */
 146static inline int vli_decode_bits(u64 *out, const u64 in)
 147{
 148        u64 adj = 1;
 149
 150#define LEVEL(t,b,v)                                    \
 151        do {                                            \
 152                if ((in & ((1 << b) -1)) == v) {        \
 153                        *out = ((in & ((~0ULL) >> (64-t))) >> b) + adj; \
 154                        return t;                       \
 155                }                                       \
 156                adj += 1ULL << (t - b);                 \
 157        } while (0)
 158
 159        VLI_L_1_1();
 160
 161        /* NOT REACHED, if VLI_LEVELS code table is defined properly */
 162        BUG();
 163#undef LEVEL
 164}
 165
 166/* return number of code bits needed,
 167 * or negative error number */
 168static inline int __vli_encode_bits(u64 *out, const u64 in)
 169{
 170        u64 max = 0;
 171        u64 adj = 1;
 172
 173        if (in == 0)
 174                return -EINVAL;
 175
 176#define LEVEL(t,b,v) do {               \
 177                max += 1ULL << (t - b); \
 178                if (in <= max) {        \
 179                        if (out)        \
 180                                *out = ((in - adj) << b) | v;   \
 181                        return t;       \
 182                }                       \
 183                adj = max + 1;          \
 184        } while (0)
 185
 186        VLI_L_1_1();
 187
 188        return -EOVERFLOW;
 189#undef LEVEL
 190}
 191
 192#undef VLI_L_1_1
 193
 194/* code from here down is independend of actually used bit code */
 195
 196/*
 197 * Code length is determined by some unique (e.g. unary) prefix.
 198 * This encodes arbitrary bit length, not whole bytes: we have a bit-stream,
 199 * not a byte stream.
 200 */
 201
 202/* for the bitstream, we need a cursor */
 203struct bitstream_cursor {
 204        /* the current byte */
 205        u8 *b;
 206        /* the current bit within *b, nomalized: 0..7 */
 207        unsigned int bit;
 208};
 209
 210/* initialize cursor to point to first bit of stream */
 211static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s)
 212{
 213        cur->b = s;
 214        cur->bit = 0;
 215}
 216
 217/* advance cursor by that many bits; maximum expected input value: 64,
 218 * but depending on VLI implementation, it may be more. */
 219static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits)
 220{
 221        bits += cur->bit;
 222        cur->b = cur->b + (bits >> 3);
 223        cur->bit = bits & 7;
 224}
 225
 226/* the bitstream itself knows its length */
 227struct bitstream {
 228        struct bitstream_cursor cur;
 229        unsigned char *buf;
 230        size_t buf_len;         /* in bytes */
 231
 232        /* for input stream:
 233         * number of trailing 0 bits for padding
 234         * total number of valid bits in stream: buf_len * 8 - pad_bits */
 235        unsigned int pad_bits;
 236};
 237
 238static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits)
 239{
 240        bs->buf = s;
 241        bs->buf_len = len;
 242        bs->pad_bits = pad_bits;
 243        bitstream_cursor_reset(&bs->cur, bs->buf);
 244}
 245
 246static inline void bitstream_rewind(struct bitstream *bs)
 247{
 248        bitstream_cursor_reset(&bs->cur, bs->buf);
 249        memset(bs->buf, 0, bs->buf_len);
 250}
 251
 252/* Put (at most 64) least significant bits of val into bitstream, and advance cursor.
 253 * Ignores "pad_bits".
 254 * Returns zero if bits == 0 (nothing to do).
 255 * Returns number of bits used if successful.
 256 *
 257 * If there is not enough room left in bitstream,
 258 * leaves bitstream unchanged and returns -ENOBUFS.
 259 */
 260static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits)
 261{
 262        unsigned char *b = bs->cur.b;
 263        unsigned int tmp;
 264
 265        if (bits == 0)
 266                return 0;
 267
 268        if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len)
 269                return -ENOBUFS;
 270
 271        /* paranoia: strip off hi bits; they should not be set anyways. */
 272        if (bits < 64)
 273                val &= ~0ULL >> (64 - bits);
 274
 275        *b++ |= (val & 0xff) << bs->cur.bit;
 276
 277        for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8)
 278                *b++ |= (val >> tmp) & 0xff;
 279
 280        bitstream_cursor_advance(&bs->cur, bits);
 281        return bits;
 282}
 283
 284/* Fetch (at most 64) bits from bitstream into *out, and advance cursor.
 285 *
 286 * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged.
 287 *
 288 * If there are less than the requested number of valid bits left in the
 289 * bitstream, still fetches all available bits.
 290 *
 291 * Returns number of actually fetched bits.
 292 */
 293static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits)
 294{
 295        u64 val;
 296        unsigned int n;
 297
 298        if (bits > 64)
 299                return -EINVAL;
 300
 301        if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len)
 302                bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3)
 303                        - bs->cur.bit - bs->pad_bits;
 304
 305        if (bits == 0) {
 306                *out = 0;
 307                return 0;
 308        }
 309
 310        /* get the high bits */
 311        val = 0;
 312        n = (bs->cur.bit + bits + 7) >> 3;
 313        /* n may be at most 9, if cur.bit + bits > 64 */
 314        /* which means this copies at most 8 byte */
 315        if (n) {
 316                memcpy(&val, bs->cur.b+1, n - 1);
 317                val = le64_to_cpu(val) << (8 - bs->cur.bit);
 318        }
 319
 320        /* we still need the low bits */
 321        val |= bs->cur.b[0] >> bs->cur.bit;
 322
 323        /* and mask out bits we don't want */
 324        val &= ~0ULL >> (64 - bits);
 325
 326        bitstream_cursor_advance(&bs->cur, bits);
 327        *out = val;
 328
 329        return bits;
 330}
 331
 332/* encodes @in as vli into @bs;
 333
 334 * return values
 335 *  > 0: number of bits successfully stored in bitstream
 336 * -ENOBUFS @bs is full
 337 * -EINVAL input zero (invalid)
 338 * -EOVERFLOW input too large for this vli code (invalid)
 339 */
 340static inline int vli_encode_bits(struct bitstream *bs, u64 in)
 341{
 342        u64 code = code;
 343        int bits = __vli_encode_bits(&code, in);
 344
 345        if (bits <= 0)
 346                return bits;
 347
 348        return bitstream_put_bits(bs, code, bits);
 349}
 350
 351#endif
 352