linux/include/linux/crush/crush.h
<<
>>
Prefs
   1#ifndef CEPH_CRUSH_CRUSH_H
   2#define CEPH_CRUSH_CRUSH_H
   3
   4#ifdef __KERNEL__
   5# include <linux/types.h>
   6#else
   7# include "crush_compat.h"
   8#endif
   9
  10/*
  11 * CRUSH is a pseudo-random data distribution algorithm that
  12 * efficiently distributes input values (typically, data objects)
  13 * across a heterogeneous, structured storage cluster.
  14 *
  15 * The algorithm was originally described in detail in this paper
  16 * (although the algorithm has evolved somewhat since then):
  17 *
  18 *     http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
  19 *
  20 * LGPL2
  21 */
  22
  23
  24#define CRUSH_MAGIC 0x00010000ul   /* for detecting algorithm revisions */
  25
  26#define CRUSH_MAX_DEPTH 10  /* max crush hierarchy depth */
  27#define CRUSH_MAX_RULESET (1<<8)  /* max crush ruleset number */
  28#define CRUSH_MAX_RULES CRUSH_MAX_RULESET  /* should be the same as max rulesets */
  29
  30#define CRUSH_MAX_DEVICE_WEIGHT (100u * 0x10000u)
  31#define CRUSH_MAX_BUCKET_WEIGHT (65535u * 0x10000u)
  32
  33#define CRUSH_ITEM_UNDEF  0x7ffffffe  /* undefined result (internal use only) */
  34#define CRUSH_ITEM_NONE   0x7fffffff  /* no result */
  35
  36/*
  37 * CRUSH uses user-defined "rules" to describe how inputs should be
  38 * mapped to devices.  A rule consists of sequence of steps to perform
  39 * to generate the set of output devices.
  40 */
  41struct crush_rule_step {
  42        __u32 op;
  43        __s32 arg1;
  44        __s32 arg2;
  45};
  46
  47/* step op codes */
  48enum {
  49        CRUSH_RULE_NOOP = 0,
  50        CRUSH_RULE_TAKE = 1,          /* arg1 = value to start with */
  51        CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
  52                                      /* arg2 = type */
  53        CRUSH_RULE_CHOOSE_INDEP = 3,  /* same */
  54        CRUSH_RULE_EMIT = 4,          /* no args */
  55        CRUSH_RULE_CHOOSELEAF_FIRSTN = 6,
  56        CRUSH_RULE_CHOOSELEAF_INDEP = 7,
  57
  58        CRUSH_RULE_SET_CHOOSE_TRIES = 8, /* override choose_total_tries */
  59        CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */
  60        CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10,
  61        CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11,
  62        CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12,
  63        CRUSH_RULE_SET_CHOOSELEAF_STABLE = 13
  64};
  65
  66/*
  67 * for specifying choose num (arg1) relative to the max parameter
  68 * passed to do_rule
  69 */
  70#define CRUSH_CHOOSE_N            0
  71#define CRUSH_CHOOSE_N_MINUS(x)   (-(x))
  72
  73/*
  74 * The rule mask is used to describe what the rule is intended for.
  75 * Given a ruleset and size of output set, we search through the
  76 * rule list for a matching rule_mask.
  77 */
  78struct crush_rule_mask {
  79        __u8 ruleset;
  80        __u8 type;
  81        __u8 min_size;
  82        __u8 max_size;
  83};
  84
  85struct crush_rule {
  86        __u32 len;
  87        struct crush_rule_mask mask;
  88        struct crush_rule_step steps[0];
  89};
  90
  91#define crush_rule_size(len) (sizeof(struct crush_rule) + \
  92                              (len)*sizeof(struct crush_rule_step))
  93
  94
  95
  96/*
  97 * A bucket is a named container of other items (either devices or
  98 * other buckets).  Items within a bucket are chosen using one of a
  99 * few different algorithms.  The table summarizes how the speed of
 100 * each option measures up against mapping stability when items are
 101 * added or removed.
 102 *
 103 *  Bucket Alg     Speed       Additions    Removals
 104 *  ------------------------------------------------
 105 *  uniform         O(1)       poor         poor
 106 *  list            O(n)       optimal      poor
 107 *  tree            O(log n)   good         good
 108 *  straw           O(n)       better       better
 109 *  straw2          O(n)       optimal      optimal
 110 */
 111enum {
 112        CRUSH_BUCKET_UNIFORM = 1,
 113        CRUSH_BUCKET_LIST = 2,
 114        CRUSH_BUCKET_TREE = 3,
 115        CRUSH_BUCKET_STRAW = 4,
 116        CRUSH_BUCKET_STRAW2 = 5,
 117};
 118extern const char *crush_bucket_alg_name(int alg);
 119
 120/*
 121 * although tree was a legacy algorithm, it has been buggy, so
 122 * exclude it.
 123 */
 124#define CRUSH_LEGACY_ALLOWED_BUCKET_ALGS (      \
 125                (1 << CRUSH_BUCKET_UNIFORM) |   \
 126                (1 << CRUSH_BUCKET_LIST) |      \
 127                (1 << CRUSH_BUCKET_STRAW))
 128
 129struct crush_bucket {
 130        __s32 id;        /* this'll be negative */
 131        __u16 type;      /* non-zero; type=0 is reserved for devices */
 132        __u8 alg;        /* one of CRUSH_BUCKET_* */
 133        __u8 hash;       /* which hash function to use, CRUSH_HASH_* */
 134        __u32 weight;    /* 16-bit fixed point */
 135        __u32 size;      /* num items */
 136        __s32 *items;
 137
 138        /*
 139         * cached random permutation: used for uniform bucket and for
 140         * the linear search fallback for the other bucket types.
 141         */
 142        __u32 perm_x;  /* @x for which *perm is defined */
 143        __u32 perm_n;  /* num elements of *perm that are permuted/defined */
 144        __u32 *perm;
 145};
 146
 147struct crush_bucket_uniform {
 148        struct crush_bucket h;
 149        __u32 item_weight;  /* 16-bit fixed point; all items equally weighted */
 150};
 151
 152struct crush_bucket_list {
 153        struct crush_bucket h;
 154        __u32 *item_weights;  /* 16-bit fixed point */
 155        __u32 *sum_weights;   /* 16-bit fixed point.  element i is sum
 156                                 of weights 0..i, inclusive */
 157};
 158
 159struct crush_bucket_tree {
 160        struct crush_bucket h;  /* note: h.size is _tree_ size, not number of
 161                                   actual items */
 162        __u8 num_nodes;
 163        __u32 *node_weights;
 164};
 165
 166struct crush_bucket_straw {
 167        struct crush_bucket h;
 168        __u32 *item_weights;   /* 16-bit fixed point */
 169        __u32 *straws;         /* 16-bit fixed point */
 170};
 171
 172struct crush_bucket_straw2 {
 173        struct crush_bucket h;
 174        __u32 *item_weights;   /* 16-bit fixed point */
 175};
 176
 177
 178
 179/*
 180 * CRUSH map includes all buckets, rules, etc.
 181 */
 182struct crush_map {
 183        struct crush_bucket **buckets;
 184        struct crush_rule **rules;
 185
 186        __s32 max_buckets;
 187        __u32 max_rules;
 188        __s32 max_devices;
 189
 190        /* choose local retries before re-descent */
 191        __u32 choose_local_tries;
 192        /* choose local attempts using a fallback permutation before
 193         * re-descent */
 194        __u32 choose_local_fallback_tries;
 195        /* choose attempts before giving up */
 196        __u32 choose_total_tries;
 197        /* attempt chooseleaf inner descent once for firstn mode; on
 198         * reject retry outer descent.  Note that this does *not*
 199         * apply to a collision: in that case we will retry as we used
 200         * to. */
 201        __u32 chooseleaf_descend_once;
 202
 203        /* if non-zero, feed r into chooseleaf, bit-shifted right by (r-1)
 204         * bits.  a value of 1 is best for new clusters.  for legacy clusters
 205         * that want to limit reshuffling, a value of 3 or 4 will make the
 206         * mappings line up a bit better with previous mappings. */
 207        __u8 chooseleaf_vary_r;
 208
 209        /* if true, it makes chooseleaf firstn to return stable results (if
 210         * no local retry) so that data migrations would be optimal when some
 211         * device fails. */
 212        __u8 chooseleaf_stable;
 213
 214#ifndef __KERNEL__
 215        /*
 216         * version 0 (original) of straw_calc has various flaws.  version 1
 217         * fixes a few of them.
 218         */
 219        __u8 straw_calc_version;
 220
 221        /*
 222         * allowed bucket algs is a bitmask, here the bit positions
 223         * are CRUSH_BUCKET_*.  note that these are *bits* and
 224         * CRUSH_BUCKET_* values are not, so we need to or together (1
 225         * << CRUSH_BUCKET_WHATEVER).  The 0th bit is not used to
 226         * minimize confusion (bucket type values start at 1).
 227         */
 228        __u32 allowed_bucket_algs;
 229
 230        __u32 *choose_tries;
 231#endif
 232};
 233
 234
 235/* crush.c */
 236extern int crush_get_bucket_item_weight(const struct crush_bucket *b, int pos);
 237extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
 238extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
 239extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
 240extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
 241extern void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b);
 242extern void crush_destroy_bucket(struct crush_bucket *b);
 243extern void crush_destroy_rule(struct crush_rule *r);
 244extern void crush_destroy(struct crush_map *map);
 245
 246static inline int crush_calc_tree_node(int i)
 247{
 248        return ((i+1) << 1)-1;
 249}
 250
 251#endif
 252