linux/net/ceph/crush/mapper.c
<<
>>
Prefs
   1
   2#ifdef __KERNEL__
   3# include <linux/string.h>
   4# include <linux/slab.h>
   5# include <linux/bug.h>
   6# include <linux/kernel.h>
   7# ifndef dprintk
   8#  define dprintk(args...)
   9# endif
  10#else
  11# include <string.h>
  12# include <stdio.h>
  13# include <stdlib.h>
  14# include <assert.h>
  15# define BUG_ON(x) assert(!(x))
  16# define dprintk(args...) /* printf(args) */
  17# define kmalloc(x, f) malloc(x)
  18# define kfree(x) free(x)
  19#endif
  20
  21#include <linux/crush/crush.h>
  22#include <linux/crush/hash.h>
  23
  24/*
  25 * Implement the core CRUSH mapping algorithm.
  26 */
  27
  28/**
  29 * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
  30 * @map: the crush_map
  31 * @ruleset: the storage ruleset id (user defined)
  32 * @type: storage ruleset type (user defined)
  33 * @size: output set size
  34 */
  35int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
  36{
  37        int i;
  38
  39        for (i = 0; i < map->max_rules; i++) {
  40                if (map->rules[i] &&
  41                    map->rules[i]->mask.ruleset == ruleset &&
  42                    map->rules[i]->mask.type == type &&
  43                    map->rules[i]->mask.min_size <= size &&
  44                    map->rules[i]->mask.max_size >= size)
  45                        return i;
  46        }
  47        return -1;
  48}
  49
  50
  51/*
  52 * bucket choose methods
  53 *
  54 * For each bucket algorithm, we have a "choose" method that, given a
  55 * crush input @x and replica position (usually, position in output set) @r,
  56 * will produce an item in the bucket.
  57 */
  58
  59/*
  60 * Choose based on a random permutation of the bucket.
  61 *
  62 * We used to use some prime number arithmetic to do this, but it
  63 * wasn't very random, and had some other bad behaviors.  Instead, we
  64 * calculate an actual random permutation of the bucket members.
  65 * Since this is expensive, we optimize for the r=0 case, which
  66 * captures the vast majority of calls.
  67 */
  68static int bucket_perm_choose(struct crush_bucket *bucket,
  69                              int x, int r)
  70{
  71        unsigned pr = r % bucket->size;
  72        unsigned i, s;
  73
  74        /* start a new permutation if @x has changed */
  75        if (bucket->perm_x != x || bucket->perm_n == 0) {
  76                dprintk("bucket %d new x=%d\n", bucket->id, x);
  77                bucket->perm_x = x;
  78
  79                /* optimize common r=0 case */
  80                if (pr == 0) {
  81                        s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
  82                                bucket->size;
  83                        bucket->perm[0] = s;
  84                        bucket->perm_n = 0xffff;   /* magic value, see below */
  85                        goto out;
  86                }
  87
  88                for (i = 0; i < bucket->size; i++)
  89                        bucket->perm[i] = i;
  90                bucket->perm_n = 0;
  91        } else if (bucket->perm_n == 0xffff) {
  92                /* clean up after the r=0 case above */
  93                for (i = 1; i < bucket->size; i++)
  94                        bucket->perm[i] = i;
  95                bucket->perm[bucket->perm[0]] = 0;
  96                bucket->perm_n = 1;
  97        }
  98
  99        /* calculate permutation up to pr */
 100        for (i = 0; i < bucket->perm_n; i++)
 101                dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
 102        while (bucket->perm_n <= pr) {
 103                unsigned p = bucket->perm_n;
 104                /* no point in swapping the final entry */
 105                if (p < bucket->size - 1) {
 106                        i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
 107                                (bucket->size - p);
 108                        if (i) {
 109                                unsigned t = bucket->perm[p + i];
 110                                bucket->perm[p + i] = bucket->perm[p];
 111                                bucket->perm[p] = t;
 112                        }
 113                        dprintk(" perm_choose swap %d with %d\n", p, p+i);
 114                }
 115                bucket->perm_n++;
 116        }
 117        for (i = 0; i < bucket->size; i++)
 118                dprintk(" perm_choose  %d: %d\n", i, bucket->perm[i]);
 119
 120        s = bucket->perm[pr];
 121out:
 122        dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
 123                bucket->size, x, r, pr, s);
 124        return bucket->items[s];
 125}
 126
 127/* uniform */
 128static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
 129                                 int x, int r)
 130{
 131        return bucket_perm_choose(&bucket->h, x, r);
 132}
 133
 134/* list */
 135static int bucket_list_choose(struct crush_bucket_list *bucket,
 136                              int x, int r)
 137{
 138        int i;
 139
 140        for (i = bucket->h.size-1; i >= 0; i--) {
 141                __u64 w = crush_hash32_4(bucket->h.hash,x, bucket->h.items[i],
 142                                         r, bucket->h.id);
 143                w &= 0xffff;
 144                dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
 145                        "sw %x rand %llx",
 146                        i, x, r, bucket->h.items[i], bucket->item_weights[i],
 147                        bucket->sum_weights[i], w);
 148                w *= bucket->sum_weights[i];
 149                w = w >> 16;
 150                /*dprintk(" scaled %llx\n", w);*/
 151                if (w < bucket->item_weights[i])
 152                        return bucket->h.items[i];
 153        }
 154
 155        BUG_ON(1);
 156        return 0;
 157}
 158
 159
 160/* (binary) tree */
 161static int height(int n)
 162{
 163        int h = 0;
 164        while ((n & 1) == 0) {
 165                h++;
 166                n = n >> 1;
 167        }
 168        return h;
 169}
 170
 171static int left(int x)
 172{
 173        int h = height(x);
 174        return x - (1 << (h-1));
 175}
 176
 177static int right(int x)
 178{
 179        int h = height(x);
 180        return x + (1 << (h-1));
 181}
 182
 183static int terminal(int x)
 184{
 185        return x & 1;
 186}
 187
 188static int bucket_tree_choose(struct crush_bucket_tree *bucket,
 189                              int x, int r)
 190{
 191        int n, l;
 192        __u32 w;
 193        __u64 t;
 194
 195        /* start at root */
 196        n = bucket->num_nodes >> 1;
 197
 198        while (!terminal(n)) {
 199                /* pick point in [0, w) */
 200                w = bucket->node_weights[n];
 201                t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
 202                                          bucket->h.id) * (__u64)w;
 203                t = t >> 32;
 204
 205                /* descend to the left or right? */
 206                l = left(n);
 207                if (t < bucket->node_weights[l])
 208                        n = l;
 209                else
 210                        n = right(n);
 211        }
 212
 213        return bucket->h.items[n >> 1];
 214}
 215
 216
 217/* straw */
 218
 219static int bucket_straw_choose(struct crush_bucket_straw *bucket,
 220                               int x, int r)
 221{
 222        int i;
 223        int high = 0;
 224        __u64 high_draw = 0;
 225        __u64 draw;
 226
 227        for (i = 0; i < bucket->h.size; i++) {
 228                draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
 229                draw &= 0xffff;
 230                draw *= bucket->straws[i];
 231                if (i == 0 || draw > high_draw) {
 232                        high = i;
 233                        high_draw = draw;
 234                }
 235        }
 236        return bucket->h.items[high];
 237}
 238
 239static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
 240{
 241        dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
 242        switch (in->alg) {
 243        case CRUSH_BUCKET_UNIFORM:
 244                return bucket_uniform_choose((struct crush_bucket_uniform *)in,
 245                                          x, r);
 246        case CRUSH_BUCKET_LIST:
 247                return bucket_list_choose((struct crush_bucket_list *)in,
 248                                          x, r);
 249        case CRUSH_BUCKET_TREE:
 250                return bucket_tree_choose((struct crush_bucket_tree *)in,
 251                                          x, r);
 252        case CRUSH_BUCKET_STRAW:
 253                return bucket_straw_choose((struct crush_bucket_straw *)in,
 254                                           x, r);
 255        default:
 256                BUG_ON(1);
 257                return in->items[0];
 258        }
 259}
 260
 261/*
 262 * true if device is marked "out" (failed, fully offloaded)
 263 * of the cluster
 264 */
 265static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
 266{
 267        if (weight[item] >= 0x10000)
 268                return 0;
 269        if (weight[item] == 0)
 270                return 1;
 271        if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
 272            < weight[item])
 273                return 0;
 274        return 1;
 275}
 276
 277/**
 278 * crush_choose - choose numrep distinct items of given type
 279 * @map: the crush_map
 280 * @bucket: the bucket we are choose an item from
 281 * @x: crush input value
 282 * @numrep: the number of items to choose
 283 * @type: the type of item to choose
 284 * @out: pointer to output vector
 285 * @outpos: our position in that vector
 286 * @firstn: true if choosing "first n" items, false if choosing "indep"
 287 * @recurse_to_leaf: true if we want one device under each item of given type
 288 * @out2: second output vector for leaf items (if @recurse_to_leaf)
 289 */
 290static int crush_choose(struct crush_map *map,
 291                        struct crush_bucket *bucket,
 292                        __u32 *weight,
 293                        int x, int numrep, int type,
 294                        int *out, int outpos,
 295                        int firstn, int recurse_to_leaf,
 296                        int *out2)
 297{
 298        int rep;
 299        int ftotal, flocal;
 300        int retry_descent, retry_bucket, skip_rep;
 301        struct crush_bucket *in = bucket;
 302        int r;
 303        int i;
 304        int item = 0;
 305        int itemtype;
 306        int collide, reject;
 307        const int orig_tries = 5; /* attempts before we fall back to search */
 308
 309        dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
 310                bucket->id, x, outpos, numrep);
 311
 312        for (rep = outpos; rep < numrep; rep++) {
 313                /* keep trying until we get a non-out, non-colliding item */
 314                ftotal = 0;
 315                skip_rep = 0;
 316                do {
 317                        retry_descent = 0;
 318                        in = bucket;               /* initial bucket */
 319
 320                        /* choose through intervening buckets */
 321                        flocal = 0;
 322                        do {
 323                                collide = 0;
 324                                retry_bucket = 0;
 325                                r = rep;
 326                                if (in->alg == CRUSH_BUCKET_UNIFORM) {
 327                                        /* be careful */
 328                                        if (firstn || numrep >= in->size)
 329                                                /* r' = r + f_total */
 330                                                r += ftotal;
 331                                        else if (in->size % numrep == 0)
 332                                                /* r'=r+(n+1)*f_local */
 333                                                r += (numrep+1) *
 334                                                        (flocal+ftotal);
 335                                        else
 336                                                /* r' = r + n*f_local */
 337                                                r += numrep * (flocal+ftotal);
 338                                } else {
 339                                        if (firstn)
 340                                                /* r' = r + f_total */
 341                                                r += ftotal;
 342                                        else
 343                                                /* r' = r + n*f_local */
 344                                                r += numrep * (flocal+ftotal);
 345                                }
 346
 347                                /* bucket choose */
 348                                if (in->size == 0) {
 349                                        reject = 1;
 350                                        goto reject;
 351                                }
 352                                if (flocal >= (in->size>>1) &&
 353                                    flocal > orig_tries)
 354                                        item = bucket_perm_choose(in, x, r);
 355                                else
 356                                        item = crush_bucket_choose(in, x, r);
 357                                BUG_ON(item >= map->max_devices);
 358
 359                                /* desired type? */
 360                                if (item < 0)
 361                                        itemtype = map->buckets[-1-item]->type;
 362                                else
 363                                        itemtype = 0;
 364                                dprintk("  item %d type %d\n", item, itemtype);
 365
 366                                /* keep going? */
 367                                if (itemtype != type) {
 368                                        BUG_ON(item >= 0 ||
 369                                               (-1-item) >= map->max_buckets);
 370                                        in = map->buckets[-1-item];
 371                                        retry_bucket = 1;
 372                                        continue;
 373                                }
 374
 375                                /* collision? */
 376                                for (i = 0; i < outpos; i++) {
 377                                        if (out[i] == item) {
 378                                                collide = 1;
 379                                                break;
 380                                        }
 381                                }
 382
 383                                reject = 0;
 384                                if (recurse_to_leaf) {
 385                                        if (item < 0) {
 386                                                if (crush_choose(map,
 387                                                         map->buckets[-1-item],
 388                                                         weight,
 389                                                         x, outpos+1, 0,
 390                                                         out2, outpos,
 391                                                         firstn, 0,
 392                                                         NULL) <= outpos)
 393                                                        /* didn't get leaf */
 394                                                        reject = 1;
 395                                        } else {
 396                                                /* we already have a leaf! */
 397                                                out2[outpos] = item;
 398                                        }
 399                                }
 400
 401                                if (!reject) {
 402                                        /* out? */
 403                                        if (itemtype == 0)
 404                                                reject = is_out(map, weight,
 405                                                                item, x);
 406                                        else
 407                                                reject = 0;
 408                                }
 409
 410reject:
 411                                if (reject || collide) {
 412                                        ftotal++;
 413                                        flocal++;
 414
 415                                        if (collide && flocal < 3)
 416                                                /* retry locally a few times */
 417                                                retry_bucket = 1;
 418                                        else if (flocal < in->size + orig_tries)
 419                                                /* exhaustive bucket search */
 420                                                retry_bucket = 1;
 421                                        else if (ftotal < 20)
 422                                                /* then retry descent */
 423                                                retry_descent = 1;
 424                                        else
 425                                                /* else give up */
 426                                                skip_rep = 1;
 427                                        dprintk("  reject %d  collide %d  "
 428                                                "ftotal %d  flocal %d\n",
 429                                                reject, collide, ftotal,
 430                                                flocal);
 431                                }
 432                        } while (retry_bucket);
 433                } while (retry_descent);
 434
 435                if (skip_rep) {
 436                        dprintk("skip rep\n");
 437                        continue;
 438                }
 439
 440                dprintk("CHOOSE got %d\n", item);
 441                out[outpos] = item;
 442                outpos++;
 443        }
 444
 445        dprintk("CHOOSE returns %d\n", outpos);
 446        return outpos;
 447}
 448
 449
 450/**
 451 * crush_do_rule - calculate a mapping with the given input and rule
 452 * @map: the crush_map
 453 * @ruleno: the rule id
 454 * @x: hash input
 455 * @result: pointer to result vector
 456 * @result_max: maximum result size
 457 * @force: force initial replica choice; -1 for none
 458 */
 459int crush_do_rule(struct crush_map *map,
 460                  int ruleno, int x, int *result, int result_max,
 461                  int force, __u32 *weight)
 462{
 463        int result_len;
 464        int force_context[CRUSH_MAX_DEPTH];
 465        int force_pos = -1;
 466        int a[CRUSH_MAX_SET];
 467        int b[CRUSH_MAX_SET];
 468        int c[CRUSH_MAX_SET];
 469        int recurse_to_leaf;
 470        int *w;
 471        int wsize = 0;
 472        int *o;
 473        int osize;
 474        int *tmp;
 475        struct crush_rule *rule;
 476        int step;
 477        int i, j;
 478        int numrep;
 479        int firstn;
 480        int rc = -1;
 481
 482        BUG_ON(ruleno >= map->max_rules);
 483
 484        rule = map->rules[ruleno];
 485        result_len = 0;
 486        w = a;
 487        o = b;
 488
 489        /*
 490         * determine hierarchical context of force, if any.  note
 491         * that this may or may not correspond to the specific types
 492         * referenced by the crush rule.
 493         */
 494        if (force >= 0) {
 495                if (force >= map->max_devices ||
 496                    map->device_parents[force] == 0) {
 497                        /*dprintk("CRUSH: forcefed device dne\n");*/
 498                        rc = -1;  /* force fed device dne */
 499                        goto out;
 500                }
 501                if (!is_out(map, weight, force, x)) {
 502                        while (1) {
 503                                force_context[++force_pos] = force;
 504                                if (force >= 0)
 505                                        force = map->device_parents[force];
 506                                else
 507                                        force = map->bucket_parents[-1-force];
 508                                if (force == 0)
 509                                        break;
 510                        }
 511                }
 512        }
 513
 514        for (step = 0; step < rule->len; step++) {
 515                firstn = 0;
 516                switch (rule->steps[step].op) {
 517                case CRUSH_RULE_TAKE:
 518                        w[0] = rule->steps[step].arg1;
 519                        if (force_pos >= 0) {
 520                                BUG_ON(force_context[force_pos] != w[0]);
 521                                force_pos--;
 522                        }
 523                        wsize = 1;
 524                        break;
 525
 526                case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
 527                case CRUSH_RULE_CHOOSE_FIRSTN:
 528                        firstn = 1;
 529                case CRUSH_RULE_CHOOSE_LEAF_INDEP:
 530                case CRUSH_RULE_CHOOSE_INDEP:
 531                        BUG_ON(wsize == 0);
 532
 533                        recurse_to_leaf =
 534                                rule->steps[step].op ==
 535                                 CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
 536                                rule->steps[step].op ==
 537                                CRUSH_RULE_CHOOSE_LEAF_INDEP;
 538
 539                        /* reset output */
 540                        osize = 0;
 541
 542                        for (i = 0; i < wsize; i++) {
 543                                /*
 544                                 * see CRUSH_N, CRUSH_N_MINUS macros.
 545                                 * basically, numrep <= 0 means relative to
 546                                 * the provided result_max
 547                                 */
 548                                numrep = rule->steps[step].arg1;
 549                                if (numrep <= 0) {
 550                                        numrep += result_max;
 551                                        if (numrep <= 0)
 552                                                continue;
 553                                }
 554                                j = 0;
 555                                if (osize == 0 && force_pos >= 0) {
 556                                        /* skip any intermediate types */
 557                                        while (force_pos &&
 558                                               force_context[force_pos] < 0 &&
 559                                               rule->steps[step].arg2 !=
 560                                               map->buckets[-1 -
 561                                               force_context[force_pos]]->type)
 562                                                force_pos--;
 563                                        o[osize] = force_context[force_pos];
 564                                        if (recurse_to_leaf)
 565                                                c[osize] = force_context[0];
 566                                        j++;
 567                                        force_pos--;
 568                                }
 569                                osize += crush_choose(map,
 570                                                      map->buckets[-1-w[i]],
 571                                                      weight,
 572                                                      x, numrep,
 573                                                      rule->steps[step].arg2,
 574                                                      o+osize, j,
 575                                                      firstn,
 576                                                      recurse_to_leaf, c+osize);
 577                        }
 578
 579                        if (recurse_to_leaf)
 580                                /* copy final _leaf_ values to output set */
 581                                memcpy(o, c, osize*sizeof(*o));
 582
 583                        /* swap t and w arrays */
 584                        tmp = o;
 585                        o = w;
 586                        w = tmp;
 587                        wsize = osize;
 588                        break;
 589
 590
 591                case CRUSH_RULE_EMIT:
 592                        for (i = 0; i < wsize && result_len < result_max; i++) {
 593                                result[result_len] = w[i];
 594                                result_len++;
 595                        }
 596                        wsize = 0;
 597                        break;
 598
 599                default:
 600                        BUG_ON(1);
 601                }
 602        }
 603        rc = result_len;
 604
 605out:
 606        return rc;
 607}
 608
 609
 610