linux/net/ceph/crush/mapper.c
<<
>>
Prefs
   1/*
   2 * Ceph - scalable distributed file system
   3 *
   4 * Copyright (C) 2015 Intel Corporation All Rights Reserved
   5 *
   6 * This is free software; you can redistribute it and/or
   7 * modify it under the terms of the GNU Lesser General Public
   8 * License version 2.1, as published by the Free Software
   9 * Foundation.  See file COPYING.
  10 *
  11 */
  12
  13#ifdef __KERNEL__
  14# include <linux/string.h>
  15# include <linux/slab.h>
  16# include <linux/bug.h>
  17# include <linux/kernel.h>
  18# include <linux/crush/crush.h>
  19# include <linux/crush/hash.h>
  20#else
  21# include "crush_compat.h"
  22# include "crush.h"
  23# include "hash.h"
  24#endif
  25#include "crush_ln_table.h"
  26
  27#define dprintk(args...) /* printf(args) */
  28
  29/*
  30 * Implement the core CRUSH mapping algorithm.
  31 */
  32
  33/**
  34 * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
  35 * @map: the crush_map
  36 * @ruleset: the storage ruleset id (user defined)
  37 * @type: storage ruleset type (user defined)
  38 * @size: output set size
  39 */
  40int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size)
  41{
  42        __u32 i;
  43
  44        for (i = 0; i < map->max_rules; i++) {
  45                if (map->rules[i] &&
  46                    map->rules[i]->mask.ruleset == ruleset &&
  47                    map->rules[i]->mask.type == type &&
  48                    map->rules[i]->mask.min_size <= size &&
  49                    map->rules[i]->mask.max_size >= size)
  50                        return i;
  51        }
  52        return -1;
  53}
  54
  55
  56/*
  57 * bucket choose methods
  58 *
  59 * For each bucket algorithm, we have a "choose" method that, given a
  60 * crush input @x and replica position (usually, position in output set) @r,
  61 * will produce an item in the bucket.
  62 */
  63
  64/*
  65 * Choose based on a random permutation of the bucket.
  66 *
  67 * We used to use some prime number arithmetic to do this, but it
  68 * wasn't very random, and had some other bad behaviors.  Instead, we
  69 * calculate an actual random permutation of the bucket members.
  70 * Since this is expensive, we optimize for the r=0 case, which
  71 * captures the vast majority of calls.
  72 */
  73static int bucket_perm_choose(struct crush_bucket *bucket,
  74                              int x, int r)
  75{
  76        unsigned int pr = r % bucket->size;
  77        unsigned int i, s;
  78
  79        /* start a new permutation if @x has changed */
  80        if (bucket->perm_x != (__u32)x || bucket->perm_n == 0) {
  81                dprintk("bucket %d new x=%d\n", bucket->id, x);
  82                bucket->perm_x = x;
  83
  84                /* optimize common r=0 case */
  85                if (pr == 0) {
  86                        s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
  87                                bucket->size;
  88                        bucket->perm[0] = s;
  89                        bucket->perm_n = 0xffff;   /* magic value, see below */
  90                        goto out;
  91                }
  92
  93                for (i = 0; i < bucket->size; i++)
  94                        bucket->perm[i] = i;
  95                bucket->perm_n = 0;
  96        } else if (bucket->perm_n == 0xffff) {
  97                /* clean up after the r=0 case above */
  98                for (i = 1; i < bucket->size; i++)
  99                        bucket->perm[i] = i;
 100                bucket->perm[bucket->perm[0]] = 0;
 101                bucket->perm_n = 1;
 102        }
 103
 104        /* calculate permutation up to pr */
 105        for (i = 0; i < bucket->perm_n; i++)
 106                dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
 107        while (bucket->perm_n <= pr) {
 108                unsigned int p = bucket->perm_n;
 109                /* no point in swapping the final entry */
 110                if (p < bucket->size - 1) {
 111                        i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
 112                                (bucket->size - p);
 113                        if (i) {
 114                                unsigned int t = bucket->perm[p + i];
 115                                bucket->perm[p + i] = bucket->perm[p];
 116                                bucket->perm[p] = t;
 117                        }
 118                        dprintk(" perm_choose swap %d with %d\n", p, p+i);
 119                }
 120                bucket->perm_n++;
 121        }
 122        for (i = 0; i < bucket->size; i++)
 123                dprintk(" perm_choose  %d: %d\n", i, bucket->perm[i]);
 124
 125        s = bucket->perm[pr];
 126out:
 127        dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
 128                bucket->size, x, r, pr, s);
 129        return bucket->items[s];
 130}
 131
 132/* uniform */
 133static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
 134                                 int x, int r)
 135{
 136        return bucket_perm_choose(&bucket->h, x, r);
 137}
 138
 139/* list */
 140static int bucket_list_choose(struct crush_bucket_list *bucket,
 141                              int x, int r)
 142{
 143        int i;
 144
 145        for (i = bucket->h.size-1; i >= 0; i--) {
 146                __u64 w = crush_hash32_4(bucket->h.hash, x, bucket->h.items[i],
 147                                         r, bucket->h.id);
 148                w &= 0xffff;
 149                dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
 150                        "sw %x rand %llx",
 151                        i, x, r, bucket->h.items[i], bucket->item_weights[i],
 152                        bucket->sum_weights[i], w);
 153                w *= bucket->sum_weights[i];
 154                w = w >> 16;
 155                /*dprintk(" scaled %llx\n", w);*/
 156                if (w < bucket->item_weights[i])
 157                        return bucket->h.items[i];
 158        }
 159
 160        dprintk("bad list sums for bucket %d\n", bucket->h.id);
 161        return bucket->h.items[0];
 162}
 163
 164
 165/* (binary) tree */
 166static int height(int n)
 167{
 168        int h = 0;
 169        while ((n & 1) == 0) {
 170                h++;
 171                n = n >> 1;
 172        }
 173        return h;
 174}
 175
 176static int left(int x)
 177{
 178        int h = height(x);
 179        return x - (1 << (h-1));
 180}
 181
 182static int right(int x)
 183{
 184        int h = height(x);
 185        return x + (1 << (h-1));
 186}
 187
 188static int terminal(int x)
 189{
 190        return x & 1;
 191}
 192
 193static int bucket_tree_choose(struct crush_bucket_tree *bucket,
 194                              int x, int r)
 195{
 196        int n;
 197        __u32 w;
 198        __u64 t;
 199
 200        /* start at root */
 201        n = bucket->num_nodes >> 1;
 202
 203        while (!terminal(n)) {
 204                int l;
 205                /* pick point in [0, w) */
 206                w = bucket->node_weights[n];
 207                t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
 208                                          bucket->h.id) * (__u64)w;
 209                t = t >> 32;
 210
 211                /* descend to the left or right? */
 212                l = left(n);
 213                if (t < bucket->node_weights[l])
 214                        n = l;
 215                else
 216                        n = right(n);
 217        }
 218
 219        return bucket->h.items[n >> 1];
 220}
 221
 222
 223/* straw */
 224
 225static int bucket_straw_choose(struct crush_bucket_straw *bucket,
 226                               int x, int r)
 227{
 228        __u32 i;
 229        int high = 0;
 230        __u64 high_draw = 0;
 231        __u64 draw;
 232
 233        for (i = 0; i < bucket->h.size; i++) {
 234                draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
 235                draw &= 0xffff;
 236                draw *= bucket->straws[i];
 237                if (i == 0 || draw > high_draw) {
 238                        high = i;
 239                        high_draw = draw;
 240                }
 241        }
 242        return bucket->h.items[high];
 243}
 244
 245/* compute 2^44*log2(input+1) */
 246static __u64 crush_ln(unsigned int xin)
 247{
 248        unsigned int x = xin, x1;
 249        int iexpon, index1, index2;
 250        __u64 RH, LH, LL, xl64, result;
 251
 252        x++;
 253
 254        /* normalize input */
 255        iexpon = 15;
 256        while (!(x & 0x18000)) {
 257                x <<= 1;
 258                iexpon--;
 259        }
 260
 261        index1 = (x >> 8) << 1;
 262        /* RH ~ 2^56/index1 */
 263        RH = __RH_LH_tbl[index1 - 256];
 264        /* LH ~ 2^48 * log2(index1/256) */
 265        LH = __RH_LH_tbl[index1 + 1 - 256];
 266
 267        /* RH*x ~ 2^48 * (2^15 + xf), xf<2^8 */
 268        xl64 = (__s64)x * RH;
 269        xl64 >>= 48;
 270        x1 = xl64;
 271
 272        result = iexpon;
 273        result <<= (12 + 32);
 274
 275        index2 = x1 & 0xff;
 276        /* LL ~ 2^48*log2(1.0+index2/2^15) */
 277        LL = __LL_tbl[index2];
 278
 279        LH = LH + LL;
 280
 281        LH >>= (48 - 12 - 32);
 282        result += LH;
 283
 284        return result;
 285}
 286
 287
 288/*
 289 * straw2
 290 *
 291 * for reference, see:
 292 *
 293 * http://en.wikipedia.org/wiki/Exponential_distribution#Distribution_of_the_minimum_of_exponential_random_variables
 294 *
 295 */
 296
 297static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket,
 298                                int x, int r)
 299{
 300        unsigned int i, high = 0;
 301        unsigned int u;
 302        unsigned int w;
 303        __s64 ln, draw, high_draw = 0;
 304
 305        for (i = 0; i < bucket->h.size; i++) {
 306                w = bucket->item_weights[i];
 307                if (w) {
 308                        u = crush_hash32_3(bucket->h.hash, x,
 309                                           bucket->h.items[i], r);
 310                        u &= 0xffff;
 311
 312                        /*
 313                         * for some reason slightly less than 0x10000 produces
 314                         * a slightly more accurate distribution... probably a
 315                         * rounding effect.
 316                         *
 317                         * the natural log lookup table maps [0,0xffff]
 318                         * (corresponding to real numbers [1/0x10000, 1] to
 319                         * [0, 0xffffffffffff] (corresponding to real numbers
 320                         * [-11.090355,0]).
 321                         */
 322                        ln = crush_ln(u) - 0x1000000000000ll;
 323
 324                        /*
 325                         * divide by 16.16 fixed-point weight.  note
 326                         * that the ln value is negative, so a larger
 327                         * weight means a larger (less negative) value
 328                         * for draw.
 329                         */
 330                        draw = div64_s64(ln, w);
 331                } else {
 332                        draw = S64_MIN;
 333                }
 334
 335                if (i == 0 || draw > high_draw) {
 336                        high = i;
 337                        high_draw = draw;
 338                }
 339        }
 340        return bucket->h.items[high];
 341}
 342
 343
 344static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
 345{
 346        dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
 347        BUG_ON(in->size == 0);
 348        switch (in->alg) {
 349        case CRUSH_BUCKET_UNIFORM:
 350                return bucket_uniform_choose((struct crush_bucket_uniform *)in,
 351                                          x, r);
 352        case CRUSH_BUCKET_LIST:
 353                return bucket_list_choose((struct crush_bucket_list *)in,
 354                                          x, r);
 355        case CRUSH_BUCKET_TREE:
 356                return bucket_tree_choose((struct crush_bucket_tree *)in,
 357                                          x, r);
 358        case CRUSH_BUCKET_STRAW:
 359                return bucket_straw_choose((struct crush_bucket_straw *)in,
 360                                           x, r);
 361        case CRUSH_BUCKET_STRAW2:
 362                return bucket_straw2_choose((struct crush_bucket_straw2 *)in,
 363                                            x, r);
 364        default:
 365                dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
 366                return in->items[0];
 367        }
 368}
 369
 370
 371/*
 372 * true if device is marked "out" (failed, fully offloaded)
 373 * of the cluster
 374 */
 375static int is_out(const struct crush_map *map,
 376                  const __u32 *weight, int weight_max,
 377                  int item, int x)
 378{
 379        if (item >= weight_max)
 380                return 1;
 381        if (weight[item] >= 0x10000)
 382                return 0;
 383        if (weight[item] == 0)
 384                return 1;
 385        if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
 386            < weight[item])
 387                return 0;
 388        return 1;
 389}
 390
 391/**
 392 * crush_choose_firstn - choose numrep distinct items of given type
 393 * @map: the crush_map
 394 * @bucket: the bucket we are choose an item from
 395 * @x: crush input value
 396 * @numrep: the number of items to choose
 397 * @type: the type of item to choose
 398 * @out: pointer to output vector
 399 * @outpos: our position in that vector
 400 * @out_size: size of the out vector
 401 * @tries: number of attempts to make
 402 * @recurse_tries: number of attempts to have recursive chooseleaf make
 403 * @local_retries: localized retries
 404 * @local_fallback_retries: localized fallback retries
 405 * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose)
 406 * @stable: stable mode starts rep=0 in the recursive call for all replicas
 407 * @vary_r: pass r to recursive calls
 408 * @out2: second output vector for leaf items (if @recurse_to_leaf)
 409 * @parent_r: r value passed from the parent
 410 */
 411static int crush_choose_firstn(const struct crush_map *map,
 412                               struct crush_bucket *bucket,
 413                               const __u32 *weight, int weight_max,
 414                               int x, int numrep, int type,
 415                               int *out, int outpos,
 416                               int out_size,
 417                               unsigned int tries,
 418                               unsigned int recurse_tries,
 419                               unsigned int local_retries,
 420                               unsigned int local_fallback_retries,
 421                               int recurse_to_leaf,
 422                               unsigned int vary_r,
 423                               unsigned int stable,
 424                               int *out2,
 425                               int parent_r)
 426{
 427        int rep;
 428        unsigned int ftotal, flocal;
 429        int retry_descent, retry_bucket, skip_rep;
 430        struct crush_bucket *in = bucket;
 431        int r;
 432        int i;
 433        int item = 0;
 434        int itemtype;
 435        int collide, reject;
 436        int count = out_size;
 437
 438        dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d stable %d\n",
 439                recurse_to_leaf ? "_LEAF" : "",
 440                bucket->id, x, outpos, numrep,
 441                tries, recurse_tries, local_retries, local_fallback_retries,
 442                parent_r, stable);
 443
 444        for (rep = stable ? 0 : outpos; rep < numrep && count > 0 ; rep++) {
 445                /* keep trying until we get a non-out, non-colliding item */
 446                ftotal = 0;
 447                skip_rep = 0;
 448                do {
 449                        retry_descent = 0;
 450                        in = bucket;               /* initial bucket */
 451
 452                        /* choose through intervening buckets */
 453                        flocal = 0;
 454                        do {
 455                                collide = 0;
 456                                retry_bucket = 0;
 457                                r = rep + parent_r;
 458                                /* r' = r + f_total */
 459                                r += ftotal;
 460
 461                                /* bucket choose */
 462                                if (in->size == 0) {
 463                                        reject = 1;
 464                                        goto reject;
 465                                }
 466                                if (local_fallback_retries > 0 &&
 467                                    flocal >= (in->size>>1) &&
 468                                    flocal > local_fallback_retries)
 469                                        item = bucket_perm_choose(in, x, r);
 470                                else
 471                                        item = crush_bucket_choose(in, x, r);
 472                                if (item >= map->max_devices) {
 473                                        dprintk("   bad item %d\n", item);
 474                                        skip_rep = 1;
 475                                        break;
 476                                }
 477
 478                                /* desired type? */
 479                                if (item < 0)
 480                                        itemtype = map->buckets[-1-item]->type;
 481                                else
 482                                        itemtype = 0;
 483                                dprintk("  item %d type %d\n", item, itemtype);
 484
 485                                /* keep going? */
 486                                if (itemtype != type) {
 487                                        if (item >= 0 ||
 488                                            (-1-item) >= map->max_buckets) {
 489                                                dprintk("   bad item type %d\n", type);
 490                                                skip_rep = 1;
 491                                                break;
 492                                        }
 493                                        in = map->buckets[-1-item];
 494                                        retry_bucket = 1;
 495                                        continue;
 496                                }
 497
 498                                /* collision? */
 499                                for (i = 0; i < outpos; i++) {
 500                                        if (out[i] == item) {
 501                                                collide = 1;
 502                                                break;
 503                                        }
 504                                }
 505
 506                                reject = 0;
 507                                if (!collide && recurse_to_leaf) {
 508                                        if (item < 0) {
 509                                                int sub_r;
 510                                                if (vary_r)
 511                                                        sub_r = r >> (vary_r-1);
 512                                                else
 513                                                        sub_r = 0;
 514                                                if (crush_choose_firstn(map,
 515                                                         map->buckets[-1-item],
 516                                                         weight, weight_max,
 517                                                         x, stable ? 1 : outpos+1, 0,
 518                                                         out2, outpos, count,
 519                                                         recurse_tries, 0,
 520                                                         local_retries,
 521                                                         local_fallback_retries,
 522                                                         0,
 523                                                         vary_r,
 524                                                         stable,
 525                                                         NULL,
 526                                                         sub_r) <= outpos)
 527                                                        /* didn't get leaf */
 528                                                        reject = 1;
 529                                        } else {
 530                                                /* we already have a leaf! */
 531                                                out2[outpos] = item;
 532                                        }
 533                                }
 534
 535                                if (!reject) {
 536                                        /* out? */
 537                                        if (itemtype == 0)
 538                                                reject = is_out(map, weight,
 539                                                                weight_max,
 540                                                                item, x);
 541                                        else
 542                                                reject = 0;
 543                                }
 544
 545reject:
 546                                if (reject || collide) {
 547                                        ftotal++;
 548                                        flocal++;
 549
 550                                        if (collide && flocal <= local_retries)
 551                                                /* retry locally a few times */
 552                                                retry_bucket = 1;
 553                                        else if (local_fallback_retries > 0 &&
 554                                                 flocal <= in->size + local_fallback_retries)
 555                                                /* exhaustive bucket search */
 556                                                retry_bucket = 1;
 557                                        else if (ftotal < tries)
 558                                                /* then retry descent */
 559                                                retry_descent = 1;
 560                                        else
 561                                                /* else give up */
 562                                                skip_rep = 1;
 563                                        dprintk("  reject %d  collide %d  "
 564                                                "ftotal %u  flocal %u\n",
 565                                                reject, collide, ftotal,
 566                                                flocal);
 567                                }
 568                        } while (retry_bucket);
 569                } while (retry_descent);
 570
 571                if (skip_rep) {
 572                        dprintk("skip rep\n");
 573                        continue;
 574                }
 575
 576                dprintk("CHOOSE got %d\n", item);
 577                out[outpos] = item;
 578                outpos++;
 579                count--;
 580#ifndef __KERNEL__
 581                if (map->choose_tries && ftotal <= map->choose_total_tries)
 582                        map->choose_tries[ftotal]++;
 583#endif
 584        }
 585
 586        dprintk("CHOOSE returns %d\n", outpos);
 587        return outpos;
 588}
 589
 590
 591/**
 592 * crush_choose_indep: alternative breadth-first positionally stable mapping
 593 *
 594 */
 595static void crush_choose_indep(const struct crush_map *map,
 596                               struct crush_bucket *bucket,
 597                               const __u32 *weight, int weight_max,
 598                               int x, int left, int numrep, int type,
 599                               int *out, int outpos,
 600                               unsigned int tries,
 601                               unsigned int recurse_tries,
 602                               int recurse_to_leaf,
 603                               int *out2,
 604                               int parent_r)
 605{
 606        struct crush_bucket *in = bucket;
 607        int endpos = outpos + left;
 608        int rep;
 609        unsigned int ftotal;
 610        int r;
 611        int i;
 612        int item = 0;
 613        int itemtype;
 614        int collide;
 615
 616        dprintk("CHOOSE%s INDEP bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
 617                bucket->id, x, outpos, numrep);
 618
 619        /* initially my result is undefined */
 620        for (rep = outpos; rep < endpos; rep++) {
 621                out[rep] = CRUSH_ITEM_UNDEF;
 622                if (out2)
 623                        out2[rep] = CRUSH_ITEM_UNDEF;
 624        }
 625
 626        for (ftotal = 0; left > 0 && ftotal < tries; ftotal++) {
 627#ifdef DEBUG_INDEP
 628                if (out2 && ftotal) {
 629                        dprintk("%u %d a: ", ftotal, left);
 630                        for (rep = outpos; rep < endpos; rep++) {
 631                                dprintk(" %d", out[rep]);
 632                        }
 633                        dprintk("\n");
 634                        dprintk("%u %d b: ", ftotal, left);
 635                        for (rep = outpos; rep < endpos; rep++) {
 636                                dprintk(" %d", out2[rep]);
 637                        }
 638                        dprintk("\n");
 639                }
 640#endif
 641                for (rep = outpos; rep < endpos; rep++) {
 642                        if (out[rep] != CRUSH_ITEM_UNDEF)
 643                                continue;
 644
 645                        in = bucket;  /* initial bucket */
 646
 647                        /* choose through intervening buckets */
 648                        for (;;) {
 649                                /* note: we base the choice on the position
 650                                 * even in the nested call.  that means that
 651                                 * if the first layer chooses the same bucket
 652                                 * in a different position, we will tend to
 653                                 * choose a different item in that bucket.
 654                                 * this will involve more devices in data
 655                                 * movement and tend to distribute the load.
 656                                 */
 657                                r = rep + parent_r;
 658
 659                                /* be careful */
 660                                if (in->alg == CRUSH_BUCKET_UNIFORM &&
 661                                    in->size % numrep == 0)
 662                                        /* r'=r+(n+1)*f_total */
 663                                        r += (numrep+1) * ftotal;
 664                                else
 665                                        /* r' = r + n*f_total */
 666                                        r += numrep * ftotal;
 667
 668                                /* bucket choose */
 669                                if (in->size == 0) {
 670                                        dprintk("   empty bucket\n");
 671                                        break;
 672                                }
 673
 674                                item = crush_bucket_choose(in, x, r);
 675                                if (item >= map->max_devices) {
 676                                        dprintk("   bad item %d\n", item);
 677                                        out[rep] = CRUSH_ITEM_NONE;
 678                                        if (out2)
 679                                                out2[rep] = CRUSH_ITEM_NONE;
 680                                        left--;
 681                                        break;
 682                                }
 683
 684                                /* desired type? */
 685                                if (item < 0)
 686                                        itemtype = map->buckets[-1-item]->type;
 687                                else
 688                                        itemtype = 0;
 689                                dprintk("  item %d type %d\n", item, itemtype);
 690
 691                                /* keep going? */
 692                                if (itemtype != type) {
 693                                        if (item >= 0 ||
 694                                            (-1-item) >= map->max_buckets) {
 695                                                dprintk("   bad item type %d\n", type);
 696                                                out[rep] = CRUSH_ITEM_NONE;
 697                                                if (out2)
 698                                                        out2[rep] =
 699                                                                CRUSH_ITEM_NONE;
 700                                                left--;
 701                                                break;
 702                                        }
 703                                        in = map->buckets[-1-item];
 704                                        continue;
 705                                }
 706
 707                                /* collision? */
 708                                collide = 0;
 709                                for (i = outpos; i < endpos; i++) {
 710                                        if (out[i] == item) {
 711                                                collide = 1;
 712                                                break;
 713                                        }
 714                                }
 715                                if (collide)
 716                                        break;
 717
 718                                if (recurse_to_leaf) {
 719                                        if (item < 0) {
 720                                                crush_choose_indep(map,
 721                                                   map->buckets[-1-item],
 722                                                   weight, weight_max,
 723                                                   x, 1, numrep, 0,
 724                                                   out2, rep,
 725                                                   recurse_tries, 0,
 726                                                   0, NULL, r);
 727                                                if (out2[rep] == CRUSH_ITEM_NONE) {
 728                                                        /* placed nothing; no leaf */
 729                                                        break;
 730                                                }
 731                                        } else {
 732                                                /* we already have a leaf! */
 733                                                out2[rep] = item;
 734                                        }
 735                                }
 736
 737                                /* out? */
 738                                if (itemtype == 0 &&
 739                                    is_out(map, weight, weight_max, item, x))
 740                                        break;
 741
 742                                /* yay! */
 743                                out[rep] = item;
 744                                left--;
 745                                break;
 746                        }
 747                }
 748        }
 749        for (rep = outpos; rep < endpos; rep++) {
 750                if (out[rep] == CRUSH_ITEM_UNDEF) {
 751                        out[rep] = CRUSH_ITEM_NONE;
 752                }
 753                if (out2 && out2[rep] == CRUSH_ITEM_UNDEF) {
 754                        out2[rep] = CRUSH_ITEM_NONE;
 755                }
 756        }
 757#ifndef __KERNEL__
 758        if (map->choose_tries && ftotal <= map->choose_total_tries)
 759                map->choose_tries[ftotal]++;
 760#endif
 761#ifdef DEBUG_INDEP
 762        if (out2) {
 763                dprintk("%u %d a: ", ftotal, left);
 764                for (rep = outpos; rep < endpos; rep++) {
 765                        dprintk(" %d", out[rep]);
 766                }
 767                dprintk("\n");
 768                dprintk("%u %d b: ", ftotal, left);
 769                for (rep = outpos; rep < endpos; rep++) {
 770                        dprintk(" %d", out2[rep]);
 771                }
 772                dprintk("\n");
 773        }
 774#endif
 775}
 776
 777/**
 778 * crush_do_rule - calculate a mapping with the given input and rule
 779 * @map: the crush_map
 780 * @ruleno: the rule id
 781 * @x: hash input
 782 * @result: pointer to result vector
 783 * @result_max: maximum result size
 784 * @weight: weight vector (for map leaves)
 785 * @weight_max: size of weight vector
 786 * @scratch: scratch vector for private use; must be >= 3 * result_max
 787 */
 788int crush_do_rule(const struct crush_map *map,
 789                  int ruleno, int x, int *result, int result_max,
 790                  const __u32 *weight, int weight_max,
 791                  int *scratch)
 792{
 793        int result_len;
 794        int *a = scratch;
 795        int *b = scratch + result_max;
 796        int *c = scratch + result_max*2;
 797        int recurse_to_leaf;
 798        int *w;
 799        int wsize = 0;
 800        int *o;
 801        int osize;
 802        int *tmp;
 803        struct crush_rule *rule;
 804        __u32 step;
 805        int i, j;
 806        int numrep;
 807        int out_size;
 808        /*
 809         * the original choose_total_tries value was off by one (it
 810         * counted "retries" and not "tries").  add one.
 811         */
 812        int choose_tries = map->choose_total_tries + 1;
 813        int choose_leaf_tries = 0;
 814        /*
 815         * the local tries values were counted as "retries", though,
 816         * and need no adjustment
 817         */
 818        int choose_local_retries = map->choose_local_tries;
 819        int choose_local_fallback_retries = map->choose_local_fallback_tries;
 820
 821        int vary_r = map->chooseleaf_vary_r;
 822        int stable = map->chooseleaf_stable;
 823
 824        if ((__u32)ruleno >= map->max_rules) {
 825                dprintk(" bad ruleno %d\n", ruleno);
 826                return 0;
 827        }
 828
 829        rule = map->rules[ruleno];
 830        result_len = 0;
 831        w = a;
 832        o = b;
 833
 834        for (step = 0; step < rule->len; step++) {
 835                int firstn = 0;
 836                struct crush_rule_step *curstep = &rule->steps[step];
 837
 838                switch (curstep->op) {
 839                case CRUSH_RULE_TAKE:
 840                        if ((curstep->arg1 >= 0 &&
 841                             curstep->arg1 < map->max_devices) ||
 842                            (-1-curstep->arg1 >= 0 &&
 843                             -1-curstep->arg1 < map->max_buckets &&
 844                             map->buckets[-1-curstep->arg1])) {
 845                                w[0] = curstep->arg1;
 846                                wsize = 1;
 847                        } else {
 848                                dprintk(" bad take value %d\n", curstep->arg1);
 849                        }
 850                        break;
 851
 852                case CRUSH_RULE_SET_CHOOSE_TRIES:
 853                        if (curstep->arg1 > 0)
 854                                choose_tries = curstep->arg1;
 855                        break;
 856
 857                case CRUSH_RULE_SET_CHOOSELEAF_TRIES:
 858                        if (curstep->arg1 > 0)
 859                                choose_leaf_tries = curstep->arg1;
 860                        break;
 861
 862                case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES:
 863                        if (curstep->arg1 >= 0)
 864                                choose_local_retries = curstep->arg1;
 865                        break;
 866
 867                case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES:
 868                        if (curstep->arg1 >= 0)
 869                                choose_local_fallback_retries = curstep->arg1;
 870                        break;
 871
 872                case CRUSH_RULE_SET_CHOOSELEAF_VARY_R:
 873                        if (curstep->arg1 >= 0)
 874                                vary_r = curstep->arg1;
 875                        break;
 876
 877                case CRUSH_RULE_SET_CHOOSELEAF_STABLE:
 878                        if (curstep->arg1 >= 0)
 879                                stable = curstep->arg1;
 880                        break;
 881
 882                case CRUSH_RULE_CHOOSELEAF_FIRSTN:
 883                case CRUSH_RULE_CHOOSE_FIRSTN:
 884                        firstn = 1;
 885                        /* fall through */
 886                case CRUSH_RULE_CHOOSELEAF_INDEP:
 887                case CRUSH_RULE_CHOOSE_INDEP:
 888                        if (wsize == 0)
 889                                break;
 890
 891                        recurse_to_leaf =
 892                                curstep->op ==
 893                                 CRUSH_RULE_CHOOSELEAF_FIRSTN ||
 894                                curstep->op ==
 895                                CRUSH_RULE_CHOOSELEAF_INDEP;
 896
 897                        /* reset output */
 898                        osize = 0;
 899
 900                        for (i = 0; i < wsize; i++) {
 901                                int bno;
 902                                /*
 903                                 * see CRUSH_N, CRUSH_N_MINUS macros.
 904                                 * basically, numrep <= 0 means relative to
 905                                 * the provided result_max
 906                                 */
 907                                numrep = curstep->arg1;
 908                                if (numrep <= 0) {
 909                                        numrep += result_max;
 910                                        if (numrep <= 0)
 911                                                continue;
 912                                }
 913                                j = 0;
 914                                /* make sure bucket id is valid */
 915                                bno = -1 - w[i];
 916                                if (bno < 0 || bno >= map->max_buckets) {
 917                                        /* w[i] is probably CRUSH_ITEM_NONE */
 918                                        dprintk("  bad w[i] %d\n", w[i]);
 919                                        continue;
 920                                }
 921                                if (firstn) {
 922                                        int recurse_tries;
 923                                        if (choose_leaf_tries)
 924                                                recurse_tries =
 925                                                        choose_leaf_tries;
 926                                        else if (map->chooseleaf_descend_once)
 927                                                recurse_tries = 1;
 928                                        else
 929                                                recurse_tries = choose_tries;
 930                                        osize += crush_choose_firstn(
 931                                                map,
 932                                                map->buckets[bno],
 933                                                weight, weight_max,
 934                                                x, numrep,
 935                                                curstep->arg2,
 936                                                o+osize, j,
 937                                                result_max-osize,
 938                                                choose_tries,
 939                                                recurse_tries,
 940                                                choose_local_retries,
 941                                                choose_local_fallback_retries,
 942                                                recurse_to_leaf,
 943                                                vary_r,
 944                                                stable,
 945                                                c+osize,
 946                                                0);
 947                                } else {
 948                                        out_size = ((numrep < (result_max-osize)) ?
 949                                                    numrep : (result_max-osize));
 950                                        crush_choose_indep(
 951                                                map,
 952                                                map->buckets[bno],
 953                                                weight, weight_max,
 954                                                x, out_size, numrep,
 955                                                curstep->arg2,
 956                                                o+osize, j,
 957                                                choose_tries,
 958                                                choose_leaf_tries ?
 959                                                   choose_leaf_tries : 1,
 960                                                recurse_to_leaf,
 961                                                c+osize,
 962                                                0);
 963                                        osize += out_size;
 964                                }
 965                        }
 966
 967                        if (recurse_to_leaf)
 968                                /* copy final _leaf_ values to output set */
 969                                memcpy(o, c, osize*sizeof(*o));
 970
 971                        /* swap o and w arrays */
 972                        tmp = o;
 973                        o = w;
 974                        w = tmp;
 975                        wsize = osize;
 976                        break;
 977
 978
 979                case CRUSH_RULE_EMIT:
 980                        for (i = 0; i < wsize && result_len < result_max; i++) {
 981                                result[result_len] = w[i];
 982                                result_len++;
 983                        }
 984                        wsize = 0;
 985                        break;
 986
 987                default:
 988                        dprintk(" unknown op %d at step %d\n",
 989                                curstep->op, step);
 990                        break;
 991                }
 992        }
 993        return result_len;
 994}
 995