linux/drivers/md/persistent-data/dm-btree-remove.c
<<
>>
Prefs
   1/*
   2 * Copyright (C) 2011 Red Hat, Inc.
   3 *
   4 * This file is released under the GPL.
   5 */
   6
   7#include "dm-btree.h"
   8#include "dm-btree-internal.h"
   9#include "dm-transaction-manager.h"
  10
  11#include <linux/export.h>
  12
  13/*
  14 * Removing an entry from a btree
  15 * ==============================
  16 *
  17 * A very important constraint for our btree is that no node, except the
  18 * root, may have fewer than a certain number of entries.
  19 * (MIN_ENTRIES <= nr_entries <= MAX_ENTRIES).
  20 *
  21 * Ensuring this is complicated by the way we want to only ever hold the
  22 * locks on 2 nodes concurrently, and only change nodes in a top to bottom
  23 * fashion.
  24 *
  25 * Each node may have a left or right sibling.  When decending the spine,
  26 * if a node contains only MIN_ENTRIES then we try and increase this to at
  27 * least MIN_ENTRIES + 1.  We do this in the following ways:
  28 *
  29 * [A] No siblings => this can only happen if the node is the root, in which
  30 *     case we copy the childs contents over the root.
  31 *
  32 * [B] No left sibling
  33 *     ==> rebalance(node, right sibling)
  34 *
  35 * [C] No right sibling
  36 *     ==> rebalance(left sibling, node)
  37 *
  38 * [D] Both siblings, total_entries(left, node, right) <= DEL_THRESHOLD
  39 *     ==> delete node adding it's contents to left and right
  40 *
  41 * [E] Both siblings, total_entries(left, node, right) > DEL_THRESHOLD
  42 *     ==> rebalance(left, node, right)
  43 *
  44 * After these operations it's possible that the our original node no
  45 * longer contains the desired sub tree.  For this reason this rebalancing
  46 * is performed on the children of the current node.  This also avoids
  47 * having a special case for the root.
  48 *
  49 * Once this rebalancing has occurred we can then step into the child node
  50 * for internal nodes.  Or delete the entry for leaf nodes.
  51 */
  52
  53/*
  54 * Some little utilities for moving node data around.
  55 */
  56static void node_shift(struct btree_node *n, int shift)
  57{
  58        uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
  59        uint32_t value_size = le32_to_cpu(n->header.value_size);
  60
  61        if (shift < 0) {
  62                shift = -shift;
  63                BUG_ON(shift > nr_entries);
  64                BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift));
  65                memmove(key_ptr(n, 0),
  66                        key_ptr(n, shift),
  67                        (nr_entries - shift) * sizeof(__le64));
  68                memmove(value_ptr(n, 0),
  69                        value_ptr(n, shift),
  70                        (nr_entries - shift) * value_size);
  71        } else {
  72                BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries));
  73                memmove(key_ptr(n, shift),
  74                        key_ptr(n, 0),
  75                        nr_entries * sizeof(__le64));
  76                memmove(value_ptr(n, shift),
  77                        value_ptr(n, 0),
  78                        nr_entries * value_size);
  79        }
  80}
  81
  82static void node_copy(struct btree_node *left, struct btree_node *right, int shift)
  83{
  84        uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
  85        uint32_t value_size = le32_to_cpu(left->header.value_size);
  86        BUG_ON(value_size != le32_to_cpu(right->header.value_size));
  87
  88        if (shift < 0) {
  89                shift = -shift;
  90                BUG_ON(nr_left + shift > le32_to_cpu(left->header.max_entries));
  91                memcpy(key_ptr(left, nr_left),
  92                       key_ptr(right, 0),
  93                       shift * sizeof(__le64));
  94                memcpy(value_ptr(left, nr_left),
  95                       value_ptr(right, 0),
  96                       shift * value_size);
  97        } else {
  98                BUG_ON(shift > le32_to_cpu(right->header.max_entries));
  99                memcpy(key_ptr(right, 0),
 100                       key_ptr(left, nr_left - shift),
 101                       shift * sizeof(__le64));
 102                memcpy(value_ptr(right, 0),
 103                       value_ptr(left, nr_left - shift),
 104                       shift * value_size);
 105        }
 106}
 107
 108/*
 109 * Delete a specific entry from a leaf node.
 110 */
 111static void delete_at(struct btree_node *n, unsigned index)
 112{
 113        unsigned nr_entries = le32_to_cpu(n->header.nr_entries);
 114        unsigned nr_to_copy = nr_entries - (index + 1);
 115        uint32_t value_size = le32_to_cpu(n->header.value_size);
 116        BUG_ON(index >= nr_entries);
 117
 118        if (nr_to_copy) {
 119                memmove(key_ptr(n, index),
 120                        key_ptr(n, index + 1),
 121                        nr_to_copy * sizeof(__le64));
 122
 123                memmove(value_ptr(n, index),
 124                        value_ptr(n, index + 1),
 125                        nr_to_copy * value_size);
 126        }
 127
 128        n->header.nr_entries = cpu_to_le32(nr_entries - 1);
 129}
 130
 131static unsigned merge_threshold(struct btree_node *n)
 132{
 133        return le32_to_cpu(n->header.max_entries) / 3;
 134}
 135
 136struct child {
 137        unsigned index;
 138        struct dm_block *block;
 139        struct btree_node *n;
 140};
 141
 142static int init_child(struct dm_btree_info *info, struct dm_btree_value_type *vt,
 143                      struct btree_node *parent,
 144                      unsigned index, struct child *result)
 145{
 146        int r, inc;
 147        dm_block_t root;
 148
 149        result->index = index;
 150        root = value64(parent, index);
 151
 152        r = dm_tm_shadow_block(info->tm, root, &btree_node_validator,
 153                               &result->block, &inc);
 154        if (r)
 155                return r;
 156
 157        result->n = dm_block_data(result->block);
 158
 159        if (inc)
 160                inc_children(info->tm, result->n, vt);
 161
 162        *((__le64 *) value_ptr(parent, index)) =
 163                cpu_to_le64(dm_block_location(result->block));
 164
 165        return 0;
 166}
 167
 168static int exit_child(struct dm_btree_info *info, struct child *c)
 169{
 170        return dm_tm_unlock(info->tm, c->block);
 171}
 172
 173static void shift(struct btree_node *left, struct btree_node *right, int count)
 174{
 175        uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
 176        uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
 177        uint32_t max_entries = le32_to_cpu(left->header.max_entries);
 178        uint32_t r_max_entries = le32_to_cpu(right->header.max_entries);
 179
 180        BUG_ON(max_entries != r_max_entries);
 181        BUG_ON(nr_left - count > max_entries);
 182        BUG_ON(nr_right + count > max_entries);
 183
 184        if (!count)
 185                return;
 186
 187        if (count > 0) {
 188                node_shift(right, count);
 189                node_copy(left, right, count);
 190        } else {
 191                node_copy(left, right, count);
 192                node_shift(right, count);
 193        }
 194
 195        left->header.nr_entries = cpu_to_le32(nr_left - count);
 196        right->header.nr_entries = cpu_to_le32(nr_right + count);
 197}
 198
 199static void __rebalance2(struct dm_btree_info *info, struct btree_node *parent,
 200                         struct child *l, struct child *r)
 201{
 202        struct btree_node *left = l->n;
 203        struct btree_node *right = r->n;
 204        uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
 205        uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
 206        unsigned threshold = 2 * merge_threshold(left) + 1;
 207
 208        if (nr_left + nr_right < threshold) {
 209                /*
 210                 * Merge
 211                 */
 212                node_copy(left, right, -nr_right);
 213                left->header.nr_entries = cpu_to_le32(nr_left + nr_right);
 214                delete_at(parent, r->index);
 215
 216                /*
 217                 * We need to decrement the right block, but not it's
 218                 * children, since they're still referenced by left.
 219                 */
 220                dm_tm_dec(info->tm, dm_block_location(r->block));
 221        } else {
 222                /*
 223                 * Rebalance.
 224                 */
 225                unsigned target_left = (nr_left + nr_right) / 2;
 226                shift(left, right, nr_left - target_left);
 227                *key_ptr(parent, r->index) = right->keys[0];
 228        }
 229}
 230
 231static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
 232                      struct dm_btree_value_type *vt, unsigned left_index)
 233{
 234        int r;
 235        struct btree_node *parent;
 236        struct child left, right;
 237
 238        parent = dm_block_data(shadow_current(s));
 239
 240        r = init_child(info, vt, parent, left_index, &left);
 241        if (r)
 242                return r;
 243
 244        r = init_child(info, vt, parent, left_index + 1, &right);
 245        if (r) {
 246                exit_child(info, &left);
 247                return r;
 248        }
 249
 250        __rebalance2(info, parent, &left, &right);
 251
 252        r = exit_child(info, &left);
 253        if (r) {
 254                exit_child(info, &right);
 255                return r;
 256        }
 257
 258        return exit_child(info, &right);
 259}
 260
 261/*
 262 * We dump as many entries from center as possible into left, then the rest
 263 * in right, then rebalance2.  This wastes some cpu, but I want something
 264 * simple atm.
 265 */
 266static void delete_center_node(struct dm_btree_info *info, struct btree_node *parent,
 267                               struct child *l, struct child *c, struct child *r,
 268                               struct btree_node *left, struct btree_node *center, struct btree_node *right,
 269                               uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
 270{
 271        uint32_t max_entries = le32_to_cpu(left->header.max_entries);
 272        unsigned shift = min(max_entries - nr_left, nr_center);
 273
 274        BUG_ON(nr_left + shift > max_entries);
 275        node_copy(left, center, -shift);
 276        left->header.nr_entries = cpu_to_le32(nr_left + shift);
 277
 278        if (shift != nr_center) {
 279                shift = nr_center - shift;
 280                BUG_ON((nr_right + shift) > max_entries);
 281                node_shift(right, shift);
 282                node_copy(center, right, shift);
 283                right->header.nr_entries = cpu_to_le32(nr_right + shift);
 284        }
 285        *key_ptr(parent, r->index) = right->keys[0];
 286
 287        delete_at(parent, c->index);
 288        r->index--;
 289
 290        dm_tm_dec(info->tm, dm_block_location(c->block));
 291        __rebalance2(info, parent, l, r);
 292}
 293
 294/*
 295 * Redistributes entries among 3 sibling nodes.
 296 */
 297static void redistribute3(struct dm_btree_info *info, struct btree_node *parent,
 298                          struct child *l, struct child *c, struct child *r,
 299                          struct btree_node *left, struct btree_node *center, struct btree_node *right,
 300                          uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
 301{
 302        int s;
 303        uint32_t max_entries = le32_to_cpu(left->header.max_entries);
 304        unsigned target = (nr_left + nr_center + nr_right) / 3;
 305        BUG_ON(target > max_entries);
 306
 307        if (nr_left < nr_right) {
 308                s = nr_left - target;
 309
 310                if (s < 0 && nr_center < -s) {
 311                        /* not enough in central node */
 312                        shift(left, center, nr_center);
 313                        s = nr_center - target;
 314                        shift(left, right, s);
 315                        nr_right += s;
 316                } else
 317                        shift(left, center, s);
 318
 319                shift(center, right, target - nr_right);
 320
 321        } else {
 322                s = target - nr_right;
 323                if (s > 0 && nr_center < s) {
 324                        /* not enough in central node */
 325                        shift(center, right, nr_center);
 326                        s = target - nr_center;
 327                        shift(left, right, s);
 328                        nr_left -= s;
 329                } else
 330                        shift(center, right, s);
 331
 332                shift(left, center, nr_left - target);
 333        }
 334
 335        *key_ptr(parent, c->index) = center->keys[0];
 336        *key_ptr(parent, r->index) = right->keys[0];
 337}
 338
 339static void __rebalance3(struct dm_btree_info *info, struct btree_node *parent,
 340                         struct child *l, struct child *c, struct child *r)
 341{
 342        struct btree_node *left = l->n;
 343        struct btree_node *center = c->n;
 344        struct btree_node *right = r->n;
 345
 346        uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
 347        uint32_t nr_center = le32_to_cpu(center->header.nr_entries);
 348        uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
 349
 350        unsigned threshold = merge_threshold(left) * 4 + 1;
 351
 352        BUG_ON(left->header.max_entries != center->header.max_entries);
 353        BUG_ON(center->header.max_entries != right->header.max_entries);
 354
 355        if ((nr_left + nr_center + nr_right) < threshold)
 356                delete_center_node(info, parent, l, c, r, left, center, right,
 357                                   nr_left, nr_center, nr_right);
 358        else
 359                redistribute3(info, parent, l, c, r, left, center, right,
 360                              nr_left, nr_center, nr_right);
 361}
 362
 363static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
 364                      struct dm_btree_value_type *vt, unsigned left_index)
 365{
 366        int r;
 367        struct btree_node *parent = dm_block_data(shadow_current(s));
 368        struct child left, center, right;
 369
 370        /*
 371         * FIXME: fill out an array?
 372         */
 373        r = init_child(info, vt, parent, left_index, &left);
 374        if (r)
 375                return r;
 376
 377        r = init_child(info, vt, parent, left_index + 1, &center);
 378        if (r) {
 379                exit_child(info, &left);
 380                return r;
 381        }
 382
 383        r = init_child(info, vt, parent, left_index + 2, &right);
 384        if (r) {
 385                exit_child(info, &left);
 386                exit_child(info, &center);
 387                return r;
 388        }
 389
 390        __rebalance3(info, parent, &left, &center, &right);
 391
 392        r = exit_child(info, &left);
 393        if (r) {
 394                exit_child(info, &center);
 395                exit_child(info, &right);
 396                return r;
 397        }
 398
 399        r = exit_child(info, &center);
 400        if (r) {
 401                exit_child(info, &right);
 402                return r;
 403        }
 404
 405        r = exit_child(info, &right);
 406        if (r)
 407                return r;
 408
 409        return 0;
 410}
 411
 412static int get_nr_entries(struct dm_transaction_manager *tm,
 413                          dm_block_t b, uint32_t *result)
 414{
 415        int r;
 416        struct dm_block *block;
 417        struct btree_node *n;
 418
 419        r = dm_tm_read_lock(tm, b, &btree_node_validator, &block);
 420        if (r)
 421                return r;
 422
 423        n = dm_block_data(block);
 424        *result = le32_to_cpu(n->header.nr_entries);
 425
 426        return dm_tm_unlock(tm, block);
 427}
 428
 429static int rebalance_children(struct shadow_spine *s,
 430                              struct dm_btree_info *info,
 431                              struct dm_btree_value_type *vt, uint64_t key)
 432{
 433        int i, r, has_left_sibling, has_right_sibling;
 434        uint32_t child_entries;
 435        struct btree_node *n;
 436
 437        n = dm_block_data(shadow_current(s));
 438
 439        if (le32_to_cpu(n->header.nr_entries) == 1) {
 440                struct dm_block *child;
 441                dm_block_t b = value64(n, 0);
 442
 443                r = dm_tm_read_lock(info->tm, b, &btree_node_validator, &child);
 444                if (r)
 445                        return r;
 446
 447                memcpy(n, dm_block_data(child),
 448                       dm_bm_block_size(dm_tm_get_bm(info->tm)));
 449                r = dm_tm_unlock(info->tm, child);
 450                if (r)
 451                        return r;
 452
 453                dm_tm_dec(info->tm, dm_block_location(child));
 454                return 0;
 455        }
 456
 457        i = lower_bound(n, key);
 458        if (i < 0)
 459                return -ENODATA;
 460
 461        r = get_nr_entries(info->tm, value64(n, i), &child_entries);
 462        if (r)
 463                return r;
 464
 465        has_left_sibling = i > 0;
 466        has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);
 467
 468        if (!has_left_sibling)
 469                r = rebalance2(s, info, vt, i);
 470
 471        else if (!has_right_sibling)
 472                r = rebalance2(s, info, vt, i - 1);
 473
 474        else
 475                r = rebalance3(s, info, vt, i - 1);
 476
 477        return r;
 478}
 479
 480static int do_leaf(struct btree_node *n, uint64_t key, unsigned *index)
 481{
 482        int i = lower_bound(n, key);
 483
 484        if ((i < 0) ||
 485            (i >= le32_to_cpu(n->header.nr_entries)) ||
 486            (le64_to_cpu(n->keys[i]) != key))
 487                return -ENODATA;
 488
 489        *index = i;
 490
 491        return 0;
 492}
 493
 494/*
 495 * Prepares for removal from one level of the hierarchy.  The caller must
 496 * call delete_at() to remove the entry at index.
 497 */
 498static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
 499                      struct dm_btree_value_type *vt, dm_block_t root,
 500                      uint64_t key, unsigned *index)
 501{
 502        int i = *index, r;
 503        struct btree_node *n;
 504
 505        for (;;) {
 506                r = shadow_step(s, root, vt);
 507                if (r < 0)
 508                        break;
 509
 510                /*
 511                 * We have to patch up the parent node, ugly, but I don't
 512                 * see a way to do this automatically as part of the spine
 513                 * op.
 514                 */
 515                if (shadow_has_parent(s)) {
 516                        __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
 517                        memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
 518                               &location, sizeof(__le64));
 519                }
 520
 521                n = dm_block_data(shadow_current(s));
 522
 523                if (le32_to_cpu(n->header.flags) & LEAF_NODE)
 524                        return do_leaf(n, key, index);
 525
 526                r = rebalance_children(s, info, vt, key);
 527                if (r)
 528                        break;
 529
 530                n = dm_block_data(shadow_current(s));
 531                if (le32_to_cpu(n->header.flags) & LEAF_NODE)
 532                        return do_leaf(n, key, index);
 533
 534                i = lower_bound(n, key);
 535
 536                /*
 537                 * We know the key is present, or else
 538                 * rebalance_children would have returned
 539                 * -ENODATA
 540                 */
 541                root = value64(n, i);
 542        }
 543
 544        return r;
 545}
 546
 547static struct dm_btree_value_type le64_type = {
 548        .context = NULL,
 549        .size = sizeof(__le64),
 550        .inc = NULL,
 551        .dec = NULL,
 552        .equal = NULL
 553};
 554
 555int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
 556                    uint64_t *keys, dm_block_t *new_root)
 557{
 558        unsigned level, last_level = info->levels - 1;
 559        int index = 0, r = 0;
 560        struct shadow_spine spine;
 561        struct btree_node *n;
 562
 563        init_shadow_spine(&spine, info);
 564        for (level = 0; level < info->levels; level++) {
 565                r = remove_raw(&spine, info,
 566                               (level == last_level ?
 567                                &info->value_type : &le64_type),
 568                               root, keys[level], (unsigned *)&index);
 569                if (r < 0)
 570                        break;
 571
 572                n = dm_block_data(shadow_current(&spine));
 573                if (level != last_level) {
 574                        root = value64(n, index);
 575                        continue;
 576                }
 577
 578                BUG_ON(index < 0 || index >= le32_to_cpu(n->header.nr_entries));
 579
 580                if (info->value_type.dec)
 581                        info->value_type.dec(info->value_type.context,
 582                                             value_ptr(n, index));
 583
 584                delete_at(n, index);
 585        }
 586
 587        *new_root = shadow_root(&spine);
 588        exit_shadow_spine(&spine);
 589
 590        return r;
 591}
 592EXPORT_SYMBOL_GPL(dm_btree_remove);
 593