linux/drivers/md/persistent-data/dm-btree-remove.c
<<
>>
Prefs
   1/*
   2 * Copyright (C) 2011 Red Hat, Inc.
   3 *
   4 * This file is released under the GPL.
   5 */
   6
   7#include "dm-btree.h"
   8#include "dm-btree-internal.h"
   9#include "dm-transaction-manager.h"
  10
  11#include <linux/export.h>
  12
  13/*
  14 * Removing an entry from a btree
  15 * ==============================
  16 *
  17 * A very important constraint for our btree is that no node, except the
  18 * root, may have fewer than a certain number of entries.
  19 * (MIN_ENTRIES <= nr_entries <= MAX_ENTRIES).
  20 *
  21 * Ensuring this is complicated by the way we want to only ever hold the
  22 * locks on 2 nodes concurrently, and only change nodes in a top to bottom
  23 * fashion.
  24 *
  25 * Each node may have a left or right sibling.  When decending the spine,
  26 * if a node contains only MIN_ENTRIES then we try and increase this to at
  27 * least MIN_ENTRIES + 1.  We do this in the following ways:
  28 *
  29 * [A] No siblings => this can only happen if the node is the root, in which
  30 *     case we copy the childs contents over the root.
  31 *
  32 * [B] No left sibling
  33 *     ==> rebalance(node, right sibling)
  34 *
  35 * [C] No right sibling
  36 *     ==> rebalance(left sibling, node)
  37 *
  38 * [D] Both siblings, total_entries(left, node, right) <= DEL_THRESHOLD
  39 *     ==> delete node adding it's contents to left and right
  40 *
  41 * [E] Both siblings, total_entries(left, node, right) > DEL_THRESHOLD
  42 *     ==> rebalance(left, node, right)
  43 *
  44 * After these operations it's possible that the our original node no
  45 * longer contains the desired sub tree.  For this reason this rebalancing
  46 * is performed on the children of the current node.  This also avoids
  47 * having a special case for the root.
  48 *
  49 * Once this rebalancing has occurred we can then step into the child node
  50 * for internal nodes.  Or delete the entry for leaf nodes.
  51 */
  52
  53/*
  54 * Some little utilities for moving node data around.
  55 */
  56static void node_shift(struct btree_node *n, int shift)
  57{
  58        uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
  59        uint32_t value_size = le32_to_cpu(n->header.value_size);
  60
  61        if (shift < 0) {
  62                shift = -shift;
  63                BUG_ON(shift > nr_entries);
  64                BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift));
  65                memmove(key_ptr(n, 0),
  66                        key_ptr(n, shift),
  67                        (nr_entries - shift) * sizeof(__le64));
  68                memmove(value_ptr(n, 0),
  69                        value_ptr(n, shift),
  70                        (nr_entries - shift) * value_size);
  71        } else {
  72                BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries));
  73                memmove(key_ptr(n, shift),
  74                        key_ptr(n, 0),
  75                        nr_entries * sizeof(__le64));
  76                memmove(value_ptr(n, shift),
  77                        value_ptr(n, 0),
  78                        nr_entries * value_size);
  79        }
  80}
  81
  82static void node_copy(struct btree_node *left, struct btree_node *right, int shift)
  83{
  84        uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
  85        uint32_t value_size = le32_to_cpu(left->header.value_size);
  86        BUG_ON(value_size != le32_to_cpu(right->header.value_size));
  87
  88        if (shift < 0) {
  89                shift = -shift;
  90                BUG_ON(nr_left + shift > le32_to_cpu(left->header.max_entries));
  91                memcpy(key_ptr(left, nr_left),
  92                       key_ptr(right, 0),
  93                       shift * sizeof(__le64));
  94                memcpy(value_ptr(left, nr_left),
  95                       value_ptr(right, 0),
  96                       shift * value_size);
  97        } else {
  98                BUG_ON(shift > le32_to_cpu(right->header.max_entries));
  99                memcpy(key_ptr(right, 0),
 100                       key_ptr(left, nr_left - shift),
 101                       shift * sizeof(__le64));
 102                memcpy(value_ptr(right, 0),
 103                       value_ptr(left, nr_left - shift),
 104                       shift * value_size);
 105        }
 106}
 107
 108/*
 109 * Delete a specific entry from a leaf node.
 110 */
 111static void delete_at(struct btree_node *n, unsigned index)
 112{
 113        unsigned nr_entries = le32_to_cpu(n->header.nr_entries);
 114        unsigned nr_to_copy = nr_entries - (index + 1);
 115        uint32_t value_size = le32_to_cpu(n->header.value_size);
 116        BUG_ON(index >= nr_entries);
 117
 118        if (nr_to_copy) {
 119                memmove(key_ptr(n, index),
 120                        key_ptr(n, index + 1),
 121                        nr_to_copy * sizeof(__le64));
 122
 123                memmove(value_ptr(n, index),
 124                        value_ptr(n, index + 1),
 125                        nr_to_copy * value_size);
 126        }
 127
 128        n->header.nr_entries = cpu_to_le32(nr_entries - 1);
 129}
 130
 131static unsigned merge_threshold(struct btree_node *n)
 132{
 133        return le32_to_cpu(n->header.max_entries) / 3;
 134}
 135
 136struct child {
 137        unsigned index;
 138        struct dm_block *block;
 139        struct btree_node *n;
 140};
 141
 142static int init_child(struct dm_btree_info *info, struct dm_btree_value_type *vt,
 143                      struct btree_node *parent,
 144                      unsigned index, struct child *result)
 145{
 146        int r, inc;
 147        dm_block_t root;
 148
 149        result->index = index;
 150        root = value64(parent, index);
 151
 152        r = dm_tm_shadow_block(info->tm, root, &btree_node_validator,
 153                               &result->block, &inc);
 154        if (r)
 155                return r;
 156
 157        result->n = dm_block_data(result->block);
 158
 159        if (inc)
 160                inc_children(info->tm, result->n, vt);
 161
 162        *((__le64 *) value_ptr(parent, index)) =
 163                cpu_to_le64(dm_block_location(result->block));
 164
 165        return 0;
 166}
 167
 168static void exit_child(struct dm_btree_info *info, struct child *c)
 169{
 170        dm_tm_unlock(info->tm, c->block);
 171}
 172
 173static void shift(struct btree_node *left, struct btree_node *right, int count)
 174{
 175        uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
 176        uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
 177        uint32_t max_entries = le32_to_cpu(left->header.max_entries);
 178        uint32_t r_max_entries = le32_to_cpu(right->header.max_entries);
 179
 180        BUG_ON(max_entries != r_max_entries);
 181        BUG_ON(nr_left - count > max_entries);
 182        BUG_ON(nr_right + count > max_entries);
 183
 184        if (!count)
 185                return;
 186
 187        if (count > 0) {
 188                node_shift(right, count);
 189                node_copy(left, right, count);
 190        } else {
 191                node_copy(left, right, count);
 192                node_shift(right, count);
 193        }
 194
 195        left->header.nr_entries = cpu_to_le32(nr_left - count);
 196        right->header.nr_entries = cpu_to_le32(nr_right + count);
 197}
 198
 199static void __rebalance2(struct dm_btree_info *info, struct btree_node *parent,
 200                         struct child *l, struct child *r)
 201{
 202        struct btree_node *left = l->n;
 203        struct btree_node *right = r->n;
 204        uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
 205        uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
 206        /*
 207         * Ensure the number of entries in each child will be greater
 208         * than or equal to (max_entries / 3 + 1), so no matter which
 209         * child is used for removal, the number will still be not
 210         * less than (max_entries / 3).
 211         */
 212        unsigned int threshold = 2 * (merge_threshold(left) + 1);
 213
 214        if (nr_left + nr_right < threshold) {
 215                /*
 216                 * Merge
 217                 */
 218                node_copy(left, right, -nr_right);
 219                left->header.nr_entries = cpu_to_le32(nr_left + nr_right);
 220                delete_at(parent, r->index);
 221
 222                /*
 223                 * We need to decrement the right block, but not it's
 224                 * children, since they're still referenced by left.
 225                 */
 226                dm_tm_dec(info->tm, dm_block_location(r->block));
 227        } else {
 228                /*
 229                 * Rebalance.
 230                 */
 231                unsigned target_left = (nr_left + nr_right) / 2;
 232                shift(left, right, nr_left - target_left);
 233                *key_ptr(parent, r->index) = right->keys[0];
 234        }
 235}
 236
 237static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
 238                      struct dm_btree_value_type *vt, unsigned left_index)
 239{
 240        int r;
 241        struct btree_node *parent;
 242        struct child left, right;
 243
 244        parent = dm_block_data(shadow_current(s));
 245
 246        r = init_child(info, vt, parent, left_index, &left);
 247        if (r)
 248                return r;
 249
 250        r = init_child(info, vt, parent, left_index + 1, &right);
 251        if (r) {
 252                exit_child(info, &left);
 253                return r;
 254        }
 255
 256        __rebalance2(info, parent, &left, &right);
 257
 258        exit_child(info, &left);
 259        exit_child(info, &right);
 260
 261        return 0;
 262}
 263
 264/*
 265 * We dump as many entries from center as possible into left, then the rest
 266 * in right, then rebalance2.  This wastes some cpu, but I want something
 267 * simple atm.
 268 */
 269static void delete_center_node(struct dm_btree_info *info, struct btree_node *parent,
 270                               struct child *l, struct child *c, struct child *r,
 271                               struct btree_node *left, struct btree_node *center, struct btree_node *right,
 272                               uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
 273{
 274        uint32_t max_entries = le32_to_cpu(left->header.max_entries);
 275        unsigned shift = min(max_entries - nr_left, nr_center);
 276
 277        BUG_ON(nr_left + shift > max_entries);
 278        node_copy(left, center, -shift);
 279        left->header.nr_entries = cpu_to_le32(nr_left + shift);
 280
 281        if (shift != nr_center) {
 282                shift = nr_center - shift;
 283                BUG_ON((nr_right + shift) > max_entries);
 284                node_shift(right, shift);
 285                node_copy(center, right, shift);
 286                right->header.nr_entries = cpu_to_le32(nr_right + shift);
 287        }
 288        *key_ptr(parent, r->index) = right->keys[0];
 289
 290        delete_at(parent, c->index);
 291        r->index--;
 292
 293        dm_tm_dec(info->tm, dm_block_location(c->block));
 294        __rebalance2(info, parent, l, r);
 295}
 296
 297/*
 298 * Redistributes entries among 3 sibling nodes.
 299 */
 300static void redistribute3(struct dm_btree_info *info, struct btree_node *parent,
 301                          struct child *l, struct child *c, struct child *r,
 302                          struct btree_node *left, struct btree_node *center, struct btree_node *right,
 303                          uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
 304{
 305        int s;
 306        uint32_t max_entries = le32_to_cpu(left->header.max_entries);
 307        unsigned total = nr_left + nr_center + nr_right;
 308        unsigned target_right = total / 3;
 309        unsigned remainder = (target_right * 3) != total;
 310        unsigned target_left = target_right + remainder;
 311
 312        BUG_ON(target_left > max_entries);
 313        BUG_ON(target_right > max_entries);
 314
 315        if (nr_left < nr_right) {
 316                s = nr_left - target_left;
 317
 318                if (s < 0 && nr_center < -s) {
 319                        /* not enough in central node */
 320                        shift(left, center, -nr_center);
 321                        s += nr_center;
 322                        shift(left, right, s);
 323                        nr_right += s;
 324                } else
 325                        shift(left, center, s);
 326
 327                shift(center, right, target_right - nr_right);
 328
 329        } else {
 330                s = target_right - nr_right;
 331                if (s > 0 && nr_center < s) {
 332                        /* not enough in central node */
 333                        shift(center, right, nr_center);
 334                        s -= nr_center;
 335                        shift(left, right, s);
 336                        nr_left -= s;
 337                } else
 338                        shift(center, right, s);
 339
 340                shift(left, center, nr_left - target_left);
 341        }
 342
 343        *key_ptr(parent, c->index) = center->keys[0];
 344        *key_ptr(parent, r->index) = right->keys[0];
 345}
 346
 347static void __rebalance3(struct dm_btree_info *info, struct btree_node *parent,
 348                         struct child *l, struct child *c, struct child *r)
 349{
 350        struct btree_node *left = l->n;
 351        struct btree_node *center = c->n;
 352        struct btree_node *right = r->n;
 353
 354        uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
 355        uint32_t nr_center = le32_to_cpu(center->header.nr_entries);
 356        uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
 357
 358        unsigned threshold = merge_threshold(left) * 4 + 1;
 359
 360        BUG_ON(left->header.max_entries != center->header.max_entries);
 361        BUG_ON(center->header.max_entries != right->header.max_entries);
 362
 363        if ((nr_left + nr_center + nr_right) < threshold)
 364                delete_center_node(info, parent, l, c, r, left, center, right,
 365                                   nr_left, nr_center, nr_right);
 366        else
 367                redistribute3(info, parent, l, c, r, left, center, right,
 368                              nr_left, nr_center, nr_right);
 369}
 370
 371static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
 372                      struct dm_btree_value_type *vt, unsigned left_index)
 373{
 374        int r;
 375        struct btree_node *parent = dm_block_data(shadow_current(s));
 376        struct child left, center, right;
 377
 378        /*
 379         * FIXME: fill out an array?
 380         */
 381        r = init_child(info, vt, parent, left_index, &left);
 382        if (r)
 383                return r;
 384
 385        r = init_child(info, vt, parent, left_index + 1, &center);
 386        if (r) {
 387                exit_child(info, &left);
 388                return r;
 389        }
 390
 391        r = init_child(info, vt, parent, left_index + 2, &right);
 392        if (r) {
 393                exit_child(info, &left);
 394                exit_child(info, &center);
 395                return r;
 396        }
 397
 398        __rebalance3(info, parent, &left, &center, &right);
 399
 400        exit_child(info, &left);
 401        exit_child(info, &center);
 402        exit_child(info, &right);
 403
 404        return 0;
 405}
 406
 407static int rebalance_children(struct shadow_spine *s,
 408                              struct dm_btree_info *info,
 409                              struct dm_btree_value_type *vt, uint64_t key)
 410{
 411        int i, r, has_left_sibling, has_right_sibling;
 412        struct btree_node *n;
 413
 414        n = dm_block_data(shadow_current(s));
 415
 416        if (le32_to_cpu(n->header.nr_entries) == 1) {
 417                struct dm_block *child;
 418                dm_block_t b = value64(n, 0);
 419
 420                r = dm_tm_read_lock(info->tm, b, &btree_node_validator, &child);
 421                if (r)
 422                        return r;
 423
 424                memcpy(n, dm_block_data(child),
 425                       dm_bm_block_size(dm_tm_get_bm(info->tm)));
 426                dm_tm_unlock(info->tm, child);
 427
 428                dm_tm_dec(info->tm, dm_block_location(child));
 429                return 0;
 430        }
 431
 432        i = lower_bound(n, key);
 433        if (i < 0)
 434                return -ENODATA;
 435
 436        has_left_sibling = i > 0;
 437        has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);
 438
 439        if (!has_left_sibling)
 440                r = rebalance2(s, info, vt, i);
 441
 442        else if (!has_right_sibling)
 443                r = rebalance2(s, info, vt, i - 1);
 444
 445        else
 446                r = rebalance3(s, info, vt, i - 1);
 447
 448        return r;
 449}
 450
 451static int do_leaf(struct btree_node *n, uint64_t key, unsigned *index)
 452{
 453        int i = lower_bound(n, key);
 454
 455        if ((i < 0) ||
 456            (i >= le32_to_cpu(n->header.nr_entries)) ||
 457            (le64_to_cpu(n->keys[i]) != key))
 458                return -ENODATA;
 459
 460        *index = i;
 461
 462        return 0;
 463}
 464
 465/*
 466 * Prepares for removal from one level of the hierarchy.  The caller must
 467 * call delete_at() to remove the entry at index.
 468 */
 469static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
 470                      struct dm_btree_value_type *vt, dm_block_t root,
 471                      uint64_t key, unsigned *index)
 472{
 473        int i = *index, r;
 474        struct btree_node *n;
 475
 476        for (;;) {
 477                r = shadow_step(s, root, vt);
 478                if (r < 0)
 479                        break;
 480
 481                /*
 482                 * We have to patch up the parent node, ugly, but I don't
 483                 * see a way to do this automatically as part of the spine
 484                 * op.
 485                 */
 486                if (shadow_has_parent(s)) {
 487                        __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
 488                        memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
 489                               &location, sizeof(__le64));
 490                }
 491
 492                n = dm_block_data(shadow_current(s));
 493
 494                if (le32_to_cpu(n->header.flags) & LEAF_NODE)
 495                        return do_leaf(n, key, index);
 496
 497                r = rebalance_children(s, info, vt, key);
 498                if (r)
 499                        break;
 500
 501                n = dm_block_data(shadow_current(s));
 502                if (le32_to_cpu(n->header.flags) & LEAF_NODE)
 503                        return do_leaf(n, key, index);
 504
 505                i = lower_bound(n, key);
 506
 507                /*
 508                 * We know the key is present, or else
 509                 * rebalance_children would have returned
 510                 * -ENODATA
 511                 */
 512                root = value64(n, i);
 513        }
 514
 515        return r;
 516}
 517
 518int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
 519                    uint64_t *keys, dm_block_t *new_root)
 520{
 521        unsigned level, last_level = info->levels - 1;
 522        int index = 0, r = 0;
 523        struct shadow_spine spine;
 524        struct btree_node *n;
 525        struct dm_btree_value_type le64_vt;
 526
 527        init_le64_type(info->tm, &le64_vt);
 528        init_shadow_spine(&spine, info);
 529        for (level = 0; level < info->levels; level++) {
 530                r = remove_raw(&spine, info,
 531                               (level == last_level ?
 532                                &info->value_type : &le64_vt),
 533                               root, keys[level], (unsigned *)&index);
 534                if (r < 0)
 535                        break;
 536
 537                n = dm_block_data(shadow_current(&spine));
 538                if (level != last_level) {
 539                        root = value64(n, index);
 540                        continue;
 541                }
 542
 543                BUG_ON(index < 0 || index >= le32_to_cpu(n->header.nr_entries));
 544
 545                if (info->value_type.dec)
 546                        info->value_type.dec(info->value_type.context,
 547                                             value_ptr(n, index), 1);
 548
 549                delete_at(n, index);
 550        }
 551
 552        if (!r)
 553                *new_root = shadow_root(&spine);
 554        exit_shadow_spine(&spine);
 555
 556        return r;
 557}
 558EXPORT_SYMBOL_GPL(dm_btree_remove);
 559
 560/*----------------------------------------------------------------*/
 561
 562static int remove_nearest(struct shadow_spine *s, struct dm_btree_info *info,
 563                          struct dm_btree_value_type *vt, dm_block_t root,
 564                          uint64_t key, int *index)
 565{
 566        int i = *index, r;
 567        struct btree_node *n;
 568
 569        for (;;) {
 570                r = shadow_step(s, root, vt);
 571                if (r < 0)
 572                        break;
 573
 574                /*
 575                 * We have to patch up the parent node, ugly, but I don't
 576                 * see a way to do this automatically as part of the spine
 577                 * op.
 578                 */
 579                if (shadow_has_parent(s)) {
 580                        __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
 581                        memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
 582                               &location, sizeof(__le64));
 583                }
 584
 585                n = dm_block_data(shadow_current(s));
 586
 587                if (le32_to_cpu(n->header.flags) & LEAF_NODE) {
 588                        *index = lower_bound(n, key);
 589                        return 0;
 590                }
 591
 592                r = rebalance_children(s, info, vt, key);
 593                if (r)
 594                        break;
 595
 596                n = dm_block_data(shadow_current(s));
 597                if (le32_to_cpu(n->header.flags) & LEAF_NODE) {
 598                        *index = lower_bound(n, key);
 599                        return 0;
 600                }
 601
 602                i = lower_bound(n, key);
 603
 604                /*
 605                 * We know the key is present, or else
 606                 * rebalance_children would have returned
 607                 * -ENODATA
 608                 */
 609                root = value64(n, i);
 610        }
 611
 612        return r;
 613}
 614
 615static int remove_one(struct dm_btree_info *info, dm_block_t root,
 616                      uint64_t *keys, uint64_t end_key,
 617                      dm_block_t *new_root, unsigned *nr_removed)
 618{
 619        unsigned level, last_level = info->levels - 1;
 620        int index = 0, r = 0;
 621        struct shadow_spine spine;
 622        struct btree_node *n;
 623        struct dm_btree_value_type le64_vt;
 624        uint64_t k;
 625
 626        init_le64_type(info->tm, &le64_vt);
 627        init_shadow_spine(&spine, info);
 628        for (level = 0; level < last_level; level++) {
 629                r = remove_raw(&spine, info, &le64_vt,
 630                               root, keys[level], (unsigned *) &index);
 631                if (r < 0)
 632                        goto out;
 633
 634                n = dm_block_data(shadow_current(&spine));
 635                root = value64(n, index);
 636        }
 637
 638        r = remove_nearest(&spine, info, &info->value_type,
 639                           root, keys[last_level], &index);
 640        if (r < 0)
 641                goto out;
 642
 643        n = dm_block_data(shadow_current(&spine));
 644
 645        if (index < 0)
 646                index = 0;
 647
 648        if (index >= le32_to_cpu(n->header.nr_entries)) {
 649                r = -ENODATA;
 650                goto out;
 651        }
 652
 653        k = le64_to_cpu(n->keys[index]);
 654        if (k >= keys[last_level] && k < end_key) {
 655                if (info->value_type.dec)
 656                        info->value_type.dec(info->value_type.context,
 657                                             value_ptr(n, index), 1);
 658
 659                delete_at(n, index);
 660                keys[last_level] = k + 1ull;
 661
 662        } else
 663                r = -ENODATA;
 664
 665out:
 666        *new_root = shadow_root(&spine);
 667        exit_shadow_spine(&spine);
 668
 669        return r;
 670}
 671
 672int dm_btree_remove_leaves(struct dm_btree_info *info, dm_block_t root,
 673                           uint64_t *first_key, uint64_t end_key,
 674                           dm_block_t *new_root, unsigned *nr_removed)
 675{
 676        int r;
 677
 678        *nr_removed = 0;
 679        do {
 680                r = remove_one(info, root, first_key, end_key, &root, nr_removed);
 681                if (!r)
 682                        (*nr_removed)++;
 683        } while (!r);
 684
 685        *new_root = root;
 686        return r == -ENODATA ? 0 : r;
 687}
 688EXPORT_SYMBOL_GPL(dm_btree_remove_leaves);
 689