linux/drivers/md/persistent-data/dm-btree-remove.c
/*
 * Copyright (C) 2011 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-btree.h"
#include "dm-btree-internal.h"
#include "dm-transaction-manager.h"

#include <linux/export.h>

/*
 * Removing an entry from a btree
 * ==============================
 *
 * A very important constraint for our btree is that no node, except the
 * root, may have fewer than a certain number of entries.
 * (MIN_ENTRIES <= nr_entries <= MAX_ENTRIES).
 *
 * Ensuring this is complicated by the way we want to only ever hold the
 * locks on 2 nodes concurrently, and only change nodes in a top to bottom
 * fashion.
 *
 * Each node may have a left or right sibling.  When descending the spine,
 * if a node contains only MIN_ENTRIES then we try to increase this to at
 * least MIN_ENTRIES + 1.  We do this in the following ways:
 *
 * [A] No siblings => this can only happen if the node is the root, in which
 *     case we copy the child's contents over the root.
 *
 * [B] No left sibling
 *     ==> rebalance(node, right sibling)
 *
 * [C] No right sibling
 *     ==> rebalance(left sibling, node)
 *
 * [D] Both siblings, total_entries(left, node, right) <= DEL_THRESHOLD
 *     ==> delete node, adding its contents to left and right
 *
 * [E] Both siblings, total_entries(left, node, right) > DEL_THRESHOLD
 *     ==> rebalance(left, node, right)
 *
 * After these operations it's possible that our original node no longer
 * contains the desired sub tree.  For this reason this rebalancing is
 * performed on the children of the current node.  This also avoids having
 * a special case for the root.
 *
 * Once this rebalancing has occurred we can then step into the child node
 * for internal nodes, or delete the entry for leaf nodes.
 */
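
/*
 * A rough worked example of the thresholds used below (numbers chosen for
 * this comment only): with max_entries = 9, merge_threshold() returns 3.
 * Three siblings holding (3, 3, 3) entries (total 9, below 4 * 3 + 1) fall
 * under case [D] and the centre node is deleted; siblings holding
 * (5, 3, 5) entries (total 13) fall under case [E] and are redistributed
 * so each holds roughly a third of the total.
 */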

/*
 * Some little utilities for moving node data around.
 */
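/*
 * Shifts the entries within a single node.  A positive shift moves every
 * entry towards the back of the node, leaving room at the front; a
 * negative shift discards the first -shift entries by moving the rest
 * towards the front.  The entry count is left for the caller to adjust.
 */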
static void node_shift(struct node *n, int shift)
{
        uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
        uint32_t value_size = le32_to_cpu(n->header.value_size);

        if (shift < 0) {
                shift = -shift;
                BUG_ON(shift > nr_entries);
                BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift));
                memmove(key_ptr(n, 0),
                        key_ptr(n, shift),
                        (nr_entries - shift) * sizeof(__le64));
                memmove(value_ptr(n, 0),
                        value_ptr(n, shift),
                        (nr_entries - shift) * value_size);
        } else {
                BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries));
                memmove(key_ptr(n, shift),
                        key_ptr(n, 0),
                        nr_entries * sizeof(__le64));
                memmove(value_ptr(n, shift),
                        value_ptr(n, 0),
                        nr_entries * value_size);
        }
}

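/*
 * Copies entries between two sibling nodes.  A negative shift copies the
 * first -shift entries of @right onto the end of @left; a positive shift
 * copies the last shift entries of @left into the front of @right.  Again,
 * the entry counts are left for the caller to adjust.
 */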
static void node_copy(struct node *left, struct node *right, int shift)
{
        uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
        uint32_t value_size = le32_to_cpu(left->header.value_size);

        BUG_ON(value_size != le32_to_cpu(right->header.value_size));

        if (shift < 0) {
                shift = -shift;
                BUG_ON(nr_left + shift > le32_to_cpu(left->header.max_entries));
                memcpy(key_ptr(left, nr_left),
                       key_ptr(right, 0),
                       shift * sizeof(__le64));
                memcpy(value_ptr(left, nr_left),
                       value_ptr(right, 0),
                       shift * value_size);
        } else {
                BUG_ON(shift > le32_to_cpu(right->header.max_entries));
                memcpy(key_ptr(right, 0),
                       key_ptr(left, nr_left - shift),
                       shift * sizeof(__le64));
                memcpy(value_ptr(right, 0),
                       value_ptr(left, nr_left - shift),
                       shift * value_size);
        }
}

/*
 * Delete a specific entry from a node (used on both internal and leaf nodes).
 */
static void delete_at(struct node *n, unsigned index)
{
        unsigned nr_entries = le32_to_cpu(n->header.nr_entries);
        unsigned nr_to_copy = nr_entries - (index + 1);
        uint32_t value_size = le32_to_cpu(n->header.value_size);

        BUG_ON(index >= nr_entries);

        if (nr_to_copy) {
                memmove(key_ptr(n, index),
                        key_ptr(n, index + 1),
                        nr_to_copy * sizeof(__le64));

                memmove(value_ptr(n, index),
                        value_ptr(n, index + 1),
                        nr_to_copy * value_size);
        }

        n->header.nr_entries = cpu_to_le32(nr_entries - 1);
}

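/*
 * merge_threshold() returns a third of a node's capacity.  __rebalance2()
 * and __rebalance3() below merge siblings, rather than redistributing
 * entries between them, when their combined entry count falls below a
 * small multiple of this value.
 */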
static unsigned merge_threshold(struct node *n)
{
        return le32_to_cpu(n->header.max_entries) / 3;
}

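/*
 * A shadowed child node, together with the index of the entry in the
 * parent that refers to it.  init_child()/exit_child() below pair up the
 * shadowing and the unlock.
 */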
struct child {
        unsigned index;
        struct dm_block *block;
        struct node *n;
};

static struct dm_btree_value_type le64_type = {
        .context = NULL,
        .size = sizeof(__le64),
        .inc = NULL,
        .dec = NULL,
        .equal = NULL
};

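/*
 * Shadows the child block referenced by entry @index of @parent,
 * incrementing the reference counts of the child's children if the shadow
 * was freshly copied, and patches the parent's value to point at the new
 * block location.
 */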
static int init_child(struct dm_btree_info *info, struct node *parent,
                      unsigned index, struct child *result)
{
        int r, inc;
        dm_block_t root;

        result->index = index;
        root = value64(parent, index);

        r = dm_tm_shadow_block(info->tm, root, &btree_node_validator,
                               &result->block, &inc);
        if (r)
                return r;

        result->n = dm_block_data(result->block);

        if (inc)
                inc_children(info->tm, result->n, &le64_type);

        *((__le64 *) value_ptr(parent, index)) =
                cpu_to_le64(dm_block_location(result->block));

        return 0;
}

static int exit_child(struct dm_btree_info *info, struct child *c)
{
        return dm_tm_unlock(info->tm, c->block);
}

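/*
 * Moves @count entries between two adjacent siblings, keeping the keys in
 * order.  A positive count moves the last @count entries of @left to the
 * front of @right; a negative count moves the first -count entries of
 * @right onto the end of @left.  Entry counts in both headers are updated.
 */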
static void shift(struct node *left, struct node *right, int count)
{
        uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
        uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
        uint32_t max_entries = le32_to_cpu(left->header.max_entries);
        uint32_t r_max_entries = le32_to_cpu(right->header.max_entries);

        BUG_ON(max_entries != r_max_entries);
        BUG_ON(nr_left - count > max_entries);
        BUG_ON(nr_right + count > max_entries);

        if (!count)
                return;

        if (count > 0) {
                node_shift(right, count);
                node_copy(left, right, count);
        } else {
                node_copy(left, right, count);
                node_shift(right, count);
        }

        left->header.nr_entries = cpu_to_le32(nr_left - count);
        right->header.nr_entries = cpu_to_le32(nr_right + count);
}

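/*
 * Rebalances a pair of adjacent children: if their combined entry count is
 * low enough, the right child is merged into the left one and removed from
 * the parent; otherwise the entries are shared out evenly and the parent's
 * key for the right child is updated.
 */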
static void __rebalance2(struct dm_btree_info *info, struct node *parent,
                         struct child *l, struct child *r)
{
        struct node *left = l->n;
        struct node *right = r->n;
        uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
        uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
        unsigned threshold = 2 * merge_threshold(left) + 1;

        if (nr_left + nr_right < threshold) {
                /*
                 * Merge
                 */
                node_copy(left, right, -nr_right);
                left->header.nr_entries = cpu_to_le32(nr_left + nr_right);
                delete_at(parent, r->index);

                /*
                 * We need to decrement the right block, but not its
                 * children, since they're still referenced by left.
                 */
                dm_tm_dec(info->tm, dm_block_location(r->block));
        } else {
                /*
                 * Rebalance.
                 */
                unsigned target_left = (nr_left + nr_right) / 2;

                shift(left, right, nr_left - target_left);
                *key_ptr(parent, r->index) = right->keys[0];
        }
}

static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
                      unsigned left_index)
{
        int r;
        struct node *parent;
        struct child left, right;

        parent = dm_block_data(shadow_current(s));

        r = init_child(info, parent, left_index, &left);
        if (r)
                return r;

        r = init_child(info, parent, left_index + 1, &right);
        if (r) {
                exit_child(info, &left);
                return r;
        }

        __rebalance2(info, parent, &left, &right);

        r = exit_child(info, &left);
        if (r) {
                exit_child(info, &right);
                return r;
        }

        return exit_child(info, &right);
}

/*
 * We dump as many entries from the center node as possible into left, then
 * the rest into right, then call rebalance2().  This wastes some cpu, but I
 * want something simple atm.
 */
static void delete_center_node(struct dm_btree_info *info, struct node *parent,
                               struct child *l, struct child *c, struct child *r,
                               struct node *left, struct node *center, struct node *right,
                               uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
{
        uint32_t max_entries = le32_to_cpu(left->header.max_entries);
        unsigned shift = min(max_entries - nr_left, nr_center);

        BUG_ON(nr_left + shift > max_entries);
        node_copy(left, center, -shift);
        left->header.nr_entries = cpu_to_le32(nr_left + shift);

        if (shift != nr_center) {
                shift = nr_center - shift;
                BUG_ON((nr_right + shift) > max_entries);
                node_shift(right, shift);
                node_copy(center, right, shift);
                right->header.nr_entries = cpu_to_le32(nr_right + shift);
        }
        *key_ptr(parent, r->index) = right->keys[0];

        delete_at(parent, c->index);
        r->index--;

        dm_tm_dec(info->tm, dm_block_location(c->block));
        __rebalance2(info, parent, l, r);
}

/*
 * Redistributes entries among 3 sibling nodes.
 */
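/*
 * For example (illustrative numbers only): the target is a third of the
 * combined total, so siblings holding (7, 2, 3) entries end up holding
 * (4, 4, 4).  When the central node cannot supply enough entries on its
 * own, entries are pulled from the far sibling as well.
 */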
static void redistribute3(struct dm_btree_info *info, struct node *parent,
                          struct child *l, struct child *c, struct child *r,
                          struct node *left, struct node *center, struct node *right,
                          uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
{
        int s;
        uint32_t max_entries = le32_to_cpu(left->header.max_entries);
        unsigned target = (nr_left + nr_center + nr_right) / 3;

        BUG_ON(target > max_entries);

        if (nr_left < nr_right) {
                s = nr_left - target;

                if (s < 0 && nr_center < -s) {
                        /* not enough in central node */
                        shift(left, center, -nr_center);
                        s += nr_center;
                        shift(left, right, s);
                        nr_right += s;
                } else
                        shift(left, center, s);

                shift(center, right, target - nr_right);

        } else {
                s = target - nr_right;
                if (s > 0 && nr_center < s) {
                        /* not enough in central node */
                        shift(center, right, nr_center);
                        s -= nr_center;
                        shift(left, right, s);
                        nr_left -= s;
                } else
                        shift(center, right, s);

                shift(left, center, nr_left - target);
        }

        *key_ptr(parent, c->index) = center->keys[0];
        *key_ptr(parent, r->index) = right->keys[0];
}

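/*
 * Rebalances a node and both of its siblings: when the three together hold
 * too few entries the central node is emptied and deleted, otherwise the
 * entries are redistributed across all three.
 */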
static void __rebalance3(struct dm_btree_info *info, struct node *parent,
                         struct child *l, struct child *c, struct child *r)
{
        struct node *left = l->n;
        struct node *center = c->n;
        struct node *right = r->n;

        uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
        uint32_t nr_center = le32_to_cpu(center->header.nr_entries);
        uint32_t nr_right = le32_to_cpu(right->header.nr_entries);

        unsigned threshold = merge_threshold(left) * 4 + 1;

        BUG_ON(left->header.max_entries != center->header.max_entries);
        BUG_ON(center->header.max_entries != right->header.max_entries);

        if ((nr_left + nr_center + nr_right) < threshold)
                delete_center_node(info, parent, l, c, r, left, center, right,
                                   nr_left, nr_center, nr_right);
        else
                redistribute3(info, parent, l, c, r, left, center, right,
                              nr_left, nr_center, nr_right);
}

static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
                      unsigned left_index)
{
        int r;
        struct node *parent = dm_block_data(shadow_current(s));
        struct child left, center, right;

        /*
         * FIXME: fill out an array?
         */
        r = init_child(info, parent, left_index, &left);
        if (r)
                return r;

        r = init_child(info, parent, left_index + 1, &center);
        if (r) {
                exit_child(info, &left);
                return r;
        }

        r = init_child(info, parent, left_index + 2, &right);
        if (r) {
                exit_child(info, &left);
                exit_child(info, &center);
                return r;
        }

        __rebalance3(info, parent, &left, &center, &right);

        r = exit_child(info, &left);
        if (r) {
                exit_child(info, &center);
                exit_child(info, &right);
                return r;
        }

        r = exit_child(info, &center);
        if (r) {
                exit_child(info, &right);
                return r;
        }

        r = exit_child(info, &right);
        if (r)
                return r;

        return 0;
}

static int get_nr_entries(struct dm_transaction_manager *tm,
                          dm_block_t b, uint32_t *result)
{
        int r;
        struct dm_block *block;
        struct node *n;

        r = dm_tm_read_lock(tm, b, &btree_node_validator, &block);
        if (r)
                return r;

        n = dm_block_data(block);
        *result = le32_to_cpu(n->header.nr_entries);

        return dm_tm_unlock(tm, block);
}

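/*
 * Ensures the child we are about to descend into has more than the bare
 * minimum of entries.  A node left with a single child (only possible for
 * the root) is collapsed into that child; otherwise the child is
 * rebalanced with one or two of its siblings, depending on which siblings
 * exist.
 */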
static int rebalance_children(struct shadow_spine *s,
                              struct dm_btree_info *info, uint64_t key)
{
        int i, r, has_left_sibling, has_right_sibling;
        uint32_t child_entries;
        struct node *n;

        n = dm_block_data(shadow_current(s));

        if (le32_to_cpu(n->header.nr_entries) == 1) {
                struct dm_block *child;
                dm_block_t b = value64(n, 0);

                r = dm_tm_read_lock(info->tm, b, &btree_node_validator, &child);
                if (r)
                        return r;

                memcpy(n, dm_block_data(child),
                       dm_bm_block_size(dm_tm_get_bm(info->tm)));
                r = dm_tm_unlock(info->tm, child);
                if (r)
                        return r;

                dm_tm_dec(info->tm, dm_block_location(child));
                return 0;
        }

        i = lower_bound(n, key);
        if (i < 0)
                return -ENODATA;

        r = get_nr_entries(info->tm, value64(n, i), &child_entries);
        if (r)
                return r;

        has_left_sibling = i > 0;
        has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);

        if (!has_left_sibling)
                r = rebalance2(s, info, i);

        else if (!has_right_sibling)
                r = rebalance2(s, info, i - 1);

        else
                r = rebalance3(s, info, i - 1);

        return r;
}

static int do_leaf(struct node *n, uint64_t key, unsigned *index)
{
        int i = lower_bound(n, key);

        if ((i < 0) ||
            (i >= le32_to_cpu(n->header.nr_entries)) ||
            (le64_to_cpu(n->keys[i]) != key))
                return -ENODATA;

        *index = i;

        return 0;
}

/*
 * Prepares for removal from one level of the hierarchy.  The caller must
 * call delete_at() to remove the entry at index.
 */
static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
                      struct dm_btree_value_type *vt, dm_block_t root,
                      uint64_t key, unsigned *index)
{
        int i = *index, r;
        struct node *n;

        for (;;) {
                r = shadow_step(s, root, vt);
                if (r < 0)
                        break;

                /*
                 * We have to patch up the parent node, ugly, but I don't
                 * see a way to do this automatically as part of the spine
                 * op.
                 */
                if (shadow_has_parent(s)) {
                        __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
                        memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
                               &location, sizeof(__le64));
                }

                n = dm_block_data(shadow_current(s));

                if (le32_to_cpu(n->header.flags) & LEAF_NODE)
                        return do_leaf(n, key, index);

                r = rebalance_children(s, info, key);
                if (r)
                        break;

                n = dm_block_data(shadow_current(s));
                if (le32_to_cpu(n->header.flags) & LEAF_NODE)
                        return do_leaf(n, key, index);

                i = lower_bound(n, key);

                /*
                 * We know the key is present, or else
                 * rebalance_children would have returned
                 * -ENODATA.
                 */
                root = value64(n, i);
        }

        return r;
}

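/*
 * Removes the entry for the given key path.  One sub-tree is walked per
 * level; the entry itself is deleted from the leaf of the final level,
 * with the value type's dec method (if any) applied to the removed value.
 * Returns -ENODATA if the key is not present.
 *
 * A minimal usage sketch (illustrative only; the surrounding setup is
 * assumed):
 *
 *	uint64_t key = 123;
 *	dm_block_t new_root;
 *	int r = dm_btree_remove(info, root, &key, &new_root);
 *	if (!r)
 *		root = new_root;
 */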
int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
                    uint64_t *keys, dm_block_t *new_root)
{
        unsigned level, last_level = info->levels - 1;
        int index = 0, r = 0;
        struct shadow_spine spine;
        struct node *n;

        init_shadow_spine(&spine, info);
        for (level = 0; level < info->levels; level++) {
                r = remove_raw(&spine, info,
                               (level == last_level ?
                                &info->value_type : &le64_type),
                               root, keys[level], (unsigned *)&index);
                if (r < 0)
                        break;

                n = dm_block_data(shadow_current(&spine));
                if (level != last_level) {
                        root = value64(n, index);
                        continue;
                }

                BUG_ON(index < 0 || index >= le32_to_cpu(n->header.nr_entries));

                if (info->value_type.dec)
                        info->value_type.dec(info->value_type.context,
                                             value_ptr(n, index));

                delete_at(n, index);
        }

        *new_root = shadow_root(&spine);
        exit_shadow_spine(&spine);

        return r;
}
EXPORT_SYMBOL_GPL(dm_btree_remove);