linux/drivers/md/persistent-data/dm-btree.c
/*
 * Copyright (C) 2011 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-btree-internal.h"
#include "dm-space-map.h"
#include "dm-transaction-manager.h"

#include <linux/export.h>
#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "btree"

/*----------------------------------------------------------------
 * Array manipulation
 *--------------------------------------------------------------*/
static void memcpy_disk(void *dest, const void *src, size_t len)
        __dm_written_to_disk(src)
{
        memcpy(dest, src, len);
        __dm_unbless_for_disk(src);
}

static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
                         unsigned index, void *elt)
        __dm_written_to_disk(elt)
{
        if (index < nr_elts)
                memmove(base + (elt_size * (index + 1)),
                        base + (elt_size * index),
                        (nr_elts - index) * elt_size);

        memcpy_disk(base + (elt_size * index), elt, elt_size);
}
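
/*
 * Purely illustrative example: inserting E at index 1 of the three
 * element array [A, B, C] first memmoves B and C up one slot, then
 * copies E into the gap, giving [A, E, B, C].  The caller must ensure
 * the buffer has room for nr_elts + 1 elements.
 */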

/*----------------------------------------------------------------*/

/* makes the assumption that no two keys are the same. */
static int bsearch(struct btree_node *n, uint64_t key, int want_hi)
{
        int lo = -1, hi = le32_to_cpu(n->header.nr_entries);

        while (hi - lo > 1) {
                int mid = lo + ((hi - lo) / 2);
                uint64_t mid_key = le64_to_cpu(n->keys[mid]);

                if (mid_key == key)
                        return mid;

                if (mid_key < key)
                        lo = mid;
                else
                        hi = mid;
        }

        return want_hi ? hi : lo;
}
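
/*
 * For example, with keys [10, 20, 30]: searching for 20 returns index
 * 1 directly; searching for 25 converges to lo = 1, hi = 2, so a
 * lower bound query (want_hi == 0) returns 1 and an upper bound query
 * returns 2.  A key smaller than every entry leaves lo at -1, which
 * callers treat as 'not found'.
 */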

int lower_bound(struct btree_node *n, uint64_t key)
{
        return bsearch(n, key, 0);
}

void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
                  struct dm_btree_value_type *vt)
{
        unsigned i;
        uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);

        if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
                for (i = 0; i < nr_entries; i++)
                        dm_tm_inc(tm, value64(n, i));
        else if (vt->inc)
                for (i = 0; i < nr_entries; i++)
                        vt->inc(vt->context, value_ptr(n, i));
}

static int insert_at(size_t value_size, struct btree_node *node, unsigned index,
                     uint64_t key, void *value)
                     __dm_written_to_disk(value)
{
        uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
        __le64 key_le = cpu_to_le64(key);

        if (index > nr_entries ||
            index >= le32_to_cpu(node->header.max_entries)) {
                DMERR("too many entries in btree node for insert");
                __dm_unbless_for_disk(value);
                return -ENOMEM;
        }

        __dm_bless_for_disk(&key_le);

        array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
        array_insert(value_base(node), value_size, nr_entries, index, value);
        node->header.nr_entries = cpu_to_le32(nr_entries + 1);

        return 0;
}

/*----------------------------------------------------------------*/

/*
 * We want 3n entries (for some n).  This works more nicely for
 * repeated insert/remove loops than (2n + 1).
 */
static uint32_t calc_max_entries(size_t value_size, size_t block_size)
{
        uint32_t total, n;
        size_t elt_size = sizeof(uint64_t) + value_size; /* key + value */

        block_size -= sizeof(struct node_header);
        total = block_size / elt_size;
        n = total / 3;          /* rounds down */

        return 3 * n;
}
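
/*
 * Worked example (assuming a 4096 byte block and the 32 byte
 * struct node_header of the current on-disk format): with 8 byte
 * values elt_size is 16, so total = (4096 - 32) / 16 = 254, n = 84,
 * and each node holds at most 3 * 84 = 252 entries.
 */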

int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
{
        int r;
        struct dm_block *b;
        struct btree_node *n;
        size_t block_size;
        uint32_t max_entries;

        r = new_block(info, &b);
        if (r < 0)
                return r;

        block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
        max_entries = calc_max_entries(info->value_type.size, block_size);

        n = dm_block_data(b);
        memset(n, 0, block_size);
        n->header.flags = cpu_to_le32(LEAF_NODE);
        n->header.nr_entries = cpu_to_le32(0);
        n->header.max_entries = cpu_to_le32(max_entries);
        n->header.value_size = cpu_to_le32(info->value_type.size);

        *root = dm_block_location(b);
        return unlock_block(info, b);
}
EXPORT_SYMBOL_GPL(dm_btree_empty);
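
/*
 * A minimal caller sketch (hypothetical, assuming an initialised
 * struct dm_transaction_manager *tm): set up a one-level tree of
 * 8 byte values and create an empty root.
 *
 *	struct dm_btree_info info = {
 *		.tm = tm,
 *		.levels = 1,
 *		.value_type = { .size = sizeof(__le64) },
 *	};
 *	dm_block_t root;
 *	int r = dm_btree_empty(&info, &root);
 */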

/*----------------------------------------------------------------*/

/*
 * Deletion is naturally recursive, but since we have limited stack
 * space we explicitly manage our own stack, allocated on the heap.
 */
#define MAX_SPINE_DEPTH 64
struct frame {
        struct dm_block *b;
        struct btree_node *n;
        unsigned level;
        unsigned nr_children;
        unsigned current_child;
};

struct del_stack {
        struct dm_transaction_manager *tm;
        int top;
        struct frame spine[MAX_SPINE_DEPTH];
};

static int top_frame(struct del_stack *s, struct frame **f)
{
        if (s->top < 0) {
                DMERR("btree deletion stack empty");
                return -EINVAL;
        }

        *f = s->spine + s->top;

        return 0;
}

static int unprocessed_frames(struct del_stack *s)
{
        return s->top >= 0;
}

static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
{
        int r;
        uint32_t ref_count;

        if (s->top >= MAX_SPINE_DEPTH - 1) {
                DMERR("btree deletion stack out of memory");
                return -ENOMEM;
        }

        r = dm_tm_ref(s->tm, b, &ref_count);
        if (r)
                return r;

        if (ref_count > 1)
                /*
                 * This is a shared node, so we can just decrement its
                 * reference counter and leave the children.
                 */
                dm_tm_dec(s->tm, b);

        else {
                struct frame *f = s->spine + ++s->top;

                r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
                if (r) {
                        s->top--;
                        return r;
                }

                f->n = dm_block_data(f->b);
                f->level = level;
                f->nr_children = le32_to_cpu(f->n->header.nr_entries);
                f->current_child = 0;
        }

        return 0;
}

static void pop_frame(struct del_stack *s)
{
        struct frame *f = s->spine + s->top--;

        dm_tm_dec(s->tm, dm_block_location(f->b));
        dm_tm_unlock(s->tm, f->b);
}

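/*
 * Note that 'level' here refers to the nesting of btrees
 * (info->levels), not depth within a single tree: on an internal
 * level the leaf values are themselves roots of subordinate btrees.
 */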
static bool is_internal_level(struct dm_btree_info *info, struct frame *f)
{
        return f->level < (info->levels - 1);
}

int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
{
        int r;
        struct del_stack *s;

        s = kmalloc(sizeof(*s), GFP_KERNEL);
        if (!s)
                return -ENOMEM;
        s->tm = info->tm;
        s->top = -1;

        r = push_frame(s, root, 0);
        if (r)
                goto out;

        while (unprocessed_frames(s)) {
                uint32_t flags;
                struct frame *f;
                dm_block_t b;

                r = top_frame(s, &f);
                if (r)
                        goto out;

                if (f->current_child >= f->nr_children) {
                        pop_frame(s);
                        continue;
                }

                flags = le32_to_cpu(f->n->header.flags);
                if (flags & INTERNAL_NODE) {
                        b = value64(f->n, f->current_child);
                        f->current_child++;
                        r = push_frame(s, b, f->level);
                        if (r)
                                goto out;

                } else if (is_internal_level(info, f)) {
                        b = value64(f->n, f->current_child);
                        f->current_child++;
                        r = push_frame(s, b, f->level + 1);
                        if (r)
                                goto out;

                } else {
                        if (info->value_type.dec) {
                                unsigned i;

                                for (i = 0; i < f->nr_children; i++)
                                        info->value_type.dec(info->value_type.context,
                                                             value_ptr(f->n, i));
                        }
                        f->current_child = f->nr_children;
                }
        }

out:
        kfree(s);
        return r;
}
EXPORT_SYMBOL_GPL(dm_btree_del);

/*----------------------------------------------------------------*/

static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
                            int (*search_fn)(struct btree_node *, uint64_t),
                            uint64_t *result_key, void *v, size_t value_size)
{
        int i, r;
        uint32_t flags, nr_entries;

        do {
                r = ro_step(s, block);
                if (r < 0)
                        return r;

                i = search_fn(ro_node(s), key);

                flags = le32_to_cpu(ro_node(s)->header.flags);
                nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
                if (i < 0 || i >= nr_entries)
                        return -ENODATA;

                if (flags & INTERNAL_NODE)
                        block = value64(ro_node(s), i);

        } while (!(flags & LEAF_NODE));

        *result_key = le64_to_cpu(ro_node(s)->keys[i]);
        memcpy(v, value_ptr(ro_node(s), i), value_size);

        return 0;
}

int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
                    uint64_t *keys, void *value_le)
{
        unsigned level, last_level = info->levels - 1;
        int r = -ENODATA;
        uint64_t rkey;
        __le64 internal_value_le;
        struct ro_spine spine;

        init_ro_spine(&spine, info);
        for (level = 0; level < info->levels; level++) {
                size_t size;
                void *value_p;

                if (level == last_level) {
                        value_p = value_le;
                        size = info->value_type.size;

                } else {
                        value_p = &internal_value_le;
                        size = sizeof(uint64_t);
                }

                r = btree_lookup_raw(&spine, root, keys[level],
                                     lower_bound, &rkey,
                                     value_p, size);

                if (!r) {
                        if (rkey != keys[level]) {
                                exit_ro_spine(&spine);
                                return -ENODATA;
                        }
                } else {
                        exit_ro_spine(&spine);
                        return r;
                }

                root = le64_to_cpu(internal_value_le);
        }
        exit_ro_spine(&spine);

        return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup);
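
/*
 * A hypothetical lookup against the tree sketched above: keys has one
 * entry per btree level, and the value comes back in its on-disk
 * (little-endian) format.
 *
 *	uint64_t key = 42;
 *	__le64 value_le;
 *	uint64_t value;
 *
 *	r = dm_btree_lookup(&info, root, &key, &value_le);
 *	if (!r)
 *		value = le64_to_cpu(value_le);
 */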

/*
 * Splits a node by creating a sibling node and shifting half the node's
 * contents across.  Assumes there is a parent node, and that it has room
 * for another child.
 *
 * Before:
 *        +--------+
 *        | Parent |
 *        +--------+
 *           |
 *           v
 *      +----------+
 *      | A ++++++ |
 *      +----------+
 *
 *
 * After:
 *              +--------+
 *              | Parent |
 *              +--------+
 *                |     |
 *                v     +------+
 *          +---------+        |
 *          | A* +++  |        v
 *          +---------+   +-------+
 *                        | B +++ |
 *                        +-------+
 *
 * Where A* is a shadow of A.
 */
static int btree_split_sibling(struct shadow_spine *s, dm_block_t root,
                               unsigned parent_index, uint64_t key)
{
        int r;
        size_t size;
        unsigned nr_left, nr_right;
        struct dm_block *left, *right, *parent;
        struct btree_node *ln, *rn, *pn;
        __le64 location;

        left = shadow_current(s);

        r = new_block(s->info, &right);
        if (r < 0)
                return r;

        ln = dm_block_data(left);
        rn = dm_block_data(right);

        nr_left = le32_to_cpu(ln->header.nr_entries) / 2;
        nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left;

        ln->header.nr_entries = cpu_to_le32(nr_left);

        rn->header.flags = ln->header.flags;
        rn->header.nr_entries = cpu_to_le32(nr_right);
        rn->header.max_entries = ln->header.max_entries;
        rn->header.value_size = ln->header.value_size;
        memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0]));

        size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
                sizeof(uint64_t) : s->info->value_type.size;
        memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left),
               size * nr_right);

        /*
         * Patch up the parent
         */
        parent = shadow_parent(s);

        pn = dm_block_data(parent);
        location = cpu_to_le64(dm_block_location(left));
        __dm_bless_for_disk(&location);
        memcpy_disk(value_ptr(pn, parent_index),
                    &location, sizeof(__le64));

        location = cpu_to_le64(dm_block_location(right));
        __dm_bless_for_disk(&location);

        r = insert_at(sizeof(__le64), pn, parent_index + 1,
                      le64_to_cpu(rn->keys[0]), &location);
        if (r)
                return r;

        if (key < le64_to_cpu(rn->keys[0])) {
                unlock_block(s->info, right);
                s->nodes[1] = left;
        } else {
                unlock_block(s->info, left);
                s->nodes[1] = right;
        }

        return 0;
}

/*
 * Splits a node by creating two new children beneath the given node.
 *
 * Before:
 *        +----------+
 *        | A ++++++ |
 *        +----------+
 *
 *
 * After:
 *      +------------+
 *      | A (shadow) |
 *      +------------+
 *          |   |
 *   +------+   +----+
 *   |               |
 *   v               v
 * +-------+     +-------+
 * | B +++ |     | C +++ |
 * +-------+     +-------+
 */
static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
{
        int r;
        size_t size;
        unsigned nr_left, nr_right;
        struct dm_block *left, *right, *new_parent;
        struct btree_node *pn, *ln, *rn;
        __le64 val;

        new_parent = shadow_current(s);

        r = new_block(s->info, &left);
        if (r < 0)
                return r;

        r = new_block(s->info, &right);
        if (r < 0) {
                /* FIXME: put left */
                return r;
        }

        pn = dm_block_data(new_parent);
        ln = dm_block_data(left);
        rn = dm_block_data(right);

        nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
        nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;

        ln->header.flags = pn->header.flags;
        ln->header.nr_entries = cpu_to_le32(nr_left);
        ln->header.max_entries = pn->header.max_entries;
        ln->header.value_size = pn->header.value_size;

        rn->header.flags = pn->header.flags;
        rn->header.nr_entries = cpu_to_le32(nr_right);
        rn->header.max_entries = pn->header.max_entries;
        rn->header.value_size = pn->header.value_size;

        memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
        memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));

        size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
                sizeof(__le64) : s->info->value_type.size;
        memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
        memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
               nr_right * size);

        /* new_parent should just point to l and r now */
        pn->header.flags = cpu_to_le32(INTERNAL_NODE);
        pn->header.nr_entries = cpu_to_le32(2);
        pn->header.max_entries = cpu_to_le32(
                calc_max_entries(sizeof(__le64),
                                 dm_bm_block_size(
                                         dm_tm_get_bm(s->info->tm))));
        pn->header.value_size = cpu_to_le32(sizeof(__le64));

        val = cpu_to_le64(dm_block_location(left));
        __dm_bless_for_disk(&val);
        pn->keys[0] = ln->keys[0];
        memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));

        val = cpu_to_le64(dm_block_location(right));
        __dm_bless_for_disk(&val);
        pn->keys[1] = rn->keys[0];
        memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));

        /*
         * Rejig the spine.  This is ugly, since it knows too much
         * about the spine.
         */
        if (s->nodes[0] != new_parent) {
                unlock_block(s->info, s->nodes[0]);
                s->nodes[0] = new_parent;
        }
        if (key < le64_to_cpu(rn->keys[0])) {
                unlock_block(s->info, right);
                s->nodes[1] = left;
        } else {
                unlock_block(s->info, left);
                s->nodes[1] = right;
        }
        s->count = 2;

        return 0;
}

static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
                            struct dm_btree_value_type *vt,
                            uint64_t key, unsigned *index)
{
        int r, i = *index, top = 1;
        struct btree_node *node;

        for (;;) {
                r = shadow_step(s, root, vt);
                if (r < 0)
                        return r;

                node = dm_block_data(shadow_current(s));

                /*
                 * We have to patch up the parent node.  Ugly, but I don't
                 * see a way to do this automatically as part of the spine
                 * op.
                 */
                if (shadow_has_parent(s) && i >= 0) { /* FIXME: second clause unnecessary. */
                        __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

                        __dm_bless_for_disk(&location);
                        memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
                                    &location, sizeof(__le64));
                }

                node = dm_block_data(shadow_current(s));

                if (node->header.nr_entries == node->header.max_entries) {
                        if (top)
                                r = btree_split_beneath(s, key);
                        else
                                r = btree_split_sibling(s, root, i, key);

                        if (r < 0)
                                return r;
                }

                node = dm_block_data(shadow_current(s));

                i = lower_bound(node, key);

                if (le32_to_cpu(node->header.flags) & LEAF_NODE)
                        break;

                if (i < 0) {
                        /* change the bounds on the lowest key */
                        node->keys[0] = cpu_to_le64(key);
                        i = 0;
                }

                root = value64(node, i);
                top = 0;
        }

        if (i < 0 || le64_to_cpu(node->keys[i]) != key)
                i++;

        *index = i;
        return 0;
}
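
/*
 * On success *index is either the slot that already holds 'key' or the
 * slot a new entry for it should be inserted at; callers distinguish
 * the two cases by re-checking the key stored at that index.
 */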

static int insert(struct dm_btree_info *info, dm_block_t root,
                  uint64_t *keys, void *value, dm_block_t *new_root,
                  int *inserted)
                  __dm_written_to_disk(value)
{
        int r, need_insert;
        unsigned level, index = -1, last_level = info->levels - 1;
        dm_block_t block = root;
        struct shadow_spine spine;
        struct btree_node *n;
        struct dm_btree_value_type le64_type;

        le64_type.context = NULL;
        le64_type.size = sizeof(__le64);
        le64_type.inc = NULL;
        le64_type.dec = NULL;
        le64_type.equal = NULL;

        init_shadow_spine(&spine, info);

        for (level = 0; level < (info->levels - 1); level++) {
                r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
                if (r < 0)
                        goto bad;

                n = dm_block_data(shadow_current(&spine));
                need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
                               (le64_to_cpu(n->keys[index]) != keys[level]));

                if (need_insert) {
                        dm_block_t new_tree;
                        __le64 new_le;

                        r = dm_btree_empty(info, &new_tree);
                        if (r < 0)
                                goto bad;

                        new_le = cpu_to_le64(new_tree);
                        __dm_bless_for_disk(&new_le);

                        r = insert_at(sizeof(uint64_t), n, index,
                                      keys[level], &new_le);
                        if (r)
                                goto bad;
                }

                if (level < last_level)
                        block = value64(n, index);
        }

        r = btree_insert_raw(&spine, block, &info->value_type,
                             keys[level], &index);
        if (r < 0)
                goto bad;

        n = dm_block_data(shadow_current(&spine));
        need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
                       (le64_to_cpu(n->keys[index]) != keys[level]));

        if (need_insert) {
                if (inserted)
                        *inserted = 1;

                r = insert_at(info->value_type.size, n, index,
                              keys[level], value);
                if (r)
                        goto bad_unblessed;
        } else {
                if (inserted)
                        *inserted = 0;

                if (info->value_type.dec &&
                    (!info->value_type.equal ||
                     !info->value_type.equal(
                             info->value_type.context,
                             value_ptr(n, index),
                             value))) {
                        info->value_type.dec(info->value_type.context,
                                             value_ptr(n, index));
                }
                memcpy_disk(value_ptr(n, index),
                            value, info->value_type.size);
        }

        *new_root = shadow_root(&spine);
        exit_shadow_spine(&spine);

        return 0;

bad:
        __dm_unbless_for_disk(value);
bad_unblessed:
        exit_shadow_spine(&spine);
        return r;
}

int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
                    uint64_t *keys, void *value, dm_block_t *new_root)
                    __dm_written_to_disk(value)
{
        return insert(info, root, keys, value, new_root, NULL);
}
EXPORT_SYMBOL_GPL(dm_btree_insert);
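
/*
 * A hypothetical insert into the tree sketched earlier.  Values are
 * passed in disk (little-endian) format, and because the tree is
 * copy-on-write the returned root must replace the old one.
 *
 *	__le64 value_le = cpu_to_le64(1234);
 *	dm_block_t new_root;
 *
 *	r = dm_btree_insert(&info, root, &key, &value_le, &new_root);
 *	if (!r)
 *		root = new_root;
 */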

int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
                           uint64_t *keys, void *value, dm_block_t *new_root,
                           int *inserted)
                           __dm_written_to_disk(value)
{
        return insert(info, root, keys, value, new_root, inserted);
}
EXPORT_SYMBOL_GPL(dm_btree_insert_notify);

/*----------------------------------------------------------------*/

static int find_highest_key(struct ro_spine *s, dm_block_t block,
                            uint64_t *result_key, dm_block_t *next_block)
{
        int i, r;
        uint32_t flags;

        do {
                r = ro_step(s, block);
                if (r < 0)
                        return r;

                flags = le32_to_cpu(ro_node(s)->header.flags);
                i = le32_to_cpu(ro_node(s)->header.nr_entries);
                if (!i)
                        return -ENODATA;
                else
                        i--;

                *result_key = le64_to_cpu(ro_node(s)->keys[i]);
                if (next_block || flags & INTERNAL_NODE)
                        block = value64(ro_node(s), i);

        } while (flags & INTERNAL_NODE);

        if (next_block)
                *next_block = block;
        return 0;
}

int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
                              uint64_t *result_keys)
{
        int r = 0, count = 0, level;
        struct ro_spine spine;

        init_ro_spine(&spine, info);
        for (level = 0; level < info->levels; level++) {
                r = find_highest_key(&spine, root, result_keys + level,
                                     level == info->levels - 1 ? NULL : &root);
                if (r == -ENODATA) {
                        r = 0;
                        break;

                } else if (r)
                        break;

                count++;
        }
        exit_ro_spine(&spine);

        return r ? r : count;
}
EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);
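
/*
 * For example, on a one-level tree dm_btree_find_highest_key() fills
 * result_keys[0] and returns 1 (the number of levels for which a
 * highest key was found); an empty tree returns 0.
 */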

/*
 * FIXME: We shouldn't use a recursive algorithm when we have limited stack
 * space.  Also this only works for single level trees.
 */
static int walk_node(struct ro_spine *s, dm_block_t block,
                     int (*fn)(void *context, uint64_t *keys, void *leaf),
                     void *context)
{
        int r;
        unsigned i, nr;
        struct btree_node *n;
        uint64_t keys;

        r = ro_step(s, block);
        if (r < 0)
                return r;       /* nothing was pushed, so don't ro_pop() */

        n = ro_node(s);

        nr = le32_to_cpu(n->header.nr_entries);
        for (i = 0; i < nr; i++) {
                if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) {
                        r = walk_node(s, value64(n, i), fn, context);
                        if (r)
                                goto out;
                } else {
                        keys = le64_to_cpu(*key_ptr(n, i));
                        r = fn(context, &keys, value_ptr(n, i));
                        if (r)
                                goto out;
                }
        }

out:
        ro_pop(s);
        return r;
}

int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
                  int (*fn)(void *context, uint64_t *keys, void *leaf),
                  void *context)
{
        int r;
        struct ro_spine spine;

        BUG_ON(info->levels > 1);

        init_ro_spine(&spine, info);
        r = walk_node(&spine, root, fn, context);
        exit_ro_spine(&spine);

        return r;
}
EXPORT_SYMBOL_GPL(dm_btree_walk);
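
/*
 * A hypothetical walk callback that simply counts leaf entries:
 *
 *	static int count_leaf(void *context, uint64_t *keys, void *leaf)
 *	{
 *		(*(unsigned *) context)++;
 *		return 0;
 *	}
 *
 *	unsigned count = 0;
 *
 *	r = dm_btree_walk(&info, root, count_leaf, &count);
 */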