linux/drivers/md/persistent-data/dm-btree.c
/*
 * Copyright (C) 2011 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-btree-internal.h"
#include "dm-space-map.h"
#include "dm-transaction-manager.h"

#include <linux/export.h>
#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "btree"

/*----------------------------------------------------------------
 * Array manipulation
 *--------------------------------------------------------------*/
static void memcpy_disk(void *dest, const void *src, size_t len)
        __dm_written_to_disk(src)
{
        memcpy(dest, src, len);
        __dm_unbless_for_disk(src);
}

static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
                         unsigned index, void *elt)
        __dm_written_to_disk(elt)
{
        if (index < nr_elts)
                memmove(base + (elt_size * (index + 1)),
                        base + (elt_size * index),
                        (nr_elts - index) * elt_size);

        memcpy_disk(base + (elt_size * index), elt, elt_size);
}

/*----------------------------------------------------------------*/

/* makes the assumption that no two keys are the same. */
static int bsearch(struct btree_node *n, uint64_t key, int want_hi)
{
        int lo = -1, hi = le32_to_cpu(n->header.nr_entries);

        while (hi - lo > 1) {
                int mid = lo + ((hi - lo) / 2);
                uint64_t mid_key = le64_to_cpu(n->keys[mid]);

                if (mid_key == key)
                        return mid;

                if (mid_key < key)
                        lo = mid;
                else
                        hi = mid;
        }

        return want_hi ? hi : lo;
}

int lower_bound(struct btree_node *n, uint64_t key)
{
        return bsearch(n, key, 0);
}
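
/*
 * Worked example (not in the original file): for a node whose keys are
 * {10, 20, 30}, lower_bound() returns the index of the greatest key that
 * is less than or equal to the search key:
 *
 *      lower_bound(n, 25) == 1         (keys[1] == 20)
 *      lower_bound(n, 20) == 1         (exact match)
 *      lower_bound(n, 5)  == -1        (key is below every entry)
 */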

void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
                  struct dm_btree_value_type *vt)
{
        unsigned i;
        uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);

        if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
                for (i = 0; i < nr_entries; i++)
                        dm_tm_inc(tm, value64(n, i));
        else if (vt->inc)
                for (i = 0; i < nr_entries; i++)
                        vt->inc(vt->context, value_ptr(n, i));
}

static int insert_at(size_t value_size, struct btree_node *node, unsigned index,
                      uint64_t key, void *value)
                      __dm_written_to_disk(value)
{
        uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
        __le64 key_le = cpu_to_le64(key);

        if (index > nr_entries ||
            index >= le32_to_cpu(node->header.max_entries)) {
                DMERR("too many entries in btree node for insert");
                __dm_unbless_for_disk(value);
                return -ENOMEM;
        }

        __dm_bless_for_disk(&key_le);

        array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
        array_insert(value_base(node), value_size, nr_entries, index, value);
        node->header.nr_entries = cpu_to_le32(nr_entries + 1);

        return 0;
}

/*----------------------------------------------------------------*/

/*
 * We want 3n entries (for some n).  This works more nicely for repeated
 * insert/remove loops than (2n + 1).
 */
static uint32_t calc_max_entries(size_t value_size, size_t block_size)
{
        uint32_t total, n;
        size_t elt_size = sizeof(uint64_t) + value_size; /* key + value */

        block_size -= sizeof(struct node_header);
        total = block_size / elt_size;
        n = total / 3;          /* rounds down */

        return 3 * n;
}
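
/*
 * Worked example (not in the original file), assuming a 4096 byte block
 * and the 32 byte struct node_header defined in dm-btree-internal.h:
 * with 8 byte values, elt_size = 8 + 8 = 16, total = (4096 - 32) / 16
 * = 254, n = 254 / 3 = 84, so max_entries = 3 * 84 = 252.
 */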

int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
{
        int r;
        struct dm_block *b;
        struct btree_node *n;
        size_t block_size;
        uint32_t max_entries;

        r = new_block(info, &b);
        if (r < 0)
                return r;

        block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
        max_entries = calc_max_entries(info->value_type.size, block_size);

        n = dm_block_data(b);
        memset(n, 0, block_size);
        n->header.flags = cpu_to_le32(LEAF_NODE);
        n->header.nr_entries = cpu_to_le32(0);
        n->header.max_entries = cpu_to_le32(max_entries);
        n->header.value_size = cpu_to_le32(info->value_type.size);

        *root = dm_block_location(b);
        return unlock_block(info, b);
}
EXPORT_SYMBOL_GPL(dm_btree_empty);
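
/*
 * Example (illustrative, not part of the original file): a minimal sketch
 * of setting up a single-level btree of 64-bit values and creating an
 * empty root for it.  Assumes an already-constructed transaction manager;
 * the function name is hypothetical.
 */
static int __maybe_unused example_create_tree(struct dm_transaction_manager *tm,
                                              dm_block_t *root)
{
        struct dm_btree_info info;

        memset(&info, 0, sizeof(info));         /* leaves all callbacks NULL */
        info.tm = tm;
        info.levels = 1;
        info.value_type.size = sizeof(__le64);  /* plain 64-bit values */

        return dm_btree_empty(&info, root);
}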

/*----------------------------------------------------------------*/

/*
 * Deletion is conceptually recursive, but kernel stack space is limited,
 * so we manage our own explicit stack on the heap.
 */
#define MAX_SPINE_DEPTH 64
struct frame {
        struct dm_block *b;
        struct btree_node *n;
        unsigned level;
        unsigned nr_children;
        unsigned current_child;
};

struct del_stack {
        struct dm_btree_info *info;
        struct dm_transaction_manager *tm;
        int top;
        struct frame spine[MAX_SPINE_DEPTH];
};

static int top_frame(struct del_stack *s, struct frame **f)
{
        if (s->top < 0) {
                DMERR("btree deletion stack empty");
                return -EINVAL;
        }

        *f = s->spine + s->top;

        return 0;
}

static int unprocessed_frames(struct del_stack *s)
{
        return s->top >= 0;
}

static void prefetch_children(struct del_stack *s, struct frame *f)
{
        unsigned i;
        struct dm_block_manager *bm = dm_tm_get_bm(s->tm);

        for (i = 0; i < f->nr_children; i++)
                dm_bm_prefetch(bm, value64(f->n, i));
}

static bool is_internal_level(struct dm_btree_info *info, struct frame *f)
{
        return f->level < (info->levels - 1);
}

static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
{
        int r;
        uint32_t ref_count;

        if (s->top >= MAX_SPINE_DEPTH - 1) {
                DMERR("btree deletion stack out of memory");
                return -ENOMEM;
        }

        r = dm_tm_ref(s->tm, b, &ref_count);
        if (r)
                return r;

        if (ref_count > 1)
                /*
                 * This is a shared node, so we can just decrement its
                 * reference counter and leave the children.
                 */
                dm_tm_dec(s->tm, b);

        else {
                uint32_t flags;
                struct frame *f = s->spine + ++s->top;

                r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
                if (r) {
                        s->top--;
                        return r;
                }

                f->n = dm_block_data(f->b);
                f->level = level;
                f->nr_children = le32_to_cpu(f->n->header.nr_entries);
                f->current_child = 0;

                flags = le32_to_cpu(f->n->header.flags);
                if (flags & INTERNAL_NODE || is_internal_level(s->info, f))
                        prefetch_children(s, f);
        }

        return 0;
}

static void pop_frame(struct del_stack *s)
{
        struct frame *f = s->spine + s->top--;

        dm_tm_dec(s->tm, dm_block_location(f->b));
        dm_tm_unlock(s->tm, f->b);
}

int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
{
        int r;
        struct del_stack *s;

        s = kmalloc(sizeof(*s), GFP_NOIO);
        if (!s)
                return -ENOMEM;
        s->info = info;
        s->tm = info->tm;
        s->top = -1;

        r = push_frame(s, root, 0);
        if (r)
                goto out;

        while (unprocessed_frames(s)) {
                uint32_t flags;
                struct frame *f;
                dm_block_t b;

                r = top_frame(s, &f);
                if (r)
                        goto out;

                if (f->current_child >= f->nr_children) {
                        pop_frame(s);
                        continue;
                }

                flags = le32_to_cpu(f->n->header.flags);
                if (flags & INTERNAL_NODE) {
                        b = value64(f->n, f->current_child);
                        f->current_child++;
                        r = push_frame(s, b, f->level);
                        if (r)
                                goto out;

                } else if (is_internal_level(info, f)) {
                        b = value64(f->n, f->current_child);
                        f->current_child++;
                        r = push_frame(s, b, f->level + 1);
                        if (r)
                                goto out;

                } else {
                        if (info->value_type.dec) {
                                unsigned i;

                                for (i = 0; i < f->nr_children; i++)
                                        info->value_type.dec(info->value_type.context,
                                                             value_ptr(f->n, i));
                        }
                        pop_frame(s);
                }
        }

out:
        kfree(s);
        return r;
}
EXPORT_SYMBOL_GPL(dm_btree_del);
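
/*
 * Usage note (not in the original file): a caller that no longer
 * references a tree simply does
 *
 *      r = dm_btree_del(info, root);
 *
 * Shared subtrees (reference count > 1) only have their counters
 * decremented by push_frame(); their children are left untouched.
 */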

/*----------------------------------------------------------------*/

static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
                            int (*search_fn)(struct btree_node *, uint64_t),
                            uint64_t *result_key, void *v, size_t value_size)
{
        int i, r;
        uint32_t flags, nr_entries;

        do {
                r = ro_step(s, block);
                if (r < 0)
                        return r;

                i = search_fn(ro_node(s), key);

                flags = le32_to_cpu(ro_node(s)->header.flags);
                nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
                if (i < 0 || i >= nr_entries)
                        return -ENODATA;

                if (flags & INTERNAL_NODE)
                        block = value64(ro_node(s), i);

        } while (!(flags & LEAF_NODE));

        *result_key = le64_to_cpu(ro_node(s)->keys[i]);
        memcpy(v, value_ptr(ro_node(s), i), value_size);

        return 0;
}

int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
                    uint64_t *keys, void *value_le)
{
        unsigned level, last_level = info->levels - 1;
        int r = -ENODATA;
        uint64_t rkey;
        __le64 internal_value_le;
        struct ro_spine spine;

        init_ro_spine(&spine, info);
        for (level = 0; level < info->levels; level++) {
                size_t size;
                void *value_p;

                if (level == last_level) {
                        value_p = value_le;
                        size = info->value_type.size;

                } else {
                        value_p = &internal_value_le;
                        size = sizeof(uint64_t);
                }

                r = btree_lookup_raw(&spine, root, keys[level],
                                     lower_bound, &rkey,
                                     value_p, size);

                if (!r) {
                        if (rkey != keys[level]) {
                                exit_ro_spine(&spine);
                                return -ENODATA;
                        }
                } else {
                        exit_ro_spine(&spine);
                        return r;
                }

                root = le64_to_cpu(internal_value_le);
        }
        exit_ro_spine(&spine);

        return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup);
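
/*
 * Example (illustrative, not part of the original file): looking up a
 * single 64-bit value in a one-level tree.  The function name is
 * hypothetical; note that values come back in little-endian disk format
 * and must be converted by the caller.
 */
static int __maybe_unused example_lookup(struct dm_btree_info *info,
                                         dm_block_t root, uint64_t key)
{
        __le64 value_le;
        int r;

        r = dm_btree_lookup(info, root, &key, &value_le);
        if (r == -ENODATA)
                return r;       /* key not present */
        if (r < 0)
                return r;       /* I/O or validation error */

        DMINFO("key %llu -> value %llu", (unsigned long long) key,
               (unsigned long long) le64_to_cpu(value_le));
        return 0;
}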

/*
 * Splits a node by creating a sibling node and shifting half the node's
 * contents across.  Assumes there is a parent node, and it has room for
 * another child.
 *
 * Before:
 *        +--------+
 *        | Parent |
 *        +--------+
 *           |
 *           v
 *      +----------+
 *      | A ++++++ |
 *      +----------+
 *
 *
 * After:
 *              +--------+
 *              | Parent |
 *              +--------+
 *                |     |
 *                v     +------+
 *          +---------+        |
 *          | A* +++  |        v
 *          +---------+   +-------+
 *                        | B +++ |
 *                        +-------+
 *
 * Where A* is a shadow of A.
 */
static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index,
                               uint64_t key)
{
        int r;
        size_t size;
        unsigned nr_left, nr_right;
        struct dm_block *left, *right, *parent;
        struct btree_node *ln, *rn, *pn;
        __le64 location;

        left = shadow_current(s);

        r = new_block(s->info, &right);
        if (r < 0)
                return r;

        ln = dm_block_data(left);
        rn = dm_block_data(right);

        nr_left = le32_to_cpu(ln->header.nr_entries) / 2;
        nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left;

        ln->header.nr_entries = cpu_to_le32(nr_left);

        rn->header.flags = ln->header.flags;
        rn->header.nr_entries = cpu_to_le32(nr_right);
        rn->header.max_entries = ln->header.max_entries;
        rn->header.value_size = ln->header.value_size;
        memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0]));

        size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
                sizeof(uint64_t) : s->info->value_type.size;
        memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left),
               size * nr_right);

        /*
         * Patch up the parent
         */
        parent = shadow_parent(s);

        pn = dm_block_data(parent);
        location = cpu_to_le64(dm_block_location(left));
        __dm_bless_for_disk(&location);
        memcpy_disk(value_ptr(pn, parent_index),
                    &location, sizeof(__le64));

        location = cpu_to_le64(dm_block_location(right));
        __dm_bless_for_disk(&location);

        r = insert_at(sizeof(__le64), pn, parent_index + 1,
                      le64_to_cpu(rn->keys[0]), &location);
        if (r)
                return r;

        if (key < le64_to_cpu(rn->keys[0])) {
                unlock_block(s->info, right);
                s->nodes[1] = left;
        } else {
                unlock_block(s->info, left);
                s->nodes[1] = right;
        }

        return 0;
}
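
/*
 * Worked example (not in the original file): splitting a full node of
 * 252 entries leaves nr_left = 126 and nr_right = 126; an odd count such
 * as 251 gives nr_left = 125 and nr_right = 126.
 */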

/*
 * Splits a node by creating two new children beneath the given node.
 *
 * Before:
 *        +----------+
 *        | A ++++++ |
 *        +----------+
 *
 *
 * After:
 *      +------------+
 *      | A (shadow) |
 *      +------------+
 *          |   |
 *   +------+   +----+
 *   |               |
 *   v               v
 * +-------+     +-------+
 * | B +++ |     | C +++ |
 * +-------+     +-------+
 */
static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
{
        int r;
        size_t size;
        unsigned nr_left, nr_right;
        struct dm_block *left, *right, *new_parent;
        struct btree_node *pn, *ln, *rn;
        __le64 val;

        new_parent = shadow_current(s);

        r = new_block(s->info, &left);
        if (r < 0)
                return r;

        r = new_block(s->info, &right);
        if (r < 0) {
                unlock_block(s->info, left);
                return r;
        }

        pn = dm_block_data(new_parent);
        ln = dm_block_data(left);
        rn = dm_block_data(right);

        nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
        nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;

        ln->header.flags = pn->header.flags;
        ln->header.nr_entries = cpu_to_le32(nr_left);
        ln->header.max_entries = pn->header.max_entries;
        ln->header.value_size = pn->header.value_size;

        rn->header.flags = pn->header.flags;
        rn->header.nr_entries = cpu_to_le32(nr_right);
        rn->header.max_entries = pn->header.max_entries;
        rn->header.value_size = pn->header.value_size;

        memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
        memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));

        size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
                sizeof(__le64) : s->info->value_type.size;
        memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
        memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
               nr_right * size);

        /* new_parent should just point to l and r now */
        pn->header.flags = cpu_to_le32(INTERNAL_NODE);
        pn->header.nr_entries = cpu_to_le32(2);
        pn->header.max_entries = cpu_to_le32(
                calc_max_entries(sizeof(__le64),
                                 dm_bm_block_size(
                                         dm_tm_get_bm(s->info->tm))));
        pn->header.value_size = cpu_to_le32(sizeof(__le64));

        val = cpu_to_le64(dm_block_location(left));
        __dm_bless_for_disk(&val);
        pn->keys[0] = ln->keys[0];
        memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));

        val = cpu_to_le64(dm_block_location(right));
        __dm_bless_for_disk(&val);
        pn->keys[1] = rn->keys[0];
        memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));

        /*
         * rejig the spine.  This is ugly, since it knows too
         * much about the spine
         */
        if (s->nodes[0] != new_parent) {
                unlock_block(s->info, s->nodes[0]);
                s->nodes[0] = new_parent;
        }
        if (key < le64_to_cpu(rn->keys[0])) {
                unlock_block(s->info, right);
                s->nodes[1] = left;
        } else {
                unlock_block(s->info, left);
                s->nodes[1] = right;
        }
        s->count = 2;

        return 0;
}

static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
                            struct dm_btree_value_type *vt,
                            uint64_t key, unsigned *index)
{
        int r, i = *index, top = 1;
        struct btree_node *node;

        for (;;) {
                r = shadow_step(s, root, vt);
                if (r < 0)
                        return r;

                node = dm_block_data(shadow_current(s));

                /*
                 * We have to patch up the parent node.  Ugly, but I don't
                 * see a way to do this automatically as part of the spine
                 * op.
                 */
                if (shadow_has_parent(s) && i >= 0) { /* FIXME: second clause unnecessary */
                        __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

                        __dm_bless_for_disk(&location);
                        memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
                                    &location, sizeof(__le64));
                }

                node = dm_block_data(shadow_current(s));

                if (node->header.nr_entries == node->header.max_entries) {
                        if (top)
                                r = btree_split_beneath(s, key);
                        else
                                r = btree_split_sibling(s, i, key);

                        if (r < 0)
                                return r;
                }

                node = dm_block_data(shadow_current(s));

                i = lower_bound(node, key);

                if (le32_to_cpu(node->header.flags) & LEAF_NODE)
                        break;

                if (i < 0) {
                        /* change the bounds on the lowest key */
                        node->keys[0] = cpu_to_le64(key);
                        i = 0;
                }

                root = value64(node, i);
                top = 0;
        }

        if (i < 0 || le64_to_cpu(node->keys[i]) != key)
                i++;

        *index = i;
        return 0;
}

static int insert(struct dm_btree_info *info, dm_block_t root,
                  uint64_t *keys, void *value, dm_block_t *new_root,
                  int *inserted)
                  __dm_written_to_disk(value)
{
        int r, need_insert;
        unsigned level, index = -1, last_level = info->levels - 1;
        dm_block_t block = root;
        struct shadow_spine spine;
        struct btree_node *n;
        struct dm_btree_value_type le64_type;

        init_le64_type(info->tm, &le64_type);
        init_shadow_spine(&spine, info);

        for (level = 0; level < (info->levels - 1); level++) {
                r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
                if (r < 0)
                        goto bad;

                n = dm_block_data(shadow_current(&spine));
                need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
                               (le64_to_cpu(n->keys[index]) != keys[level]));

                if (need_insert) {
                        dm_block_t new_tree;
                        __le64 new_le;

                        r = dm_btree_empty(info, &new_tree);
                        if (r < 0)
                                goto bad;

                        new_le = cpu_to_le64(new_tree);
                        __dm_bless_for_disk(&new_le);

                        r = insert_at(sizeof(uint64_t), n, index,
                                      keys[level], &new_le);
                        if (r)
                                goto bad;
                }

                if (level < last_level)
                        block = value64(n, index);
        }

        r = btree_insert_raw(&spine, block, &info->value_type,
                             keys[level], &index);
        if (r < 0)
                goto bad;

        n = dm_block_data(shadow_current(&spine));
        need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
                       (le64_to_cpu(n->keys[index]) != keys[level]));

        if (need_insert) {
                if (inserted)
                        *inserted = 1;

                r = insert_at(info->value_type.size, n, index,
                              keys[level], value);
                if (r)
                        goto bad_unblessed;
        } else {
                if (inserted)
                        *inserted = 0;

                if (info->value_type.dec &&
                    (!info->value_type.equal ||
                     !info->value_type.equal(
                             info->value_type.context,
                             value_ptr(n, index),
                             value))) {
                        info->value_type.dec(info->value_type.context,
                                             value_ptr(n, index));
                }
                memcpy_disk(value_ptr(n, index),
                            value, info->value_type.size);
        }

        *new_root = shadow_root(&spine);
        exit_shadow_spine(&spine);

        return 0;

bad:
        __dm_unbless_for_disk(value);
bad_unblessed:
        exit_shadow_spine(&spine);
        return r;
}

int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
                    uint64_t *keys, void *value, dm_block_t *new_root)
                    __dm_written_to_disk(value)
{
        return insert(info, root, keys, value, new_root, NULL);
}
EXPORT_SYMBOL_GPL(dm_btree_insert);

int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
                           uint64_t *keys, void *value, dm_block_t *new_root,
                           int *inserted)
                           __dm_written_to_disk(value)
{
        return insert(info, root, keys, value, new_root, inserted);
}
EXPORT_SYMBOL_GPL(dm_btree_insert_notify);
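
/*
 * Example (illustrative, not part of the original file): inserting a
 * 64-bit value into a one-level tree.  The function name is hypothetical.
 * Because the spine is shadowed (copy-on-write), the caller must switch
 * over to the returned new root.
 */
static int __maybe_unused example_insert(struct dm_btree_info *info,
                                         dm_block_t root, uint64_t key,
                                         uint64_t value, dm_block_t *new_root)
{
        __le64 value_le = cpu_to_le64(value);
        int inserted;

        /* values are blessed before being handed over for writing */
        __dm_bless_for_disk(&value_le);
        return dm_btree_insert_notify(info, root, &key, &value_le,
                                      new_root, &inserted);
}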

/*----------------------------------------------------------------*/

static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest,
                    uint64_t *result_key, dm_block_t *next_block)
{
        int i, r;
        uint32_t flags;

        do {
                r = ro_step(s, block);
                if (r < 0)
                        return r;

                flags = le32_to_cpu(ro_node(s)->header.flags);
                i = le32_to_cpu(ro_node(s)->header.nr_entries);
                if (!i)
                        return -ENODATA;
                else
                        i--;

                if (find_highest)
                        *result_key = le64_to_cpu(ro_node(s)->keys[i]);
                else
                        *result_key = le64_to_cpu(ro_node(s)->keys[0]);

                if (next_block || flags & INTERNAL_NODE)
                        block = value64(ro_node(s), i);

        } while (flags & INTERNAL_NODE);

        if (next_block)
                *next_block = block;
        return 0;
}

static int dm_btree_find_key(struct dm_btree_info *info, dm_block_t root,
                             bool find_highest, uint64_t *result_keys)
{
        int r = 0, count = 0, level;
        struct ro_spine spine;

        init_ro_spine(&spine, info);
        for (level = 0; level < info->levels; level++) {
                r = find_key(&spine, root, find_highest, result_keys + level,
                             level == info->levels - 1 ? NULL : &root);
                if (r == -ENODATA) {
                        r = 0;
                        break;

                } else if (r)
                        break;

                count++;
        }
        exit_ro_spine(&spine);

        return r ? r : count;
}

int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
                              uint64_t *result_keys)
{
        return dm_btree_find_key(info, root, true, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);

int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root,
                             uint64_t *result_keys)
{
        return dm_btree_find_key(info, root, false, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key);
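
/*
 * Usage note (not in the original file): both functions return the number
 * of levels for which a key was found, so a return value equal to
 * info->levels means every level yielded a key, 0 means the top level was
 * empty, and negative values are errors.
 */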

/*----------------------------------------------------------------*/

/*
 * FIXME: We shouldn't use a recursive algorithm when we have limited stack
 * space.  Also this only works for single-level trees.
 */
static int walk_node(struct dm_btree_info *info, dm_block_t block,
                     int (*fn)(void *context, uint64_t *keys, void *leaf),
                     void *context)
{
        int r;
        unsigned i, nr;
        struct dm_block *node;
        struct btree_node *n;
        uint64_t keys;

        r = bn_read_lock(info, block, &node);
        if (r)
                return r;

        n = dm_block_data(node);

        nr = le32_to_cpu(n->header.nr_entries);
        for (i = 0; i < nr; i++) {
                if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) {
                        r = walk_node(info, value64(n, i), fn, context);
                        if (r)
                                goto out;
                } else {
                        keys = le64_to_cpu(*key_ptr(n, i));
                        r = fn(context, &keys, value_ptr(n, i));
                        if (r)
                                goto out;
                }
        }

out:
        dm_tm_unlock(info->tm, node);
        return r;
}

int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
                  int (*fn)(void *context, uint64_t *keys, void *leaf),
                  void *context)
{
        BUG_ON(info->levels > 1);
        return walk_node(info, root, fn, context);
}
EXPORT_SYMBOL_GPL(dm_btree_walk);
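
/*
 * Example (illustrative, not part of the original file): counting the
 * leaf entries of a single-level tree.  The callback and the snippet
 * below are hypothetical:
 *
 *      unsigned count = 0;
 *      r = dm_btree_walk(info, root, example_count_fn, &count);
 */
static int __maybe_unused example_count_fn(void *context, uint64_t *keys,
                                           void *leaf)
{
        unsigned *count = context;

        (*count)++;
        return 0;       /* returning non-zero stops the walk */
}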