linux/drivers/md/persistent-data/dm-btree.c
/*
 * Copyright (C) 2011 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-btree-internal.h"
#include "dm-space-map.h"
#include "dm-transaction-manager.h"

#include <linux/export.h>
#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "btree"

/*----------------------------------------------------------------
 * Array manipulation
 *--------------------------------------------------------------*/
static void memcpy_disk(void *dest, const void *src, size_t len)
        __dm_written_to_disk(src)
{
        memcpy(dest, src, len);
        __dm_unbless_for_disk(src);
}

static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
                         unsigned index, void *elt)
        __dm_written_to_disk(elt)
{
        if (index < nr_elts)
                memmove(base + (elt_size * (index + 1)),
                        base + (elt_size * index),
                        (nr_elts - index) * elt_size);

        memcpy_disk(base + (elt_size * index), elt, elt_size);
}
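
/*
 * Illustrative example (not part of the original source): inserting E
 * at index 1 into an array holding { A, B, C } (nr_elts == 3) first
 * memmove()s B and C up one slot, then copies E in, giving
 * { A, E, B, C }.  The caller must ensure the array has room for
 * nr_elts + 1 elements.
 */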

/*----------------------------------------------------------------*/

/* makes the assumption that no two keys are the same. */
static int bsearch(struct btree_node *n, uint64_t key, int want_hi)
{
        int lo = -1, hi = le32_to_cpu(n->header.nr_entries);

        while (hi - lo > 1) {
                int mid = lo + ((hi - lo) / 2);
                uint64_t mid_key = le64_to_cpu(n->keys[mid]);

                if (mid_key == key)
                        return mid;

                if (mid_key < key)
                        lo = mid;
                else
                        hi = mid;
        }

        return want_hi ? hi : lo;
}

int lower_bound(struct btree_node *n, uint64_t key)
{
        return bsearch(n, key, 0);
}

static int upper_bound(struct btree_node *n, uint64_t key)
{
        return bsearch(n, key, 1);
}
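
/*
 * Illustrative behaviour (not part of the original source): for a node
 * with keys { 10, 20, 30 } (nr_entries == 3),
 *
 *      lower_bound(n, 20) == 1    exact match
 *      lower_bound(n, 25) == 1    largest key <= 25 is keys[1]
 *      lower_bound(n, 5)  == -1   no key <= 5
 *      upper_bound(n, 25) == 2    smallest key >= 25 is keys[2]
 *      upper_bound(n, 35) == 3    == nr_entries, no key >= 35
 */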

void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
                  struct dm_btree_value_type *vt)
{
        uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);

        if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
                dm_tm_with_runs(tm, value_ptr(n, 0), nr_entries, dm_tm_inc_range);

        else if (vt->inc)
                vt->inc(vt->context, value_ptr(n, 0), nr_entries);
}

static int insert_at(size_t value_size, struct btree_node *node, unsigned index,
                      uint64_t key, void *value)
                      __dm_written_to_disk(value)
{
        uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
        __le64 key_le = cpu_to_le64(key);

        if (index > nr_entries ||
            index >= le32_to_cpu(node->header.max_entries)) {
                DMERR("too many entries in btree node for insert");
                __dm_unbless_for_disk(value);
                return -ENOMEM;
        }

        __dm_bless_for_disk(&key_le);

        array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
        array_insert(value_base(node), value_size, nr_entries, index, value);
        node->header.nr_entries = cpu_to_le32(nr_entries + 1);

        return 0;
}

/*----------------------------------------------------------------*/

/*
 * We want 3n entries (for some n).  This works more nicely for repeated
 * insert/remove loops than (2n + 1).
 */
static uint32_t calc_max_entries(size_t value_size, size_t block_size)
{
        uint32_t total, n;
        size_t elt_size = sizeof(uint64_t) + value_size; /* key + value */

        block_size -= sizeof(struct node_header);
        total = block_size / elt_size;
        n = total / 3;          /* rounds down */

        return 3 * n;
}
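
/*
 * Worked example (illustrative; assumes a 4096 byte block and a 32 byte
 * struct node_header): with an 8 byte value, elt_size = 8 + 8 = 16,
 * total = (4096 - 32) / 16 = 254, n = 254 / 3 = 84, so max_entries is
 * 252 rather than the full 254.
 */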

int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
{
        int r;
        struct dm_block *b;
        struct btree_node *n;
        size_t block_size;
        uint32_t max_entries;

        r = new_block(info, &b);
        if (r < 0)
                return r;

        block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
        max_entries = calc_max_entries(info->value_type.size, block_size);

        n = dm_block_data(b);
        memset(n, 0, block_size);
        n->header.flags = cpu_to_le32(LEAF_NODE);
        n->header.nr_entries = cpu_to_le32(0);
        n->header.max_entries = cpu_to_le32(max_entries);
        n->header.value_size = cpu_to_le32(info->value_type.size);

        *root = dm_block_location(b);
        unlock_block(info, b);

        return 0;
}
EXPORT_SYMBOL_GPL(dm_btree_empty);
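
/*
 * Usage sketch (illustrative only): creating an empty, single level
 * tree that stores __le64 values.
 *
 *      struct dm_btree_info info = {
 *              .tm = tm,
 *              .levels = 1,
 *              .value_type = { .size = sizeof(__le64) },
 *      };
 *      dm_block_t root;
 *      int r = dm_btree_empty(&info, &root);
 */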

/*----------------------------------------------------------------*/

/*
 * Deletion uses a recursive algorithm; since we have limited stack space,
 * we explicitly manage our own stack on the heap.
 */
#define MAX_SPINE_DEPTH 64
struct frame {
        struct dm_block *b;
        struct btree_node *n;
        unsigned level;
        unsigned nr_children;
        unsigned current_child;
};

struct del_stack {
        struct dm_btree_info *info;
        struct dm_transaction_manager *tm;
        int top;
        struct frame spine[MAX_SPINE_DEPTH];
};

static int top_frame(struct del_stack *s, struct frame **f)
{
        if (s->top < 0) {
                DMERR("btree deletion stack empty");
                return -EINVAL;
        }

        *f = s->spine + s->top;

        return 0;
}

static int unprocessed_frames(struct del_stack *s)
{
        return s->top >= 0;
}

static void prefetch_children(struct del_stack *s, struct frame *f)
{
        unsigned i;
        struct dm_block_manager *bm = dm_tm_get_bm(s->tm);

        for (i = 0; i < f->nr_children; i++)
                dm_bm_prefetch(bm, value64(f->n, i));
}

static bool is_internal_level(struct dm_btree_info *info, struct frame *f)
{
        return f->level < (info->levels - 1);
}

static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
{
        int r;
        uint32_t ref_count;

        if (s->top >= MAX_SPINE_DEPTH - 1) {
                DMERR("btree deletion stack out of memory");
                return -ENOMEM;
        }

        r = dm_tm_ref(s->tm, b, &ref_count);
        if (r)
                return r;

        if (ref_count > 1)
                /*
                 * This is a shared node, so we can just decrement its
                 * reference counter and leave the children.
                 */
                dm_tm_dec(s->tm, b);

        else {
                uint32_t flags;
                struct frame *f = s->spine + ++s->top;

                r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
                if (r) {
                        s->top--;
                        return r;
                }

                f->n = dm_block_data(f->b);
                f->level = level;
                f->nr_children = le32_to_cpu(f->n->header.nr_entries);
                f->current_child = 0;

                flags = le32_to_cpu(f->n->header.flags);
                if (flags & INTERNAL_NODE || is_internal_level(s->info, f))
                        prefetch_children(s, f);
        }

        return 0;
}

static void pop_frame(struct del_stack *s)
{
        struct frame *f = s->spine + s->top--;

        dm_tm_dec(s->tm, dm_block_location(f->b));
        dm_tm_unlock(s->tm, f->b);
}

static void unlock_all_frames(struct del_stack *s)
{
        struct frame *f;

        while (unprocessed_frames(s)) {
                f = s->spine + s->top--;
                dm_tm_unlock(s->tm, f->b);
        }
}

int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
{
        int r;
        struct del_stack *s;

        /*
         * dm_btree_del() is called via an ioctl, so it should be
         * considered an FS op.  We can't recurse back into the FS,
         * so we allocate GFP_NOFS.
         */
        s = kmalloc(sizeof(*s), GFP_NOFS);
        if (!s)
                return -ENOMEM;
        s->info = info;
        s->tm = info->tm;
        s->top = -1;

        r = push_frame(s, root, 0);
        if (r)
                goto out;

        while (unprocessed_frames(s)) {
                uint32_t flags;
                struct frame *f;
                dm_block_t b;

                r = top_frame(s, &f);
                if (r)
                        goto out;

                if (f->current_child >= f->nr_children) {
                        pop_frame(s);
                        continue;
                }

                flags = le32_to_cpu(f->n->header.flags);
                if (flags & INTERNAL_NODE) {
                        b = value64(f->n, f->current_child);
                        f->current_child++;
                        r = push_frame(s, b, f->level);
                        if (r)
                                goto out;

                } else if (is_internal_level(info, f)) {
                        b = value64(f->n, f->current_child);
                        f->current_child++;
                        r = push_frame(s, b, f->level + 1);
                        if (r)
                                goto out;

                } else {
                        if (info->value_type.dec)
                                info->value_type.dec(info->value_type.context,
                                                     value_ptr(f->n, 0), f->nr_children);
                        pop_frame(s);
                }
        }
out:
        if (r) {
                /* cleanup all frames of del_stack */
                unlock_all_frames(s);
        }
        kfree(s);

        return r;
}
EXPORT_SYMBOL_GPL(dm_btree_del);

/*----------------------------------------------------------------*/

static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
                            int (*search_fn)(struct btree_node *, uint64_t),
                            uint64_t *result_key, void *v, size_t value_size)
{
        int i, r;
        uint32_t flags, nr_entries;

        do {
                r = ro_step(s, block);
                if (r < 0)
                        return r;

                i = search_fn(ro_node(s), key);

                flags = le32_to_cpu(ro_node(s)->header.flags);
                nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
                if (i < 0 || i >= nr_entries)
                        return -ENODATA;

                if (flags & INTERNAL_NODE)
                        block = value64(ro_node(s), i);

        } while (!(flags & LEAF_NODE));

        *result_key = le64_to_cpu(ro_node(s)->keys[i]);
        if (v)
                memcpy(v, value_ptr(ro_node(s), i), value_size);

        return 0;
}

int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
                    uint64_t *keys, void *value_le)
{
        unsigned level, last_level = info->levels - 1;
        int r = -ENODATA;
        uint64_t rkey;
        __le64 internal_value_le;
        struct ro_spine spine;

        init_ro_spine(&spine, info);
        for (level = 0; level < info->levels; level++) {
                size_t size;
                void *value_p;

                if (level == last_level) {
                        value_p = value_le;
                        size = info->value_type.size;

                } else {
                        value_p = &internal_value_le;
                        size = sizeof(uint64_t);
                }

                r = btree_lookup_raw(&spine, root, keys[level],
                                     lower_bound, &rkey,
                                     value_p, size);

                if (!r) {
                        if (rkey != keys[level]) {
                                exit_ro_spine(&spine);
                                return -ENODATA;
                        }
                } else {
                        exit_ro_spine(&spine);
                        return r;
                }

                root = le64_to_cpu(internal_value_le);
        }
        exit_ro_spine(&spine);

        return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup);
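
/*
 * Usage sketch (illustrative only): looking up one key in a single
 * level tree of __le64 values.
 *
 *      uint64_t key = 1234;
 *      __le64 value_le;
 *      int r = dm_btree_lookup(&info, root, &key, &value_le);
 *
 * -ENODATA means the key is not present.
 */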

static int dm_btree_lookup_next_single(struct dm_btree_info *info, dm_block_t root,
                                       uint64_t key, uint64_t *rkey, void *value_le)
{
        int r, i;
        uint32_t flags, nr_entries;
        struct dm_block *node;
        struct btree_node *n;

        r = bn_read_lock(info, root, &node);
        if (r)
                return r;

        n = dm_block_data(node);
        flags = le32_to_cpu(n->header.flags);
        nr_entries = le32_to_cpu(n->header.nr_entries);

        if (flags & INTERNAL_NODE) {
                i = lower_bound(n, key);
                if (i < 0) {
                        /*
                         * avoid early -ENODATA return when all entries are
                         * higher than the search @key.
                         */
                        i = 0;
                }
                if (i >= nr_entries) {
                        r = -ENODATA;
                        goto out;
                }

                r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
                if (r == -ENODATA && i < (nr_entries - 1)) {
                        i++;
                        r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
                }

        } else {
                i = upper_bound(n, key);
                if (i < 0 || i >= nr_entries) {
                        r = -ENODATA;
                        goto out;
                }

                *rkey = le64_to_cpu(n->keys[i]);
                memcpy(value_le, value_ptr(n, i), info->value_type.size);
        }
out:
        dm_tm_unlock(info->tm, node);
        return r;
}

int dm_btree_lookup_next(struct dm_btree_info *info, dm_block_t root,
                         uint64_t *keys, uint64_t *rkey, void *value_le)
{
        unsigned level;
        int r = -ENODATA;
        __le64 internal_value_le;
        struct ro_spine spine;

        init_ro_spine(&spine, info);
        for (level = 0; level < info->levels - 1u; level++) {
                r = btree_lookup_raw(&spine, root, keys[level],
                                     lower_bound, rkey,
                                     &internal_value_le, sizeof(uint64_t));
                if (r)
                        goto out;

                if (*rkey != keys[level]) {
                        r = -ENODATA;
                        goto out;
                }

                root = le64_to_cpu(internal_value_le);
        }

        r = dm_btree_lookup_next_single(info, root, keys[level], rkey, value_le);
out:
        exit_ro_spine(&spine);
        return r;
}

EXPORT_SYMBOL_GPL(dm_btree_lookup_next);

/*----------------------------------------------------------------*/

/*
 * Copies entries from one region of a btree node to another.  The regions
 * must not overlap.
 */
static void copy_entries(struct btree_node *dest, unsigned dest_offset,
                         struct btree_node *src, unsigned src_offset,
                         unsigned count)
{
        size_t value_size = le32_to_cpu(dest->header.value_size);
        memcpy(dest->keys + dest_offset, src->keys + src_offset, count * sizeof(uint64_t));
        memcpy(value_ptr(dest, dest_offset), value_ptr(src, src_offset), count * value_size);
}

/*
 * Moves entries from one region of a btree node to another.  The regions
 * may overlap.
 */
static void move_entries(struct btree_node *dest, unsigned dest_offset,
                         struct btree_node *src, unsigned src_offset,
                         unsigned count)
{
        size_t value_size = le32_to_cpu(dest->header.value_size);
        memmove(dest->keys + dest_offset, src->keys + src_offset, count * sizeof(uint64_t));
        memmove(value_ptr(dest, dest_offset), value_ptr(src, src_offset), count * value_size);
}

/*
 * Erases the first 'count' entries of a btree node, shifting following
 * entries down into their place.
 */
static void shift_down(struct btree_node *n, unsigned count)
{
        move_entries(n, 0, n, count, le32_to_cpu(n->header.nr_entries) - count);
}

/*
 * Moves entries in a btree node up 'count' places, making space for
 * new entries at the start of the node.
 */
static void shift_up(struct btree_node *n, unsigned count)
{
        move_entries(n, count, n, 0, le32_to_cpu(n->header.nr_entries));
}

/*
 * Redistributes entries between two btree nodes to make them
 * have similar numbers of entries.
 */
static void redistribute2(struct btree_node *left, struct btree_node *right)
{
        unsigned nr_left = le32_to_cpu(left->header.nr_entries);
        unsigned nr_right = le32_to_cpu(right->header.nr_entries);
        unsigned total = nr_left + nr_right;
        unsigned target_left = total / 2;
        unsigned target_right = total - target_left;

        if (nr_left < target_left) {
                unsigned delta = target_left - nr_left;
                copy_entries(left, nr_left, right, 0, delta);
                shift_down(right, delta);
        } else if (nr_left > target_left) {
                unsigned delta = nr_left - target_left;
                if (nr_right)
                        shift_up(right, delta);
                copy_entries(right, 0, left, target_left, delta);
        }

        left->header.nr_entries = cpu_to_le32(target_left);
        right->header.nr_entries = cpu_to_le32(target_right);
}
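
/*
 * Worked example (illustrative): with nr_left == 7 and nr_right == 1,
 * total == 8 so target_left == target_right == 4.  delta == 3: the
 * single right entry is shifted up three places and left's last three
 * entries are copied into the gap, leaving both nodes with 4 entries.
 */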

/*
 * Redistribute entries between three nodes.  Assumes the central
 * node is empty.
 */
static void redistribute3(struct btree_node *left, struct btree_node *center,
                          struct btree_node *right)
{
        unsigned nr_left = le32_to_cpu(left->header.nr_entries);
        unsigned nr_center = le32_to_cpu(center->header.nr_entries);
        unsigned nr_right = le32_to_cpu(right->header.nr_entries);
        unsigned total, target_left, target_center, target_right;

        BUG_ON(nr_center);

        total = nr_left + nr_right;
        target_left = total / 3;
        target_center = (total - target_left) / 2;
        target_right = (total - target_left - target_center);

        if (nr_left < target_left) {
                unsigned left_short = target_left - nr_left;
                copy_entries(left, nr_left, right, 0, left_short);
                copy_entries(center, 0, right, left_short, target_center);
                shift_down(right, nr_right - target_right);

        } else if (nr_left < (target_left + target_center)) {
                unsigned left_to_center = nr_left - target_left;
                copy_entries(center, 0, left, target_left, left_to_center);
                copy_entries(center, left_to_center, right, 0, target_center - left_to_center);
                shift_down(right, nr_right - target_right);

        } else {
                unsigned right_short = target_right - nr_right;
                shift_up(right, right_short);
                copy_entries(right, 0, left, nr_left - right_short, right_short);
                copy_entries(center, 0, left, target_left, nr_left - target_left);
        }

        left->header.nr_entries = cpu_to_le32(target_left);
        center->header.nr_entries = cpu_to_le32(target_center);
        right->header.nr_entries = cpu_to_le32(target_right);
}
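
/*
 * Worked example (illustrative): with nr_left == 2 and nr_right == 7,
 * total == 9 so each target is 3.  left is one entry short, so right[0]
 * is copied onto the end of left, right[1..3] are copied into the empty
 * center, and shift_down(right, 4) moves right's remaining three
 * entries to the front, leaving counts of 3/3/3.
 */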

/*
 * Splits a node by creating a sibling node and shifting half the node's
 * contents across.  Assumes there is a parent node, and it has room for
 * another child.
 *
 * Before:
 *        +--------+
 *        | Parent |
 *        +--------+
 *           |
 *           v
 *      +----------+
 *      | A ++++++ |
 *      +----------+
 *
 *
 * After:
 *              +--------+
 *              | Parent |
 *              +--------+
 *                |     |
 *                v     +------+
 *          +---------+        |
 *          | A* +++  |        v
 *          +---------+   +-------+
 *                        | B +++ |
 *                        +-------+
 *
 * Where A* is a shadow of A.
 */
static int split_one_into_two(struct shadow_spine *s, unsigned parent_index,
                              struct dm_btree_value_type *vt, uint64_t key)
{
        int r;
        struct dm_block *left, *right, *parent;
        struct btree_node *ln, *rn, *pn;
        __le64 location;

        left = shadow_current(s);

        r = new_block(s->info, &right);
        if (r < 0)
                return r;

        ln = dm_block_data(left);
        rn = dm_block_data(right);

        rn->header.flags = ln->header.flags;
        rn->header.nr_entries = cpu_to_le32(0);
        rn->header.max_entries = ln->header.max_entries;
        rn->header.value_size = ln->header.value_size;
        redistribute2(ln, rn);

        /* patch up the parent */
        parent = shadow_parent(s);
        pn = dm_block_data(parent);

        location = cpu_to_le64(dm_block_location(right));
        __dm_bless_for_disk(&location);
        r = insert_at(sizeof(__le64), pn, parent_index + 1,
                      le64_to_cpu(rn->keys[0]), &location);
        if (r) {
                unlock_block(s->info, right);
                return r;
        }

        /* patch up the spine */
        if (key < le64_to_cpu(rn->keys[0])) {
                unlock_block(s->info, right);
                s->nodes[1] = left;
        } else {
                unlock_block(s->info, left);
                s->nodes[1] = right;
        }

        return 0;
}

/*
 * We often need to modify a sibling node.  This function shadows a
 * particular child of the given parent node, making sure to update the
 * parent to point to the new shadow.
 */
static int shadow_child(struct dm_btree_info *info, struct dm_btree_value_type *vt,
                        struct btree_node *parent, unsigned index,
                        struct dm_block **result)
{
        int r, inc;
        dm_block_t root;
        struct btree_node *node;

        root = value64(parent, index);

        r = dm_tm_shadow_block(info->tm, root, &btree_node_validator,
                               result, &inc);
        if (r)
                return r;

        node = dm_block_data(*result);

        if (inc)
                inc_children(info->tm, node, vt);

        *((__le64 *) value_ptr(parent, index)) =
                cpu_to_le64(dm_block_location(*result));

        return 0;
}

/*
 * Splits two nodes into three.  This is more work, but results in fuller
 * nodes, so saves metadata space.
 */
static int split_two_into_three(struct shadow_spine *s, unsigned parent_index,
                                struct dm_btree_value_type *vt, uint64_t key)
{
        int r;
        unsigned middle_index;
        struct dm_block *left, *middle, *right, *parent;
        struct btree_node *ln, *rn, *mn, *pn;
        __le64 location;

        parent = shadow_parent(s);
        pn = dm_block_data(parent);

        if (parent_index == 0) {
                middle_index = 1;
                left = shadow_current(s);
                r = shadow_child(s->info, vt, pn, parent_index + 1, &right);
                if (r)
                        return r;
        } else {
                middle_index = parent_index;
                right = shadow_current(s);
                r = shadow_child(s->info, vt, pn, parent_index - 1, &left);
                if (r)
                        return r;
        }

        r = new_block(s->info, &middle);
        if (r < 0)
                return r;

        ln = dm_block_data(left);
        mn = dm_block_data(middle);
        rn = dm_block_data(right);

        mn->header.nr_entries = cpu_to_le32(0);
        mn->header.flags = ln->header.flags;
        mn->header.max_entries = ln->header.max_entries;
        mn->header.value_size = ln->header.value_size;

        redistribute3(ln, mn, rn);

        /* patch up the parent */
        pn->keys[middle_index] = rn->keys[0];
        location = cpu_to_le64(dm_block_location(middle));
        __dm_bless_for_disk(&location);
        r = insert_at(sizeof(__le64), pn, middle_index,
                      le64_to_cpu(mn->keys[0]), &location);
        if (r) {
                if (shadow_current(s) != left)
                        unlock_block(s->info, left);

                unlock_block(s->info, middle);

                if (shadow_current(s) != right)
                        unlock_block(s->info, right);

                return r;
        }


        /* patch up the spine */
        if (key < le64_to_cpu(mn->keys[0])) {
                unlock_block(s->info, middle);
                unlock_block(s->info, right);
                s->nodes[1] = left;
        } else if (key < le64_to_cpu(rn->keys[0])) {
                unlock_block(s->info, left);
                unlock_block(s->info, right);
                s->nodes[1] = middle;
        } else {
                unlock_block(s->info, left);
                unlock_block(s->info, middle);
                s->nodes[1] = right;
        }

        return 0;
}

/*----------------------------------------------------------------*/

/*
 * Splits a node by creating two new children beneath the given node.
 *
 * Before:
 *        +----------+
 *        | A ++++++ |
 *        +----------+
 *
 *
 * After:
 *      +------------+
 *      | A (shadow) |
 *      +------------+
 *          |   |
 *   +------+   +----+
 *   |               |
 *   v               v
 * +-------+     +-------+
 * | B +++ |     | C +++ |
 * +-------+     +-------+
 */
static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
{
        int r;
        size_t size;
        unsigned nr_left, nr_right;
        struct dm_block *left, *right, *new_parent;
        struct btree_node *pn, *ln, *rn;
        __le64 val;

        new_parent = shadow_current(s);

        pn = dm_block_data(new_parent);
        size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
                sizeof(__le64) : s->info->value_type.size;

        /* create & init the left block */
        r = new_block(s->info, &left);
        if (r < 0)
                return r;

        ln = dm_block_data(left);
        nr_left = le32_to_cpu(pn->header.nr_entries) / 2;

        ln->header.flags = pn->header.flags;
        ln->header.nr_entries = cpu_to_le32(nr_left);
        ln->header.max_entries = pn->header.max_entries;
        ln->header.value_size = pn->header.value_size;
        memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
        memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);

        /* create & init the right block */
        r = new_block(s->info, &right);
        if (r < 0) {
                unlock_block(s->info, left);
                return r;
        }

        rn = dm_block_data(right);
        nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;

        rn->header.flags = pn->header.flags;
        rn->header.nr_entries = cpu_to_le32(nr_right);
        rn->header.max_entries = pn->header.max_entries;
        rn->header.value_size = pn->header.value_size;
        memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));
        memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
               nr_right * size);

        /* new_parent should just point to l and r now */
        pn->header.flags = cpu_to_le32(INTERNAL_NODE);
        pn->header.nr_entries = cpu_to_le32(2);
        pn->header.max_entries = cpu_to_le32(
                calc_max_entries(sizeof(__le64),
                                 dm_bm_block_size(
                                         dm_tm_get_bm(s->info->tm))));
        pn->header.value_size = cpu_to_le32(sizeof(__le64));

        val = cpu_to_le64(dm_block_location(left));
        __dm_bless_for_disk(&val);
        pn->keys[0] = ln->keys[0];
        memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));

        val = cpu_to_le64(dm_block_location(right));
        __dm_bless_for_disk(&val);
        pn->keys[1] = rn->keys[0];
        memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));

        unlock_block(s->info, left);
        unlock_block(s->info, right);
        return 0;
}

/*----------------------------------------------------------------*/

/*
 * Redistributes a node's entries with its left sibling.
 */
static int rebalance_left(struct shadow_spine *s, struct dm_btree_value_type *vt,
                          unsigned parent_index, uint64_t key)
{
        int r;
        struct dm_block *sib;
        struct btree_node *left, *right, *parent = dm_block_data(shadow_parent(s));

        r = shadow_child(s->info, vt, parent, parent_index - 1, &sib);
        if (r)
                return r;

        left = dm_block_data(sib);
        right = dm_block_data(shadow_current(s));
        redistribute2(left, right);
        *key_ptr(parent, parent_index) = right->keys[0];

        if (key < le64_to_cpu(right->keys[0])) {
                unlock_block(s->info, s->nodes[1]);
                s->nodes[1] = sib;
        } else {
                unlock_block(s->info, sib);
        }

        return 0;
}

/*
 * Redistributes a node's entries with its right sibling.
 */
static int rebalance_right(struct shadow_spine *s, struct dm_btree_value_type *vt,
                           unsigned parent_index, uint64_t key)
{
        int r;
        struct dm_block *sib;
        struct btree_node *left, *right, *parent = dm_block_data(shadow_parent(s));

        r = shadow_child(s->info, vt, parent, parent_index + 1, &sib);
        if (r)
                return r;

        left = dm_block_data(shadow_current(s));
        right = dm_block_data(sib);
        redistribute2(left, right);
        *key_ptr(parent, parent_index + 1) = right->keys[0];

        if (key < le64_to_cpu(right->keys[0])) {
                unlock_block(s->info, sib);
        } else {
                unlock_block(s->info, s->nodes[1]);
                s->nodes[1] = sib;
        }

        return 0;
}

/*
 * Returns the number of spare entries in a node.
 */
static int get_node_free_space(struct dm_btree_info *info, dm_block_t b, unsigned *space)
{
        int r;
        unsigned nr_entries;
        struct dm_block *block;
        struct btree_node *node;

        r = bn_read_lock(info, b, &block);
        if (r)
                return r;

        node = dm_block_data(block);
        nr_entries = le32_to_cpu(node->header.nr_entries);
        *space = le32_to_cpu(node->header.max_entries) - nr_entries;

        unlock_block(info, block);
        return 0;
}

/*
 * Make space in a node, either by moving some entries to a sibling,
 * or creating a new sibling node.  SPACE_THRESHOLD defines the minimum
 * number of free entries that must be in the sibling to make the move
 * worthwhile.  If the siblings are shared (eg, part of a snapshot),
 * then they are not touched, since this would break sharing and so
 * consume more space than we save.
 */
#define SPACE_THRESHOLD 8
static int rebalance_or_split(struct shadow_spine *s, struct dm_btree_value_type *vt,
                              unsigned parent_index, uint64_t key)
{
        int r;
        struct btree_node *parent = dm_block_data(shadow_parent(s));
        unsigned nr_parent = le32_to_cpu(parent->header.nr_entries);
        unsigned free_space;
        int left_shared = 0, right_shared = 0;

        /* Should we move entries to the left sibling? */
        if (parent_index > 0) {
                dm_block_t left_b = value64(parent, parent_index - 1);
                r = dm_tm_block_is_shared(s->info->tm, left_b, &left_shared);
                if (r)
                        return r;

                if (!left_shared) {
                        r = get_node_free_space(s->info, left_b, &free_space);
                        if (r)
                                return r;

                        if (free_space >= SPACE_THRESHOLD)
                                return rebalance_left(s, vt, parent_index, key);
                }
        }

        /* Should we move entries to the right sibling? */
        if (parent_index < (nr_parent - 1)) {
                dm_block_t right_b = value64(parent, parent_index + 1);
                r = dm_tm_block_is_shared(s->info->tm, right_b, &right_shared);
                if (r)
                        return r;

                if (!right_shared) {
                        r = get_node_free_space(s->info, right_b, &free_space);
                        if (r)
                                return r;

                        if (free_space >= SPACE_THRESHOLD)
                                return rebalance_right(s, vt, parent_index, key);
                }
        }

        /*
         * We need to split the node; normally we split two nodes into
         * three.  But when inserting a sequence that is either
         * monotonically increasing or decreasing it's better to split
         * a single node into two.
         */
        if (left_shared || right_shared || (nr_parent <= 2) ||
            (parent_index == 0) || (parent_index + 1 == nr_parent)) {
                return split_one_into_two(s, parent_index, vt, key);
        } else {
                return split_two_into_three(s, parent_index, vt, key);
        }
}

/*
 * Does the node contain a particular key?
 */
static bool contains_key(struct btree_node *node, uint64_t key)
{
        int i = lower_bound(node, key);

        if (i >= 0 && le64_to_cpu(node->keys[i]) == key)
                return true;

        return false;
}

/*
 * In general we preemptively make sure there's a free entry in every
 * node on the spine when doing an insert.  But we can avoid that with
 * leaf nodes if we know it's an overwrite.
 */
static bool has_space_for_insert(struct btree_node *node, uint64_t key)
{
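        /*
         * Both fields are __le32, so comparing the raw on-disk values
         * for equality is endian-safe without le32_to_cpu().
         */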
        if (node->header.nr_entries == node->header.max_entries) {
                if (le32_to_cpu(node->header.flags) & LEAF_NODE) {
                        /* we don't need space if it's an overwrite */
                        return contains_key(node, key);
                }

                return false;
        }

        return true;
}

static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
                            struct dm_btree_value_type *vt,
                            uint64_t key, unsigned *index)
{
        int r, i = *index, top = 1;
        struct btree_node *node;

        for (;;) {
                r = shadow_step(s, root, vt);
                if (r < 0)
                        return r;

                node = dm_block_data(shadow_current(s));

                /*
                 * We have to patch up the parent node; ugly, but I don't
                 * see a way to do this automatically as part of the spine
                 * op.
                 */
                if (shadow_has_parent(s) && i >= 0) { /* FIXME: second clause unnecessary */
                        __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

                        __dm_bless_for_disk(&location);
                        memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
                                    &location, sizeof(__le64));
                }

                node = dm_block_data(shadow_current(s));

                if (!has_space_for_insert(node, key)) {
                        if (top)
                                r = btree_split_beneath(s, key);
                        else
                                r = rebalance_or_split(s, vt, i, key);

                        if (r < 0)
                                return r;

                        /* making space can cause the current node to change */
                        node = dm_block_data(shadow_current(s));
                }

                i = lower_bound(node, key);

                if (le32_to_cpu(node->header.flags) & LEAF_NODE)
                        break;

                if (i < 0) {
                        /* change the bounds on the lowest key */
                        node->keys[0] = cpu_to_le64(key);
                        i = 0;
                }

                root = value64(node, i);
                top = 0;
        }

        if (i < 0 || le64_to_cpu(node->keys[i]) != key)
                i++;

        *index = i;
        return 0;
}

static int __btree_get_overwrite_leaf(struct shadow_spine *s, dm_block_t root,
                                      uint64_t key, int *index)
{
        int r, i = -1;
        struct btree_node *node;

        *index = 0;
        for (;;) {
                r = shadow_step(s, root, &s->info->value_type);
                if (r < 0)
                        return r;

                node = dm_block_data(shadow_current(s));

                /*
                 * We have to patch up the parent node; ugly, but I don't
                 * see a way to do this automatically as part of the spine
                 * op.
                 */
                if (shadow_has_parent(s) && i >= 0) {
                        __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

                        __dm_bless_for_disk(&location);
                        memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
                                    &location, sizeof(__le64));
                }

                node = dm_block_data(shadow_current(s));
                i = lower_bound(node, key);

                BUG_ON(i < 0);
                BUG_ON(i >= le32_to_cpu(node->header.nr_entries));

                if (le32_to_cpu(node->header.flags) & LEAF_NODE) {
                        if (key != le64_to_cpu(node->keys[i]))
                                return -EINVAL;
                        break;
                }

                root = value64(node, i);
        }

        *index = i;
        return 0;
}

int btree_get_overwrite_leaf(struct dm_btree_info *info, dm_block_t root,
                             uint64_t key, int *index,
                             dm_block_t *new_root, struct dm_block **leaf)
{
        int r;
        struct shadow_spine spine;

        BUG_ON(info->levels > 1);
        init_shadow_spine(&spine, info);
        r = __btree_get_overwrite_leaf(&spine, root, key, index);
        if (!r) {
                *new_root = shadow_root(&spine);
                *leaf = shadow_current(&spine);

                /*
                 * Decrement the count so exit_shadow_spine() doesn't
                 * unlock the leaf.
                 */
                spine.count--;
        }
        exit_shadow_spine(&spine);

        return r;
}

static bool need_insert(struct btree_node *node, uint64_t *keys,
                        unsigned level, unsigned index)
{
        return ((index >= le32_to_cpu(node->header.nr_entries)) ||
                (le64_to_cpu(node->keys[index]) != keys[level]));
}

static int insert(struct dm_btree_info *info, dm_block_t root,
                  uint64_t *keys, void *value, dm_block_t *new_root,
                  int *inserted)
                  __dm_written_to_disk(value)
{
        int r;
        unsigned level, index = -1, last_level = info->levels - 1;
        dm_block_t block = root;
        struct shadow_spine spine;
        struct btree_node *n;
        struct dm_btree_value_type le64_type;

        init_le64_type(info->tm, &le64_type);
        init_shadow_spine(&spine, info);

        for (level = 0; level < (info->levels - 1); level++) {
                r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
                if (r < 0)
                        goto bad;

                n = dm_block_data(shadow_current(&spine));

                if (need_insert(n, keys, level, index)) {
                        dm_block_t new_tree;
                        __le64 new_le;

                        r = dm_btree_empty(info, &new_tree);
                        if (r < 0)
                                goto bad;

                        new_le = cpu_to_le64(new_tree);
                        __dm_bless_for_disk(&new_le);

                        r = insert_at(sizeof(uint64_t), n, index,
                                      keys[level], &new_le);
                        if (r)
                                goto bad;
                }

                if (level < last_level)
                        block = value64(n, index);
        }

        r = btree_insert_raw(&spine, block, &info->value_type,
                             keys[level], &index);
        if (r < 0)
                goto bad;

        n = dm_block_data(shadow_current(&spine));

        if (need_insert(n, keys, level, index)) {
                if (inserted)
                        *inserted = 1;

                r = insert_at(info->value_type.size, n, index,
                              keys[level], value);
                if (r)
                        goto bad_unblessed;
        } else {
                if (inserted)
                        *inserted = 0;

                if (info->value_type.dec &&
                    (!info->value_type.equal ||
                     !info->value_type.equal(
                             info->value_type.context,
                             value_ptr(n, index),
                             value))) {
                        info->value_type.dec(info->value_type.context,
                                             value_ptr(n, index), 1);
                }
                memcpy_disk(value_ptr(n, index),
                            value, info->value_type.size);
        }

        *new_root = shadow_root(&spine);
        exit_shadow_spine(&spine);

        return 0;

bad:
        __dm_unbless_for_disk(value);
bad_unblessed:
        exit_shadow_spine(&spine);
        return r;
}

int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
                    uint64_t *keys, void *value, dm_block_t *new_root)
                    __dm_written_to_disk(value)
{
        return insert(info, root, keys, value, new_root, NULL);
}
EXPORT_SYMBOL_GPL(dm_btree_insert);
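
/*
 * Usage sketch (illustrative only): inserting a value under a single
 * key.  On success, subsequent operations must use new_root.
 *
 *      uint64_t key = 1234;
 *      __le64 value_le = cpu_to_le64(5678);
 *      dm_block_t new_root;
 *      int r = dm_btree_insert(&info, root, &key, &value_le, &new_root);
 */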

int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
                           uint64_t *keys, void *value, dm_block_t *new_root,
                           int *inserted)
                           __dm_written_to_disk(value)
{
        return insert(info, root, keys, value, new_root, inserted);
}
EXPORT_SYMBOL_GPL(dm_btree_insert_notify);

/*----------------------------------------------------------------*/

static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest,
                    uint64_t *result_key, dm_block_t *next_block)
{
        int i, r;
        uint32_t flags;

        do {
                r = ro_step(s, block);
                if (r < 0)
                        return r;

                flags = le32_to_cpu(ro_node(s)->header.flags);
                i = le32_to_cpu(ro_node(s)->header.nr_entries);
                if (!i)
                        return -ENODATA;
                else
                        i--;

                if (find_highest)
                        *result_key = le64_to_cpu(ro_node(s)->keys[i]);
                else
                        *result_key = le64_to_cpu(ro_node(s)->keys[0]);

                if (next_block || flags & INTERNAL_NODE) {
                        if (find_highest)
                                block = value64(ro_node(s), i);
                        else
                                block = value64(ro_node(s), 0);
                }

        } while (flags & INTERNAL_NODE);

        if (next_block)
                *next_block = block;
        return 0;
}

static int dm_btree_find_key(struct dm_btree_info *info, dm_block_t root,
                             bool find_highest, uint64_t *result_keys)
{
        int r = 0, count = 0, level;
        struct ro_spine spine;

        init_ro_spine(&spine, info);
        for (level = 0; level < info->levels; level++) {
                r = find_key(&spine, root, find_highest, result_keys + level,
                             level == info->levels - 1 ? NULL : &root);
                if (r == -ENODATA) {
                        r = 0;
                        break;

                } else if (r)
                        break;

                count++;
        }
        exit_ro_spine(&spine);

        return r ? r : count;
}

int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
                              uint64_t *result_keys)
{
        return dm_btree_find_key(info, root, true, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);

int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root,
                             uint64_t *result_keys)
{
        return dm_btree_find_key(info, root, false, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key);

/*----------------------------------------------------------------*/

/*
 * FIXME: We shouldn't use a recursive algorithm when we have limited stack
 * space.  Also this only works for single level trees.
 */
static int walk_node(struct dm_btree_info *info, dm_block_t block,
                     int (*fn)(void *context, uint64_t *keys, void *leaf),
                     void *context)
{
        int r;
        unsigned i, nr;
        struct dm_block *node;
        struct btree_node *n;
        uint64_t keys;

        r = bn_read_lock(info, block, &node);
        if (r)
                return r;

        n = dm_block_data(node);

        nr = le32_to_cpu(n->header.nr_entries);
        for (i = 0; i < nr; i++) {
                if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) {
                        r = walk_node(info, value64(n, i), fn, context);
                        if (r)
                                goto out;
                } else {
                        keys = le64_to_cpu(*key_ptr(n, i));
                        r = fn(context, &keys, value_ptr(n, i));
                        if (r)
                                goto out;
                }
        }

out:
        dm_tm_unlock(info->tm, node);
        return r;
}

int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
                  int (*fn)(void *context, uint64_t *keys, void *leaf),
                  void *context)
{
        BUG_ON(info->levels > 1);
        return walk_node(info, root, fn, context);
}
EXPORT_SYMBOL_GPL(dm_btree_walk);
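
/*
 * Usage sketch (illustrative; count_fn is a hypothetical callback):
 *
 *      static int count_fn(void *context, uint64_t *keys, void *leaf)
 *      {
 *              unsigned *count = context;
 *
 *              (*count)++;
 *              return 0;
 *      }
 *
 *      unsigned count = 0;
 *      int r = dm_btree_walk(&info, root, count_fn, &count);
 */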

/*----------------------------------------------------------------*/

static void prefetch_values(struct dm_btree_cursor *c)
{
        unsigned i, nr;
        __le64 value_le;
        struct cursor_node *n = c->nodes + c->depth - 1;
        struct btree_node *bn = dm_block_data(n->b);
        struct dm_block_manager *bm = dm_tm_get_bm(c->info->tm);

        BUG_ON(c->info->value_type.size != sizeof(value_le));

        nr = le32_to_cpu(bn->header.nr_entries);
        for (i = 0; i < nr; i++) {
                memcpy(&value_le, value_ptr(bn, i), sizeof(value_le));
                dm_bm_prefetch(bm, le64_to_cpu(value_le));
        }
}

static bool leaf_node(struct dm_btree_cursor *c)
{
        struct cursor_node *n = c->nodes + c->depth - 1;
        struct btree_node *bn = dm_block_data(n->b);

        return le32_to_cpu(bn->header.flags) & LEAF_NODE;
}

static int push_node(struct dm_btree_cursor *c, dm_block_t b)
{
        int r;
        struct cursor_node *n = c->nodes + c->depth;

        if (c->depth >= DM_BTREE_CURSOR_MAX_DEPTH - 1) {
                DMERR("couldn't push cursor node, stack depth too high");
                return -EINVAL;
        }

        r = bn_read_lock(c->info, b, &n->b);
        if (r)
                return r;

        n->index = 0;
        c->depth++;

        if (c->prefetch_leaves || !leaf_node(c))
                prefetch_values(c);

        return 0;
}

static void pop_node(struct dm_btree_cursor *c)
{
        c->depth--;
        unlock_block(c->info, c->nodes[c->depth].b);
}

static int inc_or_backtrack(struct dm_btree_cursor *c)
{
        struct cursor_node *n;
        struct btree_node *bn;

        for (;;) {
                if (!c->depth)
                        return -ENODATA;

                n = c->nodes + c->depth - 1;
                bn = dm_block_data(n->b);

                n->index++;
                if (n->index < le32_to_cpu(bn->header.nr_entries))
                        break;

                pop_node(c);
        }

        return 0;
}

static int find_leaf(struct dm_btree_cursor *c)
{
        int r = 0;
        struct cursor_node *n;
        struct btree_node *bn;
        __le64 value_le;

        for (;;) {
                n = c->nodes + c->depth - 1;
                bn = dm_block_data(n->b);

                if (le32_to_cpu(bn->header.flags) & LEAF_NODE)
                        break;

                memcpy(&value_le, value_ptr(bn, n->index), sizeof(value_le));
                r = push_node(c, le64_to_cpu(value_le));
                if (r) {
                        DMERR("push_node failed");
                        break;
                }
        }

        if (!r && (le32_to_cpu(bn->header.nr_entries) == 0))
                return -ENODATA;

        return r;
}

int dm_btree_cursor_begin(struct dm_btree_info *info, dm_block_t root,
                          bool prefetch_leaves, struct dm_btree_cursor *c)
{
        int r;

        c->info = info;
        c->root = root;
        c->depth = 0;
        c->prefetch_leaves = prefetch_leaves;

        r = push_node(c, root);
        if (r)
                return r;

        return find_leaf(c);
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_begin);

void dm_btree_cursor_end(struct dm_btree_cursor *c)
{
        while (c->depth)
                pop_node(c);
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_end);

int dm_btree_cursor_next(struct dm_btree_cursor *c)
{
        int r = inc_or_backtrack(c);
        if (!r) {
                r = find_leaf(c);
                if (r)
                        DMERR("find_leaf failed");
        }

        return r;
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_next);

int dm_btree_cursor_skip(struct dm_btree_cursor *c, uint32_t count)
{
        int r = 0;

        while (count-- && !r)
                r = dm_btree_cursor_next(c);

        return r;
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_skip);

int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le)
{
        if (c->depth) {
                struct cursor_node *n = c->nodes + c->depth - 1;
                struct btree_node *bn = dm_block_data(n->b);

                if (le32_to_cpu(bn->header.flags) & INTERNAL_NODE)
                        return -EINVAL;

                *key = le64_to_cpu(*key_ptr(bn, n->index));
                memcpy(value_le, value_ptr(bn, n->index), c->info->value_type.size);
                return 0;

        } else
                return -ENODATA;
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_get_value);
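
/*
 * Cursor usage sketch (illustrative only): visiting every leaf entry in
 * key order.  -ENODATA from dm_btree_cursor_next() signals the end of
 * the tree rather than an error.
 *
 *      struct dm_btree_cursor c;
 *      uint64_t key;
 *      __le64 value_le;
 *      int r;
 *
 *      r = dm_btree_cursor_begin(&info, root, false, &c);
 *      while (!r) {
 *              r = dm_btree_cursor_get_value(&c, &key, &value_le);
 *              if (r)
 *                      break;
 *              ...use key and value_le...
 *              r = dm_btree_cursor_next(&c);
 *      }
 *      dm_btree_cursor_end(&c);
 */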