/*
 * Copyright (C) 2011 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-btree-internal.h"
#include "dm-space-map.h"
#include "dm-transaction-manager.h"

#include <linux/export.h>
#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "btree"

/*----------------------------------------------------------------
 * Array manipulation
 *--------------------------------------------------------------*/
static void memcpy_disk(void *dest, const void *src, size_t len)
	__dm_written_to_disk(src)
{
	memcpy(dest, src, len);
	__dm_unbless_for_disk(src);
}

static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
			 unsigned index, void *elt)
	__dm_written_to_disk(elt)
{
	if (index < nr_elts)
		memmove(base + (elt_size * (index + 1)),
			base + (elt_size * index),
			(nr_elts - index) * elt_size);

	memcpy_disk(base + (elt_size * index), elt, elt_size);
}

/*----------------------------------------------------------------*/

/* makes the assumption that no two keys are the same. */
static int bsearch(struct btree_node *n, uint64_t key, int want_hi)
{
	int lo = -1, hi = le32_to_cpu(n->header.nr_entries);

	while (hi - lo > 1) {
		int mid = lo + ((hi - lo) / 2);
		uint64_t mid_key = le64_to_cpu(n->keys[mid]);

		if (mid_key == key)
			return mid;

		if (mid_key < key)
			lo = mid;
		else
			hi = mid;
	}

	return want_hi ? hi : lo;
}
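
/*
 * lower_bound() returns the index of the largest key <= the key being
 * searched for (or -1 if every key in the node is greater); upper_bound()
 * returns the index of the smallest key >= it (or nr_entries if every key
 * is smaller).
 */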
int lower_bound(struct btree_node *n, uint64_t key)
{
	return bsearch(n, key, 0);
}

static int upper_bound(struct btree_node *n, uint64_t key)
{
	return bsearch(n, key, 1);
}

void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
		  struct dm_btree_value_type *vt)
{
	unsigned i;
	uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);

	if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
		for (i = 0; i < nr_entries; i++)
			dm_tm_inc(tm, value64(n, i));
	else if (vt->inc)
		for (i = 0; i < nr_entries; i++)
			vt->inc(vt->context, value_ptr(n, i));
}

static int insert_at(size_t value_size, struct btree_node *node, unsigned index,
		     uint64_t key, void *value)
	__dm_written_to_disk(value)
{
	uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
	__le64 key_le = cpu_to_le64(key);

	if (index > nr_entries ||
	    index >= le32_to_cpu(node->header.max_entries)) {
		DMERR("too many entries in btree node for insert");
		__dm_unbless_for_disk(value);
		return -ENOMEM;
	}

	__dm_bless_for_disk(&key_le);

	array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
	array_insert(value_base(node), value_size, nr_entries, index, value);
	node->header.nr_entries = cpu_to_le32(nr_entries + 1);

	return 0;
}

/*----------------------------------------------------------------*/

/*
 * We want each node to hold a multiple of 3 entries; the removal code
 * rebalances entries across runs of up to three sibling nodes.
 */
static uint32_t calc_max_entries(size_t value_size, size_t block_size)
{
	uint32_t total, n;
	size_t elt_size = sizeof(uint64_t) + value_size;

	block_size -= sizeof(struct node_header);
	total = block_size / elt_size;
	n = total / 3;

	return 3 * n;
}
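
/*
 * Worked example (illustrative; assumes the common 4096-byte block size
 * and a 32-byte on-disk struct node_header): with 8-byte values each
 * entry costs 8 + 8 = 16 bytes, so (4096 - 32) / 16 = 254 entries fit,
 * and rounding down to a multiple of 3 gives max_entries = 252.
 */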

int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
{
	int r;
	struct dm_block *b;
	struct btree_node *n;
	size_t block_size;
	uint32_t max_entries;

	r = new_block(info, &b);
	if (r < 0)
		return r;

	block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
	max_entries = calc_max_entries(info->value_type.size, block_size);

	n = dm_block_data(b);
	memset(n, 0, block_size);
	n->header.flags = cpu_to_le32(LEAF_NODE);
	n->header.nr_entries = cpu_to_le32(0);
	n->header.max_entries = cpu_to_le32(max_entries);
	n->header.value_size = cpu_to_le32(info->value_type.size);

	*root = dm_block_location(b);
	unlock_block(info, b);

	return 0;
}
EXPORT_SYMBOL_GPL(dm_btree_empty);
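
/*
 * Example (an illustrative sketch, not part of this file): creating an
 * empty one-level btree of __le64 values.  The transaction manager 'tm'
 * is assumed to have been created elsewhere, e.g. with
 * dm_tm_create_with_sm().
 *
 *	struct dm_btree_info info;
 *	dm_block_t root;
 *	int r;
 *
 *	memset(&info, 0, sizeof(info));
 *	info.tm = tm;
 *	info.levels = 1;
 *	info.value_type.size = sizeof(__le64);
 *
 *	r = dm_btree_empty(&info, &root);
 *	if (r < 0)
 *		return r;
 */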

/*----------------------------------------------------------------*/

/*
 * Deletion is naturally recursive, but kernel stack space is limited,
 * so it is implemented here with an explicit stack of frames.
 */
#define MAX_SPINE_DEPTH 64
struct frame {
	struct dm_block *b;
	struct btree_node *n;
	unsigned level;
	unsigned nr_children;
	unsigned current_child;
};

struct del_stack {
	struct dm_btree_info *info;
	struct dm_transaction_manager *tm;
	int top;
	struct frame spine[MAX_SPINE_DEPTH];
};

static int top_frame(struct del_stack *s, struct frame **f)
{
	if (s->top < 0) {
		DMERR("btree deletion stack empty");
		return -EINVAL;
	}

	*f = s->spine + s->top;

	return 0;
}

static int unprocessed_frames(struct del_stack *s)
{
	return s->top >= 0;
}

static void prefetch_children(struct del_stack *s, struct frame *f)
{
	unsigned i;
	struct dm_block_manager *bm = dm_tm_get_bm(s->tm);

	for (i = 0; i < f->nr_children; i++)
		dm_bm_prefetch(bm, value64(f->n, i));
}

static bool is_internal_level(struct dm_btree_info *info, struct frame *f)
{
	return f->level < (info->levels - 1);
}

static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
{
	int r;
	uint32_t ref_count;

	if (s->top >= MAX_SPINE_DEPTH - 1) {
		DMERR("btree deletion stack out of memory");
		return -ENOMEM;
	}

	r = dm_tm_ref(s->tm, b, &ref_count);
	if (r)
		return r;
	if (ref_count > 1)
		/*
		 * This is a shared node, so we can just decrement its
		 * reference counter and leave the children alone.
		 */
		dm_tm_dec(s->tm, b);

	else {
		uint32_t flags;
		struct frame *f = s->spine + ++s->top;

		r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
		if (r) {
			s->top--;
			return r;
		}

		f->n = dm_block_data(f->b);
		f->level = level;
		f->nr_children = le32_to_cpu(f->n->header.nr_entries);
		f->current_child = 0;

		flags = le32_to_cpu(f->n->header.flags);
		if (flags & INTERNAL_NODE || is_internal_level(s->info, f))
			prefetch_children(s, f);
	}

	return 0;
}

static void pop_frame(struct del_stack *s)
{
	struct frame *f = s->spine + s->top--;

	dm_tm_dec(s->tm, dm_block_location(f->b));
	dm_tm_unlock(s->tm, f->b);
}

static void unlock_all_frames(struct del_stack *s)
{
	struct frame *f;

	while (unprocessed_frames(s)) {
		f = s->spine + s->top--;
		dm_tm_unlock(s->tm, f->b);
	}
}

int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
{
	int r;
	struct del_stack *s;

	/*
	 * dm_btree_del() is called via an ioctl, as such should be
	 * considered an FS op.  We can't recurse back into the FS, so we
	 * allocate GFP_NOFS.
	 */
	s = kmalloc(sizeof(*s), GFP_NOFS);
	if (!s)
		return -ENOMEM;
	s->info = info;
	s->tm = info->tm;
	s->top = -1;

	r = push_frame(s, root, 0);
	if (r)
		goto out;

	while (unprocessed_frames(s)) {
		uint32_t flags;
		struct frame *f;
		dm_block_t b;

		r = top_frame(s, &f);
		if (r)
			goto out;

		if (f->current_child >= f->nr_children) {
			pop_frame(s);
			continue;
		}

		flags = le32_to_cpu(f->n->header.flags);
		if (flags & INTERNAL_NODE) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level);
			if (r)
				goto out;

		} else if (is_internal_level(info, f)) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level + 1);
			if (r)
				goto out;

		} else {
			if (info->value_type.dec) {
				unsigned i;

				for (i = 0; i < f->nr_children; i++)
					info->value_type.dec(info->value_type.context,
							     value_ptr(f->n, i));
			}
			pop_frame(s);
		}
	}
out:
	if (r) {
		/* cleanup all frames of del_stack */
		unlock_all_frames(s);
	}
	kfree(s);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_del);

/*----------------------------------------------------------------*/

static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
			    int (*search_fn)(struct btree_node *, uint64_t),
			    uint64_t *result_key, void *v, size_t value_size)
{
	int i, r;
	uint32_t flags, nr_entries;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		i = search_fn(ro_node(s), key);

		flags = le32_to_cpu(ro_node(s)->header.flags);
		nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (i < 0 || i >= nr_entries)
			return -ENODATA;

		if (flags & INTERNAL_NODE)
			block = value64(ro_node(s), i);

	} while (!(flags & LEAF_NODE));

	*result_key = le64_to_cpu(ro_node(s)->keys[i]);
	memcpy(v, value_ptr(ro_node(s), i), value_size);

	return 0;
}

int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value_le)
{
	unsigned level, last_level = info->levels - 1;
	int r = -ENODATA;
	uint64_t rkey;
	__le64 internal_value_le;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		size_t size;
		void *value_p;

		if (level == last_level) {
			value_p = value_le;
			size = info->value_type.size;

		} else {
			value_p = &internal_value_le;
			size = sizeof(uint64_t);
		}

		r = btree_lookup_raw(&spine, root, keys[level],
				     lower_bound, &rkey,
				     value_p, size);

		if (!r) {
			if (rkey != keys[level]) {
				exit_ro_spine(&spine);
				return -ENODATA;
			}
		} else {
			exit_ro_spine(&spine);
			return r;
		}

		root = le64_to_cpu(internal_value_le);
	}
	exit_ro_spine(&spine);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup);
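
/*
 * Example (an illustrative sketch): looking up the value stored under
 * key 17 in a one-level tree of __le64 values.  'info' and 'root' are
 * assumed to come from the dm_btree_empty() sketch above.
 *
 *	uint64_t key = 17;
 *	__le64 value_le;
 *	int r;
 *
 *	r = dm_btree_lookup(&info, root, &key, &value_le);
 *	if (!r)
 *		pr_info("found %llu\n",
 *			(unsigned long long)le64_to_cpu(value_le));
 *	else if (r == -ENODATA)
 *		pr_info("key not present\n");
 */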

static int dm_btree_lookup_next_single(struct dm_btree_info *info, dm_block_t root,
				       uint64_t key, uint64_t *rkey, void *value_le)
{
	int r, i;
	uint32_t flags, nr_entries;
	struct dm_block *node;
	struct btree_node *n;

	r = bn_read_lock(info, root, &node);
	if (r)
		return r;

	n = dm_block_data(node);
	flags = le32_to_cpu(n->header.flags);
	nr_entries = le32_to_cpu(n->header.nr_entries);

	if (flags & INTERNAL_NODE) {
		i = lower_bound(n, key);
		if (i < 0) {
			/*
			 * avoid early -ENODATA return when all entries are
			 * higher than the search @key.
			 */
			i = 0;
		}
		if (i >= nr_entries) {
			r = -ENODATA;
			goto out;
		}

		r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
		if (r == -ENODATA && i < (nr_entries - 1)) {
			i++;
			r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
		}

	} else {
		i = upper_bound(n, key);
		if (i < 0 || i >= nr_entries) {
			r = -ENODATA;
			goto out;
		}

		*rkey = le64_to_cpu(n->keys[i]);
		memcpy(value_le, value_ptr(n, i), info->value_type.size);
	}
out:
	dm_tm_unlock(info->tm, node);
	return r;
}

int dm_btree_lookup_next(struct dm_btree_info *info, dm_block_t root,
			 uint64_t *keys, uint64_t *rkey, void *value_le)
{
	unsigned level;
	int r = -ENODATA;
	__le64 internal_value_le;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels - 1u; level++) {
		r = btree_lookup_raw(&spine, root, keys[level],
				     lower_bound, rkey,
				     &internal_value_le, sizeof(uint64_t));
		if (r)
			goto out;

		if (*rkey != keys[level]) {
			r = -ENODATA;
			goto out;
		}

		root = le64_to_cpu(internal_value_le);
	}

	r = dm_btree_lookup_next_single(info, root, keys[level], rkey, value_le);
out:
	exit_ro_spine(&spine);
	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup_next);
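
/*
 * Note: on success dm_btree_lookup_next() returns, via *rkey and
 * *value_le, the entry with the smallest key greater than or equal to
 * the requested key, which makes it suitable for skipping ahead through
 * sparsely populated key ranges.
 */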

/*----------------------------------------------------------------*/

/*
 * Splits a node by creating a sibling node and shifting half the nodes
 * contents across.  Assumes there is a parent node, and it has room for
 * another child.
 */
static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index,
			       uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *parent;
	struct btree_node *ln, *rn, *pn;
	__le64 location;

	left = shadow_current(s);

	r = new_block(s->info, &right);
	if (r < 0)
		return r;

	ln = dm_block_data(left);
	rn = dm_block_data(right);

	nr_left = le32_to_cpu(ln->header.nr_entries) / 2;
	nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left;

	ln->header.nr_entries = cpu_to_le32(nr_left);

	rn->header.flags = ln->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = ln->header.max_entries;
	rn->header.value_size = ln->header.value_size;
	memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0]));

	size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
		sizeof(uint64_t) : s->info->value_type.size;
	memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left),
	       size * nr_right);

	/*
	 * Patch up the parent.
	 */
	parent = shadow_parent(s);

	pn = dm_block_data(parent);
	location = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&location);
	memcpy_disk(value_ptr(pn, parent_index),
		    &location, sizeof(__le64));

	location = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&location);

	r = insert_at(sizeof(__le64), pn, parent_index + 1,
		      le64_to_cpu(rn->keys[0]), &location);
	if (r) {
		unlock_block(s->info, right);
		return r;
	}

	if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else {
		unlock_block(s->info, left);
		s->nodes[1] = right;
	}

	return 0;
}

/*
 * Splits a node by creating two new children beneath the given node,
 * moving half the node's entries into each child, and turning the node
 * itself into an internal node pointing at the two children.
 */
static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *new_parent;
	struct btree_node *pn, *ln, *rn;
	__le64 val;

	new_parent = shadow_current(s);

	r = new_block(s->info, &left);
	if (r < 0)
		return r;

	r = new_block(s->info, &right);
	if (r < 0) {
		unlock_block(s->info, left);
		return r;
	}

	pn = dm_block_data(new_parent);
	ln = dm_block_data(left);
	rn = dm_block_data(right);

	nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
	nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;

	ln->header.flags = pn->header.flags;
	ln->header.nr_entries = cpu_to_le32(nr_left);
	ln->header.max_entries = pn->header.max_entries;
	ln->header.value_size = pn->header.value_size;

	rn->header.flags = pn->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = pn->header.max_entries;
	rn->header.value_size = pn->header.value_size;

	memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
	memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));

	size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
		sizeof(__le64) : s->info->value_type.size;
	memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
	memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
	       nr_right * size);

	/* new_parent should just point to l and r now */
	pn->header.flags = cpu_to_le32(INTERNAL_NODE);
	pn->header.nr_entries = cpu_to_le32(2);
	pn->header.max_entries = cpu_to_le32(
		calc_max_entries(sizeof(__le64),
				 dm_bm_block_size(
					 dm_tm_get_bm(s->info->tm))));
	pn->header.value_size = cpu_to_le32(sizeof(__le64));

	val = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&val);
	pn->keys[0] = ln->keys[0];
	memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));

	val = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&val);
	pn->keys[1] = rn->keys[0];
	memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));

	unlock_block(s->info, left);
	unlock_block(s->info, right);
	return 0;
}

static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
			    struct dm_btree_value_type *vt,
			    uint64_t key, unsigned *index)
{
	int r, i = *index, top = 1;
	struct btree_node *node;

	for (;;) {
		r = shadow_step(s, root, vt);
		if (r < 0)
			return r;

		node = dm_block_data(shadow_current(s));

		/*
		 * We have to patch up the parent node, ugly, but I don't
		 * see a way to do this automatically as part of the spine
		 * op.
		 */
		if (shadow_has_parent(s) && i >= 0) {
			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

			__dm_bless_for_disk(&location);
			memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
				    &location, sizeof(__le64));
		}

		node = dm_block_data(shadow_current(s));

		if (node->header.nr_entries == node->header.max_entries) {
			if (top)
				r = btree_split_beneath(s, key);
			else
				r = btree_split_sibling(s, i, key);

			if (r < 0)
				return r;
		}

		node = dm_block_data(shadow_current(s));

		i = lower_bound(node, key);

		if (le32_to_cpu(node->header.flags) & LEAF_NODE)
			break;

		if (i < 0) {
			/* change the bounds on the lowest key */
			node->keys[0] = cpu_to_le64(key);
			i = 0;
		}

		root = value64(node, i);
		top = 0;
	}

	if (i < 0 || le64_to_cpu(node->keys[i]) != key)
		i++;

	*index = i;
	return 0;
}

static bool need_insert(struct btree_node *node, uint64_t *keys,
			unsigned level, unsigned index)
{
	return ((index >= le32_to_cpu(node->header.nr_entries)) ||
		(le64_to_cpu(node->keys[index]) != keys[level]));
}

static int insert(struct dm_btree_info *info, dm_block_t root,
		  uint64_t *keys, void *value, dm_block_t *new_root,
		  int *inserted)
		  __dm_written_to_disk(value)
{
	int r;
	unsigned level, index = -1, last_level = info->levels - 1;
	dm_block_t block = root;
	struct shadow_spine spine;
	struct btree_node *n;
	struct dm_btree_value_type le64_type;

	init_le64_type(info->tm, &le64_type);
	init_shadow_spine(&spine, info);

	for (level = 0; level < (info->levels - 1); level++) {
		r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
		if (r < 0)
			goto bad;

		n = dm_block_data(shadow_current(&spine));

		if (need_insert(n, keys, level, index)) {
			dm_block_t new_tree;
			__le64 new_le;

			r = dm_btree_empty(info, &new_tree);
			if (r < 0)
				goto bad;

			new_le = cpu_to_le64(new_tree);
			__dm_bless_for_disk(&new_le);

			r = insert_at(sizeof(uint64_t), n, index,
				      keys[level], &new_le);
			if (r)
				goto bad;
		}

		if (level < last_level)
			block = value64(n, index);
	}

	r = btree_insert_raw(&spine, block, &info->value_type,
			     keys[level], &index);
	if (r < 0)
		goto bad;

	n = dm_block_data(shadow_current(&spine));

	if (need_insert(n, keys, level, index)) {
		if (inserted)
			*inserted = 1;

		r = insert_at(info->value_type.size, n, index,
			      keys[level], value);
		if (r)
			goto bad_unblessed;
	} else {
		if (inserted)
			*inserted = 0;

		if (info->value_type.dec &&
		    (!info->value_type.equal ||
		     !info->value_type.equal(
			     info->value_type.context,
			     value_ptr(n, index),
			     value))) {
			info->value_type.dec(info->value_type.context,
					     value_ptr(n, index));
		}
		memcpy_disk(value_ptr(n, index),
			    value, info->value_type.size);
	}

	*new_root = shadow_root(&spine);
	exit_shadow_spine(&spine);

	return 0;

bad:
	__dm_unbless_for_disk(value);
bad_unblessed:
	exit_shadow_spine(&spine);
	return r;
}

int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value, dm_block_t *new_root)
		    __dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, NULL);
}
EXPORT_SYMBOL_GPL(dm_btree_insert);

int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
			   uint64_t *keys, void *value, dm_block_t *new_root,
			   int *inserted)
			   __dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, inserted);
}
EXPORT_SYMBOL_GPL(dm_btree_insert_notify);
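
/*
 * Example (an illustrative sketch): inserting a value under key 17.
 * Because the tree is copy-on-write, a successful insert yields a new
 * root that the caller must adopt.
 *
 *	uint64_t key = 17;
 *	__le64 value_le = cpu_to_le64(1234);
 *	dm_block_t new_root;
 *	int r;
 *
 *	r = dm_btree_insert(&info, root, &key, &value_le, &new_root);
 *	if (!r)
 *		root = new_root;
 */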

/*----------------------------------------------------------------*/

static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest,
		    uint64_t *result_key, dm_block_t *next_block)
{
	int i, r;
	uint32_t flags;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		flags = le32_to_cpu(ro_node(s)->header.flags);
		i = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (!i)
			return -ENODATA;
		else
			i--;

		if (find_highest)
			*result_key = le64_to_cpu(ro_node(s)->keys[i]);
		else
			*result_key = le64_to_cpu(ro_node(s)->keys[0]);

		if (next_block || flags & INTERNAL_NODE) {
			if (find_highest)
				block = value64(ro_node(s), i);
			else
				block = value64(ro_node(s), 0);
		}

	} while (flags & INTERNAL_NODE);

	if (next_block)
		*next_block = block;
	return 0;
}

static int dm_btree_find_key(struct dm_btree_info *info, dm_block_t root,
			     bool find_highest, uint64_t *result_keys)
{
	int r = 0, count = 0, level;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		r = find_key(&spine, root, find_highest, result_keys + level,
			     level == info->levels - 1 ? NULL : &root);
		if (r == -ENODATA) {
			r = 0;
			break;

		} else if (r)
			break;

		count++;
	}
	exit_ro_spine(&spine);

	return r ? r : count;
}

int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
			      uint64_t *result_keys)
{
	return dm_btree_find_key(info, root, true, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);

int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root,
			     uint64_t *result_keys)
{
	return dm_btree_find_key(info, root, false, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key);
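
/*
 * Example (an illustrative sketch): computing the key range currently
 * held in a one-level tree.  The returned count is the number of levels
 * for which a key was found, so 0 means the tree is empty.
 *
 *	uint64_t lo, hi;
 *
 *	if (dm_btree_find_lowest_key(&info, root, &lo) == 1 &&
 *	    dm_btree_find_highest_key(&info, root, &hi) == 1)
 *		pr_info("keys span [%llu, %llu]\n",
 *			(unsigned long long)lo, (unsigned long long)hi);
 */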

/*----------------------------------------------------------------*/

/*
 * FIXME: We shouldn't use a recursive algorithm when we have limited stack
 * space.  Also this only works for single level trees.
 */
static int walk_node(struct dm_btree_info *info, dm_block_t block,
		     int (*fn)(void *context, uint64_t *keys, void *leaf),
		     void *context)
{
	int r;
	unsigned i, nr;
	struct dm_block *node;
	struct btree_node *n;
	uint64_t keys;

	r = bn_read_lock(info, block, &node);
	if (r)
		return r;

	n = dm_block_data(node);

	nr = le32_to_cpu(n->header.nr_entries);
	for (i = 0; i < nr; i++) {
		if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) {
			r = walk_node(info, value64(n, i), fn, context);
			if (r)
				goto out;
		} else {
			keys = le64_to_cpu(*key_ptr(n, i));
			r = fn(context, &keys, value_ptr(n, i));
			if (r)
				goto out;
		}
	}

out:
	dm_tm_unlock(info->tm, node);
	return r;
}

int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
		  int (*fn)(void *context, uint64_t *keys, void *leaf),
		  void *context)
{
	BUG_ON(info->levels > 1);
	return walk_node(info, root, fn, context);
}
EXPORT_SYMBOL_GPL(dm_btree_walk);
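
/*
 * Example (an illustrative sketch): counting the entries in a
 * single-level tree with dm_btree_walk().  The callback name below is
 * hypothetical.
 *
 *	static int count_entry(void *context, uint64_t *keys, void *leaf)
 *	{
 *		(*(unsigned *) context)++;
 *		return 0;
 *	}
 *
 *	unsigned count = 0;
 *	int r = dm_btree_walk(&info, root, count_entry, &count);
 */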

/*----------------------------------------------------------------*/

static void prefetch_values(struct dm_btree_cursor *c)
{
	unsigned i, nr;
	__le64 value_le;
	struct cursor_node *n = c->nodes + c->depth - 1;
	struct btree_node *bn = dm_block_data(n->b);
	struct dm_block_manager *bm = dm_tm_get_bm(c->info->tm);

	BUG_ON(c->info->value_type.size != sizeof(value_le));

	nr = le32_to_cpu(bn->header.nr_entries);
	for (i = 0; i < nr; i++) {
		memcpy(&value_le, value_ptr(bn, i), sizeof(value_le));
		dm_bm_prefetch(bm, le64_to_cpu(value_le));
	}
}

static bool leaf_node(struct dm_btree_cursor *c)
{
	struct cursor_node *n = c->nodes + c->depth - 1;
	struct btree_node *bn = dm_block_data(n->b);

	return le32_to_cpu(bn->header.flags) & LEAF_NODE;
}

static int push_node(struct dm_btree_cursor *c, dm_block_t b)
{
	int r;
	struct cursor_node *n = c->nodes + c->depth;

	if (c->depth >= DM_BTREE_CURSOR_MAX_DEPTH - 1) {
		DMERR("couldn't push cursor node, stack depth too high");
		return -EINVAL;
	}

	r = bn_read_lock(c->info, b, &n->b);
	if (r)
		return r;

	n->index = 0;
	c->depth++;

	if (c->prefetch_leaves || !leaf_node(c))
		prefetch_values(c);

	return 0;
}

static void pop_node(struct dm_btree_cursor *c)
{
	c->depth--;
	unlock_block(c->info, c->nodes[c->depth].b);
}

static int inc_or_backtrack(struct dm_btree_cursor *c)
{
	struct cursor_node *n;
	struct btree_node *bn;

	for (;;) {
		if (!c->depth)
			return -ENODATA;

		n = c->nodes + c->depth - 1;
		bn = dm_block_data(n->b);

		n->index++;
		if (n->index < le32_to_cpu(bn->header.nr_entries))
			break;

		pop_node(c);
	}

	return 0;
}

static int find_leaf(struct dm_btree_cursor *c)
{
	int r = 0;
	struct cursor_node *n;
	struct btree_node *bn;
	__le64 value_le;

	for (;;) {
		n = c->nodes + c->depth - 1;
		bn = dm_block_data(n->b);

		if (le32_to_cpu(bn->header.flags) & LEAF_NODE)
			break;

		memcpy(&value_le, value_ptr(bn, n->index), sizeof(value_le));
		r = push_node(c, le64_to_cpu(value_le));
		if (r) {
			DMERR("push_node failed");
			break;
		}
	}

	if (!r && (le32_to_cpu(bn->header.nr_entries) == 0))
		return -ENODATA;

	return r;
}

int dm_btree_cursor_begin(struct dm_btree_info *info, dm_block_t root,
			  bool prefetch_leaves, struct dm_btree_cursor *c)
{
	int r;

	c->info = info;
	c->root = root;
	c->depth = 0;
	c->prefetch_leaves = prefetch_leaves;

	r = push_node(c, root);
	if (r)
		return r;

	return find_leaf(c);
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_begin);

void dm_btree_cursor_end(struct dm_btree_cursor *c)
{
	while (c->depth)
		pop_node(c);
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_end);

int dm_btree_cursor_next(struct dm_btree_cursor *c)
{
	int r = inc_or_backtrack(c);
	if (!r) {
		r = find_leaf(c);
		if (r)
			DMERR("find_leaf failed");
	}

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_next);

int dm_btree_cursor_skip(struct dm_btree_cursor *c, uint32_t count)
{
	int r = 0;

	while (count-- && !r)
		r = dm_btree_cursor_next(c);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_skip);

int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le)
{
	if (c->depth) {
		struct cursor_node *n = c->nodes + c->depth - 1;
		struct btree_node *bn = dm_block_data(n->b);

		if (le32_to_cpu(bn->header.flags) & INTERNAL_NODE)
			return -EINVAL;

		*key = le64_to_cpu(*key_ptr(bn, n->index));
		memcpy(value_le, value_ptr(bn, n->index), c->info->value_type.size);
		return 0;

	} else
		return -ENODATA;
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_get_value);
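
/*
 * Example (an illustrative sketch): iterating every entry of a
 * single-level tree of __le64 values with the cursor API.
 *
 *	struct dm_btree_cursor c;
 *	uint64_t key;
 *	__le64 value_le;
 *	int r;
 *
 *	r = dm_btree_cursor_begin(&info, root, false, &c);
 *	while (!r) {
 *		r = dm_btree_cursor_get_value(&c, &key, &value_le);
 *		if (r)
 *			break;
 *
 *		pr_info("%llu -> %llu\n", (unsigned long long)key,
 *			(unsigned long long)le64_to_cpu(value_le));
 *
 *		r = dm_btree_cursor_next(&c);
 *	}
 *	dm_btree_cursor_end(&c);
 */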