/*
 * Copyright (C) 2011 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-btree-internal.h"
#include "dm-space-map.h"
#include "dm-transaction-manager.h"

#include <linux/export.h>
#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "btree"

/*----------------------------------------------------------------
 * Array manipulation
 *--------------------------------------------------------------*/
static void memcpy_disk(void *dest, const void *src, size_t len)
	__dm_written_to_disk(src)
{
	memcpy(dest, src, len);
	__dm_unbless_for_disk(src);
}

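/*
 * Inserts @elt into a dense array of @nr_elts elements at @index,
 * shuffling the tail of the array up by one slot first.
 */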
static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
			 unsigned index, void *elt)
	__dm_written_to_disk(elt)
{
	if (index < nr_elts)
		memmove(base + (elt_size * (index + 1)),
			base + (elt_size * index),
			(nr_elts - index) * elt_size);

	memcpy_disk(base + (elt_size * index), elt, elt_size);
}

/*----------------------------------------------------------------*/

/* makes the assumption that no two keys are the same. */
static int bsearch(struct btree_node *n, uint64_t key, int want_hi)
{
	int lo = -1, hi = le32_to_cpu(n->header.nr_entries);

	while (hi - lo > 1) {
		int mid = lo + ((hi - lo) / 2);
		uint64_t mid_key = le64_to_cpu(n->keys[mid]);

		if (mid_key == key)
			return mid;

		if (mid_key < key)
			lo = mid;
		else
			hi = mid;
	}

	return want_hi ? hi : lo;
}

int lower_bound(struct btree_node *n, uint64_t key)
{
	return bsearch(n, key, 0);
}

static int upper_bound(struct btree_node *n, uint64_t key)
{
	return bsearch(n, key, 1);
}

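/*
 * Bumps the reference count on every child of @n: block references for
 * an internal node, value-type references (if the type has an inc op)
 * for a leaf.
 */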
void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
		  struct dm_btree_value_type *vt)
{
	unsigned i;
	uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);

	if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
		for (i = 0; i < nr_entries; i++)
			dm_tm_inc(tm, value64(n, i));
	else if (vt->inc)
		for (i = 0; i < nr_entries; i++)
			vt->inc(vt->context, value_ptr(n, i));
}

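/*
 * Inserts a (key, value) pair into @node at @index, keeping the keys
 * array and the value array in step.  Fails if the node is already full.
 */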
static int insert_at(size_t value_size, struct btree_node *node, unsigned index,
		     uint64_t key, void *value)
	__dm_written_to_disk(value)
{
	uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
	__le64 key_le = cpu_to_le64(key);

	if (index > nr_entries ||
	    index >= le32_to_cpu(node->header.max_entries)) {
		DMERR("too many entries in btree node for insert");
		__dm_unbless_for_disk(value);
		return -ENOMEM;
	}

	__dm_bless_for_disk(&key_le);

	array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
	array_insert(value_base(node), value_size, nr_entries, index, value);
	node->header.nr_entries = cpu_to_le32(nr_entries + 1);

	return 0;
}

/*----------------------------------------------------------------*/

/*
 * We want 3n entries (for some n).  This works more nicely for repeated
 * insert/remove loops than (2n + 1) would.
 */
static uint32_t calc_max_entries(size_t value_size, size_t block_size)
{
	uint32_t total, n;
	size_t elt_size = sizeof(uint64_t) + value_size; /* key + value */

	block_size -= sizeof(struct node_header);
	total = block_size / elt_size;
	n = total / 3;		/* rounds down */

	return 3 * n;
}

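/*
 * Creates a new btree consisting of a single, empty leaf node.
 */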
int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
{
	int r;
	struct dm_block *b;
	struct btree_node *n;
	size_t block_size;
	uint32_t max_entries;

	r = new_block(info, &b);
	if (r < 0)
		return r;

	block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
	max_entries = calc_max_entries(info->value_type.size, block_size);

	n = dm_block_data(b);
	memset(n, 0, block_size);
	n->header.flags = cpu_to_le32(LEAF_NODE);
	n->header.nr_entries = cpu_to_le32(0);
	n->header.max_entries = cpu_to_le32(max_entries);
	n->header.value_size = cpu_to_le32(info->value_type.size);

	*root = dm_block_location(b);
	unlock_block(info, b);

	return 0;
}
EXPORT_SYMBOL_GPL(dm_btree_empty);

/*----------------------------------------------------------------*/

/*
 * Deletion walks the tree iteratively with an explicit stack of frames
 * (kernel stack space is limited, so we avoid recursion).
 */
#define MAX_SPINE_DEPTH 64
struct frame {
	struct dm_block *b;
	struct btree_node *n;
	unsigned level;
	unsigned nr_children;
	unsigned current_child;
};

struct del_stack {
	struct dm_btree_info *info;
	struct dm_transaction_manager *tm;
	int top;
	struct frame spine[MAX_SPINE_DEPTH];
};

static int top_frame(struct del_stack *s, struct frame **f)
{
	if (s->top < 0) {
		DMERR("btree deletion stack empty");
		return -EINVAL;
	}

	*f = s->spine + s->top;

	return 0;
}

static int unprocessed_frames(struct del_stack *s)
{
	return s->top >= 0;
}

static void prefetch_children(struct del_stack *s, struct frame *f)
{
	unsigned i;
	struct dm_block_manager *bm = dm_tm_get_bm(s->tm);

	for (i = 0; i < f->nr_children; i++)
		dm_bm_prefetch(bm, value64(f->n, i));
}

static bool is_internal_level(struct dm_btree_info *info, struct frame *f)
{
	return f->level < (info->levels - 1);
}

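/*
 * If the node is shared we just decrement its reference count; only
 * nodes we own outright get pushed so their children can be processed.
 */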
static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
{
	int r;
	uint32_t ref_count;

	if (s->top >= MAX_SPINE_DEPTH - 1) {
		DMERR("btree deletion stack out of memory");
		return -ENOMEM;
	}

	r = dm_tm_ref(s->tm, b, &ref_count);
	if (r)
		return r;

	if (ref_count > 1)
		/*
		 * This is a shared node, so we can just decrement its
		 * reference counter and leave the children.
		 */
		dm_tm_dec(s->tm, b);

	else {
		uint32_t flags;
		struct frame *f = s->spine + ++s->top;

		r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
		if (r) {
			s->top--;
			return r;
		}

		f->n = dm_block_data(f->b);
		f->level = level;
		f->nr_children = le32_to_cpu(f->n->header.nr_entries);
		f->current_child = 0;

		flags = le32_to_cpu(f->n->header.flags);
		if (flags & INTERNAL_NODE || is_internal_level(s->info, f))
			prefetch_children(s, f);
	}

	return 0;
}

static void pop_frame(struct del_stack *s)
{
	struct frame *f = s->spine + s->top--;

	dm_tm_dec(s->tm, dm_block_location(f->b));
	dm_tm_unlock(s->tm, f->b);
}

static void unlock_all_frames(struct del_stack *s)
{
	struct frame *f;

	while (unprocessed_frames(s)) {
		f = s->spine + s->top--;
		dm_tm_unlock(s->tm, f->b);
	}
}

int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
{
	int r;
	struct del_stack *s;

	/*
	 * dm_btree_del() is called via an ioctl, as such should be
	 * considered an FS op.  We can't recurse back into the FS, so we
	 * allocate GFP_NOFS.
	 */
	s = kmalloc(sizeof(*s), GFP_NOFS);
	if (!s)
		return -ENOMEM;
	s->info = info;
	s->tm = info->tm;
	s->top = -1;

	r = push_frame(s, root, 0);
	if (r)
		goto out;

	while (unprocessed_frames(s)) {
		uint32_t flags;
		struct frame *f;
		dm_block_t b;

		r = top_frame(s, &f);
		if (r)
			goto out;

		if (f->current_child >= f->nr_children) {
			pop_frame(s);
			continue;
		}

		flags = le32_to_cpu(f->n->header.flags);
		if (flags & INTERNAL_NODE) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level);
			if (r)
				goto out;

		} else if (is_internal_level(info, f)) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level + 1);
			if (r)
				goto out;

		} else {
			if (info->value_type.dec) {
				unsigned i;

				for (i = 0; i < f->nr_children; i++)
					info->value_type.dec(info->value_type.context,
							     value_ptr(f->n, i));
			}
			pop_frame(s);
		}
	}
out:
	if (r) {
		/* cleanup all frames of del_stack */
		unlock_all_frames(s);
	}
	kfree(s);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_del);

/*----------------------------------------------------------------*/

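/*
 * Walks one btree level from @block down to a leaf, using @search_fn to
 * pick the entry to follow at each node.  On success the found key and
 * a copy of its value are returned.
 */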
static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
			    int (*search_fn)(struct btree_node *, uint64_t),
			    uint64_t *result_key, void *v, size_t value_size)
{
	int i, r;
	uint32_t flags, nr_entries;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		i = search_fn(ro_node(s), key);

		flags = le32_to_cpu(ro_node(s)->header.flags);
		nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (i < 0 || i >= nr_entries)
			return -ENODATA;

		if (flags & INTERNAL_NODE)
			block = value64(ro_node(s), i);

	} while (!(flags & LEAF_NODE));

	*result_key = le64_to_cpu(ro_node(s)->keys[i]);
	memcpy(v, value_ptr(ro_node(s), i), value_size);

	return 0;
}

int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value_le)
{
	unsigned level, last_level = info->levels - 1;
	int r = -ENODATA;
	uint64_t rkey;
	__le64 internal_value_le;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		size_t size;
		void *value_p;

		if (level == last_level) {
			value_p = value_le;
			size = info->value_type.size;

		} else {
			value_p = &internal_value_le;
			size = sizeof(uint64_t);
		}

		r = btree_lookup_raw(&spine, root, keys[level],
				     lower_bound, &rkey,
				     value_p, size);

		if (!r) {
			if (rkey != keys[level]) {
				exit_ro_spine(&spine);
				return -ENODATA;
			}
		} else {
			exit_ro_spine(&spine);
			return r;
		}

		root = le64_to_cpu(internal_value_le);
	}
	exit_ro_spine(&spine);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup);

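/*
 * Finds the entry with the smallest bottom-level key that is >= @key
 * within a single btree level, retrying the next subtree if the first
 * candidate comes back empty.
 */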
static int dm_btree_lookup_next_single(struct dm_btree_info *info, dm_block_t root,
				       uint64_t key, uint64_t *rkey, void *value_le)
{
	int r, i;
	uint32_t flags, nr_entries;
	struct dm_block *node;
	struct btree_node *n;

	r = bn_read_lock(info, root, &node);
	if (r)
		return r;

	n = dm_block_data(node);
	flags = le32_to_cpu(n->header.flags);
	nr_entries = le32_to_cpu(n->header.nr_entries);

	if (flags & INTERNAL_NODE) {
		i = lower_bound(n, key);
		if (i < 0) {
			/*
			 * avoid early -ENODATA return when all entries are
			 * higher than the search @key.
			 */
			i = 0;
		}
		if (i >= nr_entries) {
			r = -ENODATA;
			goto out;
		}

		r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
		if (r == -ENODATA && i < (nr_entries - 1)) {
			i++;
			r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
		}

	} else {
		i = upper_bound(n, key);
		if (i < 0 || i >= nr_entries) {
			r = -ENODATA;
			goto out;
		}

		*rkey = le64_to_cpu(n->keys[i]);
		memcpy(value_le, value_ptr(n, i), info->value_type.size);
	}
out:
	dm_tm_unlock(info->tm, node);
	return r;
}

int dm_btree_lookup_next(struct dm_btree_info *info, dm_block_t root,
			 uint64_t *keys, uint64_t *rkey, void *value_le)
{
	unsigned level;
	int r = -ENODATA;
	__le64 internal_value_le;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels - 1u; level++) {
		r = btree_lookup_raw(&spine, root, keys[level],
				     lower_bound, rkey,
				     &internal_value_le, sizeof(uint64_t));
		if (r)
			goto out;

		if (*rkey != keys[level]) {
			r = -ENODATA;
			goto out;
		}

		root = le64_to_cpu(internal_value_le);
	}

	r = dm_btree_lookup_next_single(info, root, keys[level], rkey, value_le);
out:
	exit_ro_spine(&spine);
	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup_next);

/*----------------------------------------------------------------*/

/*
 * Splits a node by creating a sibling node and shifting half the node's
 * contents across.  Assumes there is a parent node, and that it has room
 * for another child.  On return the spine holds whichever of the two
 * halves the key being inserted belongs in.
 */
static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index,
			       uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *parent;
	struct btree_node *ln, *rn, *pn;
	__le64 location;

	left = shadow_current(s);

	r = new_block(s->info, &right);
	if (r < 0)
		return r;

	ln = dm_block_data(left);
	rn = dm_block_data(right);

	nr_left = le32_to_cpu(ln->header.nr_entries) / 2;
	nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left;

	ln->header.nr_entries = cpu_to_le32(nr_left);

	rn->header.flags = ln->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = ln->header.max_entries;
	rn->header.value_size = ln->header.value_size;
	memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0]));

	size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
		sizeof(uint64_t) : s->info->value_type.size;
	memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left),
	       size * nr_right);

	/*
	 * Patch up the parent.
	 */
	parent = shadow_parent(s);

	pn = dm_block_data(parent);
	location = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&location);
	memcpy_disk(value_ptr(pn, parent_index),
		    &location, sizeof(__le64));

	location = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&location);

	r = insert_at(sizeof(__le64), pn, parent_index + 1,
		      le64_to_cpu(rn->keys[0]), &location);
	if (r) {
		unlock_block(s->info, right);
		return r;
	}

	if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else {
		unlock_block(s->info, left);
		s->nodes[1] = right;
	}

	return 0;
}

/*
 * Splits a node by creating two new children beneath the given node.
 * This is used when the root fills: the original block stays in place as
 * the new internal parent (so the caller's root location is unchanged),
 * and its old contents are divided between two freshly allocated children.
 */
static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *new_parent;
	struct btree_node *pn, *ln, *rn;
	__le64 val;

	new_parent = shadow_current(s);

	r = new_block(s->info, &left);
	if (r < 0)
		return r;

	r = new_block(s->info, &right);
	if (r < 0) {
		unlock_block(s->info, left);
		return r;
	}

	pn = dm_block_data(new_parent);
	ln = dm_block_data(left);
	rn = dm_block_data(right);

	nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
	nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;

	ln->header.flags = pn->header.flags;
	ln->header.nr_entries = cpu_to_le32(nr_left);
	ln->header.max_entries = pn->header.max_entries;
	ln->header.value_size = pn->header.value_size;

	rn->header.flags = pn->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = pn->header.max_entries;
	rn->header.value_size = pn->header.value_size;

	memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
	memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));

	size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
		sizeof(__le64) : s->info->value_type.size;
	memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
	memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
	       nr_right * size);

	/* new_parent should just point to l and r now */
	pn->header.flags = cpu_to_le32(INTERNAL_NODE);
	pn->header.nr_entries = cpu_to_le32(2);
	pn->header.max_entries = cpu_to_le32(
		calc_max_entries(sizeof(__le64),
				 dm_bm_block_size(
					 dm_tm_get_bm(s->info->tm))));
	pn->header.value_size = cpu_to_le32(sizeof(__le64));

	val = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&val);
	pn->keys[0] = ln->keys[0];
	memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));

	val = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&val);
	pn->keys[1] = rn->keys[0];
	memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));

	/*
	 * rejig the spine.  This is ugly, since it knows too
	 * much about the spine
	 */
	if (s->nodes[0] != new_parent) {
		unlock_block(s->info, s->nodes[0]);
		s->nodes[0] = new_parent;
	}
	if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else {
		unlock_block(s->info, left);
		s->nodes[1] = right;
	}
	s->count = 2;

	return 0;
}

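/*
 * Descends from @root to the leaf that should hold @key, shadowing each
 * node on the way down and splitting any node that is already full.  On
 * exit *index is where the key lives, or where it should be inserted.
 */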
static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
			    struct dm_btree_value_type *vt,
			    uint64_t key, unsigned *index)
{
	int r, i = *index, top = 1;
	struct btree_node *node;

	for (;;) {
		r = shadow_step(s, root, vt);
		if (r < 0)
			return r;

		node = dm_block_data(shadow_current(s));

		/*
		 * We have to patch up the parent node, ugly, but I don't
		 * see a way to do this automatically as part of the spine
		 * op.
		 */
		if (shadow_has_parent(s) && i >= 0) {
			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

			__dm_bless_for_disk(&location);
			memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
				    &location, sizeof(__le64));
		}

		node = dm_block_data(shadow_current(s));

		if (node->header.nr_entries == node->header.max_entries) {
			if (top)
				r = btree_split_beneath(s, key);
			else
				r = btree_split_sibling(s, i, key);

			if (r < 0)
				return r;
		}

		node = dm_block_data(shadow_current(s));

		i = lower_bound(node, key);

		if (le32_to_cpu(node->header.flags) & LEAF_NODE)
			break;

		if (i < 0) {
			/* change the bounds on the lowest key */
			node->keys[0] = cpu_to_le64(key);
			i = 0;
		}

		root = value64(node, i);
		top = 0;
	}

	if (i < 0 || le64_to_cpu(node->keys[i]) != key)
		i++;

	*index = i;
	return 0;
}

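/* A new entry is needed unless @index already holds exactly this key. */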
static bool need_insert(struct btree_node *node, uint64_t *keys,
			unsigned level, unsigned index)
{
	return ((index >= le32_to_cpu(node->header.nr_entries)) ||
		(le64_to_cpu(node->keys[index]) != keys[level]));
}

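/*
 * Inserts, or overwrites, a value.  For multi-level btrees the inner
 * levels hold the roots of nested btrees as their values; an empty
 * nested tree is created on demand for any missing inner key.
 */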
static int insert(struct dm_btree_info *info, dm_block_t root,
		  uint64_t *keys, void *value, dm_block_t *new_root,
		  int *inserted)
		  __dm_written_to_disk(value)
{
	int r;
	unsigned level, index = -1, last_level = info->levels - 1;
	dm_block_t block = root;
	struct shadow_spine spine;
	struct btree_node *n;
	struct dm_btree_value_type le64_type;

	init_le64_type(info->tm, &le64_type);
	init_shadow_spine(&spine, info);

	for (level = 0; level < (info->levels - 1); level++) {
		r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
		if (r < 0)
			goto bad;

		n = dm_block_data(shadow_current(&spine));

		if (need_insert(n, keys, level, index)) {
			dm_block_t new_tree;
			__le64 new_le;

			r = dm_btree_empty(info, &new_tree);
			if (r < 0)
				goto bad;

			new_le = cpu_to_le64(new_tree);
			__dm_bless_for_disk(&new_le);

			r = insert_at(sizeof(uint64_t), n, index,
				      keys[level], &new_le);
			if (r)
				goto bad;
		}

		if (level < last_level)
			block = value64(n, index);
	}

	r = btree_insert_raw(&spine, block, &info->value_type,
			     keys[level], &index);
	if (r < 0)
		goto bad;

	n = dm_block_data(shadow_current(&spine));

	if (need_insert(n, keys, level, index)) {
		if (inserted)
			*inserted = 1;

		r = insert_at(info->value_type.size, n, index,
			      keys[level], value);
		if (r)
			goto bad_unblessed;
	} else {
		if (inserted)
			*inserted = 0;

		if (info->value_type.dec &&
		    (!info->value_type.equal ||
		     !info->value_type.equal(
			     info->value_type.context,
			     value_ptr(n, index),
			     value))) {
			info->value_type.dec(info->value_type.context,
					     value_ptr(n, index));
		}
		memcpy_disk(value_ptr(n, index),
			    value, info->value_type.size);
	}

	*new_root = shadow_root(&spine);
	exit_shadow_spine(&spine);

	return 0;

bad:
	__dm_unbless_for_disk(value);
bad_unblessed:
	exit_shadow_spine(&spine);
	return r;
}

int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value, dm_block_t *new_root)
		    __dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, NULL);
}
EXPORT_SYMBOL_GPL(dm_btree_insert);

int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
			   uint64_t *keys, void *value, dm_block_t *new_root,
			   int *inserted)
			   __dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, inserted);
}
EXPORT_SYMBOL_GPL(dm_btree_insert_notify);

/*----------------------------------------------------------------*/

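/*
 * Walks from @block to a leaf, always taking the last (or first) entry,
 * to find the highest (or lowest) key in one btree level.
 */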
static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest,
		    uint64_t *result_key, dm_block_t *next_block)
{
	int i, r;
	uint32_t flags;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		flags = le32_to_cpu(ro_node(s)->header.flags);
		i = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (!i)
			return -ENODATA;
		else
			i--;

		if (find_highest)
			*result_key = le64_to_cpu(ro_node(s)->keys[i]);
		else
			*result_key = le64_to_cpu(ro_node(s)->keys[0]);

		if (next_block || flags & INTERNAL_NODE) {
			if (find_highest)
				block = value64(ro_node(s), i);
			else
				block = value64(ro_node(s), 0);
		}

	} while (flags & INTERNAL_NODE);

	if (next_block)
		*next_block = block;
	return 0;
}

static int dm_btree_find_key(struct dm_btree_info *info, dm_block_t root,
			     bool find_highest, uint64_t *result_keys)
{
	int r = 0, count = 0, level;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		r = find_key(&spine, root, find_highest, result_keys + level,
			     level == info->levels - 1 ? NULL : &root);
		if (r == -ENODATA) {
			r = 0;
			break;

		} else if (r)
			break;

		count++;
	}
	exit_ro_spine(&spine);

	return r ? r : count;
}

int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
			      uint64_t *result_keys)
{
	return dm_btree_find_key(info, root, true, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);

int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root,
			     uint64_t *result_keys)
{
	return dm_btree_find_key(info, root, false, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key);

/*----------------------------------------------------------------*/

/*
 * FIXME: We shouldn't use a recursive algorithm when we have limited stack
 * space.  Also this only works for single level trees.
 */
static int walk_node(struct dm_btree_info *info, dm_block_t block,
		     int (*fn)(void *context, uint64_t *keys, void *leaf),
		     void *context)
{
	int r;
	unsigned i, nr;
	struct dm_block *node;
	struct btree_node *n;
	uint64_t keys;

	r = bn_read_lock(info, block, &node);
	if (r)
		return r;

	n = dm_block_data(node);

	nr = le32_to_cpu(n->header.nr_entries);
	for (i = 0; i < nr; i++) {
		if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) {
			r = walk_node(info, value64(n, i), fn, context);
			if (r)
				goto out;
		} else {
			keys = le64_to_cpu(*key_ptr(n, i));
			r = fn(context, &keys, value_ptr(n, i));
			if (r)
				goto out;
		}
	}

out:
	dm_tm_unlock(info->tm, node);
	return r;
}

int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
		  int (*fn)(void *context, uint64_t *keys, void *leaf),
		  void *context)
{
	BUG_ON(info->levels > 1);
	return walk_node(info, root, fn, context);
}
EXPORT_SYMBOL_GPL(dm_btree_walk);

/*----------------------------------------------------------------*/

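/*
 * Cursor API: iterates through the entries of a single-level btree in
 * key order.  Internal node values are always prefetched; leaf values
 * are only prefetched when the cursor was created with prefetch_leaves
 * set, which requires 8-byte values since they are treated as block
 * addresses.
 */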
static void prefetch_values(struct dm_btree_cursor *c)
{
	unsigned i, nr;
	__le64 value_le;
	struct cursor_node *n = c->nodes + c->depth - 1;
	struct btree_node *bn = dm_block_data(n->b);
	struct dm_block_manager *bm = dm_tm_get_bm(c->info->tm);

	BUG_ON(c->info->value_type.size != sizeof(value_le));

	nr = le32_to_cpu(bn->header.nr_entries);
	for (i = 0; i < nr; i++) {
		memcpy(&value_le, value_ptr(bn, i), sizeof(value_le));
		dm_bm_prefetch(bm, le64_to_cpu(value_le));
	}
}

static bool leaf_node(struct dm_btree_cursor *c)
{
	struct cursor_node *n = c->nodes + c->depth - 1;
	struct btree_node *bn = dm_block_data(n->b);

	return le32_to_cpu(bn->header.flags) & LEAF_NODE;
}

static int push_node(struct dm_btree_cursor *c, dm_block_t b)
{
	int r;
	struct cursor_node *n = c->nodes + c->depth;

	if (c->depth >= DM_BTREE_CURSOR_MAX_DEPTH - 1) {
		DMERR("couldn't push cursor node, stack depth too high");
		return -EINVAL;
	}

	r = bn_read_lock(c->info, b, &n->b);
	if (r)
		return r;

	n->index = 0;
	c->depth++;

	if (c->prefetch_leaves || !leaf_node(c))
		prefetch_values(c);

	return 0;
}

static void pop_node(struct dm_btree_cursor *c)
{
	c->depth--;
	unlock_block(c->info, c->nodes[c->depth].b);
}

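/*
 * Advances the index in the current node, popping exhausted nodes off
 * the stack as needed.  Returns -ENODATA when the tree is exhausted.
 */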
static int inc_or_backtrack(struct dm_btree_cursor *c)
{
	struct cursor_node *n;
	struct btree_node *bn;

	for (;;) {
		if (!c->depth)
			return -ENODATA;

		n = c->nodes + c->depth - 1;
		bn = dm_block_data(n->b);

		n->index++;
		if (n->index < le32_to_cpu(bn->header.nr_entries))
			break;

		pop_node(c);
	}

	return 0;
}

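/*
 * Follows the current index down to a leaf node, pushing each visited
 * node onto the cursor's stack.
 */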
static int find_leaf(struct dm_btree_cursor *c)
{
	int r = 0;
	struct cursor_node *n;
	struct btree_node *bn;
	__le64 value_le;

	for (;;) {
		n = c->nodes + c->depth - 1;
		bn = dm_block_data(n->b);

		if (le32_to_cpu(bn->header.flags) & LEAF_NODE)
			break;

		memcpy(&value_le, value_ptr(bn, n->index), sizeof(value_le));
		r = push_node(c, le64_to_cpu(value_le));
		if (r) {
			DMERR("push_node failed");
			break;
		}
	}

	if (!r && (le32_to_cpu(bn->header.nr_entries) == 0))
		return -ENODATA;

	return r;
}

int dm_btree_cursor_begin(struct dm_btree_info *info, dm_block_t root,
			  bool prefetch_leaves, struct dm_btree_cursor *c)
{
	int r;

	c->info = info;
	c->root = root;
	c->depth = 0;
	c->prefetch_leaves = prefetch_leaves;

	r = push_node(c, root);
	if (r)
		return r;

	return find_leaf(c);
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_begin);

void dm_btree_cursor_end(struct dm_btree_cursor *c)
{
	while (c->depth)
		pop_node(c);
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_end);

int dm_btree_cursor_next(struct dm_btree_cursor *c)
{
	int r = inc_or_backtrack(c);
	if (!r) {
		r = find_leaf(c);
		if (r)
			DMERR("find_leaf failed");
	}

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_next);

int dm_btree_cursor_skip(struct dm_btree_cursor *c, uint32_t count)
{
	int r = 0;

	while (count-- && !r)
		r = dm_btree_cursor_next(c);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_skip);

int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le)
{
	if (c->depth) {
		struct cursor_node *n = c->nodes + c->depth - 1;
		struct btree_node *bn = dm_block_data(n->b);

		if (le32_to_cpu(bn->header.flags) & INTERNAL_NODE)
			return -EINVAL;

		*key = le64_to_cpu(*key_ptr(bn, n->index));
		memcpy(value_le, value_ptr(bn, n->index), c->info->value_type.size);
		return 0;

	} else
		return -ENODATA;
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_get_value);