/*
 * Copyright (C) 2011 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-btree-internal.h"
#include "dm-space-map.h"
#include "dm-transaction-manager.h"

#include <linux/export.h>
#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "btree"

/*----------------------------------------------------------------
 * Array manipulation
 *--------------------------------------------------------------*/
static void memcpy_disk(void *dest, const void *src, size_t len)
	__dm_written_to_disk(src)
{
	memcpy(dest, src, len);
	__dm_unbless_for_disk(src);
}

static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
			 unsigned index, void *elt)
	__dm_written_to_disk(elt)
{
	if (index < nr_elts)
		memmove(base + (elt_size * (index + 1)),
			base + (elt_size * index),
			(nr_elts - index) * elt_size);

	memcpy_disk(base + (elt_size * index), elt, elt_size);
}
/*----------------------------------------------------------------*/

/* makes the assumption that no two keys are the same. */
static int bsearch(struct btree_node *n, uint64_t key, int want_hi)
{
	int lo = -1, hi = le32_to_cpu(n->header.nr_entries);

	while (hi - lo > 1) {
		int mid = lo + ((hi - lo) / 2);
		uint64_t mid_key = le64_to_cpu(n->keys[mid]);

		if (mid_key == key)
			return mid;

		if (mid_key < key)
			lo = mid;
		else
			hi = mid;
	}

	return want_hi ? hi : lo;
}

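/*
 * Returns the index of the largest key that is <= @key, or -1 if every
 * key in the node is greater than @key.
 */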
int lower_bound(struct btree_node *n, uint64_t key)
{
	return bsearch(n, key, 0);
}

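/*
 * Returns the index of the smallest key that is >= @key; this may be
 * nr_entries if every key in the node is below @key.
 */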
static int upper_bound(struct btree_node *n, uint64_t key)
{
	return bsearch(n, key, 1);
}

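/*
 * Increments the reference counts of everything the node points to: block
 * references for internal nodes, the value type's own counts for leaves.
 * Used when a node has just been shadowed (copied).
 */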
void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
		  struct dm_btree_value_type *vt)
{
	uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);

	if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
		dm_tm_with_runs(tm, value_ptr(n, 0), nr_entries, dm_tm_inc_range);
	else if (vt->inc)
		vt->inc(vt->context, value_ptr(n, 0), nr_entries);
}

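/*
 * Inserts a (key, value) pair at @index in @node, shifting later entries
 * up.  @value must have been blessed for disk; on failure it is unblessed
 * before returning.
 */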
static int insert_at(size_t value_size, struct btree_node *node, unsigned index,
		     uint64_t key, void *value)
	__dm_written_to_disk(value)
{
	uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
	__le64 key_le = cpu_to_le64(key);

	if (index > nr_entries ||
	    index >= le32_to_cpu(node->header.max_entries)) {
		DMERR("too many entries in btree node for insert");
		__dm_unbless_for_disk(value);
		return -ENOMEM;
	}

	__dm_bless_for_disk(&key_le);

	array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
	array_insert(value_base(node), value_size, nr_entries, index, value);
	node->header.nr_entries = cpu_to_le32(nr_entries + 1);

	return 0;
}

/*----------------------------------------------------------------*/

/*
 * We want 3n entries (for some n).  This works more nicely with repeated
 * insert remove loops than (2n + 1).
 */
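/*
 * Example (sketch, assuming the 32-byte node_header of this on-disk
 * format): with 4096-byte blocks and 8-byte values an entry needs
 * 8 (key) + 8 (value) = 16 bytes, giving (4096 - 32) / 16 = 254 slots,
 * rounded down to the nearest multiple of three, 3 * 84 = 252 entries.
 */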
static uint32_t calc_max_entries(size_t value_size, size_t block_size)
{
	uint32_t total, n;
	size_t elt_size = sizeof(uint64_t) + value_size;

	block_size -= sizeof(struct node_header);
	total = block_size / elt_size;
	n = total / 3;

	return 3 * n;
}

int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
{
	int r;
	struct dm_block *b;
	struct btree_node *n;
	size_t block_size;
	uint32_t max_entries;

	r = new_block(info, &b);
	if (r < 0)
		return r;

	block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
	max_entries = calc_max_entries(info->value_type.size, block_size);

	n = dm_block_data(b);
	memset(n, 0, block_size);
	n->header.flags = cpu_to_le32(LEAF_NODE);
	n->header.nr_entries = cpu_to_le32(0);
	n->header.max_entries = cpu_to_le32(max_entries);
	n->header.value_size = cpu_to_le32(info->value_type.size);

	*root = dm_block_location(b);
	unlock_block(info, b);

	return 0;
}
EXPORT_SYMBOL_GPL(dm_btree_empty);

/*----------------------------------------------------------------*/

/*
 * Deletion is naturally expressed recursively; since kernel stack space
 * is limited we explicitly manage our own stack of frames on the heap.
 */
#define MAX_SPINE_DEPTH 64
struct frame {
	struct dm_block *b;
	struct btree_node *n;
	unsigned level;
	unsigned nr_children;
	unsigned current_child;
};

struct del_stack {
	struct dm_btree_info *info;
	struct dm_transaction_manager *tm;
	int top;
	struct frame spine[MAX_SPINE_DEPTH];
};

static int top_frame(struct del_stack *s, struct frame **f)
{
	if (s->top < 0) {
		DMERR("btree deletion stack empty");
		return -EINVAL;
	}

	*f = s->spine + s->top;

	return 0;
}

static int unprocessed_frames(struct del_stack *s)
{
	return s->top >= 0;
}

static void prefetch_children(struct del_stack *s, struct frame *f)
{
	unsigned i;
	struct dm_block_manager *bm = dm_tm_get_bm(s->tm);

	for (i = 0; i < f->nr_children; i++)
		dm_bm_prefetch(bm, value64(f->n, i));
}

static bool is_internal_level(struct dm_btree_info *info, struct frame *f)
{
	return f->level < (info->levels - 1);
}

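/*
 * Pushes a frame for block @b, taking a read lock and prefetching its
 * children.  If the block is shared (reference count > 1) we only drop
 * our reference; its children still belong to the other holders.
 */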
static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
{
	int r;
	uint32_t ref_count;

	if (s->top >= MAX_SPINE_DEPTH - 1) {
		DMERR("btree deletion stack out of memory");
		return -ENOMEM;
	}

	r = dm_tm_ref(s->tm, b, &ref_count);
	if (r)
		return r;

	if (ref_count > 1)
		/*
		 * This is a shared node, so we can just decrement its
		 * reference counter and leave the children.
		 */
		dm_tm_dec(s->tm, b);

	else {
		uint32_t flags;
		struct frame *f = s->spine + ++s->top;

		r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
		if (r) {
			s->top--;
			return r;
		}

		f->n = dm_block_data(f->b);
		f->level = level;
		f->nr_children = le32_to_cpu(f->n->header.nr_entries);
		f->current_child = 0;

		flags = le32_to_cpu(f->n->header.flags);
		if (flags & INTERNAL_NODE || is_internal_level(s->info, f))
			prefetch_children(s, f);
	}

	return 0;
}
249
250static void pop_frame(struct del_stack *s)
251{
252 struct frame *f = s->spine + s->top--;
253
254 dm_tm_dec(s->tm, dm_block_location(f->b));
255 dm_tm_unlock(s->tm, f->b);
256}
257
258static void unlock_all_frames(struct del_stack *s)
259{
260 struct frame *f;
261
262 while (unprocessed_frames(s)) {
263 f = s->spine + s->top--;
264 dm_tm_unlock(s->tm, f->b);
265 }
266}
267
268int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
269{
270 int r;
271 struct del_stack *s;
272
	/*
	 * dm_btree_del() is called via an ioctl, as such should be
	 * considered an FS op.  We can't recurse back into the FS, so we
	 * allocate GFP_NOFS.
	 */
	s = kmalloc(sizeof(*s), GFP_NOFS);
	if (!s)
		return -ENOMEM;
	s->info = info;
	s->tm = info->tm;
	s->top = -1;

	r = push_frame(s, root, 0);
	if (r)
		goto out;

	while (unprocessed_frames(s)) {
		uint32_t flags;
		struct frame *f;
		dm_block_t b;

		r = top_frame(s, &f);
		if (r)
			goto out;

		if (f->current_child >= f->nr_children) {
			pop_frame(s);
			continue;
		}

		flags = le32_to_cpu(f->n->header.flags);
		if (flags & INTERNAL_NODE) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level);
			if (r)
				goto out;

		} else if (is_internal_level(info, f)) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level + 1);
			if (r)
				goto out;

		} else {
			if (info->value_type.dec)
				info->value_type.dec(info->value_type.context,
						     value_ptr(f->n, 0), f->nr_children);
			pop_frame(s);
		}
	}
out:
	if (r) {
		/* cleanup all frames of del_stack */
		unlock_all_frames(s);
	}
	kfree(s);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_del);

/*----------------------------------------------------------------*/
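/*
 * Walks from @block down to a leaf, using @search_fn to pick an entry at
 * each level, and copies out the key (and optionally the value) found.
 */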
static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
			    int (*search_fn)(struct btree_node *, uint64_t),
			    uint64_t *result_key, void *v, size_t value_size)
{
	int i, r;
	uint32_t flags, nr_entries;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		i = search_fn(ro_node(s), key);

		flags = le32_to_cpu(ro_node(s)->header.flags);
		nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (i < 0 || i >= nr_entries)
			return -ENODATA;

		if (flags & INTERNAL_NODE)
			block = value64(ro_node(s), i);

	} while (!(flags & LEAF_NODE));

	*result_key = le64_to_cpu(ro_node(s)->keys[i]);
	if (v)
		memcpy(v, value_ptr(ro_node(s), i), value_size);

	return 0;
}

int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value_le)
{
	unsigned level, last_level = info->levels - 1;
	int r = -ENODATA;
	uint64_t rkey;
	__le64 internal_value_le;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		size_t size;
		void *value_p;

		if (level == last_level) {
			value_p = value_le;
			size = info->value_type.size;
		} else {
			value_p = &internal_value_le;
			size = sizeof(uint64_t);
		}

		r = btree_lookup_raw(&spine, root, keys[level],
				     lower_bound, &rkey,
				     value_p, size);

		if (!r) {
			if (rkey != keys[level]) {
				exit_ro_spine(&spine);
				return -ENODATA;
			}
		} else {
			exit_ro_spine(&spine);
			return r;
		}

		root = le64_to_cpu(internal_value_le);
	}
	exit_ro_spine(&spine);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup);

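/*
 * Returns the first (key, value) in the (single level) tree at @root with
 * key >= @key, recursing into a later sibling if the first candidate
 * subtree turns out to hold nothing at or above @key.
 */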
static int dm_btree_lookup_next_single(struct dm_btree_info *info, dm_block_t root,
				       uint64_t key, uint64_t *rkey, void *value_le)
{
	int r, i;
	uint32_t flags, nr_entries;
	struct dm_block *node;
	struct btree_node *n;

	r = bn_read_lock(info, root, &node);
	if (r)
		return r;

	n = dm_block_data(node);
	flags = le32_to_cpu(n->header.flags);
	nr_entries = le32_to_cpu(n->header.nr_entries);

	if (flags & INTERNAL_NODE) {
		i = lower_bound(n, key);
		if (i < 0) {
			/*
			 * avoid early -ENODATA return when all entries are
			 * higher than the search @key.
			 */
			i = 0;
		}
		if (i >= nr_entries) {
			r = -ENODATA;
			goto out;
		}

		r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
		if (r == -ENODATA && i < (nr_entries - 1)) {
			i++;
			r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
		}

	} else {
		i = upper_bound(n, key);
		if (i < 0 || i >= nr_entries) {
			r = -ENODATA;
			goto out;
		}

		*rkey = le64_to_cpu(n->keys[i]);
		memcpy(value_le, value_ptr(n, i), info->value_type.size);
	}
out:
	dm_tm_unlock(info->tm, node);
	return r;
}

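/*
 * Walks the first info->levels - 1 levels looking for an exact match on
 * each key, then finds the first entry with key >= keys[last level] in
 * the bottom level tree.
 */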
int dm_btree_lookup_next(struct dm_btree_info *info, dm_block_t root,
			 uint64_t *keys, uint64_t *rkey, void *value_le)
{
	unsigned level;
	int r = -ENODATA;
	__le64 internal_value_le;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels - 1u; level++) {
		r = btree_lookup_raw(&spine, root, keys[level],
				     lower_bound, rkey,
				     &internal_value_le, sizeof(uint64_t));
		if (r)
			goto out;

		if (*rkey != keys[level]) {
			r = -ENODATA;
			goto out;
		}

		root = le64_to_cpu(internal_value_le);
	}

	r = dm_btree_lookup_next_single(info, root, keys[level], rkey, value_le);
out:
	exit_ro_spine(&spine);
	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup_next);

/*----------------------------------------------------------------*/

/*
 * Copies entries from one region of a btree node to another.  The regions
 * must not overlap.
 */
static void copy_entries(struct btree_node *dest, unsigned dest_offset,
			 struct btree_node *src, unsigned src_offset,
			 unsigned count)
{
	size_t value_size = le32_to_cpu(dest->header.value_size);

	memcpy(dest->keys + dest_offset, src->keys + src_offset, count * sizeof(uint64_t));
	memcpy(value_ptr(dest, dest_offset), value_ptr(src, src_offset), count * value_size);
}

/*
 * Moves entries from one region of a btree node to another.  The regions
 * may overlap.
 */
static void move_entries(struct btree_node *dest, unsigned dest_offset,
			 struct btree_node *src, unsigned src_offset,
			 unsigned count)
{
	size_t value_size = le32_to_cpu(dest->header.value_size);

	memmove(dest->keys + dest_offset, src->keys + src_offset, count * sizeof(uint64_t));
	memmove(value_ptr(dest, dest_offset), value_ptr(src, src_offset), count * value_size);
}

/*
 * Erases the first 'count' entries of a btree node, shifting following
 * entries down into their place.
 */
static void shift_down(struct btree_node *n, unsigned count)
{
	move_entries(n, 0, n, count, le32_to_cpu(n->header.nr_entries) - count);
}

/*
 * Moves entries in a btree node up 'count' places, making space for
 * new entries at the start of the node.
 */
static void shift_up(struct btree_node *n, unsigned count)
{
	move_entries(n, count, n, 0, le32_to_cpu(n->header.nr_entries));
}

/*
 * Redistributes entries between two btree nodes to make them
 * have similar numbers of entries.
 */
static void redistribute2(struct btree_node *left, struct btree_node *right)
{
	unsigned nr_left = le32_to_cpu(left->header.nr_entries);
	unsigned nr_right = le32_to_cpu(right->header.nr_entries);
	unsigned total = nr_left + nr_right;
	unsigned target_left = total / 2;
	unsigned target_right = total - target_left;

	if (nr_left < target_left) {
		unsigned delta = target_left - nr_left;
		copy_entries(left, nr_left, right, 0, delta);
		shift_down(right, delta);
	} else if (nr_left > target_left) {
		unsigned delta = nr_left - target_left;
		if (nr_right)
			shift_up(right, delta);
		copy_entries(right, 0, left, target_left, delta);
	}

	left->header.nr_entries = cpu_to_le32(target_left);
	right->header.nr_entries = cpu_to_le32(target_right);
}

/*
 * Redistribute entries between three nodes.  Assumes the central
 * node is empty.
 */
static void redistribute3(struct btree_node *left, struct btree_node *center,
			  struct btree_node *right)
{
	unsigned nr_left = le32_to_cpu(left->header.nr_entries);
	unsigned nr_center = le32_to_cpu(center->header.nr_entries);
	unsigned nr_right = le32_to_cpu(right->header.nr_entries);
	unsigned total, target_left, target_center, target_right;

	BUG_ON(nr_center);

	total = nr_left + nr_right;
	target_left = total / 3;
	target_center = (total - target_left) / 2;
	target_right = (total - target_left - target_center);

	if (nr_left < target_left) {
		unsigned left_short = target_left - nr_left;
		copy_entries(left, nr_left, right, 0, left_short);
		copy_entries(center, 0, right, left_short, target_center);
		shift_down(right, nr_right - target_right);

	} else if (nr_left < (target_left + target_center)) {
		unsigned left_to_center = nr_left - target_left;
		copy_entries(center, 0, left, target_left, left_to_center);
		copy_entries(center, left_to_center, right, 0, target_center - left_to_center);
		shift_down(right, nr_right - target_right);

	} else {
		unsigned right_short = target_right - nr_right;
		shift_up(right, right_short);
		copy_entries(right, 0, left, nr_left - right_short, right_short);
		copy_entries(center, 0, left, target_left, nr_left - target_left);
	}

	left->header.nr_entries = cpu_to_le32(target_left);
	center->header.nr_entries = cpu_to_le32(target_center);
	right->header.nr_entries = cpu_to_le32(target_right);
}

/*
 * Splits a node by creating a sibling node and shifting half the node's
 * contents across.  Assumes there is a parent node, and it has room for
 * another child.
 *
 * Before:
 *	  +--------+
 *	  | Parent |
 *	  +--------+
 *	     |
 *	     v
 *	+----------+
 *	| A ++++++ |
 *	+----------+
 *
 * After:
 *		+--------+
 *		| Parent |
 *		+--------+
 *		  |	|
 *		  v	+------+
 *	    +---------+	       |
 *	    | A* +++  |	       v
 *	    +---------+	 +-------+
 *			 | B +++ |
 *			 +-------+
 *
 * Where A* is a shadow of A.
 */
static int split_one_into_two(struct shadow_spine *s, unsigned parent_index,
			      struct dm_btree_value_type *vt, uint64_t key)
{
	int r;
	struct dm_block *left, *right, *parent;
	struct btree_node *ln, *rn, *pn;
	__le64 location;

	left = shadow_current(s);

	r = new_block(s->info, &right);
	if (r < 0)
		return r;

	ln = dm_block_data(left);
	rn = dm_block_data(right);

	rn->header.flags = ln->header.flags;
	rn->header.nr_entries = cpu_to_le32(0);
	rn->header.max_entries = ln->header.max_entries;
	rn->header.value_size = ln->header.value_size;
	redistribute2(ln, rn);

	/* patch up the parent */
	parent = shadow_parent(s);
	pn = dm_block_data(parent);

	location = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&location);
	r = insert_at(sizeof(__le64), pn, parent_index + 1,
		      le64_to_cpu(rn->keys[0]), &location);
	if (r) {
		unlock_block(s->info, right);
		return r;
	}

	/* patch up the spine: keep hold of whichever node @key falls in */
	if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else {
		unlock_block(s->info, left);
		s->nodes[1] = right;
	}

	return 0;
}

/*
 * We often need to modify a sibling node.  This function shadows a
 * particular child of the given parent node, making sure to update the
 * parent to point to the new shadow.
 */
static int shadow_child(struct dm_btree_info *info, struct dm_btree_value_type *vt,
			struct btree_node *parent, unsigned index,
			struct dm_block **result)
{
	int r, inc;
	dm_block_t root;
	struct btree_node *node;

	root = value64(parent, index);

	r = dm_tm_shadow_block(info->tm, root, &btree_node_validator,
			       result, &inc);
	if (r)
		return r;

	node = dm_block_data(*result);

	if (inc)
		inc_children(info->tm, node, vt);

	*((__le64 *) value_ptr(parent, index)) =
		cpu_to_le64(dm_block_location(*result));

	return 0;
}

/*
 * Splits two nodes into three.  This is more work, but results in fuller
 * nodes, so saves metadata space.
 */
static int split_two_into_three(struct shadow_spine *s, unsigned parent_index,
				struct dm_btree_value_type *vt, uint64_t key)
{
	int r;
	unsigned middle_index;
	struct dm_block *left, *middle, *right, *parent;
	struct btree_node *ln, *rn, *mn, *pn;
	__le64 location;

	parent = shadow_parent(s);
	pn = dm_block_data(parent);

	if (parent_index == 0) {
		middle_index = 1;
		left = shadow_current(s);
		r = shadow_child(s->info, vt, pn, parent_index + 1, &right);
		if (r)
			return r;
	} else {
		middle_index = parent_index;
		right = shadow_current(s);
		r = shadow_child(s->info, vt, pn, parent_index - 1, &left);
		if (r)
			return r;
	}

	r = new_block(s->info, &middle);
	if (r < 0)
		return r;

	ln = dm_block_data(left);
	mn = dm_block_data(middle);
	rn = dm_block_data(right);

	mn->header.nr_entries = cpu_to_le32(0);
	mn->header.flags = ln->header.flags;
	mn->header.max_entries = ln->header.max_entries;
	mn->header.value_size = ln->header.value_size;

	redistribute3(ln, mn, rn);

	/* patch up the parent */
	pn->keys[middle_index] = rn->keys[0];
	location = cpu_to_le64(dm_block_location(middle));
	__dm_bless_for_disk(&location);
	r = insert_at(sizeof(__le64), pn, middle_index,
		      le64_to_cpu(mn->keys[0]), &location);
	if (r) {
		if (shadow_current(s) != left)
			unlock_block(s->info, left);

		unlock_block(s->info, middle);

		if (shadow_current(s) != right)
			unlock_block(s->info, right);

		return r;
	}

	/* patch up the spine: keep hold of whichever node @key falls in */
	if (key < le64_to_cpu(mn->keys[0])) {
		unlock_block(s->info, middle);
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, left);
		unlock_block(s->info, right);
		s->nodes[1] = middle;
	} else {
		unlock_block(s->info, left);
		unlock_block(s->info, middle);
		s->nodes[1] = right;
	}

	return 0;
}

/*
 * Splits a node by creating two new children beneath the given node.
 *
 * Before:
 *	  +----------+
 *	  | A ++++++ |
 *	  +----------+
 *
 * After:
 *	+------------+
 *	| A (shadow) |
 *	+------------+
 *	    |	|
 *   +------+	+----+
 *   |		     |
 *   v		     v
 * +-------+	 +-------+
 * | B +++ |	 | C +++ |
 * +-------+	 +-------+
 */
static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *new_parent;
	struct btree_node *pn, *ln, *rn;
	__le64 val;

	new_parent = shadow_current(s);

	pn = dm_block_data(new_parent);
	size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
		sizeof(__le64) : s->info->value_type.size;

	/* create & init the left block */
	r = new_block(s->info, &left);
	if (r < 0)
		return r;

	ln = dm_block_data(left);
	nr_left = le32_to_cpu(pn->header.nr_entries) / 2;

	ln->header.flags = pn->header.flags;
	ln->header.nr_entries = cpu_to_le32(nr_left);
	ln->header.max_entries = pn->header.max_entries;
	ln->header.value_size = pn->header.value_size;
	memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
	memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);

	/* create & init the right block */
	r = new_block(s->info, &right);
	if (r < 0) {
		unlock_block(s->info, left);
		return r;
	}

	rn = dm_block_data(right);
	nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;

	rn->header.flags = pn->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = pn->header.max_entries;
	rn->header.value_size = pn->header.value_size;
	memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));
	memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
	       nr_right * size);

	/* new_parent should just point to l and r now */
	pn->header.flags = cpu_to_le32(INTERNAL_NODE);
	pn->header.nr_entries = cpu_to_le32(2);
	pn->header.max_entries = cpu_to_le32(
		calc_max_entries(sizeof(__le64),
				 dm_bm_block_size(
					 dm_tm_get_bm(s->info->tm))));
	pn->header.value_size = cpu_to_le32(sizeof(__le64));

	val = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&val);
	pn->keys[0] = ln->keys[0];
	memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));

	val = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&val);
	pn->keys[1] = rn->keys[0];
	memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));

	unlock_block(s->info, left);
	unlock_block(s->info, right);
	return 0;
}

/*
 * Redistributes a node's entries with its left sibling.
 */
static int rebalance_left(struct shadow_spine *s, struct dm_btree_value_type *vt,
			  unsigned parent_index, uint64_t key)
{
	int r;
	struct dm_block *sib;
	struct btree_node *left, *right, *parent = dm_block_data(shadow_parent(s));

	r = shadow_child(s->info, vt, parent, parent_index - 1, &sib);
	if (r)
		return r;

	left = dm_block_data(sib);
	right = dm_block_data(shadow_current(s));
	redistribute2(left, right);
	*key_ptr(parent, parent_index) = right->keys[0];

	if (key < le64_to_cpu(right->keys[0])) {
		unlock_block(s->info, s->nodes[1]);
		s->nodes[1] = sib;
	} else {
		unlock_block(s->info, sib);
	}

	return 0;
}

/*
 * Redistributes a node's entries with its right sibling.
 */
static int rebalance_right(struct shadow_spine *s, struct dm_btree_value_type *vt,
			   unsigned parent_index, uint64_t key)
{
	int r;
	struct dm_block *sib;
	struct btree_node *left, *right, *parent = dm_block_data(shadow_parent(s));

	r = shadow_child(s->info, vt, parent, parent_index + 1, &sib);
	if (r)
		return r;

	left = dm_block_data(shadow_current(s));
	right = dm_block_data(sib);
	redistribute2(left, right);
	*key_ptr(parent, parent_index + 1) = right->keys[0];

	if (key < le64_to_cpu(right->keys[0])) {
		unlock_block(s->info, sib);
	} else {
		unlock_block(s->info, s->nodes[1]);
		s->nodes[1] = sib;
	}

	return 0;
}

/*
 * Returns the number of spare entries in a node.
 */
static int get_node_free_space(struct dm_btree_info *info, dm_block_t b, unsigned *space)
{
	int r;
	unsigned nr_entries;
	struct dm_block *block;
	struct btree_node *node;

	r = bn_read_lock(info, b, &block);
	if (r)
		return r;

	node = dm_block_data(block);
	nr_entries = le32_to_cpu(node->header.nr_entries);
	*space = le32_to_cpu(node->header.max_entries) - nr_entries;

	unlock_block(info, block);
	return 0;
}

/*
 * Make space in a node, either by moving some entries to a sibling,
 * or creating a new sibling node.  SPACE_THRESHOLD defines the minimum
 * number of free entries that must be in the sibling to make the move
 * worthwhile.  If the siblings are shared (eg, part of a snapshot),
 * then they are not touched, since this breaks sharing and so consumes
 * more space than we save.
 */
#define SPACE_THRESHOLD 8
static int rebalance_or_split(struct shadow_spine *s, struct dm_btree_value_type *vt,
			      unsigned parent_index, uint64_t key)
{
	int r;
	struct btree_node *parent = dm_block_data(shadow_parent(s));
	unsigned nr_parent = le32_to_cpu(parent->header.nr_entries);
	unsigned free_space;
	int left_shared = 0, right_shared = 0;

	/* Should we move entries to the left sibling? */
	if (parent_index > 0) {
		dm_block_t left_b = value64(parent, parent_index - 1);
		r = dm_tm_block_is_shared(s->info->tm, left_b, &left_shared);
		if (r)
			return r;

		if (!left_shared) {
			r = get_node_free_space(s->info, left_b, &free_space);
			if (r)
				return r;

			if (free_space >= SPACE_THRESHOLD)
				return rebalance_left(s, vt, parent_index, key);
		}
	}

	/* Should we move entries to the right sibling? */
	if (parent_index < (nr_parent - 1)) {
		dm_block_t right_b = value64(parent, parent_index + 1);
		r = dm_tm_block_is_shared(s->info->tm, right_b, &right_shared);
		if (r)
			return r;

		if (!right_shared) {
			r = get_node_free_space(s->info, right_b, &free_space);
			if (r)
				return r;

			if (free_space >= SPACE_THRESHOLD)
				return rebalance_right(s, vt, parent_index, key);
		}
	}

	/*
	 * We need to split the node.  Normally we split two nodes into
	 * three, but when inserting a sequence that is either
	 * monotonically increasing or decreasing it's better to split
	 * a single node into two.
	 */
	if (left_shared || right_shared || (nr_parent <= 2) ||
	    (parent_index == 0) || (parent_index + 1 == nr_parent)) {
		return split_one_into_two(s, parent_index, vt, key);
	} else {
		return split_two_into_three(s, parent_index, vt, key);
	}
}

/*
 * Does the node contain a particular key?
 */
static bool contains_key(struct btree_node *node, uint64_t key)
{
	int i = lower_bound(node, key);

	if (i >= 0 && le64_to_cpu(node->keys[i]) == key)
		return true;

	return false;
}

/*
 * In general we preemptively make sure there's a free entry in every
 * node on the spine when doing an insert.  But we can avoid that with
 * leaf nodes if we know it's an overwrite.
 */
static bool has_space_for_insert(struct btree_node *node, uint64_t key)
{
	if (node->header.nr_entries == node->header.max_entries) {
		if (le32_to_cpu(node->header.flags) & LEAF_NODE) {
			/* we can overwrite if the value is already present */
			return contains_key(node, key);
		}

		return false;
	}

	return true;
}

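/*
 * Descends the spine shadowing as it goes, preemptively making room in
 * every node (rebalancing or splitting full ones) so the final leaf is
 * guaranteed to have space.  On return *index is where @key lives, or
 * where it should be inserted.
 */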
static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
			    struct dm_btree_value_type *vt,
			    uint64_t key, unsigned *index)
{
	int r, i = *index, top = 1;
	struct btree_node *node;

	for (;;) {
		r = shadow_step(s, root, vt);
		if (r < 0)
			return r;

		node = dm_block_data(shadow_current(s));

		/*
		 * We have to patch up the parent node, ugly, but I don't
		 * see a way to do this automatically as part of the spine
		 * op.
		 */
		if (shadow_has_parent(s) && i >= 0) {
			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

			__dm_bless_for_disk(&location);
			memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
				    &location, sizeof(__le64));
		}

		node = dm_block_data(shadow_current(s));

		if (!has_space_for_insert(node, key)) {
			if (top)
				r = btree_split_beneath(s, key);
			else
				r = rebalance_or_split(s, vt, i, key);

			if (r < 0)
				return r;

			/* making space can cause the current node to change */
			node = dm_block_data(shadow_current(s));
		}

		i = lower_bound(node, key);

		if (le32_to_cpu(node->header.flags) & LEAF_NODE)
			break;

		if (i < 0) {
			/* change the bounds on the lowest key */
			node->keys[0] = cpu_to_le64(key);
			i = 0;
		}

		root = value64(node, i);
		top = 0;
	}

	if (i < 0 || le64_to_cpu(node->keys[i]) != key)
		i++;

	*index = i;
	return 0;
}
1142
1143static int __btree_get_overwrite_leaf(struct shadow_spine *s, dm_block_t root,
1144 uint64_t key, int *index)
1145{
1146 int r, i = -1;
1147 struct btree_node *node;
1148
1149 *index = 0;
1150 for (;;) {
1151 r = shadow_step(s, root, &s->info->value_type);
1152 if (r < 0)
1153 return r;
1154
1155 node = dm_block_data(shadow_current(s));
1156
1157
1158
1159
1160
1161
1162 if (shadow_has_parent(s) && i >= 0) {
1163 __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
1164
1165 __dm_bless_for_disk(&location);
1166 memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
1167 &location, sizeof(__le64));
1168 }
1169
1170 node = dm_block_data(shadow_current(s));
1171 i = lower_bound(node, key);
1172
1173 BUG_ON(i < 0);
1174 BUG_ON(i >= le32_to_cpu(node->header.nr_entries));
1175
1176 if (le32_to_cpu(node->header.flags) & LEAF_NODE) {
1177 if (key != le64_to_cpu(node->keys[i]))
1178 return -EINVAL;
1179 break;
1180 }
1181
1182 root = value64(node, i);
1183 }
1184
1185 *index = i;
1186 return 0;
1187}
1188
1189int btree_get_overwrite_leaf(struct dm_btree_info *info, dm_block_t root,
1190 uint64_t key, int *index,
1191 dm_block_t *new_root, struct dm_block **leaf)
1192{
1193 int r;
1194 struct shadow_spine spine;
1195
1196 BUG_ON(info->levels > 1);
1197 init_shadow_spine(&spine, info);
1198 r = __btree_get_overwrite_leaf(&spine, root, key, index);
1199 if (!r) {
1200 *new_root = shadow_root(&spine);
1201 *leaf = shadow_current(&spine);
1202
1203
1204
1205
1206
1207 spine.count--;
1208 }
1209 exit_shadow_spine(&spine);
1210
1211 return r;
1212}
1213
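/*
 * Returns true if keys[level] is not already present at @index, i.e. a
 * fresh entry must be inserted rather than an existing value overwritten.
 */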
static bool need_insert(struct btree_node *node, uint64_t *keys,
			unsigned level, unsigned index)
{
	return ((index >= le32_to_cpu(node->header.nr_entries)) ||
		(le64_to_cpu(node->keys[index]) != keys[level]));
}

static int insert(struct dm_btree_info *info, dm_block_t root,
		  uint64_t *keys, void *value, dm_block_t *new_root,
		  int *inserted)
	__dm_written_to_disk(value)
{
	int r;
	unsigned level, index = -1, last_level = info->levels - 1;
	dm_block_t block = root;
	struct shadow_spine spine;
	struct btree_node *n;
	struct dm_btree_value_type le64_type;

	init_le64_type(info->tm, &le64_type);
	init_shadow_spine(&spine, info);

	for (level = 0; level < (info->levels - 1); level++) {
		r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
		if (r < 0)
			goto bad;

		n = dm_block_data(shadow_current(&spine));

		if (need_insert(n, keys, level, index)) {
			dm_block_t new_tree;
			__le64 new_le;

			r = dm_btree_empty(info, &new_tree);
			if (r < 0)
				goto bad;

			new_le = cpu_to_le64(new_tree);
			__dm_bless_for_disk(&new_le);

			r = insert_at(sizeof(uint64_t), n, index,
				      keys[level], &new_le);
			if (r)
				goto bad;
		}

		if (level < last_level)
			block = value64(n, index);
	}

	r = btree_insert_raw(&spine, block, &info->value_type,
			     keys[level], &index);
	if (r < 0)
		goto bad;

	n = dm_block_data(shadow_current(&spine));

	if (need_insert(n, keys, level, index)) {
		if (inserted)
			*inserted = 1;

		r = insert_at(info->value_type.size, n, index,
			      keys[level], value);
		if (r)
			goto bad_unblessed;
	} else {
		if (inserted)
			*inserted = 0;

		if (info->value_type.dec &&
		    (!info->value_type.equal ||
		     !info->value_type.equal(
			     info->value_type.context,
			     value_ptr(n, index),
			     value))) {
			info->value_type.dec(info->value_type.context,
					     value_ptr(n, index), 1);
		}
		memcpy_disk(value_ptr(n, index),
			    value, info->value_type.size);
	}

	*new_root = shadow_root(&spine);
	exit_shadow_spine(&spine);

	return 0;

bad:
	__dm_unbless_for_disk(value);
bad_unblessed:
	exit_shadow_spine(&spine);
	return r;
}

int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value, dm_block_t *new_root)
	__dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, NULL);
}
EXPORT_SYMBOL_GPL(dm_btree_insert);

int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
			   uint64_t *keys, void *value, dm_block_t *new_root,
			   int *inserted)
	__dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, inserted);
}
EXPORT_SYMBOL_GPL(dm_btree_insert_notify);

/*----------------------------------------------------------------*/

static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest,
		    uint64_t *result_key, dm_block_t *next_block)
{
	int i, r;
	uint32_t flags;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		flags = le32_to_cpu(ro_node(s)->header.flags);
		i = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (!i)
			return -ENODATA;
		else
			i--;

		if (find_highest)
			*result_key = le64_to_cpu(ro_node(s)->keys[i]);
		else
			*result_key = le64_to_cpu(ro_node(s)->keys[0]);

		if (next_block || flags & INTERNAL_NODE) {
			if (find_highest)
				block = value64(ro_node(s), i);
			else
				block = value64(ro_node(s), 0);
		}

	} while (flags & INTERNAL_NODE);

	if (next_block)
		*next_block = block;
	return 0;
}

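/*
 * Finds the highest or lowest key in each level of the tree.  Returns
 * the number of levels for which a key was found, which may be fewer
 * than info->levels if a sub tree is empty.
 */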
static int dm_btree_find_key(struct dm_btree_info *info, dm_block_t root,
			     bool find_highest, uint64_t *result_keys)
{
	int r = 0, count = 0, level;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		r = find_key(&spine, root, find_highest, result_keys + level,
			     level == info->levels - 1 ? NULL : &root);
		if (r == -ENODATA) {
			r = 0;
			break;
		} else if (r)
			break;

		count++;
	}
	exit_ro_spine(&spine);

	return r ? r : count;
}

int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
			      uint64_t *result_keys)
{
	return dm_btree_find_key(info, root, true, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);

int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root,
			     uint64_t *result_keys)
{
	return dm_btree_find_key(info, root, false, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key);

/*----------------------------------------------------------------*/

/*
 * FIXME: We shouldn't use a recursive algorithm when we have limited stack
 * space.  Also this only works for single level trees.
 */
static int walk_node(struct dm_btree_info *info, dm_block_t block,
		     int (*fn)(void *context, uint64_t *keys, void *leaf),
		     void *context)
{
	int r;
	unsigned i, nr;
	struct dm_block *node;
	struct btree_node *n;
	uint64_t keys;

	r = bn_read_lock(info, block, &node);
	if (r)
		return r;

	n = dm_block_data(node);

	nr = le32_to_cpu(n->header.nr_entries);
	for (i = 0; i < nr; i++) {
		if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) {
			r = walk_node(info, value64(n, i), fn, context);
			if (r)
				goto out;
		} else {
			keys = le64_to_cpu(*key_ptr(n, i));
			r = fn(context, &keys, value_ptr(n, i));
			if (r)
				goto out;
		}
	}

out:
	dm_tm_unlock(info->tm, node);
	return r;
}

int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
		  int (*fn)(void *context, uint64_t *keys, void *leaf),
		  void *context)
{
	BUG_ON(info->levels > 1);
	return walk_node(info, root, fn, context);
}
EXPORT_SYMBOL_GPL(dm_btree_walk);

/*----------------------------------------------------------------*/

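/*
 * Issues prefetches for every block referenced by the cursor's current
 * node; only valid when the values are 8-byte block addresses.
 */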
static void prefetch_values(struct dm_btree_cursor *c)
{
	unsigned i, nr;
	__le64 value_le;
	struct cursor_node *n = c->nodes + c->depth - 1;
	struct btree_node *bn = dm_block_data(n->b);
	struct dm_block_manager *bm = dm_tm_get_bm(c->info->tm);

	BUG_ON(c->info->value_type.size != sizeof(value_le));

	nr = le32_to_cpu(bn->header.nr_entries);
	for (i = 0; i < nr; i++) {
		memcpy(&value_le, value_ptr(bn, i), sizeof(value_le));
		dm_bm_prefetch(bm, le64_to_cpu(value_le));
	}
}

static bool leaf_node(struct dm_btree_cursor *c)
{
	struct cursor_node *n = c->nodes + c->depth - 1;
	struct btree_node *bn = dm_block_data(n->b);

	return le32_to_cpu(bn->header.flags) & LEAF_NODE;
}

static int push_node(struct dm_btree_cursor *c, dm_block_t b)
{
	int r;
	struct cursor_node *n = c->nodes + c->depth;

	if (c->depth >= DM_BTREE_CURSOR_MAX_DEPTH - 1) {
		DMERR("couldn't push cursor node, stack depth too high");
		return -EINVAL;
	}

	r = bn_read_lock(c->info, b, &n->b);
	if (r)
		return r;

	n->index = 0;
	c->depth++;

	if (c->prefetch_leaves || !leaf_node(c))
		prefetch_values(c);

	return 0;
}

static void pop_node(struct dm_btree_cursor *c)
{
	c->depth--;
	unlock_block(c->info, c->nodes[c->depth].b);
}

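/*
 * Steps to the next entry in the current node, popping finished nodes
 * and advancing in their parents until something is left to visit.
 * Returns -ENODATA once the whole tree has been consumed.
 */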
static int inc_or_backtrack(struct dm_btree_cursor *c)
{
	struct cursor_node *n;
	struct btree_node *bn;

	for (;;) {
		if (!c->depth)
			return -ENODATA;

		n = c->nodes + c->depth - 1;
		bn = dm_block_data(n->b);

		n->index++;
		if (n->index < le32_to_cpu(bn->header.nr_entries))
			break;

		pop_node(c);
	}

	return 0;
}

static int find_leaf(struct dm_btree_cursor *c)
{
	int r = 0;
	struct cursor_node *n;
	struct btree_node *bn;
	__le64 value_le;

	for (;;) {
		n = c->nodes + c->depth - 1;
		bn = dm_block_data(n->b);

		if (le32_to_cpu(bn->header.flags) & LEAF_NODE)
			break;

		memcpy(&value_le, value_ptr(bn, n->index), sizeof(value_le));
		r = push_node(c, le64_to_cpu(value_le));
		if (r) {
			DMERR("push_node failed");
			break;
		}
	}

	if (!r && (le32_to_cpu(bn->header.nr_entries) == 0))
		return -ENODATA;

	return r;
}

int dm_btree_cursor_begin(struct dm_btree_info *info, dm_block_t root,
			  bool prefetch_leaves, struct dm_btree_cursor *c)
{
	int r;

	c->info = info;
	c->root = root;
	c->depth = 0;
	c->prefetch_leaves = prefetch_leaves;

	r = push_node(c, root);
	if (r)
		return r;

	return find_leaf(c);
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_begin);
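
/*
 * Typical read-only iteration over a single level tree (sketch; error
 * handling trimmed, and -ENODATA from _next() marks exhaustion):
 *
 *	struct dm_btree_cursor c;
 *	uint64_t key;
 *	__le64 value_le;
 *	int r;
 *
 *	r = dm_btree_cursor_begin(info, root, false, &c);
 *	while (!r) {
 *		r = dm_btree_cursor_get_value(&c, &key, &value_le);
 *		if (r)
 *			break;
 *		// ... use key / value_le ...
 *		r = dm_btree_cursor_next(&c);
 *	}
 *	dm_btree_cursor_end(&c);
 */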

void dm_btree_cursor_end(struct dm_btree_cursor *c)
{
	while (c->depth)
		pop_node(c);
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_end);

int dm_btree_cursor_next(struct dm_btree_cursor *c)
{
	int r = inc_or_backtrack(c);

	if (!r) {
		r = find_leaf(c);
		if (r)
			DMERR("find_leaf failed");
	}

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_next);

int dm_btree_cursor_skip(struct dm_btree_cursor *c, uint32_t count)
{
	int r = 0;

	while (count-- && !r)
		r = dm_btree_cursor_next(c);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_skip);

int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le)
{
	if (c->depth) {
		struct cursor_node *n = c->nodes + c->depth - 1;
		struct btree_node *bn = dm_block_data(n->b);

		if (le32_to_cpu(bn->header.flags) & INTERNAL_NODE)
			return -EINVAL;

		*key = le64_to_cpu(*key_ptr(bn, n->index));
		memcpy(value_le, value_ptr(bn, n->index), c->info->value_type.size);
		return 0;

	} else
		return -ENODATA;
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_get_value);