1
2
3
4
5
6
7#include "dm-btree-internal.h"
8#include "dm-space-map.h"
9#include "dm-transaction-manager.h"
10
11#include <linux/export.h>
12#include <linux/device-mapper.h>
13
14#define DM_MSG_PREFIX "btree"
15
16
17
18
/*
 * Copy @len bytes of on-disk formatted data from @src to @dest, then drop
 * the sparse "blessed for disk" annotation on @src (endianness tracking).
 */
static void memcpy_disk(void *dest, const void *src, size_t len)
	__dm_written_to_disk(src)
{
	memcpy(dest, src, len);
	__dm_unbless_for_disk(src);
}
25
26static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
27 unsigned index, void *elt)
28 __dm_written_to_disk(elt)
29{
30 if (index < nr_elts)
31 memmove(base + (elt_size * (index + 1)),
32 base + (elt_size * index),
33 (nr_elts - index) * elt_size);
34
35 memcpy_disk(base + (elt_size * index), elt, elt_size);
36}
37
38
39
40
41static int bsearch(struct btree_node *n, uint64_t key, int want_hi)
42{
43 int lo = -1, hi = le32_to_cpu(n->header.nr_entries);
44
45 while (hi - lo > 1) {
46 int mid = lo + ((hi - lo) / 2);
47 uint64_t mid_key = le64_to_cpu(n->keys[mid]);
48
49 if (mid_key == key)
50 return mid;
51
52 if (mid_key < key)
53 lo = mid;
54 else
55 hi = mid;
56 }
57
58 return want_hi ? hi : lo;
59}
60
/*
 * Index of the greatest entry with key <= @key (exact matches return their
 * own slot), or -1 when every key in the node is greater than @key.
 */
int lower_bound(struct btree_node *n, uint64_t key)
{
	return bsearch(n, key, 0);
}
65
66void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
67 struct dm_btree_value_type *vt)
68{
69 unsigned i;
70 uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
71
72 if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
73 for (i = 0; i < nr_entries; i++)
74 dm_tm_inc(tm, value64(n, i));
75 else if (vt->inc)
76 for (i = 0; i < nr_entries; i++)
77 vt->inc(vt->context, value_ptr(n, i));
78}
79
/*
 * Insert (@key, @value) at slot @index of @node, shifting later entries up.
 * The caller has blessed @value for disk; on the error path we must
 * unbless it ourselves since array_insert() will never see it.
 */
static int insert_at(size_t value_size, struct btree_node *node, unsigned index,
		     uint64_t key, void *value)
	__dm_written_to_disk(value)
{
	uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
	__le64 key_le = cpu_to_le64(key);

	if (index > nr_entries ||
	    index >= le32_to_cpu(node->header.max_entries)) {
		DMERR("too many entries in btree node for insert");
		/* value won't be consumed, so drop its disk annotation here */
		__dm_unbless_for_disk(value);
		return -ENOMEM;
	}

	__dm_bless_for_disk(&key_le);

	/* keys and values live in separate arrays; insert into both */
	array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
	array_insert(value_base(node), value_size, nr_entries, index, value);
	node->header.nr_entries = cpu_to_le32(nr_entries + 1);

	return 0;
}
102
103
104
105
106
107
108
109static uint32_t calc_max_entries(size_t value_size, size_t block_size)
110{
111 uint32_t total, n;
112 size_t elt_size = sizeof(uint64_t) + value_size;
113
114 block_size -= sizeof(struct node_header);
115 total = block_size / elt_size;
116 n = total / 3;
117
118 return 3 * n;
119}
120
/*
 * Create an empty btree consisting of a single leaf node, returning its
 * location in *root.  Returns 0 on success or a negative errno.
 */
int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
{
	int r;
	struct dm_block *b;
	struct btree_node *n;
	size_t block_size;
	uint32_t max_entries;

	r = new_block(info, &b);
	if (r < 0)
		return r;

	block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
	max_entries = calc_max_entries(info->value_type.size, block_size);

	/* zero the whole block, then fill in the leaf header */
	n = dm_block_data(b);
	memset(n, 0, block_size);
	n->header.flags = cpu_to_le32(LEAF_NODE);
	n->header.nr_entries = cpu_to_le32(0);
	n->header.max_entries = cpu_to_le32(max_entries);
	n->header.value_size = cpu_to_le32(info->value_type.size);

	*root = dm_block_location(b);
	return unlock_block(info, b);
}
EXPORT_SYMBOL_GPL(dm_btree_empty);
147
148
149
150
151
152
153
/*
 * Deletion walks the tree with an explicit stack of frames rather than
 * recursion (kernel stacks are small); one frame per node on the current
 * path.
 */
#define MAX_SPINE_DEPTH 64
struct frame {
	struct dm_block *b;		/* read-locked block for this frame */
	struct btree_node *n;		/* b's node data */
	unsigned level;			/* btree level (for multi-level trees) */
	unsigned nr_children;		/* nr entries in n */
	unsigned current_child;		/* next child slot to visit */
};

struct del_stack {
	struct dm_btree_info *info;
	struct dm_transaction_manager *tm;
	int top;			/* index of top frame; -1 when empty */
	struct frame spine[MAX_SPINE_DEPTH];
};
169
170static int top_frame(struct del_stack *s, struct frame **f)
171{
172 if (s->top < 0) {
173 DMERR("btree deletion stack empty");
174 return -EINVAL;
175 }
176
177 *f = s->spine + s->top;
178
179 return 0;
180}
181
/* Non-zero while frames remain on the deletion stack. */
static int unprocessed_frames(struct del_stack *s)
{
	return s->top >= 0;
}
186
/* Hint the block manager to start reading every child block of frame @f. */
static void prefetch_children(struct del_stack *s, struct frame *f)
{
	unsigned i;
	struct dm_block_manager *bm = dm_tm_get_bm(s->tm);

	for (i = 0; i < f->nr_children; i++)
		dm_bm_prefetch(bm, value64(f->n, i));
}
195
/*
 * True when frame @f sits above the bottom level of a multi-level btree,
 * i.e. its leaf values are the roots of subordinate btrees.
 */
static bool is_internal_level(struct dm_btree_info *info, struct frame *f)
{
	return f->level < (info->levels - 1);
}
200
/*
 * Push a frame for block @b at btree level @level.  Shared blocks
 * (ref count > 1) are not recursed into: dropping our reference is
 * sufficient since the subtree remains reachable from elsewhere.
 */
static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
{
	int r;
	uint32_t ref_count;

	if (s->top >= MAX_SPINE_DEPTH - 1) {
		DMERR("btree deletion stack out of memory");
		return -ENOMEM;
	}

	r = dm_tm_ref(s->tm, b, &ref_count);
	if (r)
		return r;

	if (ref_count > 1)
		/*
		 * This is a shared node, so we can just decrement its
		 * reference counter and leave the children alone.
		 */
		dm_tm_dec(s->tm, b);

	else {
		uint32_t flags;
		struct frame *f = s->spine + ++s->top;

		r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
		if (r) {
			/* undo the push on failure */
			s->top--;
			return r;
		}

		f->n = dm_block_data(f->b);
		f->level = level;
		f->nr_children = le32_to_cpu(f->n->header.nr_entries);
		f->current_child = 0;

		/* children will be visited next; warm the cache */
		flags = le32_to_cpu(f->n->header.flags);
		if (flags & INTERNAL_NODE || is_internal_level(s->info, f))
			prefetch_children(s, f);
	}

	return 0;
}
244
/*
 * Pop the top frame: drop the block's reference count and release its
 * read lock.
 */
static void pop_frame(struct del_stack *s)
{
	struct frame *f = s->spine + s->top--;

	dm_tm_dec(s->tm, dm_block_location(f->b));
	dm_tm_unlock(s->tm, f->b);
}
252
253int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
254{
255 int r;
256 struct del_stack *s;
257
258 s = kmalloc(sizeof(*s), GFP_NOIO);
259 if (!s)
260 return -ENOMEM;
261 s->info = info;
262 s->tm = info->tm;
263 s->top = -1;
264
265 r = push_frame(s, root, 0);
266 if (r)
267 goto out;
268
269 while (unprocessed_frames(s)) {
270 uint32_t flags;
271 struct frame *f;
272 dm_block_t b;
273
274 r = top_frame(s, &f);
275 if (r)
276 goto out;
277
278 if (f->current_child >= f->nr_children) {
279 pop_frame(s);
280 continue;
281 }
282
283 flags = le32_to_cpu(f->n->header.flags);
284 if (flags & INTERNAL_NODE) {
285 b = value64(f->n, f->current_child);
286 f->current_child++;
287 r = push_frame(s, b, f->level);
288 if (r)
289 goto out;
290
291 } else if (is_internal_level(info, f)) {
292 b = value64(f->n, f->current_child);
293 f->current_child++;
294 r = push_frame(s, b, f->level + 1);
295 if (r)
296 goto out;
297
298 } else {
299 if (info->value_type.dec) {
300 unsigned i;
301
302 for (i = 0; i < f->nr_children; i++)
303 info->value_type.dec(info->value_type.context,
304 value_ptr(f->n, i));
305 }
306 pop_frame(s);
307 }
308 }
309
310out:
311 kfree(s);
312 return r;
313}
314EXPORT_SYMBOL_GPL(dm_btree_del);
315
316
317
/*
 * Walk one btree level from @block down to a leaf, using @search_fn to
 * pick the slot at each node.  On success the key actually found is
 * stored in *result_key and @value_size bytes of its value are copied
 * into @v.  Returns -ENODATA when the search falls off either end of a
 * node, or a negative errno from the spine.
 */
static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
			    int (*search_fn)(struct btree_node *, uint64_t),
			    uint64_t *result_key, void *v, size_t value_size)
{
	int i, r;
	uint32_t flags, nr_entries;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		i = search_fn(ro_node(s), key);

		flags = le32_to_cpu(ro_node(s)->header.flags);
		nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (i < 0 || i >= nr_entries)
			return -ENODATA;

		/* internal node: the chosen value is the child block */
		if (flags & INTERNAL_NODE)
			block = value64(ro_node(s), i);

	} while (!(flags & LEAF_NODE));

	*result_key = le64_to_cpu(ro_node(s)->keys[i]);
	memcpy(v, value_ptr(ro_node(s), i), value_size);

	return 0;
}
347
348int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
349 uint64_t *keys, void *value_le)
350{
351 unsigned level, last_level = info->levels - 1;
352 int r = -ENODATA;
353 uint64_t rkey;
354 __le64 internal_value_le;
355 struct ro_spine spine;
356
357 init_ro_spine(&spine, info);
358 for (level = 0; level < info->levels; level++) {
359 size_t size;
360 void *value_p;
361
362 if (level == last_level) {
363 value_p = value_le;
364 size = info->value_type.size;
365
366 } else {
367 value_p = &internal_value_le;
368 size = sizeof(uint64_t);
369 }
370
371 r = btree_lookup_raw(&spine, root, keys[level],
372 lower_bound, &rkey,
373 value_p, size);
374
375 if (!r) {
376 if (rkey != keys[level]) {
377 exit_ro_spine(&spine);
378 return -ENODATA;
379 }
380 } else {
381 exit_ro_spine(&spine);
382 return r;
383 }
384
385 root = le64_to_cpu(internal_value_le);
386 }
387 exit_ro_spine(&spine);
388
389 return r;
390}
391EXPORT_SYMBOL_GPL(dm_btree_lookup);
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index,
424 uint64_t key)
425{
426 int r;
427 size_t size;
428 unsigned nr_left, nr_right;
429 struct dm_block *left, *right, *parent;
430 struct btree_node *ln, *rn, *pn;
431 __le64 location;
432
433 left = shadow_current(s);
434
435 r = new_block(s->info, &right);
436 if (r < 0)
437 return r;
438
439 ln = dm_block_data(left);
440 rn = dm_block_data(right);
441
442 nr_left = le32_to_cpu(ln->header.nr_entries) / 2;
443 nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left;
444
445 ln->header.nr_entries = cpu_to_le32(nr_left);
446
447 rn->header.flags = ln->header.flags;
448 rn->header.nr_entries = cpu_to_le32(nr_right);
449 rn->header.max_entries = ln->header.max_entries;
450 rn->header.value_size = ln->header.value_size;
451 memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0]));
452
453 size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
454 sizeof(uint64_t) : s->info->value_type.size;
455 memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left),
456 size * nr_right);
457
458
459
460
461 parent = shadow_parent(s);
462
463 pn = dm_block_data(parent);
464 location = cpu_to_le64(dm_block_location(left));
465 __dm_bless_for_disk(&location);
466 memcpy_disk(value_ptr(pn, parent_index),
467 &location, sizeof(__le64));
468
469 location = cpu_to_le64(dm_block_location(right));
470 __dm_bless_for_disk(&location);
471
472 r = insert_at(sizeof(__le64), pn, parent_index + 1,
473 le64_to_cpu(rn->keys[0]), &location);
474 if (r)
475 return r;
476
477 if (key < le64_to_cpu(rn->keys[0])) {
478 unlock_block(s->info, right);
479 s->nodes[1] = left;
480 } else {
481 unlock_block(s->info, left);
482 s->nodes[1] = right;
483 }
484
485 return 0;
486}
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
/*
 * Split a full root node by pushing its two halves down into a pair of
 * new child nodes, turning the root into a 2-entry internal node.  This
 * keeps the root's block number stable.  On return the spine holds the
 * new root plus whichever child covers @key.
 */
static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *new_parent;
	struct btree_node *pn, *ln, *rn;
	__le64 val;

	new_parent = shadow_current(s);

	r = new_block(s->info, &left);
	if (r < 0)
		return r;

	r = new_block(s->info, &right);
	if (r < 0) {
		/* don't leak the lock on the first child */
		unlock_block(s->info, left);
		return r;
	}

	pn = dm_block_data(new_parent);
	ln = dm_block_data(left);
	rn = dm_block_data(right);

	nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
	nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;

	/* both children inherit the old root's node type and geometry */
	ln->header.flags = pn->header.flags;
	ln->header.nr_entries = cpu_to_le32(nr_left);
	ln->header.max_entries = pn->header.max_entries;
	ln->header.value_size = pn->header.value_size;

	rn->header.flags = pn->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = pn->header.max_entries;
	rn->header.value_size = pn->header.value_size;

	memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
	memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));

	size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
		sizeof(__le64) : s->info->value_type.size;
	memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
	memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
	       nr_right * size);

	/* new_parent should just point to l and r now */
	pn->header.flags = cpu_to_le32(INTERNAL_NODE);
	pn->header.nr_entries = cpu_to_le32(2);
	pn->header.max_entries = cpu_to_le32(
		calc_max_entries(sizeof(__le64),
				 dm_bm_block_size(
					 dm_tm_get_bm(s->info->tm))));
	pn->header.value_size = cpu_to_le32(sizeof(__le64));

	val = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&val);
	pn->keys[0] = ln->keys[0];
	memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));

	val = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&val);
	pn->keys[1] = rn->keys[0];
	memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));

	/*
	 * Rejig the spine.  This is ugly, since it knows too much about
	 * the spine's internals.
	 */
	if (s->nodes[0] != new_parent) {
		unlock_block(s->info, s->nodes[0]);
		s->nodes[0] = new_parent;
	}
	if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else {
		unlock_block(s->info, left);
		s->nodes[1] = right;
	}
	s->count = 2;

	return 0;
}
594
/*
 * Walk (and shadow) one btree level from @root down to the leaf that
 * should contain @key, splitting any full node encountered on the way.
 * On success *index is the leaf slot where @key lives, or where it
 * should be inserted.
 */
static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
			    struct dm_btree_value_type *vt,
			    uint64_t key, unsigned *index)
{
	int r, i = *index, top = 1;
	struct btree_node *node;

	for (;;) {
		r = shadow_step(s, root, vt);
		if (r < 0)
			return r;

		node = dm_block_data(shadow_current(s));

		/*
		 * Shadowing may have relocated this node, so we have to
		 * patch up the parent's pointer to it.  Ugly, but there's
		 * no better solution.
		 */
		if (shadow_has_parent(s) && i >= 0) {
			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

			__dm_bless_for_disk(&location);
			memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
				    &location, sizeof(__le64));
		}

		node = dm_block_data(shadow_current(s));

		/* split a full node before descending (equality of le32 is safe) */
		if (node->header.nr_entries == node->header.max_entries) {
			if (top)
				r = btree_split_beneath(s, key);
			else
				r = btree_split_sibling(s, i, key);

			if (r < 0)
				return r;
		}

		/* splitting may have swapped in a different current node */
		node = dm_block_data(shadow_current(s));

		i = lower_bound(node, key);

		if (le32_to_cpu(node->header.flags) & LEAF_NODE)
			break;

		if (i < 0) {
			/* change the bounds on the lowest key */
			node->keys[0] = cpu_to_le64(key);
			i = 0;
		}

		root = value64(node, i);
		top = 0;
	}

	/* no exact match in the leaf: the key goes in the next slot up */
	if (i < 0 || le64_to_cpu(node->keys[i]) != key)
		i++;

	*index = i;
	return 0;
}
657
/*
 * Insert @value at @keys (one key per level), shadowing the path from
 * @root and returning the new root in *new_root.  When @inserted is
 * non-NULL it is set to 1 if a fresh entry was created, 0 if an existing
 * entry was overwritten.  @value arrives blessed for disk; on the error
 * path before it's consumed we must unbless it (the "bad" label), but
 * after insert_at() has consumed/unblessed it we jump to "bad_unblessed".
 */
static int insert(struct dm_btree_info *info, dm_block_t root,
		  uint64_t *keys, void *value, dm_block_t *new_root,
		  int *inserted)
	__dm_written_to_disk(value)
{
	int r, need_insert;
	unsigned level, index = -1, last_level = info->levels - 1;
	dm_block_t block = root;
	struct shadow_spine spine;
	struct btree_node *n;
	struct dm_btree_value_type le64_type;

	init_le64_type(info->tm, &le64_type);
	init_shadow_spine(&spine, info);

	/* internal levels hold the roots of the next level's btrees */
	for (level = 0; level < (info->levels - 1); level++) {
		r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
		if (r < 0)
			goto bad;

		n = dm_block_data(shadow_current(&spine));
		need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
			       (le64_to_cpu(n->keys[index]) != keys[level]));

		if (need_insert) {
			/* no subtree for this key yet: create an empty one */
			dm_block_t new_tree;
			__le64 new_le;

			r = dm_btree_empty(info, &new_tree);
			if (r < 0)
				goto bad;

			new_le = cpu_to_le64(new_tree);
			__dm_bless_for_disk(&new_le);

			r = insert_at(sizeof(uint64_t), n, index,
				      keys[level], &new_le);
			if (r)
				goto bad;
		}

		if (level < last_level)
			block = value64(n, index);
	}

	/* bottom level: insert the caller's value */
	r = btree_insert_raw(&spine, block, &info->value_type,
			     keys[level], &index);
	if (r < 0)
		goto bad;

	n = dm_block_data(shadow_current(&spine));
	need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
		       (le64_to_cpu(n->keys[index]) != keys[level]));

	if (need_insert) {
		if (inserted)
			*inserted = 1;

		r = insert_at(info->value_type.size, n, index,
			      keys[level], value);
		if (r)
			goto bad_unblessed;	/* insert_at unblessed value */
	} else {
		if (inserted)
			*inserted = 0;

		/* overwriting: drop the old value's reference first */
		if (info->value_type.dec &&
		    (!info->value_type.equal ||
		     !info->value_type.equal(
			     info->value_type.context,
			     value_ptr(n, index),
			     value))) {
			info->value_type.dec(info->value_type.context,
					     value_ptr(n, index));
		}
		memcpy_disk(value_ptr(n, index),
			    value, info->value_type.size);
	}

	*new_root = shadow_root(&spine);
	exit_shadow_spine(&spine);

	return 0;

bad:
	__dm_unbless_for_disk(value);
bad_unblessed:
	exit_shadow_spine(&spine);
	return r;
}
748
/*
 * Insert @value into the btree, overwriting any existing entry for the
 * key.  Returns the new root in *new_root.
 */
int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value, dm_block_t *new_root)
	__dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, NULL);
}
EXPORT_SYMBOL_GPL(dm_btree_insert);
756
/*
 * As dm_btree_insert(), but additionally reports via *inserted whether a
 * new entry was created (1) or an existing one overwritten (0).
 */
int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
			   uint64_t *keys, void *value, dm_block_t *new_root,
			   int *inserted)
	__dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, inserted);
}
EXPORT_SYMBOL_GPL(dm_btree_insert_notify);
765
766
767
768static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest,
769 uint64_t *result_key, dm_block_t *next_block)
770{
771 int i, r;
772 uint32_t flags;
773
774 do {
775 r = ro_step(s, block);
776 if (r < 0)
777 return r;
778
779 flags = le32_to_cpu(ro_node(s)->header.flags);
780 i = le32_to_cpu(ro_node(s)->header.nr_entries);
781 if (!i)
782 return -ENODATA;
783 else
784 i--;
785
786 if (find_highest)
787 *result_key = le64_to_cpu(ro_node(s)->keys[i]);
788 else
789 *result_key = le64_to_cpu(ro_node(s)->keys[0]);
790
791 if (next_block || flags & INTERNAL_NODE)
792 block = value64(ro_node(s), i);
793
794 } while (flags & INTERNAL_NODE);
795
796 if (next_block)
797 *next_block = block;
798 return 0;
799}
800
801static int dm_btree_find_key(struct dm_btree_info *info, dm_block_t root,
802 bool find_highest, uint64_t *result_keys)
803{
804 int r = 0, count = 0, level;
805 struct ro_spine spine;
806
807 init_ro_spine(&spine, info);
808 for (level = 0; level < info->levels; level++) {
809 r = find_key(&spine, root, find_highest, result_keys + level,
810 level == info->levels - 1 ? NULL : &root);
811 if (r == -ENODATA) {
812 r = 0;
813 break;
814
815 } else if (r)
816 break;
817
818 count++;
819 }
820 exit_ro_spine(&spine);
821
822 return r ? r : count;
823}
824
/* Fill result_keys[] with the highest key present at each btree level. */
int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
			      uint64_t *result_keys)
{
	return dm_btree_find_key(info, root, true, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);
831
/* Fill result_keys[] with the lowest key present at each btree level. */
int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root,
			     uint64_t *result_keys)
{
	return dm_btree_find_key(info, root, false, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key);
838
839
840
841
842
843
844
845static int walk_node(struct dm_btree_info *info, dm_block_t block,
846 int (*fn)(void *context, uint64_t *keys, void *leaf),
847 void *context)
848{
849 int r;
850 unsigned i, nr;
851 struct dm_block *node;
852 struct btree_node *n;
853 uint64_t keys;
854
855 r = bn_read_lock(info, block, &node);
856 if (r)
857 return r;
858
859 n = dm_block_data(node);
860
861 nr = le32_to_cpu(n->header.nr_entries);
862 for (i = 0; i < nr; i++) {
863 if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) {
864 r = walk_node(info, value64(n, i), fn, context);
865 if (r)
866 goto out;
867 } else {
868 keys = le64_to_cpu(*key_ptr(n, i));
869 r = fn(context, &keys, value_ptr(n, i));
870 if (r)
871 goto out;
872 }
873 }
874
875out:
876 dm_tm_unlock(info->tm, node);
877 return r;
878}
879
/*
 * Iterate over all leaf entries of a single-level btree in key order.
 * @fn must not alter the tree.  Multi-level btrees are not supported.
 */
int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
		  int (*fn)(void *context, uint64_t *keys, void *leaf),
		  void *context)
{
	BUG_ON(info->levels > 1);
	return walk_node(info, root, fn, context);
}
EXPORT_SYMBOL_GPL(dm_btree_walk);
888