1
2
3
4
5
6
7#include "dm-btree-internal.h"
8#include "dm-space-map.h"
9#include "dm-transaction-manager.h"
10
11#include <linux/export.h>
12#include <linux/device-mapper.h>
13
14#define DM_MSG_PREFIX "btree"
15
16
17
18
/*
 * memcpy() wrapper that consumes the sparse "blessed for disk" annotation
 * on @src: the caller blesses data destined for disk, this copies it and
 * unblesses so every bless/unbless pair balances under static checking.
 */
static void memcpy_disk(void *dest, const void *src, size_t len)
	__dm_written_to_disk(src)
{
	memcpy(dest, src, len);
	__dm_unbless_for_disk(src);
}
25
26static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
27 unsigned index, void *elt)
28 __dm_written_to_disk(elt)
29{
30 if (index < nr_elts)
31 memmove(base + (elt_size * (index + 1)),
32 base + (elt_size * index),
33 (nr_elts - index) * elt_size);
34
35 memcpy_disk(base + (elt_size * index), elt, elt_size);
36}
37
38
39
40
41static int bsearch(struct btree_node *n, uint64_t key, int want_hi)
42{
43 int lo = -1, hi = le32_to_cpu(n->header.nr_entries);
44
45 while (hi - lo > 1) {
46 int mid = lo + ((hi - lo) / 2);
47 uint64_t mid_key = le64_to_cpu(n->keys[mid]);
48
49 if (mid_key == key)
50 return mid;
51
52 if (mid_key < key)
53 lo = mid;
54 else
55 hi = mid;
56 }
57
58 return want_hi ? hi : lo;
59}
60
/*
 * Index of the largest key <= @key in @n, or -1 when every key in the
 * node is greater than @key.
 */
int lower_bound(struct btree_node *n, uint64_t key)
{
	return bsearch(n, key, 0);
}
65
66void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
67 struct dm_btree_value_type *vt)
68{
69 unsigned i;
70 uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
71
72 if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
73 for (i = 0; i < nr_entries; i++)
74 dm_tm_inc(tm, value64(n, i));
75 else if (vt->inc)
76 for (i = 0; i < nr_entries; i++)
77 vt->inc(vt->context, value_ptr(n, i));
78}
79
/*
 * Insert a (key, value) pair at @index in @node, shifting later entries
 * along.  @value must be blessed for disk; the bless is consumed by
 * array_insert() on success or explicitly unblessed on the error path.
 * Returns 0, or -ENOMEM if the node is full / @index is out of range.
 */
static int insert_at(size_t value_size, struct btree_node *node, unsigned index,
		     uint64_t key, void *value)
	__dm_written_to_disk(value)
{
	uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
	__le64 key_le = cpu_to_le64(key);

	if (index > nr_entries ||
	    index >= le32_to_cpu(node->header.max_entries)) {
		DMERR("too many entries in btree node for insert");
		/* keep the bless/unbless pairing balanced on failure */
		__dm_unbless_for_disk(value);
		return -ENOMEM;
	}

	/* keys are stored little-endian; bless the local copy for the write */
	__dm_bless_for_disk(&key_le);

	array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
	array_insert(value_base(node), value_size, nr_entries, index, value);
	node->header.nr_entries = cpu_to_le32(nr_entries + 1);

	return 0;
}
102
103
104
105
106
107
108
109static uint32_t calc_max_entries(size_t value_size, size_t block_size)
110{
111 uint32_t total, n;
112 size_t elt_size = sizeof(uint64_t) + value_size;
113
114 block_size -= sizeof(struct node_header);
115 total = block_size / elt_size;
116 n = total / 3;
117
118 return 3 * n;
119}
120
121int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
122{
123 int r;
124 struct dm_block *b;
125 struct btree_node *n;
126 size_t block_size;
127 uint32_t max_entries;
128
129 r = new_block(info, &b);
130 if (r < 0)
131 return r;
132
133 block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
134 max_entries = calc_max_entries(info->value_type.size, block_size);
135
136 n = dm_block_data(b);
137 memset(n, 0, block_size);
138 n->header.flags = cpu_to_le32(LEAF_NODE);
139 n->header.nr_entries = cpu_to_le32(0);
140 n->header.max_entries = cpu_to_le32(max_entries);
141 n->header.value_size = cpu_to_le32(info->value_type.size);
142
143 *root = dm_block_location(b);
144 return unlock_block(info, b);
145}
146EXPORT_SYMBOL_GPL(dm_btree_empty);
147
148
149
150
151
152
153
/*
 * Deletion walks the tree with an explicit stack of frames (the "spine")
 * rather than recursion, bounding kernel stack usage.
 */
#define MAX_SPINE_DEPTH 64
struct frame {
	struct dm_block *b;		/* read-locked block for this frame */
	struct btree_node *n;		/* cached dm_block_data(b) */
	unsigned level;			/* btree level this node belongs to */
	unsigned nr_children;		/* entry count captured at push time */
	unsigned current_child;		/* next child index to visit */
};

struct del_stack {
	struct dm_transaction_manager *tm;
	int top;			/* index of top frame; -1 when empty */
	struct frame spine[MAX_SPINE_DEPTH];
};
168
169static int top_frame(struct del_stack *s, struct frame **f)
170{
171 if (s->top < 0) {
172 DMERR("btree deletion stack empty");
173 return -EINVAL;
174 }
175
176 *f = s->spine + s->top;
177
178 return 0;
179}
180
/* Non-zero while frames remain on the deletion stack. */
static int unprocessed_frames(struct del_stack *s)
{
	return s->top >= 0;
}
185
/*
 * Push a frame for block @b at btree level @level so its children get
 * visited.  Copy-on-write sharing is handled here: a block whose
 * reference count is greater than one is still referenced elsewhere, so
 * we only drop our own reference and never descend into it.
 */
static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
{
	int r;
	uint32_t ref_count;

	if (s->top >= MAX_SPINE_DEPTH - 1) {
		DMERR("btree deletion stack out of memory");
		return -ENOMEM;
	}

	r = dm_tm_ref(s->tm, b, &ref_count);
	if (r)
		return r;

	if (ref_count > 1)
		/*
		 * This is a shared node, so we can just decrement its
		 * reference counter and leave the children alone.
		 */
		dm_tm_dec(s->tm, b);

	else {
		struct frame *f = s->spine + ++s->top;

		r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
		if (r) {
			/* undo the push; nothing was locked */
			s->top--;
			return r;
		}

		f->n = dm_block_data(f->b);
		f->level = level;
		f->nr_children = le32_to_cpu(f->n->header.nr_entries);
		f->current_child = 0;
	}

	return 0;
}
224
/*
 * Finished with the top frame: drop the block's reference count and then
 * release the read lock before discarding the frame.
 */
static void pop_frame(struct del_stack *s)
{
	struct frame *f = s->spine + s->top--;

	dm_tm_dec(s->tm, dm_block_location(f->b));
	dm_tm_unlock(s->tm, f->b);
}
232
/*
 * True when @f sits above the bottom level of a multi-level btree, i.e.
 * its leaf values are the roots of subordinate btrees rather than user
 * values.
 */
static bool is_internal_level(struct dm_btree_info *info, struct frame *f)
{
	return f->level < (info->levels - 1);
}
237
238int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
239{
240 int r;
241 struct del_stack *s;
242
243 s = kmalloc(sizeof(*s), GFP_KERNEL);
244 if (!s)
245 return -ENOMEM;
246 s->tm = info->tm;
247 s->top = -1;
248
249 r = push_frame(s, root, 0);
250 if (r)
251 goto out;
252
253 while (unprocessed_frames(s)) {
254 uint32_t flags;
255 struct frame *f;
256 dm_block_t b;
257
258 r = top_frame(s, &f);
259 if (r)
260 goto out;
261
262 if (f->current_child >= f->nr_children) {
263 pop_frame(s);
264 continue;
265 }
266
267 flags = le32_to_cpu(f->n->header.flags);
268 if (flags & INTERNAL_NODE) {
269 b = value64(f->n, f->current_child);
270 f->current_child++;
271 r = push_frame(s, b, f->level);
272 if (r)
273 goto out;
274
275 } else if (is_internal_level(info, f)) {
276 b = value64(f->n, f->current_child);
277 f->current_child++;
278 r = push_frame(s, b, f->level + 1);
279 if (r)
280 goto out;
281
282 } else {
283 if (info->value_type.dec) {
284 unsigned i;
285
286 for (i = 0; i < f->nr_children; i++)
287 info->value_type.dec(info->value_type.context,
288 value_ptr(f->n, i));
289 }
290 f->current_child = f->nr_children;
291 }
292 }
293
294out:
295 kfree(s);
296 return r;
297}
298EXPORT_SYMBOL_GPL(dm_btree_del);
299
300
301
/*
 * Walk from @block down to a leaf, using @search_fn at each internal node
 * to choose the child to descend into.  On success the key actually found
 * is returned via *result_key and @value_size bytes of its value are
 * copied to @v.  Returns -ENODATA when the search falls off the node.
 */
static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
			    int (*search_fn)(struct btree_node *, uint64_t),
			    uint64_t *result_key, void *v, size_t value_size)
{
	int i, r;
	uint32_t flags, nr_entries;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		i = search_fn(ro_node(s), key);

		/* flags must be latched here: the loop condition tests them */
		flags = le32_to_cpu(ro_node(s)->header.flags);
		nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (i < 0 || i >= nr_entries)
			return -ENODATA;

		/* descend into the chosen child of an internal node */
		if (flags & INTERNAL_NODE)
			block = value64(ro_node(s), i);

	} while (!(flags & LEAF_NODE));

	*result_key = le64_to_cpu(ro_node(s)->keys[i]);
	memcpy(v, value_ptr(ro_node(s), i), value_size);

	return 0;
}
331
332int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
333 uint64_t *keys, void *value_le)
334{
335 unsigned level, last_level = info->levels - 1;
336 int r = -ENODATA;
337 uint64_t rkey;
338 __le64 internal_value_le;
339 struct ro_spine spine;
340
341 init_ro_spine(&spine, info);
342 for (level = 0; level < info->levels; level++) {
343 size_t size;
344 void *value_p;
345
346 if (level == last_level) {
347 value_p = value_le;
348 size = info->value_type.size;
349
350 } else {
351 value_p = &internal_value_le;
352 size = sizeof(uint64_t);
353 }
354
355 r = btree_lookup_raw(&spine, root, keys[level],
356 lower_bound, &rkey,
357 value_p, size);
358
359 if (!r) {
360 if (rkey != keys[level]) {
361 exit_ro_spine(&spine);
362 return -ENODATA;
363 }
364 } else {
365 exit_ro_spine(&spine);
366 return r;
367 }
368
369 root = le64_to_cpu(internal_value_le);
370 }
371 exit_ro_spine(&spine);
372
373 return r;
374}
375EXPORT_SYMBOL_GPL(dm_btree_lookup);
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407static int btree_split_sibling(struct shadow_spine *s, dm_block_t root,
408 unsigned parent_index, uint64_t key)
409{
410 int r;
411 size_t size;
412 unsigned nr_left, nr_right;
413 struct dm_block *left, *right, *parent;
414 struct btree_node *ln, *rn, *pn;
415 __le64 location;
416
417 left = shadow_current(s);
418
419 r = new_block(s->info, &right);
420 if (r < 0)
421 return r;
422
423 ln = dm_block_data(left);
424 rn = dm_block_data(right);
425
426 nr_left = le32_to_cpu(ln->header.nr_entries) / 2;
427 nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left;
428
429 ln->header.nr_entries = cpu_to_le32(nr_left);
430
431 rn->header.flags = ln->header.flags;
432 rn->header.nr_entries = cpu_to_le32(nr_right);
433 rn->header.max_entries = ln->header.max_entries;
434 rn->header.value_size = ln->header.value_size;
435 memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0]));
436
437 size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
438 sizeof(uint64_t) : s->info->value_type.size;
439 memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left),
440 size * nr_right);
441
442
443
444
445 parent = shadow_parent(s);
446
447 pn = dm_block_data(parent);
448 location = cpu_to_le64(dm_block_location(left));
449 __dm_bless_for_disk(&location);
450 memcpy_disk(value_ptr(pn, parent_index),
451 &location, sizeof(__le64));
452
453 location = cpu_to_le64(dm_block_location(right));
454 __dm_bless_for_disk(&location);
455
456 r = insert_at(sizeof(__le64), pn, parent_index + 1,
457 le64_to_cpu(rn->keys[0]), &location);
458 if (r)
459 return r;
460
461 if (key < le64_to_cpu(rn->keys[0])) {
462 unlock_block(s->info, right);
463 s->nodes[1] = left;
464 } else {
465 unlock_block(s->info, left);
466 s->nodes[1] = right;
467 }
468
469 return 0;
470}
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
494{
495 int r;
496 size_t size;
497 unsigned nr_left, nr_right;
498 struct dm_block *left, *right, *new_parent;
499 struct btree_node *pn, *ln, *rn;
500 __le64 val;
501
502 new_parent = shadow_current(s);
503
504 r = new_block(s->info, &left);
505 if (r < 0)
506 return r;
507
508 r = new_block(s->info, &right);
509 if (r < 0) {
510
511 return r;
512 }
513
514 pn = dm_block_data(new_parent);
515 ln = dm_block_data(left);
516 rn = dm_block_data(right);
517
518 nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
519 nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;
520
521 ln->header.flags = pn->header.flags;
522 ln->header.nr_entries = cpu_to_le32(nr_left);
523 ln->header.max_entries = pn->header.max_entries;
524 ln->header.value_size = pn->header.value_size;
525
526 rn->header.flags = pn->header.flags;
527 rn->header.nr_entries = cpu_to_le32(nr_right);
528 rn->header.max_entries = pn->header.max_entries;
529 rn->header.value_size = pn->header.value_size;
530
531 memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
532 memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));
533
534 size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
535 sizeof(__le64) : s->info->value_type.size;
536 memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
537 memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
538 nr_right * size);
539
540
541 pn->header.flags = cpu_to_le32(INTERNAL_NODE);
542 pn->header.nr_entries = cpu_to_le32(2);
543 pn->header.max_entries = cpu_to_le32(
544 calc_max_entries(sizeof(__le64),
545 dm_bm_block_size(
546 dm_tm_get_bm(s->info->tm))));
547 pn->header.value_size = cpu_to_le32(sizeof(__le64));
548
549 val = cpu_to_le64(dm_block_location(left));
550 __dm_bless_for_disk(&val);
551 pn->keys[0] = ln->keys[0];
552 memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));
553
554 val = cpu_to_le64(dm_block_location(right));
555 __dm_bless_for_disk(&val);
556 pn->keys[1] = rn->keys[0];
557 memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));
558
559
560
561
562
563 if (s->nodes[0] != new_parent) {
564 unlock_block(s->info, s->nodes[0]);
565 s->nodes[0] = new_parent;
566 }
567 if (key < le64_to_cpu(rn->keys[0])) {
568 unlock_block(s->info, right);
569 s->nodes[1] = left;
570 } else {
571 unlock_block(s->info, left);
572 s->nodes[1] = right;
573 }
574 s->count = 2;
575
576 return 0;
577}
578
/*
 * Walk (shadowing as we go) from @root down to the leaf that should hold
 * @key, splitting any full node met on the way so the eventual insert
 * cannot run out of space.  On return the spine's current node is that
 * leaf and *index is where @key lives or should be inserted.
 */
static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
			    struct dm_btree_value_type *vt,
			    uint64_t key, unsigned *index)
{
	int r, i = *index, top = 1;
	struct btree_node *node;

	for (;;) {
		r = shadow_step(s, root, vt);
		if (r < 0)
			return r;

		node = dm_block_data(shadow_current(s));

		/*
		 * Patch up the parent node: shadowing may have moved the
		 * current node to a new block, so the parent's pointer to
		 * it (at index i from the previous iteration) is rewritten.
		 */
		if (shadow_has_parent(s) && i >= 0) {
			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

			__dm_bless_for_disk(&location);
			memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
				    &location, sizeof(__le64));
		}

		node = dm_block_data(shadow_current(s));

		/* le32 equality comparison needs no endian conversion */
		if (node->header.nr_entries == node->header.max_entries) {
			if (top)
				r = btree_split_beneath(s, key);
			else
				r = btree_split_sibling(s, root, i, key);

			if (r < 0)
				return r;
		}

		/* a split may have swapped in a different current node */
		node = dm_block_data(shadow_current(s));

		i = lower_bound(node, key);

		if (le32_to_cpu(node->header.flags) & LEAF_NODE)
			break;

		if (i < 0) {
			/* key is below everything here: adopt it as the new low key */
			node->keys[0] = cpu_to_le64(key);
			i = 0;
		}

		root = value64(node, i);
		top = 0;
	}

	/* no exact match at i means the insertion point is one slot further on */
	if (i < 0 || le64_to_cpu(node->keys[i]) != key)
		i++;

	*index = i;
	return 0;
}
641
/*
 * Insert @value under @keys (one key per btree level).  Inner levels
 * store 64bit roots of the next level's tree; only the last level stores
 * the caller's value type.  On success *new_root holds the (possibly
 * shadowed) new root and *inserted, if non-NULL, reports whether a fresh
 * entry was created (1) or an existing one overwritten (0).
 *
 * @value arrives blessed for disk; the bless is consumed on every path —
 * by insert_at()/memcpy_disk() on success, or explicitly at 'bad'.
 */
static int insert(struct dm_btree_info *info, dm_block_t root,
		  uint64_t *keys, void *value, dm_block_t *new_root,
		  int *inserted)
		  __dm_written_to_disk(value)
{
	int r, need_insert;
	unsigned level, index = -1, last_level = info->levels - 1;
	dm_block_t block = root;
	struct shadow_spine spine;
	struct btree_node *n;
	struct dm_btree_value_type le64_type;

	/* value type describing the inner levels' block-address values */
	le64_type.context = NULL;
	le64_type.size = sizeof(__le64);
	le64_type.inc = NULL;
	le64_type.dec = NULL;
	le64_type.equal = NULL;

	init_shadow_spine(&spine, info);

	/* descend the inner levels, creating empty subtrees where needed */
	for (level = 0; level < (info->levels - 1); level++) {
		r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
		if (r < 0)
			goto bad;

		n = dm_block_data(shadow_current(&spine));
		need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
			       (le64_to_cpu(n->keys[index]) != keys[level]));

		if (need_insert) {
			dm_block_t new_tree;
			__le64 new_le;

			r = dm_btree_empty(info, &new_tree);
			if (r < 0)
				goto bad;

			new_le = cpu_to_le64(new_tree);
			__dm_bless_for_disk(&new_le);

			r = insert_at(sizeof(uint64_t), n, index,
				      keys[level], &new_le);
			if (r)
				goto bad;
		}

		if (level < last_level)
			block = value64(n, index);
	}

	/* the bottom level stores the caller's values */
	r = btree_insert_raw(&spine, block, &info->value_type,
			     keys[level], &index);
	if (r < 0)
		goto bad;

	n = dm_block_data(shadow_current(&spine));
	need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
		       (le64_to_cpu(n->keys[index]) != keys[level]));

	if (need_insert) {
		if (inserted)
			*inserted = 1;

		r = insert_at(info->value_type.size, n, index,
			      keys[level], value);
		if (r)
			/* insert_at has already unblessed value on failure */
			goto bad_unblessed;
	} else {
		if (inserted)
			*inserted = 0;

		/* overwrite: release the old value unless it equals the new */
		if (info->value_type.dec &&
		    (!info->value_type.equal ||
		     !info->value_type.equal(
			     info->value_type.context,
			     value_ptr(n, index),
			     value))) {
			info->value_type.dec(info->value_type.context,
					     value_ptr(n, index));
		}
		memcpy_disk(value_ptr(n, index),
			    value, info->value_type.size);
	}

	*new_root = shadow_root(&spine);
	exit_shadow_spine(&spine);

	return 0;

bad:
	__dm_unbless_for_disk(value);
bad_unblessed:
	exit_shadow_spine(&spine);
	return r;
}
737
/*
 * Insert @value under @keys; callers that need to know whether the key
 * already existed should use dm_btree_insert_notify() instead.
 */
int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value, dm_block_t *new_root)
		    __dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, NULL);
}
EXPORT_SYMBOL_GPL(dm_btree_insert);
745
/*
 * As dm_btree_insert(), but additionally reports through *inserted
 * whether a new entry was created (1) or an existing one overwritten (0).
 */
int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
			   uint64_t *keys, void *value, dm_block_t *new_root,
			   int *inserted)
			   __dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, inserted);
}
EXPORT_SYMBOL_GPL(dm_btree_insert_notify);
754
755
756
757static int find_highest_key(struct ro_spine *s, dm_block_t block,
758 uint64_t *result_key, dm_block_t *next_block)
759{
760 int i, r;
761 uint32_t flags;
762
763 do {
764 r = ro_step(s, block);
765 if (r < 0)
766 return r;
767
768 flags = le32_to_cpu(ro_node(s)->header.flags);
769 i = le32_to_cpu(ro_node(s)->header.nr_entries);
770 if (!i)
771 return -ENODATA;
772 else
773 i--;
774
775 *result_key = le64_to_cpu(ro_node(s)->keys[i]);
776 if (next_block || flags & INTERNAL_NODE)
777 block = value64(ro_node(s), i);
778
779 } while (flags & INTERNAL_NODE);
780
781 if (next_block)
782 *next_block = block;
783 return 0;
784}
785
786int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
787 uint64_t *result_keys)
788{
789 int r = 0, count = 0, level;
790 struct ro_spine spine;
791
792 init_ro_spine(&spine, info);
793 for (level = 0; level < info->levels; level++) {
794 r = find_highest_key(&spine, root, result_keys + level,
795 level == info->levels - 1 ? NULL : &root);
796 if (r == -ENODATA) {
797 r = 0;
798 break;
799
800 } else if (r)
801 break;
802
803 count++;
804 }
805 exit_ro_spine(&spine);
806
807 return r ? r : count;
808}
809EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);
810
811
812
813
814
815static int walk_node(struct ro_spine *s, dm_block_t block,
816 int (*fn)(void *context, uint64_t *keys, void *leaf),
817 void *context)
818{
819 int r;
820 unsigned i, nr;
821 struct btree_node *n;
822 uint64_t keys;
823
824 r = ro_step(s, block);
825 n = ro_node(s);
826
827 nr = le32_to_cpu(n->header.nr_entries);
828 for (i = 0; i < nr; i++) {
829 if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) {
830 r = walk_node(s, value64(n, i), fn, context);
831 if (r)
832 goto out;
833 } else {
834 keys = le64_to_cpu(*key_ptr(n, i));
835 r = fn(context, &keys, value_ptr(n, i));
836 if (r)
837 goto out;
838 }
839 }
840
841out:
842 ro_pop(s);
843 return r;
844}
845
/*
 * Iterate every (key, value) leaf pair of the btree at @root, invoking
 * @fn for each; a non-zero return from @fn aborts the walk and is
 * propagated.  Only single-level btrees are supported (BUG_ON otherwise).
 */
int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
		  int (*fn)(void *context, uint64_t *keys, void *leaf),
		  void *context)
{
	int r;
	struct ro_spine spine;

	BUG_ON(info->levels > 1);

	init_ro_spine(&spine, info);
	r = walk_node(&spine, root, fn, context);
	exit_ro_spine(&spine);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_walk);
862