/*
 * Copyright (C) 2011 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */
7#include "dm-btree-internal.h"
8#include "dm-space-map.h"
9#include "dm-transaction-manager.h"
10
11#include <linux/export.h>
12#include <linux/device-mapper.h>
13
14#define DM_MSG_PREFIX "btree"
15
16
17
18
/*
 * Copy a value that has been blessed for writing to disk, then drop the
 * sparse "blessed for disk" annotation on the source.  Keeps the
 * __dm_written_to_disk accounting balanced around a plain memcpy().
 */
static void memcpy_disk(void *dest, const void *src, size_t len)
	__dm_written_to_disk(src)
{
	memcpy(dest, src, len);
	__dm_unbless_for_disk(src);
}
25
26static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
27 unsigned index, void *elt)
28 __dm_written_to_disk(elt)
29{
30 if (index < nr_elts)
31 memmove(base + (elt_size * (index + 1)),
32 base + (elt_size * index),
33 (nr_elts - index) * elt_size);
34
35 memcpy_disk(base + (elt_size * index), elt, elt_size);
36}
37
38
39
40
41static int bsearch(struct node *n, uint64_t key, int want_hi)
42{
43 int lo = -1, hi = le32_to_cpu(n->header.nr_entries);
44
45 while (hi - lo > 1) {
46 int mid = lo + ((hi - lo) / 2);
47 uint64_t mid_key = le64_to_cpu(n->keys[mid]);
48
49 if (mid_key == key)
50 return mid;
51
52 if (mid_key < key)
53 lo = mid;
54 else
55 hi = mid;
56 }
57
58 return want_hi ? hi : lo;
59}
60
/*
 * Index of the last entry whose key is <= @key, or -1 if every key in
 * the node is greater than @key.
 */
int lower_bound(struct node *n, uint64_t key)
{
	return bsearch(n, key, 0);
}
65
66void inc_children(struct dm_transaction_manager *tm, struct node *n,
67 struct dm_btree_value_type *vt)
68{
69 unsigned i;
70 uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
71
72 if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
73 for (i = 0; i < nr_entries; i++)
74 dm_tm_inc(tm, value64(n, i));
75 else if (vt->inc)
76 for (i = 0; i < nr_entries; i++)
77 vt->inc(vt->context, value_ptr(n, i));
78}
79
80static int insert_at(size_t value_size, struct node *node, unsigned index,
81 uint64_t key, void *value)
82 __dm_written_to_disk(value)
83{
84 uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
85 __le64 key_le = cpu_to_le64(key);
86
87 if (index > nr_entries ||
88 index >= le32_to_cpu(node->header.max_entries)) {
89 DMERR("too many entries in btree node for insert");
90 __dm_unbless_for_disk(value);
91 return -ENOMEM;
92 }
93
94 __dm_bless_for_disk(&key_le);
95
96 array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
97 array_insert(value_base(node), value_size, nr_entries, index, value);
98 node->header.nr_entries = cpu_to_le32(nr_entries + 1);
99
100 return 0;
101}
102
103
104
105
106
107
108
109static uint32_t calc_max_entries(size_t value_size, size_t block_size)
110{
111 uint32_t total, n;
112 size_t elt_size = sizeof(uint64_t) + value_size;
113
114 block_size -= sizeof(struct node_header);
115 total = block_size / elt_size;
116 n = total / 3;
117
118 return 3 * n;
119}
120
121int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
122{
123 int r;
124 struct dm_block *b;
125 struct node *n;
126 size_t block_size;
127 uint32_t max_entries;
128
129 r = new_block(info, &b);
130 if (r < 0)
131 return r;
132
133 block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
134 max_entries = calc_max_entries(info->value_type.size, block_size);
135
136 n = dm_block_data(b);
137 memset(n, 0, block_size);
138 n->header.flags = cpu_to_le32(LEAF_NODE);
139 n->header.nr_entries = cpu_to_le32(0);
140 n->header.max_entries = cpu_to_le32(max_entries);
141 n->header.value_size = cpu_to_le32(info->value_type.size);
142
143 *root = dm_block_location(b);
144 return unlock_block(info, b);
145}
146EXPORT_SYMBOL_GPL(dm_btree_empty);
147
148
149
150
151
152
153
/*
 * Deletion walks the tree iteratively with an explicit stack of frames
 * (a "spine") rather than recursing, bounding kernel stack usage.
 */
#define MAX_SPINE_DEPTH 64
struct frame {
	struct dm_block *b;	/* read-locked block for this node */
	struct node *n;		/* the node data within b */
	unsigned level;		/* btree level this node belongs to (see dm_btree_del) */
	unsigned nr_children;	/* total entries in this node */
	unsigned current_child;	/* next entry to visit */
};

struct del_stack {
	struct dm_transaction_manager *tm;
	int top;		/* index of the top frame, -1 when empty */
	struct frame spine[MAX_SPINE_DEPTH];
};
168
169static int top_frame(struct del_stack *s, struct frame **f)
170{
171 if (s->top < 0) {
172 DMERR("btree deletion stack empty");
173 return -EINVAL;
174 }
175
176 *f = s->spine + s->top;
177
178 return 0;
179}
180
/* True while there are frames left to process on the deletion stack. */
static int unprocessed_frames(struct del_stack *s)
{
	return s->top >= 0;
}
185
/*
 * Push block @b onto the deletion stack at btree level @level.
 *
 * If the block is shared (reference count > 1) we only drop our
 * reference rather than walking into it, since other trees still use
 * its children; nothing is pushed in that case.  Otherwise the block is
 * read-locked and a new frame is set up for its entries.
 */
static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
{
	int r;
	uint32_t ref_count;

	if (s->top >= MAX_SPINE_DEPTH - 1) {
		DMERR("btree deletion stack out of memory");
		return -ENOMEM;
	}

	r = dm_tm_ref(s->tm, b, &ref_count);
	if (r)
		return r;

	if (ref_count > 1)
		/*
		 * This is a shared node, so we can just decrement its
		 * reference counter and leave the children alone.
		 */
		dm_tm_dec(s->tm, b);

	else {
		struct frame *f = s->spine + ++s->top;

		r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
		if (r) {
			/* roll the push back so the stack stays consistent */
			s->top--;
			return r;
		}

		f->n = dm_block_data(f->b);
		f->level = level;
		f->nr_children = le32_to_cpu(f->n->header.nr_entries);
		f->current_child = 0;
	}

	return 0;
}
224
/*
 * Discard the top frame: drop our reference to the node's block and
 * release its read lock.
 */
static void pop_frame(struct del_stack *s)
{
	struct frame *f = s->spine + s->top--;

	dm_tm_dec(s->tm, dm_block_location(f->b));
	dm_tm_unlock(s->tm, f->b);
}
232
233int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
234{
235 int r;
236 struct del_stack *s;
237
238 s = kmalloc(sizeof(*s), GFP_KERNEL);
239 if (!s)
240 return -ENOMEM;
241 s->tm = info->tm;
242 s->top = -1;
243
244 r = push_frame(s, root, 1);
245 if (r)
246 goto out;
247
248 while (unprocessed_frames(s)) {
249 uint32_t flags;
250 struct frame *f;
251 dm_block_t b;
252
253 r = top_frame(s, &f);
254 if (r)
255 goto out;
256
257 if (f->current_child >= f->nr_children) {
258 pop_frame(s);
259 continue;
260 }
261
262 flags = le32_to_cpu(f->n->header.flags);
263 if (flags & INTERNAL_NODE) {
264 b = value64(f->n, f->current_child);
265 f->current_child++;
266 r = push_frame(s, b, f->level);
267 if (r)
268 goto out;
269
270 } else if (f->level != (info->levels - 1)) {
271 b = value64(f->n, f->current_child);
272 f->current_child++;
273 r = push_frame(s, b, f->level + 1);
274 if (r)
275 goto out;
276
277 } else {
278 if (info->value_type.dec) {
279 unsigned i;
280
281 for (i = 0; i < f->nr_children; i++)
282 info->value_type.dec(info->value_type.context,
283 value_ptr(f->n, i));
284 }
285 f->current_child = f->nr_children;
286 }
287 }
288
289out:
290 kfree(s);
291 return r;
292}
293EXPORT_SYMBOL_GPL(dm_btree_del);
294
295
296
/*
 * Walk from @block down to a leaf, letting @search_fn pick an entry at
 * each node.  On success *result_key receives the chosen leaf entry's
 * key and @value_size bytes of its value are copied into @v.  Returns
 * -ENODATA when the search falls outside a node's entries, or an error
 * from ro_step().
 */
static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
			    int (*search_fn)(struct node *, uint64_t),
			    uint64_t *result_key, void *v, size_t value_size)
{
	int i, r;
	uint32_t flags, nr_entries;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		i = search_fn(ro_node(s), key);

		flags = le32_to_cpu(ro_node(s)->header.flags);
		nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (i < 0 || i >= nr_entries)
			return -ENODATA;

		/* an internal entry's value is the block of the next node down */
		if (flags & INTERNAL_NODE)
			block = value64(ro_node(s), i);

	} while (!(flags & LEAF_NODE));

	*result_key = le64_to_cpu(ro_node(s)->keys[i]);
	memcpy(v, value_ptr(ro_node(s), i), value_size);

	return 0;
}
326
327int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
328 uint64_t *keys, void *value_le)
329{
330 unsigned level, last_level = info->levels - 1;
331 int r = -ENODATA;
332 uint64_t rkey;
333 __le64 internal_value_le;
334 struct ro_spine spine;
335
336 init_ro_spine(&spine, info);
337 for (level = 0; level < info->levels; level++) {
338 size_t size;
339 void *value_p;
340
341 if (level == last_level) {
342 value_p = value_le;
343 size = info->value_type.size;
344
345 } else {
346 value_p = &internal_value_le;
347 size = sizeof(uint64_t);
348 }
349
350 r = btree_lookup_raw(&spine, root, keys[level],
351 lower_bound, &rkey,
352 value_p, size);
353
354 if (!r) {
355 if (rkey != keys[level]) {
356 exit_ro_spine(&spine);
357 return -ENODATA;
358 }
359 } else {
360 exit_ro_spine(&spine);
361 return r;
362 }
363
364 root = le64_to_cpu(internal_value_le);
365 }
366 exit_ro_spine(&spine);
367
368 return r;
369}
370EXPORT_SYMBOL_GPL(dm_btree_lookup);
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402static int btree_split_sibling(struct shadow_spine *s, dm_block_t root,
403 unsigned parent_index, uint64_t key)
404{
405 int r;
406 size_t size;
407 unsigned nr_left, nr_right;
408 struct dm_block *left, *right, *parent;
409 struct node *ln, *rn, *pn;
410 __le64 location;
411
412 left = shadow_current(s);
413
414 r = new_block(s->info, &right);
415 if (r < 0)
416 return r;
417
418 ln = dm_block_data(left);
419 rn = dm_block_data(right);
420
421 nr_left = le32_to_cpu(ln->header.nr_entries) / 2;
422 nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left;
423
424 ln->header.nr_entries = cpu_to_le32(nr_left);
425
426 rn->header.flags = ln->header.flags;
427 rn->header.nr_entries = cpu_to_le32(nr_right);
428 rn->header.max_entries = ln->header.max_entries;
429 rn->header.value_size = ln->header.value_size;
430 memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0]));
431
432 size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
433 sizeof(uint64_t) : s->info->value_type.size;
434 memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left),
435 size * nr_right);
436
437
438
439
440 parent = shadow_parent(s);
441
442 pn = dm_block_data(parent);
443 location = cpu_to_le64(dm_block_location(left));
444 __dm_bless_for_disk(&location);
445 memcpy_disk(value_ptr(pn, parent_index),
446 &location, sizeof(__le64));
447
448 location = cpu_to_le64(dm_block_location(right));
449 __dm_bless_for_disk(&location);
450
451 r = insert_at(sizeof(__le64), pn, parent_index + 1,
452 le64_to_cpu(rn->keys[0]), &location);
453 if (r)
454 return r;
455
456 if (key < le64_to_cpu(rn->keys[0])) {
457 unlock_block(s->info, right);
458 s->nodes[1] = left;
459 } else {
460 unlock_block(s->info, left);
461 s->nodes[1] = right;
462 }
463
464 return 0;
465}
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
489{
490 int r;
491 size_t size;
492 unsigned nr_left, nr_right;
493 struct dm_block *left, *right, *new_parent;
494 struct node *pn, *ln, *rn;
495 __le64 val;
496
497 new_parent = shadow_current(s);
498
499 r = new_block(s->info, &left);
500 if (r < 0)
501 return r;
502
503 r = new_block(s->info, &right);
504 if (r < 0) {
505
506 return r;
507 }
508
509 pn = dm_block_data(new_parent);
510 ln = dm_block_data(left);
511 rn = dm_block_data(right);
512
513 nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
514 nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;
515
516 ln->header.flags = pn->header.flags;
517 ln->header.nr_entries = cpu_to_le32(nr_left);
518 ln->header.max_entries = pn->header.max_entries;
519 ln->header.value_size = pn->header.value_size;
520
521 rn->header.flags = pn->header.flags;
522 rn->header.nr_entries = cpu_to_le32(nr_right);
523 rn->header.max_entries = pn->header.max_entries;
524 rn->header.value_size = pn->header.value_size;
525
526 memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
527 memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));
528
529 size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
530 sizeof(__le64) : s->info->value_type.size;
531 memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
532 memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
533 nr_right * size);
534
535
536 pn->header.flags = cpu_to_le32(INTERNAL_NODE);
537 pn->header.nr_entries = cpu_to_le32(2);
538 pn->header.max_entries = cpu_to_le32(
539 calc_max_entries(sizeof(__le64),
540 dm_bm_block_size(
541 dm_tm_get_bm(s->info->tm))));
542 pn->header.value_size = cpu_to_le32(sizeof(__le64));
543
544 val = cpu_to_le64(dm_block_location(left));
545 __dm_bless_for_disk(&val);
546 pn->keys[0] = ln->keys[0];
547 memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));
548
549 val = cpu_to_le64(dm_block_location(right));
550 __dm_bless_for_disk(&val);
551 pn->keys[1] = rn->keys[0];
552 memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));
553
554
555
556
557
558 if (s->nodes[0] != new_parent) {
559 unlock_block(s->info, s->nodes[0]);
560 s->nodes[0] = new_parent;
561 }
562 if (key < le64_to_cpu(rn->keys[0])) {
563 unlock_block(s->info, right);
564 s->nodes[1] = left;
565 } else {
566 unlock_block(s->info, left);
567 s->nodes[1] = right;
568 }
569 s->count = 2;
570
571 return 0;
572}
573
/*
 * Shadow the path from @root down towards @key, splitting any full node
 * encountered on the way.  On return the spine's current node is the
 * leaf that should contain @key and *index is the slot where it lives
 * (or where it should be inserted); the caller decides whether the key
 * is actually present.
 */
static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
			    struct dm_btree_value_type *vt,
			    uint64_t key, unsigned *index)
{
	int r, i = *index, top = 1;
	struct node *node;

	for (;;) {
		r = shadow_step(s, root, vt);
		if (r < 0)
			return r;

		node = dm_block_data(shadow_current(s));

		/*
		 * We have to patch up the parent's pointer to the newly
		 * shadowed block by hand; there's no way to do this
		 * automatically as part of the spine op.
		 */
		if (shadow_has_parent(s) && i >= 0) {
			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

			__dm_bless_for_disk(&location);
			memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
				    &location, sizeof(__le64));
		}

		node = dm_block_data(shadow_current(s));

		/* split a full node before descending any further */
		if (node->header.nr_entries == node->header.max_entries) {
			if (top)
				r = btree_split_beneath(s, key);
			else
				r = btree_split_sibling(s, root, i, key);

			if (r < 0)
				return r;
		}

		/* the split may have swapped the current node */
		node = dm_block_data(shadow_current(s));

		i = lower_bound(node, key);

		if (le32_to_cpu(node->header.flags) & LEAF_NODE)
			break;

		if (i < 0) {
			/* change the bounds on the lowest key */
			node->keys[0] = cpu_to_le64(key);
			i = 0;
		}

		root = value64(node, i);
		top = 0;
	}

	/* not an exact hit: the key will be inserted just after slot i */
	if (i < 0 || le64_to_cpu(node->keys[i]) != key)
		i++;

	*index = i;
	return 0;
}
636
/*
 * Insert @value at @keys (one key per btree level), shadowing the path
 * from the root down.  Inner levels are trees whose values are the roots
 * of the next level's tree; a missing inner entry is created as a fresh
 * empty subtree.  On success *new_root holds the shadowed root and
 * *inserted, if non-NULL, is set to 1 for a new entry or 0 for an
 * overwrite.  @value must be blessed for disk and is consumed on every
 * path.
 */
static int insert(struct dm_btree_info *info, dm_block_t root,
		  uint64_t *keys, void *value, dm_block_t *new_root,
		  int *inserted)
		  __dm_written_to_disk(value)
{
	int r, need_insert;
	unsigned level, index = -1, last_level = info->levels - 1;
	dm_block_t block = root;
	struct shadow_spine spine;
	struct node *n;
	struct dm_btree_value_type le64_type;

	/* inner levels hold plain __le64 subtree roots: no callbacks */
	le64_type.context = NULL;
	le64_type.size = sizeof(__le64);
	le64_type.inc = NULL;
	le64_type.dec = NULL;
	le64_type.equal = NULL;

	init_shadow_spine(&spine, info);

	/* walk (and create where necessary) the inner levels */
	for (level = 0; level < (info->levels - 1); level++) {
		r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
		if (r < 0)
			goto bad;

		n = dm_block_data(shadow_current(&spine));
		need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
			       (le64_to_cpu(n->keys[index]) != keys[level]));

		if (need_insert) {
			dm_block_t new_tree;
			__le64 new_le;

			/* no subtree for this key yet: create an empty one */
			r = dm_btree_empty(info, &new_tree);
			if (r < 0)
				goto bad;

			new_le = cpu_to_le64(new_tree);
			__dm_bless_for_disk(&new_le);

			r = insert_at(sizeof(uint64_t), n, index,
				      keys[level], &new_le);
			if (r)
				goto bad;
		}

		if (level < last_level)
			block = value64(n, index);
	}

	/* the bottom level stores the caller's values */
	r = btree_insert_raw(&spine, block, &info->value_type,
			     keys[level], &index);
	if (r < 0)
		goto bad;

	n = dm_block_data(shadow_current(&spine));
	need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
		       (le64_to_cpu(n->keys[index]) != keys[level]));

	if (need_insert) {
		if (inserted)
			*inserted = 1;

		r = insert_at(info->value_type.size, n, index,
			      keys[level], value);
		if (r)
			goto bad_unblessed;	/* insert_at already unblessed value */
	} else {
		if (inserted)
			*inserted = 0;

		/* overwrite: drop the old value's reference unless equal */
		if (info->value_type.dec &&
		    (!info->value_type.equal ||
		     !info->value_type.equal(
			     info->value_type.context,
			     value_ptr(n, index),
			     value))) {
			info->value_type.dec(info->value_type.context,
					     value_ptr(n, index));
		}
		memcpy_disk(value_ptr(n, index),
			    value, info->value_type.size);
	}

	*new_root = shadow_root(&spine);
	exit_shadow_spine(&spine);

	return 0;

bad:
	__dm_unbless_for_disk(value);
bad_unblessed:
	exit_shadow_spine(&spine);
	return r;
}
732
/*
 * Insert @value at @keys.  See insert() for details; this variant does
 * not report whether the key was newly created or overwritten.
 */
int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value, dm_block_t *new_root)
		    __dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, NULL);
}
EXPORT_SYMBOL_GPL(dm_btree_insert);
740
/*
 * As dm_btree_insert(), but additionally sets *inserted to 1 when a new
 * entry was created or 0 when an existing entry was overwritten.
 */
int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
			   uint64_t *keys, void *value, dm_block_t *new_root,
			   int *inserted)
			   __dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, inserted);
}
EXPORT_SYMBOL_GPL(dm_btree_insert_notify);
749
750
751
752static int find_highest_key(struct ro_spine *s, dm_block_t block,
753 uint64_t *result_key, dm_block_t *next_block)
754{
755 int i, r;
756 uint32_t flags;
757
758 do {
759 r = ro_step(s, block);
760 if (r < 0)
761 return r;
762
763 flags = le32_to_cpu(ro_node(s)->header.flags);
764 i = le32_to_cpu(ro_node(s)->header.nr_entries);
765 if (!i)
766 return -ENODATA;
767 else
768 i--;
769
770 *result_key = le64_to_cpu(ro_node(s)->keys[i]);
771 if (next_block || flags & INTERNAL_NODE)
772 block = value64(ro_node(s), i);
773
774 } while (flags & INTERNAL_NODE);
775
776 if (next_block)
777 *next_block = block;
778 return 0;
779}
780
781int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
782 uint64_t *result_keys)
783{
784 int r = 0, count = 0, level;
785 struct ro_spine spine;
786
787 init_ro_spine(&spine, info);
788 for (level = 0; level < info->levels; level++) {
789 r = find_highest_key(&spine, root, result_keys + level,
790 level == info->levels - 1 ? NULL : &root);
791 if (r == -ENODATA) {
792 r = 0;
793 break;
794
795 } else if (r)
796 break;
797
798 count++;
799 }
800 exit_ro_spine(&spine);
801
802 return r ? r : count;
803}
804EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);
805