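/*
 * percpu memory allocator
 *
 * Each chunk covers one percpu area per possible CPU (a "unit"); chunks
 * are kept on pcpu_slot lists sorted by the size of their largest free
 * region.  Allocation state is tracked with bitmaps (alloc_map/bound_map)
 * plus per-block metadata (contig/scan hints) so the allocator can skip
 * blocks that cannot satisfy a request.  The first chunk serves the kernel
 * static percpu area and, optionally, a reserved region used by
 * __alloc_reserved_percpu().
 */
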
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/bitmap.h>
#include <linux/memblock.h>
#include <linux/err.h>
#include <linux/lcm.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/pfn.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
#include <linux/kmemleak.h>
#include <linux/sched.h>

#include <asm/cacheflush.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
#include <asm/io.h>

#define CREATE_TRACE_POINTS
#include <trace/events/percpu.h>

#include "percpu-internal.h"

/* the slots are sorted by free bytes left, 1-31 bytes share the same slot */
#define PCPU_SLOT_BASE_SHIFT		5
/* chunks in slots below this are subject to being sidelined on failed alloc */
#define PCPU_SLOT_FAIL_THRESHOLD	3

/* keep this many empty populated pages around for atomic allocations */
#define PCPU_EMPTY_POP_PAGES_LOW	2
#define PCPU_EMPTY_POP_PAGES_HIGH	4

#ifdef CONFIG_SMP
/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
#ifndef __addr_to_pcpu_ptr
#define __addr_to_pcpu_ptr(addr)					\
	(void __percpu *)((unsigned long)(addr) -			\
			  (unsigned long)pcpu_base_addr +		\
			  (unsigned long)__per_cpu_start)
#endif
#ifndef __pcpu_ptr_to_addr
#define __pcpu_ptr_to_addr(ptr)						\
	(void __force *)((unsigned long)(ptr) +			\
			 (unsigned long)pcpu_base_addr -		\
			 (unsigned long)__per_cpu_start)
#endif
#else	/* CONFIG_SMP */
/* on UP, it's always identity mapped */
#define __addr_to_pcpu_ptr(addr)	(void __percpu *)(addr)
#define __pcpu_ptr_to_addr(ptr)		(void __force *)(ptr)
#endif	/* CONFIG_SMP */

static int pcpu_unit_pages __ro_after_init;
static int pcpu_unit_size __ro_after_init;
static int pcpu_nr_units __ro_after_init;
static int pcpu_atom_size __ro_after_init;
int pcpu_nr_slots __ro_after_init;
static size_t pcpu_chunk_struct_size __ro_after_init;

/* cpus with the lowest and highest unit addresses */
static unsigned int pcpu_low_unit_cpu __ro_after_init;
static unsigned int pcpu_high_unit_cpu __ro_after_init;

/* the address of the first chunk which starts with the kernel static area */
void *pcpu_base_addr __ro_after_init;
EXPORT_SYMBOL_GPL(pcpu_base_addr);

static const int *pcpu_unit_map __ro_after_init;		/* cpu -> unit */
const unsigned long *pcpu_unit_offsets __ro_after_init;		/* cpu -> unit offset */

/* group information, used for vm allocation */
static int pcpu_nr_groups __ro_after_init;
static const unsigned long *pcpu_group_offsets __ro_after_init;
static const size_t *pcpu_group_sizes __ro_after_init;

/*
 * The first chunk which always exists.  Note that unlike other
 * chunks, this one can be allocated and mapped in several different
 * ways and thus often doesn't live in the vmalloc area.
 */
struct pcpu_chunk *pcpu_first_chunk __ro_after_init;

/*
 * Optional reserved chunk.  This chunk reserves part of the first
 * chunk and serves it for reserved allocations.  When the reserved
 * region doesn't exist, the following variable is NULL.
 */
struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init;

DEFINE_SPINLOCK(pcpu_lock);	/* all internal data structures */
static DEFINE_MUTEX(pcpu_alloc_mutex);	/* chunk create/destroy, [de]population */

struct list_head *pcpu_slot __ro_after_init;	/* chunk list slots */

/* chunks which need their map areas extended, protected by pcpu_lock */
static LIST_HEAD(pcpu_map_extend_chunks);

/*
 * The number of empty populated pages, protected by pcpu_lock.  The
 * reserved chunk doesn't contribute to the count.
 */
int pcpu_nr_empty_pop_pages;

/*
 * The number of populated pages in use by the allocator, protected by
 * pcpu_lock.  This number is kept per a unit per chunk (i.e. when a page gets
 * allocated/deallocated, it is allocated/deallocated in all units of a chunk
 * and increments/decrements this count by 1).
 */
static unsigned long pcpu_nr_populated;

/*
 * Balance work is used to populate or destroy chunks asynchronously.  It
 * tries to keep the number of empty populated pages between
 * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and is scheduled
 * whenever an atomic allocation fails or a chunk becomes fully free.
 */
static void pcpu_balance_workfn(struct work_struct *work);
static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
static bool pcpu_async_enabled __read_mostly;
static bool pcpu_atomic_alloc_failed;

static void pcpu_schedule_balance_work(void)
{
	if (pcpu_async_enabled)
		schedule_work(&pcpu_balance_work);
}
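
/*
 * pcpu_addr_in_chunk - check if the address is served from this chunk
 * @chunk: chunk of interest
 * @addr: percpu address
 *
 * RETURNS:
 * True if the address is served from this chunk.
 */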
206static bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr)
207{
208 void *start_addr, *end_addr;
209
210 if (!chunk)
211 return false;
212
213 start_addr = chunk->base_addr + chunk->start_offset;
214 end_addr = chunk->base_addr + chunk->nr_pages * PAGE_SIZE -
215 chunk->end_offset;
216
217 return addr >= start_addr && addr < end_addr;
218}
219
220static int __pcpu_size_to_slot(int size)
221{
222 int highbit = fls(size);
223 return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
224}
225
226static int pcpu_size_to_slot(int size)
227{
228 if (size == pcpu_unit_size)
229 return pcpu_nr_slots - 1;
230 return __pcpu_size_to_slot(size);
231}
232
233static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
234{
235 const struct pcpu_block_md *chunk_md = &chunk->chunk_md;
236
237 if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE ||
238 chunk_md->contig_hint == 0)
239 return 0;
240
241 return pcpu_size_to_slot(chunk_md->contig_hint * PCPU_MIN_ALLOC_SIZE);
242}
243
244
245static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
246{
247 page->index = (unsigned long)pcpu;
248}
249
250
251static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
252{
253 return (struct pcpu_chunk *)page->index;
254}
255
256static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
257{
258 return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
259}
260
261static unsigned long pcpu_unit_page_offset(unsigned int cpu, int page_idx)
262{
263 return pcpu_unit_offsets[cpu] + (page_idx << PAGE_SHIFT);
264}
265
266static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
267 unsigned int cpu, int page_idx)
268{
269 return (unsigned long)chunk->base_addr +
270 pcpu_unit_page_offset(cpu, page_idx);
271}
272
273static void pcpu_next_unpop(unsigned long *bitmap, int *rs, int *re, int end)
274{
275 *rs = find_next_zero_bit(bitmap, end, *rs);
276 *re = find_next_bit(bitmap, end, *rs + 1);
277}
278
279static void pcpu_next_pop(unsigned long *bitmap, int *rs, int *re, int end)
280{
281 *rs = find_next_bit(bitmap, end, *rs);
282 *re = find_next_zero_bit(bitmap, end, *rs + 1);
283}
284
285
286
287
288
289
290#define pcpu_for_each_unpop_region(bitmap, rs, re, start, end) \
291 for ((rs) = (start), pcpu_next_unpop((bitmap), &(rs), &(re), (end)); \
292 (rs) < (re); \
293 (rs) = (re) + 1, pcpu_next_unpop((bitmap), &(rs), &(re), (end)))
294
295#define pcpu_for_each_pop_region(bitmap, rs, re, start, end) \
296 for ((rs) = (start), pcpu_next_pop((bitmap), &(rs), &(re), (end)); \
297 (rs) < (re); \
298 (rs) = (re) + 1, pcpu_next_pop((bitmap), &(rs), &(re), (end)))
299
300
301
302
303
304static unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index)
305{
306 return chunk->alloc_map +
307 (index * PCPU_BITMAP_BLOCK_BITS / BITS_PER_LONG);
308}
309
310static unsigned long pcpu_off_to_block_index(int off)
311{
312 return off / PCPU_BITMAP_BLOCK_BITS;
313}
314
315static unsigned long pcpu_off_to_block_off(int off)
316{
317 return off & (PCPU_BITMAP_BLOCK_BITS - 1);
318}
319
320static unsigned long pcpu_block_off_to_off(int index, int off)
321{
322 return index * PCPU_BITMAP_BLOCK_BITS + off;
323}
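
/*
 * pcpu_next_hint - determine which hint to use
 * @block: block of interest
 * @alloc_bits: size of allocation
 *
 * This determines if we should scan based on the scan_hint or first_free.
 * In general, we want to scan from first_free to fulfill allocations by
 * first fit.  However, if we know a scan_hint at position scan_hint_start
 * cannot fulfill an allocation, we can begin scanning from there knowing
 * the contig_hint will be our fallback.
 */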
336static int pcpu_next_hint(struct pcpu_block_md *block, int alloc_bits)
337{
338
339
340
341
342
343
344
345 if (block->scan_hint &&
346 block->contig_hint_start > block->scan_hint_start &&
347 alloc_bits > block->scan_hint)
348 return block->scan_hint_start + block->scan_hint;
349
350 return block->first_free;
351}
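
/*
 * pcpu_next_md_free_region - finds the next hint free area
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * Helper function for pcpu_for_each_md_free_region.  It checks
 * block->contig_hint and performs aggregation across blocks to find the
 * next hint.  It modifies bit_off and bits in-place to be consumed in the
 * loop.
 */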
364static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off,
365 int *bits)
366{
367 int i = pcpu_off_to_block_index(*bit_off);
368 int block_off = pcpu_off_to_block_off(*bit_off);
369 struct pcpu_block_md *block;
370
371 *bits = 0;
372 for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
373 block++, i++) {
374
375 if (*bits) {
376 *bits += block->left_free;
377 if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
378 continue;
379 return;
380 }
381
382
383
384
385
386
387
388
389
390 *bits = block->contig_hint;
391 if (*bits && block->contig_hint_start >= block_off &&
392 *bits + block->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) {
393 *bit_off = pcpu_block_off_to_off(i,
394 block->contig_hint_start);
395 return;
396 }
397
398 block_off = 0;
399
400 *bits = block->right_free;
401 *bit_off = (i + 1) * PCPU_BITMAP_BLOCK_BITS - block->right_free;
402 }
403}
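
/*
 * pcpu_next_fit_region - finds fit areas for a given allocation request
 * @chunk: chunk of interest
 * @alloc_bits: size of allocation
 * @align: alignment of area (max PAGE_SIZE)
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * Finds the next free region that is viable for use with a given size and
 * alignment.  This only returns if there is a valid area to be used for this
 * allocation.  block->first_free is returned if the allocation request fits
 * within the block to see if the request can be fulfilled prior to the contig
 * hint.
 */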
419static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
420 int align, int *bit_off, int *bits)
421{
422 int i = pcpu_off_to_block_index(*bit_off);
423 int block_off = pcpu_off_to_block_off(*bit_off);
424 struct pcpu_block_md *block;
425
426 *bits = 0;
427 for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
428 block++, i++) {
429
430 if (*bits) {
431 *bits += block->left_free;
432 if (*bits >= alloc_bits)
433 return;
434 if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
435 continue;
436 }
437
438
439 *bits = ALIGN(block->contig_hint_start, align) -
440 block->contig_hint_start;
441
442
443
444
445 if (block->contig_hint &&
446 block->contig_hint_start >= block_off &&
447 block->contig_hint >= *bits + alloc_bits) {
448 int start = pcpu_next_hint(block, alloc_bits);
449
450 *bits += alloc_bits + block->contig_hint_start -
451 start;
452 *bit_off = pcpu_block_off_to_off(i, start);
453 return;
454 }
455
456 block_off = 0;
457
458 *bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free,
459 align);
460 *bits = PCPU_BITMAP_BLOCK_BITS - *bit_off;
461 *bit_off = pcpu_block_off_to_off(i, *bit_off);
462 if (*bits >= alloc_bits)
463 return;
464 }
465
466
467 *bit_off = pcpu_chunk_map_bits(chunk);
468}
469
470
471
472
473
474
475
476#define pcpu_for_each_md_free_region(chunk, bit_off, bits) \
477 for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits)); \
478 (bit_off) < pcpu_chunk_map_bits((chunk)); \
479 (bit_off) += (bits) + 1, \
480 pcpu_next_md_free_region((chunk), &(bit_off), &(bits)))
481
482#define pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) \
483 for (pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
484 &(bits)); \
485 (bit_off) < pcpu_chunk_map_bits((chunk)); \
486 (bit_off) += (bits), \
487 pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
488 &(bits)))
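
/**
 * pcpu_mem_zalloc - allocate memory
 * @size: bytes to allocate
 * @gfp: allocation flags
 *
 * Allocate @size bytes.  If @size is smaller than PAGE_SIZE,
 * kzalloc() is used; otherwise, the equivalent of vzalloc() is used.
 * The returned memory is always zeroed.
 *
 * RETURNS:
 * Pointer to the allocated area on success, NULL on failure.
 */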
503static void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
504{
505 if (WARN_ON_ONCE(!slab_is_available()))
506 return NULL;
507
508 if (size <= PAGE_SIZE)
509 return kzalloc(size, gfp);
510 else
511 return __vmalloc(size, gfp | __GFP_ZERO, PAGE_KERNEL);
512}
513
514
515
516
517
518
519
520static void pcpu_mem_free(void *ptr)
521{
522 kvfree(ptr);
523}
524
525static void __pcpu_chunk_move(struct pcpu_chunk *chunk, int slot,
526 bool move_front)
527{
528 if (chunk != pcpu_reserved_chunk) {
529 if (move_front)
530 list_move(&chunk->list, &pcpu_slot[slot]);
531 else
532 list_move_tail(&chunk->list, &pcpu_slot[slot]);
533 }
534}
535
536static void pcpu_chunk_move(struct pcpu_chunk *chunk, int slot)
537{
538 __pcpu_chunk_move(chunk, slot, true);
539}
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
555{
556 int nslot = pcpu_chunk_slot(chunk);
557
558 if (oslot != nslot)
559 __pcpu_chunk_move(chunk, nslot, oslot < nslot);
560}
561
562
563
564
565
566
567
568
569
570
571static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr)
572{
573 chunk->nr_empty_pop_pages += nr;
574 if (chunk != pcpu_reserved_chunk)
575 pcpu_nr_empty_pop_pages += nr;
576}
577
578
579
580
581
582
583
584
585
586
587
588static inline bool pcpu_region_overlap(int a, int b, int x, int y)
589{
590 return (a < y) && (x < b);
591}
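
/**
 * pcpu_block_update - updates a block given a free area
 * @block: block of interest
 * @start: start offset in block
 * @end: end offset in block
 *
 * Updates a block given a known free area.  The region [start, end) is
 * expected to be the entirety of the free area within a block.  Chooses
 * the best starting offset if the contig hints are equal.
 */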
603static void pcpu_block_update(struct pcpu_block_md *block, int start, int end)
604{
605 int contig = end - start;
606
607 block->first_free = min(block->first_free, start);
608 if (start == 0)
609 block->left_free = contig;
610
611 if (end == block->nr_bits)
612 block->right_free = contig;
613
614 if (contig > block->contig_hint) {
615
616 if (start > block->contig_hint_start) {
617 if (block->contig_hint > block->scan_hint) {
618 block->scan_hint_start =
619 block->contig_hint_start;
620 block->scan_hint = block->contig_hint;
621 } else if (start < block->scan_hint_start) {
622
623
624
625
626
627 block->scan_hint = 0;
628 }
629 } else {
630 block->scan_hint = 0;
631 }
632 block->contig_hint_start = start;
633 block->contig_hint = contig;
634 } else if (contig == block->contig_hint) {
635 if (block->contig_hint_start &&
636 (!start ||
637 __ffs(start) > __ffs(block->contig_hint_start))) {
638
639 block->contig_hint_start = start;
640 if (start < block->scan_hint_start &&
641 block->contig_hint > block->scan_hint)
642 block->scan_hint = 0;
643 } else if (start > block->scan_hint_start ||
644 block->contig_hint > block->scan_hint) {
645
646
647
648
649
650 block->scan_hint_start = start;
651 block->scan_hint = contig;
652 }
653 } else {
654
655
656
657
658
659 if ((start < block->contig_hint_start &&
660 (contig > block->scan_hint ||
661 (contig == block->scan_hint &&
662 start > block->scan_hint_start)))) {
663 block->scan_hint_start = start;
664 block->scan_hint = contig;
665 }
666 }
667}
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685static void pcpu_block_update_scan(struct pcpu_chunk *chunk, int bit_off,
686 int bits)
687{
688 int s_off = pcpu_off_to_block_off(bit_off);
689 int e_off = s_off + bits;
690 int s_index, l_bit;
691 struct pcpu_block_md *block;
692
693 if (e_off > PCPU_BITMAP_BLOCK_BITS)
694 return;
695
696 s_index = pcpu_off_to_block_index(bit_off);
697 block = chunk->md_blocks + s_index;
698
699
700 l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index), s_off);
701 s_off = (s_off == l_bit) ? 0 : l_bit + 1;
702
703 pcpu_block_update(block, s_off, e_off);
704}
705
706
707
708
709
710
711
712
713
714
715
716
717
718static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan)
719{
720 struct pcpu_block_md *chunk_md = &chunk->chunk_md;
721 int bit_off, bits;
722
723
724 if (!full_scan && chunk_md->scan_hint) {
725 bit_off = chunk_md->scan_hint_start + chunk_md->scan_hint;
726 chunk_md->contig_hint_start = chunk_md->scan_hint_start;
727 chunk_md->contig_hint = chunk_md->scan_hint;
728 chunk_md->scan_hint = 0;
729 } else {
730 bit_off = chunk_md->first_free;
731 chunk_md->contig_hint = 0;
732 }
733
734 bits = 0;
735 pcpu_for_each_md_free_region(chunk, bit_off, bits) {
736 pcpu_block_update(chunk_md, bit_off, bit_off + bits);
737 }
738}
739
740
741
742
743
744
745
746
747
748static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
749{
750 struct pcpu_block_md *block = chunk->md_blocks + index;
751 unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index);
752 int rs, re, start;
753
754
755 if (block->scan_hint) {
756 start = block->scan_hint_start + block->scan_hint;
757 block->contig_hint_start = block->scan_hint_start;
758 block->contig_hint = block->scan_hint;
759 block->scan_hint = 0;
760 } else {
761 start = block->first_free;
762 block->contig_hint = 0;
763 }
764
765 block->right_free = 0;
766
767
768 pcpu_for_each_unpop_region(alloc_map, rs, re, start,
769 PCPU_BITMAP_BLOCK_BITS) {
770 pcpu_block_update(block, rs, re);
771 }
772}
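
/**
 * pcpu_block_update_hint_alloc - update hint on allocation path
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of request
 *
 * Updates metadata for the allocation path.  The metadata only has to be
 * refreshed by a full scan iff the chunk's contig hint is broken.  Block level
 * scans are required if the block's contig hint is broken.
 */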
784static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
785 int bits)
786{
787 struct pcpu_block_md *chunk_md = &chunk->chunk_md;
788 int nr_empty_pages = 0;
789 struct pcpu_block_md *s_block, *e_block, *block;
790 int s_index, e_index;
791 int s_off, e_off;
792
793
794
795
796
797
798
799 s_index = pcpu_off_to_block_index(bit_off);
800 e_index = pcpu_off_to_block_index(bit_off + bits - 1);
801 s_off = pcpu_off_to_block_off(bit_off);
802 e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
803
804 s_block = chunk->md_blocks + s_index;
805 e_block = chunk->md_blocks + e_index;
806
807
808
809
810
811
812
813 if (s_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
814 nr_empty_pages++;
815
816 if (s_off == s_block->first_free)
817 s_block->first_free = find_next_zero_bit(
818 pcpu_index_alloc_map(chunk, s_index),
819 PCPU_BITMAP_BLOCK_BITS,
820 s_off + bits);
821
822 if (pcpu_region_overlap(s_block->scan_hint_start,
823 s_block->scan_hint_start + s_block->scan_hint,
824 s_off,
825 s_off + bits))
826 s_block->scan_hint = 0;
827
828 if (pcpu_region_overlap(s_block->contig_hint_start,
829 s_block->contig_hint_start +
830 s_block->contig_hint,
831 s_off,
832 s_off + bits)) {
833
834 if (!s_off)
835 s_block->left_free = 0;
836 pcpu_block_refresh_hint(chunk, s_index);
837 } else {
838
839 s_block->left_free = min(s_block->left_free, s_off);
840 if (s_index == e_index)
841 s_block->right_free = min_t(int, s_block->right_free,
842 PCPU_BITMAP_BLOCK_BITS - e_off);
843 else
844 s_block->right_free = 0;
845 }
846
847
848
849
850 if (s_index != e_index) {
851 if (e_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
852 nr_empty_pages++;
853
854
855
856
857
858 e_block->first_free = find_next_zero_bit(
859 pcpu_index_alloc_map(chunk, e_index),
860 PCPU_BITMAP_BLOCK_BITS, e_off);
861
862 if (e_off == PCPU_BITMAP_BLOCK_BITS) {
863
864 e_block++;
865 } else {
866 if (e_off > e_block->scan_hint_start)
867 e_block->scan_hint = 0;
868
869 e_block->left_free = 0;
870 if (e_off > e_block->contig_hint_start) {
871
872 pcpu_block_refresh_hint(chunk, e_index);
873 } else {
874 e_block->right_free =
875 min_t(int, e_block->right_free,
876 PCPU_BITMAP_BLOCK_BITS - e_off);
877 }
878 }
879
880
881 nr_empty_pages += (e_index - s_index - 1);
882 for (block = s_block + 1; block < e_block; block++) {
883 block->scan_hint = 0;
884 block->contig_hint = 0;
885 block->left_free = 0;
886 block->right_free = 0;
887 }
888 }
889
890 if (nr_empty_pages)
891 pcpu_update_empty_pages(chunk, -nr_empty_pages);
892
893 if (pcpu_region_overlap(chunk_md->scan_hint_start,
894 chunk_md->scan_hint_start +
895 chunk_md->scan_hint,
896 bit_off,
897 bit_off + bits))
898 chunk_md->scan_hint = 0;
899
900
901
902
903
904
905 if (pcpu_region_overlap(chunk_md->contig_hint_start,
906 chunk_md->contig_hint_start +
907 chunk_md->contig_hint,
908 bit_off,
909 bit_off + bits))
910 pcpu_chunk_refresh_hint(chunk, false);
911}
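
/**
 * pcpu_block_update_hint_free - updates the block hints on the free path
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of request
 *
 * Updates metadata for the free path.  This avoids a blind block refresh by
 * making use of the block contig hints.  If this fails, it scans forward and
 * backward to determine the extent of the free area.  This is capped at the
 * boundary of blocks.
 *
 * A chunk update is triggered if a page becomes free, a block becomes free,
 * or the free spans across blocks.  This tradeoff is to minimize iterating
 * over the block metadata to update chunk_md->contig_hint.
 */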
931static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
932 int bits)
933{
934 int nr_empty_pages = 0;
935 struct pcpu_block_md *s_block, *e_block, *block;
936 int s_index, e_index;
937 int s_off, e_off;
938 int start, end;
939
940
941
942
943
944
945
946 s_index = pcpu_off_to_block_index(bit_off);
947 e_index = pcpu_off_to_block_index(bit_off + bits - 1);
948 s_off = pcpu_off_to_block_off(bit_off);
949 e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
950
951 s_block = chunk->md_blocks + s_index;
952 e_block = chunk->md_blocks + e_index;
953
954
955
956
957
958
959
960
961
962
963
964 start = s_off;
965 if (s_off == s_block->contig_hint + s_block->contig_hint_start) {
966 start = s_block->contig_hint_start;
967 } else {
968
969
970
971
972
973
974 int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index),
975 start);
976 start = (start == l_bit) ? 0 : l_bit + 1;
977 }
978
979 end = e_off;
980 if (e_off == e_block->contig_hint_start)
981 end = e_block->contig_hint_start + e_block->contig_hint;
982 else
983 end = find_next_bit(pcpu_index_alloc_map(chunk, e_index),
984 PCPU_BITMAP_BLOCK_BITS, end);
985
986
987 e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS;
988 if (!start && e_off == PCPU_BITMAP_BLOCK_BITS)
989 nr_empty_pages++;
990 pcpu_block_update(s_block, start, e_off);
991
992
993 if (s_index != e_index) {
994
995 if (end == PCPU_BITMAP_BLOCK_BITS)
996 nr_empty_pages++;
997 pcpu_block_update(e_block, 0, end);
998
999
1000 nr_empty_pages += (e_index - s_index - 1);
1001 for (block = s_block + 1; block < e_block; block++) {
1002 block->first_free = 0;
1003 block->scan_hint = 0;
1004 block->contig_hint_start = 0;
1005 block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
1006 block->left_free = PCPU_BITMAP_BLOCK_BITS;
1007 block->right_free = PCPU_BITMAP_BLOCK_BITS;
1008 }
1009 }
1010
1011 if (nr_empty_pages)
1012 pcpu_update_empty_pages(chunk, nr_empty_pages);
1013
1014
1015
1016
1017
1018
1019
1020 if (((end - start) >= PCPU_BITMAP_BLOCK_BITS) || s_index != e_index)
1021 pcpu_chunk_refresh_hint(chunk, true);
1022 else
1023 pcpu_block_update(&chunk->chunk_md,
1024 pcpu_block_off_to_off(s_index, start),
1025 end);
1026}
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits,
1042 int *next_off)
1043{
1044 int page_start, page_end, rs, re;
1045
1046 page_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
1047 page_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
1048
1049 rs = page_start;
1050 pcpu_next_unpop(chunk->populated, &rs, &re, page_end);
1051 if (rs >= page_end)
1052 return true;
1053
1054 *next_off = re * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
1055 return false;
1056}
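
/**
 * pcpu_find_block_fit - finds the block index to start searching
 * @chunk: chunk of interest
 * @alloc_bits: size of request in allocation units
 * @align: alignment of area (max PAGE_SIZE bytes)
 * @pop_only: use populated regions only
 *
 * Given a chunk and an allocation spec, find the offset to begin searching
 * for a free region.  This iterates over the bitmap metadata blocks to
 * find an offset that will be guaranteed to fit the requirements.  It is
 * not quite first fit as if the allocation does not fit in the contig hint
 * of a block or chunk it is skipped.  This errs on the side of caution
 * to prevent excess iteration.  Poor alignment can cause the allocator to
 * skip over blocks and chunks that have valid free areas.
 *
 * RETURNS:
 * The offset in the bitmap to begin searching.
 * -1 if no offset is found.
 */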
1077static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
1078 size_t align, bool pop_only)
1079{
1080 struct pcpu_block_md *chunk_md = &chunk->chunk_md;
1081 int bit_off, bits, next_off;
1082
1083
1084
1085
1086
1087
1088
1089 bit_off = ALIGN(chunk_md->contig_hint_start, align) -
1090 chunk_md->contig_hint_start;
1091 if (bit_off + alloc_bits > chunk_md->contig_hint)
1092 return -1;
1093
1094 bit_off = pcpu_next_hint(chunk_md, alloc_bits);
1095 bits = 0;
1096 pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) {
1097 if (!pop_only || pcpu_is_populated(chunk, bit_off, bits,
1098 &next_off))
1099 break;
1100
1101 bit_off = next_off;
1102 bits = 0;
1103 }
1104
1105 if (bit_off == pcpu_chunk_map_bits(chunk))
1106 return -1;
1107
1108 return bit_off;
1109}
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131static unsigned long pcpu_find_zero_area(unsigned long *map,
1132 unsigned long size,
1133 unsigned long start,
1134 unsigned long nr,
1135 unsigned long align_mask,
1136 unsigned long *largest_off,
1137 unsigned long *largest_bits)
1138{
1139 unsigned long index, end, i, area_off, area_bits;
1140again:
1141 index = find_next_zero_bit(map, size, start);
1142
1143
1144 index = __ALIGN_MASK(index, align_mask);
1145 area_off = index;
1146
1147 end = index + nr;
1148 if (end > size)
1149 return end;
1150 i = find_next_bit(map, end, index);
1151 if (i < end) {
1152 area_bits = i - area_off;
1153
1154 if (area_bits > *largest_bits ||
1155 (area_bits == *largest_bits && *largest_off &&
1156 (!area_off || __ffs(area_off) > __ffs(*largest_off)))) {
1157 *largest_off = area_off;
1158 *largest_bits = area_bits;
1159 }
1160
1161 start = i + 1;
1162 goto again;
1163 }
1164 return index;
1165}
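
/**
 * pcpu_alloc_area - allocates an area from a pcpu_chunk
 * @chunk: chunk of interest
 * @alloc_bits: size of request in allocation units
 * @align: alignment of area (max PAGE_SIZE)
 * @start: bit_off to start searching
 *
 * This function takes in a @start offset to begin searching to fit an
 * allocation of @alloc_bits with alignment @align.  It needs to scan
 * the allocation map because if it fits within the block's contig hint,
 * @start will be block->first_free.  This is an attempt to fill the
 * allocation prior to breaking the contig hint.  The allocation and
 * boundary maps are updated accordingly if it confirms a valid
 * free area.
 *
 * RETURNS:
 * Allocated addr offset in @chunk on success.
 * -1 if no matching area is found.
 */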
1186static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
1187 size_t align, int start)
1188{
1189 struct pcpu_block_md *chunk_md = &chunk->chunk_md;
1190 size_t align_mask = (align) ? (align - 1) : 0;
1191 unsigned long area_off = 0, area_bits = 0;
1192 int bit_off, end, oslot;
1193
1194 lockdep_assert_held(&pcpu_lock);
1195
1196 oslot = pcpu_chunk_slot(chunk);
1197
1198
1199
1200
1201 end = min_t(int, start + alloc_bits + PCPU_BITMAP_BLOCK_BITS,
1202 pcpu_chunk_map_bits(chunk));
1203 bit_off = pcpu_find_zero_area(chunk->alloc_map, end, start, alloc_bits,
1204 align_mask, &area_off, &area_bits);
1205 if (bit_off >= end)
1206 return -1;
1207
1208 if (area_bits)
1209 pcpu_block_update_scan(chunk, area_off, area_bits);
1210
1211
1212 bitmap_set(chunk->alloc_map, bit_off, alloc_bits);
1213
1214
1215 set_bit(bit_off, chunk->bound_map);
1216 bitmap_clear(chunk->bound_map, bit_off + 1, alloc_bits - 1);
1217 set_bit(bit_off + alloc_bits, chunk->bound_map);
1218
1219 chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE;
1220
1221
1222 if (bit_off == chunk_md->first_free)
1223 chunk_md->first_free = find_next_zero_bit(
1224 chunk->alloc_map,
1225 pcpu_chunk_map_bits(chunk),
1226 bit_off + alloc_bits);
1227
1228 pcpu_block_update_hint_alloc(chunk, bit_off, alloc_bits);
1229
1230 pcpu_chunk_relocate(chunk, oslot);
1231
1232 return bit_off * PCPU_MIN_ALLOC_SIZE;
1233}
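
/**
 * pcpu_free_area - frees the corresponding offset
 * @chunk: chunk of interest
 * @off: addr offset into chunk
 *
 * This function determines the size of an allocation to free using
 * the boundary bitmap and clears the allocation map.
 */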
1243static void pcpu_free_area(struct pcpu_chunk *chunk, int off)
1244{
1245 struct pcpu_block_md *chunk_md = &chunk->chunk_md;
1246 int bit_off, bits, end, oslot;
1247
1248 lockdep_assert_held(&pcpu_lock);
1249 pcpu_stats_area_dealloc(chunk);
1250
1251 oslot = pcpu_chunk_slot(chunk);
1252
1253 bit_off = off / PCPU_MIN_ALLOC_SIZE;
1254
1255
1256 end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk),
1257 bit_off + 1);
1258 bits = end - bit_off;
1259 bitmap_clear(chunk->alloc_map, bit_off, bits);
1260
1261
1262 chunk->free_bytes += bits * PCPU_MIN_ALLOC_SIZE;
1263
1264
1265 chunk_md->first_free = min(chunk_md->first_free, bit_off);
1266
1267 pcpu_block_update_hint_free(chunk, bit_off, bits);
1268
1269 pcpu_chunk_relocate(chunk, oslot);
1270}
1271
1272static void pcpu_init_md_block(struct pcpu_block_md *block, int nr_bits)
1273{
1274 block->scan_hint = 0;
1275 block->contig_hint = nr_bits;
1276 block->left_free = nr_bits;
1277 block->right_free = nr_bits;
1278 block->first_free = 0;
1279 block->nr_bits = nr_bits;
1280}
1281
1282static void pcpu_init_md_blocks(struct pcpu_chunk *chunk)
1283{
1284 struct pcpu_block_md *md_block;
1285
1286
1287 pcpu_init_md_block(&chunk->chunk_md, pcpu_chunk_map_bits(chunk));
1288
1289 for (md_block = chunk->md_blocks;
1290 md_block != chunk->md_blocks + pcpu_chunk_nr_blocks(chunk);
1291 md_block++)
1292 pcpu_init_md_block(md_block, PCPU_BITMAP_BLOCK_BITS);
1293}
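
/**
 * pcpu_alloc_first_chunk - creates chunks that serve the first chunk
 * @tmp_addr: the start of the region served
 * @map_size: size of the region served
 *
 * This is responsible for creating the chunks that serve the first chunk.
 * The base_addr is page aligned down of @tmp_addr while the region end is
 * page aligned up.  Offsets are kept track of to determine the region
 * served.  All this is done to appease the bitmap allocator in avoiding
 * partial blocks.
 *
 * RETURNS:
 * Chunk serving the region at @tmp_addr of @map_size.
 */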
1308static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
1309 int map_size)
1310{
1311 struct pcpu_chunk *chunk;
1312 unsigned long aligned_addr, lcm_align;
1313 int start_offset, offset_bits, region_size, region_bits;
1314 size_t alloc_size;
1315
1316
1317 aligned_addr = tmp_addr & PAGE_MASK;
1318
1319 start_offset = tmp_addr - aligned_addr;
1320
1321
1322
1323
1324
1325
1326 lcm_align = lcm(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE);
1327 region_size = ALIGN(start_offset + map_size, lcm_align);
1328
	/* allocate chunk; the trailing populated[] bitmap is sized in longs */
	alloc_size = sizeof(struct pcpu_chunk) +
		BITS_TO_LONGS(region_size >> PAGE_SHIFT) * sizeof(unsigned long);
1332 chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
1333 if (!chunk)
1334 panic("%s: Failed to allocate %zu bytes\n", __func__,
1335 alloc_size);
1336
1337 INIT_LIST_HEAD(&chunk->list);
1338
1339 chunk->base_addr = (void *)aligned_addr;
1340 chunk->start_offset = start_offset;
1341 chunk->end_offset = region_size - chunk->start_offset - map_size;
1342
1343 chunk->nr_pages = region_size >> PAGE_SHIFT;
1344 region_bits = pcpu_chunk_map_bits(chunk);
1345
1346 alloc_size = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]);
1347 chunk->alloc_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
1348 if (!chunk->alloc_map)
1349 panic("%s: Failed to allocate %zu bytes\n", __func__,
1350 alloc_size);
1351
1352 alloc_size =
1353 BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]);
1354 chunk->bound_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
1355 if (!chunk->bound_map)
1356 panic("%s: Failed to allocate %zu bytes\n", __func__,
1357 alloc_size);
1358
1359 alloc_size = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]);
1360 chunk->md_blocks = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
1361 if (!chunk->md_blocks)
1362 panic("%s: Failed to allocate %zu bytes\n", __func__,
1363 alloc_size);
1364
1365 pcpu_init_md_blocks(chunk);
1366
1367
1368 chunk->immutable = true;
1369 bitmap_fill(chunk->populated, chunk->nr_pages);
1370 chunk->nr_populated = chunk->nr_pages;
1371 chunk->nr_empty_pop_pages = chunk->nr_pages;
1372
1373 chunk->free_bytes = map_size;
1374
1375 if (chunk->start_offset) {
1376
1377 offset_bits = chunk->start_offset / PCPU_MIN_ALLOC_SIZE;
1378 bitmap_set(chunk->alloc_map, 0, offset_bits);
1379 set_bit(0, chunk->bound_map);
1380 set_bit(offset_bits, chunk->bound_map);
1381
1382 chunk->chunk_md.first_free = offset_bits;
1383
1384 pcpu_block_update_hint_alloc(chunk, 0, offset_bits);
1385 }
1386
1387 if (chunk->end_offset) {
1388
1389 offset_bits = chunk->end_offset / PCPU_MIN_ALLOC_SIZE;
1390 bitmap_set(chunk->alloc_map,
1391 pcpu_chunk_map_bits(chunk) - offset_bits,
1392 offset_bits);
1393 set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE,
1394 chunk->bound_map);
1395 set_bit(region_bits, chunk->bound_map);
1396
1397 pcpu_block_update_hint_alloc(chunk, pcpu_chunk_map_bits(chunk)
1398 - offset_bits, offset_bits);
1399 }
1400
1401 return chunk;
1402}
1403
1404static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
1405{
1406 struct pcpu_chunk *chunk;
1407 int region_bits;
1408
1409 chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp);
1410 if (!chunk)
1411 return NULL;
1412
1413 INIT_LIST_HEAD(&chunk->list);
1414 chunk->nr_pages = pcpu_unit_pages;
1415 region_bits = pcpu_chunk_map_bits(chunk);
1416
1417 chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) *
1418 sizeof(chunk->alloc_map[0]), gfp);
1419 if (!chunk->alloc_map)
1420 goto alloc_map_fail;
1421
1422 chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) *
1423 sizeof(chunk->bound_map[0]), gfp);
1424 if (!chunk->bound_map)
1425 goto bound_map_fail;
1426
1427 chunk->md_blocks = pcpu_mem_zalloc(pcpu_chunk_nr_blocks(chunk) *
1428 sizeof(chunk->md_blocks[0]), gfp);
1429 if (!chunk->md_blocks)
1430 goto md_blocks_fail;
1431
1432 pcpu_init_md_blocks(chunk);
1433
1434
1435 chunk->free_bytes = chunk->nr_pages * PAGE_SIZE;
1436
1437 return chunk;
1438
1439md_blocks_fail:
1440 pcpu_mem_free(chunk->bound_map);
1441bound_map_fail:
1442 pcpu_mem_free(chunk->alloc_map);
1443alloc_map_fail:
1444 pcpu_mem_free(chunk);
1445
1446 return NULL;
1447}
1448
1449static void pcpu_free_chunk(struct pcpu_chunk *chunk)
1450{
1451 if (!chunk)
1452 return;
1453 pcpu_mem_free(chunk->md_blocks);
1454 pcpu_mem_free(chunk->bound_map);
1455 pcpu_mem_free(chunk->alloc_map);
1456 pcpu_mem_free(chunk);
1457}
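
/**
 * pcpu_chunk_populated - post-population bookkeeping
 * @chunk: pcpu_chunk which got populated
 * @page_start: the start page
 * @page_end: the end page
 *
 * Pages in [@page_start,@page_end) have been populated to @chunk.  Update
 * the bookkeeping information accordingly.  Must be called after each
 * successful population.
 */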
1472static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
1473 int page_end)
1474{
1475 int nr = page_end - page_start;
1476
1477 lockdep_assert_held(&pcpu_lock);
1478
1479 bitmap_set(chunk->populated, page_start, nr);
1480 chunk->nr_populated += nr;
1481 pcpu_nr_populated += nr;
1482
1483 pcpu_update_empty_pages(chunk, nr);
1484}
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
1497 int page_start, int page_end)
1498{
1499 int nr = page_end - page_start;
1500
1501 lockdep_assert_held(&pcpu_lock);
1502
1503 bitmap_clear(chunk->populated, page_start, nr);
1504 chunk->nr_populated -= nr;
1505 pcpu_nr_populated -= nr;
1506
1507 pcpu_update_empty_pages(chunk, -nr);
1508}
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
1526 int page_start, int page_end, gfp_t gfp);
1527static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
1528 int page_start, int page_end);
1529static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp);
1530static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
1531static struct page *pcpu_addr_to_page(void *addr);
1532static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
1533
1534#ifdef CONFIG_NEED_PER_CPU_KM
1535#include "percpu-km.c"
1536#else
1537#include "percpu-vm.c"
1538#endif
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
1551{
1552
1553 if (pcpu_addr_in_chunk(pcpu_first_chunk, addr))
1554 return pcpu_first_chunk;
1555
1556
1557 if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr))
1558 return pcpu_reserved_chunk;
1559
1560
1561
1562
1563
1564
1565
1566
1567 addr += pcpu_unit_offsets[raw_smp_processor_id()];
1568 return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
1569}
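
/**
 * pcpu_alloc - the percpu allocator
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 * @reserved: allocate from the reserved chunk if available
 * @gfp: allocation flags
 *
 * Allocate percpu area of @size bytes aligned at @align.  If @gfp doesn't
 * contain %GFP_KERNEL, the allocation is atomic.  If @gfp has __GFP_NOWARN
 * then no warning will be triggered on invalid or failed allocation
 * requests.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */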
1586static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
1587 gfp_t gfp)
1588{
1589
1590 gfp_t pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
1591 bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
1592 bool do_warn = !(gfp & __GFP_NOWARN);
1593 static int warn_limit = 10;
1594 struct pcpu_chunk *chunk, *next;
1595 const char *err;
1596 int slot, off, cpu, ret;
1597 unsigned long flags;
1598 void __percpu *ptr;
1599 size_t bits, bit_align;
1600
1601
1602
1603
1604
1605
1606
1607 if (unlikely(align < PCPU_MIN_ALLOC_SIZE))
1608 align = PCPU_MIN_ALLOC_SIZE;
1609
1610 size = ALIGN(size, PCPU_MIN_ALLOC_SIZE);
1611 bits = size >> PCPU_MIN_ALLOC_SHIFT;
1612 bit_align = align >> PCPU_MIN_ALLOC_SHIFT;
1613
1614 if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
1615 !is_power_of_2(align))) {
1616 WARN(do_warn, "illegal size (%zu) or align (%zu) for percpu allocation\n",
1617 size, align);
1618 return NULL;
1619 }
1620
1621 if (!is_atomic) {
1622
1623
1624
1625
1626
1627 if (gfp & __GFP_NOFAIL)
1628 mutex_lock(&pcpu_alloc_mutex);
1629 else if (mutex_lock_killable(&pcpu_alloc_mutex))
1630 return NULL;
1631 }
1632
1633 spin_lock_irqsave(&pcpu_lock, flags);
1634
1635
1636 if (reserved && pcpu_reserved_chunk) {
1637 chunk = pcpu_reserved_chunk;
1638
1639 off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic);
1640 if (off < 0) {
1641 err = "alloc from reserved chunk failed";
1642 goto fail_unlock;
1643 }
1644
1645 off = pcpu_alloc_area(chunk, bits, bit_align, off);
1646 if (off >= 0)
1647 goto area_found;
1648
1649 err = "alloc from reserved chunk failed";
1650 goto fail_unlock;
1651 }
1652
1653restart:
1654
1655 for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
1656 list_for_each_entry_safe(chunk, next, &pcpu_slot[slot], list) {
1657 off = pcpu_find_block_fit(chunk, bits, bit_align,
1658 is_atomic);
1659 if (off < 0) {
1660 if (slot < PCPU_SLOT_FAIL_THRESHOLD)
1661 pcpu_chunk_move(chunk, 0);
1662 continue;
1663 }
1664
1665 off = pcpu_alloc_area(chunk, bits, bit_align, off);
1666 if (off >= 0)
1667 goto area_found;
1668
1669 }
1670 }
1671
1672 spin_unlock_irqrestore(&pcpu_lock, flags);
1673
1674
1675
1676
1677
1678
1679 if (is_atomic) {
1680 err = "atomic alloc failed, no space left";
1681 goto fail;
1682 }
1683
1684 if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
1685 chunk = pcpu_create_chunk(pcpu_gfp);
1686 if (!chunk) {
1687 err = "failed to allocate new chunk";
1688 goto fail;
1689 }
1690
1691 spin_lock_irqsave(&pcpu_lock, flags);
1692 pcpu_chunk_relocate(chunk, -1);
1693 } else {
1694 spin_lock_irqsave(&pcpu_lock, flags);
1695 }
1696
1697 goto restart;
1698
1699area_found:
1700 pcpu_stats_area_alloc(chunk, size);
1701 spin_unlock_irqrestore(&pcpu_lock, flags);
1702
1703
1704 if (!is_atomic) {
1705 int page_start, page_end, rs, re;
1706
1707 page_start = PFN_DOWN(off);
1708 page_end = PFN_UP(off + size);
1709
1710 pcpu_for_each_unpop_region(chunk->populated, rs, re,
1711 page_start, page_end) {
1712 WARN_ON(chunk->immutable);
1713
1714 ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp);
1715
1716 spin_lock_irqsave(&pcpu_lock, flags);
1717 if (ret) {
1718 pcpu_free_area(chunk, off);
1719 err = "failed to populate";
1720 goto fail_unlock;
1721 }
1722 pcpu_chunk_populated(chunk, rs, re);
1723 spin_unlock_irqrestore(&pcpu_lock, flags);
1724 }
1725
1726 mutex_unlock(&pcpu_alloc_mutex);
1727 }
1728
1729 if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
1730 pcpu_schedule_balance_work();
1731
1732
1733 for_each_possible_cpu(cpu)
1734 memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
1735
1736 ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
1737 kmemleak_alloc_percpu(ptr, size, gfp);
1738
1739 trace_percpu_alloc_percpu(reserved, is_atomic, size, align,
1740 chunk->base_addr, off, ptr);
1741
1742 return ptr;
1743
1744fail_unlock:
1745 spin_unlock_irqrestore(&pcpu_lock, flags);
1746fail:
1747 trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);
1748
1749 if (!is_atomic && do_warn && warn_limit) {
1750 pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
1751 size, align, is_atomic, err);
1752 dump_stack();
1753 if (!--warn_limit)
1754 pr_info("limit reached, disable warning\n");
1755 }
1756 if (is_atomic) {
1757
1758 pcpu_atomic_alloc_failed = true;
1759 pcpu_schedule_balance_work();
1760 } else {
1761 mutex_unlock(&pcpu_alloc_mutex);
1762 }
1763 return NULL;
1764}
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp)
1782{
1783 return pcpu_alloc(size, align, false, gfp);
1784}
1785EXPORT_SYMBOL_GPL(__alloc_percpu_gfp);
1786
1787
1788
1789
1790
1791
1792
1793
1794void __percpu *__alloc_percpu(size_t size, size_t align)
1795{
1796 return pcpu_alloc(size, align, false, GFP_KERNEL);
1797}
1798EXPORT_SYMBOL_GPL(__alloc_percpu);
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
1817{
1818 return pcpu_alloc(size, align, true, GFP_KERNEL);
1819}
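
/**
 * pcpu_balance_workfn - manage the amount of free chunks and populated pages
 * @work: unused
 *
 * Reclaim all fully free chunks except for the first one.  This is also
 * responsible for maintaining the pool of empty populated pages.  However,
 * it is possible that this is called when physical memory is scarce causing
 * OOM killer to be triggered.  We should avoid doing so until an actual
 * allocation causes the failure as it is possible that requests can be
 * serviced from already backed regions.
 */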
1832static void pcpu_balance_workfn(struct work_struct *work)
1833{
1834
1835 const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
1836 LIST_HEAD(to_free);
1837 struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
1838 struct pcpu_chunk *chunk, *next;
1839 int slot, nr_to_pop, ret;
1840
1841
1842
1843
1844
1845 mutex_lock(&pcpu_alloc_mutex);
1846 spin_lock_irq(&pcpu_lock);
1847
1848 list_for_each_entry_safe(chunk, next, free_head, list) {
1849 WARN_ON(chunk->immutable);
1850
1851
1852 if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
1853 continue;
1854
1855 list_move(&chunk->list, &to_free);
1856 }
1857
1858 spin_unlock_irq(&pcpu_lock);
1859
1860 list_for_each_entry_safe(chunk, next, &to_free, list) {
1861 int rs, re;
1862
1863 pcpu_for_each_pop_region(chunk->populated, rs, re, 0,
1864 chunk->nr_pages) {
1865 pcpu_depopulate_chunk(chunk, rs, re);
1866 spin_lock_irq(&pcpu_lock);
1867 pcpu_chunk_depopulated(chunk, rs, re);
1868 spin_unlock_irq(&pcpu_lock);
1869 }
1870 pcpu_destroy_chunk(chunk);
1871 cond_resched();
1872 }
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884retry_pop:
1885 if (pcpu_atomic_alloc_failed) {
1886 nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
1887
1888 pcpu_atomic_alloc_failed = false;
1889 } else {
1890 nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
1891 pcpu_nr_empty_pop_pages,
1892 0, PCPU_EMPTY_POP_PAGES_HIGH);
1893 }
1894
1895 for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
1896 int nr_unpop = 0, rs, re;
1897
1898 if (!nr_to_pop)
1899 break;
1900
1901 spin_lock_irq(&pcpu_lock);
1902 list_for_each_entry(chunk, &pcpu_slot[slot], list) {
1903 nr_unpop = chunk->nr_pages - chunk->nr_populated;
1904 if (nr_unpop)
1905 break;
1906 }
1907 spin_unlock_irq(&pcpu_lock);
1908
1909 if (!nr_unpop)
1910 continue;
1911
1912
1913 pcpu_for_each_unpop_region(chunk->populated, rs, re, 0,
1914 chunk->nr_pages) {
1915 int nr = min(re - rs, nr_to_pop);
1916
1917 ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp);
1918 if (!ret) {
1919 nr_to_pop -= nr;
1920 spin_lock_irq(&pcpu_lock);
1921 pcpu_chunk_populated(chunk, rs, rs + nr);
1922 spin_unlock_irq(&pcpu_lock);
1923 } else {
1924 nr_to_pop = 0;
1925 }
1926
1927 if (!nr_to_pop)
1928 break;
1929 }
1930 }
1931
1932 if (nr_to_pop) {
1933
1934 chunk = pcpu_create_chunk(gfp);
1935 if (chunk) {
1936 spin_lock_irq(&pcpu_lock);
1937 pcpu_chunk_relocate(chunk, -1);
1938 spin_unlock_irq(&pcpu_lock);
1939 goto retry_pop;
1940 }
1941 }
1942
1943 mutex_unlock(&pcpu_alloc_mutex);
1944}
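
/**
 * free_percpu - free percpu area
 * @ptr: pointer to area to free
 *
 * Free percpu area @ptr.
 *
 * CONTEXT:
 * Can be called from atomic context.
 */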
1955void free_percpu(void __percpu *ptr)
1956{
1957 void *addr;
1958 struct pcpu_chunk *chunk;
1959 unsigned long flags;
1960 int off;
1961 bool need_balance = false;
1962
1963 if (!ptr)
1964 return;
1965
1966 kmemleak_free_percpu(ptr);
1967
1968 addr = __pcpu_ptr_to_addr(ptr);
1969
1970 spin_lock_irqsave(&pcpu_lock, flags);
1971
1972 chunk = pcpu_chunk_addr_search(addr);
1973 off = addr - chunk->base_addr;
1974
1975 pcpu_free_area(chunk, off);
1976
1977
1978 if (chunk->free_bytes == pcpu_unit_size) {
1979 struct pcpu_chunk *pos;
1980
1981 list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
1982 if (pos != chunk) {
1983 need_balance = true;
1984 break;
1985 }
1986 }
1987
1988 trace_percpu_free_percpu(chunk->base_addr, off, ptr);
1989
1990 spin_unlock_irqrestore(&pcpu_lock, flags);
1991
1992 if (need_balance)
1993 pcpu_schedule_balance_work();
1994}
1995EXPORT_SYMBOL_GPL(free_percpu);
1996
1997bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
1998{
1999#ifdef CONFIG_SMP
2000 const size_t static_size = __per_cpu_end - __per_cpu_start;
2001 void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
2002 unsigned int cpu;
2003
2004 for_each_possible_cpu(cpu) {
2005 void *start = per_cpu_ptr(base, cpu);
2006 void *va = (void *)addr;
2007
2008 if (va >= start && va < start + static_size) {
2009 if (can_addr) {
2010 *can_addr = (unsigned long) (va - start);
2011 *can_addr += (unsigned long)
2012 per_cpu_ptr(base, get_boot_cpu_id());
2013 }
2014 return true;
2015 }
2016 }
2017#endif
2018
2019 return false;
2020}
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033bool is_kernel_percpu_address(unsigned long addr)
2034{
2035 return __is_kernel_percpu_address(addr, NULL);
2036}
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061phys_addr_t per_cpu_ptr_to_phys(void *addr)
2062{
2063 void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
2064 bool in_first_chunk = false;
2065 unsigned long first_low, first_high;
2066 unsigned int cpu;
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078 first_low = (unsigned long)pcpu_base_addr +
2079 pcpu_unit_page_offset(pcpu_low_unit_cpu, 0);
2080 first_high = (unsigned long)pcpu_base_addr +
2081 pcpu_unit_page_offset(pcpu_high_unit_cpu, pcpu_unit_pages);
2082 if ((unsigned long)addr >= first_low &&
2083 (unsigned long)addr < first_high) {
2084 for_each_possible_cpu(cpu) {
2085 void *start = per_cpu_ptr(base, cpu);
2086
2087 if (addr >= start && addr < start + pcpu_unit_size) {
2088 in_first_chunk = true;
2089 break;
2090 }
2091 }
2092 }
2093
2094 if (in_first_chunk) {
2095 if (!is_vmalloc_addr(addr))
2096 return __pa(addr);
2097 else
2098 return page_to_phys(vmalloc_to_page(addr)) +
2099 offset_in_page(addr);
2100 } else
2101 return page_to_phys(pcpu_addr_to_page(addr)) +
2102 offset_in_page(addr);
2103}
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
2121 int nr_units)
2122{
2123 struct pcpu_alloc_info *ai;
2124 size_t base_size, ai_size;
2125 void *ptr;
2126 int unit;
2127
2128 base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]),
2129 __alignof__(ai->groups[0].cpu_map[0]));
2130 ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
2131
2132 ptr = memblock_alloc(PFN_ALIGN(ai_size), PAGE_SIZE);
2133 if (!ptr)
2134 return NULL;
2135 ai = ptr;
2136 ptr += base_size;
2137
2138 ai->groups[0].cpu_map = ptr;
2139
2140 for (unit = 0; unit < nr_units; unit++)
2141 ai->groups[0].cpu_map[unit] = NR_CPUS;
2142
2143 ai->nr_groups = nr_groups;
2144 ai->__ai_size = PFN_ALIGN(ai_size);
2145
2146 return ai;
2147}
2148
2149
2150
2151
2152
2153
2154
2155void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
2156{
2157 memblock_free_early(__pa(ai), ai->__ai_size);
2158}
2159
2160
2161
2162
2163
2164
2165
2166
2167static void pcpu_dump_alloc_info(const char *lvl,
2168 const struct pcpu_alloc_info *ai)
2169{
2170 int group_width = 1, cpu_width = 1, width;
2171 char empty_str[] = "--------";
2172 int alloc = 0, alloc_end = 0;
2173 int group, v;
2174 int upa, apl;
2175
2176 v = ai->nr_groups;
2177 while (v /= 10)
2178 group_width++;
2179
2180 v = num_possible_cpus();
2181 while (v /= 10)
2182 cpu_width++;
2183 empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';
2184
2185 upa = ai->alloc_size / ai->unit_size;
2186 width = upa * (cpu_width + 1) + group_width + 3;
2187 apl = rounddown_pow_of_two(max(60 / width, 1));
2188
2189 printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
2190 lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
2191 ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);
2192
2193 for (group = 0; group < ai->nr_groups; group++) {
2194 const struct pcpu_group_info *gi = &ai->groups[group];
2195 int unit = 0, unit_end = 0;
2196
2197 BUG_ON(gi->nr_units % upa);
2198 for (alloc_end += gi->nr_units / upa;
2199 alloc < alloc_end; alloc++) {
2200 if (!(alloc % apl)) {
2201 pr_cont("\n");
2202 printk("%spcpu-alloc: ", lvl);
2203 }
2204 pr_cont("[%0*d] ", group_width, group);
2205
2206 for (unit_end += upa; unit < unit_end; unit++)
2207 if (gi->cpu_map[unit] != NR_CPUS)
2208 pr_cont("%0*d ",
2209 cpu_width, gi->cpu_map[unit]);
2210 else
2211 pr_cont("%s ", empty_str);
2212 }
2213 }
2214 pr_cont("\n");
2215}
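
/**
 * pcpu_setup_first_chunk - initialize the first percpu chunk
 * @ai: pcpu_alloc_info describing how the percpu area is shaped
 * @base_addr: mapped address
 *
 * Initialize the first percpu chunk which contains the kernel static
 * percpu area.  This function is to be called from arch percpu area
 * setup path.
 *
 * @ai contains all information necessary to initialize the first chunk
 * and prime the dynamic percpu allocator: static_size, reserved_size,
 * dyn_size, unit_size, atom_size, alloc_size and the group/unit to cpu
 * mappings.  The caller should have mapped the first chunk at @base_addr
 * and copied the static data to each unit.
 *
 * If the first chunk ends up with both reserved and dynamic areas, it is
 * served by two chunks - one to serve the core static and reserved areas
 * and the other for the dynamic area.  They share the same vm, but use
 * offset regions in the area allocation map.
 *
 * RETURNS:
 * 0 on success.
 */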
2274int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
2275 void *base_addr)
2276{
2277 size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
2278 size_t static_size, dyn_size;
2279 struct pcpu_chunk *chunk;
2280 unsigned long *group_offsets;
2281 size_t *group_sizes;
2282 unsigned long *unit_off;
2283 unsigned int cpu;
2284 int *unit_map;
2285 int group, unit, i;
2286 int map_size;
2287 unsigned long tmp_addr;
2288 size_t alloc_size;
2289
2290#define PCPU_SETUP_BUG_ON(cond) do { \
2291 if (unlikely(cond)) { \
2292 pr_emerg("failed to initialize, %s\n", #cond); \
2293 pr_emerg("cpu_possible_mask=%*pb\n", \
2294 cpumask_pr_args(cpu_possible_mask)); \
2295 pcpu_dump_alloc_info(KERN_EMERG, ai); \
2296 BUG(); \
2297 } \
2298} while (0)
2299
2300
2301 PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
2302#ifdef CONFIG_SMP
2303 PCPU_SETUP_BUG_ON(!ai->static_size);
2304 PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start));
2305#endif
2306 PCPU_SETUP_BUG_ON(!base_addr);
2307 PCPU_SETUP_BUG_ON(offset_in_page(base_addr));
2308 PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
2309 PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size));
2310 PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
2311 PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE));
2312 PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
2313 PCPU_SETUP_BUG_ON(!ai->dyn_size);
2314 PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE));
2315 PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) ||
2316 IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE)));
2317 PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
2318
2319
2320 alloc_size = ai->nr_groups * sizeof(group_offsets[0]);
2321 group_offsets = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
2322 if (!group_offsets)
2323 panic("%s: Failed to allocate %zu bytes\n", __func__,
2324 alloc_size);
2325
2326 alloc_size = ai->nr_groups * sizeof(group_sizes[0]);
2327 group_sizes = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
2328 if (!group_sizes)
2329 panic("%s: Failed to allocate %zu bytes\n", __func__,
2330 alloc_size);
2331
2332 alloc_size = nr_cpu_ids * sizeof(unit_map[0]);
2333 unit_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
2334 if (!unit_map)
2335 panic("%s: Failed to allocate %zu bytes\n", __func__,
2336 alloc_size);
2337
2338 alloc_size = nr_cpu_ids * sizeof(unit_off[0]);
2339 unit_off = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
2340 if (!unit_off)
2341 panic("%s: Failed to allocate %zu bytes\n", __func__,
2342 alloc_size);
2343
2344 for (cpu = 0; cpu < nr_cpu_ids; cpu++)
2345 unit_map[cpu] = UINT_MAX;
2346
2347 pcpu_low_unit_cpu = NR_CPUS;
2348 pcpu_high_unit_cpu = NR_CPUS;
2349
2350 for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
2351 const struct pcpu_group_info *gi = &ai->groups[group];
2352
2353 group_offsets[group] = gi->base_offset;
2354 group_sizes[group] = gi->nr_units * ai->unit_size;
2355
2356 for (i = 0; i < gi->nr_units; i++) {
2357 cpu = gi->cpu_map[i];
2358 if (cpu == NR_CPUS)
2359 continue;
2360
2361 PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids);
2362 PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
2363 PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);
2364
2365 unit_map[cpu] = unit + i;
2366 unit_off[cpu] = gi->base_offset + i * ai->unit_size;
2367
2368
2369 if (pcpu_low_unit_cpu == NR_CPUS ||
2370 unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
2371 pcpu_low_unit_cpu = cpu;
2372 if (pcpu_high_unit_cpu == NR_CPUS ||
2373 unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
2374 pcpu_high_unit_cpu = cpu;
2375 }
2376 }
2377 pcpu_nr_units = unit;
2378
2379 for_each_possible_cpu(cpu)
2380 PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);
2381
2382
2383#undef PCPU_SETUP_BUG_ON
2384 pcpu_dump_alloc_info(KERN_DEBUG, ai);
2385
2386 pcpu_nr_groups = ai->nr_groups;
2387 pcpu_group_offsets = group_offsets;
2388 pcpu_group_sizes = group_sizes;
2389 pcpu_unit_map = unit_map;
2390 pcpu_unit_offsets = unit_off;
2391
2392
2393 pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
2394 pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
2395 pcpu_atom_size = ai->atom_size;
2396 pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
2397 BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
2398
2399 pcpu_stats_save_ai(ai);
2400
2401
2402
2403
2404
2405 pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
2406 pcpu_slot = memblock_alloc(pcpu_nr_slots * sizeof(pcpu_slot[0]),
2407 SMP_CACHE_BYTES);
2408 if (!pcpu_slot)
2409 panic("%s: Failed to allocate %zu bytes\n", __func__,
2410 pcpu_nr_slots * sizeof(pcpu_slot[0]));
2411 for (i = 0; i < pcpu_nr_slots; i++)
2412 INIT_LIST_HEAD(&pcpu_slot[i]);
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422 static_size = ALIGN(ai->static_size, PCPU_MIN_ALLOC_SIZE);
2423 dyn_size = ai->dyn_size - (static_size - ai->static_size);
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433 tmp_addr = (unsigned long)base_addr + static_size;
2434 map_size = ai->reserved_size ?: dyn_size;
2435 chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
2436
2437
2438 if (ai->reserved_size) {
2439 pcpu_reserved_chunk = chunk;
2440
2441 tmp_addr = (unsigned long)base_addr + static_size +
2442 ai->reserved_size;
2443 map_size = dyn_size;
2444 chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
2445 }
2446
2447
2448 pcpu_first_chunk = chunk;
2449 pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages;
2450 pcpu_chunk_relocate(pcpu_first_chunk, -1);
2451
2452
2453 pcpu_nr_populated += PFN_DOWN(size_sum);
2454
2455 pcpu_stats_chunk_alloc();
2456 trace_percpu_create_chunk(base_addr);
2457
2458
2459 pcpu_base_addr = base_addr;
2460 return 0;
2461}
2462
2463#ifdef CONFIG_SMP
2464
2465const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = {
2466 [PCPU_FC_AUTO] = "auto",
2467 [PCPU_FC_EMBED] = "embed",
2468 [PCPU_FC_PAGE] = "page",
2469};
2470
2471enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
2472
2473static int __init percpu_alloc_setup(char *str)
2474{
2475 if (!str)
2476 return -EINVAL;
2477
2478 if (0)
2479 ;
2480#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
2481 else if (!strcmp(str, "embed"))
2482 pcpu_chosen_fc = PCPU_FC_EMBED;
2483#endif
2484#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
2485 else if (!strcmp(str, "page"))
2486 pcpu_chosen_fc = PCPU_FC_PAGE;
2487#endif
2488 else
2489 pr_warn("unknown allocator %s specified\n", str);
2490
2491 return 0;
2492}
2493early_param("percpu_alloc", percpu_alloc_setup);
2494
2495
2496
2497
2498
2499
2500#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
2501 !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
2502#define BUILD_EMBED_FIRST_CHUNK
2503#endif
2504
2505
2506#if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
2507#define BUILD_PAGE_FIRST_CHUNK
2508#endif
2509
2510
2511#if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
2534 size_t reserved_size, size_t dyn_size,
2535 size_t atom_size,
2536 pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
2537{
2538 static int group_map[NR_CPUS] __initdata;
2539 static int group_cnt[NR_CPUS] __initdata;
2540 const size_t static_size = __per_cpu_end - __per_cpu_start;
2541 int nr_groups = 1, nr_units = 0;
2542 size_t size_sum, min_unit_size, alloc_size;
2543 int upa, max_upa, uninitialized_var(best_upa);
2544 int last_allocs, group, unit;
2545 unsigned int cpu, tcpu;
2546 struct pcpu_alloc_info *ai;
2547 unsigned int *cpu_map;
2548
2549
2550 memset(group_map, 0, sizeof(group_map));
2551 memset(group_cnt, 0, sizeof(group_cnt));
2552
2553
2554 size_sum = PFN_ALIGN(static_size + reserved_size +
2555 max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
2556 dyn_size = size_sum - static_size - reserved_size;
2557
2558
2559
2560
2561
2562
2563
2564 min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
2565
2566
2567 alloc_size = roundup(min_unit_size, atom_size);
2568 upa = alloc_size / min_unit_size;
2569 while (alloc_size % upa || (offset_in_page(alloc_size / upa)))
2570 upa--;
2571 max_upa = upa;
2572
2573
2574 for_each_possible_cpu(cpu) {
2575 group = 0;
2576 next_group:
2577 for_each_possible_cpu(tcpu) {
2578 if (cpu == tcpu)
2579 break;
2580 if (group_map[tcpu] == group && cpu_distance_fn &&
2581 (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
2582 cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
2583 group++;
2584 nr_groups = max(nr_groups, group + 1);
2585 goto next_group;
2586 }
2587 }
2588 group_map[cpu] = group;
2589 group_cnt[group]++;
2590 }
2591
2592
2593
2594
2595
2596
2597 last_allocs = INT_MAX;
2598 for (upa = max_upa; upa; upa--) {
2599 int allocs = 0, wasted = 0;
2600
2601 if (alloc_size % upa || (offset_in_page(alloc_size / upa)))
2602 continue;
2603
2604 for (group = 0; group < nr_groups; group++) {
2605 int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
2606 allocs += this_allocs;
2607 wasted += this_allocs * upa - group_cnt[group];
2608 }
2609
2610
2611
2612
2613
2614
2615 if (wasted > num_possible_cpus() / 3)
2616 continue;
2617
2618
2619 if (allocs > last_allocs)
2620 break;
2621 last_allocs = allocs;
2622 best_upa = upa;
2623 }
2624 upa = best_upa;
2625
2626
2627 for (group = 0; group < nr_groups; group++)
2628 nr_units += roundup(group_cnt[group], upa);
2629
2630 ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
2631 if (!ai)
2632 return ERR_PTR(-ENOMEM);
2633 cpu_map = ai->groups[0].cpu_map;
2634
2635 for (group = 0; group < nr_groups; group++) {
2636 ai->groups[group].cpu_map = cpu_map;
2637 cpu_map += roundup(group_cnt[group], upa);
2638 }
2639
2640 ai->static_size = static_size;
2641 ai->reserved_size = reserved_size;
2642 ai->dyn_size = dyn_size;
2643 ai->unit_size = alloc_size / upa;
2644 ai->atom_size = atom_size;
2645 ai->alloc_size = alloc_size;
2646
2647 for (group = 0, unit = 0; group < nr_groups; group++) {
2648 struct pcpu_group_info *gi = &ai->groups[group];
2649
2650
2651
2652
2653
2654
2655 gi->base_offset = unit * ai->unit_size;
2656
2657 for_each_possible_cpu(cpu)
2658 if (group_map[cpu] == group)
2659 gi->cpu_map[gi->nr_units++] = cpu;
2660 gi->nr_units = roundup(gi->nr_units, upa);
2661 unit += gi->nr_units;
2662 }
2663 BUG_ON(unit != nr_units);
2664
2665 return ai;
2666}
2667#endif
2668
2669#if defined(BUILD_EMBED_FIRST_CHUNK)
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
				  size_t atom_size,
				  pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
				  pcpu_fc_alloc_fn_t alloc_fn,
				  pcpu_fc_free_fn_t free_fn)
{
	void *base = (void *)ULONG_MAX;
	void **areas = NULL;
	struct pcpu_alloc_info *ai;
	size_t size_sum, areas_size;
	unsigned long max_distance;
	int group, i, highest_group, rc;

	ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
				   cpu_distance_fn);
	if (IS_ERR(ai))
		return PTR_ERR(ai);

	size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
	areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));

	areas = memblock_alloc(areas_size, SMP_CACHE_BYTES);
	if (!areas) {
		rc = -ENOMEM;
		goto out_free;
	}

	/* allocate, copy and determine base address & max_distance */
	highest_group = 0;
	for (group = 0; group < ai->nr_groups; group++) {
		struct pcpu_group_info *gi = &ai->groups[group];
		unsigned int cpu = NR_CPUS;
		void *ptr;

		for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
			cpu = gi->cpu_map[i];
		BUG_ON(cpu == NR_CPUS);

		/* allocate space for the whole group */
		ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
		if (!ptr) {
			rc = -ENOMEM;
			goto out_free_areas;
		}
		/* kmemleak tracks the percpu allocations separately */
		kmemleak_free(ptr);
		areas[group] = ptr;

		base = min(ptr, base);
		if (ptr > areas[highest_group])
			highest_group = group;
	}
	max_distance = areas[highest_group] - base;
	max_distance += ai->unit_size * ai->groups[highest_group].nr_units;

	/* warn if maximum distance is further than 75% of vmalloc space */
	if (max_distance > VMALLOC_TOTAL * 3 / 4) {
		pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx\n",
			max_distance, VMALLOC_TOTAL);
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
		/* and fail if we have fallback */
		rc = -EINVAL;
		goto out_free_areas;
#endif
	}

	/*
	 * Copy data and free unused parts.  This should happen after all
	 * allocations are complete; otherwise, we may end up with
	 * overlapping groups.
	 */
	for (group = 0; group < ai->nr_groups; group++) {
		struct pcpu_group_info *gi = &ai->groups[group];
		void *ptr = areas[group];

		for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
			if (gi->cpu_map[i] == NR_CPUS) {
				/* unused unit, free whole */
				free_fn(ptr, ai->unit_size);
				continue;
			}
			/* copy and return the unused part */
			memcpy(ptr, __per_cpu_load, ai->static_size);
			free_fn(ptr + size_sum, ai->unit_size - size_sum);
		}
	}

	/* base address is now known, determine group base offsets */
	for (group = 0; group < ai->nr_groups; group++) {
		ai->groups[group].base_offset = areas[group] - base;
	}

	pr_info("Embedded %zu pages/cpu s%zu r%zu d%zu u%zu\n",
		PFN_DOWN(size_sum), ai->static_size, ai->reserved_size,
		ai->dyn_size, ai->unit_size);

	rc = pcpu_setup_first_chunk(ai, base);
	goto out_free;

out_free_areas:
	for (group = 0; group < ai->nr_groups; group++)
		if (areas[group])
			free_fn(areas[group],
				ai->groups[group].nr_units * ai->unit_size);
out_free:
	pcpu_free_alloc_info(ai);
	if (areas)
		memblock_free_early(__pa(areas), areas_size);
	return rc;
}
#endif /* BUILD_EMBED_FIRST_CHUNK */

#ifdef BUILD_PAGE_FIRST_CHUNK
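/**
 * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
 * @reserved_size: the size of reserved percpu area in bytes
 * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
 * @free_fn: function to free percpu page, always called with PAGE_SIZE
 * @populate_pte_fn: function to populate pte
 *
 * This is a helper to ease setting up a page-remapped first percpu
 * chunk and can be called where pcpu_setup_first_chunk() is expected.
 *
 * This is the basic allocator.  The static percpu area is allocated
 * page-by-page into the vmalloc area.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */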
int __init pcpu_page_first_chunk(size_t reserved_size,
				 pcpu_fc_alloc_fn_t alloc_fn,
				 pcpu_fc_free_fn_t free_fn,
				 pcpu_fc_populate_pte_fn_t populate_pte_fn)
{
	static struct vm_struct vm;
	struct pcpu_alloc_info *ai;
	char psize_str[16];
	int unit_pages;
	size_t pages_size;
	struct page **pages;
	int unit, i, j, rc;
	int upa;
	int nr_g0_units;

	snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);

	ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL);
	if (IS_ERR(ai))
		return PTR_ERR(ai);
	BUG_ON(ai->nr_groups != 1);
	upa = ai->alloc_size / ai->unit_size;
	nr_g0_units = roundup(num_possible_cpus(), upa);
	if (WARN_ON(ai->groups[0].nr_units != nr_g0_units)) {
		pcpu_free_alloc_info(ai);
		return -EINVAL;
	}

	unit_pages = ai->unit_size >> PAGE_SHIFT;

	/* unaligned allocations can't be freed, round up to page size */
	pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
			       sizeof(pages[0]));
	pages = memblock_alloc(pages_size, SMP_CACHE_BYTES);
	if (!pages)
		panic("%s: Failed to allocate %zu bytes\n", __func__,
		      pages_size);

	/* allocate pages */
	j = 0;
	for (unit = 0; unit < num_possible_cpus(); unit++) {
		unsigned int cpu = ai->groups[0].cpu_map[unit];
		for (i = 0; i < unit_pages; i++) {
			void *ptr;

			ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
			if (!ptr) {
				pr_warn("failed to allocate %s page for cpu%u\n",
					psize_str, cpu);
				goto enomem;
			}
			/* kmemleak tracks the percpu allocations separately */
			kmemleak_free(ptr);
			pages[j++] = virt_to_page(ptr);
		}
	}

	/* allocate vm area, map the pages and copy static data */
	vm.flags = VM_ALLOC;
	vm.size = num_possible_cpus() * ai->unit_size;
	vm_area_register_early(&vm, PAGE_SIZE);

	for (unit = 0; unit < num_possible_cpus(); unit++) {
		unsigned long unit_addr =
			(unsigned long)vm.addr + unit * ai->unit_size;

		for (i = 0; i < unit_pages; i++)
			populate_pte_fn(unit_addr + (i << PAGE_SHIFT));

		/* pte already populated, the following shouldn't fail */
		rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
				      unit_pages);
		if (rc < 0)
			panic("failed to map percpu area, err=%d\n", rc);

		/*
		 * FIXME: Archs with virtual cache should flush local
		 * cache for the linear mapping here - something
		 * equivalent to flush_cache_vmap() on the local cpu.
		 * flush_cache_vmap() can't be used as most supporting
		 * data structures are not set up yet.
		 */

		/* copy static data */
		memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
	}

	/* we're ready, commit */
	pr_info("%d %s pages/cpu s%zu r%zu d%zu\n",
		unit_pages, psize_str, ai->static_size,
		ai->reserved_size, ai->dyn_size);

	rc = pcpu_setup_first_chunk(ai, vm.addr);
	goto out_free_ar;

enomem:
	while (--j >= 0)
		free_fn(page_address(pages[j]), PAGE_SIZE);
	rc = -ENOMEM;
out_free_ar:
	memblock_free_early(__pa(pages), pages_size);
	pcpu_free_alloc_info(ai);
	return rc;
}
#endif /* BUILD_PAGE_FIRST_CHUNK */

#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
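/*
 * Generic SMP percpu area setup.
 *
 * The embedding helper is used because its behavior closely resembles
 * the original non-dynamic generic percpu area setup.  This is
 * important because many archs have addressing restrictions and might
 * fail if the percpu area is located far away from the previous
 * location.  As an added bonus, in non-NUMA cases, embedding is
 * generally a good idea TLB-wise because the percpu area can piggyback
 * on the physical linear memory mapping, which uses large page
 * mappings on applicable archs.
 */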
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);

static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
				       size_t align)
{
	return memblock_alloc_from(size, align, __pa(MAX_DMA_ADDRESS));
}

static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
{
	memblock_free_early(__pa(ptr), size);
}

void __init setup_per_cpu_areas(void)
{
	unsigned long delta;
	unsigned int cpu;
	int rc;

	/*
	 * Always reserve area for module percpu variables.  That's
	 * what the legacy allocator did.
	 */
	rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
				    PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
				    pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
	if (rc < 0)
		panic("Failed to initialize percpu areas.");

	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
	for_each_possible_cpu(cpu)
		__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
}
#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */

#else /* CONFIG_SMP */

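/*
 * UP percpu area setup.
 *
 * UP always uses a single-unit setup with identity mapping.  Static
 * percpu variables are indistinguishable from the usual static
 * variables and don't require any special preparation.
 */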
void __init setup_per_cpu_areas(void)
{
	const size_t unit_size =
		roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
					 PERCPU_DYNAMIC_RESERVE));
	struct pcpu_alloc_info *ai;
	void *fc;

	ai = pcpu_alloc_alloc_info(1, 1);
	fc = memblock_alloc_from(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
	if (!ai || !fc)
		panic("Failed to allocate memory for percpu areas.");
	/* kmemleak tracks the percpu allocations separately */
	kmemleak_free(fc);

	ai->dyn_size = unit_size;
	ai->unit_size = unit_size;
	ai->atom_size = unit_size;
	ai->alloc_size = unit_size;
	ai->groups[0].nr_units = 1;
	ai->groups[0].cpu_map[0] = 0;

	if (pcpu_setup_first_chunk(ai, fc) < 0)
		panic("Failed to initialize percpu areas.");
	pcpu_free_alloc_info(ai);
}

#endif /* CONFIG_SMP */
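/*
 * pcpu_nr_pages - calculate total number of populated backing pages
 *
 * This reflects the number of pages populated to back chunks.  Metadata is
 * not included.
 */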
unsigned long pcpu_nr_pages(void)
{
	return pcpu_nr_populated * pcpu_nr_units;
}
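/*
 * The percpu allocator is initialized early during boot when neither slab
 * nor workqueue is available.  Plug async management until everything is
 * up and running.
 */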
static int __init percpu_enable_async(void)
{
	pcpu_async_enabled = true;
	return 0;
}
subsys_initcall(percpu_enable_async);