// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/percpu.c - memory allocator for percpu areas
 *
 * Percpu memory is handed out in chunks.  Each chunk consists of one
 * unit of memory per possible CPU; an offset into a chunk addresses
 * the same relative location in every unit, so a single percpu pointer
 * can be translated to a CPU-local address with per_cpu_ptr().  The
 * first chunk holds the kernel's static percpu area and, optionally, a
 * reserved region for module static percpu variables; dynamic
 * allocations are served from the remainder of the first chunk and
 * from additional chunks created on demand.
 *
 * Allocation state is tracked with a bitmap allocator: alloc_map marks
 * allocated units of PCPU_MIN_ALLOC_SIZE bytes, bound_map marks
 * allocation boundaries, and per-block metadata (struct pcpu_block_md)
 * caches first-free, left/right-free and largest-contiguous-area hints
 * so that most allocations do not need to scan the whole bitmap.
 *
 * To use this allocator, arch code should do the following:
 *
 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
 *   regular address to percpu pointer and back if they need to be
 *   different from the default
 *
 * - use pcpu_setup_first_chunk() during percpu area initialization to
 *   setup the first chunk containing the kernel static percpu area
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/bitmap.h>
#include <linux/memblock.h>
#include <linux/err.h>
#include <linux/lcm.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/pfn.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
#include <linux/kmemleak.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/memcontrol.h>

#include <asm/cacheflush.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
#include <asm/io.h>

#define CREATE_TRACE_POINTS
#include <trace/events/percpu.h>

#include "percpu-internal.h"

/*
 * Chunks are sorted into slots by the size of their biggest continuous
 * free area, i.e. by fls() of that size.  PCPU_SLOT_BASE_SHIFT sets the
 * range covered by the smallest slots.
 */
#define PCPU_SLOT_BASE_SHIFT		5
/* chunks in slots below this are subject to being sidelined on failed alloc */
#define PCPU_SLOT_FAIL_THRESHOLD	3

#define PCPU_EMPTY_POP_PAGES_LOW	2
#define PCPU_EMPTY_POP_PAGES_HIGH	4

#ifdef CONFIG_SMP
/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
#ifndef __addr_to_pcpu_ptr
#define __addr_to_pcpu_ptr(addr)					\
	(void __percpu *)((unsigned long)(addr) -			\
			  (unsigned long)pcpu_base_addr +		\
			  (unsigned long)__per_cpu_start)
#endif
#ifndef __pcpu_ptr_to_addr
#define __pcpu_ptr_to_addr(ptr)						\
	(void __force *)((unsigned long)(ptr) +				\
			 (unsigned long)pcpu_base_addr -		\
			 (unsigned long)__per_cpu_start)
#endif
#else	/* CONFIG_SMP */
/* on UP, it's always identity mapped */
#define __addr_to_pcpu_ptr(addr)	(void __percpu *)(addr)
#define __pcpu_ptr_to_addr(ptr)		(void __force *)(ptr)
#endif	/* CONFIG_SMP */

static int pcpu_unit_pages __ro_after_init;
static int pcpu_unit_size __ro_after_init;
static int pcpu_nr_units __ro_after_init;
static int pcpu_atom_size __ro_after_init;
int pcpu_nr_slots __ro_after_init;
static size_t pcpu_chunk_struct_size __ro_after_init;

/* cpus with the lowest and highest unit addresses */
static unsigned int pcpu_low_unit_cpu __ro_after_init;
static unsigned int pcpu_high_unit_cpu __ro_after_init;

/* the address of the first chunk which starts with the kernel static area */
void *pcpu_base_addr __ro_after_init;
EXPORT_SYMBOL_GPL(pcpu_base_addr);

static const int *pcpu_unit_map __ro_after_init;		/* cpu -> unit map */
const unsigned long *pcpu_unit_offsets __ro_after_init;	/* cpu -> unit offset */

/* group information, used for vm allocation */
static int pcpu_nr_groups __ro_after_init;
static const unsigned long *pcpu_group_offsets __ro_after_init;
static const size_t *pcpu_group_sizes __ro_after_init;

/*
 * The first chunk which always exists.  Note that unlike other
 * chunks, this one can be allocated and mapped in several different
 * ways and thus often doesn't live in the vmalloc area.
 */
struct pcpu_chunk *pcpu_first_chunk __ro_after_init;

/*
 * Optional reserved chunk.  This chunk reserves part of the first
 * chunk and serves it for reserved allocations.  When the reserved
 * region doesn't exist, the following variable is NULL.
 */
struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init;

DEFINE_SPINLOCK(pcpu_lock);	/* all internal data structures */
static DEFINE_MUTEX(pcpu_alloc_mutex);	/* chunk create/destroy, [de]pop, map ext */

struct list_head *pcpu_chunk_lists __ro_after_init; /* chunk list slots */

/* chunks which need their map areas extended, protected by pcpu_lock */
static LIST_HEAD(pcpu_map_extend_chunks);

/*
 * The number of empty populated pages, protected by pcpu_lock.
 * The reserved chunk doesn't contribute to the count.
 */
int pcpu_nr_empty_pop_pages;

/*
 * The number of populated pages in use by the allocator, protected by
 * pcpu_lock.  This number is kept per a unit per chunk (i.e. when a page gets
 * allocated/deallocated, it is allocated/deallocated in all units of a chunk
 * and increments/decrements this count by 1).
 */
static unsigned long pcpu_nr_populated;

/*
 * Balance work is used to populate or destroy chunks asynchronously.  We
 * try to keep the number of populated free pages between
 * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
 * empty chunk.
 */
static void pcpu_balance_workfn(struct work_struct *work);
static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
static bool pcpu_async_enabled __read_mostly;
static bool pcpu_atomic_alloc_failed;

static void pcpu_schedule_balance_work(void)
{
	if (pcpu_async_enabled)
		schedule_work(&pcpu_balance_work);
}

/**
 * pcpu_addr_in_chunk - check if the address is served from this chunk
 * @chunk: chunk of interest
 * @addr: percpu address
 *
 * RETURNS:
 * True if the address is served from this chunk.
 */
213static bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr)
214{
215 void *start_addr, *end_addr;
216
217 if (!chunk)
218 return false;
219
220 start_addr = chunk->base_addr + chunk->start_offset;
221 end_addr = chunk->base_addr + chunk->nr_pages * PAGE_SIZE -
222 chunk->end_offset;
223
224 return addr >= start_addr && addr < end_addr;
225}
226
227static int __pcpu_size_to_slot(int size)
228{
229 int highbit = fls(size);
230 return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
231}
232
233static int pcpu_size_to_slot(int size)
234{
235 if (size == pcpu_unit_size)
236 return pcpu_nr_slots - 1;
237 return __pcpu_size_to_slot(size);
238}
239
240static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
241{
242 const struct pcpu_block_md *chunk_md = &chunk->chunk_md;
243
244 if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE ||
245 chunk_md->contig_hint == 0)
246 return 0;
247
248 return pcpu_size_to_slot(chunk_md->contig_hint * PCPU_MIN_ALLOC_SIZE);
249}
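
/*
 * Worked example for the slot mapping above (illustrative only, assuming
 * PCPU_SLOT_BASE_SHIFT == 5): a chunk whose largest contiguous free area
 * is 12 bytes has fls(12) == 4 and lands in slot 1; 32 bytes gives
 * fls(32) == 6 and slot 3; 1024 bytes gives fls(1024) == 11 and slot 8.
 * A completely free chunk (size == pcpu_unit_size) is always placed in
 * the last slot, pcpu_nr_slots - 1.
 */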

/* set the pointer to a chunk in a page struct */
252static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
253{
254 page->index = (unsigned long)pcpu;
255}

/* obtain pointer to a chunk from a page struct */
258static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
259{
260 return (struct pcpu_chunk *)page->index;
261}
262
263static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
264{
265 return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
266}
267
268static unsigned long pcpu_unit_page_offset(unsigned int cpu, int page_idx)
269{
270 return pcpu_unit_offsets[cpu] + (page_idx << PAGE_SHIFT);
271}
272
273static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
274 unsigned int cpu, int page_idx)
275{
276 return (unsigned long)chunk->base_addr +
277 pcpu_unit_page_offset(cpu, page_idx);
278}

/*
 * The following are helper functions to help access bitmaps and convert
 * between bitmap offsets to address offsets.
 */
284static unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index)
285{
286 return chunk->alloc_map +
287 (index * PCPU_BITMAP_BLOCK_BITS / BITS_PER_LONG);
288}
289
290static unsigned long pcpu_off_to_block_index(int off)
291{
292 return off / PCPU_BITMAP_BLOCK_BITS;
293}
294
295static unsigned long pcpu_off_to_block_off(int off)
296{
297 return off & (PCPU_BITMAP_BLOCK_BITS - 1);
298}
299
300static unsigned long pcpu_block_off_to_off(int index, int off)
301{
302 return index * PCPU_BITMAP_BLOCK_BITS + off;
303}
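
/*
 * Example of the offset math above (illustrative, assuming 4K pages and
 * PCPU_MIN_ALLOC_SIZE == 4 so that PCPU_BITMAP_BLOCK_BITS == 1024):
 * chunk offset 2500 maps to block index 2500 / 1024 == 2 and block
 * offset 2500 & 1023 == 452, and pcpu_block_off_to_off(2, 452) maps it
 * back to 2500.
 */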

/**
 * pcpu_next_hint - determine which hint to use
 * @block: block of interest
 * @alloc_bits: size of allocation
 *
 * This determines if we should scan based on the scan_hint or first_free.
 * In general, we want to scan from first_free to fulfill allocations by
 * first fit.  However, if we know a scan_hint at position scan_hint_start
 * cannot fulfill an allocation, we can begin scanning from there knowing
 * the contig_hint will be our fallback.
 */
316static int pcpu_next_hint(struct pcpu_block_md *block, int alloc_bits)
317{
	/*
	 * The three conditions below determine if we can skip past the
	 * scan_hint.  First, does the scan hint exist.  Second, is the
	 * contig_hint after the scan_hint (possibly not true iff
	 * contig_hint == scan_hint).  Third, is the allocation request
	 * larger than the scan_hint.
	 */
325 if (block->scan_hint &&
326 block->contig_hint_start > block->scan_hint_start &&
327 alloc_bits > block->scan_hint)
328 return block->scan_hint_start + block->scan_hint;
329
330 return block->first_free;
331}

/*
 * pcpu_next_md_free_region - finds the next hint free area
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * Helper for the pcpu_for_each_md_free_region() iterator.  Updates
 * @bit_off and @bits to describe the next free region at or after
 * @bit_off, based on the per-block metadata hints.
 */
344static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off,
345 int *bits)
346{
347 int i = pcpu_off_to_block_index(*bit_off);
348 int block_off = pcpu_off_to_block_off(*bit_off);
349 struct pcpu_block_md *block;
350
351 *bits = 0;
352 for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
353 block++, i++) {
354
355 if (*bits) {
356 *bits += block->left_free;
357 if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
358 continue;
359 return;
360 }
361
362
363
364
365
366
367
368
369
370 *bits = block->contig_hint;
371 if (*bits && block->contig_hint_start >= block_off &&
372 *bits + block->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) {
373 *bit_off = pcpu_block_off_to_off(i,
374 block->contig_hint_start);
375 return;
376 }
377
378 block_off = 0;
379
380 *bits = block->right_free;
381 *bit_off = (i + 1) * PCPU_BITMAP_BLOCK_BITS - block->right_free;
382 }
383}

/*
 * pcpu_next_fit_region - finds the next region that can hold the allocation
 * @chunk: chunk of interest
 * @alloc_bits: size of the allocation in bits
 * @align: alignment of the area (in bits)
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * Helper for the pcpu_for_each_fit_region() iterator.  Finds the next
 * free region, based on the block hints, that could possibly fit an
 * allocation of @alloc_bits with alignment @align.  If no region is
 * found, @bit_off is set to pcpu_chunk_map_bits(chunk).
 */
399static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
400 int align, int *bit_off, int *bits)
401{
402 int i = pcpu_off_to_block_index(*bit_off);
403 int block_off = pcpu_off_to_block_off(*bit_off);
404 struct pcpu_block_md *block;
405
406 *bits = 0;
407 for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
408 block++, i++) {
409
410 if (*bits) {
411 *bits += block->left_free;
412 if (*bits >= alloc_bits)
413 return;
414 if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
415 continue;
416 }
417
418
419 *bits = ALIGN(block->contig_hint_start, align) -
420 block->contig_hint_start;
421
422
423
424
425 if (block->contig_hint &&
426 block->contig_hint_start >= block_off &&
427 block->contig_hint >= *bits + alloc_bits) {
428 int start = pcpu_next_hint(block, alloc_bits);
429
430 *bits += alloc_bits + block->contig_hint_start -
431 start;
432 *bit_off = pcpu_block_off_to_off(i, start);
433 return;
434 }
435
436 block_off = 0;
437
438 *bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free,
439 align);
440 *bits = PCPU_BITMAP_BLOCK_BITS - *bit_off;
441 *bit_off = pcpu_block_off_to_off(i, *bit_off);
442 if (*bits >= alloc_bits)
443 return;
444 }
445
446
447 *bit_off = pcpu_chunk_map_bits(chunk);
448}

/*
 * Metadata free area iterators.  These perform aggressive scanning of
 * the block metadata to find candidate free areas; the actual free area
 * still needs to be verified against the allocation map.
 */
456#define pcpu_for_each_md_free_region(chunk, bit_off, bits) \
457 for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits)); \
458 (bit_off) < pcpu_chunk_map_bits((chunk)); \
459 (bit_off) += (bits) + 1, \
460 pcpu_next_md_free_region((chunk), &(bit_off), &(bits)))
461
462#define pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) \
463 for (pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
464 &(bits)); \
465 (bit_off) < pcpu_chunk_map_bits((chunk)); \
466 (bit_off) += (bits), \
467 pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
468 &(bits)))
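
/*
 * Sketch of how the fit iterator above is typically driven (see
 * pcpu_find_block_fit() below for the real caller):
 *
 *	int bit_off = pcpu_next_hint(chunk_md, alloc_bits), bits = 0;
 *
 *	pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) {
 *		if (region at bit_off is acceptable)
 *			break;
 *	}
 *	if (bit_off == pcpu_chunk_map_bits(chunk))
 *		no candidate region was found;
 *
 * The regions produced here come from the block hints only; the caller
 * still verifies them against the allocation map.
 */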

/**
 * pcpu_mem_zalloc - allocate memory
 * @size: bytes to allocate
 * @gfp: allocation flags
 *
 * Allocate @size bytes.  If @size is smaller than PAGE_SIZE,
 * kzalloc() is used; otherwise, the equivalent of vzalloc() is used.
 * This is to facilitate passing through whitelisted flags.  The
 * returned memory is always zeroed.
 *
 * RETURNS:
 * Pointer to the allocated area on success, NULL on failure.
 */
483static void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
484{
485 if (WARN_ON_ONCE(!slab_is_available()))
486 return NULL;
487
488 if (size <= PAGE_SIZE)
489 return kzalloc(size, gfp);
490 else
491 return __vmalloc(size, gfp | __GFP_ZERO);
492}

/**
 * pcpu_mem_free - free memory
 * @ptr: memory to free
 *
 * Free @ptr.  @ptr should have been allocated using pcpu_mem_zalloc().
 */
500static void pcpu_mem_free(void *ptr)
501{
502 kvfree(ptr);
503}
504
505static void __pcpu_chunk_move(struct pcpu_chunk *chunk, int slot,
506 bool move_front)
507{
508 if (chunk != pcpu_reserved_chunk) {
509 struct list_head *pcpu_slot;
510
511 pcpu_slot = pcpu_chunk_list(pcpu_chunk_type(chunk));
512 if (move_front)
513 list_move(&chunk->list, &pcpu_slot[slot]);
514 else
515 list_move_tail(&chunk->list, &pcpu_slot[slot]);
516 }
517}
518
519static void pcpu_chunk_move(struct pcpu_chunk *chunk, int slot)
520{
521 __pcpu_chunk_move(chunk, slot, true);
522}

/**
 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
 * @chunk: chunk of interest
 * @oslot: the previous slot it was on
 *
 * This function is called after an allocation or free changed @chunk.
 * New slot according to the changed state is determined and @chunk is
 * moved to the slot.  Note that the reserved chunk is never put on
 * chunk slots.
 *
 * CONTEXT:
 * pcpu_lock.
 */
537static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
538{
539 int nslot = pcpu_chunk_slot(chunk);
540
541 if (oslot != nslot)
542 __pcpu_chunk_move(chunk, nslot, oslot < nslot);
543}

/*
 * pcpu_update_empty_pages - update empty page counters
 * @chunk: chunk of interest
 * @nr: nr of empty pages
 *
 * This is used to keep track of the empty pages now based on the premise
 * a md_block covers a page.  The hint update functions recognize if a block
 * is made full or broken to update the counters correctly.
 */
554static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr)
555{
556 chunk->nr_empty_pop_pages += nr;
557 if (chunk != pcpu_reserved_chunk)
558 pcpu_nr_empty_pop_pages += nr;
559}

/*
 * pcpu_region_overlap - determines if two regions overlap
 * @a: start of first region, inclusive
 * @b: end of first region, exclusive
 * @x: start of second region, inclusive
 * @y: end of second region, exclusive
 *
 * This is used to determine if the hint region [a, b) overlaps the
 * allocated region [x, y).
 */
static inline bool pcpu_region_overlap(int a, int b, int x, int y)
{
	return (a < y) && (x < b);
}
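
/*
 * Both regions above are half-open.  For example,
 * pcpu_region_overlap(10, 20, 15, 30) is true, while
 * pcpu_region_overlap(10, 20, 20, 30) is false because the regions only
 * touch at 20.
 */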

/**
 * pcpu_block_update - updates a block given a free area
 * @block: block of interest
 * @start: start offset in block
 * @end: end offset in block
 *
 * Updates a block given a known free area.  The region [start, end) is
 * expected to be the entirety of the free area within a block.  Chooses
 * the best starting offset if the contig hints are equal.
 */
586static void pcpu_block_update(struct pcpu_block_md *block, int start, int end)
587{
588 int contig = end - start;
589
590 block->first_free = min(block->first_free, start);
591 if (start == 0)
592 block->left_free = contig;
593
594 if (end == block->nr_bits)
595 block->right_free = contig;
596
597 if (contig > block->contig_hint) {
598
599 if (start > block->contig_hint_start) {
600 if (block->contig_hint > block->scan_hint) {
601 block->scan_hint_start =
602 block->contig_hint_start;
603 block->scan_hint = block->contig_hint;
604 } else if (start < block->scan_hint_start) {
605
606
607
608
609
610 block->scan_hint = 0;
611 }
612 } else {
613 block->scan_hint = 0;
614 }
615 block->contig_hint_start = start;
616 block->contig_hint = contig;
617 } else if (contig == block->contig_hint) {
618 if (block->contig_hint_start &&
619 (!start ||
620 __ffs(start) > __ffs(block->contig_hint_start))) {
621
622 block->contig_hint_start = start;
623 if (start < block->scan_hint_start &&
624 block->contig_hint > block->scan_hint)
625 block->scan_hint = 0;
626 } else if (start > block->scan_hint_start ||
627 block->contig_hint > block->scan_hint) {
628
629
630
631
632
633 block->scan_hint_start = start;
634 block->scan_hint = contig;
635 }
636 } else {
637
638
639
640
641
642 if ((start < block->contig_hint_start &&
643 (contig > block->scan_hint ||
644 (contig == block->scan_hint &&
645 start > block->scan_hint_start)))) {
646 block->scan_hint_start = start;
647 block->scan_hint = contig;
648 }
649 }
650}

/*
 * pcpu_block_update_scan - update a block given a free area from a scan
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * Finding the final allocation spot first goes through the block scan
 * hint and then a linear scan of the alloc map.  A scan can walk over
 * free areas that end up not being used; this updates the block
 * metadata for the free area the scan covered, but only if that area is
 * fully contained within one block.
 */
668static void pcpu_block_update_scan(struct pcpu_chunk *chunk, int bit_off,
669 int bits)
670{
671 int s_off = pcpu_off_to_block_off(bit_off);
672 int e_off = s_off + bits;
673 int s_index, l_bit;
674 struct pcpu_block_md *block;
675
676 if (e_off > PCPU_BITMAP_BLOCK_BITS)
677 return;
678
679 s_index = pcpu_off_to_block_index(bit_off);
680 block = chunk->md_blocks + s_index;
681
682
683 l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index), s_off);
684 s_off = (s_off == l_bit) ? 0 : l_bit + 1;
685
686 pcpu_block_update(block, s_off, e_off);
687}

/*
 * pcpu_chunk_refresh_hint - updates metadata about a chunk
 * @chunk: chunk of interest
 * @full_scan: if we should scan from the beginning
 *
 * Iterates over the metadata blocks to rebuild the chunk-level hints.
 * If @full_scan is false, the scan hint is used as the starting point
 * and the existing contig hint is trusted up to that point.
 */
701static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan)
702{
703 struct pcpu_block_md *chunk_md = &chunk->chunk_md;
704 int bit_off, bits;
705
706
707 if (!full_scan && chunk_md->scan_hint) {
708 bit_off = chunk_md->scan_hint_start + chunk_md->scan_hint;
709 chunk_md->contig_hint_start = chunk_md->scan_hint_start;
710 chunk_md->contig_hint = chunk_md->scan_hint;
711 chunk_md->scan_hint = 0;
712 } else {
713 bit_off = chunk_md->first_free;
714 chunk_md->contig_hint = 0;
715 }
716
717 bits = 0;
718 pcpu_for_each_md_free_region(chunk, bit_off, bits)
719 pcpu_block_update(chunk_md, bit_off, bit_off + bits);
720}

/*
 * pcpu_block_refresh_hint
 * @chunk: chunk of interest
 * @index: index of the metadata block
 *
 * Scans over the block beginning at first_free (or the scan hint) and
 * updates the block metadata from the free areas found.
 */
730static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
731{
732 struct pcpu_block_md *block = chunk->md_blocks + index;
733 unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index);
734 unsigned int rs, re, start;
735
736
737 if (block->scan_hint) {
738 start = block->scan_hint_start + block->scan_hint;
739 block->contig_hint_start = block->scan_hint_start;
740 block->contig_hint = block->scan_hint;
741 block->scan_hint = 0;
742 } else {
743 start = block->first_free;
744 block->contig_hint = 0;
745 }
746
747 block->right_free = 0;
748
749
750 bitmap_for_each_clear_region(alloc_map, rs, re, start,
751 PCPU_BITMAP_BLOCK_BITS)
752 pcpu_block_update(block, rs, re);
753}

/*
 * pcpu_block_update_hint_alloc - update hint on allocation path
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of request
 *
 * Updates metadata for the allocation path.  The metadata only has to be
 * refreshed by a full scan iff the chunk's contig hint is broken.  Block
 * level scans are required if the block's contig hint is broken.
 */
765static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
766 int bits)
767{
768 struct pcpu_block_md *chunk_md = &chunk->chunk_md;
769 int nr_empty_pages = 0;
770 struct pcpu_block_md *s_block, *e_block, *block;
771 int s_index, e_index;
772 int s_off, e_off;
773
774
775
776
777
778
779
780 s_index = pcpu_off_to_block_index(bit_off);
781 e_index = pcpu_off_to_block_index(bit_off + bits - 1);
782 s_off = pcpu_off_to_block_off(bit_off);
783 e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
784
785 s_block = chunk->md_blocks + s_index;
786 e_block = chunk->md_blocks + e_index;
787
788
789
790
791
792
793
794 if (s_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
795 nr_empty_pages++;
796
797 if (s_off == s_block->first_free)
798 s_block->first_free = find_next_zero_bit(
799 pcpu_index_alloc_map(chunk, s_index),
800 PCPU_BITMAP_BLOCK_BITS,
801 s_off + bits);
802
803 if (pcpu_region_overlap(s_block->scan_hint_start,
804 s_block->scan_hint_start + s_block->scan_hint,
805 s_off,
806 s_off + bits))
807 s_block->scan_hint = 0;
808
809 if (pcpu_region_overlap(s_block->contig_hint_start,
810 s_block->contig_hint_start +
811 s_block->contig_hint,
812 s_off,
813 s_off + bits)) {
814
815 if (!s_off)
816 s_block->left_free = 0;
817 pcpu_block_refresh_hint(chunk, s_index);
818 } else {
819
820 s_block->left_free = min(s_block->left_free, s_off);
821 if (s_index == e_index)
822 s_block->right_free = min_t(int, s_block->right_free,
823 PCPU_BITMAP_BLOCK_BITS - e_off);
824 else
825 s_block->right_free = 0;
826 }
827
828
829
830
831 if (s_index != e_index) {
832 if (e_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
833 nr_empty_pages++;
834
835
836
837
838
839 e_block->first_free = find_next_zero_bit(
840 pcpu_index_alloc_map(chunk, e_index),
841 PCPU_BITMAP_BLOCK_BITS, e_off);
842
843 if (e_off == PCPU_BITMAP_BLOCK_BITS) {
844
845 e_block++;
846 } else {
847 if (e_off > e_block->scan_hint_start)
848 e_block->scan_hint = 0;
849
850 e_block->left_free = 0;
851 if (e_off > e_block->contig_hint_start) {
852
853 pcpu_block_refresh_hint(chunk, e_index);
854 } else {
855 e_block->right_free =
856 min_t(int, e_block->right_free,
857 PCPU_BITMAP_BLOCK_BITS - e_off);
858 }
859 }
860
861
862 nr_empty_pages += (e_index - s_index - 1);
863 for (block = s_block + 1; block < e_block; block++) {
864 block->scan_hint = 0;
865 block->contig_hint = 0;
866 block->left_free = 0;
867 block->right_free = 0;
868 }
869 }
870
871 if (nr_empty_pages)
872 pcpu_update_empty_pages(chunk, -nr_empty_pages);
873
874 if (pcpu_region_overlap(chunk_md->scan_hint_start,
875 chunk_md->scan_hint_start +
876 chunk_md->scan_hint,
877 bit_off,
878 bit_off + bits))
879 chunk_md->scan_hint = 0;
880
881
882
883
884
885
886 if (pcpu_region_overlap(chunk_md->contig_hint_start,
887 chunk_md->contig_hint_start +
888 chunk_md->contig_hint,
889 bit_off,
890 bit_off + bits))
891 pcpu_chunk_refresh_hint(chunk, false);
892}

/*
 * pcpu_block_update_hint_free - updates the block hints on the free path
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of request
 *
 * Updates metadata for the free path.  The just-freed region is merged
 * with adjacent free regions by searching backwards from @bit_off and
 * forwards from @bit_off + @bits, capped at the boundary of blocks, so
 * the hints can describe the full free area.  A chunk-level refresh is
 * triggered if the resulting free region spans blocks or is at least a
 * block in size, since such a free may create a new empty page.
 */
912static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
913 int bits)
914{
915 int nr_empty_pages = 0;
916 struct pcpu_block_md *s_block, *e_block, *block;
917 int s_index, e_index;
918 int s_off, e_off;
919 int start, end;
920
921
922
923
924
925
926
927 s_index = pcpu_off_to_block_index(bit_off);
928 e_index = pcpu_off_to_block_index(bit_off + bits - 1);
929 s_off = pcpu_off_to_block_off(bit_off);
930 e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
931
932 s_block = chunk->md_blocks + s_index;
933 e_block = chunk->md_blocks + e_index;
934
935
936
937
938
939
940
941
942
943
944
945 start = s_off;
946 if (s_off == s_block->contig_hint + s_block->contig_hint_start) {
947 start = s_block->contig_hint_start;
948 } else {
949
950
951
952
953
954
955 int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index),
956 start);
957 start = (start == l_bit) ? 0 : l_bit + 1;
958 }
959
960 end = e_off;
961 if (e_off == e_block->contig_hint_start)
962 end = e_block->contig_hint_start + e_block->contig_hint;
963 else
964 end = find_next_bit(pcpu_index_alloc_map(chunk, e_index),
965 PCPU_BITMAP_BLOCK_BITS, end);
966
967
968 e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS;
969 if (!start && e_off == PCPU_BITMAP_BLOCK_BITS)
970 nr_empty_pages++;
971 pcpu_block_update(s_block, start, e_off);
972
973
974 if (s_index != e_index) {
975
976 if (end == PCPU_BITMAP_BLOCK_BITS)
977 nr_empty_pages++;
978 pcpu_block_update(e_block, 0, end);
979
980
981 nr_empty_pages += (e_index - s_index - 1);
982 for (block = s_block + 1; block < e_block; block++) {
983 block->first_free = 0;
984 block->scan_hint = 0;
985 block->contig_hint_start = 0;
986 block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
987 block->left_free = PCPU_BITMAP_BLOCK_BITS;
988 block->right_free = PCPU_BITMAP_BLOCK_BITS;
989 }
990 }
991
992 if (nr_empty_pages)
993 pcpu_update_empty_pages(chunk, nr_empty_pages);
994
995
996
997
998
999
1000
1001 if (((end - start) >= PCPU_BITMAP_BLOCK_BITS) || s_index != e_index)
1002 pcpu_chunk_refresh_hint(chunk, true);
1003 else
1004 pcpu_block_update(&chunk->chunk_md,
1005 pcpu_block_off_to_off(s_index, start),
1006 end);
1007}

/*
 * pcpu_is_populated - determines if the region is populated
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of area
 * @next_off: return value for the next offset to start searching
 *
 * For atomic allocations, check if the backing pages are populated.
 *
 * RETURNS:
 * Bool if the backing space is populated.
 */
1022static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits,
1023 int *next_off)
1024{
1025 unsigned int page_start, page_end, rs, re;
1026
1027 page_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
1028 page_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
1029
1030 rs = page_start;
1031 bitmap_next_clear_region(chunk->populated, &rs, &re, page_end);
1032 if (rs >= page_end)
1033 return true;
1034
1035 *next_off = re * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
1036 return false;
1037}

/*
 * pcpu_find_block_fit - finds an offset to start searching from
 * @chunk: chunk of interest
 * @alloc_bits: size of request in allocation units
 * @align: alignment of area (max PAGE_SIZE bytes)
 * @pop_only: use populated regions only
 *
 * Given a chunk and an allocation spec, find the offset to begin
 * searching the allocation map from.  The chunk-level contig hint is
 * used as a quick reject; the block hints are then iterated to find a
 * candidate region, optionally restricted to populated pages.
 *
 * RETURNS:
 * The offset in the bitmap to begin searching, or -1 if no offset is
 * found.
 */
1058static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
1059 size_t align, bool pop_only)
1060{
1061 struct pcpu_block_md *chunk_md = &chunk->chunk_md;
1062 int bit_off, bits, next_off;
1063
1064
1065
1066
1067
1068
1069
1070 bit_off = ALIGN(chunk_md->contig_hint_start, align) -
1071 chunk_md->contig_hint_start;
1072 if (bit_off + alloc_bits > chunk_md->contig_hint)
1073 return -1;
1074
1075 bit_off = pcpu_next_hint(chunk_md, alloc_bits);
1076 bits = 0;
1077 pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) {
1078 if (!pop_only || pcpu_is_populated(chunk, bit_off, bits,
1079 &next_off))
1080 break;
1081
1082 bit_off = next_off;
1083 bits = 0;
1084 }
1085
1086 if (bit_off == pcpu_chunk_map_bits(chunk))
1087 return -1;
1088
1089 return bit_off;
1090}

/*
 * pcpu_find_zero_area - modified version of bitmap_find_next_zero_area()
 * @map: the address to base the search on
 * @size: the bitmap size in bits
 * @start: the bitnumber to start searching at
 * @nr: the number of zeroed bits we're looking for
 * @align_mask: alignment mask for zero area
 * @largest_off: offset of the largest area skipped
 * @largest_bits: the size of the largest area skipped
 *
 * The @align_mask should be one less than a power of 2.
 *
 * This is a modified version of bitmap_find_next_zero_area() to remember
 * the largest area that was skipped.  This is imperfect, but in general
 * is good enough.  The largest remembered region is the largest failed
 * region seen.  This does not include anything we possibly skipped due
 * to alignment.  pcpu_block_update_scan() does scan backwards to try and
 * recover what was lost to alignment.  While this can cause scanning to
 * miss earlier possible free areas, smaller allocations will eventually
 * fill those holes.
 */
1112static unsigned long pcpu_find_zero_area(unsigned long *map,
1113 unsigned long size,
1114 unsigned long start,
1115 unsigned long nr,
1116 unsigned long align_mask,
1117 unsigned long *largest_off,
1118 unsigned long *largest_bits)
1119{
1120 unsigned long index, end, i, area_off, area_bits;
1121again:
1122 index = find_next_zero_bit(map, size, start);
1123
1124
1125 index = __ALIGN_MASK(index, align_mask);
1126 area_off = index;
1127
1128 end = index + nr;
1129 if (end > size)
1130 return end;
1131 i = find_next_bit(map, end, index);
1132 if (i < end) {
1133 area_bits = i - area_off;
1134
1135 if (area_bits > *largest_bits ||
1136 (area_bits == *largest_bits && *largest_off &&
1137 (!area_off || __ffs(area_off) > __ffs(*largest_off)))) {
1138 *largest_off = area_off;
1139 *largest_bits = area_bits;
1140 }
1141
1142 start = i + 1;
1143 goto again;
1144 }
1145 return index;
1146}
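
/*
 * Example of the "largest skipped area" tracking above (illustrative):
 * searching for nr == 4 free bits with align_mask == 0 in a map whose
 * free runs are 2, 3 and 5 bits long returns the start of the 5-bit run
 * and reports the 3-bit run through @largest_off/@largest_bits, which
 * lets the caller feed pcpu_block_update_scan() even though that run was
 * too small to use.
 */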

/*
 * pcpu_alloc_area - allocates an area from a pcpu_chunk
 * @chunk: chunk of interest
 * @alloc_bits: size of request in allocation units
 * @align: alignment of area (max PAGE_SIZE)
 * @start: bit_off to start searching
 *
 * This function takes in a @start offset to begin searching to fit an
 * allocation of @alloc_bits with alignment @align.  It needs to scan
 * the allocation map because if it fits within the block's contig hint,
 * @start will be block->first_free.  This is an attempt to fill the
 * allocation prior to breaking the contig hint.  The allocation and
 * boundary maps are updated accordingly if it confirms a valid
 * free area.
 *
 * RETURNS:
 * Allocated addr offset in @chunk on success.
 * -1 if no matching area is found.
 */
1167static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
1168 size_t align, int start)
1169{
1170 struct pcpu_block_md *chunk_md = &chunk->chunk_md;
1171 size_t align_mask = (align) ? (align - 1) : 0;
1172 unsigned long area_off = 0, area_bits = 0;
1173 int bit_off, end, oslot;
1174
1175 lockdep_assert_held(&pcpu_lock);
1176
1177 oslot = pcpu_chunk_slot(chunk);
1178
1179
1180
1181
1182 end = min_t(int, start + alloc_bits + PCPU_BITMAP_BLOCK_BITS,
1183 pcpu_chunk_map_bits(chunk));
1184 bit_off = pcpu_find_zero_area(chunk->alloc_map, end, start, alloc_bits,
1185 align_mask, &area_off, &area_bits);
1186 if (bit_off >= end)
1187 return -1;
1188
1189 if (area_bits)
1190 pcpu_block_update_scan(chunk, area_off, area_bits);
1191
1192
1193 bitmap_set(chunk->alloc_map, bit_off, alloc_bits);
1194
1195
1196 set_bit(bit_off, chunk->bound_map);
1197 bitmap_clear(chunk->bound_map, bit_off + 1, alloc_bits - 1);
1198 set_bit(bit_off + alloc_bits, chunk->bound_map);
1199
1200 chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE;
1201
1202
1203 if (bit_off == chunk_md->first_free)
1204 chunk_md->first_free = find_next_zero_bit(
1205 chunk->alloc_map,
1206 pcpu_chunk_map_bits(chunk),
1207 bit_off + alloc_bits);
1208
1209 pcpu_block_update_hint_alloc(chunk, bit_off, alloc_bits);
1210
1211 pcpu_chunk_relocate(chunk, oslot);
1212
1213 return bit_off * PCPU_MIN_ALLOC_SIZE;
1214}
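
/*
 * Note on bound_map (illustrative example): for an allocation of 8 bits
 * placed at bit 100, pcpu_alloc_area() sets bits 100 and 108 in
 * bound_map and clears the bits in between.  pcpu_free_area() below only
 * receives the byte offset, so it recovers the size by looking for the
 * next set bound bit after bit 100, i.e. 108, giving 8 bits to clear in
 * alloc_map.
 */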

/*
 * pcpu_free_area - frees the corresponding offset
 * @chunk: chunk of interest
 * @off: addr offset into chunk
 *
 * This function determines the size of an allocation to free using
 * the boundary bitmap and clears the allocation map.
 *
 * RETURNS:
 * Number of freed bytes.
 */
1227static int pcpu_free_area(struct pcpu_chunk *chunk, int off)
1228{
1229 struct pcpu_block_md *chunk_md = &chunk->chunk_md;
1230 int bit_off, bits, end, oslot, freed;
1231
1232 lockdep_assert_held(&pcpu_lock);
1233 pcpu_stats_area_dealloc(chunk);
1234
1235 oslot = pcpu_chunk_slot(chunk);
1236
1237 bit_off = off / PCPU_MIN_ALLOC_SIZE;
1238
1239
1240 end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk),
1241 bit_off + 1);
1242 bits = end - bit_off;
1243 bitmap_clear(chunk->alloc_map, bit_off, bits);
1244
1245 freed = bits * PCPU_MIN_ALLOC_SIZE;
1246
1247
1248 chunk->free_bytes += freed;
1249
1250
1251 chunk_md->first_free = min(chunk_md->first_free, bit_off);
1252
1253 pcpu_block_update_hint_free(chunk, bit_off, bits);
1254
1255 pcpu_chunk_relocate(chunk, oslot);
1256
1257 return freed;
1258}
1259
1260static void pcpu_init_md_block(struct pcpu_block_md *block, int nr_bits)
1261{
1262 block->scan_hint = 0;
1263 block->contig_hint = nr_bits;
1264 block->left_free = nr_bits;
1265 block->right_free = nr_bits;
1266 block->first_free = 0;
1267 block->nr_bits = nr_bits;
1268}
1269
1270static void pcpu_init_md_blocks(struct pcpu_chunk *chunk)
1271{
1272 struct pcpu_block_md *md_block;
1273
1274
1275 pcpu_init_md_block(&chunk->chunk_md, pcpu_chunk_map_bits(chunk));
1276
1277 for (md_block = chunk->md_blocks;
1278 md_block != chunk->md_blocks + pcpu_chunk_nr_blocks(chunk);
1279 md_block++)
1280 pcpu_init_md_block(md_block, PCPU_BITMAP_BLOCK_BITS);
1281}

/**
 * pcpu_alloc_first_chunk - creates chunks that serve the first chunk
 * @tmp_addr: the start of the region served
 * @map_size: size of the region served
 *
 * This is responsible for creating the chunks that serve the first chunk.
 * The base_addr is page aligned down of @tmp_addr while the region end is
 * page aligned up.  Offsets are kept track of to determine the region
 * served.  All this is done to appease the bitmap allocator in avoiding
 * partial blocks.
 *
 * RETURNS:
 * Chunk serving the region at @tmp_addr of @map_size.
 */
1296static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
1297 int map_size)
1298{
1299 struct pcpu_chunk *chunk;
1300 unsigned long aligned_addr, lcm_align;
1301 int start_offset, offset_bits, region_size, region_bits;
1302 size_t alloc_size;
1303
1304
1305 aligned_addr = tmp_addr & PAGE_MASK;
1306
1307 start_offset = tmp_addr - aligned_addr;
1308
1309
1310
1311
1312
1313
1314 lcm_align = lcm(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE);
1315 region_size = ALIGN(start_offset + map_size, lcm_align);
1316
1317
1318 alloc_size = struct_size(chunk, populated,
1319 BITS_TO_LONGS(region_size >> PAGE_SHIFT));
1320 chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
1321 if (!chunk)
1322 panic("%s: Failed to allocate %zu bytes\n", __func__,
1323 alloc_size);
1324
1325 INIT_LIST_HEAD(&chunk->list);
1326
1327 chunk->base_addr = (void *)aligned_addr;
1328 chunk->start_offset = start_offset;
1329 chunk->end_offset = region_size - chunk->start_offset - map_size;
1330
1331 chunk->nr_pages = region_size >> PAGE_SHIFT;
1332 region_bits = pcpu_chunk_map_bits(chunk);
1333
1334 alloc_size = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]);
1335 chunk->alloc_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
1336 if (!chunk->alloc_map)
1337 panic("%s: Failed to allocate %zu bytes\n", __func__,
1338 alloc_size);
1339
1340 alloc_size =
1341 BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]);
1342 chunk->bound_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
1343 if (!chunk->bound_map)
1344 panic("%s: Failed to allocate %zu bytes\n", __func__,
1345 alloc_size);
1346
1347 alloc_size = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]);
1348 chunk->md_blocks = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
1349 if (!chunk->md_blocks)
1350 panic("%s: Failed to allocate %zu bytes\n", __func__,
1351 alloc_size);
1352
1353#ifdef CONFIG_MEMCG_KMEM
1354
1355 chunk->obj_cgroups = NULL;
1356#endif
1357 pcpu_init_md_blocks(chunk);
1358
1359
1360 chunk->immutable = true;
1361 bitmap_fill(chunk->populated, chunk->nr_pages);
1362 chunk->nr_populated = chunk->nr_pages;
1363 chunk->nr_empty_pop_pages = chunk->nr_pages;
1364
1365 chunk->free_bytes = map_size;
1366
1367 if (chunk->start_offset) {
1368
1369 offset_bits = chunk->start_offset / PCPU_MIN_ALLOC_SIZE;
1370 bitmap_set(chunk->alloc_map, 0, offset_bits);
1371 set_bit(0, chunk->bound_map);
1372 set_bit(offset_bits, chunk->bound_map);
1373
1374 chunk->chunk_md.first_free = offset_bits;
1375
1376 pcpu_block_update_hint_alloc(chunk, 0, offset_bits);
1377 }
1378
1379 if (chunk->end_offset) {
1380
1381 offset_bits = chunk->end_offset / PCPU_MIN_ALLOC_SIZE;
1382 bitmap_set(chunk->alloc_map,
1383 pcpu_chunk_map_bits(chunk) - offset_bits,
1384 offset_bits);
1385 set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE,
1386 chunk->bound_map);
1387 set_bit(region_bits, chunk->bound_map);
1388
1389 pcpu_block_update_hint_alloc(chunk, pcpu_chunk_map_bits(chunk)
1390 - offset_bits, offset_bits);
1391 }
1392
1393 return chunk;
1394}
1395
1396static struct pcpu_chunk *pcpu_alloc_chunk(enum pcpu_chunk_type type, gfp_t gfp)
1397{
1398 struct pcpu_chunk *chunk;
1399 int region_bits;
1400
1401 chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp);
1402 if (!chunk)
1403 return NULL;
1404
1405 INIT_LIST_HEAD(&chunk->list);
1406 chunk->nr_pages = pcpu_unit_pages;
1407 region_bits = pcpu_chunk_map_bits(chunk);
1408
1409 chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) *
1410 sizeof(chunk->alloc_map[0]), gfp);
1411 if (!chunk->alloc_map)
1412 goto alloc_map_fail;
1413
1414 chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) *
1415 sizeof(chunk->bound_map[0]), gfp);
1416 if (!chunk->bound_map)
1417 goto bound_map_fail;
1418
1419 chunk->md_blocks = pcpu_mem_zalloc(pcpu_chunk_nr_blocks(chunk) *
1420 sizeof(chunk->md_blocks[0]), gfp);
1421 if (!chunk->md_blocks)
1422 goto md_blocks_fail;
1423
1424#ifdef CONFIG_MEMCG_KMEM
1425 if (pcpu_is_memcg_chunk(type)) {
1426 chunk->obj_cgroups =
1427 pcpu_mem_zalloc(pcpu_chunk_map_bits(chunk) *
1428 sizeof(struct obj_cgroup *), gfp);
1429 if (!chunk->obj_cgroups)
1430 goto objcg_fail;
1431 }
1432#endif
1433
1434 pcpu_init_md_blocks(chunk);
1435
1436
1437 chunk->free_bytes = chunk->nr_pages * PAGE_SIZE;
1438
1439 return chunk;
1440
1441#ifdef CONFIG_MEMCG_KMEM
1442objcg_fail:
1443 pcpu_mem_free(chunk->md_blocks);
1444#endif
1445md_blocks_fail:
1446 pcpu_mem_free(chunk->bound_map);
1447bound_map_fail:
1448 pcpu_mem_free(chunk->alloc_map);
1449alloc_map_fail:
1450 pcpu_mem_free(chunk);
1451
1452 return NULL;
1453}
1454
1455static void pcpu_free_chunk(struct pcpu_chunk *chunk)
1456{
1457 if (!chunk)
1458 return;
1459#ifdef CONFIG_MEMCG_KMEM
1460 pcpu_mem_free(chunk->obj_cgroups);
1461#endif
1462 pcpu_mem_free(chunk->md_blocks);
1463 pcpu_mem_free(chunk->bound_map);
1464 pcpu_mem_free(chunk->alloc_map);
1465 pcpu_mem_free(chunk);
1466}

/**
 * pcpu_chunk_populated - post-population bookkeeping
 * @chunk: pcpu_chunk which got populated
 * @page_start: the start page
 * @page_end: the end page
 *
 * Pages in [@page_start,@page_end) have been populated to @chunk.  Update
 * the bookkeeping information accordingly.  Must be called after each
 * successful population.
 */
1481static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
1482 int page_end)
1483{
1484 int nr = page_end - page_start;
1485
1486 lockdep_assert_held(&pcpu_lock);
1487
1488 bitmap_set(chunk->populated, page_start, nr);
1489 chunk->nr_populated += nr;
1490 pcpu_nr_populated += nr;
1491
1492 pcpu_update_empty_pages(chunk, nr);
1493}

/**
 * pcpu_chunk_depopulated - post-depopulation bookkeeping
 * @chunk: pcpu_chunk which got depopulated
 * @page_start: the start page
 * @page_end: the end page
 *
 * Pages in [@page_start,@page_end) have been depopulated from @chunk.
 * Update the bookkeeping information accordingly.  Must be called after
 * each successful depopulation.
 */
1505static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
1506 int page_start, int page_end)
1507{
1508 int nr = page_end - page_start;
1509
1510 lockdep_assert_held(&pcpu_lock);
1511
1512 bitmap_clear(chunk->populated, page_start, nr);
1513 chunk->nr_populated -= nr;
1514 pcpu_nr_populated -= nr;
1515
1516 pcpu_update_empty_pages(chunk, -nr);
1517}

/*
 * Chunk management implementation.
 *
 * To allow different implementations, chunk alloc/free and
 * [de]population are implemented in a separate file which is pulled
 * into this file and compiled together.  The following functions
 * should be implemented.
 *
 * pcpu_populate_chunk		- populate the specified range of a chunk
 * pcpu_depopulate_chunk	- depopulate the specified range of a chunk
 * pcpu_create_chunk		- create a new chunk
 * pcpu_destroy_chunk		- destroy a chunk, always preceded by full depop
 * pcpu_addr_to_page		- translate address to physical address
 * pcpu_verify_alloc_info	- check alloc_info is acceptable during init
 */
1534static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
1535 int page_start, int page_end, gfp_t gfp);
1536static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
1537 int page_start, int page_end);
1538static struct pcpu_chunk *pcpu_create_chunk(enum pcpu_chunk_type type,
1539 gfp_t gfp);
1540static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
1541static struct page *pcpu_addr_to_page(void *addr);
1542static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
1543
1544#ifdef CONFIG_NEED_PER_CPU_KM
1545#include "percpu-km.c"
1546#else
1547#include "percpu-vm.c"
1548#endif

/**
 * pcpu_chunk_addr_search - determine chunk containing specified address
 * @addr: address for which the chunk needs to be determined.
 *
 * This is an internal function that handles all but static allocations.
 * Static percpu address values should never be passed into the allocator.
 *
 * RETURNS:
 * The address of the found chunk.
 */
1560static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
1561{
1562
1563 if (pcpu_addr_in_chunk(pcpu_first_chunk, addr))
1564 return pcpu_first_chunk;
1565
1566
1567 if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr))
1568 return pcpu_reserved_chunk;
1569
1570
1571
1572
1573
1574
1575
1576
1577 addr += pcpu_unit_offsets[raw_smp_processor_id()];
1578 return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
1579}
1580
1581#ifdef CONFIG_MEMCG_KMEM
1582static enum pcpu_chunk_type pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
1583 struct obj_cgroup **objcgp)
1584{
1585 struct obj_cgroup *objcg;
1586
1587 if (!memcg_kmem_enabled() || !(gfp & __GFP_ACCOUNT))
1588 return PCPU_CHUNK_ROOT;
1589
1590 objcg = get_obj_cgroup_from_current();
1591 if (!objcg)
1592 return PCPU_CHUNK_ROOT;
1593
1594 if (obj_cgroup_charge(objcg, gfp, size * num_possible_cpus())) {
1595 obj_cgroup_put(objcg);
1596 return PCPU_FAIL_ALLOC;
1597 }
1598
1599 *objcgp = objcg;
1600 return PCPU_CHUNK_MEMCG;
1601}
1602
1603static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
1604 struct pcpu_chunk *chunk, int off,
1605 size_t size)
1606{
1607 if (!objcg)
1608 return;
1609
1610 if (chunk) {
1611 chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = objcg;
1612
1613 rcu_read_lock();
1614 mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
1615 size * num_possible_cpus());
1616 rcu_read_unlock();
1617 } else {
1618 obj_cgroup_uncharge(objcg, size * num_possible_cpus());
1619 obj_cgroup_put(objcg);
1620 }
1621}
1622
1623static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
1624{
1625 struct obj_cgroup *objcg;
1626
1627 if (!pcpu_is_memcg_chunk(pcpu_chunk_type(chunk)))
1628 return;
1629
1630 objcg = chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT];
1631 chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = NULL;
1632
1633 obj_cgroup_uncharge(objcg, size * num_possible_cpus());
1634
1635 rcu_read_lock();
1636 mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
1637 -(size * num_possible_cpus()));
1638 rcu_read_unlock();
1639
1640 obj_cgroup_put(objcg);
1641}
1642
1643#else
1644static enum pcpu_chunk_type
1645pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp)
1646{
1647 return PCPU_CHUNK_ROOT;
1648}
1649
1650static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
1651 struct pcpu_chunk *chunk, int off,
1652 size_t size)
1653{
1654}
1655
1656static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
1657{
1658}
1659#endif

/**
 * pcpu_alloc - the percpu allocator
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 * @reserved: allocate from the reserved chunk if available
 * @gfp: allocation flags
 *
 * Allocate percpu area of @size bytes aligned at @align.  If @gfp doesn't
 * contain %GFP_KERNEL, the allocation is atomic.  If @gfp has __GFP_NOWARN
 * then no warning will be triggered on invalid or failed allocation
 * requests.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
1676static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
1677 gfp_t gfp)
1678{
1679 gfp_t pcpu_gfp;
1680 bool is_atomic;
1681 bool do_warn;
1682 enum pcpu_chunk_type type;
1683 struct list_head *pcpu_slot;
1684 struct obj_cgroup *objcg = NULL;
1685 static int warn_limit = 10;
1686 struct pcpu_chunk *chunk, *next;
1687 const char *err;
1688 int slot, off, cpu, ret;
1689 unsigned long flags;
1690 void __percpu *ptr;
1691 size_t bits, bit_align;
1692
1693 gfp = current_gfp_context(gfp);
1694
1695 pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
1696 is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
1697 do_warn = !(gfp & __GFP_NOWARN);
1698
1699
1700
1701
1702
1703
1704
1705 if (unlikely(align < PCPU_MIN_ALLOC_SIZE))
1706 align = PCPU_MIN_ALLOC_SIZE;
1707
1708 size = ALIGN(size, PCPU_MIN_ALLOC_SIZE);
1709 bits = size >> PCPU_MIN_ALLOC_SHIFT;
1710 bit_align = align >> PCPU_MIN_ALLOC_SHIFT;
1711
1712 if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
1713 !is_power_of_2(align))) {
1714 WARN(do_warn, "illegal size (%zu) or align (%zu) for percpu allocation\n",
1715 size, align);
1716 return NULL;
1717 }
1718
1719 type = pcpu_memcg_pre_alloc_hook(size, gfp, &objcg);
1720 if (unlikely(type == PCPU_FAIL_ALLOC))
1721 return NULL;
1722 pcpu_slot = pcpu_chunk_list(type);
1723
1724 if (!is_atomic) {
1725
1726
1727
1728
1729
1730 if (gfp & __GFP_NOFAIL) {
1731 mutex_lock(&pcpu_alloc_mutex);
1732 } else if (mutex_lock_killable(&pcpu_alloc_mutex)) {
1733 pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);
1734 return NULL;
1735 }
1736 }
1737
1738 spin_lock_irqsave(&pcpu_lock, flags);
1739
1740
1741 if (reserved && pcpu_reserved_chunk) {
1742 chunk = pcpu_reserved_chunk;
1743
1744 off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic);
1745 if (off < 0) {
1746 err = "alloc from reserved chunk failed";
1747 goto fail_unlock;
1748 }
1749
1750 off = pcpu_alloc_area(chunk, bits, bit_align, off);
1751 if (off >= 0)
1752 goto area_found;
1753
1754 err = "alloc from reserved chunk failed";
1755 goto fail_unlock;
1756 }
1757
1758restart:
1759
1760 for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
1761 list_for_each_entry_safe(chunk, next, &pcpu_slot[slot], list) {
1762 off = pcpu_find_block_fit(chunk, bits, bit_align,
1763 is_atomic);
1764 if (off < 0) {
1765 if (slot < PCPU_SLOT_FAIL_THRESHOLD)
1766 pcpu_chunk_move(chunk, 0);
1767 continue;
1768 }
1769
1770 off = pcpu_alloc_area(chunk, bits, bit_align, off);
1771 if (off >= 0)
1772 goto area_found;
1773
1774 }
1775 }
1776
1777 spin_unlock_irqrestore(&pcpu_lock, flags);
1778
1779
1780
1781
1782
1783
1784 if (is_atomic) {
1785 err = "atomic alloc failed, no space left";
1786 goto fail;
1787 }
1788
1789 if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
1790 chunk = pcpu_create_chunk(type, pcpu_gfp);
1791 if (!chunk) {
1792 err = "failed to allocate new chunk";
1793 goto fail;
1794 }
1795
1796 spin_lock_irqsave(&pcpu_lock, flags);
1797 pcpu_chunk_relocate(chunk, -1);
1798 } else {
1799 spin_lock_irqsave(&pcpu_lock, flags);
1800 }
1801
1802 goto restart;
1803
1804area_found:
1805 pcpu_stats_area_alloc(chunk, size);
1806 spin_unlock_irqrestore(&pcpu_lock, flags);
1807
1808
1809 if (!is_atomic) {
1810 unsigned int page_start, page_end, rs, re;
1811
1812 page_start = PFN_DOWN(off);
1813 page_end = PFN_UP(off + size);
1814
1815 bitmap_for_each_clear_region(chunk->populated, rs, re,
1816 page_start, page_end) {
1817 WARN_ON(chunk->immutable);
1818
1819 ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp);
1820
1821 spin_lock_irqsave(&pcpu_lock, flags);
1822 if (ret) {
1823 pcpu_free_area(chunk, off);
1824 err = "failed to populate";
1825 goto fail_unlock;
1826 }
1827 pcpu_chunk_populated(chunk, rs, re);
1828 spin_unlock_irqrestore(&pcpu_lock, flags);
1829 }
1830
1831 mutex_unlock(&pcpu_alloc_mutex);
1832 }
1833
1834 if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
1835 pcpu_schedule_balance_work();
1836
1837
1838 for_each_possible_cpu(cpu)
1839 memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
1840
1841 ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
1842 kmemleak_alloc_percpu(ptr, size, gfp);
1843
1844 trace_percpu_alloc_percpu(reserved, is_atomic, size, align,
1845 chunk->base_addr, off, ptr);
1846
1847 pcpu_memcg_post_alloc_hook(objcg, chunk, off, size);
1848
1849 return ptr;
1850
1851fail_unlock:
1852 spin_unlock_irqrestore(&pcpu_lock, flags);
1853fail:
1854 trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);
1855
1856 if (!is_atomic && do_warn && warn_limit) {
1857 pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
1858 size, align, is_atomic, err);
1859 dump_stack();
1860 if (!--warn_limit)
1861 pr_info("limit reached, disable warning\n");
1862 }
1863 if (is_atomic) {
1864
1865 pcpu_atomic_alloc_failed = true;
1866 pcpu_schedule_balance_work();
1867 } else {
1868 mutex_unlock(&pcpu_alloc_mutex);
1869 }
1870
1871 pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);
1872
1873 return NULL;
1874}

/**
 * __alloc_percpu_gfp - allocate dynamic percpu area
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 * @gfp: allocation flags
 *
 * Allocate zero-filled percpu area of @size bytes aligned at @align.  If
 * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can
 * be called from any context but is a lot more likely to fail.  If @gfp
 * has __GFP_NOWARN then no warning will be triggered on invalid or failed
 * allocation requests.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
1891void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp)
1892{
1893 return pcpu_alloc(size, align, false, gfp);
1894}
1895EXPORT_SYMBOL_GPL(__alloc_percpu_gfp);

/**
 * __alloc_percpu - allocate dynamic percpu area
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 *
 * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL).
 */
1904void __percpu *__alloc_percpu(size_t size, size_t align)
1905{
1906 return pcpu_alloc(size, align, false, GFP_KERNEL);
1907}
1908EXPORT_SYMBOL_GPL(__alloc_percpu);
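
/*
 * Illustrative use of the dynamic percpu API exported above (the names
 * below are made up for the example):
 *
 *	struct foo_stats { u64 hits; u64 misses; };
 *	struct foo_stats __percpu *stats;
 *	u64 total = 0;
 *	int cpu;
 *
 *	stats = alloc_percpu(struct foo_stats);	// wraps __alloc_percpu()
 *	if (!stats)
 *		return -ENOMEM;
 *	this_cpu_inc(stats->hits);
 *	for_each_possible_cpu(cpu)
 *		total += per_cpu_ptr(stats, cpu)->hits;
 *	free_percpu(stats);
 */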

/**
 * __alloc_reserved_percpu - allocate reserved percpu area
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 *
 * Allocate zero-filled percpu area of @size bytes aligned at @align
 * from reserved percpu area if arch has set it up; otherwise,
 * allocation is served from the same dynamic area.  Might sleep.
 * Might trigger writeouts.
 *
 * CONTEXT:
 * Does GFP_KERNEL allocation.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
1926void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
1927{
1928 return pcpu_alloc(size, align, true, GFP_KERNEL);
1929}

/**
 * __pcpu_balance_workfn - manage the amount of free chunks and populated pages
 * @type: chunk type
 *
 * Reclaim all fully free chunks except for the first one.  Then make sure
 * there are PCPU_EMPTY_POP_PAGES_HIGH empty populated pages so that atomic
 * allocations can be served without hitting the slow path.
 */
1942static void __pcpu_balance_workfn(enum pcpu_chunk_type type)
1943{
1944
1945 const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
1946 LIST_HEAD(to_free);
1947 struct list_head *pcpu_slot = pcpu_chunk_list(type);
1948 struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
1949 struct pcpu_chunk *chunk, *next;
1950 int slot, nr_to_pop, ret;
1951
1952
1953
1954
1955
1956 mutex_lock(&pcpu_alloc_mutex);
1957 spin_lock_irq(&pcpu_lock);
1958
1959 list_for_each_entry_safe(chunk, next, free_head, list) {
1960 WARN_ON(chunk->immutable);
1961
1962
1963 if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
1964 continue;
1965
1966 list_move(&chunk->list, &to_free);
1967 }
1968
1969 spin_unlock_irq(&pcpu_lock);
1970
1971 list_for_each_entry_safe(chunk, next, &to_free, list) {
1972 unsigned int rs, re;
1973
1974 bitmap_for_each_set_region(chunk->populated, rs, re, 0,
1975 chunk->nr_pages) {
1976 pcpu_depopulate_chunk(chunk, rs, re);
1977 spin_lock_irq(&pcpu_lock);
1978 pcpu_chunk_depopulated(chunk, rs, re);
1979 spin_unlock_irq(&pcpu_lock);
1980 }
1981 pcpu_destroy_chunk(chunk);
1982 cond_resched();
1983 }
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995retry_pop:
1996 if (pcpu_atomic_alloc_failed) {
1997 nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
1998
1999 pcpu_atomic_alloc_failed = false;
2000 } else {
2001 nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
2002 pcpu_nr_empty_pop_pages,
2003 0, PCPU_EMPTY_POP_PAGES_HIGH);
2004 }
2005
2006 for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
2007 unsigned int nr_unpop = 0, rs, re;
2008
2009 if (!nr_to_pop)
2010 break;
2011
2012 spin_lock_irq(&pcpu_lock);
2013 list_for_each_entry(chunk, &pcpu_slot[slot], list) {
2014 nr_unpop = chunk->nr_pages - chunk->nr_populated;
2015 if (nr_unpop)
2016 break;
2017 }
2018 spin_unlock_irq(&pcpu_lock);
2019
2020 if (!nr_unpop)
2021 continue;
2022
2023
2024 bitmap_for_each_clear_region(chunk->populated, rs, re, 0,
2025 chunk->nr_pages) {
2026 int nr = min_t(int, re - rs, nr_to_pop);
2027
2028 ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp);
2029 if (!ret) {
2030 nr_to_pop -= nr;
2031 spin_lock_irq(&pcpu_lock);
2032 pcpu_chunk_populated(chunk, rs, rs + nr);
2033 spin_unlock_irq(&pcpu_lock);
2034 } else {
2035 nr_to_pop = 0;
2036 }
2037
2038 if (!nr_to_pop)
2039 break;
2040 }
2041 }
2042
2043 if (nr_to_pop) {
2044
2045 chunk = pcpu_create_chunk(type, gfp);
2046 if (chunk) {
2047 spin_lock_irq(&pcpu_lock);
2048 pcpu_chunk_relocate(chunk, -1);
2049 spin_unlock_irq(&pcpu_lock);
2050 goto retry_pop;
2051 }
2052 }
2053
2054 mutex_unlock(&pcpu_alloc_mutex);
2055}

/**
 * pcpu_balance_workfn - manage the amount of free chunks and populated pages
 * @work: unused
 *
 * Call __pcpu_balance_workfn() for each chunk type.
 */
2063static void pcpu_balance_workfn(struct work_struct *work)
2064{
2065 enum pcpu_chunk_type type;
2066
2067 for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
2068 __pcpu_balance_workfn(type);
2069}

/**
 * free_percpu - free percpu area
 * @ptr: pointer to area to free
 *
 * Free percpu area @ptr.
 *
 * CONTEXT:
 * Can be called from atomic context.
 */
2080void free_percpu(void __percpu *ptr)
2081{
2082 void *addr;
2083 struct pcpu_chunk *chunk;
2084 unsigned long flags;
2085 int size, off;
2086 bool need_balance = false;
2087 struct list_head *pcpu_slot;
2088
2089 if (!ptr)
2090 return;
2091
2092 kmemleak_free_percpu(ptr);
2093
2094 addr = __pcpu_ptr_to_addr(ptr);
2095
2096 spin_lock_irqsave(&pcpu_lock, flags);
2097
2098 chunk = pcpu_chunk_addr_search(addr);
2099 off = addr - chunk->base_addr;
2100
2101 size = pcpu_free_area(chunk, off);
2102
2103 pcpu_slot = pcpu_chunk_list(pcpu_chunk_type(chunk));
2104
2105 pcpu_memcg_free_hook(chunk, off, size);
2106
2107
2108 if (chunk->free_bytes == pcpu_unit_size) {
2109 struct pcpu_chunk *pos;
2110
2111 list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
2112 if (pos != chunk) {
2113 need_balance = true;
2114 break;
2115 }
2116 }
2117
2118 trace_percpu_free_percpu(chunk->base_addr, off, ptr);
2119
2120 spin_unlock_irqrestore(&pcpu_lock, flags);
2121
2122 if (need_balance)
2123 pcpu_schedule_balance_work();
2124}
2125EXPORT_SYMBOL_GPL(free_percpu);
2126
2127bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
2128{
2129#ifdef CONFIG_SMP
2130 const size_t static_size = __per_cpu_end - __per_cpu_start;
2131 void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
2132 unsigned int cpu;
2133
2134 for_each_possible_cpu(cpu) {
2135 void *start = per_cpu_ptr(base, cpu);
2136 void *va = (void *)addr;
2137
2138 if (va >= start && va < start + static_size) {
2139 if (can_addr) {
2140 *can_addr = (unsigned long) (va - start);
2141 *can_addr += (unsigned long)
2142 per_cpu_ptr(base, get_boot_cpu_id());
2143 }
2144 return true;
2145 }
2146 }
2147#endif
2148
2149 return false;
2150}

/**
 * is_kernel_percpu_address - test whether address is from static percpu area
 * @addr: address to test
 *
 * Test whether @addr belongs to in-kernel static percpu area.  Module
 * static percpu areas are not considered.  For those, use
 * is_module_percpu_address().
 *
 * RETURNS:
 * %true if @addr is from in-kernel static percpu area, %false otherwise.
 */
2163bool is_kernel_percpu_address(unsigned long addr)
2164{
2165 return __is_kernel_percpu_address(addr, NULL);
2166}

/**
 * per_cpu_ptr_to_phys - convert translated percpu address to physical address
 * @addr: the address to be converted to physical address
 *
 * Given @addr which is dereferenceable address obtained via one of
 * percpu access macros, this function translates it into its physical
 * address.  The caller is responsible for ensuring @addr stays valid
 * until this function finishes.
 *
 * percpu allocator has special setup for the first chunk, which currently
 * supports either embedding in linear address space or vmalloc mapping,
 * and, from the second one, the backing allocator (currently either vm or
 * km) provides translation.
 *
 * The addr can be translated simply without checking if it falls into the
 * first chunk.  But the current code reflects better how percpu allocator
 * actually works, and the verification can discover both bugs in percpu
 * allocator itself and per_cpu_ptr_to_phys() callers.  So we keep current
 * code.
 *
 * RETURNS:
 * The physical address for @addr.
 */
2191phys_addr_t per_cpu_ptr_to_phys(void *addr)
2192{
2193 void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
2194 bool in_first_chunk = false;
2195 unsigned long first_low, first_high;
2196 unsigned int cpu;
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208 first_low = (unsigned long)pcpu_base_addr +
2209 pcpu_unit_page_offset(pcpu_low_unit_cpu, 0);
2210 first_high = (unsigned long)pcpu_base_addr +
2211 pcpu_unit_page_offset(pcpu_high_unit_cpu, pcpu_unit_pages);
2212 if ((unsigned long)addr >= first_low &&
2213 (unsigned long)addr < first_high) {
2214 for_each_possible_cpu(cpu) {
2215 void *start = per_cpu_ptr(base, cpu);
2216
2217 if (addr >= start && addr < start + pcpu_unit_size) {
2218 in_first_chunk = true;
2219 break;
2220 }
2221 }
2222 }
2223
2224 if (in_first_chunk) {
2225 if (!is_vmalloc_addr(addr))
2226 return __pa(addr);
2227 else
2228 return page_to_phys(vmalloc_to_page(addr)) +
2229 offset_in_page(addr);
2230 } else
2231 return page_to_phys(pcpu_addr_to_page(addr)) +
2232 offset_in_page(addr);
2233}

/**
 * pcpu_alloc_alloc_info - allocate percpu allocation info
 * @nr_groups: the number of groups
 * @nr_units: the number of units
 *
 * Allocate ai which is large enough for @nr_groups groups containing
 * @nr_units units.  The returned ai's groups[0].cpu_map points to the
 * cpu_map array which is long enough for @nr_units and filled with
 * NR_CPUS.  It's the caller's responsibility to initialize cpu_map
 * pointer of other groups.
 *
 * RETURNS:
 * Pointer to the allocated pcpu_alloc_info on success, NULL on failure.
 */
2250struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
2251 int nr_units)
2252{
2253 struct pcpu_alloc_info *ai;
2254 size_t base_size, ai_size;
2255 void *ptr;
2256 int unit;
2257
2258 base_size = ALIGN(struct_size(ai, groups, nr_groups),
2259 __alignof__(ai->groups[0].cpu_map[0]));
2260 ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
2261
2262 ptr = memblock_alloc(PFN_ALIGN(ai_size), PAGE_SIZE);
2263 if (!ptr)
2264 return NULL;
2265 ai = ptr;
2266 ptr += base_size;
2267
2268 ai->groups[0].cpu_map = ptr;
2269
2270 for (unit = 0; unit < nr_units; unit++)
2271 ai->groups[0].cpu_map[unit] = NR_CPUS;
2272
2273 ai->nr_groups = nr_groups;
2274 ai->__ai_size = PFN_ALIGN(ai_size);
2275
2276 return ai;
2277}

/**
 * pcpu_free_alloc_info - free percpu allocation info
 * @ai: pcpu_alloc_info to free
 *
 * Free @ai which was allocated by pcpu_alloc_alloc_info().
 */
2285void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
2286{
2287 memblock_free_early(__pa(ai), ai->__ai_size);
2288}

/**
 * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
 * @lvl: loglevel
 * @ai: allocation info to dump
 *
 * Print out information about @ai using loglevel @lvl.
 */
2297static void pcpu_dump_alloc_info(const char *lvl,
2298 const struct pcpu_alloc_info *ai)
2299{
2300 int group_width = 1, cpu_width = 1, width;
2301 char empty_str[] = "--------";
2302 int alloc = 0, alloc_end = 0;
2303 int group, v;
2304 int upa, apl;
2305
2306 v = ai->nr_groups;
2307 while (v /= 10)
2308 group_width++;
2309
2310 v = num_possible_cpus();
2311 while (v /= 10)
2312 cpu_width++;
2313 empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';
2314
2315 upa = ai->alloc_size / ai->unit_size;
2316 width = upa * (cpu_width + 1) + group_width + 3;
2317 apl = rounddown_pow_of_two(max(60 / width, 1));
2318
2319 printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
2320 lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
2321 ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);
2322
2323 for (group = 0; group < ai->nr_groups; group++) {
2324 const struct pcpu_group_info *gi = &ai->groups[group];
2325 int unit = 0, unit_end = 0;
2326
2327 BUG_ON(gi->nr_units % upa);
2328 for (alloc_end += gi->nr_units / upa;
2329 alloc < alloc_end; alloc++) {
2330 if (!(alloc % apl)) {
2331 pr_cont("\n");
2332 printk("%spcpu-alloc: ", lvl);
2333 }
2334 pr_cont("[%0*d] ", group_width, group);
2335
2336 for (unit_end += upa; unit < unit_end; unit++)
2337 if (gi->cpu_map[unit] != NR_CPUS)
2338 pr_cont("%0*d ",
2339 cpu_width, gi->cpu_map[unit]);
2340 else
2341 pr_cont("%s ", empty_str);
2342 }
2343 }
2344 pr_cont("\n");
2345}

/**
 * pcpu_setup_first_chunk - initialize the first percpu chunk
 * @ai: pcpu_alloc_info describing how to percpu area is shaped
 * @base_addr: mapped address
 *
 * Initialize the first percpu chunk which contains the kernel static
 * percpu area.  This function is to be called from arch percpu area
 * setup path.
 *
 * @ai contains all information necessary to initialize the first
 * chunk and prime the dynamic percpu allocator.
 *
 * @ai->static_size is the size of static percpu area.
 *
 * @ai->reserved_size, if non-zero, specifies the amount of bytes to
 * reserve after the static area in the first chunk.  This reserves
 * the first chunk such that it's available only through reserved
 * percpu allocation.  This is primarily used to serve module percpu
 * static areas on architectures where the addressing model has
 * limited offset range for symbol relocations.
 *
 * @ai->dyn_size determines the number of bytes available for dynamic
 * allocation in the first chunk.  @ai->unit_size specifies the unit
 * size, must be page aligned and at least as large as static_size +
 * reserved_size + dyn_size.  @ai->atom_size is the allocation atom
 * size and used as alignment for vm areas; @ai->alloc_size is the
 * allocation size and always a multiple of @ai->atom_size.
 *
 * @ai->nr_groups and @ai->groups describe the virtual memory layout of
 * the percpu areas; units which should be colocated are put into the
 * same group.
 *
 * The caller should have mapped the first chunk at @base_addr and
 * copied static data to each unit.
 *
 * The first chunk will always contain a static and a dynamic region.
 * If a reserved region is present, the first chunk is served by two
 * chunks - one for the reserved region and one for the dynamic region -
 * which share the same vm but use disjoint regions of the allocation
 * map.  The chunk serving the dynamic region is circulated in the chunk
 * slots and available for dynamic allocation like any other chunk.
 */
2401void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
2402 void *base_addr)
2403{
2404 size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
2405 size_t static_size, dyn_size;
2406 struct pcpu_chunk *chunk;
2407 unsigned long *group_offsets;
2408 size_t *group_sizes;
2409 unsigned long *unit_off;
2410 unsigned int cpu;
2411 int *unit_map;
2412 int group, unit, i;
2413 int map_size;
2414 unsigned long tmp_addr;
2415 size_t alloc_size;
2416 enum pcpu_chunk_type type;
2417
2418#define PCPU_SETUP_BUG_ON(cond) do { \
2419 if (unlikely(cond)) { \
2420 pr_emerg("failed to initialize, %s\n", #cond); \
2421 pr_emerg("cpu_possible_mask=%*pb\n", \
2422 cpumask_pr_args(cpu_possible_mask)); \
2423 pcpu_dump_alloc_info(KERN_EMERG, ai); \
2424 BUG(); \
2425 } \
2426} while (0)
2427
2428
2429 PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
2430#ifdef CONFIG_SMP
2431 PCPU_SETUP_BUG_ON(!ai->static_size);
2432 PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start));
2433#endif
2434 PCPU_SETUP_BUG_ON(!base_addr);
2435 PCPU_SETUP_BUG_ON(offset_in_page(base_addr));
2436 PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
2437 PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size));
2438 PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
2439 PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE));
2440 PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
2441 PCPU_SETUP_BUG_ON(!ai->dyn_size);
2442 PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE));
2443 PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) ||
2444 IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE)));
2445 PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
2446
2447
2448 alloc_size = ai->nr_groups * sizeof(group_offsets[0]);
2449 group_offsets = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
2450 if (!group_offsets)
2451 panic("%s: Failed to allocate %zu bytes\n", __func__,
2452 alloc_size);
2453
2454 alloc_size = ai->nr_groups * sizeof(group_sizes[0]);
2455 group_sizes = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
2456 if (!group_sizes)
2457 panic("%s: Failed to allocate %zu bytes\n", __func__,
2458 alloc_size);
2459
2460 alloc_size = nr_cpu_ids * sizeof(unit_map[0]);
2461 unit_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
2462 if (!unit_map)
2463 panic("%s: Failed to allocate %zu bytes\n", __func__,
2464 alloc_size);
2465
2466 alloc_size = nr_cpu_ids * sizeof(unit_off[0]);
2467 unit_off = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
2468 if (!unit_off)
2469 panic("%s: Failed to allocate %zu bytes\n", __func__,
2470 alloc_size);
2471
2472 for (cpu = 0; cpu < nr_cpu_ids; cpu++)
2473 unit_map[cpu] = UINT_MAX;
2474
2475 pcpu_low_unit_cpu = NR_CPUS;
2476 pcpu_high_unit_cpu = NR_CPUS;
2477
2478 for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
2479 const struct pcpu_group_info *gi = &ai->groups[group];
2480
2481 group_offsets[group] = gi->base_offset;
2482 group_sizes[group] = gi->nr_units * ai->unit_size;
2483
2484 for (i = 0; i < gi->nr_units; i++) {
2485 cpu = gi->cpu_map[i];
2486 if (cpu == NR_CPUS)
2487 continue;
2488
2489 PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids);
2490 PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
2491 PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);
2492
2493 unit_map[cpu] = unit + i;
2494 unit_off[cpu] = gi->base_offset + i * ai->unit_size;
2495
2496
2497 if (pcpu_low_unit_cpu == NR_CPUS ||
2498 unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
2499 pcpu_low_unit_cpu = cpu;
2500 if (pcpu_high_unit_cpu == NR_CPUS ||
2501 unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
2502 pcpu_high_unit_cpu = cpu;
2503 }
2504 }
2505 pcpu_nr_units = unit;
2506
2507 for_each_possible_cpu(cpu)
2508 PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);
2509
2510
2511#undef PCPU_SETUP_BUG_ON
2512 pcpu_dump_alloc_info(KERN_DEBUG, ai);
2513
2514 pcpu_nr_groups = ai->nr_groups;
2515 pcpu_group_offsets = group_offsets;
2516 pcpu_group_sizes = group_sizes;
2517 pcpu_unit_map = unit_map;
2518 pcpu_unit_offsets = unit_off;
2519
2520
2521 pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
2522 pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
2523 pcpu_atom_size = ai->atom_size;
2524 pcpu_chunk_struct_size = struct_size(chunk, populated,
2525 BITS_TO_LONGS(pcpu_unit_pages));
2526
2527 pcpu_stats_save_ai(ai);
2528
2529
2530
2531
2532
2533 pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
2534 pcpu_chunk_lists = memblock_alloc(pcpu_nr_slots *
2535 sizeof(pcpu_chunk_lists[0]) *
2536 PCPU_NR_CHUNK_TYPES,
2537 SMP_CACHE_BYTES);
2538 if (!pcpu_chunk_lists)
2539 panic("%s: Failed to allocate %zu bytes\n", __func__,
2540 pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]) *
2541 PCPU_NR_CHUNK_TYPES);
2542
2543 for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
2544 for (i = 0; i < pcpu_nr_slots; i++)
2545 INIT_LIST_HEAD(&pcpu_chunk_list(type)[i]);
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555 static_size = ALIGN(ai->static_size, PCPU_MIN_ALLOC_SIZE);
2556 dyn_size = ai->dyn_size - (static_size - ai->static_size);
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566 tmp_addr = (unsigned long)base_addr + static_size;
2567 map_size = ai->reserved_size ?: dyn_size;
2568 chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
2569
2570
2571 if (ai->reserved_size) {
2572 pcpu_reserved_chunk = chunk;
2573
2574 tmp_addr = (unsigned long)base_addr + static_size +
2575 ai->reserved_size;
2576 map_size = dyn_size;
2577 chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
2578 }
2579
2580
2581 pcpu_first_chunk = chunk;
2582 pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages;
2583 pcpu_chunk_relocate(pcpu_first_chunk, -1);
2584
2585
2586 pcpu_nr_populated += PFN_DOWN(size_sum);
2587
2588 pcpu_stats_chunk_alloc();
2589 trace_percpu_create_chunk(base_addr);
2590
2591
2592 pcpu_base_addr = base_addr;
2593}
2594
2595#ifdef CONFIG_SMP
2596
2597const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = {
2598 [PCPU_FC_AUTO] = "auto",
2599 [PCPU_FC_EMBED] = "embed",
2600 [PCPU_FC_PAGE] = "page",
2601};
2602
2603enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
2604
2605static int __init percpu_alloc_setup(char *str)
2606{
2607 if (!str)
2608 return -EINVAL;
2609
2610 if (0)
2611 ;
2612#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
2613 else if (!strcmp(str, "embed"))
2614 pcpu_chosen_fc = PCPU_FC_EMBED;
2615#endif
2616#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
2617 else if (!strcmp(str, "page"))
2618 pcpu_chosen_fc = PCPU_FC_PAGE;
2619#endif
2620 else
2621 pr_warn("unknown allocator %s specified\n", str);
2622
2623 return 0;
2624}
2625early_param("percpu_alloc", percpu_alloc_setup);

/*
 * pcpu_embed_first_chunk() is used by the generic percpu setup.
 * Build it if needed by the arch config or the generic setup is used.
 */
2632#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
2633 !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
2634#define BUILD_EMBED_FIRST_CHUNK
2635#endif
2636
2637
2638#if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
2639#define BUILD_PAGE_FIRST_CHUNK
2640#endif
2641
2642
2643#if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)

/**
 * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
 * @reserved_size: the size of reserved percpu area in bytes
 * @dyn_size: minimum free size for dynamic allocation in bytes
 * @atom_size: allocation atom size
 * @cpu_distance_fn: callback to determine distance between cpus, optional
 *
 * This function determines grouping of units, their mappings to cpus
 * and other parameters considering needed percpu size, allocation
 * atom size and distances between CPUs.
 *
 * Groups are always multiples of atom size and CPUs which are of
 * LOCAL_DISTANCE both ways are grouped together and share space for
 * units in the same group.  The returned configuration is guaranteed
 * to have CPUs on different nodes on different groups and >=75% usage
 * of allocated virtual address space.
 *
 * RETURNS:
 * On success, pointer to the new allocation_info is returned.  On
 * failure, ERR_PTR value is returned.
 */
2665static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
2666 size_t reserved_size, size_t dyn_size,
2667 size_t atom_size,
2668 pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
2669{
2670 static int group_map[NR_CPUS] __initdata;
2671 static int group_cnt[NR_CPUS] __initdata;
2672 const size_t static_size = __per_cpu_end - __per_cpu_start;
2673 int nr_groups = 1, nr_units = 0;
2674 size_t size_sum, min_unit_size, alloc_size;
2675 int upa, max_upa, best_upa;
2676 int last_allocs, group, unit;
2677 unsigned int cpu, tcpu;
2678 struct pcpu_alloc_info *ai;
2679 unsigned int *cpu_map;
2680
2681
2682 memset(group_map, 0, sizeof(group_map));
2683 memset(group_cnt, 0, sizeof(group_cnt));
2684
2685
2686 size_sum = PFN_ALIGN(static_size + reserved_size +
2687 max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
2688 dyn_size = size_sum - static_size - reserved_size;
2689
2690
2691
2692
2693
2694
2695
2696 min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
2697
2698
2699 alloc_size = roundup(min_unit_size, atom_size);
2700 upa = alloc_size / min_unit_size;
2701 while (alloc_size % upa || (offset_in_page(alloc_size / upa)))
2702 upa--;
2703 max_upa = upa;
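
	/*
	 * For example (illustrative numbers): with atom_size == 2MB and
	 * min_unit_size == 64KB, alloc_size is rounded up to 2MB and upa
	 * starts at 32; 2MB is divisible by 32 and 2MB / 32 == 64KB is
	 * page aligned, so max_upa == 32, i.e. up to 32 units can share
	 * one 2MB allocation.
	 */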
2704
2705
2706 for_each_possible_cpu(cpu) {
2707 group = 0;
2708 next_group:
2709 for_each_possible_cpu(tcpu) {
2710 if (cpu == tcpu)
2711 break;
2712 if (group_map[tcpu] == group && cpu_distance_fn &&
2713 (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
2714 cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
2715 group++;
2716 nr_groups = max(nr_groups, group + 1);
2717 goto next_group;
2718 }
2719 }
2720 group_map[cpu] = group;
2721 group_cnt[group]++;
2722 }

	/*
	 * Select units-per-allocation (upa): scan from max_upa downwards,
	 * reject configurations that waste more than a third of the
	 * possible CPUs' worth of units, and stop once lowering upa
	 * starts costing additional allocations.
	 */
	last_allocs = INT_MAX;
	for (upa = max_upa; upa; upa--) {
		int allocs = 0, wasted = 0;

		if (alloc_size % upa || (offset_in_page(alloc_size / upa)))
			continue;

		for (group = 0; group < nr_groups; group++) {
			int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
			allocs += this_allocs;
			wasted += this_allocs * upa - group_cnt[group];
		}

		/*
		 * Don't accept if wastage is over 1/3.  The
		 * greater-than comparison ensures upa==1 always
		 * passes the following check.
		 */
		if (wasted > num_possible_cpus() / 3)
			continue;

		/* and then don't consume more memory */
		if (allocs > last_allocs)
			break;
		last_allocs = allocs;
		best_upa = upa;
	}
	upa = best_upa;

	/* allocate and fill alloc_info and groups */
	for (group = 0; group < nr_groups; group++)
		nr_units += roundup(group_cnt[group], upa);

	ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
	if (!ai)
		return ERR_PTR(-ENOMEM);
	cpu_map = ai->groups[0].cpu_map;

	for (group = 0; group < nr_groups; group++) {
		ai->groups[group].cpu_map = cpu_map;
		cpu_map += roundup(group_cnt[group], upa);
	}

	ai->static_size = static_size;
	ai->reserved_size = reserved_size;
	ai->dyn_size = dyn_size;
	ai->unit_size = alloc_size / upa;
	ai->atom_size = atom_size;
	ai->alloc_size = alloc_size;

	for (group = 0, unit = 0; group < nr_groups; group++) {
		struct pcpu_group_info *gi = &ai->groups[group];

		/*
		 * Initialize base_offset as if all groups are located
		 * back-to-back.  The caller should update this to
		 * reflect actual allocation.
		 */
		gi->base_offset = unit * ai->unit_size;

		for_each_possible_cpu(cpu)
			if (group_map[cpu] == group)
				gi->cpu_map[gi->nr_units++] = cpu;
		gi->nr_units = roundup(gi->nr_units, upa);
		unit += gi->nr_units;
	}
	BUG_ON(unit != nr_units);

	return ai;
}
#endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */

#if defined(BUILD_EMBED_FIRST_CHUNK)
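/**
 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
 * @reserved_size: the size of reserved percpu area in bytes
 * @dyn_size: minimum free size for dynamic allocation in bytes
 * @atom_size: allocation atom size
 * @cpu_distance_fn: callback to determine distance between cpus, optional
 * @alloc_fn: function to allocate percpu memory for a group
 * @free_fn: function to free percpu memory
 *
 * This is a helper to ease setting up an embedded first percpu chunk and
 * can be called where pcpu_setup_first_chunk() is expected.
 *
 * The first chunk is allocated by calling @alloc_fn and used as-is
 * without being mapped into the vmalloc area.  Allocations are always
 * whole multiples of @atom_size aligned to @atom_size, and any space
 * beyond the needed size in each unit is returned with @free_fn.
 *
 * On NUMA machines the resulting cpu->unit mapping can be very sparse,
 * in which case a large span of vmalloc address space is required; the
 * function warns (and fails when a page fallback exists) if the span
 * exceeds three quarters of the vmalloc area.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */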
int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
				  size_t atom_size,
				  pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
				  pcpu_fc_alloc_fn_t alloc_fn,
				  pcpu_fc_free_fn_t free_fn)
{
	void *base = (void *)ULONG_MAX;
	void **areas = NULL;
	struct pcpu_alloc_info *ai;
	size_t size_sum, areas_size;
	unsigned long max_distance;
	int group, i, highest_group, rc = 0;

	ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
				   cpu_distance_fn);
	if (IS_ERR(ai))
		return PTR_ERR(ai);

	size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
	areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));

	areas = memblock_alloc(areas_size, SMP_CACHE_BYTES);
	if (!areas) {
		rc = -ENOMEM;
		goto out_free;
	}

	/* allocate, copy and determine base address & max_distance */
	highest_group = 0;
	for (group = 0; group < ai->nr_groups; group++) {
		struct pcpu_group_info *gi = &ai->groups[group];
		unsigned int cpu = NR_CPUS;
		void *ptr;

		for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
			cpu = gi->cpu_map[i];
		BUG_ON(cpu == NR_CPUS);

		/* allocate space for the whole group */
		ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
		if (!ptr) {
			rc = -ENOMEM;
			goto out_free_areas;
		}
		/* kmemleak tracks the percpu allocations separately */
		kmemleak_free(ptr);
		areas[group] = ptr;

		base = min(ptr, base);
		if (ptr > areas[highest_group])
			highest_group = group;
	}
	max_distance = areas[highest_group] - base;
	max_distance += ai->unit_size * ai->groups[highest_group].nr_units;

	/* warn if maximum distance is further than 75% of vmalloc space */
	if (max_distance > VMALLOC_TOTAL * 3 / 4) {
		pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx\n",
			max_distance, VMALLOC_TOTAL);
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
		/* and fail if we have a fallback */
		rc = -EINVAL;
		goto out_free_areas;
#endif
	}

	/*
	 * Copy data and free unused parts.  This should happen after all
	 * allocations are complete; otherwise, we may end up with
	 * incorrectly freed sparse vmalloc area.
	 */
	for (group = 0; group < ai->nr_groups; group++) {
		struct pcpu_group_info *gi = &ai->groups[group];
		void *ptr = areas[group];

		for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
			if (gi->cpu_map[i] == NR_CPUS) {
				/* unused unit, free whole */
				free_fn(ptr, ai->unit_size);
				continue;
			}
			/* copy and return the unused part */
			memcpy(ptr, __per_cpu_load, ai->static_size);
			free_fn(ptr + size_sum, ai->unit_size - size_sum);
		}
	}

	/* base address is now known, determine group base offsets */
	for (group = 0; group < ai->nr_groups; group++)
		ai->groups[group].base_offset = areas[group] - base;

	pr_info("Embedded %zu pages/cpu s%zu r%zu d%zu u%zu\n",
		PFN_DOWN(size_sum), ai->static_size, ai->reserved_size,
		ai->dyn_size, ai->unit_size);

	pcpu_setup_first_chunk(ai, base);
	goto out_free;

out_free_areas:
	for (group = 0; group < ai->nr_groups; group++)
		if (areas[group])
			free_fn(areas[group],
				ai->groups[group].nr_units * ai->unit_size);
out_free:
	pcpu_free_alloc_info(ai);
	if (areas)
		memblock_free_early(__pa(areas), areas_size);
	return rc;
}
#endif /* BUILD_EMBED_FIRST_CHUNK */

#ifdef BUILD_PAGE_FIRST_CHUNK
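/**
 * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
 * @reserved_size: the size of reserved percpu area in bytes
 * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
 * @free_fn: function to free percpu page, always called with PAGE_SIZE
 * @populate_pte_fn: function to populate pte
 *
 * This is a helper to ease setting up a page-remapped first percpu
 * chunk and can be called where pcpu_setup_first_chunk() is expected.
 * The percpu area is allocated page-by-page and mapped into the
 * vmalloc area.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */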
int __init pcpu_page_first_chunk(size_t reserved_size,
				 pcpu_fc_alloc_fn_t alloc_fn,
				 pcpu_fc_free_fn_t free_fn,
				 pcpu_fc_populate_pte_fn_t populate_pte_fn)
{
	static struct vm_struct vm;
	struct pcpu_alloc_info *ai;
	char psize_str[16];
	int unit_pages;
	size_t pages_size;
	struct page **pages;
	int unit, i, j, rc = 0;
	int upa;
	int nr_g0_units;

	snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);

	ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL);
	if (IS_ERR(ai))
		return PTR_ERR(ai);
	BUG_ON(ai->nr_groups != 1);
	upa = ai->alloc_size / ai->unit_size;
	nr_g0_units = roundup(num_possible_cpus(), upa);
	if (WARN_ON(ai->groups[0].nr_units != nr_g0_units)) {
		pcpu_free_alloc_info(ai);
		return -EINVAL;
	}

	unit_pages = ai->unit_size >> PAGE_SHIFT;

	/* unaligned allocations can't be freed, round up to page size */
	pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
			       sizeof(pages[0]));
	pages = memblock_alloc(pages_size, SMP_CACHE_BYTES);
	if (!pages)
		panic("%s: Failed to allocate %zu bytes\n", __func__,
		      pages_size);

	/* allocate pages */
	j = 0;
	for (unit = 0; unit < num_possible_cpus(); unit++) {
		unsigned int cpu = ai->groups[0].cpu_map[unit];
		for (i = 0; i < unit_pages; i++) {
			void *ptr;

			ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
			if (!ptr) {
				pr_warn("failed to allocate %s page for cpu%u\n",
					psize_str, cpu);
				goto enomem;
			}
			/* kmemleak tracks the percpu allocations separately */
			kmemleak_free(ptr);
			pages[j++] = virt_to_page(ptr);
		}
	}

	/* allocate vm area, map the pages and copy static data */
	vm.flags = VM_ALLOC;
	vm.size = num_possible_cpus() * ai->unit_size;
	vm_area_register_early(&vm, PAGE_SIZE);

	for (unit = 0; unit < num_possible_cpus(); unit++) {
		unsigned long unit_addr =
			(unsigned long)vm.addr + unit * ai->unit_size;

		for (i = 0; i < unit_pages; i++)
			populate_pte_fn(unit_addr + (i << PAGE_SHIFT));

		/* pte already populated, the following shouldn't fail */
		rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
				      unit_pages);
		if (rc < 0)
			panic("failed to map percpu area, err=%d\n", rc);

		/* copy static data */
		memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
	}

	/* we're done, commit */
	pr_info("%d %s pages/cpu s%zu r%zu d%zu\n",
		unit_pages, psize_str, ai->static_size,
		ai->reserved_size, ai->dyn_size);

	pcpu_setup_first_chunk(ai, vm.addr);
	goto out_free_ar;

enomem:
	while (--j >= 0)
		free_fn(page_address(pages[j]), PAGE_SIZE);
	rc = -ENOMEM;
out_free_ar:
	memblock_free_early(__pa(pages), pages_size);
	pcpu_free_alloc_info(ai);
	return rc;
}
#endif /* BUILD_PAGE_FIRST_CHUNK */

#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
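/*
 * Generic SMP percpu area setup.
 *
 * The embedding helper is used because its behavior closely resembles
 * the original non-dynamic generic percpu area setup, and in non-NUMA
 * cases embedding avoids the extra TLB overhead of the page first
 * chunk.
 */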
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);

static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
				       size_t align)
{
	return memblock_alloc_from(size, align, __pa(MAX_DMA_ADDRESS));
}

static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
{
	memblock_free_early(__pa(ptr), size);
}

void __init setup_per_cpu_areas(void)
{
	unsigned long delta;
	unsigned int cpu;
	int rc;

	/*
	 * Always reserve area for module percpu variables.  That's
	 * what the legacy allocator did.
	 */
	rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
				    PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
				    pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
	if (rc < 0)
		panic("Failed to initialize percpu areas.");

	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
	for_each_possible_cpu(cpu)
		__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
}
#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */

#else	/* CONFIG_SMP */
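/*
 * UP percpu area setup.
 *
 * Static percpu variables are handled like ordinary static variables;
 * the first chunk only needs to provide the dynamic region, allocated
 * directly from memblock with an identity mapping.
 */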
void __init setup_per_cpu_areas(void)
{
	const size_t unit_size =
		roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
					 PERCPU_DYNAMIC_RESERVE));
	struct pcpu_alloc_info *ai;
	void *fc;

	ai = pcpu_alloc_alloc_info(1, 1);
	fc = memblock_alloc_from(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
	if (!ai || !fc)
		panic("Failed to allocate memory for percpu areas.");
	/* kmemleak tracks the percpu allocations separately */
	kmemleak_free(fc);

	ai->dyn_size = unit_size;
	ai->unit_size = unit_size;
	ai->atom_size = unit_size;
	ai->alloc_size = unit_size;
	ai->groups[0].nr_units = 1;
	ai->groups[0].cpu_map[0] = 0;

	pcpu_setup_first_chunk(ai, fc);
	pcpu_free_alloc_info(ai);
}

#endif	/* CONFIG_SMP */
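
/*
 * pcpu_nr_pages - calculate total number of populated backing pages
 *
 * This reflects the number of pages populated to back chunks.  Allocator
 * metadata is excluded as the number of backing pages scales with the
 * number of cpus and can quickly outweigh the memory used for metadata.
 *
 * RETURNS:
 * Total number of populated backing pages in use by the allocator.
 */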
unsigned long pcpu_nr_pages(void)
{
	return pcpu_nr_populated * pcpu_nr_units;
}
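
/*
 * The percpu allocator is initialized early during boot when neither slab
 * nor workqueue is available.  Plug async management until everything is
 * up and running.
 */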
static int __init percpu_enable_async(void)
{
	pcpu_async_enabled = true;
	return 0;
}
subsys_initcall(percpu_enable_async);