#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/bitmap.h>
#include <linux/cpumask.h>
#include <linux/memblock.h>
#include <linux/err.h>
#include <linux/lcm.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/pfn.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
#include <linux/kmemleak.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/memcontrol.h>

#include <asm/cacheflush.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
#include <asm/io.h>

#define CREATE_TRACE_POINTS
#include <trace/events/percpu.h>

#include "percpu-internal.h"

#define PCPU_SLOT_BASE_SHIFT		5

#define PCPU_SLOT_FAIL_THRESHOLD	3

#define PCPU_EMPTY_POP_PAGES_LOW	2
#define PCPU_EMPTY_POP_PAGES_HIGH	4

#ifdef CONFIG_SMP

#ifndef __addr_to_pcpu_ptr
#define __addr_to_pcpu_ptr(addr)					\
	(void __percpu *)((unsigned long)(addr) -			\
			  (unsigned long)pcpu_base_addr +		\
			  (unsigned long)__per_cpu_start)
#endif
#ifndef __pcpu_ptr_to_addr
#define __pcpu_ptr_to_addr(ptr)						\
	(void __force *)((unsigned long)(ptr) +				\
			 (unsigned long)pcpu_base_addr -		\
			 (unsigned long)__per_cpu_start)
#endif
#else

#define __addr_to_pcpu_ptr(addr)	(void __percpu *)(addr)
#define __pcpu_ptr_to_addr(ptr)		(void __force *)(ptr)
#endif

static int pcpu_unit_pages __ro_after_init;
static int pcpu_unit_size __ro_after_init;
static int pcpu_nr_units __ro_after_init;
static int pcpu_atom_size __ro_after_init;
int pcpu_nr_slots __ro_after_init;
static int pcpu_free_slot __ro_after_init;
int pcpu_sidelined_slot __ro_after_init;
int pcpu_to_depopulate_slot __ro_after_init;
static size_t pcpu_chunk_struct_size __ro_after_init;

/* cpus with the lowest and highest unit addresses */
static unsigned int pcpu_low_unit_cpu __ro_after_init;
static unsigned int pcpu_high_unit_cpu __ro_after_init;

/* the address of the first chunk which starts with the kernel static area */
void *pcpu_base_addr __ro_after_init;

static const int *pcpu_unit_map __ro_after_init;		/* cpu -> unit */
const unsigned long *pcpu_unit_offsets __ro_after_init;	/* cpu -> unit offset */

/* group information, used for vm allocation */
static int pcpu_nr_groups __ro_after_init;
static const unsigned long *pcpu_group_offsets __ro_after_init;
static const size_t *pcpu_group_sizes __ro_after_init;

/*
 * The first chunk which always exists.  Static percpu allocations are
 * served from this chunk.
 */
struct pcpu_chunk *pcpu_first_chunk __ro_after_init;

/*
 * Optional reserved chunk.  If set, reserved percpu allocations are
 * served from this chunk instead of the first chunk.
 */
struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init;

DEFINE_SPINLOCK(pcpu_lock);		/* all internal data structures */
static DEFINE_MUTEX(pcpu_alloc_mutex);	/* chunk create/destroy, [de]population */

struct list_head *pcpu_chunk_lists __ro_after_init;	/* chunk list slots */

/* chunks which need their map areas extended, protected by pcpu_lock */
static LIST_HEAD(pcpu_map_extend_chunks);

/*
 * The number of empty populated pages, protected by pcpu_lock.
 * The reserved chunk and isolated chunks don't contribute to the count.
 */
int pcpu_nr_empty_pop_pages;

/*
 * The number of populated pages in use by the allocator, protected by
 * pcpu_lock.  This count is kept per unit per chunk: when a page is
 * populated or depopulated, it is done so in all units of a chunk and the
 * count changes by one.
 */
static unsigned long pcpu_nr_populated;

/*
 * Balance work populates and destroys chunks asynchronously.  It is
 * scheduled, e.g., when the number of empty populated pages falls below
 * PCPU_EMPTY_POP_PAGES_LOW or when an atomic allocation fails.
 */
static void pcpu_balance_workfn(struct work_struct *work);
static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
static bool pcpu_async_enabled __read_mostly;
static bool pcpu_atomic_alloc_failed;

static void pcpu_schedule_balance_work(void)
{
	if (pcpu_async_enabled)
		schedule_work(&pcpu_balance_work);
}
/*
 * pcpu_addr_in_chunk - check if the address is served from this chunk
 * @chunk: chunk of interest
 * @addr: percpu address
 *
 * RETURNS:
 * True if the address is served from this chunk.
 */
static bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr)
{
	void *start_addr, *end_addr;

	if (!chunk)
		return false;

	start_addr = chunk->base_addr + chunk->start_offset;
	end_addr = chunk->base_addr + chunk->nr_pages * PAGE_SIZE -
		   chunk->end_offset;

	return addr >= start_addr && addr < end_addr;
}

static int __pcpu_size_to_slot(int size)
{
	int highbit = fls(size);	/* size is in bytes */
	return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
}

static int pcpu_size_to_slot(int size)
{
	if (size == pcpu_unit_size)
		return pcpu_free_slot;
	return __pcpu_size_to_slot(size);
}

static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
{
	const struct pcpu_block_md *chunk_md = &chunk->chunk_md;

	if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE ||
	    chunk_md->contig_hint == 0)
		return 0;

	return pcpu_size_to_slot(chunk_md->contig_hint * PCPU_MIN_ALLOC_SIZE);
}
256
257
258static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
259{
260 page->index = (unsigned long)pcpu;
261}
262
263
264static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
265{
266 return (struct pcpu_chunk *)page->index;
267}
268
269static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
270{
271 return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
272}
273
274static unsigned long pcpu_unit_page_offset(unsigned int cpu, int page_idx)
275{
276 return pcpu_unit_offsets[cpu] + (page_idx << PAGE_SHIFT);
277}
278
279static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
280 unsigned int cpu, int page_idx)
281{
282 return (unsigned long)chunk->base_addr +
283 pcpu_unit_page_offset(cpu, page_idx);
284}
285
286
287
288
289
290static unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index)
291{
292 return chunk->alloc_map +
293 (index * PCPU_BITMAP_BLOCK_BITS / BITS_PER_LONG);
294}
295
296static unsigned long pcpu_off_to_block_index(int off)
297{
298 return off / PCPU_BITMAP_BLOCK_BITS;
299}
300
301static unsigned long pcpu_off_to_block_off(int off)
302{
303 return off & (PCPU_BITMAP_BLOCK_BITS - 1);
304}
305
306static unsigned long pcpu_block_off_to_off(int index, int off)
307{
308 return index * PCPU_BITMAP_BLOCK_BITS + off;
309}
310
311
312
313
314
315
316
317
318
319
320
321static bool pcpu_check_block_hint(struct pcpu_block_md *block, int bits,
322 size_t align)
323{
324 int bit_off = ALIGN(block->contig_hint_start, align) -
325 block->contig_hint_start;
326
327 return bit_off + bits <= block->contig_hint;
328}
329
330
331
332
333
334
335
336
337
338
339
340
341static int pcpu_next_hint(struct pcpu_block_md *block, int alloc_bits)
342{
343
344
345
346
347
348
349
350 if (block->scan_hint &&
351 block->contig_hint_start > block->scan_hint_start &&
352 alloc_bits > block->scan_hint)
353 return block->scan_hint_start + block->scan_hint;
354
355 return block->first_free;
356}
357
358
359
360
361
362
363
364
365
366
367
368
369static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off,
370 int *bits)
371{
372 int i = pcpu_off_to_block_index(*bit_off);
373 int block_off = pcpu_off_to_block_off(*bit_off);
374 struct pcpu_block_md *block;
375
376 *bits = 0;
377 for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
378 block++, i++) {
379
380 if (*bits) {
381 *bits += block->left_free;
382 if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
383 continue;
384 return;
385 }
386
387
388
389
390
391
392
393
394
395 *bits = block->contig_hint;
396 if (*bits && block->contig_hint_start >= block_off &&
397 *bits + block->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) {
398 *bit_off = pcpu_block_off_to_off(i,
399 block->contig_hint_start);
400 return;
401 }
402
403 block_off = 0;
404
405 *bits = block->right_free;
406 *bit_off = (i + 1) * PCPU_BITMAP_BLOCK_BITS - block->right_free;
407 }
408}
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
425 int align, int *bit_off, int *bits)
426{
427 int i = pcpu_off_to_block_index(*bit_off);
428 int block_off = pcpu_off_to_block_off(*bit_off);
429 struct pcpu_block_md *block;
430
431 *bits = 0;
432 for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
433 block++, i++) {
434
435 if (*bits) {
436 *bits += block->left_free;
437 if (*bits >= alloc_bits)
438 return;
439 if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
440 continue;
441 }
442
443
444 *bits = ALIGN(block->contig_hint_start, align) -
445 block->contig_hint_start;
446
447
448
449
450 if (block->contig_hint &&
451 block->contig_hint_start >= block_off &&
452 block->contig_hint >= *bits + alloc_bits) {
453 int start = pcpu_next_hint(block, alloc_bits);
454
455 *bits += alloc_bits + block->contig_hint_start -
456 start;
457 *bit_off = pcpu_block_off_to_off(i, start);
458 return;
459 }
460
461 block_off = 0;
462
463 *bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free,
464 align);
465 *bits = PCPU_BITMAP_BLOCK_BITS - *bit_off;
466 *bit_off = pcpu_block_off_to_off(i, *bit_off);
467 if (*bits >= alloc_bits)
468 return;
469 }
470
471
472 *bit_off = pcpu_chunk_map_bits(chunk);
473}
474
475
476
477
478
479
480
481#define pcpu_for_each_md_free_region(chunk, bit_off, bits) \
482 for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits)); \
483 (bit_off) < pcpu_chunk_map_bits((chunk)); \
484 (bit_off) += (bits) + 1, \
485 pcpu_next_md_free_region((chunk), &(bit_off), &(bits)))
486
487#define pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) \
488 for (pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
489 &(bits)); \
490 (bit_off) < pcpu_chunk_map_bits((chunk)); \
491 (bit_off) += (bits), \
492 pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
493 &(bits)))
/*
 * pcpu_mem_zalloc - allocate zeroed memory
 * @size: bytes to allocate
 * @gfp: allocation flags
 *
 * Allocate @size bytes.  If @size is smaller than PAGE_SIZE, kzalloc() is
 * used; otherwise the equivalent of vzalloc() is used.  The returned memory
 * is always zeroed.
 *
 * RETURNS:
 * Pointer to the allocated area on success, NULL on failure.
 */
static void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
{
	if (WARN_ON_ONCE(!slab_is_available()))
		return NULL;

	if (size <= PAGE_SIZE)
		return kzalloc(size, gfp);
	else
		return __vmalloc(size, gfp | __GFP_ZERO);
}

/*
 * pcpu_mem_free - free memory
 * @ptr: memory to free
 *
 * Free @ptr.  @ptr should have been allocated using pcpu_mem_zalloc().
 */
static void pcpu_mem_free(void *ptr)
{
	kvfree(ptr);
}

static void __pcpu_chunk_move(struct pcpu_chunk *chunk, int slot,
			      bool move_front)
{
	if (chunk != pcpu_reserved_chunk) {
		if (move_front)
			list_move(&chunk->list, &pcpu_chunk_lists[slot]);
		else
			list_move_tail(&chunk->list, &pcpu_chunk_lists[slot]);
	}
}

static void pcpu_chunk_move(struct pcpu_chunk *chunk, int slot)
{
	__pcpu_chunk_move(chunk, slot, true);
}
/*
 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
 * @chunk: chunk of interest
 * @oslot: the previous slot it was on
 *
 * Called after an allocation or free changed @chunk's free bytes.  Moves
 * @chunk to the slot matching its new state.  The reserved chunk and
 * isolated chunks are never put on the chunk slots.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
{
	int nslot = pcpu_chunk_slot(chunk);

	/* leave isolated chunks in-place */
	if (chunk->isolated)
		return;

	if (oslot != nslot)
		__pcpu_chunk_move(chunk, nslot, oslot < nslot);
}

static void pcpu_isolate_chunk(struct pcpu_chunk *chunk)
{
	lockdep_assert_held(&pcpu_lock);

	if (!chunk->isolated) {
		chunk->isolated = true;
		pcpu_nr_empty_pop_pages -= chunk->nr_empty_pop_pages;
	}
	list_move(&chunk->list, &pcpu_chunk_lists[pcpu_to_depopulate_slot]);
}

static void pcpu_reintegrate_chunk(struct pcpu_chunk *chunk)
{
	lockdep_assert_held(&pcpu_lock);

	if (chunk->isolated) {
		chunk->isolated = false;
		pcpu_nr_empty_pop_pages += chunk->nr_empty_pop_pages;
		pcpu_chunk_relocate(chunk, -1);
	}
}

/*
 * pcpu_update_empty_pages - update empty page counters
 * @chunk: chunk of interest
 * @nr: nr of empty pages
 *
 * Keeps track of empty populated pages, both per-chunk and globally.  The
 * reserved chunk and isolated chunks are excluded from the global count.
 */
static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr)
{
	chunk->nr_empty_pop_pages += nr;
	if (chunk != pcpu_reserved_chunk && !chunk->isolated)
		pcpu_nr_empty_pop_pages += nr;
}

/*
 * pcpu_region_overlap - determine if two regions overlap
 * @a: start of first region, inclusive
 * @b: end of first region, exclusive
 * @x: start of second region, inclusive
 * @y: end of second region, exclusive
 *
 * Used to determine if the hint region [a, b) overlaps with the allocated
 * region [x, y).
 */
static inline bool pcpu_region_overlap(int a, int b, int x, int y)
{
	return (a < y) && (x < b);
}
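/*
 * pcpu_block_update - update a block's metadata given a free region
 * @block: block of interest
 * @start: start offset in the block (in allocation-map bits)
 * @end: end offset in the block, exclusive
 *
 * Refreshes first_free, left_free/right_free and the contig/scan hints of
 * @block for the known free region [@start, @end).
 */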
634static void pcpu_block_update(struct pcpu_block_md *block, int start, int end)
635{
636 int contig = end - start;
637
638 block->first_free = min(block->first_free, start);
639 if (start == 0)
640 block->left_free = contig;
641
642 if (end == block->nr_bits)
643 block->right_free = contig;
644
645 if (contig > block->contig_hint) {
646
647 if (start > block->contig_hint_start) {
648 if (block->contig_hint > block->scan_hint) {
649 block->scan_hint_start =
650 block->contig_hint_start;
651 block->scan_hint = block->contig_hint;
652 } else if (start < block->scan_hint_start) {
653
654
655
656
657
658 block->scan_hint = 0;
659 }
660 } else {
661 block->scan_hint = 0;
662 }
663 block->contig_hint_start = start;
664 block->contig_hint = contig;
665 } else if (contig == block->contig_hint) {
666 if (block->contig_hint_start &&
667 (!start ||
668 __ffs(start) > __ffs(block->contig_hint_start))) {
669
670 block->contig_hint_start = start;
671 if (start < block->scan_hint_start &&
672 block->contig_hint > block->scan_hint)
673 block->scan_hint = 0;
674 } else if (start > block->scan_hint_start ||
675 block->contig_hint > block->scan_hint) {
676
677
678
679
680
681 block->scan_hint_start = start;
682 block->scan_hint = contig;
683 }
684 } else {
685
686
687
688
689
690 if ((start < block->contig_hint_start &&
691 (contig > block->scan_hint ||
692 (contig == block->scan_hint &&
693 start > block->scan_hint_start)))) {
694 block->scan_hint_start = start;
695 block->scan_hint = contig;
696 }
697 }
698}
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716static void pcpu_block_update_scan(struct pcpu_chunk *chunk, int bit_off,
717 int bits)
718{
719 int s_off = pcpu_off_to_block_off(bit_off);
720 int e_off = s_off + bits;
721 int s_index, l_bit;
722 struct pcpu_block_md *block;
723
724 if (e_off > PCPU_BITMAP_BLOCK_BITS)
725 return;
726
727 s_index = pcpu_off_to_block_index(bit_off);
728 block = chunk->md_blocks + s_index;
729
730
731 l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index), s_off);
732 s_off = (s_off == l_bit) ? 0 : l_bit + 1;
733
734 pcpu_block_update(block, s_off, e_off);
735}
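/*
 * pcpu_chunk_refresh_hint - update chunk-level metadata
 * @chunk: chunk of interest
 * @full_scan: if true, ignore the scan hint and rescan from first_free
 *
 * Iterates over the free regions reported by the chunk's metadata blocks and
 * rebuilds the chunk-level hints.  Without @full_scan, an existing scan hint
 * is promoted to the contig hint and scanning starts past it.
 */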
749static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan)
750{
751 struct pcpu_block_md *chunk_md = &chunk->chunk_md;
752 int bit_off, bits;
753
754
755 if (!full_scan && chunk_md->scan_hint) {
756 bit_off = chunk_md->scan_hint_start + chunk_md->scan_hint;
757 chunk_md->contig_hint_start = chunk_md->scan_hint_start;
758 chunk_md->contig_hint = chunk_md->scan_hint;
759 chunk_md->scan_hint = 0;
760 } else {
761 bit_off = chunk_md->first_free;
762 chunk_md->contig_hint = 0;
763 }
764
765 bits = 0;
766 pcpu_for_each_md_free_region(chunk, bit_off, bits)
767 pcpu_block_update(chunk_md, bit_off, bit_off + bits);
768}
769
770
771
772
773
774
775
776
777
778static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
779{
780 struct pcpu_block_md *block = chunk->md_blocks + index;
781 unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index);
782 unsigned int rs, re, start;
783
784
785 if (block->scan_hint) {
786 start = block->scan_hint_start + block->scan_hint;
787 block->contig_hint_start = block->scan_hint_start;
788 block->contig_hint = block->scan_hint;
789 block->scan_hint = 0;
790 } else {
791 start = block->first_free;
792 block->contig_hint = 0;
793 }
794
795 block->right_free = 0;
796
797
798 bitmap_for_each_clear_region(alloc_map, rs, re, start,
799 PCPU_BITMAP_BLOCK_BITS)
800 pcpu_block_update(block, rs, re);
801}
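/*
 * pcpu_block_update_hint_alloc - update hints after an allocation
 * @chunk: chunk of interest
 * @bit_off: chunk offset of the allocated area
 * @bits: size of the allocated area in allocation-map bits
 *
 * Updates the metadata blocks spanned by the newly allocated region: the
 * start and end blocks have their free hints trimmed or rescanned, fully
 * covered middle blocks are zeroed out, and the chunk-level hints are
 * refreshed if the allocation overlapped the chunk's contig hint.  Empty
 * populated pages that are no longer empty are accounted as well.
 */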
813static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
814 int bits)
815{
816 struct pcpu_block_md *chunk_md = &chunk->chunk_md;
817 int nr_empty_pages = 0;
818 struct pcpu_block_md *s_block, *e_block, *block;
819 int s_index, e_index;
820 int s_off, e_off;
821
822
823
824
825
826
827
828 s_index = pcpu_off_to_block_index(bit_off);
829 e_index = pcpu_off_to_block_index(bit_off + bits - 1);
830 s_off = pcpu_off_to_block_off(bit_off);
831 e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
832
833 s_block = chunk->md_blocks + s_index;
834 e_block = chunk->md_blocks + e_index;
835
836
837
838
839
840
841
842 if (s_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
843 nr_empty_pages++;
844
845 if (s_off == s_block->first_free)
846 s_block->first_free = find_next_zero_bit(
847 pcpu_index_alloc_map(chunk, s_index),
848 PCPU_BITMAP_BLOCK_BITS,
849 s_off + bits);
850
851 if (pcpu_region_overlap(s_block->scan_hint_start,
852 s_block->scan_hint_start + s_block->scan_hint,
853 s_off,
854 s_off + bits))
855 s_block->scan_hint = 0;
856
857 if (pcpu_region_overlap(s_block->contig_hint_start,
858 s_block->contig_hint_start +
859 s_block->contig_hint,
860 s_off,
861 s_off + bits)) {
862
863 if (!s_off)
864 s_block->left_free = 0;
865 pcpu_block_refresh_hint(chunk, s_index);
866 } else {
867
868 s_block->left_free = min(s_block->left_free, s_off);
869 if (s_index == e_index)
870 s_block->right_free = min_t(int, s_block->right_free,
871 PCPU_BITMAP_BLOCK_BITS - e_off);
872 else
873 s_block->right_free = 0;
874 }
875
876
877
878
879 if (s_index != e_index) {
880 if (e_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
881 nr_empty_pages++;
882
883
884
885
886
887 e_block->first_free = find_next_zero_bit(
888 pcpu_index_alloc_map(chunk, e_index),
889 PCPU_BITMAP_BLOCK_BITS, e_off);
890
891 if (e_off == PCPU_BITMAP_BLOCK_BITS) {
892
893 e_block++;
894 } else {
895 if (e_off > e_block->scan_hint_start)
896 e_block->scan_hint = 0;
897
898 e_block->left_free = 0;
899 if (e_off > e_block->contig_hint_start) {
900
901 pcpu_block_refresh_hint(chunk, e_index);
902 } else {
903 e_block->right_free =
904 min_t(int, e_block->right_free,
905 PCPU_BITMAP_BLOCK_BITS - e_off);
906 }
907 }
908
909
910 nr_empty_pages += (e_index - s_index - 1);
911 for (block = s_block + 1; block < e_block; block++) {
912 block->scan_hint = 0;
913 block->contig_hint = 0;
914 block->left_free = 0;
915 block->right_free = 0;
916 }
917 }
918
919 if (nr_empty_pages)
920 pcpu_update_empty_pages(chunk, -nr_empty_pages);
921
922 if (pcpu_region_overlap(chunk_md->scan_hint_start,
923 chunk_md->scan_hint_start +
924 chunk_md->scan_hint,
925 bit_off,
926 bit_off + bits))
927 chunk_md->scan_hint = 0;
928
929
930
931
932
933
934 if (pcpu_region_overlap(chunk_md->contig_hint_start,
935 chunk_md->contig_hint_start +
936 chunk_md->contig_hint,
937 bit_off,
938 bit_off + bits))
939 pcpu_chunk_refresh_hint(chunk, false);
940}
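/*
 * pcpu_block_update_hint_free - update hints after a free
 * @chunk: chunk of interest
 * @bit_off: chunk offset of the freed area
 * @bits: size of the freed area in allocation-map bits
 *
 * Extends the freed region over neighbouring free space, updates the start,
 * end and fully freed middle blocks, and refreshes the chunk-level hints
 * (with a full rescan when the freed region spans blocks or covers at least
 * a whole block).  Newly empty populated pages are accounted as well.
 */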
960static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
961 int bits)
962{
963 int nr_empty_pages = 0;
964 struct pcpu_block_md *s_block, *e_block, *block;
965 int s_index, e_index;
966 int s_off, e_off;
967 int start, end;
968
969
970
971
972
973
974
975 s_index = pcpu_off_to_block_index(bit_off);
976 e_index = pcpu_off_to_block_index(bit_off + bits - 1);
977 s_off = pcpu_off_to_block_off(bit_off);
978 e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
979
980 s_block = chunk->md_blocks + s_index;
981 e_block = chunk->md_blocks + e_index;
982
983
984
985
986
987
988
989
990
991
992
993 start = s_off;
994 if (s_off == s_block->contig_hint + s_block->contig_hint_start) {
995 start = s_block->contig_hint_start;
996 } else {
997
998
999
1000
1001
1002
1003 int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index),
1004 start);
1005 start = (start == l_bit) ? 0 : l_bit + 1;
1006 }
1007
1008 end = e_off;
1009 if (e_off == e_block->contig_hint_start)
1010 end = e_block->contig_hint_start + e_block->contig_hint;
1011 else
1012 end = find_next_bit(pcpu_index_alloc_map(chunk, e_index),
1013 PCPU_BITMAP_BLOCK_BITS, end);
1014
1015
1016 e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS;
1017 if (!start && e_off == PCPU_BITMAP_BLOCK_BITS)
1018 nr_empty_pages++;
1019 pcpu_block_update(s_block, start, e_off);
1020
1021
1022 if (s_index != e_index) {
1023
1024 if (end == PCPU_BITMAP_BLOCK_BITS)
1025 nr_empty_pages++;
1026 pcpu_block_update(e_block, 0, end);
1027
1028
1029 nr_empty_pages += (e_index - s_index - 1);
1030 for (block = s_block + 1; block < e_block; block++) {
1031 block->first_free = 0;
1032 block->scan_hint = 0;
1033 block->contig_hint_start = 0;
1034 block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
1035 block->left_free = PCPU_BITMAP_BLOCK_BITS;
1036 block->right_free = PCPU_BITMAP_BLOCK_BITS;
1037 }
1038 }
1039
1040 if (nr_empty_pages)
1041 pcpu_update_empty_pages(chunk, nr_empty_pages);
1042
1043
1044
1045
1046
1047
1048
1049 if (((end - start) >= PCPU_BITMAP_BLOCK_BITS) || s_index != e_index)
1050 pcpu_chunk_refresh_hint(chunk, true);
1051 else
1052 pcpu_block_update(&chunk->chunk_md,
1053 pcpu_block_off_to_off(s_index, start),
1054 end);
1055}
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits,
1071 int *next_off)
1072{
1073 unsigned int page_start, page_end, rs, re;
1074
1075 page_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
1076 page_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
1077
1078 rs = page_start;
1079 bitmap_next_clear_region(chunk->populated, &rs, &re, page_end);
1080 if (rs >= page_end)
1081 return true;
1082
1083 *next_off = re * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
1084 return false;
1085}
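/*
 * pcpu_find_block_fit - find an offset in the chunk to start searching from
 * @chunk: chunk of interest
 * @alloc_bits: size of the request in allocation-map bits
 * @align: alignment of the area (max PAGE_SIZE)
 * @pop_only: only consider already populated regions
 *
 * Uses the chunk-level hints to cheaply reject chunks that cannot fit the
 * request and to pick the bit offset at which a fitting free region begins.
 *
 * RETURNS:
 * The bit offset to begin the allocation scan at, or -1 if the chunk
 * cannot fit the request.
 */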
1106static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
1107 size_t align, bool pop_only)
1108{
1109 struct pcpu_block_md *chunk_md = &chunk->chunk_md;
1110 int bit_off, bits, next_off;
1111
1112
1113
1114
1115
1116
1117 if (!pcpu_check_block_hint(chunk_md, alloc_bits, align))
1118 return -1;
1119
1120 bit_off = pcpu_next_hint(chunk_md, alloc_bits);
1121 bits = 0;
1122 pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) {
1123 if (!pop_only || pcpu_is_populated(chunk, bit_off, bits,
1124 &next_off))
1125 break;
1126
1127 bit_off = next_off;
1128 bits = 0;
1129 }
1130
1131 if (bit_off == pcpu_chunk_map_bits(chunk))
1132 return -1;
1133
1134 return bit_off;
1135}
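/*
 * pcpu_find_zero_area - find an aligned run of zero bits
 * @map: the address to base the search on
 * @size: the bitmap size in bits
 * @start: the bit number to start searching at
 * @nr: the number of zeroed bits we're looking for
 * @align_mask: alignment mask for the zero area
 * @largest_off: offset of the largest free area skipped
 * @largest_bits: size of the largest free area skipped
 *
 * A variant of bitmap_find_next_zero_area() that additionally records the
 * largest free area it had to skip so the caller can feed it back into the
 * scan hints.
 *
 * RETURNS:
 * The start of the first fitting zero area; if none is found, the returned
 * value lies past the end of the searched range.
 */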
1157static unsigned long pcpu_find_zero_area(unsigned long *map,
1158 unsigned long size,
1159 unsigned long start,
1160 unsigned long nr,
1161 unsigned long align_mask,
1162 unsigned long *largest_off,
1163 unsigned long *largest_bits)
1164{
1165 unsigned long index, end, i, area_off, area_bits;
1166again:
1167 index = find_next_zero_bit(map, size, start);
1168
1169
1170 index = __ALIGN_MASK(index, align_mask);
1171 area_off = index;
1172
1173 end = index + nr;
1174 if (end > size)
1175 return end;
1176 i = find_next_bit(map, end, index);
1177 if (i < end) {
1178 area_bits = i - area_off;
1179
1180 if (area_bits > *largest_bits ||
1181 (area_bits == *largest_bits && *largest_off &&
1182 (!area_off || __ffs(area_off) > __ffs(*largest_off)))) {
1183 *largest_off = area_off;
1184 *largest_bits = area_bits;
1185 }
1186
1187 start = i + 1;
1188 goto again;
1189 }
1190 return index;
1191}
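/*
 * pcpu_alloc_area - allocate an area from a pcpu_chunk
 * @chunk: chunk of interest
 * @alloc_bits: size of the request in allocation-map bits
 * @align: alignment of the area (max PAGE_SIZE)
 * @start: bit_off to start searching at
 *
 * Scans the allocation map beginning at @start, marks the area as used in
 * the alloc and bound maps, updates free_bytes and the block hints, and
 * moves the chunk to its new slot.
 *
 * RETURNS:
 * The allocated offset in bytes within @chunk, or -1 on failure.
 */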
1212static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
1213 size_t align, int start)
1214{
1215 struct pcpu_block_md *chunk_md = &chunk->chunk_md;
1216 size_t align_mask = (align) ? (align - 1) : 0;
1217 unsigned long area_off = 0, area_bits = 0;
1218 int bit_off, end, oslot;
1219
1220 lockdep_assert_held(&pcpu_lock);
1221
1222 oslot = pcpu_chunk_slot(chunk);
1223
1224
1225
1226
1227 end = min_t(int, start + alloc_bits + PCPU_BITMAP_BLOCK_BITS,
1228 pcpu_chunk_map_bits(chunk));
1229 bit_off = pcpu_find_zero_area(chunk->alloc_map, end, start, alloc_bits,
1230 align_mask, &area_off, &area_bits);
1231 if (bit_off >= end)
1232 return -1;
1233
1234 if (area_bits)
1235 pcpu_block_update_scan(chunk, area_off, area_bits);
1236
1237
1238 bitmap_set(chunk->alloc_map, bit_off, alloc_bits);
1239
1240
1241 set_bit(bit_off, chunk->bound_map);
1242 bitmap_clear(chunk->bound_map, bit_off + 1, alloc_bits - 1);
1243 set_bit(bit_off + alloc_bits, chunk->bound_map);
1244
1245 chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE;
1246
1247
1248 if (bit_off == chunk_md->first_free)
1249 chunk_md->first_free = find_next_zero_bit(
1250 chunk->alloc_map,
1251 pcpu_chunk_map_bits(chunk),
1252 bit_off + alloc_bits);
1253
1254 pcpu_block_update_hint_alloc(chunk, bit_off, alloc_bits);
1255
1256 pcpu_chunk_relocate(chunk, oslot);
1257
1258 return bit_off * PCPU_MIN_ALLOC_SIZE;
1259}
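/*
 * pcpu_free_area - free the area at the given offset
 * @chunk: chunk of interest
 * @off: byte offset into the chunk
 *
 * Looks up the allocation's size in the bound map, clears the allocation
 * map, updates free_bytes and the hints, and relocates the chunk.
 *
 * RETURNS:
 * The number of freed bytes.
 */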
1272static int pcpu_free_area(struct pcpu_chunk *chunk, int off)
1273{
1274 struct pcpu_block_md *chunk_md = &chunk->chunk_md;
1275 int bit_off, bits, end, oslot, freed;
1276
1277 lockdep_assert_held(&pcpu_lock);
1278 pcpu_stats_area_dealloc(chunk);
1279
1280 oslot = pcpu_chunk_slot(chunk);
1281
1282 bit_off = off / PCPU_MIN_ALLOC_SIZE;
1283
1284
1285 end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk),
1286 bit_off + 1);
1287 bits = end - bit_off;
1288 bitmap_clear(chunk->alloc_map, bit_off, bits);
1289
1290 freed = bits * PCPU_MIN_ALLOC_SIZE;
1291
1292
1293 chunk->free_bytes += freed;
1294
1295
1296 chunk_md->first_free = min(chunk_md->first_free, bit_off);
1297
1298 pcpu_block_update_hint_free(chunk, bit_off, bits);
1299
1300 pcpu_chunk_relocate(chunk, oslot);
1301
1302 return freed;
1303}
static void pcpu_init_md_block(struct pcpu_block_md *block, int nr_bits)
{
	block->scan_hint = 0;
	block->contig_hint = nr_bits;
	block->left_free = nr_bits;
	block->right_free = nr_bits;
	block->first_free = 0;
	block->nr_bits = nr_bits;
}

static void pcpu_init_md_blocks(struct pcpu_chunk *chunk)
{
	struct pcpu_block_md *md_block;

	pcpu_init_md_block(&chunk->chunk_md, pcpu_chunk_map_bits(chunk));

	for (md_block = chunk->md_blocks;
	     md_block != chunk->md_blocks + pcpu_chunk_nr_blocks(chunk);
	     md_block++)
		pcpu_init_md_block(md_block, PCPU_BITMAP_BLOCK_BITS);
}
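/*
 * pcpu_alloc_first_chunk - create a chunk serving part of the first chunk
 * @tmp_addr: the start of the region served
 * @map_size: size of the region served
 *
 * Called during boot to set up the chunks backing the static, reserved and
 * dynamic regions of the first percpu chunk.  The served region is aligned
 * out to page/block boundaries and the leading and trailing gaps
 * (start_offset/end_offset) are marked allocated so they are never handed
 * out.  All backing metadata comes from memblock and the function panics
 * if that allocation fails.
 *
 * RETURNS:
 * Chunk serving the region at @tmp_addr of @map_size.
 */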
1341static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
1342 int map_size)
1343{
1344 struct pcpu_chunk *chunk;
1345 unsigned long aligned_addr, lcm_align;
1346 int start_offset, offset_bits, region_size, region_bits;
1347 size_t alloc_size;
1348
1349
1350 aligned_addr = tmp_addr & PAGE_MASK;
1351
1352 start_offset = tmp_addr - aligned_addr;
1353
1354
1355
1356
1357
1358
1359 lcm_align = lcm(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE);
1360 region_size = ALIGN(start_offset + map_size, lcm_align);
1361
1362
1363 alloc_size = struct_size(chunk, populated,
1364 BITS_TO_LONGS(region_size >> PAGE_SHIFT));
1365 chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
1366 if (!chunk)
1367 panic("%s: Failed to allocate %zu bytes\n", __func__,
1368 alloc_size);
1369
1370 INIT_LIST_HEAD(&chunk->list);
1371
1372 chunk->base_addr = (void *)aligned_addr;
1373 chunk->start_offset = start_offset;
1374 chunk->end_offset = region_size - chunk->start_offset - map_size;
1375
1376 chunk->nr_pages = region_size >> PAGE_SHIFT;
1377 region_bits = pcpu_chunk_map_bits(chunk);
1378
1379 alloc_size = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]);
1380 chunk->alloc_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
1381 if (!chunk->alloc_map)
1382 panic("%s: Failed to allocate %zu bytes\n", __func__,
1383 alloc_size);
1384
1385 alloc_size =
1386 BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]);
1387 chunk->bound_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
1388 if (!chunk->bound_map)
1389 panic("%s: Failed to allocate %zu bytes\n", __func__,
1390 alloc_size);
1391
1392 alloc_size = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]);
1393 chunk->md_blocks = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
1394 if (!chunk->md_blocks)
1395 panic("%s: Failed to allocate %zu bytes\n", __func__,
1396 alloc_size);
1397
1398#ifdef CONFIG_MEMCG_KMEM
1399
1400 chunk->obj_cgroups = NULL;
1401#endif
1402 pcpu_init_md_blocks(chunk);
1403
1404
1405 chunk->immutable = true;
1406 bitmap_fill(chunk->populated, chunk->nr_pages);
1407 chunk->nr_populated = chunk->nr_pages;
1408 chunk->nr_empty_pop_pages = chunk->nr_pages;
1409
1410 chunk->free_bytes = map_size;
1411
1412 if (chunk->start_offset) {
1413
1414 offset_bits = chunk->start_offset / PCPU_MIN_ALLOC_SIZE;
1415 bitmap_set(chunk->alloc_map, 0, offset_bits);
1416 set_bit(0, chunk->bound_map);
1417 set_bit(offset_bits, chunk->bound_map);
1418
1419 chunk->chunk_md.first_free = offset_bits;
1420
1421 pcpu_block_update_hint_alloc(chunk, 0, offset_bits);
1422 }
1423
1424 if (chunk->end_offset) {
1425
1426 offset_bits = chunk->end_offset / PCPU_MIN_ALLOC_SIZE;
1427 bitmap_set(chunk->alloc_map,
1428 pcpu_chunk_map_bits(chunk) - offset_bits,
1429 offset_bits);
1430 set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE,
1431 chunk->bound_map);
1432 set_bit(region_bits, chunk->bound_map);
1433
1434 pcpu_block_update_hint_alloc(chunk, pcpu_chunk_map_bits(chunk)
1435 - offset_bits, offset_bits);
1436 }
1437
1438 return chunk;
1439}
1440
1441static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
1442{
1443 struct pcpu_chunk *chunk;
1444 int region_bits;
1445
1446 chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp);
1447 if (!chunk)
1448 return NULL;
1449
1450 INIT_LIST_HEAD(&chunk->list);
1451 chunk->nr_pages = pcpu_unit_pages;
1452 region_bits = pcpu_chunk_map_bits(chunk);
1453
1454 chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) *
1455 sizeof(chunk->alloc_map[0]), gfp);
1456 if (!chunk->alloc_map)
1457 goto alloc_map_fail;
1458
1459 chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) *
1460 sizeof(chunk->bound_map[0]), gfp);
1461 if (!chunk->bound_map)
1462 goto bound_map_fail;
1463
1464 chunk->md_blocks = pcpu_mem_zalloc(pcpu_chunk_nr_blocks(chunk) *
1465 sizeof(chunk->md_blocks[0]), gfp);
1466 if (!chunk->md_blocks)
1467 goto md_blocks_fail;
1468
1469#ifdef CONFIG_MEMCG_KMEM
1470 if (!mem_cgroup_kmem_disabled()) {
1471 chunk->obj_cgroups =
1472 pcpu_mem_zalloc(pcpu_chunk_map_bits(chunk) *
1473 sizeof(struct obj_cgroup *), gfp);
1474 if (!chunk->obj_cgroups)
1475 goto objcg_fail;
1476 }
1477#endif
1478
1479 pcpu_init_md_blocks(chunk);
1480
1481
1482 chunk->free_bytes = chunk->nr_pages * PAGE_SIZE;
1483
1484 return chunk;
1485
1486#ifdef CONFIG_MEMCG_KMEM
1487objcg_fail:
1488 pcpu_mem_free(chunk->md_blocks);
1489#endif
1490md_blocks_fail:
1491 pcpu_mem_free(chunk->bound_map);
1492bound_map_fail:
1493 pcpu_mem_free(chunk->alloc_map);
1494alloc_map_fail:
1495 pcpu_mem_free(chunk);
1496
1497 return NULL;
1498}
1499
1500static void pcpu_free_chunk(struct pcpu_chunk *chunk)
1501{
1502 if (!chunk)
1503 return;
1504#ifdef CONFIG_MEMCG_KMEM
1505 pcpu_mem_free(chunk->obj_cgroups);
1506#endif
1507 pcpu_mem_free(chunk->md_blocks);
1508 pcpu_mem_free(chunk->bound_map);
1509 pcpu_mem_free(chunk->alloc_map);
1510 pcpu_mem_free(chunk);
1511}
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
1524 int page_end)
1525{
1526 int nr = page_end - page_start;
1527
1528 lockdep_assert_held(&pcpu_lock);
1529
1530 bitmap_set(chunk->populated, page_start, nr);
1531 chunk->nr_populated += nr;
1532 pcpu_nr_populated += nr;
1533
1534 pcpu_update_empty_pages(chunk, nr);
1535}
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
1548 int page_start, int page_end)
1549{
1550 int nr = page_end - page_start;
1551
1552 lockdep_assert_held(&pcpu_lock);
1553
1554 bitmap_clear(chunk->populated, page_start, nr);
1555 chunk->nr_populated -= nr;
1556 pcpu_nr_populated -= nr;
1557
1558 pcpu_update_empty_pages(chunk, -nr);
1559}
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
1578 int page_start, int page_end, gfp_t gfp);
1579static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
1580 int page_start, int page_end);
1581static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
1582 int page_start, int page_end);
1583static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp);
1584static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
1585static struct page *pcpu_addr_to_page(void *addr);
1586static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
1587
1588#ifdef CONFIG_NEED_PER_CPU_KM
1589#include "percpu-km.c"
1590#else
1591#include "percpu-vm.c"
1592#endif
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
1605{
1606
1607 if (pcpu_addr_in_chunk(pcpu_first_chunk, addr))
1608 return pcpu_first_chunk;
1609
1610
1611 if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr))
1612 return pcpu_reserved_chunk;
1613
1614
1615
1616
1617
1618
1619
1620
1621 addr += pcpu_unit_offsets[raw_smp_processor_id()];
1622 return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
1623}
1624
1625#ifdef CONFIG_MEMCG_KMEM
1626static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
1627 struct obj_cgroup **objcgp)
1628{
1629 struct obj_cgroup *objcg;
1630
1631 if (!memcg_kmem_enabled() || !(gfp & __GFP_ACCOUNT))
1632 return true;
1633
1634 objcg = get_obj_cgroup_from_current();
1635 if (!objcg)
1636 return true;
1637
1638 if (obj_cgroup_charge(objcg, gfp, size * num_possible_cpus())) {
1639 obj_cgroup_put(objcg);
1640 return false;
1641 }
1642
1643 *objcgp = objcg;
1644 return true;
1645}
1646
1647static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
1648 struct pcpu_chunk *chunk, int off,
1649 size_t size)
1650{
1651 if (!objcg)
1652 return;
1653
1654 if (likely(chunk && chunk->obj_cgroups)) {
1655 chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = objcg;
1656
1657 rcu_read_lock();
1658 mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
1659 size * num_possible_cpus());
1660 rcu_read_unlock();
1661 } else {
1662 obj_cgroup_uncharge(objcg, size * num_possible_cpus());
1663 obj_cgroup_put(objcg);
1664 }
1665}
1666
1667static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
1668{
1669 struct obj_cgroup *objcg;
1670
1671 if (unlikely(!chunk->obj_cgroups))
1672 return;
1673
1674 objcg = chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT];
1675 if (!objcg)
1676 return;
1677 chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = NULL;
1678
1679 obj_cgroup_uncharge(objcg, size * num_possible_cpus());
1680
1681 rcu_read_lock();
1682 mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
1683 -(size * num_possible_cpus()));
1684 rcu_read_unlock();
1685
1686 obj_cgroup_put(objcg);
1687}
1688
1689#else
1690static bool
1691pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp)
1692{
1693 return true;
1694}
1695
1696static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
1697 struct pcpu_chunk *chunk, int off,
1698 size_t size)
1699{
1700}
1701
1702static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
1703{
1704}
1705#endif
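/*
 * pcpu_alloc - the percpu allocator
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 * @reserved: allocate from the reserved chunk if available
 * @gfp: allocation flags
 *
 * Allocate a percpu area of @size bytes aligned at @align.  If @gfp doesn't
 * contain %GFP_KERNEL the allocation is atomic: it does not block, is served
 * only from already populated pages, and may fail where a sleeping
 * allocation would populate pages or create a new chunk.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */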
1722static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
1723 gfp_t gfp)
1724{
1725 gfp_t pcpu_gfp;
1726 bool is_atomic;
1727 bool do_warn;
1728 struct obj_cgroup *objcg = NULL;
1729 static int warn_limit = 10;
1730 struct pcpu_chunk *chunk, *next;
1731 const char *err;
1732 int slot, off, cpu, ret;
1733 unsigned long flags;
1734 void __percpu *ptr;
1735 size_t bits, bit_align;
1736
1737 gfp = current_gfp_context(gfp);
1738
1739 pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
1740 is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
1741 do_warn = !(gfp & __GFP_NOWARN);
1742
1743
1744
1745
1746
1747
1748
1749 if (unlikely(align < PCPU_MIN_ALLOC_SIZE))
1750 align = PCPU_MIN_ALLOC_SIZE;
1751
1752 size = ALIGN(size, PCPU_MIN_ALLOC_SIZE);
1753 bits = size >> PCPU_MIN_ALLOC_SHIFT;
1754 bit_align = align >> PCPU_MIN_ALLOC_SHIFT;
1755
1756 if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
1757 !is_power_of_2(align))) {
1758 WARN(do_warn, "illegal size (%zu) or align (%zu) for percpu allocation\n",
1759 size, align);
1760 return NULL;
1761 }
1762
1763 if (unlikely(!pcpu_memcg_pre_alloc_hook(size, gfp, &objcg)))
1764 return NULL;
1765
1766 if (!is_atomic) {
1767
1768
1769
1770
1771
1772 if (gfp & __GFP_NOFAIL) {
1773 mutex_lock(&pcpu_alloc_mutex);
1774 } else if (mutex_lock_killable(&pcpu_alloc_mutex)) {
1775 pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);
1776 return NULL;
1777 }
1778 }
1779
1780 spin_lock_irqsave(&pcpu_lock, flags);
1781
1782
1783 if (reserved && pcpu_reserved_chunk) {
1784 chunk = pcpu_reserved_chunk;
1785
1786 off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic);
1787 if (off < 0) {
1788 err = "alloc from reserved chunk failed";
1789 goto fail_unlock;
1790 }
1791
1792 off = pcpu_alloc_area(chunk, bits, bit_align, off);
1793 if (off >= 0)
1794 goto area_found;
1795
1796 err = "alloc from reserved chunk failed";
1797 goto fail_unlock;
1798 }
1799
1800restart:
1801
1802 for (slot = pcpu_size_to_slot(size); slot <= pcpu_free_slot; slot++) {
1803 list_for_each_entry_safe(chunk, next, &pcpu_chunk_lists[slot],
1804 list) {
1805 off = pcpu_find_block_fit(chunk, bits, bit_align,
1806 is_atomic);
1807 if (off < 0) {
1808 if (slot < PCPU_SLOT_FAIL_THRESHOLD)
1809 pcpu_chunk_move(chunk, 0);
1810 continue;
1811 }
1812
1813 off = pcpu_alloc_area(chunk, bits, bit_align, off);
1814 if (off >= 0) {
1815 pcpu_reintegrate_chunk(chunk);
1816 goto area_found;
1817 }
1818 }
1819 }
1820
1821 spin_unlock_irqrestore(&pcpu_lock, flags);
1822
1823
1824
1825
1826
1827
1828 if (is_atomic) {
1829 err = "atomic alloc failed, no space left";
1830 goto fail;
1831 }
1832
1833 if (list_empty(&pcpu_chunk_lists[pcpu_free_slot])) {
1834 chunk = pcpu_create_chunk(pcpu_gfp);
1835 if (!chunk) {
1836 err = "failed to allocate new chunk";
1837 goto fail;
1838 }
1839
1840 spin_lock_irqsave(&pcpu_lock, flags);
1841 pcpu_chunk_relocate(chunk, -1);
1842 } else {
1843 spin_lock_irqsave(&pcpu_lock, flags);
1844 }
1845
1846 goto restart;
1847
1848area_found:
1849 pcpu_stats_area_alloc(chunk, size);
1850 spin_unlock_irqrestore(&pcpu_lock, flags);
1851
1852
1853 if (!is_atomic) {
1854 unsigned int page_start, page_end, rs, re;
1855
1856 page_start = PFN_DOWN(off);
1857 page_end = PFN_UP(off + size);
1858
1859 bitmap_for_each_clear_region(chunk->populated, rs, re,
1860 page_start, page_end) {
1861 WARN_ON(chunk->immutable);
1862
1863 ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp);
1864
1865 spin_lock_irqsave(&pcpu_lock, flags);
1866 if (ret) {
1867 pcpu_free_area(chunk, off);
1868 err = "failed to populate";
1869 goto fail_unlock;
1870 }
1871 pcpu_chunk_populated(chunk, rs, re);
1872 spin_unlock_irqrestore(&pcpu_lock, flags);
1873 }
1874
1875 mutex_unlock(&pcpu_alloc_mutex);
1876 }
1877
1878 if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
1879 pcpu_schedule_balance_work();
1880
1881
1882 for_each_possible_cpu(cpu)
1883 memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
1884
1885 ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
1886 kmemleak_alloc_percpu(ptr, size, gfp);
1887
1888 trace_percpu_alloc_percpu(reserved, is_atomic, size, align,
1889 chunk->base_addr, off, ptr);
1890
1891 pcpu_memcg_post_alloc_hook(objcg, chunk, off, size);
1892
1893 return ptr;
1894
1895fail_unlock:
1896 spin_unlock_irqrestore(&pcpu_lock, flags);
1897fail:
1898 trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);
1899
1900 if (!is_atomic && do_warn && warn_limit) {
1901 pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
1902 size, align, is_atomic, err);
1903 dump_stack();
1904 if (!--warn_limit)
1905 pr_info("limit reached, disable warning\n");
1906 }
1907 if (is_atomic) {
1908
1909 pcpu_atomic_alloc_failed = true;
1910 pcpu_schedule_balance_work();
1911 } else {
1912 mutex_unlock(&pcpu_alloc_mutex);
1913 }
1914
1915 pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);
1916
1917 return NULL;
1918}
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp)
1936{
1937 return pcpu_alloc(size, align, false, gfp);
1938}
1939EXPORT_SYMBOL_GPL(__alloc_percpu_gfp);
1940
1941
1942
1943
1944
1945
1946
1947
1948void __percpu *__alloc_percpu(size_t size, size_t align)
1949{
1950 return pcpu_alloc(size, align, false, GFP_KERNEL);
1951}
1952EXPORT_SYMBOL_GPL(__alloc_percpu);
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
1971{
1972 return pcpu_alloc(size, align, true, GFP_KERNEL);
1973}
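/*
 * pcpu_balance_free - manage the amount of free chunks
 * @empty_only: free chunks only if there are no populated pages
 *
 * Walks the free slot list and destroys fully free chunks, keeping one in
 * reserve, while dropping pcpu_lock around the actual depopulation and
 * destruction.  If @empty_only, only chunks without populated pages are
 * freed.
 *
 * CONTEXT:
 * pcpu_lock (can be dropped temporarily)
 */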
1986static void pcpu_balance_free(bool empty_only)
1987{
1988 LIST_HEAD(to_free);
1989 struct list_head *free_head = &pcpu_chunk_lists[pcpu_free_slot];
1990 struct pcpu_chunk *chunk, *next;
1991
1992 lockdep_assert_held(&pcpu_lock);
1993
1994
1995
1996
1997
1998 list_for_each_entry_safe(chunk, next, free_head, list) {
1999 WARN_ON(chunk->immutable);
2000
2001
2002 if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
2003 continue;
2004
2005 if (!empty_only || chunk->nr_empty_pop_pages == 0)
2006 list_move(&chunk->list, &to_free);
2007 }
2008
2009 if (list_empty(&to_free))
2010 return;
2011
2012 spin_unlock_irq(&pcpu_lock);
2013 list_for_each_entry_safe(chunk, next, &to_free, list) {
2014 unsigned int rs, re;
2015
2016 bitmap_for_each_set_region(chunk->populated, rs, re, 0,
2017 chunk->nr_pages) {
2018 pcpu_depopulate_chunk(chunk, rs, re);
2019 spin_lock_irq(&pcpu_lock);
2020 pcpu_chunk_depopulated(chunk, rs, re);
2021 spin_unlock_irq(&pcpu_lock);
2022 }
2023 pcpu_destroy_chunk(chunk);
2024 cond_resched();
2025 }
2026 spin_lock_irq(&pcpu_lock);
2027}
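/*
 * pcpu_balance_populated - manage the amount of populated pages
 *
 * Maintains a pool of empty populated pages so atomic allocations have
 * something to work with.  Populates up to PCPU_EMPTY_POP_PAGES_HIGH pages,
 * or more aggressively after an atomic allocation failure, creating a new
 * chunk if no existing chunk has unpopulated room.  pcpu_lock is dropped
 * around the actual page population.
 *
 * CONTEXT:
 * pcpu_lock (can be dropped temporarily)
 */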
2041static void pcpu_balance_populated(void)
2042{
2043
2044 const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
2045 struct pcpu_chunk *chunk;
2046 int slot, nr_to_pop, ret;
2047
2048 lockdep_assert_held(&pcpu_lock);
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060retry_pop:
2061 if (pcpu_atomic_alloc_failed) {
2062 nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
2063
2064 pcpu_atomic_alloc_failed = false;
2065 } else {
2066 nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
2067 pcpu_nr_empty_pop_pages,
2068 0, PCPU_EMPTY_POP_PAGES_HIGH);
2069 }
2070
2071 for (slot = pcpu_size_to_slot(PAGE_SIZE); slot <= pcpu_free_slot; slot++) {
2072 unsigned int nr_unpop = 0, rs, re;
2073
2074 if (!nr_to_pop)
2075 break;
2076
2077 list_for_each_entry(chunk, &pcpu_chunk_lists[slot], list) {
2078 nr_unpop = chunk->nr_pages - chunk->nr_populated;
2079 if (nr_unpop)
2080 break;
2081 }
2082
2083 if (!nr_unpop)
2084 continue;
2085
2086
2087 bitmap_for_each_clear_region(chunk->populated, rs, re, 0,
2088 chunk->nr_pages) {
2089 int nr = min_t(int, re - rs, nr_to_pop);
2090
2091 spin_unlock_irq(&pcpu_lock);
2092 ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp);
2093 cond_resched();
2094 spin_lock_irq(&pcpu_lock);
2095 if (!ret) {
2096 nr_to_pop -= nr;
2097 pcpu_chunk_populated(chunk, rs, rs + nr);
2098 } else {
2099 nr_to_pop = 0;
2100 }
2101
2102 if (!nr_to_pop)
2103 break;
2104 }
2105 }
2106
2107 if (nr_to_pop) {
2108
2109 spin_unlock_irq(&pcpu_lock);
2110 chunk = pcpu_create_chunk(gfp);
2111 cond_resched();
2112 spin_lock_irq(&pcpu_lock);
2113 if (chunk) {
2114 pcpu_chunk_relocate(chunk, -1);
2115 goto retry_pop;
2116 }
2117 }
2118}
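/*
 * pcpu_reclaim_populated - scan to_depopulate chunks and free empty pages
 *
 * Scans chunks on the to_depopulate list backwards for runs of empty,
 * populated pages and depopulates them, flushing the TLB once per chunk.
 * Depopulation stops early if the global empty page count drops below
 * PCPU_EMPTY_POP_PAGES_HIGH.  Fully free or still-needed chunks are
 * reintegrated; the rest are moved to the sidelined slot.
 *
 * CONTEXT:
 * pcpu_lock (can be dropped temporarily)
 */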
2135static void pcpu_reclaim_populated(void)
2136{
2137 struct pcpu_chunk *chunk;
2138 struct pcpu_block_md *block;
2139 int freed_page_start, freed_page_end;
2140 int i, end;
2141 bool reintegrate;
2142
2143 lockdep_assert_held(&pcpu_lock);
2144
2145
2146
2147
2148
2149
2150
2151 while (!list_empty(&pcpu_chunk_lists[pcpu_to_depopulate_slot])) {
2152 chunk = list_first_entry(&pcpu_chunk_lists[pcpu_to_depopulate_slot],
2153 struct pcpu_chunk, list);
2154 WARN_ON(chunk->immutable);
2155
2156
2157
2158
2159
2160 freed_page_start = chunk->nr_pages;
2161 freed_page_end = 0;
2162 reintegrate = false;
2163 for (i = chunk->nr_pages - 1, end = -1; i >= 0; i--) {
2164
2165 if (chunk->nr_empty_pop_pages == 0)
2166 break;
2167
2168
2169 if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH) {
2170 reintegrate = true;
2171 goto end_chunk;
2172 }
2173
2174
2175
2176
2177
2178
2179
2180 block = chunk->md_blocks + i;
2181 if (block->contig_hint == PCPU_BITMAP_BLOCK_BITS &&
2182 test_bit(i, chunk->populated)) {
2183 if (end == -1)
2184 end = i;
2185 if (i > 0)
2186 continue;
2187 i--;
2188 }
2189
2190
2191 if (end == -1)
2192 continue;
2193
2194 spin_unlock_irq(&pcpu_lock);
2195 pcpu_depopulate_chunk(chunk, i + 1, end + 1);
2196 cond_resched();
2197 spin_lock_irq(&pcpu_lock);
2198
2199 pcpu_chunk_depopulated(chunk, i + 1, end + 1);
2200 freed_page_start = min(freed_page_start, i + 1);
2201 freed_page_end = max(freed_page_end, end + 1);
2202
2203
2204 end = -1;
2205 }
2206
2207end_chunk:
2208
2209 if (freed_page_start < freed_page_end) {
2210 spin_unlock_irq(&pcpu_lock);
2211 pcpu_post_unmap_tlb_flush(chunk,
2212 freed_page_start,
2213 freed_page_end);
2214 cond_resched();
2215 spin_lock_irq(&pcpu_lock);
2216 }
2217
2218 if (reintegrate || chunk->free_bytes == pcpu_unit_size)
2219 pcpu_reintegrate_chunk(chunk);
2220 else
2221 list_move_tail(&chunk->list,
2222 &pcpu_chunk_lists[pcpu_sidelined_slot]);
2223 }
2224}
/*
 * pcpu_balance_workfn - manage the amount of free chunks and populated pages
 * @work: unused
 *
 * Manages the number of fully free chunks and the pool of empty populated
 * pages available for atomic allocations.
 */
static void pcpu_balance_workfn(struct work_struct *work)
{
	/*
	 * pcpu_balance_free() is called twice because the first pass may
	 * trim pages counted in pcpu_nr_empty_pop_pages, which can cause
	 * other chunks to be populated.  This also gives
	 * pcpu_reclaim_populated() time to move fully free chunks to the
	 * free list so they can be destroyed if appropriate.
	 */
	mutex_lock(&pcpu_alloc_mutex);
	spin_lock_irq(&pcpu_lock);

	pcpu_balance_free(false);
	pcpu_reclaim_populated();
	pcpu_balance_populated();
	pcpu_balance_free(true);

	spin_unlock_irq(&pcpu_lock);
	mutex_unlock(&pcpu_alloc_mutex);
}
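/*
 * free_percpu - free percpu area
 * @ptr: pointer returned by __alloc_percpu() or one of its variants
 *
 * Frees the percpu area @ptr points to.  Safe to call with a NULL pointer
 * and from atomic context; returning pages to the system is deferred to the
 * balance work item.
 */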
2264void free_percpu(void __percpu *ptr)
2265{
2266 void *addr;
2267 struct pcpu_chunk *chunk;
2268 unsigned long flags;
2269 int size, off;
2270 bool need_balance = false;
2271
2272 if (!ptr)
2273 return;
2274
2275 kmemleak_free_percpu(ptr);
2276
2277 addr = __pcpu_ptr_to_addr(ptr);
2278
2279 spin_lock_irqsave(&pcpu_lock, flags);
2280
2281 chunk = pcpu_chunk_addr_search(addr);
2282 off = addr - chunk->base_addr;
2283
2284 size = pcpu_free_area(chunk, off);
2285
2286 pcpu_memcg_free_hook(chunk, off, size);
2287
2288
2289
2290
2291
2292
2293 if (!chunk->isolated && chunk->free_bytes == pcpu_unit_size) {
2294 struct pcpu_chunk *pos;
2295
2296 list_for_each_entry(pos, &pcpu_chunk_lists[pcpu_free_slot], list)
2297 if (pos != chunk) {
2298 need_balance = true;
2299 break;
2300 }
2301 } else if (pcpu_should_reclaim_chunk(chunk)) {
2302 pcpu_isolate_chunk(chunk);
2303 need_balance = true;
2304 }
2305
2306 trace_percpu_free_percpu(chunk->base_addr, off, ptr);
2307
2308 spin_unlock_irqrestore(&pcpu_lock, flags);
2309
2310 if (need_balance)
2311 pcpu_schedule_balance_work();
2312}
2313EXPORT_SYMBOL_GPL(free_percpu);
2314
2315bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
2316{
2317#ifdef CONFIG_SMP
2318 const size_t static_size = __per_cpu_end - __per_cpu_start;
2319 void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
2320 unsigned int cpu;
2321
2322 for_each_possible_cpu(cpu) {
2323 void *start = per_cpu_ptr(base, cpu);
2324 void *va = (void *)addr;
2325
2326 if (va >= start && va < start + static_size) {
2327 if (can_addr) {
2328 *can_addr = (unsigned long) (va - start);
2329 *can_addr += (unsigned long)
2330 per_cpu_ptr(base, get_boot_cpu_id());
2331 }
2332 return true;
2333 }
2334 }
2335#endif
2336
2337 return false;
2338}
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351bool is_kernel_percpu_address(unsigned long addr)
2352{
2353 return __is_kernel_percpu_address(addr, NULL);
2354}
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379phys_addr_t per_cpu_ptr_to_phys(void *addr)
2380{
2381 void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
2382 bool in_first_chunk = false;
2383 unsigned long first_low, first_high;
2384 unsigned int cpu;
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396 first_low = (unsigned long)pcpu_base_addr +
2397 pcpu_unit_page_offset(pcpu_low_unit_cpu, 0);
2398 first_high = (unsigned long)pcpu_base_addr +
2399 pcpu_unit_page_offset(pcpu_high_unit_cpu, pcpu_unit_pages);
2400 if ((unsigned long)addr >= first_low &&
2401 (unsigned long)addr < first_high) {
2402 for_each_possible_cpu(cpu) {
2403 void *start = per_cpu_ptr(base, cpu);
2404
2405 if (addr >= start && addr < start + pcpu_unit_size) {
2406 in_first_chunk = true;
2407 break;
2408 }
2409 }
2410 }
2411
2412 if (in_first_chunk) {
2413 if (!is_vmalloc_addr(addr))
2414 return __pa(addr);
2415 else
2416 return page_to_phys(vmalloc_to_page(addr)) +
2417 offset_in_page(addr);
2418 } else
2419 return page_to_phys(pcpu_addr_to_page(addr)) +
2420 offset_in_page(addr);
2421}
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
2439 int nr_units)
2440{
2441 struct pcpu_alloc_info *ai;
2442 size_t base_size, ai_size;
2443 void *ptr;
2444 int unit;
2445
2446 base_size = ALIGN(struct_size(ai, groups, nr_groups),
2447 __alignof__(ai->groups[0].cpu_map[0]));
2448 ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
2449
2450 ptr = memblock_alloc(PFN_ALIGN(ai_size), PAGE_SIZE);
2451 if (!ptr)
2452 return NULL;
2453 ai = ptr;
2454 ptr += base_size;
2455
2456 ai->groups[0].cpu_map = ptr;
2457
2458 for (unit = 0; unit < nr_units; unit++)
2459 ai->groups[0].cpu_map[unit] = NR_CPUS;
2460
2461 ai->nr_groups = nr_groups;
2462 ai->__ai_size = PFN_ALIGN(ai_size);
2463
2464 return ai;
2465}
2466
2467
2468
2469
2470
2471
2472
2473void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
2474{
2475 memblock_free_early(__pa(ai), ai->__ai_size);
2476}
2477
2478
2479
2480
2481
2482
2483
2484
2485static void pcpu_dump_alloc_info(const char *lvl,
2486 const struct pcpu_alloc_info *ai)
2487{
2488 int group_width = 1, cpu_width = 1, width;
2489 char empty_str[] = "--------";
2490 int alloc = 0, alloc_end = 0;
2491 int group, v;
2492 int upa, apl;
2493
2494 v = ai->nr_groups;
2495 while (v /= 10)
2496 group_width++;
2497
2498 v = num_possible_cpus();
2499 while (v /= 10)
2500 cpu_width++;
2501 empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';
2502
2503 upa = ai->alloc_size / ai->unit_size;
2504 width = upa * (cpu_width + 1) + group_width + 3;
2505 apl = rounddown_pow_of_two(max(60 / width, 1));
2506
2507 printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
2508 lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
2509 ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);
2510
2511 for (group = 0; group < ai->nr_groups; group++) {
2512 const struct pcpu_group_info *gi = &ai->groups[group];
2513 int unit = 0, unit_end = 0;
2514
2515 BUG_ON(gi->nr_units % upa);
2516 for (alloc_end += gi->nr_units / upa;
2517 alloc < alloc_end; alloc++) {
2518 if (!(alloc % apl)) {
2519 pr_cont("\n");
2520 printk("%spcpu-alloc: ", lvl);
2521 }
2522 pr_cont("[%0*d] ", group_width, group);
2523
2524 for (unit_end += upa; unit < unit_end; unit++)
2525 if (gi->cpu_map[unit] != NR_CPUS)
2526 pr_cont("%0*d ",
2527 cpu_width, gi->cpu_map[unit]);
2528 else
2529 pr_cont("%s ", empty_str);
2530 }
2531 }
2532 pr_cont("\n");
2533}
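/*
 * pcpu_setup_first_chunk - initialize the first percpu chunk
 * @ai: pcpu_alloc_info describing how the percpu area is shaped
 * @base_addr: mapped address
 *
 * Initializes the first chunk, and the optional reserved chunk, which back
 * the kernel's static percpu area and the boot-time dynamic region, records
 * the unit and group geometry described by @ai, and sets up the chunk slot
 * lists.  Called once during boot, before any percpu allocation can be made,
 * with @base_addr already mapped.
 */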
2589void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
2590 void *base_addr)
2591{
2592 size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
2593 size_t static_size, dyn_size;
2594 struct pcpu_chunk *chunk;
2595 unsigned long *group_offsets;
2596 size_t *group_sizes;
2597 unsigned long *unit_off;
2598 unsigned int cpu;
2599 int *unit_map;
2600 int group, unit, i;
2601 int map_size;
2602 unsigned long tmp_addr;
2603 size_t alloc_size;
2604
2605#define PCPU_SETUP_BUG_ON(cond) do { \
2606 if (unlikely(cond)) { \
2607 pr_emerg("failed to initialize, %s\n", #cond); \
2608 pr_emerg("cpu_possible_mask=%*pb\n", \
2609 cpumask_pr_args(cpu_possible_mask)); \
2610 pcpu_dump_alloc_info(KERN_EMERG, ai); \
2611 BUG(); \
2612 } \
2613} while (0)
2614
2615
2616 PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
2617#ifdef CONFIG_SMP
2618 PCPU_SETUP_BUG_ON(!ai->static_size);
2619 PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start));
2620#endif
2621 PCPU_SETUP_BUG_ON(!base_addr);
2622 PCPU_SETUP_BUG_ON(offset_in_page(base_addr));
2623 PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
2624 PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size));
2625 PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
2626 PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE));
2627 PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
2628 PCPU_SETUP_BUG_ON(!ai->dyn_size);
2629 PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE));
2630 PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) ||
2631 IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE)));
2632 PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
2633
2634
2635 alloc_size = ai->nr_groups * sizeof(group_offsets[0]);
2636 group_offsets = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
2637 if (!group_offsets)
2638 panic("%s: Failed to allocate %zu bytes\n", __func__,
2639 alloc_size);
2640
2641 alloc_size = ai->nr_groups * sizeof(group_sizes[0]);
2642 group_sizes = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
2643 if (!group_sizes)
2644 panic("%s: Failed to allocate %zu bytes\n", __func__,
2645 alloc_size);
2646
2647 alloc_size = nr_cpu_ids * sizeof(unit_map[0]);
2648 unit_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
2649 if (!unit_map)
2650 panic("%s: Failed to allocate %zu bytes\n", __func__,
2651 alloc_size);
2652
2653 alloc_size = nr_cpu_ids * sizeof(unit_off[0]);
2654 unit_off = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
2655 if (!unit_off)
2656 panic("%s: Failed to allocate %zu bytes\n", __func__,
2657 alloc_size);
2658
2659 for (cpu = 0; cpu < nr_cpu_ids; cpu++)
2660 unit_map[cpu] = UINT_MAX;
2661
2662 pcpu_low_unit_cpu = NR_CPUS;
2663 pcpu_high_unit_cpu = NR_CPUS;
2664
2665 for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
2666 const struct pcpu_group_info *gi = &ai->groups[group];
2667
2668 group_offsets[group] = gi->base_offset;
2669 group_sizes[group] = gi->nr_units * ai->unit_size;
2670
2671 for (i = 0; i < gi->nr_units; i++) {
2672 cpu = gi->cpu_map[i];
2673 if (cpu == NR_CPUS)
2674 continue;
2675
2676 PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids);
2677 PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
2678 PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);
2679
2680 unit_map[cpu] = unit + i;
2681 unit_off[cpu] = gi->base_offset + i * ai->unit_size;
2682
2683
2684 if (pcpu_low_unit_cpu == NR_CPUS ||
2685 unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
2686 pcpu_low_unit_cpu = cpu;
2687 if (pcpu_high_unit_cpu == NR_CPUS ||
2688 unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
2689 pcpu_high_unit_cpu = cpu;
2690 }
2691 }
2692 pcpu_nr_units = unit;
2693
2694 for_each_possible_cpu(cpu)
2695 PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);
2696
2697
2698#undef PCPU_SETUP_BUG_ON
2699 pcpu_dump_alloc_info(KERN_DEBUG, ai);
2700
2701 pcpu_nr_groups = ai->nr_groups;
2702 pcpu_group_offsets = group_offsets;
2703 pcpu_group_sizes = group_sizes;
2704 pcpu_unit_map = unit_map;
2705 pcpu_unit_offsets = unit_off;
2706
2707
2708 pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
2709 pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
2710 pcpu_atom_size = ai->atom_size;
2711 pcpu_chunk_struct_size = struct_size(chunk, populated,
2712 BITS_TO_LONGS(pcpu_unit_pages));
2713
2714 pcpu_stats_save_ai(ai);
2715
2716
2717
2718
2719
2720
2721
2722 pcpu_sidelined_slot = __pcpu_size_to_slot(pcpu_unit_size) + 1;
2723 pcpu_free_slot = pcpu_sidelined_slot + 1;
2724 pcpu_to_depopulate_slot = pcpu_free_slot + 1;
2725 pcpu_nr_slots = pcpu_to_depopulate_slot + 1;
2726 pcpu_chunk_lists = memblock_alloc(pcpu_nr_slots *
2727 sizeof(pcpu_chunk_lists[0]),
2728 SMP_CACHE_BYTES);
2729 if (!pcpu_chunk_lists)
2730 panic("%s: Failed to allocate %zu bytes\n", __func__,
2731 pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]));
2732
2733 for (i = 0; i < pcpu_nr_slots; i++)
2734 INIT_LIST_HEAD(&pcpu_chunk_lists[i]);
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744 static_size = ALIGN(ai->static_size, PCPU_MIN_ALLOC_SIZE);
2745 dyn_size = ai->dyn_size - (static_size - ai->static_size);
2746
2747
2748
2749
2750
2751
2752
2753
2754
	tmp_addr = (unsigned long)base_addr + static_size;
	map_size = ai->reserved_size ?: dyn_size;
	chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);

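	/* init dynamic chunk if necessary */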
	if (ai->reserved_size) {
		pcpu_reserved_chunk = chunk;

		tmp_addr = (unsigned long)base_addr + static_size +
			   ai->reserved_size;
		map_size = dyn_size;
		chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
	}

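	/* link the first chunk in */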
	pcpu_first_chunk = chunk;
	pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages;
	pcpu_chunk_relocate(pcpu_first_chunk, -1);

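	/* include all regions of the first chunk */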
	pcpu_nr_populated += PFN_DOWN(size_sum);

	pcpu_stats_chunk_alloc();
	trace_percpu_create_chunk(base_addr);

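	/* we're done */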
	pcpu_base_addr = base_addr;
}

#ifdef CONFIG_SMP

const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = {
	[PCPU_FC_AUTO]	= "auto",
	[PCPU_FC_EMBED]	= "embed",
	[PCPU_FC_PAGE]	= "page",
};

enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;

static int __init percpu_alloc_setup(char *str)
{
	if (!str)
		return -EINVAL;

	if (0)
		;
#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
	else if (!strcmp(str, "embed"))
		pcpu_chosen_fc = PCPU_FC_EMBED;
#endif
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
	else if (!strcmp(str, "page"))
		pcpu_chosen_fc = PCPU_FC_PAGE;
#endif
	else
		pr_warn("unknown allocator %s specified\n", str);

	return 0;
}
early_param("percpu_alloc", percpu_alloc_setup);

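/*
 * pcpu_embed_first_chunk() is used by the generic percpu setup.
 * Build it if needed by the arch config or the generic setup is used.
 */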
#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
	!defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
#define BUILD_EMBED_FIRST_CHUNK
#endif

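/* build pcpu_page_first_chunk() iff needed by the arch config */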
#if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
#define BUILD_PAGE_FIRST_CHUNK
#endif

#if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)

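/**
 * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
 * @reserved_size: the size of reserved percpu area in bytes
 * @dyn_size: minimum free size for dynamic allocation in bytes
 * @atom_size: allocation atom size
 * @cpu_distance_fn: callback to determine distance between cpus, optional
 *
 * This function determines grouping of units, their mappings to cpus
 * and other parameters considering needed percpu size, allocation
 * atom size and distances between CPUs.
 *
 * Groups are always multiples of atom size and CPUs which are of
 * LOCAL_DISTANCE both ways are grouped together and share space for
 * units in the same group.
 *
 * RETURNS:
 * On success, pointer to the new allocation_info is returned.  On
 * failure, ERR_PTR value is returned.
 */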
static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info(
				size_t reserved_size, size_t dyn_size,
				size_t atom_size,
				pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
{
	static int group_map[NR_CPUS] __initdata;
	static int group_cnt[NR_CPUS] __initdata;
	static struct cpumask mask __initdata;
	const size_t static_size = __per_cpu_end - __per_cpu_start;
	int nr_groups = 1, nr_units = 0;
	size_t size_sum, min_unit_size, alloc_size;
	int upa, max_upa, best_upa;
	int last_allocs, group, unit;
	unsigned int cpu, tcpu;
	struct pcpu_alloc_info *ai;
	unsigned int *cpu_map;

	memset(group_map, 0, sizeof(group_map));
	memset(group_cnt, 0, sizeof(group_cnt));
	cpumask_clear(&mask);

	size_sum = PFN_ALIGN(static_size + reserved_size +
			    max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
	dyn_size = size_sum - static_size - reserved_size;

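	/*
	 * Determine min_unit_size, alloc_size and max_upa such that
	 * alloc_size is a multiple of atom_size and is the smallest
	 * which can accommodate page aligned segments which are equal
	 * to or larger than min_unit_size.
	 */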
	min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);

	alloc_size = roundup(min_unit_size, atom_size);
	upa = alloc_size / min_unit_size;
	while (alloc_size % upa || (offset_in_page(alloc_size / upa)))
		upa--;
	max_upa = upa;

	cpumask_copy(&mask, cpu_possible_mask);

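	/* group cpus according to their proximity */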
	for (group = 0; !cpumask_empty(&mask); group++) {
		cpu = cpumask_first(&mask);
		group_map[cpu] = group;
		group_cnt[group]++;
		cpumask_clear_cpu(cpu, &mask);

		for_each_cpu(tcpu, &mask) {
			if (!cpu_distance_fn ||
			    (cpu_distance_fn(cpu, tcpu) == LOCAL_DISTANCE &&
			     cpu_distance_fn(tcpu, cpu) == LOCAL_DISTANCE)) {
				group_map[tcpu] = group;
				group_cnt[group]++;
				cpumask_clear_cpu(tcpu, &mask);
			}
		}
	}
	nr_groups = group;

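	/*
	 * Wasted space is caused by a ratio imbalance of upa to group_cnt.
	 * Expand the unit_size until we use >= 75% of the units allocated.
	 * Related to atom_size, which could be much larger than the unit_size.
	 */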
	last_allocs = INT_MAX;
	best_upa = 0;
	for (upa = max_upa; upa; upa--) {
		int allocs = 0, wasted = 0;

		if (alloc_size % upa || (offset_in_page(alloc_size / upa)))
			continue;

		for (group = 0; group < nr_groups; group++) {
			int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
			allocs += this_allocs;
			wasted += this_allocs * upa - group_cnt[group];
		}

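		/*
		 * Don't accept if wastage is over 1/3.  The
		 * greater-than comparison ensures upa==1 always
		 * passes the following check.
		 */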
		if (wasted > num_possible_cpus() / 3)
			continue;

		if (allocs > last_allocs)
			break;
		last_allocs = allocs;
		best_upa = upa;
	}
	BUG_ON(!best_upa);
	upa = best_upa;

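	/* allocate and fill alloc_info */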
	for (group = 0; group < nr_groups; group++)
		nr_units += roundup(group_cnt[group], upa);

	ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
	if (!ai)
		return ERR_PTR(-ENOMEM);
	cpu_map = ai->groups[0].cpu_map;

	for (group = 0; group < nr_groups; group++) {
		ai->groups[group].cpu_map = cpu_map;
		cpu_map += roundup(group_cnt[group], upa);
	}

	ai->static_size = static_size;
	ai->reserved_size = reserved_size;
	ai->dyn_size = dyn_size;
	ai->unit_size = alloc_size / upa;
	ai->atom_size = atom_size;
	ai->alloc_size = alloc_size;

	for (group = 0, unit = 0; group < nr_groups; group++) {
		struct pcpu_group_info *gi = &ai->groups[group];

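		/*
		 * Initialize base_offset as if all groups are located
		 * back-to-back.  The caller should update this to
		 * reflect actual allocation.
		 */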
		gi->base_offset = unit * ai->unit_size;

		for_each_possible_cpu(cpu)
			if (group_map[cpu] == group)
				gi->cpu_map[gi->nr_units++] = cpu;
		gi->nr_units = roundup(gi->nr_units, upa);
		unit += gi->nr_units;
	}
	BUG_ON(unit != nr_units);

	return ai;
}
#endif

#if defined(BUILD_EMBED_FIRST_CHUNK)
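/**
 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
 * @reserved_size: the size of reserved percpu area in bytes
 * @dyn_size: minimum free size for dynamic allocation in bytes
 * @atom_size: allocation atom size
 * @cpu_distance_fn: callback to determine distance between cpus, optional
 * @alloc_fn: function to allocate percpu memory
 * @free_fn: function to free percpu memory
 *
 * This is a helper to ease setting up an embedded first percpu chunk
 * and can be called where pcpu_setup_first_chunk() is expected.
 *
 * If this function is used to set up the first chunk, it is allocated
 * by calling @alloc_fn and used as-is without being mapped into the
 * vmalloc area.  Allocations are always whole multiples of @atom_size
 * aligned to @atom_size.
 *
 * This enables the first chunk to piggy back on the linear physical
 * mapping which often uses larger page sizes.  Note that this can
 * result in very sparse cpu->unit mapping on NUMA machines, thus
 * requiring large vmalloc address space.
 *
 * If the needed size is smaller than the minimum or specified unit
 * size, the leftover is returned using @free_fn.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */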
int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
				  size_t atom_size,
				  pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
				  pcpu_fc_alloc_fn_t alloc_fn,
				  pcpu_fc_free_fn_t free_fn)
{
	void *base = (void *)ULONG_MAX;
	void **areas = NULL;
	struct pcpu_alloc_info *ai;
	size_t size_sum, areas_size;
	unsigned long max_distance;
	int group, i, highest_group, rc = 0;

	ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
				   cpu_distance_fn);
	if (IS_ERR(ai))
		return PTR_ERR(ai);

	size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
	areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));

	areas = memblock_alloc(areas_size, SMP_CACHE_BYTES);
	if (!areas) {
		rc = -ENOMEM;
		goto out_free;
	}

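	/* allocate, copy and determine base address & max_distance */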
	highest_group = 0;
	for (group = 0; group < ai->nr_groups; group++) {
		struct pcpu_group_info *gi = &ai->groups[group];
		unsigned int cpu = NR_CPUS;
		void *ptr;

		for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
			cpu = gi->cpu_map[i];
		BUG_ON(cpu == NR_CPUS);

		/* allocate space for the whole group */
		ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
		if (!ptr) {
			rc = -ENOMEM;
			goto out_free_areas;
		}

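		/* kmemleak tracks the percpu allocations separately */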
		kmemleak_free(ptr);
		areas[group] = ptr;

		base = min(ptr, base);
		if (ptr > areas[highest_group])
			highest_group = group;
	}
	max_distance = areas[highest_group] - base;
	max_distance += ai->unit_size * ai->groups[highest_group].nr_units;

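	/* warn if maximum distance is further than 75% of vmalloc space */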
	if (max_distance > VMALLOC_TOTAL * 3 / 4) {
		pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx\n",
			max_distance, VMALLOC_TOTAL);
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
		/* and fail if we have fallback */
		rc = -EINVAL;
		goto out_free_areas;
#endif
	}

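	/*
	 * Copy data and free unused parts.  This should happen after all
	 * allocations are complete; otherwise, we may end up with
	 * overlapping groups.
	 */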
	for (group = 0; group < ai->nr_groups; group++) {
		struct pcpu_group_info *gi = &ai->groups[group];
		void *ptr = areas[group];

		for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
			if (gi->cpu_map[i] == NR_CPUS) {
				/* unused unit, free whole */
				free_fn(ptr, ai->unit_size);
				continue;
			}
			/* copy and return the unused part */
			memcpy(ptr, __per_cpu_load, ai->static_size);
			free_fn(ptr + size_sum, ai->unit_size - size_sum);
		}
	}

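	/* base address is now known, determine group base offsets */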
	for (group = 0; group < ai->nr_groups; group++) {
		ai->groups[group].base_offset = areas[group] - base;
	}

	pr_info("Embedded %zu pages/cpu s%zu r%zu d%zu u%zu\n",
		PFN_DOWN(size_sum), ai->static_size, ai->reserved_size,
		ai->dyn_size, ai->unit_size);

	pcpu_setup_first_chunk(ai, base);
	goto out_free;

out_free_areas:
	for (group = 0; group < ai->nr_groups; group++)
		if (areas[group])
			free_fn(areas[group],
				ai->groups[group].nr_units * ai->unit_size);
out_free:
	pcpu_free_alloc_info(ai);
	if (areas)
		memblock_free_early(__pa(areas), areas_size);
	return rc;
}
#endif

#ifdef BUILD_PAGE_FIRST_CHUNK
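/**
 * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
 * @reserved_size: the size of reserved percpu area in bytes
 * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
 * @free_fn: function to free percpu page, always called with PAGE_SIZE
 * @populate_pte_fn: function to populate pte
 *
 * This is a helper to ease setting up a page-remapped first percpu
 * chunk and can be called where pcpu_setup_first_chunk() is expected.
 *
 * This is the basic allocator.  The static percpu area is allocated
 * page-by-page into the vmalloc area.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */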
int __init pcpu_page_first_chunk(size_t reserved_size,
				 pcpu_fc_alloc_fn_t alloc_fn,
				 pcpu_fc_free_fn_t free_fn,
				 pcpu_fc_populate_pte_fn_t populate_pte_fn)
{
	static struct vm_struct vm;
	struct pcpu_alloc_info *ai;
	char psize_str[16];
	int unit_pages;
	size_t pages_size;
	struct page **pages;
	int unit, i, j, rc = 0;
	int upa;
	int nr_g0_units;

	snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);

	ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL);
	if (IS_ERR(ai))
		return PTR_ERR(ai);
	BUG_ON(ai->nr_groups != 1);
	upa = ai->alloc_size / ai->unit_size;
	nr_g0_units = roundup(num_possible_cpus(), upa);
	if (WARN_ON(ai->groups[0].nr_units != nr_g0_units)) {
		pcpu_free_alloc_info(ai);
		return -EINVAL;
	}

	unit_pages = ai->unit_size >> PAGE_SHIFT;

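	/* unaligned allocations can't be freed, round up to page size */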
	pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
			       sizeof(pages[0]));
	pages = memblock_alloc(pages_size, SMP_CACHE_BYTES);
	if (!pages)
		panic("%s: Failed to allocate %zu bytes\n", __func__,
		      pages_size);

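	/* allocate pages */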
	j = 0;
	for (unit = 0; unit < num_possible_cpus(); unit++) {
		unsigned int cpu = ai->groups[0].cpu_map[unit];
		for (i = 0; i < unit_pages; i++) {
			void *ptr;

			ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
			if (!ptr) {
				pr_warn("failed to allocate %s page for cpu%u\n",
					psize_str, cpu);
				goto enomem;
			}

			kmemleak_free(ptr);
			pages[j++] = virt_to_page(ptr);
		}
	}

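	/* allocate vm area, map the pages and copy static data */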
	vm.flags = VM_ALLOC;
	vm.size = num_possible_cpus() * ai->unit_size;
	vm_area_register_early(&vm, PAGE_SIZE);

	for (unit = 0; unit < num_possible_cpus(); unit++) {
		unsigned long unit_addr =
			(unsigned long)vm.addr + unit * ai->unit_size;

		for (i = 0; i < unit_pages; i++)
			populate_pte_fn(unit_addr + (i << PAGE_SHIFT));

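		/* pte already populated, the following shouldn't fail */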
		rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
				      unit_pages);
		if (rc < 0)
			panic("failed to map percpu area, err=%d\n", rc);

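		/*
		 * FIXME: Archs with virtual cache should flush local
		 * cache for the linear mapping here - something
		 * equivalent to flush_cache_vmap() on the local cpu.
		 * flush_cache_vmap() can't be used as most supporting
		 * data structures are not set up yet.
		 */

		/* copy static data */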
		memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
	}

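	/* we're ready, commit */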
	pr_info("%d %s pages/cpu s%zu r%zu d%zu\n",
		unit_pages, psize_str, ai->static_size,
		ai->reserved_size, ai->dyn_size);

	pcpu_setup_first_chunk(ai, vm.addr);
	goto out_free_ar;

enomem:
	while (--j >= 0)
		free_fn(page_address(pages[j]), PAGE_SIZE);
	rc = -ENOMEM;
out_free_ar:
	memblock_free_early(__pa(pages), pages_size);
	pcpu_free_alloc_info(ai);
	return rc;
}
#endif

#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
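/*
 * Generic SMP percpu area setup.
 *
 * The embedding helper is used because its behavior closely resembles
 * the original non-dynamic generic percpu area setup.  This is
 * important because many archs have addressing restrictions and might
 * fail if the percpu area is located far away from the previous
 * location.  As an added bonus, in non-NUMA cases, embedding is
 * generally a good idea TLB-wise because the percpu area can piggy
 * back on the physical linear memory mapping which uses large page
 * mappings on applicable archs.
 */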
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);

static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
				       size_t align)
{
	return memblock_alloc_from(size, align, __pa(MAX_DMA_ADDRESS));
}

static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
{
	memblock_free_early(__pa(ptr), size);
}

void __init setup_per_cpu_areas(void)
{
	unsigned long delta;
	unsigned int cpu;
	int rc;

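	/*
	 * Always reserve an area for module percpu variables so that
	 * module static percpu variables can be served from the
	 * reserved chunk.
	 */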
	rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
				    PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
				    pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
	if (rc < 0)
		panic("Failed to initialize percpu areas.");

	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
	for_each_possible_cpu(cpu)
		__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
}
#endif

#else
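/*
 * UP percpu area setup.
 *
 * UP always uses km-based percpu allocator with identity mapping.
 * Static percpu variables are indistinguishable from the usual static
 * variables and don't require any special preparation.
 */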
void __init setup_per_cpu_areas(void)
{
	const size_t unit_size =
		roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
					 PERCPU_DYNAMIC_RESERVE));
	struct pcpu_alloc_info *ai;
	void *fc;

	ai = pcpu_alloc_alloc_info(1, 1);
	fc = memblock_alloc_from(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
	if (!ai || !fc)
		panic("Failed to allocate memory for percpu areas.");

	kmemleak_free(fc);

	ai->dyn_size = unit_size;
	ai->unit_size = unit_size;
	ai->atom_size = unit_size;
	ai->alloc_size = unit_size;
	ai->groups[0].nr_units = 1;
	ai->groups[0].cpu_map[0] = 0;

	pcpu_setup_first_chunk(ai, fc);
	pcpu_free_alloc_info(ai);
}

#endif
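/*
 * pcpu_nr_pages - calculate total number of populated backing pages
 *
 * This reflects the number of pages populated to back chunks.  Metadata is
 * excluded in the number exposed in meminfo as the number of backing pages
 * scales with the number of cpus and can quickly outweigh the memory used
 * for metadata.  It also keeps this calculation nice and simple and can be
 * calculated without any locking.
 *
 * RETURNS:
 * Total number of populated backing pages in use by the allocator.
 */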
unsigned long pcpu_nr_pages(void)
{
	return pcpu_nr_populated * pcpu_nr_units;
}

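/*
 * Percpu allocator is initialized early during boot when neither slab nor
 * workqueue is available.  Plug async management until everything is up
 * and running.
 */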
static int __init percpu_enable_async(void)
{
	pcpu_async_enabled = true;
	return 0;
}
subsys_initcall(percpu_enable_async);