// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/percpu.c - percpu memory allocator
 *
 * The percpu allocator hands out per-CPU memory from chunks.  Each chunk
 * contains one unit per possible CPU; an allocation occupies the same
 * offset in every unit, so a __percpu pointer is resolved by adding the
 * accessing CPU's unit offset to it.
 *
 * Allocation state within a chunk is kept by a bitmap allocator:
 * alloc_map tracks which PCPU_MIN_ALLOC_SIZE slots are in use, bound_map
 * records allocation boundaries so frees know their size, and per-block
 * and per-chunk metadata (first_free, contig/scan hints, left/right free
 * counts) keep scanning cheap.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/bitmap.h>
#include <linux/cpumask.h>
#include <linux/memblock.h>
#include <linux/err.h>
#include <linux/lcm.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/pfn.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
#include <linux/kmemleak.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/memcontrol.h>

#include <asm/cacheflush.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
#include <asm/io.h>

#define CREATE_TRACE_POINTS
#include <trace/events/percpu.h>

#include "percpu-internal.h"

/* the slots are sorted by the size of the biggest continuous free area */
#define PCPU_SLOT_BASE_SHIFT		5
/* chunks in slots below this watermark are sidelined on failed alloc */
#define PCPU_SLOT_FAIL_THRESHOLD	3

#define PCPU_EMPTY_POP_PAGES_LOW	2
#define PCPU_EMPTY_POP_PAGES_HIGH	4

#ifdef CONFIG_SMP
/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
#ifndef __addr_to_pcpu_ptr
#define __addr_to_pcpu_ptr(addr)					\
	(void __percpu *)((unsigned long)(addr) -			\
			  (unsigned long)pcpu_base_addr	+		\
			  (unsigned long)__per_cpu_start)
#endif
#ifndef __pcpu_ptr_to_addr
#define __pcpu_ptr_to_addr(ptr)						\
	(void __force *)((unsigned long)(ptr) +				\
			 (unsigned long)pcpu_base_addr -		\
			 (unsigned long)__per_cpu_start)
#endif
#else	/* CONFIG_SMP */
/* on UP, it's always identity mapped */
#define __addr_to_pcpu_ptr(addr)	(void __percpu *)(addr)
#define __pcpu_ptr_to_addr(ptr)		(void __force *)(ptr)
#endif	/* CONFIG_SMP */

static int pcpu_unit_pages __ro_after_init;
static int pcpu_unit_size __ro_after_init;
static int pcpu_nr_units __ro_after_init;
static int pcpu_atom_size __ro_after_init;
int pcpu_nr_slots __ro_after_init;
static size_t pcpu_chunk_struct_size __ro_after_init;

/* cpus with the lowest and highest unit addresses */
static unsigned int pcpu_low_unit_cpu __ro_after_init;
static unsigned int pcpu_high_unit_cpu __ro_after_init;

/* the address of the first chunk which starts with the kernel static area */
void *pcpu_base_addr __ro_after_init;
EXPORT_SYMBOL_GPL(pcpu_base_addr);

static const int *pcpu_unit_map __ro_after_init;		/* cpu -> unit */
const unsigned long *pcpu_unit_offsets __ro_after_init;	/* cpu -> unit offset */

/* group information, used for vm allocation */
static int pcpu_nr_groups __ro_after_init;
static const unsigned long *pcpu_group_offsets __ro_after_init;
static const size_t *pcpu_group_sizes __ro_after_init;

/*
 * The first chunk which always exists.  Note that unlike other
 * chunks, this one can be allocated and mapped in several different
 * ways and thus often doesn't live in the vmalloc area.
 */
struct pcpu_chunk *pcpu_first_chunk __ro_after_init;

/*
 * Optional reserved chunk.  This chunk reserves part of the first
 * chunk and serves it for reserved allocations.  When the reserved
 * region doesn't exist, the following variable is NULL.
 */
struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init;

DEFINE_SPINLOCK(pcpu_lock);	/* all internal data structures */
static DEFINE_MUTEX(pcpu_alloc_mutex);	/* chunk create/destroy, [de]pop, map ext */

struct list_head *pcpu_chunk_lists __ro_after_init; /* chunk list slots */

/* chunks which need their map areas extended, protected by pcpu_lock */
static LIST_HEAD(pcpu_map_extend_chunks);

/*
 * The number of empty populated pages by chunk type, protected by pcpu_lock.
 * The reserved chunk doesn't contribute to the count.
 */
int pcpu_nr_empty_pop_pages[PCPU_NR_CHUNK_TYPES];

/*
 * The number of populated pages in use by the allocator, protected by
 * pcpu_lock.  This number is kept per unit per chunk (i.e. when a page gets
 * populated and/or allocated, it's in use by the allocator across all units).
 */
static unsigned long pcpu_nr_populated;

/*
 * Balance work is used to populate or destroy chunks asynchronously.  We
 * try to keep the number of populated free pages between
 * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
 * empty chunk.
 */
static void pcpu_balance_workfn(struct work_struct *work);
static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
static bool pcpu_async_enabled __read_mostly;
static bool pcpu_atomic_alloc_failed;

static void pcpu_schedule_balance_work(void)
{
	if (pcpu_async_enabled)
		schedule_work(&pcpu_balance_work);
}

/**
 * pcpu_addr_in_chunk - check if the address is served from this chunk
 * @chunk: chunk of interest
 * @addr: percpu address
 *
 * RETURNS:
 * True if the address is served from this chunk.
 */
static bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr)
{
	void *start_addr, *end_addr;

	if (!chunk)
		return false;

	start_addr = chunk->base_addr + chunk->start_offset;
	end_addr = chunk->base_addr + chunk->nr_pages * PAGE_SIZE -
		   chunk->end_offset;

	return addr >= start_addr && addr < end_addr;
}

static int __pcpu_size_to_slot(int size)
{
	int highbit = fls(size);
	return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
}

static int pcpu_size_to_slot(int size)
{
	if (size == pcpu_unit_size)
		return pcpu_nr_slots - 1;
	return __pcpu_size_to_slot(size);
}

static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
{
	const struct pcpu_block_md *chunk_md = &chunk->chunk_md;

	if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE ||
	    chunk_md->contig_hint == 0)
		return 0;

	return pcpu_size_to_slot(chunk_md->contig_hint * PCPU_MIN_ALLOC_SIZE);
}

/* set the pointer to a chunk in a page struct */
static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
{
	page->index = (unsigned long)pcpu;
}

/* obtain pointer to a chunk from a page struct */
static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
{
	return (struct pcpu_chunk *)page->index;
}

static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
{
	return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
}

static unsigned long pcpu_unit_page_offset(unsigned int cpu, int page_idx)
{
	return pcpu_unit_offsets[cpu] + (page_idx << PAGE_SHIFT);
}

static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
				     unsigned int cpu, int page_idx)
{
	return (unsigned long)chunk->base_addr +
	       pcpu_unit_page_offset(cpu, page_idx);
}

/*
 * The following are helper functions to help access bitmaps and convert
 * between bitmap offsets to address offsets.
 */
static unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index)
{
	return chunk->alloc_map +
	       (index * PCPU_BITMAP_BLOCK_BITS / BITS_PER_LONG);
}

static unsigned long pcpu_off_to_block_index(int off)
{
	return off / PCPU_BITMAP_BLOCK_BITS;
}

static unsigned long pcpu_off_to_block_off(int off)
{
	return off & (PCPU_BITMAP_BLOCK_BITS - 1);
}

static unsigned long pcpu_block_off_to_off(int index, int off)
{
	return index * PCPU_BITMAP_BLOCK_BITS + off;
}
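
/*
 * pcpu_next_hint - determine which offset to begin scanning at
 * @block: block of interest
 * @alloc_bits: size of the request in allocation units
 *
 * If a scan_hint exists, the contig_hint starts after the scan_hint area,
 * and the request cannot fit within the scan_hint area, scanning can begin
 * just past the scan_hint.  Otherwise scanning starts at the block's first
 * free bit.
 */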
317static int pcpu_next_hint(struct pcpu_block_md *block, int alloc_bits)
318{
	/*
	 * Three conditions allow skipping past the scan_hint: the scan_hint
	 * exists, the contig_hint starts after the scan_hint area, and the
	 * request is larger than the scan_hint.
	 */
326 if (block->scan_hint &&
327 block->contig_hint_start > block->scan_hint_start &&
328 alloc_bits > block->scan_hint)
329 return block->scan_hint_start + block->scan_hint;
330
331 return block->first_free;
332}
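
/**
 * pcpu_next_md_free_region - finds the next hint free area
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * Helper function for pcpu_for_each_md_free_region.  It checks
 * block->contig_hint and performs aggregation across blocks to find the
 * next hint.  It modifies bit_off and bits in-place to be consumed in the
 * loop.
 */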
345static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off,
346 int *bits)
347{
348 int i = pcpu_off_to_block_index(*bit_off);
349 int block_off = pcpu_off_to_block_off(*bit_off);
350 struct pcpu_block_md *block;
351
352 *bits = 0;
353 for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
354 block++, i++) {
355
356 if (*bits) {
357 *bits += block->left_free;
358 if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
359 continue;
360 return;
361 }
362
363
364
365
366
367
368
369
370
371 *bits = block->contig_hint;
372 if (*bits && block->contig_hint_start >= block_off &&
373 *bits + block->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) {
374 *bit_off = pcpu_block_off_to_off(i,
375 block->contig_hint_start);
376 return;
377 }
378
379 block_off = 0;
380
381 *bits = block->right_free;
382 *bit_off = (i + 1) * PCPU_BITMAP_BLOCK_BITS - block->right_free;
383 }
384}
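
/**
 * pcpu_next_fit_region - finds fit areas for a given allocation request
 * @chunk: chunk of interest
 * @alloc_bits: size of allocation
 * @align: alignment of area (max PAGE_SIZE)
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * Finds the next free region that is viable for use with a given size and
 * alignment.  This only returns if there is a valid area to be used for
 * this allocation.  block->first_free is returned if the allocation request
 * fits within the block to see if the request can be fulfilled prior to the
 * contig hint.
 */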
400static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
401 int align, int *bit_off, int *bits)
402{
403 int i = pcpu_off_to_block_index(*bit_off);
404 int block_off = pcpu_off_to_block_off(*bit_off);
405 struct pcpu_block_md *block;
406
407 *bits = 0;
408 for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
409 block++, i++) {
410
411 if (*bits) {
412 *bits += block->left_free;
413 if (*bits >= alloc_bits)
414 return;
415 if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
416 continue;
417 }
418
419
420 *bits = ALIGN(block->contig_hint_start, align) -
421 block->contig_hint_start;
422
423
424
425
426 if (block->contig_hint &&
427 block->contig_hint_start >= block_off &&
428 block->contig_hint >= *bits + alloc_bits) {
429 int start = pcpu_next_hint(block, alloc_bits);
430
431 *bits += alloc_bits + block->contig_hint_start -
432 start;
433 *bit_off = pcpu_block_off_to_off(i, start);
434 return;
435 }
436
437 block_off = 0;
438
439 *bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free,
440 align);
441 *bits = PCPU_BITMAP_BLOCK_BITS - *bit_off;
442 *bit_off = pcpu_block_off_to_off(i, *bit_off);
443 if (*bits >= alloc_bits)
444 return;
445 }
446
447
448 *bit_off = pcpu_chunk_map_bits(chunk);
449}

/*
 * Metadata free area iterators.  These perform aggregation of free areas
 * based on the metadata blocks and return the offset @bit_off and size in
 * bits of the free area @bits.  pcpu_for_each_fit_region only returns when
 * a fit is found for the allocation request.
 */
457#define pcpu_for_each_md_free_region(chunk, bit_off, bits) \
458 for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits)); \
459 (bit_off) < pcpu_chunk_map_bits((chunk)); \
460 (bit_off) += (bits) + 1, \
461 pcpu_next_md_free_region((chunk), &(bit_off), &(bits)))
462
463#define pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) \
464 for (pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
465 &(bits)); \
466 (bit_off) < pcpu_chunk_map_bits((chunk)); \
467 (bit_off) += (bits), \
468 pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
469 &(bits)))

/**
 * pcpu_mem_zalloc - allocate memory
 * @size: bytes to allocate
 * @gfp: allocation flags
 *
 * Allocate @size bytes.  If @size is smaller than PAGE_SIZE,
 * kzalloc() is used; otherwise, the equivalent of vzalloc() is used.
 * This is to facilitate passing through whitelisted flags.  The
 * returned memory is always zeroed.
 *
 * RETURNS:
 * Pointer to the allocated area on success, NULL on failure.
 */
484static void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
485{
486 if (WARN_ON_ONCE(!slab_is_available()))
487 return NULL;
488
489 if (size <= PAGE_SIZE)
490 return kzalloc(size, gfp);
491 else
492 return __vmalloc(size, gfp | __GFP_ZERO);
493}

/**
 * pcpu_mem_free - free memory
 * @ptr: memory to free
 *
 * Free @ptr.  @ptr should have been allocated using pcpu_mem_zalloc().
 */
501static void pcpu_mem_free(void *ptr)
502{
503 kvfree(ptr);
504}
505
506static void __pcpu_chunk_move(struct pcpu_chunk *chunk, int slot,
507 bool move_front)
508{
509 if (chunk != pcpu_reserved_chunk) {
510 struct list_head *pcpu_slot;
511
512 pcpu_slot = pcpu_chunk_list(pcpu_chunk_type(chunk));
513 if (move_front)
514 list_move(&chunk->list, &pcpu_slot[slot]);
515 else
516 list_move_tail(&chunk->list, &pcpu_slot[slot]);
517 }
518}
519
520static void pcpu_chunk_move(struct pcpu_chunk *chunk, int slot)
521{
522 __pcpu_chunk_move(chunk, slot, true);
523}

/**
 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
 * @chunk: chunk of interest
 * @oslot: the previous slot it was on
 *
 * This function is called after an allocation or free changed @chunk.
 * New slot according to the changed state is determined and @chunk is
 * moved to the slot.  Note that the reserved chunk is never put on
 * chunk slots.
 *
 * CONTEXT:
 * pcpu_lock.
 */
538static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
539{
540 int nslot = pcpu_chunk_slot(chunk);
541
542 if (oslot != nslot)
543 __pcpu_chunk_move(chunk, nslot, oslot < nslot);
544}

/*
 * pcpu_update_empty_pages - update empty page counters
 * @chunk: chunk of interest
 * @nr: nr of empty pages
 *
 * This is used to keep track of the empty pages now based on the premise
 * a md_block covers a page.  The hint update functions recognize if a block
 * is made full or broken to calculate deltas for keeping track of free pages.
 */
555static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr)
556{
557 chunk->nr_empty_pop_pages += nr;
558 if (chunk != pcpu_reserved_chunk)
559 pcpu_nr_empty_pop_pages[pcpu_chunk_type(chunk)] += nr;
560}

/*
 * pcpu_region_overlap - determines if two regions overlap
 * @a: start of first region, inclusive
 * @b: end of first region, exclusive
 * @x: start of second region, inclusive
 * @y: end of second region, exclusive
 *
 * This is used to determine if the hint region [a, b) overlaps with the
 * allocated region [x, y).
 */
572static inline bool pcpu_region_overlap(int a, int b, int x, int y)
573{
574 return (a < y) && (x < b);
575}

/**
 * pcpu_block_update - updates a block given a free area
 * @block: block of interest
 * @start: start offset in block
 * @end: end offset in block
 *
 * Updates a block given a known free area.  The region [start, end) is
 * expected to be the entirety of the free area within a block.  Chooses
 * the best starting offset if the contig hints are equal.
 */
587static void pcpu_block_update(struct pcpu_block_md *block, int start, int end)
588{
589 int contig = end - start;
590
591 block->first_free = min(block->first_free, start);
592 if (start == 0)
593 block->left_free = contig;
594
595 if (end == block->nr_bits)
596 block->right_free = contig;
597
598 if (contig > block->contig_hint) {
599
600 if (start > block->contig_hint_start) {
601 if (block->contig_hint > block->scan_hint) {
602 block->scan_hint_start =
603 block->contig_hint_start;
604 block->scan_hint = block->contig_hint;
605 } else if (start < block->scan_hint_start) {
606
607
608
609
610
611 block->scan_hint = 0;
612 }
613 } else {
614 block->scan_hint = 0;
615 }
616 block->contig_hint_start = start;
617 block->contig_hint = contig;
618 } else if (contig == block->contig_hint) {
619 if (block->contig_hint_start &&
620 (!start ||
621 __ffs(start) > __ffs(block->contig_hint_start))) {
622
623 block->contig_hint_start = start;
624 if (start < block->scan_hint_start &&
625 block->contig_hint > block->scan_hint)
626 block->scan_hint = 0;
627 } else if (start > block->scan_hint_start ||
628 block->contig_hint > block->scan_hint) {
629
630
631
632
633
634 block->scan_hint_start = start;
635 block->scan_hint = contig;
636 }
637 } else {
638
639
640
641
642
643 if ((start < block->contig_hint_start &&
644 (contig > block->scan_hint ||
645 (contig == block->scan_hint &&
646 start > block->scan_hint_start)))) {
647 block->scan_hint_start = start;
648 block->scan_hint = contig;
649 }
650 }
651}

/*
 * pcpu_block_update_scan - update a block given a free area from a scan
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * Finding the final allocation spot first goes through pcpu_find_block_fit()
 * to find a block that can hold the allocation and then pcpu_alloc_area()
 * where a scan is used.  When allocations require specific alignments,
 * we can inadvertently create holes which will not be seen in the alloc
 * or free paths.
 *
 * This takes a given free area hole and updates a block as it may change the
 * scan_hint.  We need to scan backwards to ensure we don't miss free bits
 * from alignment.
 */
669static void pcpu_block_update_scan(struct pcpu_chunk *chunk, int bit_off,
670 int bits)
671{
672 int s_off = pcpu_off_to_block_off(bit_off);
673 int e_off = s_off + bits;
674 int s_index, l_bit;
675 struct pcpu_block_md *block;
676
677 if (e_off > PCPU_BITMAP_BLOCK_BITS)
678 return;
679
680 s_index = pcpu_off_to_block_index(bit_off);
681 block = chunk->md_blocks + s_index;
682
683
684 l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index), s_off);
685 s_off = (s_off == l_bit) ? 0 : l_bit + 1;
686
687 pcpu_block_update(block, s_off, e_off);
688}

/**
 * pcpu_chunk_refresh_hint - updates metadata about a chunk
 * @chunk: chunk of interest
 * @full_scan: if we should scan from the beginning
 *
 * Iterates over the metadata blocks to find the largest contig area.
 * A full scan can be avoided on the allocation path as this is triggered
 * if we broke the contig_hint.  In doing so, the scan_hint will be before
 * the contig_hint or after if the scan_hint == contig_hint.  This cannot
 * be prevented on freeing as we want to find the largest area possibly
 * spanning blocks.
 */
702static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan)
703{
704 struct pcpu_block_md *chunk_md = &chunk->chunk_md;
705 int bit_off, bits;
706
707
708 if (!full_scan && chunk_md->scan_hint) {
709 bit_off = chunk_md->scan_hint_start + chunk_md->scan_hint;
710 chunk_md->contig_hint_start = chunk_md->scan_hint_start;
711 chunk_md->contig_hint = chunk_md->scan_hint;
712 chunk_md->scan_hint = 0;
713 } else {
714 bit_off = chunk_md->first_free;
715 chunk_md->contig_hint = 0;
716 }
717
718 bits = 0;
719 pcpu_for_each_md_free_region(chunk, bit_off, bits)
720 pcpu_block_update(chunk_md, bit_off, bit_off + bits);
721}

/**
 * pcpu_block_refresh_hint
 * @chunk: chunk of interest
 * @index: index of the metadata block
 *
 * Scans over the block beginning at first_free and updates the block
 * metadata accordingly.
 */
731static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
732{
733 struct pcpu_block_md *block = chunk->md_blocks + index;
734 unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index);
735 unsigned int rs, re, start;
736
737
738 if (block->scan_hint) {
739 start = block->scan_hint_start + block->scan_hint;
740 block->contig_hint_start = block->scan_hint_start;
741 block->contig_hint = block->scan_hint;
742 block->scan_hint = 0;
743 } else {
744 start = block->first_free;
745 block->contig_hint = 0;
746 }
747
748 block->right_free = 0;
749
750
751 bitmap_for_each_clear_region(alloc_map, rs, re, start,
752 PCPU_BITMAP_BLOCK_BITS)
753 pcpu_block_update(block, rs, re);
754}

/**
 * pcpu_block_update_hint_alloc - update hint on allocation path
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of request
 *
 * Updates metadata for the allocation path.  The metadata only has to be
 * refreshed by a full scan iff the chunk's contig hint is broken.  Block
 * level scans are required if the block's contig hint is broken.
 */
766static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
767 int bits)
768{
769 struct pcpu_block_md *chunk_md = &chunk->chunk_md;
770 int nr_empty_pages = 0;
771 struct pcpu_block_md *s_block, *e_block, *block;
772 int s_index, e_index;
773 int s_off, e_off;
774
775
776
777
778
779
780
781 s_index = pcpu_off_to_block_index(bit_off);
782 e_index = pcpu_off_to_block_index(bit_off + bits - 1);
783 s_off = pcpu_off_to_block_off(bit_off);
784 e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
785
786 s_block = chunk->md_blocks + s_index;
787 e_block = chunk->md_blocks + e_index;
788
789
790
791
792
793
794
795 if (s_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
796 nr_empty_pages++;
797
798 if (s_off == s_block->first_free)
799 s_block->first_free = find_next_zero_bit(
800 pcpu_index_alloc_map(chunk, s_index),
801 PCPU_BITMAP_BLOCK_BITS,
802 s_off + bits);
803
804 if (pcpu_region_overlap(s_block->scan_hint_start,
805 s_block->scan_hint_start + s_block->scan_hint,
806 s_off,
807 s_off + bits))
808 s_block->scan_hint = 0;
809
810 if (pcpu_region_overlap(s_block->contig_hint_start,
811 s_block->contig_hint_start +
812 s_block->contig_hint,
813 s_off,
814 s_off + bits)) {
815
816 if (!s_off)
817 s_block->left_free = 0;
818 pcpu_block_refresh_hint(chunk, s_index);
819 } else {
820
821 s_block->left_free = min(s_block->left_free, s_off);
822 if (s_index == e_index)
823 s_block->right_free = min_t(int, s_block->right_free,
824 PCPU_BITMAP_BLOCK_BITS - e_off);
825 else
826 s_block->right_free = 0;
827 }
828
829
830
831
832 if (s_index != e_index) {
833 if (e_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
834 nr_empty_pages++;
835
836
837
838
839
840 e_block->first_free = find_next_zero_bit(
841 pcpu_index_alloc_map(chunk, e_index),
842 PCPU_BITMAP_BLOCK_BITS, e_off);
843
844 if (e_off == PCPU_BITMAP_BLOCK_BITS) {
845
846 e_block++;
847 } else {
848 if (e_off > e_block->scan_hint_start)
849 e_block->scan_hint = 0;
850
851 e_block->left_free = 0;
852 if (e_off > e_block->contig_hint_start) {
853
854 pcpu_block_refresh_hint(chunk, e_index);
855 } else {
856 e_block->right_free =
857 min_t(int, e_block->right_free,
858 PCPU_BITMAP_BLOCK_BITS - e_off);
859 }
860 }
861
862
863 nr_empty_pages += (e_index - s_index - 1);
864 for (block = s_block + 1; block < e_block; block++) {
865 block->scan_hint = 0;
866 block->contig_hint = 0;
867 block->left_free = 0;
868 block->right_free = 0;
869 }
870 }
871
872 if (nr_empty_pages)
873 pcpu_update_empty_pages(chunk, -nr_empty_pages);
874
875 if (pcpu_region_overlap(chunk_md->scan_hint_start,
876 chunk_md->scan_hint_start +
877 chunk_md->scan_hint,
878 bit_off,
879 bit_off + bits))
880 chunk_md->scan_hint = 0;
881
882
883
884
885
886
887 if (pcpu_region_overlap(chunk_md->contig_hint_start,
888 chunk_md->contig_hint_start +
889 chunk_md->contig_hint,
890 bit_off,
891 bit_off + bits))
892 pcpu_chunk_refresh_hint(chunk, false);
893}

/**
 * pcpu_block_update_hint_free - updates the block hints on the free path
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of request
 *
 * Updates metadata for the free path.  This avoids a blind block refresh
 * by making use of the block contig hints.  If this fails, it scans
 * forward and backward to determine the extent of the free area.  This is
 * capped at the boundary of blocks.
 *
 * A chunk update is triggered if a page becomes free, a block becomes free,
 * or the free spans across blocks.  This tradeoff is to minimize iterating
 * over the block metadata to update chunk_md->contig_hint.
 * chunk_md->contig_hint may be off by up to a page, but it will never be
 * more than the available space.  If the contig hint is contained in one
 * block, it will be accurate.
 */
913static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
914 int bits)
915{
916 int nr_empty_pages = 0;
917 struct pcpu_block_md *s_block, *e_block, *block;
918 int s_index, e_index;
919 int s_off, e_off;
920 int start, end;
921
922
923
924
925
926
927
928 s_index = pcpu_off_to_block_index(bit_off);
929 e_index = pcpu_off_to_block_index(bit_off + bits - 1);
930 s_off = pcpu_off_to_block_off(bit_off);
931 e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
932
933 s_block = chunk->md_blocks + s_index;
934 e_block = chunk->md_blocks + e_index;
935
936
937
938
939
940
941
942
943
944
945
946 start = s_off;
947 if (s_off == s_block->contig_hint + s_block->contig_hint_start) {
948 start = s_block->contig_hint_start;
949 } else {
950
951
952
953
954
955
956 int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index),
957 start);
958 start = (start == l_bit) ? 0 : l_bit + 1;
959 }
960
961 end = e_off;
962 if (e_off == e_block->contig_hint_start)
963 end = e_block->contig_hint_start + e_block->contig_hint;
964 else
965 end = find_next_bit(pcpu_index_alloc_map(chunk, e_index),
966 PCPU_BITMAP_BLOCK_BITS, end);
967
968
969 e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS;
970 if (!start && e_off == PCPU_BITMAP_BLOCK_BITS)
971 nr_empty_pages++;
972 pcpu_block_update(s_block, start, e_off);
973
974
975 if (s_index != e_index) {
976
977 if (end == PCPU_BITMAP_BLOCK_BITS)
978 nr_empty_pages++;
979 pcpu_block_update(e_block, 0, end);
980
981
982 nr_empty_pages += (e_index - s_index - 1);
983 for (block = s_block + 1; block < e_block; block++) {
984 block->first_free = 0;
985 block->scan_hint = 0;
986 block->contig_hint_start = 0;
987 block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
988 block->left_free = PCPU_BITMAP_BLOCK_BITS;
989 block->right_free = PCPU_BITMAP_BLOCK_BITS;
990 }
991 }
992
993 if (nr_empty_pages)
994 pcpu_update_empty_pages(chunk, nr_empty_pages);
995
996
997
998
999
1000
1001
1002 if (((end - start) >= PCPU_BITMAP_BLOCK_BITS) || s_index != e_index)
1003 pcpu_chunk_refresh_hint(chunk, true);
1004 else
1005 pcpu_block_update(&chunk->chunk_md,
1006 pcpu_block_off_to_off(s_index, start),
1007 end);
1008}

/**
 * pcpu_is_populated - determines if the region is populated
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of area
 * @next_off: return value for the next offset to start searching
 *
 * For atomic allocations, check if the backing pages are populated.
 *
 * RETURNS:
 * Bool if the backing pages are populated.
 * next_off is set to skip over unpopulated pages in pcpu_find_block_fit.
 */
1023static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits,
1024 int *next_off)
1025{
1026 unsigned int page_start, page_end, rs, re;
1027
1028 page_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
1029 page_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
1030
1031 rs = page_start;
1032 bitmap_next_clear_region(chunk->populated, &rs, &re, page_end);
1033 if (rs >= page_end)
1034 return true;
1035
1036 *next_off = re * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
1037 return false;
1038}
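
/**
 * pcpu_find_block_fit - finds the block index to start searching
 * @chunk: chunk of interest
 * @alloc_bits: size of request in allocation units
 * @align: alignment of area (max PAGE_SIZE bytes)
 * @pop_only: use populated regions only
 *
 * Given a chunk and an allocation spec, find the offset to begin searching
 * for a free region.  This iterates over the bitmap metadata blocks to
 * find an offset that will be guaranteed to fit the requirements.  It is
 * not quite first fit as if the allocation does not fit in the contig hint
 * of a block or chunk, it is skipped.  This errs on the side of caution
 * to prevent excess iteration.  Poor alignment can cause the allocator to
 * skip over blocks and chunks that have valid free areas.
 *
 * RETURNS:
 * The offset in the bitmap to begin searching.
 * -1 if no offset is found.
 */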
1059static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
1060 size_t align, bool pop_only)
1061{
1062 struct pcpu_block_md *chunk_md = &chunk->chunk_md;
1063 int bit_off, bits, next_off;
1064
1065
1066
1067
1068
1069
1070
1071 bit_off = ALIGN(chunk_md->contig_hint_start, align) -
1072 chunk_md->contig_hint_start;
1073 if (bit_off + alloc_bits > chunk_md->contig_hint)
1074 return -1;
1075
1076 bit_off = pcpu_next_hint(chunk_md, alloc_bits);
1077 bits = 0;
1078 pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) {
1079 if (!pop_only || pcpu_is_populated(chunk, bit_off, bits,
1080 &next_off))
1081 break;
1082
1083 bit_off = next_off;
1084 bits = 0;
1085 }
1086
1087 if (bit_off == pcpu_chunk_map_bits(chunk))
1088 return -1;
1089
1090 return bit_off;
1091}
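
/**
 * pcpu_find_zero_area - modified version of bitmap_find_next_zero_area()
 * @map: the address to base the search on
 * @size: the bitmap size in bits
 * @start: the bitnumber to start searching at
 * @nr: the number of zeroed bits we're looking for
 * @align_mask: alignment mask for zero area
 * @largest_off: offset of the largest area skipped
 * @largest_bits: the size of the largest area skipped
 *
 * The @align_mask should be one less than a power of 2.
 *
 * This is a modified version of bitmap_find_next_zero_area() to remember
 * the largest area that was skipped.  This is imperfect, but in general is
 * good enough.  The largest remembered region is the largest failed region
 * seen.  This does not include anything we possibly skipped due to
 * alignment.  pcpu_block_update_scan() does scan backwards to try and
 * recover what was lost to alignment.  While this can cause scanning to
 * miss earlier possible free areas, smaller allocations will eventually
 * fill those holes.
 */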
1113static unsigned long pcpu_find_zero_area(unsigned long *map,
1114 unsigned long size,
1115 unsigned long start,
1116 unsigned long nr,
1117 unsigned long align_mask,
1118 unsigned long *largest_off,
1119 unsigned long *largest_bits)
1120{
1121 unsigned long index, end, i, area_off, area_bits;
1122again:
1123 index = find_next_zero_bit(map, size, start);
1124
1125
1126 index = __ALIGN_MASK(index, align_mask);
1127 area_off = index;
1128
1129 end = index + nr;
1130 if (end > size)
1131 return end;
1132 i = find_next_bit(map, end, index);
1133 if (i < end) {
1134 area_bits = i - area_off;
1135
1136 if (area_bits > *largest_bits ||
1137 (area_bits == *largest_bits && *largest_off &&
1138 (!area_off || __ffs(area_off) > __ffs(*largest_off)))) {
1139 *largest_off = area_off;
1140 *largest_bits = area_bits;
1141 }
1142
1143 start = i + 1;
1144 goto again;
1145 }
1146 return index;
1147}
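
/**
 * pcpu_alloc_area - allocates an area from a pcpu_chunk
 * @chunk: chunk of interest
 * @alloc_bits: size of request in allocation units
 * @align: alignment of area (max PAGE_SIZE)
 * @start: bit_off to start searching
 *
 * This function takes in a @start offset to begin searching to fit an
 * allocation of @alloc_bits with alignment @align.  It needs to scan
 * the allocation map because if it fits within the block's contig hint,
 * @start will be block->first_free.  This is an attempt to fill the
 * allocation prior to breaking the contig hint.  The allocation and
 * boundary maps are updated accordingly if it confirms a valid
 * free area.
 *
 * RETURNS:
 * Allocated addr offset in @chunk on success.
 * -1 if no matching area is found.
 */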
1168static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
1169 size_t align, int start)
1170{
1171 struct pcpu_block_md *chunk_md = &chunk->chunk_md;
1172 size_t align_mask = (align) ? (align - 1) : 0;
1173 unsigned long area_off = 0, area_bits = 0;
1174 int bit_off, end, oslot;
1175
1176 lockdep_assert_held(&pcpu_lock);
1177
1178 oslot = pcpu_chunk_slot(chunk);
1179
1180
1181
1182
1183 end = min_t(int, start + alloc_bits + PCPU_BITMAP_BLOCK_BITS,
1184 pcpu_chunk_map_bits(chunk));
1185 bit_off = pcpu_find_zero_area(chunk->alloc_map, end, start, alloc_bits,
1186 align_mask, &area_off, &area_bits);
1187 if (bit_off >= end)
1188 return -1;
1189
1190 if (area_bits)
1191 pcpu_block_update_scan(chunk, area_off, area_bits);
1192
1193
1194 bitmap_set(chunk->alloc_map, bit_off, alloc_bits);
1195
1196
1197 set_bit(bit_off, chunk->bound_map);
1198 bitmap_clear(chunk->bound_map, bit_off + 1, alloc_bits - 1);
1199 set_bit(bit_off + alloc_bits, chunk->bound_map);
1200
1201 chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE;
1202
1203
1204 if (bit_off == chunk_md->first_free)
1205 chunk_md->first_free = find_next_zero_bit(
1206 chunk->alloc_map,
1207 pcpu_chunk_map_bits(chunk),
1208 bit_off + alloc_bits);
1209
1210 pcpu_block_update_hint_alloc(chunk, bit_off, alloc_bits);
1211
1212 pcpu_chunk_relocate(chunk, oslot);
1213
1214 return bit_off * PCPU_MIN_ALLOC_SIZE;
1215}
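
/**
 * pcpu_free_area - frees the corresponding offset
 * @chunk: chunk of interest
 * @off: addr offset into chunk
 *
 * This function determines the size of an allocation to free using
 * the boundary bitmap and clears the allocation map.
 *
 * RETURNS:
 * Number of freed bytes.
 */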
1228static int pcpu_free_area(struct pcpu_chunk *chunk, int off)
1229{
1230 struct pcpu_block_md *chunk_md = &chunk->chunk_md;
1231 int bit_off, bits, end, oslot, freed;
1232
1233 lockdep_assert_held(&pcpu_lock);
1234 pcpu_stats_area_dealloc(chunk);
1235
1236 oslot = pcpu_chunk_slot(chunk);
1237
1238 bit_off = off / PCPU_MIN_ALLOC_SIZE;
1239
1240
1241 end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk),
1242 bit_off + 1);
1243 bits = end - bit_off;
1244 bitmap_clear(chunk->alloc_map, bit_off, bits);
1245
1246 freed = bits * PCPU_MIN_ALLOC_SIZE;
1247
1248
1249 chunk->free_bytes += freed;
1250
1251
1252 chunk_md->first_free = min(chunk_md->first_free, bit_off);
1253
1254 pcpu_block_update_hint_free(chunk, bit_off, bits);
1255
1256 pcpu_chunk_relocate(chunk, oslot);
1257
1258 return freed;
1259}
1260
1261static void pcpu_init_md_block(struct pcpu_block_md *block, int nr_bits)
1262{
1263 block->scan_hint = 0;
1264 block->contig_hint = nr_bits;
1265 block->left_free = nr_bits;
1266 block->right_free = nr_bits;
1267 block->first_free = 0;
1268 block->nr_bits = nr_bits;
1269}
1270
1271static void pcpu_init_md_blocks(struct pcpu_chunk *chunk)
1272{
1273 struct pcpu_block_md *md_block;
1274
1275
1276 pcpu_init_md_block(&chunk->chunk_md, pcpu_chunk_map_bits(chunk));
1277
1278 for (md_block = chunk->md_blocks;
1279 md_block != chunk->md_blocks + pcpu_chunk_nr_blocks(chunk);
1280 md_block++)
1281 pcpu_init_md_block(md_block, PCPU_BITMAP_BLOCK_BITS);
1282}

/**
 * pcpu_alloc_first_chunk - creates chunks that serve the first chunk
 * @tmp_addr: the start of the region served
 * @map_size: size of the region served
 *
 * This is responsible for creating the chunks that serve the first chunk.
 * The base_addr is page aligned down of @tmp_addr while the region end is
 * aligned up to a multiple of both PAGE_SIZE and PCPU_BITMAP_BLOCK_SIZE.
 * Offsets are kept track of to determine the region served and to keep the
 * alignment of the bitmaps.
 *
 * RETURNS:
 * Chunk serving the region at @tmp_addr of @map_size.
 */
1297static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
1298 int map_size)
1299{
1300 struct pcpu_chunk *chunk;
1301 unsigned long aligned_addr, lcm_align;
1302 int start_offset, offset_bits, region_size, region_bits;
1303 size_t alloc_size;
1304
1305
1306 aligned_addr = tmp_addr & PAGE_MASK;
1307
1308 start_offset = tmp_addr - aligned_addr;
1309
1310
1311
1312
1313
1314
1315 lcm_align = lcm(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE);
1316 region_size = ALIGN(start_offset + map_size, lcm_align);
1317
1318
1319 alloc_size = struct_size(chunk, populated,
1320 BITS_TO_LONGS(region_size >> PAGE_SHIFT));
1321 chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
1322 if (!chunk)
1323 panic("%s: Failed to allocate %zu bytes\n", __func__,
1324 alloc_size);
1325
1326 INIT_LIST_HEAD(&chunk->list);
1327
1328 chunk->base_addr = (void *)aligned_addr;
1329 chunk->start_offset = start_offset;
1330 chunk->end_offset = region_size - chunk->start_offset - map_size;
1331
1332 chunk->nr_pages = region_size >> PAGE_SHIFT;
1333 region_bits = pcpu_chunk_map_bits(chunk);
1334
1335 alloc_size = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]);
1336 chunk->alloc_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
1337 if (!chunk->alloc_map)
1338 panic("%s: Failed to allocate %zu bytes\n", __func__,
1339 alloc_size);
1340
1341 alloc_size =
1342 BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]);
1343 chunk->bound_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
1344 if (!chunk->bound_map)
1345 panic("%s: Failed to allocate %zu bytes\n", __func__,
1346 alloc_size);
1347
1348 alloc_size = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]);
1349 chunk->md_blocks = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
1350 if (!chunk->md_blocks)
1351 panic("%s: Failed to allocate %zu bytes\n", __func__,
1352 alloc_size);
1353
1354#ifdef CONFIG_MEMCG_KMEM
1355
1356 chunk->obj_cgroups = NULL;
1357#endif
1358 pcpu_init_md_blocks(chunk);
1359
1360
1361 chunk->immutable = true;
1362 bitmap_fill(chunk->populated, chunk->nr_pages);
1363 chunk->nr_populated = chunk->nr_pages;
1364 chunk->nr_empty_pop_pages = chunk->nr_pages;
1365
1366 chunk->free_bytes = map_size;
1367
1368 if (chunk->start_offset) {
1369
1370 offset_bits = chunk->start_offset / PCPU_MIN_ALLOC_SIZE;
1371 bitmap_set(chunk->alloc_map, 0, offset_bits);
1372 set_bit(0, chunk->bound_map);
1373 set_bit(offset_bits, chunk->bound_map);
1374
1375 chunk->chunk_md.first_free = offset_bits;
1376
1377 pcpu_block_update_hint_alloc(chunk, 0, offset_bits);
1378 }
1379
1380 if (chunk->end_offset) {
1381
1382 offset_bits = chunk->end_offset / PCPU_MIN_ALLOC_SIZE;
1383 bitmap_set(chunk->alloc_map,
1384 pcpu_chunk_map_bits(chunk) - offset_bits,
1385 offset_bits);
1386 set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE,
1387 chunk->bound_map);
1388 set_bit(region_bits, chunk->bound_map);
1389
1390 pcpu_block_update_hint_alloc(chunk, pcpu_chunk_map_bits(chunk)
1391 - offset_bits, offset_bits);
1392 }
1393
1394 return chunk;
1395}
1396
1397static struct pcpu_chunk *pcpu_alloc_chunk(enum pcpu_chunk_type type, gfp_t gfp)
1398{
1399 struct pcpu_chunk *chunk;
1400 int region_bits;
1401
1402 chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp);
1403 if (!chunk)
1404 return NULL;
1405
1406 INIT_LIST_HEAD(&chunk->list);
1407 chunk->nr_pages = pcpu_unit_pages;
1408 region_bits = pcpu_chunk_map_bits(chunk);
1409
1410 chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) *
1411 sizeof(chunk->alloc_map[0]), gfp);
1412 if (!chunk->alloc_map)
1413 goto alloc_map_fail;
1414
1415 chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) *
1416 sizeof(chunk->bound_map[0]), gfp);
1417 if (!chunk->bound_map)
1418 goto bound_map_fail;
1419
1420 chunk->md_blocks = pcpu_mem_zalloc(pcpu_chunk_nr_blocks(chunk) *
1421 sizeof(chunk->md_blocks[0]), gfp);
1422 if (!chunk->md_blocks)
1423 goto md_blocks_fail;
1424
1425#ifdef CONFIG_MEMCG_KMEM
1426 if (pcpu_is_memcg_chunk(type)) {
1427 chunk->obj_cgroups =
1428 pcpu_mem_zalloc(pcpu_chunk_map_bits(chunk) *
1429 sizeof(struct obj_cgroup *), gfp);
1430 if (!chunk->obj_cgroups)
1431 goto objcg_fail;
1432 }
1433#endif
1434
1435 pcpu_init_md_blocks(chunk);
1436
1437
1438 chunk->free_bytes = chunk->nr_pages * PAGE_SIZE;
1439
1440 return chunk;
1441
1442#ifdef CONFIG_MEMCG_KMEM
1443objcg_fail:
1444 pcpu_mem_free(chunk->md_blocks);
1445#endif
1446md_blocks_fail:
1447 pcpu_mem_free(chunk->bound_map);
1448bound_map_fail:
1449 pcpu_mem_free(chunk->alloc_map);
1450alloc_map_fail:
1451 pcpu_mem_free(chunk);
1452
1453 return NULL;
1454}
1455
1456static void pcpu_free_chunk(struct pcpu_chunk *chunk)
1457{
1458 if (!chunk)
1459 return;
1460#ifdef CONFIG_MEMCG_KMEM
1461 pcpu_mem_free(chunk->obj_cgroups);
1462#endif
1463 pcpu_mem_free(chunk->md_blocks);
1464 pcpu_mem_free(chunk->bound_map);
1465 pcpu_mem_free(chunk->alloc_map);
1466 pcpu_mem_free(chunk);
1467}

/**
 * pcpu_chunk_populated - post-population bookkeeping
 * @chunk: pcpu_chunk which got populated
 * @page_start: the start page
 * @page_end: the end page
 *
 * Pages in [@page_start,@page_end) have been populated to @chunk.  Update
 * the bookkeeping information accordingly.  Must be called after each
 * successful population.
 */
1482static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
1483 int page_end)
1484{
1485 int nr = page_end - page_start;
1486
1487 lockdep_assert_held(&pcpu_lock);
1488
1489 bitmap_set(chunk->populated, page_start, nr);
1490 chunk->nr_populated += nr;
1491 pcpu_nr_populated += nr;
1492
1493 pcpu_update_empty_pages(chunk, nr);
1494}

/**
 * pcpu_chunk_depopulated - post-depopulation bookkeeping
 * @chunk: pcpu_chunk which got depopulated
 * @page_start: the start page
 * @page_end: the end page
 *
 * Pages in [@page_start,@page_end) have been depopulated from @chunk.
 * Update the bookkeeping information accordingly.  Must be called after
 * each successful depopulation.
 */
1506static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
1507 int page_start, int page_end)
1508{
1509 int nr = page_end - page_start;
1510
1511 lockdep_assert_held(&pcpu_lock);
1512
1513 bitmap_clear(chunk->populated, page_start, nr);
1514 chunk->nr_populated -= nr;
1515 pcpu_nr_populated -= nr;
1516
1517 pcpu_update_empty_pages(chunk, -nr);
1518}

/*
 * Chunk management implementation.
 *
 * To allow different implementations, chunk alloc/free and
 * [de]population are implemented in a separate file which is pulled
 * into this file and compiled together.  The following functions
 * should be implemented.
 *
 * pcpu_populate_chunk		- populate the specified range of a chunk
 * pcpu_depopulate_chunk	- depopulate the specified range of a chunk
 * pcpu_create_chunk		- create a new chunk
 * pcpu_destroy_chunk		- destroy a chunk, always preceded by full depop
 * pcpu_addr_to_page		- translate address to physical page
 * pcpu_verify_alloc_info	- check alloc_info is acceptable during init
 */
1535static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
1536 int page_start, int page_end, gfp_t gfp);
1537static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
1538 int page_start, int page_end);
1539static struct pcpu_chunk *pcpu_create_chunk(enum pcpu_chunk_type type,
1540 gfp_t gfp);
1541static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
1542static struct page *pcpu_addr_to_page(void *addr);
1543static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
1544
1545#ifdef CONFIG_NEED_PER_CPU_KM
1546#include "percpu-km.c"
1547#else
1548#include "percpu-vm.c"
1549#endif

/**
 * pcpu_chunk_addr_search - determine chunk containing specified address
 * @addr: address for which the chunk needs to be determined.
 *
 * This is an internal function that handles all but static allocations.
 * Static percpu address values should never be passed into the allocator.
 *
 * RETURNS:
 * The address of the found chunk.
 */
1561static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
1562{
1563
1564 if (pcpu_addr_in_chunk(pcpu_first_chunk, addr))
1565 return pcpu_first_chunk;
1566
1567
1568 if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr))
1569 return pcpu_reserved_chunk;
1570
1571
1572
1573
1574
1575
1576
1577
1578 addr += pcpu_unit_offsets[raw_smp_processor_id()];
1579 return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
1580}
1581
1582#ifdef CONFIG_MEMCG_KMEM
1583static enum pcpu_chunk_type pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
1584 struct obj_cgroup **objcgp)
1585{
1586 struct obj_cgroup *objcg;
1587
1588 if (!memcg_kmem_enabled() || !(gfp & __GFP_ACCOUNT))
1589 return PCPU_CHUNK_ROOT;
1590
1591 objcg = get_obj_cgroup_from_current();
1592 if (!objcg)
1593 return PCPU_CHUNK_ROOT;
1594
1595 if (obj_cgroup_charge(objcg, gfp, size * num_possible_cpus())) {
1596 obj_cgroup_put(objcg);
1597 return PCPU_FAIL_ALLOC;
1598 }
1599
1600 *objcgp = objcg;
1601 return PCPU_CHUNK_MEMCG;
1602}
1603
1604static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
1605 struct pcpu_chunk *chunk, int off,
1606 size_t size)
1607{
1608 if (!objcg)
1609 return;
1610
1611 if (chunk) {
1612 chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = objcg;
1613
1614 rcu_read_lock();
1615 mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
1616 size * num_possible_cpus());
1617 rcu_read_unlock();
1618 } else {
1619 obj_cgroup_uncharge(objcg, size * num_possible_cpus());
1620 obj_cgroup_put(objcg);
1621 }
1622}
1623
1624static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
1625{
1626 struct obj_cgroup *objcg;
1627
1628 if (!pcpu_is_memcg_chunk(pcpu_chunk_type(chunk)))
1629 return;
1630
1631 objcg = chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT];
1632 chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = NULL;
1633
1634 obj_cgroup_uncharge(objcg, size * num_possible_cpus());
1635
1636 rcu_read_lock();
1637 mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
1638 -(size * num_possible_cpus()));
1639 rcu_read_unlock();
1640
1641 obj_cgroup_put(objcg);
1642}
1643
1644#else
1645static enum pcpu_chunk_type
1646pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp)
1647{
1648 return PCPU_CHUNK_ROOT;
1649}
1650
1651static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
1652 struct pcpu_chunk *chunk, int off,
1653 size_t size)
1654{
1655}
1656
1657static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
1658{
1659}
1660#endif
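
/**
 * pcpu_alloc - the percpu allocator
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 * @reserved: allocate from the reserved chunk if available
 * @gfp: allocation flags
 *
 * Allocate percpu area of @size bytes aligned at @align.  If @gfp doesn't
 * contain %GFP_KERNEL, the allocation is atomic.  If @gfp has __GFP_NOWARN
 * then no warning will be triggered on invalid or failed allocation
 * requests.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */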
1677static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
1678 gfp_t gfp)
1679{
1680 gfp_t pcpu_gfp;
1681 bool is_atomic;
1682 bool do_warn;
1683 enum pcpu_chunk_type type;
1684 struct list_head *pcpu_slot;
1685 struct obj_cgroup *objcg = NULL;
1686 static int warn_limit = 10;
1687 struct pcpu_chunk *chunk, *next;
1688 const char *err;
1689 int slot, off, cpu, ret;
1690 unsigned long flags;
1691 void __percpu *ptr;
1692 size_t bits, bit_align;
1693
1694 gfp = current_gfp_context(gfp);
1695
1696 pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
1697 is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
1698 do_warn = !(gfp & __GFP_NOWARN);
1699
1700
1701
1702
1703
1704
1705
1706 if (unlikely(align < PCPU_MIN_ALLOC_SIZE))
1707 align = PCPU_MIN_ALLOC_SIZE;
1708
1709 size = ALIGN(size, PCPU_MIN_ALLOC_SIZE);
1710 bits = size >> PCPU_MIN_ALLOC_SHIFT;
1711 bit_align = align >> PCPU_MIN_ALLOC_SHIFT;
1712
1713 if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
1714 !is_power_of_2(align))) {
1715 WARN(do_warn, "illegal size (%zu) or align (%zu) for percpu allocation\n",
1716 size, align);
1717 return NULL;
1718 }
1719
1720 type = pcpu_memcg_pre_alloc_hook(size, gfp, &objcg);
1721 if (unlikely(type == PCPU_FAIL_ALLOC))
1722 return NULL;
1723 pcpu_slot = pcpu_chunk_list(type);
1724
1725 if (!is_atomic) {
1726
1727
1728
1729
1730
1731 if (gfp & __GFP_NOFAIL) {
1732 mutex_lock(&pcpu_alloc_mutex);
1733 } else if (mutex_lock_killable(&pcpu_alloc_mutex)) {
1734 pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);
1735 return NULL;
1736 }
1737 }
1738
1739 spin_lock_irqsave(&pcpu_lock, flags);
1740
1741
1742 if (reserved && pcpu_reserved_chunk) {
1743 chunk = pcpu_reserved_chunk;
1744
1745 off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic);
1746 if (off < 0) {
1747 err = "alloc from reserved chunk failed";
1748 goto fail_unlock;
1749 }
1750
1751 off = pcpu_alloc_area(chunk, bits, bit_align, off);
1752 if (off >= 0)
1753 goto area_found;
1754
1755 err = "alloc from reserved chunk failed";
1756 goto fail_unlock;
1757 }
1758
1759restart:
1760
1761 for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
1762 list_for_each_entry_safe(chunk, next, &pcpu_slot[slot], list) {
1763 off = pcpu_find_block_fit(chunk, bits, bit_align,
1764 is_atomic);
1765 if (off < 0) {
1766 if (slot < PCPU_SLOT_FAIL_THRESHOLD)
1767 pcpu_chunk_move(chunk, 0);
1768 continue;
1769 }
1770
1771 off = pcpu_alloc_area(chunk, bits, bit_align, off);
1772 if (off >= 0)
1773 goto area_found;
1774
1775 }
1776 }
1777
1778 spin_unlock_irqrestore(&pcpu_lock, flags);
1779
1780
1781
1782
1783
1784
1785 if (is_atomic) {
1786 err = "atomic alloc failed, no space left";
1787 goto fail;
1788 }
1789
1790 if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
1791 chunk = pcpu_create_chunk(type, pcpu_gfp);
1792 if (!chunk) {
1793 err = "failed to allocate new chunk";
1794 goto fail;
1795 }
1796
1797 spin_lock_irqsave(&pcpu_lock, flags);
1798 pcpu_chunk_relocate(chunk, -1);
1799 } else {
1800 spin_lock_irqsave(&pcpu_lock, flags);
1801 }
1802
1803 goto restart;
1804
1805area_found:
1806 pcpu_stats_area_alloc(chunk, size);
1807 spin_unlock_irqrestore(&pcpu_lock, flags);
1808
1809
1810 if (!is_atomic) {
1811 unsigned int page_start, page_end, rs, re;
1812
1813 page_start = PFN_DOWN(off);
1814 page_end = PFN_UP(off + size);
1815
1816 bitmap_for_each_clear_region(chunk->populated, rs, re,
1817 page_start, page_end) {
1818 WARN_ON(chunk->immutable);
1819
1820 ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp);
1821
1822 spin_lock_irqsave(&pcpu_lock, flags);
1823 if (ret) {
1824 pcpu_free_area(chunk, off);
1825 err = "failed to populate";
1826 goto fail_unlock;
1827 }
1828 pcpu_chunk_populated(chunk, rs, re);
1829 spin_unlock_irqrestore(&pcpu_lock, flags);
1830 }
1831
1832 mutex_unlock(&pcpu_alloc_mutex);
1833 }
1834
1835 if (pcpu_nr_empty_pop_pages[type] < PCPU_EMPTY_POP_PAGES_LOW)
1836 pcpu_schedule_balance_work();
1837
1838
1839 for_each_possible_cpu(cpu)
1840 memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
1841
1842 ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
1843 kmemleak_alloc_percpu(ptr, size, gfp);
1844
1845 trace_percpu_alloc_percpu(reserved, is_atomic, size, align,
1846 chunk->base_addr, off, ptr);
1847
1848 pcpu_memcg_post_alloc_hook(objcg, chunk, off, size);
1849
1850 return ptr;
1851
1852fail_unlock:
1853 spin_unlock_irqrestore(&pcpu_lock, flags);
1854fail:
1855 trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);
1856
1857 if (!is_atomic && do_warn && warn_limit) {
1858 pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
1859 size, align, is_atomic, err);
1860 dump_stack();
1861 if (!--warn_limit)
1862 pr_info("limit reached, disable warning\n");
1863 }
1864 if (is_atomic) {
1865
1866 pcpu_atomic_alloc_failed = true;
1867 pcpu_schedule_balance_work();
1868 } else {
1869 mutex_unlock(&pcpu_alloc_mutex);
1870 }
1871
1872 pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);
1873
1874 return NULL;
1875}

/**
 * __alloc_percpu_gfp - allocate dynamic percpu area
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 * @gfp: allocation flags
 *
 * Allocate zero-filled percpu area of @size bytes aligned at @align.  If
 * @gfp doesn't contain %GFP_KERNEL, the allocation is atomic and does not
 * block.  The returned percpu address should be freed with free_percpu().
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
1892void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp)
1893{
1894 return pcpu_alloc(size, align, false, gfp);
1895}
1896EXPORT_SYMBOL_GPL(__alloc_percpu_gfp);

/**
 * __alloc_percpu - allocate dynamic percpu area
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 *
 * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL).
 */
1905void __percpu *__alloc_percpu(size_t size, size_t align)
1906{
1907 return pcpu_alloc(size, align, false, GFP_KERNEL);
1908}
1909EXPORT_SYMBOL_GPL(__alloc_percpu);
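
/*
 * Typical use of the dynamic percpu API exported above (illustrative
 * sketch only; "my_counters" is a made-up name, not part of this file):
 *
 *	static int __percpu *my_counters;
 *
 *	my_counters = alloc_percpu(int);	// wraps __alloc_percpu()
 *	if (!my_counters)
 *		return -ENOMEM;
 *
 *	this_cpu_inc(*my_counters);		// fast path on the local CPU
 *
 *	int cpu, sum = 0;
 *	for_each_possible_cpu(cpu)		// slow path: walk all CPUs
 *		sum += *per_cpu_ptr(my_counters, cpu);
 *
 *	free_percpu(my_counters);
 */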

/**
 * __alloc_reserved_percpu - allocate reserved percpu area
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 *
 * Allocate zero-filled percpu area of @size bytes aligned at @align
 * from the reserved percpu area if the arch has set it up; otherwise,
 * allocation is served from the same dynamic area.  Might sleep.
 * Might trigger writeouts.
 *
 * CONTEXT:
 * Does GFP_KERNEL allocation.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
1927void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
1928{
1929 return pcpu_alloc(size, align, true, GFP_KERNEL);
1930}

/**
 * __pcpu_balance_workfn - manage the amount of free chunks and populated pages
 * @type: chunk type
 *
 * Reclaim all fully free chunks except for the first one.  Then refill the
 * pool of empty populated pages so that atomic allocations can be served.
 */
1943static void __pcpu_balance_workfn(enum pcpu_chunk_type type)
1944{
1945
1946 const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
1947 LIST_HEAD(to_free);
1948 struct list_head *pcpu_slot = pcpu_chunk_list(type);
1949 struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
1950 struct pcpu_chunk *chunk, *next;
1951 int slot, nr_to_pop, ret;
1952
1953
1954
1955
1956
1957 mutex_lock(&pcpu_alloc_mutex);
1958 spin_lock_irq(&pcpu_lock);
1959
1960 list_for_each_entry_safe(chunk, next, free_head, list) {
1961 WARN_ON(chunk->immutable);
1962
1963
1964 if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
1965 continue;
1966
1967 list_move(&chunk->list, &to_free);
1968 }
1969
1970 spin_unlock_irq(&pcpu_lock);
1971
1972 list_for_each_entry_safe(chunk, next, &to_free, list) {
1973 unsigned int rs, re;
1974
1975 bitmap_for_each_set_region(chunk->populated, rs, re, 0,
1976 chunk->nr_pages) {
1977 pcpu_depopulate_chunk(chunk, rs, re);
1978 spin_lock_irq(&pcpu_lock);
1979 pcpu_chunk_depopulated(chunk, rs, re);
1980 spin_unlock_irq(&pcpu_lock);
1981 }
1982 pcpu_destroy_chunk(chunk);
1983 cond_resched();
1984 }
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996retry_pop:
1997 if (pcpu_atomic_alloc_failed) {
1998 nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
1999
2000 pcpu_atomic_alloc_failed = false;
2001 } else {
2002 nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
2003 pcpu_nr_empty_pop_pages[type],
2004 0, PCPU_EMPTY_POP_PAGES_HIGH);
2005 }
2006
2007 for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
2008 unsigned int nr_unpop = 0, rs, re;
2009
2010 if (!nr_to_pop)
2011 break;
2012
2013 spin_lock_irq(&pcpu_lock);
2014 list_for_each_entry(chunk, &pcpu_slot[slot], list) {
2015 nr_unpop = chunk->nr_pages - chunk->nr_populated;
2016 if (nr_unpop)
2017 break;
2018 }
2019 spin_unlock_irq(&pcpu_lock);
2020
2021 if (!nr_unpop)
2022 continue;
2023
2024
2025 bitmap_for_each_clear_region(chunk->populated, rs, re, 0,
2026 chunk->nr_pages) {
2027 int nr = min_t(int, re - rs, nr_to_pop);
2028
2029 ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp);
2030 if (!ret) {
2031 nr_to_pop -= nr;
2032 spin_lock_irq(&pcpu_lock);
2033 pcpu_chunk_populated(chunk, rs, rs + nr);
2034 spin_unlock_irq(&pcpu_lock);
2035 } else {
2036 nr_to_pop = 0;
2037 }
2038
2039 if (!nr_to_pop)
2040 break;
2041 }
2042 }
2043
2044 if (nr_to_pop) {
2045
2046 chunk = pcpu_create_chunk(type, gfp);
2047 if (chunk) {
2048 spin_lock_irq(&pcpu_lock);
2049 pcpu_chunk_relocate(chunk, -1);
2050 spin_unlock_irq(&pcpu_lock);
2051 goto retry_pop;
2052 }
2053 }
2054
2055 mutex_unlock(&pcpu_alloc_mutex);
2056}

/**
 * pcpu_balance_workfn - workqueue function to balance chunks of all types
 * @work: unused
 */
2064static void pcpu_balance_workfn(struct work_struct *work)
2065{
2066 enum pcpu_chunk_type type;
2067
2068 for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
2069 __pcpu_balance_workfn(type);
2070}
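
/**
 * free_percpu - free percpu area
 * @ptr: pointer to area to free
 *
 * Free percpu area @ptr.
 *
 * CONTEXT:
 * Can be called from atomic context.
 */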
2081void free_percpu(void __percpu *ptr)
2082{
2083 void *addr;
2084 struct pcpu_chunk *chunk;
2085 unsigned long flags;
2086 int size, off;
2087 bool need_balance = false;
2088 struct list_head *pcpu_slot;
2089
2090 if (!ptr)
2091 return;
2092
2093 kmemleak_free_percpu(ptr);
2094
2095 addr = __pcpu_ptr_to_addr(ptr);
2096
2097 spin_lock_irqsave(&pcpu_lock, flags);
2098
2099 chunk = pcpu_chunk_addr_search(addr);
2100 off = addr - chunk->base_addr;
2101
2102 size = pcpu_free_area(chunk, off);
2103
2104 pcpu_slot = pcpu_chunk_list(pcpu_chunk_type(chunk));
2105
2106 pcpu_memcg_free_hook(chunk, off, size);
2107
2108
2109 if (chunk->free_bytes == pcpu_unit_size) {
2110 struct pcpu_chunk *pos;
2111
2112 list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
2113 if (pos != chunk) {
2114 need_balance = true;
2115 break;
2116 }
2117 }
2118
2119 trace_percpu_free_percpu(chunk->base_addr, off, ptr);
2120
2121 spin_unlock_irqrestore(&pcpu_lock, flags);
2122
2123 if (need_balance)
2124 pcpu_schedule_balance_work();
2125}
2126EXPORT_SYMBOL_GPL(free_percpu);
2127
2128bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
2129{
2130#ifdef CONFIG_SMP
2131 const size_t static_size = __per_cpu_end - __per_cpu_start;
2132 void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
2133 unsigned int cpu;
2134
2135 for_each_possible_cpu(cpu) {
2136 void *start = per_cpu_ptr(base, cpu);
2137 void *va = (void *)addr;
2138
2139 if (va >= start && va < start + static_size) {
2140 if (can_addr) {
2141 *can_addr = (unsigned long) (va - start);
2142 *can_addr += (unsigned long)
2143 per_cpu_ptr(base, get_boot_cpu_id());
2144 }
2145 return true;
2146 }
2147 }
2148#endif
2149
2150 return false;
2151}

/**
 * is_kernel_percpu_address - test whether address is from static percpu area
 * @addr: address to test
 *
 * Test whether @addr belongs to in-kernel static percpu area.  Module
 * static percpu areas are not considered.  For those, use
 * is_module_percpu_address().
 *
 * RETURNS:
 * %true if @addr is from in-kernel static percpu area, %false otherwise.
 */
2164bool is_kernel_percpu_address(unsigned long addr)
2165{
2166 return __is_kernel_percpu_address(addr, NULL);
2167}

/**
 * per_cpu_ptr_to_phys - convert translated percpu address to physical address
 * @addr: the address to be converted to physical address
 *
 * Given @addr which is dereferenceable address obtained via one of the
 * percpu access macros, this function translates it into its physical
 * address.  The caller is responsible for ensuring @addr stays valid
 * until this function finishes.
 *
 * The first chunk may be embedded in the linear address space or mapped
 * through vmalloc; later chunks are translated by the backing allocator.
 * The address is therefore checked against the first chunk before the
 * generic translation is used.
 *
 * RETURNS:
 * The physical address for @addr.
 */
2192phys_addr_t per_cpu_ptr_to_phys(void *addr)
2193{
2194 void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
2195 bool in_first_chunk = false;
2196 unsigned long first_low, first_high;
2197 unsigned int cpu;
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209 first_low = (unsigned long)pcpu_base_addr +
2210 pcpu_unit_page_offset(pcpu_low_unit_cpu, 0);
2211 first_high = (unsigned long)pcpu_base_addr +
2212 pcpu_unit_page_offset(pcpu_high_unit_cpu, pcpu_unit_pages);
2213 if ((unsigned long)addr >= first_low &&
2214 (unsigned long)addr < first_high) {
2215 for_each_possible_cpu(cpu) {
2216 void *start = per_cpu_ptr(base, cpu);
2217
2218 if (addr >= start && addr < start + pcpu_unit_size) {
2219 in_first_chunk = true;
2220 break;
2221 }
2222 }
2223 }
2224
2225 if (in_first_chunk) {
2226 if (!is_vmalloc_addr(addr))
2227 return __pa(addr);
2228 else
2229 return page_to_phys(vmalloc_to_page(addr)) +
2230 offset_in_page(addr);
2231 } else
2232 return page_to_phys(pcpu_addr_to_page(addr)) +
2233 offset_in_page(addr);
2234}

/**
 * pcpu_alloc_alloc_info - allocate percpu allocation info
 * @nr_groups: the number of groups
 * @nr_units: the number of units
 *
 * Allocate ai which is large enough for @nr_groups groups containing
 * @nr_units units.  The returned ai's groups[0].cpu_map points to the
 * cpu_map array which is long enough for @nr_units and filled with
 * NR_CPUS.  It's the caller's responsibility to initialize the cpu_map
 * pointer of other groups.
 *
 * RETURNS:
 * Pointer to the allocated pcpu_alloc_info on success, NULL on failure.
 */
2251struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
2252 int nr_units)
2253{
2254 struct pcpu_alloc_info *ai;
2255 size_t base_size, ai_size;
2256 void *ptr;
2257 int unit;
2258
2259 base_size = ALIGN(struct_size(ai, groups, nr_groups),
2260 __alignof__(ai->groups[0].cpu_map[0]));
2261 ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
2262
2263 ptr = memblock_alloc(PFN_ALIGN(ai_size), PAGE_SIZE);
2264 if (!ptr)
2265 return NULL;
2266 ai = ptr;
2267 ptr += base_size;
2268
2269 ai->groups[0].cpu_map = ptr;
2270
2271 for (unit = 0; unit < nr_units; unit++)
2272 ai->groups[0].cpu_map[unit] = NR_CPUS;
2273
2274 ai->nr_groups = nr_groups;
2275 ai->__ai_size = PFN_ALIGN(ai_size);
2276
2277 return ai;
2278}

/**
 * pcpu_free_alloc_info - free percpu allocation info
 * @ai: pcpu_alloc_info to free
 *
 * Free @ai which was allocated by pcpu_alloc_alloc_info().
 */
2286void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
2287{
2288 memblock_free_early(__pa(ai), ai->__ai_size);
2289}

/**
 * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
 * @lvl: loglevel
 * @ai: allocation info to dump
 *
 * Print out information about @ai using loglevel @lvl.
 */
2298static void pcpu_dump_alloc_info(const char *lvl,
2299 const struct pcpu_alloc_info *ai)
2300{
2301 int group_width = 1, cpu_width = 1, width;
2302 char empty_str[] = "--------";
2303 int alloc = 0, alloc_end = 0;
2304 int group, v;
2305 int upa, apl;
2306
2307 v = ai->nr_groups;
2308 while (v /= 10)
2309 group_width++;
2310
2311 v = num_possible_cpus();
2312 while (v /= 10)
2313 cpu_width++;
2314 empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';
2315
2316 upa = ai->alloc_size / ai->unit_size;
2317 width = upa * (cpu_width + 1) + group_width + 3;
2318 apl = rounddown_pow_of_two(max(60 / width, 1));
2319
2320 printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
2321 lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
2322 ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);
2323
2324 for (group = 0; group < ai->nr_groups; group++) {
2325 const struct pcpu_group_info *gi = &ai->groups[group];
2326 int unit = 0, unit_end = 0;
2327
2328 BUG_ON(gi->nr_units % upa);
2329 for (alloc_end += gi->nr_units / upa;
2330 alloc < alloc_end; alloc++) {
2331 if (!(alloc % apl)) {
2332 pr_cont("\n");
2333 printk("%spcpu-alloc: ", lvl);
2334 }
2335 pr_cont("[%0*d] ", group_width, group);
2336
2337 for (unit_end += upa; unit < unit_end; unit++)
2338 if (gi->cpu_map[unit] != NR_CPUS)
2339 pr_cont("%0*d ",
2340 cpu_width, gi->cpu_map[unit]);
2341 else
2342 pr_cont("%s ", empty_str);
2343 }
2344 }
2345 pr_cont("\n");
2346}

/**
 * pcpu_setup_first_chunk - initialize the first percpu chunk
 * @ai: pcpu_alloc_info describing how to percpu area is shaped
 * @base_addr: mapped address
 *
 * Initialize the first percpu chunk which contains the kernel static
 * percpu area.  This function is to be called from arch percpu area
 * setup path.
 *
 * @ai contains all information necessary to initialize the first chunk
 * and prime the dynamic percpu allocator: static_size, reserved_size,
 * dyn_size, unit_size, atom_size, alloc_size and the group/unit to cpu
 * mappings.  If @ai->reserved_size is non-zero, a separate reserved
 * chunk is set up to serve reserved (per-module static) allocations;
 * dynamic allocations are then served from the following chunk.
 *
 * The caller should have mapped the first chunk at @base_addr and copied
 * the static data into each unit before calling this function.
 */
2402void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
2403 void *base_addr)
2404{
2405 size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
2406 size_t static_size, dyn_size;
2407 struct pcpu_chunk *chunk;
2408 unsigned long *group_offsets;
2409 size_t *group_sizes;
2410 unsigned long *unit_off;
2411 unsigned int cpu;
2412 int *unit_map;
2413 int group, unit, i;
2414 int map_size;
2415 unsigned long tmp_addr;
2416 size_t alloc_size;
2417 enum pcpu_chunk_type type;
2418
2419#define PCPU_SETUP_BUG_ON(cond) do { \
2420 if (unlikely(cond)) { \
2421 pr_emerg("failed to initialize, %s\n", #cond); \
2422 pr_emerg("cpu_possible_mask=%*pb\n", \
2423 cpumask_pr_args(cpu_possible_mask)); \
2424 pcpu_dump_alloc_info(KERN_EMERG, ai); \
2425 BUG(); \
2426 } \
2427} while (0)
2428
2429
2430 PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
2431#ifdef CONFIG_SMP
2432 PCPU_SETUP_BUG_ON(!ai->static_size);
2433 PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start));
2434#endif
2435 PCPU_SETUP_BUG_ON(!base_addr);
2436 PCPU_SETUP_BUG_ON(offset_in_page(base_addr));
2437 PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
2438 PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size));
2439 PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
2440 PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE));
2441 PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
2442 PCPU_SETUP_BUG_ON(!ai->dyn_size);
2443 PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE));
2444 PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) ||
2445 IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE)));
2446 PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
2447
2448
2449 alloc_size = ai->nr_groups * sizeof(group_offsets[0]);
2450 group_offsets = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
2451 if (!group_offsets)
2452 panic("%s: Failed to allocate %zu bytes\n", __func__,
2453 alloc_size);
2454
2455 alloc_size = ai->nr_groups * sizeof(group_sizes[0]);
2456 group_sizes = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
2457 if (!group_sizes)
2458 panic("%s: Failed to allocate %zu bytes\n", __func__,
2459 alloc_size);
2460
2461 alloc_size = nr_cpu_ids * sizeof(unit_map[0]);
2462 unit_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
2463 if (!unit_map)
2464 panic("%s: Failed to allocate %zu bytes\n", __func__,
2465 alloc_size);
2466
2467 alloc_size = nr_cpu_ids * sizeof(unit_off[0]);
2468 unit_off = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
2469 if (!unit_off)
2470 panic("%s: Failed to allocate %zu bytes\n", __func__,
2471 alloc_size);
2472
2473 for (cpu = 0; cpu < nr_cpu_ids; cpu++)
2474 unit_map[cpu] = UINT_MAX;
2475
2476 pcpu_low_unit_cpu = NR_CPUS;
2477 pcpu_high_unit_cpu = NR_CPUS;
2478
2479 for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
2480 const struct pcpu_group_info *gi = &ai->groups[group];
2481
2482 group_offsets[group] = gi->base_offset;
2483 group_sizes[group] = gi->nr_units * ai->unit_size;
2484
2485 for (i = 0; i < gi->nr_units; i++) {
2486 cpu = gi->cpu_map[i];
2487 if (cpu == NR_CPUS)
2488 continue;
2489
2490 PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids);
2491 PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
2492 PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);
2493
2494 unit_map[cpu] = unit + i;
2495 unit_off[cpu] = gi->base_offset + i * ai->unit_size;
2496
2497
2498 if (pcpu_low_unit_cpu == NR_CPUS ||
2499 unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
2500 pcpu_low_unit_cpu = cpu;
2501 if (pcpu_high_unit_cpu == NR_CPUS ||
2502 unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
2503 pcpu_high_unit_cpu = cpu;
2504 }
2505 }
2506 pcpu_nr_units = unit;
2507
2508 for_each_possible_cpu(cpu)
2509 PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);
2510
2511
2512#undef PCPU_SETUP_BUG_ON
2513 pcpu_dump_alloc_info(KERN_DEBUG, ai);
2514
2515 pcpu_nr_groups = ai->nr_groups;
2516 pcpu_group_offsets = group_offsets;
2517 pcpu_group_sizes = group_sizes;
2518 pcpu_unit_map = unit_map;
2519 pcpu_unit_offsets = unit_off;
2520
2521
2522 pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
2523 pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
2524 pcpu_atom_size = ai->atom_size;
2525 pcpu_chunk_struct_size = struct_size(chunk, populated,
2526 BITS_TO_LONGS(pcpu_unit_pages));
2527
2528 pcpu_stats_save_ai(ai);
2529
2530
2531
2532
2533
2534 pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
2535 pcpu_chunk_lists = memblock_alloc(pcpu_nr_slots *
2536 sizeof(pcpu_chunk_lists[0]) *
2537 PCPU_NR_CHUNK_TYPES,
2538 SMP_CACHE_BYTES);
2539 if (!pcpu_chunk_lists)
2540 panic("%s: Failed to allocate %zu bytes\n", __func__,
2541 pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]) *
2542 PCPU_NR_CHUNK_TYPES);
2543
2544 for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
2545 for (i = 0; i < pcpu_nr_slots; i++)
2546 INIT_LIST_HEAD(&pcpu_chunk_list(type)[i]);
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556 static_size = ALIGN(ai->static_size, PCPU_MIN_ALLOC_SIZE);
2557 dyn_size = ai->dyn_size - (static_size - ai->static_size);
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567 tmp_addr = (unsigned long)base_addr + static_size;
2568 map_size = ai->reserved_size ?: dyn_size;
2569 chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
2570
2571
2572 if (ai->reserved_size) {
2573 pcpu_reserved_chunk = chunk;
2574
2575 tmp_addr = (unsigned long)base_addr + static_size +
2576 ai->reserved_size;
2577 map_size = dyn_size;
2578 chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
2579 }
2580
2581
2582 pcpu_first_chunk = chunk;
2583 pcpu_nr_empty_pop_pages[PCPU_CHUNK_ROOT] = pcpu_first_chunk->nr_empty_pop_pages;
2584 pcpu_chunk_relocate(pcpu_first_chunk, -1);
2585
2586
2587 pcpu_nr_populated += PFN_DOWN(size_sum);
2588
2589 pcpu_stats_chunk_alloc();
2590 trace_percpu_create_chunk(base_addr);
2591
2592
2593 pcpu_base_addr = base_addr;
2594}
2595
2596#ifdef CONFIG_SMP
2597
2598const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = {
2599 [PCPU_FC_AUTO] = "auto",
2600 [PCPU_FC_EMBED] = "embed",
2601 [PCPU_FC_PAGE] = "page",
2602};
2603
2604enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
2605
2606static int __init percpu_alloc_setup(char *str)
2607{
2608 if (!str)
2609 return -EINVAL;
2610
2611 if (0)
2612 ;
2613#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
2614 else if (!strcmp(str, "embed"))
2615 pcpu_chosen_fc = PCPU_FC_EMBED;
2616#endif
2617#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
2618 else if (!strcmp(str, "page"))
2619 pcpu_chosen_fc = PCPU_FC_PAGE;
2620#endif
2621 else
2622 pr_warn("unknown allocator %s specified\n", str);
2623
2624 return 0;
2625}
2626early_param("percpu_alloc", percpu_alloc_setup);
2627
2628
2629
2630
2631
2632
2633#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
2634 !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
2635#define BUILD_EMBED_FIRST_CHUNK
2636#endif
2637
2638
2639#if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
2640#define BUILD_PAGE_FIRST_CHUNK
2641#endif
2642
2643
2644#if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)

/**
 * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
 * @reserved_size: the size of reserved percpu area in bytes
 * @dyn_size: minimum free size for dynamic allocation in bytes
 * @atom_size: allocation atom size
 * @cpu_distance_fn: callback to determine distance between cpus, optional
 *
 * This function determines grouping of units, their mappings to cpus
 * and other parameters considering needed percpu size, allocation
 * atom size and distances between CPUs.
 *
 * Groups are always multiples of atom size and CPUs which are of
 * LOCAL_DISTANCE both ways are grouped together and share space for
 * units in the same group.  The returned configuration is guaranteed
 * to have CPUs on different nodes on different groups and >=75% usage
 * of allocated virtual address space.
 *
 * RETURNS:
 * On success, pointer to the new allocation_info is returned.  On
 * failure, ERR_PTR value is returned.
 */
2666static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info(
2667 size_t reserved_size, size_t dyn_size,
2668 size_t atom_size,
2669 pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
2670{
2671 static int group_map[NR_CPUS] __initdata;
2672 static int group_cnt[NR_CPUS] __initdata;
2673 static struct cpumask mask __initdata;
2674 const size_t static_size = __per_cpu_end - __per_cpu_start;
2675 int nr_groups = 1, nr_units = 0;
2676 size_t size_sum, min_unit_size, alloc_size;
2677 int upa, max_upa, best_upa;
2678 int last_allocs, group, unit;
2679 unsigned int cpu, tcpu;
2680 struct pcpu_alloc_info *ai;
2681 unsigned int *cpu_map;
2682
2683
2684 memset(group_map, 0, sizeof(group_map));
2685 memset(group_cnt, 0, sizeof(group_cnt));
2686 cpumask_clear(&mask);
2687
2688
2689 size_sum = PFN_ALIGN(static_size + reserved_size +
2690 max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
2691 dyn_size = size_sum - static_size - reserved_size;
2692
2693
2694
2695
2696
2697
2698
2699 min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
2700
2701
2702 alloc_size = roundup(min_unit_size, atom_size);
2703 upa = alloc_size / min_unit_size;
2704 while (alloc_size % upa || (offset_in_page(alloc_size / upa)))
2705 upa--;
2706 max_upa = upa;
2707
2708 cpumask_copy(&mask, cpu_possible_mask);
2709
2710
2711 for (group = 0; !cpumask_empty(&mask); group++) {
2712
2713 cpu = cpumask_first(&mask);
2714 group_map[cpu] = group;
2715 group_cnt[group]++;
2716 cpumask_clear_cpu(cpu, &mask);
2717
2718 for_each_cpu(tcpu, &mask) {
2719 if (!cpu_distance_fn ||
2720 (cpu_distance_fn(cpu, tcpu) == LOCAL_DISTANCE &&
2721 cpu_distance_fn(tcpu, cpu) == LOCAL_DISTANCE)) {
2722 group_map[tcpu] = group;
2723 group_cnt[group]++;
2724 cpumask_clear_cpu(tcpu, &mask);
2725 }
2726 }
2727 }
2728 nr_groups = group;
2729
2730
2731
2732
2733
2734
2735 last_allocs = INT_MAX;
2736 for (upa = max_upa; upa; upa--) {
2737 int allocs = 0, wasted = 0;
2738
2739 if (alloc_size % upa || (offset_in_page(alloc_size / upa)))
2740 continue;
2741
2742 for (group = 0; group < nr_groups; group++) {
2743 int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
2744 allocs += this_allocs;
2745 wasted += this_allocs * upa - group_cnt[group];
2746 }
2747
2748
2749
2750
2751
2752
2753 if (wasted > num_possible_cpus() / 3)
2754 continue;
2755
2756
2757 if (allocs > last_allocs)
2758 break;
2759 last_allocs = allocs;
2760 best_upa = upa;
2761 }
2762 upa = best_upa;
2763
2764
2765 for (group = 0; group < nr_groups; group++)
2766 nr_units += roundup(group_cnt[group], upa);
2767
2768 ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
2769 if (!ai)
2770 return ERR_PTR(-ENOMEM);
2771 cpu_map = ai->groups[0].cpu_map;
2772
2773 for (group = 0; group < nr_groups; group++) {
2774 ai->groups[group].cpu_map = cpu_map;
2775 cpu_map += roundup(group_cnt[group], upa);
2776 }
2777
2778 ai->static_size = static_size;
2779 ai->reserved_size = reserved_size;
2780 ai->dyn_size = dyn_size;
2781 ai->unit_size = alloc_size / upa;
2782 ai->atom_size = atom_size;
2783 ai->alloc_size = alloc_size;
2784
2785 for (group = 0, unit = 0; group < nr_groups; group++) {
2786 struct pcpu_group_info *gi = &ai->groups[group];
2787
2788
2789
2790
2791
2792
2793 gi->base_offset = unit * ai->unit_size;
2794
2795 for_each_possible_cpu(cpu)
2796 if (group_map[cpu] == group)
2797 gi->cpu_map[gi->nr_units++] = cpu;
2798 gi->nr_units = roundup(gi->nr_units, upa);
2799 unit += gi->nr_units;
2800 }
2801 BUG_ON(unit != nr_units);
2802
2803 return ai;
2804}
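
/*
 * Illustrative sketch (not part of this file): an architecture that wants
 * NUMA-aware unit grouping passes a distance callback that compares the
 * nodes of the two cpus.  my_early_cpu_to_node() below stands in for
 * whatever early cpu->node lookup the arch provides (x86 uses
 * early_cpu_to_node()):
 *
 *	static int __init my_cpu_distance(unsigned int from, unsigned int to)
 *	{
 *		if (my_early_cpu_to_node(from) == my_early_cpu_to_node(to))
 *			return LOCAL_DISTANCE;
 *		return REMOTE_DISTANCE;
 *	}
 *
 * CPUs reported as LOCAL_DISTANCE in both directions end up in the same
 * group above and therefore share allocations.
 */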
#endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */

#if defined(BUILD_EMBED_FIRST_CHUNK)
/**
 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
 * @reserved_size: the size of reserved percpu area in bytes
 * @dyn_size: minimum free size for dynamic allocation in bytes
 * @atom_size: allocation atom size
 * @cpu_distance_fn: callback to determine distance between cpus, optional
 * @alloc_fn: function to allocate percpu memory
 * @free_fn: function to free percpu memory
 *
 * This is a helper to ease setting up embedded first percpu chunk and
 * can be called where pcpu_setup_first_chunk() is expected.
 *
 * If this function is used to setup the first chunk, it is allocated
 * by calling @alloc_fn and used as-is without being mapped into
 * vmalloc area.  Allocations are always whole multiples of @atom_size
 * aligned to @atom_size.
 *
 * This enables the first chunk to piggy back on the linear physical
 * mapping which often uses larger page size.  Please note that this
 * can result in very sparse cpu->unit mapping on NUMA machines thus
 * requiring large vmalloc address space.  Don't use this allocator if
 * vmalloc space is not orders of magnitude larger than distances
 * between node memory addresses (ie. 32bit NUMA machines).
 *
 * @dyn_size specifies the minimum dynamic area size.
 *
 * If the needed size is smaller than the minimum or specified unit
 * size, the leftover is returned using @free_fn.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
				  size_t atom_size,
				  pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
				  pcpu_fc_alloc_fn_t alloc_fn,
				  pcpu_fc_free_fn_t free_fn)
{
	void *base = (void *)ULONG_MAX;
	void **areas = NULL;
	struct pcpu_alloc_info *ai;
	size_t size_sum, areas_size;
	unsigned long max_distance;
	int group, i, highest_group, rc = 0;

	ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
				   cpu_distance_fn);
	if (IS_ERR(ai))
		return PTR_ERR(ai);

	size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
	areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));

	areas = memblock_alloc(areas_size, SMP_CACHE_BYTES);
	if (!areas) {
		rc = -ENOMEM;
		goto out_free;
	}

	/* allocate, copy and determine base address & max_distance */
	highest_group = 0;
	for (group = 0; group < ai->nr_groups; group++) {
		struct pcpu_group_info *gi = &ai->groups[group];
		unsigned int cpu = NR_CPUS;
		void *ptr;

		for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
			cpu = gi->cpu_map[i];
		BUG_ON(cpu == NR_CPUS);

		/* allocate space for the whole group */
		ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
		if (!ptr) {
			rc = -ENOMEM;
			goto out_free_areas;
		}
		/* kmemleak tracks the percpu allocations separately */
		kmemleak_free(ptr);
		areas[group] = ptr;

		base = min(ptr, base);
		if (ptr > areas[highest_group])
			highest_group = group;
	}
	max_distance = areas[highest_group] - base;
	max_distance += ai->unit_size * ai->groups[highest_group].nr_units;

	/* warn if maximum distance is further than 75% of vmalloc space */
	if (max_distance > VMALLOC_TOTAL * 3 / 4) {
		pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx\n",
			max_distance, VMALLOC_TOTAL);
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
		/* and fail if we have fallback */
		rc = -EINVAL;
		goto out_free_areas;
#endif
	}

	/*
	 * Copy data and free unused parts.  This should happen after all
	 * allocations are complete; otherwise, we may end up with
	 * overlapping groups.
	 */
	for (group = 0; group < ai->nr_groups; group++) {
		struct pcpu_group_info *gi = &ai->groups[group];
		void *ptr = areas[group];

		for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
			if (gi->cpu_map[i] == NR_CPUS) {
				/* unused unit, free whole */
				free_fn(ptr, ai->unit_size);
				continue;
			}
			/* copy and return the unused part */
			memcpy(ptr, __per_cpu_load, ai->static_size);
			free_fn(ptr + size_sum, ai->unit_size - size_sum);
		}
	}

	/* base address is now known, determine group base offsets */
	for (group = 0; group < ai->nr_groups; group++) {
		ai->groups[group].base_offset = areas[group] - base;
	}

	pr_info("Embedded %zu pages/cpu s%zu r%zu d%zu u%zu\n",
		PFN_DOWN(size_sum), ai->static_size, ai->reserved_size,
		ai->dyn_size, ai->unit_size);

	pcpu_setup_first_chunk(ai, base);
	goto out_free;

out_free_areas:
	for (group = 0; group < ai->nr_groups; group++)
		if (areas[group])
			free_fn(areas[group],
				ai->groups[group].nr_units * ai->unit_size);
out_free:
	pcpu_free_alloc_info(ai);
	if (areas)
		memblock_free_early(__pa(areas), areas_size);
	return rc;
}
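
/*
 * Illustrative sketch (not part of this file): the @alloc_fn/@free_fn pair
 * passed to pcpu_embed_first_chunk() is usually a thin memblock wrapper.
 * A NUMA-aware variant might look roughly like this, assuming the arch
 * provides an early cpu->node lookup (named early_cpu_to_node() here):
 *
 *	static void * __init my_pcpu_fc_alloc(unsigned int cpu, size_t size,
 *					      size_t align)
 *	{
 *		return memblock_alloc_try_nid(size, align,
 *					      __pa(MAX_DMA_ADDRESS),
 *					      MEMBLOCK_ALLOC_ACCESSIBLE,
 *					      early_cpu_to_node(cpu));
 *	}
 *
 *	static void __init my_pcpu_fc_free(void *ptr, size_t size)
 *	{
 *		memblock_free_early(__pa(ptr), size);
 *	}
 *
 * The non-NUMA default, pcpu_dfl_fc_alloc()/pcpu_dfl_fc_free() further
 * down in this file, does the same without the node hint.
 */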
#endif /* BUILD_EMBED_FIRST_CHUNK */

#ifdef BUILD_PAGE_FIRST_CHUNK
/**
 * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
 * @reserved_size: the size of reserved percpu area in bytes
 * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
 * @free_fn: function to free percpu page, always called with PAGE_SIZE
 * @populate_pte_fn: function to populate pte
 *
 * This is a helper to ease setting up page-remapped first percpu
 * chunk and can be called where pcpu_setup_first_chunk() is expected.
 *
 * This is the basic allocator.  Static percpu area is allocated
 * page-by-page into vmalloc area.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init pcpu_page_first_chunk(size_t reserved_size,
				 pcpu_fc_alloc_fn_t alloc_fn,
				 pcpu_fc_free_fn_t free_fn,
				 pcpu_fc_populate_pte_fn_t populate_pte_fn)
{
	static struct vm_struct vm;
	struct pcpu_alloc_info *ai;
	char psize_str[16];
	int unit_pages;
	size_t pages_size;
	struct page **pages;
	int unit, i, j, rc = 0;
	int upa;
	int nr_g0_units;

	snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);

	ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL);
	if (IS_ERR(ai))
		return PTR_ERR(ai);
	BUG_ON(ai->nr_groups != 1);
	upa = ai->alloc_size / ai->unit_size;
	nr_g0_units = roundup(num_possible_cpus(), upa);
	if (WARN_ON(ai->groups[0].nr_units != nr_g0_units)) {
		pcpu_free_alloc_info(ai);
		return -EINVAL;
	}

	unit_pages = ai->unit_size >> PAGE_SHIFT;

	/* unaligned allocations can't be freed, round up to page size */
	pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
			       sizeof(pages[0]));
	pages = memblock_alloc(pages_size, SMP_CACHE_BYTES);
	if (!pages)
		panic("%s: Failed to allocate %zu bytes\n", __func__,
		      pages_size);

	/* allocate pages */
	j = 0;
	for (unit = 0; unit < num_possible_cpus(); unit++) {
		unsigned int cpu = ai->groups[0].cpu_map[unit];
		for (i = 0; i < unit_pages; i++) {
			void *ptr;

			ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
			if (!ptr) {
				pr_warn("failed to allocate %s page for cpu%u\n",
					psize_str, cpu);
				goto enomem;
			}
			/* kmemleak tracks the percpu allocations separately */
			kmemleak_free(ptr);
			pages[j++] = virt_to_page(ptr);
		}
	}

	/* allocate vm area, map the pages and copy static data */
	vm.flags = VM_ALLOC;
	vm.size = num_possible_cpus() * ai->unit_size;
	vm_area_register_early(&vm, PAGE_SIZE);

	for (unit = 0; unit < num_possible_cpus(); unit++) {
		unsigned long unit_addr =
			(unsigned long)vm.addr + unit * ai->unit_size;

		for (i = 0; i < unit_pages; i++)
			populate_pte_fn(unit_addr + (i << PAGE_SHIFT));

		/* pte already populated, the following shouldn't fail */
		rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
				      unit_pages);
		if (rc < 0)
			panic("failed to map percpu area, err=%d\n", rc);

		/*
		 * FIXME: Archs with virtual cache should flush local
		 * cache for the linear mapping here - something
		 * equivalent to flush_cache_vmap() on the local cpu.
		 * flush_cache_vmap() can't be used as most supporting
		 * data structures are not set up yet.
		 */

		/* copy static data */
		memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
	}

	/* we're ready, commit */
	pr_info("%d %s pages/cpu s%zu r%zu d%zu\n",
		unit_pages, psize_str, ai->static_size,
		ai->reserved_size, ai->dyn_size);

	pcpu_setup_first_chunk(ai, vm.addr);
	goto out_free_ar;

enomem:
	while (--j >= 0)
		free_fn(page_address(pages[j]), PAGE_SIZE);
	rc = -ENOMEM;
out_free_ar:
	memblock_free_early(__pa(pages), pages_size);
	pcpu_free_alloc_info(ai);
	return rc;
}
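
/*
 * Illustrative call site (not part of this file): an arch that selects
 * CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK wires this up from its own
 * setup_per_cpu_areas(), roughly:
 *
 *	rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE,
 *				   my_pcpu_fc_alloc, my_pcpu_fc_free,
 *				   my_populate_pte);
 *
 * where my_populate_pte() is a hypothetical helper that pre-allocates the
 * page-table levels for one vmalloc-space address so that
 * __pcpu_map_pages() above cannot fail for lack of memory.
 */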
#endif /* BUILD_PAGE_FIRST_CHUNK */

#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
/*
 * Generic SMP percpu area setup.
 *
 * The embedding helper is used because its behavior closely resembles
 * the original non-dynamic generic percpu area setup.  This is
 * important because many archs have addressing restrictions and might
 * fail if the percpu area is located far away from the previous
 * location.  As an added bonus, in non-NUMA cases, embedding is
 * generally a good idea TLB-wise because percpu area can piggy back
 * on the physical linear memory mapping which uses large page
 * mappings on applicable archs.
 */
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);

static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
				       size_t align)
{
	return memblock_alloc_from(size, align, __pa(MAX_DMA_ADDRESS));
}

static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
{
	memblock_free_early(__pa(ptr), size);
}

void __init setup_per_cpu_areas(void)
{
	unsigned long delta;
	unsigned int cpu;
	int rc;

	/*
	 * Always reserve area for module percpu variables.  That's
	 * what the legacy allocator did.
	 */
	rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
				    PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
				    pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
	if (rc < 0)
		panic("Failed to initialize percpu areas.");

	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
	for_each_possible_cpu(cpu)
		__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
}
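
/*
 * Note (illustrative): with __per_cpu_offset[] filled in above, the generic
 * accessors in include/asm-generic/percpu.h reduce to pointer arithmetic,
 * conceptually:
 *
 *	DEFINE_PER_CPU(int, my_counter);
 *	int *p = per_cpu_ptr(&my_counter, cpu);
 *		// == SHIFT_PERCPU_PTR(&my_counter, per_cpu_offset(cpu))
 *
 * my_counter is only an example variable; archs may override the accessors.
 */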
#endif	/* CONFIG_HAVE_SETUP_PER_CPU_AREA */

#else	/* CONFIG_SMP */

/*
 * UP percpu area setup.
 *
 * UP always uses km-based percpu allocator with identity mapping.
 * Static percpu variables are indistinguishable from the usual static
 * variables and don't require any special preparation.
 */
void __init setup_per_cpu_areas(void)
{
	const size_t unit_size =
		roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
					 PERCPU_DYNAMIC_RESERVE));
	struct pcpu_alloc_info *ai;
	void *fc;

	ai = pcpu_alloc_alloc_info(1, 1);
	fc = memblock_alloc_from(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
	if (!ai || !fc)
		panic("Failed to allocate memory for percpu areas.");
	/* kmemleak tracks the percpu allocations separately */
	kmemleak_free(fc);

	ai->dyn_size = unit_size;
	ai->unit_size = unit_size;
	ai->atom_size = unit_size;
	ai->alloc_size = unit_size;
	ai->groups[0].nr_units = 1;
	ai->groups[0].cpu_map[0] = 0;

	pcpu_setup_first_chunk(ai, fc);
	pcpu_free_alloc_info(ai);
}

#endif	/* CONFIG_SMP */

/*
 * pcpu_nr_pages - calculate total number of populated backing pages
 *
 * This reflects the number of pages populated to back chunks.  Metadata is
 * excluded in the number exposed in meminfo as the number of backing pages
 * scales with the number of cpus and can quickly outweigh the memory used
 * for metadata.  It also keeps this calculation nice and simple.
 *
 * RETURNS:
 * Total number of populated backing pages in use by the allocator.
 */
unsigned long pcpu_nr_pages(void)
{
	return pcpu_nr_populated * pcpu_nr_units;
}
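
/*
 * Usage note (outside this file): the "Percpu:" line in /proc/meminfo is
 * derived from this count; the populated backing memory amounts to roughly
 * pcpu_nr_pages() * PAGE_SIZE bytes.
 */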

/*
 * Percpu allocator is initialized early during boot when neither slab or
 * workqueue is available.  Plug async management until everything is up
 * and running.
 */
static int __init percpu_enable_async(void)
{
	pcpu_async_enabled = true;
	return 0;
}
subsys_initcall(percpu_enable_async);