/*
 * mm/percpu.c - percpu memory allocator
 *
 * Copyright (C) 2009		SUSE Linux Products GmbH
 * Copyright (C) 2009		Tejun Heo <tj@kernel.org>
 *
 * This file is released under the GPLv2.
 *
 * The percpu allocator handles both static and dynamic percpu areas.
 * Percpu areas are allocated in chunks; each chunk consists of a
 * boot-time determined number of units, one unit per possible CPU,
 * and the first chunk contains the kernel static percpu area.
 * Allocation is done by finding a free area in a chunk's area map,
 * which is shared by all of the chunk's units.
 *
 * The area map is a sorted array of byte offsets; the lowest bit of
 * each entry marks the area starting at that offset as allocated.
 * Chunks are sorted into slots according to their free sizes so that
 * an allocation only scans the slots which can possibly serve it.
 *
 * To use this allocator, arch code should do the following:
 *
 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
 *   regular address to percpu pointer and back if they need to be
 *   different from the default
 *
 * - use pcpu_setup_first_chunk() during percpu area initialization to
 *   setup the first chunk containing the kernel static percpu area
 */
56#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
57
58#include <linux/bitmap.h>
59#include <linux/bootmem.h>
60#include <linux/err.h>
61#include <linux/list.h>
62#include <linux/log2.h>
63#include <linux/mm.h>
64#include <linux/module.h>
65#include <linux/mutex.h>
66#include <linux/percpu.h>
67#include <linux/pfn.h>
68#include <linux/slab.h>
69#include <linux/spinlock.h>
70#include <linux/vmalloc.h>
71#include <linux/workqueue.h>
72#include <linux/kmemleak.h>
73
74#include <asm/cacheflush.h>
75#include <asm/sections.h>
76#include <asm/tlbflush.h>
77#include <asm/io.h>
78
79#define PCPU_SLOT_BASE_SHIFT 5
80#define PCPU_DFL_MAP_ALLOC 16
81#define PCPU_ATOMIC_MAP_MARGIN_LOW 32
82#define PCPU_ATOMIC_MAP_MARGIN_HIGH 64
83#define PCPU_EMPTY_POP_PAGES_LOW 2
84#define PCPU_EMPTY_POP_PAGES_HIGH 4
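/*
 * The allocator keeps PCPU_EMPTY_POP_PAGES_LOW..HIGH empty populated
 * pages around so that atomic allocations, which can't populate pages
 * themselves, are likely to succeed.  The MAP_MARGIN values play the
 * same role for area map slots: atomic allocations can't extend the
 * map, so async extension is scheduled once fewer than MARGIN_LOW
 * spare slots remain while non-atomic paths keep MARGIN_HIGH spare.
 */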
85
86#ifdef CONFIG_SMP
/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
88#ifndef __addr_to_pcpu_ptr
89#define __addr_to_pcpu_ptr(addr) \
90 (void __percpu *)((unsigned long)(addr) - \
91 (unsigned long)pcpu_base_addr + \
92 (unsigned long)__per_cpu_start)
93#endif
94#ifndef __pcpu_ptr_to_addr
95#define __pcpu_ptr_to_addr(ptr) \
96 (void __force *)((unsigned long)(ptr) + \
97 (unsigned long)pcpu_base_addr - \
98 (unsigned long)__per_cpu_start)
99#endif
100#else
101
102#define __addr_to_pcpu_ptr(addr) (void __percpu *)(addr)
103#define __pcpu_ptr_to_addr(ptr) (void __force *)(ptr)
104#endif
105
struct pcpu_chunk {
	struct list_head	list;		/* linked to pcpu_slot lists */
	int			free_size;	/* free bytes in the chunk */
	int			contig_hint;	/* max contiguous size hint */
	void			*base_addr;	/* base address of this chunk */

	int			map_used;	/* # of map entries used before the sentry */
	int			map_alloc;	/* # of map entries allocated */
	int			*map;		/* allocation map */
	struct list_head	map_extend_list;/* on pcpu_map_extend_chunks */

	void			*data;		/* chunk data */
	int			first_free;	/* no free below this */
	bool			immutable;	/* no [de]population allowed */
	int			nr_populated;	/* # of populated pages */
	unsigned long		populated[];	/* populated bitmap */
};
123
124static int pcpu_unit_pages __read_mostly;
125static int pcpu_unit_size __read_mostly;
126static int pcpu_nr_units __read_mostly;
127static int pcpu_atom_size __read_mostly;
128static int pcpu_nr_slots __read_mostly;
129static size_t pcpu_chunk_struct_size __read_mostly;
130
131
132static unsigned int pcpu_low_unit_cpu __read_mostly;
133static unsigned int pcpu_high_unit_cpu __read_mostly;
134
135
136void *pcpu_base_addr __read_mostly;
137EXPORT_SYMBOL_GPL(pcpu_base_addr);
138
139static const int *pcpu_unit_map __read_mostly;
140const unsigned long *pcpu_unit_offsets __read_mostly;
141
142
143static int pcpu_nr_groups __read_mostly;
144static const unsigned long *pcpu_group_offsets __read_mostly;
145static const size_t *pcpu_group_sizes __read_mostly;
/*
 * The first chunk which always exists.  Note that unlike other
 * chunks, this one can be allocated and mapped in several different
 * ways and thus often doesn't live in the vmalloc area.
 */
static struct pcpu_chunk *pcpu_first_chunk;

/*
 * Optional reserved chunk.  This chunk reserves part of the first
 * chunk and serves it for reserved allocations.  The amount of
 * reserved offset is in pcpu_reserved_chunk_limit.  When the reserved
 * area doesn't exist, the following variables contain NULL and 0
 * respectively.
 */
static struct pcpu_chunk *pcpu_reserved_chunk;
static int pcpu_reserved_chunk_limit;
163
/*
 * pcpu_lock protects the index data structures (chunk slot lists, the
 * area maps and related counters) and can be taken from atomic
 * context.  pcpu_alloc_mutex serializes chunk creation/destruction,
 * [de]population and map extension and nests outside pcpu_lock.
 */
static DEFINE_SPINLOCK(pcpu_lock);	/* all internal data structures */
static DEFINE_MUTEX(pcpu_alloc_mutex);	/* chunk create/destroy, [de]pop, map ext */

static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */

/* chunks which need their map areas extended, protected by pcpu_lock */
static LIST_HEAD(pcpu_map_extend_chunks);

/*
 * The number of empty populated pages, protected by pcpu_lock.  The
 * reserved chunk doesn't contribute to the count.
 */
static int pcpu_nr_empty_pop_pages;

/*
 * Balance work is used to populate or destroy chunks asynchronously.  We
 * try to keep the number of populated free pages between
 * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and schedule
 * empty chunk destruction when pcpu_nr_empty_pop_pages exceeds HIGH.
 */
static void pcpu_balance_workfn(struct work_struct *work);
static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
static bool pcpu_async_enabled __read_mostly;
static bool pcpu_atomic_alloc_failed;
188
189static void pcpu_schedule_balance_work(void)
190{
191 if (pcpu_async_enabled)
192 schedule_work(&pcpu_balance_work);
193}
194
195static bool pcpu_addr_in_first_chunk(void *addr)
196{
197 void *first_start = pcpu_first_chunk->base_addr;
198
199 return addr >= first_start && addr < first_start + pcpu_unit_size;
200}
201
202static bool pcpu_addr_in_reserved_chunk(void *addr)
203{
204 void *first_start = pcpu_first_chunk->base_addr;
205
206 return addr >= first_start &&
207 addr < first_start + pcpu_reserved_chunk_limit;
208}
209
210static int __pcpu_size_to_slot(int size)
211{
212 int highbit = fls(size);
213 return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
214}
215
216static int pcpu_size_to_slot(int size)
217{
218 if (size == pcpu_unit_size)
219 return pcpu_nr_slots - 1;
220 return __pcpu_size_to_slot(size);
221}
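/*
 * For example, with PCPU_SLOT_BASE_SHIFT == 5, free sizes below 16
 * bytes map to slot 1, 16-31 bytes to slot 2, 32-63 bytes to slot 3
 * and so on, while a completely free chunk always sits in the last
 * slot (pcpu_nr_slots - 1).  Slot 0 holds chunks with (almost) no
 * free space left.
 */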
222
223static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
224{
225 if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int))
226 return 0;
227
228 return pcpu_size_to_slot(chunk->free_size);
229}
230
/* set the pointer to a chunk in a page struct */
232static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
233{
234 page->index = (unsigned long)pcpu;
235}

/* obtain pointer to a chunk from a page struct */
238static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
239{
240 return (struct pcpu_chunk *)page->index;
241}
242
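/*
 * Index of the given cpu/page combination in the chunk-wide flat page
 * space: unit index of @cpu times pages per unit plus @page_idx.
 */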
243static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
244{
245 return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
246}
247
248static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
249 unsigned int cpu, int page_idx)
250{
251 return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] +
252 (page_idx << PAGE_SHIFT);
253}
254
255static void __maybe_unused pcpu_next_unpop(struct pcpu_chunk *chunk,
256 int *rs, int *re, int end)
257{
258 *rs = find_next_zero_bit(chunk->populated, end, *rs);
259 *re = find_next_bit(chunk->populated, end, *rs + 1);
260}
261
262static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk,
263 int *rs, int *re, int end)
264{
265 *rs = find_next_bit(chunk->populated, end, *rs);
266 *re = find_next_zero_bit(chunk->populated, end, *rs + 1);
267}
268
/*
 * (Un)populated page region iterators.  Iterate over (un)populated
 * regions between @start and @end in @chunk.  @rs and @re should be
 * integer variables and will be set to start and end page index of
 * the current region.
 */
275#define pcpu_for_each_unpop_region(chunk, rs, re, start, end) \
276 for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \
277 (rs) < (re); \
278 (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end)))
279
280#define pcpu_for_each_pop_region(chunk, rs, re, start, end) \
281 for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end)); \
282 (rs) < (re); \
283 (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))
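/*
 * Illustrative use of the region iterators above: walk every
 * unpopulated page range of a chunk, as the populate path does.
 *
 *	int rs, re;
 *
 *	pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages)
 *		pr_debug("pages [%d, %d) are not populated\n", rs, re);
 */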
284
/**
 * pcpu_mem_zalloc - allocate memory
 * @size: bytes to allocate
 *
 * Allocate @size bytes.  If @size is smaller than or equal to
 * PAGE_SIZE, kzalloc() is used; otherwise, vzalloc() is used.  The
 * returned memory is always zeroed.
 *
 * CONTEXT:
 * Does GFP_KERNEL allocation.
 *
 * RETURNS:
 * Pointer to the allocated area on success, NULL on failure.
 */
299static void *pcpu_mem_zalloc(size_t size)
300{
301 if (WARN_ON_ONCE(!slab_is_available()))
302 return NULL;
303
304 if (size <= PAGE_SIZE)
305 return kzalloc(size, GFP_KERNEL);
306 else
307 return vzalloc(size);
308}
309
/**
 * pcpu_mem_free - free memory
 * @ptr: memory to free
 *
 * Free @ptr.  @ptr should have been allocated by pcpu_mem_zalloc().
 */
316static void pcpu_mem_free(void *ptr)
317{
318 kvfree(ptr);
319}
320
/**
 * pcpu_count_occupied_pages - count the number of pages an area occupies
 * @chunk: chunk of interest
 * @i: index of the area in question
 *
 * Count the number of pages chunk's @i'th area occupies.  When the area's
 * start and/or end address isn't aligned to page boundary, the straddled
 * page is included in the count iff the rest of the page is free.
 */
330static int pcpu_count_occupied_pages(struct pcpu_chunk *chunk, int i)
331{
332 int off = chunk->map[i] & ~1;
333 int end = chunk->map[i + 1] & ~1;
334
335 if (!PAGE_ALIGNED(off) && i > 0) {
336 int prev = chunk->map[i - 1];
337
338 if (!(prev & 1) && prev <= round_down(off, PAGE_SIZE))
339 off = round_down(off, PAGE_SIZE);
340 }
341
342 if (!PAGE_ALIGNED(end) && i + 1 < chunk->map_used) {
343 int next = chunk->map[i + 1];
344 int nend = chunk->map[i + 2] & ~1;
345
346 if (!(next & 1) && nend >= round_up(end, PAGE_SIZE))
347 end = round_up(end, PAGE_SIZE);
348 }
349
350 return max_t(int, PFN_DOWN(end) - PFN_UP(off), 0);
351}
352
/**
 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
 * @chunk: chunk of interest
 * @oslot: the previous slot it was on
 *
 * This function is called after an allocation or free changed @chunk.
 * New slot according to the changed state is determined and @chunk is
 * moved to the slot.  Note that the reserved chunk is never put on
 * chunk slots.
 *
 * CONTEXT:
 * pcpu_lock.
 */
366static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
367{
368 int nslot = pcpu_chunk_slot(chunk);
369
370 if (chunk != pcpu_reserved_chunk && oslot != nslot) {
371 if (oslot < nslot)
372 list_move(&chunk->list, &pcpu_slot[nslot]);
373 else
374 list_move_tail(&chunk->list, &pcpu_slot[nslot]);
375 }
376}
377
/**
 * pcpu_need_to_extend - determine whether chunk area map needs to be extended
 * @chunk: chunk of interest
 * @is_atomic: the allocation context
 *
 * Determine whether the area map of @chunk needs to be extended.  If
 * @is_atomic, only the amount necessary for a new allocation is
 * considered; however, async extension is scheduled if the remaining
 * room is low.  If !@is_atomic, it aims for more empty space so that
 * atomic allocations, which can't extend maps directly, are likely to
 * find enough room.
 *
 * CONTEXT:
 * pcpu_lock.
 *
 * RETURNS:
 * New target map allocation length if extension is necessary, 0
 * otherwise.
 */
397static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic)
398{
399 int margin, new_alloc;
400
401 lockdep_assert_held(&pcpu_lock);
402
403 if (is_atomic) {
404 margin = 3;
405
406 if (chunk->map_alloc <
407 chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW) {
408 if (list_empty(&chunk->map_extend_list)) {
409 list_add_tail(&chunk->map_extend_list,
410 &pcpu_map_extend_chunks);
411 pcpu_schedule_balance_work();
412 }
413 }
414 } else {
415 margin = PCPU_ATOMIC_MAP_MARGIN_HIGH;
416 }
417
418 if (chunk->map_alloc >= chunk->map_used + margin)
419 return 0;
420
421 new_alloc = PCPU_DFL_MAP_ALLOC;
422 while (new_alloc < chunk->map_used + margin)
423 new_alloc *= 2;
424
425 return new_alloc;
426}
427
/**
 * pcpu_extend_area_map - extend area map of a chunk
 * @chunk: chunk of interest
 * @new_alloc: new target allocation length of the area map
 *
 * Extend area map of @chunk to have @new_alloc entries.
 *
 * CONTEXT:
 * Does GFP_KERNEL allocation.  Grabs and releases pcpu_lock.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
441static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
442{
443 int *old = NULL, *new = NULL;
444 size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
445 unsigned long flags;
446
447 lockdep_assert_held(&pcpu_alloc_mutex);
448
449 new = pcpu_mem_zalloc(new_size);
450 if (!new)
451 return -ENOMEM;
452
453
454 spin_lock_irqsave(&pcpu_lock, flags);
455
456 if (new_alloc <= chunk->map_alloc)
457 goto out_unlock;
458
459 old_size = chunk->map_alloc * sizeof(chunk->map[0]);
460 old = chunk->map;
461
462 memcpy(new, old, old_size);
463
464 chunk->map_alloc = new_alloc;
465 chunk->map = new;
466 new = NULL;
467
468out_unlock:
469 spin_unlock_irqrestore(&pcpu_lock, flags);
470
471
472
473
474
475 pcpu_mem_free(old);
476 pcpu_mem_free(new);
477
478 return 0;
479}
480
/**
 * pcpu_fit_in_area - try to fit the requested allocation in a candidate area
 * @chunk: chunk the candidate area belongs to
 * @off: the offset to the start of the candidate area
 * @this_size: the size of the candidate area
 * @size: the size of the target allocation
 * @align: the alignment of the target allocation
 * @pop_only: only allocate from already populated region
 *
 * We're trying to allocate @size bytes aligned at @align.  @chunk's area
 * at @off sized @this_size is a candidate.  This function determines
 * whether the target allocation fits in the candidate area and returns the
 * number of bytes to pad after @off.  If the target area doesn't fit, -1
 * is returned.
 *
 * If @pop_only is %true, this function only considers the already
 * populated part of the candidate area.
 */
499static int pcpu_fit_in_area(struct pcpu_chunk *chunk, int off, int this_size,
500 int size, int align, bool pop_only)
501{
502 int cand_off = off;
503
504 while (true) {
505 int head = ALIGN(cand_off, align) - off;
506 int page_start, page_end, rs, re;
507
508 if (this_size < head + size)
509 return -1;
510
511 if (!pop_only)
512 return head;
513
514
515
516
517
518
519 page_start = PFN_DOWN(head + off);
520 page_end = PFN_UP(head + off + size);
521
522 rs = page_start;
523 pcpu_next_unpop(chunk, &rs, &re, PFN_UP(off + this_size));
524 if (rs >= page_end)
525 return head;
526 cand_off = re * PAGE_SIZE;
527 }
528}
529
/**
 * pcpu_alloc_area - allocate area from a pcpu_chunk
 * @chunk: chunk of interest
 * @size: wanted size in bytes
 * @align: wanted align
 * @pop_only: allocate only from the populated area
 * @occ_pages_p: out param for the number of pages the area occupies
 *
 * Try to allocate @size bytes area aligned at @align from @chunk.
 * Note that this function only allocates the offset.  It doesn't
 * populate or map the area.
 *
 * @chunk->map must have at least two free slots.
 *
 * CONTEXT:
 * pcpu_lock.
 *
 * RETURNS:
 * Allocated offset in @chunk on success, -1 if no matching area is
 * found.
 */
551static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align,
552 bool pop_only, int *occ_pages_p)
553{
554 int oslot = pcpu_chunk_slot(chunk);
555 int max_contig = 0;
556 int i, off;
557 bool seen_free = false;
558 int *p;
559
560 for (i = chunk->first_free, p = chunk->map + i; i < chunk->map_used; i++, p++) {
561 int head, tail;
562 int this_size;
563
564 off = *p;
565 if (off & 1)
566 continue;
567
568 this_size = (p[1] & ~1) - off;
569
570 head = pcpu_fit_in_area(chunk, off, this_size, size, align,
571 pop_only);
572 if (head < 0) {
573 if (!seen_free) {
574 chunk->first_free = i;
575 seen_free = true;
576 }
577 max_contig = max(this_size, max_contig);
578 continue;
579 }
580
581
582
583
584
585
586
587 if (head && (head < sizeof(int) || !(p[-1] & 1))) {
588 *p = off += head;
589 if (p[-1] & 1)
590 chunk->free_size -= head;
591 else
592 max_contig = max(*p - p[-1], max_contig);
593 this_size -= head;
594 head = 0;
595 }
596
597
598 tail = this_size - head - size;
599 if (tail < sizeof(int)) {
600 tail = 0;
601 size = this_size - head;
602 }
603
604
605 if (head || tail) {
606 int nr_extra = !!head + !!tail;
607
608
609 memmove(p + nr_extra + 1, p + 1,
610 sizeof(chunk->map[0]) * (chunk->map_used - i));
611 chunk->map_used += nr_extra;
612
613 if (head) {
614 if (!seen_free) {
615 chunk->first_free = i;
616 seen_free = true;
617 }
618 *++p = off += head;
619 ++i;
620 max_contig = max(head, max_contig);
621 }
622 if (tail) {
623 p[1] = off + size;
624 max_contig = max(tail, max_contig);
625 }
626 }
627
628 if (!seen_free)
629 chunk->first_free = i + 1;
630
631
632 if (i + 1 == chunk->map_used)
633 chunk->contig_hint = max_contig;
634 else
635 chunk->contig_hint = max(chunk->contig_hint,
636 max_contig);
637
638 chunk->free_size -= size;
639 *p |= 1;
640
641 *occ_pages_p = pcpu_count_occupied_pages(chunk, i);
642 pcpu_chunk_relocate(chunk, oslot);
643 return off;
644 }
645
646 chunk->contig_hint = max_contig;
647 pcpu_chunk_relocate(chunk, oslot);
648
649
650 return -1;
651}
652
/**
 * pcpu_free_area - free area to a pcpu_chunk
 * @chunk: chunk of interest
 * @freeme: offset of area to free
 * @occ_pages_p: out param for the number of pages the area occupies
 *
 * Free area starting from @freeme to @chunk.  Note that this function
 * only modifies the allocation map.  It doesn't depopulate or unmap
 * the area.
 *
 * CONTEXT:
 * pcpu_lock.
 */
666static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme,
667 int *occ_pages_p)
668{
669 int oslot = pcpu_chunk_slot(chunk);
670 int off = 0;
671 unsigned i, j;
672 int to_free = 0;
673 int *p;
674
675 freeme |= 1;
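	/*
	 * Map entries carry the in-use flag in bit 0, so the binary
	 * search below looks for the <offset, in-use> pair.
	 */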
676
677 i = 0;
678 j = chunk->map_used;
679 while (i != j) {
680 unsigned k = (i + j) / 2;
681 off = chunk->map[k];
682 if (off < freeme)
683 i = k + 1;
684 else if (off > freeme)
685 j = k;
686 else
687 i = j = k;
688 }
689 BUG_ON(off != freeme);
690
691 if (i < chunk->first_free)
692 chunk->first_free = i;
693
694 p = chunk->map + i;
695 *p = off &= ~1;
696 chunk->free_size += (p[1] & ~1) - off;
697
698 *occ_pages_p = pcpu_count_occupied_pages(chunk, i);
699
	/* merge with next? */
701 if (!(p[1] & 1))
702 to_free++;
	/* merge with previous? */
704 if (i > 0 && !(p[-1] & 1)) {
705 to_free++;
706 i--;
707 p--;
708 }
709 if (to_free) {
710 chunk->map_used -= to_free;
711 memmove(p + 1, p + 1 + to_free,
712 (chunk->map_used - i) * sizeof(chunk->map[0]));
713 }
714
715 chunk->contig_hint = max(chunk->map[i + 1] - chunk->map[i] - 1, chunk->contig_hint);
716 pcpu_chunk_relocate(chunk, oslot);
717}
718
719static struct pcpu_chunk *pcpu_alloc_chunk(void)
720{
721 struct pcpu_chunk *chunk;
722
723 chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size);
724 if (!chunk)
725 return NULL;
726
727 chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC *
728 sizeof(chunk->map[0]));
729 if (!chunk->map) {
730 pcpu_mem_free(chunk);
731 return NULL;
732 }
733
734 chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
735 chunk->map[0] = 0;
736 chunk->map[1] = pcpu_unit_size | 1;
737 chunk->map_used = 1;
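	/*
	 * The area map is a sorted array of byte offsets; bit 0 of each
	 * entry marks the area starting there as allocated.  The final
	 * entry is a sentry at unit_size, so a fresh chunk is a single
	 * free area: { 0, unit_size | 1 }.
	 */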
738
739 INIT_LIST_HEAD(&chunk->list);
740 INIT_LIST_HEAD(&chunk->map_extend_list);
741 chunk->free_size = pcpu_unit_size;
742 chunk->contig_hint = pcpu_unit_size;
743
744 return chunk;
745}
746
747static void pcpu_free_chunk(struct pcpu_chunk *chunk)
748{
749 if (!chunk)
750 return;
751 pcpu_mem_free(chunk->map);
752 pcpu_mem_free(chunk);
753}
754
/**
 * pcpu_chunk_populated - post-population bookkeeping
 * @chunk: pcpu_chunk which got populated
 * @page_start: the start page
 * @page_end: the end page
 *
 * Pages in [@page_start,@page_end) have been populated to @chunk.  Update
 * the bookkeeping information accordingly.  Must be called after each
 * successful population.
 */
765static void pcpu_chunk_populated(struct pcpu_chunk *chunk,
766 int page_start, int page_end)
767{
768 int nr = page_end - page_start;
769
770 lockdep_assert_held(&pcpu_lock);
771
772 bitmap_set(chunk->populated, page_start, nr);
773 chunk->nr_populated += nr;
774 pcpu_nr_empty_pop_pages += nr;
775}
776
/**
 * pcpu_chunk_depopulated - post-depopulation bookkeeping
 * @chunk: pcpu_chunk which got depopulated
 * @page_start: the start page
 * @page_end: the end page
 *
 * Pages in [@page_start,@page_end) have been depopulated from @chunk.
 * Update the bookkeeping information accordingly.  Must be called after
 * each successful depopulation.
 */
787static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
788 int page_start, int page_end)
789{
790 int nr = page_end - page_start;
791
792 lockdep_assert_held(&pcpu_lock);
793
794 bitmap_clear(chunk->populated, page_start, nr);
795 chunk->nr_populated -= nr;
796 pcpu_nr_empty_pop_pages -= nr;
797}
798
/*
 * Chunk management implementation.
 *
 * To allow different implementations, chunk alloc/free and
 * [de]population are implemented in a separate file which is pulled
 * into this file and compiled together.  The following functions
 * should be implemented.
 *
 * pcpu_populate_chunk		- populate the specified range of a chunk
 * pcpu_depopulate_chunk	- depopulate the specified range of a chunk
 * pcpu_create_chunk		- create a new chunk
 * pcpu_destroy_chunk		- destroy a chunk, always preceded by full depop
 * pcpu_addr_to_page		- translate address to the corresponding struct page
 * pcpu_verify_alloc_info	- check alloc_info is acceptable during init
 */
814static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size);
815static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size);
816static struct pcpu_chunk *pcpu_create_chunk(void);
817static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
818static struct page *pcpu_addr_to_page(void *addr);
819static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
820
821#ifdef CONFIG_NEED_PER_CPU_KM
822#include "percpu-km.c"
823#else
824#include "percpu-vm.c"
825#endif
826
/**
 * pcpu_chunk_addr_search - determine chunk containing specified address
 * @addr: address for which the chunk needs to be determined.
 *
 * RETURNS:
 * The address of the found chunk.
 */
834static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
835{
836
837 if (pcpu_addr_in_first_chunk(addr)) {
838
839 if (pcpu_addr_in_reserved_chunk(addr))
840 return pcpu_reserved_chunk;
841 return pcpu_first_chunk;
842 }
843
	/*
	 * The address is relative to unit0 which might be unused and
	 * thus unmapped.  Offset the address to the unit space of the
	 * current processor before looking it up in the vmalloc
	 * space.  Note that any possible cpu id can be used here, so
	 * there's no need to worry about preemption or cpu hotplug.
	 */
851 addr += pcpu_unit_offsets[raw_smp_processor_id()];
852 return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
853}
854
/**
 * pcpu_alloc - the percpu allocator
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 * @reserved: allocate from the reserved chunk if available
 * @gfp: allocation flags
 *
 * Allocate percpu area of @size bytes aligned at @align.  If @gfp doesn't
 * contain %GFP_KERNEL, the allocation is atomic.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
868static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
869 gfp_t gfp)
870{
871 static int warn_limit = 10;
872 struct pcpu_chunk *chunk;
873 const char *err;
874 bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
875 int occ_pages = 0;
876 int slot, off, new_alloc, cpu, ret;
877 unsigned long flags;
878 void __percpu *ptr;
879
	/*
	 * We want the lowest bit of offset available for in-use/free
	 * indicator, so force >= 16bit alignment and make size even.
	 */
884 if (unlikely(align < 2))
885 align = 2;
886
887 size = ALIGN(size, 2);
888
889 if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
890 WARN(true, "illegal size (%zu) or align (%zu) for percpu allocation\n",
891 size, align);
892 return NULL;
893 }
894
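	/*
	 * Atomic allocations must not block, so only non-atomic callers
	 * serialize on pcpu_alloc_mutex; pcpu_lock protects the area
	 * maps themselves.
	 */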
895 if (!is_atomic)
896 mutex_lock(&pcpu_alloc_mutex);
897
898 spin_lock_irqsave(&pcpu_lock, flags);
899
	/* serve reserved allocations from the reserved chunk if available */
901 if (reserved && pcpu_reserved_chunk) {
902 chunk = pcpu_reserved_chunk;
903
904 if (size > chunk->contig_hint) {
905 err = "alloc from reserved chunk failed";
906 goto fail_unlock;
907 }
908
909 while ((new_alloc = pcpu_need_to_extend(chunk, is_atomic))) {
910 spin_unlock_irqrestore(&pcpu_lock, flags);
911 if (is_atomic ||
912 pcpu_extend_area_map(chunk, new_alloc) < 0) {
913 err = "failed to extend area map of reserved chunk";
914 goto fail;
915 }
916 spin_lock_irqsave(&pcpu_lock, flags);
917 }
918
919 off = pcpu_alloc_area(chunk, size, align, is_atomic,
920 &occ_pages);
921 if (off >= 0)
922 goto area_found;
923
924 err = "alloc from reserved chunk failed";
925 goto fail_unlock;
926 }
927
928restart:
929
930 for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
931 list_for_each_entry(chunk, &pcpu_slot[slot], list) {
932 if (size > chunk->contig_hint)
933 continue;
934
935 new_alloc = pcpu_need_to_extend(chunk, is_atomic);
936 if (new_alloc) {
937 if (is_atomic)
938 continue;
939 spin_unlock_irqrestore(&pcpu_lock, flags);
940 if (pcpu_extend_area_map(chunk,
941 new_alloc) < 0) {
942 err = "failed to extend area map";
943 goto fail;
944 }
945 spin_lock_irqsave(&pcpu_lock, flags);
946
947
948
949
950 goto restart;
951 }
952
953 off = pcpu_alloc_area(chunk, size, align, is_atomic,
954 &occ_pages);
955 if (off >= 0)
956 goto area_found;
957 }
958 }
959
960 spin_unlock_irqrestore(&pcpu_lock, flags);
961
962
963
964
965
966
967 if (is_atomic)
968 goto fail;
969
970 if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
971 chunk = pcpu_create_chunk();
972 if (!chunk) {
973 err = "failed to allocate new chunk";
974 goto fail;
975 }
976
977 spin_lock_irqsave(&pcpu_lock, flags);
978 pcpu_chunk_relocate(chunk, -1);
979 } else {
980 spin_lock_irqsave(&pcpu_lock, flags);
981 }
982
983 goto restart;
984
985area_found:
986 spin_unlock_irqrestore(&pcpu_lock, flags);
987
988
989 if (!is_atomic) {
990 int page_start, page_end, rs, re;
991
992 page_start = PFN_DOWN(off);
993 page_end = PFN_UP(off + size);
994
995 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
996 WARN_ON(chunk->immutable);
997
998 ret = pcpu_populate_chunk(chunk, rs, re);
999
1000 spin_lock_irqsave(&pcpu_lock, flags);
1001 if (ret) {
1002 pcpu_free_area(chunk, off, &occ_pages);
1003 err = "failed to populate";
1004 goto fail_unlock;
1005 }
1006 pcpu_chunk_populated(chunk, rs, re);
1007 spin_unlock_irqrestore(&pcpu_lock, flags);
1008 }
1009
1010 mutex_unlock(&pcpu_alloc_mutex);
1011 }
1012
1013 if (chunk != pcpu_reserved_chunk)
1014 pcpu_nr_empty_pop_pages -= occ_pages;
1015
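	/*
	 * Keep a reserve of empty populated pages for future atomic
	 * allocations; kick the balance worker when it runs low.
	 */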
1016 if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
1017 pcpu_schedule_balance_work();
1018
1019
1020 for_each_possible_cpu(cpu)
1021 memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
1022
1023 ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
1024 kmemleak_alloc_percpu(ptr, size, gfp);
1025 return ptr;
1026
1027fail_unlock:
1028 spin_unlock_irqrestore(&pcpu_lock, flags);
1029fail:
1030 if (!is_atomic && warn_limit) {
1031 pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
1032 size, align, is_atomic, err);
1033 dump_stack();
1034 if (!--warn_limit)
1035 pr_info("limit reached, disable warning\n");
1036 }
1037 if (is_atomic) {
		/* see the flag handling in pcpu_balance_workfn() */
1039 pcpu_atomic_alloc_failed = true;
1040 pcpu_schedule_balance_work();
1041 } else {
1042 mutex_unlock(&pcpu_alloc_mutex);
1043 }
1044 return NULL;
1045}
/**
 * __alloc_percpu_gfp - allocate dynamic percpu area
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 * @gfp: allocation flags
 *
 * Allocate zero-filled percpu area of @size bytes aligned at @align.  If
 * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can
 * be called from any context but is a lot more likely to fail.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
1060void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp)
1061{
1062 return pcpu_alloc(size, align, false, gfp);
1063}
1064EXPORT_SYMBOL_GPL(__alloc_percpu_gfp);
1065
/**
 * __alloc_percpu - allocate dynamic percpu area
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 *
 * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL).
 */
1073void __percpu *__alloc_percpu(size_t size, size_t align)
1074{
1075 return pcpu_alloc(size, align, false, GFP_KERNEL);
1076}
1077EXPORT_SYMBOL_GPL(__alloc_percpu);
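/*
 * Typical usage (illustrative only; struct foo and its counter are
 * made-up names):
 *
 *	struct foo { int cnt; };
 *	struct foo __percpu *fp;
 *
 *	fp = alloc_percpu(struct foo);
 *	if (!fp)
 *		return -ENOMEM;
 *	this_cpu_inc(fp->cnt);
 *	...
 *	free_percpu(fp);
 */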
1078
/**
 * __alloc_reserved_percpu - allocate reserved percpu area
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 *
 * Allocate zero-filled percpu area of @size bytes aligned at @align
 * from the reserved percpu area if arch has set it up; otherwise,
 * allocation is served from the same dynamic area.  Might sleep.
 * Might trigger writeouts.
 *
 * CONTEXT:
 * Does GFP_KERNEL allocation.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
1095void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
1096{
1097 return pcpu_alloc(size, align, true, GFP_KERNEL);
1098}
/**
 * pcpu_balance_workfn - manage the amount of free chunks and populated pages
 * @work: unused
 *
 * Reclaim all fully free chunks except for the first one.  Also, service
 * pending area map extension requests and make sure enough empty pages
 * are populated for atomic allocations.
 */
1106static void pcpu_balance_workfn(struct work_struct *work)
1107{
1108 LIST_HEAD(to_free);
1109 struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
1110 struct pcpu_chunk *chunk, *next;
1111 int slot, nr_to_pop, ret;
1112
1113
1114
1115
1116
1117 mutex_lock(&pcpu_alloc_mutex);
1118 spin_lock_irq(&pcpu_lock);
1119
1120 list_for_each_entry_safe(chunk, next, free_head, list) {
1121 WARN_ON(chunk->immutable);
1122
1123
1124 if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
1125 continue;
1126
1127 list_del_init(&chunk->map_extend_list);
1128 list_move(&chunk->list, &to_free);
1129 }
1130
1131 spin_unlock_irq(&pcpu_lock);
1132
1133 list_for_each_entry_safe(chunk, next, &to_free, list) {
1134 int rs, re;
1135
1136 pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) {
1137 pcpu_depopulate_chunk(chunk, rs, re);
1138 spin_lock_irq(&pcpu_lock);
1139 pcpu_chunk_depopulated(chunk, rs, re);
1140 spin_unlock_irq(&pcpu_lock);
1141 }
1142 pcpu_destroy_chunk(chunk);
1143 }
1144
1145
1146 do {
1147 int new_alloc = 0;
1148
1149 spin_lock_irq(&pcpu_lock);
1150
1151 chunk = list_first_entry_or_null(&pcpu_map_extend_chunks,
1152 struct pcpu_chunk, map_extend_list);
1153 if (chunk) {
1154 list_del_init(&chunk->map_extend_list);
1155 new_alloc = pcpu_need_to_extend(chunk, false);
1156 }
1157
1158 spin_unlock_irq(&pcpu_lock);
1159
1160 if (new_alloc)
1161 pcpu_extend_area_map(chunk, new_alloc);
1162 } while (chunk);
1163
	/*
	 * Ensure there are certain number of free populated pages for
	 * atomic allocs.  Fill up from the most packed so that atomic
	 * allocs don't increase fragmentation.  If atomic allocation
	 * failed previously, always populate the maximum amount.  This
	 * should prevent atomic allocs larger than PAGE_SIZE from keeping
	 * on failing.
	 */
1174retry_pop:
1175 if (pcpu_atomic_alloc_failed) {
1176 nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
1177
1178 pcpu_atomic_alloc_failed = false;
1179 } else {
1180 nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
1181 pcpu_nr_empty_pop_pages,
1182 0, PCPU_EMPTY_POP_PAGES_HIGH);
1183 }
1184
1185 for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
1186 int nr_unpop = 0, rs, re;
1187
1188 if (!nr_to_pop)
1189 break;
1190
1191 spin_lock_irq(&pcpu_lock);
1192 list_for_each_entry(chunk, &pcpu_slot[slot], list) {
1193 nr_unpop = pcpu_unit_pages - chunk->nr_populated;
1194 if (nr_unpop)
1195 break;
1196 }
1197 spin_unlock_irq(&pcpu_lock);
1198
1199 if (!nr_unpop)
1200 continue;
1201
1202
1203 pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) {
1204 int nr = min(re - rs, nr_to_pop);
1205
1206 ret = pcpu_populate_chunk(chunk, rs, rs + nr);
1207 if (!ret) {
1208 nr_to_pop -= nr;
1209 spin_lock_irq(&pcpu_lock);
1210 pcpu_chunk_populated(chunk, rs, rs + nr);
1211 spin_unlock_irq(&pcpu_lock);
1212 } else {
1213 nr_to_pop = 0;
1214 }
1215
1216 if (!nr_to_pop)
1217 break;
1218 }
1219 }
1220
1221 if (nr_to_pop) {
1222
1223 chunk = pcpu_create_chunk();
1224 if (chunk) {
1225 spin_lock_irq(&pcpu_lock);
1226 pcpu_chunk_relocate(chunk, -1);
1227 spin_unlock_irq(&pcpu_lock);
1228 goto retry_pop;
1229 }
1230 }
1231
1232 mutex_unlock(&pcpu_alloc_mutex);
1233}
/**
 * free_percpu - free percpu area
 * @ptr: pointer to area to free
 *
 * Free percpu area @ptr.
 *
 * CONTEXT:
 * Can be called from atomic context.
 */
1244void free_percpu(void __percpu *ptr)
1245{
1246 void *addr;
1247 struct pcpu_chunk *chunk;
1248 unsigned long flags;
1249 int off, occ_pages;
1250
1251 if (!ptr)
1252 return;
1253
1254 kmemleak_free_percpu(ptr);
1255
1256 addr = __pcpu_ptr_to_addr(ptr);
1257
1258 spin_lock_irqsave(&pcpu_lock, flags);
1259
1260 chunk = pcpu_chunk_addr_search(addr);
1261 off = addr - chunk->base_addr;
1262
1263 pcpu_free_area(chunk, off, &occ_pages);
1264
1265 if (chunk != pcpu_reserved_chunk)
1266 pcpu_nr_empty_pop_pages += occ_pages;
1267
	/* if there is more than one fully free chunk, schedule balance work to reclaim the extras */
1269 if (chunk->free_size == pcpu_unit_size) {
1270 struct pcpu_chunk *pos;
1271
1272 list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
1273 if (pos != chunk) {
1274 pcpu_schedule_balance_work();
1275 break;
1276 }
1277 }
1278
1279 spin_unlock_irqrestore(&pcpu_lock, flags);
1280}
1281EXPORT_SYMBOL_GPL(free_percpu);
1282
/**
 * is_kernel_percpu_address - test whether address is from static percpu area
 * @addr: address to test
 *
 * Test whether @addr belongs to in-kernel static percpu area.  Module
 * static percpu areas are not considered.  For those, use
 * is_module_percpu_address().
 *
 * RETURNS:
 * %true if @addr is from in-kernel static percpu area, %false otherwise.
 */
1294bool is_kernel_percpu_address(unsigned long addr)
1295{
1296#ifdef CONFIG_SMP
1297 const size_t static_size = __per_cpu_end - __per_cpu_start;
1298 void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
1299 unsigned int cpu;
1300
1301 for_each_possible_cpu(cpu) {
1302 void *start = per_cpu_ptr(base, cpu);
1303
1304 if ((void *)addr >= start && (void *)addr < start + static_size)
1305 return true;
1306 }
1307#endif
1308
1309 return false;
1310}
1311
/**
 * per_cpu_ptr_to_phys - convert translated percpu address to physical address
 * @addr: the address to be converted to physical address
 *
 * Given @addr which is dereferenceable address obtained via one of
 * percpu access macros, this function translates it into its physical
 * address.  The caller is responsible for ensuring @addr stays valid
 * until this function finishes.
 *
 * The first chunk may be embedded in the linear address space or be
 * vmalloc mapped, so it is checked explicitly; addresses in any other
 * chunk are translated via the backing allocator's page mapping.
 *
 * RETURNS:
 * The physical address for @addr.
 */
1335phys_addr_t per_cpu_ptr_to_phys(void *addr)
1336{
1337 void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
1338 bool in_first_chunk = false;
1339 unsigned long first_low, first_high;
1340 unsigned int cpu;
1341
1342
1343
1344
1345
1346
1347 first_low = pcpu_chunk_addr(pcpu_first_chunk, pcpu_low_unit_cpu, 0);
1348 first_high = pcpu_chunk_addr(pcpu_first_chunk, pcpu_high_unit_cpu,
1349 pcpu_unit_pages);
1350 if ((unsigned long)addr >= first_low &&
1351 (unsigned long)addr < first_high) {
1352 for_each_possible_cpu(cpu) {
1353 void *start = per_cpu_ptr(base, cpu);
1354
1355 if (addr >= start && addr < start + pcpu_unit_size) {
1356 in_first_chunk = true;
1357 break;
1358 }
1359 }
1360 }
1361
1362 if (in_first_chunk) {
1363 if (!is_vmalloc_addr(addr))
1364 return __pa(addr);
1365 else
1366 return page_to_phys(vmalloc_to_page(addr)) +
1367 offset_in_page(addr);
1368 } else
1369 return page_to_phys(pcpu_addr_to_page(addr)) +
1370 offset_in_page(addr);
1371}
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
1389 int nr_units)
1390{
1391 struct pcpu_alloc_info *ai;
1392 size_t base_size, ai_size;
1393 void *ptr;
1394 int unit;
1395
1396 base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]),
1397 __alignof__(ai->groups[0].cpu_map[0]));
1398 ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
1399
1400 ptr = memblock_virt_alloc_nopanic(PFN_ALIGN(ai_size), 0);
1401 if (!ptr)
1402 return NULL;
1403 ai = ptr;
1404 ptr += base_size;
1405
1406 ai->groups[0].cpu_map = ptr;
1407
1408 for (unit = 0; unit < nr_units; unit++)
1409 ai->groups[0].cpu_map[unit] = NR_CPUS;
1410
1411 ai->nr_groups = nr_groups;
1412 ai->__ai_size = PFN_ALIGN(ai_size);
1413
1414 return ai;
1415}
1416
1417
1418
1419
1420
1421
1422
1423void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
1424{
1425 memblock_free_early(__pa(ai), ai->__ai_size);
1426}
1427
1428
1429
1430
1431
1432
1433
1434
1435static void pcpu_dump_alloc_info(const char *lvl,
1436 const struct pcpu_alloc_info *ai)
1437{
1438 int group_width = 1, cpu_width = 1, width;
1439 char empty_str[] = "--------";
1440 int alloc = 0, alloc_end = 0;
1441 int group, v;
1442 int upa, apl;
1443
1444 v = ai->nr_groups;
1445 while (v /= 10)
1446 group_width++;
1447
1448 v = num_possible_cpus();
1449 while (v /= 10)
1450 cpu_width++;
1451 empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';
1452
1453 upa = ai->alloc_size / ai->unit_size;
1454 width = upa * (cpu_width + 1) + group_width + 3;
1455 apl = rounddown_pow_of_two(max(60 / width, 1));
1456
1457 printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
1458 lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
1459 ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);
1460
1461 for (group = 0; group < ai->nr_groups; group++) {
1462 const struct pcpu_group_info *gi = &ai->groups[group];
1463 int unit = 0, unit_end = 0;
1464
1465 BUG_ON(gi->nr_units % upa);
1466 for (alloc_end += gi->nr_units / upa;
1467 alloc < alloc_end; alloc++) {
1468 if (!(alloc % apl)) {
1469 pr_cont("\n");
1470 printk("%spcpu-alloc: ", lvl);
1471 }
1472 pr_cont("[%0*d] ", group_width, group);
1473
1474 for (unit_end += upa; unit < unit_end; unit++)
1475 if (gi->cpu_map[unit] != NR_CPUS)
1476 pr_cont("%0*d ",
1477 cpu_width, gi->cpu_map[unit]);
1478 else
1479 pr_cont("%s ", empty_str);
1480 }
1481 }
1482 pr_cont("\n");
1483}
1484
/**
 * pcpu_setup_first_chunk - initialize the first percpu chunk
 * @ai: pcpu_alloc_info describing how the percpu area is shaped
 * @base_addr: mapped address
 *
 * Initialize the first percpu chunk which contains the kernel static
 * percpu area.  This function is to be called from arch percpu area
 * setup path.
 *
 * @ai contains all information necessary to initialize the first
 * chunk and prime the dynamic percpu allocator.  @ai->static_size is
 * the size of the static percpu area; @ai->reserved_size, if non-zero,
 * specifies the amount of bytes to serve reserved (per-module static)
 * allocations; and @ai->dyn_size determines the number of bytes
 * available for dynamic allocation in the first chunk.
 *
 * @ai->unit_size specifies the unit size and must be aligned to
 * PAGE_SIZE and equal to or larger than @ai->static_size +
 * @ai->reserved_size + @ai->dyn_size.  @ai->atom_size is the
 * allocation atom size and is used as the alignment for vm areas.
 * The unit to cpu mapping is described by @ai->groups[], and
 * @base_addr is the mapped address of the first chunk.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
1541int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1542 void *base_addr)
1543{
1544 static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
1545 static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
1546 size_t dyn_size = ai->dyn_size;
1547 size_t size_sum = ai->static_size + ai->reserved_size + dyn_size;
1548 struct pcpu_chunk *schunk, *dchunk = NULL;
1549 unsigned long *group_offsets;
1550 size_t *group_sizes;
1551 unsigned long *unit_off;
1552 unsigned int cpu;
1553 int *unit_map;
1554 int group, unit, i;
1555
1556#define PCPU_SETUP_BUG_ON(cond) do { \
1557 if (unlikely(cond)) { \
1558 pr_emerg("failed to initialize, %s\n", #cond); \
1559 pr_emerg("cpu_possible_mask=%*pb\n", \
1560 cpumask_pr_args(cpu_possible_mask)); \
1561 pcpu_dump_alloc_info(KERN_EMERG, ai); \
1562 BUG(); \
1563 } \
1564} while (0)
1565
1566
1567 PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
1568#ifdef CONFIG_SMP
1569 PCPU_SETUP_BUG_ON(!ai->static_size);
1570 PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start));
1571#endif
1572 PCPU_SETUP_BUG_ON(!base_addr);
1573 PCPU_SETUP_BUG_ON(offset_in_page(base_addr));
1574 PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
1575 PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size));
1576 PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
1577 PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
1578 PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
1579
1580
1581 group_offsets = memblock_virt_alloc(ai->nr_groups *
1582 sizeof(group_offsets[0]), 0);
1583 group_sizes = memblock_virt_alloc(ai->nr_groups *
1584 sizeof(group_sizes[0]), 0);
1585 unit_map = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_map[0]), 0);
1586 unit_off = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_off[0]), 0);
1587
1588 for (cpu = 0; cpu < nr_cpu_ids; cpu++)
1589 unit_map[cpu] = UINT_MAX;
1590
1591 pcpu_low_unit_cpu = NR_CPUS;
1592 pcpu_high_unit_cpu = NR_CPUS;
1593
1594 for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
1595 const struct pcpu_group_info *gi = &ai->groups[group];
1596
1597 group_offsets[group] = gi->base_offset;
1598 group_sizes[group] = gi->nr_units * ai->unit_size;
1599
1600 for (i = 0; i < gi->nr_units; i++) {
1601 cpu = gi->cpu_map[i];
1602 if (cpu == NR_CPUS)
1603 continue;
1604
1605 PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids);
1606 PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
1607 PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);
1608
1609 unit_map[cpu] = unit + i;
1610 unit_off[cpu] = gi->base_offset + i * ai->unit_size;
1611
1612
1613 if (pcpu_low_unit_cpu == NR_CPUS ||
1614 unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
1615 pcpu_low_unit_cpu = cpu;
1616 if (pcpu_high_unit_cpu == NR_CPUS ||
1617 unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
1618 pcpu_high_unit_cpu = cpu;
1619 }
1620 }
1621 pcpu_nr_units = unit;
1622
1623 for_each_possible_cpu(cpu)
1624 PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);
1625
1626
1627#undef PCPU_SETUP_BUG_ON
1628 pcpu_dump_alloc_info(KERN_DEBUG, ai);
1629
1630 pcpu_nr_groups = ai->nr_groups;
1631 pcpu_group_offsets = group_offsets;
1632 pcpu_group_sizes = group_sizes;
1633 pcpu_unit_map = unit_map;
1634 pcpu_unit_offsets = unit_off;
1635
1636
1637 pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
1638 pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
1639 pcpu_atom_size = ai->atom_size;
1640 pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
1641 BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
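	/*
	 * struct pcpu_chunk is allocated together with its trailing
	 * populated[] bitmap which needs one bit per unit page.
	 */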
1642
1643
1644
1645
1646
1647 pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
1648 pcpu_slot = memblock_virt_alloc(
1649 pcpu_nr_slots * sizeof(pcpu_slot[0]), 0);
1650 for (i = 0; i < pcpu_nr_slots; i++)
1651 INIT_LIST_HEAD(&pcpu_slot[i]);
1652
1653
1654
1655
1656
1657
1658
1659
1660 schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
1661 INIT_LIST_HEAD(&schunk->list);
1662 INIT_LIST_HEAD(&schunk->map_extend_list);
1663 schunk->base_addr = base_addr;
1664 schunk->map = smap;
1665 schunk->map_alloc = ARRAY_SIZE(smap);
1666 schunk->immutable = true;
1667 bitmap_fill(schunk->populated, pcpu_unit_pages);
1668 schunk->nr_populated = pcpu_unit_pages;
1669
1670 if (ai->reserved_size) {
1671 schunk->free_size = ai->reserved_size;
1672 pcpu_reserved_chunk = schunk;
1673 pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size;
1674 } else {
1675 schunk->free_size = dyn_size;
1676 dyn_size = 0;
1677 }
1678 schunk->contig_hint = schunk->free_size;
1679
1680 schunk->map[0] = 1;
1681 schunk->map[1] = ai->static_size;
1682 schunk->map_used = 1;
1683 if (schunk->free_size)
1684 schunk->map[++schunk->map_used] = ai->static_size + schunk->free_size;
1685 schunk->map[schunk->map_used] |= 1;
1686
1687
1688 if (dyn_size) {
1689 dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
1690 INIT_LIST_HEAD(&dchunk->list);
1691 INIT_LIST_HEAD(&dchunk->map_extend_list);
1692 dchunk->base_addr = base_addr;
1693 dchunk->map = dmap;
1694 dchunk->map_alloc = ARRAY_SIZE(dmap);
1695 dchunk->immutable = true;
1696 bitmap_fill(dchunk->populated, pcpu_unit_pages);
1697 dchunk->nr_populated = pcpu_unit_pages;
1698
1699 dchunk->contig_hint = dchunk->free_size = dyn_size;
1700 dchunk->map[0] = 1;
1701 dchunk->map[1] = pcpu_reserved_chunk_limit;
1702 dchunk->map[2] = (pcpu_reserved_chunk_limit + dchunk->free_size) | 1;
1703 dchunk->map_used = 2;
1704 }
1705
1706
1707 pcpu_first_chunk = dchunk ?: schunk;
1708 pcpu_nr_empty_pop_pages +=
1709 pcpu_count_occupied_pages(pcpu_first_chunk, 1);
1710 pcpu_chunk_relocate(pcpu_first_chunk, -1);
1711
1712
1713 pcpu_base_addr = base_addr;
1714 return 0;
1715}
1716
1717#ifdef CONFIG_SMP
1718
1719const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = {
1720 [PCPU_FC_AUTO] = "auto",
1721 [PCPU_FC_EMBED] = "embed",
1722 [PCPU_FC_PAGE] = "page",
1723};
1724
1725enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
1726
1727static int __init percpu_alloc_setup(char *str)
1728{
1729 if (!str)
1730 return -EINVAL;
1731
1732 if (0)
1733 ;
1734#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
1735 else if (!strcmp(str, "embed"))
1736 pcpu_chosen_fc = PCPU_FC_EMBED;
1737#endif
1738#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
1739 else if (!strcmp(str, "page"))
1740 pcpu_chosen_fc = PCPU_FC_PAGE;
1741#endif
1742 else
1743 pr_warn("unknown allocator %s specified\n", str);
1744
1745 return 0;
1746}
1747early_param("percpu_alloc", percpu_alloc_setup);
1748
1749
1750
1751
1752
1753
1754#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
1755 !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
1756#define BUILD_EMBED_FIRST_CHUNK
1757#endif
1758
1759
1760#if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
1761#define BUILD_PAGE_FIRST_CHUNK
1762#endif
1763
1764
1765#if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)
/**
 * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
 * @reserved_size: the size of reserved percpu area in bytes
 * @dyn_size: minimum free size for dynamic allocation in bytes
 * @atom_size: allocation atom size
 * @cpu_distance_fn: callback to determine distance between cpus, optional
 *
 * This function determines grouping of units, their mappings to cpus
 * and other parameters considering needed percpu size, allocation
 * atom size and distances between CPUs.
 *
 * Groups are always multiples of atom size and CPUs which are of
 * LOCAL_DISTANCE both ways are grouped together and share space for
 * units in the same group.  The returned configuration is guaranteed
 * to have CPUs on different nodes on different groups and >=75% usage
 * of allocated virtual address space.
 *
 * RETURNS:
 * On success, pointer to the new allocation_info is returned.  On
 * failure, ERR_PTR value is returned.
 */
1787static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
1788 size_t reserved_size, size_t dyn_size,
1789 size_t atom_size,
1790 pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
1791{
1792 static int group_map[NR_CPUS] __initdata;
1793 static int group_cnt[NR_CPUS] __initdata;
1794 const size_t static_size = __per_cpu_end - __per_cpu_start;
1795 int nr_groups = 1, nr_units = 0;
1796 size_t size_sum, min_unit_size, alloc_size;
1797 int upa, max_upa, uninitialized_var(best_upa);
1798 int last_allocs, group, unit;
1799 unsigned int cpu, tcpu;
1800 struct pcpu_alloc_info *ai;
1801 unsigned int *cpu_map;
1802
1803
1804 memset(group_map, 0, sizeof(group_map));
1805 memset(group_cnt, 0, sizeof(group_cnt));
1806
1807
1808 size_sum = PFN_ALIGN(static_size + reserved_size +
1809 max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
1810 dyn_size = size_sum - static_size - reserved_size;
1811
1812
1813
1814
1815
1816
1817
1818 min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
1819
1820 alloc_size = roundup(min_unit_size, atom_size);
1821 upa = alloc_size / min_unit_size;
1822 while (alloc_size % upa || (offset_in_page(alloc_size / upa)))
1823 upa--;
1824 max_upa = upa;
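	/*
	 * upa == units per allocation.  max_upa is the largest number of
	 * units an atom-sized allocation can be evenly split into while
	 * keeping each unit a whole number of pages; e.g. (illustrative,
	 * assuming 4KB pages) a 2MB atom with a 64KB minimum unit size
	 * yields max_upa == 32.
	 */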
1825
1826
1827 for_each_possible_cpu(cpu) {
1828 group = 0;
1829 next_group:
1830 for_each_possible_cpu(tcpu) {
1831 if (cpu == tcpu)
1832 break;
1833 if (group_map[tcpu] == group && cpu_distance_fn &&
1834 (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
1835 cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
1836 group++;
1837 nr_groups = max(nr_groups, group + 1);
1838 goto next_group;
1839 }
1840 }
1841 group_map[cpu] = group;
1842 group_cnt[group]++;
1843 }
1844
1845
1846
1847
1848
1849
1850 last_allocs = INT_MAX;
1851 for (upa = max_upa; upa; upa--) {
1852 int allocs = 0, wasted = 0;
1853
1854 if (alloc_size % upa || (offset_in_page(alloc_size / upa)))
1855 continue;
1856
1857 for (group = 0; group < nr_groups; group++) {
1858 int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
1859 allocs += this_allocs;
1860 wasted += this_allocs * upa - group_cnt[group];
1861 }
1862
1863
1864
1865
1866
1867
1868 if (wasted > num_possible_cpus() / 3)
1869 continue;
1870
1871
1872 if (allocs > last_allocs)
1873 break;
1874 last_allocs = allocs;
1875 best_upa = upa;
1876 }
1877 upa = best_upa;
1878
1879
1880 for (group = 0; group < nr_groups; group++)
1881 nr_units += roundup(group_cnt[group], upa);
1882
1883 ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
1884 if (!ai)
1885 return ERR_PTR(-ENOMEM);
1886 cpu_map = ai->groups[0].cpu_map;
1887
1888 for (group = 0; group < nr_groups; group++) {
1889 ai->groups[group].cpu_map = cpu_map;
1890 cpu_map += roundup(group_cnt[group], upa);
1891 }
1892
1893 ai->static_size = static_size;
1894 ai->reserved_size = reserved_size;
1895 ai->dyn_size = dyn_size;
1896 ai->unit_size = alloc_size / upa;
1897 ai->atom_size = atom_size;
1898 ai->alloc_size = alloc_size;
1899
1900 for (group = 0, unit = 0; group_cnt[group]; group++) {
1901 struct pcpu_group_info *gi = &ai->groups[group];
1902
1903
1904
1905
1906
1907
1908 gi->base_offset = unit * ai->unit_size;
1909
1910 for_each_possible_cpu(cpu)
1911 if (group_map[cpu] == group)
1912 gi->cpu_map[gi->nr_units++] = cpu;
1913 gi->nr_units = roundup(gi->nr_units, upa);
1914 unit += gi->nr_units;
1915 }
1916 BUG_ON(unit != nr_units);
1917
1918 return ai;
1919}
1920#endif
1921
1922#if defined(BUILD_EMBED_FIRST_CHUNK)
/**
 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
 * @reserved_size: the size of reserved percpu area in bytes
 * @dyn_size: minimum free size for dynamic allocation in bytes
 * @atom_size: allocation atom size
 * @cpu_distance_fn: callback to determine distance between cpus, optional
 * @alloc_fn: function to allocate percpu page
 * @free_fn: function to free percpu page
 *
 * This is a helper to ease setting up embedded first percpu chunk and
 * can be called where pcpu_setup_first_chunk() is expected.
 *
 * If this function is used to setup the first chunk, it is allocated
 * by calling @alloc_fn and used as-is without being mapped into the
 * vmalloc area.  Allocations are always whole multiples of @atom_size
 * aligned to @atom_size, and units belonging to the same group share
 * one allocation.  The space not used for the first chunk is returned
 * through @free_fn.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
1955int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
1956 size_t atom_size,
1957 pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
1958 pcpu_fc_alloc_fn_t alloc_fn,
1959 pcpu_fc_free_fn_t free_fn)
1960{
1961 void *base = (void *)ULONG_MAX;
1962 void **areas = NULL;
1963 struct pcpu_alloc_info *ai;
1964 size_t size_sum, areas_size;
1965 unsigned long max_distance;
1966 int group, i, highest_group, rc;
1967
1968 ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
1969 cpu_distance_fn);
1970 if (IS_ERR(ai))
1971 return PTR_ERR(ai);
1972
1973 size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
1974 areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));
1975
1976 areas = memblock_virt_alloc_nopanic(areas_size, 0);
1977 if (!areas) {
1978 rc = -ENOMEM;
1979 goto out_free;
1980 }
1981
1982
1983 highest_group = 0;
1984 for (group = 0; group < ai->nr_groups; group++) {
1985 struct pcpu_group_info *gi = &ai->groups[group];
1986 unsigned int cpu = NR_CPUS;
1987 void *ptr;
1988
1989 for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
1990 cpu = gi->cpu_map[i];
1991 BUG_ON(cpu == NR_CPUS);
1992
1993
1994 ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
1995 if (!ptr) {
1996 rc = -ENOMEM;
1997 goto out_free_areas;
1998 }
1999
2000 kmemleak_free(ptr);
2001 areas[group] = ptr;
2002
2003 base = min(ptr, base);
2004 if (ptr > areas[highest_group])
2005 highest_group = group;
2006 }
2007 max_distance = areas[highest_group] - base;
2008 max_distance += ai->unit_size * ai->groups[highest_group].nr_units;
2009
2010
2011 if (max_distance > VMALLOC_TOTAL * 3 / 4) {
2012 pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx\n",
2013 max_distance, VMALLOC_TOTAL);
2014#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
2015
2016 rc = -EINVAL;
2017 goto out_free_areas;
2018#endif
2019 }
2020
2021
2022
2023
2024
2025
2026 for (group = 0; group < ai->nr_groups; group++) {
2027 struct pcpu_group_info *gi = &ai->groups[group];
2028 void *ptr = areas[group];
2029
2030 for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
2031 if (gi->cpu_map[i] == NR_CPUS) {
2032
2033 free_fn(ptr, ai->unit_size);
2034 continue;
2035 }
2036
2037 memcpy(ptr, __per_cpu_load, ai->static_size);
2038 free_fn(ptr + size_sum, ai->unit_size - size_sum);
2039 }
2040 }
2041
2042
2043 for (group = 0; group < ai->nr_groups; group++) {
2044 ai->groups[group].base_offset = areas[group] - base;
2045 }
2046
2047 pr_info("Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
2048 PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
2049 ai->dyn_size, ai->unit_size);
2050
2051 rc = pcpu_setup_first_chunk(ai, base);
2052 goto out_free;
2053
2054out_free_areas:
2055 for (group = 0; group < ai->nr_groups; group++)
2056 if (areas[group])
2057 free_fn(areas[group],
2058 ai->groups[group].nr_units * ai->unit_size);
2059out_free:
2060 pcpu_free_alloc_info(ai);
2061 if (areas)
2062 memblock_free_early(__pa(areas), areas_size);
2063 return rc;
2064}
2065#endif
2066
2067#ifdef BUILD_PAGE_FIRST_CHUNK
/**
 * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
 * @reserved_size: the size of reserved percpu area in bytes
 * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
 * @free_fn: function to free percpu page, always called with PAGE_SIZE
 * @populate_pte_fn: function to populate pte
 *
 * This is a helper to ease setting up page-remapped first percpu
 * chunk and can be called where pcpu_setup_first_chunk() is expected.
 *
 * This is the basic allocator.  The static percpu area is allocated
 * page-by-page into the vmalloc area.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
2084int __init pcpu_page_first_chunk(size_t reserved_size,
2085 pcpu_fc_alloc_fn_t alloc_fn,
2086 pcpu_fc_free_fn_t free_fn,
2087 pcpu_fc_populate_pte_fn_t populate_pte_fn)
2088{
2089 static struct vm_struct vm;
2090 struct pcpu_alloc_info *ai;
2091 char psize_str[16];
2092 int unit_pages;
2093 size_t pages_size;
2094 struct page **pages;
2095 int unit, i, j, rc;
2096
2097 snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
2098
2099 ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL);
2100 if (IS_ERR(ai))
2101 return PTR_ERR(ai);
2102 BUG_ON(ai->nr_groups != 1);
2103 BUG_ON(ai->groups[0].nr_units != num_possible_cpus());
2104
2105 unit_pages = ai->unit_size >> PAGE_SHIFT;
2106
2107
2108 pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
2109 sizeof(pages[0]));
2110 pages = memblock_virt_alloc(pages_size, 0);
2111
2112
2113 j = 0;
2114 for (unit = 0; unit < num_possible_cpus(); unit++)
2115 for (i = 0; i < unit_pages; i++) {
2116 unsigned int cpu = ai->groups[0].cpu_map[unit];
2117 void *ptr;
2118
2119 ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
2120 if (!ptr) {
2121 pr_warn("failed to allocate %s page for cpu%u\n",
2122 psize_str, cpu);
2123 goto enomem;
2124 }
2125
2126 kmemleak_free(ptr);
2127 pages[j++] = virt_to_page(ptr);
2128 }
2129
2130
2131 vm.flags = VM_ALLOC;
2132 vm.size = num_possible_cpus() * ai->unit_size;
2133 vm_area_register_early(&vm, PAGE_SIZE);
2134
2135 for (unit = 0; unit < num_possible_cpus(); unit++) {
2136 unsigned long unit_addr =
2137 (unsigned long)vm.addr + unit * ai->unit_size;
2138
2139 for (i = 0; i < unit_pages; i++)
2140 populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
2141
2142
2143 rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
2144 unit_pages);
2145 if (rc < 0)
2146 panic("failed to map percpu area, err=%d\n", rc);
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157 memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
2158 }
2159
2160
2161 pr_info("%d %s pages/cpu @%p s%zu r%zu d%zu\n",
2162 unit_pages, psize_str, vm.addr, ai->static_size,
2163 ai->reserved_size, ai->dyn_size);
2164
2165 rc = pcpu_setup_first_chunk(ai, vm.addr);
2166 goto out_free_ar;
2167
2168enomem:
2169 while (--j >= 0)
2170 free_fn(page_address(pages[j]), PAGE_SIZE);
2171 rc = -ENOMEM;
2172out_free_ar:
2173 memblock_free_early(__pa(pages), pages_size);
2174 pcpu_free_alloc_info(ai);
2175 return rc;
2176}
2177#endif
2178
2179#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
/*
 * Generic SMP percpu area setup.
 *
 * The embedding helper is used because its behavior closely resembles
 * the original non-dynamic generic percpu area setup.  This is
 * important because many archs have addressing restrictions and might
 * fail if the percpu area is located far away from the previous
 * location.  As an added bonus, in non-NUMA cases, embedding is
 * generally a good idea TLB-wise because the percpu area can piggy
 * back on the physical linear memory layout.
 */
2192unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
2193EXPORT_SYMBOL(__per_cpu_offset);
2194
2195static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
2196 size_t align)
2197{
2198 return memblock_virt_alloc_from_nopanic(
2199 size, align, __pa(MAX_DMA_ADDRESS));
2200}
2201
2202static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
2203{
2204 memblock_free_early(__pa(ptr), size);
2205}
2206
2207void __init setup_per_cpu_areas(void)
2208{
2209 unsigned long delta;
2210 unsigned int cpu;
2211 int rc;
2212
2213
2214
2215
2216
2217 rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
2218 PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
2219 pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
2220 if (rc < 0)
2221 panic("Failed to initialize percpu areas.");
2222
2223 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
2224 for_each_possible_cpu(cpu)
2225 __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
2226}
2227#endif
2228
2229#else
2230
2231
2232
2233
2234
2235
2236
2237
2238void __init setup_per_cpu_areas(void)
2239{
2240 const size_t unit_size =
2241 roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
2242 PERCPU_DYNAMIC_RESERVE));
2243 struct pcpu_alloc_info *ai;
2244 void *fc;
2245
2246 ai = pcpu_alloc_alloc_info(1, 1);
2247 fc = memblock_virt_alloc_from_nopanic(unit_size,
2248 PAGE_SIZE,
2249 __pa(MAX_DMA_ADDRESS));
2250 if (!ai || !fc)
2251 panic("Failed to allocate memory for percpu areas.");
2252
2253 kmemleak_free(fc);
2254
2255 ai->dyn_size = unit_size;
2256 ai->unit_size = unit_size;
2257 ai->atom_size = unit_size;
2258 ai->alloc_size = unit_size;
2259 ai->groups[0].nr_units = 1;
2260 ai->groups[0].cpu_map[0] = 0;
2261
2262 if (pcpu_setup_first_chunk(ai, fc) < 0)
2263 panic("Failed to initialize percpu areas.");
2264}
2265
2266#endif
2267
/*
 * First and reserved chunks are initialized with temporary allocation
 * maps in initdata so that they can be used before slab is online.
 * This function is called after slab is brought up and replaces those
 * with properly allocated maps.
 */
2274void __init percpu_init_late(void)
2275{
2276 struct pcpu_chunk *target_chunks[] =
2277 { pcpu_first_chunk, pcpu_reserved_chunk, NULL };
2278 struct pcpu_chunk *chunk;
2279 unsigned long flags;
2280 int i;
2281
2282 for (i = 0; (chunk = target_chunks[i]); i++) {
2283 int *map;
2284 const size_t size = PERCPU_DYNAMIC_EARLY_SLOTS * sizeof(map[0]);
2285
2286 BUILD_BUG_ON(size > PAGE_SIZE);
2287
2288 map = pcpu_mem_zalloc(size);
2289 BUG_ON(!map);
2290
2291 spin_lock_irqsave(&pcpu_lock, flags);
2292 memcpy(map, chunk->map, size);
2293 chunk->map = map;
2294 spin_unlock_irqrestore(&pcpu_lock, flags);
2295 }
2296}
2297
/*
 * Percpu allocator is initialized early during boot when neither slab nor
 * workqueue is available.  Plug async management until everything is up
 * and running.
 */
2303static int __init percpu_enable_async(void)
2304{
2305 pcpu_async_enabled = true;
2306 return 0;
2307}
2308subsys_initcall(percpu_enable_async);
2309