/*
 * mm/percpu.c - percpu memory allocator
 *
 * The percpu allocator hands out per-CPU memory in units.  Each chunk
 * covers one unit per possible CPU; an allocation is carved out of a
 * chunk's area map and the same offset is valid in every CPU's unit.
 * The first chunk hosts the kernel's static percpu area and may carry
 * an optional reserved region; later chunks are created on demand.
 */
#include <linux/bitmap.h>
#include <linux/bootmem.h>
#include <linux/err.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/pfn.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
#include <linux/kmemleak.h>

#include <asm/cacheflush.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
#include <asm/io.h>

#define PCPU_SLOT_BASE_SHIFT		5	/* 1-31 shares the same slot */
#define PCPU_DFL_MAP_ALLOC		16	/* start a map with 16 ents */
#define PCPU_ATOMIC_MAP_MARGIN_LOW	32
#define PCPU_ATOMIC_MAP_MARGIN_HIGH	64
#define PCPU_EMPTY_POP_PAGES_LOW	2
#define PCPU_EMPTY_POP_PAGES_HIGH	4

#ifdef CONFIG_SMP
/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
#ifndef __addr_to_pcpu_ptr
#define __addr_to_pcpu_ptr(addr)					\
	(void __percpu *)((unsigned long)(addr) -			\
			  (unsigned long)pcpu_base_addr +		\
			  (unsigned long)__per_cpu_start)
#endif
#ifndef __pcpu_ptr_to_addr
#define __pcpu_ptr_to_addr(ptr)						\
	(void __force *)((unsigned long)(ptr) +				\
			 (unsigned long)pcpu_base_addr -		\
			 (unsigned long)__per_cpu_start)
#endif
#else	/* CONFIG_SMP */
/* on UP, it's always identity mapped */
#define __addr_to_pcpu_ptr(addr)	(void __percpu *)(addr)
#define __pcpu_ptr_to_addr(ptr)		(void __force *)(ptr)
#endif	/* CONFIG_SMP */

struct pcpu_chunk {
	struct list_head	list;		/* linked to pcpu_slot lists */
	int			free_size;	/* free bytes in the chunk */
	int			contig_hint;	/* max contiguous size hint */
	void			*base_addr;	/* base address of this chunk */

	int			map_used;	/* # of map entries used before the sentry */
	int			map_alloc;	/* # of map entries allocated */
	int			*map;		/* allocation map */
	struct work_struct	map_extend_work;/* async ->map[] extension */

	void			*data;		/* chunk data */
	int			first_free;	/* no free below this */
	bool			immutable;	/* no [de]population allowed */
	int			nr_populated;	/* # of populated pages */
	unsigned long		populated[];	/* populated bitmap */
};
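
/*
 * Illustrative example (not from the original source): how ->map[]
 * encodes allocations.  Each entry is a byte offset into the unit with
 * bit 0 marking the area that starts there as in-use; area i covers
 * [map[i] & ~1, map[i + 1] & ~1) and map[map_used] is an end sentinel
 * at unit_size with bit 0 set.  Assuming a fresh 32k unit after one
 * 256-byte allocation at offset 0, the map would look like:
 *
 *	map[0] = 0 | 1;		- [0, 256) allocated
 *	map[1] = 256;		- [256, 32768) free
 *	map[2] = 32768 | 1;	- end sentinel, never merged
 *	map_used = 2;
 */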

static int pcpu_unit_pages __read_mostly;
static int pcpu_unit_size __read_mostly;
static int pcpu_nr_units __read_mostly;
static int pcpu_atom_size __read_mostly;
static int pcpu_nr_slots __read_mostly;
static size_t pcpu_chunk_struct_size __read_mostly;

/* cpus with the lowest and highest unit addresses */
static unsigned int pcpu_low_unit_cpu __read_mostly;
static unsigned int pcpu_high_unit_cpu __read_mostly;

/* the address of the first chunk which starts with the kernel static area */
void *pcpu_base_addr __read_mostly;
EXPORT_SYMBOL_GPL(pcpu_base_addr);

static const int *pcpu_unit_map __read_mostly;		/* cpu -> unit */
const unsigned long *pcpu_unit_offsets __read_mostly;	/* cpu -> unit offset */

/* group information, used for vm allocation */
static int pcpu_nr_groups __read_mostly;
static const unsigned long *pcpu_group_offsets __read_mostly;
static const size_t *pcpu_group_sizes __read_mostly;

/*
 * The first chunk which always exists.  Note that unlike other
 * chunks, this one can be allocated and mapped in several different
 * ways and thus often doesn't live in the vmalloc area.
 */
static struct pcpu_chunk *pcpu_first_chunk;

/*
 * Optional reserved chunk.  This chunk reserves part of the first
 * chunk and serves it for reserved allocations.  The amount of
 * reserved offset is in pcpu_reserved_chunk_limit.  When the reserved
 * area doesn't exist, the following variables contain NULL and 0
 * respectively.
 */
static struct pcpu_chunk *pcpu_reserved_chunk;
static int pcpu_reserved_chunk_limit;

static DEFINE_SPINLOCK(pcpu_lock);	/* all internal data structures */
static DEFINE_MUTEX(pcpu_alloc_mutex);	/* chunk create/destroy, [de]pop */

static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */

/*
 * The number of empty populated pages, protected by pcpu_lock.  The
 * reserved chunk doesn't contribute to the count.
 */
static int pcpu_nr_empty_pop_pages;

/*
 * Balance work is used to populate or destroy chunks asynchronously.  It
 * tries to keep the number of empty populated pages between
 * PCPU_EMPTY_POP_PAGES_LOW and HIGH so that atomic allocations, which
 * can't populate pages themselves, have something to work with.
 */
static void pcpu_balance_workfn(struct work_struct *work);
static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
static bool pcpu_async_enabled __read_mostly;
static bool pcpu_atomic_alloc_failed;

static void pcpu_schedule_balance_work(void)
{
	if (pcpu_async_enabled)
		schedule_work(&pcpu_balance_work);
}

static bool pcpu_addr_in_first_chunk(void *addr)
{
	void *first_start = pcpu_first_chunk->base_addr;

	return addr >= first_start && addr < first_start + pcpu_unit_size;
}

static bool pcpu_addr_in_reserved_chunk(void *addr)
{
	void *first_start = pcpu_first_chunk->base_addr;

	return addr >= first_start &&
	       addr < first_start + pcpu_reserved_chunk_limit;
}

static int __pcpu_size_to_slot(int size)
{
	int highbit = fls(size);
	return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
}

static int pcpu_size_to_slot(int size)
{
	if (size == pcpu_unit_size)
		return pcpu_nr_slots - 1;
	return __pcpu_size_to_slot(size);
}

static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
{
	if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int))
		return 0;

	return pcpu_size_to_slot(chunk->free_size);
}
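
/*
 * Worked example (sketch): with PCPU_SLOT_BASE_SHIFT == 5, a chunk is
 * filed by the power of two of its free_size:
 *
 *	__pcpu_size_to_slot(16)   == max(fls(16)   - 5 + 2, 1) == 2
 *	__pcpu_size_to_slot(1024) == max(fls(1024) - 5 + 2, 1) == 8
 *
 * A fully free chunk (free_size == pcpu_unit_size) always lands in the
 * last slot, pcpu_nr_slots - 1, while a chunk too small or fragmented
 * to host even a sizeof(int) area is parked in slot 0.
 */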

/* set the pointer to a chunk in a page struct */
static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
{
	page->index = (unsigned long)pcpu;
}

/* obtain pointer to a chunk from a page struct */
static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
{
	return (struct pcpu_chunk *)page->index;
}

static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
{
	return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
}

static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
				     unsigned int cpu, int page_idx)
{
	return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] +
		(page_idx << PAGE_SHIFT);
}

static void __maybe_unused pcpu_next_unpop(struct pcpu_chunk *chunk,
					   int *rs, int *re, int end)
{
	*rs = find_next_zero_bit(chunk->populated, end, *rs);
	*re = find_next_bit(chunk->populated, end, *rs + 1);
}

static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk,
					 int *rs, int *re, int end)
{
	*rs = find_next_bit(chunk->populated, end, *rs);
	*re = find_next_zero_bit(chunk->populated, end, *rs + 1);
}

/*
 * (Un)populated page region iterators.  Iterate over (un)populated
 * page regions between @start and @end in @chunk.  @rs and @re should
 * be integer variables and will be set to start and end page index of
 * the current region.
 */
#define pcpu_for_each_unpop_region(chunk, rs, re, start, end)		    \
	for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \
	     (rs) < (re);						    \
	     (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end)))

#define pcpu_for_each_pop_region(chunk, rs, re, start, end)		    \
	for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end));   \
	     (rs) < (re);						    \
	     (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))
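
/*
 * Usage sketch (illustrative, locking elided; assumes a valid chunk and
 * page range): walk the unpopulated page runs of a chunk and back them
 * with pages, which mirrors how pcpu_alloc() populates a freshly
 * allocated area.
 *
 *	int rs, re;
 *
 *	pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
 *		if (pcpu_populate_chunk(chunk, rs, re))
 *			break;
 *		pcpu_chunk_populated(chunk, rs, re);
 *	}
 *
 * Each iteration yields one maximal run of clear bits in
 * chunk->populated; rs and re are page indexes into the unit.
 */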

/**
 * pcpu_mem_zalloc - allocate memory
 * @size: bytes to allocate
 *
 * Allocate @size bytes.  If @size is smaller than PAGE_SIZE,
 * kzalloc() is used; otherwise, vzalloc() is used.  The returned
 * memory is always zeroed.
 *
 * RETURNS:
 * Pointer to the allocated area on success, NULL on failure.
 */
static void *pcpu_mem_zalloc(size_t size)
{
	if (WARN_ON_ONCE(!slab_is_available()))
		return NULL;

	if (size <= PAGE_SIZE)
		return kzalloc(size, GFP_KERNEL);
	else
		return vzalloc(size);
}

/**
 * pcpu_mem_free - free memory
 * @ptr: memory to free
 * @size: size of the area
 *
 * Free @ptr.  @ptr should have been allocated using pcpu_mem_zalloc().
 */
static void pcpu_mem_free(void *ptr, size_t size)
{
	if (size <= PAGE_SIZE)
		kfree(ptr);
	else
		vfree(ptr);
}

/**
 * pcpu_count_occupied_pages - count the number of pages an area occupies
 * @chunk: chunk of interest
 * @i: index of the area in question
 *
 * Count the number of pages chunk's @i'th area occupies.  When the area's
 * start and/or end address isn't aligned to page boundary, the straddled
 * page is included in the count iff the rest of the page is free.
 */
static int pcpu_count_occupied_pages(struct pcpu_chunk *chunk, int i)
{
	int off = chunk->map[i] & ~1;
	int end = chunk->map[i + 1] & ~1;

	if (!PAGE_ALIGNED(off) && i > 0) {
		int prev = chunk->map[i - 1];

		if (!(prev & 1) && prev <= round_down(off, PAGE_SIZE))
			off = round_down(off, PAGE_SIZE);
	}

	if (!PAGE_ALIGNED(end) && i + 1 < chunk->map_used) {
		int next = chunk->map[i + 1];
		int nend = chunk->map[i + 2] & ~1;

		if (!(next & 1) && nend >= round_up(end, PAGE_SIZE))
			end = round_up(end, PAGE_SIZE);
	}

	return max_t(int, PFN_DOWN(end) - PFN_UP(off), 0);
}
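
/*
 * Worked example (sketch, assuming 4k pages): for an in-use area
 * spanning [1024, 9216) only page 1 lies fully inside it, so the raw
 * count is PFN_DOWN(9216) - PFN_UP(1024) = 2 - 1 = 1.  If the area on
 * the left is free back to offset 0 and the one on the right is free
 * through 12288, the straddled edge pages serve nothing else, so off
 * and end are rounded out to [0, 12288) and the result becomes 3.
 * This page count is what keeps pcpu_nr_empty_pop_pages in sync on
 * allocation and free.
 */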

/**
 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
 * @chunk: chunk of interest
 * @oslot: the previous slot it was on
 *
 * This function is called after an allocation or free changed @chunk.
 * New slot according to the changed state is determined and @chunk is
 * moved to the slot.  Note that the reserved chunk is never put on
 * chunk slots.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
{
	int nslot = pcpu_chunk_slot(chunk);

	if (chunk != pcpu_reserved_chunk && oslot != nslot) {
		if (oslot < nslot)
			list_move(&chunk->list, &pcpu_slot[nslot]);
		else
			list_move_tail(&chunk->list, &pcpu_slot[nslot]);
	}
}

/**
 * pcpu_need_to_extend - determine whether the area map needs to be extended
 * @chunk: chunk of interest
 * @is_atomic: the allocation context
 *
 * Determine whether the area map of @chunk needs to be extended.  If
 * @is_atomic, only the amount necessary for a new allocation is
 * considered; however, async extension is scheduled if the left amount
 * is low.  If !@is_atomic, it aims for more empty space.
 *
 * CONTEXT:
 * pcpu_lock.
 *
 * RETURNS:
 * New target map allocation length if extension is necessary, 0
 * otherwise.
 */
static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic)
{
	int margin, new_alloc;

	if (is_atomic) {
		margin = 3;

		if (chunk->map_alloc <
		    chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW &&
		    pcpu_async_enabled)
			schedule_work(&chunk->map_extend_work);
	} else {
		margin = PCPU_ATOMIC_MAP_MARGIN_HIGH;
	}

	if (chunk->map_alloc >= chunk->map_used + margin)
		return 0;

	new_alloc = PCPU_DFL_MAP_ALLOC;
	while (new_alloc < chunk->map_used + margin)
		new_alloc *= 2;

	return new_alloc;
}
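
/*
 * Sizing example (sketch): the map only ever grows in powers of two
 * starting from PCPU_DFL_MAP_ALLOC.  A chunk with map_used == 150 that
 * must keep PCPU_ATOMIC_MAP_MARGIN_HIGH (64) spare entries needs at
 * least 214 slots, so new_alloc doubles 16 -> 32 -> 64 -> 128 -> 256.
 * Atomic callers only insist on a margin of 3 because a single
 * allocation inserts at most two new map entries; the real extension
 * is deferred to the per-chunk work item.
 */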

/**
 * pcpu_extend_area_map - extend area map of a chunk
 * @chunk: chunk of interest
 * @new_alloc: new target allocation length of the area map
 *
 * Extend area map of @chunk to have @new_alloc entries.
 *
 * CONTEXT:
 * Does GFP_KERNEL allocation.  Grabs and releases pcpu_lock.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
{
	int *old = NULL, *new = NULL;
	size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
	unsigned long flags;

	new = pcpu_mem_zalloc(new_size);
	if (!new)
		return -ENOMEM;

	/* acquire pcpu_lock and switch to new area map */
	spin_lock_irqsave(&pcpu_lock, flags);

	if (new_alloc <= chunk->map_alloc)
		goto out_unlock;

	old_size = chunk->map_alloc * sizeof(chunk->map[0]);
	old = chunk->map;

	memcpy(new, old, old_size);

	chunk->map_alloc = new_alloc;
	chunk->map = new;
	new = NULL;

out_unlock:
	spin_unlock_irqrestore(&pcpu_lock, flags);

	/*
	 * pcpu_mem_free() might end up calling vfree() which uses
	 * IRQ-unsafe lock and thus can't be called under pcpu_lock.
	 */
	pcpu_mem_free(old, old_size);
	pcpu_mem_free(new, new_size);

	return 0;
}

static void pcpu_map_extend_workfn(struct work_struct *work)
{
	struct pcpu_chunk *chunk = container_of(work, struct pcpu_chunk,
						map_extend_work);
	int new_alloc;

	spin_lock_irq(&pcpu_lock);
	new_alloc = pcpu_need_to_extend(chunk, false);
	spin_unlock_irq(&pcpu_lock);

	if (new_alloc)
		pcpu_extend_area_map(chunk, new_alloc);
}

/**
 * pcpu_fit_in_area - try to fit the requested allocation in a candidate area
 * @chunk: chunk the candidate area belongs to
 * @off: the offset to the start of the candidate area
 * @this_size: the size of the candidate area
 * @size: the size of the target allocation
 * @align: the alignment of the target allocation
 * @pop_only: only allocate from already populated region
 *
 * We're trying to allocate @size bytes aligned at @align.  @chunk's area
 * at @off sized @this_size is a candidate.  This function determines
 * whether the target allocation fits in the candidate area and returns the
 * number of bytes to pad after @off.  If the target area doesn't fit, -1
 * is returned.
 *
 * If @pop_only is %true, this function only considers the already
 * populated part of the candidate area.
 */
static int pcpu_fit_in_area(struct pcpu_chunk *chunk, int off, int this_size,
			    int size, int align, bool pop_only)
{
	int cand_off = off;

	while (true) {
		int head = ALIGN(cand_off, align) - off;
		int page_start, page_end, rs, re;

		if (this_size < head + size)
			return -1;

		if (!pop_only)
			return head;

		/*
		 * If the first unpopulated page is beyond the end of the
		 * allocation, the whole allocation is populated;
		 * otherwise, retry from the end of the unpopulated area.
		 */
		page_start = PFN_DOWN(head + off);
		page_end = PFN_UP(head + off + size);

		rs = page_start;
		pcpu_next_unpop(chunk, &rs, &re, PFN_UP(off + this_size));
		if (rs >= page_end)
			return head;
		cand_off = re * PAGE_SIZE;
	}
}

/**
 * pcpu_alloc_area - allocate area from a pcpu_chunk
 * @chunk: chunk of interest
 * @size: wanted size in bytes
 * @align: wanted align
 * @pop_only: allocate only from the populated area
 * @occ_pages_p: out param for the number of pages the area occupies
 *
 * Try to allocate @size bytes area aligned at @align from @chunk.
 * Note that this function only allocates the offset.  It doesn't
 * populate or map the area.
 *
 * @chunk->map must have at least two free slots.
 *
 * CONTEXT:
 * pcpu_lock.
 *
 * RETURNS:
 * Allocated offset in @chunk on success, -1 if no matching area is
 * found.
 */
static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align,
			   bool pop_only, int *occ_pages_p)
{
	int oslot = pcpu_chunk_slot(chunk);
	int max_contig = 0;
	int i, off;
	bool seen_free = false;
	int *p;

	for (i = chunk->first_free, p = chunk->map + i; i < chunk->map_used;
	     i++, p++) {
		int head, tail;
		int this_size;

		off = *p;
		if (off & 1)
			continue;

		this_size = (p[1] & ~1) - off;

		head = pcpu_fit_in_area(chunk, off, this_size, size, align,
					pop_only);
		if (head < 0) {
			if (!seen_free) {
				chunk->first_free = i;
				seen_free = true;
			}
			max_contig = max(this_size, max_contig);
			continue;
		}

		/*
		 * If head is small or the previous block is free,
		 * merge'em.  Note that 'small' is defined as smaller
		 * than sizeof(int), which is very small but isn't too
		 * uncommon for percpu allocations.
		 */
		if (head && (head < sizeof(int) || !(p[-1] & 1))) {
			*p = off += head;
			if (p[-1] & 1)
				chunk->free_size -= head;
			else
				max_contig = max(*p - p[-1], max_contig);
			this_size -= head;
			head = 0;
		}

		/* if tail is small, just keep it around */
		tail = this_size - head - size;
		if (tail < sizeof(int)) {
			tail = 0;
			size = this_size - head;
		}

		/* split if warranted */
		if (head || tail) {
			int nr_extra = !!head + !!tail;

			/* insert new subblocks */
			memmove(p + nr_extra + 1, p + 1,
				sizeof(chunk->map[0]) * (chunk->map_used - i));
			chunk->map_used += nr_extra;

			if (head) {
				if (!seen_free) {
					chunk->first_free = i;
					seen_free = true;
				}
				*++p = off += head;
				++i;
				max_contig = max(head, max_contig);
			}
			if (tail) {
				p[1] = off + size;
				max_contig = max(tail, max_contig);
			}
		}

		if (!seen_free)
			chunk->first_free = i + 1;

		/* update hint and mark allocated */
		if (i + 1 == chunk->map_used)
			chunk->contig_hint = max_contig; /* fully scanned */
		else
			chunk->contig_hint = max(chunk->contig_hint,
						 max_contig);

		chunk->free_size -= size;
		*p |= 1;

		*occ_pages_p = pcpu_count_occupied_pages(chunk, i);
		pcpu_chunk_relocate(chunk, oslot);
		return off;
	}

	chunk->contig_hint = max_contig;	/* fully scanned */
	pcpu_chunk_relocate(chunk, oslot);

	/* tell the upper layer that this chunk has no matching area */
	return -1;
}
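
/*
 * Worked example (sketch): allocating size = 96 with align = 64 from a
 * free area starting at off = 1000 of size 300.  ALIGN(1000, 64) is
 * 1024, so head = 24 and tail = 300 - 24 - 96 = 180.  Both exceed
 * sizeof(int), so the single free entry is split into three:
 *
 *	... [1000 free 24] [1024 used 96] [1120 free 180] ...
 *
 * and the returned offset is 1024.  A head smaller than sizeof(int),
 * or one bordering another free area, is instead merged away by
 * advancing the previous boundary, which is why callers force
 * align >= 2 and keep bit 0 of every offset free for the in-use flag.
 */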

/**
 * pcpu_free_area - free area to a pcpu_chunk
 * @chunk: chunk of interest
 * @freeme: offset of area to free
 * @occ_pages_p: out param for the number of pages the area occupies
 *
 * Free area starting from @freeme to @chunk.  Note that this function
 * only modifies the allocation map.  It doesn't depopulate or unmap
 * the area.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme,
			   int *occ_pages_p)
{
	int oslot = pcpu_chunk_slot(chunk);
	int off = 0;
	unsigned i, j;
	int to_free = 0;
	int *p;

	freeme |= 1;	/* we are searching for <given offset, in use> pair */

	i = 0;
	j = chunk->map_used;
	while (i != j) {
		unsigned k = (i + j) / 2;
		off = chunk->map[k];
		if (off < freeme)
			i = k + 1;
		else if (off > freeme)
			j = k;
		else
			i = j = k;
	}
	BUG_ON(off != freeme);

	if (i < chunk->first_free)
		chunk->first_free = i;

	p = chunk->map + i;
	*p = off &= ~1;
	chunk->free_size += (p[1] & ~1) - off;

	*occ_pages_p = pcpu_count_occupied_pages(chunk, i);

	/* merge with next? */
	if (!(p[1] & 1))
		to_free++;
	/* merge with previous? */
	if (i > 0 && !(p[-1] & 1)) {
		to_free++;
		i--;
		p--;
	}
	if (to_free) {
		chunk->map_used -= to_free;
		memmove(p + 1, p + 1 + to_free,
			(chunk->map_used - i) * sizeof(chunk->map[0]));
	}

	chunk->contig_hint = max(chunk->map[i + 1] - chunk->map[i] - 1,
				 chunk->contig_hint);
	pcpu_chunk_relocate(chunk, oslot);
}
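
/*
 * Worked example (sketch): freeing the area at offset 1024 from the
 * layout sketched above pcpu_free_area().  The binary search finds
 * map[i] == (1024 | 1), the in-use bit is cleared and free_size grows
 * by 96.  Because both neighbours, [1000, 1024) and [1120, 1300), are
 * free, the two following boundaries are dropped and the three entries
 * collapse back into one free area [1000, 1300); the chunk is then
 * re-filed into the slot matching its new free_size.
 */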

static struct pcpu_chunk *pcpu_alloc_chunk(void)
{
	struct pcpu_chunk *chunk;

	chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size);
	if (!chunk)
		return NULL;

	chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC *
						sizeof(chunk->map[0]));
	if (!chunk->map) {
		pcpu_mem_free(chunk, pcpu_chunk_struct_size);
		return NULL;
	}

	chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
	chunk->map[0] = 0;
	chunk->map[1] = pcpu_unit_size | 1;
	chunk->map_used = 1;

	INIT_LIST_HEAD(&chunk->list);
	INIT_WORK(&chunk->map_extend_work, pcpu_map_extend_workfn);
	chunk->free_size = pcpu_unit_size;
	chunk->contig_hint = pcpu_unit_size;

	return chunk;
}

static void pcpu_free_chunk(struct pcpu_chunk *chunk)
{
	if (!chunk)
		return;
	pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
	pcpu_mem_free(chunk, pcpu_chunk_struct_size);
}

/**
 * pcpu_chunk_populated - post-population bookkeeping
 * @chunk: pcpu_chunk which got populated
 * @page_start: the start page
 * @page_end: the end page
 *
 * Pages in [@page_start,@page_end) have been populated to @chunk.  Update
 * the bookkeeping information accordingly.  Must be called after each
 * successful population.
 */
static void pcpu_chunk_populated(struct pcpu_chunk *chunk,
				 int page_start, int page_end)
{
	int nr = page_end - page_start;

	lockdep_assert_held(&pcpu_lock);

	bitmap_set(chunk->populated, page_start, nr);
	chunk->nr_populated += nr;
	pcpu_nr_empty_pop_pages += nr;
}

/**
 * pcpu_chunk_depopulated - post-depopulation bookkeeping
 * @chunk: pcpu_chunk which got depopulated
 * @page_start: the start page
 * @page_end: the end page
 *
 * Pages in [@page_start,@page_end) have been depopulated from @chunk.
 * Update the bookkeeping information accordingly.  Must be called after
 * each successful depopulation.
 */
static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
				   int page_start, int page_end)
{
	int nr = page_end - page_start;

	lockdep_assert_held(&pcpu_lock);

	bitmap_clear(chunk->populated, page_start, nr);
	chunk->nr_populated -= nr;
	pcpu_nr_empty_pop_pages -= nr;
}

/*
 * Chunk management implementation.
 *
 * To allow different implementations, chunk alloc/free and
 * [de]population are implemented in a separate file which is pulled
 * into this file and compiled together.  The following functions
 * should be implemented.
 *
 * pcpu_populate_chunk		- populate the specified range of a chunk
 * pcpu_depopulate_chunk	- depopulate the specified range of a chunk
 * pcpu_create_chunk		- create a new chunk
 * pcpu_destroy_chunk		- destroy a chunk, always preceded by full depop
 * pcpu_addr_to_page		- translate address to the backing struct page
 * pcpu_verify_alloc_info	- check alloc_info is acceptable during init
 */
static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size);
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size);
static struct pcpu_chunk *pcpu_create_chunk(void);
static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
static struct page *pcpu_addr_to_page(void *addr);
static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);

#ifdef CONFIG_NEED_PER_CPU_KM
#include "percpu-km.c"
#else
#include "percpu-vm.c"
#endif

/**
 * pcpu_chunk_addr_search - determine chunk containing specified address
 * @addr: address for which the chunk needs to be determined.
 *
 * RETURNS:
 * The address of the found chunk.
 */
static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
{
	/* is it in the first chunk? */
	if (pcpu_addr_in_first_chunk(addr)) {
		/* is it in the reserved area? */
		if (pcpu_addr_in_reserved_chunk(addr))
			return pcpu_reserved_chunk;
		return pcpu_first_chunk;
	}

	/*
	 * The address is relative to unit0 which might be unused and
	 * thus unmapped.  Offset the address to the unit space of the
	 * current processor before looking it up in the vmalloc
	 * space.  Note that any possible cpu id can be used here, so
	 * there's no need to worry about preemption or cpu hotplug.
	 */
	addr += pcpu_unit_offsets[raw_smp_processor_id()];
	return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
}

/**
 * pcpu_alloc - the percpu allocator
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 * @reserved: allocate from the reserved chunk if available
 * @gfp: allocation flags
 *
 * Allocate percpu area of @size bytes aligned at @align.  If @gfp doesn't
 * contain %GFP_KERNEL, the allocation is atomic.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
				 gfp_t gfp)
{
	static int warn_limit = 10;
	struct pcpu_chunk *chunk;
	const char *err;
	bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
	int occ_pages = 0;
	int slot, off, new_alloc, cpu, ret;
	unsigned long flags;
	void __percpu *ptr;

	/*
	 * Keep the lowest bit of every offset available as the
	 * in-use/free indicator: force align >= 2 and an even size.
	 */
	if (unlikely(align < 2))
		align = 2;

	size = ALIGN(size, 2);

	if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
		WARN(true, "illegal size (%zu) or align (%zu) for "
		     "percpu allocation\n", size, align);
		return NULL;
	}

	spin_lock_irqsave(&pcpu_lock, flags);

	/* serve reserved allocations from the reserved chunk if available */
	if (reserved && pcpu_reserved_chunk) {
		chunk = pcpu_reserved_chunk;

		if (size > chunk->contig_hint) {
			err = "alloc from reserved chunk failed";
			goto fail_unlock;
		}

		while ((new_alloc = pcpu_need_to_extend(chunk, is_atomic))) {
			spin_unlock_irqrestore(&pcpu_lock, flags);
			if (is_atomic ||
			    pcpu_extend_area_map(chunk, new_alloc) < 0) {
				err = "failed to extend area map of reserved chunk";
				goto fail;
			}
			spin_lock_irqsave(&pcpu_lock, flags);
		}

		off = pcpu_alloc_area(chunk, size, align, is_atomic,
				      &occ_pages);
		if (off >= 0)
			goto area_found;

		err = "alloc from reserved chunk failed";
		goto fail_unlock;
	}

restart:
	/* search through normal chunks */
	for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
			if (size > chunk->contig_hint)
				continue;

			new_alloc = pcpu_need_to_extend(chunk, is_atomic);
			if (new_alloc) {
				if (is_atomic)
					continue;
				spin_unlock_irqrestore(&pcpu_lock, flags);
				if (pcpu_extend_area_map(chunk,
							 new_alloc) < 0) {
					err = "failed to extend area map";
					goto fail;
				}
				spin_lock_irqsave(&pcpu_lock, flags);
				/*
				 * pcpu_lock has been dropped, need to
				 * restart the slot list walk.
				 */
				goto restart;
			}

			off = pcpu_alloc_area(chunk, size, align, is_atomic,
					      &occ_pages);
			if (off >= 0)
				goto area_found;
		}
	}

	spin_unlock_irqrestore(&pcpu_lock, flags);

	/*
	 * No space left.  Create a new chunk.  We don't want multiple
	 * tasks to create chunks simultaneously.  Serialize and create
	 * iff there's still no empty chunk after grabbing the mutex.
	 */
	if (is_atomic)
		goto fail;

	mutex_lock(&pcpu_alloc_mutex);

	if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
		chunk = pcpu_create_chunk();
		if (!chunk) {
			mutex_unlock(&pcpu_alloc_mutex);
			err = "failed to allocate new chunk";
			goto fail;
		}

		spin_lock_irqsave(&pcpu_lock, flags);
		pcpu_chunk_relocate(chunk, -1);
	} else {
		spin_lock_irqsave(&pcpu_lock, flags);
	}

	mutex_unlock(&pcpu_alloc_mutex);
	goto restart;

area_found:
	spin_unlock_irqrestore(&pcpu_lock, flags);

	/* populate if not all pages are already there */
	if (!is_atomic) {
		int page_start, page_end, rs, re;

		mutex_lock(&pcpu_alloc_mutex);

		page_start = PFN_DOWN(off);
		page_end = PFN_UP(off + size);

		pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
			WARN_ON(chunk->immutable);

			ret = pcpu_populate_chunk(chunk, rs, re);

			spin_lock_irqsave(&pcpu_lock, flags);
			if (ret) {
				mutex_unlock(&pcpu_alloc_mutex);
				pcpu_free_area(chunk, off, &occ_pages);
				err = "failed to populate";
				goto fail_unlock;
			}
			pcpu_chunk_populated(chunk, rs, re);
			spin_unlock_irqrestore(&pcpu_lock, flags);
		}

		mutex_unlock(&pcpu_alloc_mutex);
	}

	if (chunk != pcpu_reserved_chunk)
		pcpu_nr_empty_pop_pages -= occ_pages;

	if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
		pcpu_schedule_balance_work();

	/* clear the areas and return address relative to base address */
	for_each_possible_cpu(cpu)
		memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);

	ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
	kmemleak_alloc_percpu(ptr, size);
	return ptr;

fail_unlock:
	spin_unlock_irqrestore(&pcpu_lock, flags);
fail:
	if (!is_atomic && warn_limit) {
		pr_warning("PERCPU: allocation failed, size=%zu align=%zu atomic=%d, %s\n",
			   size, align, is_atomic, err);
		dump_stack();
		if (!--warn_limit)
			pr_info("PERCPU: limit reached, disable warning\n");
	}
	if (is_atomic) {
		/* see the flag handling in pcpu_balance_workfn() */
		pcpu_atomic_alloc_failed = true;
		pcpu_schedule_balance_work();
	}
	return NULL;
}

/**
 * __alloc_percpu_gfp - allocate dynamic percpu area
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 * @gfp: allocation flags
 *
 * Allocate zero-filled percpu area of @size bytes aligned at @align.  If
 * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can
 * be called from any context but is a lot more likely to fail.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp)
{
	return pcpu_alloc(size, align, false, gfp);
}
EXPORT_SYMBOL_GPL(__alloc_percpu_gfp);

/**
 * __alloc_percpu - allocate dynamic percpu area
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 *
 * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL).
 */
void __percpu *__alloc_percpu(size_t size, size_t align)
{
	return pcpu_alloc(size, align, false, GFP_KERNEL);
}
EXPORT_SYMBOL_GPL(__alloc_percpu);
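
/*
 * Usage sketch (illustrative, error handling trimmed): a typical caller
 * allocates a dynamic percpu counter through the alloc_percpu() macro,
 * which forwards the type's size and alignment to __alloc_percpu(),
 * bumps its local copy lock-free and folds all copies on the slow path.
 *
 *	unsigned long __percpu *cnt = alloc_percpu(unsigned long);
 *	unsigned long sum = 0;
 *	int cpu;
 *
 *	this_cpu_inc(*cnt);			- fast path, no lock
 *
 *	for_each_possible_cpu(cpu)		- slow path: fold counts
 *		sum += *per_cpu_ptr(cnt, cpu);
 *
 *	free_percpu(cnt);
 *
 * The _gfp variant above additionally allows atomic callers, which are
 * served only from already populated chunks.
 */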

/**
 * __alloc_reserved_percpu - allocate reserved percpu area
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 *
 * Allocate zero-filled percpu area of @size bytes aligned at @align
 * from the reserved percpu area if the arch has set it up; otherwise,
 * allocation is served from the same dynamic area.  Might sleep.
 *
 * CONTEXT:
 * Does GFP_KERNEL allocation.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
{
	return pcpu_alloc(size, align, true, GFP_KERNEL);
}

/**
 * pcpu_balance_workfn - manage the amount of free chunks and populated pages
 * @work: unused
 *
 * Reclaim all fully free chunks except for one.  Also, maintain a certain
 * amount of empty populated pages so that atomic allocations have
 * something to work with.
 */
static void pcpu_balance_workfn(struct work_struct *work)
{
	LIST_HEAD(to_free);
	struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
	struct pcpu_chunk *chunk, *next;
	int slot, nr_to_pop, ret;

	/*
	 * There's no reason to keep around multiple unused chunks and VM
	 * areas can be scarce.  Destroy all free chunks except for one.
	 */
	mutex_lock(&pcpu_alloc_mutex);
	spin_lock_irq(&pcpu_lock);

	list_for_each_entry_safe(chunk, next, free_head, list) {
		WARN_ON(chunk->immutable);

		/* spare the first one */
		if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
			continue;

		list_move(&chunk->list, &to_free);
	}

	spin_unlock_irq(&pcpu_lock);

	list_for_each_entry_safe(chunk, next, &to_free, list) {
		int rs, re;

		pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) {
			pcpu_depopulate_chunk(chunk, rs, re);
			spin_lock_irq(&pcpu_lock);
			pcpu_chunk_depopulated(chunk, rs, re);
			spin_unlock_irq(&pcpu_lock);
		}
		pcpu_destroy_chunk(chunk);
	}

	/*
	 * Ensure there are certain number of free populated pages for
	 * atomic allocs.  Fill up from the most packed so that atomic
	 * allocs don't increase fragmentation.  If atomic allocation
	 * failed previously, always populate the maximum amount.
	 */
retry_pop:
	if (pcpu_atomic_alloc_failed) {
		nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
		pcpu_atomic_alloc_failed = false;
	} else {
		nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
				  pcpu_nr_empty_pop_pages,
				  0, PCPU_EMPTY_POP_PAGES_HIGH);
	}

	for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
		int nr_unpop = 0, rs, re;

		if (!nr_to_pop)
			break;

		spin_lock_irq(&pcpu_lock);
		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
			nr_unpop = pcpu_unit_pages - chunk->nr_populated;
			if (nr_unpop)
				break;
		}
		spin_unlock_irq(&pcpu_lock);

		if (!nr_unpop)
			continue;

		/* @chunk can't go away while pcpu_alloc_mutex is held */
		pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) {
			int nr = min(re - rs, nr_to_pop);

			ret = pcpu_populate_chunk(chunk, rs, rs + nr);
			if (!ret) {
				nr_to_pop -= nr;
				spin_lock_irq(&pcpu_lock);
				pcpu_chunk_populated(chunk, rs, rs + nr);
				spin_unlock_irq(&pcpu_lock);
			} else {
				nr_to_pop = 0;
			}

			if (!nr_to_pop)
				break;
		}
	}

	if (nr_to_pop) {
		/* ran out of chunks to populate, create a new one and retry */
		chunk = pcpu_create_chunk();
		if (chunk) {
			spin_lock_irq(&pcpu_lock);
			pcpu_chunk_relocate(chunk, -1);
			spin_unlock_irq(&pcpu_lock);
			goto retry_pop;
		}
	}

	mutex_unlock(&pcpu_alloc_mutex);
}

/**
 * free_percpu - free percpu area
 * @ptr: pointer to area to free
 *
 * Free percpu area @ptr.
 *
 * CONTEXT:
 * Can be called from atomic context.
 */
void free_percpu(void __percpu *ptr)
{
	void *addr;
	struct pcpu_chunk *chunk;
	unsigned long flags;
	int off, occ_pages;

	if (!ptr)
		return;

	kmemleak_free_percpu(ptr);

	addr = __pcpu_ptr_to_addr(ptr);

	spin_lock_irqsave(&pcpu_lock, flags);

	chunk = pcpu_chunk_addr_search(addr);
	off = addr - chunk->base_addr;

	pcpu_free_area(chunk, off, &occ_pages);

	if (chunk != pcpu_reserved_chunk)
		pcpu_nr_empty_pop_pages += occ_pages;

	/* if there are more than one fully free chunks, wake up grim reaper */
	if (chunk->free_size == pcpu_unit_size) {
		struct pcpu_chunk *pos;

		list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
			if (pos != chunk) {
				pcpu_schedule_balance_work();
				break;
			}
	}

	spin_unlock_irqrestore(&pcpu_lock, flags);
}
EXPORT_SYMBOL_GPL(free_percpu);

/**
 * is_kernel_percpu_address - test whether address is from static percpu area
 * @addr: address to test
 *
 * Test whether @addr belongs to in-kernel static percpu area.  Module
 * static percpu areas are not considered.  For those, use
 * is_module_percpu_address().
 *
 * RETURNS:
 * %true if @addr is from in-kernel static percpu area, %false otherwise.
 */
bool is_kernel_percpu_address(unsigned long addr)
{
#ifdef CONFIG_SMP
	const size_t static_size = __per_cpu_end - __per_cpu_start;
	void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		void *start = per_cpu_ptr(base, cpu);

		if ((void *)addr >= start && (void *)addr < start + static_size)
			return true;
	}
#endif
	/* on UP, can't distinguish from other static vars, always false */
	return false;
}

/**
 * per_cpu_ptr_to_phys - convert translated percpu address to physical address
 * @addr: the address to be converted to physical address
 *
 * Given @addr which is dereferenceable address obtained via one of the
 * percpu access macros, this function translates it into its physical
 * address.  The caller is responsible for ensuring @addr stays valid
 * until this function finishes.
 *
 * The percpu allocator has special setup for the first chunk, which
 * currently supports either embedding in the linear address space or
 * vmalloc mapping; from the second chunk on, the backing allocator
 * (currently either vm or km) provides the translation.
 *
 * RETURNS:
 * The physical address for @addr.
 */
phys_addr_t per_cpu_ptr_to_phys(void *addr)
{
	void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
	bool in_first_chunk = false;
	unsigned long first_low, first_high;
	unsigned int cpu;

	/*
	 * The following test on unit_low/high isn't strictly
	 * necessary but will speed up lookups of addresses which
	 * aren't in the first chunk.
	 */
	first_low = pcpu_chunk_addr(pcpu_first_chunk, pcpu_low_unit_cpu, 0);
	first_high = pcpu_chunk_addr(pcpu_first_chunk, pcpu_high_unit_cpu,
				     pcpu_unit_pages);
	if ((unsigned long)addr >= first_low &&
	    (unsigned long)addr < first_high) {
		for_each_possible_cpu(cpu) {
			void *start = per_cpu_ptr(base, cpu);

			if (addr >= start && addr < start + pcpu_unit_size) {
				in_first_chunk = true;
				break;
			}
		}
	}

	if (in_first_chunk) {
		if (!is_vmalloc_addr(addr))
			return __pa(addr);
		else
			return page_to_phys(vmalloc_to_page(addr)) +
			       offset_in_page(addr);
	} else
		return page_to_phys(pcpu_addr_to_page(addr)) +
		       offset_in_page(addr);
}
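
/*
 * Usage sketch (illustrative): translating a dynamic percpu pointer
 * into the physical address of one CPU's copy, e.g. for handing a
 * per-CPU buffer to a device or hypervisor interface:
 *
 *	u64 __percpu *buf = alloc_percpu(u64);
 *	phys_addr_t pa = per_cpu_ptr_to_phys(per_cpu_ptr(buf, cpu));
 *
 * The first-chunk test above matters because the first chunk may live
 * in the kernel linear mapping (embed) or in vmalloc space (page),
 * while later chunks are translated through the chunk backend.
 */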

/**
 * pcpu_alloc_alloc_info - allocate percpu allocation info
 * @nr_groups: the number of groups
 * @nr_units: the number of units
 *
 * Allocate ai which is large enough for @nr_groups groups containing
 * @nr_units units.  The returned ai's groups[0].cpu_map points to the
 * cpu_map array which is long enough for @nr_units and filled with
 * NR_CPUS.  It's the caller's responsibility to initialize the cpu_map
 * pointers of the other groups.
 *
 * RETURNS:
 * Pointer to the allocated pcpu_alloc_info on success, NULL on failure.
 */
struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
						      int nr_units)
{
	struct pcpu_alloc_info *ai;
	size_t base_size, ai_size;
	void *ptr;
	int unit;

	base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]),
			  __alignof__(ai->groups[0].cpu_map[0]));
	ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);

	ptr = memblock_virt_alloc_nopanic(PFN_ALIGN(ai_size), 0);
	if (!ptr)
		return NULL;
	ai = ptr;
	ptr += base_size;

	ai->groups[0].cpu_map = ptr;

	for (unit = 0; unit < nr_units; unit++)
		ai->groups[0].cpu_map[unit] = NR_CPUS;

	ai->nr_groups = nr_groups;
	ai->__ai_size = PFN_ALIGN(ai_size);

	return ai;
}

/**
 * pcpu_free_alloc_info - free percpu allocation info
 * @ai: pcpu_alloc_info to free
 *
 * Free @ai which was allocated by pcpu_alloc_alloc_info().
 */
void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
{
	memblock_free_early(__pa(ai), ai->__ai_size);
}

/**
 * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
 * @lvl: loglevel
 * @ai: allocation info to dump
 *
 * Print out information about @ai using loglevel @lvl.
 */
1422static void pcpu_dump_alloc_info(const char *lvl,
1423 const struct pcpu_alloc_info *ai)
1424{
1425 int group_width = 1, cpu_width = 1, width;
1426 char empty_str[] = "--------";
1427 int alloc = 0, alloc_end = 0;
1428 int group, v;
1429 int upa, apl;
1430
1431 v = ai->nr_groups;
1432 while (v /= 10)
1433 group_width++;
1434
1435 v = num_possible_cpus();
1436 while (v /= 10)
1437 cpu_width++;
1438 empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';
1439
1440 upa = ai->alloc_size / ai->unit_size;
1441 width = upa * (cpu_width + 1) + group_width + 3;
1442 apl = rounddown_pow_of_two(max(60 / width, 1));
1443
1444 printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
1445 lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
1446 ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);
1447
1448 for (group = 0; group < ai->nr_groups; group++) {
1449 const struct pcpu_group_info *gi = &ai->groups[group];
1450 int unit = 0, unit_end = 0;
1451
1452 BUG_ON(gi->nr_units % upa);
1453 for (alloc_end += gi->nr_units / upa;
1454 alloc < alloc_end; alloc++) {
1455 if (!(alloc % apl)) {
1456 printk(KERN_CONT "\n");
1457 printk("%spcpu-alloc: ", lvl);
1458 }
1459 printk(KERN_CONT "[%0*d] ", group_width, group);
1460
1461 for (unit_end += upa; unit < unit_end; unit++)
1462 if (gi->cpu_map[unit] != NR_CPUS)
1463 printk(KERN_CONT "%0*d ", cpu_width,
1464 gi->cpu_map[unit]);
1465 else
1466 printk(KERN_CONT "%s ", empty_str);
1467 }
1468 }
1469 printk(KERN_CONT "\n");
1470}

/**
 * pcpu_setup_first_chunk - initialize the first percpu chunk
 * @ai: pcpu_alloc_info describing how the percpu area is shaped
 * @base_addr: mapped address
 *
 * Initialize the first percpu chunk which contains the kernel static
 * percpu area.  If @ai->reserved_size is non-zero, that part of the
 * first unit is carved out as a separate reserved chunk which serves
 * reserved (module static) allocations, and the remaining dynamic area
 * becomes the first chunk proper.  This function also derives the
 * global allocator state (unit map, unit offsets, sizes and chunk
 * slots) from @ai.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
1528int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1529 void *base_addr)
1530{
1531 static char cpus_buf[4096] __initdata;
1532 static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
1533 static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
1534 size_t dyn_size = ai->dyn_size;
1535 size_t size_sum = ai->static_size + ai->reserved_size + dyn_size;
1536 struct pcpu_chunk *schunk, *dchunk = NULL;
1537 unsigned long *group_offsets;
1538 size_t *group_sizes;
1539 unsigned long *unit_off;
1540 unsigned int cpu;
1541 int *unit_map;
1542 int group, unit, i;
1543
1544 cpumask_scnprintf(cpus_buf, sizeof(cpus_buf), cpu_possible_mask);
1545
1546#define PCPU_SETUP_BUG_ON(cond) do { \
1547 if (unlikely(cond)) { \
1548 pr_emerg("PERCPU: failed to initialize, %s", #cond); \
1549 pr_emerg("PERCPU: cpu_possible_mask=%s\n", cpus_buf); \
1550 pcpu_dump_alloc_info(KERN_EMERG, ai); \
1551 BUG(); \
1552 } \
1553} while (0)
1554
1555
1556 PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
1557#ifdef CONFIG_SMP
1558 PCPU_SETUP_BUG_ON(!ai->static_size);
1559 PCPU_SETUP_BUG_ON((unsigned long)__per_cpu_start & ~PAGE_MASK);
1560#endif
1561 PCPU_SETUP_BUG_ON(!base_addr);
1562 PCPU_SETUP_BUG_ON((unsigned long)base_addr & ~PAGE_MASK);
1563 PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
1564 PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);
1565 PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
1566 PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
1567 PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
1568
1569
1570 group_offsets = memblock_virt_alloc(ai->nr_groups *
1571 sizeof(group_offsets[0]), 0);
1572 group_sizes = memblock_virt_alloc(ai->nr_groups *
1573 sizeof(group_sizes[0]), 0);
1574 unit_map = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_map[0]), 0);
1575 unit_off = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_off[0]), 0);
1576
1577 for (cpu = 0; cpu < nr_cpu_ids; cpu++)
1578 unit_map[cpu] = UINT_MAX;
1579
1580 pcpu_low_unit_cpu = NR_CPUS;
1581 pcpu_high_unit_cpu = NR_CPUS;
1582
1583 for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
1584 const struct pcpu_group_info *gi = &ai->groups[group];
1585
1586 group_offsets[group] = gi->base_offset;
1587 group_sizes[group] = gi->nr_units * ai->unit_size;
1588
1589 for (i = 0; i < gi->nr_units; i++) {
1590 cpu = gi->cpu_map[i];
1591 if (cpu == NR_CPUS)
1592 continue;
1593
1594 PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids);
1595 PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
1596 PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);
1597
1598 unit_map[cpu] = unit + i;
1599 unit_off[cpu] = gi->base_offset + i * ai->unit_size;
1600
1601
1602 if (pcpu_low_unit_cpu == NR_CPUS ||
1603 unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
1604 pcpu_low_unit_cpu = cpu;
1605 if (pcpu_high_unit_cpu == NR_CPUS ||
1606 unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
1607 pcpu_high_unit_cpu = cpu;
1608 }
1609 }
1610 pcpu_nr_units = unit;
1611
1612 for_each_possible_cpu(cpu)
1613 PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);
1614
1615
1616#undef PCPU_SETUP_BUG_ON
1617 pcpu_dump_alloc_info(KERN_DEBUG, ai);
1618
1619 pcpu_nr_groups = ai->nr_groups;
1620 pcpu_group_offsets = group_offsets;
1621 pcpu_group_sizes = group_sizes;
1622 pcpu_unit_map = unit_map;
1623 pcpu_unit_offsets = unit_off;
1624
1625
1626 pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
1627 pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
1628 pcpu_atom_size = ai->atom_size;
1629 pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
1630 BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
1631
1632
1633
1634
1635
1636 pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
1637 pcpu_slot = memblock_virt_alloc(
1638 pcpu_nr_slots * sizeof(pcpu_slot[0]), 0);
1639 for (i = 0; i < pcpu_nr_slots; i++)
1640 INIT_LIST_HEAD(&pcpu_slot[i]);
1641
1642
1643
1644
1645
1646
1647
1648
1649 schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
1650 INIT_LIST_HEAD(&schunk->list);
1651 INIT_WORK(&schunk->map_extend_work, pcpu_map_extend_workfn);
1652 schunk->base_addr = base_addr;
1653 schunk->map = smap;
1654 schunk->map_alloc = ARRAY_SIZE(smap);
1655 schunk->immutable = true;
1656 bitmap_fill(schunk->populated, pcpu_unit_pages);
1657 schunk->nr_populated = pcpu_unit_pages;
1658
1659 if (ai->reserved_size) {
1660 schunk->free_size = ai->reserved_size;
1661 pcpu_reserved_chunk = schunk;
1662 pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size;
1663 } else {
1664 schunk->free_size = dyn_size;
1665 dyn_size = 0;
1666 }
1667 schunk->contig_hint = schunk->free_size;
1668
1669 schunk->map[0] = 1;
1670 schunk->map[1] = ai->static_size;
1671 schunk->map_used = 1;
1672 if (schunk->free_size)
1673 schunk->map[++schunk->map_used] = 1 | (ai->static_size + schunk->free_size);
1674 else
1675 schunk->map[1] |= 1;
1676
1677
1678 if (dyn_size) {
1679 dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
1680 INIT_LIST_HEAD(&dchunk->list);
1681 INIT_WORK(&dchunk->map_extend_work, pcpu_map_extend_workfn);
1682 dchunk->base_addr = base_addr;
1683 dchunk->map = dmap;
1684 dchunk->map_alloc = ARRAY_SIZE(dmap);
1685 dchunk->immutable = true;
1686 bitmap_fill(dchunk->populated, pcpu_unit_pages);
1687 dchunk->nr_populated = pcpu_unit_pages;
1688
1689 dchunk->contig_hint = dchunk->free_size = dyn_size;
1690 dchunk->map[0] = 1;
1691 dchunk->map[1] = pcpu_reserved_chunk_limit;
1692 dchunk->map[2] = (pcpu_reserved_chunk_limit + dchunk->free_size) | 1;
1693 dchunk->map_used = 2;
1694 }
1695
1696
1697 pcpu_first_chunk = dchunk ?: schunk;
1698 pcpu_nr_empty_pop_pages +=
1699 pcpu_count_occupied_pages(pcpu_first_chunk, 1);
1700 pcpu_chunk_relocate(pcpu_first_chunk, -1);
1701
1702
1703 pcpu_base_addr = base_addr;
1704 return 0;
1705}
1706
1707#ifdef CONFIG_SMP
1708
1709const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = {
1710 [PCPU_FC_AUTO] = "auto",
1711 [PCPU_FC_EMBED] = "embed",
1712 [PCPU_FC_PAGE] = "page",
1713};
1714
1715enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
1716
1717static int __init percpu_alloc_setup(char *str)
1718{
1719 if (!str)
1720 return -EINVAL;
1721
1722 if (0)
1723 ;
1724#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
1725 else if (!strcmp(str, "embed"))
1726 pcpu_chosen_fc = PCPU_FC_EMBED;
1727#endif
1728#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
1729 else if (!strcmp(str, "page"))
1730 pcpu_chosen_fc = PCPU_FC_PAGE;
1731#endif
1732 else
1733 pr_warning("PERCPU: unknown allocator %s specified\n", str);
1734
1735 return 0;
1736}
1737early_param("percpu_alloc", percpu_alloc_setup);
1738
1739
1740
1741
1742
1743
1744#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
1745 !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
1746#define BUILD_EMBED_FIRST_CHUNK
1747#endif
1748
1749
1750#if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
1751#define BUILD_PAGE_FIRST_CHUNK
1752#endif
1753
1754
1755#if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)

/**
 * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
 * @reserved_size: the size of reserved percpu area in bytes
 * @dyn_size: minimum free size for dynamic allocation in bytes
 * @atom_size: allocation atom size
 * @cpu_distance_fn: callback to determine distance between cpus, optional
 *
 * This function determines grouping of units, their mappings to cpus
 * and other parameters considering needed percpu size, allocation
 * atom size and distances between CPUs.
 *
 * Groups are always multiples of atom size and CPUs which are of
 * LOCAL_DISTANCE both ways are grouped together and share space for
 * units in the same group.  The returned configuration is guaranteed
 * to have CPUs on different nodes on different groups and to waste no
 * more than a third of the possible units.
 *
 * RETURNS:
 * On success, pointer to the new allocation info is returned.  On
 * failure, ERR_PTR value is returned.
 */
1777static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
1778 size_t reserved_size, size_t dyn_size,
1779 size_t atom_size,
1780 pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
1781{
1782 static int group_map[NR_CPUS] __initdata;
1783 static int group_cnt[NR_CPUS] __initdata;
1784 const size_t static_size = __per_cpu_end - __per_cpu_start;
1785 int nr_groups = 1, nr_units = 0;
1786 size_t size_sum, min_unit_size, alloc_size;
1787 int upa, max_upa, uninitialized_var(best_upa);
1788 int last_allocs, group, unit;
1789 unsigned int cpu, tcpu;
1790 struct pcpu_alloc_info *ai;
1791 unsigned int *cpu_map;
1792
1793
1794 memset(group_map, 0, sizeof(group_map));
1795 memset(group_cnt, 0, sizeof(group_cnt));
1796
1797
1798 size_sum = PFN_ALIGN(static_size + reserved_size +
1799 max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
1800 dyn_size = size_sum - static_size - reserved_size;
1801
1802
1803
1804
1805
1806
1807
1808 min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
1809
1810 alloc_size = roundup(min_unit_size, atom_size);
1811 upa = alloc_size / min_unit_size;
1812 while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
1813 upa--;
1814 max_upa = upa;
1815
1816
1817 for_each_possible_cpu(cpu) {
1818 group = 0;
1819 next_group:
1820 for_each_possible_cpu(tcpu) {
1821 if (cpu == tcpu)
1822 break;
1823 if (group_map[tcpu] == group && cpu_distance_fn &&
1824 (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
1825 cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
1826 group++;
1827 nr_groups = max(nr_groups, group + 1);
1828 goto next_group;
1829 }
1830 }
1831 group_map[cpu] = group;
1832 group_cnt[group]++;
1833 }
1834
1835
1836
1837
1838
1839
1840 last_allocs = INT_MAX;
1841 for (upa = max_upa; upa; upa--) {
1842 int allocs = 0, wasted = 0;
1843
1844 if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
1845 continue;
1846
1847 for (group = 0; group < nr_groups; group++) {
1848 int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
1849 allocs += this_allocs;
1850 wasted += this_allocs * upa - group_cnt[group];
1851 }
1852
1853
1854
1855
1856
1857
1858 if (wasted > num_possible_cpus() / 3)
1859 continue;
1860
1861
1862 if (allocs > last_allocs)
1863 break;
1864 last_allocs = allocs;
1865 best_upa = upa;
1866 }
1867 upa = best_upa;
1868
1869
1870 for (group = 0; group < nr_groups; group++)
1871 nr_units += roundup(group_cnt[group], upa);
1872
1873 ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
1874 if (!ai)
1875 return ERR_PTR(-ENOMEM);
1876 cpu_map = ai->groups[0].cpu_map;
1877
1878 for (group = 0; group < nr_groups; group++) {
1879 ai->groups[group].cpu_map = cpu_map;
1880 cpu_map += roundup(group_cnt[group], upa);
1881 }
1882
1883 ai->static_size = static_size;
1884 ai->reserved_size = reserved_size;
1885 ai->dyn_size = dyn_size;
1886 ai->unit_size = alloc_size / upa;
1887 ai->atom_size = atom_size;
1888 ai->alloc_size = alloc_size;
1889
1890 for (group = 0, unit = 0; group_cnt[group]; group++) {
1891 struct pcpu_group_info *gi = &ai->groups[group];
1892
1893
1894
1895
1896
1897
1898 gi->base_offset = unit * ai->unit_size;
1899
1900 for_each_possible_cpu(cpu)
1901 if (group_map[cpu] == group)
1902 gi->cpu_map[gi->nr_units++] = cpu;
1903 gi->nr_units = roundup(gi->nr_units, upa);
1904 unit += gi->nr_units;
1905 }
1906 BUG_ON(unit != nr_units);
1907
1908 return ai;
1909}
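
/*
 * Sizing example (sketch): with a 2MB allocation atom and a computed
 * minimum unit size of 128KB, alloc_size is 2MB and at most
 * max_upa = 2MB / 128KB = 16 units fit in one atom, since 16 divides
 * the atom evenly and 128KB stays page aligned.  The second loop in
 * pcpu_build_alloc_info() then walks upa downward from max_upa,
 * skipping values that would waste more than a third of the possible
 * CPUs, to trade off the number of group allocations against unused
 * units in partially filled groups.
 */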
1910#endif
1911
1912#if defined(BUILD_EMBED_FIRST_CHUNK)

/**
 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
 * @reserved_size: the size of reserved percpu area in bytes
 * @dyn_size: minimum free size for dynamic allocation in bytes
 * @atom_size: allocation atom size
 * @cpu_distance_fn: callback to determine distance between cpus, optional
 * @alloc_fn: function to allocate percpu memory, called per group
 * @free_fn: function to free percpu memory
 *
 * This is a helper to ease setting up an embedded first percpu chunk
 * and can be called where pcpu_setup_first_chunk() is expected.
 *
 * The first chunk is embedded into bootmem obtained through @alloc_fn;
 * unused parts of the allocated areas are returned through @free_fn.
 * Because the embedded areas use the linear mapping, this saves
 * vmalloc space, but all groups must stay within the vmalloc distance
 * limit checked below.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
1945int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
1946 size_t atom_size,
1947 pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
1948 pcpu_fc_alloc_fn_t alloc_fn,
1949 pcpu_fc_free_fn_t free_fn)
1950{
1951 void *base = (void *)ULONG_MAX;
1952 void **areas = NULL;
1953 struct pcpu_alloc_info *ai;
1954 size_t size_sum, areas_size, max_distance;
1955 int group, i, rc;
1956
1957 ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
1958 cpu_distance_fn);
1959 if (IS_ERR(ai))
1960 return PTR_ERR(ai);
1961
1962 size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
1963 areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));
1964
1965 areas = memblock_virt_alloc_nopanic(areas_size, 0);
1966 if (!areas) {
1967 rc = -ENOMEM;
1968 goto out_free;
1969 }
1970
1971
1972 for (group = 0; group < ai->nr_groups; group++) {
1973 struct pcpu_group_info *gi = &ai->groups[group];
1974 unsigned int cpu = NR_CPUS;
1975 void *ptr;
1976
1977 for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
1978 cpu = gi->cpu_map[i];
1979 BUG_ON(cpu == NR_CPUS);
1980
1981
1982 ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
1983 if (!ptr) {
1984 rc = -ENOMEM;
1985 goto out_free_areas;
1986 }
1987
1988 kmemleak_free(ptr);
1989 areas[group] = ptr;
1990
1991 base = min(ptr, base);
1992 }
1993
1994
1995
1996
1997
1998
1999 for (group = 0; group < ai->nr_groups; group++) {
2000 struct pcpu_group_info *gi = &ai->groups[group];
2001 void *ptr = areas[group];
2002
2003 for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
2004 if (gi->cpu_map[i] == NR_CPUS) {
2005
2006 free_fn(ptr, ai->unit_size);
2007 continue;
2008 }
2009
2010 memcpy(ptr, __per_cpu_load, ai->static_size);
2011 free_fn(ptr + size_sum, ai->unit_size - size_sum);
2012 }
2013 }
2014
2015
2016 max_distance = 0;
2017 for (group = 0; group < ai->nr_groups; group++) {
2018 ai->groups[group].base_offset = areas[group] - base;
2019 max_distance = max_t(size_t, max_distance,
2020 ai->groups[group].base_offset);
2021 }
2022 max_distance += ai->unit_size;
2023
2024
2025 if (max_distance > VMALLOC_TOTAL * 3 / 4) {
2026 pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc "
2027 "space 0x%lx\n", max_distance,
2028 VMALLOC_TOTAL);
2029#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
2030
2031 rc = -EINVAL;
2032 goto out_free;
2033#endif
2034 }
2035
2036 pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
2037 PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
2038 ai->dyn_size, ai->unit_size);
2039
2040 rc = pcpu_setup_first_chunk(ai, base);
2041 goto out_free;
2042
2043out_free_areas:
2044 for (group = 0; group < ai->nr_groups; group++)
2045 if (areas[group])
2046 free_fn(areas[group],
2047 ai->groups[group].nr_units * ai->unit_size);
2048out_free:
2049 pcpu_free_alloc_info(ai);
2050 if (areas)
2051 memblock_free_early(__pa(areas), areas_size);
2052 return rc;
2053}
2054#endif
2055
2056#ifdef BUILD_PAGE_FIRST_CHUNK

/**
 * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
 * @reserved_size: the size of reserved percpu area in bytes
 * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
 * @free_fn: function to free percpu page, always called with PAGE_SIZE
 * @populate_pte_fn: function to populate pte
 *
 * This is a helper to ease setting up a page-remapped first percpu
 * chunk and can be called where pcpu_setup_first_chunk() is expected.
 *
 * This is the basic allocator.  The static percpu area is allocated
 * page-by-page into the vmalloc area.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
2073int __init pcpu_page_first_chunk(size_t reserved_size,
2074 pcpu_fc_alloc_fn_t alloc_fn,
2075 pcpu_fc_free_fn_t free_fn,
2076 pcpu_fc_populate_pte_fn_t populate_pte_fn)
2077{
2078 static struct vm_struct vm;
2079 struct pcpu_alloc_info *ai;
2080 char psize_str[16];
2081 int unit_pages;
2082 size_t pages_size;
2083 struct page **pages;
2084 int unit, i, j, rc;
2085
2086 snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
2087
2088 ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL);
2089 if (IS_ERR(ai))
2090 return PTR_ERR(ai);
2091 BUG_ON(ai->nr_groups != 1);
2092 BUG_ON(ai->groups[0].nr_units != num_possible_cpus());
2093
2094 unit_pages = ai->unit_size >> PAGE_SHIFT;
2095
2096
2097 pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
2098 sizeof(pages[0]));
2099 pages = memblock_virt_alloc(pages_size, 0);
2100
2101
2102 j = 0;
2103 for (unit = 0; unit < num_possible_cpus(); unit++)
2104 for (i = 0; i < unit_pages; i++) {
2105 unsigned int cpu = ai->groups[0].cpu_map[unit];
2106 void *ptr;
2107
2108 ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
2109 if (!ptr) {
2110 pr_warning("PERCPU: failed to allocate %s page "
2111 "for cpu%u\n", psize_str, cpu);
2112 goto enomem;
2113 }
2114
2115 kmemleak_free(ptr);
2116 pages[j++] = virt_to_page(ptr);
2117 }
2118
2119
2120 vm.flags = VM_ALLOC;
2121 vm.size = num_possible_cpus() * ai->unit_size;
2122 vm_area_register_early(&vm, PAGE_SIZE);
2123
2124 for (unit = 0; unit < num_possible_cpus(); unit++) {
2125 unsigned long unit_addr =
2126 (unsigned long)vm.addr + unit * ai->unit_size;
2127
2128 for (i = 0; i < unit_pages; i++)
2129 populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
2130
2131
2132 rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
2133 unit_pages);
2134 if (rc < 0)
2135 panic("failed to map percpu area, err=%d\n", rc);
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146 memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
2147 }
2148
2149
2150 pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n",
2151 unit_pages, psize_str, vm.addr, ai->static_size,
2152 ai->reserved_size, ai->dyn_size);
2153
2154 rc = pcpu_setup_first_chunk(ai, vm.addr);
2155 goto out_free_ar;
2156
2157enomem:
2158 while (--j >= 0)
2159 free_fn(page_address(pages[j]), PAGE_SIZE);
2160 rc = -ENOMEM;
2161out_free_ar:
2162 memblock_free_early(__pa(pages), pages_size);
2163 pcpu_free_alloc_info(ai);
2164 return rc;
2165}
2166#endif

#ifndef	CONFIG_HAVE_SETUP_PER_CPU_AREA
/*
 * Generic SMP percpu area setup.
 *
 * The embedding helper is used because its behavior closely resembles
 * the original non-dynamic generic percpu area setup.  This is
 * important because many archs have addressing restrictions and might
 * fail if the percpu area is located far away from the previous
 * location.  As an added bonus, in non-NUMA cases, embedding is
 * generally a good idea TLB-wise because the percpu area can piggy
 * back on the physical linear memory mapping which uses large page
 * mappings on applicable archs.
 */
2181unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
2182EXPORT_SYMBOL(__per_cpu_offset);
2183
2184static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
2185 size_t align)
2186{
2187 return memblock_virt_alloc_from_nopanic(
2188 size, align, __pa(MAX_DMA_ADDRESS));
2189}
2190
2191static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
2192{
2193 memblock_free_early(__pa(ptr), size);
2194}
2195
2196void __init setup_per_cpu_areas(void)
2197{
2198 unsigned long delta;
2199 unsigned int cpu;
2200 int rc;
2201
2202
2203
2204
2205
2206 rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
2207 PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
2208 pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
2209 if (rc < 0)
2210 panic("Failed to initialize percpu areas.");
2211
2212 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
2213 for_each_possible_cpu(cpu)
2214 __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
2215}
2216#endif
2217
2218#else

/*
 * UP percpu area setup.
 *
 * UP always uses km-based percpu allocator with identity mapping.
 * Static percpu variables are indistinguishable from the usual static
 * variables and don't require any special preparation.
 */
2227void __init setup_per_cpu_areas(void)
2228{
2229 const size_t unit_size =
2230 roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
2231 PERCPU_DYNAMIC_RESERVE));
2232 struct pcpu_alloc_info *ai;
2233 void *fc;
2234
2235 ai = pcpu_alloc_alloc_info(1, 1);
2236 fc = memblock_virt_alloc_from_nopanic(unit_size,
2237 PAGE_SIZE,
2238 __pa(MAX_DMA_ADDRESS));
2239 if (!ai || !fc)
2240 panic("Failed to allocate memory for percpu areas.");
2241
2242 kmemleak_free(fc);
2243
2244 ai->dyn_size = unit_size;
2245 ai->unit_size = unit_size;
2246 ai->atom_size = unit_size;
2247 ai->alloc_size = unit_size;
2248 ai->groups[0].nr_units = 1;
2249 ai->groups[0].cpu_map[0] = 0;
2250
2251 if (pcpu_setup_first_chunk(ai, fc) < 0)
2252 panic("Failed to initialize percpu areas.");
2253}
2254
2255#endif

/*
 * First and reserved chunks are initialized with temporary allocation
 * maps in initdata so that they can be used before slab is online.
 * This function is called after slab is brought up and replaces those
 * with properly allocated maps.
 */
2263void __init percpu_init_late(void)
2264{
2265 struct pcpu_chunk *target_chunks[] =
2266 { pcpu_first_chunk, pcpu_reserved_chunk, NULL };
2267 struct pcpu_chunk *chunk;
2268 unsigned long flags;
2269 int i;
2270
2271 for (i = 0; (chunk = target_chunks[i]); i++) {
2272 int *map;
2273 const size_t size = PERCPU_DYNAMIC_EARLY_SLOTS * sizeof(map[0]);
2274
2275 BUILD_BUG_ON(size > PAGE_SIZE);
2276
2277 map = pcpu_mem_zalloc(size);
2278 BUG_ON(!map);
2279
2280 spin_lock_irqsave(&pcpu_lock, flags);
2281 memcpy(map, chunk->map, size);
2282 chunk->map = map;
2283 spin_unlock_irqrestore(&pcpu_lock, flags);
2284 }
2285}

/*
 * The percpu allocator is initialized early during boot when neither
 * slab nor workqueue is available.  Plug async management until
 * everything is up and running.
 */
2292static int __init percpu_enable_async(void)
2293{
2294 pcpu_async_enabled = true;
2295 return 0;
2296}
2297subsys_initcall(percpu_enable_async);