/*
 * mm/percpu.c - percpu memory allocator
 *
 * This is the percpu allocator which can handle both static and
 * dynamic areas.  Percpu areas are allocated in chunks; each chunk
 * consists of one unit per possible CPU, laid out at boot-time
 * determined offsets from the chunk's base address.  The first chunk
 * serves the static percpu variables in the kernel image and,
 * optionally, a reserved region used for module static percpu
 * variables.
 *
 * Allocation within a chunk is done by scanning its area map - an
 * array of offsets packed into ints where the lowest bit of each
 * entry marks the area starting at that offset as in-use.  Chunks are
 * sorted into slots indexed by free size so the allocator can quickly
 * find a chunk with enough room.
 *
 * Two locks protect the allocator: pcpu_alloc_mutex serializes chunk
 * creation, destruction and page [de]population, while the pcpu_lock
 * spinlock protects the area maps and chunk lists so that atomic
 * allocations and free_percpu() can run without sleeping.
 *
 * To use this allocator, arch code should do the following:
 *
 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
 *   regular address to percpu pointer and back if they need to be
 *   different from the default
 *
 * - use pcpu_setup_first_chunk() during percpu area initialization to
 *   setup the first chunk containing the kernel static percpu area
 */
#include <linux/bitmap.h>
#include <linux/bootmem.h>
#include <linux/err.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/pfn.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
#include <linux/kmemleak.h>

#include <asm/cacheflush.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
#include <asm/io.h>

#define PCPU_SLOT_BASE_SHIFT		5	/* 1-31 shares the same slot */
#define PCPU_DFL_MAP_ALLOC		16	/* start a map with 16 ents */
#define PCPU_ATOMIC_MAP_MARGIN_LOW	32	/* schedule async map extension below this */
#define PCPU_ATOMIC_MAP_MARGIN_HIGH	64	/* map margin kept for !atomic allocs */
#define PCPU_EMPTY_POP_PAGES_LOW	2	/* replenish empty populated pages below this */
#define PCPU_EMPTY_POP_PAGES_HIGH	4	/* populate up to this many empty pages */

#ifdef CONFIG_SMP
/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
#ifndef __addr_to_pcpu_ptr
#define __addr_to_pcpu_ptr(addr)					\
	(void __percpu *)((unsigned long)(addr) -			\
			  (unsigned long)pcpu_base_addr +		\
			  (unsigned long)__per_cpu_start)
#endif
#ifndef __pcpu_ptr_to_addr
#define __pcpu_ptr_to_addr(ptr)						\
	(void __force *)((unsigned long)(ptr) +				\
			 (unsigned long)pcpu_base_addr -		\
			 (unsigned long)__per_cpu_start)
#endif
#else	/* CONFIG_SMP */
/* on UP, it's always identity mapped */
#define __addr_to_pcpu_ptr(addr)	(void __percpu *)(addr)
#define __pcpu_ptr_to_addr(ptr)		(void __force *)(ptr)
#endif	/* CONFIG_SMP */

struct pcpu_chunk {
	struct list_head	list;		/* linked to pcpu_slot lists */
	int			free_size;	/* free bytes in the chunk */
	int			contig_hint;	/* max contiguous size hint */
	void			*base_addr;	/* base address of this chunk */

	int			map_used;	/* # of map entries used before the sentry */
	int			map_alloc;	/* # of map entries allocated */
	int			*map;		/* allocation map */
	struct work_struct	map_extend_work;/* async extension of area map */

	void			*data;		/* chunk data */
	int			first_free;	/* no free below this */
	bool			immutable;	/* no [de]population allowed */
	int			nr_populated;	/* # of populated pages */
	unsigned long		populated[];	/* populated bitmap */
};

static int pcpu_unit_pages __read_mostly;
static int pcpu_unit_size __read_mostly;
static int pcpu_nr_units __read_mostly;
static int pcpu_atom_size __read_mostly;
static int pcpu_nr_slots __read_mostly;
static size_t pcpu_chunk_struct_size __read_mostly;

/* cpus with the lowest and highest unit addresses */
static unsigned int pcpu_low_unit_cpu __read_mostly;
static unsigned int pcpu_high_unit_cpu __read_mostly;

/* the address of the first chunk which starts with the kernel static area */
void *pcpu_base_addr __read_mostly;
EXPORT_SYMBOL_GPL(pcpu_base_addr);

static const int *pcpu_unit_map __read_mostly;		/* cpu -> unit */
const unsigned long *pcpu_unit_offsets __read_mostly;	/* cpu -> unit offset */

/* group information, used for vm allocation */
static int pcpu_nr_groups __read_mostly;
static const unsigned long *pcpu_group_offsets __read_mostly;
static const size_t *pcpu_group_sizes __read_mostly;

/*
 * The first chunk which always exists.  Note that unlike other
 * chunks, this one can be allocated and mapped in several different
 * ways and thus often doesn't live in the vmalloc area.
 */
static struct pcpu_chunk *pcpu_first_chunk;

/*
 * Optional reserved chunk.  This chunk reserves part of the first
 * chunk and serves it for reserved allocations.  The amount of
 * reserved offset is in pcpu_reserved_chunk_limit.  When the reserved
 * area doesn't exist, the following variables contain NULL and 0
 * respectively.
 */
static struct pcpu_chunk *pcpu_reserved_chunk;
static int pcpu_reserved_chunk_limit;

static DEFINE_SPINLOCK(pcpu_lock);	/* all internal data structures */
static DEFINE_MUTEX(pcpu_alloc_mutex);	/* chunk create/destroy, [de]pop */

static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */

/*
 * The number of empty populated pages, protected by pcpu_lock.  The
 * reserved chunk doesn't contribute to the count.
 */
static int pcpu_nr_empty_pop_pages;

/*
 * Balance work is used to populate or destroy chunks asynchronously.  We
 * try to keep the number of populated free pages between
 * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and schedule
 * empty chunk destruction if we don't have enough free chunks.
 */
static void pcpu_balance_workfn(struct work_struct *work);
static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
static bool pcpu_async_enabled __read_mostly;
static bool pcpu_atomic_alloc_failed;

static void pcpu_schedule_balance_work(void)
{
	if (pcpu_async_enabled)
		schedule_work(&pcpu_balance_work);
}

static bool pcpu_addr_in_first_chunk(void *addr)
{
	void *first_start = pcpu_first_chunk->base_addr;

	return addr >= first_start && addr < first_start + pcpu_unit_size;
}

static bool pcpu_addr_in_reserved_chunk(void *addr)
{
	void *first_start = pcpu_first_chunk->base_addr;

	return addr >= first_start &&
	       addr < first_start + pcpu_reserved_chunk_limit;
}

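/*
 * Chunk slot math (descriptive note, not from the original source):
 * __pcpu_size_to_slot() returns max(fls(size) - PCPU_SLOT_BASE_SHIFT + 2, 1),
 * so with a base shift of 5, sizes up to 15 bytes land in slot 1, 16-31
 * in slot 2, 32-63 in slot 3 and so on, each slot covering twice the
 * size range of the previous one.
 */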
static int __pcpu_size_to_slot(int size)
{
	int highbit = fls(size);	/* size is in bytes */
	return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
}

static int pcpu_size_to_slot(int size)
{
	if (size == pcpu_unit_size)
		return pcpu_nr_slots - 1;
	return __pcpu_size_to_slot(size);
}

static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
{
	if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int))
		return 0;

	return pcpu_size_to_slot(chunk->free_size);
}

/* set the pointer to a chunk in a page struct */
static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
{
	page->index = (unsigned long)pcpu;
}

/* obtain pointer to a chunk from a page struct */
static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
{
	return (struct pcpu_chunk *)page->index;
}

static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
{
	return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
}

static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
				     unsigned int cpu, int page_idx)
{
	return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] +
		(page_idx << PAGE_SHIFT);
}

static void __maybe_unused pcpu_next_unpop(struct pcpu_chunk *chunk,
					   int *rs, int *re, int end)
{
	*rs = find_next_zero_bit(chunk->populated, end, *rs);
	*re = find_next_bit(chunk->populated, end, *rs + 1);
}

static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk,
					 int *rs, int *re, int end)
{
	*rs = find_next_bit(chunk->populated, end, *rs);
	*re = find_next_zero_bit(chunk->populated, end, *rs + 1);
}

/*
 * (Un)populated page region iterators.  Iterate over (un)populated
 * page regions between @start and @end in @chunk.  @rs and @re should
 * be integer variables and will be set to start and end page index of
 * the current region.
 */
#define pcpu_for_each_unpop_region(chunk, rs, re, start, end)		    \
	for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \
	     (rs) < (re);						    \
	     (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end)))

#define pcpu_for_each_pop_region(chunk, rs, re, start, end)		    \
	for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end));   \
	     (rs) < (re);						    \
	     (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))
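
/*
 * Usage example (added note): pcpu_for_each_unpop_region(chunk, rs, re,
 * 0, pcpu_unit_pages) visits every hole in chunk->populated, yielding
 * [rs, re) page index ranges that still need pcpu_populate_chunk()
 * before they can back an allocation.
 */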

/**
 * pcpu_mem_zalloc - allocate memory
 * @size: bytes to allocate
 *
 * Allocate @size bytes.  If @size is smaller than PAGE_SIZE,
 * kzalloc() is used; otherwise, vzalloc() is used.  The returned
 * memory is always zeroed.
 *
 * CONTEXT:
 * Does GFP_KERNEL allocation.
 *
 * RETURNS:
 * Pointer to the allocated area on success, NULL on failure.
 */
static void *pcpu_mem_zalloc(size_t size)
{
	if (WARN_ON_ONCE(!slab_is_available()))
		return NULL;

	if (size <= PAGE_SIZE)
		return kzalloc(size, GFP_KERNEL);
	else
		return vzalloc(size);
}

/**
 * pcpu_mem_free - free memory
 * @ptr: memory to free
 * @size: size of the area
 *
 * Free @ptr.  @ptr should have been allocated via pcpu_mem_zalloc().
 */
static void pcpu_mem_free(void *ptr, size_t size)
{
	if (size <= PAGE_SIZE)
		kfree(ptr);
	else
		vfree(ptr);
}

/**
 * pcpu_count_occupied_pages - count the number of pages an area occupies
 * @chunk: chunk of interest
 * @i: index of the area in question
 *
 * Count the number of pages chunk's @i'th area occupies.  When the area's
 * start and/or end address isn't aligned to page boundary, the straddled
 * page is included in the count iff the rest of the page is free.
 */
static int pcpu_count_occupied_pages(struct pcpu_chunk *chunk, int i)
{
	int off = chunk->map[i] & ~1;
	int end = chunk->map[i + 1] & ~1;

	if (!PAGE_ALIGNED(off) && i > 0) {
		int prev = chunk->map[i - 1];

		if (!(prev & 1) && prev <= round_down(off, PAGE_SIZE))
			off = round_down(off, PAGE_SIZE);
	}

	if (!PAGE_ALIGNED(end) && i + 1 < chunk->map_used) {
		int next = chunk->map[i + 1];
		int nend = chunk->map[i + 2] & ~1;

		if (!(next & 1) && nend >= round_up(end, PAGE_SIZE))
			end = round_up(end, PAGE_SIZE);
	}

	return max_t(int, PFN_DOWN(end) - PFN_UP(off), 0);
}
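
/*
 * Worked example (added note): with 4K pages, an area spanning
 * [100, 8292) whose neighbouring areas are both in use counts
 * PFN_DOWN(8292) - PFN_UP(100) = 2 - 1 = 1 page; only the fully
 * contained page is attributed to the area, the straddled first and
 * last pages stay accounted to the neighbours.
 */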

/**
 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
 * @chunk: chunk of interest
 * @oslot: the previous slot it was on
 *
 * This function is called after an allocation or free changed @chunk.
 * New slot according to the changed state is determined and @chunk is
 * moved to the slot.  Note that the reserved chunk is never put on
 * chunk slots.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
{
	int nslot = pcpu_chunk_slot(chunk);

	if (chunk != pcpu_reserved_chunk && oslot != nslot) {
		if (oslot < nslot)
			list_move(&chunk->list, &pcpu_slot[nslot]);
		else
			list_move_tail(&chunk->list, &pcpu_slot[nslot]);
	}
}

/**
 * pcpu_need_to_extend - determine whether chunk area map needs to be extended
 * @chunk: chunk of interest
 * @is_atomic: the allocation context
 *
 * Determine whether area map of @chunk needs to be extended.  If
 * @is_atomic, only the amount necessary for a new allocation is
 * considered; however, async extension is scheduled if the left amount is
 * low.  If !@is_atomic, it aims for more empty space.  Combined, this
 * ensures that the map is likely to have enough available space to
 * accommodate atomic allocations which can't extend maps directly.
 *
 * CONTEXT:
 * pcpu_lock.
 *
 * RETURNS:
 * New target map allocation length if extension is necessary, 0
 * otherwise.
 */
static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic)
{
	int margin, new_alloc;

	if (is_atomic) {
		margin = 3;

		if (chunk->map_alloc <
		    chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW &&
		    pcpu_async_enabled)
			schedule_work(&chunk->map_extend_work);
	} else {
		margin = PCPU_ATOMIC_MAP_MARGIN_HIGH;
	}

	if (chunk->map_alloc >= chunk->map_used + margin)
		return 0;

	new_alloc = PCPU_DFL_MAP_ALLOC;
	while (new_alloc < chunk->map_used + margin)
		new_alloc *= 2;

	return new_alloc;
}
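
/*
 * Worked example (added note): a chunk with map_used 200 and map_alloc
 * 224 satisfies the atomic margin of 3 but not the !atomic margin of 64:
 * 200 + 64 = 264 entries are needed, so new_alloc doubles from
 * PCPU_DFL_MAP_ALLOC (16) up through 32, 64, 128, 256 to 512, the first
 * size that fits.
 */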

/**
 * pcpu_extend_area_map - extend area map of a chunk
 * @chunk: chunk of interest
 * @new_alloc: new target allocation length of the area map
 *
 * Extend area map of @chunk to have @new_alloc entries.
 *
 * CONTEXT:
 * Does GFP_KERNEL allocation.  Grabs and releases pcpu_lock.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
{
	int *old = NULL, *new = NULL;
	size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
	unsigned long flags;

	new = pcpu_mem_zalloc(new_size);
	if (!new)
		return -ENOMEM;

	/* acquire pcpu_lock and switch to new area map */
	spin_lock_irqsave(&pcpu_lock, flags);

	if (new_alloc <= chunk->map_alloc)
		goto out_unlock;

	old_size = chunk->map_alloc * sizeof(chunk->map[0]);
	old = chunk->map;

	memcpy(new, old, old_size);

	chunk->map_alloc = new_alloc;
	chunk->map = new;
	new = NULL;

out_unlock:
	spin_unlock_irqrestore(&pcpu_lock, flags);

	/*
	 * pcpu_mem_free() might end up calling vfree() which uses
	 * IRQ-unsafe lock and thus can't be called under pcpu_lock.
	 */
	pcpu_mem_free(old, old_size);
	pcpu_mem_free(new, new_size);

	return 0;
}

static void pcpu_map_extend_workfn(struct work_struct *work)
{
	struct pcpu_chunk *chunk = container_of(work, struct pcpu_chunk,
						map_extend_work);
	int new_alloc;

	spin_lock_irq(&pcpu_lock);
	new_alloc = pcpu_need_to_extend(chunk, false);
	spin_unlock_irq(&pcpu_lock);

	if (new_alloc)
		pcpu_extend_area_map(chunk, new_alloc);
}

/**
 * pcpu_fit_in_area - try to fit the requested allocation in a candidate area
 * @chunk: chunk the candidate area belongs to
 * @off: the offset to the start of the candidate area
 * @this_size: the size of the candidate area
 * @size: the size of the target allocation
 * @align: the alignment of the target allocation
 * @pop_only: only allocate from already populated region
 *
 * We're trying to allocate @size bytes aligned at @align.  @chunk's area
 * at @off sized @this_size is a candidate.  This function determines
 * whether the target allocation fits in the candidate area and returns the
 * number of bytes to pad after @off.  If the target area doesn't fit, -1
 * is returned.
 *
 * If @pop_only is %true, this function only considers the already
 * populated part of the candidate area.
 */
static int pcpu_fit_in_area(struct pcpu_chunk *chunk, int off, int this_size,
			    int size, int align, bool pop_only)
{
	int cand_off = off;

	while (true) {
		int head = ALIGN(cand_off, align) - off;
		int page_start, page_end, rs, re;

		if (this_size < head + size)
			return -1;

		if (!pop_only)
			return head;

		/*
		 * If the first unpopulated page is beyond the end of the
		 * allocation, the whole allocation is populated;
		 * otherwise, retry from the end of the unpopulated area.
		 */
		page_start = PFN_DOWN(head + off);
		page_end = PFN_UP(head + off + size);

		rs = page_start;
		pcpu_next_unpop(chunk, &rs, &re, PFN_UP(off + this_size));
		if (rs >= page_end)
			return head;
		cand_off = re * PAGE_SIZE;
	}
}

/**
 * pcpu_alloc_area - allocate area from a pcpu_chunk
 * @chunk: chunk of interest
 * @size: wanted size in bytes
 * @align: wanted align
 * @pop_only: allocate only from the populated area
 * @occ_pages_p: out param for the number of pages the area occupies
 *
 * Try to allocate @size bytes area aligned at @align from @chunk.
 * Note that this function only allocates the offset.  It doesn't
 * populate or map the area.
 *
 * @chunk->map must have at least two free slots.
 *
 * CONTEXT:
 * pcpu_lock.
 *
 * RETURNS:
 * Allocated offset in @chunk on success, -1 if no matching area is
 * found.
 */
static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align,
			   bool pop_only, int *occ_pages_p)
{
	int oslot = pcpu_chunk_slot(chunk);
	int max_contig = 0;
	int i, off;
	bool seen_free = false;
	int *p;

	for (i = chunk->first_free, p = chunk->map + i; i < chunk->map_used; i++, p++) {
		int head, tail;
		int this_size;

		off = *p;
		if (off & 1)
			continue;

		this_size = (p[1] & ~1) - off;

		head = pcpu_fit_in_area(chunk, off, this_size, size, align,
					pop_only);
		if (head < 0) {
			if (!seen_free) {
				chunk->first_free = i;
				seen_free = true;
			}
			max_contig = max(this_size, max_contig);
			continue;
		}

		/*
		 * If head is small or the previous block is free,
		 * merge'em.  Note that 'small' is defined as smaller
		 * than sizeof(int), which is very small but isn't too
		 * uncommon for percpu allocations.
		 */
		if (head && (head < sizeof(int) || !(p[-1] & 1))) {
			*p = off += head;
			if (p[-1] & 1)
				chunk->free_size -= head;
			else
				max_contig = max(*p - p[-1], max_contig);
			this_size -= head;
			head = 0;
		}

		/* if tail is small, just keep it around */
		tail = this_size - head - size;
		if (tail < sizeof(int)) {
			tail = 0;
			size = this_size - head;
		}

		/* split if warranted */
		if (head || tail) {
			int nr_extra = !!head + !!tail;

			/* insert new subblocks */
			memmove(p + nr_extra + 1, p + 1,
				sizeof(chunk->map[0]) * (chunk->map_used - i));
			chunk->map_used += nr_extra;

			if (head) {
				if (!seen_free) {
					chunk->first_free = i;
					seen_free = true;
				}
				*++p = off += head;
				++i;
				max_contig = max(head, max_contig);
			}
			if (tail) {
				p[1] = off + size;
				max_contig = max(tail, max_contig);
			}
		}

		if (!seen_free)
			chunk->first_free = i + 1;

		/* update hint and mark allocated */
		if (i + 1 == chunk->map_used)
			chunk->contig_hint = max_contig; /* fully scanned */
		else
			chunk->contig_hint = max(chunk->contig_hint,
						 max_contig);

		chunk->free_size -= size;
		*p |= 1;

		*occ_pages_p = pcpu_count_occupied_pages(chunk, i);
		pcpu_chunk_relocate(chunk, oslot);
		return off;
	}

	chunk->contig_hint = max_contig;	/* fully scanned */
	pcpu_chunk_relocate(chunk, oslot);

	/* tell the upper layer that this chunk has no matching area */
	return -1;
}

/**
 * pcpu_free_area - free area to a pcpu_chunk
 * @chunk: chunk of interest
 * @freeme: offset of area to free
 * @occ_pages_p: out param for the number of pages the area occupies
 *
 * Free area starting from @freeme to @chunk.  Note that this function
 * only modifies the allocation map.  It doesn't depopulate or unmap
 * the area.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme,
			   int *occ_pages_p)
{
	int oslot = pcpu_chunk_slot(chunk);
	int off = 0;
	unsigned i, j;
	int to_free = 0;
	int *p;

	freeme |= 1;	/* we are searching for <given offset, in use> pair */

	/* binary search the sorted area map for the matching entry */
	i = 0;
	j = chunk->map_used;
	while (i != j) {
		unsigned k = (i + j) / 2;
		off = chunk->map[k];
		if (off < freeme)
			i = k + 1;
		else if (off > freeme)
			j = k;
		else
			i = j = k;
	}
	BUG_ON(off != freeme);

	if (i < chunk->first_free)
		chunk->first_free = i;

	p = chunk->map + i;
	*p = off &= ~1;
	chunk->free_size += (p[1] & ~1) - off;

	*occ_pages_p = pcpu_count_occupied_pages(chunk, i);

	/* merge with next? */
	if (!(p[1] & 1))
		to_free++;
	/* merge with previous? */
	if (i > 0 && !(p[-1] & 1)) {
		to_free++;
		i--;
		p--;
	}
	if (to_free) {
		chunk->map_used -= to_free;
		memmove(p + 1, p + 1 + to_free,
			(chunk->map_used - i) * sizeof(chunk->map[0]));
	}

	chunk->contig_hint = max(chunk->map[i + 1] - chunk->map[i] - 1,
				 chunk->contig_hint);
	pcpu_chunk_relocate(chunk, oslot);
}

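/*
 * Note (added): a fresh chunk's area map holds two entries - offset 0
 * marked free and the <pcpu_unit_size, in-use> sentry terminating the
 * map - i.e. one free area covering the whole unit.
 */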
static struct pcpu_chunk *pcpu_alloc_chunk(void)
{
	struct pcpu_chunk *chunk;

	chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size);
	if (!chunk)
		return NULL;

	chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC *
						sizeof(chunk->map[0]));
	if (!chunk->map) {
		pcpu_mem_free(chunk, pcpu_chunk_struct_size);
		return NULL;
	}

	chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
	chunk->map[0] = 0;
	chunk->map[1] = pcpu_unit_size | 1;
	chunk->map_used = 1;

	INIT_LIST_HEAD(&chunk->list);
	INIT_WORK(&chunk->map_extend_work, pcpu_map_extend_workfn);
	chunk->free_size = pcpu_unit_size;
	chunk->contig_hint = pcpu_unit_size;

	return chunk;
}

static void pcpu_free_chunk(struct pcpu_chunk *chunk)
{
	if (!chunk)
		return;
	pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
	pcpu_mem_free(chunk, pcpu_chunk_struct_size);
}

/**
 * pcpu_chunk_populated - post-population bookkeeping
 * @chunk: pcpu_chunk which got populated
 * @page_start: the start page
 * @page_end: the end page
 *
 * Pages in [@page_start,@page_end) have been populated to @chunk.  Update
 * the bookkeeping information accordingly.  Must be called after each
 * successful population.
 */
static void pcpu_chunk_populated(struct pcpu_chunk *chunk,
				 int page_start, int page_end)
{
	int nr = page_end - page_start;

	lockdep_assert_held(&pcpu_lock);

	bitmap_set(chunk->populated, page_start, nr);
	chunk->nr_populated += nr;
	pcpu_nr_empty_pop_pages += nr;
}

/**
 * pcpu_chunk_depopulated - post-depopulation bookkeeping
 * @chunk: pcpu_chunk which got depopulated
 * @page_start: the start page
 * @page_end: the end page
 *
 * Pages in [@page_start,@page_end) have been depopulated from @chunk.
 * Update the bookkeeping information accordingly.  Must be called after
 * each successful depopulation.
 */
static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
				   int page_start, int page_end)
{
	int nr = page_end - page_start;

	lockdep_assert_held(&pcpu_lock);

	bitmap_clear(chunk->populated, page_start, nr);
	chunk->nr_populated -= nr;
	pcpu_nr_empty_pop_pages -= nr;
}

/*
 * Chunk management implementation.
 *
 * To allow different implementations, chunk alloc/free and
 * [de]population are implemented in a separate file which is pulled
 * into this file and compiled together.  The following functions
 * should be implemented.
 *
 * pcpu_populate_chunk		- populate the specified range of a chunk
 * pcpu_depopulate_chunk	- depopulate the specified range of a chunk
 * pcpu_create_chunk		- create a new chunk
 * pcpu_destroy_chunk		- destroy a chunk, always preceded by full depop
 * pcpu_addr_to_page		- translate address to physical address
 * pcpu_verify_alloc_info	- check alloc_info is acceptable during init
 */
static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size);
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size);
static struct pcpu_chunk *pcpu_create_chunk(void);
static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
static struct page *pcpu_addr_to_page(void *addr);
static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);

#ifdef CONFIG_NEED_PER_CPU_KM
#include "percpu-km.c"
#else
#include "percpu-vm.c"
#endif

/**
 * pcpu_chunk_addr_search - determine chunk containing specified address
 * @addr: address for which the chunk needs to be determined.
 *
 * RETURNS:
 * The address of the found chunk.
 */
static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
{
	/* is it in the first chunk? */
	if (pcpu_addr_in_first_chunk(addr)) {
		/* is it in the reserved area? */
		if (pcpu_addr_in_reserved_chunk(addr))
			return pcpu_reserved_chunk;
		return pcpu_first_chunk;
	}

	/*
	 * The address is relative to unit0 which might be unused and
	 * thus unmapped.  Offset the address to the unit space of the
	 * current processor before looking it up in the vmalloc
	 * space.  Note that any possible cpu id can be used here, so
	 * there's no need to worry about preemption or cpu hotplug.
	 */
	addr += pcpu_unit_offsets[raw_smp_processor_id()];
	return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
}

/**
 * pcpu_alloc - the percpu allocator
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 * @reserved: allocate from the reserved chunk if available
 * @gfp: allocation flags
 *
 * Allocate percpu area of @size bytes aligned at @align.  If @gfp doesn't
 * contain %GFP_KERNEL, the allocation is atomic.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
				 gfp_t gfp)
{
	static int warn_limit = 10;
	struct pcpu_chunk *chunk;
	const char *err;
	bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
	int occ_pages = 0;
	int slot, off, new_alloc, cpu, ret;
	unsigned long flags;
	void __percpu *ptr;

	/*
	 * We want the lowest bit of offset available for in-use/free
	 * marker, so force >= 16bit alignment and make size even.
	 */
	if (unlikely(align < 2))
		align = 2;

	size = ALIGN(size, 2);

	if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
		WARN(true, "illegal size (%zu) or align (%zu) for "
		     "percpu allocation\n", size, align);
		return NULL;
	}

	spin_lock_irqsave(&pcpu_lock, flags);

	/* serve reserved allocations from the reserved chunk if available */
	if (reserved && pcpu_reserved_chunk) {
		chunk = pcpu_reserved_chunk;

		if (size > chunk->contig_hint) {
			err = "alloc from reserved chunk failed";
			goto fail_unlock;
		}

		while ((new_alloc = pcpu_need_to_extend(chunk, is_atomic))) {
			spin_unlock_irqrestore(&pcpu_lock, flags);
			if (is_atomic ||
			    pcpu_extend_area_map(chunk, new_alloc) < 0) {
				err = "failed to extend area map of reserved chunk";
				goto fail;
			}
			spin_lock_irqsave(&pcpu_lock, flags);
		}

		off = pcpu_alloc_area(chunk, size, align, is_atomic,
				      &occ_pages);
		if (off >= 0)
			goto area_found;

		err = "alloc from reserved chunk failed";
		goto fail_unlock;
	}

restart:
	/* search through normal chunks */
	for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
			if (size > chunk->contig_hint)
				continue;

			new_alloc = pcpu_need_to_extend(chunk, is_atomic);
			if (new_alloc) {
				if (is_atomic)
					continue;
				spin_unlock_irqrestore(&pcpu_lock, flags);
				if (pcpu_extend_area_map(chunk,
							 new_alloc) < 0) {
					err = "failed to extend area map";
					goto fail;
				}
				spin_lock_irqsave(&pcpu_lock, flags);
				/*
				 * The chunk may have been relocated or its
				 * map changed while the lock was dropped;
				 * restart the scan from the top.
				 */
				goto restart;
			}

			off = pcpu_alloc_area(chunk, size, align, is_atomic,
					      &occ_pages);
			if (off >= 0)
				goto area_found;
		}
	}

	spin_unlock_irqrestore(&pcpu_lock, flags);

	/*
	 * No space left.  Create a new chunk.  We don't want multiple
	 * tasks to create chunks simultaneously.  Serialize and create iff
	 * there's still no empty chunk after grabbing the mutex.
	 */
	if (is_atomic)
		goto fail;

	mutex_lock(&pcpu_alloc_mutex);

	if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
		chunk = pcpu_create_chunk();
		if (!chunk) {
			mutex_unlock(&pcpu_alloc_mutex);
			err = "failed to allocate new chunk";
			goto fail;
		}

		spin_lock_irqsave(&pcpu_lock, flags);
		pcpu_chunk_relocate(chunk, -1);
	} else {
		spin_lock_irqsave(&pcpu_lock, flags);
	}

	mutex_unlock(&pcpu_alloc_mutex);
	goto restart;

area_found:
	spin_unlock_irqrestore(&pcpu_lock, flags);

	/* populate if not all pages are already there */
	if (!is_atomic) {
		int page_start, page_end, rs, re;

		mutex_lock(&pcpu_alloc_mutex);

		page_start = PFN_DOWN(off);
		page_end = PFN_UP(off + size);

		pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
			WARN_ON(chunk->immutable);

			ret = pcpu_populate_chunk(chunk, rs, re);

			spin_lock_irqsave(&pcpu_lock, flags);
			if (ret) {
				mutex_unlock(&pcpu_alloc_mutex);
				pcpu_free_area(chunk, off, &occ_pages);
				err = "failed to populate";
				goto fail_unlock;
			}
			pcpu_chunk_populated(chunk, rs, re);
			spin_unlock_irqrestore(&pcpu_lock, flags);
		}

		mutex_unlock(&pcpu_alloc_mutex);
	}

	if (chunk != pcpu_reserved_chunk)
		pcpu_nr_empty_pop_pages -= occ_pages;

	if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
		pcpu_schedule_balance_work();

	/* clear the areas and return address relative to base address */
	for_each_possible_cpu(cpu)
		memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);

	ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
	kmemleak_alloc_percpu(ptr, size);
	return ptr;

fail_unlock:
	spin_unlock_irqrestore(&pcpu_lock, flags);
fail:
	if (!is_atomic && warn_limit) {
		pr_warning("PERCPU: allocation failed, size=%zu align=%zu atomic=%d, %s\n",
			   size, align, is_atomic, err);
		dump_stack();
		if (!--warn_limit)
			pr_info("PERCPU: limit reached, disable warning\n");
	}
	if (is_atomic) {
		/* see the flag handling in pcpu_balance_workfn() */
		pcpu_atomic_alloc_failed = true;
		pcpu_schedule_balance_work();
	}
	return NULL;
}
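
/*
 * Note on locking above (added): pcpu_alloc() drops pcpu_lock whenever
 * it needs to sleep (extending an area map, creating a chunk,
 * populating pages) and restarts the chunk scan afterwards.  Callers
 * without GFP_KERNEL in @gfp never take those paths and allocate only
 * from already populated area maps and pages, which is what allows them
 * to run in atomic context.
 */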

/**
 * __alloc_percpu_gfp - allocate dynamic percpu area
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 * @gfp: allocation flags
 *
 * Allocate zero-filled percpu area of @size bytes aligned at @align.  If
 * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can
 * be called from any context but is a lot more likely to fail.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp)
{
	return pcpu_alloc(size, align, false, gfp);
}
EXPORT_SYMBOL_GPL(__alloc_percpu_gfp);
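
/*
 * Usage sketch (added, hypothetical caller):
 *
 *	struct foo __percpu *p;
 *
 *	p = __alloc_percpu_gfp(sizeof(*p), __alignof__(*p), GFP_ATOMIC);
 *	if (!p)
 *		return -ENOMEM;
 *	...
 *	free_percpu(p);
 */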

/**
 * __alloc_percpu - allocate dynamic percpu area
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 *
 * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL).
 */
void __percpu *__alloc_percpu(size_t size, size_t align)
{
	return pcpu_alloc(size, align, false, GFP_KERNEL);
}
EXPORT_SYMBOL_GPL(__alloc_percpu);

/**
 * __alloc_reserved_percpu - allocate reserved percpu area
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 *
 * Allocate zero-filled percpu area of @size bytes aligned at @align
 * from reserved percpu area if arch has set it up; otherwise,
 * allocation is served from the same dynamic area.  Might sleep.
 * Might trigger writeouts.
 *
 * CONTEXT:
 * Does GFP_KERNEL allocation.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
{
	return pcpu_alloc(size, align, true, GFP_KERNEL);
}

/**
 * pcpu_balance_workfn - manage the amount of free chunks and populated pages
 * @work: unused
 *
 * Reclaim all fully free chunks except for the first one.
 */
static void pcpu_balance_workfn(struct work_struct *work)
{
	LIST_HEAD(to_free);
	struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
	struct pcpu_chunk *chunk, *next;
	int slot, nr_to_pop, ret;

	/*
	 * There's no reason to keep around multiple unused chunks and VM
	 * areas can be scarce.  Destroy all free chunks except for one.
	 */
	mutex_lock(&pcpu_alloc_mutex);
	spin_lock_irq(&pcpu_lock);

	list_for_each_entry_safe(chunk, next, free_head, list) {
		WARN_ON(chunk->immutable);

		/* spare the first one */
		if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
			continue;

		list_move(&chunk->list, &to_free);
	}

	spin_unlock_irq(&pcpu_lock);

	list_for_each_entry_safe(chunk, next, &to_free, list) {
		int rs, re;

		pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) {
			pcpu_depopulate_chunk(chunk, rs, re);
			spin_lock_irq(&pcpu_lock);
			pcpu_chunk_depopulated(chunk, rs, re);
			spin_unlock_irq(&pcpu_lock);
		}
		pcpu_destroy_chunk(chunk);
	}

	/*
	 * Ensure there are certain number of free populated pages for
	 * atomic allocs.  Fill up from the most packed chunks so that
	 * atomic allocs don't increase fragmentation.  If an atomic
	 * allocation failed previously, always populate the maximum
	 * amount.  This should prevent atomic allocs larger than
	 * PAGE_SIZE from keeping on failing.
	 */
retry_pop:
	if (pcpu_atomic_alloc_failed) {
		nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
		/* best effort anyway, don't worry about synchronization */
		pcpu_atomic_alloc_failed = false;
	} else {
		nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
				  pcpu_nr_empty_pop_pages,
				  0, PCPU_EMPTY_POP_PAGES_HIGH);
	}

	for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
		int nr_unpop = 0, rs, re;

		if (!nr_to_pop)
			break;

		spin_lock_irq(&pcpu_lock);
		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
			nr_unpop = pcpu_unit_pages - chunk->nr_populated;
			if (nr_unpop)
				break;
		}
		spin_unlock_irq(&pcpu_lock);

		if (!nr_unpop)
			continue;

		/* @chunk can't go away while pcpu_alloc_mutex is held */
		pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) {
			int nr = min(re - rs, nr_to_pop);

			ret = pcpu_populate_chunk(chunk, rs, rs + nr);
			if (!ret) {
				nr_to_pop -= nr;
				spin_lock_irq(&pcpu_lock);
				pcpu_chunk_populated(chunk, rs, rs + nr);
				spin_unlock_irq(&pcpu_lock);
			} else {
				nr_to_pop = 0;
			}

			if (!nr_to_pop)
				break;
		}
	}

	if (nr_to_pop) {
		/* ran out of chunks to populate, create a new one and retry */
		chunk = pcpu_create_chunk();
		if (chunk) {
			spin_lock_irq(&pcpu_lock);
			pcpu_chunk_relocate(chunk, -1);
			spin_unlock_irq(&pcpu_lock);
			goto retry_pop;
		}
	}

	mutex_unlock(&pcpu_alloc_mutex);
}

/**
 * free_percpu - free percpu area
 * @ptr: pointer to area to free
 *
 * Free percpu area @ptr.
 *
 * CONTEXT:
 * Can be called from atomic context.
 */
void free_percpu(void __percpu *ptr)
{
	void *addr;
	struct pcpu_chunk *chunk;
	unsigned long flags;
	int off, occ_pages;

	if (!ptr)
		return;

	kmemleak_free_percpu(ptr);

	addr = __pcpu_ptr_to_addr(ptr);

	spin_lock_irqsave(&pcpu_lock, flags);

	chunk = pcpu_chunk_addr_search(addr);
	off = addr - chunk->base_addr;

	pcpu_free_area(chunk, off, &occ_pages);

	if (chunk != pcpu_reserved_chunk)
		pcpu_nr_empty_pop_pages += occ_pages;

	/* if there are more than one fully free chunks, wake up grim reaper */
	if (chunk->free_size == pcpu_unit_size) {
		struct pcpu_chunk *pos;

		list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
			if (pos != chunk) {
				pcpu_schedule_balance_work();
				break;
			}
	}

	spin_unlock_irqrestore(&pcpu_lock, flags);
}
EXPORT_SYMBOL_GPL(free_percpu);

/**
 * is_kernel_percpu_address - test whether address is from static percpu area
 * @addr: address to test
 *
 * Test whether @addr belongs to in-kernel static percpu area.  Module
 * static percpu areas are not considered.  For those, use
 * is_module_percpu_address().
 *
 * RETURNS:
 * %true if @addr is from in-kernel static percpu area, %false otherwise.
 */
bool is_kernel_percpu_address(unsigned long addr)
{
#ifdef CONFIG_SMP
	const size_t static_size = __per_cpu_end - __per_cpu_start;
	void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		void *start = per_cpu_ptr(base, cpu);

		if ((void *)addr >= start && (void *)addr < start + static_size)
			return true;
	}
#endif
	/* on UP, can't distinguish from other static vars, always false */
	return false;
}

/**
 * per_cpu_ptr_to_phys - convert translated percpu address to physical address
 * @addr: the address to be converted to physical address
 *
 * Given @addr which is dereferenceable address obtained via one of
 * percpu access macros, this function translates it into its physical
 * address.  The caller is responsible for ensuring @addr stays valid
 * until this function finishes.
 *
 * percpu allocator has special setup for the first chunk, which currently
 * supports either embedding in linear address space or vmalloc mapping,
 * and, from the second one, the backing allocator (currently either vm or
 * km) provides translation.
 *
 * The addr can be translated simply without checking if it falls into the
 * first chunk.  But the current code reflects better how percpu allocator
 * actually works, and the verification can discover both bugs in percpu
 * allocator itself and per_cpu_ptr_to_phys() callers.  So we keep current
 * code.
 *
 * RETURNS:
 * The physical address for @addr.
 */
phys_addr_t per_cpu_ptr_to_phys(void *addr)
{
	void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
	bool in_first_chunk = false;
	unsigned long first_low, first_high;
	unsigned int cpu;

	/*
	 * The following test on unit_low/high isn't strictly
	 * necessary but will speed up lookups of addresses which
	 * aren't in the first chunk.
	 */
	first_low = pcpu_chunk_addr(pcpu_first_chunk, pcpu_low_unit_cpu, 0);
	first_high = pcpu_chunk_addr(pcpu_first_chunk, pcpu_high_unit_cpu,
				     pcpu_unit_pages);
	if ((unsigned long)addr >= first_low &&
	    (unsigned long)addr < first_high) {
		for_each_possible_cpu(cpu) {
			void *start = per_cpu_ptr(base, cpu);

			if (addr >= start && addr < start + pcpu_unit_size) {
				in_first_chunk = true;
				break;
			}
		}
	}

	if (in_first_chunk) {
		if (!is_vmalloc_addr(addr))
			return __pa(addr);
		else
			return page_to_phys(vmalloc_to_page(addr)) +
			       offset_in_page(addr);
	} else
		return page_to_phys(pcpu_addr_to_page(addr)) +
		       offset_in_page(addr);
}

/**
 * pcpu_alloc_alloc_info - allocate percpu allocation info
 * @nr_groups: the number of groups
 * @nr_units: the number of units
 *
 * Allocate ai which is large enough for @nr_groups groups containing
 * @nr_units units.  The returned ai's groups[0].cpu_map points to the
 * cpu_map array which is long enough for @nr_units and filled with
 * NR_CPUS.  It's the caller's responsibility to initialize cpu_map
 * pointer of other groups.
 *
 * RETURNS:
 * Pointer to the allocated pcpu_alloc_info on success, NULL on
 * failure.
 */
struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
						      int nr_units)
{
	struct pcpu_alloc_info *ai;
	size_t base_size, ai_size;
	void *ptr;
	int unit;

	base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]),
			  __alignof__(ai->groups[0].cpu_map[0]));
	ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);

	ptr = memblock_virt_alloc_nopanic(PFN_ALIGN(ai_size), 0);
	if (!ptr)
		return NULL;
	ai = ptr;
	ptr += base_size;

	ai->groups[0].cpu_map = ptr;

	for (unit = 0; unit < nr_units; unit++)
		ai->groups[0].cpu_map[unit] = NR_CPUS;

	ai->nr_groups = nr_groups;
	ai->__ai_size = PFN_ALIGN(ai_size);

	return ai;
}

/**
 * pcpu_free_alloc_info - free percpu allocation info
 * @ai: pcpu_alloc_info to free
 *
 * Free @ai which was allocated by pcpu_alloc_alloc_info().
 */
void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
{
	memblock_free_early(__pa(ai), ai->__ai_size);
}

/**
 * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
 * @lvl: loglevel
 * @ai: allocation info to dump
 *
 * Print out information about @ai using loglevel @lvl.
 */
static void pcpu_dump_alloc_info(const char *lvl,
				 const struct pcpu_alloc_info *ai)
{
	int group_width = 1, cpu_width = 1, width;
	char empty_str[] = "--------";
	int alloc = 0, alloc_end = 0;
	int group, v;
	int upa, apl;	/* units per alloc, allocs per line */

	v = ai->nr_groups;
	while (v /= 10)
		group_width++;

	v = num_possible_cpus();
	while (v /= 10)
		cpu_width++;
	empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';

	upa = ai->alloc_size / ai->unit_size;
	width = upa * (cpu_width + 1) + group_width + 3;
	apl = rounddown_pow_of_two(max(60 / width, 1));

	printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
	       lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
	       ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);

	for (group = 0; group < ai->nr_groups; group++) {
		const struct pcpu_group_info *gi = &ai->groups[group];
		int unit = 0, unit_end = 0;

		BUG_ON(gi->nr_units % upa);
		for (alloc_end += gi->nr_units / upa;
		     alloc < alloc_end; alloc++) {
			if (!(alloc % apl)) {
				printk(KERN_CONT "\n");
				printk("%spcpu-alloc: ", lvl);
			}
			printk(KERN_CONT "[%0*d] ", group_width, group);

			for (unit_end += upa; unit < unit_end; unit++)
				if (gi->cpu_map[unit] != NR_CPUS)
					printk(KERN_CONT "%0*d ", cpu_width,
					       gi->cpu_map[unit]);
				else
					printk(KERN_CONT "%s ", empty_str);
		}
	}
	printk(KERN_CONT "\n");
}

/**
 * pcpu_setup_first_chunk - initialize the first percpu chunk
 * @ai: pcpu_alloc_info describing how to percpu area is shaped
 * @base_addr: mapped address
 *
 * Initialize the first percpu chunk which contains the kernel static
 * percpu area.  This function is to be called from arch percpu area
 * setup path.
 *
 * @ai contains all information necessary to initialize the first
 * chunk and prime the dynamic percpu allocator.
 *
 * @ai->static_size is the size of static percpu area.
 *
 * @ai->reserved_size, if non-zero, specifies the amount of bytes to
 * reserve after the static area in the first chunk.  This reserves
 * the first chunk such that it's available only through reserved
 * percpu allocation.  This is primarily used to serve module percpu
 * static areas on architectures where the addressing model has
 * limited offset range for symbol relocations to guarantee module
 * percpu symbols fall inside the relocatable range.
 *
 * @ai->dyn_size determines the number of bytes available for dynamic
 * allocation in the first chunk.  The area between @ai->static_size +
 * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused.
 *
 * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE
 * and equal to or larger than @ai->static_size + @ai->reserved_size +
 * @ai->dyn_size.
 *
 * @ai->atom_size is the allocation atom size and used as alignment
 * for vm areas.
 *
 * @ai->alloc_size is the allocation size and always multiple of
 * @ai->atom_size.
 *
 * @ai->nr_groups and @ai->groups describe virtual memory layout of
 * percpu areas.  Units which should be colocated are put into the
 * same group.  Dynamic VM areas will be allocated according to these
 * groupings.
 *
 * The caller should have mapped the first chunk at @base_addr and
 * copied static data to each unit.
 *
 * If the first chunk ends up with both reserved and dynamic areas, it
 * is served by two chunks - one to serve the core static and reserved
 * areas and the other for the dynamic area.  They share the same vm
 * and page map but use different area allocation maps to stay away
 * from each other.  The latter chunk is circulated in the chunk slots
 * and available for dynamic allocation like any other chunk.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
				  void *base_addr)
{
	static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
	static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
	size_t dyn_size = ai->dyn_size;
	size_t size_sum = ai->static_size + ai->reserved_size + dyn_size;
	struct pcpu_chunk *schunk, *dchunk = NULL;
	unsigned long *group_offsets;
	size_t *group_sizes;
	unsigned long *unit_off;
	unsigned int cpu;
	int *unit_map;
	int group, unit, i;

#define PCPU_SETUP_BUG_ON(cond)	do {					\
	if (unlikely(cond)) {						\
		pr_emerg("PERCPU: failed to initialize, %s", #cond);	\
		pr_emerg("PERCPU: cpu_possible_mask=%*pb\n",		\
			 cpumask_pr_args(cpu_possible_mask));		\
		pcpu_dump_alloc_info(KERN_EMERG, ai);			\
		BUG();							\
	}								\
} while (0)

	/* sanity checks */
	PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
#ifdef CONFIG_SMP
	PCPU_SETUP_BUG_ON(!ai->static_size);
	PCPU_SETUP_BUG_ON((unsigned long)__per_cpu_start & ~PAGE_MASK);
#endif
	PCPU_SETUP_BUG_ON(!base_addr);
	PCPU_SETUP_BUG_ON((unsigned long)base_addr & ~PAGE_MASK);
	PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
	PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);
	PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
	PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
	PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);

	/* process group information and build config tables accordingly */
	group_offsets = memblock_virt_alloc(ai->nr_groups *
					     sizeof(group_offsets[0]), 0);
	group_sizes = memblock_virt_alloc(ai->nr_groups *
					   sizeof(group_sizes[0]), 0);
	unit_map = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_map[0]), 0);
	unit_off = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_off[0]), 0);

	for (cpu = 0; cpu < nr_cpu_ids; cpu++)
		unit_map[cpu] = UINT_MAX;

	pcpu_low_unit_cpu = NR_CPUS;
	pcpu_high_unit_cpu = NR_CPUS;

	for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
		const struct pcpu_group_info *gi = &ai->groups[group];

		group_offsets[group] = gi->base_offset;
		group_sizes[group] = gi->nr_units * ai->unit_size;

		for (i = 0; i < gi->nr_units; i++) {
			cpu = gi->cpu_map[i];
			if (cpu == NR_CPUS)
				continue;

			PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids);
			PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
			PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);

			unit_map[cpu] = unit + i;
			unit_off[cpu] = gi->base_offset + i * ai->unit_size;

			/* determine low/high unit_cpu */
			if (pcpu_low_unit_cpu == NR_CPUS ||
			    unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
				pcpu_low_unit_cpu = cpu;
			if (pcpu_high_unit_cpu == NR_CPUS ||
			    unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
				pcpu_high_unit_cpu = cpu;
		}
	}
	pcpu_nr_units = unit;

	for_each_possible_cpu(cpu)
		PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);

	/* we're done parsing the input, undefine BUG macro and dump config */
#undef PCPU_SETUP_BUG_ON
	pcpu_dump_alloc_info(KERN_DEBUG, ai);

	pcpu_nr_groups = ai->nr_groups;
	pcpu_group_offsets = group_offsets;
	pcpu_group_sizes = group_sizes;
	pcpu_unit_map = unit_map;
	pcpu_unit_offsets = unit_off;

	/* determine basic parameters */
	pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
	pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
	pcpu_atom_size = ai->atom_size;
	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
		BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);

	/*
	 * Allocate chunk slots.  The additional last slot is for
	 * empty chunks.
	 */
	pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
	pcpu_slot = memblock_virt_alloc(
			pcpu_nr_slots * sizeof(pcpu_slot[0]), 0);
	for (i = 0; i < pcpu_nr_slots; i++)
		INIT_LIST_HEAD(&pcpu_slot[i]);

	/*
	 * Initialize static chunk.  If reserved_size is zero, the
	 * static chunk covers static area + dynamic allocation area
	 * in the first chunk.  If reserved_size is not zero, it
	 * covers static area + reserved area (mostly used for module
	 * static percpu allocation).
	 */
	schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
	INIT_LIST_HEAD(&schunk->list);
	INIT_WORK(&schunk->map_extend_work, pcpu_map_extend_workfn);
	schunk->base_addr = base_addr;
	schunk->map = smap;
	schunk->map_alloc = ARRAY_SIZE(smap);
	schunk->immutable = true;
	bitmap_fill(schunk->populated, pcpu_unit_pages);
	schunk->nr_populated = pcpu_unit_pages;

	if (ai->reserved_size) {
		schunk->free_size = ai->reserved_size;
		pcpu_reserved_chunk = schunk;
		pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size;
	} else {
		schunk->free_size = dyn_size;
		dyn_size = 0;			/* dynamic area covered */
	}
	schunk->contig_hint = schunk->free_size;

	schunk->map[0] = 1;
	schunk->map[1] = ai->static_size;
	schunk->map_used = 1;
	if (schunk->free_size)
		schunk->map[++schunk->map_used] = 1 | (ai->static_size +
						       schunk->free_size);
	else
		schunk->map[1] |= 1;

	/* init dynamic chunk if necessary */
	if (dyn_size) {
		dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
		INIT_LIST_HEAD(&dchunk->list);
		INIT_WORK(&dchunk->map_extend_work, pcpu_map_extend_workfn);
		dchunk->base_addr = base_addr;
		dchunk->map = dmap;
		dchunk->map_alloc = ARRAY_SIZE(dmap);
		dchunk->immutable = true;
		bitmap_fill(dchunk->populated, pcpu_unit_pages);
		dchunk->nr_populated = pcpu_unit_pages;

		dchunk->contig_hint = dchunk->free_size = dyn_size;
		dchunk->map[0] = 1;
		dchunk->map[1] = pcpu_reserved_chunk_limit;
		dchunk->map[2] = (pcpu_reserved_chunk_limit + dchunk->free_size) | 1;
		dchunk->map_used = 2;
	}

	/* link the first chunk in */
	pcpu_first_chunk = dchunk ?: schunk;
	pcpu_nr_empty_pop_pages +=
		pcpu_count_occupied_pages(pcpu_first_chunk, 1);
	pcpu_chunk_relocate(pcpu_first_chunk, -1);

	/* we're done */
	pcpu_base_addr = base_addr;
	return 0;
}
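
/*
 * Example first chunk layout (added note): with a reserved region, the
 * static chunk's map ends up as { 0|1, static_size,
 * (static_size + reserved_size)|1 } - the static area in use and the
 * reserved area free for reserved allocs - while the dynamic chunk's
 * map is { 0|1, reserved_limit, (reserved_limit + dyn_size)|1 },
 * exposing only the dynamic area for regular allocation.
 */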

#ifdef CONFIG_SMP

const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = {
	[PCPU_FC_AUTO]	= "auto",
	[PCPU_FC_EMBED]	= "embed",
	[PCPU_FC_PAGE]	= "page",
};

enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;

static int __init percpu_alloc_setup(char *str)
{
	if (!str)
		return -EINVAL;

	if (0)
		/* nada */;
#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
	else if (!strcmp(str, "embed"))
		pcpu_chosen_fc = PCPU_FC_EMBED;
#endif
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
	else if (!strcmp(str, "page"))
		pcpu_chosen_fc = PCPU_FC_PAGE;
#endif
	else
		pr_warning("PERCPU: unknown allocator %s specified\n", str);

	return 0;
}
early_param("percpu_alloc", percpu_alloc_setup);
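
/*
 * Example (added note): booting with "percpu_alloc=page" on a kernel
 * built with CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK selects the
 * page-mapped first chunk allocator instead of the default.
 */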

/*
 * pcpu_embed_first_chunk() is used by the generic percpu setup.
 * Build it if needed by the arch config or the generic setup is used.
 */
#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
	!defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
#define BUILD_EMBED_FIRST_CHUNK
#endif

/* build pcpu_page_first_chunk() iff needed by the arch config */
#if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
#define BUILD_PAGE_FIRST_CHUNK
#endif

/* pcpu_build_alloc_info() is used by both embed and page first chunk */
#if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)

/**
 * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
 * @reserved_size: the size of reserved percpu area in bytes
 * @dyn_size: minimum free size for dynamic allocation in bytes
 * @atom_size: allocation atom size
 * @cpu_distance_fn: callback to determine distance between cpus, optional
 *
 * This function determines grouping of units, their mappings to cpus
 * and other parameters considering needed percpu size, allocation
 * atom size and distances between CPUs.
 *
 * Groups are always multiples of atom size and CPUs which are of
 * LOCAL_DISTANCE both ways are grouped together and share space for
 * units in the same group.  The returned configuration is guaranteed
 * to have CPUs on different nodes on different groups and >=75% usage
 * of allocated virtual address space.
 *
 * RETURNS:
 * On success, pointer to the new allocation_info is returned.  On
 * failure, ERR_PTR value is returned.
 */
static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
				size_t reserved_size, size_t dyn_size,
				size_t atom_size,
				pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
{
	static int group_map[NR_CPUS] __initdata;
	static int group_cnt[NR_CPUS] __initdata;
	const size_t static_size = __per_cpu_end - __per_cpu_start;
	int nr_groups = 1, nr_units = 0;
	size_t size_sum, min_unit_size, alloc_size;
	int upa, max_upa, uninitialized_var(best_upa);	/* units_per_alloc */
	int last_allocs, group, unit;
	unsigned int cpu, tcpu;
	struct pcpu_alloc_info *ai;
	unsigned int *cpu_map;

	/* this function may be called multiple times */
	memset(group_map, 0, sizeof(group_map));
	memset(group_cnt, 0, sizeof(group_cnt));

	/* calculate size_sum and ensure dyn_size is enough for early alloc */
	size_sum = PFN_ALIGN(static_size + reserved_size +
			    max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
	dyn_size = size_sum - static_size - reserved_size;

	/*
	 * Determine min_unit_size, alloc_size and max_upa such that
	 * alloc_size is multiple of atom_size and is the smallest
	 * which can accommodate 4k aligned segments which are equal to
	 * or larger than min_unit_size.
	 */
	min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);

	alloc_size = roundup(min_unit_size, atom_size);
	upa = alloc_size / min_unit_size;
	while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
		upa--;
	max_upa = upa;
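
	/*
	 * Worked example (added note): a 2MB atom with a 20KB
	 * min_unit_size starts at upa = 2MB / 20KB = 102 and walks down
	 * until upa divides 2MB evenly with a page-aligned quotient,
	 * settling on max_upa = 64 and 32KB units.
	 */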

	/* group cpus according to their proximity */
	for_each_possible_cpu(cpu) {
		group = 0;
	next_group:
		for_each_possible_cpu(tcpu) {
			if (cpu == tcpu)
				break;
			if (group_map[tcpu] == group && cpu_distance_fn &&
			    (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
			     cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
				group++;
				nr_groups = max(nr_groups, group + 1);
				goto next_group;
			}
		}
		group_map[cpu] = group;
		group_cnt[group]++;
	}

	/*
	 * Expand unit size until address space usage goes over 75%
	 * and then as much as possible without using more address
	 * space.
	 */
	last_allocs = INT_MAX;
	for (upa = max_upa; upa; upa--) {
		int allocs = 0, wasted = 0;

		if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
			continue;

		for (group = 0; group < nr_groups; group++) {
			int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
			allocs += this_allocs;
			wasted += this_allocs * upa - group_cnt[group];
		}

		/*
		 * Don't accept if wastage is over 1/3.  The
		 * greater-than comparison ensures upa==1 always
		 * passes the following check.
		 */
		if (wasted > num_possible_cpus() / 3)
			continue;

		/* and then don't consume more memory */
		if (allocs > last_allocs)
			break;
		last_allocs = allocs;
		best_upa = upa;
	}
	upa = best_upa;

	/* allocate and fill alloc_info */
	for (group = 0; group < nr_groups; group++)
		nr_units += roundup(group_cnt[group], upa);

	ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
	if (!ai)
		return ERR_PTR(-ENOMEM);
	cpu_map = ai->groups[0].cpu_map;

	for (group = 0; group < nr_groups; group++) {
		ai->groups[group].cpu_map = cpu_map;
		cpu_map += roundup(group_cnt[group], upa);
	}

	ai->static_size = static_size;
	ai->reserved_size = reserved_size;
	ai->dyn_size = dyn_size;
	ai->unit_size = alloc_size / upa;
	ai->atom_size = atom_size;
	ai->alloc_size = alloc_size;

	for (group = 0, unit = 0; group_cnt[group]; group++) {
		struct pcpu_group_info *gi = &ai->groups[group];

		/*
		 * Initialize base_offset as if all groups are located
		 * back to back.  The caller should update this to
		 * reflect actual allocation.
		 */
		gi->base_offset = unit * ai->unit_size;

		for_each_possible_cpu(cpu)
			if (group_map[cpu] == group)
				gi->cpu_map[gi->nr_units++] = cpu;
		gi->nr_units = roundup(gi->nr_units, upa);
		unit += gi->nr_units;
	}
	BUG_ON(unit != nr_units);

	return ai;
}
#endif	/* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */

#if defined(BUILD_EMBED_FIRST_CHUNK)
/**
 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
 * @reserved_size: the size of reserved percpu area in bytes
 * @dyn_size: minimum free size for dynamic allocation in bytes
 * @atom_size: allocation atom size
 * @cpu_distance_fn: callback to determine distance between cpus, optional
 * @alloc_fn: function to allocate percpu page
 * @free_fn: function to free percpu page
 *
 * This is a helper to ease setting up embedded first percpu chunk and
 * can be called where pcpu_setup_first_chunk() is expected.
 *
 * If this function is used to setup the first chunk, it is allocated
 * by calling @alloc_fn and used as-is without being mapped into
 * vmalloc area.  Allocations are always whole multiples of @atom_size
 * aligned to @atom_size.
 *
 * This enables the first chunk to piggy back on the linear physical
 * mapping which often uses larger page size.  Please note that this
 * can result in very sparse cpu->unit mapping on NUMA machines thus
 * requiring large vmalloc address space.  Don't use this allocator if
 * vmalloc space is not orders of magnitude larger than distances
 * between node memory addresses (ie. 32bit NUMA machines).
 *
 * @dyn_size specifies the minimum dynamic area size.
 *
 * If the needed size is smaller than the minimum or specified unit
 * size, the leftover is returned using @free_fn.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
				  size_t atom_size,
				  pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
				  pcpu_fc_alloc_fn_t alloc_fn,
				  pcpu_fc_free_fn_t free_fn)
{
	void *base = (void *)ULONG_MAX;
	void **areas = NULL;
	struct pcpu_alloc_info *ai;
	size_t size_sum, areas_size, max_distance;
	int group, i, rc;

	ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
				   cpu_distance_fn);
	if (IS_ERR(ai))
		return PTR_ERR(ai);

	size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
	areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));

	areas = memblock_virt_alloc_nopanic(areas_size, 0);
	if (!areas) {
		rc = -ENOMEM;
		goto out_free;
	}

	/* allocate, copy and determine base address */
	for (group = 0; group < ai->nr_groups; group++) {
		struct pcpu_group_info *gi = &ai->groups[group];
		unsigned int cpu = NR_CPUS;
		void *ptr;

		for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
			cpu = gi->cpu_map[i];
		BUG_ON(cpu == NR_CPUS);

		/* allocate space for the whole group */
		ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
		if (!ptr) {
			rc = -ENOMEM;
			goto out_free_areas;
		}
		/* kmemleak tracks the percpu allocations separately */
		kmemleak_free(ptr);
		areas[group] = ptr;

		base = min(ptr, base);
	}

	/*
	 * Copy data and free unused parts.  This should happen after all
	 * allocations are complete; otherwise, we may end up with
	 * overlapping groups.
	 */
	for (group = 0; group < ai->nr_groups; group++) {
		struct pcpu_group_info *gi = &ai->groups[group];
		void *ptr = areas[group];

		for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
			if (gi->cpu_map[i] == NR_CPUS) {
				/* unused unit, free whole */
				free_fn(ptr, ai->unit_size);
				continue;
			}
			/* copy and return the unused part */
			memcpy(ptr, __per_cpu_load, ai->static_size);
			free_fn(ptr + size_sum, ai->unit_size - size_sum);
		}
	}

	/* base address is now known, determine group base offsets */
	max_distance = 0;
	for (group = 0; group < ai->nr_groups; group++) {
		ai->groups[group].base_offset = areas[group] - base;
		max_distance = max_t(size_t, max_distance,
				     ai->groups[group].base_offset);
	}
	max_distance += ai->unit_size;

	/* warn if maximum distance is further than 75% of vmalloc space */
	if (max_distance > VMALLOC_TOTAL * 3 / 4) {
		pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc "
			   "space 0x%lx\n", max_distance,
			   VMALLOC_TOTAL);
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
		/* and fail if we have fallback */
		rc = -EINVAL;
		goto out_free;
#endif
	}

	pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
		PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
		ai->dyn_size, ai->unit_size);

	rc = pcpu_setup_first_chunk(ai, base);
	goto out_free;

out_free_areas:
	for (group = 0; group < ai->nr_groups; group++)
		if (areas[group])
			free_fn(areas[group],
				ai->groups[group].nr_units * ai->unit_size);
out_free:
	pcpu_free_alloc_info(ai);
	if (areas)
		memblock_free_early(__pa(areas), areas_size);
	return rc;
}
#endif	/* BUILD_EMBED_FIRST_CHUNK */

#ifdef BUILD_PAGE_FIRST_CHUNK
/**
 * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
 * @reserved_size: the size of reserved percpu area in bytes
 * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
 * @free_fn: function to free percpu page, always called with PAGE_SIZE
 * @populate_pte_fn: function to populate pte
 *
 * This is a helper to ease setting up page-remapped first percpu
 * chunk and can be called where pcpu_setup_first_chunk() is expected.
 *
 * This is the basic allocator.  Static percpu area is allocated
 * page-by-page into vmalloc area.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init pcpu_page_first_chunk(size_t reserved_size,
				 pcpu_fc_alloc_fn_t alloc_fn,
				 pcpu_fc_free_fn_t free_fn,
				 pcpu_fc_populate_pte_fn_t populate_pte_fn)
{
	static struct vm_struct vm;
	struct pcpu_alloc_info *ai;
	char psize_str[16];
	int unit_pages;
	size_t pages_size;
	struct page **pages;
	int unit, i, j, rc;

	snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);

	ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL);
	if (IS_ERR(ai))
		return PTR_ERR(ai);
	BUG_ON(ai->nr_groups != 1);
	BUG_ON(ai->groups[0].nr_units != num_possible_cpus());

	unit_pages = ai->unit_size >> PAGE_SHIFT;

	/* unaligned allocations can't be freed, round up to page size */
	pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
			       sizeof(pages[0]));
	pages = memblock_virt_alloc(pages_size, 0);

	/* allocate pages */
	j = 0;
	for (unit = 0; unit < num_possible_cpus(); unit++)
		for (i = 0; i < unit_pages; i++) {
			unsigned int cpu = ai->groups[0].cpu_map[unit];
			void *ptr;

			ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
			if (!ptr) {
				pr_warning("PERCPU: failed to allocate %s page "
					   "for cpu%u\n", psize_str, cpu);
				goto enomem;
			}
			/* kmemleak tracks the percpu allocations separately */
			kmemleak_free(ptr);
			pages[j++] = virt_to_page(ptr);
		}

	/* allocate vm area, map the pages and copy static data */
	vm.flags = VM_ALLOC;
	vm.size = num_possible_cpus() * ai->unit_size;
	vm_area_register_early(&vm, PAGE_SIZE);

	for (unit = 0; unit < num_possible_cpus(); unit++) {
		unsigned long unit_addr =
			(unsigned long)vm.addr + unit * ai->unit_size;

		for (i = 0; i < unit_pages; i++)
			populate_pte_fn(unit_addr + (i << PAGE_SHIFT));

		/* pte already populated, the following shouldn't fail */
		rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
				      unit_pages);
		if (rc < 0)
			panic("failed to map percpu area, err=%d\n", rc);

		/*
		 * FIXME: Archs with virtual cache should flush local
		 * cache for the linear mapping here - something
		 * equivalent to flush_cache_vmap() on the local cpu.
		 * flush_cache_vmap() can't be used as most supporting
		 * data structures are not set up yet.
		 */

		/* copy static data */
		memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
	}

	/* we're ready, commit */
	pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n",
		unit_pages, psize_str, vm.addr, ai->static_size,
		ai->reserved_size, ai->dyn_size);

	rc = pcpu_setup_first_chunk(ai, vm.addr);
	goto out_free_ar;

enomem:
	while (--j >= 0)
		free_fn(page_address(pages[j]), PAGE_SIZE);
	rc = -ENOMEM;
out_free_ar:
	memblock_free_early(__pa(pages), pages_size);
	pcpu_free_alloc_info(ai);
	return rc;
}
#endif	/* BUILD_PAGE_FIRST_CHUNK */

#ifndef	CONFIG_HAVE_SETUP_PER_CPU_AREA
/*
 * Generic SMP percpu area setup.
 *
 * The embedding helper is used because its behavior closely resembles
 * the original non-dynamic generic percpu area setup.  This is
 * important because many archs have addressing restrictions and might
 * fail if the percpu area is located far away from the previous
 * location.  As an added bonus, in non-NUMA cases, embedding is
 * generally a good idea TLB-wise because percpu area can piggy back
 * on the physical linear memory mapping which uses large page
 * mappings on applicable archs.
 */
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);

static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
				       size_t align)
{
	return memblock_virt_alloc_from_nopanic(
			size, align, __pa(MAX_DMA_ADDRESS));
}

static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
{
	memblock_free_early(__pa(ptr), size);
}

void __init setup_per_cpu_areas(void)
{
	unsigned long delta;
	unsigned int cpu;
	int rc;

	/*
	 * Always reserve area for module percpu variables.  That's
	 * what the legacy allocator did.
	 */
	rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
				    PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
				    pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
	if (rc < 0)
		panic("Failed to initialize percpu areas.");

	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
	for_each_possible_cpu(cpu)
		__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
}
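
/*
 * Note (added): after this setup, per_cpu(var, cpu) resolves to
 * &var + __per_cpu_offset[cpu] - delta rebases pointers from the kernel
 * image's static percpu section to the first chunk, and
 * pcpu_unit_offsets[cpu] then selects that cpu's unit within the chunk.
 */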
#endif	/* CONFIG_HAVE_SETUP_PER_CPU_AREA */

#else	/* CONFIG_SMP */

/*
 * UP percpu area setup.
 *
 * UP always uses km-based percpu allocator with identity mapping.
 * Static percpu variables are indistinguishable from the usual static
 * variables and don't require any special preparation.
 */
void __init setup_per_cpu_areas(void)
{
	const size_t unit_size =
		roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
					 PERCPU_DYNAMIC_RESERVE));
	struct pcpu_alloc_info *ai;
	void *fc;

	ai = pcpu_alloc_alloc_info(1, 1);
	fc = memblock_virt_alloc_from_nopanic(unit_size,
					      PAGE_SIZE,
					      __pa(MAX_DMA_ADDRESS));
	if (!ai || !fc)
		panic("Failed to allocate memory for percpu areas.");
	/* kmemleak tracks the percpu allocations separately */
	kmemleak_free(fc);

	ai->dyn_size = unit_size;
	ai->unit_size = unit_size;
	ai->atom_size = unit_size;
	ai->alloc_size = unit_size;
	ai->groups[0].nr_units = 1;
	ai->groups[0].cpu_map[0] = 0;

	if (pcpu_setup_first_chunk(ai, fc) < 0)
		panic("Failed to initialize percpu areas.");
}

#endif	/* CONFIG_SMP */

/*
 * First and reserved chunks are initialized with temporary allocation
 * map in initdata so that they can be used before slab is online.
 * This function is called after slab is brought up and replaces those
 * with properly allocated maps.
 */
void __init percpu_init_late(void)
{
	struct pcpu_chunk *target_chunks[] =
		{ pcpu_first_chunk, pcpu_reserved_chunk, NULL };
	struct pcpu_chunk *chunk;
	unsigned long flags;
	int i;

	for (i = 0; (chunk = target_chunks[i]); i++) {
		int *map;
		const size_t size = PERCPU_DYNAMIC_EARLY_SLOTS * sizeof(map[0]);

		BUILD_BUG_ON(size > PAGE_SIZE);

		map = pcpu_mem_zalloc(size);
		BUG_ON(!map);

		spin_lock_irqsave(&pcpu_lock, flags);
		memcpy(map, chunk->map, size);
		chunk->map = map;
		spin_unlock_irqrestore(&pcpu_lock, flags);
	}
}

/*
 * Percpu allocator is initialized early during boot when neither slab or
 * workqueue is available.  Plug async management until everything is up
 * and running.
 */
static int __init percpu_enable_async(void)
{
	pcpu_async_enabled = true;
	return 0;
}
subsys_initcall(percpu_enable_async);