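/*
 * Per-cpu memory allocator.  Memory is served out of chunks, each of
 * which contains one unit per possible CPU; an allocation returns the
 * same offset in every unit, so a percpu pointer plus a CPU's unit
 * offset yields that CPU's copy of the object.
 */
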
#include <linux/bitmap.h>
#include <linux/bootmem.h>
#include <linux/err.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/pfn.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
#include <linux/kmemleak.h>

#include <asm/cacheflush.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
#include <asm/io.h>

#define PCPU_SLOT_BASE_SHIFT		5
#define PCPU_DFL_MAP_ALLOC		16
#define PCPU_ATOMIC_MAP_MARGIN_LOW	32
#define PCPU_ATOMIC_MAP_MARGIN_HIGH	64
#define PCPU_EMPTY_POP_PAGES_LOW	2
#define PCPU_EMPTY_POP_PAGES_HIGH	4

#ifdef CONFIG_SMP

#ifndef __addr_to_pcpu_ptr
#define __addr_to_pcpu_ptr(addr)					\
	(void __percpu *)((unsigned long)(addr) -			\
			  (unsigned long)pcpu_base_addr +		\
			  (unsigned long)__per_cpu_start)
#endif
#ifndef __pcpu_ptr_to_addr
#define __pcpu_ptr_to_addr(ptr)						\
	(void __force *)((unsigned long)(ptr) +				\
			 (unsigned long)pcpu_base_addr -		\
			 (unsigned long)__per_cpu_start)
#endif
#else

#define __addr_to_pcpu_ptr(addr)	(void __percpu *)(addr)
#define __pcpu_ptr_to_addr(ptr)		(void __force *)(ptr)
#endif
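
/*
 * pcpu_chunk tracks one chunk: its position on a size-class slot list,
 * the bookkeeping for the area map (map/map_used/map_alloc), the hint
 * of the largest contiguous free area, and a bitmap of which pages of
 * the unit are populated (backed by actual pages).
 */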
struct pcpu_chunk {
	struct list_head list;
	int free_size;
	int contig_hint;
	void *base_addr;

	int map_used;
	int map_alloc;
	int *map;
	struct work_struct map_extend_work;

	void *data;
	int first_free;
	bool immutable;
	int nr_populated;
	unsigned long populated[];
};

static int pcpu_unit_pages __read_mostly;
static int pcpu_unit_size __read_mostly;
static int pcpu_nr_units __read_mostly;
static int pcpu_atom_size __read_mostly;
static int pcpu_nr_slots __read_mostly;
static size_t pcpu_chunk_struct_size __read_mostly;

static unsigned int pcpu_low_unit_cpu __read_mostly;
static unsigned int pcpu_high_unit_cpu __read_mostly;

void *pcpu_base_addr __read_mostly;
EXPORT_SYMBOL_GPL(pcpu_base_addr);

static const int *pcpu_unit_map __read_mostly;
const unsigned long *pcpu_unit_offsets __read_mostly;

static int pcpu_nr_groups __read_mostly;
static const unsigned long *pcpu_group_offsets __read_mostly;
static const size_t *pcpu_group_sizes __read_mostly;

static struct pcpu_chunk *pcpu_first_chunk;

static struct pcpu_chunk *pcpu_reserved_chunk;
static int pcpu_reserved_chunk_limit;

static DEFINE_SPINLOCK(pcpu_lock);	/* protects index data structures */
static DEFINE_MUTEX(pcpu_alloc_mutex);	/* protects chunk creation and [de]population */

static struct list_head *pcpu_slot __read_mostly; /* chunk lists indexed by free size */

static int pcpu_nr_empty_pop_pages;	/* # of empty populated pages, under pcpu_lock */

static void pcpu_balance_workfn(struct work_struct *work);
static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
static bool pcpu_async_enabled __read_mostly;
static bool pcpu_atomic_alloc_failed;

static void pcpu_schedule_balance_work(void)
{
	if (pcpu_async_enabled)
		schedule_work(&pcpu_balance_work);
}

static bool pcpu_addr_in_first_chunk(void *addr)
{
	void *first_start = pcpu_first_chunk->base_addr;

	return addr >= first_start && addr < first_start + pcpu_unit_size;
}

static bool pcpu_addr_in_reserved_chunk(void *addr)
{
	void *first_start = pcpu_first_chunk->base_addr;

	return addr >= first_start &&
		addr < first_start + pcpu_reserved_chunk_limit;
}

static int __pcpu_size_to_slot(int size)
{
	int highbit = fls(size);
	return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
}

static int pcpu_size_to_slot(int size)
{
	if (size == pcpu_unit_size)
		return pcpu_nr_slots - 1;
	return __pcpu_size_to_slot(size);
}

static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
{
	if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int))
		return 0;

	return pcpu_size_to_slot(chunk->free_size);
}

static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
{
	page->index = (unsigned long)pcpu;
}

static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
{
	return (struct pcpu_chunk *)page->index;
}

static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
{
	return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
}

static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
				     unsigned int cpu, int page_idx)
{
	return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] +
		(page_idx << PAGE_SHIFT);
}

static void __maybe_unused pcpu_next_unpop(struct pcpu_chunk *chunk,
					   int *rs, int *re, int end)
{
	*rs = find_next_zero_bit(chunk->populated, end, *rs);
	*re = find_next_bit(chunk->populated, end, *rs + 1);
}

static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk,
					 int *rs, int *re, int end)
{
	*rs = find_next_bit(chunk->populated, end, *rs);
	*re = find_next_zero_bit(chunk->populated, end, *rs + 1);
}

#define pcpu_for_each_unpop_region(chunk, rs, re, start, end)		    \
	for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \
	     (rs) < (re);						    \
	     (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end)))

#define pcpu_for_each_pop_region(chunk, rs, re, start, end)		    \
	for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end));   \
	     (rs) < (re);						    \
	     (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))
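
/*
 * pcpu_mem_zalloc - allocate cleared memory for chunk metadata.
 * Uses kzalloc() for sizes up to a page and vzalloc() otherwise;
 * returns NULL if called before the slab allocator is up.
 */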
static void *pcpu_mem_zalloc(size_t size)
{
	if (WARN_ON_ONCE(!slab_is_available()))
		return NULL;

	if (size <= PAGE_SIZE)
		return kzalloc(size, GFP_KERNEL);
	else
		return vzalloc(size);
}

static void pcpu_mem_free(void *ptr)
{
	kvfree(ptr);
}

/*
 * pcpu_count_occupied_pages - count the number of pages an area in the
 * area map occupies exclusively, extending the range to page boundaries
 * when the neighbouring areas are free and cover the partial pages.
 */
static int pcpu_count_occupied_pages(struct pcpu_chunk *chunk, int i)
{
	int off = chunk->map[i] & ~1;
	int end = chunk->map[i + 1] & ~1;

	if (!PAGE_ALIGNED(off) && i > 0) {
		int prev = chunk->map[i - 1];

		if (!(prev & 1) && prev <= round_down(off, PAGE_SIZE))
			off = round_down(off, PAGE_SIZE);
	}

	if (!PAGE_ALIGNED(end) && i + 1 < chunk->map_used) {
		int next = chunk->map[i + 1];
		int nend = chunk->map[i + 2] & ~1;

		if (!(next & 1) && nend >= round_up(end, PAGE_SIZE))
			end = round_up(end, PAGE_SIZE);
	}

	return max_t(int, PFN_DOWN(end) - PFN_UP(off), 0);
}
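
/*
 * pcpu_chunk_relocate - move a chunk to the slot matching its current
 * free size.  Chunks that grew are put at the head of the new slot,
 * chunks that shrank at the tail.  The reserved chunk never moves.
 */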
static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
{
	int nslot = pcpu_chunk_slot(chunk);

	if (chunk != pcpu_reserved_chunk && oslot != nslot) {
		if (oslot < nslot)
			list_move(&chunk->list, &pcpu_slot[nslot]);
		else
			list_move_tail(&chunk->list, &pcpu_slot[nslot]);
	}
}
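
/*
 * pcpu_need_to_extend - determine whether the chunk's area map has to
 * grow to keep a safety margin of free entries.  Returns the new map
 * size to allocate, or 0 if the current map is large enough.  Atomic
 * allocations only require a small margin and schedule asynchronous
 * extension instead.  Called under pcpu_lock.
 */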
static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic)
{
	int margin, new_alloc;

	if (is_atomic) {
		margin = 3;

		if (chunk->map_alloc <
		    chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW &&
		    pcpu_async_enabled)
			schedule_work(&chunk->map_extend_work);
	} else {
		margin = PCPU_ATOMIC_MAP_MARGIN_HIGH;
	}

	if (chunk->map_alloc >= chunk->map_used + margin)
		return 0;

	new_alloc = PCPU_DFL_MAP_ALLOC;
	while (new_alloc < chunk->map_used + margin)
		new_alloc *= 2;

	return new_alloc;
}
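
/*
 * pcpu_extend_area_map - allocate a larger area map outside of
 * pcpu_lock, then switch the chunk over to it under the lock.  The
 * previous (or unused new) map is freed after the lock is dropped.
 * Returns 0 on success, -ENOMEM on allocation failure.
 */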
static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
{
	int *old = NULL, *new = NULL;
	size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
	unsigned long flags;

	new = pcpu_mem_zalloc(new_size);
	if (!new)
		return -ENOMEM;

	/* acquire pcpu_lock and switch to the new area map */
	spin_lock_irqsave(&pcpu_lock, flags);

	if (new_alloc <= chunk->map_alloc)
		goto out_unlock;

	old_size = chunk->map_alloc * sizeof(chunk->map[0]);
	old = chunk->map;

	memcpy(new, old, old_size);

	chunk->map_alloc = new_alloc;
	chunk->map = new;
	new = NULL;

out_unlock:
	spin_unlock_irqrestore(&pcpu_lock, flags);

	/*
	 * pcpu_mem_free() may end up in vfree() which can't be called
	 * with pcpu_lock held, so free outside of the lock.
	 */
	pcpu_mem_free(old);
	pcpu_mem_free(new);

	return 0;
}

static void pcpu_map_extend_workfn(struct work_struct *work)
{
	struct pcpu_chunk *chunk = container_of(work, struct pcpu_chunk,
						map_extend_work);
	int new_alloc;

	spin_lock_irq(&pcpu_lock);
	new_alloc = pcpu_need_to_extend(chunk, false);
	spin_unlock_irq(&pcpu_lock);

	if (new_alloc)
		pcpu_extend_area_map(chunk, new_alloc);
}
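
/*
 * pcpu_fit_in_area - try to fit an allocation of @size bytes aligned to
 * @align into the free area starting at @off.  Returns the number of
 * padding bytes needed at the head, or -1 if the area can't fit the
 * allocation.  If @pop_only is set, only candidates that are fully
 * backed by populated pages are accepted.
 */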
static int pcpu_fit_in_area(struct pcpu_chunk *chunk, int off, int this_size,
			    int size, int align, bool pop_only)
{
	int cand_off = off;

	while (true) {
		int head = ALIGN(cand_off, align) - off;
		int page_start, page_end, rs, re;

		if (this_size < head + size)
			return -1;

		if (!pop_only)
			return head;

		/*
		 * Check whether the candidate area is fully populated;
		 * if not, retry from the end of the first unpopulated
		 * region.
		 */
		page_start = PFN_DOWN(head + off);
		page_end = PFN_UP(head + off + size);

		rs = page_start;
		pcpu_next_unpop(chunk, &rs, &re, PFN_UP(off + this_size));
		if (rs >= page_end)
			return head;
		cand_off = re * PAGE_SIZE;
	}
}
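
/*
 * pcpu_alloc_area - allocate an area from the chunk's area map.  Scans
 * free areas starting at first_free, splits the chosen area as needed,
 * marks it in use (low bit of the offset), updates free_size and
 * contig_hint, and reports the number of newly occupied pages through
 * @occ_pages_p.  Returns the allocated offset or -1 if nothing fits.
 * Called under pcpu_lock.
 */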
static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align,
			   bool pop_only, int *occ_pages_p)
{
	int oslot = pcpu_chunk_slot(chunk);
	int max_contig = 0;
	int i, off;
	bool seen_free = false;
	int *p;

	for (i = chunk->first_free, p = chunk->map + i; i < chunk->map_used; i++, p++) {
		int head, tail;
		int this_size;

		off = *p;
		if (off & 1)
			continue;

		this_size = (p[1] & ~1) - off;

		head = pcpu_fit_in_area(chunk, off, this_size, size, align,
					pop_only);
		if (head < 0) {
			if (!seen_free) {
				chunk->first_free = i;
				seen_free = true;
			}
			max_contig = max(this_size, max_contig);
			continue;
		}

		/*
		 * If head is small or the previous block is free,
		 * merge it into the previous block instead of splitting.
		 */
		if (head && (head < sizeof(int) || !(p[-1] & 1))) {
			*p = off += head;
			if (p[-1] & 1)
				chunk->free_size -= head;
			else
				max_contig = max(*p - p[-1], max_contig);
			this_size -= head;
			head = 0;
		}

		/* if tail is small, just keep it around */
		tail = this_size - head - size;
		if (tail < sizeof(int)) {
			tail = 0;
			size = this_size - head;
		}

		/* split if warranted */
		if (head || tail) {
			int nr_extra = !!head + !!tail;

			/* insert new subblocks */
			memmove(p + nr_extra + 1, p + 1,
				sizeof(chunk->map[0]) * (chunk->map_used - i));
			chunk->map_used += nr_extra;

			if (head) {
				if (!seen_free) {
					chunk->first_free = i;
					seen_free = true;
				}
				*++p = off += head;
				++i;
				max_contig = max(head, max_contig);
			}
			if (tail) {
				p[1] = off + size;
				max_contig = max(tail, max_contig);
			}
		}

		if (!seen_free)
			chunk->first_free = i + 1;

		/* update hint and mark allocated */
		if (i + 1 == chunk->map_used)
			chunk->contig_hint = max_contig;
		else
			chunk->contig_hint = max(chunk->contig_hint,
						 max_contig);

		chunk->free_size -= size;
		*p |= 1;

		*occ_pages_p = pcpu_count_occupied_pages(chunk, i);
		pcpu_chunk_relocate(chunk, oslot);
		return off;
	}

	chunk->contig_hint = max_contig;
	pcpu_chunk_relocate(chunk, oslot);

	/* tell the caller that this chunk has no matching area */
	return -1;
}
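
/*
 * pcpu_free_area - free the area at offset @freeme.  Locates the map
 * entry by binary search, clears the in-use bit, merges with adjacent
 * free areas, and reports the number of pages that became free through
 * @occ_pages_p.  Called under pcpu_lock.
 */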
static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme,
			   int *occ_pages_p)
{
	int oslot = pcpu_chunk_slot(chunk);
	int off = 0;
	unsigned i, j;
	int to_free = 0;
	int *p;

	freeme |= 1;	/* we are searching for <given offset, in use> pair */

	i = 0;
	j = chunk->map_used;
	while (i != j) {
		unsigned k = (i + j) / 2;
		off = chunk->map[k];
		if (off < freeme)
			i = k + 1;
		else if (off > freeme)
			j = k;
		else
			i = j = k;
	}
	BUG_ON(off != freeme);

	if (i < chunk->first_free)
		chunk->first_free = i;

	p = chunk->map + i;
	*p = off &= ~1;
	chunk->free_size += (p[1] & ~1) - off;

	*occ_pages_p = pcpu_count_occupied_pages(chunk, i);

	/* merge with next and previous free areas if possible */
	if (!(p[1] & 1))
		to_free++;

	if (i > 0 && !(p[-1] & 1)) {
		to_free++;
		i--;
		p--;
	}
	if (to_free) {
		chunk->map_used -= to_free;
		memmove(p + 1, p + 1 + to_free,
			(chunk->map_used - i) * sizeof(chunk->map[0]));
	}

	chunk->contig_hint = max(chunk->map[i + 1] - chunk->map[i] - 1, chunk->contig_hint);
	pcpu_chunk_relocate(chunk, oslot);
}

static struct pcpu_chunk *pcpu_alloc_chunk(void)
{
	struct pcpu_chunk *chunk;

	chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size);
	if (!chunk)
		return NULL;

	chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC *
						sizeof(chunk->map[0]));
	if (!chunk->map) {
		pcpu_mem_free(chunk);
		return NULL;
	}

	chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
	chunk->map[0] = 0;
	chunk->map[1] = pcpu_unit_size | 1;
	chunk->map_used = 1;

	INIT_LIST_HEAD(&chunk->list);
	INIT_WORK(&chunk->map_extend_work, pcpu_map_extend_workfn);
	chunk->free_size = pcpu_unit_size;
	chunk->contig_hint = pcpu_unit_size;

	return chunk;
}

static void pcpu_free_chunk(struct pcpu_chunk *chunk)
{
	if (!chunk)
		return;
	pcpu_mem_free(chunk->map);
	pcpu_mem_free(chunk);
}

static void pcpu_chunk_populated(struct pcpu_chunk *chunk,
				 int page_start, int page_end)
{
	int nr = page_end - page_start;

	lockdep_assert_held(&pcpu_lock);

	bitmap_set(chunk->populated, page_start, nr);
	chunk->nr_populated += nr;
	pcpu_nr_empty_pop_pages += nr;
}

static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
				   int page_start, int page_end)
{
	int nr = page_end - page_start;

	lockdep_assert_held(&pcpu_lock);

	bitmap_clear(chunk->populated, page_start, nr);
	chunk->nr_populated -= nr;
	pcpu_nr_empty_pop_pages -= nr;
}

/* chunk [de]population and address translation, provided by the backend below */
static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size);
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size);
static struct pcpu_chunk *pcpu_create_chunk(void);
static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
static struct page *pcpu_addr_to_page(void *addr);
static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);

#ifdef CONFIG_NEED_PER_CPU_KM
#include "percpu-km.c"
#else
#include "percpu-vm.c"
#endif

static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
{
	/* is it in the first chunk? */
	if (pcpu_addr_in_first_chunk(addr)) {
		/* is it in the reserved area? */
		if (pcpu_addr_in_reserved_chunk(addr))
			return pcpu_reserved_chunk;
		return pcpu_first_chunk;
	}

	/*
	 * The address is relative to unit0 which might be unused and
	 * thus unmapped.  Offset the address to the unit space of the
	 * current processor before looking it up in the vmalloc space.
	 */
	addr += pcpu_unit_offsets[raw_smp_processor_id()];
	return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
}
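
/*
 * pcpu_alloc - the allocator proper.  @reserved selects the reserved
 * chunk; a @gfp without GFP_KERNEL makes the allocation atomic, in
 * which case only already populated areas are used and no new chunks
 * are created.  Returns a percpu pointer on success, NULL on failure.
 */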
static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
				 gfp_t gfp)
{
	static int warn_limit = 10;
	struct pcpu_chunk *chunk;
	const char *err;
	bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
	int occ_pages = 0;
	int slot, off, new_alloc, cpu, ret;
	unsigned long flags;
	void __percpu *ptr;

	/*
	 * The lowest bit of the offset serves as the in-use/free
	 * indicator, so force at least 2 byte alignment and an even size.
	 */
	if (unlikely(align < 2))
		align = 2;

	size = ALIGN(size, 2);

	if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
		WARN(true, "illegal size (%zu) or align (%zu) for "
		     "percpu allocation\n", size, align);
		return NULL;
	}

	spin_lock_irqsave(&pcpu_lock, flags);

	/* serve reserved allocations from the reserved chunk if available */
	if (reserved && pcpu_reserved_chunk) {
		chunk = pcpu_reserved_chunk;

		if (size > chunk->contig_hint) {
			err = "alloc from reserved chunk failed";
			goto fail_unlock;
		}

		while ((new_alloc = pcpu_need_to_extend(chunk, is_atomic))) {
			spin_unlock_irqrestore(&pcpu_lock, flags);
			if (is_atomic ||
			    pcpu_extend_area_map(chunk, new_alloc) < 0) {
				err = "failed to extend area map of reserved chunk";
				goto fail;
			}
			spin_lock_irqsave(&pcpu_lock, flags);
		}

		off = pcpu_alloc_area(chunk, size, align, is_atomic,
				      &occ_pages);
		if (off >= 0)
			goto area_found;

		err = "alloc from reserved chunk failed";
		goto fail_unlock;
	}

restart:
	/* search through normal chunks */
	for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
			if (size > chunk->contig_hint)
				continue;

			new_alloc = pcpu_need_to_extend(chunk, is_atomic);
			if (new_alloc) {
				if (is_atomic)
					continue;
				spin_unlock_irqrestore(&pcpu_lock, flags);
				if (pcpu_extend_area_map(chunk,
							 new_alloc) < 0) {
					err = "failed to extend area map";
					goto fail;
				}
				spin_lock_irqsave(&pcpu_lock, flags);
				/*
				 * pcpu_lock has been dropped, need to
				 * restart the slot list walk.
				 */
				goto restart;
			}

			off = pcpu_alloc_area(chunk, size, align, is_atomic,
					      &occ_pages);
			if (off >= 0)
				goto area_found;
		}
	}

	spin_unlock_irqrestore(&pcpu_lock, flags);

	/*
	 * No space left.  Create a new chunk unless this is an atomic
	 * allocation; serialize chunk creation with pcpu_alloc_mutex so
	 * only one empty chunk is created at a time.
	 */
	if (is_atomic)
		goto fail;

	mutex_lock(&pcpu_alloc_mutex);

	if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
		chunk = pcpu_create_chunk();
		if (!chunk) {
			mutex_unlock(&pcpu_alloc_mutex);
			err = "failed to allocate new chunk";
			goto fail;
		}

		spin_lock_irqsave(&pcpu_lock, flags);
		pcpu_chunk_relocate(chunk, -1);
	} else {
		spin_lock_irqsave(&pcpu_lock, flags);
	}

	mutex_unlock(&pcpu_alloc_mutex);
	goto restart;

area_found:
	spin_unlock_irqrestore(&pcpu_lock, flags);

	/* populate if not all pages are already there */
	if (!is_atomic) {
		int page_start, page_end, rs, re;

		mutex_lock(&pcpu_alloc_mutex);

		page_start = PFN_DOWN(off);
		page_end = PFN_UP(off + size);

		pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
			WARN_ON(chunk->immutable);

			ret = pcpu_populate_chunk(chunk, rs, re);

			spin_lock_irqsave(&pcpu_lock, flags);
			if (ret) {
				mutex_unlock(&pcpu_alloc_mutex);
				pcpu_free_area(chunk, off, &occ_pages);
				err = "failed to populate";
				goto fail_unlock;
			}
			pcpu_chunk_populated(chunk, rs, re);
			spin_unlock_irqrestore(&pcpu_lock, flags);
		}

		mutex_unlock(&pcpu_alloc_mutex);
	}

	if (chunk != pcpu_reserved_chunk)
		pcpu_nr_empty_pop_pages -= occ_pages;

	if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
		pcpu_schedule_balance_work();

	/* clear the areas and return address relative to base address */
	for_each_possible_cpu(cpu)
		memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);

	ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
	kmemleak_alloc_percpu(ptr, size, gfp);
	return ptr;

fail_unlock:
	spin_unlock_irqrestore(&pcpu_lock, flags);
fail:
	if (!is_atomic && warn_limit) {
		pr_warning("PERCPU: allocation failed, size=%zu align=%zu atomic=%d, %s\n",
			   size, align, is_atomic, err);
		dump_stack();
		if (!--warn_limit)
			pr_info("PERCPU: limit reached, disable warning\n");
	}
	if (is_atomic) {
		/* see the flag handling in pcpu_balance_workfn() */
		pcpu_atomic_alloc_failed = true;
		pcpu_schedule_balance_work();
	}
	return NULL;
}

/*
 * __alloc_percpu_gfp - allocate dynamic percpu area.  Allocations
 * without GFP_KERNEL in @gfp are served atomically from already
 * populated pages.
 */
void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp)
{
	return pcpu_alloc(size, align, false, gfp);
}
EXPORT_SYMBOL_GPL(__alloc_percpu_gfp);

/* __alloc_percpu - equivalent to __alloc_percpu_gfp(size, align, GFP_KERNEL) */
void __percpu *__alloc_percpu(size_t size, size_t align)
{
	return pcpu_alloc(size, align, false, GFP_KERNEL);
}
EXPORT_SYMBOL_GPL(__alloc_percpu);

/*
 * __alloc_reserved_percpu - allocate from the reserved percpu area set
 * aside at first-chunk setup, typically for module static percpu
 * variables.
 */
void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
{
	return pcpu_alloc(size, align, true, GFP_KERNEL);
}
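
/*
 * pcpu_balance_workfn - balance work.  Frees all fully free chunks but
 * one, then populates enough empty pages to keep a reserve available
 * for atomic allocations, creating a new chunk if necessary.
 */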
static void pcpu_balance_workfn(struct work_struct *work)
{
	LIST_HEAD(to_free);
	struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
	struct pcpu_chunk *chunk, *next;
	int slot, nr_to_pop, ret;

	/*
	 * There's no reason to keep around multiple unused chunks and VM
	 * areas can be scarce.  Destroy all free chunks except for one.
	 */
	mutex_lock(&pcpu_alloc_mutex);
	spin_lock_irq(&pcpu_lock);

	list_for_each_entry_safe(chunk, next, free_head, list) {
		WARN_ON(chunk->immutable);

		/* spare the first one */
		if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
			continue;

		list_move(&chunk->list, &to_free);
	}

	spin_unlock_irq(&pcpu_lock);

	list_for_each_entry_safe(chunk, next, &to_free, list) {
		int rs, re;

		pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) {
			pcpu_depopulate_chunk(chunk, rs, re);
			spin_lock_irq(&pcpu_lock);
			pcpu_chunk_depopulated(chunk, rs, re);
			spin_unlock_irq(&pcpu_lock);
		}
		pcpu_destroy_chunk(chunk);
	}

	/*
	 * Keep a certain number of empty populated pages around for
	 * atomic allocations.  If an atomic allocation failed earlier,
	 * populate the maximum amount.
	 */
retry_pop:
	if (pcpu_atomic_alloc_failed) {
		nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
		/* best effort anyway, don't worry about synchronization */
		pcpu_atomic_alloc_failed = false;
	} else {
		nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
				  pcpu_nr_empty_pop_pages,
				  0, PCPU_EMPTY_POP_PAGES_HIGH);
	}

	for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
		int nr_unpop = 0, rs, re;

		if (!nr_to_pop)
			break;

		spin_lock_irq(&pcpu_lock);
		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
			nr_unpop = pcpu_unit_pages - chunk->nr_populated;
			if (nr_unpop)
				break;
		}
		spin_unlock_irq(&pcpu_lock);

		if (!nr_unpop)
			continue;

		/* @chunk can't go away while pcpu_alloc_mutex is held */
		pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) {
			int nr = min(re - rs, nr_to_pop);

			ret = pcpu_populate_chunk(chunk, rs, rs + nr);
			if (!ret) {
				nr_to_pop -= nr;
				spin_lock_irq(&pcpu_lock);
				pcpu_chunk_populated(chunk, rs, rs + nr);
				spin_unlock_irq(&pcpu_lock);
			} else {
				nr_to_pop = 0;
			}

			if (!nr_to_pop)
				break;
		}
	}

	if (nr_to_pop) {
		/* ran out of chunks to populate, create a new one and retry */
		chunk = pcpu_create_chunk();
		if (chunk) {
			spin_lock_irq(&pcpu_lock);
			pcpu_chunk_relocate(chunk, -1);
			spin_unlock_irq(&pcpu_lock);
			goto retry_pop;
		}
	}

	mutex_unlock(&pcpu_alloc_mutex);
}
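
/*
 * free_percpu - free percpu area @ptr previously returned by
 * __alloc_percpu() or friends.  May be called from atomic context;
 * schedules the balance work if more than one chunk becomes fully free.
 */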
void free_percpu(void __percpu *ptr)
{
	void *addr;
	struct pcpu_chunk *chunk;
	unsigned long flags;
	int off, occ_pages;

	if (!ptr)
		return;

	kmemleak_free_percpu(ptr);

	addr = __pcpu_ptr_to_addr(ptr);

	spin_lock_irqsave(&pcpu_lock, flags);

	chunk = pcpu_chunk_addr_search(addr);
	off = addr - chunk->base_addr;

	pcpu_free_area(chunk, off, &occ_pages);

	if (chunk != pcpu_reserved_chunk)
		pcpu_nr_empty_pop_pages += occ_pages;

	/* if there is another fully free chunk, let the balance work free one */
	if (chunk->free_size == pcpu_unit_size) {
		struct pcpu_chunk *pos;

		list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
			if (pos != chunk) {
				pcpu_schedule_balance_work();
				break;
			}
	}

	spin_unlock_irqrestore(&pcpu_lock, flags);
}
EXPORT_SYMBOL_GPL(free_percpu);
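
/*
 * is_kernel_percpu_address - test whether @addr falls inside a static
 * percpu variable of the kernel image for any possible CPU.  Module
 * percpu variables and dynamic allocations are not covered.
 */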
bool is_kernel_percpu_address(unsigned long addr)
{
#ifdef CONFIG_SMP
	const size_t static_size = __per_cpu_end - __per_cpu_start;
	void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		void *start = per_cpu_ptr(base, cpu);

		if ((void *)addr >= start && (void *)addr < start + static_size)
			return true;
	}
#endif
	/* on UP, can't distinguish from other static vars, always false */
	return false;
}
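
/*
 * per_cpu_ptr_to_phys - convert a translated percpu address into the
 * physical address it maps to.  Addresses in the first chunk may sit
 * in the linear mapping or in vmalloc space depending on how the first
 * chunk was set up; other chunks are always looked up page by page.
 */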
phys_addr_t per_cpu_ptr_to_phys(void *addr)
{
	void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
	bool in_first_chunk = false;
	unsigned long first_low, first_high;
	unsigned int cpu;

	/*
	 * The following test on unit_low/high isn't strictly necessary
	 * but will speed up lookups of addresses which aren't in the
	 * first chunk.
	 */
	first_low = pcpu_chunk_addr(pcpu_first_chunk, pcpu_low_unit_cpu, 0);
	first_high = pcpu_chunk_addr(pcpu_first_chunk, pcpu_high_unit_cpu,
				     pcpu_unit_pages);
	if ((unsigned long)addr >= first_low &&
	    (unsigned long)addr < first_high) {
		for_each_possible_cpu(cpu) {
			void *start = per_cpu_ptr(base, cpu);

			if (addr >= start && addr < start + pcpu_unit_size) {
				in_first_chunk = true;
				break;
			}
		}
	}

	if (in_first_chunk) {
		if (!is_vmalloc_addr(addr))
			return __pa(addr);
		else
			return page_to_phys(vmalloc_to_page(addr)) +
			       offset_in_page(addr);
	} else
		return page_to_phys(pcpu_addr_to_page(addr)) +
		       offset_in_page(addr);
}
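
/*
 * pcpu_alloc_alloc_info - allocate a pcpu_alloc_info structure sized
 * for @nr_groups groups and @nr_units units from memblock.  The cpu
 * map of group 0 points at the embedded array and every entry is
 * initialized to NR_CPUS.  Returns NULL on allocation failure.
 */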
struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
						      int nr_units)
{
	struct pcpu_alloc_info *ai;
	size_t base_size, ai_size;
	void *ptr;
	int unit;

	base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]),
			  __alignof__(ai->groups[0].cpu_map[0]));
	ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);

	ptr = memblock_virt_alloc_nopanic(PFN_ALIGN(ai_size), 0);
	if (!ptr)
		return NULL;
	ai = ptr;
	ptr += base_size;

	ai->groups[0].cpu_map = ptr;

	for (unit = 0; unit < nr_units; unit++)
		ai->groups[0].cpu_map[unit] = NR_CPUS;

	ai->nr_groups = nr_groups;
	ai->__ai_size = PFN_ALIGN(ai_size);

	return ai;
}

void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
{
	memblock_free_early(__pa(ai), ai->__ai_size);
}

static void pcpu_dump_alloc_info(const char *lvl,
				 const struct pcpu_alloc_info *ai)
{
	int group_width = 1, cpu_width = 1, width;
	char empty_str[] = "--------";
	int alloc = 0, alloc_end = 0;
	int group, v;
	int upa, apl;	/* units per alloc, allocs per line */

	v = ai->nr_groups;
	while (v /= 10)
		group_width++;

	v = num_possible_cpus();
	while (v /= 10)
		cpu_width++;
	empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';

	upa = ai->alloc_size / ai->unit_size;
	width = upa * (cpu_width + 1) + group_width + 3;
	apl = rounddown_pow_of_two(max(60 / width, 1));

	printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
	       lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
	       ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);

	for (group = 0; group < ai->nr_groups; group++) {
		const struct pcpu_group_info *gi = &ai->groups[group];
		int unit = 0, unit_end = 0;

		BUG_ON(gi->nr_units % upa);
		for (alloc_end += gi->nr_units / upa;
		     alloc < alloc_end; alloc++) {
			if (!(alloc % apl)) {
				printk(KERN_CONT "\n");
				printk("%spcpu-alloc: ", lvl);
			}
			printk(KERN_CONT "[%0*d] ", group_width, group);

			for (unit_end += upa; unit < unit_end; unit++)
				if (gi->cpu_map[unit] != NR_CPUS)
					printk(KERN_CONT "%0*d ", cpu_width,
					       gi->cpu_map[unit]);
				else
					printk(KERN_CONT "%s ", empty_str);
		}
	}
	printk(KERN_CONT "\n");
}
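
/*
 * pcpu_setup_first_chunk - initialize the allocator from @ai and the
 * already laid out first chunk at @base_addr.  Builds the unit and
 * group tables, the chunk slots, the static chunk and, if a reserved
 * region is configured, a separate dynamic chunk.  Always returns 0;
 * inconsistent input triggers BUG().
 */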
int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
				  void *base_addr)
{
	static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
	static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
	size_t dyn_size = ai->dyn_size;
	size_t size_sum = ai->static_size + ai->reserved_size + dyn_size;
	struct pcpu_chunk *schunk, *dchunk = NULL;
	unsigned long *group_offsets;
	size_t *group_sizes;
	unsigned long *unit_off;
	unsigned int cpu;
	int *unit_map;
	int group, unit, i;

#define PCPU_SETUP_BUG_ON(cond)	do {					\
	if (unlikely(cond)) {						\
		pr_emerg("PERCPU: failed to initialize, %s", #cond);	\
		pr_emerg("PERCPU: cpu_possible_mask=%*pb\n",		\
			 cpumask_pr_args(cpu_possible_mask));		\
		pcpu_dump_alloc_info(KERN_EMERG, ai);			\
		BUG();							\
	}								\
} while (0)

	/* sanity checks */
	PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
#ifdef CONFIG_SMP
	PCPU_SETUP_BUG_ON(!ai->static_size);
	PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start));
#endif
	PCPU_SETUP_BUG_ON(!base_addr);
	PCPU_SETUP_BUG_ON(offset_in_page(base_addr));
	PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
	PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size));
	PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
	PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
	PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);

	/* process group information and build config tables accordingly */
	group_offsets = memblock_virt_alloc(ai->nr_groups *
					     sizeof(group_offsets[0]), 0);
	group_sizes = memblock_virt_alloc(ai->nr_groups *
					   sizeof(group_sizes[0]), 0);
	unit_map = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_map[0]), 0);
	unit_off = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_off[0]), 0);

	for (cpu = 0; cpu < nr_cpu_ids; cpu++)
		unit_map[cpu] = UINT_MAX;

	pcpu_low_unit_cpu = NR_CPUS;
	pcpu_high_unit_cpu = NR_CPUS;

	for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
		const struct pcpu_group_info *gi = &ai->groups[group];

		group_offsets[group] = gi->base_offset;
		group_sizes[group] = gi->nr_units * ai->unit_size;

		for (i = 0; i < gi->nr_units; i++) {
			cpu = gi->cpu_map[i];
			if (cpu == NR_CPUS)
				continue;

			PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids);
			PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
			PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);

			unit_map[cpu] = unit + i;
			unit_off[cpu] = gi->base_offset + i * ai->unit_size;

			/* determine low/high unit_cpu */
			if (pcpu_low_unit_cpu == NR_CPUS ||
			    unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
				pcpu_low_unit_cpu = cpu;
			if (pcpu_high_unit_cpu == NR_CPUS ||
			    unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
				pcpu_high_unit_cpu = cpu;
		}
	}
	pcpu_nr_units = unit;

	for_each_possible_cpu(cpu)
		PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);

	/* we're done parsing the input, undefine BUG macro and dump config */
#undef PCPU_SETUP_BUG_ON
	pcpu_dump_alloc_info(KERN_DEBUG, ai);

	pcpu_nr_groups = ai->nr_groups;
	pcpu_group_offsets = group_offsets;
	pcpu_group_sizes = group_sizes;
	pcpu_unit_map = unit_map;
	pcpu_unit_offsets = unit_off;

	/* determine basic parameters */
	pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
	pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
	pcpu_atom_size = ai->atom_size;
	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
		BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);

	/* allocate chunk slots; the last slot is kept for fully free chunks */
	pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
	pcpu_slot = memblock_virt_alloc(
			pcpu_nr_slots * sizeof(pcpu_slot[0]), 0);
	for (i = 0; i < pcpu_nr_slots; i++)
		INIT_LIST_HEAD(&pcpu_slot[i]);

	/*
	 * Initialize the static chunk.  If there is no reserved region,
	 * it covers the static area plus the dynamic area; otherwise it
	 * covers the static area plus the reserved area and a separate
	 * dynamic chunk is created below.
	 */
	schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
	INIT_LIST_HEAD(&schunk->list);
	INIT_WORK(&schunk->map_extend_work, pcpu_map_extend_workfn);
	schunk->base_addr = base_addr;
	schunk->map = smap;
	schunk->map_alloc = ARRAY_SIZE(smap);
	schunk->immutable = true;
	bitmap_fill(schunk->populated, pcpu_unit_pages);
	schunk->nr_populated = pcpu_unit_pages;

	if (ai->reserved_size) {
		schunk->free_size = ai->reserved_size;
		pcpu_reserved_chunk = schunk;
		pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size;
	} else {
		schunk->free_size = dyn_size;
		dyn_size = 0;			/* dynamic area covered by schunk */
	}
	schunk->contig_hint = schunk->free_size;

	schunk->map[0] = 1;
	schunk->map[1] = ai->static_size;
	schunk->map_used = 1;
	if (schunk->free_size)
		schunk->map[++schunk->map_used] = ai->static_size + schunk->free_size;
	schunk->map[schunk->map_used] |= 1;

	/* init dynamic chunk if necessary */
	if (dyn_size) {
		dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
		INIT_LIST_HEAD(&dchunk->list);
		INIT_WORK(&dchunk->map_extend_work, pcpu_map_extend_workfn);
		dchunk->base_addr = base_addr;
		dchunk->map = dmap;
		dchunk->map_alloc = ARRAY_SIZE(dmap);
		dchunk->immutable = true;
		bitmap_fill(dchunk->populated, pcpu_unit_pages);
		dchunk->nr_populated = pcpu_unit_pages;

		dchunk->contig_hint = dchunk->free_size = dyn_size;
		dchunk->map[0] = 1;
		dchunk->map[1] = pcpu_reserved_chunk_limit;
		dchunk->map[2] = (pcpu_reserved_chunk_limit + dchunk->free_size) | 1;
		dchunk->map_used = 2;
	}

	/* link the first chunk in */
	pcpu_first_chunk = dchunk ?: schunk;
	pcpu_nr_empty_pop_pages +=
		pcpu_count_occupied_pages(pcpu_first_chunk, 1);
	pcpu_chunk_relocate(pcpu_first_chunk, -1);

	/* we're done */
	pcpu_base_addr = base_addr;
	return 0;
}

#ifdef CONFIG_SMP

const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = {
	[PCPU_FC_AUTO]	= "auto",
	[PCPU_FC_EMBED]	= "embed",
	[PCPU_FC_PAGE]	= "page",
};

enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;

static int __init percpu_alloc_setup(char *str)
{
	if (!str)
		return -EINVAL;

	if (0)
		;
#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
	else if (!strcmp(str, "embed"))
		pcpu_chosen_fc = PCPU_FC_EMBED;
#endif
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
	else if (!strcmp(str, "page"))
		pcpu_chosen_fc = PCPU_FC_PAGE;
#endif
	else
		pr_warning("PERCPU: unknown allocator %s specified\n", str);

	return 0;
}
early_param("percpu_alloc", percpu_alloc_setup);

/*
 * pcpu_embed_first_chunk() is used by the generic percpu setup as well,
 * so build it whenever the arch needs it or the generic setup is used.
 */
#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
	!defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
#define BUILD_EMBED_FIRST_CHUNK
#endif

/* build pcpu_page_first_chunk() only if the arch config needs it */
#if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
#define BUILD_PAGE_FIRST_CHUNK
#endif

/* pcpu_build_alloc_info() is used by both embed and page first chunk */
#if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)
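
/*
 * pcpu_build_alloc_info - build an allocation layout for the first
 * chunk.  CPUs are grouped by @cpu_distance_fn proximity, the unit
 * size is chosen so that each allocation is a multiple of @atom_size
 * with bounded wasted space, and the resulting group/unit map is
 * returned as a pcpu_alloc_info (ERR_PTR on failure).
 */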
static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
				size_t reserved_size, size_t dyn_size,
				size_t atom_size,
				pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
{
	static int group_map[NR_CPUS] __initdata;
	static int group_cnt[NR_CPUS] __initdata;
	const size_t static_size = __per_cpu_end - __per_cpu_start;
	int nr_groups = 1, nr_units = 0;
	size_t size_sum, min_unit_size, alloc_size;
	int upa, max_upa, uninitialized_var(best_upa);	/* units per alloc */
	int last_allocs, group, unit;
	unsigned int cpu, tcpu;
	struct pcpu_alloc_info *ai;
	unsigned int *cpu_map;

	/* this function may be called multiple times */
	memset(group_map, 0, sizeof(group_map));
	memset(group_cnt, 0, sizeof(group_cnt));

	/* calculate size_sum and ensure dyn_size is enough for early alloc */
	size_sum = PFN_ALIGN(static_size + reserved_size +
			    max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
	dyn_size = size_sum - static_size - reserved_size;

	/*
	 * Determine min_unit_size, alloc_size and max_upa such that
	 * alloc_size is a multiple of atom_size and can hold the
	 * largest number of page-aligned units of at least
	 * min_unit_size each.
	 */
	min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);

	alloc_size = roundup(min_unit_size, atom_size);
	upa = alloc_size / min_unit_size;
	while (alloc_size % upa || (offset_in_page(alloc_size / upa)))
		upa--;
	max_upa = upa;

	/* group cpus according to their proximity */
	for_each_possible_cpu(cpu) {
		group = 0;
	next_group:
		for_each_possible_cpu(tcpu) {
			if (cpu == tcpu)
				break;
			if (group_map[tcpu] == group && cpu_distance_fn &&
			    (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
			     cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
				group++;
				nr_groups = max(nr_groups, group + 1);
				goto next_group;
			}
		}
		group_map[cpu] = group;
		group_cnt[group]++;
	}

	/*
	 * Pick the largest units-per-alloc that keeps the number of
	 * wasted units below a third of the possible CPUs without
	 * increasing the number of allocations needed.
	 */
	last_allocs = INT_MAX;
	for (upa = max_upa; upa; upa--) {
		int allocs = 0, wasted = 0;

		if (alloc_size % upa || (offset_in_page(alloc_size / upa)))
			continue;

		for (group = 0; group < nr_groups; group++) {
			int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
			allocs += this_allocs;
			wasted += this_allocs * upa - group_cnt[group];
		}

		/* don't accept if wastage is over 1/3 */
		if (wasted > num_possible_cpus() / 3)
			continue;

		/* and then don't consume more memory */
		if (allocs > last_allocs)
			break;
		last_allocs = allocs;
		best_upa = upa;
	}
	upa = best_upa;

	/* allocate and fill alloc_info */
	for (group = 0; group < nr_groups; group++)
		nr_units += roundup(group_cnt[group], upa);

	ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
	if (!ai)
		return ERR_PTR(-ENOMEM);
	cpu_map = ai->groups[0].cpu_map;

	for (group = 0; group < nr_groups; group++) {
		ai->groups[group].cpu_map = cpu_map;
		cpu_map += roundup(group_cnt[group], upa);
	}

	ai->static_size = static_size;
	ai->reserved_size = reserved_size;
	ai->dyn_size = dyn_size;
	ai->unit_size = alloc_size / upa;
	ai->atom_size = atom_size;
	ai->alloc_size = alloc_size;

	for (group = 0, unit = 0; group_cnt[group]; group++) {
		struct pcpu_group_info *gi = &ai->groups[group];

		/*
		 * Initialize base_offset as if all groups are located
		 * back-to-back.  The caller should update this to
		 * reflect the actual allocation.
		 */
		gi->base_offset = unit * ai->unit_size;

		for_each_possible_cpu(cpu)
			if (group_map[cpu] == group)
				gi->cpu_map[gi->nr_units++] = cpu;
		gi->nr_units = roundup(gi->nr_units, upa);
		unit += gi->nr_units;
	}
	BUG_ON(unit != nr_units);

	return ai;
}
#endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */

#if defined(BUILD_EMBED_FIRST_CHUNK)
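
/*
 * pcpu_embed_first_chunk - build the first chunk out of one large
 * allocation per group obtained from @alloc_fn.  The static percpu
 * data is copied into each used unit, unused units and the space past
 * size_sum are returned with @free_fn, and the resulting layout is
 * handed to pcpu_setup_first_chunk().  Returns 0 on success, -errno on
 * failure.
 */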
int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
				  size_t atom_size,
				  pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
				  pcpu_fc_alloc_fn_t alloc_fn,
				  pcpu_fc_free_fn_t free_fn)
{
	void *base = (void *)ULONG_MAX;
	void **areas = NULL;
	struct pcpu_alloc_info *ai;
	size_t size_sum, areas_size, max_distance;
	int group, i, rc;

	ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
				   cpu_distance_fn);
	if (IS_ERR(ai))
		return PTR_ERR(ai);

	size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
	areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));

	areas = memblock_virt_alloc_nopanic(areas_size, 0);
	if (!areas) {
		rc = -ENOMEM;
		goto out_free;
	}

	/* allocate, copy and determine base address */
	for (group = 0; group < ai->nr_groups; group++) {
		struct pcpu_group_info *gi = &ai->groups[group];
		unsigned int cpu = NR_CPUS;
		void *ptr;

		for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
			cpu = gi->cpu_map[i];
		BUG_ON(cpu == NR_CPUS);

		/* allocate space for the whole group */
		ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
		if (!ptr) {
			rc = -ENOMEM;
			goto out_free_areas;
		}
		/* kmemleak tracks the percpu allocations separately */
		kmemleak_free(ptr);
		areas[group] = ptr;

		base = min(ptr, base);
	}

	/*
	 * Copy data and free unused parts.  This should happen after all
	 * allocations are complete; otherwise, we may end up with
	 * overlapping groups.
	 */
	for (group = 0; group < ai->nr_groups; group++) {
		struct pcpu_group_info *gi = &ai->groups[group];
		void *ptr = areas[group];

		for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
			if (gi->cpu_map[i] == NR_CPUS) {
				/* unused unit, free whole */
				free_fn(ptr, ai->unit_size);
				continue;
			}
			/* copy and return the unused part */
			memcpy(ptr, __per_cpu_load, ai->static_size);
			free_fn(ptr + size_sum, ai->unit_size - size_sum);
		}
	}

	/* base address is now known, determine group base offsets */
	max_distance = 0;
	for (group = 0; group < ai->nr_groups; group++) {
		ai->groups[group].base_offset = areas[group] - base;
		max_distance = max_t(size_t, max_distance,
				     ai->groups[group].base_offset);
	}
	max_distance += ai->unit_size;

	/* warn if the maximum distance exceeds 75% of the vmalloc space */
	if (max_distance > VMALLOC_TOTAL * 3 / 4) {
		pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc "
			   "space 0x%lx\n", max_distance,
			   VMALLOC_TOTAL);
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
		/* and fail if we have a fallback */
		rc = -EINVAL;
		goto out_free;
#endif
	}

	pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
		PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
		ai->dyn_size, ai->unit_size);

	rc = pcpu_setup_first_chunk(ai, base);
	goto out_free;

out_free_areas:
	for (group = 0; group < ai->nr_groups; group++)
		if (areas[group])
			free_fn(areas[group],
				ai->groups[group].nr_units * ai->unit_size);
out_free:
	pcpu_free_alloc_info(ai);
	if (areas)
		memblock_free_early(__pa(areas), areas_size);
	return rc;
}
#endif /* BUILD_EMBED_FIRST_CHUNK */

#ifdef BUILD_PAGE_FIRST_CHUNK
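
/*
 * pcpu_page_first_chunk - build the first chunk out of individual
 * pages.  Pages are allocated with @alloc_fn, mapped into a vmalloc
 * area whose PTEs are prepared by @populate_pte_fn, and the static
 * percpu data is copied into each unit before handing the layout to
 * pcpu_setup_first_chunk().  Returns 0 on success, -errno on failure.
 */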
int __init pcpu_page_first_chunk(size_t reserved_size,
				 pcpu_fc_alloc_fn_t alloc_fn,
				 pcpu_fc_free_fn_t free_fn,
				 pcpu_fc_populate_pte_fn_t populate_pte_fn)
{
	static struct vm_struct vm;
	struct pcpu_alloc_info *ai;
	char psize_str[16];
	int unit_pages;
	size_t pages_size;
	struct page **pages;
	int unit, i, j, rc;

	snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);

	ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL);
	if (IS_ERR(ai))
		return PTR_ERR(ai);
	BUG_ON(ai->nr_groups != 1);
	BUG_ON(ai->groups[0].nr_units != num_possible_cpus());

	unit_pages = ai->unit_size >> PAGE_SHIFT;

	/* unaligned allocations can't be freed, round up to page size */
	pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
			       sizeof(pages[0]));
	pages = memblock_virt_alloc(pages_size, 0);

	/* allocate pages */
	j = 0;
	for (unit = 0; unit < num_possible_cpus(); unit++)
		for (i = 0; i < unit_pages; i++) {
			unsigned int cpu = ai->groups[0].cpu_map[unit];
			void *ptr;

			ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
			if (!ptr) {
				pr_warning("PERCPU: failed to allocate %s page "
					   "for cpu%u\n", psize_str, cpu);
				goto enomem;
			}
			/* kmemleak tracks the percpu allocations separately */
			kmemleak_free(ptr);
			pages[j++] = virt_to_page(ptr);
		}

	/* allocate vm area, map the pages and copy static data */
	vm.flags = VM_ALLOC;
	vm.size = num_possible_cpus() * ai->unit_size;
	vm_area_register_early(&vm, PAGE_SIZE);

	for (unit = 0; unit < num_possible_cpus(); unit++) {
		unsigned long unit_addr =
			(unsigned long)vm.addr + unit * ai->unit_size;

		for (i = 0; i < unit_pages; i++)
			populate_pte_fn(unit_addr + (i << PAGE_SHIFT));

		/* pte already populated, the following shouldn't fail */
		rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
				      unit_pages);
		if (rc < 0)
			panic("failed to map percpu area, err=%d\n", rc);

		/* copy static data */
		memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
	}

	/* we're ready, commit */
	pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n",
		unit_pages, psize_str, vm.addr, ai->static_size,
		ai->reserved_size, ai->dyn_size);

	rc = pcpu_setup_first_chunk(ai, vm.addr);
	goto out_free_ar;

enomem:
	while (--j >= 0)
		free_fn(page_address(pages[j]), PAGE_SIZE);
	rc = -ENOMEM;
out_free_ar:
	memblock_free_early(__pa(pages), pages_size);
	pcpu_free_alloc_info(ai);
	return rc;
}
#endif /* BUILD_PAGE_FIRST_CHUNK */

#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA

/* generic SMP percpu area setup using the embed first chunk helper */
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);

static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
				       size_t align)
{
	return memblock_virt_alloc_from_nopanic(
			size, align, __pa(MAX_DMA_ADDRESS));
}

static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
{
	memblock_free_early(__pa(ptr), size);
}

void __init setup_per_cpu_areas(void)
{
	unsigned long delta;
	unsigned int cpu;
	int rc;

	/*
	 * Always reserve area for module percpu variables.  That's
	 * what the legacy allocator did too.
	 */
	rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
				    PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
				    pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
	if (rc < 0)
		panic("Failed to initialize percpu areas.");

	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
	for_each_possible_cpu(cpu)
		__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
}
#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */

#else /* CONFIG_SMP */

/*
 * UP percpu area setup: a single unit with identity mapping, no
 * address translation needed.
 */
void __init setup_per_cpu_areas(void)
{
	const size_t unit_size =
		roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
					 PERCPU_DYNAMIC_RESERVE));
	struct pcpu_alloc_info *ai;
	void *fc;

	ai = pcpu_alloc_alloc_info(1, 1);
	fc = memblock_virt_alloc_from_nopanic(unit_size,
					      PAGE_SIZE,
					      __pa(MAX_DMA_ADDRESS));
	if (!ai || !fc)
		panic("Failed to allocate memory for percpu areas.");

	/* kmemleak tracks the percpu allocations separately */
	kmemleak_free(fc);

	ai->dyn_size = unit_size;
	ai->unit_size = unit_size;
	ai->atom_size = unit_size;
	ai->alloc_size = unit_size;
	ai->groups[0].nr_units = 1;
	ai->groups[0].cpu_map[0] = 0;

	if (pcpu_setup_first_chunk(ai, fc) < 0)
		panic("Failed to initialize percpu areas.");
}

#endif /* CONFIG_SMP */
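
/*
 * percpu_init_late - switch the first (and reserved) chunk's area maps
 * from the static __initdata arrays to slab-allocated copies once the
 * slab allocator is available.
 */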
void __init percpu_init_late(void)
{
	struct pcpu_chunk *target_chunks[] =
		{ pcpu_first_chunk, pcpu_reserved_chunk, NULL };
	struct pcpu_chunk *chunk;
	unsigned long flags;
	int i;

	for (i = 0; (chunk = target_chunks[i]); i++) {
		int *map;
		const size_t size = PERCPU_DYNAMIC_EARLY_SLOTS * sizeof(map[0]);

		BUILD_BUG_ON(size > PAGE_SIZE);

		map = pcpu_mem_zalloc(size);
		BUG_ON(!map);

		spin_lock_irqsave(&pcpu_lock, flags);
		memcpy(map, chunk->map, size);
		chunk->map = map;
		spin_unlock_irqrestore(&pcpu_lock, flags);
	}
}

/*
 * The percpu allocator is initialized early during boot when neither
 * slab nor workqueues are available.  Plug async management until
 * everything is up and running.
 */
static int __init percpu_enable_async(void)
{
	pcpu_async_enabled = true;
	return 0;
}
subsys_initcall(percpu_enable_async);