/*
 * mm/percpu.c - percpu memory allocator
 *
 * This is the percpu allocator which can handle both static and
 * dynamic areas.  Percpu areas are allocated in chunks.  Each chunk
 * consists of a boot-time determined number of units, and the first
 * chunk is used for the kernel's static percpu variables (it needs
 * special boot-time alloc/init handling because it must be usable
 * before regular allocation services are up).  When a chunk fills
 * up, another chunk is allocated.
 *
 * Allocation is done in offset-size areas of a single unit space,
 * and the same offset in every unit carries the same allocation.
 * Each chunk keeps an area map (chunk->map) where each entry records
 * the start offset of an area; the lowest bit of an entry flags
 * whether that area is in use.
 *
 * To use this allocator, arch code should do the following:
 *
 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
 *   a regular address to a percpu pointer and back if they need to be
 *   different from the default
 *
 * - use pcpu_setup_first_chunk() during percpu area initialization to
 *   set up the first chunk containing the kernel static percpu area
 */
56#include <linux/bitmap.h>
57#include <linux/bootmem.h>
58#include <linux/err.h>
59#include <linux/list.h>
60#include <linux/log2.h>
61#include <linux/mm.h>
62#include <linux/module.h>
63#include <linux/mutex.h>
64#include <linux/percpu.h>
65#include <linux/pfn.h>
66#include <linux/slab.h>
67#include <linux/spinlock.h>
68#include <linux/vmalloc.h>
69#include <linux/workqueue.h>
70#include <linux/kmemleak.h>
71
72#include <asm/cacheflush.h>
73#include <asm/sections.h>
74#include <asm/tlbflush.h>
75#include <asm/io.h>
76
77#define PCPU_SLOT_BASE_SHIFT 5
78#define PCPU_DFL_MAP_ALLOC 16
79#define PCPU_ATOMIC_MAP_MARGIN_LOW 32
80#define PCPU_ATOMIC_MAP_MARGIN_HIGH 64
81#define PCPU_EMPTY_POP_PAGES_LOW 2
82#define PCPU_EMPTY_POP_PAGES_HIGH 4
83
84#ifdef CONFIG_SMP
85
86#ifndef __addr_to_pcpu_ptr
87#define __addr_to_pcpu_ptr(addr) \
88 (void __percpu *)((unsigned long)(addr) - \
89 (unsigned long)pcpu_base_addr + \
90 (unsigned long)__per_cpu_start)
91#endif
92#ifndef __pcpu_ptr_to_addr
93#define __pcpu_ptr_to_addr(ptr) \
94 (void __force *)((unsigned long)(ptr) + \
95 (unsigned long)pcpu_base_addr - \
96 (unsigned long)__per_cpu_start)
97#endif
98#else
99
100#define __addr_to_pcpu_ptr(addr) (void __percpu *)(addr)
101#define __pcpu_ptr_to_addr(ptr) (void __force *)(ptr)
102#endif
103
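/*
 * Chunk descriptor.  As used below:
 * @list: linked on the pcpu_slot list matching the chunk's free size
 * @free_size: free bytes remaining in the chunk
 * @contig_hint: hint on the max size of a contiguous free area
 * @base_addr: base address of this chunk
 * @map_used, @map_alloc: number of map entries used / allocated
 * @map: allocation map; each entry is an area start offset, bit 0 set
 *       means the area is in use
 * @map_extend_list: entry on pcpu_map_extend_chunks for async extension
 * @data: private data of the backing implementation (vm or km)
 * @first_free: index of the first candidate free map entry
 * @immutable: no [de]population allowed
 * @nr_populated: number of populated pages
 * @populated: bitmap of populated pages
 */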
104struct pcpu_chunk {
105 struct list_head list;
106 int free_size;
107 int contig_hint;
108 void *base_addr;
109
110 int map_used;
111 int map_alloc;
112 int *map;
113 struct list_head map_extend_list;
114
115 void *data;
116 int first_free;
117 bool immutable;
118 int nr_populated;
119 unsigned long populated[];
120};
121
122static int pcpu_unit_pages __read_mostly;
123static int pcpu_unit_size __read_mostly;
124static int pcpu_nr_units __read_mostly;
125static int pcpu_atom_size __read_mostly;
126static int pcpu_nr_slots __read_mostly;
127static size_t pcpu_chunk_struct_size __read_mostly;
128
129
130static unsigned int pcpu_low_unit_cpu __read_mostly;
131static unsigned int pcpu_high_unit_cpu __read_mostly;
132
133
134void *pcpu_base_addr __read_mostly;
135EXPORT_SYMBOL_GPL(pcpu_base_addr);
136
137static const int *pcpu_unit_map __read_mostly;
138const unsigned long *pcpu_unit_offsets __read_mostly;
139
140
141static int pcpu_nr_groups __read_mostly;
142static const unsigned long *pcpu_group_offsets __read_mostly;
143static const size_t *pcpu_group_sizes __read_mostly;
/*
 * The first chunk which always exists.  Note that unlike other
 * chunks, this one can be allocated and mapped in several different
 * ways and thus often doesn't live in the vmalloc area.
 */
150static struct pcpu_chunk *pcpu_first_chunk;
/*
 * Optional reserved chunk.  This chunk reserves part of the first
 * chunk and serves it for reserved allocations.  The amount of
 * reserved offset is in pcpu_reserved_chunk_limit.  When the reserved
 * area doesn't exist, the following variables contain NULL and 0
 * respectively.
 */
159static struct pcpu_chunk *pcpu_reserved_chunk;
160static int pcpu_reserved_chunk_limit;
161
162static DEFINE_SPINLOCK(pcpu_lock);
163static DEFINE_MUTEX(pcpu_alloc_mutex);
164
165static struct list_head *pcpu_slot __read_mostly;
166
167
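/* chunks which need their map areas extended, protected by pcpu_lock */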
168static LIST_HEAD(pcpu_map_extend_chunks);
/*
 * The number of empty populated pages, protected by pcpu_lock.  The
 * reserved chunk doesn't contribute to the count.
 */
174static int pcpu_nr_empty_pop_pages;
/*
 * Balance work is used to populate or destroy chunks asynchronously.  We
 * try to keep the number of empty populated pages between
 * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and to
 * reclaim excess fully free chunks.
 */
182static void pcpu_balance_workfn(struct work_struct *work);
183static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
184static bool pcpu_async_enabled __read_mostly;
185static bool pcpu_atomic_alloc_failed;
186
187static void pcpu_schedule_balance_work(void)
188{
189 if (pcpu_async_enabled)
190 schedule_work(&pcpu_balance_work);
191}
192
193static bool pcpu_addr_in_first_chunk(void *addr)
194{
195 void *first_start = pcpu_first_chunk->base_addr;
196
197 return addr >= first_start && addr < first_start + pcpu_unit_size;
198}
199
200static bool pcpu_addr_in_reserved_chunk(void *addr)
201{
202 void *first_start = pcpu_first_chunk->base_addr;
203
204 return addr >= first_start &&
205 addr < first_start + pcpu_reserved_chunk_limit;
206}
207
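/*
 * Chunks are sorted into slots by free size: the slot index grows
 * roughly with log2(size) above PCPU_SLOT_BASE_SHIFT, and the last
 * slot is reserved for fully free chunks (free_size == pcpu_unit_size).
 */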
208static int __pcpu_size_to_slot(int size)
209{
210 int highbit = fls(size);
211 return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
212}
213
214static int pcpu_size_to_slot(int size)
215{
216 if (size == pcpu_unit_size)
217 return pcpu_nr_slots - 1;
218 return __pcpu_size_to_slot(size);
219}
220
221static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
222{
223 if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int))
224 return 0;
225
226 return pcpu_size_to_slot(chunk->free_size);
227}
228
229
230static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
231{
232 page->index = (unsigned long)pcpu;
233}
234
235
236static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
237{
238 return (struct pcpu_chunk *)page->index;
239}
240
241static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
242{
243 return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
244}
245
246static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
247 unsigned int cpu, int page_idx)
248{
249 return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] +
250 (page_idx << PAGE_SHIFT);
251}
252
253static void __maybe_unused pcpu_next_unpop(struct pcpu_chunk *chunk,
254 int *rs, int *re, int end)
255{
256 *rs = find_next_zero_bit(chunk->populated, end, *rs);
257 *re = find_next_bit(chunk->populated, end, *rs + 1);
258}
259
260static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk,
261 int *rs, int *re, int end)
262{
263 *rs = find_next_bit(chunk->populated, end, *rs);
264 *re = find_next_zero_bit(chunk->populated, end, *rs + 1);
265}
266
/*
 * (Un)populated page region iterators.  Iterate over (un)populated
 * page regions between @start and @end in @chunk.  @rs and @re should
 * be integer variables and will be set to start and end page index of
 * the current region.
 */
273#define pcpu_for_each_unpop_region(chunk, rs, re, start, end) \
274 for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \
275 (rs) < (re); \
276 (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end)))
277
278#define pcpu_for_each_pop_region(chunk, rs, re, start, end) \
279 for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end)); \
280 (rs) < (re); \
281 (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))
282
/**
 * pcpu_mem_zalloc - allocate memory
 * @size: bytes to allocate
 *
 * Allocate @size bytes.  If @size is smaller than PAGE_SIZE,
 * kzalloc() is used; otherwise, vzalloc() is used.  The returned
 * memory is always zeroed.
 *
 * CONTEXT:
 * Does GFP_KERNEL allocation.
 *
 * RETURNS:
 * Pointer to the allocated area on success, NULL on failure.
 */
297static void *pcpu_mem_zalloc(size_t size)
298{
299 if (WARN_ON_ONCE(!slab_is_available()))
300 return NULL;
301
302 if (size <= PAGE_SIZE)
303 return kzalloc(size, GFP_KERNEL);
304 else
305 return vzalloc(size);
306}
/**
 * pcpu_mem_free - free memory
 * @ptr: memory to free
 * @size: size of the area
 *
 * Free @ptr.  @ptr should have been allocated via pcpu_mem_zalloc().
 */
315static void pcpu_mem_free(void *ptr, size_t size)
316{
317 if (size <= PAGE_SIZE)
318 kfree(ptr);
319 else
320 vfree(ptr);
321}
/**
 * pcpu_count_occupied_pages - count the number of pages an area occupies
 * @chunk: chunk of interest
 * @i: index of the area in question
 *
 * Count the number of pages chunk's @i'th area occupies.  When the area's
 * start and/or end address isn't aligned to page boundary, the straddled
 * page is included in the count iff the rest of the page is free.
 */
332static int pcpu_count_occupied_pages(struct pcpu_chunk *chunk, int i)
333{
334 int off = chunk->map[i] & ~1;
335 int end = chunk->map[i + 1] & ~1;
336
337 if (!PAGE_ALIGNED(off) && i > 0) {
338 int prev = chunk->map[i - 1];
339
340 if (!(prev & 1) && prev <= round_down(off, PAGE_SIZE))
341 off = round_down(off, PAGE_SIZE);
342 }
343
344 if (!PAGE_ALIGNED(end) && i + 1 < chunk->map_used) {
345 int next = chunk->map[i + 1];
346 int nend = chunk->map[i + 2] & ~1;
347
348 if (!(next & 1) && nend >= round_up(end, PAGE_SIZE))
349 end = round_up(end, PAGE_SIZE);
350 }
351
352 return max_t(int, PFN_DOWN(end) - PFN_UP(off), 0);
353}
/**
 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
 * @chunk: chunk of interest
 * @oslot: the previous slot it was on
 *
 * This function is called after an allocation or free changed @chunk.
 * New slot according to the changed state is determined and @chunk is
 * moved to the slot.  Note that the reserved chunk is never put on
 * chunk slots.
 *
 * CONTEXT:
 * pcpu_lock.
 */
368static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
369{
370 int nslot = pcpu_chunk_slot(chunk);
371
372 if (chunk != pcpu_reserved_chunk && oslot != nslot) {
373 if (oslot < nslot)
374 list_move(&chunk->list, &pcpu_slot[nslot]);
375 else
376 list_move_tail(&chunk->list, &pcpu_slot[nslot]);
377 }
378}
/**
 * pcpu_need_to_extend - determine whether chunk area map needs to be extended
 * @chunk: chunk of interest
 * @is_atomic: the allocation context
 *
 * Determine whether the area map of @chunk needs to be extended.  If
 * @is_atomic, only the amount necessary for a new allocation is
 * considered; however, async extension is scheduled if the remaining
 * room is low.  If !@is_atomic, it aims for more empty space.  Combined,
 * this ensures that the map is likely to have enough available space to
 * accommodate atomic allocations which can't extend maps directly.
 *
 * CONTEXT:
 * pcpu_lock.
 *
 * RETURNS:
 * New target map allocation length if extension is necessary, 0
 * otherwise.
 */
399static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic)
400{
401 int margin, new_alloc;
402
403 lockdep_assert_held(&pcpu_lock);
404
405 if (is_atomic) {
406 margin = 3;
407
408 if (chunk->map_alloc <
409 chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW) {
410 if (list_empty(&chunk->map_extend_list)) {
411 list_add_tail(&chunk->map_extend_list,
412 &pcpu_map_extend_chunks);
413 pcpu_schedule_balance_work();
414 }
415 }
416 } else {
417 margin = PCPU_ATOMIC_MAP_MARGIN_HIGH;
418 }
419
420 if (chunk->map_alloc >= chunk->map_used + margin)
421 return 0;
422
423 new_alloc = PCPU_DFL_MAP_ALLOC;
424 while (new_alloc < chunk->map_used + margin)
425 new_alloc *= 2;
426
427 return new_alloc;
428}
/**
 * pcpu_extend_area_map - extend area map of a chunk
 * @chunk: chunk of interest
 * @new_alloc: new target allocation length of the area map
 *
 * Extend the area map of @chunk to have @new_alloc entries.
 *
 * CONTEXT:
 * Does GFP_KERNEL allocation.  Grabs and releases pcpu_lock.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
443static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
444{
445 int *old = NULL, *new = NULL;
446 size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
447 unsigned long flags;
448
449 lockdep_assert_held(&pcpu_alloc_mutex);
450
451 new = pcpu_mem_zalloc(new_size);
452 if (!new)
453 return -ENOMEM;
454
455
456 spin_lock_irqsave(&pcpu_lock, flags);
457
458 if (new_alloc <= chunk->map_alloc)
459 goto out_unlock;
460
461 old_size = chunk->map_alloc * sizeof(chunk->map[0]);
462 old = chunk->map;
463
464 memcpy(new, old, old_size);
465
466 chunk->map_alloc = new_alloc;
467 chunk->map = new;
468 new = NULL;
469
470out_unlock:
471 spin_unlock_irqrestore(&pcpu_lock, flags);
472
473
474
475
476
477 pcpu_mem_free(old, old_size);
478 pcpu_mem_free(new, new_size);
479
480 return 0;
481}
/**
 * pcpu_fit_in_area - try to fit the requested allocation in a candidate area
 * @chunk: chunk the candidate area belongs to
 * @off: the offset to the start of the candidate area
 * @this_size: the size of the candidate area
 * @size: the size of the target allocation
 * @align: the alignment of the target allocation
 * @pop_only: only allocate from already populated region
 *
 * We're trying to allocate @size bytes aligned at @align.  @chunk's area
 * at @off sized @this_size is a candidate.  This function determines
 * whether the target allocation fits in the candidate area and returns the
 * number of bytes to pad after @off.  If the target area doesn't fit, -1
 * is returned.
 *
 * If @pop_only is %true, this function only considers the already
 * populated part of the candidate area.
 */
501static int pcpu_fit_in_area(struct pcpu_chunk *chunk, int off, int this_size,
502 int size, int align, bool pop_only)
503{
504 int cand_off = off;
505
506 while (true) {
507 int head = ALIGN(cand_off, align) - off;
508 int page_start, page_end, rs, re;
509
510 if (this_size < head + size)
511 return -1;
512
513 if (!pop_only)
514 return head;
515
516
517
518
519
520
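 /*
  * If the requested region would overlap an unpopulated page,
  * retry from the end of that unpopulated region; otherwise the
  * candidate offset works.
  */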
521 page_start = PFN_DOWN(head + off);
522 page_end = PFN_UP(head + off + size);
523
524 rs = page_start;
525 pcpu_next_unpop(chunk, &rs, &re, PFN_UP(off + this_size));
526 if (rs >= page_end)
527 return head;
528 cand_off = re * PAGE_SIZE;
529 }
530}
/**
 * pcpu_alloc_area - allocate area from a pcpu_chunk
 * @chunk: chunk of interest
 * @size: wanted size in bytes
 * @align: wanted align
 * @pop_only: allocate only from the populated area
 * @occ_pages_p: out param for the number of pages the area occupies
 *
 * Try to allocate @size bytes area aligned at @align from @chunk.
 * Note that this function only allocates the offset.  It doesn't
 * populate or map the area.
 *
 * @chunk->map must have at least two free slots.
 *
 * CONTEXT:
 * pcpu_lock.
 *
 * RETURNS:
 * Allocated offset in @chunk on success, -1 if no matching area is
 * found.
 */
553static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align,
554 bool pop_only, int *occ_pages_p)
555{
556 int oslot = pcpu_chunk_slot(chunk);
557 int max_contig = 0;
558 int i, off;
559 bool seen_free = false;
560 int *p;
561
562 for (i = chunk->first_free, p = chunk->map + i; i < chunk->map_used; i++, p++) {
563 int head, tail;
564 int this_size;
565
566 off = *p;
567 if (off & 1)
568 continue;
569
570 this_size = (p[1] & ~1) - off;
571
572 head = pcpu_fit_in_area(chunk, off, this_size, size, align,
573 pop_only);
574 if (head < 0) {
575 if (!seen_free) {
576 chunk->first_free = i;
577 seen_free = true;
578 }
579 max_contig = max(this_size, max_contig);
580 continue;
581 }
582
583
584
585
586
587
588
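 /*
  * If the head padding is smaller than sizeof(int) or the previous
  * area is free, merge the head into the previous area instead of
  * creating a tiny free area that could never be reused.
  */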
589 if (head && (head < sizeof(int) || !(p[-1] & 1))) {
590 *p = off += head;
591 if (p[-1] & 1)
592 chunk->free_size -= head;
593 else
594 max_contig = max(*p - p[-1], max_contig);
595 this_size -= head;
596 head = 0;
597 }
598
599
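 /* if the tail would be smaller than sizeof(int), keep it in this allocation */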
600 tail = this_size - head - size;
601 if (tail < sizeof(int)) {
602 tail = 0;
603 size = this_size - head;
604 }
605
606
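 /* split the area and insert new map entries for the head and/or tail */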
607 if (head || tail) {
608 int nr_extra = !!head + !!tail;
609
610
611 memmove(p + nr_extra + 1, p + 1,
612 sizeof(chunk->map[0]) * (chunk->map_used - i));
613 chunk->map_used += nr_extra;
614
615 if (head) {
616 if (!seen_free) {
617 chunk->first_free = i;
618 seen_free = true;
619 }
620 *++p = off += head;
621 ++i;
622 max_contig = max(head, max_contig);
623 }
624 if (tail) {
625 p[1] = off + size;
626 max_contig = max(tail, max_contig);
627 }
628 }
629
630 if (!seen_free)
631 chunk->first_free = i + 1;
632
633
634 if (i + 1 == chunk->map_used)
635 chunk->contig_hint = max_contig;
636 else
637 chunk->contig_hint = max(chunk->contig_hint,
638 max_contig);
639
640 chunk->free_size -= size;
641 *p |= 1;
642
643 *occ_pages_p = pcpu_count_occupied_pages(chunk, i);
644 pcpu_chunk_relocate(chunk, oslot);
645 return off;
646 }
647
648 chunk->contig_hint = max_contig;
649 pcpu_chunk_relocate(chunk, oslot);
650
651
652 return -1;
653}
/**
 * pcpu_free_area - free area to a pcpu_chunk
 * @chunk: chunk of interest
 * @freeme: offset of area to free
 * @occ_pages_p: out param for the number of pages the area occupies
 *
 * Free area starting from @freeme to @chunk.  Note that this function
 * only modifies the allocation map.  It doesn't depopulate or unmap
 * the area.
 *
 * CONTEXT:
 * pcpu_lock.
 */
668static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme,
669 int *occ_pages_p)
670{
671 int oslot = pcpu_chunk_slot(chunk);
672 int off = 0;
673 unsigned i, j;
674 int to_free = 0;
675 int *p;
676
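 /*
  * Allocated areas store their offset with bit 0 set, so set the
  * in-use bit before binary-searching the map for the entry.
  */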
677 freeme |= 1;
678
679 i = 0;
680 j = chunk->map_used;
681 while (i != j) {
682 unsigned k = (i + j) / 2;
683 off = chunk->map[k];
684 if (off < freeme)
685 i = k + 1;
686 else if (off > freeme)
687 j = k;
688 else
689 i = j = k;
690 }
691 BUG_ON(off != freeme);
692
693 if (i < chunk->first_free)
694 chunk->first_free = i;
695
696 p = chunk->map + i;
697 *p = off &= ~1;
698 chunk->free_size += (p[1] & ~1) - off;
699
700 *occ_pages_p = pcpu_count_occupied_pages(chunk, i);
701
702
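 /* merge with the next and/or previous area if they are free */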
703 if (!(p[1] & 1))
704 to_free++;
705
706 if (i > 0 && !(p[-1] & 1)) {
707 to_free++;
708 i--;
709 p--;
710 }
711 if (to_free) {
712 chunk->map_used -= to_free;
713 memmove(p + 1, p + 1 + to_free,
714 (chunk->map_used - i) * sizeof(chunk->map[0]));
715 }
716
717 chunk->contig_hint = max(chunk->map[i + 1] - chunk->map[i] - 1, chunk->contig_hint);
718 pcpu_chunk_relocate(chunk, oslot);
719}
720
721static struct pcpu_chunk *pcpu_alloc_chunk(void)
722{
723 struct pcpu_chunk *chunk;
724
725 chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size);
726 if (!chunk)
727 return NULL;
728
729 chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC *
730 sizeof(chunk->map[0]));
731 if (!chunk->map) {
732 pcpu_mem_free(chunk, pcpu_chunk_struct_size);
733 return NULL;
734 }
735
736 chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
737 chunk->map[0] = 0;
738 chunk->map[1] = pcpu_unit_size | 1;
739 chunk->map_used = 1;
740
741 INIT_LIST_HEAD(&chunk->list);
742 INIT_LIST_HEAD(&chunk->map_extend_list);
743 chunk->free_size = pcpu_unit_size;
744 chunk->contig_hint = pcpu_unit_size;
745
746 return chunk;
747}
748
749static void pcpu_free_chunk(struct pcpu_chunk *chunk)
750{
751 if (!chunk)
752 return;
753 pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
754 pcpu_mem_free(chunk, pcpu_chunk_struct_size);
755}
756
/**
 * pcpu_chunk_populated - post-population bookkeeping
 * @chunk: pcpu_chunk which got populated
 * @page_start: the start page
 * @page_end: the end page
 *
 * Pages in [@page_start,@page_end) have been populated to @chunk.  Update
 * the bookkeeping information accordingly.  Must be called after each
 * successful population.
 */
767static void pcpu_chunk_populated(struct pcpu_chunk *chunk,
768 int page_start, int page_end)
769{
770 int nr = page_end - page_start;
771
772 lockdep_assert_held(&pcpu_lock);
773
774 bitmap_set(chunk->populated, page_start, nr);
775 chunk->nr_populated += nr;
776 pcpu_nr_empty_pop_pages += nr;
777}
/**
 * pcpu_chunk_depopulated - post-depopulation bookkeeping
 * @chunk: pcpu_chunk which got depopulated
 * @page_start: the start page
 * @page_end: the end page
 *
 * Pages in [@page_start,@page_end) have been depopulated from @chunk.
 * Update the bookkeeping information accordingly.  Must be called after
 * each successful depopulation.
 */
789static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
790 int page_start, int page_end)
791{
792 int nr = page_end - page_start;
793
794 lockdep_assert_held(&pcpu_lock);
795
796 bitmap_clear(chunk->populated, page_start, nr);
797 chunk->nr_populated -= nr;
798 pcpu_nr_empty_pop_pages -= nr;
799}
/*
 * Chunk management implementation.
 *
 * To allow different implementations, chunk alloc/free and
 * [de]population are implemented in a separate file which is pulled
 * into this file and compiled together.  The following functions
 * should be implemented.
 *
 * pcpu_populate_chunk		- populate the specified range of a chunk
 * pcpu_depopulate_chunk	- depopulate the specified range of a chunk
 * pcpu_create_chunk		- create a new chunk
 * pcpu_destroy_chunk		- destroy a chunk, always preceded by full depop
 * pcpu_addr_to_page		- translate address to the page it maps to
 * pcpu_verify_alloc_info	- check alloc_info is acceptable during init
 */
816static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size);
817static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size);
818static struct pcpu_chunk *pcpu_create_chunk(void);
819static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
820static struct page *pcpu_addr_to_page(void *addr);
821static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
822
823#ifdef CONFIG_NEED_PER_CPU_KM
824#include "percpu-km.c"
825#else
826#include "percpu-vm.c"
827#endif
/**
 * pcpu_chunk_addr_search - determine chunk containing specified address
 * @addr: address for which the chunk needs to be determined.
 *
 * RETURNS:
 * The address of the found chunk.
 */
836static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
837{
838
839 if (pcpu_addr_in_first_chunk(addr)) {
840
841 if (pcpu_addr_in_reserved_chunk(addr))
842 return pcpu_reserved_chunk;
843 return pcpu_first_chunk;
844 }
 /*
  * The address is relative to unit0 which might be unused and
  * thus unmapped.  Offset the address to the unit space of the
  * current processor before looking it up in the vmalloc
  * space.  Note that any possible cpu id can be used here, so
  * there's no need to worry about preemption or cpu hotplug.
  */
853 addr += pcpu_unit_offsets[raw_smp_processor_id()];
854 return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
855}
/**
 * pcpu_alloc - the percpu allocator
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 * @reserved: allocate from the reserved chunk if available
 * @gfp: allocation flags
 *
 * Allocate percpu area of @size bytes aligned at @align.  If @gfp doesn't
 * contain %GFP_KERNEL, the allocation is atomic.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
872static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
873 gfp_t gfp)
874{
875 bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
876 bool do_warn = !(gfp & __GFP_NOWARN);
877 static int warn_limit = 10;
878 struct pcpu_chunk *chunk;
879 const char *err;
880 int occ_pages = 0;
881 int slot, off, new_alloc, cpu, ret;
882 unsigned long flags;
883 void __percpu *ptr;
884
 /*
  * We want the lowest bit of offset available for the in-use/free
  * indicator, so force >= 2 byte alignment and make size even.
  */
889 if (unlikely(align < 2))
890 align = 2;
891
892 size = ALIGN(size, 2);
893
894 if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
895 WARN(do_warn, "illegal size (%zu) or align (%zu) for "
896 "percpu allocation\n", size, align);
897 return NULL;
898 }
899
900 if (!is_atomic)
901 mutex_lock(&pcpu_alloc_mutex);
902
903 spin_lock_irqsave(&pcpu_lock, flags);
904
905
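 /* serve reserved allocations from the reserved chunk if available */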
906 if (reserved && pcpu_reserved_chunk) {
907 chunk = pcpu_reserved_chunk;
908
909 if (size > chunk->contig_hint) {
910 err = "alloc from reserved chunk failed";
911 goto fail_unlock;
912 }
913
914 while ((new_alloc = pcpu_need_to_extend(chunk, is_atomic))) {
915 spin_unlock_irqrestore(&pcpu_lock, flags);
916 if (is_atomic ||
917 pcpu_extend_area_map(chunk, new_alloc) < 0) {
918 err = "failed to extend area map of reserved chunk";
919 goto fail;
920 }
921 spin_lock_irqsave(&pcpu_lock, flags);
922 }
923
924 off = pcpu_alloc_area(chunk, size, align, is_atomic,
925 &occ_pages);
926 if (off >= 0)
927 goto area_found;
928
929 err = "alloc from reserved chunk failed";
930 goto fail_unlock;
931 }
932
933restart:
934
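 /* search through normal chunks starting from the smallest suitable slot */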
935 for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
936 list_for_each_entry(chunk, &pcpu_slot[slot], list) {
937 if (size > chunk->contig_hint)
938 continue;
939
940 new_alloc = pcpu_need_to_extend(chunk, is_atomic);
941 if (new_alloc) {
942 if (is_atomic)
943 continue;
944 spin_unlock_irqrestore(&pcpu_lock, flags);
945 if (pcpu_extend_area_map(chunk,
946 new_alloc) < 0) {
947 err = "failed to extend area map";
948 goto fail;
949 }
950 spin_lock_irqsave(&pcpu_lock, flags);
951
952
953
954
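 /*
  * pcpu_lock was dropped while extending the map, so the chunk
  * lists may have changed.  Restart the slot walk.
  */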
955 goto restart;
956 }
957
958 off = pcpu_alloc_area(chunk, size, align, is_atomic,
959 &occ_pages);
960 if (off >= 0)
961 goto area_found;
962 }
963 }
964
965 spin_unlock_irqrestore(&pcpu_lock, flags);
 /*
  * No space left in any chunk.  Atomic allocations can't create new
  * chunks, so fail them here.  Otherwise create a new chunk if the
  * slot for fully free chunks is still empty and retry.
  */
972 if (is_atomic)
973 goto fail;
974
975 if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
976 chunk = pcpu_create_chunk();
977 if (!chunk) {
978 err = "failed to allocate new chunk";
979 goto fail;
980 }
981
982 spin_lock_irqsave(&pcpu_lock, flags);
983 pcpu_chunk_relocate(chunk, -1);
984 } else {
985 spin_lock_irqsave(&pcpu_lock, flags);
986 }
987
988 goto restart;
989
990area_found:
991 spin_unlock_irqrestore(&pcpu_lock, flags);
992
993
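 /*
  * Non-atomic allocations populate any still-unpopulated pages
  * backing the area before returning it.
  */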
994 if (!is_atomic) {
995 int page_start, page_end, rs, re;
996
997 page_start = PFN_DOWN(off);
998 page_end = PFN_UP(off + size);
999
1000 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
1001 WARN_ON(chunk->immutable);
1002
1003 ret = pcpu_populate_chunk(chunk, rs, re);
1004
1005 spin_lock_irqsave(&pcpu_lock, flags);
1006 if (ret) {
1007 pcpu_free_area(chunk, off, &occ_pages);
1008 err = "failed to populate";
1009 goto fail_unlock;
1010 }
1011 pcpu_chunk_populated(chunk, rs, re);
1012 spin_unlock_irqrestore(&pcpu_lock, flags);
1013 }
1014
1015 mutex_unlock(&pcpu_alloc_mutex);
1016 }
1017
1018 if (chunk != pcpu_reserved_chunk)
1019 pcpu_nr_empty_pop_pages -= occ_pages;
1020
1021 if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
1022 pcpu_schedule_balance_work();
1023
1024
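 /* clear the area on each possible CPU and return the address relative to base */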
1025 for_each_possible_cpu(cpu)
1026 memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
1027
1028 ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
1029 kmemleak_alloc_percpu(ptr, size, gfp);
1030 return ptr;
1031
1032fail_unlock:
1033 spin_unlock_irqrestore(&pcpu_lock, flags);
1034fail:
1035 if (!is_atomic && do_warn && warn_limit) {
1036 pr_warning("PERCPU: allocation failed, size=%zu align=%zu atomic=%d, %s\n",
1037 size, align, is_atomic, err);
1038 dump_stack();
1039 if (!--warn_limit)
1040 pr_info("PERCPU: limit reached, disable warning\n");
1041 }
1042 if (is_atomic) {
1043
1044 pcpu_atomic_alloc_failed = true;
1045 pcpu_schedule_balance_work();
1046 } else {
1047 mutex_unlock(&pcpu_alloc_mutex);
1048 }
1049 return NULL;
1050}
/**
 * __alloc_percpu_gfp - allocate dynamic percpu area
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 * @gfp: allocation flags
 *
 * Allocate zero-filled percpu area of @size bytes aligned at @align.  If
 * @gfp doesn't contain %GFP_KERNEL, the allocation is atomic.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
1067void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp)
1068{
1069 return pcpu_alloc(size, align, false, gfp);
1070}
1071EXPORT_SYMBOL_GPL(__alloc_percpu_gfp);
/**
 * __alloc_percpu - allocate dynamic percpu area
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 *
 * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL).
 */
1080void __percpu *__alloc_percpu(size_t size, size_t align)
1081{
1082 return pcpu_alloc(size, align, false, GFP_KERNEL);
1083}
1084EXPORT_SYMBOL_GPL(__alloc_percpu);
1085
/**
 * __alloc_reserved_percpu - allocate reserved percpu area
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 *
 * Allocate zero-filled percpu area of @size bytes aligned at @align
 * from the reserved percpu area if the arch has set it up; otherwise,
 * allocation is served from the same dynamic area.  Might sleep.
 *
 * CONTEXT:
 * Does GFP_KERNEL allocation.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
1102void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
1103{
1104 return pcpu_alloc(size, align, true, GFP_KERNEL);
1105}
/**
 * pcpu_balance_workfn - manage the amount of free chunks and populated pages
 * @work: unused
 *
 * Reclaim all fully free chunks except for the first one, service async
 * area map extensions and keep enough empty populated pages around for
 * atomic allocations.
 */
1113static void pcpu_balance_workfn(struct work_struct *work)
1114{
1115 LIST_HEAD(to_free);
1116 struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
1117 struct pcpu_chunk *chunk, *next;
1118 int slot, nr_to_pop, ret;
 /*
  * There's no reason to keep around multiple unused chunks and VM
  * areas can be scarce.  Destroy all free chunks except for one.
  */
1124 mutex_lock(&pcpu_alloc_mutex);
1125 spin_lock_irq(&pcpu_lock);
1126
1127 list_for_each_entry_safe(chunk, next, free_head, list) {
1128 WARN_ON(chunk->immutable);
1129
1130
1131 if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
1132 continue;
1133
1134 list_del_init(&chunk->map_extend_list);
1135 list_move(&chunk->list, &to_free);
1136 }
1137
1138 spin_unlock_irq(&pcpu_lock);
1139
1140 list_for_each_entry_safe(chunk, next, &to_free, list) {
1141 int rs, re;
1142
1143 pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) {
1144 pcpu_depopulate_chunk(chunk, rs, re);
1145 spin_lock_irq(&pcpu_lock);
1146 pcpu_chunk_depopulated(chunk, rs, re);
1147 spin_unlock_irq(&pcpu_lock);
1148 }
1149 pcpu_destroy_chunk(chunk);
1150 }
1151
1152
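 /* service chunks which requested async area map extension */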
1153 do {
1154 int new_alloc = 0;
1155
1156 spin_lock_irq(&pcpu_lock);
1157
1158 chunk = list_first_entry_or_null(&pcpu_map_extend_chunks,
1159 struct pcpu_chunk, map_extend_list);
1160 if (chunk) {
1161 list_del_init(&chunk->map_extend_list);
1162 new_alloc = pcpu_need_to_extend(chunk, false);
1163 }
1164
1165 spin_unlock_irq(&pcpu_lock);
1166
1167 if (new_alloc)
1168 pcpu_extend_area_map(chunk, new_alloc);
1169 } while (chunk);
 /*
  * Ensure there are a certain number of empty populated pages for
  * atomic allocs.  Fill up from the most packed slots so that atomic
  * allocs don't increase fragmentation.  Ideally allocation should be
  * done from a chunk with the highest free count, but that may be too
  * expensive.
  *
  * If an atomic allocation failed previously, always populate the
  * maximum amount.  This should prevent atomic allocs larger than
  * PAGE_SIZE from keeping on failing.
  */
1181retry_pop:
1182 if (pcpu_atomic_alloc_failed) {
1183 nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
1184
1185 pcpu_atomic_alloc_failed = false;
1186 } else {
1187 nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
1188 pcpu_nr_empty_pop_pages,
1189 0, PCPU_EMPTY_POP_PAGES_HIGH);
1190 }
1191
1192 for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
1193 int nr_unpop = 0, rs, re;
1194
1195 if (!nr_to_pop)
1196 break;
1197
1198 spin_lock_irq(&pcpu_lock);
1199 list_for_each_entry(chunk, &pcpu_slot[slot], list) {
1200 nr_unpop = pcpu_unit_pages - chunk->nr_populated;
1201 if (nr_unpop)
1202 break;
1203 }
1204 spin_unlock_irq(&pcpu_lock);
1205
1206 if (!nr_unpop)
1207 continue;
1208
1209
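 /* @chunk can't go away while pcpu_alloc_mutex is held */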
1210 pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) {
1211 int nr = min(re - rs, nr_to_pop);
1212
1213 ret = pcpu_populate_chunk(chunk, rs, rs + nr);
1214 if (!ret) {
1215 nr_to_pop -= nr;
1216 spin_lock_irq(&pcpu_lock);
1217 pcpu_chunk_populated(chunk, rs, rs + nr);
1218 spin_unlock_irq(&pcpu_lock);
1219 } else {
1220 nr_to_pop = 0;
1221 }
1222
1223 if (!nr_to_pop)
1224 break;
1225 }
1226 }
1227
1228 if (nr_to_pop) {
1229
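 /* ran out of chunks to populate; create a new one and retry */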
1230 chunk = pcpu_create_chunk();
1231 if (chunk) {
1232 spin_lock_irq(&pcpu_lock);
1233 pcpu_chunk_relocate(chunk, -1);
1234 spin_unlock_irq(&pcpu_lock);
1235 goto retry_pop;
1236 }
1237 }
1238
1239 mutex_unlock(&pcpu_alloc_mutex);
1240}
1241
/**
 * free_percpu - free percpu area
 * @ptr: pointer to area to free
 *
 * Free percpu area @ptr.
 *
 * CONTEXT:
 * Can be called from atomic context.
 */
1251void free_percpu(void __percpu *ptr)
1252{
1253 void *addr;
1254 struct pcpu_chunk *chunk;
1255 unsigned long flags;
1256 int off, occ_pages;
1257
1258 if (!ptr)
1259 return;
1260
1261 kmemleak_free_percpu(ptr);
1262
1263 addr = __pcpu_ptr_to_addr(ptr);
1264
1265 spin_lock_irqsave(&pcpu_lock, flags);
1266
1267 chunk = pcpu_chunk_addr_search(addr);
1268 off = addr - chunk->base_addr;
1269
1270 pcpu_free_area(chunk, off, &occ_pages);
1271
1272 if (chunk != pcpu_reserved_chunk)
1273 pcpu_nr_empty_pop_pages += occ_pages;
1274
1275
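 /*
  * If the chunk is now fully free and another fully free chunk
  * already exists, schedule the balance work to reclaim it.
  */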
1276 if (chunk->free_size == pcpu_unit_size) {
1277 struct pcpu_chunk *pos;
1278
1279 list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
1280 if (pos != chunk) {
1281 pcpu_schedule_balance_work();
1282 break;
1283 }
1284 }
1285
1286 spin_unlock_irqrestore(&pcpu_lock, flags);
1287}
1288EXPORT_SYMBOL_GPL(free_percpu);
1289
/**
 * is_kernel_percpu_address - test whether address is from static percpu area
 * @addr: address to test
 *
 * Test whether @addr belongs to in-kernel static percpu area.  Module
 * static percpu areas are not considered.  For those, use
 * is_module_percpu_address().
 *
 * RETURNS:
 * %true if @addr is from in-kernel static percpu area, %false otherwise.
 */
1301bool is_kernel_percpu_address(unsigned long addr)
1302{
1303#ifdef CONFIG_SMP
1304 const size_t static_size = __per_cpu_end - __per_cpu_start;
1305 void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
1306 unsigned int cpu;
1307
1308 for_each_possible_cpu(cpu) {
1309 void *start = per_cpu_ptr(base, cpu);
1310
1311 if ((void *)addr >= start && (void *)addr < start + static_size)
1312 return true;
1313 }
1314#endif
1315
1316 return false;
1317}
1318
/**
 * per_cpu_ptr_to_phys - convert translated percpu address to physical address
 * @addr: the address to be converted to physical address
 *
 * Given @addr which is dereferenceable address obtained via one of the
 * percpu access macros, this function translates it into its physical
 * address.  The caller is responsible for ensuring @addr stays valid
 * until this function finishes.
 *
 * The percpu allocator has special setup for the first chunk, which
 * currently supports either embedding in the linear address space or
 * vmalloc mapping; from the second chunk on, the backing allocator
 * (currently either vm or km) provides the translation.
 *
 * RETURNS:
 * The physical address for @addr.
 */
1342phys_addr_t per_cpu_ptr_to_phys(void *addr)
1343{
1344 void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
1345 bool in_first_chunk = false;
1346 unsigned long first_low, first_high;
1347 unsigned int cpu;
1348
1349
1350
1351
1352
1353
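 /*
  * The test against first_low/first_high isn't strictly necessary
  * but speeds up lookups of addresses which aren't in the first chunk.
  */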
1354 first_low = pcpu_chunk_addr(pcpu_first_chunk, pcpu_low_unit_cpu, 0);
1355 first_high = pcpu_chunk_addr(pcpu_first_chunk, pcpu_high_unit_cpu,
1356 pcpu_unit_pages);
1357 if ((unsigned long)addr >= first_low &&
1358 (unsigned long)addr < first_high) {
1359 for_each_possible_cpu(cpu) {
1360 void *start = per_cpu_ptr(base, cpu);
1361
1362 if (addr >= start && addr < start + pcpu_unit_size) {
1363 in_first_chunk = true;
1364 break;
1365 }
1366 }
1367 }
1368
1369 if (in_first_chunk) {
1370 if (!is_vmalloc_addr(addr))
1371 return __pa(addr);
1372 else
1373 return page_to_phys(vmalloc_to_page(addr)) +
1374 offset_in_page(addr);
1375 } else
1376 return page_to_phys(pcpu_addr_to_page(addr)) +
1377 offset_in_page(addr);
1378}
1379
/**
 * pcpu_alloc_alloc_info - allocate percpu allocation info
 * @nr_groups: the number of groups
 * @nr_units: the number of units
 *
 * Allocate ai which is large enough for @nr_groups groups containing
 * @nr_units units.  The returned ai's groups[0].cpu_map points to the
 * cpu_map array which is long enough for @nr_units and filled with
 * NR_CPUS.  It's the caller's responsibility to initialize the cpu_map
 * pointers of other groups.
 *
 * RETURNS:
 * Pointer to the allocated pcpu_alloc_info on success, NULL on
 * failure.
 */
1395struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
1396 int nr_units)
1397{
1398 struct pcpu_alloc_info *ai;
1399 size_t base_size, ai_size;
1400 void *ptr;
1401 int unit;
1402
1403 base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]),
1404 __alignof__(ai->groups[0].cpu_map[0]));
1405 ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
1406
1407 ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size));
1408 if (!ptr)
1409 return NULL;
1410 ai = ptr;
1411 ptr += base_size;
1412
1413 ai->groups[0].cpu_map = ptr;
1414
1415 for (unit = 0; unit < nr_units; unit++)
1416 ai->groups[0].cpu_map[unit] = NR_CPUS;
1417
1418 ai->nr_groups = nr_groups;
1419 ai->__ai_size = PFN_ALIGN(ai_size);
1420
1421 return ai;
1422}
/**
 * pcpu_free_alloc_info - free percpu allocation info
 * @ai: pcpu_alloc_info to free
 *
 * Free @ai which was allocated by pcpu_alloc_alloc_info().
 */
1430void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
1431{
1432 free_bootmem(__pa(ai), ai->__ai_size);
1433}
/**
 * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
 * @lvl: loglevel
 * @ai: allocation info to dump
 *
 * Print out information about @ai using loglevel @lvl.
 */
1442static void pcpu_dump_alloc_info(const char *lvl,
1443 const struct pcpu_alloc_info *ai)
1444{
1445 int group_width = 1, cpu_width = 1, width;
1446 char empty_str[] = "--------";
1447 int alloc = 0, alloc_end = 0;
1448 int group, v;
1449 int upa, apl;
1450
1451 v = ai->nr_groups;
1452 while (v /= 10)
1453 group_width++;
1454
1455 v = num_possible_cpus();
1456 while (v /= 10)
1457 cpu_width++;
1458 empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';
1459
1460 upa = ai->alloc_size / ai->unit_size;
1461 width = upa * (cpu_width + 1) + group_width + 3;
1462 apl = rounddown_pow_of_two(max(60 / width, 1));
1463
1464 printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
1465 lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
1466 ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);
1467
1468 for (group = 0; group < ai->nr_groups; group++) {
1469 const struct pcpu_group_info *gi = &ai->groups[group];
1470 int unit = 0, unit_end = 0;
1471
1472 BUG_ON(gi->nr_units % upa);
1473 for (alloc_end += gi->nr_units / upa;
1474 alloc < alloc_end; alloc++) {
1475 if (!(alloc % apl)) {
1476 printk(KERN_CONT "\n");
1477 printk("%spcpu-alloc: ", lvl);
1478 }
1479 printk(KERN_CONT "[%0*d] ", group_width, group);
1480
1481 for (unit_end += upa; unit < unit_end; unit++)
1482 if (gi->cpu_map[unit] != NR_CPUS)
1483 printk(KERN_CONT "%0*d ", cpu_width,
1484 gi->cpu_map[unit]);
1485 else
1486 printk(KERN_CONT "%s ", empty_str);
1487 }
1488 }
1489 printk(KERN_CONT "\n");
1490}
/**
 * pcpu_setup_first_chunk - initialize the first percpu chunk
 * @ai: pcpu_alloc_info describing how the percpu area is shaped
 * @base_addr: mapped address
 *
 * Initialize the first percpu chunk which contains the kernel static
 * percpu area.  This function is to be called from the arch percpu
 * area setup path.
 *
 * @ai contains all information necessary to initialize the first
 * chunk and prime the dynamic percpu allocator: static_size is the
 * size of the static percpu area; reserved_size, if non-zero, is the
 * amount of bytes to serve reserved (e.g. module static) percpu
 * allocations; dyn_size is the number of bytes available for dynamic
 * allocation in the first chunk; unit_size must be page aligned and
 * at least static_size + reserved_size + dyn_size; and the groups
 * array describes how units map to CPUs.
 *
 * If the first chunk ends up with both reserved and dynamic areas, it
 * is served by two chunks - one for the static and reserved areas and
 * the other for the dynamic area.  They share the same pages but use
 * separate area maps.  The latter chunk is circulated in the chunk
 * slots and available for dynamic allocation like any other chunk.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
1548int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1549 void *base_addr)
1550{
1551 static char cpus_buf[4096] __initdata;
1552 static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
1553 static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
1554 size_t dyn_size = ai->dyn_size;
1555 size_t size_sum = ai->static_size + ai->reserved_size + dyn_size;
1556 struct pcpu_chunk *schunk, *dchunk = NULL;
1557 unsigned long *group_offsets;
1558 size_t *group_sizes;
1559 unsigned long *unit_off;
1560 unsigned int cpu;
1561 int *unit_map;
1562 int group, unit, i;
1563
1564 cpumask_scnprintf(cpus_buf, sizeof(cpus_buf), cpu_possible_mask);
1565
1566#define PCPU_SETUP_BUG_ON(cond) do { \
1567 if (unlikely(cond)) { \
1568 pr_emerg("PERCPU: failed to initialize, %s", #cond); \
1569 pr_emerg("PERCPU: cpu_possible_mask=%s\n", cpus_buf); \
1570 pcpu_dump_alloc_info(KERN_EMERG, ai); \
1571 BUG(); \
1572 } \
1573} while (0)
1574
1575
1576 PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
1577#ifdef CONFIG_SMP
1578 PCPU_SETUP_BUG_ON(!ai->static_size);
1579 PCPU_SETUP_BUG_ON((unsigned long)__per_cpu_start & ~PAGE_MASK);
1580#endif
1581 PCPU_SETUP_BUG_ON(!base_addr);
1582 PCPU_SETUP_BUG_ON((unsigned long)base_addr & ~PAGE_MASK);
1583 PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
1584 PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);
1585 PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
1586 PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
1587 PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
1588
1589
1590 group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0]));
1591 group_sizes = alloc_bootmem(ai->nr_groups * sizeof(group_sizes[0]));
1592 unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0]));
1593 unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0]));
1594
1595 for (cpu = 0; cpu < nr_cpu_ids; cpu++)
1596 unit_map[cpu] = UINT_MAX;
1597
1598 pcpu_low_unit_cpu = NR_CPUS;
1599 pcpu_high_unit_cpu = NR_CPUS;
1600
1601 for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
1602 const struct pcpu_group_info *gi = &ai->groups[group];
1603
1604 group_offsets[group] = gi->base_offset;
1605 group_sizes[group] = gi->nr_units * ai->unit_size;
1606
1607 for (i = 0; i < gi->nr_units; i++) {
1608 cpu = gi->cpu_map[i];
1609 if (cpu == NR_CPUS)
1610 continue;
1611
1612 PCPU_SETUP_BUG_ON(cpu > nr_cpu_ids);
1613 PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
1614 PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);
1615
1616 unit_map[cpu] = unit + i;
1617 unit_off[cpu] = gi->base_offset + i * ai->unit_size;
1618
1619
1620 if (pcpu_low_unit_cpu == NR_CPUS ||
1621 unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
1622 pcpu_low_unit_cpu = cpu;
1623 if (pcpu_high_unit_cpu == NR_CPUS ||
1624 unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
1625 pcpu_high_unit_cpu = cpu;
1626 }
1627 }
1628 pcpu_nr_units = unit;
1629
1630 for_each_possible_cpu(cpu)
1631 PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);
1632
1633
1634#undef PCPU_SETUP_BUG_ON
1635 pcpu_dump_alloc_info(KERN_DEBUG, ai);
1636
1637 pcpu_nr_groups = ai->nr_groups;
1638 pcpu_group_offsets = group_offsets;
1639 pcpu_group_sizes = group_sizes;
1640 pcpu_unit_map = unit_map;
1641 pcpu_unit_offsets = unit_off;
1642
1643
1644 pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
1645 pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
1646 pcpu_atom_size = ai->atom_size;
1647 pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
1648 BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
1649
1650
1651
1652
1653
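 /*
  * Allocate chunk slots.  The additional last slot is for
  * completely free chunks.
  */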
1654 pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
1655 pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0]));
1656 for (i = 0; i < pcpu_nr_slots; i++)
1657 INIT_LIST_HEAD(&pcpu_slot[i]);
 /*
  * Initialize the static chunk.  If reserved_size is zero, the
  * static chunk covers the static area + dynamic allocation area
  * in the first chunk.  If reserved_size is not zero, it covers
  * the static area + reserved area (mostly used for module static
  * percpu allocation).
  */
1666 schunk = alloc_bootmem(pcpu_chunk_struct_size);
1667 INIT_LIST_HEAD(&schunk->list);
1668 INIT_LIST_HEAD(&schunk->map_extend_list);
1669 schunk->base_addr = base_addr;
1670 schunk->map = smap;
1671 schunk->map_alloc = ARRAY_SIZE(smap);
1672 schunk->immutable = true;
1673 bitmap_fill(schunk->populated, pcpu_unit_pages);
1674 schunk->nr_populated = pcpu_unit_pages;
1675
1676 if (ai->reserved_size) {
1677 schunk->free_size = ai->reserved_size;
1678 pcpu_reserved_chunk = schunk;
1679 pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size;
1680 } else {
1681 schunk->free_size = dyn_size;
1682 dyn_size = 0;
1683 }
1684 schunk->contig_hint = schunk->free_size;
1685
1686 schunk->map[0] = 1;
1687 schunk->map[1] = ai->static_size;
1688 schunk->map_used = 1;
1689 if (schunk->free_size)
1690 schunk->map[++schunk->map_used] = 1 | (ai->static_size + schunk->free_size);
1691 else
1692 schunk->map[1] |= 1;
1693
1694
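 /* init the dynamic chunk if necessary */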
1695 if (dyn_size) {
1696 dchunk = alloc_bootmem(pcpu_chunk_struct_size);
1697 INIT_LIST_HEAD(&dchunk->list);
1698 INIT_LIST_HEAD(&dchunk->map_extend_list);
1699 dchunk->base_addr = base_addr;
1700 dchunk->map = dmap;
1701 dchunk->map_alloc = ARRAY_SIZE(dmap);
1702 dchunk->immutable = true;
1703 bitmap_fill(dchunk->populated, pcpu_unit_pages);
1704 dchunk->nr_populated = pcpu_unit_pages;
1705
1706 dchunk->contig_hint = dchunk->free_size = dyn_size;
1707 dchunk->map[0] = 1;
1708 dchunk->map[1] = pcpu_reserved_chunk_limit;
1709 dchunk->map[2] = (pcpu_reserved_chunk_limit + dchunk->free_size) | 1;
1710 dchunk->map_used = 2;
1711 }
1712
1713
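 /* link the first chunk in; its free area contributes empty populated pages */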
1714 pcpu_first_chunk = dchunk ?: schunk;
1715 pcpu_nr_empty_pop_pages +=
1716 pcpu_count_occupied_pages(pcpu_first_chunk, 1);
1717 pcpu_chunk_relocate(pcpu_first_chunk, -1);
1718
1719
1720 pcpu_base_addr = base_addr;
1721 return 0;
1722}
1723
1724#ifdef CONFIG_SMP
1725
1726const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = {
1727 [PCPU_FC_AUTO] = "auto",
1728 [PCPU_FC_EMBED] = "embed",
1729 [PCPU_FC_PAGE] = "page",
1730};
1731
1732enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
1733
1734static int __init percpu_alloc_setup(char *str)
1735{
1736 if (!str)
1737 return -EINVAL;
1738
1739 if (0)
1740 ;
1741#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
1742 else if (!strcmp(str, "embed"))
1743 pcpu_chosen_fc = PCPU_FC_EMBED;
1744#endif
1745#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
1746 else if (!strcmp(str, "page"))
1747 pcpu_chosen_fc = PCPU_FC_PAGE;
1748#endif
1749 else
1750 pr_warning("PERCPU: unknown allocator %s specified\n", str);
1751
1752 return 0;
1753}
1754early_param("percpu_alloc", percpu_alloc_setup);
1755
1756
1757
1758
1759
1760
1761#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
1762 !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
1763#define BUILD_EMBED_FIRST_CHUNK
1764#endif
1765
1766
1767#if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
1768#define BUILD_PAGE_FIRST_CHUNK
1769#endif
1770
1771
1772#if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)
/**
 * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
 * @reserved_size: the size of reserved percpu area in bytes
 * @dyn_size: minimum free size for dynamic allocation in bytes
 * @atom_size: allocation atom size
 * @cpu_distance_fn: callback to determine distance between cpus, optional
 *
 * This function determines grouping of units, their mappings to cpus
 * and other parameters considering the needed percpu size, allocation
 * atom size and distances between CPUs.
 *
 * Groups are always multiples of atom size and CPUs which are of
 * LOCAL_DISTANCE both ways are grouped together and share space for
 * units in the same group.  The returned configuration is guaranteed
 * to have CPUs on different nodes in different groups and >=75% usage
 * of the allocated virtual address space.
 *
 * RETURNS:
 * On success, pointer to the new allocation_info is returned.  On
 * failure, ERR_PTR value is returned.
 */
1794static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
1795 size_t reserved_size, size_t dyn_size,
1796 size_t atom_size,
1797 pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
1798{
1799 static int group_map[NR_CPUS] __initdata;
1800 static int group_cnt[NR_CPUS] __initdata;
1801 const size_t static_size = __per_cpu_end - __per_cpu_start;
1802 int nr_groups = 1, nr_units = 0;
1803 size_t size_sum, min_unit_size, alloc_size;
1804 int upa, max_upa, uninitialized_var(best_upa);
1805 int last_allocs, group, unit;
1806 unsigned int cpu, tcpu;
1807 struct pcpu_alloc_info *ai;
1808 unsigned int *cpu_map;
1809
1810
1811 memset(group_map, 0, sizeof(group_map));
1812 memset(group_cnt, 0, sizeof(group_cnt));
1813
1814
1815 size_sum = PFN_ALIGN(static_size + reserved_size +
1816 max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
1817 dyn_size = size_sum - static_size - reserved_size;
 /*
  * Determine min_unit_size, alloc_size and max_upa such that
  * alloc_size is a multiple of atom_size and is the smallest
  * which can accommodate 4k aligned segments which are equal to
  * or larger than min_unit_size.
  */
1825 min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
1826
1827 alloc_size = roundup(min_unit_size, atom_size);
1828 upa = alloc_size / min_unit_size;
1829 while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
1830 upa--;
1831 max_upa = upa;
1832
1833
1834 for_each_possible_cpu(cpu) {
1835 group = 0;
1836 next_group:
1837 for_each_possible_cpu(tcpu) {
1838 if (cpu == tcpu)
1839 break;
1840 if (group_map[tcpu] == group && cpu_distance_fn &&
1841 (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
1842 cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
1843 group++;
1844 nr_groups = max(nr_groups, group + 1);
1845 goto next_group;
1846 }
1847 }
1848 group_map[cpu] = group;
1849 group_cnt[group]++;
1850 }
 /*
  * Expand the unit size until address space usage goes over 75%
  * and then as much as possible without using more address
  * space.
  */
1857 last_allocs = INT_MAX;
1858 for (upa = max_upa; upa; upa--) {
1859 int allocs = 0, wasted = 0;
1860
1861 if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
1862 continue;
1863
1864 for (group = 0; group < nr_groups; group++) {
1865 int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
1866 allocs += this_allocs;
1867 wasted += this_allocs * upa - group_cnt[group];
1868 }
1869
1870
1871
1872
1873
1874
1875 if (wasted > num_possible_cpus() / 3)
1876 continue;
1877
1878
1879 if (allocs > last_allocs)
1880 break;
1881 last_allocs = allocs;
1882 best_upa = upa;
1883 }
1884 upa = best_upa;
1885
1886
1887 for (group = 0; group < nr_groups; group++)
1888 nr_units += roundup(group_cnt[group], upa);
1889
1890 ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
1891 if (!ai)
1892 return ERR_PTR(-ENOMEM);
1893 cpu_map = ai->groups[0].cpu_map;
1894
1895 for (group = 0; group < nr_groups; group++) {
1896 ai->groups[group].cpu_map = cpu_map;
1897 cpu_map += roundup(group_cnt[group], upa);
1898 }
1899
1900 ai->static_size = static_size;
1901 ai->reserved_size = reserved_size;
1902 ai->dyn_size = dyn_size;
1903 ai->unit_size = alloc_size / upa;
1904 ai->atom_size = atom_size;
1905 ai->alloc_size = alloc_size;
1906
1907 for (group = 0, unit = 0; group_cnt[group]; group++) {
1908 struct pcpu_group_info *gi = &ai->groups[group];
1909
1910
1911
1912
1913
1914
1915 gi->base_offset = unit * ai->unit_size;
1916
1917 for_each_possible_cpu(cpu)
1918 if (group_map[cpu] == group)
1919 gi->cpu_map[gi->nr_units++] = cpu;
1920 gi->nr_units = roundup(gi->nr_units, upa);
1921 unit += gi->nr_units;
1922 }
1923 BUG_ON(unit != nr_units);
1924
1925 return ai;
1926}
1927#endif
1928
1929#if defined(BUILD_EMBED_FIRST_CHUNK)
/**
 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
 * @reserved_size: the size of reserved percpu area in bytes
 * @dyn_size: minimum free size for dynamic allocation in bytes
 * @atom_size: allocation atom size
 * @cpu_distance_fn: callback to determine distance between cpus, optional
 * @alloc_fn: function to allocate percpu page
 * @free_fn: function to free percpu page
 *
 * This is a helper to ease setting up an embedded first percpu chunk
 * and can be called where pcpu_setup_first_chunk() is expected.
 *
 * If this function is used to set up the first chunk, it is allocated
 * by calling @alloc_fn and used as-is without being mapped into the
 * vmalloc area.  Allocations are always whole multiples of @atom_size
 * aligned to @atom_size.
 *
 * This enables the first chunk to piggy back on the linear physical
 * mapping which often uses larger page sizes.  Note that this can
 * result in very sparse cpu->unit mapping on NUMA machines, thus
 * requiring large vmalloc address space.  Don't use this allocator if
 * vmalloc space is not orders of magnitude larger than the distances
 * between node memory addresses (ie. 32bit NUMA machines).
 *
 * @dyn_size specifies the minimum dynamic area size.  If the needed
 * size is smaller than the allocated unit size, the leftover is
 * returned using @free_fn.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
1962int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
1963 size_t atom_size,
1964 pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
1965 pcpu_fc_alloc_fn_t alloc_fn,
1966 pcpu_fc_free_fn_t free_fn)
1967{
1968 void *base = (void *)ULONG_MAX;
1969 void **areas = NULL;
1970 struct pcpu_alloc_info *ai;
1971 size_t size_sum, areas_size, max_distance;
1972 int group, i, rc;
1973
1974 ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
1975 cpu_distance_fn);
1976 if (IS_ERR(ai))
1977 return PTR_ERR(ai);
1978
1979 size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
1980 areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));
1981
1982 areas = alloc_bootmem_nopanic(areas_size);
1983 if (!areas) {
1984 rc = -ENOMEM;
1985 goto out_free;
1986 }
1987
1988
1989 for (group = 0; group < ai->nr_groups; group++) {
1990 struct pcpu_group_info *gi = &ai->groups[group];
1991 unsigned int cpu = NR_CPUS;
1992 void *ptr;
1993
1994 for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
1995 cpu = gi->cpu_map[i];
1996 BUG_ON(cpu == NR_CPUS);
1997
1998
1999 ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
2000 if (!ptr) {
2001 rc = -ENOMEM;
2002 goto out_free_areas;
2003 }
2004
2005 kmemleak_free(ptr);
2006 areas[group] = ptr;
2007
2008 base = min(ptr, base);
2009 }
2010
2011
2012
2013
2014
2015
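 /*
  * Copy data and free unused parts.  This should happen after all
  * allocations are complete; otherwise, we may end up with
  * overlapping groups.
  */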
2016 for (group = 0; group < ai->nr_groups; group++) {
2017 struct pcpu_group_info *gi = &ai->groups[group];
2018 void *ptr = areas[group];
2019
2020 for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
2021 if (gi->cpu_map[i] == NR_CPUS) {
2022
2023 free_fn(ptr, ai->unit_size);
2024 continue;
2025 }
2026
2027 memcpy(ptr, __per_cpu_load, ai->static_size);
2028 free_fn(ptr + size_sum, ai->unit_size - size_sum);
2029 }
2030 }
2031
2032
2033 max_distance = 0;
2034 for (group = 0; group < ai->nr_groups; group++) {
2035 ai->groups[group].base_offset = areas[group] - base;
2036 max_distance = max_t(size_t, max_distance,
2037 ai->groups[group].base_offset);
2038 }
2039 max_distance += ai->unit_size;
2040
2041
2042 if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) {
2043 pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc "
2044 "space 0x%lx\n", max_distance,
2045 (unsigned long)(VMALLOC_END - VMALLOC_START));
2046#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
2047
2048 rc = -EINVAL;
2049 goto out_free;
2050#endif
2051 }
2052
2053 pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
2054 PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
2055 ai->dyn_size, ai->unit_size);
2056
2057 rc = pcpu_setup_first_chunk(ai, base);
2058 goto out_free;
2059
2060out_free_areas:
2061 for (group = 0; group < ai->nr_groups; group++)
2062 free_fn(areas[group],
2063 ai->groups[group].nr_units * ai->unit_size);
2064out_free:
2065 pcpu_free_alloc_info(ai);
2066 if (areas)
2067 free_bootmem(__pa(areas), areas_size);
2068 return rc;
2069}
2070#endif
2071
2072#ifdef BUILD_PAGE_FIRST_CHUNK
/**
 * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
 * @reserved_size: the size of reserved percpu area in bytes
 * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
 * @free_fn: function to free percpu page, always called with PAGE_SIZE
 * @populate_pte_fn: function to populate pte
 *
 * This is a helper to ease setting up a page-remapped first percpu
 * chunk and can be called where pcpu_setup_first_chunk() is expected.
 *
 * This is the basic allocator.  The static percpu area is allocated
 * page-by-page into the vmalloc area.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
2089int __init pcpu_page_first_chunk(size_t reserved_size,
2090 pcpu_fc_alloc_fn_t alloc_fn,
2091 pcpu_fc_free_fn_t free_fn,
2092 pcpu_fc_populate_pte_fn_t populate_pte_fn)
2093{
2094 static struct vm_struct vm;
2095 struct pcpu_alloc_info *ai;
2096 char psize_str[16];
2097 int unit_pages;
2098 size_t pages_size;
2099 struct page **pages;
2100 int unit, i, j, rc;
2101
2102 snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
2103
2104 ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL);
2105 if (IS_ERR(ai))
2106 return PTR_ERR(ai);
2107 BUG_ON(ai->nr_groups != 1);
2108 BUG_ON(ai->groups[0].nr_units != num_possible_cpus());
2109
2110 unit_pages = ai->unit_size >> PAGE_SHIFT;
2111
2112
2113 pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
2114 sizeof(pages[0]));
2115 pages = alloc_bootmem(pages_size);
2116
2117
2118 j = 0;
2119 for (unit = 0; unit < num_possible_cpus(); unit++)
2120 for (i = 0; i < unit_pages; i++) {
2121 unsigned int cpu = ai->groups[0].cpu_map[unit];
2122 void *ptr;
2123
2124 ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
2125 if (!ptr) {
2126 pr_warning("PERCPU: failed to allocate %s page "
2127 "for cpu%u\n", psize_str, cpu);
2128 goto enomem;
2129 }
2130
2131 kmemleak_free(ptr);
2132 pages[j++] = virt_to_page(ptr);
2133 }
2134
2135
2136 vm.flags = VM_ALLOC;
2137 vm.size = num_possible_cpus() * ai->unit_size;
2138 vm_area_register_early(&vm, PAGE_SIZE);
2139
2140 for (unit = 0; unit < num_possible_cpus(); unit++) {
2141 unsigned long unit_addr =
2142 (unsigned long)vm.addr + unit * ai->unit_size;
2143
2144 for (i = 0; i < unit_pages; i++)
2145 populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
2146
2147
2148 rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
2149 unit_pages);
2150 if (rc < 0)
2151 panic("failed to map percpu area, err=%d\n", rc);
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162 memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
2163 }
2164
2165
2166 pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n",
2167 unit_pages, psize_str, vm.addr, ai->static_size,
2168 ai->reserved_size, ai->dyn_size);
2169
2170 rc = pcpu_setup_first_chunk(ai, vm.addr);
2171 goto out_free_ar;
2172
2173enomem:
2174 while (--j >= 0)
2175 free_fn(page_address(pages[j]), PAGE_SIZE);
2176 rc = -ENOMEM;
2177out_free_ar:
2178 free_bootmem(__pa(pages), pages_size);
2179 pcpu_free_alloc_info(ai);
2180 return rc;
2181}
2182#endif
2183
2184#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
/*
 * Generic SMP percpu area setup.
 *
 * The embedding helper is used because its behavior closely resembles
 * the original non-dynamic generic percpu area setup.  This is
 * important because many archs have addressing restrictions and might
 * fail if the percpu area is located far away from the previous
 * location.  As an added bonus, in non-NUMA cases, embedding is
 * generally a good idea TLB-wise because the percpu area can piggy
 * back on the physical linear memory mapping which uses large page
 * mappings on applicable archs.
 */
2197unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
2198EXPORT_SYMBOL(__per_cpu_offset);
2199
2200static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
2201 size_t align)
2202{
2203 return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS));
2204}
2205
2206static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
2207{
2208 free_bootmem(__pa(ptr), size);
2209}
2210
2211void __init setup_per_cpu_areas(void)
2212{
2213 unsigned long delta;
2214 unsigned int cpu;
2215 int rc;
2216
2217
2218
2219
2220
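 /*
  * Always reserve area for module percpu variables.  That's
  * what the legacy allocator did.
  */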
2221 rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
2222 PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
2223 pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
2224 if (rc < 0)
2225 panic("Failed to initialize percpu areas.");
2226
2227 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
2228 for_each_possible_cpu(cpu)
2229 __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
2230}
2231#endif
2232
2233#else
/*
 * UP percpu area setup.
 *
 * UP always uses the km-based percpu allocator with identity mapping.
 * Static percpu variables are indistinguishable from the usual static
 * variables and don't require any special preparation.
 */
2242void __init setup_per_cpu_areas(void)
2243{
2244 const size_t unit_size =
2245 roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
2246 PERCPU_DYNAMIC_RESERVE));
2247 struct pcpu_alloc_info *ai;
2248 void *fc;
2249
2250 ai = pcpu_alloc_alloc_info(1, 1);
2251 fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
2252 if (!ai || !fc)
2253 panic("Failed to allocate memory for percpu areas.");
2254
2255 kmemleak_free(fc);
2256
2257 ai->dyn_size = unit_size;
2258 ai->unit_size = unit_size;
2259 ai->atom_size = unit_size;
2260 ai->alloc_size = unit_size;
2261 ai->groups[0].nr_units = 1;
2262 ai->groups[0].cpu_map[0] = 0;
2263
2264 if (pcpu_setup_first_chunk(ai, fc) < 0)
2265 panic("Failed to initialize percpu areas.");
2266}
2267
2268#endif
/*
 * First and reserved chunks are initialized with temporary allocation
 * maps in initdata so that they can be used before slab is online.
 * This function is called after slab is brought up and replaces those
 * with properly allocated maps.
 */
2276void __init percpu_init_late(void)
2277{
2278 struct pcpu_chunk *target_chunks[] =
2279 { pcpu_first_chunk, pcpu_reserved_chunk, NULL };
2280 struct pcpu_chunk *chunk;
2281 unsigned long flags;
2282 int i;
2283
2284 for (i = 0; (chunk = target_chunks[i]); i++) {
2285 int *map;
2286 const size_t size = PERCPU_DYNAMIC_EARLY_SLOTS * sizeof(map[0]);
2287
2288 BUILD_BUG_ON(size > PAGE_SIZE);
2289
2290 map = pcpu_mem_zalloc(size);
2291 BUG_ON(!map);
2292
2293 spin_lock_irqsave(&pcpu_lock, flags);
2294 memcpy(map, chunk->map, size);
2295 chunk->map = map;
2296 spin_unlock_irqrestore(&pcpu_lock, flags);
2297 }
2298}
/*
 * The percpu allocator is initialized early during boot when neither
 * slab nor workqueue is available.  Plug async management until
 * everything is up and running.
 */
2305static int __init percpu_enable_async(void)
2306{
2307 pcpu_async_enabled = true;
2308 return 0;
2309}
2310subsys_initcall(percpu_enable_async);
2311