// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright (C) 2009  Red Hat, Inc.
 */

6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7
8#include <linux/mm.h>
9#include <linux/sched.h>
10#include <linux/sched/mm.h>
11#include <linux/sched/numa_balancing.h>
12#include <linux/highmem.h>
13#include <linux/hugetlb.h>
14#include <linux/mmu_notifier.h>
15#include <linux/rmap.h>
16#include <linux/swap.h>
17#include <linux/shrinker.h>
18#include <linux/mm_inline.h>
19#include <linux/swapops.h>
20#include <linux/backing-dev.h>
21#include <linux/dax.h>
22#include <linux/mm_types.h>
23#include <linux/khugepaged.h>
24#include <linux/freezer.h>
25#include <linux/mman.h>
26#include <linux/memremap.h>
27#include <linux/pagemap.h>
28#include <linux/debugfs.h>
29#include <linux/migrate.h>
30#include <linux/hashtable.h>
31#include <linux/userfaultfd_k.h>
32#include <linux/page_idle.h>
33#include <linux/shmem_fs.h>
34#include <linux/oom.h>
35#include <linux/numa.h>
36#include <linux/page_owner.h>
37#include <linux/sched/sysctl.h>
38#include <linux/memory-tiers.h>
39#include <linux/compat.h>
40#include <linux/pgalloc_tag.h>
41#include <linux/pagewalk.h>
42
43#include <asm/tlb.h>
44#include <asm/pgalloc.h>
45#include "internal.h"
46#include "swap.h"
47
48#define CREATE_TRACE_POINTS
49#include <trace/events/thp.h>
50
/*
 * Default THP policy, encoded in the flags word below: whether hugepages
 * are used for all mappings or only for madvise(MADV_HUGEPAGE) regions is
 * selected by the Kconfig options; defrag defaults to "madvise" (direct
 * reclaim only for madvised regions), khugepaged defrag stays enabled, and
 * read faults may use the huge zero page.
 */
59unsigned long transparent_hugepage_flags __read_mostly =
60#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
61 (1<<TRANSPARENT_HUGEPAGE_FLAG)|
62#endif
63#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
64 (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
65#endif
66 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
67 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
68 (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
69
70static struct shrinker *deferred_split_shrinker;
71static unsigned long deferred_split_count(struct shrinker *shrink,
72 struct shrink_control *sc);
73static unsigned long deferred_split_scan(struct shrinker *shrink,
74 struct shrink_control *sc);
75static bool split_underused_thp = true;
76
77static atomic_t huge_zero_refcount;
78struct folio *huge_zero_folio __read_mostly;
79unsigned long huge_zero_pfn __read_mostly = ~0UL;
80unsigned long huge_anon_orders_always __read_mostly;
81unsigned long huge_anon_orders_madvise __read_mostly;
82unsigned long huge_anon_orders_inherit __read_mostly;
83static bool anon_orders_configured __initdata;
84
85static inline bool file_thp_enabled(struct vm_area_struct *vma)
86{
87 struct inode *inode;
88
89 if (!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS))
90 return false;
91
92 if (!vma->vm_file)
93 return false;
94
95 inode = file_inode(vma->vm_file);
96
97 return !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode);
98}
99
100unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
101 vm_flags_t vm_flags,
102 unsigned long tva_flags,
103 unsigned long orders)
104{
105 bool smaps = tva_flags & TVA_SMAPS;
106 bool in_pf = tva_flags & TVA_IN_PF;
107 bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS;
108 unsigned long supported_orders;
109
 /* Check the intersection of requested and supported orders. */
111 if (vma_is_anonymous(vma))
112 supported_orders = THP_ORDERS_ALL_ANON;
113 else if (vma_is_special_huge(vma))
114 supported_orders = THP_ORDERS_ALL_SPECIAL;
115 else
116 supported_orders = THP_ORDERS_ALL_FILE_DEFAULT;
117
118 orders &= supported_orders;
119 if (!orders)
120 return 0;
121
122 if (!vma->vm_mm)
123 return 0;
124
125 if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags))
126 return 0;
127
 /* khugepaged doesn't collapse DAX vma, but page fault is fine. */
129 if (vma_is_dax(vma))
130 return in_pf ? orders : 0;
131
 /*
 * khugepaged special VMA and hugetlb VMA.
 * Must be checked after dax since some dax mappings may have
 * VM_MIXEDMAP set.
 */
137 if (!in_pf && !smaps && (vm_flags & VM_NO_KHUGEPAGED))
138 return 0;
139
 /*
 * Check alignment for file vma and size for both file and anon vma by
 * filtering out the unsuitable orders.
 *
 * Skip the check for page fault. Huge fault does the check in fault
 * handlers.
 */
147 if (!in_pf) {
148 int order = highest_order(orders);
149 unsigned long addr;
150
151 while (orders) {
152 addr = vma->vm_end - (PAGE_SIZE << order);
153 if (thp_vma_suitable_order(vma, addr, order))
154 break;
155 order = next_order(&orders, order);
156 }
157
158 if (!orders)
159 return 0;
160 }
161
 /*
 * Enabled via shmem mount options or sysfs settings.
 * Must be done before the hugepage flags check since shmem has its
 * own flags.
 */
167 if (!in_pf && shmem_file(vma->vm_file))
168 return orders & shmem_allowable_huge_orders(file_inode(vma->vm_file),
169 vma, vma->vm_pgoff, 0,
170 !enforce_sysfs);
171
172 if (!vma_is_anonymous(vma)) {
 /*
 * Respect the global sysfs THP settings for file mappings when the
 * caller asked for it (TVA_ENFORCE_SYSFS).
 */
177 if (enforce_sysfs &&
178 (!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
179 !hugepage_global_always())))
180 return 0;
181
 /*
 * Trust that ->huge_fault() handlers know what they are doing
 * in fault path.
 */
186 if (((in_pf || smaps)) && vma->vm_ops->huge_fault)
187 return orders;
 /* Only regular file is THP eligible */
189 if (((!in_pf || smaps)) && file_thp_enabled(vma))
190 return orders;
191 return 0;
192 }
193
194 if (vma_is_temporary_stack(vma))
195 return 0;
196
 /*
 * THPeligible in smaps should show 1 for proper VMAs even if the
 * anon_vma is not yet initialized.
 *
 * Allow the page fault path as well, since anon_vma may not be
 * initialized until the first page fault.
 */
204 if (!vma->anon_vma)
205 return (smaps || in_pf) ? orders : 0;
206
207 return orders;
208}
209
210static bool get_huge_zero_page(void)
211{
212 struct folio *zero_folio;
213retry:
214 if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
215 return true;
216
217 zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
218 HPAGE_PMD_ORDER);
219 if (!zero_folio) {
220 count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
221 return false;
222 }
223
224 folio_clear_large_rmappable(zero_folio);
225 preempt_disable();
226 if (cmpxchg(&huge_zero_folio, NULL, zero_folio)) {
227 preempt_enable();
228 folio_put(zero_folio);
229 goto retry;
230 }
231 WRITE_ONCE(huge_zero_pfn, folio_pfn(zero_folio));
232
 /* We take an additional reference here. It will be put back by the shrinker */
234 atomic_set(&huge_zero_refcount, 2);
235 preempt_enable();
236 count_vm_event(THP_ZERO_PAGE_ALLOC);
237 return true;
238}
239
240static void put_huge_zero_page(void)
241{
 /*
 * Counter should never go to zero here. Only the shrinker can put
 * the last reference.
 */
246 BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
247}
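
/*
 * Lifetime of the huge zero folio, as implemented above and in the shrinker
 * callbacks below: get_huge_zero_page() installs it with a refcount of 2
 * (one held on behalf of the shrinker, one for the first user), later users
 * only bump the count, and the shrinker frees the folio once the count
 * drops back to 1, i.e. when only the shrinker reference is left.
 */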
248
249struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
250{
251 if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
252 return READ_ONCE(huge_zero_folio);
253
254 if (!get_huge_zero_page())
255 return NULL;
256
257 if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
258 put_huge_zero_page();
259
260 return READ_ONCE(huge_zero_folio);
261}
262
263void mm_put_huge_zero_folio(struct mm_struct *mm)
264{
265 if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
266 put_huge_zero_page();
267}
268
269static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
270 struct shrink_control *sc)
271{
 /* we can free the zero page only if the last reference remains */
273 return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
274}
275
276static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
277 struct shrink_control *sc)
278{
279 if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
280 struct folio *zero_folio = xchg(&huge_zero_folio, NULL);
281 BUG_ON(zero_folio == NULL);
282 WRITE_ONCE(huge_zero_pfn, ~0UL);
283 folio_put(zero_folio);
284 return HPAGE_PMD_NR;
285 }
286
287 return 0;
288}
289
290static struct shrinker *huge_zero_page_shrinker;
291
292#ifdef CONFIG_SYSFS
293static ssize_t enabled_show(struct kobject *kobj,
294 struct kobj_attribute *attr, char *buf)
295{
296 const char *output;
297
298 if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
299 output = "[always] madvise never";
300 else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
301 &transparent_hugepage_flags))
302 output = "always [madvise] never";
303 else
304 output = "always madvise [never]";
305
306 return sysfs_emit(buf, "%s\n", output);
307}
308
309static ssize_t enabled_store(struct kobject *kobj,
310 struct kobj_attribute *attr,
311 const char *buf, size_t count)
312{
313 ssize_t ret = count;
314
315 if (sysfs_streq(buf, "always")) {
316 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
317 set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
318 } else if (sysfs_streq(buf, "madvise")) {
319 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
320 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
321 } else if (sysfs_streq(buf, "never")) {
322 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
323 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
324 } else
325 ret = -EINVAL;
326
327 if (ret > 0) {
328 int err = start_stop_khugepaged();
329 if (err)
330 ret = err;
331 }
332 return ret;
333}
334
335static struct kobj_attribute enabled_attr = __ATTR_RW(enabled);
336
337ssize_t single_hugepage_flag_show(struct kobject *kobj,
338 struct kobj_attribute *attr, char *buf,
339 enum transparent_hugepage_flag flag)
340{
341 return sysfs_emit(buf, "%d\n",
342 !!test_bit(flag, &transparent_hugepage_flags));
343}
344
345ssize_t single_hugepage_flag_store(struct kobject *kobj,
346 struct kobj_attribute *attr,
347 const char *buf, size_t count,
348 enum transparent_hugepage_flag flag)
349{
350 unsigned long value;
351 int ret;
352
353 ret = kstrtoul(buf, 10, &value);
354 if (ret < 0)
355 return ret;
356 if (value > 1)
357 return -EINVAL;
358
359 if (value)
360 set_bit(flag, &transparent_hugepage_flags);
361 else
362 clear_bit(flag, &transparent_hugepage_flags);
363
364 return count;
365}
366
367static ssize_t defrag_show(struct kobject *kobj,
368 struct kobj_attribute *attr, char *buf)
369{
370 const char *output;
371
372 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
373 &transparent_hugepage_flags))
374 output = "[always] defer defer+madvise madvise never";
375 else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
376 &transparent_hugepage_flags))
377 output = "always [defer] defer+madvise madvise never";
378 else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
379 &transparent_hugepage_flags))
380 output = "always defer [defer+madvise] madvise never";
381 else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
382 &transparent_hugepage_flags))
383 output = "always defer defer+madvise [madvise] never";
384 else
385 output = "always defer defer+madvise madvise [never]";
386
387 return sysfs_emit(buf, "%s\n", output);
388}
389
390static ssize_t defrag_store(struct kobject *kobj,
391 struct kobj_attribute *attr,
392 const char *buf, size_t count)
393{
394 if (sysfs_streq(buf, "always")) {
395 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
396 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
397 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
398 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
399 } else if (sysfs_streq(buf, "defer+madvise")) {
400 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
401 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
402 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
403 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
404 } else if (sysfs_streq(buf, "defer")) {
405 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
406 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
407 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
408 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
409 } else if (sysfs_streq(buf, "madvise")) {
410 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
411 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
412 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
413 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
414 } else if (sysfs_streq(buf, "never")) {
415 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
416 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
417 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
418 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
419 } else
420 return -EINVAL;
421
422 return count;
423}
424static struct kobj_attribute defrag_attr = __ATTR_RW(defrag);
425
426static ssize_t use_zero_page_show(struct kobject *kobj,
427 struct kobj_attribute *attr, char *buf)
428{
429 return single_hugepage_flag_show(kobj, attr, buf,
430 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
431}
432static ssize_t use_zero_page_store(struct kobject *kobj,
433 struct kobj_attribute *attr, const char *buf, size_t count)
434{
435 return single_hugepage_flag_store(kobj, attr, buf, count,
436 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
437}
438static struct kobj_attribute use_zero_page_attr = __ATTR_RW(use_zero_page);
439
440static ssize_t hpage_pmd_size_show(struct kobject *kobj,
441 struct kobj_attribute *attr, char *buf)
442{
443 return sysfs_emit(buf, "%lu\n", HPAGE_PMD_SIZE);
444}
445static struct kobj_attribute hpage_pmd_size_attr =
446 __ATTR_RO(hpage_pmd_size);
447
448static ssize_t split_underused_thp_show(struct kobject *kobj,
449 struct kobj_attribute *attr, char *buf)
450{
451 return sysfs_emit(buf, "%d\n", split_underused_thp);
452}
453
454static ssize_t split_underused_thp_store(struct kobject *kobj,
455 struct kobj_attribute *attr,
456 const char *buf, size_t count)
457{
458 int err = kstrtobool(buf, &split_underused_thp);
459
460 if (err < 0)
461 return err;
462
463 return count;
464}
465
466static struct kobj_attribute split_underused_thp_attr = __ATTR(
467 shrink_underused, 0644, split_underused_thp_show, split_underused_thp_store);
468
469static struct attribute *hugepage_attr[] = {
470 &enabled_attr.attr,
471 &defrag_attr.attr,
472 &use_zero_page_attr.attr,
473 &hpage_pmd_size_attr.attr,
474#ifdef CONFIG_SHMEM
475 &shmem_enabled_attr.attr,
476#endif
477 &split_underused_thp_attr.attr,
478 NULL,
479};
480
481static const struct attribute_group hugepage_attr_group = {
482 .attrs = hugepage_attr,
483};
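
/*
 * The group above is registered on the "transparent_hugepage" kobject under
 * mm_kobj, so the global knobs end up in sysfs as, for example:
 *
 *	echo always  > /sys/kernel/mm/transparent_hugepage/enabled
 *	echo madvise > /sys/kernel/mm/transparent_hugepage/defrag
 *	cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size
 *
 * See Documentation/admin-guide/mm/transhuge.rst for the full interface.
 */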
484
485static void hugepage_exit_sysfs(struct kobject *hugepage_kobj);
486static void thpsize_release(struct kobject *kobj);
487static DEFINE_SPINLOCK(huge_anon_orders_lock);
488static LIST_HEAD(thpsize_list);
489
490static ssize_t anon_enabled_show(struct kobject *kobj,
491 struct kobj_attribute *attr, char *buf)
492{
493 int order = to_thpsize(kobj)->order;
494 const char *output;
495
496 if (test_bit(order, &huge_anon_orders_always))
497 output = "[always] inherit madvise never";
498 else if (test_bit(order, &huge_anon_orders_inherit))
499 output = "always [inherit] madvise never";
500 else if (test_bit(order, &huge_anon_orders_madvise))
501 output = "always inherit [madvise] never";
502 else
503 output = "always inherit madvise [never]";
504
505 return sysfs_emit(buf, "%s\n", output);
506}
507
508static ssize_t anon_enabled_store(struct kobject *kobj,
509 struct kobj_attribute *attr,
510 const char *buf, size_t count)
511{
512 int order = to_thpsize(kobj)->order;
513 ssize_t ret = count;
514
515 if (sysfs_streq(buf, "always")) {
516 spin_lock(&huge_anon_orders_lock);
517 clear_bit(order, &huge_anon_orders_inherit);
518 clear_bit(order, &huge_anon_orders_madvise);
519 set_bit(order, &huge_anon_orders_always);
520 spin_unlock(&huge_anon_orders_lock);
521 } else if (sysfs_streq(buf, "inherit")) {
522 spin_lock(&huge_anon_orders_lock);
523 clear_bit(order, &huge_anon_orders_always);
524 clear_bit(order, &huge_anon_orders_madvise);
525 set_bit(order, &huge_anon_orders_inherit);
526 spin_unlock(&huge_anon_orders_lock);
527 } else if (sysfs_streq(buf, "madvise")) {
528 spin_lock(&huge_anon_orders_lock);
529 clear_bit(order, &huge_anon_orders_always);
530 clear_bit(order, &huge_anon_orders_inherit);
531 set_bit(order, &huge_anon_orders_madvise);
532 spin_unlock(&huge_anon_orders_lock);
533 } else if (sysfs_streq(buf, "never")) {
534 spin_lock(&huge_anon_orders_lock);
535 clear_bit(order, &huge_anon_orders_always);
536 clear_bit(order, &huge_anon_orders_inherit);
537 clear_bit(order, &huge_anon_orders_madvise);
538 spin_unlock(&huge_anon_orders_lock);
539 } else
540 ret = -EINVAL;
541
542 if (ret > 0) {
543 int err;
544
545 err = start_stop_khugepaged();
546 if (err)
547 ret = err;
548 }
549 return ret;
550}
551
552static struct kobj_attribute anon_enabled_attr =
553 __ATTR(enabled, 0644, anon_enabled_show, anon_enabled_store);
554
555static struct attribute *anon_ctrl_attrs[] = {
556 &anon_enabled_attr.attr,
557 NULL,
558};
559
560static const struct attribute_group anon_ctrl_attr_grp = {
561 .attrs = anon_ctrl_attrs,
562};
563
564static struct attribute *file_ctrl_attrs[] = {
565#ifdef CONFIG_SHMEM
566 &thpsize_shmem_enabled_attr.attr,
567#endif
568 NULL,
569};
570
571static const struct attribute_group file_ctrl_attr_grp = {
572 .attrs = file_ctrl_attrs,
573};
574
575static struct attribute *any_ctrl_attrs[] = {
576 NULL,
577};
578
579static const struct attribute_group any_ctrl_attr_grp = {
580 .attrs = any_ctrl_attrs,
581};
582
583static const struct kobj_type thpsize_ktype = {
584 .release = &thpsize_release,
585 .sysfs_ops = &kobj_sysfs_ops,
586};
587
588DEFINE_PER_CPU(struct mthp_stat, mthp_stats) = {{{0}}};
589
590static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
591{
592 unsigned long sum = 0;
593 int cpu;
594
595 for_each_possible_cpu(cpu) {
596 struct mthp_stat *this = &per_cpu(mthp_stats, cpu);
597
598 sum += this->stats[order][item];
599 }
600
601 return sum;
602}
603
604#define DEFINE_MTHP_STAT_ATTR(_name, _index) \
605static ssize_t _name##_show(struct kobject *kobj, \
606 struct kobj_attribute *attr, char *buf) \
607{ \
608 int order = to_thpsize(kobj)->order; \
609 \
610 return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index)); \
611} \
612static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
613
614DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
615DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
616DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
617DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT);
618DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN);
619DEFINE_MTHP_STAT_ATTR(swpin_fallback, MTHP_STAT_SWPIN_FALLBACK);
620DEFINE_MTHP_STAT_ATTR(swpin_fallback_charge, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
621DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT);
622DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK);
623#ifdef CONFIG_SHMEM
624DEFINE_MTHP_STAT_ATTR(shmem_alloc, MTHP_STAT_SHMEM_ALLOC);
625DEFINE_MTHP_STAT_ATTR(shmem_fallback, MTHP_STAT_SHMEM_FALLBACK);
626DEFINE_MTHP_STAT_ATTR(shmem_fallback_charge, MTHP_STAT_SHMEM_FALLBACK_CHARGE);
627#endif
628DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT);
629DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
630DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
631DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON);
632DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED);
633
634static struct attribute *anon_stats_attrs[] = {
635 &anon_fault_alloc_attr.attr,
636 &anon_fault_fallback_attr.attr,
637 &anon_fault_fallback_charge_attr.attr,
638#ifndef CONFIG_SHMEM
639 &zswpout_attr.attr,
640 &swpin_attr.attr,
641 &swpin_fallback_attr.attr,
642 &swpin_fallback_charge_attr.attr,
643 &swpout_attr.attr,
644 &swpout_fallback_attr.attr,
645#endif
646 &split_deferred_attr.attr,
647 &nr_anon_attr.attr,
648 &nr_anon_partially_mapped_attr.attr,
649 NULL,
650};
651
652static struct attribute_group anon_stats_attr_grp = {
653 .name = "stats",
654 .attrs = anon_stats_attrs,
655};
656
657static struct attribute *file_stats_attrs[] = {
658#ifdef CONFIG_SHMEM
659 &shmem_alloc_attr.attr,
660 &shmem_fallback_attr.attr,
661 &shmem_fallback_charge_attr.attr,
662#endif
663 NULL,
664};
665
666static struct attribute_group file_stats_attr_grp = {
667 .name = "stats",
668 .attrs = file_stats_attrs,
669};
670
671static struct attribute *any_stats_attrs[] = {
672#ifdef CONFIG_SHMEM
673 &zswpout_attr.attr,
674 &swpin_attr.attr,
675 &swpin_fallback_attr.attr,
676 &swpin_fallback_charge_attr.attr,
677 &swpout_attr.attr,
678 &swpout_fallback_attr.attr,
679#endif
680 &split_attr.attr,
681 &split_failed_attr.attr,
682 NULL,
683};
684
685static struct attribute_group any_stats_attr_grp = {
686 .name = "stats",
687 .attrs = any_stats_attrs,
688};
689
690static int sysfs_add_group(struct kobject *kobj,
691 const struct attribute_group *grp)
692{
693 int ret = -ENOENT;
694
695
696
697
698
699
700 if (grp->name)
701 ret = sysfs_merge_group(kobj, grp);
702 if (ret)
703 ret = sysfs_create_group(kobj, grp);
704
705 return ret;
706}
707
708static struct thpsize *thpsize_create(int order, struct kobject *parent)
709{
710 unsigned long size = (PAGE_SIZE << order) / SZ_1K;
711 struct thpsize *thpsize;
712 int ret = -ENOMEM;
713
714 thpsize = kzalloc(sizeof(*thpsize), GFP_KERNEL);
715 if (!thpsize)
716 goto err;
717
718 thpsize->order = order;
719
720 ret = kobject_init_and_add(&thpsize->kobj, &thpsize_ktype, parent,
721 "hugepages-%lukB", size);
722 if (ret) {
723 kfree(thpsize);
724 goto err;
725 }
726
727
728 ret = sysfs_add_group(&thpsize->kobj, &any_ctrl_attr_grp);
729 if (ret)
730 goto err_put;
731
732 ret = sysfs_add_group(&thpsize->kobj, &any_stats_attr_grp);
733 if (ret)
734 goto err_put;
735
736 if (BIT(order) & THP_ORDERS_ALL_ANON) {
737 ret = sysfs_add_group(&thpsize->kobj, &anon_ctrl_attr_grp);
738 if (ret)
739 goto err_put;
740
741 ret = sysfs_add_group(&thpsize->kobj, &anon_stats_attr_grp);
742 if (ret)
743 goto err_put;
744 }
745
746 if (BIT(order) & THP_ORDERS_ALL_FILE_DEFAULT) {
747 ret = sysfs_add_group(&thpsize->kobj, &file_ctrl_attr_grp);
748 if (ret)
749 goto err_put;
750
751 ret = sysfs_add_group(&thpsize->kobj, &file_stats_attr_grp);
752 if (ret)
753 goto err_put;
754 }
755
756 return thpsize;
757err_put:
758 kobject_put(&thpsize->kobj);
759err:
760 return ERR_PTR(ret);
761}
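
/*
 * thpsize_create() gives each supported order its own sysfs directory named
 * after the size in kB; on a 4K-page build, for instance, PMD-sized THPs
 * appear under /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/ with
 * the per-size "enabled" control and "stats" entries wired up above.
 */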
762
763static void thpsize_release(struct kobject *kobj)
764{
765 kfree(to_thpsize(kobj));
766}
767
768static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
769{
770 int err;
771 struct thpsize *thpsize;
772 unsigned long orders;
773 int order;
774
775
776
777
778
779
780 if (!anon_orders_configured)
781 huge_anon_orders_inherit = BIT(PMD_ORDER);
782
783 *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
784 if (unlikely(!*hugepage_kobj)) {
785 pr_err("failed to create transparent hugepage kobject\n");
786 return -ENOMEM;
787 }
788
789 err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
790 if (err) {
791 pr_err("failed to register transparent hugepage group\n");
792 goto delete_obj;
793 }
794
795 err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
796 if (err) {
797 pr_err("failed to register transparent hugepage group\n");
798 goto remove_hp_group;
799 }
800
801 orders = THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DEFAULT;
802 order = highest_order(orders);
803 while (orders) {
804 thpsize = thpsize_create(order, *hugepage_kobj);
805 if (IS_ERR(thpsize)) {
806 pr_err("failed to create thpsize for order %d\n", order);
807 err = PTR_ERR(thpsize);
808 goto remove_all;
809 }
810 list_add(&thpsize->node, &thpsize_list);
811 order = next_order(&orders, order);
812 }
813
814 return 0;
815
816remove_all:
817 hugepage_exit_sysfs(*hugepage_kobj);
818 return err;
819remove_hp_group:
820 sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
821delete_obj:
822 kobject_put(*hugepage_kobj);
823 return err;
824}
825
826static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
827{
828 struct thpsize *thpsize, *tmp;
829
830 list_for_each_entry_safe(thpsize, tmp, &thpsize_list, node) {
831 list_del(&thpsize->node);
832 kobject_put(&thpsize->kobj);
833 }
834
835 sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
836 sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
837 kobject_put(hugepage_kobj);
838}
839#else
840static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
841{
842 return 0;
843}
844
845static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
846{
847}
848#endif
849
850static int __init thp_shrinker_init(void)
851{
852 huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero");
853 if (!huge_zero_page_shrinker)
854 return -ENOMEM;
855
856 deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
857 SHRINKER_MEMCG_AWARE |
858 SHRINKER_NONSLAB,
859 "thp-deferred_split");
860 if (!deferred_split_shrinker) {
861 shrinker_free(huge_zero_page_shrinker);
862 return -ENOMEM;
863 }
864
865 huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count;
866 huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan;
867 shrinker_register(huge_zero_page_shrinker);
868
869 deferred_split_shrinker->count_objects = deferred_split_count;
870 deferred_split_shrinker->scan_objects = deferred_split_scan;
871 shrinker_register(deferred_split_shrinker);
872
873 return 0;
874}
875
876static void __init thp_shrinker_exit(void)
877{
878 shrinker_free(huge_zero_page_shrinker);
879 shrinker_free(deferred_split_shrinker);
880}
881
882static int __init hugepage_init(void)
883{
884 int err;
885 struct kobject *hugepage_kobj;
886
887 if (!has_transparent_hugepage()) {
888 transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED;
889 return -EINVAL;
890 }
891
 /*
 * hugepages can't be allocated by the buddy allocator
 */
895 MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_PAGE_ORDER);
896
897 err = hugepage_init_sysfs(&hugepage_kobj);
898 if (err)
899 goto err_sysfs;
900
901 err = khugepaged_init();
902 if (err)
903 goto err_slab;
904
905 err = thp_shrinker_init();
906 if (err)
907 goto err_shrinker;
908
 /*
 * By default disable transparent hugepages on smaller systems,
 * where the extra memory used could hurt more than TLB overhead
 * is likely to save. The admin can still enable it through /sys.
 */
914 if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
915 transparent_hugepage_flags = 0;
916 return 0;
917 }
918
919 err = start_stop_khugepaged();
920 if (err)
921 goto err_khugepaged;
922
923 return 0;
924err_khugepaged:
925 thp_shrinker_exit();
926err_shrinker:
927 khugepaged_destroy();
928err_slab:
929 hugepage_exit_sysfs(hugepage_kobj);
930err_sysfs:
931 return err;
932}
933subsys_initcall(hugepage_init);
934
935static int __init setup_transparent_hugepage(char *str)
936{
937 int ret = 0;
938 if (!str)
939 goto out;
940 if (!strcmp(str, "always")) {
941 set_bit(TRANSPARENT_HUGEPAGE_FLAG,
942 &transparent_hugepage_flags);
943 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
944 &transparent_hugepage_flags);
945 ret = 1;
946 } else if (!strcmp(str, "madvise")) {
947 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
948 &transparent_hugepage_flags);
949 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
950 &transparent_hugepage_flags);
951 ret = 1;
952 } else if (!strcmp(str, "never")) {
953 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
954 &transparent_hugepage_flags);
955 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
956 &transparent_hugepage_flags);
957 ret = 1;
958 }
959out:
960 if (!ret)
961 pr_warn("transparent_hugepage= cannot parse, ignored\n");
962 return ret;
963}
964__setup("transparent_hugepage=", setup_transparent_hugepage);
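
/*
 * The same three policies can also be selected at boot, e.g. by adding
 * "transparent_hugepage=madvise" to the kernel command line.
 */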
965
966static char str_dup[PAGE_SIZE] __initdata;
967static int __init setup_thp_anon(char *str)
968{
969 char *token, *range, *policy, *subtoken;
970 unsigned long always, inherit, madvise;
971 char *start_size, *end_size;
972 int start, end, nr;
973 char *p;
974
975 if (!str || strlen(str) + 1 > PAGE_SIZE)
976 goto err;
977 strscpy(str_dup, str);
978
979 always = huge_anon_orders_always;
980 madvise = huge_anon_orders_madvise;
981 inherit = huge_anon_orders_inherit;
982 p = str_dup;
983 while ((token = strsep(&p, ";")) != NULL) {
984 range = strsep(&token, ":");
985 policy = token;
986
987 if (!policy)
988 goto err;
989
990 while ((subtoken = strsep(&range, ",")) != NULL) {
991 if (strchr(subtoken, '-')) {
992 start_size = strsep(&subtoken, "-");
993 end_size = subtoken;
994
995 start = get_order_from_str(start_size, THP_ORDERS_ALL_ANON);
996 end = get_order_from_str(end_size, THP_ORDERS_ALL_ANON);
997 } else {
998 start_size = end_size = subtoken;
999 start = end = get_order_from_str(subtoken,
1000 THP_ORDERS_ALL_ANON);
1001 }
1002
1003 if (start == -EINVAL) {
1004 pr_err("invalid size %s in thp_anon boot parameter\n", start_size);
1005 goto err;
1006 }
1007
1008 if (end == -EINVAL) {
1009 pr_err("invalid size %s in thp_anon boot parameter\n", end_size);
1010 goto err;
1011 }
1012
1013 if (start < 0 || end < 0 || start > end)
1014 goto err;
1015
1016 nr = end - start + 1;
1017 if (!strcmp(policy, "always")) {
1018 bitmap_set(&always, start, nr);
1019 bitmap_clear(&inherit, start, nr);
1020 bitmap_clear(&madvise, start, nr);
1021 } else if (!strcmp(policy, "madvise")) {
1022 bitmap_set(&madvise, start, nr);
1023 bitmap_clear(&inherit, start, nr);
1024 bitmap_clear(&always, start, nr);
1025 } else if (!strcmp(policy, "inherit")) {
1026 bitmap_set(&inherit, start, nr);
1027 bitmap_clear(&madvise, start, nr);
1028 bitmap_clear(&always, start, nr);
1029 } else if (!strcmp(policy, "never")) {
1030 bitmap_clear(&inherit, start, nr);
1031 bitmap_clear(&madvise, start, nr);
1032 bitmap_clear(&always, start, nr);
1033 } else {
1034 pr_err("invalid policy %s in thp_anon boot parameter\n", policy);
1035 goto err;
1036 }
1037 }
1038 }
1039
1040 huge_anon_orders_always = always;
1041 huge_anon_orders_madvise = madvise;
1042 huge_anon_orders_inherit = inherit;
1043 anon_orders_configured = true;
1044 return 1;
1045
1046err:
1047 pr_warn("thp_anon=%s: error parsing string, ignoring setting\n", str);
1048 return 0;
1049}
1050__setup("thp_anon=", setup_thp_anon);
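
/*
 * Example (format per Documentation/admin-guide/mm/transhuge.rst; sizes
 * assume a 4K base page):
 *
 *	thp_anon=32K-64K:always;128K,512K:inherit;256K:madvise;1M-2M:never
 *
 * Each semicolon-separated entry lists one or more sizes (or size ranges)
 * followed by the policy applied to the matching anonymous THP orders.
 */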
1051
1052pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
1053{
1054 if (likely(vma->vm_flags & VM_WRITE))
1055 pmd = pmd_mkwrite(pmd, vma);
1056 return pmd;
1057}
1058
1059#ifdef CONFIG_MEMCG
1060static inline
1061struct deferred_split *get_deferred_split_queue(struct folio *folio)
1062{
1063 struct mem_cgroup *memcg = folio_memcg(folio);
1064 struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
1065
1066 if (memcg)
1067 return &memcg->deferred_split_queue;
1068 else
1069 return &pgdat->deferred_split_queue;
1070}
1071#else
1072static inline
1073struct deferred_split *get_deferred_split_queue(struct folio *folio)
1074{
1075 struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
1076
1077 return &pgdat->deferred_split_queue;
1078}
1079#endif
1080
1081static inline bool is_transparent_hugepage(const struct folio *folio)
1082{
1083 if (!folio_test_large(folio))
1084 return false;
1085
1086 return is_huge_zero_folio(folio) ||
1087 folio_test_large_rmappable(folio);
1088}
1089
1090static unsigned long __thp_get_unmapped_area(struct file *filp,
1091 unsigned long addr, unsigned long len,
1092 loff_t off, unsigned long flags, unsigned long size,
1093 vm_flags_t vm_flags)
1094{
1095 loff_t off_end = off + len;
1096 loff_t off_align = round_up(off, size);
1097 unsigned long len_pad, ret, off_sub;
1098
1099 if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall())
1100 return 0;
1101
1102 if (off_end <= off_align || (off_end - off_align) < size)
1103 return 0;
1104
1105 len_pad = len + size;
1106 if (len_pad < len || (off + len_pad) < off)
1107 return 0;
1108
1109 ret = mm_get_unmapped_area_vmflags(current->mm, filp, addr, len_pad,
1110 off >> PAGE_SHIFT, flags, vm_flags);
1111
 /*
 * The failure might be due to length padding. The caller will retry
 * without the padding.
 */
1116 if (IS_ERR_VALUE(ret))
1117 return 0;
1118
 /*
 * Do not try to align to a THP boundary if allocation at the hinted
 * address already succeeded.
 */
1123 if (ret == addr)
1124 return addr;
1125
1126 off_sub = (off - ret) & (size - 1);
1127
 if (test_bit(MMF_TOPDOWN, &current->mm->flags) && !off_sub)
1129 return ret + size;
1130
1131 ret += off_sub;
1132 return ret;
1133}
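
/*
 * The helper above over-allocates by one PMD worth of address space and
 * then shifts the result so that the virtual address and the file offset
 * share alignment: e.g. with a 2M PMD it searches len + 2M and returns the
 * address inside that window whose low bits match "off", so a later
 * PMD-sized fault can map the folio without splitting.
 */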
1134
1135unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
1136 unsigned long len, unsigned long pgoff, unsigned long flags,
1137 vm_flags_t vm_flags)
1138{
1139 unsigned long ret;
1140 loff_t off = (loff_t)pgoff << PAGE_SHIFT;
1141
1142 ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE, vm_flags);
1143 if (ret)
1144 return ret;
1145
1146 return mm_get_unmapped_area_vmflags(current->mm, filp, addr, len, pgoff, flags,
1147 vm_flags);
1148}
1149
1150unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
1151 unsigned long len, unsigned long pgoff, unsigned long flags)
1152{
1153 return thp_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, 0);
1154}
1155EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
1156
1157static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
1158 unsigned long addr)
1159{
1160 gfp_t gfp = vma_thp_gfp_mask(vma);
1161 const int order = HPAGE_PMD_ORDER;
1162 struct folio *folio;
1163
1164 folio = vma_alloc_folio(gfp, order, vma, addr & HPAGE_PMD_MASK);
1165
1166 if (unlikely(!folio)) {
1167 count_vm_event(THP_FAULT_FALLBACK);
1168 count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
1169 return NULL;
1170 }
1171
1172 VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
1173 if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
1174 folio_put(folio);
1175 count_vm_event(THP_FAULT_FALLBACK);
1176 count_vm_event(THP_FAULT_FALLBACK_CHARGE);
1177 count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
1178 count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
1179 return NULL;
1180 }
1181 folio_throttle_swaprate(folio, gfp);
1182
1183
1184
1185
1186
1187
1188
1189 if (user_alloc_needs_zeroing())
1190 folio_zero_user(folio, addr);
1191
1192
1193
1194
1195
1196 __folio_mark_uptodate(folio);
1197 return folio;
1198}
1199
1200static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd,
1201 struct vm_area_struct *vma, unsigned long haddr)
1202{
1203 pmd_t entry;
1204
1205 entry = folio_mk_pmd(folio, vma->vm_page_prot);
1206 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1207 folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
1208 folio_add_lru_vma(folio, vma);
1209 set_pmd_at(vma->vm_mm, haddr, pmd, entry);
1210 update_mmu_cache_pmd(vma, haddr, pmd);
1211 add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
1212 count_vm_event(THP_FAULT_ALLOC);
1213 count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
1214 count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
1215}
1216
1217static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
1218{
1219 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1220 struct vm_area_struct *vma = vmf->vma;
1221 struct folio *folio;
1222 pgtable_t pgtable;
1223 vm_fault_t ret = 0;
1224
1225 folio = vma_alloc_anon_folio_pmd(vma, vmf->address);
1226 if (unlikely(!folio))
1227 return VM_FAULT_FALLBACK;
1228
1229 pgtable = pte_alloc_one(vma->vm_mm);
1230 if (unlikely(!pgtable)) {
1231 ret = VM_FAULT_OOM;
1232 goto release;
1233 }
1234
1235 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
1236 if (unlikely(!pmd_none(*vmf->pmd))) {
1237 goto unlock_release;
1238 } else {
1239 ret = check_stable_address_space(vma->vm_mm);
1240 if (ret)
1241 goto unlock_release;
1242
1243
1244 if (userfaultfd_missing(vma)) {
1245 spin_unlock(vmf->ptl);
1246 folio_put(folio);
1247 pte_free(vma->vm_mm, pgtable);
1248 ret = handle_userfault(vmf, VM_UFFD_MISSING);
1249 VM_BUG_ON(ret & VM_FAULT_FALLBACK);
1250 return ret;
1251 }
1252 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
1253 map_anon_folio_pmd(folio, vmf->pmd, vma, haddr);
1254 mm_inc_nr_ptes(vma->vm_mm);
1255 deferred_split_folio(folio, false);
1256 spin_unlock(vmf->ptl);
1257 }
1258
1259 return 0;
1260unlock_release:
1261 spin_unlock(vmf->ptl);
1262release:
1263 if (pgtable)
1264 pte_free(vma->vm_mm, pgtable);
1265 folio_put(folio);
1266 return ret;
1267
1268}
1269
/*
 * always: directly stall for all thp allocations
 * defer: wake kswapd and fail if not immediately available
 * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE,
 *		  otherwise fail if not immediately available
 * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not
 *	    immediately available
 * never: never stall for any thp allocation
 */
1279gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
1280{
1281 const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);
1282
1283
1284 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
1285 return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
1286
1287
1288 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
1289 return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
1290
1291
1292 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
1293 return GFP_TRANSHUGE_LIGHT |
1294 (vma_madvised ? __GFP_DIRECT_RECLAIM :
1295 __GFP_KSWAPD_RECLAIM);
1296
1297
1298 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
1299 return GFP_TRANSHUGE_LIGHT |
1300 (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
1301
1302 return GFP_TRANSHUGE_LIGHT;
1303}
1304
1305
1306static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
1307 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
1308 struct folio *zero_folio)
1309{
1310 pmd_t entry;
1311 entry = folio_mk_pmd(zero_folio, vma->vm_page_prot);
1312 pgtable_trans_huge_deposit(mm, pmd, pgtable);
1313 set_pmd_at(mm, haddr, pmd, entry);
1314 mm_inc_nr_ptes(mm);
1315}
1316
1317vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
1318{
1319 struct vm_area_struct *vma = vmf->vma;
1320 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1321 vm_fault_t ret;
1322
1323 if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
1324 return VM_FAULT_FALLBACK;
1325 ret = vmf_anon_prepare(vmf);
1326 if (ret)
1327 return ret;
1328 khugepaged_enter_vma(vma, vma->vm_flags);
1329
1330 if (!(vmf->flags & FAULT_FLAG_WRITE) &&
1331 !mm_forbids_zeropage(vma->vm_mm) &&
1332 transparent_hugepage_use_zero_page()) {
1333 pgtable_t pgtable;
1334 struct folio *zero_folio;
1335 vm_fault_t ret;
1336
1337 pgtable = pte_alloc_one(vma->vm_mm);
1338 if (unlikely(!pgtable))
1339 return VM_FAULT_OOM;
1340 zero_folio = mm_get_huge_zero_folio(vma->vm_mm);
1341 if (unlikely(!zero_folio)) {
1342 pte_free(vma->vm_mm, pgtable);
1343 count_vm_event(THP_FAULT_FALLBACK);
1344 return VM_FAULT_FALLBACK;
1345 }
1346 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
1347 ret = 0;
1348 if (pmd_none(*vmf->pmd)) {
1349 ret = check_stable_address_space(vma->vm_mm);
1350 if (ret) {
1351 spin_unlock(vmf->ptl);
1352 pte_free(vma->vm_mm, pgtable);
1353 } else if (userfaultfd_missing(vma)) {
1354 spin_unlock(vmf->ptl);
1355 pte_free(vma->vm_mm, pgtable);
1356 ret = handle_userfault(vmf, VM_UFFD_MISSING);
1357 VM_BUG_ON(ret & VM_FAULT_FALLBACK);
1358 } else {
1359 set_huge_zero_folio(pgtable, vma->vm_mm, vma,
1360 haddr, vmf->pmd, zero_folio);
1361 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
1362 spin_unlock(vmf->ptl);
1363 }
1364 } else {
1365 spin_unlock(vmf->ptl);
1366 pte_free(vma->vm_mm, pgtable);
1367 }
1368 return ret;
1369 }
1370
1371 return __do_huge_pmd_anonymous_page(vmf);
1372}
1373
1374struct folio_or_pfn {
1375 union {
1376 struct folio *folio;
1377 unsigned long pfn;
1378 };
1379 bool is_folio;
1380};
1381
1382static int insert_pmd(struct vm_area_struct *vma, unsigned long addr,
1383 pmd_t *pmd, struct folio_or_pfn fop, pgprot_t prot,
1384 bool write, pgtable_t pgtable)
1385{
1386 struct mm_struct *mm = vma->vm_mm;
1387 pmd_t entry;
1388
1389 lockdep_assert_held(pmd_lockptr(mm, pmd));
1390
1391 if (!pmd_none(*pmd)) {
1392 const unsigned long pfn = fop.is_folio ? folio_pfn(fop.folio) :
1393 fop.pfn;
1394
1395 if (write) {
1396 if (pmd_pfn(*pmd) != pfn) {
1397 WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
1398 return -EEXIST;
1399 }
1400 entry = pmd_mkyoung(*pmd);
1401 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1402 if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
1403 update_mmu_cache_pmd(vma, addr, pmd);
1404 }
1405
1406 return -EEXIST;
1407 }
1408
1409 if (fop.is_folio) {
1410 entry = folio_mk_pmd(fop.folio, vma->vm_page_prot);
1411
1412 folio_get(fop.folio);
1413 folio_add_file_rmap_pmd(fop.folio, &fop.folio->page, vma);
1414 add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PMD_NR);
1415 } else {
1416 entry = pmd_mkhuge(pfn_pmd(fop.pfn, prot));
1417 entry = pmd_mkspecial(entry);
1418 }
1419 if (write) {
1420 entry = pmd_mkyoung(pmd_mkdirty(entry));
1421 entry = maybe_pmd_mkwrite(entry, vma);
1422 }
1423
1424 if (pgtable) {
1425 pgtable_trans_huge_deposit(mm, pmd, pgtable);
1426 mm_inc_nr_ptes(mm);
1427 }
1428
1429 set_pmd_at(mm, addr, pmd, entry);
1430 update_mmu_cache_pmd(vma, addr, pmd);
1431 return 0;
1432}
1433
/**
 * vmf_insert_pfn_pmd - insert a pmd size pfn
 * @vmf: Structure describing the fault
 * @pfn: pfn to insert
 * @write: whether it's a write fault
 *
 * Insert a pmd size pfn. See vmf_insert_pfn() for additional info.
 *
 * Return: vm_fault_t value.
 */
1444vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, unsigned long pfn,
1445 bool write)
1446{
1447 unsigned long addr = vmf->address & PMD_MASK;
1448 struct vm_area_struct *vma = vmf->vma;
1449 pgprot_t pgprot = vma->vm_page_prot;
1450 struct folio_or_pfn fop = {
1451 .pfn = pfn,
1452 };
1453 pgtable_t pgtable = NULL;
1454 spinlock_t *ptl;
1455 int error;
1456
 /*
 * Sanity-check the mapping type: pfn insertion is only legal for
 * VM_PFNMAP or VM_MIXEDMAP mappings (never both at once), and never
 * for a COW-able VM_PFNMAP mapping.
 */
1462 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
1463 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1464 (VM_PFNMAP|VM_MIXEDMAP));
1465 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
1466
1467 if (addr < vma->vm_start || addr >= vma->vm_end)
1468 return VM_FAULT_SIGBUS;
1469
1470 if (arch_needs_pgtable_deposit()) {
1471 pgtable = pte_alloc_one(vma->vm_mm);
1472 if (!pgtable)
1473 return VM_FAULT_OOM;
1474 }
1475
1476 pfnmap_setup_cachemode_pfn(pfn, &pgprot);
1477
1478 ptl = pmd_lock(vma->vm_mm, vmf->pmd);
1479 error = insert_pmd(vma, addr, vmf->pmd, fop, pgprot, write,
1480 pgtable);
1481 spin_unlock(ptl);
1482 if (error && pgtable)
1483 pte_free(vma->vm_mm, pgtable);
1484
1485 return VM_FAULT_NOPAGE;
1486}
1487EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
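
/*
 * Sketch of a typical caller (not from this file): a device's huge_fault
 * handler resolving a PMD-sized fault with the export above.  The
 * dev_pfn_for() helper is hypothetical; only vmf_insert_pfn_pmd() is real.
 *
 *	static vm_fault_t dev_huge_fault(struct vm_fault *vmf, unsigned int order)
 *	{
 *		if (order != PMD_ORDER)
 *			return VM_FAULT_FALLBACK;
 *		return vmf_insert_pfn_pmd(vmf, dev_pfn_for(vmf),
 *					  vmf->flags & FAULT_FLAG_WRITE);
 *	}
 */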
1488
1489vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio,
1490 bool write)
1491{
1492 struct vm_area_struct *vma = vmf->vma;
1493 unsigned long addr = vmf->address & PMD_MASK;
1494 struct mm_struct *mm = vma->vm_mm;
1495 struct folio_or_pfn fop = {
1496 .folio = folio,
1497 .is_folio = true,
1498 };
1499 spinlock_t *ptl;
1500 pgtable_t pgtable = NULL;
1501 int error;
1502
1503 if (addr < vma->vm_start || addr >= vma->vm_end)
1504 return VM_FAULT_SIGBUS;
1505
1506 if (WARN_ON_ONCE(folio_order(folio) != PMD_ORDER))
1507 return VM_FAULT_SIGBUS;
1508
1509 if (arch_needs_pgtable_deposit()) {
1510 pgtable = pte_alloc_one(vma->vm_mm);
1511 if (!pgtable)
1512 return VM_FAULT_OOM;
1513 }
1514
1515 ptl = pmd_lock(mm, vmf->pmd);
1516 error = insert_pmd(vma, addr, vmf->pmd, fop, vma->vm_page_prot,
1517 write, pgtable);
1518 spin_unlock(ptl);
1519 if (error && pgtable)
1520 pte_free(mm, pgtable);
1521
1522 return VM_FAULT_NOPAGE;
1523}
1524EXPORT_SYMBOL_GPL(vmf_insert_folio_pmd);
1525
1526#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1527static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
1528{
1529 if (likely(vma->vm_flags & VM_WRITE))
1530 pud = pud_mkwrite(pud);
1531 return pud;
1532}
1533
1534static void insert_pud(struct vm_area_struct *vma, unsigned long addr,
1535 pud_t *pud, struct folio_or_pfn fop, pgprot_t prot, bool write)
1536{
1537 struct mm_struct *mm = vma->vm_mm;
1538 pud_t entry;
1539
1540 if (!pud_none(*pud)) {
1541 const unsigned long pfn = fop.is_folio ? folio_pfn(fop.folio) :
1542 fop.pfn;
1543
1544 if (write) {
1545 if (WARN_ON_ONCE(pud_pfn(*pud) != pfn))
1546 return;
1547 entry = pud_mkyoung(*pud);
1548 entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
1549 if (pudp_set_access_flags(vma, addr, pud, entry, 1))
1550 update_mmu_cache_pud(vma, addr, pud);
1551 }
1552 return;
1553 }
1554
1555 if (fop.is_folio) {
1556 entry = folio_mk_pud(fop.folio, vma->vm_page_prot);
1557
1558 folio_get(fop.folio);
1559 folio_add_file_rmap_pud(fop.folio, &fop.folio->page, vma);
1560 add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PUD_NR);
1561 } else {
1562 entry = pud_mkhuge(pfn_pud(fop.pfn, prot));
1563 entry = pud_mkspecial(entry);
1564 }
1565 if (write) {
1566 entry = pud_mkyoung(pud_mkdirty(entry));
1567 entry = maybe_pud_mkwrite(entry, vma);
1568 }
1569 set_pud_at(mm, addr, pud, entry);
1570 update_mmu_cache_pud(vma, addr, pud);
1571}
1572
/**
 * vmf_insert_pfn_pud - insert a pud size pfn
 * @vmf: Structure describing the fault
 * @pfn: pfn to insert
 * @write: whether it's a write fault
 *
 * Insert a pud size pfn. See vmf_insert_pfn() for additional info.
 *
 * Return: vm_fault_t value.
 */
1583vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, unsigned long pfn,
1584 bool write)
1585{
1586 unsigned long addr = vmf->address & PUD_MASK;
1587 struct vm_area_struct *vma = vmf->vma;
1588 pgprot_t pgprot = vma->vm_page_prot;
1589 struct folio_or_pfn fop = {
1590 .pfn = pfn,
1591 };
1592 spinlock_t *ptl;
1593
 /*
 * Sanity-check the mapping type: pfn insertion is only legal for
 * VM_PFNMAP or VM_MIXEDMAP mappings (never both at once), and never
 * for a COW-able VM_PFNMAP mapping.
 */
1599 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
1600 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1601 (VM_PFNMAP|VM_MIXEDMAP));
1602 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
1603
1604 if (addr < vma->vm_start || addr >= vma->vm_end)
1605 return VM_FAULT_SIGBUS;
1606
1607 pfnmap_setup_cachemode_pfn(pfn, &pgprot);
1608
1609 ptl = pud_lock(vma->vm_mm, vmf->pud);
1610 insert_pud(vma, addr, vmf->pud, fop, pgprot, write);
1611 spin_unlock(ptl);
1612
1613 return VM_FAULT_NOPAGE;
1614}
1615EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
1616
/**
 * vmf_insert_folio_pud - insert a pud size folio mapped by a pud entry
 * @vmf: Structure describing the fault
 * @folio: folio to insert
 * @write: whether it's a write fault
 *
 * Return: vm_fault_t value.
 */
1625vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio,
1626 bool write)
1627{
1628 struct vm_area_struct *vma = vmf->vma;
1629 unsigned long addr = vmf->address & PUD_MASK;
1630 pud_t *pud = vmf->pud;
1631 struct mm_struct *mm = vma->vm_mm;
1632 struct folio_or_pfn fop = {
1633 .folio = folio,
1634 .is_folio = true,
1635 };
1636 spinlock_t *ptl;
1637
1638 if (addr < vma->vm_start || addr >= vma->vm_end)
1639 return VM_FAULT_SIGBUS;
1640
1641 if (WARN_ON_ONCE(folio_order(folio) != PUD_ORDER))
1642 return VM_FAULT_SIGBUS;
1643
1644 ptl = pud_lock(mm, pud);
1645 insert_pud(vma, addr, vmf->pud, fop, vma->vm_page_prot, write);
1646 spin_unlock(ptl);
1647
1648 return VM_FAULT_NOPAGE;
1649}
1650EXPORT_SYMBOL_GPL(vmf_insert_folio_pud);
1651#endif
1652
1653void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
1654 pmd_t *pmd, bool write)
1655{
1656 pmd_t _pmd;
1657
1658 _pmd = pmd_mkyoung(*pmd);
1659 if (write)
1660 _pmd = pmd_mkdirty(_pmd);
1661 if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
1662 pmd, _pmd, write))
1663 update_mmu_cache_pmd(vma, addr, pmd);
1664}
1665
1666int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1667 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
1668 struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
1669{
1670 spinlock_t *dst_ptl, *src_ptl;
1671 struct page *src_page;
1672 struct folio *src_folio;
1673 pmd_t pmd;
1674 pgtable_t pgtable = NULL;
1675 int ret = -ENOMEM;
1676
1677 pmd = pmdp_get_lockless(src_pmd);
1678 if (unlikely(pmd_present(pmd) && pmd_special(pmd))) {
1679 dst_ptl = pmd_lock(dst_mm, dst_pmd);
1680 src_ptl = pmd_lockptr(src_mm, src_pmd);
1681 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691 VM_WARN_ON_ONCE(is_cow_mapping(src_vma->vm_flags) && pmd_write(pmd));
1692 goto set_pmd;
1693 }
1694
1695
1696 if (!vma_is_anonymous(dst_vma))
1697 return 0;
1698
1699 pgtable = pte_alloc_one(dst_mm);
1700 if (unlikely(!pgtable))
1701 goto out;
1702
1703 dst_ptl = pmd_lock(dst_mm, dst_pmd);
1704 src_ptl = pmd_lockptr(src_mm, src_pmd);
1705 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
1706
1707 ret = -EAGAIN;
1708 pmd = *src_pmd;
1709
1710#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
1711 if (unlikely(is_swap_pmd(pmd))) {
1712 swp_entry_t entry = pmd_to_swp_entry(pmd);
1713
1714 VM_BUG_ON(!is_pmd_migration_entry(pmd));
1715 if (!is_readable_migration_entry(entry)) {
1716 entry = make_readable_migration_entry(
1717 swp_offset(entry));
1718 pmd = swp_entry_to_pmd(entry);
1719 if (pmd_swp_soft_dirty(*src_pmd))
1720 pmd = pmd_swp_mksoft_dirty(pmd);
1721 if (pmd_swp_uffd_wp(*src_pmd))
1722 pmd = pmd_swp_mkuffd_wp(pmd);
1723 set_pmd_at(src_mm, addr, src_pmd, pmd);
1724 }
1725 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
1726 mm_inc_nr_ptes(dst_mm);
1727 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
1728 if (!userfaultfd_wp(dst_vma))
1729 pmd = pmd_swp_clear_uffd_wp(pmd);
1730 set_pmd_at(dst_mm, addr, dst_pmd, pmd);
1731 ret = 0;
1732 goto out_unlock;
1733 }
1734#endif
1735
1736 if (unlikely(!pmd_trans_huge(pmd))) {
1737 pte_free(dst_mm, pgtable);
1738 goto out_unlock;
1739 }
1740
1741
1742
1743
1744
1745 if (is_huge_zero_pmd(pmd)) {
1746
1747
1748
1749
1750
1751 mm_get_huge_zero_folio(dst_mm);
1752 goto out_zero_page;
1753 }
1754
1755 src_page = pmd_page(pmd);
1756 VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
1757 src_folio = page_folio(src_page);
1758
1759 folio_get(src_folio);
1760 if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, src_page, dst_vma, src_vma))) {
1761
1762 folio_put(src_folio);
1763 pte_free(dst_mm, pgtable);
1764 spin_unlock(src_ptl);
1765 spin_unlock(dst_ptl);
1766 __split_huge_pmd(src_vma, src_pmd, addr, false);
1767 return -EAGAIN;
1768 }
1769 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
1770out_zero_page:
1771 mm_inc_nr_ptes(dst_mm);
1772 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
1773 pmdp_set_wrprotect(src_mm, addr, src_pmd);
1774 if (!userfaultfd_wp(dst_vma))
1775 pmd = pmd_clear_uffd_wp(pmd);
1776 pmd = pmd_wrprotect(pmd);
1777set_pmd:
1778 pmd = pmd_mkold(pmd);
1779 set_pmd_at(dst_mm, addr, dst_pmd, pmd);
1780
1781 ret = 0;
1782out_unlock:
1783 spin_unlock(src_ptl);
1784 spin_unlock(dst_ptl);
1785out:
1786 return ret;
1787}
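
/*
 * Note on the write-protect above: for a normal anonymous THP the parent's
 * PMD is write-protected and the folio is shared with the child, so the
 * first write in either mm goes through do_huge_pmd_wp_page(), which reuses
 * the folio when it is exclusive and otherwise splits the PMD and falls
 * back to PTE-level copy-on-write; if duplicating the anon rmap fails here,
 * the source PMD is split and -EAGAIN tells the caller to retry at the PTE
 * level.
 */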
1788
1789#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1790void touch_pud(struct vm_area_struct *vma, unsigned long addr,
1791 pud_t *pud, bool write)
1792{
1793 pud_t _pud;
1794
1795 _pud = pud_mkyoung(*pud);
1796 if (write)
1797 _pud = pud_mkdirty(_pud);
1798 if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
1799 pud, _pud, write))
1800 update_mmu_cache_pud(vma, addr, pud);
1801}
1802
1803int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1804 pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
1805 struct vm_area_struct *vma)
1806{
1807 spinlock_t *dst_ptl, *src_ptl;
1808 pud_t pud;
1809 int ret;
1810
1811 dst_ptl = pud_lock(dst_mm, dst_pud);
1812 src_ptl = pud_lockptr(src_mm, src_pud);
1813 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
1814
1815 ret = -EAGAIN;
1816 pud = *src_pud;
1817 if (unlikely(!pud_trans_huge(pud)))
1818 goto out_unlock;
1819
1820
1821
1822
1823
1824 if (is_cow_mapping(vma->vm_flags) && pud_write(pud)) {
1825 pudp_set_wrprotect(src_mm, addr, src_pud);
1826 pud = pud_wrprotect(pud);
1827 }
1828 pud = pud_mkold(pud);
1829 set_pud_at(dst_mm, addr, dst_pud, pud);
1830
1831 ret = 0;
1832out_unlock:
1833 spin_unlock(src_ptl);
1834 spin_unlock(dst_ptl);
1835 return ret;
1836}
1837
1838void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
1839{
1840 bool write = vmf->flags & FAULT_FLAG_WRITE;
1841
1842 vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
1843 if (unlikely(!pud_same(*vmf->pud, orig_pud)))
1844 goto unlock;
1845
1846 touch_pud(vmf->vma, vmf->address, vmf->pud, write);
1847unlock:
1848 spin_unlock(vmf->ptl);
1849}
1850#endif
1851
1852void huge_pmd_set_accessed(struct vm_fault *vmf)
1853{
1854 bool write = vmf->flags & FAULT_FLAG_WRITE;
1855
1856 vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
1857 if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd)))
1858 goto unlock;
1859
1860 touch_pmd(vmf->vma, vmf->address, vmf->pmd, write);
1861
1862unlock:
1863 spin_unlock(vmf->ptl);
1864}
1865
1866static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf)
1867{
1868 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1869 struct vm_area_struct *vma = vmf->vma;
1870 struct mmu_notifier_range range;
1871 struct folio *folio;
1872 vm_fault_t ret = 0;
1873
1874 folio = vma_alloc_anon_folio_pmd(vma, vmf->address);
1875 if (unlikely(!folio))
1876 return VM_FAULT_FALLBACK;
1877
1878 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr,
1879 haddr + HPAGE_PMD_SIZE);
1880 mmu_notifier_invalidate_range_start(&range);
1881 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
1882 if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)))
1883 goto release;
1884 ret = check_stable_address_space(vma->vm_mm);
1885 if (ret)
1886 goto release;
1887 (void)pmdp_huge_clear_flush(vma, haddr, vmf->pmd);
1888 map_anon_folio_pmd(folio, vmf->pmd, vma, haddr);
1889 goto unlock;
1890release:
1891 folio_put(folio);
1892unlock:
1893 spin_unlock(vmf->ptl);
1894 mmu_notifier_invalidate_range_end(&range);
1895 return ret;
1896}
1897
1898vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
1899{
1900 const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
1901 struct vm_area_struct *vma = vmf->vma;
1902 struct folio *folio;
1903 struct page *page;
1904 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1905 pmd_t orig_pmd = vmf->orig_pmd;
1906
1907 vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
1908 VM_BUG_ON_VMA(!vma->anon_vma, vma);
1909
1910 if (is_huge_zero_pmd(orig_pmd)) {
1911 vm_fault_t ret = do_huge_zero_wp_pmd(vmf);
1912
1913 if (!(ret & VM_FAULT_FALLBACK))
1914 return ret;
1915
1916
1917 goto fallback;
1918 }
1919
1920 spin_lock(vmf->ptl);
1921
1922 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
1923 spin_unlock(vmf->ptl);
1924 return 0;
1925 }
1926
1927 page = pmd_page(orig_pmd);
1928 folio = page_folio(page);
1929 VM_BUG_ON_PAGE(!PageHead(page), page);
1930
 /* Early check when only holding the PT lock. */
1932 if (PageAnonExclusive(page))
1933 goto reuse;
1934
1935 if (!folio_trylock(folio)) {
1936 folio_get(folio);
1937 spin_unlock(vmf->ptl);
1938 folio_lock(folio);
1939 spin_lock(vmf->ptl);
1940 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
1941 spin_unlock(vmf->ptl);
1942 folio_unlock(folio);
1943 folio_put(folio);
1944 return 0;
1945 }
1946 folio_put(folio);
1947 }
1948
 /* Recheck after temporarily dropping the PT lock. */
1950 if (PageAnonExclusive(page)) {
1951 folio_unlock(folio);
1952 goto reuse;
1953 }
1954
 /*
 * See do_wp_page(): we can only reuse the folio exclusively if
 * there are no additional references. Note that we always drain
 * the LRU cache immediately after adding a THP.
 */
1960 if (folio_ref_count(folio) >
1961 1 + folio_test_swapcache(folio) * folio_nr_pages(folio))
1962 goto unlock_fallback;
1963 if (folio_test_swapcache(folio))
1964 folio_free_swap(folio);
1965 if (folio_ref_count(folio) == 1) {
1966 pmd_t entry;
1967
1968 folio_move_anon_rmap(folio, vma);
1969 SetPageAnonExclusive(page);
1970 folio_unlock(folio);
1971reuse:
1972 if (unlikely(unshare)) {
1973 spin_unlock(vmf->ptl);
1974 return 0;
1975 }
1976 entry = pmd_mkyoung(orig_pmd);
1977 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1978 if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
1979 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
1980 spin_unlock(vmf->ptl);
1981 return 0;
1982 }
1983
1984unlock_fallback:
1985 folio_unlock(folio);
1986 spin_unlock(vmf->ptl);
1987fallback:
1988 __split_huge_pmd(vma, vmf->pmd, vmf->address, false);
1989 return VM_FAULT_FALLBACK;
1990}
1991
1992static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
1993 unsigned long addr, pmd_t pmd)
1994{
1995 struct page *page;
1996
1997 if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE)))
1998 return false;
1999
2000
2001 if (pmd_protnone(pmd))
2002 return false;
2003
2004
2005 if (pmd_needs_soft_dirty_wp(vma, pmd))
2006 return false;
2007
2008
2009 if (userfaultfd_huge_pmd_wp(vma, pmd))
2010 return false;
2011
2012 if (!(vma->vm_flags & VM_SHARED)) {
2013
2014 page = vm_normal_page_pmd(vma, addr, pmd);
2015 return page && PageAnon(page) && PageAnonExclusive(page);
2016 }
2017
2018
2019 return pmd_dirty(pmd);
2020}
2021
2022
2023vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
2024{
2025 struct vm_area_struct *vma = vmf->vma;
2026 struct folio *folio;
2027 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
2028 int nid = NUMA_NO_NODE;
2029 int target_nid, last_cpupid;
2030 pmd_t pmd, old_pmd;
2031 bool writable = false;
2032 int flags = 0;
2033
2034 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
2035 old_pmd = pmdp_get(vmf->pmd);
2036
2037 if (unlikely(!pmd_same(old_pmd, vmf->orig_pmd))) {
2038 spin_unlock(vmf->ptl);
2039 return 0;
2040 }
2041
2042 pmd = pmd_modify(old_pmd, vma->vm_page_prot);
2043
 /*
 * Detect now whether the PMD could be writable; this information
 * is only valid while holding the page table lock.
 */
2048 writable = pmd_write(pmd);
2049 if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
2050 can_change_pmd_writable(vma, vmf->address, pmd))
2051 writable = true;
2052
2053 folio = vm_normal_folio_pmd(vma, haddr, pmd);
2054 if (!folio)
2055 goto out_map;
2056
2057 nid = folio_nid(folio);
2058
2059 target_nid = numa_migrate_check(folio, vmf, haddr, &flags, writable,
2060 &last_cpupid);
2061 if (target_nid == NUMA_NO_NODE)
2062 goto out_map;
2063 if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) {
2064 flags |= TNF_MIGRATE_FAIL;
2065 goto out_map;
2066 }
2067
2068 spin_unlock(vmf->ptl);
2069 writable = false;
2070
2071 if (!migrate_misplaced_folio(folio, target_nid)) {
2072 flags |= TNF_MIGRATED;
2073 nid = target_nid;
2074 task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
2075 return 0;
2076 }
2077
2078 flags |= TNF_MIGRATE_FAIL;
2079 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
2080 if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd))) {
2081 spin_unlock(vmf->ptl);
2082 return 0;
2083 }
2084out_map:
2085
2086 pmd = pmd_modify(pmdp_get(vmf->pmd), vma->vm_page_prot);
2087 pmd = pmd_mkyoung(pmd);
2088 if (writable)
2089 pmd = pmd_mkwrite(pmd, vma);
2090 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
2091 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
2092 spin_unlock(vmf->ptl);
2093
2094 if (nid != NUMA_NO_NODE)
2095 task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
2096 return 0;
2097}
2098
/*
 * Return true if we do MADV_FREE successfully on the entire pmd page.
 * Otherwise, return false.
 */
2103bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
2104 pmd_t *pmd, unsigned long addr, unsigned long next)
2105{
2106 spinlock_t *ptl;
2107 pmd_t orig_pmd;
2108 struct folio *folio;
2109 struct mm_struct *mm = tlb->mm;
2110 bool ret = false;
2111
2112 tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
2113
2114 ptl = pmd_trans_huge_lock(pmd, vma);
2115 if (!ptl)
2116 goto out_unlocked;
2117
2118 orig_pmd = *pmd;
2119 if (is_huge_zero_pmd(orig_pmd))
2120 goto out;
2121
2122 if (unlikely(!pmd_present(orig_pmd))) {
2123 VM_BUG_ON(thp_migration_supported() &&
2124 !is_pmd_migration_entry(orig_pmd));
2125 goto out;
2126 }
2127
2128 folio = pmd_folio(orig_pmd);
2129
2130
2131
2132
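	/*
	 * If the folio may be mapped by other processes, skip it: they would
	 * all have to MADV_FREE it for the memory to actually be reclaimed.
	 */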
2133 if (folio_maybe_mapped_shared(folio))
2134 goto out;
2135
2136 if (!folio_trylock(folio))
2137 goto out;
2138
2139
2140
2141
2142
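	/*
	 * If only part of the PMD range was requested, split the folio so
	 * that MADV_FREE only deactivates the covered pages.
	 */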
2143 if (next - addr != HPAGE_PMD_SIZE) {
2144 folio_get(folio);
2145 spin_unlock(ptl);
2146 split_folio(folio);
2147 folio_unlock(folio);
2148 folio_put(folio);
2149 goto out_unlocked;
2150 }
2151
2152 if (folio_test_dirty(folio))
2153 folio_clear_dirty(folio);
2154 folio_unlock(folio);
2155
2156 if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
2157 pmdp_invalidate(vma, addr, pmd);
2158 orig_pmd = pmd_mkold(orig_pmd);
2159 orig_pmd = pmd_mkclean(orig_pmd);
2160
2161 set_pmd_at(mm, addr, pmd, orig_pmd);
2162 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
2163 }
2164
2165 folio_mark_lazyfree(folio);
2166 ret = true;
2167out:
2168 spin_unlock(ptl);
2169out_unlocked:
2170 return ret;
2171}
2172
2173static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
2174{
2175 pgtable_t pgtable;
2176
2177 pgtable = pgtable_trans_huge_withdraw(mm, pmd);
2178 pte_free(mm, pgtable);
2179 mm_dec_nr_ptes(mm);
2180}
2181
2182int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
2183 pmd_t *pmd, unsigned long addr)
2184{
2185 pmd_t orig_pmd;
2186 spinlock_t *ptl;
2187
2188 tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
2189
2190 ptl = __pmd_trans_huge_lock(pmd, vma);
2191 if (!ptl)
2192 return 0;
2193
2194
2195
2196
2197
2198
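	/*
	 * Architectures like ppc64 look at the deposited page table while
	 * clearing the huge pmd, so withdraw it (zap_deposited_table())
	 * only after pmdp_huge_get_and_clear_full() is done.
	 */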
2199 orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd,
2200 tlb->fullmm);
2201 arch_check_zapped_pmd(vma, orig_pmd);
2202 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
2203 if (!vma_is_dax(vma) && vma_is_special_huge(vma)) {
2204 if (arch_needs_pgtable_deposit())
2205 zap_deposited_table(tlb->mm, pmd);
2206 spin_unlock(ptl);
2207 } else if (is_huge_zero_pmd(orig_pmd)) {
2208 if (!vma_is_dax(vma) || arch_needs_pgtable_deposit())
2209 zap_deposited_table(tlb->mm, pmd);
2210 spin_unlock(ptl);
2211 } else {
2212 struct folio *folio = NULL;
2213 int flush_needed = 1;
2214
2215 if (pmd_present(orig_pmd)) {
2216 struct page *page = pmd_page(orig_pmd);
2217
2218 folio = page_folio(page);
2219 folio_remove_rmap_pmd(folio, page, vma);
2220 WARN_ON_ONCE(folio_mapcount(folio) < 0);
2221 VM_BUG_ON_PAGE(!PageHead(page), page);
2222 } else if (thp_migration_supported()) {
2223 swp_entry_t entry;
2224
2225 VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
2226 entry = pmd_to_swp_entry(orig_pmd);
2227 folio = pfn_swap_entry_folio(entry);
2228 flush_needed = 0;
2229 } else
2230 WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
2231
2232 if (folio_test_anon(folio)) {
2233 zap_deposited_table(tlb->mm, pmd);
2234 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
2235 } else {
2236 if (arch_needs_pgtable_deposit())
2237 zap_deposited_table(tlb->mm, pmd);
2238 add_mm_counter(tlb->mm, mm_counter_file(folio),
2239 -HPAGE_PMD_NR);
2240
2241
2242
2243
2244
2245 if (flush_needed && pmd_young(orig_pmd) &&
2246 likely(vma_has_recency(vma)))
2247 folio_mark_accessed(folio);
2248 }
2249
2250 spin_unlock(ptl);
2251 if (flush_needed)
2252 tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE);
2253 }
2254 return 1;
2255}
2256
2257#ifndef pmd_move_must_withdraw
2258static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
2259 spinlock_t *old_pmd_ptl,
2260 struct vm_area_struct *vma)
2261{
2262
2263
2264
2265
2266
2267
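	/*
	 * With split page table locks the preallocated PTE table has to move
	 * together with the pmd if the new pmd lives on a different page
	 * table page.  File mappings do not deposit page tables, so only
	 * anonymous VMAs are affected.
	 */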
2268 return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
2269}
2270#endif
2271
2272static pmd_t move_soft_dirty_pmd(pmd_t pmd)
2273{
2274#ifdef CONFIG_MEM_SOFT_DIRTY
2275 if (unlikely(is_pmd_migration_entry(pmd)))
2276 pmd = pmd_swp_mksoft_dirty(pmd);
2277 else if (pmd_present(pmd))
2278 pmd = pmd_mksoft_dirty(pmd);
2279#endif
2280 return pmd;
2281}
2282
2283static pmd_t clear_uffd_wp_pmd(pmd_t pmd)
2284{
2285 if (pmd_present(pmd))
2286 pmd = pmd_clear_uffd_wp(pmd);
2287 else if (is_swap_pmd(pmd))
2288 pmd = pmd_swp_clear_uffd_wp(pmd);
2289
2290 return pmd;
2291}
2292
2293bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
2294 unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
2295{
2296 spinlock_t *old_ptl, *new_ptl;
2297 pmd_t pmd;
2298 struct mm_struct *mm = vma->vm_mm;
2299 bool force_flush = false;
2300
2301
2302
2303
2304
2305
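	/*
	 * The destination pmd shouldn't be established, free_pgtables()
	 * should have released it; but a racing shmem/file collapse may
	 * already have inserted a page table here, in which case bail out.
	 */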
2306 if (!pmd_none(*new_pmd)) {
2307 VM_BUG_ON(pmd_trans_huge(*new_pmd));
2308 return false;
2309 }
2310
2311
2312
2313
2314
2315 old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
2316 if (old_ptl) {
2317 new_ptl = pmd_lockptr(mm, new_pmd);
2318 if (new_ptl != old_ptl)
2319 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
2320 pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
2321 if (pmd_present(pmd))
2322 force_flush = true;
2323 VM_BUG_ON(!pmd_none(*new_pmd));
2324
2325 if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
2326 pgtable_t pgtable;
2327 pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
2328 pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
2329 }
2330 pmd = move_soft_dirty_pmd(pmd);
2331 if (vma_has_uffd_without_event_remap(vma))
2332 pmd = clear_uffd_wp_pmd(pmd);
2333 set_pmd_at(mm, new_addr, new_pmd, pmd);
2334 if (force_flush)
2335 flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
2336 if (new_ptl != old_ptl)
2337 spin_unlock(new_ptl);
2338 spin_unlock(old_ptl);
2339 return true;
2340 }
2341 return false;
2342}
2343
2344
2345
2346
2347
2348
2349
2350
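/*
 * Returns:
 *  - 0 if the PMD could not be locked
 *  - 1 if the PMD was locked but the protections are unchanged and no TLB
 *    flush is necessary
 *  - HPAGE_PMD_NR if the protections were changed and a TLB flush may be
 *    required
 */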
2351int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
2352 pmd_t *pmd, unsigned long addr, pgprot_t newprot,
2353 unsigned long cp_flags)
2354{
2355 struct mm_struct *mm = vma->vm_mm;
2356 spinlock_t *ptl;
2357 pmd_t oldpmd, entry;
2358 bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
2359 bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
2360 bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
2361 int ret = 1;
2362
2363 tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
2364
2365 if (prot_numa && !thp_migration_supported())
2366 return 1;
2367
2368 ptl = __pmd_trans_huge_lock(pmd, vma);
2369 if (!ptl)
2370 return 0;
2371
2372#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
2373 if (is_swap_pmd(*pmd)) {
2374 swp_entry_t entry = pmd_to_swp_entry(*pmd);
2375 struct folio *folio = pfn_swap_entry_folio(entry);
2376 pmd_t newpmd;
2377
2378 VM_BUG_ON(!is_pmd_migration_entry(*pmd));
2379 if (is_writable_migration_entry(entry)) {
2380
2381
2382
2383
2384 if (folio_test_anon(folio))
2385 entry = make_readable_exclusive_migration_entry(swp_offset(entry));
2386 else
2387 entry = make_readable_migration_entry(swp_offset(entry));
2388 newpmd = swp_entry_to_pmd(entry);
2389 if (pmd_swp_soft_dirty(*pmd))
2390 newpmd = pmd_swp_mksoft_dirty(newpmd);
2391 } else {
2392 newpmd = *pmd;
2393 }
2394
2395 if (uffd_wp)
2396 newpmd = pmd_swp_mkuffd_wp(newpmd);
2397 else if (uffd_wp_resolve)
2398 newpmd = pmd_swp_clear_uffd_wp(newpmd);
2399 if (!pmd_same(*pmd, newpmd))
2400 set_pmd_at(mm, addr, pmd, newpmd);
2401 goto unlock;
2402 }
2403#endif
2404
2405 if (prot_numa) {
2406 struct folio *folio;
2407 bool toptier;
2408
2409
2410
2411
2412
2413 if (is_huge_zero_pmd(*pmd))
2414 goto unlock;
2415
2416 if (pmd_protnone(*pmd))
2417 goto unlock;
2418
2419 folio = pmd_folio(*pmd);
2420 toptier = node_is_toptier(folio_nid(folio));
2421
2422
2423
2424
2425 if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
2426 toptier)
2427 goto unlock;
2428
2429 if (folio_use_access_time(folio))
2430 folio_xchg_access_time(folio,
2431 jiffies_to_msecs(jiffies));
2432 }
2433
	/*
	 * In the prot_numa case we are under mmap_read_lock(mm), which is
	 * the same lock MADV_DONTNEED runs under.  The huge pmd must never
	 * be cleared and then re-established in two steps: zap_pmd_range()
	 * could observe the transient pmd_none() without the ptl, skip the
	 * pmd, and then miss the re-established mapping entirely, breaking
	 * MADV_DONTNEED semantics for userspace.
	 *
	 * pmdp_invalidate_ad() marks the entry invalid without making it
	 * pmd_none(), and makes sure dirty/young bits concurrently set by
	 * hardware are not lost.
	 */
2454 oldpmd = pmdp_invalidate_ad(vma, addr, pmd);
2455
2456 entry = pmd_modify(oldpmd, newprot);
2457 if (uffd_wp)
2458 entry = pmd_mkuffd_wp(entry);
2459 else if (uffd_wp_resolve)
2460
2461
2462
2463
2464
2465 entry = pmd_clear_uffd_wp(entry);
2466
2467
2468 if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) &&
2469 can_change_pmd_writable(vma, addr, entry))
2470 entry = pmd_mkwrite(entry, vma);
2471
2472 ret = HPAGE_PMD_NR;
2473 set_pmd_at(mm, addr, pmd, entry);
2474
2475 if (huge_pmd_needs_flush(oldpmd, entry))
2476 tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE);
2477unlock:
2478 spin_unlock(ptl);
2479 return ret;
2480}
2481
2482
2483
2484
2485
2486
2487
2488
2489#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
2490int change_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
2491 pud_t *pudp, unsigned long addr, pgprot_t newprot,
2492 unsigned long cp_flags)
2493{
2494 struct mm_struct *mm = vma->vm_mm;
2495 pud_t oldpud, entry;
2496 spinlock_t *ptl;
2497
2498 tlb_change_page_size(tlb, HPAGE_PUD_SIZE);
2499
2500
2501 if (cp_flags & MM_CP_PROT_NUMA)
2502 return 1;
2503
2504
2505
2506
2507
2508 if (WARN_ON_ONCE(cp_flags & MM_CP_UFFD_WP_ALL))
2509 return 1;
2510
2511 ptl = __pud_trans_huge_lock(pudp, vma);
2512 if (!ptl)
2513 return 0;
2514
2515
2516
2517
2518
2519 oldpud = pudp_invalidate(vma, addr, pudp);
2520 entry = pud_modify(oldpud, newprot);
2521 set_pud_at(mm, addr, pudp, entry);
2522 tlb_flush_pud_range(tlb, addr, HPAGE_PUD_SIZE);
2523
2524 spin_unlock(ptl);
2525 return HPAGE_PUD_NR;
2526}
2527#endif
2528
2529#ifdef CONFIG_USERFAULTFD
/*
 * The PT lock for src_pmd and the vma locks for dst_vma/src_vma are held by
 * the caller; this function returns with the PT lock released.  Move the
 * PMD-mapped page from src_pmd to dst_pmd if possible.  Returns zero on
 * success, -EAGAIN if the operation needs to be retried by the caller, or
 * another negative error on failure.
 */
2537int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
2538 struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
2539 unsigned long dst_addr, unsigned long src_addr)
2540{
2541 pmd_t _dst_pmd, src_pmdval;
2542 struct page *src_page;
2543 struct folio *src_folio;
2544 struct anon_vma *src_anon_vma;
2545 spinlock_t *src_ptl, *dst_ptl;
2546 pgtable_t src_pgtable;
2547 struct mmu_notifier_range range;
2548 int err = 0;
2549
2550 src_pmdval = *src_pmd;
2551 src_ptl = pmd_lockptr(mm, src_pmd);
2552
2553 lockdep_assert_held(src_ptl);
2554 vma_assert_locked(src_vma);
2555 vma_assert_locked(dst_vma);
2556
2557
2558 if (WARN_ON_ONCE(!pmd_none(dst_pmdval)) || WARN_ON_ONCE(src_addr & ~HPAGE_PMD_MASK) ||
2559 WARN_ON_ONCE(dst_addr & ~HPAGE_PMD_MASK)) {
2560 spin_unlock(src_ptl);
2561 return -EINVAL;
2562 }
2563
2564 if (!pmd_trans_huge(src_pmdval)) {
2565 spin_unlock(src_ptl);
2566 if (is_pmd_migration_entry(src_pmdval)) {
2567 pmd_migration_entry_wait(mm, &src_pmdval);
2568 return -EAGAIN;
2569 }
2570 return -ENOENT;
2571 }
2572
2573 src_page = pmd_page(src_pmdval);
2574
2575 if (!is_huge_zero_pmd(src_pmdval)) {
2576 if (unlikely(!PageAnonExclusive(src_page))) {
2577 spin_unlock(src_ptl);
2578 return -EBUSY;
2579 }
2580
2581 src_folio = page_folio(src_page);
2582 folio_get(src_folio);
2583 } else
2584 src_folio = NULL;
2585
2586 spin_unlock(src_ptl);
2587
2588 flush_cache_range(src_vma, src_addr, src_addr + HPAGE_PMD_SIZE);
2589 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, src_addr,
2590 src_addr + HPAGE_PMD_SIZE);
2591 mmu_notifier_invalidate_range_start(&range);
2592
2593 if (src_folio) {
2594 folio_lock(src_folio);
2595
2596
2597
2598
2599
2600
2601 src_anon_vma = folio_get_anon_vma(src_folio);
2602 if (!src_anon_vma) {
2603 err = -EAGAIN;
2604 goto unlock_folio;
2605 }
2606 anon_vma_lock_write(src_anon_vma);
2607 } else
2608 src_anon_vma = NULL;
2609
2610 dst_ptl = pmd_lockptr(mm, dst_pmd);
2611 double_pt_lock(src_ptl, dst_ptl);
2612 if (unlikely(!pmd_same(*src_pmd, src_pmdval) ||
2613 !pmd_same(*dst_pmd, dst_pmdval))) {
2614 err = -EAGAIN;
2615 goto unlock_ptls;
2616 }
2617 if (src_folio) {
2618 if (folio_maybe_dma_pinned(src_folio) ||
2619 !PageAnonExclusive(&src_folio->page)) {
2620 err = -EBUSY;
2621 goto unlock_ptls;
2622 }
2623
2624 if (WARN_ON_ONCE(!folio_test_head(src_folio)) ||
2625 WARN_ON_ONCE(!folio_test_anon(src_folio))) {
2626 err = -EBUSY;
2627 goto unlock_ptls;
2628 }
2629
2630 src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
2631
2632 if (folio_maybe_dma_pinned(src_folio)) {
2633 set_pmd_at(mm, src_addr, src_pmd, src_pmdval);
2634 err = -EBUSY;
2635 goto unlock_ptls;
2636 }
2637
2638 folio_move_anon_rmap(src_folio, dst_vma);
2639 src_folio->index = linear_page_index(dst_vma, dst_addr);
2640
2641 _dst_pmd = folio_mk_pmd(src_folio, dst_vma->vm_page_prot);
2642
2643 _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
2644 } else {
2645 src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
2646 _dst_pmd = folio_mk_pmd(src_folio, dst_vma->vm_page_prot);
2647 }
2648 set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd);
2649
2650 src_pgtable = pgtable_trans_huge_withdraw(mm, src_pmd);
2651 pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable);
2652unlock_ptls:
2653 double_pt_unlock(src_ptl, dst_ptl);
2654 if (src_anon_vma) {
2655 anon_vma_unlock_write(src_anon_vma);
2656 put_anon_vma(src_anon_vma);
2657 }
2658unlock_folio:
2659
2660 if (src_folio)
2661 folio_unlock(src_folio);
2662 mmu_notifier_invalidate_range_end(&range);
2663 if (src_folio)
2664 folio_put(src_folio);
2665 return err;
2666}
2667#endif
2668
2669
2670
2671
2672
2673
2674
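/*
 * Returns the page table lock pointer, still held, if *pmd maps a THP or is
 * a swap/migration pmd entry; otherwise drops the lock and returns NULL.
 */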
2675spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
2676{
2677 spinlock_t *ptl;
2678 ptl = pmd_lock(vma->vm_mm, pmd);
2679 if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd)))
2680 return ptl;
2681 spin_unlock(ptl);
2682 return NULL;
2683}
2684
2685
2686
2687
2688
2689
2690
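/*
 * Returns the page table lock pointer, still held, if *pud maps a huge page;
 * otherwise drops the lock and returns NULL.
 */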
2691spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
2692{
2693 spinlock_t *ptl;
2694
2695 ptl = pud_lock(vma->vm_mm, pud);
2696 if (likely(pud_trans_huge(*pud)))
2697 return ptl;
2698 spin_unlock(ptl);
2699 return NULL;
2700}
2701
2702#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
2703int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
2704 pud_t *pud, unsigned long addr)
2705{
2706 spinlock_t *ptl;
2707 pud_t orig_pud;
2708
2709 ptl = __pud_trans_huge_lock(pud, vma);
2710 if (!ptl)
2711 return 0;
2712
2713 orig_pud = pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
2714 arch_check_zapped_pud(vma, orig_pud);
2715 tlb_remove_pud_tlb_entry(tlb, pud, addr);
2716 if (!vma_is_dax(vma) && vma_is_special_huge(vma)) {
2717 spin_unlock(ptl);
2718
2719 } else {
2720 struct page *page = NULL;
2721 struct folio *folio;
2722
2723
2724 VM_WARN_ON_ONCE(vma_is_anonymous(vma) ||
2725 !pud_present(orig_pud));
2726
2727 page = pud_page(orig_pud);
2728 folio = page_folio(page);
2729 folio_remove_rmap_pud(folio, page, vma);
2730 add_mm_counter(tlb->mm, mm_counter_file(folio), -HPAGE_PUD_NR);
2731
2732 spin_unlock(ptl);
2733 tlb_remove_page_size(tlb, page, HPAGE_PUD_SIZE);
2734 }
2735 return 1;
2736}
2737
2738static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
2739 unsigned long haddr)
2740{
2741 struct folio *folio;
2742 struct page *page;
2743 pud_t old_pud;
2744
2745 VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
2746 VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
2747 VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
2748 VM_BUG_ON(!pud_trans_huge(*pud));
2749
2750 count_vm_event(THP_SPLIT_PUD);
2751
2752 old_pud = pudp_huge_clear_flush(vma, haddr, pud);
2753
2754 if (!vma_is_dax(vma))
2755 return;
2756
2757 page = pud_page(old_pud);
2758 folio = page_folio(page);
2759
2760 if (!folio_test_dirty(folio) && pud_dirty(old_pud))
2761 folio_mark_dirty(folio);
2762 if (!folio_test_referenced(folio) && pud_young(old_pud))
2763 folio_set_referenced(folio);
2764 folio_remove_rmap_pud(folio, page, vma);
2765 folio_put(folio);
2766 add_mm_counter(vma->vm_mm, mm_counter_file(folio),
2767 -HPAGE_PUD_NR);
2768}
2769
2770void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
2771 unsigned long address)
2772{
2773 spinlock_t *ptl;
2774 struct mmu_notifier_range range;
2775
2776 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
2777 address & HPAGE_PUD_MASK,
2778 (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
2779 mmu_notifier_invalidate_range_start(&range);
2780 ptl = pud_lock(vma->vm_mm, pud);
2781 if (unlikely(!pud_trans_huge(*pud)))
2782 goto out;
2783 __split_huge_pud_locked(vma, pud, range.start);
2784
2785out:
2786 spin_unlock(ptl);
2787 mmu_notifier_invalidate_range_end(&range);
2788}
2789#else
2790void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
2791 unsigned long address)
2792{
2793}
2794#endif
2795
2796static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
2797 unsigned long haddr, pmd_t *pmd)
2798{
2799 struct mm_struct *mm = vma->vm_mm;
2800 pgtable_t pgtable;
2801 pmd_t _pmd, old_pmd;
2802 unsigned long addr;
2803 pte_t *pte;
2804 int i;
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814 old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
2815
2816 pgtable = pgtable_trans_huge_withdraw(mm, pmd);
2817 pmd_populate(mm, &_pmd, pgtable);
2818
2819 pte = pte_offset_map(&_pmd, haddr);
2820 VM_BUG_ON(!pte);
2821 for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
2822 pte_t entry;
2823
2824 entry = pfn_pte(my_zero_pfn(addr), vma->vm_page_prot);
2825 entry = pte_mkspecial(entry);
2826 if (pmd_uffd_wp(old_pmd))
2827 entry = pte_mkuffd_wp(entry);
2828 VM_BUG_ON(!pte_none(ptep_get(pte)));
2829 set_pte_at(mm, addr, pte, entry);
2830 pte++;
2831 }
2832 pte_unmap(pte - 1);
2833 smp_wmb();
2834 pmd_populate(mm, pmd, pgtable);
2835}
2836
2837static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
2838 unsigned long haddr, bool freeze)
2839{
2840 struct mm_struct *mm = vma->vm_mm;
2841 struct folio *folio;
2842 struct page *page;
2843 pgtable_t pgtable;
2844 pmd_t old_pmd, _pmd;
2845 bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
2846 bool anon_exclusive = false, dirty = false;
2847 unsigned long addr;
2848 pte_t *pte;
2849 int i;
2850
2851 VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
2852 VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
2853 VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
2854 VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd));
2855
2856 count_vm_event(THP_SPLIT_PMD);
2857
2858 if (!vma_is_anonymous(vma)) {
2859 old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
2860
2861
2862
2863
2864 if (arch_needs_pgtable_deposit())
2865 zap_deposited_table(mm, pmd);
2866 if (!vma_is_dax(vma) && vma_is_special_huge(vma))
2867 return;
2868 if (unlikely(is_pmd_migration_entry(old_pmd))) {
2869 swp_entry_t entry;
2870
2871 entry = pmd_to_swp_entry(old_pmd);
2872 folio = pfn_swap_entry_folio(entry);
2873 } else if (is_huge_zero_pmd(old_pmd)) {
2874 return;
2875 } else {
2876 page = pmd_page(old_pmd);
2877 folio = page_folio(page);
2878 if (!folio_test_dirty(folio) && pmd_dirty(old_pmd))
2879 folio_mark_dirty(folio);
2880 if (!folio_test_referenced(folio) && pmd_young(old_pmd))
2881 folio_set_referenced(folio);
2882 folio_remove_rmap_pmd(folio, page, vma);
2883 folio_put(folio);
2884 }
2885 add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR);
2886 return;
2887 }
2888
2889 if (is_huge_zero_pmd(*pmd)) {
		/*
		 * The huge zero page is replaced by PTE mappings of the small
		 * zero page.  Both are read-only zero mappings, so there is
		 * no need to invalidate secondary MMUs here.
		 */
2899 return __split_huge_zero_page_pmd(vma, haddr, pmd);
2900 }
2901
2902 pmd_migration = is_pmd_migration_entry(*pmd);
2903 if (unlikely(pmd_migration)) {
2904 swp_entry_t entry;
2905
2906 old_pmd = *pmd;
2907 entry = pmd_to_swp_entry(old_pmd);
2908 page = pfn_swap_entry_to_page(entry);
2909 write = is_writable_migration_entry(entry);
2910 if (PageAnon(page))
2911 anon_exclusive = is_readable_exclusive_migration_entry(entry);
2912 young = is_migration_entry_young(entry);
2913 dirty = is_migration_entry_dirty(entry);
2914 soft_dirty = pmd_swp_soft_dirty(old_pmd);
2915 uffd_wp = pmd_swp_uffd_wp(old_pmd);
2916 } else {
		/*
		 * Up to this point the pmd is present and huge, and userspace
		 * keeps accessing the folio while it is split in place.  If
		 * the huge pmd were simply overwritten with the PTE-table
		 * version, a CPU could briefly hold both a huge and a small
		 * TLB entry for the same virtual address, which some CPUs do
		 * not tolerate (e.g. AMD erratum 383).  So first mark the pmd
		 * invalid (it stays non-none, keeping pmd_trans_huge() checks
		 * working), which also flushes the TLB, and only then install
		 * the regular pmd with pmd_populate() at the end.
		 */
2940 old_pmd = pmdp_invalidate(vma, haddr, pmd);
2941 page = pmd_page(old_pmd);
2942 folio = page_folio(page);
2943 if (pmd_dirty(old_pmd)) {
2944 dirty = true;
2945 folio_set_dirty(folio);
2946 }
2947 write = pmd_write(old_pmd);
2948 young = pmd_young(old_pmd);
2949 soft_dirty = pmd_soft_dirty(old_pmd);
2950 uffd_wp = pmd_uffd_wp(old_pmd);
2951
2952 VM_WARN_ON_FOLIO(!folio_ref_count(folio), folio);
2953 VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
2954
		/*
		 * Without "freeze" we simply split the PMD and propagate the
		 * PageAnonExclusive() state to every PTE-mapped subpage.
		 *
		 * With "freeze" the PTEs are replaced by migration entries
		 * right away, which is only allowed once PageAnonExclusive()
		 * has been cleared via folio_try_share_anon_rmap_pmd().  If
		 * that fails, fall back to a plain split and let
		 * try_to_migrate_one() fail later on.
		 */
2970 anon_exclusive = PageAnonExclusive(page);
2971 if (freeze && anon_exclusive &&
2972 folio_try_share_anon_rmap_pmd(folio, page))
2973 freeze = false;
2974 if (!freeze) {
2975 rmap_t rmap_flags = RMAP_NONE;
2976
2977 folio_ref_add(folio, HPAGE_PMD_NR - 1);
2978 if (anon_exclusive)
2979 rmap_flags |= RMAP_EXCLUSIVE;
2980 folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
2981 vma, haddr, rmap_flags);
2982 }
2983 }
2984
2985
2986
2987
2988
2989 pgtable = pgtable_trans_huge_withdraw(mm, pmd);
2990 pmd_populate(mm, &_pmd, pgtable);
2991
2992 pte = pte_offset_map(&_pmd, haddr);
2993 VM_BUG_ON(!pte);
2994
2995
2996
2997
2998
2999 if (freeze || pmd_migration) {
3000 for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
3001 pte_t entry;
3002 swp_entry_t swp_entry;
3003
3004 if (write)
3005 swp_entry = make_writable_migration_entry(
3006 page_to_pfn(page + i));
3007 else if (anon_exclusive)
3008 swp_entry = make_readable_exclusive_migration_entry(
3009 page_to_pfn(page + i));
3010 else
3011 swp_entry = make_readable_migration_entry(
3012 page_to_pfn(page + i));
3013 if (young)
3014 swp_entry = make_migration_entry_young(swp_entry);
3015 if (dirty)
3016 swp_entry = make_migration_entry_dirty(swp_entry);
3017 entry = swp_entry_to_pte(swp_entry);
3018 if (soft_dirty)
3019 entry = pte_swp_mksoft_dirty(entry);
3020 if (uffd_wp)
3021 entry = pte_swp_mkuffd_wp(entry);
3022
3023 VM_WARN_ON(!pte_none(ptep_get(pte + i)));
3024 set_pte_at(mm, addr, pte + i, entry);
3025 }
3026 } else {
3027 pte_t entry;
3028
3029 entry = mk_pte(page, READ_ONCE(vma->vm_page_prot));
3030 if (write)
3031 entry = pte_mkwrite(entry, vma);
3032 if (!young)
3033 entry = pte_mkold(entry);
3034
3035 if (dirty)
3036 entry = pte_mkdirty(entry);
3037 if (soft_dirty)
3038 entry = pte_mksoft_dirty(entry);
3039 if (uffd_wp)
3040 entry = pte_mkuffd_wp(entry);
3041
3042 for (i = 0; i < HPAGE_PMD_NR; i++)
3043 VM_WARN_ON(!pte_none(ptep_get(pte + i)));
3044
3045 set_ptes(mm, haddr, pte, entry, HPAGE_PMD_NR);
3046 }
3047 pte_unmap(pte);
3048
3049 if (!pmd_migration)
3050 folio_remove_rmap_pmd(folio, page, vma);
3051 if (freeze)
3052 put_page(page);
3053
3054 smp_wmb();
3055 pmd_populate(mm, pmd, pgtable);
3056}
3057
3058void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
3059 pmd_t *pmd, bool freeze)
3060{
3061 VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
3062 if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd))
3063 __split_huge_pmd_locked(vma, pmd, address, freeze);
3064}
3065
3066void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
3067 unsigned long address, bool freeze)
3068{
3069 spinlock_t *ptl;
3070 struct mmu_notifier_range range;
3071
3072 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
3073 address & HPAGE_PMD_MASK,
3074 (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
3075 mmu_notifier_invalidate_range_start(&range);
3076 ptl = pmd_lock(vma->vm_mm, pmd);
3077 split_huge_pmd_locked(vma, range.start, pmd, freeze);
3078 spin_unlock(ptl);
3079 mmu_notifier_invalidate_range_end(&range);
3080}
3081
3082void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
3083 bool freeze)
3084{
3085 pmd_t *pmd = mm_find_pmd(vma->vm_mm, address);
3086
3087 if (!pmd)
3088 return;
3089
3090 __split_huge_pmd(vma, pmd, address, freeze);
3091}
3092
3093static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address)
3094{
3095
3096
3097
3098
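	/*
	 * If the address is not PMD-aligned but the PMD-sized range around it
	 * lies entirely within the VMA, a huge pmd may span the boundary:
	 * split it.
	 */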
3099 if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) &&
3100 range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE),
3101 ALIGN(address, HPAGE_PMD_SIZE)))
3102 split_huge_pmd_address(vma, address, false);
3103}
3104
3105void vma_adjust_trans_huge(struct vm_area_struct *vma,
3106 unsigned long start,
3107 unsigned long end,
3108 struct vm_area_struct *next)
3109{
3110
3111 split_huge_pmd_if_needed(vma, start);
3112
3113
3114 split_huge_pmd_if_needed(vma, end);
3115
3116
3117 if (next)
3118 split_huge_pmd_if_needed(next, end);
3119}
3120
3121static void unmap_folio(struct folio *folio)
3122{
3123 enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SYNC |
3124 TTU_BATCH_FLUSH;
3125
3126 VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
3127
3128 if (folio_test_pmd_mappable(folio))
3129 ttu_flags |= TTU_SPLIT_HUGE_PMD;
3130
3131
3132
3133
3134
3135
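	/*
	 * Anonymous folios are replaced by migration entries so the mappings
	 * can be re-established after the split; file-backed folios are
	 * simply unmapped and will be faulted back in from the page cache.
	 */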
3136 if (folio_test_anon(folio))
3137 try_to_migrate(folio, ttu_flags);
3138 else
3139 try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK);
3140
3141 try_to_unmap_flush();
3142}
3143
3144static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma,
3145 unsigned long addr, pmd_t *pmdp,
3146 struct folio *folio)
3147{
3148 struct mm_struct *mm = vma->vm_mm;
3149 int ref_count, map_count;
3150 pmd_t orig_pmd = *pmdp;
3151
3152 if (pmd_dirty(orig_pmd))
3153 folio_set_dirty(folio);
3154 if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
3155 folio_set_swapbacked(folio);
3156 return false;
3157 }
3158
3159 orig_pmd = pmdp_huge_clear_flush(vma, addr, pmdp);
3160
3161
3162
3163
3164
3165
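	/*
	 * Sync against concurrent GUP-fast:
	 *  - here:     clear PMD; barrier; read refcount
	 *  - GUP-fast: inc refcount; barrier; re-check PMD
	 */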
3166 smp_mb();
3167
3168 ref_count = folio_ref_count(folio);
3169 map_count = folio_mapcount(folio);
3170
3171
3172
3173
3174
3175 smp_rmb();
3176
3177
3178
3179
3180
3181
3182
3183
3184 if (pmd_dirty(orig_pmd))
3185 folio_set_dirty(folio);
3186 if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
3187 folio_set_swapbacked(folio);
3188 set_pmd_at(mm, addr, pmdp, orig_pmd);
3189 return false;
3190 }
3191
3192 if (ref_count != map_count + 1) {
3193 set_pmd_at(mm, addr, pmdp, orig_pmd);
3194 return false;
3195 }
3196
3197 folio_remove_rmap_pmd(folio, pmd_page(orig_pmd), vma);
3198 zap_deposited_table(mm, pmdp);
3199 add_mm_counter(mm, MM_ANONPAGES, -HPAGE_PMD_NR);
3200 if (vma->vm_flags & VM_LOCKED)
3201 mlock_drain_local();
3202 folio_put(folio);
3203
3204 return true;
3205}
3206
3207bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
3208 pmd_t *pmdp, struct folio *folio)
3209{
3210 VM_WARN_ON_FOLIO(!folio_test_pmd_mappable(folio), folio);
3211 VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
3212 VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
3213 VM_WARN_ON_FOLIO(folio_test_swapbacked(folio), folio);
3214 VM_WARN_ON_ONCE(!IS_ALIGNED(addr, HPAGE_PMD_SIZE));
3215
3216 return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio);
3217}
3218
3219static void remap_page(struct folio *folio, unsigned long nr, int flags)
3220{
3221 int i = 0;
3222
3223
3224 if (!folio_test_anon(folio))
3225 return;
3226 for (;;) {
3227 remove_migration_ptes(folio, folio, RMP_LOCKED | flags);
3228 i += folio_nr_pages(folio);
3229 if (i >= nr)
3230 break;
3231 folio = folio_next(folio);
3232 }
3233}
3234
3235static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
3236 struct lruvec *lruvec, struct list_head *list)
3237{
3238 VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio);
3239 lockdep_assert_held(&lruvec->lru_lock);
3240
3241 if (list) {
3242
3243 VM_WARN_ON(folio_test_lru(folio));
3244 folio_get(new_folio);
3245 list_add_tail(&new_folio->lru, list);
3246 } else {
3247
3248 VM_WARN_ON(!folio_test_lru(folio));
3249 if (folio_test_unevictable(folio))
3250 new_folio->mlock_count = 0;
3251 else
3252 list_add_tail(&new_folio->lru, &folio->lru);
3253 folio_set_lru(new_folio);
3254 }
3255}
3256
3257
3258bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
3259{
3260 int extra_pins;
3261
3262
3263 if (folio_test_anon(folio))
3264 extra_pins = folio_test_swapcache(folio) ?
3265 folio_nr_pages(folio) : 0;
3266 else
3267 extra_pins = folio_nr_pages(folio);
3268 if (pextra_pins)
3269 *pextra_pins = extra_pins;
3270 return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins -
3271 caller_pins;
3272}
3273
3274
3275
3276
3277
3278static void __split_folio_to_order(struct folio *folio, int old_order,
3279 int new_order)
3280{
3281 long new_nr_pages = 1 << new_order;
3282 long nr_pages = 1 << old_order;
3283 long i;
3284
3285
3286
3287
3288
3289 for (i = new_nr_pages; i < nr_pages; i += new_nr_pages) {
3290 struct page *new_head = &folio->page + i;
3291
3292
3293
3294
3295
3296 struct folio *new_folio = (struct folio *)new_head;
3297
3298 VM_BUG_ON_PAGE(atomic_read(&new_folio->_mapcount) != -1, new_head);
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313 new_folio->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
3314 new_folio->flags |= (folio->flags &
3315 ((1L << PG_referenced) |
3316 (1L << PG_swapbacked) |
3317 (1L << PG_swapcache) |
3318 (1L << PG_mlocked) |
3319 (1L << PG_uptodate) |
3320 (1L << PG_active) |
3321 (1L << PG_workingset) |
3322 (1L << PG_locked) |
3323 (1L << PG_unevictable) |
3324#ifdef CONFIG_ARCH_USES_PG_ARCH_2
3325 (1L << PG_arch_2) |
3326#endif
3327#ifdef CONFIG_ARCH_USES_PG_ARCH_3
3328 (1L << PG_arch_3) |
3329#endif
3330 (1L << PG_dirty) |
3331 LRU_GEN_MASK | LRU_REFS_MASK));
3332
3333 new_folio->mapping = folio->mapping;
3334 new_folio->index = folio->index + i;
3335
3336
3337
3338
3339
3340 if (unlikely(new_folio->private)) {
3341 VM_WARN_ON_ONCE_PAGE(true, new_head);
3342 new_folio->private = NULL;
3343 }
3344
3345 if (folio_test_swapcache(folio))
3346 new_folio->swap.val = folio->swap.val + i;
3347
3348
3349 smp_wmb();
3350
3351
3352
3353
3354
3355
3356
3357 clear_compound_head(new_head);
3358 if (new_order) {
3359 prep_compound_page(new_head, new_order);
3360 folio_set_large_rmappable(new_folio);
3361 }
3362
3363 if (folio_test_young(folio))
3364 folio_set_young(new_folio);
3365 if (folio_test_idle(folio))
3366 folio_set_idle(new_folio);
3367#ifdef CONFIG_MEMCG
3368 new_folio->memcg_data = folio->memcg_data;
3369#endif
3370
3371 folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio));
3372 }
3373
3374 if (new_order)
3375 folio_set_order(folio, new_order);
3376 else
3377 ClearPageCompound(&folio->page);
3378}
3379
/*
 * __split_unmapped_folio() - split an already unmapped and frozen folio.
 * @folio: the folio to split, locked, unmapped and with its refcount frozen
 * @new_order: the order to split @folio down to
 * @split_at: a page within @folio; for a non-uniform split, the piece that
 *            contains it keeps being split until it reaches @new_order
 * @xas: xa_state for @folio->mapping->i_pages, locked by the caller
 *       (file-backed folios only)
 * @mapping: @folio->mapping, or NULL for anonymous folios
 * @uniform_split: if true, split straight into folios of @new_order;
 *                 if false, split one order at a time, only continuing with
 *                 the piece that contains @split_at
 *
 * Splitting an anonymous folio to order 1 is skipped (order-1 anonymous
 * folios are not supported).  Updating page cache / swap cache entries,
 * LRU state and unfreezing the refcounts of the resulting folios is left
 * to the caller.
 *
 * Returns 0 on success, or a negative error if the page cache xarray
 * could not be split.
 */
3419static int __split_unmapped_folio(struct folio *folio, int new_order,
3420 struct page *split_at, struct xa_state *xas,
3421 struct address_space *mapping, bool uniform_split)
3422{
3423 int order = folio_order(folio);
3424 int start_order = uniform_split ? new_order : order - 1;
3425 bool stop_split = false;
3426 struct folio *next;
3427 int split_order;
3428 int ret = 0;
3429
3430 if (folio_test_anon(folio))
3431 mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
3432
3433 folio_clear_has_hwpoisoned(folio);
3434
3435
3436
3437
3438
3439 for (split_order = start_order;
3440 split_order >= new_order && !stop_split;
3441 split_order--) {
3442 struct folio *end_folio = folio_next(folio);
3443 int old_order = folio_order(folio);
3444 struct folio *new_folio;
3445
3446
3447 if (folio_test_anon(folio) && split_order == 1)
3448 continue;
3449 if (uniform_split && split_order != new_order)
3450 continue;
3451
3452 if (mapping) {
3453
3454
3455
3456
3457
3458 if (uniform_split)
3459 xas_split(xas, folio, old_order);
3460 else {
3461 xas_set_order(xas, folio->index, split_order);
3462 xas_try_split(xas, folio, old_order);
3463 if (xas_error(xas)) {
3464 ret = xas_error(xas);
3465 stop_split = true;
3466 }
3467 }
3468 }
3469
3470 if (!stop_split) {
3471 folio_split_memcg_refs(folio, old_order, split_order);
3472 split_page_owner(&folio->page, old_order, split_order);
3473 pgalloc_tag_split(folio, old_order, split_order);
3474
3475 __split_folio_to_order(folio, old_order, split_order);
3476 }
3477
3478
3479
3480
3481
3482
3483
3484
3485 for (new_folio = folio; new_folio != end_folio; new_folio = next) {
3486 next = folio_next(new_folio);
3487
3488
3489
3490
3491
3492
3493
3494 if (new_folio == page_folio(split_at)) {
3495 folio = new_folio;
3496 if (split_order != new_order && !stop_split)
3497 continue;
3498 }
3499 if (folio_test_anon(new_folio))
3500 mod_mthp_stat(folio_order(new_folio),
3501 MTHP_STAT_NR_ANON, 1);
3502 }
3503 }
3504
3505 return ret;
3506}
3507
3508bool non_uniform_split_supported(struct folio *folio, unsigned int new_order,
3509 bool warns)
3510{
3511 if (folio_test_anon(folio)) {
3512
3513 VM_WARN_ONCE(warns && new_order == 1,
3514 "Cannot split to order-1 folio");
3515 return new_order != 1;
3516 } else if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
3517 !mapping_large_folio_support(folio->mapping)) {
3518
3519
3520
3521
3522
3523
3524 VM_WARN_ONCE(warns,
3525 "Cannot split file folio to non-0 order");
3526 return false;
3527 }
3528
3529
3530 if (folio_test_swapcache(folio)) {
3531 VM_WARN_ONCE(warns,
3532 "Cannot split swapcache folio to non-0 order");
3533 return false;
3534 }
3535
3536 return true;
3537}
3538
3539
3540bool uniform_split_supported(struct folio *folio, unsigned int new_order,
3541 bool warns)
3542{
3543 if (folio_test_anon(folio)) {
3544 VM_WARN_ONCE(warns && new_order == 1,
3545 "Cannot split to order-1 folio");
3546 return new_order != 1;
3547 } else if (new_order) {
3548 if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
3549 !mapping_large_folio_support(folio->mapping)) {
3550 VM_WARN_ONCE(warns,
3551 "Cannot split file folio to non-0 order");
3552 return false;
3553 }
3554 }
3555
3556 if (new_order && folio_test_swapcache(folio)) {
3557 VM_WARN_ONCE(warns,
3558 "Cannot split swapcache folio to non-0 order");
3559 return false;
3560 }
3561
3562 return true;
3563}
3564
/*
 * __folio_split() - common implementation behind the folio split interfaces.
 * @folio: the (locked, large) folio to split
 * @new_order: the order to split down to
 * @split_at: a page within @folio marking the piece that is split all the
 *            way down for non-uniform splits
 * @lock_at: a page within @folio; the resulting folio that contains it is
 *           left locked for the caller, all others are unlocked
 * @list: if non-NULL, the resulting folios are put on this list rather than
 *        back on the LRU
 * @uniform_split: uniform (one step) vs non-uniform (order by order) split
 *
 * Takes the anon_vma lock (anonymous) or i_mmap lock (file-backed), unmaps
 * the folio, freezes its refcount, performs the split and then remaps the
 * pieces.  On failure a negative error is returned and the folio is left
 * intact.
 */
3586static int __folio_split(struct folio *folio, unsigned int new_order,
3587 struct page *split_at, struct page *lock_at,
3588 struct list_head *list, bool uniform_split)
3589{
3590 struct deferred_split *ds_queue = get_deferred_split_queue(folio);
3591 XA_STATE(xas, &folio->mapping->i_pages, folio->index);
3592 struct folio *end_folio = folio_next(folio);
3593 bool is_anon = folio_test_anon(folio);
3594 struct address_space *mapping = NULL;
3595 struct anon_vma *anon_vma = NULL;
3596 int order = folio_order(folio);
3597 struct folio *new_folio, *next;
3598 int nr_shmem_dropped = 0;
3599 int remap_flags = 0;
3600 int extra_pins, ret;
3601 pgoff_t end;
3602 bool is_hzp;
3603
3604 VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
3605 VM_WARN_ON_ONCE_FOLIO(!folio_test_large(folio), folio);
3606
3607 if (folio != page_folio(split_at) || folio != page_folio(lock_at))
3608 return -EINVAL;
3609
3610 if (new_order >= folio_order(folio))
3611 return -EINVAL;
3612
3613 if (uniform_split && !uniform_split_supported(folio, new_order, true))
3614 return -EINVAL;
3615
3616 if (!uniform_split &&
3617 !non_uniform_split_supported(folio, new_order, true))
3618 return -EINVAL;
3619
3620 is_hzp = is_huge_zero_folio(folio);
3621 if (is_hzp) {
3622 pr_warn_ratelimited("Called split_huge_page for huge zero page\n");
3623 return -EBUSY;
3624 }
3625
3626 if (folio_test_writeback(folio))
3627 return -EBUSY;
3628
3629 if (is_anon) {
3630
3631
3632
3633
3634
3635
3636
3637
3638 anon_vma = folio_get_anon_vma(folio);
3639 if (!anon_vma) {
3640 ret = -EBUSY;
3641 goto out;
3642 }
3643 mapping = NULL;
3644 anon_vma_lock_write(anon_vma);
3645 } else {
3646 unsigned int min_order;
3647 gfp_t gfp;
3648
3649 mapping = folio->mapping;
3650
3651
3652
3653
3654
3655
3656
3657 if (!mapping) {
3658 ret = -EBUSY;
3659 goto out;
3660 }
3661
3662 min_order = mapping_min_folio_order(folio->mapping);
3663 if (new_order < min_order) {
3664 VM_WARN_ONCE(1, "Cannot split mapped folio below min-order: %u",
3665 min_order);
3666 ret = -EINVAL;
3667 goto out;
3668 }
3669
3670 gfp = current_gfp_context(mapping_gfp_mask(mapping) &
3671 GFP_RECLAIM_MASK);
3672
3673 if (!filemap_release_folio(folio, gfp)) {
3674 ret = -EBUSY;
3675 goto out;
3676 }
3677
3678 if (uniform_split) {
3679 xas_set_order(&xas, folio->index, new_order);
3680 xas_split_alloc(&xas, folio, folio_order(folio), gfp);
3681 if (xas_error(&xas)) {
3682 ret = xas_error(&xas);
3683 goto out;
3684 }
3685 }
3686
3687 anon_vma = NULL;
3688 i_mmap_lock_read(mapping);
3689
3690
3691
3692
3693
3694
3695
3696
3697 end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
3698 if (shmem_mapping(mapping))
3699 end = shmem_fallocend(mapping->host, end);
3700 }
3701
3702
3703
3704
3705
3706 if (!can_split_folio(folio, 1, &extra_pins)) {
3707 ret = -EAGAIN;
3708 goto out_unlock;
3709 }
3710
3711 unmap_folio(folio);
3712
3713
3714 local_irq_disable();
3715 if (mapping) {
3716
3717
3718
3719
3720 xas_lock(&xas);
3721 xas_reset(&xas);
3722 if (xas_load(&xas) != folio) {
3723 ret = -EAGAIN;
3724 goto fail;
3725 }
3726 }
3727
3728
3729 spin_lock(&ds_queue->split_queue_lock);
3730 if (folio_ref_freeze(folio, 1 + extra_pins)) {
3731 struct address_space *swap_cache = NULL;
3732 struct lruvec *lruvec;
3733 int expected_refs;
3734
3735 if (folio_order(folio) > 1 &&
3736 !list_empty(&folio->_deferred_list)) {
3737 ds_queue->split_queue_len--;
3738 if (folio_test_partially_mapped(folio)) {
3739 folio_clear_partially_mapped(folio);
3740 mod_mthp_stat(folio_order(folio),
3741 MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
3742 }
3743
3744
3745
3746
3747
3748
3749 list_del_init(&folio->_deferred_list);
3750 }
3751 spin_unlock(&ds_queue->split_queue_lock);
3752 if (mapping) {
3753 int nr = folio_nr_pages(folio);
3754
3755 if (folio_test_pmd_mappable(folio) &&
3756 new_order < HPAGE_PMD_ORDER) {
3757 if (folio_test_swapbacked(folio)) {
3758 __lruvec_stat_mod_folio(folio,
3759 NR_SHMEM_THPS, -nr);
3760 } else {
3761 __lruvec_stat_mod_folio(folio,
3762 NR_FILE_THPS, -nr);
3763 filemap_nr_thps_dec(mapping);
3764 }
3765 }
3766 }
3767
3768 if (folio_test_swapcache(folio)) {
3769 if (mapping) {
3770 VM_WARN_ON_ONCE_FOLIO(mapping, folio);
3771 ret = -EINVAL;
3772 goto fail;
3773 }
3774
3775 swap_cache = swap_address_space(folio->swap);
3776 xa_lock(&swap_cache->i_pages);
3777 }
3778
3779
3780 lruvec = folio_lruvec_lock(folio);
3781
3782 ret = __split_unmapped_folio(folio, new_order, split_at, &xas,
3783 mapping, uniform_split);
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793 for (new_folio = folio_next(folio); new_folio != end_folio;
3794 new_folio = next) {
3795 unsigned long nr_pages = folio_nr_pages(new_folio);
3796
3797 next = folio_next(new_folio);
3798
3799 expected_refs = folio_expected_ref_count(new_folio) + 1;
3800 folio_ref_unfreeze(new_folio, expected_refs);
3801
3802 lru_add_split_folio(folio, new_folio, lruvec, list);
3803
3804
3805
3806
3807
3808 if (swap_cache) {
3809 __xa_store(&swap_cache->i_pages,
3810 swap_cache_index(new_folio->swap),
3811 new_folio, 0);
3812 continue;
3813 }
3814
3815
3816 if (!mapping)
3817 continue;
3818
3819
3820 if (new_folio->index < end) {
3821 __xa_store(&mapping->i_pages, new_folio->index,
3822 new_folio, 0);
3823 continue;
3824 }
3825
3826
3827 if (shmem_mapping(mapping))
3828 nr_shmem_dropped += nr_pages;
3829 else if (folio_test_clear_dirty(new_folio))
3830 folio_account_cleaned(
3831 new_folio, inode_to_wb(mapping->host));
3832 __filemap_remove_folio(new_folio, NULL);
3833 folio_put_refs(new_folio, nr_pages);
3834 }
3835
3836
3837
3838
3839
3840
3841 expected_refs = folio_expected_ref_count(folio) + 1;
3842 folio_ref_unfreeze(folio, expected_refs);
3843
3844 unlock_page_lruvec(lruvec);
3845
3846 if (swap_cache)
3847 xa_unlock(&swap_cache->i_pages);
3848 } else {
3849 spin_unlock(&ds_queue->split_queue_lock);
3850 ret = -EAGAIN;
3851 }
3852fail:
3853 if (mapping)
3854 xas_unlock(&xas);
3855
3856 local_irq_enable();
3857
3858 if (nr_shmem_dropped)
3859 shmem_uncharge(mapping->host, nr_shmem_dropped);
3860
3861 if (!ret && is_anon)
3862 remap_flags = RMP_USE_SHARED_ZEROPAGE;
3863 remap_page(folio, 1 << order, remap_flags);
3864
3865
3866
3867
3868
3869 for (new_folio = folio; new_folio != end_folio; new_folio = next) {
3870 next = folio_next(new_folio);
3871 if (new_folio == page_folio(lock_at))
3872 continue;
3873
3874 folio_unlock(new_folio);
3875
3876
3877
3878
3879
3880
3881
3882 free_folio_and_swap_cache(new_folio);
3883 }
3884
3885out_unlock:
3886 if (anon_vma) {
3887 anon_vma_unlock_write(anon_vma);
3888 put_anon_vma(anon_vma);
3889 }
3890 if (mapping)
3891 i_mmap_unlock_read(mapping);
3892out:
3893 xas_destroy(&xas);
3894 if (order == HPAGE_PMD_ORDER)
3895 count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
3896 count_mthp_stat(order, !ret ? MTHP_STAT_SPLIT : MTHP_STAT_SPLIT_FAILED);
3897 return ret;
3898}
3899
/*
 * split_huge_page_to_list_to_order() - split a large folio uniformly into
 * folios of @new_order.
 * @page: any page of the large folio to split
 * @list: if non-NULL, the resulting folios are put on this list rather than
 *        back on the LRU
 * @new_order: the order of the folios after the split
 *
 * The caller must hold the folio lock.  The folio must not carry extra
 * references beyond those accounted for by can_split_folio() (for example
 * GUP pins), otherwise the split fails.
 *
 * Returns 0 on success; the caller is then left holding the locked
 * @new_order folio that contains @page.  Returns a negative error if the
 * folio could not be split, in which case it is left intact.
 */
3947int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
3948 unsigned int new_order)
3949{
3950 struct folio *folio = page_folio(page);
3951
3952 return __folio_split(folio, new_order, &folio->page, page, list, true);
3953}
3954
/*
 * folio_split() - split a large folio non-uniformly.
 * @folio: the folio to split, must be locked
 * @new_order: the target order for the piece containing @split_at
 * @split_at: a page within @folio; after the split, the folio containing it
 *            has order @new_order, while the rest of the original range is
 *            kept in the largest possible chunks
 * @list: same meaning as for split_huge_page_to_list_to_order()
 *
 * Returns 0 on success, a negative error otherwise.
 */
3977int folio_split(struct folio *folio, unsigned int new_order,
3978 struct page *split_at, struct list_head *list)
3979{
3980 return __folio_split(folio, new_order, split_at, &folio->page, list,
3981 false);
3982}
3983
3984int min_order_for_split(struct folio *folio)
3985{
3986 if (folio_test_anon(folio))
3987 return 0;
3988
3989 if (!folio->mapping) {
3990 if (folio_test_pmd_mappable(folio))
3991 count_vm_event(THP_SPLIT_PAGE_FAILED);
3992 return -EBUSY;
3993 }
3994
3995 return mapping_min_folio_order(folio->mapping);
3996}
3997
3998int split_folio_to_list(struct folio *folio, struct list_head *list)
3999{
4000 int ret = min_order_for_split(folio);
4001
4002 if (ret < 0)
4003 return ret;
4004
4005 return split_huge_page_to_list_to_order(&folio->page, list, ret);
4006}
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
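/*
 * Undo any deferred-split queueing of a large folio whose last reference is
 * being dropped.  Returns true if the folio was still queued and has now
 * been removed from its deferred split queue.
 */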
4021bool __folio_unqueue_deferred_split(struct folio *folio)
4022{
4023 struct deferred_split *ds_queue;
4024 unsigned long flags;
4025 bool unqueued = false;
4026
4027 WARN_ON_ONCE(folio_ref_count(folio));
4028 WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg(folio));
4029
4030 ds_queue = get_deferred_split_queue(folio);
4031 spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
4032 if (!list_empty(&folio->_deferred_list)) {
4033 ds_queue->split_queue_len--;
4034 if (folio_test_partially_mapped(folio)) {
4035 folio_clear_partially_mapped(folio);
4036 mod_mthp_stat(folio_order(folio),
4037 MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
4038 }
4039 list_del_init(&folio->_deferred_list);
4040 unqueued = true;
4041 }
4042 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
4043
4044 return unqueued;
4045}
4046
4047
4048void deferred_split_folio(struct folio *folio, bool partially_mapped)
4049{
4050 struct deferred_split *ds_queue = get_deferred_split_queue(folio);
4051#ifdef CONFIG_MEMCG
4052 struct mem_cgroup *memcg = folio_memcg(folio);
4053#endif
4054 unsigned long flags;
4055
4056
4057
4058
4059
4060 if (folio_order(folio) <= 1)
4061 return;
4062
4063 if (!partially_mapped && !split_underused_thp)
4064 return;
4065
4066
4067
4068
4069
4070
4071
4072
4073 if (folio_test_swapcache(folio))
4074 return;
4075
4076 spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
4077 if (partially_mapped) {
4078 if (!folio_test_partially_mapped(folio)) {
4079 folio_set_partially_mapped(folio);
4080 if (folio_test_pmd_mappable(folio))
4081 count_vm_event(THP_DEFERRED_SPLIT_PAGE);
4082 count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
4083 mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, 1);
4084
4085 }
4086 } else {
4087
4088 VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
4089 }
4090 if (list_empty(&folio->_deferred_list)) {
4091 list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
4092 ds_queue->split_queue_len++;
4093#ifdef CONFIG_MEMCG
4094 if (memcg)
4095 set_shrinker_bit(memcg, folio_nid(folio),
4096 deferred_split_shrinker->id);
4097#endif
4098 }
4099 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
4100}
4101
4102static unsigned long deferred_split_count(struct shrinker *shrink,
4103 struct shrink_control *sc)
4104{
4105 struct pglist_data *pgdata = NODE_DATA(sc->nid);
4106 struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
4107
4108#ifdef CONFIG_MEMCG
4109 if (sc->memcg)
4110 ds_queue = &sc->memcg->deferred_split_queue;
4111#endif
4112 return READ_ONCE(ds_queue->split_queue_len);
4113}
4114
4115static bool thp_underused(struct folio *folio)
4116{
4117 int num_zero_pages = 0, num_filled_pages = 0;
4118 void *kaddr;
4119 int i;
4120
4121 if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
4122 return false;
4123
4124 for (i = 0; i < folio_nr_pages(folio); i++) {
4125 kaddr = kmap_local_folio(folio, i * PAGE_SIZE);
4126 if (!memchr_inv(kaddr, 0, PAGE_SIZE)) {
4127 num_zero_pages++;
4128 if (num_zero_pages > khugepaged_max_ptes_none) {
4129 kunmap_local(kaddr);
4130 return true;
4131 }
4132 } else {
4133
4134
4135
4136
4137 num_filled_pages++;
4138 if (num_filled_pages >= HPAGE_PMD_NR - khugepaged_max_ptes_none) {
4139 kunmap_local(kaddr);
4140 return false;
4141 }
4142 }
4143 kunmap_local(kaddr);
4144 }
4145 return false;
4146}
4147
4148static unsigned long deferred_split_scan(struct shrinker *shrink,
4149 struct shrink_control *sc)
4150{
4151 struct pglist_data *pgdata = NODE_DATA(sc->nid);
4152 struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
4153 unsigned long flags;
4154 LIST_HEAD(list);
4155 struct folio *folio, *next, *prev = NULL;
4156 int split = 0, removed = 0;
4157
4158#ifdef CONFIG_MEMCG
4159 if (sc->memcg)
4160 ds_queue = &sc->memcg->deferred_split_queue;
4161#endif
4162
4163 spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
4164
4165 list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
4166 _deferred_list) {
4167 if (folio_try_get(folio)) {
4168 list_move(&folio->_deferred_list, &list);
4169 } else {
4170
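			/*
			 * We lost the race with folio_put(): the folio is
			 * already being freed, so just drop it from the queue.
			 */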
4171 if (folio_test_partially_mapped(folio)) {
4172 folio_clear_partially_mapped(folio);
4173 mod_mthp_stat(folio_order(folio),
4174 MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
4175 }
4176 list_del_init(&folio->_deferred_list);
4177 ds_queue->split_queue_len--;
4178 }
4179 if (!--sc->nr_to_scan)
4180 break;
4181 }
4182 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
4183
4184 list_for_each_entry_safe(folio, next, &list, _deferred_list) {
4185 bool did_split = false;
4186 bool underused = false;
4187
4188 if (!folio_test_partially_mapped(folio)) {
4189 underused = thp_underused(folio);
4190 if (!underused)
4191 goto next;
4192 }
4193 if (!folio_trylock(folio))
4194 goto next;
4195 if (!split_folio(folio)) {
4196 did_split = true;
4197 if (underused)
4198 count_vm_event(THP_UNDERUSED_SPLIT_PAGE);
4199 split++;
4200 }
4201 folio_unlock(folio);
4202next:
4203
4204
4205
4206
4207
4208
4209
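		/*
		 * split_folio() already removed the folio from the queue on
		 * success.  Otherwise only keep it queued if it is still
		 * partially mapped; folios picked up as "underused" but not
		 * split are considered used and dropped from the queue.
		 */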
4210 if (did_split) {
4211 ;
4212 } else if (!folio_test_partially_mapped(folio)) {
4213 list_del_init(&folio->_deferred_list);
4214 removed++;
4215 } else {
4216
4217
4218
4219
4220
4221
4222 swap(folio, prev);
4223 }
4224 if (folio)
4225 folio_put(folio);
4226 }
4227
4228 spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
4229 list_splice_tail(&list, &ds_queue->split_queue);
4230 ds_queue->split_queue_len -= removed;
4231 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
4232
4233 if (prev)
4234 folio_put(prev);
4235
4236
4237
4238
4239
4240 if (!split && list_empty(&ds_queue->split_queue))
4241 return SHRINK_STOP;
4242 return split;
4243}
4244
4245#ifdef CONFIG_DEBUG_FS
4246static void split_huge_pages_all(void)
4247{
4248 struct zone *zone;
4249 struct page *page;
4250 struct folio *folio;
4251 unsigned long pfn, max_zone_pfn;
4252 unsigned long total = 0, split = 0;
4253
4254 pr_debug("Split all THPs\n");
4255 for_each_zone(zone) {
4256 if (!managed_zone(zone))
4257 continue;
4258 max_zone_pfn = zone_end_pfn(zone);
4259 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
4260 int nr_pages;
4261
4262 page = pfn_to_online_page(pfn);
4263 if (!page || PageTail(page))
4264 continue;
4265 folio = page_folio(page);
4266 if (!folio_try_get(folio))
4267 continue;
4268
4269 if (unlikely(page_folio(page) != folio))
4270 goto next;
4271
4272 if (zone != folio_zone(folio))
4273 goto next;
4274
4275 if (!folio_test_large(folio)
4276 || folio_test_hugetlb(folio)
4277 || !folio_test_lru(folio))
4278 goto next;
4279
4280 total++;
4281 folio_lock(folio);
4282 nr_pages = folio_nr_pages(folio);
4283 if (!split_folio(folio))
4284 split++;
4285 pfn += nr_pages - 1;
4286 folio_unlock(folio);
4287next:
4288 folio_put(folio);
4289 cond_resched();
4290 }
4291 }
4292
4293 pr_debug("%lu of %lu THP split\n", split, total);
4294}
4295
4296static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma)
4297{
4298 return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) ||
4299 is_vm_hugetlb_page(vma);
4300}
4301
4302static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
4303 unsigned long vaddr_end, unsigned int new_order,
4304 long in_folio_offset)
4305{
4306 int ret = 0;
4307 struct task_struct *task;
4308 struct mm_struct *mm;
4309 unsigned long total = 0, split = 0;
4310 unsigned long addr;
4311
4312 vaddr_start &= PAGE_MASK;
4313 vaddr_end &= PAGE_MASK;
4314
4315 task = find_get_task_by_vpid(pid);
4316 if (!task) {
4317 ret = -ESRCH;
4318 goto out;
4319 }
4320
4321
4322 mm = get_task_mm(task);
4323 put_task_struct(task);
4324
4325 if (!mm) {
4326 ret = -EINVAL;
4327 goto out;
4328 }
4329
4330 pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
4331 pid, vaddr_start, vaddr_end);
4332
4333 mmap_read_lock(mm);
4334
4335
4336
4337
4338 for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
4339 struct vm_area_struct *vma = vma_lookup(mm, addr);
4340 struct folio_walk fw;
4341 struct folio *folio;
4342 struct address_space *mapping;
4343 unsigned int target_order = new_order;
4344
4345 if (!vma)
4346 break;
4347
4348
4349 if (vma_not_suitable_for_thp_split(vma)) {
4350 addr = vma->vm_end;
4351 continue;
4352 }
4353
4354 folio = folio_walk_start(&fw, vma, addr, 0);
4355 if (!folio)
4356 continue;
4357
4358 if (!is_transparent_hugepage(folio))
4359 goto next;
4360
4361 if (!folio_test_anon(folio)) {
4362 mapping = folio->mapping;
4363 target_order = max(new_order,
4364 mapping_min_folio_order(mapping));
4365 }
4366
4367 if (target_order >= folio_order(folio))
4368 goto next;
4369
4370 total++;
4371
4372
4373
4374
4375
4376 if (!folio_test_private(folio) &&
4377 !can_split_folio(folio, 0, NULL))
4378 goto next;
4379
4380 if (!folio_trylock(folio))
4381 goto next;
4382 folio_get(folio);
4383 folio_walk_end(&fw, vma);
4384
4385 if (!folio_test_anon(folio) && folio->mapping != mapping)
4386 goto unlock;
4387
4388 if (in_folio_offset < 0 ||
4389 in_folio_offset >= folio_nr_pages(folio)) {
4390 if (!split_folio_to_order(folio, target_order))
4391 split++;
4392 } else {
4393 struct page *split_at = folio_page(folio,
4394 in_folio_offset);
4395 if (!folio_split(folio, target_order, split_at, NULL))
4396 split++;
4397 }
4398
4399unlock:
4400
4401 folio_unlock(folio);
4402 folio_put(folio);
4403
4404 cond_resched();
4405 continue;
4406next:
4407 folio_walk_end(&fw, vma);
4408 cond_resched();
4409 }
4410 mmap_read_unlock(mm);
4411 mmput(mm);
4412
4413 pr_debug("%lu of %lu THP split\n", split, total);
4414
4415out:
4416 return ret;
4417}
4418
4419static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
4420 pgoff_t off_end, unsigned int new_order,
4421 long in_folio_offset)
4422{
4423 struct filename *file;
4424 struct file *candidate;
4425 struct address_space *mapping;
4426 int ret = -EINVAL;
4427 pgoff_t index;
4428 int nr_pages = 1;
4429 unsigned long total = 0, split = 0;
4430 unsigned int min_order;
4431 unsigned int target_order;
4432
4433 file = getname_kernel(file_path);
4434 if (IS_ERR(file))
4435 return ret;
4436
4437 candidate = file_open_name(file, O_RDONLY, 0);
4438 if (IS_ERR(candidate))
4439 goto out;
4440
4441 pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n",
4442 file_path, off_start, off_end);
4443
4444 mapping = candidate->f_mapping;
4445 min_order = mapping_min_folio_order(mapping);
4446 target_order = max(new_order, min_order);
4447
4448 for (index = off_start; index < off_end; index += nr_pages) {
4449 struct folio *folio = filemap_get_folio(mapping, index);
4450
4451 nr_pages = 1;
4452 if (IS_ERR(folio))
4453 continue;
4454
4455 if (!folio_test_large(folio))
4456 goto next;
4457
4458 total++;
4459 nr_pages = folio_nr_pages(folio);
4460
4461 if (target_order >= folio_order(folio))
4462 goto next;
4463
4464 if (!folio_trylock(folio))
4465 goto next;
4466
4467 if (folio->mapping != mapping)
4468 goto unlock;
4469
4470 if (in_folio_offset < 0 || in_folio_offset >= nr_pages) {
4471 if (!split_folio_to_order(folio, target_order))
4472 split++;
4473 } else {
4474 struct page *split_at = folio_page(folio,
4475 in_folio_offset);
4476 if (!folio_split(folio, target_order, split_at, NULL))
4477 split++;
4478 }
4479
4480unlock:
4481 folio_unlock(folio);
4482next:
4483 folio_put(folio);
4484 cond_resched();
4485 }
4486
4487 filp_close(candidate, NULL);
4488 ret = 0;
4489
4490 pr_debug("%lu of %lu file-backed THP split\n", split, total);
4491out:
4492 putname(file);
4493 return ret;
4494}
4495
4496#define MAX_INPUT_BUF_SZ 255
4497
4498static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
4499 size_t count, loff_t *ppops)
4500{
4501 static DEFINE_MUTEX(split_debug_mutex);
4502 ssize_t ret;
4503
4504
4505
4506
4507 char input_buf[MAX_INPUT_BUF_SZ];
4508 int pid;
4509 unsigned long vaddr_start, vaddr_end;
4510 unsigned int new_order = 0;
4511 long in_folio_offset = -1;
4512
4513 ret = mutex_lock_interruptible(&split_debug_mutex);
4514 if (ret)
4515 return ret;
4516
4517 ret = -EFAULT;
4518
4519 memset(input_buf, 0, MAX_INPUT_BUF_SZ);
4520 if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ)))
4521 goto out;
4522
4523 input_buf[MAX_INPUT_BUF_SZ - 1] = '\0';
4524
4525 if (input_buf[0] == '/') {
4526 char *tok;
4527 char *tok_buf = input_buf;
4528 char file_path[MAX_INPUT_BUF_SZ];
4529 pgoff_t off_start = 0, off_end = 0;
4530 size_t input_len = strlen(input_buf);
4531
4532 tok = strsep(&tok_buf, ",");
4533 if (tok && tok_buf) {
4534 strscpy(file_path, tok);
4535 } else {
4536 ret = -EINVAL;
4537 goto out;
4538 }
4539
4540 ret = sscanf(tok_buf, "0x%lx,0x%lx,%d,%ld", &off_start, &off_end,
4541 &new_order, &in_folio_offset);
4542 if (ret != 2 && ret != 3 && ret != 4) {
4543 ret = -EINVAL;
4544 goto out;
4545 }
4546 ret = split_huge_pages_in_file(file_path, off_start, off_end,
4547 new_order, in_folio_offset);
4548 if (!ret)
4549 ret = input_len;
4550
4551 goto out;
4552 }
4553
4554 ret = sscanf(input_buf, "%d,0x%lx,0x%lx,%d,%ld", &pid, &vaddr_start,
4555 &vaddr_end, &new_order, &in_folio_offset);
4556 if (ret == 1 && pid == 1) {
4557 split_huge_pages_all();
4558 ret = strlen(input_buf);
4559 goto out;
4560 } else if (ret != 3 && ret != 4 && ret != 5) {
4561 ret = -EINVAL;
4562 goto out;
4563 }
4564
4565 ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end, new_order,
4566 in_folio_offset);
4567 if (!ret)
4568 ret = strlen(input_buf);
4569out:
4570 mutex_unlock(&split_debug_mutex);
4571 return ret;
4573}
4574
4575static const struct file_operations split_huge_pages_fops = {
4576 .owner = THIS_MODULE,
4577 .write = split_huge_pages_write,
4578};
4579
4580static int __init split_huge_pages_debugfs(void)
4581{
4582 debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
4583 &split_huge_pages_fops);
4584 return 0;
4585}
4586late_initcall(split_huge_pages_debugfs);
4587#endif
4588
4589#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
4590int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
4591 struct page *page)
4592{
4593 struct folio *folio = page_folio(page);
4594 struct vm_area_struct *vma = pvmw->vma;
4595 struct mm_struct *mm = vma->vm_mm;
4596 unsigned long address = pvmw->address;
4597 bool anon_exclusive;
4598 pmd_t pmdval;
4599 swp_entry_t entry;
4600 pmd_t pmdswp;
4601
4602 if (!(pvmw->pmd && !pvmw->pte))
4603 return 0;
4604
4605 flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
4606 pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
4607
4608
4609 anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(page);
4610 if (anon_exclusive && folio_try_share_anon_rmap_pmd(folio, page)) {
4611 set_pmd_at(mm, address, pvmw->pmd, pmdval);
4612 return -EBUSY;
4613 }
4614
4615 if (pmd_dirty(pmdval))
4616 folio_mark_dirty(folio);
4617 if (pmd_write(pmdval))
4618 entry = make_writable_migration_entry(page_to_pfn(page));
4619 else if (anon_exclusive)
4620 entry = make_readable_exclusive_migration_entry(page_to_pfn(page));
4621 else
4622 entry = make_readable_migration_entry(page_to_pfn(page));
4623 if (pmd_young(pmdval))
4624 entry = make_migration_entry_young(entry);
4625 if (pmd_dirty(pmdval))
4626 entry = make_migration_entry_dirty(entry);
4627 pmdswp = swp_entry_to_pmd(entry);
4628 if (pmd_soft_dirty(pmdval))
4629 pmdswp = pmd_swp_mksoft_dirty(pmdswp);
4630 if (pmd_uffd_wp(pmdval))
4631 pmdswp = pmd_swp_mkuffd_wp(pmdswp);
4632 set_pmd_at(mm, address, pvmw->pmd, pmdswp);
4633 folio_remove_rmap_pmd(folio, page, vma);
4634 folio_put(folio);
4635 trace_set_migration_pmd(address, pmd_val(pmdswp));
4636
4637 return 0;
4638}
4639
4640void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
4641{
4642 struct folio *folio = page_folio(new);
4643 struct vm_area_struct *vma = pvmw->vma;
4644 struct mm_struct *mm = vma->vm_mm;
4645 unsigned long address = pvmw->address;
4646 unsigned long haddr = address & HPAGE_PMD_MASK;
4647 pmd_t pmde;
4648 swp_entry_t entry;
4649
4650 if (!(pvmw->pmd && !pvmw->pte))
4651 return;
4652
4653 entry = pmd_to_swp_entry(*pvmw->pmd);
4654 folio_get(folio);
4655 pmde = folio_mk_pmd(folio, READ_ONCE(vma->vm_page_prot));
4656 if (pmd_swp_soft_dirty(*pvmw->pmd))
4657 pmde = pmd_mksoft_dirty(pmde);
4658 if (is_writable_migration_entry(entry))
4659 pmde = pmd_mkwrite(pmde, vma);
4660 if (pmd_swp_uffd_wp(*pvmw->pmd))
4661 pmde = pmd_mkuffd_wp(pmde);
4662 if (!is_migration_entry_young(entry))
4663 pmde = pmd_mkold(pmde);
4664
4665 if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
4666 pmde = pmd_mkdirty(pmde);
4667
4668 if (folio_test_anon(folio)) {
4669 rmap_t rmap_flags = RMAP_NONE;
4670
4671 if (!is_readable_migration_entry(entry))
4672 rmap_flags |= RMAP_EXCLUSIVE;
4673
4674 folio_add_anon_rmap_pmd(folio, new, vma, haddr, rmap_flags);
4675 } else {
4676 folio_add_file_rmap_pmd(folio, new, vma);
4677 }
4678 VM_BUG_ON(pmd_write(pmde) && folio_test_anon(folio) && !PageAnonExclusive(new));
4679 set_pmd_at(mm, haddr, pvmw->pmd, pmde);
4680
4681
4682 update_mmu_cache_pmd(vma, address, pvmw->pmd);
4683 trace_remove_migration_pmd(address, pmd_val(pmde));
4684}
4685#endif
4686