1
2
3
4
5
6
7
8
9#include <linux/atomic.h>
10#include <linux/blkdev.h>
11#include <linux/buffer_head.h>
12#include <linux/dax.h>
13#include <linux/fs.h>
14#include <linux/highmem.h>
15#include <linux/memcontrol.h>
16#include <linux/mm.h>
17#include <linux/mutex.h>
18#include <linux/pagevec.h>
19#include <linux/sched.h>
20#include <linux/sched/signal.h>
21#include <linux/uio.h>
22#include <linux/vmstat.h>
23#include <linux/pfn_t.h>
24#include <linux/sizes.h>
25#include <linux/mmu_notifier.h>
26#include <linux/iomap.h>
27#include <asm/pgalloc.h>
28
29#define CREATE_TRACE_POINTS
30#include <trace/events/fs_dax.h>
31
32static inline unsigned int pe_order(enum page_entry_size pe_size)
33{
34 if (pe_size == PE_SIZE_PTE)
35 return PAGE_SHIFT - PAGE_SHIFT;
36 if (pe_size == PE_SIZE_PMD)
37 return PMD_SHIFT - PAGE_SHIFT;
38 if (pe_size == PE_SIZE_PUD)
39 return PUD_SHIFT - PAGE_SHIFT;
40 return ~0;
41}
42
43
/* Number of hash bits used to pick a slot in wait_table[]. */
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)

/* Mask of the page-index bits that fall inside one PMD-sized region. */
#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
/* Number of base pages spanned by one PMD-sized region. */
#define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT)

/* Page order of a PMD-sized entry. */
#define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT)

/*
 * Hashed wait queue heads; dax_entry_waitqueue() picks the slot for a
 * given (xarray, index) pair.
 */
static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
55
56static int __init init_dax_wait_table(void)
57{
58 int i;
59
60 for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
61 init_waitqueue_head(wait_table + i);
62 return 0;
63}
64fs_initcall(init_dax_wait_table);
65
66
67
68
69
70
71
72
73
74
75
/*
 * Layout of the value stored for a DAX entry: the low DAX_SHIFT bits hold
 * the flags below, the remaining bits hold the PFN (see dax_make_entry()
 * and dax_to_pfn()).
 */
#define DAX_SHIFT (4)
#define DAX_LOCKED (1UL << 0)	/* entry is locked; see dax_lock_entry() */
#define DAX_PMD (1UL << 1)	/* entry has PMD_ORDER rather than order 0 */
#define DAX_ZERO_PAGE (1UL << 2)	/* entry refers to a zero page */
#define DAX_EMPTY (1UL << 3)	/* placeholder entry created with PFN 0 */
81
82static unsigned long dax_to_pfn(void *entry)
83{
84 return xa_to_value(entry) >> DAX_SHIFT;
85}
86
87static void *dax_make_entry(pfn_t pfn, unsigned long flags)
88{
89 return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
90}
91
92static bool dax_is_locked(void *entry)
93{
94 return xa_to_value(entry) & DAX_LOCKED;
95}
96
97static unsigned int dax_entry_order(void *entry)
98{
99 if (xa_to_value(entry) & DAX_PMD)
100 return PMD_ORDER;
101 return 0;
102}
103
104static unsigned long dax_is_pmd_entry(void *entry)
105{
106 return xa_to_value(entry) & DAX_PMD;
107}
108
109static bool dax_is_pte_entry(void *entry)
110{
111 return !(xa_to_value(entry) & DAX_PMD);
112}
113
114static int dax_is_zero_entry(void *entry)
115{
116 return xa_to_value(entry) & DAX_ZERO_PAGE;
117}
118
119static int dax_is_empty_entry(void *entry)
120{
121 return xa_to_value(entry) & DAX_EMPTY;
122}
123
124
125
126
127
128static bool dax_is_conflict(void *entry)
129{
130 return entry == XA_RETRY_ENTRY;
131}
132
133
134
135
/*
 * Identifies the entry a waiter sleeps on: the xarray it lives in plus its
 * starting index (rounded down to the PMD boundary for PMD entries, see
 * dax_entry_waitqueue()).
 */
struct exceptional_entry_key {
	struct xarray *xa;
	pgoff_t entry_start;
};
140
/*
 * Wait queue entry extended with the key of the entry being waited for, so
 * that wake_exceptional_entry_func() can filter wake-ups by key.
 */
struct wait_exceptional_entry_queue {
	wait_queue_entry_t wait;
	struct exceptional_entry_key key;
};
145
146
147
148
149
150
/*
 * Wake-up policy passed to dax_wake_entry(): WAKE_ALL is turned into an
 * nr_exclusive argument of 0 for __wake_up(), WAKE_NEXT into 1.
 */
enum dax_wake_mode {
	WAKE_ALL,
	WAKE_NEXT,
};
155
156static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas,
157 void *entry, struct exceptional_entry_key *key)
158{
159 unsigned long hash;
160 unsigned long index = xas->xa_index;
161
162
163
164
165
166
167 if (dax_is_pmd_entry(entry))
168 index &= ~PG_PMD_COLOUR;
169 key->xa = xas->xa;
170 key->entry_start = index;
171
172 hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS);
173 return wait_table + hash;
174}
175
176static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
177 unsigned int mode, int sync, void *keyp)
178{
179 struct exceptional_entry_key *key = keyp;
180 struct wait_exceptional_entry_queue *ewait =
181 container_of(wait, struct wait_exceptional_entry_queue, wait);
182
183 if (key->xa != ewait->key.xa ||
184 key->entry_start != ewait->key.entry_start)
185 return 0;
186 return autoremove_wake_function(wait, mode, sync, NULL);
187}
188
189
190
191
192
193
194static void dax_wake_entry(struct xa_state *xas, void *entry,
195 enum dax_wake_mode mode)
196{
197 struct exceptional_entry_key key;
198 wait_queue_head_t *wq;
199
200 wq = dax_entry_waitqueue(xas, entry, &key);
201
202
203
204
205
206
207
208 if (waitqueue_active(wq))
209 __wake_up(wq, TASK_NORMAL, mode == WAKE_ALL ? 0 : 1, &key);
210}
211
212
213
214
215
216
217
218
219
220
221
/*
 * Look up the entry at @xas and wait until it is not locked.
 *
 * The loop drops the xa lock before schedule() and re-takes it afterwards,
 * so the function expects to be entered with the xa lock held (irqs
 * disabled) and returns with it held.
 *
 * Returns:
 *  - NULL when xas_find_conflict() finds nothing,
 *  - the entry itself, after a one-time warning, if it is not a value entry,
 *  - XA_RETRY_ENTRY if the entry found has an order smaller than @order,
 *  - otherwise the entry once its DAX_LOCKED bit is clear.
 */
static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
{
	void *entry;
	struct wait_exceptional_entry_queue ewait;
	wait_queue_head_t *wq;

	init_wait(&ewait.wait);
	/* Only wake-ups whose key matches ewait.key should wake us. */
	ewait.wait.func = wake_exceptional_entry_func;

	for (;;) {
		entry = xas_find_conflict(xas);
		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
			return entry;
		if (dax_entry_order(entry) < order)
			return XA_RETRY_ENTRY;
		if (!dax_is_locked(entry))
			return entry;

		/*
		 * Entry is locked: queue ourselves as an exclusive waiter
		 * before dropping the xa lock, then sleep and retry the
		 * lookup from scratch.
		 */
		wq = dax_entry_waitqueue(xas, entry, &ewait.key);
		prepare_to_wait_exclusive(wq, &ewait.wait,
					  TASK_UNINTERRUPTIBLE);
		xas_unlock_irq(xas);
		xas_reset(xas);
		schedule();
		finish_wait(wq, &ewait.wait);
		xas_lock_irq(xas);
	}
}
250
251
252
253
254
255
/*
 * Sleep once on the wait queue belonging to @entry.
 *
 * The xa lock is released before sleeping and is NOT re-taken, so the
 * caller must hold it on entry and gets control back without it.  The entry
 * is not looked up again here; callers (see dax_lock_page()) retry
 * themselves.
 */
static void wait_entry_unlocked(struct xa_state *xas, void *entry)
{
	struct wait_exceptional_entry_queue ewait;
	wait_queue_head_t *wq;

	init_wait(&ewait.wait);
	ewait.wait.func = wake_exceptional_entry_func;

	wq = dax_entry_waitqueue(xas, entry, &ewait.key);
	/*
	 * Non-exclusive wait, unlike get_unlocked_entry().
	 * NOTE(review): presumably because this path may never take the
	 * entry lock itself and so could not pass an exclusive wake-up on -
	 * confirm.
	 */
	prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE);
	xas_unlock_irq(xas);
	schedule();
	finish_wait(wq, &ewait.wait);
}
276
277static void put_unlocked_entry(struct xa_state *xas, void *entry,
278 enum dax_wake_mode mode)
279{
280 if (entry && !dax_is_conflict(entry))
281 dax_wake_entry(xas, entry, mode);
282}
283
284
285
286
287
288
289static void dax_unlock_entry(struct xa_state *xas, void *entry)
290{
291 void *old;
292
293 BUG_ON(dax_is_locked(entry));
294 xas_reset(xas);
295 xas_lock_irq(xas);
296 old = xas_store(xas, entry);
297 xas_unlock_irq(xas);
298 BUG_ON(!dax_is_locked(old));
299 dax_wake_entry(xas, entry, WAKE_NEXT);
300}
301
302
303
304
305static void *dax_lock_entry(struct xa_state *xas, void *entry)
306{
307 unsigned long v = xa_to_value(entry);
308 return xas_store(xas, xa_mk_value(v | DAX_LOCKED));
309}
310
311static unsigned long dax_entry_size(void *entry)
312{
313 if (dax_is_zero_entry(entry))
314 return 0;
315 else if (dax_is_empty_entry(entry))
316 return 0;
317 else if (dax_is_pmd_entry(entry))
318 return PMD_SIZE;
319 else
320 return PAGE_SIZE;
321}
322
323static unsigned long dax_end_pfn(void *entry)
324{
325 return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
326}
327
328
329
330
331
/*
 * Iterate @pfn over every PFN covered by @entry.  The body never runs for
 * zero-page or empty entries, because dax_entry_size() is 0 for them.
 * Note that @entry is evaluated on every iteration.
 */
#define for_each_mapped_pfn(entry, pfn) \
	for (pfn = dax_to_pfn(entry); \
			pfn < dax_end_pfn(entry); pfn++)
335
336
337
338
339
340
341static void dax_associate_entry(void *entry, struct address_space *mapping,
342 struct vm_area_struct *vma, unsigned long address)
343{
344 unsigned long size = dax_entry_size(entry), pfn, index;
345 int i = 0;
346
347 if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
348 return;
349
350 index = linear_page_index(vma, address & ~(size - 1));
351 for_each_mapped_pfn(entry, pfn) {
352 struct page *page = pfn_to_page(pfn);
353
354 WARN_ON_ONCE(page->mapping);
355 page->mapping = mapping;
356 page->index = index + i++;
357 }
358}
359
360static void dax_disassociate_entry(void *entry, struct address_space *mapping,
361 bool trunc)
362{
363 unsigned long pfn;
364
365 if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
366 return;
367
368 for_each_mapped_pfn(entry, pfn) {
369 struct page *page = pfn_to_page(pfn);
370
371 WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
372 WARN_ON_ONCE(page->mapping && page->mapping != mapping);
373 page->mapping = NULL;
374 page->index = 0;
375 }
376}
377
378static struct page *dax_busy_page(void *entry)
379{
380 unsigned long pfn;
381
382 for_each_mapped_pfn(entry, pfn) {
383 struct page *page = pfn_to_page(pfn);
384
385 if (page_ref_count(page) > 1)
386 return page;
387 }
388 return NULL;
389}
390
391
392
393
394
395
396
397
398
/*
 * dax_lock_page - lock the DAX entry corresponding to @page
 * @page: the page whose entry should be locked
 *
 * Return: a cookie to pass to dax_unlock_page().  The cookie is 0 when
 * @page has no mapping or its mapping is not a DAX mapping; it is ~0UL
 * (and no entry is locked) when the mapping's host inode is a character
 * device; otherwise it is the entry value that was found and locked.
 */
dax_entry_t dax_lock_page(struct page *page)
{
	XA_STATE(xas, NULL, 0);
	void *entry;

	/*
	 * NOTE(review): the RCU read section presumably keeps the mapping's
	 * inode from being freed while we look at it - confirm.
	 */
	rcu_read_lock();
	for (;;) {
		struct address_space *mapping = READ_ONCE(page->mapping);

		entry = NULL;
		if (!mapping || !dax_mapping(mapping))
			break;

		/*
		 * Character-device hosts get a non-zero dummy cookie and no
		 * entry lock; dax_unlock_page() returns early for them too.
		 */
		entry = (void *)~0UL;
		if (S_ISCHR(mapping->host->i_mode))
			break;

		xas.xa = &mapping->i_pages;
		xas_lock_irq(&xas);
		/* page->mapping may have changed before we got the lock. */
		if (mapping != page->mapping) {
			xas_unlock_irq(&xas);
			continue;
		}
		xas_set(&xas, page->index);
		entry = xas_load(&xas);
		if (dax_is_locked(entry)) {
			/*
			 * wait_entry_unlocked() drops the xa lock and sleeps;
			 * leave the RCU section around the sleep and start
			 * over afterwards.
			 */
			rcu_read_unlock();
			wait_entry_unlocked(&xas, entry);
			rcu_read_lock();
			continue;
		}
		dax_lock_entry(&xas, entry);
		xas_unlock_irq(&xas);
		break;
	}
	rcu_read_unlock();
	return (dax_entry_t)entry;
}
445
446void dax_unlock_page(struct page *page, dax_entry_t cookie)
447{
448 struct address_space *mapping = page->mapping;
449 XA_STATE(xas, &mapping->i_pages, page->index);
450
451 if (S_ISCHR(mapping->host->i_mode))
452 return;
453
454 dax_unlock_entry(&xas, (void *)cookie);
455}
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
/*
 * Find or create the entry at @xas in @mapping and store it with the lock
 * bit set.
 *
 * @order is 0 for a PTE-sized entry; a non-zero order requests an entry
 * flagged DAX_PMD.
 *
 * Behaviour visible in this function:
 *  - an existing entry of smaller order than @order makes the function
 *    return VM_FAULT_FALLBACK;
 *  - for @order == 0, an existing PMD entry that is a zero-page or empty
 *    entry is removed and replaced by a new empty PTE entry;
 *  - when no entry exists, a new DAX_EMPTY entry is inserted.
 *
 * The xa lock is taken and released internally.
 *
 * Return: the entry, whose returned value does not carry DAX_LOCKED even
 * though the stored copy does, or an xa_mk_internal() encoding of
 * VM_FAULT_OOM, VM_FAULT_SIGBUS or VM_FAULT_FALLBACK on failure.
 */
static void *grab_mapping_entry(struct xa_state *xas,
		struct address_space *mapping, unsigned int order)
{
	unsigned long index = xas->xa_index;
	bool pmd_downgrade;	/* splitting a PMD entry into PTE entries? */
	void *entry;

retry:
	pmd_downgrade = false;
	xas_lock_irq(xas);
	entry = get_unlocked_entry(xas, order);

	if (entry) {
		if (dax_is_conflict(entry))
			goto fallback;
		if (!xa_is_value(entry)) {
			xas_set_err(xas, -EIO);
			goto out_unlock;
		}

		if (order == 0) {
			if (dax_is_pmd_entry(entry) &&
			    (dax_is_zero_entry(entry) ||
			     dax_is_empty_entry(entry))) {
				pmd_downgrade = true;
			}
		}
	}

	if (pmd_downgrade) {
		/*
		 * Lock the PMD entry first so that it stays in place while
		 * the xa lock is dropped below.
		 */
		dax_lock_entry(xas, entry);

		/*
		 * A zero-page PMD entry has its PMD range unmapped from
		 * @mapping before the entry is removed.  unmap_mapping_pages()
		 * is called with the xa lock dropped.
		 */
		if (dax_is_zero_entry(entry)) {
			xas_unlock_irq(xas);
			unmap_mapping_pages(mapping,
					xas->xa_index & ~PG_PMD_COLOUR,
					PG_PMD_NR, false);
			xas_reset(xas);
			xas_lock_irq(xas);
		}

		dax_disassociate_entry(entry, mapping, false);
		xas_store(xas, NULL);	/* remove the PMD entry */
		dax_wake_entry(xas, entry, WAKE_ALL);
		mapping->nrpages -= PG_PMD_NR;
		entry = NULL;
		/* Point @xas back at the originally requested index. */
		xas_set(xas, index);
	}

	if (entry) {
		dax_lock_entry(xas, entry);
	} else {
		unsigned long flags = DAX_EMPTY;

		if (order > 0)
			flags |= DAX_PMD;
		entry = dax_make_entry(pfn_to_pfn_t(0), flags);
		dax_lock_entry(xas, entry);
		if (xas_error(xas))
			goto out_unlock;
		mapping->nrpages += 1UL << order;
	}

out_unlock:
	xas_unlock_irq(xas);
	/*
	 * Retry from the top if xas_nomem() asks us to (presumably after
	 * allocating memory for a failed store - see the XArray docs).
	 */
	if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM))
		goto retry;
	if (xas->xa_node == XA_ERROR(-ENOMEM))
		return xa_mk_internal(VM_FAULT_OOM);
	if (xas_error(xas))
		return xa_mk_internal(VM_FAULT_SIGBUS);
	return entry;
fallback:
	xas_unlock_irq(xas);
	return xa_mk_internal(VM_FAULT_FALLBACK);
}
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
/*
 * dax_layout_busy_page_range - find the first busy page in a byte range
 * @mapping: address space to scan
 * @start: first byte of the range
 * @end: last byte of the range; LLONG_MAX means "up to the largest index"
 *
 * Unmaps the range from @mapping, then walks its entries looking for a
 * page whose reference count is above 1 (see dax_busy_page()).
 *
 * Return: the first such page, or NULL if there is none.  Also NULL when
 * CONFIG_FS_DAX_LIMITED is enabled, when @mapping is not a DAX mapping, or
 * when @mapping is not currently mapped.
 */
struct page *dax_layout_busy_page_range(struct address_space *mapping,
					loff_t start, loff_t end)
{
	void *entry;
	unsigned int scanned = 0;
	struct page *page = NULL;
	pgoff_t start_idx = start >> PAGE_SHIFT;
	pgoff_t end_idx;
	XA_STATE(xas, &mapping->i_pages, start_idx);

	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
		return NULL;

	if (!dax_mapping(mapping) || !mapping_mapped(mapping))
		return NULL;

	/* LLONG_MAX selects every index from start_idx upwards. */
	if (end == LLONG_MAX)
		end_idx = ULONG_MAX;
	else
		end_idx = end >> PAGE_SHIFT;

	/*
	 * Tear down user mappings of the range before checking reference
	 * counts.  With start_idx == 0 and end_idx == ULONG_MAX the page
	 * count below wraps to 0.
	 * NOTE(review): unmap_mapping_pages() presumably treats a count of 0
	 * as "to the end of the file" - confirm.
	 */
	unmap_mapping_pages(mapping, start_idx, end_idx - start_idx + 1, 0);

	xas_lock_irq(&xas);
	xas_for_each(&xas, entry, end_idx) {
		if (WARN_ON_ONCE(!xa_is_value(entry)))
			continue;
		if (unlikely(dax_is_locked(entry)))
			entry = get_unlocked_entry(&xas, 0);
		if (entry)
			page = dax_busy_page(entry);
		put_unlocked_entry(&xas, entry, WAKE_NEXT);
		if (page)
			break;
		/* Every XA_CHECK_SCHED entries, drop the lock and reschedule. */
		if (++scanned % XA_CHECK_SCHED)
			continue;

		xas_pause(&xas);
		xas_unlock_irq(&xas);
		cond_resched();
		xas_lock_irq(&xas);
	}
	xas_unlock_irq(&xas);
	return page;
}
EXPORT_SYMBOL_GPL(dax_layout_busy_page_range);
651
652struct page *dax_layout_busy_page(struct address_space *mapping)
653{
654 return dax_layout_busy_page_range(mapping, 0, LLONG_MAX);
655}
656EXPORT_SYMBOL_GPL(dax_layout_busy_page);
657
/*
 * Remove the DAX entry at @index from @mapping.
 *
 * When @trunc is false the entry is left alone if it carries the DIRTY or
 * TOWRITE mark.  @trunc is also passed to dax_disassociate_entry(), where
 * it enables the reference-count warning.
 *
 * Return: 1 if an entry was removed, 0 otherwise.
 */
static int __dax_invalidate_entry(struct address_space *mapping,
					  pgoff_t index, bool trunc)
{
	XA_STATE(xas, &mapping->i_pages, index);
	int ret = 0;
	void *entry;

	xas_lock_irq(&xas);
	entry = get_unlocked_entry(&xas, 0);
	if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
		goto out;
	if (!trunc &&
	    (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) ||
	     xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE)))
		goto out;
	dax_disassociate_entry(entry, mapping, trunc);
	xas_store(&xas, NULL);
	mapping->nrpages -= 1UL << dax_entry_order(entry);
	ret = 1;
out:
	/* Wake all waiters, on both the removed and the untouched path. */
	put_unlocked_entry(&xas, entry, WAKE_ALL);
	xas_unlock_irq(&xas);
	return ret;
}
682
683
684
685
686
687int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
688{
689 int ret = __dax_invalidate_entry(mapping, index, true);
690
691
692
693
694
695
696
697
698 WARN_ON_ONCE(!ret);
699 return ret;
700}
701
702
703
704
705int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
706 pgoff_t index)
707{
708 return __dax_invalidate_entry(mapping, index, false);
709}
710
711static pgoff_t dax_iomap_pgoff(const struct iomap *iomap, loff_t pos)
712{
713 return PHYS_PFN(iomap->addr + (pos & PAGE_MASK) - iomap->offset);
714}
715
716static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter)
717{
718 pgoff_t pgoff = dax_iomap_pgoff(&iter->iomap, iter->pos);
719 void *vto, *kaddr;
720 long rc;
721 int id;
722
723 id = dax_read_lock();
724 rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, &kaddr, NULL);
725 if (rc < 0) {
726 dax_read_unlock(id);
727 return rc;
728 }
729 vto = kmap_atomic(vmf->cow_page);
730 copy_user_page(vto, kaddr, vmf->address, vmf->cow_page);
731 kunmap_atomic(vto);
732 dax_read_unlock(id);
733 return 0;
734}
735
736
737
738
739
740
741
742
/*
 * Install the entry for (@pfn, @flags) at @xas.
 *
 * @entry is the entry currently held by the caller; the WARN below expects
 * its locked form to be what is stored in the array.  If @entry is a
 * zero-page or empty entry it is replaced by the new entry, stored locked.
 * Otherwise the existing entry is kept and @xas is just re-walked.  When
 * @dirty is set the inode is marked dirty and the entry gets the DIRTY
 * mark.
 *
 * Return: the entry now in place (without the lock bit): the new entry if
 * a replacement happened, else @entry.
 */
static void *dax_insert_entry(struct xa_state *xas,
		struct address_space *mapping, struct vm_fault *vmf,
		void *entry, pfn_t pfn, unsigned long flags, bool dirty)
{
	void *new_entry = dax_make_entry(pfn, flags);

	if (dirty)
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

	/*
	 * Replacing a zero-page entry with a non-zero-page one: unmap the
	 * old zero-page mappings first (PMD range or single page).
	 */
	if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) {
		unsigned long index = xas->xa_index;
		/* we are replacing a zero page with block mapping */
		if (dax_is_pmd_entry(entry))
			unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
					PG_PMD_NR, false);
		else /* pte entry */
			unmap_mapping_pages(mapping, index, 1, false);
	}

	xas_reset(xas);
	xas_lock_irq(xas);
	if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
		void *old;

		dax_disassociate_entry(entry, mapping, false);
		dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);
		/*
		 * Store the new entry with the lock bit set, so the entry
		 * stays locked across the replacement.  The old value must
		 * have been the locked form of @entry.
		 */
		old = dax_lock_entry(xas, new_entry);
		WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) |
					DAX_LOCKED));
		entry = new_entry;
	} else {
		xas_load(xas);	/* Walk the xa_state */
	}

	if (dirty)
		xas_set_mark(xas, PAGECACHE_TAG_DIRTY);

	xas_unlock_irq(xas);
	return entry;
}
791
792static inline
793unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
794{
795 unsigned long address;
796
797 address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
798 VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
799 return address;
800}
801
802
803static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
804 unsigned long pfn)
805{
806 struct vm_area_struct *vma;
807 pte_t pte, *ptep = NULL;
808 pmd_t *pmdp = NULL;
809 spinlock_t *ptl;
810
811 i_mmap_lock_read(mapping);
812 vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
813 struct mmu_notifier_range range;
814 unsigned long address;
815
816 cond_resched();
817
818 if (!(vma->vm_flags & VM_SHARED))
819 continue;
820
821 address = pgoff_address(index, vma);
822
823
824
825
826
827
828 if (follow_invalidate_pte(vma->vm_mm, address, &range, &ptep,
829 &pmdp, &ptl))
830 continue;
831
832
833
834
835
836
837
838
839 if (pmdp) {
840#ifdef CONFIG_FS_DAX_PMD
841 pmd_t pmd;
842
843 if (pfn != pmd_pfn(*pmdp))
844 goto unlock_pmd;
845 if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
846 goto unlock_pmd;
847
848 flush_cache_page(vma, address, pfn);
849 pmd = pmdp_invalidate(vma, address, pmdp);
850 pmd = pmd_wrprotect(pmd);
851 pmd = pmd_mkclean(pmd);
852 set_pmd_at(vma->vm_mm, address, pmdp, pmd);
853unlock_pmd:
854#endif
855 spin_unlock(ptl);
856 } else {
857 if (pfn != pte_pfn(*ptep))
858 goto unlock_pte;
859 if (!pte_dirty(*ptep) && !pte_write(*ptep))
860 goto unlock_pte;
861
862 flush_cache_page(vma, address, pfn);
863 pte = ptep_clear_flush(vma, address, ptep);
864 pte = pte_wrprotect(pte);
865 pte = pte_mkclean(pte);
866 set_pte_at(vma->vm_mm, address, ptep, pte);
867unlock_pte:
868 pte_unmap_unlock(ptep, ptl);
869 }
870
871 mmu_notifier_invalidate_range_end(&range);
872 }
873 i_mmap_unlock_read(mapping);
874}
875
/*
 * Flush the storage behind one DAX entry that is tagged TOWRITE.
 *
 * Called with the xa lock held (it is dropped and re-taken inside) and
 * returns with it held on every path after the initial value check.
 *
 * Return: 0 on success or when there is nothing to do, -EIO if @entry is
 * not a value entry or turns out to be an empty/zero-page entry.
 */
static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
		struct address_space *mapping, void *entry)
{
	unsigned long pfn, index, count;
	long ret = 0;

	/* Only value entries are expected in a DAX mapping. */
	if (WARN_ON(!xa_is_value(entry)))
		return -EIO;

	if (unlikely(dax_is_locked(entry))) {
		void *old_entry = entry;

		entry = get_unlocked_entry(xas, 0);

		/* Entry may have been removed while we slept. */
		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
			goto put_unlocked;
		/*
		 * If the entry now refers to a different pfn than when we
		 * started, it is not the entry we were asked to write back;
		 * leave it alone.
		 */
		if (dax_to_pfn(old_entry) != dax_to_pfn(entry))
			goto put_unlocked;
		if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
					dax_is_zero_entry(entry))) {
			ret = -EIO;
			goto put_unlocked;
		}

		/* Someone else already cleared the TOWRITE mark. */
		if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE))
			goto put_unlocked;
	}

	/* Lock the entry so it stays in place while the xa lock is dropped. */
	dax_lock_entry(xas, entry);

	/*
	 * Clear TOWRITE now, before the flush.  The DIRTY mark is cleared
	 * only after the flush, further down.
	 */
	xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE);
	xas_unlock_irq(xas);

	/*
	 * A PMD entry covers 1 << PMD_ORDER pages; round the index down to
	 * the start of the entry so the whole range is cleaned and flushed.
	 */
	pfn = dax_to_pfn(entry);
	count = 1UL << dax_entry_order(entry);
	index = xas->xa_index & ~(count - 1);

	dax_entry_mkclean(mapping, index, pfn);
	dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE);
	/*
	 * Re-take the xa lock, store the unlocked entry back (which releases
	 * the entry lock), clear the DIRTY mark and wake the next waiter.
	 */
	xas_reset(xas);
	xas_lock_irq(xas);
	xas_store(xas, entry);
	xas_clear_mark(xas, PAGECACHE_TAG_DIRTY);
	dax_wake_entry(xas, entry, WAKE_NEXT);

	trace_dax_writeback_one(mapping->host, index, count);
	return ret;

 put_unlocked:
	put_unlocked_entry(xas, entry, WAKE_NEXT);
	return ret;
}
960
961
962
963
964
965
/*
 * dax_writeback_mapping_range - flush dirty DAX entries in a range
 * @mapping: address space to write back
 * @dax_dev: DAX device backing @mapping
 * @wbc: writeback control giving the byte range and the sync mode
 *
 * Tags the range for writeback and then runs dax_writeback_one() on every
 * entry carrying the TOWRITE mark.  Only WB_SYNC_ALL writeback does any
 * work.
 *
 * Return: 0 on success, -EIO if the inode block size is not PAGE_SIZE, or
 * the first negative value returned by dax_writeback_one() (which is also
 * recorded with mapping_set_error()).
 */
int dax_writeback_mapping_range(struct address_space *mapping,
		struct dax_device *dax_dev, struct writeback_control *wbc)
{
	XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT);
	struct inode *inode = mapping->host;
	pgoff_t end_index = wbc->range_end >> PAGE_SHIFT;
	void *entry;
	int ret = 0;
	unsigned int scanned = 0;

	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
		return -EIO;

	if (mapping_empty(mapping) || wbc->sync_mode != WB_SYNC_ALL)
		return 0;

	trace_dax_writeback_range(inode, xas.xa_index, end_index);

	tag_pages_for_writeback(mapping, xas.xa_index, end_index);

	xas_lock_irq(&xas);
	xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) {
		ret = dax_writeback_one(&xas, dax_dev, mapping, entry);
		if (ret < 0) {
			mapping_set_error(mapping, ret);
			break;
		}
		/* Every XA_CHECK_SCHED entries, drop the lock and reschedule. */
		if (++scanned % XA_CHECK_SCHED)
			continue;

		xas_pause(&xas);
		xas_unlock_irq(&xas);
		cond_resched();
		xas_lock_irq(&xas);
	}
	xas_unlock_irq(&xas);
	trace_dax_writeback_range_done(inode, xas.xa_index, end_index);
	return ret;
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
1006
1007static int dax_iomap_pfn(const struct iomap *iomap, loff_t pos, size_t size,
1008 pfn_t *pfnp)
1009{
1010 pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
1011 int id, rc;
1012 long length;
1013
1014 id = dax_read_lock();
1015 length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
1016 NULL, pfnp);
1017 if (length < 0) {
1018 rc = length;
1019 goto out;
1020 }
1021 rc = -EINVAL;
1022 if (PFN_PHYS(length) < size)
1023 goto out;
1024 if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1))
1025 goto out;
1026
1027 if (length > 1 && !pfn_t_devmap(*pfnp))
1028 goto out;
1029 rc = 0;
1030out:
1031 dax_read_unlock(id);
1032 return rc;
1033}
1034
1035
1036
1037
1038
1039
1040
1041
1042static vm_fault_t dax_load_hole(struct xa_state *xas,
1043 struct address_space *mapping, void **entry,
1044 struct vm_fault *vmf)
1045{
1046 struct inode *inode = mapping->host;
1047 unsigned long vaddr = vmf->address;
1048 pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
1049 vm_fault_t ret;
1050
1051 *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
1052 DAX_ZERO_PAGE, false);
1053
1054 ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
1055 trace_dax_load_hole(inode, vmf, ret);
1056 return ret;
1057}
1058
#ifdef CONFIG_FS_DAX_PMD
/*
 * Handle a PMD fault over a hole: record a DAX_PMD | DAX_ZERO_PAGE entry
 * (replacing *@entry) and map the huge zero page at the PMD-aligned fault
 * address.
 *
 * Return: VM_FAULT_NOPAGE on success; VM_FAULT_FALLBACK if the huge zero
 * page is unavailable or the PMD is already populated; VM_FAULT_OOM if a
 * page table to deposit could not be allocated.
 */
static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
		const struct iomap *iomap, void **entry)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	unsigned long pmd_addr = vmf->address & PMD_MASK;
	struct vm_area_struct *vma = vmf->vma;
	struct inode *inode = mapping->host;
	pgtable_t pgtable = NULL;
	struct page *zero_page;
	spinlock_t *ptl;
	pmd_t pmd_entry;
	pfn_t pfn;

	zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);

	if (unlikely(!zero_page))
		goto fallback;

	pfn = page_to_pfn_t(zero_page);
	*entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
			DAX_PMD | DAX_ZERO_PAGE, false);

	/* Some architectures need a page table deposited with a huge PMD. */
	if (arch_needs_pgtable_deposit()) {
		pgtable = pte_alloc_one(vma->vm_mm);
		if (!pgtable)
			return VM_FAULT_OOM;
	}

	ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
	/* The PMD was populated by someone else; fall back. */
	if (!pmd_none(*(vmf->pmd))) {
		spin_unlock(ptl);
		goto fallback;
	}

	if (pgtable) {
		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
		mm_inc_nr_ptes(vma->vm_mm);
	}
	pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
	pmd_entry = pmd_mkhuge(pmd_entry);
	set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
	spin_unlock(ptl);
	trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry);
	return VM_FAULT_NOPAGE;

fallback:
	/* Free the page table only if it was allocated and not deposited. */
	if (pgtable)
		pte_free(vma->vm_mm, pgtable);
	trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry);
	return VM_FAULT_FALLBACK;
}
#else
/* Without CONFIG_FS_DAX_PMD, PMD hole faults always fall back. */
static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
		const struct iomap *iomap, void **entry)
{
	return VM_FAULT_FALLBACK;
}
#endif /* CONFIG_FS_DAX_PMD */
1118
1119static int dax_memzero(struct dax_device *dax_dev, pgoff_t pgoff,
1120 unsigned int offset, size_t size)
1121{
1122 void *kaddr;
1123 long ret;
1124
1125 ret = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL);
1126 if (ret > 0) {
1127 memset(kaddr + offset, 0, size);
1128 dax_flush(dax_dev, kaddr + offset, size);
1129 }
1130 return ret;
1131}
1132
1133static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
1134{
1135 const struct iomap *iomap = &iter->iomap;
1136 const struct iomap *srcmap = iomap_iter_srcmap(iter);
1137 loff_t pos = iter->pos;
1138 u64 length = iomap_length(iter);
1139 s64 written = 0;
1140
1141
1142 if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
1143 return length;
1144
1145 do {
1146 unsigned offset = offset_in_page(pos);
1147 unsigned size = min_t(u64, PAGE_SIZE - offset, length);
1148 pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
1149 long rc;
1150 int id;
1151
1152 id = dax_read_lock();
1153 if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE)
1154 rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
1155 else
1156 rc = dax_memzero(iomap->dax_dev, pgoff, offset, size);
1157 dax_read_unlock(id);
1158
1159 if (rc < 0)
1160 return rc;
1161 pos += size;
1162 length -= size;
1163 written += size;
1164 if (did_zero)
1165 *did_zero = true;
1166 } while (length > 0);
1167
1168 return written;
1169}
1170
1171int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
1172 const struct iomap_ops *ops)
1173{
1174 struct iomap_iter iter = {
1175 .inode = inode,
1176 .pos = pos,
1177 .len = len,
1178 .flags = IOMAP_DAX | IOMAP_ZERO,
1179 };
1180 int ret;
1181
1182 while ((ret = iomap_iter(&iter, ops)) > 0)
1183 iter.processed = dax_zero_iter(&iter, did_zero);
1184 return ret;
1185}
1186EXPORT_SYMBOL_GPL(dax_zero_range);
1187
1188int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
1189 const struct iomap_ops *ops)
1190{
1191 unsigned int blocksize = i_blocksize(inode);
1192 unsigned int off = pos & (blocksize - 1);
1193
1194
1195 if (!off)
1196 return 0;
1197 return dax_zero_range(inode, pos, blocksize - off, did_zero, ops);
1198}
1199EXPORT_SYMBOL_GPL(dax_truncate_page);
1200
/*
 * Perform the read or write described by @iter against the current mapping
 * of @iomi, copying directly to or from the DAX device.
 *
 * Return: number of bytes transferred if any; otherwise 0 or a negative
 * error (-EIO for an unexpected mapping type, -EINTR on a fatal signal,
 * -EFAULT when a copy makes no progress, or the dax_direct_access()
 * error).
 */
static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
		struct iov_iter *iter)
{
	const struct iomap *iomap = &iomi->iomap;
	loff_t length = iomap_length(iomi);
	loff_t pos = iomi->pos;
	struct dax_device *dax_dev = iomap->dax_dev;
	loff_t end = pos + length, done = 0;
	ssize_t ret = 0;
	size_t xfer;
	int id;

	if (iov_iter_rw(iter) == READ) {
		/* Reads stop at i_size. */
		end = min(end, i_size_read(iomi->inode));
		if (pos >= end)
			return 0;

		/* Holes and unwritten extents read back as zeroes. */
		if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
			return iov_iter_zero(min(length, end - pos), iter);
	}

	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
		return -EIO;

	/*
	 * For a mapping flagged IOMAP_F_NEW, invalidate the page cache range
	 * of the inode's mapping before touching the storage.
	 * NOTE(review): presumably this drops stale entries over freshly
	 * allocated blocks - confirm.
	 */
	if (iomap->flags & IOMAP_F_NEW) {
		invalidate_inode_pages2_range(iomi->inode->i_mapping,
					      pos >> PAGE_SHIFT,
					      (end - 1) >> PAGE_SHIFT);
	}

	id = dax_read_lock();
	while (pos < end) {
		unsigned offset = pos & (PAGE_SIZE - 1);
		const size_t size = ALIGN(length + offset, PAGE_SIZE);
		pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
		ssize_t map_len;
		void *kaddr;

		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
				&kaddr, NULL);
		if (map_len < 0) {
			ret = map_len;
			break;
		}

		/*
		 * Convert the page count to bytes, skip the in-page offset
		 * and clamp to what is left of the request.
		 */
		map_len = PFN_PHYS(map_len);
		kaddr += offset;
		map_len -= offset;
		if (map_len > end - pos)
			map_len = end - pos;

		if (iov_iter_rw(iter) == WRITE)
			xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
					map_len, iter);
		else
			xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
					map_len, iter);

		pos += xfer;
		length -= xfer;
		done += xfer;

		/* A short copy ends the loop; no progress at all is -EFAULT. */
		if (xfer == 0)
			ret = -EFAULT;
		if (xfer < map_len)
			break;
	}
	dax_read_unlock(id);

	/* Partial progress takes precedence over an error. */
	return done ? done : ret;
}
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293ssize_t
1294dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
1295 const struct iomap_ops *ops)
1296{
1297 struct iomap_iter iomi = {
1298 .inode = iocb->ki_filp->f_mapping->host,
1299 .pos = iocb->ki_pos,
1300 .len = iov_iter_count(iter),
1301 .flags = IOMAP_DAX,
1302 };
1303 loff_t done = 0;
1304 int ret;
1305
1306 if (iov_iter_rw(iter) == WRITE) {
1307 lockdep_assert_held_write(&iomi.inode->i_rwsem);
1308 iomi.flags |= IOMAP_WRITE;
1309 } else {
1310 lockdep_assert_held(&iomi.inode->i_rwsem);
1311 }
1312
1313 if (iocb->ki_flags & IOCB_NOWAIT)
1314 iomi.flags |= IOMAP_NOWAIT;
1315
1316 while ((ret = iomap_iter(&iomi, ops)) > 0)
1317 iomi.processed = dax_iomap_iter(&iomi, iter);
1318
1319 done = iomi.pos - iocb->ki_pos;
1320 iocb->ki_pos = iomi.pos;
1321 return done ? done : ret;
1322}
1323EXPORT_SYMBOL_GPL(dax_iomap_rw);
1324
1325static vm_fault_t dax_fault_return(int error)
1326{
1327 if (error == 0)
1328 return VM_FAULT_NOPAGE;
1329 return vmf_error(error);
1330}
1331
1332
1333
1334
1335
1336static bool dax_fault_is_synchronous(unsigned long flags,
1337 struct vm_area_struct *vma, const struct iomap *iomap)
1338{
1339 return (flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC)
1340 && (iomap->flags & IOMAP_F_DIRTY);
1341}
1342
1343
1344
1345
1346
1347
1348
1349static vm_fault_t dax_fault_synchronous_pfnp(pfn_t *pfnp, pfn_t pfn)
1350{
1351 if (WARN_ON_ONCE(!pfnp))
1352 return VM_FAULT_SIGBUS;
1353 *pfnp = pfn;
1354 return VM_FAULT_NEEDDSYNC;
1355}
1356
1357static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf,
1358 const struct iomap_iter *iter)
1359{
1360 vm_fault_t ret;
1361 int error = 0;
1362
1363 switch (iter->iomap.type) {
1364 case IOMAP_HOLE:
1365 case IOMAP_UNWRITTEN:
1366 clear_user_highpage(vmf->cow_page, vmf->address);
1367 break;
1368 case IOMAP_MAPPED:
1369 error = copy_cow_page_dax(vmf, iter);
1370 break;
1371 default:
1372 WARN_ON_ONCE(1);
1373 error = -EIO;
1374 break;
1375 }
1376
1377 if (error)
1378 return dax_fault_return(error);
1379
1380 __SetPageUptodate(vmf->cow_page);
1381 ret = finish_fault(vmf);
1382 if (!ret)
1383 return VM_FAULT_DONE_COW;
1384 return ret;
1385}
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
/*
 * Common fault handling for one iomap mapping, shared by the PTE and PMD
 * paths.
 * @vmf: fault being handled
 * @iter: iomap iterator positioned on the mapping for the fault
 * @pfnp: where to return the pfn for a synchronous fault
 * @xas: xarray state positioned on the faulting index
 * @entry: in/out - the locked entry, updated by dax_insert_entry()
 * @pmd: true when handling a PMD-sized fault
 *
 * Return: a VM_FAULT_* code.  On the PMD path, errors turn into
 * VM_FAULT_FALLBACK rather than SIGBUS or an errno-derived code.
 */
static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
		const struct iomap_iter *iter, pfn_t *pfnp,
		struct xa_state *xas, void **entry, bool pmd)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	const struct iomap *iomap = &iter->iomap;
	size_t size = pmd ? PMD_SIZE : PAGE_SIZE;
	loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT;
	bool write = vmf->flags & FAULT_FLAG_WRITE;
	bool sync = dax_fault_is_synchronous(iter->flags, vmf->vma, iomap);
	unsigned long entry_flags = pmd ? DAX_PMD : 0;
	int err = 0;
	pfn_t pfn;

	/* COW faults are only handled at PTE granularity. */
	if (!pmd && vmf->cow_page)
		return dax_fault_cow_page(vmf, iter);

	/* Read faults over holes or unwritten extents map zero pages. */
	if (!write &&
	    (iomap->type == IOMAP_UNWRITTEN || iomap->type == IOMAP_HOLE)) {
		if (!pmd)
			return dax_load_hole(xas, mapping, entry, vmf);
		return dax_pmd_load_hole(xas, vmf, iomap, entry);
	}

	if (iomap->type != IOMAP_MAPPED) {
		WARN_ON_ONCE(1);
		return pmd ? VM_FAULT_FALLBACK : VM_FAULT_SIGBUS;
	}

	err = dax_iomap_pfn(&iter->iomap, pos, size, &pfn);
	if (err)
		return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err);

	/* A synchronous write fault does not mark the entry dirty here. */
	*entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, entry_flags,
				  write && !sync);

	/* Synchronous faults return the pfn instead of mapping it here. */
	if (sync)
		return dax_fault_synchronous_pfnp(pfnp, pfn);

	/* insert PMD pfn */
	if (pmd)
		return vmf_insert_pfn_pmd(vmf, pfn, write);

	/* insert PTE pfn */
	if (write)
		return vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
	return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
}
1445
/*
 * Handle a PTE-sized page fault on a DAX file.
 * @vmf: fault description
 * @pfnp: where to return the pfn for a synchronous fault
 * @iomap_errp: if non-NULL, receives the final iomap_iter() result
 * @ops: iomap ops of the file system
 *
 * Return: a VM_FAULT_* code.
 */
static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
			       int *iomap_errp, const struct iomap_ops *ops)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
	struct iomap_iter iter = {
		.inode		= mapping->host,
		.pos		= (loff_t)vmf->pgoff << PAGE_SHIFT,
		.len		= PAGE_SIZE,
		.flags		= IOMAP_DAX | IOMAP_FAULT,
	};
	vm_fault_t ret = 0;
	void *entry;
	int error;

	trace_dax_pte_fault(iter.inode, vmf, ret);
	/* Faults at or beyond i_size get SIGBUS. */
	if (iter.pos >= i_size_read(iter.inode)) {
		ret = VM_FAULT_SIGBUS;
		goto out;
	}

	/* A COW write fault only reads the file, so IOMAP_WRITE is not set. */
	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
		iter.flags |= IOMAP_WRITE;

	entry = grab_mapping_entry(&xas, mapping, 0);
	/* Internal entries encode a VM_FAULT_* error from the grab. */
	if (xa_is_internal(entry)) {
		ret = xa_to_internal(entry);
		goto out;
	}

	/*
	 * If the PMD is already a huge or devmap mapping there is nothing
	 * for a PTE fault to install; return VM_FAULT_NOPAGE.
	 * NOTE(review): presumably a racing PMD fault populated it - confirm.
	 */
	if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) {
		ret = VM_FAULT_NOPAGE;
		goto unlock_entry;
	}

	while ((error = iomap_iter(&iter, ops)) > 0) {
		if (WARN_ON_ONCE(iomap_length(&iter) < PAGE_SIZE)) {
			iter.processed = -EIO;	/* fs corruption? */
			continue;
		}

		ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, false);
		/* A newly allocated mapping counts as a major fault. */
		if (ret != VM_FAULT_SIGBUS &&
		    (iter.iomap.flags & IOMAP_F_NEW)) {
			count_vm_event(PGMAJFAULT);
			count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
			ret |= VM_FAULT_MAJOR;
		}

		/* Report the page as processed only if no error bit is set. */
		if (!(ret & VM_FAULT_ERROR))
			iter.processed = PAGE_SIZE;
	}

	if (iomap_errp)
		*iomap_errp = error;
	if (!ret && error)
		ret = dax_fault_return(error);

unlock_entry:
	dax_unlock_entry(&xas, entry);
out:
	trace_dax_pte_fault_done(iter.inode, vmf, ret);
	return ret;
}
1521
1522#ifdef CONFIG_FS_DAX_PMD
1523static bool dax_fault_check_fallback(struct vm_fault *vmf, struct xa_state *xas,
1524 pgoff_t max_pgoff)
1525{
1526 unsigned long pmd_addr = vmf->address & PMD_MASK;
1527 bool write = vmf->flags & FAULT_FLAG_WRITE;
1528
1529
1530
1531
1532
1533
1534
1535 if ((vmf->pgoff & PG_PMD_COLOUR) !=
1536 ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR))
1537 return true;
1538
1539
1540 if (write && !(vmf->vma->vm_flags & VM_SHARED))
1541 return true;
1542
1543
1544 if (pmd_addr < vmf->vma->vm_start)
1545 return true;
1546 if ((pmd_addr + PMD_SIZE) > vmf->vma->vm_end)
1547 return true;
1548
1549
1550 if ((xas->xa_index | PG_PMD_COLOUR) >= max_pgoff)
1551 return true;
1552
1553 return false;
1554}
1555
1556static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
1557 const struct iomap_ops *ops)
1558{
1559 struct address_space *mapping = vmf->vma->vm_file->f_mapping;
1560 XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER);
1561 struct iomap_iter iter = {
1562 .inode = mapping->host,
1563 .len = PMD_SIZE,
1564 .flags = IOMAP_DAX | IOMAP_FAULT,
1565 };
1566 vm_fault_t ret = VM_FAULT_FALLBACK;
1567 pgoff_t max_pgoff;
1568 void *entry;
1569 int error;
1570
1571 if (vmf->flags & FAULT_FLAG_WRITE)
1572 iter.flags |= IOMAP_WRITE;
1573
1574
1575
1576
1577
1578
1579 max_pgoff = DIV_ROUND_UP(i_size_read(iter.inode), PAGE_SIZE);
1580
1581 trace_dax_pmd_fault(iter.inode, vmf, max_pgoff, 0);
1582
1583 if (xas.xa_index >= max_pgoff) {
1584 ret = VM_FAULT_SIGBUS;
1585 goto out;
1586 }
1587
1588 if (dax_fault_check_fallback(vmf, &xas, max_pgoff))
1589 goto fallback;
1590
1591
1592
1593
1594
1595
1596
1597 entry = grab_mapping_entry(&xas, mapping, PMD_ORDER);
1598 if (xa_is_internal(entry)) {
1599 ret = xa_to_internal(entry);
1600 goto fallback;
1601 }
1602
1603
1604
1605
1606
1607
1608
1609 if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) &&
1610 !pmd_devmap(*vmf->pmd)) {
1611 ret = 0;
1612 goto unlock_entry;
1613 }
1614
1615 iter.pos = (loff_t)xas.xa_index << PAGE_SHIFT;
1616 while ((error = iomap_iter(&iter, ops)) > 0) {
1617 if (iomap_length(&iter) < PMD_SIZE)
1618 continue;
1619
1620 ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true);
1621 if (ret != VM_FAULT_FALLBACK)
1622 iter.processed = PMD_SIZE;
1623 }
1624
1625unlock_entry:
1626 dax_unlock_entry(&xas, entry);
1627fallback:
1628 if (ret == VM_FAULT_FALLBACK) {
1629 split_huge_pmd(vmf->vma, vmf->pmd, vmf->address);
1630 count_vm_event(THP_FAULT_FALLBACK);
1631 }
1632out:
1633 trace_dax_pmd_fault_done(iter.inode, vmf, max_pgoff, ret);
1634 return ret;
1635}
1636#else
/*
 * Stub compiled when CONFIG_FS_DAX_PMD is not set: no PMD mapping is
 * attempted and every call returns VM_FAULT_FALLBACK.  The arguments are
 * unused.
 */
static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
		const struct iomap_ops *ops)
{
	return VM_FAULT_FALLBACK;
}
1642#endif
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
1658 pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops)
1659{
1660 switch (pe_size) {
1661 case PE_SIZE_PTE:
1662 return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops);
1663 case PE_SIZE_PMD:
1664 return dax_iomap_pmd_fault(vmf, pfnp, ops);
1665 default:
1666 return VM_FAULT_FALLBACK;
1667 }
1668}
1669EXPORT_SYMBOL_GPL(dax_iomap_fault);
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680static vm_fault_t
1681dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
1682{
1683 struct address_space *mapping = vmf->vma->vm_file->f_mapping;
1684 XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
1685 void *entry;
1686 vm_fault_t ret;
1687
1688 xas_lock_irq(&xas);
1689 entry = get_unlocked_entry(&xas, order);
1690
1691 if (!entry || dax_is_conflict(entry) ||
1692 (order == 0 && !dax_is_pte_entry(entry))) {
1693 put_unlocked_entry(&xas, entry, WAKE_NEXT);
1694 xas_unlock_irq(&xas);
1695 trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
1696 VM_FAULT_NOPAGE);
1697 return VM_FAULT_NOPAGE;
1698 }
1699 xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
1700 dax_lock_entry(&xas, entry);
1701 xas_unlock_irq(&xas);
1702 if (order == 0)
1703 ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
1704#ifdef CONFIG_FS_DAX_PMD
1705 else if (order == PMD_ORDER)
1706 ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE);
1707#endif
1708 else
1709 ret = VM_FAULT_FALLBACK;
1710 dax_unlock_entry(&xas, entry);
1711 trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
1712 return ret;
1713}
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
1726 enum page_entry_size pe_size, pfn_t pfn)
1727{
1728 int err;
1729 loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
1730 unsigned int order = pe_order(pe_size);
1731 size_t len = PAGE_SIZE << order;
1732
1733 err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
1734 if (err)
1735 return VM_FAULT_SIGBUS;
1736 return dax_insert_pfn_mkwrite(vmf, pfn, order);
1737}
1738EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
1739