// SPDX-License-Identifier: GPL-2.0-only
/*
 *	linux/mm/filemap.c
 *
 * Copyright (C) 1994-1999  Linus Torvalds
 */

/*
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
#include <linux/export.h>
#include <linux/compiler.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>
#include <linux/capability.h>
#include <linux/kernel_stat.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/error-injection.h>
#include <linux/hash.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/security.h>
#include <linux/cpuset.h>
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/cleancache.h>
#include <linux/shmem_fs.h>
#include <linux/rmap.h>
#include <linux/delayacct.h>
#include <linux/psi.h>
#include <linux/ramfs.h>
#include <linux/page_idle.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/filemap.h>

/*
 * FIXME: remove all knowledge of the buffer layer from the core VM
 */
#include <linux/buffer_head.h> /* for try_to_free_buffers */

#include <asm/mman.h>

/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet,
 * though.
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 *
 * finished 'unifying' the page and buffer cache and SMP-threaded the
 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
 *
 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
 */

/*
 * Lock ordering:
 *
 *  ->i_mmap_rwsem		(truncate_pagecache)
 *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
 *      ->swap_lock		(exclusive_swap_page, others)
 *        ->i_pages lock
 *
 *  ->i_rwsem
 *    ->invalidate_lock		(acquired by fs in truncate path)
 *      ->i_mmap_rwsem		(truncate->unmap_mapping_range)
 *
 *  ->mmap_lock
 *    ->i_mmap_rwsem
 *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
 *        ->i_pages lock	(arch-dependent flush_dcache_mmap_lock)
 *
 *  ->mmap_lock
 *    ->invalidate_lock		(filemap_fault)
 *      ->lock_page		(filemap_fault, access_process_vm)
 *
 *  ->i_rwsem			(generic_perform_write)
 *    ->mmap_lock		(fault_in_pages_readable->do_page_fault)
 *
 *  bdi->wb.list_lock
 *    sb_lock			(fs/fs-writeback.c)
 *    ->i_pages lock		(__sync_single_inode)
 *
 *  ->i_mmap_rwsem
 *    ->anon_vma.lock		(vma_adjust)
 *
 *  ->anon_vma.lock
 *    ->page_table_lock or pte_lock	(anon_vma_prepare and lots of others)
 *
 *  ->page_table_lock or pte_lock
 *    ->swap_lock		(try_to_unmap_one)
 *    ->private_lock		(try_to_unmap_one)
 *    ->i_pages lock		(try_to_unmap_one)
 *    ->lruvec->lru_lock	(follow_page->mark_page_accessed)
 *    ->lruvec->lru_lock	(check_pte_range->isolate_lru_page)
 *    ->private_lock		(page_remove_rmap->set_page_dirty)
 *    ->i_pages lock		(page_remove_rmap->set_page_dirty)
 *    bdi.wb->list_lock		(page_remove_rmap->set_page_dirty)
 *    ->inode->i_lock		(page_remove_rmap->set_page_dirty)
 *    bdi.wb->list_lock		(zap_pte_range->set_page_dirty)
 *    ->inode->i_lock		(zap_pte_range->set_page_dirty)
 *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
 *
 * ->i_mmap_rwsem
 *   ->tasklist_lock            (memory_failure, collect_procs_ao)
 */
static void page_cache_delete(struct address_space *mapping,
				   struct page *page, void *shadow)
{
	XA_STATE(xas, &mapping->i_pages, page->index);
	unsigned int nr = 1;

	mapping_set_update(&xas, mapping);

	/* hugetlb pages are represented by a single entry in the tree */
	if (!PageHuge(page)) {
		xas_set_order(&xas, page->index, compound_order(page));
		nr = compound_nr(page);
	}

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(PageTail(page), page);
	VM_BUG_ON_PAGE(nr != 1 && shadow, page);

	xas_store(&xas, shadow);
	xas_init_marks(&xas);

	page->mapping = NULL;
	/* Leave page->index set: truncation lookup relies upon it */
	mapping->nrpages -= nr;
}

static void unaccount_page_cache_page(struct address_space *mapping,
				      struct page *page)
{
	int nr;

	/*
	 * if we're uptodate, flush out into the cleancache, otherwise
	 * invalidate any existing cleancache entries.  We can't leave
	 * stale data around in the cleancache once our page is gone
	 */
	if (PageUptodate(page) && PageMappedToDisk(page))
		cleancache_put_page(page);
	else
		cleancache_invalidate_page(mapping, page);

	VM_BUG_ON_PAGE(PageTail(page), page);
	VM_BUG_ON_PAGE(page_mapped(page), page);
	if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) {
		int mapcount;

		pr_alert("BUG: Bad page cache in process %s  pfn:%05lx\n",
			 current->comm, page_to_pfn(page));
		dump_page(page, "still mapped when deleted");
		dump_stack();
		add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);

		mapcount = page_mapcount(page);
		if (mapping_exiting(mapping) &&
		    page_count(page) >= mapcount + 2) {
			/*
			 * All vmas have already been torn down, so it's
			 * a good bet that actually the page is unmapped,
			 * and we'd prefer not to leak it: if we're wrong,
			 * some other bad page check should catch it later.
			 */
			page_mapcount_reset(page);
			page_ref_sub(page, mapcount);
		}
	}

	/* hugetlb pages do not participate in page cache accounting. */
	if (PageHuge(page))
		return;

	nr = thp_nr_pages(page);

	__mod_lruvec_page_state(page, NR_FILE_PAGES, -nr);
	if (PageSwapBacked(page)) {
		__mod_lruvec_page_state(page, NR_SHMEM, -nr);
		if (PageTransHuge(page))
			__mod_lruvec_page_state(page, NR_SHMEM_THPS, -nr);
	} else if (PageTransHuge(page)) {
		__mod_lruvec_page_state(page, NR_FILE_THPS, -nr);
		filemap_nr_thps_dec(mapping);
	}

	/*
	 * At this point page must be either written or cleaned by
	 * truncate.  Dirty page here signals a bug and loss of
	 * unwritten data.
	 *
	 * This fixes dirty accounting after removing the page entirely
	 * but leaves PageDirty set: it has no effect for truncated
	 * page and anyway will be cleared before returning page into
	 * buddy allocator.
	 */
	if (WARN_ON_ONCE(PageDirty(page)))
		account_page_cleaned(page, mapping, inode_to_wb(mapping->host));
}

/*
 * Delete a page from the page cache and free it. Caller has to make
 * sure the page is locked and that nobody else uses it - or that usage
 * is safe.  The caller must hold the i_pages lock.
 */
void __delete_from_page_cache(struct page *page, void *shadow)
{
	struct address_space *mapping = page->mapping;

	trace_mm_filemap_delete_from_page_cache(page);

	unaccount_page_cache_page(mapping, page);
	page_cache_delete(mapping, page, shadow);
}

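/*
 * page_cache_free_page() - release the mapping's reference(s) on a page
 * that has already been removed from the page cache.  A transparent huge
 * page holds one page cache reference per subpage, so they are dropped
 * in one go; the caller is expected to still hold its own reference.
 */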
static void page_cache_free_page(struct address_space *mapping,
				struct page *page)
{
	void (*freepage)(struct page *);

	freepage = mapping->a_ops->freepage;
	if (freepage)
		freepage(page);

	if (PageTransHuge(page) && !PageHuge(page)) {
		page_ref_sub(page, thp_nr_pages(page));
		VM_BUG_ON_PAGE(page_count(page) <= 0, page);
	} else {
		put_page(page);
	}
}

/**
 * delete_from_page_cache - delete page from page cache
 * @page: the page which the kernel is trying to remove from page cache
 *
 * This must be called only on pages that have been verified to be in the
 * page cache and locked.  It will never put the page into the free list,
 * the caller has a reference on the page.
 */
void delete_from_page_cache(struct page *page)
{
	struct address_space *mapping = page_mapping(page);

	BUG_ON(!PageLocked(page));
	xa_lock_irq(&mapping->i_pages);
	__delete_from_page_cache(page, NULL);
	xa_unlock_irq(&mapping->i_pages);

	page_cache_free_page(mapping, page);
}
EXPORT_SYMBOL(delete_from_page_cache);

/*
 * page_cache_delete_batch - delete several pages from page cache
 * @mapping: the mapping to which pages belong
 * @pvec: pagevec with pages to delete
 *
 * The function walks over mapping->i_pages and removes pages passed in @pvec
 * from the mapping. The function expects @pvec to be sorted by page index
 * and is optimised for it to be dense.
 * It tolerates holes in @pvec (mapping entries at those indices are not
 * modified). The function expects only THP head pages to be present in the
 * @pvec.
 *
 * The function expects the i_pages lock to be held.
 */
static void page_cache_delete_batch(struct address_space *mapping,
			     struct pagevec *pvec)
{
	XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index);
	int total_pages = 0;
	int i = 0;
	struct page *page;

	mapping_set_update(&xas, mapping);
	xas_for_each(&xas, page, ULONG_MAX) {
		if (i >= pagevec_count(pvec))
			break;

		/* A swap/dax/shadow entry got inserted? Skip it. */
		if (xa_is_value(page))
			continue;
		/*
		 * A page got inserted in our range? Skip it. We have our
		 * pages locked so they are protected from being removed.
		 * If we see a page whose index is higher than ours, it
		 * means our page has been removed, which shouldn't be
		 * possible because we're holding the PageLock.
		 */
		if (page != pvec->pages[i]) {
			VM_BUG_ON_PAGE(page->index > pvec->pages[i]->index,
					page);
			continue;
		}

		WARN_ON_ONCE(!PageLocked(page));

		if (page->index == xas.xa_index)
			page->mapping = NULL;
		/* Leave page->index set: truncation lookup relies on it */

		/*
		 * Move to the next page in the vector if this is a regular
		 * page or the index is of the last sub-page of this compound
		 * page.
		 */
		if (page->index + compound_nr(page) - 1 == xas.xa_index)
			i++;
		xas_store(&xas, NULL);
		total_pages++;
	}
	mapping->nrpages -= total_pages;
}

void delete_from_page_cache_batch(struct address_space *mapping,
				  struct pagevec *pvec)
{
	int i;

	if (!pagevec_count(pvec))
		return;

	xa_lock_irq(&mapping->i_pages);
	for (i = 0; i < pagevec_count(pvec); i++) {
		trace_mm_filemap_delete_from_page_cache(pvec->pages[i]);

		unaccount_page_cache_page(mapping, pvec->pages[i]);
	}
	page_cache_delete_batch(mapping, pvec);
	xa_unlock_irq(&mapping->i_pages);

	for (i = 0; i < pagevec_count(pvec); i++)
		page_cache_free_page(mapping, pvec->pages[i]);
}

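/*
 * filemap_check_errors() - report and clear the mapping's async write
 * error bits.  AS_ENOSPC is tested first so that, when both bits are
 * set, the more serious -EIO is what ends up being returned.
 */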
int filemap_check_errors(struct address_space *mapping)
{
	int ret = 0;
	/* Check for outstanding write errors */
	if (test_bit(AS_ENOSPC, &mapping->flags) &&
	    test_and_clear_bit(AS_ENOSPC, &mapping->flags))
		ret = -ENOSPC;
	if (test_bit(AS_EIO, &mapping->flags) &&
	    test_and_clear_bit(AS_EIO, &mapping->flags))
		ret = -EIO;
	return ret;
}
EXPORT_SYMBOL(filemap_check_errors);

static int filemap_check_and_keep_errors(struct address_space *mapping)
{
	/* Check for outstanding write errors */
	if (test_bit(AS_EIO, &mapping->flags))
		return -EIO;
	if (test_bit(AS_ENOSPC, &mapping->flags))
		return -ENOSPC;
	return 0;
}

/**
 * filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range
 * @mapping:	address space structure to write
 * @wbc:	the writeback_control controlling the writeout
 *
 * Call writepages on the mapping using the provided wbc to control the
 * writeout.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int filemap_fdatawrite_wbc(struct address_space *mapping,
			   struct writeback_control *wbc)
{
	int ret;

	if (!mapping_can_writeback(mapping) ||
	    !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
		return 0;

	wbc_attach_fdatawrite_inode(wbc, mapping->host);
	ret = do_writepages(mapping, wbc);
	wbc_detach_inode(wbc);
	return ret;
}
EXPORT_SYMBOL(filemap_fdatawrite_wbc);

/**
 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
 * @mapping:	address space structure to write
 * @start:	offset in bytes where the range starts
 * @end:	offset in bytes where the range ends (inclusive)
 * @sync_mode:	enable synchronous operation
 *
 * Start writeback against all of a mapping's dirty pages that lie
 * within the byte offsets <start, end> inclusive.
 *
 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
 * opposed to a regular memory cleansing writeback.  The difference between
 * these two operations is that if a dirty page/buffer is encountered, it must
 * be waited upon, and not just skipped over.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
				loff_t end, int sync_mode)
{
	struct writeback_control wbc = {
		.sync_mode = sync_mode,
		.nr_to_write = LONG_MAX,
		.range_start = start,
		.range_end = end,
	};

	return filemap_fdatawrite_wbc(mapping, &wbc);
}

static inline int __filemap_fdatawrite(struct address_space *mapping,
	int sync_mode)
{
	return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
}

int filemap_fdatawrite(struct address_space *mapping)
{
	return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
}
EXPORT_SYMBOL(filemap_fdatawrite);

int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
			     loff_t end)
{
	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
}
EXPORT_SYMBOL(filemap_fdatawrite_range);

/**
 * filemap_flush - mostly a non-blocking flush
 * @mapping:	target address_space
 *
 * This is a mostly non-blocking flush.  Not suitable for data-integrity
 * purposes - I/O may not be started against all dirty pages.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int filemap_flush(struct address_space *mapping)
{
	return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
}
EXPORT_SYMBOL(filemap_flush);

/**
 * filemap_range_has_page - check if a page exists in range.
 * @mapping:           address space within which to check
 * @start_byte:        offset in bytes where the range starts
 * @end_byte:          offset in bytes where the range ends (inclusive)
 *
 * Find at least one page in the range supplied, usually used to check if
 * direct writing in this range will trigger a writeback.
 *
 * Return: %true if at least one page exists in the specified range,
 * %false otherwise.
 */
bool filemap_range_has_page(struct address_space *mapping,
			   loff_t start_byte, loff_t end_byte)
{
	struct page *page;
	XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
	pgoff_t max = end_byte >> PAGE_SHIFT;

	if (end_byte < start_byte)
		return false;

	rcu_read_lock();
	for (;;) {
		page = xas_find(&xas, max);
		if (xas_retry(&xas, page))
			continue;
		/* Shadow entries don't count */
		if (xa_is_value(page))
			continue;
		/*
		 * We don't need to try to pin this page; we're about to
		 * release the RCU lock anyway.  It is enough to know that
		 * there was a page here recently.
		 */
		break;
	}
	rcu_read_unlock();

	return page != NULL;
}
EXPORT_SYMBOL(filemap_range_has_page);

static void __filemap_fdatawait_range(struct address_space *mapping,
				     loff_t start_byte, loff_t end_byte)
{
	pgoff_t index = start_byte >> PAGE_SHIFT;
	pgoff_t end = end_byte >> PAGE_SHIFT;
	struct pagevec pvec;
	int nr_pages;

	if (end_byte < start_byte)
		return;

	pagevec_init(&pvec);
	while (index <= end) {
		unsigned i;

		nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
				end, PAGECACHE_TAG_WRITEBACK);
		if (!nr_pages)
			break;

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			wait_on_page_writeback(page);
			ClearPageError(page);
		}
		pagevec_release(&pvec);
		cond_resched();
	}
}

/**
 * filemap_fdatawait_range - wait for writeback to complete
 * @mapping:		address space structure to wait for
 * @start_byte:		offset in bytes where the range starts
 * @end_byte:		offset in bytes where the range ends (inclusive)
 *
 * Walk the list of under-writeback pages of the given address space
 * in the given range and wait for all of them.  Check error status of
 * the address space and return it.
 *
 * Since the error status of the address space is cleared by this function,
 * callers are responsible for checking the return value and handling and/or
 * reporting the error.
 *
 * Return: error status of the address space.
 */
int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
			    loff_t end_byte)
{
	__filemap_fdatawait_range(mapping, start_byte, end_byte);
	return filemap_check_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_range);

/**
 * filemap_fdatawait_range_keep_errors - wait for writeback to complete
 * @mapping:		address space structure to wait for
 * @start_byte:		offset in bytes where the range starts
 * @end_byte:		offset in bytes where the range ends (inclusive)
 *
 * Walk the list of under-writeback pages of the given address space in the
 * given range and wait for all of them.  Unlike filemap_fdatawait_range(),
 * this function does not clear error status of the address space.
 *
 * Use this function if callers don't handle errors themselves.  Expected
 * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
 * fsfreeze(8)
 *
 * Return: error status of the address space.
 */
int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
		loff_t start_byte, loff_t end_byte)
{
	__filemap_fdatawait_range(mapping, start_byte, end_byte);
	return filemap_check_and_keep_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_range_keep_errors);

/**
 * file_fdatawait_range - wait for writeback to complete
 * @file:		file pointing to address space structure to wait for
 * @start_byte:		offset in bytes where the range starts
 * @end_byte:		offset in bytes where the range ends (inclusive)
 *
 * Walk the list of under-writeback pages of the address space that file
 * refers to, in the given range and wait for all of them.  Check error
 * status of the address space vs. the file->f_wb_err cursor and return it.
 *
 * Since the error status of the file is advanced by this function,
 * callers are responsible for checking the return value and handling and/or
 * reporting the error.
 *
 * Return: error status of the address space vs. the file->f_wb_err cursor.
 */
int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte)
{
	struct address_space *mapping = file->f_mapping;

	__filemap_fdatawait_range(mapping, start_byte, end_byte);
	return file_check_and_advance_wb_err(file);
}
EXPORT_SYMBOL(file_fdatawait_range);

/**
 * filemap_fdatawait_keep_errors - wait for writeback without clearing errors
 * @mapping: address space structure to wait for
 *
 * Walk the list of under-writeback pages of the given address space
 * and wait for all of them.  Unlike filemap_fdatawait(), this function
 * does not clear error status of the address space.
 *
 * Use this function if callers don't handle errors themselves.  Expected
 * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
 * fsfreeze(8)
 *
 * Return: error status of the address space.
 */
int filemap_fdatawait_keep_errors(struct address_space *mapping)
{
	__filemap_fdatawait_range(mapping, 0, LLONG_MAX);
	return filemap_check_and_keep_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_keep_errors);

/* Returns true if writeback might be needed or already in progress. */
static bool mapping_needs_writeback(struct address_space *mapping)
{
	return mapping->nrpages;
}

/**
 * filemap_range_needs_writeback - check if range potentially needs writeback
 * @mapping:           address space within which to check
 * @start_byte:        offset in bytes where the range starts
 * @end_byte:          offset in bytes where the range ends (inclusive)
 *
 * Find at least one page in the range supplied, usually used to check if
 * direct writing in this range will trigger a writeback. Used by O_DIRECT
 * read/write with IOCB_NOWAIT, to see if the caller needs to do
 * filemap_write_and_wait_range() before proceeding.
 *
 * Return: %true if the caller should do filemap_write_and_wait_range()
 * before doing O_DIRECT to a page in this range, %false otherwise.
 */
bool filemap_range_needs_writeback(struct address_space *mapping,
				   loff_t start_byte, loff_t end_byte)
{
	XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
	pgoff_t max = end_byte >> PAGE_SHIFT;
	struct page *page;

	if (!mapping_needs_writeback(mapping))
		return false;
	if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
	    !mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
		return false;
	if (end_byte < start_byte)
		return false;

	rcu_read_lock();
	xas_for_each(&xas, page, max) {
		if (xas_retry(&xas, page))
			continue;
		if (xa_is_value(page))
			continue;
		if (PageDirty(page) || PageLocked(page) || PageWriteback(page))
			break;
	}
	rcu_read_unlock();
	return page != NULL;
}
EXPORT_SYMBOL_GPL(filemap_range_needs_writeback);

/**
 * filemap_write_and_wait_range - write out & wait on a file range
 * @mapping:	the address_space for the pages
 * @lstart:	offset in bytes where the range starts
 * @lend:	offset in bytes where the range ends (inclusive)
 *
 * Write out and wait upon file offsets lstart->lend, inclusive.
 *
 * Note that @lend is inclusive (describes the last byte to be written) so
 * that this function can be used to write to the very end-of-file (end = -1).
 *
 * Return: error code if writeback or waiting fails.
 */
int filemap_write_and_wait_range(struct address_space *mapping,
				 loff_t lstart, loff_t lend)
{
	int err = 0;

	if (mapping_needs_writeback(mapping)) {
		err = __filemap_fdatawrite_range(mapping, lstart, lend,
						 WB_SYNC_ALL);
		/*
		 * Even if the above returned error, the pages may be
		 * written partially (e.g. -ENOSPC), so we wait for it.
		 * But the -EIO is special case, it may indicate the worst
		 * thing (e.g. bug) happened, so we avoid waiting for it.
		 */
		if (err != -EIO) {
			int err2 = filemap_fdatawait_range(mapping,
						lstart, lend);
			if (!err)
				err = err2;
		} else {
			/* Clear any previously stored errors */
			filemap_check_errors(mapping);
		}
	} else {
		err = filemap_check_errors(mapping);
	}
	return err;
}
EXPORT_SYMBOL(filemap_write_and_wait_range);

void __filemap_set_wb_err(struct address_space *mapping, int err)
{
	errseq_t eseq = errseq_set(&mapping->wb_err, err);

	trace_filemap_set_wb_err(mapping, eseq);
}
EXPORT_SYMBOL(__filemap_set_wb_err);

/**
 * file_check_and_advance_wb_err - report wb error (if any) that was previously
 * 				   and advance wb_err to current one
 * @file: struct file on which the error is being reported
 *
 * When userland calls fsync (or something like nfsd does the equivalent), we
 * want to report any writeback errors that occurred since the last fsync (or
 * since the file was opened if there haven't been any).
 *
 * Grab the wb_err from the mapping. If it matches what we have in the file,
 * then just quickly return 0. The file is all caught up.
 *
 * If it doesn't match, then take the mapping value, set the "seen" flag in
 * it and try to swap it into place. If it works, or another task beat us
 * to it with the new value, then update the f_wb_err and return the error
 * portion. The error at this point must be reported via proper channels
 * (a'la fsync, or NFS COMMIT operation, etc.).
 *
 * While we handle mapping->wb_err with atomic operations, the f_wb_err
 * value is protected by the f_lock since we must ensure that it reflects
 * the latest value swapped in for this file descriptor.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int file_check_and_advance_wb_err(struct file *file)
{
	int err = 0;
	errseq_t old = READ_ONCE(file->f_wb_err);
	struct address_space *mapping = file->f_mapping;

	/* Locklessly handle the common case where nothing has changed */
	if (errseq_check(&mapping->wb_err, old)) {
		/* Something changed, must use slow path */
		spin_lock(&file->f_lock);
		old = file->f_wb_err;
		err = errseq_check_and_advance(&mapping->wb_err,
						&file->f_wb_err);
		trace_file_check_and_advance_wb_err(file, old);
		spin_unlock(&file->f_lock);
	}

	/*
	 * We're mostly using this function as a drop in replacement for
	 * filemap_check_errors. Clear AS_EIO/AS_ENOSPC to emulate the effect
	 * that the legacy code would have had on these flags.
	 */
	clear_bit(AS_EIO, &mapping->flags);
	clear_bit(AS_ENOSPC, &mapping->flags);
	return err;
}
EXPORT_SYMBOL(file_check_and_advance_wb_err);

/**
 * file_write_and_wait_range - write out & wait on a file range
 * @file:	file pointing to address_space with pages
 * @lstart:	offset in bytes where the range starts
 * @lend:	offset in bytes where the range ends (inclusive)
 *
 * Write out and wait upon file offsets lstart->lend, inclusive.
 *
 * Note that @lend is inclusive (describes the last byte to be written) so
 * that this function can be used to write to the very end-of-file (end = -1).
 *
 * After writing out and waiting on the data, we check and advance the
 * f_wb_err cursor to the latest value, and return any errors detected there.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
{
	int err = 0, err2;
	struct address_space *mapping = file->f_mapping;

	if (mapping_needs_writeback(mapping)) {
		err = __filemap_fdatawrite_range(mapping, lstart, lend,
						 WB_SYNC_ALL);
		/* See comment of filemap_write_and_wait() */
		if (err != -EIO)
			__filemap_fdatawait_range(mapping, lstart, lend);
	}
	err2 = file_check_and_advance_wb_err(file);
	if (!err)
		err = err2;
	return err;
}
EXPORT_SYMBOL(file_write_and_wait_range);

/**
 * replace_page_cache_page - replace a pagecache page with a new one
 * @old:	page to be replaced
 * @new:	page to replace with
 *
 * This function replaces a page in the pagecache with a new one.  On
 * success it acquires the pagecache reference for the new page and
 * drops it for the old page.  Both the old and new pages must be
 * locked.  This function does not add the new page to the LRU, the
 * caller must do that.
 *
 * The remove + add is atomic.  This function cannot fail.
 */
void replace_page_cache_page(struct page *old, struct page *new)
{
	struct address_space *mapping = old->mapping;
	void (*freepage)(struct page *) = mapping->a_ops->freepage;
	pgoff_t offset = old->index;
	XA_STATE(xas, &mapping->i_pages, offset);

	VM_BUG_ON_PAGE(!PageLocked(old), old);
	VM_BUG_ON_PAGE(!PageLocked(new), new);
	VM_BUG_ON_PAGE(new->mapping, new);

	get_page(new);
	new->mapping = mapping;
	new->index = offset;

	mem_cgroup_migrate(old, new);

	xas_lock_irq(&xas);
	xas_store(&xas, new);

	old->mapping = NULL;
	/* hugetlb pages do not participate in page cache accounting. */
	if (!PageHuge(old))
		__dec_lruvec_page_state(old, NR_FILE_PAGES);
	if (!PageHuge(new))
		__inc_lruvec_page_state(new, NR_FILE_PAGES);
	if (PageSwapBacked(old))
		__dec_lruvec_page_state(old, NR_SHMEM);
	if (PageSwapBacked(new))
		__inc_lruvec_page_state(new, NR_SHMEM);
	xas_unlock_irq(&xas);
	if (freepage)
		freepage(old);
	put_page(old);
}
EXPORT_SYMBOL_GPL(replace_page_cache_page);

noinline int __add_to_page_cache_locked(struct page *page,
					struct address_space *mapping,
					pgoff_t offset, gfp_t gfp,
					void **shadowp)
{
	XA_STATE(xas, &mapping->i_pages, offset);
	int huge = PageHuge(page);
	int error;
	bool charged = false;

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(PageSwapBacked(page), page);
	mapping_set_update(&xas, mapping);

	get_page(page);
	page->mapping = mapping;
	page->index = offset;

	if (!huge) {
		error = mem_cgroup_charge(page, NULL, gfp);
		if (error)
			goto error;
		charged = true;
	}

	gfp &= GFP_RECLAIM_MASK;

	do {
		unsigned int order = xa_get_order(xas.xa, xas.xa_index);
		void *entry, *old = NULL;

		if (order > thp_order(page))
			xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index),
					order, gfp);
		xas_lock_irq(&xas);
		xas_for_each_conflict(&xas, entry) {
			old = entry;
			if (!xa_is_value(entry)) {
				xas_set_err(&xas, -EEXIST);
				goto unlock;
			}
		}

		if (old) {
			if (shadowp)
				*shadowp = old;
			/* entry may have been split before we acquired lock */
			order = xa_get_order(xas.xa, xas.xa_index);
			if (order > thp_order(page)) {
				xas_split(&xas, old, order);
				xas_reset(&xas);
			}
		}

		xas_store(&xas, page);
		if (xas_error(&xas))
			goto unlock;

		mapping->nrpages++;

		/* hugetlb pages do not participate in page cache accounting */
		if (!huge)
			__inc_lruvec_page_state(page, NR_FILE_PAGES);
unlock:
		xas_unlock_irq(&xas);
	} while (xas_nomem(&xas, gfp));

	if (xas_error(&xas)) {
		error = xas_error(&xas);
		if (charged)
			mem_cgroup_uncharge(page);
		goto error;
	}

	trace_mm_filemap_add_to_page_cache(page);
	return 0;
error:
	page->mapping = NULL;
	/* Leave page->index set: truncation relies upon it */
	put_page(page);
	return error;
}
ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO);

/**
 * add_to_page_cache_locked - add a locked page to the pagecache
 * @page:	page to add
 * @mapping:	the page's address_space
 * @offset:	page index
 * @gfp_mask:	page allocation mode
 *
 * This function is used to add a page to the pagecache. It must be locked.
 * This function does not add the page to the LRU.  The caller must do that.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
		pgoff_t offset, gfp_t gfp_mask)
{
	return __add_to_page_cache_locked(page, mapping, offset,
					  gfp_mask, NULL);
}
EXPORT_SYMBOL(add_to_page_cache_locked);

int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
				pgoff_t offset, gfp_t gfp_mask)
{
	void *shadow = NULL;
	int ret;

	__SetPageLocked(page);
	ret = __add_to_page_cache_locked(page, mapping, offset,
					 gfp_mask, &shadow);
	if (unlikely(ret))
		__ClearPageLocked(page);
	else {
		/*
		 * The page might have been evicted from cache only
		 * recently, in which case it should be activated like
		 * any other repeatedly accessed page.
		 * The exception is pages getting rewritten; evicting other
		 * data from the working set, only to cache data that will
		 * get overwritten with something else, is a waste of memory.
		 */
		WARN_ON_ONCE(PageActive(page));
		if (!(gfp_mask & __GFP_WRITE) && shadow)
			workingset_refault(page, shadow);
		lru_cache_add(page);
	}
	return ret;
}
EXPORT_SYMBOL_GPL(add_to_page_cache_lru);

#ifdef CONFIG_NUMA
struct page *__page_cache_alloc(gfp_t gfp)
{
	int n;
	struct page *page;

	if (cpuset_do_page_mem_spread()) {
		unsigned int cpuset_mems_cookie;
		do {
			cpuset_mems_cookie = read_mems_allowed_begin();
			n = cpuset_mem_spread_node();
			page = __alloc_pages_node(n, gfp, 0);
		} while (!page && read_mems_allowed_retry(cpuset_mems_cookie));

		return page;
	}
	return alloc_pages(gfp, 0);
}
EXPORT_SYMBOL(__page_cache_alloc);
#endif

/*
 * filemap_invalidate_lock_two - lock invalidate_lock for two mappings
 *
 * Lock exclusively invalidate_lock of any passed mapping that is not NULL.
 *
 * @mapping1: the first mapping to lock
 * @mapping2: the second mapping to lock
 */
void filemap_invalidate_lock_two(struct address_space *mapping1,
				 struct address_space *mapping2)
{
	if (mapping1 > mapping2)
		swap(mapping1, mapping2);
	if (mapping1)
		down_write(&mapping1->invalidate_lock);
	if (mapping2 && mapping1 != mapping2)
		down_write_nested(&mapping2->invalidate_lock, 1);
}
EXPORT_SYMBOL(filemap_invalidate_lock_two);

/*
 * filemap_invalidate_unlock_two - unlock invalidate_lock for two mappings
 *
 * Unlock exclusive invalidate_lock of any passed mapping that is not NULL.
 *
 * @mapping1: the first mapping to unlock
 * @mapping2: the second mapping to unlock
 */
void filemap_invalidate_unlock_two(struct address_space *mapping1,
				   struct address_space *mapping2)
{
	if (mapping1)
		up_write(&mapping1->invalidate_lock);
	if (mapping2 && mapping1 != mapping2)
		up_write(&mapping2->invalidate_lock);
}
EXPORT_SYMBOL(filemap_invalidate_unlock_two);

/*
 * In order to wait for pages to become available there must be
 * waitqueues associated with pages. By using a hash table of
 * waitqueues where the bucket discipline is to maintain all
 * waiters on the same queue and wake all when any of the pages
 * become available, and for the woken contexts to check to be
 * sure the appropriate page became available, this saves space
 * at a cost of "thundering herd" phenomena during rare hash
 * collisions.
 */
#define PAGE_WAIT_TABLE_BITS 8
#define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
static wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;

static wait_queue_head_t *page_waitqueue(struct page *page)
{
	return &page_wait_table[hash_ptr(page, PAGE_WAIT_TABLE_BITS)];
}

void __init pagecache_init(void)
{
	int i;

	for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
		init_waitqueue_head(&page_wait_table[i]);

	page_writeback_init();
}

/*
 * The page wait code treats the "wait->flags" somewhat unusually, because
 * we have multiple different kinds of waits, not just the usual "exclusive"
 * one.
 *
 * We have:
 *
 *  (a) no special bits set:
 *
 *	We're just waiting for the bit to be released, and when a waker
 *	calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up,
 *	and remove it from the wait queue.
 *
 *  (b) WQ_FLAG_EXCLUSIVE:
 *
 *	The waiter is waiting to get the lock, and only one waiter should
 *	be woken up to avoid any thundering herd behavior. We'll set the
 *	WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue.
 *
 *  (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM:
 *
 *	The waiter is waiting to get the bit, and additionally wants the
 *	lock to be transferred to it for fair lock behavior. If the lock
 *	cannot be taken, we stop walking the wait queue without waking
 *	the waiter.
 *
 *	This is the "fair lock handoff" case, and in addition to setting
 *	the flag and removing the waiter, we need to set WQ_FLAG_DONE
 *	to let the waiter easily see whether it got the lock or not: once
 *	it has been removed from the wait queue, the waiter flags can
 *	only be changed by the waiter itself.
 */
static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
{
	unsigned int flags;
	struct wait_page_key *key = arg;
	struct wait_page_queue *wait_page
		= container_of(wait, struct wait_page_queue, wait);

	if (!wake_page_match(wait_page, key))
		return 0;

	/*
	 * If it's a lock handoff wait, we get the bit for it, and
	 * stop walking (and do not wake it up) if we can't.
	 */
	flags = wait->flags;
	if (flags & WQ_FLAG_EXCLUSIVE) {
		if (test_bit(key->bit_nr, &key->page->flags))
			return -1;
		if (flags & WQ_FLAG_CUSTOM) {
			if (test_and_set_bit(key->bit_nr, &key->page->flags))
				return -1;
			flags |= WQ_FLAG_DONE;
		}
	}

	/*
	 * We are holding the wait-queue lock, but the waiter that
	 * is waiting for this will be checking the flags without
	 * any locking.
	 *
	 * So update the flags atomically, and let the waiter
	 * know about it.
	 */
	smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN);
	wake_up_state(wait->private, mode);

	/*
	 * Ok, we have successfully done what we're waiting for,
	 * and we can unconditionally remove the wait entry.
	 *
	 * Note that this pairs with the "finish_wait()" in the
	 * waiter, and removing the entry has to be the last
	 * thing we do with it: after list_del_init_careful() the
	 * wait entry may be re-used by the waiter for another
	 * queue, so we must not touch it afterwards.
	 */
	list_del_init_careful(&wait->entry);
	return (flags & WQ_FLAG_EXCLUSIVE) != 0;
}

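/*
 * wake_up_page_bit() - wake everybody hashed onto this page's waitqueue
 * who is waiting for @bit_nr.  The bookmark entry lets the wakeup drop
 * and re-take the queue spinlock periodically, so that a very long
 * chain of waiters cannot keep interrupts disabled indefinitely.
 */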
static void wake_up_page_bit(struct page *page, int bit_nr)
{
	wait_queue_head_t *q = page_waitqueue(page);
	struct wait_page_key key;
	unsigned long flags;
	wait_queue_entry_t bookmark;

	key.page = page;
	key.bit_nr = bit_nr;
	key.page_match = 0;

	bookmark.flags = 0;
	bookmark.private = NULL;
	bookmark.func = NULL;
	INIT_LIST_HEAD(&bookmark.entry);

	spin_lock_irqsave(&q->lock, flags);
	__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);

	while (bookmark.flags & WQ_FLAG_BOOKMARK) {
		/*
		 * Take a breather from holding the lock,
		 * allow pages that finish wake up asynchronously
		 * to acquire the lock and remove themselves
		 * from wait queue
		 */
		spin_unlock_irqrestore(&q->lock, flags);
		cpu_relax();
		spin_lock_irqsave(&q->lock, flags);
		__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
	}

	/*
	 * It is possible for other pages to have collided on the waitqueue
	 * hash, so in that case check for a page match. That prevents a
	 * long-term waiter from inadvertently having its PageWaiters flag
	 * cleared.
	 */
	if (!waitqueue_active(q) || !key.page_match) {
		ClearPageWaiters(page);
		/*
		 * It's possible to miss clearing Waiters here, when we woke
		 * our page waiters, but the hashed waitqueue has waiters for
		 * other pages on it.
		 *
		 * That's okay, it's a rare case. The next waker will clear it.
		 */
	}
	spin_unlock_irqrestore(&q->lock, flags);
}

static void wake_up_page(struct page *page, int bit)
{
	if (!PageWaiters(page))
		return;
	wake_up_page_bit(page, bit);
}

/*
 * A choice of three behaviors for wait_on_page_bit_common():
 */
enum behavior {
	EXCLUSIVE,	/* Hold ref to page and take the bit when woken, like
			 * __lock_page() waiting on then setting PG_locked.
			 */
	SHARED,		/* Hold ref to page and check the bit when woken, like
			 * wait_on_page_writeback() waiting on PG_writeback.
			 */
	DROP,		/* Drop ref to page before wait, no check when woken,
			 * like put_and_wait_on_page_locked() on PG_locked.
			 */
};

/*
 * Attempt to check (or get) the page bit, and mark us done
 * if successful.
 */
static inline bool trylock_page_bit_common(struct page *page, int bit_nr,
					struct wait_queue_entry *wait)
{
	if (wait->flags & WQ_FLAG_EXCLUSIVE) {
		if (test_and_set_bit(bit_nr, &page->flags))
			return false;
	} else if (test_bit(bit_nr, &page->flags))
		return false;

	wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
	return true;
}

/* How many times do we accept lock stealing from under a waiter? */
int sysctl_page_lock_unfairness = 5;

static inline int wait_on_page_bit_common(wait_queue_head_t *q,
	struct page *page, int bit_nr, int state, enum behavior behavior)
{
	int unfairness = sysctl_page_lock_unfairness;
	struct wait_page_queue wait_page;
	wait_queue_entry_t *wait = &wait_page.wait;
	bool thrashing = false;
	bool delayacct = false;
	unsigned long pflags;

	if (bit_nr == PG_locked &&
	    !PageUptodate(page) && PageWorkingset(page)) {
		if (!PageSwapBacked(page)) {
			delayacct_thrashing_start();
			delayacct = true;
		}
		psi_memstall_enter(&pflags);
		thrashing = true;
	}

	init_wait(wait);
	wait->func = wake_page_function;
	wait_page.page = page;
	wait_page.bit_nr = bit_nr;

repeat:
	wait->flags = 0;
	if (behavior == EXCLUSIVE) {
		wait->flags = WQ_FLAG_EXCLUSIVE;
		if (--unfairness < 0)
			wait->flags |= WQ_FLAG_CUSTOM;
	}

	/*
	 * Do one last check whether we can get the
	 * page bit synchronously.
	 *
	 * Do the SetPageWaiters() marking before that
	 * to let any waker we _just_ missed know they
	 * need to wake us up (otherwise they'll never
	 * even go to the slow case that looks at the
	 * page queue), and add ourselves to the wait
	 * queue if we need to sleep.
	 *
	 * This part needs to be done under the queue
	 * lock to avoid races.
	 */
	spin_lock_irq(&q->lock);
	SetPageWaiters(page);
	if (!trylock_page_bit_common(page, bit_nr, wait))
		__add_wait_queue_entry_tail(q, wait);
	spin_unlock_irq(&q->lock);

	/*
	 * From now on, all the logic will be based on
	 * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to
	 * see whether the page bit testing has already
	 * been done by the wake function.
	 *
	 * We can drop our reference to the page.
	 */
	if (behavior == DROP)
		put_page(page);

	/*
	 * Note that until the "finish_wait()", or until
	 * we see the WQ_FLAG_WOKEN flag, we need to
	 * be very careful with the 'wait->flags', because
	 * we may race with a waker that sets them.
	 */
	for (;;) {
		unsigned int flags;

		set_current_state(state);

		/* Loop until we've been woken or interrupted */
		flags = smp_load_acquire(&wait->flags);
		if (!(flags & WQ_FLAG_WOKEN)) {
			if (signal_pending_state(state, current))
				break;

			io_schedule();
			continue;
		}

		/* If we were non-exclusive, we're done */
		if (behavior != EXCLUSIVE)
			break;

		/* If the waker got the lock for us, we're done */
		if (flags & WQ_FLAG_DONE)
			break;

		/*
		 * Otherwise, if we're getting the lock, we need to
		 * try to get it ourselves.
		 *
		 * And if that fails, we'll have to retry this all.
		 */
		if (unlikely(test_and_set_bit(bit_nr, &page->flags)))
			goto repeat;

		wait->flags |= WQ_FLAG_DONE;
		break;
	}

	/*
	 * If a signal happened, this 'finish_wait()' may remove the last
	 * waiter from the wait-queues, but the PageWaiters bit will remain
	 * set. That's ok. The next wakeup will take care of it, and trying
	 * to do it here would be difficult and prone to races.
	 */
	finish_wait(q, wait);

	if (thrashing) {
		if (delayacct)
			delayacct_thrashing_end();
		psi_memstall_leave(&pflags);
	}

	/*
	 * NOTE! The wait->flags weren't stable until we've done the
	 * 'finish_wait()', and we could have exited the loop above due
	 * to a signal, and had a wakeup event happen after the signal
	 * test but before the 'finish_wait()'.
	 *
	 * So only after the finish_wait() can we reliably determine
	 * if we got woken up or not, so we can now rely on the result
	 * values.
	 *
	 * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive
	 * waiter, but an exclusive one requires WQ_FLAG_DONE.
	 */
	if (behavior == EXCLUSIVE)
		return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR;

	return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
}

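/*
 * The two public flavours below only differ in the task state they
 * sleep in: TASK_UNINTERRUPTIBLE waiters cannot be interrupted, while
 * TASK_KILLABLE waiters return -EINTR if a fatal signal arrives.
 */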
void wait_on_page_bit(struct page *page, int bit_nr)
{
	wait_queue_head_t *q = page_waitqueue(page);
	wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
}
EXPORT_SYMBOL(wait_on_page_bit);

int wait_on_page_bit_killable(struct page *page, int bit_nr)
{
	wait_queue_head_t *q = page_waitqueue(page);
	return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, SHARED);
}
EXPORT_SYMBOL(wait_on_page_bit_killable);

/**
 * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked
 * @page: The page to wait for.
 * @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc).
 *
 * The caller should hold a reference on @page.  They expect the page to
 * become unlocked relatively soon, but do not wish to hold up migration
 * (for example) by holding the reference while waiting for the page to
 * come unlocked.  After this function returns, the caller should not
 * dereference @page.
 *
 * Return: 0 if the page was unlocked or -EINTR if interrupted by a signal.
 */
int put_and_wait_on_page_locked(struct page *page, int state)
{
	wait_queue_head_t *q;

	page = compound_head(page);
	q = page_waitqueue(page);
	return wait_on_page_bit_common(q, page, PG_locked, state, DROP);
}

/**
 * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
 * @page: Page defining the wait queue of interest
 * @waiter: Waiter to add to the queue
 *
 * Add an arbitrary @waiter to the wait queue for the nominated @page.
 */
void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter)
{
	wait_queue_head_t *q = page_waitqueue(page);
	unsigned long flags;

	spin_lock_irqsave(&q->lock, flags);
	__add_wait_queue_entry_tail(q, waiter);
	SetPageWaiters(page);
	spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL_GPL(add_page_wait_queue);

#ifndef clear_bit_unlock_is_negative_byte

/*
 * PG_waiters is the high bit in the same byte as PG_lock.
 *
 * On x86 (and on many other architectures), we can clear PG_lock and
 * test the sign bit at the same time. But if the architecture does
 * not support that special operation, we just do this all by hand
 * instead.
 *
 * The read of PG_waiters has to be after (or concurrently with) PG_locked
 * being cleared, but a memory barrier should be unnecessary since it is
 * in the same byte as PG_locked.
 */
static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem)
{
	clear_bit_unlock(nr, mem);
	/* smp_mb__after_atomic(); */
	return test_bit(PG_waiters, mem);
}

#endif

/**
 * unlock_page - unlock a locked page
 * @page: the page
 *
 * Unlocks the page and wakes up sleepers in wait_on_page_locked().
 * Also wakes sleepers in wait_on_page_writeback() because the wakeup
 * mechanism between PageLocked pages and PageWriteback pages is shared.
 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
 *
 * Note that this depends on PG_waiters being the sign bit in the byte
 * that contains PG_locked - thus the BUILD_BUG_ON(). That allows us to
 * clear the PG_locked bit and test PG_waiters at the same time fairly
 * portably (architectures that do LL/SC can test any bit, while x86 can
 * test the sign bit).
 */
void unlock_page(struct page *page)
{
	BUILD_BUG_ON(PG_waiters != 7);
	page = compound_head(page);
	VM_BUG_ON_PAGE(!PageLocked(page), page);
	if (clear_bit_unlock_is_negative_byte(PG_locked, &page->flags))
		wake_up_page_bit(page, PG_locked);
}
EXPORT_SYMBOL(unlock_page);

/**
 * end_page_private_2 - Clear PG_private_2 and release any waiters
 * @page: The page
 *
 * Clear the PG_private_2 bit on a page and wake up any sleepers waiting for
 * this.  The page ref held for PG_private_2 being set is released.
 *
 * This is, for example, used when a netfs page is being written to a local
 * disk cache, thereby allowing writes to the cache for the same page to be
 * serialised.
 */
void end_page_private_2(struct page *page)
{
	page = compound_head(page);
	VM_BUG_ON_PAGE(!PagePrivate2(page), page);
	clear_bit_unlock(PG_private_2, &page->flags);
	wake_up_page_bit(page, PG_private_2);
	put_page(page);
}
EXPORT_SYMBOL(end_page_private_2);

/**
 * wait_on_page_private_2 - Wait for PG_private_2 to be cleared on a page
 * @page: The page to wait on
 *
 * Wait for PG_private_2 (aka PG_fscache) to be cleared on a page.
 */
void wait_on_page_private_2(struct page *page)
{
	page = compound_head(page);
	while (PagePrivate2(page))
		wait_on_page_bit(page, PG_private_2);
}
EXPORT_SYMBOL(wait_on_page_private_2);

/**
 * wait_on_page_private_2_killable - Wait for PG_private_2 to be cleared on a page
 * @page: The page to wait on
 *
 * Wait for PG_private_2 (aka PG_fscache) to be cleared on a page or until a
 * fatal signal is received by the calling task.
 *
 * Return:
 * - 0 if successful.
 * - -EINTR if a fatal signal was encountered.
 */
int wait_on_page_private_2_killable(struct page *page)
{
	int ret = 0;

	page = compound_head(page);
	while (PagePrivate2(page)) {
		ret = wait_on_page_bit_killable(page, PG_private_2);
		if (ret < 0)
			break;
	}

	return ret;
}
EXPORT_SYMBOL(wait_on_page_private_2_killable);

/**
 * end_page_writeback - end writeback against a page
 * @page: the page
 */
void end_page_writeback(struct page *page)
{
	/*
	 * TestClearPageReclaim could be used here but it is an atomic
	 * operation and overkill in this particular case. Failing to
	 * shuffle a page marked for immediate reclaim is too mild to
	 * justify taking an atomic operation penalty at the end of
	 * every page writeback.
	 */
	if (PageReclaim(page)) {
		ClearPageReclaim(page);
		rotate_reclaimable_page(page);
	}

	/*
	 * Writeback does not hold a page reference of its own, relying
	 * on truncation to wait for the clearing of PG_writeback.
	 * But here we must make sure that the page is not freed and
	 * reused before the wake_up_page().
	 */
	get_page(page);
	if (!test_clear_page_writeback(page))
		BUG();

	smp_mb__after_atomic();
	wake_up_page(page, PG_writeback);
	put_page(page);
}
EXPORT_SYMBOL(end_page_writeback);

/*
 * After completing I/O on a page, call this routine to update the page
 * flags appropriately
 */
void page_endio(struct page *page, bool is_write, int err)
{
	if (!is_write) {
		if (!err) {
			SetPageUptodate(page);
		} else {
			ClearPageUptodate(page);
			SetPageError(page);
		}
		unlock_page(page);
	} else {
		if (err) {
			struct address_space *mapping;

			SetPageError(page);
			mapping = page_mapping(page);
			if (mapping)
				mapping_set_error(mapping, err);
		}
		end_page_writeback(page);
	}
}
EXPORT_SYMBOL_GPL(page_endio);

/**
 * __lock_page - get a lock on the page, assuming we need to sleep to get it
 * @__page: the page to lock
 */
void __lock_page(struct page *__page)
{
	struct page *page = compound_head(__page);
	wait_queue_head_t *q = page_waitqueue(page);
	wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE,
				EXCLUSIVE);
}
EXPORT_SYMBOL(__lock_page);

int __lock_page_killable(struct page *__page)
{
	struct page *page = compound_head(__page);
	wait_queue_head_t *q = page_waitqueue(page);
	return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE,
					EXCLUSIVE);
}
EXPORT_SYMBOL_GPL(__lock_page_killable);

int __lock_page_async(struct page *page, struct wait_page_queue *wait)
{
	struct wait_queue_head *q = page_waitqueue(page);
	int ret = 0;

	wait->page = page;
	wait->bit_nr = PG_locked;

	spin_lock_irq(&q->lock);
	__add_wait_queue_entry_tail(q, &wait->wait);
	SetPageWaiters(page);
	ret = !trylock_page(page);
	/*
	 * If we were successful now, we know we're still on the
	 * waitqueue as we're still under the lock. This means it's
	 * safe to remove and return success, we know the callback
	 * isn't going to trigger.
	 */
	if (!ret)
		__remove_wait_queue(q, &wait->wait);
	else
		ret = -EIOCBQUEUED;
	spin_unlock_irq(&q->lock);
	return ret;
}

/*
 * Return values:
 * 1 - page is locked; mmap_lock is still held.
 * 0 - page is not locked.
 *     mmap_lock has been released (mmap_read_unlock(), unless flags had both
 *     FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in
 *     which case mmap_lock is still held.
 *
 * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1
 * with the page locked and the mmap_lock unperturbed.
 */
int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
			 unsigned int flags)
{
	if (fault_flag_allow_retry_first(flags)) {
		/*
		 * CAUTION! In this case, mmap_lock is not released
		 * even though return 0.
		 */
		if (flags & FAULT_FLAG_RETRY_NOWAIT)
			return 0;

		mmap_read_unlock(mm);
		if (flags & FAULT_FLAG_KILLABLE)
			wait_on_page_locked_killable(page);
		else
			wait_on_page_locked(page);
		return 0;
	}
	if (flags & FAULT_FLAG_KILLABLE) {
		int ret;

		ret = __lock_page_killable(page);
		if (ret) {
			mmap_read_unlock(mm);
			return 0;
		}
	} else {
		__lock_page(page);
	}
	return 1;
}

/**
 * page_cache_next_miss() - Find the next gap in the page cache.
 * @mapping: Mapping.
 * @index: Index.
 * @max_scan: Maximum range to search.
 *
 * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the
 * gap with the lowest index.
 *
 * This function may be called under the rcu_read_lock.  However, this will
 * not atomically search a snapshot of the cache at a single point in time.
 * For example, if a gap is created at index 5, then subsequently a gap is
 * created at index 10, page_cache_next_miss() covering both indices may
 * return 10 if called under the rcu_read_lock.
 *
 * Return: The index of the gap if found, otherwise an index outside the
 * range specified (in which case 'return - index >= max_scan' will be true).
 * In the rare case of index wrap-around, 0 will be returned.
 */
pgoff_t page_cache_next_miss(struct address_space *mapping,
			     pgoff_t index, unsigned long max_scan)
{
	XA_STATE(xas, &mapping->i_pages, index);

	while (max_scan--) {
		void *entry = xas_next(&xas);
		if (!entry || xa_is_value(entry))
			break;
		if (xas.xa_index == 0)
			break;
	}

	return xas.xa_index;
}
EXPORT_SYMBOL(page_cache_next_miss);

/**
 * page_cache_prev_miss() - Find the previous gap in the page cache.
 * @mapping: Mapping.
 * @index: Index.
 * @max_scan: Maximum range to search.
 *
 * Search the range [max(index - max_scan + 1, 0), index] for the
 * gap with the highest index.
 *
 * This function may be called under the rcu_read_lock.  However, this will
 * not atomically search a snapshot of the cache at a single point in time.
 * For example, if a gap is created at index 10, then subsequently a gap is
 * created at index 5, page_cache_prev_miss() covering both indices may
 * return 5 if called under the rcu_read_lock.
 *
 * Return: The index of the gap if found, otherwise an index outside the
 * range specified (in which case 'index - return >= max_scan' will be true).
 * In the rare case of wrap-around, ULONG_MAX will be returned.
 */
pgoff_t page_cache_prev_miss(struct address_space *mapping,
			     pgoff_t index, unsigned long max_scan)
{
	XA_STATE(xas, &mapping->i_pages, index);

	while (max_scan--) {
		void *entry = xas_prev(&xas);
		if (!entry || xa_is_value(entry))
			break;
		if (xas.xa_index == ULONG_MAX)
			break;
	}

	return xas.xa_index;
}
EXPORT_SYMBOL(page_cache_prev_miss);

/*
 * mapping_get_entry - Get a page cache entry.
 * @mapping: the address_space to search
 * @index: The page cache index.
 *
 * Looks up the page cache slot at @mapping & @index.  If there is a
 * page cache page, the head page is returned with an increased refcount.
 *
 * If the slot holds a shadow entry of a previously evicted page, or a
 * swap entry from shmem/tmpfs, it is returned.
 *
 * Return: The head page or shadow entry, %NULL if nothing is found.
 */
static struct page *mapping_get_entry(struct address_space *mapping,
		pgoff_t index)
{
	XA_STATE(xas, &mapping->i_pages, index);
	struct page *page;

	rcu_read_lock();
repeat:
	xas_reset(&xas);
	page = xas_load(&xas);
	if (xas_retry(&xas, page))
		goto repeat;
	/*
	 * A shadow entry of a recently evicted page, or a swap entry from
	 * shmem/tmpfs.  Return it without attempting to raise page count.
	 */
	if (!page || xa_is_value(page))
		goto out;

	if (!page_cache_get_speculative(page))
		goto repeat;

	/*
	 * Has the page moved or been split?
	 * This is part of the lockless pagecache protocol. See
	 * include/linux/pagemap.h for details.
	 */
	if (unlikely(page != xas_reload(&xas))) {
		put_page(page);
		goto repeat;
	}
out:
	rcu_read_unlock();

	return page;
}

/**
 * pagecache_get_page - Find and get a reference to a page.
 * @mapping: The address_space to search.
 * @index: The page index.
 * @fgp_flags: %FGP flags modify how the page is returned.
 * @gfp_mask: Memory allocation flags to use if %FGP_CREAT is specified.
 *
 * Looks up the page cache entry at @mapping & @index.
 *
 * @fgp_flags can be zero or more of these flags:
 *
 * * %FGP_ACCESSED - The page will be marked accessed.
 * * %FGP_LOCK - The page is returned locked.
 * * %FGP_HEAD - If the page is present and a THP, return the head page
 *   rather than the exact page specified by the index.
 * * %FGP_ENTRY - If there is a shadow / swap / DAX entry, return it
 *   instead of allocating a new page to replace it.
 * * %FGP_CREAT - If no page is present then a new page is allocated using
 *   @gfp_mask and added to the page cache and the VM's LRU list.
 *   The page is returned locked and with an increased refcount.
 * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the
 *   page is already in cache.  If the page was allocated, unlock it before
 *   returning so the caller can do the same dance.
 * * %FGP_WRITE - The page will be written
 * * %FGP_NOFS - __GFP_FS will get cleared in gfp mask
 * * %FGP_NOWAIT - Don't get blocked by page lock
 *
 * If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even
 * if the %GFP flags specified for %FGP_CREAT are atomic.
 *
 * If there is a page cache page, it is returned with an increased refcount.
 *
 * Return: The found page or %NULL otherwise.
 */
struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
		int fgp_flags, gfp_t gfp_mask)
{
	struct page *page;

repeat:
	page = mapping_get_entry(mapping, index);
	if (xa_is_value(page)) {
		if (fgp_flags & FGP_ENTRY)
			return page;
		page = NULL;
	}
	if (!page)
		goto no_page;

	if (fgp_flags & FGP_LOCK) {
		if (fgp_flags & FGP_NOWAIT) {
			if (!trylock_page(page)) {
				put_page(page);
				return NULL;
			}
		} else {
			lock_page(page);
		}

		/* Has the page been truncated? */
		if (unlikely(page->mapping != mapping)) {
			unlock_page(page);
			put_page(page);
			goto repeat;
		}
		VM_BUG_ON_PAGE(!thp_contains(page, index), page);
	}

	if (fgp_flags & FGP_ACCESSED)
		mark_page_accessed(page);
	else if (fgp_flags & FGP_WRITE) {
		/* Clear idle flag for buffer write */
		if (page_is_idle(page))
			clear_page_idle(page);
	}
	if (!(fgp_flags & FGP_HEAD))
		page = find_subpage(page, index);

no_page:
	if (!page && (fgp_flags & FGP_CREAT)) {
		int err;
		if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
			gfp_mask |= __GFP_WRITE;
		if (fgp_flags & FGP_NOFS)
			gfp_mask &= ~__GFP_FS;

		page = __page_cache_alloc(gfp_mask);
		if (!page)
			return NULL;

		if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
			fgp_flags |= FGP_LOCK;

		/* Init accessed so avoid atomic mark_page_accessed later */
		if (fgp_flags & FGP_ACCESSED)
			__SetPageReferenced(page);

		err = add_to_page_cache_lru(page, mapping, index, gfp_mask);
		if (unlikely(err)) {
			put_page(page);
			page = NULL;
			if (err == -EEXIST)
				goto repeat;
		}

		/*
		 * add_to_page_cache_lru locks the page, and for mmap we expect
		 * an unlocked page.
		 */
		if (page && (fgp_flags & FGP_FOR_MMAP))
			unlock_page(page);
	}

	return page;
}
EXPORT_SYMBOL(pagecache_get_page);

static inline struct page *find_get_entry(struct xa_state *xas, pgoff_t max,
		xa_mark_t mark)
{
	struct page *page;

retry:
	if (mark == XA_PRESENT)
		page = xas_find(xas, max);
	else
		page = xas_find_marked(xas, max, mark);

	if (xas_retry(xas, page))
		goto retry;
	/*
	 * A shadow entry of a recently evicted page, a swap
	 * entry from shmem/tmpfs or a DAX entry.  Return it
	 * without attempting to raise page count.
	 */
	if (!page || xa_is_value(page))
		return page;

	if (!page_cache_get_speculative(page))
		goto reset;

	/* Has the page moved or been split? */
	if (unlikely(page != xas_reload(xas))) {
		put_page(page);
		goto reset;
	}

	return page;
reset:
	xas_reset(xas);
	goto retry;
}

/**
 * find_get_entries - gang pagecache lookup
 * @mapping:	The address_space to search
 * @start:	The starting page cache index
 * @end:	The final page index (inclusive).
 * @pvec:	Where the resulting entries are placed.
 * @indices:	The cache indices corresponding to the entries in @pvec
 *
 * find_get_entries() will search for and return a batch of entries in
 * the mapping.  The entries are placed in @pvec.  find_get_entries()
 * takes a reference on any actual pages it returns.
 *
 * The search returns a group of mapping-contiguous page cache entries
 * with ascending indexes.  There may be holes in the indices due to
 * not-present pages.
 *
 * Any shadow entries of evicted pages, or swap entries from
 * shmem/tmpfs, are included in the returned array.
 *
 * If it finds a Transparent Huge Page, head or tail, find_get_entries()
 * stops at that page: the caller is likely to want to remove a THP as a
 * unit, and handling a single page avoids ambiguity.
 *
 * Return: the number of pages and shadow entries which were found.
 */
unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
		pgoff_t end, struct pagevec *pvec, pgoff_t *indices)
{
	XA_STATE(xas, &mapping->i_pages, start);
	struct page *page;
	unsigned int ret = 0;
	unsigned nr_entries = PAGEVEC_SIZE;

	rcu_read_lock();
	while ((page = find_get_entry(&xas, end, XA_PRESENT))) {
		/*
		 * Terminate early on finding a THP, to allow the caller to
		 * handle it all at once; but continue if this is hugetlbfs.
		 */
		if (!xa_is_value(page) && PageTransHuge(page) &&
				!PageHuge(page)) {
			page = find_subpage(page, xas.xa_index);
			nr_entries = ret + 1;
		}

		indices[ret] = xas.xa_index;
		pvec->pages[ret] = page;
		if (++ret == nr_entries)
			break;
	}
	rcu_read_unlock();

	pvec->nr = ret;
	return ret;
}

/**
 * find_lock_entries - Find a batch of pagecache entries.
 * @mapping:	The address_space to search.
 * @start:	The starting page cache index.
 * @end:	The final page index (inclusive).
 * @pvec:	Where the resulting entries are placed.
 * @indices:	The cache indices of the entries in @pvec.
 *
 * find_lock_entries() will return a batch of entries from @mapping.
 * Swap, shadow and DAX entries are included.  Pages are returned
 * locked and with an incremented refcount.  Pages which are locked by
 * somebody else or under writeback are skipped.  Only the head page of
 * a THP is returned.  Pages which are partially outside the range are
 * not returned.
 *
 * The entries have ascending indexes.  The indices may not be consecutive
 * due to not-present entries, THP pages, pages which could not be locked
 * or pages under writeback.
 *
 * Return: The number of entries which were found.
 */
unsigned find_lock_entries(struct address_space *mapping, pgoff_t start,
		pgoff_t end, struct pagevec *pvec, pgoff_t *indices)
{
	XA_STATE(xas, &mapping->i_pages, start);
	struct page *page;

	rcu_read_lock();
	while ((page = find_get_entry(&xas, end, XA_PRESENT))) {
		if (!xa_is_value(page)) {
			if (page->index < start)
				goto put;
			VM_BUG_ON_PAGE(page->index != xas.xa_index, page);
			if (page->index + thp_nr_pages(page) - 1 > end)
				goto put;
			if (!trylock_page(page))
				goto put;
			if (page->mapping != mapping || PageWriteback(page))
				goto unlock;
			VM_BUG_ON_PAGE(!thp_contains(page, xas.xa_index),
					page);
		}
		indices[pvec->nr] = xas.xa_index;
		if (!pagevec_add(pvec, page))
			break;
		goto next;
unlock:
		unlock_page(page);
put:
		put_page(page);
next:
		if (!xa_is_value(page) && PageTransHuge(page)) {
			unsigned int nr_pages = thp_nr_pages(page);

			/* Final THP may cross MAX_LFS_FILESIZE on 32-bit */
			xas_set(&xas, page->index + nr_pages);
			if (xas.xa_index < nr_pages)
				break;
		}
	}
	rcu_read_unlock();

	return pagevec_count(pvec);
}

/**
 * find_get_pages_range - gang pagecache lookup
 * @mapping:	The address_space to search
 * @start:	The starting page index
 * @end:	The final page index (inclusive)
 * @nr_pages:	The maximum number of pages
 * @pages:	Where the resulting pages are placed
 *
 * find_get_pages_range() will search for and return a group of up to @nr_pages
 * pages in the mapping starting at index @start and up to index @end
 * (inclusive).  The pages are placed at @pages.  find_get_pages_range() takes
 * a reference against the returned pages.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes.  There may be holes in the indices due to not-present pages.
 * We also update @start to index the next page for the traversal.
 *
 * Return: the number of pages which were found. If this number is
 * smaller than @nr_pages, the end of specified range has been
 * reached.
 */
unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
			      pgoff_t end, unsigned int nr_pages,
			      struct page **pages)
{
	XA_STATE(xas, &mapping->i_pages, *start);
	struct page *page;
	unsigned ret = 0;

	if (unlikely(!nr_pages))
		return 0;

	rcu_read_lock();
	while ((page = find_get_entry(&xas, end, XA_PRESENT))) {
		/* Skip over shadow, swap and DAX entries */
		if (xa_is_value(page))
			continue;

		pages[ret] = find_subpage(page, xas.xa_index);
		if (++ret == nr_pages) {
			*start = xas.xa_index + 1;
			goto out;
		}
	}

	/*
	 * We come here when there is no page beyond @end. We take care to not
	 * overflow the index @start as it confuses some of the callers. This
	 * breaks the iteration when there is a page at index -1 but that is
	 * already broken anyway.
	 */
	if (end == (pgoff_t)-1)
		*start = (pgoff_t)-1;
	else
		*start = end + 1;
out:
	rcu_read_unlock();

	return ret;
}

/**
 * find_get_pages_contig - gang contiguous pagecache lookup
 * @mapping:	The address_space to search
 * @index:	The starting page index
 * @nr_pages:	The maximum number of pages
 * @pages:	Where the resulting pages are placed
 *
 * find_get_pages_contig() works exactly like find_get_pages(), except
 * that the returned number of pages are guaranteed to be contiguous.
 *
 * Return: the number of pages which were found.
 */
unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
			       unsigned int nr_pages, struct page **pages)
{
	XA_STATE(xas, &mapping->i_pages, index);
	struct page *page;
	unsigned int ret = 0;

	if (unlikely(!nr_pages))
		return 0;

	rcu_read_lock();
	for (page = xas_load(&xas); page; page = xas_next(&xas)) {
		if (xas_retry(&xas, page))
			continue;
		/*
		 * If the entry has been swapped out, we can stop looking.
		 * No current caller is looking for DAX entries.
		 */
		if (xa_is_value(page))
			break;

		if (!page_cache_get_speculative(page))
			goto retry;

		/* Has the page moved or been split? */
		if (unlikely(page != xas_reload(&xas)))
			goto put_page;

		pages[ret] = find_subpage(page, xas.xa_index);
		if (++ret == nr_pages)
			break;
		continue;
put_page:
		put_page(page);
retry:
		xas_reset(&xas);
	}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(find_get_pages_contig);

/**
 * find_get_pages_range_tag - Find and return head pages matching @tag.
 * @mapping:	the address_space to search
 * @index:	the starting page index
 * @end:	The final page index (inclusive)
 * @tag:	the tag index
 * @nr_pages:	the maximum number of pages
 * @pages:	where the resulting pages are placed
 *
 * Like find_get_pages(), except we only return head pages which are tagged
 * with @tag.  @index is updated to the index immediately after the last
 * page we return, ready for the next iteration.
 *
 * Return: the number of pages which were found.
 */
unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
			pgoff_t end, xa_mark_t tag, unsigned int nr_pages,
			struct page **pages)
{
	XA_STATE(xas, &mapping->i_pages, *index);
	struct page *page;
	unsigned ret = 0;

	if (unlikely(!nr_pages))
		return 0;

	rcu_read_lock();
	while ((page = find_get_entry(&xas, end, tag))) {
		/*
		 * Shadow entries should never be tagged, but this iteration
		 * is lockless so there is a window for page reclaim to evict
		 * a page we saw tagged.  Skip over it.
		 */
		if (xa_is_value(page))
			continue;

		pages[ret] = page;
		if (++ret == nr_pages) {
			*index = page->index + thp_nr_pages(page);
			goto out;
		}
	}

	/*
	 * We come here when we got to @end. We take care to not overflow the
	 * index @index as it confuses some of the callers. This breaks the
	 * iteration when there is a page at index -1 but that is already
	 * broken anyway.
	 */
	if (end == (pgoff_t)-1)
		*index = (pgoff_t)-1;
	else
		*index = end + 1;
out:
	rcu_read_unlock();

	return ret;
}
EXPORT_SYMBOL(find_get_pages_range_tag);

/*
 * CD/DVDs are error prone. When a medium error occurs, the driver may fail
 * a _large_ part of the i/o request. Imagine the worst scenario: the request
 * covers [R .. B], where B is a bad block. Every retry re-reads the whole
 * window, hits the bad block again, and fails the whole request again.
 *
 * It is going insane. Fix it by quickly scaling down the readahead size.
 */
static void shrink_readahead_size_eio(struct file_ra_state *ra)
{
	ra->ra_pages /= 4;
}

/*
 * filemap_get_read_batch - Get a batch of pages for read
 *
 * Get a batch of pages which represent a contiguous range of bytes
 * in the file.  No tail pages will be returned.  If @index is in the
 * middle of a THP, the entire THP will be returned.  The last page in
 * the batch may have Readahead set or be not Uptodate so that the
 * caller can take the appropriate action.
 */
static void filemap_get_read_batch(struct address_space *mapping,
		pgoff_t index, pgoff_t max, struct pagevec *pvec)
{
	XA_STATE(xas, &mapping->i_pages, index);
	struct page *head;

	rcu_read_lock();
	for (head = xas_load(&xas); head; head = xas_next(&xas)) {
		if (xas_retry(&xas, head))
			continue;
		if (xas.xa_index > max || xa_is_value(head))
			break;
		if (!page_cache_get_speculative(head))
			goto retry;

		/* Has the page moved or been split? */
		if (unlikely(head != xas_reload(&xas)))
			goto put_page;

		if (!pagevec_add(pvec, head))
			break;
		if (!PageUptodate(head))
			break;
		if (PageReadahead(head))
			break;
		xas.xa_index = head->index + thp_nr_pages(head) - 1;
		xas.xa_offset = (xas.xa_index >> xas.xa_shift) & XA_CHUNK_MASK;
		continue;
put_page:
		put_page(head);
retry:
		xas_reset(&xas);
	}
	rcu_read_unlock();
}

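/*
 * filemap_read_page() - kick off ->readpage() for a single page and wait
 * for the read to complete.  Returns 0 on success, an error from
 * ->readpage() or from the killable wait, or -EIO (after shrinking the
 * readahead window) when the page came back unlocked but not Uptodate.
 */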
static int filemap_read_page(struct file *file, struct address_space *mapping,
		struct page *page)
{
	int error;

	/*
	 * A previous I/O error may have been due to temporary failures,
	 * eg. multipath errors.  PG_error will be set again if readpage
	 * fails.
	 */
	ClearPageError(page);
	/* Start the actual read. The read will unlock the page. */
	error = mapping->a_ops->readpage(file, page);
	if (error)
		return error;

	error = wait_on_page_locked_killable(page);
	if (error)
		return error;
	if (PageUptodate(page))
		return 0;
	shrink_readahead_size_eio(&file->f_ra);
	return -EIO;
}

static bool filemap_range_uptodate(struct address_space *mapping,
		loff_t pos, struct iov_iter *iter, struct page *page)
{
	int count;

	if (PageUptodate(page))
		return true;
	/* pipes can't handle partially uptodate pages */
	if (iov_iter_is_pipe(iter))
		return false;
	if (!mapping->a_ops->is_partially_uptodate)
		return false;
	if (mapping->host->i_blkbits >= (PAGE_SHIFT + thp_order(page)))
		return false;

	count = iter->count;
	if (page_offset(page) > pos) {
		count -= page_offset(page) - pos;
		pos = 0;
	} else {
		pos -= page_offset(page);
	}

	return mapping->a_ops->is_partially_uptodate(page, pos, count);
}

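/*
 * filemap_update_page() - bring an already-cached but not (fully)
 * Uptodate page up to date, honouring the IOCB_NOWAIT/NOIO/WAITQ
 * non-blocking flags.  Returns 0 if the range the caller wants is now
 * uptodate, AOP_TRUNCATED_PAGE if the page was truncated (in which case
 * the page reference is dropped), or a negative error.
 */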
static int filemap_update_page(struct kiocb *iocb,
		struct address_space *mapping, struct iov_iter *iter,
		struct page *page)
{
	int error;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!filemap_invalidate_trylock_shared(mapping))
			return -EAGAIN;
	} else {
		filemap_invalidate_lock_shared(mapping);
	}

	if (!trylock_page(page)) {
		error = -EAGAIN;
		if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO))
			goto unlock_mapping;
		if (!(iocb->ki_flags & IOCB_WAITQ)) {
			filemap_invalidate_unlock_shared(mapping);
			put_and_wait_on_page_locked(page, TASK_KILLABLE);
			return AOP_TRUNCATED_PAGE;
		}
		error = __lock_page_async(page, iocb->ki_waitq);
		if (error)
			goto unlock_mapping;
	}

	error = AOP_TRUNCATED_PAGE;
	if (!page->mapping)
		goto unlock;

	error = 0;
	if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, page))
		goto unlock;

	error = -EAGAIN;
	if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ))
		goto unlock;

	error = filemap_read_page(iocb->ki_filp, mapping, page);
	goto unlock_mapping;
unlock:
	unlock_page(page);
unlock_mapping:
	filemap_invalidate_unlock_shared(mapping);
	if (error == AOP_TRUNCATED_PAGE)
		put_page(page);
	return error;
}

static int filemap_create_page(struct file *file,
		struct address_space *mapping, pgoff_t index,
		struct pagevec *pvec)
{
	struct page *page;
	int error;

	page = page_cache_alloc(mapping);
	if (!page)
		return -ENOMEM;

	/*
	 * Protect against truncate / hole punch. Grabbing invalidate_lock
	 * here assures we cannot instantiate and bring uptodate new
	 * pagecache pages after evicting page cache during truncate
	 * and before actually freeing blocks.  Note that we could
	 * release invalidate_lock after inserting the page into page
	 * cache as the locked page would then be enough to synchronize
	 * with hole punching. But there are code paths such as
	 * filemap_update_page() filling in partially uptodate pages or
	 * ->readpages() that need to hold invalidate_lock while mapping
	 * blocks for IO so let's hold the lock here as well to keep
	 * locking rules simple.
	 */
	filemap_invalidate_lock_shared(mapping);
	error = add_to_page_cache_lru(page, mapping, index,
			mapping_gfp_constraint(mapping, GFP_KERNEL));
	if (error == -EEXIST)
		error = AOP_TRUNCATED_PAGE;
	if (error)
		goto error;

	error = filemap_read_page(file, mapping, page);
	if (error)
		goto error;

	filemap_invalidate_unlock_shared(mapping);
	pagevec_add(pvec, page);
	return 0;
error:
	filemap_invalidate_unlock_shared(mapping);
	put_page(page);
	return error;
}

static int filemap_readahead(struct kiocb *iocb, struct file *file,
		struct address_space *mapping, struct page *page,
		pgoff_t last_index)
{
	if (iocb->ki_flags & IOCB_NOIO)
		return -EAGAIN;
	page_cache_async_readahead(mapping, &file->f_ra, file, page,
			page->index, last_index - page->index);
	return 0;
}

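/*
 * filemap_get_pages() - fill @pvec with pages backing the requested read
 * range, triggering synchronous readahead, page creation, asynchronous
 * readahead and page update as needed.  On success the batch is
 * non-empty and the requested range of its last page is usable for
 * copying; otherwise an error or -EAGAIN is returned.
 */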
static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
		struct pagevec *pvec)
{
	struct file *filp = iocb->ki_filp;
	struct address_space *mapping = filp->f_mapping;
	struct file_ra_state *ra = &filp->f_ra;
	pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
	pgoff_t last_index;
	struct page *page;
	int err = 0;

	last_index = DIV_ROUND_UP(iocb->ki_pos + iter->count, PAGE_SIZE);
retry:
	if (fatal_signal_pending(current))
		return -EINTR;

	filemap_get_read_batch(mapping, index, last_index, pvec);
	if (!pagevec_count(pvec)) {
		if (iocb->ki_flags & IOCB_NOIO)
			return -EAGAIN;
		page_cache_sync_readahead(mapping, ra, filp, index,
				last_index - index);
		filemap_get_read_batch(mapping, index, last_index, pvec);
	}
	if (!pagevec_count(pvec)) {
		if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
			return -EAGAIN;
		err = filemap_create_page(filp, mapping,
				iocb->ki_pos >> PAGE_SHIFT, pvec);
		if (err == AOP_TRUNCATED_PAGE)
			goto retry;
		return err;
	}

	page = pvec->pages[pagevec_count(pvec) - 1];
	if (PageReadahead(page)) {
		err = filemap_readahead(iocb, filp, mapping, page, last_index);
		if (err)
			goto err;
	}
	if (!PageUptodate(page)) {
		if ((iocb->ki_flags & IOCB_WAITQ) && pagevec_count(pvec) > 1)
			iocb->ki_flags |= IOCB_NOWAIT;
		err = filemap_update_page(iocb, mapping, iter, page);
		if (err)
			goto err;
	}

	return 0;
err:
	if (err < 0)
		put_page(page);
	if (likely(--pvec->nr))
		return 0;
	if (err == AOP_TRUNCATED_PAGE)
		goto retry;
	return err;
}

/**
 * filemap_read - Read data from the page cache.
 * @iocb: The iocb to read.
 * @iter: Destination for the data.
 * @already_read: Number of bytes already read by the caller.
 *
 * Copies data from the page cache.  If the data is not currently present,
 * uses the readahead and readpage address_space operations to fetch it.
 *
 * Return: Total number of bytes copied, including those already read by
 * the caller.  If an error happens before any bytes are copied, returns
 * a negative error number.
 */
ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
		ssize_t already_read)
{
	struct file *filp = iocb->ki_filp;
	struct file_ra_state *ra = &filp->f_ra;
	struct address_space *mapping = filp->f_mapping;
	struct inode *inode = mapping->host;
	struct pagevec pvec;
	int i, error = 0;
	bool writably_mapped;
	loff_t isize, end_offset;

	if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
		return 0;
	if (unlikely(!iov_iter_count(iter)))
		return 0;

	iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
	pagevec_init(&pvec);

	do {
		cond_resched();

		/*
		 * If we've already successfully copied some data, then we
		 * can no longer safely return -EAGAIN. Hence return as we
		 * may have already copied some data. Use IOCB_NOWAIT to
		 * avoid blocking from here on.
		 */
		if ((iocb->ki_flags & IOCB_WAITQ) && already_read)
			iocb->ki_flags |= IOCB_NOWAIT;

		error = filemap_get_pages(iocb, iter, &pvec);
		if (error < 0)
			break;

		/*
		 * i_size must be checked after we know the pages are Uptodate.
		 *
		 * Checking i_size after the check allows us to calculate
		 * the correct value for "bytes", which means the zero-filled
		 * part of the page is not copied back to userspace (unless
		 * another truncate extends the file - this is desired though).
		 */
		isize = i_size_read(inode);
		if (unlikely(iocb->ki_pos >= isize))
			goto put_pages;
		end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);

		/*
		 * Once we start copying data, we don't want to be touching any
		 * cachelines that might be contended:
		 */
		writably_mapped = mapping_writably_mapped(mapping);

		/*
		 * When a sequential read accesses a page several times, only
		 * mark it as accessed the first time.
		 */
		if (iocb->ki_pos >> PAGE_SHIFT !=
		    ra->prev_pos >> PAGE_SHIFT)
			mark_page_accessed(pvec.pages[0]);

		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];
			size_t page_size = thp_size(page);
			size_t offset = iocb->ki_pos & (page_size - 1);
			size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos,
					     page_size - offset);
			size_t copied;

			if (end_offset < page_offset(page))
				break;
			if (i > 0)
				mark_page_accessed(page);
			/*
			 * If users can be writing to this page using arbitrary
			 * virtual addresses, take care about potential aliasing
			 * before reading the page on the kernel side.
			 */
			if (writably_mapped) {
				int j;

				for (j = 0; j < thp_nr_pages(page); j++)
					flush_dcache_page(page + j);
			}

			copied = copy_page_to_iter(page, offset, bytes, iter);

			already_read += copied;
			iocb->ki_pos += copied;
			ra->prev_pos = iocb->ki_pos;

			if (copied < bytes) {
				error = -EFAULT;
				break;
			}
		}
put_pages:
		for (i = 0; i < pagevec_count(&pvec); i++)
			put_page(pvec.pages[i]);
		pagevec_reinit(&pvec);
	} while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);

	file_accessed(filp);

	return already_read ? already_read : error;
}
EXPORT_SYMBOL_GPL(filemap_read);

/**
 * generic_file_read_iter - generic filesystem read routine
 * @iocb:	kernel I/O control block
 * @iter:	destination for the data read
 *
 * This is the "read_iter()" routine for all filesystems
 * that can use the page cache directly.
 *
 * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall
 * be returned when no data can be read without waiting for I/O requests
 * to complete; it doesn't prevent readahead.
 *
 * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O
 * requests shall be made for the read or for readahead.  When no data
 * can be read, -EAGAIN shall be returned.  When readahead would be
 * triggered, a partial, possibly empty read shall be returned.
 *
 * Return:
 * * number of bytes copied, even for partial reads
 * * negative error code (or 0 if IOCB_NOIO) if nothing was read
 */
ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	size_t count = iov_iter_count(iter);
	ssize_t retval = 0;

	if (!count)
		return 0; /* skip atime */

	if (iocb->ki_flags & IOCB_DIRECT) {
		struct file *file = iocb->ki_filp;
		struct address_space *mapping = file->f_mapping;
		struct inode *inode = mapping->host;
		loff_t size;

		size = i_size_read(inode);
		if (iocb->ki_flags & IOCB_NOWAIT) {
			if (filemap_range_needs_writeback(mapping, iocb->ki_pos,
						iocb->ki_pos + count - 1))
				return -EAGAIN;
		} else {
			retval = filemap_write_and_wait_range(mapping,
						iocb->ki_pos,
						iocb->ki_pos + count - 1);
			if (retval < 0)
				return retval;
		}

		file_accessed(file);

		retval = mapping->a_ops->direct_IO(iocb, iter);
		if (retval >= 0) {
			iocb->ki_pos += retval;
			count -= retval;
		}
		if (retval != -EIOCBQUEUED)
			iov_iter_revert(iter, count - iov_iter_count(iter));

		/*
		 * Btrfs can have a short DIO read if we encounter
		 * compressed extents, so if there was an error, or if
		 * we've already read everything we wanted to, or if
		 * there was a short read because we hit EOF, go ahead
		 * and return.  Otherwise fallthrough to buffered io for
		 * the rest of the read.  Buffered reads will not work for
		 * DAX files, so don't bother trying.
		 */
		if (retval < 0 || !count || iocb->ki_pos >= size ||
		    IS_DAX(inode))
			return retval;
	}

	return filemap_read(iocb, iter, retval);
}
EXPORT_SYMBOL(generic_file_read_iter);

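/*
 * page_seek_hole_data() - refine a SEEK_HOLE/SEEK_DATA result within one
 * page.  An Uptodate page (or a value entry) counts entirely as data;
 * otherwise, when the filesystem can report partially uptodate ranges,
 * walk the page block by block under the page lock until a block whose
 * state matches what the caller is seeking is found.
 */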
static inline loff_t page_seek_hole_data(struct xa_state *xas,
		struct address_space *mapping, struct page *page,
		loff_t start, loff_t end, bool seek_data)
{
	const struct address_space_operations *ops = mapping->a_ops;
	size_t offset, bsz = i_blocksize(mapping->host);

	if (xa_is_value(page) || PageUptodate(page))
		return seek_data ? start : end;
	if (!ops->is_partially_uptodate)
		return seek_data ? end : start;

	xas_pause(xas);
	rcu_read_unlock();
	lock_page(page);
	if (unlikely(page->mapping != mapping))
		goto unlock;

	offset = offset_in_thp(page, start) & ~(bsz - 1);

	do {
		if (ops->is_partially_uptodate(page, offset, bsz) == seek_data)
			break;
		start = (start + bsz) & ~(bsz - 1);
		offset += bsz;
	} while (offset < thp_size(page));
unlock:
	unlock_page(page);
	rcu_read_lock();
	return start;
}

static inline
unsigned int seek_page_size(struct xa_state *xas, struct page *page)
{
	if (xa_is_value(page))
		return PAGE_SIZE << xa_get_order(xas->xa, xas->xa_index);
	return thp_size(page);
}

/**
 * mapping_seek_hole_data - Seek for SEEK_DATA / SEEK_HOLE in the page cache.
 * @mapping: Address space to search.
 * @start: First byte to consider.
 * @end: Limit of search (exclusive).
 * @whence: Either SEEK_HOLE or SEEK_DATA.
 *
 * If the page cache knows which blocks contain holes and which blocks
 * contain data, your filesystem can use this function to implement
 * SEEK_HOLE and SEEK_DATA.  This is usually going to be the case unless
 * your filesystem is entirely unwritten.
 *
 * Return: The requested offset on success, or -ENXIO if @whence specifies
 * SEEK_DATA and there is no data after @start.  There is an implicit hole
 * after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start
 * and @end contain data.
 */
loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start,
		loff_t end, int whence)
{
	XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT);
	pgoff_t max = (end - 1) >> PAGE_SHIFT;
	bool seek_data = (whence == SEEK_DATA);
	struct page *page;

	if (end <= start)
		return -ENXIO;

	rcu_read_lock();
	while ((page = find_get_entry(&xas, max, XA_PRESENT))) {
		loff_t pos = (u64)xas.xa_index << PAGE_SHIFT;
		unsigned int seek_size;

		if (start < pos) {
			if (!seek_data)
				goto unlock;
			start = pos;
		}

		seek_size = seek_page_size(&xas, page);
		pos = round_up(pos + 1, seek_size);
		start = page_seek_hole_data(&xas, mapping, page, start, pos,
				seek_data);
		if (start < pos)
			goto unlock;
		if (start >= end)
			break;
		if (seek_size > PAGE_SIZE)
			xas_set(&xas, pos >> PAGE_SHIFT);
		if (!xa_is_value(page))
			put_page(page);
	}
	if (seek_data)
		start = -ENXIO;
unlock:
	rcu_read_unlock();
	if (page && !xa_is_value(page))
		put_page(page);
	if (start > end)
		return end;
	return start;
}

#ifdef CONFIG_MMU
#define MMAP_LOTSAMISS  (100)
/*
 * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock
 * @vmf - the vm_fault for this fault.
 * @page - the page to lock.
 * @fpin - the pointer to the file we may pin (or is already pinned).
 *
 * This works similar to lock_page_or_retry in that it can drop the mmap_lock.
 * It differs in that it actually returns the page locked if it returns 1 and 0
 * if it couldn't lock the page.  If we did have to drop the mmap_lock then fpin
 * will point to the pinned file and needs to be fput()'ed at a later point.
 */
static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
				     struct file **fpin)
{
	if (trylock_page(page))
		return 1;

	/*
	 * NOTE! This will make us return with VM_FAULT_RETRY, but with
	 * the mmap_lock still held. That's how FAULT_FLAG_RETRY_NOWAIT
	 * is supposed to work. We have way too many special cases..
	 */
	if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
		return 0;

	*fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
	if (vmf->flags & FAULT_FLAG_KILLABLE) {
		if (__lock_page_killable(page)) {
			/*
			 * We didn't have the right flags to drop the mmap_lock,
			 * but all fault_handlers only check for fatal signals
			 * if we return VM_FAULT_RETRY, so we need to drop the
			 * mmap_lock here and return 0 if we don't have a fpin.
			 */
			if (*fpin == NULL)
				mmap_read_unlock(vmf->vma->vm_mm);
			return 0;
		}
	} else
		__lock_page(page);
	return 1;
}

/*
 * Synchronous readahead happens when we don't even find a page in the page
 * cache at all.  We don't want to perform IO under the mmap lock, so we only
 * do so after dropping it, i.e. we pin the file and return it for the caller
 * to fput().
 */
static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
{
	struct file *file = vmf->vma->vm_file;
	struct file_ra_state *ra = &file->f_ra;
	struct address_space *mapping = file->f_mapping;
	DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff);
	struct file *fpin = NULL;
	unsigned int mmap_miss;

	/* If we don't want any read-ahead, don't bother */
	if (vmf->vma->vm_flags & VM_RAND_READ)
		return fpin;
	if (!ra->ra_pages)
		return fpin;

	if (vmf->vma->vm_flags & VM_SEQ_READ) {
		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
		page_cache_sync_ra(&ractl, ra->ra_pages);
		return fpin;
	}

	/* Avoid banging the cache line if not needed */
	mmap_miss = READ_ONCE(ra->mmap_miss);
	if (mmap_miss < MMAP_LOTSAMISS * 10)
		WRITE_ONCE(ra->mmap_miss, ++mmap_miss);

	/*
	 * Do we miss much more than hit in this file? If so,
	 * stop bothering with read-ahead. It will only hurt.
	 */
	if (mmap_miss > MMAP_LOTSAMISS)
		return fpin;

	/*
	 * mmap read-around
	 */
	fpin = maybe_unlock_mmap_for_io(vmf, fpin);
	ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
	ra->size = ra->ra_pages;
	ra->async_size = ra->ra_pages / 4;
	ractl._index = ra->start;
	do_page_cache_ra(&ractl, ra->size, ra->async_size);
	return fpin;
}

/*
 * Asynchronous readahead happens when we find the page and PG_readahead,
 * so we want to possibly extend the readahead further.  We return the file
 * that was pinned if we have to drop the mmap_lock in order to do IO.
 */
static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
					    struct page *page)
{
	struct file *file = vmf->vma->vm_file;
	struct file_ra_state *ra = &file->f_ra;
	struct address_space *mapping = file->f_mapping;
	struct file *fpin = NULL;
	unsigned int mmap_miss;
	pgoff_t offset = vmf->pgoff;

	/* If we don't want any read-ahead, don't bother */
	if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
		return fpin;
	mmap_miss = READ_ONCE(ra->mmap_miss);
	if (mmap_miss)
		WRITE_ONCE(ra->mmap_miss, --mmap_miss);
	if (PageReadahead(page)) {
		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
		page_cache_async_readahead(mapping, ra, file,
					   page, offset, ra->ra_pages);
	}
	return fpin;
}

/**
 * filemap_fault - read in file data for page fault handling
 * @vmf:	struct vm_fault containing details of the fault
 *
 * filemap_fault() is invoked via the vma operations vector for a
 * mapped memory region to read in file data during a page fault.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 *
 * vma->vm_mm->mmap_lock must be held on entry.
 *
 * If our return value has VM_FAULT_RETRY set, it's because the mmap_lock
 * may be dropped before doing I/O or by lock_page_maybe_drop_mmap().
 *
 * If our return value does not have VM_FAULT_RETRY set, the mmap_lock
 * has not been released.
 *
 * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
 *
 * Return: bitwise-OR of %VM_FAULT_ codes.
 */
vm_fault_t filemap_fault(struct vm_fault *vmf)
{
	int error;
	struct file *file = vmf->vma->vm_file;
	struct file *fpin = NULL;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	pgoff_t offset = vmf->pgoff;
	pgoff_t max_off;
	struct page *page;
	vm_fault_t ret = 0;
	bool mapping_locked = false;

	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
	if (unlikely(offset >= max_off))
		return VM_FAULT_SIGBUS;

	/*
	 * Do we have something in the page cache already?
	 */
	page = find_get_page(mapping, offset);
	if (likely(page)) {
		/*
		 * We found the page, so try async readahead before waiting for
		 * the lock.
		 */
		if (!(vmf->flags & FAULT_FLAG_TRIED))
			fpin = do_async_mmap_readahead(vmf, page);
		if (unlikely(!PageUptodate(page))) {
			filemap_invalidate_lock_shared(mapping);
			mapping_locked = true;
		}
	} else {
		/* No page in the page cache at all */
		count_vm_event(PGMAJFAULT);
		count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
		ret = VM_FAULT_MAJOR;
		fpin = do_sync_mmap_readahead(vmf);
retry_find:
		/*
		 * See comment in filemap_create_page() why we need
		 * invalidate_lock
		 */
		if (!mapping_locked) {
			filemap_invalidate_lock_shared(mapping);
			mapping_locked = true;
		}
		page = pagecache_get_page(mapping, offset,
					  FGP_CREAT|FGP_FOR_MMAP,
					  vmf->gfp_mask);
		if (!page) {
			if (fpin)
				goto out_retry;
			filemap_invalidate_unlock_shared(mapping);
			return VM_FAULT_OOM;
		}
	}

	if (!lock_page_maybe_drop_mmap(vmf, page, &fpin))
		goto out_retry;

	/* Did it get truncated? */
	if (unlikely(compound_head(page)->mapping != mapping)) {
		unlock_page(page);
		put_page(page);
		goto retry_find;
	}
	VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
3105
3106
3107
3108
3109
3110 if (unlikely(!PageUptodate(page))) {
3111
3112
3113
3114
3115
3116
3117 if (!mapping_locked) {
3118 unlock_page(page);
3119 put_page(page);
3120 goto retry_find;
3121 }
3122 goto page_not_uptodate;
3123 }
3124
3125
3126
3127
3128
3129
3130 if (fpin) {
3131 unlock_page(page);
3132 goto out_retry;
3133 }
3134 if (mapping_locked)
3135 filemap_invalidate_unlock_shared(mapping);
3136
3137
3138
3139
3140
3141 max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
3142 if (unlikely(offset >= max_off)) {
3143 unlock_page(page);
3144 put_page(page);
3145 return VM_FAULT_SIGBUS;
3146 }
3147
3148 vmf->page = page;
3149 return ret | VM_FAULT_LOCKED;
3150
3151page_not_uptodate:
3152
3153
3154
3155
3156
3157
3158 fpin = maybe_unlock_mmap_for_io(vmf, fpin);
3159 error = filemap_read_page(file, mapping, page);
3160 if (fpin)
3161 goto out_retry;
3162 put_page(page);
3163
3164 if (!error || error == AOP_TRUNCATED_PAGE)
3165 goto retry_find;
3166 filemap_invalidate_unlock_shared(mapping);
3167
3168 return VM_FAULT_SIGBUS;
3169
3170out_retry:
3171
3172
3173
3174
3175
3176 if (page)
3177 put_page(page);
3178 if (mapping_locked)
3179 filemap_invalidate_unlock_shared(mapping);
3180 if (fpin)
3181 fput(fpin);
3182 return ret | VM_FAULT_RETRY;
3183}
3184EXPORT_SYMBOL(filemap_fault);
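
/*
 * Example (illustrative sketch, hypothetical "myfs" names): a filesystem
 * that wants the generic fault path but supplies its own ->page_mkwrite
 * can point its vma operations at filemap_fault()/filemap_map_pages()
 * directly, in the same way ext4 and friends do:
 *
 *	static const struct vm_operations_struct myfs_vm_ops = {
 *		.fault		= filemap_fault,
 *		.map_pages	= filemap_map_pages,
 *		.page_mkwrite	= myfs_page_mkwrite,
 *	};
 *
 *	static int myfs_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		file_accessed(file);
 *		vma->vm_ops = &myfs_vm_ops;
 *		return 0;
 *	}
 *
 * Filesystems with no special needs can instead use generic_file_mmap(),
 * defined later in this file, which installs generic_file_vm_ops.
 */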

static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page)
{
	struct mm_struct *mm = vmf->vma->vm_mm;

	/* Huge page is mapped? No need to proceed. */
	if (pmd_trans_huge(*vmf->pmd)) {
		unlock_page(page);
		put_page(page);
		return true;
	}

	if (pmd_none(*vmf->pmd) && PageTransHuge(page)) {
		vm_fault_t ret = do_set_pmd(vmf, page);
		if (!ret) {
			/* The page is mapped successfully, reference consumed. */
			unlock_page(page);
			return true;
		}
	}

	if (pmd_none(*vmf->pmd)) {
		vmf->ptl = pmd_lock(mm, vmf->pmd);
		if (likely(pmd_none(*vmf->pmd))) {
			mm_inc_nr_ptes(mm);
			pmd_populate(mm, vmf->pmd, vmf->prealloc_pte);
			vmf->prealloc_pte = NULL;
		}
		spin_unlock(vmf->ptl);
	}

	/* See comment in handle_pte_fault() */
	if (pmd_devmap_trans_unstable(vmf->pmd)) {
		unlock_page(page);
		put_page(page);
		return true;
	}

	return false;
}

static struct page *next_uptodate_page(struct page *page,
				       struct address_space *mapping,
				       struct xa_state *xas, pgoff_t end_pgoff)
{
	unsigned long max_idx;

	do {
		if (!page)
			return NULL;
		if (xas_retry(xas, page))
			continue;
		if (xa_is_value(page))
			continue;
		if (PageLocked(page))
			continue;
		if (!page_cache_get_speculative(page))
			continue;
		/* Has the page moved or been split? */
		if (unlikely(page != xas_reload(xas)))
			goto skip;
		if (!PageUptodate(page) || PageReadahead(page))
			goto skip;
		if (PageHWPoison(page))
			goto skip;
		if (!trylock_page(page))
			goto skip;
		if (page->mapping != mapping)
			goto unlock;
		if (!PageUptodate(page))
			goto unlock;
		max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
		if (xas->xa_index >= max_idx)
			goto unlock;
		return page;
unlock:
		unlock_page(page);
skip:
		put_page(page);
	} while ((page = xas_next_entry(xas, end_pgoff)) != NULL);

	return NULL;
}

static inline struct page *first_map_page(struct address_space *mapping,
					  struct xa_state *xas,
					  pgoff_t end_pgoff)
{
	return next_uptodate_page(xas_find(xas, end_pgoff),
				  mapping, xas, end_pgoff);
}

static inline struct page *next_map_page(struct address_space *mapping,
					 struct xa_state *xas,
					 pgoff_t end_pgoff)
{
	return next_uptodate_page(xas_next_entry(xas, end_pgoff),
				  mapping, xas, end_pgoff);
}

vm_fault_t filemap_map_pages(struct vm_fault *vmf,
			     pgoff_t start_pgoff, pgoff_t end_pgoff)
{
	struct vm_area_struct *vma = vmf->vma;
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	pgoff_t last_pgoff = start_pgoff;
	unsigned long addr;
	XA_STATE(xas, &mapping->i_pages, start_pgoff);
	struct page *head, *page;
	unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
	vm_fault_t ret = 0;

	rcu_read_lock();
	head = first_map_page(mapping, &xas, end_pgoff);
	if (!head)
		goto out;

	if (filemap_map_pmd(vmf, head)) {
		ret = VM_FAULT_NOPAGE;
		goto out;
	}

	addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT);
	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
	do {
		page = find_subpage(head, xas.xa_index);
		if (PageHWPoison(page))
			goto unlock;

		if (mmap_miss > 0)
			mmap_miss--;

		addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
		vmf->pte += xas.xa_index - last_pgoff;
		last_pgoff = xas.xa_index;

		if (!pte_none(*vmf->pte))
			goto unlock;

		/* We're about to handle the fault */
		if (vmf->address == addr)
			ret = VM_FAULT_NOPAGE;

		do_set_pte(vmf, page, addr);
		/* no need to invalidate: a not-present page won't be cached */
		update_mmu_cache(vma, addr, vmf->pte);
		unlock_page(head);
		continue;
unlock:
		unlock_page(head);
		put_page(head);
	} while ((head = next_map_page(mapping, &xas, end_pgoff)) != NULL);
	pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
	rcu_read_unlock();
	WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss);
	return ret;
}
EXPORT_SYMBOL(filemap_map_pages);
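
/*
 * Illustrative note: ->map_pages is called speculatively from the fault
 * path (do_fault_around() in mm/memory.c), typically for a window of
 * fault_around_bytes around the faulting address (64KB by default, i.e.
 * 16 pages on 4KB-page systems, tunable via debugfs), so one hard fault
 * can populate a batch of already-uptodate pages without further faults.
 */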

vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	struct page *page = vmf->page;
	vm_fault_t ret = VM_FAULT_LOCKED;

	sb_start_pagefault(mapping->host->i_sb);
	file_update_time(vmf->vma->vm_file);
	lock_page(page);
	if (page->mapping != mapping) {
		unlock_page(page);
		ret = VM_FAULT_NOPAGE;
		goto out;
	}
	/*
	 * We mark the page dirty already here so that when freeze is in
	 * progress, we are guaranteed that writeback during freezing will
	 * see the dirty page and writeprotect it again.
	 */
	set_page_dirty(page);
	wait_for_stable_page(page);
out:
	sb_end_pagefault(mapping->host->i_sb);
	return ret;
}

const struct vm_operations_struct generic_file_vm_ops = {
	.fault		= filemap_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= filemap_page_mkwrite,
};

/* This is used for a general mmap of a disk file */

int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct address_space *mapping = file->f_mapping;

	if (!mapping->a_ops->readpage)
		return -ENOEXEC;
	file_accessed(file);
	vma->vm_ops = &generic_file_vm_ops;
	return 0;
}

/*
 * This is for filesystems which do not implement ->writepage.
 */
int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
{
	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
		return -EINVAL;
	return generic_file_mmap(file, vma);
}
#else
vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
	return VM_FAULT_SIGBUS;
}
int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	return -ENOSYS;
}
int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
{
	return -ENOSYS;
}
#endif /* CONFIG_MMU */

EXPORT_SYMBOL(filemap_page_mkwrite);
EXPORT_SYMBOL(generic_file_mmap);
EXPORT_SYMBOL(generic_file_readonly_mmap);
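
/*
 * Example (illustrative, hypothetical "rofs" names): a filesystem that
 * implements ->readpage but no ->writepage can still support private and
 * shared read-only mappings by using the readonly variant:
 *
 *	const struct file_operations rofs_file_operations = {
 *		.llseek		= generic_file_llseek,
 *		.read_iter	= generic_file_read_iter,
 *		.mmap		= generic_file_readonly_mmap,
 *	};
 */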

static struct page *wait_on_page_read(struct page *page)
{
	if (!IS_ERR(page)) {
		wait_on_page_locked(page);
		if (!PageUptodate(page)) {
			put_page(page);
			page = ERR_PTR(-EIO);
		}
	}
	return page;
}

static struct page *do_read_cache_page(struct address_space *mapping,
				pgoff_t index,
				int (*filler)(void *, struct page *),
				void *data,
				gfp_t gfp)
{
	struct page *page;
	int err;
repeat:
	page = find_get_page(mapping, index);
	if (!page) {
		page = __page_cache_alloc(gfp);
		if (!page)
			return ERR_PTR(-ENOMEM);
		err = add_to_page_cache_lru(page, mapping, index, gfp);
		if (unlikely(err)) {
			put_page(page);
			if (err == -EEXIST)
				goto repeat;
			/* Presumably ENOMEM for xarray node */
			return ERR_PTR(err);
		}

filler:
		if (filler)
			err = filler(data, page);
		else
			err = mapping->a_ops->readpage(data, page);

		if (err < 0) {
			put_page(page);
			return ERR_PTR(err);
		}

		page = wait_on_page_read(page);
		if (IS_ERR(page))
			return page;
		goto out;
	}
	if (PageUptodate(page))
		goto out;

	/*
	 * Page is not up to date and may be locked due to one of the following
	 * case a: Page is being filled and the page lock is held
	 * case b: Read/write error clearing the page uptodate status
	 * case c: Truncation in progress (page locked)
	 * case d: Reclaim in progress
	 *
	 * Case a, the page will be up to date when the page is unlocked.
	 *	There is no need to serialise on the page lock here as the page
	 *	is pinned so the lock gives no additional protection. Even if
	 *	the page is truncated, the data is still valid if PageUptodate
	 *	as it's a race with truncation.
	 * Case b, the page will not be up to date.
	 * Case c, the page may be truncated but in itself, the data may still
	 *	be valid after IO completes.
	 * Case d, similar to truncation. If reclaim holds the page lock, it
	 *	will be a race with remove_mapping() that determines if the
	 *	mapping is valid on unlock, but otherwise the data is valid and
	 *	there is no need to serialise with the page lock.
	 *
	 * As the page lock gives no additional guarantee, we optimistically
	 * wait on the page to be unlocked and check if it's up to date and
	 * use the page if it is. Otherwise, the page lock is required to
	 * distinguish between the different cases. The motivation is that we
	 * avoid spurious serialisations and wakeups when multiple processes
	 * wait on the same page for IO to complete.
	 */
	wait_on_page_locked(page);
	if (PageUptodate(page))
		goto out;

	/* Distinguish between all the cases under the safety of the lock */
	lock_page(page);

	/* Case c or d, restart the operation */
	if (!page->mapping) {
		unlock_page(page);
		put_page(page);
		goto repeat;
	}

	/* Someone else locked and filled the page in a very small window */
	if (PageUptodate(page)) {
		unlock_page(page);
		goto out;
	}

	/*
	 * A previous I/O error may have been due to temporary
	 * failures, eg. multipath errors.
	 * PG_error will be set again if readpage fails.
	 */
	ClearPageError(page);
	goto filler;

out:
	mark_page_accessed(page);
	return page;
}

/**
 * read_cache_page - read into page cache, fill it if needed
 * @mapping:	the page's address_space
 * @index:	the page index
 * @filler:	function to perform the read
 * @data:	first arg to filler(data, page) function, often left as NULL
 *
 * Read into the page cache.  If a page already exists, and PageUptodate() is
 * not set, try to fill the page and wait for it to become unlocked.
 *
 * If the page does not get brought uptodate, return -EIO.
 *
 * The function expects mapping->invalidate_lock to be already held.
 *
 * Return: up to date page on success, ERR_PTR() on failure.
 */
struct page *read_cache_page(struct address_space *mapping,
				pgoff_t index,
				int (*filler)(void *, struct page *),
				void *data)
{
	return do_read_cache_page(mapping, index, filler, data,
			mapping_gfp_mask(mapping));
}
EXPORT_SYMBOL(read_cache_page);
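
/*
 * Example (illustrative sketch): reading the first page of an inode's data.
 * A NULL filler makes do_read_cache_page() fall back to
 * mapping->a_ops->readpage(); the common read_mapping_page() wrapper in
 * pagemap.h does essentially this.  The return value must be checked with
 * IS_ERR(), since the page pointer doubles as an ERR_PTR() on failure.
 *
 *	struct page *page;
 *	void *kaddr;
 *
 *	page = read_cache_page(inode->i_mapping, 0, NULL, NULL);
 *	if (IS_ERR(page))
 *		return PTR_ERR(page);
 *	kaddr = kmap_local_page(page);
 *	... examine up to PAGE_SIZE bytes at kaddr ...
 *	kunmap_local(kaddr);
 *	put_page(page);
 */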

/**
 * read_cache_page_gfp - read into page cache, using specified allocation flags
 * @mapping:	the page's address_space
 * @index:	the page index
 * @gfp:	the page allocator flags to use if allocating
 *
 * This is the same as "read_mapping_page(mapping, index, NULL)", but with
 * any new page allocations done using the specified allocation flags.
 *
 * If the page does not get brought uptodate, return -EIO.
 *
 * The function expects mapping->invalidate_lock to be already held.
 *
 * Return: up to date page on success, ERR_PTR() on failure.
 */
struct page *read_cache_page_gfp(struct address_space *mapping,
				pgoff_t index,
				gfp_t gfp)
{
	return do_read_cache_page(mapping, index, NULL, NULL, gfp);
}
EXPORT_SYMBOL(read_cache_page_gfp);

int pagecache_write_begin(struct file *file, struct address_space *mapping,
				loff_t pos, unsigned len, unsigned flags,
				struct page **pagep, void **fsdata)
{
	const struct address_space_operations *aops = mapping->a_ops;

	return aops->write_begin(file, mapping, pos, len, flags,
							pagep, fsdata);
}
EXPORT_SYMBOL(pagecache_write_begin);

int pagecache_write_end(struct file *file, struct address_space *mapping,
				loff_t pos, unsigned len, unsigned copied,
				struct page *page, void *fsdata)
{
	const struct address_space_operations *aops = mapping->a_ops;

	return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
}
EXPORT_SYMBOL(pagecache_write_end);
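
/*
 * Example (illustrative sketch): funnelling a small kernel buffer into the
 * page cache through the begin/end pair.  "buf", "pos" and "len" are
 * hypothetical; len must not cross a page boundary here, and a real caller
 * must cope with ->write_end() accepting fewer bytes than were copied, as
 * generic_perform_write() below does.
 *
 *	struct page *page;
 *	void *fsdata;
 *	int err;
 *
 *	err = pagecache_write_begin(file, mapping, pos, len, 0,
 *				    &page, &fsdata);
 *	if (err)
 *		return err;
 *	memcpy_to_page(page, offset_in_page(pos), buf, len);
 *	err = pagecache_write_end(file, mapping, pos, len, len, page, fsdata);
 *	if (err < 0)
 *		return err;
 */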

/*
 * Warn about a page cache invalidation failure during a direct I/O write.
 */
void dio_warn_stale_pagecache(struct file *filp)
{
	/* Warn at most once per day (86400 seconds, in jiffies). */
	static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
	char pathname[128];
	char *path;

	errseq_set(&filp->f_mapping->wb_err, -EIO);
	if (__ratelimit(&_rs)) {
		path = file_path(filp, pathname, sizeof(pathname));
		if (IS_ERR(path))
			path = "(unknown)";
		pr_crit("Page cache invalidation failure on direct I/O.  Possible data corruption due to collision with buffered I/O!\n");
		pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid,
			current->comm);
	}
}

ssize_t
generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	loff_t pos = iocb->ki_pos;
	ssize_t written;
	size_t write_len;
	pgoff_t end;

	write_len = iov_iter_count(from);
	end = (pos + write_len - 1) >> PAGE_SHIFT;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		/* If there are pages to writeback, return */
		if (filemap_range_has_page(file->f_mapping, pos,
					   pos + write_len - 1))
			return -EAGAIN;
	} else {
		written = filemap_write_and_wait_range(mapping, pos,
							pos + write_len - 1);
		if (written)
			goto out;
	}

	/*
	 * After a write we want buffered reads to be sure to go to disk to get
	 * the new data.  We invalidate clean cached pages from the region
	 * we're about to write.  We do this *before* the write so that we can
	 * return without clobbering -EIOCBQUEUED from ->direct_IO().
	 */
	written = invalidate_inode_pages2_range(mapping,
					pos >> PAGE_SHIFT, end);
	/*
	 * If a page can not be invalidated, return 0 to fall back to
	 * buffered write.
	 */
	if (written) {
		if (written == -EBUSY)
			return 0;
		goto out;
	}

	written = mapping->a_ops->direct_IO(iocb, from);

	/*
	 * Finally, try again to invalidate clean pages which might have been
	 * cached by non-direct readahead, or faulted in by get_user_pages()
	 * if the source of the write was an mmap'ed region of the file
	 * we're writing.  Either one is a pretty crazy thing to do,
	 * so we don't support it 100%.  If this invalidation
	 * fails, tough, the write still worked...
	 *
	 * Most of the time we do not need this since dio_complete() will do
	 * the invalidation for us. However there are some file systems that
	 * do not end up with dio_complete() being called, so let's not break
	 * them by removing it completely.
	 *
	 * Noticeable example is a blkdev_direct_IO().
	 *
	 * Skip invalidation for async writes or if mapping has no pages.
	 */
	if (written > 0 && mapping->nrpages &&
	    invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end))
		dio_warn_stale_pagecache(file);

	if (written > 0) {
		pos += written;
		write_len -= written;
		if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
			i_size_write(inode, pos);
			mark_inode_dirty(inode);
		}
		iocb->ki_pos = pos;
	}
	if (written != -EIOCBQUEUED)
		iov_iter_revert(from, write_len - iov_iter_count(from));
out:
	return written;
}
EXPORT_SYMBOL(generic_file_direct_write);

/*
 * Find or create a page at the given pagecache position. Return the locked
 * page. This function is specifically for buffered writes.
 */
struct page *grab_cache_page_write_begin(struct address_space *mapping,
					pgoff_t index, unsigned flags)
{
	struct page *page;
	int fgp_flags = FGP_LOCK|FGP_WRITE|FGP_CREAT;

	if (flags & AOP_FLAG_NOFS)
		fgp_flags |= FGP_NOFS;

	page = pagecache_get_page(mapping, index, fgp_flags,
			mapping_gfp_mask(mapping));
	if (page)
		wait_for_stable_page(page);

	return page;
}
EXPORT_SYMBOL(grab_cache_page_write_begin);
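
/*
 * Example (illustrative sketch): the shape of a minimal ->write_begin built
 * on this helper; "myfs_write_begin" is hypothetical.  A real implementation
 * must also bring the page uptodate, or zero the parts the copy will not
 * cover, in the way simple_write_begin() in fs/libfs.c does.
 *
 *	static int myfs_write_begin(struct file *file,
 *			struct address_space *mapping, loff_t pos,
 *			unsigned len, unsigned flags,
 *			struct page **pagep, void **fsdata)
 *	{
 *		struct page *page;
 *
 *		page = grab_cache_page_write_begin(mapping,
 *						   pos >> PAGE_SHIFT, flags);
 *		if (!page)
 *			return -ENOMEM;
 *		*pagep = page;
 *		return 0;
 *	}
 */
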
3732
3733ssize_t generic_perform_write(struct file *file,
3734 struct iov_iter *i, loff_t pos)
3735{
3736 struct address_space *mapping = file->f_mapping;
3737 const struct address_space_operations *a_ops = mapping->a_ops;
3738 long status = 0;
3739 ssize_t written = 0;
3740 unsigned int flags = 0;
3741
3742 do {
3743 struct page *page;
3744 unsigned long offset;
3745 unsigned long bytes;
3746 size_t copied;
3747 void *fsdata;
3748
3749 offset = (pos & (PAGE_SIZE - 1));
3750 bytes = min_t(unsigned long, PAGE_SIZE - offset,
3751 iov_iter_count(i));
3752
3753again:
3754
3755
3756
3757
3758
3759
3760 if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
3761 status = -EFAULT;
3762 break;
3763 }
3764
3765 if (fatal_signal_pending(current)) {
3766 status = -EINTR;
3767 break;
3768 }
3769
3770 status = a_ops->write_begin(file, mapping, pos, bytes, flags,
3771 &page, &fsdata);
3772 if (unlikely(status < 0))
3773 break;
3774
3775 if (mapping_writably_mapped(mapping))
3776 flush_dcache_page(page);
3777
3778 copied = copy_page_from_iter_atomic(page, offset, bytes, i);
3779 flush_dcache_page(page);
3780
3781 status = a_ops->write_end(file, mapping, pos, bytes, copied,
3782 page, fsdata);
3783 if (unlikely(status != copied)) {
3784 iov_iter_revert(i, copied - max(status, 0L));
3785 if (unlikely(status < 0))
3786 break;
3787 }
3788 cond_resched();
3789
3790 if (unlikely(status == 0)) {
3791
3792
3793
3794
3795
3796
3797 if (copied)
3798 bytes = copied;
3799 goto again;
3800 }
3801 pos += status;
3802 written += status;
3803
3804 balance_dirty_pages_ratelimited(mapping);
3805 } while (iov_iter_count(i));
3806
3807 return written ? written : status;
3808}
3809EXPORT_SYMBOL(generic_perform_write);
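
/*
 * Example (illustrative sketch): a kernel-internal buffered write using
 * generic_perform_write() directly over a kvec iterator.  "file", "buf",
 * "len" and "pos" are hypothetical, and the caller is assumed to hold
 * whatever locks the filesystem requires (typically i_rwsem), handle
 * syncing, and advance the file position itself:
 *
 *	struct kvec vec = { .iov_base = buf, .iov_len = len };
 *	struct iov_iter iter;
 *	ssize_t ret;
 *
 *	iov_iter_kvec(&iter, WRITE, &vec, 1, len);
 *	ret = generic_perform_write(file, &iter, pos);
 */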

/**
 * __generic_file_write_iter - write data to a file
 * @iocb:	IO state structure (file, offset, etc.)
 * @from:	iov_iter with data to write
 *
 * This function does all the work needed for actually writing data to a
 * file. It does all basic checks, removes SUID from the file, updates
 * modification times and calls proper subroutines depending on whether we
 * do direct IO or a standard buffered write.
 *
 * It expects i_rwsem to be grabbed unless we work on a block device or
 * similar object which does not need locking at all.
 *
 * This function does *not* take care of syncing data in case of O_SYNC write.
 * A caller has to handle it. This is mainly due to the fact that we want to
 * avoid syncing under i_rwsem.
 *
 * Return:
 * * number of bytes written, even for truncated writes
 * * negative error code if no data has been written at all
 */
ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	ssize_t written = 0;
	ssize_t err;
	ssize_t status;

	/* We can write back this queue in page reclaim */
	current->backing_dev_info = inode_to_bdi(inode);
	err = file_remove_privs(file);
	if (err)
		goto out;

	err = file_update_time(file);
	if (err)
		goto out;

	if (iocb->ki_flags & IOCB_DIRECT) {
		loff_t pos, endbyte;

		written = generic_file_direct_write(iocb, from);
		/*
		 * If the write stopped short of completing, fall back to
		 * buffered writes.  Some filesystems do this for writes to
		 * holes, for example.  For DAX files, a buffered write is
		 * not needed, so we are done whether or not the direct
		 * write covered everything.
		 */
		if (written < 0 || !iov_iter_count(from) || IS_DAX(inode))
			goto out;

		status = generic_perform_write(file, from, pos = iocb->ki_pos);
		/*
		 * If generic_perform_write() returned a synchronous error
		 * then we want to return the number of bytes which were
		 * direct-written, or the error code if that was zero.  Note
		 * that this differs from normal direct-io semantics, which
		 * will return -EFOO even if some bytes were written.
		 */
		if (unlikely(status < 0)) {
			err = status;
			goto out;
		}
		/*
		 * We need to ensure that the page cache pages are written to
		 * disk and invalidated to preserve the expected O_DIRECT
		 * semantics.
		 */
		endbyte = pos + status - 1;
		err = filemap_write_and_wait_range(mapping, pos, endbyte);
		if (err == 0) {
			iocb->ki_pos = endbyte + 1;
			written += status;
			invalidate_mapping_pages(mapping,
						 pos >> PAGE_SHIFT,
						 endbyte >> PAGE_SHIFT);
		} else {
			/*
			 * We don't know how much we wrote, so just return
			 * the number of bytes which were direct-written
			 */
		}
	} else {
		written = generic_perform_write(file, from, iocb->ki_pos);
		if (likely(written > 0))
			iocb->ki_pos += written;
	}
out:
	current->backing_dev_info = NULL;
	return written ? written : err;
}
EXPORT_SYMBOL(__generic_file_write_iter);

/**
 * generic_file_write_iter - write data to a file
 * @iocb:	IO state structure
 * @from:	iov_iter with data to write
 *
 * This is a wrapper around __generic_file_write_iter() to be used by most
 * filesystems. It takes care of syncing the file in case of O_SYNC file
 * and acquires i_rwsem as needed.
 * Return:
 * * negative error code if no data has been written at all, or if
 *   vfs_fsync_range() failed for a synchronous write
 * * number of bytes written, even for truncated writes
 */
ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	ssize_t ret;

	inode_lock(inode);
	ret = generic_write_checks(iocb, from);
	if (ret > 0)
		ret = __generic_file_write_iter(iocb, from);
	inode_unlock(inode);

	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	return ret;
}
EXPORT_SYMBOL(generic_file_write_iter);
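
/*
 * Example (illustrative, hypothetical "myfs" names): most disk-backed
 * filesystems plug the generic iterators straight into their
 * file_operations:
 *
 *	const struct file_operations myfs_file_operations = {
 *		.llseek		= generic_file_llseek,
 *		.read_iter	= generic_file_read_iter,
 *		.write_iter	= generic_file_write_iter,
 *		.mmap		= generic_file_mmap,
 *		.fsync		= generic_file_fsync,
 *		.splice_read	= generic_file_splice_read,
 *	};
 */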

/**
 * try_to_release_page() - release old fs-specific metadata on a page
 * @page: the page which the kernel is trying to free
 * @gfp_mask: memory allocation flags (and I/O mode)
 *
 * The address_space is to try to release any data against the page
 * (presumably at page->private).
 *
 * This may also be called if PG_fscache is set on a page, indicating that
 * the page is known to the local caching routines.
 *
 * The @gfp_mask argument specifies whether I/O may be performed to release
 * this page (__GFP_IO), and whether the call may block
 * (__GFP_RECLAIM & __GFP_FS).
 *
 * Return: %1 if the release was successful, otherwise return %0.
 */
int try_to_release_page(struct page *page, gfp_t gfp_mask)
{
	struct address_space * const mapping = page->mapping;

	BUG_ON(!PageLocked(page));
	if (PageWriteback(page))
		return 0;

	if (mapping && mapping->a_ops->releasepage)
		return mapping->a_ops->releasepage(page, gfp_mask);
	return try_to_free_buffers(page);
}

EXPORT_SYMBOL(try_to_release_page);
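
/*
 * Example (illustrative sketch): a buffer_head based filesystem whose only
 * page private data is its buffers can use a one-line ->releasepage, which
 * matches the try_to_free_buffers() fallback above ("myfs_releasepage" is
 * hypothetical):
 *
 *	static int myfs_releasepage(struct page *page, gfp_t gfp)
 *	{
 *		return try_to_free_buffers(page);
 *	}
 */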