1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17#include <linux/kernel.h>
18#include <linux/export.h>
19#include <linux/spinlock.h>
20#include <linux/slab.h>
21#include <linux/sched.h>
22#include <linux/fs.h>
23#include <linux/mm.h>
24#include <linux/pagemap.h>
25#include <linux/kthread.h>
26#include <linux/writeback.h>
27#include <linux/blkdev.h>
28#include <linux/backing-dev.h>
29#include <linux/tracepoint.h>
30#include <linux/device.h>
31#include <linux/memcontrol.h>
32#include "internal.h"
33
34
35
36
/*
 * Granularity, in pages, used when sizing writeback chunks (see
 * writeback_chunk_size()).  4096 >> (PAGE_SHIFT - 10) is 4096 KiB expressed
 * as a page count.
 */
#define MIN_WRITEBACK_PAGES	(4096UL >> (PAGE_SHIFT - 10))
38
39
40
41
/*
 * Description of one unit of writeback work queued on a bdi_writeback's
 * work_list (see wb_queue_work()).
 */
struct wb_writeback_work {
	long nr_pages;				/* page budget for this work item */
	struct super_block *sb;			/* NOTE(review): presumably restricts work to one sb - confirm */
	enum writeback_sync_modes sync_mode;	/* WB_SYNC_NONE or WB_SYNC_ALL */
	unsigned int tagged_writepages:1;
	unsigned int for_kupdate:1;
	unsigned int range_cyclic:1;
	unsigned int for_background:1;
	unsigned int for_sync:1;		/* queue_io() uses the caller's cutoff for b_dirty_time when set */
	unsigned int auto_free:1;		/* finish_writeback_work() kfree()s the item when set */
	enum wb_reason reason;			/* recorded reason for the writeback */

	struct list_head list;			/* link on wb->work_list */
	struct wb_completion *done;		/* optional completion counted up on queue, down on finish */
};
57
58
59
60
61
62
63
64
65
66
67
/*
 * Expiry interval for timestamp-only dirty inodes, in seconds (it is
 * multiplied by HZ where it is used).  Defaults to 12 hours.
 */
unsigned int dirtytime_expire_interval = 12 * 60 * 60;
69
70static inline struct inode *wb_inode(struct list_head *head)
71{
72 return list_entry(head, struct inode, i_io_list);
73}
74
75
76
77
78
79
/*
 * CREATE_TRACE_POINTS is defined before pulling in the writeback trace
 * header; the include is placed here, after the definitions above, rather
 * than with the other includes at the top of the file.
 */
#define CREATE_TRACE_POINTS
#include <trace/events/writeback.h>

/* Make the wbc_writepage tracepoint available to GPL modules. */
EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);
84
85static bool wb_io_lists_populated(struct bdi_writeback *wb)
86{
87 if (wb_has_dirty_io(wb)) {
88 return false;
89 } else {
90 set_bit(WB_has_dirty_io, &wb->state);
91 WARN_ON_ONCE(!wb->avg_write_bandwidth);
92 atomic_long_add(wb->avg_write_bandwidth,
93 &wb->bdi->tot_write_bandwidth);
94 return true;
95 }
96}
97
98static void wb_io_lists_depopulated(struct bdi_writeback *wb)
99{
100 if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) &&
101 list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) {
102 clear_bit(WB_has_dirty_io, &wb->state);
103 WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth,
104 &wb->bdi->tot_write_bandwidth) < 0);
105 }
106}
107
108
109
110
111
112
113
114
115
116
117
118static bool inode_io_list_move_locked(struct inode *inode,
119 struct bdi_writeback *wb,
120 struct list_head *head)
121{
122 assert_spin_locked(&wb->list_lock);
123
124 list_move(&inode->i_io_list, head);
125
126
127 if (head != &wb->b_dirty_time)
128 return wb_io_lists_populated(wb);
129
130 wb_io_lists_depopulated(wb);
131 return false;
132}
133
/*
 * Kick @wb's delayed work so that it runs immediately (delay 0).
 * Nothing is scheduled unless WB_registered is set; the check and the
 * scheduling are done under wb->work_lock.
 */
static void wb_wakeup(struct bdi_writeback *wb)
{
	spin_lock_bh(&wb->work_lock);
	if (test_bit(WB_registered, &wb->state))
		mod_delayed_work(bdi_wq, &wb->dwork, 0);
	spin_unlock_bh(&wb->work_lock);
}
141
142static void finish_writeback_work(struct bdi_writeback *wb,
143 struct wb_writeback_work *work)
144{
145 struct wb_completion *done = work->done;
146
147 if (work->auto_free)
148 kfree(work);
149 if (done) {
150 wait_queue_head_t *waitq = done->waitq;
151
152
153 if (atomic_dec_and_test(&done->cnt))
154 wake_up_all(waitq);
155 }
156}
157
158static void wb_queue_work(struct bdi_writeback *wb,
159 struct wb_writeback_work *work)
160{
161 trace_writeback_queue(wb, work);
162
163 if (work->done)
164 atomic_inc(&work->done->cnt);
165
166 spin_lock_bh(&wb->work_lock);
167
168 if (test_bit(WB_registered, &wb->state)) {
169 list_add_tail(&work->list, &wb->work_list);
170 mod_delayed_work(bdi_wq, &wb->dwork, 0);
171 } else
172 finish_writeback_work(wb, work);
173
174 spin_unlock_bh(&wb->work_lock);
175}
176
177
178
179
180
181
182
183
184
185
186
187void wb_wait_for_completion(struct wb_completion *done)
188{
189 atomic_dec(&done->cnt);
190 wait_event(*done->waitq, !atomic_read(&done->cnt));
191}
192
193#ifdef CONFIG_CGROUP_WRITEBACK
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
/*
 * Tunables for foreign-writeback detection (see wbc_detach_inode()).
 *
 * Write time is expressed in units of 1 / (1 << WB_FRN_TIME_SHIFT) of the
 * time it takes to write pages at wb->avg_write_bandwidth.
 */
#define WB_FRN_TIME_SHIFT	13	/* fixed-point shift for write time */
#define WB_FRN_TIME_AVG_SHIFT	3	/* running average moves by 1/8 of the delta */
#define WB_FRN_TIME_CUT_DIV	8	/* rounds shorter than avg/8 are ignored */
#define WB_FRN_TIME_PERIOD	(2 * (1 << WB_FRN_TIME_SHIFT))	/* time covered by the whole history */

#define WB_FRN_HIST_SLOTS	16	/* history is a 16-bit mask, one bit per slot */
#define WB_FRN_HIST_UNIT	(WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS)
					/* write time represented by one slot */
#define WB_FRN_HIST_THR_SLOTS	(WB_FRN_HIST_SLOTS / 2)
					/* switch when more than this many slots are foreign */
#define WB_FRN_HIST_MAX_SLOTS	(WB_FRN_HIST_THR_SLOTS / 2 + 1)
					/* cap on slots a single round may fill */
#define WB_FRN_MAX_IN_FLIGHT	1024	/* no new switch is queued once more than this many are in flight */
227
228
229
230
231
/*
 * Number of inode pointers that fit in a 1024-byte allocation after the
 * inode_switch_wbs_context header.
 */
#define WB_MAX_INODES_PER_ISW  ((1024UL - sizeof(struct inode_switch_wbs_context)) \
				/ sizeof(struct inode *))

/* count of switch contexts that have been allocated but not yet freed */
static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
/* workqueue the switch work items run on; created in cgroup_writeback_init() */
static struct workqueue_struct *isw_wq;
237
238void __inode_attach_wb(struct inode *inode, struct page *page)
239{
240 struct backing_dev_info *bdi = inode_to_bdi(inode);
241 struct bdi_writeback *wb = NULL;
242
243 if (inode_cgwb_enabled(inode)) {
244 struct cgroup_subsys_state *memcg_css;
245
246 if (page) {
247 memcg_css = mem_cgroup_css_from_page(page);
248 wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
249 } else {
250
251 memcg_css = task_get_css(current, memory_cgrp_id);
252 wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
253 css_put(memcg_css);
254 }
255 }
256
257 if (!wb)
258 wb = &bdi->wb;
259
260
261
262
263
264 if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
265 wb_put(wb);
266}
267EXPORT_SYMBOL_GPL(__inode_attach_wb);
268
269
270
271
272
273
274
275
276
277static void inode_cgwb_move_to_attached(struct inode *inode,
278 struct bdi_writeback *wb)
279{
280 assert_spin_locked(&wb->list_lock);
281 assert_spin_locked(&inode->i_lock);
282
283 inode->i_state &= ~I_SYNC_QUEUED;
284 if (wb != &wb->bdi->wb)
285 list_move(&inode->i_io_list, &wb->b_attached);
286 else
287 list_del_init(&inode->i_io_list);
288 wb_io_lists_depopulated(wb);
289}
290
291
292
293
294
295
296
297
298
299static struct bdi_writeback *
300locked_inode_to_wb_and_lock_list(struct inode *inode)
301 __releases(&inode->i_lock)
302 __acquires(&wb->list_lock)
303{
304 while (true) {
305 struct bdi_writeback *wb = inode_to_wb(inode);
306
307
308
309
310
311
312
313 wb_get(wb);
314 spin_unlock(&inode->i_lock);
315 spin_lock(&wb->list_lock);
316
317
318 if (likely(wb == inode->i_wb)) {
319 wb_put(wb);
320 return wb;
321 }
322
323 spin_unlock(&wb->list_lock);
324 wb_put(wb);
325 cpu_relax();
326 spin_lock(&inode->i_lock);
327 }
328}
329
330
331
332
333
334
335
336
337static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
338 __acquires(&wb->list_lock)
339{
340 spin_lock(&inode->i_lock);
341 return locked_inode_to_wb_and_lock_list(inode);
342}
343
/*
 * Context for one asynchronous wb switch, handed to
 * inode_switch_wbs_work_fn() through @work.
 */
struct inode_switch_wbs_context {
	struct rcu_work		work;		/* queued with queue_rcu_work() on isw_wq */

	struct bdi_writeback	*new_wb;	/* destination wb; a reference is held until the work finishes */
	struct inode		*inodes[];	/* NULL-terminated array of inodes to switch */
};
358
/*
 * Take @bdi->wb_switch_rwsem for writing.  inode_switch_wbs_work_fn() holds
 * the same semaphore for reading while it switches inodes.
 */
static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi)
{
	down_write(&bdi->wb_switch_rwsem);
}
363
/* Release the write side of @bdi->wb_switch_rwsem. */
static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi)
{
	up_write(&bdi->wb_switch_rwsem);
}
368
/*
 * Move @inode from @old_wb to @new_wb.
 *
 * Called from inode_switch_wbs_work_fn() with both wbs' list_locks held.
 * Takes @inode->i_lock and the mapping's i_pages lock for the duration.
 *
 * Returns true if the inode was switched, false if it was skipped because it
 * is being freed.  I_WB_SWITCH is cleared in either case.
 */
static bool inode_do_switch_wbs(struct inode *inode,
				struct bdi_writeback *old_wb,
				struct bdi_writeback *new_wb)
{
	struct address_space *mapping = inode->i_mapping;
	XA_STATE(xas, &mapping->i_pages, 0);
	struct page *page;
	bool switched = false;

	spin_lock(&inode->i_lock);
	xa_lock_irq(&mapping->i_pages);

	/* an inode on its way out is left alone */
	if (unlikely(inode->i_state & (I_FREEING | I_WILL_FREE)))
		goto skip_switch;

	trace_inode_switch_wbs(inode, old_wb, new_wb);

	/*
	 * Transfer the per-wb page statistics.  Pages tagged dirty are only
	 * counted when PageDirty() confirms it; every page tagged writeback
	 * is expected to really be under writeback.
	 */
	xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_DIRTY) {
		if (PageDirty(page)) {
			dec_wb_stat(old_wb, WB_RECLAIMABLE);
			inc_wb_stat(new_wb, WB_RECLAIMABLE);
		}
	}

	/* rewind the cursor for the second walk */
	xas_set(&xas, 0);
	xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_WRITEBACK) {
		WARN_ON_ONCE(!PageWriteback(page));
		dec_wb_stat(old_wb, WB_WRITEBACK);
		inc_wb_stat(new_wb, WB_WRITEBACK);
	}

	/* the inode counts as one writeback inode on whichever wb owns it */
	if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
		atomic_dec(&old_wb->writeback_inodes);
		atomic_inc(&new_wb->writeback_inodes);
	}

	/* reference that backs the new inode->i_wb association */
	wb_get(new_wb);

	/*
	 * Re-link the inode on @new_wb if it was on an IO list.  A dirty
	 * inode always goes to b_dirty, inserted so that the dirtied_when
	 * ordering of that list is kept; a clean one goes through
	 * inode_cgwb_move_to_attached().
	 */
	if (!list_empty(&inode->i_io_list)) {
		inode->i_wb = new_wb;

		if (inode->i_state & I_DIRTY_ALL) {
			struct inode *pos;

			list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
				if (time_after_eq(inode->dirtied_when,
						  pos->dirtied_when))
					break;
			inode_io_list_move_locked(inode, new_wb,
						  pos->i_io_list.prev);
		} else {
			inode_cgwb_move_to_attached(inode, new_wb);
		}
	} else {
		inode->i_wb = new_wb;
	}

	/* foreign-writeback history starts over on the new wb */
	inode->i_wb_frn_winner = 0;
	inode->i_wb_frn_avg_time = 0;
	inode->i_wb_frn_history = 0;
	switched = true;
skip_switch:
	/*
	 * Release store, so that whoever observes I_WB_SWITCH cleared with a
	 * matching acquire also observes the updates made above.
	 */
	smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);

	xa_unlock_irq(&mapping->i_pages);
	spin_unlock(&inode->i_lock);

	return switched;
}
460
/*
 * Work function that performs the wb switch for every inode listed in an
 * inode_switch_wbs_context, then releases the context and all references
 * taken when the switch was prepared.
 *
 * All inodes in one context are expected to share the same old wb, which is
 * read from the first entry.
 */
static void inode_switch_wbs_work_fn(struct work_struct *work)
{
	struct inode_switch_wbs_context *isw =
		container_of(to_rcu_work(work), struct inode_switch_wbs_context, work);
	struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]);
	struct bdi_writeback *old_wb = isw->inodes[0]->i_wb;
	struct bdi_writeback *new_wb = isw->new_wb;
	unsigned long nr_switched = 0;
	struct inode **inodep;

	/* excludes holders of the write side of wb_switch_rwsem */
	down_read(&bdi->wb_switch_rwsem);

	/*
	 * Both list_locks are needed.  They are always taken in address
	 * order so that two switches in opposite directions cannot deadlock.
	 */
	if (old_wb < new_wb) {
		spin_lock(&old_wb->list_lock);
		spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
	} else {
		spin_lock(&new_wb->list_lock);
		spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
	}

	/* the array is NULL-terminated */
	for (inodep = isw->inodes; *inodep; inodep++) {
		WARN_ON_ONCE((*inodep)->i_wb != old_wb);
		if (inode_do_switch_wbs(*inodep, old_wb, new_wb))
			nr_switched++;
	}

	spin_unlock(&new_wb->list_lock);
	spin_unlock(&old_wb->list_lock);

	up_read(&bdi->wb_switch_rwsem);

	if (nr_switched) {
		wb_wakeup(new_wb);
		/* each switched inode gave up its reference on the old wb */
		wb_put_many(old_wb, nr_switched);
	}

	/* drop the inode references taken by inode_prepare_wbs_switch() */
	for (inodep = isw->inodes; *inodep; inodep++)
		iput(*inodep);
	wb_put(new_wb);
	kfree(isw);
	atomic_dec(&isw_nr_in_flight);
}
517
518static bool inode_prepare_wbs_switch(struct inode *inode,
519 struct bdi_writeback *new_wb)
520{
521
522
523
524
525
526
527 smp_mb();
528
529 if (IS_DAX(inode))
530 return false;
531
532
533 spin_lock(&inode->i_lock);
534 if (!(inode->i_sb->s_flags & SB_ACTIVE) ||
535 inode->i_state & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) ||
536 inode_to_wb(inode) == new_wb) {
537 spin_unlock(&inode->i_lock);
538 return false;
539 }
540 inode->i_state |= I_WB_SWITCH;
541 __iget(inode);
542 spin_unlock(&inode->i_lock);
543
544 return true;
545}
546
547
548
549
550
551
552
553
554
/*
 * Start an asynchronous switch of @inode to the wb of the memcg whose css id
 * is @new_wb_id.  Best effort: the function returns silently if a switch is
 * already pending, too many switches are in flight, allocation fails, or the
 * target memcg/wb cannot be found or pinned.
 */
static void inode_switch_wbs(struct inode *inode, int new_wb_id)
{
	struct backing_dev_info *bdi = inode_to_bdi(inode);
	struct cgroup_subsys_state *memcg_css;
	struct inode_switch_wbs_context *isw;

	/* unlocked early-out; rechecked under i_lock in inode_prepare_wbs_switch() */
	if (inode->i_state & I_WB_SWITCH)
		return;

	/* throttle the number of outstanding switches */
	if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT)
		return;

	/* two slots: the inode plus the NULL terminator (kzalloc zeroes it) */
	isw = kzalloc(struct_size(isw, inodes, 2), GFP_ATOMIC);
	if (!isw)
		return;

	atomic_inc(&isw_nr_in_flight);

	/* look up the target css under RCU and pin it */
	rcu_read_lock();
	memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys);
	if (memcg_css && !css_tryget(memcg_css))
		memcg_css = NULL;
	rcu_read_unlock();
	if (!memcg_css)
		goto out_free;

	isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
	css_put(memcg_css);
	if (!isw->new_wb)
		goto out_free;

	if (!inode_prepare_wbs_switch(inode, isw->new_wb))
		goto out_free;

	isw->inodes[0] = inode;

	/*
	 * From here on the work function owns @isw, the inode reference and
	 * the new_wb reference.  The work runs after an RCU grace period.
	 */
	INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
	queue_rcu_work(isw_wq, &isw->work);
	return;

out_free:
	atomic_dec(&isw_nr_in_flight);
	if (isw->new_wb)
		wb_put(isw->new_wb);
	kfree(isw);
}
610
611
612
613
614
615
616
617
618
/*
 * Queue a switch of the inodes on @wb's b_attached list to the wb of the
 * nearest ancestor memcg for which a wb can be obtained, falling back to the
 * bdi's embedded wb.
 *
 * At most WB_MAX_INODES_PER_ISW - 1 inodes are handled per call.  Returns
 * true if that limit was hit, i.e. the caller should call again to process
 * the rest; false otherwise (including on allocation failure).
 */
bool cleanup_offline_cgwb(struct bdi_writeback *wb)
{
	struct cgroup_subsys_state *memcg_css;
	struct inode_switch_wbs_context *isw;
	struct inode *inode;
	int nr;
	bool restart = false;

	isw = kzalloc(struct_size(isw, inodes, WB_MAX_INODES_PER_ISW),
		      GFP_KERNEL);
	if (!isw)
		return restart;

	atomic_inc(&isw_nr_in_flight);

	/* walk up the memcg hierarchy until a wb can be obtained */
	for (memcg_css = wb->memcg_css->parent; memcg_css;
	     memcg_css = memcg_css->parent) {
		isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL);
		if (isw->new_wb)
			break;
	}
	/*
	 * NOTE(review): no reference is taken on the embedded wb here although
	 * it is wb_put() later; presumably get/put are no-ops for it - confirm.
	 */
	if (unlikely(!isw->new_wb))
		isw->new_wb = &wb->bdi->wb;

	nr = 0;
	spin_lock(&wb->list_lock);
	list_for_each_entry(inode, &wb->b_attached, i_io_list) {
		if (!inode_prepare_wbs_switch(inode, isw->new_wb))
			continue;

		isw->inodes[nr++] = inode;

		/* keep the last slot zero as the array terminator */
		if (nr >= WB_MAX_INODES_PER_ISW - 1) {
			restart = true;
			break;
		}
	}
	spin_unlock(&wb->list_lock);

	/* nothing could be claimed: undo the setup */
	if (nr == 0) {
		atomic_dec(&isw_nr_in_flight);
		wb_put(isw->new_wb);
		kfree(isw);
		return restart;
	}

	/* the work function takes over @isw and the references it holds */
	INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
	queue_rcu_work(isw_wq, &isw->work);

	return restart;
}
677
678
679
680
681
682
683
684
685
686
687
/*
 * Bind @wbc to @inode's current wb for the duration of a writeback pass and
 * release @inode->i_lock, which the caller must hold on entry.
 *
 * If cgroup writeback is not enabled for @inode, only the lock is released.
 * Otherwise the foreign-writeback accounting fields of @wbc are reset and a
 * reference on the wb is taken; wbc_detach_inode() drops it.
 */
void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
				 struct inode *inode)
{
	if (!inode_cgwb_enabled(inode)) {
		spin_unlock(&inode->i_lock);
		return;
	}

	wbc->wb = inode_to_wb(inode);
	wbc->inode = inode;

	wbc->wb_id = wbc->wb->memcg_css->id;
	/* the previous round's winner is this round's "last" candidate */
	wbc->wb_lcand_id = inode->i_wb_frn_winner;
	wbc->wb_tcand_id = 0;
	wbc->wb_bytes = 0;
	wbc->wb_lcand_bytes = 0;
	wbc->wb_tcand_bytes = 0;

	wb_get(wbc->wb);
	spin_unlock(&inode->i_lock);

	/*
	 * The wb is dying while its memcg is not: request a switch to the wb
	 * currently associated with the same memcg id.
	 */
	if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css)))
		inode_switch_wbs(inode, wbc->wb_id);
}
EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode);
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
/*
 * End a writeback pass started with wbc_attach_and_unlock_inode().
 *
 * Uses the per-cgroup byte counts accumulated in @wbc to decide which cgroup
 * "won" this round, folds the result into the inode's foreign-writeback
 * history and, if the history shows mostly foreign rounds, requests a switch
 * of the inode to the winner's wb.  Finally drops the wb reference and
 * clears wbc->wb.  No-op if @wbc has no wb attached.
 */
void wbc_detach_inode(struct writeback_control *wbc)
{
	struct bdi_writeback *wb = wbc->wb;
	struct inode *inode = wbc->inode;
	unsigned long avg_time, max_bytes, max_time;
	u16 history;
	int max_id;

	if (!wb)
		return;

	history = inode->i_wb_frn_history;
	avg_time = inode->i_wb_frn_avg_time;

	/* winner: current wb, last-round candidate or this-round candidate */
	if (wbc->wb_bytes >= wbc->wb_lcand_bytes &&
	    wbc->wb_bytes >= wbc->wb_tcand_bytes) {
		max_id = wbc->wb_id;
		max_bytes = wbc->wb_bytes;
	} else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) {
		max_id = wbc->wb_lcand_id;
		max_bytes = wbc->wb_lcand_bytes;
	} else {
		max_id = wbc->wb_tcand_id;
		max_bytes = wbc->wb_tcand_bytes;
	}

	/*
	 * Convert the winner's bytes to write time: pages, scaled by
	 * 1 << WB_FRN_TIME_SHIFT, divided by the wb's average bandwidth.
	 * NOTE(review): divides by wb->avg_write_bandwidth; presumably never
	 * zero here - confirm.
	 */
	max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT,
				wb->avg_write_bandwidth);
	/* running average; the first sample is taken as-is */
	if (avg_time)
		avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) -
			    (avg_time >> WB_FRN_TIME_AVG_SHIFT);
	else
		avg_time = max_time;

	/* rounds much shorter than the average do not update the history */
	if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) {
		int slots;

		/*
		 * The round occupies a number of history slots proportional to
		 * its write time, capped at WB_FRN_HIST_MAX_SLOTS.  The slots
		 * are shifted in as 1s if the winner is foreign, 0s otherwise.
		 */
		slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT),
			    (unsigned long)WB_FRN_HIST_MAX_SLOTS);
		history <<= slots;
		if (wbc->wb_id != max_id)
			history |= (1U << slots) - 1;

		if (history)
			trace_inode_foreign_history(inode, wbc, history);

		/* more than half of the history is foreign: switch */
		if (hweight32(history) > WB_FRN_HIST_THR_SLOTS)
			inode_switch_wbs(inode, max_id);
	}

	/* remember the results for the next round; avg_time is clamped to 16 bits */
	inode->i_wb_frn_winner = max_id;
	inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX);
	inode->i_wb_frn_history = history;

	wb_put(wbc->wb);
	wbc->wb = NULL;
}
EXPORT_SYMBOL_GPL(wbc_detach_inode);
843
844
845
846
847
848
849
850
851
852
853
854void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
855 size_t bytes)
856{
857 struct cgroup_subsys_state *css;
858 int id;
859
860
861
862
863
864
865
866 if (!wbc->wb || wbc->no_cgroup_owner)
867 return;
868
869 css = mem_cgroup_css_from_page(page);
870
871 if (!(css->flags & CSS_ONLINE))
872 return;
873
874 id = css->id;
875
876 if (id == wbc->wb_id) {
877 wbc->wb_bytes += bytes;
878 return;
879 }
880
881 if (id == wbc->wb_lcand_id)
882 wbc->wb_lcand_bytes += bytes;
883
884
885 if (!wbc->wb_tcand_bytes)
886 wbc->wb_tcand_id = id;
887 if (id == wbc->wb_tcand_id)
888 wbc->wb_tcand_bytes += bytes;
889 else
890 wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
891}
892EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner);
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910int inode_congested(struct inode *inode, int cong_bits)
911{
912
913
914
915
916 if (inode && inode_to_wb_is_valid(inode)) {
917 struct bdi_writeback *wb;
918 struct wb_lock_cookie lock_cookie = {};
919 bool congested;
920
921 wb = unlocked_inode_to_wb_begin(inode, &lock_cookie);
922 congested = wb_congested(wb, cong_bits);
923 unlocked_inode_to_wb_end(inode, &lock_cookie);
924 return congested;
925 }
926
927 return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
928}
929EXPORT_SYMBOL_GPL(inode_congested);
930
931
932
933
934
935
936
937
938
939
940static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
941{
942 unsigned long this_bw = wb->avg_write_bandwidth;
943 unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
944
945 if (nr_pages == LONG_MAX)
946 return LONG_MAX;
947
948
949
950
951
952
953 if (!tot_bw || this_bw >= tot_bw)
954 return nr_pages;
955 else
956 return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw);
957}
958
959
960
961
962
963
964
965
966
967
968
969
/*
 * Distribute @base_work over all wbs of @bdi that have something to write.
 *
 * Each eligible wb gets its own copy of @base_work with nr_pages scaled by
 * wb_split_bdi_pages().  wbs with writeback already in progress are skipped
 * when @skip_if_busy is set.  May sleep.
 */
static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
				  struct wb_writeback_work *base_work,
				  bool skip_if_busy)
{
	struct bdi_writeback *last_wb = NULL;
	/* pseudo-entry for the list head, so that "continue" starts at the first wb */
	struct bdi_writeback *wb = list_entry(&bdi->wb_list,
					      struct bdi_writeback, bdi_node);

	might_sleep();
restart:
	rcu_read_lock();
	list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
		DEFINE_WB_COMPLETION(fallback_work_done, bdi);
		struct wb_writeback_work fallback_work;
		struct wb_writeback_work *work;
		long nr_pages;

		/* the pin taken before the previous restart is no longer needed */
		if (last_wb) {
			wb_put(last_wb);
			last_wb = NULL;
		}

		/*
		 * Skip wbs without dirty IO, unless this is not WB_SYNC_NONE
		 * and the wb has entries on b_dirty_time.
		 */
		if (!wb_has_dirty_io(wb) &&
		    (base_work->sync_mode == WB_SYNC_NONE ||
		     list_empty(&wb->b_dirty_time)))
			continue;
		if (skip_if_busy && writeback_in_progress(wb))
			continue;

		nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);

		/* atomic allocation: we are inside an RCU read-side section */
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
		if (work) {
			*work = *base_work;
			work->nr_pages = nr_pages;
			work->auto_free = 1;
			wb_queue_work(wb, work);
			continue;
		}

		/* allocation failed: use the on-stack item and wait for it */
		work = &fallback_work;
		*work = *base_work;
		work->nr_pages = nr_pages;
		work->auto_free = 0;
		work->done = &fallback_work_done;

		wb_queue_work(wb, work);

		/*
		 * Pin @wb so that the iteration can be resumed from it after
		 * the RCU read lock has been dropped for the wait.
		 */
		wb_get(wb);
		last_wb = wb;

		rcu_read_unlock();
		wb_wait_for_completion(&fallback_work_done);
		goto restart;
	}
	rcu_read_unlock();

	if (last_wb)
		wb_put(last_wb);
}
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
/*
 * Queue WB_SYNC_NONE writeback on the wb identified by @bdi_id and
 * @memcg_id.
 *
 * @reason is recorded in the work item; @done, if non-NULL, is the
 * completion the work item is counted on.
 *
 * Returns 0 on success, -ENOENT if the bdi, the memcg or the wb cannot be
 * found, or -ENOMEM if the work item cannot be allocated.
 */
int cgroup_writeback_by_id(u64 bdi_id, int memcg_id,
			   enum wb_reason reason, struct wb_completion *done)
{
	struct backing_dev_info *bdi;
	struct cgroup_subsys_state *memcg_css;
	struct bdi_writeback *wb;
	struct wb_writeback_work *work;
	unsigned long dirty;
	int ret;

	/* look up the bdi and the memcg css, pinning both */
	bdi = bdi_get_by_id(bdi_id);
	if (!bdi)
		return -ENOENT;

	rcu_read_lock();
	memcg_css = css_from_id(memcg_id, &memory_cgrp_subsys);
	if (memcg_css && !css_tryget(memcg_css))
		memcg_css = NULL;
	rcu_read_unlock();
	if (!memcg_css) {
		ret = -ENOENT;
		goto out_bdi_put;
	}

	/* lookup only: a missing wb is reported, not created */
	wb = wb_get_lookup(bdi, memcg_css);
	if (!wb) {
		ret = -ENOENT;
		goto out_css_put;
	}

	/* page budget: the memcg's dirty file pages plus 25% */
	dirty = memcg_page_state(mem_cgroup_from_css(memcg_css), NR_FILE_DIRTY);
	dirty = dirty * 10 / 8;

	/* non-blocking allocation; failure is reported to the caller */
	work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN);
	if (work) {
		work->nr_pages = dirty;
		work->sync_mode = WB_SYNC_NONE;
		work->range_cyclic = 1;
		work->reason = reason;
		work->done = done;
		work->auto_free = 1;
		wb_queue_work(wb, work);
		ret = 0;
	} else {
		ret = -ENOMEM;
	}

	wb_put(wb);
out_css_put:
	css_put(memcg_css);
out_bdi_put:
	bdi_put(bdi);
	return ret;
}
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129void cgroup_writeback_umount(void)
1130{
1131
1132
1133
1134
1135 smp_mb();
1136
1137 if (atomic_read(&isw_nr_in_flight)) {
1138
1139
1140
1141
1142 rcu_barrier();
1143 flush_workqueue(isw_wq);
1144 }
1145}
1146
1147static int __init cgroup_writeback_init(void)
1148{
1149 isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
1150 if (!isw_wq)
1151 return -ENOMEM;
1152 return 0;
1153}
1154fs_initcall(cgroup_writeback_init);
1155
1156#else
1157
/* Without CONFIG_CGROUP_WRITEBACK there is no wb switching to exclude. */
static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
1160
/*
 * !CONFIG_CGROUP_WRITEBACK variant: a clean inode is simply unlinked from
 * its IO list.  I_SYNC_QUEUED is cleared and @wb's dirty-IO state is
 * re-evaluated.  The caller must hold @wb->list_lock and @inode->i_lock.
 */
static void inode_cgwb_move_to_attached(struct inode *inode,
					struct bdi_writeback *wb)
{
	assert_spin_locked(&wb->list_lock);
	assert_spin_locked(&inode->i_lock);

	inode->i_state &= ~I_SYNC_QUEUED;
	list_del_init(&inode->i_io_list);
	wb_io_lists_depopulated(wb);
}
1171
/*
 * !CONFIG_CGROUP_WRITEBACK variant: entered with @inode->i_lock held;
 * releases it and returns @inode's wb with its list_lock held.  No retry
 * loop is used here, unlike the cgroup-writeback variant.
 */
static struct bdi_writeback *
locked_inode_to_wb_and_lock_list(struct inode *inode)
	__releases(&inode->i_lock)
	__acquires(&wb->list_lock)
{
	struct bdi_writeback *wb = inode_to_wb(inode);

	spin_unlock(&inode->i_lock);
	spin_lock(&wb->list_lock);
	return wb;
}
1183
/*
 * !CONFIG_CGROUP_WRITEBACK variant: returns @inode's wb with its list_lock
 * held.  i_lock is not taken.
 */
static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
	__acquires(&wb->list_lock)
{
	struct bdi_writeback *wb = inode_to_wb(inode);

	spin_lock(&wb->list_lock);
	return wb;
}
1192
/*
 * !CONFIG_CGROUP_WRITEBACK variant: the budget is not split, so @nr_pages is
 * returned unchanged.
 */
static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
{
	return nr_pages;
}
1197
1198static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
1199 struct wb_writeback_work *base_work,
1200 bool skip_if_busy)
1201{
1202 might_sleep();
1203
1204 if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) {
1205 base_work->auto_free = 0;
1206 wb_queue_work(&bdi->wb, base_work);
1207 }
1208}
1209
1210#endif
1211
1212
1213
1214
1215
1216static unsigned long get_nr_dirty_pages(void)
1217{
1218 return global_node_page_state(NR_FILE_DIRTY) +
1219 get_nr_dirty_inodes();
1220}
1221
1222static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason)
1223{
1224 if (!wb_has_dirty_io(wb))
1225 return;
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235 if (test_bit(WB_start_all, &wb->state) ||
1236 test_and_set_bit(WB_start_all, &wb->state))
1237 return;
1238
1239 wb->start_all_reason = reason;
1240 wb_wakeup(wb);
1241}
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
/*
 * Wake up @wb's writeback worker.  No work item is queued here; only the
 * tracepoint is emitted and the worker is kicked.
 */
void wb_start_background_writeback(struct bdi_writeback *wb)
{
	trace_writeback_wake_background(wb);
	wb_wakeup(wb);
}
1262
1263
1264
1265
/*
 * Remove @inode from whichever writeback IO list it is on.
 *
 * Takes the wb's list_lock and then @inode->i_lock, clears I_SYNC_QUEUED,
 * unlinks the inode and re-evaluates the wb's dirty-IO state.
 */
void inode_io_list_del(struct inode *inode)
{
	struct bdi_writeback *wb;

	wb = inode_to_wb_and_lock_list(inode);
	spin_lock(&inode->i_lock);

	inode->i_state &= ~I_SYNC_QUEUED;
	list_del_init(&inode->i_io_list);
	wb_io_lists_depopulated(wb);

	spin_unlock(&inode->i_lock);
	spin_unlock(&wb->list_lock);
}
EXPORT_SYMBOL(inode_io_list_del);
1281
1282
1283
1284
1285void sb_mark_inode_writeback(struct inode *inode)
1286{
1287 struct super_block *sb = inode->i_sb;
1288 unsigned long flags;
1289
1290 if (list_empty(&inode->i_wb_list)) {
1291 spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
1292 if (list_empty(&inode->i_wb_list)) {
1293 list_add_tail(&inode->i_wb_list, &sb->s_inodes_wb);
1294 trace_sb_mark_inode_writeback(inode);
1295 }
1296 spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
1297 }
1298}
1299
1300
1301
1302
1303void sb_clear_inode_writeback(struct inode *inode)
1304{
1305 struct super_block *sb = inode->i_sb;
1306 unsigned long flags;
1307
1308 if (!list_empty(&inode->i_wb_list)) {
1309 spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
1310 if (!list_empty(&inode->i_wb_list)) {
1311 list_del_init(&inode->i_wb_list);
1312 trace_sb_clear_inode_writeback(inode);
1313 }
1314 spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
1315 }
1316}
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb)
1328{
1329 assert_spin_locked(&inode->i_lock);
1330
1331 if (!list_empty(&wb->b_dirty)) {
1332 struct inode *tail;
1333
1334 tail = wb_inode(wb->b_dirty.next);
1335 if (time_before(inode->dirtied_when, tail->dirtied_when))
1336 inode->dirtied_when = jiffies;
1337 }
1338 inode_io_list_move_locked(inode, wb, &wb->b_dirty);
1339 inode->i_state &= ~I_SYNC_QUEUED;
1340}
1341
/*
 * Wrapper around redirty_tail_locked() for callers that do not hold
 * @inode->i_lock; the lock is taken and released here.
 */
static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
{
	spin_lock(&inode->i_lock);
	redirty_tail_locked(inode, wb);
	spin_unlock(&inode->i_lock);
}
1348
1349
1350
1351
/*
 * Move @inode to @wb->b_more_io.  queue_io() splices b_more_io back into
 * b_io.  The caller must hold @wb->list_lock.
 */
static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
{
	inode_io_list_move_locked(inode, wb, &wb->b_more_io);
}
1356
/*
 * Finish a writeback pass on @inode: clear I_SYNC, offer the inode to the
 * LRU and wake up anyone waiting on the __I_SYNC bit.
 *
 * NOTE(review): modifies i_state, so callers presumably hold
 * @inode->i_lock - writeback_single_inode() does; confirm for others.
 */
static void inode_sync_complete(struct inode *inode)
{
	inode->i_state &= ~I_SYNC;

	inode_add_lru(inode);

	/* the cleared flag must be visible before waiters are woken */
	smp_mb();
	wake_up_bit(&inode->i_state, __I_SYNC);
}
1366
1367static bool inode_dirtied_after(struct inode *inode, unsigned long t)
1368{
1369 bool ret = time_after(inode->dirtied_when, t);
1370#ifndef CONFIG_64BIT
1371
1372
1373
1374
1375
1376
1377 ret = ret && time_before_eq(inode->dirtied_when, jiffies);
1378#endif
1379 return ret;
1380}
1381
/* NOTE(review): not referenced in the visible part of this file - confirm it is still used. */
#define EXPIRE_DIRTY_ATIME 0x0001
1383
1384
1385
1386
1387
1388static int move_expired_inodes(struct list_head *delaying_queue,
1389 struct list_head *dispatch_queue,
1390 unsigned long dirtied_before)
1391{
1392 LIST_HEAD(tmp);
1393 struct list_head *pos, *node;
1394 struct super_block *sb = NULL;
1395 struct inode *inode;
1396 int do_sb_sort = 0;
1397 int moved = 0;
1398
1399 while (!list_empty(delaying_queue)) {
1400 inode = wb_inode(delaying_queue->prev);
1401 if (inode_dirtied_after(inode, dirtied_before))
1402 break;
1403 list_move(&inode->i_io_list, &tmp);
1404 moved++;
1405 spin_lock(&inode->i_lock);
1406 inode->i_state |= I_SYNC_QUEUED;
1407 spin_unlock(&inode->i_lock);
1408 if (sb_is_blkdev_sb(inode->i_sb))
1409 continue;
1410 if (sb && sb != inode->i_sb)
1411 do_sb_sort = 1;
1412 sb = inode->i_sb;
1413 }
1414
1415
1416 if (!do_sb_sort) {
1417 list_splice(&tmp, dispatch_queue);
1418 goto out;
1419 }
1420
1421
1422 while (!list_empty(&tmp)) {
1423 sb = wb_inode(tmp.prev)->i_sb;
1424 list_for_each_prev_safe(pos, node, &tmp) {
1425 inode = wb_inode(pos);
1426 if (inode->i_sb == sb)
1427 list_move(&inode->i_io_list, dispatch_queue);
1428 }
1429 }
1430out:
1431 return moved;
1432}
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work,
1446 unsigned long dirtied_before)
1447{
1448 int moved;
1449 unsigned long time_expire_jif = dirtied_before;
1450
1451 assert_spin_locked(&wb->list_lock);
1452 list_splice_init(&wb->b_more_io, &wb->b_io);
1453 moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, dirtied_before);
1454 if (!work->for_sync)
1455 time_expire_jif = jiffies - dirtytime_expire_interval * HZ;
1456 moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
1457 time_expire_jif);
1458 if (moved)
1459 wb_io_lists_populated(wb);
1460 trace_writeback_queue_io(wb, work, dirtied_before, moved);
1461}
1462
1463static int write_inode(struct inode *inode, struct writeback_control *wbc)
1464{
1465 int ret;
1466
1467 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) {
1468 trace_writeback_write_inode_start(inode, wbc);
1469 ret = inode->i_sb->s_op->write_inode(inode, wbc);
1470 trace_writeback_write_inode(inode, wbc);
1471 return ret;
1472 }
1473 return 0;
1474}
1475
1476
1477
1478
1479
/*
 * Wait until I_SYNC is clear on @inode.  Called with @inode->i_lock held;
 * the lock is dropped while sleeping and held again on return.
 */
static void __inode_wait_for_writeback(struct inode *inode)
	__releases(inode->i_lock)
	__acquires(inode->i_lock)
{
	DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
	wait_queue_head_t *wqh;

	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
	/* recheck under i_lock after every wakeup */
	while (inode->i_state & I_SYNC) {
		spin_unlock(&inode->i_lock);
		__wait_on_bit(wqh, &wq, bit_wait,
			      TASK_UNINTERRUPTIBLE);
		spin_lock(&inode->i_lock);
	}
}
1495
1496
1497
1498
/*
 * Wait until I_SYNC is clear on @inode.  Must be called without
 * @inode->i_lock held; the lock is taken and released here.
 */
void inode_wait_for_writeback(struct inode *inode)
{
	spin_lock(&inode->i_lock);
	__inode_wait_for_writeback(inode);
	spin_unlock(&inode->i_lock);
}
1505
1506
1507
1508
1509
1510
/*
 * Sleep once if @inode is under writeback (I_SYNC set).  Called with
 * @inode->i_lock held and returns with it released.  Unlike
 * __inode_wait_for_writeback(), the condition is not rechecked after the
 * wakeup.
 */
static void inode_sleep_on_writeback(struct inode *inode)
	__releases(inode->i_lock)
{
	DEFINE_WAIT(wait);
	wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
	int sleep;

	/* get on the waitqueue before sampling the flag, so no wakeup is missed */
	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
	sleep = inode->i_state & I_SYNC;
	spin_unlock(&inode->i_lock);
	if (sleep)
		schedule();
	finish_wait(wqh, &wait);
}
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
1535 struct writeback_control *wbc)
1536{
1537 if (inode->i_state & I_FREEING)
1538 return;
1539
1540
1541
1542
1543
1544
1545 if ((inode->i_state & I_DIRTY) &&
1546 (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
1547 inode->dirtied_when = jiffies;
1548
1549 if (wbc->pages_skipped) {
1550
1551
1552
1553
1554 redirty_tail_locked(inode, wb);
1555 return;
1556 }
1557
1558 if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
1559
1560
1561
1562
1563 if (wbc->nr_to_write <= 0) {
1564
1565 requeue_io(inode, wb);
1566 } else {
1567
1568
1569
1570
1571
1572
1573
1574 redirty_tail_locked(inode, wb);
1575 }
1576 } else if (inode->i_state & I_DIRTY) {
1577
1578
1579
1580
1581
1582 redirty_tail_locked(inode, wb);
1583 } else if (inode->i_state & I_DIRTY_TIME) {
1584 inode->dirtied_when = jiffies;
1585 inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
1586 inode->i_state &= ~I_SYNC_QUEUED;
1587 } else {
1588
1589 inode_cgwb_move_to_attached(inode, wb);
1590 }
1591}
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604static int
/*
 * Write out @inode's dirty pages and, if needed, the inode itself.
 *
 * The caller must have set I_SYNC on the inode.  The inode is not moved
 * between writeback lists here.  Returns the first error encountered, or 0.
 */
static int
__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
{
	struct address_space *mapping = inode->i_mapping;
	long nr_to_write = wbc->nr_to_write;
	unsigned dirty;
	int ret;

	WARN_ON(!(inode->i_state & I_SYNC));

	trace_writeback_single_inode_start(inode, wbc, nr_to_write);

	ret = do_writepages(mapping, wbc);

	/*
	 * For data-integrity writeback wait for the pages to hit storage,
	 * except when for_sync is set.
	 */
	if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
		int err = filemap_fdatawait(mapping);
		if (ret == 0)
			ret = err;
	}

	/*
	 * Timestamp-only dirtiness is turned into regular inode dirtiness for
	 * WB_SYNC_ALL, or once dirtytime_expire_interval has passed.
	 */
	if ((inode->i_state & I_DIRTY_TIME) &&
	    (wbc->sync_mode == WB_SYNC_ALL ||
	     time_after(jiffies, inode->dirtied_time_when +
			dirtytime_expire_interval * HZ))) {
		trace_writeback_lazytime(inode);
		mark_inode_dirty_sync(inode);
	}

	/* grab the dirty flags and clear them before writing the inode */
	spin_lock(&inode->i_lock);
	dirty = inode->i_state & I_DIRTY;
	inode->i_state &= ~dirty;

	/*
	 * Order the flag clearing above against the page-tag check below.
	 * NOTE(review): presumably pairs with a barrier on the page-dirtying
	 * side - confirm.
	 */
	smp_mb();

	/* pages may still be (or have become) dirty: keep the inode marked */
	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
		inode->i_state |= I_DIRTY_PAGES;

	spin_unlock(&inode->i_lock);

	/* write the inode only if something besides its pages was dirty */
	if (dirty & ~I_DIRTY_PAGES) {
		int err = write_inode(inode, wbc);
		if (ret == 0)
			ret = err;
	}
	trace_writeback_single_inode(inode, wbc, nr_to_write);
	return ret;
}
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691static int writeback_single_inode(struct inode *inode,
1692 struct writeback_control *wbc)
1693{
1694 struct bdi_writeback *wb;
1695 int ret = 0;
1696
1697 spin_lock(&inode->i_lock);
1698 if (!atomic_read(&inode->i_count))
1699 WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
1700 else
1701 WARN_ON(inode->i_state & I_WILL_FREE);
1702
1703 if (inode->i_state & I_SYNC) {
1704
1705
1706
1707
1708
1709
1710 if (wbc->sync_mode != WB_SYNC_ALL)
1711 goto out;
1712 __inode_wait_for_writeback(inode);
1713 }
1714 WARN_ON(inode->i_state & I_SYNC);
1715
1716
1717
1718
1719
1720
1721
1722 if (!(inode->i_state & I_DIRTY_ALL) &&
1723 (wbc->sync_mode != WB_SYNC_ALL ||
1724 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
1725 goto out;
1726 inode->i_state |= I_SYNC;
1727 wbc_attach_and_unlock_inode(wbc, inode);
1728
1729 ret = __writeback_single_inode(inode, wbc);
1730
1731 wbc_detach_inode(wbc);
1732
1733 wb = inode_to_wb_and_lock_list(inode);
1734 spin_lock(&inode->i_lock);
1735
1736
1737
1738
1739
1740 if (!(inode->i_state & I_DIRTY_ALL))
1741 inode_cgwb_move_to_attached(inode, wb);
1742 spin_unlock(&wb->list_lock);
1743 inode_sync_complete(inode);
1744out:
1745 spin_unlock(&inode->i_lock);
1746 return ret;
1747}
1748
1749static long writeback_chunk_size(struct bdi_writeback *wb,
1750 struct wb_writeback_work *work)
1751{
1752 long pages;
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767 if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
1768 pages = LONG_MAX;
1769 else {
1770 pages = min(wb->avg_write_bandwidth / 2,
1771 global_wb_domain.dirty_limit / DIRTY_SCOPE);
1772 pages = min(pages, work->nr_pages);
1773 pages = round_down(pages + MIN_WRITEBACK_PAGES,
1774 MIN_WRITEBACK_PAGES);
1775 }
1776
1777 return pages;
1778}
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
/*
 * Write back the inodes queued on wb->b_io that belong to @sb.
 *
 * Entered and left with wb->list_lock held; the lock is dropped while an
 * inode is actually being written.  work->nr_pages is decremented by the
 * number of pages consumed from each inode's chunk.
 *
 * Returns a progress count: the pages written plus one for every inode
 * that had no I_DIRTY_ALL bit left after its writeback.
 */
static long writeback_sb_inodes(struct super_block *sb,
				struct bdi_writeback *wb,
				struct wb_writeback_work *work)
{
	/* Per-inode control block; all mode flags are copied from @work. */
	struct writeback_control wbc = {
		.sync_mode		= work->sync_mode,
		.tagged_writepages	= work->tagged_writepages,
		.for_kupdate		= work->for_kupdate,
		.for_background		= work->for_background,
		.for_sync		= work->for_sync,
		.range_cyclic		= work->range_cyclic,
		.range_start		= 0,
		.range_end		= LLONG_MAX,
	};
	unsigned long start_time = jiffies;
	long write_chunk;
	long wrote = 0;

	while (!list_empty(&wb->b_io)) {
		/* Inodes are consumed from the tail of b_io. */
		struct inode *inode = wb_inode(wb->b_io.prev);
		struct bdi_writeback *tmp_wb;

		if (inode->i_sb != sb) {
			if (work->sb) {
				/*
				 * The work is restricted to one superblock:
				 * put the foreign inode back via
				 * redirty_tail() and look at the next one.
				 */
				redirty_tail(inode, wb);
				continue;
			}

			/*
			 * Unrestricted work: stop here so that the caller
			 * can deal with the other superblock (see
			 * __writeback_inodes_wb()).
			 */
			break;
		}

		/*
		 * Inodes that are being set up or torn down are not written;
		 * they are redirtied and skipped.
		 */
		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
			redirty_tail_locked(inode, wb);
			spin_unlock(&inode->i_lock);
			continue;
		}
		if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
			/*
			 * Someone else already writes this inode and this is
			 * not an integrity writeback: do not wait, requeue
			 * the inode and go on with the next one.
			 */
			spin_unlock(&inode->i_lock);
			requeue_io(inode, wb);
			trace_writeback_sb_inodes_requeue(inode);
			continue;
		}
		spin_unlock(&wb->list_lock);

		/*
		 * I_SYNC can only still be set here for WB_SYNC_ALL (the
		 * other case was handled above), so wait for the running
		 * writeback and then restart from the list.
		 */
		if (inode->i_state & I_SYNC) {
			/*
			 * NOTE(review): i_lock is not unlocked on this path,
			 * so inode_sleep_on_writeback() presumably drops it
			 * -- confirm against the helper.
			 */
			inode_sleep_on_writeback(inode);
			/* list_lock was dropped above; retake it and retry. */
			spin_lock(&wb->list_lock);
			continue;
		}
		inode->i_state |= I_SYNC;
		wbc_attach_and_unlock_inode(&wbc, inode);

		write_chunk = writeback_chunk_size(wb, work);
		wbc.nr_to_write = write_chunk;
		wbc.pages_skipped = 0;

		/*
		 * Neither wb->list_lock nor inode->i_lock is held here.  The
		 * return value is ignored; progress is derived from
		 * wbc.nr_to_write below.
		 */
		__writeback_single_inode(inode, &wbc);

		wbc_detach_inode(&wbc);
		work->nr_pages -= write_chunk - wbc.nr_to_write;
		wrote += write_chunk - wbc.nr_to_write;

		if (need_resched()) {
			/*
			 * Flush the task's block plug, if there is one,
			 * before giving up the CPU so that queued I/O is
			 * submitted first.
			 */
			if (current->plug)
				blk_flush_plug(current->plug, false);
			cond_resched();
		}

		/*
		 * Look up the inode's wb again; the result may differ from
		 * @wb (handled below).  The helper returns with
		 * tmp_wb->list_lock held.
		 */
		tmp_wb = inode_to_wb_and_lock_list(inode);
		spin_lock(&inode->i_lock);
		/* A fully cleaned inode counts as progress by itself. */
		if (!(inode->i_state & I_DIRTY_ALL))
			wrote++;
		requeue_inode(inode, tmp_wb, &wbc);
		inode_sync_complete(inode);
		spin_unlock(&inode->i_lock);

		/* Get back to holding the list_lock of the wb we iterate. */
		if (unlikely(tmp_wb != wb)) {
			spin_unlock(&tmp_wb->list_lock);
			spin_lock(&wb->list_lock);
		}

		/*
		 * Once there was some progress, bail out when more than
		 * HZ / 10 jiffies have elapsed or the page budget is used
		 * up.
		 */
		if (wrote) {
			if (time_is_before_jiffies(start_time + HZ / 10UL))
				break;
			if (work->nr_pages <= 0)
				break;
		}
	}
	return wrote;
}
1931
1932static long __writeback_inodes_wb(struct bdi_writeback *wb,
1933 struct wb_writeback_work *work)
1934{
1935 unsigned long start_time = jiffies;
1936 long wrote = 0;
1937
1938 while (!list_empty(&wb->b_io)) {
1939 struct inode *inode = wb_inode(wb->b_io.prev);
1940 struct super_block *sb = inode->i_sb;
1941
1942 if (!trylock_super(sb)) {
1943
1944
1945
1946
1947
1948 redirty_tail(inode, wb);
1949 continue;
1950 }
1951 wrote += writeback_sb_inodes(sb, wb, work);
1952 up_read(&sb->s_umount);
1953
1954
1955 if (wrote) {
1956 if (time_is_before_jiffies(start_time + HZ / 10UL))
1957 break;
1958 if (work->nr_pages <= 0)
1959 break;
1960 }
1961 }
1962
1963 return wrote;
1964}
1965
1966static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
1967 enum wb_reason reason)
1968{
1969 struct wb_writeback_work work = {
1970 .nr_pages = nr_pages,
1971 .sync_mode = WB_SYNC_NONE,
1972 .range_cyclic = 1,
1973 .reason = reason,
1974 };
1975 struct blk_plug plug;
1976
1977 blk_start_plug(&plug);
1978 spin_lock(&wb->list_lock);
1979 if (list_empty(&wb->b_io))
1980 queue_io(wb, &work, jiffies);
1981 __writeback_inodes_wb(wb, &work);
1982 spin_unlock(&wb->list_lock);
1983 blk_finish_plug(&plug);
1984
1985 return nr_pages - work.nr_pages;
1986}
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
/*
 * Execute one writeback work item on @wb.
 *
 * Loops, queueing inodes onto wb->b_io and writing them, until the page
 * budget of @work is used up, a background/kupdate work has to make room
 * for other queued work, the background threshold is no longer exceeded,
 * or no progress is possible and nothing is left on b_more_io.
 *
 * Takes wb->list_lock and a block plug internally.  Returns the number of
 * pages by which work->nr_pages was reduced.
 */
static long wb_writeback(struct bdi_writeback *wb,
			 struct wb_writeback_work *work)
{
	long nr_pages = work->nr_pages;
	/* Cut-off passed to queue_io(); adjusted per work type below. */
	unsigned long dirtied_before = jiffies;
	struct inode *inode;
	long progress;
	struct blk_plug plug;

	blk_start_plug(&plug);
	spin_lock(&wb->list_lock);
	for (;;) {
		/* Stop once the page budget is exhausted. */
		if (work->nr_pages <= 0)
			break;

		/*
		 * Background and kupdate writeback yield as soon as any
		 * other work is queued on this wb.
		 */
		if ((work->for_background || work->for_kupdate) &&
		    !list_empty(&wb->work_list))
			break;

		/*
		 * Background writeback ends when the wb is no longer over
		 * the background threshold.
		 */
		if (work->for_background && !wb_over_bg_thresh(wb))
			break;

		/*
		 * kupdate only considers inodes dirtied before "now minus
		 * dirty_expire_interval"; the * 10 turns the value into
		 * milliseconds, so it is apparently kept in centiseconds.
		 * Background work refreshes the cut-off to the current time
		 * on every iteration.
		 */
		if (work->for_kupdate) {
			dirtied_before = jiffies -
				msecs_to_jiffies(dirty_expire_interval * 10);
		} else if (work->for_background)
			dirtied_before = jiffies;

		trace_writeback_start(wb, work);
		/* Refill b_io only when it has been fully consumed. */
		if (list_empty(&wb->b_io))
			queue_io(wb, work, dirtied_before);
		if (work->sb)
			progress = writeback_sb_inodes(work->sb, wb, work);
		else
			progress = __writeback_inodes_wb(wb, work);
		trace_writeback_written(wb, work);

		/*
		 * Something was written or cleaned: go round again.  The
		 * exit conditions at the top of the loop decide when to
		 * stop.
		 */
		if (progress)
			continue;

		/* No progress and nothing parked on b_more_io: give up. */
		if (list_empty(&wb->b_more_io))
			break;

		/*
		 * No progress, but inodes are parked on b_more_io.  Sleep on
		 * the one at the tail instead of busy looping.
		 */
		trace_writeback_wait(wb, work);
		inode = wb_inode(wb->b_more_io.prev);
		spin_lock(&inode->i_lock);
		spin_unlock(&wb->list_lock);
		/*
		 * NOTE(review): i_lock is not unlocked here, so the helper
		 * presumably drops it before returning -- confirm.
		 */
		inode_sleep_on_writeback(inode);
		spin_lock(&wb->list_lock);
	}
	spin_unlock(&wb->list_lock);
	blk_finish_plug(&plug);

	return nr_pages - work->nr_pages;
}
2092
2093
2094
2095
2096static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
2097{
2098 struct wb_writeback_work *work = NULL;
2099
2100 spin_lock_bh(&wb->work_lock);
2101 if (!list_empty(&wb->work_list)) {
2102 work = list_entry(wb->work_list.next,
2103 struct wb_writeback_work, list);
2104 list_del_init(&work->list);
2105 }
2106 spin_unlock_bh(&wb->work_lock);
2107 return work;
2108}
2109
2110static long wb_check_background_flush(struct bdi_writeback *wb)
2111{
2112 if (wb_over_bg_thresh(wb)) {
2113
2114 struct wb_writeback_work work = {
2115 .nr_pages = LONG_MAX,
2116 .sync_mode = WB_SYNC_NONE,
2117 .for_background = 1,
2118 .range_cyclic = 1,
2119 .reason = WB_REASON_BACKGROUND,
2120 };
2121
2122 return wb_writeback(wb, &work);
2123 }
2124
2125 return 0;
2126}
2127
2128static long wb_check_old_data_flush(struct bdi_writeback *wb)
2129{
2130 unsigned long expired;
2131 long nr_pages;
2132
2133
2134
2135
2136 if (!dirty_writeback_interval)
2137 return 0;
2138
2139 expired = wb->last_old_flush +
2140 msecs_to_jiffies(dirty_writeback_interval * 10);
2141 if (time_before(jiffies, expired))
2142 return 0;
2143
2144 wb->last_old_flush = jiffies;
2145 nr_pages = get_nr_dirty_pages();
2146
2147 if (nr_pages) {
2148 struct wb_writeback_work work = {
2149 .nr_pages = nr_pages,
2150 .sync_mode = WB_SYNC_NONE,
2151 .for_kupdate = 1,
2152 .range_cyclic = 1,
2153 .reason = WB_REASON_PERIODIC,
2154 };
2155
2156 return wb_writeback(wb, &work);
2157 }
2158
2159 return 0;
2160}
2161
2162static long wb_check_start_all(struct bdi_writeback *wb)
2163{
2164 long nr_pages;
2165
2166 if (!test_bit(WB_start_all, &wb->state))
2167 return 0;
2168
2169 nr_pages = get_nr_dirty_pages();
2170 if (nr_pages) {
2171 struct wb_writeback_work work = {
2172 .nr_pages = wb_split_bdi_pages(wb, nr_pages),
2173 .sync_mode = WB_SYNC_NONE,
2174 .range_cyclic = 1,
2175 .reason = wb->start_all_reason,
2176 };
2177
2178 nr_pages = wb_writeback(wb, &work);
2179 }
2180
2181 clear_bit(WB_start_all, &wb->state);
2182 return nr_pages;
2183}
2184
2185
2186
2187
2188
2189static long wb_do_writeback(struct bdi_writeback *wb)
2190{
2191 struct wb_writeback_work *work;
2192 long wrote = 0;
2193
2194 set_bit(WB_writeback_running, &wb->state);
2195 while ((work = get_next_work_item(wb)) != NULL) {
2196 trace_writeback_exec(wb, work);
2197 wrote += wb_writeback(wb, work);
2198 finish_writeback_work(wb, work);
2199 }
2200
2201
2202
2203
2204 wrote += wb_check_start_all(wb);
2205
2206
2207
2208
2209 wrote += wb_check_old_data_flush(wb);
2210 wrote += wb_check_background_flush(wb);
2211 clear_bit(WB_writeback_running, &wb->state);
2212
2213 return wrote;
2214}
2215
2216
2217
2218
2219
2220void wb_workfn(struct work_struct *work)
2221{
2222 struct bdi_writeback *wb = container_of(to_delayed_work(work),
2223 struct bdi_writeback, dwork);
2224 long pages_written;
2225
2226 set_worker_desc("flush-%s", bdi_dev_name(wb->bdi));
2227 current->flags |= PF_SWAPWRITE;
2228
2229 if (likely(!current_is_workqueue_rescuer() ||
2230 !test_bit(WB_registered, &wb->state))) {
2231
2232
2233
2234
2235
2236
2237 do {
2238 pages_written = wb_do_writeback(wb);
2239 trace_writeback_pages_written(pages_written);
2240 } while (!list_empty(&wb->work_list));
2241 } else {
2242
2243
2244
2245
2246
2247 pages_written = writeback_inodes_wb(wb, 1024,
2248 WB_REASON_FORKER_THREAD);
2249 trace_writeback_pages_written(pages_written);
2250 }
2251
2252 if (!list_empty(&wb->work_list))
2253 wb_wakeup(wb);
2254 else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
2255 wb_wakeup_delayed(wb);
2256
2257 current->flags &= ~PF_SWAPWRITE;
2258}
2259
2260
2261
2262
2263
2264static void __wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
2265 enum wb_reason reason)
2266{
2267 struct bdi_writeback *wb;
2268
2269 if (!bdi_has_dirty_io(bdi))
2270 return;
2271
2272 list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
2273 wb_start_writeback(wb, reason);
2274}
2275
/*
 * Kick writeback on all wbs of a single @bdi for @reason.  Wraps
 * __wakeup_flusher_threads_bdi() in the RCU read-side critical section
 * that its list walk needs.
 */
void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
				enum wb_reason reason)
{
	rcu_read_lock();
	__wakeup_flusher_threads_bdi(bdi, reason);
	rcu_read_unlock();
}
2283
2284
2285
2286
/*
 * Kick writeback for @reason on every bdi on the global bdi_list.
 */
void wakeup_flusher_threads(enum wb_reason reason)
{
	struct backing_dev_info *bdi;

	/*
	 * Submit any I/O the current task still holds in its block plug
	 * before the flushers are woken up.
	 */
	if (blk_needs_flush_plug(current))
		blk_flush_plug(current->plug, true);

	/* bdi_list is walked under RCU. */
	rcu_read_lock();
	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
		__wakeup_flusher_threads_bdi(bdi, reason);
	rcu_read_unlock();
}
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
/*
 * Self-rearming delayed work that periodically wakes the flushers of wbs
 * holding inodes on their b_dirty_time list.  The handler is declared
 * first because the work item refers to it.
 */
static void wakeup_dirtytime_writeback(struct work_struct *w);
static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback);
2320
2321static void wakeup_dirtytime_writeback(struct work_struct *w)
2322{
2323 struct backing_dev_info *bdi;
2324
2325 rcu_read_lock();
2326 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
2327 struct bdi_writeback *wb;
2328
2329 list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
2330 if (!list_empty(&wb->b_dirty_time))
2331 wb_wakeup(wb);
2332 }
2333 rcu_read_unlock();
2334 schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
2335}
2336
/*
 * Boot-time hook: arm dirtytime_work for the first time, one
 * dirtytime_expire_interval (in seconds, hence * HZ) from now.  Always
 * returns 0.
 */
static int __init start_dirtytime_writeback(void)
{
	schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
	return 0;
}
__initcall(start_dirtytime_writeback);
2343
2344int dirtytime_interval_handler(struct ctl_table *table, int write,
2345 void *buffer, size_t *lenp, loff_t *ppos)
2346{
2347 int ret;
2348
2349 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
2350 if (ret == 0 && write)
2351 mod_delayed_work(system_wq, &dirtytime_work, 0);
2352 return ret;
2353}
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
/**
 * __mark_inode_dirty - record that an inode has been dirtied
 * @inode: inode to mark
 * @flags: combination of I_DIRTY_* bits describing what was dirtied;
 *	   I_DIRTY_TIME may only be passed on its own (see the WARN below)
 *
 * Sets @flags in inode->i_state and, if the inode had no I_DIRTY bit set
 * before, puts it on the b_dirty or b_dirty_time list of its wb and may
 * schedule a delayed flusher wakeup.
 */
void __mark_inode_dirty(struct inode *inode, int flags)
{
	struct super_block *sb = inode->i_sb;
	int dirtytime = 0;

	trace_writeback_mark_inode_dirty(inode, flags);

	if (flags & I_DIRTY_INODE) {
		/*
		 * The inode itself is dirtied: tell the filesystem through
		 * its optional ->dirty_inode() method, passing only the
		 * I_DIRTY_INODE part of @flags.  This happens before any
		 * lock is taken.
		 */
		trace_writeback_dirty_inode_start(inode, flags);
		if (sb->s_op->dirty_inode)
			sb->s_op->dirty_inode(inode, flags & I_DIRTY_INODE);
		trace_writeback_dirty_inode(inode, flags);

		/* I_DIRTY_INODE takes precedence over I_DIRTY_TIME. */
		flags &= ~I_DIRTY_TIME;
	} else {
		/*
		 * Otherwise this is either a pure timestamp update
		 * (I_DIRTY_TIME alone) or a request without I_DIRTY_TIME;
		 * any other mix triggers the warning.
		 */
		dirtytime = flags & I_DIRTY_TIME;
		WARN_ON_ONCE(dirtytime && flags != I_DIRTY_TIME);
	}

	/*
	 * Full barrier before the lockless i_state test below.
	 * __writeback_single_inode() in this file has an smp_mb() between
	 * clearing the dirty bits and testing the mapping's dirty tag; the
	 * two look like a pair -- TODO confirm.
	 */
	smp_mb();

	/*
	 * Lockless fast path: all requested bits are already set, or a
	 * timestamp-only update hits an inode that is dirty anyway.
	 */
	if (((inode->i_state & flags) == flags) ||
	    (dirtytime && (inode->i_state & I_DIRTY_INODE)))
		return;

	/* Slow path: repeat both tests under i_lock. */
	spin_lock(&inode->i_lock);
	if (dirtytime && (inode->i_state & I_DIRTY_INODE))
		goto out_unlock_inode;
	if ((inode->i_state & flags) != flags) {
		/* Remember whether any I_DIRTY bit was set beforehand. */
		const int was_dirty = inode->i_state & I_DIRTY;

		inode_attach_wb(inode, NULL);

		/* I_DIRTY_INODE takes precedence over I_DIRTY_TIME. */
		if (flags & I_DIRTY_INODE)
			inode->i_state &= ~I_DIRTY_TIME;
		inode->i_state |= flags;

		/*
		 * The inode is flagged I_SYNC_QUEUED: the flags are
		 * recorded, but the inode is not moved between lists here.
		 * NOTE(review): presumably the writeback code that owns the
		 * inode requeues it afterwards -- confirm.
		 */
		if (inode->i_state & I_SYNC_QUEUED)
			goto out_unlock_inode;

		/*
		 * Unhashed inodes are not put on a dirty list, except for
		 * block device inodes, which skip this test.
		 */
		if (!S_ISBLK(inode->i_mode)) {
			if (inode_unhashed(inode))
				goto out_unlock_inode;
		}
		/* Nor are inodes that are being freed. */
		if (inode->i_state & I_FREEING)
			goto out_unlock_inode;

		/*
		 * Only an inode that had no I_DIRTY bit set before needs to
		 * be (re)placed on a dirty list.
		 */
		if (!was_dirty) {
			struct bdi_writeback *wb;
			struct list_head *dirty_list;
			bool wakeup_bdi = false;

			/*
			 * Returns with wb->list_lock held.
			 * NOTE(review): i_lock is not unlocked on this path,
			 * so the helper presumably drops it -- confirm.
			 */
			wb = locked_inode_to_wb_and_lock_list(inode);

			inode->dirtied_when = jiffies;
			if (dirtytime)
				inode->dirtied_time_when = jiffies;

			/* Timestamp-only dirt goes onto its own list. */
			if (inode->i_state & I_DIRTY)
				dirty_list = &wb->b_dirty;
			else
				dirty_list = &wb->b_dirty_time;

			wakeup_bdi = inode_io_list_move_locked(inode, wb,
							       dirty_list);

			spin_unlock(&wb->list_lock);
			trace_writeback_dirty_inode_enqueue(inode);

			/*
			 * If the list move asked for a wakeup and the bdi
			 * supports writeback, schedule a delayed run of the
			 * flusher.
			 */
			if (wakeup_bdi &&
			    (wb->bdi->capabilities & BDI_CAP_WRITEBACK))
				wb_wakeup_delayed(wb);
			return;
		}
	}
out_unlock_inode:
	spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(__mark_inode_dirty);
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
/*
 * Wait for writeback to finish on every inode of @sb that is currently on
 * sb->s_inodes_wb.
 *
 * The caller is expected to hold sb->s_umount (checked by the WARN_ON).
 * sb->s_sync_lock is held for the whole walk.
 */
static void wait_sb_inodes(struct super_block *sb)
{
	LIST_HEAD(sync_list);

	/* The caller must hold s_umount. */
	WARN_ON(!rwsem_is_locked(&sb->s_umount));

	mutex_lock(&sb->s_sync_lock);

	/*
	 * Move the current contents of s_inodes_wb onto the private
	 * sync_list, so that only inodes that were on the list at this
	 * point are processed.  The RCU read-side section spans every
	 * stretch in which an inode is touched without a reference on it.
	 */
	rcu_read_lock();
	spin_lock_irq(&sb->s_inode_wblist_lock);
	list_splice_init(&sb->s_inodes_wb, &sync_list);

	/*
	 * s_inode_wblist_lock is held at the top of every iteration and
	 * dropped while waiting on an inode.
	 */
	while (!list_empty(&sync_list)) {
		struct inode *inode = list_first_entry(&sync_list, struct inode,
						       i_wb_list);
		struct address_space *mapping = inode->i_mapping;

		/*
		 * Put the inode back on s_inodes_wb before the lock is
		 * dropped, so that it is not lost from the list while it is
		 * being waited on.
		 */
		list_move_tail(&inode->i_wb_list, &sb->s_inodes_wb);

		/*
		 * No pages tagged as under writeback: nothing to wait for,
		 * and the list lock stays held for the next entry.
		 */
		if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
			continue;

		spin_unlock_irq(&sb->s_inode_wblist_lock);

		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
			/* Inode is being torn down or set up: skip it. */
			spin_unlock(&inode->i_lock);

			spin_lock_irq(&sb->s_inode_wblist_lock);
			continue;
		}
		/*
		 * Take a reference so the inode stays around once the RCU
		 * read-side section is left.
		 */
		__iget(inode);
		spin_unlock(&inode->i_lock);
		rcu_read_unlock();

		/*
		 * Wait for the pages under writeback.  As the name says,
		 * errors are left in place for a later reporter.
		 */
		filemap_fdatawait_keep_errors(mapping);

		cond_resched();

		iput(inode);

		rcu_read_lock();
		spin_lock_irq(&sb->s_inode_wblist_lock);
	}
	spin_unlock_irq(&sb->s_inode_wblist_lock);
	rcu_read_unlock();
	mutex_unlock(&sb->s_sync_lock);
}
2593
2594static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
2595 enum wb_reason reason, bool skip_if_busy)
2596{
2597 struct backing_dev_info *bdi = sb->s_bdi;
2598 DEFINE_WB_COMPLETION(done, bdi);
2599 struct wb_writeback_work work = {
2600 .sb = sb,
2601 .sync_mode = WB_SYNC_NONE,
2602 .tagged_writepages = 1,
2603 .done = &done,
2604 .nr_pages = nr,
2605 .reason = reason,
2606 };
2607
2608 if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
2609 return;
2610 WARN_ON(!rwsem_is_locked(&sb->s_umount));
2611
2612 bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy);
2613 wb_wait_for_completion(&done);
2614}
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
/**
 * writeback_inodes_sb_nr - write back dirty inodes of a super_block
 * @sb: the superblock
 * @nr: the number of pages to write
 * @reason: reason why the writeback was initiated
 *
 * Exported wrapper around __writeback_inodes_sb_nr() with skip_if_busy
 * set to false.  The work is queued as WB_SYNC_NONE writeback and this
 * function waits for the queued work to complete before it returns.
 */
void writeback_inodes_sb_nr(struct super_block *sb,
			    unsigned long nr,
			    enum wb_reason reason)
{
	__writeback_inodes_sb_nr(sb, nr, reason, false);
}
EXPORT_SYMBOL(writeback_inodes_sb_nr);
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
2644{
2645 return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
2646}
2647EXPORT_SYMBOL(writeback_inodes_sb);
2648
2649
2650
2651
2652
2653
2654
2655
2656void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
2657{
2658 if (!down_read_trylock(&sb->s_umount))
2659 return;
2660
2661 __writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason, true);
2662 up_read(&sb->s_umount);
2663}
2664EXPORT_SYMBOL(try_to_writeback_inodes_sb);
2665
2666
2667
2668
2669
2670
2671
2672
2673void sync_inodes_sb(struct super_block *sb)
2674{
2675 struct backing_dev_info *bdi = sb->s_bdi;
2676 DEFINE_WB_COMPLETION(done, bdi);
2677 struct wb_writeback_work work = {
2678 .sb = sb,
2679 .sync_mode = WB_SYNC_ALL,
2680 .nr_pages = LONG_MAX,
2681 .range_cyclic = 0,
2682 .done = &done,
2683 .reason = WB_REASON_SYNC,
2684 .for_sync = 1,
2685 };
2686
2687
2688
2689
2690
2691
2692 if (bdi == &noop_backing_dev_info)
2693 return;
2694 WARN_ON(!rwsem_is_locked(&sb->s_umount));
2695
2696
2697 bdi_down_write_wb_switch_rwsem(bdi);
2698 bdi_split_work_to_wbs(bdi, &work, false);
2699 wb_wait_for_completion(&done);
2700 bdi_up_write_wb_switch_rwsem(bdi);
2701
2702 wait_sb_inodes(sb);
2703}
2704EXPORT_SYMBOL(sync_inodes_sb);
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716int write_inode_now(struct inode *inode, int sync)
2717{
2718 struct writeback_control wbc = {
2719 .nr_to_write = LONG_MAX,
2720 .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
2721 .range_start = 0,
2722 .range_end = LLONG_MAX,
2723 };
2724
2725 if (!mapping_can_writeback(inode->i_mapping))
2726 wbc.nr_to_write = 0;
2727
2728 might_sleep();
2729 return writeback_single_inode(inode, &wbc);
2730}
2731EXPORT_SYMBOL(write_inode_now);
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742int sync_inode_metadata(struct inode *inode, int wait)
2743{
2744 struct writeback_control wbc = {
2745 .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
2746 .nr_to_write = 0,
2747 };
2748
2749 return writeback_single_inode(inode, &wbc);
2750}
2751EXPORT_SYMBOL(sync_inode_metadata);
2752