// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/fs-writeback.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * Contains all the functions related to writing back and waiting
 * upon dirty inodes against superblocks, and writing back dirty
 * pages against inodes.  ie: data writeback.  Writeout of the
 * inode itself is not handled here.
 *
 * 10Apr2002	Andrew Morton
 *		Split out of fs/inode.c
 *		Additions for address_space-based writeback
 */

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/kthread.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/tracepoint.h>
#include <linux/device.h>
#include <linux/memcontrol.h>
#include "internal.h"

/*
 * 4MB minimal write chunk size
 */
#define MIN_WRITEBACK_PAGES	(4096UL >> (PAGE_SHIFT - 10))

/*
 * Passed into wb_writeback(), essentially a subset of writeback_control
 */
struct wb_writeback_work {
	long nr_pages;
	struct super_block *sb;
	enum writeback_sync_modes sync_mode;
	unsigned int tagged_writepages:1;
	unsigned int for_kupdate:1;
	unsigned int range_cyclic:1;
	unsigned int for_background:1;
	unsigned int for_sync:1;	/* sync(2) WB_SYNC_ALL writeback */
	unsigned int auto_free:1;	/* free on completion */
	enum wb_reason reason;		/* why was writeback initiated? */

	struct list_head list;		/* pending work list */
	struct wb_completion *done;	/* set if the caller waits */
};

/*
 * If an inode is constantly having its pages dirtied, but then the
 * updates stop dirtytime_expire_interval seconds in the past, it's
 * possible for the worst case time between when an inode has its
 * timestamps updated and when they finally get written out to be two
 * dirtytime_expire_intervals.  We set the default to 12 hours (in
 * seconds), which means most of the time inodes will have their
 * timestamps written to disk after 12 hours, but in the worst case a
 * few inodes might not have their timestamps updated for 24 hours.
 */
unsigned int dirtytime_expire_interval = 12 * 60 * 60;

static inline struct inode *wb_inode(struct list_head *head)
{
	return list_entry(head, struct inode, i_io_list);
}

/*
 * Include the creation of the trace points after defining the
 * wb_writeback_work structure and inline functions so that the
 * definition remains local to this file.
 */
#define CREATE_TRACE_POINTS
#include <trace/events/writeback.h>

EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);
static bool wb_io_lists_populated(struct bdi_writeback *wb)
{
	if (wb_has_dirty_io(wb)) {
		return false;
	} else {
		set_bit(WB_has_dirty_io, &wb->state);
		WARN_ON_ONCE(!wb->avg_write_bandwidth);
		atomic_long_add(wb->avg_write_bandwidth,
				&wb->bdi->tot_write_bandwidth);
		return true;
	}
}

static void wb_io_lists_depopulated(struct bdi_writeback *wb)
{
	if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) &&
	    list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) {
		clear_bit(WB_has_dirty_io, &wb->state);
		WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth,
					&wb->bdi->tot_write_bandwidth) < 0);
	}
}

/**
 * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list
 * @inode: inode to be moved
 * @wb: target bdi_writeback
 * @head: one of @wb->b_{dirty|io|more_io|dirty_time}
 *
 * Move @inode->i_io_list to @head of @wb and set %WB_has_dirty_io.
 * Returns %true if @inode is the first occupant of the !dirty_time IO
 * lists; otherwise, %false.
 */
static bool inode_io_list_move_locked(struct inode *inode,
				      struct bdi_writeback *wb,
				      struct list_head *head)
{
	assert_spin_locked(&wb->list_lock);

	list_move(&inode->i_io_list, head);

	/* dirty_time doesn't count as dirty_io until expiration */
	if (head != &wb->b_dirty_time)
		return wb_io_lists_populated(wb);

	wb_io_lists_depopulated(wb);
	return false;
}

static void wb_wakeup(struct bdi_writeback *wb)
{
	spin_lock_bh(&wb->work_lock);
	if (test_bit(WB_registered, &wb->state))
		mod_delayed_work(bdi_wq, &wb->dwork, 0);
	spin_unlock_bh(&wb->work_lock);
}

static void finish_writeback_work(struct bdi_writeback *wb,
				  struct wb_writeback_work *work)
{
	struct wb_completion *done = work->done;

	if (work->auto_free)
		kfree(work);
	if (done) {
		wait_queue_head_t *waitq = done->waitq;

		/* @done can't be accessed after the following dec */
		if (atomic_dec_and_test(&done->cnt))
			wake_up_all(waitq);
	}
}

static void wb_queue_work(struct bdi_writeback *wb,
			  struct wb_writeback_work *work)
{
	trace_writeback_queue(wb, work);

	if (work->done)
		atomic_inc(&work->done->cnt);

	spin_lock_bh(&wb->work_lock);

	if (test_bit(WB_registered, &wb->state)) {
		list_add_tail(&work->list, &wb->work_list);
		mod_delayed_work(bdi_wq, &wb->dwork, 0);
	} else
		finish_writeback_work(wb, work);

	spin_unlock_bh(&wb->work_lock);
}

/**
 * wb_wait_for_completion - wait for completion of bdi_writeback_works
 * @done: target wb_completion
 *
 * Wait for one or more work items issued to @bdi with their ->done field
 * set to @done, which should have been initialized with
 * DEFINE_WB_COMPLETION().  This function returns after all such work items
 * are completed.  Work items which are waited upon aren't freed
 * automatically on completion.
 */
void wb_wait_for_completion(struct wb_completion *done)
{
	atomic_dec(&done->cnt);		/* put down the initial count */
	wait_event(*done->waitq, !atomic_read(&done->cnt));
}
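
/*
 * A minimal usage sketch of the completion pattern above (illustrative;
 * __writeback_inodes_sb_nr() below is an in-tree caller of exactly this
 * shape).  The caller keeps the completion on its stack, points each queued
 * work at it, and then sleeps until every such work has been finished by
 * the flusher:
 *
 *	DEFINE_WB_COMPLETION(done, bdi);
 *	struct wb_writeback_work work = {
 *		.sync_mode	= WB_SYNC_NONE,
 *		.nr_pages	= nr,
 *		.done		= &done,
 *	};
 *
 *	wb_queue_work(wb, &work);
 *	wb_wait_for_completion(&done);
 */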

#ifdef CONFIG_CGROUP_WRITEBACK

/*
 * Parameters for foreign inode detection, see wbc_detach_inode() to see
 * how they're used.
 *
 * These parameters are inherently heuristical; which variable to use to
 * determine foreignness, how long the history should be and where the
 * switch threshold sits don't have obviously correct answers.  The values
 * below keep a 16 slot history covering roughly the last two seconds of
 * writeback and switch an inode once more than half of that history was
 * submitted on behalf of a foreign cgroup.
 */
#define WB_FRN_TIME_SHIFT	13	/* 1s = 2^13, upto 8 secs w/ 16bit */
#define WB_FRN_TIME_AVG_SHIFT	3	/* avg = avg * 7/8 + new * 1/8 */
#define WB_FRN_TIME_CUT_DIV	8	/* ignore rounds < avg / 8 */
#define WB_FRN_TIME_PERIOD	(2 * (1 << WB_FRN_TIME_SHIFT))	/* 2s */

#define WB_FRN_HIST_SLOTS	16	/* inode->i_wb_frn_history is 16bit */
#define WB_FRN_HIST_UNIT	(WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS)
					/* each slot's duration is 2s / 16 */
#define WB_FRN_HIST_THR_SLOTS	(WB_FRN_HIST_SLOTS / 2)
					/* switch if > half the history is foreign */
#define WB_FRN_HIST_MAX_SLOTS	(WB_FRN_HIST_THR_SLOTS / 2 + 1)
					/* one round can affect upto 5 slots */
#define WB_FRN_MAX_IN_FLIGHT	1024	/* don't queue too many concurrently */

/*
 * Maximum inodes per isw.  A specific value has been chosen to make
 * struct inode_switch_wbs_context fit into 1024 bytes kmalloc.
 */
#define WB_MAX_INODES_PER_ISW  ((1024UL - sizeof(struct inode_switch_wbs_context)) \
                                / sizeof(struct inode *))

static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
static struct workqueue_struct *isw_wq;
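
/*
 * For reference, with the values above: one time unit is 1/8192 s, so
 * WB_FRN_TIME_PERIOD is 16384 units (~2s), each of the 16 history slots
 * covers WB_FRN_HIST_UNIT = 1024 units (~125ms), and a single writeback
 * round can shift the history by at most WB_FRN_HIST_MAX_SLOTS = 5 slots.
 */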

/**
 * __inode_attach_wb - associate an inode with its wb
 * @inode: inode of interest
 * @page: page being dirtied (may be NULL)
 *
 * If @inode doesn't have its wb, associate it with the wb matching the
 * memcg of @page or, if @page is NULL, %current.  May be called w/ or w/o
 * @inode->i_lock held.
 */
238void __inode_attach_wb(struct inode *inode, struct page *page)
239{
240 struct backing_dev_info *bdi = inode_to_bdi(inode);
241 struct bdi_writeback *wb = NULL;
242
243 if (inode_cgwb_enabled(inode)) {
244 struct cgroup_subsys_state *memcg_css;
245
246 if (page) {
247 memcg_css = mem_cgroup_css_from_page(page);
248 wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
249 } else {
250
251 memcg_css = task_get_css(current, memory_cgrp_id);
252 wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
253 css_put(memcg_css);
254 }
255 }
256
257 if (!wb)
258 wb = &bdi->wb;
259
260
261
262
263
264 if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
265 wb_put(wb);
266}
267EXPORT_SYMBOL_GPL(__inode_attach_wb);

/**
 * inode_cgwb_move_to_attached - move the inode onto wb->b_attached list
 * @inode: inode of interest with i_lock held
 * @wb: target bdi_writeback
 *
 * Remove the inode from wb's io lists and, unless @wb is the bdi's root wb,
 * put it onto @wb->b_attached so the association can be torn down later.
 */
277static void inode_cgwb_move_to_attached(struct inode *inode,
278 struct bdi_writeback *wb)
279{
280 assert_spin_locked(&wb->list_lock);
281 assert_spin_locked(&inode->i_lock);
282
283 inode->i_state &= ~I_SYNC_QUEUED;
284 if (wb != &wb->bdi->wb)
285 list_move(&inode->i_io_list, &wb->b_attached);
286 else
287 list_del_init(&inode->i_io_list);
288 wb_io_lists_depopulated(wb);
289}

/**
 * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
 * @inode: inode of interest with i_lock held
 *
 * Returns @inode's wb with its list_lock held.  @inode->i_lock must be
 * held on entry and is released on return.  The returned wb is guaranteed
 * to stay associated with @inode until its list_lock is released.
 */
299static struct bdi_writeback *
300locked_inode_to_wb_and_lock_list(struct inode *inode)
301 __releases(&inode->i_lock)
302 __acquires(&wb->list_lock)
303{
304 while (true) {
305 struct bdi_writeback *wb = inode_to_wb(inode);
306
307
308
309
310
311
312
313 wb_get(wb);
314 spin_unlock(&inode->i_lock);
315 spin_lock(&wb->list_lock);
316
317
318 if (likely(wb == inode->i_wb)) {
319 wb_put(wb);
320 return wb;
321 }
322
323 spin_unlock(&wb->list_lock);
324 wb_put(wb);
325 cpu_relax();
326 spin_lock(&inode->i_lock);
327 }
328}

/**
 * inode_to_wb_and_lock_list - determine an inode's wb and lock it
 * @inode: inode of interest
 *
 * Same as locked_inode_to_wb_and_lock_list() but @inode->i_lock isn't held
 * on entry.
 */
337static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
338 __acquires(&wb->list_lock)
339{
340 spin_lock(&inode->i_lock);
341 return locked_inode_to_wb_and_lock_list(inode);
342}
343
344struct inode_switch_wbs_context {
345 struct rcu_work work;

	/*
	 * Multiple inodes can be switched at once.  The switching procedure
	 * consists of two parts, separated by a RCU grace period.  To make
	 * sure that the second part is executed for each inode gone through
	 * the first part, all inode pointers are placed into a NULL-terminated
	 * array embedded into struct inode_switch_wbs_context.  Otherwise
	 * an inode could be left in a non-consistent state.
	 */
355 struct bdi_writeback *new_wb;
356 struct inode *inodes[];
357};
358
359static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi)
360{
361 down_write(&bdi->wb_switch_rwsem);
362}
363
364static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi)
365{
366 up_write(&bdi->wb_switch_rwsem);
367}
368
369static bool inode_do_switch_wbs(struct inode *inode,
370 struct bdi_writeback *old_wb,
371 struct bdi_writeback *new_wb)
372{
373 struct address_space *mapping = inode->i_mapping;
374 XA_STATE(xas, &mapping->i_pages, 0);
375 struct page *page;
376 bool switched = false;
377
378 spin_lock(&inode->i_lock);
379 xa_lock_irq(&mapping->i_pages);
380
381
382
383
384
385 if (unlikely(inode->i_state & (I_FREEING | I_WILL_FREE)))
386 goto skip_switch;
387
388 trace_inode_switch_wbs(inode, old_wb, new_wb);
389
390
391
392
393
394
395 xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_DIRTY) {
396 if (PageDirty(page)) {
397 dec_wb_stat(old_wb, WB_RECLAIMABLE);
398 inc_wb_stat(new_wb, WB_RECLAIMABLE);
399 }
400 }
401
402 xas_set(&xas, 0);
403 xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_WRITEBACK) {
404 WARN_ON_ONCE(!PageWriteback(page));
405 dec_wb_stat(old_wb, WB_WRITEBACK);
406 inc_wb_stat(new_wb, WB_WRITEBACK);
407 }
408
409 if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
410 atomic_dec(&old_wb->writeback_inodes);
411 atomic_inc(&new_wb->writeback_inodes);
412 }
413
414 wb_get(new_wb);
415
416
417
418
419
420
421
422
423
424 if (!list_empty(&inode->i_io_list)) {
425 inode->i_wb = new_wb;
426
427 if (inode->i_state & I_DIRTY_ALL) {
428 struct inode *pos;
429
430 list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
431 if (time_after_eq(inode->dirtied_when,
432 pos->dirtied_when))
433 break;
434 inode_io_list_move_locked(inode, new_wb,
435 pos->i_io_list.prev);
436 } else {
437 inode_cgwb_move_to_attached(inode, new_wb);
438 }
439 } else {
440 inode->i_wb = new_wb;
441 }
442
443
444 inode->i_wb_frn_winner = 0;
445 inode->i_wb_frn_avg_time = 0;
446 inode->i_wb_frn_history = 0;
447 switched = true;
448skip_switch:
449
450
451
452
453 smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);
454
455 xa_unlock_irq(&mapping->i_pages);
456 spin_unlock(&inode->i_lock);
457
458 return switched;
459}
460
461static void inode_switch_wbs_work_fn(struct work_struct *work)
462{
463 struct inode_switch_wbs_context *isw =
464 container_of(to_rcu_work(work), struct inode_switch_wbs_context, work);
465 struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]);
466 struct bdi_writeback *old_wb = isw->inodes[0]->i_wb;
467 struct bdi_writeback *new_wb = isw->new_wb;
468 unsigned long nr_switched = 0;
469 struct inode **inodep;
470
471
472
473
474
475 down_read(&bdi->wb_switch_rwsem);
476
477
478
479
480
481
482
483
484
485
486
487 if (old_wb < new_wb) {
488 spin_lock(&old_wb->list_lock);
489 spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
490 } else {
491 spin_lock(&new_wb->list_lock);
492 spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
493 }
494
495 for (inodep = isw->inodes; *inodep; inodep++) {
496 WARN_ON_ONCE((*inodep)->i_wb != old_wb);
497 if (inode_do_switch_wbs(*inodep, old_wb, new_wb))
498 nr_switched++;
499 }
500
501 spin_unlock(&new_wb->list_lock);
502 spin_unlock(&old_wb->list_lock);
503
504 up_read(&bdi->wb_switch_rwsem);
505
506 if (nr_switched) {
507 wb_wakeup(new_wb);
508 wb_put_many(old_wb, nr_switched);
509 }
510
511 for (inodep = isw->inodes; *inodep; inodep++)
512 iput(*inodep);
513 wb_put(new_wb);
514 kfree(isw);
515 atomic_dec(&isw_nr_in_flight);
516}
517
518static bool inode_prepare_wbs_switch(struct inode *inode,
519 struct bdi_writeback *new_wb)
520{
521
522
523
524
525
526
527 smp_mb();
528
529 if (IS_DAX(inode))
530 return false;
531
532
533 spin_lock(&inode->i_lock);
534 if (!(inode->i_sb->s_flags & SB_ACTIVE) ||
535 inode->i_state & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) ||
536 inode_to_wb(inode) == new_wb) {
537 spin_unlock(&inode->i_lock);
538 return false;
539 }
540 inode->i_state |= I_WB_SWITCH;
541 __iget(inode);
542 spin_unlock(&inode->i_lock);
543
544 return true;
545}

/**
 * inode_switch_wbs - change the wb association of an inode
 * @inode: target inode
 * @new_wb_id: ID of the new wb
 *
 * Switch @inode's wb association to the wb identified by @new_wb_id.  The
 * switching is performed asynchronously and may fail silently.
 */
555static void inode_switch_wbs(struct inode *inode, int new_wb_id)
556{
557 struct backing_dev_info *bdi = inode_to_bdi(inode);
558 struct cgroup_subsys_state *memcg_css;
559 struct inode_switch_wbs_context *isw;
560
561
562 if (inode->i_state & I_WB_SWITCH)
563 return;
564
565
566 if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT)
567 return;
568
569 isw = kzalloc(sizeof(*isw) + 2 * sizeof(struct inode *), GFP_ATOMIC);
570 if (!isw)
571 return;
572
573 atomic_inc(&isw_nr_in_flight);
574
575
576 rcu_read_lock();
577 memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys);
578 if (memcg_css && !css_tryget(memcg_css))
579 memcg_css = NULL;
580 rcu_read_unlock();
581 if (!memcg_css)
582 goto out_free;
583
584 isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
585 css_put(memcg_css);
586 if (!isw->new_wb)
587 goto out_free;
588
589 if (!inode_prepare_wbs_switch(inode, isw->new_wb))
590 goto out_free;
591
592 isw->inodes[0] = inode;
593
594
595
596
597
598
599
600 INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
601 queue_rcu_work(isw_wq, &isw->work);
602 return;
603
604out_free:
605 atomic_dec(&isw_nr_in_flight);
606 if (isw->new_wb)
607 wb_put(isw->new_wb);
608 kfree(isw);
609}

/**
 * cleanup_offline_cgwb - detach associated inodes
 * @wb: target wb
 *
 * Switch all inodes attached to @wb to a nearest living ancestor's wb in
 * order to eventually release the dying @wb.  Returns %true if not all
 * inodes were switched and the function has to be restarted.
 */
619bool cleanup_offline_cgwb(struct bdi_writeback *wb)
620{
621 struct cgroup_subsys_state *memcg_css;
622 struct inode_switch_wbs_context *isw;
623 struct inode *inode;
624 int nr;
625 bool restart = false;
626
627 isw = kzalloc(sizeof(*isw) + WB_MAX_INODES_PER_ISW *
628 sizeof(struct inode *), GFP_KERNEL);
629 if (!isw)
630 return restart;
631
632 atomic_inc(&isw_nr_in_flight);
633
634 for (memcg_css = wb->memcg_css->parent; memcg_css;
635 memcg_css = memcg_css->parent) {
636 isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL);
637 if (isw->new_wb)
638 break;
639 }
640 if (unlikely(!isw->new_wb))
641 isw->new_wb = &wb->bdi->wb;
642
643 nr = 0;
644 spin_lock(&wb->list_lock);
645 list_for_each_entry(inode, &wb->b_attached, i_io_list) {
646 if (!inode_prepare_wbs_switch(inode, isw->new_wb))
647 continue;
648
649 isw->inodes[nr++] = inode;
650
651 if (nr >= WB_MAX_INODES_PER_ISW - 1) {
652 restart = true;
653 break;
654 }
655 }
656 spin_unlock(&wb->list_lock);
657
658
659 if (nr == 0) {
660 atomic_dec(&isw_nr_in_flight);
661 wb_put(isw->new_wb);
662 kfree(isw);
663 return restart;
664 }
665
666
667
668
669
670
671
672 INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
673 queue_rcu_work(isw_wq, &isw->work);
674
675 return restart;
676}

/**
 * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
 * @wbc: writeback_control of interest
 * @inode: target inode
 *
 * @inode is locked and about to be written back under the control of @wbc.
 * Record @inode's writeback context into @wbc and unlock the i_lock.  On
 * writeback completion, wbc_detach_inode() should be called.  This is used
 * to track the cgroup writeback context.
 */
688void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
689 struct inode *inode)
690{
691 if (!inode_cgwb_enabled(inode)) {
692 spin_unlock(&inode->i_lock);
693 return;
694 }
695
696 wbc->wb = inode_to_wb(inode);
697 wbc->inode = inode;
698
699 wbc->wb_id = wbc->wb->memcg_css->id;
700 wbc->wb_lcand_id = inode->i_wb_frn_winner;
701 wbc->wb_tcand_id = 0;
702 wbc->wb_bytes = 0;
703 wbc->wb_lcand_bytes = 0;
704 wbc->wb_tcand_bytes = 0;
705
706 wb_get(wbc->wb);
707 spin_unlock(&inode->i_lock);
708
709
710
711
712
713
714
715
716 if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css)))
717 inode_switch_wbs(inode, wbc->wb_id);
718}
719EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode);
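
/*
 * A minimal sketch of how the attach/detach pair is meant to be used by a
 * writeback path (illustrative only; writeback_single_inode() below is the
 * in-tree caller of this pattern):
 *
 *	spin_lock(&inode->i_lock);
 *	inode->i_state |= I_SYNC;
 *	wbc_attach_and_unlock_inode(wbc, inode);	// drops i_lock
 *
 *	do_writepages(inode->i_mapping, wbc);		// pages are accounted via
 *							// wbc_account_cgroup_owner()
 *	wbc_detach_inode(wbc);				// may trigger a wb switch
 */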

/**
 * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
 * @wbc: writeback_control of the just finished writeback
 *
 * To be called after a writeback attempt of an inode finishes and undoes
 * wbc_attach_and_unlock_inode().  Can be called under any context.
 *
 * As concurrent write sharing of an inode is expected to be very rare and
 * memcg only tracks page ownership on first-use basis severely confining
 * the usefulness of such sharing, cgroup writeback tracks ownership
 * per-inode.  While the support for concurrent write sharing of an inode
 * is deemed unnecessary, an inode being written to by different cgroups at
 * different points in time is a lot more common, and, more importantly,
 * charging only by first-use can too readily lead to grossly incorrect
 * behaviors (single foreign page can lead to gigabytes of writeback being
 * incorrectly attributed).
 *
 * To resolve this issue, cgroup writeback detects the majority dirtier of
 * an inode and transfers the ownership to it.  To avoid unnecessary
 * oscillation, the detection mechanism keeps track of history and gives
 * out the switch verdict only if the foreign usage pattern is stable over
 * a certain amount of time and/or writeback attempts.
 *
 * On each writeback attempt, @wbc tries to detect the majority writer
 * using Boyer-Moore majority vote algorithm.  In addition to the byte
 * count from the majority voting, it also counts the bytes written for the
 * current wb and the last round's winner wb.  Keeping track of the
 * historical winner helps the algorithm to semi-reliably detect the most
 * active writer even when it's not the absolute majority.
 *
 * Once the winner of the round is determined, whether the winner is
 * foreign or not and how much IO time the round consumed is recorded in
 * inode->i_wb_frn_history.  If enough of the recent history was consumed
 * by foreign cgroups, the inode is switched to the winning wb.
 */
758void wbc_detach_inode(struct writeback_control *wbc)
759{
760 struct bdi_writeback *wb = wbc->wb;
761 struct inode *inode = wbc->inode;
762 unsigned long avg_time, max_bytes, max_time;
763 u16 history;
764 int max_id;
765
766 if (!wb)
767 return;
768
769 history = inode->i_wb_frn_history;
770 avg_time = inode->i_wb_frn_avg_time;
771
772
773 if (wbc->wb_bytes >= wbc->wb_lcand_bytes &&
774 wbc->wb_bytes >= wbc->wb_tcand_bytes) {
775 max_id = wbc->wb_id;
776 max_bytes = wbc->wb_bytes;
777 } else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) {
778 max_id = wbc->wb_lcand_id;
779 max_bytes = wbc->wb_lcand_bytes;
780 } else {
781 max_id = wbc->wb_tcand_id;
782 max_bytes = wbc->wb_tcand_bytes;
783 }
784
785
786
787
788
789
790
791
792 max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT,
793 wb->avg_write_bandwidth);
794 if (avg_time)
795 avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) -
796 (avg_time >> WB_FRN_TIME_AVG_SHIFT);
797 else
798 avg_time = max_time;
799
800 if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) {
801 int slots;
802
803
804
805
806
807
808
809
810
811 slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT),
812 (unsigned long)WB_FRN_HIST_MAX_SLOTS);
813 history <<= slots;
814 if (wbc->wb_id != max_id)
815 history |= (1U << slots) - 1;
816
817 if (history)
818 trace_inode_foreign_history(inode, wbc, history);
819
820
821
822
823
824
825
826
827 if (hweight32(history) > WB_FRN_HIST_THR_SLOTS)
828 inode_switch_wbs(inode, max_id);
829 }
830
831
832
833
834
835 inode->i_wb_frn_winner = max_id;
836 inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX);
837 inode->i_wb_frn_history = history;
838
839 wb_put(wbc->wb);
840 wbc->wb = NULL;
841}
842EXPORT_SYMBOL_GPL(wbc_detach_inode);
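
/*
 * Worked example of the history update above (numbers derived directly from
 * the code): suppose a round is charged 3 history slots and the round's
 * winner differs from the current wb.  Then
 *
 *	history = (history << 3) | 0b111;
 *
 * i.e. the three newest slots are marked foreign.  A switch is triggered
 * once hweight32(history) exceeds WB_FRN_HIST_THR_SLOTS (8), that is once
 * more than half of the 16-slot (~2s) history was won by a foreign cgroup.
 */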

/**
 * wbc_account_cgroup_owner - account writeback to update inode cgroup ownership
 * @wbc: writeback_control of the writeback in progress
 * @page: page being written out
 * @bytes: number of bytes being written out
 *
 * @bytes from @page are about to be written out during the writeback
 * controlled by @wbc.  Keep the book for foreign inode detection.  See
 * wbc_detach_inode().  This is called by the lower layers actually
 * performing the writeback, typically from a filesystem's writepage(s) path.
 */
854void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
855 size_t bytes)
856{
857 struct cgroup_subsys_state *css;
858 int id;
859
860
861
862
863
864
865
866 if (!wbc->wb || wbc->no_cgroup_owner)
867 return;
868
869 css = mem_cgroup_css_from_page(page);
870
871 if (!(css->flags & CSS_ONLINE))
872 return;
873
874 id = css->id;
875
876 if (id == wbc->wb_id) {
877 wbc->wb_bytes += bytes;
878 return;
879 }
880
881 if (id == wbc->wb_lcand_id)
882 wbc->wb_lcand_bytes += bytes;
883
884
885 if (!wbc->wb_tcand_bytes)
886 wbc->wb_tcand_id = id;
887 if (id == wbc->wb_tcand_id)
888 wbc->wb_tcand_bytes += bytes;
889 else
890 wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
891}
892EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner);
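
/*
 * Sketch of a caller, assuming a simple filesystem whose ->writepages walks
 * dirty pages itself (the foofs_* name below is hypothetical):
 *
 *	static int foofs_write_one_page(struct page *page,
 *					struct writeback_control *wbc)
 *	{
 *		// ...submit the bio for @page...
 *		wbc_account_cgroup_owner(wbc, page, PAGE_SIZE);
 *		return 0;
 *	}
 *
 * Real filesystems (e.g. ext4, btrfs) make this call from their own
 * writeback paths so that foreign writers are detected per inode.
 */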

/**
 * inode_congested - test whether an inode is congested
 * @inode: inode to test for congestion (may be NULL)
 * @cong_bits: mask of WB_[a]sync_congested bits to test
 *
 * Tests whether @inode is congested.  @cong_bits is the mask of congestion
 * bits to test and the return value is the mask of set bits.
 *
 * If cgroup writeback is enabled for @inode, the congestion state is
 * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg
 * associated with @inode is congested; otherwise, the root wb's congestion
 * state is used.
 *
 * @inode is allowed to be NULL as this function is often called on
 * mapping->host which is NULL for the swapper space.
 */
910int inode_congested(struct inode *inode, int cong_bits)
911{
912
913
914
915
916 if (inode && inode_to_wb_is_valid(inode)) {
917 struct bdi_writeback *wb;
918 struct wb_lock_cookie lock_cookie = {};
919 bool congested;
920
921 wb = unlocked_inode_to_wb_begin(inode, &lock_cookie);
922 congested = wb_congested(wb, cong_bits);
923 unlocked_inode_to_wb_end(inode, &lock_cookie);
924 return congested;
925 }
926
927 return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
928}
929EXPORT_SYMBOL_GPL(inode_congested);

/**
 * wb_split_bdi_pages - split nr_pages to write according to bandwidth
 * @wb: target bdi_writeback to split @nr_pages to
 * @nr_pages: number of pages to write for the whole bdi
 *
 * Split @wb's portion of @nr_pages according to @wb's write bandwidth in
 * relation to the total write bandwidth of all wb's w/ dirty inodes on
 * @wb->bdi.
 */
940static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
941{
942 unsigned long this_bw = wb->avg_write_bandwidth;
943 unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
944
945 if (nr_pages == LONG_MAX)
946 return LONG_MAX;
947
948
949
950
951
952
953 if (!tot_bw || this_bw >= tot_bw)
954 return nr_pages;
955 else
956 return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw);
957}
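
/*
 * Worked example (assumed numbers): if a bdi-wide request asks for
 * nr_pages = 1000 and this wb sustains 40 of the bdi's total 100 units of
 * average write bandwidth, the wb is asked to write
 * DIV_ROUND_UP(1000 * 40, 100) = 400 pages.
 */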

/**
 * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
 * @bdi: target backing_dev_info
 * @base_work: wb_writeback_work to issue
 * @skip_if_busy: skip wb's which already have writeback in progress
 *
 * Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which
 * have dirty inodes.  If @base_work->nr_pages isn't %LONG_MAX, it's
 * distributed to the busy wbs according to each wb's proportion in the
 * total active write bandwidth of @bdi.
 */
970static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
971 struct wb_writeback_work *base_work,
972 bool skip_if_busy)
973{
974 struct bdi_writeback *last_wb = NULL;
975 struct bdi_writeback *wb = list_entry(&bdi->wb_list,
976 struct bdi_writeback, bdi_node);
977
978 might_sleep();
979restart:
980 rcu_read_lock();
981 list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
982 DEFINE_WB_COMPLETION(fallback_work_done, bdi);
983 struct wb_writeback_work fallback_work;
984 struct wb_writeback_work *work;
985 long nr_pages;
986
987 if (last_wb) {
988 wb_put(last_wb);
989 last_wb = NULL;
990 }
991
992
993 if (!wb_has_dirty_io(wb) &&
994 (base_work->sync_mode == WB_SYNC_NONE ||
995 list_empty(&wb->b_dirty_time)))
996 continue;
997 if (skip_if_busy && writeback_in_progress(wb))
998 continue;
999
1000 nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);
1001
1002 work = kmalloc(sizeof(*work), GFP_ATOMIC);
1003 if (work) {
1004 *work = *base_work;
1005 work->nr_pages = nr_pages;
1006 work->auto_free = 1;
1007 wb_queue_work(wb, work);
1008 continue;
1009 }
1010
1011
1012 work = &fallback_work;
1013 *work = *base_work;
1014 work->nr_pages = nr_pages;
1015 work->auto_free = 0;
1016 work->done = &fallback_work_done;
1017
1018 wb_queue_work(wb, work);
1019
1020
1021
1022
1023
1024
1025 wb_get(wb);
1026 last_wb = wb;
1027
1028 rcu_read_unlock();
1029 wb_wait_for_completion(&fallback_work_done);
1030 goto restart;
1031 }
1032 rcu_read_unlock();
1033
1034 if (last_wb)
1035 wb_put(last_wb);
1036}

/**
 * cgroup_writeback_by_id - initiate cgroup writeback from bdi and memcg IDs
 * @bdi_id: target bdi id
 * @memcg_id: target memcg css id
 * @reason: reason why some writeback work initiated
 * @done: target wb_completion
 *
 * Initiate flush of the bdi_writeback identified by @bdi_id and @memcg_id
 * with the specified parameters.
 */
1048int cgroup_writeback_by_id(u64 bdi_id, int memcg_id,
1049 enum wb_reason reason, struct wb_completion *done)
1050{
1051 struct backing_dev_info *bdi;
1052 struct cgroup_subsys_state *memcg_css;
1053 struct bdi_writeback *wb;
1054 struct wb_writeback_work *work;
1055 unsigned long dirty;
1056 int ret;
1057
1058
1059 bdi = bdi_get_by_id(bdi_id);
1060 if (!bdi)
1061 return -ENOENT;
1062
1063 rcu_read_lock();
1064 memcg_css = css_from_id(memcg_id, &memory_cgrp_subsys);
1065 if (memcg_css && !css_tryget(memcg_css))
1066 memcg_css = NULL;
1067 rcu_read_unlock();
1068 if (!memcg_css) {
1069 ret = -ENOENT;
1070 goto out_bdi_put;
1071 }
1072
1073
1074
1075
1076
1077 wb = wb_get_lookup(bdi, memcg_css);
1078 if (!wb) {
1079 ret = -ENOENT;
1080 goto out_css_put;
1081 }
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093 dirty = memcg_page_state(mem_cgroup_from_css(memcg_css), NR_FILE_DIRTY);
1094 dirty = dirty * 10 / 8;
1095
1096
1097 work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN);
1098 if (work) {
1099 work->nr_pages = dirty;
1100 work->sync_mode = WB_SYNC_NONE;
1101 work->range_cyclic = 1;
1102 work->reason = reason;
1103 work->done = done;
1104 work->auto_free = 1;
1105 wb_queue_work(wb, work);
1106 ret = 0;
1107 } else {
1108 ret = -ENOMEM;
1109 }
1110
1111 wb_put(wb);
1112out_css_put:
1113 css_put(memcg_css);
1114out_bdi_put:
1115 bdi_put(bdi);
1116 return ret;
1117}

/**
 * cgroup_writeback_umount - flush inode wb switches for umount
 *
 * This function is called when a super_block is about to be destroyed and
 * flushes in-flight inode wb switches.  An inode wb switch goes through
 * RCU and then workqueue, so the two need to be flushed in order to ensure
 * that all previously scheduled switches are finished.  As wb switches are
 * rare occurrences and synchronize_rcu() can take a while, perform
 * flushing iff wb switches are in flight.
 */
1129void cgroup_writeback_umount(void)
1130{
1131
1132
1133
1134
1135 smp_mb();
1136
1137 if (atomic_read(&isw_nr_in_flight)) {
1138
1139
1140
1141
1142 rcu_barrier();
1143 flush_workqueue(isw_wq);
1144 }
1145}
1146
1147static int __init cgroup_writeback_init(void)
1148{
1149 isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
1150 if (!isw_wq)
1151 return -ENOMEM;
1152 return 0;
1153}
1154fs_initcall(cgroup_writeback_init);
1155
1156#else
1157
1158static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
1159static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
1160
1161static void inode_cgwb_move_to_attached(struct inode *inode,
1162 struct bdi_writeback *wb)
1163{
1164 assert_spin_locked(&wb->list_lock);
1165 assert_spin_locked(&inode->i_lock);
1166
1167 inode->i_state &= ~I_SYNC_QUEUED;
1168 list_del_init(&inode->i_io_list);
1169 wb_io_lists_depopulated(wb);
1170}
1171
1172static struct bdi_writeback *
1173locked_inode_to_wb_and_lock_list(struct inode *inode)
1174 __releases(&inode->i_lock)
1175 __acquires(&wb->list_lock)
1176{
1177 struct bdi_writeback *wb = inode_to_wb(inode);
1178
1179 spin_unlock(&inode->i_lock);
1180 spin_lock(&wb->list_lock);
1181 return wb;
1182}
1183
1184static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
1185 __acquires(&wb->list_lock)
1186{
1187 struct bdi_writeback *wb = inode_to_wb(inode);
1188
1189 spin_lock(&wb->list_lock);
1190 return wb;
1191}
1192
1193static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
1194{
1195 return nr_pages;
1196}
1197
1198static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
1199 struct wb_writeback_work *base_work,
1200 bool skip_if_busy)
1201{
1202 might_sleep();
1203
1204 if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) {
1205 base_work->auto_free = 0;
1206 wb_queue_work(&bdi->wb, base_work);
1207 }
1208}
1209
1210#endif

/*
 * Add in the number of potentially dirty inodes, because each inode
 * write can dirty pagecache in the underlying blockdev.
 */
1216static unsigned long get_nr_dirty_pages(void)
1217{
1218 return global_node_page_state(NR_FILE_DIRTY) +
1219 get_nr_dirty_inodes();
1220}
1221
1222static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason)
1223{
1224 if (!wb_has_dirty_io(wb))
1225 return;
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235 if (test_bit(WB_start_all, &wb->state) ||
1236 test_and_set_bit(WB_start_all, &wb->state))
1237 return;
1238
1239 wb->start_all_reason = reason;
1240 wb_wakeup(wb);
1241}

/**
 * wb_start_background_writeback - start background writeback
 * @wb: bdi_writeback to write from
 *
 * Description:
 *   This makes sure WB_SYNC_NONE background writeback happens. When
 *   this function returns, it is only guaranteed that for given wb
 *   some IO is happening if we are over background dirty threshold.
 *   Caller need not hold sb s_umount semaphore.
 */
1253void wb_start_background_writeback(struct bdi_writeback *wb)
1254{
1255
1256
1257
1258
1259 trace_writeback_wake_background(wb);
1260 wb_wakeup(wb);
1261}

/*
 * Remove the inode from the writeback list it is on.
 */
1266void inode_io_list_del(struct inode *inode)
1267{
1268 struct bdi_writeback *wb;
1269
1270 wb = inode_to_wb_and_lock_list(inode);
1271 spin_lock(&inode->i_lock);
1272
1273 inode->i_state &= ~I_SYNC_QUEUED;
1274 list_del_init(&inode->i_io_list);
1275 wb_io_lists_depopulated(wb);
1276
1277 spin_unlock(&inode->i_lock);
1278 spin_unlock(&wb->list_lock);
1279}
1280EXPORT_SYMBOL(inode_io_list_del);

/*
 * mark an inode as under writeback on the sb
 */
1285void sb_mark_inode_writeback(struct inode *inode)
1286{
1287 struct super_block *sb = inode->i_sb;
1288 unsigned long flags;
1289
1290 if (list_empty(&inode->i_wb_list)) {
1291 spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
1292 if (list_empty(&inode->i_wb_list)) {
1293 list_add_tail(&inode->i_wb_list, &sb->s_inodes_wb);
1294 trace_sb_mark_inode_writeback(inode);
1295 }
1296 spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
1297 }
1298}

/*
 * clear an inode as under writeback on the sb
 */
1303void sb_clear_inode_writeback(struct inode *inode)
1304{
1305 struct super_block *sb = inode->i_sb;
1306 unsigned long flags;
1307
1308 if (!list_empty(&inode->i_wb_list)) {
1309 spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
1310 if (!list_empty(&inode->i_wb_list)) {
1311 list_del_init(&inode->i_wb_list);
1312 trace_sb_clear_inode_writeback(inode);
1313 }
1314 spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
1315 }
1316}

/*
 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
 * furthest end of its superblock's dirty-inode list.
 *
 * Before stamping the inode's ->dirtied_when, we check to see whether it is
 * already the most-recently-dirtied inode on the b_dirty list.  If that is
 * the case then the inode must have been redirtied while it was being written
 * out and we don't reset its dirtied_when.
 */
1327static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb)
1328{
1329 assert_spin_locked(&inode->i_lock);
1330
1331 if (!list_empty(&wb->b_dirty)) {
1332 struct inode *tail;
1333
1334 tail = wb_inode(wb->b_dirty.next);
1335 if (time_before(inode->dirtied_when, tail->dirtied_when))
1336 inode->dirtied_when = jiffies;
1337 }
1338 inode_io_list_move_locked(inode, wb, &wb->b_dirty);
1339 inode->i_state &= ~I_SYNC_QUEUED;
1340}
1341
1342static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
1343{
1344 spin_lock(&inode->i_lock);
1345 redirty_tail_locked(inode, wb);
1346 spin_unlock(&inode->i_lock);
1347}

/*
 * requeue inode for re-scanning after bdi->b_io list is exhausted.
 */
1352static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
1353{
1354 inode_io_list_move_locked(inode, wb, &wb->b_more_io);
1355}
1356
1357static void inode_sync_complete(struct inode *inode)
1358{
1359 inode->i_state &= ~I_SYNC;
1360
1361 inode_add_lru(inode);
1362
1363 smp_mb();
1364 wake_up_bit(&inode->i_state, __I_SYNC);
1365}
1366
1367static bool inode_dirtied_after(struct inode *inode, unsigned long t)
1368{
1369 bool ret = time_after(inode->dirtied_when, t);
1370#ifndef CONFIG_64BIT
1371
1372
1373
1374
1375
1376
1377 ret = ret && time_before_eq(inode->dirtied_when, jiffies);
1378#endif
1379 return ret;
1380}
1381
1382#define EXPIRE_DIRTY_ATIME 0x0001
1383
1384
1385
1386
1387
1388static int move_expired_inodes(struct list_head *delaying_queue,
1389 struct list_head *dispatch_queue,
1390 unsigned long dirtied_before)
1391{
1392 LIST_HEAD(tmp);
1393 struct list_head *pos, *node;
1394 struct super_block *sb = NULL;
1395 struct inode *inode;
1396 int do_sb_sort = 0;
1397 int moved = 0;
1398
1399 while (!list_empty(delaying_queue)) {
1400 inode = wb_inode(delaying_queue->prev);
1401 if (inode_dirtied_after(inode, dirtied_before))
1402 break;
1403 list_move(&inode->i_io_list, &tmp);
1404 moved++;
1405 spin_lock(&inode->i_lock);
1406 inode->i_state |= I_SYNC_QUEUED;
1407 spin_unlock(&inode->i_lock);
1408 if (sb_is_blkdev_sb(inode->i_sb))
1409 continue;
1410 if (sb && sb != inode->i_sb)
1411 do_sb_sort = 1;
1412 sb = inode->i_sb;
1413 }
1414
1415
1416 if (!do_sb_sort) {
1417 list_splice(&tmp, dispatch_queue);
1418 goto out;
1419 }
1420
1421
1422 while (!list_empty(&tmp)) {
1423 sb = wb_inode(tmp.prev)->i_sb;
1424 list_for_each_prev_safe(pos, node, &tmp) {
1425 inode = wb_inode(pos);
1426 if (inode->i_sb == sb)
1427 list_move(&inode->i_io_list, dispatch_queue);
1428 }
1429 }
1430out:
1431 return moved;
1432}

/*
 * Queue all expired dirty inodes for io, eldest first.
 * Before
 *         newly dirtied     b_dirty    b_io    b_more_io
 *         =============>    gf         edc     BA
 * After
 *         newly dirtied     b_dirty    b_io    b_more_io
 *         =============>    g          fBAedc
 *                                           |
 *                                           +--> dequeue for IO
 */
1445static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work,
1446 unsigned long dirtied_before)
1447{
1448 int moved;
1449 unsigned long time_expire_jif = dirtied_before;
1450
1451 assert_spin_locked(&wb->list_lock);
1452 list_splice_init(&wb->b_more_io, &wb->b_io);
1453 moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, dirtied_before);
1454 if (!work->for_sync)
1455 time_expire_jif = jiffies - dirtytime_expire_interval * HZ;
1456 moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
1457 time_expire_jif);
1458 if (moved)
1459 wb_io_lists_populated(wb);
1460 trace_writeback_queue_io(wb, work, dirtied_before, moved);
1461}
1462
1463static int write_inode(struct inode *inode, struct writeback_control *wbc)
1464{
1465 int ret;
1466
1467 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) {
1468 trace_writeback_write_inode_start(inode, wbc);
1469 ret = inode->i_sb->s_op->write_inode(inode, wbc);
1470 trace_writeback_write_inode(inode, wbc);
1471 return ret;
1472 }
1473 return 0;
1474}

/*
 * Wait for writeback on an inode to complete. Called with i_lock held.
 * Caller must make sure inode cannot go away when we drop i_lock.
 */
1480static void __inode_wait_for_writeback(struct inode *inode)
1481 __releases(inode->i_lock)
1482 __acquires(inode->i_lock)
1483{
1484 DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
1485 wait_queue_head_t *wqh;
1486
1487 wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
1488 while (inode->i_state & I_SYNC) {
1489 spin_unlock(&inode->i_lock);
1490 __wait_on_bit(wqh, &wq, bit_wait,
1491 TASK_UNINTERRUPTIBLE);
1492 spin_lock(&inode->i_lock);
1493 }
1494}

/*
 * Wait for writeback on an inode to complete. Caller must have inode pinned.
 */
1499void inode_wait_for_writeback(struct inode *inode)
1500{
1501 spin_lock(&inode->i_lock);
1502 __inode_wait_for_writeback(inode);
1503 spin_unlock(&inode->i_lock);
1504}

/*
 * Sleep until I_SYNC is cleared. This function must be called with i_lock
 * held and drops it. It is aimed for callers not holding any inode reference
 * so once i_lock is dropped, inode can go away.
 */
1511static void inode_sleep_on_writeback(struct inode *inode)
1512 __releases(inode->i_lock)
1513{
1514 DEFINE_WAIT(wait);
1515 wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
1516 int sleep;
1517
1518 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
1519 sleep = inode->i_state & I_SYNC;
1520 spin_unlock(&inode->i_lock);
1521 if (sleep)
1522 schedule();
1523 finish_wait(wqh, &wait);
1524}

/*
 * Find proper writeback list for the inode depending on its current state and
 * ensure that it gets requeued appropriately.  Called with both wb->list_lock
 * and inode->i_lock held after a writeback attempt on the inode finishes.
 */
1534static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
1535 struct writeback_control *wbc)
1536{
1537 if (inode->i_state & I_FREEING)
1538 return;
1539
1540
1541
1542
1543
1544
1545 if ((inode->i_state & I_DIRTY) &&
1546 (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
1547 inode->dirtied_when = jiffies;
1548
1549 if (wbc->pages_skipped) {
1550
1551
1552
1553
1554 redirty_tail_locked(inode, wb);
1555 return;
1556 }
1557
1558 if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
1559
1560
1561
1562
1563 if (wbc->nr_to_write <= 0) {
1564
1565 requeue_io(inode, wb);
1566 } else {
1567
1568
1569
1570
1571
1572
1573
1574 redirty_tail_locked(inode, wb);
1575 }
1576 } else if (inode->i_state & I_DIRTY) {
1577
1578
1579
1580
1581
1582 redirty_tail_locked(inode, wb);
1583 } else if (inode->i_state & I_DIRTY_TIME) {
1584 inode->dirtied_when = jiffies;
1585 inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
1586 inode->i_state &= ~I_SYNC_QUEUED;
1587 } else {
1588
1589 inode_cgwb_move_to_attached(inode, wb);
1590 }
1591}

/*
 * Write out an inode and its dirty pages (or some of them).  Do not update
 * the writeback list linkage.  That is left to the caller.  The caller is
 * also responsible for setting the I_SYNC flag beforehand and calling
 * inode_sync_complete() to clear it afterwards.
 */
1604static int
1605__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
1606{
1607 struct address_space *mapping = inode->i_mapping;
1608 long nr_to_write = wbc->nr_to_write;
1609 unsigned dirty;
1610 int ret;
1611
1612 WARN_ON(!(inode->i_state & I_SYNC));
1613
1614 trace_writeback_single_inode_start(inode, wbc, nr_to_write);
1615
1616 ret = do_writepages(mapping, wbc);
1617
1618
1619
1620
1621
1622
1623
1624
1625 if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
1626 int err = filemap_fdatawait(mapping);
1627 if (ret == 0)
1628 ret = err;
1629 }
1630
1631
1632
1633
1634
1635
1636 if ((inode->i_state & I_DIRTY_TIME) &&
1637 (wbc->sync_mode == WB_SYNC_ALL ||
1638 time_after(jiffies, inode->dirtied_time_when +
1639 dirtytime_expire_interval * HZ))) {
1640 trace_writeback_lazytime(inode);
1641 mark_inode_dirty_sync(inode);
1642 }
1643
1644
1645
1646
1647
1648
1649
1650 spin_lock(&inode->i_lock);
1651 dirty = inode->i_state & I_DIRTY;
1652 inode->i_state &= ~dirty;
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665 smp_mb();
1666
1667 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
1668 inode->i_state |= I_DIRTY_PAGES;
1669
1670 spin_unlock(&inode->i_lock);
1671
1672
1673 if (dirty & ~I_DIRTY_PAGES) {
1674 int err = write_inode(inode, wbc);
1675 if (ret == 0)
1676 ret = err;
1677 }
1678 trace_writeback_single_inode(inode, wbc, nr_to_write);
1679 return ret;
1680}

/*
 * Write out an inode's dirty data and metadata on demand, i.e. separately from
 * the regular batched writeback done by the flusher threads in
 * writeback_sb_inodes().  @wbc controls how much to write and whether to wait
 * on in-flight writeback.  The caller must pin the inode (hold a reference or
 * have set I_WILL_FREE/I_FREEING) for the duration of the call.
 */
1691static int writeback_single_inode(struct inode *inode,
1692 struct writeback_control *wbc)
1693{
1694 struct bdi_writeback *wb;
1695 int ret = 0;
1696
1697 spin_lock(&inode->i_lock);
1698 if (!atomic_read(&inode->i_count))
1699 WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
1700 else
1701 WARN_ON(inode->i_state & I_WILL_FREE);
1702
1703 if (inode->i_state & I_SYNC) {
1704
1705
1706
1707
1708
1709
1710 if (wbc->sync_mode != WB_SYNC_ALL)
1711 goto out;
1712 __inode_wait_for_writeback(inode);
1713 }
1714 WARN_ON(inode->i_state & I_SYNC);
1715
1716
1717
1718
1719
1720
1721
1722 if (!(inode->i_state & I_DIRTY_ALL) &&
1723 (wbc->sync_mode != WB_SYNC_ALL ||
1724 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
1725 goto out;
1726 inode->i_state |= I_SYNC;
1727 wbc_attach_and_unlock_inode(wbc, inode);
1728
1729 ret = __writeback_single_inode(inode, wbc);
1730
1731 wbc_detach_inode(wbc);
1732
1733 wb = inode_to_wb_and_lock_list(inode);
1734 spin_lock(&inode->i_lock);
1735
1736
1737
1738
1739
1740 if (!(inode->i_state & I_DIRTY_ALL))
1741 inode_cgwb_move_to_attached(inode, wb);
1742 spin_unlock(&wb->list_lock);
1743 inode_sync_complete(inode);
1744out:
1745 spin_unlock(&inode->i_lock);
1746 return ret;
1747}
1748
1749static long writeback_chunk_size(struct bdi_writeback *wb,
1750 struct wb_writeback_work *work)
1751{
1752 long pages;
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767 if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
1768 pages = LONG_MAX;
1769 else {
1770 pages = min(wb->avg_write_bandwidth / 2,
1771 global_wb_domain.dirty_limit / DIRTY_SCOPE);
1772 pages = min(pages, work->nr_pages);
1773 pages = round_down(pages + MIN_WRITEBACK_PAGES,
1774 MIN_WRITEBACK_PAGES);
1775 }
1776
1777 return pages;
1778}
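
/*
 * Worked example for the chunk sizing above (assumed numbers): for a
 * non-integrity writeback on a wb with an average write bandwidth of 25600
 * pages/s and a global dirty_limit of 80000 pages, assuming DIRTY_SCOPE of 8
 * and 4k pages (so MIN_WRITEBACK_PAGES == 1024):
 *
 *	pages = min(25600 / 2, 80000 / 8) = 10000
 *	pages = min(10000, work->nr_pages)
 *	pages = round_down(10000 + 1024, 1024) = 10240
 *
 * i.e. roughly 40MB written per inode before moving on.
 */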

/*
 * Write a portion of b_io inodes which belong to @sb.
 *
 * Return the number of pages and/or inodes written.
 *
 * NOTE! This is called with wb->list_lock held, and will
 * unlock and relock that for each inode it ends up doing
 * IO for.
 */
1789static long writeback_sb_inodes(struct super_block *sb,
1790 struct bdi_writeback *wb,
1791 struct wb_writeback_work *work)
1792{
1793 struct writeback_control wbc = {
1794 .sync_mode = work->sync_mode,
1795 .tagged_writepages = work->tagged_writepages,
1796 .for_kupdate = work->for_kupdate,
1797 .for_background = work->for_background,
1798 .for_sync = work->for_sync,
1799 .range_cyclic = work->range_cyclic,
1800 .range_start = 0,
1801 .range_end = LLONG_MAX,
1802 };
1803 unsigned long start_time = jiffies;
1804 long write_chunk;
1805 long wrote = 0;
1806
1807 while (!list_empty(&wb->b_io)) {
1808 struct inode *inode = wb_inode(wb->b_io.prev);
1809 struct bdi_writeback *tmp_wb;
1810
1811 if (inode->i_sb != sb) {
1812 if (work->sb) {
1813
1814
1815
1816
1817
1818 redirty_tail(inode, wb);
1819 continue;
1820 }
1821
1822
1823
1824
1825
1826
1827 break;
1828 }
1829
1830
1831
1832
1833
1834
1835 spin_lock(&inode->i_lock);
1836 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
1837 redirty_tail_locked(inode, wb);
1838 spin_unlock(&inode->i_lock);
1839 continue;
1840 }
1841 if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851 spin_unlock(&inode->i_lock);
1852 requeue_io(inode, wb);
1853 trace_writeback_sb_inodes_requeue(inode);
1854 continue;
1855 }
1856 spin_unlock(&wb->list_lock);
1857
1858
1859
1860
1861
1862
1863 if (inode->i_state & I_SYNC) {
1864
1865 inode_sleep_on_writeback(inode);
1866
1867 spin_lock(&wb->list_lock);
1868 continue;
1869 }
1870 inode->i_state |= I_SYNC;
1871 wbc_attach_and_unlock_inode(&wbc, inode);
1872
1873 write_chunk = writeback_chunk_size(wb, work);
1874 wbc.nr_to_write = write_chunk;
1875 wbc.pages_skipped = 0;
1876
1877
1878
1879
1880
1881 __writeback_single_inode(inode, &wbc);
1882
1883 wbc_detach_inode(&wbc);
1884 work->nr_pages -= write_chunk - wbc.nr_to_write;
1885 wrote += write_chunk - wbc.nr_to_write;
1886
1887 if (need_resched()) {
1888
1889
1890
1891
1892
1893
1894
1895
1896 blk_flush_plug(current);
1897 cond_resched();
1898 }
1899
1900
1901
1902
1903
1904 tmp_wb = inode_to_wb_and_lock_list(inode);
1905 spin_lock(&inode->i_lock);
1906 if (!(inode->i_state & I_DIRTY_ALL))
1907 wrote++;
1908 requeue_inode(inode, tmp_wb, &wbc);
1909 inode_sync_complete(inode);
1910 spin_unlock(&inode->i_lock);
1911
1912 if (unlikely(tmp_wb != wb)) {
1913 spin_unlock(&tmp_wb->list_lock);
1914 spin_lock(&wb->list_lock);
1915 }
1916
1917
1918
1919
1920
1921 if (wrote) {
1922 if (time_is_before_jiffies(start_time + HZ / 10UL))
1923 break;
1924 if (work->nr_pages <= 0)
1925 break;
1926 }
1927 }
1928 return wrote;
1929}
1930
1931static long __writeback_inodes_wb(struct bdi_writeback *wb,
1932 struct wb_writeback_work *work)
1933{
1934 unsigned long start_time = jiffies;
1935 long wrote = 0;
1936
1937 while (!list_empty(&wb->b_io)) {
1938 struct inode *inode = wb_inode(wb->b_io.prev);
1939 struct super_block *sb = inode->i_sb;
1940
1941 if (!trylock_super(sb)) {
1942
1943
1944
1945
1946
1947 redirty_tail(inode, wb);
1948 continue;
1949 }
1950 wrote += writeback_sb_inodes(sb, wb, work);
1951 up_read(&sb->s_umount);
1952
1953
1954 if (wrote) {
1955 if (time_is_before_jiffies(start_time + HZ / 10UL))
1956 break;
1957 if (work->nr_pages <= 0)
1958 break;
1959 }
1960 }
1961
1962 return wrote;
1963}
1964
1965static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
1966 enum wb_reason reason)
1967{
1968 struct wb_writeback_work work = {
1969 .nr_pages = nr_pages,
1970 .sync_mode = WB_SYNC_NONE,
1971 .range_cyclic = 1,
1972 .reason = reason,
1973 };
1974 struct blk_plug plug;
1975
1976 blk_start_plug(&plug);
1977 spin_lock(&wb->list_lock);
1978 if (list_empty(&wb->b_io))
1979 queue_io(wb, &work, jiffies);
1980 __writeback_inodes_wb(wb, &work);
1981 spin_unlock(&wb->list_lock);
1982 blk_finish_plug(&plug);
1983
1984 return nr_pages - work.nr_pages;
1985}

/*
 * Explicit flushing or periodic writeback of "old" data.
 *
 * Define "old": the first time one of an inode's pages is dirtied, we mark the
 * dirtying-time in the inode's address_space.  So this periodic writeback code
 * just walks the superblock inode list, writing back any inodes which are
 * older than a specific point in time.
 *
 * Try to run once per dirty_writeback_interval.  But if a writeback event
 * takes longer than a dirty_writeback_interval interval, then leave a
 * one-second gap.
 *
 * dirtied_before takes precedence over nr_to_write.  So we'll only write back
 * all dirty pages if they are all attached to "old" mappings.
 */
2002static long wb_writeback(struct bdi_writeback *wb,
2003 struct wb_writeback_work *work)
2004{
2005 long nr_pages = work->nr_pages;
2006 unsigned long dirtied_before = jiffies;
2007 struct inode *inode;
2008 long progress;
2009 struct blk_plug plug;
2010
2011 blk_start_plug(&plug);
2012 spin_lock(&wb->list_lock);
2013 for (;;) {
2014
2015
2016
2017 if (work->nr_pages <= 0)
2018 break;
2019
2020
2021
2022
2023
2024
2025
2026 if ((work->for_background || work->for_kupdate) &&
2027 !list_empty(&wb->work_list))
2028 break;
2029
2030
2031
2032
2033
2034 if (work->for_background && !wb_over_bg_thresh(wb))
2035 break;
2036
2037
2038
2039
2040
2041
2042
2043 if (work->for_kupdate) {
2044 dirtied_before = jiffies -
2045 msecs_to_jiffies(dirty_expire_interval * 10);
2046 } else if (work->for_background)
2047 dirtied_before = jiffies;
2048
2049 trace_writeback_start(wb, work);
2050 if (list_empty(&wb->b_io))
2051 queue_io(wb, work, dirtied_before);
2052 if (work->sb)
2053 progress = writeback_sb_inodes(work->sb, wb, work);
2054 else
2055 progress = __writeback_inodes_wb(wb, work);
2056 trace_writeback_written(wb, work);
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066 if (progress)
2067 continue;
2068
2069
2070
2071 if (list_empty(&wb->b_more_io))
2072 break;
2073
2074
2075
2076
2077
2078 trace_writeback_wait(wb, work);
2079 inode = wb_inode(wb->b_more_io.prev);
2080 spin_lock(&inode->i_lock);
2081 spin_unlock(&wb->list_lock);
2082
2083 inode_sleep_on_writeback(inode);
2084 spin_lock(&wb->list_lock);
2085 }
2086 spin_unlock(&wb->list_lock);
2087 blk_finish_plug(&plug);
2088
2089 return nr_pages - work->nr_pages;
2090}

/*
 * Return the next wb_writeback_work struct that hasn't been processed yet.
 */
2095static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
2096{
2097 struct wb_writeback_work *work = NULL;
2098
2099 spin_lock_bh(&wb->work_lock);
2100 if (!list_empty(&wb->work_list)) {
2101 work = list_entry(wb->work_list.next,
2102 struct wb_writeback_work, list);
2103 list_del_init(&work->list);
2104 }
2105 spin_unlock_bh(&wb->work_lock);
2106 return work;
2107}
2108
2109static long wb_check_background_flush(struct bdi_writeback *wb)
2110{
2111 if (wb_over_bg_thresh(wb)) {
2112
2113 struct wb_writeback_work work = {
2114 .nr_pages = LONG_MAX,
2115 .sync_mode = WB_SYNC_NONE,
2116 .for_background = 1,
2117 .range_cyclic = 1,
2118 .reason = WB_REASON_BACKGROUND,
2119 };
2120
2121 return wb_writeback(wb, &work);
2122 }
2123
2124 return 0;
2125}
2126
2127static long wb_check_old_data_flush(struct bdi_writeback *wb)
2128{
2129 unsigned long expired;
2130 long nr_pages;
2131
2132
2133
2134
2135 if (!dirty_writeback_interval)
2136 return 0;
2137
2138 expired = wb->last_old_flush +
2139 msecs_to_jiffies(dirty_writeback_interval * 10);
2140 if (time_before(jiffies, expired))
2141 return 0;
2142
2143 wb->last_old_flush = jiffies;
2144 nr_pages = get_nr_dirty_pages();
2145
2146 if (nr_pages) {
2147 struct wb_writeback_work work = {
2148 .nr_pages = nr_pages,
2149 .sync_mode = WB_SYNC_NONE,
2150 .for_kupdate = 1,
2151 .range_cyclic = 1,
2152 .reason = WB_REASON_PERIODIC,
2153 };
2154
2155 return wb_writeback(wb, &work);
2156 }
2157
2158 return 0;
2159}
2160
2161static long wb_check_start_all(struct bdi_writeback *wb)
2162{
2163 long nr_pages;
2164
2165 if (!test_bit(WB_start_all, &wb->state))
2166 return 0;
2167
2168 nr_pages = get_nr_dirty_pages();
2169 if (nr_pages) {
2170 struct wb_writeback_work work = {
2171 .nr_pages = wb_split_bdi_pages(wb, nr_pages),
2172 .sync_mode = WB_SYNC_NONE,
2173 .range_cyclic = 1,
2174 .reason = wb->start_all_reason,
2175 };
2176
2177 nr_pages = wb_writeback(wb, &work);
2178 }
2179
2180 clear_bit(WB_start_all, &wb->state);
2181 return nr_pages;
2182}

/*
 * Retrieve work items queued on this wb and do the writeback they describe.
 */
2188static long wb_do_writeback(struct bdi_writeback *wb)
2189{
2190 struct wb_writeback_work *work;
2191 long wrote = 0;
2192
2193 set_bit(WB_writeback_running, &wb->state);
2194 while ((work = get_next_work_item(wb)) != NULL) {
2195 trace_writeback_exec(wb, work);
2196 wrote += wb_writeback(wb, work);
2197 finish_writeback_work(wb, work);
2198 }
2199
2200
2201
2202
2203 wrote += wb_check_start_all(wb);
2204
2205
2206
2207
2208 wrote += wb_check_old_data_flush(wb);
2209 wrote += wb_check_background_flush(wb);
2210 clear_bit(WB_writeback_running, &wb->state);
2211
2212 return wrote;
2213}

/*
 * Handle writeback of dirty data for the device backed by this bdi. Also
 * reschedules periodically and does kupdated style flushing.
 */
2219void wb_workfn(struct work_struct *work)
2220{
2221 struct bdi_writeback *wb = container_of(to_delayed_work(work),
2222 struct bdi_writeback, dwork);
2223 long pages_written;
2224
2225 set_worker_desc("flush-%s", bdi_dev_name(wb->bdi));
2226 current->flags |= PF_SWAPWRITE;
2227
2228 if (likely(!current_is_workqueue_rescuer() ||
2229 !test_bit(WB_registered, &wb->state))) {
2230
2231
2232
2233
2234
2235
2236 do {
2237 pages_written = wb_do_writeback(wb);
2238 trace_writeback_pages_written(pages_written);
2239 } while (!list_empty(&wb->work_list));
2240 } else {
2241
2242
2243
2244
2245
2246 pages_written = writeback_inodes_wb(wb, 1024,
2247 WB_REASON_FORKER_THREAD);
2248 trace_writeback_pages_written(pages_written);
2249 }
2250
2251 if (!list_empty(&wb->work_list))
2252 wb_wakeup(wb);
2253 else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
2254 wb_wakeup_delayed(wb);
2255
2256 current->flags &= ~PF_SWAPWRITE;
2257}

/*
 * Wakeup the flusher threads of this bdi to start writeback of all dirty
 * pages.  Called under RCU read lock.
 */
2263static void __wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
2264 enum wb_reason reason)
2265{
2266 struct bdi_writeback *wb;
2267
2268 if (!bdi_has_dirty_io(bdi))
2269 return;
2270
2271 list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
2272 wb_start_writeback(wb, reason);
2273}
2274
2275void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
2276 enum wb_reason reason)
2277{
2278 rcu_read_lock();
2279 __wakeup_flusher_threads_bdi(bdi, reason);
2280 rcu_read_unlock();
2281}

/*
 * Wakeup the flusher threads to start writeback of all currently dirty pages
 */
2286void wakeup_flusher_threads(enum wb_reason reason)
2287{
2288 struct backing_dev_info *bdi;
2289
2290
2291
2292
2293 if (blk_needs_flush_plug(current))
2294 blk_schedule_flush_plug(current);
2295
2296 rcu_read_lock();
2297 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
2298 __wakeup_flusher_threads_bdi(bdi, reason);
2299 rcu_read_unlock();
2300}

/*
 * Wake up bdi's periodically to make sure dirtytime inodes gets
 * written back periodically.  We deliberately do *not* check the
 * b_dirtytime list in wb_has_dirty_io(), since this would cause the
 * kernel to be constantly waking up once there are any dirtytime
 * inodes on the system.  So instead we define a separate delayed work
 * function which gets called much more rarely.  (By default, only
 * once every 12 hours.)
 *
 * If there is any other write activity going on in the file system,
 * this function won't be necessary.  But if the only thing that has
 * happened on the file system is a dirtytime inode caused by an atime
 * update, we need this infrastructure below to make sure that inode
 * eventually gets pushed out to disk.
 */
2317static void wakeup_dirtytime_writeback(struct work_struct *w);
2318static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback);
2319
2320static void wakeup_dirtytime_writeback(struct work_struct *w)
2321{
2322 struct backing_dev_info *bdi;
2323
2324 rcu_read_lock();
2325 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
2326 struct bdi_writeback *wb;
2327
2328 list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
2329 if (!list_empty(&wb->b_dirty_time))
2330 wb_wakeup(wb);
2331 }
2332 rcu_read_unlock();
2333 schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
2334}
2335
2336static int __init start_dirtytime_writeback(void)
2337{
2338 schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
2339 return 0;
2340}
2341__initcall(start_dirtytime_writeback);
2342
2343int dirtytime_interval_handler(struct ctl_table *table, int write,
2344 void *buffer, size_t *lenp, loff_t *ppos)
2345{
2346 int ret;
2347
2348 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
2349 if (ret == 0 && write)
2350 mod_delayed_work(system_wq, &dirtytime_work, 0);
2351 return ret;
2352}
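
/*
 * dirtytime_expire_interval is exposed through sysctl (as
 * vm.dirtytime_expire_seconds in mainline kernels); the handler above
 * reschedules the delayed work immediately so a newly written, shorter
 * interval takes effect right away.  For example, from userspace:
 *
 *	# sysctl vm.dirtytime_expire_seconds=3600
 */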

/**
 * __mark_inode_dirty -	internal function to mark an inode dirty
 *
 * @inode: inode to mark
 * @flags: what kind of dirty, e.g. I_DIRTY_SYNC.  This can be a combination of
 *	   multiple I_DIRTY_* flags, except that I_DIRTY_TIME can't be combined
 *	   with I_DIRTY_PAGES.
 *
 * Mark an inode as dirty.  We notify the filesystem, then update the inode's
 * dirty flags.  Then, if needed, we move the inode to the appropriate dirty
 * list of the inode's backing_dev_info's bdi_writeback.
 *
 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
 * the block-special inode (/dev/hda1) itself.  And the ->dirtied_when field of
 * the kernel-internal blockdev inode represents the dirtying time of the
 * blockdev's pages.  This is why for I_DIRTY_PAGES we always use
 * page->mapping->host, so the page-dirtying time is recorded in the internal
 * blockdev inode.
 */
2381void __mark_inode_dirty(struct inode *inode, int flags)
2382{
2383 struct super_block *sb = inode->i_sb;
2384 int dirtytime = 0;
2385
2386 trace_writeback_mark_inode_dirty(inode, flags);
2387
2388 if (flags & I_DIRTY_INODE) {
2389
2390
2391
2392
2393
2394
2395
2396 trace_writeback_dirty_inode_start(inode, flags);
2397 if (sb->s_op->dirty_inode)
2398 sb->s_op->dirty_inode(inode, flags & I_DIRTY_INODE);
2399 trace_writeback_dirty_inode(inode, flags);
2400
2401
2402 flags &= ~I_DIRTY_TIME;
2403 } else {
2404
2405
2406
2407
2408
2409 dirtytime = flags & I_DIRTY_TIME;
2410 WARN_ON_ONCE(dirtytime && flags != I_DIRTY_TIME);
2411 }
2412
2413
2414
2415
2416
2417 smp_mb();
2418
2419 if (((inode->i_state & flags) == flags) ||
2420 (dirtytime && (inode->i_state & I_DIRTY_INODE)))
2421 return;
2422
2423 spin_lock(&inode->i_lock);
2424 if (dirtytime && (inode->i_state & I_DIRTY_INODE))
2425 goto out_unlock_inode;
2426 if ((inode->i_state & flags) != flags) {
2427 const int was_dirty = inode->i_state & I_DIRTY;
2428
2429 inode_attach_wb(inode, NULL);
2430
2431
2432 if (flags & I_DIRTY_INODE)
2433 inode->i_state &= ~I_DIRTY_TIME;
2434 inode->i_state |= flags;
2435
2436
2437
2438
2439
2440
2441
2442 if (inode->i_state & I_SYNC_QUEUED)
2443 goto out_unlock_inode;
2444
2445
2446
2447
2448
2449 if (!S_ISBLK(inode->i_mode)) {
2450 if (inode_unhashed(inode))
2451 goto out_unlock_inode;
2452 }
2453 if (inode->i_state & I_FREEING)
2454 goto out_unlock_inode;
2455
2456
2457
2458
2459
2460 if (!was_dirty) {
2461 struct bdi_writeback *wb;
2462 struct list_head *dirty_list;
2463 bool wakeup_bdi = false;
2464
2465 wb = locked_inode_to_wb_and_lock_list(inode);
2466
2467 inode->dirtied_when = jiffies;
2468 if (dirtytime)
2469 inode->dirtied_time_when = jiffies;
2470
2471 if (inode->i_state & I_DIRTY)
2472 dirty_list = &wb->b_dirty;
2473 else
2474 dirty_list = &wb->b_dirty_time;
2475
2476 wakeup_bdi = inode_io_list_move_locked(inode, wb,
2477 dirty_list);
2478
2479 spin_unlock(&wb->list_lock);
2480 trace_writeback_dirty_inode_enqueue(inode);
2481
2482
2483
2484
2485
2486
2487
2488 if (wakeup_bdi &&
2489 (wb->bdi->capabilities & BDI_CAP_WRITEBACK))
2490 wb_wakeup_delayed(wb);
2491 return;
2492 }
2493 }
2494out_unlock_inode:
2495 spin_unlock(&inode->i_lock);
2496}
2497EXPORT_SYMBOL(__mark_inode_dirty);
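
/*
 * Typical callers don't use __mark_inode_dirty() directly but go through the
 * inline helpers in <linux/fs.h>, e.g. after updating inode metadata:
 *
 *	inode->i_mtime = inode->i_ctime = current_time(inode);
 *	mark_inode_dirty(inode);	// __mark_inode_dirty(inode, I_DIRTY)
 *
 * or mark_inode_dirty_sync(inode) for I_DIRTY_SYNC only.  (Illustrative
 * sketch; the helpers live in include/linux/fs.h.)
 */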

/*
 * The @s_sync_lock is used to serialise concurrent sync operations to avoid
 * lock contention problems with concurrent wait_sb_inodes() callers, which
 * would otherwise do contending walks of sb->s_inodes_wb.  The queueing
 * maintains sync(2)'s required behaviour: all the IO that has been issued up
 * to the time this function is entered is guaranteed to be completed by the
 * time we have gained the lock and waited for all IO that is in progress,
 * regardless of the order callers are granted the lock.
 */
2508static void wait_sb_inodes(struct super_block *sb)
2509{
2510 LIST_HEAD(sync_list);
2511
2512
2513
2514
2515
2516 WARN_ON(!rwsem_is_locked(&sb->s_umount));
2517
2518 mutex_lock(&sb->s_sync_lock);
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529 rcu_read_lock();
2530 spin_lock_irq(&sb->s_inode_wblist_lock);
2531 list_splice_init(&sb->s_inodes_wb, &sync_list);
2532
2533
2534
2535
2536
2537
2538
2539
2540 while (!list_empty(&sync_list)) {
2541 struct inode *inode = list_first_entry(&sync_list, struct inode,
2542 i_wb_list);
2543 struct address_space *mapping = inode->i_mapping;
2544
2545
2546
2547
2548
2549
2550
2551 list_move_tail(&inode->i_wb_list, &sb->s_inodes_wb);
2552
2553
2554
2555
2556
2557
2558 if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
2559 continue;
2560
2561 spin_unlock_irq(&sb->s_inode_wblist_lock);
2562
2563 spin_lock(&inode->i_lock);
2564 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
2565 spin_unlock(&inode->i_lock);
2566
2567 spin_lock_irq(&sb->s_inode_wblist_lock);
2568 continue;
2569 }
2570 __iget(inode);
2571 spin_unlock(&inode->i_lock);
2572 rcu_read_unlock();
2573
2574
2575
2576
2577
2578
2579 filemap_fdatawait_keep_errors(mapping);
2580
2581 cond_resched();
2582
2583 iput(inode);
2584
2585 rcu_read_lock();
2586 spin_lock_irq(&sb->s_inode_wblist_lock);
2587 }
2588 spin_unlock_irq(&sb->s_inode_wblist_lock);
2589 rcu_read_unlock();
2590 mutex_unlock(&sb->s_sync_lock);
2591}
2592
2593static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
2594 enum wb_reason reason, bool skip_if_busy)
2595{
2596 struct backing_dev_info *bdi = sb->s_bdi;
2597 DEFINE_WB_COMPLETION(done, bdi);
2598 struct wb_writeback_work work = {
2599 .sb = sb,
2600 .sync_mode = WB_SYNC_NONE,
2601 .tagged_writepages = 1,
2602 .done = &done,
2603 .nr_pages = nr,
2604 .reason = reason,
2605 };
2606
2607 if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
2608 return;
2609 WARN_ON(!rwsem_is_locked(&sb->s_umount));
2610
2611 bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy);
2612 wb_wait_for_completion(&done);
2613}

/**
 * writeback_inodes_sb_nr -	writeback dirty inodes from given super_block
 * @sb: the superblock
 * @nr: the number of pages to write
 * @reason: reason why some writeback work initiated
 *
 * Start writeback on some inodes on this super_block. No guarantees are made
 * on how many (if any) will be written, and this function does not wait
 * for IO completion of submitted IO.
 */
2625void writeback_inodes_sb_nr(struct super_block *sb,
2626 unsigned long nr,
2627 enum wb_reason reason)
2628{
2629 __writeback_inodes_sb_nr(sb, nr, reason, false);
2630}
2631EXPORT_SYMBOL(writeback_inodes_sb_nr);

/**
 * writeback_inodes_sb	-	writeback dirty inodes from given super_block
 * @sb: the superblock
 * @reason: reason why some writeback work was initiated
 *
 * Start writeback on some inodes on this super_block. No guarantees are made
 * on how many (if any) will be written, and this function does not wait
 * for IO completion of submitted IO.
 */
2642void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
2643{
2644 return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
2645}
2646EXPORT_SYMBOL(writeback_inodes_sb);

/**
 * try_to_writeback_inodes_sb - try to start writeback if none underway
 * @sb: the superblock
 * @reason: reason why some writeback work was initiated
 *
 * Invoke __writeback_inodes_sb_nr if no writeback is currently underway.
 */
2655void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
2656{
2657 if (!down_read_trylock(&sb->s_umount))
2658 return;
2659
2660 __writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason, true);
2661 up_read(&sb->s_umount);
2662}
2663EXPORT_SYMBOL(try_to_writeback_inodes_sb);

/**
 * sync_inodes_sb	-	sync sb inode pages
 * @sb: the superblock
 *
 * This function writes and waits on any dirty inode belonging to this
 * super_block.
 */
2672void sync_inodes_sb(struct super_block *sb)
2673{
2674 struct backing_dev_info *bdi = sb->s_bdi;
2675 DEFINE_WB_COMPLETION(done, bdi);
2676 struct wb_writeback_work work = {
2677 .sb = sb,
2678 .sync_mode = WB_SYNC_ALL,
2679 .nr_pages = LONG_MAX,
2680 .range_cyclic = 0,
2681 .done = &done,
2682 .reason = WB_REASON_SYNC,
2683 .for_sync = 1,
2684 };
2685
2686
2687
2688
2689
2690
2691 if (bdi == &noop_backing_dev_info)
2692 return;
2693 WARN_ON(!rwsem_is_locked(&sb->s_umount));
2694
2695
2696 bdi_down_write_wb_switch_rwsem(bdi);
2697 bdi_split_work_to_wbs(bdi, &work, false);
2698 wb_wait_for_completion(&done);
2699 bdi_up_write_wb_switch_rwsem(bdi);
2700
2701 wait_sb_inodes(sb);
2702}
2703EXPORT_SYMBOL(sync_inodes_sb);

/**
 * write_inode_now	-	write an inode to disk
 * @inode: inode to write to disk
 * @sync: whether the write should be synchronous or not
 *
 * This function commits an inode to disk immediately if it is dirty. This is
 * primarily needed by knfsd.
 *
 * The caller must either have a ref on the inode or must have set I_WILL_FREE.
 */
2715int write_inode_now(struct inode *inode, int sync)
2716{
2717 struct writeback_control wbc = {
2718 .nr_to_write = LONG_MAX,
2719 .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
2720 .range_start = 0,
2721 .range_end = LLONG_MAX,
2722 };
2723
2724 if (!mapping_can_writeback(inode->i_mapping))
2725 wbc.nr_to_write = 0;
2726
2727 might_sleep();
2728 return writeback_single_inode(inode, &wbc);
2729}
2730EXPORT_SYMBOL(write_inode_now);

/**
 * sync_inode_metadata - write an inode to disk
 * @inode: the inode to sync
 * @wait: wait for I/O to complete.
 *
 * Write an inode to disk and adjust its dirty state after completion.
 *
 * Note: only writes the actual inode, no associated data or other metadata.
 */
2741int sync_inode_metadata(struct inode *inode, int wait)
2742{
2743 struct writeback_control wbc = {
2744 .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
2745 .nr_to_write = 0,
2746 };
2747
2748 return writeback_single_inode(inode, &wbc);
2749}
2750EXPORT_SYMBOL(sync_inode_metadata);
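
/*
 * A minimal sketch of how a filesystem might use sync_inode_metadata() in
 * its fsync path once the data pages have been written (the foofs_* name is
 * hypothetical; several in-tree filesystems follow this shape):
 *
 *	static int foofs_fsync(struct file *file, loff_t start, loff_t end,
 *			       int datasync)
 *	{
 *		struct inode *inode = file_inode(file);
 *		int err;
 *
 *		err = file_write_and_wait_range(file, start, end);
 *		if (err)
 *			return err;
 *		return sync_inode_metadata(inode, 1);
 *	}
 */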