// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/fs-writeback.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * Contains all the functions related to writing back and waiting
 * upon dirty inodes against superblocks, and writing back dirty
 * pages against inodes: the data writeback path.  Writeout of the
 * inode itself is left to the individual filesystems via
 * ->write_inode().
 *
 * 10Apr2002	Andrew Morton
 *		Split out of fs/inode.c
 *		Additions for address_space-based writeback
 */
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/kthread.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/tracepoint.h>
#include <linux/device.h>
#include <linux/memcontrol.h>
#include "internal.h"

/*
 * 4MB minimal write chunk size
 */
#define MIN_WRITEBACK_PAGES	(4096UL >> (PAGE_SHIFT - 10))

/*
 * Passed into wb_writeback(), essentially a subset of writeback_control
 */
struct wb_writeback_work {
	long nr_pages;
	struct super_block *sb;
	enum writeback_sync_modes sync_mode;
	unsigned int tagged_writepages:1;
	unsigned int for_kupdate:1;
	unsigned int range_cyclic:1;
	unsigned int for_background:1;
	unsigned int for_sync:1;	/* sync(2) WB_SYNC_ALL writeback */
	unsigned int auto_free:1;	/* free on completion */
	enum wb_reason reason;		/* why was writeback initiated? */

	struct list_head list;		/* pending work list */
	struct wb_completion *done;	/* set if the caller waits */
};

/*
 * If an inode is constantly having its pages dirtied, but then the
 * updates stop dirtytime_expire_interval seconds in the past, it's
 * possible for the worst case time between when an inode has its
 * timestamps updated and when they finally get written out to be two
 * dirtytime_expire_intervals.  We set the default to 12 hours (in
 * seconds), which means most of the time inodes will have their
 * timestamps written to disk after 12 hours, but in the worst case a
 * few inodes might not have their timestamps updated for 24 hours.
 */
unsigned int dirtytime_expire_interval = 12 * 60 * 60;

static inline struct inode *wb_inode(struct list_head *head)
{
	return list_entry(head, struct inode, i_io_list);
}

/*
 * Include the creation of the trace points after defining the
 * wb_writeback_work structure and inline functions so that the definition
 * remains local to this file.
 */
#define CREATE_TRACE_POINTS
#include <trace/events/writeback.h>

EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);

static bool wb_io_lists_populated(struct bdi_writeback *wb)
{
	if (wb_has_dirty_io(wb)) {
		return false;
	} else {
		set_bit(WB_has_dirty_io, &wb->state);
		WARN_ON_ONCE(!wb->avg_write_bandwidth);
		atomic_long_add(wb->avg_write_bandwidth,
				&wb->bdi->tot_write_bandwidth);
		return true;
	}
}

static void wb_io_lists_depopulated(struct bdi_writeback *wb)
{
	if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) &&
	    list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) {
		clear_bit(WB_has_dirty_io, &wb->state);
		WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth,
					&wb->bdi->tot_write_bandwidth) < 0);
	}
}

/**
 * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list
 * @inode: inode to be moved
 * @wb: target bdi_writeback
 * @head: one of @wb->b_{dirty|io|more_io|dirty_time}
 *
 * Move @inode->i_io_list to @head of @wb and set %WB_has_dirty_io.
 * Returns %true if @inode is the first occupant of the !dirty_time IO
 * lists; otherwise, %false.
 */
static bool inode_io_list_move_locked(struct inode *inode,
				      struct bdi_writeback *wb,
				      struct list_head *head)
{
	assert_spin_locked(&wb->list_lock);

	list_move(&inode->i_io_list, head);

	/* dirty_time doesn't count as dirty_io until expiration */
	if (head != &wb->b_dirty_time)
		return wb_io_lists_populated(wb);

	wb_io_lists_depopulated(wb);
	return false;
}

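/* kick the flusher work for @wb to run immediately, provided @wb is still registered */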
static void wb_wakeup(struct bdi_writeback *wb)
{
	spin_lock_bh(&wb->work_lock);
	if (test_bit(WB_registered, &wb->state))
		mod_delayed_work(bdi_wq, &wb->dwork, 0);
	spin_unlock_bh(&wb->work_lock);
}

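/* release a finished work item: free it if requested and wake anybody waiting on its completion */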
static void finish_writeback_work(struct bdi_writeback *wb,
				  struct wb_writeback_work *work)
{
	struct wb_completion *done = work->done;

	if (work->auto_free)
		kfree(work);
	if (done) {
		wait_queue_head_t *waitq = done->waitq;

		/* @done can't be accessed after the following dec */
		if (atomic_dec_and_test(&done->cnt))
			wake_up_all(waitq);
	}
}

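/* queue @work on @wb and kick the flusher; complete it immediately if @wb is no longer registered */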
static void wb_queue_work(struct bdi_writeback *wb,
			  struct wb_writeback_work *work)
{
	trace_writeback_queue(wb, work);

	if (work->done)
		atomic_inc(&work->done->cnt);

	spin_lock_bh(&wb->work_lock);

	if (test_bit(WB_registered, &wb->state)) {
		list_add_tail(&work->list, &wb->work_list);
		mod_delayed_work(bdi_wq, &wb->dwork, 0);
	} else
		finish_writeback_work(wb, work);

	spin_unlock_bh(&wb->work_lock);
}

/**
 * wb_wait_for_completion - wait for completion of bdi_writeback_works
 * @done: target wb_completion
 *
 * Wait for one or more work items issued to @bdi with their ->done field
 * set to @done, which should have been initialized with
 * DEFINE_WB_COMPLETION().  This function returns after all such work items
 * are completed.  Work items which are waited upon aren't freed
 * automatically on completion.
 */
void wb_wait_for_completion(struct wb_completion *done)
{
	atomic_dec(&done->cnt);		/* put down the initial count */
	wait_event(*done->waitq, !atomic_read(&done->cnt));
}

#ifdef CONFIG_CGROUP_WRITEBACK

/*
 * Parameters for foreign inode detection, see wbc_detach_inode() for how
 * they're used.
 *
 * These parameters are inherently heuristical.  An inode is usually
 * dirtied by a single cgroup, but it may occasionally be written on
 * behalf of others ("foreign" cgroups).  Each writeback attempt counts
 * the bytes written for each candidate cgroup, and if a foreign cgroup
 * keeps winning for long enough, the inode's wb (and thus its dirty
 * throttling and writeback attribution) is switched over to that cgroup.
 * Recent rounds are recorded in inode->i_wb_frn_history, one bit per
 * time slot, so that a single burst doesn't trigger a switch.
 */
#define WB_FRN_TIME_SHIFT	13	/* 1s = 2^13, upto 8 secs w/ 16bit */
#define WB_FRN_TIME_AVG_SHIFT	3	/* avg = avg * 7/8 + new * 1/8 */
#define WB_FRN_TIME_CUT_DIV	8	/* ignore rounds < avg / 8 */
#define WB_FRN_TIME_PERIOD	(2 * (1 << WB_FRN_TIME_SHIFT))	/* 2s */

#define WB_FRN_HIST_SLOTS	16	/* inode->i_wb_frn_history is 16bit */
#define WB_FRN_HIST_UNIT	(WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS)
					/* each slot's duration is 2s / 16 */
#define WB_FRN_HIST_THR_SLOTS	(WB_FRN_HIST_SLOTS / 2)
					/* switch if more than half the slots are foreign */
#define WB_FRN_HIST_MAX_SLOTS	(WB_FRN_HIST_THR_SLOTS / 2 + 1)
					/* one round can affect upto 5 slots */
#define WB_FRN_MAX_IN_FLIGHT	1024	/* don't queue too many concurrently */

/*
 * Maximum inodes per inode_switch_wbs_context (isw).  The value is chosen
 * so that the context together with its NULL-terminated inode pointer
 * array fits into a single 1024 byte kmalloc allocation.
 */
#define WB_MAX_INODES_PER_ISW  ((1024UL - sizeof(struct inode_switch_wbs_context)) \
                                / sizeof(struct inode *))

static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
static struct workqueue_struct *isw_wq;

void __inode_attach_wb(struct inode *inode, struct page *page)
{
	struct backing_dev_info *bdi = inode_to_bdi(inode);
	struct bdi_writeback *wb = NULL;

	if (inode_cgwb_enabled(inode)) {
		struct cgroup_subsys_state *memcg_css;

		if (page) {
			memcg_css = mem_cgroup_css_from_page(page);
			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
		} else {
			/* must pin memcg_css, see wb_get_create() */
			memcg_css = task_get_css(current, memory_cgrp_id);
			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
			css_put(memcg_css);
		}
	}

	if (!wb)
		wb = &bdi->wb;

	/*
	 * There may be multiple instances of this function racing to
	 * update the same inode.  Use cmpxchg() to tell the winner.
	 */
	if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
		wb_put(wb);
}
EXPORT_SYMBOL_GPL(__inode_attach_wb);

/**
 * inode_cgwb_move_to_attached - put the inode onto wb->b_attached list
 * @inode: inode of interest with i_lock held
 * @wb: target bdi_writeback
 *
 * Remove the inode from wb's io lists and if necessary put it onto the
 * b_attached list.  Only inodes attached to cgwb's are kept on this list.
 */
static void inode_cgwb_move_to_attached(struct inode *inode,
					struct bdi_writeback *wb)
{
	assert_spin_locked(&wb->list_lock);
	assert_spin_locked(&inode->i_lock);

	inode->i_state &= ~I_SYNC_QUEUED;
	if (wb != &wb->bdi->wb)
		list_move(&inode->i_io_list, &wb->b_attached);
	else
		list_del_init(&inode->i_io_list);
	wb_io_lists_depopulated(wb);
}
290
291
292
293
294
295
296
297
298
299static struct bdi_writeback *
300locked_inode_to_wb_and_lock_list(struct inode *inode)
301 __releases(&inode->i_lock)
302 __acquires(&wb->list_lock)
303{
304 while (true) {
305 struct bdi_writeback *wb = inode_to_wb(inode);
306
307
308
309
310
311
312
313 wb_get(wb);
314 spin_unlock(&inode->i_lock);
315 spin_lock(&wb->list_lock);
316
317
318 if (likely(wb == inode->i_wb)) {
319 wb_put(wb);
320 return wb;
321 }
322
323 spin_unlock(&wb->list_lock);
324 wb_put(wb);
325 cpu_relax();
326 spin_lock(&inode->i_lock);
327 }
328}
329
330
331
332
333
334
335
336
337static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
338 __acquires(&wb->list_lock)
339{
340 spin_lock(&inode->i_lock);
341 return locked_inode_to_wb_and_lock_list(inode);
342}
343
344struct inode_switch_wbs_context {
345 struct rcu_work work;
346
347
348
349
350
351
352
353
354
355 struct bdi_writeback *new_wb;
356 struct inode *inodes[];
357};
358
359static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi)
360{
361 down_write(&bdi->wb_switch_rwsem);
362}
363
364static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi)
365{
366 up_write(&bdi->wb_switch_rwsem);
367}
368
369static bool inode_do_switch_wbs(struct inode *inode,
370 struct bdi_writeback *old_wb,
371 struct bdi_writeback *new_wb)
372{
373 struct address_space *mapping = inode->i_mapping;
374 XA_STATE(xas, &mapping->i_pages, 0);
375 struct page *page;
376 bool switched = false;
377
378 spin_lock(&inode->i_lock);
379 xa_lock_irq(&mapping->i_pages);
380
381
382
383
384
385 if (unlikely(inode->i_state & (I_FREEING | I_WILL_FREE)))
386 goto skip_switch;
387
388 trace_inode_switch_wbs(inode, old_wb, new_wb);
389
390
391
392
393
394
395 xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_DIRTY) {
396 if (PageDirty(page)) {
397 dec_wb_stat(old_wb, WB_RECLAIMABLE);
398 inc_wb_stat(new_wb, WB_RECLAIMABLE);
399 }
400 }
401
402 xas_set(&xas, 0);
403 xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_WRITEBACK) {
404 WARN_ON_ONCE(!PageWriteback(page));
405 dec_wb_stat(old_wb, WB_WRITEBACK);
406 inc_wb_stat(new_wb, WB_WRITEBACK);
407 }
408
409 wb_get(new_wb);
410
411
412
413
414
415
416
417
418
419 if (!list_empty(&inode->i_io_list)) {
420 inode->i_wb = new_wb;
421
422 if (inode->i_state & I_DIRTY_ALL) {
423 struct inode *pos;
424
425 list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
426 if (time_after_eq(inode->dirtied_when,
427 pos->dirtied_when))
428 break;
429 inode_io_list_move_locked(inode, new_wb,
430 pos->i_io_list.prev);
431 } else {
432 inode_cgwb_move_to_attached(inode, new_wb);
433 }
434 } else {
435 inode->i_wb = new_wb;
436 }
437
438
439 inode->i_wb_frn_winner = 0;
440 inode->i_wb_frn_avg_time = 0;
441 inode->i_wb_frn_history = 0;
442 switched = true;
443skip_switch:
444
445
446
447
448 smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);
449
450 xa_unlock_irq(&mapping->i_pages);
451 spin_unlock(&inode->i_lock);
452
453 return switched;
454}
455
456static void inode_switch_wbs_work_fn(struct work_struct *work)
457{
458 struct inode_switch_wbs_context *isw =
459 container_of(to_rcu_work(work), struct inode_switch_wbs_context, work);
460 struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]);
461 struct bdi_writeback *old_wb = isw->inodes[0]->i_wb;
462 struct bdi_writeback *new_wb = isw->new_wb;
463 unsigned long nr_switched = 0;
464 struct inode **inodep;
465
466
467
468
469
470 down_read(&bdi->wb_switch_rwsem);
471
472
473
474
475
476
477
478
479
480
481
482 if (old_wb < new_wb) {
483 spin_lock(&old_wb->list_lock);
484 spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
485 } else {
486 spin_lock(&new_wb->list_lock);
487 spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
488 }
489
490 for (inodep = isw->inodes; *inodep; inodep++) {
491 WARN_ON_ONCE((*inodep)->i_wb != old_wb);
492 if (inode_do_switch_wbs(*inodep, old_wb, new_wb))
493 nr_switched++;
494 }
495
496 spin_unlock(&new_wb->list_lock);
497 spin_unlock(&old_wb->list_lock);
498
499 up_read(&bdi->wb_switch_rwsem);
500
501 if (nr_switched) {
502 wb_wakeup(new_wb);
503 wb_put_many(old_wb, nr_switched);
504 }
505
506 for (inodep = isw->inodes; *inodep; inodep++)
507 iput(*inodep);
508 wb_put(new_wb);
509 kfree(isw);
510 atomic_dec(&isw_nr_in_flight);
511}
512
513static bool inode_prepare_wbs_switch(struct inode *inode,
514 struct bdi_writeback *new_wb)
515{
516
517
518
519
520
521
522 smp_mb();
523
524 if (IS_DAX(inode))
525 return false;
526
527
528 spin_lock(&inode->i_lock);
529 if (!(inode->i_sb->s_flags & SB_ACTIVE) ||
530 inode->i_state & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) ||
531 inode_to_wb(inode) == new_wb) {
532 spin_unlock(&inode->i_lock);
533 return false;
534 }
535 inode->i_state |= I_WB_SWITCH;
536 __iget(inode);
537 spin_unlock(&inode->i_lock);
538
539 return true;
540}
541
542
543
544
545
546
547
548
549
550static void inode_switch_wbs(struct inode *inode, int new_wb_id)
551{
552 struct backing_dev_info *bdi = inode_to_bdi(inode);
553 struct cgroup_subsys_state *memcg_css;
554 struct inode_switch_wbs_context *isw;
555
556
557 if (inode->i_state & I_WB_SWITCH)
558 return;
559
560
561 if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT)
562 return;
563
564 isw = kzalloc(sizeof(*isw) + 2 * sizeof(struct inode *), GFP_ATOMIC);
565 if (!isw)
566 return;
567
568 atomic_inc(&isw_nr_in_flight);
569
570
571 rcu_read_lock();
572 memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys);
573 if (memcg_css && !css_tryget(memcg_css))
574 memcg_css = NULL;
575 rcu_read_unlock();
576 if (!memcg_css)
577 goto out_free;
578
579 isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
580 css_put(memcg_css);
581 if (!isw->new_wb)
582 goto out_free;
583
584 if (!inode_prepare_wbs_switch(inode, isw->new_wb))
585 goto out_free;
586
587 isw->inodes[0] = inode;
588
589
590
591
592
593
594
595 INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
596 queue_rcu_work(isw_wq, &isw->work);
597 return;
598
599out_free:
600 atomic_dec(&isw_nr_in_flight);
601 if (isw->new_wb)
602 wb_put(isw->new_wb);
603 kfree(isw);
604}
605
606
607
608
609
610
611
612
613
614bool cleanup_offline_cgwb(struct bdi_writeback *wb)
615{
616 struct cgroup_subsys_state *memcg_css;
617 struct inode_switch_wbs_context *isw;
618 struct inode *inode;
619 int nr;
620 bool restart = false;
621
622 isw = kzalloc(sizeof(*isw) + WB_MAX_INODES_PER_ISW *
623 sizeof(struct inode *), GFP_KERNEL);
624 if (!isw)
625 return restart;
626
627 atomic_inc(&isw_nr_in_flight);
628
629 for (memcg_css = wb->memcg_css->parent; memcg_css;
630 memcg_css = memcg_css->parent) {
631 isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL);
632 if (isw->new_wb)
633 break;
634 }
635 if (unlikely(!isw->new_wb))
636 isw->new_wb = &wb->bdi->wb;
637
638 nr = 0;
639 spin_lock(&wb->list_lock);
640 list_for_each_entry(inode, &wb->b_attached, i_io_list) {
641 if (!inode_prepare_wbs_switch(inode, isw->new_wb))
642 continue;
643
644 isw->inodes[nr++] = inode;
645
646 if (nr >= WB_MAX_INODES_PER_ISW - 1) {
647 restart = true;
648 break;
649 }
650 }
651 spin_unlock(&wb->list_lock);
652
653
654 if (nr == 0) {
655 atomic_dec(&isw_nr_in_flight);
656 wb_put(isw->new_wb);
657 kfree(isw);
658 return restart;
659 }
660
661
662
663
664
665
666
667 INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
668 queue_rcu_work(isw_wq, &isw->work);
669
670 return restart;
671}

/**
 * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
 * @wbc: writeback_control of interest
 * @inode: target inode
 *
 * @inode is locked and about to be written back under the control of @wbc.
 * Record @inode's writeback context into @wbc and unlock the i_lock.  On
 * writeback completion, wbc_detach_inode() should be called.  This is used
 * to track the cgroup writeback context.
 */
683void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
684 struct inode *inode)
685{
686 if (!inode_cgwb_enabled(inode)) {
687 spin_unlock(&inode->i_lock);
688 return;
689 }
690
691 wbc->wb = inode_to_wb(inode);
692 wbc->inode = inode;
693
694 wbc->wb_id = wbc->wb->memcg_css->id;
695 wbc->wb_lcand_id = inode->i_wb_frn_winner;
696 wbc->wb_tcand_id = 0;
697 wbc->wb_bytes = 0;
698 wbc->wb_lcand_bytes = 0;
699 wbc->wb_tcand_bytes = 0;
700
701 wb_get(wbc->wb);
702 spin_unlock(&inode->i_lock);
703
704
705
706
707
708
709
710
711 if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css)))
712 inode_switch_wbs(inode, wbc->wb_id);
713}
714EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode);

/**
 * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
 * @wbc: writeback_control of the just finished writeback
 *
 * To be called after a writeback attempt of an inode finishes and undoes
 * wbc_attach_and_unlock_inode().  Can be called under any context.
 *
 * As concurrent write sharing of an inode is expected to be very rare and
 * memcg only tracks page ownership on first-use basis severely confining
 * the usefulness of such sharing, cgroup writeback tracks ownership
 * per-inode.  While the support for concurrent write sharing of an inode
 * is deemed unnecessary, an inode being written to by different cgroups at
 * different points in time is a lot more common, and, more importantly,
 * charging only by first-use can too readily lead to grossly incorrect
 * behaviors (single foreign page can lead to gigabytes of writeback being
 * incorrectly attributed).
 *
 * To resolve this issue, cgroup writeback detects the majority dirtier of
 * an inode and transfers the ownership to it.  To avoid unnecessary
 * oscillation, the detection mechanism keeps track of history and gives
 * out the switch verdict only if the foreign usage pattern is stable over
 * a certain amount of time and/or writeback attempts.
 *
 * On each writeback attempt, @wbc tries to detect the majority writer
 * using the Boyer-Moore majority vote algorithm.  In addition to the byte
 * count from the majority voting, it also counts the bytes written for the
 * current wb and the last round's winner wb.  Keeping track of the
 * historical winner helps the algorithm semi-reliably detect the most
 * active writer even when it's not the absolute majority.
 *
 * Once the winner of the round is determined, whether the winner is
 * foreign or not and how much IO time the round consumed is recorded in
 * inode->i_wb_frn_history.  If the amount of recorded foreign IO time is
 * larger than a certain amount, the switch verdict is given.
 */
753void wbc_detach_inode(struct writeback_control *wbc)
754{
755 struct bdi_writeback *wb = wbc->wb;
756 struct inode *inode = wbc->inode;
757 unsigned long avg_time, max_bytes, max_time;
758 u16 history;
759 int max_id;
760
761 if (!wb)
762 return;
763
764 history = inode->i_wb_frn_history;
765 avg_time = inode->i_wb_frn_avg_time;
766
767
768 if (wbc->wb_bytes >= wbc->wb_lcand_bytes &&
769 wbc->wb_bytes >= wbc->wb_tcand_bytes) {
770 max_id = wbc->wb_id;
771 max_bytes = wbc->wb_bytes;
772 } else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) {
773 max_id = wbc->wb_lcand_id;
774 max_bytes = wbc->wb_lcand_bytes;
775 } else {
776 max_id = wbc->wb_tcand_id;
777 max_bytes = wbc->wb_tcand_bytes;
778 }
779
780
781
782
783
784
785
786
787 max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT,
788 wb->avg_write_bandwidth);
789 if (avg_time)
790 avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) -
791 (avg_time >> WB_FRN_TIME_AVG_SHIFT);
792 else
793 avg_time = max_time;
794
795 if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) {
796 int slots;
797
798
799
800
801
802
803
804
805
806 slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT),
807 (unsigned long)WB_FRN_HIST_MAX_SLOTS);
808 history <<= slots;
809 if (wbc->wb_id != max_id)
810 history |= (1U << slots) - 1;
811
812 if (history)
813 trace_inode_foreign_history(inode, wbc, history);
814
815
816
817
818
819
820
821
822 if (hweight32(history) > WB_FRN_HIST_THR_SLOTS)
823 inode_switch_wbs(inode, max_id);
824 }
825
826
827
828
829
830 inode->i_wb_frn_winner = max_id;
831 inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX);
832 inode->i_wb_frn_history = history;
833
834 wb_put(wbc->wb);
835 wbc->wb = NULL;
836}
837EXPORT_SYMBOL_GPL(wbc_detach_inode);

/**
 * wbc_account_cgroup_owner - account writeback to update inode cgroup ownership
 * @wbc: writeback_control of the writeback in progress
 * @page: page being written out
 * @bytes: number of bytes being written out
 *
 * @bytes from @page are about to be written out during the writeback
 * controlled by @wbc.  Keep the book to enable foreign inode detection;
 * see wbc_detach_inode().
 */
849void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
850 size_t bytes)
851{
852 struct cgroup_subsys_state *css;
853 int id;
854
855
856
857
858
859
860
861 if (!wbc->wb || wbc->no_cgroup_owner)
862 return;
863
864 css = mem_cgroup_css_from_page(page);
865
866 if (!(css->flags & CSS_ONLINE))
867 return;
868
869 id = css->id;
870
871 if (id == wbc->wb_id) {
872 wbc->wb_bytes += bytes;
873 return;
874 }
875
876 if (id == wbc->wb_lcand_id)
877 wbc->wb_lcand_bytes += bytes;
878
879
880 if (!wbc->wb_tcand_bytes)
881 wbc->wb_tcand_id = id;
882 if (id == wbc->wb_tcand_id)
883 wbc->wb_tcand_bytes += bytes;
884 else
885 wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
886}
887EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner);
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905int inode_congested(struct inode *inode, int cong_bits)
906{
907
908
909
910
911 if (inode && inode_to_wb_is_valid(inode)) {
912 struct bdi_writeback *wb;
913 struct wb_lock_cookie lock_cookie = {};
914 bool congested;
915
916 wb = unlocked_inode_to_wb_begin(inode, &lock_cookie);
917 congested = wb_congested(wb, cong_bits);
918 unlocked_inode_to_wb_end(inode, &lock_cookie);
919 return congested;
920 }
921
922 return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
923}
924EXPORT_SYMBOL_GPL(inode_congested);
925
926
927
928
929
930
931
932
933
934
935static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
936{
937 unsigned long this_bw = wb->avg_write_bandwidth;
938 unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
939
940 if (nr_pages == LONG_MAX)
941 return LONG_MAX;
942
943
944
945
946
947
948 if (!tot_bw || this_bw >= tot_bw)
949 return nr_pages;
950 else
951 return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw);
952}
953
954
955
956
957
958
959
960
961
962
963
964
965static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
966 struct wb_writeback_work *base_work,
967 bool skip_if_busy)
968{
969 struct bdi_writeback *last_wb = NULL;
970 struct bdi_writeback *wb = list_entry(&bdi->wb_list,
971 struct bdi_writeback, bdi_node);
972
973 might_sleep();
974restart:
975 rcu_read_lock();
976 list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
977 DEFINE_WB_COMPLETION(fallback_work_done, bdi);
978 struct wb_writeback_work fallback_work;
979 struct wb_writeback_work *work;
980 long nr_pages;
981
982 if (last_wb) {
983 wb_put(last_wb);
984 last_wb = NULL;
985 }
986
987
988 if (!wb_has_dirty_io(wb) &&
989 (base_work->sync_mode == WB_SYNC_NONE ||
990 list_empty(&wb->b_dirty_time)))
991 continue;
992 if (skip_if_busy && writeback_in_progress(wb))
993 continue;
994
995 nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);
996
997 work = kmalloc(sizeof(*work), GFP_ATOMIC);
998 if (work) {
999 *work = *base_work;
1000 work->nr_pages = nr_pages;
1001 work->auto_free = 1;
1002 wb_queue_work(wb, work);
1003 continue;
1004 }
1005
1006
1007 work = &fallback_work;
1008 *work = *base_work;
1009 work->nr_pages = nr_pages;
1010 work->auto_free = 0;
1011 work->done = &fallback_work_done;
1012
1013 wb_queue_work(wb, work);
1014
1015
1016
1017
1018
1019
1020 wb_get(wb);
1021 last_wb = wb;
1022
1023 rcu_read_unlock();
1024 wb_wait_for_completion(&fallback_work_done);
1025 goto restart;
1026 }
1027 rcu_read_unlock();
1028
1029 if (last_wb)
1030 wb_put(last_wb);
1031}
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr,
1045 enum wb_reason reason, struct wb_completion *done)
1046{
1047 struct backing_dev_info *bdi;
1048 struct cgroup_subsys_state *memcg_css;
1049 struct bdi_writeback *wb;
1050 struct wb_writeback_work *work;
1051 int ret;
1052
1053
1054 bdi = bdi_get_by_id(bdi_id);
1055 if (!bdi)
1056 return -ENOENT;
1057
1058 rcu_read_lock();
1059 memcg_css = css_from_id(memcg_id, &memory_cgrp_subsys);
1060 if (memcg_css && !css_tryget(memcg_css))
1061 memcg_css = NULL;
1062 rcu_read_unlock();
1063 if (!memcg_css) {
1064 ret = -ENOENT;
1065 goto out_bdi_put;
1066 }
1067
1068
1069
1070
1071
1072 wb = wb_get_lookup(bdi, memcg_css);
1073 if (!wb) {
1074 ret = -ENOENT;
1075 goto out_css_put;
1076 }
1077
1078
1079
1080
1081
1082
1083
1084
1085 if (!nr) {
1086 unsigned long filepages, headroom, dirty, writeback;
1087
1088 mem_cgroup_wb_stats(wb, &filepages, &headroom, &dirty,
1089 &writeback);
1090 nr = dirty * 10 / 8;
1091 }
1092
1093
1094 work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN);
1095 if (work) {
1096 work->nr_pages = nr;
1097 work->sync_mode = WB_SYNC_NONE;
1098 work->range_cyclic = 1;
1099 work->reason = reason;
1100 work->done = done;
1101 work->auto_free = 1;
1102 wb_queue_work(wb, work);
1103 ret = 0;
1104 } else {
1105 ret = -ENOMEM;
1106 }
1107
1108 wb_put(wb);
1109out_css_put:
1110 css_put(memcg_css);
1111out_bdi_put:
1112 bdi_put(bdi);
1113 return ret;
1114}

/**
 * cgroup_writeback_umount - flush inode wb switches for umount
 *
 * This function is called when a super_block is about to be destroyed and
 * flushes in-flight inode wb switches.  An inode wb switch goes through
 * RCU and then workqueue, so the two need to be flushed in order to ensure
 * that all previously scheduled switches are finished.  As wb switches are
 * rare occurrences and synchronize_rcu() can take a while, perform
 * flushing iff wb switches are in flight.
 */
1126void cgroup_writeback_umount(void)
1127{
1128
1129
1130
1131
1132 smp_mb();
1133
1134 if (atomic_read(&isw_nr_in_flight)) {
1135
1136
1137
1138
1139 rcu_barrier();
1140 flush_workqueue(isw_wq);
1141 }
1142}
1143
1144static int __init cgroup_writeback_init(void)
1145{
1146 isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
1147 if (!isw_wq)
1148 return -ENOMEM;
1149 return 0;
1150}
1151fs_initcall(cgroup_writeback_init);
1152
1153#else
1154
1155static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
1156static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
1157
1158static void inode_cgwb_move_to_attached(struct inode *inode,
1159 struct bdi_writeback *wb)
1160{
1161 assert_spin_locked(&wb->list_lock);
1162 assert_spin_locked(&inode->i_lock);
1163
1164 inode->i_state &= ~I_SYNC_QUEUED;
1165 list_del_init(&inode->i_io_list);
1166 wb_io_lists_depopulated(wb);
1167}
1168
1169static struct bdi_writeback *
1170locked_inode_to_wb_and_lock_list(struct inode *inode)
1171 __releases(&inode->i_lock)
1172 __acquires(&wb->list_lock)
1173{
1174 struct bdi_writeback *wb = inode_to_wb(inode);
1175
1176 spin_unlock(&inode->i_lock);
1177 spin_lock(&wb->list_lock);
1178 return wb;
1179}
1180
1181static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
1182 __acquires(&wb->list_lock)
1183{
1184 struct bdi_writeback *wb = inode_to_wb(inode);
1185
1186 spin_lock(&wb->list_lock);
1187 return wb;
1188}
1189
1190static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
1191{
1192 return nr_pages;
1193}
1194
1195static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
1196 struct wb_writeback_work *base_work,
1197 bool skip_if_busy)
1198{
1199 might_sleep();
1200
1201 if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) {
1202 base_work->auto_free = 0;
1203 wb_queue_work(&bdi->wb, base_work);
1204 }
1205}
1206
1207#endif

/*
 * Add in the number of potentially dirty inodes, because each inode
 * write can dirty pagecache in the underlying blockdev.
 */
static unsigned long get_nr_dirty_pages(void)
{
	return global_node_page_state(NR_FILE_DIRTY) +
		get_nr_dirty_inodes();
}

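/* start writeback of all dirty pages on @wb, recording @reason; no-op if a start-all request is already pending */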
1219static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason)
1220{
1221 if (!wb_has_dirty_io(wb))
1222 return;
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232 if (test_bit(WB_start_all, &wb->state) ||
1233 test_and_set_bit(WB_start_all, &wb->state))
1234 return;
1235
1236 wb->start_all_reason = reason;
1237 wb_wakeup(wb);
1238}
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250void wb_start_background_writeback(struct bdi_writeback *wb)
1251{
1252
1253
1254
1255
1256 trace_writeback_wake_background(wb);
1257 wb_wakeup(wb);
1258}
1259
1260
1261
1262
1263void inode_io_list_del(struct inode *inode)
1264{
1265 struct bdi_writeback *wb;
1266
1267 wb = inode_to_wb_and_lock_list(inode);
1268 spin_lock(&inode->i_lock);
1269
1270 inode->i_state &= ~I_SYNC_QUEUED;
1271 list_del_init(&inode->i_io_list);
1272 wb_io_lists_depopulated(wb);
1273
1274 spin_unlock(&inode->i_lock);
1275 spin_unlock(&wb->list_lock);
1276}
1277EXPORT_SYMBOL(inode_io_list_del);
1278
1279
1280
1281
1282void sb_mark_inode_writeback(struct inode *inode)
1283{
1284 struct super_block *sb = inode->i_sb;
1285 unsigned long flags;
1286
1287 if (list_empty(&inode->i_wb_list)) {
1288 spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
1289 if (list_empty(&inode->i_wb_list)) {
1290 list_add_tail(&inode->i_wb_list, &sb->s_inodes_wb);
1291 trace_sb_mark_inode_writeback(inode);
1292 }
1293 spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
1294 }
1295}
1296
1297
1298
1299
1300void sb_clear_inode_writeback(struct inode *inode)
1301{
1302 struct super_block *sb = inode->i_sb;
1303 unsigned long flags;
1304
1305 if (!list_empty(&inode->i_wb_list)) {
1306 spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
1307 if (!list_empty(&inode->i_wb_list)) {
1308 list_del_init(&inode->i_wb_list);
1309 trace_sb_clear_inode_writeback(inode);
1310 }
1311 spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
1312 }
1313}
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb)
1325{
1326 assert_spin_locked(&inode->i_lock);
1327
1328 if (!list_empty(&wb->b_dirty)) {
1329 struct inode *tail;
1330
1331 tail = wb_inode(wb->b_dirty.next);
1332 if (time_before(inode->dirtied_when, tail->dirtied_when))
1333 inode->dirtied_when = jiffies;
1334 }
1335 inode_io_list_move_locked(inode, wb, &wb->b_dirty);
1336 inode->i_state &= ~I_SYNC_QUEUED;
1337}
1338
1339static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
1340{
1341 spin_lock(&inode->i_lock);
1342 redirty_tail_locked(inode, wb);
1343 spin_unlock(&inode->i_lock);
1344}

/*
 * requeue inode for re-scanning after bdi->b_io list is exhausted.
 */
static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
{
	inode_io_list_move_locked(inode, wb, &wb->b_more_io);
}

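/* writeback on @inode is done: clear I_SYNC and wake up anybody waiting for it */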
static void inode_sync_complete(struct inode *inode)
{
	inode->i_state &= ~I_SYNC;
	/* If inode is clean and unused, put it into LRU now... */
	inode_add_lru(inode);
	/* Waiters must see I_SYNC cleared before being woken up */
	smp_mb();
	wake_up_bit(&inode->i_state, __I_SYNC);
}
1363
1364static bool inode_dirtied_after(struct inode *inode, unsigned long t)
1365{
1366 bool ret = time_after(inode->dirtied_when, t);
1367#ifndef CONFIG_64BIT
1368
1369
1370
1371
1372
1373
1374 ret = ret && time_before_eq(inode->dirtied_when, jiffies);
1375#endif
1376 return ret;
1377}
1378
1379#define EXPIRE_DIRTY_ATIME 0x0001

/*
 * Move expired (dirtied before dirtied_before) dirty inodes from
 * @delaying_queue to @dispatch_queue.
 */
1385static int move_expired_inodes(struct list_head *delaying_queue,
1386 struct list_head *dispatch_queue,
1387 unsigned long dirtied_before)
1388{
1389 LIST_HEAD(tmp);
1390 struct list_head *pos, *node;
1391 struct super_block *sb = NULL;
1392 struct inode *inode;
1393 int do_sb_sort = 0;
1394 int moved = 0;
1395
1396 while (!list_empty(delaying_queue)) {
1397 inode = wb_inode(delaying_queue->prev);
1398 if (inode_dirtied_after(inode, dirtied_before))
1399 break;
1400 list_move(&inode->i_io_list, &tmp);
1401 moved++;
1402 spin_lock(&inode->i_lock);
1403 inode->i_state |= I_SYNC_QUEUED;
1404 spin_unlock(&inode->i_lock);
1405 if (sb_is_blkdev_sb(inode->i_sb))
1406 continue;
1407 if (sb && sb != inode->i_sb)
1408 do_sb_sort = 1;
1409 sb = inode->i_sb;
1410 }
1411
1412
1413 if (!do_sb_sort) {
1414 list_splice(&tmp, dispatch_queue);
1415 goto out;
1416 }
1417
1418
1419 while (!list_empty(&tmp)) {
1420 sb = wb_inode(tmp.prev)->i_sb;
1421 list_for_each_prev_safe(pos, node, &tmp) {
1422 inode = wb_inode(pos);
1423 if (inode->i_sb == sb)
1424 list_move(&inode->i_io_list, dispatch_queue);
1425 }
1426 }
1427out:
1428 return moved;
1429}

/*
 * Queue all expired dirty inodes for io, eldest first.
 * Before
 *         newly dirtied     b_dirty    b_io    b_more_io
 *         =============>    gf         edc     BA
 * After
 *         newly dirtied     b_dirty    b_io    b_more_io
 *         =============>    g          fBAedc
 *                                           |
 *                                           +--> dequeue for IO
 */
static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work,
		     unsigned long dirtied_before)
{
	int moved;
	unsigned long time_expire_jif = dirtied_before;

	assert_spin_locked(&wb->list_lock);
	list_splice_init(&wb->b_more_io, &wb->b_io);
	moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, dirtied_before);
	if (!work->for_sync)
		time_expire_jif = jiffies - dirtytime_expire_interval * HZ;
	moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
				     time_expire_jif);
	if (moved)
		wb_io_lists_populated(wb);
	trace_writeback_queue_io(wb, work, dirtied_before, moved);
}

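/* write out the inode itself via the filesystem's ->write_inode(), wrapped in tracepoints; bad inodes are skipped */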
1460static int write_inode(struct inode *inode, struct writeback_control *wbc)
1461{
1462 int ret;
1463
1464 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) {
1465 trace_writeback_write_inode_start(inode, wbc);
1466 ret = inode->i_sb->s_op->write_inode(inode, wbc);
1467 trace_writeback_write_inode(inode, wbc);
1468 return ret;
1469 }
1470 return 0;
1471}

/*
 * Wait for writeback on an inode to complete. Called with i_lock held.
 * Caller must make sure inode cannot go away when we drop i_lock.
 */
1477static void __inode_wait_for_writeback(struct inode *inode)
1478 __releases(inode->i_lock)
1479 __acquires(inode->i_lock)
1480{
1481 DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
1482 wait_queue_head_t *wqh;
1483
1484 wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
1485 while (inode->i_state & I_SYNC) {
1486 spin_unlock(&inode->i_lock);
1487 __wait_on_bit(wqh, &wq, bit_wait,
1488 TASK_UNINTERRUPTIBLE);
1489 spin_lock(&inode->i_lock);
1490 }
1491}

/*
 * Wait for writeback on an inode to complete. Caller must have inode pinned.
 */
1496void inode_wait_for_writeback(struct inode *inode)
1497{
1498 spin_lock(&inode->i_lock);
1499 __inode_wait_for_writeback(inode);
1500 spin_unlock(&inode->i_lock);
1501}

/*
 * Sleep until I_SYNC is cleared. This function must be called with i_lock
 * held and drops it. It is aimed for callers not holding any inode reference
 * so once i_lock is dropped, inode can go away.
 */
1508static void inode_sleep_on_writeback(struct inode *inode)
1509 __releases(inode->i_lock)
1510{
1511 DEFINE_WAIT(wait);
1512 wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
1513 int sleep;
1514
1515 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
1516 sleep = inode->i_state & I_SYNC;
1517 spin_unlock(&inode->i_lock);
1518 if (sleep)
1519 schedule();
1520 finish_wait(wqh, &wait);
1521}

/*
 * Find the proper writeback list for the inode, based on its state after a
 * writeback attempt, and requeue it there: still-dirty inodes go back to
 * b_dirty or b_more_io, timestamp-only dirty inodes to b_dirty_time, and
 * clean inodes leave the dirty IO lists altogether.
 *
 * Called with both wb->list_lock and inode->i_lock held.
 */
1531static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
1532 struct writeback_control *wbc)
1533{
1534 if (inode->i_state & I_FREEING)
1535 return;
1536
1537
1538
1539
1540
1541
1542 if ((inode->i_state & I_DIRTY) &&
1543 (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
1544 inode->dirtied_when = jiffies;
1545
1546 if (wbc->pages_skipped) {
1547
1548
1549
1550
1551 redirty_tail_locked(inode, wb);
1552 return;
1553 }
1554
1555 if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
1556
1557
1558
1559
1560 if (wbc->nr_to_write <= 0) {
1561
1562 requeue_io(inode, wb);
1563 } else {
1564
1565
1566
1567
1568
1569
1570
1571 redirty_tail_locked(inode, wb);
1572 }
1573 } else if (inode->i_state & I_DIRTY) {
1574
1575
1576
1577
1578
1579 redirty_tail_locked(inode, wb);
1580 } else if (inode->i_state & I_DIRTY_TIME) {
1581 inode->dirtied_when = jiffies;
1582 inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
1583 inode->i_state &= ~I_SYNC_QUEUED;
1584 } else {
1585
1586 inode_cgwb_move_to_attached(inode, wb);
1587 }
1588}

/*
 * Write out an inode and its dirty pages (or some of its dirty pages,
 * depending on @wbc->nr_to_write), and clear the relevant dirty flags from
 * i_state.
 *
 * This doesn't remove the inode from the writeback list it is on, except
 * potentially to move it from b_dirty_time to b_dirty due to timestamp
 * expiration.  The caller is responsible for the removal.
 *
 * This won't be called (or return) unless the inode is still I_SYNC.
 */
1601static int
1602__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
1603{
1604 struct address_space *mapping = inode->i_mapping;
1605 long nr_to_write = wbc->nr_to_write;
1606 unsigned dirty;
1607 int ret;
1608
1609 WARN_ON(!(inode->i_state & I_SYNC));
1610
1611 trace_writeback_single_inode_start(inode, wbc, nr_to_write);
1612
1613 ret = do_writepages(mapping, wbc);
1614
1615
1616
1617
1618
1619
1620
1621
1622 if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
1623 int err = filemap_fdatawait(mapping);
1624 if (ret == 0)
1625 ret = err;
1626 }
1627
1628
1629
1630
1631
1632
1633 if ((inode->i_state & I_DIRTY_TIME) &&
1634 (wbc->sync_mode == WB_SYNC_ALL ||
1635 time_after(jiffies, inode->dirtied_time_when +
1636 dirtytime_expire_interval * HZ))) {
1637 trace_writeback_lazytime(inode);
1638 mark_inode_dirty_sync(inode);
1639 }
1640
1641
1642
1643
1644
1645
1646
1647 spin_lock(&inode->i_lock);
1648 dirty = inode->i_state & I_DIRTY;
1649 inode->i_state &= ~dirty;
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662 smp_mb();
1663
1664 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
1665 inode->i_state |= I_DIRTY_PAGES;
1666
1667 spin_unlock(&inode->i_lock);
1668
1669
1670 if (dirty & ~I_DIRTY_PAGES) {
1671 int err = write_inode(inode, wbc);
1672 if (ret == 0)
1673 ret = err;
1674 }
1675 trace_writeback_single_inode(inode, wbc, nr_to_write);
1676 return ret;
1677}

/*
 * Write out an inode's dirty data and metadata on demand, i.e. separately from
 * the regular batched writeback done by the flusher threads in
 * writeback_sb_inodes().  @wbc controls various aspects of the write, such as
 * whether it is a data-integrity sync (%WB_SYNC_ALL) or not (%WB_SYNC_NONE).
 *
 * To prevent the inode from going away, either the caller must have a reference
 * to the inode, or the inode must have I_WILL_FREE or I_FREEING set.
 */
1688static int writeback_single_inode(struct inode *inode,
1689 struct writeback_control *wbc)
1690{
1691 struct bdi_writeback *wb;
1692 int ret = 0;
1693
1694 spin_lock(&inode->i_lock);
1695 if (!atomic_read(&inode->i_count))
1696 WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
1697 else
1698 WARN_ON(inode->i_state & I_WILL_FREE);
1699
1700 if (inode->i_state & I_SYNC) {
1701
1702
1703
1704
1705
1706
1707 if (wbc->sync_mode != WB_SYNC_ALL)
1708 goto out;
1709 __inode_wait_for_writeback(inode);
1710 }
1711 WARN_ON(inode->i_state & I_SYNC);
1712
1713
1714
1715
1716
1717
1718
1719 if (!(inode->i_state & I_DIRTY_ALL) &&
1720 (wbc->sync_mode != WB_SYNC_ALL ||
1721 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
1722 goto out;
1723 inode->i_state |= I_SYNC;
1724 wbc_attach_and_unlock_inode(wbc, inode);
1725
1726 ret = __writeback_single_inode(inode, wbc);
1727
1728 wbc_detach_inode(wbc);
1729
1730 wb = inode_to_wb_and_lock_list(inode);
1731 spin_lock(&inode->i_lock);
1732
1733
1734
1735
1736
1737 if (!(inode->i_state & I_DIRTY_ALL))
1738 inode_cgwb_move_to_attached(inode, wb);
1739 spin_unlock(&wb->list_lock);
1740 inode_sync_complete(inode);
1741out:
1742 spin_unlock(&inode->i_lock);
1743 return ret;
1744}
1745
1746static long writeback_chunk_size(struct bdi_writeback *wb,
1747 struct wb_writeback_work *work)
1748{
1749 long pages;

	/*
	 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
	 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
	 * here avoids calling into writeback_inodes_wb() more than once.
	 *
	 * The intended call sequence for WB_SYNC_ALL writeback is:
	 *
	 *      wb_writeback()
	 *          writeback_sb_inodes()       <== called only once
	 *              write_cache_pages()     <== called once for each inode
	 *                  (quickly) tag currently dirty pages
	 *                  (maybe slowly) sync all tagged pages
	 */
1764 if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
1765 pages = LONG_MAX;
1766 else {
1767 pages = min(wb->avg_write_bandwidth / 2,
1768 global_wb_domain.dirty_limit / DIRTY_SCOPE);
1769 pages = min(pages, work->nr_pages);
1770 pages = round_down(pages + MIN_WRITEBACK_PAGES,
1771 MIN_WRITEBACK_PAGES);
1772 }
1773
1774 return pages;
1775}

/*
 * Write a portion of b_io inodes which belong to @sb.
 *
 * Return the number of pages and/or inodes written.
 *
 * NOTE! This is called with wb->list_lock held, and will
 * unlock and relock that for each inode it ends up doing
 * IO for.
 */
1786static long writeback_sb_inodes(struct super_block *sb,
1787 struct bdi_writeback *wb,
1788 struct wb_writeback_work *work)
1789{
1790 struct writeback_control wbc = {
1791 .sync_mode = work->sync_mode,
1792 .tagged_writepages = work->tagged_writepages,
1793 .for_kupdate = work->for_kupdate,
1794 .for_background = work->for_background,
1795 .for_sync = work->for_sync,
1796 .range_cyclic = work->range_cyclic,
1797 .range_start = 0,
1798 .range_end = LLONG_MAX,
1799 };
1800 unsigned long start_time = jiffies;
1801 long write_chunk;
1802 long wrote = 0;
1803
1804 while (!list_empty(&wb->b_io)) {
1805 struct inode *inode = wb_inode(wb->b_io.prev);
1806 struct bdi_writeback *tmp_wb;
1807
1808 if (inode->i_sb != sb) {
1809 if (work->sb) {
1810
1811
1812
1813
1814
1815 redirty_tail(inode, wb);
1816 continue;
1817 }
1818
1819
1820
1821
1822
1823
1824 break;
1825 }
1826
1827
1828
1829
1830
1831
1832 spin_lock(&inode->i_lock);
1833 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
1834 redirty_tail_locked(inode, wb);
1835 spin_unlock(&inode->i_lock);
1836 continue;
1837 }
1838 if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848 spin_unlock(&inode->i_lock);
1849 requeue_io(inode, wb);
1850 trace_writeback_sb_inodes_requeue(inode);
1851 continue;
1852 }
1853 spin_unlock(&wb->list_lock);
1854
1855
1856
1857
1858
1859
1860 if (inode->i_state & I_SYNC) {
1861
1862 inode_sleep_on_writeback(inode);
1863
1864 spin_lock(&wb->list_lock);
1865 continue;
1866 }
1867 inode->i_state |= I_SYNC;
1868 wbc_attach_and_unlock_inode(&wbc, inode);
1869
1870 write_chunk = writeback_chunk_size(wb, work);
1871 wbc.nr_to_write = write_chunk;
1872 wbc.pages_skipped = 0;
1873
1874
1875
1876
1877
1878 __writeback_single_inode(inode, &wbc);
1879
1880 wbc_detach_inode(&wbc);
1881 work->nr_pages -= write_chunk - wbc.nr_to_write;
1882 wrote += write_chunk - wbc.nr_to_write;
1883
1884 if (need_resched()) {
1885
1886
1887
1888
1889
1890
1891
1892
1893 blk_flush_plug(current);
1894 cond_resched();
1895 }
1896
1897
1898
1899
1900
1901 tmp_wb = inode_to_wb_and_lock_list(inode);
1902 spin_lock(&inode->i_lock);
1903 if (!(inode->i_state & I_DIRTY_ALL))
1904 wrote++;
1905 requeue_inode(inode, tmp_wb, &wbc);
1906 inode_sync_complete(inode);
1907 spin_unlock(&inode->i_lock);
1908
1909 if (unlikely(tmp_wb != wb)) {
1910 spin_unlock(&tmp_wb->list_lock);
1911 spin_lock(&wb->list_lock);
1912 }
1913
1914
1915
1916
1917
1918 if (wrote) {
1919 if (time_is_before_jiffies(start_time + HZ / 10UL))
1920 break;
1921 if (work->nr_pages <= 0)
1922 break;
1923 }
1924 }
1925 return wrote;
1926}
1927
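/* write back b_io inodes regardless of superblock, taking s_umount on each sb with a trylock */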
1928static long __writeback_inodes_wb(struct bdi_writeback *wb,
1929 struct wb_writeback_work *work)
1930{
1931 unsigned long start_time = jiffies;
1932 long wrote = 0;
1933
1934 while (!list_empty(&wb->b_io)) {
1935 struct inode *inode = wb_inode(wb->b_io.prev);
1936 struct super_block *sb = inode->i_sb;
1937
1938 if (!trylock_super(sb)) {
1939
1940
1941
1942
1943
1944 redirty_tail(inode, wb);
1945 continue;
1946 }
1947 wrote += writeback_sb_inodes(sb, wb, work);
1948 up_read(&sb->s_umount);
1949
1950
1951 if (wrote) {
1952 if (time_is_before_jiffies(start_time + HZ / 10UL))
1953 break;
1954 if (work->nr_pages <= 0)
1955 break;
1956 }
1957 }
1958
1959 return wrote;
1960}
1961
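/* issue up to @nr_pages of WB_SYNC_NONE writeback against @wb and return the number of pages written */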
1962static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
1963 enum wb_reason reason)
1964{
1965 struct wb_writeback_work work = {
1966 .nr_pages = nr_pages,
1967 .sync_mode = WB_SYNC_NONE,
1968 .range_cyclic = 1,
1969 .reason = reason,
1970 };
1971 struct blk_plug plug;
1972
1973 blk_start_plug(&plug);
1974 spin_lock(&wb->list_lock);
1975 if (list_empty(&wb->b_io))
1976 queue_io(wb, &work, jiffies);
1977 __writeback_inodes_wb(wb, &work);
1978 spin_unlock(&wb->list_lock);
1979 blk_finish_plug(&plug);
1980
1981 return nr_pages - work.nr_pages;
1982}

/*
 * Explicit flushing or periodic writeback of "old" data.
 *
 * Define "old": the first time one of an inode's pages is dirtied, we mark the
 * dirtying-time in the inode's address_space.  So this periodic writeback code
 * just walks the superblock inode list, writing back any inodes which are
 * older than a specific point in time.
 *
 * Try to run once per dirty_writeback_interval.  But if a writeback event
 * takes longer than a dirty_writeback_interval interval, then leave a
 * one-second gap.
 *
 * dirtied_before controls which inodes are eligible: only inodes dirtied
 * before that timestamp are queued for writeback by this pass.
 */
1999static long wb_writeback(struct bdi_writeback *wb,
2000 struct wb_writeback_work *work)
2001{
2002 unsigned long wb_start = jiffies;
2003 long nr_pages = work->nr_pages;
2004 unsigned long dirtied_before = jiffies;
2005 struct inode *inode;
2006 long progress;
2007 struct blk_plug plug;
2008
2009 blk_start_plug(&plug);
2010 spin_lock(&wb->list_lock);
2011 for (;;) {
2012
2013
2014
2015 if (work->nr_pages <= 0)
2016 break;
2017
2018
2019
2020
2021
2022
2023
2024 if ((work->for_background || work->for_kupdate) &&
2025 !list_empty(&wb->work_list))
2026 break;
2027
2028
2029
2030
2031
2032 if (work->for_background && !wb_over_bg_thresh(wb))
2033 break;
2034
2035
2036
2037
2038
2039
2040
2041 if (work->for_kupdate) {
2042 dirtied_before = jiffies -
2043 msecs_to_jiffies(dirty_expire_interval * 10);
2044 } else if (work->for_background)
2045 dirtied_before = jiffies;
2046
2047 trace_writeback_start(wb, work);
2048 if (list_empty(&wb->b_io))
2049 queue_io(wb, work, dirtied_before);
2050 if (work->sb)
2051 progress = writeback_sb_inodes(work->sb, wb, work);
2052 else
2053 progress = __writeback_inodes_wb(wb, work);
2054 trace_writeback_written(wb, work);
2055
2056 wb_update_bandwidth(wb, wb_start);
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066 if (progress)
2067 continue;
2068
2069
2070
2071 if (list_empty(&wb->b_more_io))
2072 break;
2073
2074
2075
2076
2077
2078 trace_writeback_wait(wb, work);
2079 inode = wb_inode(wb->b_more_io.prev);
2080 spin_lock(&inode->i_lock);
2081 spin_unlock(&wb->list_lock);
2082
2083 inode_sleep_on_writeback(inode);
2084 spin_lock(&wb->list_lock);
2085 }
2086 spin_unlock(&wb->list_lock);
2087 blk_finish_plug(&plug);
2088
2089 return nr_pages - work->nr_pages;
2090}

/*
 * Return the next wb_writeback_work struct that hasn't been processed yet.
 */
2095static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
2096{
2097 struct wb_writeback_work *work = NULL;
2098
2099 spin_lock_bh(&wb->work_lock);
2100 if (!list_empty(&wb->work_list)) {
2101 work = list_entry(wb->work_list.next,
2102 struct wb_writeback_work, list);
2103 list_del_init(&work->list);
2104 }
2105 spin_unlock_bh(&wb->work_lock);
2106 return work;
2107}
2108
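/* write back as much as possible while @wb remains over its background dirty threshold */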
2109static long wb_check_background_flush(struct bdi_writeback *wb)
2110{
2111 if (wb_over_bg_thresh(wb)) {
2112
2113 struct wb_writeback_work work = {
2114 .nr_pages = LONG_MAX,
2115 .sync_mode = WB_SYNC_NONE,
2116 .for_background = 1,
2117 .range_cyclic = 1,
2118 .reason = WB_REASON_BACKGROUND,
2119 };
2120
2121 return wb_writeback(wb, &work);
2122 }
2123
2124 return 0;
2125}
2126
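/* kupdate-style periodic writeback: once per dirty_writeback_interval, flush data dirtied longer ago than the expire interval */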
2127static long wb_check_old_data_flush(struct bdi_writeback *wb)
2128{
2129 unsigned long expired;
2130 long nr_pages;
2131
2132
2133
2134
2135 if (!dirty_writeback_interval)
2136 return 0;
2137
2138 expired = wb->last_old_flush +
2139 msecs_to_jiffies(dirty_writeback_interval * 10);
2140 if (time_before(jiffies, expired))
2141 return 0;
2142
2143 wb->last_old_flush = jiffies;
2144 nr_pages = get_nr_dirty_pages();
2145
2146 if (nr_pages) {
2147 struct wb_writeback_work work = {
2148 .nr_pages = nr_pages,
2149 .sync_mode = WB_SYNC_NONE,
2150 .for_kupdate = 1,
2151 .range_cyclic = 1,
2152 .reason = WB_REASON_PERIODIC,
2153 };
2154
2155 return wb_writeback(wb, &work);
2156 }
2157
2158 return 0;
2159}
2160
2161static long wb_check_start_all(struct bdi_writeback *wb)
2162{
2163 long nr_pages;
2164
2165 if (!test_bit(WB_start_all, &wb->state))
2166 return 0;
2167
2168 nr_pages = get_nr_dirty_pages();
2169 if (nr_pages) {
2170 struct wb_writeback_work work = {
2171 .nr_pages = wb_split_bdi_pages(wb, nr_pages),
2172 .sync_mode = WB_SYNC_NONE,
2173 .range_cyclic = 1,
2174 .reason = wb->start_all_reason,
2175 };
2176
2177 nr_pages = wb_writeback(wb, &work);
2178 }
2179
2180 clear_bit(WB_start_all, &wb->state);
2181 return nr_pages;
2182}
2183

/*
 * Retrieve work items and do the writeback they describe
 */
2188static long wb_do_writeback(struct bdi_writeback *wb)
2189{
2190 struct wb_writeback_work *work;
2191 long wrote = 0;
2192
2193 set_bit(WB_writeback_running, &wb->state);
2194 while ((work = get_next_work_item(wb)) != NULL) {
2195 trace_writeback_exec(wb, work);
2196 wrote += wb_writeback(wb, work);
2197 finish_writeback_work(wb, work);
2198 }
2199
2200
2201
2202
2203 wrote += wb_check_start_all(wb);
2204
2205
2206
2207
2208 wrote += wb_check_old_data_flush(wb);
2209 wrote += wb_check_background_flush(wb);
2210 clear_bit(WB_writeback_running, &wb->state);
2211
2212 return wrote;
2213}

/*
 * Handle writeback of dirty data for the device backed by this bdi. Also
 * reschedules periodically and does kupdated style flushing.
 */
2219void wb_workfn(struct work_struct *work)
2220{
2221 struct bdi_writeback *wb = container_of(to_delayed_work(work),
2222 struct bdi_writeback, dwork);
2223 long pages_written;
2224
2225 set_worker_desc("flush-%s", bdi_dev_name(wb->bdi));
2226 current->flags |= PF_SWAPWRITE;
2227
2228 if (likely(!current_is_workqueue_rescuer() ||
2229 !test_bit(WB_registered, &wb->state))) {
2230
2231
2232
2233
2234
2235
2236 do {
2237 pages_written = wb_do_writeback(wb);
2238 trace_writeback_pages_written(pages_written);
2239 } while (!list_empty(&wb->work_list));
2240 } else {
2241
2242
2243
2244
2245
2246 pages_written = writeback_inodes_wb(wb, 1024,
2247 WB_REASON_FORKER_THREAD);
2248 trace_writeback_pages_written(pages_written);
2249 }
2250
2251 if (!list_empty(&wb->work_list))
2252 wb_wakeup(wb);
2253 else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
2254 wb_wakeup_delayed(wb);
2255
2256 current->flags &= ~PF_SWAPWRITE;
2257}
2258
2259
2260
2261
2262
2263static void __wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
2264 enum wb_reason reason)
2265{
2266 struct bdi_writeback *wb;
2267
2268 if (!bdi_has_dirty_io(bdi))
2269 return;
2270
2271 list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
2272 wb_start_writeback(wb, reason);
2273}
2274
2275void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
2276 enum wb_reason reason)
2277{
2278 rcu_read_lock();
2279 __wakeup_flusher_threads_bdi(bdi, reason);
2280 rcu_read_unlock();
2281}
2282
2283
2284
2285
2286void wakeup_flusher_threads(enum wb_reason reason)
2287{
2288 struct backing_dev_info *bdi;
2289
2290
2291
2292
2293 if (blk_needs_flush_plug(current))
2294 blk_schedule_flush_plug(current);
2295
2296 rcu_read_lock();
2297 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
2298 __wakeup_flusher_threads_bdi(bdi, reason);
2299 rcu_read_unlock();
2300}

/*
 * Wake up bdi's periodically to make sure dirtytime inodes gets
 * written back periodically.  We deliberately do *not* check the
 * b_dirtytime list in wb_has_dirty_io(), since this would cause the
 * kernel to be constantly waking up once there are any dirtytime
 * inodes on the system.  So instead we define a separate delayed work
 * function which gets called much more rarely.  (By default, only
 * once every 12 hours.)
 *
 * If there is any other write activity going on in the file system,
 * this function won't be necessary.  But if the only thing that has
 * happened on the file system is a dirtytime inode caused by an atime
 * update, we need this infrastructure below to make sure that inode
 * eventually gets pushed out to disk.
 */
2317static void wakeup_dirtytime_writeback(struct work_struct *w);
2318static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback);
2319
2320static void wakeup_dirtytime_writeback(struct work_struct *w)
2321{
2322 struct backing_dev_info *bdi;
2323
2324 rcu_read_lock();
2325 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
2326 struct bdi_writeback *wb;
2327
2328 list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
2329 if (!list_empty(&wb->b_dirty_time))
2330 wb_wakeup(wb);
2331 }
2332 rcu_read_unlock();
2333 schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
2334}
2335
2336static int __init start_dirtytime_writeback(void)
2337{
2338 schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
2339 return 0;
2340}
2341__initcall(start_dirtytime_writeback);
2342
2343int dirtytime_interval_handler(struct ctl_table *table, int write,
2344 void *buffer, size_t *lenp, loff_t *ppos)
2345{
2346 int ret;
2347
2348 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
2349 if (ret == 0 && write)
2350 mod_delayed_work(system_wq, &dirtytime_work, 0);
2351 return ret;
2352}

/**
 * __mark_inode_dirty -	internal function to mark an inode dirty
 *
 * @inode: inode to mark
 * @flags: what kind of dirty, e.g. I_DIRTY_SYNC.  This can be a combination of
 *	   multiple I_DIRTY_* flags, except that I_DIRTY_TIME can't be combined
 *	   with I_DIRTY_PAGES.
 *
 * Mark an inode as dirty.  We notify the filesystem, then update the inode's
 * dirty flags.  Then, if needed we add the inode to the appropriate dirty list.
 *
 * Most callers should use mark_inode_dirty() or mark_inode_dirty_sync()
 * instead of calling this directly.
 *
 * CAREFUL!  We only add the inode to the dirty list if it is hashed or if it
 * refers to a blockdev.  Unhashed inodes will never be added to the dirty list
 * even if they are later hashed, as they will have been marked dirty already.
 *
 * In short, ensure you hash any inodes _before_ you start marking them dirty.
 *
 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
 * the block-special inode (/dev/hda1) itself.  And the ->dirtied_when field of
 * the kernel-internal blockdev inode represents the dirtying time of the
 * blockdev's pages.  This is why for I_DIRTY_PAGES we always use
 * page->mapping->host, so the page-dirtying time is recorded in the internal
 * blockdev inode.
 */
2381void __mark_inode_dirty(struct inode *inode, int flags)
2382{
2383 struct super_block *sb = inode->i_sb;
2384 int dirtytime = 0;
2385
2386 trace_writeback_mark_inode_dirty(inode, flags);
2387
2388 if (flags & I_DIRTY_INODE) {
2389
2390
2391
2392
2393
2394
2395
2396 trace_writeback_dirty_inode_start(inode, flags);
2397 if (sb->s_op->dirty_inode)
2398 sb->s_op->dirty_inode(inode, flags & I_DIRTY_INODE);
2399 trace_writeback_dirty_inode(inode, flags);
2400
2401
2402 flags &= ~I_DIRTY_TIME;
2403 } else {
2404
2405
2406
2407
2408
2409 dirtytime = flags & I_DIRTY_TIME;
2410 WARN_ON_ONCE(dirtytime && flags != I_DIRTY_TIME);
2411 }
2412
2413
2414
2415
2416
2417 smp_mb();
2418
2419 if (((inode->i_state & flags) == flags) ||
2420 (dirtytime && (inode->i_state & I_DIRTY_INODE)))
2421 return;
2422
2423 spin_lock(&inode->i_lock);
2424 if (dirtytime && (inode->i_state & I_DIRTY_INODE))
2425 goto out_unlock_inode;
2426 if ((inode->i_state & flags) != flags) {
2427 const int was_dirty = inode->i_state & I_DIRTY;
2428
2429 inode_attach_wb(inode, NULL);
2430
2431
2432 if (flags & I_DIRTY_INODE)
2433 inode->i_state &= ~I_DIRTY_TIME;
2434 inode->i_state |= flags;
2435
2436
2437
2438
2439
2440
2441
2442 if (inode->i_state & I_SYNC_QUEUED)
2443 goto out_unlock_inode;
2444
2445
2446
2447
2448
2449 if (!S_ISBLK(inode->i_mode)) {
2450 if (inode_unhashed(inode))
2451 goto out_unlock_inode;
2452 }
2453 if (inode->i_state & I_FREEING)
2454 goto out_unlock_inode;
2455
2456
2457
2458
2459
2460 if (!was_dirty) {
2461 struct bdi_writeback *wb;
2462 struct list_head *dirty_list;
2463 bool wakeup_bdi = false;
2464
2465 wb = locked_inode_to_wb_and_lock_list(inode);
2466
2467 inode->dirtied_when = jiffies;
2468 if (dirtytime)
2469 inode->dirtied_time_when = jiffies;
2470
2471 if (inode->i_state & I_DIRTY)
2472 dirty_list = &wb->b_dirty;
2473 else
2474 dirty_list = &wb->b_dirty_time;
2475
2476 wakeup_bdi = inode_io_list_move_locked(inode, wb,
2477 dirty_list);
2478
2479 spin_unlock(&wb->list_lock);
2480 trace_writeback_dirty_inode_enqueue(inode);
2481
2482
2483
2484
2485
2486
2487
2488 if (wakeup_bdi &&
2489 (wb->bdi->capabilities & BDI_CAP_WRITEBACK))
2490 wb_wakeup_delayed(wb);
2491 return;
2492 }
2493 }
2494out_unlock_inode:
2495 spin_unlock(&inode->i_lock);
2496}
2497EXPORT_SYMBOL(__mark_inode_dirty);
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508static void wait_sb_inodes(struct super_block *sb)
2509{
2510 LIST_HEAD(sync_list);
2511
2512
2513
2514
2515
2516 WARN_ON(!rwsem_is_locked(&sb->s_umount));
2517
2518 mutex_lock(&sb->s_sync_lock);
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529 rcu_read_lock();
2530 spin_lock_irq(&sb->s_inode_wblist_lock);
2531 list_splice_init(&sb->s_inodes_wb, &sync_list);
2532
2533
2534
2535
2536
2537
2538
2539
2540 while (!list_empty(&sync_list)) {
2541 struct inode *inode = list_first_entry(&sync_list, struct inode,
2542 i_wb_list);
2543 struct address_space *mapping = inode->i_mapping;
2544
2545
2546
2547
2548
2549
2550
2551 list_move_tail(&inode->i_wb_list, &sb->s_inodes_wb);
2552
2553
2554
2555
2556
2557
2558 if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
2559 continue;
2560
2561 spin_unlock_irq(&sb->s_inode_wblist_lock);
2562
2563 spin_lock(&inode->i_lock);
2564 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
2565 spin_unlock(&inode->i_lock);
2566
2567 spin_lock_irq(&sb->s_inode_wblist_lock);
2568 continue;
2569 }
2570 __iget(inode);
2571 spin_unlock(&inode->i_lock);
2572 rcu_read_unlock();
2573
2574
2575
2576
2577
2578
2579 filemap_fdatawait_keep_errors(mapping);
2580
2581 cond_resched();
2582
2583 iput(inode);
2584
2585 rcu_read_lock();
2586 spin_lock_irq(&sb->s_inode_wblist_lock);
2587 }
2588 spin_unlock_irq(&sb->s_inode_wblist_lock);
2589 rcu_read_unlock();
2590 mutex_unlock(&sb->s_sync_lock);
2591}
2592
2593static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
2594 enum wb_reason reason, bool skip_if_busy)
2595{
2596 struct backing_dev_info *bdi = sb->s_bdi;
2597 DEFINE_WB_COMPLETION(done, bdi);
2598 struct wb_writeback_work work = {
2599 .sb = sb,
2600 .sync_mode = WB_SYNC_NONE,
2601 .tagged_writepages = 1,
2602 .done = &done,
2603 .nr_pages = nr,
2604 .reason = reason,
2605 };
2606
2607 if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
2608 return;
2609 WARN_ON(!rwsem_is_locked(&sb->s_umount));
2610
2611 bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy);
2612 wb_wait_for_completion(&done);
2613}

/**
 * writeback_inodes_sb_nr -	writeback dirty inodes from given super_block
 * @sb: the superblock
 * @nr: the number of pages to write
 * @reason: reason why some writeback work was initiated
 *
 * Start writeback on some inodes on this super_block.  No guarantees are made
 * on how many (if any) will be written, and this function does not wait
 * for IO completion of submitted IO.
 */
2625void writeback_inodes_sb_nr(struct super_block *sb,
2626 unsigned long nr,
2627 enum wb_reason reason)
2628{
2629 __writeback_inodes_sb_nr(sb, nr, reason, false);
2630}
2631EXPORT_SYMBOL(writeback_inodes_sb_nr);
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
2643{
2644 return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
2645}
2646EXPORT_SYMBOL(writeback_inodes_sb);
2647
2648
2649
2650
2651
2652
2653
2654
2655void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
2656{
2657 if (!down_read_trylock(&sb->s_umount))
2658 return;
2659
2660 __writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason, true);
2661 up_read(&sb->s_umount);
2662}
2663EXPORT_SYMBOL(try_to_writeback_inodes_sb);

/**
 * sync_inodes_sb	-	sync sb inode pages
 * @sb: the superblock
 *
 * This function writes and waits on any dirty inode belonging to this
 * super_block.
 */
2672void sync_inodes_sb(struct super_block *sb)
2673{
2674 struct backing_dev_info *bdi = sb->s_bdi;
2675 DEFINE_WB_COMPLETION(done, bdi);
2676 struct wb_writeback_work work = {
2677 .sb = sb,
2678 .sync_mode = WB_SYNC_ALL,
2679 .nr_pages = LONG_MAX,
2680 .range_cyclic = 0,
2681 .done = &done,
2682 .reason = WB_REASON_SYNC,
2683 .for_sync = 1,
2684 };
2685
2686
2687
2688
2689
2690
2691 if (bdi == &noop_backing_dev_info)
2692 return;
2693 WARN_ON(!rwsem_is_locked(&sb->s_umount));
2694
2695
2696 bdi_down_write_wb_switch_rwsem(bdi);
2697 bdi_split_work_to_wbs(bdi, &work, false);
2698 wb_wait_for_completion(&done);
2699 bdi_up_write_wb_switch_rwsem(bdi);
2700
2701 wait_sb_inodes(sb);
2702}
2703EXPORT_SYMBOL(sync_inodes_sb);

/**
 * write_inode_now	-	write an inode to disk
 * @inode: inode to write to disk
 * @sync: whether the write should be synchronous or not
 *
 * This function commits an inode to disk immediately if it is dirty. This is
 * primarily needed by knfsd.
 *
 * The caller must either have a ref on the inode or must have set I_WILL_FREE.
 */
2715int write_inode_now(struct inode *inode, int sync)
2716{
2717 struct writeback_control wbc = {
2718 .nr_to_write = LONG_MAX,
2719 .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
2720 .range_start = 0,
2721 .range_end = LLONG_MAX,
2722 };
2723
2724 if (!mapping_can_writeback(inode->i_mapping))
2725 wbc.nr_to_write = 0;
2726
2727 might_sleep();
2728 return writeback_single_inode(inode, &wbc);
2729}
2730EXPORT_SYMBOL(write_inode_now);
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743int sync_inode(struct inode *inode, struct writeback_control *wbc)
2744{
2745 return writeback_single_inode(inode, wbc);
2746}
2747EXPORT_SYMBOL(sync_inode);
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758int sync_inode_metadata(struct inode *inode, int wait)
2759{
2760 struct writeback_control wbc = {
2761 .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
2762 .nr_to_write = 0,
2763 };
2764
2765 return sync_inode(inode, &wbc);
2766}
2767EXPORT_SYMBOL(sync_inode_metadata);
2768