/*
 * fs/fs-writeback.c
 *
 * Writeback of dirty pages and inodes against their backing devices.
 * Contains the functions which write back and wait upon dirty inodes
 * against superblocks, and write back dirty pages against inodes, both
 * for explicit sync(2)-style flushing and for the periodic/background
 * writeback driven by the per-bdi flusher workqueue.
 */
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/kthread.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/tracepoint.h>
#include <linux/device.h>
#include <linux/memcontrol.h>
#include "internal.h"

/*
 * 4MB minimal write chunk size
 */
#define MIN_WRITEBACK_PAGES	(4096UL >> (PAGE_SHIFT - 10))

struct wb_completion {
	atomic_t		cnt;
};

/*
 * Passed into wb_writeback(), essentially a subset of writeback_control
 */
struct wb_writeback_work {
	long nr_pages;
	struct super_block *sb;
	unsigned long *older_than_this;
	enum writeback_sync_modes sync_mode;
	unsigned int tagged_writepages:1;
	unsigned int for_kupdate:1;
	unsigned int range_cyclic:1;
	unsigned int for_background:1;
	unsigned int for_sync:1;	/* sync(2) WB_SYNC_ALL writeback */
	unsigned int auto_free:1;	/* free on completion */
	enum wb_reason reason;		/* why was writeback initiated? */

	struct list_head list;		/* pending work list */
	struct wb_completion *done;	/* set if the caller waits */
};

/*
 * If one wants to wait for one or more wb_writeback_works, each work's
 * ->done should be set to a wb_completion defined using the following
 * macro.  Once all work items are issued with wb_queue_work(), the caller
 * can wait for the completion of all using wb_wait_for_completion().  Work
 * items which are waited upon aren't freed automatically on completion.
 */
#define DEFINE_WB_COMPLETION_ONSTACK(cmpl)				\
	struct wb_completion cmpl = {					\
		.cnt		= ATOMIC_INIT(1),			\
	}
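
/*
 * Illustrative usage sketch (hypothetical caller, not taken from this
 * file): issue a work item against a wb and wait for it to finish.
 * This mirrors the fallback_work pattern used by bdi_split_work_to_wbs().
 *
 *	DEFINE_WB_COMPLETION_ONSTACK(done);
 *	struct wb_writeback_work work = {
 *		.nr_pages	= LONG_MAX,
 *		.sync_mode	= WB_SYNC_NONE,
 *		.done		= &done,
 *	};
 *
 *	wb_queue_work(wb, &work);
 *	wb_wait_for_completion(wb->bdi, &done);
 *
 * Waited-upon work items aren't freed by the worker, so an on-stack work
 * with ->auto_free left at 0 is safe here.
 */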

/*
 * If an inode is constantly having its pages dirtied, but then the
 * updates stop dirtytime_expire_interval seconds in the past, it's
 * possible for the worst case time between when an inode has its
 * timestamps updated and when they finally get written out to be two
 * dirtytime_expire_intervals.  We set the default to 12 hours (in
 * seconds), which means most of the time inodes will have their
 * timestamps written to disk after 12 hours, but in the worst case a
 * few inodes might not have their timestamps updated for 24 hours.
 */
unsigned int dirtytime_expire_interval = 12 * 60 * 60;

static inline struct inode *wb_inode(struct list_head *head)
{
	return list_entry(head, struct inode, i_io_list);
}

/*
 * Include the creation of the trace points after defining the
 * wb_writeback_work structure and inline functions so that the definition
 * remains local to this file.
 */
#define CREATE_TRACE_POINTS
#include <trace/events/writeback.h>

EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);

static bool wb_io_lists_populated(struct bdi_writeback *wb)
{
	if (wb_has_dirty_io(wb)) {
		return false;
	} else {
		set_bit(WB_has_dirty_io, &wb->state);
		WARN_ON_ONCE(!wb->avg_write_bandwidth);
		atomic_long_add(wb->avg_write_bandwidth,
				&wb->bdi->tot_write_bandwidth);
		return true;
	}
}

static void wb_io_lists_depopulated(struct bdi_writeback *wb)
{
	if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) &&
	    list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) {
		clear_bit(WB_has_dirty_io, &wb->state);
		WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth,
					&wb->bdi->tot_write_bandwidth) < 0);
	}
}

/**
 * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list
 * @inode: inode to be moved
 * @wb: target bdi_writeback
 * @head: one of @wb->b_{dirty|io|more_io|dirty_time}
 *
 * Move @inode->i_io_list to @head of @wb and set %WB_has_dirty_io.
 * Returns %true if @inode is the first occupant of the !dirty_time IO
 * lists; otherwise, %false.
 */
static bool inode_io_list_move_locked(struct inode *inode,
				      struct bdi_writeback *wb,
				      struct list_head *head)
{
	assert_spin_locked(&wb->list_lock);

	list_move(&inode->i_io_list, head);

	/* dirty_time doesn't count as dirty_io until expiration */
	if (head != &wb->b_dirty_time)
		return wb_io_lists_populated(wb);

	wb_io_lists_depopulated(wb);
	return false;
}

/**
 * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list
 * @inode: inode to be removed
 * @wb: bdi_writeback @inode is being removed from
 *
 * Remove @inode which may be on one of @wb's IO lists and clear
 * %WB_has_dirty_io if all of them are empty afterwards.
 */
static void inode_io_list_del_locked(struct inode *inode,
				     struct bdi_writeback *wb)
{
	assert_spin_locked(&wb->list_lock);

	list_del_init(&inode->i_io_list);
	wb_io_lists_depopulated(wb);
}

static void wb_wakeup(struct bdi_writeback *wb)
{
	spin_lock_bh(&wb->work_lock);
	if (test_bit(WB_registered, &wb->state))
		mod_delayed_work(bdi_wq, &wb->dwork, 0);
	spin_unlock_bh(&wb->work_lock);
}

static void wb_queue_work(struct bdi_writeback *wb,
			  struct wb_writeback_work *work)
{
	trace_writeback_queue(wb, work);

	spin_lock_bh(&wb->work_lock);
	if (!test_bit(WB_registered, &wb->state))
		goto out_unlock;
	if (work->done)
		atomic_inc(&work->done->cnt);
	list_add_tail(&work->list, &wb->work_list);
	mod_delayed_work(bdi_wq, &wb->dwork, 0);
out_unlock:
	spin_unlock_bh(&wb->work_lock);
}

/**
 * wb_wait_for_completion - wait for completion of bdi_writeback_works
 * @bdi: bdi work items were issued to
 * @done: target wb_completion
 *
 * Wait for one or more work items issued to @bdi with their ->done field
 * set to @done, which should have been defined with
 * DEFINE_WB_COMPLETION_ONSTACK().  This function returns after all such
 * work items are completed.  Work items which are waited upon aren't freed
 * automatically on completion.
 */
static void wb_wait_for_completion(struct backing_dev_info *bdi,
				   struct wb_completion *done)
{
	atomic_dec(&done->cnt);		/* put down the initial count */
	wait_event(bdi->wb_waitq, !atomic_read(&done->cnt));
}

#ifdef CONFIG_CGROUP_WRITEBACK

/* parameters for foreign inode detection, see wbc_detach_inode() */
#define WB_FRN_TIME_SHIFT	13
#define WB_FRN_TIME_AVG_SHIFT	3	/* avg = avg * 7/8 + new * 1/8 */
#define WB_FRN_TIME_CUT_DIV	2	/* ignore rounds shorter than avg / 2 */
#define WB_FRN_TIME_PERIOD	(2 * (1 << WB_FRN_TIME_SHIFT))

#define WB_FRN_HIST_SLOTS	16	/* inode->i_wb_frn_history is 16bit */
#define WB_FRN_HIST_UNIT	(WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS)
					/* duration covered by each history slot */
#define WB_FRN_HIST_THR_SLOTS	(WB_FRN_HIST_SLOTS / 2)
					/* switch once more than half the slots are foreign */
#define WB_FRN_HIST_MAX_SLOTS	(WB_FRN_HIST_THR_SLOTS / 2 + 1)
					/* one round can affect at most 5 slots */

static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
static struct workqueue_struct *isw_wq;

/**
 * __inode_attach_wb - associate an inode with its wb
 * @inode: inode of interest
 * @page: page being dirtied (may be NULL)
 *
 * If @inode doesn't have its wb, associate it with the wb matching the
 * memcg of @page or, if @page is NULL, %current.  May be called w/ or w/o
 * @inode->i_lock held.
 */
void __inode_attach_wb(struct inode *inode, struct page *page)
{
	struct backing_dev_info *bdi = inode_to_bdi(inode);
	struct bdi_writeback *wb = NULL;

	if (inode_cgwb_enabled(inode)) {
		struct cgroup_subsys_state *memcg_css;

		if (page) {
			memcg_css = mem_cgroup_css_from_page(page);
			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
		} else {
			/* must pin memcg_css, see wb_get_create() */
			memcg_css = task_get_css(current, memory_cgrp_id);
			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
			css_put(memcg_css);
		}
	}

	if (!wb)
		wb = &bdi->wb;

	/*
	 * There may be multiple instances of this function racing to
	 * update the same inode.  Use cmpxchg() to tell the winner.
	 */
	if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
		wb_put(wb);
}
258
259
260
261
262
263
264
265
266
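/**
 * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
 * @inode: inode of interest with i_lock held
 *
 * Returns @inode's wb with its list_lock held.  @inode->i_lock must be
 * held on entry and is released on return.  The returned wb is guaranteed
 * to stay associated with @inode until its list_lock is released.
 */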
267static struct bdi_writeback *
268locked_inode_to_wb_and_lock_list(struct inode *inode)
269 __releases(&inode->i_lock)
270 __acquires(&wb->list_lock)
271{
272 while (true) {
273 struct bdi_writeback *wb = inode_to_wb(inode);
274
275
276
277
278
279
280
281 wb_get(wb);
282 spin_unlock(&inode->i_lock);
283 spin_lock(&wb->list_lock);
284
285
286 if (likely(wb == inode->i_wb)) {
287 wb_put(wb);
288 return wb;
289 }
290
291 spin_unlock(&wb->list_lock);
292 wb_put(wb);
293 cpu_relax();
294 spin_lock(&inode->i_lock);
295 }
296}
297
298
299
300
301
302
303
304
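/*
 * Same as locked_inode_to_wb_and_lock_list() but @inode->i_lock isn't held
 * on entry.
 */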
305static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
306 __acquires(&wb->list_lock)
307{
308 spin_lock(&inode->i_lock);
309 return locked_inode_to_wb_and_lock_list(inode);
310}
311
312struct inode_switch_wbs_context {
313 struct inode *inode;
314 struct bdi_writeback *new_wb;
315
316 struct rcu_head rcu_head;
317 struct work_struct work;
318};
319
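/*
 * Perform the deferred wb switch for an inode.  Both wbs' list_locks are
 * taken in pointer order (to keep the lock ordering consistent), together
 * with the inode's i_lock and the mapping's tree_lock.  The per-wb
 * WB_RECLAIMABLE and WB_WRITEBACK counters are transferred for the inode's
 * dirty and under-writeback pages, the inode is spliced into the matching
 * position on the new wb's b_dirty list, and I_WB_SWITCH is cleared.
 */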
320static void inode_switch_wbs_work_fn(struct work_struct *work)
321{
322 struct inode_switch_wbs_context *isw =
323 container_of(work, struct inode_switch_wbs_context, work);
324 struct inode *inode = isw->inode;
325 struct address_space *mapping = inode->i_mapping;
326 struct bdi_writeback *old_wb = inode->i_wb;
327 struct bdi_writeback *new_wb = isw->new_wb;
328 struct radix_tree_iter iter;
329 bool switched = false;
330 void **slot;
331
332
333
334
335
336
337
338
339
340
341
342 if (old_wb < new_wb) {
343 spin_lock(&old_wb->list_lock);
344 spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
345 } else {
346 spin_lock(&new_wb->list_lock);
347 spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
348 }
349 spin_lock(&inode->i_lock);
350 spin_lock_irq(&mapping->tree_lock);
351
352
353
354
355
356 if (unlikely(inode->i_state & I_FREEING))
357 goto skip_switch;
358
359
360
361
362
363
364 radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
365 PAGECACHE_TAG_DIRTY) {
366 struct page *page = radix_tree_deref_slot_protected(slot,
367 &mapping->tree_lock);
368 if (likely(page) && PageDirty(page)) {
369 __dec_wb_stat(old_wb, WB_RECLAIMABLE);
370 __inc_wb_stat(new_wb, WB_RECLAIMABLE);
371 }
372 }
373
374 radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
375 PAGECACHE_TAG_WRITEBACK) {
376 struct page *page = radix_tree_deref_slot_protected(slot,
377 &mapping->tree_lock);
378 if (likely(page)) {
379 WARN_ON_ONCE(!PageWriteback(page));
380 __dec_wb_stat(old_wb, WB_WRITEBACK);
381 __inc_wb_stat(new_wb, WB_WRITEBACK);
382 }
383 }
384
385 wb_get(new_wb);
386
387
388
389
390
391
392
393 if (!list_empty(&inode->i_io_list)) {
394 struct inode *pos;
395
396 inode_io_list_del_locked(inode, old_wb);
397 inode->i_wb = new_wb;
398 list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
399 if (time_after_eq(inode->dirtied_when,
400 pos->dirtied_when))
401 break;
402 inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev);
403 } else {
404 inode->i_wb = new_wb;
405 }
406
407
408 inode->i_wb_frn_winner = 0;
409 inode->i_wb_frn_avg_time = 0;
410 inode->i_wb_frn_history = 0;
411 switched = true;
412skip_switch:
413
414
415
416
417 smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);
418
419 spin_unlock_irq(&mapping->tree_lock);
420 spin_unlock(&inode->i_lock);
421 spin_unlock(&new_wb->list_lock);
422 spin_unlock(&old_wb->list_lock);
423
424 if (switched) {
425 wb_wakeup(new_wb);
426 wb_put(old_wb);
427 }
428 wb_put(new_wb);
429
430 iput(inode);
431 kfree(isw);
432
433 atomic_dec(&isw_nr_in_flight);
434}
435
436static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
437{
438 struct inode_switch_wbs_context *isw = container_of(rcu_head,
439 struct inode_switch_wbs_context, rcu_head);
440
441
442 INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
443 queue_work(isw_wq, &isw->work);
444}
445
446
447
448
449
450
451
452
453
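/**
 * inode_switch_wbs - change the wb association of an inode
 * @inode: target inode
 * @new_wb_id: ID of the new wb
 *
 * Switch @inode's wb association to the wb identified by @new_wb_id.  The
 * switching is performed asynchronously and may fail silently.
 */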
454static void inode_switch_wbs(struct inode *inode, int new_wb_id)
455{
456 struct backing_dev_info *bdi = inode_to_bdi(inode);
457 struct cgroup_subsys_state *memcg_css;
458 struct inode_switch_wbs_context *isw;
459
460
461 if (inode->i_state & I_WB_SWITCH)
462 return;
463
464 isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
465 if (!isw)
466 return;
467
468
469 rcu_read_lock();
470 memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys);
471 if (memcg_css)
472 isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
473 rcu_read_unlock();
474 if (!isw->new_wb)
475 goto out_free;
476
477
478 spin_lock(&inode->i_lock);
479 if (!(inode->i_sb->s_flags & MS_ACTIVE) ||
480 inode->i_state & (I_WB_SWITCH | I_FREEING) ||
481 inode_to_wb(inode) == isw->new_wb) {
482 spin_unlock(&inode->i_lock);
483 goto out_free;
484 }
485 inode->i_state |= I_WB_SWITCH;
486 __iget(inode);
487 spin_unlock(&inode->i_lock);
488
489 isw->inode = inode;
490
491 atomic_inc(&isw_nr_in_flight);
492
493
494
495
496
497
498
499 call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
500 return;
501
502out_free:
503 if (isw->new_wb)
504 wb_put(isw->new_wb);
505 kfree(isw);
506}
507
508
509
510
511
512
513
514
515
516
517
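/**
 * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
 * @wbc: writeback_control of interest
 * @inode: target inode
 *
 * @inode is locked and about to be written back under the control of @wbc.
 * Record @inode's writeback context into @wbc and unlock the i_lock.  On
 * writeback completion, wbc_detach_inode() should be called.  This is used
 * to track the cgroup writeback context of the writeback.
 */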
518void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
519 struct inode *inode)
520{
521 if (!inode_cgwb_enabled(inode)) {
522 spin_unlock(&inode->i_lock);
523 return;
524 }
525
526 wbc->wb = inode_to_wb(inode);
527 wbc->inode = inode;
528
529 wbc->wb_id = wbc->wb->memcg_css->id;
530 wbc->wb_lcand_id = inode->i_wb_frn_winner;
531 wbc->wb_tcand_id = 0;
532 wbc->wb_bytes = 0;
533 wbc->wb_lcand_bytes = 0;
534 wbc->wb_tcand_bytes = 0;
535
536 wb_get(wbc->wb);
537 spin_unlock(&inode->i_lock);
538
539
540
541
542
543 if (unlikely(wb_dying(wbc->wb)))
544 inode_switch_wbs(inode, wbc->wb_id);
545}
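
/*
 * Illustrative calling sequence (hypothetical writeback path, not taken
 * from this file): the cgroup writeback bookkeeping brackets the page
 * writeback of a single inode, the way writeback_single_inode() does it.
 *
 *	spin_lock(&inode->i_lock);
 *	inode->i_state |= I_SYNC;
 *	wbc_attach_and_unlock_inode(wbc, inode);	// drops i_lock
 *
 *	// each written page is accounted with wbc_account_io()
 *	do_writepages(inode->i_mapping, wbc);
 *
 *	wbc_detach_inode(wbc);				// may switch wbs
 */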
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
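/**
 * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
 * @wbc: writeback_control of the just finished writeback
 *
 * To be called after a writeback attempt of an inode finishes and undoes
 * wbc_attach_and_unlock_inode().  Can be called under any context.
 *
 * This is where foreign inode detection happens.  The bytes written on
 * behalf of each memcg during this round (recorded by wbc_account_io())
 * are converted into an approximate amount of IO time, and a 16-slot
 * history in inode->i_wb_frn_history records, slot by slot, whether a wb
 * other than the current owner produced most of the IO.  Once foreign wbs
 * have won more than half of the recorded slots, the inode is switched to
 * the latest round's winner with inode_switch_wbs().
 */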
584void wbc_detach_inode(struct writeback_control *wbc)
585{
586 struct bdi_writeback *wb = wbc->wb;
587 struct inode *inode = wbc->inode;
588 unsigned long avg_time, max_bytes, max_time;
589 u16 history;
590 int max_id;
591
592 if (!wb)
593 return;
594
595 history = inode->i_wb_frn_history;
596 avg_time = inode->i_wb_frn_avg_time;
597
598
599 if (wbc->wb_bytes >= wbc->wb_lcand_bytes &&
600 wbc->wb_bytes >= wbc->wb_tcand_bytes) {
601 max_id = wbc->wb_id;
602 max_bytes = wbc->wb_bytes;
603 } else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) {
604 max_id = wbc->wb_lcand_id;
605 max_bytes = wbc->wb_lcand_bytes;
606 } else {
607 max_id = wbc->wb_tcand_id;
608 max_bytes = wbc->wb_tcand_bytes;
609 }
610
611
612
613
614
615
616
617
618 max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT,
619 wb->avg_write_bandwidth);
620 if (avg_time)
621 avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) -
622 (avg_time >> WB_FRN_TIME_AVG_SHIFT);
623 else
624 avg_time = max_time;
625
626 if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) {
627 int slots;
628
629
630
631
632
633
634
635
636
637 slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT),
638 (unsigned long)WB_FRN_HIST_MAX_SLOTS);
639 history <<= slots;
640 if (wbc->wb_id != max_id)
641 history |= (1U << slots) - 1;
642
643
644
645
646
647
648
649
650 if (hweight32(history) > WB_FRN_HIST_THR_SLOTS)
651 inode_switch_wbs(inode, max_id);
652 }
653
654
655
656
657
658 inode->i_wb_frn_winner = max_id;
659 inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX);
660 inode->i_wb_frn_history = history;
661
662 wb_put(wbc->wb);
663 wbc->wb = NULL;
664}
665
666
667
668
669
670
671
672
673
674
675
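/**
 * wbc_account_io - account IO issued during writeback
 * @wbc: writeback_control of the writeback in progress
 * @page: page being written out
 * @bytes: number of bytes being written out
 *
 * @bytes from @page are about to be written out during the writeback
 * controlled by @wbc.  Keep the books for foreign inode detection (see
 * wbc_detach_inode()): bytes are credited to the current owner, to the
 * last round's winner, or to a Boyer-Moore majority-vote style running
 * candidate for "everybody else".
 */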
676void wbc_account_io(struct writeback_control *wbc, struct page *page,
677 size_t bytes)
678{
679 int id;
680
681
682
683
684
685
686
687 if (!wbc->wb)
688 return;
689
690 id = mem_cgroup_css_from_page(page)->id;
691
692 if (id == wbc->wb_id) {
693 wbc->wb_bytes += bytes;
694 return;
695 }
696
697 if (id == wbc->wb_lcand_id)
698 wbc->wb_lcand_bytes += bytes;
699
700
701 if (!wbc->wb_tcand_bytes)
702 wbc->wb_tcand_id = id;
703 if (id == wbc->wb_tcand_id)
704 wbc->wb_tcand_bytes += bytes;
705 else
706 wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
707}
708EXPORT_SYMBOL_GPL(wbc_account_io);
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
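/**
 * inode_congested - test whether an inode is congested
 * @inode: inode to test for congestion (may be NULL)
 * @cong_bits: mask of WB_[a]sync_congested bits to test
 *
 * Tests whether @inode is congested.  @cong_bits is the mask of congestion
 * bits to test and the return value is the mask of set bits.
 *
 * If cgroup writeback is enabled for @inode, its congestion state is
 * determined by its associated per-cgroup bdi_writeback; otherwise, the
 * bdi's root wb is used.
 */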
726int inode_congested(struct inode *inode, int cong_bits)
727{
728
729
730
731
732 if (inode && inode_to_wb_is_valid(inode)) {
733 struct bdi_writeback *wb;
734 bool locked, congested;
735
736 wb = unlocked_inode_to_wb_begin(inode, &locked);
737 congested = wb_congested(wb, cong_bits);
738 unlocked_inode_to_wb_end(inode, locked);
739 return congested;
740 }
741
742 return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
743}
744EXPORT_SYMBOL_GPL(inode_congested);
745
746
747
748
749
750
751
752
753
754
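/**
 * wb_split_bdi_pages - split nr_pages to write according to bandwidth
 * @wb: target bdi_writeback to split @nr_pages to
 * @nr_pages: number of pages to write for the whole bdi
 *
 * Split @wb's portion of @nr_pages according to @wb's write bandwidth in
 * relation to the total write bandwidth of all wb's w/ dirty inodes on
 * @wb->bdi.
 */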
755static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
756{
757 unsigned long this_bw = wb->avg_write_bandwidth;
758 unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
759
760 if (nr_pages == LONG_MAX)
761 return LONG_MAX;
762
763
764
765
766
767
768 if (!tot_bw || this_bw >= tot_bw)
769 return nr_pages;
770 else
771 return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw);
772}
773
774
775
776
777
778
779
780
781
782
783
784
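/**
 * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
 * @bdi: target backing_dev_info
 * @base_work: wb_writeback_work to issue
 * @skip_if_busy: skip wb's which already have writeback in progress
 *
 * Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which
 * have dirty inodes.  If @base_work->nr_pages isn't %LONG_MAX, it's
 * distributed to the wbs according to each wb's proportion in the total
 * active write bandwidth of @bdi.
 */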
785static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
786 struct wb_writeback_work *base_work,
787 bool skip_if_busy)
788{
789 struct bdi_writeback *last_wb = NULL;
790 struct bdi_writeback *wb = list_entry(&bdi->wb_list,
791 struct bdi_writeback, bdi_node);
792
793 might_sleep();
794restart:
795 rcu_read_lock();
796 list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
797 DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
798 struct wb_writeback_work fallback_work;
799 struct wb_writeback_work *work;
800 long nr_pages;
801
802 if (last_wb) {
803 wb_put(last_wb);
804 last_wb = NULL;
805 }
806
807
808 if (!wb_has_dirty_io(wb) &&
809 (base_work->sync_mode == WB_SYNC_NONE ||
810 list_empty(&wb->b_dirty_time)))
811 continue;
812 if (skip_if_busy && writeback_in_progress(wb))
813 continue;
814
815 nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);
816
817 work = kmalloc(sizeof(*work), GFP_ATOMIC);
818 if (work) {
819 *work = *base_work;
820 work->nr_pages = nr_pages;
821 work->auto_free = 1;
822 wb_queue_work(wb, work);
823 continue;
824 }
825
826
827 work = &fallback_work;
828 *work = *base_work;
829 work->nr_pages = nr_pages;
830 work->auto_free = 0;
831 work->done = &fallback_work_done;
832
833 wb_queue_work(wb, work);
834
835
836
837
838
839
840 wb_get(wb);
841 last_wb = wb;
842
843 rcu_read_unlock();
844 wb_wait_for_completion(bdi, &fallback_work_done);
845 goto restart;
846 }
847 rcu_read_unlock();
848
849 if (last_wb)
850 wb_put(last_wb);
851}
852
853
854
855
856
857
858
859
860
861
862
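/**
 * cgroup_writeback_umount - flush inode wb switches for umount
 *
 * This function is called when a super_block is about to be destroyed and
 * flushes in-flight inode wb switches.  An inode wb switch goes through
 * RCU and then workqueue, so the two need to be flushed in order to ensure
 * that all previously scheduled switches are finished.  As wb switches are
 * rare occurrences and synchronize_rcu() can take a while, perform
 * flushing iff wb switches are in flight.
 */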
863void cgroup_writeback_umount(void)
864{
865 if (atomic_read(&isw_nr_in_flight)) {
866 synchronize_rcu();
867 flush_workqueue(isw_wq);
868 }
869}
870
871static int __init cgroup_writeback_init(void)
872{
873 isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
874 if (!isw_wq)
875 return -ENOMEM;
876 return 0;
877}
878fs_initcall(cgroup_writeback_init);
879
880#else
881
882static struct bdi_writeback *
883locked_inode_to_wb_and_lock_list(struct inode *inode)
884 __releases(&inode->i_lock)
885 __acquires(&wb->list_lock)
886{
887 struct bdi_writeback *wb = inode_to_wb(inode);
888
889 spin_unlock(&inode->i_lock);
890 spin_lock(&wb->list_lock);
891 return wb;
892}
893
894static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
895 __acquires(&wb->list_lock)
896{
897 struct bdi_writeback *wb = inode_to_wb(inode);
898
899 spin_lock(&wb->list_lock);
900 return wb;
901}
902
903static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
904{
905 return nr_pages;
906}
907
908static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
909 struct wb_writeback_work *base_work,
910 bool skip_if_busy)
911{
912 might_sleep();
913
914 if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) {
915 base_work->auto_free = 0;
916 wb_queue_work(&bdi->wb, base_work);
917 }
918}
919
920#endif
921
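/*
 * Queue an asynchronous WB_SYNC_NONE writeback work item of @nr_pages
 * against @wb.  This is fire-and-forget: if the work item cannot be
 * allocated, the writeback worker is simply woken up to clean old dirty
 * data on its own.
 */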
922void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
923 bool range_cyclic, enum wb_reason reason)
924{
925 struct wb_writeback_work *work;
926
927 if (!wb_has_dirty_io(wb))
928 return;
929
930
931
932
933
934 work = kzalloc(sizeof(*work),
935 GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
936 if (!work) {
937 trace_writeback_nowork(wb);
938 wb_wakeup(wb);
939 return;
940 }
941
942 work->sync_mode = WB_SYNC_NONE;
943 work->nr_pages = nr_pages;
944 work->range_cyclic = range_cyclic;
945 work->reason = reason;
946 work->auto_free = 1;
947
948 wb_queue_work(wb, work);
949}
950
951
952
953
954
955
956
957
958
959
960
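/**
 * wb_start_background_writeback - start background writeback
 * @wb: bdi_writeback to write from
 *
 * Description:
 *   This makes sure WB_SYNC_NONE background writeback happens. When
 *   this function returns, it is only guaranteed that for given wb
 *   some IO is happening if we are over background dirty threshold.
 *   Caller need not hold sb s_umount semaphore.
 */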
961void wb_start_background_writeback(struct bdi_writeback *wb)
962{
963
964
965
966
967 trace_writeback_wake_background(wb);
968 wb_wakeup(wb);
969}
970
971
972
973
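/*
 * Remove the inode from the writeback list it is on.
 */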
974void inode_io_list_del(struct inode *inode)
975{
976 struct bdi_writeback *wb;
977
978 wb = inode_to_wb_and_lock_list(inode);
979 inode_io_list_del_locked(inode, wb);
980 spin_unlock(&wb->list_lock);
981}
982
983
984
985
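/*
 * Mark an inode as under writeback on the sb's writeback list.
 */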
986void sb_mark_inode_writeback(struct inode *inode)
987{
988 struct super_block *sb = inode->i_sb;
989 unsigned long flags;
990
991 if (list_empty(&inode->i_wb_list)) {
992 spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
993 if (list_empty(&inode->i_wb_list)) {
994 list_add_tail(&inode->i_wb_list, &sb->s_inodes_wb);
995 trace_sb_mark_inode_writeback(inode);
996 }
997 spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
998 }
999}
1000
1001
1002
1003
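/*
 * Clear an inode as under writeback on the sb's writeback list.
 */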
1004void sb_clear_inode_writeback(struct inode *inode)
1005{
1006 struct super_block *sb = inode->i_sb;
1007 unsigned long flags;
1008
1009 if (!list_empty(&inode->i_wb_list)) {
1010 spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
1011 if (!list_empty(&inode->i_wb_list)) {
1012 list_del_init(&inode->i_wb_list);
1013 trace_sb_clear_inode_writeback(inode);
1014 }
1015 spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
1016 }
1017}
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
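/*
 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
 * furthest end of its superblock's dirty-inode list.
 *
 * Before stamping the inode's ->dirtied_when, we check to see whether it is
 * already the most-recently-dirtied inode on the b_dirty list.  If that is
 * the case then the inode must have been redirtied while it was being written
 * out and we don't reset its dirtied_when.
 */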
1028static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
1029{
1030 if (!list_empty(&wb->b_dirty)) {
1031 struct inode *tail;
1032
1033 tail = wb_inode(wb->b_dirty.next);
1034 if (time_before(inode->dirtied_when, tail->dirtied_when))
1035 inode->dirtied_when = jiffies;
1036 }
1037 inode_io_list_move_locked(inode, wb, &wb->b_dirty);
1038}
1039
1040
1041
1042
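/*
 * Requeue inode for re-scanning after wb->b_io list is exhausted.
 */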
1043static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
1044{
1045 inode_io_list_move_locked(inode, wb, &wb->b_more_io);
1046}
1047
1048static void inode_sync_complete(struct inode *inode)
1049{
1050 inode->i_state &= ~I_SYNC;
1051
1052 inode_add_lru(inode);
1053
1054 smp_mb();
1055 wake_up_bit(&inode->i_state, __I_SYNC);
1056}
1057
1058static bool inode_dirtied_after(struct inode *inode, unsigned long t)
1059{
1060 bool ret = time_after(inode->dirtied_when, t);
1061#ifndef CONFIG_64BIT
1062
1063
1064
1065
1066
1067
1068 ret = ret && time_before_eq(inode->dirtied_when, jiffies);
1069#endif
1070 return ret;
1071}
1072
1073#define EXPIRE_DIRTY_ATIME 0x0001
1074
1075
1076
1077
1078
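/*
 * Move expired (dirtied before *older_than_this) dirty inodes from
 * @delaying_queue to @dispatch_queue.  Inodes of the same superblock are
 * kept together on @dispatch_queue.
 */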
1079static int move_expired_inodes(struct list_head *delaying_queue,
1080 struct list_head *dispatch_queue,
1081 int flags,
1082 struct wb_writeback_work *work)
1083{
1084 unsigned long *older_than_this = NULL;
1085 unsigned long expire_time;
1086 LIST_HEAD(tmp);
1087 struct list_head *pos, *node;
1088 struct super_block *sb = NULL;
1089 struct inode *inode;
1090 int do_sb_sort = 0;
1091 int moved = 0;
1092
1093 if ((flags & EXPIRE_DIRTY_ATIME) == 0)
1094 older_than_this = work->older_than_this;
1095 else if (!work->for_sync) {
1096 expire_time = jiffies - (dirtytime_expire_interval * HZ);
1097 older_than_this = &expire_time;
1098 }
1099 while (!list_empty(delaying_queue)) {
1100 inode = wb_inode(delaying_queue->prev);
1101 if (older_than_this &&
1102 inode_dirtied_after(inode, *older_than_this))
1103 break;
1104 list_move(&inode->i_io_list, &tmp);
1105 moved++;
1106 if (flags & EXPIRE_DIRTY_ATIME)
1107 set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state);
1108 if (sb_is_blkdev_sb(inode->i_sb))
1109 continue;
1110 if (sb && sb != inode->i_sb)
1111 do_sb_sort = 1;
1112 sb = inode->i_sb;
1113 }
1114
1115
1116 if (!do_sb_sort) {
1117 list_splice(&tmp, dispatch_queue);
1118 goto out;
1119 }
1120
1121
1122 while (!list_empty(&tmp)) {
1123 sb = wb_inode(tmp.prev)->i_sb;
1124 list_for_each_prev_safe(pos, node, &tmp) {
1125 inode = wb_inode(pos);
1126 if (inode->i_sb == sb)
1127 list_move(&inode->i_io_list, dispatch_queue);
1128 }
1129 }
1130out:
1131 return moved;
1132}
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
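/*
 * Queue all expired dirty inodes for io, eldest first: b_more_io inodes
 * are requeued onto b_io together with the inodes from b_dirty and
 * b_dirty_time whose expiry time has passed.
 */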
1145static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
1146{
1147 int moved;
1148
1149 assert_spin_locked(&wb->list_lock);
1150 list_splice_init(&wb->b_more_io, &wb->b_io);
1151 moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work);
1152 moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
1153 EXPIRE_DIRTY_ATIME, work);
1154 if (moved)
1155 wb_io_lists_populated(wb);
1156 trace_writeback_queue_io(wb, work, moved);
1157}
1158
1159static int write_inode(struct inode *inode, struct writeback_control *wbc)
1160{
1161 int ret;
1162
1163 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) {
1164 trace_writeback_write_inode_start(inode, wbc);
1165 ret = inode->i_sb->s_op->write_inode(inode, wbc);
1166 trace_writeback_write_inode(inode, wbc);
1167 return ret;
1168 }
1169 return 0;
1170}
1171
1172
1173
1174
1175
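/*
 * Wait for writeback on an inode to complete. Called with i_lock held.
 * Caller must make sure inode cannot go away when we drop i_lock.
 */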
1176static void __inode_wait_for_writeback(struct inode *inode)
1177 __releases(inode->i_lock)
1178 __acquires(inode->i_lock)
1179{
1180 DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
1181 wait_queue_head_t *wqh;
1182
1183 wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
1184 while (inode->i_state & I_SYNC) {
1185 spin_unlock(&inode->i_lock);
1186 __wait_on_bit(wqh, &wq, bit_wait,
1187 TASK_UNINTERRUPTIBLE);
1188 spin_lock(&inode->i_lock);
1189 }
1190}
1191
1192
1193
1194
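/*
 * Wait for writeback on an inode to complete. Caller must have inode pinned.
 */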
1195void inode_wait_for_writeback(struct inode *inode)
1196{
1197 spin_lock(&inode->i_lock);
1198 __inode_wait_for_writeback(inode);
1199 spin_unlock(&inode->i_lock);
1200}
1201
1202
1203
1204
1205
1206
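/*
 * Sleep until I_SYNC is cleared. This function must be called with i_lock
 * held and drops it. It is aimed for callers not holding any inode reference
 * so once i_lock is dropped, inode can go away.
 */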
1207static void inode_sleep_on_writeback(struct inode *inode)
1208 __releases(inode->i_lock)
1209{
1210 DEFINE_WAIT(wait);
1211 wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
1212 int sleep;
1213
1214 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
1215 sleep = inode->i_state & I_SYNC;
1216 spin_unlock(&inode->i_lock);
1217 if (sleep)
1218 schedule();
1219 finish_wait(wqh, &wait);
1220}
1221
1222
1223
1224
1225
1226
1227
1228
1229
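/*
 * Find proper writeback list for the inode depending on its current state and
 * possibly also change of its state while we were doing writeback.  Here we
 * handle things such as livelock prevention or fairness between flushers.
 */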
1230static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
1231 struct writeback_control *wbc)
1232{
1233 if (inode->i_state & I_FREEING)
1234 return;
1235
1236
1237
1238
1239
1240
1241 if ((inode->i_state & I_DIRTY) &&
1242 (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
1243 inode->dirtied_when = jiffies;
1244
1245 if (wbc->pages_skipped) {
1246
1247
1248
1249
1250 redirty_tail(inode, wb);
1251 return;
1252 }
1253
1254 if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
1255
1256
1257
1258
1259 if (wbc->nr_to_write <= 0) {
1260
1261 requeue_io(inode, wb);
1262 } else {
1263
1264
1265
1266
1267
1268
1269
1270 redirty_tail(inode, wb);
1271 }
1272 } else if (inode->i_state & I_DIRTY) {
1273
1274
1275
1276
1277
1278 redirty_tail(inode, wb);
1279 } else if (inode->i_state & I_DIRTY_TIME) {
1280 inode->dirtied_when = jiffies;
1281 inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
1282 } else {
1283
1284 inode_io_list_del_locked(inode, wb);
1285 }
1286}
1287
1288
1289
1290
1291
1292
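/*
 * Write out an inode and its dirty pages. Do not update the writeback list
 * linkage. That is left to the caller. The caller is also responsible for
 * setting I_SYNC flag and calling inode_sync_complete() to clear it.
 */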
1293static int
1294__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
1295{
1296 struct address_space *mapping = inode->i_mapping;
1297 long nr_to_write = wbc->nr_to_write;
1298 unsigned dirty;
1299 int ret;
1300
1301 WARN_ON(!(inode->i_state & I_SYNC));
1302
1303 trace_writeback_single_inode_start(inode, wbc, nr_to_write);
1304
1305 ret = do_writepages(mapping, wbc);
1306
1307
1308
1309
1310
1311
1312
1313
1314 if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
1315 int err = filemap_fdatawait(mapping);
1316 if (ret == 0)
1317 ret = err;
1318 }
1319
1320
1321
1322
1323
1324
1325 spin_lock(&inode->i_lock);
1326
1327 dirty = inode->i_state & I_DIRTY;
1328 if (inode->i_state & I_DIRTY_TIME) {
1329 if ((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) ||
1330 wbc->sync_mode == WB_SYNC_ALL ||
1331 unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) ||
1332 unlikely(time_after(jiffies,
1333 (inode->dirtied_time_when +
1334 dirtytime_expire_interval * HZ)))) {
1335 dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED;
1336 trace_writeback_lazytime(inode);
1337 }
1338 } else
1339 inode->i_state &= ~I_DIRTY_TIME_EXPIRED;
1340 inode->i_state &= ~dirty;
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353 smp_mb();
1354
1355 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
1356 inode->i_state |= I_DIRTY_PAGES;
1357
1358 spin_unlock(&inode->i_lock);
1359
1360 if (dirty & I_DIRTY_TIME)
1361 mark_inode_dirty_sync(inode);
1362
1363 if (dirty & ~I_DIRTY_PAGES) {
1364 int err = write_inode(inode, wbc);
1365 if (ret == 0)
1366 ret = err;
1367 }
1368 trace_writeback_single_inode(inode, wbc, nr_to_write);
1369 return ret;
1370}
1371
1372
1373
1374
1375
1376
1377
1378
1379
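/*
 * Write out an inode's dirty pages. Either the caller has an active reference
 * on the inode or the inode has I_WILL_FREE set.
 *
 * This is mainly meant for callers that write back a single inode directly
 * (e.g. from a filesystem); the flusher thread uses __writeback_single_inode()
 * and does more elaborate list handling in writeback_sb_inodes().
 */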
1380static int writeback_single_inode(struct inode *inode,
1381 struct writeback_control *wbc)
1382{
1383 struct bdi_writeback *wb;
1384 int ret = 0;
1385
1386 spin_lock(&inode->i_lock);
1387 if (!atomic_read(&inode->i_count))
1388 WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
1389 else
1390 WARN_ON(inode->i_state & I_WILL_FREE);
1391
1392 if (inode->i_state & I_SYNC) {
1393 if (wbc->sync_mode != WB_SYNC_ALL)
1394 goto out;
1395
1396
1397
1398
1399
1400 __inode_wait_for_writeback(inode);
1401 }
1402 WARN_ON(inode->i_state & I_SYNC);
1403
1404
1405
1406
1407
1408
1409
1410
1411 if (!(inode->i_state & I_DIRTY_ALL) &&
1412 (wbc->sync_mode != WB_SYNC_ALL ||
1413 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
1414 goto out;
1415 inode->i_state |= I_SYNC;
1416 wbc_attach_and_unlock_inode(wbc, inode);
1417
1418 ret = __writeback_single_inode(inode, wbc);
1419
1420 wbc_detach_inode(wbc);
1421
1422 wb = inode_to_wb_and_lock_list(inode);
1423 spin_lock(&inode->i_lock);
1424
1425
1426
1427
1428 if (!(inode->i_state & I_DIRTY_ALL))
1429 inode_io_list_del_locked(inode, wb);
1430 spin_unlock(&wb->list_lock);
1431 inode_sync_complete(inode);
1432out:
1433 spin_unlock(&inode->i_lock);
1434 return ret;
1435}
1436
1437static long writeback_chunk_size(struct bdi_writeback *wb,
1438 struct wb_writeback_work *work)
1439{
1440 long pages;
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455 if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
1456 pages = LONG_MAX;
1457 else {
1458 pages = min(wb->avg_write_bandwidth / 2,
1459 global_wb_domain.dirty_limit / DIRTY_SCOPE);
1460 pages = min(pages, work->nr_pages);
1461 pages = round_down(pages + MIN_WRITEBACK_PAGES,
1462 MIN_WRITEBACK_PAGES);
1463 }
1464
1465 return pages;
1466}
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
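/*
 * Write a portion of b_io inodes which belong to @sb.
 *
 * Return the number of pages and/or inodes written.
 *
 * NOTE! This is called with wb->list_lock held, and will
 * unlock and relock that for each inode it ends up doing
 * IO for.
 */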
1477static long writeback_sb_inodes(struct super_block *sb,
1478 struct bdi_writeback *wb,
1479 struct wb_writeback_work *work)
1480{
1481 struct writeback_control wbc = {
1482 .sync_mode = work->sync_mode,
1483 .tagged_writepages = work->tagged_writepages,
1484 .for_kupdate = work->for_kupdate,
1485 .for_background = work->for_background,
1486 .for_sync = work->for_sync,
1487 .range_cyclic = work->range_cyclic,
1488 .range_start = 0,
1489 .range_end = LLONG_MAX,
1490 };
1491 unsigned long start_time = jiffies;
1492 long write_chunk;
1493 long wrote = 0;
1494
1495 while (!list_empty(&wb->b_io)) {
1496 struct inode *inode = wb_inode(wb->b_io.prev);
1497 struct bdi_writeback *tmp_wb;
1498
1499 if (inode->i_sb != sb) {
1500 if (work->sb) {
1501
1502
1503
1504
1505
1506 redirty_tail(inode, wb);
1507 continue;
1508 }
1509
1510
1511
1512
1513
1514
1515 break;
1516 }
1517
1518
1519
1520
1521
1522
1523 spin_lock(&inode->i_lock);
1524 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
1525 spin_unlock(&inode->i_lock);
1526 redirty_tail(inode, wb);
1527 continue;
1528 }
1529 if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539 spin_unlock(&inode->i_lock);
1540 requeue_io(inode, wb);
1541 trace_writeback_sb_inodes_requeue(inode);
1542 continue;
1543 }
1544 spin_unlock(&wb->list_lock);
1545
1546
1547
1548
1549
1550
1551 if (inode->i_state & I_SYNC) {
1552
1553 inode_sleep_on_writeback(inode);
1554
1555 spin_lock(&wb->list_lock);
1556 continue;
1557 }
1558 inode->i_state |= I_SYNC;
1559 wbc_attach_and_unlock_inode(&wbc, inode);
1560
1561 write_chunk = writeback_chunk_size(wb, work);
1562 wbc.nr_to_write = write_chunk;
1563 wbc.pages_skipped = 0;
1564
1565
1566
1567
1568
1569 __writeback_single_inode(inode, &wbc);
1570
1571 wbc_detach_inode(&wbc);
1572 work->nr_pages -= write_chunk - wbc.nr_to_write;
1573 wrote += write_chunk - wbc.nr_to_write;
1574
1575 if (need_resched()) {
1576
1577
1578
1579
1580
1581
1582
1583
1584 blk_flush_plug(current);
1585 cond_resched();
1586 }
1587
1588
1589
1590
1591
1592 tmp_wb = inode_to_wb_and_lock_list(inode);
1593 spin_lock(&inode->i_lock);
1594 if (!(inode->i_state & I_DIRTY_ALL))
1595 wrote++;
1596 requeue_inode(inode, tmp_wb, &wbc);
1597 inode_sync_complete(inode);
1598 spin_unlock(&inode->i_lock);
1599
1600 if (unlikely(tmp_wb != wb)) {
1601 spin_unlock(&tmp_wb->list_lock);
1602 spin_lock(&wb->list_lock);
1603 }
1604
1605
1606
1607
1608
1609 if (wrote) {
1610 if (time_is_before_jiffies(start_time + HZ / 10UL))
1611 break;
1612 if (work->nr_pages <= 0)
1613 break;
1614 }
1615 }
1616 return wrote;
1617}
1618
1619static long __writeback_inodes_wb(struct bdi_writeback *wb,
1620 struct wb_writeback_work *work)
1621{
1622 unsigned long start_time = jiffies;
1623 long wrote = 0;
1624
1625 while (!list_empty(&wb->b_io)) {
1626 struct inode *inode = wb_inode(wb->b_io.prev);
1627 struct super_block *sb = inode->i_sb;
1628
1629 if (!trylock_super(sb)) {
1630
1631
1632
1633
1634
1635 redirty_tail(inode, wb);
1636 continue;
1637 }
1638 wrote += writeback_sb_inodes(sb, wb, work);
1639 up_read(&sb->s_umount);
1640
1641
1642 if (wrote) {
1643 if (time_is_before_jiffies(start_time + HZ / 10UL))
1644 break;
1645 if (work->nr_pages <= 0)
1646 break;
1647 }
1648 }
1649
1650 return wrote;
1651}
1652
1653static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
1654 enum wb_reason reason)
1655{
1656 struct wb_writeback_work work = {
1657 .nr_pages = nr_pages,
1658 .sync_mode = WB_SYNC_NONE,
1659 .range_cyclic = 1,
1660 .reason = reason,
1661 };
1662 struct blk_plug plug;
1663
1664 blk_start_plug(&plug);
1665 spin_lock(&wb->list_lock);
1666 if (list_empty(&wb->b_io))
1667 queue_io(wb, &work);
1668 __writeback_inodes_wb(wb, &work);
1669 spin_unlock(&wb->list_lock);
1670 blk_finish_plug(&plug);
1671
1672 return nr_pages - work.nr_pages;
1673}
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
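/*
 * Explicit flushing or periodic writeback of "old" data.
 *
 * Define "old": the first time one of an inode's pages is dirtied, we mark the
 * dirtying-time in the inode's address_space.  So this periodic writeback code
 * just walks the superblock inode list, writing back any inodes which are
 * older than a specific point in time.
 *
 * older_than_this takes precedence over nr_pages.  So we'll only write back
 * all dirty pages if they are all attached to "old" mappings.
 */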
1690static long wb_writeback(struct bdi_writeback *wb,
1691 struct wb_writeback_work *work)
1692{
1693 unsigned long wb_start = jiffies;
1694 long nr_pages = work->nr_pages;
1695 unsigned long oldest_jif;
1696 struct inode *inode;
1697 long progress;
1698 struct blk_plug plug;
1699
1700 oldest_jif = jiffies;
1701 work->older_than_this = &oldest_jif;
1702
1703 blk_start_plug(&plug);
1704 spin_lock(&wb->list_lock);
1705 for (;;) {
1706
1707
1708
1709 if (work->nr_pages <= 0)
1710 break;
1711
1712
1713
1714
1715
1716
1717
1718 if ((work->for_background || work->for_kupdate) &&
1719 !list_empty(&wb->work_list))
1720 break;
1721
1722
1723
1724
1725
1726 if (work->for_background && !wb_over_bg_thresh(wb))
1727 break;
1728
1729
1730
1731
1732
1733
1734
1735 if (work->for_kupdate) {
1736 oldest_jif = jiffies -
1737 msecs_to_jiffies(dirty_expire_interval * 10);
1738 } else if (work->for_background)
1739 oldest_jif = jiffies;
1740
1741 trace_writeback_start(wb, work);
1742 if (list_empty(&wb->b_io))
1743 queue_io(wb, work);
1744 if (work->sb)
1745 progress = writeback_sb_inodes(work->sb, wb, work);
1746 else
1747 progress = __writeback_inodes_wb(wb, work);
1748 trace_writeback_written(wb, work);
1749
1750 wb_update_bandwidth(wb, wb_start);
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760 if (progress)
1761 continue;
1762
1763
1764
1765 if (list_empty(&wb->b_more_io))
1766 break;
1767
1768
1769
1770
1771
1772 if (!list_empty(&wb->b_more_io)) {
1773 trace_writeback_wait(wb, work);
1774 inode = wb_inode(wb->b_more_io.prev);
1775 spin_lock(&inode->i_lock);
1776 spin_unlock(&wb->list_lock);
1777
1778 inode_sleep_on_writeback(inode);
1779 spin_lock(&wb->list_lock);
1780 }
1781 }
1782 spin_unlock(&wb->list_lock);
1783 blk_finish_plug(&plug);
1784
1785 return nr_pages - work->nr_pages;
1786}
1787
1788
1789
1790
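/*
 * Return the next wb_writeback_work struct that hasn't been processed yet.
 */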
1791static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
1792{
1793 struct wb_writeback_work *work = NULL;
1794
1795 spin_lock_bh(&wb->work_lock);
1796 if (!list_empty(&wb->work_list)) {
1797 work = list_entry(wb->work_list.next,
1798 struct wb_writeback_work, list);
1799 list_del_init(&work->list);
1800 }
1801 spin_unlock_bh(&wb->work_lock);
1802 return work;
1803}
1804
1805
1806
1807
1808
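/*
 * Add in the number of potentially dirty inodes, because each inode
 * write can dirty pagecache in the underlying blockdev.
 */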
1809static unsigned long get_nr_dirty_pages(void)
1810{
1811 return global_node_page_state(NR_FILE_DIRTY) +
1812 global_node_page_state(NR_UNSTABLE_NFS) +
1813 get_nr_dirty_inodes();
1814}
1815
1816static long wb_check_background_flush(struct bdi_writeback *wb)
1817{
1818 if (wb_over_bg_thresh(wb)) {
1819
1820 struct wb_writeback_work work = {
1821 .nr_pages = LONG_MAX,
1822 .sync_mode = WB_SYNC_NONE,
1823 .for_background = 1,
1824 .range_cyclic = 1,
1825 .reason = WB_REASON_BACKGROUND,
1826 };
1827
1828 return wb_writeback(wb, &work);
1829 }
1830
1831 return 0;
1832}
1833
1834static long wb_check_old_data_flush(struct bdi_writeback *wb)
1835{
1836 unsigned long expired;
1837 long nr_pages;
1838
1839
1840
1841
1842 if (!dirty_writeback_interval)
1843 return 0;
1844
1845 expired = wb->last_old_flush +
1846 msecs_to_jiffies(dirty_writeback_interval * 10);
1847 if (time_before(jiffies, expired))
1848 return 0;
1849
1850 wb->last_old_flush = jiffies;
1851 nr_pages = get_nr_dirty_pages();
1852
1853 if (nr_pages) {
1854 struct wb_writeback_work work = {
1855 .nr_pages = nr_pages,
1856 .sync_mode = WB_SYNC_NONE,
1857 .for_kupdate = 1,
1858 .range_cyclic = 1,
1859 .reason = WB_REASON_PERIODIC,
1860 };
1861
1862 return wb_writeback(wb, &work);
1863 }
1864
1865 return 0;
1866}
1867
1868
1869
1870
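/*
 * Retrieve work items and do the writeback they describe
 */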
1871static long wb_do_writeback(struct bdi_writeback *wb)
1872{
1873 struct wb_writeback_work *work;
1874 long wrote = 0;
1875
1876 set_bit(WB_writeback_running, &wb->state);
1877 while ((work = get_next_work_item(wb)) != NULL) {
1878 struct wb_completion *done = work->done;
1879
1880 trace_writeback_exec(wb, work);
1881
1882 wrote += wb_writeback(wb, work);
1883
1884 if (work->auto_free)
1885 kfree(work);
1886 if (done && atomic_dec_and_test(&done->cnt))
1887 wake_up_all(&wb->bdi->wb_waitq);
1888 }
1889
1890
1891
1892
1893 wrote += wb_check_old_data_flush(wb);
1894 wrote += wb_check_background_flush(wb);
1895 clear_bit(WB_writeback_running, &wb->state);
1896
1897 return wrote;
1898}
1899
1900
1901
1902
1903
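/*
 * Handle writeback of dirty data for the device backed by this bdi. Also
 * reschedules periodically and does kupdated style flushing.
 */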
1904void wb_workfn(struct work_struct *work)
1905{
1906 struct bdi_writeback *wb = container_of(to_delayed_work(work),
1907 struct bdi_writeback, dwork);
1908 long pages_written;
1909
1910 set_worker_desc("flush-%s", dev_name(wb->bdi->dev));
1911 current->flags |= PF_SWAPWRITE;
1912
1913 if (likely(!current_is_workqueue_rescuer() ||
1914 !test_bit(WB_registered, &wb->state))) {
1915
1916
1917
1918
1919
1920
1921 do {
1922 pages_written = wb_do_writeback(wb);
1923 trace_writeback_pages_written(pages_written);
1924 } while (!list_empty(&wb->work_list));
1925 } else {
1926
1927
1928
1929
1930
1931 pages_written = writeback_inodes_wb(wb, 1024,
1932 WB_REASON_FORKER_THREAD);
1933 trace_writeback_pages_written(pages_written);
1934 }
1935
1936 if (!list_empty(&wb->work_list))
1937 mod_delayed_work(bdi_wq, &wb->dwork, 0);
1938 else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
1939 wb_wakeup_delayed(wb);
1940
1941 current->flags &= ~PF_SWAPWRITE;
1942}
1943
1944
1945
1946
1947
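/*
 * Start writeback of `nr_pages' pages on all bdis.  If `nr_pages' is zero,
 * write back the current number of dirty pages in the system.
 */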
1948void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
1949{
1950 struct backing_dev_info *bdi;
1951
1952
1953
1954
1955 if (blk_needs_flush_plug(current))
1956 blk_schedule_flush_plug(current);
1957
1958 if (!nr_pages)
1959 nr_pages = get_nr_dirty_pages();
1960
1961 rcu_read_lock();
1962 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
1963 struct bdi_writeback *wb;
1964
1965 if (!bdi_has_dirty_io(bdi))
1966 continue;
1967
1968 list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
1969 wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages),
1970 false, reason);
1971 }
1972 rcu_read_unlock();
1973}
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
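/*
 * Wake up bdi's periodically to make sure dirtytime inodes get written
 * back eventually.  We deliberately do *not* account the b_dirty_time list
 * in wb_has_dirty_io(), since that would keep the kernel waking up as long
 * as any dirtytime inodes exist; instead a separate, much more rarely run,
 * delayed work item handles them (every dirtytime_expire_interval seconds,
 * 12 hours by default).
 */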
1990static void wakeup_dirtytime_writeback(struct work_struct *w);
1991static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback);
1992
1993static void wakeup_dirtytime_writeback(struct work_struct *w)
1994{
1995 struct backing_dev_info *bdi;
1996
1997 rcu_read_lock();
1998 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
1999 struct bdi_writeback *wb;
2000
2001 list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
2002 if (!list_empty(&wb->b_dirty_time))
2003 wb_wakeup(wb);
2004 }
2005 rcu_read_unlock();
2006 schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
2007}
2008
2009static int __init start_dirtytime_writeback(void)
2010{
2011 schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
2012 return 0;
2013}
2014__initcall(start_dirtytime_writeback);
2015
2016int dirtytime_interval_handler(struct ctl_table *table, int write,
2017 void __user *buffer, size_t *lenp, loff_t *ppos)
2018{
2019 int ret;
2020
2021 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
2022 if (ret == 0 && write)
2023 mod_delayed_work(system_wq, &dirtytime_work, 0);
2024 return ret;
2025}
2026
2027static noinline void block_dump___mark_inode_dirty(struct inode *inode)
2028{
2029 if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
2030 struct dentry *dentry;
2031 const char *name = "?";
2032
2033 dentry = d_find_alias(inode);
2034 if (dentry) {
2035 spin_lock(&dentry->d_lock);
2036 name = (const char *) dentry->d_name.name;
2037 }
2038 printk(KERN_DEBUG
2039 "%s(%d): dirtied inode %lu (%s) on %s\n",
2040 current->comm, task_pid_nr(current), inode->i_ino,
2041 name, inode->i_sb->s_id);
2042 if (dentry) {
2043 spin_unlock(&dentry->d_lock);
2044 dput(dentry);
2045 }
2046 }
2047}
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
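/**
 * __mark_inode_dirty - internal function to mark an inode dirty
 * @inode: inode to mark
 * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
 *
 * Mark an inode as dirty.  Callers should use mark_inode_dirty() or
 * mark_inode_dirty_sync().
 *
 * Put the inode on its bdi_writeback's dirty list if it wasn't dirty
 * before.  Note that the inode state is marked dirty unconditionally, but
 * the inode is moved onto a dirty list only if it is hashed or refers to a
 * blockdev; an unhashed inode will never be added to the dirty list even
 * if it is hashed later.  In short, hash inodes _before_ marking them
 * dirty.
 */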
2073void __mark_inode_dirty(struct inode *inode, int flags)
2074{
2075#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
2076 struct super_block *sb = inode->i_sb;
2077 int dirtytime;
2078
2079 trace_writeback_mark_inode_dirty(inode, flags);
2080
2081
2082
2083
2084
2085 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_TIME)) {
2086 trace_writeback_dirty_inode_start(inode, flags);
2087
2088 if (sb->s_op->dirty_inode)
2089 sb->s_op->dirty_inode(inode, flags);
2090
2091 trace_writeback_dirty_inode(inode, flags);
2092 }
2093 if (flags & I_DIRTY_INODE)
2094 flags &= ~I_DIRTY_TIME;
2095 dirtytime = flags & I_DIRTY_TIME;
2096
2097
2098
2099
2100
2101 smp_mb();
2102
2103 if (((inode->i_state & flags) == flags) ||
2104 (dirtytime && (inode->i_state & I_DIRTY_INODE)))
2105 return;
2106
2107 if (unlikely(block_dump))
2108 block_dump___mark_inode_dirty(inode);
2109
2110 spin_lock(&inode->i_lock);
2111 if (dirtytime && (inode->i_state & I_DIRTY_INODE))
2112 goto out_unlock_inode;
2113 if ((inode->i_state & flags) != flags) {
2114 const int was_dirty = inode->i_state & I_DIRTY;
2115
2116 inode_attach_wb(inode, NULL);
2117
2118 if (flags & I_DIRTY_INODE)
2119 inode->i_state &= ~I_DIRTY_TIME;
2120 inode->i_state |= flags;
2121
2122
2123
2124
2125
2126
2127 if (inode->i_state & I_SYNC)
2128 goto out_unlock_inode;
2129
2130
2131
2132
2133
2134 if (!S_ISBLK(inode->i_mode)) {
2135 if (inode_unhashed(inode))
2136 goto out_unlock_inode;
2137 }
2138 if (inode->i_state & I_FREEING)
2139 goto out_unlock_inode;
2140
2141
2142
2143
2144
2145 if (!was_dirty) {
2146 struct bdi_writeback *wb;
2147 struct list_head *dirty_list;
2148 bool wakeup_bdi = false;
2149
2150 wb = locked_inode_to_wb_and_lock_list(inode);
2151
2152 WARN(bdi_cap_writeback_dirty(wb->bdi) &&
2153 !test_bit(WB_registered, &wb->state),
2154 "bdi-%s not registered\n", wb->bdi->name);
2155
2156 inode->dirtied_when = jiffies;
2157 if (dirtytime)
2158 inode->dirtied_time_when = jiffies;
2159
2160 if (inode->i_state & (I_DIRTY_INODE | I_DIRTY_PAGES))
2161 dirty_list = &wb->b_dirty;
2162 else
2163 dirty_list = &wb->b_dirty_time;
2164
2165 wakeup_bdi = inode_io_list_move_locked(inode, wb,
2166 dirty_list);
2167
2168 spin_unlock(&wb->list_lock);
2169 trace_writeback_dirty_inode_enqueue(inode);
2170
2171
2172
2173
2174
2175
2176
2177 if (bdi_cap_writeback_dirty(wb->bdi) && wakeup_bdi)
2178 wb_wakeup_delayed(wb);
2179 return;
2180 }
2181 }
2182out_unlock_inode:
2183 spin_unlock(&inode->i_lock);
2184
2185#undef I_DIRTY_INODE
2186}
2187EXPORT_SYMBOL(__mark_inode_dirty);
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
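/*
 * Wait for writeback on all inodes of @sb that were under writeback when
 * this function was called, providing the data integrity side of sync(2).
 * s_sync_lock serialises concurrent callers so they block on the mutex
 * instead of doing contending list walks.
 */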
2198static void wait_sb_inodes(struct super_block *sb)
2199{
2200 LIST_HEAD(sync_list);
2201
2202
2203
2204
2205
2206 WARN_ON(!rwsem_is_locked(&sb->s_umount));
2207
2208 mutex_lock(&sb->s_sync_lock);
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219 rcu_read_lock();
2220 spin_lock_irq(&sb->s_inode_wblist_lock);
2221 list_splice_init(&sb->s_inodes_wb, &sync_list);
2222
2223
2224
2225
2226
2227
2228
2229
2230 while (!list_empty(&sync_list)) {
2231 struct inode *inode = list_first_entry(&sync_list, struct inode,
2232 i_wb_list);
2233 struct address_space *mapping = inode->i_mapping;
2234
2235
2236
2237
2238
2239
2240
2241 list_move_tail(&inode->i_wb_list, &sb->s_inodes_wb);
2242
2243
2244
2245
2246
2247
2248 if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
2249 continue;
2250
2251 spin_unlock_irq(&sb->s_inode_wblist_lock);
2252
2253 spin_lock(&inode->i_lock);
2254 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
2255 spin_unlock(&inode->i_lock);
2256
2257 spin_lock_irq(&sb->s_inode_wblist_lock);
2258 continue;
2259 }
2260 __iget(inode);
2261 spin_unlock(&inode->i_lock);
2262 rcu_read_unlock();
2263
2264
2265
2266
2267
2268
2269 filemap_fdatawait_keep_errors(mapping);
2270
2271 cond_resched();
2272
2273 iput(inode);
2274
2275 rcu_read_lock();
2276 spin_lock_irq(&sb->s_inode_wblist_lock);
2277 }
2278 spin_unlock_irq(&sb->s_inode_wblist_lock);
2279 rcu_read_unlock();
2280 mutex_unlock(&sb->s_sync_lock);
2281}
2282
2283static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
2284 enum wb_reason reason, bool skip_if_busy)
2285{
2286 DEFINE_WB_COMPLETION_ONSTACK(done);
2287 struct wb_writeback_work work = {
2288 .sb = sb,
2289 .sync_mode = WB_SYNC_NONE,
2290 .tagged_writepages = 1,
2291 .done = &done,
2292 .nr_pages = nr,
2293 .reason = reason,
2294 };
2295 struct backing_dev_info *bdi = sb->s_bdi;
2296
2297 if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
2298 return;
2299 WARN_ON(!rwsem_is_locked(&sb->s_umount));
2300
2301 bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy);
2302 wb_wait_for_completion(bdi, &done);
2303}
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
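/**
 * writeback_inodes_sb_nr - writeback dirty inodes from given super_block
 * @sb: the superblock
 * @nr: the number of pages to write
 * @reason: reason why some writeback work initiated
 *
 * Start writeback on some inodes on this super_block. No guarantees are made
 * on how many (if any) will be written, and this function does not wait
 * for IO completion of submitted IO.
 */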
2315void writeback_inodes_sb_nr(struct super_block *sb,
2316 unsigned long nr,
2317 enum wb_reason reason)
2318{
2319 __writeback_inodes_sb_nr(sb, nr, reason, false);
2320}
2321EXPORT_SYMBOL(writeback_inodes_sb_nr);
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
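/**
 * writeback_inodes_sb - writeback dirty inodes from given super_block
 * @sb: the superblock
 * @reason: reason why some writeback work was initiated
 *
 * Start writeback on some inodes on this super_block. No guarantees are made
 * on how many (if any) will be written, and this function does not wait
 * for IO completion of submitted IO.
 */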
2332void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
2333{
2334 return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
2335}
2336EXPORT_SYMBOL(writeback_inodes_sb);
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
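/**
 * try_to_writeback_inodes_sb_nr - try to start writeback if none underway
 * @sb: the superblock
 * @nr: the number of pages to write
 * @reason: the reason of writeback
 *
 * Start writeback on this super_block, skipping wbs where writeback is
 * already in progress.  Returns %false if sb->s_umount could not be
 * acquired, %true otherwise.
 */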
2347bool try_to_writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
2348 enum wb_reason reason)
2349{
2350 if (!down_read_trylock(&sb->s_umount))
2351 return false;
2352
2353 __writeback_inodes_sb_nr(sb, nr, reason, true);
2354 up_read(&sb->s_umount);
2355 return true;
2356}
2357EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr);
2358
2359
2360
2361
2362
2363
2364
2365
2366
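/**
 * try_to_writeback_inodes_sb - try to start writeback if none underway
 * @sb: the superblock
 * @reason: reason why some writeback work was initiated
 *
 * Same as try_to_writeback_inodes_sb_nr() with the current number of dirty
 * pages as @nr.
 */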
2367bool try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
2368{
2369 return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
2370}
2371EXPORT_SYMBOL(try_to_writeback_inodes_sb);
2372
2373
2374
2375
2376
2377
2378
2379
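/**
 * sync_inodes_sb - sync sb inode pages
 * @sb: the superblock
 *
 * This function writes and waits on any dirty inode belonging to this
 * super_block.
 */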
2380void sync_inodes_sb(struct super_block *sb)
2381{
2382 DEFINE_WB_COMPLETION_ONSTACK(done);
2383 struct wb_writeback_work work = {
2384 .sb = sb,
2385 .sync_mode = WB_SYNC_ALL,
2386 .nr_pages = LONG_MAX,
2387 .range_cyclic = 0,
2388 .done = &done,
2389 .reason = WB_REASON_SYNC,
2390 .for_sync = 1,
2391 };
2392 struct backing_dev_info *bdi = sb->s_bdi;
2393
2394
2395
2396
2397
2398
2399 if (bdi == &noop_backing_dev_info)
2400 return;
2401 WARN_ON(!rwsem_is_locked(&sb->s_umount));
2402
2403 bdi_split_work_to_wbs(bdi, &work, false);
2404 wb_wait_for_completion(bdi, &done);
2405
2406 wait_sb_inodes(sb);
2407}
2408EXPORT_SYMBOL(sync_inodes_sb);
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
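/**
 * write_inode_now - write an inode to disk
 * @inode: inode to write to disk
 * @sync: whether the write should be synchronous or not
 *
 * This function commits an inode to disk immediately if it is dirty. This is
 * primarily needed by knfsd.
 *
 * The caller must either have a ref on the inode or must have set I_WILL_FREE.
 */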
2420int write_inode_now(struct inode *inode, int sync)
2421{
2422 struct writeback_control wbc = {
2423 .nr_to_write = LONG_MAX,
2424 .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
2425 .range_start = 0,
2426 .range_end = LLONG_MAX,
2427 };
2428
2429 if (!mapping_cap_writeback_dirty(inode->i_mapping))
2430 wbc.nr_to_write = 0;
2431
2432 might_sleep();
2433 return writeback_single_inode(inode, &wbc);
2434}
2435EXPORT_SYMBOL(write_inode_now);
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
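/**
 * sync_inode - write an inode and its pages to disk.
 * @inode: the inode to sync
 * @wbc: controls the writeback mode
 *
 * sync_inode() will write an inode and its pages to disk.  It will also
 * correctly update the inode on its superblock's dirty inode lists and will
 * update inode->i_state.
 *
 * The caller must have a ref on the inode.
 */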
2448int sync_inode(struct inode *inode, struct writeback_control *wbc)
2449{
2450 return writeback_single_inode(inode, wbc);
2451}
2452EXPORT_SYMBOL(sync_inode);
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
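/**
 * sync_inode_metadata - write an inode to disk
 * @inode: the inode to sync
 * @wait: wait for I/O to complete.
 *
 * Write an inode to disk and adjust its dirty state after completion.
 *
 * Note: only writes the actual inode, no associated data or other metadata.
 */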
2463int sync_inode_metadata(struct inode *inode, int wait)
2464{
2465 struct writeback_control wbc = {
2466 .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
2467 .nr_to_write = 0,
2468 };
2469
2470 return sync_inode(inode, &wbc);
2471}
2472EXPORT_SYMBOL(sync_inode_metadata);
2473