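/*
 * fs/fs-writeback.c
 *
 * Writeback of dirty pages and dirty inodes against their backing devices:
 * queueing and execution of per-bdi writeback work, expiry of dirty inodes
 * onto the IO lists, periodic and background writeback, and the cgroup
 * writeback machinery which tracks and switches an inode's writeback
 * ownership between memory cgroups.
 */
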
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/kthread.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/tracepoint.h>
#include <linux/device.h>
#include <linux/memcontrol.h>
#include "internal.h"

/*
 * 4MB minimal write chunk size
 */
#define MIN_WRITEBACK_PAGES	(4096UL >> (PAGE_SHIFT - 10))

struct wb_completion {
	atomic_t		cnt;
};

/*
 * Passed into wb_writeback(), essentially a subset of writeback_control.
 */
struct wb_writeback_work {
	long nr_pages;
	struct super_block *sb;
	unsigned long *older_than_this;
	enum writeback_sync_modes sync_mode;
	unsigned int tagged_writepages:1;
	unsigned int for_kupdate:1;
	unsigned int range_cyclic:1;
	unsigned int for_background:1;
	unsigned int for_sync:1;	/* sync(2) WB_SYNC_ALL writeback */
	unsigned int auto_free:1;	/* free on completion */
	enum wb_reason reason;		/* why was writeback initiated? */

	struct list_head list;		/* pending work list */
	struct wb_completion *done;	/* set if the caller waits */
};
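
/*
 * If one wants to wait for one or more wb_writeback_works, each work's
 * ->done should be set to a wb_completion defined using the following
 * macro.  Once all work items are issued with wb_queue_work(), the caller
 * can wait for the completion of all using wb_wait_for_completion().  Work
 * items which are waited upon aren't freed automatically on completion.
 */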
#define DEFINE_WB_COMPLETION_ONSTACK(cmpl)	\
	struct wb_completion cmpl = {		\
		.cnt = ATOMIC_INIT(1),		\
	}
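
/*
 * Inodes which are only dirty because of lazytime timestamp updates
 * (I_DIRTY_TIME) are written back once they have been dirty for longer
 * than this many seconds.  The default is 12 hours; in the worst case a
 * timestamp update may wait close to two intervals before it reaches disk.
 */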
unsigned int dirtytime_expire_interval = 12 * 60 * 60;

static inline struct inode *wb_inode(struct list_head *head)
{
	return list_entry(head, struct inode, i_io_list);
}
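
/*
 * Include the creation of the trace points after defining the
 * wb_writeback_work structure and inline functions so that the definitions
 * remain local to this file.
 */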
#define CREATE_TRACE_POINTS
#include <trace/events/writeback.h>

EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);

static bool wb_io_lists_populated(struct bdi_writeback *wb)
{
	if (wb_has_dirty_io(wb)) {
		return false;
	} else {
		set_bit(WB_has_dirty_io, &wb->state);
		WARN_ON_ONCE(!wb->avg_write_bandwidth);
		atomic_long_add(wb->avg_write_bandwidth,
				&wb->bdi->tot_write_bandwidth);
		return true;
	}
}

static void wb_io_lists_depopulated(struct bdi_writeback *wb)
{
	if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) &&
	    list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) {
		clear_bit(WB_has_dirty_io, &wb->state);
		WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth,
					&wb->bdi->tot_write_bandwidth) < 0);
	}
}
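
/**
 * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list
 * @inode: inode to be moved
 * @wb: target bdi_writeback
 * @head: one of @wb->b_{dirty|io|more_io|dirty_time}
 *
 * Move @inode->i_io_list to @head of @wb and set %WB_has_dirty_io.
 * Returns %true if @inode is the first occupant of the !dirty_time IO
 * lists; otherwise, %false.
 */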
static bool inode_io_list_move_locked(struct inode *inode,
				      struct bdi_writeback *wb,
				      struct list_head *head)
{
	assert_spin_locked(&wb->list_lock);

	list_move(&inode->i_io_list, head);

	/* dirty_time doesn't count as dirty_io until expiration */
	if (head != &wb->b_dirty_time)
		return wb_io_lists_populated(wb);

	wb_io_lists_depopulated(wb);
	return false;
}
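
/**
 * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list
 * @inode: inode to be removed
 * @wb: bdi_writeback @inode is being removed from
 *
 * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and
 * clear %WB_has_dirty_io if all IO lists become empty afterwards.
 */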
159static void inode_io_list_del_locked(struct inode *inode,
160 struct bdi_writeback *wb)
161{
162 assert_spin_locked(&wb->list_lock);
163
164 list_del_init(&inode->i_io_list);
165 wb_io_lists_depopulated(wb);
166}
167
168static void wb_wakeup(struct bdi_writeback *wb)
169{
170 spin_lock_bh(&wb->work_lock);
171 if (test_bit(WB_registered, &wb->state))
172 mod_delayed_work(bdi_wq, &wb->dwork, 0);
173 spin_unlock_bh(&wb->work_lock);
174}
175
176static void finish_writeback_work(struct bdi_writeback *wb,
177 struct wb_writeback_work *work)
178{
179 struct wb_completion *done = work->done;
180
181 if (work->auto_free)
182 kfree(work);
183 if (done && atomic_dec_and_test(&done->cnt))
184 wake_up_all(&wb->bdi->wb_waitq);
185}
186
187static void wb_queue_work(struct bdi_writeback *wb,
188 struct wb_writeback_work *work)
189{
190 trace_writeback_queue(wb, work);
191
192 if (work->done)
193 atomic_inc(&work->done->cnt);
194
195 spin_lock_bh(&wb->work_lock);
196
197 if (test_bit(WB_registered, &wb->state)) {
198 list_add_tail(&work->list, &wb->work_list);
199 mod_delayed_work(bdi_wq, &wb->dwork, 0);
200 } else
201 finish_writeback_work(wb, work);
202
203 spin_unlock_bh(&wb->work_lock);
204}
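
/**
 * wb_wait_for_completion - wait for completion of bdi_writeback_works
 * @bdi: bdi the work items were issued to
 * @done: target wb_completion
 *
 * Wait for one or more work items issued to @bdi with their ->done field
 * set to @done, which should have been defined with
 * DEFINE_WB_COMPLETION_ONSTACK().  This function returns after all such
 * work items are completed.  Work items which are waited upon aren't freed
 * automatically on completion.
 */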
217static void wb_wait_for_completion(struct backing_dev_info *bdi,
218 struct wb_completion *done)
219{
220 atomic_dec(&done->cnt);
221 wait_event(bdi->wb_waitq, !atomic_read(&done->cnt));
222}

#ifdef CONFIG_CGROUP_WRITEBACK

/* parameters for foreign inode detection, see wbc_detach_inode() */
#define WB_FRN_TIME_SHIFT	13	/* 1s = 2^13, up to 8 secs w/ 16bit */
#define WB_FRN_TIME_AVG_SHIFT	3	/* avg = avg * 7/8 + new * 1/8 */
#define WB_FRN_TIME_CUT_DIV	2	/* ignore rounds < avg / 2 */
#define WB_FRN_TIME_PERIOD	(2 * (1 << WB_FRN_TIME_SHIFT))	/* 2s */

#define WB_FRN_HIST_SLOTS	16	/* inode->i_wb_frn_history is 16bit */
#define WB_FRN_HIST_UNIT	(WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS)
					/* each slot's duration is 2s / 16 */
#define WB_FRN_HIST_THR_SLOTS	(WB_FRN_HIST_SLOTS / 2)
					/* if foreign slots >= 8, switch */
#define WB_FRN_HIST_MAX_SLOTS	(WB_FRN_HIST_THR_SLOTS / 2 + 1)
					/* one round can affect up to 5 slots */

static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
static struct workqueue_struct *isw_wq;
242
243void __inode_attach_wb(struct inode *inode, struct page *page)
244{
245 struct backing_dev_info *bdi = inode_to_bdi(inode);
246 struct bdi_writeback *wb = NULL;
247
248 if (inode_cgwb_enabled(inode)) {
249 struct cgroup_subsys_state *memcg_css;
250
251 if (page) {
252 memcg_css = mem_cgroup_css_from_page(page);
253 wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
254 } else {
255
256 memcg_css = task_get_css(current, memory_cgrp_id);
257 wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
258 css_put(memcg_css);
259 }
260 }
261
262 if (!wb)
263 wb = &bdi->wb;

	/*
	 * There may be multiple instances of this function racing to
	 * update the same inode.  Use cmpxchg() to tell the winner.
	 */
269 if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
270 wb_put(wb);
271}
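
/**
 * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
 * @inode: inode of interest with i_lock held
 *
 * Returns @inode's wb with its list_lock held.  @inode->i_lock must be
 * held on entry and is released on return.  The returned wb is guaranteed
 * to stay @inode's associated wb until its list_lock is released.
 */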
281static struct bdi_writeback *
282locked_inode_to_wb_and_lock_list(struct inode *inode)
283 __releases(&inode->i_lock)
284 __acquires(&wb->list_lock)
285{
286 while (true) {
287 struct bdi_writeback *wb = inode_to_wb(inode);

		/*
		 * The i_wb association can change until we hold
		 * @wb->list_lock: take a ref on the current @wb, drop
		 * i_lock, grab list_lock and then verify that the
		 * association is still intact; retry otherwise.
		 */
295 wb_get(wb);
296 spin_unlock(&inode->i_lock);
297 spin_lock(&wb->list_lock);
298
299
300 if (likely(wb == inode->i_wb)) {
301 wb_put(wb);
302 return wb;
303 }
304
305 spin_unlock(&wb->list_lock);
306 wb_put(wb);
307 cpu_relax();
308 spin_lock(&inode->i_lock);
309 }
310}
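
/**
 * inode_to_wb_and_lock_list - determine an inode's wb and lock it
 * @inode: inode of interest
 *
 * Same as locked_inode_to_wb_and_lock_list() but @inode->i_lock isn't held
 * on entry.
 */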
319static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
320 __acquires(&wb->list_lock)
321{
322 spin_lock(&inode->i_lock);
323 return locked_inode_to_wb_and_lock_list(inode);
324}
325
326struct inode_switch_wbs_context {
327 struct inode *inode;
328 struct bdi_writeback *new_wb;
329
330 struct rcu_head rcu_head;
331 struct work_struct work;
332};
333
334static void inode_switch_wbs_work_fn(struct work_struct *work)
335{
336 struct inode_switch_wbs_context *isw =
337 container_of(work, struct inode_switch_wbs_context, work);
338 struct inode *inode = isw->inode;
339 struct address_space *mapping = inode->i_mapping;
340 struct bdi_writeback *old_wb = inode->i_wb;
341 struct bdi_writeback *new_wb = isw->new_wb;
342 struct radix_tree_iter iter;
343 bool switched = false;
344 void **slot;

	/*
	 * By the time control reaches here, an RCU grace period has passed
	 * since I_WB_SWITCH was set on the inode, so any stat update
	 * sections entered via unlocked_inode_to_wb_begin/end() before the
	 * switch was scheduled have finished or are serialized by
	 * mapping->tree_lock.
	 *
	 * Grabbing both list_locks (ordered by address to avoid ABBA
	 * deadlocks), inode->i_lock and mapping->tree_lock gives exclusion
	 * against all wb related operations on @inode, including IO list
	 * manipulations and stat updates.
	 */
356 if (old_wb < new_wb) {
357 spin_lock(&old_wb->list_lock);
358 spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
359 } else {
360 spin_lock(&new_wb->list_lock);
361 spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
362 }
363 spin_lock(&inode->i_lock);
364 spin_lock_irq(&mapping->tree_lock);

	/*
	 * Once I_FREEING is visible under i_lock, the eviction path owns
	 * the inode and we shouldn't modify ->i_io_list.
	 */
370 if (unlikely(inode->i_state & I_FREEING))
371 goto skip_switch;

	/*
	 * Count and transfer stats.  Note that PAGECACHE_TAG_DIRTY points
	 * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to
	 * pages actually under writeback.
	 */
378 radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
379 PAGECACHE_TAG_DIRTY) {
380 struct page *page = radix_tree_deref_slot_protected(slot,
381 &mapping->tree_lock);
382 if (likely(page) && PageDirty(page)) {
383 dec_wb_stat(old_wb, WB_RECLAIMABLE);
384 inc_wb_stat(new_wb, WB_RECLAIMABLE);
385 }
386 }
387
388 radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
389 PAGECACHE_TAG_WRITEBACK) {
390 struct page *page = radix_tree_deref_slot_protected(slot,
391 &mapping->tree_lock);
392 if (likely(page)) {
393 WARN_ON_ONCE(!PageWriteback(page));
394 dec_wb_stat(old_wb, WB_WRITEBACK);
395 inc_wb_stat(new_wb, WB_WRITEBACK);
396 }
397 }
398
399 wb_get(new_wb);

	/*
	 * Transfer to @new_wb's IO list if necessary.  The specific list
	 * @inode was on is ignored and the inode is put on ->b_dirty which
	 * is always correct including from ->b_dirty_time.  The transfer
	 * preserves @inode->dirtied_when ordering.
	 */
407 if (!list_empty(&inode->i_io_list)) {
408 struct inode *pos;
409
410 inode_io_list_del_locked(inode, old_wb);
411 inode->i_wb = new_wb;
412 list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
413 if (time_after_eq(inode->dirtied_when,
414 pos->dirtied_when))
415 break;
416 inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev);
417 } else {
418 inode->i_wb = new_wb;
419 }
420
421
422 inode->i_wb_frn_winner = 0;
423 inode->i_wb_frn_avg_time = 0;
424 inode->i_wb_frn_history = 0;
425 switched = true;
426skip_switch:
	/*
	 * Paired with load_acquire in unlocked_inode_to_wb_begin() and
	 * ensures that the new wb is visible if they see !I_WB_SWITCH.
	 */
431 smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);
432
433 spin_unlock_irq(&mapping->tree_lock);
434 spin_unlock(&inode->i_lock);
435 spin_unlock(&new_wb->list_lock);
436 spin_unlock(&old_wb->list_lock);
437
438 if (switched) {
439 wb_wakeup(new_wb);
440 wb_put(old_wb);
441 }
442 wb_put(new_wb);
443
444 iput(inode);
445 kfree(isw);
446
447 atomic_dec(&isw_nr_in_flight);
448}
449
450static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
451{
452 struct inode_switch_wbs_context *isw = container_of(rcu_head,
453 struct inode_switch_wbs_context, rcu_head);
454
455
456 INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
457 queue_work(isw_wq, &isw->work);
458}
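
/**
 * inode_switch_wbs - change the wb association of an inode
 * @inode: target inode
 * @new_wb_id: ID of the new wb
 *
 * Switch @inode's wb association to the wb identified by @new_wb_id.  The
 * switching is performed asynchronously and may fail silently.
 */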
468static void inode_switch_wbs(struct inode *inode, int new_wb_id)
469{
470 struct backing_dev_info *bdi = inode_to_bdi(inode);
471 struct cgroup_subsys_state *memcg_css;
472 struct inode_switch_wbs_context *isw;
473
474
475 if (inode->i_state & I_WB_SWITCH)
476 return;
477
478 isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
479 if (!isw)
480 return;
481
482
483 rcu_read_lock();
484 memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys);
485 if (memcg_css)
486 isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
487 rcu_read_unlock();
488 if (!isw->new_wb)
489 goto out_free;
490
491
492 spin_lock(&inode->i_lock);
493 if (!(inode->i_sb->s_flags & MS_ACTIVE) ||
494 inode->i_state & (I_WB_SWITCH | I_FREEING) ||
495 inode_to_wb(inode) == isw->new_wb) {
496 spin_unlock(&inode->i_lock);
497 goto out_free;
498 }
499 inode->i_state |= I_WB_SWITCH;
500 __iget(inode);
501 spin_unlock(&inode->i_lock);
502
503 isw->inode = inode;
504
505 atomic_inc(&isw_nr_in_flight);

	/*
	 * In addition to synchronizing among switchers, I_WB_SWITCH tells
	 * the RCU protected stat update paths to grab mapping->tree_lock
	 * so that stat transfer can synchronize against them.
	 * Let's continue after I_WB_SWITCH is guaranteed to be visible.
	 */
513 call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
514 return;
515
516out_free:
517 if (isw->new_wb)
518 wb_put(isw->new_wb);
519 kfree(isw);
520}
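
/**
 * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
 * @wbc: writeback_control of interest
 * @inode: target inode
 *
 * @inode is locked and about to be written back under the control of @wbc.
 * Record @inode's writeback context into @wbc and unlock the i_lock.  On
 * writeback completion, wbc_detach_inode() should be called.  This is used
 * to track the cgroup writeback context which is being handled by @wbc.
 */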
532void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
533 struct inode *inode)
534{
535 if (!inode_cgwb_enabled(inode)) {
536 spin_unlock(&inode->i_lock);
537 return;
538 }
539
540 wbc->wb = inode_to_wb(inode);
541 wbc->inode = inode;
542
543 wbc->wb_id = wbc->wb->memcg_css->id;
544 wbc->wb_lcand_id = inode->i_wb_frn_winner;
545 wbc->wb_tcand_id = 0;
546 wbc->wb_bytes = 0;
547 wbc->wb_lcand_bytes = 0;
548 wbc->wb_tcand_bytes = 0;
549
550 wb_get(wbc->wb);
551 spin_unlock(&inode->i_lock);

	/*
	 * A dying wb indicates that the memcg-blkcg mapping has changed
	 * and a new wb is already serving the memcg.  Switch immediately.
	 */
557 if (unlikely(wb_dying(wbc->wb)))
558 inode_switch_wbs(inode, wbc->wb_id);
559}
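
/**
 * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
 * @wbc: writeback_control of the just finished writeback
 *
 * To be called after a writeback attempt of an inode finishes and undoes
 * wbc_attach_and_unlock_inode().  Can be called under any context.
 *
 * As concurrent write sharing of an inode is expected to be very rare and
 * memcg only tracks page ownership on first-use basis severely confining
 * the usefulness of such sharing, cgroup writeback tracks ownership
 * per-inode.  While the support for concurrent write sharing of an inode
 * is deemed unnecessary, an inode being written to by different cgroups at
 * different points in time is a lot more common, and, more importantly,
 * charging only by first-use can too readily lead to grossly incorrect
 * behaviors (a single foreign page can lead to gigabytes of writeback
 * being incorrectly attributed).
 *
 * To resolve this issue, cgroup writeback detects the majority dirtier of
 * an inode and transfers the ownership to it.  To avoid unnecessary
 * oscillation, the detection mechanism keeps track of history and gives
 * out the switch verdict only if the foreign usage pattern is stable over
 * a certain amount of time and/or writeback attempts.
 *
 * On each writeback attempt, @wbc tries to detect the majority writer
 * using the Boyer-Moore majority vote algorithm.  In addition to the byte
 * count from the majority voting, it also counts the bytes written for the
 * current wb and the last round's winner wb.  Keeping track of the
 * historical winner helps the algorithm semi-reliably detect the most
 * active writer even when it's not the absolute majority.
 *
 * Once the winner of the round is determined, whether the winner is
 * foreign or not and how much IO time the round consumed is recorded in
 * inode->i_wb_frn_history.  If the amount of recorded foreign IO time is
 * larger than the current threshold, the switch verdict is given.
 */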
598void wbc_detach_inode(struct writeback_control *wbc)
599{
600 struct bdi_writeback *wb = wbc->wb;
601 struct inode *inode = wbc->inode;
602 unsigned long avg_time, max_bytes, max_time;
603 u16 history;
604 int max_id;
605
606 if (!wb)
607 return;
608
609 history = inode->i_wb_frn_history;
610 avg_time = inode->i_wb_frn_avg_time;
611
612
613 if (wbc->wb_bytes >= wbc->wb_lcand_bytes &&
614 wbc->wb_bytes >= wbc->wb_tcand_bytes) {
615 max_id = wbc->wb_id;
616 max_bytes = wbc->wb_bytes;
617 } else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) {
618 max_id = wbc->wb_lcand_id;
619 max_bytes = wbc->wb_lcand_bytes;
620 } else {
621 max_id = wbc->wb_tcand_id;
622 max_bytes = wbc->wb_tcand_bytes;
623 }
624
625
626
627
628
629
630
631
632 max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT,
633 wb->avg_write_bandwidth);
634 if (avg_time)
635 avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) -
636 (avg_time >> WB_FRN_TIME_AVG_SHIFT);
637 else
638 avg_time = max_time;
639
640 if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) {
641 int slots;
642
643
644
645
646
647
648
649
650
651 slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT),
652 (unsigned long)WB_FRN_HIST_MAX_SLOTS);
653 history <<= slots;
654 if (wbc->wb_id != max_id)
655 history |= (1U << slots) - 1;
656
657
658
659
660
661
662
663
664 if (hweight32(history) > WB_FRN_HIST_THR_SLOTS)
665 inode_switch_wbs(inode, max_id);
666 }
667
668
669
670
671
672 inode->i_wb_frn_winner = max_id;
673 inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX);
674 inode->i_wb_frn_history = history;
675
676 wb_put(wbc->wb);
677 wbc->wb = NULL;
678}
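
/**
 * wbc_account_io - account IO issued during writeback
 * @wbc: writeback_control of the writeback in progress
 * @page: page being written out
 * @bytes: number of bytes being written out
 *
 * @bytes from @page are accounted to @wbc for foreign inode detection; the
 * per-cgroup byte counts gathered here feed the majority vote performed by
 * wbc_detach_inode().
 */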
690void wbc_account_io(struct writeback_control *wbc, struct page *page,
691 size_t bytes)
692{
693 int id;

	/*
	 * Not every writeback path attaches @wbc to an inode (see
	 * wbc_attach_and_unlock_inode()); without a wb there is nothing
	 * to account against.
	 */
701 if (!wbc->wb)
702 return;
703
704 id = mem_cgroup_css_from_page(page)->id;
705
706 if (id == wbc->wb_id) {
707 wbc->wb_bytes += bytes;
708 return;
709 }
710
711 if (id == wbc->wb_lcand_id)
712 wbc->wb_lcand_bytes += bytes;

	/* Boyer-Moore majority vote algorithm */
715 if (!wbc->wb_tcand_bytes)
716 wbc->wb_tcand_id = id;
717 if (id == wbc->wb_tcand_id)
718 wbc->wb_tcand_bytes += bytes;
719 else
720 wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
721}
722EXPORT_SYMBOL_GPL(wbc_account_io);
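
/**
 * inode_congested - test whether an inode is congested
 * @inode: inode to test for congestion (may be NULL)
 * @cong_bits: mask of WB_[a]sync_congested bits to test
 *
 * Tests whether @inode is congested.  @cong_bits is the mask of congestion
 * bits to test and the return value is the mask of set bits.
 *
 * If cgroup writeback is enabled for @inode, the congestion state is
 * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg
 * associated with @inode is congested; otherwise, the root wb's congestion
 * state is used.
 *
 * @inode is allowed to be NULL as this function is often called on
 * mapping->host which is NULL for the swapper space.
 */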
740int inode_congested(struct inode *inode, int cong_bits)
741{
742
743
744
745
746 if (inode && inode_to_wb_is_valid(inode)) {
747 struct bdi_writeback *wb;
748 bool locked, congested;
749
750 wb = unlocked_inode_to_wb_begin(inode, &locked);
751 congested = wb_congested(wb, cong_bits);
752 unlocked_inode_to_wb_end(inode, locked);
753 return congested;
754 }
755
756 return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
757}
758EXPORT_SYMBOL_GPL(inode_congested);
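
/**
 * wb_split_bdi_pages - split nr_pages to write according to bandwidth
 * @wb: target bdi_writeback to split @nr_pages to
 * @nr_pages: number of pages to write for the whole bdi
 *
 * Split @wb's portion of @nr_pages according to @wb's write bandwidth in
 * relation to the total write bandwidth of all wb's with dirty inodes on
 * @wb->bdi.
 */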
769static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
770{
771 unsigned long this_bw = wb->avg_write_bandwidth;
772 unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
773
774 if (nr_pages == LONG_MAX)
775 return LONG_MAX;
776
777
778
779
780
781
782 if (!tot_bw || this_bw >= tot_bw)
783 return nr_pages;
784 else
785 return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw);
786}
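
/**
 * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
 * @bdi: target backing_dev_info
 * @base_work: wb_writeback_work to issue
 * @skip_if_busy: skip wb's which already have writeback in progress
 *
 * Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which
 * have dirty inodes.  If @base_work->nr_pages isn't %LONG_MAX, it's
 * distributed to the busy wbs according to each wb's proportion in the
 * total active write bandwidth of @bdi.
 */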
799static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
800 struct wb_writeback_work *base_work,
801 bool skip_if_busy)
802{
803 struct bdi_writeback *last_wb = NULL;
804 struct bdi_writeback *wb = list_entry(&bdi->wb_list,
805 struct bdi_writeback, bdi_node);
806
807 might_sleep();
808restart:
809 rcu_read_lock();
810 list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
811 DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
812 struct wb_writeback_work fallback_work;
813 struct wb_writeback_work *work;
814 long nr_pages;
815
816 if (last_wb) {
817 wb_put(last_wb);
818 last_wb = NULL;
819 }
820
821
822 if (!wb_has_dirty_io(wb) &&
823 (base_work->sync_mode == WB_SYNC_NONE ||
824 list_empty(&wb->b_dirty_time)))
825 continue;
826 if (skip_if_busy && writeback_in_progress(wb))
827 continue;
828
829 nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);
830
831 work = kmalloc(sizeof(*work), GFP_ATOMIC);
832 if (work) {
833 *work = *base_work;
834 work->nr_pages = nr_pages;
835 work->auto_free = 1;
836 wb_queue_work(wb, work);
837 continue;
838 }
839
840
841 work = &fallback_work;
842 *work = *base_work;
843 work->nr_pages = nr_pages;
844 work->auto_free = 0;
845 work->done = &fallback_work_done;
846
847 wb_queue_work(wb, work);
848
849
850
851
852
853
854 wb_get(wb);
855 last_wb = wb;
856
857 rcu_read_unlock();
858 wb_wait_for_completion(bdi, &fallback_work_done);
859 goto restart;
860 }
861 rcu_read_unlock();
862
863 if (last_wb)
864 wb_put(last_wb);
865}
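
/**
 * cgroup_writeback_umount - flush inode wb switches for umount
 *
 * This function is called when a super_block is about to be destroyed and
 * flushes in-flight inode wb switches.  An inode wb switch goes through
 * RCU and then workqueue, so the two need to be flushed in order to ensure
 * that all previously scheduled switches are finished.  As wb switches are
 * rare occurrences and synchronize_rcu() can take a while, perform
 * flushing iff wb switches are in flight.
 */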
877void cgroup_writeback_umount(void)
878{
879 if (atomic_read(&isw_nr_in_flight)) {
880 synchronize_rcu();
881 flush_workqueue(isw_wq);
882 }
883}
884
885static int __init cgroup_writeback_init(void)
886{
887 isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
888 if (!isw_wq)
889 return -ENOMEM;
890 return 0;
891}
892fs_initcall(cgroup_writeback_init);
893
894#else
895
896static struct bdi_writeback *
897locked_inode_to_wb_and_lock_list(struct inode *inode)
898 __releases(&inode->i_lock)
899 __acquires(&wb->list_lock)
900{
901 struct bdi_writeback *wb = inode_to_wb(inode);
902
903 spin_unlock(&inode->i_lock);
904 spin_lock(&wb->list_lock);
905 return wb;
906}
907
908static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
909 __acquires(&wb->list_lock)
910{
911 struct bdi_writeback *wb = inode_to_wb(inode);
912
913 spin_lock(&wb->list_lock);
914 return wb;
915}
916
917static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
918{
919 return nr_pages;
920}
921
922static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
923 struct wb_writeback_work *base_work,
924 bool skip_if_busy)
925{
926 might_sleep();
927
928 if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) {
929 base_work->auto_free = 0;
930 wb_queue_work(&bdi->wb, base_work);
931 }
932}
933
934#endif
935
936void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
937 bool range_cyclic, enum wb_reason reason)
938{
939 struct wb_writeback_work *work;
940
941 if (!wb_has_dirty_io(wb))
942 return;
943
944
945
946
947
948 work = kzalloc(sizeof(*work),
949 GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
950 if (!work) {
951 trace_writeback_nowork(wb);
952 wb_wakeup(wb);
953 return;
954 }
955
956 work->sync_mode = WB_SYNC_NONE;
957 work->nr_pages = nr_pages;
958 work->range_cyclic = range_cyclic;
959 work->reason = reason;
960 work->auto_free = 1;
961
962 wb_queue_work(wb, work);
963}
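
/**
 * wb_start_background_writeback - start background writeback
 * @wb: bdi_writeback to write from
 *
 * Description:
 *   This makes sure WB_SYNC_NONE background writeback happens.  When this
 *   function returns, it is only guaranteed that for the given wb some IO
 *   is happening if we are over the background dirty threshold.  The
 *   caller need not hold the sb s_umount semaphore.
 */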
975void wb_start_background_writeback(struct bdi_writeback *wb)
976{
	/*
	 * We just wake up the flusher thread.  It will perform background
	 * writeback as soon as there is no other work to do.
	 */
981 trace_writeback_wake_background(wb);
982 wb_wakeup(wb);
983}

/*
 * Remove the inode from the writeback list it is on.
 */
988void inode_io_list_del(struct inode *inode)
989{
990 struct bdi_writeback *wb;
991
992 wb = inode_to_wb_and_lock_list(inode);
993 inode_io_list_del_locked(inode, wb);
994 spin_unlock(&wb->list_lock);
995}

/*
 * Mark an inode as under writeback on the sb.
 */
1000void sb_mark_inode_writeback(struct inode *inode)
1001{
1002 struct super_block *sb = inode->i_sb;
1003 unsigned long flags;
1004
1005 if (list_empty(&inode->i_wb_list)) {
1006 spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
1007 if (list_empty(&inode->i_wb_list)) {
1008 list_add_tail(&inode->i_wb_list, &sb->s_inodes_wb);
1009 trace_sb_mark_inode_writeback(inode);
1010 }
1011 spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
1012 }
1013}

/*
 * Clear an inode as under writeback on the sb.
 */
1018void sb_clear_inode_writeback(struct inode *inode)
1019{
1020 struct super_block *sb = inode->i_sb;
1021 unsigned long flags;
1022
1023 if (!list_empty(&inode->i_wb_list)) {
1024 spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
1025 if (!list_empty(&inode->i_wb_list)) {
1026 list_del_init(&inode->i_wb_list);
1027 trace_sb_clear_inode_writeback(inode);
1028 }
1029 spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
1030 }
1031}

/*
 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
 * furthest end of its superblock's dirty-inode list.
 *
 * Before stamping the inode's ->dirtied_when, we check to see whether it is
 * already the most-recently-dirtied inode on the b_dirty list.  If that is
 * the case then the inode must have been redirtied while it was being written
 * out and we don't reset its dirtied_when.
 */
1042static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
1043{
1044 if (!list_empty(&wb->b_dirty)) {
1045 struct inode *tail;
1046
1047 tail = wb_inode(wb->b_dirty.next);
1048 if (time_before(inode->dirtied_when, tail->dirtied_when))
1049 inode->dirtied_when = jiffies;
1050 }
1051 inode_io_list_move_locked(inode, wb, &wb->b_dirty);
1052}

/*
 * Requeue inode for re-scanning after bdi->b_io list is exhausted.
 */
1057static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
1058{
1059 inode_io_list_move_locked(inode, wb, &wb->b_more_io);
1060}
1061
1062static void inode_sync_complete(struct inode *inode)
1063{
1064 inode->i_state &= ~I_SYNC;
1065
1066 inode_add_lru(inode);
1067
1068 smp_mb();
1069 wake_up_bit(&inode->i_state, __I_SYNC);
1070}
1071
1072static bool inode_dirtied_after(struct inode *inode, unsigned long t)
1073{
1074 bool ret = time_after(inode->dirtied_when, t);
1075#ifndef CONFIG_64BIT
1076
1077
1078
1079
1080
1081
1082 ret = ret && time_before_eq(inode->dirtied_when, jiffies);
1083#endif
1084 return ret;
1085}
1086
1087#define EXPIRE_DIRTY_ATIME 0x0001

/*
 * Move expired (dirtied before work->older_than_this) dirty inodes from
 * @delaying_queue to @dispatch_queue.
 */
1093static int move_expired_inodes(struct list_head *delaying_queue,
1094 struct list_head *dispatch_queue,
1095 int flags,
1096 struct wb_writeback_work *work)
1097{
1098 unsigned long *older_than_this = NULL;
1099 unsigned long expire_time;
1100 LIST_HEAD(tmp);
1101 struct list_head *pos, *node;
1102 struct super_block *sb = NULL;
1103 struct inode *inode;
1104 int do_sb_sort = 0;
1105 int moved = 0;
1106
1107 if ((flags & EXPIRE_DIRTY_ATIME) == 0)
1108 older_than_this = work->older_than_this;
1109 else if (!work->for_sync) {
1110 expire_time = jiffies - (dirtytime_expire_interval * HZ);
1111 older_than_this = &expire_time;
1112 }
1113 while (!list_empty(delaying_queue)) {
1114 inode = wb_inode(delaying_queue->prev);
1115 if (older_than_this &&
1116 inode_dirtied_after(inode, *older_than_this))
1117 break;
1118 list_move(&inode->i_io_list, &tmp);
1119 moved++;
1120 if (flags & EXPIRE_DIRTY_ATIME)
1121 set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state);
1122 if (sb_is_blkdev_sb(inode->i_sb))
1123 continue;
1124 if (sb && sb != inode->i_sb)
1125 do_sb_sort = 1;
1126 sb = inode->i_sb;
1127 }
1128
1129
1130 if (!do_sb_sort) {
1131 list_splice(&tmp, dispatch_queue);
1132 goto out;
1133 }
1134
1135
1136 while (!list_empty(&tmp)) {
1137 sb = wb_inode(tmp.prev)->i_sb;
1138 list_for_each_prev_safe(pos, node, &tmp) {
1139 inode = wb_inode(pos);
1140 if (inode->i_sb == sb)
1141 list_move(&inode->i_io_list, dispatch_queue);
1142 }
1143 }
1144out:
1145 return moved;
1146}
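
/*
 * Queue all expired dirty inodes for io, eldest first.
 * Before
 *         newly dirtied     b_dirty    b_io    b_more_io
 *         =============>    gf         edc     BA
 * After
 *         newly dirtied     b_dirty    b_io    b_more_io
 *         =============>    g          fBAedc
 *                                           |
 *                                           +--> dequeue for IO
 */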
1159static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
1160{
1161 int moved;
1162
1163 assert_spin_locked(&wb->list_lock);
1164 list_splice_init(&wb->b_more_io, &wb->b_io);
1165 moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work);
1166 moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
1167 EXPIRE_DIRTY_ATIME, work);
1168 if (moved)
1169 wb_io_lists_populated(wb);
1170 trace_writeback_queue_io(wb, work, moved);
1171}
1172
1173static int write_inode(struct inode *inode, struct writeback_control *wbc)
1174{
1175 int ret;
1176
1177 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) {
1178 trace_writeback_write_inode_start(inode, wbc);
1179 ret = inode->i_sb->s_op->write_inode(inode, wbc);
1180 trace_writeback_write_inode(inode, wbc);
1181 return ret;
1182 }
1183 return 0;
1184}
1185
1186
1187
1188
1189
1190static void __inode_wait_for_writeback(struct inode *inode)
1191 __releases(inode->i_lock)
1192 __acquires(inode->i_lock)
1193{
1194 DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
1195 wait_queue_head_t *wqh;
1196
1197 wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
1198 while (inode->i_state & I_SYNC) {
1199 spin_unlock(&inode->i_lock);
1200 __wait_on_bit(wqh, &wq, bit_wait,
1201 TASK_UNINTERRUPTIBLE);
1202 spin_lock(&inode->i_lock);
1203 }
1204}
1205
1206
1207
1208
1209void inode_wait_for_writeback(struct inode *inode)
1210{
1211 spin_lock(&inode->i_lock);
1212 __inode_wait_for_writeback(inode);
1213 spin_unlock(&inode->i_lock);
1214}
1215
1216
1217
1218
1219
1220
1221static void inode_sleep_on_writeback(struct inode *inode)
1222 __releases(inode->i_lock)
1223{
1224 DEFINE_WAIT(wait);
1225 wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
1226 int sleep;
1227
1228 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
1229 sleep = inode->i_state & I_SYNC;
1230 spin_unlock(&inode->i_lock);
1231 if (sleep)
1232 schedule();
1233 finish_wait(wqh, &wait);
1234}

/*
 * Find the proper writeback list for the inode depending on its current state
 * and possibly also change of its state while we were doing writeback.  Here
 * we handle things such as livelock prevention and fairness between flushers:
 * an inode that still has dirty pages only because its chunk of nr_to_write
 * ran out is retried via b_more_io, while inodes whose writeback stalled or
 * which were redirtied are moved back to the tail of b_dirty.
 */
1244static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
1245 struct writeback_control *wbc)
1246{
1247 if (inode->i_state & I_FREEING)
1248 return;
1249
1250
1251
1252
1253
1254
1255 if ((inode->i_state & I_DIRTY) &&
1256 (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
1257 inode->dirtied_when = jiffies;
1258
1259 if (wbc->pages_skipped) {
1260
1261
1262
1263
1264 redirty_tail(inode, wb);
1265 return;
1266 }
1267
1268 if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
1269
1270
1271
1272
1273 if (wbc->nr_to_write <= 0) {
1274
1275 requeue_io(inode, wb);
1276 } else {
1277
1278
1279
1280
1281
1282
1283
1284 redirty_tail(inode, wb);
1285 }
1286 } else if (inode->i_state & I_DIRTY) {
1287
1288
1289
1290
1291
1292 redirty_tail(inode, wb);
1293 } else if (inode->i_state & I_DIRTY_TIME) {
1294 inode->dirtied_when = jiffies;
1295 inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
1296 } else {
1297
1298 inode_io_list_del_locked(inode, wb);
1299 }
1300}

/*
 * Write out an inode and its dirty pages.  Do not update the writeback list
 * linkage.  That is left to the caller.  The caller is also responsible for
 * setting the I_SYNC flag and calling inode_sync_complete() to clear it.
 */
1307static int
1308__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
1309{
1310 struct address_space *mapping = inode->i_mapping;
1311 long nr_to_write = wbc->nr_to_write;
1312 unsigned dirty;
1313 int ret;
1314
1315 WARN_ON(!(inode->i_state & I_SYNC));
1316
1317 trace_writeback_single_inode_start(inode, wbc, nr_to_write);
1318
1319 ret = do_writepages(mapping, wbc);
1320
1321
1322
1323
1324
1325
1326
1327
1328 if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
1329 int err = filemap_fdatawait(mapping);
1330 if (ret == 0)
1331 ret = err;
1332 }
1333
1334
1335
1336
1337
1338
1339 spin_lock(&inode->i_lock);
1340
1341 dirty = inode->i_state & I_DIRTY;
1342 if (inode->i_state & I_DIRTY_TIME) {
1343 if ((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) ||
1344 wbc->sync_mode == WB_SYNC_ALL ||
1345 unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) ||
1346 unlikely(time_after(jiffies,
1347 (inode->dirtied_time_when +
1348 dirtytime_expire_interval * HZ)))) {
1349 dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED;
1350 trace_writeback_lazytime(inode);
1351 }
1352 } else
1353 inode->i_state &= ~I_DIRTY_TIME_EXPIRED;
1354 inode->i_state &= ~dirty;
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367 smp_mb();
1368
1369 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
1370 inode->i_state |= I_DIRTY_PAGES;
1371
1372 spin_unlock(&inode->i_lock);
1373
1374 if (dirty & I_DIRTY_TIME)
1375 mark_inode_dirty_sync(inode);
1376
1377 if (dirty & ~I_DIRTY_PAGES) {
1378 int err = write_inode(inode, wbc);
1379 if (ret == 0)
1380 ret = err;
1381 }
1382 trace_writeback_single_inode(inode, wbc, nr_to_write);
1383 return ret;
1384}
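
/*
 * Write out an inode's dirty pages.  Either the caller has an active reference
 * on the inode or the inode has I_WILL_FREE set.
 *
 * This function is meant for callers writing back a single inode, e.g. a
 * filesystem.  The flusher thread uses __writeback_single_inode() instead and
 * does more elaborate writeback list handling in writeback_sb_inodes().
 */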
1394static int writeback_single_inode(struct inode *inode,
1395 struct writeback_control *wbc)
1396{
1397 struct bdi_writeback *wb;
1398 int ret = 0;
1399
1400 spin_lock(&inode->i_lock);
1401 if (!atomic_read(&inode->i_count))
1402 WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
1403 else
1404 WARN_ON(inode->i_state & I_WILL_FREE);
1405
1406 if (inode->i_state & I_SYNC) {
1407 if (wbc->sync_mode != WB_SYNC_ALL)
1408 goto out;
1409
1410
1411
1412
1413
1414 __inode_wait_for_writeback(inode);
1415 }
1416 WARN_ON(inode->i_state & I_SYNC);
1417
1418
1419
1420
1421
1422
1423
1424
1425 if (!(inode->i_state & I_DIRTY_ALL) &&
1426 (wbc->sync_mode != WB_SYNC_ALL ||
1427 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
1428 goto out;
1429 inode->i_state |= I_SYNC;
1430 wbc_attach_and_unlock_inode(wbc, inode);
1431
1432 ret = __writeback_single_inode(inode, wbc);
1433
1434 wbc_detach_inode(wbc);
1435
1436 wb = inode_to_wb_and_lock_list(inode);
1437 spin_lock(&inode->i_lock);
1438
1439
1440
1441
1442 if (!(inode->i_state & I_DIRTY_ALL))
1443 inode_io_list_del_locked(inode, wb);
1444 spin_unlock(&wb->list_lock);
1445 inode_sync_complete(inode);
1446out:
1447 spin_unlock(&inode->i_lock);
1448 return ret;
1449}
1450
1451static long writeback_chunk_size(struct bdi_writeback *wb,
1452 struct wb_writeback_work *work)
1453{
1454 long pages;
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469 if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
1470 pages = LONG_MAX;
1471 else {
1472 pages = min(wb->avg_write_bandwidth / 2,
1473 global_wb_domain.dirty_limit / DIRTY_SCOPE);
1474 pages = min(pages, work->nr_pages);
1475 pages = round_down(pages + MIN_WRITEBACK_PAGES,
1476 MIN_WRITEBACK_PAGES);
1477 }
1478
1479 return pages;
1480}

/*
 * Write a portion of b_io inodes which belong to @sb.
 *
 * Return the number of pages and/or inodes written.
 *
 * NOTE! This is called with wb->list_lock held, and will
 * unlock and relock that for each inode it ends up doing
 * IO for.
 */
1491static long writeback_sb_inodes(struct super_block *sb,
1492 struct bdi_writeback *wb,
1493 struct wb_writeback_work *work)
1494{
1495 struct writeback_control wbc = {
1496 .sync_mode = work->sync_mode,
1497 .tagged_writepages = work->tagged_writepages,
1498 .for_kupdate = work->for_kupdate,
1499 .for_background = work->for_background,
1500 .for_sync = work->for_sync,
1501 .range_cyclic = work->range_cyclic,
1502 .range_start = 0,
1503 .range_end = LLONG_MAX,
1504 };
1505 unsigned long start_time = jiffies;
1506 long write_chunk;
1507 long wrote = 0;
1508
1509 while (!list_empty(&wb->b_io)) {
1510 struct inode *inode = wb_inode(wb->b_io.prev);
1511 struct bdi_writeback *tmp_wb;
1512
1513 if (inode->i_sb != sb) {
1514 if (work->sb) {
1515
1516
1517
1518
1519
1520 redirty_tail(inode, wb);
1521 continue;
1522 }
1523
1524
1525
1526
1527
1528
1529 break;
1530 }
1531
1532
1533
1534
1535
1536
1537 spin_lock(&inode->i_lock);
1538 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
1539 spin_unlock(&inode->i_lock);
1540 redirty_tail(inode, wb);
1541 continue;
1542 }
1543 if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553 spin_unlock(&inode->i_lock);
1554 requeue_io(inode, wb);
1555 trace_writeback_sb_inodes_requeue(inode);
1556 continue;
1557 }
1558 spin_unlock(&wb->list_lock);
1559
1560
1561
1562
1563
1564
1565 if (inode->i_state & I_SYNC) {
1566
1567 inode_sleep_on_writeback(inode);
1568
1569 spin_lock(&wb->list_lock);
1570 continue;
1571 }
1572 inode->i_state |= I_SYNC;
1573 wbc_attach_and_unlock_inode(&wbc, inode);
1574
1575 write_chunk = writeback_chunk_size(wb, work);
1576 wbc.nr_to_write = write_chunk;
1577 wbc.pages_skipped = 0;
1578
1579
1580
1581
1582
1583 __writeback_single_inode(inode, &wbc);
1584
1585 wbc_detach_inode(&wbc);
1586 work->nr_pages -= write_chunk - wbc.nr_to_write;
1587 wrote += write_chunk - wbc.nr_to_write;
1588
1589 if (need_resched()) {
1590
1591
1592
1593
1594
1595
1596
1597
1598 blk_flush_plug(current);
1599 cond_resched();
1600 }
1601
1602
1603
1604
1605
1606 tmp_wb = inode_to_wb_and_lock_list(inode);
1607 spin_lock(&inode->i_lock);
1608 if (!(inode->i_state & I_DIRTY_ALL))
1609 wrote++;
1610 requeue_inode(inode, tmp_wb, &wbc);
1611 inode_sync_complete(inode);
1612 spin_unlock(&inode->i_lock);
1613
1614 if (unlikely(tmp_wb != wb)) {
1615 spin_unlock(&tmp_wb->list_lock);
1616 spin_lock(&wb->list_lock);
1617 }
1618
1619
1620
1621
1622
1623 if (wrote) {
1624 if (time_is_before_jiffies(start_time + HZ / 10UL))
1625 break;
1626 if (work->nr_pages <= 0)
1627 break;
1628 }
1629 }
1630 return wrote;
1631}
1632
1633static long __writeback_inodes_wb(struct bdi_writeback *wb,
1634 struct wb_writeback_work *work)
1635{
1636 unsigned long start_time = jiffies;
1637 long wrote = 0;
1638
1639 while (!list_empty(&wb->b_io)) {
1640 struct inode *inode = wb_inode(wb->b_io.prev);
1641 struct super_block *sb = inode->i_sb;
1642
1643 if (!trylock_super(sb)) {
1644
1645
1646
1647
1648
1649 redirty_tail(inode, wb);
1650 continue;
1651 }
1652 wrote += writeback_sb_inodes(sb, wb, work);
1653 up_read(&sb->s_umount);
1654
1655
1656 if (wrote) {
1657 if (time_is_before_jiffies(start_time + HZ / 10UL))
1658 break;
1659 if (work->nr_pages <= 0)
1660 break;
1661 }
1662 }
1663
1664 return wrote;
1665}
1666
1667static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
1668 enum wb_reason reason)
1669{
1670 struct wb_writeback_work work = {
1671 .nr_pages = nr_pages,
1672 .sync_mode = WB_SYNC_NONE,
1673 .range_cyclic = 1,
1674 .reason = reason,
1675 };
1676 struct blk_plug plug;
1677
1678 blk_start_plug(&plug);
1679 spin_lock(&wb->list_lock);
1680 if (list_empty(&wb->b_io))
1681 queue_io(wb, &work);
1682 __writeback_inodes_wb(wb, &work);
1683 spin_unlock(&wb->list_lock);
1684 blk_finish_plug(&plug);
1685
1686 return nr_pages - work.nr_pages;
1687}
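
/*
 * Explicit flushing or periodic writeback of "old" data.
 *
 * Define "old": the first time one of an inode's pages is dirtied, we mark the
 * dirtying-time in the inode's address_space.  So this periodic writeback code
 * just walks the superblock inode list, writing back any inodes which are
 * older than a specific point in time.
 *
 * Try to run once per dirty_writeback_interval.  But if a writeback event
 * takes longer than a dirty_writeback_interval interval, then leave a
 * one-second gap.
 *
 * older_than_this takes precedence over nr_to_write.  So we'll only write back
 * all dirty pages if they are all attached to "old" mappings.
 */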
1704static long wb_writeback(struct bdi_writeback *wb,
1705 struct wb_writeback_work *work)
1706{
1707 unsigned long wb_start = jiffies;
1708 long nr_pages = work->nr_pages;
1709 unsigned long oldest_jif;
1710 struct inode *inode;
1711 long progress;
1712 struct blk_plug plug;
1713
1714 oldest_jif = jiffies;
1715 work->older_than_this = &oldest_jif;
1716
1717 blk_start_plug(&plug);
1718 spin_lock(&wb->list_lock);
1719 for (;;) {
1720
1721
1722
1723 if (work->nr_pages <= 0)
1724 break;
1725
1726
1727
1728
1729
1730
1731
1732 if ((work->for_background || work->for_kupdate) &&
1733 !list_empty(&wb->work_list))
1734 break;
1735
1736
1737
1738
1739
1740 if (work->for_background && !wb_over_bg_thresh(wb))
1741 break;
1742
1743
1744
1745
1746
1747
1748
1749 if (work->for_kupdate) {
1750 oldest_jif = jiffies -
1751 msecs_to_jiffies(dirty_expire_interval * 10);
1752 } else if (work->for_background)
1753 oldest_jif = jiffies;
1754
1755 trace_writeback_start(wb, work);
1756 if (list_empty(&wb->b_io))
1757 queue_io(wb, work);
1758 if (work->sb)
1759 progress = writeback_sb_inodes(work->sb, wb, work);
1760 else
1761 progress = __writeback_inodes_wb(wb, work);
1762 trace_writeback_written(wb, work);
1763
1764 wb_update_bandwidth(wb, wb_start);
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774 if (progress)
1775 continue;
1776
1777
1778
1779 if (list_empty(&wb->b_more_io))
1780 break;
1781
1782
1783
1784
1785
1786 trace_writeback_wait(wb, work);
1787 inode = wb_inode(wb->b_more_io.prev);
1788 spin_lock(&inode->i_lock);
1789 spin_unlock(&wb->list_lock);
1790
1791 inode_sleep_on_writeback(inode);
1792 spin_lock(&wb->list_lock);
1793 }
1794 spin_unlock(&wb->list_lock);
1795 blk_finish_plug(&plug);
1796
1797 return nr_pages - work->nr_pages;
1798}
1799
1800
1801
1802
1803static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
1804{
1805 struct wb_writeback_work *work = NULL;
1806
1807 spin_lock_bh(&wb->work_lock);
1808 if (!list_empty(&wb->work_list)) {
1809 work = list_entry(wb->work_list.next,
1810 struct wb_writeback_work, list);
1811 list_del_init(&work->list);
1812 }
1813 spin_unlock_bh(&wb->work_lock);
1814 return work;
1815}
1816
1817
1818
1819
1820
1821static unsigned long get_nr_dirty_pages(void)
1822{
1823 return global_node_page_state(NR_FILE_DIRTY) +
1824 global_node_page_state(NR_UNSTABLE_NFS) +
1825 get_nr_dirty_inodes();
1826}
1827
1828static long wb_check_background_flush(struct bdi_writeback *wb)
1829{
1830 if (wb_over_bg_thresh(wb)) {
1831
1832 struct wb_writeback_work work = {
1833 .nr_pages = LONG_MAX,
1834 .sync_mode = WB_SYNC_NONE,
1835 .for_background = 1,
1836 .range_cyclic = 1,
1837 .reason = WB_REASON_BACKGROUND,
1838 };
1839
1840 return wb_writeback(wb, &work);
1841 }
1842
1843 return 0;
1844}
1845
1846static long wb_check_old_data_flush(struct bdi_writeback *wb)
1847{
1848 unsigned long expired;
1849 long nr_pages;
1850
1851
1852
1853
1854 if (!dirty_writeback_interval)
1855 return 0;
1856
1857 expired = wb->last_old_flush +
1858 msecs_to_jiffies(dirty_writeback_interval * 10);
1859 if (time_before(jiffies, expired))
1860 return 0;
1861
1862 wb->last_old_flush = jiffies;
1863 nr_pages = get_nr_dirty_pages();
1864
1865 if (nr_pages) {
1866 struct wb_writeback_work work = {
1867 .nr_pages = nr_pages,
1868 .sync_mode = WB_SYNC_NONE,
1869 .for_kupdate = 1,
1870 .range_cyclic = 1,
1871 .reason = WB_REASON_PERIODIC,
1872 };
1873
1874 return wb_writeback(wb, &work);
1875 }
1876
1877 return 0;
1878}
1879
1880
1881
1882
1883static long wb_do_writeback(struct bdi_writeback *wb)
1884{
1885 struct wb_writeback_work *work;
1886 long wrote = 0;
1887
1888 set_bit(WB_writeback_running, &wb->state);
1889 while ((work = get_next_work_item(wb)) != NULL) {
1890 trace_writeback_exec(wb, work);
1891 wrote += wb_writeback(wb, work);
1892 finish_writeback_work(wb, work);
1893 }
1894
1895
1896
1897
1898 wrote += wb_check_old_data_flush(wb);
1899 wrote += wb_check_background_flush(wb);
1900 clear_bit(WB_writeback_running, &wb->state);
1901
1902 return wrote;
1903}

/*
 * Handle writeback of dirty data for the device backed by this bdi.  Also
 * reschedules periodically and does kupdated style flushing.
 */
1909void wb_workfn(struct work_struct *work)
1910{
1911 struct bdi_writeback *wb = container_of(to_delayed_work(work),
1912 struct bdi_writeback, dwork);
1913 long pages_written;
1914
1915 set_worker_desc("flush-%s", dev_name(wb->bdi->dev));
1916 current->flags |= PF_SWAPWRITE;
1917
1918 if (likely(!current_is_workqueue_rescuer() ||
1919 !test_bit(WB_registered, &wb->state))) {
1920
1921
1922
1923
1924
1925
1926 do {
1927 pages_written = wb_do_writeback(wb);
1928 trace_writeback_pages_written(pages_written);
1929 } while (!list_empty(&wb->work_list));
1930 } else {
1931
1932
1933
1934
1935
1936 pages_written = writeback_inodes_wb(wb, 1024,
1937 WB_REASON_FORKER_THREAD);
1938 trace_writeback_pages_written(pages_written);
1939 }
1940
1941 if (!list_empty(&wb->work_list))
1942 mod_delayed_work(bdi_wq, &wb->dwork, 0);
1943 else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
1944 wb_wakeup_delayed(wb);
1945
1946 current->flags &= ~PF_SWAPWRITE;
1947}
1948
1949
1950
1951
1952
1953void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
1954{
1955 struct backing_dev_info *bdi;
1956
1957
1958
1959
1960 if (blk_needs_flush_plug(current))
1961 blk_schedule_flush_plug(current);
1962
1963 if (!nr_pages)
1964 nr_pages = get_nr_dirty_pages();
1965
1966 rcu_read_lock();
1967 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
1968 struct bdi_writeback *wb;
1969
1970 if (!bdi_has_dirty_io(bdi))
1971 continue;
1972
1973 list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
1974 wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages),
1975 false, reason);
1976 }
1977 rcu_read_unlock();
1978}
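
/*
 * Wake up bdi's periodically to make sure dirtytime inodes get written back
 * periodically.  We deliberately do *not* check the b_dirty_time list in
 * wb_has_dirty_io(), since this would cause the kernel to be constantly
 * waking up once there are any dirtytime inodes on the system.  So instead
 * we define a separate delayed work function which gets called much more
 * rarely (by default, once every 12 hours).
 *
 * If there is any other write activity going on in the file system, this
 * function won't be necessary.  But if the only thing that has dirtied the
 * file system is a dirtytime inode, this infrastructure makes sure that
 * inode eventually gets pushed out to disk.
 */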
1995static void wakeup_dirtytime_writeback(struct work_struct *w);
1996static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback);
1997
1998static void wakeup_dirtytime_writeback(struct work_struct *w)
1999{
2000 struct backing_dev_info *bdi;
2001
2002 rcu_read_lock();
2003 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
2004 struct bdi_writeback *wb;
2005
2006 list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
2007 if (!list_empty(&wb->b_dirty_time))
2008 wb_wakeup(wb);
2009 }
2010 rcu_read_unlock();
2011 schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
2012}
2013
2014static int __init start_dirtytime_writeback(void)
2015{
2016 schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
2017 return 0;
2018}
2019__initcall(start_dirtytime_writeback);
2020
2021int dirtytime_interval_handler(struct ctl_table *table, int write,
2022 void __user *buffer, size_t *lenp, loff_t *ppos)
2023{
2024 int ret;
2025
2026 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
2027 if (ret == 0 && write)
2028 mod_delayed_work(system_wq, &dirtytime_work, 0);
2029 return ret;
2030}
2031
2032static noinline void block_dump___mark_inode_dirty(struct inode *inode)
2033{
2034 if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
2035 struct dentry *dentry;
2036 const char *name = "?";
2037
2038 dentry = d_find_alias(inode);
2039 if (dentry) {
2040 spin_lock(&dentry->d_lock);
2041 name = (const char *) dentry->d_name.name;
2042 }
2043 printk(KERN_DEBUG
2044 "%s(%d): dirtied inode %lu (%s) on %s\n",
2045 current->comm, task_pid_nr(current), inode->i_ino,
2046 name, inode->i_sb->s_id);
2047 if (dentry) {
2048 spin_unlock(&dentry->d_lock);
2049 dput(dentry);
2050 }
2051 }
2052}
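
/**
 * __mark_inode_dirty -	internal function
 * @inode: inode to mark
 * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
 *
 * Mark an inode as dirty.  Callers should use mark_inode_dirty or
 * mark_inode_dirty_sync.
 *
 * Put the inode on the super block's dirty list.
 *
 * CAREFUL! We mark it dirty unconditionally, but move it onto the
 * dirty list only if it is hashed or if it refers to a blockdev.
 * If it was not hashed, it will never be added to the dirty list
 * even if it is later hashed, as it will have been marked dirty already.
 *
 * In short, make sure you hash any inodes _before_ you start marking
 * them dirty.
 *
 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
 * the block-special inode (/dev/hda1) itself.  And the ->dirtied_when field of
 * the kernel-internal blockdev inode represents the dirtying time of the
 * blockdev's pages.  This is why for I_DIRTY_PAGES we always use
 * page->mapping->host, so the page-dirtying time is recorded in the internal
 * blockdev inode.
 */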
2080void __mark_inode_dirty(struct inode *inode, int flags)
2081{
2082#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
2083 struct super_block *sb = inode->i_sb;
2084 int dirtytime;
2085
2086 trace_writeback_mark_inode_dirty(inode, flags);
2087
2088
2089
2090
2091
2092 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_TIME)) {
2093 trace_writeback_dirty_inode_start(inode, flags);
2094
2095 if (sb->s_op->dirty_inode)
2096 sb->s_op->dirty_inode(inode, flags);
2097
2098 trace_writeback_dirty_inode(inode, flags);
2099 }
2100 if (flags & I_DIRTY_INODE)
2101 flags &= ~I_DIRTY_TIME;
2102 dirtytime = flags & I_DIRTY_TIME;
2103
2104
2105
2106
2107
2108 smp_mb();
2109
2110 if (((inode->i_state & flags) == flags) ||
2111 (dirtytime && (inode->i_state & I_DIRTY_INODE)))
2112 return;
2113
2114 if (unlikely(block_dump))
2115 block_dump___mark_inode_dirty(inode);
2116
2117 spin_lock(&inode->i_lock);
2118 if (dirtytime && (inode->i_state & I_DIRTY_INODE))
2119 goto out_unlock_inode;
2120 if ((inode->i_state & flags) != flags) {
2121 const int was_dirty = inode->i_state & I_DIRTY;
2122
2123 inode_attach_wb(inode, NULL);
2124
2125 if (flags & I_DIRTY_INODE)
2126 inode->i_state &= ~I_DIRTY_TIME;
2127 inode->i_state |= flags;
2128
2129
2130
2131
2132
2133
2134 if (inode->i_state & I_SYNC)
2135 goto out_unlock_inode;
2136
2137
2138
2139
2140
2141 if (!S_ISBLK(inode->i_mode)) {
2142 if (inode_unhashed(inode))
2143 goto out_unlock_inode;
2144 }
2145 if (inode->i_state & I_FREEING)
2146 goto out_unlock_inode;
2147
2148
2149
2150
2151
2152 if (!was_dirty) {
2153 struct bdi_writeback *wb;
2154 struct list_head *dirty_list;
2155 bool wakeup_bdi = false;
2156
2157 wb = locked_inode_to_wb_and_lock_list(inode);
2158
2159 WARN(bdi_cap_writeback_dirty(wb->bdi) &&
2160 !test_bit(WB_registered, &wb->state),
2161 "bdi-%s not registered\n", wb->bdi->name);
2162
2163 inode->dirtied_when = jiffies;
2164 if (dirtytime)
2165 inode->dirtied_time_when = jiffies;
2166
2167 if (inode->i_state & (I_DIRTY_INODE | I_DIRTY_PAGES))
2168 dirty_list = &wb->b_dirty;
2169 else
2170 dirty_list = &wb->b_dirty_time;
2171
2172 wakeup_bdi = inode_io_list_move_locked(inode, wb,
2173 dirty_list);
2174
2175 spin_unlock(&wb->list_lock);
2176 trace_writeback_dirty_inode_enqueue(inode);
2177
2178
2179
2180
2181
2182
2183
2184 if (bdi_cap_writeback_dirty(wb->bdi) && wakeup_bdi)
2185 wb_wakeup_delayed(wb);
2186 return;
2187 }
2188 }
2189out_unlock_inode:
2190 spin_unlock(&inode->i_lock);
2191
2192#undef I_DIRTY_INODE
2193}
2194EXPORT_SYMBOL(__mark_inode_dirty);

/*
 * The @s_sync_lock is used to serialise concurrent sync operations
 * to avoid lock contention problems with concurrent wait_sb_inodes() calls.
 * Concurrent callers will block on the s_sync_lock rather than doing
 * contending walks.  The queueing maintains sync(2) required behaviour: all
 * the IO that has been issued up to the time this function is entered is
 * guaranteed to be completed by the time we have gained the lock and waited
 * for all IO that is in progress, regardless of the order callers are
 * granted the lock.
 */
2205static void wait_sb_inodes(struct super_block *sb)
2206{
2207 LIST_HEAD(sync_list);
2208
2209
2210
2211
2212
2213 WARN_ON(!rwsem_is_locked(&sb->s_umount));
2214
2215 mutex_lock(&sb->s_sync_lock);
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226 rcu_read_lock();
2227 spin_lock_irq(&sb->s_inode_wblist_lock);
2228 list_splice_init(&sb->s_inodes_wb, &sync_list);
2229
2230
2231
2232
2233
2234
2235
2236
2237 while (!list_empty(&sync_list)) {
2238 struct inode *inode = list_first_entry(&sync_list, struct inode,
2239 i_wb_list);
2240 struct address_space *mapping = inode->i_mapping;
2241
2242
2243
2244
2245
2246
2247
2248 list_move_tail(&inode->i_wb_list, &sb->s_inodes_wb);
2249
2250
2251
2252
2253
2254
2255 if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
2256 continue;
2257
2258 spin_unlock_irq(&sb->s_inode_wblist_lock);
2259
2260 spin_lock(&inode->i_lock);
2261 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
2262 spin_unlock(&inode->i_lock);
2263
2264 spin_lock_irq(&sb->s_inode_wblist_lock);
2265 continue;
2266 }
2267 __iget(inode);
2268 spin_unlock(&inode->i_lock);
2269 rcu_read_unlock();
2270
2271
2272
2273
2274
2275
2276 filemap_fdatawait_keep_errors(mapping);
2277
2278 cond_resched();
2279
2280 iput(inode);
2281
2282 rcu_read_lock();
2283 spin_lock_irq(&sb->s_inode_wblist_lock);
2284 }
2285 spin_unlock_irq(&sb->s_inode_wblist_lock);
2286 rcu_read_unlock();
2287 mutex_unlock(&sb->s_sync_lock);
2288}
2289
2290static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
2291 enum wb_reason reason, bool skip_if_busy)
2292{
2293 DEFINE_WB_COMPLETION_ONSTACK(done);
2294 struct wb_writeback_work work = {
2295 .sb = sb,
2296 .sync_mode = WB_SYNC_NONE,
2297 .tagged_writepages = 1,
2298 .done = &done,
2299 .nr_pages = nr,
2300 .reason = reason,
2301 };
2302 struct backing_dev_info *bdi = sb->s_bdi;
2303
2304 if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
2305 return;
2306 WARN_ON(!rwsem_is_locked(&sb->s_umount));
2307
2308 bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy);
2309 wb_wait_for_completion(bdi, &done);
2310}
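
/**
 * writeback_inodes_sb_nr - writeback dirty inodes from given super_block
 * @sb: the superblock
 * @nr: the number of pages to write
 * @reason: reason why some writeback work was initiated
 *
 * Start writeback on some inodes on this super_block.  No guarantees are made
 * on how many (if any) will be written, and this function does not wait
 * for IO completion of submitted IO.
 */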
2322void writeback_inodes_sb_nr(struct super_block *sb,
2323 unsigned long nr,
2324 enum wb_reason reason)
2325{
2326 __writeback_inodes_sb_nr(sb, nr, reason, false);
2327}
2328EXPORT_SYMBOL(writeback_inodes_sb_nr);
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
2340{
2341 return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
2342}
2343EXPORT_SYMBOL(writeback_inodes_sb);
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354bool try_to_writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
2355 enum wb_reason reason)
2356{
2357 if (!down_read_trylock(&sb->s_umount))
2358 return false;
2359
2360 __writeback_inodes_sb_nr(sb, nr, reason, true);
2361 up_read(&sb->s_umount);
2362 return true;
2363}
2364EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr);
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374bool try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
2375{
2376 return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
2377}
2378EXPORT_SYMBOL(try_to_writeback_inodes_sb);
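
/**
 * sync_inodes_sb - sync sb inode pages
 * @sb: the superblock
 *
 * This function writes and waits on any dirty inode belonging to this
 * super_block.
 */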
2387void sync_inodes_sb(struct super_block *sb)
2388{
2389 DEFINE_WB_COMPLETION_ONSTACK(done);
2390 struct wb_writeback_work work = {
2391 .sb = sb,
2392 .sync_mode = WB_SYNC_ALL,
2393 .nr_pages = LONG_MAX,
2394 .range_cyclic = 0,
2395 .done = &done,
2396 .reason = WB_REASON_SYNC,
2397 .for_sync = 1,
2398 };
2399 struct backing_dev_info *bdi = sb->s_bdi;
2400
2401
2402
2403
2404
2405
2406 if (bdi == &noop_backing_dev_info)
2407 return;
2408 WARN_ON(!rwsem_is_locked(&sb->s_umount));
2409
2410 bdi_split_work_to_wbs(bdi, &work, false);
2411 wb_wait_for_completion(bdi, &done);
2412
2413 wait_sb_inodes(sb);
2414}
2415EXPORT_SYMBOL(sync_inodes_sb);
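
/**
 * write_inode_now - write an inode to disk
 * @inode: inode to write to disk
 * @sync: whether the write should be synchronous or not
 *
 * This function commits an inode to disk immediately if it is dirty.  This is
 * primarily needed by knfsd.
 *
 * The caller must either have a ref on the inode or must have set I_WILL_FREE.
 */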
2427int write_inode_now(struct inode *inode, int sync)
2428{
2429 struct writeback_control wbc = {
2430 .nr_to_write = LONG_MAX,
2431 .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
2432 .range_start = 0,
2433 .range_end = LLONG_MAX,
2434 };
2435
2436 if (!mapping_cap_writeback_dirty(inode->i_mapping))
2437 wbc.nr_to_write = 0;
2438
2439 might_sleep();
2440 return writeback_single_inode(inode, &wbc);
2441}
2442EXPORT_SYMBOL(write_inode_now);

/**
 * sync_inode - write an inode and its pages to disk
 * @inode: the inode to sync
 * @wbc: controls the writeback mode
 *
 * sync_inode() will write an inode and its pages to disk.  It will also
 * correctly update the inode on its superblock's dirty inode lists and will
 * update inode->i_state.
 *
 * The caller must have a ref on the inode.
 */
2455int sync_inode(struct inode *inode, struct writeback_control *wbc)
2456{
2457 return writeback_single_inode(inode, wbc);
2458}
2459EXPORT_SYMBOL(sync_inode);

/**
 * sync_inode_metadata - write an inode to disk
 * @inode: the inode to sync
 * @wait: wait for I/O to complete.
 *
 * Write an inode to disk and adjust its dirty state after completion.
 *
 * Note: only writes the actual inode, no associated data or other metadata.
 */
2470int sync_inode_metadata(struct inode *inode, int wait)
2471{
2472 struct writeback_control wbc = {
2473 .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
2474 .nr_to_write = 0,
2475 };
2476
2477 return sync_inode(inode, &wbc);
2478}
2479EXPORT_SYMBOL(sync_inode_metadata);
2480