/*
 * fs/fs-writeback.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * Contains all the functions related to writing back and waiting
 * upon dirty inodes against superblocks, and writing back dirty
 * pages against inodes.  ie: data writeback.  Writeout of the
 * inode itself is not handled here.
 */
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/kthread.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/tracepoint.h>
#include <linux/device.h>
#include <linux/memcontrol.h>
#include "internal.h"

/*
 * 4MB minimal write chunk size
 */
#define MIN_WRITEBACK_PAGES	(4096UL >> (PAGE_SHIFT - 10))

struct wb_completion {
	atomic_t		cnt;
};

/*
 * Passed into wb_writeback(), essentially a subset of writeback_control
 */
struct wb_writeback_work {
	long nr_pages;
	struct super_block *sb;
	unsigned long *older_than_this;
	enum writeback_sync_modes sync_mode;
	unsigned int tagged_writepages:1;
	unsigned int for_kupdate:1;
	unsigned int range_cyclic:1;
	unsigned int for_background:1;
	unsigned int for_sync:1;	/* sync(2) WB_SYNC_ALL writeback */
	unsigned int auto_free:1;	/* free on completion */
	enum wb_reason reason;		/* why was writeback initiated? */

	struct list_head list;		/* pending work list */
	struct wb_completion *done;	/* set if the caller waits */
};

/*
 * If one wants to wait for one or more wb_writeback_works, each work's
 * ->done should be set to a wb_completion defined using the following
 * macro.  Once all work items are issued with wb_queue_work(), the caller
 * can wait for the completion of all using wb_wait_for_completion().  Work
 * items which are waited upon aren't freed automatically on completion.
 */
#define DEFINE_WB_COMPLETION_ONSTACK(cmpl)				\
	struct wb_completion cmpl = {					\
		.cnt		= ATOMIC_INIT(1),			\
	}

/*
 * If an inode is constantly having its pages dirtied, but then the
 * updates stop dirtytime_expire_interval seconds in the past, it's
 * possible for the worst case time between when an inode has its
 * timestamps updated and when they finally get written out to be two
 * dirtytime_expire_intervals.  We set the default to 12 hours (in
 * seconds), which means most of the time inodes will have their
 * timestamps written to disk after 12 hours, but in the worst case a
 * few inodes might not have their timestamps updated for 24 hours.
 */
unsigned int dirtytime_expire_interval = 12 * 60 * 60;

static inline struct inode *wb_inode(struct list_head *head)
{
	return list_entry(head, struct inode, i_io_list);
}

/*
 * Include the creation of the trace points after defining the
 * wb_writeback_work structure and inline functions so that the definition
 * remains local to this file.
 */
#define CREATE_TRACE_POINTS
#include <trace/events/writeback.h>

EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);

static bool wb_io_lists_populated(struct bdi_writeback *wb)
{
	if (wb_has_dirty_io(wb)) {
		return false;
	} else {
		set_bit(WB_has_dirty_io, &wb->state);
		WARN_ON_ONCE(!wb->avg_write_bandwidth);
		atomic_long_add(wb->avg_write_bandwidth,
				&wb->bdi->tot_write_bandwidth);
		return true;
	}
}

static void wb_io_lists_depopulated(struct bdi_writeback *wb)
{
	if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) &&
	    list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) {
		clear_bit(WB_has_dirty_io, &wb->state);
		WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth,
					&wb->bdi->tot_write_bandwidth) < 0);
	}
}

/**
 * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list
 * @inode: inode to be moved
 * @wb: target bdi_writeback
 * @head: one of @wb->b_{dirty|io|more_io|dirty_time}
 *
 * Move @inode->i_io_list to @list of @wb and set %WB_has_dirty_io.
 * Returns %true if @inode is the first occupant of the !dirty_time IO
 * lists; otherwise, %false.
 */
static bool inode_io_list_move_locked(struct inode *inode,
				      struct bdi_writeback *wb,
				      struct list_head *head)
{
	assert_spin_locked(&wb->list_lock);

	list_move(&inode->i_io_list, head);

	/* dirty_time doesn't count as dirty_io until expiration */
	if (head != &wb->b_dirty_time)
		return wb_io_lists_populated(wb);

	wb_io_lists_depopulated(wb);
	return false;
}

/**
 * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list
 * @inode: inode to be removed
 * @wb: bdi_writeback @inode is being removed from
 *
 * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and
 * clear %WB_has_dirty_io if all are empty afterwards.
 */
static void inode_io_list_del_locked(struct inode *inode,
				     struct bdi_writeback *wb)
{
	assert_spin_locked(&wb->list_lock);

	list_del_init(&inode->i_io_list);
	wb_io_lists_depopulated(wb);
}

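/*
 * Wake up the flusher for this bdi_writeback: kick its delayed work to run
 * immediately, but only while the wb is still registered.
 */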
static void wb_wakeup(struct bdi_writeback *wb)
{
	spin_lock_bh(&wb->work_lock);
	if (test_bit(WB_registered, &wb->state))
		mod_delayed_work(bdi_wq, &wb->dwork, 0);
	spin_unlock_bh(&wb->work_lock);
}

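/*
 * Drop a finished work item: free it if it was dynamically allocated and, if
 * a completion is attached, drop our reference and wake any waiters on the
 * bdi's wb_waitq once the count reaches zero.
 */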
static void finish_writeback_work(struct bdi_writeback *wb,
				  struct wb_writeback_work *work)
{
	struct wb_completion *done = work->done;

	if (work->auto_free)
		kfree(work);
	if (done && atomic_dec_and_test(&done->cnt))
		wake_up_all(&wb->bdi->wb_waitq);
}

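/*
 * Queue a writeback work item on the wb and kick the flusher.  If the wb is
 * no longer registered, the work is completed (and possibly freed) right
 * away instead of being queued.
 */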
static void wb_queue_work(struct bdi_writeback *wb,
			  struct wb_writeback_work *work)
{
	trace_writeback_queue(wb, work);

	if (work->done)
		atomic_inc(&work->done->cnt);

	spin_lock_bh(&wb->work_lock);

	if (test_bit(WB_registered, &wb->state)) {
		list_add_tail(&work->list, &wb->work_list);
		mod_delayed_work(bdi_wq, &wb->dwork, 0);
	} else
		finish_writeback_work(wb, work);

	spin_unlock_bh(&wb->work_lock);
}

/**
 * wb_wait_for_completion - wait for completion of bdi_writeback_works
 * @bdi: bdi work items were issued to
 * @done: target wb_completion
 *
 * Wait for one or more work items issued to @bdi with their ->done field
 * set to @done, which should have been defined with
 * DEFINE_WB_COMPLETION_ONSTACK().  This function returns after all such
 * work items are completed.  Work items which are waited upon aren't freed
 * automatically on completion.
 */
static void wb_wait_for_completion(struct backing_dev_info *bdi,
				   struct wb_completion *done)
{
	atomic_dec(&done->cnt);		/* put down the initial count */
	wait_event(bdi->wb_waitq, !atomic_read(&done->cnt));
}

#ifdef CONFIG_CGROUP_WRITEBACK

/* parameters for foreign inode detection, see wbc_detach_inode() */
#define WB_FRN_TIME_SHIFT	13	/* 1s = 2^13, upto 8 secs w/ 16bit */
#define WB_FRN_TIME_AVG_SHIFT	3	/* avg = avg * 7/8 + new * 1/8 */
#define WB_FRN_TIME_CUT_DIV	2	/* ignore rounds < avg / 2 */
#define WB_FRN_TIME_PERIOD	(2 * (1 << WB_FRN_TIME_SHIFT))	/* 2s */

#define WB_FRN_HIST_SLOTS	16	/* inode->i_wb_frn_history is 16bit */
#define WB_FRN_HIST_UNIT	(WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS)
					/* each slot's duration is 2s / 16 */
#define WB_FRN_HIST_THR_SLOTS	(WB_FRN_HIST_SLOTS / 2)
					/* if foreign slots >= 8, switch */
#define WB_FRN_HIST_MAX_SLOTS	(WB_FRN_HIST_THR_SLOTS / 2 + 1)
					/* one round can affect upto 5 slots */

static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
static struct workqueue_struct *isw_wq;

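/*
 * Associate @inode with a wb.  With cgroup writeback enabled, the wb is
 * picked from the memcg of @page (or of %current when @page is NULL);
 * otherwise the bdi's root wb is used.  cmpxchg() arbitrates concurrent
 * callers: only the winner's wb is kept, losers drop their reference.
 */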
void __inode_attach_wb(struct inode *inode, struct page *page)
{
	struct backing_dev_info *bdi = inode_to_bdi(inode);
	struct bdi_writeback *wb = NULL;

	if (inode_cgwb_enabled(inode)) {
		struct cgroup_subsys_state *memcg_css;

		if (page) {
			memcg_css = mem_cgroup_css_from_page(page);
			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
		} else {
			/* must pin memcg_css, see wb_get_create() */
			memcg_css = task_get_css(current, memory_cgrp_id);
			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
			css_put(memcg_css);
		}
	}

	if (!wb)
		wb = &bdi->wb;

	/*
	 * There may be multiple instances of this function racing to
	 * update the same inode.  Use cmpxchg() to tell the winner.
	 */
	if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
		wb_put(wb);
}

/**
 * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
 * @inode: inode of interest with i_lock held
 *
 * Returns @inode's wb with its list_lock held.  @inode->i_lock must be
 * held on entry and is released on return.  The returned wb is guaranteed
 * to stay @inode's associated wb until its list_lock is released.
 */
static struct bdi_writeback *
locked_inode_to_wb_and_lock_list(struct inode *inode)
	__releases(&inode->i_lock)
	__acquires(&wb->list_lock)
{
	while (true) {
		struct bdi_writeback *wb = inode_to_wb(inode);

		/*
		 * inode_to_wb() association is protected by both
		 * @inode->i_lock and @wb->list_lock but list_lock nests
		 * outside i_lock.  Deadlock avoidance by grabbing wb refcnt
		 * w/ i_lock held and verifying the association afterwards.
		 */
		wb_get(wb);
		spin_unlock(&inode->i_lock);
		spin_lock(&wb->list_lock);

		/* i_wb may have changed inbetween, can't use inode_to_wb() */
		if (likely(wb == inode->i_wb)) {
			wb_put(wb);	/* @inode already has ref */
			return wb;
		}

		spin_unlock(&wb->list_lock);
		wb_put(wb);	/* not gonna deref it anymore */
		cpu_relax();
		spin_lock(&inode->i_lock);
	}
}

/**
 * inode_to_wb_and_lock_list - determine an inode's wb and lock it
 * @inode: inode of interest
 *
 * Same as locked_inode_to_wb_and_lock_list() but @inode->i_lock isn't held
 * on entry.
 */
static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
	__acquires(&wb->list_lock)
{
	spin_lock(&inode->i_lock);
	return locked_inode_to_wb_and_lock_list(inode);
}

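/*
 * Context for switching an inode from its current wb to a new one.  Freed by
 * the switch work item (inode_switch_wbs_work_fn) once the switch completes.
 */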
327struct inode_switch_wbs_context {
328 struct inode *inode;
329 struct bdi_writeback *new_wb;
330
331 struct rcu_head rcu_head;
332 struct work_struct work;
333};
334
335static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi)
336{
337 down_write(&bdi->wb_switch_rwsem);
338}
339
340static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi)
341{
342 up_write(&bdi->wb_switch_rwsem);
343}
344
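/*
 * Work item that actually carries out an inode wb switch.  It transfers the
 * per-wb dirty/writeback page statistics, repositions the inode on the new
 * wb's dirty list and clears I_WB_SWITCH under the relevant locks.
 */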
345static void inode_switch_wbs_work_fn(struct work_struct *work)
346{
347 struct inode_switch_wbs_context *isw =
348 container_of(work, struct inode_switch_wbs_context, work);
349 struct inode *inode = isw->inode;
350 struct backing_dev_info *bdi = inode_to_bdi(inode);
351 struct address_space *mapping = inode->i_mapping;
352 struct bdi_writeback *old_wb = inode->i_wb;
353 struct bdi_writeback *new_wb = isw->new_wb;
354 XA_STATE(xas, &mapping->i_pages, 0);
355 struct page *page;
356 bool switched = false;
357
358
359
360
361
362 down_read(&bdi->wb_switch_rwsem);
363
364
365
366
367
368
369
370
371
372
373
374 if (old_wb < new_wb) {
375 spin_lock(&old_wb->list_lock);
376 spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
377 } else {
378 spin_lock(&new_wb->list_lock);
379 spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
380 }
381 spin_lock(&inode->i_lock);
382 xa_lock_irq(&mapping->i_pages);
383
384
385
386
387
388 if (unlikely(inode->i_state & I_FREEING))
389 goto skip_switch;
390
391
392
393
394
395
396 xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_DIRTY) {
397 if (PageDirty(page)) {
398 dec_wb_stat(old_wb, WB_RECLAIMABLE);
399 inc_wb_stat(new_wb, WB_RECLAIMABLE);
400 }
401 }
402
403 xas_set(&xas, 0);
404 xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_WRITEBACK) {
405 WARN_ON_ONCE(!PageWriteback(page));
406 dec_wb_stat(old_wb, WB_WRITEBACK);
407 inc_wb_stat(new_wb, WB_WRITEBACK);
408 }
409
410 wb_get(new_wb);
411
412
413
414
415
416
417
418 if (!list_empty(&inode->i_io_list)) {
419 struct inode *pos;
420
421 inode_io_list_del_locked(inode, old_wb);
422 inode->i_wb = new_wb;
423 list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
424 if (time_after_eq(inode->dirtied_when,
425 pos->dirtied_when))
426 break;
427 inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev);
428 } else {
429 inode->i_wb = new_wb;
430 }
431
432
433 inode->i_wb_frn_winner = 0;
434 inode->i_wb_frn_avg_time = 0;
435 inode->i_wb_frn_history = 0;
436 switched = true;
437skip_switch:
438
439
440
441
442 smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);
443
444 xa_unlock_irq(&mapping->i_pages);
445 spin_unlock(&inode->i_lock);
446 spin_unlock(&new_wb->list_lock);
447 spin_unlock(&old_wb->list_lock);
448
449 up_read(&bdi->wb_switch_rwsem);
450
451 if (switched) {
452 wb_wakeup(new_wb);
453 wb_put(old_wb);
454 }
455 wb_put(new_wb);
456
457 iput(inode);
458 kfree(isw);
459
460 atomic_dec(&isw_nr_in_flight);
461}
462
463static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
464{
465 struct inode_switch_wbs_context *isw = container_of(rcu_head,
466 struct inode_switch_wbs_context, rcu_head);
467
468
469 INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
470 queue_work(isw_wq, &isw->work);
471}
472
/**
 * inode_switch_wbs - change the wb association of an inode
 * @inode: target inode
 * @new_wb_id: ID of the new wb
 *
 * Switch @inode's wb association to the wb identified by @new_wb_id.  The
 * switching is performed asynchronously and may fail silently.
 */
481static void inode_switch_wbs(struct inode *inode, int new_wb_id)
482{
483 struct backing_dev_info *bdi = inode_to_bdi(inode);
484 struct cgroup_subsys_state *memcg_css;
485 struct inode_switch_wbs_context *isw;
486
487
488 if (inode->i_state & I_WB_SWITCH)
489 return;
490
491
492
493
494
495
496
497 if (!down_read_trylock(&bdi->wb_switch_rwsem))
498 return;
499
500 isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
501 if (!isw)
502 goto out_unlock;
503
504
505 rcu_read_lock();
506 memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys);
507 if (memcg_css)
508 isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
509 rcu_read_unlock();
510 if (!isw->new_wb)
511 goto out_free;
512
513
514 spin_lock(&inode->i_lock);
515 if (!(inode->i_sb->s_flags & SB_ACTIVE) ||
516 inode->i_state & (I_WB_SWITCH | I_FREEING) ||
517 inode_to_wb(inode) == isw->new_wb) {
518 spin_unlock(&inode->i_lock);
519 goto out_free;
520 }
521 inode->i_state |= I_WB_SWITCH;
522 __iget(inode);
523 spin_unlock(&inode->i_lock);
524
525 isw->inode = inode;
526
527
528
529
530
531
532
533 call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
534
535 atomic_inc(&isw_nr_in_flight);
536
537 goto out_unlock;
538
539out_free:
540 if (isw->new_wb)
541 wb_put(isw->new_wb);
542 kfree(isw);
543out_unlock:
544 up_read(&bdi->wb_switch_rwsem);
545}

/**
 * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
 * @wbc: writeback_control of interest
 * @inode: target inode
 *
 * @inode is locked and about to be written back under the control of @wbc.
 * Record @inode's writeback context into @wbc and unlock the i_lock.  On
 * writeback completion, wbc_detach_inode() should be called.  This is used
 * to track the cgroup writeback context.
 */
557void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
558 struct inode *inode)
559{
560 if (!inode_cgwb_enabled(inode)) {
561 spin_unlock(&inode->i_lock);
562 return;
563 }
564
565 wbc->wb = inode_to_wb(inode);
566 wbc->inode = inode;
567
568 wbc->wb_id = wbc->wb->memcg_css->id;
569 wbc->wb_lcand_id = inode->i_wb_frn_winner;
570 wbc->wb_tcand_id = 0;
571 wbc->wb_bytes = 0;
572 wbc->wb_lcand_bytes = 0;
573 wbc->wb_tcand_bytes = 0;
574
575 wb_get(wbc->wb);
576 spin_unlock(&inode->i_lock);
577
578
579
580
581
582 if (unlikely(wb_dying(wbc->wb)))
583 inode_switch_wbs(inode, wbc->wb_id);
584}

/**
 * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
 * @wbc: writeback_control of the just finished writeback
 *
 * To be called after the inode is done undergoing writeback controlled by
 * @wbc.  Besides dropping the wb reference taken by
 * wbc_attach_and_unlock_inode(), this decides whether the inode should be
 * switched to a different (foreign) wb.
 *
 * The per-round byte counts collected by wbc_account_io() identify the
 * cgroup which dirtied the most pages during this round.  Rounds that were
 * too small relative to the running average are ignored; otherwise the
 * result is pushed into the 16 bit i_wb_frn_history.  Once a majority of
 * recent history slots point at a foreign cgroup, inode_switch_wbs() is
 * invoked to move the inode over to that cgroup's wb.
 */
623void wbc_detach_inode(struct writeback_control *wbc)
624{
625 struct bdi_writeback *wb = wbc->wb;
626 struct inode *inode = wbc->inode;
627 unsigned long avg_time, max_bytes, max_time;
628 u16 history;
629 int max_id;
630
631 if (!wb)
632 return;
633
634 history = inode->i_wb_frn_history;
635 avg_time = inode->i_wb_frn_avg_time;
636
637
638 if (wbc->wb_bytes >= wbc->wb_lcand_bytes &&
639 wbc->wb_bytes >= wbc->wb_tcand_bytes) {
640 max_id = wbc->wb_id;
641 max_bytes = wbc->wb_bytes;
642 } else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) {
643 max_id = wbc->wb_lcand_id;
644 max_bytes = wbc->wb_lcand_bytes;
645 } else {
646 max_id = wbc->wb_tcand_id;
647 max_bytes = wbc->wb_tcand_bytes;
648 }
649
650
651
652
653
654
655
656
657 max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT,
658 wb->avg_write_bandwidth);
659 if (avg_time)
660 avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) -
661 (avg_time >> WB_FRN_TIME_AVG_SHIFT);
662 else
663 avg_time = max_time;
664
665 if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) {
666 int slots;
667
668
669
670
671
672
673
674
675
676 slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT),
677 (unsigned long)WB_FRN_HIST_MAX_SLOTS);
678 history <<= slots;
679 if (wbc->wb_id != max_id)
680 history |= (1U << slots) - 1;
681
682
683
684
685
686
687
688
689 if (hweight32(history) > WB_FRN_HIST_THR_SLOTS)
690 inode_switch_wbs(inode, max_id);
691 }
692
693
694
695
696
697 inode->i_wb_frn_winner = max_id;
698 inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX);
699 inode->i_wb_frn_history = history;
700
701 wb_put(wbc->wb);
702 wbc->wb = NULL;
703}

/**
 * wbc_account_io - account IO issued during writeback
 * @wbc: writeback_control of the writeback in progress
 * @page: page being written out
 * @bytes: number of bytes being written out
 *
 * @bytes from @page are about to be written out during the writeback
 * controlled by @wbc.  Keep the book for foreign inode detection.  See
 * wbc_detach_inode().
 */
715void wbc_account_io(struct writeback_control *wbc, struct page *page,
716 size_t bytes)
717{
718 int id;
719
720
721
722
723
724
725
726 if (!wbc->wb)
727 return;
728
729 id = mem_cgroup_css_from_page(page)->id;
730
731 if (id == wbc->wb_id) {
732 wbc->wb_bytes += bytes;
733 return;
734 }
735
736 if (id == wbc->wb_lcand_id)
737 wbc->wb_lcand_bytes += bytes;
738
739
740 if (!wbc->wb_tcand_bytes)
741 wbc->wb_tcand_id = id;
742 if (id == wbc->wb_tcand_id)
743 wbc->wb_tcand_bytes += bytes;
744 else
745 wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
746}
747EXPORT_SYMBOL_GPL(wbc_account_io);

/**
 * inode_congested - test whether an inode is congested
 * @inode: inode to test for congestion (may be NULL)
 * @cong_bits: mask of WB_[a]sync_congested bits to test
 *
 * Tests whether @inode is congested.  @cong_bits is the mask of congestion
 * bits to test and the return value is the mask of set bits.
 *
 * If cgroup writeback is enabled for @inode, the congestion state is
 * determined by whether the memcg wb @inode is associated with is
 * congested.  Otherwise, the root wb's congestion state is used.
 *
 * @inode is allowed to be NULL as this function is often called on
 * mapping->host which is NULL for the swapper space.
 */
765int inode_congested(struct inode *inode, int cong_bits)
766{
767
768
769
770
771 if (inode && inode_to_wb_is_valid(inode)) {
772 struct bdi_writeback *wb;
773 struct wb_lock_cookie lock_cookie = {};
774 bool congested;
775
776 wb = unlocked_inode_to_wb_begin(inode, &lock_cookie);
777 congested = wb_congested(wb, cong_bits);
778 unlocked_inode_to_wb_end(inode, &lock_cookie);
779 return congested;
780 }
781
782 return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
783}
784EXPORT_SYMBOL_GPL(inode_congested);

/**
 * wb_split_bdi_pages - split nr_pages to write according to bandwidth
 * @wb: target bdi_writeback to split @nr_pages to
 * @nr_pages: number of pages to write for the whole bdi
 *
 * Split @wb's portion of @nr_pages according to @wb's write bandwidth in
 * relation to the total write bandwidth of all wb's w/ dirty inodes on
 * @wb->bdi.
 */
795static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
796{
797 unsigned long this_bw = wb->avg_write_bandwidth;
798 unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
799
800 if (nr_pages == LONG_MAX)
801 return LONG_MAX;
802
803
804
805
806
807
808 if (!tot_bw || this_bw >= tot_bw)
809 return nr_pages;
810 else
811 return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw);
812}

/**
 * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
 * @bdi: target backing_dev_info
 * @base_work: wb_writeback_work to issue
 * @skip_if_busy: skip wb's which already have writeback in progress
 *
 * Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which
 * have dirty inodes.  If @base_work->nr_pages isn't %LONG_MAX, it's
 * distributed to the busy wbs according to each wb's proportion in the
 * total active write bandwidth of @bdi.
 */
825static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
826 struct wb_writeback_work *base_work,
827 bool skip_if_busy)
828{
829 struct bdi_writeback *last_wb = NULL;
830 struct bdi_writeback *wb = list_entry(&bdi->wb_list,
831 struct bdi_writeback, bdi_node);
832
833 might_sleep();
834restart:
835 rcu_read_lock();
836 list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
837 DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
838 struct wb_writeback_work fallback_work;
839 struct wb_writeback_work *work;
840 long nr_pages;
841
842 if (last_wb) {
843 wb_put(last_wb);
844 last_wb = NULL;
845 }
846
847
848 if (!wb_has_dirty_io(wb) &&
849 (base_work->sync_mode == WB_SYNC_NONE ||
850 list_empty(&wb->b_dirty_time)))
851 continue;
852 if (skip_if_busy && writeback_in_progress(wb))
853 continue;
854
855 nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);
856
857 work = kmalloc(sizeof(*work), GFP_ATOMIC);
858 if (work) {
859 *work = *base_work;
860 work->nr_pages = nr_pages;
861 work->auto_free = 1;
862 wb_queue_work(wb, work);
863 continue;
864 }
865
866
867 work = &fallback_work;
868 *work = *base_work;
869 work->nr_pages = nr_pages;
870 work->auto_free = 0;
871 work->done = &fallback_work_done;
872
873 wb_queue_work(wb, work);
874
875
876
877
878
879
880 wb_get(wb);
881 last_wb = wb;
882
883 rcu_read_unlock();
884 wb_wait_for_completion(bdi, &fallback_work_done);
885 goto restart;
886 }
887 rcu_read_unlock();
888
889 if (last_wb)
890 wb_put(last_wb);
891}

/**
 * cgroup_writeback_umount - flush inode wb switches for umount
 *
 * This function is called when a super_block is about to be destroyed and
 * flushes in-flight inode wb switches.  An inode wb switch goes through
 * RCU and then workqueue, so the two need to be flushed in order to ensure
 * that all previously scheduled switches are finished.  As wb switches are
 * rare occurrences and synchronize_rcu() can take a while, perform
 * flushing iff wb switches are in flight.
 */
903void cgroup_writeback_umount(void)
904{
905 if (atomic_read(&isw_nr_in_flight)) {
906
907
908
909
910 rcu_barrier();
911 flush_workqueue(isw_wq);
912 }
913}
914
915static int __init cgroup_writeback_init(void)
916{
917 isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
918 if (!isw_wq)
919 return -ENOMEM;
920 return 0;
921}
922fs_initcall(cgroup_writeback_init);
923
924#else
925
926static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
927static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
928
929static struct bdi_writeback *
930locked_inode_to_wb_and_lock_list(struct inode *inode)
931 __releases(&inode->i_lock)
932 __acquires(&wb->list_lock)
933{
934 struct bdi_writeback *wb = inode_to_wb(inode);
935
936 spin_unlock(&inode->i_lock);
937 spin_lock(&wb->list_lock);
938 return wb;
939}
940
941static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
942 __acquires(&wb->list_lock)
943{
944 struct bdi_writeback *wb = inode_to_wb(inode);
945
946 spin_lock(&wb->list_lock);
947 return wb;
948}
949
950static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
951{
952 return nr_pages;
953}
954
955static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
956 struct wb_writeback_work *base_work,
957 bool skip_if_busy)
958{
959 might_sleep();
960
961 if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) {
962 base_work->auto_free = 0;
963 wb_queue_work(&bdi->wb, base_work);
964 }
965}
966
967#endif

/*
 * Add in the number of potentially dirty inodes, because each inode
 * write can dirty pagecache in the underlying blockdev.
 */
static unsigned long get_nr_dirty_pages(void)
{
	return global_node_page_state(NR_FILE_DIRTY) +
		global_node_page_state(NR_UNSTABLE_NFS) +
		get_nr_dirty_inodes();
}
979
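/*
 * Ask the wb to write back everything: set WB_start_all (unless a start-all
 * request is already pending), record the reason and wake the flusher.  The
 * bit acts as a lightweight way to merge concurrent start-all requests.
 */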
980static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason)
981{
982 if (!wb_has_dirty_io(wb))
983 return;
984
985
986
987
988
989
990
991
992
993 if (test_bit(WB_start_all, &wb->state) ||
994 test_and_set_bit(WB_start_all, &wb->state))
995 return;
996
997 wb->start_all_reason = reason;
998 wb_wakeup(wb);
999}

/**
 * wb_start_background_writeback - start background writeback
 * @wb: bdi_writback to write from
 *
 * Description:
 *   This makes sure WB_SYNC_NONE background writeback happens. When
 *   this function returns, it is only guaranteed that for given wb
 *   some IO is happening if we are over background dirty threshold.
 *   Caller need not hold sb s_umount semaphore.
 */
void wb_start_background_writeback(struct bdi_writeback *wb)
{
	/*
	 * We just wake up the flusher thread. It will perform background
	 * writeback as soon as there is no other work to do.
	 */
	trace_writeback_wake_background(wb);
	wb_wakeup(wb);
}

/*
 * Remove the inode from the writeback list it is on.
 */
void inode_io_list_del(struct inode *inode)
{
	struct bdi_writeback *wb;

	wb = inode_to_wb_and_lock_list(inode);
	inode_io_list_del_locked(inode, wb);
	spin_unlock(&wb->list_lock);
}
1032
/*
 * mark an inode as under writeback on the sb
 */
1036void sb_mark_inode_writeback(struct inode *inode)
1037{
1038 struct super_block *sb = inode->i_sb;
1039 unsigned long flags;
1040
1041 if (list_empty(&inode->i_wb_list)) {
1042 spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
1043 if (list_empty(&inode->i_wb_list)) {
1044 list_add_tail(&inode->i_wb_list, &sb->s_inodes_wb);
1045 trace_sb_mark_inode_writeback(inode);
1046 }
1047 spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
1048 }
1049}
1050
/*
 * clear an inode as under writeback on the sb
 */
1054void sb_clear_inode_writeback(struct inode *inode)
1055{
1056 struct super_block *sb = inode->i_sb;
1057 unsigned long flags;
1058
1059 if (!list_empty(&inode->i_wb_list)) {
1060 spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
1061 if (!list_empty(&inode->i_wb_list)) {
1062 list_del_init(&inode->i_wb_list);
1063 trace_sb_clear_inode_writeback(inode);
1064 }
1065 spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
1066 }
1067}

/*
 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
 * furthest end of its superblock's dirty-inode list.
 *
 * Before stamping the inode's ->dirtied_when, we check to see whether it is
 * already the most-recently-dirtied inode on the b_dirty list.  If that is
 * the case then the inode must have been redirtied while it was being written
 * out and we don't reset its dirtied_when.
 */
static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
{
	if (!list_empty(&wb->b_dirty)) {
		struct inode *tail;

		tail = wb_inode(wb->b_dirty.next);
		if (time_before(inode->dirtied_when, tail->dirtied_when))
			inode->dirtied_when = jiffies;
	}
	inode_io_list_move_locked(inode, wb, &wb->b_dirty);
}

/*
 * requeue inode for re-scanning after bdi->b_io list is exhausted.
 */
static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
{
	inode_io_list_move_locked(inode, wb, &wb->b_more_io);
}
1097
1098static void inode_sync_complete(struct inode *inode)
1099{
1100 inode->i_state &= ~I_SYNC;
1101
1102 inode_add_lru(inode);
1103
1104 smp_mb();
1105 wake_up_bit(&inode->i_state, __I_SYNC);
1106}
1107
1108static bool inode_dirtied_after(struct inode *inode, unsigned long t)
1109{
1110 bool ret = time_after(inode->dirtied_when, t);
1111#ifndef CONFIG_64BIT
1112
1113
1114
1115
1116
1117
1118 ret = ret && time_before_eq(inode->dirtied_when, jiffies);
1119#endif
1120 return ret;
1121}
1122
1123#define EXPIRE_DIRTY_ATIME 0x0001
1124
1125
1126
1127
1128
1129static int move_expired_inodes(struct list_head *delaying_queue,
1130 struct list_head *dispatch_queue,
1131 int flags,
1132 struct wb_writeback_work *work)
1133{
1134 unsigned long *older_than_this = NULL;
1135 unsigned long expire_time;
1136 LIST_HEAD(tmp);
1137 struct list_head *pos, *node;
1138 struct super_block *sb = NULL;
1139 struct inode *inode;
1140 int do_sb_sort = 0;
1141 int moved = 0;
1142
1143 if ((flags & EXPIRE_DIRTY_ATIME) == 0)
1144 older_than_this = work->older_than_this;
1145 else if (!work->for_sync) {
1146 expire_time = jiffies - (dirtytime_expire_interval * HZ);
1147 older_than_this = &expire_time;
1148 }
1149 while (!list_empty(delaying_queue)) {
1150 inode = wb_inode(delaying_queue->prev);
1151 if (older_than_this &&
1152 inode_dirtied_after(inode, *older_than_this))
1153 break;
1154 list_move(&inode->i_io_list, &tmp);
1155 moved++;
1156 if (flags & EXPIRE_DIRTY_ATIME)
1157 set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state);
1158 if (sb_is_blkdev_sb(inode->i_sb))
1159 continue;
1160 if (sb && sb != inode->i_sb)
1161 do_sb_sort = 1;
1162 sb = inode->i_sb;
1163 }
1164
1165
1166 if (!do_sb_sort) {
1167 list_splice(&tmp, dispatch_queue);
1168 goto out;
1169 }
1170
1171
1172 while (!list_empty(&tmp)) {
1173 sb = wb_inode(tmp.prev)->i_sb;
1174 list_for_each_prev_safe(pos, node, &tmp) {
1175 inode = wb_inode(pos);
1176 if (inode->i_sb == sb)
1177 list_move(&inode->i_io_list, dispatch_queue);
1178 }
1179 }
1180out:
1181 return moved;
1182}

/*
 * Queue all expired dirty inodes for io, eldest first.
 * Before
 *         newly dirtied     b_dirty    b_io    b_more_io
 *         =============>    gf         edc     BA
 * After
 *         newly dirtied     b_dirty    b_io    b_more_io
 *         =============>    g          fBAedc
 *                                           |
 *                                           +--> dequeue for IO
 */
1195static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
1196{
1197 int moved;
1198
1199 assert_spin_locked(&wb->list_lock);
1200 list_splice_init(&wb->b_more_io, &wb->b_io);
1201 moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work);
1202 moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
1203 EXPIRE_DIRTY_ATIME, work);
1204 if (moved)
1205 wb_io_lists_populated(wb);
1206 trace_writeback_queue_io(wb, work, moved);
1207}
1208
1209static int write_inode(struct inode *inode, struct writeback_control *wbc)
1210{
1211 int ret;
1212
1213 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) {
1214 trace_writeback_write_inode_start(inode, wbc);
1215 ret = inode->i_sb->s_op->write_inode(inode, wbc);
1216 trace_writeback_write_inode(inode, wbc);
1217 return ret;
1218 }
1219 return 0;
1220}
1221
1222
1223
1224
1225
1226static void __inode_wait_for_writeback(struct inode *inode)
1227 __releases(inode->i_lock)
1228 __acquires(inode->i_lock)
1229{
1230 DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
1231 wait_queue_head_t *wqh;
1232
1233 wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
1234 while (inode->i_state & I_SYNC) {
1235 spin_unlock(&inode->i_lock);
1236 __wait_on_bit(wqh, &wq, bit_wait,
1237 TASK_UNINTERRUPTIBLE);
1238 spin_lock(&inode->i_lock);
1239 }
1240}
1241
1242
1243
1244
1245void inode_wait_for_writeback(struct inode *inode)
1246{
1247 spin_lock(&inode->i_lock);
1248 __inode_wait_for_writeback(inode);
1249 spin_unlock(&inode->i_lock);
1250}
1251
1252
1253
1254
1255
1256
1257static void inode_sleep_on_writeback(struct inode *inode)
1258 __releases(inode->i_lock)
1259{
1260 DEFINE_WAIT(wait);
1261 wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
1262 int sleep;
1263
1264 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
1265 sleep = inode->i_state & I_SYNC;
1266 spin_unlock(&inode->i_lock);
1267 if (sleep)
1268 schedule();
1269 finish_wait(wqh, &wait);
1270}
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
1281 struct writeback_control *wbc)
1282{
1283 if (inode->i_state & I_FREEING)
1284 return;
1285
1286
1287
1288
1289
1290
1291 if ((inode->i_state & I_DIRTY) &&
1292 (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
1293 inode->dirtied_when = jiffies;
1294
1295 if (wbc->pages_skipped) {
1296
1297
1298
1299
1300 redirty_tail(inode, wb);
1301 return;
1302 }
1303
1304 if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
1305
1306
1307
1308
1309 if (wbc->nr_to_write <= 0) {
1310
1311 requeue_io(inode, wb);
1312 } else {
1313
1314
1315
1316
1317
1318
1319
1320 redirty_tail(inode, wb);
1321 }
1322 } else if (inode->i_state & I_DIRTY) {
1323
1324
1325
1326
1327
1328 redirty_tail(inode, wb);
1329 } else if (inode->i_state & I_DIRTY_TIME) {
1330 inode->dirtied_when = jiffies;
1331 inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
1332 } else {
1333
1334 inode_io_list_del_locked(inode, wb);
1335 }
1336}
1337
/*
 * Write out an inode and its dirty pages. Do not update the writeback list
 * linkage. That is left to the caller. The caller is also responsible for
 * setting I_SYNC flag and calling inode_sync_complete() to clear it.
 */
1343static int
1344__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
1345{
1346 struct address_space *mapping = inode->i_mapping;
1347 long nr_to_write = wbc->nr_to_write;
1348 unsigned dirty;
1349 int ret;
1350
1351 WARN_ON(!(inode->i_state & I_SYNC));
1352
1353 trace_writeback_single_inode_start(inode, wbc, nr_to_write);
1354
1355 ret = do_writepages(mapping, wbc);
1356
1357
1358
1359
1360
1361
1362
1363
1364 if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
1365 int err = filemap_fdatawait(mapping);
1366 if (ret == 0)
1367 ret = err;
1368 }
1369
1370
1371
1372
1373
1374
1375 spin_lock(&inode->i_lock);
1376
1377 dirty = inode->i_state & I_DIRTY;
1378 if (inode->i_state & I_DIRTY_TIME) {
1379 if ((dirty & I_DIRTY_INODE) ||
1380 wbc->sync_mode == WB_SYNC_ALL ||
1381 unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) ||
1382 unlikely(time_after(jiffies,
1383 (inode->dirtied_time_when +
1384 dirtytime_expire_interval * HZ)))) {
1385 dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED;
1386 trace_writeback_lazytime(inode);
1387 }
1388 } else
1389 inode->i_state &= ~I_DIRTY_TIME_EXPIRED;
1390 inode->i_state &= ~dirty;
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403 smp_mb();
1404
1405 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
1406 inode->i_state |= I_DIRTY_PAGES;
1407
1408 spin_unlock(&inode->i_lock);
1409
1410 if (dirty & I_DIRTY_TIME)
1411 mark_inode_dirty_sync(inode);
1412
1413 if (dirty & ~I_DIRTY_PAGES) {
1414 int err = write_inode(inode, wbc);
1415 if (ret == 0)
1416 ret = err;
1417 }
1418 trace_writeback_single_inode(inode, wbc, nr_to_write);
1419 return ret;
1420}
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430static int writeback_single_inode(struct inode *inode,
1431 struct writeback_control *wbc)
1432{
1433 struct bdi_writeback *wb;
1434 int ret = 0;
1435
1436 spin_lock(&inode->i_lock);
1437 if (!atomic_read(&inode->i_count))
1438 WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
1439 else
1440 WARN_ON(inode->i_state & I_WILL_FREE);
1441
1442 if (inode->i_state & I_SYNC) {
1443 if (wbc->sync_mode != WB_SYNC_ALL)
1444 goto out;
1445
1446
1447
1448
1449
1450 __inode_wait_for_writeback(inode);
1451 }
1452 WARN_ON(inode->i_state & I_SYNC);
1453
1454
1455
1456
1457
1458
1459
1460
1461 if (!(inode->i_state & I_DIRTY_ALL) &&
1462 (wbc->sync_mode != WB_SYNC_ALL ||
1463 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
1464 goto out;
1465 inode->i_state |= I_SYNC;
1466 wbc_attach_and_unlock_inode(wbc, inode);
1467
1468 ret = __writeback_single_inode(inode, wbc);
1469
1470 wbc_detach_inode(wbc);
1471
1472 wb = inode_to_wb_and_lock_list(inode);
1473 spin_lock(&inode->i_lock);
1474
1475
1476
1477
1478 if (!(inode->i_state & I_DIRTY_ALL))
1479 inode_io_list_del_locked(inode, wb);
1480 spin_unlock(&wb->list_lock);
1481 inode_sync_complete(inode);
1482out:
1483 spin_unlock(&inode->i_lock);
1484 return ret;
1485}
1486
1487static long writeback_chunk_size(struct bdi_writeback *wb,
1488 struct wb_writeback_work *work)
1489{
1490 long pages;
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505 if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
1506 pages = LONG_MAX;
1507 else {
1508 pages = min(wb->avg_write_bandwidth / 2,
1509 global_wb_domain.dirty_limit / DIRTY_SCOPE);
1510 pages = min(pages, work->nr_pages);
1511 pages = round_down(pages + MIN_WRITEBACK_PAGES,
1512 MIN_WRITEBACK_PAGES);
1513 }
1514
1515 return pages;
1516}
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527static long writeback_sb_inodes(struct super_block *sb,
1528 struct bdi_writeback *wb,
1529 struct wb_writeback_work *work)
1530{
1531 struct writeback_control wbc = {
1532 .sync_mode = work->sync_mode,
1533 .tagged_writepages = work->tagged_writepages,
1534 .for_kupdate = work->for_kupdate,
1535 .for_background = work->for_background,
1536 .for_sync = work->for_sync,
1537 .range_cyclic = work->range_cyclic,
1538 .range_start = 0,
1539 .range_end = LLONG_MAX,
1540 };
1541 unsigned long start_time = jiffies;
1542 long write_chunk;
1543 long wrote = 0;
1544
1545 while (!list_empty(&wb->b_io)) {
1546 struct inode *inode = wb_inode(wb->b_io.prev);
1547 struct bdi_writeback *tmp_wb;
1548
1549 if (inode->i_sb != sb) {
1550 if (work->sb) {
1551
1552
1553
1554
1555
1556 redirty_tail(inode, wb);
1557 continue;
1558 }
1559
1560
1561
1562
1563
1564
1565 break;
1566 }
1567
1568
1569
1570
1571
1572
1573 spin_lock(&inode->i_lock);
1574 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
1575 spin_unlock(&inode->i_lock);
1576 redirty_tail(inode, wb);
1577 continue;
1578 }
1579 if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589 spin_unlock(&inode->i_lock);
1590 requeue_io(inode, wb);
1591 trace_writeback_sb_inodes_requeue(inode);
1592 continue;
1593 }
1594 spin_unlock(&wb->list_lock);
1595
1596
1597
1598
1599
1600
1601 if (inode->i_state & I_SYNC) {
1602
1603 inode_sleep_on_writeback(inode);
1604
1605 spin_lock(&wb->list_lock);
1606 continue;
1607 }
1608 inode->i_state |= I_SYNC;
1609 wbc_attach_and_unlock_inode(&wbc, inode);
1610
1611 write_chunk = writeback_chunk_size(wb, work);
1612 wbc.nr_to_write = write_chunk;
1613 wbc.pages_skipped = 0;
1614
1615
1616
1617
1618
1619 __writeback_single_inode(inode, &wbc);
1620
1621 wbc_detach_inode(&wbc);
1622 work->nr_pages -= write_chunk - wbc.nr_to_write;
1623 wrote += write_chunk - wbc.nr_to_write;
1624
1625 if (need_resched()) {
1626
1627
1628
1629
1630
1631
1632
1633
1634 blk_flush_plug(current);
1635 cond_resched();
1636 }
1637
1638
1639
1640
1641
1642 tmp_wb = inode_to_wb_and_lock_list(inode);
1643 spin_lock(&inode->i_lock);
1644 if (!(inode->i_state & I_DIRTY_ALL))
1645 wrote++;
1646 requeue_inode(inode, tmp_wb, &wbc);
1647 inode_sync_complete(inode);
1648 spin_unlock(&inode->i_lock);
1649
1650 if (unlikely(tmp_wb != wb)) {
1651 spin_unlock(&tmp_wb->list_lock);
1652 spin_lock(&wb->list_lock);
1653 }
1654
1655
1656
1657
1658
1659 if (wrote) {
1660 if (time_is_before_jiffies(start_time + HZ / 10UL))
1661 break;
1662 if (work->nr_pages <= 0)
1663 break;
1664 }
1665 }
1666 return wrote;
1667}
1668
1669static long __writeback_inodes_wb(struct bdi_writeback *wb,
1670 struct wb_writeback_work *work)
1671{
1672 unsigned long start_time = jiffies;
1673 long wrote = 0;
1674
1675 while (!list_empty(&wb->b_io)) {
1676 struct inode *inode = wb_inode(wb->b_io.prev);
1677 struct super_block *sb = inode->i_sb;
1678
1679 if (!trylock_super(sb)) {
1680
1681
1682
1683
1684
1685 redirty_tail(inode, wb);
1686 continue;
1687 }
1688 wrote += writeback_sb_inodes(sb, wb, work);
1689 up_read(&sb->s_umount);
1690
1691
1692 if (wrote) {
1693 if (time_is_before_jiffies(start_time + HZ / 10UL))
1694 break;
1695 if (work->nr_pages <= 0)
1696 break;
1697 }
1698 }
1699
1700 return wrote;
1701}
1702
1703static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
1704 enum wb_reason reason)
1705{
1706 struct wb_writeback_work work = {
1707 .nr_pages = nr_pages,
1708 .sync_mode = WB_SYNC_NONE,
1709 .range_cyclic = 1,
1710 .reason = reason,
1711 };
1712 struct blk_plug plug;
1713
1714 blk_start_plug(&plug);
1715 spin_lock(&wb->list_lock);
1716 if (list_empty(&wb->b_io))
1717 queue_io(wb, &work);
1718 __writeback_inodes_wb(wb, &work);
1719 spin_unlock(&wb->list_lock);
1720 blk_finish_plug(&plug);
1721
1722 return nr_pages - work.nr_pages;
1723}
1724
/*
 * Explicit flushing or periodic writeback of "old" data.
 *
 * Define "old": the first time one of an inode's pages is dirtied, we mark the
 * dirtying-time in the inode's address_space.  So this periodic writeback code
 * just walks the superblock inode list, writing back any inodes which are
 * older than a specific point in time.
 *
 * Try to run once per dirty_writeback_interval.  But if a writeback event
 * takes longer than a dirty_writeback_interval interval, then leave a
 * one-second gap.
 *
 * older_than_this takes precedence over nr_to_write.  So we'll only write back
 * all dirty pages if they are all attached to "old" mappings.
 */
1740static long wb_writeback(struct bdi_writeback *wb,
1741 struct wb_writeback_work *work)
1742{
1743 unsigned long wb_start = jiffies;
1744 long nr_pages = work->nr_pages;
1745 unsigned long oldest_jif;
1746 struct inode *inode;
1747 long progress;
1748 struct blk_plug plug;
1749
1750 oldest_jif = jiffies;
1751 work->older_than_this = &oldest_jif;
1752
1753 blk_start_plug(&plug);
1754 spin_lock(&wb->list_lock);
1755 for (;;) {
1756
1757
1758
1759 if (work->nr_pages <= 0)
1760 break;
1761
1762
1763
1764
1765
1766
1767
1768 if ((work->for_background || work->for_kupdate) &&
1769 !list_empty(&wb->work_list))
1770 break;
1771
1772
1773
1774
1775
1776 if (work->for_background && !wb_over_bg_thresh(wb))
1777 break;
1778
1779
1780
1781
1782
1783
1784
1785 if (work->for_kupdate) {
1786 oldest_jif = jiffies -
1787 msecs_to_jiffies(dirty_expire_interval * 10);
1788 } else if (work->for_background)
1789 oldest_jif = jiffies;
1790
1791 trace_writeback_start(wb, work);
1792 if (list_empty(&wb->b_io))
1793 queue_io(wb, work);
1794 if (work->sb)
1795 progress = writeback_sb_inodes(work->sb, wb, work);
1796 else
1797 progress = __writeback_inodes_wb(wb, work);
1798 trace_writeback_written(wb, work);
1799
1800 wb_update_bandwidth(wb, wb_start);
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810 if (progress)
1811 continue;
1812
1813
1814
1815 if (list_empty(&wb->b_more_io))
1816 break;
1817
1818
1819
1820
1821
1822 trace_writeback_wait(wb, work);
1823 inode = wb_inode(wb->b_more_io.prev);
1824 spin_lock(&inode->i_lock);
1825 spin_unlock(&wb->list_lock);
1826
1827 inode_sleep_on_writeback(inode);
1828 spin_lock(&wb->list_lock);
1829 }
1830 spin_unlock(&wb->list_lock);
1831 blk_finish_plug(&plug);
1832
1833 return nr_pages - work->nr_pages;
1834}
1835
/*
 * Return the next wb_writeback_work struct that hasn't been processed yet.
 */
1839static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
1840{
1841 struct wb_writeback_work *work = NULL;
1842
1843 spin_lock_bh(&wb->work_lock);
1844 if (!list_empty(&wb->work_list)) {
1845 work = list_entry(wb->work_list.next,
1846 struct wb_writeback_work, list);
1847 list_del_init(&work->list);
1848 }
1849 spin_unlock_bh(&wb->work_lock);
1850 return work;
1851}
1852
1853static long wb_check_background_flush(struct bdi_writeback *wb)
1854{
1855 if (wb_over_bg_thresh(wb)) {
1856
1857 struct wb_writeback_work work = {
1858 .nr_pages = LONG_MAX,
1859 .sync_mode = WB_SYNC_NONE,
1860 .for_background = 1,
1861 .range_cyclic = 1,
1862 .reason = WB_REASON_BACKGROUND,
1863 };
1864
1865 return wb_writeback(wb, &work);
1866 }
1867
1868 return 0;
1869}
1870
1871static long wb_check_old_data_flush(struct bdi_writeback *wb)
1872{
1873 unsigned long expired;
1874 long nr_pages;
1875
1876
1877
1878
1879 if (!dirty_writeback_interval)
1880 return 0;
1881
1882 expired = wb->last_old_flush +
1883 msecs_to_jiffies(dirty_writeback_interval * 10);
1884 if (time_before(jiffies, expired))
1885 return 0;
1886
1887 wb->last_old_flush = jiffies;
1888 nr_pages = get_nr_dirty_pages();
1889
1890 if (nr_pages) {
1891 struct wb_writeback_work work = {
1892 .nr_pages = nr_pages,
1893 .sync_mode = WB_SYNC_NONE,
1894 .for_kupdate = 1,
1895 .range_cyclic = 1,
1896 .reason = WB_REASON_PERIODIC,
1897 };
1898
1899 return wb_writeback(wb, &work);
1900 }
1901
1902 return 0;
1903}
1904
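/*
 * If a WB_start_all request is pending, translate it into a one-shot
 * WB_SYNC_NONE work item sized from the current dirty page count and run it,
 * then clear the bit so later requests can be noticed again.
 */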
1905static long wb_check_start_all(struct bdi_writeback *wb)
1906{
1907 long nr_pages;
1908
1909 if (!test_bit(WB_start_all, &wb->state))
1910 return 0;
1911
1912 nr_pages = get_nr_dirty_pages();
1913 if (nr_pages) {
1914 struct wb_writeback_work work = {
1915 .nr_pages = wb_split_bdi_pages(wb, nr_pages),
1916 .sync_mode = WB_SYNC_NONE,
1917 .range_cyclic = 1,
1918 .reason = wb->start_all_reason,
1919 };
1920
1921 nr_pages = wb_writeback(wb, &work);
1922 }
1923
1924 clear_bit(WB_start_all, &wb->state);
1925 return nr_pages;
1926}
1927

/*
 * Retrieve work items and do the writeback they describe
 */
1932static long wb_do_writeback(struct bdi_writeback *wb)
1933{
1934 struct wb_writeback_work *work;
1935 long wrote = 0;
1936
1937 set_bit(WB_writeback_running, &wb->state);
1938 while ((work = get_next_work_item(wb)) != NULL) {
1939 trace_writeback_exec(wb, work);
1940 wrote += wb_writeback(wb, work);
1941 finish_writeback_work(wb, work);
1942 }
1943
1944
1945
1946
1947 wrote += wb_check_start_all(wb);
1948
1949
1950
1951
1952 wrote += wb_check_old_data_flush(wb);
1953 wrote += wb_check_background_flush(wb);
1954 clear_bit(WB_writeback_running, &wb->state);
1955
1956 return wrote;
1957}
1958
/*
 * Handle writeback of dirty data for the device backed by this bdi. Also
 * reschedules periodically and does kupdated style flushing.
 */
1963void wb_workfn(struct work_struct *work)
1964{
1965 struct bdi_writeback *wb = container_of(to_delayed_work(work),
1966 struct bdi_writeback, dwork);
1967 long pages_written;
1968
1969 set_worker_desc("flush-%s", dev_name(wb->bdi->dev));
1970 current->flags |= PF_SWAPWRITE;
1971
1972 if (likely(!current_is_workqueue_rescuer() ||
1973 !test_bit(WB_registered, &wb->state))) {
1974
1975
1976
1977
1978
1979
1980 do {
1981 pages_written = wb_do_writeback(wb);
1982 trace_writeback_pages_written(pages_written);
1983 } while (!list_empty(&wb->work_list));
1984 } else {
1985
1986
1987
1988
1989
1990 pages_written = writeback_inodes_wb(wb, 1024,
1991 WB_REASON_FORKER_THREAD);
1992 trace_writeback_pages_written(pages_written);
1993 }
1994
1995 if (!list_empty(&wb->work_list))
1996 wb_wakeup(wb);
1997 else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
1998 wb_wakeup_delayed(wb);
1999
2000 current->flags &= ~PF_SWAPWRITE;
2001}
2002
2003
2004
2005
2006
2007static void __wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
2008 enum wb_reason reason)
2009{
2010 struct bdi_writeback *wb;
2011
2012 if (!bdi_has_dirty_io(bdi))
2013 return;
2014
2015 list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
2016 wb_start_writeback(wb, reason);
2017}
2018
2019void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
2020 enum wb_reason reason)
2021{
2022 rcu_read_lock();
2023 __wakeup_flusher_threads_bdi(bdi, reason);
2024 rcu_read_unlock();
2025}
2026
/*
 * Wakeup the flusher threads to start writeback of all currently dirty pages
 */
2030void wakeup_flusher_threads(enum wb_reason reason)
2031{
2032 struct backing_dev_info *bdi;
2033
2034
2035
2036
2037 if (blk_needs_flush_plug(current))
2038 blk_schedule_flush_plug(current);
2039
2040 rcu_read_lock();
2041 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
2042 __wakeup_flusher_threads_bdi(bdi, reason);
2043 rcu_read_unlock();
2044}
2045
/*
 * Wake up bdi's periodically to make sure dirtytime inodes gets
 * written back periodically.  We deliberately do *not* check the
 * b_dirtytime list in wb_has_dirty_io(), since this would cause the
 * kernel to be constantly waking up once there are any dirtytime
 * inodes on the system.  So instead we define a separate delayed work
 * function which gets called much more rarely.  (By default, only
 * once every 12 hours.)
 *
 * If there is any other write activity going on in the file system,
 * this function won't be called at all as we want to make sure that
 * when something is periodically happening, the dirtytime inodes will
 * be flushed out as part of the normal writeback.
 */
2061static void wakeup_dirtytime_writeback(struct work_struct *w);
2062static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback);
2063
2064static void wakeup_dirtytime_writeback(struct work_struct *w)
2065{
2066 struct backing_dev_info *bdi;
2067
2068 rcu_read_lock();
2069 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
2070 struct bdi_writeback *wb;
2071
2072 list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
2073 if (!list_empty(&wb->b_dirty_time))
2074 wb_wakeup(wb);
2075 }
2076 rcu_read_unlock();
2077 schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
2078}
2079
2080static int __init start_dirtytime_writeback(void)
2081{
2082 schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
2083 return 0;
2084}
2085__initcall(start_dirtytime_writeback);
2086
2087int dirtytime_interval_handler(struct ctl_table *table, int write,
2088 void __user *buffer, size_t *lenp, loff_t *ppos)
2089{
2090 int ret;
2091
2092 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
2093 if (ret == 0 && write)
2094 mod_delayed_work(system_wq, &dirtytime_work, 0);
2095 return ret;
2096}
2097
2098static noinline void block_dump___mark_inode_dirty(struct inode *inode)
2099{
2100 if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
2101 struct dentry *dentry;
2102 const char *name = "?";
2103
2104 dentry = d_find_alias(inode);
2105 if (dentry) {
2106 spin_lock(&dentry->d_lock);
2107 name = (const char *) dentry->d_name.name;
2108 }
2109 printk(KERN_DEBUG
2110 "%s(%d): dirtied inode %lu (%s) on %s\n",
2111 current->comm, task_pid_nr(current), inode->i_ino,
2112 name, inode->i_sb->s_id);
2113 if (dentry) {
2114 spin_unlock(&dentry->d_lock);
2115 dput(dentry);
2116 }
2117 }
2118}
2119
/**
 * __mark_inode_dirty -	internal function
 * @inode: inode to mark
 * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
 * Mark an inode as dirty. Callers should use mark_inode_dirty or
 *   mark_inode_dirty_sync.
 *
 * Put the inode on the super block's dirty list.
 *
 * CAREFUL! We mark it dirty unconditionally, but move it onto the
 * dirty list only if it is hashed or if it refers to a blockdev.
 * If it was not hashed, it will never be added to the dirty list
 * even if it is later hashed, as it will have been marked dirty already.
 *
 * In short, make sure you hash any inodes _before_ you start marking
 * them dirty.
 */
2146void __mark_inode_dirty(struct inode *inode, int flags)
2147{
2148 struct super_block *sb = inode->i_sb;
2149 int dirtytime;
2150
2151 trace_writeback_mark_inode_dirty(inode, flags);
2152
2153
2154
2155
2156
2157 if (flags & (I_DIRTY_INODE | I_DIRTY_TIME)) {
2158 trace_writeback_dirty_inode_start(inode, flags);
2159
2160 if (sb->s_op->dirty_inode)
2161 sb->s_op->dirty_inode(inode, flags);
2162
2163 trace_writeback_dirty_inode(inode, flags);
2164 }
2165 if (flags & I_DIRTY_INODE)
2166 flags &= ~I_DIRTY_TIME;
2167 dirtytime = flags & I_DIRTY_TIME;
2168
2169
2170
2171
2172
2173 smp_mb();
2174
2175 if (((inode->i_state & flags) == flags) ||
2176 (dirtytime && (inode->i_state & I_DIRTY_INODE)))
2177 return;
2178
2179 if (unlikely(block_dump))
2180 block_dump___mark_inode_dirty(inode);
2181
2182 spin_lock(&inode->i_lock);
2183 if (dirtytime && (inode->i_state & I_DIRTY_INODE))
2184 goto out_unlock_inode;
2185 if ((inode->i_state & flags) != flags) {
2186 const int was_dirty = inode->i_state & I_DIRTY;
2187
2188 inode_attach_wb(inode, NULL);
2189
2190 if (flags & I_DIRTY_INODE)
2191 inode->i_state &= ~I_DIRTY_TIME;
2192 inode->i_state |= flags;
2193
2194
2195
2196
2197
2198
2199 if (inode->i_state & I_SYNC)
2200 goto out_unlock_inode;
2201
2202
2203
2204
2205
2206 if (!S_ISBLK(inode->i_mode)) {
2207 if (inode_unhashed(inode))
2208 goto out_unlock_inode;
2209 }
2210 if (inode->i_state & I_FREEING)
2211 goto out_unlock_inode;
2212
2213
2214
2215
2216
2217 if (!was_dirty) {
2218 struct bdi_writeback *wb;
2219 struct list_head *dirty_list;
2220 bool wakeup_bdi = false;
2221
2222 wb = locked_inode_to_wb_and_lock_list(inode);
2223
2224 WARN(bdi_cap_writeback_dirty(wb->bdi) &&
2225 !test_bit(WB_registered, &wb->state),
2226 "bdi-%s not registered\n", wb->bdi->name);
2227
2228 inode->dirtied_when = jiffies;
2229 if (dirtytime)
2230 inode->dirtied_time_when = jiffies;
2231
2232 if (inode->i_state & I_DIRTY)
2233 dirty_list = &wb->b_dirty;
2234 else
2235 dirty_list = &wb->b_dirty_time;
2236
2237 wakeup_bdi = inode_io_list_move_locked(inode, wb,
2238 dirty_list);
2239
2240 spin_unlock(&wb->list_lock);
2241 trace_writeback_dirty_inode_enqueue(inode);
2242
2243
2244
2245
2246
2247
2248
2249 if (bdi_cap_writeback_dirty(wb->bdi) && wakeup_bdi)
2250 wb_wakeup_delayed(wb);
2251 return;
2252 }
2253 }
2254out_unlock_inode:
2255 spin_unlock(&inode->i_lock);
2256}
2257EXPORT_SYMBOL(__mark_inode_dirty);
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268static void wait_sb_inodes(struct super_block *sb)
2269{
2270 LIST_HEAD(sync_list);
2271
2272
2273
2274
2275
2276 WARN_ON(!rwsem_is_locked(&sb->s_umount));
2277
2278 mutex_lock(&sb->s_sync_lock);
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289 rcu_read_lock();
2290 spin_lock_irq(&sb->s_inode_wblist_lock);
2291 list_splice_init(&sb->s_inodes_wb, &sync_list);
2292
2293
2294
2295
2296
2297
2298
2299
2300 while (!list_empty(&sync_list)) {
2301 struct inode *inode = list_first_entry(&sync_list, struct inode,
2302 i_wb_list);
2303 struct address_space *mapping = inode->i_mapping;
2304
2305
2306
2307
2308
2309
2310
2311 list_move_tail(&inode->i_wb_list, &sb->s_inodes_wb);
2312
2313
2314
2315
2316
2317
2318 if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
2319 continue;
2320
2321 spin_unlock_irq(&sb->s_inode_wblist_lock);
2322
2323 spin_lock(&inode->i_lock);
2324 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
2325 spin_unlock(&inode->i_lock);
2326
2327 spin_lock_irq(&sb->s_inode_wblist_lock);
2328 continue;
2329 }
2330 __iget(inode);
2331 spin_unlock(&inode->i_lock);
2332 rcu_read_unlock();
2333
2334
2335
2336
2337
2338
2339 filemap_fdatawait_keep_errors(mapping);
2340
2341 cond_resched();
2342
2343 iput(inode);
2344
2345 rcu_read_lock();
2346 spin_lock_irq(&sb->s_inode_wblist_lock);
2347 }
2348 spin_unlock_irq(&sb->s_inode_wblist_lock);
2349 rcu_read_unlock();
2350 mutex_unlock(&sb->s_sync_lock);
2351}
2352
2353static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
2354 enum wb_reason reason, bool skip_if_busy)
2355{
2356 DEFINE_WB_COMPLETION_ONSTACK(done);
2357 struct wb_writeback_work work = {
2358 .sb = sb,
2359 .sync_mode = WB_SYNC_NONE,
2360 .tagged_writepages = 1,
2361 .done = &done,
2362 .nr_pages = nr,
2363 .reason = reason,
2364 };
2365 struct backing_dev_info *bdi = sb->s_bdi;
2366
2367 if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
2368 return;
2369 WARN_ON(!rwsem_is_locked(&sb->s_umount));
2370
2371 bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy);
2372 wb_wait_for_completion(bdi, &done);
2373}

/**
 * writeback_inodes_sb_nr -	writeback dirty inodes from given super_block
 * @sb: the superblock
 * @nr: the number of pages to write
 * @reason: reason why some writeback work was initiated
 *
 * Start writeback on some inodes on this super_block. No guarantees are made
 * on how many (if any) will be written, and this function does not wait
 * for IO completion of submitted IO.
 */
2385void writeback_inodes_sb_nr(struct super_block *sb,
2386 unsigned long nr,
2387 enum wb_reason reason)
2388{
2389 __writeback_inodes_sb_nr(sb, nr, reason, false);
2390}
2391EXPORT_SYMBOL(writeback_inodes_sb_nr);

/**
 * writeback_inodes_sb	-	writeback dirty inodes from given super_block
 * @sb: the superblock
 * @reason: reason why some writeback work was initiated
 *
 * Start writeback on some inodes on this super_block. No guarantees are made
 * on how many (if any) will be written, and this function does not wait
 * for IO completion of submitted IO.
 */
2402void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
2403{
2404 return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
2405}
2406EXPORT_SYMBOL(writeback_inodes_sb);

/**
 * try_to_writeback_inodes_sb - try to start writeback if none underway
 * @sb: the superblock
 * @reason: reason why some writeback work was initiated
 *
 * Invoke __writeback_inodes_sb_nr if no writeback is currently underway.
 */
2415void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
2416{
2417 if (!down_read_trylock(&sb->s_umount))
2418 return;
2419
2420 __writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason, true);
2421 up_read(&sb->s_umount);
2422}
2423EXPORT_SYMBOL(try_to_writeback_inodes_sb);
2424
/**
 * sync_inodes_sb	-	sync sb inode pages
 * @sb: the superblock
 *
 * This function writes and waits on any dirty inode belonging to this
 * super_block.
 */
2432void sync_inodes_sb(struct super_block *sb)
2433{
2434 DEFINE_WB_COMPLETION_ONSTACK(done);
2435 struct wb_writeback_work work = {
2436 .sb = sb,
2437 .sync_mode = WB_SYNC_ALL,
2438 .nr_pages = LONG_MAX,
2439 .range_cyclic = 0,
2440 .done = &done,
2441 .reason = WB_REASON_SYNC,
2442 .for_sync = 1,
2443 };
2444 struct backing_dev_info *bdi = sb->s_bdi;
2445
2446
2447
2448
2449
2450
2451 if (bdi == &noop_backing_dev_info)
2452 return;
2453 WARN_ON(!rwsem_is_locked(&sb->s_umount));
2454
2455
2456 bdi_down_write_wb_switch_rwsem(bdi);
2457 bdi_split_work_to_wbs(bdi, &work, false);
2458 wb_wait_for_completion(bdi, &done);
2459 bdi_up_write_wb_switch_rwsem(bdi);
2460
2461 wait_sb_inodes(sb);
2462}
2463EXPORT_SYMBOL(sync_inodes_sb);
2464
/**
 * write_inode_now	-	write an inode to disk
 * @inode: inode to write to disk
 * @sync: whether the write should be synchronous or not
 *
 * This function commits an inode to disk immediately if it is dirty. This is
 * primarily needed by knfsd.
 *
 * The caller must either have a ref on the inode or must have set I_WILL_FREE.
 */
2475int write_inode_now(struct inode *inode, int sync)
2476{
2477 struct writeback_control wbc = {
2478 .nr_to_write = LONG_MAX,
2479 .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
2480 .range_start = 0,
2481 .range_end = LLONG_MAX,
2482 };
2483
2484 if (!mapping_cap_writeback_dirty(inode->i_mapping))
2485 wbc.nr_to_write = 0;
2486
2487 might_sleep();
2488 return writeback_single_inode(inode, &wbc);
2489}
2490EXPORT_SYMBOL(write_inode_now);
2491
/**
 * sync_inode - write an inode and its pages to disk.
 * @inode: the inode to sync
 * @wbc: controls the writeback mode
 *
 * sync_inode() will write an inode and its pages to disk.  It will also
 * correctly update the inode on its superblock's dirty inode lists and will
 * update inode->i_state.
 *
 * The caller must have a ref on the inode.
 */
2503int sync_inode(struct inode *inode, struct writeback_control *wbc)
2504{
2505 return writeback_single_inode(inode, wbc);
2506}
2507EXPORT_SYMBOL(sync_inode);
2508
/**
 * sync_inode_metadata - write an inode to disk
 * @inode: the inode to sync
 * @wait: wait for I/O to complete.
 *
 * Write an inode to disk and adjust its dirty state after completion.
 *
 * Note: only writes the actual inode, no associated data or other metadata.
 */
2518int sync_inode_metadata(struct inode *inode, int wait)
2519{
2520 struct writeback_control wbc = {
2521 .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
2522 .nr_to_write = 0,
2523 };
2524
2525 return sync_inode(inode, &wbc);
2526}
2527EXPORT_SYMBOL(sync_inode_metadata);
2528