// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/fs-writeback.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * Contains all the functions related to writing back and waiting
 * upon dirty inodes against superblocks, and writing back dirty
 * pages against inodes.  ie: data writeback.  Writeout of the
 * inode itself is not handled here.
 *
 * 10Apr2002	Andrew Morton
 *		Split out of fs/inode.c
 *		Additions for address_space-based writeback
 */
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/kthread.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/tracepoint.h>
#include <linux/device.h>
#include <linux/memcontrol.h>
#include "internal.h"

/*
 * 4MB minimal write chunk size
 */
#define MIN_WRITEBACK_PAGES	(4096UL >> (PAGE_SHIFT - 10))

struct wb_completion {
	atomic_t		cnt;
};

/*
 * Passed into wb_writeback(), essentially a subset of writeback_control
 */
struct wb_writeback_work {
	long nr_pages;
	struct super_block *sb;
	unsigned long *older_than_this;
	enum writeback_sync_modes sync_mode;
	unsigned int tagged_writepages:1;
	unsigned int for_kupdate:1;
	unsigned int range_cyclic:1;
	unsigned int for_background:1;
	unsigned int for_sync:1;	/* sync(2) WB_SYNC_ALL writeback */
	unsigned int auto_free:1;	/* free on completion */
	enum wb_reason reason;		/* why was writeback initiated? */

	struct list_head list;		/* pending work list */
	struct wb_completion *done;	/* set if the caller waits */
};

/*
 * If one wants to wait for one or more wb_writeback_works, each work's
 * ->done should be set to a wb_completion defined using the following
 * macro.  Once all work items are issued with wb_queue_work(), the caller
 * can wait for the completion of all using wb_wait_for_completion().  Work
 * items which are waited upon aren't freed automatically on completion.
 */
#define DEFINE_WB_COMPLETION_ONSTACK(cmpl)				\
	struct wb_completion cmpl = {					\
		.cnt		= ATOMIC_INIT(1),			\
	}
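
/*
 * Typical usage (see __writeback_inodes_sb_nr() below): the completion
 * starts at 1, wb_queue_work() increments ->cnt once per queued work
 * item, and wb_wait_for_completion() drops the initial count and sleeps
 * until every queued item has been finished by finish_writeback_work().
 * The initial count of 1 keeps the completion from firing while work
 * items are still being issued.
 */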

/*
 * If an inode is constantly having its pages dirtied, but then the
 * updates stop dirtytime_expire_interval seconds in the past, it's
 * possible for the worst case time between when an inode has its
 * timestamps updated and when they finally get written out to be two
 * dirtytime_expire_intervals.  We set the default to 12 hours (in
 * seconds), which means most of the time inodes will have their
 * timestamps written to disk after 12 hours, but in the worst case a
 * few inodes might not have their timestamps updated for 24 hours.
 */
unsigned int dirtytime_expire_interval = 12 * 60 * 60;

static inline struct inode *wb_inode(struct list_head *head)
{
	return list_entry(head, struct inode, i_io_list);
}

/*
 * Include the creation of the trace points after defining the
 * wb_writeback_work structure and inline functions so that the definition
 * remains local to this file.
 */
#define CREATE_TRACE_POINTS
#include <trace/events/writeback.h>

EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);

static bool wb_io_lists_populated(struct bdi_writeback *wb)
{
	if (wb_has_dirty_io(wb)) {
		return false;
	} else {
		set_bit(WB_has_dirty_io, &wb->state);
		WARN_ON_ONCE(!wb->avg_write_bandwidth);
		atomic_long_add(wb->avg_write_bandwidth,
				&wb->bdi->tot_write_bandwidth);
		return true;
	}
}

static void wb_io_lists_depopulated(struct bdi_writeback *wb)
{
	if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) &&
	    list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) {
		clear_bit(WB_has_dirty_io, &wb->state);
		WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth,
					&wb->bdi->tot_write_bandwidth) < 0);
	}
}
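
/*
 * Note: WB_has_dirty_io caches "any of b_dirty/b_io/b_more_io non-empty"
 * so that hot paths can test it without walking the lists, and the two
 * helpers above keep bdi->tot_write_bandwidth (the sum of the average
 * write bandwidth of all wb's with dirty IO) in sync with that bit.
 * wb_split_bdi_pages() below relies on that sum to apportion work.
 */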

/**
 * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list
 * @inode: inode to be moved
 * @wb: target bdi_writeback
 * @head: one of @wb->b_{dirty|io|more_io|dirty_time}
 *
 * Move @inode->i_io_list to @head of @wb and set %WB_has_dirty_io.
 * Returns %true if @inode is the first occupant of the !dirty_time IO
 * lists; otherwise, %false.
 */
static bool inode_io_list_move_locked(struct inode *inode,
				      struct bdi_writeback *wb,
				      struct list_head *head)
{
	assert_spin_locked(&wb->list_lock);

	list_move(&inode->i_io_list, head);

	/* dirty_time doesn't count as dirty_io until expiration */
	if (head != &wb->b_dirty_time)
		return wb_io_lists_populated(wb);

	wb_io_lists_depopulated(wb);
	return false;
}

/**
 * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list
 * @inode: inode to be removed
 * @wb: bdi_writeback @inode is being removed from
 *
 * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and
 * clear %WB_has_dirty_io if all are empty afterwards.
 */
static void inode_io_list_del_locked(struct inode *inode,
				     struct bdi_writeback *wb)
{
	assert_spin_locked(&wb->list_lock);

	list_del_init(&inode->i_io_list);
	wb_io_lists_depopulated(wb);
}

static void wb_wakeup(struct bdi_writeback *wb)
{
	spin_lock_bh(&wb->work_lock);
	if (test_bit(WB_registered, &wb->state))
		mod_delayed_work(bdi_wq, &wb->dwork, 0);
	spin_unlock_bh(&wb->work_lock);
}

static void finish_writeback_work(struct bdi_writeback *wb,
				  struct wb_writeback_work *work)
{
	struct wb_completion *done = work->done;

	if (work->auto_free)
		kfree(work);
	if (done && atomic_dec_and_test(&done->cnt))
		wake_up_all(&wb->bdi->wb_waitq);
}

static void wb_queue_work(struct bdi_writeback *wb,
			  struct wb_writeback_work *work)
{
	trace_writeback_queue(wb, work);

	if (work->done)
		atomic_inc(&work->done->cnt);

	spin_lock_bh(&wb->work_lock);

	if (test_bit(WB_registered, &wb->state)) {
		list_add_tail(&work->list, &wb->work_list);
		mod_delayed_work(bdi_wq, &wb->dwork, 0);
	} else
		finish_writeback_work(wb, work);

	spin_unlock_bh(&wb->work_lock);
}

/**
 * wb_wait_for_completion - wait for completion of bdi_writeback_works
 * @bdi: bdi work items were issued to
 * @done: target wb_completion
 *
 * Wait for one or more work items issued to @bdi with their ->done field
 * set to @done, which should have been defined with
 * DEFINE_WB_COMPLETION_ONSTACK().  This function returns after all such
 * work items are completed.  Work items which are waited upon aren't freed
 * automatically on completion.
 */
static void wb_wait_for_completion(struct backing_dev_info *bdi,
				   struct wb_completion *done)
{
	atomic_dec(&done->cnt);		/* put down the initial count */
	wait_event(bdi->wb_waitq, !atomic_read(&done->cnt));
}

#ifdef CONFIG_CGROUP_WRITEBACK

/* parameters for foreign inode detection, see wbc_detach_inode() */
#define WB_FRN_TIME_SHIFT	13	/* 1/16 second approximates */
#define WB_FRN_TIME_AVG_SHIFT	3	/* avg = avg * 7/8 + new * 1/8 */
#define WB_FRN_TIME_CUT_DIV	2	/* ignore rounds < avg / 2 */
#define WB_FRN_TIME_PERIOD	(2 * (1 << WB_FRN_TIME_SHIFT))	/* 2s */

#define WB_FRN_HIST_SLOTS	16	/* inode->i_wb_frn_history is 16bit */
#define WB_FRN_HIST_UNIT	(WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS)
					/* each slot's duration is 2s / 16 */
#define WB_FRN_HIST_THR_SLOTS	(WB_FRN_HIST_SLOTS / 2)
					/* if foreign slots >= 8, switch */
#define WB_FRN_HIST_MAX_SLOTS	(WB_FRN_HIST_THR_SLOTS / 2 + 1)
					/* one round can affect upto 5 slots */

static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
static struct workqueue_struct *isw_wq;

void __inode_attach_wb(struct inode *inode, struct page *page)
{
	struct backing_dev_info *bdi = inode_to_bdi(inode);
	struct bdi_writeback *wb = NULL;

	if (inode_cgwb_enabled(inode)) {
		struct cgroup_subsys_state *memcg_css;

		if (page) {
			memcg_css = mem_cgroup_css_from_page(page);
			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
		} else {
			/* must pin memcg_css, see wb_get_create() */
			memcg_css = task_get_css(current, memory_cgrp_id);
			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
			css_put(memcg_css);
		}
	}

	if (!wb)
		wb = &bdi->wb;

	/*
	 * There may be multiple instances of this function racing to
	 * update the same inode.  Use cmpxchg() to tell the winner.
	 */
	if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
		wb_put(wb);
}
EXPORT_SYMBOL_GPL(__inode_attach_wb);
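
/*
 * Note: this is normally reached through inode_attach_wb() from
 * __mark_inode_dirty() below, so an inode picks up its wb association
 * the first time it is dirtied.  Losing the cmpxchg() race is harmless;
 * the loser just drops the reference it took on its candidate wb.
 */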

/**
 * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
 * @inode: inode of interest with i_lock held
 *
 * Returns @inode's wb with its list_lock held.  @inode->i_lock must be
 * held on entry and is released on return.  The returned wb is guaranteed
 * to stay @inode's associated wb until its list_lock is released.
 */
static struct bdi_writeback *
locked_inode_to_wb_and_lock_list(struct inode *inode)
	__releases(&inode->i_lock)
	__acquires(&wb->list_lock)
{
	while (true) {
		struct bdi_writeback *wb = inode_to_wb(inode);

		/*
		 * inode_to_wb() association is protected by both
		 * @inode->i_lock and @wb->list_lock but list_lock nests
		 * outside i_lock.  Deref i_wb and hold list_lock to verify
		 * that the association hasn't changed after acquiring
		 * list_lock.
		 */
		wb_get(wb);
		spin_unlock(&inode->i_lock);
		spin_lock(&wb->list_lock);

		/* i_wb may have changed inbetween, can't use inode_to_wb() */
		if (likely(wb == inode->i_wb)) {
			wb_put(wb);	/* @inode already has ref */
			return wb;
		}

		spin_unlock(&wb->list_lock);
		wb_put(wb);
		cpu_relax();
		spin_lock(&inode->i_lock);
	}
}

/**
 * inode_to_wb_and_lock_list - determine an inode's wb and lock it
 * @inode: inode of interest
 *
 * Same as locked_inode_to_wb_and_lock_list() but @inode->i_lock isn't held
 * on entry.
 */
static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
	__acquires(&wb->list_lock)
{
	spin_lock(&inode->i_lock);
	return locked_inode_to_wb_and_lock_list(inode);
}

struct inode_switch_wbs_context {
	struct inode		*inode;
	struct bdi_writeback	*new_wb;

	struct rcu_head		rcu_head;
	struct work_struct	work;
};

static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi)
{
	down_write(&bdi->wb_switch_rwsem);
}

static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi)
{
	up_write(&bdi->wb_switch_rwsem);
}

static void inode_switch_wbs_work_fn(struct work_struct *work)
{
	struct inode_switch_wbs_context *isw =
		container_of(work, struct inode_switch_wbs_context, work);
	struct inode *inode = isw->inode;
	struct backing_dev_info *bdi = inode_to_bdi(inode);
	struct address_space *mapping = inode->i_mapping;
	struct bdi_writeback *old_wb = inode->i_wb;
	struct bdi_writeback *new_wb = isw->new_wb;
	XA_STATE(xas, &mapping->i_pages, 0);
	struct page *page;
	bool switched = false;

	/*
	 * If @inode switches cgwb membership while sync_inodes_sb() is
	 * being issued, sync_inodes_sb() might miss it.  Synchronize.
	 */
	down_read(&bdi->wb_switch_rwsem);

	/*
	 * By the time control reaches here, RCU grace period has passed
	 * since I_WB_SWITCH assertion and all wb stat update transactions
	 * between unlocked_inode_to_wb_begin/end() are guaranteed to be
	 * visible.
	 *
	 * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock
	 * gives us exclusion against all wb related operations on @inode
	 * including IO list manipulations and stat updates.
	 */
	if (old_wb < new_wb) {
		spin_lock(&old_wb->list_lock);
		spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
	} else {
		spin_lock(&new_wb->list_lock);
		spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
	}
	spin_lock(&inode->i_lock);
	xa_lock_irq(&mapping->i_pages);

	/*
	 * Once I_FREEING is visible under i_lock, the eviction path owns
	 * the inode and we shouldn't modify ->i_io_list.
	 */
	if (unlikely(inode->i_state & I_FREEING))
		goto skip_switch;

	/*
	 * Count and transfer stats.  Note that PAGECACHE_TAG_DIRTY points
	 * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to
	 * pages actually under writeback.
	 */
	xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_DIRTY) {
		if (PageDirty(page)) {
			dec_wb_stat(old_wb, WB_RECLAIMABLE);
			inc_wb_stat(new_wb, WB_RECLAIMABLE);
		}
	}

	xas_set(&xas, 0);
	xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_WRITEBACK) {
		WARN_ON_ONCE(!PageWriteback(page));
		dec_wb_stat(old_wb, WB_WRITEBACK);
		inc_wb_stat(new_wb, WB_WRITEBACK);
	}

	wb_get(new_wb);

	/*
	 * Transfer to @new_wb's IO list if necessary.  The specific list
	 * @inode was on is ignored and the inode is put on ->b_dirty which
	 * is always correct including from ->b_dirty_time.  The transfer
	 * preserves @inode->dirtied_when ordering.
	 */
	if (!list_empty(&inode->i_io_list)) {
		struct inode *pos;

		inode_io_list_del_locked(inode, old_wb);
		inode->i_wb = new_wb;
		list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
			if (time_after_eq(inode->dirtied_when,
					  pos->dirtied_when))
				break;
		inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev);
	} else {
		inode->i_wb = new_wb;
	}

	/* i_wb_frn updates may race wbc_detach_inode() but doesn't matter */
	inode->i_wb_frn_winner = 0;
	inode->i_wb_frn_avg_time = 0;
	inode->i_wb_frn_history = 0;
	switched = true;
skip_switch:
	/*
	 * Paired with load_acquire in unlocked_inode_to_wb_begin() and
	 * ensures that the new wb is visible if they see !I_WB_SWITCH.
	 */
	smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);

	xa_unlock_irq(&mapping->i_pages);
	spin_unlock(&inode->i_lock);
	spin_unlock(&new_wb->list_lock);
	spin_unlock(&old_wb->list_lock);

	up_read(&bdi->wb_switch_rwsem);

	if (switched) {
		wb_wakeup(new_wb);
		wb_put(old_wb);
	}
	wb_put(new_wb);

	iput(inode);
	kfree(isw);

	atomic_dec(&isw_nr_in_flight);
}

static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
{
	struct inode_switch_wbs_context *isw = container_of(rcu_head,
				struct inode_switch_wbs_context, rcu_head);

	/* needs to grab bh-unsafe locks, bounce to work item */
	INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
	queue_work(isw_wq, &isw->work);
}
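
/*
 * The switch protocol, in order: inode_switch_wbs() marks the inode with
 * I_WB_SWITCH and fires an RCU callback; the callback bounces to a work
 * item because the stat transfer needs bh-unsafe locks; by the time
 * inode_switch_wbs_work_fn() runs, a full RCU grace period has passed,
 * so every lockless wb stat update that could have seen the old
 * association has completed and the transfer can be done under locks.
 */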

/**
 * inode_switch_wbs - change the wb association of an inode
 * @inode: target inode
 * @new_wb_id: ID of the new wb
 *
 * Switch @inode's wb association to the wb identified by @new_wb_id.  The
 * switching is performed asynchronously and may fail silently.
 */
static void inode_switch_wbs(struct inode *inode, int new_wb_id)
{
	struct backing_dev_info *bdi = inode_to_bdi(inode);
	struct cgroup_subsys_state *memcg_css;
	struct inode_switch_wbs_context *isw;

	/* noop if seems to be already in progress */
	if (inode->i_state & I_WB_SWITCH)
		return;

	/*
	 * Avoid starting new switches while sync_inodes_sb() is in
	 * progress.  Otherwise, if the down_write protected issue path
	 * blocks heavily, we might end up starting a large number of
	 * switches which will block on the rwsem.
	 */
	if (!down_read_trylock(&bdi->wb_switch_rwsem))
		return;

	isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
	if (!isw)
		goto out_unlock;

	/* find and pin the new wb */
	rcu_read_lock();
	memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys);
	if (memcg_css)
		isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
	rcu_read_unlock();
	if (!isw->new_wb)
		goto out_free;

	/* while holding I_WB_SWITCH, no one else can update the association */
	spin_lock(&inode->i_lock);
	if (!(inode->i_sb->s_flags & SB_ACTIVE) ||
	    inode->i_state & (I_WB_SWITCH | I_FREEING) ||
	    inode_to_wb(inode) == isw->new_wb) {
		spin_unlock(&inode->i_lock);
		goto out_free;
	}
	inode->i_state |= I_WB_SWITCH;
	__iget(inode);
	spin_unlock(&inode->i_lock);

	isw->inode = inode;

	/*
	 * In addition to synchronizing among switchers, I_WB_SWITCH tells
	 * the RCU protected stat update paths to grab the i_pages lock so
	 * that stat transfer can synchronize against them.  Let's continue
	 * after I_WB_SWITCH is guaranteed to be visible.
	 */
	call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);

	atomic_inc(&isw_nr_in_flight);

	goto out_unlock;

out_free:
	if (isw->new_wb)
		wb_put(isw->new_wb);
	kfree(isw);
out_unlock:
	up_read(&bdi->wb_switch_rwsem);
}

/**
 * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
 * @wbc: writeback_control of interest
 * @inode: target inode
 *
 * @inode is locked and about to be written back under the control of @wbc.
 * Record @inode's writeback context into @wbc and unlock the i_lock.  On
 * writeback completion, wbc_detach_inode() should be called.  This is used
 * to track the cgroup writeback context.
 */
void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
				 struct inode *inode)
{
	if (!inode_cgwb_enabled(inode)) {
		spin_unlock(&inode->i_lock);
		return;
	}

	wbc->wb = inode_to_wb(inode);
	wbc->inode = inode;

	wbc->wb_id = wbc->wb->memcg_css->id;
	wbc->wb_lcand_id = inode->i_wb_frn_winner;
	wbc->wb_tcand_id = 0;
	wbc->wb_bytes = 0;
	wbc->wb_lcand_bytes = 0;
	wbc->wb_tcand_bytes = 0;

	wb_get(wbc->wb);
	spin_unlock(&inode->i_lock);

	/*
	 * A dying wb indicates that the memcg-blkcg mapping has changed
	 * and a new wb is already serving the memcg.  Switch immediately.
	 */
	if (unlikely(wb_dying(wbc->wb)))
		inode_switch_wbs(inode, wbc->wb_id);
}
EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode);

/**
 * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
 * @wbc: writeback_control of the just finished writeback
 *
 * To be called after a writeback attempt of an inode finishes and undoes
 * wbc_attach_and_unlock_inode().  Can be called under any context.
 *
 * As concurrent write sharing of an inode is expected to be very rare and
 * memcg only tracks page ownership on first-use basis severely confining
 * the usefulness of such sharing, cgroup writeback tracks ownership
 * per-inode.  While the support for concurrent write sharing of an inode
 * is deemed unnecessary, an inode being written to by different cgroups at
 * different points in time is a lot more common, and, more importantly,
 * charging only by first-use can too readily lead to grossly incorrect
 * behaviors (single foreign page can lead to gigabytes of writeback being
 * incorrectly attributed).
 *
 * To resolve this issue, cgroup writeback detects the majority dirtier of
 * an inode and transfers the ownership to it.  To avoid unnecessary
 * oscillation, the detection mechanism keeps track of history and gives
 * out the switch verdict only if the foreign usage pattern is stable over
 * a certain amount of time and/or writeback attempts.
 *
 * On each writeback attempt, @wbc tries to detect the majority writer
 * using Boyer-Moore majority vote algorithm.  In addition to the byte
 * count from the majority voting, it also counts the bytes written for the
 * current wb and the last round's winner wb (max of last round's current
 * wb, the winner from two rounds ago, and the last round's majority
 * candidate).  Keeping track of the historical winner helps the algorithm
 * to semi-reliably detect the most active writer even when it's not the
 * absolute majority.
 *
 * On detecting switch, either the current wb or last winner wb, the inode
 * will be switched to.
 */
void wbc_detach_inode(struct writeback_control *wbc)
{
	struct bdi_writeback *wb = wbc->wb;
	struct inode *inode = wbc->inode;
	unsigned long avg_time, max_bytes, max_time;
	u16 history;
	int max_id;

	if (!wb)
		return;

	history = inode->i_wb_frn_history;
	avg_time = inode->i_wb_frn_avg_time;

	/* pick the winner of this round */
	if (wbc->wb_bytes >= wbc->wb_lcand_bytes &&
	    wbc->wb_bytes >= wbc->wb_tcand_bytes) {
		max_id = wbc->wb_id;
		max_bytes = wbc->wb_bytes;
	} else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) {
		max_id = wbc->wb_lcand_id;
		max_bytes = wbc->wb_lcand_bytes;
	} else {
		max_id = wbc->wb_tcand_id;
		max_bytes = wbc->wb_tcand_bytes;
	}

	/*
	 * Calculate the amount of IO time the winner consumed and fold it
	 * into the running average kept per inode.  If the consumed IO
	 * time is lower than avg / WB_FRN_TIME_CUT_DIV, ignore it for
	 * deciding whether to switch or not.  This is to prevent one-off
	 * small dirtiers from skewing the verdict.
	 */
	max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT,
				wb->avg_write_bandwidth);
	if (avg_time)
		avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) -
			    (avg_time >> WB_FRN_TIME_AVG_SHIFT);
	else
		avg_time = max_time;	/* immediate catch up on first run */

	if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) {
		int slots;

		/*
		 * The switch verdict is reached if foreign wb's consume
		 * enough consecutive IO time slots.  This is loosely
		 * tracked by a 16 slot history mask where each bit
		 * represents one sixteenth of the period.  Determine the
		 * number of slots to shift into history from @max_time.
		 */
		slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT),
			    (unsigned long)WB_FRN_HIST_MAX_SLOTS);
		history <<= slots;
		if (wbc->wb_id != max_id)
			history |= (1U << slots) - 1;

		/*
		 * Switch if the current wb isn't the consistent winner.
		 * If there are multiple closely competing dirtiers, the
		 * inode may switch across them repeatedly over time, which
		 * is okay.  The main goal is avoiding keeping an inode on
		 * the wrong wb for an extended period of time.
		 */
		if (hweight32(history) > WB_FRN_HIST_THR_SLOTS)
			inode_switch_wbs(inode, max_id);
	}

	/*
	 * Multiple instances of this function may race to update the
	 * following fields but we don't mind occasional inaccuracies.
	 */
	inode->i_wb_frn_winner = max_id;
	inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX);
	inode->i_wb_frn_history = history;

	wb_put(wbc->wb);
	wbc->wb = NULL;
}
EXPORT_SYMBOL_GPL(wbc_detach_inode);
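
/*
 * Worked example of the history bookkeeping above: a round where a
 * foreign cgroup wrote the most bytes and consumed about three history
 * units of IO time shifts 3 bits into i_wb_frn_history and sets them
 * (a foreign round); a round won by the current wb shifts in zeros
 * instead.  Only when more than WB_FRN_HIST_THR_SLOTS (8) of the 16
 * bits are set - i.e. foreign writers dominated most of the tracked
 * window - does inode_switch_wbs() get invoked.
 */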

/**
 * wbc_account_cgroup_owner - account writeback to update inode cgroup ownership
 * @wbc: writeback_control of the writeback in progress
 * @page: page being written out
 * @bytes: number of bytes being written out
 *
 * @bytes from @page are about to be written out during the writeback
 * controlled by @wbc.  Keep the book for foreign inode detection.  See
 * wbc_detach_inode().
 */
void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
			      size_t bytes)
{
	struct cgroup_subsys_state *css;
	int id;

	/*
	 * pageout() path doesn't attach @wbc to the inode being written
	 * out.  This is intentional as we don't want the function to block
	 * behind a slow cgroup.  Ultimately, we want pageout() to kick off
	 * regular writeback instead of writing things out itself.
	 */
	if (!wbc->wb || wbc->no_cgroup_owner)
		return;

	css = mem_cgroup_css_from_page(page);
	/* dead cgroups shouldn't contribute to inode ownership arbitration */
	if (!(css->flags & CSS_ONLINE))
		return;

	id = css->id;

	if (id == wbc->wb_id) {
		wbc->wb_bytes += bytes;
		return;
	}

	if (id == wbc->wb_lcand_id)
		wbc->wb_lcand_bytes += bytes;

	/* Boyer-Moore majority vote algorithm */
	if (!wbc->wb_tcand_bytes)
		wbc->wb_tcand_id = id;
	if (id == wbc->wb_tcand_id)
		wbc->wb_tcand_bytes += bytes;
	else
		wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
}
EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner);
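
/*
 * The last three statements above are a streaming majority vote: the
 * current candidate's byte count grows when the writer matches it and
 * shrinks otherwise, and a new candidate is adopted whenever the count
 * hits zero.  If some cgroup owns an absolute majority of the bytes in
 * a round, it is guaranteed to end up as wb_tcand_id, without having
 * to track a byte counter per cgroup.
 */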

/**
 * inode_congested - test whether an inode is congested
 * @inode: inode to test for congestion (may be NULL)
 * @cong_bits: mask of WB_[a]sync_congested bits to test
 *
 * Tests whether @inode is congested.  @cong_bits is the mask of congestion
 * bits to test and the return value is the mask of set bits.
 *
 * If cgroup writeback is enabled for @inode, the congestion state is
 * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg
 * associated with @inode is congested; otherwise, the root wb's congestion
 * state is used.
 *
 * @inode is allowed to be NULL as this function is often called on
 * mapping->host which is NULL for the swapper space.
 */
int inode_congested(struct inode *inode, int cong_bits)
{
	/*
	 * Once set, ->i_wb never becomes NULL while the inode is alive.
	 * Start transaction iff ->i_wb is visible.
	 */
	if (inode && inode_to_wb_is_valid(inode)) {
		struct bdi_writeback *wb;
		struct wb_lock_cookie lock_cookie = {};
		bool congested;

		wb = unlocked_inode_to_wb_begin(inode, &lock_cookie);
		congested = wb_congested(wb, cong_bits);
		unlocked_inode_to_wb_end(inode, &lock_cookie);
		return congested;
	}

	return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
}
EXPORT_SYMBOL_GPL(inode_congested);

/**
 * wb_split_bdi_pages - split nr_pages to write according to bandwidth
 * @wb: target bdi_writeback to split @nr_pages to
 * @nr_pages: number of pages to write for the whole bdi
 *
 * Split @wb's portion of @nr_pages according to @wb's write bandwidth in
 * relation to the total write bandwidth of all wb's w/ dirty inodes on
 * @wb->bdi.
 */
static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
{
	unsigned long this_bw = wb->avg_write_bandwidth;
	unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);

	if (nr_pages == LONG_MAX)
		return LONG_MAX;

	/*
	 * This may be called on clean wb's and proportional distribution
	 * may not make sense, just use the original @nr_pages in those
	 * cases.  In general, we wanna err on the side of writing more.
	 */
	if (!tot_bw || this_bw >= tot_bw)
		return nr_pages;
	else
		return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw);
}
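
/*
 * E.g. a wb averaging 25MB/s on a bdi whose dirty wb's total 100MB/s
 * gets DIV_ROUND_UP_ULL(1024 * 25, 100) = 256 pages of a 1024 page
 * request; rounding up means the per-wb shares may sum to slightly
 * more than @nr_pages, which is fine for writeback.
 */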

/**
 * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
 * @bdi: target backing_dev_info
 * @base_work: wb_writeback_work to issue
 * @skip_if_busy: skip wb's which already have writeback in progress
 *
 * Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which
 * have dirty inodes.  If @base_work->nr_pages isn't %LONG_MAX, it's
 * distributed to the busy wbs according to each wb's proportion in the
 * total active write bandwidth of @bdi.
 */
static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
				  struct wb_writeback_work *base_work,
				  bool skip_if_busy)
{
	struct bdi_writeback *last_wb = NULL;
	struct bdi_writeback *wb = list_entry(&bdi->wb_list,
					      struct bdi_writeback, bdi_node);

	might_sleep();
restart:
	rcu_read_lock();
	list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
		DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
		struct wb_writeback_work fallback_work;
		struct wb_writeback_work *work;
		long nr_pages;

		if (last_wb) {
			wb_put(last_wb);
			last_wb = NULL;
		}

		/* SYNC_ALL writes out I_DIRTY_TIME too */
		if (!wb_has_dirty_io(wb) &&
		    (base_work->sync_mode == WB_SYNC_NONE ||
		     list_empty(&wb->b_dirty_time)))
			continue;
		if (skip_if_busy && writeback_in_progress(wb))
			continue;

		nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);

		work = kmalloc(sizeof(*work), GFP_ATOMIC);
		if (work) {
			*work = *base_work;
			work->nr_pages = nr_pages;
			work->auto_free = 1;
			wb_queue_work(wb, work);
			continue;
		}

		/* alloc failed, execute synchronously using on-stack fallback */
		work = &fallback_work;
		*work = *base_work;
		work->nr_pages = nr_pages;
		work->auto_free = 0;
		work->done = &fallback_work_done;

		wb_queue_work(wb, work);

		/*
		 * Pin @wb so that it stays on @bdi->wb_list.  This allows
		 * continuing iteration from @wb after dropping and
		 * regrabbing rcu read lock.
		 */
		wb_get(wb);
		last_wb = wb;

		rcu_read_unlock();
		wb_wait_for_completion(bdi, &fallback_work_done);
		goto restart;
	}
	rcu_read_unlock();

	if (last_wb)
		wb_put(last_wb);
}

/**
 * cgroup_writeback_umount - flush inode wb switches for umount
 *
 * This function is called when a super_block is about to be destroyed and
 * flushes in-flight inode wb switches.  An inode wb switch goes through
 * RCU and then workqueue, so the two need to be flushed in order to ensure
 * that all previously scheduled switches are finished.  As wb switches are
 * rare occurrences and synchronize_rcu() can take a while, perform
 * flushing iff wb switches are in flight.
 */
void cgroup_writeback_umount(void)
{
	if (atomic_read(&isw_nr_in_flight)) {
		/*
		 * Use rcu_barrier() to wait for all pending callbacks to
		 * ensure that all in-flight wb switches are in the workqueue.
		 */
		rcu_barrier();
		flush_workqueue(isw_wq);
	}
}

static int __init cgroup_writeback_init(void)
{
	isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
	if (!isw_wq)
		return -ENOMEM;
	return 0;
}
fs_initcall(cgroup_writeback_init);

#else	/* CONFIG_CGROUP_WRITEBACK */

static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }

static struct bdi_writeback *
locked_inode_to_wb_and_lock_list(struct inode *inode)
	__releases(&inode->i_lock)
	__acquires(&wb->list_lock)
{
	struct bdi_writeback *wb = inode_to_wb(inode);

	spin_unlock(&inode->i_lock);
	spin_lock(&wb->list_lock);
	return wb;
}

static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
	__acquires(&wb->list_lock)
{
	struct bdi_writeback *wb = inode_to_wb(inode);

	spin_lock(&wb->list_lock);
	return wb;
}

static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
{
	return nr_pages;
}

static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
				  struct wb_writeback_work *base_work,
				  bool skip_if_busy)
{
	might_sleep();

	if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) {
		base_work->auto_free = 0;
		wb_queue_work(&bdi->wb, base_work);
	}
}

#endif	/* CONFIG_CGROUP_WRITEBACK */

/*
 * Add in the number of potentially dirty inodes, because each inode
 * write can dirty pagecache in the underlying blockdev.
 */
static unsigned long get_nr_dirty_pages(void)
{
	return global_node_page_state(NR_FILE_DIRTY) +
		global_node_page_state(NR_UNSTABLE_NFS) +
		get_nr_dirty_inodes();
}

static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason)
{
	if (!wb_has_dirty_io(wb))
		return;

	/*
	 * All callers of this function want to start writeback of all
	 * dirty pages.  Places like vmscan can call this at a very
	 * high frequency, causing pointless allocations of tons of
	 * work items and keeping the flusher threads busy retrieving
	 * that work.  Ensure that we only allow one of them pending and
	 * inflight at the time.
	 */
	if (test_bit(WB_start_all, &wb->state) ||
	    test_and_set_bit(WB_start_all, &wb->state))
		return;

	wb->start_all_reason = reason;
	wb_wakeup(wb);
}
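
/*
 * The double test above is deliberate: the plain test_bit() is a cheap
 * read that skips the atomic read-modify-write of test_and_set_bit()
 * in the common case where a flush-everything request is already
 * pending, avoiding needless cacheline contention on wb->state.
 */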

/**
 * wb_start_background_writeback - start background writeback
 * @wb: bdi_writback to write from
 *
 * Description:
 *   This makes sure WB_SYNC_NONE background writeback happens.  When
 *   this function returns, it is only guaranteed that for given wb
 *   some IO is happening if we are over background dirty threshold.
 *   Caller need not hold sb s_umount semaphore.
 */
void wb_start_background_writeback(struct bdi_writeback *wb)
{
	/*
	 * We just wake up the flusher thread.  It will perform background
	 * writeback as soon as there is no other work to do.
	 */
	trace_writeback_wake_background(wb);
	wb_wakeup(wb);
}

/*
 * Remove the inode from the writeback list it is on.
 */
void inode_io_list_del(struct inode *inode)
{
	struct bdi_writeback *wb;

	wb = inode_to_wb_and_lock_list(inode);
	inode_io_list_del_locked(inode, wb);
	spin_unlock(&wb->list_lock);
}

/*
 * mark an inode as under writeback on the sb
 */
void sb_mark_inode_writeback(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	unsigned long flags;

	if (list_empty(&inode->i_wb_list)) {
		spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
		if (list_empty(&inode->i_wb_list)) {
			list_add_tail(&inode->i_wb_list, &sb->s_inodes_wb);
			trace_sb_mark_inode_writeback(inode);
		}
		spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
	}
}
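
/*
 * The list_empty() check is done twice on purpose: the unlocked check
 * keeps the common case (inode already on, or already off, the list)
 * from taking s_inode_wblist_lock at all, and the locked recheck makes
 * the actual list manipulation race-free.  sb_clear_inode_writeback()
 * below uses the same double-checked pattern in reverse.
 */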

/*
 * clear an inode as under writeback on the sb
 */
void sb_clear_inode_writeback(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	unsigned long flags;

	if (!list_empty(&inode->i_wb_list)) {
		spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
		if (!list_empty(&inode->i_wb_list)) {
			list_del_init(&inode->i_wb_list);
			trace_sb_clear_inode_writeback(inode);
		}
		spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
	}
}

/*
 * Redirty an inode: set its when-it-was-dirtied timestamp and move it to the
 * furthest end of its superblock's dirty-inode list.
 *
 * Before stamping the inode's ->dirtied_when, we check to see whether it is
 * "newer" than the one on the tail of the list.  If so, then we don't reset
 * ->dirtied_when, to preserve the time-ordering of the list.
 */
static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
{
	if (!list_empty(&wb->b_dirty)) {
		struct inode *tail;

		tail = wb_inode(wb->b_dirty.next);
		if (time_before(inode->dirtied_when, tail->dirtied_when))
			inode->dirtied_when = jiffies;
	}
	inode_io_list_move_locked(inode, wb, &wb->b_dirty);
}

/*
 * requeue inode for re-scanning after bdi->b_io list is exhausted.
 */
static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
{
	inode_io_list_move_locked(inode, wb, &wb->b_more_io);
}

static void inode_sync_complete(struct inode *inode)
{
	inode->i_state &= ~I_SYNC;
	/* If inode is clean and unused, put it into LRU now. */
	inode_add_lru(inode);
	/* Waiters must see I_SYNC cleared before being woken up */
	smp_mb();
	wake_up_bit(&inode->i_state, __I_SYNC);
}

static bool inode_dirtied_after(struct inode *inode, unsigned long t)
{
	bool ret = time_after(inode->dirtied_when, t);
#ifndef CONFIG_64BIT
	/*
	 * For inodes being constantly redirtied, dirtied_when can get stuck.
	 * It _appears_ to be in the future, but is actually in distant past.
	 * This test is necessary to prevent such wrapped-around relative times
	 * from causing pathological behaviour of the kupdate-style writeback.
	 */
	ret = ret && time_before_eq(inode->dirtied_when, jiffies);
#endif
	return ret;
}

#define EXPIRE_DIRTY_ATIME 0x0001

/*
 * Move expired (dirtied before work->older_than_this) dirty inodes from
 * @delaying_queue to @dispatch_queue.
 */
static int move_expired_inodes(struct list_head *delaying_queue,
			       struct list_head *dispatch_queue,
			       int flags,
			       struct wb_writeback_work *work)
{
	unsigned long *older_than_this = NULL;
	unsigned long expire_time;
	LIST_HEAD(tmp);
	struct list_head *pos, *node;
	struct super_block *sb = NULL;
	struct inode *inode;
	int do_sb_sort = 0;
	int moved = 0;

	if ((flags & EXPIRE_DIRTY_ATIME) == 0)
		older_than_this = work->older_than_this;
	else if (!work->for_sync) {
		expire_time = jiffies - (dirtytime_expire_interval * HZ);
		older_than_this = &expire_time;
	}
	while (!list_empty(delaying_queue)) {
		inode = wb_inode(delaying_queue->prev);
		if (older_than_this &&
		    inode_dirtied_after(inode, *older_than_this))
			break;
		list_move(&inode->i_io_list, &tmp);
		moved++;
		if (flags & EXPIRE_DIRTY_ATIME)
			set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state);
		if (sb_is_blkdev_sb(inode->i_sb))
			continue;
		if (sb && sb != inode->i_sb)
			do_sb_sort = 1;
		sb = inode->i_sb;
	}

	/* just one sb in list, splice to dispatch_queue and we're done */
	if (!do_sb_sort) {
		list_splice(&tmp, dispatch_queue);
		goto out;
	}

	/* Move inodes from one superblock together. */
	while (!list_empty(&tmp)) {
		sb = wb_inode(tmp.prev)->i_sb;
		list_for_each_prev_safe(pos, node, &tmp) {
			inode = wb_inode(pos);
			if (inode->i_sb == sb)
				list_move(&inode->i_io_list, dispatch_queue);
		}
	}
out:
	return moved;
}

/*
 * Queue all expired dirty inodes for io, eldest first.
 * Before
 *         newly dirtied     b_dirty    b_io    b_more_io
 *         =============>    gf         edc     BA
 * After
 *         newly dirtied     b_dirty    b_io    b_more_io
 *         =============>    g          fBAedc
 *                                           |
 *                                           +--> dequeue for IO
 */
static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
{
	int moved;

	assert_spin_locked(&wb->list_lock);
	list_splice_init(&wb->b_more_io, &wb->b_io);
	moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work);
	moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
				     EXPIRE_DIRTY_ATIME, work);
	if (moved)
		wb_io_lists_populated(wb);
	trace_writeback_queue_io(wb, work, moved);
}

static int write_inode(struct inode *inode, struct writeback_control *wbc)
{
	int ret;

	if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) {
		trace_writeback_write_inode_start(inode, wbc);
		ret = inode->i_sb->s_op->write_inode(inode, wbc);
		trace_writeback_write_inode(inode, wbc);
		return ret;
	}
	return 0;
}

/*
 * Wait for writeback on an inode to complete.  Called with i_lock held.
 * Caller must make sure inode cannot go away when we drop i_lock.
 */
static void __inode_wait_for_writeback(struct inode *inode)
	__releases(inode->i_lock)
	__acquires(inode->i_lock)
{
	DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
	wait_queue_head_t *wqh;

	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
	while (inode->i_state & I_SYNC) {
		spin_unlock(&inode->i_lock);
		__wait_on_bit(wqh, &wq, bit_wait,
			      TASK_UNINTERRUPTIBLE);
		spin_lock(&inode->i_lock);
	}
}

/*
 * Wait for writeback on an inode to complete.  Caller must have inode pinned.
 */
void inode_wait_for_writeback(struct inode *inode)
{
	spin_lock(&inode->i_lock);
	__inode_wait_for_writeback(inode);
	spin_unlock(&inode->i_lock);
}

/*
 * Sleep until I_SYNC is cleared.  This function must be called with i_lock
 * held and drops it.  It is aimed for callers not holding any inode reference
 * so once i_lock is dropped, inode can go away.
 */
static void inode_sleep_on_writeback(struct inode *inode)
	__releases(inode->i_lock)
{
	DEFINE_WAIT(wait);
	wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
	int sleep;

	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
	sleep = inode->i_state & I_SYNC;
	spin_unlock(&inode->i_lock);
	if (sleep)
		schedule();
	finish_wait(wqh, &wait);
}

/*
 * Find proper writeback list for the inode depending on its current state and
 * possibly also change of its state while we were doing writeback.  Here we
 * handle things such as livelock prevention or fairness of writeback among
 * inodes.  This function can be called only by flusher thread - noone else
 * processes all inodes in writeback lists and requeueing inodes behind
 * flusher thread's back can have unexpected consequences.
 */
static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
			  struct writeback_control *wbc)
{
	if (inode->i_state & I_FREEING)
		return;

	/*
	 * Sync livelock prevention.  Each inode is tagged and synced in one
	 * shot.  If still dirty, it will be redirty_tail()'ed below.  Update
	 * the dirty time to prevent enqueue and sync it again.
	 */
	if ((inode->i_state & I_DIRTY) &&
	    (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
		inode->dirtied_when = jiffies;

	if (wbc->pages_skipped) {
		/*
		 * writeback is not making progress due to locked
		 * buffers.  Skip this inode for now.
		 */
		redirty_tail(inode, wb);
		return;
	}

	if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
		/*
		 * We didn't write back all the pages.  nfs_writepages()
		 * sometimes bales out without doing anything.
		 */
		if (wbc->nr_to_write <= 0) {
			/* Slice used up.  Queue for next turn. */
			requeue_io(inode, wb);
		} else {
			/*
			 * Writeback blocked by something other than
			 * congestion.  Delay the inode for some time to
			 * avoid spinning on the CPU (100% iowait)
			 * retrying writeback of the dirty page/inode
			 * that cannot be performed immediately.
			 */
			redirty_tail(inode, wb);
		}
	} else if (inode->i_state & I_DIRTY) {
		/*
		 * Filesystems can dirty the inode during writeback operations,
		 * such as delayed allocation during submission or metadata
		 * updates after data IO completion.
		 */
		redirty_tail(inode, wb);
	} else if (inode->i_state & I_DIRTY_TIME) {
		inode->dirtied_when = jiffies;
		inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
	} else {
		/* The inode is clean.  Remove from writeback lists. */
		inode_io_list_del_locked(inode, wb);
	}
}

/*
 * Write out an inode and its dirty pages.  Do not update the writeback list
 * linkage.  That is left to the caller.  The caller is also responsible for
 * setting the I_SYNC flag and calling inode_sync_complete() to clear it.
 */
static int
__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
{
	struct address_space *mapping = inode->i_mapping;
	long nr_to_write = wbc->nr_to_write;
	unsigned dirty;
	int ret;

	WARN_ON(!(inode->i_state & I_SYNC));

	trace_writeback_single_inode_start(inode, wbc, nr_to_write);

	ret = do_writepages(mapping, wbc);

	/*
	 * Make sure to wait on the data before writing out the metadata.
	 * This is important for filesystems that modify metadata on data
	 * I/O completion.  We don't do it for sync(2) writeback because it
	 * has a separate, external IO completion path and ->sync_fs for
	 * guaranteeing inode metadata is written back correctly.
	 */
	if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
		int err = filemap_fdatawait(mapping);
		if (ret == 0)
			ret = err;
	}

	/*
	 * Some filesystems may redirty the inode during the writeback
	 * due to delalloc, clear dirty metadata flags right before
	 * write_inode()
	 */
	spin_lock(&inode->i_lock);

	dirty = inode->i_state & I_DIRTY;
	if (inode->i_state & I_DIRTY_TIME) {
		if ((dirty & I_DIRTY_INODE) ||
		    wbc->sync_mode == WB_SYNC_ALL ||
		    unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) ||
		    unlikely(time_after(jiffies,
					(inode->dirtied_time_when +
					 dirtytime_expire_interval * HZ)))) {
			dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED;
			trace_writeback_lazytime(inode);
		}
	} else
		inode->i_state &= ~I_DIRTY_TIME_EXPIRED;
	inode->i_state &= ~dirty;

	/*
	 * Paired with smp_mb() in __mark_inode_dirty().  This allows
	 * __mark_inode_dirty() to test i_state without grabbing i_lock -
	 * either they see the I_DIRTY bits cleared or we see the dirtied
	 * inode.
	 *
	 * I_DIRTY_PAGES is always cleared together above even if @mapping
	 * still has dirty pages.  The flag is reinstated after smp_mb() if
	 * necessary.  This guarantees that either __mark_inode_dirty()
	 * sees clear I_DIRTY_PAGES or we see PAGECACHE_TAG_DIRTY.
	 */
	smp_mb();

	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
		inode->i_state |= I_DIRTY_PAGES;

	spin_unlock(&inode->i_lock);

	if (dirty & I_DIRTY_TIME)
		mark_inode_dirty_sync(inode);
	/* Don't write the inode if only I_DIRTY_PAGES was set */
	if (dirty & ~I_DIRTY_PAGES) {
		int err = write_inode(inode, wbc);
		if (ret == 0)
			ret = err;
	}
	trace_writeback_single_inode(inode, wbc, nr_to_write);
	return ret;
}

/*
 * Write out an inode's dirty pages.  Either the caller has an active reference
 * on the inode or the inode has I_WILL_FREE set.
 *
 * This function is designed to be called for writing back one inode which
 * we go e.g. from filesystem.  Flusher thread uses __writeback_single_inode()
 * and does more profound writeback list handling in writeback_sb_inodes().
 */
static int writeback_single_inode(struct inode *inode,
				  struct writeback_control *wbc)
{
	struct bdi_writeback *wb;
	int ret = 0;

	spin_lock(&inode->i_lock);
	if (!atomic_read(&inode->i_count))
		WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
	else
		WARN_ON(inode->i_state & I_WILL_FREE);

	if (inode->i_state & I_SYNC) {
		if (wbc->sync_mode != WB_SYNC_ALL)
			goto out;
		/*
		 * It's a data-integrity sync.  We must wait.  Since callers
		 * hold an inode reference or the inode has I_WILL_FREE set,
		 * it cannot go away under us.
		 */
		__inode_wait_for_writeback(inode);
	}
	WARN_ON(inode->i_state & I_SYNC);
	/*
	 * Skip inode if it is clean and we have no outstanding writeback in
	 * WB_SYNC_ALL mode.  We don't want to mess with writeback lists in
	 * this function since flusher thread may be doing for example sync in
	 * parallel and if we move the inode, it could get skipped.  So here we
	 * make sure inode is on some writeback list and leave it there unless
	 * we have completely cleaned the inode.
	 */
	if (!(inode->i_state & I_DIRTY_ALL) &&
	    (wbc->sync_mode != WB_SYNC_ALL ||
	     !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
		goto out;
	inode->i_state |= I_SYNC;
	wbc_attach_and_unlock_inode(wbc, inode);

	ret = __writeback_single_inode(inode, wbc);

	wbc_detach_inode(wbc);

	wb = inode_to_wb_and_lock_list(inode);
	spin_lock(&inode->i_lock);
	/*
	 * If inode is clean, remove it from writeback lists.  Otherwise don't
	 * touch it.  See comment above for explanation.
	 */
	if (!(inode->i_state & I_DIRTY_ALL))
		inode_io_list_del_locked(inode, wb);
	spin_unlock(&wb->list_lock);
	inode_sync_complete(inode);
out:
	spin_unlock(&inode->i_lock);
	return ret;
}

static long writeback_chunk_size(struct bdi_writeback *wb,
				 struct wb_writeback_work *work)
{
	long pages;

	/*
	 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
	 * inodes/pages in one big loop.  Setting wbc.nr_to_write=LONG_MAX
	 * here avoids calling into writeback_inodes_wb() more than once.
	 *
	 * The intended call sequence for WB_SYNC_ALL writeback is:
	 *
	 *      wb_writeback()
	 *          writeback_sb_inodes()       <== called only once
	 *              write_cache_pages()     <== called once for each inode
	 *                   (quickly) tag currently dirty pages
	 *                   (maybe slowly) sync all tagged pages
	 */
	if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
		pages = LONG_MAX;
	else {
		pages = min(wb->avg_write_bandwidth / 2,
			    global_wb_domain.dirty_limit / DIRTY_SCOPE);
		pages = min(pages, work->nr_pages);
		pages = round_down(pages + MIN_WRITEBACK_PAGES,
				   MIN_WRITEBACK_PAGES);
	}

	return pages;
}
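
/*
 * Chunk size arithmetic: MIN_WRITEBACK_PAGES is 4MB worth of pages
 * (1024 pages with 4KB pages), and round_down(pages + MIN, MIN) both
 * rounds the chunk down to a 4MB multiple and enforces a 4MB floor,
 * e.g. pages = 1500 becomes 2048 and pages = 100 becomes 1024.
 */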

/*
 * Write a portion of b_io inodes which belong to @sb.
 *
 * Return the number of pages and/or inodes written.
 *
 * NOTE! This is called with wb->list_lock held, and will
 * unlock and relock that for each inode it ends up doing
 * IO for.
 */
static long writeback_sb_inodes(struct super_block *sb,
				struct bdi_writeback *wb,
				struct wb_writeback_work *work)
{
	struct writeback_control wbc = {
		.sync_mode		= work->sync_mode,
		.tagged_writepages	= work->tagged_writepages,
		.for_kupdate		= work->for_kupdate,
		.for_background		= work->for_background,
		.for_sync		= work->for_sync,
		.range_cyclic		= work->range_cyclic,
		.range_start		= 0,
		.range_end		= LLONG_MAX,
	};
	unsigned long start_time = jiffies;
	long write_chunk;
	long wrote = 0;  /* count both pages and inodes */

	while (!list_empty(&wb->b_io)) {
		struct inode *inode = wb_inode(wb->b_io.prev);
		struct bdi_writeback *tmp_wb;

		if (inode->i_sb != sb) {
			if (work->sb) {
				/*
				 * We only want to write back data for this
				 * superblock, move all inodes not belonging
				 * to it back onto the dirty list.
				 */
				redirty_tail(inode, wb);
				continue;
			}

			/*
			 * The inode belongs to a different superblock.
			 * Bounce back to the caller to unpin this and
			 * pin the next superblock.
			 */
			break;
		}

		/*
		 * Don't bother with new inodes or inodes being freed, first
		 * kind does not need periodic writeout yet, and for the latter
		 * kind writeout is handled by the freer.
		 */
		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
			spin_unlock(&inode->i_lock);
			redirty_tail(inode, wb);
			continue;
		}
		if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
			/*
			 * If this inode is locked for writeback and we are not
			 * doing writeback-for-data-integrity, move it to
			 * b_more_io so that writeback can proceed with the
			 * other inodes on s_io.
			 *
			 * We'll have another go at writing back this inode
			 * when we completed a full scan of b_io.
			 */
			spin_unlock(&inode->i_lock);
			requeue_io(inode, wb);
			trace_writeback_sb_inodes_requeue(inode);
			continue;
		}
		spin_unlock(&wb->list_lock);

		/*
		 * We already requeued the inode if it had I_SYNC set and we
		 * are doing WB_SYNC_NONE writeback.  So this catches only the
		 * WB_SYNC_ALL case.
		 */
		if (inode->i_state & I_SYNC) {
			/* Wait for I_SYNC.  This function drops i_lock... */
			inode_sleep_on_writeback(inode);
			/* Inode may be gone, start again */
			spin_lock(&wb->list_lock);
			continue;
		}
		inode->i_state |= I_SYNC;
		wbc_attach_and_unlock_inode(&wbc, inode);

		write_chunk = writeback_chunk_size(wb, work);
		wbc.nr_to_write = write_chunk;
		wbc.pages_skipped = 0;

		/*
		 * We use I_SYNC to pin the inode in memory.  While it is set
		 * evict_inode() will wait so the inode cannot be freed.
		 */
		__writeback_single_inode(inode, &wbc);

		wbc_detach_inode(&wbc);
		work->nr_pages -= write_chunk - wbc.nr_to_write;
		wrote += write_chunk - wbc.nr_to_write;

		if (need_resched()) {
			/*
			 * We're trying to balance between building up a nice
			 * long list of IOs to improve our merge rate, and
			 * getting those IOs out quickly for anyone throttling
			 * in balance_dirty_pages().  cond_resched() doesn't
			 * unplug, so get our IOs out the door before we
			 * give up the CPU.
			 */
			blk_flush_plug(current);
			cond_resched();
		}

		/*
		 * Requeue @inode if still dirty.  Be careful as @inode may
		 * have been switched to another wb in the meantime.
		 */
		tmp_wb = inode_to_wb_and_lock_list(inode);
		spin_lock(&inode->i_lock);
		if (!(inode->i_state & I_DIRTY_ALL))
			wrote++;
		requeue_inode(inode, tmp_wb, &wbc);
		inode_sync_complete(inode);
		spin_unlock(&inode->i_lock);

		if (unlikely(tmp_wb != wb)) {
			spin_unlock(&tmp_wb->list_lock);
			spin_lock(&wb->list_lock);
		}

		/*
		 * bail out to wb_writeback() often enough to check
		 * background threshold and other termination conditions.
		 */
		if (wrote) {
			if (time_is_before_jiffies(start_time + HZ / 10UL))
				break;
			if (work->nr_pages <= 0)
				break;
		}
	}
	return wrote;
}

static long __writeback_inodes_wb(struct bdi_writeback *wb,
				  struct wb_writeback_work *work)
{
	unsigned long start_time = jiffies;
	long wrote = 0;

	while (!list_empty(&wb->b_io)) {
		struct inode *inode = wb_inode(wb->b_io.prev);
		struct super_block *sb = inode->i_sb;

		if (!trylock_super(sb)) {
			/*
			 * trylock_super() may fail consistently due to
			 * s_umount being grabbed by someone else.  Don't use
			 * requeue_io() to avoid busy retrying the inode/sb.
			 */
			redirty_tail(inode, wb);
			continue;
		}
		wrote += writeback_sb_inodes(sb, wb, work);
		up_read(&sb->s_umount);

		/* refer to the same tests at the end of writeback_sb_inodes */
		if (wrote) {
			if (time_is_before_jiffies(start_time + HZ / 10UL))
				break;
			if (work->nr_pages <= 0)
				break;
		}
	}
	/* Leave any unwritten inodes on b_io */
	return wrote;
}

static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
				enum wb_reason reason)
{
	struct wb_writeback_work work = {
		.nr_pages	= nr_pages,
		.sync_mode	= WB_SYNC_NONE,
		.range_cyclic	= 1,
		.reason		= reason,
	};
	struct blk_plug plug;

	blk_start_plug(&plug);
	spin_lock(&wb->list_lock);
	if (list_empty(&wb->b_io))
		queue_io(wb, &work);
	__writeback_inodes_wb(wb, &work);
	spin_unlock(&wb->list_lock);
	blk_finish_plug(&plug);

	return nr_pages - work.nr_pages;
}

/*
 * Explicit flushing or periodic writeback of "old" data.
 *
 * Define "old": the first time one of an inode's pages is dirtied, we mark the
 * dirtying-time in the inode's address_space.  So this periodic writeback code
 * just walks the superblock inode list, writing back any inodes which are
 * older than a specific point in time.
 *
 * Try to run once per dirty_writeback_interval.  But if a writeback event
 * takes longer than a dirty_writeback_interval interval, then leave a
 * one-second gap.
 *
 * older_than_this takes precedence over nr_to_write.  So we'll only write back
 * all dirty pages if they are all attached to "old" mappings.
 */
static long wb_writeback(struct bdi_writeback *wb,
			 struct wb_writeback_work *work)
{
	unsigned long wb_start = jiffies;
	long nr_pages = work->nr_pages;
	unsigned long oldest_jif;
	struct inode *inode;
	long progress;
	struct blk_plug plug;

	oldest_jif = jiffies;
	work->older_than_this = &oldest_jif;

	blk_start_plug(&plug);
	spin_lock(&wb->list_lock);
	for (;;) {
		/*
		 * Stop writeback when nr_pages has been consumed
		 */
		if (work->nr_pages <= 0)
			break;

		/*
		 * Background writeout and kupdate-style writeback may
		 * run forever.  Stop them if there is other work to do
		 * so that e.g. sync can proceed.  They'll be restarted
		 * after the other works are all done.
		 */
		if ((work->for_background || work->for_kupdate) &&
		    !list_empty(&wb->work_list))
			break;

		/*
		 * For background writeout, stop when we are below the
		 * background dirty threshold
		 */
		if (work->for_background && !wb_over_bg_thresh(wb))
			break;

		/*
		 * Kupdate and background works are special and we want to
		 * include all inodes that need writing.  Livelock avoidance
		 * is handled by these works yielding to any other work so we
		 * are safe.
		 */
		if (work->for_kupdate) {
			oldest_jif = jiffies -
				msecs_to_jiffies(dirty_expire_interval * 10);
		} else if (work->for_background)
			oldest_jif = jiffies;

		trace_writeback_start(wb, work);
		if (list_empty(&wb->b_io))
			queue_io(wb, work);
		if (work->sb)
			progress = writeback_sb_inodes(work->sb, wb, work);
		else
			progress = __writeback_inodes_wb(wb, work);
		trace_writeback_written(wb, work);

		wb_update_bandwidth(wb, wb_start);

		/*
		 * Did we write something?  Try for more
		 *
		 * Dirty inodes are moved to b_io for writeback in batches.
		 * The completion of the current batch does not necessarily
		 * mean the overall work is done.  So we keep looping as long
		 * as made some progress on cleaning pages or inodes.
		 */
		if (progress)
			continue;
		/*
		 * No more inodes for IO, bail
		 */
		if (list_empty(&wb->b_more_io))
			break;
		/*
		 * Nothing written.  Wait for some inode to
		 * become available for writeback.  Otherwise
		 * we'll just busyloop.
		 */
		trace_writeback_wait(wb, work);
		inode = wb_inode(wb->b_more_io.prev);
		spin_lock(&inode->i_lock);
		spin_unlock(&wb->list_lock);
		/* This function drops i_lock... */
		inode_sleep_on_writeback(inode);
		spin_lock(&wb->list_lock);
	}
	spin_unlock(&wb->list_lock);
	blk_finish_plug(&plug);

	return nr_pages - work->nr_pages;
}

/*
 * Return the next wb_writeback_work struct that hasn't been processed yet.
 */
static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
{
	struct wb_writeback_work *work = NULL;

	spin_lock_bh(&wb->work_lock);
	if (!list_empty(&wb->work_list)) {
		work = list_entry(wb->work_list.next,
				  struct wb_writeback_work, list);
		list_del_init(&work->list);
	}
	spin_unlock_bh(&wb->work_lock);
	return work;
}

static long wb_check_background_flush(struct bdi_writeback *wb)
{
	if (wb_over_bg_thresh(wb)) {

		struct wb_writeback_work work = {
			.nr_pages	= LONG_MAX,
			.sync_mode	= WB_SYNC_NONE,
			.for_background	= 1,
			.range_cyclic	= 1,
			.reason		= WB_REASON_BACKGROUND,
		};

		return wb_writeback(wb, &work);
	}

	return 0;
}

static long wb_check_old_data_flush(struct bdi_writeback *wb)
{
	unsigned long expired;
	long nr_pages;

	/*
	 * When set to zero, disable periodic writeback
	 */
	if (!dirty_writeback_interval)
		return 0;

	expired = wb->last_old_flush +
			msecs_to_jiffies(dirty_writeback_interval * 10);
	if (time_before(jiffies, expired))
		return 0;

	wb->last_old_flush = jiffies;
	nr_pages = get_nr_dirty_pages();

	if (nr_pages) {
		struct wb_writeback_work work = {
			.nr_pages	= nr_pages,
			.sync_mode	= WB_SYNC_NONE,
			.for_kupdate	= 1,
			.range_cyclic	= 1,
			.reason		= WB_REASON_PERIODIC,
		};

		return wb_writeback(wb, &work);
	}

	return 0;
}

static long wb_check_start_all(struct bdi_writeback *wb)
{
	long nr_pages;

	if (!test_bit(WB_start_all, &wb->state))
		return 0;

	nr_pages = get_nr_dirty_pages();
	if (nr_pages) {
		struct wb_writeback_work work = {
			.nr_pages	= wb_split_bdi_pages(wb, nr_pages),
			.sync_mode	= WB_SYNC_NONE,
			.range_cyclic	= 1,
			.reason		= wb->start_all_reason,
		};

		nr_pages = wb_writeback(wb, &work);
	}

	clear_bit(WB_start_all, &wb->state);
	return nr_pages;
}
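
/*
 * This is the consumer side of the WB_start_all handshake: the bit is
 * set (at most once) by wb_start_writeback() above, carried here via
 * wb->start_all_reason, and only cleared after the writeback pass has
 * run, so duplicate flush-everything requests arriving in between are
 * absorbed into the pass that is already pending.
 */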

/*
 * Retrieve work items and do the writeback they describe
 */
static long wb_do_writeback(struct bdi_writeback *wb)
{
	struct wb_writeback_work *work;
	long wrote = 0;

	set_bit(WB_writeback_running, &wb->state);
	while ((work = get_next_work_item(wb)) != NULL) {
		trace_writeback_exec(wb, work);
		wrote += wb_writeback(wb, work);
		finish_writeback_work(wb, work);
	}

	/*
	 * Check for a flush-everything request
	 */
	wrote += wb_check_start_all(wb);

	/*
	 * Check for periodic writeback, kupdated() style
	 */
	wrote += wb_check_old_data_flush(wb);
	wrote += wb_check_background_flush(wb);
	clear_bit(WB_writeback_running, &wb->state);

	return wrote;
}

/*
 * Handle writeback of dirty data for the device backed by this bdi.  Also
 * reschedules periodically and does kupdated style flushing.
 */
void wb_workfn(struct work_struct *work)
{
	struct bdi_writeback *wb = container_of(to_delayed_work(work),
						struct bdi_writeback, dwork);
	long pages_written;

	set_worker_desc("flush-%s", dev_name(wb->bdi->dev));
	current->flags |= PF_SWAPWRITE;

	if (likely(!current_is_workqueue_rescuer() ||
		   !test_bit(WB_registered, &wb->state))) {
		/*
		 * The normal path.  Keep writing back @wb until its
		 * work_list is empty.  Note that this path is also taken
		 * if @wb is shutting down even when we're running off the
		 * rescuer as work_list needs to be drained.
		 */
		do {
			pages_written = wb_do_writeback(wb);
			trace_writeback_pages_written(pages_written);
		} while (!list_empty(&wb->work_list));
	} else {
		/*
		 * bdi_wq can't get enough workers and we're running off
		 * the emergency worker.  Don't hog it.  Hopefully, 1024 is
		 * enough for efficient IO.
		 */
		pages_written = writeback_inodes_wb(wb, 1024,
						    WB_REASON_FORKER_THREAD);
		trace_writeback_pages_written(pages_written);
	}

	if (!list_empty(&wb->work_list))
		wb_wakeup(wb);
	else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
		wb_wakeup_delayed(wb);

	current->flags &= ~PF_SWAPWRITE;
}

/*
 * Start writeback of all dirty pages on this bdi.
 */
static void __wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
					 enum wb_reason reason)
{
	struct bdi_writeback *wb;

	if (!bdi_has_dirty_io(bdi))
		return;

	list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
		wb_start_writeback(wb, reason);
}

void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
				enum wb_reason reason)
{
	rcu_read_lock();
	__wakeup_flusher_threads_bdi(bdi, reason);
	rcu_read_unlock();
}

/*
 * Wakeup the flusher threads to start writeback of all currently dirty pages
 */
void wakeup_flusher_threads(enum wb_reason reason)
{
	struct backing_dev_info *bdi;

	/*
	 * If we are expecting writeback progress we must submit plugged IO.
	 */
	if (blk_needs_flush_plug(current))
		blk_schedule_flush_plug(current);

	rcu_read_lock();
	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
		__wakeup_flusher_threads_bdi(bdi, reason);
	rcu_read_unlock();
}

/*
 * Wake up bdi's periodically to make sure dirtytime inodes gets
 * written back periodically.  We deliberately do *not* check the
 * b_dirty_time list in wb_has_dirty_io(), since this would cause the
 * kernel to be constantly waking up once there are any dirtytime
 * inodes on the system.  So instead we define a separate delayed work
 * function which gets called much more rarely.  (By default, only
 * once every 12 hours.)
 *
 * If there is any other write activity going on in the file system,
 * this function won't be called at all.  But if the only thing that
 * has happened on the file system is a dirtytime inode caused by an
 * atime update, we need this infrastructure below to make sure that
 * inode eventually gets written out.
 */
static void wakeup_dirtytime_writeback(struct work_struct *w);
static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback);

static void wakeup_dirtytime_writeback(struct work_struct *w)
{
	struct backing_dev_info *bdi;

	rcu_read_lock();
	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
		struct bdi_writeback *wb;

		list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
			if (!list_empty(&wb->b_dirty_time))
				wb_wakeup(wb);
	}
	rcu_read_unlock();
	schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
}

static int __init start_dirtytime_writeback(void)
{
	schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
	return 0;
}
__initcall(start_dirtytime_writeback);

int dirtytime_interval_handler(struct ctl_table *table, int write,
			       void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	if (ret == 0 && write)
		mod_delayed_work(system_wq, &dirtytime_work, 0);
	return ret;
}

static noinline void block_dump___mark_inode_dirty(struct inode *inode)
{
	if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
		struct dentry *dentry;
		const char *name = "?";

		dentry = d_find_alias(inode);
		if (dentry) {
			spin_lock(&dentry->d_lock);
			name = (const char *) dentry->d_name.name;
		}
		printk(KERN_DEBUG
		       "%s(%d): dirtied inode %lu (%s) on %s\n",
		       current->comm, task_pid_nr(current), inode->i_ino,
		       name, inode->i_sb->s_id);
		if (dentry) {
			spin_unlock(&dentry->d_lock);
			dput(dentry);
		}
	}
}

/**
 * __mark_inode_dirty -	internal function
 *
 * @inode: inode to mark
 * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
 *
 * Mark an inode as dirty.  Callers should use mark_inode_dirty or
 * mark_inode_dirty_sync.
 *
 * Put the inode on the super block's dirty list.
 *
 * CAREFUL!  We mark it dirty unconditionally, but move it onto the
 * dirty list only if it is hashed or if it refers to a blockdev.
 * If it was not hashed, it will never be added to the dirty list
 * even if it is later hashed, as it will have been marked dirty already.
 *
 * In short, make sure you hash any inodes _before_ you start marking
 * them dirty.
 *
 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
 * the block-special inode (/dev/hda1) itself.  And the ->dirtied_when field of
 * the kernel-internal blockdev inode represents the dirtying time of the
 * blockdev's pages.  This is why for I_DIRTY_PAGES we always use
 * page->mapping->host, so the page-dirtying time is recorded in the internal
 * blockdev inode.
 */
void __mark_inode_dirty(struct inode *inode, int flags)
{
	struct super_block *sb = inode->i_sb;
	int dirtytime;

	trace_writeback_mark_inode_dirty(inode, flags);

	/*
	 * Don't do this for I_DIRTY_PAGES - that doesn't actually
	 * dirty the inode itself
	 */
	if (flags & (I_DIRTY_INODE | I_DIRTY_TIME)) {
		trace_writeback_dirty_inode_start(inode, flags);

		if (sb->s_op->dirty_inode)
			sb->s_op->dirty_inode(inode, flags);

		trace_writeback_dirty_inode(inode, flags);
	}
	if (flags & I_DIRTY_INODE)
		flags &= ~I_DIRTY_TIME;
	dirtytime = flags & I_DIRTY_TIME;

	/*
	 * Paired with smp_mb() in __writeback_single_inode() for the
	 * following lockless i_state test.  See there for details.
	 */
	smp_mb();

	if (((inode->i_state & flags) == flags) ||
	    (dirtytime && (inode->i_state & I_DIRTY_INODE)))
		return;

	if (unlikely(block_dump))
		block_dump___mark_inode_dirty(inode);

	spin_lock(&inode->i_lock);
	if (dirtytime && (inode->i_state & I_DIRTY_INODE))
		goto out_unlock_inode;
	if ((inode->i_state & flags) != flags) {
		const int was_dirty = inode->i_state & I_DIRTY;

		inode_attach_wb(inode, NULL);

		if (flags & I_DIRTY_INODE)
			inode->i_state &= ~I_DIRTY_TIME;
		inode->i_state |= flags;

		/*
		 * If the inode is being synced, just update its dirty state.
		 * The unlocker will place the inode on the appropriate
		 * superblock list, based upon its state.
		 */
		if (inode->i_state & I_SYNC)
			goto out_unlock_inode;

		/*
		 * Only add valid (hashed) inodes to the superblock's
		 * dirty list.  Add blockdev inodes as well.
		 */
		if (!S_ISBLK(inode->i_mode)) {
			if (inode_unhashed(inode))
				goto out_unlock_inode;
		}
		if (inode->i_state & I_FREEING)
			goto out_unlock_inode;

		/*
		 * If the inode was already on b_dirty/b_io/b_more_io, don't
		 * reposition it (that would break b_dirty time-ordering).
		 */
		if (!was_dirty) {
			struct bdi_writeback *wb;
			struct list_head *dirty_list;
			bool wakeup_bdi = false;

			wb = locked_inode_to_wb_and_lock_list(inode);

			WARN(bdi_cap_writeback_dirty(wb->bdi) &&
			     !test_bit(WB_registered, &wb->state),
			     "bdi-%s not registered\n", wb->bdi->name);

			inode->dirtied_when = jiffies;
			if (dirtytime)
				inode->dirtied_time_when = jiffies;

			if (inode->i_state & I_DIRTY)
				dirty_list = &wb->b_dirty;
			else
				dirty_list = &wb->b_dirty_time;

			wakeup_bdi = inode_io_list_move_locked(inode, wb,
							       dirty_list);

			spin_unlock(&wb->list_lock);
			trace_writeback_dirty_inode_enqueue(inode);

			/*
			 * If this is the first dirty inode for this bdi,
			 * we have to wake-up the corresponding bdi thread
			 * to make sure background write-back happens
			 * later.
			 */
			if (bdi_cap_writeback_dirty(wb->bdi) && wakeup_bdi)
				wb_wakeup_delayed(wb);
			return;
		}
	}
out_unlock_inode:
	spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(__mark_inode_dirty);

/*
 * The @s_sync_lock is used to serialise concurrent sync operations
 * to avoid lock contention problems with concurrent wait_sb_inodes() calls.
 * Concurrent callers will block on the s_sync_lock rather than doing
 * contending walks.  The queueing maintains sync(2) required behaviour as
 * all the IO that has been issued up to the time this function is entered
 * is guaranteed to be completed by the time we have locked the inode and
 * waited on it.
 */
static void wait_sb_inodes(struct super_block *sb)
{
	LIST_HEAD(sync_list);

	/*
	 * We need to be protected against the filesystem going from
	 * r/o to r/w or vice versa.
	 */
	WARN_ON(!rwsem_is_locked(&sb->s_umount));

	mutex_lock(&sb->s_sync_lock);

	/*
	 * Splice the writeback list onto a temporary list to avoid waiting on
	 * inodes that have started writeback after this point.
	 *
	 * Use rcu_read_lock() to keep the inodes around until we have a
	 * reference.  s_inode_wblist_lock protects sb->s_inodes_wb as well as
	 * the local list because inodes can be removed from either by
	 * writeback completion.
	 */
	rcu_read_lock();
	spin_lock_irq(&sb->s_inode_wblist_lock);
	list_splice_init(&sb->s_inodes_wb, &sync_list);

	/*
	 * Data integrity sync.  Must wait for all pages under writeback,
	 * because there may have been pages dirtied before our sync call,
	 * but which had writeout started before we write it out.  In which
	 * case, the inode may not be on the dirty list, but we still have
	 * to wait for that writeout.
	 */
	while (!list_empty(&sync_list)) {
		struct inode *inode = list_first_entry(&sync_list, struct inode,
						       i_wb_list);
		struct address_space *mapping = inode->i_mapping;

		/*
		 * Move each inode back to the wb list before we drop the lock
		 * to preserve consistency between i_wb_list and the mapping
		 * writeback tag.  Ensure the inode still exists before
		 * touching it to prevent dropping i_wb_list.
		 */
		list_move_tail(&inode->i_wb_list, &sb->s_inodes_wb);

		/*
		 * The mapping can appear untagged while still on-list since
		 * we do not have the mapping lock.  Skip it here, wb
		 * completion will remove it.
		 */
		if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
			continue;

		spin_unlock_irq(&sb->s_inode_wblist_lock);

		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
			spin_unlock(&inode->i_lock);

			spin_lock_irq(&sb->s_inode_wblist_lock);
			continue;
		}
		__iget(inode);
		spin_unlock(&inode->i_lock);
		rcu_read_unlock();

		/*
		 * We keep the error status of individual mapping so that
		 * applications can catch the writeback error using fsync(2).
		 * See filemap_fdatawait_keep_errors() for details.
		 */
		filemap_fdatawait_keep_errors(mapping);

		cond_resched();

		iput(inode);

		rcu_read_lock();
		spin_lock_irq(&sb->s_inode_wblist_lock);
	}
	spin_unlock_irq(&sb->s_inode_wblist_lock);
	rcu_read_unlock();
	mutex_unlock(&sb->s_sync_lock);
}

static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
				     enum wb_reason reason, bool skip_if_busy)
{
	DEFINE_WB_COMPLETION_ONSTACK(done);
	struct wb_writeback_work work = {
		.sb			= sb,
		.sync_mode		= WB_SYNC_NONE,
		.tagged_writepages	= 1,
		.done			= &done,
		.nr_pages		= nr,
		.reason			= reason,
	};
	struct backing_dev_info *bdi = sb->s_bdi;

	if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
		return;
	WARN_ON(!rwsem_is_locked(&sb->s_umount));

	bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy);
	wb_wait_for_completion(bdi, &done);
}

/**
 * writeback_inodes_sb_nr -	writeback dirty inodes from given super_block
 * @sb: the superblock
 * @nr: the number of pages to write
 * @reason: reason why some writeback work was initiated
 *
 * Start writeback on some inodes on this super_block.  No guarantees are made
 * on how many (if any) will be written, and this function does not wait
 * for IO completion of submitted IO.
 */
void writeback_inodes_sb_nr(struct super_block *sb,
			    unsigned long nr,
			    enum wb_reason reason)
{
	__writeback_inodes_sb_nr(sb, nr, reason, false);
}
EXPORT_SYMBOL(writeback_inodes_sb_nr);

/**
 * writeback_inodes_sb	-	writeback dirty inodes from given super_block
 * @sb: the superblock
 * @reason: reason why some writeback work was initiated
 *
 * Start writeback on some inodes on this super_block.  No guarantees are made
 * on how many (if any) will be written, and this function does not wait
 * for IO completion of submitted IO.
 */
void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
{
	return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
}
EXPORT_SYMBOL(writeback_inodes_sb);

/**
 * try_to_writeback_inodes_sb - try to start writeback if none underway
 * @sb: the superblock
 * @reason: reason why some writeback work was initiated
 *
 * Invoke __writeback_inodes_sb_nr if no writeback is currently underway.
 */
void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
{
	if (!down_read_trylock(&sb->s_umount))
		return;

	__writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason, true);
	up_read(&sb->s_umount);
}
EXPORT_SYMBOL(try_to_writeback_inodes_sb);

/**
 * sync_inodes_sb	-	sync sb inode pages
 * @sb: the superblock
 *
 * This function writes and waits on any dirty inode belonging to this
 * super_block.
 */
void sync_inodes_sb(struct super_block *sb)
{
	DEFINE_WB_COMPLETION_ONSTACK(done);
	struct wb_writeback_work work = {
		.sb		= sb,
		.sync_mode	= WB_SYNC_ALL,
		.nr_pages	= LONG_MAX,
		.range_cyclic	= 0,
		.done		= &done,
		.reason		= WB_REASON_SYNC,
		.for_sync	= 1,
	};
	struct backing_dev_info *bdi = sb->s_bdi;

	/*
	 * Can't skip on !bdi_has_dirty() because we should wait for !dirty
	 * inodes under writeback and I_DIRTY_TIME inodes ignored by
	 * bdi_has_dirty() need to be written out too.
	 */
	if (bdi == &noop_backing_dev_info)
		return;
	WARN_ON(!rwsem_is_locked(&sb->s_umount));

	/* protect against inode wb switch, see inode_switch_wbs_work_fn() */
	bdi_down_write_wb_switch_rwsem(bdi);
	bdi_split_work_to_wbs(bdi, &work, false);
	wb_wait_for_completion(bdi, &done);
	bdi_up_write_wb_switch_rwsem(bdi);

	wait_sb_inodes(sb);
}
EXPORT_SYMBOL(sync_inodes_sb);

/**
 * write_inode_now	-	write an inode to disk
 * @inode: inode to write to disk
 * @sync: whether the write should be synchronous or not
 *
 * This function commits an inode to disk immediately if it is dirty.  This is
 * primarily needed by knfsd.
 *
 * The caller must either have a ref on the inode or must have set I_WILL_FREE.
 */
int write_inode_now(struct inode *inode, int sync)
{
	struct writeback_control wbc = {
		.nr_to_write = LONG_MAX,
		.sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
		.range_start = 0,
		.range_end = LLONG_MAX,
	};

	if (!mapping_cap_writeback_dirty(inode->i_mapping))
		wbc.nr_to_write = 0;

	might_sleep();
	return writeback_single_inode(inode, &wbc);
}
EXPORT_SYMBOL(write_inode_now);

/**
 * sync_inode - write an inode and its pages to disk.
 * @inode: the inode to sync
 * @wbc: controls the writeback mode
 *
 * sync_inode() will write an inode and its pages to disk.  It will also
 * correctly update the inode on its superblock's dirty inode lists and will
 * update inode->i_state.
 *
 * The caller must have a ref on the inode.
 */
int sync_inode(struct inode *inode, struct writeback_control *wbc)
{
	return writeback_single_inode(inode, wbc);
}
EXPORT_SYMBOL(sync_inode);

/**
 * sync_inode_metadata - write an inode to disk
 * @inode: the inode to sync
 * @wait: wait for I/O to complete.
 *
 * Write an inode to disk and adjust its dirty state after completion.
 *
 * Note: only writes the actual inode, no associated data or other metadata.
 */
int sync_inode_metadata(struct inode *inode, int wait)
{
	struct writeback_control wbc = {
		.sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
		.nr_to_write = 0, /* metadata-only */
	};

	return sync_inode(inode, &wbc);
}
EXPORT_SYMBOL(sync_inode_metadata);