// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/fs-writeback.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * Contains all the functions related to writing back and waiting
 * upon dirty inodes against superblocks, and writing back dirty
 * pages against inodes.  ie: data writeback.  Writeout of the
 * inode itself is not handled here.
 *
 * 10Apr2002	Andrew Morton
 *		Split out of fs/inode.c
 *		Additions for address_space-based writeback
 */

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/kthread.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/tracepoint.h>
#include <linux/device.h>
#include <linux/memcontrol.h>
#include "internal.h"

/*
 * 4MB minimal write chunk size
 */
#define MIN_WRITEBACK_PAGES	(4096UL >> (PAGE_SHIFT - 10))

/*
 * Passed into wb_writeback(), essentially a subset of writeback_control
 */
struct wb_writeback_work {
	long nr_pages;
	struct super_block *sb;
	enum writeback_sync_modes sync_mode;
	unsigned int tagged_writepages:1;
	unsigned int for_kupdate:1;
	unsigned int range_cyclic:1;
	unsigned int for_background:1;
	unsigned int for_sync:1;	/* sync(2) WB_SYNC_ALL writeback */
	unsigned int auto_free:1;	/* free on completion */
	enum wb_reason reason;		/* why was writeback initiated? */

	struct list_head list;		/* pending work list */
	struct wb_completion *done;	/* set if the caller waits */
};

/*
 * If an inode is constantly having its pages dirtied, but then the
 * updates stop dirtytime_expire_interval seconds in the past, it's
 * possible for the worst case time between when an inode has its
 * timestamps updated and when they finally get written out to be two
 * dirtytime_expire_intervals.  We set the default to 12 hours (in
 * seconds), which means most of the time inodes will have their
 * timestamps written to disk after 12 hours, but in the worst case a
 * few inodes might not have their timestamps updated for 24 hours.
 */
unsigned int dirtytime_expire_interval = 12 * 60 * 60;

static inline struct inode *wb_inode(struct list_head *head)
{
	return list_entry(head, struct inode, i_io_list);
}

/*
 * Include the creation of the trace points after defining the
 * wb_writeback_work structure and inline functions so that the definition
 * remains local to this file.
 */
#define CREATE_TRACE_POINTS
#include <trace/events/writeback.h>

EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);

static bool wb_io_lists_populated(struct bdi_writeback *wb)
{
	if (wb_has_dirty_io(wb)) {
		return false;
	} else {
		set_bit(WB_has_dirty_io, &wb->state);
		WARN_ON_ONCE(!wb->avg_write_bandwidth);
		atomic_long_add(wb->avg_write_bandwidth,
				&wb->bdi->tot_write_bandwidth);
		return true;
	}
}

static void wb_io_lists_depopulated(struct bdi_writeback *wb)
{
	if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) &&
	    list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) {
		clear_bit(WB_has_dirty_io, &wb->state);
		WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth,
					&wb->bdi->tot_write_bandwidth) < 0);
	}
}

/**
 * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list
 * @inode: inode to be moved
 * @wb: target bdi_writeback
 * @head: one of @wb->b_{dirty|io|more_io|dirty_time}
 *
 * Move @inode->i_io_list to @head of @wb and set %WB_has_dirty_io.
 * Returns %true if @inode is the first occupant of the !dirty_time IO
 * lists; otherwise, %false.
 */
static bool inode_io_list_move_locked(struct inode *inode,
				      struct bdi_writeback *wb,
				      struct list_head *head)
{
	assert_spin_locked(&wb->list_lock);

	list_move(&inode->i_io_list, head);

	/* dirty_time doesn't count as dirty_io until expiration */
	if (head != &wb->b_dirty_time)
		return wb_io_lists_populated(wb);

	wb_io_lists_depopulated(wb);
	return false;
}

/**
 * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list
 * @inode: inode to be removed
 * @wb: bdi_writeback @inode is being removed from
 *
 * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and
 * clear %WB_has_dirty_io if all are empty afterwards.
 */
static void inode_io_list_del_locked(struct inode *inode,
				     struct bdi_writeback *wb)
{
	assert_spin_locked(&wb->list_lock);
	assert_spin_locked(&inode->i_lock);

	inode->i_state &= ~I_SYNC_QUEUED;
	list_del_init(&inode->i_io_list);
	wb_io_lists_depopulated(wb);
}

static void wb_wakeup(struct bdi_writeback *wb)
{
	spin_lock_bh(&wb->work_lock);
	if (test_bit(WB_registered, &wb->state))
		mod_delayed_work(bdi_wq, &wb->dwork, 0);
	spin_unlock_bh(&wb->work_lock);
}

static void finish_writeback_work(struct bdi_writeback *wb,
				  struct wb_writeback_work *work)
{
	struct wb_completion *done = work->done;

	if (work->auto_free)
		kfree(work);
	if (done) {
		wait_queue_head_t *waitq = done->waitq;

		/* @done can't be accessed after the following dec */
		if (atomic_dec_and_test(&done->cnt))
			wake_up_all(waitq);
	}
}

static void wb_queue_work(struct bdi_writeback *wb,
			  struct wb_writeback_work *work)
{
	trace_writeback_queue(wb, work);

	if (work->done)
		atomic_inc(&work->done->cnt);

	spin_lock_bh(&wb->work_lock);

	if (test_bit(WB_registered, &wb->state)) {
		list_add_tail(&work->list, &wb->work_list);
		mod_delayed_work(bdi_wq, &wb->dwork, 0);
	} else
		finish_writeback_work(wb, work);

	spin_unlock_bh(&wb->work_lock);
}

/**
 * wb_wait_for_completion - wait for completion of bdi_writeback_works
 * @done: target wb_completion
 *
 * Wait for one or more work items issued to @bdi with their ->done field
 * set to @done, which are currently in progress.  Wait for all the work
 * items to complete before returning.
 */
void wb_wait_for_completion(struct wb_completion *done)
{
	atomic_dec(&done->cnt);		/* put down the initial count */
	wait_event(*done->waitq, !atomic_read(&done->cnt));
}
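
/*
 * A simplified sketch of how the work/completion machinery above is used by
 * callers later in this file (e.g. __writeback_inodes_sb_nr(), which goes
 * through bdi_split_work_to_wbs() rather than calling wb_queue_work()
 * directly):
 *
 *	DEFINE_WB_COMPLETION(done, bdi);
 *	struct wb_writeback_work work = {
 *		.sb		= sb,
 *		.sync_mode	= WB_SYNC_NONE,
 *		.nr_pages	= nr,
 *		.done		= &done,
 *	};
 *
 *	wb_queue_work(wb, &work);
 *	wb_wait_for_completion(&done);
 */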
211
212#ifdef CONFIG_CGROUP_WRITEBACK
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
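/*
 * Parameters for foreign inode detection, see wbc_detach_inode().  Each
 * writeback round contributes up to WB_FRN_HIST_MAX_SLOTS slots to the
 * per-inode 16-bit i_wb_frn_history; slots are marked when the round was
 * dominated by a wb other than the inode's current one, and the inode is
 * switched once more than WB_FRN_HIST_THR_SLOTS of the recorded slots are
 * foreign.
 */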
233#define WB_FRN_TIME_SHIFT 13
234#define WB_FRN_TIME_AVG_SHIFT 3
235#define WB_FRN_TIME_CUT_DIV 8
236#define WB_FRN_TIME_PERIOD (2 * (1 << WB_FRN_TIME_SHIFT))
237
238#define WB_FRN_HIST_SLOTS 16
239#define WB_FRN_HIST_UNIT (WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS)
240
241#define WB_FRN_HIST_THR_SLOTS (WB_FRN_HIST_SLOTS / 2)
242
243#define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1)
244
245#define WB_FRN_MAX_IN_FLIGHT 1024
246
247static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
248static struct workqueue_struct *isw_wq;
249
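/**
 * __inode_attach_wb - associate an inode with its wb
 * @inode: inode of interest
 * @page: page being dirtied (may be NULL)
 *
 * If @inode doesn't have its wb, associate it with the wb matching the
 * memcg of @page or, if @page is NULL, %current.  May be called w/ or w/o
 * @inode->i_lock held.
 */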
void __inode_attach_wb(struct inode *inode, struct page *page)
{
	struct backing_dev_info *bdi = inode_to_bdi(inode);
	struct bdi_writeback *wb = NULL;

	if (inode_cgwb_enabled(inode)) {
		struct cgroup_subsys_state *memcg_css;

		if (page) {
			memcg_css = mem_cgroup_css_from_page(page);
			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
		} else {
			/* must pin memcg_css, see wb_get_create() */
			memcg_css = task_get_css(current, memory_cgrp_id);
			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
			css_put(memcg_css);
		}
	}

	if (!wb)
		wb = &bdi->wb;

	/*
	 * There may be multiple instances of this function racing to
	 * update the same inode.  Use cmpxchg() to tell the winner.
	 */
	if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
		wb_put(wb);
}
EXPORT_SYMBOL_GPL(__inode_attach_wb);
280
281
282
283
284
285
286
287
288
289static struct bdi_writeback *
290locked_inode_to_wb_and_lock_list(struct inode *inode)
291 __releases(&inode->i_lock)
292 __acquires(&wb->list_lock)
293{
294 while (true) {
295 struct bdi_writeback *wb = inode_to_wb(inode);
296
297
298
299
300
301
302
303 wb_get(wb);
304 spin_unlock(&inode->i_lock);
305 spin_lock(&wb->list_lock);
306
307
308 if (likely(wb == inode->i_wb)) {
309 wb_put(wb);
310 return wb;
311 }
312
313 spin_unlock(&wb->list_lock);
314 wb_put(wb);
315 cpu_relax();
316 spin_lock(&inode->i_lock);
317 }
318}
319
320
321
322
323
324
325
326
327static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
328 __acquires(&wb->list_lock)
329{
330 spin_lock(&inode->i_lock);
331 return locked_inode_to_wb_and_lock_list(inode);
332}
333
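/*
 * Context used to hand an inode from inode_switch_wbs(), where a cgroup
 * writeback switch is decided, to inode_switch_wbs_work_fn(), which
 * transfers the inode's page stats and IO list position to the new wb.
 * The rcu_head delays queueing the work until an RCU grace period after
 * I_WB_SWITCH is set, so that stat-update paths which check the flag
 * under the i_pages lock have finished before the transfer starts.
 */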
334struct inode_switch_wbs_context {
335 struct inode *inode;
336 struct bdi_writeback *new_wb;
337
338 struct rcu_head rcu_head;
339 struct work_struct work;
340};
341
342static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi)
343{
344 down_write(&bdi->wb_switch_rwsem);
345}
346
347static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi)
348{
349 up_write(&bdi->wb_switch_rwsem);
350}
351
352static void inode_switch_wbs_work_fn(struct work_struct *work)
353{
354 struct inode_switch_wbs_context *isw =
355 container_of(work, struct inode_switch_wbs_context, work);
356 struct inode *inode = isw->inode;
357 struct backing_dev_info *bdi = inode_to_bdi(inode);
358 struct address_space *mapping = inode->i_mapping;
359 struct bdi_writeback *old_wb = inode->i_wb;
360 struct bdi_writeback *new_wb = isw->new_wb;
361 XA_STATE(xas, &mapping->i_pages, 0);
362 struct page *page;
363 bool switched = false;
364
365
366
367
368
369 down_read(&bdi->wb_switch_rwsem);
370
371
372
373
374
375
376
377
378
379
380
381 if (old_wb < new_wb) {
382 spin_lock(&old_wb->list_lock);
383 spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
384 } else {
385 spin_lock(&new_wb->list_lock);
386 spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
387 }
388 spin_lock(&inode->i_lock);
389 xa_lock_irq(&mapping->i_pages);
390
391
392
393
394
395 if (unlikely(inode->i_state & I_FREEING))
396 goto skip_switch;
397
398 trace_inode_switch_wbs(inode, old_wb, new_wb);
399
400
401
402
403
404
405 xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_DIRTY) {
406 if (PageDirty(page)) {
407 dec_wb_stat(old_wb, WB_RECLAIMABLE);
408 inc_wb_stat(new_wb, WB_RECLAIMABLE);
409 }
410 }
411
412 xas_set(&xas, 0);
413 xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_WRITEBACK) {
414 WARN_ON_ONCE(!PageWriteback(page));
415 dec_wb_stat(old_wb, WB_WRITEBACK);
416 inc_wb_stat(new_wb, WB_WRITEBACK);
417 }
418
419 wb_get(new_wb);
420
421
422
423
424
425
426
427 if (!list_empty(&inode->i_io_list)) {
428 struct inode *pos;
429
430 inode_io_list_del_locked(inode, old_wb);
431 inode->i_wb = new_wb;
432 list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
433 if (time_after_eq(inode->dirtied_when,
434 pos->dirtied_when))
435 break;
436 inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev);
437 } else {
438 inode->i_wb = new_wb;
439 }
440
441
442 inode->i_wb_frn_winner = 0;
443 inode->i_wb_frn_avg_time = 0;
444 inode->i_wb_frn_history = 0;
445 switched = true;
446skip_switch:
447
448
449
450
451 smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);
452
453 xa_unlock_irq(&mapping->i_pages);
454 spin_unlock(&inode->i_lock);
455 spin_unlock(&new_wb->list_lock);
456 spin_unlock(&old_wb->list_lock);
457
458 up_read(&bdi->wb_switch_rwsem);
459
460 if (switched) {
461 wb_wakeup(new_wb);
462 wb_put(old_wb);
463 }
464 wb_put(new_wb);
465
466 iput(inode);
467 kfree(isw);
468
469 atomic_dec(&isw_nr_in_flight);
470}
471
472static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
473{
474 struct inode_switch_wbs_context *isw = container_of(rcu_head,
475 struct inode_switch_wbs_context, rcu_head);
476
477
478 INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
479 queue_work(isw_wq, &isw->work);
480}
481
482
483
484
485
486
487
488
489
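/**
 * inode_switch_wbs - change the wb association of an inode
 * @inode: target inode
 * @new_wb_id: ID of the new wb
 *
 * Switch @inode's wb association to the wb identified by @new_wb_id.  The
 * switching is performed asynchronously and may fail silently.
 */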
490static void inode_switch_wbs(struct inode *inode, int new_wb_id)
491{
492 struct backing_dev_info *bdi = inode_to_bdi(inode);
493 struct cgroup_subsys_state *memcg_css;
494 struct inode_switch_wbs_context *isw;
495
496
497 if (inode->i_state & I_WB_SWITCH)
498 return;
499
500
501 if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT)
502 return;
503
504 isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
505 if (!isw)
506 return;
507
508
509 rcu_read_lock();
510 memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys);
511 if (memcg_css)
512 isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
513 rcu_read_unlock();
514 if (!isw->new_wb)
515 goto out_free;
516
517
518 spin_lock(&inode->i_lock);
519 if (!(inode->i_sb->s_flags & SB_ACTIVE) ||
520 inode->i_state & (I_WB_SWITCH | I_FREEING) ||
521 inode_to_wb(inode) == isw->new_wb) {
522 spin_unlock(&inode->i_lock);
523 goto out_free;
524 }
525 inode->i_state |= I_WB_SWITCH;
526 __iget(inode);
527 spin_unlock(&inode->i_lock);
528
529 isw->inode = inode;
530
531
532
533
534
535
536
537 call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
538
539 atomic_inc(&isw_nr_in_flight);
540 return;
541
542out_free:
543 if (isw->new_wb)
544 wb_put(isw->new_wb);
545 kfree(isw);
546}
547
548
549
550
551
552
553
554
555
556
557
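/**
 * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
 * @wbc: writeback_control of interest
 * @inode: target inode
 *
 * @inode is locked and about to be written back under the control of @wbc.
 * Record @inode's writeback context into @wbc and unlock the lock.  On
 * writeback completion, wbc_detach_inode() should be called.  This is used
 * to track the cgroup writeback context.
 */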
558void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
559 struct inode *inode)
560{
561 if (!inode_cgwb_enabled(inode)) {
562 spin_unlock(&inode->i_lock);
563 return;
564 }
565
566 wbc->wb = inode_to_wb(inode);
567 wbc->inode = inode;
568
569 wbc->wb_id = wbc->wb->memcg_css->id;
570 wbc->wb_lcand_id = inode->i_wb_frn_winner;
571 wbc->wb_tcand_id = 0;
572 wbc->wb_bytes = 0;
573 wbc->wb_lcand_bytes = 0;
574 wbc->wb_tcand_bytes = 0;
575
576 wb_get(wbc->wb);
577 spin_unlock(&inode->i_lock);
578
579
580
581
582
583
584
585
586 if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css)))
587 inode_switch_wbs(inode, wbc->wb_id);
588}
589EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode);
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
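/**
 * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
 * @wbc: writeback_control of the just finished writeback
 *
 * To be called after a writeback attempt of an inode finishes and undoes
 * wbc_attach_and_unlock_inode().  Can be called under any context.
 *
 * Besides dropping the wb reference, this also runs the foreign inode
 * detection: the bytes accounted by wbc_account_cgroup_owner() during this
 * round are converted into write time against the current wb's bandwidth,
 * the result is recorded in the inode's 16-bit i_wb_frn_history, and once
 * the majority of the recent history belongs to a wb other than the current
 * one, inode_switch_wbs() is called to move the inode over to the dominant
 * dirtier.
 */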
628void wbc_detach_inode(struct writeback_control *wbc)
629{
630 struct bdi_writeback *wb = wbc->wb;
631 struct inode *inode = wbc->inode;
632 unsigned long avg_time, max_bytes, max_time;
633 u16 history;
634 int max_id;
635
636 if (!wb)
637 return;
638
639 history = inode->i_wb_frn_history;
640 avg_time = inode->i_wb_frn_avg_time;
641
642
643 if (wbc->wb_bytes >= wbc->wb_lcand_bytes &&
644 wbc->wb_bytes >= wbc->wb_tcand_bytes) {
645 max_id = wbc->wb_id;
646 max_bytes = wbc->wb_bytes;
647 } else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) {
648 max_id = wbc->wb_lcand_id;
649 max_bytes = wbc->wb_lcand_bytes;
650 } else {
651 max_id = wbc->wb_tcand_id;
652 max_bytes = wbc->wb_tcand_bytes;
653 }
654
655
656
657
658
659
660
661
662 max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT,
663 wb->avg_write_bandwidth);
664 if (avg_time)
665 avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) -
666 (avg_time >> WB_FRN_TIME_AVG_SHIFT);
667 else
668 avg_time = max_time;
669
670 if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) {
671 int slots;
672
673
674
675
676
677
678
679
680
681 slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT),
682 (unsigned long)WB_FRN_HIST_MAX_SLOTS);
683 history <<= slots;
684 if (wbc->wb_id != max_id)
685 history |= (1U << slots) - 1;
686
687 if (history)
688 trace_inode_foreign_history(inode, wbc, history);
689
690
691
692
693
694
695
696
697 if (hweight32(history) > WB_FRN_HIST_THR_SLOTS)
698 inode_switch_wbs(inode, max_id);
699 }
700
701
702
703
704
705 inode->i_wb_frn_winner = max_id;
706 inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX);
707 inode->i_wb_frn_history = history;
708
709 wb_put(wbc->wb);
710 wbc->wb = NULL;
711}
712EXPORT_SYMBOL_GPL(wbc_detach_inode);
713
714
715
716
717
718
719
720
721
722
723
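/**
 * wbc_account_cgroup_owner - account writeback to update inode cgroup ownership
 * @wbc: writeback_control of the writeback in progress
 * @page: page being written out
 * @bytes: number of bytes being written out
 *
 * @bytes from @page are about to be written out during the writeback
 * controlled by @wbc.  Keep the book to facilitate foreign inode detection;
 * the third-candidate (wb_tcand) tracking below is a variant of the
 * Boyer-Moore majority vote algorithm.  See wbc_detach_inode() for how the
 * tallies are consumed.
 */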
724void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
725 size_t bytes)
726{
727 struct cgroup_subsys_state *css;
728 int id;
729
730
731
732
733
734
735
736 if (!wbc->wb || wbc->no_cgroup_owner)
737 return;
738
739 css = mem_cgroup_css_from_page(page);
740
741 if (!(css->flags & CSS_ONLINE))
742 return;
743
744 id = css->id;
745
746 if (id == wbc->wb_id) {
747 wbc->wb_bytes += bytes;
748 return;
749 }
750
751 if (id == wbc->wb_lcand_id)
752 wbc->wb_lcand_bytes += bytes;
753
754
755 if (!wbc->wb_tcand_bytes)
756 wbc->wb_tcand_id = id;
757 if (id == wbc->wb_tcand_id)
758 wbc->wb_tcand_bytes += bytes;
759 else
760 wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
761}
762EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner);
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780int inode_congested(struct inode *inode, int cong_bits)
781{
782
783
784
785
786 if (inode && inode_to_wb_is_valid(inode)) {
787 struct bdi_writeback *wb;
788 struct wb_lock_cookie lock_cookie = {};
789 bool congested;
790
791 wb = unlocked_inode_to_wb_begin(inode, &lock_cookie);
792 congested = wb_congested(wb, cong_bits);
793 unlocked_inode_to_wb_end(inode, &lock_cookie);
794 return congested;
795 }
796
797 return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
798}
799EXPORT_SYMBOL_GPL(inode_congested);
800
801
802
803
804
805
806
807
808
809
810static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
811{
812 unsigned long this_bw = wb->avg_write_bandwidth;
813 unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
814
815 if (nr_pages == LONG_MAX)
816 return LONG_MAX;
817
818
819
820
821
822
823 if (!tot_bw || this_bw >= tot_bw)
824 return nr_pages;
825 else
826 return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw);
827}
828
829
830
831
832
833
834
835
836
837
838
839
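/**
 * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
 * @bdi: target backing_dev_info
 * @base_work: wb_writeback_work to issue
 * @skip_if_busy: skip wb's which already have writeback in progress
 *
 * Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which
 * have dirty inodes.  If @base_work->nr_pages isn't %LONG_MAX, it's
 * distributed to the wbs according to each wb's proportion in the total
 * active write bandwidth of @bdi.
 */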
840static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
841 struct wb_writeback_work *base_work,
842 bool skip_if_busy)
843{
844 struct bdi_writeback *last_wb = NULL;
845 struct bdi_writeback *wb = list_entry(&bdi->wb_list,
846 struct bdi_writeback, bdi_node);
847
848 might_sleep();
849restart:
850 rcu_read_lock();
851 list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
852 DEFINE_WB_COMPLETION(fallback_work_done, bdi);
853 struct wb_writeback_work fallback_work;
854 struct wb_writeback_work *work;
855 long nr_pages;
856
857 if (last_wb) {
858 wb_put(last_wb);
859 last_wb = NULL;
860 }
861
862
863 if (!wb_has_dirty_io(wb) &&
864 (base_work->sync_mode == WB_SYNC_NONE ||
865 list_empty(&wb->b_dirty_time)))
866 continue;
867 if (skip_if_busy && writeback_in_progress(wb))
868 continue;
869
870 nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);
871
872 work = kmalloc(sizeof(*work), GFP_ATOMIC);
873 if (work) {
874 *work = *base_work;
875 work->nr_pages = nr_pages;
876 work->auto_free = 1;
877 wb_queue_work(wb, work);
878 continue;
879 }
880
881
882 work = &fallback_work;
883 *work = *base_work;
884 work->nr_pages = nr_pages;
885 work->auto_free = 0;
886 work->done = &fallback_work_done;
887
888 wb_queue_work(wb, work);
889
890
891
892
893
894
895 wb_get(wb);
896 last_wb = wb;
897
898 rcu_read_unlock();
899 wb_wait_for_completion(&fallback_work_done);
900 goto restart;
901 }
902 rcu_read_unlock();
903
904 if (last_wb)
905 wb_put(last_wb);
906}
907
908
909
910
911
912
913
914
915
916
917
918
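/**
 * cgroup_writeback_by_id - initiate cgroup writeback from bdi and memcg IDs
 * @bdi_id: target bdi id
 * @memcg_id: target memcg css id
 * @nr: number of pages to write, 0 for best-effort dirty flushing
 * @reason: reason why some writeback work is initiated
 * @done: target wb_completion
 *
 * Issue writeback on the bdi_writeback identified by @bdi_id and @memcg_id.
 * Returns 0 if the work was queued, -ENOENT if the bdi, memcg or wb could
 * not be found, and -ENOMEM if the work item could not be allocated.
 */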
919int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr,
920 enum wb_reason reason, struct wb_completion *done)
921{
922 struct backing_dev_info *bdi;
923 struct cgroup_subsys_state *memcg_css;
924 struct bdi_writeback *wb;
925 struct wb_writeback_work *work;
926 int ret;
927
928
929 bdi = bdi_get_by_id(bdi_id);
930 if (!bdi)
931 return -ENOENT;
932
933 rcu_read_lock();
934 memcg_css = css_from_id(memcg_id, &memory_cgrp_subsys);
935 if (memcg_css && !css_tryget(memcg_css))
936 memcg_css = NULL;
937 rcu_read_unlock();
938 if (!memcg_css) {
939 ret = -ENOENT;
940 goto out_bdi_put;
941 }
942
943
944
945
946
947 wb = wb_get_lookup(bdi, memcg_css);
948 if (!wb) {
949 ret = -ENOENT;
950 goto out_css_put;
951 }
952
953
954
955
956
957
958
959
960 if (!nr) {
961 unsigned long filepages, headroom, dirty, writeback;
962
963 mem_cgroup_wb_stats(wb, &filepages, &headroom, &dirty,
964 &writeback);
965 nr = dirty * 10 / 8;
966 }
967
968
969 work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN);
970 if (work) {
971 work->nr_pages = nr;
972 work->sync_mode = WB_SYNC_NONE;
973 work->range_cyclic = 1;
974 work->reason = reason;
975 work->done = done;
976 work->auto_free = 1;
977 wb_queue_work(wb, work);
978 ret = 0;
979 } else {
980 ret = -ENOMEM;
981 }
982
983 wb_put(wb);
984out_css_put:
985 css_put(memcg_css);
986out_bdi_put:
987 bdi_put(bdi);
988 return ret;
989}
990
991
992
993
994
995
996
997
998
999
1000
1001void cgroup_writeback_umount(void)
1002{
1003 if (atomic_read(&isw_nr_in_flight)) {
1004
1005
1006
1007
1008 rcu_barrier();
1009 flush_workqueue(isw_wq);
1010 }
1011}
1012
1013static int __init cgroup_writeback_init(void)
1014{
1015 isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
1016 if (!isw_wq)
1017 return -ENOMEM;
1018 return 0;
1019}
1020fs_initcall(cgroup_writeback_init);
1021
1022#else
1023
1024static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
1025static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
1026
1027static struct bdi_writeback *
1028locked_inode_to_wb_and_lock_list(struct inode *inode)
1029 __releases(&inode->i_lock)
1030 __acquires(&wb->list_lock)
1031{
1032 struct bdi_writeback *wb = inode_to_wb(inode);
1033
1034 spin_unlock(&inode->i_lock);
1035 spin_lock(&wb->list_lock);
1036 return wb;
1037}
1038
1039static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
1040 __acquires(&wb->list_lock)
1041{
1042 struct bdi_writeback *wb = inode_to_wb(inode);
1043
1044 spin_lock(&wb->list_lock);
1045 return wb;
1046}
1047
1048static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
1049{
1050 return nr_pages;
1051}
1052
1053static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
1054 struct wb_writeback_work *base_work,
1055 bool skip_if_busy)
1056{
1057 might_sleep();
1058
1059 if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) {
1060 base_work->auto_free = 0;
1061 wb_queue_work(&bdi->wb, base_work);
1062 }
1063}
1064
1065#endif
1066
1067
1068
1069
1070
1071static unsigned long get_nr_dirty_pages(void)
1072{
1073 return global_node_page_state(NR_FILE_DIRTY) +
1074 get_nr_dirty_inodes();
1075}
1076
1077static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason)
1078{
1079 if (!wb_has_dirty_io(wb))
1080 return;
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090 if (test_bit(WB_start_all, &wb->state) ||
1091 test_and_set_bit(WB_start_all, &wb->state))
1092 return;
1093
1094 wb->start_all_reason = reason;
1095 wb_wakeup(wb);
1096}
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108void wb_start_background_writeback(struct bdi_writeback *wb)
1109{
1110
1111
1112
1113
1114 trace_writeback_wake_background(wb);
1115 wb_wakeup(wb);
1116}
1117
1118
1119
1120
1121void inode_io_list_del(struct inode *inode)
1122{
1123 struct bdi_writeback *wb;
1124
1125 wb = inode_to_wb_and_lock_list(inode);
1126 spin_lock(&inode->i_lock);
1127 inode_io_list_del_locked(inode, wb);
1128 spin_unlock(&inode->i_lock);
1129 spin_unlock(&wb->list_lock);
1130}
1131EXPORT_SYMBOL(inode_io_list_del);
1132
1133
1134
1135
1136void sb_mark_inode_writeback(struct inode *inode)
1137{
1138 struct super_block *sb = inode->i_sb;
1139 unsigned long flags;
1140
1141 if (list_empty(&inode->i_wb_list)) {
1142 spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
1143 if (list_empty(&inode->i_wb_list)) {
1144 list_add_tail(&inode->i_wb_list, &sb->s_inodes_wb);
1145 trace_sb_mark_inode_writeback(inode);
1146 }
1147 spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
1148 }
1149}
1150
1151
1152
1153
1154void sb_clear_inode_writeback(struct inode *inode)
1155{
1156 struct super_block *sb = inode->i_sb;
1157 unsigned long flags;
1158
1159 if (!list_empty(&inode->i_wb_list)) {
1160 spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
1161 if (!list_empty(&inode->i_wb_list)) {
1162 list_del_init(&inode->i_wb_list);
1163 trace_sb_clear_inode_writeback(inode);
1164 }
1165 spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
1166 }
1167}
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb)
1179{
1180 assert_spin_locked(&inode->i_lock);
1181
1182 if (!list_empty(&wb->b_dirty)) {
1183 struct inode *tail;
1184
1185 tail = wb_inode(wb->b_dirty.next);
1186 if (time_before(inode->dirtied_when, tail->dirtied_when))
1187 inode->dirtied_when = jiffies;
1188 }
1189 inode_io_list_move_locked(inode, wb, &wb->b_dirty);
1190 inode->i_state &= ~I_SYNC_QUEUED;
1191}
1192
1193static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
1194{
1195 spin_lock(&inode->i_lock);
1196 redirty_tail_locked(inode, wb);
1197 spin_unlock(&inode->i_lock);
1198}
1199
1200
1201
1202
1203static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
1204{
1205 inode_io_list_move_locked(inode, wb, &wb->b_more_io);
1206}
1207
1208static void inode_sync_complete(struct inode *inode)
1209{
1210 inode->i_state &= ~I_SYNC;
1211
1212 inode_add_lru(inode);
1213
1214 smp_mb();
1215 wake_up_bit(&inode->i_state, __I_SYNC);
1216}
1217
1218static bool inode_dirtied_after(struct inode *inode, unsigned long t)
1219{
1220 bool ret = time_after(inode->dirtied_when, t);
1221#ifndef CONFIG_64BIT
1222
1223
1224
1225
1226
1227
1228 ret = ret && time_before_eq(inode->dirtied_when, jiffies);
1229#endif
1230 return ret;
1231}
1232
1233#define EXPIRE_DIRTY_ATIME 0x0001
1234
1235
1236
1237
1238
1239static int move_expired_inodes(struct list_head *delaying_queue,
1240 struct list_head *dispatch_queue,
1241 unsigned long dirtied_before)
1242{
1243 LIST_HEAD(tmp);
1244 struct list_head *pos, *node;
1245 struct super_block *sb = NULL;
1246 struct inode *inode;
1247 int do_sb_sort = 0;
1248 int moved = 0;
1249
1250 while (!list_empty(delaying_queue)) {
1251 inode = wb_inode(delaying_queue->prev);
1252 if (inode_dirtied_after(inode, dirtied_before))
1253 break;
1254 list_move(&inode->i_io_list, &tmp);
1255 moved++;
1256 spin_lock(&inode->i_lock);
1257 inode->i_state |= I_SYNC_QUEUED;
1258 spin_unlock(&inode->i_lock);
1259 if (sb_is_blkdev_sb(inode->i_sb))
1260 continue;
1261 if (sb && sb != inode->i_sb)
1262 do_sb_sort = 1;
1263 sb = inode->i_sb;
1264 }
1265
1266
1267 if (!do_sb_sort) {
1268 list_splice(&tmp, dispatch_queue);
1269 goto out;
1270 }
1271
1272
1273 while (!list_empty(&tmp)) {
1274 sb = wb_inode(tmp.prev)->i_sb;
1275 list_for_each_prev_safe(pos, node, &tmp) {
1276 inode = wb_inode(pos);
1277 if (inode->i_sb == sb)
1278 list_move(&inode->i_io_list, dispatch_queue);
1279 }
1280 }
1281out:
1282 return moved;
1283}
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work,
1297 unsigned long dirtied_before)
1298{
1299 int moved;
1300 unsigned long time_expire_jif = dirtied_before;
1301
1302 assert_spin_locked(&wb->list_lock);
1303 list_splice_init(&wb->b_more_io, &wb->b_io);
1304 moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, dirtied_before);
1305 if (!work->for_sync)
1306 time_expire_jif = jiffies - dirtytime_expire_interval * HZ;
1307 moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
1308 time_expire_jif);
1309 if (moved)
1310 wb_io_lists_populated(wb);
1311 trace_writeback_queue_io(wb, work, dirtied_before, moved);
1312}
1313
1314static int write_inode(struct inode *inode, struct writeback_control *wbc)
1315{
1316 int ret;
1317
1318 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) {
1319 trace_writeback_write_inode_start(inode, wbc);
1320 ret = inode->i_sb->s_op->write_inode(inode, wbc);
1321 trace_writeback_write_inode(inode, wbc);
1322 return ret;
1323 }
1324 return 0;
1325}
1326
1327
1328
1329
1330
1331static void __inode_wait_for_writeback(struct inode *inode)
1332 __releases(inode->i_lock)
1333 __acquires(inode->i_lock)
1334{
1335 DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
1336 wait_queue_head_t *wqh;
1337
1338 wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
1339 while (inode->i_state & I_SYNC) {
1340 spin_unlock(&inode->i_lock);
1341 __wait_on_bit(wqh, &wq, bit_wait,
1342 TASK_UNINTERRUPTIBLE);
1343 spin_lock(&inode->i_lock);
1344 }
1345}
1346
1347
1348
1349
1350void inode_wait_for_writeback(struct inode *inode)
1351{
1352 spin_lock(&inode->i_lock);
1353 __inode_wait_for_writeback(inode);
1354 spin_unlock(&inode->i_lock);
1355}
1356
1357
1358
1359
1360
1361
1362static void inode_sleep_on_writeback(struct inode *inode)
1363 __releases(inode->i_lock)
1364{
1365 DEFINE_WAIT(wait);
1366 wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
1367 int sleep;
1368
1369 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
1370 sleep = inode->i_state & I_SYNC;
1371 spin_unlock(&inode->i_lock);
1372 if (sleep)
1373 schedule();
1374 finish_wait(wqh, &wait);
1375}
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
1386 struct writeback_control *wbc)
1387{
1388 if (inode->i_state & I_FREEING)
1389 return;
1390
1391
1392
1393
1394
1395
1396 if ((inode->i_state & I_DIRTY) &&
1397 (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
1398 inode->dirtied_when = jiffies;
1399
1400 if (wbc->pages_skipped) {
1401
1402
1403
1404
1405 redirty_tail_locked(inode, wb);
1406 return;
1407 }
1408
1409 if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
1410
1411
1412
1413
1414 if (wbc->nr_to_write <= 0) {
1415
1416 requeue_io(inode, wb);
1417 } else {
1418
1419
1420
1421
1422
1423
1424
1425 redirty_tail_locked(inode, wb);
1426 }
1427 } else if (inode->i_state & I_DIRTY) {
1428
1429
1430
1431
1432
1433 redirty_tail_locked(inode, wb);
1434 } else if (inode->i_state & I_DIRTY_TIME) {
1435 inode->dirtied_when = jiffies;
1436 inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
1437 inode->i_state &= ~I_SYNC_QUEUED;
1438 } else {
1439
1440 inode_io_list_del_locked(inode, wb);
1441 }
1442}
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455static int
1456__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
1457{
1458 struct address_space *mapping = inode->i_mapping;
1459 long nr_to_write = wbc->nr_to_write;
1460 unsigned dirty;
1461 int ret;
1462
1463 WARN_ON(!(inode->i_state & I_SYNC));
1464
1465 trace_writeback_single_inode_start(inode, wbc, nr_to_write);
1466
1467 ret = do_writepages(mapping, wbc);
1468
1469
1470
1471
1472
1473
1474
1475
1476 if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
1477 int err = filemap_fdatawait(mapping);
1478 if (ret == 0)
1479 ret = err;
1480 }
1481
1482
1483
1484
1485
1486
1487 if ((inode->i_state & I_DIRTY_TIME) &&
1488 (wbc->sync_mode == WB_SYNC_ALL ||
1489 time_after(jiffies, inode->dirtied_time_when +
1490 dirtytime_expire_interval * HZ))) {
1491 trace_writeback_lazytime(inode);
1492 mark_inode_dirty_sync(inode);
1493 }
1494
1495
1496
1497
1498
1499
1500
1501 spin_lock(&inode->i_lock);
1502 dirty = inode->i_state & I_DIRTY;
1503 inode->i_state &= ~dirty;
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516 smp_mb();
1517
1518 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
1519 inode->i_state |= I_DIRTY_PAGES;
1520
1521 spin_unlock(&inode->i_lock);
1522
1523
1524 if (dirty & ~I_DIRTY_PAGES) {
1525 int err = write_inode(inode, wbc);
1526 if (ret == 0)
1527 ret = err;
1528 }
1529 trace_writeback_single_inode(inode, wbc, nr_to_write);
1530 return ret;
1531}
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542static int writeback_single_inode(struct inode *inode,
1543 struct writeback_control *wbc)
1544{
1545 struct bdi_writeback *wb;
1546 int ret = 0;
1547
1548 spin_lock(&inode->i_lock);
1549 if (!atomic_read(&inode->i_count))
1550 WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
1551 else
1552 WARN_ON(inode->i_state & I_WILL_FREE);
1553
1554 if (inode->i_state & I_SYNC) {
1555
1556
1557
1558
1559
1560
1561 if (wbc->sync_mode != WB_SYNC_ALL)
1562 goto out;
1563 __inode_wait_for_writeback(inode);
1564 }
1565 WARN_ON(inode->i_state & I_SYNC);
1566
1567
1568
1569
1570
1571
1572
1573 if (!(inode->i_state & I_DIRTY_ALL) &&
1574 (wbc->sync_mode != WB_SYNC_ALL ||
1575 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
1576 goto out;
1577 inode->i_state |= I_SYNC;
1578 wbc_attach_and_unlock_inode(wbc, inode);
1579
1580 ret = __writeback_single_inode(inode, wbc);
1581
1582 wbc_detach_inode(wbc);
1583
1584 wb = inode_to_wb_and_lock_list(inode);
1585 spin_lock(&inode->i_lock);
1586
1587
1588
1589
1590
1591 if (!(inode->i_state & I_DIRTY_ALL))
1592 inode_io_list_del_locked(inode, wb);
1593 spin_unlock(&wb->list_lock);
1594 inode_sync_complete(inode);
1595out:
1596 spin_unlock(&inode->i_lock);
1597 return ret;
1598}
1599
1600static long writeback_chunk_size(struct bdi_writeback *wb,
1601 struct wb_writeback_work *work)
1602{
1603 long pages;
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618 if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
1619 pages = LONG_MAX;
1620 else {
1621 pages = min(wb->avg_write_bandwidth / 2,
1622 global_wb_domain.dirty_limit / DIRTY_SCOPE);
1623 pages = min(pages, work->nr_pages);
1624 pages = round_down(pages + MIN_WRITEBACK_PAGES,
1625 MIN_WRITEBACK_PAGES);
1626 }
1627
1628 return pages;
1629}
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640static long writeback_sb_inodes(struct super_block *sb,
1641 struct bdi_writeback *wb,
1642 struct wb_writeback_work *work)
1643{
1644 struct writeback_control wbc = {
1645 .sync_mode = work->sync_mode,
1646 .tagged_writepages = work->tagged_writepages,
1647 .for_kupdate = work->for_kupdate,
1648 .for_background = work->for_background,
1649 .for_sync = work->for_sync,
1650 .range_cyclic = work->range_cyclic,
1651 .range_start = 0,
1652 .range_end = LLONG_MAX,
1653 };
1654 unsigned long start_time = jiffies;
1655 long write_chunk;
1656 long wrote = 0;
1657
1658 while (!list_empty(&wb->b_io)) {
1659 struct inode *inode = wb_inode(wb->b_io.prev);
1660 struct bdi_writeback *tmp_wb;
1661
1662 if (inode->i_sb != sb) {
1663 if (work->sb) {
1664
1665
1666
1667
1668
1669 redirty_tail(inode, wb);
1670 continue;
1671 }
1672
1673
1674
1675
1676
1677
1678 break;
1679 }
1680
1681
1682
1683
1684
1685
1686 spin_lock(&inode->i_lock);
1687 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
1688 redirty_tail_locked(inode, wb);
1689 spin_unlock(&inode->i_lock);
1690 continue;
1691 }
1692 if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702 spin_unlock(&inode->i_lock);
1703 requeue_io(inode, wb);
1704 trace_writeback_sb_inodes_requeue(inode);
1705 continue;
1706 }
1707 spin_unlock(&wb->list_lock);
1708
1709
1710
1711
1712
1713
1714 if (inode->i_state & I_SYNC) {
1715
1716 inode_sleep_on_writeback(inode);
1717
1718 spin_lock(&wb->list_lock);
1719 continue;
1720 }
1721 inode->i_state |= I_SYNC;
1722 wbc_attach_and_unlock_inode(&wbc, inode);
1723
1724 write_chunk = writeback_chunk_size(wb, work);
1725 wbc.nr_to_write = write_chunk;
1726 wbc.pages_skipped = 0;
1727
1728
1729
1730
1731
1732 __writeback_single_inode(inode, &wbc);
1733
1734 wbc_detach_inode(&wbc);
1735 work->nr_pages -= write_chunk - wbc.nr_to_write;
1736 wrote += write_chunk - wbc.nr_to_write;
1737
1738 if (need_resched()) {
1739
1740
1741
1742
1743
1744
1745
1746
1747 blk_flush_plug(current);
1748 cond_resched();
1749 }
1750
1751
1752
1753
1754
1755 tmp_wb = inode_to_wb_and_lock_list(inode);
1756 spin_lock(&inode->i_lock);
1757 if (!(inode->i_state & I_DIRTY_ALL))
1758 wrote++;
1759 requeue_inode(inode, tmp_wb, &wbc);
1760 inode_sync_complete(inode);
1761 spin_unlock(&inode->i_lock);
1762
1763 if (unlikely(tmp_wb != wb)) {
1764 spin_unlock(&tmp_wb->list_lock);
1765 spin_lock(&wb->list_lock);
1766 }
1767
1768
1769
1770
1771
1772 if (wrote) {
1773 if (time_is_before_jiffies(start_time + HZ / 10UL))
1774 break;
1775 if (work->nr_pages <= 0)
1776 break;
1777 }
1778 }
1779 return wrote;
1780}
1781
1782static long __writeback_inodes_wb(struct bdi_writeback *wb,
1783 struct wb_writeback_work *work)
1784{
1785 unsigned long start_time = jiffies;
1786 long wrote = 0;
1787
1788 while (!list_empty(&wb->b_io)) {
1789 struct inode *inode = wb_inode(wb->b_io.prev);
1790 struct super_block *sb = inode->i_sb;
1791
1792 if (!trylock_super(sb)) {
1793
1794
1795
1796
1797
1798 redirty_tail(inode, wb);
1799 continue;
1800 }
1801 wrote += writeback_sb_inodes(sb, wb, work);
1802 up_read(&sb->s_umount);
1803
1804
1805 if (wrote) {
1806 if (time_is_before_jiffies(start_time + HZ / 10UL))
1807 break;
1808 if (work->nr_pages <= 0)
1809 break;
1810 }
1811 }
1812
1813 return wrote;
1814}
1815
1816static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
1817 enum wb_reason reason)
1818{
1819 struct wb_writeback_work work = {
1820 .nr_pages = nr_pages,
1821 .sync_mode = WB_SYNC_NONE,
1822 .range_cyclic = 1,
1823 .reason = reason,
1824 };
1825 struct blk_plug plug;
1826
1827 blk_start_plug(&plug);
1828 spin_lock(&wb->list_lock);
1829 if (list_empty(&wb->b_io))
1830 queue_io(wb, &work, jiffies);
1831 __writeback_inodes_wb(wb, &work);
1832 spin_unlock(&wb->list_lock);
1833 blk_finish_plug(&plug);
1834
1835 return nr_pages - work.nr_pages;
1836}
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
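/*
 * Explicit flushing or periodic writeback of "old" data.
 *
 * Define "old": the first time one of an inode's pages is dirtied, we mark
 * the dirtying-time in the inode's address_space.  So this periodic
 * writeback code just walks the superblock inode list, writing back any
 * inodes which are older than a specific point in time.
 *
 * Try to run once per dirty_writeback_interval.  But if a writeback event
 * takes longer than a dirty_writeback_interval interval, then leave a
 * one-second gap.
 *
 * dirtied_before takes precedence over nr_to_write.  So we'll only write
 * back all dirty pages if they are all attached to "old" mappings.
 */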
1853static long wb_writeback(struct bdi_writeback *wb,
1854 struct wb_writeback_work *work)
1855{
1856 unsigned long wb_start = jiffies;
1857 long nr_pages = work->nr_pages;
1858 unsigned long dirtied_before = jiffies;
1859 struct inode *inode;
1860 long progress;
1861 struct blk_plug plug;
1862
1863 blk_start_plug(&plug);
1864 spin_lock(&wb->list_lock);
1865 for (;;) {
1866
1867
1868
1869 if (work->nr_pages <= 0)
1870 break;
1871
1872
1873
1874
1875
1876
1877
1878 if ((work->for_background || work->for_kupdate) &&
1879 !list_empty(&wb->work_list))
1880 break;
1881
1882
1883
1884
1885
1886 if (work->for_background && !wb_over_bg_thresh(wb))
1887 break;
1888
1889
1890
1891
1892
1893
1894
1895 if (work->for_kupdate) {
1896 dirtied_before = jiffies -
1897 msecs_to_jiffies(dirty_expire_interval * 10);
1898 } else if (work->for_background)
1899 dirtied_before = jiffies;
1900
1901 trace_writeback_start(wb, work);
1902 if (list_empty(&wb->b_io))
1903 queue_io(wb, work, dirtied_before);
1904 if (work->sb)
1905 progress = writeback_sb_inodes(work->sb, wb, work);
1906 else
1907 progress = __writeback_inodes_wb(wb, work);
1908 trace_writeback_written(wb, work);
1909
1910 wb_update_bandwidth(wb, wb_start);
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920 if (progress)
1921 continue;
1922
1923
1924
1925 if (list_empty(&wb->b_more_io))
1926 break;
1927
1928
1929
1930
1931
1932 trace_writeback_wait(wb, work);
1933 inode = wb_inode(wb->b_more_io.prev);
1934 spin_lock(&inode->i_lock);
1935 spin_unlock(&wb->list_lock);
1936
1937 inode_sleep_on_writeback(inode);
1938 spin_lock(&wb->list_lock);
1939 }
1940 spin_unlock(&wb->list_lock);
1941 blk_finish_plug(&plug);
1942
1943 return nr_pages - work->nr_pages;
1944}
1945
1946
1947
1948
1949static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
1950{
1951 struct wb_writeback_work *work = NULL;
1952
1953 spin_lock_bh(&wb->work_lock);
1954 if (!list_empty(&wb->work_list)) {
1955 work = list_entry(wb->work_list.next,
1956 struct wb_writeback_work, list);
1957 list_del_init(&work->list);
1958 }
1959 spin_unlock_bh(&wb->work_lock);
1960 return work;
1961}
1962
1963static long wb_check_background_flush(struct bdi_writeback *wb)
1964{
1965 if (wb_over_bg_thresh(wb)) {
1966
1967 struct wb_writeback_work work = {
1968 .nr_pages = LONG_MAX,
1969 .sync_mode = WB_SYNC_NONE,
1970 .for_background = 1,
1971 .range_cyclic = 1,
1972 .reason = WB_REASON_BACKGROUND,
1973 };
1974
1975 return wb_writeback(wb, &work);
1976 }
1977
1978 return 0;
1979}
1980
1981static long wb_check_old_data_flush(struct bdi_writeback *wb)
1982{
1983 unsigned long expired;
1984 long nr_pages;
1985
1986
1987
1988
1989 if (!dirty_writeback_interval)
1990 return 0;
1991
1992 expired = wb->last_old_flush +
1993 msecs_to_jiffies(dirty_writeback_interval * 10);
1994 if (time_before(jiffies, expired))
1995 return 0;
1996
1997 wb->last_old_flush = jiffies;
1998 nr_pages = get_nr_dirty_pages();
1999
2000 if (nr_pages) {
2001 struct wb_writeback_work work = {
2002 .nr_pages = nr_pages,
2003 .sync_mode = WB_SYNC_NONE,
2004 .for_kupdate = 1,
2005 .range_cyclic = 1,
2006 .reason = WB_REASON_PERIODIC,
2007 };
2008
2009 return wb_writeback(wb, &work);
2010 }
2011
2012 return 0;
2013}
2014
2015static long wb_check_start_all(struct bdi_writeback *wb)
2016{
2017 long nr_pages;
2018
2019 if (!test_bit(WB_start_all, &wb->state))
2020 return 0;
2021
2022 nr_pages = get_nr_dirty_pages();
2023 if (nr_pages) {
2024 struct wb_writeback_work work = {
2025 .nr_pages = wb_split_bdi_pages(wb, nr_pages),
2026 .sync_mode = WB_SYNC_NONE,
2027 .range_cyclic = 1,
2028 .reason = wb->start_all_reason,
2029 };
2030
2031 nr_pages = wb_writeback(wb, &work);
2032 }
2033
2034 clear_bit(WB_start_all, &wb->state);
2035 return nr_pages;
2036}
2037
2038
2039
2040
2041
2042static long wb_do_writeback(struct bdi_writeback *wb)
2043{
2044 struct wb_writeback_work *work;
2045 long wrote = 0;
2046
2047 set_bit(WB_writeback_running, &wb->state);
2048 while ((work = get_next_work_item(wb)) != NULL) {
2049 trace_writeback_exec(wb, work);
2050 wrote += wb_writeback(wb, work);
2051 finish_writeback_work(wb, work);
2052 }
2053
2054
2055
2056
2057 wrote += wb_check_start_all(wb);
2058
2059
2060
2061
2062 wrote += wb_check_old_data_flush(wb);
2063 wrote += wb_check_background_flush(wb);
2064 clear_bit(WB_writeback_running, &wb->state);
2065
2066 return wrote;
2067}
2068
2069
2070
2071
2072
2073void wb_workfn(struct work_struct *work)
2074{
2075 struct bdi_writeback *wb = container_of(to_delayed_work(work),
2076 struct bdi_writeback, dwork);
2077 long pages_written;
2078
2079 set_worker_desc("flush-%s", bdi_dev_name(wb->bdi));
2080 current->flags |= PF_SWAPWRITE;
2081
2082 if (likely(!current_is_workqueue_rescuer() ||
2083 !test_bit(WB_registered, &wb->state))) {
2084
2085
2086
2087
2088
2089
2090 do {
2091 pages_written = wb_do_writeback(wb);
2092 trace_writeback_pages_written(pages_written);
2093 } while (!list_empty(&wb->work_list));
2094 } else {
2095
2096
2097
2098
2099
2100 pages_written = writeback_inodes_wb(wb, 1024,
2101 WB_REASON_FORKER_THREAD);
2102 trace_writeback_pages_written(pages_written);
2103 }
2104
2105 if (!list_empty(&wb->work_list))
2106 wb_wakeup(wb);
2107 else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
2108 wb_wakeup_delayed(wb);
2109
2110 current->flags &= ~PF_SWAPWRITE;
2111}
2112
2113
2114
2115
2116
2117static void __wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
2118 enum wb_reason reason)
2119{
2120 struct bdi_writeback *wb;
2121
2122 if (!bdi_has_dirty_io(bdi))
2123 return;
2124
2125 list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
2126 wb_start_writeback(wb, reason);
2127}
2128
2129void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
2130 enum wb_reason reason)
2131{
2132 rcu_read_lock();
2133 __wakeup_flusher_threads_bdi(bdi, reason);
2134 rcu_read_unlock();
2135}
2136
2137
2138
2139
2140void wakeup_flusher_threads(enum wb_reason reason)
2141{
2142 struct backing_dev_info *bdi;
2143
2144
2145
2146
2147 if (blk_needs_flush_plug(current))
2148 blk_schedule_flush_plug(current);
2149
2150 rcu_read_lock();
2151 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
2152 __wakeup_flusher_threads_bdi(bdi, reason);
2153 rcu_read_unlock();
2154}
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171static void wakeup_dirtytime_writeback(struct work_struct *w);
2172static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback);
2173
2174static void wakeup_dirtytime_writeback(struct work_struct *w)
2175{
2176 struct backing_dev_info *bdi;
2177
2178 rcu_read_lock();
2179 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
2180 struct bdi_writeback *wb;
2181
2182 list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
2183 if (!list_empty(&wb->b_dirty_time))
2184 wb_wakeup(wb);
2185 }
2186 rcu_read_unlock();
2187 schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
2188}
2189
2190static int __init start_dirtytime_writeback(void)
2191{
2192 schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
2193 return 0;
2194}
2195__initcall(start_dirtytime_writeback);
2196
2197int dirtytime_interval_handler(struct ctl_table *table, int write,
2198 void *buffer, size_t *lenp, loff_t *ppos)
2199{
2200 int ret;
2201
2202 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
2203 if (ret == 0 && write)
2204 mod_delayed_work(system_wq, &dirtytime_work, 0);
2205 return ret;
2206}
2207
2208static noinline void block_dump___mark_inode_dirty(struct inode *inode)
2209{
2210 if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
2211 struct dentry *dentry;
2212 const char *name = "?";
2213
2214 dentry = d_find_alias(inode);
2215 if (dentry) {
2216 spin_lock(&dentry->d_lock);
2217 name = (const char *) dentry->d_name.name;
2218 }
2219 printk(KERN_DEBUG
2220 "%s(%d): dirtied inode %lu (%s) on %s\n",
2221 current->comm, task_pid_nr(current), inode->i_ino,
2222 name, inode->i_sb->s_id);
2223 if (dentry) {
2224 spin_unlock(&dentry->d_lock);
2225 dput(dentry);
2226 }
2227 }
2228}
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
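/**
 * __mark_inode_dirty -	internal function to mark an inode dirty
 * @inode: inode to mark
 * @flags: what kind of dirty, e.g. I_DIRTY_SYNC.  This can be a combination
 *	   of multiple I_DIRTY_* flags, except that I_DIRTY_TIME can't be
 *	   combined with I_DIRTY_PAGES.
 *
 * Mark an inode as dirty.  We notify the filesystem, then update the
 * inode's dirty flags and, if the inode wasn't already queued, put it on
 * the appropriate wb io list and possibly kick the flusher.
 *
 * Most callers should use mark_inode_dirty() or mark_inode_dirty_sync()
 * instead of calling this directly.
 *
 * CAREFUL!  We only add the inode to the dirty list if it is hashed or if
 * it refers to a blockdev.  Unhashed inodes will never be added to the
 * dirty list even if they are later hashed, as they will have been marked
 * dirty already.  In short, ensure you hash any inodes _before_ you start
 * marking them dirty.
 */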
2257void __mark_inode_dirty(struct inode *inode, int flags)
2258{
2259 struct super_block *sb = inode->i_sb;
2260 int dirtytime = 0;
2261
2262 trace_writeback_mark_inode_dirty(inode, flags);
2263
2264 if (flags & I_DIRTY_INODE) {
2265
2266
2267
2268
2269
2270
2271
2272 trace_writeback_dirty_inode_start(inode, flags);
2273 if (sb->s_op->dirty_inode)
2274 sb->s_op->dirty_inode(inode, flags & I_DIRTY_INODE);
2275 trace_writeback_dirty_inode(inode, flags);
2276
2277
2278 flags &= ~I_DIRTY_TIME;
2279 } else {
2280
2281
2282
2283
2284
2285 dirtytime = flags & I_DIRTY_TIME;
2286 WARN_ON_ONCE(dirtytime && flags != I_DIRTY_TIME);
2287 }
2288
2289
2290
2291
2292
2293 smp_mb();
2294
2295 if (((inode->i_state & flags) == flags) ||
2296 (dirtytime && (inode->i_state & I_DIRTY_INODE)))
2297 return;
2298
2299 if (unlikely(block_dump))
2300 block_dump___mark_inode_dirty(inode);
2301
2302 spin_lock(&inode->i_lock);
2303 if (dirtytime && (inode->i_state & I_DIRTY_INODE))
2304 goto out_unlock_inode;
2305 if ((inode->i_state & flags) != flags) {
2306 const int was_dirty = inode->i_state & I_DIRTY;
2307
2308 inode_attach_wb(inode, NULL);
2309
2310
2311 if (flags & I_DIRTY_INODE)
2312 inode->i_state &= ~I_DIRTY_TIME;
2313 inode->i_state |= flags;
2314
2315
2316
2317
2318
2319
2320
2321 if (inode->i_state & I_SYNC_QUEUED)
2322 goto out_unlock_inode;
2323
2324
2325
2326
2327
2328 if (!S_ISBLK(inode->i_mode)) {
2329 if (inode_unhashed(inode))
2330 goto out_unlock_inode;
2331 }
2332 if (inode->i_state & I_FREEING)
2333 goto out_unlock_inode;
2334
2335
2336
2337
2338
2339 if (!was_dirty) {
2340 struct bdi_writeback *wb;
2341 struct list_head *dirty_list;
2342 bool wakeup_bdi = false;
2343
2344 wb = locked_inode_to_wb_and_lock_list(inode);
2345
2346 inode->dirtied_when = jiffies;
2347 if (dirtytime)
2348 inode->dirtied_time_when = jiffies;
2349
2350 if (inode->i_state & I_DIRTY)
2351 dirty_list = &wb->b_dirty;
2352 else
2353 dirty_list = &wb->b_dirty_time;
2354
2355 wakeup_bdi = inode_io_list_move_locked(inode, wb,
2356 dirty_list);
2357
2358 spin_unlock(&wb->list_lock);
2359 trace_writeback_dirty_inode_enqueue(inode);
2360
2361
2362
2363
2364
2365
2366
2367 if (wakeup_bdi &&
2368 (wb->bdi->capabilities & BDI_CAP_WRITEBACK))
2369 wb_wakeup_delayed(wb);
2370 return;
2371 }
2372 }
2373out_unlock_inode:
2374 spin_unlock(&inode->i_lock);
2375}
2376EXPORT_SYMBOL(__mark_inode_dirty);
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
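/*
 * The helper for sync_inodes_sb(): wait for writeback on all inodes that
 * had pages under writeback.  Inodes are taken from sb->s_inodes_wb, which
 * is maintained by sb_mark_inode_writeback()/sb_clear_inode_writeback(),
 * so only inodes which actually had pages tagged for writeback are visited
 * and waited on via filemap_fdatawait_keep_errors().
 */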
2387static void wait_sb_inodes(struct super_block *sb)
2388{
2389 LIST_HEAD(sync_list);
2390
2391
2392
2393
2394
2395 WARN_ON(!rwsem_is_locked(&sb->s_umount));
2396
2397 mutex_lock(&sb->s_sync_lock);
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408 rcu_read_lock();
2409 spin_lock_irq(&sb->s_inode_wblist_lock);
2410 list_splice_init(&sb->s_inodes_wb, &sync_list);
2411
2412
2413
2414
2415
2416
2417
2418
2419 while (!list_empty(&sync_list)) {
2420 struct inode *inode = list_first_entry(&sync_list, struct inode,
2421 i_wb_list);
2422 struct address_space *mapping = inode->i_mapping;
2423
2424
2425
2426
2427
2428
2429
2430 list_move_tail(&inode->i_wb_list, &sb->s_inodes_wb);
2431
2432
2433
2434
2435
2436
2437 if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
2438 continue;
2439
2440 spin_unlock_irq(&sb->s_inode_wblist_lock);
2441
2442 spin_lock(&inode->i_lock);
2443 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
2444 spin_unlock(&inode->i_lock);
2445
2446 spin_lock_irq(&sb->s_inode_wblist_lock);
2447 continue;
2448 }
2449 __iget(inode);
2450 spin_unlock(&inode->i_lock);
2451 rcu_read_unlock();
2452
2453
2454
2455
2456
2457
2458 filemap_fdatawait_keep_errors(mapping);
2459
2460 cond_resched();
2461
2462 iput(inode);
2463
2464 rcu_read_lock();
2465 spin_lock_irq(&sb->s_inode_wblist_lock);
2466 }
2467 spin_unlock_irq(&sb->s_inode_wblist_lock);
2468 rcu_read_unlock();
2469 mutex_unlock(&sb->s_sync_lock);
2470}
2471
2472static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
2473 enum wb_reason reason, bool skip_if_busy)
2474{
2475 struct backing_dev_info *bdi = sb->s_bdi;
2476 DEFINE_WB_COMPLETION(done, bdi);
2477 struct wb_writeback_work work = {
2478 .sb = sb,
2479 .sync_mode = WB_SYNC_NONE,
2480 .tagged_writepages = 1,
2481 .done = &done,
2482 .nr_pages = nr,
2483 .reason = reason,
2484 };
2485
2486 if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
2487 return;
2488 WARN_ON(!rwsem_is_locked(&sb->s_umount));
2489
2490 bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy);
2491 wb_wait_for_completion(&done);
2492}
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
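/**
 * writeback_inodes_sb_nr - writeback dirty inodes from given super_block
 * @sb: the superblock
 * @nr: the number of pages to write
 * @reason: reason why some writeback work was initiated
 *
 * Start writeback on some inodes on this super_block.  No guarantees are
 * made on how many (if any) will be written, and this function does not
 * wait for IO completion of submitted IO.
 */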
2504void writeback_inodes_sb_nr(struct super_block *sb,
2505 unsigned long nr,
2506 enum wb_reason reason)
2507{
2508 __writeback_inodes_sb_nr(sb, nr, reason, false);
2509}
2510EXPORT_SYMBOL(writeback_inodes_sb_nr);
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
2522{
2523 return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
2524}
2525EXPORT_SYMBOL(writeback_inodes_sb);
2526
2527
2528
2529
2530
2531
2532
2533
2534void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
2535{
2536 if (!down_read_trylock(&sb->s_umount))
2537 return;
2538
2539 __writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason, true);
2540 up_read(&sb->s_umount);
2541}
2542EXPORT_SYMBOL(try_to_writeback_inodes_sb);
2543
2544
2545
2546
2547
2548
2549
2550
2551void sync_inodes_sb(struct super_block *sb)
2552{
2553 struct backing_dev_info *bdi = sb->s_bdi;
2554 DEFINE_WB_COMPLETION(done, bdi);
2555 struct wb_writeback_work work = {
2556 .sb = sb,
2557 .sync_mode = WB_SYNC_ALL,
2558 .nr_pages = LONG_MAX,
2559 .range_cyclic = 0,
2560 .done = &done,
2561 .reason = WB_REASON_SYNC,
2562 .for_sync = 1,
2563 };
2564
2565
2566
2567
2568
2569
2570 if (bdi == &noop_backing_dev_info)
2571 return;
2572 WARN_ON(!rwsem_is_locked(&sb->s_umount));
2573
2574
2575 bdi_down_write_wb_switch_rwsem(bdi);
2576 bdi_split_work_to_wbs(bdi, &work, false);
2577 wb_wait_for_completion(&done);
2578 bdi_up_write_wb_switch_rwsem(bdi);
2579
2580 wait_sb_inodes(sb);
2581}
2582EXPORT_SYMBOL(sync_inodes_sb);
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594int write_inode_now(struct inode *inode, int sync)
2595{
2596 struct writeback_control wbc = {
2597 .nr_to_write = LONG_MAX,
2598 .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
2599 .range_start = 0,
2600 .range_end = LLONG_MAX,
2601 };
2602
2603 if (!mapping_can_writeback(inode->i_mapping))
2604 wbc.nr_to_write = 0;
2605
2606 might_sleep();
2607 return writeback_single_inode(inode, &wbc);
2608}
2609EXPORT_SYMBOL(write_inode_now);
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622int sync_inode(struct inode *inode, struct writeback_control *wbc)
2623{
2624 return writeback_single_inode(inode, wbc);
2625}
2626EXPORT_SYMBOL(sync_inode);
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637int sync_inode_metadata(struct inode *inode, int wait)
2638{
2639 struct writeback_control wbc = {
2640 .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
2641 .nr_to_write = 0,
2642 };
2643
2644 return sync_inode(inode, &wbc);
2645}
2646EXPORT_SYMBOL(sync_inode_metadata);
2647