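// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/fs-writeback.c
 *
 * Contains all the functions related to writing back and waiting
 * upon dirty inodes against superblocks, and writing back dirty
 * pages against inodes.  ie: data writeback.  Writeout of the
 * inode itself is not handled here.
 */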
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/kthread.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/tracepoint.h>
#include <linux/device.h>
#include <linux/memcontrol.h>
#include "internal.h"

/*
 * 4MB minimal write chunk size
 */
#define MIN_WRITEBACK_PAGES	(4096UL >> (PAGE_SHIFT - 10))

/*
 * Passed into wb_writeback(), essentially a subset of writeback_control
 */
struct wb_writeback_work {
	long nr_pages;
	struct super_block *sb;
	enum writeback_sync_modes sync_mode;
	unsigned int tagged_writepages:1;
	unsigned int for_kupdate:1;
	unsigned int range_cyclic:1;
	unsigned int for_background:1;
	unsigned int for_sync:1;	/* sync(2) WB_SYNC_ALL writeback */
	unsigned int auto_free:1;	/* free on completion */
	enum wb_reason reason;		/* why was writeback initiated? */

	struct list_head list;		/* pending work list */
	struct wb_completion *done;	/* set if the caller waits */
};
/*
 * Inodes which only have dirty timestamps are written back after
 * dirtytime_expire_interval seconds, 12 hours by default.  See
 * queue_io() and wakeup_dirtytime_writeback().
 */
unsigned int dirtytime_expire_interval = 12 * 60 * 60;

static inline struct inode *wb_inode(struct list_head *head)
{
	return list_entry(head, struct inode, i_io_list);
}

/*
 * Include the creation of the trace points after defining the
 * wb_writeback_work structure and inline functions so that the definition
 * remains local to this file.
 */
#define CREATE_TRACE_POINTS
#include <trace/events/writeback.h>

EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);

static bool wb_io_lists_populated(struct bdi_writeback *wb)
{
	if (wb_has_dirty_io(wb)) {
		return false;
	} else {
		set_bit(WB_has_dirty_io, &wb->state);
		WARN_ON_ONCE(!wb->avg_write_bandwidth);
		atomic_long_add(wb->avg_write_bandwidth,
				&wb->bdi->tot_write_bandwidth);
		return true;
	}
}
static void wb_io_lists_depopulated(struct bdi_writeback *wb)
{
	if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) &&
	    list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) {
		clear_bit(WB_has_dirty_io, &wb->state);
		WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth,
					&wb->bdi->tot_write_bandwidth) < 0);
	}
}

/**
 * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list
 * @inode: inode to be moved
 * @wb: target bdi_writeback
 * @head: one of @wb->b_{dirty|io|more_io|dirty_time}
 *
 * Move @inode->i_io_list to @head of @wb and set %WB_has_dirty_io.
 * Returns %true if @inode is the first occupant of the !dirty_time IO
 * lists; otherwise, %false.
 */
static bool inode_io_list_move_locked(struct inode *inode,
				      struct bdi_writeback *wb,
				      struct list_head *head)
{
	assert_spin_locked(&wb->list_lock);

	list_move(&inode->i_io_list, head);

	/* dirty_time doesn't count as dirty_io until expiration */
	if (head != &wb->b_dirty_time)
		return wb_io_lists_populated(wb);

	wb_io_lists_depopulated(wb);
	return false;
}

/**
 * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list
 * @inode: inode to be removed
 * @wb: bdi_writeback @inode is being removed from
 *
 * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and
 * clear %WB_has_dirty_io if all are empty afterwards.
 */
static void inode_io_list_del_locked(struct inode *inode,
				     struct bdi_writeback *wb)
{
	assert_spin_locked(&wb->list_lock);
	assert_spin_locked(&inode->i_lock);

	inode->i_state &= ~I_SYNC_QUEUED;
	list_del_init(&inode->i_io_list);
	wb_io_lists_depopulated(wb);
}

static void wb_wakeup(struct bdi_writeback *wb)
{
	spin_lock_bh(&wb->work_lock);
	if (test_bit(WB_registered, &wb->state))
		mod_delayed_work(bdi_wq, &wb->dwork, 0);
	spin_unlock_bh(&wb->work_lock);
}
static void finish_writeback_work(struct bdi_writeback *wb,
				  struct wb_writeback_work *work)
{
	struct wb_completion *done = work->done;

	if (work->auto_free)
		kfree(work);
	if (done) {
		wait_queue_head_t *waitq = done->waitq;

		/* @done can't be accessed after the following dec */
		if (atomic_dec_and_test(&done->cnt))
			wake_up_all(waitq);
	}
}

static void wb_queue_work(struct bdi_writeback *wb,
			  struct wb_writeback_work *work)
{
	trace_writeback_queue(wb, work);

	if (work->done)
		atomic_inc(&work->done->cnt);

	spin_lock_bh(&wb->work_lock);

	if (test_bit(WB_registered, &wb->state)) {
		list_add_tail(&work->list, &wb->work_list);
		mod_delayed_work(bdi_wq, &wb->dwork, 0);
	} else
		finish_writeback_work(wb, work);

	spin_unlock_bh(&wb->work_lock);
}

/**
 * wb_wait_for_completion - wait for completion of bdi_writeback_works
 * @done: target wb_completion
 *
 * Wait for one or more work items issued to @bdi with their ->done field
 * set to @done, which should have been initialized with
 * DEFINE_WB_COMPLETION().  This function returns after all such work items
 * are completed.  Work items which are waited upon aren't freed
 * automatically on completion.
 */
void wb_wait_for_completion(struct wb_completion *done)
{
	atomic_dec(&done->cnt);		/* put down the initial count */
	wait_event(*done->waitq, !atomic_read(&done->cnt));
}
#ifdef CONFIG_CGROUP_WRITEBACK

/*
 * Parameters for foreign inode detection, see wbc_detach_inode().
 *
 * For each writeback round, the bytes written on behalf of the inode's own
 * memcg and of foreign memcgs are tracked by wbc_account_cgroup_owner() and
 * the majority dirtier is determined on detach.  The per-inode history word
 * records, one bit per WB_FRN_HIST_UNIT of writeback time, whether recent
 * rounds were dominated by a foreign (non-owning) cgroup.  Once more than
 * half of the recorded slots are foreign, the inode is switched to the wb
 * of the majority dirtier.
 */
#define WB_FRN_TIME_SHIFT	13	/* 1s = 2^13, upto 8 secs w/ 16bit */
#define WB_FRN_TIME_AVG_SHIFT	3	/* avg = avg * 7/8 + new * 1/8 */
#define WB_FRN_TIME_CUT_DIV	8	/* ignore rounds < avg / 8 */
#define WB_FRN_TIME_PERIOD	(2 * (1 << WB_FRN_TIME_SHIFT))	/* 2s */

#define WB_FRN_HIST_SLOTS	16	/* inode->i_wb_frn_history is 16bit */
#define WB_FRN_HIST_UNIT	(WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS)
					/* each slot's duration is 2s / 16 */
#define WB_FRN_HIST_THR_SLOTS	(WB_FRN_HIST_SLOTS / 2)
					/* if foreign slots > 8, switch */
#define WB_FRN_HIST_MAX_SLOTS	(WB_FRN_HIST_THR_SLOTS / 2 + 1)
					/* one round can affect upto 5 slots */
#define WB_FRN_MAX_IN_FLIGHT	1024	/* don't queue too many concurrently */

static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
static struct workqueue_struct *isw_wq;
249
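/**
 * __inode_attach_wb - associate an inode with its wb
 * @inode: inode of interest
 * @page: page being dirtied (may be %NULL)
 *
 * If @inode doesn't have its wb, associate it with the wb matching the
 * memcg of @page or, if @page is %NULL, %current.  May be called w/ or w/o
 * @inode->i_lock.
 */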
250void __inode_attach_wb(struct inode *inode, struct page *page)
251{
252 struct backing_dev_info *bdi = inode_to_bdi(inode);
253 struct bdi_writeback *wb = NULL;
254
255 if (inode_cgwb_enabled(inode)) {
256 struct cgroup_subsys_state *memcg_css;
257
258 if (page) {
259 memcg_css = mem_cgroup_css_from_page(page);
260 wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
261 } else {
262
263 memcg_css = task_get_css(current, memory_cgrp_id);
264 wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
265 css_put(memcg_css);
266 }
267 }
268
269 if (!wb)
270 wb = &bdi->wb;
271
272
273
274
275
276 if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
277 wb_put(wb);
278}
279EXPORT_SYMBOL_GPL(__inode_attach_wb);
280
281
282
283
284
285
286
287
288
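/**
 * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
 * @inode: inode of interest with i_lock held
 *
 * Returns @inode's wb with its list_lock held.  @inode->i_lock must be
 * held on entry and is released on return.  The returned wb is guaranteed
 * to stay @inode's associated wb until its list_lock is released.
 */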
289static struct bdi_writeback *
290locked_inode_to_wb_and_lock_list(struct inode *inode)
291 __releases(&inode->i_lock)
292 __acquires(&wb->list_lock)
293{
294 while (true) {
295 struct bdi_writeback *wb = inode_to_wb(inode);
296
297
298
299
300
301
302
303 wb_get(wb);
304 spin_unlock(&inode->i_lock);
305 spin_lock(&wb->list_lock);
306
307
308 if (likely(wb == inode->i_wb)) {
309 wb_put(wb);
310 return wb;
311 }
312
313 spin_unlock(&wb->list_lock);
314 wb_put(wb);
315 cpu_relax();
316 spin_lock(&inode->i_lock);
317 }
318}
319
320
321
322
323
324
325
326
327static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
328 __acquires(&wb->list_lock)
329{
330 spin_lock(&inode->i_lock);
331 return locked_inode_to_wb_and_lock_list(inode);
332}
333
334struct inode_switch_wbs_context {
335 struct inode *inode;
336 struct bdi_writeback *new_wb;
337
338 struct rcu_head rcu_head;
339 struct work_struct work;
340};
341
342static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi)
343{
344 down_write(&bdi->wb_switch_rwsem);
345}
346
347static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi)
348{
349 up_write(&bdi->wb_switch_rwsem);
350}
351
352static void inode_switch_wbs_work_fn(struct work_struct *work)
353{
354 struct inode_switch_wbs_context *isw =
355 container_of(work, struct inode_switch_wbs_context, work);
356 struct inode *inode = isw->inode;
357 struct backing_dev_info *bdi = inode_to_bdi(inode);
358 struct address_space *mapping = inode->i_mapping;
359 struct bdi_writeback *old_wb = inode->i_wb;
360 struct bdi_writeback *new_wb = isw->new_wb;
361 XA_STATE(xas, &mapping->i_pages, 0);
362 struct page *page;
363 bool switched = false;
364
365
366
367
368
369 down_read(&bdi->wb_switch_rwsem);
370
371
372
373
374
375
376
377
378
379
380
381 if (old_wb < new_wb) {
382 spin_lock(&old_wb->list_lock);
383 spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
384 } else {
385 spin_lock(&new_wb->list_lock);
386 spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
387 }
388 spin_lock(&inode->i_lock);
389 xa_lock_irq(&mapping->i_pages);
390
391
392
393
394
395 if (unlikely(inode->i_state & I_FREEING))
396 goto skip_switch;
397
398 trace_inode_switch_wbs(inode, old_wb, new_wb);
399
400
401
402
403
404
405 xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_DIRTY) {
406 if (PageDirty(page)) {
407 dec_wb_stat(old_wb, WB_RECLAIMABLE);
408 inc_wb_stat(new_wb, WB_RECLAIMABLE);
409 }
410 }
411
412 xas_set(&xas, 0);
413 xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_WRITEBACK) {
414 WARN_ON_ONCE(!PageWriteback(page));
415 dec_wb_stat(old_wb, WB_WRITEBACK);
416 inc_wb_stat(new_wb, WB_WRITEBACK);
417 }
418
419 wb_get(new_wb);
420
421
422
423
424
425
426
427 if (!list_empty(&inode->i_io_list)) {
428 struct inode *pos;
429
430 inode_io_list_del_locked(inode, old_wb);
431 inode->i_wb = new_wb;
432 list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
433 if (time_after_eq(inode->dirtied_when,
434 pos->dirtied_when))
435 break;
436 inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev);
437 } else {
438 inode->i_wb = new_wb;
439 }
440
441
442 inode->i_wb_frn_winner = 0;
443 inode->i_wb_frn_avg_time = 0;
444 inode->i_wb_frn_history = 0;
445 switched = true;
446skip_switch:
447
448
449
450
451 smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);
452
453 xa_unlock_irq(&mapping->i_pages);
454 spin_unlock(&inode->i_lock);
455 spin_unlock(&new_wb->list_lock);
456 spin_unlock(&old_wb->list_lock);
457
458 up_read(&bdi->wb_switch_rwsem);
459
460 if (switched) {
461 wb_wakeup(new_wb);
462 wb_put(old_wb);
463 }
464 wb_put(new_wb);
465
466 iput(inode);
467 kfree(isw);
468
469 atomic_dec(&isw_nr_in_flight);
470}
471
472static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
473{
474 struct inode_switch_wbs_context *isw = container_of(rcu_head,
475 struct inode_switch_wbs_context, rcu_head);
476
477
478 INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
479 queue_work(isw_wq, &isw->work);
480}
481
482
483
484
485
486
487
488
489
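/**
 * inode_switch_wbs - change the wb association of an inode
 * @inode: target inode
 * @new_wb_id: ID of the new wb
 *
 * Switch @inode's wb association to the wb identified by @new_wb_id.  The
 * switching is performed asynchronously and may fail silently.
 */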
490static void inode_switch_wbs(struct inode *inode, int new_wb_id)
491{
492 struct backing_dev_info *bdi = inode_to_bdi(inode);
493 struct cgroup_subsys_state *memcg_css;
494 struct inode_switch_wbs_context *isw;
495
496
497 if (inode->i_state & I_WB_SWITCH)
498 return;
499
500
501 if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT)
502 return;
503
504 isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
505 if (!isw)
506 return;
507
508
509 rcu_read_lock();
510 memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys);
511 if (memcg_css)
512 isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
513 rcu_read_unlock();
514 if (!isw->new_wb)
515 goto out_free;
516
517
518 spin_lock(&inode->i_lock);
519 if (!(inode->i_sb->s_flags & SB_ACTIVE) ||
520 inode->i_state & (I_WB_SWITCH | I_FREEING) ||
521 inode_to_wb(inode) == isw->new_wb) {
522 spin_unlock(&inode->i_lock);
523 goto out_free;
524 }
525 inode->i_state |= I_WB_SWITCH;
526 __iget(inode);
527 spin_unlock(&inode->i_lock);
528
529 isw->inode = inode;
530
531
532
533
534
535
536
537 call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
538
539 atomic_inc(&isw_nr_in_flight);
540 return;
541
542out_free:
543 if (isw->new_wb)
544 wb_put(isw->new_wb);
545 kfree(isw);
546}
547
548
549
550
551
552
553
554
555
556
557
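/**
 * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
 * @wbc: writeback_control of interest
 * @inode: target inode
 *
 * @inode is locked and about to be written back under the control of @wbc.
 * Record @inode's writeback context into @wbc and unlock the i_lock.  On
 * writeback completion, wbc_detach_inode() should be called.  This is used
 * to track the cgroup writeback context.
 */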
558void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
559 struct inode *inode)
560{
561 if (!inode_cgwb_enabled(inode)) {
562 spin_unlock(&inode->i_lock);
563 return;
564 }
565
566 wbc->wb = inode_to_wb(inode);
567 wbc->inode = inode;
568
569 wbc->wb_id = wbc->wb->memcg_css->id;
570 wbc->wb_lcand_id = inode->i_wb_frn_winner;
571 wbc->wb_tcand_id = 0;
572 wbc->wb_bytes = 0;
573 wbc->wb_lcand_bytes = 0;
574 wbc->wb_tcand_bytes = 0;
575
576 wb_get(wbc->wb);
577 spin_unlock(&inode->i_lock);
578
579
580
581
582
583
584
585
586 if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css)))
587 inode_switch_wbs(inode, wbc->wb_id);
588}
589EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode);
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
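/**
 * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
 * @wbc: writeback_control of the just finished writeback
 *
 * To be called after a writeback attempt of an inode finishes and undoes
 * wbc_attach_and_unlock_inode().  Can be called under any context.
 *
 * The bytes written on behalf of the wb's own memcg are compared against
 * the two foreign candidates recorded by wbc_account_cgroup_owner(), the
 * result is folded into the inode's foreign-writeback history and, if a
 * foreign cgroup has been the majority dirtier for long enough, an
 * asynchronous switch of the inode to that cgroup's wb is started via
 * inode_switch_wbs().
 */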
628void wbc_detach_inode(struct writeback_control *wbc)
629{
630 struct bdi_writeback *wb = wbc->wb;
631 struct inode *inode = wbc->inode;
632 unsigned long avg_time, max_bytes, max_time;
633 u16 history;
634 int max_id;
635
636 if (!wb)
637 return;
638
639 history = inode->i_wb_frn_history;
640 avg_time = inode->i_wb_frn_avg_time;
641
642
643 if (wbc->wb_bytes >= wbc->wb_lcand_bytes &&
644 wbc->wb_bytes >= wbc->wb_tcand_bytes) {
645 max_id = wbc->wb_id;
646 max_bytes = wbc->wb_bytes;
647 } else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) {
648 max_id = wbc->wb_lcand_id;
649 max_bytes = wbc->wb_lcand_bytes;
650 } else {
651 max_id = wbc->wb_tcand_id;
652 max_bytes = wbc->wb_tcand_bytes;
653 }
654
655
656
657
658
659
660
661
662 max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT,
663 wb->avg_write_bandwidth);
664 if (avg_time)
665 avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) -
666 (avg_time >> WB_FRN_TIME_AVG_SHIFT);
667 else
668 avg_time = max_time;
669
670 if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) {
671 int slots;
672
673
674
675
676
677
678
679
680
681 slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT),
682 (unsigned long)WB_FRN_HIST_MAX_SLOTS);
683 history <<= slots;
684 if (wbc->wb_id != max_id)
685 history |= (1U << slots) - 1;
686
687 if (history)
688 trace_inode_foreign_history(inode, wbc, history);
689
690
691
692
693
694
695
696
697 if (hweight32(history) > WB_FRN_HIST_THR_SLOTS)
698 inode_switch_wbs(inode, max_id);
699 }
700
701
702
703
704
705 inode->i_wb_frn_winner = max_id;
706 inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX);
707 inode->i_wb_frn_history = history;
708
709 wb_put(wbc->wb);
710 wbc->wb = NULL;
711}
712EXPORT_SYMBOL_GPL(wbc_detach_inode);
713
714
715
716
717
718
719
720
721
722
723
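/**
 * wbc_account_cgroup_owner - account writeback to update inode cgroup ownership
 * @wbc: writeback_control of the writeback in progress
 * @page: page being written out
 * @bytes: number of bytes being written out
 *
 * @bytes from @page are about to be written out during the writeback
 * controlled by @wbc.  Keep the book for foreign inode detection.  See
 * wbc_detach_inode().
 */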
724void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
725 size_t bytes)
726{
727 struct cgroup_subsys_state *css;
728 int id;
729
730
731
732
733
734
735
736 if (!wbc->wb || wbc->no_cgroup_owner)
737 return;
738
739 css = mem_cgroup_css_from_page(page);
740
741 if (!(css->flags & CSS_ONLINE))
742 return;
743
744 id = css->id;
745
746 if (id == wbc->wb_id) {
747 wbc->wb_bytes += bytes;
748 return;
749 }
750
751 if (id == wbc->wb_lcand_id)
752 wbc->wb_lcand_bytes += bytes;
753
754
755 if (!wbc->wb_tcand_bytes)
756 wbc->wb_tcand_id = id;
757 if (id == wbc->wb_tcand_id)
758 wbc->wb_tcand_bytes += bytes;
759 else
760 wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
761}
762EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner);
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
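/**
 * inode_congested - test whether an inode is congested
 * @inode: inode to test for congestion (may be %NULL)
 * @cong_bits: mask of WB_[a]sync_congested bits to test
 *
 * Tests whether @inode is congested.  @cong_bits is the mask of congestion
 * bits to test and the return value is the mask of set bits.
 *
 * If cgroup writeback is enabled for @inode, the congestion state is
 * determined by whether the memcg wb @inode is associated with is
 * congested; otherwise, the root wb's congestion state is used.
 */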
780int inode_congested(struct inode *inode, int cong_bits)
781{
782
783
784
785
786 if (inode && inode_to_wb_is_valid(inode)) {
787 struct bdi_writeback *wb;
788 struct wb_lock_cookie lock_cookie = {};
789 bool congested;
790
791 wb = unlocked_inode_to_wb_begin(inode, &lock_cookie);
792 congested = wb_congested(wb, cong_bits);
793 unlocked_inode_to_wb_end(inode, &lock_cookie);
794 return congested;
795 }
796
797 return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
798}
799EXPORT_SYMBOL_GPL(inode_congested);
800
801
802
803
804
805
806
807
808
809
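/**
 * wb_split_bdi_pages - split nr_pages to write according to bandwidth
 * @wb: target bdi_writeback to split @nr_pages to
 * @nr_pages: number of pages to write for the whole bdi
 *
 * Split @wb's portion of @nr_pages according to @wb's write bandwidth in
 * relation to the total write bandwidth of all wb's w/ dirty inodes on
 * @wb->bdi.
 */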
810static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
811{
812 unsigned long this_bw = wb->avg_write_bandwidth;
813 unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
814
815 if (nr_pages == LONG_MAX)
816 return LONG_MAX;
817
818
819
820
821
822
823 if (!tot_bw || this_bw >= tot_bw)
824 return nr_pages;
825 else
826 return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw);
827}
828
829
830
831
832
833
834
835
836
837
838
839
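/**
 * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
 * @bdi: target backing_dev_info
 * @base_work: wb_writeback_work to issue
 * @skip_if_busy: skip wb's which already have writeback in progress
 *
 * Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which
 * have dirty inodes.  If @base_work->nr_pages isn't %LONG_MAX, it's
 * distributed to the wb's according to each wb's proportion in the total
 * active write bandwidth of @bdi.
 */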
840static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
841 struct wb_writeback_work *base_work,
842 bool skip_if_busy)
843{
844 struct bdi_writeback *last_wb = NULL;
845 struct bdi_writeback *wb = list_entry(&bdi->wb_list,
846 struct bdi_writeback, bdi_node);
847
848 might_sleep();
849restart:
850 rcu_read_lock();
851 list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
852 DEFINE_WB_COMPLETION(fallback_work_done, bdi);
853 struct wb_writeback_work fallback_work;
854 struct wb_writeback_work *work;
855 long nr_pages;
856
857 if (last_wb) {
858 wb_put(last_wb);
859 last_wb = NULL;
860 }
861
862
863 if (!wb_has_dirty_io(wb) &&
864 (base_work->sync_mode == WB_SYNC_NONE ||
865 list_empty(&wb->b_dirty_time)))
866 continue;
867 if (skip_if_busy && writeback_in_progress(wb))
868 continue;
869
870 nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);
871
872 work = kmalloc(sizeof(*work), GFP_ATOMIC);
873 if (work) {
874 *work = *base_work;
875 work->nr_pages = nr_pages;
876 work->auto_free = 1;
877 wb_queue_work(wb, work);
878 continue;
879 }
880
881
882 work = &fallback_work;
883 *work = *base_work;
884 work->nr_pages = nr_pages;
885 work->auto_free = 0;
886 work->done = &fallback_work_done;
887
888 wb_queue_work(wb, work);
889
890
891
892
893
894
895 wb_get(wb);
896 last_wb = wb;
897
898 rcu_read_unlock();
899 wb_wait_for_completion(&fallback_work_done);
900 goto restart;
901 }
902 rcu_read_unlock();
903
904 if (last_wb)
905 wb_put(last_wb);
906}
907
908
909
910
911
912
913
914
915
916
917
918
919int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr,
920 enum wb_reason reason, struct wb_completion *done)
921{
922 struct backing_dev_info *bdi;
923 struct cgroup_subsys_state *memcg_css;
924 struct bdi_writeback *wb;
925 struct wb_writeback_work *work;
926 int ret;
927
928
929 bdi = bdi_get_by_id(bdi_id);
930 if (!bdi)
931 return -ENOENT;
932
933 rcu_read_lock();
934 memcg_css = css_from_id(memcg_id, &memory_cgrp_subsys);
935 if (memcg_css && !css_tryget(memcg_css))
936 memcg_css = NULL;
937 rcu_read_unlock();
938 if (!memcg_css) {
939 ret = -ENOENT;
940 goto out_bdi_put;
941 }
942
943
944
945
946
947 wb = wb_get_lookup(bdi, memcg_css);
948 if (!wb) {
949 ret = -ENOENT;
950 goto out_css_put;
951 }
952
953
954
955
956
957
958
959
960 if (!nr) {
961 unsigned long filepages, headroom, dirty, writeback;
962
963 mem_cgroup_wb_stats(wb, &filepages, &headroom, &dirty,
964 &writeback);
965 nr = dirty * 10 / 8;
966 }
967
968
969 work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN);
970 if (work) {
971 work->nr_pages = nr;
972 work->sync_mode = WB_SYNC_NONE;
973 work->range_cyclic = 1;
974 work->reason = reason;
975 work->done = done;
976 work->auto_free = 1;
977 wb_queue_work(wb, work);
978 ret = 0;
979 } else {
980 ret = -ENOMEM;
981 }
982
983 wb_put(wb);
984out_css_put:
985 css_put(memcg_css);
986out_bdi_put:
987 bdi_put(bdi);
988 return ret;
989}
990
991
992
993
994
995
996
997
998
999
1000
1001void cgroup_writeback_umount(void)
1002{
1003 if (atomic_read(&isw_nr_in_flight)) {
1004
1005
1006
1007
1008 rcu_barrier();
1009 flush_workqueue(isw_wq);
1010 }
1011}
1012
1013static int __init cgroup_writeback_init(void)
1014{
1015 isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
1016 if (!isw_wq)
1017 return -ENOMEM;
1018 return 0;
1019}
1020fs_initcall(cgroup_writeback_init);
1021
1022#else
1023
1024static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
1025static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
1026
1027static struct bdi_writeback *
1028locked_inode_to_wb_and_lock_list(struct inode *inode)
1029 __releases(&inode->i_lock)
1030 __acquires(&wb->list_lock)
1031{
1032 struct bdi_writeback *wb = inode_to_wb(inode);
1033
1034 spin_unlock(&inode->i_lock);
1035 spin_lock(&wb->list_lock);
1036 return wb;
1037}
1038
1039static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
1040 __acquires(&wb->list_lock)
1041{
1042 struct bdi_writeback *wb = inode_to_wb(inode);
1043
1044 spin_lock(&wb->list_lock);
1045 return wb;
1046}
1047
1048static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
1049{
1050 return nr_pages;
1051}
1052
1053static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
1054 struct wb_writeback_work *base_work,
1055 bool skip_if_busy)
1056{
1057 might_sleep();
1058
1059 if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) {
1060 base_work->auto_free = 0;
1061 wb_queue_work(&bdi->wb, base_work);
1062 }
1063}
1064
1065#endif
1066
1067
1068
1069
1070
1071static unsigned long get_nr_dirty_pages(void)
1072{
1073 return global_node_page_state(NR_FILE_DIRTY) +
1074 get_nr_dirty_inodes();
1075}
1076
1077static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason)
1078{
1079 if (!wb_has_dirty_io(wb))
1080 return;
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090 if (test_bit(WB_start_all, &wb->state) ||
1091 test_and_set_bit(WB_start_all, &wb->state))
1092 return;
1093
1094 wb->start_all_reason = reason;
1095 wb_wakeup(wb);
1096}
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108void wb_start_background_writeback(struct bdi_writeback *wb)
1109{
1110
1111
1112
1113
1114 trace_writeback_wake_background(wb);
1115 wb_wakeup(wb);
1116}
1117
1118
1119
1120
1121void inode_io_list_del(struct inode *inode)
1122{
1123 struct bdi_writeback *wb;
1124
1125 wb = inode_to_wb_and_lock_list(inode);
1126 spin_lock(&inode->i_lock);
1127 inode_io_list_del_locked(inode, wb);
1128 spin_unlock(&inode->i_lock);
1129 spin_unlock(&wb->list_lock);
1130}
1131EXPORT_SYMBOL(inode_io_list_del);
1132
1133
1134
1135
1136void sb_mark_inode_writeback(struct inode *inode)
1137{
1138 struct super_block *sb = inode->i_sb;
1139 unsigned long flags;
1140
1141 if (list_empty(&inode->i_wb_list)) {
1142 spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
1143 if (list_empty(&inode->i_wb_list)) {
1144 list_add_tail(&inode->i_wb_list, &sb->s_inodes_wb);
1145 trace_sb_mark_inode_writeback(inode);
1146 }
1147 spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
1148 }
1149}
1150
1151
1152
1153
1154void sb_clear_inode_writeback(struct inode *inode)
1155{
1156 struct super_block *sb = inode->i_sb;
1157 unsigned long flags;
1158
1159 if (!list_empty(&inode->i_wb_list)) {
1160 spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
1161 if (!list_empty(&inode->i_wb_list)) {
1162 list_del_init(&inode->i_wb_list);
1163 trace_sb_clear_inode_writeback(inode);
1164 }
1165 spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
1166 }
1167}
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
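/*
 * Redirty an inode: set its when-it-was-dirtied timestamp and move it to the
 * furthest end of its wb's dirty list.
 *
 * Before stamping the inode's ->dirtied_when, we check to see whether it is
 * already the most-recently-dirtied inode on the b_dirty list.  If that is
 * the case then the inode must have been redirtied while it was being written
 * out and we don't reset its dirtied_when.
 */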
1178static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb)
1179{
1180 assert_spin_locked(&inode->i_lock);
1181
1182 if (!list_empty(&wb->b_dirty)) {
1183 struct inode *tail;
1184
1185 tail = wb_inode(wb->b_dirty.next);
1186 if (time_before(inode->dirtied_when, tail->dirtied_when))
1187 inode->dirtied_when = jiffies;
1188 }
1189 inode_io_list_move_locked(inode, wb, &wb->b_dirty);
1190 inode->i_state &= ~I_SYNC_QUEUED;
1191}
1192
1193static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
1194{
1195 spin_lock(&inode->i_lock);
1196 redirty_tail_locked(inode, wb);
1197 spin_unlock(&inode->i_lock);
1198}
1199
1200
1201
1202
1203static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
1204{
1205 inode_io_list_move_locked(inode, wb, &wb->b_more_io);
1206}
1207
1208static void inode_sync_complete(struct inode *inode)
1209{
1210 inode->i_state &= ~I_SYNC;
1211
1212 inode_add_lru(inode);
1213
1214 smp_mb();
1215 wake_up_bit(&inode->i_state, __I_SYNC);
1216}
1217
1218static bool inode_dirtied_after(struct inode *inode, unsigned long t)
1219{
1220 bool ret = time_after(inode->dirtied_when, t);
1221#ifndef CONFIG_64BIT
1222
1223
1224
1225
1226
1227
1228 ret = ret && time_before_eq(inode->dirtied_when, jiffies);
1229#endif
1230 return ret;
1231}
1232
1233#define EXPIRE_DIRTY_ATIME 0x0001
1234
1235
1236
1237
1238
1239static int move_expired_inodes(struct list_head *delaying_queue,
1240 struct list_head *dispatch_queue,
1241 unsigned long dirtied_before)
1242{
1243 LIST_HEAD(tmp);
1244 struct list_head *pos, *node;
1245 struct super_block *sb = NULL;
1246 struct inode *inode;
1247 int do_sb_sort = 0;
1248 int moved = 0;
1249
1250 while (!list_empty(delaying_queue)) {
1251 inode = wb_inode(delaying_queue->prev);
1252 if (inode_dirtied_after(inode, dirtied_before))
1253 break;
1254 list_move(&inode->i_io_list, &tmp);
1255 moved++;
1256 spin_lock(&inode->i_lock);
1257 inode->i_state |= I_SYNC_QUEUED;
1258 spin_unlock(&inode->i_lock);
1259 if (sb_is_blkdev_sb(inode->i_sb))
1260 continue;
1261 if (sb && sb != inode->i_sb)
1262 do_sb_sort = 1;
1263 sb = inode->i_sb;
1264 }
1265
1266
1267 if (!do_sb_sort) {
1268 list_splice(&tmp, dispatch_queue);
1269 goto out;
1270 }
1271
1272
1273 while (!list_empty(&tmp)) {
1274 sb = wb_inode(tmp.prev)->i_sb;
1275 list_for_each_prev_safe(pos, node, &tmp) {
1276 inode = wb_inode(pos);
1277 if (inode->i_sb == sb)
1278 list_move(&inode->i_io_list, dispatch_queue);
1279 }
1280 }
1281out:
1282 return moved;
1283}
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
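/*
 * Queue all expired dirty inodes for io, eldest first: splice b_more_io onto
 * b_io, then move inodes which were dirtied before @dirtied_before from
 * b_dirty, and expired lazytime inodes from b_dirty_time, onto b_io as well.
 */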
1296static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work,
1297 unsigned long dirtied_before)
1298{
1299 int moved;
1300 unsigned long time_expire_jif = dirtied_before;
1301
1302 assert_spin_locked(&wb->list_lock);
1303 list_splice_init(&wb->b_more_io, &wb->b_io);
1304 moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, dirtied_before);
1305 if (!work->for_sync)
1306 time_expire_jif = jiffies - dirtytime_expire_interval * HZ;
1307 moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
1308 time_expire_jif);
1309 if (moved)
1310 wb_io_lists_populated(wb);
1311 trace_writeback_queue_io(wb, work, dirtied_before, moved);
1312}
1313
1314static int write_inode(struct inode *inode, struct writeback_control *wbc)
1315{
1316 int ret;
1317
1318 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) {
1319 trace_writeback_write_inode_start(inode, wbc);
1320 ret = inode->i_sb->s_op->write_inode(inode, wbc);
1321 trace_writeback_write_inode(inode, wbc);
1322 return ret;
1323 }
1324 return 0;
1325}
1326
1327
1328
1329
1330
1331static void __inode_wait_for_writeback(struct inode *inode)
1332 __releases(inode->i_lock)
1333 __acquires(inode->i_lock)
1334{
1335 DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
1336 wait_queue_head_t *wqh;
1337
1338 wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
1339 while (inode->i_state & I_SYNC) {
1340 spin_unlock(&inode->i_lock);
1341 __wait_on_bit(wqh, &wq, bit_wait,
1342 TASK_UNINTERRUPTIBLE);
1343 spin_lock(&inode->i_lock);
1344 }
1345}
1346
1347
1348
1349
1350void inode_wait_for_writeback(struct inode *inode)
1351{
1352 spin_lock(&inode->i_lock);
1353 __inode_wait_for_writeback(inode);
1354 spin_unlock(&inode->i_lock);
1355}
1356
1357
1358
1359
1360
1361
1362static void inode_sleep_on_writeback(struct inode *inode)
1363 __releases(inode->i_lock)
1364{
1365 DEFINE_WAIT(wait);
1366 wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
1367 int sleep;
1368
1369 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
1370 sleep = inode->i_state & I_SYNC;
1371 spin_unlock(&inode->i_lock);
1372 if (sleep)
1373 schedule();
1374 finish_wait(wqh, &wait);
1375}
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
1386 struct writeback_control *wbc)
1387{
1388 if (inode->i_state & I_FREEING)
1389 return;
1390
1391
1392
1393
1394
1395
1396 if ((inode->i_state & I_DIRTY) &&
1397 (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
1398 inode->dirtied_when = jiffies;
1399
1400 if (wbc->pages_skipped) {
1401
1402
1403
1404
1405 redirty_tail_locked(inode, wb);
1406 return;
1407 }
1408
1409 if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
1410
1411
1412
1413
1414 if (wbc->nr_to_write <= 0) {
1415
1416 requeue_io(inode, wb);
1417 } else {
1418
1419
1420
1421
1422
1423
1424
1425 redirty_tail_locked(inode, wb);
1426 }
1427 } else if (inode->i_state & I_DIRTY) {
1428
1429
1430
1431
1432
1433 redirty_tail_locked(inode, wb);
1434 } else if (inode->i_state & I_DIRTY_TIME) {
1435 inode->dirtied_when = jiffies;
1436 inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
1437 inode->i_state &= ~I_SYNC_QUEUED;
1438 } else {
1439
1440 inode_io_list_del_locked(inode, wb);
1441 }
1442}
1443
1444
1445
1446
1447
1448
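/*
 * Write out an inode and its dirty pages.  Do not update the writeback list
 * linkage.  That is left to the caller.  The caller is also responsible for
 * setting the I_SYNC flag and calling inode_sync_complete() to clear it.
 */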
1449static int
1450__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
1451{
1452 struct address_space *mapping = inode->i_mapping;
1453 long nr_to_write = wbc->nr_to_write;
1454 unsigned dirty;
1455 int ret;
1456
1457 WARN_ON(!(inode->i_state & I_SYNC));
1458
1459 trace_writeback_single_inode_start(inode, wbc, nr_to_write);
1460
1461 ret = do_writepages(mapping, wbc);
1462
1463
1464
1465
1466
1467
1468
1469
1470 if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
1471 int err = filemap_fdatawait(mapping);
1472 if (ret == 0)
1473 ret = err;
1474 }
1475
1476
1477
1478
1479
1480
1481 spin_lock(&inode->i_lock);
1482
1483 dirty = inode->i_state & I_DIRTY;
1484 if ((inode->i_state & I_DIRTY_TIME) &&
1485 ((dirty & I_DIRTY_INODE) ||
1486 wbc->sync_mode == WB_SYNC_ALL || wbc->for_sync ||
1487 time_after(jiffies, inode->dirtied_time_when +
1488 dirtytime_expire_interval * HZ))) {
1489 dirty |= I_DIRTY_TIME;
1490 trace_writeback_lazytime(inode);
1491 }
1492 inode->i_state &= ~dirty;
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505 smp_mb();
1506
1507 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
1508 inode->i_state |= I_DIRTY_PAGES;
1509
1510 spin_unlock(&inode->i_lock);
1511
1512 if (dirty & I_DIRTY_TIME)
1513 mark_inode_dirty_sync(inode);
1514
1515 if (dirty & ~I_DIRTY_PAGES) {
1516 int err = write_inode(inode, wbc);
1517 if (ret == 0)
1518 ret = err;
1519 }
1520 trace_writeback_single_inode(inode, wbc, nr_to_write);
1521 return ret;
1522}
1523
1524
1525
1526
1527
1528
1529
1530
1531
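/*
 * Write out an inode's dirty pages.  Either the caller has an active reference
 * on the inode or the inode has I_WILL_FREE set.
 *
 * This function is designed to be called for writing back one inode which
 * we go e.g. from filesystem.  The flusher thread uses
 * __writeback_single_inode() and does more profound writeback list handling
 * in writeback_sb_inodes().
 */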
1532static int writeback_single_inode(struct inode *inode,
1533 struct writeback_control *wbc)
1534{
1535 struct bdi_writeback *wb;
1536 int ret = 0;
1537
1538 spin_lock(&inode->i_lock);
1539 if (!atomic_read(&inode->i_count))
1540 WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
1541 else
1542 WARN_ON(inode->i_state & I_WILL_FREE);
1543
1544 if (inode->i_state & I_SYNC) {
1545 if (wbc->sync_mode != WB_SYNC_ALL)
1546 goto out;
1547
1548
1549
1550
1551
1552 __inode_wait_for_writeback(inode);
1553 }
1554 WARN_ON(inode->i_state & I_SYNC);
1555
1556
1557
1558
1559
1560
1561
1562
1563 if (!(inode->i_state & I_DIRTY_ALL) &&
1564 (wbc->sync_mode != WB_SYNC_ALL ||
1565 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
1566 goto out;
1567 inode->i_state |= I_SYNC;
1568 wbc_attach_and_unlock_inode(wbc, inode);
1569
1570 ret = __writeback_single_inode(inode, wbc);
1571
1572 wbc_detach_inode(wbc);
1573
1574 wb = inode_to_wb_and_lock_list(inode);
1575 spin_lock(&inode->i_lock);
1576
1577
1578
1579
1580 if (!(inode->i_state & I_DIRTY_ALL))
1581 inode_io_list_del_locked(inode, wb);
1582 spin_unlock(&wb->list_lock);
1583 inode_sync_complete(inode);
1584out:
1585 spin_unlock(&inode->i_lock);
1586 return ret;
1587}
1588
1589static long writeback_chunk_size(struct bdi_writeback *wb,
1590 struct wb_writeback_work *work)
1591{
1592 long pages;
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607 if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
1608 pages = LONG_MAX;
1609 else {
1610 pages = min(wb->avg_write_bandwidth / 2,
1611 global_wb_domain.dirty_limit / DIRTY_SCOPE);
1612 pages = min(pages, work->nr_pages);
1613 pages = round_down(pages + MIN_WRITEBACK_PAGES,
1614 MIN_WRITEBACK_PAGES);
1615 }
1616
1617 return pages;
1618}
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629static long writeback_sb_inodes(struct super_block *sb,
1630 struct bdi_writeback *wb,
1631 struct wb_writeback_work *work)
1632{
1633 struct writeback_control wbc = {
1634 .sync_mode = work->sync_mode,
1635 .tagged_writepages = work->tagged_writepages,
1636 .for_kupdate = work->for_kupdate,
1637 .for_background = work->for_background,
1638 .for_sync = work->for_sync,
1639 .range_cyclic = work->range_cyclic,
1640 .range_start = 0,
1641 .range_end = LLONG_MAX,
1642 };
1643 unsigned long start_time = jiffies;
1644 long write_chunk;
1645 long wrote = 0;
1646
1647 while (!list_empty(&wb->b_io)) {
1648 struct inode *inode = wb_inode(wb->b_io.prev);
1649 struct bdi_writeback *tmp_wb;
1650
1651 if (inode->i_sb != sb) {
1652 if (work->sb) {
1653
1654
1655
1656
1657
1658 redirty_tail(inode, wb);
1659 continue;
1660 }
1661
1662
1663
1664
1665
1666
1667 break;
1668 }
1669
1670
1671
1672
1673
1674
1675 spin_lock(&inode->i_lock);
1676 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
1677 redirty_tail_locked(inode, wb);
1678 spin_unlock(&inode->i_lock);
1679 continue;
1680 }
1681 if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691 spin_unlock(&inode->i_lock);
1692 requeue_io(inode, wb);
1693 trace_writeback_sb_inodes_requeue(inode);
1694 continue;
1695 }
1696 spin_unlock(&wb->list_lock);
1697
1698
1699
1700
1701
1702
1703 if (inode->i_state & I_SYNC) {
1704
1705 inode_sleep_on_writeback(inode);
1706
1707 spin_lock(&wb->list_lock);
1708 continue;
1709 }
1710 inode->i_state |= I_SYNC;
1711 wbc_attach_and_unlock_inode(&wbc, inode);
1712
1713 write_chunk = writeback_chunk_size(wb, work);
1714 wbc.nr_to_write = write_chunk;
1715 wbc.pages_skipped = 0;
1716
1717
1718
1719
1720
1721 __writeback_single_inode(inode, &wbc);
1722
1723 wbc_detach_inode(&wbc);
1724 work->nr_pages -= write_chunk - wbc.nr_to_write;
1725 wrote += write_chunk - wbc.nr_to_write;
1726
1727 if (need_resched()) {
1728
1729
1730
1731
1732
1733
1734
1735
1736 blk_flush_plug(current);
1737 cond_resched();
1738 }
1739
1740
1741
1742
1743
1744 tmp_wb = inode_to_wb_and_lock_list(inode);
1745 spin_lock(&inode->i_lock);
1746 if (!(inode->i_state & I_DIRTY_ALL))
1747 wrote++;
1748 requeue_inode(inode, tmp_wb, &wbc);
1749 inode_sync_complete(inode);
1750 spin_unlock(&inode->i_lock);
1751
1752 if (unlikely(tmp_wb != wb)) {
1753 spin_unlock(&tmp_wb->list_lock);
1754 spin_lock(&wb->list_lock);
1755 }
1756
1757
1758
1759
1760
1761 if (wrote) {
1762 if (time_is_before_jiffies(start_time + HZ / 10UL))
1763 break;
1764 if (work->nr_pages <= 0)
1765 break;
1766 }
1767 }
1768 return wrote;
1769}
1770
1771static long __writeback_inodes_wb(struct bdi_writeback *wb,
1772 struct wb_writeback_work *work)
1773{
1774 unsigned long start_time = jiffies;
1775 long wrote = 0;
1776
1777 while (!list_empty(&wb->b_io)) {
1778 struct inode *inode = wb_inode(wb->b_io.prev);
1779 struct super_block *sb = inode->i_sb;
1780
1781 if (!trylock_super(sb)) {
1782
1783
1784
1785
1786
1787 redirty_tail(inode, wb);
1788 continue;
1789 }
1790 wrote += writeback_sb_inodes(sb, wb, work);
1791 up_read(&sb->s_umount);
1792
1793
1794 if (wrote) {
1795 if (time_is_before_jiffies(start_time + HZ / 10UL))
1796 break;
1797 if (work->nr_pages <= 0)
1798 break;
1799 }
1800 }
1801
1802 return wrote;
1803}
1804
1805static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
1806 enum wb_reason reason)
1807{
1808 struct wb_writeback_work work = {
1809 .nr_pages = nr_pages,
1810 .sync_mode = WB_SYNC_NONE,
1811 .range_cyclic = 1,
1812 .reason = reason,
1813 };
1814 struct blk_plug plug;
1815
1816 blk_start_plug(&plug);
1817 spin_lock(&wb->list_lock);
1818 if (list_empty(&wb->b_io))
1819 queue_io(wb, &work, jiffies);
1820 __writeback_inodes_wb(wb, &work);
1821 spin_unlock(&wb->list_lock);
1822 blk_finish_plug(&plug);
1823
1824 return nr_pages - work.nr_pages;
1825}
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842static long wb_writeback(struct bdi_writeback *wb,
1843 struct wb_writeback_work *work)
1844{
1845 unsigned long wb_start = jiffies;
1846 long nr_pages = work->nr_pages;
1847 unsigned long dirtied_before = jiffies;
1848 struct inode *inode;
1849 long progress;
1850 struct blk_plug plug;
1851
1852 blk_start_plug(&plug);
1853 spin_lock(&wb->list_lock);
1854 for (;;) {
1855
1856
1857
1858 if (work->nr_pages <= 0)
1859 break;
1860
1861
1862
1863
1864
1865
1866
1867 if ((work->for_background || work->for_kupdate) &&
1868 !list_empty(&wb->work_list))
1869 break;
1870
1871
1872
1873
1874
1875 if (work->for_background && !wb_over_bg_thresh(wb))
1876 break;
1877
1878
1879
1880
1881
1882
1883
1884 if (work->for_kupdate) {
1885 dirtied_before = jiffies -
1886 msecs_to_jiffies(dirty_expire_interval * 10);
1887 } else if (work->for_background)
1888 dirtied_before = jiffies;
1889
1890 trace_writeback_start(wb, work);
1891 if (list_empty(&wb->b_io))
1892 queue_io(wb, work, dirtied_before);
1893 if (work->sb)
1894 progress = writeback_sb_inodes(work->sb, wb, work);
1895 else
1896 progress = __writeback_inodes_wb(wb, work);
1897 trace_writeback_written(wb, work);
1898
1899 wb_update_bandwidth(wb, wb_start);
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909 if (progress)
1910 continue;
1911
1912
1913
1914 if (list_empty(&wb->b_more_io))
1915 break;
1916
1917
1918
1919
1920
1921 trace_writeback_wait(wb, work);
1922 inode = wb_inode(wb->b_more_io.prev);
1923 spin_lock(&inode->i_lock);
1924 spin_unlock(&wb->list_lock);
1925
1926 inode_sleep_on_writeback(inode);
1927 spin_lock(&wb->list_lock);
1928 }
1929 spin_unlock(&wb->list_lock);
1930 blk_finish_plug(&plug);
1931
1932 return nr_pages - work->nr_pages;
1933}
1934
1935
1936
1937
1938static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
1939{
1940 struct wb_writeback_work *work = NULL;
1941
1942 spin_lock_bh(&wb->work_lock);
1943 if (!list_empty(&wb->work_list)) {
1944 work = list_entry(wb->work_list.next,
1945 struct wb_writeback_work, list);
1946 list_del_init(&work->list);
1947 }
1948 spin_unlock_bh(&wb->work_lock);
1949 return work;
1950}
1951
1952static long wb_check_background_flush(struct bdi_writeback *wb)
1953{
1954 if (wb_over_bg_thresh(wb)) {
1955
1956 struct wb_writeback_work work = {
1957 .nr_pages = LONG_MAX,
1958 .sync_mode = WB_SYNC_NONE,
1959 .for_background = 1,
1960 .range_cyclic = 1,
1961 .reason = WB_REASON_BACKGROUND,
1962 };
1963
1964 return wb_writeback(wb, &work);
1965 }
1966
1967 return 0;
1968}
1969
1970static long wb_check_old_data_flush(struct bdi_writeback *wb)
1971{
1972 unsigned long expired;
1973 long nr_pages;
1974
1975
1976
1977
1978 if (!dirty_writeback_interval)
1979 return 0;
1980
1981 expired = wb->last_old_flush +
1982 msecs_to_jiffies(dirty_writeback_interval * 10);
1983 if (time_before(jiffies, expired))
1984 return 0;
1985
1986 wb->last_old_flush = jiffies;
1987 nr_pages = get_nr_dirty_pages();
1988
1989 if (nr_pages) {
1990 struct wb_writeback_work work = {
1991 .nr_pages = nr_pages,
1992 .sync_mode = WB_SYNC_NONE,
1993 .for_kupdate = 1,
1994 .range_cyclic = 1,
1995 .reason = WB_REASON_PERIODIC,
1996 };
1997
1998 return wb_writeback(wb, &work);
1999 }
2000
2001 return 0;
2002}
2003
2004static long wb_check_start_all(struct bdi_writeback *wb)
2005{
2006 long nr_pages;
2007
2008 if (!test_bit(WB_start_all, &wb->state))
2009 return 0;
2010
2011 nr_pages = get_nr_dirty_pages();
2012 if (nr_pages) {
2013 struct wb_writeback_work work = {
2014 .nr_pages = wb_split_bdi_pages(wb, nr_pages),
2015 .sync_mode = WB_SYNC_NONE,
2016 .range_cyclic = 1,
2017 .reason = wb->start_all_reason,
2018 };
2019
2020 nr_pages = wb_writeback(wb, &work);
2021 }
2022
2023 clear_bit(WB_start_all, &wb->state);
2024 return nr_pages;
2025}
2026
2027
2028
2029
2030
2031static long wb_do_writeback(struct bdi_writeback *wb)
2032{
2033 struct wb_writeback_work *work;
2034 long wrote = 0;
2035
2036 set_bit(WB_writeback_running, &wb->state);
2037 while ((work = get_next_work_item(wb)) != NULL) {
2038 trace_writeback_exec(wb, work);
2039 wrote += wb_writeback(wb, work);
2040 finish_writeback_work(wb, work);
2041 }
2042
2043
2044
2045
2046 wrote += wb_check_start_all(wb);
2047
2048
2049
2050
2051 wrote += wb_check_old_data_flush(wb);
2052 wrote += wb_check_background_flush(wb);
2053 clear_bit(WB_writeback_running, &wb->state);
2054
2055 return wrote;
2056}
2057
2058
2059
2060
2061
2062void wb_workfn(struct work_struct *work)
2063{
2064 struct bdi_writeback *wb = container_of(to_delayed_work(work),
2065 struct bdi_writeback, dwork);
2066 long pages_written;
2067
2068 set_worker_desc("flush-%s", bdi_dev_name(wb->bdi));
2069 current->flags |= PF_SWAPWRITE;
2070
2071 if (likely(!current_is_workqueue_rescuer() ||
2072 !test_bit(WB_registered, &wb->state))) {
2073
2074
2075
2076
2077
2078
2079 do {
2080 pages_written = wb_do_writeback(wb);
2081 trace_writeback_pages_written(pages_written);
2082 } while (!list_empty(&wb->work_list));
2083 } else {
2084
2085
2086
2087
2088
2089 pages_written = writeback_inodes_wb(wb, 1024,
2090 WB_REASON_FORKER_THREAD);
2091 trace_writeback_pages_written(pages_written);
2092 }
2093
2094 if (!list_empty(&wb->work_list))
2095 wb_wakeup(wb);
2096 else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
2097 wb_wakeup_delayed(wb);
2098
2099 current->flags &= ~PF_SWAPWRITE;
2100}
2101
2102
2103
2104
2105
2106static void __wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
2107 enum wb_reason reason)
2108{
2109 struct bdi_writeback *wb;
2110
2111 if (!bdi_has_dirty_io(bdi))
2112 return;
2113
2114 list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
2115 wb_start_writeback(wb, reason);
2116}
2117
2118void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
2119 enum wb_reason reason)
2120{
2121 rcu_read_lock();
2122 __wakeup_flusher_threads_bdi(bdi, reason);
2123 rcu_read_unlock();
2124}
2125
2126
2127
2128
2129void wakeup_flusher_threads(enum wb_reason reason)
2130{
2131 struct backing_dev_info *bdi;
2132
2133
2134
2135
2136 if (blk_needs_flush_plug(current))
2137 blk_schedule_flush_plug(current);
2138
2139 rcu_read_lock();
2140 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
2141 __wakeup_flusher_threads_bdi(bdi, reason);
2142 rcu_read_unlock();
2143}
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160static void wakeup_dirtytime_writeback(struct work_struct *w);
2161static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback);
2162
2163static void wakeup_dirtytime_writeback(struct work_struct *w)
2164{
2165 struct backing_dev_info *bdi;
2166
2167 rcu_read_lock();
2168 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
2169 struct bdi_writeback *wb;
2170
2171 list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
2172 if (!list_empty(&wb->b_dirty_time))
2173 wb_wakeup(wb);
2174 }
2175 rcu_read_unlock();
2176 schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
2177}
2178
2179static int __init start_dirtytime_writeback(void)
2180{
2181 schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
2182 return 0;
2183}
2184__initcall(start_dirtytime_writeback);
2185
2186int dirtytime_interval_handler(struct ctl_table *table, int write,
2187 void *buffer, size_t *lenp, loff_t *ppos)
2188{
2189 int ret;
2190
2191 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
2192 if (ret == 0 && write)
2193 mod_delayed_work(system_wq, &dirtytime_work, 0);
2194 return ret;
2195}
2196
2197static noinline void block_dump___mark_inode_dirty(struct inode *inode)
2198{
2199 if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
2200 struct dentry *dentry;
2201 const char *name = "?";
2202
2203 dentry = d_find_alias(inode);
2204 if (dentry) {
2205 spin_lock(&dentry->d_lock);
2206 name = (const char *) dentry->d_name.name;
2207 }
2208 printk(KERN_DEBUG
2209 "%s(%d): dirtied inode %lu (%s) on %s\n",
2210 current->comm, task_pid_nr(current), inode->i_ino,
2211 name, inode->i_sb->s_id);
2212 if (dentry) {
2213 spin_unlock(&dentry->d_lock);
2214 dput(dentry);
2215 }
2216 }
2217}
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
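/**
 * __mark_inode_dirty - internal function to mark an inode dirty
 * @inode: inode to mark
 * @flags: what kind of dirty, e.g. I_DIRTY_SYNC, I_DIRTY_DATASYNC,
 *	   I_DIRTY_PAGES or I_DIRTY_TIME
 *
 * Mark an inode as dirty.  Callers should use mark_inode_dirty() or
 * mark_inode_dirty_sync() rather than calling this directly.
 *
 * For I_DIRTY_INODE and I_DIRTY_TIME the filesystem is notified through
 * ->dirty_inode().  If the inode wasn't dirty before, it is attached to a
 * wb and moved onto the wb's b_dirty (or, for timestamp-only dirtying,
 * b_dirty_time) list, and the flusher is kicked if this is the first dirty
 * inode of an otherwise idle wb.
 */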
2245void __mark_inode_dirty(struct inode *inode, int flags)
2246{
2247 struct super_block *sb = inode->i_sb;
2248 int dirtytime;
2249
2250 trace_writeback_mark_inode_dirty(inode, flags);
2251
2252
2253
2254
2255
2256 if (flags & (I_DIRTY_INODE | I_DIRTY_TIME)) {
2257 trace_writeback_dirty_inode_start(inode, flags);
2258
2259 if (sb->s_op->dirty_inode)
2260 sb->s_op->dirty_inode(inode, flags);
2261
2262 trace_writeback_dirty_inode(inode, flags);
2263 }
2264 if (flags & I_DIRTY_INODE)
2265 flags &= ~I_DIRTY_TIME;
2266 dirtytime = flags & I_DIRTY_TIME;
2267
2268
2269
2270
2271
2272 smp_mb();
2273
2274 if (((inode->i_state & flags) == flags) ||
2275 (dirtytime && (inode->i_state & I_DIRTY_INODE)))
2276 return;
2277
2278 if (unlikely(block_dump))
2279 block_dump___mark_inode_dirty(inode);
2280
2281 spin_lock(&inode->i_lock);
2282 if (dirtytime && (inode->i_state & I_DIRTY_INODE))
2283 goto out_unlock_inode;
2284 if ((inode->i_state & flags) != flags) {
2285 const int was_dirty = inode->i_state & I_DIRTY;
2286
2287 inode_attach_wb(inode, NULL);
2288
2289 if (flags & I_DIRTY_INODE)
2290 inode->i_state &= ~I_DIRTY_TIME;
2291 inode->i_state |= flags;
2292
2293
2294
2295
2296
2297
2298
2299 if (inode->i_state & I_SYNC_QUEUED)
2300 goto out_unlock_inode;
2301
2302
2303
2304
2305
2306 if (!S_ISBLK(inode->i_mode)) {
2307 if (inode_unhashed(inode))
2308 goto out_unlock_inode;
2309 }
2310 if (inode->i_state & I_FREEING)
2311 goto out_unlock_inode;
2312
2313
2314
2315
2316
2317 if (!was_dirty) {
2318 struct bdi_writeback *wb;
2319 struct list_head *dirty_list;
2320 bool wakeup_bdi = false;
2321
2322 wb = locked_inode_to_wb_and_lock_list(inode);
2323
2324 WARN((wb->bdi->capabilities & BDI_CAP_WRITEBACK) &&
2325 !test_bit(WB_registered, &wb->state),
2326 "bdi-%s not registered\n", bdi_dev_name(wb->bdi));
2327
2328 inode->dirtied_when = jiffies;
2329 if (dirtytime)
2330 inode->dirtied_time_when = jiffies;
2331
2332 if (inode->i_state & I_DIRTY)
2333 dirty_list = &wb->b_dirty;
2334 else
2335 dirty_list = &wb->b_dirty_time;
2336
2337 wakeup_bdi = inode_io_list_move_locked(inode, wb,
2338 dirty_list);
2339
2340 spin_unlock(&wb->list_lock);
2341 trace_writeback_dirty_inode_enqueue(inode);
2342
2343
2344
2345
2346
2347
2348
2349 if (wakeup_bdi &&
2350 (wb->bdi->capabilities & BDI_CAP_WRITEBACK))
2351 wb_wakeup_delayed(wb);
2352 return;
2353 }
2354 }
2355out_unlock_inode:
2356 spin_unlock(&inode->i_lock);
2357}
2358EXPORT_SYMBOL(__mark_inode_dirty);
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369static void wait_sb_inodes(struct super_block *sb)
2370{
2371 LIST_HEAD(sync_list);
2372
2373
2374
2375
2376
2377 WARN_ON(!rwsem_is_locked(&sb->s_umount));
2378
2379 mutex_lock(&sb->s_sync_lock);
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390 rcu_read_lock();
2391 spin_lock_irq(&sb->s_inode_wblist_lock);
2392 list_splice_init(&sb->s_inodes_wb, &sync_list);
2393
2394
2395
2396
2397
2398
2399
2400
2401 while (!list_empty(&sync_list)) {
2402 struct inode *inode = list_first_entry(&sync_list, struct inode,
2403 i_wb_list);
2404 struct address_space *mapping = inode->i_mapping;
2405
2406
2407
2408
2409
2410
2411
2412 list_move_tail(&inode->i_wb_list, &sb->s_inodes_wb);
2413
2414
2415
2416
2417
2418
2419 if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
2420 continue;
2421
2422 spin_unlock_irq(&sb->s_inode_wblist_lock);
2423
2424 spin_lock(&inode->i_lock);
2425 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
2426 spin_unlock(&inode->i_lock);
2427
2428 spin_lock_irq(&sb->s_inode_wblist_lock);
2429 continue;
2430 }
2431 __iget(inode);
2432 spin_unlock(&inode->i_lock);
2433 rcu_read_unlock();
2434
2435
2436
2437
2438
2439
2440 filemap_fdatawait_keep_errors(mapping);
2441
2442 cond_resched();
2443
2444 iput(inode);
2445
2446 rcu_read_lock();
2447 spin_lock_irq(&sb->s_inode_wblist_lock);
2448 }
2449 spin_unlock_irq(&sb->s_inode_wblist_lock);
2450 rcu_read_unlock();
2451 mutex_unlock(&sb->s_sync_lock);
2452}
2453
2454static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
2455 enum wb_reason reason, bool skip_if_busy)
2456{
2457 struct backing_dev_info *bdi = sb->s_bdi;
2458 DEFINE_WB_COMPLETION(done, bdi);
2459 struct wb_writeback_work work = {
2460 .sb = sb,
2461 .sync_mode = WB_SYNC_NONE,
2462 .tagged_writepages = 1,
2463 .done = &done,
2464 .nr_pages = nr,
2465 .reason = reason,
2466 };
2467
2468 if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
2469 return;
2470 WARN_ON(!rwsem_is_locked(&sb->s_umount));
2471
2472 bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy);
2473 wb_wait_for_completion(&done);
2474}
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
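/**
 * writeback_inodes_sb_nr -	writeback dirty inodes from given super_block
 * @sb: the superblock
 * @nr: the number of pages to write
 * @reason: reason why some writeback work initiated
 *
 * Start writeback on some inodes on this super_block.  No guarantees are
 * made on how many (if any) will be written, and this function does not
 * wait for IO completion of submitted IO.
 */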
2486void writeback_inodes_sb_nr(struct super_block *sb,
2487 unsigned long nr,
2488 enum wb_reason reason)
2489{
2490 __writeback_inodes_sb_nr(sb, nr, reason, false);
2491}
2492EXPORT_SYMBOL(writeback_inodes_sb_nr);
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
2504{
2505 return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
2506}
2507EXPORT_SYMBOL(writeback_inodes_sb);
2508
2509
2510
2511
2512
2513
2514
2515
2516void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
2517{
2518 if (!down_read_trylock(&sb->s_umount))
2519 return;
2520
2521 __writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason, true);
2522 up_read(&sb->s_umount);
2523}
2524EXPORT_SYMBOL(try_to_writeback_inodes_sb);
2525
2526
2527
2528
2529
2530
2531
2532
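/**
 * sync_inodes_sb	-	sync sb inode pages
 * @sb: the superblock
 *
 * This function writes and waits on any dirty inode belonging to this
 * super_block.
 */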
2533void sync_inodes_sb(struct super_block *sb)
2534{
2535 struct backing_dev_info *bdi = sb->s_bdi;
2536 DEFINE_WB_COMPLETION(done, bdi);
2537 struct wb_writeback_work work = {
2538 .sb = sb,
2539 .sync_mode = WB_SYNC_ALL,
2540 .nr_pages = LONG_MAX,
2541 .range_cyclic = 0,
2542 .done = &done,
2543 .reason = WB_REASON_SYNC,
2544 .for_sync = 1,
2545 };
2546
2547
2548
2549
2550
2551
2552 if (bdi == &noop_backing_dev_info)
2553 return;
2554 WARN_ON(!rwsem_is_locked(&sb->s_umount));
2555
2556
2557 bdi_down_write_wb_switch_rwsem(bdi);
2558 bdi_split_work_to_wbs(bdi, &work, false);
2559 wb_wait_for_completion(&done);
2560 bdi_up_write_wb_switch_rwsem(bdi);
2561
2562 wait_sb_inodes(sb);
2563}
2564EXPORT_SYMBOL(sync_inodes_sb);
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
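/**
 * write_inode_now	-	write an inode to disk
 * @inode: inode to write to disk
 * @sync: whether the write should be synchronous or not
 *
 * This function commits an inode to disk immediately if it is dirty.  This
 * is primarily needed by knfsd.
 *
 * The caller must either have a ref on the inode or must have set I_WILL_FREE.
 */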
2576int write_inode_now(struct inode *inode, int sync)
2577{
2578 struct writeback_control wbc = {
2579 .nr_to_write = LONG_MAX,
2580 .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
2581 .range_start = 0,
2582 .range_end = LLONG_MAX,
2583 };
2584
2585 if (!mapping_can_writeback(inode->i_mapping))
2586 wbc.nr_to_write = 0;
2587
2588 might_sleep();
2589 return writeback_single_inode(inode, &wbc);
2590}
2591EXPORT_SYMBOL(write_inode_now);
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604int sync_inode(struct inode *inode, struct writeback_control *wbc)
2605{
2606 return writeback_single_inode(inode, wbc);
2607}
2608EXPORT_SYMBOL(sync_inode);
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619int sync_inode_metadata(struct inode *inode, int wait)
2620{
2621 struct writeback_control wbc = {
2622 .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
2623 .nr_to_write = 0,
2624 };
2625
2626 return sync_inode(inode, &wbc);
2627}
2628EXPORT_SYMBOL(sync_inode_metadata);
2629