// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/fs-writeback.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * Contains all the functions related to writing back and waiting
 * upon dirty inodes against superblocks, and writing back dirty
 * pages against inodes.  ie: data writeback.  Writeout of the
 * inode itself is not handled here.
 *
 * 10Apr2002	Andrew Morton
 *		Split out of fs/inode.c
 *		Additions for address_space-based writeback
 */
16#include <linux/kernel.h>
17#include <linux/export.h>
18#include <linux/spinlock.h>
19#include <linux/slab.h>
20#include <linux/sched.h>
21#include <linux/fs.h>
22#include <linux/mm.h>
23#include <linux/pagemap.h>
24#include <linux/kthread.h>
25#include <linux/writeback.h>
26#include <linux/blkdev.h>
27#include <linux/backing-dev.h>
28#include <linux/tracepoint.h>
29#include <linux/device.h>
30#include <linux/memcontrol.h>
31#include "internal.h"

/*
 * 4MB minimal write chunk size
 */
#define MIN_WRITEBACK_PAGES	(4096UL >> (PAGE_SHIFT - 10))

/*
 * Passed into wb_writeback(), essentially a subset of writeback_control
 */
struct wb_writeback_work {
	long nr_pages;
	struct super_block *sb;
	enum writeback_sync_modes sync_mode;
	unsigned int tagged_writepages:1;
	unsigned int for_kupdate:1;
	unsigned int range_cyclic:1;
	unsigned int for_background:1;
	unsigned int for_sync:1;	/* sync(2) WB_SYNC_ALL writeback */
	unsigned int auto_free:1;	/* free on completion */
	enum wb_reason reason;		/* why was writeback initiated? */

	struct list_head list;		/* pending work list */
	struct wb_completion *done;	/* set if the caller waits */
};
56
/*
 * Inodes dirtied only through timestamp updates (lazytime) sit on
 * wb->b_dirty_time and have their timestamps written back once they have
 * been dirty for more than dirtytime_expire_interval seconds (12 hours by
 * default); see wakeup_dirtytime_writeback() below.
 */
67unsigned int dirtytime_expire_interval = 12 * 60 * 60;
68
69static inline struct inode *wb_inode(struct list_head *head)
70{
71 return list_entry(head, struct inode, i_io_list);
72}
73
/*
 * Include the creation of the trace points after defining the
 * wb_writeback_work structure and inline functions so that the definition
 * remains local to this file.
 */
79#define CREATE_TRACE_POINTS
80#include <trace/events/writeback.h>
81
82EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);
83
84static bool wb_io_lists_populated(struct bdi_writeback *wb)
85{
86 if (wb_has_dirty_io(wb)) {
87 return false;
88 } else {
89 set_bit(WB_has_dirty_io, &wb->state);
90 WARN_ON_ONCE(!wb->avg_write_bandwidth);
91 atomic_long_add(wb->avg_write_bandwidth,
92 &wb->bdi->tot_write_bandwidth);
93 return true;
94 }
95}
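
/*
 * Note: wb_io_lists_populated() and wb_io_lists_depopulated() keep
 * bdi->tot_write_bandwidth equal to the sum of avg_write_bandwidth over
 * all wbs that currently have dirty IO; wb_split_bdi_pages() below relies
 * on that sum to hand out work proportionally.
 */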
96
97static void wb_io_lists_depopulated(struct bdi_writeback *wb)
98{
99 if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) &&
100 list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) {
101 clear_bit(WB_has_dirty_io, &wb->state);
102 WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth,
103 &wb->bdi->tot_write_bandwidth) < 0);
104 }
105}
106
/**
 * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list
 * @inode: inode to be moved
 * @wb: target bdi_writeback
 * @head: one of @wb->b_{dirty|io|more_io|dirty_time}
 *
 * Move @inode->i_io_list to @head of @wb and set %WB_has_dirty_io.
 * Returns %true if @inode is the first occupant of the !dirty_time IO
 * lists; otherwise, %false.
 */
117static bool inode_io_list_move_locked(struct inode *inode,
118 struct bdi_writeback *wb,
119 struct list_head *head)
120{
121 assert_spin_locked(&wb->list_lock);
122
123 list_move(&inode->i_io_list, head);
124
125
126 if (head != &wb->b_dirty_time)
127 return wb_io_lists_populated(wb);
128
129 wb_io_lists_depopulated(wb);
130 return false;
131}

/**
 * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list
 * @inode: inode to be removed
 * @wb: bdi_writeback @inode is being removed from
 *
 * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and
 * clear %WB_has_dirty_io if all are empty afterwards.
 */
141static void inode_io_list_del_locked(struct inode *inode,
142 struct bdi_writeback *wb)
143{
144 assert_spin_locked(&wb->list_lock);
145 assert_spin_locked(&inode->i_lock);
146
147 inode->i_state &= ~I_SYNC_QUEUED;
148 list_del_init(&inode->i_io_list);
149 wb_io_lists_depopulated(wb);
150}
151
152static void wb_wakeup(struct bdi_writeback *wb)
153{
154 spin_lock_bh(&wb->work_lock);
155 if (test_bit(WB_registered, &wb->state))
156 mod_delayed_work(bdi_wq, &wb->dwork, 0);
157 spin_unlock_bh(&wb->work_lock);
158}
159
160static void finish_writeback_work(struct bdi_writeback *wb,
161 struct wb_writeback_work *work)
162{
163 struct wb_completion *done = work->done;
164
165 if (work->auto_free)
166 kfree(work);
167 if (done) {
168 wait_queue_head_t *waitq = done->waitq;
169
170
171 if (atomic_dec_and_test(&done->cnt))
172 wake_up_all(waitq);
173 }
174}
175
176static void wb_queue_work(struct bdi_writeback *wb,
177 struct wb_writeback_work *work)
178{
179 trace_writeback_queue(wb, work);
180
181 if (work->done)
182 atomic_inc(&work->done->cnt);
183
184 spin_lock_bh(&wb->work_lock);
185
186 if (test_bit(WB_registered, &wb->state)) {
187 list_add_tail(&work->list, &wb->work_list);
188 mod_delayed_work(bdi_wq, &wb->dwork, 0);
189 } else
190 finish_writeback_work(wb, work);
191
192 spin_unlock_bh(&wb->work_lock);
193}
194
/**
 * wb_wait_for_completion - wait for completion of bdi_writeback_works
 * @done: target wb_completion
 *
 * Wait for one or more work items issued to @bdi with their ->done field
 * set to @done, which should have been initialized with
 * DEFINE_WB_COMPLETION().  This function returns after all such work items
 * are completed.  Work items which are waited upon aren't freed
 * automatically on completion.
 */
205void wb_wait_for_completion(struct wb_completion *done)
206{
207 atomic_dec(&done->cnt);
208 wait_event(*done->waitq, !atomic_read(&done->cnt));
209}
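
/*
 * Note: a wb_completion's counter is expected to start at one (that is
 * how DEFINE_WB_COMPLETION() initializes it); wb_queue_work() takes a
 * reference per queued work and finish_writeback_work() drops it, so the
 * atomic_dec() above releases the initial reference before waiting for
 * the count to reach zero.
 */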
210
211#ifdef CONFIG_CGROUP_WRITEBACK

/*
 * Parameters for foreign inode detection, see wbc_detach_inode() to see
 * how they're used.
 *
 * These parameters are inherently heuristical; which combination works
 * best is left to be determined through experiments.  The higher level
 * idea is: each writeback run measures how long it spent writing and which
 * cgroup owned most of the bytes.  The measured time is folded into a
 * per-inode moving average (WB_FRN_TIME_AVG_SHIFT), short runs below
 * avg/WB_FRN_TIME_CUT_DIV are ignored, and each significant run shifts the
 * 16 slot history (WB_FRN_HIST_SLOTS) in which foreign-dominated runs set
 * bits.  Once more than WB_FRN_HIST_THR_SLOTS of the history slots are
 * foreign, the inode is switched to the dominating wb.
 */
232#define WB_FRN_TIME_SHIFT 13
233#define WB_FRN_TIME_AVG_SHIFT 3
234#define WB_FRN_TIME_CUT_DIV 8
235#define WB_FRN_TIME_PERIOD (2 * (1 << WB_FRN_TIME_SHIFT))
236
237#define WB_FRN_HIST_SLOTS 16
238#define WB_FRN_HIST_UNIT (WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS)
239
240#define WB_FRN_HIST_THR_SLOTS (WB_FRN_HIST_SLOTS / 2)
241
242#define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1)
243
244#define WB_FRN_MAX_IN_FLIGHT 1024
245
246static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
247static struct workqueue_struct *isw_wq;
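
/*
 * Note: isw_nr_in_flight counts inode_switch_wbs_context items that have
 * been queued but not yet executed.  inode_switch_wbs() stops issuing new
 * switches once it exceeds WB_FRN_MAX_IN_FLIGHT, and
 * cgroup_writeback_umount() uses it to decide whether the RCU callback and
 * isw_wq need to be flushed before a super_block goes away.
 */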
248
249void __inode_attach_wb(struct inode *inode, struct page *page)
250{
251 struct backing_dev_info *bdi = inode_to_bdi(inode);
252 struct bdi_writeback *wb = NULL;
253
254 if (inode_cgwb_enabled(inode)) {
255 struct cgroup_subsys_state *memcg_css;
256
257 if (page) {
258 memcg_css = mem_cgroup_css_from_page(page);
259 wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
260 } else {
261
262 memcg_css = task_get_css(current, memory_cgrp_id);
263 wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
264 css_put(memcg_css);
265 }
266 }
267
268 if (!wb)
269 wb = &bdi->wb;
270
	/*
	 * There may be multiple instances of this function racing to
	 * update the same inode.  Use cmpxchg() to tell the winner.
	 */
275 if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
276 wb_put(wb);
277}
278EXPORT_SYMBOL_GPL(__inode_attach_wb);
279
/**
 * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
 * @inode: inode of interest with i_lock held
 *
 * Returns @inode's wb with its list_lock held.  @inode->i_lock must be
 * held on entry and is released on return.  The returned wb is guaranteed
 * to stay @inode's associated wb until its list_lock is released.
 */
288static struct bdi_writeback *
289locked_inode_to_wb_and_lock_list(struct inode *inode)
290 __releases(&inode->i_lock)
291 __acquires(&wb->list_lock)
292{
293 while (true) {
294 struct bdi_writeback *wb = inode_to_wb(inode);
295
296
297
298
299
300
301
302 wb_get(wb);
303 spin_unlock(&inode->i_lock);
304 spin_lock(&wb->list_lock);
305
306
307 if (likely(wb == inode->i_wb)) {
308 wb_put(wb);
309 return wb;
310 }
311
312 spin_unlock(&wb->list_lock);
313 wb_put(wb);
314 cpu_relax();
315 spin_lock(&inode->i_lock);
316 }
317}
318
/**
 * inode_to_wb_and_lock_list - determine an inode's wb and lock it
 * @inode: inode of interest
 *
 * Same as locked_inode_to_wb_and_lock_list() but @inode->i_lock isn't held
 * on entry.
 */
326static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
327 __acquires(&wb->list_lock)
328{
329 spin_lock(&inode->i_lock);
330 return locked_inode_to_wb_and_lock_list(inode);
331}
332
333struct inode_switch_wbs_context {
334 struct inode *inode;
335 struct bdi_writeback *new_wb;
336
337 struct rcu_head rcu_head;
338 struct work_struct work;
339};
340
341static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi)
342{
343 down_write(bdi->wb_switch_rwsem);
344}
345
346static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi)
347{
348 up_write(bdi->wb_switch_rwsem);
349}
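
/*
 * Note: bdi->wb_switch_rwsem is write-locked by sync_inodes_sb() around
 * issuing its WB_SYNC_ALL work (see bdi_down_write_wb_switch_rwsem() use
 * below) and read-locked by inode_switch_wbs_work_fn(), so a sync in
 * progress never races with an inode changing its wb association.
 */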
350
351static void inode_switch_wbs_work_fn(struct work_struct *work)
352{
353 struct inode_switch_wbs_context *isw =
354 container_of(work, struct inode_switch_wbs_context, work);
355 struct inode *inode = isw->inode;
356 struct backing_dev_info *bdi = inode_to_bdi(inode);
357 struct address_space *mapping = inode->i_mapping;
358 struct bdi_writeback *old_wb = inode->i_wb;
359 struct bdi_writeback *new_wb = isw->new_wb;
360 XA_STATE(xas, &mapping->i_pages, 0);
361 struct page *page;
362 bool switched = false;
363
364
365
366
367
368 down_read(bdi->wb_switch_rwsem);
369
370
371
372
373
374
375
376
377
378
379
380 if (old_wb < new_wb) {
381 spin_lock(&old_wb->list_lock);
382 spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
383 } else {
384 spin_lock(&new_wb->list_lock);
385 spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
386 }
387 spin_lock(&inode->i_lock);
388 xa_lock_irq(&mapping->i_pages);
389
390
391
392
393
394 if (unlikely(inode->i_state & I_FREEING))
395 goto skip_switch;
396
397 trace_inode_switch_wbs(inode, old_wb, new_wb);
398
399
400
401
402
403
404 xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_DIRTY) {
405 if (PageDirty(page)) {
406 dec_wb_stat(old_wb, WB_RECLAIMABLE);
407 inc_wb_stat(new_wb, WB_RECLAIMABLE);
408 }
409 }
410
411 xas_set(&xas, 0);
412 xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_WRITEBACK) {
413 WARN_ON_ONCE(!PageWriteback(page));
414 dec_wb_stat(old_wb, WB_WRITEBACK);
415 inc_wb_stat(new_wb, WB_WRITEBACK);
416 }
417
418 if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
419 atomic_dec(&old_wb->writeback_inodes);
420 atomic_inc(&new_wb->writeback_inodes);
421 }
422
423 wb_get(new_wb);
424
425
426
427
428
429
430
431 if (!list_empty(&inode->i_io_list)) {
432 struct inode *pos;
433
434 inode_io_list_del_locked(inode, old_wb);
435 inode->i_wb = new_wb;
436 list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
437 if (time_after_eq(inode->dirtied_when,
438 pos->dirtied_when))
439 break;
440 inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev);
441 } else {
442 inode->i_wb = new_wb;
443 }
444
445
446 inode->i_wb_frn_winner = 0;
447 inode->i_wb_frn_avg_time = 0;
448 inode->i_wb_frn_history = 0;
449 switched = true;
450skip_switch:
451
452
453
454
455 smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);
456
457 xa_unlock_irq(&mapping->i_pages);
458 spin_unlock(&inode->i_lock);
459 spin_unlock(&new_wb->list_lock);
460 spin_unlock(&old_wb->list_lock);
461
462 up_read(bdi->wb_switch_rwsem);
463
464 if (switched) {
465 wb_wakeup(new_wb);
466 wb_put(old_wb);
467 }
468 wb_put(new_wb);
469
470 iput(inode);
471 kfree(isw);
472
473 atomic_dec(&isw_nr_in_flight);
474}
475
476static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
477{
478 struct inode_switch_wbs_context *isw = container_of(rcu_head,
479 struct inode_switch_wbs_context, rcu_head);
480
481
482 INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
483 queue_work(isw_wq, &isw->work);
484}
485
/**
 * inode_switch_wbs - change the wb association of an inode
 * @inode: target inode
 * @new_wb_id: ID of the new wb
 *
 * Switch @inode's wb association to the wb identified by @new_wb_id.  The
 * switching is performed asynchronously and may fail silently.
 */
494static void inode_switch_wbs(struct inode *inode, int new_wb_id)
495{
496 struct backing_dev_info *bdi = inode_to_bdi(inode);
497 struct cgroup_subsys_state *memcg_css;
498 struct inode_switch_wbs_context *isw;
499
500
501 if (inode->i_state & I_WB_SWITCH)
502 return;
503
504
505 if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT)
506 return;
507
508 isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
509 if (!isw)
510 return;
511
512
513 rcu_read_lock();
514 memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys);
515 if (memcg_css)
516 isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
517 rcu_read_unlock();
518 if (!isw->new_wb)
519 goto out_free;
520
521
522 spin_lock(&inode->i_lock);
523 if (!(inode->i_sb->s_flags & SB_ACTIVE) ||
524 inode->i_state & (I_WB_SWITCH | I_FREEING) ||
525 inode_to_wb(inode) == isw->new_wb) {
526 spin_unlock(&inode->i_lock);
527 goto out_free;
528 }
529 inode->i_state |= I_WB_SWITCH;
530 __iget(inode);
531 spin_unlock(&inode->i_lock);
532
533 isw->inode = inode;
534
535 atomic_inc(&isw_nr_in_flight);
536
537
538
539
540
541
542
543 call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
544 return;
545
546out_free:
547 if (isw->new_wb)
548 wb_put(isw->new_wb);
549 kfree(isw);
550}
551
/**
 * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
 * @wbc: writeback_control of interest
 * @inode: target inode
 *
 * @inode is locked and about to be written back under the control of @wbc.
 * Record @inode's writeback context into @wbc and unlock the i_lock.  On
 * writeback completion, wbc_detach_inode() should be called.  This is used
 * to track the cgroup writeback context which is passed on to the per-page
 * accounting in wbc_account_cgroup_owner().
 */
562void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
563 struct inode *inode)
564{
565 if (!inode_cgwb_enabled(inode)) {
566 spin_unlock(&inode->i_lock);
567 return;
568 }
569
570 wbc->wb = inode_to_wb(inode);
571 wbc->inode = inode;
572
573 wbc->wb_id = wbc->wb->memcg_css->id;
574 wbc->wb_lcand_id = inode->i_wb_frn_winner;
575 wbc->wb_tcand_id = 0;
576 wbc->wb_bytes = 0;
577 wbc->wb_lcand_bytes = 0;
578 wbc->wb_tcand_bytes = 0;
579
580 wb_get(wbc->wb);
581 spin_unlock(&inode->i_lock);
582
	/*
	 * A dying wb indicates that either the blkcg associated with the
	 * memcg changed or the associated memcg is dying.  In the first
	 * case, a replacement wb should already be available and we should
	 * refresh the wb immediately.  In the second case, trying to
	 * refresh will keep failing.
	 */
590 if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css)))
591 inode_switch_wbs(inode, wbc->wb_id);
592}
593EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode);
594
/**
 * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
 * @wbc: writeback_control of the just finished writeback
 *
 * To be called after a writeback attempt of an inode finishes and undoes
 * wbc_attach_and_unlock_inode().  Can be called under any context.
 *
 * As concurrent write sharing of an inode is expected to be very rare and
 * memcg only tracks page ownership on first-use basis severely confining
 * the usefulness of such sharing, cgroup writeback tracks ownership
 * per-inode.  While the support for concurrent write sharing of an inode
 * is deemed unnecessary, an inode being written to by different cgroups at
 * different points in time is a lot more common, and, more importantly,
 * charging only by first-use can too readily lead to grossly incorrect
 * behaviors (single foreign page can lead to gigabytes of writeback being
 * incorrectly attributed).
 *
 * To resolve this issue, cgroup writeback detects the majority dirtier of
 * an inode and transfers the ownership to it.  To avoid unnecessary
 * oscillation, the detection mechanism keeps track of history and gives
 * out the switch verdict only if the foreign usage pattern is stable over
 * a certain amount of time and/or writeback attempts.
 *
 * On each writeback attempt, @wbc tries to detect the majority writer
 * using Boyer-Moore majority vote algorithm.  In addition to the byte
 * count from the majority voting, it also counts the bytes written for the
 * current wb and the last round's winner wb.  Keeping track of the
 * historical winner helps the algorithm to semi-reliably detect the most
 * active writer even when it's not the absolute majority.
 *
 * Once the winner of the round is determined, whether the winner is
 * foreign or not and how much IO time the round consumed is recorded in
 * inode->i_wb_frn_history.  If the amount of recorded foreign IO time is
 * larger than a certain amount, the switching machinery is triggered.
 */
632void wbc_detach_inode(struct writeback_control *wbc)
633{
634 struct bdi_writeback *wb = wbc->wb;
635 struct inode *inode = wbc->inode;
636 unsigned long avg_time, max_bytes, max_time;
637 u16 history;
638 int max_id;
639
640 if (!wb)
641 return;
642
643 history = inode->i_wb_frn_history;
644 avg_time = inode->i_wb_frn_avg_time;
645
646
647 if (wbc->wb_bytes >= wbc->wb_lcand_bytes &&
648 wbc->wb_bytes >= wbc->wb_tcand_bytes) {
649 max_id = wbc->wb_id;
650 max_bytes = wbc->wb_bytes;
651 } else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) {
652 max_id = wbc->wb_lcand_id;
653 max_bytes = wbc->wb_lcand_bytes;
654 } else {
655 max_id = wbc->wb_tcand_id;
656 max_bytes = wbc->wb_tcand_bytes;
657 }
658
659
660
661
662
663
664
665
666 max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT,
667 wb->avg_write_bandwidth);
668 if (avg_time)
669 avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) -
670 (avg_time >> WB_FRN_TIME_AVG_SHIFT);
671 else
672 avg_time = max_time;
673
674 if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) {
675 int slots;
676
677
678
679
680
681
682
683
684
685 slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT),
686 (unsigned long)WB_FRN_HIST_MAX_SLOTS);
687 history <<= slots;
688 if (wbc->wb_id != max_id)
689 history |= (1U << slots) - 1;
690
691 if (history)
692 trace_inode_foreign_history(inode, wbc, history);
693
694
695
696
697
698
699
700
701 if (hweight32(history) > WB_FRN_HIST_THR_SLOTS)
702 inode_switch_wbs(inode, max_id);
703 }
704
705
706
707
708
709 inode->i_wb_frn_winner = max_id;
710 inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX);
711 inode->i_wb_frn_history = history;
712
713 wb_put(wbc->wb);
714 wbc->wb = NULL;
715}
716EXPORT_SYMBOL_GPL(wbc_detach_inode);
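
/*
 * Worked example for the arithmetic above (a sketch; assumes
 * wb->avg_write_bandwidth is in pages per second): max_time ends up in
 * units of 1/8192 sec (WB_FRN_TIME_SHIFT), WB_FRN_TIME_PERIOD is ~2 sec
 * and WB_FRN_HIST_UNIT is ~1/8 sec.  A round that took roughly half a
 * second therefore shifts about 4 slots (capped at WB_FRN_HIST_MAX_SLOTS)
 * into i_wb_frn_history, setting them if the round was won by a foreign
 * cgroup; once more than WB_FRN_HIST_THR_SLOTS (8) of the 16 slots are
 * foreign, inode_switch_wbs() is invoked to hand the inode over.
 */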
717
/**
 * wbc_account_cgroup_owner - account writeback to a cgroup
 * @wbc: writeback_control of the writeback in progress
 * @page: page being written out
 * @bytes: number of bytes being written out
 *
 * @bytes from @page are about to be written out during the writeback
 * controlled by @wbc.  Keep the book to enable foreign inode detection in
 * wbc_detach_inode().
 */
728void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
729 size_t bytes)
730{
731 struct cgroup_subsys_state *css;
732 int id;
733
734
735
736
737
738
739
740 if (!wbc->wb || wbc->no_cgroup_owner)
741 return;
742
743 css = mem_cgroup_css_from_page(page);
744
745 if (!(css->flags & CSS_ONLINE))
746 return;
747
748 id = css->id;
749
750 if (id == wbc->wb_id) {
751 wbc->wb_bytes += bytes;
752 return;
753 }
754
755 if (id == wbc->wb_lcand_id)
756 wbc->wb_lcand_bytes += bytes;
757
758
759 if (!wbc->wb_tcand_bytes)
760 wbc->wb_tcand_id = id;
761 if (id == wbc->wb_tcand_id)
762 wbc->wb_tcand_bytes += bytes;
763 else
764 wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
765}
766EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner);
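
/*
 * Note: the wb_tcand_id/wb_tcand_bytes updates above are a Boyer-Moore
 * style majority vote over the cgroups seen during one writeback run: a
 * byte owned by the current candidate increments the count, a byte owned
 * by anyone else decrements it, and the candidate is replaced once the
 * count reaches zero.  wbc_detach_inode() then compares the surviving
 * candidate against the current wb and the last round's winner.
 */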
767
/**
 * inode_congested - test whether an inode is congested
 * @inode: inode to test for congestion (may be NULL)
 * @cong_bits: mask of WB_[a]sync_congested bits to test
 *
 * Tests whether @inode is congested.  @cong_bits is the mask of congestion
 * bits to test and the return value is the mask of set bits.
 *
 * If cgroup writeback is enabled for @inode, the congestion state is
 * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg
 * associated with @inode is congested; otherwise, the root wb's congestion
 * state is used.
 *
 * @inode is allowed to be NULL as this function is often called on
 * mapping->host which is NULL for the swapper space.
 */
784int inode_congested(struct inode *inode, int cong_bits)
785{
786
787
788
789
790 if (inode && inode_to_wb_is_valid(inode)) {
791 struct bdi_writeback *wb;
792 struct wb_lock_cookie lock_cookie = {};
793 bool congested;
794
795 wb = unlocked_inode_to_wb_begin(inode, &lock_cookie);
796 congested = wb_congested(wb, cong_bits);
797 unlocked_inode_to_wb_end(inode, &lock_cookie);
798 return congested;
799 }
800
801 return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
802}
803EXPORT_SYMBOL_GPL(inode_congested);
804
/**
 * wb_split_bdi_pages - split nr_pages to write according to bandwidth
 * @wb: target bdi_writeback to split @nr_pages to
 * @nr_pages: number of pages to write for the whole bdi
 *
 * Split @wb's portion of @nr_pages according to @wb's write bandwidth in
 * relation to the total write bandwidth of all wb's w/ dirty inodes on
 * @wb->bdi.
 */
814static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
815{
816 unsigned long this_bw = wb->avg_write_bandwidth;
817 unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
818
819 if (nr_pages == LONG_MAX)
820 return LONG_MAX;
821
822
823
824
825
826
827 if (!tot_bw || this_bw >= tot_bw)
828 return nr_pages;
829 else
830 return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw);
831}
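
/*
 * Example: a wb whose avg_write_bandwidth is 40% of the bdi's total write
 * bandwidth is asked to write roughly 40% of nr_pages; LONG_MAX (used by
 * "write everything" requests) is passed through unchanged.
 */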
832
/**
 * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
 * @bdi: target backing_dev_info
 * @base_work: wb_writeback_work to issue
 * @skip_if_busy: skip wb's which already have writeback in progress
 *
 * Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which
 * have dirty inodes.  If @base_work->nr_pages isn't %LONG_MAX, it's
 * distributed to the busy wbs according to each wb's proportion in the
 * total active write bandwidth of @bdi.
 */
844static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
845 struct wb_writeback_work *base_work,
846 bool skip_if_busy)
847{
848 struct bdi_writeback *last_wb = NULL;
849 struct bdi_writeback *wb = list_entry(&bdi->wb_list,
850 struct bdi_writeback, bdi_node);
851
852 might_sleep();
853restart:
854 rcu_read_lock();
855 list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
856 DEFINE_WB_COMPLETION(fallback_work_done, bdi);
857 struct wb_writeback_work fallback_work;
858 struct wb_writeback_work *work;
859 long nr_pages;
860
861 if (last_wb) {
862 wb_put(last_wb);
863 last_wb = NULL;
864 }
865
866
867 if (!wb_has_dirty_io(wb) &&
868 (base_work->sync_mode == WB_SYNC_NONE ||
869 list_empty(&wb->b_dirty_time)))
870 continue;
871 if (skip_if_busy && writeback_in_progress(wb))
872 continue;
873
874 nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);
875
876 work = kmalloc(sizeof(*work), GFP_ATOMIC);
877 if (work) {
878 *work = *base_work;
879 work->nr_pages = nr_pages;
880 work->auto_free = 1;
881 wb_queue_work(wb, work);
882 continue;
883 }
884
885
886 work = &fallback_work;
887 *work = *base_work;
888 work->nr_pages = nr_pages;
889 work->auto_free = 0;
890 work->done = &fallback_work_done;
891
892 wb_queue_work(wb, work);
893
894
895
896
897
898
899 wb_get(wb);
900 last_wb = wb;
901
902 rcu_read_unlock();
903 wb_wait_for_completion(&fallback_work_done);
904 goto restart;
905 }
906 rcu_read_unlock();
907
908 if (last_wb)
909 wb_put(last_wb);
910}
911
/**
 * cgroup_writeback_by_id - initiate cgroup writeback from bdi and memcg IDs
 * @bdi_id: target bdi id
 * @memcg_id: target memcg css id
 * @nr: number of pages to write, 0 for best-effort dirty flushing
 * @reason: reason why some writeback work is initiated
 * @done: target wb_completion
 *
 * Issue writeback work for the bdi and memcg identified by @bdi_id and
 * @memcg_id.  When @nr is zero, an amount based on the wb's current number
 * of dirty pages (with some headroom) is used instead.
 */
923int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr,
924 enum wb_reason reason, struct wb_completion *done)
925{
926 struct backing_dev_info *bdi;
927 struct cgroup_subsys_state *memcg_css;
928 struct bdi_writeback *wb;
929 struct wb_writeback_work *work;
930 int ret;
931
932
933 bdi = bdi_get_by_id(bdi_id);
934 if (!bdi)
935 return -ENOENT;
936
937 rcu_read_lock();
938 memcg_css = css_from_id(memcg_id, &memory_cgrp_subsys);
939 if (memcg_css && !css_tryget(memcg_css))
940 memcg_css = NULL;
941 rcu_read_unlock();
942 if (!memcg_css) {
943 ret = -ENOENT;
944 goto out_bdi_put;
945 }
946
947
948
949
950
951 wb = wb_get_lookup(bdi, memcg_css);
952 if (!wb) {
953 ret = -ENOENT;
954 goto out_css_put;
955 }
956
957
958
959
960
961
962
963
964 if (!nr) {
965 unsigned long filepages, headroom, dirty, writeback;
966
967 mem_cgroup_wb_stats(wb, &filepages, &headroom, &dirty,
968 &writeback);
969 nr = dirty * 10 / 8;
970 }
971
972
973 work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN);
974 if (work) {
975 work->nr_pages = nr;
976 work->sync_mode = WB_SYNC_NONE;
977 work->range_cyclic = 1;
978 work->reason = reason;
979 work->done = done;
980 work->auto_free = 1;
981 wb_queue_work(wb, work);
982 ret = 0;
983 } else {
984 ret = -ENOMEM;
985 }
986
987 wb_put(wb);
988out_css_put:
989 css_put(memcg_css);
990out_bdi_put:
991 bdi_put(bdi);
992 return ret;
993}
994
/**
 * cgroup_writeback_umount - flush inode wb switches for umount
 *
 * This function is called when a super_block is about to be destroyed and
 * flushes in-flight inode wb switches.  An inode wb switch goes through
 * RCU and then workqueue, so the two need to be flushed in order to ensure
 * that all previously scheduled switches are finished.  As wb switches are
 * rare occurrences and synchronize_rcu() can take a while, perform
 * flushing iff wb switches are in flight.
 */
1005void cgroup_writeback_umount(void)
1006{
1007 if (atomic_read(&isw_nr_in_flight)) {
1008 synchronize_rcu();
1009 flush_workqueue(isw_wq);
1010 }
1011}
1012
1013static int __init cgroup_writeback_init(void)
1014{
1015 isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
1016 if (!isw_wq)
1017 return -ENOMEM;
1018 return 0;
1019}
1020fs_initcall(cgroup_writeback_init);
1021
1022#else
1023
1024static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
1025static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
1026
1027static struct bdi_writeback *
1028locked_inode_to_wb_and_lock_list(struct inode *inode)
1029 __releases(&inode->i_lock)
1030 __acquires(&wb->list_lock)
1031{
1032 struct bdi_writeback *wb = inode_to_wb(inode);
1033
1034 spin_unlock(&inode->i_lock);
1035 spin_lock(&wb->list_lock);
1036 return wb;
1037}
1038
1039static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
1040 __acquires(&wb->list_lock)
1041{
1042 struct bdi_writeback *wb = inode_to_wb(inode);
1043
1044 spin_lock(&wb->list_lock);
1045 return wb;
1046}
1047
1048static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
1049{
1050 return nr_pages;
1051}
1052
1053static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
1054 struct wb_writeback_work *base_work,
1055 bool skip_if_busy)
1056{
1057 might_sleep();
1058
1059 if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) {
1060 base_work->auto_free = 0;
1061 wb_queue_work(&bdi->wb, base_work);
1062 }
1063}
1064
1065#endif
1066
1067
1068
1069
1070
1071static unsigned long get_nr_dirty_pages(void)
1072{
1073 return global_node_page_state(NR_FILE_DIRTY) +
1074 get_nr_dirty_inodes();
1075}
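
/*
 * Note: the dirty inode count is added on top of NR_FILE_DIRTY because
 * writing an inode back can itself dirty pagecache (e.g. in the underlying
 * blockdev), so each potentially dirty inode is treated as roughly one
 * page worth of extra work.
 */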
1076
1077static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason)
1078{
1079 if (!wb_has_dirty_io(wb))
1080 return;
1081
	/*
	 * All callers of this function want to start writeback of all
	 * dirty pages.  Places like vmscan can call this at a very
	 * high frequency, causing pointless allocations of tons of
	 * work items and keeping the flusher threads busy retrieving
	 * that work.  Ensure that we only allow one of them pending and
	 * inflight at the time.
	 */
1090 if (test_bit(WB_start_all, &wb->state) ||
1091 test_and_set_bit(WB_start_all, &wb->state))
1092 return;
1093
1094 wb->start_all_reason = reason;
1095 wb_wakeup(wb);
1096}
1097
/**
 * wb_start_background_writeback - start background writeback
 * @wb: bdi_writeback to write from
 *
 * Description:
 *   This makes sure WB_SYNC_NONE background writeback happens.  When this
 *   function returns, it is only guaranteed that for given wb some IO is
 *   happening if we are over background dirty threshold.  Caller need not
 *   hold sb s_umount semaphore.
 */
1108void wb_start_background_writeback(struct bdi_writeback *wb)
1109{
1110
1111
1112
1113
1114 trace_writeback_wake_background(wb);
1115 wb_wakeup(wb);
1116}
1117
1118
1119
1120
1121void inode_io_list_del(struct inode *inode)
1122{
1123 struct bdi_writeback *wb;
1124
1125 wb = inode_to_wb_and_lock_list(inode);
1126 spin_lock(&inode->i_lock);
1127 inode_io_list_del_locked(inode, wb);
1128 spin_unlock(&inode->i_lock);
1129 spin_unlock(&wb->list_lock);
1130}
1131EXPORT_SYMBOL(inode_io_list_del);
1132
1133
1134
1135
1136void sb_mark_inode_writeback(struct inode *inode)
1137{
1138 struct super_block *sb = inode->i_sb;
1139 unsigned long flags;
1140
1141 if (list_empty(&inode->i_wb_list)) {
1142 spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
1143 if (list_empty(&inode->i_wb_list)) {
1144 list_add_tail(&inode->i_wb_list, &sb->s_inodes_wb);
1145 trace_sb_mark_inode_writeback(inode);
1146 }
1147 spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
1148 }
1149}
1150
1151
1152
1153
1154void sb_clear_inode_writeback(struct inode *inode)
1155{
1156 struct super_block *sb = inode->i_sb;
1157 unsigned long flags;
1158
1159 if (!list_empty(&inode->i_wb_list)) {
1160 spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
1161 if (!list_empty(&inode->i_wb_list)) {
1162 list_del_init(&inode->i_wb_list);
1163 trace_sb_clear_inode_writeback(inode);
1164 }
1165 spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
1166 }
1167}
1168
/*
 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
 * furthest end of its superblock's dirty-inode list.
 *
 * Before stamping the inode's ->dirtied_when, we check to see whether it is
 * already the most-recently-dirtied inode on the b_dirty list.  If that is
 * the case then the inode must have been redirtied while it was being written
 * out and we don't reset its dirtied_when.
 */
1178static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb)
1179{
1180 assert_spin_locked(&inode->i_lock);
1181
1182 if (!list_empty(&wb->b_dirty)) {
1183 struct inode *tail;
1184
1185 tail = wb_inode(wb->b_dirty.next);
1186 if (time_before(inode->dirtied_when, tail->dirtied_when))
1187 inode->dirtied_when = jiffies;
1188 }
1189 inode_io_list_move_locked(inode, wb, &wb->b_dirty);
1190 inode->i_state &= ~I_SYNC_QUEUED;
1191}
1192
1193static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
1194{
1195 spin_lock(&inode->i_lock);
1196 redirty_tail_locked(inode, wb);
1197 spin_unlock(&inode->i_lock);
1198}
1199
1200
1201
1202
1203static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
1204{
1205 inode_io_list_move_locked(inode, wb, &wb->b_more_io);
1206}
1207
1208static void inode_sync_complete(struct inode *inode)
1209{
1210 inode->i_state &= ~I_SYNC;
1211
1212 inode_add_lru(inode);
1213
1214 smp_mb();
1215 wake_up_bit(&inode->i_state, __I_SYNC);
1216}
1217
1218static bool inode_dirtied_after(struct inode *inode, unsigned long t)
1219{
1220 bool ret = time_after(inode->dirtied_when, t);
1221#ifndef CONFIG_64BIT
1222
1223
1224
1225
1226
1227
1228 ret = ret && time_before_eq(inode->dirtied_when, jiffies);
1229#endif
1230 return ret;
1231}
1232
1233#define EXPIRE_DIRTY_ATIME 0x0001
1234
1235
1236
1237
1238
1239static int move_expired_inodes(struct list_head *delaying_queue,
1240 struct list_head *dispatch_queue,
1241 unsigned long dirtied_before)
1242{
1243 LIST_HEAD(tmp);
1244 struct list_head *pos, *node;
1245 struct super_block *sb = NULL;
1246 struct inode *inode;
1247 int do_sb_sort = 0;
1248 int moved = 0;
1249
1250 while (!list_empty(delaying_queue)) {
1251 inode = wb_inode(delaying_queue->prev);
1252 if (inode_dirtied_after(inode, dirtied_before))
1253 break;
1254 list_move(&inode->i_io_list, &tmp);
1255 moved++;
1256 spin_lock(&inode->i_lock);
1257 inode->i_state |= I_SYNC_QUEUED;
1258 spin_unlock(&inode->i_lock);
1259 if (sb_is_blkdev_sb(inode->i_sb))
1260 continue;
1261 if (sb && sb != inode->i_sb)
1262 do_sb_sort = 1;
1263 sb = inode->i_sb;
1264 }
1265
1266
1267 if (!do_sb_sort) {
1268 list_splice(&tmp, dispatch_queue);
1269 goto out;
1270 }
1271
1272
1273 while (!list_empty(&tmp)) {
1274 sb = wb_inode(tmp.prev)->i_sb;
1275 list_for_each_prev_safe(pos, node, &tmp) {
1276 inode = wb_inode(pos);
1277 if (inode->i_sb == sb)
1278 list_move(&inode->i_io_list, dispatch_queue);
1279 }
1280 }
1281out:
1282 return moved;
1283}
1284
/*
 * Queue all expired dirty inodes for io, eldest first.
 * Before
 *         newly dirtied     b_dirty    b_io    b_more_io
 *         =============>    gf         edc     BA
 * After
 *         newly dirtied     b_dirty    b_io    b_more_io
 *         =============>    g          fBAedc
 *                                           |
 *                                           +--> dequeue for IO
 */
1296static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work,
1297 unsigned long dirtied_before)
1298{
1299 int moved;
1300 unsigned long time_expire_jif = dirtied_before;
1301
1302 assert_spin_locked(&wb->list_lock);
1303 list_splice_init(&wb->b_more_io, &wb->b_io);
1304 moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, dirtied_before);
1305 if (!work->for_sync)
1306 time_expire_jif = jiffies - dirtytime_expire_interval * HZ;
1307 moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
1308 time_expire_jif);
1309 if (moved)
1310 wb_io_lists_populated(wb);
1311 trace_writeback_queue_io(wb, work, dirtied_before, moved);
1312}
1313
1314static int write_inode(struct inode *inode, struct writeback_control *wbc)
1315{
1316 int ret;
1317
1318 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) {
1319 trace_writeback_write_inode_start(inode, wbc);
1320 ret = inode->i_sb->s_op->write_inode(inode, wbc);
1321 trace_writeback_write_inode(inode, wbc);
1322 return ret;
1323 }
1324 return 0;
1325}
1326
1327
1328
1329
1330
1331static void __inode_wait_for_writeback(struct inode *inode)
1332 __releases(inode->i_lock)
1333 __acquires(inode->i_lock)
1334{
1335 DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
1336 wait_queue_head_t *wqh;
1337
1338 wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
1339 while (inode->i_state & I_SYNC) {
1340 spin_unlock(&inode->i_lock);
1341 __wait_on_bit(wqh, &wq, bit_wait,
1342 TASK_UNINTERRUPTIBLE);
1343 spin_lock(&inode->i_lock);
1344 }
1345}
1346
1347
1348
1349
1350void inode_wait_for_writeback(struct inode *inode)
1351{
1352 spin_lock(&inode->i_lock);
1353 __inode_wait_for_writeback(inode);
1354 spin_unlock(&inode->i_lock);
1355}
1356
1357
1358
1359
1360
1361
1362static void inode_sleep_on_writeback(struct inode *inode)
1363 __releases(inode->i_lock)
1364{
1365 DEFINE_WAIT(wait);
1366 wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
1367 int sleep;
1368
1369 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
1370 sleep = inode->i_state & I_SYNC;
1371 spin_unlock(&inode->i_lock);
1372 if (sleep)
1373 schedule();
1374 finish_wait(wqh, &wait);
1375}
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
1386 struct writeback_control *wbc)
1387{
1388 if (inode->i_state & I_FREEING)
1389 return;
1390
1391
1392
1393
1394
1395
1396 if ((inode->i_state & I_DIRTY) &&
1397 (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
1398 inode->dirtied_when = jiffies;
1399
1400 if (wbc->pages_skipped) {
1401
1402
1403
1404
1405 redirty_tail_locked(inode, wb);
1406 return;
1407 }
1408
1409 if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
1410
1411
1412
1413
1414 if (wbc->nr_to_write <= 0) {
1415
1416 requeue_io(inode, wb);
1417 } else {
1418
1419
1420
1421
1422
1423
1424
1425 redirty_tail_locked(inode, wb);
1426 }
1427 } else if (inode->i_state & I_DIRTY) {
1428
1429
1430
1431
1432
1433 redirty_tail_locked(inode, wb);
1434 } else if (inode->i_state & I_DIRTY_TIME) {
1435 inode->dirtied_when = jiffies;
1436 inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
1437 inode->i_state &= ~I_SYNC_QUEUED;
1438 } else {
1439
1440 inode_io_list_del_locked(inode, wb);
1441 }
1442}
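
/*
 * Summary of the policy above: inodes with remaining dirty pages go back
 * to b_more_io when the chunk budget ran out (so other inodes get a turn)
 * and to the tail of b_dirty when writeback made no real progress; inodes
 * with dirty metadata are also redirtied; inodes that are only
 * timestamp-dirty move to b_dirty_time; clean inodes drop off the IO lists
 * entirely.
 */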
1443
/*
 * Write out an inode and its dirty pages.  Do not update the writeback list
 * linkage.  That is left to the caller.  The caller is also responsible for
 * setting the I_SYNC flag and calling inode_sync_complete() to clear it.
 */
1449static int
1450__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
1451{
1452 struct address_space *mapping = inode->i_mapping;
1453 long nr_to_write = wbc->nr_to_write;
1454 unsigned dirty;
1455 int ret;
1456
1457 WARN_ON(!(inode->i_state & I_SYNC));
1458
1459 trace_writeback_single_inode_start(inode, wbc, nr_to_write);
1460
1461 ret = do_writepages(mapping, wbc);
1462
1463
1464
1465
1466
1467
1468
1469
1470 if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
1471 int err = filemap_fdatawait(mapping);
1472 if (ret == 0)
1473 ret = err;
1474 }
1475
1476
1477
1478
1479
1480
1481 if ((inode->i_state & I_DIRTY_TIME) &&
1482 (wbc->sync_mode == WB_SYNC_ALL || wbc->for_sync ||
1483 time_after(jiffies, inode->dirtied_time_when +
1484 dirtytime_expire_interval * HZ))) {
1485 trace_writeback_lazytime(inode);
1486 mark_inode_dirty_sync(inode);
1487 }
1488
1489
1490
1491
1492
1493
1494 spin_lock(&inode->i_lock);
1495 dirty = inode->i_state & I_DIRTY;
1496 inode->i_state &= ~dirty;
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509 smp_mb();
1510
1511 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
1512 inode->i_state |= I_DIRTY_PAGES;
1513
1514 spin_unlock(&inode->i_lock);
1515
1516
1517 if (dirty & ~I_DIRTY_PAGES) {
1518 int err = write_inode(inode, wbc);
1519 if (ret == 0)
1520 ret = err;
1521 }
1522 trace_writeback_single_inode(inode, wbc, nr_to_write);
1523 return ret;
1524}
1525
/*
 * Write out an inode's dirty pages.  Either the caller has an active reference
 * on the inode or the inode has I_WILL_FREE set.
 *
 * This function is designed to be called for writing back one inode which
 * we go e.g. from filesystem.  The flusher thread uses __writeback_single_inode()
 * and does more profound writeback list handling in writeback_sb_inodes().
 */
1534static int writeback_single_inode(struct inode *inode,
1535 struct writeback_control *wbc)
1536{
1537 struct bdi_writeback *wb;
1538 int ret = 0;
1539
1540 spin_lock(&inode->i_lock);
1541 if (!atomic_read(&inode->i_count))
1542 WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
1543 else
1544 WARN_ON(inode->i_state & I_WILL_FREE);
1545
1546 if (inode->i_state & I_SYNC) {
1547 if (wbc->sync_mode != WB_SYNC_ALL)
1548 goto out;
1549
1550
1551
1552
1553
1554 __inode_wait_for_writeback(inode);
1555 }
1556 WARN_ON(inode->i_state & I_SYNC);
1557
1558
1559
1560
1561
1562
1563
1564
1565 if (!(inode->i_state & I_DIRTY_ALL) &&
1566 (wbc->sync_mode != WB_SYNC_ALL ||
1567 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
1568 goto out;
1569 inode->i_state |= I_SYNC;
1570 wbc_attach_and_unlock_inode(wbc, inode);
1571
1572 ret = __writeback_single_inode(inode, wbc);
1573
1574 wbc_detach_inode(wbc);
1575
1576 wb = inode_to_wb_and_lock_list(inode);
1577 spin_lock(&inode->i_lock);
1578
1579
1580
1581
1582 if (!(inode->i_state & I_DIRTY_ALL))
1583 inode_io_list_del_locked(inode, wb);
1584 spin_unlock(&wb->list_lock);
1585 inode_sync_complete(inode);
1586out:
1587 spin_unlock(&inode->i_lock);
1588 return ret;
1589}
1590
1591static long writeback_chunk_size(struct bdi_writeback *wb,
1592 struct wb_writeback_work *work)
1593{
1594 long pages;
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609 if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
1610 pages = LONG_MAX;
1611 else {
1612 pages = min(wb->avg_write_bandwidth / 2,
1613 global_wb_domain.dirty_limit / DIRTY_SCOPE);
1614 pages = min(pages, work->nr_pages);
1615 pages = round_down(pages + MIN_WRITEBACK_PAGES,
1616 MIN_WRITEBACK_PAGES);
1617 }
1618
1619 return pages;
1620}
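
/*
 * Note: for WB_SYNC_NONE work the chunk is derived from half the wb's
 * average write bandwidth (roughly half a second worth of IO), further
 * bounded by a fraction of the global dirty limit and the remaining work,
 * and rounded to a multiple of MIN_WRITEBACK_PAGES so writeback does not
 * ping-pong between inodes in tiny chunks.
 */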
1621
/*
 * Write a portion of b_io inodes which belong to @sb.
 *
 * Return the number of pages and/or inodes written.
 *
 * NOTE! This is called with wb->list_lock held, and will
 * unlock and relock that for each inode it ends up doing
 * IO for.
 */
1631static long writeback_sb_inodes(struct super_block *sb,
1632 struct bdi_writeback *wb,
1633 struct wb_writeback_work *work)
1634{
1635 struct writeback_control wbc = {
1636 .sync_mode = work->sync_mode,
1637 .tagged_writepages = work->tagged_writepages,
1638 .for_kupdate = work->for_kupdate,
1639 .for_background = work->for_background,
1640 .for_sync = work->for_sync,
1641 .range_cyclic = work->range_cyclic,
1642 .range_start = 0,
1643 .range_end = LLONG_MAX,
1644 };
1645 unsigned long start_time = jiffies;
1646 long write_chunk;
1647 long wrote = 0;
1648
1649 while (!list_empty(&wb->b_io)) {
1650 struct inode *inode = wb_inode(wb->b_io.prev);
1651 struct bdi_writeback *tmp_wb;
1652
1653 if (inode->i_sb != sb) {
1654 if (work->sb) {
1655
1656
1657
1658
1659
1660 redirty_tail(inode, wb);
1661 continue;
1662 }
1663
1664
1665
1666
1667
1668
1669 break;
1670 }
1671
1672
1673
1674
1675
1676
1677 spin_lock(&inode->i_lock);
1678 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
1679 redirty_tail_locked(inode, wb);
1680 spin_unlock(&inode->i_lock);
1681 continue;
1682 }
1683 if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693 spin_unlock(&inode->i_lock);
1694 requeue_io(inode, wb);
1695 trace_writeback_sb_inodes_requeue(inode);
1696 continue;
1697 }
1698 spin_unlock(&wb->list_lock);
1699
1700
1701
1702
1703
1704
1705 if (inode->i_state & I_SYNC) {
1706
1707 inode_sleep_on_writeback(inode);
1708
1709 spin_lock(&wb->list_lock);
1710 continue;
1711 }
1712 inode->i_state |= I_SYNC;
1713 wbc_attach_and_unlock_inode(&wbc, inode);
1714
1715 write_chunk = writeback_chunk_size(wb, work);
1716 wbc.nr_to_write = write_chunk;
1717 wbc.pages_skipped = 0;
1718
1719
1720
1721
1722
1723 __writeback_single_inode(inode, &wbc);
1724
1725 wbc_detach_inode(&wbc);
1726 work->nr_pages -= write_chunk - wbc.nr_to_write;
1727 wrote += write_chunk - wbc.nr_to_write;
1728
1729 if (need_resched()) {
1730
1731
1732
1733
1734
1735
1736
1737
1738 blk_flush_plug(current);
1739 cond_resched();
1740 }
1741
1742
1743
1744
1745
1746 tmp_wb = inode_to_wb_and_lock_list(inode);
1747 spin_lock(&inode->i_lock);
1748 if (!(inode->i_state & I_DIRTY_ALL))
1749 wrote++;
1750 requeue_inode(inode, tmp_wb, &wbc);
1751 inode_sync_complete(inode);
1752 spin_unlock(&inode->i_lock);
1753
1754 if (unlikely(tmp_wb != wb)) {
1755 spin_unlock(&tmp_wb->list_lock);
1756 spin_lock(&wb->list_lock);
1757 }
1758
1759
1760
1761
1762
1763 if (wrote) {
1764 if (time_is_before_jiffies(start_time + HZ / 10UL))
1765 break;
1766 if (work->nr_pages <= 0)
1767 break;
1768 }
1769 }
1770 return wrote;
1771}
1772
1773static long __writeback_inodes_wb(struct bdi_writeback *wb,
1774 struct wb_writeback_work *work)
1775{
1776 unsigned long start_time = jiffies;
1777 long wrote = 0;
1778
1779 while (!list_empty(&wb->b_io)) {
1780 struct inode *inode = wb_inode(wb->b_io.prev);
1781 struct super_block *sb = inode->i_sb;
1782
1783 if (!trylock_super(sb)) {
1784
1785
1786
1787
1788
1789 redirty_tail(inode, wb);
1790 continue;
1791 }
1792 wrote += writeback_sb_inodes(sb, wb, work);
1793 up_read(&sb->s_umount);
1794
1795
1796 if (wrote) {
1797 if (time_is_before_jiffies(start_time + HZ / 10UL))
1798 break;
1799 if (work->nr_pages <= 0)
1800 break;
1801 }
1802 }
1803
1804 return wrote;
1805}
1806
1807static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
1808 enum wb_reason reason)
1809{
1810 struct wb_writeback_work work = {
1811 .nr_pages = nr_pages,
1812 .sync_mode = WB_SYNC_NONE,
1813 .range_cyclic = 1,
1814 .reason = reason,
1815 };
1816 struct blk_plug plug;
1817
1818 blk_start_plug(&plug);
1819 spin_lock(&wb->list_lock);
1820 if (list_empty(&wb->b_io))
1821 queue_io(wb, &work, jiffies);
1822 __writeback_inodes_wb(wb, &work);
1823 spin_unlock(&wb->list_lock);
1824 blk_finish_plug(&plug);
1825
1826 return nr_pages - work.nr_pages;
1827}
1828
/*
 * Explicit flushing or periodic writeback of "old" data.
 *
 * Define "old": the first time one of an inode's pages is dirtied, we mark the
 * dirtying-time in the inode's address_space.  So this periodic writeback code
 * just walks the superblock inode list, writing back any inodes which are
 * older than a specific point in time.
 *
 * Try to run once per dirty_writeback_interval.  But if a writeback event
 * takes longer than a dirty_writeback_interval interval, then leave a
 * one-second gap.
 *
 * dirtied_before takes precedence over nr_to_write.  So we'll only write back
 * all dirty pages if they are all attached to "old" mappings.
 */
1844static long wb_writeback(struct bdi_writeback *wb,
1845 struct wb_writeback_work *work)
1846{
1847 long nr_pages = work->nr_pages;
1848 unsigned long dirtied_before = jiffies;
1849 struct inode *inode;
1850 long progress;
1851 struct blk_plug plug;
1852
1853 blk_start_plug(&plug);
1854 spin_lock(&wb->list_lock);
1855 for (;;) {
1856
1857
1858
1859 if (work->nr_pages <= 0)
1860 break;
1861
1862
1863
1864
1865
1866
1867
1868 if ((work->for_background || work->for_kupdate) &&
1869 !list_empty(&wb->work_list))
1870 break;
1871
1872
1873
1874
1875
1876 if (work->for_background && !wb_over_bg_thresh(wb))
1877 break;
1878
1879
1880
1881
1882
1883
1884
1885 if (work->for_kupdate) {
1886 dirtied_before = jiffies -
1887 msecs_to_jiffies(dirty_expire_interval * 10);
1888 } else if (work->for_background)
1889 dirtied_before = jiffies;
1890
1891 trace_writeback_start(wb, work);
1892 if (list_empty(&wb->b_io))
1893 queue_io(wb, work, dirtied_before);
1894 if (work->sb)
1895 progress = writeback_sb_inodes(work->sb, wb, work);
1896 else
1897 progress = __writeback_inodes_wb(wb, work);
1898 trace_writeback_written(wb, work);
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908 if (progress)
1909 continue;
1910
1911
1912
1913 if (list_empty(&wb->b_more_io))
1914 break;
1915
1916
1917
1918
1919
1920 trace_writeback_wait(wb, work);
1921 inode = wb_inode(wb->b_more_io.prev);
1922 spin_lock(&inode->i_lock);
1923 spin_unlock(&wb->list_lock);
1924
1925 inode_sleep_on_writeback(inode);
1926 spin_lock(&wb->list_lock);
1927 }
1928 spin_unlock(&wb->list_lock);
1929 blk_finish_plug(&plug);
1930
1931 return nr_pages - work->nr_pages;
1932}
1933
1934
1935
1936
1937static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
1938{
1939 struct wb_writeback_work *work = NULL;
1940
1941 spin_lock_bh(&wb->work_lock);
1942 if (!list_empty(&wb->work_list)) {
1943 work = list_entry(wb->work_list.next,
1944 struct wb_writeback_work, list);
1945 list_del_init(&work->list);
1946 }
1947 spin_unlock_bh(&wb->work_lock);
1948 return work;
1949}
1950
1951static long wb_check_background_flush(struct bdi_writeback *wb)
1952{
1953 if (wb_over_bg_thresh(wb)) {
1954
1955 struct wb_writeback_work work = {
1956 .nr_pages = LONG_MAX,
1957 .sync_mode = WB_SYNC_NONE,
1958 .for_background = 1,
1959 .range_cyclic = 1,
1960 .reason = WB_REASON_BACKGROUND,
1961 };
1962
1963 return wb_writeback(wb, &work);
1964 }
1965
1966 return 0;
1967}
1968
1969static long wb_check_old_data_flush(struct bdi_writeback *wb)
1970{
1971 unsigned long expired;
1972 long nr_pages;
1973
1974
1975
1976
1977 if (!dirty_writeback_interval)
1978 return 0;
1979
1980 expired = wb->last_old_flush +
1981 msecs_to_jiffies(dirty_writeback_interval * 10);
1982 if (time_before(jiffies, expired))
1983 return 0;
1984
1985 wb->last_old_flush = jiffies;
1986 nr_pages = get_nr_dirty_pages();
1987
1988 if (nr_pages) {
1989 struct wb_writeback_work work = {
1990 .nr_pages = nr_pages,
1991 .sync_mode = WB_SYNC_NONE,
1992 .for_kupdate = 1,
1993 .range_cyclic = 1,
1994 .reason = WB_REASON_PERIODIC,
1995 };
1996
1997 return wb_writeback(wb, &work);
1998 }
1999
2000 return 0;
2001}
2002
2003static long wb_check_start_all(struct bdi_writeback *wb)
2004{
2005 long nr_pages;
2006
2007 if (!test_bit(WB_start_all, &wb->state))
2008 return 0;
2009
2010 nr_pages = get_nr_dirty_pages();
2011 if (nr_pages) {
2012 struct wb_writeback_work work = {
2013 .nr_pages = wb_split_bdi_pages(wb, nr_pages),
2014 .sync_mode = WB_SYNC_NONE,
2015 .range_cyclic = 1,
2016 .reason = wb->start_all_reason,
2017 };
2018
2019 nr_pages = wb_writeback(wb, &work);
2020 }
2021
2022 clear_bit(WB_start_all, &wb->state);
2023 return nr_pages;
2024}
2025
2026
2027
2028
2029
2030static long wb_do_writeback(struct bdi_writeback *wb)
2031{
2032 struct wb_writeback_work *work;
2033 long wrote = 0;
2034
2035 set_bit(WB_writeback_running, &wb->state);
2036 while ((work = get_next_work_item(wb)) != NULL) {
2037 trace_writeback_exec(wb, work);
2038 wrote += wb_writeback(wb, work);
2039 finish_writeback_work(wb, work);
2040 }
2041
2042
2043
2044
2045 wrote += wb_check_start_all(wb);
2046
2047
2048
2049
2050 wrote += wb_check_old_data_flush(wb);
2051 wrote += wb_check_background_flush(wb);
2052 clear_bit(WB_writeback_running, &wb->state);
2053
2054 return wrote;
2055}
2056
/*
 * Handle writeback of dirty data for the device backed by this bdi.  Also
 * reschedules periodically and does kupdated style flushing.
 */
2061void wb_workfn(struct work_struct *work)
2062{
2063 struct bdi_writeback *wb = container_of(to_delayed_work(work),
2064 struct bdi_writeback, dwork);
2065 long pages_written;
2066
2067 set_worker_desc("flush-%s", bdi_dev_name(wb->bdi));
2068 current->flags |= PF_SWAPWRITE;
2069
2070 if (likely(!current_is_workqueue_rescuer() ||
2071 !test_bit(WB_registered, &wb->state))) {
2072
2073
2074
2075
2076
2077
2078 do {
2079 pages_written = wb_do_writeback(wb);
2080 trace_writeback_pages_written(pages_written);
2081 } while (!list_empty(&wb->work_list));
2082 } else {
2083
2084
2085
2086
2087
2088 pages_written = writeback_inodes_wb(wb, 1024,
2089 WB_REASON_FORKER_THREAD);
2090 trace_writeback_pages_written(pages_written);
2091 }
2092
2093 if (!list_empty(&wb->work_list))
2094 wb_wakeup(wb);
2095 else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
2096 wb_wakeup_delayed(wb);
2097
2098 current->flags &= ~PF_SWAPWRITE;
2099}
2100
2101
2102
2103
2104
2105static void __wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
2106 enum wb_reason reason)
2107{
2108 struct bdi_writeback *wb;
2109
2110 if (!bdi_has_dirty_io(bdi))
2111 return;
2112
2113 list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
2114 wb_start_writeback(wb, reason);
2115}
2116
2117void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
2118 enum wb_reason reason)
2119{
2120 rcu_read_lock();
2121 __wakeup_flusher_threads_bdi(bdi, reason);
2122 rcu_read_unlock();
2123}
2124
2125
2126
2127
2128void wakeup_flusher_threads(enum wb_reason reason)
2129{
2130 struct backing_dev_info *bdi;
2131
2132
2133
2134
2135 if (blk_needs_flush_plug(current))
2136 blk_schedule_flush_plug(current);
2137
2138 rcu_read_lock();
2139 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
2140 __wakeup_flusher_threads_bdi(bdi, reason);
2141 rcu_read_unlock();
2142}
2143
/*
 * Wake up bdi's periodically to make sure dirtytime inodes gets
 * written back periodically.  We deliberately do *not* check the
 * b_dirtytime list in wb_has_dirty_io(), since this would cause the
 * kernel to be constantly waking up once there are any dirtytime
 * inodes on the system.  So instead we define a separate delayed work
 * function which gets called much more rarely.  (By default, only
 * once every 12 hours.)
 *
 * If there is any other write activity going on in the file system,
 * this function won't be called at all as we maintain the flusher
 * threads only when there is dirty data to be written out.
 */
2159static void wakeup_dirtytime_writeback(struct work_struct *w);
2160static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback);
2161
2162static void wakeup_dirtytime_writeback(struct work_struct *w)
2163{
2164 struct backing_dev_info *bdi;
2165
2166 rcu_read_lock();
2167 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
2168 struct bdi_writeback *wb;
2169
2170 list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
2171 if (!list_empty(&wb->b_dirty_time))
2172 wb_wakeup(wb);
2173 }
2174 rcu_read_unlock();
2175 schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
2176}
2177
2178static int __init start_dirtytime_writeback(void)
2179{
2180 schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
2181 return 0;
2182}
2183__initcall(start_dirtytime_writeback);
2184
2185int dirtytime_interval_handler(struct ctl_table *table, int write,
2186 void __user *buffer, size_t *lenp, loff_t *ppos)
2187{
2188 int ret;
2189
2190 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
2191 if (ret == 0 && write)
2192 mod_delayed_work(system_wq, &dirtytime_work, 0);
2193 return ret;
2194}
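
/*
 * Note: when dirtytime_expire_interval is changed through the sysctl, the
 * delayed work is kicked immediately (delay 0) so a shorter interval takes
 * effect right away instead of after the previously scheduled delay.
 */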
2195
2196static noinline void block_dump___mark_inode_dirty(struct inode *inode)
2197{
2198 if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
2199 struct dentry *dentry;
2200 const char *name = "?";
2201
2202 dentry = d_find_alias(inode);
2203 if (dentry) {
2204 spin_lock(&dentry->d_lock);
2205 name = (const char *) dentry->d_name.name;
2206 }
2207 printk(KERN_DEBUG
2208 "%s(%d): dirtied inode %lu (%s) on %s\n",
2209 current->comm, task_pid_nr(current), inode->i_ino,
2210 name, inode->i_sb->s_id);
2211 if (dentry) {
2212 spin_unlock(&dentry->d_lock);
2213 dput(dentry);
2214 }
2215 }
2216}
2217
/**
 * __mark_inode_dirty -	internal function to mark an inode dirty
 *
 * @inode: inode to mark
 * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
 *
 * Mark an inode as dirty.  Callers should use mark_inode_dirty() or
 * mark_inode_dirty_sync().
 *
 * Put the inode on its wb's dirty list.
 *
 * CAREFUL! We mark it dirty unconditionally, but move it onto the
 * dirty list only if it is hashed or if it refers to a blockdev.
 * If it was not hashed, it will never be added to the dirty list
 * even if it is later hashed, as it will have been marked dirty already.
 *
 * In short, ensure you hash any inodes _before_ you start marking
 * them dirty.
 *
 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
 * the block-special inode (/dev/hda1) itself.  And the ->dirtied_when field of
 * the kernel-internal blockdev inode represents the dirtying time of the
 * blockdev's pages.  This is why for I_DIRTY_PAGES we always use
 * page->mapping->host, so the page-dirtying time is recorded in the internal
 * blockdev inode.
 */
2244void __mark_inode_dirty(struct inode *inode, int flags)
2245{
2246 struct super_block *sb = inode->i_sb;
2247 int dirtytime;
2248
2249 trace_writeback_mark_inode_dirty(inode, flags);
2250
2251
2252
2253
2254
2255 if (flags & (I_DIRTY_INODE | I_DIRTY_TIME)) {
2256 trace_writeback_dirty_inode_start(inode, flags);
2257
2258 if (sb->s_op->dirty_inode)
2259 sb->s_op->dirty_inode(inode, flags);
2260
2261 trace_writeback_dirty_inode(inode, flags);
2262 }
2263 if (flags & I_DIRTY_INODE)
2264 flags &= ~I_DIRTY_TIME;
2265 dirtytime = flags & I_DIRTY_TIME;
2266
2267
2268
2269
2270
2271 smp_mb();
2272
2273 if (((inode->i_state & flags) == flags) ||
2274 (dirtytime && (inode->i_state & I_DIRTY_INODE)))
2275 return;
2276
2277 if (unlikely(block_dump))
2278 block_dump___mark_inode_dirty(inode);
2279
2280 spin_lock(&inode->i_lock);
2281 if (dirtytime && (inode->i_state & I_DIRTY_INODE))
2282 goto out_unlock_inode;
2283 if ((inode->i_state & flags) != flags) {
2284 const int was_dirty = inode->i_state & I_DIRTY;
2285
2286 inode_attach_wb(inode, NULL);
2287
2288 if (flags & I_DIRTY_INODE)
2289 inode->i_state &= ~I_DIRTY_TIME;
2290 inode->i_state |= flags;
2291
2292
2293
2294
2295
2296
2297
2298 if (inode->i_state & I_SYNC_QUEUED)
2299 goto out_unlock_inode;
2300
2301
2302
2303
2304
2305 if (!S_ISBLK(inode->i_mode)) {
2306 if (inode_unhashed(inode))
2307 goto out_unlock_inode;
2308 }
2309 if (inode->i_state & I_FREEING)
2310 goto out_unlock_inode;
2311
2312
2313
2314
2315
2316 if (!was_dirty) {
2317 struct bdi_writeback *wb;
2318 struct list_head *dirty_list;
2319 bool wakeup_bdi = false;
2320
2321 wb = locked_inode_to_wb_and_lock_list(inode);
2322
2323 WARN((wb->bdi->capabilities & BDI_CAP_WRITEBACK) &&
2324 !test_bit(WB_registered, &wb->state),
2325 "bdi-%s not registered\n", bdi_dev_name(wb->bdi));
2326
2327 inode->dirtied_when = jiffies;
2328 if (dirtytime)
2329 inode->dirtied_time_when = jiffies;
2330
2331 if (inode->i_state & I_DIRTY)
2332 dirty_list = &wb->b_dirty;
2333 else
2334 dirty_list = &wb->b_dirty_time;
2335
2336 wakeup_bdi = inode_io_list_move_locked(inode, wb,
2337 dirty_list);
2338
2339 spin_unlock(&wb->list_lock);
2340 trace_writeback_dirty_inode_enqueue(inode);
2341
2342
2343
2344
2345
2346
2347
2348 if (wakeup_bdi &&
2349 (wb->bdi->capabilities & BDI_CAP_WRITEBACK))
2350 wb_wakeup_delayed(wb);
2351 return;
2352 }
2353 }
2354out_unlock_inode:
2355 spin_unlock(&inode->i_lock);
2356}
2357EXPORT_SYMBOL(__mark_inode_dirty);
2358
/*
 * The @s_sync_lock is used to serialise concurrent sync operations
 * to avoid lock contention problems with concurrent wait_sb_inodes() calls.
 * Concurrent callers will block on the s_sync_lock rather than doing contending
 * walks.  The queueing maintains sync(2) required behaviour as all the IO that
 * has been issued up to the time this function is entered is guaranteed to be
 * completed by the time we have gained the lock and waited for all IO that is
 * in progress, regardless of the order callers are granted the lock.
 */
2368static void wait_sb_inodes(struct super_block *sb)
2369{
2370 LIST_HEAD(sync_list);
2371
2372
2373
2374
2375
2376 WARN_ON(!rwsem_is_locked(&sb->s_umount));
2377
2378 mutex_lock(&sb->s_sync_lock);
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389 rcu_read_lock();
2390 spin_lock_irq(&sb->s_inode_wblist_lock);
2391 list_splice_init(&sb->s_inodes_wb, &sync_list);
2392
2393
2394
2395
2396
2397
2398
2399
2400 while (!list_empty(&sync_list)) {
2401 struct inode *inode = list_first_entry(&sync_list, struct inode,
2402 i_wb_list);
2403 struct address_space *mapping = inode->i_mapping;
2404
2405
2406
2407
2408
2409
2410
2411 list_move_tail(&inode->i_wb_list, &sb->s_inodes_wb);
2412
2413
2414
2415
2416
2417
2418 if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
2419 continue;
2420
2421 spin_unlock_irq(&sb->s_inode_wblist_lock);
2422
2423 spin_lock(&inode->i_lock);
2424 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
2425 spin_unlock(&inode->i_lock);
2426
2427 spin_lock_irq(&sb->s_inode_wblist_lock);
2428 continue;
2429 }
2430 __iget(inode);
2431 spin_unlock(&inode->i_lock);
2432 rcu_read_unlock();
2433
2434
2435
2436
2437
2438
2439 filemap_fdatawait_keep_errors(mapping);
2440
2441 cond_resched();
2442
2443 iput(inode);
2444
2445 rcu_read_lock();
2446 spin_lock_irq(&sb->s_inode_wblist_lock);
2447 }
2448 spin_unlock_irq(&sb->s_inode_wblist_lock);
2449 rcu_read_unlock();
2450 mutex_unlock(&sb->s_sync_lock);
2451}
2452
2453static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
2454 enum wb_reason reason, bool skip_if_busy)
2455{
2456 struct backing_dev_info *bdi = sb->s_bdi;
2457 DEFINE_WB_COMPLETION(done, bdi);
2458 struct wb_writeback_work work = {
2459 .sb = sb,
2460 .sync_mode = WB_SYNC_NONE,
2461 .tagged_writepages = 1,
2462 .done = &done,
2463 .nr_pages = nr,
2464 .reason = reason,
2465 };
2466
2467 if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
2468 return;
2469 WARN_ON(!rwsem_is_locked(&sb->s_umount));
2470
2471 bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy);
2472 wb_wait_for_completion(&done);
2473}
2474
/**
 * writeback_inodes_sb_nr -	writeback dirty inodes from given super_block
 * @sb: the superblock
 * @nr: the number of pages to write
 * @reason: reason why some writeback work was initiated
 *
 * Start writeback on some inodes on this super_block.  No guarantees are made
 * on how many (if any) will be written, and this function does not wait
 * for IO completion of submitted IO.
 */
2485void writeback_inodes_sb_nr(struct super_block *sb,
2486 unsigned long nr,
2487 enum wb_reason reason)
2488{
2489 __writeback_inodes_sb_nr(sb, nr, reason, false);
2490}
2491EXPORT_SYMBOL(writeback_inodes_sb_nr);
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
2503{
2504 return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
2505}
2506EXPORT_SYMBOL(writeback_inodes_sb);
2507
2508
2509
2510
2511
2512
2513
2514
2515void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
2516{
2517 if (!down_read_trylock(&sb->s_umount))
2518 return;
2519
2520 __writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason, true);
2521 up_read(&sb->s_umount);
2522}
2523EXPORT_SYMBOL(try_to_writeback_inodes_sb);
2524
/**
 * sync_inodes_sb	-	sync sb inode pages
 * @sb: the superblock
 *
 * This function writes and waits on any dirty inode belonging to this
 * super_block.
 */
2532void sync_inodes_sb(struct super_block *sb)
2533{
2534 struct backing_dev_info *bdi = sb->s_bdi;
2535 DEFINE_WB_COMPLETION(done, bdi);
2536 struct wb_writeback_work work = {
2537 .sb = sb,
2538 .sync_mode = WB_SYNC_ALL,
2539 .nr_pages = LONG_MAX,
2540 .range_cyclic = 0,
2541 .done = &done,
2542 .reason = WB_REASON_SYNC,
2543 .for_sync = 1,
2544 };
2545
2546
2547
2548
2549
2550
2551 if (bdi == &noop_backing_dev_info)
2552 return;
2553 WARN_ON(!rwsem_is_locked(&sb->s_umount));
2554
2555
2556 bdi_down_write_wb_switch_rwsem(bdi);
2557 bdi_split_work_to_wbs(bdi, &work, false);
2558 wb_wait_for_completion(&done);
2559 bdi_up_write_wb_switch_rwsem(bdi);
2560
2561 wait_sb_inodes(sb);
2562}
2563EXPORT_SYMBOL(sync_inodes_sb);
2564
/**
 * write_inode_now	-	write an inode to disk
 * @inode: inode to write to disk
 * @sync: whether the write should be synchronous or not
 *
 * This function commits an inode to disk immediately if it is dirty.  This is
 * primarily needed by knfsd.
 *
 * The caller must either have a ref on the inode or must have set I_WILL_FREE.
 */
2575int write_inode_now(struct inode *inode, int sync)
2576{
2577 struct writeback_control wbc = {
2578 .nr_to_write = LONG_MAX,
2579 .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
2580 .range_start = 0,
2581 .range_end = LLONG_MAX,
2582 };
2583
2584 if (!mapping_can_writeback(inode->i_mapping))
2585 wbc.nr_to_write = 0;
2586
2587 might_sleep();
2588 return writeback_single_inode(inode, &wbc);
2589}
2590EXPORT_SYMBOL(write_inode_now);
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603int sync_inode(struct inode *inode, struct writeback_control *wbc)
2604{
2605 return writeback_single_inode(inode, wbc);
2606}
2607EXPORT_SYMBOL(sync_inode);
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618int sync_inode_metadata(struct inode *inode, int wait)
2619{
2620 struct writeback_control wbc = {
2621 .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
2622 .nr_to_write = 0,
2623 };
2624
2625 return sync_inode(inode, &wbc);
2626}
2627EXPORT_SYMBOL(sync_inode_metadata);
2628