/*
 * mm/page-writeback.c
 *
 * Contains functions related to writing back dirty pages at the
 * address_space level.
 */
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/init.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/blkdev.h>
#include <linux/mpage.h>
#include <linux/rmap.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/smp.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/syscalls.h>
#include <linux/buffer_head.h>
#include <linux/pagevec.h>
#include <trace/events/writeback.h>
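/*
 * Sleep at most 200ms at a time in balance_dirty_pages().
 */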
#define MAX_PAUSE		max(HZ/5, 1)

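/*
 * Estimate write bandwidth at 200ms intervals.
 */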
#define BANDWIDTH_INTERVAL	max(HZ/5, 1)

#define RATELIMIT_CALC_SHIFT	10

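/*
 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited_nr()
 * will look to see if it needs to force writeback or throttling.
 */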
static long ratelimit_pages = 32;

/* The following parameters are exported via /proc/sys/vm */

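/*
 * Start background writeback (via writeback threads) at this percentage of
 * dirtyable memory.
 */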
int dirty_background_ratio = 10;

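/*
 * dirty_background_bytes starts at 0 (disabled) so that it is a function of
 * dirty_background_ratio * the amount of dirtyable memory.
 */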
unsigned long dirty_background_bytes;

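/*
 * free highmem will not be subtracted from the total free memory
 * for calculating free ratios if vm_highmem_is_dirtyable is true.
 */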
int vm_highmem_is_dirtyable;

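/*
 * The generator of dirty data starts writeback at this percentage of
 * dirtyable memory.
 */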
int vm_dirty_ratio = 20;

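/*
 * vm_dirty_bytes starts at 0 (disabled) so that it is a function of
 * vm_dirty_ratio * the amount of dirtyable memory.
 */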
unsigned long vm_dirty_bytes;

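/*
 * The interval between `kupdate'-style writebacks, in centiseconds.
 */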
unsigned int dirty_writeback_interval = 5 * 100;

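/*
 * The longest time for which data is allowed to remain dirty, in centiseconds.
 */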
unsigned int dirty_expire_interval = 30 * 100;

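/*
 * Flag that makes the machine dump writes/reads and block dirtyings.
 */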
int block_dump;

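/*
 * Flag that puts the machine in "laptop mode". Doubles as a timeout in
 * jiffies: a full sync is triggered after this time elapses without any
 * disk activity.
 */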
int laptop_mode;

EXPORT_SYMBOL(laptop_mode);

/* End of sysctl-exported parameters */

unsigned long global_dirty_limit;

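/*
 * Scale the writeback cache size proportional to the relative writeout speeds.
 *
 * We do this by keeping a floating proportion between BDIs, based on page
 * writeback completions [end_page_writeback()]. Those devices that write out
 * pages fastest will get the larger share, while the slower will get a
 * smaller share.
 *
 * We use page writeout completions because we are interested in getting rid
 * of dirty pages. Having them written out is the primary goal.
 *
 * We introduce a concept of time, a period over which we measure these
 * events, because demand can/will vary over time. The length of this period
 * itself is measured in page writeback completions.
 */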
static struct prop_descriptor vm_completions;

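/*
 * Couple the period to the dirty_ratio:
 *
 *   period/2 ~ roundup_pow_of_two(dirty limit)
 */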
static int calc_period_shift(void)
{
	unsigned long dirty_total;

	if (vm_dirty_bytes)
		dirty_total = vm_dirty_bytes / PAGE_SIZE;
	else
		dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
				100;
	return 2 + ilog2(dirty_total - 1);
}

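/*
 * Update the period when the dirty threshold changes.
 */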
static void update_completion_period(void)
{
	int shift = calc_period_shift();
	prop_change_shift(&vm_completions, shift);

	writeback_set_ratelimit();
}

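/*
 * The {ratio,bytes} sysctl pairs below are mutually exclusive: writing one
 * member of a pair zeroes the other, so only the most recently set knob is
 * in effect.
 */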
int dirty_background_ratio_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int ret;

	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	if (ret == 0 && write)
		dirty_background_bytes = 0;
	return ret;
}

int dirty_background_bytes_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int ret;

	ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
	if (ret == 0 && write)
		dirty_background_ratio = 0;
	return ret;
}

int dirty_ratio_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int old_ratio = vm_dirty_ratio;
	int ret;

	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
		update_completion_period();
		vm_dirty_bytes = 0;
	}
	return ret;
}

int dirty_bytes_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	unsigned long old_bytes = vm_dirty_bytes;
	int ret;

	ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
	if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
		update_completion_period();
		vm_dirty_ratio = 0;
	}
	return ret;
}

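/*
 * Increment the BDI's writeout completion count and the global writeout
 * completion count. Called from test_clear_page_writeback().
 */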
static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
{
	__inc_bdi_stat(bdi, BDI_WRITTEN);
	__prop_inc_percpu_max(&vm_completions, &bdi->completions,
			      bdi->max_prop_frac);
}

void bdi_writeout_inc(struct backing_dev_info *bdi)
{
	unsigned long flags;

	local_irq_save(flags);
	__bdi_writeout_inc(bdi);
	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(bdi_writeout_inc);

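/*
 * Obtain an accurate fraction of the BDI's portion.
 */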
static void bdi_writeout_fraction(struct backing_dev_info *bdi,
		long *numerator, long *denominator)
{
	prop_fraction_percpu(&vm_completions, &bdi->completions,
				numerator, denominator);
}

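/*
 * bdi_min_ratio keeps the sum of the minimum dirty shares of all
 * registered backing devices, which, for obvious reasons, can not
 * exceed 100%.
 */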
static unsigned int bdi_min_ratio;

int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
{
	int ret = 0;

	spin_lock_bh(&bdi_lock);
	if (min_ratio > bdi->max_ratio) {
		ret = -EINVAL;
	} else {
		min_ratio -= bdi->min_ratio;
		if (bdi_min_ratio + min_ratio < 100) {
			bdi_min_ratio += min_ratio;
			bdi->min_ratio += min_ratio;
		} else {
			ret = -EINVAL;
		}
	}
	spin_unlock_bh(&bdi_lock);

	return ret;
}

int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
{
	int ret = 0;

	if (max_ratio > 100)
		return -EINVAL;

	spin_lock_bh(&bdi_lock);
	if (bdi->min_ratio > max_ratio) {
		ret = -EINVAL;
	} else {
		bdi->max_ratio = max_ratio;
		bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
	}
	spin_unlock_bh(&bdi_lock);

	return ret;
}
EXPORT_SYMBOL(bdi_set_max_ratio);

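/*
 * Work out the amount of dirtyable highmem, so that it can be excluded from
 * the dirty thresholds when vm_highmem_is_dirtyable is not set.
 */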
static unsigned long highmem_dirtyable_memory(unsigned long total)
{
#ifdef CONFIG_HIGHMEM
	int node;
	unsigned long x = 0;

	for_each_node_state(node, N_HIGH_MEMORY) {
		struct zone *z =
			&NODE_DATA(node)->node_zones[ZONE_HIGHMEM];

		x += zone_page_state(z, NR_FREE_PAGES) +
		     zone_reclaimable_pages(z);
	}
	/*
	 * Make sure that the number of highmem pages is never larger
	 * than the number of the total dirtyable memory. This can only
	 * occur in very strange VM situations but we want to make sure
	 * that this does not occur.
	 */
	return min(x, total);
#else
	return 0;
#endif
}

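/**
 * determine_dirtyable_memory - amount of memory that may be used
 *
 * Returns the number of pages that can currently be freed and used
 * by the kernel for direct mappings.
 */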
unsigned long determine_dirtyable_memory(void)
{
	unsigned long x;

	x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();

	if (!vm_highmem_is_dirtyable)
		x -= highmem_dirtyable_memory(x);

	return x + 1;	/* Ensure that we never return 0 */
}

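/*
 * The freerun ceiling sits halfway between the background and hard dirty
 * thresholds: below it, tasks may dirty pages without being throttled.
 */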
static unsigned long dirty_freerun_ceiling(unsigned long thresh,
					   unsigned long bg_thresh)
{
	return (thresh + bg_thresh) / 2;
}

static unsigned long hard_dirty_limit(unsigned long thresh)
{
	return max(thresh, global_dirty_limit);
}
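/**
 * global_dirty_limits - background-writeback and dirty-throttling thresholds
 *
 * Calculate the dirty thresholds based on sysctl parameters
 * - vm.dirty_background_ratio  or  vm.dirty_background_bytes
 * - vm.dirty_ratio             or  vm.dirty_bytes
 * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
 * real-time tasks.
 */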
void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
{
	unsigned long background;
	unsigned long dirty;
	unsigned long uninitialized_var(available_memory);
	struct task_struct *tsk;

	if (!vm_dirty_bytes || !dirty_background_bytes)
		available_memory = determine_dirtyable_memory();

	if (vm_dirty_bytes)
		dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
	else
		dirty = (vm_dirty_ratio * available_memory) / 100;

	if (dirty_background_bytes)
		background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
	else
		background = (dirty_background_ratio * available_memory) / 100;

	if (background >= dirty)
		background = dirty / 2;
	tsk = current;
	if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
		background += background / 4;
		dirty += dirty / 4;
	}
	*pbackground = background;
	*pdirty = dirty;
	trace_global_dirty_state(background, dirty);
}
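/**
 * bdi_dirty_limit - @bdi's share of dirty throttling threshold
 * @bdi: the backing_dev_info to query
 * @dirty: global dirty limit in pages
 *
 * Returns @bdi's dirty limit in pages. The term "dirty" in the context of
 * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
 *
 * Note that balance_dirty_pages() will only seriously take it as a hard limit
 * when sleeping max_pause per page is not enough to keep the dirty pages
 * under control. For example, when the device is completely stalled due to
 * some error conditions, or when there are 1000 dd tasks writing to a slow
 * 10MB/s USB key. In the other normal situations, it acts more gently by
 * throttling the tasks more (rather than killing them).
 */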
unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
{
	u64 bdi_dirty;
	long numerator, denominator;

	/*
	 * Calculate this BDI's share of the dirty ratio.
	 */
	bdi_writeout_fraction(bdi, &numerator, &denominator);

	bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
	bdi_dirty *= numerator;
	do_div(bdi_dirty, denominator);

	bdi_dirty += (dirty * bdi->min_ratio) / 100;
	if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
		bdi_dirty = dirty * bdi->max_ratio / 100;

	return bdi_dirty;
}
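/*
 * Dirty position control.
 *
 * We want the dirty pages to be balanced around the global/bdi setpoints.
 * When the number of dirty pages is higher/lower than the setpoint, the
 * dirty position control ratio (and hence task dirty ratelimit) will be
 * decreased/increased to bring the dirty pages back to the setpoint:
 *
 *     pos_ratio = 1 << RATELIMIT_CALC_SHIFT
 *
 *     if (dirty < setpoint) scale up   pos_ratio
 *     if (dirty > setpoint) scale down pos_ratio
 *
 *     if (bdi_dirty < bdi_setpoint) scale up   pos_ratio
 *     if (bdi_dirty > bdi_setpoint) scale down pos_ratio
 *
 *     task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT
 *
 * The global control line is a third order polynomial in @dirty that equals
 * 2.0 at the freerun ceiling, 1.0 at the setpoint and 0 at the hard dirty
 * limit. The bdi control line is a linear function of @bdi_dirty that equals
 * 1.0 at the bdi setpoint and falls off over a span that scales with the
 * write bandwidth (and with bdi_thresh in JBOD setups).
 */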
static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
					unsigned long thresh,
					unsigned long bg_thresh,
					unsigned long dirty,
					unsigned long bdi_thresh,
					unsigned long bdi_dirty)
{
	unsigned long write_bw = bdi->avg_write_bandwidth;
	unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
	unsigned long limit = hard_dirty_limit(thresh);
	unsigned long x_intercept;
	unsigned long setpoint;
	unsigned long bdi_setpoint;
	unsigned long span;
	long long pos_ratio;	/* for scaling up/down the rate limit */
	long x;

	if (unlikely(dirty >= limit))
		return 0;

	/*
	 * global setpoint
	 *
	 *                           setpoint - dirty 3
	 *        f(dirty) := 1.0 + (----------------)
	 *                           limit - setpoint
	 *
	 * it's a 3rd order polynomial that subjects to
	 *
	 * (1) f(freerun)  = 2.0 => rampup dirty_ratelimit reasonably fast
	 * (2) f(setpoint) = 1.0 => the vehicle for locating the setpoint
	 * (3) f(limit)    = 0   => the hard limit
	 */
	setpoint = (freerun + limit) / 2;
	x = div_s64((setpoint - dirty) << RATELIMIT_CALC_SHIFT,
		    limit - setpoint + 1);
	pos_ratio = x;
	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
	pos_ratio += 1 << RATELIMIT_CALC_SHIFT;
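	/*
	 * We have computed the basic pos_ratio above based on the global
	 * situation. If the bdi is over/under its share of dirty pages, we
	 * want to scale pos_ratio further down/up. That is done by the
	 * following mechanism:
	 *
	 *        f(bdi_dirty) := 1.0 + k * (bdi_dirty - bdi_setpoint)
	 *
	 *                        x_intercept - bdi_dirty
	 *                     := --------------------------
	 *                        x_intercept - bdi_setpoint
	 *
	 * The main bdi control line is a linear function that subjects to
	 *
	 * (1) f(bdi_setpoint) = 1.0
	 * (2) k = - 1 / (8 * write_bw)  (in single bdi case)
	 *     or equally: x_intercept = bdi_setpoint + 8 * write_bw
	 *
	 * For the single bdi case, the dirty pages are observed to fluctuate
	 * regularly within range
	 *        [bdi_setpoint - write_bw/2, bdi_setpoint + write_bw/2]
	 * for various filesystems, where (2) can yield a reasonable 12.5%
	 * fluctuation range for pos_ratio.
	 *
	 * For the JBOD case, bdi_thresh (not bdi_dirty!) could fluctuate up
	 * to its own size, so move the slope over accordingly and choose a
	 * slope that yields 100% pos_ratio fluctuation on suddenly doubled
	 * bdi_thresh.
	 */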
	if (unlikely(bdi_thresh > thresh))
		bdi_thresh = thresh;
	/*
	 * It's very possible that bdi_thresh is close to 0 not because the
	 * device is slow, but that it has remained inactive for long time.
	 * Honour such devices a reasonable good (hopefully IO efficient)
	 * threshold, so that the occasional writes won't be blocked and
	 * active writes can rampup the threshold quickly.
	 */
	bdi_thresh = max(bdi_thresh, (limit - dirty) / 8);
	/*
	 * scale global setpoint to bdi's:
	 *	bdi_setpoint = setpoint * bdi_thresh / thresh
	 */
	x = div_u64((u64)bdi_thresh << 16, thresh + 1);
	bdi_setpoint = setpoint * (u64)x >> 16;
	/*
	 * Use span=(8*write_bw) in single bdi case as indicated by
	 * (thresh - bdi_thresh ~= 0) and transit to bdi_thresh in JBOD case.
	 *
	 *        bdi_thresh                    thresh - bdi_thresh
	 * span = ---------- * (8 * write_bw) + ------------------- * bdi_thresh
	 *          thresh                            thresh
	 */
	span = (thresh - bdi_thresh + 8 * write_bw) * (u64)x >> 16;
	x_intercept = bdi_setpoint + span;

	if (bdi_dirty < x_intercept - span / 4) {
		pos_ratio = div_u64(pos_ratio * (x_intercept - bdi_dirty),
				    x_intercept - bdi_setpoint + 1);
	} else
		pos_ratio /= 4;

	/*
	 * bdi reserve area, safeguard against dirty pool underrun and disk
	 * idle. It may push the desired control point of global dirty pages
	 * higher than setpoint.
	 */
	x_intercept = bdi_thresh / 2;
	if (bdi_dirty < x_intercept) {
		if (bdi_dirty > x_intercept / 8)
			pos_ratio = div_u64(pos_ratio * x_intercept, bdi_dirty);
		else
			pos_ratio *= 8;
	}

	return pos_ratio;
}

static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
				       unsigned long elapsed,
				       unsigned long written)
{
	const unsigned long period = roundup_pow_of_two(3 * HZ);
	unsigned long avg = bdi->avg_write_bandwidth;
	unsigned long old = bdi->write_bandwidth;
	u64 bw;

	/*
	 * bw = written * HZ / elapsed
	 *
	 *                   bw * elapsed + write_bandwidth * (period - elapsed)
	 * write_bandwidth = ---------------------------------------------------
	 *                                          period
	 */
	bw = written - bdi->written_stamp;
	bw *= HZ;
	if (unlikely(elapsed > period)) {
		do_div(bw, elapsed);
		avg = bw;
		goto out;
	}
	bw += (u64)bdi->write_bandwidth * (period - elapsed);
	bw >>= ilog2(period);

	/*
	 * one more level of smoothing, for filtering out sudden spikes
	 */
	if (avg > old && old >= (unsigned long)bw)
		avg -= (avg - old) >> 3;

	if (avg < old && old <= (unsigned long)bw)
		avg += (old - avg) >> 3;

out:
	bdi->write_bandwidth = bw;
	bdi->avg_write_bandwidth = avg;
}
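/*
 * The global dirtyable memory and dirty threshold could be suddenly knocked
 * down by a large amount (eg. on the startup of KVM in a swapless system).
 * This may throw the system into deep dirty exceeded state and throttle
 * heavy/light dirtiers alike. To retain good responsiveness, maintain
 * global_dirty_limit for tracking slowly down to the knocked down dirty
 * threshold.
 */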
static void update_dirty_limit(unsigned long thresh, unsigned long dirty)
{
	unsigned long limit = global_dirty_limit;

	/*
	 * Follow up in one step.
	 */
	if (limit < thresh) {
		limit = thresh;
		goto update;
	}

	/*
	 * Follow down slowly. Use the higher one as the target, because
	 * thresh may drop below dirty. This is exactly the reason to
	 * introduce global_dirty_limit which is guaranteed to lie above the
	 * dirty pages.
	 */
	thresh = max(thresh, dirty);
	if (limit > thresh) {
		limit -= (limit - thresh) >> 5;
		goto update;
	}
	return;
update:
	global_dirty_limit = limit;
}

static void global_update_bandwidth(unsigned long thresh,
				    unsigned long dirty,
				    unsigned long now)
{
	static DEFINE_SPINLOCK(dirty_lock);
	static unsigned long update_time;

	/*
	 * check locklessly first to optimize away locking for the most time
	 */
	if (time_before(now, update_time + BANDWIDTH_INTERVAL))
		return;

	spin_lock(&dirty_lock);
	if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) {
		update_dirty_limit(thresh, dirty);
		update_time = now;
	}
	spin_unlock(&dirty_lock);
}
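/*
 * Maintain bdi->dirty_ratelimit, the base dirty throttle rate.
 *
 * Normal bdi tasks will be curbed at or below it in long term.
 * Obviously it should be around (write_bw / N) when there are N dd tasks.
 */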
static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
				       unsigned long thresh,
				       unsigned long bg_thresh,
				       unsigned long dirty,
				       unsigned long bdi_thresh,
				       unsigned long bdi_dirty,
				       unsigned long dirtied,
				       unsigned long elapsed)
{
	unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
	unsigned long limit = hard_dirty_limit(thresh);
	unsigned long setpoint = (freerun + limit) / 2;
	unsigned long write_bw = bdi->avg_write_bandwidth;
	unsigned long dirty_ratelimit = bdi->dirty_ratelimit;
	unsigned long dirty_rate;
	unsigned long task_ratelimit;
	unsigned long balanced_dirty_ratelimit;
	unsigned long pos_ratio;
	unsigned long step;
	unsigned long x;

	/*
	 * The dirty rate will match the writeout rate in long term, except
	 * when dirty pages are truncated by userspace or re-dirtied by FS.
	 */
	dirty_rate = (dirtied - bdi->dirtied_stamp) * HZ / elapsed;

	pos_ratio = bdi_position_ratio(bdi, thresh, bg_thresh, dirty,
				       bdi_thresh, bdi_dirty);
	/*
	 * task_ratelimit reflects each dd's dirty rate for the past 200ms.
	 */
	task_ratelimit = (u64)dirty_ratelimit *
					pos_ratio >> RATELIMIT_CALC_SHIFT;
	task_ratelimit++; /* it helps rampup dirty_ratelimit from tiny values */

	/*
	 * A linear estimation of the "balanced" throttle rate. The theory is,
	 * if there are N dd tasks, each throttled at task_ratelimit, the
	 * bdi's dirty_rate will be measured to be (N * task_ratelimit). So
	 * the below formula yields the balanced rate limit (write_bw / N).
	 *
	 * Feedback on dirty_rate alone, rate_(i+1) = rate_(i) * (write_bw /
	 * dirty_rate), is deliberately avoided: pos_ratio also takes part in
	 * balancing the dirty rate, so a pure rate feedback could settle at
	 * a state away from the setpoint and stay there.
	 */
	balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw,
					   dirty_rate | 1);
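	/*
	 * We could simply assign dirty_ratelimit = balanced_dirty_ratelimit,
	 * but balanced_dirty_ratelimit jumps around a lot because dirty_rate
	 * is estimated over a short 200ms window. So dirty_ratelimit is only
	 * stepped toward balanced_dirty_ratelimit when task_ratelimit (which
	 * reflects the dirty position error) points in the same direction;
	 * this filters out singular points and bounds the step size.
	 */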
	step = 0;
	if (dirty < setpoint) {
		x = min(bdi->balanced_dirty_ratelimit,
			 min(balanced_dirty_ratelimit, task_ratelimit));
		if (dirty_ratelimit < x)
			step = x - dirty_ratelimit;
	} else {
		x = max(bdi->balanced_dirty_ratelimit,
			 max(balanced_dirty_ratelimit, task_ratelimit));
		if (dirty_ratelimit > x)
			step = dirty_ratelimit - x;
	}

	/*
	 * Don't pursue 100% rate matching. It's impossible since the balanced
	 * rate itself is constantly fluctuating. So decrease the track speed
	 * when it gets close to the target. Helps eliminate pointless tremors.
	 */
	step >>= dirty_ratelimit / (2 * step + 1);
	/*
	 * Limit the tracking speed to avoid overshooting.
	 */
	step = (step + 7) / 8;

	if (dirty_ratelimit < balanced_dirty_ratelimit)
		dirty_ratelimit += step;
	else
		dirty_ratelimit -= step;

	bdi->dirty_ratelimit = max(dirty_ratelimit, 1UL);
	bdi->balanced_dirty_ratelimit = balanced_dirty_ratelimit;

	trace_bdi_dirty_ratelimit(bdi, dirty_rate, task_ratelimit);
}

void __bdi_update_bandwidth(struct backing_dev_info *bdi,
			    unsigned long thresh,
			    unsigned long bg_thresh,
			    unsigned long dirty,
			    unsigned long bdi_thresh,
			    unsigned long bdi_dirty,
			    unsigned long start_time)
{
	unsigned long now = jiffies;
	unsigned long elapsed = now - bdi->bw_time_stamp;
	unsigned long dirtied;
	unsigned long written;

	/*
	 * rate-limit, only update once every 200ms.
	 */
	if (elapsed < BANDWIDTH_INTERVAL)
		return;

	dirtied = percpu_counter_read(&bdi->bdi_stat[BDI_DIRTIED]);
	written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);

	/*
	 * Skip quiet periods when disk bandwidth is under-utilized.
	 * (at least 1s idle time between two flusher runs)
	 */
	if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
		goto snapshot;

	if (thresh) {
		global_update_bandwidth(thresh, dirty, now);
		bdi_update_dirty_ratelimit(bdi, thresh, bg_thresh, dirty,
					   bdi_thresh, bdi_dirty,
					   dirtied, elapsed);
	}
	bdi_update_write_bandwidth(bdi, elapsed, written);

snapshot:
	bdi->dirtied_stamp = dirtied;
	bdi->written_stamp = written;
	bdi->bw_time_stamp = now;
}

static void bdi_update_bandwidth(struct backing_dev_info *bdi,
				 unsigned long thresh,
				 unsigned long bg_thresh,
				 unsigned long dirty,
				 unsigned long bdi_thresh,
				 unsigned long bdi_dirty,
				 unsigned long start_time)
{
	if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL))
		return;
	spin_lock(&bdi->wb.list_lock);
	__bdi_update_bandwidth(bdi, thresh, bg_thresh, dirty,
			       bdi_thresh, bdi_dirty, start_time);
	spin_unlock(&bdi->wb.list_lock);
}
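/*
 * After a task dirtied this many pages, balance_dirty_pages_ratelimited_nr()
 * will look to see if it needs to start dirty throttling.
 *
 * If dirty_poll_interval is too low, big NUMA machines will call the
 * expensive global_page_state() too often. So scale it near-sqrt to the
 * safety margin (the number of pages we may dirty without exceeding the
 * dirty limits).
 */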
static unsigned long dirty_poll_interval(unsigned long dirty,
					 unsigned long thresh)
{
	if (thresh > dirty)
		return 1UL << (ilog2(thresh - dirty) >> 1);

	return 1;
}

static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
				   unsigned long bdi_dirty)
{
	unsigned long bw = bdi->avg_write_bandwidth;
	unsigned long hi = ilog2(bw);
	unsigned long lo = ilog2(bdi->dirty_ratelimit);
	unsigned long t;

	/* target for 20ms max pause on 1-dd case */
	t = HZ / 50;

	/*
	 * Scale up pause time for concurrent dirtiers in order to reduce CPU
	 * overheads.
	 *
	 * (N * 20ms) on 2^N concurrent tasks.
	 */
	if (hi > lo)
		t += (hi - lo) * (20 * HZ) / 1024;

	/*
	 * Limit pause time for small memory systems. If sleeping for too
	 * long here, the pool of dirty pages may drain completely and the
	 * disk go idle before we wake up again.
	 */
	t = min(t, bdi_dirty * HZ / (8 * bw + 1));

	/*
	 * The pause time will be settled within range (max_pause/4, max_pause).
	 * Apply a minimal value of 4 to get a non-zero max_pause/4.
	 */
	return clamp_val(t, 4, MAX_PAUSE);
}
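/*
 * balance_dirty_pages() must be called by processes which are generating dirty
 * data.  It looks at the number of dirty pages in the machine and will force
 * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.
 * If we're over `background_thresh' then the writeback threads are woken to
 * perform some writeout.
 */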
static void balance_dirty_pages(struct address_space *mapping,
				unsigned long pages_dirtied)
{
	unsigned long nr_reclaimable;	/* = file_dirty + unstable_nfs */
	unsigned long bdi_reclaimable;
	unsigned long nr_dirty;  /* = file_dirty + writeback + unstable_nfs */
	unsigned long bdi_dirty;
	unsigned long freerun;
	unsigned long background_thresh;
	unsigned long dirty_thresh;
	unsigned long bdi_thresh;
	long pause = 0;
	long uninitialized_var(max_pause);
	bool dirty_exceeded = false;
	unsigned long task_ratelimit;
	unsigned long uninitialized_var(dirty_ratelimit);
	unsigned long pos_ratio;
	struct backing_dev_info *bdi = mapping->backing_dev_info;
	unsigned long start_time = jiffies;

	for (;;) {
		/*
		 * Unstable writes are a feature of certain networked
		 * filesystems (i.e. NFS) in which data may have been
		 * written to the server's write cache, but has not yet
		 * been flushed to permanent storage.
		 */
		nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
					global_page_state(NR_UNSTABLE_NFS);
		nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);

		global_dirty_limits(&background_thresh, &dirty_thresh);

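		/*
		 * Throttle it only when the background writeback cannot
		 * catch-up. This avoids (excessively) small writeouts
		 * when the bdi limits are ramping up.
		 */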
		freerun = dirty_freerun_ceiling(dirty_thresh,
						background_thresh);
		if (nr_dirty <= freerun)
			break;

		if (unlikely(!writeback_in_progress(bdi)))
			bdi_start_background_writeback(bdi);

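		/*
		 * bdi_thresh is not treated as some limiting factor as
		 * dirty_thresh, due to reasons
		 * - in JBOD setup, bdi_thresh can fluctuate a lot
		 * - in a system with HDD and USB key, the USB key may somehow
		 *   go into state (bdi_dirty >> bdi_thresh) either because
		 *   bdi_dirty starts high, or because bdi_thresh drops low.
		 *   In this case we don't want to hard throttle the USB key
		 *   dirtiers for 100 seconds until bdi_dirty drops under
		 *   bdi_thresh. Instead the auxiliary bdi control line in
		 *   bdi_position_ratio() will let the dirtier task progress
		 *   at some rate <= (write_bw / 2) for bringing down bdi_dirty.
		 */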
		bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);

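		/*
		 * In order to avoid the stacked BDI deadlock we need
		 * to ensure we accurately count the 'dirty' pages when
		 * the threshold is low.
		 *
		 * Otherwise it would be possible to get thresh+n pages
		 * reported dirty, even though there are thresh-m pages
		 * actually dirty; with m+n sitting in the percpu
		 * deltas.
		 */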
		if (bdi_thresh < 2 * bdi_stat_error(bdi)) {
			bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
			bdi_dirty = bdi_reclaimable +
				    bdi_stat_sum(bdi, BDI_WRITEBACK);
		} else {
			bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
			bdi_dirty = bdi_reclaimable +
				    bdi_stat(bdi, BDI_WRITEBACK);
		}

		dirty_exceeded = (bdi_dirty > bdi_thresh) ||
				  (nr_dirty > dirty_thresh);
		if (dirty_exceeded && !bdi->dirty_exceeded)
			bdi->dirty_exceeded = 1;

		bdi_update_bandwidth(bdi, dirty_thresh, background_thresh,
				     nr_dirty, bdi_thresh, bdi_dirty,
				     start_time);

		max_pause = bdi_max_pause(bdi, bdi_dirty);

		dirty_ratelimit = bdi->dirty_ratelimit;
		pos_ratio = bdi_position_ratio(bdi, dirty_thresh,
					       background_thresh, nr_dirty,
					       bdi_thresh, bdi_dirty);
		task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >>
							RATELIMIT_CALC_SHIFT;
		if (unlikely(task_ratelimit == 0)) {
			pause = max_pause;
			goto pause;
		}
		pause = HZ * pages_dirtied / task_ratelimit;
		if (unlikely(pause <= 0)) {
			trace_balance_dirty_pages(bdi,
						  dirty_thresh,
						  background_thresh,
						  nr_dirty,
						  bdi_thresh,
						  bdi_dirty,
						  dirty_ratelimit,
						  task_ratelimit,
						  pages_dirtied,
						  pause,
						  start_time);
			pause = 1; /* avoid resetting nr_dirtied_pause below */
			break;
		}
		pause = min(pause, max_pause);

pause:
		trace_balance_dirty_pages(bdi,
					  dirty_thresh,
					  background_thresh,
					  nr_dirty,
					  bdi_thresh,
					  bdi_dirty,
					  dirty_ratelimit,
					  task_ratelimit,
					  pages_dirtied,
					  pause,
					  start_time);
		__set_current_state(TASK_KILLABLE);
		io_schedule_timeout(pause);

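		/*
		 * This is typically equal to (nr_dirty < dirty_thresh) and can
		 * also keep "1000+ dd on a slow USB stick" under control.
		 */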
		if (task_ratelimit)
			break;

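		/*
		 * In the case of an unresponsive NFS server and the NFS dirty
		 * pages exceeding dirty_thresh, give the other good bdi's a
		 * pipe to go through, so that tasks on them still remain
		 * responsive.
		 *
		 * In theory 1 page is enough to keep the consumer-producer
		 * pipe going: the flusher cleans 1 page => the task dirties 1
		 * more page. However bdi_dirty has accounting errors, so use
		 * the larger and more IO friendly bdi_stat_error.
		 */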
		if (bdi_dirty <= bdi_stat_error(bdi))
			break;

		if (fatal_signal_pending(current))
			break;
	}

	if (!dirty_exceeded && bdi->dirty_exceeded)
		bdi->dirty_exceeded = 0;

	current->nr_dirtied = 0;
	if (pause == 0) { /* in freerun area */
		current->nr_dirtied_pause =
				dirty_poll_interval(nr_dirty, dirty_thresh);
	} else if (pause <= max_pause / 4 &&
		   pages_dirtied >= current->nr_dirtied_pause) {
		current->nr_dirtied_pause = clamp_val(
					dirty_ratelimit * (max_pause / 2) / HZ,
					pages_dirtied + pages_dirtied / 8,
					pages_dirtied * 4);
	} else if (pause >= max_pause) {
		current->nr_dirtied_pause = 1 | clamp_val(
					dirty_ratelimit * (max_pause / 2) / HZ,
					pages_dirtied / 4,
					pages_dirtied - pages_dirtied / 8);
	}

	if (writeback_in_progress(bdi))
		return;

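	/*
	 * In laptop mode, we wait until hitting the higher threshold before
	 * starting background writeout, and then write out all the way down
	 * to the lower threshold.  So slow writers cause minimal disk activity.
	 *
	 * In normal mode, we start background writeout at the lower
	 * background_thresh, to keep the amount of dirty memory low.
	 */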
	if (laptop_mode)
		return;

	if (nr_reclaimable > background_thresh)
		bdi_start_background_writeback(bdi);
}

void set_page_dirty_balance(struct page *page, int page_mkwrite)
{
	if (set_page_dirty(page) || page_mkwrite) {
		struct address_space *mapping = page_mapping(page);

		if (mapping)
			balance_dirty_pages_ratelimited(mapping);
	}
}

static DEFINE_PER_CPU(int, bdp_ratelimits);
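/**
 * balance_dirty_pages_ratelimited_nr - balance dirty memory state
 * @mapping: address_space which was dirtied
 * @nr_pages_dirtied: number of pages which the caller has just dirtied
 *
 * Processes which are dirtying memory should call in here once for each page
 * which was newly dirtied.  The function will periodically check the system's
 * dirty state and will initiate writeback if needed.
 *
 * On really big machines, querying the global dirty state is expensive, so
 * try to avoid doing it too often (ratelimiting).  But once we're over the
 * dirty memory limit we decrease the ratelimiting by a lot, to prevent
 * individual processes from overshooting the limit by (ratelimit_pages) each.
 */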
void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
					unsigned long nr_pages_dirtied)
{
	struct backing_dev_info *bdi = mapping->backing_dev_info;
	int ratelimit;
	int *p;

	if (!bdi_cap_account_dirty(bdi))
		return;

	ratelimit = current->nr_dirtied_pause;
	if (bdi->dirty_exceeded)
		ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));

	current->nr_dirtied += nr_pages_dirtied;

	preempt_disable();
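	/*
	 * This prevents one CPU to accumulate too many dirtied pages without
	 * calling into balance_dirty_pages(), which can happen when there are
	 * 1000+ tasks, all of them start dirtying pages at exactly the same
	 * time, hence all honoured too large initial task->nr_dirtied_pause.
	 */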
	p = &__get_cpu_var(bdp_ratelimits);
	if (unlikely(current->nr_dirtied >= ratelimit))
		*p = 0;
	else {
		*p += nr_pages_dirtied;
		if (unlikely(*p >= ratelimit_pages)) {
			*p = 0;
			ratelimit = 0;
		}
	}
	preempt_enable();

	if (unlikely(current->nr_dirtied >= ratelimit))
		balance_dirty_pages(mapping, current->nr_dirtied);
}
EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);

void throttle_vm_writeout(gfp_t gfp_mask)
{
	unsigned long background_thresh;
	unsigned long dirty_thresh;

	for ( ; ; ) {
		global_dirty_limits(&background_thresh, &dirty_thresh);

		/*
		 * Boost the allowable dirty threshold a bit for page
		 * allocators so they don't get DoS'ed by heavy writers
		 */
		dirty_thresh += dirty_thresh / 10;

		if (global_page_state(NR_UNSTABLE_NFS) +
			global_page_state(NR_WRITEBACK) <= dirty_thresh)
			break;
		congestion_wait(BLK_RW_ASYNC, HZ/10);

		/*
		 * The caller might hold locks which can prevent IO completion
		 * or progress in the filesystem.  So we cannot just sit here
		 * waiting forever for IO to complete.
		 */
		if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO))
			break;
	}
}
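/*
 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
 */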
int dirty_writeback_centisecs_handler(ctl_table *table, int write,
	void __user *buffer, size_t *length, loff_t *ppos)
{
	proc_dointvec(table, write, buffer, length, ppos);
	bdi_arm_supers_timer();
	return 0;
}

#ifdef CONFIG_BLOCK
void laptop_mode_timer_fn(unsigned long data)
{
	struct request_queue *q = (struct request_queue *)data;
	int nr_pages = global_page_state(NR_FILE_DIRTY) +
		global_page_state(NR_UNSTABLE_NFS);

	/*
	 * We want to write everything out, not just down to the dirty
	 * threshold
	 */
	if (bdi_has_dirty_io(&q->backing_dev_info))
		bdi_start_writeback(&q->backing_dev_info, nr_pages,
					WB_REASON_LAPTOP_TIMER);
}
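/*
 * We've spun up the disk and we're in laptop mode: schedule writeback
 * of all dirty data a few seconds from now.  If the flush is already scheduled
 * then push it back - the user is still using the disk.
 */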
void laptop_io_completion(struct backing_dev_info *info)
{
	mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode);
}
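/*
 * We're in laptop mode and we've just synced. The sync's writes will have
 * caused another writeback to be scheduled by laptop_io_completion.
 * Nothing needs to be written back anymore, so we unschedule the writeback.
 */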
void laptop_sync_completion(void)
{
	struct backing_dev_info *bdi;

	rcu_read_lock();

	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
		del_timer(&bdi->laptop_mode_wb_timer);

	rcu_read_unlock();
}
#endif
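/*
 * If ratelimit_pages is too high then we can get into dirty-data overload
 * if a large number of processes all perform writes at the same time.
 * If it is too low then SMP machines will call the (expensive) dirty-state
 * checks too often.
 *
 * Here we set ratelimit_pages to a level which ensures that when all CPUs are
 * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
 * thresholds before writeback cuts in.
 */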
void writeback_set_ratelimit(void)
{
	unsigned long background_thresh;
	unsigned long dirty_thresh;
	global_dirty_limits(&background_thresh, &dirty_thresh);
	ratelimit_pages = dirty_thresh / (num_online_cpus() * 32);
	if (ratelimit_pages < 16)
		ratelimit_pages = 16;
}

static int __cpuinit
ratelimit_handler(struct notifier_block *self, unsigned long u, void *v)
{
	writeback_set_ratelimit();
	return NOTIFY_DONE;
}

static struct notifier_block __cpuinitdata ratelimit_nb = {
	.notifier_call	= ratelimit_handler,
	.next		= NULL,
};
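/*
 * Called early on to tune the page writeback dirty limits.
 *
 * We used to scale dirty pages according to how total memory related to
 * pages that could be allocated for buffers.  However, "dirty_ratio" is now
 * applied to total dirtyable memory, so we can't get into the old insane
 * situation any more where we had large amounts of dirty pages compared to a
 * small amount of usable memory.
 */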
void __init page_writeback_init(void)
{
	int shift;

	writeback_set_ratelimit();
	register_cpu_notifier(&ratelimit_nb);

	shift = calc_period_shift();
	prop_descriptor_init(&vm_completions, shift);
}
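/**
 * tag_pages_for_writeback - tag pages to be written by write_cache_pages
 * @mapping: address space structure to write
 * @start: starting page index
 * @end: ending page index (inclusive)
 *
 * This function scans the page range from @start to @end (inclusive) and tags
 * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is
 * that write_cache_pages (or whoever calls this function) will then use
 * TOWRITE tag to identify pages eligible for writeback.  This mechanism is
 * used to avoid livelocking of writeback by a process steadily creating new
 * dirty pages in the file (thus it is important for this function to be quick
 * so that it can tag pages faster than a dirtying process can create them).
 */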
void tag_pages_for_writeback(struct address_space *mapping,
			     pgoff_t start, pgoff_t end)
{
/* We tag pages in batches to reduce tree_lock held times. */
#define WRITEBACK_TAG_BATCH 4096
	unsigned long tagged;

	do {
		spin_lock_irq(&mapping->tree_lock);
		tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree,
				&start, end, WRITEBACK_TAG_BATCH,
				PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE);
		spin_unlock_irq(&mapping->tree_lock);
		WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH);
		cond_resched();
		/* We check 'start' to handle wrapping when end == ~0UL */
	} while (tagged >= WRITEBACK_TAG_BATCH && start);
}
EXPORT_SYMBOL(tag_pages_for_writeback);
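/**
 * write_cache_pages - walk the list of dirty pages of the given address
 * space and write all of them
 * @mapping: address space structure to write
 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
 * @writepage: function called for each page
 * @data: data passed to writepage function
 *
 * If a page is already under I/O, write_cache_pages() skips it, even
 * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
 * but it is INCORRECT for data-integrity system calls such as fsync(), which
 * instead wait on every page's writeback in WB_SYNC_ALL mode.
 *
 * To avoid livelocks (when other process dirties new pages), we first tag
 * pages which should be written back with TOWRITE tag and only then start
 * writing them. For data-integrity sync we have to be careful so that we do
 * not miss some pages (e.g., because some other process has cleared TOWRITE
 * tag we set). The rule we follow is that TOWRITE tag can be cleared only
 * by the process clearing the DIRTY tag (and submitting the page for IO).
 */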
int write_cache_pages(struct address_space *mapping,
		      struct writeback_control *wbc, writepage_t writepage,
		      void *data)
{
	int ret = 0;
	int done = 0;
	struct pagevec pvec;
	int nr_pages;
	pgoff_t uninitialized_var(writeback_index);
	pgoff_t index;
	pgoff_t end;		/* Inclusive */
	pgoff_t done_index;
	int cycled;
	int range_whole = 0;
	int tag;

	pagevec_init(&pvec, 0);
	if (wbc->range_cyclic) {
		writeback_index = mapping->writeback_index; /* prev offset */
		index = writeback_index;
		if (index == 0)
			cycled = 1;
		else
			cycled = 0;
		end = -1;
	} else {
		index = wbc->range_start >> PAGE_CACHE_SHIFT;
		end = wbc->range_end >> PAGE_CACHE_SHIFT;
		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
			range_whole = 1;
		cycled = 1; /* ignore range_cyclic tests */
	}
	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
		tag = PAGECACHE_TAG_TOWRITE;
	else
		tag = PAGECACHE_TAG_DIRTY;
retry:
	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
		tag_pages_for_writeback(mapping, index, end);
	done_index = index;
	while (!done && (index <= end)) {
		int i;

		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
		if (nr_pages == 0)
			break;

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			/*
			 * At this point, the page may be truncated or
			 * invalidated (changing page->mapping to NULL), or
			 * even swizzled back from swapper_space to tmpfs file
			 * mapping. However, page->index will not change
			 * because we have a reference on the page.
			 */
			if (page->index > end) {
				/*
				 * can't be range_cyclic (1st pass) because
				 * end == -1 in that case.
				 */
				done = 1;
				break;
			}

			done_index = page->index;

			lock_page(page);

			/*
			 * Page truncated or invalidated. We can freely skip it
			 * then, even for data integrity operations: the page
			 * has disappeared concurrently, so there could be no
			 * real expectation of this data integrity operation
			 * even if there is now a new, dirty page at the same
			 * pagecache address.
			 */
			if (unlikely(page->mapping != mapping)) {
continue_unlock:
				unlock_page(page);
				continue;
			}

			if (!PageDirty(page)) {
				/* someone wrote it for us */
				goto continue_unlock;
			}

			if (PageWriteback(page)) {
				if (wbc->sync_mode != WB_SYNC_NONE)
					wait_on_page_writeback(page);
				else
					goto continue_unlock;
			}

			BUG_ON(PageWriteback(page));
			if (!clear_page_dirty_for_io(page))
				goto continue_unlock;

			trace_wbc_writepage(wbc, mapping->backing_dev_info);
			ret = (*writepage)(page, wbc, data);
			if (unlikely(ret)) {
				if (ret == AOP_WRITEPAGE_ACTIVATE) {
					unlock_page(page);
					ret = 0;
				} else {
					/*
					 * done_index is set past this page,
					 * so media errors will not choke
					 * background writeout for the entire
					 * file. This has consequences for
					 * range_cyclic semantics (ie. may
					 * write out pages for data integrity
					 * sync).
					 */
					done_index = page->index + 1;
					done = 1;
					break;
				}
			}

			/*
			 * We stop writing back only if we are not doing
			 * integrity sync. In case of integrity sync we have to
			 * keep going until we have written all the pages
			 * we tagged for writeback prior to entering this loop.
			 */
			if (--wbc->nr_to_write <= 0 &&
			    wbc->sync_mode == WB_SYNC_NONE) {
				done = 1;
				break;
			}
		}
		pagevec_release(&pvec);
		cond_resched();
	}
	if (!cycled && !done) {
		/*
		 * range_cyclic:
		 * We hit the last page and there is more work to be done:
		 * wrap back to the start of the file
		 */
		cycled = 1;
		index = 0;
		end = writeback_index - 1;
		goto retry;
	}
	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
		mapping->writeback_index = done_index;

	return ret;
}
EXPORT_SYMBOL(write_cache_pages);
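/*
 * Function used by generic_writepages to call the real writepage
 * function and set the mapping flags on error.
 */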
static int __writepage(struct page *page, struct writeback_control *wbc,
		       void *data)
{
	struct address_space *mapping = data;
	int ret = mapping->a_ops->writepage(page, wbc);
	mapping_set_error(mapping, ret);
	return ret;
}
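/**
 * generic_writepages - walk the list of dirty pages of the given address
 * space and writepage() all of them
 * @mapping: address space structure to write
 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
 *
 * This is a library function, which implements the writepages()
 * address_space_operation.
 */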
int generic_writepages(struct address_space *mapping,
		       struct writeback_control *wbc)
{
	struct blk_plug plug;
	int ret;

	/* deal with chardevs and other special file */
	if (!mapping->a_ops->writepage)
		return 0;

	blk_start_plug(&plug);
	ret = write_cache_pages(mapping, wbc, __writepage, mapping);
	blk_finish_plug(&plug);
	return ret;
}

EXPORT_SYMBOL(generic_writepages);

int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
	int ret;

	if (wbc->nr_to_write <= 0)
		return 0;
	if (mapping->a_ops->writepages)
		ret = mapping->a_ops->writepages(mapping, wbc);
	else
		ret = generic_writepages(mapping, wbc);
	return ret;
}
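/**
 * write_one_page - write out a single page and optionally wait on I/O
 * @page: the page to write
 * @wait: if true, wait on writeout
 *
 * The page must be locked by the caller and will be unlocked upon return.
 *
 * write_one_page() returns a negative error code if I/O failed.
 */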
int write_one_page(struct page *page, int wait)
{
	struct address_space *mapping = page->mapping;
	int ret = 0;
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,
		.nr_to_write = 1,
	};

	BUG_ON(!PageLocked(page));

	if (wait)
		wait_on_page_writeback(page);

	if (clear_page_dirty_for_io(page)) {
		page_cache_get(page);
		ret = mapping->a_ops->writepage(page, &wbc);
		if (ret == 0 && wait) {
			wait_on_page_writeback(page);
			if (PageError(page))
				ret = -EIO;
		}
		page_cache_release(page);
	} else {
		unlock_page(page);
	}
	return ret;
}
EXPORT_SYMBOL(write_one_page);
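/*
 * For address_spaces which do not use buffers nor write back.
 */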
int __set_page_dirty_no_writeback(struct page *page)
{
	if (!PageDirty(page))
		return !TestSetPageDirty(page);
	return 0;
}
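/*
 * Helper function for set_page_dirty family.
 * NOTE: This relies on being atomic wrt interrupts.
 */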
void account_page_dirtied(struct page *page, struct address_space *mapping)
{
	if (mapping_cap_account_dirty(mapping)) {
		__inc_zone_page_state(page, NR_FILE_DIRTY);
		__inc_zone_page_state(page, NR_DIRTIED);
		__inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
		__inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED);
		task_io_account_write(PAGE_CACHE_SIZE);
	}
}
EXPORT_SYMBOL(account_page_dirtied);
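/*
 * Helper function for set_page_writeback family.
 * NOTE: Unlike account_page_dirtied this does not rely on being atomic
 * wrt interrupts.
 */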
void account_page_writeback(struct page *page)
{
	inc_zone_page_state(page, NR_WRITEBACK);
}
EXPORT_SYMBOL(account_page_writeback);
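/*
 * For address_spaces which do not use buffers.  Just tag the page as dirty in
 * its radix tree.
 *
 * This is also used when a single buffer is being dirtied: we want to set the
 * page dirty in that case, but not all the buffers.  This is a "bottom-up"
 * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
 *
 * Most callers have locked the page, which pins the address_space in memory.
 * But zap_pte_range() does not lock the page, however in that case the
 * mapping is pinned by the vma's ->vm_file reference.
 *
 * We take care to handle the case where the page was truncated from the
 * mapping by re-checking page_mapping() inside tree_lock.
 */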
int __set_page_dirty_nobuffers(struct page *page)
{
	if (!TestSetPageDirty(page)) {
		struct address_space *mapping = page_mapping(page);
		struct address_space *mapping2;

		if (!mapping)
			return 1;

		spin_lock_irq(&mapping->tree_lock);
		mapping2 = page_mapping(page);
		if (mapping2) { /* Race with truncate? */
			BUG_ON(mapping2 != mapping);
			WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
			account_page_dirtied(page, mapping);
			radix_tree_tag_set(&mapping->page_tree,
				page_index(page), PAGECACHE_TAG_DIRTY);
		}
		spin_unlock_irq(&mapping->tree_lock);
		if (mapping->host) {
			/* !PageAnon && !swapper_space */
			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
		}
		return 1;
	}
	return 0;
}
EXPORT_SYMBOL(__set_page_dirty_nobuffers);
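/*
 * When a writepage implementation decides that it doesn't want to write this
 * page for some reason, it should redirty the locked page via
 * redirty_page_for_writepage() and it should then unlock the page and
 * return 0.
 */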
int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
{
	wbc->pages_skipped++;
	return __set_page_dirty_nobuffers(page);
}
EXPORT_SYMBOL(redirty_page_for_writepage);
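/*
 * Dirty a page.
 *
 * For pages with a mapping this should be done under the page lock
 * for the benefit of asynchronous memory errors who prefer a consistent
 * dirty state. This rule can be broken in some special cases,
 * but should be better not to.
 *
 * If the mapping doesn't provide a set_page_dirty a_op, then
 * just fall through and assume that it wants buffer_heads.
 */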
int set_page_dirty(struct page *page)
{
	struct address_space *mapping = page_mapping(page);

	if (likely(mapping)) {
		int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;
		/*
		 * readahead/lru_deactivate_page could remain
		 * PG_readahead/PG_reclaim due to race with end_swap_bio_read.
		 *
		 * About readahead, if the page is written, the flags would be
		 * reset. So no problem.
		 *
		 * About lru_deactivate_page, if the page is redirtied, the
		 * flag will be reset. So no problem. But if the page is used
		 * by readahead it will confuse readahead and make it restart
		 * the size rampup process. That's a trivial problem.
		 */
		ClearPageReclaim(page);
#ifdef CONFIG_BLOCK
		if (!spd)
			spd = __set_page_dirty_buffers;
#endif
		return (*spd)(page);
	}
	if (!PageDirty(page)) {
		if (!TestSetPageDirty(page))
			return 1;
	}
	return 0;
}
EXPORT_SYMBOL(set_page_dirty);
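/*
 * set_page_dirty() is racy if the caller has no reference against
 * page->mapping->host, and if the page is unlocked.  This is because another
 * CPU could truncate the page off the mapping and then free the mapping.
 *
 * Usually, the page _is_ locked, or the caller is a user-space process which
 * holds a reference on the inode by having an open file.
 *
 * In other cases, the page should be locked before running set_page_dirty().
 */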
int set_page_dirty_lock(struct page *page)
{
	int ret;

	lock_page(page);
	ret = set_page_dirty(page);
	unlock_page(page);
	return ret;
}
EXPORT_SYMBOL(set_page_dirty_lock);
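/*
 * Clear a page's dirty flag, while caring for dirty memory accounting.
 * Returns true if the page was previously dirty.
 *
 * This is for preparing to put the page under writeout.  We leave the page
 * tagged as dirty in the radix tree so that a concurrent write-for-sync
 * can discover it via a PAGECACHE_TAG_DIRTY walk.  The ->writepage
 * implementation will run either set_page_writeback() or set_page_dirty(),
 * at which stage we bring the page's dirty flag and radix-tree dirty tag
 * back into sync.
 *
 * This incoherency between the page's dirty flag and radix-tree tag is
 * unfortunate, but it only exists while the page is locked.
 */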
int clear_page_dirty_for_io(struct page *page)
{
	struct address_space *mapping = page_mapping(page);

	BUG_ON(!PageLocked(page));

	if (mapping && mapping_cap_account_dirty(mapping)) {
		/*
		 * Yes, Virginia, this is indeed insane.
		 *
		 * We use this sequence to make sure that
		 *  (a) we account for dirty stats properly
		 *  (b) we tell the low-level filesystem to
		 *      mark the whole page dirty if it was
		 *      dirty in a pagetable. Only to then
		 *  (c) clean the page again and return 1 to
		 *      cause the writeback.
		 *
		 * This way we avoid all nasty races with the
		 * dirty bit in multiple places and clearing
		 * them concurrently from different threads.
		 *
		 * Note! Normally the "set_page_dirty(page)"
		 * has no effect on the actual dirty bit - since
		 * that will already usually be set. But we
		 * need the side effects, and it can help us
		 * avoid races.
		 *
		 * We basically use the page "master dirty bit"
		 * as a serialization point for all the different
		 * threads doing their things.
		 */
		if (page_mkclean(page))
			set_page_dirty(page);
		/*
		 * We carefully synchronise fault handlers against
		 * installing a dirty pte and marking the page dirty
		 * at this point.  We do this by having them hold the
		 * page lock while dirtying the page, and pages are
		 * always locked coming in here, so we get the desired
		 * exclusion.
		 */
		if (TestClearPageDirty(page)) {
			dec_zone_page_state(page, NR_FILE_DIRTY);
			dec_bdi_stat(mapping->backing_dev_info,
					BDI_RECLAIMABLE);
			return 1;
		}
		return 0;
	}
	return TestClearPageDirty(page);
}
EXPORT_SYMBOL(clear_page_dirty_for_io);

int test_clear_page_writeback(struct page *page)
{
	struct address_space *mapping = page_mapping(page);
	int ret;

	if (mapping) {
		struct backing_dev_info *bdi = mapping->backing_dev_info;
		unsigned long flags;

		spin_lock_irqsave(&mapping->tree_lock, flags);
		ret = TestClearPageWriteback(page);
		if (ret) {
			radix_tree_tag_clear(&mapping->page_tree,
						page_index(page),
						PAGECACHE_TAG_WRITEBACK);
			if (bdi_cap_account_writeback(bdi)) {
				__dec_bdi_stat(bdi, BDI_WRITEBACK);
				__bdi_writeout_inc(bdi);
			}
		}
		spin_unlock_irqrestore(&mapping->tree_lock, flags);
	} else {
		ret = TestClearPageWriteback(page);
	}
	if (ret) {
		dec_zone_page_state(page, NR_WRITEBACK);
		inc_zone_page_state(page, NR_WRITTEN);
	}
	return ret;
}

int test_set_page_writeback(struct page *page)
{
	struct address_space *mapping = page_mapping(page);
	int ret;

	if (mapping) {
		struct backing_dev_info *bdi = mapping->backing_dev_info;
		unsigned long flags;

		spin_lock_irqsave(&mapping->tree_lock, flags);
		ret = TestSetPageWriteback(page);
		if (!ret) {
			radix_tree_tag_set(&mapping->page_tree,
						page_index(page),
						PAGECACHE_TAG_WRITEBACK);
			if (bdi_cap_account_writeback(bdi))
				__inc_bdi_stat(bdi, BDI_WRITEBACK);
		}
		if (!PageDirty(page))
			radix_tree_tag_clear(&mapping->page_tree,
						page_index(page),
						PAGECACHE_TAG_DIRTY);
		radix_tree_tag_clear(&mapping->page_tree,
				     page_index(page),
				     PAGECACHE_TAG_TOWRITE);
		spin_unlock_irqrestore(&mapping->tree_lock, flags);
	} else {
		ret = TestSetPageWriteback(page);
	}
	if (!ret)
		account_page_writeback(page);
	return ret;
}
EXPORT_SYMBOL(test_set_page_writeback);
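/*
 * Return true if any of the pages in the mapping are marked with the
 * passed tag.
 */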
int mapping_tagged(struct address_space *mapping, int tag)
{
	return radix_tree_tagged(&mapping->page_tree, tag);
}
EXPORT_SYMBOL(mapping_tagged);