/*
 * mm/page-writeback.c
 *
 * Contains functions related to writing back dirty pages at the
 * address_space level: dirty page accounting, the dirty throttling
 * machinery behind balance_dirty_pages(), and the generic
 * write_cache_pages()/do_writepages() writeback entry points.
 */
14#include <linux/kernel.h>
15#include <linux/export.h>
16#include <linux/spinlock.h>
17#include <linux/fs.h>
18#include <linux/mm.h>
19#include <linux/swap.h>
20#include <linux/slab.h>
21#include <linux/pagemap.h>
22#include <linux/writeback.h>
23#include <linux/init.h>
24#include <linux/backing-dev.h>
25#include <linux/task_io_accounting_ops.h>
26#include <linux/blkdev.h>
27#include <linux/mpage.h>
28#include <linux/rmap.h>
29#include <linux/percpu.h>
30#include <linux/notifier.h>
31#include <linux/smp.h>
32#include <linux/sysctl.h>
33#include <linux/cpu.h>
34#include <linux/syscalls.h>
35#include <linux/buffer_head.h>
36#include <linux/pagevec.h>
37#include <linux/timer.h>
38#include <linux/sched/rt.h>
39#include <linux/mm_inline.h>
40#include <trace/events/writeback.h>
41
42#include "internal.h"

/*
 * Sleep at most 200ms at a time in balance_dirty_pages().
 */
47#define MAX_PAUSE max(HZ/5, 1)

/*
 * Try to keep balance_dirty_pages() call intervals higher than this many
 * pages by raising pause time more than the dirtied pages.
 */
53#define DIRTY_POLL_THRESH (128 >> (PAGE_SHIFT - 10))

/*
 * Estimate write bandwidth at 200ms intervals.
 */
58#define BANDWIDTH_INTERVAL max(HZ/5, 1)
59
60#define RATELIMIT_CALC_SHIFT 10
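
/*
 * The position and ratelimit math below is done in fixed point with
 * RATELIMIT_CALC_SHIFT fractional bits: 1 << RATELIMIT_CALC_SHIFT stands
 * for a ratio of 1.0 (see pos_ratio_polynom() and wb_position_ratio()).
 */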

/*
 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited()
 * will look to see if it needs to force writeback or throttling.
 */
66static long ratelimit_pages = 32;

/* The following parameters are exported via /proc/sys/vm */

/*
 * Start background writeback (via writeback threads) at this percentage
 * of dirtyable memory.
 */
73int dirty_background_ratio = 10;

/*
 * dirty_background_bytes starts at 0 (disabled) so that it is a function of
 * dirty_background_ratio * the amount of dirtyable memory.
 */
79unsigned long dirty_background_bytes;

/*
 * free highmem will not be subtracted from the total free memory
 * for calculating free ratios if vm_highmem_is_dirtyable is true.
 */
85int vm_highmem_is_dirtyable;

/*
 * The generator of dirty data starts writeback at this percentage of
 * dirtyable memory.
 */
90int vm_dirty_ratio = 20;

/*
 * vm_dirty_bytes starts at 0 (disabled) so that it is a function of
 * vm_dirty_ratio * the amount of dirtyable memory.
 */
96unsigned long vm_dirty_bytes;

/*
 * The interval between `kupdate'-style writebacks.
 */
unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
102
103EXPORT_SYMBOL_GPL(dirty_writeback_interval);

/*
 * The longest time for which data is allowed to remain dirty.
 */
unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */

/*
 * Flag that makes the machine dump writes/reads and block dirtyings.
 */
113int block_dump;

/*
 * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
 * a full sync is triggered after this time elapses without any disk activity.
 */
119int laptop_mode;
120
121EXPORT_SYMBOL(laptop_mode);

/* End of sysctl-exported parameters */

struct wb_domain global_wb_domain;

/* consolidated parameters for balance_dirty_pages() and its subroutines */
128struct dirty_throttle_control {
129#ifdef CONFIG_CGROUP_WRITEBACK
130 struct wb_domain *dom;
131 struct dirty_throttle_control *gdtc;
132#endif
133 struct bdi_writeback *wb;
134 struct fprop_local_percpu *wb_completions;
135
136 unsigned long avail;
137 unsigned long dirty;
138 unsigned long thresh;
139 unsigned long bg_thresh;
140
141 unsigned long wb_dirty;
142 unsigned long wb_thresh;
143 unsigned long wb_bg_thresh;
144
145 unsigned long pos_ratio;
146};
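
/*
 * A dirty_throttle_control is filled in on stack by balance_dirty_pages()
 * and wb_over_bg_thresh() using the GDTC_INIT/MDTC_INIT initializers below:
 * one instance describes the global wb_domain, and an optional second one
 * describes the memcg wb_domain when CONFIG_CGROUP_WRITEBACK is enabled.
 */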

/*
 * Length of period for aging writeout fractions of bdis. This is an
 * arbitrarily chosen number. The longer the period, the slower fractions will
 * reflect changes in current writeout rate.
 */
153#define VM_COMPLETIONS_PERIOD_LEN (3*HZ)
154
155#ifdef CONFIG_CGROUP_WRITEBACK
156
157#define GDTC_INIT(__wb) .wb = (__wb), \
158 .dom = &global_wb_domain, \
159 .wb_completions = &(__wb)->completions
160
161#define GDTC_INIT_NO_WB .dom = &global_wb_domain
162
163#define MDTC_INIT(__wb, __gdtc) .wb = (__wb), \
164 .dom = mem_cgroup_wb_domain(__wb), \
165 .wb_completions = &(__wb)->memcg_completions, \
166 .gdtc = __gdtc
167
168static bool mdtc_valid(struct dirty_throttle_control *dtc)
169{
170 return dtc->dom;
171}
172
173static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
174{
175 return dtc->dom;
176}
177
178static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
179{
180 return mdtc->gdtc;
181}
182
183static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
184{
185 return &wb->memcg_completions;
186}
187
188static void wb_min_max_ratio(struct bdi_writeback *wb,
189 unsigned long *minp, unsigned long *maxp)
190{
191 unsigned long this_bw = wb->avg_write_bandwidth;
192 unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
193 unsigned long long min = wb->bdi->min_ratio;
194 unsigned long long max = wb->bdi->max_ratio;
195
196
197
198
199
200 if (this_bw < tot_bw) {
201 if (min) {
202 min *= this_bw;
203 do_div(min, tot_bw);
204 }
205 if (max < 100) {
206 max *= this_bw;
207 do_div(max, tot_bw);
208 }
209 }
210
211 *minp = min;
212 *maxp = max;
213}
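
/*
 * Illustrative example (numbers not from the source): with bdi->min_ratio = 5,
 * bdi->max_ratio = 50, and this wb writing 50 out of the bdi's total of 200
 * (in avg_write_bandwidth units), the effective ratios become
 * min = 5 * 50 / 200 = 1 and max = 50 * 50 / 200 = 12, i.e. both bounds are
 * scaled by this wb's share of the bdi's bandwidth.
 */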
214
215#else
216
217#define GDTC_INIT(__wb) .wb = (__wb), \
218 .wb_completions = &(__wb)->completions
219#define GDTC_INIT_NO_WB
220#define MDTC_INIT(__wb, __gdtc)
221
222static bool mdtc_valid(struct dirty_throttle_control *dtc)
223{
224 return false;
225}
226
227static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
228{
229 return &global_wb_domain;
230}
231
232static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
233{
234 return NULL;
235}
236
237static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
238{
239 return NULL;
240}
241
242static void wb_min_max_ratio(struct bdi_writeback *wb,
243 unsigned long *minp, unsigned long *maxp)
244{
245 *minp = wb->bdi->min_ratio;
246 *maxp = wb->bdi->max_ratio;
247}
248
249#endif

/*
 * In a memory zone, there is a certain amount of pages we consider
 * available for the page cache, which is essentially the number of
 * free and reclaimable pages, minus some zone reserves to protect
 * lowmem and the ability to uphold the zone's watermarks without
 * requiring writeback.
 *
 * This number of dirtyable pages is the base value of which the
 * user-configurable dirty ratio is the effective number of pages that
 * are allowed to be actually dirtied.  Per individual zone, or
 * globally by using the sum of dirtyable pages over all zones.
 */

/**
 * zone_dirtyable_memory - number of dirtyable pages in a zone
 * @zone: the zone
 *
 * Returns the zone's number of pages potentially available for dirty
 * page cache.  This is the base value for the per-zone dirty limits.
 */
276static unsigned long zone_dirtyable_memory(struct zone *zone)
277{
278 unsigned long nr_pages;
279
280 nr_pages = zone_page_state(zone, NR_FREE_PAGES);
281
282
283
284
285
286 nr_pages -= min(nr_pages, zone->totalreserve_pages);
287
288 nr_pages += zone_page_state(zone, NR_INACTIVE_FILE);
289 nr_pages += zone_page_state(zone, NR_ACTIVE_FILE);
290
291 return nr_pages;
292}
293
294static unsigned long highmem_dirtyable_memory(unsigned long total)
295{
296#ifdef CONFIG_HIGHMEM
297 int node;
298 unsigned long x = 0;
299 int i;
300
301 for_each_node_state(node, N_HIGH_MEMORY) {
302 for (i = 0; i < MAX_NR_ZONES; i++) {
303 struct zone *z = &NODE_DATA(node)->node_zones[i];
304
305 if (is_highmem(z))
306 x += zone_dirtyable_memory(z);
307 }
308 }
309
310
311
312
313
314
315
316
317
318 if ((long)x < 0)
319 x = 0;
320
321
322
323
324
325
326
327 return min(x, total);
328#else
329 return 0;
330#endif
331}

/**
 * global_dirtyable_memory - number of globally dirtyable pages
 *
 * Returns the global number of pages potentially available for dirty
 * page cache.  This is the base value for the global dirty limits.
 */
339static unsigned long global_dirtyable_memory(void)
340{
341 unsigned long x;
342
343 x = global_page_state(NR_FREE_PAGES);
344
345
346
347
348
349 x -= min(x, totalreserve_pages);
350
351 x += global_page_state(NR_INACTIVE_FILE);
352 x += global_page_state(NR_ACTIVE_FILE);
353
354 if (!vm_highmem_is_dirtyable)
355 x -= highmem_dirtyable_memory(x);
356
357 return x + 1;
358}

/**
 * domain_dirty_limits - calculate thresh and bg_thresh for a wb_domain
 * @dtc: dirty_throttle_control of interest
 *
 * Calculates the dirty throttling thresholds from vm_dirty_{bytes|ratio} and
 * dirty_background_{bytes|ratio}.  The caller must have set @dtc->avail
 * before calling this function.  The dirty limits are lifted by 1/4 for
 * PF_LESS_THROTTLE (i.e. nfsd) and real-time tasks.
 */
370static void domain_dirty_limits(struct dirty_throttle_control *dtc)
371{
372 const unsigned long available_memory = dtc->avail;
373 struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc);
374 unsigned long bytes = vm_dirty_bytes;
375 unsigned long bg_bytes = dirty_background_bytes;
376
377 unsigned long ratio = (vm_dirty_ratio * PAGE_SIZE) / 100;
378 unsigned long bg_ratio = (dirty_background_ratio * PAGE_SIZE) / 100;
379 unsigned long thresh;
380 unsigned long bg_thresh;
381 struct task_struct *tsk;
382
383
384 if (gdtc) {
385 unsigned long global_avail = gdtc->avail;
386
387
388
389
390
391
392
393
394 if (bytes)
395 ratio = min(DIV_ROUND_UP(bytes, global_avail),
396 PAGE_SIZE);
397 if (bg_bytes)
398 bg_ratio = min(DIV_ROUND_UP(bg_bytes, global_avail),
399 PAGE_SIZE);
400 bytes = bg_bytes = 0;
401 }
402
403 if (bytes)
404 thresh = DIV_ROUND_UP(bytes, PAGE_SIZE);
405 else
406 thresh = (ratio * available_memory) / PAGE_SIZE;
407
408 if (bg_bytes)
409 bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE);
410 else
411 bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE;
412
413 if (bg_thresh >= thresh)
414 bg_thresh = thresh / 2;
415 tsk = current;
416 if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
417 bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32;
418 thresh += thresh / 4 + global_wb_domain.dirty_limit / 32;
419 }
420 dtc->thresh = thresh;
421 dtc->bg_thresh = bg_thresh;
422
423
424 if (!gdtc)
425 trace_global_dirty_state(bg_thresh, thresh);
426}
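
/*
 * Worked example (illustrative numbers): with vm_dirty_ratio = 20 and
 * dtc->avail = 1,000,000 pages, ratio = 20 * PAGE_SIZE / 100 and
 * thresh = ratio * available_memory / PAGE_SIZE, i.e. roughly 200,000 pages
 * (20% of dirtyable memory, modulo integer rounding).  Scaling the ratio by
 * PAGE_SIZE keeps precision while staying in integer arithmetic.
 */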

/**
 * global_dirty_limits - background-writeback and dirty-throttling thresholds
 * @pbackground: out parameter for bg_thresh
 * @pdirty: out parameter for thresh
 *
 * Calculate bg_thresh and thresh for global_wb_domain.  See
 * domain_dirty_limits() for details.
 */
436void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
437{
438 struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
439
440 gdtc.avail = global_dirtyable_memory();
441 domain_dirty_limits(&gdtc);
442
443 *pbackground = gdtc.bg_thresh;
444 *pdirty = gdtc.thresh;
445}

/**
 * zone_dirty_limit - maximum number of dirty pages allowed in a zone
 * @zone: the zone
 *
 * Returns the maximum number of dirty pages allowed in a zone, based
 * on the zone's dirtyable memory.
 */
454static unsigned long zone_dirty_limit(struct zone *zone)
455{
456 unsigned long zone_memory = zone_dirtyable_memory(zone);
457 struct task_struct *tsk = current;
458 unsigned long dirty;
459
460 if (vm_dirty_bytes)
461 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) *
462 zone_memory / global_dirtyable_memory();
463 else
464 dirty = vm_dirty_ratio * zone_memory / 100;
465
466 if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk))
467 dirty += dirty / 4;
468
469 return dirty;
470}

/**
 * zone_dirty_ok - tells whether a zone is within its dirty limits
 * @zone: the zone to check
 *
 * Returns %true when the dirty pages in @zone are within the zone's
 * dirty limit, %false if the limit is exceeded.
 */
479bool zone_dirty_ok(struct zone *zone)
480{
481 unsigned long limit = zone_dirty_limit(zone);
482
483 return zone_page_state(zone, NR_FILE_DIRTY) +
484 zone_page_state(zone, NR_UNSTABLE_NFS) +
485 zone_page_state(zone, NR_WRITEBACK) <= limit;
486}
487
488int dirty_background_ratio_handler(struct ctl_table *table, int write,
489 void __user *buffer, size_t *lenp,
490 loff_t *ppos)
491{
492 int ret;
493
494 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
495 if (ret == 0 && write)
496 dirty_background_bytes = 0;
497 return ret;
498}
499
500int dirty_background_bytes_handler(struct ctl_table *table, int write,
501 void __user *buffer, size_t *lenp,
502 loff_t *ppos)
503{
504 int ret;
505
506 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
507 if (ret == 0 && write)
508 dirty_background_ratio = 0;
509 return ret;
510}
511
512int dirty_ratio_handler(struct ctl_table *table, int write,
513 void __user *buffer, size_t *lenp,
514 loff_t *ppos)
515{
516 int old_ratio = vm_dirty_ratio;
517 int ret;
518
519 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
520 if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
521 writeback_set_ratelimit();
522 vm_dirty_bytes = 0;
523 }
524 return ret;
525}
526
527int dirty_bytes_handler(struct ctl_table *table, int write,
528 void __user *buffer, size_t *lenp,
529 loff_t *ppos)
530{
531 unsigned long old_bytes = vm_dirty_bytes;
532 int ret;
533
534 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
535 if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
536 writeback_set_ratelimit();
537 vm_dirty_ratio = 0;
538 }
539 return ret;
540}
541
542static unsigned long wp_next_time(unsigned long cur_time)
543{
544 cur_time += VM_COMPLETIONS_PERIOD_LEN;
545
546 if (!cur_time)
547 return 1;
548 return cur_time;
549}
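
/*
 * dom->period_time == 0 doubles as the "period timer not running" marker
 * (see wb_domain_writeout_inc() and writeout_period() below), which is why
 * wp_next_time() never returns 0.
 */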
550
551static void wb_domain_writeout_inc(struct wb_domain *dom,
552 struct fprop_local_percpu *completions,
553 unsigned int max_prop_frac)
554{
555 __fprop_inc_percpu_max(&dom->completions, completions,
556 max_prop_frac);
557
558 if (!unlikely(dom->period_time)) {
559
560
561
562
563
564
565 dom->period_time = wp_next_time(jiffies);
566 mod_timer(&dom->period_timer, dom->period_time);
567 }
568}

/*
 * Increment @wb's writeout completion count and the global writeout
 * completion count. Called from test_clear_page_writeback().
 */
574static inline void __wb_writeout_inc(struct bdi_writeback *wb)
575{
576 struct wb_domain *cgdom;
577
578 __inc_wb_stat(wb, WB_WRITTEN);
579 wb_domain_writeout_inc(&global_wb_domain, &wb->completions,
580 wb->bdi->max_prop_frac);
581
582 cgdom = mem_cgroup_wb_domain(wb);
583 if (cgdom)
584 wb_domain_writeout_inc(cgdom, wb_memcg_completions(wb),
585 wb->bdi->max_prop_frac);
586}
587
588void wb_writeout_inc(struct bdi_writeback *wb)
589{
590 unsigned long flags;
591
592 local_irq_save(flags);
593 __wb_writeout_inc(wb);
594 local_irq_restore(flags);
595}
596EXPORT_SYMBOL_GPL(wb_writeout_inc);

/*
 * On idle system, we can be called long after we scheduled because we use
 * deferred timers so count with missed periods.
 */
602static void writeout_period(unsigned long t)
603{
604 struct wb_domain *dom = (void *)t;
605 int miss_periods = (jiffies - dom->period_time) /
606 VM_COMPLETIONS_PERIOD_LEN;
607
608 if (fprop_new_period(&dom->completions, miss_periods + 1)) {
609 dom->period_time = wp_next_time(dom->period_time +
610 miss_periods * VM_COMPLETIONS_PERIOD_LEN);
611 mod_timer(&dom->period_timer, dom->period_time);
612 } else {
613
614
615
616
617 dom->period_time = 0;
618 }
619}
620
621int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
622{
623 memset(dom, 0, sizeof(*dom));
624
625 spin_lock_init(&dom->lock);
626
627 init_timer_deferrable(&dom->period_timer);
628 dom->period_timer.function = writeout_period;
629 dom->period_timer.data = (unsigned long)dom;
630
631 dom->dirty_limit_tstamp = jiffies;
632
633 return fprop_global_init(&dom->completions, gfp);
634}
635
636#ifdef CONFIG_CGROUP_WRITEBACK
637void wb_domain_exit(struct wb_domain *dom)
638{
639 del_timer_sync(&dom->period_timer);
640 fprop_global_destroy(&dom->completions);
641}
642#endif
643
644
645
646
647
648
649static unsigned int bdi_min_ratio;
650
651int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
652{
653 int ret = 0;
654
655 spin_lock_bh(&bdi_lock);
656 if (min_ratio > bdi->max_ratio) {
657 ret = -EINVAL;
658 } else {
659 min_ratio -= bdi->min_ratio;
660 if (bdi_min_ratio + min_ratio < 100) {
661 bdi_min_ratio += min_ratio;
662 bdi->min_ratio += min_ratio;
663 } else {
664 ret = -EINVAL;
665 }
666 }
667 spin_unlock_bh(&bdi_lock);
668
669 return ret;
670}
671
672int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
673{
674 int ret = 0;
675
676 if (max_ratio > 100)
677 return -EINVAL;
678
679 spin_lock_bh(&bdi_lock);
680 if (bdi->min_ratio > max_ratio) {
681 ret = -EINVAL;
682 } else {
683 bdi->max_ratio = max_ratio;
684 bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100;
685 }
686 spin_unlock_bh(&bdi_lock);
687
688 return ret;
689}
690EXPORT_SYMBOL(bdi_set_max_ratio);
691
692static unsigned long dirty_freerun_ceiling(unsigned long thresh,
693 unsigned long bg_thresh)
694{
695 return (thresh + bg_thresh) / 2;
696}
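
/*
 * The freerun ceiling is the midpoint of the background and hard dirty
 * thresholds; below it, balance_dirty_pages() does not throttle tasks at all.
 */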
697
698static unsigned long hard_dirty_limit(struct wb_domain *dom,
699 unsigned long thresh)
700{
701 return max(thresh, dom->dirty_limit);
702}

/*
 * Calculate the memcg domain's dirtyable memory: the memcg's own file pages
 * plus however much clean headroom the global domain can spare it.
 */
708static void mdtc_calc_avail(struct dirty_throttle_control *mdtc,
709 unsigned long filepages, unsigned long headroom)
710{
711 struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc);
712 unsigned long clean = filepages - min(filepages, mdtc->dirty);
713 unsigned long global_clean = gdtc->avail - min(gdtc->avail, gdtc->dirty);
714 unsigned long other_clean = global_clean - min(global_clean, clean);
715
716 mdtc->avail = filepages + min(headroom, other_clean);
717}

/**
 * __wb_calc_thresh - @wb's share of dirty throttling threshold
 * @dtc: dirty_throttle_context of interest
 *
 * Returns @wb's dirty limit in pages. The term "dirty" in the context of
 * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
 *
 * Note that balance_dirty_pages() will only seriously take it as a hard limit
 * when sleeping max_pause per page is not enough to keep the dirty pages under
 * control. For example, when the device is completely stalled due to some
 * error conditions, or when there are 1000 dd tasks writing to a slow 10MB/s
 * USB key.  In the other normal situations, it acts more gently by throttling
 * the tasks more (rather than completely blocking them) when the wb dirty
 * pages go high.
 *
 * It allocates high/low dirty limits to fast/slow devices, in order to prevent
 * - starving fast devices
 * - piling up dirty pages (that will take long time to sync) on slow devices
 *
 * The wb's share of dirty limit will be adapting to its throughput and
 * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
 */
740static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
741{
742 struct wb_domain *dom = dtc_dom(dtc);
743 unsigned long thresh = dtc->thresh;
744 u64 wb_thresh;
745 long numerator, denominator;
746 unsigned long wb_min_ratio, wb_max_ratio;
747
748
749
750
751 fprop_fraction_percpu(&dom->completions, dtc->wb_completions,
752 &numerator, &denominator);
753
754 wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100;
755 wb_thresh *= numerator;
756 do_div(wb_thresh, denominator);
757
758 wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio);
759
760 wb_thresh += (thresh * wb_min_ratio) / 100;
761 if (wb_thresh > (thresh * wb_max_ratio) / 100)
762 wb_thresh = thresh * wb_max_ratio / 100;
763
764 return wb_thresh;
765}
766
767unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
768{
769 struct dirty_throttle_control gdtc = { GDTC_INIT(wb),
770 .thresh = thresh };
771 return __wb_calc_thresh(&gdtc);
772}

/*
 *                           setpoint - dirty 3
 *        f(dirty) := 1.0 + (----------------)
 *                           limit - setpoint
 *
 * it's a 3rd order polynomial that subjects to
 *
 * (1) f(freerun)  = 2.0 => rampup dirty_ratelimit reasonably fast
 * (2) f(setpoint) = 1.0 => the vehicle of dirty pages
 * (3) f(limit)    = 0   => the hard limit
 * (4) df/dx       < 0   => negative feedback control
 * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
 *     => fast response on large errors; small oscillation near setpoint
 */
788static long long pos_ratio_polynom(unsigned long setpoint,
789 unsigned long dirty,
790 unsigned long limit)
791{
792 long long pos_ratio;
793 long x;
794
795 x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
796 (limit - setpoint) | 1);
797 pos_ratio = x;
798 pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
799 pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
800 pos_ratio += 1 << RATELIMIT_CALC_SHIFT;
801
802 return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT);
803}
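
/*
 * Illustrative evaluation (example numbers only): with setpoint = 100k,
 * limit = 200k and dirty = 150k pages, x = (100k - 150k) / (200k - 100k)
 * = -0.5, so f(dirty) = 1.0 + (-0.5)^3 = 0.875: a task past the setpoint is
 * throttled slightly below its long-term ratelimit.
 */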

/*
 * Dirty position control.
 *
 * (o) global/wb setpoints
 *
 * We want the dirty pages to be balanced around the global/wb setpoints.
 * When the number of dirty pages is higher/lower than the setpoint, the
 * dirty position control ratio (and hence task dirty ratelimit) will be
 * decreased/increased to bring the dirty pages back to the setpoint.
 *
 *     pos_ratio = wb_position_ratio()
 *     task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT
 *
 *     if (dirty < setpoint) scale up   pos_ratio
 *     if (dirty > setpoint) scale down pos_ratio
 *
 *     if (wb_dirty < wb_setpoint) scale up   pos_ratio
 *     if (wb_dirty > wb_setpoint) scale down pos_ratio
 *
 *     task_ratelimit is then filtered and applied in balance_dirty_pages().
 *
 * (o) global control line
 *
 * The global control line is the 3rd order polynomial implemented by
 * pos_ratio_polynom(): it equals 2.0 at the freerun ceiling, 1.0 at the
 * setpoint (the midpoint of freerun and limit) and drops to 0 at the hard
 * limit.
 *
 * (o) wb control line
 *
 * The wb control line is a linear function centred on the wb setpoint (the
 * global setpoint scaled by wb_thresh/thresh), with a slope derived from the
 * wb's write bandwidth, plus a reserve-area boost when wb_dirty drops below
 * wb_thresh/2.  The final pos_ratio is the product of the global and wb
 * factors.
 */
880static void wb_position_ratio(struct dirty_throttle_control *dtc)
881{
882 struct bdi_writeback *wb = dtc->wb;
883 unsigned long write_bw = wb->avg_write_bandwidth;
884 unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
885 unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
886 unsigned long wb_thresh = dtc->wb_thresh;
887 unsigned long x_intercept;
888 unsigned long setpoint;
889 unsigned long wb_setpoint;
890 unsigned long span;
891 long long pos_ratio;
892 long x;
893
894 dtc->pos_ratio = 0;
895
896 if (unlikely(dtc->dirty >= limit))
897 return;
898
899
900
901
902
903
904 setpoint = (freerun + limit) / 2;
905 pos_ratio = pos_ratio_polynom(setpoint, dtc->dirty, limit);

 /*
  * The strictlimit feature is a tool preventing mistrusted filesystems
  * from growing a large number of dirty pages before throttling. For
  * such filesystems balance_dirty_pages always checks wb counters
  * against wb limits. Even if global "nr_dirty" is under "freerun".
  * This is especially important for fuse which sets bdi->max_ratio to
  * 1% by default.
  *
  * Here pos_ratio is calculated from wb_dirty and wb_thresh, because for
  * a strictlimit wb we want to start throttling writers long before the
  * global "freerun" ceiling is reached.
  */
932 if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
933 long long wb_pos_ratio;
934
935 if (dtc->wb_dirty < 8) {
936 dtc->pos_ratio = min_t(long long, pos_ratio * 2,
937 2 << RATELIMIT_CALC_SHIFT);
938 return;
939 }
940
941 if (dtc->wb_dirty >= wb_thresh)
942 return;
943
944 wb_setpoint = dirty_freerun_ceiling(wb_thresh,
945 dtc->wb_bg_thresh);
946
947 if (wb_setpoint == 0 || wb_setpoint == wb_thresh)
948 return;
949
950 wb_pos_ratio = pos_ratio_polynom(wb_setpoint, dtc->wb_dirty,
951 wb_thresh);

 /*
  * Typically, for the strictlimit case, wb_setpoint << setpoint and
  * pos_ratio >> wb_pos_ratio, i.e. the global state is usually not the
  * limiting factor and the decision is made on the wb counters.  The
  * global pos_ratio still gets precedence when the global limits are
  * exceeded, hence the min() of the two ratios below.
  */
974 dtc->pos_ratio = min(pos_ratio, wb_pos_ratio);
975 return;
976 }

 /*
  * We have computed the basic pos_ratio above based on the global
  * situation.  If the wb is over/under its share of dirty pages, we want
  * to scale pos_ratio further down/up.  That is done by the following
  * mechanism:
  *
  * wb setpoint
  *
  *        f(wb_dirty) := 1.0 + k * (wb_dirty - wb_setpoint)
  *
  *                        x_intercept - wb_dirty
  *                     := --------------------------
  *                        x_intercept - wb_setpoint
  *
  * The main wb control line is a linear function with
  * (1) f(wb_setpoint) = 1.0
  * (2) x_intercept = wb_setpoint + span, where span is derived from the
  *     wb's write bandwidth (single-wb case) and from wb_thresh (JBOD case).
  */
1009 if (unlikely(wb_thresh > dtc->thresh))
1010 wb_thresh = dtc->thresh;
1011
1012
1013
1014
1015
1016
1017
1018 wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8);
1019
1020
1021
1022
1023 x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1);
1024 wb_setpoint = setpoint * (u64)x >> 16;
1025
1026
1027
1028
1029
1030
1031
1032
1033 span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16;
1034 x_intercept = wb_setpoint + span;
1035
1036 if (dtc->wb_dirty < x_intercept - span / 4) {
1037 pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty),
1038 (x_intercept - wb_setpoint) | 1);
1039 } else
1040 pos_ratio /= 4;
1041
1042
1043
1044
1045
1046
1047 x_intercept = wb_thresh / 2;
1048 if (dtc->wb_dirty < x_intercept) {
1049 if (dtc->wb_dirty > x_intercept / 8)
1050 pos_ratio = div_u64(pos_ratio * x_intercept,
1051 dtc->wb_dirty);
1052 else
1053 pos_ratio *= 8;
1054 }
1055
1056 dtc->pos_ratio = pos_ratio;
1057}
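
/*
 * The pos_ratio computed above is consumed by wb_update_dirty_ratelimit()
 * and balance_dirty_pages() as
 *	task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT.
 */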
1058
1059static void wb_update_write_bandwidth(struct bdi_writeback *wb,
1060 unsigned long elapsed,
1061 unsigned long written)
1062{
1063 const unsigned long period = roundup_pow_of_two(3 * HZ);
1064 unsigned long avg = wb->avg_write_bandwidth;
1065 unsigned long old = wb->write_bandwidth;
1066 u64 bw;

 /*
  * bw = written * HZ / elapsed
  *
  *                   bw * elapsed + write_bandwidth * (period - elapsed)
  * write_bandwidth = ---------------------------------------------------
  *                                          period
  *
  * @written may have decreased due to account_page_redirty().
  * Avoid underflowing @bw calculation.
  */
1078 bw = written - min(written, wb->written_stamp);
1079 bw *= HZ;
1080 if (unlikely(elapsed > period)) {
1081 do_div(bw, elapsed);
1082 avg = bw;
1083 goto out;
1084 }
1085 bw += (u64)wb->write_bandwidth * (period - elapsed);
1086 bw >>= ilog2(period);
1087
1088
1089
1090
1091 if (avg > old && old >= (unsigned long)bw)
1092 avg -= (avg - old) >> 3;
1093
1094 if (avg < old && old <= (unsigned long)bw)
1095 avg += (old - avg) >> 3;
1096
1097out:
1098
1099 avg = max(avg, 1LU);
1100 if (wb_has_dirty_io(wb)) {
1101 long delta = avg - wb->avg_write_bandwidth;
1102 WARN_ON_ONCE(atomic_long_add_return(delta,
1103 &wb->bdi->tot_write_bandwidth) <= 0);
1104 }
1105 wb->write_bandwidth = bw;
1106 wb->avg_write_bandwidth = avg;
1107}
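
/*
 * Example (illustrative): with a period worth about 4 seconds, elapsed = 1
 * second and an instantaneous rate of written * HZ / elapsed, the new sample
 * contributes roughly 1/4 of the updated write_bandwidth and the previous
 * estimate the remaining 3/4, which is the blend computed above.
 */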
1108
1109static void update_dirty_limit(struct dirty_throttle_control *dtc)
1110{
1111 struct wb_domain *dom = dtc_dom(dtc);
1112 unsigned long thresh = dtc->thresh;
1113 unsigned long limit = dom->dirty_limit;
1114
1115
1116
1117
1118 if (limit < thresh) {
1119 limit = thresh;
1120 goto update;
1121 }
1122
1123
1124
1125
1126
1127
1128 thresh = max(thresh, dtc->dirty);
1129 if (limit > thresh) {
1130 limit -= (limit - thresh) >> 5;
1131 goto update;
1132 }
1133 return;
1134update:
1135 dom->dirty_limit = limit;
1136}
1137
1138static void domain_update_bandwidth(struct dirty_throttle_control *dtc,
1139 unsigned long now)
1140{
1141 struct wb_domain *dom = dtc_dom(dtc);
1142
1143
1144
1145
1146 if (time_before(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL))
1147 return;
1148
1149 spin_lock(&dom->lock);
1150 if (time_after_eq(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) {
1151 update_dirty_limit(dtc);
1152 dom->dirty_limit_tstamp = now;
1153 }
1154 spin_unlock(&dom->lock);
1155}

/*
 * Maintain wb->dirty_ratelimit, the base dirty throttle rate.
 *
 * Normal wb tasks will be curbed at or below it in long term.
 * Obviously it should be around (write_bw / N) when there are N dd tasks.
 */
1163static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
1164 unsigned long dirtied,
1165 unsigned long elapsed)
1166{
1167 struct bdi_writeback *wb = dtc->wb;
1168 unsigned long dirty = dtc->dirty;
1169 unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
1170 unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
1171 unsigned long setpoint = (freerun + limit) / 2;
1172 unsigned long write_bw = wb->avg_write_bandwidth;
1173 unsigned long dirty_ratelimit = wb->dirty_ratelimit;
1174 unsigned long dirty_rate;
1175 unsigned long task_ratelimit;
1176 unsigned long balanced_dirty_ratelimit;
1177 unsigned long step;
1178 unsigned long x;
1179 unsigned long shift;
1180
1181
1182
1183
1184
1185 dirty_rate = (dirtied - wb->dirtied_stamp) * HZ / elapsed;
1186
1187
1188
1189
1190 task_ratelimit = (u64)dirty_ratelimit *
1191 dtc->pos_ratio >> RATELIMIT_CALC_SHIFT;
1192 task_ratelimit++;

 /*
  * A linear estimation of the "balanced" throttle rate. The theory is,
  * if there are N dd tasks, each throttled at task_ratelimit, the wb's
  * dirty_rate will be measured to be (N * task_ratelimit). So the
  * balanced_dirty_ratelimit will be (write_bw / N).
  *
  * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate
  *
  * Because it feeds back on an already estimated quantity,
  * balanced_dirty_ratelimit converges towards (write_bw / N) as long as
  * the measured dirty_rate tracks the sum of the tasks' ratelimits.
  */
1224 balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw,
1225 dirty_rate | 1);
1226
1227
1228
1229 if (unlikely(balanced_dirty_ratelimit > write_bw))
1230 balanced_dirty_ratelimit = write_bw;

 /*
  * We could simply take wb->dirty_ratelimit = balanced_dirty_ratelimit
  * and be done.  However that estimate is noisy, so dirty_ratelimit is
  * instead moved towards balanced_dirty_ratelimit in limited steps, and
  * only when task_ratelimit (which reflects the dirty position) agrees
  * about the direction.  This filters out singular points, limits the
  * step size and yields a much more stable base rate.
  */
1266 step = 0;
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279 if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
1280 dirty = dtc->wb_dirty;
1281 if (dtc->wb_dirty < 8)
1282 setpoint = dtc->wb_dirty + 1;
1283 else
1284 setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2;
1285 }
1286
1287 if (dirty < setpoint) {
1288 x = min3(wb->balanced_dirty_ratelimit,
1289 balanced_dirty_ratelimit, task_ratelimit);
1290 if (dirty_ratelimit < x)
1291 step = x - dirty_ratelimit;
1292 } else {
1293 x = max3(wb->balanced_dirty_ratelimit,
1294 balanced_dirty_ratelimit, task_ratelimit);
1295 if (dirty_ratelimit > x)
1296 step = dirty_ratelimit - x;
1297 }
1298
1299
1300
1301
1302
1303
1304 shift = dirty_ratelimit / (2 * step + 1);
1305 if (shift < BITS_PER_LONG)
1306 step = DIV_ROUND_UP(step >> shift, 8);
1307 else
1308 step = 0;
1309
1310 if (dirty_ratelimit < balanced_dirty_ratelimit)
1311 dirty_ratelimit += step;
1312 else
1313 dirty_ratelimit -= step;
1314
1315 wb->dirty_ratelimit = max(dirty_ratelimit, 1UL);
1316 wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
1317
1318 trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit);
1319}
1320
1321static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
1322 struct dirty_throttle_control *mdtc,
1323 unsigned long start_time,
1324 bool update_ratelimit)
1325{
1326 struct bdi_writeback *wb = gdtc->wb;
1327 unsigned long now = jiffies;
1328 unsigned long elapsed = now - wb->bw_time_stamp;
1329 unsigned long dirtied;
1330 unsigned long written;
1331
1332 lockdep_assert_held(&wb->list_lock);
1333
1334
1335
1336
1337 if (elapsed < BANDWIDTH_INTERVAL)
1338 return;
1339
1340 dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]);
1341 written = percpu_counter_read(&wb->stat[WB_WRITTEN]);
1342
1343
1344
1345
1346
1347 if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time))
1348 goto snapshot;
1349
1350 if (update_ratelimit) {
1351 domain_update_bandwidth(gdtc, now);
1352 wb_update_dirty_ratelimit(gdtc, dirtied, elapsed);
1353
1354
1355
1356
1357
1358 if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) {
1359 domain_update_bandwidth(mdtc, now);
1360 wb_update_dirty_ratelimit(mdtc, dirtied, elapsed);
1361 }
1362 }
1363 wb_update_write_bandwidth(wb, elapsed, written);
1364
1365snapshot:
1366 wb->dirtied_stamp = dirtied;
1367 wb->written_stamp = written;
1368 wb->bw_time_stamp = now;
1369}
1370
1371void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time)
1372{
1373 struct dirty_throttle_control gdtc = { GDTC_INIT(wb) };
1374
1375 __wb_update_bandwidth(&gdtc, NULL, start_time, false);
1376}

/*
 * After a task dirtied this many pages, balance_dirty_pages_ratelimited()
 * will look to see if it needs to start dirty throttling.
 *
 * If dirty_poll_interval is too low, big NUMA machines will call the expensive
 * global_page_state() too often. So scale it near-sqrt to the safety margin
 * (the number of pages we may dirty without exceeding the dirty limits).
 */
1386static unsigned long dirty_poll_interval(unsigned long dirty,
1387 unsigned long thresh)
1388{
1389 if (thresh > dirty)
1390 return 1UL << (ilog2(thresh - dirty) >> 1);
1391
1392 return 1;
1393}
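
/*
 * E.g. (illustrative): with a safety margin of thresh - dirty = 65536 pages,
 * ilog2(65536) = 16 and the task may dirty 1 << 8 = 256 pages (about the
 * square root of the margin) before calling balance_dirty_pages() again.
 */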
1394
1395static unsigned long wb_max_pause(struct bdi_writeback *wb,
1396 unsigned long wb_dirty)
1397{
1398 unsigned long bw = wb->avg_write_bandwidth;
1399 unsigned long t;

 /*
  * Limit pause time for small memory systems. If sleeping for too long
  * time, a small pool of dirty/writeback pages may go empty and disk go
  * idle.
  *
  * 8 serves as the safety ratio.
  */
1408 t = wb_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
1409 t++;
1410
1411 return min_t(unsigned long, t, MAX_PAUSE);
1412}
1413
1414static long wb_min_pause(struct bdi_writeback *wb,
1415 long max_pause,
1416 unsigned long task_ratelimit,
1417 unsigned long dirty_ratelimit,
1418 int *nr_dirtied_pause)
1419{
1420 long hi = ilog2(wb->avg_write_bandwidth);
1421 long lo = ilog2(wb->dirty_ratelimit);
1422 long t;
1423 long pause;
1424 int pages;
1425
1426
1427 t = max(1, HZ / 100);
1428
1429
1430
1431
1432
1433
1434
1435 if (hi > lo)
1436 t += (hi - lo) * (10 * HZ) / 1024;
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456 t = min(t, 1 + max_pause / 2);
1457 pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467 if (pages < DIRTY_POLL_THRESH) {
1468 t = max_pause;
1469 pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);
1470 if (pages > DIRTY_POLL_THRESH) {
1471 pages = DIRTY_POLL_THRESH;
1472 t = HZ * DIRTY_POLL_THRESH / dirty_ratelimit;
1473 }
1474 }
1475
1476 pause = HZ * pages / (task_ratelimit + 1);
1477 if (pause > max_pause) {
1478 t = max_pause;
1479 pages = task_ratelimit * t / roundup_pow_of_two(HZ);
1480 }
1481
1482 *nr_dirtied_pause = pages;
1483
1484
1485
1486 return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t;
1487}
1488
1489static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
1490{
1491 struct bdi_writeback *wb = dtc->wb;
1492 unsigned long wb_reclaimable;
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507 dtc->wb_thresh = __wb_calc_thresh(dtc);
1508 dtc->wb_bg_thresh = dtc->thresh ?
1509 div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0;
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521 if (dtc->wb_thresh < 2 * wb_stat_error(wb)) {
1522 wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
1523 dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK);
1524 } else {
1525 wb_reclaimable = wb_stat(wb, WB_RECLAIMABLE);
1526 dtc->wb_dirty = wb_reclaimable + wb_stat(wb, WB_WRITEBACK);
1527 }
1528}

/*
 * balance_dirty_pages() must be called by processes which are generating dirty
 * data.  It looks at the number of dirty pages in the machine and will force
 * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.
 * If we're over `background_thresh' then the writeback threads are woken to
 * perform some writeout.
 */
1537static void balance_dirty_pages(struct address_space *mapping,
1538 struct bdi_writeback *wb,
1539 unsigned long pages_dirtied)
1540{
1541 struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
1542 struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
1543 struct dirty_throttle_control * const gdtc = &gdtc_stor;
1544 struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
1545 &mdtc_stor : NULL;
1546 struct dirty_throttle_control *sdtc;
1547 unsigned long nr_reclaimable;
1548 long period;
1549 long pause;
1550 long max_pause;
1551 long min_pause;
1552 int nr_dirtied_pause;
1553 bool dirty_exceeded = false;
1554 unsigned long task_ratelimit;
1555 unsigned long dirty_ratelimit;
1556 struct backing_dev_info *bdi = wb->bdi;
1557 bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
1558 unsigned long start_time = jiffies;
1559
1560 for (;;) {
1561 unsigned long now = jiffies;
1562 unsigned long dirty, thresh, bg_thresh;
1563 unsigned long m_dirty = 0;
1564 unsigned long m_thresh = 0;
1565 unsigned long m_bg_thresh = 0;
1566
1567
1568
1569
1570
1571
1572
1573 nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
1574 global_page_state(NR_UNSTABLE_NFS);
1575 gdtc->avail = global_dirtyable_memory();
1576 gdtc->dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
1577
1578 domain_dirty_limits(gdtc);
1579
1580 if (unlikely(strictlimit)) {
1581 wb_dirty_limits(gdtc);
1582
1583 dirty = gdtc->wb_dirty;
1584 thresh = gdtc->wb_thresh;
1585 bg_thresh = gdtc->wb_bg_thresh;
1586 } else {
1587 dirty = gdtc->dirty;
1588 thresh = gdtc->thresh;
1589 bg_thresh = gdtc->bg_thresh;
1590 }
1591
1592 if (mdtc) {
1593 unsigned long filepages, headroom, writeback;
1594
1595
1596
1597
1598
1599 mem_cgroup_wb_stats(wb, &filepages, &headroom,
1600 &mdtc->dirty, &writeback);
1601 mdtc->dirty += writeback;
1602 mdtc_calc_avail(mdtc, filepages, headroom);
1603
1604 domain_dirty_limits(mdtc);
1605
1606 if (unlikely(strictlimit)) {
1607 wb_dirty_limits(mdtc);
1608 m_dirty = mdtc->wb_dirty;
1609 m_thresh = mdtc->wb_thresh;
1610 m_bg_thresh = mdtc->wb_bg_thresh;
1611 } else {
1612 m_dirty = mdtc->dirty;
1613 m_thresh = mdtc->thresh;
1614 m_bg_thresh = mdtc->bg_thresh;
1615 }
1616 }
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630 if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
1631 (!mdtc ||
1632 m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
1633 unsigned long intv = dirty_poll_interval(dirty, thresh);
1634 unsigned long m_intv = ULONG_MAX;
1635
1636 current->dirty_paused_when = now;
1637 current->nr_dirtied = 0;
1638 if (mdtc)
1639 m_intv = dirty_poll_interval(m_dirty, m_thresh);
1640 current->nr_dirtied_pause = min(intv, m_intv);
1641 break;
1642 }
1643
1644 if (unlikely(!writeback_in_progress(wb)))
1645 wb_start_background_writeback(wb);
1646
1647
1648
1649
1650
1651 if (!strictlimit)
1652 wb_dirty_limits(gdtc);
1653
1654 dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) &&
1655 ((gdtc->dirty > gdtc->thresh) || strictlimit);
1656
1657 wb_position_ratio(gdtc);
1658 sdtc = gdtc;
1659
1660 if (mdtc) {
1661
1662
1663
1664
1665
1666
1667 if (!strictlimit)
1668 wb_dirty_limits(mdtc);
1669
1670 dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
1671 ((mdtc->dirty > mdtc->thresh) || strictlimit);
1672
1673 wb_position_ratio(mdtc);
1674 if (mdtc->pos_ratio < gdtc->pos_ratio)
1675 sdtc = mdtc;
1676 }
1677
1678 if (dirty_exceeded && !wb->dirty_exceeded)
1679 wb->dirty_exceeded = 1;
1680
1681 if (time_is_before_jiffies(wb->bw_time_stamp +
1682 BANDWIDTH_INTERVAL)) {
1683 spin_lock(&wb->list_lock);
1684 __wb_update_bandwidth(gdtc, mdtc, start_time, true);
1685 spin_unlock(&wb->list_lock);
1686 }
1687
1688
1689 dirty_ratelimit = wb->dirty_ratelimit;
1690 task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >>
1691 RATELIMIT_CALC_SHIFT;
1692 max_pause = wb_max_pause(wb, sdtc->wb_dirty);
1693 min_pause = wb_min_pause(wb, max_pause,
1694 task_ratelimit, dirty_ratelimit,
1695 &nr_dirtied_pause);
1696
1697 if (unlikely(task_ratelimit == 0)) {
1698 period = max_pause;
1699 pause = max_pause;
1700 goto pause;
1701 }
1702 period = HZ * pages_dirtied / task_ratelimit;
1703 pause = period;
1704 if (current->dirty_paused_when)
1705 pause -= now - current->dirty_paused_when;
1706
1707
1708
1709
1710
1711
1712
1713 if (pause < min_pause) {
1714 trace_balance_dirty_pages(wb,
1715 sdtc->thresh,
1716 sdtc->bg_thresh,
1717 sdtc->dirty,
1718 sdtc->wb_thresh,
1719 sdtc->wb_dirty,
1720 dirty_ratelimit,
1721 task_ratelimit,
1722 pages_dirtied,
1723 period,
1724 min(pause, 0L),
1725 start_time);
1726 if (pause < -HZ) {
1727 current->dirty_paused_when = now;
1728 current->nr_dirtied = 0;
1729 } else if (period) {
1730 current->dirty_paused_when += period;
1731 current->nr_dirtied = 0;
1732 } else if (current->nr_dirtied_pause <= pages_dirtied)
1733 current->nr_dirtied_pause += pages_dirtied;
1734 break;
1735 }
1736 if (unlikely(pause > max_pause)) {
1737
1738 now += min(pause - max_pause, max_pause);
1739 pause = max_pause;
1740 }
1741
1742pause:
1743 trace_balance_dirty_pages(wb,
1744 sdtc->thresh,
1745 sdtc->bg_thresh,
1746 sdtc->dirty,
1747 sdtc->wb_thresh,
1748 sdtc->wb_dirty,
1749 dirty_ratelimit,
1750 task_ratelimit,
1751 pages_dirtied,
1752 period,
1753 pause,
1754 start_time);
1755 __set_current_state(TASK_KILLABLE);
1756 io_schedule_timeout(pause);
1757
1758 current->dirty_paused_when = now + pause;
1759 current->nr_dirtied = 0;
1760 current->nr_dirtied_pause = nr_dirtied_pause;
1761
1762
1763
1764
1765
1766 if (task_ratelimit)
1767 break;
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779 if (sdtc->wb_dirty <= wb_stat_error(wb))
1780 break;
1781
1782 if (fatal_signal_pending(current))
1783 break;
1784 }
1785
1786 if (!dirty_exceeded && wb->dirty_exceeded)
1787 wb->dirty_exceeded = 0;
1788
1789 if (writeback_in_progress(wb))
1790 return;
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800 if (laptop_mode)
1801 return;
1802
1803 if (nr_reclaimable > gdtc->bg_thresh)
1804 wb_start_background_writeback(wb);
1805}
1806
1807static DEFINE_PER_CPU(int, bdp_ratelimits);

/*
 * Normal tasks are throttled by
 *	loop {
 *		dirty tsk->nr_dirtied_pause pages;
 *		take a snap in balance_dirty_pages();
 *	}
 * However there is a worst case. If every task exit immediately when dirtied
 * (tsk->nr_dirtied_pause - 1) pages, balance_dirty_pages() will never be
 * called to throttle the page dirties. The solution is to save the not yet
 * throttled page dirties in dirty_throttle_leaks on task exit and charge them
 * to the newly dirtying tasks in the later balance_dirty_pages().
 */
1823DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;

/**
 * balance_dirty_pages_ratelimited - balance dirty memory state
 * @mapping: address_space which was dirtied
 *
 * Processes which are dirtying memory should call in here once for each page
 * which was newly dirtied.  The function will periodically check the system's
 * dirty state and will initiate writeback if needed.
 *
 * On really big machines, checking the global dirty state is expensive, so
 * try to avoid doing it too often (ratelimiting).  But once we're over the
 * dirty memory limit we decrease the ratelimiting by a lot, to prevent
 * individual processes from overshooting the limit by (ratelimit_pages) each.
 */
1838void balance_dirty_pages_ratelimited(struct address_space *mapping)
1839{
1840 struct inode *inode = mapping->host;
1841 struct backing_dev_info *bdi = inode_to_bdi(inode);
1842 struct bdi_writeback *wb = NULL;
1843 int ratelimit;
1844 int *p;
1845
1846 if (!bdi_cap_account_dirty(bdi))
1847 return;
1848
1849 if (inode_cgwb_enabled(inode))
1850 wb = wb_get_create_current(bdi, GFP_KERNEL);
1851 if (!wb)
1852 wb = &bdi->wb;
1853
1854 ratelimit = current->nr_dirtied_pause;
1855 if (wb->dirty_exceeded)
1856 ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));
1857
1858 preempt_disable();
1859
1860
1861
1862
1863
1864
1865 p = this_cpu_ptr(&bdp_ratelimits);
1866 if (unlikely(current->nr_dirtied >= ratelimit))
1867 *p = 0;
1868 else if (unlikely(*p >= ratelimit_pages)) {
1869 *p = 0;
1870 ratelimit = 0;
1871 }
1872
1873
1874
1875
1876
1877 p = this_cpu_ptr(&dirty_throttle_leaks);
1878 if (*p > 0 && current->nr_dirtied < ratelimit) {
1879 unsigned long nr_pages_dirtied;
1880 nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
1881 *p -= nr_pages_dirtied;
1882 current->nr_dirtied += nr_pages_dirtied;
1883 }
1884 preempt_enable();
1885
1886 if (unlikely(current->nr_dirtied >= ratelimit))
1887 balance_dirty_pages(mapping, wb, current->nr_dirtied);
1888
1889 wb_put(wb);
1890}
1891EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
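
/*
 * Typical call site (sketch): filesystems invoke
 * balance_dirty_pages_ratelimited() once per newly dirtied page in their
 * buffered write path, e.g. from generic_perform_write() after the page has
 * been marked dirty.
 */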

/**
 * wb_over_bg_thresh - does @wb need to be written back?
 * @wb: bdi_writeback of interest
 *
 * Determines whether background writeback should keep writing @wb or it's
 * clean enough.  Returns %true if writeback should continue.
 */
1900bool wb_over_bg_thresh(struct bdi_writeback *wb)
1901{
1902 struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
1903 struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
1904 struct dirty_throttle_control * const gdtc = &gdtc_stor;
1905 struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
1906 &mdtc_stor : NULL;
1907
1908
1909
1910
1911
1912 gdtc->avail = global_dirtyable_memory();
1913 gdtc->dirty = global_page_state(NR_FILE_DIRTY) +
1914 global_page_state(NR_UNSTABLE_NFS);
1915 domain_dirty_limits(gdtc);
1916
1917 if (gdtc->dirty > gdtc->bg_thresh)
1918 return true;
1919
1920 if (wb_stat(wb, WB_RECLAIMABLE) >
1921 wb_calc_thresh(gdtc->wb, gdtc->bg_thresh))
1922 return true;
1923
1924 if (mdtc) {
1925 unsigned long filepages, headroom, writeback;
1926
1927 mem_cgroup_wb_stats(wb, &filepages, &headroom, &mdtc->dirty,
1928 &writeback);
1929 mdtc_calc_avail(mdtc, filepages, headroom);
1930 domain_dirty_limits(mdtc);
1931
1932 if (mdtc->dirty > mdtc->bg_thresh)
1933 return true;
1934
1935 if (wb_stat(wb, WB_RECLAIMABLE) >
1936 wb_calc_thresh(mdtc->wb, mdtc->bg_thresh))
1937 return true;
1938 }
1939
1940 return false;
1941}
1942
1943void throttle_vm_writeout(gfp_t gfp_mask)
1944{
1945 unsigned long background_thresh;
1946 unsigned long dirty_thresh;
1947
1948 for ( ; ; ) {
1949 global_dirty_limits(&background_thresh, &dirty_thresh);
1950 dirty_thresh = hard_dirty_limit(&global_wb_domain, dirty_thresh);
1951
1952
1953
1954
1955
1956 dirty_thresh += dirty_thresh / 10;
1957
1958 if (global_page_state(NR_UNSTABLE_NFS) +
1959 global_page_state(NR_WRITEBACK) <= dirty_thresh)
1960 break;
1961 congestion_wait(BLK_RW_ASYNC, HZ/10);
1962
1963
1964
1965
1966
1967
1968 if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO))
1969 break;
1970 }
1971}
1972
1973
1974
1975
1976int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
1977 void __user *buffer, size_t *length, loff_t *ppos)
1978{
1979 proc_dointvec(table, write, buffer, length, ppos);
1980 return 0;
1981}
1982
1983#ifdef CONFIG_BLOCK
1984void laptop_mode_timer_fn(unsigned long data)
1985{
1986 struct request_queue *q = (struct request_queue *)data;
1987 int nr_pages = global_page_state(NR_FILE_DIRTY) +
1988 global_page_state(NR_UNSTABLE_NFS);
1989 struct bdi_writeback *wb;
1990
1991
1992
1993
1994
1995 if (!bdi_has_dirty_io(&q->backing_dev_info))
1996 return;
1997
1998 rcu_read_lock();
1999 list_for_each_entry_rcu(wb, &q->backing_dev_info.wb_list, bdi_node)
2000 if (wb_has_dirty_io(wb))
2001 wb_start_writeback(wb, nr_pages, true,
2002 WB_REASON_LAPTOP_TIMER);
2003 rcu_read_unlock();
2004}
2005
2006
2007
2008
2009
2010
2011void laptop_io_completion(struct backing_dev_info *info)
2012{
2013 mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode);
2014}
2015
2016
2017
2018
2019
2020
2021void laptop_sync_completion(void)
2022{
2023 struct backing_dev_info *bdi;
2024
2025 rcu_read_lock();
2026
2027 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
2028 del_timer(&bdi->laptop_mode_wb_timer);
2029
2030 rcu_read_unlock();
2031}
2032#endif

/*
 * If ratelimit_pages is too high then we can get into dirty-data overload
 * if a large number of processes all perform writes at the same time.
 * If it is too low then SMP machines will call the (expensive) check of the
 * global dirty state too often.
 *
 * Here we set ratelimit_pages to a level which ensures that when all CPUs are
 * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
 * thresholds.
 */
2045void writeback_set_ratelimit(void)
2046{
2047 struct wb_domain *dom = &global_wb_domain;
2048 unsigned long background_thresh;
2049 unsigned long dirty_thresh;
2050
2051 global_dirty_limits(&background_thresh, &dirty_thresh);
2052 dom->dirty_limit = dirty_thresh;
2053 ratelimit_pages = dirty_thresh / (num_online_cpus() * 32);
2054 if (ratelimit_pages < 16)
2055 ratelimit_pages = 16;
2056}
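
/*
 * Example (illustrative): with a dirty threshold of 200,000 pages and 8
 * online CPUs, ratelimit_pages = 200000 / (8 * 32) = 781, so each CPU may
 * dirty roughly 781 pages between checks, bounding the collective overshoot
 * to about 1/32 of the dirty threshold.
 */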
2057
2058static int
2059ratelimit_handler(struct notifier_block *self, unsigned long action,
2060 void *hcpu)
2061{
2062
2063 switch (action & ~CPU_TASKS_FROZEN) {
2064 case CPU_ONLINE:
2065 case CPU_DEAD:
2066 writeback_set_ratelimit();
2067 return NOTIFY_OK;
2068 default:
2069 return NOTIFY_DONE;
2070 }
2071}
2072
2073static struct notifier_block ratelimit_nb = {
2074 .notifier_call = ratelimit_handler,
2075 .next = NULL,
2076};

/*
 * Called early on to tune the page writeback dirty limits.
 *
 * We used to scale dirty pages according to how total memory related to pages
 * that could be allocated for buffers, but that is no longer the case:
 * "dirty_ratio" is now applied to dirtyable (non-highmem by default) memory
 * only, so the old pathological cases cannot occur any more.  All that is
 * left to do here is initialize the global wb_domain, compute ratelimit_pages
 * and keep it updated as CPUs come and go.
 */
2096void __init page_writeback_init(void)
2097{
2098 BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL));
2099
2100 writeback_set_ratelimit();
2101 register_cpu_notifier(&ratelimit_nb);
2102}

/**
 * tag_pages_for_writeback - tag pages to be written by write_cache_pages
 * @mapping: address space structure to write
 * @start: starting page index
 * @end: ending page index (inclusive)
 *
 * This function scans the page range from @start to @end (inclusive) and tags
 * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is
 * that write_cache_pages (or whoever calls this function) will then use
 * TOWRITE tag to identify pages eligible for writeback.  This mechanism is
 * used to avoid livelocking of writeback by a process steadily creating new
 * dirty pages in the file (thus it is important for this function to be quick
 * so that it can tag pages faster than a dirtying process can create them).
 */
2121void tag_pages_for_writeback(struct address_space *mapping,
2122 pgoff_t start, pgoff_t end)
2123{
2124#define WRITEBACK_TAG_BATCH 4096
2125 unsigned long tagged;
2126
2127 do {
2128 spin_lock_irq(&mapping->tree_lock);
2129 tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree,
2130 &start, end, WRITEBACK_TAG_BATCH,
2131 PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE);
2132 spin_unlock_irq(&mapping->tree_lock);
2133 WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH);
2134 cond_resched();
2135
2136 } while (tagged >= WRITEBACK_TAG_BATCH && start);
2137}
2138EXPORT_SYMBOL(tag_pages_for_writeback);

/**
 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
 * @mapping: address space structure to write
 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
 * @writepage: function called for each page
 * @data: data passed to writepage function
 *
 * If a page is already under I/O, write_cache_pages() skips it, even
 * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
 * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
 * and msync() need to guarantee that all the data which was dirty at the time
 * the call was made get new I/O started against them.  If wbc->sync_mode is
 * WB_SYNC_ALL then we were called for data integrity and we must wait for
 * existing IO to complete.
 *
 * To avoid livelocks (when other process dirties new pages), we first tag
 * pages which should be written back with TOWRITE tag and only then start
 * writing them. For data-integrity sync we have to be careful so that we do
 * not miss some pages (e.g., because some other process has cleared TOWRITE
 * tag we set). The rule we follow is that TOWRITE tag can be cleared only
 * by the process clearing the DIRTY tag (and submitting the page for IO).
 */
2162int write_cache_pages(struct address_space *mapping,
2163 struct writeback_control *wbc, writepage_t writepage,
2164 void *data)
2165{
2166 int ret = 0;
2167 int done = 0;
2168 struct pagevec pvec;
2169 int nr_pages;
2170 pgoff_t uninitialized_var(writeback_index);
2171 pgoff_t index;
2172 pgoff_t end;
2173 pgoff_t done_index;
2174 int cycled;
2175 int range_whole = 0;
2176 int tag;
2177
2178 pagevec_init(&pvec, 0);
2179 if (wbc->range_cyclic) {
2180 writeback_index = mapping->writeback_index;
2181 index = writeback_index;
2182 if (index == 0)
2183 cycled = 1;
2184 else
2185 cycled = 0;
2186 end = -1;
2187 } else {
2188 index = wbc->range_start >> PAGE_SHIFT;
2189 end = wbc->range_end >> PAGE_SHIFT;
2190 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2191 range_whole = 1;
2192 cycled = 1;
2193 }
2194 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2195 tag = PAGECACHE_TAG_TOWRITE;
2196 else
2197 tag = PAGECACHE_TAG_DIRTY;
2198retry:
2199 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2200 tag_pages_for_writeback(mapping, index, end);
2201 done_index = index;
2202 while (!done && (index <= end)) {
2203 int i;
2204
2205 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
2206 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
2207 if (nr_pages == 0)
2208 break;
2209
2210 for (i = 0; i < nr_pages; i++) {
2211 struct page *page = pvec.pages[i];
2212
2213
2214
2215
2216
2217
2218
2219
2220 if (page->index > end) {
2221
2222
2223
2224
2225 done = 1;
2226 break;
2227 }
2228
2229 done_index = page->index;
2230
2231 lock_page(page);
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241 if (unlikely(page->mapping != mapping)) {
2242continue_unlock:
2243 unlock_page(page);
2244 continue;
2245 }
2246
2247 if (!PageDirty(page)) {
2248
2249 goto continue_unlock;
2250 }
2251
2252 if (PageWriteback(page)) {
2253 if (wbc->sync_mode != WB_SYNC_NONE)
2254 wait_on_page_writeback(page);
2255 else
2256 goto continue_unlock;
2257 }
2258
2259 BUG_ON(PageWriteback(page));
2260 if (!clear_page_dirty_for_io(page))
2261 goto continue_unlock;
2262
2263 trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
2264 ret = (*writepage)(page, wbc, data);
2265 if (unlikely(ret)) {
2266 if (ret == AOP_WRITEPAGE_ACTIVATE) {
2267 unlock_page(page);
2268 ret = 0;
2269 } else {
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279 done_index = page->index + 1;
2280 done = 1;
2281 break;
2282 }
2283 }
2284
2285
2286
2287
2288
2289
2290
2291 if (--wbc->nr_to_write <= 0 &&
2292 wbc->sync_mode == WB_SYNC_NONE) {
2293 done = 1;
2294 break;
2295 }
2296 }
2297 pagevec_release(&pvec);
2298 cond_resched();
2299 }
2300 if (!cycled && !done) {
2301
2302
2303
2304
2305
2306 cycled = 1;
2307 index = 0;
2308 end = writeback_index - 1;
2309 goto retry;
2310 }
2311 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
2312 mapping->writeback_index = done_index;
2313
2314 return ret;
2315}
2316EXPORT_SYMBOL(write_cache_pages);
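
/*
 * Usage sketch (illustrative, hypothetical filesystem code): ->writepages()
 * implementations typically wrap write_cache_pages() with their own
 * writepage_t callback, much like generic_writepages() below does:
 *
 *	static int example_writepages(struct address_space *mapping,
 *				      struct writeback_control *wbc)
 *	{
 *		return write_cache_pages(mapping, wbc, example_writepage,
 *					 mapping);
 *	}
 */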
2317
2318
2319
2320
2321
2322static int __writepage(struct page *page, struct writeback_control *wbc,
2323 void *data)
2324{
2325 struct address_space *mapping = data;
2326 int ret = mapping->a_ops->writepage(page, wbc);
2327 mapping_set_error(mapping, ret);
2328 return ret;
2329}

/**
 * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them.
 * @mapping: address space structure to write
 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
 *
 * This is a library function, which implements the writepages()
 * address_space_operation.
 */
2339int generic_writepages(struct address_space *mapping,
2340 struct writeback_control *wbc)
2341{
2342 struct blk_plug plug;
2343 int ret;
2344
2345
2346 if (!mapping->a_ops->writepage)
2347 return 0;
2348
2349 blk_start_plug(&plug);
2350 ret = write_cache_pages(mapping, wbc, __writepage, mapping);
2351 blk_finish_plug(&plug);
2352 return ret;
2353}
2354
2355EXPORT_SYMBOL(generic_writepages);
2356
2357int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
2358{
2359 int ret;
2360
2361 if (wbc->nr_to_write <= 0)
2362 return 0;
2363 if (mapping->a_ops->writepages)
2364 ret = mapping->a_ops->writepages(mapping, wbc);
2365 else
2366 ret = generic_writepages(mapping, wbc);
2367 return ret;
2368}

/**
 * write_one_page - write out a single page and optionally wait on I/O
 * @page: the page to write
 * @wait: if true, wait on writeout
 *
 * The page must be locked by the caller and will be unlocked upon return.
 *
 * write_one_page() returns a negative error code if I/O failed.
 */
2379int write_one_page(struct page *page, int wait)
2380{
2381 struct address_space *mapping = page->mapping;
2382 int ret = 0;
2383 struct writeback_control wbc = {
2384 .sync_mode = WB_SYNC_ALL,
2385 .nr_to_write = 1,
2386 };
2387
2388 BUG_ON(!PageLocked(page));
2389
2390 if (wait)
2391 wait_on_page_writeback(page);
2392
2393 if (clear_page_dirty_for_io(page)) {
2394 get_page(page);
2395 ret = mapping->a_ops->writepage(page, &wbc);
2396 if (ret == 0 && wait) {
2397 wait_on_page_writeback(page);
2398 if (PageError(page))
2399 ret = -EIO;
2400 }
2401 put_page(page);
2402 } else {
2403 unlock_page(page);
2404 }
2405 return ret;
2406}
2407EXPORT_SYMBOL(write_one_page);
2408
2409
2410
2411
2412int __set_page_dirty_no_writeback(struct page *page)
2413{
2414 if (!PageDirty(page))
2415 return !TestSetPageDirty(page);
2416 return 0;
2417}

/*
 * Helper function for set_page_dirty family.
 *
 * Caller must hold lock_page_memcg().
 *
 * NOTE: This relies on being atomic wrt interrupts.
 */
2426void account_page_dirtied(struct page *page, struct address_space *mapping)
2427{
2428 struct inode *inode = mapping->host;
2429
2430 trace_writeback_dirty_page(page, mapping);
2431
2432 if (mapping_cap_account_dirty(mapping)) {
2433 struct bdi_writeback *wb;
2434
2435 inode_attach_wb(inode, page);
2436 wb = inode_to_wb(inode);
2437
2438 mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_DIRTY);
2439 __inc_zone_page_state(page, NR_FILE_DIRTY);
2440 __inc_zone_page_state(page, NR_DIRTIED);
2441 __inc_wb_stat(wb, WB_RECLAIMABLE);
2442 __inc_wb_stat(wb, WB_DIRTIED);
2443 task_io_account_write(PAGE_SIZE);
2444 current->nr_dirtied++;
2445 this_cpu_inc(bdp_ratelimits);
2446 }
2447}
2448EXPORT_SYMBOL(account_page_dirtied);
2449
2450
2451
2452
2453
2454
2455void account_page_cleaned(struct page *page, struct address_space *mapping,
2456 struct bdi_writeback *wb)
2457{
2458 if (mapping_cap_account_dirty(mapping)) {
2459 mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
2460 dec_zone_page_state(page, NR_FILE_DIRTY);
2461 dec_wb_stat(wb, WB_RECLAIMABLE);
2462 task_io_account_cancelled_write(PAGE_SIZE);
2463 }
2464}

/*
 * For address_spaces which do not use buffers.  Just tag the page as dirty in
 * its radix tree.
 *
 * This is also used when a single buffer is being dirtied: we want to set the
 * page dirty in that case, but not all the buffers.  This is a "bottom-up"
 * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
 *
 * The caller must ensure this doesn't race with truncation.  Most will simply
 * hold the page lock, but e.g. zap_pte_range() calls with the page mapped and
 * the pte lock held, which also locks out truncation.
 */
2478int __set_page_dirty_nobuffers(struct page *page)
2479{
2480 lock_page_memcg(page);
2481 if (!TestSetPageDirty(page)) {
2482 struct address_space *mapping = page_mapping(page);
2483 unsigned long flags;
2484
2485 if (!mapping) {
2486 unlock_page_memcg(page);
2487 return 1;
2488 }
2489
2490 spin_lock_irqsave(&mapping->tree_lock, flags);
2491 BUG_ON(page_mapping(page) != mapping);
2492 WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
2493 account_page_dirtied(page, mapping);
2494 radix_tree_tag_set(&mapping->page_tree, page_index(page),
2495 PAGECACHE_TAG_DIRTY);
2496 spin_unlock_irqrestore(&mapping->tree_lock, flags);
2497 unlock_page_memcg(page);
2498
2499 if (mapping->host) {
2500
2501 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
2502 }
2503 return 1;
2504 }
2505 unlock_page_memcg(page);
2506 return 0;
2507}
2508EXPORT_SYMBOL(__set_page_dirty_nobuffers);

/*
 * Call this whenever redirtying a page, to de-account the dirty counters
 * (NR_DIRTIED, WB_DIRTIED, tsk->nr_dirtied), so that they match the written
 * counters (NR_WRITTEN, WB_WRITTEN) in long term. The mismatches will lead to
 * systematic errors in balanced_dirty_ratelimit and the dirty pages position
 * control.
 */
2517void account_page_redirty(struct page *page)
2518{
2519 struct address_space *mapping = page->mapping;
2520
2521 if (mapping && mapping_cap_account_dirty(mapping)) {
2522 struct inode *inode = mapping->host;
2523 struct bdi_writeback *wb;
2524 bool locked;
2525
2526 wb = unlocked_inode_to_wb_begin(inode, &locked);
2527 current->nr_dirtied--;
2528 dec_zone_page_state(page, NR_DIRTIED);
2529 dec_wb_stat(wb, WB_DIRTIED);
2530 unlocked_inode_to_wb_end(inode, locked);
2531 }
2532}
2533EXPORT_SYMBOL(account_page_redirty);

/*
 * When a writepage implementation decides that it doesn't want to write this
 * page for some reason, it should redirty the locked page via
 * redirty_page_for_writepage() and it should then unlock the page and return 0.
 */
2540int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
2541{
2542 int ret;
2543
2544 wbc->pages_skipped++;
2545 ret = __set_page_dirty_nobuffers(page);
2546 account_page_redirty(page);
2547 return ret;
2548}
2549EXPORT_SYMBOL(redirty_page_for_writepage);

/*
 * Dirty a page.
 *
 * For pages with a mapping this should be done under the page lock
 * for the benefit of asynchronous memory errors who prefer a consistent
 * dirty state. This rule can be broken in some special cases,
 * but should be better not to.
 *
 * If the mapping doesn't provide a set_page_dirty a_op, then
 * just fall through and assume that it wants buffer_heads.
 */
2562int set_page_dirty(struct page *page)
2563{
2564 struct address_space *mapping = page_mapping(page);
2565
2566 if (likely(mapping)) {
2567 int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578 if (PageReclaim(page))
2579 ClearPageReclaim(page);
2580#ifdef CONFIG_BLOCK
2581 if (!spd)
2582 spd = __set_page_dirty_buffers;
2583#endif
2584 return (*spd)(page);
2585 }
2586 if (!PageDirty(page)) {
2587 if (!TestSetPageDirty(page))
2588 return 1;
2589 }
2590 return 0;
2591}
2592EXPORT_SYMBOL(set_page_dirty);

/*
 * set_page_dirty() is racy if the caller has no reference against
 * page->mapping->host, and if the page is unlocked.  This is because another
 * CPU could truncate the page off the mapping and then free the mapping.
 *
 * Usually, the page _is_ locked, or the caller is a user-space process which
 * holds a reference on the inode by having an open file.
 *
 * In other cases, the page should be locked before running set_page_dirty().
 */
2604int set_page_dirty_lock(struct page *page)
2605{
2606 int ret;
2607
2608 lock_page(page);
2609 ret = set_page_dirty(page);
2610 unlock_page(page);
2611 return ret;
2612}
2613EXPORT_SYMBOL(set_page_dirty_lock);

/*
 * This cancels just the dirty bit on the kernel page itself, it does NOT
 * actually remove dirty bits on any mmap's that may be around. It also
 * leaves the page tagged dirty, so any sync activity will still find it on
 * the dirty lists, and in particular, clear_page_dirty_for_io() will still
 * look at the dirty bits in the VM.
 *
 * Doing this should *normally* only ever be done when a page is truncated,
 * and is not actually mapped anywhere at all. However, fs/buffer.c does
 * this when it notices that somebody has cleaned out all the buffers on a
 * page without actually doing it through the VM. Very weird, but I suppose
 * it sort of makes sense.
 */
2628void cancel_dirty_page(struct page *page)
2629{
2630 struct address_space *mapping = page_mapping(page);
2631
2632 if (mapping_cap_account_dirty(mapping)) {
2633 struct inode *inode = mapping->host;
2634 struct bdi_writeback *wb;
2635 bool locked;
2636
2637 lock_page_memcg(page);
2638 wb = unlocked_inode_to_wb_begin(inode, &locked);
2639
2640 if (TestClearPageDirty(page))
2641 account_page_cleaned(page, mapping, wb);
2642
2643 unlocked_inode_to_wb_end(inode, locked);
2644 unlock_page_memcg(page);
2645 } else {
2646 ClearPageDirty(page);
2647 }
2648}
2649EXPORT_SYMBOL(cancel_dirty_page);

/*
 * Clear a page's dirty flag, while caring for dirty memory accounting.
 * Returns true if the page was previously dirty.
 *
 * This is for preparing to put the page under writeout.  We leave the page
 * tagged as dirty in the radix tree so that a concurrent write-for-sync
 * can discover it via a PAGECACHE_TAG_DIRTY walk.  The ->writepage
 * implementation will run either set_page_writeback() or set_page_dirty(),
 * at which stage we bring the page's dirty flag and radix-tree dirty tag
 * back into sync.
 *
 * This incoherency between the page's dirty flag and radix-tree tag is
 * unfortunate, but it only exists while the page is locked.
 */
2665int clear_page_dirty_for_io(struct page *page)
2666{
2667 struct address_space *mapping = page_mapping(page);
2668 int ret = 0;
2669
2670 BUG_ON(!PageLocked(page));
2671
2672 if (mapping && mapping_cap_account_dirty(mapping)) {
2673 struct inode *inode = mapping->host;
2674 struct bdi_writeback *wb;
2675 bool locked;

 /*
  * We use this sequence to make sure that
  *  (a) we account for dirty stats properly
  *  (b) we tell the low-level filesystem to
  *      mark the whole page dirty if it was
  *      dirty in a pagetable. Only to then
  *  (c) clean the page again and return 1 to
  *      cause the writeback.
  *
  * This way we avoid all nasty races with the
  * dirty bit in multiple places and clearing
  * them concurrently from different threads.
  *
  * Note! Normally the "set_page_dirty(page)"
  * has no effect on the actual dirty bit - since
  * that will already usually be set. But we
  * need the side effects, and it can help us
  * avoid races.
  *
  * We basically use the page "master dirty bit"
  * as a serialization point for all the different
  * threads doing their things.
  */
2702 if (page_mkclean(page))
2703 set_page_dirty(page);
2704
2705
2706
2707
2708
2709
2710
2711
2712 wb = unlocked_inode_to_wb_begin(inode, &locked);
2713 if (TestClearPageDirty(page)) {
2714 mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
2715 dec_zone_page_state(page, NR_FILE_DIRTY);
2716 dec_wb_stat(wb, WB_RECLAIMABLE);
2717 ret = 1;
2718 }
2719 unlocked_inode_to_wb_end(inode, locked);
2720 return ret;
2721 }
2722 return TestClearPageDirty(page);
2723}
2724EXPORT_SYMBOL(clear_page_dirty_for_io);
2725
2726int test_clear_page_writeback(struct page *page)
2727{
2728 struct address_space *mapping = page_mapping(page);
2729 int ret;
2730
2731 lock_page_memcg(page);
2732 if (mapping) {
2733 struct inode *inode = mapping->host;
2734 struct backing_dev_info *bdi = inode_to_bdi(inode);
2735 unsigned long flags;
2736
2737 spin_lock_irqsave(&mapping->tree_lock, flags);
2738 ret = TestClearPageWriteback(page);
2739 if (ret) {
2740 radix_tree_tag_clear(&mapping->page_tree,
2741 page_index(page),
2742 PAGECACHE_TAG_WRITEBACK);
2743 if (bdi_cap_account_writeback(bdi)) {
2744 struct bdi_writeback *wb = inode_to_wb(inode);
2745
2746 __dec_wb_stat(wb, WB_WRITEBACK);
2747 __wb_writeout_inc(wb);
2748 }
2749 }
2750 spin_unlock_irqrestore(&mapping->tree_lock, flags);
2751 } else {
2752 ret = TestClearPageWriteback(page);
2753 }
2754 if (ret) {
2755 mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
2756 dec_zone_page_state(page, NR_WRITEBACK);
2757 inc_zone_page_state(page, NR_WRITTEN);
2758 }
2759 unlock_page_memcg(page);
2760 return ret;
2761}
2762
2763int __test_set_page_writeback(struct page *page, bool keep_write)
2764{
2765 struct address_space *mapping = page_mapping(page);
2766 int ret;
2767
2768 lock_page_memcg(page);
2769 if (mapping) {
2770 struct inode *inode = mapping->host;
2771 struct backing_dev_info *bdi = inode_to_bdi(inode);
2772 unsigned long flags;
2773
2774 spin_lock_irqsave(&mapping->tree_lock, flags);
2775 ret = TestSetPageWriteback(page);
2776 if (!ret) {
2777 radix_tree_tag_set(&mapping->page_tree,
2778 page_index(page),
2779 PAGECACHE_TAG_WRITEBACK);
2780 if (bdi_cap_account_writeback(bdi))
2781 __inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);
2782 }
2783 if (!PageDirty(page))
2784 radix_tree_tag_clear(&mapping->page_tree,
2785 page_index(page),
2786 PAGECACHE_TAG_DIRTY);
2787 if (!keep_write)
2788 radix_tree_tag_clear(&mapping->page_tree,
2789 page_index(page),
2790 PAGECACHE_TAG_TOWRITE);
2791 spin_unlock_irqrestore(&mapping->tree_lock, flags);
2792 } else {
2793 ret = TestSetPageWriteback(page);
2794 }
2795 if (!ret) {
2796 mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
2797 inc_zone_page_state(page, NR_WRITEBACK);
2798 }
2799 unlock_page_memcg(page);
2800 return ret;
2801
2802}
2803EXPORT_SYMBOL(__test_set_page_writeback);

/*
 * Return true if any of the pages in the mapping are marked with the
 * passed tag.
 */
2809int mapping_tagged(struct address_space *mapping, int tag)
2810{
2811 return radix_tree_tagged(&mapping->page_tree, tag);
2812}
2813EXPORT_SYMBOL(mapping_tagged);

/**
 * wait_for_stable_page() - wait for writeback to finish, if necessary.
 * @page:	The page to wait on.
 *
 * This function determines if the given page is related to a backing device
 * that requires page contents to be held stable during writeback.  If so, then
 * it will wait for any pending writeback to complete.
 */
2823void wait_for_stable_page(struct page *page)
2824{
2825 if (bdi_cap_stable_pages_required(inode_to_bdi(page->mapping->host)))
2826 wait_on_page_writeback(page);
2827}
2828EXPORT_SYMBOL_GPL(wait_for_stable_page);
2829