/*
 * mm/page-writeback.c
 *
 * Contains functions related to writing back dirty pages at the
 * address_space level.
 */
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/init.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/blkdev.h>
#include <linux/mpage.h>
#include <linux/rmap.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/smp.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/syscalls.h>
#include <linux/buffer_head.h>
#include <linux/pagevec.h>
#include <linux/timer.h>
#include <linux/sched/rt.h>
#include <linux/mm_inline.h>
#include <trace/events/writeback.h>

#include "internal.h"

/*
 * Sleep at most 200ms at a time in balance_dirty_pages().
 */
#define MAX_PAUSE		max(HZ/5, 1)

/*
 * Try to keep balance_dirty_pages() call intervals higher than this many pages
 * by raising pause time to max_pause when falls below it.
 */
#define DIRTY_POLL_THRESH	(128 >> (PAGE_SHIFT - 10))

/*
 * Estimate write bandwidth at 200ms intervals.
 */
#define BANDWIDTH_INTERVAL	max(HZ/5, 1)

#define RATELIMIT_CALC_SHIFT	10

/*
 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited()
 * will look to see if it needs to force writeback or throttling.
 */
static long ratelimit_pages = 32;

/*
 * Start background writeback (via writeback threads) at this percentage
 */
int dirty_background_ratio = 10;

/*
 * dirty_background_bytes starts at 0 (disabled) so that it is a function of
 * dirty_background_ratio * the amount of dirtyable memory
 */
unsigned long dirty_background_bytes;

/*
 * free highmem will not be subtracted from the total free memory
 * for calculating free ratios if vm_highmem_is_dirtyable is true
 */
int vm_highmem_is_dirtyable;

/*
 * The generator of dirty data starts writeback at this percentage
 */
int vm_dirty_ratio = 20;

/*
 * vm_dirty_bytes starts at 0 (disabled) so that it is a function of
 * vm_dirty_ratio * the amount of dirtyable memory
 */
unsigned long vm_dirty_bytes;

/*
 * The interval between `kupdate'-style writebacks
 */
unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */

EXPORT_SYMBOL_GPL(dirty_writeback_interval);

/*
 * The longest time for which data is allowed to remain dirty
 */
unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */

/*
 * Flag that makes the machine dump writes/reads and block dirtyings.
 */
int block_dump;

/*
 * Flag that puts the machine in "laptop mode". Doubles as a timeout in
 * jiffies: a full sync is triggered after this time elapses without any
 * disk activity.
 */
int laptop_mode;

EXPORT_SYMBOL(laptop_mode);

/* End of sysctl-exported parameters */

struct wb_domain global_wb_domain;

/* consolidated parameters for balance_dirty_pages() and its subroutines */
struct dirty_throttle_control {
#ifdef CONFIG_CGROUP_WRITEBACK
	struct wb_domain	*dom;
	struct dirty_throttle_control *gdtc;	/* only set in memcg dtc's */
#endif
	struct bdi_writeback	*wb;
	struct fprop_local_percpu *wb_completions;

	unsigned long		avail;		/* dirtyable */
	unsigned long		dirty;		/* file_dirty + write + nfs */
	unsigned long		thresh;		/* dirty threshold */
	unsigned long		bg_thresh;	/* dirty background threshold */

	unsigned long		wb_dirty;	/* per-wb counterparts */
	unsigned long		wb_thresh;
	unsigned long		wb_bg_thresh;

	unsigned long		pos_ratio;
};

/*
 * Length of period for aging writeout fractions of bdis. This is an
 * arbitrarily chosen number. The longer the period, the slower fractions will
 * reflect changes in current writeout rate.
 */
#define VM_COMPLETIONS_PERIOD_LEN (3*HZ)

#ifdef CONFIG_CGROUP_WRITEBACK

#define GDTC_INIT(__wb)		.wb = (__wb),				\
				.dom = &global_wb_domain,		\
				.wb_completions = &(__wb)->completions

#define GDTC_INIT_NO_WB		.dom = &global_wb_domain

#define MDTC_INIT(__wb, __gdtc)	.wb = (__wb),				\
				.dom = mem_cgroup_wb_domain(__wb),	\
				.wb_completions = &(__wb)->memcg_completions, \
				.gdtc = __gdtc
167
168static bool mdtc_valid(struct dirty_throttle_control *dtc)
169{
170 return dtc->dom;
171}
172
173static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
174{
175 return dtc->dom;
176}
177
178static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
179{
180 return mdtc->gdtc;
181}
182
183static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
184{
185 return &wb->memcg_completions;
186}
187
188static void wb_min_max_ratio(struct bdi_writeback *wb,
189 unsigned long *minp, unsigned long *maxp)
190{
191 unsigned long this_bw = wb->avg_write_bandwidth;
192 unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
193 unsigned long long min = wb->bdi->min_ratio;
194 unsigned long long max = wb->bdi->max_ratio;
195
196
197
198
199
200 if (this_bw < tot_bw) {
201 if (min) {
202 min *= this_bw;
203 do_div(min, tot_bw);
204 }
205 if (max < 100) {
206 max *= this_bw;
207 do_div(max, tot_bw);
208 }
209 }
210
211 *minp = min;
212 *maxp = max;
213}
214
215#else
216
217#define GDTC_INIT(__wb) .wb = (__wb), \
218 .wb_completions = &(__wb)->completions
219#define GDTC_INIT_NO_WB
220#define MDTC_INIT(__wb, __gdtc)
221
222static bool mdtc_valid(struct dirty_throttle_control *dtc)
223{
224 return false;
225}
226
227static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
228{
229 return &global_wb_domain;
230}
231
232static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
233{
234 return NULL;
235}
236
237static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
238{
239 return NULL;
240}
241
242static void wb_min_max_ratio(struct bdi_writeback *wb,
243 unsigned long *minp, unsigned long *maxp)
244{
245 *minp = wb->bdi->min_ratio;
246 *maxp = wb->bdi->max_ratio;
247}
248
249#endif

/*
 * In a memory zone, there is a certain amount of pages we consider
 * available for the page cache, which is essentially the number of
 * free and reclaimable pages, minus some zone reserves to protect
 * lowmem and the ability to uphold the zone's watermarks without
 * requiring writeback.
 *
 * This number of dirtyable pages is the base value of which the
 * user-configurable dirty ratio is the effective number of pages that
 * are allowed to be actually dirtied.  Per individual zone, or
 * globally by using the sum of dirtyable pages over all zones.
 *
 * Because the user is allowed to specify the dirty limit globally as
 * absolute number of bytes, calculating the per-zone dirty limit can
 * require translating the configured limit into a percentage of
 * global dirtyable memory first.
 */
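
/**
 * node_dirtyable_memory - number of dirtyable pages in a node
 * @pgdat: the node
 *
 * Returns the node's number of pages potentially available for dirty
 * page cache.  This is the base value for the per-node dirty limits.
 */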
276static unsigned long node_dirtyable_memory(struct pglist_data *pgdat)
277{
278 unsigned long nr_pages = 0;
279 int z;
280
281 for (z = 0; z < MAX_NR_ZONES; z++) {
282 struct zone *zone = pgdat->node_zones + z;
283
284 if (!populated_zone(zone))
285 continue;
286
287 nr_pages += zone_page_state(zone, NR_FREE_PAGES);
288 }
289
290
291
292
293
294
295 nr_pages -= min(nr_pages, pgdat->totalreserve_pages);
296
297 nr_pages += node_page_state(pgdat, NR_INACTIVE_FILE);
298 nr_pages += node_page_state(pgdat, NR_ACTIVE_FILE);
299
300 return nr_pages;
301}
302
303static unsigned long highmem_dirtyable_memory(unsigned long total)
304{
305#ifdef CONFIG_HIGHMEM
306 int node;
307 unsigned long x = 0;
308 int i;
309
310 for_each_node_state(node, N_HIGH_MEMORY) {
311 for (i = ZONE_NORMAL + 1; i < MAX_NR_ZONES; i++) {
312 struct zone *z;
313 unsigned long nr_pages;
314
315 if (!is_highmem_idx(i))
316 continue;
317
318 z = &NODE_DATA(node)->node_zones[i];
319 if (!populated_zone(z))
320 continue;
321
322 nr_pages = zone_page_state(z, NR_FREE_PAGES);
323
324 nr_pages -= min(nr_pages, high_wmark_pages(z));
325 nr_pages += zone_page_state(z, NR_ZONE_INACTIVE_FILE);
326 nr_pages += zone_page_state(z, NR_ZONE_ACTIVE_FILE);
327 x += nr_pages;
328 }
329 }
330
331
332
333
334
335
336
337
338
339
340 if ((long)x < 0)
341 x = 0;
342
343
344
345
346
347
348
349 return min(x, total);
350#else
351 return 0;
352#endif
353}
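
/**
 * global_dirtyable_memory - number of globally dirtyable pages
 *
 * Returns the global number of pages potentially available for dirty
 * page cache.  This is the base value for the global dirty limits.
 */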
361static unsigned long global_dirtyable_memory(void)
362{
363 unsigned long x;
364
365 x = global_page_state(NR_FREE_PAGES);
366
367
368
369
370
371 x -= min(x, totalreserve_pages);
372
373 x += global_node_page_state(NR_INACTIVE_FILE);
374 x += global_node_page_state(NR_ACTIVE_FILE);
375
376 if (!vm_highmem_is_dirtyable)
377 x -= highmem_dirtyable_memory(x);
378
379 return x + 1;
380}
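
/**
 * domain_dirty_limits - calculate thresh and bg_thresh for a wb_domain
 * @dtc: dirty_throttle_control of interest
 *
 * Calculate @dtc->thresh and ->bg_thresh considering
 * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}.  The caller
 * must ensure that @dtc->avail is set before calling this function.  The
 * dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
 * real-time tasks.
 */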
392static void domain_dirty_limits(struct dirty_throttle_control *dtc)
393{
394 const unsigned long available_memory = dtc->avail;
395 struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc);
396 unsigned long bytes = vm_dirty_bytes;
397 unsigned long bg_bytes = dirty_background_bytes;
398
399 unsigned long ratio = (vm_dirty_ratio * PAGE_SIZE) / 100;
400 unsigned long bg_ratio = (dirty_background_ratio * PAGE_SIZE) / 100;
401 unsigned long thresh;
402 unsigned long bg_thresh;
403 struct task_struct *tsk;
404
405
406 if (gdtc) {
407 unsigned long global_avail = gdtc->avail;
408
409
410
411
412
413
414
415
416 if (bytes)
417 ratio = min(DIV_ROUND_UP(bytes, global_avail),
418 PAGE_SIZE);
419 if (bg_bytes)
420 bg_ratio = min(DIV_ROUND_UP(bg_bytes, global_avail),
421 PAGE_SIZE);
422 bytes = bg_bytes = 0;
423 }
424
425 if (bytes)
426 thresh = DIV_ROUND_UP(bytes, PAGE_SIZE);
427 else
428 thresh = (ratio * available_memory) / PAGE_SIZE;
429
430 if (bg_bytes)
431 bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE);
432 else
433 bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE;
434
435 if (bg_thresh >= thresh)
436 bg_thresh = thresh / 2;
437 tsk = current;
438 if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
439 bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32;
440 thresh += thresh / 4 + global_wb_domain.dirty_limit / 32;
441 }
442 dtc->thresh = thresh;
443 dtc->bg_thresh = bg_thresh;
444
445
446 if (!gdtc)
447 trace_global_dirty_state(bg_thresh, thresh);
448}
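
/**
 * global_dirty_limits - background-writeback and dirty-throttling thresholds
 * @pbackground: out parameter for bg_thresh
 * @pdirty: out parameter for thresh
 *
 * Calculate bg_thresh and thresh for global_wb_domain.  See
 * domain_dirty_limits() for details.  global_dirtyable_memory() provides
 * the amount of dirtyable memory.
 */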
458void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
459{
460 struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
461
462 gdtc.avail = global_dirtyable_memory();
463 domain_dirty_limits(&gdtc);
464
465 *pbackground = gdtc.bg_thresh;
466 *pdirty = gdtc.thresh;
467}
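
/**
 * node_dirty_limit - maximum number of dirty pages allowed in a node
 * @pgdat: the node
 *
 * Returns the maximum number of dirty pages allowed in a node, based
 * on the node's dirtyable memory.
 */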
476static unsigned long node_dirty_limit(struct pglist_data *pgdat)
477{
478 unsigned long node_memory = node_dirtyable_memory(pgdat);
479 struct task_struct *tsk = current;
480 unsigned long dirty;
481
482 if (vm_dirty_bytes)
483 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) *
484 node_memory / global_dirtyable_memory();
485 else
486 dirty = vm_dirty_ratio * node_memory / 100;
487
488 if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk))
489 dirty += dirty / 4;
490
491 return dirty;
492}
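
/**
 * node_dirty_ok - tells whether a node is within its dirty limits
 * @pgdat: the node to check
 *
 * Returns %true when the dirty pages in @pgdat are within the node's
 * dirty limit, %false if the limit is exceeded.
 */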
501bool node_dirty_ok(struct pglist_data *pgdat)
502{
503 unsigned long limit = node_dirty_limit(pgdat);
504 unsigned long nr_pages = 0;
505
506 nr_pages += node_page_state(pgdat, NR_FILE_DIRTY);
507 nr_pages += node_page_state(pgdat, NR_UNSTABLE_NFS);
508 nr_pages += node_page_state(pgdat, NR_WRITEBACK);
509
510 return nr_pages <= limit;
511}
512
513int dirty_background_ratio_handler(struct ctl_table *table, int write,
514 void __user *buffer, size_t *lenp,
515 loff_t *ppos)
516{
517 int ret;
518
519 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
520 if (ret == 0 && write)
521 dirty_background_bytes = 0;
522 return ret;
523}
524
525int dirty_background_bytes_handler(struct ctl_table *table, int write,
526 void __user *buffer, size_t *lenp,
527 loff_t *ppos)
528{
529 int ret;
530
531 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
532 if (ret == 0 && write)
533 dirty_background_ratio = 0;
534 return ret;
535}
536
537int dirty_ratio_handler(struct ctl_table *table, int write,
538 void __user *buffer, size_t *lenp,
539 loff_t *ppos)
540{
541 int old_ratio = vm_dirty_ratio;
542 int ret;
543
544 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
545 if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
546 writeback_set_ratelimit();
547 vm_dirty_bytes = 0;
548 }
549 return ret;
550}
551
552int dirty_bytes_handler(struct ctl_table *table, int write,
553 void __user *buffer, size_t *lenp,
554 loff_t *ppos)
555{
556 unsigned long old_bytes = vm_dirty_bytes;
557 int ret;
558
559 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
560 if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
561 writeback_set_ratelimit();
562 vm_dirty_ratio = 0;
563 }
564 return ret;
565}
566
567static unsigned long wp_next_time(unsigned long cur_time)
568{
569 cur_time += VM_COMPLETIONS_PERIOD_LEN;
570
571 if (!cur_time)
572 return 1;
573 return cur_time;
574}
575
576static void wb_domain_writeout_inc(struct wb_domain *dom,
577 struct fprop_local_percpu *completions,
578 unsigned int max_prop_frac)
579{
580 __fprop_inc_percpu_max(&dom->completions, completions,
581 max_prop_frac);
582
	if (unlikely(!dom->period_time)) {
		/*
		 * We can race with other wb_domain_writeout_inc() calls here
		 * but it does not cause any harm since the resulting time
		 * when the timer will fire and what is in period_time will
		 * be roughly the same.
		 */
		dom->period_time = wp_next_time(jiffies);
		mod_timer(&dom->period_timer, dom->period_time);
	}
593}
594
595
596
597
598
599static inline void __wb_writeout_inc(struct bdi_writeback *wb)
600{
601 struct wb_domain *cgdom;
602
603 __inc_wb_stat(wb, WB_WRITTEN);
604 wb_domain_writeout_inc(&global_wb_domain, &wb->completions,
605 wb->bdi->max_prop_frac);
606
607 cgdom = mem_cgroup_wb_domain(wb);
608 if (cgdom)
609 wb_domain_writeout_inc(cgdom, wb_memcg_completions(wb),
610 wb->bdi->max_prop_frac);
611}
612
613void wb_writeout_inc(struct bdi_writeback *wb)
614{
615 unsigned long flags;
616
617 local_irq_save(flags);
618 __wb_writeout_inc(wb);
619 local_irq_restore(flags);
620}
621EXPORT_SYMBOL_GPL(wb_writeout_inc);
622
623
624
625
626
627static void writeout_period(unsigned long t)
628{
629 struct wb_domain *dom = (void *)t;
630 int miss_periods = (jiffies - dom->period_time) /
631 VM_COMPLETIONS_PERIOD_LEN;
632
633 if (fprop_new_period(&dom->completions, miss_periods + 1)) {
634 dom->period_time = wp_next_time(dom->period_time +
635 miss_periods * VM_COMPLETIONS_PERIOD_LEN);
636 mod_timer(&dom->period_timer, dom->period_time);
637 } else {
638
639
640
641
642 dom->period_time = 0;
643 }
644}
645
646int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
647{
648 memset(dom, 0, sizeof(*dom));
649
650 spin_lock_init(&dom->lock);
651
652 init_timer_deferrable(&dom->period_timer);
653 dom->period_timer.function = writeout_period;
654 dom->period_timer.data = (unsigned long)dom;
655
656 dom->dirty_limit_tstamp = jiffies;
657
658 return fprop_global_init(&dom->completions, gfp);
659}
660
661#ifdef CONFIG_CGROUP_WRITEBACK
662void wb_domain_exit(struct wb_domain *dom)
663{
664 del_timer_sync(&dom->period_timer);
665 fprop_global_destroy(&dom->completions);
666}
667#endif
668
669
670
671
672
673
674static unsigned int bdi_min_ratio;
675
676int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
677{
678 int ret = 0;
679
680 spin_lock_bh(&bdi_lock);
681 if (min_ratio > bdi->max_ratio) {
682 ret = -EINVAL;
683 } else {
684 min_ratio -= bdi->min_ratio;
685 if (bdi_min_ratio + min_ratio < 100) {
686 bdi_min_ratio += min_ratio;
687 bdi->min_ratio += min_ratio;
688 } else {
689 ret = -EINVAL;
690 }
691 }
692 spin_unlock_bh(&bdi_lock);
693
694 return ret;
695}
696
697int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
698{
699 int ret = 0;
700
701 if (max_ratio > 100)
702 return -EINVAL;
703
704 spin_lock_bh(&bdi_lock);
705 if (bdi->min_ratio > max_ratio) {
706 ret = -EINVAL;
707 } else {
708 bdi->max_ratio = max_ratio;
709 bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100;
710 }
711 spin_unlock_bh(&bdi_lock);
712
713 return ret;
714}
715EXPORT_SYMBOL(bdi_set_max_ratio);
716
717static unsigned long dirty_freerun_ceiling(unsigned long thresh,
718 unsigned long bg_thresh)
719{
720 return (thresh + bg_thresh) / 2;
721}
722
723static unsigned long hard_dirty_limit(struct wb_domain *dom,
724 unsigned long thresh)
725{
726 return max(thresh, dom->dirty_limit);
727}
728
729
730
731
732
733static void mdtc_calc_avail(struct dirty_throttle_control *mdtc,
734 unsigned long filepages, unsigned long headroom)
735{
736 struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc);
737 unsigned long clean = filepages - min(filepages, mdtc->dirty);
738 unsigned long global_clean = gdtc->avail - min(gdtc->avail, gdtc->dirty);
739 unsigned long other_clean = global_clean - min(global_clean, clean);
740
741 mdtc->avail = filepages + min(headroom, other_clean);
742}
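
/**
 * __wb_calc_thresh - @wb's share of dirty throttling threshold
 * @dtc: dirty_throttle_control of interest
 *
 * Returns @wb's dirty limit in pages.  The term "dirty" in the context of
 * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
 *
 * Note that balance_dirty_pages() will only seriously take it as a hard limit
 * when sleeping max_pause per page is not enough to keep the dirty pages under
 * control.  In the other normal situations, it acts more gently by throttling
 * the tasks more (rather than completely blocking them) when the wb dirty
 * pages go high.
 *
 * It allocates high/low dirty limits to fast/slow devices, in order to prevent
 * - starving fast devices
 * - piling up dirty pages (that will take long time to sync) on slow devices
 *
 * The wb's share of dirty limit will be adapting to its throughput and
 * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
 */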
765static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
766{
767 struct wb_domain *dom = dtc_dom(dtc);
768 unsigned long thresh = dtc->thresh;
769 u64 wb_thresh;
770 long numerator, denominator;
771 unsigned long wb_min_ratio, wb_max_ratio;
772
773
774
775
776 fprop_fraction_percpu(&dom->completions, dtc->wb_completions,
777 &numerator, &denominator);
778
779 wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100;
780 wb_thresh *= numerator;
781 do_div(wb_thresh, denominator);
782
783 wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio);
784
785 wb_thresh += (thresh * wb_min_ratio) / 100;
786 if (wb_thresh > (thresh * wb_max_ratio) / 100)
787 wb_thresh = thresh * wb_max_ratio / 100;
788
789 return wb_thresh;
790}
791
792unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
793{
794 struct dirty_throttle_control gdtc = { GDTC_INIT(wb),
795 .thresh = thresh };
796 return __wb_calc_thresh(&gdtc);
797}
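
/*
 *                            setpoint - dirty 3
 *        f(dirty) := 1.0 + (----------------)
 *                            limit - setpoint
 *
 * it's a 3rd order polynomial that subjects to
 *
 * (1) f(freerun)  = 2.0 => rampup dirty_ratelimit reasonably fast
 * (2) f(setpoint) = 1.0 => the balance point
 * (3) f(limit)    = 0   => the hard limit
 * (4) df/dx      <= 0	 => negative feedback control
 * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
 *     => fast response on large errors; small oscillation near setpoint
 */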
813static long long pos_ratio_polynom(unsigned long setpoint,
814 unsigned long dirty,
815 unsigned long limit)
816{
817 long long pos_ratio;
818 long x;
819
820 x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
821 (limit - setpoint) | 1);
822 pos_ratio = x;
823 pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
824 pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
825 pos_ratio += 1 << RATELIMIT_CALC_SHIFT;
826
827 return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT);
828}
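
/*
 * Dirty position control.
 *
 * (o) global/wb setpoints
 *
 * We want the dirty pages to be balanced around the global/wb setpoints.
 * When the number of dirty pages is higher/lower than the setpoint, the
 * dirty position control ratio (and hence task dirty ratelimit) will be
 * decreased/increased to bring the dirty pages back to the setpoint.
 *
 * (o) global control line
 *
 * The global control line is the pos_ratio_polynom() curve spanning the
 * [freerun, limit] range: pos_ratio = 2.0 at freerun, 1.0 at the setpoint
 * midway between freerun and limit, and 0 at the hard dirty limit.
 *
 * (o) wb control line
 *
 * The wb control line is centered on wb_setpoint with a span proportional
 * to the wb's write bandwidth, so that a wb whose dirty pages approach
 * wb_thresh is throttled progressively harder.  The final pos_ratio is the
 * product of the global and wb factors (with extra boosting/dropping near
 * the edges applied below).
 */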
905static void wb_position_ratio(struct dirty_throttle_control *dtc)
906{
907 struct bdi_writeback *wb = dtc->wb;
908 unsigned long write_bw = wb->avg_write_bandwidth;
909 unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
910 unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
911 unsigned long wb_thresh = dtc->wb_thresh;
912 unsigned long x_intercept;
913 unsigned long setpoint;
914 unsigned long wb_setpoint;
915 unsigned long span;
916 long long pos_ratio;
917 long x;
918
919 dtc->pos_ratio = 0;
920
921 if (unlikely(dtc->dirty >= limit))
922 return;
923
924
925
926
927
928
929 setpoint = (freerun + limit) / 2;
930 pos_ratio = pos_ratio_polynom(setpoint, dtc->dirty, limit);
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957 if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
958 long long wb_pos_ratio;
959
960 if (dtc->wb_dirty < 8) {
961 dtc->pos_ratio = min_t(long long, pos_ratio * 2,
962 2 << RATELIMIT_CALC_SHIFT);
963 return;
964 }
965
966 if (dtc->wb_dirty >= wb_thresh)
967 return;
968
969 wb_setpoint = dirty_freerun_ceiling(wb_thresh,
970 dtc->wb_bg_thresh);
971
972 if (wb_setpoint == 0 || wb_setpoint == wb_thresh)
973 return;
974
975 wb_pos_ratio = pos_ratio_polynom(wb_setpoint, dtc->wb_dirty,
976 wb_thresh);
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999 dtc->pos_ratio = min(pos_ratio, wb_pos_ratio);
1000 return;
1001 }
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034 if (unlikely(wb_thresh > dtc->thresh))
1035 wb_thresh = dtc->thresh;
1036
1037
1038
1039
1040
1041
1042
1043 wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8);
1044
1045
1046
1047
1048 x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1);
1049 wb_setpoint = setpoint * (u64)x >> 16;
1050
1051
1052
1053
1054
1055
1056
1057
1058 span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16;
1059 x_intercept = wb_setpoint + span;
1060
1061 if (dtc->wb_dirty < x_intercept - span / 4) {
1062 pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty),
1063 (x_intercept - wb_setpoint) | 1);
1064 } else
1065 pos_ratio /= 4;
1066
1067
1068
1069
1070
1071
1072 x_intercept = wb_thresh / 2;
1073 if (dtc->wb_dirty < x_intercept) {
1074 if (dtc->wb_dirty > x_intercept / 8)
1075 pos_ratio = div_u64(pos_ratio * x_intercept,
1076 dtc->wb_dirty);
1077 else
1078 pos_ratio *= 8;
1079 }
1080
1081 dtc->pos_ratio = pos_ratio;
1082}
1083
1084static void wb_update_write_bandwidth(struct bdi_writeback *wb,
1085 unsigned long elapsed,
1086 unsigned long written)
1087{
1088 const unsigned long period = roundup_pow_of_two(3 * HZ);
1089 unsigned long avg = wb->avg_write_bandwidth;
1090 unsigned long old = wb->write_bandwidth;
1091 u64 bw;
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103 bw = written - min(written, wb->written_stamp);
1104 bw *= HZ;
1105 if (unlikely(elapsed > period)) {
1106 do_div(bw, elapsed);
1107 avg = bw;
1108 goto out;
1109 }
1110 bw += (u64)wb->write_bandwidth * (period - elapsed);
1111 bw >>= ilog2(period);
1112
1113
1114
1115
1116 if (avg > old && old >= (unsigned long)bw)
1117 avg -= (avg - old) >> 3;
1118
1119 if (avg < old && old <= (unsigned long)bw)
1120 avg += (old - avg) >> 3;
1121
1122out:
1123
1124 avg = max(avg, 1LU);
1125 if (wb_has_dirty_io(wb)) {
1126 long delta = avg - wb->avg_write_bandwidth;
1127 WARN_ON_ONCE(atomic_long_add_return(delta,
1128 &wb->bdi->tot_write_bandwidth) <= 0);
1129 }
1130 wb->write_bandwidth = bw;
1131 wb->avg_write_bandwidth = avg;
1132}
1133
1134static void update_dirty_limit(struct dirty_throttle_control *dtc)
1135{
1136 struct wb_domain *dom = dtc_dom(dtc);
1137 unsigned long thresh = dtc->thresh;
1138 unsigned long limit = dom->dirty_limit;
1139
1140
1141
1142
1143 if (limit < thresh) {
1144 limit = thresh;
1145 goto update;
1146 }
1147
1148
1149
1150
1151
1152
1153 thresh = max(thresh, dtc->dirty);
1154 if (limit > thresh) {
1155 limit -= (limit - thresh) >> 5;
1156 goto update;
1157 }
1158 return;
1159update:
1160 dom->dirty_limit = limit;
1161}
1162
1163static void domain_update_bandwidth(struct dirty_throttle_control *dtc,
1164 unsigned long now)
1165{
1166 struct wb_domain *dom = dtc_dom(dtc);
1167
1168
1169
1170
1171 if (time_before(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL))
1172 return;
1173
1174 spin_lock(&dom->lock);
1175 if (time_after_eq(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) {
1176 update_dirty_limit(dtc);
1177 dom->dirty_limit_tstamp = now;
1178 }
1179 spin_unlock(&dom->lock);
1180}
1181
1182
1183
1184
1185
1186
1187
1188static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
1189 unsigned long dirtied,
1190 unsigned long elapsed)
1191{
1192 struct bdi_writeback *wb = dtc->wb;
1193 unsigned long dirty = dtc->dirty;
1194 unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
1195 unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
1196 unsigned long setpoint = (freerun + limit) / 2;
1197 unsigned long write_bw = wb->avg_write_bandwidth;
1198 unsigned long dirty_ratelimit = wb->dirty_ratelimit;
1199 unsigned long dirty_rate;
1200 unsigned long task_ratelimit;
1201 unsigned long balanced_dirty_ratelimit;
1202 unsigned long step;
1203 unsigned long x;
1204 unsigned long shift;
1205
1206
1207
1208
1209
1210 dirty_rate = (dirtied - wb->dirtied_stamp) * HZ / elapsed;
1211
1212
1213
1214
1215 task_ratelimit = (u64)dirty_ratelimit *
1216 dtc->pos_ratio >> RATELIMIT_CALC_SHIFT;
1217 task_ratelimit++;
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249 balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw,
1250 dirty_rate | 1);
1251
1252
1253
1254 if (unlikely(balanced_dirty_ratelimit > write_bw))
1255 balanced_dirty_ratelimit = write_bw;
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291 step = 0;
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304 if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
1305 dirty = dtc->wb_dirty;
1306 if (dtc->wb_dirty < 8)
1307 setpoint = dtc->wb_dirty + 1;
1308 else
1309 setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2;
1310 }
1311
1312 if (dirty < setpoint) {
1313 x = min3(wb->balanced_dirty_ratelimit,
1314 balanced_dirty_ratelimit, task_ratelimit);
1315 if (dirty_ratelimit < x)
1316 step = x - dirty_ratelimit;
1317 } else {
1318 x = max3(wb->balanced_dirty_ratelimit,
1319 balanced_dirty_ratelimit, task_ratelimit);
1320 if (dirty_ratelimit > x)
1321 step = dirty_ratelimit - x;
1322 }
1323
1324
1325
1326
1327
1328
1329 shift = dirty_ratelimit / (2 * step + 1);
1330 if (shift < BITS_PER_LONG)
1331 step = DIV_ROUND_UP(step >> shift, 8);
1332 else
1333 step = 0;
1334
1335 if (dirty_ratelimit < balanced_dirty_ratelimit)
1336 dirty_ratelimit += step;
1337 else
1338 dirty_ratelimit -= step;
1339
1340 wb->dirty_ratelimit = max(dirty_ratelimit, 1UL);
1341 wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
1342
1343 trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit);
1344}
1345
1346static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
1347 struct dirty_throttle_control *mdtc,
1348 unsigned long start_time,
1349 bool update_ratelimit)
1350{
1351 struct bdi_writeback *wb = gdtc->wb;
1352 unsigned long now = jiffies;
1353 unsigned long elapsed = now - wb->bw_time_stamp;
1354 unsigned long dirtied;
1355 unsigned long written;
1356
1357 lockdep_assert_held(&wb->list_lock);
1358
1359
1360
1361
1362 if (elapsed < BANDWIDTH_INTERVAL)
1363 return;
1364
1365 dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]);
1366 written = percpu_counter_read(&wb->stat[WB_WRITTEN]);
1367
1368
1369
1370
1371
1372 if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time))
1373 goto snapshot;
1374
1375 if (update_ratelimit) {
1376 domain_update_bandwidth(gdtc, now);
1377 wb_update_dirty_ratelimit(gdtc, dirtied, elapsed);
1378
1379
1380
1381
1382
1383 if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) {
1384 domain_update_bandwidth(mdtc, now);
1385 wb_update_dirty_ratelimit(mdtc, dirtied, elapsed);
1386 }
1387 }
1388 wb_update_write_bandwidth(wb, elapsed, written);
1389
1390snapshot:
1391 wb->dirtied_stamp = dirtied;
1392 wb->written_stamp = written;
1393 wb->bw_time_stamp = now;
1394}
1395
1396void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time)
1397{
1398 struct dirty_throttle_control gdtc = { GDTC_INIT(wb) };
1399
1400 __wb_update_bandwidth(&gdtc, NULL, start_time, false);
1401}
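
/*
 * After a task dirtied this many pages, balance_dirty_pages_ratelimited()
 * will look to see if it needs to start dirty throttling.
 *
 * If dirty_poll_interval is too low, big NUMA machines will call the expensive
 * global_node_page_state() too often.  So scale it near-sqrt to the safety
 * margin (the number of pages we may dirty without exceeding the dirty
 * limits).
 */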
1411static unsigned long dirty_poll_interval(unsigned long dirty,
1412 unsigned long thresh)
1413{
1414 if (thresh > dirty)
1415 return 1UL << (ilog2(thresh - dirty) >> 1);
1416
1417 return 1;
1418}
1419
1420static unsigned long wb_max_pause(struct bdi_writeback *wb,
1421 unsigned long wb_dirty)
1422{
1423 unsigned long bw = wb->avg_write_bandwidth;
1424 unsigned long t;
1425
1426
1427
1428
1429
1430
1431
1432
1433 t = wb_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
1434 t++;
1435
1436 return min_t(unsigned long, t, MAX_PAUSE);
1437}
1438
1439static long wb_min_pause(struct bdi_writeback *wb,
1440 long max_pause,
1441 unsigned long task_ratelimit,
1442 unsigned long dirty_ratelimit,
1443 int *nr_dirtied_pause)
1444{
1445 long hi = ilog2(wb->avg_write_bandwidth);
1446 long lo = ilog2(wb->dirty_ratelimit);
1447 long t;
1448 long pause;
1449 int pages;
1450
1451
1452 t = max(1, HZ / 100);
1453
1454
1455
1456
1457
1458
1459
1460 if (hi > lo)
1461 t += (hi - lo) * (10 * HZ) / 1024;
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481 t = min(t, 1 + max_pause / 2);
1482 pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492 if (pages < DIRTY_POLL_THRESH) {
1493 t = max_pause;
1494 pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);
1495 if (pages > DIRTY_POLL_THRESH) {
1496 pages = DIRTY_POLL_THRESH;
1497 t = HZ * DIRTY_POLL_THRESH / dirty_ratelimit;
1498 }
1499 }
1500
1501 pause = HZ * pages / (task_ratelimit + 1);
1502 if (pause > max_pause) {
1503 t = max_pause;
1504 pages = task_ratelimit * t / roundup_pow_of_two(HZ);
1505 }
1506
1507 *nr_dirtied_pause = pages;
1508
1509
1510
1511 return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t;
1512}
1513
1514static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
1515{
1516 struct bdi_writeback *wb = dtc->wb;
1517 unsigned long wb_reclaimable;
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532 dtc->wb_thresh = __wb_calc_thresh(dtc);
1533 dtc->wb_bg_thresh = dtc->thresh ?
1534 div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0;
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546 if (dtc->wb_thresh < 2 * wb_stat_error(wb)) {
1547 wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
1548 dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK);
1549 } else {
1550 wb_reclaimable = wb_stat(wb, WB_RECLAIMABLE);
1551 dtc->wb_dirty = wb_reclaimable + wb_stat(wb, WB_WRITEBACK);
1552 }
1553}
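
/*
 * balance_dirty_pages() must be called by processes which are generating dirty
 * data.  It looks at the number of dirty pages in the machine and will force
 * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.
 * If we're over `background_thresh' then the writeback threads are woken to
 * perform some writeout.
 */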
1562static void balance_dirty_pages(struct address_space *mapping,
1563 struct bdi_writeback *wb,
1564 unsigned long pages_dirtied)
1565{
1566 struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
1567 struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
1568 struct dirty_throttle_control * const gdtc = &gdtc_stor;
1569 struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
1570 &mdtc_stor : NULL;
1571 struct dirty_throttle_control *sdtc;
1572 unsigned long nr_reclaimable;
1573 long period;
1574 long pause;
1575 long max_pause;
1576 long min_pause;
1577 int nr_dirtied_pause;
1578 bool dirty_exceeded = false;
1579 unsigned long task_ratelimit;
1580 unsigned long dirty_ratelimit;
1581 struct backing_dev_info *bdi = wb->bdi;
1582 bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
1583 unsigned long start_time = jiffies;
1584
1585 for (;;) {
1586 unsigned long now = jiffies;
1587 unsigned long dirty, thresh, bg_thresh;
1588 unsigned long m_dirty = 0;
1589 unsigned long m_thresh = 0;
1590 unsigned long m_bg_thresh = 0;
1591
1592
1593
1594
1595
1596
1597
1598 nr_reclaimable = global_node_page_state(NR_FILE_DIRTY) +
1599 global_node_page_state(NR_UNSTABLE_NFS);
1600 gdtc->avail = global_dirtyable_memory();
1601 gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK);
1602
1603 domain_dirty_limits(gdtc);
1604
1605 if (unlikely(strictlimit)) {
1606 wb_dirty_limits(gdtc);
1607
1608 dirty = gdtc->wb_dirty;
1609 thresh = gdtc->wb_thresh;
1610 bg_thresh = gdtc->wb_bg_thresh;
1611 } else {
1612 dirty = gdtc->dirty;
1613 thresh = gdtc->thresh;
1614 bg_thresh = gdtc->bg_thresh;
1615 }
1616
1617 if (mdtc) {
1618 unsigned long filepages, headroom, writeback;
1619
1620
1621
1622
1623
1624 mem_cgroup_wb_stats(wb, &filepages, &headroom,
1625 &mdtc->dirty, &writeback);
1626 mdtc->dirty += writeback;
1627 mdtc_calc_avail(mdtc, filepages, headroom);
1628
1629 domain_dirty_limits(mdtc);
1630
1631 if (unlikely(strictlimit)) {
1632 wb_dirty_limits(mdtc);
1633 m_dirty = mdtc->wb_dirty;
1634 m_thresh = mdtc->wb_thresh;
1635 m_bg_thresh = mdtc->wb_bg_thresh;
1636 } else {
1637 m_dirty = mdtc->dirty;
1638 m_thresh = mdtc->thresh;
1639 m_bg_thresh = mdtc->bg_thresh;
1640 }
1641 }
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655 if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
1656 (!mdtc ||
1657 m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
1658 unsigned long intv = dirty_poll_interval(dirty, thresh);
1659 unsigned long m_intv = ULONG_MAX;
1660
1661 current->dirty_paused_when = now;
1662 current->nr_dirtied = 0;
1663 if (mdtc)
1664 m_intv = dirty_poll_interval(m_dirty, m_thresh);
1665 current->nr_dirtied_pause = min(intv, m_intv);
1666 break;
1667 }
1668
1669 if (unlikely(!writeback_in_progress(wb)))
1670 wb_start_background_writeback(wb);
1671
1672
1673
1674
1675
1676 if (!strictlimit)
1677 wb_dirty_limits(gdtc);
1678
1679 dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) &&
1680 ((gdtc->dirty > gdtc->thresh) || strictlimit);
1681
1682 wb_position_ratio(gdtc);
1683 sdtc = gdtc;
1684
1685 if (mdtc) {
1686
1687
1688
1689
1690
1691
1692 if (!strictlimit)
1693 wb_dirty_limits(mdtc);
1694
1695 dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
1696 ((mdtc->dirty > mdtc->thresh) || strictlimit);
1697
1698 wb_position_ratio(mdtc);
1699 if (mdtc->pos_ratio < gdtc->pos_ratio)
1700 sdtc = mdtc;
1701 }
1702
1703 if (dirty_exceeded && !wb->dirty_exceeded)
1704 wb->dirty_exceeded = 1;
1705
1706 if (time_is_before_jiffies(wb->bw_time_stamp +
1707 BANDWIDTH_INTERVAL)) {
1708 spin_lock(&wb->list_lock);
1709 __wb_update_bandwidth(gdtc, mdtc, start_time, true);
1710 spin_unlock(&wb->list_lock);
1711 }
1712
1713
1714 dirty_ratelimit = wb->dirty_ratelimit;
1715 task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >>
1716 RATELIMIT_CALC_SHIFT;
1717 max_pause = wb_max_pause(wb, sdtc->wb_dirty);
1718 min_pause = wb_min_pause(wb, max_pause,
1719 task_ratelimit, dirty_ratelimit,
1720 &nr_dirtied_pause);
1721
1722 if (unlikely(task_ratelimit == 0)) {
1723 period = max_pause;
1724 pause = max_pause;
1725 goto pause;
1726 }
1727 period = HZ * pages_dirtied / task_ratelimit;
1728 pause = period;
1729 if (current->dirty_paused_when)
1730 pause -= now - current->dirty_paused_when;
1731
1732
1733
1734
1735
1736
1737
1738 if (pause < min_pause) {
1739 trace_balance_dirty_pages(wb,
1740 sdtc->thresh,
1741 sdtc->bg_thresh,
1742 sdtc->dirty,
1743 sdtc->wb_thresh,
1744 sdtc->wb_dirty,
1745 dirty_ratelimit,
1746 task_ratelimit,
1747 pages_dirtied,
1748 period,
1749 min(pause, 0L),
1750 start_time);
1751 if (pause < -HZ) {
1752 current->dirty_paused_when = now;
1753 current->nr_dirtied = 0;
1754 } else if (period) {
1755 current->dirty_paused_when += period;
1756 current->nr_dirtied = 0;
1757 } else if (current->nr_dirtied_pause <= pages_dirtied)
1758 current->nr_dirtied_pause += pages_dirtied;
1759 break;
1760 }
1761 if (unlikely(pause > max_pause)) {
1762
1763 now += min(pause - max_pause, max_pause);
1764 pause = max_pause;
1765 }
1766
1767pause:
1768 trace_balance_dirty_pages(wb,
1769 sdtc->thresh,
1770 sdtc->bg_thresh,
1771 sdtc->dirty,
1772 sdtc->wb_thresh,
1773 sdtc->wb_dirty,
1774 dirty_ratelimit,
1775 task_ratelimit,
1776 pages_dirtied,
1777 period,
1778 pause,
1779 start_time);
1780 __set_current_state(TASK_KILLABLE);
1781 io_schedule_timeout(pause);
1782
1783 current->dirty_paused_when = now + pause;
1784 current->nr_dirtied = 0;
1785 current->nr_dirtied_pause = nr_dirtied_pause;
1786
1787
1788
1789
1790
1791 if (task_ratelimit)
1792 break;
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804 if (sdtc->wb_dirty <= wb_stat_error(wb))
1805 break;
1806
1807 if (fatal_signal_pending(current))
1808 break;
1809 }
1810
1811 if (!dirty_exceeded && wb->dirty_exceeded)
1812 wb->dirty_exceeded = 0;
1813
1814 if (writeback_in_progress(wb))
1815 return;
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825 if (laptop_mode)
1826 return;
1827
1828 if (nr_reclaimable > gdtc->bg_thresh)
1829 wb_start_background_writeback(wb);
1830}
1831
1832static DEFINE_PER_CPU(int, bdp_ratelimits);
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
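
/**
 * balance_dirty_pages_ratelimited - balance dirty memory state
 * @mapping: address_space which was dirtied
 *
 * Processes which are dirtying memory should call in here once for each page
 * which was newly dirtied.  The function will periodically check the system's
 * dirty state and will initiate writeback if needed.
 *
 * On really big machines, getting the global dirty state is expensive, so try
 * to avoid calling it too often (ratelimiting).  But once we're over the dirty
 * memory limit we decrease the ratelimiting by a lot, to prevent individual
 * processes from overshooting the limit by (ratelimit_pages) each.
 */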
1863void balance_dirty_pages_ratelimited(struct address_space *mapping)
1864{
1865 struct inode *inode = mapping->host;
1866 struct backing_dev_info *bdi = inode_to_bdi(inode);
1867 struct bdi_writeback *wb = NULL;
1868 int ratelimit;
1869 int *p;
1870
1871 if (!bdi_cap_account_dirty(bdi))
1872 return;
1873
1874 if (inode_cgwb_enabled(inode))
1875 wb = wb_get_create_current(bdi, GFP_KERNEL);
1876 if (!wb)
1877 wb = &bdi->wb;
1878
1879 ratelimit = current->nr_dirtied_pause;
1880 if (wb->dirty_exceeded)
1881 ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));
1882
1883 preempt_disable();
1884
1885
1886
1887
1888
1889
1890 p = this_cpu_ptr(&bdp_ratelimits);
1891 if (unlikely(current->nr_dirtied >= ratelimit))
1892 *p = 0;
1893 else if (unlikely(*p >= ratelimit_pages)) {
1894 *p = 0;
1895 ratelimit = 0;
1896 }
1897
1898
1899
1900
1901
1902 p = this_cpu_ptr(&dirty_throttle_leaks);
1903 if (*p > 0 && current->nr_dirtied < ratelimit) {
1904 unsigned long nr_pages_dirtied;
1905 nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
1906 *p -= nr_pages_dirtied;
1907 current->nr_dirtied += nr_pages_dirtied;
1908 }
1909 preempt_enable();
1910
1911 if (unlikely(current->nr_dirtied >= ratelimit))
1912 balance_dirty_pages(mapping, wb, current->nr_dirtied);
1913
1914 wb_put(wb);
1915}
1916EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
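
/**
 * wb_over_bg_thresh - does @wb need to be written back?
 * @wb: bdi_writeback of interest
 *
 * Determines whether background writeback should keep writing @wb or it's
 * clean enough.  Returns %true if writeback should continue.
 */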
1925bool wb_over_bg_thresh(struct bdi_writeback *wb)
1926{
1927 struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
1928 struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
1929 struct dirty_throttle_control * const gdtc = &gdtc_stor;
1930 struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
1931 &mdtc_stor : NULL;
1932
1933
1934
1935
1936
1937 gdtc->avail = global_dirtyable_memory();
1938 gdtc->dirty = global_node_page_state(NR_FILE_DIRTY) +
1939 global_node_page_state(NR_UNSTABLE_NFS);
1940 domain_dirty_limits(gdtc);
1941
1942 if (gdtc->dirty > gdtc->bg_thresh)
1943 return true;
1944
1945 if (wb_stat(wb, WB_RECLAIMABLE) >
1946 wb_calc_thresh(gdtc->wb, gdtc->bg_thresh))
1947 return true;
1948
1949 if (mdtc) {
1950 unsigned long filepages, headroom, writeback;
1951
1952 mem_cgroup_wb_stats(wb, &filepages, &headroom, &mdtc->dirty,
1953 &writeback);
1954 mdtc_calc_avail(mdtc, filepages, headroom);
1955 domain_dirty_limits(mdtc);
1956
1957 if (mdtc->dirty > mdtc->bg_thresh)
1958 return true;
1959
1960 if (wb_stat(wb, WB_RECLAIMABLE) >
1961 wb_calc_thresh(mdtc->wb, mdtc->bg_thresh))
1962 return true;
1963 }
1964
1965 return false;
1966}
1967
1968void throttle_vm_writeout(gfp_t gfp_mask)
1969{
1970 unsigned long background_thresh;
1971 unsigned long dirty_thresh;
1972
1973 for ( ; ; ) {
1974 global_dirty_limits(&background_thresh, &dirty_thresh);
1975 dirty_thresh = hard_dirty_limit(&global_wb_domain, dirty_thresh);
1976
1977
1978
1979
1980
1981 dirty_thresh += dirty_thresh / 10;
1982
1983 if (global_node_page_state(NR_UNSTABLE_NFS) +
1984 global_node_page_state(NR_WRITEBACK) <= dirty_thresh)
1985 break;
1986 congestion_wait(BLK_RW_ASYNC, HZ/10);
1987
1988
1989
1990
1991
1992
1993 if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO))
1994 break;
1995 }
1996}
1997
1998
1999
2000
2001int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
2002 void __user *buffer, size_t *length, loff_t *ppos)
2003{
2004 proc_dointvec(table, write, buffer, length, ppos);
2005 return 0;
2006}
2007
2008#ifdef CONFIG_BLOCK
2009void laptop_mode_timer_fn(unsigned long data)
2010{
2011 struct request_queue *q = (struct request_queue *)data;
2012 int nr_pages = global_node_page_state(NR_FILE_DIRTY) +
2013 global_node_page_state(NR_UNSTABLE_NFS);
2014 struct bdi_writeback *wb;
2015
2016
2017
2018
2019
2020 if (!bdi_has_dirty_io(&q->backing_dev_info))
2021 return;
2022
2023 rcu_read_lock();
2024 list_for_each_entry_rcu(wb, &q->backing_dev_info.wb_list, bdi_node)
2025 if (wb_has_dirty_io(wb))
2026 wb_start_writeback(wb, nr_pages, true,
2027 WB_REASON_LAPTOP_TIMER);
2028 rcu_read_unlock();
2029}
2030
2031
2032
2033
2034
2035
2036void laptop_io_completion(struct backing_dev_info *info)
2037{
2038 mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode);
2039}
2040
2041
2042
2043
2044
2045
2046void laptop_sync_completion(void)
2047{
2048 struct backing_dev_info *bdi;
2049
2050 rcu_read_lock();
2051
2052 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
2053 del_timer(&bdi->laptop_mode_wb_timer);
2054
2055 rcu_read_unlock();
2056}
2057#endif
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070void writeback_set_ratelimit(void)
2071{
2072 struct wb_domain *dom = &global_wb_domain;
2073 unsigned long background_thresh;
2074 unsigned long dirty_thresh;
2075
2076 global_dirty_limits(&background_thresh, &dirty_thresh);
2077 dom->dirty_limit = dirty_thresh;
2078 ratelimit_pages = dirty_thresh / (num_online_cpus() * 32);
2079 if (ratelimit_pages < 16)
2080 ratelimit_pages = 16;
2081}
2082
2083static int
2084ratelimit_handler(struct notifier_block *self, unsigned long action,
2085 void *hcpu)
2086{
2087
2088 switch (action & ~CPU_TASKS_FROZEN) {
2089 case CPU_ONLINE:
2090 case CPU_DEAD:
2091 writeback_set_ratelimit();
2092 return NOTIFY_OK;
2093 default:
2094 return NOTIFY_DONE;
2095 }
2096}
2097
2098static struct notifier_block ratelimit_nb = {
2099 .notifier_call = ratelimit_handler,
2100 .next = NULL,
2101};
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121void __init page_writeback_init(void)
2122{
2123 BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL));
2124
2125 writeback_set_ratelimit();
2126 register_cpu_notifier(&ratelimit_nb);
2127}
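
/**
 * tag_pages_for_writeback - tag pages to be written by write_cache_pages
 * @mapping: address space structure to write
 * @start: starting page index
 * @end: ending page index (inclusive)
 *
 * This function scans the page range from @start to @end (inclusive) and tags
 * all pages that have DIRTY tag set with a special TOWRITE tag.  The idea is
 * that write_cache_pages (or whoever calls this function) will then use the
 * TOWRITE tag to identify pages eligible for writeback.  This mechanism is
 * used to avoid livelocking of writeback by a process steadily creating new
 * dirty pages in the file (thus it is important for this function to be quick
 * so that it can tag pages faster than a dirtying process can create them).
 */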
2146void tag_pages_for_writeback(struct address_space *mapping,
2147 pgoff_t start, pgoff_t end)
2148{
2149#define WRITEBACK_TAG_BATCH 4096
2150 unsigned long tagged;
2151
2152 do {
2153 spin_lock_irq(&mapping->tree_lock);
2154 tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree,
2155 &start, end, WRITEBACK_TAG_BATCH,
2156 PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE);
2157 spin_unlock_irq(&mapping->tree_lock);
2158 WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH);
2159 cond_resched();
2160
2161 } while (tagged >= WRITEBACK_TAG_BATCH && start);
2162}
2163EXPORT_SYMBOL(tag_pages_for_writeback);
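
/**
 * write_cache_pages - walk the list of dirty pages of the given address space
 *			and write all of them.
 * @mapping: address space structure to write
 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
 * @writepage: function called for each page
 * @data: data passed to writepage function
 *
 * If a page is already under I/O, write_cache_pages() skips it, even
 * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
 * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
 * and msync() need to guarantee that all the data which was dirty at the time
 * the call was made get new I/O started against them.  If wbc->sync_mode is
 * WB_SYNC_ALL then we were called for data integrity and we must wait for
 * existing IO to complete.
 *
 * To avoid livelocks (when other process dirties new pages), we first tag
 * pages which should be written back with TOWRITE tag and only then start
 * writing them.  For data-integrity sync we have to be careful so that we do
 * not miss some pages (e.g., because some other process has cleared TOWRITE
 * tag we set).  The rule we follow is that TOWRITE tag can be cleared only
 * by the process clearing the DIRTY tag (and submitting the page for IO).
 */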
2187int write_cache_pages(struct address_space *mapping,
2188 struct writeback_control *wbc, writepage_t writepage,
2189 void *data)
2190{
2191 int ret = 0;
2192 int done = 0;
2193 struct pagevec pvec;
2194 int nr_pages;
2195 pgoff_t uninitialized_var(writeback_index);
2196 pgoff_t index;
2197 pgoff_t end;
2198 pgoff_t done_index;
2199 int cycled;
2200 int range_whole = 0;
2201 int tag;
2202
2203 pagevec_init(&pvec, 0);
2204 if (wbc->range_cyclic) {
2205 writeback_index = mapping->writeback_index;
2206 index = writeback_index;
2207 if (index == 0)
2208 cycled = 1;
2209 else
2210 cycled = 0;
2211 end = -1;
2212 } else {
2213 index = wbc->range_start >> PAGE_SHIFT;
2214 end = wbc->range_end >> PAGE_SHIFT;
2215 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2216 range_whole = 1;
2217 cycled = 1;
2218 }
2219 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2220 tag = PAGECACHE_TAG_TOWRITE;
2221 else
2222 tag = PAGECACHE_TAG_DIRTY;
2223retry:
2224 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2225 tag_pages_for_writeback(mapping, index, end);
2226 done_index = index;
2227 while (!done && (index <= end)) {
2228 int i;
2229
2230 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
2231 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
2232 if (nr_pages == 0)
2233 break;
2234
2235 for (i = 0; i < nr_pages; i++) {
2236 struct page *page = pvec.pages[i];
2237
2238
2239
2240
2241
2242
2243
2244
2245 if (page->index > end) {
2246
2247
2248
2249
2250 done = 1;
2251 break;
2252 }
2253
2254 done_index = page->index;
2255
2256 lock_page(page);
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266 if (unlikely(page->mapping != mapping)) {
2267continue_unlock:
2268 unlock_page(page);
2269 continue;
2270 }
2271
2272 if (!PageDirty(page)) {
2273
2274 goto continue_unlock;
2275 }
2276
2277 if (PageWriteback(page)) {
2278 if (wbc->sync_mode != WB_SYNC_NONE)
2279 wait_on_page_writeback(page);
2280 else
2281 goto continue_unlock;
2282 }
2283
2284 BUG_ON(PageWriteback(page));
2285 if (!clear_page_dirty_for_io(page))
2286 goto continue_unlock;
2287
2288 trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
2289 ret = (*writepage)(page, wbc, data);
2290 if (unlikely(ret)) {
2291 if (ret == AOP_WRITEPAGE_ACTIVATE) {
2292 unlock_page(page);
2293 ret = 0;
2294 } else {
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304 done_index = page->index + 1;
2305 done = 1;
2306 break;
2307 }
2308 }
2309
2310
2311
2312
2313
2314
2315
2316 if (--wbc->nr_to_write <= 0 &&
2317 wbc->sync_mode == WB_SYNC_NONE) {
2318 done = 1;
2319 break;
2320 }
2321 }
2322 pagevec_release(&pvec);
2323 cond_resched();
2324 }
2325 if (!cycled && !done) {
2326
2327
2328
2329
2330
2331 cycled = 1;
2332 index = 0;
2333 end = writeback_index - 1;
2334 goto retry;
2335 }
2336 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
2337 mapping->writeback_index = done_index;
2338
2339 return ret;
2340}
2341EXPORT_SYMBOL(write_cache_pages);
2342
2343
2344
2345
2346
2347static int __writepage(struct page *page, struct writeback_control *wbc,
2348 void *data)
2349{
2350 struct address_space *mapping = data;
2351 int ret = mapping->a_ops->writepage(page, wbc);
2352 mapping_set_error(mapping, ret);
2353 return ret;
2354}
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364int generic_writepages(struct address_space *mapping,
2365 struct writeback_control *wbc)
2366{
2367 struct blk_plug plug;
2368 int ret;
2369
2370
2371 if (!mapping->a_ops->writepage)
2372 return 0;
2373
2374 blk_start_plug(&plug);
2375 ret = write_cache_pages(mapping, wbc, __writepage, mapping);
2376 blk_finish_plug(&plug);
2377 return ret;
2378}
2379
2380EXPORT_SYMBOL(generic_writepages);
2381
2382int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
2383{
2384 int ret;
2385
2386 if (wbc->nr_to_write <= 0)
2387 return 0;
2388 if (mapping->a_ops->writepages)
2389 ret = mapping->a_ops->writepages(mapping, wbc);
2390 else
2391 ret = generic_writepages(mapping, wbc);
2392 return ret;
2393}
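
/**
 * write_one_page - write out a single page and optionally wait on I/O
 * @page: the page to write
 * @wait: if true, wait on writeout
 *
 * The page must be locked by the caller and will be unlocked upon return.
 *
 * Returns a negative error code if I/O failed, otherwise zero.
 */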
2404int write_one_page(struct page *page, int wait)
2405{
2406 struct address_space *mapping = page->mapping;
2407 int ret = 0;
2408 struct writeback_control wbc = {
2409 .sync_mode = WB_SYNC_ALL,
2410 .nr_to_write = 1,
2411 };
2412
2413 BUG_ON(!PageLocked(page));
2414
2415 if (wait)
2416 wait_on_page_writeback(page);
2417
2418 if (clear_page_dirty_for_io(page)) {
2419 get_page(page);
2420 ret = mapping->a_ops->writepage(page, &wbc);
2421 if (ret == 0 && wait) {
2422 wait_on_page_writeback(page);
2423 if (PageError(page))
2424 ret = -EIO;
2425 }
2426 put_page(page);
2427 } else {
2428 unlock_page(page);
2429 }
2430 return ret;
2431}
2432EXPORT_SYMBOL(write_one_page);
2433
2434
2435
2436
2437int __set_page_dirty_no_writeback(struct page *page)
2438{
2439 if (!PageDirty(page))
2440 return !TestSetPageDirty(page);
2441 return 0;
2442}
2443
2444
2445
2446
2447
2448
2449
2450
2451void account_page_dirtied(struct page *page, struct address_space *mapping)
2452{
2453 struct inode *inode = mapping->host;
2454
2455 trace_writeback_dirty_page(page, mapping);
2456
2457 if (mapping_cap_account_dirty(mapping)) {
2458 struct bdi_writeback *wb;
2459
2460 inode_attach_wb(inode, page);
2461 wb = inode_to_wb(inode);
2462
2463 mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_DIRTY);
2464 __inc_node_page_state(page, NR_FILE_DIRTY);
2465 __inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
2466 __inc_node_page_state(page, NR_DIRTIED);
2467 __inc_wb_stat(wb, WB_RECLAIMABLE);
2468 __inc_wb_stat(wb, WB_DIRTIED);
2469 task_io_account_write(PAGE_SIZE);
2470 current->nr_dirtied++;
2471 this_cpu_inc(bdp_ratelimits);
2472 }
2473}
2474EXPORT_SYMBOL(account_page_dirtied);
2475
2476
2477
2478
2479
2480
2481void account_page_cleaned(struct page *page, struct address_space *mapping,
2482 struct bdi_writeback *wb)
2483{
2484 if (mapping_cap_account_dirty(mapping)) {
2485 mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
2486 dec_node_page_state(page, NR_FILE_DIRTY);
2487 dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
2488 dec_wb_stat(wb, WB_RECLAIMABLE);
2489 task_io_account_cancelled_write(PAGE_SIZE);
2490 }
2491}
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505int __set_page_dirty_nobuffers(struct page *page)
2506{
2507 lock_page_memcg(page);
2508 if (!TestSetPageDirty(page)) {
2509 struct address_space *mapping = page_mapping(page);
2510 unsigned long flags;
2511
2512 if (!mapping) {
2513 unlock_page_memcg(page);
2514 return 1;
2515 }
2516
2517 spin_lock_irqsave(&mapping->tree_lock, flags);
2518 BUG_ON(page_mapping(page) != mapping);
2519 WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
2520 account_page_dirtied(page, mapping);
2521 radix_tree_tag_set(&mapping->page_tree, page_index(page),
2522 PAGECACHE_TAG_DIRTY);
2523 spin_unlock_irqrestore(&mapping->tree_lock, flags);
2524 unlock_page_memcg(page);
2525
2526 if (mapping->host) {
2527
2528 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
2529 }
2530 return 1;
2531 }
2532 unlock_page_memcg(page);
2533 return 0;
2534}
2535EXPORT_SYMBOL(__set_page_dirty_nobuffers);
2536
2537
2538
2539
2540
2541
2542
2543
2544void account_page_redirty(struct page *page)
2545{
2546 struct address_space *mapping = page->mapping;
2547
2548 if (mapping && mapping_cap_account_dirty(mapping)) {
2549 struct inode *inode = mapping->host;
2550 struct bdi_writeback *wb;
2551 bool locked;
2552
2553 wb = unlocked_inode_to_wb_begin(inode, &locked);
2554 current->nr_dirtied--;
2555 dec_node_page_state(page, NR_DIRTIED);
2556 dec_wb_stat(wb, WB_DIRTIED);
2557 unlocked_inode_to_wb_end(inode, locked);
2558 }
2559}
2560EXPORT_SYMBOL(account_page_redirty);
2561
2562
2563
2564
2565
2566
2567int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
2568{
2569 int ret;
2570
2571 wbc->pages_skipped++;
2572 ret = __set_page_dirty_nobuffers(page);
2573 account_page_redirty(page);
2574 return ret;
2575}
2576EXPORT_SYMBOL(redirty_page_for_writepage);
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589int set_page_dirty(struct page *page)
2590{
2591 struct address_space *mapping = page_mapping(page);
2592
2593 page = compound_head(page);
2594 if (likely(mapping)) {
2595 int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606 if (PageReclaim(page))
2607 ClearPageReclaim(page);
2608#ifdef CONFIG_BLOCK
2609 if (!spd)
2610 spd = __set_page_dirty_buffers;
2611#endif
2612 return (*spd)(page);
2613 }
2614 if (!PageDirty(page)) {
2615 if (!TestSetPageDirty(page))
2616 return 1;
2617 }
2618 return 0;
2619}
2620EXPORT_SYMBOL(set_page_dirty);
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632int set_page_dirty_lock(struct page *page)
2633{
2634 int ret;
2635
2636 lock_page(page);
2637 ret = set_page_dirty(page);
2638 unlock_page(page);
2639 return ret;
2640}
2641EXPORT_SYMBOL(set_page_dirty_lock);
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656void cancel_dirty_page(struct page *page)
2657{
2658 struct address_space *mapping = page_mapping(page);
2659
2660 if (mapping_cap_account_dirty(mapping)) {
2661 struct inode *inode = mapping->host;
2662 struct bdi_writeback *wb;
2663 bool locked;
2664
2665 lock_page_memcg(page);
2666 wb = unlocked_inode_to_wb_begin(inode, &locked);
2667
2668 if (TestClearPageDirty(page))
2669 account_page_cleaned(page, mapping, wb);
2670
2671 unlocked_inode_to_wb_end(inode, locked);
2672 unlock_page_memcg(page);
2673 } else {
2674 ClearPageDirty(page);
2675 }
2676}
2677EXPORT_SYMBOL(cancel_dirty_page);
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693int clear_page_dirty_for_io(struct page *page)
2694{
2695 struct address_space *mapping = page_mapping(page);
2696 int ret = 0;
2697
2698 BUG_ON(!PageLocked(page));
2699
2700 if (mapping && mapping_cap_account_dirty(mapping)) {
2701 struct inode *inode = mapping->host;
2702 struct bdi_writeback *wb;
2703 bool locked;
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730 if (page_mkclean(page))
2731 set_page_dirty(page);
2732
2733
2734
2735
2736
2737
2738
2739
2740 wb = unlocked_inode_to_wb_begin(inode, &locked);
2741 if (TestClearPageDirty(page)) {
2742 mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
2743 dec_node_page_state(page, NR_FILE_DIRTY);
2744 dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
2745 dec_wb_stat(wb, WB_RECLAIMABLE);
2746 ret = 1;
2747 }
2748 unlocked_inode_to_wb_end(inode, locked);
2749 return ret;
2750 }
2751 return TestClearPageDirty(page);
2752}
2753EXPORT_SYMBOL(clear_page_dirty_for_io);
2754
2755int test_clear_page_writeback(struct page *page)
2756{
2757 struct address_space *mapping = page_mapping(page);
2758 int ret;
2759
2760 lock_page_memcg(page);
2761 if (mapping) {
2762 struct inode *inode = mapping->host;
2763 struct backing_dev_info *bdi = inode_to_bdi(inode);
2764 unsigned long flags;
2765
2766 spin_lock_irqsave(&mapping->tree_lock, flags);
2767 ret = TestClearPageWriteback(page);
2768 if (ret) {
2769 radix_tree_tag_clear(&mapping->page_tree,
2770 page_index(page),
2771 PAGECACHE_TAG_WRITEBACK);
2772 if (bdi_cap_account_writeback(bdi)) {
2773 struct bdi_writeback *wb = inode_to_wb(inode);
2774
2775 __dec_wb_stat(wb, WB_WRITEBACK);
2776 __wb_writeout_inc(wb);
2777 }
2778 }
2779
2780 if (mapping->host && !mapping_tagged(mapping,
2781 PAGECACHE_TAG_WRITEBACK))
2782 sb_clear_inode_writeback(mapping->host);
2783
2784 spin_unlock_irqrestore(&mapping->tree_lock, flags);
2785 } else {
2786 ret = TestClearPageWriteback(page);
2787 }
2788 if (ret) {
2789 mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
2790 dec_node_page_state(page, NR_WRITEBACK);
2791 dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
2792 inc_node_page_state(page, NR_WRITTEN);
2793 }
2794 unlock_page_memcg(page);
2795 return ret;
2796}
2797
2798int __test_set_page_writeback(struct page *page, bool keep_write)
2799{
2800 struct address_space *mapping = page_mapping(page);
2801 int ret;
2802
2803 lock_page_memcg(page);
2804 if (mapping) {
2805 struct inode *inode = mapping->host;
2806 struct backing_dev_info *bdi = inode_to_bdi(inode);
2807 unsigned long flags;
2808
2809 spin_lock_irqsave(&mapping->tree_lock, flags);
2810 ret = TestSetPageWriteback(page);
2811 if (!ret) {
2812 bool on_wblist;
2813
2814 on_wblist = mapping_tagged(mapping,
2815 PAGECACHE_TAG_WRITEBACK);
2816
2817 radix_tree_tag_set(&mapping->page_tree,
2818 page_index(page),
2819 PAGECACHE_TAG_WRITEBACK);
2820 if (bdi_cap_account_writeback(bdi))
2821 __inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);
2822
2823
2824
2825
2826
2827
2828 if (mapping->host && !on_wblist)
2829 sb_mark_inode_writeback(mapping->host);
2830 }
2831 if (!PageDirty(page))
2832 radix_tree_tag_clear(&mapping->page_tree,
2833 page_index(page),
2834 PAGECACHE_TAG_DIRTY);
2835 if (!keep_write)
2836 radix_tree_tag_clear(&mapping->page_tree,
2837 page_index(page),
2838 PAGECACHE_TAG_TOWRITE);
2839 spin_unlock_irqrestore(&mapping->tree_lock, flags);
2840 } else {
2841 ret = TestSetPageWriteback(page);
2842 }
2843 if (!ret) {
2844 mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
2845 inc_node_page_state(page, NR_WRITEBACK);
2846 inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
2847 }
2848 unlock_page_memcg(page);
2849 return ret;
2850
2851}
2852EXPORT_SYMBOL(__test_set_page_writeback);
2853
2854
2855
2856
2857
2858int mapping_tagged(struct address_space *mapping, int tag)
2859{
2860 return radix_tree_tagged(&mapping->page_tree, tag);
2861}
2862EXPORT_SYMBOL(mapping_tagged);
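
/**
 * wait_for_stable_page() - wait for writeback to finish, if necessary.
 * @page:	The page to wait on.
 *
 * This function determines if the given page is related to a backing device
 * that requires page contents to be held stable during writeback.  If so, then
 * it will wait for any pending writeback to complete.
 */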
2872void wait_for_stable_page(struct page *page)
2873{
2874 if (bdi_cap_stable_pages_required(inode_to_bdi(page->mapping->host)))
2875 wait_on_page_writeback(page);
2876}
2877EXPORT_SYMBOL_GPL(wait_for_stable_page);
2878