/*
 * buffered writeback throttling, loosely based on CoDel.
 *
 * Monitor the minimum read latency over a window of time. If it exceeds
 * the target, scale down the allowed background write depth; if latencies
 * look good, scale back up towards the device queue depth.
 *
 * Copyright (C) 2016 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/blk_types.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/swap.h>

#include "blk-wbt.h"

#define CREATE_TRACE_POINTS
#include <trace/events/wbt.h>

enum {
	/*
	 * Default ceiling for the write depth. The actual limits are
	 * scaled up or down from min(RWB_DEF_DEPTH, queue depth), based
	 * on the observed latencies.
	 */
	RWB_DEF_DEPTH		= 16,

	/*
	 * 100msec monitoring window
	 */
	RWB_WINDOW_NSEC		= 100 * 1000 * 1000ULL,

	/*
	 * Disregard stats for a window if we don't have at least this
	 * many write samples in it.
	 */
	RWB_MIN_WRITE_SAMPLES	= 3,

	/*
	 * If we have this many consecutive windows without enough
	 * information to scale up or down, drift back towards the
	 * default state (scale_step == 0).
	 */
	RWB_UNKNOWN_BUMP	= 5,
};

static inline bool rwb_enabled(struct rq_wb *rwb)
{
	return rwb && rwb->wb_normal != 0;
}

/*
 * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
 * false if 'v' + 1 would be bigger than 'below'.
 */
static bool atomic_inc_below(atomic_t *v, int below)
{
	int cur = atomic_read(v);

	for (;;) {
		int old;

		if (cur >= below)
			return false;
		old = atomic_cmpxchg(v, cur, cur + 1);
		if (old == cur)
			break;
		cur = old;
	}

	return true;
}

static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
{
	if (rwb_enabled(rwb)) {
		const unsigned long cur = jiffies;

		if (cur != *var)
			*var = cur;
	}
}

/*
 * If a task was rate throttled in balance_dirty_pages() within the last
 * second or so, use that to indicate a higher cleaning rate.
 */
static bool wb_recent_wait(struct rq_wb *rwb)
{
	struct bdi_writeback *wb = &rwb->queue->backing_dev_info->wb;

	return time_before(jiffies, wb->dirty_sleep + HZ);
}

static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, bool is_kswapd)
{
	return &rwb->rq_wait[is_kswapd];
}

static void rwb_wake_all(struct rq_wb *rwb)
{
	int i;

	for (i = 0; i < WBT_NUM_RWQ; i++) {
		struct rq_wait *rqw = &rwb->rq_wait[i];

		if (waitqueue_active(&rqw->wait))
			wake_up_all(&rqw->wait);
	}
}

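/*
 * Drop one tracked inflight request, and wake up waiters if we are now
 * far enough below the current limit for them to make progress.
 */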
void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
{
	struct rq_wait *rqw;
	int inflight, limit;

	if (!(wb_acct & WBT_TRACKED))
		return;

	rqw = get_rq_wait(rwb, wb_acct & WBT_KSWAPD);
	inflight = atomic_dec_return(&rqw->inflight);

	/*
	 * wbt got disabled with IO in flight. Wake up any potential
	 * waiters, we don't have to do more than that.
	 */
	if (unlikely(!rwb_enabled(rwb))) {
		rwb_wake_all(rwb);
		return;
	}

	/*
	 * If the device does write back caching, drop further down
	 * before we wake people up.
	 */
	if (rwb->wc && !wb_recent_wait(rwb))
		limit = 0;
	else
		limit = rwb->wb_normal;

	/*
	 * Don't wake anyone up if we are above the normal limit.
	 */
	if (inflight && inflight >= limit)
		return;

	if (waitqueue_active(&rqw->wait)) {
		int diff = limit - inflight;

		if (!inflight || diff >= rwb->wb_background / 2)
			wake_up_all(&rqw->wait);
	}
}

/*
 * Called on completion of a request. Note that it's also called when
 * a tracked request is merged, instead of freed.
 */
void wbt_done(struct rq_wb *rwb, struct blk_issue_stat *stat)
{
	if (!rwb)
		return;

	if (!wbt_is_tracked(stat)) {
		if (rwb->sync_cookie == stat) {
			rwb->sync_issue = 0;
			rwb->sync_cookie = NULL;
		}

		if (wbt_is_read(stat))
			wb_timestamp(rwb, &rwb->last_comp);
		wbt_clear_state(stat);
	} else {
		WARN_ON_ONCE(stat == rwb->sync_cookie);
		__wbt_done(rwb, wbt_stat_to_mask(stat));
		wbt_clear_state(stat);
	}
}

/*
 * Return true if we can't increase the depth any further by scaling.
 */
static bool calc_wb_limits(struct rq_wb *rwb)
{
	unsigned int depth;
	bool ret = false;

	if (!rwb->min_lat_nsec) {
		rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0;
		return false;
	}

	/*
	 * For QD=1 devices, this is a special case. It's important for
	 * those to have one request ready when one completes, so force
	 * a depth of 2 for those devices unless we have already scaled
	 * down, in which case we drop to 1.
	 */
	if (rwb->queue_depth == 1) {
		if (rwb->scale_step > 0)
			rwb->wb_max = rwb->wb_normal = 1;
		else {
			rwb->wb_max = rwb->wb_normal = 2;
			ret = true;
		}
		rwb->wb_background = 1;
	} else {
		/*
		 * scale_step == 0 is our default state. If we have
		 * suffered latency spikes, step will be > 0, and we
		 * shrink the allowed write depths. If step is < 0, we're
		 * only doing writes, and we allow a temporarily higher
		 * depth to increase performance.
		 */
		depth = min_t(unsigned int, RWB_DEF_DEPTH, rwb->queue_depth);
		if (rwb->scale_step > 0)
			depth = 1 + ((depth - 1) >> min(31, rwb->scale_step));
		else if (rwb->scale_step < 0) {
			unsigned int maxd = 3 * rwb->queue_depth / 4;

			depth = 1 + ((depth - 1) << -rwb->scale_step);
			if (depth > maxd) {
				depth = maxd;
				ret = true;
			}
		}

		/*
		 * Set our max/normal/bg queue depths based on how far
		 * we have scaled down (->scale_step).
		 */
		rwb->wb_max = depth;
		rwb->wb_normal = (rwb->wb_max + 1) / 2;
		rwb->wb_background = (rwb->wb_max + 3) / 4;
	}

	return ret;
}

static inline bool stat_sample_valid(struct blk_rq_stat *stat)
{
	/*
	 * We need at least one read sample, and a minimum of
	 * RWB_MIN_WRITE_SAMPLES. We need the read sample because we do
	 * the latency tracking on reads, and the write samples to know
	 * that writes were actually issued in this window.
	 */
	return (stat[READ].nr_samples >= 1 &&
		stat[WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES);
}

static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
{
	u64 now, issue = ACCESS_ONCE(rwb->sync_issue);

	if (!issue || !rwb->sync_cookie)
		return 0;

	now = ktime_to_ns(ktime_get());
	return now - issue;
}

enum {
	LAT_OK = 1,
	LAT_UNKNOWN,
	LAT_UNKNOWN_WRITES,
	LAT_EXCEEDED,
};

static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
{
	struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
	u64 thislat;

	/*
	 * If our stored sync issue exceeds the window size, or it
	 * exceeds our min target AND we haven't logged any entries,
	 * flag the latency as exceeded. wbt works off completion
	 * latencies, but for a flooded device, a single sync IO can
	 * take a long time to complete after being issued. If this
	 * time exceeds the monitoring window AND we didn't see any
	 * other completions in that window, count that sync IO as a
	 * violation of the latency.
	 */
	thislat = rwb_sync_issue_lat(rwb);
	if (thislat > rwb->cur_win_nsec ||
	    (thislat > rwb->min_lat_nsec && !stat[READ].nr_samples)) {
		trace_wbt_lat(bdi, thislat);
		return LAT_EXCEEDED;
	}

	/*
	 * No read/write mix, if stat isn't valid
	 */
	if (!stat_sample_valid(stat)) {
		/*
		 * If we had writes in this stat window and the window is
		 * current, we're only doing writes. If a task recently
		 * waited or still has writes in flight, consider us doing
		 * just writes as well.
		 */
		if (stat[WRITE].nr_samples || wb_recent_wait(rwb) ||
		    wbt_inflight(rwb))
			return LAT_UNKNOWN_WRITES;
		return LAT_UNKNOWN;
	}

	/*
	 * If the 'min' latency exceeds our target, step down.
	 */
	if (stat[READ].min > rwb->min_lat_nsec) {
		trace_wbt_lat(bdi, stat[READ].min);
		trace_wbt_stat(bdi, stat);
		return LAT_EXCEEDED;
	}

	if (rwb->scale_step)
		trace_wbt_stat(bdi, stat);

	return LAT_OK;
}

static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
{
	struct backing_dev_info *bdi = rwb->queue->backing_dev_info;

	trace_wbt_step(bdi, msg, rwb->scale_step, rwb->cur_win_nsec,
			rwb->wb_background, rwb->wb_normal, rwb->wb_max);
}

static void scale_up(struct rq_wb *rwb)
{
	/*
	 * Hit max in previous round, stop here
	 */
	if (rwb->scaled_max)
		return;

	rwb->scale_step--;
	rwb->unknown_cnt = 0;

	rwb->scaled_max = calc_wb_limits(rwb);

	rwb_wake_all(rwb);

	rwb_trace_step(rwb, "step up");
}

/*
 * Scale rwb down. If 'hard_throttle' is set, snap a negative scale_step
 * straight back to zero, since we know latencies are bad.
 */
static void scale_down(struct rq_wb *rwb, bool hard_throttle)
{
	/*
	 * Stop scaling down when we've hit the limit. This also prevents
	 * ->scale_step from going to crazy values, if the device can't
	 * keep up.
	 */
	if (rwb->wb_max == 1)
		return;

	if (rwb->scale_step < 0 && hard_throttle)
		rwb->scale_step = 0;
	else
		rwb->scale_step++;

	rwb->scaled_max = false;
	rwb->unknown_cnt = 0;
	calc_wb_limits(rwb);
	rwb_trace_step(rwb, "step down");
}

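/*
 * (Re)arm the stats window. When we have scaled down, the window is
 * shrunk by roughly 1/sqrt(scale_step + 1) so that we react faster while
 * throttled; otherwise the full window size is used.
 */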
static void rwb_arm_timer(struct rq_wb *rwb)
{
	if (rwb->scale_step > 0) {
		/*
		 * We should speed this up, using some variant of a fast
		 * integer inverse square root calculation. Since we only
		 * do this for every window expiration, it doesn't need to
		 * be that fast.
		 */
		rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
					int_sqrt((rwb->scale_step + 1) << 8));
	} else {
		/*
		 * Not throttled down, monitor over the full window size.
		 */
		rwb->cur_win_nsec = rwb->win_nsec;
	}

	blk_stat_activate_nsecs(rwb->cb, rwb->cur_win_nsec);
}

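/*
 * Stats window callback: evaluate the latencies seen in the last window
 * and adjust the throttling depth accordingly.
 */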
static void wb_timer_fn(struct blk_stat_callback *cb)
{
	struct rq_wb *rwb = cb->data;
	unsigned int inflight = wbt_inflight(rwb);
	int status;

	status = latency_exceeded(rwb, cb->stat);

	trace_wbt_timer(rwb->queue->backing_dev_info, status, rwb->scale_step,
			inflight);

	/*
	 * If we exceeded the latency target, step down. If we did not,
	 * step one level up. If we don't know enough yet, do nothing.
	 */
	switch (status) {
	case LAT_EXCEEDED:
		scale_down(rwb, true);
		break;
	case LAT_OK:
		scale_up(rwb);
		break;
	case LAT_UNKNOWN_WRITES:
		/*
		 * We don't have a valid read/write sample, but we do have
		 * writes going on. Allow the scale step to go negative, to
		 * temporarily boost write performance.
		 */
		scale_up(rwb);
		break;
	case LAT_UNKNOWN:
		if (++rwb->unknown_cnt < RWB_UNKNOWN_BUMP)
			break;
		/*
		 * We get here if we previously scaled up or down, but
		 * currently don't have a valid read/write sample. For that
		 * case, slowly return to center state (step == 0).
		 */
		if (rwb->scale_step > 0)
			scale_up(rwb);
		else if (rwb->scale_step < 0)
			scale_down(rwb, false);
		break;
	default:
		break;
	}

	/*
	 * Re-arm the timer if we are still scaled or have IO in flight.
	 */
	if (rwb->scale_step || inflight)
		rwb_arm_timer(rwb);
}

void wbt_update_limits(struct rq_wb *rwb)
{
	rwb->scale_step = 0;
	rwb->scaled_max = false;
	calc_wb_limits(rwb);

	rwb_wake_all(rwb);
}

static bool close_io(struct rq_wb *rwb)
{
	const unsigned long now = jiffies;

	return time_before(now, rwb->last_issue + HZ / 10) ||
		time_before(now, rwb->last_comp + HZ / 10);
}

#define REQ_HIPRIO	(REQ_SYNC | REQ_META | REQ_PRIO)

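/*
 * Pick the inflight limit to apply to this buffered write.
 */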
static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
{
	unsigned int limit;

	/*
	 * At this point we know it's a buffered write. If this is
	 * kswapd trying to free memory, or REQ_SYNC is set, then
	 * it's WB_SYNC_ALL writeback, and we'll use the max limit for
	 * that. If the write is marked as a background write, then use
	 * the idle limit, or go to normal if we haven't had competing
	 * IO for a bit.
	 */
	if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd())
		limit = rwb->wb_max;
	else if ((rw & REQ_BACKGROUND) || close_io(rwb)) {
		/*
		 * Background writeback, or writes issued close to other
		 * IO, get the lowest (background) depth.
		 */
		limit = rwb->wb_background;
	} else
		limit = rwb->wb_normal;

	return limit;
}

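/*
 * Try to account this write against the current inflight limit without
 * sleeping. Returns true if the caller may proceed, false if it must wait.
 */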
static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw,
			     wait_queue_entry_t *wait, unsigned long rw)
{
	/*
	 * inc it here even if disabled, since we'll dec it at completion.
	 * this only happens if the task was sleeping in __wbt_wait(),
	 * and someone turned it off at the same time.
	 */
	if (!rwb_enabled(rwb)) {
		atomic_inc(&rqw->inflight);
		return true;
	}

	/*
	 * If the waitqueue is already active and we are not the next
	 * in line to be woken up, wait for our turn.
	 */
	if (waitqueue_active(&rqw->wait) &&
	    rqw->wait.head.next != &wait->entry)
		return false;

	return atomic_inc_below(&rqw->inflight, get_limit(rwb, rw));
}

/*
 * Block if we will exceed our limit, or if we are currently waiting for
 * the timer to kick off queuing again.
 */
static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock)
	__releases(lock)
	__acquires(lock)
{
	struct rq_wait *rqw = get_rq_wait(rwb, current_is_kswapd());
	DEFINE_WAIT(wait);

	if (may_queue(rwb, rqw, &wait, rw))
		return;

	do {
		prepare_to_wait_exclusive(&rqw->wait, &wait,
						TASK_UNINTERRUPTIBLE);

		if (may_queue(rwb, rqw, &wait, rw))
			break;

		if (lock) {
			spin_unlock_irq(lock);
			io_schedule();
			spin_lock_irq(lock);
		} else
			io_schedule();
	} while (1);

	finish_wait(&rqw->wait, &wait);
}

static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
{
	const int op = bio_op(bio);

	/*
	 * If not a WRITE, do nothing
	 */
	if (op != REQ_OP_WRITE)
		return false;

	/*
	 * Don't throttle WRITE_ODIRECT
	 */
	if ((bio->bi_opf & (REQ_SYNC | REQ_IDLE)) == (REQ_SYNC | REQ_IDLE))
		return false;

	return true;
}

/*
 * Decide whether to throttle this bio. May sleep, if we have exceeded the
 * writeback limits. Caller can pass in an irq-held spinlock; if we have to
 * sleep for throttling, it will be released and re-acquired. Returns the
 * wbt flags to store for completion handling.
 */
enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock)
{
	unsigned int ret = 0;

	if (!rwb_enabled(rwb))
		return 0;

	if (bio_op(bio) == REQ_OP_READ)
		ret = WBT_READ;

	if (!wbt_should_throttle(rwb, bio)) {
		if (ret & WBT_READ)
			wb_timestamp(rwb, &rwb->last_issue);
		return ret;
	}

	__wbt_wait(rwb, bio->bi_opf, lock);

	if (!blk_stat_is_active(rwb->cb))
		rwb_arm_timer(rwb);

	if (current_is_kswapd())
		ret |= WBT_KSWAPD;

	return ret | WBT_TRACKED;
}

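/*
 * Called when a request is issued to the device.
 */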
void wbt_issue(struct rq_wb *rwb, struct blk_issue_stat *stat)
{
	if (!rwb_enabled(rwb))
		return;

	/*
	 * Track sync issue, in case it takes a long time to complete.
	 * Allows us to react quicker, if a sync IO takes a long time to
	 * complete. Note that this is just a hint. 'stat' can go away
	 * when the request completes, so it's important we never
	 * dereference it - we only compare against the address.
	 */
	if (wbt_is_read(stat) && !rwb->sync_issue) {
		rwb->sync_cookie = stat;
		rwb->sync_issue = blk_stat_time(stat);
	}
}

void wbt_requeue(struct rq_wb *rwb, struct blk_issue_stat *stat)
{
	if (!rwb_enabled(rwb))
		return;
	if (stat == rwb->sync_cookie) {
		rwb->sync_issue = 0;
		rwb->sync_cookie = NULL;
	}
}

void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
{
	if (rwb) {
		rwb->queue_depth = depth;
		wbt_update_limits(rwb);
	}
}

void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on)
{
	if (rwb)
		rwb->wc = write_cache_on;
}

/*
 * Disable wbt, if enabled by default.
 */
void wbt_disable_default(struct request_queue *q)
{
	struct rq_wb *rwb = q->rq_wb;

	if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT)
		wbt_exit(q);
}
EXPORT_SYMBOL_GPL(wbt_disable_default);

/*
 * Enable wbt if defaults are configured that way
 */
void wbt_enable_default(struct request_queue *q)
{
	/* Throttling already enabled? */
	if (q->rq_wb)
		return;

	/* Queue not registered? Maybe shutting down... */
	if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
		return;

	if ((q->mq_ops && IS_ENABLED(CONFIG_BLK_WBT_MQ)) ||
	    (q->request_fn && IS_ENABLED(CONFIG_BLK_WBT_SQ)))
		wbt_init(q);
}
EXPORT_SYMBOL_GPL(wbt_enable_default);

u64 wbt_default_latency_nsec(struct request_queue *q)
{
	/*
	 * We default to 2msec for non-rotational storage, and 75msec
	 * for rotational storage.
	 */
	if (blk_queue_nonrot(q))
		return 2000000ULL;
	else
		return 75000000ULL;
}

static int wbt_data_dir(const struct request *rq)
{
	return rq_data_dir(rq);
}

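/*
 * Allocate and enable writeback throttling for a queue, using the default
 * latency target and the queue's current depth and write cache settings.
 */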
int wbt_init(struct request_queue *q)
{
	struct rq_wb *rwb;
	int i;

	BUILD_BUG_ON(WBT_NR_BITS > BLK_STAT_RES_BITS);

	rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
	if (!rwb)
		return -ENOMEM;

	rwb->cb = blk_stat_alloc_callback(wb_timer_fn, wbt_data_dir, 2, rwb);
	if (!rwb->cb) {
		kfree(rwb);
		return -ENOMEM;
	}

	for (i = 0; i < WBT_NUM_RWQ; i++) {
		atomic_set(&rwb->rq_wait[i].inflight, 0);
		init_waitqueue_head(&rwb->rq_wait[i].wait);
	}

	rwb->wc = 1;
	rwb->queue_depth = RWB_DEF_DEPTH;
	rwb->last_comp = rwb->last_issue = jiffies;
	rwb->queue = q;
	rwb->win_nsec = RWB_WINDOW_NSEC;
	rwb->enable_state = WBT_STATE_ON_DEFAULT;
	wbt_update_limits(rwb);

	/*
	 * Assign rwb and add the stats callback.
	 */
	q->rq_wb = rwb;
	blk_stat_add_callback(q, rwb->cb);

	rwb->min_lat_nsec = wbt_default_latency_nsec(q);

	wbt_set_queue_depth(rwb, blk_queue_depth(q));
	wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags));

	return 0;
}

void wbt_exit(struct request_queue *q)
{
	struct rq_wb *rwb = q->rq_wb;

	if (rwb) {
		blk_stat_remove_callback(q, rwb->cb);
		blk_stat_free_callback(rwb->cb);
		q->rq_wb = NULL;
		kfree(rwb);
	}
}