/* SPDX-License-Identifier: GPL-2.0
 *
 * IO cost model based controller.
 */
175#include <linux/kernel.h>
176#include <linux/module.h>
177#include <linux/timer.h>
178#include <linux/time64.h>
179#include <linux/parser.h>
180#include <linux/sched/signal.h>
181#include <linux/blk-cgroup.h>
182#include "blk-rq-qos.h"
183#include "blk-stat.h"
184#include "blk-wbt.h"
185#include "blk.h"

#ifdef CONFIG_TRACEPOINTS

/* scratch buffer for cgroup paths emitted by the iocost tracepoints, serialized by trace_iocg_path_lock */
#define TRACE_IOCG_PATH_LEN 1024
static DEFINE_SPINLOCK(trace_iocg_path_lock);
static char trace_iocg_path[TRACE_IOCG_PATH_LEN];

194#define TRACE_IOCG_PATH(type, iocg, ...) \
195 do { \
196 unsigned long flags; \
197 if (trace_iocost_##type##_enabled()) { \
198 spin_lock_irqsave(&trace_iocg_path_lock, flags); \
199 cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup, \
200 trace_iocg_path, TRACE_IOCG_PATH_LEN); \
201 trace_iocost_##type(iocg, trace_iocg_path, \
202 ##__VA_ARGS__); \
203 spin_unlock_irqrestore(&trace_iocg_path_lock, flags); \
204 } \
205 } while (0)
206
207#else
208#define TRACE_IOCG_PATH(type, iocg, ...) do { } while (0)
209#endif
210
enum {
 MILLION = 1000000,

 /* timer period is calculated from latency requirements; bound it to 1-1000ms */
 MIN_PERIOD = USEC_PER_MSEC,
 MAX_PERIOD = USEC_PER_SEC,

 /*
  * A cgroup's vtime can run up to 50% behind the device vtime, which
  * serves as its IO credit buffer. Surplus weight adjustment is
  * immediately canceled if the vtime margin runs below 10%.
  */
 MARGIN_PCT = 50,
 INUSE_MARGIN_PCT = 10,

 /* allow some play in the waitq timer operations */
 WAITQ_TIMER_MARGIN_PCT = 5,

 /*
  * vtime can wrap well within a reasonable uptime when vrate is
  * consistently raised. Don't trust a recorded cgroup vtime if the
  * period counter indicates that it's older than 5 minutes.
  */
 VTIME_VALID_DUR = 300 * USEC_PER_SEC,

 /*
  * Remember the past three non-zero usages and use the max for
  * surplus calculation. Three slots guarantee that we remember one
  * full period usage from the last active period even if partial
  * periods are in between. Don't start giving away weight before
  * collecting two valid usages.
  */
 NR_USAGE_SLOTS = 3,
 MIN_VALID_USAGES = 2,

 /* 1/64k is granular enough and can easily be handled w/ u32 */
 HWEIGHT_WHOLE = 1 << 16,

 /*
  * 1s worth of vtime is 2^37. This gives us both sub-nanosecond
  * granularity and days of wrap-around time even at extreme vrates.
  */
 VTIME_PER_SEC_SHIFT = 37,
 VTIME_PER_SEC = 1LLU << VTIME_PER_SEC_SHIFT,
 VTIME_PER_USEC = VTIME_PER_SEC / USEC_PER_SEC,

 /* bound vrate adjustments within two orders of magnitude */
 VRATE_MIN_PPM = 10000, /* 1% */
 VRATE_MAX_PPM = 100000000, /* 10000% */

 VRATE_MIN = VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
 VRATE_CLAMP_ADJ_PCT = 4,

 /* if IOs end up waiting for requests, issue less */
 RQ_WAIT_BUSY_PCT = 5,

 /* unbusy hysteresis */
 UNBUSY_THR_PCT = 75,

 /* don't let commands which take a very long time pin lagging for too long */
 MAX_LAGGING_PERIODS = 10,

 /*
  * If usage% * 1.25 + 2% is lower than hweight% by more than 3%,
  * donate the surplus.
  */
 SURPLUS_SCALE_PCT = 125, /* * 125% */
 SURPLUS_SCALE_ABS = HWEIGHT_WHOLE / 50, /* + 2% */
 SURPLUS_MIN_ADJ_DELTA = HWEIGHT_WHOLE / 33, /* 3% */

 /* switch iff the conditions are met for longer than this */
 AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC,

 /*
  * Count IO size in 4k pages. The 12bit shift helps keeping
  * size-proportional components of cost calculation in closer
  * numbers of digits to per-IO cost components.
  */
 IOC_PAGE_SHIFT = 12,
 IOC_PAGE_SIZE = 1 << IOC_PAGE_SHIFT,
 IOC_SECT_TO_PAGE_SHIFT = IOC_PAGE_SHIFT - SECTOR_SHIFT,

 /* if apart further than 16M, consider randio for the linear model */
 LCOEF_RANDIO_PAGES = 4096,
};
304
305enum ioc_running {
306 IOC_IDLE,
307 IOC_RUNNING,
308 IOC_STOP,
309};
310
/* io.cost.qos controls, including per-device enable of the whole controller */
312enum {
313 QOS_ENABLE,
314 QOS_CTRL,
315 NR_QOS_CTRL_PARAMS,
316};
317
/* io.cost.qos params */
319enum {
320 QOS_RPPM,
321 QOS_RLAT,
322 QOS_WPPM,
323 QOS_WLAT,
324 QOS_MIN,
325 QOS_MAX,
326 NR_QOS_PARAMS,
327};
328
/* io.cost.model controls */
330enum {
331 COST_CTRL,
332 COST_MODEL,
333 NR_COST_CTRL_PARAMS,
334};
335
/* builtin linear cost model coefficients */
337enum {
338 I_LCOEF_RBPS,
339 I_LCOEF_RSEQIOPS,
340 I_LCOEF_RRANDIOPS,
341 I_LCOEF_WBPS,
342 I_LCOEF_WSEQIOPS,
343 I_LCOEF_WRANDIOPS,
344 NR_I_LCOEFS,
345};
346
347enum {
348 LCOEF_RPAGE,
349 LCOEF_RSEQIO,
350 LCOEF_RRANDIO,
351 LCOEF_WPAGE,
352 LCOEF_WSEQIO,
353 LCOEF_WRANDIO,
354 NR_LCOEFS,
355};
356
357enum {
358 AUTOP_INVALID,
359 AUTOP_HDD,
360 AUTOP_SSD_QD1,
361 AUTOP_SSD_DFL,
362 AUTOP_SSD_FAST,
363};
364
365struct ioc_gq;
366
367struct ioc_params {
368 u32 qos[NR_QOS_PARAMS];
369 u64 i_lcoefs[NR_I_LCOEFS];
370 u64 lcoefs[NR_LCOEFS];
371 u32 too_fast_vrate_pct;
372 u32 too_slow_vrate_pct;
373};
374
375struct ioc_missed {
376 u32 nr_met;
377 u32 nr_missed;
378 u32 last_met;
379 u32 last_missed;
380};
381
382struct ioc_pcpu_stat {
383 struct ioc_missed missed[2];
384
385 u64 rq_wait_ns;
386 u64 last_rq_wait_ns;
387};
388
/* per device */
390struct ioc {
391 struct rq_qos rqos;
392
393 bool enabled;
394
395 struct ioc_params params;
396 u32 period_us;
397 u32 margin_us;
398 u64 vrate_min;
399 u64 vrate_max;
400
401 spinlock_t lock;
402 struct timer_list timer;
403 struct list_head active_iocgs;
404 struct ioc_pcpu_stat __percpu *pcpu_stat;
405
406 enum ioc_running running;
407 atomic64_t vtime_rate;
408
409 seqcount_t period_seqcount;
410 u32 period_at;
411 u64 period_at_vtime;
412
413 atomic64_t cur_period;
414 int busy_level;
415
416 u64 inuse_margin_vtime;
417 bool weights_updated;
418 atomic_t hweight_gen;
419
420 u64 autop_too_fast_at;
421 u64 autop_too_slow_at;
422 int autop_idx;
423 bool user_qos_params:1;
424 bool user_cost_model:1;
425};
426
/* per device-cgroup pair */
struct ioc_gq {
 struct blkg_policy_data pd;
 struct ioc *ioc;

 /*
  * An iocg can get its weight from two sources - an explicit
  * per-device-cgroup configuration or the default weight of the
  * cgroup. `cfg_weight` is the explicit per-device-cgroup
  * configuration. `weight` is the effective weight considering both
  * sources.
  *
  * When an idle cgroup becomes active its `active` goes from 0 to
  * `weight`. `inuse` is the surplus adjusted active weight.
  * `active` and `inuse` are used to calculate the per-device
  * hweight_active and hweight_inuse.
  *
  * `last_inuse` remembers `inuse` while an iocg is idle to persist
  * surplus adjustments.
  */
 u32 cfg_weight;
 u32 weight;
 u32 active;
 u32 inuse;
 u32 last_inuse;

 sector_t cursor; /* to detect randio */

 /*
  * `vtime` is this iocg's vtime cursor which progresses as IOs are
  * issued. If lagging behind device vtime, the delta represents
  * the currently available IO budget. If running ahead, the
  * overage.
  *
  * `done_vtime` is the same but progressed on completion rather
  * than issue. The delta behind `vtime` represents the cost of
  * currently in-flight IOs. `abs_vdebt` is the absolute cost that
  * has been recorded as debt instead of advancing `vtime`.
  *
  * `last_vtime` remembers `vtime` at the end of the last period
  * to calculate utilization.
  */
 atomic64_t vtime;
 atomic64_t done_vtime;
 u64 abs_vdebt;
 u64 last_vtime;

 /*
  * The period this iocg was last active in. Used for deactivation
  * and invalidating `vtime`.
  */
 atomic64_t active_period;
 struct list_head active_list;

 /* see __propagate_active_weight() and current_hweight() for details */
 u64 child_active_sum;
 u64 child_inuse_sum;
 int hweight_gen;
 u32 hweight_active;
 u32 hweight_inuse;
 bool has_surplus;

 struct wait_queue_head waitq;
 struct hrtimer waitq_timer;
 struct hrtimer delay_timer;

 /* usages are recorded as fractions of HWEIGHT_WHOLE */
 int usage_idx;
 u32 usages[NR_USAGE_SLOTS];

 /* this iocg's depth in the hierarchy and ancestors including self */
 int level;
 struct ioc_gq *ancestors[];
};
500
/* per cgroup */
502struct ioc_cgrp {
503 struct blkcg_policy_data cpd;
504 unsigned int dfl_weight;
505};
506
507struct ioc_now {
508 u64 now_ns;
509 u32 now;
510 u64 vnow;
511 u64 vrate;
512};
513
514struct iocg_wait {
515 struct wait_queue_entry wait;
516 struct bio *bio;
517 u64 abs_cost;
518 bool committed;
519};
520
521struct iocg_wake_ctx {
522 struct ioc_gq *iocg;
523 u32 hw_inuse;
524 s64 vbudget;
525};
526
527static const struct ioc_params autop[] = {
528 [AUTOP_HDD] = {
529 .qos = {
530 [QOS_RLAT] = 250000,
531 [QOS_WLAT] = 250000,
532 [QOS_MIN] = VRATE_MIN_PPM,
533 [QOS_MAX] = VRATE_MAX_PPM,
534 },
535 .i_lcoefs = {
536 [I_LCOEF_RBPS] = 174019176,
537 [I_LCOEF_RSEQIOPS] = 41708,
538 [I_LCOEF_RRANDIOPS] = 370,
539 [I_LCOEF_WBPS] = 178075866,
540 [I_LCOEF_WSEQIOPS] = 42705,
541 [I_LCOEF_WRANDIOPS] = 378,
542 },
543 },
544 [AUTOP_SSD_QD1] = {
545 .qos = {
546 [QOS_RLAT] = 25000,
547 [QOS_WLAT] = 25000,
548 [QOS_MIN] = VRATE_MIN_PPM,
549 [QOS_MAX] = VRATE_MAX_PPM,
550 },
551 .i_lcoefs = {
552 [I_LCOEF_RBPS] = 245855193,
553 [I_LCOEF_RSEQIOPS] = 61575,
554 [I_LCOEF_RRANDIOPS] = 6946,
555 [I_LCOEF_WBPS] = 141365009,
556 [I_LCOEF_WSEQIOPS] = 33716,
557 [I_LCOEF_WRANDIOPS] = 26796,
558 },
559 },
560 [AUTOP_SSD_DFL] = {
561 .qos = {
562 [QOS_RLAT] = 25000,
563 [QOS_WLAT] = 25000,
564 [QOS_MIN] = VRATE_MIN_PPM,
565 [QOS_MAX] = VRATE_MAX_PPM,
566 },
567 .i_lcoefs = {
568 [I_LCOEF_RBPS] = 488636629,
569 [I_LCOEF_RSEQIOPS] = 8932,
570 [I_LCOEF_RRANDIOPS] = 8518,
571 [I_LCOEF_WBPS] = 427891549,
572 [I_LCOEF_WSEQIOPS] = 28755,
573 [I_LCOEF_WRANDIOPS] = 21940,
574 },
575 .too_fast_vrate_pct = 500,
576 },
577 [AUTOP_SSD_FAST] = {
578 .qos = {
579 [QOS_RLAT] = 5000,
580 [QOS_WLAT] = 5000,
581 [QOS_MIN] = VRATE_MIN_PPM,
582 [QOS_MAX] = VRATE_MAX_PPM,
583 },
584 .i_lcoefs = {
585 [I_LCOEF_RBPS] = 3102524156LLU,
586 [I_LCOEF_RSEQIOPS] = 724816,
587 [I_LCOEF_RRANDIOPS] = 778122,
588 [I_LCOEF_WBPS] = 1742780862LLU,
589 [I_LCOEF_WSEQIOPS] = 425702,
590 [I_LCOEF_WRANDIOPS] = 443193,
591 },
592 .too_slow_vrate_pct = 10,
593 },
594};
595
/*
 * vrate adjustment percentages indexed by abs(ioc->busy_level). The farther
 * busy_level strays from zero, the larger the per-period adjustment applied
 * to vtime_rate - down when the device is too busy, up when it's
 * under-utilized.
 */
600static u32 vrate_adj_pct[] =
601 { 0, 0, 0, 0,
602 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
603 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
604 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
605
606static struct blkcg_policy blkcg_policy_iocost;
607
608
609static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
610{
611 return container_of(rqos, struct ioc, rqos);
612}
613
614static struct ioc *q_to_ioc(struct request_queue *q)
615{
616 return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
617}
618
619static const char *q_name(struct request_queue *q)
620{
621 if (test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
622 return kobject_name(q->kobj.parent);
623 else
624 return "<unknown>";
625}
626
627static const char __maybe_unused *ioc_name(struct ioc *ioc)
628{
629 return q_name(ioc->rqos.q);
630}
631
632static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
633{
634 return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
635}
636
637static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
638{
639 return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
640}
641
642static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
643{
644 return pd_to_blkg(&iocg->pd);
645}
646
647static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
648{
649 return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
650 struct ioc_cgrp, cpd);
651}
652

/*
 * Scale @abs_cost by the inverse of the cgroup's hierarchical in-use weight.
 * The lower the hierarchical weight, the more expensive each IO is in device
 * vtime. Rounded up.
 */
657static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
658{
659 return DIV64_U64_ROUND_UP(abs_cost * HWEIGHT_WHOLE, hw_inuse);
660}
661

/*
 * The inverse of abs_cost_to_cost(): convert vtime cost @cost back to the
 * device-neutral absolute cost according to @hw_inuse.
 */
665static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
666{
667 return DIV64_U64_ROUND_UP(cost * hw_inuse, HWEIGHT_WHOLE);
668}
669
670static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost)
671{
672 bio->bi_iocost_cost = cost;
673 atomic64_add(cost, &iocg->vtime);
674}
675
676#define CREATE_TRACE_POINTS
677#include <trace/events/iocost.h>

/* latency QoS params changed - recompute period_us and the dependent params */
680static void ioc_refresh_period_us(struct ioc *ioc)
681{
682 u32 ppm, lat, multi, period_us;
683
684 lockdep_assert_held(&ioc->lock);
685
686
687 if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
688 ppm = ioc->params.qos[QOS_RPPM];
689 lat = ioc->params.qos[QOS_RLAT];
690 } else {
691 ppm = ioc->params.qos[QOS_WPPM];
692 lat = ioc->params.qos[QOS_WLAT];
693 }

 /*
  * We want the period to be long enough to contain a healthy number
  * of IOs while short enough for granular control. Define it as a
  * multiple of the latency target. Ideally, the multiplier should
  * be scaled according to the percentile so that it would nominally
  * contain a certain number of requests. Let's be simpler and
  * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50).
  */
703 if (ppm)
704 multi = max_t(u32, (MILLION - ppm) / 50000, 2);
705 else
706 multi = 2;
707 period_us = multi * lat;
708 period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
709
710
711 ioc->period_us = period_us;
712 ioc->margin_us = period_us * MARGIN_PCT / 100;
713 ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
714 period_us * VTIME_PER_USEC * INUSE_MARGIN_PCT, 100);
715}
716
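/*
 * Pick which autop parameter set to use for the device. Rotational devices
 * use AUTOP_HDD and queue-depth-1 SSDs AUTOP_SSD_QD1. Otherwise, start from
 * AUTOP_SSD_DFL and, unless the user has overridden the QoS params or the
 * cost model, step up or down a set when the observed vrate stays above
 * too_fast_vrate_pct or below too_slow_vrate_pct for AUTOP_CYCLE_NSEC.
 */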
717static int ioc_autop_idx(struct ioc *ioc)
718{
719 int idx = ioc->autop_idx;
720 const struct ioc_params *p = &autop[idx];
721 u32 vrate_pct;
722 u64 now_ns;
723
724
725 if (!blk_queue_nonrot(ioc->rqos.q))
726 return AUTOP_HDD;
727
728
729 if (blk_queue_depth(ioc->rqos.q) == 1)
730 return AUTOP_SSD_QD1;
731
732
733 if (idx < AUTOP_SSD_DFL)
734 return AUTOP_SSD_DFL;
735
736
737 if (ioc->user_qos_params || ioc->user_cost_model)
738 return idx;
739
740
741 vrate_pct = div64_u64(atomic64_read(&ioc->vtime_rate) * 100,
742 VTIME_PER_USEC);
743 now_ns = ktime_get_ns();
744
745 if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
746 if (!ioc->autop_too_fast_at)
747 ioc->autop_too_fast_at = now_ns;
748 if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
749 return idx + 1;
750 } else {
751 ioc->autop_too_fast_at = 0;
752 }
753
754 if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
755 if (!ioc->autop_too_slow_at)
756 ioc->autop_too_slow_at = now_ns;
757 if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
758 return idx - 1;
759 } else {
760 ioc->autop_too_slow_at = 0;
761 }
762
763 return idx;
764}
765
/*
 * Takes the following as input:
 *
 *  @bps	maximum sequential throughput
 *  @seqiops	maximum sequential 4k iops
 *  @randiops	maximum random 4k iops
 *
 * and calculates the linear model cost coefficients:
 *
 *  *@page	per-page cost		1s / (@bps / 4096)
 *  *@seqio	base cost of a seq IO	max((1s / @seqiops) - *@page, 0)
 *  *@randio	base cost of a randio	max((1s / @randiops) - *@page, 0)
 */
779static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
780 u64 *page, u64 *seqio, u64 *randio)
781{
782 u64 v;
783
784 *page = *seqio = *randio = 0;
785
786 if (bps)
787 *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC,
788 DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE));
789
790 if (seqiops) {
791 v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
792 if (v > *page)
793 *seqio = v - *page;
794 }
795
796 if (randiops) {
797 v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
798 if (v > *page)
799 *randio = v - *page;
800 }
801}
802
803static void ioc_refresh_lcoefs(struct ioc *ioc)
804{
805 u64 *u = ioc->params.i_lcoefs;
806 u64 *c = ioc->params.lcoefs;
807
808 calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
809 &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
810 calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
811 &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
812}
813
814static bool ioc_refresh_params(struct ioc *ioc, bool force)
815{
816 const struct ioc_params *p;
817 int idx;
818
819 lockdep_assert_held(&ioc->lock);
820
821 idx = ioc_autop_idx(ioc);
822 p = &autop[idx];
823
824 if (idx == ioc->autop_idx && !force)
825 return false;
826
827 if (idx != ioc->autop_idx)
828 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
829
830 ioc->autop_idx = idx;
831 ioc->autop_too_fast_at = 0;
832 ioc->autop_too_slow_at = 0;
833
834 if (!ioc->user_qos_params)
835 memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
836 if (!ioc->user_cost_model)
837 memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
838
839 ioc_refresh_period_us(ioc);
840 ioc_refresh_lcoefs(ioc);
841
842 ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
843 VTIME_PER_USEC, MILLION);
844 ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] *
845 VTIME_PER_USEC, MILLION);
846
847 return true;
848}
849
850
851static void ioc_now(struct ioc *ioc, struct ioc_now *now)
852{
853 unsigned seq;
854
855 now->now_ns = ktime_get();
856 now->now = ktime_to_us(now->now_ns);
857 now->vrate = atomic64_read(&ioc->vtime_rate);

 /*
  * The current vtime is
  *
  *   vtime at period start + (wallclock time since the start) * vrate
  *
  * As a consistent snapshot of `period_at_vtime` and `period_at` is
  * needed, they're seqcount protected.
  */
867 do {
868 seq = read_seqcount_begin(&ioc->period_seqcount);
869 now->vnow = ioc->period_at_vtime +
870 (now->now - ioc->period_at) * now->vrate;
871 } while (read_seqcount_retry(&ioc->period_seqcount, seq));
872}
873
874static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
875{
876 lockdep_assert_held(&ioc->lock);
877 WARN_ON_ONCE(ioc->running != IOC_RUNNING);
878
879 write_seqcount_begin(&ioc->period_seqcount);
880 ioc->period_at = now->now;
881 ioc->period_at_vtime = now->vnow;
882 write_seqcount_end(&ioc->period_seqcount);
883
884 ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
885 add_timer(&ioc->timer);
886}
887
/*
 * Update @iocg's `active` and `inuse` to @active and @inuse, update the
 * per-level weight sums and propagate upwards accordingly.
 */
892static void __propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
893{
894 struct ioc *ioc = iocg->ioc;
895 int lvl;
896
897 lockdep_assert_held(&ioc->lock);
898
899 inuse = min(active, inuse);
900
901 for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
902 struct ioc_gq *parent = iocg->ancestors[lvl];
903 struct ioc_gq *child = iocg->ancestors[lvl + 1];
904 u32 parent_active = 0, parent_inuse = 0;
905
906
907 parent->child_active_sum += (s32)(active - child->active);
908 parent->child_inuse_sum += (s32)(inuse - child->inuse);
909
910 child->active = active;
911 child->inuse = inuse;
912
 /*
  * The delta between the inuse and active sums indicates how much of
  * the weight is being given away. The parent's inuse and active
  * should reflect the same ratio.
  */
918 if (parent->child_active_sum) {
919 parent_active = parent->weight;
920 parent_inuse = DIV64_U64_ROUND_UP(
921 parent_active * parent->child_inuse_sum,
922 parent->child_active_sum);
923 }
924
925
926 if (parent_active == parent->active &&
927 parent_inuse == parent->inuse)
928 break;
929
930 active = parent_active;
931 inuse = parent_inuse;
932 }
933
934 ioc->weights_updated = true;
935}
936
937static void commit_active_weights(struct ioc *ioc)
938{
939 lockdep_assert_held(&ioc->lock);
940
941 if (ioc->weights_updated) {
942
943 smp_wmb();
944 atomic_inc(&ioc->hweight_gen);
945 ioc->weights_updated = false;
946 }
947}
948
949static void propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
950{
951 __propagate_active_weight(iocg, active, inuse);
952 commit_active_weights(iocg->ioc);
953}
954
955static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
956{
957 struct ioc *ioc = iocg->ioc;
958 int lvl;
959 u32 hwa, hwi;
960 int ioc_gen;
961
962
963 ioc_gen = atomic_read(&ioc->hweight_gen);
964 if (ioc_gen == iocg->hweight_gen)
965 goto out;

 /*
  * Paired with the wmb in commit_active_weights(). If we saw the
  * updated hweight_gen, all the weight updates from
  * __propagate_active_weight() are visible.
  *
  * We can race with weight updates during calculation and get it
  * wrong. However, hweight_gen would have changed and a future
  * refresh will correct it.
  */
977 smp_rmb();
978
979 hwa = hwi = HWEIGHT_WHOLE;
980 for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
981 struct ioc_gq *parent = iocg->ancestors[lvl];
982 struct ioc_gq *child = iocg->ancestors[lvl + 1];
983 u32 active_sum = READ_ONCE(parent->child_active_sum);
984 u32 inuse_sum = READ_ONCE(parent->child_inuse_sum);
985 u32 active = READ_ONCE(child->active);
986 u32 inuse = READ_ONCE(child->inuse);
987
988
989 if (!active_sum || !inuse_sum)
990 continue;
991
992 active_sum = max(active, active_sum);
993 hwa = hwa * active / active_sum;
994
995 inuse_sum = max(inuse, inuse_sum);
996 hwi = hwi * inuse / inuse_sum;
997 }
998
999 iocg->hweight_active = max_t(u32, hwa, 1);
1000 iocg->hweight_inuse = max_t(u32, hwi, 1);
1001 iocg->hweight_gen = ioc_gen;
1002out:
1003 if (hw_activep)
1004 *hw_activep = iocg->hweight_active;
1005 if (hw_inusep)
1006 *hw_inusep = iocg->hweight_inuse;
1007}
1008
1009static void weight_updated(struct ioc_gq *iocg)
1010{
1011 struct ioc *ioc = iocg->ioc;
1012 struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1013 struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
1014 u32 weight;
1015
1016 lockdep_assert_held(&ioc->lock);
1017
1018 weight = iocg->cfg_weight ?: iocc->dfl_weight;
1019 if (weight != iocg->weight && iocg->active)
1020 propagate_active_weight(iocg, weight,
1021 DIV64_U64_ROUND_UP(iocg->inuse * weight, iocg->weight));
1022 iocg->weight = weight;
1023}
1024
1025static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
1026{
1027 struct ioc *ioc = iocg->ioc;
1028 u64 last_period, cur_period, max_period_delta;
1029 u64 vtime, vmargin, vmin;
1030 int i;

 /*
  * If the iocg already seems to be active, just update the period
  * stamp to tell the timer that we're still active. Occasional races
  * with deactivation are fine.
  */
1036 if (!list_empty(&iocg->active_list)) {
1037 ioc_now(ioc, now);
1038 cur_period = atomic64_read(&ioc->cur_period);
1039 if (atomic64_read(&iocg->active_period) != cur_period)
1040 atomic64_set(&iocg->active_period, cur_period);
1041 return true;
1042 }
1043
1044
1045 if (iocg->child_active_sum)
1046 return false;
1047
1048 spin_lock_irq(&ioc->lock);
1049
1050 ioc_now(ioc, now);
1051
1052
1053 cur_period = atomic64_read(&ioc->cur_period);
1054 last_period = atomic64_read(&iocg->active_period);
1055 atomic64_set(&iocg->active_period, cur_period);
1056
1057
1058 if (!list_empty(&iocg->active_list))
1059 goto succeed_unlock;
1060 for (i = iocg->level - 1; i > 0; i--)
1061 if (!list_empty(&iocg->ancestors[i]->active_list))
1062 goto fail_unlock;
1063
1064 if (iocg->child_active_sum)
1065 goto fail_unlock;

 /*
  * Always start with a fresh budget. If the iocg hasn't been active
  * for longer than VTIME_VALID_DUR or its vtime has fallen further
  * than the allowed margin behind the device, clamp vtime (and
  * done_vtime) up to vnow - margin so that it can't draw on an
  * unbounded amount of saved-up budget.
  */
1073 max_period_delta = DIV64_U64_ROUND_UP(VTIME_VALID_DUR, ioc->period_us);
1074 vtime = atomic64_read(&iocg->vtime);
1075 vmargin = ioc->margin_us * now->vrate;
1076 vmin = now->vnow - vmargin;
1077
1078 if (last_period + max_period_delta < cur_period ||
1079 time_before64(vtime, vmin)) {
1080 atomic64_add(vmin - vtime, &iocg->vtime);
1081 atomic64_add(vmin - vtime, &iocg->done_vtime);
1082 vtime = vmin;
1083 }

 /*
  * Activate, propagate weight and start the period timer if not
  * running. Reset hweight_gen to avoid an accidental match from
  * wrapping.
  */
1090 iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
1091 list_add(&iocg->active_list, &ioc->active_iocgs);
1092 propagate_active_weight(iocg, iocg->weight,
1093 iocg->last_inuse ?: iocg->weight);
1094
1095 TRACE_IOCG_PATH(iocg_activate, iocg, now,
1096 last_period, cur_period, vtime);
1097
1098 iocg->last_vtime = vtime;
1099
1100 if (ioc->running == IOC_IDLE) {
1101 ioc->running = IOC_RUNNING;
1102 ioc_start_period(ioc, now);
1103 }
1104
1105succeed_unlock:
1106 spin_unlock_irq(&ioc->lock);
1107 return true;
1108
1109fail_unlock:
1110 spin_unlock_irq(&ioc->lock);
1111 return false;
1112}
1113
1114static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
1115 int flags, void *key)
1116{
1117 struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
1118 struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key;
1119 u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
1120
1121 ctx->vbudget -= cost;
1122
1123 if (ctx->vbudget < 0)
1124 return -1;
1125
1126 iocg_commit_bio(ctx->iocg, wait->bio, cost);

 /*
  * autoremove_wake_function() removes the wait entry only when it
  * actually changed the task state. We want the wait entry always
  * removed. Remove explicitly and use default_wake_function().
  */
1133 list_del_init(&wq_entry->entry);
1134 wait->committed = true;
1135
1136 default_wake_function(wq_entry, mode, flags, key);
1137 return 0;
1138}
1139
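/*
 * Called with iocg->waitq.lock held. Pays down accumulated abs_vdebt from
 * whatever vtime budget is currently available, wakes waiters in FIFO order
 * while the remaining budget covers their cost and, if waiters are still
 * left over, (re)arms waitq_timer to fire roughly when the shortfall should
 * have been repaid at the current vrate.
 */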
1140static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now)
1141{
1142 struct ioc *ioc = iocg->ioc;
1143 struct iocg_wake_ctx ctx = { .iocg = iocg };
1144 u64 margin_ns = (u64)(ioc->period_us *
1145 WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC;
1146 u64 vdebt, vshortage, expires, oexpires;
1147 s64 vbudget;
1148 u32 hw_inuse;
1149
1150 lockdep_assert_held(&iocg->waitq.lock);
1151
1152 current_hweight(iocg, NULL, &hw_inuse);
1153 vbudget = now->vnow - atomic64_read(&iocg->vtime);
1154
1155
1156 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
1157 if (vdebt && vbudget > 0) {
1158 u64 delta = min_t(u64, vbudget, vdebt);
1159 u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse),
1160 iocg->abs_vdebt);
1161
1162 atomic64_add(delta, &iocg->vtime);
1163 atomic64_add(delta, &iocg->done_vtime);
1164 iocg->abs_vdebt -= abs_delta;
1165 }
1166
1167
1168
1169
1170
1171 ctx.hw_inuse = hw_inuse;
1172 ctx.vbudget = vbudget - vdebt;
1173 __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
1174 if (!waitqueue_active(&iocg->waitq))
1175 return;
1176 if (WARN_ON_ONCE(ctx.vbudget >= 0))
1177 return;
1178
1179
1180 vshortage = -ctx.vbudget;
1181 expires = now->now_ns +
1182 DIV64_U64_ROUND_UP(vshortage, now->vrate) * NSEC_PER_USEC;
1183 expires += margin_ns / 4;
1184
1185
1186 oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
1187 if (hrtimer_is_queued(&iocg->waitq_timer) &&
1188 abs(oexpires - expires) <= margin_ns / 4)
1189 return;
1190
1191 hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
1192 margin_ns / 4, HRTIMER_MODE_ABS);
1193}
1194
1195static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
1196{
1197 struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
1198 struct ioc_now now;
1199 unsigned long flags;
1200
1201 ioc_now(iocg->ioc, &now);
1202
1203 spin_lock_irqsave(&iocg->waitq.lock, flags);
1204 iocg_kick_waitq(iocg, &now);
1205 spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1206
1207 return HRTIMER_NORESTART;
1208}
1209
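/*
 * Called with iocg->waitq.lock held. If the iocg is carrying debt and its
 * projected vtime (including the debt) runs ahead of the device vtime,
 * engage the blkcg use_delay mechanism to throttle the issuer
 * asynchronously and arm delay_timer to re-evaluate once the overage should
 * have drained. Returns %true while delaying is in effect.
 */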
1210static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now, u64 cost)
1211{
1212 struct ioc *ioc = iocg->ioc;
1213 struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1214 u64 vtime = atomic64_read(&iocg->vtime);
1215 u64 vmargin = ioc->margin_us * now->vrate;
1216 u64 margin_ns = ioc->margin_us * NSEC_PER_USEC;
1217 u64 expires, oexpires;
1218 u32 hw_inuse;
1219
1220 lockdep_assert_held(&iocg->waitq.lock);
1221
1222
1223 current_hweight(iocg, NULL, &hw_inuse);
1224 vtime += abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
1225
1226
1227
1228
1229
1230
1231 if (!iocg->abs_vdebt || time_before_eq64(vtime, now->vnow)) {
1232 blkcg_clear_delay(blkg);
1233 return false;
1234 }
1235 if (!atomic_read(&blkg->use_delay) &&
1236 time_before_eq64(vtime, now->vnow + vmargin))
1237 return false;
1238
1239
1240 if (cost) {
1241 u64 cost_ns = DIV64_U64_ROUND_UP(cost * NSEC_PER_USEC,
1242 now->vrate);
1243 blkcg_add_delay(blkg, now->now_ns, cost_ns);
1244 }
1245 blkcg_use_delay(blkg);
1246
1247 expires = now->now_ns + DIV64_U64_ROUND_UP(vtime - now->vnow,
1248 now->vrate) * NSEC_PER_USEC;
1249
1250
1251 oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer));
1252 if (hrtimer_is_queued(&iocg->delay_timer) &&
1253 abs(oexpires - expires) <= margin_ns / 4)
1254 return true;
1255
1256 hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires),
1257 margin_ns / 4, HRTIMER_MODE_ABS);
1258 return true;
1259}
1260
1261static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer)
1262{
1263 struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer);
1264 struct ioc_now now;
1265 unsigned long flags;
1266
1267 spin_lock_irqsave(&iocg->waitq.lock, flags);
1268 ioc_now(iocg->ioc, &now);
1269 iocg_kick_delay(iocg, &now, 0);
1270 spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1271
1272 return HRTIMER_NORESTART;
1273}
1274
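/*
 * Fold the per-cpu completion stats accumulated since the last period into
 * per-rw missed ratios (in parts per million) and the fraction of the
 * period spent waiting for request allocation (in percent).
 */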
1275static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
1276{
1277 u32 nr_met[2] = { };
1278 u32 nr_missed[2] = { };
1279 u64 rq_wait_ns = 0;
1280 int cpu, rw;
1281
1282 for_each_online_cpu(cpu) {
1283 struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
1284 u64 this_rq_wait_ns;
1285
1286 for (rw = READ; rw <= WRITE; rw++) {
1287 u32 this_met = READ_ONCE(stat->missed[rw].nr_met);
1288 u32 this_missed = READ_ONCE(stat->missed[rw].nr_missed);
1289
1290 nr_met[rw] += this_met - stat->missed[rw].last_met;
1291 nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
1292 stat->missed[rw].last_met = this_met;
1293 stat->missed[rw].last_missed = this_missed;
1294 }
1295
1296 this_rq_wait_ns = READ_ONCE(stat->rq_wait_ns);
1297 rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
1298 stat->last_rq_wait_ns = this_rq_wait_ns;
1299 }
1300
1301 for (rw = READ; rw <= WRITE; rw++) {
1302 if (nr_met[rw] + nr_missed[rw])
1303 missed_ppm_ar[rw] =
1304 DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
1305 nr_met[rw] + nr_missed[rw]);
1306 else
1307 missed_ppm_ar[rw] = 0;
1308 }
1309
1310 *rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
1311 ioc->period_us * NSEC_PER_USEC);
1312}

/* was iocg idle this period? */
1315static bool iocg_is_idle(struct ioc_gq *iocg)
1316{
1317 struct ioc *ioc = iocg->ioc;
1318
1319
1320 if (atomic64_read(&iocg->active_period) ==
1321 atomic64_read(&ioc->cur_period))
1322 return false;
1323
1324
1325 if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime))
1326 return false;
1327
1328 return true;
1329}

/* returns usage with margin added if surplus is large enough, 0 otherwise */
1332static u32 surplus_adjusted_hweight_inuse(u32 usage, u32 hw_inuse)
1333{
1334
1335 usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100);
1336 usage += SURPLUS_SCALE_ABS;
1337
1338
1339 if (usage + SURPLUS_MIN_ADJ_DELTA > hw_inuse)
1340 return 0;
1341
1342 return usage;
1343}
1344
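/*
 * The period timer. Runs once per period while any iocg is active: expires
 * idle iocgs and kicks tardy waiters, computes each active iocg's usage,
 * redistributes in-use weight from surplus iocgs to those running at their
 * limit, and nudges the global vtime_rate up or down based on how busy the
 * device looked (missed latency targets, request wait time, shortages vs.
 * surpluses).
 */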
1345static void ioc_timer_fn(struct timer_list *timer)
1346{
1347 struct ioc *ioc = container_of(timer, struct ioc, timer);
1348 struct ioc_gq *iocg, *tiocg;
1349 struct ioc_now now;
1350 int nr_surpluses = 0, nr_shortages = 0, nr_lagging = 0;
1351 u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
1352 u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
1353 u32 missed_ppm[2], rq_wait_pct;
1354 u64 period_vtime;
1355 int prev_busy_level, i;
1356
1357
1358 ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
1359
1360
1361 spin_lock_irq(&ioc->lock);
1362
1363 ioc_now(ioc, &now);
1364
1365 period_vtime = now.vnow - ioc->period_at_vtime;
1366 if (WARN_ON_ONCE(!period_vtime)) {
1367 spin_unlock_irq(&ioc->lock);
1368 return;
1369 }

 /*
  * Waiters determine their sleep durations based on the vrate they
  * saw at the time of sleep. If vrate has increased, some waiters
  * could be sleeping for too long. Wake up tardy waiters which
  * should have woken up in the last period and expire idle iocgs.
  */
1377 list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
1378 if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
1379 !iocg_is_idle(iocg))
1380 continue;
1381
1382 spin_lock(&iocg->waitq.lock);
1383
1384 if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) {
1385
1386 iocg_kick_waitq(iocg, &now);
1387 iocg_kick_delay(iocg, &now, 0);
1388 } else if (iocg_is_idle(iocg)) {
1389
1390 iocg->last_inuse = iocg->inuse;
1391 __propagate_active_weight(iocg, 0, 0);
1392 list_del_init(&iocg->active_list);
1393 }
1394
1395 spin_unlock(&iocg->waitq.lock);
1396 }
1397 commit_active_weights(ioc);
1398
1399
1400 list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1401 u64 vdone, vtime, vusage, vmargin, vmin;
1402 u32 hw_active, hw_inuse, usage;
1403
1404
1405
1406
1407
1408 vdone = atomic64_read(&iocg->done_vtime);
1409 vtime = atomic64_read(&iocg->vtime);
1410 current_hweight(iocg, &hw_active, &hw_inuse);

 /*
  * An iocg is "lagging" when the device hasn't finished what it
  * already issued: vtime is ahead of vdone and vdone is more than a
  * full period old (but not excessively stale) while no delay is in
  * effect. Only tracked while latency percentile targets are
  * configured.
  */
1418 if ((ppm_rthr != MILLION || ppm_wthr != MILLION) &&
1419 !atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
1420 time_after64(vtime, vdone) &&
1421 time_after64(vtime, now.vnow -
1422 MAX_LAGGING_PERIODS * period_vtime) &&
1423 time_before64(vdone, now.vnow - period_vtime))
1424 nr_lagging++;
1425
1426 if (waitqueue_active(&iocg->waitq))
1427 vusage = now.vnow - iocg->last_vtime;
1428 else if (time_before64(iocg->last_vtime, vtime))
1429 vusage = vtime - iocg->last_vtime;
1430 else
1431 vusage = 0;
1432
1433 iocg->last_vtime += vusage;
1434
1435
1436
1437
1438
1439 vusage = max(vusage, vtime - vdone);
1440
1441
1442 if (vusage) {
1443 usage = DIV64_U64_ROUND_UP(vusage * hw_inuse,
1444 period_vtime);
1445 iocg->usage_idx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS;
1446 iocg->usages[iocg->usage_idx] = usage;
1447 } else {
1448 usage = 0;
1449 }
1450
1451
1452 vmargin = ioc->margin_us * now.vrate;
1453 vmin = now.vnow - vmargin;
1454
1455 iocg->has_surplus = false;
1456
1457 if (!waitqueue_active(&iocg->waitq) &&
1458 time_before64(vtime, vmin)) {
1459 u64 delta = vmin - vtime;
1460
1461
1462 atomic64_add(delta, &iocg->vtime);
1463 atomic64_add(delta, &iocg->done_vtime);
1464 iocg->last_vtime += delta;
1465
1466 if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) {
1467 iocg->has_surplus = true;
1468 nr_surpluses++;
1469 }
1470 } else if (hw_inuse < hw_active) {
1471 u32 new_hwi, new_inuse;
1472
1473
1474 if (waitqueue_active(&iocg->waitq)) {
1475 new_hwi = hw_active;
1476 } else {
1477 new_hwi = max(hw_inuse,
1478 usage * SURPLUS_SCALE_PCT / 100 +
1479 SURPLUS_SCALE_ABS);
1480 }
1481
1482 new_inuse = div64_u64((u64)iocg->inuse * new_hwi,
1483 hw_inuse);
1484 new_inuse = clamp_t(u32, new_inuse, 1, iocg->active);
1485
1486 if (new_inuse > iocg->inuse) {
1487 TRACE_IOCG_PATH(inuse_takeback, iocg, &now,
1488 iocg->inuse, new_inuse,
1489 hw_inuse, new_hwi);
1490 __propagate_active_weight(iocg, iocg->weight,
1491 new_inuse);
1492 }
1493 } else {
1494
1495 nr_shortages++;
1496 }
1497 }
1498
1499 if (!nr_shortages || !nr_surpluses)
1500 goto skip_surplus_transfers;
1501
1502
1503 list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1504 u32 usage, hw_active, hw_inuse, new_hwi, new_inuse;
1505 int nr_valid = 0;
1506
1507 if (!iocg->has_surplus)
1508 continue;
1509
1510
1511 for (i = 0, usage = 0; i < NR_USAGE_SLOTS; i++) {
1512 if (iocg->usages[i]) {
1513 usage = max(usage, iocg->usages[i]);
1514 nr_valid++;
1515 }
1516 }
1517 if (nr_valid < MIN_VALID_USAGES)
1518 continue;
1519
1520 current_hweight(iocg, &hw_active, &hw_inuse);
1521 new_hwi = surplus_adjusted_hweight_inuse(usage, hw_inuse);
1522 if (!new_hwi)
1523 continue;
1524
1525 new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi,
1526 hw_inuse);
1527 if (new_inuse < iocg->inuse) {
1528 TRACE_IOCG_PATH(inuse_giveaway, iocg, &now,
1529 iocg->inuse, new_inuse,
1530 hw_inuse, new_hwi);
1531 __propagate_active_weight(iocg, iocg->weight, new_inuse);
1532 }
1533 }
1534skip_surplus_transfers:
1535 commit_active_weights(ioc);

 /*
  * If the request queue is getting clogged or we're missing too many
  * latency targets, we're issuing too much IO and should lower the
  * vtime rate. If we're not missing anything and there are shortages
  * but no surpluses, we're being too stingy and should raise it.
  */
1543 prev_busy_level = ioc->busy_level;
1544 if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
1545 missed_ppm[READ] > ppm_rthr ||
1546 missed_ppm[WRITE] > ppm_wthr) {
1547
1548 ioc->busy_level = max(ioc->busy_level, 0);
1549 ioc->busy_level++;
1550 } else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
1551 missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
1552 missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
1553
1554 if (nr_shortages) {
1555
1556
1557
1558
1559 ioc->busy_level = min(ioc->busy_level, 0);
1560
1561
1562
1563
1564
1565
1566
1567 if (!nr_lagging && !nr_surpluses)
1568 ioc->busy_level--;
1569 } else {
1570
1571
1572
1573
1574
1575
1576 ioc->busy_level = 0;
1577 }
1578 } else {
1579
1580 ioc->busy_level = 0;
1581 }
1582
1583 ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
1584
1585 if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) {
1586 u64 vrate = atomic64_read(&ioc->vtime_rate);
1587 u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
1588
1589
1590 if (rq_wait_pct > RQ_WAIT_BUSY_PCT)
1591 vrate_min = VRATE_MIN;
1592
1593
1594
1595
1596
1597
1598 if (vrate < vrate_min) {
1599 vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT),
1600 100);
1601 vrate = min(vrate, vrate_min);
1602 } else if (vrate > vrate_max) {
1603 vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT),
1604 100);
1605 vrate = max(vrate, vrate_max);
1606 } else {
1607 int idx = min_t(int, abs(ioc->busy_level),
1608 ARRAY_SIZE(vrate_adj_pct) - 1);
1609 u32 adj_pct = vrate_adj_pct[idx];
1610
1611 if (ioc->busy_level > 0)
1612 adj_pct = 100 - adj_pct;
1613 else
1614 adj_pct = 100 + adj_pct;
1615
1616 vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
1617 vrate_min, vrate_max);
1618 }
1619
1620 trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
1621 nr_lagging, nr_shortages,
1622 nr_surpluses);
1623
1624 atomic64_set(&ioc->vtime_rate, vrate);
1625 ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
1626 ioc->period_us * vrate * INUSE_MARGIN_PCT, 100);
1627 } else if (ioc->busy_level != prev_busy_level || nr_lagging) {
1628 trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
1629 missed_ppm, rq_wait_pct, nr_lagging,
1630 nr_shortages, nr_surpluses);
1631 }
1632
1633 ioc_refresh_params(ioc, false);
1634
1635
1636
1637
1638
1639 atomic64_inc(&ioc->cur_period);
1640
1641 if (ioc->running != IOC_STOP) {
1642 if (!list_empty(&ioc->active_iocgs)) {
1643 ioc_start_period(ioc, &now);
1644 } else {
1645 ioc->busy_level = 0;
1646 ioc->running = IOC_IDLE;
1647 }
1648 }
1649
1650 spin_unlock_irq(&ioc->lock);
1651}
1652
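/*
 * Builtin linear cost model. The absolute cost of a bio is
 *
 *   cost = (seek distance > 16M ? randio coef : seqio coef)
 *          + nr_4k_pages * per-page coef
 *
 * using the read or write coefficient set depending on the direction. The
 * per-IO term is skipped for merges since the base IO already paid it.
 */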
1653static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
1654 bool is_merge, u64 *costp)
1655{
1656 struct ioc *ioc = iocg->ioc;
1657 u64 coef_seqio, coef_randio, coef_page;
1658 u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
1659 u64 seek_pages = 0;
1660 u64 cost = 0;
1661
1662 switch (bio_op(bio)) {
1663 case REQ_OP_READ:
1664 coef_seqio = ioc->params.lcoefs[LCOEF_RSEQIO];
1665 coef_randio = ioc->params.lcoefs[LCOEF_RRANDIO];
1666 coef_page = ioc->params.lcoefs[LCOEF_RPAGE];
1667 break;
1668 case REQ_OP_WRITE:
1669 coef_seqio = ioc->params.lcoefs[LCOEF_WSEQIO];
1670 coef_randio = ioc->params.lcoefs[LCOEF_WRANDIO];
1671 coef_page = ioc->params.lcoefs[LCOEF_WPAGE];
1672 break;
1673 default:
1674 goto out;
1675 }
1676
1677 if (iocg->cursor) {
1678 seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
1679 seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
1680 }
1681
1682 if (!is_merge) {
1683 if (seek_pages > LCOEF_RANDIO_PAGES) {
1684 cost += coef_randio;
1685 } else {
1686 cost += coef_seqio;
1687 }
1688 }
1689 cost += pages * coef_page;
1690out:
1691 *costp = cost;
1692}
1693
1694static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
1695{
1696 u64 cost;
1697
1698 calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
1699 return cost;
1700}
1701
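/*
 * Main issue-side hook. Activates the iocg if needed, charges the bio's
 * cost against the cgroup's vtime budget and, when the budget is exhausted,
 * either records the cost as debt (for bios that can't block) or puts the
 * issuer to sleep on iocg->waitq until enough budget has been restored.
 */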
1702static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
1703{
1704 struct blkcg_gq *blkg = bio->bi_blkg;
1705 struct ioc *ioc = rqos_to_ioc(rqos);
1706 struct ioc_gq *iocg = blkg_to_iocg(blkg);
1707 struct ioc_now now;
1708 struct iocg_wait wait;
1709 u32 hw_active, hw_inuse;
1710 u64 abs_cost, cost, vtime;
1711
1712
1713 if (!ioc->enabled || !iocg->level)
1714 return;
1715
1716
1717 if (!iocg_activate(iocg, &now))
1718 return;
1719
1720
1721 abs_cost = calc_vtime_cost(bio, iocg, false);
1722 if (!abs_cost)
1723 return;
1724
1725 iocg->cursor = bio_end_sector(bio);
1726
1727 vtime = atomic64_read(&iocg->vtime);
1728 current_hweight(iocg, &hw_active, &hw_inuse);
1729
1730 if (hw_inuse < hw_active &&
1731 time_after_eq64(vtime + ioc->inuse_margin_vtime, now.vnow)) {
1732 TRACE_IOCG_PATH(inuse_reset, iocg, &now,
1733 iocg->inuse, iocg->weight, hw_inuse, hw_active);
1734 spin_lock_irq(&ioc->lock);
1735 propagate_active_weight(iocg, iocg->weight, iocg->weight);
1736 spin_unlock_irq(&ioc->lock);
1737 current_hweight(iocg, &hw_active, &hw_inuse);
1738 }
1739
1740 cost = abs_cost_to_cost(abs_cost, hw_inuse);

 /*
  * If nobody is waiting and we're within budget, issue right away.
  * The tests are racy but the races aren't systemic - we only miss
  * once in a while which is fine.
  */
1747 if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
1748 time_before_eq64(vtime + cost, now.vnow)) {
1749 iocg_commit_bio(iocg, bio, cost);
1750 return;
1751 }
1752
1753
1754
1755
1756
1757
1758
1759 spin_lock_irq(&iocg->waitq.lock);
1760
1761 if (unlikely(list_empty(&iocg->active_list))) {
1762 spin_unlock_irq(&iocg->waitq.lock);
1763 iocg_commit_bio(iocg, bio, cost);
1764 return;
1765 }

 /*
  * We're over budget. If @bio has to be issued regardless - it is
  * issued on behalf of the root cgroup or the issuer has a fatal
  * signal pending - remember the abs_cost as debt instead of advancing
  * vtime. iocg_kick_waitq() and the period timer pay the debt down
  * from whatever budget becomes available before waking other IOs, and
  * throttling is applied asynchronously through the blkcg delay
  * mechanism instead of sleeping here.
  *
  * Charging the full cost now at the current hw_inuse could push a
  * cgroup with a tiny in-use weight unreasonably far into the future
  * and keep it blocked long after its weight has been raised.
  */
1784 if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) {
1785 iocg->abs_vdebt += abs_cost;
1786 if (iocg_kick_delay(iocg, &now, cost))
1787 blkcg_schedule_throttle(rqos->q,
1788 (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
1789 spin_unlock_irq(&iocg->waitq.lock);
1790 return;
1791 }

 /*
  * Append self to the waitq and let iocg_kick_waitq() arm the wakeup
  * timer. The timer duration is calculated from the current vrate;
  * subsequent vtime and hweight changes can make it too short or too
  * long. Each wait entry records the abs_cost it's waiting for so
  * that iocg_wake_fn() can re-evaluate with the then-current budget.
  *
  * If too short, the timer simply reschedules itself. If too long,
  * the period timer will notice and trigger wakeups.
  *
  * All waiters are on iocg->waitq and the wait states are
  * synchronized using waitq.lock.
  */
1806 init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
1807 wait.wait.private = current;
1808 wait.bio = bio;
1809 wait.abs_cost = abs_cost;
1810 wait.committed = false;
1811
1812 __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
1813 iocg_kick_waitq(iocg, &now);
1814
1815 spin_unlock_irq(&iocg->waitq.lock);
1816
1817 while (true) {
1818 set_current_state(TASK_UNINTERRUPTIBLE);
1819 if (wait.committed)
1820 break;
1821 io_schedule();
1822 }
1823
1824
1825 finish_wait(&iocg->waitq, &wait.wait);
1826}
1827
1828static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
1829 struct bio *bio)
1830{
1831 struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
1832 struct ioc *ioc = iocg->ioc;
1833 sector_t bio_end = bio_end_sector(bio);
1834 struct ioc_now now;
1835 u32 hw_inuse;
1836 u64 abs_cost, cost;
1837 unsigned long flags;
1838
1839
1840 if (!ioc->enabled || !iocg->level)
1841 return;
1842
1843 abs_cost = calc_vtime_cost(bio, iocg, true);
1844 if (!abs_cost)
1845 return;
1846
1847 ioc_now(ioc, &now);
1848 current_hweight(iocg, NULL, &hw_inuse);
1849 cost = abs_cost_to_cost(abs_cost, hw_inuse);
1850
1851
1852 if (blk_rq_pos(rq) < bio_end &&
1853 blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
1854 iocg->cursor = bio_end;
1855
1856
1857
1858
1859
1860 if (rq->bio && rq->bio->bi_iocost_cost &&
1861 time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) {
1862 iocg_commit_bio(iocg, bio, cost);
1863 return;
1864 }
1865
1866
1867
1868
1869
1870
1871 spin_lock_irqsave(&iocg->waitq.lock, flags);
1872 if (likely(!list_empty(&iocg->active_list))) {
1873 iocg->abs_vdebt += abs_cost;
1874 iocg_kick_delay(iocg, &now, cost);
1875 } else {
1876 iocg_commit_bio(iocg, bio, cost);
1877 }
1878 spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1879}
1880
1881static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
1882{
1883 struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
1884
1885 if (iocg && bio->bi_iocost_cost)
1886 atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
1887}
1888
1889static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
1890{
1891 struct ioc *ioc = rqos_to_ioc(rqos);
1892 u64 on_q_ns, rq_wait_ns;
1893 int pidx, rw;
1894
1895 if (!ioc->enabled || !rq_aux(rq)->alloc_time_ns || !rq->start_time_ns)
1896 return;
1897
1898 switch (req_op(rq) & REQ_OP_MASK) {
1899 case REQ_OP_READ:
1900 pidx = QOS_RLAT;
1901 rw = READ;
1902 break;
1903 case REQ_OP_WRITE:
1904 pidx = QOS_WLAT;
1905 rw = WRITE;
1906 break;
1907 default:
1908 return;
1909 }
1910
1911 on_q_ns = ktime_get_ns() - rq_aux(rq)->alloc_time_ns;
1912 rq_wait_ns = rq->start_time_ns - rq_aux(rq)->alloc_time_ns;
1913
1914 if (on_q_ns <= ioc->params.qos[pidx] * NSEC_PER_USEC)
1915 this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_met);
1916 else
1917 this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_missed);
1918
1919 this_cpu_add(ioc->pcpu_stat->rq_wait_ns, rq_wait_ns);
1920}
1921
1922static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
1923{
1924 struct ioc *ioc = rqos_to_ioc(rqos);
1925
1926 spin_lock_irq(&ioc->lock);
1927 ioc_refresh_params(ioc, false);
1928 spin_unlock_irq(&ioc->lock);
1929}
1930
1931static void ioc_rqos_exit(struct rq_qos *rqos)
1932{
1933 struct ioc *ioc = rqos_to_ioc(rqos);
1934
1935 blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
1936
1937 spin_lock_irq(&ioc->lock);
1938 ioc->running = IOC_STOP;
1939 spin_unlock_irq(&ioc->lock);
1940
1941 del_timer_sync(&ioc->timer);
1942 free_percpu(ioc->pcpu_stat);
1943 kfree(ioc);
1944}
1945
1946static struct rq_qos_ops ioc_rqos_ops = {
1947 .throttle = ioc_rqos_throttle,
1948 .merge = ioc_rqos_merge,
1949 .done_bio = ioc_rqos_done_bio,
1950 .done = ioc_rqos_done,
1951 .queue_depth_changed = ioc_rqos_queue_depth_changed,
1952 .exit = ioc_rqos_exit,
1953};
1954
1955static int blk_iocost_init(struct request_queue *q)
1956{
1957 struct ioc *ioc;
1958 struct rq_qos *rqos;
1959 int ret;
1960
1961 ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
1962 if (!ioc)
1963 return -ENOMEM;
1964
1965 ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
1966 if (!ioc->pcpu_stat) {
1967 kfree(ioc);
1968 return -ENOMEM;
1969 }
1970
1971 rqos = &ioc->rqos;
1972 rqos->id = RQ_QOS_COST;
1973 rqos->ops = &ioc_rqos_ops;
1974 rqos->q = q;
1975
1976 spin_lock_init(&ioc->lock);
1977 timer_setup(&ioc->timer, ioc_timer_fn, 0);
1978 INIT_LIST_HEAD(&ioc->active_iocgs);
1979
1980 ioc->running = IOC_IDLE;
1981 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
1982 seqcount_init(&ioc->period_seqcount);
1983 ioc->period_at = ktime_to_us(ktime_get());
1984 atomic64_set(&ioc->cur_period, 0);
1985 atomic_set(&ioc->hweight_gen, 0);
1986
1987 spin_lock_irq(&ioc->lock);
1988 ioc->autop_idx = AUTOP_INVALID;
1989 ioc_refresh_params(ioc, true);
1990 spin_unlock_irq(&ioc->lock);
1991
1992 rq_qos_add(q, rqos);
1993 ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
1994 if (ret) {
1995 rq_qos_del(q, rqos);
1996 free_percpu(ioc->pcpu_stat);
1997 kfree(ioc);
1998 return ret;
1999 }
2000 return 0;
2001}
2002
2003static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
2004{
2005 struct ioc_cgrp *iocc;
2006
2007 iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
2008 if (!iocc)
2009 return NULL;
2010
2011 iocc->dfl_weight = CGROUP_WEIGHT_DFL;
2012 return &iocc->cpd;
2013}
2014
2015static void ioc_cpd_free(struct blkcg_policy_data *cpd)
2016{
2017 kfree(container_of(cpd, struct ioc_cgrp, cpd));
2018}
2019
2020static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q,
2021 struct blkcg *blkcg)
2022{
2023 int levels = blkcg->css.cgroup->level + 1;
2024 struct ioc_gq *iocg;
2025
2026 iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp, q->node);
2027 if (!iocg)
2028 return NULL;
2029
2030 return &iocg->pd;
2031}
2032
2033static void ioc_pd_init(struct blkg_policy_data *pd)
2034{
2035 struct ioc_gq *iocg = pd_to_iocg(pd);
2036 struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
2037 struct ioc *ioc = q_to_ioc(blkg->q);
2038 struct ioc_now now;
2039 struct blkcg_gq *tblkg;
2040 unsigned long flags;
2041
2042 ioc_now(ioc, &now);
2043
2044 iocg->ioc = ioc;
2045 atomic64_set(&iocg->vtime, now.vnow);
2046 atomic64_set(&iocg->done_vtime, now.vnow);
2047 atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
2048 INIT_LIST_HEAD(&iocg->active_list);
2049 iocg->hweight_active = HWEIGHT_WHOLE;
2050 iocg->hweight_inuse = HWEIGHT_WHOLE;
2051
2052 init_waitqueue_head(&iocg->waitq);
2053 hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2054 iocg->waitq_timer.function = iocg_waitq_timer_fn;
2055 hrtimer_init(&iocg->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2056 iocg->delay_timer.function = iocg_delay_timer_fn;
2057
2058 iocg->level = blkg->blkcg->css.cgroup->level;
2059
2060 for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
2061 struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
2062 iocg->ancestors[tiocg->level] = tiocg;
2063 }
2064
2065 spin_lock_irqsave(&ioc->lock, flags);
2066 weight_updated(iocg);
2067 spin_unlock_irqrestore(&ioc->lock, flags);
2068}
2069
2070static void ioc_pd_free(struct blkg_policy_data *pd)
2071{
2072 struct ioc_gq *iocg = pd_to_iocg(pd);
2073 struct ioc *ioc = iocg->ioc;
2074 unsigned long flags;
2075
2076 if (ioc) {
2077 spin_lock_irqsave(&ioc->lock, flags);
2078 if (!list_empty(&iocg->active_list)) {
2079 propagate_active_weight(iocg, 0, 0);
2080 list_del_init(&iocg->active_list);
2081 }
2082 spin_unlock_irqrestore(&ioc->lock, flags);
2083
2084 hrtimer_cancel(&iocg->waitq_timer);
2085 hrtimer_cancel(&iocg->delay_timer);
2086 }
2087 kfree(iocg);
2088}
2089
2090static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2091 int off)
2092{
2093 const char *dname = blkg_dev_name(pd->blkg);
2094 struct ioc_gq *iocg = pd_to_iocg(pd);
2095
2096 if (dname && iocg->cfg_weight)
2097 seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight);
2098 return 0;
2099}
2100
2101
2102static int ioc_weight_show(struct seq_file *sf, void *v)
2103{
2104 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2105 struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2106
2107 seq_printf(sf, "default %u\n", iocc->dfl_weight);
2108 blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
2109 &blkcg_policy_iocost, seq_cft(sf)->private, false);
2110 return 0;
2111}
2112
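/*
 * io.weight is written either as a default weight for the cgroup or as a
 * per-device override, e.g. (the "8:16" MAJ:MIN below is only an example):
 *
 *   $ echo "default 200" > io.weight
 *   $ echo "8:16 50" > io.weight
 *   $ echo "8:16 default" > io.weight	(drop the per-device override)
 *
 * Weights must be between CGROUP_WEIGHT_MIN and CGROUP_WEIGHT_MAX.
 */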
2113static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
2114 size_t nbytes, loff_t off)
2115{
2116 struct blkcg *blkcg = css_to_blkcg(of_css(of));
2117 struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2118 struct blkg_conf_ctx ctx;
2119 struct ioc_gq *iocg;
2120 u32 v;
2121 int ret;
2122
2123 if (!strchr(buf, ':')) {
2124 struct blkcg_gq *blkg;
2125
2126 if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
2127 return -EINVAL;
2128
2129 if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2130 return -EINVAL;
2131
2132 spin_lock(&blkcg->lock);
2133 iocc->dfl_weight = v;
2134 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
2135 struct ioc_gq *iocg = blkg_to_iocg(blkg);
2136
2137 if (iocg) {
2138 spin_lock_irq(&iocg->ioc->lock);
2139 weight_updated(iocg);
2140 spin_unlock_irq(&iocg->ioc->lock);
2141 }
2142 }
2143 spin_unlock(&blkcg->lock);
2144
2145 return nbytes;
2146 }
2147
2148 ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx);
2149 if (ret)
2150 return ret;
2151
2152 iocg = blkg_to_iocg(ctx.blkg);
2153
2154 if (!strncmp(ctx.body, "default", 7)) {
2155 v = 0;
2156 } else {
2157 if (!sscanf(ctx.body, "%u", &v))
2158 goto einval;
2159 if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2160 goto einval;
2161 }
2162
2163 spin_lock(&iocg->ioc->lock);
2164 iocg->cfg_weight = v;
2165 weight_updated(iocg);
2166 spin_unlock(&iocg->ioc->lock);
2167
2168 blkg_conf_finish(&ctx);
2169 return nbytes;
2170
2171einval:
2172 blkg_conf_finish(&ctx);
2173 return -EINVAL;
2174}
2175
2176static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2177 int off)
2178{
2179 const char *dname = blkg_dev_name(pd->blkg);
2180 struct ioc *ioc = pd_to_iocg(pd)->ioc;
2181
2182 if (!dname)
2183 return 0;
2184
2185 seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
2186 dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
2187 ioc->params.qos[QOS_RPPM] / 10000,
2188 ioc->params.qos[QOS_RPPM] % 10000 / 100,
2189 ioc->params.qos[QOS_RLAT],
2190 ioc->params.qos[QOS_WPPM] / 10000,
2191 ioc->params.qos[QOS_WPPM] % 10000 / 100,
2192 ioc->params.qos[QOS_WLAT],
2193 ioc->params.qos[QOS_MIN] / 10000,
2194 ioc->params.qos[QOS_MIN] % 10000 / 100,
2195 ioc->params.qos[QOS_MAX] / 10000,
2196 ioc->params.qos[QOS_MAX] % 10000 / 100);
2197 return 0;
2198}
2199
2200static int ioc_qos_show(struct seq_file *sf, void *v)
2201{
2202 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2203
2204 blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
2205 &blkcg_policy_iocost, seq_cft(sf)->private, false);
2206 return 0;
2207}
2208
2209static const match_table_t qos_ctrl_tokens = {
2210 { QOS_ENABLE, "enable=%u" },
2211 { QOS_CTRL, "ctrl=%s" },
2212 { NR_QOS_CTRL_PARAMS, NULL },
2213};
2214
2215static const match_table_t qos_tokens = {
2216 { QOS_RPPM, "rpct=%s" },
2217 { QOS_RLAT, "rlat=%u" },
2218 { QOS_WPPM, "wpct=%s" },
2219 { QOS_WLAT, "wlat=%u" },
2220 { QOS_MIN, "min=%s" },
2221 { QOS_MAX, "max=%s" },
2222 { NR_QOS_PARAMS, NULL },
2223};
2224
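/*
 * io.cost.qos takes a device followed by space-separated key=value pairs,
 * e.g. (the "8:16" MAJ:MIN and values are only an example):
 *
 *   $ echo "8:16 enable=1 ctrl=user rpct=95.00 rlat=10000 wpct=95.00 \
 *     wlat=20000 min=50.00 max=150.00" > io.cost.qos
 *
 * rpct/wpct are latency percentile targets, rlat/wlat the matching latency
 * targets in microseconds, and min/max bound the vrate in percent. Setting
 * any QoS parameter switches ctrl to "user"; "ctrl=auto" reverts to the
 * built-in per-device-type defaults.
 */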
2225static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
2226 size_t nbytes, loff_t off)
2227{
2228 struct gendisk *disk;
2229 struct ioc *ioc;
2230 u32 qos[NR_QOS_PARAMS];
2231 bool enable, user;
2232 char *p;
2233 int ret;
2234
2235 disk = blkcg_conf_get_disk(&input);
2236 if (IS_ERR(disk))
2237 return PTR_ERR(disk);
2238
2239 ioc = q_to_ioc(disk->queue);
2240 if (!ioc) {
2241 ret = blk_iocost_init(disk->queue);
2242 if (ret)
2243 goto err;
2244 ioc = q_to_ioc(disk->queue);
2245 }
2246
2247 spin_lock_irq(&ioc->lock);
2248 memcpy(qos, ioc->params.qos, sizeof(qos));
2249 enable = ioc->enabled;
2250 user = ioc->user_qos_params;
2251 spin_unlock_irq(&ioc->lock);
2252
2253 while ((p = strsep(&input, " \t\n"))) {
2254 substring_t args[MAX_OPT_ARGS];
2255 char buf[32];
2256 int tok;
2257 s64 v;
2258
2259 if (!*p)
2260 continue;
2261
2262 switch (match_token(p, qos_ctrl_tokens, args)) {
2263 case QOS_ENABLE:
2264 match_u64(&args[0], &v);
2265 enable = v;
2266 continue;
2267 case QOS_CTRL:
2268 match_strlcpy(buf, &args[0], sizeof(buf));
2269 if (!strcmp(buf, "auto"))
2270 user = false;
2271 else if (!strcmp(buf, "user"))
2272 user = true;
2273 else
2274 goto einval;
2275 continue;
2276 }
2277
2278 tok = match_token(p, qos_tokens, args);
2279 switch (tok) {
2280 case QOS_RPPM:
2281 case QOS_WPPM:
2282 if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2283 sizeof(buf))
2284 goto einval;
2285 if (cgroup_parse_float(buf, 2, &v))
2286 goto einval;
2287 if (v < 0 || v > 10000)
2288 goto einval;
2289 qos[tok] = v * 100;
2290 break;
2291 case QOS_RLAT:
2292 case QOS_WLAT:
2293 if (match_u64(&args[0], &v))
2294 goto einval;
2295 qos[tok] = v;
2296 break;
2297 case QOS_MIN:
2298 case QOS_MAX:
2299 if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2300 sizeof(buf))
2301 goto einval;
2302 if (cgroup_parse_float(buf, 2, &v))
2303 goto einval;
2304 if (v < 0)
2305 goto einval;
2306 qos[tok] = clamp_t(s64, v * 100,
2307 VRATE_MIN_PPM, VRATE_MAX_PPM);
2308 break;
2309 default:
2310 goto einval;
2311 }
2312 user = true;
2313 }
2314
2315 if (qos[QOS_MIN] > qos[QOS_MAX])
2316 goto einval;
2317
2318 spin_lock_irq(&ioc->lock);
2319
2320 if (enable) {
2321 blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2322 ioc->enabled = true;
2323 } else {
2324 blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2325 ioc->enabled = false;
2326 }
2327
2328 if (user) {
2329 memcpy(ioc->params.qos, qos, sizeof(qos));
2330 ioc->user_qos_params = true;
2331 } else {
2332 ioc->user_qos_params = false;
2333 }
2334
2335 ioc_refresh_params(ioc, true);
2336 spin_unlock_irq(&ioc->lock);
2337
2338 put_disk_and_module(disk);
2339 return nbytes;
2340einval:
2341 ret = -EINVAL;
2342err:
2343 put_disk_and_module(disk);
2344 return ret;
2345}
2346
2347static u64 ioc_cost_model_prfill(struct seq_file *sf,
2348 struct blkg_policy_data *pd, int off)
2349{
2350 const char *dname = blkg_dev_name(pd->blkg);
2351 struct ioc *ioc = pd_to_iocg(pd)->ioc;
2352 u64 *u = ioc->params.i_lcoefs;
2353
2354 if (!dname)
2355 return 0;
2356
2357 seq_printf(sf, "%s ctrl=%s model=linear "
2358 "rbps=%llu rseqiops=%llu rrandiops=%llu "
2359 "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
2360 dname, ioc->user_cost_model ? "user" : "auto",
2361 u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
2362 u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
2363 return 0;
2364}
2365
2366static int ioc_cost_model_show(struct seq_file *sf, void *v)
2367{
2368 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2369
2370 blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
2371 &blkcg_policy_iocost, seq_cft(sf)->private, false);
2372 return 0;
2373}
2374
2375static const match_table_t cost_ctrl_tokens = {
2376 { COST_CTRL, "ctrl=%s" },
2377 { COST_MODEL, "model=%s" },
2378 { NR_COST_CTRL_PARAMS, NULL },
2379};
2380
2381static const match_table_t i_lcoef_tokens = {
2382 { I_LCOEF_RBPS, "rbps=%u" },
2383 { I_LCOEF_RSEQIOPS, "rseqiops=%u" },
2384 { I_LCOEF_RRANDIOPS, "rrandiops=%u" },
2385 { I_LCOEF_WBPS, "wbps=%u" },
2386 { I_LCOEF_WSEQIOPS, "wseqiops=%u" },
2387 { I_LCOEF_WRANDIOPS, "wrandiops=%u" },
2388 { NR_I_LCOEFS, NULL },
2389};
2390
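/*
 * io.cost.model takes the device followed by the model parameters, e.g.
 * (the "8:16" MAJ:MIN and the coefficients are only placeholders, taken
 * from the AUTOP_SSD_DFL table above):
 *
 *   $ echo "8:16 ctrl=user model=linear rbps=488636629 rseqiops=8932 \
 *     rrandiops=8518 wbps=427891549 wseqiops=28755 wrandiops=21940" \
 *     > io.cost.model
 *
 * Only the "linear" model is implemented. Writing any coefficient switches
 * ctrl to "user"; "ctrl=auto" reverts to the autop table.
 */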
2391static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
2392 size_t nbytes, loff_t off)
2393{
2394 struct gendisk *disk;
2395 struct ioc *ioc;
2396 u64 u[NR_I_LCOEFS];
2397 bool user;
2398 char *p;
2399 int ret;
2400
2401 disk = blkcg_conf_get_disk(&input);
2402 if (IS_ERR(disk))
2403 return PTR_ERR(disk);
2404
2405 ioc = q_to_ioc(disk->queue);
2406 if (!ioc) {
2407 ret = blk_iocost_init(disk->queue);
2408 if (ret)
2409 goto err;
2410 ioc = q_to_ioc(disk->queue);
2411 }
2412
2413 spin_lock_irq(&ioc->lock);
2414 memcpy(u, ioc->params.i_lcoefs, sizeof(u));
2415 user = ioc->user_cost_model;
2416 spin_unlock_irq(&ioc->lock);
2417
2418 while ((p = strsep(&input, " \t\n"))) {
2419 substring_t args[MAX_OPT_ARGS];
2420 char buf[32];
2421 int tok;
2422 u64 v;
2423
2424 if (!*p)
2425 continue;
2426
2427 switch (match_token(p, cost_ctrl_tokens, args)) {
2428 case COST_CTRL:
2429 match_strlcpy(buf, &args[0], sizeof(buf));
2430 if (!strcmp(buf, "auto"))
2431 user = false;
2432 else if (!strcmp(buf, "user"))
2433 user = true;
2434 else
2435 goto einval;
2436 continue;
2437 case COST_MODEL:
2438 match_strlcpy(buf, &args[0], sizeof(buf));
2439 if (strcmp(buf, "linear"))
2440 goto einval;
2441 continue;
2442 }
2443
2444 tok = match_token(p, i_lcoef_tokens, args);
2445 if (tok == NR_I_LCOEFS)
2446 goto einval;
2447 if (match_u64(&args[0], &v))
2448 goto einval;
2449 u[tok] = v;
2450 user = true;
2451 }
2452
2453 spin_lock_irq(&ioc->lock);
2454 if (user) {
2455 memcpy(ioc->params.i_lcoefs, u, sizeof(u));
2456 ioc->user_cost_model = true;
2457 } else {
2458 ioc->user_cost_model = false;
2459 }
2460 ioc_refresh_params(ioc, true);
2461 spin_unlock_irq(&ioc->lock);
2462
2463 put_disk_and_module(disk);
2464 return nbytes;
2465
2466einval:
2467 ret = -EINVAL;
2468err:
2469 put_disk_and_module(disk);
2470 return ret;
2471}
2472
2473static struct cftype ioc_files[] = {
2474 {
2475 .name = "weight",
2476 .flags = CFTYPE_NOT_ON_ROOT,
2477 .seq_show = ioc_weight_show,
2478 .write = ioc_weight_write,
2479 },
2480 {
2481 .name = "cost.qos",
2482 .flags = CFTYPE_ONLY_ON_ROOT,
2483 .seq_show = ioc_qos_show,
2484 .write = ioc_qos_write,
2485 },
2486 {
2487 .name = "cost.model",
2488 .flags = CFTYPE_ONLY_ON_ROOT,
2489 .seq_show = ioc_cost_model_show,
2490 .write = ioc_cost_model_write,
2491 },
2492 {}
2493};
2494
2495static struct blkcg_policy blkcg_policy_iocost = {
2496 .dfl_cftypes = ioc_files,
2497 .cpd_alloc_fn = ioc_cpd_alloc,
2498 .cpd_free_fn = ioc_cpd_free,
2499 .pd_alloc_fn = ioc_pd_alloc,
2500 .pd_init_fn = ioc_pd_init,
2501 .pd_free_fn = ioc_pd_free,
2502};
2503
2504static int __init ioc_init(void)
2505{
2506 return blkcg_policy_register(&blkcg_policy_iocost);
2507}
2508
2509static void __exit ioc_exit(void)
2510{
2511 return blkcg_policy_unregister(&blkcg_policy_iocost);
2512}
2513
2514module_init(ioc_init);
2515module_exit(ioc_exit);
2516