// SPDX-License-Identifier: GPL-2.0
/*
 * Common Block IO controller cgroup interface (blk-cgroup).
 */
#include <linux/ioprio.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/sched/signal.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/ctype.h>
#include <linux/blk-cgroup.h>
#include <linux/tracehook.h>
#include <linux/psi.h>
#include "blk.h"

#define MAX_KEY_LEN 100

/*
 * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
 * blkcg_pol_register_mutex nests outside of it and synchronizes entire
 * policy [un]registration.
 */
static DEFINE_MUTEX(blkcg_pol_register_mutex);
static DEFINE_MUTEX(blkcg_pol_mutex);

struct blkcg blkcg_root;
EXPORT_SYMBOL_GPL(blkcg_root);

struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
EXPORT_SYMBOL_GPL(blkcg_root_css);

static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];

static LIST_HEAD(all_blkcgs);		/* protected by blkcg_pol_mutex */

bool blkcg_debug_stats = false;
static struct workqueue_struct *blkcg_punt_bio_wq;

static bool blkcg_policy_enabled(struct request_queue *q,
				 const struct blkcg_policy *pol)
{
	return pol && test_bit(pol->plid, q->blkcg_pols);
}

/**
 * blkg_free - free a blkg
 * @blkg: blkg to free
 *
 * Free @blkg which may be partially allocated.
 */
static void blkg_free(struct blkcg_gq *blkg)
{
	int i;

	if (!blkg)
		return;

	for (i = 0; i < BLKCG_MAX_POLS; i++)
		if (blkg->pd[i])
			blkcg_policy[i]->pd_free_fn(blkg->pd[i]);

	free_percpu(blkg->iostat_cpu);
	percpu_ref_exit(&blkg->refcnt);
	kfree(blkg);
}

static void __blkg_release(struct rcu_head *rcu)
{
	struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);

	WARN_ON(!bio_list_empty(&blkg->async_bios));

	/* release the blkcg and parent blkg refs this blkg has been holding */
	css_put(&blkg->blkcg->css);
	if (blkg->parent)
		blkg_put(blkg->parent);
	blkg_free(blkg);
}

/*
 * A group is RCU protected, but having an rcu lock does not mean that one
 * can access all the fields of blkg and assume these are valid.  For
 * example, don't try to follow throtl_data and request queue links.
 *
 * Having a reference to blkg under an rcu allows accesses to only values
 * local to groups like group stats and group rate limits.
 */
static void blkg_release(struct percpu_ref *ref)
{
	struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt);

	call_rcu(&blkg->rcu_head, __blkg_release);
}

static void blkg_async_bio_workfn(struct work_struct *work)
{
	struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
					     async_bio_work);
	struct bio_list bios = BIO_EMPTY_LIST;
	struct bio *bio;

	/* as long as there are pending bios, @blkg can't go away */
	spin_lock_bh(&blkg->async_bio_lock);
	bio_list_merge(&bios, &blkg->async_bios);
	bio_list_init(&blkg->async_bios);
	spin_unlock_bh(&blkg->async_bio_lock);

	while ((bio = bio_list_pop(&bios)))
		submit_bio(bio);
}

/**
 * blkg_alloc - allocate a blkg
 * @blkcg: block cgroup the new blkg is associated with
 * @q: request_queue the new blkg is associated with
 * @gfp_mask: allocation mask to use
 *
 * Allocate a new blkg associating @blkcg and @q.
 */
static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
				   gfp_t gfp_mask)
{
	struct blkcg_gq *blkg;
	int i, cpu;

	/* alloc and init base part */
	blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node);
	if (!blkg)
		return NULL;

	if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask))
		goto err_free;

	blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask);
	if (!blkg->iostat_cpu)
		goto err_free;

	blkg->q = q;
	INIT_LIST_HEAD(&blkg->q_node);
	spin_lock_init(&blkg->async_bio_lock);
	bio_list_init(&blkg->async_bios);
	INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
	blkg->blkcg = blkcg;

	u64_stats_init(&blkg->iostat.sync);
	for_each_possible_cpu(cpu)
		u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync);

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];
		struct blkg_policy_data *pd;

		if (!blkcg_policy_enabled(q, pol))
			continue;

		/* alloc per-policy data and attach it to blkg */
		pd = pol->pd_alloc_fn(gfp_mask, q, blkcg);
		if (!pd)
			goto err_free;

		blkg->pd[i] = pd;
		pd->blkg = blkg;
		pd->plid = i;
	}

	return blkg;

err_free:
	blkg_free(blkg);
	return NULL;
}

struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
				      struct request_queue *q, bool update_hint)
{
	struct blkcg_gq *blkg;

	/*
	 * Hint didn't match.  Look up from the radix tree.  Note that the
	 * hint can only be updated under queue_lock as otherwise @blkg
	 * could have already been removed from blkg_tree.  The caller is
	 * responsible for grabbing queue_lock if @update_hint.
	 */
	blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
	if (blkg && blkg->q == q) {
		if (update_hint) {
			lockdep_assert_held(&q->queue_lock);
			rcu_assign_pointer(blkcg->blkg_hint, blkg);
		}
		return blkg;
	}

	return NULL;
}
EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);
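
/*
 * Typical lookup pattern (illustrative sketch only, not a caller in this
 * file): blkg_lookup() in blk-cgroup.h tries blkcg->blkg_hint first and
 * falls back to blkg_lookup_slowpath().  A reader that merely consults
 * per-blkg state can do so entirely under RCU:
 *
 *	rcu_read_lock();
 *	blkg = blkg_lookup(blkcg, q);
 *	if (blkg)
 *		... read fields that are valid under RCU ...
 *	rcu_read_unlock();
 */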

/*
 * If @new_blkg is %NULL, this function tries to allocate a new one as
 * necessary using %GFP_NOWAIT.  @new_blkg is always consumed on return.
 */
static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
				    struct request_queue *q,
				    struct blkcg_gq *new_blkg)
{
	struct blkcg_gq *blkg;
	int i, ret;

	WARN_ON_ONCE(!rcu_read_lock_held());
	lockdep_assert_held(&q->queue_lock);

	/* request_queue is dying, do not create/recreate a blkg */
	if (blk_queue_dying(q)) {
		ret = -ENODEV;
		goto err_free_blkg;
	}

	/* blkcg is going away, don't create a new blkg for it */
	if (!css_tryget_online(&blkcg->css)) {
		ret = -ENODEV;
		goto err_free_blkg;
	}

	/* allocate */
	if (!new_blkg) {
		new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN);
		if (unlikely(!new_blkg)) {
			ret = -ENOMEM;
			goto err_put_css;
		}
	}
	blkg = new_blkg;

	/* link parent */
	if (blkcg_parent(blkcg)) {
		blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
		if (WARN_ON_ONCE(!blkg->parent)) {
			ret = -ENODEV;
			goto err_put_css;
		}
		blkg_get(blkg->parent);
	}

	/* invoke per-policy init */
	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];

		if (blkg->pd[i] && pol->pd_init_fn)
			pol->pd_init_fn(blkg->pd[i]);
	}

	/* insert */
	spin_lock(&blkcg->lock);
	ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
	if (likely(!ret)) {
		hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
		list_add(&blkg->q_node, &q->blkg_list);

		for (i = 0; i < BLKCG_MAX_POLS; i++) {
			struct blkcg_policy *pol = blkcg_policy[i];

			if (blkg->pd[i] && pol->pd_online_fn)
				pol->pd_online_fn(blkg->pd[i]);
		}
	}
	blkg->online = true;
	spin_unlock(&blkcg->lock);

	if (!ret)
		return blkg;

	/* @blkg failed fully initialized, use the usual release path */
	blkg_put(blkg);
	return ERR_PTR(ret);

err_put_css:
	css_put(&blkcg->css);
err_free_blkg:
	blkg_free(new_blkg);
	return ERR_PTR(ret);
}

/**
 * blkg_lookup_create - lookup blkg, try to create one if not there
 * @blkcg: blkcg of interest
 * @q: request_queue of interest
 *
 * Lookup blkg for the @blkcg - @q pair.  If it doesn't exist, try to
 * create one.  blkg creation is performed recursively from blkcg_root such
 * that all non-root blkgs have access to the parent blkg.  This function
 * should be called under RCU read lock and takes @q->queue_lock.
 *
 * Returns the blkg or the closest blkg if blkg_create() fails as it walks
 * down from root.
 */
static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
					   struct request_queue *q)
{
	struct blkcg_gq *blkg;
	unsigned long flags;

	WARN_ON_ONCE(!rcu_read_lock_held());

	blkg = blkg_lookup(blkcg, q);
	if (blkg)
		return blkg;

	spin_lock_irqsave(&q->queue_lock, flags);
	blkg = __blkg_lookup(blkcg, q, true);
	if (blkg)
		goto found;

	/*
	 * Create blkgs walking down from blkcg_root to @blkcg, so that all
	 * non-root blkgs have access to their parents.  Returns the closest
	 * blkg to the intended blkg should blkg_create() fail.
	 */
	while (true) {
		struct blkcg *pos = blkcg;
		struct blkcg *parent = blkcg_parent(blkcg);
		struct blkcg_gq *ret_blkg = q->root_blkg;

		while (parent) {
			blkg = __blkg_lookup(parent, q, false);
			if (blkg) {
				/* remember closest blkg */
				ret_blkg = blkg;
				break;
			}
			pos = parent;
			parent = blkcg_parent(parent);
		}

		blkg = blkg_create(pos, q, NULL);
		if (IS_ERR(blkg)) {
			blkg = ret_blkg;
			break;
		}
		if (pos == blkcg)
			break;
	}

found:
	spin_unlock_irqrestore(&q->queue_lock, flags);
	return blkg;
}

static void blkg_destroy(struct blkcg_gq *blkg)
{
	struct blkcg *blkcg = blkg->blkcg;
	int i;

	lockdep_assert_held(&blkg->q->queue_lock);
	lockdep_assert_held(&blkcg->lock);

	/* Something wrong if we are trying to remove same group twice */
	WARN_ON_ONCE(list_empty(&blkg->q_node));
	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];

		if (blkg->pd[i] && pol->pd_offline_fn)
			pol->pd_offline_fn(blkg->pd[i]);
	}

	blkg->online = false;

	radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
	list_del_init(&blkg->q_node);
	hlist_del_init_rcu(&blkg->blkcg_node);

	/*
	 * Both setting lookup hint to and clearing it from @blkg are done
	 * under queue_lock.  If it's not pointing to @blkg now, it never
	 * will.  Don't clear it.
	 */
	if (rcu_access_pointer(blkcg->blkg_hint) == blkg)
		rcu_assign_pointer(blkcg->blkg_hint, NULL);

	/*
	 * Put the reference taken at the time of creation so that when all
	 * queues are gone, group can be destroyed.
	 */
	percpu_ref_kill(&blkg->refcnt);
}

/**
 * blkg_destroy_all - destroy all blkgs associated with a request_queue
 * @q: request_queue of interest
 *
 * Destroy all blkgs associated with @q.
 */
static void blkg_destroy_all(struct request_queue *q)
{
	struct blkcg_gq *blkg, *n;

	spin_lock_irq(&q->queue_lock);
	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
		struct blkcg *blkcg = blkg->blkcg;

		spin_lock(&blkcg->lock);
		blkg_destroy(blkg);
		spin_unlock(&blkcg->lock);
	}

	q->root_blkg = NULL;
	spin_unlock_irq(&q->queue_lock);
}

static int blkcg_reset_stats(struct cgroup_subsys_state *css,
			     struct cftype *cftype, u64 val)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	struct blkcg_gq *blkg;
	int i, cpu;

	mutex_lock(&blkcg_pol_mutex);
	spin_lock_irq(&blkcg->lock);

	/*
	 * Note that stat reset is racy - it doesn't synchronize against
	 * stat updates.  This is a debug feature which shouldn't exist
	 * anyway.  If you get hit by a race, retry.
	 */
	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
		for_each_possible_cpu(cpu) {
			struct blkg_iostat_set *bis =
				per_cpu_ptr(blkg->iostat_cpu, cpu);
			memset(bis, 0, sizeof(*bis));
		}
		memset(&blkg->iostat, 0, sizeof(blkg->iostat));

		for (i = 0; i < BLKCG_MAX_POLS; i++) {
			struct blkcg_policy *pol = blkcg_policy[i];

			if (blkg->pd[i] && pol->pd_reset_stats_fn)
				pol->pd_reset_stats_fn(blkg->pd[i]);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	mutex_unlock(&blkcg_pol_mutex);
	return 0;
}

const char *blkg_dev_name(struct blkcg_gq *blkg)
{
	/* some drivers (floppy) instantiate a queue w/o disk registered */
	if (blkg->q->backing_dev_info->dev)
		return bdi_dev_name(blkg->q->backing_dev_info);
	return NULL;
}

/**
 * blkcg_print_blkgs - helper for printing per-blkg data
 * @sf: seq_file to print to
 * @blkcg: blkcg of interest
 * @prfill: fill function to print out a blkg
 * @pol: policy in question
 * @data: data to be passed to @prfill
 * @show_total: to print out sum of prfill return values or not
 *
 * This function invokes @prfill on each blkg of @blkcg if pd for the
 * policy specified by @pol exists.  @prfill is invoked with @sf, the
 * policy data and @data.  If @show_total is %true, the sum of the return
 * values from @prfill is printed with "Total" label at the end.
 *
 * This is to be used to construct print functions for
 * cftype->read_seq_string method.
 */
void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
		       u64 (*prfill)(struct seq_file *,
				     struct blkg_policy_data *, int),
		       const struct blkcg_policy *pol, int data,
		       bool show_total)
{
	struct blkcg_gq *blkg;
	u64 total = 0;

	rcu_read_lock();
	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
		spin_lock_irq(&blkg->q->queue_lock);
		if (blkcg_policy_enabled(blkg->q, pol))
			total += prfill(sf, blkg->pd[pol->plid], data);
		spin_unlock_irq(&blkg->q->queue_lock);
	}
	rcu_read_unlock();

	if (show_total)
		seq_printf(sf, "Total %llu\n", (unsigned long long)total);
}
EXPORT_SYMBOL_GPL(blkcg_print_blkgs);

/**
 * __blkg_prfill_u64 - prfill helper for a single u64 value
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @v: value to print
 *
 * Print @v to @sf for the device associated with @pd.
 */
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
{
	const char *dname = blkg_dev_name(pd->blkg);

	if (!dname)
		return 0;

	seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
	return v;
}
EXPORT_SYMBOL_GPL(__blkg_prfill_u64);
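
/*
 * Example usage (illustrative sketch only; "example_policy", "example_pd"
 * and its "limit" field are hypothetical and not defined in this file).
 * A policy builds its cftype seq_show handler by pairing blkcg_print_blkgs()
 * with a prfill callback such as __blkg_prfill_u64():
 *
 *	static u64 example_prfill(struct seq_file *sf,
 *				  struct blkg_policy_data *pd, int off)
 *	{
 *		struct example_pd *epd = container_of(pd, struct example_pd, pd);
 *
 *		return __blkg_prfill_u64(sf, pd, epd->limit);
 *	}
 *
 *	static int example_show(struct seq_file *sf, void *v)
 *	{
 *		blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
 *				  example_prfill, &example_policy, 0, true);
 *		return 0;
 *	}
 */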

/* Performs queue bypass and policy enabled checks then looks up blkg. */
static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
					  const struct blkcg_policy *pol,
					  struct request_queue *q)
{
	WARN_ON_ONCE(!rcu_read_lock_held());
	lockdep_assert_held(&q->queue_lock);

	if (!blkcg_policy_enabled(q, pol))
		return ERR_PTR(-EOPNOTSUPP);
	return __blkg_lookup(blkcg, q, true /* update_hint */);
}

/**
 * blkcg_conf_get_disk - parse and get disk from MAJ:MIN in @inputp
 * @inputp: input string pointer
 *
 * Parse the device node prefix part, MAJ:MIN, of per-blkg config update
 * from @input and get and return the matching gendisk.  *@inputp is
 * updated to point past the device node prefix.  Returns an ERR_PTR()
 * value on error.
 *
 * Use this function iff blkg_conf_prep() can't be used for some reason.
 */
struct gendisk *blkcg_conf_get_disk(char **inputp)
{
	char *input = *inputp;
	unsigned int major, minor;
	struct gendisk *disk;
	int key_len, part;

	if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
		return ERR_PTR(-EINVAL);

	input += key_len;
	if (!isspace(*input))
		return ERR_PTR(-EINVAL);
	input = skip_spaces(input);

	disk = get_gendisk(MKDEV(major, minor), &part);
	if (!disk)
		return ERR_PTR(-ENODEV);
	if (part) {
		put_disk_and_module(disk);
		return ERR_PTR(-ENODEV);
	}

	*inputp = input;
	return disk;
}
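
/*
 * For example, given a cgroup file write of "8:16 wbps=1048576", the parser
 * above consumes "8:16", resolves it to the whole-device gendisk for
 * MKDEV(8, 16), and leaves *inputp pointing at "wbps=1048576" for the
 * caller's policy-specific parsing.  The key=value body shown here is only
 * an illustration of one policy's syntax, not something interpreted by this
 * helper.
 */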

/**
 * blkg_conf_prep - parse and prepare for per-blkg config update
 * @blkcg: target block cgroup
 * @pol: target policy
 * @input: input string
 * @ctx: blkg_conf_ctx to be filled
 *
 * Parse per-blkg config update from @input and initialize @ctx with the
 * result.  @ctx->blkg points to the blkg to be updated and @ctx->body the
 * part of @input following MAJ:MIN.  This function returns with RCU read
 * lock and queue lock held and must be paired with blkg_conf_finish().
 */
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
		   char *input, struct blkg_conf_ctx *ctx)
	__acquires(rcu) __acquires(&disk->queue->queue_lock)
{
	struct gendisk *disk;
	struct request_queue *q;
	struct blkcg_gq *blkg;
	int ret;

	disk = blkcg_conf_get_disk(&input);
	if (IS_ERR(disk))
		return PTR_ERR(disk);

	q = disk->queue;

	rcu_read_lock();
	spin_lock_irq(&q->queue_lock);

	blkg = blkg_lookup_check(blkcg, pol, q);
	if (IS_ERR(blkg)) {
		ret = PTR_ERR(blkg);
		goto fail_unlock;
	}

	if (blkg)
		goto success;

	/*
	 * Create blkgs walking down from blkcg_root to @blkcg, so that all
	 * non-root blkgs have access to their parents.
	 */
	while (true) {
		struct blkcg *pos = blkcg;
		struct blkcg *parent;
		struct blkcg_gq *new_blkg;

		parent = blkcg_parent(blkcg);
		while (parent && !__blkg_lookup(parent, q, false)) {
			pos = parent;
			parent = blkcg_parent(parent);
		}

		/* Drop locks to do new blkg allocation with GFP_KERNEL. */
		spin_unlock_irq(&q->queue_lock);
		rcu_read_unlock();

		new_blkg = blkg_alloc(pos, q, GFP_KERNEL);
		if (unlikely(!new_blkg)) {
			ret = -ENOMEM;
			goto fail;
		}

		rcu_read_lock();
		spin_lock_irq(&q->queue_lock);

		blkg = blkg_lookup_check(pos, pol, q);
		if (IS_ERR(blkg)) {
			ret = PTR_ERR(blkg);
			goto fail_unlock;
		}

		if (blkg) {
			blkg_free(new_blkg);
		} else {
			blkg = blkg_create(pos, q, new_blkg);
			if (IS_ERR(blkg)) {
				ret = PTR_ERR(blkg);
				goto fail_unlock;
			}
		}

		if (pos == blkcg)
			goto success;
	}
success:
	ctx->disk = disk;
	ctx->blkg = blkg;
	ctx->body = input;
	return 0;

fail_unlock:
	spin_unlock_irq(&q->queue_lock);
	rcu_read_unlock();
fail:
	put_disk_and_module(disk);
	/*
	 * If queue was bypassing, we should retry.  Do so after a
	 * short msleep().  It isn't strictly necessary but queue
	 * can be bypassing for some time and it's always nice to
	 * avoid busy looping.
	 */
	if (ret == -EBUSY) {
		msleep(10);
		ret = restart_syscall();
	}
	return ret;
}
EXPORT_SYMBOL_GPL(blkg_conf_prep);

/**
 * blkg_conf_finish - finish up per-blkg config update
 * @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
 *
 * Finish up after per-blkg config update.  This function must be paired
 * with blkg_conf_prep().
 */
void blkg_conf_finish(struct blkg_conf_ctx *ctx)
	__releases(&ctx->disk->queue->queue_lock) __releases(rcu)
{
	spin_unlock_irq(&ctx->disk->queue->queue_lock);
	rcu_read_unlock();
	put_disk_and_module(ctx->disk);
}
EXPORT_SYMBOL_GPL(blkg_conf_finish);
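
/*
 * Example usage (illustrative sketch; "example_policy" and the body parsing
 * are hypothetical).  A policy's cftype ->write handler typically brackets
 * its per-device configuration update with the two helpers above:
 *
 *	static ssize_t example_write(struct kernfs_open_file *of, char *buf,
 *				     size_t nbytes, loff_t off)
 *	{
 *		struct blkcg *blkcg = css_to_blkcg(of_css(of));
 *		struct blkg_conf_ctx ctx;
 *		int ret;
 *
 *		ret = blkg_conf_prep(blkcg, &example_policy, buf, &ctx);
 *		if (ret)
 *			return ret;
 *
 *		... parse ctx.body and update ctx.blkg->pd[example_policy.plid] ...
 *
 *		blkg_conf_finish(&ctx);
 *		return ret ?: nbytes;
 *	}
 */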

static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
{
	int i;

	for (i = 0; i < BLKG_IOSTAT_NR; i++) {
		dst->bytes[i] = src->bytes[i];
		dst->ios[i] = src->ios[i];
	}
}

static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src)
{
	int i;

	for (i = 0; i < BLKG_IOSTAT_NR; i++) {
		dst->bytes[i] += src->bytes[i];
		dst->ios[i] += src->ios[i];
	}
}

static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src)
{
	int i;

	for (i = 0; i < BLKG_IOSTAT_NR; i++) {
		dst->bytes[i] -= src->bytes[i];
		dst->ios[i] -= src->ios[i];
	}
}

static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	struct blkcg_gq *blkg;

	rcu_read_lock();

	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
		struct blkcg_gq *parent = blkg->parent;
		struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
		struct blkg_iostat cur, delta;
		unsigned int seq;

		/* fetch the current per-cpu values */
		do {
			seq = u64_stats_fetch_begin(&bisc->sync);
			blkg_iostat_set(&cur, &bisc->cur);
		} while (u64_stats_fetch_retry(&bisc->sync, seq));

		/* propagate percpu delta to global */
		u64_stats_update_begin(&blkg->iostat.sync);
		blkg_iostat_set(&delta, &cur);
		blkg_iostat_sub(&delta, &bisc->last);
		blkg_iostat_add(&blkg->iostat.cur, &delta);
		blkg_iostat_add(&bisc->last, &delta);
		u64_stats_update_end(&blkg->iostat.sync);

		/* propagate global delta to parent */
		if (parent) {
			u64_stats_update_begin(&parent->iostat.sync);
			blkg_iostat_set(&delta, &blkg->iostat.cur);
			blkg_iostat_sub(&delta, &blkg->iostat.last);
			blkg_iostat_add(&parent->iostat.cur, &delta);
			blkg_iostat_add(&blkg->iostat.last, &delta);
			u64_stats_update_end(&parent->iostat.sync);
		}
	}

	rcu_read_unlock();
}

/*
 * The rstat algorithms intentionally don't handle the root cgroup to avoid
 * incurring overhead when no cgroups are defined.  For that reason,
 * cgroup_rstat_flush() in blkcg_print_stat() does not actually fill out the
 * iostat in the root cgroup's blkcg_gq.
 *
 * However, we would like to re-use the printing code between the root and
 * non-root cgroups to the extent possible.  For that reason, we simulate
 * flushing the root cgroup's stats by explicitly filling in the iostat
 * with disk level statistics.
 */
static void blkcg_fill_root_iostats(void)
{
	struct class_dev_iter iter;
	struct device *dev;

	class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
	while ((dev = class_dev_iter_next(&iter))) {
		struct gendisk *disk = dev_to_disk(dev);
		struct hd_struct *part = disk_get_part(disk, 0);
		struct blkcg_gq *blkg = blk_queue_root_blkg(disk->queue);
		struct blkg_iostat tmp;
		int cpu;

		memset(&tmp, 0, sizeof(tmp));
		for_each_possible_cpu(cpu) {
			struct disk_stats *cpu_dkstats;

			cpu_dkstats = per_cpu_ptr(part->dkstats, cpu);
			tmp.ios[BLKG_IOSTAT_READ] +=
				cpu_dkstats->ios[STAT_READ];
			tmp.ios[BLKG_IOSTAT_WRITE] +=
				cpu_dkstats->ios[STAT_WRITE];
			tmp.ios[BLKG_IOSTAT_DISCARD] +=
				cpu_dkstats->ios[STAT_DISCARD];

			tmp.bytes[BLKG_IOSTAT_READ] +=
				cpu_dkstats->sectors[STAT_READ] << 9;
			tmp.bytes[BLKG_IOSTAT_WRITE] +=
				cpu_dkstats->sectors[STAT_WRITE] << 9;
			tmp.bytes[BLKG_IOSTAT_DISCARD] +=
				cpu_dkstats->sectors[STAT_DISCARD] << 9;

			u64_stats_update_begin(&blkg->iostat.sync);
			blkg_iostat_set(&blkg->iostat.cur, &tmp);
			u64_stats_update_end(&blkg->iostat.sync);
		}
	}
}

static int blkcg_print_stat(struct seq_file *sf, void *v)
{
	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
	struct blkcg_gq *blkg;

	if (!seq_css(sf)->parent)
		blkcg_fill_root_iostats();
	else
		cgroup_rstat_flush(blkcg->css.cgroup);

	rcu_read_lock();

	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
		struct blkg_iostat_set *bis = &blkg->iostat;
		const char *dname;
		char *buf;
		u64 rbytes, wbytes, rios, wios, dbytes, dios;
		size_t size = seq_get_buf(sf, &buf), off = 0;
		int i;
		bool has_stats = false;
		unsigned seq;

		spin_lock_irq(&blkg->q->queue_lock);

		if (!blkg->online)
			goto skip;

		dname = blkg_dev_name(blkg);
		if (!dname)
			goto skip;

		/*
		 * scnprintf() returns the number of bytes written NOT
		 * including the trailing \0, so @off tracks how much of the
		 * seq_file buffer has been consumed so far.
		 */
		off += scnprintf(buf+off, size-off, "%s ", dname);

		do {
			seq = u64_stats_fetch_begin(&bis->sync);

			rbytes = bis->cur.bytes[BLKG_IOSTAT_READ];
			wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE];
			dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD];
			rios = bis->cur.ios[BLKG_IOSTAT_READ];
			wios = bis->cur.ios[BLKG_IOSTAT_WRITE];
			dios = bis->cur.ios[BLKG_IOSTAT_DISCARD];
		} while (u64_stats_fetch_retry(&bis->sync, seq));

		if (rbytes || wbytes || rios || wios) {
			has_stats = true;
			off += scnprintf(buf+off, size-off,
					 "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu",
					 rbytes, wbytes, rios, wios,
					 dbytes, dios);
		}

		if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) {
			has_stats = true;
			off += scnprintf(buf+off, size-off,
					 " use_delay=%d delay_nsec=%llu",
					 atomic_read(&blkg->use_delay),
					 (unsigned long long)atomic64_read(&blkg->delay_nsec));
		}

		for (i = 0; i < BLKCG_MAX_POLS; i++) {
			struct blkcg_policy *pol = blkcg_policy[i];
			size_t written;

			if (!blkg->pd[i] || !pol->pd_stat_fn)
				continue;

			written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off);
			if (written)
				has_stats = true;
			off += written;
		}

		if (has_stats) {
			if (off < size - 1) {
				off += scnprintf(buf+off, size-off, "\n");
				seq_commit(sf, off);
			} else {
				seq_commit(sf, -1);
			}
		}
	skip:
		spin_unlock_irq(&blkg->q->queue_lock);
	}

	rcu_read_unlock();
	return 0;
}
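
/*
 * The resulting io.stat line for each device looks like the following
 * (values are made up for illustration; the use_delay/delay_nsec fields only
 * appear when blkcg_debug_stats is enabled):
 *
 *	8:16 rbytes=1459200 wbytes=314773504 rios=192 wios=353 dbytes=0 dios=0
 */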

static struct cftype blkcg_files[] = {
	{
		.name = "stat",
		.seq_show = blkcg_print_stat,
	},
	{ }	/* terminate */
};

static struct cftype blkcg_legacy_files[] = {
	{
		.name = "reset_stats",
		.write_u64 = blkcg_reset_stats,
	},
	{ }	/* terminate */
};

/*
 * blkcg destruction is a three-stage process.
 *
 * 1. Destruction starts.  The blkcg_css_offline() callback is invoked
 *    which offlines writeback.  Here we tie the next stage of blkg destruction
 *    to the completion of writeback associated with the blkcg.  This lets us
 *    avoid punting potentially large amounts of outstanding writeback to root
 *    while maintaining any ongoing policies.  The next stage is triggered when
 *    the nr_cgwbs count goes to zero.
 *
 * 2. When the nr_cgwbs count goes to zero, blkcg_destroy_blkgs() is called
 *    and handles the destruction of blkgs.  Here the css reference held by
 *    the blkg is put back eventually allowing blkcg_css_free() to be called.
 *    This work may occur in cgwb_release_workfn() on the cgwb_release
 *    workqueue.  Any submitted ios that fail to get the blkg ref will be
 *    punted to the root_blkg.
 *
 * 3. Once the blkcg ref count goes to zero, blkcg_css_free() is called
 *    which implies that no one is using the blkcg anymore and it is now
 *    safe to free.
 */
static void blkcg_css_offline(struct cgroup_subsys_state *css)
{
	struct blkcg *blkcg = css_to_blkcg(css);

	/* this prevents anyone from attaching or migrating to this blkcg */
	wb_blkcg_offline(blkcg);

	/* put the base online pin allowing step 2 to be triggered */
	blkcg_unpin_online(blkcg);
}

/**
 * blkcg_destroy_blkgs - responsible for shooting down blkgs
 * @blkcg: blkcg of interest
 *
 * blkgs should be removed while holding both q and blkcg locks.  As blkcg lock
 * is nested inside q lock, this function performs reverse double lock dancing.
 * Destroying the blkgs releases the reference held on the blkcg's css allowing
 * blkcg_css_free to eventually be called.
 *
 * This is the blkcg counterpart of ioc_release_fn().
 */
void blkcg_destroy_blkgs(struct blkcg *blkcg)
{
	spin_lock_irq(&blkcg->lock);

	while (!hlist_empty(&blkcg->blkg_list)) {
		struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
						    struct blkcg_gq, blkcg_node);
		struct request_queue *q = blkg->q;

		if (spin_trylock(&q->queue_lock)) {
			blkg_destroy(blkg);
			spin_unlock(&q->queue_lock);
		} else {
			spin_unlock_irq(&blkcg->lock);
			cpu_relax();
			spin_lock_irq(&blkcg->lock);
		}
	}

	spin_unlock_irq(&blkcg->lock);
}

static void blkcg_css_free(struct cgroup_subsys_state *css)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	int i;

	mutex_lock(&blkcg_pol_mutex);

	list_del(&blkcg->all_blkcgs_node);

	for (i = 0; i < BLKCG_MAX_POLS; i++)
		if (blkcg->cpd[i])
			blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);

	mutex_unlock(&blkcg_pol_mutex);

	kfree(blkcg);
}

static struct cgroup_subsys_state *
blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct blkcg *blkcg;
	struct cgroup_subsys_state *ret;
	int i;

	mutex_lock(&blkcg_pol_mutex);

	if (!parent_css) {
		blkcg = &blkcg_root;
	} else {
		blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
		if (!blkcg) {
			ret = ERR_PTR(-ENOMEM);
			goto unlock;
		}
	}

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];
		struct blkcg_policy_data *cpd;

		/*
		 * If the policy hasn't been attached yet, wait for it
		 * to be attached before doing anything else.  Otherwise,
		 * check if the policy requires any specific per-cgroup
		 * data: if it does, allocate and initialize it.
		 */
		if (!pol || !pol->cpd_alloc_fn)
			continue;

		cpd = pol->cpd_alloc_fn(GFP_KERNEL);
		if (!cpd) {
			ret = ERR_PTR(-ENOMEM);
			goto free_pd_blkcg;
		}
		blkcg->cpd[i] = cpd;
		cpd->blkcg = blkcg;
		cpd->plid = i;
		if (pol->cpd_init_fn)
			pol->cpd_init_fn(cpd);
	}

	spin_lock_init(&blkcg->lock);
	refcount_set(&blkcg->online_pin, 1);
	INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN);
	INIT_HLIST_HEAD(&blkcg->blkg_list);
#ifdef CONFIG_CGROUP_WRITEBACK
	INIT_LIST_HEAD(&blkcg->cgwb_list);
#endif
	list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs);

	mutex_unlock(&blkcg_pol_mutex);
	return &blkcg->css;

free_pd_blkcg:
	for (i--; i >= 0; i--)
		if (blkcg->cpd[i])
			blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);

	if (blkcg != &blkcg_root)
		kfree(blkcg);
unlock:
	mutex_unlock(&blkcg_pol_mutex);
	return ret;
}

static int blkcg_css_online(struct cgroup_subsys_state *css)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	struct blkcg *parent = blkcg_parent(blkcg);

	/*
	 * blkcg_pin_online() is used to delay blkcg offline so that blkgs
	 * don't go offline while cgwbs are still active on them.  Pin the
	 * parent so that offline always happens towards the root.
	 */
	if (parent)
		blkcg_pin_online(parent);
	return 0;
}

/**
 * blkcg_init_queue - initialize blkcg part of request queue
 * @q: request_queue to initialize
 *
 * Called from blk_alloc_queue().  Responsible for initializing blkcg
 * part of new request_queue @q.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int blkcg_init_queue(struct request_queue *q)
{
	struct blkcg_gq *new_blkg, *blkg;
	bool preloaded;
	int ret;

	new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
	if (!new_blkg)
		return -ENOMEM;

	preloaded = !radix_tree_preload(GFP_KERNEL);

	/* Make sure the root blkg exists. */
	rcu_read_lock();
	spin_lock_irq(&q->queue_lock);
	blkg = blkg_create(&blkcg_root, q, new_blkg);
	if (IS_ERR(blkg))
		goto err_unlock;
	q->root_blkg = blkg;
	spin_unlock_irq(&q->queue_lock);
	rcu_read_unlock();

	if (preloaded)
		radix_tree_preload_end();

	ret = blk_throtl_init(q);
	if (ret)
		goto err_destroy_all;

	ret = blk_iolatency_init(q);
	if (ret) {
		blk_throtl_exit(q);
		goto err_destroy_all;
	}
	return 0;

err_destroy_all:
	blkg_destroy_all(q);
	return ret;
err_unlock:
	spin_unlock_irq(&q->queue_lock);
	rcu_read_unlock();
	if (preloaded)
		radix_tree_preload_end();
	return PTR_ERR(blkg);
}

/**
 * blkcg_exit_queue - exit and release blkcg part of request_queue
 * @q: request_queue being released
 *
 * Tears down the blkcg part of @q when the queue is released.
 */
void blkcg_exit_queue(struct request_queue *q)
{
	blkg_destroy_all(q);
	blk_throtl_exit(q);
}

/*
 * We cannot support shared io contexts, as we have no means to support
 * two tasks with the same ioc in two different groups without major rework
 * of the main cic data structures.  For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 */
static int blkcg_can_attach(struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct cgroup_subsys_state *dst_css;
	struct io_context *ioc;
	int ret = 0;

	/* task_lock() is needed to avoid races with exit_io_context() */
	cgroup_taskset_for_each(task, dst_css, tset) {
		task_lock(task);
		ioc = task->io_context;
		if (ioc && atomic_read(&ioc->nr_tasks) > 1)
			ret = -EINVAL;
		task_unlock(task);
		if (ret)
			break;
	}
	return ret;
}

static void blkcg_bind(struct cgroup_subsys_state *root_css)
{
	int i;

	mutex_lock(&blkcg_pol_mutex);

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];
		struct blkcg *blkcg;

		if (!pol || !pol->cpd_bind_fn)
			continue;

		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node)
			if (blkcg->cpd[pol->plid])
				pol->cpd_bind_fn(blkcg->cpd[pol->plid]);
	}
	mutex_unlock(&blkcg_pol_mutex);
}

static void blkcg_exit(struct task_struct *tsk)
{
	if (tsk->throttle_queue)
		blk_put_queue(tsk->throttle_queue);
	tsk->throttle_queue = NULL;
}

struct cgroup_subsys io_cgrp_subsys = {
	.css_alloc = blkcg_css_alloc,
	.css_online = blkcg_css_online,
	.css_offline = blkcg_css_offline,
	.css_free = blkcg_css_free,
	.can_attach = blkcg_can_attach,
	.css_rstat_flush = blkcg_rstat_flush,
	.bind = blkcg_bind,
	.dfl_cftypes = blkcg_files,
	.legacy_cftypes = blkcg_legacy_files,
	.legacy_name = "blkio",
	.exit = blkcg_exit,
#ifdef CONFIG_MEMCG
	/*
	 * This ensures that, if available, memcg is automatically enabled
	 * together on the default hierarchy so that the owner cgroup can
	 * be retrieved from writeback pages.
	 */
	.depends_on = 1 << memory_cgrp_id,
#endif
};
EXPORT_SYMBOL_GPL(io_cgrp_subsys);

/**
 * blkcg_activate_policy - activate a blkcg policy on a request_queue
 * @q: request_queue of interest
 * @pol: blkcg policy to activate
 *
 * Activate @pol on @q.  Requires %GFP_KERNEL context.  If @q is an mq queue
 * it is frozen while its blkgs are populated with policy_data for @pol, so
 * nobody can be accessing the blkgs while they're being allocated and
 * activated.
 *
 * This function is to be used by blkcg policies.
 */
int blkcg_activate_policy(struct request_queue *q,
			  const struct blkcg_policy *pol)
{
	struct blkg_policy_data *pd_prealloc = NULL;
	struct blkcg_gq *blkg, *pinned_blkg = NULL;
	int ret;

	if (blkcg_policy_enabled(q, pol))
		return 0;

	if (queue_is_mq(q))
		blk_mq_freeze_queue(q);
retry:
	spin_lock_irq(&q->queue_lock);

	/* blkg_list is pushed at the head, reverse walk to allocate parents first */
	list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
		struct blkg_policy_data *pd;

		if (blkg->pd[pol->plid])
			continue;

		/* If prealloc matches, use it; otherwise try GFP_NOWAIT */
		if (blkg == pinned_blkg) {
			pd = pd_prealloc;
			pd_prealloc = NULL;
		} else {
			pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q,
					      blkg->blkcg);
		}

		if (!pd) {
			/*
			 * GFP_NOWAIT failed.  Free the existing one and
			 * prealloc for @blkg w/ GFP_KERNEL.
			 */
			if (pinned_blkg)
				blkg_put(pinned_blkg);
			blkg_get(blkg);
			pinned_blkg = blkg;

			spin_unlock_irq(&q->queue_lock);

			if (pd_prealloc)
				pol->pd_free_fn(pd_prealloc);
			pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q,
						       blkg->blkcg);
			if (pd_prealloc)
				goto retry;
			else
				goto enomem;
		}

		blkg->pd[pol->plid] = pd;
		pd->blkg = blkg;
		pd->plid = pol->plid;
	}

	/* all allocated, init in the same order */
	if (pol->pd_init_fn)
		list_for_each_entry_reverse(blkg, &q->blkg_list, q_node)
			pol->pd_init_fn(blkg->pd[pol->plid]);

	__set_bit(pol->plid, q->blkcg_pols);
	ret = 0;

	spin_unlock_irq(&q->queue_lock);
out:
	if (queue_is_mq(q))
		blk_mq_unfreeze_queue(q);
	if (pinned_blkg)
		blkg_put(pinned_blkg);
	if (pd_prealloc)
		pol->pd_free_fn(pd_prealloc);
	return ret;

enomem:
	/* alloc failed, nothing's initialized yet, free everything */
	spin_lock_irq(&q->queue_lock);
	list_for_each_entry(blkg, &q->blkg_list, q_node) {
		if (blkg->pd[pol->plid]) {
			pol->pd_free_fn(blkg->pd[pol->plid]);
			blkg->pd[pol->plid] = NULL;
		}
	}
	spin_unlock_irq(&q->queue_lock);
	ret = -ENOMEM;
	goto out;
}
EXPORT_SYMBOL_GPL(blkcg_activate_policy);

/**
 * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue
 * @q: request_queue of interest
 * @pol: blkcg policy to deactivate
 *
 * Deactivate @pol on @q.  Follows the same synchronization rules as
 * blkcg_activate_policy().
 */
void blkcg_deactivate_policy(struct request_queue *q,
			     const struct blkcg_policy *pol)
{
	struct blkcg_gq *blkg;

	if (!blkcg_policy_enabled(q, pol))
		return;

	if (queue_is_mq(q))
		blk_mq_freeze_queue(q);

	spin_lock_irq(&q->queue_lock);

	__clear_bit(pol->plid, q->blkcg_pols);

	list_for_each_entry(blkg, &q->blkg_list, q_node) {
		if (blkg->pd[pol->plid]) {
			if (pol->pd_offline_fn)
				pol->pd_offline_fn(blkg->pd[pol->plid]);
			pol->pd_free_fn(blkg->pd[pol->plid]);
			blkg->pd[pol->plid] = NULL;
		}
	}

	spin_unlock_irq(&q->queue_lock);

	if (queue_is_mq(q))
		blk_mq_unfreeze_queue(q);
}
EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
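
/*
 * Illustrative pairing (a sketch; "example_policy" is a placeholder, not a
 * symbol defined here): a policy activates itself on a queue when it is
 * first configured for that device and deactivates on teardown:
 *
 *	ret = blkcg_activate_policy(q, &example_policy);
 *	if (ret)
 *		return ret;
 *	...
 *	blkcg_deactivate_policy(q, &example_policy);
 *
 * blk-iolatency and blk-iocost follow this pattern from their per-queue
 * init and exit paths.
 */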

/**
 * blkcg_policy_register - register a blkcg policy
 * @pol: blkcg policy to register
 *
 * Register @pol with blkcg core.  Might sleep and @pol may be modified on
 * successful registration.  Returns 0 on success and -errno on failure.
 */
int blkcg_policy_register(struct blkcg_policy *pol)
{
	struct blkcg *blkcg;
	int i, ret;

	mutex_lock(&blkcg_pol_register_mutex);
	mutex_lock(&blkcg_pol_mutex);

	/* find an empty slot */
	ret = -ENOSPC;
	for (i = 0; i < BLKCG_MAX_POLS; i++)
		if (!blkcg_policy[i])
			break;
	if (i >= BLKCG_MAX_POLS) {
		pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n");
		goto err_unlock;
	}

	/* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs */
	if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) ||
	    (!pol->pd_alloc_fn ^ !pol->pd_free_fn))
		goto err_unlock;

	/* register @pol */
	pol->plid = i;
	blkcg_policy[pol->plid] = pol;

	/* allocate and install cpd's */
	if (pol->cpd_alloc_fn) {
		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
			struct blkcg_policy_data *cpd;

			cpd = pol->cpd_alloc_fn(GFP_KERNEL);
			if (!cpd)
				goto err_free_cpds;

			blkcg->cpd[pol->plid] = cpd;
			cpd->blkcg = blkcg;
			cpd->plid = pol->plid;
			if (pol->cpd_init_fn)
				pol->cpd_init_fn(cpd);
		}
	}

	mutex_unlock(&blkcg_pol_mutex);

	/* everything is in place, add the policy's interface files */
	if (pol->dfl_cftypes)
		WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys,
					       pol->dfl_cftypes));
	if (pol->legacy_cftypes)
		WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys,
						  pol->legacy_cftypes));
	mutex_unlock(&blkcg_pol_register_mutex);
	return 0;

err_free_cpds:
	if (pol->cpd_free_fn) {
		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
			if (blkcg->cpd[pol->plid]) {
				pol->cpd_free_fn(blkcg->cpd[pol->plid]);
				blkcg->cpd[pol->plid] = NULL;
			}
		}
	}
	blkcg_policy[pol->plid] = NULL;
err_unlock:
	mutex_unlock(&blkcg_pol_mutex);
	mutex_unlock(&blkcg_pol_register_mutex);
	return ret;
}
EXPORT_SYMBOL_GPL(blkcg_policy_register);
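
/*
 * Registration sketch (illustrative; "example_pd_alloc", "example_pd_free",
 * the cftype arrays and the init function are placeholders, not code in this
 * file).  A policy describes itself with a struct blkcg_policy and registers
 * it once at init time, typically from a module_init() or an initcall:
 *
 *	static struct blkcg_policy blkcg_policy_example = {
 *		.dfl_cftypes	= example_dfl_files,
 *		.legacy_cftypes	= example_legacy_files,
 *		.pd_alloc_fn	= example_pd_alloc,
 *		.pd_init_fn	= example_pd_init,
 *		.pd_free_fn	= example_pd_free,
 *	};
 *
 *	static int __init example_init(void)
 *	{
 *		return blkcg_policy_register(&blkcg_policy_example);
 *	}
 *
 * blkcg_policy_unregister() undoes the registration on module exit.
 */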

/**
 * blkcg_policy_unregister - unregister a blkcg policy
 * @pol: blkcg policy to unregister
 *
 * Undo blkcg_policy_register(@pol).  Might sleep.
 */
void blkcg_policy_unregister(struct blkcg_policy *pol)
{
	struct blkcg *blkcg;

	mutex_lock(&blkcg_pol_register_mutex);

	if (WARN_ON(blkcg_policy[pol->plid] != pol))
		goto out_unlock;

	/* kill the intf files first */
	if (pol->dfl_cftypes)
		cgroup_rm_cftypes(pol->dfl_cftypes);
	if (pol->legacy_cftypes)
		cgroup_rm_cftypes(pol->legacy_cftypes);

	/* remove cpds and unregister */
	mutex_lock(&blkcg_pol_mutex);

	if (pol->cpd_free_fn) {
		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
			if (blkcg->cpd[pol->plid]) {
				pol->cpd_free_fn(blkcg->cpd[pol->plid]);
				blkcg->cpd[pol->plid] = NULL;
			}
		}
	}
	blkcg_policy[pol->plid] = NULL;

	mutex_unlock(&blkcg_pol_mutex);
out_unlock:
	mutex_unlock(&blkcg_pol_register_mutex);
}
EXPORT_SYMBOL_GPL(blkcg_policy_unregister);

bool __blkcg_punt_bio_submit(struct bio *bio)
{
	struct blkcg_gq *blkg = bio->bi_blkg;

	/* consume the flag first */
	bio->bi_opf &= ~REQ_CGROUP_PUNT;

	/* never bounce for the root cgroup */
	if (!blkg->parent)
		return false;

	spin_lock_bh(&blkg->async_bio_lock);
	bio_list_add(&blkg->async_bios, bio);
	spin_unlock_bh(&blkg->async_bio_lock);

	queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
	return true;
}
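
/*
 * Callers never invoke __blkcg_punt_bio_submit() directly; submission goes
 * through the blkcg_punt_bio_submit() wrapper in blk-cgroup.h, which only
 * drops in here when REQ_CGROUP_PUNT is set on the bio.  A punted bio is
 * queued on blkg->async_bios and resubmitted from blkcg_punt_bio_wq, so the
 * IO is issued (and charged) from a worker tied to the owning blkg rather
 * than from the submitting context.
 */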

/*
 * Scale the accumulated delay based on how long it has been since we updated
 * the delay.  We only call this when we are adding delay, in case it's been a
 * while since we added delay, and when we are checking to see if we need to
 * delay a task, to account for any delays that may have occurred.
 */
static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
{
	u64 old = atomic64_read(&blkg->delay_start);

	/* negative use_delay, ignore */
	if (atomic_read(&blkg->use_delay) < 0)
		return;

	/*
	 * We only want to scale down every second.  The idea here is that we
	 * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain
	 * time window.  We only want to throttle tasks for recent delay that
	 * has occurred, in 1 second time windows since that's the maximum
	 * things can be throttled.  We save the current delay window in
	 * blkg->last_delay so we know what amount is still left to be charged
	 * to the blkg from this point onward.  blkg->last_use keeps track of
	 * the use_delay counter.  The idea is if we're unthrottling the blkg we
	 * are ok with whatever is happening now, and we can take away more of
	 * the accumulated delay as we've already throttled enough that
	 * everybody is happy with their IO latencies.
	 */
	if (time_before64(old + NSEC_PER_SEC, now) &&
	    atomic64_cmpxchg(&blkg->delay_start, old, now) == old) {
		u64 cur = atomic64_read(&blkg->delay_nsec);
		u64 sub = min_t(u64, blkg->last_delay, now - old);
		int cur_use = atomic_read(&blkg->use_delay);

		/*
		 * We've been unthrottled, subtract a larger chunk of our
		 * accumulated delay.
		 */
		if (cur_use < blkg->last_use)
			sub = max_t(u64, sub, blkg->last_delay >> 1);

		/*
		 * This shouldn't happen, but handle it anyway.  Our delay_nsec
		 * should only ever be growing except here where we subtract
		 * out min(last_delay, 1 second), but better safe than ending
		 * up with negative numbers.
		 */
		if (unlikely(cur < sub)) {
			atomic64_set(&blkg->delay_nsec, 0);
			blkg->last_delay = 0;
		} else {
			atomic64_sub(sub, &blkg->delay_nsec);
			blkg->last_delay = cur - sub;
		}
		blkg->last_use = cur_use;
	}
}

/*
 * This is called when we want to actually walk up the hierarchy and check to
 * see if we need to throttle, and then actually throttle if there is some
 * accumulated delay.  This should only be called upon return to user space so
 * we're not holding some lock that would induce a priority inversion.
 */
static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
{
	unsigned long pflags;
	u64 now = ktime_to_ns(ktime_get());
	u64 exp;
	u64 delay_nsec = 0;
	int tok;

	while (blkg->parent) {
		if (atomic_read(&blkg->use_delay)) {
			blkcg_scale_delay(blkg, now);
			delay_nsec = max_t(u64, delay_nsec,
					   atomic64_read(&blkg->delay_nsec));
		}
		blkg = blkg->parent;
	}

	if (!delay_nsec)
		return;

	/*
	 * Let's not sleep for all eternity if we've amassed a huge delay.
	 * Swapping or metadata IO can accumulate 10's of seconds worth of
	 * delay, and we want userspace to be able to do _something_, so cap
	 * the sleep at 250ms.  If there's still a large amount of delay left
	 * we'll end up back here on the next return to user space and sleep
	 * again.
	 */
	delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);

	if (use_memdelay)
		psi_memstall_enter(&pflags);

	exp = ktime_add_ns(now, delay_nsec);
	tok = io_schedule_prepare();
	do {
		__set_current_state(TASK_KILLABLE);
		if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS))
			break;
	} while (!fatal_signal_pending(current));
	io_schedule_finish(tok);

	if (use_memdelay)
		psi_memstall_leave(&pflags);
}

/**
 * blkcg_maybe_throttle_current - throttle the current task if it has been marked
 *
 * This is only called if we've been marked with set_notify_resume().  Obviously
 * we can be set_notify_resume() for reasons other than blkcg throttling, so we
 * check to see if current->throttle_queue is set and if not this doesn't do
 * anything.  This should only ever be called by the resume code, it's not meant
 * to be called by people willy-nilly as it will actually do the work to
 * throttle the task if it is setup for throttling.
 */
void blkcg_maybe_throttle_current(void)
{
	struct request_queue *q = current->throttle_queue;
	struct cgroup_subsys_state *css;
	struct blkcg *blkcg;
	struct blkcg_gq *blkg;
	bool use_memdelay = current->use_memdelay;

	if (!q)
		return;

	current->throttle_queue = NULL;
	current->use_memdelay = false;

	rcu_read_lock();
	css = kthread_blkcg();
	if (css)
		blkcg = css_to_blkcg(css);
	else
		blkcg = css_to_blkcg(task_css(current, io_cgrp_id));

	if (!blkcg)
		goto out;
	blkg = blkg_lookup(blkcg, q);
	if (!blkg)
		goto out;
	if (!blkg_tryget(blkg))
		goto out;
	rcu_read_unlock();

	blkcg_maybe_throttle_blkg(blkg, use_memdelay);
	blkg_put(blkg);
	blk_put_queue(q);
	return;
out:
	rcu_read_unlock();
	blk_put_queue(q);
}

/**
 * blkcg_schedule_throttle - this task needs to check for throttling
 * @q: the request queue IO was submitted on
 * @use_memdelay: do we charge this to memory delay for PSI
 *
 * This is called by the IO controller when we know there's delay accumulated
 * for the blkg for this task.  We do not pass the blkg because there are places
 * we call this that may not have that information, the swapping code for
 * instance will only have a request_queue at that point.  This sets the
 * notify_resume for the task to check and see if it requires throttling before
 * returning to user space.
 *
 * We will only schedule once per syscall.  You can call this over and over
 * again and it will only do the check once upon return to user space, and only
 * throttle once.  If the task needs to be throttled again it'll need to be
 * rescheduled again.
 */
void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
{
	if (unlikely(current->flags & PF_KTHREAD))
		return;

	if (!blk_get_queue(q))
		return;

	if (current->throttle_queue)
		blk_put_queue(current->throttle_queue);
	current->throttle_queue = q;
	if (use_memdelay)
		current->use_memdelay = use_memdelay;
	set_notify_resume(current);
}

/**
 * blkcg_add_delay - add delay to this blkg
 * @blkg: blkg of interest
 * @now: the current time in nanoseconds
 * @delta: how many nanoseconds of delay to add
 *
 * Charge @delta to the blkg's current delay accumulation.  This is used to
 * throttle tasks if an IO controller thinks we need more throttling.
 */
void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
{
	if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0))
		return;
	blkcg_scale_delay(blkg, now);
	atomic64_add(delta, &blkg->delay_nsec);
}
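
/*
 * Sketch of how an IO controller combines the delay machinery above (the
 * surrounding logic and the 2 ms figure are illustrative, not taken from a
 * real policy): when it detects that a cgroup has exceeded its latency
 * target it charges delay to the blkg and asks the offending task to
 * throttle itself on its way back to user space:
 *
 *	u64 now = ktime_to_ns(ktime_get());
 *
 *	blkcg_use_delay(blkg);
 *	blkcg_add_delay(blkg, now, 2 * NSEC_PER_MSEC);
 *	blkcg_schedule_throttle(blkg->q, true);
 *
 * The actual sleep then happens in blkcg_maybe_throttle_current(), invoked
 * from the resume-to-userspace path.
 */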

/**
 * blkg_tryget_closest - try and get a blkg ref on the closest blkg
 * @bio: target bio
 * @css: target css
 *
 * As the failure mode here is to walk up the blkg tree, this ensures that the
 * blkg->parent pointers are always valid.  This returns the blkg that it ended
 * up taking a reference on or %NULL if no reference was taken.
 */
static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio,
						   struct cgroup_subsys_state *css)
{
	struct blkcg_gq *blkg, *ret_blkg = NULL;

	rcu_read_lock();
	blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_disk->queue);
	while (blkg) {
		if (blkg_tryget(blkg)) {
			ret_blkg = blkg;
			break;
		}
		blkg = blkg->parent;
	}
	rcu_read_unlock();

	return ret_blkg;
}

/**
 * bio_associate_blkg_from_css - associate a bio with a specified css
 * @bio: target bio
 * @css: target css
 *
 * Associate @bio with the blkg found by combining the css's blkg and the
 * request_queue of the @bio.  An association failure is handled by walking up
 * the blkg tree.  Therefore, the blkg associated can be anything between the
 * requested blkg and the root_blkg.  This situation only happens when a cgroup
 * is dying and then the remaining bios will spill to the closest alive blkg.
 *
 * A reference will be taken on the blkg and will be released when @bio is
 * freed.
 */
void bio_associate_blkg_from_css(struct bio *bio,
				 struct cgroup_subsys_state *css)
{
	if (bio->bi_blkg)
		blkg_put(bio->bi_blkg);

	if (css && css->parent) {
		bio->bi_blkg = blkg_tryget_closest(bio, css);
	} else {
		blkg_get(bio->bi_disk->queue->root_blkg);
		bio->bi_blkg = bio->bi_disk->queue->root_blkg;
	}
}
EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);

/**
 * bio_associate_blkg - associate a bio with a blkg
 * @bio: target bio
 *
 * Associate @bio with the blkg found from the bio's css and request_queue.
 * If one is not found, the blkg is created from the bio's css and the
 * request_queue.  If a blkg is already associated, the css is reused and
 * association redone as the request_queue may have changed.
 */
void bio_associate_blkg(struct bio *bio)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();

	if (bio->bi_blkg)
		css = &bio_blkcg(bio)->css;
	else
		css = blkcg_css();

	bio_associate_blkg_from_css(bio, css);

	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(bio_associate_blkg);
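
/*
 * Submission-path sketch (illustrative; the bio setup shown is a placeholder
 * for whatever the caller does): a bio is normally tied to the submitting
 * task's cgroup right after it has been allocated and set up, before
 * submit_bio():
 *
 *	bio_set_dev(bio, bdev);
 *	bio->bi_opf = REQ_OP_WRITE;
 *	bio_associate_blkg(bio);
 *	submit_bio(bio);
 *
 * Cgroup writeback instead associates with the page owner's cgroup by
 * passing that css explicitly to bio_associate_blkg_from_css() (see
 * wbc_init_bio() in linux/writeback.h).
 */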

/**
 * bio_clone_blkg_association - clone blkg association from src to dst bio
 * @dst: destination bio
 * @src: source bio
 */
void bio_clone_blkg_association(struct bio *dst, struct bio *src)
{
	if (src->bi_blkg) {
		if (dst->bi_blkg)
			blkg_put(dst->bi_blkg);
		blkg_get(src->bi_blkg);
		dst->bi_blkg = src->bi_blkg;
	}
}
EXPORT_SYMBOL_GPL(bio_clone_blkg_association);

static int blk_cgroup_io_type(struct bio *bio)
{
	if (op_is_discard(bio->bi_opf))
		return BLKG_IOSTAT_DISCARD;
	if (op_is_write(bio->bi_opf))
		return BLKG_IOSTAT_WRITE;
	return BLKG_IOSTAT_READ;
}

void blk_cgroup_bio_start(struct bio *bio)
{
	int rwd = blk_cgroup_io_type(bio), cpu;
	struct blkg_iostat_set *bis;

	cpu = get_cpu();
	bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu);
	u64_stats_update_begin(&bis->sync);

	/*
	 * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split
	 * bio and we would have already accounted for the size of the bio.
	 */
	if (!bio_flagged(bio, BIO_CGROUP_ACCT)) {
		bio_set_flag(bio, BIO_CGROUP_ACCT);
		bis->cur.bytes[rwd] += bio->bi_iter.bi_size;
	}
	bis->cur.ios[rwd]++;

	u64_stats_update_end(&bis->sync);
	if (cgroup_subsys_on_dfl(io_cgrp_subsys))
		cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu);
	put_cpu();
}

static int __init blkcg_init(void)
{
	blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",
					    WQ_MEM_RECLAIM | WQ_FREEZABLE |
					    WQ_UNBOUND | WQ_SYSFS, 0);
	if (!blkcg_punt_bio_wq)
		return -ENOMEM;
	return 0;
}
subsys_initcall(blkcg_init);

module_param(blkcg_debug_stats, bool, 0644);
MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");