// SPDX-License-Identifier: GPL-2.0
/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe
 *
 * Copyright (C) 2008 Fabio Checconi and Paolo Valente
 *
 * Copyright (C) 2009 Vivek Goyal and Nauman Rafique
 *
 * For policy-specific per-blkcg data:
 * Copyright (C) 2015 Paolo Valente and Arianna Avanzini
 */
#include <linux/ioprio.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/sched/signal.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/ctype.h>
#include <linux/blk-cgroup.h>
#include <linux/tracehook.h>
#include <linux/psi.h>
#include "blk.h"

#define MAX_KEY_LEN 100

/*
 * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
 * blkcg_pol_register_mutex nests outside of it and synchronizes entire
 * policy [un]registration including the cgroup file additions and
 * removals.  Putting cgroup file registration outside blkcg_pol_mutex
 * allows grabbing it from cgroup callbacks.
 */
static DEFINE_MUTEX(blkcg_pol_register_mutex);
static DEFINE_MUTEX(blkcg_pol_mutex);

struct blkcg blkcg_root;
EXPORT_SYMBOL_GPL(blkcg_root);

struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
EXPORT_SYMBOL_GPL(blkcg_root_css);

static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];

static LIST_HEAD(all_blkcgs);

bool blkcg_debug_stats = false;
static struct workqueue_struct *blkcg_punt_bio_wq;

static bool blkcg_policy_enabled(struct request_queue *q,
				 const struct blkcg_policy *pol)
{
	return pol && test_bit(pol->plid, q->blkcg_pols);
}

/**
 * blkg_free - free a blkg
 * @blkg: blkg to free
 *
 * Free @blkg which may be partially allocated.
 */
static void blkg_free(struct blkcg_gq *blkg)
{
	int i;

	if (!blkg)
		return;

	for (i = 0; i < BLKCG_MAX_POLS; i++)
		if (blkg->pd[i])
			blkcg_policy[i]->pd_free_fn(blkg->pd[i]);

	free_percpu(blkg->iostat_cpu);
	percpu_ref_exit(&blkg->refcnt);
	kfree(blkg);
}

static void __blkg_release(struct rcu_head *rcu)
{
	struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);

	WARN_ON(!bio_list_empty(&blkg->async_bios));

	/* release the blkcg and parent blkg refs this blkg has been holding */
	css_put(&blkg->blkcg->css);
	if (blkg->parent)
		blkg_put(blkg->parent);
	blkg_free(blkg);
}

/*
 * A group is RCU protected, but having an rcu lock does not mean that one
 * can access all the fields of blkg and assume these are valid.  For
 * example, don't try to follow throtl_data and request queue links.
 *
 * Having a reference to blkg under an rcu allows accesses to only values
 * local to groups like group stats and group rate limits.
 */
static void blkg_release(struct percpu_ref *ref)
{
	struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt);

	call_rcu(&blkg->rcu_head, __blkg_release);
}

static void blkg_async_bio_workfn(struct work_struct *work)
{
	struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
					     async_bio_work);
	struct bio_list bios = BIO_EMPTY_LIST;
	struct bio *bio;
	struct blk_plug plug;
	bool need_plug = false;

	/* as long as there are pending bios, @blkg can't go away */
	spin_lock_bh(&blkg->async_bio_lock);
	bio_list_merge(&bios, &blkg->async_bios);
	bio_list_init(&blkg->async_bios);
	spin_unlock_bh(&blkg->async_bio_lock);

	/* start plug only when bio_list contains at least 2 bios */
	if (bios.head && bios.head->bi_next) {
		need_plug = true;
		blk_start_plug(&plug);
	}
	while ((bio = bio_list_pop(&bios)))
		submit_bio(bio);
	if (need_plug)
		blk_finish_plug(&plug);
}

/**
 * blkg_alloc - allocate a blkg
 * @blkcg: block cgroup the new blkg is associated with
 * @q: request_queue the new blkg is associated with
 * @gfp_mask: allocation mask to use
 *
 * Allocate a new blkg associating @blkcg and @q.
 */
static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
				   gfp_t gfp_mask)
{
	struct blkcg_gq *blkg;
	int i, cpu;

	/* alloc and init base part */
	blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node);
	if (!blkg)
		return NULL;

	if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask))
		goto err_free;

	blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask);
	if (!blkg->iostat_cpu)
		goto err_free;

	blkg->q = q;
	INIT_LIST_HEAD(&blkg->q_node);
	spin_lock_init(&blkg->async_bio_lock);
	bio_list_init(&blkg->async_bios);
	INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
	blkg->blkcg = blkcg;

	u64_stats_init(&blkg->iostat.sync);
	for_each_possible_cpu(cpu)
		u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync);

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];
		struct blkg_policy_data *pd;

		if (!blkcg_policy_enabled(q, pol))
			continue;

		/* alloc per-policy data and attach it to blkg */
		pd = pol->pd_alloc_fn(gfp_mask, q, blkcg);
		if (!pd)
			goto err_free;

		blkg->pd[i] = pd;
		pd->blkg = blkg;
		pd->plid = i;
	}

	return blkg;

err_free:
	blkg_free(blkg);
	return NULL;
}

struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
				      struct request_queue *q, bool update_hint)
{
	struct blkcg_gq *blkg;

	/*
	 * Hint didn't match.  Look up from the radix tree.  Note that the
	 * hint can only be updated under queue_lock as otherwise @blkg
	 * could have already been removed from blkg_tree.  The caller is
	 * responsible for grabbing queue_lock if @update_hint.
	 */
	blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
	if (blkg && blkg->q == q) {
		if (update_hint) {
			lockdep_assert_held(&q->queue_lock);
			rcu_assign_pointer(blkcg->blkg_hint, blkg);
		}
		return blkg;
	}

	return NULL;
}
EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);

/*
 * If @new_blkg is %NULL, this function tries to allocate a new one as
 * necessary using %GFP_NOWAIT.  @new_blkg is always consumed on return.
 */
static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
				    struct request_queue *q,
				    struct blkcg_gq *new_blkg)
{
	struct blkcg_gq *blkg;
	int i, ret;

	WARN_ON_ONCE(!rcu_read_lock_held());
	lockdep_assert_held(&q->queue_lock);

	/* request_queue is dying, do not create/recreate a blkg */
	if (blk_queue_dying(q)) {
		ret = -ENODEV;
		goto err_free_blkg;
	}

	/* blkg holds a reference to blkcg */
	if (!css_tryget_online(&blkcg->css)) {
		ret = -ENODEV;
		goto err_free_blkg;
	}

	/* allocate */
	if (!new_blkg) {
		new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN);
		if (unlikely(!new_blkg)) {
			ret = -ENOMEM;
			goto err_put_css;
		}
	}
	blkg = new_blkg;

	/* link parent */
	if (blkcg_parent(blkcg)) {
		blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
		if (WARN_ON_ONCE(!blkg->parent)) {
			ret = -ENODEV;
			goto err_put_css;
		}
		blkg_get(blkg->parent);
	}

	/* invoke per-policy init */
	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];

		if (blkg->pd[i] && pol->pd_init_fn)
			pol->pd_init_fn(blkg->pd[i]);
	}

	/* insert */
	spin_lock(&blkcg->lock);
	ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
	if (likely(!ret)) {
		hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
		list_add(&blkg->q_node, &q->blkg_list);

		for (i = 0; i < BLKCG_MAX_POLS; i++) {
			struct blkcg_policy *pol = blkcg_policy[i];

			if (blkg->pd[i] && pol->pd_online_fn)
				pol->pd_online_fn(blkg->pd[i]);
		}
	}
	blkg->online = true;
	spin_unlock(&blkcg->lock);

	if (!ret)
		return blkg;

	/* @blkg failed fully initialized, use the usual release path */
	blkg_put(blkg);
	return ERR_PTR(ret);

err_put_css:
	css_put(&blkcg->css);
err_free_blkg:
	blkg_free(new_blkg);
	return ERR_PTR(ret);
}

/**
 * blkg_lookup_create - lookup blkg, try to create one if not there
 * @blkcg: blkcg of interest
 * @q: request_queue of interest
 *
 * Lookup blkg for the @blkcg - @q pair.  If it doesn't exist, try to
 * create one.  blkg creation is performed recursively from blkcg_root such
 * that all non-root blkgs have access to their parents.  This function
 * should be called under RCU read lock and takes @q->queue_lock.
 *
 * Returns the blkg or the closest blkg if blkg_create() fails as it walks
 * down from root.
 */
static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
					   struct request_queue *q)
{
	struct blkcg_gq *blkg;
	unsigned long flags;

	WARN_ON_ONCE(!rcu_read_lock_held());

	blkg = blkg_lookup(blkcg, q);
	if (blkg)
		return blkg;

	spin_lock_irqsave(&q->queue_lock, flags);
	blkg = __blkg_lookup(blkcg, q, true);
	if (blkg)
		goto found;

	/*
	 * Create blkgs walking down from blkcg_root to @blkcg, so that all
	 * non-root blkgs have access to their parents.  Returns the closest
	 * blkg to the intended blkg should blkg_create() fail.
	 */
	while (true) {
		struct blkcg *pos = blkcg;
		struct blkcg *parent = blkcg_parent(blkcg);
		struct blkcg_gq *ret_blkg = q->root_blkg;

		while (parent) {
			blkg = __blkg_lookup(parent, q, false);
			if (blkg) {
				/* remember closest blkg */
				ret_blkg = blkg;
				break;
			}
			pos = parent;
			parent = blkcg_parent(parent);
		}

		blkg = blkg_create(pos, q, NULL);
		if (IS_ERR(blkg)) {
			blkg = ret_blkg;
			break;
		}
		if (pos == blkcg)
			break;
	}

found:
	spin_unlock_irqrestore(&q->queue_lock, flags);
	return blkg;
}

static void blkg_destroy(struct blkcg_gq *blkg)
{
	struct blkcg *blkcg = blkg->blkcg;
	int i;

	lockdep_assert_held(&blkg->q->queue_lock);
	lockdep_assert_held(&blkcg->lock);

	/* Something wrong if we are trying to remove same group twice */
	WARN_ON_ONCE(list_empty(&blkg->q_node));
	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];

		if (blkg->pd[i] && pol->pd_offline_fn)
			pol->pd_offline_fn(blkg->pd[i]);
	}

	blkg->online = false;

	radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
	list_del_init(&blkg->q_node);
	hlist_del_init_rcu(&blkg->blkcg_node);

	/*
	 * Both setting lookup hint to and clearing it from @blkg are done
	 * under queue_lock.  If it's not pointing to @blkg now, it never
	 * will.  Hint assignment itself can race safely.
	 */
	if (rcu_access_pointer(blkcg->blkg_hint) == blkg)
		rcu_assign_pointer(blkcg->blkg_hint, NULL);

	/*
	 * Put the reference taken at the time of creation so that when all
	 * queues are gone, group can be destroyed.
	 */
	percpu_ref_kill(&blkg->refcnt);
}

/**
 * blkg_destroy_all - destroy all blkgs associated with a request_queue
 * @q: request_queue of interest
 *
 * Destroy all blkgs associated with @q.
 */
static void blkg_destroy_all(struct request_queue *q)
{
	struct blkcg_gq *blkg, *n;

	spin_lock_irq(&q->queue_lock);
	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
		struct blkcg *blkcg = blkg->blkcg;

		spin_lock(&blkcg->lock);
		blkg_destroy(blkg);
		spin_unlock(&blkcg->lock);
	}

	q->root_blkg = NULL;
	spin_unlock_irq(&q->queue_lock);
}

static int blkcg_reset_stats(struct cgroup_subsys_state *css,
			     struct cftype *cftype, u64 val)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	struct blkcg_gq *blkg;
	int i, cpu;

	mutex_lock(&blkcg_pol_mutex);
	spin_lock_irq(&blkcg->lock);

	/*
	 * Note that stat reset is racy - it doesn't synchronize against
	 * stat updates.  This is a debug feature which shouldn't exist
	 * anyway.  If you get hit by a race, retry.
	 */
	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
		for_each_possible_cpu(cpu) {
			struct blkg_iostat_set *bis =
				per_cpu_ptr(blkg->iostat_cpu, cpu);
			memset(bis, 0, sizeof(*bis));
		}
		memset(&blkg->iostat, 0, sizeof(blkg->iostat));

		for (i = 0; i < BLKCG_MAX_POLS; i++) {
			struct blkcg_policy *pol = blkcg_policy[i];

			if (blkg->pd[i] && pol->pd_reset_stats_fn)
				pol->pd_reset_stats_fn(blkg->pd[i]);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	mutex_unlock(&blkcg_pol_mutex);
	return 0;
}

const char *blkg_dev_name(struct blkcg_gq *blkg)
{
	/* some drivers (floppy) instantiate a queue w/o disk registered */
	if (blkg->q->backing_dev_info->dev)
		return bdi_dev_name(blkg->q->backing_dev_info);
	return NULL;
}

/**
 * blkcg_print_blkgs - helper for printing per-blkg data
 * @sf: seq_file to print to
 * @blkcg: blkcg of interest
 * @prfill: fill function to print out a blkg
 * @pol: policy in question
 * @data: data to be passed to @prfill
 * @show_total: to print out sum of prfill return values or not
 *
 * This function invokes @prfill on each blkg of @blkcg if pd for the
 * policy specified by @pol exists.  @prfill is invoked with @sf, the
 * policy data and @data.  If @show_total is %true, the sum of the return
 * values from @prfill is printed with "Total" label at the end.
 *
 * This is to be used to construct any kind of hierarchical blkg statistics.
 */
void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
		       u64 (*prfill)(struct seq_file *,
				     struct blkg_policy_data *, int),
		       const struct blkcg_policy *pol, int data,
		       bool show_total)
{
	struct blkcg_gq *blkg;
	u64 total = 0;

	rcu_read_lock();
	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
		spin_lock_irq(&blkg->q->queue_lock);
		if (blkcg_policy_enabled(blkg->q, pol))
			total += prfill(sf, blkg->pd[pol->plid], data);
		spin_unlock_irq(&blkg->q->queue_lock);
	}
	rcu_read_unlock();

	if (show_total)
		seq_printf(sf, "Total %llu\n", (unsigned long long)total);
}
EXPORT_SYMBOL_GPL(blkcg_print_blkgs);

/**
 * __blkg_prfill_u64 - prfill helper for a single u64 value
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @v: value to print
 *
 * Print @v to @sf for the device associated with @pd.
 */
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
{
	const char *dname = blkg_dev_name(pd->blkg);

	if (!dname)
		return 0;

	seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
	return v;
}
EXPORT_SYMBOL_GPL(__blkg_prfill_u64);

/* Performs queue bypass and policy enabled checks then looks up blkg. */
static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
					  const struct blkcg_policy *pol,
					  struct request_queue *q)
{
	WARN_ON_ONCE(!rcu_read_lock_held());
	lockdep_assert_held(&q->queue_lock);

	if (!blkcg_policy_enabled(q, pol))
		return ERR_PTR(-EOPNOTSUPP);
	return __blkg_lookup(blkcg, q, true /* update_hint */);
}

/**
 * blkcg_conf_get_disk - parse and get bdev from a cgroup config input string
 * @inputp: input string pointer
 *
 * Parse the device node prefix part, MAJ:MIN, of per-blkg config update
 * from @input and get and return the matching gendisk.  *@inputp is
 * updated to point past the device node prefix.  Returns an ERR_PTR()
 * value on error.
 *
 * Use this function iff blkg_conf_prep() can't be used for some reason.
 */
struct gendisk *blkcg_conf_get_disk(char **inputp)
{
	char *input = *inputp;
	unsigned int major, minor;
	struct gendisk *disk;
	int key_len, part;

	if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
		return ERR_PTR(-EINVAL);

	input += key_len;
	if (!isspace(*input))
		return ERR_PTR(-EINVAL);
	input = skip_spaces(input);

	disk = get_gendisk(MKDEV(major, minor), &part);
	if (!disk)
		return ERR_PTR(-ENODEV);
	if (part) {
		put_disk_and_module(disk);
		return ERR_PTR(-ENODEV);
	}

	*inputp = input;
	return disk;
}

/**
 * blkg_conf_prep - parse and prepare for per-blkg config update
 * @blkcg: target block cgroup
 * @pol: target policy
 * @input: input string
 * @ctx: blkg_conf_ctx to be filled
 *
 * Parse per-blkg config update from @input and initialize @ctx with the
 * result.  @ctx->blkg points to the blkg to be updated and @ctx->body the
 * part of @input following MAJ:MIN.  This function returns with RCU read
 * lock and queue lock held and must be paired with blkg_conf_finish().
 */
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
		   char *input, struct blkg_conf_ctx *ctx)
	__acquires(rcu) __acquires(&disk->queue->queue_lock)
{
	struct gendisk *disk;
	struct request_queue *q;
	struct blkcg_gq *blkg;
	int ret;

	disk = blkcg_conf_get_disk(&input);
	if (IS_ERR(disk))
		return PTR_ERR(disk);

	q = disk->queue;

	rcu_read_lock();
	spin_lock_irq(&q->queue_lock);

	blkg = blkg_lookup_check(blkcg, pol, q);
	if (IS_ERR(blkg)) {
		ret = PTR_ERR(blkg);
		goto fail_unlock;
	}

	if (blkg)
		goto success;

	/*
	 * Create blkgs walking down from blkcg_root to @blkcg, so that all
	 * non-root blkgs have access to their parents.
	 */
	while (true) {
		struct blkcg *pos = blkcg;
		struct blkcg *parent;
		struct blkcg_gq *new_blkg;

		parent = blkcg_parent(blkcg);
		while (parent && !__blkg_lookup(parent, q, false)) {
			pos = parent;
			parent = blkcg_parent(parent);
		}

		/* Drop locks to do new blkg allocation with GFP_KERNEL. */
		spin_unlock_irq(&q->queue_lock);
		rcu_read_unlock();

		new_blkg = blkg_alloc(pos, q, GFP_KERNEL);
		if (unlikely(!new_blkg)) {
			ret = -ENOMEM;
			goto fail;
		}

		if (radix_tree_preload(GFP_KERNEL)) {
			blkg_free(new_blkg);
			ret = -ENOMEM;
			goto fail;
		}

		rcu_read_lock();
		spin_lock_irq(&q->queue_lock);

		blkg = blkg_lookup_check(pos, pol, q);
		if (IS_ERR(blkg)) {
			ret = PTR_ERR(blkg);
			blkg_free(new_blkg);
			goto fail_preloaded;
		}

		if (blkg) {
			blkg_free(new_blkg);
		} else {
			blkg = blkg_create(pos, q, new_blkg);
			if (IS_ERR(blkg)) {
				ret = PTR_ERR(blkg);
				goto fail_preloaded;
			}
		}

		radix_tree_preload_end();

		if (pos == blkcg)
			goto success;
	}
success:
	ctx->disk = disk;
	ctx->blkg = blkg;
	ctx->body = input;
	return 0;

fail_preloaded:
	radix_tree_preload_end();
fail_unlock:
	spin_unlock_irq(&q->queue_lock);
	rcu_read_unlock();
fail:
	put_disk_and_module(disk);
	/*
	 * If queue was bypassing, we should retry.  Do so after a short
	 * msleep().  It isn't strictly necessary but queue can be bypassing
	 * for some time and it's always nice to avoid busy looping.
	 */
	if (ret == -EBUSY) {
		msleep(10);
		ret = restart_syscall();
	}
	return ret;
}
EXPORT_SYMBOL_GPL(blkg_conf_prep);

/**
 * blkg_conf_finish - finish up per-blkg config update
 * @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
 *
 * Finish up after per-blkg config update.  This function must be paired
 * with blkg_conf_prep().
 */
void blkg_conf_finish(struct blkg_conf_ctx *ctx)
	__releases(&ctx->disk->queue->queue_lock) __releases(rcu)
{
	spin_unlock_irq(&ctx->disk->queue->queue_lock);
	rcu_read_unlock();
	put_disk_and_module(ctx->disk);
}
EXPORT_SYMBOL_GPL(blkg_conf_finish);

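/*
 * Helpers below copy, accumulate and subtract blkg_iostat counters.  They
 * are used when folding the per-cpu counters into a blkg's totals and when
 * propagating deltas up the blkcg hierarchy.
 */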
static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
{
	int i;

	for (i = 0; i < BLKG_IOSTAT_NR; i++) {
		dst->bytes[i] = src->bytes[i];
		dst->ios[i] = src->ios[i];
	}
}

static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src)
{
	int i;

	for (i = 0; i < BLKG_IOSTAT_NR; i++) {
		dst->bytes[i] += src->bytes[i];
		dst->ios[i] += src->ios[i];
	}
}

static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src)
{
	int i;

	for (i = 0; i < BLKG_IOSTAT_NR; i++) {
		dst->bytes[i] -= src->bytes[i];
		dst->ios[i] -= src->ios[i];
	}
}

static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	struct blkcg_gq *blkg;

	rcu_read_lock();

	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
		struct blkcg_gq *parent = blkg->parent;
		struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
		struct blkg_iostat cur, delta;
		unsigned int seq;

		/* fetch the current per-cpu values */
		do {
			seq = u64_stats_fetch_begin(&bisc->sync);
			blkg_iostat_set(&cur, &bisc->cur);
		} while (u64_stats_fetch_retry(&bisc->sync, seq));

		/* propagate percpu delta to global */
		u64_stats_update_begin(&blkg->iostat.sync);
		blkg_iostat_set(&delta, &cur);
		blkg_iostat_sub(&delta, &bisc->last);
		blkg_iostat_add(&blkg->iostat.cur, &delta);
		blkg_iostat_add(&bisc->last, &delta);
		u64_stats_update_end(&blkg->iostat.sync);

		/* propagate global delta to parent */
		if (parent) {
			u64_stats_update_begin(&parent->iostat.sync);
			blkg_iostat_set(&delta, &blkg->iostat.cur);
			blkg_iostat_sub(&delta, &blkg->iostat.last);
			blkg_iostat_add(&parent->iostat.cur, &delta);
			blkg_iostat_add(&blkg->iostat.last, &delta);
			u64_stats_update_end(&parent->iostat.sync);
		}
	}

	rcu_read_unlock();
}

/*
 * We source root cgroup stats from the system-wide stats to avoid
 * tracking the same information twice and incurring overhead when no
 * cgroups are defined.  For that reason, cgroup_rstat_flush in
 * blkcg_print_stat does not actually fill out the iostat in the root
 * cgroup's blkcg_gq.
 *
 * However, we would like to re-use the printing code between the root and
 * non-root cgroups to the extent possible.  For that reason, we simulate
 * flushing the root cgroup's stats by explicitly filling in the iostat
 * with disk level statistics.
 */
static void blkcg_fill_root_iostats(void)
{
	struct class_dev_iter iter;
	struct device *dev;

	class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
	while ((dev = class_dev_iter_next(&iter))) {
		struct gendisk *disk = dev_to_disk(dev);
		struct hd_struct *part = disk_get_part(disk, 0);
		struct blkcg_gq *blkg = blk_queue_root_blkg(disk->queue);
		struct blkg_iostat tmp;
		int cpu;

		memset(&tmp, 0, sizeof(tmp));
		for_each_possible_cpu(cpu) {
			struct disk_stats *cpu_dkstats;

			cpu_dkstats = per_cpu_ptr(part->dkstats, cpu);
			tmp.ios[BLKG_IOSTAT_READ] +=
				cpu_dkstats->ios[STAT_READ];
			tmp.ios[BLKG_IOSTAT_WRITE] +=
				cpu_dkstats->ios[STAT_WRITE];
			tmp.ios[BLKG_IOSTAT_DISCARD] +=
				cpu_dkstats->ios[STAT_DISCARD];

			tmp.bytes[BLKG_IOSTAT_READ] +=
				cpu_dkstats->sectors[STAT_READ] << 9;
			tmp.bytes[BLKG_IOSTAT_WRITE] +=
				cpu_dkstats->sectors[STAT_WRITE] << 9;
			tmp.bytes[BLKG_IOSTAT_DISCARD] +=
				cpu_dkstats->sectors[STAT_DISCARD] << 9;

			u64_stats_update_begin(&blkg->iostat.sync);
			blkg_iostat_set(&blkg->iostat.cur, &tmp);
			u64_stats_update_end(&blkg->iostat.sync);
		}
		disk_put_part(part);
	}
}

static int blkcg_print_stat(struct seq_file *sf, void *v)
{
	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
	struct blkcg_gq *blkg;

	if (!seq_css(sf)->parent)
		blkcg_fill_root_iostats();
	else
		cgroup_rstat_flush(blkcg->css.cgroup);

	rcu_read_lock();

	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
		struct blkg_iostat_set *bis = &blkg->iostat;
		const char *dname;
		char *buf;
		u64 rbytes, wbytes, rios, wios, dbytes, dios;
		size_t size = seq_get_buf(sf, &buf), off = 0;
		int i;
		bool has_stats = false;
		unsigned seq;

		spin_lock_irq(&blkg->q->queue_lock);

		if (!blkg->online)
			goto skip;

		dname = blkg_dev_name(blkg);
		if (!dname)
			goto skip;

		/*
		 * The line for each blkg is assembled by hand in the seq_file
		 * buffer and only committed (via seq_commit()) when the blkg
		 * actually has something to report; seq_commit(sf, -1) asks
		 * for a larger buffer when we run out of space.
		 */
		off += scnprintf(buf+off, size-off, "%s ", dname);

		do {
			seq = u64_stats_fetch_begin(&bis->sync);

			rbytes = bis->cur.bytes[BLKG_IOSTAT_READ];
			wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE];
			dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD];
			rios = bis->cur.ios[BLKG_IOSTAT_READ];
			wios = bis->cur.ios[BLKG_IOSTAT_WRITE];
			dios = bis->cur.ios[BLKG_IOSTAT_DISCARD];
		} while (u64_stats_fetch_retry(&bis->sync, seq));

		if (rbytes || wbytes || rios || wios) {
			has_stats = true;
			off += scnprintf(buf+off, size-off,
					 "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu",
					 rbytes, wbytes, rios, wios,
					 dbytes, dios);
		}

		if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) {
			has_stats = true;
			off += scnprintf(buf+off, size-off,
					 " use_delay=%d delay_nsec=%llu",
					 atomic_read(&blkg->use_delay),
					 (unsigned long long)atomic64_read(&blkg->delay_nsec));
		}

		for (i = 0; i < BLKCG_MAX_POLS; i++) {
			struct blkcg_policy *pol = blkcg_policy[i];
			size_t written;

			if (!blkg->pd[i] || !pol->pd_stat_fn)
				continue;

			written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off);
			if (written)
				has_stats = true;
			off += written;
		}

		if (has_stats) {
			if (off < size - 1) {
				off += scnprintf(buf+off, size-off, "\n");
				seq_commit(sf, off);
			} else {
				seq_commit(sf, -1);
			}
		}
	skip:
		spin_unlock_irq(&blkg->q->queue_lock);
	}

	rcu_read_unlock();
	return 0;
}

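/* interface files for the default (v2) and legacy (v1) hierarchies */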
static struct cftype blkcg_files[] = {
	{
		.name = "stat",
		.seq_show = blkcg_print_stat,
	},
	{ }
};

static struct cftype blkcg_legacy_files[] = {
	{
		.name = "reset_stats",
		.write_u64 = blkcg_reset_stats,
	},
	{ }
};

/*
 * blkcg destruction is a three-stage process.
 *
 * 1. Destruction starts.  The blkcg_css_offline() callback is invoked
 *    which offlines writeback and drops the base online pin.  Tying blkg
 *    destruction to the completion of writeback avoids punting potentially
 *    large amounts of outstanding writeback to root while maintaining any
 *    ongoing policies.
 *
 * 2. When the online pin count reaches zero, blkcg_destroy_blkgs() is
 *    called and destroys the blkgs.  This releases the css reference each
 *    blkg holds, eventually allowing blkcg_css_free() to be called.
 *
 * 3. Once the blkcg ref count goes to zero, blkcg_css_free() is called
 *    and frees the blkcg.
 */
static void blkcg_css_offline(struct cgroup_subsys_state *css)
{
	struct blkcg *blkcg = css_to_blkcg(css);

	/* offline any writeback associated with this blkcg */
	wb_blkcg_offline(blkcg);

	/* put the base online pin allowing step 2 to be triggered */
	blkcg_unpin_online(blkcg);
}

/**
 * blkcg_destroy_blkgs - responsible for shooting down blkgs
 * @blkcg: blkcg of interest
 *
 * blkgs should be removed while holding both q and blkcg locks.  As blkcg lock
 * is nested inside q lock, this function performs reverse double lock dancing.
 * Destroying the blkgs releases the reference held on the blkcg's css allowing
 * blkcg_css_free to eventually be called.
 *
 * This is the blkcg counterpart of ioc_release_fn().
 */
void blkcg_destroy_blkgs(struct blkcg *blkcg)
{
	spin_lock_irq(&blkcg->lock);

	while (!hlist_empty(&blkcg->blkg_list)) {
		struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
						    struct blkcg_gq, blkcg_node);
		struct request_queue *q = blkg->q;

		if (spin_trylock(&q->queue_lock)) {
			blkg_destroy(blkg);
			spin_unlock(&q->queue_lock);
		} else {
			spin_unlock_irq(&blkcg->lock);
			cpu_relax();
			spin_lock_irq(&blkcg->lock);
		}
	}

	spin_unlock_irq(&blkcg->lock);
}

static void blkcg_css_free(struct cgroup_subsys_state *css)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	int i;

	mutex_lock(&blkcg_pol_mutex);

	list_del(&blkcg->all_blkcgs_node);

	for (i = 0; i < BLKCG_MAX_POLS; i++)
		if (blkcg->cpd[i])
			blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);

	mutex_unlock(&blkcg_pol_mutex);

	kfree(blkcg);
}

static struct cgroup_subsys_state *
blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct blkcg *blkcg;
	struct cgroup_subsys_state *ret;
	int i;

	mutex_lock(&blkcg_pol_mutex);

	if (!parent_css) {
		blkcg = &blkcg_root;
	} else {
		blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
		if (!blkcg) {
			ret = ERR_PTR(-ENOMEM);
			goto unlock;
		}
	}

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];
		struct blkcg_policy_data *cpd;

		/*
		 * If the policy hasn't been attached yet, wait for it
		 * to be attached before doing anything else. Otherwise,
		 * check if the policy requires any specific per-cgroup
		 * data: if it does, allocate and initialize it.
		 */
		if (!pol || !pol->cpd_alloc_fn)
			continue;

		cpd = pol->cpd_alloc_fn(GFP_KERNEL);
		if (!cpd) {
			ret = ERR_PTR(-ENOMEM);
			goto free_pd_blkcg;
		}
		blkcg->cpd[i] = cpd;
		cpd->blkcg = blkcg;
		cpd->plid = i;
		if (pol->cpd_init_fn)
			pol->cpd_init_fn(cpd);
	}

	spin_lock_init(&blkcg->lock);
	refcount_set(&blkcg->online_pin, 1);
	INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN);
	INIT_HLIST_HEAD(&blkcg->blkg_list);
#ifdef CONFIG_CGROUP_WRITEBACK
	INIT_LIST_HEAD(&blkcg->cgwb_list);
#endif
	list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs);

	mutex_unlock(&blkcg_pol_mutex);
	return &blkcg->css;

free_pd_blkcg:
	for (i--; i >= 0; i--)
		if (blkcg->cpd[i])
			blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);

	if (blkcg != &blkcg_root)
		kfree(blkcg);
unlock:
	mutex_unlock(&blkcg_pol_mutex);
	return ret;
}

static int blkcg_css_online(struct cgroup_subsys_state *css)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	struct blkcg *parent = blkcg_parent(blkcg);

	/*
	 * blkcg_pin_online() is used to delay blkcg offline so that blkgs
	 * don't go offline while cgwbs are still active on them.  Pin the
	 * parent so that offline always happens towards the root.
	 */
	if (parent)
		blkcg_pin_online(parent);
	return 0;
}

/**
 * blkcg_init_queue - initialize blkcg part of request queue
 * @q: request_queue to initialize
 *
 * Called from blk_alloc_queue().  Responsible for initializing blkcg
 * part of new request_queue @q.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int blkcg_init_queue(struct request_queue *q)
{
	struct blkcg_gq *new_blkg, *blkg;
	bool preloaded;
	int ret;

	new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
	if (!new_blkg)
		return -ENOMEM;

	preloaded = !radix_tree_preload(GFP_KERNEL);

	/* Make sure the root blkg exists. */
	rcu_read_lock();
	spin_lock_irq(&q->queue_lock);
	blkg = blkg_create(&blkcg_root, q, new_blkg);
	if (IS_ERR(blkg))
		goto err_unlock;
	q->root_blkg = blkg;
	spin_unlock_irq(&q->queue_lock);
	rcu_read_unlock();

	if (preloaded)
		radix_tree_preload_end();

	ret = blk_throtl_init(q);
	if (ret)
		goto err_destroy_all;

	ret = blk_iolatency_init(q);
	if (ret) {
		blk_throtl_exit(q);
		goto err_destroy_all;
	}
	return 0;

err_destroy_all:
	blkg_destroy_all(q);
	return ret;
err_unlock:
	spin_unlock_irq(&q->queue_lock);
	rcu_read_unlock();
	if (preloaded)
		radix_tree_preload_end();
	return PTR_ERR(blkg);
}

/**
 * blkcg_exit_queue - exit and release blkcg part of request_queue
 * @q: request_queue being released
 *
 * Called when @q is being released.  Responsible for exiting the blkcg part.
 */
void blkcg_exit_queue(struct request_queue *q)
{
	blkg_destroy_all(q);
	blk_throtl_exit(q);
}

/*
 * We cannot support shared io contexts, as we have no mean to support
 * two tasks with the same ioc in two different groups without major rework
 * of the main cic data structures.  For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 */
static int blkcg_can_attach(struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct cgroup_subsys_state *dst_css;
	struct io_context *ioc;
	int ret = 0;

	/* task_lock() is needed to avoid races with exit_io_context() */
	cgroup_taskset_for_each(task, dst_css, tset) {
		task_lock(task);
		ioc = task->io_context;
		if (ioc && atomic_read(&ioc->nr_tasks) > 1)
			ret = -EINVAL;
		task_unlock(task);
		if (ret)
			break;
	}
	return ret;
}

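/* notify every policy's ->cpd_bind_fn() when the controller is rebound */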
static void blkcg_bind(struct cgroup_subsys_state *root_css)
{
	int i;

	mutex_lock(&blkcg_pol_mutex);

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];
		struct blkcg *blkcg;

		if (!pol || !pol->cpd_bind_fn)
			continue;

		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node)
			if (blkcg->cpd[pol->plid])
				pol->cpd_bind_fn(blkcg->cpd[pol->plid]);
	}
	mutex_unlock(&blkcg_pol_mutex);
}

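/* task exit hook: drop any queue reference left by blkcg_schedule_throttle() */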
static void blkcg_exit(struct task_struct *tsk)
{
	if (tsk->throttle_queue)
		blk_put_queue(tsk->throttle_queue);
	tsk->throttle_queue = NULL;
}

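/* cgroup subsystem definition for the io controller ("blkio" on cgroup v1) */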
struct cgroup_subsys io_cgrp_subsys = {
	.css_alloc = blkcg_css_alloc,
	.css_online = blkcg_css_online,
	.css_offline = blkcg_css_offline,
	.css_free = blkcg_css_free,
	.can_attach = blkcg_can_attach,
	.css_rstat_flush = blkcg_rstat_flush,
	.bind = blkcg_bind,
	.dfl_cftypes = blkcg_files,
	.legacy_cftypes = blkcg_legacy_files,
	.legacy_name = "blkio",
	.exit = blkcg_exit,
#ifdef CONFIG_MEMCG
	/*
	 * This ensures that, if available, memcg is automatically enabled
	 * together on the default hierarchy so that the owner cgroup can
	 * be retrieved from writeback pages.
	 */
	.depends_on = 1 << memory_cgrp_id,
#endif
};
EXPORT_SYMBOL_GPL(io_cgrp_subsys);

/**
 * blkcg_activate_policy - activate a blkcg policy on a request_queue
 * @q: request_queue of interest
 * @pol: blkcg policy to activate
 *
 * Activate @pol on @q.  Requires %GFP_KERNEL context.  If @q is a blk-mq
 * queue it is frozen while its blkgs are populated with policy data for
 * @pol.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int blkcg_activate_policy(struct request_queue *q,
			  const struct blkcg_policy *pol)
{
	struct blkg_policy_data *pd_prealloc = NULL;
	struct blkcg_gq *blkg, *pinned_blkg = NULL;
	int ret;

	if (blkcg_policy_enabled(q, pol))
		return 0;

	if (queue_is_mq(q))
		blk_mq_freeze_queue(q);
retry:
	spin_lock_irq(&q->queue_lock);

	/* blkg_list is pushed at the head, reverse walk to allocate parents first */
	list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
		struct blkg_policy_data *pd;

		if (blkg->pd[pol->plid])
			continue;

		/* If prealloc matches, use it; otherwise try GFP_NOWAIT */
		if (blkg == pinned_blkg) {
			pd = pd_prealloc;
			pd_prealloc = NULL;
		} else {
			pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q,
					      blkg->blkcg);
		}

		if (!pd) {
			/*
			 * GFP_NOWAIT failed.  Free the existing one and
			 * prealloc for @blkg w/ GFP_KERNEL.
			 */
			if (pinned_blkg)
				blkg_put(pinned_blkg);
			blkg_get(blkg);
			pinned_blkg = blkg;

			spin_unlock_irq(&q->queue_lock);

			if (pd_prealloc)
				pol->pd_free_fn(pd_prealloc);
			pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q,
						       blkg->blkcg);
			if (pd_prealloc)
				goto retry;
			else
				goto enomem;
		}

		blkg->pd[pol->plid] = pd;
		pd->blkg = blkg;
		pd->plid = pol->plid;
	}

	/* all allocated, init in the same order */
	if (pol->pd_init_fn)
		list_for_each_entry_reverse(blkg, &q->blkg_list, q_node)
			pol->pd_init_fn(blkg->pd[pol->plid]);

	__set_bit(pol->plid, q->blkcg_pols);
	ret = 0;

	spin_unlock_irq(&q->queue_lock);
out:
	if (queue_is_mq(q))
		blk_mq_unfreeze_queue(q);
	if (pinned_blkg)
		blkg_put(pinned_blkg);
	if (pd_prealloc)
		pol->pd_free_fn(pd_prealloc);
	return ret;

enomem:
	/* alloc failed, nothing's initialized yet, free everything */
	spin_lock_irq(&q->queue_lock);
	list_for_each_entry(blkg, &q->blkg_list, q_node) {
		if (blkg->pd[pol->plid]) {
			pol->pd_free_fn(blkg->pd[pol->plid]);
			blkg->pd[pol->plid] = NULL;
		}
	}
	spin_unlock_irq(&q->queue_lock);
	ret = -ENOMEM;
	goto out;
}
EXPORT_SYMBOL_GPL(blkcg_activate_policy);

/**
 * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue
 * @q: request_queue of interest
 * @pol: blkcg policy to deactivate
 *
 * Deactivate @pol on @q.  Follows the same synchronization rules as
 * blkcg_activate_policy().
 */
void blkcg_deactivate_policy(struct request_queue *q,
			     const struct blkcg_policy *pol)
{
	struct blkcg_gq *blkg;

	if (!blkcg_policy_enabled(q, pol))
		return;

	if (queue_is_mq(q))
		blk_mq_freeze_queue(q);

	spin_lock_irq(&q->queue_lock);

	__clear_bit(pol->plid, q->blkcg_pols);

	list_for_each_entry(blkg, &q->blkg_list, q_node) {
		if (blkg->pd[pol->plid]) {
			if (pol->pd_offline_fn)
				pol->pd_offline_fn(blkg->pd[pol->plid]);
			pol->pd_free_fn(blkg->pd[pol->plid]);
			blkg->pd[pol->plid] = NULL;
		}
	}

	spin_unlock_irq(&q->queue_lock);

	if (queue_is_mq(q))
		blk_mq_unfreeze_queue(q);
}
EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);

/**
 * blkcg_policy_register - register a blkcg policy
 * @pol: blkcg policy to register
 *
 * Register @pol with blkcg core.  Might sleep and @pol may be modified on
 * successful registration.  Returns 0 on success and -errno on failure.
 */
int blkcg_policy_register(struct blkcg_policy *pol)
{
	struct blkcg *blkcg;
	int i, ret;

	mutex_lock(&blkcg_pol_register_mutex);
	mutex_lock(&blkcg_pol_mutex);

	/* find an empty slot */
	ret = -ENOSPC;
	for (i = 0; i < BLKCG_MAX_POLS; i++)
		if (!blkcg_policy[i])
			break;
	if (i >= BLKCG_MAX_POLS) {
		pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n");
		goto err_unlock;
	}

	/* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs */
	if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) ||
	    (!pol->pd_alloc_fn ^ !pol->pd_free_fn))
		goto err_unlock;

	/* register @pol */
	pol->plid = i;
	blkcg_policy[pol->plid] = pol;

	/* allocate and install cpd's */
	if (pol->cpd_alloc_fn) {
		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
			struct blkcg_policy_data *cpd;

			cpd = pol->cpd_alloc_fn(GFP_KERNEL);
			if (!cpd)
				goto err_free_cpds;

			blkcg->cpd[pol->plid] = cpd;
			cpd->blkcg = blkcg;
			cpd->plid = pol->plid;
			if (pol->cpd_init_fn)
				pol->cpd_init_fn(cpd);
		}
	}

	mutex_unlock(&blkcg_pol_mutex);

	/* everything is in place and fully visible, add the interface files */
	if (pol->dfl_cftypes)
		WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys,
					       pol->dfl_cftypes));
	if (pol->legacy_cftypes)
		WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys,
						  pol->legacy_cftypes));
	mutex_unlock(&blkcg_pol_register_mutex);
	return 0;

err_free_cpds:
	if (pol->cpd_free_fn) {
		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
			if (blkcg->cpd[pol->plid]) {
				pol->cpd_free_fn(blkcg->cpd[pol->plid]);
				blkcg->cpd[pol->plid] = NULL;
			}
		}
	}
	blkcg_policy[pol->plid] = NULL;
err_unlock:
	mutex_unlock(&blkcg_pol_mutex);
	mutex_unlock(&blkcg_pol_register_mutex);
	return ret;
}
EXPORT_SYMBOL_GPL(blkcg_policy_register);

/**
 * blkcg_policy_unregister - unregister a blkcg policy
 * @pol: blkcg policy to unregister
 *
 * Undo blkcg_policy_register(@pol).  Might sleep.
 */
void blkcg_policy_unregister(struct blkcg_policy *pol)
{
	struct blkcg *blkcg;

	mutex_lock(&blkcg_pol_register_mutex);

	if (WARN_ON(blkcg_policy[pol->plid] != pol))
		goto out_unlock;

	/* kill the intf files first */
	if (pol->dfl_cftypes)
		cgroup_rm_cftypes(pol->dfl_cftypes);
	if (pol->legacy_cftypes)
		cgroup_rm_cftypes(pol->legacy_cftypes);

	/* remove cpds and unregister */
	mutex_lock(&blkcg_pol_mutex);

	if (pol->cpd_free_fn) {
		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
			if (blkcg->cpd[pol->plid]) {
				pol->cpd_free_fn(blkcg->cpd[pol->plid]);
				blkcg->cpd[pol->plid] = NULL;
			}
		}
	}
	blkcg_policy[pol->plid] = NULL;

	mutex_unlock(&blkcg_pol_mutex);
out_unlock:
	mutex_unlock(&blkcg_pol_register_mutex);
}
EXPORT_SYMBOL_GPL(blkcg_policy_unregister);

bool __blkcg_punt_bio_submit(struct bio *bio)
{
	struct blkcg_gq *blkg = bio->bi_blkg;

	/* consume the flag first */
	bio->bi_opf &= ~REQ_CGROUP_PUNT;

	/* never bounce for the root cgroup */
	if (!blkg->parent)
		return false;

	spin_lock_bh(&blkg->async_bio_lock);
	bio_list_add(&blkg->async_bios, bio);
	spin_unlock_bh(&blkg->async_bio_lock);

	queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
	return true;
}

/*
 * Scale the accumulated delay based on how long it has been since we updated
 * the delay.  We only call this when we are adding delay, in case it's been a
 * while since we added delay, and when we are checking to see if we need to
 * delay a task, to account for any delays that may have occurred.
 */
static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
{
	u64 old = atomic64_read(&blkg->delay_start);

	/* negative use_delay means delay scaling is disabled, don't touch it */
	if (atomic_read(&blkg->use_delay) < 0)
		return;

	/*
	 * We only want to scale down every second.  The idea here is that we
	 * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain
	 * time window.  We only want to throttle tasks for recent delay that
	 * has occurred, in 1 second time windows since that's the maximum
	 * things can be throttled.  We save the current delay window in
	 * blkg->last_delay so we know what amount is still left to be charged
	 * to the blkg from this point onward.  blkg->last_use keeps track of
	 * the use_delay counter.  The idea is if we're unthrottling the blkg we
	 * are ok with whatever is happening now, and we can take away more of
	 * the accumulated delay as we've already throttled enough that
	 * everybody is happy with their IO latencies.
	 */
	if (time_before64(old + NSEC_PER_SEC, now) &&
	    atomic64_cmpxchg(&blkg->delay_start, old, now) == old) {
		u64 cur = atomic64_read(&blkg->delay_nsec);
		u64 sub = min_t(u64, blkg->last_delay, now - old);
		int cur_use = atomic_read(&blkg->use_delay);

		/*
		 * We've been unthrottled, subtract a larger chunk of our
		 * accumulated delay.
		 */
		if (cur_use < blkg->last_use)
			sub = max_t(u64, sub, blkg->last_delay >> 1);

		/*
		 * This shouldn't happen, but handle it anyway.  Our delay_nsec
		 * should only ever be growing except here where we subtract a
		 * small amount of it, so there shouldn't be a need for a
		 * negative check, but be paranoid anyway.
		 */
		if (unlikely(cur < sub)) {
			atomic64_set(&blkg->delay_nsec, 0);
			blkg->last_delay = 0;
		} else {
			atomic64_sub(sub, &blkg->delay_nsec);
			blkg->last_delay = cur - sub;
		}
		blkg->last_use = cur_use;
	}
}

/*
 * This is called when we want to actually walk up the hierarchy and check to
 * see if we need to throttle, and then actually throttle if there is some
 * accumulated delay.  This should only be called upon return to user space so
 * we're not holding some lock that would induce a priority inversion.
 */
static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
{
	unsigned long pflags;
	bool clamp;
	u64 now = ktime_to_ns(ktime_get());
	u64 exp;
	u64 delay_nsec = 0;
	int tok;

	while (blkg->parent) {
		int use_delay = atomic_read(&blkg->use_delay);

		if (use_delay) {
			u64 this_delay;

			blkcg_scale_delay(blkg, now);
			this_delay = atomic64_read(&blkg->delay_nsec);
			if (this_delay > delay_nsec) {
				delay_nsec = this_delay;
				clamp = use_delay > 0;
			}
		}
		blkg = blkg->parent;
	}

	if (!delay_nsec)
		return;

	/*
	 * Let's not sleep for all eternity if we've amassed a huge delay.
	 * Swapping or metadata IO can accumulate 10's of seconds worth of
	 * delay, and we want userspace to be able to do _something_ so cap the
	 * delays at 0.25s.  If there's 10's of seconds worth of delay then the
	 * tasks will be delayed for 0.25 second for every syscall.  If
	 * blkcg_set_delay() was used as indicated by negative use_delay, the
	 * caller is responsible for regulating the range.
	 */
	if (clamp)
		delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);

	if (use_memdelay)
		psi_memstall_enter(&pflags);

	exp = ktime_add_ns(now, delay_nsec);
	tok = io_schedule_prepare();
	do {
		__set_current_state(TASK_KILLABLE);
		if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS))
			break;
	} while (!fatal_signal_pending(current));
	io_schedule_finish(tok);

	if (use_memdelay)
		psi_memstall_leave(&pflags);
}

/**
 * blkcg_maybe_throttle_current - throttle the current task if it has been marked
 *
 * This is only called if we've been marked with set_notify_resume().  Obviously
 * we can be set_notify_resume() for reasons other than blkcg throttling, so we
 * check to see if current->throttle_queue is set and if not this doesn't do
 * anything.  This should only ever be called by the resume code, it's not meant
 * to be called by people willy-nilly as it will actually do the work to
 * throttle the task if it is setup for throttling.
 */
void blkcg_maybe_throttle_current(void)
{
	struct request_queue *q = current->throttle_queue;
	struct cgroup_subsys_state *css;
	struct blkcg *blkcg;
	struct blkcg_gq *blkg;
	bool use_memdelay = current->use_memdelay;

	if (!q)
		return;

	current->throttle_queue = NULL;
	current->use_memdelay = false;

	rcu_read_lock();
	css = kthread_blkcg();
	if (css)
		blkcg = css_to_blkcg(css);
	else
		blkcg = css_to_blkcg(task_css(current, io_cgrp_id));

	if (!blkcg)
		goto out;
	blkg = blkg_lookup(blkcg, q);
	if (!blkg)
		goto out;
	if (!blkg_tryget(blkg))
		goto out;
	rcu_read_unlock();

	blkcg_maybe_throttle_blkg(blkg, use_memdelay);
	blkg_put(blkg);
	blk_put_queue(q);
	return;
out:
	rcu_read_unlock();
	blk_put_queue(q);
}

/**
 * blkcg_schedule_throttle - this task needs to check for throttling
 * @q: the request queue IO was submitted on
 * @use_memdelay: do we charge this to memory delay for PSI
 *
 * This is called by the IO controller when we know there's delay accumulated
 * for the blkg for this task.  We do not pass the blkg because there are places
 * we call this that may not have that information, the swapping code for
 * instance will only have a request_queue at that point.  This sets the
 * notify_resume for the task to check and see if it requires throttling before
 * returning to user space.
 *
 * We will only schedule once per syscall.  You can call this over and over
 * again and it will only do the check once upon return to user space, and only
 * throttle once.  If the task needs to be throttled again it'll need to be
 * rescheduled with this function.
 */
void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
{
	if (unlikely(current->flags & PF_KTHREAD))
		return;

	if (!blk_get_queue(q))
		return;

	if (current->throttle_queue)
		blk_put_queue(current->throttle_queue);
	current->throttle_queue = q;
	if (use_memdelay)
		current->use_memdelay = use_memdelay;
	set_notify_resume(current);
}

/**
 * blkcg_add_delay - add delay to this blkg
 * @blkg: blkg of interest
 * @now: the current time in nanoseconds
 * @delta: how many nanoseconds of delay to add
 *
 * Charge @delta to the blkg's current delay accumulation.  This is used to
 * throttle tasks if an IO controller thinks we need more throttling.
 */
void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
{
	if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0))
		return;
	blkcg_scale_delay(blkg, now);
	atomic64_add(delta, &blkg->delay_nsec);
}

/**
 * blkg_tryget_closest - try and get a blkg ref on the closest blkg
 * @bio: target bio
 * @css: target css
 *
 * As the failure mode here is to walk up the blkg tree, this ensures that the
 * blkg->parent pointers are always valid.  This returns the blkg that it ended
 * up taking a reference on or %NULL if no reference was taken.
 */
static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio,
						   struct cgroup_subsys_state *css)
{
	struct blkcg_gq *blkg, *ret_blkg = NULL;

	rcu_read_lock();
	blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_disk->queue);
	while (blkg) {
		if (blkg_tryget(blkg)) {
			ret_blkg = blkg;
			break;
		}
		blkg = blkg->parent;
	}
	rcu_read_unlock();

	return ret_blkg;
}

/**
 * bio_associate_blkg_from_css - associate a bio with a specified css
 * @bio: target bio
 * @css: target css
 *
 * Associate @bio with the blkg found by combining the css's blkg and the
 * request_queue of the @bio.  An association failure is handled by walking up
 * the blkg tree.  Therefore, the blkg associated can be anything between @blkg
 * and the root blkg.  This situation only happens when a cgroup is dying and
 * then the remaining bios will spill to the closest alive blkg.
 *
 * A reference will be taken on the blkg and will be released when @bio is
 * freed.
 */
void bio_associate_blkg_from_css(struct bio *bio,
				 struct cgroup_subsys_state *css)
{
	if (bio->bi_blkg)
		blkg_put(bio->bi_blkg);

	if (css && css->parent) {
		bio->bi_blkg = blkg_tryget_closest(bio, css);
	} else {
		blkg_get(bio->bi_disk->queue->root_blkg);
		bio->bi_blkg = bio->bi_disk->queue->root_blkg;
	}
}
EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);

/**
 * bio_associate_blkg - associate a bio with a blkg
 * @bio: target bio
 *
 * Associate @bio with the blkg found from the bio's css and request_queue.
 * If one is not found, bio_lookup_blkg() creates the blkg.  If a blkg is
 * already associated, the css is reused and association redone as the
 * request_queue may have changed.
 */
void bio_associate_blkg(struct bio *bio)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();

	if (bio->bi_blkg)
		css = &bio_blkcg(bio)->css;
	else
		css = blkcg_css();

	bio_associate_blkg_from_css(bio, css);

	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(bio_associate_blkg);

/**
 * bio_clone_blkg_association - clone blkg association from src to dst bio
 * @dst: destination bio
 * @src: source bio
 */
void bio_clone_blkg_association(struct bio *dst, struct bio *src)
{
	if (src->bi_blkg) {
		if (dst->bi_blkg)
			blkg_put(dst->bi_blkg);
		blkg_get(src->bi_blkg);
		dst->bi_blkg = src->bi_blkg;
	}
}
EXPORT_SYMBOL_GPL(bio_clone_blkg_association);

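/* map a bio's operation to the BLKG_IOSTAT_* index used for accounting */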
static int blk_cgroup_io_type(struct bio *bio)
{
	if (op_is_discard(bio->bi_opf))
		return BLKG_IOSTAT_DISCARD;
	if (op_is_write(bio->bi_opf))
		return BLKG_IOSTAT_WRITE;
	return BLKG_IOSTAT_READ;
}

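/* charge a bio to its blkg's per-cpu iostat counters at submission time */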
void blk_cgroup_bio_start(struct bio *bio)
{
	int rwd = blk_cgroup_io_type(bio), cpu;
	struct blkg_iostat_set *bis;

	cpu = get_cpu();
	bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu);
	u64_stats_update_begin(&bis->sync);

	/*
	 * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split
	 * bio and we would have already accounted for the size of the bio.
	 */
	if (!bio_flagged(bio, BIO_CGROUP_ACCT)) {
		bio_set_flag(bio, BIO_CGROUP_ACCT);
		bis->cur.bytes[rwd] += bio->bi_iter.bi_size;
	}
	bis->cur.ios[rwd]++;

	u64_stats_update_end(&bis->sync);
	if (cgroup_subsys_on_dfl(io_cgrp_subsys))
		cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu);
	put_cpu();
}

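/*
 * blkcg_punt_bio_wq hosts the per-blkg workers that submit bios punted via
 * REQ_CGROUP_PUNT (see __blkcg_punt_bio_submit() and blkg_async_bio_workfn()).
 */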
static int __init blkcg_init(void)
{
	blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",
					    WQ_MEM_RECLAIM | WQ_FREEZABLE |
					    WQ_UNBOUND | WQ_SYSFS, 0);
	if (!blkcg_punt_bio_wq)
		return -ENOMEM;
	return 0;
}
subsys_initcall(blkcg_init);

module_param(blkcg_debug_stats, bool, 0644);
MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");