// SPDX-License-Identifier: GPL-2.0
/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *		      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *		      Nauman Rafique <nauman@google.com>
 *
 * For policy-specific per-blkcg data:
 * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
 *                    Arianna Avanzini <avanzini.arianna@gmail.com>
 */
#include <linux/ioprio.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/sched/signal.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/ctype.h>
#include <linux/blk-cgroup.h>
#include <linux/tracehook.h>
#include <linux/psi.h>
#include "blk.h"
#include "blk-ioprio.h"

/*
 * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
 * blkcg_pol_register_mutex nests outside of it and synchronizes entire
 * policy [un]register operations including cgroup file additions /
 * removals.  Putting cgroup file registration outside blkcg_pol_mutex
 * allows grabbing it from cgroup callbacks.
 */
static DEFINE_MUTEX(blkcg_pol_register_mutex);
static DEFINE_MUTEX(blkcg_pol_mutex);

struct blkcg blkcg_root;
EXPORT_SYMBOL_GPL(blkcg_root);

struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
EXPORT_SYMBOL_GPL(blkcg_root_css);

static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];

static LIST_HEAD(all_blkcgs);		/* protected by blkcg_pol_mutex */

bool blkcg_debug_stats = false;
static struct workqueue_struct *blkcg_punt_bio_wq;

#define BLKG_DESTROY_BATCH_SIZE  64

static bool blkcg_policy_enabled(struct request_queue *q,
				 const struct blkcg_policy *pol)
{
	return pol && test_bit(pol->plid, q->blkcg_pols);
}

/**
 * blkg_free - free a blkg
 * @blkg: blkg to free
 *
 * Free @blkg which may be partially allocated.
 */
static void blkg_free(struct blkcg_gq *blkg)
{
	int i;

	if (!blkg)
		return;

	for (i = 0; i < BLKCG_MAX_POLS; i++)
		if (blkg->pd[i])
			blkcg_policy[i]->pd_free_fn(blkg->pd[i]);

	free_percpu(blkg->iostat_cpu);
	percpu_ref_exit(&blkg->refcnt);
	kfree(blkg);
}

static void __blkg_release(struct rcu_head *rcu)
{
	struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);

	WARN_ON(!bio_list_empty(&blkg->async_bios));

	/* release the blkcg and parent blkg refs this blkg has been holding */
	css_put(&blkg->blkcg->css);
	if (blkg->parent)
		blkg_put(blkg->parent);
	blkg_free(blkg);
}

/*
 * A group is RCU protected, but having an rcu lock does not mean that one
 * can access all the fields of blkg and assume these are valid.  For
 * example, don't try to follow throtl_data and request queue links.
 *
 * Having a reference to blkg under an rcu allows accesses to only values
 * local to groups like group stats and group rate limits.
 */
static void blkg_release(struct percpu_ref *ref)
{
	struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt);

	call_rcu(&blkg->rcu_head, __blkg_release);
}

static void blkg_async_bio_workfn(struct work_struct *work)
{
	struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
					     async_bio_work);
	struct bio_list bios = BIO_EMPTY_LIST;
	struct bio *bio;
	struct blk_plug plug;
	bool need_plug = false;

	/* as long as there are pending bios, @blkg can't go away */
	spin_lock_bh(&blkg->async_bio_lock);
	bio_list_merge(&bios, &blkg->async_bios);
	bio_list_init(&blkg->async_bios);
	spin_unlock_bh(&blkg->async_bio_lock);

	/* start plug only when bio_list contains at least 2 bios */
	if (bios.head && bios.head->bi_next) {
		need_plug = true;
		blk_start_plug(&plug);
	}
	while ((bio = bio_list_pop(&bios)))
		submit_bio(bio);
	if (need_plug)
		blk_finish_plug(&plug);
}

/**
 * blkg_alloc - allocate a blkg
 * @blkcg: block cgroup the new blkg is associated with
 * @q: request_queue the new blkg is associated with
 * @gfp_mask: allocation mask to use
 *
 * Allocate a new blkg associating @blkcg and @q.
 */
static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
				   gfp_t gfp_mask)
{
	struct blkcg_gq *blkg;
	int i, cpu;

	/* alloc and init base part */
	blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node);
	if (!blkg)
		return NULL;

	if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask))
		goto err_free;

	blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask);
	if (!blkg->iostat_cpu)
		goto err_free;

	blkg->q = q;
	INIT_LIST_HEAD(&blkg->q_node);
	spin_lock_init(&blkg->async_bio_lock);
	bio_list_init(&blkg->async_bios);
	INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
	blkg->blkcg = blkcg;

	u64_stats_init(&blkg->iostat.sync);
	for_each_possible_cpu(cpu)
		u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync);

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];
		struct blkg_policy_data *pd;

		if (!blkcg_policy_enabled(q, pol))
			continue;

		/* alloc per-policy data and attach it to blkg */
		pd = pol->pd_alloc_fn(gfp_mask, q, blkcg);
		if (!pd)
			goto err_free;

		blkg->pd[i] = pd;
		pd->blkg = blkg;
		pd->plid = i;
	}

	return blkg;

err_free:
	blkg_free(blkg);
	return NULL;
}

struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
				      struct request_queue *q, bool update_hint)
{
	struct blkcg_gq *blkg;

	/*
	 * Hint didn't match.  Look up from the radix tree.  Note that the
	 * hint can only be updated under queue_lock as otherwise @blkg
	 * could have already been removed from blkg_tree.  The caller is
	 * responsible for grabbing queue_lock if @update_hint.
	 */
	blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
	if (blkg && blkg->q == q) {
		if (update_hint) {
			lockdep_assert_held(&q->queue_lock);
			rcu_assign_pointer(blkcg->blkg_hint, blkg);
		}
		return blkg;
	}

	return NULL;
}
EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);

/*
 * If @new_blkg is %NULL, this function tries to allocate a new one as
 * necessary using %GFP_NOWAIT.  @new_blkg is always consumed on return.
 */
static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
				    struct request_queue *q,
				    struct blkcg_gq *new_blkg)
{
	struct blkcg_gq *blkg;
	int i, ret;

	WARN_ON_ONCE(!rcu_read_lock_held());
	lockdep_assert_held(&q->queue_lock);

	/* request_queue is dying, do not create/recreate a blkg */
	if (blk_queue_dying(q)) {
		ret = -ENODEV;
		goto err_free_blkg;
	}

	/* blkcg is dying, do not create/recreate a blkg */
	if (!css_tryget_online(&blkcg->css)) {
		ret = -ENODEV;
		goto err_free_blkg;
	}

	/* allocate */
	if (!new_blkg) {
		new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN);
		if (unlikely(!new_blkg)) {
			ret = -ENOMEM;
			goto err_put_css;
		}
	}
	blkg = new_blkg;

	/* link parent */
	if (blkcg_parent(blkcg)) {
		blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
		if (WARN_ON_ONCE(!blkg->parent)) {
			ret = -ENODEV;
			goto err_put_css;
		}
		blkg_get(blkg->parent);
	}

	/* invoke per-policy init */
	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];

		if (blkg->pd[i] && pol->pd_init_fn)
			pol->pd_init_fn(blkg->pd[i]);
	}

	/* insert */
	spin_lock(&blkcg->lock);
	ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
	if (likely(!ret)) {
		hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
		list_add(&blkg->q_node, &q->blkg_list);

		for (i = 0; i < BLKCG_MAX_POLS; i++) {
			struct blkcg_policy *pol = blkcg_policy[i];

			if (blkg->pd[i] && pol->pd_online_fn)
				pol->pd_online_fn(blkg->pd[i]);
		}
	}
	blkg->online = true;
	spin_unlock(&blkcg->lock);

	if (!ret)
		return blkg;

	/* @blkg failed to be fully initialized, use the usual release path */
	blkg_put(blkg);
	return ERR_PTR(ret);

err_put_css:
	css_put(&blkcg->css);
err_free_blkg:
	blkg_free(new_blkg);
	return ERR_PTR(ret);
}

/**
 * blkg_lookup_create - lookup blkg, try to create one if not there
 * @blkcg: blkcg of interest
 * @q: request_queue of interest
 *
 * Lookup blkg for the @blkcg - @q pair.  If it doesn't exist, try to
 * create one.  blkg creation is performed recursively from blkcg_root such
 * that all non-root blkgs have access to the parent blkg.  This function
 * should be called under RCU read lock and takes @q->queue_lock.
 *
 * Returns the blkg or the closest blkg if blkg_create() fails as it walks
 * down from root.
 */
static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
		struct request_queue *q)
{
	struct blkcg_gq *blkg;
	unsigned long flags;

	WARN_ON_ONCE(!rcu_read_lock_held());

	blkg = blkg_lookup(blkcg, q);
	if (blkg)
		return blkg;

	spin_lock_irqsave(&q->queue_lock, flags);
	blkg = __blkg_lookup(blkcg, q, true);
	if (blkg)
		goto found;

	/*
	 * Create blkgs walking down from blkcg_root to @blkcg, so that all
	 * non-root blkgs have access to their parents.  Returns the closest
	 * blkg to the intended blkg should blkg_create() fail.
	 */
	while (true) {
		struct blkcg *pos = blkcg;
		struct blkcg *parent = blkcg_parent(blkcg);
		struct blkcg_gq *ret_blkg = q->root_blkg;

		while (parent) {
			blkg = __blkg_lookup(parent, q, false);
			if (blkg) {
				/* remember closest blkg */
				ret_blkg = blkg;
				break;
			}
			pos = parent;
			parent = blkcg_parent(parent);
		}

		blkg = blkg_create(pos, q, NULL);
		if (IS_ERR(blkg)) {
			blkg = ret_blkg;
			break;
		}
		if (pos == blkcg)
			break;
	}

found:
	spin_unlock_irqrestore(&q->queue_lock, flags);
	return blkg;
}

static void blkg_destroy(struct blkcg_gq *blkg)
{
	struct blkcg *blkcg = blkg->blkcg;
	int i;

	lockdep_assert_held(&blkg->q->queue_lock);
	lockdep_assert_held(&blkcg->lock);

	/* Something wrong if we are trying to remove same group twice */
	WARN_ON_ONCE(list_empty(&blkg->q_node));
	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];

		if (blkg->pd[i] && pol->pd_offline_fn)
			pol->pd_offline_fn(blkg->pd[i]);
	}

	blkg->online = false;

	radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
	list_del_init(&blkg->q_node);
	hlist_del_init_rcu(&blkg->blkcg_node);

	/*
	 * Both setting lookup hint to and clearing it from @blkg are done
	 * under queue_lock.  If it's not pointing to @blkg now, it never
	 * will.  Hint assignment itself can race safely.
	 */
	if (rcu_access_pointer(blkcg->blkg_hint) == blkg)
		rcu_assign_pointer(blkcg->blkg_hint, NULL);

	/*
	 * Put the reference taken at the time of creation so that when all
	 * queues are gone, group can be destroyed.
	 */
	percpu_ref_kill(&blkg->refcnt);
}

/**
 * blkg_destroy_all - destroy all blkgs associated with a request_queue
 * @q: request_queue of interest
 *
 * Destroy all blkgs associated with @q.
 */
static void blkg_destroy_all(struct request_queue *q)
{
	struct blkcg_gq *blkg, *n;
	int count = BLKG_DESTROY_BATCH_SIZE;

restart:
	spin_lock_irq(&q->queue_lock);
	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
		struct blkcg *blkcg = blkg->blkcg;

		spin_lock(&blkcg->lock);
		blkg_destroy(blkg);
		spin_unlock(&blkcg->lock);

		/*
		 * in order to avoid holding the spin lock for too long, release
		 * it when a batch of blkgs are destroyed.
		 */
		if (!(--count)) {
			count = BLKG_DESTROY_BATCH_SIZE;
			spin_unlock_irq(&q->queue_lock);
			cond_resched();
			goto restart;
		}
	}

	q->root_blkg = NULL;
	spin_unlock_irq(&q->queue_lock);
}

static int blkcg_reset_stats(struct cgroup_subsys_state *css,
			     struct cftype *cftype, u64 val)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	struct blkcg_gq *blkg;
	int i, cpu;

	mutex_lock(&blkcg_pol_mutex);
	spin_lock_irq(&blkcg->lock);

	/*
	 * Note that stat reset is racy - it doesn't synchronize against
	 * stat updates.  This is a debug feature which shouldn't exist
	 * anyway.  If you get hit by a race, retry.
	 */
	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
		for_each_possible_cpu(cpu) {
			struct blkg_iostat_set *bis =
				per_cpu_ptr(blkg->iostat_cpu, cpu);
			memset(bis, 0, sizeof(*bis));
		}
		memset(&blkg->iostat, 0, sizeof(blkg->iostat));

		for (i = 0; i < BLKCG_MAX_POLS; i++) {
			struct blkcg_policy *pol = blkcg_policy[i];

			if (blkg->pd[i] && pol->pd_reset_stats_fn)
				pol->pd_reset_stats_fn(blkg->pd[i]);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	mutex_unlock(&blkcg_pol_mutex);
	return 0;
}

const char *blkg_dev_name(struct blkcg_gq *blkg)
{
	/* some drivers (floppy) instantiate a queue w/o disk registered */
	if (blkg->q->backing_dev_info->dev)
		return bdi_dev_name(blkg->q->backing_dev_info);
	return NULL;
}

/**
 * blkcg_print_blkgs - helper for printing per-blkg data
 * @sf: seq_file to print to
 * @blkcg: blkcg of interest
 * @prfill: fill function to print out a blkg
 * @pol: policy in question
 * @data: data to be passed to @prfill
 * @show_total: to print out sum of prfill return values or not
 *
 * This function invokes @prfill on each blkg of @blkcg if pd for the
 * policy specified by @pol exists.  @prfill is invoked with @sf, the
 * policy data and @data.  If @show_total is %true, the sum of the return
 * values from @prfill is printed with "Total" label at the end.
 *
 * This is to be used to construct any kind of basic nesting blkg
 * statistics file.
 */
void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
		       u64 (*prfill)(struct seq_file *,
				     struct blkg_policy_data *, int),
		       const struct blkcg_policy *pol, int data,
		       bool show_total)
{
	struct blkcg_gq *blkg;
	u64 total = 0;

	rcu_read_lock();
	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
		spin_lock_irq(&blkg->q->queue_lock);
		if (blkcg_policy_enabled(blkg->q, pol))
			total += prfill(sf, blkg->pd[pol->plid], data);
		spin_unlock_irq(&blkg->q->queue_lock);
	}
	rcu_read_unlock();

	if (show_total)
		seq_printf(sf, "Total %llu\n", (unsigned long long)total);
}
EXPORT_SYMBOL_GPL(blkcg_print_blkgs);

/**
 * __blkg_prfill_u64 - prfill helper for a single u64 value
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @v: value to print
 *
 * Print @v to @sf for the device associated with @pd.
 */
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
{
	const char *dname = blkg_dev_name(pd->blkg);

	if (!dname)
		return 0;

	seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
	return v;
}
EXPORT_SYMBOL_GPL(__blkg_prfill_u64);
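
/*
 * Example usage (sketch; the "foo" policy, struct foo_grp and some_counter
 * are hypothetical): a policy that keeps a u64 in its blkg_policy_data can
 * emit a per-device stat file by combining the two helpers above:
 *
 *	static u64 foo_prfill(struct seq_file *sf,
 *			      struct blkg_policy_data *pd, int off)
 *	{
 *		struct foo_grp *fg = container_of(pd, struct foo_grp, pd);
 *
 *		return __blkg_prfill_u64(sf, pd, fg->some_counter);
 *	}
 *
 *	static int foo_print_stat(struct seq_file *sf, void *v)
 *	{
 *		blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), foo_prfill,
 *				  &blkcg_policy_foo, 0, true);
 *		return 0;
 *	}
 */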

/* Performs queue bypass and policy enabled checks then looks up blkg. */
static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
					  const struct blkcg_policy *pol,
					  struct request_queue *q)
{
	WARN_ON_ONCE(!rcu_read_lock_held());
	lockdep_assert_held(&q->queue_lock);

	if (!blkcg_policy_enabled(q, pol))
		return ERR_PTR(-EOPNOTSUPP);
	return __blkg_lookup(blkcg, q, true /* update_hint */);
}

/**
 * blkcg_conf_open_bdev - parse and open bdev for per-blkg config update
 * @inputp: input string pointer
 *
 * Parse the device node prefix part, MAJ:MIN, of per-blkg config update
 * from @input and get and return the matching bdev.  *@inputp is
 * updated to point past the device node prefix.  Returns an ERR_PTR()
 * value on error.
 *
 * Use this function iff blkg_conf_prep() can't be used for some reason.
 */
struct block_device *blkcg_conf_open_bdev(char **inputp)
{
	char *input = *inputp;
	unsigned int major, minor;
	struct block_device *bdev;
	int key_len;

	if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
		return ERR_PTR(-EINVAL);

	input += key_len;
	if (!isspace(*input))
		return ERR_PTR(-EINVAL);
	input = skip_spaces(input);

	bdev = blkdev_get_no_open(MKDEV(major, minor));
	if (!bdev)
		return ERR_PTR(-ENODEV);
	if (bdev_is_partition(bdev)) {
		blkdev_put_no_open(bdev);
		return ERR_PTR(-ENODEV);
	}

	*inputp = input;
	return bdev;
}

/**
 * blkg_conf_prep - parse and prepare for per-blkg config update
 * @blkcg: target block cgroup
 * @pol: target policy
 * @input: input string
 * @ctx: blkg_conf_ctx to be filled
 *
 * Parse per-blkg config update from @input and initialize @ctx with the
 * result.  @ctx->blkg points to the blkg to be updated and @ctx->body the
 * part of @input following MAJ:MIN.  This function returns with RCU read
 * lock and queue lock held and must be paired with blkg_conf_finish().
 */
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
		   char *input, struct blkg_conf_ctx *ctx)
	__acquires(rcu) __acquires(&bdev->bd_disk->queue->queue_lock)
{
	struct block_device *bdev;
	struct request_queue *q;
	struct blkcg_gq *blkg;
	int ret;

	bdev = blkcg_conf_open_bdev(&input);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	q = bdev->bd_disk->queue;

	rcu_read_lock();
	spin_lock_irq(&q->queue_lock);

	blkg = blkg_lookup_check(blkcg, pol, q);
	if (IS_ERR(blkg)) {
		ret = PTR_ERR(blkg);
		goto fail_unlock;
	}

	if (blkg)
		goto success;

	/*
	 * Create blkgs walking down from blkcg_root to @blkcg, so that all
	 * non-root blkgs have access to their parents.
	 */
	while (true) {
		struct blkcg *pos = blkcg;
		struct blkcg *parent;
		struct blkcg_gq *new_blkg;

		parent = blkcg_parent(blkcg);
		while (parent && !__blkg_lookup(parent, q, false)) {
			pos = parent;
			parent = blkcg_parent(parent);
		}

		/* Drop locks to do new blkg allocation with GFP_KERNEL. */
		spin_unlock_irq(&q->queue_lock);
		rcu_read_unlock();

		new_blkg = blkg_alloc(pos, q, GFP_KERNEL);
		if (unlikely(!new_blkg)) {
			ret = -ENOMEM;
			goto fail;
		}

		if (radix_tree_preload(GFP_KERNEL)) {
			blkg_free(new_blkg);
			ret = -ENOMEM;
			goto fail;
		}

		rcu_read_lock();
		spin_lock_irq(&q->queue_lock);

		blkg = blkg_lookup_check(pos, pol, q);
		if (IS_ERR(blkg)) {
			ret = PTR_ERR(blkg);
			blkg_free(new_blkg);
			goto fail_preloaded;
		}

		if (blkg) {
			blkg_free(new_blkg);
		} else {
			blkg = blkg_create(pos, q, new_blkg);
			if (IS_ERR(blkg)) {
				ret = PTR_ERR(blkg);
				goto fail_preloaded;
			}
		}

		radix_tree_preload_end();

		if (pos == blkcg)
			goto success;
	}
success:
	ctx->bdev = bdev;
	ctx->blkg = blkg;
	ctx->body = input;
	return 0;

fail_preloaded:
	radix_tree_preload_end();
fail_unlock:
	spin_unlock_irq(&q->queue_lock);
	rcu_read_unlock();
fail:
	blkdev_put_no_open(bdev);
	/*
	 * If queue was bypassing, we should retry.  Do so after a
	 * short msleep().  It isn't strictly necessary but queue
	 * can be bypassing for some time and it's always nice to
	 * avoid busy looping.
	 */
	if (ret == -EBUSY) {
		msleep(10);
		ret = restart_syscall();
	}
	return ret;
}
EXPORT_SYMBOL_GPL(blkg_conf_prep);

/**
 * blkg_conf_finish - finish up per-blkg config update
 * @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
 *
 * Finish up after per-blkg config update.  This function must be paired
 * with blkg_conf_prep().
 */
void blkg_conf_finish(struct blkg_conf_ctx *ctx)
	__releases(&ctx->bdev->bd_disk->queue->queue_lock) __releases(rcu)
{
	spin_unlock_irq(&ctx->bdev->bd_disk->queue->queue_lock);
	rcu_read_unlock();
	blkdev_put_no_open(ctx->bdev);
}
EXPORT_SYMBOL_GPL(blkg_conf_finish);
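
/*
 * Example usage (sketch; the "foo" policy and its limit parsing are
 * hypothetical): a policy's cftype ->write() handler typically brackets a
 * per-blkg config update with blkg_conf_prep()/blkg_conf_finish(), updating
 * its policy data while the queue lock and RCU read lock are held:
 *
 *	static ssize_t foo_set_limit(struct kernfs_open_file *of, char *buf,
 *				     size_t nbytes, loff_t off)
 *	{
 *		struct blkcg *blkcg = css_to_blkcg(of_css(of));
 *		struct blkg_conf_ctx ctx;
 *		u64 limit;
 *		int ret = 0;
 *
 *		ret = blkg_conf_prep(blkcg, &blkcg_policy_foo, buf, &ctx);
 *		if (ret)
 *			return ret;
 *
 *		if (sscanf(ctx.body, "%llu", &limit) == 1)
 *			;	// update ctx.blkg's policy data here
 *		else
 *			ret = -EINVAL;
 *
 *		blkg_conf_finish(&ctx);
 *		return ret ?: nbytes;
 *	}
 */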

static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
{
	int i;

	for (i = 0; i < BLKG_IOSTAT_NR; i++) {
		dst->bytes[i] = src->bytes[i];
		dst->ios[i] = src->ios[i];
	}
}

static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src)
{
	int i;

	for (i = 0; i < BLKG_IOSTAT_NR; i++) {
		dst->bytes[i] += src->bytes[i];
		dst->ios[i] += src->ios[i];
	}
}

static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src)
{
	int i;

	for (i = 0; i < BLKG_IOSTAT_NR; i++) {
		dst->bytes[i] -= src->bytes[i];
		dst->ios[i] -= src->ios[i];
	}
}

static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	struct blkcg_gq *blkg;

	/* Root-level stats are sourced from system-wide IO stats */
	if (!cgroup_parent(css->cgroup))
		return;

	rcu_read_lock();

	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
		struct blkcg_gq *parent = blkg->parent;
		struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
		struct blkg_iostat cur, delta;
		unsigned long flags;
		unsigned int seq;

		/* fetch the current per-cpu values */
		do {
			seq = u64_stats_fetch_begin(&bisc->sync);
			blkg_iostat_set(&cur, &bisc->cur);
		} while (u64_stats_fetch_retry(&bisc->sync, seq));

		/* propagate percpu delta to global */
		flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
		blkg_iostat_set(&delta, &cur);
		blkg_iostat_sub(&delta, &bisc->last);
		blkg_iostat_add(&blkg->iostat.cur, &delta);
		blkg_iostat_add(&bisc->last, &delta);
		u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);

		/* propagate global delta to parent (unless that's root) */
		if (parent && parent->parent) {
			flags = u64_stats_update_begin_irqsave(&parent->iostat.sync);
			blkg_iostat_set(&delta, &blkg->iostat.cur);
			blkg_iostat_sub(&delta, &blkg->iostat.last);
			blkg_iostat_add(&parent->iostat.cur, &delta);
			blkg_iostat_add(&blkg->iostat.last, &delta);
			u64_stats_update_end_irqrestore(&parent->iostat.sync, flags);
		}
	}

	rcu_read_unlock();
}

/*
 * We source root cgroup stats from the system-wide stats to avoid
 * tracking the same information twice and incurring overhead when no
 * cgroups are defined. For that reason, cgroup_rstat_flush in
 * blkcg_print_stat does not actually fill out the iostat in the root
 * cgroup's blkcg_gq.
 *
 * However, we would like to re-use the printing code between the root and
 * non-root cgroups to the extent possible. For that reason, we simulate
 * flushing the root cgroup's stats by explicitly filling in the iostat
 * with disk level statistics.
 */
static void blkcg_fill_root_iostats(void)
{
	struct class_dev_iter iter;
	struct device *dev;

	class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
	while ((dev = class_dev_iter_next(&iter))) {
		struct block_device *bdev = dev_to_bdev(dev);
		struct blkcg_gq *blkg =
			blk_queue_root_blkg(bdev->bd_disk->queue);
		struct blkg_iostat tmp;
		int cpu;

		memset(&tmp, 0, sizeof(tmp));
		for_each_possible_cpu(cpu) {
			struct disk_stats *cpu_dkstats;
			unsigned long flags;

			cpu_dkstats = per_cpu_ptr(bdev->bd_stats, cpu);
			tmp.ios[BLKG_IOSTAT_READ] +=
				cpu_dkstats->ios[STAT_READ];
			tmp.ios[BLKG_IOSTAT_WRITE] +=
				cpu_dkstats->ios[STAT_WRITE];
			tmp.ios[BLKG_IOSTAT_DISCARD] +=
				cpu_dkstats->ios[STAT_DISCARD];

			/* sectors are 512 bytes, convert to bytes */
			tmp.bytes[BLKG_IOSTAT_READ] +=
				cpu_dkstats->sectors[STAT_READ] << 9;
			tmp.bytes[BLKG_IOSTAT_WRITE] +=
				cpu_dkstats->sectors[STAT_WRITE] << 9;
			tmp.bytes[BLKG_IOSTAT_DISCARD] +=
				cpu_dkstats->sectors[STAT_DISCARD] << 9;

			flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
			blkg_iostat_set(&blkg->iostat.cur, &tmp);
			u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
		}
	}
}

static int blkcg_print_stat(struct seq_file *sf, void *v)
{
	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
	struct blkcg_gq *blkg;

	if (!seq_css(sf)->parent)
		blkcg_fill_root_iostats();
	else
		cgroup_rstat_flush(blkcg->css.cgroup);

	rcu_read_lock();

	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
		struct blkg_iostat_set *bis = &blkg->iostat;
		const char *dname;
		char *buf;
		u64 rbytes, wbytes, rios, wios, dbytes, dios;
		size_t size = seq_get_buf(sf, &buf), off = 0;
		int i;
		bool has_stats = false;
		unsigned seq;

		spin_lock_irq(&blkg->q->queue_lock);

		if (!blkg->online)
			goto skip;

		dname = blkg_dev_name(blkg);
		if (!dname)
			goto skip;

		/*
		 * Hooray string manipulation, count is the size written NOT
		 * INCLUDING THE \0, so size is now count+1 less than what we
		 * had before, but we want to start writing the next bit from
		 * the \0 so we only add count to buf.
		 */
		off += scnprintf(buf+off, size-off, "%s ", dname);

		do {
			seq = u64_stats_fetch_begin(&bis->sync);

			rbytes = bis->cur.bytes[BLKG_IOSTAT_READ];
			wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE];
			dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD];
			rios = bis->cur.ios[BLKG_IOSTAT_READ];
			wios = bis->cur.ios[BLKG_IOSTAT_WRITE];
			dios = bis->cur.ios[BLKG_IOSTAT_DISCARD];
		} while (u64_stats_fetch_retry(&bis->sync, seq));

		if (rbytes || wbytes || rios || wios) {
			has_stats = true;
			off += scnprintf(buf+off, size-off,
					 "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu",
					 rbytes, wbytes, rios, wios,
					 dbytes, dios);
		}

		if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) {
			has_stats = true;
			off += scnprintf(buf+off, size-off,
					 " use_delay=%d delay_nsec=%llu",
					 atomic_read(&blkg->use_delay),
					 (unsigned long long)atomic64_read(&blkg->delay_nsec));
		}

		for (i = 0; i < BLKCG_MAX_POLS; i++) {
			struct blkcg_policy *pol = blkcg_policy[i];
			size_t written;

			if (!blkg->pd[i] || !pol->pd_stat_fn)
				continue;

			written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off);
			if (written)
				has_stats = true;
			off += written;
		}

		if (has_stats) {
			if (off < size - 1) {
				off += scnprintf(buf+off, size-off, "\n");
				seq_commit(sf, off);
			} else {
				seq_commit(sf, -1);
			}
		}
	skip:
		spin_unlock_irq(&blkg->q->queue_lock);
	}

	rcu_read_unlock();
	return 0;
}

static struct cftype blkcg_files[] = {
	{
		.name = "stat",
		.seq_show = blkcg_print_stat,
	},
	{ }	/* terminate */
};

static struct cftype blkcg_legacy_files[] = {
	{
		.name = "reset_stats",
		.write_u64 = blkcg_reset_stats,
	},
	{ }	/* terminate */
};

/*
 * blkcg destruction is a three-stage process.
 *
 * 1. Destruction starts.  The blkcg_css_offline() callback is invoked
 *    which offlines writeback.  Here we tie the next stage of blkg destruction
 *    to the completion of writeback associated with the blkcg.  This lets us
 *    avoid punting potentially large amounts of outstanding writeback to root
 *    while maintaining any ongoing policies.  The next stage is triggered when
 *    the nr_cgwbs count goes to zero.
 *
 * 2. When the nr_cgwbs count goes to zero, blkcg_destroy_blkgs() is called
 *    and handles the destruction of blkgs.  Here the css reference held by
 *    the blkg is put back eventually allowing blkcg_css_free() to be called.
 *    This work may occur in cgwb_release_workfn() on the cgwb_release
 *    workqueue.  Any submitted ios that fail to get the blkg ref will be
 *    punted to the root_blkg.
 *
 * 3. Once the blkcg ref count goes to zero, blkcg_css_free() is called.
 *    This finally frees the blkcg.
 */

/**
 * blkcg_css_offline - cgroup css_offline callback
 * @css: css of interest
 *
 * This function is called when @css is about to go away.  Here the cgwbs are
 * offlined first and only once writeback associated with the blkcg has
 * finished do we start step 2 (see above).
 */
static void blkcg_css_offline(struct cgroup_subsys_state *css)
{
	struct blkcg *blkcg = css_to_blkcg(css);

	/* this prevents anyone from attaching or migrating */
	wb_blkcg_offline(blkcg);

	/* put the base online pin allowing step 2 to be triggered */
	blkcg_unpin_online(blkcg);
}

/**
 * blkcg_destroy_blkgs - responsible for shooting down blkgs
 * @blkcg: blkcg of interest
 *
 * blkgs should be removed while holding both q and blkcg locks.  As blkcg lock
 * is nested inside q lock, this function performs reverse double lock dancing.
 * Destroying the blkgs releases the reference held on the blkcg's css allowing
 * blkcg_css_free to eventually be called.
 *
 * This is the blkcg counterpart of ioc_release_fn().
 */
void blkcg_destroy_blkgs(struct blkcg *blkcg)
{
	might_sleep();

	spin_lock_irq(&blkcg->lock);

	while (!hlist_empty(&blkcg->blkg_list)) {
		struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
						    struct blkcg_gq, blkcg_node);
		struct request_queue *q = blkg->q;

		if (need_resched() || !spin_trylock(&q->queue_lock)) {
			/*
			 * Given that the system can accumulate a huge number
			 * of blkgs in pathological cases, check to see if we
			 * need to reschedule to avoid softlockup.
			 */
			spin_unlock_irq(&blkcg->lock);
			cond_resched();
			spin_lock_irq(&blkcg->lock);
			continue;
		}

		blkg_destroy(blkg);
		spin_unlock(&q->queue_lock);
	}

	spin_unlock_irq(&blkcg->lock);
}

static void blkcg_css_free(struct cgroup_subsys_state *css)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	int i;

	mutex_lock(&blkcg_pol_mutex);

	list_del(&blkcg->all_blkcgs_node);

	for (i = 0; i < BLKCG_MAX_POLS; i++)
		if (blkcg->cpd[i])
			blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);

	mutex_unlock(&blkcg_pol_mutex);

	kfree(blkcg);
}

static struct cgroup_subsys_state *
blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct blkcg *blkcg;
	struct cgroup_subsys_state *ret;
	int i;

	mutex_lock(&blkcg_pol_mutex);

	if (!parent_css) {
		blkcg = &blkcg_root;
	} else {
		blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
		if (!blkcg) {
			ret = ERR_PTR(-ENOMEM);
			goto unlock;
		}
	}

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];
		struct blkcg_policy_data *cpd;

		/*
		 * If the policy hasn't been attached yet, wait for it
		 * to be attached before doing anything else. Otherwise,
		 * check if the policy requires any specific per-cgroup
		 * data: if it does, allocate and initialize it.
		 */
		if (!pol || !pol->cpd_alloc_fn)
			continue;

		cpd = pol->cpd_alloc_fn(GFP_KERNEL);
		if (!cpd) {
			ret = ERR_PTR(-ENOMEM);
			goto free_pd_blkcg;
		}
		blkcg->cpd[i] = cpd;
		cpd->blkcg = blkcg;
		cpd->plid = i;
		if (pol->cpd_init_fn)
			pol->cpd_init_fn(cpd);
	}

	spin_lock_init(&blkcg->lock);
	refcount_set(&blkcg->online_pin, 1);
	INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN);
	INIT_HLIST_HEAD(&blkcg->blkg_list);
#ifdef CONFIG_CGROUP_WRITEBACK
	INIT_LIST_HEAD(&blkcg->cgwb_list);
#endif
	list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs);

	mutex_unlock(&blkcg_pol_mutex);
	return &blkcg->css;

free_pd_blkcg:
	for (i--; i >= 0; i--)
		if (blkcg->cpd[i])
			blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);

	if (blkcg != &blkcg_root)
		kfree(blkcg);
unlock:
	mutex_unlock(&blkcg_pol_mutex);
	return ret;
}

static int blkcg_css_online(struct cgroup_subsys_state *css)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	struct blkcg *parent = blkcg_parent(blkcg);

	/*
	 * blkcg_pin_online() is used to delay blkcg offline so that blkgs
	 * don't go offline while cgwbs are still active on them.  Pin the
	 * parent so that offline always happens towards the root.
	 */
	if (parent)
		blkcg_pin_online(parent);
	return 0;
}

/**
 * blkcg_init_queue - initialize blkcg part of request queue
 * @q: request_queue to initialize
 *
 * Called from blk_alloc_queue(). Responsible for initializing blkcg
 * part of new request_queue @q.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int blkcg_init_queue(struct request_queue *q)
{
	struct blkcg_gq *new_blkg, *blkg;
	bool preloaded;
	int ret;

	new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
	if (!new_blkg)
		return -ENOMEM;

	preloaded = !radix_tree_preload(GFP_KERNEL);

	/* Make sure the root blkg exists. */
	rcu_read_lock();
	spin_lock_irq(&q->queue_lock);
	blkg = blkg_create(&blkcg_root, q, new_blkg);
	if (IS_ERR(blkg))
		goto err_unlock;
	q->root_blkg = blkg;
	spin_unlock_irq(&q->queue_lock);
	rcu_read_unlock();

	if (preloaded)
		radix_tree_preload_end();

	ret = blk_iolatency_init(q);
	if (ret)
		goto err_destroy_all;

	ret = blk_ioprio_init(q);
	if (ret)
		goto err_destroy_all;

	ret = blk_throtl_init(q);
	if (ret)
		goto err_destroy_all;

	return 0;

err_destroy_all:
	blkg_destroy_all(q);
	return ret;
err_unlock:
	spin_unlock_irq(&q->queue_lock);
	rcu_read_unlock();
	if (preloaded)
		radix_tree_preload_end();
	return PTR_ERR(blkg);
}

/**
 * blkcg_exit_queue - exit and release blkcg part of request_queue
 * @q: request_queue being released
 *
 * Called from blk_exit_queue().  Responsible for exiting blkcg part.
 */
void blkcg_exit_queue(struct request_queue *q)
{
	blkg_destroy_all(q);
	blk_throtl_exit(q);
}

static void blkcg_bind(struct cgroup_subsys_state *root_css)
{
	int i;

	mutex_lock(&blkcg_pol_mutex);

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];
		struct blkcg *blkcg;

		if (!pol || !pol->cpd_bind_fn)
			continue;

		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node)
			if (blkcg->cpd[pol->plid])
				pol->cpd_bind_fn(blkcg->cpd[pol->plid]);
	}
	mutex_unlock(&blkcg_pol_mutex);
}

static void blkcg_exit(struct task_struct *tsk)
{
	if (tsk->throttle_queue)
		blk_put_queue(tsk->throttle_queue);
	tsk->throttle_queue = NULL;
}

struct cgroup_subsys io_cgrp_subsys = {
	.css_alloc = blkcg_css_alloc,
	.css_online = blkcg_css_online,
	.css_offline = blkcg_css_offline,
	.css_free = blkcg_css_free,
	.css_rstat_flush = blkcg_rstat_flush,
	.bind = blkcg_bind,
	.dfl_cftypes = blkcg_files,
	.legacy_cftypes = blkcg_legacy_files,
	.legacy_name = "blkio",
	.exit = blkcg_exit,
#ifdef CONFIG_MEMCG
	/*
	 * This ensures that, if available, memcg is automatically enabled
	 * together on the default hierarchy so that the owner cgroup can
	 * be retrieved from writeback pages.
	 */
	.depends_on = 1 << memory_cgrp_id,
#endif
};
EXPORT_SYMBOL_GPL(io_cgrp_subsys);

/**
 * blkcg_activate_policy - activate a blkcg policy on a request_queue
 * @q: request_queue of interest
 * @pol: blkcg policy to activate
 *
 * Activate @pol on @q.  Requires %GFP_KERNEL context.  @q goes through
 * bypass mode to populate its blkgs with policy_data for @pol.
 *
 * Activation happens with @q bypassed, so nobody would be accessing blkgs
 * from IO path.  Update of each blkg is protected by both queue and blkcg
 * locks so that holding either lock and testing blkcg_policy_enabled() is
 * always enough for dereferencing policy data.
 *
 * The caller is responsible for synchronizing [de]activations and policy
 * [un]registerations.  Returns 0 on success, -errno on failure.
 */
int blkcg_activate_policy(struct request_queue *q,
			  const struct blkcg_policy *pol)
{
	struct blkg_policy_data *pd_prealloc = NULL;
	struct blkcg_gq *blkg, *pinned_blkg = NULL;
	int ret;

	if (blkcg_policy_enabled(q, pol))
		return 0;

	if (queue_is_mq(q))
		blk_mq_freeze_queue(q);
retry:
	spin_lock_irq(&q->queue_lock);

	/* blkg_list is pushed at the head, reverse walk to allocate parents first */
	list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
		struct blkg_policy_data *pd;

		if (blkg->pd[pol->plid])
			continue;

		/* If prealloc matches, use it; otherwise try GFP_NOWAIT */
		if (blkg == pinned_blkg) {
			pd = pd_prealloc;
			pd_prealloc = NULL;
		} else {
			pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q,
					      blkg->blkcg);
		}

		if (!pd) {
			/*
			 * GFP_NOWAIT failed.  Free the existing one and
			 * prealloc for @blkg w/ GFP_KERNEL.
			 */
			if (pinned_blkg)
				blkg_put(pinned_blkg);
			blkg_get(blkg);
			pinned_blkg = blkg;

			spin_unlock_irq(&q->queue_lock);

			if (pd_prealloc)
				pol->pd_free_fn(pd_prealloc);
			pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q,
						       blkg->blkcg);
			if (pd_prealloc)
				goto retry;
			else
				goto enomem;
		}

		blkg->pd[pol->plid] = pd;
		pd->blkg = blkg;
		pd->plid = pol->plid;
	}

	/* all allocated, init in the same order */
	if (pol->pd_init_fn)
		list_for_each_entry_reverse(blkg, &q->blkg_list, q_node)
			pol->pd_init_fn(blkg->pd[pol->plid]);

	__set_bit(pol->plid, q->blkcg_pols);
	ret = 0;

	spin_unlock_irq(&q->queue_lock);
out:
	if (queue_is_mq(q))
		blk_mq_unfreeze_queue(q);
	if (pinned_blkg)
		blkg_put(pinned_blkg);
	if (pd_prealloc)
		pol->pd_free_fn(pd_prealloc);
	return ret;

enomem:
	/* alloc failed, nothing's initialized yet, free everything */
	spin_lock_irq(&q->queue_lock);
	list_for_each_entry(blkg, &q->blkg_list, q_node) {
		if (blkg->pd[pol->plid]) {
			pol->pd_free_fn(blkg->pd[pol->plid]);
			blkg->pd[pol->plid] = NULL;
		}
	}
	spin_unlock_irq(&q->queue_lock);
	ret = -ENOMEM;
	goto out;
}
EXPORT_SYMBOL_GPL(blkcg_activate_policy);

/**
 * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue
 * @q: request_queue of interest
 * @pol: blkcg policy to deactivate
 *
 * Deactivate @pol on @q.  Follows the same synchronization rules as
 * blkcg_activate_policy().
 */
void blkcg_deactivate_policy(struct request_queue *q,
			     const struct blkcg_policy *pol)
{
	struct blkcg_gq *blkg;

	if (!blkcg_policy_enabled(q, pol))
		return;

	if (queue_is_mq(q))
		blk_mq_freeze_queue(q);

	spin_lock_irq(&q->queue_lock);

	__clear_bit(pol->plid, q->blkcg_pols);

	list_for_each_entry(blkg, &q->blkg_list, q_node) {
		if (blkg->pd[pol->plid]) {
			if (pol->pd_offline_fn)
				pol->pd_offline_fn(blkg->pd[pol->plid]);
			pol->pd_free_fn(blkg->pd[pol->plid]);
			blkg->pd[pol->plid] = NULL;
		}
	}

	spin_unlock_irq(&q->queue_lock);

	if (queue_is_mq(q))
		blk_mq_unfreeze_queue(q);
}
EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);

/**
 * blkcg_policy_register - register a blkcg policy
 * @pol: blkcg policy to register
 *
 * Register @pol with blkcg core.  Might sleep and @pol may be modified on
 * successful registration.  Returns 0 on success and -errno on failure.
 */
int blkcg_policy_register(struct blkcg_policy *pol)
{
	struct blkcg *blkcg;
	int i, ret;

	mutex_lock(&blkcg_pol_register_mutex);
	mutex_lock(&blkcg_pol_mutex);

	/* find an empty slot */
	ret = -ENOSPC;
	for (i = 0; i < BLKCG_MAX_POLS; i++)
		if (!blkcg_policy[i])
			break;
	if (i >= BLKCG_MAX_POLS) {
		pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n");
		goto err_unlock;
	}

	/* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs */
	if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) ||
	    (!pol->pd_alloc_fn ^ !pol->pd_free_fn))
		goto err_unlock;

	/* register @pol */
	pol->plid = i;
	blkcg_policy[pol->plid] = pol;

	/* allocate and install cpd's */
	if (pol->cpd_alloc_fn) {
		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
			struct blkcg_policy_data *cpd;

			cpd = pol->cpd_alloc_fn(GFP_KERNEL);
			if (!cpd)
				goto err_free_cpds;

			blkcg->cpd[pol->plid] = cpd;
			cpd->blkcg = blkcg;
			cpd->plid = pol->plid;
			if (pol->cpd_init_fn)
				pol->cpd_init_fn(cpd);
		}
	}

	mutex_unlock(&blkcg_pol_mutex);

	/* everything is in place and fully visible, add the cftypes */
	if (pol->dfl_cftypes)
		WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys,
					       pol->dfl_cftypes));
	if (pol->legacy_cftypes)
		WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys,
						  pol->legacy_cftypes));
	mutex_unlock(&blkcg_pol_register_mutex);
	return 0;

err_free_cpds:
	if (pol->cpd_free_fn) {
		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
			if (blkcg->cpd[pol->plid]) {
				pol->cpd_free_fn(blkcg->cpd[pol->plid]);
				blkcg->cpd[pol->plid] = NULL;
			}
		}
	}
	blkcg_policy[pol->plid] = NULL;
err_unlock:
	mutex_unlock(&blkcg_pol_mutex);
	mutex_unlock(&blkcg_pol_register_mutex);
	return ret;
}
EXPORT_SYMBOL_GPL(blkcg_policy_register);
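
/*
 * Example usage (sketch; the "foo" policy and its hooks are hypothetical):
 * a policy fills in a blkcg_policy with its per-blkg data hooks and registers
 * it once at init time, unregistering on exit:
 *
 *	static struct blkcg_policy blkcg_policy_foo = {
 *		.dfl_cftypes	= foo_files,
 *		.pd_alloc_fn	= foo_pd_alloc,
 *		.pd_init_fn	= foo_pd_init,
 *		.pd_offline_fn	= foo_pd_offline,
 *		.pd_free_fn	= foo_pd_free,
 *	};
 *
 *	static int __init foo_init(void)
 *	{
 *		return blkcg_policy_register(&blkcg_policy_foo);
 *	}
 *
 *	static void __exit foo_exit(void)
 *	{
 *		blkcg_policy_unregister(&blkcg_policy_foo);
 *	}
 *
 * The policy is then enabled on a given request_queue with
 * blkcg_activate_policy(q, &blkcg_policy_foo), after which every blkg on
 * that queue carries a blkg_policy_data allocated by foo_pd_alloc().
 */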

/**
 * blkcg_policy_unregister - unregister a blkcg policy
 * @pol: blkcg policy to unregister
 *
 * Undo blkcg_policy_register(@pol).  Might sleep.
 */
void blkcg_policy_unregister(struct blkcg_policy *pol)
{
	struct blkcg *blkcg;

	mutex_lock(&blkcg_pol_register_mutex);

	if (WARN_ON(blkcg_policy[pol->plid] != pol))
		goto out_unlock;

	/* kill the intf files first */
	if (pol->dfl_cftypes)
		cgroup_rm_cftypes(pol->dfl_cftypes);
	if (pol->legacy_cftypes)
		cgroup_rm_cftypes(pol->legacy_cftypes);

	/* remove cpds and unregister */
	mutex_lock(&blkcg_pol_mutex);

	if (pol->cpd_free_fn) {
		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
			if (blkcg->cpd[pol->plid]) {
				pol->cpd_free_fn(blkcg->cpd[pol->plid]);
				blkcg->cpd[pol->plid] = NULL;
			}
		}
	}
	blkcg_policy[pol->plid] = NULL;

	mutex_unlock(&blkcg_pol_mutex);
out_unlock:
	mutex_unlock(&blkcg_pol_register_mutex);
}
EXPORT_SYMBOL_GPL(blkcg_policy_unregister);

bool __blkcg_punt_bio_submit(struct bio *bio)
{
	struct blkcg_gq *blkg = bio->bi_blkg;

	/* consume the flag first */
	bio->bi_opf &= ~REQ_CGROUP_PUNT;

	/* never bounce for the root cgroup */
	if (!blkg->parent)
		return false;

	spin_lock_bh(&blkg->async_bio_lock);
	bio_list_add(&blkg->async_bios, bio);
	spin_unlock_bh(&blkg->async_bio_lock);

	queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
	return true;
}

/*
 * Scale the accumulated delay based on how long it has been since we updated
 * the delay.  We only call this when we are adding delay, in case it's been a
 * while since we added delay, and when we are checking to see if we need to
 * delay a task, to account for any delays that may have occurred.
 */
static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
{
	u64 old = atomic64_read(&blkg->delay_start);

	/* negative use_delay means no scaling, see blkcg_set_delay() */
	if (atomic_read(&blkg->use_delay) < 0)
		return;

	/*
	 * We only want to scale down every second.  The idea here is that we
	 * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain
	 * time window.  We only want to throttle tasks for recent delay that
	 * has occurred, in 1 second time windows since that's the maximum
	 * things can be throttled.  We save the current delay window in
	 * blkg->last_delay so we know what amount is still left to be charged
	 * to the blkg from this point onward.  blkg->last_use keeps track of
	 * the use_delay counter.  The idea is if we're unthrottling the blkg we
	 * are ok with whatever is happening now, and we can take away more of
	 * the accumulated delay as we've already throttled enough that
	 * everybody is happy with their IO latencies.
	 */
	if (time_before64(old + NSEC_PER_SEC, now) &&
	    atomic64_cmpxchg(&blkg->delay_start, old, now) == old) {
		u64 cur = atomic64_read(&blkg->delay_nsec);
		u64 sub = min_t(u64, blkg->last_delay, now - old);
		int cur_use = atomic_read(&blkg->use_delay);

		/*
		 * We've been unthrottled, subtract a larger chunk of our
		 * accumulated delay.
		 */
		if (cur_use < blkg->last_use)
			sub = max_t(u64, sub, blkg->last_delay >> 1);

		/*
		 * This shouldn't happen, but handle it anyway.  Our delay_nsec
		 * should only ever be growing except here where we subtract
		 * out min(last_delay, 1 second), but lord knows bugs happen
		 * and we will be messed up.  In any case delay_nsec can't go
		 * negative.
		 */
		if (unlikely(cur < sub)) {
			atomic64_set(&blkg->delay_nsec, 0);
			blkg->last_delay = 0;
		} else {
			atomic64_sub(sub, &blkg->delay_nsec);
			blkg->last_delay = cur - sub;
		}
		blkg->last_use = cur_use;
	}
}

/*
 * This is called when we want to actually walk up the hierarchy and check to
 * see if we need to throttle, and then actually throttle if there is some
 * accumulated delay.  This should only be called upon return to user space so
 * we're not holding some lock that would induce a priority inversion.
 */
static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
{
	unsigned long pflags;
	bool clamp;
	u64 now = ktime_to_ns(ktime_get());
	u64 exp;
	u64 delay_nsec = 0;
	int tok;

	while (blkg->parent) {
		int use_delay = atomic_read(&blkg->use_delay);

		if (use_delay) {
			u64 this_delay;

			blkcg_scale_delay(blkg, now);
			this_delay = atomic64_read(&blkg->delay_nsec);
			if (this_delay > delay_nsec) {
				delay_nsec = this_delay;
				clamp = use_delay > 0;
			}
		}
		blkg = blkg->parent;
	}

	if (!delay_nsec)
		return;

	/*
	 * Let's not sleep for all eternity if we've amassed a huge delay.
	 * Swapping or metadata IO can accumulate 10's of seconds worth of
	 * delay, and we want userspace to be able to do _something_ so cap
	 * the delays at 0.25s.  If there's 10's of tasks doing this stuff we
	 * at least get some jitter going.  A negative use_delay means an
	 * unclamped absolute delay was requested, so skip the cap then.
	 */
	if (clamp)
		delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);

	if (use_memdelay)
		psi_memstall_enter(&pflags);

	exp = ktime_add_ns(now, delay_nsec);
	tok = io_schedule_prepare();
	do {
		__set_current_state(TASK_KILLABLE);
		if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS))
			break;
	} while (!fatal_signal_pending(current));
	io_schedule_finish(tok);

	if (use_memdelay)
		psi_memstall_leave(&pflags);
}

/**
 * blkcg_maybe_throttle_current - throttle the current task if it has been marked
 *
 * This is only called if we've been marked with set_notify_resume().  Obviously
 * we can be set_notify_resume() for reasons other than blkcg throttling, so we
 * check to see if current->throttle_queue is set and if not this doesn't do
 * anything.  This should only ever be called by the resume code, it's not meant
 * to be called by people willy-nilly as it will actually do the work to
 * throttle the task if it is setup for throttling.
 */
void blkcg_maybe_throttle_current(void)
{
	struct request_queue *q = current->throttle_queue;
	struct cgroup_subsys_state *css;
	struct blkcg *blkcg;
	struct blkcg_gq *blkg;
	bool use_memdelay = current->use_memdelay;

	if (!q)
		return;

	current->throttle_queue = NULL;
	current->use_memdelay = false;

	rcu_read_lock();
	css = kthread_blkcg();
	if (css)
		blkcg = css_to_blkcg(css);
	else
		blkcg = css_to_blkcg(task_css(current, io_cgrp_id));

	if (!blkcg)
		goto out;
	blkg = blkg_lookup(blkcg, q);
	if (!blkg)
		goto out;
	if (!blkg_tryget(blkg))
		goto out;
	rcu_read_unlock();

	blkcg_maybe_throttle_blkg(blkg, use_memdelay);
	blkg_put(blkg);
	blk_put_queue(q);
	return;
out:
	rcu_read_unlock();
	blk_put_queue(q);
}

/**
 * blkcg_schedule_throttle - this task needs to check for throttling
 * @q: the request queue IO was submitted on
 * @use_memdelay: do we charge this to memory delay for PSI
 *
 * This is called by the IO controller when we know there's delay accumulated
 * for the blkg for this task.  We do not pass the blkg because there are places
 * we call this that may not have that information, the swapping code for
 * instance will only have a request_queue at that point.  This sets the
 * notify_resume for the task to check and see if it requires throttling before
 * returning to user space.
 *
 * We will only schedule once per syscall.  You can call this over and over
 * again and it will only do the check once upon return to user space, and only
 * throttle once.  If the task needs to be throttled again it'll need to be
 * rescheduled again.
 */
void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
{
	if (unlikely(current->flags & PF_KTHREAD))
		return;

	if (current->throttle_queue != q) {
		if (!blk_get_queue(q))
			return;

		if (current->throttle_queue)
			blk_put_queue(current->throttle_queue);
		current->throttle_queue = q;
	}

	if (use_memdelay)
		current->use_memdelay = use_memdelay;
	set_notify_resume(current);
}

/**
 * blkcg_add_delay - add delay to this blkg
 * @blkg: blkg of interest
 * @now: the current time in nanoseconds
 * @delta: how many nanoseconds of delay to add
 *
 * Charge @delta to the blkg's current delay accumulation.  This is used to
 * throttle tasks if an IO controller thinks we need more throttling.
 */
void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
{
	if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0))
		return;
	blkcg_scale_delay(blkg, now);
	atomic64_add(delta, &blkg->delay_nsec);
}
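
/*
 * Example usage (sketch): an IO controller that measured @delta nanoseconds
 * of excess latency attributable to @blkg would typically charge the delay
 * and then mark the offending task so it throttles itself on the way back
 * to user space:
 *
 *	blkcg_add_delay(blkg, ktime_to_ns(ktime_get()), delta);
 *	blkcg_schedule_throttle(blkg->q, false);
 *
 * The sleep itself then happens in blkcg_maybe_throttle_current() via the
 * notify_resume path set up above.
 */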

/**
 * blkg_tryget_closest - try and get a blkg ref on the closest blkg
 * @bio: target bio
 * @css: target css
 *
 * As the failure mode here is to walk up the blkg tree, this ensures that the
 * blkg->parent pointers are always valid.  This returns the blkg that it ended
 * up taking a reference on or %NULL if no reference was taken.
 */
static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio,
		struct cgroup_subsys_state *css)
{
	struct blkcg_gq *blkg, *ret_blkg = NULL;

	rcu_read_lock();
	blkg = blkg_lookup_create(css_to_blkcg(css),
				  bio->bi_bdev->bd_disk->queue);
	while (blkg) {
		if (blkg_tryget(blkg)) {
			ret_blkg = blkg;
			break;
		}
		blkg = blkg->parent;
	}
	rcu_read_unlock();

	return ret_blkg;
}

/**
 * bio_associate_blkg_from_css - associate a bio with a specified css
 * @bio: target bio
 * @css: target css
 *
 * Associate @bio with the blkg found by combining the css's blkg and the
 * request_queue of the @bio.  An association failure is handled by walking up
 * the blkg tree.  Therefore, the blkg associated can be anything between the
 * immediate blkg and the root_blkg.  This situation only happens when a cgroup
 * is dying and then the remaining bios will spill to the closest alive blkg.
 *
 * A reference will be taken on the blkg and will be released when @bio is
 * freed.
 */
void bio_associate_blkg_from_css(struct bio *bio,
				 struct cgroup_subsys_state *css)
{
	if (bio->bi_blkg)
		blkg_put(bio->bi_blkg);

	if (css && css->parent) {
		bio->bi_blkg = blkg_tryget_closest(bio, css);
	} else {
		blkg_get(bio->bi_bdev->bd_disk->queue->root_blkg);
		bio->bi_blkg = bio->bi_bdev->bd_disk->queue->root_blkg;
	}
}
EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);

/**
 * bio_associate_blkg - associate a bio with a blkg
 * @bio: target bio
 *
 * Associate @bio with the blkg found from the bio's css and request_queue.
 * If one is not found, blkg_lookup_create() creates the blkg.  If a blkg is
 * already associated, the css is reused and association redone as the
 * request_queue may have changed.
 */
void bio_associate_blkg(struct bio *bio)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();

	if (bio->bi_blkg)
		css = &bio_blkcg(bio)->css;
	else
		css = blkcg_css();

	bio_associate_blkg_from_css(bio, css);

	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(bio_associate_blkg);
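
/*
 * Example usage (sketch): a layer that re-targets an already-associated bio
 * to a different device would refresh the association afterwards so stats
 * and throttling apply to the new queue:
 *
 *	bio->bi_bdev = new_bdev;	// new_bdev is hypothetical
 *	bio_associate_blkg(bio);
 *
 * Note that bio_set_dev() already triggers this association, so an explicit
 * call is only needed when bi_bdev is changed by hand as above.
 */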

/**
 * bio_clone_blkg_association - clone blkg association from src to dst bio
 * @dst: destination bio
 * @src: source bio
 */
void bio_clone_blkg_association(struct bio *dst, struct bio *src)
{
	if (src->bi_blkg) {
		if (dst->bi_blkg)
			blkg_put(dst->bi_blkg);
		blkg_get(src->bi_blkg);
		dst->bi_blkg = src->bi_blkg;
	}
}
EXPORT_SYMBOL_GPL(bio_clone_blkg_association);

static int blk_cgroup_io_type(struct bio *bio)
{
	if (op_is_discard(bio->bi_opf))
		return BLKG_IOSTAT_DISCARD;
	if (op_is_write(bio->bi_opf))
		return BLKG_IOSTAT_WRITE;
	return BLKG_IOSTAT_READ;
}

void blk_cgroup_bio_start(struct bio *bio)
{
	int rwd = blk_cgroup_io_type(bio), cpu;
	struct blkg_iostat_set *bis;

	cpu = get_cpu();
	bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu);
	u64_stats_update_begin(&bis->sync);

	/*
	 * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split
	 * bio and we would have already accounted for the size of the bio.
	 */
	if (!bio_flagged(bio, BIO_CGROUP_ACCT)) {
		bio_set_flag(bio, BIO_CGROUP_ACCT);
		bis->cur.bytes[rwd] += bio->bi_iter.bi_size;
	}
	bis->cur.ios[rwd]++;

	u64_stats_update_end(&bis->sync);
	if (cgroup_subsys_on_dfl(io_cgrp_subsys))
		cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu);
	put_cpu();
}

static int __init blkcg_init(void)
{
	blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",
					    WQ_MEM_RECLAIM | WQ_FREEZABLE |
					    WQ_UNBOUND | WQ_SYSFS, 0);
	if (!blkcg_punt_bio_wq)
		return -ENOMEM;
	return 0;
}
subsys_initcall(blkcg_init);

module_param(blkcg_debug_stats, bool, 0644);
MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");