#include <linux/ioprio.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/sched/signal.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/ctype.h>
#include <linux/blk-cgroup.h>
#include <linux/tracehook.h>
#include <linux/psi.h>
#include "blk.h"
#include "blk-ioprio.h"

/*
 * blkcg_pol_register_mutex nests outside of blkcg_pol_mutex and protects
 * policy [un]registration.  blkcg_pol_mutex protects blkcg_policy[] and
 * the per-blkcg policy data.
 */
static DEFINE_MUTEX(blkcg_pol_register_mutex);
static DEFINE_MUTEX(blkcg_pol_mutex);

struct blkcg blkcg_root;
EXPORT_SYMBOL_GPL(blkcg_root);

struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
EXPORT_SYMBOL_GPL(blkcg_root_css);

static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];

static LIST_HEAD(all_blkcgs);		/* protected by blkcg_pol_mutex */

static bool blkcg_debug_stats = false;
static struct workqueue_struct *blkcg_punt_bio_wq;

#define BLKG_DESTROY_BATCH_SIZE 64

static bool blkcg_policy_enabled(struct request_queue *q,
				 const struct blkcg_policy *pol)
{
	return pol && test_bit(pol->plid, q->blkcg_pols);
}

/**
 * blkg_free - free a blkg
 * @blkg: blkg to free
 *
 * Free @blkg which may be partially allocated.
 */
static void blkg_free(struct blkcg_gq *blkg)
{
	int i;

	if (!blkg)
		return;

	for (i = 0; i < BLKCG_MAX_POLS; i++)
		if (blkg->pd[i])
			blkcg_policy[i]->pd_free_fn(blkg->pd[i]);

	free_percpu(blkg->iostat_cpu);
	percpu_ref_exit(&blkg->refcnt);
	kfree(blkg);
}

static void __blkg_release(struct rcu_head *rcu)
{
	struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);

	WARN_ON(!bio_list_empty(&blkg->async_bios));

	/* release the blkcg and parent blkg refs this blkg has been holding */
	css_put(&blkg->blkcg->css);
	if (blkg->parent)
		blkg_put(blkg->parent);

	wb_congested_put(blkg->wb_congested);

	blkg_free(blkg);
}

/*
 * percpu_ref release callback.  The actual release is deferred to an RCU
 * grace period so that lookups still in flight under rcu_read_lock() keep
 * seeing a valid blkg.
 */
static void blkg_release(struct percpu_ref *ref)
{
	struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt);

	call_rcu(&blkg->rcu_head, __blkg_release);
}

static void blkg_async_bio_workfn(struct work_struct *work)
{
	struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
					     async_bio_work);
	struct bio_list bios = BIO_EMPTY_LIST;
	struct bio *bio;
	struct blk_plug plug;
	bool need_plug = false;

	/* as long as there are pending bios, @blkg can't go away */
	spin_lock_bh(&blkg->async_bio_lock);
	bio_list_merge(&bios, &blkg->async_bios);
	bio_list_init(&blkg->async_bios);
	spin_unlock_bh(&blkg->async_bio_lock);

	/* start plug only when bio_list contains at least 2 bios */
	if (bios.head && bios.head->bi_next) {
		need_plug = true;
		blk_start_plug(&plug);
	}
	while ((bio = bio_list_pop(&bios)))
		submit_bio(bio);
	if (need_plug)
		blk_finish_plug(&plug);
}

/**
 * blkg_alloc - allocate a blkg
 * @blkcg: block cgroup the new blkg is associated with
 * @q: request_queue the new blkg is associated with
 * @gfp_mask: allocation mask to use
 *
 * Allocate a new blkg associating @blkcg and @q.
 */
static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
				   gfp_t gfp_mask)
{
	struct blkcg_gq *blkg;
	int i, cpu;

	/* alloc and init base part */
	blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node);
	if (!blkg)
		return NULL;

	if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask))
		goto err_free;

	blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask);
	if (!blkg->iostat_cpu)
		goto err_free;

	blkg->q = q;
	INIT_LIST_HEAD(&blkg->q_node);
	spin_lock_init(&blkg->async_bio_lock);
	bio_list_init(&blkg->async_bios);
	INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
	blkg->blkcg = blkcg;

	u64_stats_init(&blkg->iostat.sync);
	for_each_possible_cpu(cpu)
		u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync);

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];
		struct blkg_policy_data *pd;

		if (!blkcg_policy_enabled(q, pol))
			continue;

		/* alloc per-policy data and attach it to blkg */
		pd = pol->pd_alloc_fn(gfp_mask, q, blkcg);
		if (!pd)
			goto err_free;

		blkg->pd[i] = pd;
		pd->blkg = blkg;
		pd->plid = i;
	}

	return blkg;

err_free:
	blkg_free(blkg);
	return NULL;
}

struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
				      struct request_queue *q, bool update_hint)
{
	struct blkcg_gq *blkg;

	/*
	 * Hint didn't match.  Look up from the radix tree.  Note that the
	 * hint can only be updated under queue_lock as otherwise @blkg
	 * could have already been removed from blkg_tree.  The caller is
	 * responsible for grabbing queue_lock if @update_hint.
	 */
	blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
	if (blkg && blkg->q == q) {
		if (update_hint) {
			lockdep_assert_held(&q->queue_lock);
			rcu_assign_pointer(blkcg->blkg_hint, blkg);
		}
		return blkg;
	}

	return NULL;
}
EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);

/*
 * If @new_blkg is %NULL, this function tries to allocate a new one as
 * necessary using %GFP_NOWAIT.  @new_blkg is always consumed on return.
 */
static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
				    struct request_queue *q,
				    struct blkcg_gq *new_blkg)
{
	struct blkcg_gq *blkg;
	struct bdi_writeback_congested *wb_congested;
	int i, ret;

	WARN_ON_ONCE(!rcu_read_lock_held());
	lockdep_assert_held(&q->queue_lock);

	/* request_queue is dying, do not create/recreate a blkg */
	if (blk_queue_dying(q)) {
		ret = -ENODEV;
		goto err_free_blkg;
	}

	/* blkg holds a reference to blkcg */
	if (!css_tryget_online(&blkcg->css)) {
		ret = -ENODEV;
		goto err_free_blkg;
	}

	wb_congested = wb_congested_get_create(q->backing_dev_info,
					       blkcg->css.id,
					       GFP_NOWAIT | __GFP_NOWARN);
	if (!wb_congested) {
		ret = -ENOMEM;
		goto err_put_css;
	}

	/* allocate */
	if (!new_blkg) {
		new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN);
		if (unlikely(!new_blkg)) {
			ret = -ENOMEM;
			goto err_put_congested;
		}
	}
	blkg = new_blkg;
	blkg->wb_congested = wb_congested;

	/* link parent */
	if (blkcg_parent(blkcg)) {
		blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
		if (WARN_ON_ONCE(!blkg->parent)) {
			ret = -ENODEV;
			goto err_put_congested;
		}
		blkg_get(blkg->parent);
	}

	/* invoke per-policy init */
	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];

		if (blkg->pd[i] && pol->pd_init_fn)
			pol->pd_init_fn(blkg->pd[i]);
	}

	/* insert */
	spin_lock(&blkcg->lock);
	ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
	if (likely(!ret)) {
		hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
		list_add(&blkg->q_node, &q->blkg_list);

		for (i = 0; i < BLKCG_MAX_POLS; i++) {
			struct blkcg_policy *pol = blkcg_policy[i];

			if (blkg->pd[i] && pol->pd_online_fn)
				pol->pd_online_fn(blkg->pd[i]);
		}
	}
	blkg->online = true;
	spin_unlock(&blkcg->lock);

	if (!ret)
		return blkg;

	/* @blkg failed to be fully initialized, use the usual release path */
	blkg_put(blkg);
	return ERR_PTR(ret);

err_put_congested:
	wb_congested_put(wb_congested);
err_put_css:
	css_put(&blkcg->css);
err_free_blkg:
	blkg_free(new_blkg);
	return ERR_PTR(ret);
}

/**
 * blkg_lookup_create - lookup blkg, try to create one if not there
 * @blkcg: blkcg of interest
 * @q: request_queue of interest
 *
 * Lookup blkg for the @blkcg - @q pair.  If it doesn't exist, try to
 * create one.  Creation walks down from blkcg_root so that all non-root
 * blkgs have access to their parents.  Should be called under RCU read
 * lock; takes and releases @q->queue_lock internally.
 *
 * Returns the blkg, or the closest existing ancestor blkg if blkg_create()
 * fails while walking down from the root.
 */
static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
					   struct request_queue *q)
{
	struct blkcg_gq *blkg;
	unsigned long flags;

	WARN_ON_ONCE(!rcu_read_lock_held());

	blkg = blkg_lookup(blkcg, q);
	if (blkg)
		return blkg;

	spin_lock_irqsave(&q->queue_lock, flags);
	blkg = __blkg_lookup(blkcg, q, true);
	if (blkg)
		goto found;

	/*
	 * Create blkgs walking down from blkcg_root to @blkcg, so that all
	 * non-root blkgs have access to their parents.  Returns the closest
	 * blkg to the intended blkg should blkg_create() fail.
	 */
	while (true) {
		struct blkcg *pos = blkcg;
		struct blkcg *parent = blkcg_parent(blkcg);
		struct blkcg_gq *ret_blkg = q->root_blkg;

		while (parent) {
			blkg = __blkg_lookup(parent, q, false);
			if (blkg) {
				/* remember closest blkg */
				ret_blkg = blkg;
				break;
			}
			pos = parent;
			parent = blkcg_parent(parent);
		}

		blkg = blkg_create(pos, q, NULL);
		if (IS_ERR(blkg)) {
			blkg = ret_blkg;
			break;
		}
		if (pos == blkcg)
			break;
	}

found:
	spin_unlock_irqrestore(&q->queue_lock, flags);
	return blkg;
}

static void blkg_destroy(struct blkcg_gq *blkg)
{
	struct blkcg *blkcg = blkg->blkcg;
	int i;

	lockdep_assert_held(&blkg->q->queue_lock);
	lockdep_assert_held(&blkcg->lock);

	/* Something wrong if we are trying to remove the same group twice */
	WARN_ON_ONCE(list_empty(&blkg->q_node));
	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];

		if (blkg->pd[i] && pol->pd_offline_fn)
			pol->pd_offline_fn(blkg->pd[i]);
	}

	blkg->online = false;

	radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
	list_del_init(&blkg->q_node);
	hlist_del_init_rcu(&blkg->blkcg_node);

	/*
	 * Both setting the lookup hint to and clearing it from @blkg are
	 * done under queue_lock.  If it's not pointing to @blkg now, it
	 * never will.
	 */
	if (rcu_access_pointer(blkcg->blkg_hint) == blkg)
		rcu_assign_pointer(blkcg->blkg_hint, NULL);

	/*
	 * Put the reference taken at creation time so that the blkg is
	 * released once all policies are done with it.
	 */
	percpu_ref_kill(&blkg->refcnt);
}

/**
 * blkg_destroy_all - destroy all blkgs associated with a request_queue
 * @q: request_queue of interest
 *
 * Destroy all blkgs associated with @q.
 */
static void blkg_destroy_all(struct request_queue *q)
{
	struct blkcg_gq *blkg, *n;
	int count = BLKG_DESTROY_BATCH_SIZE;

restart:
	spin_lock_irq(&q->queue_lock);
	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
		struct blkcg *blkcg = blkg->blkcg;

		spin_lock(&blkcg->lock);
		blkg_destroy(blkg);
		spin_unlock(&blkcg->lock);

		/*
		 * In order to avoid holding the spin lock for too long,
		 * release it when a batch of blkgs has been destroyed.
		 */
		if (!(--count)) {
			count = BLKG_DESTROY_BATCH_SIZE;
			spin_unlock_irq(&q->queue_lock);
			cond_resched();
			goto restart;
		}
	}

	q->root_blkg = NULL;
	spin_unlock_irq(&q->queue_lock);
}

void __blkg_release_rcu(struct rcu_head *rcu_head)
{
	struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head);

	/* release the blkcg and parent blkg refs this blkg has been holding */
	css_put(&blkg->blkcg->css);
	if (blkg->parent)
		blkg_put(blkg->parent);

	wb_congested_put(blkg->wb_congested);

	blkg_free(blkg);
}
EXPORT_SYMBOL_GPL(__blkg_release_rcu);

static int blkcg_reset_stats(struct cgroup_subsys_state *css,
			     struct cftype *cftype, u64 val)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	struct blkcg_gq *blkg;
	int i, cpu;

	mutex_lock(&blkcg_pol_mutex);
	spin_lock_irq(&blkcg->lock);

	/*
	 * Note that stat reset is racy - it doesn't synchronize against
	 * in-flight stat updates.  This is a debug feature; if a racing
	 * update slips through, simply retry the reset.
	 */
	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
		for_each_possible_cpu(cpu) {
			struct blkg_iostat_set *bis =
				per_cpu_ptr(blkg->iostat_cpu, cpu);
			memset(bis, 0, sizeof(*bis));
		}
		memset(&blkg->iostat, 0, sizeof(blkg->iostat));

		for (i = 0; i < BLKCG_MAX_POLS; i++) {
			struct blkcg_policy *pol = blkcg_policy[i];

			if (blkg->pd[i] && pol->pd_reset_stats_fn)
				pol->pd_reset_stats_fn(blkg->pd[i]);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	mutex_unlock(&blkcg_pol_mutex);
	return 0;
}

const char *blkg_dev_name(struct blkcg_gq *blkg)
{
	/* some drivers instantiate a queue without a registered bdi device */
	if (blkg->q->backing_dev_info->dev)
		return bdi_dev_name(blkg->q->backing_dev_info);
	return NULL;
}

/**
 * blkcg_print_blkgs - helper for printing per-blkg data
 * @sf: seq_file to print to
 * @blkcg: blkcg of interest
 * @prfill: fill function to print out a blkg
 * @pol: policy in question
 * @data: data to be passed to @prfill
 * @show_total: to print out sum of prfill return values or not
 *
 * Invokes @prfill on each blkg of @blkcg that has policy data for @pol,
 * passing @sf, the policy data and @data.  If @show_total is %true, the
 * sum of the return values from @prfill is printed with a "Total" label
 * at the end.
 */
void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
		       u64 (*prfill)(struct seq_file *,
				     struct blkg_policy_data *, int),
		       const struct blkcg_policy *pol, int data,
		       bool show_total)
{
	struct blkcg_gq *blkg;
	u64 total = 0;

	rcu_read_lock();
	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
		spin_lock_irq(&blkg->q->queue_lock);
		if (blkcg_policy_enabled(blkg->q, pol))
			total += prfill(sf, blkg->pd[pol->plid], data);
		spin_unlock_irq(&blkg->q->queue_lock);
	}
	rcu_read_unlock();

	if (show_total)
		seq_printf(sf, "Total %llu\n", (unsigned long long)total);
}
EXPORT_SYMBOL_GPL(blkcg_print_blkgs);

/**
 * __blkg_prfill_u64 - prfill helper for a single u64 value
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @v: value to print
 *
 * Print @v to @sf for the device associated with @pd.
 */
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
{
	const char *dname = blkg_dev_name(pd->blkg);

	if (!dname)
		return 0;

	seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
	return v;
}
EXPORT_SYMBOL_GPL(__blkg_prfill_u64);
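
/*
 * Illustrative sketch (not part of this file): a policy would typically
 * print one value per device by pairing blkcg_print_blkgs() with a prfill
 * callback built on __blkg_prfill_u64().  The "foo" names below are
 * hypothetical placeholders, not real symbols.
 *
 *	static u64 foo_prfill_limit(struct seq_file *sf,
 *				    struct blkg_policy_data *pd, int off)
 *	{
 *		return __blkg_prfill_u64(sf, pd, foo_pd(pd)->limit);
 *	}
 *
 *	static int foo_print_limit(struct seq_file *sf, void *v)
 *	{
 *		blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
 *				  foo_prfill_limit, &blkcg_policy_foo,
 *				  0, false);
 *		return 0;
 *	}
 */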

/* Check that @pol is enabled on @q, then look up the blkg with hint update. */
static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
					  const struct blkcg_policy *pol,
					  struct request_queue *q)
{
	WARN_ON_ONCE(!rcu_read_lock_held());
	lockdep_assert_held(&q->queue_lock);

	if (!blkcg_policy_enabled(q, pol))
		return ERR_PTR(-EOPNOTSUPP);
	return __blkg_lookup(blkcg, q, true /* update_hint */);
}

/**
 * blkcg_conf_get_disk - parse and get disk part of blkg config update
 * @inputp: input string pointer
 *
 * Parse the device node prefix part, MAJ:MIN, of a per-blkg config update
 * from *@inputp and return the matching gendisk.  *@inputp is updated to
 * point past the device node prefix.  Returns an ERR_PTR() value on error.
 *
 * Use this function iff blkg_conf_prep() can't be used for some reason.
 */
struct gendisk *blkcg_conf_get_disk(char **inputp)
{
	char *input = *inputp;
	unsigned int major, minor;
	struct gendisk *disk;
	int key_len, part;

	if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
		return ERR_PTR(-EINVAL);

	input += key_len;
	if (!isspace(*input))
		return ERR_PTR(-EINVAL);
	input = skip_spaces(input);

	disk = get_gendisk(MKDEV(major, minor), &part);
	if (!disk)
		return ERR_PTR(-ENODEV);
	if (part) {
		put_disk_and_module(disk);
		return ERR_PTR(-ENODEV);
	}

	*inputp = input;
	return disk;
}
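
/*
 * Illustrative example: given the input "8:0 max=1000" (where "max=1000" is
 * a made-up per-policy body, not a format defined here), the sscanf() above
 * consumes "8:0", the whole-disk gendisk for dev_t 8:0 is looked up, and
 * *inputp is advanced to point at "max=1000" for the caller to parse.
 */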

/**
 * blkg_conf_prep - parse and prepare for per-blkg config update
 * @blkcg: target block cgroup
 * @pol: target policy
 * @input: input string
 * @ctx: blkg_conf_ctx to be filled
 *
 * Parse a per-blkg config update from @input and initialize @ctx with the
 * result.  @ctx->blkg points to the blkg to be updated and @ctx->body to
 * the part of @input following MAJ:MIN.  This function returns with RCU
 * read lock and queue lock held and must be paired with blkg_conf_finish().
 */
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
		   char *input, struct blkg_conf_ctx *ctx)
	__acquires(rcu) __acquires(&disk->queue->queue_lock)
{
	struct gendisk *disk;
	struct request_queue *q;
	struct blkcg_gq *blkg;
	int ret;

	disk = blkcg_conf_get_disk(&input);
	if (IS_ERR(disk))
		return PTR_ERR(disk);

	q = disk->queue;

	rcu_read_lock();
	spin_lock_irq(&q->queue_lock);

	blkg = blkg_lookup_check(blkcg, pol, q);
	if (IS_ERR(blkg)) {
		ret = PTR_ERR(blkg);
		goto fail_unlock;
	}

	if (blkg)
		goto success;

	/*
	 * Create blkgs walking down from blkcg_root to @blkcg, so that all
	 * non-root blkgs have access to their parents.
	 */
	while (true) {
		struct blkcg *pos = blkcg;
		struct blkcg *parent;
		struct blkcg_gq *new_blkg;

		parent = blkcg_parent(blkcg);
		while (parent && !__blkg_lookup(parent, q, false)) {
			pos = parent;
			parent = blkcg_parent(parent);
		}

		/* Drop locks to do the blkg allocation with GFP_KERNEL. */
		spin_unlock_irq(&q->queue_lock);
		rcu_read_unlock();

		new_blkg = blkg_alloc(pos, q, GFP_KERNEL);
		if (unlikely(!new_blkg)) {
			ret = -ENOMEM;
			goto fail;
		}

		if (radix_tree_preload(GFP_KERNEL)) {
			blkg_free(new_blkg);
			ret = -ENOMEM;
			goto fail;
		}

		rcu_read_lock();
		spin_lock_irq(&q->queue_lock);

		blkg = blkg_lookup_check(pos, pol, q);
		if (IS_ERR(blkg)) {
			ret = PTR_ERR(blkg);
			blkg_free(new_blkg);
			goto fail_preloaded;
		}

		if (blkg) {
			blkg_free(new_blkg);
		} else {
			blkg = blkg_create(pos, q, new_blkg);
			if (unlikely(IS_ERR(blkg))) {
				ret = PTR_ERR(blkg);
				goto fail_preloaded;
			}
		}

		radix_tree_preload_end();

		if (pos == blkcg)
			goto success;
	}
success:
	ctx->disk = disk;
	ctx->blkg = blkg;
	ctx->body = input;
	return 0;

fail_preloaded:
	radix_tree_preload_end();
fail_unlock:
	spin_unlock_irq(&q->queue_lock);
	rcu_read_unlock();
fail:
	put_disk_and_module(disk);
	/*
	 * If the queue was bypassing, we should retry.  Do so after a short
	 * msleep().  It isn't strictly necessary but the queue can be
	 * bypassing for some time and it's always nice to avoid busy looping.
	 */
	if (ret == -EBUSY) {
		msleep(10);
		ret = restart_syscall();
	}
	return ret;
}
EXPORT_SYMBOL_GPL(blkg_conf_prep);

/**
 * blkg_conf_finish - finish up per-blkg config update
 * @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
 *
 * Finish up after a per-blkg config update.  Must be paired with
 * blkg_conf_prep().
 */
void blkg_conf_finish(struct blkg_conf_ctx *ctx)
	__releases(&ctx->disk->queue->queue_lock) __releases(rcu)
{
	spin_unlock_irq(&ctx->disk->queue->queue_lock);
	rcu_read_unlock();
	put_disk_and_module(ctx->disk);
}
EXPORT_SYMBOL_GPL(blkg_conf_finish);
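
/*
 * Illustrative sketch of the intended calling pattern (assumed, not part of
 * this file): a policy's cftype ->write handler brackets its per-device
 * parsing with blkg_conf_prep()/blkg_conf_finish().  The "foo" identifiers
 * are hypothetical.
 *
 *	static ssize_t foo_set_limit(struct kernfs_open_file *of, char *buf,
 *				     size_t nbytes, loff_t off)
 *	{
 *		struct blkcg *blkcg = css_to_blkcg(of_css(of));
 *		struct blkg_conf_ctx ctx;
 *		int ret;
 *
 *		ret = blkg_conf_prep(blkcg, &blkcg_policy_foo, buf, &ctx);
 *		if (ret)
 *			return ret;
 *
 *		(ctx.blkg is pinned and queue_lock/RCU are held here; parse
 *		 ctx.body and update ctx.blkg's policy data accordingly.)
 *
 *		blkg_conf_finish(&ctx);
 *		return nbytes;
 *	}
 */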

static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
{
	int i;

	for (i = 0; i < BLKG_IOSTAT_NR; i++) {
		dst->bytes[i] = src->bytes[i];
		dst->ios[i] = src->ios[i];
	}
}

static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src)
{
	int i;

	for (i = 0; i < BLKG_IOSTAT_NR; i++) {
		dst->bytes[i] += src->bytes[i];
		dst->ios[i] += src->ios[i];
	}
}

static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src)
{
	int i;

	for (i = 0; i < BLKG_IOSTAT_NR; i++) {
		dst->bytes[i] -= src->bytes[i];
		dst->ios[i] -= src->ios[i];
	}
}

static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	struct blkcg_gq *blkg;

	/* Root-level stats are sourced from system-wide IO statistics */
	if (!cgroup_parent(css->cgroup))
		return;

	rcu_read_lock();

	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
		struct blkcg_gq *parent = blkg->parent;
		struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
		struct blkg_iostat cur, delta;
		unsigned long flags;
		unsigned int seq;

		/* fetch the current per-cpu values */
		do {
			seq = u64_stats_fetch_begin(&bisc->sync);
			blkg_iostat_set(&cur, &bisc->cur);
		} while (u64_stats_fetch_retry(&bisc->sync, seq));

		/* propagate the percpu delta to the global counters */
		flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
		blkg_iostat_set(&delta, &cur);
		blkg_iostat_sub(&delta, &bisc->last);
		blkg_iostat_add(&blkg->iostat.cur, &delta);
		blkg_iostat_add(&bisc->last, &delta);
		u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);

		/* propagate the global delta to the parent (unless that's root) */
		if (parent && parent->parent) {
			flags = u64_stats_update_begin_irqsave(&parent->iostat.sync);
			blkg_iostat_set(&delta, &blkg->iostat.cur);
			blkg_iostat_sub(&delta, &blkg->iostat.last);
			blkg_iostat_add(&parent->iostat.cur, &delta);
			blkg_iostat_add(&blkg->iostat.last, &delta);
			u64_stats_update_end_irqrestore(&parent->iostat.sync, flags);
		}
	}

	rcu_read_unlock();
}

/*
 * Root cgroup stats are sourced from the system-wide disk statistics
 * rather than tracked separately, so cgroup_rstat_flush() never fills the
 * root blkg's iostat.  To keep the printing path common between root and
 * non-root cgroups, simulate a flush for the root by filling its iostat
 * from the per-cpu disk stats here.
 */
static void blkcg_fill_root_iostats(void)
{
	struct class_dev_iter iter;
	struct device *dev;

	class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
	while ((dev = class_dev_iter_next(&iter))) {
		struct gendisk *disk = dev_to_disk(dev);
		struct hd_struct *part = disk_get_part(disk, 0);
		struct blkcg_gq *blkg = blk_queue_root_blkg(disk->queue);
		struct blkg_iostat tmp;
		int cpu;

		memset(&tmp, 0, sizeof(tmp));
		for_each_possible_cpu(cpu) {
			struct disk_stats *cpu_dkstats;
			unsigned long flags;

			cpu_dkstats = per_cpu_ptr(part->dkstats, cpu);
			tmp.ios[BLKG_IOSTAT_READ] +=
				cpu_dkstats->ios[STAT_READ];
			tmp.ios[BLKG_IOSTAT_WRITE] +=
				cpu_dkstats->ios[STAT_WRITE];
			tmp.ios[BLKG_IOSTAT_DISCARD] +=
				cpu_dkstats->ios[STAT_DISCARD];

			tmp.bytes[BLKG_IOSTAT_READ] +=
				cpu_dkstats->sectors[STAT_READ] << 9;
			tmp.bytes[BLKG_IOSTAT_WRITE] +=
				cpu_dkstats->sectors[STAT_WRITE] << 9;
			tmp.bytes[BLKG_IOSTAT_DISCARD] +=
				cpu_dkstats->sectors[STAT_DISCARD] << 9;

			flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
			blkg_iostat_set(&blkg->iostat.cur, &tmp);
			u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
		}
		disk_put_part(part);
	}
}

static int blkcg_print_stat(struct seq_file *sf, void *v)
{
	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
	struct blkcg_gq *blkg;

	if (!seq_css(sf)->parent)
		blkcg_fill_root_iostats();
	else
		cgroup_rstat_flush(blkcg->css.cgroup);

	rcu_read_lock();

	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
		struct blkg_iostat_set *bis = &blkg->iostat;
		const char *dname;
		char *buf;
		u64 rbytes, wbytes, rios, wios, dbytes, dios;
		size_t size = seq_get_buf(sf, &buf), off = 0;
		int i;
		bool has_stats = false;
		unsigned seq;

		spin_lock_irq(&blkg->q->queue_lock);

		if (!blkg->online)
			goto skip;

		dname = blkg_dev_name(blkg);
		if (!dname)
			goto skip;

		/*
		 * scnprintf() returns the number of bytes written excluding
		 * the trailing NUL, so advancing @off by the return value
		 * keeps subsequent writes appending at the NUL.
		 */
		off += scnprintf(buf+off, size-off, "%s ", dname);

		do {
			seq = u64_stats_fetch_begin(&bis->sync);

			rbytes = bis->cur.bytes[BLKG_IOSTAT_READ];
			wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE];
			dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD];
			rios = bis->cur.ios[BLKG_IOSTAT_READ];
			wios = bis->cur.ios[BLKG_IOSTAT_WRITE];
			dios = bis->cur.ios[BLKG_IOSTAT_DISCARD];
		} while (u64_stats_fetch_retry(&bis->sync, seq));

		if (rbytes || wbytes || rios || wios) {
			has_stats = true;
			off += scnprintf(buf+off, size-off,
					 "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu",
					 rbytes, wbytes, rios, wios,
					 dbytes, dios);
		}

		if (!blkcg_debug_stats)
			goto next;

		if (atomic_read(&blkg->use_delay)) {
			has_stats = true;
			off += scnprintf(buf+off, size-off,
					 " use_delay=%d delay_nsec=%llu",
					 atomic_read(&blkg->use_delay),
					 (unsigned long long)atomic64_read(&blkg->delay_nsec));
		}

		for (i = 0; i < BLKCG_MAX_POLS; i++) {
			struct blkcg_policy *pol = blkcg_policy[i];
			size_t written;

			if (!blkg->pd[i] || !pol->pd_stat_fn)
				continue;

			written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off);
			if (written)
				has_stats = true;
			off += written;
		}
next:
		if (has_stats) {
			if (off < size - 1) {
				off += scnprintf(buf+off, size-off, "\n");
				seq_commit(sf, off);
			} else {
				seq_commit(sf, -1);
			}
		}
	skip:
		spin_unlock_irq(&blkg->q->queue_lock);
	}

	rcu_read_unlock();
	return 0;
}
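
/*
 * For reference, each line emitted above looks like (illustrative values):
 *
 *	8:0 rbytes=1459200 wbytes=314773504 rios=192 wios=353 dbytes=0 dios=0
 *
 * with " use_delay=... delay_nsec=..." and per-policy pd_stat_fn output
 * appended only when blkcg_debug_stats is enabled.
 */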

static struct cftype blkcg_files[] = {
	{
		.name = "stat",
		.seq_show = blkcg_print_stat,
	},
	{ }	/* terminate */
};

static struct cftype blkcg_legacy_files[] = {
	{
		.name = "reset_stats",
		.write_u64 = blkcg_reset_stats,
	},
	{ }	/* terminate */
};

/*
 * blkcg destruction is a three-stage process.
 *
 * 1. Destruction starts.  The blkcg_css_offline() callback is invoked,
 *    which offlines writeback and drops the base online pin, tying the
 *    next stage to the completion of writeback associated with the blkcg.
 *
 * 2. When the online pin count drops to zero, blkcg_destroy_blkgs() is
 *    called and destroys the blkgs, putting the css references taken when
 *    the blkgs were created.
 *
 * 3. Once the css reference count of the blkcg reaches zero,
 *    blkcg_css_free() is called and the blkcg itself is freed.
 */
static void blkcg_css_offline(struct cgroup_subsys_state *css)
{
	struct blkcg *blkcg = css_to_blkcg(css);

	/* offline writeback for this blkcg */
	wb_blkcg_offline(blkcg);

	/* drop the base online pin; the last pin triggers blkg destruction */
	blkcg_unpin_online(blkcg);
}

/**
 * blkcg_destroy_blkgs - responsible for shooting down blkgs
 * @blkcg: blkcg of interest
 *
 * blkgs should be removed while holding both q and blkcg locks.  As the
 * blkcg lock is nested inside the q lock, this function performs reverse
 * double lock dancing.  Destroying the blkgs releases the references held
 * on the blkcg's css, allowing blkcg_css_free() to eventually be called.
 */
void blkcg_destroy_blkgs(struct blkcg *blkcg)
{
	spin_lock_irq(&blkcg->lock);

	while (!hlist_empty(&blkcg->blkg_list)) {
		struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
						    struct blkcg_gq, blkcg_node);
		struct request_queue *q = blkg->q;

		if (spin_trylock(&q->queue_lock)) {
			blkg_destroy(blkg);
			spin_unlock(&q->queue_lock);
		} else {
			spin_unlock_irq(&blkcg->lock);
			cpu_relax();
			spin_lock_irq(&blkcg->lock);
		}
	}

	spin_unlock_irq(&blkcg->lock);
}

static void blkcg_css_free(struct cgroup_subsys_state *css)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	int i;

	mutex_lock(&blkcg_pol_mutex);

	list_del(&blkcg->all_blkcgs_node);

	for (i = 0; i < BLKCG_MAX_POLS; i++)
		if (blkcg->cpd[i])
			blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);

	mutex_unlock(&blkcg_pol_mutex);

	kfree(blkcg);
}

static struct cgroup_subsys_state *
blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct blkcg *blkcg;
	struct cgroup_subsys_state *ret;
	int i;

	mutex_lock(&blkcg_pol_mutex);

	if (!parent_css) {
		blkcg = &blkcg_root;
	} else {
		blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
		if (!blkcg) {
			ret = ERR_PTR(-ENOMEM);
			goto unlock;
		}
	}

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];
		struct blkcg_policy_data *cpd;

		/*
		 * If the policy hasn't been attached yet, wait for it to be
		 * attached before doing anything else.  Otherwise, check if
		 * the policy requires any specific per-cgroup data: if it
		 * does, allocate and initialize it.
		 */
		if (!pol || !pol->cpd_alloc_fn)
			continue;

		cpd = pol->cpd_alloc_fn(GFP_KERNEL);
		if (!cpd) {
			ret = ERR_PTR(-ENOMEM);
			goto free_pd_blkcg;
		}
		blkcg->cpd[i] = cpd;
		cpd->blkcg = blkcg;
		cpd->plid = i;
		if (pol->cpd_init_fn)
			pol->cpd_init_fn(cpd);
	}

	spin_lock_init(&blkcg->lock);
	refcount_set(&blkcg->online_pin, 1);
	INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN);
	INIT_HLIST_HEAD(&blkcg->blkg_list);
#ifdef CONFIG_CGROUP_WRITEBACK
	INIT_LIST_HEAD(&blkcg->cgwb_list);
#endif
	list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs);

	mutex_unlock(&blkcg_pol_mutex);
	return &blkcg->css;

free_pd_blkcg:
	for (i--; i >= 0; i--)
		if (blkcg->cpd[i])
			blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);

	if (blkcg != &blkcg_root)
		kfree(blkcg);
unlock:
	mutex_unlock(&blkcg_pol_mutex);
	return ret;
}

static int blkcg_css_online(struct cgroup_subsys_state *css)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	struct blkcg *parent = blkcg_parent(blkcg);

	/*
	 * blkcg_pin_online() is used to delay blkcg offline so that blkgs
	 * don't go offline while cgwbs are still active on them.  Pin the
	 * parent so that offline always happens towards the root.
	 */
	if (parent)
		blkcg_pin_online(parent);
	return 0;
}

/**
 * blkcg_init_queue - initialize blkcg part of request queue
 * @q: request_queue to initialize
 *
 * Called during request_queue allocation.  Responsible for initializing
 * the blkcg part of new request_queue @q.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int blkcg_init_queue(struct request_queue *q)
{
	struct blkcg_gq *new_blkg, *blkg;
	bool preloaded;
	int ret;

	new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
	if (!new_blkg)
		return -ENOMEM;

	preloaded = !radix_tree_preload(GFP_KERNEL);

	/* Make sure the root blkg exists. */
	rcu_read_lock();
	spin_lock_irq(&q->queue_lock);
	blkg = blkg_create(&blkcg_root, q, new_blkg);
	if (IS_ERR(blkg))
		goto err_unlock;
	q->root_blkg = blkg;
	spin_unlock_irq(&q->queue_lock);
	rcu_read_unlock();

	if (preloaded)
		radix_tree_preload_end();

	ret = blk_ioprio_init(q);
	if (ret)
		goto err_destroy_all;

	ret = blk_throtl_init(q);
	if (ret)
		goto err_destroy_all;

	ret = blk_iolatency_init(q);
	if (ret) {
		blk_throtl_exit(q);
		goto err_destroy_all;
	}

	return 0;

err_destroy_all:
	blkg_destroy_all(q);
	return ret;
err_unlock:
	spin_unlock_irq(&q->queue_lock);
	rcu_read_unlock();
	if (preloaded)
		radix_tree_preload_end();
	return PTR_ERR(blkg);
}

/**
 * blkcg_exit_queue - exit and release blkcg part of request_queue
 * @q: request_queue being released
 *
 * Release the blkcg-related resources of @q when the queue is released.
 */
void blkcg_exit_queue(struct request_queue *q)
{
	blkg_destroy_all(q);
	blk_throtl_exit(q);
}

static void blkcg_bind(struct cgroup_subsys_state *root_css)
{
	int i;

	mutex_lock(&blkcg_pol_mutex);

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];
		struct blkcg *blkcg;

		if (!pol || !pol->cpd_bind_fn)
			continue;

		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node)
			if (blkcg->cpd[pol->plid])
				pol->cpd_bind_fn(blkcg->cpd[pol->plid]);
	}
	mutex_unlock(&blkcg_pol_mutex);
}

static void blkcg_exit(struct task_struct *tsk)
{
	if (tsk->throttle_queue)
		blk_put_queue(tsk->throttle_queue);
	tsk->throttle_queue = NULL;
}

struct cgroup_subsys io_cgrp_subsys = {
	.css_alloc = blkcg_css_alloc,
	.css_online = blkcg_css_online,
	.css_offline = blkcg_css_offline,
	.css_free = blkcg_css_free,
	.css_rstat_flush = blkcg_rstat_flush,
	.bind = blkcg_bind,
	.dfl_cftypes = blkcg_files,
	.legacy_cftypes = blkcg_legacy_files,
	.legacy_name = "blkio",
	.exit = blkcg_exit,
#ifdef CONFIG_MEMCG
	/*
	 * This ensures that, if available, memcg is automatically enabled
	 * together on the default hierarchy so that the owner cgroup can
	 * be retrieved from writeback pages.
	 */
	.depends_on = 1 << memory_cgrp_id,
#endif
};
EXPORT_SYMBOL_GPL(io_cgrp_subsys);

/**
 * blkcg_activate_policy - activate a blkcg policy on a request_queue
 * @q: request_queue of interest
 * @pol: blkcg policy to activate
 *
 * Activate @pol on @q.  Requires %GFP_KERNEL context.  Every existing blkg
 * of @q gets policy data for @pol allocated and initialized.
 *
 * The caller is responsible for synchronizing [de]activations and policy
 * [un]registrations.  Returns 0 on success, -errno on failure.
 */
int blkcg_activate_policy(struct request_queue *q,
			  const struct blkcg_policy *pol)
{
	struct blkg_policy_data *pd_prealloc = NULL;
	struct blkcg_gq *blkg, *pinned_blkg = NULL;
	int ret;

	if (blkcg_policy_enabled(q, pol))
		return 0;

	if (queue_is_mq(q))
		blk_mq_freeze_queue(q);
retry:
	spin_lock_irq(&q->queue_lock);

	/* blkg_list is pushed at the head, reverse walk to allocate parents first */
	list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
		struct blkg_policy_data *pd;

		if (blkg->pd[pol->plid])
			continue;

		/* If prealloc matches, use it; otherwise try GFP_NOWAIT */
		if (blkg == pinned_blkg) {
			pd = pd_prealloc;
			pd_prealloc = NULL;
		} else {
			pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q,
					      blkg->blkcg);
		}

		if (!pd) {
			/*
			 * GFP_NOWAIT failed.  Free the existing one and
			 * prealloc for @blkg w/ GFP_KERNEL.
			 */
			if (pinned_blkg)
				blkg_put(pinned_blkg);
			blkg_get(blkg);
			pinned_blkg = blkg;

			spin_unlock_irq(&q->queue_lock);

			if (pd_prealloc)
				pol->pd_free_fn(pd_prealloc);
			pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q,
						       blkg->blkcg);
			if (pd_prealloc)
				goto retry;
			else
				goto enomem;
		}

		blkg->pd[pol->plid] = pd;
		pd->blkg = blkg;
		pd->plid = pol->plid;
	}

	/* all allocated, init in the same order */
	if (pol->pd_init_fn)
		list_for_each_entry_reverse(blkg, &q->blkg_list, q_node)
			pol->pd_init_fn(blkg->pd[pol->plid]);

	__set_bit(pol->plid, q->blkcg_pols);
	ret = 0;

	spin_unlock_irq(&q->queue_lock);
out:
	if (queue_is_mq(q))
		blk_mq_unfreeze_queue(q);
	if (pinned_blkg)
		blkg_put(pinned_blkg);
	if (pd_prealloc)
		pol->pd_free_fn(pd_prealloc);
	return ret;

enomem:
	/* alloc failed, nothing's initialized yet, free everything */
	spin_lock_irq(&q->queue_lock);
	list_for_each_entry(blkg, &q->blkg_list, q_node) {
		struct blkcg *blkcg = blkg->blkcg;

		spin_lock(&blkcg->lock);
		if (blkg->pd[pol->plid]) {
			pol->pd_free_fn(blkg->pd[pol->plid]);
			blkg->pd[pol->plid] = NULL;
		}
		spin_unlock(&blkcg->lock);
	}
	spin_unlock_irq(&q->queue_lock);
	ret = -ENOMEM;
	goto out;
}
EXPORT_SYMBOL_GPL(blkcg_activate_policy);

/**
 * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue
 * @q: request_queue of interest
 * @pol: blkcg policy to deactivate
 *
 * Deactivate @pol on @q.  Follows the same synchronization rules as
 * blkcg_activate_policy().
 */
void blkcg_deactivate_policy(struct request_queue *q,
			     const struct blkcg_policy *pol)
{
	struct blkcg_gq *blkg;

	if (!blkcg_policy_enabled(q, pol))
		return;

	if (queue_is_mq(q))
		blk_mq_freeze_queue(q);

	spin_lock_irq(&q->queue_lock);

	__clear_bit(pol->plid, q->blkcg_pols);

	list_for_each_entry(blkg, &q->blkg_list, q_node) {
		struct blkcg *blkcg = blkg->blkcg;

		spin_lock(&blkcg->lock);
		if (blkg->pd[pol->plid]) {
			if (pol->pd_offline_fn)
				pol->pd_offline_fn(blkg->pd[pol->plid]);
			pol->pd_free_fn(blkg->pd[pol->plid]);
			blkg->pd[pol->plid] = NULL;
		}
		spin_unlock(&blkcg->lock);
	}

	spin_unlock_irq(&q->queue_lock);

	if (queue_is_mq(q))
		blk_mq_unfreeze_queue(q);
}
EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
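
/*
 * Illustrative pairing (assumed, not part of this file): a policy that
 * attaches per-queue state typically activates itself when that state is
 * set up and deactivates on teardown, e.g.:
 *
 *	ret = blkcg_activate_policy(q, &blkcg_policy_foo);
 *	if (ret)
 *		goto err_free_foo_data;
 *	...
 *	blkcg_deactivate_policy(q, &blkcg_policy_foo);
 *
 * where blkcg_policy_foo is the hypothetical policy's struct blkcg_policy.
 */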

/**
 * blkcg_policy_register - register a blkcg policy
 * @pol: blkcg policy to register
 *
 * Register @pol with blkcg core.  Might sleep and @pol may be modified on
 * successful registration.  Returns 0 on success and -errno on failure.
 */
int blkcg_policy_register(struct blkcg_policy *pol)
{
	struct blkcg *blkcg;
	int i, ret;

	mutex_lock(&blkcg_pol_register_mutex);
	mutex_lock(&blkcg_pol_mutex);

	/* find an empty slot */
	ret = -ENOSPC;
	for (i = 0; i < BLKCG_MAX_POLS; i++)
		if (!blkcg_policy[i])
			break;
	if (i >= BLKCG_MAX_POLS) {
		pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n");
		goto err_unlock;
	}

	/* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn come in pairs */
	if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) ||
	    (!pol->pd_alloc_fn ^ !pol->pd_free_fn))
		goto err_unlock;

	/* register @pol */
	pol->plid = i;
	blkcg_policy[pol->plid] = pol;

	/* allocate and install cpd's */
	if (pol->cpd_alloc_fn) {
		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
			struct blkcg_policy_data *cpd;

			cpd = pol->cpd_alloc_fn(GFP_KERNEL);
			if (!cpd)
				goto err_free_cpds;

			blkcg->cpd[pol->plid] = cpd;
			cpd->blkcg = blkcg;
			cpd->plid = pol->plid;
			if (pol->cpd_init_fn)
				pol->cpd_init_fn(cpd);
		}
	}

	mutex_unlock(&blkcg_pol_mutex);

	/* everything is in place, add intf files for the new policy */
	if (pol->dfl_cftypes)
		WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys,
					       pol->dfl_cftypes));
	if (pol->legacy_cftypes)
		WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys,
						  pol->legacy_cftypes));
	mutex_unlock(&blkcg_pol_register_mutex);
	return 0;

err_free_cpds:
	if (pol->cpd_free_fn) {
		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
			if (blkcg->cpd[pol->plid]) {
				pol->cpd_free_fn(blkcg->cpd[pol->plid]);
				blkcg->cpd[pol->plid] = NULL;
			}
		}
	}
	blkcg_policy[pol->plid] = NULL;
err_unlock:
	mutex_unlock(&blkcg_pol_mutex);
	mutex_unlock(&blkcg_pol_register_mutex);
	return ret;
}
EXPORT_SYMBOL_GPL(blkcg_policy_register);
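
/*
 * Illustrative minimal policy (hypothetical "foo" names, a sketch rather
 * than a real in-tree policy): the alloc/free callbacks must come in pairs
 * as checked above; everything else in struct blkcg_policy is optional.
 *
 *	struct foo_pd {
 *		struct blkg_policy_data pd;
 *		u64 limit;
 *	};
 *
 *	static struct blkg_policy_data *foo_pd_alloc(gfp_t gfp,
 *			struct request_queue *q, struct blkcg *blkcg)
 *	{
 *		struct foo_pd *fp = kzalloc_node(sizeof(*fp), gfp, q->node);
 *
 *		return fp ? &fp->pd : NULL;
 *	}
 *
 *	static void foo_pd_free(struct blkg_policy_data *pd)
 *	{
 *		kfree(container_of(pd, struct foo_pd, pd));
 *	}
 *
 *	static struct blkcg_policy blkcg_policy_foo = {
 *		.pd_alloc_fn	= foo_pd_alloc,
 *		.pd_free_fn	= foo_pd_free,
 *	};
 *
 *	static int __init foo_init(void)
 *	{
 *		return blkcg_policy_register(&blkcg_policy_foo);
 *	}
 */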

/**
 * blkcg_policy_unregister - unregister a blkcg policy
 * @pol: blkcg policy to unregister
 *
 * Undo blkcg_policy_register(@pol).  Might sleep.
 */
void blkcg_policy_unregister(struct blkcg_policy *pol)
{
	struct blkcg *blkcg;

	mutex_lock(&blkcg_pol_register_mutex);

	if (WARN_ON(blkcg_policy[pol->plid] != pol))
		goto out_unlock;

	/* kill the intf files first */
	if (pol->dfl_cftypes)
		cgroup_rm_cftypes(pol->dfl_cftypes);
	if (pol->legacy_cftypes)
		cgroup_rm_cftypes(pol->legacy_cftypes);

	/* remove cpds and unregister */
	mutex_lock(&blkcg_pol_mutex);

	if (pol->cpd_free_fn) {
		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
			if (blkcg->cpd[pol->plid]) {
				pol->cpd_free_fn(blkcg->cpd[pol->plid]);
				blkcg->cpd[pol->plid] = NULL;
			}
		}
	}
	blkcg_policy[pol->plid] = NULL;

	mutex_unlock(&blkcg_pol_mutex);
out_unlock:
	mutex_unlock(&blkcg_pol_register_mutex);
}
EXPORT_SYMBOL_GPL(blkcg_policy_unregister);

bool __blkcg_punt_bio_submit(struct bio *bio)
{
	struct blkcg_gq *blkg = bio->bi_blkg;

	/* consume the flag first */
	bio->bi_opf &= ~REQ_CGROUP_PUNT;

	/* never bounce for the root cgroup */
	if (!blkg->parent)
		return false;

	spin_lock_bh(&blkg->async_bio_lock);
	bio_list_add(&blkg->async_bios, bio);
	spin_unlock_bh(&blkg->async_bio_lock);

	queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
	return true;
}

/*
 * Scale the accumulated delay based on how long it has been since we last
 * updated it.  This is called when adding delay and when checking whether a
 * task needs to be throttled, so that stale delay does not keep punishing a
 * group long after the pressure is gone.
 */
static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
{
	u64 old = atomic64_read(&blkg->delay_start);

	/*
	 * We only scale down once per second.  blkg->last_delay records the
	 * delay charged in the current one-second window and blkg->last_use
	 * tracks the use_delay counter, so that once a new window starts the
	 * already-served delay can be subtracted from delay_nsec.
	 */
	if (time_before64(old + NSEC_PER_SEC, now) &&
	    atomic64_cmpxchg(&blkg->delay_start, old, now) == old) {
		u64 cur = atomic64_read(&blkg->delay_nsec);
		u64 sub = min_t(u64, blkg->last_delay, now - old);
		int cur_use = atomic_read(&blkg->use_delay);

		/*
		 * We've been unthrottled, subtract a larger chunk of our
		 * accumulated delay.
		 */
		if (cur_use < blkg->last_use)
			sub = max_t(u64, sub, blkg->last_delay >> 1);

		/*
		 * delay_nsec should only ever grow except for the subtraction
		 * here, but guard against underflow anyway in case something
		 * else got the accounting wrong.
		 */
		if (unlikely(cur < sub)) {
			atomic64_set(&blkg->delay_nsec, 0);
			blkg->last_delay = 0;
		} else {
			atomic64_sub(sub, &blkg->delay_nsec);
			blkg->last_delay = cur - sub;
		}
		blkg->last_use = cur_use;
	}
}

/*
 * Walk up the hierarchy, find the maximum accumulated delay, and sleep it
 * off.  This should only be called on return to user space so we're not
 * holding any lock that could cause a priority inversion while sleeping.
 */
static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
{
	unsigned long pflags;
	u64 now = ktime_to_ns(ktime_get());
	u64 exp;
	u64 delay_nsec = 0;
	int tok;

	while (blkg->parent) {
		if (atomic_read(&blkg->use_delay)) {
			blkcg_scale_delay(blkg, now);
			delay_nsec = max_t(u64, delay_nsec,
					   atomic64_read(&blkg->delay_nsec));
		}
		blkg = blkg->parent;
	}

	if (!delay_nsec)
		return;

	/*
	 * Don't sleep for all eternity if we've amassed a huge delay.
	 * Swapping or metadata IO can accumulate tens of seconds worth of
	 * delay and userspace should still be able to make progress, so cap
	 * the sleep at 0.25s.
	 */
	delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);

	if (use_memdelay)
		psi_memstall_enter(&pflags);

	exp = ktime_add_ns(now, delay_nsec);
	tok = io_schedule_prepare();
	do {
		__set_current_state(TASK_KILLABLE);
		if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS))
			break;
	} while (!fatal_signal_pending(current));
	io_schedule_finish(tok);

	if (use_memdelay)
		psi_memstall_leave(&pflags);
}

/**
 * blkcg_maybe_throttle_current - throttle the current task if it has been marked
 *
 * Called from the notify_resume path.  If current->throttle_queue is not
 * set this does nothing; otherwise the pending delay for the task's blkg on
 * that queue is applied and the throttle state is cleared.
 */
void blkcg_maybe_throttle_current(void)
{
	struct request_queue *q = current->throttle_queue;
	struct cgroup_subsys_state *css;
	struct blkcg *blkcg;
	struct blkcg_gq *blkg;
	bool use_memdelay = current->use_memdelay;

	if (!q)
		return;

	current->throttle_queue = NULL;
	current->use_memdelay = false;

	rcu_read_lock();
	css = kthread_blkcg();
	if (css)
		blkcg = css_to_blkcg(css);
	else
		blkcg = css_to_blkcg(task_css(current, io_cgrp_id));

	if (!blkcg)
		goto out;
	blkg = blkg_lookup(blkcg, q);
	if (!blkg)
		goto out;
	if (!blkg_tryget(blkg))
		goto out;
	rcu_read_unlock();

	blkcg_maybe_throttle_blkg(blkg, use_memdelay);
	blkg_put(blkg);
	blk_put_queue(q);
	return;
out:
	rcu_read_unlock();
	blk_put_queue(q);
}

/**
 * blkcg_schedule_throttle - mark the current task for throttling on return
 * @q: the request_queue the IO was submitted on
 * @use_memdelay: also account the stall to memory pressure (PSI)
 *
 * Called by IO controllers that know delay has accumulated for the blkg
 * this task belongs to on @q.  Only the queue is passed because some
 * callers (the swap path, for instance) don't have the blkg at hand.  This
 * arms notify_resume so the actual sleeping happens in
 * blkcg_maybe_throttle_current() on return to user space; repeated calls
 * before that only update the target queue.
 */
void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
{
	if (unlikely(current->flags & PF_KTHREAD))
		return;

	if (current->throttle_queue != q) {
		if (!blk_get_queue(q))
			return;

		if (current->throttle_queue)
			blk_put_queue(current->throttle_queue);
		current->throttle_queue = q;
	}

	if (use_memdelay)
		current->use_memdelay = use_memdelay;
	set_notify_resume(current);
}

/**
 * blkcg_add_delay - add delay to this blkg
 * @blkg: blkg of interest
 * @now: the current time in nanoseconds
 * @delta: how many nanoseconds of delay to add
 *
 * Charge @delta of delay to @blkg.  It is paid off when the offending task
 * is throttled on return to user space.
 */
void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
{
	blkcg_scale_delay(blkg, now);
	atomic64_add(delta, &blkg->delay_nsec);
}
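
/*
 * Illustrative flow (assumed, not part of this file): an IO controller that
 * finds a group over its latency or cost target, and that has marked the
 * blkg with a non-zero use_delay, typically does
 *
 *	blkcg_add_delay(blkg, now, delta);		(charge the debt)
 *	blkcg_schedule_throttle(q, use_memdelay);	(arm notify_resume)
 *
 * and the delay is then actually served by blkcg_maybe_throttle_current()
 * when the offending task returns to user space.
 */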

/**
 * blkg_tryget_closest - try and get a reference on the closest usable blkg
 * @bio: target bio
 * @css: target css
 *
 * Looks up (creating if necessary) the blkg for @css on @bio's queue and
 * tries to take a reference, walking towards the root if a blkg is being
 * torn down.  Returns the blkg a reference was obtained on, or %NULL.
 */
static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio,
						   struct cgroup_subsys_state *css)
{
	struct blkcg_gq *blkg, *ret_blkg = NULL;

	rcu_read_lock();
	blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_disk->queue);
	while (blkg) {
		if (blkg_tryget(blkg)) {
			ret_blkg = blkg;
			break;
		}
		blkg = blkg->parent;
	}
	rcu_read_unlock();

	return ret_blkg;
}

/**
 * bio_associate_blkg_from_css - associate a bio with a specified css
 * @bio: target bio
 * @css: target css
 *
 * Associate @bio with the blkg found from @css and @bio's request_queue,
 * falling back towards (and ultimately to) the root blkg if the intended
 * blkg cannot be pinned, e.g. because its cgroup is dying.  The reference
 * taken here is released when @bio is freed.
 */
void bio_associate_blkg_from_css(struct bio *bio,
				 struct cgroup_subsys_state *css)
{
	if (bio->bi_blkg)
		blkg_put(bio->bi_blkg);

	if (css && css->parent) {
		bio->bi_blkg = blkg_tryget_closest(bio, css);
	} else {
		blkg_get(bio->bi_disk->queue->root_blkg);
		bio->bi_blkg = bio->bi_disk->queue->root_blkg;
	}
}
EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);

/**
 * bio_clone_blkg_association - clone blkg association from src to dst bio
 * @dst: destination bio
 * @src: source bio
 */
void bio_clone_blkg_association(struct bio *dst, struct bio *src)
{
	if (src->bi_blkg) {
		if (dst->bi_blkg)
			blkg_put(dst->bi_blkg);
		blkg_get(src->bi_blkg);
		dst->bi_blkg = src->bi_blkg;
	}
}
EXPORT_SYMBOL_GPL(bio_clone_blkg_association);

static int blk_cgroup_io_type(struct bio *bio)
{
	if (op_is_discard(bio->bi_opf))
		return BLKG_IOSTAT_DISCARD;
	if (op_is_write(bio->bi_opf))
		return BLKG_IOSTAT_WRITE;
	return BLKG_IOSTAT_READ;
}

void blk_cgroup_bio_start(struct bio *bio)
{
	int rwd = blk_cgroup_io_type(bio), cpu;
	struct blkg_iostat_set *bis;

	cpu = get_cpu();
	bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu);
	u64_stats_update_begin(&bis->sync);

	/*
	 * If the bio is flagged with BIO_CGROUP_ACCT it means this is a
	 * split bio and its size has already been accounted.
	 */
	if (!bio_flagged(bio, BIO_CGROUP_ACCT)) {
		bio_set_flag(bio, BIO_CGROUP_ACCT);
		bis->cur.bytes[rwd] += bio->bi_iter.bi_size;
	}
	bis->cur.ios[rwd]++;

	u64_stats_update_end(&bis->sync);
	if (cgroup_subsys_on_dfl(io_cgrp_subsys))
		cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu);
	put_cpu();
}

static int __init blkcg_init(void)
{
	blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",
					    WQ_MEM_RECLAIM | WQ_FREEZABLE |
					    WQ_UNBOUND | WQ_SYSFS, 0);
	if (!blkcg_punt_bio_wq)
		return -ENOMEM;
	return 0;
}
subsys_initcall(blkcg_init);

module_param(blkcg_debug_stats, bool, 0644);
MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");