linux/block/blk-cgroup.c
/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *                    Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *                    Nauman Rafique <nauman@google.com>
 */
#include <linux/ioprio.h>
#include <linux/seq_file.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include "blk-cgroup.h"
#include <linux/genhd.h>

#define MAX_KEY_LEN 100

static DEFINE_SPINLOCK(blkio_list_lock);
static LIST_HEAD(blkio_list);

struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
EXPORT_SYMBOL_GPL(blkio_root_cgroup);

static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
                                                  struct cgroup *);
static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
                              struct task_struct *, bool);
static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
                           struct cgroup *, struct task_struct *, bool);
static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);

/* for encoding cft->private value on file */
#define BLKIOFILE_PRIVATE(x, val)       (((x) << 16) | (val))
/* What policy owns the file, proportional or throttle */
#define BLKIOFILE_POLICY(val)           (((val) >> 16) & 0xffff)
#define BLKIOFILE_ATTR(val)             ((val) & 0xffff)

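/*
 * Example: the "throttle.read_bps_device" file below is created with
 * cft->private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
 * BLKIO_THROTL_read_bps_device), so its handlers recover the owning
 * policy from the high 16 bits and the file id from the low 16 bits.
 */
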
struct cgroup_subsys blkio_subsys = {
        .name = "blkio",
        .create = blkiocg_create,
        .can_attach = blkiocg_can_attach,
        .attach = blkiocg_attach,
        .destroy = blkiocg_destroy,
        .populate = blkiocg_populate,
#ifdef CONFIG_BLK_CGROUP
        /* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */
        .subsys_id = blkio_subsys_id,
#endif
        .use_id = 1,
        .module = THIS_MODULE,
};
EXPORT_SYMBOL_GPL(blkio_subsys);

static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
                                            struct blkio_policy_node *pn)
{
        list_add(&pn->node, &blkcg->policy_list);
}

static inline bool cftype_blkg_same_policy(struct cftype *cft,
                        struct blkio_group *blkg)
{
        enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);

        if (blkg->plid == plid)
                return 1;

        return 0;
}

/* Determines if policy node matches cgroup file being accessed */
static inline bool pn_matches_cftype(struct cftype *cft,
                        struct blkio_policy_node *pn)
{
        enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
        int fileid = BLKIOFILE_ATTR(cft->private);

        return (plid == pn->plid && fileid == pn->fileid);
}

/* Must be called with blkcg->lock held */
static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
{
        list_del(&pn->node);
}

/* Must be called with blkcg->lock held */
static struct blkio_policy_node *
blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev,
                enum blkio_policy_id plid, int fileid)
{
        struct blkio_policy_node *pn;

        list_for_each_entry(pn, &blkcg->policy_list, node) {
                if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid)
                        return pn;
        }

        return NULL;
}

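/* Map a cgroup to its blkio_cgroup via the blkio subsystem state */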
struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
{
        return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
                            struct blkio_cgroup, css);
}
EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);

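/*
 * The helpers below notify the policy that owns @blkg (proportional
 * weight or throttling) that one of its per-device settings changed.
 * Only the owning policy is called, and only if it registered the
 * corresponding callback.
 */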
static inline void
blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight)
{
        struct blkio_policy_type *blkiop;

        list_for_each_entry(blkiop, &blkio_list, list) {
                /* If this policy does not own the blkg, do not send updates */
                if (blkiop->plid != blkg->plid)
                        continue;
                if (blkiop->ops.blkio_update_group_weight_fn)
                        blkiop->ops.blkio_update_group_weight_fn(blkg->key,
                                                        blkg, weight);
        }
}

static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps,
                                int fileid)
{
        struct blkio_policy_type *blkiop;

        list_for_each_entry(blkiop, &blkio_list, list) {

                /* If this policy does not own the blkg, do not send updates */
                if (blkiop->plid != blkg->plid)
                        continue;

                if (fileid == BLKIO_THROTL_read_bps_device
                    && blkiop->ops.blkio_update_group_read_bps_fn)
                        blkiop->ops.blkio_update_group_read_bps_fn(blkg->key,
                                                                blkg, bps);

                if (fileid == BLKIO_THROTL_write_bps_device
                    && blkiop->ops.blkio_update_group_write_bps_fn)
                        blkiop->ops.blkio_update_group_write_bps_fn(blkg->key,
                                                                blkg, bps);
        }
}

static inline void blkio_update_group_iops(struct blkio_group *blkg,
                        unsigned int iops, int fileid)
{
        struct blkio_policy_type *blkiop;

        list_for_each_entry(blkiop, &blkio_list, list) {

                /* If this policy does not own the blkg, do not send updates */
                if (blkiop->plid != blkg->plid)
                        continue;

                if (fileid == BLKIO_THROTL_read_iops_device
                    && blkiop->ops.blkio_update_group_read_iops_fn)
                        blkiop->ops.blkio_update_group_read_iops_fn(blkg->key,
                                                                blkg, iops);

                if (fileid == BLKIO_THROTL_write_iops_device
                    && blkiop->ops.blkio_update_group_write_iops_fn)
                        blkiop->ops.blkio_update_group_write_iops_fn(blkg->key,
                                                                blkg, iops);
        }
}

/*
 * Add to the appropriate stat variable depending on the request type.
 * This should be called with the blkg->stats_lock held.
 */
static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
                                bool sync)
{
        if (direction)
                stat[BLKIO_STAT_WRITE] += add;
        else
                stat[BLKIO_STAT_READ] += add;
        if (sync)
                stat[BLKIO_STAT_SYNC] += add;
        else
                stat[BLKIO_STAT_ASYNC] += add;
}

/*
 * Decrements the appropriate stat variable if non-zero depending on the
 * request type. Panics on the value being zero.
 * This should be called with the blkg->stats_lock held.
 */
static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
{
        if (direction) {
                BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
                stat[BLKIO_STAT_WRITE]--;
        } else {
                BUG_ON(stat[BLKIO_STAT_READ] == 0);
                stat[BLKIO_STAT_READ]--;
        }
        if (sync) {
                BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
                stat[BLKIO_STAT_SYNC]--;
        } else {
                BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
                stat[BLKIO_STAT_ASYNC]--;
        }
}

#ifdef CONFIG_DEBUG_BLK_CGROUP
/* This should be called with the blkg->stats_lock held. */
static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
                                                struct blkio_group *curr_blkg)
{
        if (blkio_blkg_waiting(&blkg->stats))
                return;
        if (blkg == curr_blkg)
                return;
        blkg->stats.start_group_wait_time = sched_clock();
        blkio_mark_blkg_waiting(&blkg->stats);
}

/* This should be called with the blkg->stats_lock held. */
static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
{
        unsigned long long now;

        if (!blkio_blkg_waiting(stats))
                return;

        now = sched_clock();
        if (time_after64(now, stats->start_group_wait_time))
                stats->group_wait_time += now - stats->start_group_wait_time;
        blkio_clear_blkg_waiting(stats);
}

/* This should be called with the blkg->stats_lock held. */
static void blkio_end_empty_time(struct blkio_group_stats *stats)
{
        unsigned long long now;

        if (!blkio_blkg_empty(stats))
                return;

        now = sched_clock();
        if (time_after64(now, stats->start_empty_time))
                stats->empty_time += now - stats->start_empty_time;
        blkio_clear_blkg_empty(stats);
}

void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
{
        unsigned long flags;

        spin_lock_irqsave(&blkg->stats_lock, flags);
        BUG_ON(blkio_blkg_idling(&blkg->stats));
        blkg->stats.start_idle_time = sched_clock();
        blkio_mark_blkg_idling(&blkg->stats);
        spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);

void blkiocg_update_idle_time_stats(struct blkio_group *blkg)
{
        unsigned long flags;
        unsigned long long now;
        struct blkio_group_stats *stats;

        spin_lock_irqsave(&blkg->stats_lock, flags);
        stats = &blkg->stats;
        if (blkio_blkg_idling(stats)) {
                now = sched_clock();
                if (time_after64(now, stats->start_idle_time))
                        stats->idle_time += now - stats->start_idle_time;
                blkio_clear_blkg_idling(stats);
        }
        spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);

void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
{
        unsigned long flags;
        struct blkio_group_stats *stats;

        spin_lock_irqsave(&blkg->stats_lock, flags);
        stats = &blkg->stats;
        stats->avg_queue_size_sum +=
                        stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
                        stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
        stats->avg_queue_size_samples++;
        blkio_update_group_wait_time(stats);
        spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);

void blkiocg_set_start_empty_time(struct blkio_group *blkg)
{
        unsigned long flags;
        struct blkio_group_stats *stats;

        spin_lock_irqsave(&blkg->stats_lock, flags);
        stats = &blkg->stats;

        if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
                        stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) {
                spin_unlock_irqrestore(&blkg->stats_lock, flags);
                return;
        }

        /*
         * The group is already marked empty. This can happen if a cfqq got a
         * new request in the parent group and moved to this group while being
         * added to the service tree. Just ignore the event and move on.
         */
        if (blkio_blkg_empty(stats)) {
                spin_unlock_irqrestore(&blkg->stats_lock, flags);
                return;
        }

        stats->start_empty_time = sched_clock();
        blkio_mark_blkg_empty(stats);
        spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);

void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
                        unsigned long dequeue)
{
        blkg->stats.dequeue += dequeue;
}
EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
#else
static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
                                        struct blkio_group *curr_blkg) {}
static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {}
#endif

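/*
 * Account one request queued to @blkg and bump the QUEUED stat. In debug
 * builds, group-wait-time accounting also starts for @blkg unless it is
 * @curr_blkg itself (@curr_blkg is presumably the group currently being
 * serviced).
 */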
void blkiocg_update_io_add_stats(struct blkio_group *blkg,
                        struct blkio_group *curr_blkg, bool direction,
                        bool sync)
{
        unsigned long flags;

        spin_lock_irqsave(&blkg->stats_lock, flags);
        blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
                        sync);
        blkio_end_empty_time(&blkg->stats);
        blkio_set_start_group_wait_time(blkg, curr_blkg);
        spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);

void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
                                                bool direction, bool sync)
{
        unsigned long flags;

        spin_lock_irqsave(&blkg->stats_lock, flags);
        blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED],
                                        direction, sync);
        spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);

void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time)
{
        unsigned long flags;

        spin_lock_irqsave(&blkg->stats_lock, flags);
        blkg->stats.time += time;
        spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);

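/*
 * Account a request dispatched to the device: bump the per-direction
 * SERVICED and SERVICE_BYTES counters and the sector count (bytes >> 9).
 */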
void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
                                uint64_t bytes, bool direction, bool sync)
{
        struct blkio_group_stats *stats;
        unsigned long flags;

        spin_lock_irqsave(&blkg->stats_lock, flags);
        stats = &blkg->stats;
        stats->sectors += bytes >> 9;
        blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction,
                        sync);
        blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes,
                        direction, sync);
        spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);

void blkiocg_update_completion_stats(struct blkio_group *blkg,
        uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
{
        struct blkio_group_stats *stats;
        unsigned long flags;
        unsigned long long now = sched_clock();

        spin_lock_irqsave(&blkg->stats_lock, flags);
        stats = &blkg->stats;
        if (time_after64(now, io_start_time))
                blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
                                now - io_start_time, direction, sync);
        if (time_after64(io_start_time, start_time))
                blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
                                io_start_time - start_time, direction, sync);
        spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);

void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
                                        bool sync)
{
        unsigned long flags;

        spin_lock_irqsave(&blkg->stats_lock, flags);
        blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction,
                        sync);
        spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);

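/*
 * Initialize @blkg and link it into @blkcg's group list under blkcg->lock.
 * @key is an opaque per-device token chosen by the policy;
 * blkiocg_lookup_group() later matches groups by comparing this pointer.
 */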
void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
                struct blkio_group *blkg, void *key, dev_t dev,
                enum blkio_policy_id plid)
{
        unsigned long flags;

        spin_lock_irqsave(&blkcg->lock, flags);
        spin_lock_init(&blkg->stats_lock);
        rcu_assign_pointer(blkg->key, key);
        blkg->blkcg_id = css_id(&blkcg->css);
        hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
        blkg->plid = plid;
        spin_unlock_irqrestore(&blkcg->lock, flags);
        /* Need to take css reference ? */
        cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
        blkg->dev = dev;
}
EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);

static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
{
        hlist_del_init_rcu(&blkg->blkcg_node);
        blkg->blkcg_id = 0;
}

/*
 * Returns 0 if the blkio_group was still on the cgroup list. Otherwise
 * returns 1, indicating that the blkio_group was unhashed by the time we
 * got to it.
 */
int blkiocg_del_blkio_group(struct blkio_group *blkg)
{
        struct blkio_cgroup *blkcg;
        unsigned long flags;
        struct cgroup_subsys_state *css;
        int ret = 1;

        rcu_read_lock();
        css = css_lookup(&blkio_subsys, blkg->blkcg_id);
        if (css) {
                blkcg = container_of(css, struct blkio_cgroup, css);
                spin_lock_irqsave(&blkcg->lock, flags);
                if (!hlist_unhashed(&blkg->blkcg_node)) {
                        __blkiocg_del_blkio_group(blkg);
                        ret = 0;
                }
                spin_unlock_irqrestore(&blkcg->lock, flags);
        }

        rcu_read_unlock();
        return ret;
}
EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);

/* called under rcu_read_lock(). */
struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
{
        struct blkio_group *blkg;
        struct hlist_node *n;
        void *__key;

        hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
                __key = blkg->key;
                if (__key == key)
                        return blkg;
        }

        return NULL;
}
EXPORT_SYMBOL_GPL(blkiocg_lookup_group);

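/*
 * Handler for the "reset_stats" file: zero every group's statistics while
 * preserving in-flight QUEUED counts and, in debug builds, restarting the
 * idling/waiting/empty clocks that were running at reset time.
 */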
static int
blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
{
        struct blkio_cgroup *blkcg;
        struct blkio_group *blkg;
        struct blkio_group_stats *stats;
        struct hlist_node *n;
        uint64_t queued[BLKIO_STAT_TOTAL];
        int i;
#ifdef CONFIG_DEBUG_BLK_CGROUP
        bool idling, waiting, empty;
        unsigned long long now = sched_clock();
#endif

        blkcg = cgroup_to_blkio_cgroup(cgroup);
        spin_lock_irq(&blkcg->lock);
        hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
                spin_lock(&blkg->stats_lock);
                stats = &blkg->stats;
#ifdef CONFIG_DEBUG_BLK_CGROUP
                idling = blkio_blkg_idling(stats);
                waiting = blkio_blkg_waiting(stats);
                empty = blkio_blkg_empty(stats);
#endif
                for (i = 0; i < BLKIO_STAT_TOTAL; i++)
                        queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
                memset(stats, 0, sizeof(struct blkio_group_stats));
                for (i = 0; i < BLKIO_STAT_TOTAL; i++)
                        stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
#ifdef CONFIG_DEBUG_BLK_CGROUP
                if (idling) {
                        blkio_mark_blkg_idling(stats);
                        stats->start_idle_time = now;
                }
                if (waiting) {
                        blkio_mark_blkg_waiting(stats);
                        stats->start_group_wait_time = now;
                }
                if (empty) {
                        blkio_mark_blkg_empty(stats);
                        stats->start_empty_time = now;
                }
#endif
                spin_unlock(&blkg->stats_lock);
        }
        spin_unlock_irq(&blkcg->lock);
        return 0;
}

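/*
 * Build the map key shown to userspace: "major:minor", optionally followed
 * by the stat sub-type, e.g. "8:16 Read". With diskname_only set, only the
 * device number is emitted.
 */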
static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str,
                                int chars_left, bool diskname_only)
{
        snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev));
        chars_left -= strlen(str);
        if (chars_left <= 0) {
                printk(KERN_WARNING
                        "Possibly incorrect cgroup stat display format\n");
                return;
        }
        if (diskname_only)
                return;
        switch (type) {
        case BLKIO_STAT_READ:
                strlcat(str, " Read", chars_left);
                break;
        case BLKIO_STAT_WRITE:
                strlcat(str, " Write", chars_left);
                break;
        case BLKIO_STAT_SYNC:
                strlcat(str, " Sync", chars_left);
                break;
        case BLKIO_STAT_ASYNC:
                strlcat(str, " Async", chars_left);
                break;
        case BLKIO_STAT_TOTAL:
                strlcat(str, " Total", chars_left);
                break;
        default:
                strlcat(str, " Invalid", chars_left);
        }
}

static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
                                struct cgroup_map_cb *cb, dev_t dev)
{
        blkio_get_key_name(0, dev, str, chars_left, true);
        cb->fill(cb, str, val);
        return val;
}

/* This should be called with blkg->stats_lock held */
static uint64_t blkio_get_stat(struct blkio_group *blkg,
                struct cgroup_map_cb *cb, dev_t dev, enum stat_type type)
{
        uint64_t disk_total;
        char key_str[MAX_KEY_LEN];
        enum stat_sub_type sub_type;

        if (type == BLKIO_STAT_TIME)
                return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
                                        blkg->stats.time, cb, dev);
        if (type == BLKIO_STAT_SECTORS)
                return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
                                        blkg->stats.sectors, cb, dev);
#ifdef CONFIG_DEBUG_BLK_CGROUP
        if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
                uint64_t sum = blkg->stats.avg_queue_size_sum;
                uint64_t samples = blkg->stats.avg_queue_size_samples;
                if (samples)
                        do_div(sum, samples);
                else
                        sum = 0;
                return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev);
        }
        if (type == BLKIO_STAT_GROUP_WAIT_TIME)
                return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
                                        blkg->stats.group_wait_time, cb, dev);
        if (type == BLKIO_STAT_IDLE_TIME)
                return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
                                        blkg->stats.idle_time, cb, dev);
        if (type == BLKIO_STAT_EMPTY_TIME)
                return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
                                        blkg->stats.empty_time, cb, dev);
        if (type == BLKIO_STAT_DEQUEUE)
                return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
                                        blkg->stats.dequeue, cb, dev);
#endif

        for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
                        sub_type++) {
                blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
                cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]);
        }
        disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] +
                        blkg->stats.stat_arr[type][BLKIO_STAT_WRITE];
        blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
        cb->fill(cb, key_str, disk_total);
        return disk_total;
}

static int blkio_check_dev_num(dev_t dev)
{
        int part = 0;
        struct gendisk *disk;

        disk = get_gendisk(dev, &part);
        if (!disk || part)
                return -ENODEV;

        return 0;
}

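/*
 * Parse one rule written to a policy file. The expected format is
 * "major:minor value", e.g. "8:16 500" sets a weight of 500 on device
 * 8:16 for the proportional policy; throttle files take a bps or iops
 * value instead. A value of 0 is accepted and later interpreted as a
 * request to delete the rule.
 */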
static int blkio_policy_parse_and_set(char *buf,
        struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid)
{
        char *s[4], *p, *major_s = NULL, *minor_s = NULL;
        int ret;
        unsigned long major, minor, temp;
        int i = 0;
        dev_t dev;
        u64 bps, iops;

        memset(s, 0, sizeof(s));

        while ((p = strsep(&buf, " ")) != NULL) {
                if (!*p)
                        continue;

                s[i++] = p;

                /* Guard against input with too many fields */
                if (i == 3)
                        break;
        }

        if (i != 2)
                return -EINVAL;

        p = strsep(&s[0], ":");
        if (p != NULL)
                major_s = p;
        else
                return -EINVAL;

        minor_s = s[0];
        if (!minor_s)
                return -EINVAL;

        ret = strict_strtoul(major_s, 10, &major);
        if (ret)
                return -EINVAL;

        ret = strict_strtoul(minor_s, 10, &minor);
        if (ret)
                return -EINVAL;

        dev = MKDEV(major, minor);

        ret = blkio_check_dev_num(dev);
        if (ret)
                return ret;

        newpn->dev = dev;

        if (s[1] == NULL)
                return -EINVAL;

        switch (plid) {
        case BLKIO_POLICY_PROP:
                ret = strict_strtoul(s[1], 10, &temp);
                if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) ||
                        temp > BLKIO_WEIGHT_MAX)
                        return -EINVAL;

                newpn->plid = plid;
                newpn->fileid = fileid;
                newpn->val.weight = temp;
                break;
        case BLKIO_POLICY_THROTL:
                switch (fileid) {
                case BLKIO_THROTL_read_bps_device:
                case BLKIO_THROTL_write_bps_device:
                        ret = strict_strtoull(s[1], 10, &bps);
                        if (ret)
                                return -EINVAL;

                        newpn->plid = plid;
                        newpn->fileid = fileid;
                        newpn->val.bps = bps;
                        break;
                case BLKIO_THROTL_read_iops_device:
                case BLKIO_THROTL_write_iops_device:
                        ret = strict_strtoull(s[1], 10, &iops);
                        if (ret)
                                return -EINVAL;

                        if (iops > THROTL_IOPS_MAX)
                                return -EINVAL;

                        newpn->plid = plid;
                        newpn->fileid = fileid;
                        newpn->val.iops = (unsigned int)iops;
                        break;
                }
                break;
        default:
                BUG();
        }

        return 0;
}

unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
                              dev_t dev)
{
        struct blkio_policy_node *pn;

        pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP,
                                BLKIO_PROP_weight_device);
        if (pn)
                return pn->val.weight;
        else
                return blkcg->weight;
}
EXPORT_SYMBOL_GPL(blkcg_get_weight);

uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
{
        struct blkio_policy_node *pn;

        pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
                                BLKIO_THROTL_read_bps_device);
        if (pn)
                return pn->val.bps;
        else
                return -1;
}

uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev)
{
        struct blkio_policy_node *pn;

        pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
                                BLKIO_THROTL_write_bps_device);
        if (pn)
                return pn->val.bps;
        else
                return -1;
}

unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev)
{
        struct blkio_policy_node *pn;

        pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
                                BLKIO_THROTL_read_iops_device);
        if (pn)
                return pn->val.iops;
        else
                return -1;
}

unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev)
{
        struct blkio_policy_node *pn;

        pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
                                BLKIO_THROTL_write_iops_device);
        if (pn)
                return pn->val.iops;
        else
                return -1;
}

/* Checks whether the user asked for deletion of a policy rule */
static bool blkio_delete_rule_command(struct blkio_policy_node *pn)
{
        switch (pn->plid) {
        case BLKIO_POLICY_PROP:
                if (pn->val.weight == 0)
                        return 1;
                break;
        case BLKIO_POLICY_THROTL:
                switch (pn->fileid) {
                case BLKIO_THROTL_read_bps_device:
                case BLKIO_THROTL_write_bps_device:
                        if (pn->val.bps == 0)
                                return 1;
                        break;
                case BLKIO_THROTL_read_iops_device:
                case BLKIO_THROTL_write_iops_device:
                        if (pn->val.iops == 0)
                                return 1;
                }
                break;
        default:
                BUG();
        }

        return 0;
}

static void blkio_update_policy_rule(struct blkio_policy_node *oldpn,
                                        struct blkio_policy_node *newpn)
{
        switch (oldpn->plid) {
        case BLKIO_POLICY_PROP:
                oldpn->val.weight = newpn->val.weight;
                break;
        case BLKIO_POLICY_THROTL:
                switch (newpn->fileid) {
                case BLKIO_THROTL_read_bps_device:
                case BLKIO_THROTL_write_bps_device:
                        oldpn->val.bps = newpn->val.bps;
                        break;
                case BLKIO_THROTL_read_iops_device:
                case BLKIO_THROTL_write_iops_device:
                        oldpn->val.iops = newpn->val.iops;
                }
                break;
        default:
                BUG();
        }
}

/*
 * Some rules/values in blkg have changed. Propagate those to the
 * respective policies.
 */
static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg,
                struct blkio_group *blkg, struct blkio_policy_node *pn)
{
        unsigned int weight, iops;
        u64 bps;

        switch (pn->plid) {
        case BLKIO_POLICY_PROP:
                weight = pn->val.weight ? pn->val.weight :
                                blkcg->weight;
                blkio_update_group_weight(blkg, weight);
                break;
        case BLKIO_POLICY_THROTL:
                switch (pn->fileid) {
                case BLKIO_THROTL_read_bps_device:
                case BLKIO_THROTL_write_bps_device:
                        bps = pn->val.bps ? pn->val.bps : (-1);
                        blkio_update_group_bps(blkg, bps, pn->fileid);
                        break;
                case BLKIO_THROTL_read_iops_device:
                case BLKIO_THROTL_write_iops_device:
                        iops = pn->val.iops ? pn->val.iops : (-1);
                        blkio_update_group_iops(blkg, iops, pn->fileid);
                        break;
                }
                break;
        default:
                BUG();
        }
}

/*
 * A policy node rule has been updated. Propagate this update to all the
 * block groups which might be affected by this update.
 */
static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg,
                                struct blkio_policy_node *pn)
{
        struct blkio_group *blkg;
        struct hlist_node *n;

        spin_lock(&blkio_list_lock);
        spin_lock_irq(&blkcg->lock);

        hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
                if (pn->dev != blkg->dev || pn->plid != blkg->plid)
                        continue;
                blkio_update_blkg_policy(blkcg, blkg, pn);
        }

        spin_unlock_irq(&blkcg->lock);
        spin_unlock(&blkio_list_lock);
}

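/*
 * Common write handler for all per-device rule files. The written rule is
 * parsed into a policy node which is then inserted, updated or deleted in
 * blkcg->policy_list, and the change is pushed to matching blkio_groups.
 */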
static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
                                       const char *buffer)
{
        int ret = 0;
        char *buf;
        struct blkio_policy_node *newpn, *pn;
        struct blkio_cgroup *blkcg;
        int keep_newpn = 0;
        enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
        int fileid = BLKIOFILE_ATTR(cft->private);

        buf = kstrdup(buffer, GFP_KERNEL);
        if (!buf)
                return -ENOMEM;

        newpn = kzalloc(sizeof(*newpn), GFP_KERNEL);
        if (!newpn) {
                ret = -ENOMEM;
                goto free_buf;
        }

        ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid);
        if (ret)
                goto free_newpn;

        blkcg = cgroup_to_blkio_cgroup(cgrp);

        spin_lock_irq(&blkcg->lock);

        pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid);
        if (!pn) {
                if (!blkio_delete_rule_command(newpn)) {
                        blkio_policy_insert_node(blkcg, newpn);
                        keep_newpn = 1;
                }
                spin_unlock_irq(&blkcg->lock);
                goto update_io_group;
        }

        if (blkio_delete_rule_command(newpn)) {
                blkio_policy_delete_node(pn);
                spin_unlock_irq(&blkcg->lock);
                goto update_io_group;
        }
        spin_unlock_irq(&blkcg->lock);

        blkio_update_policy_rule(pn, newpn);

update_io_group:
        blkio_update_policy_node_blkg(blkcg, newpn);

free_newpn:
        if (!keep_newpn)
                kfree(newpn);
free_buf:
        kfree(buf);
        return ret;
}

static void
blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn)
{
        switch (pn->plid) {
        case BLKIO_POLICY_PROP:
                if (pn->fileid == BLKIO_PROP_weight_device)
                        seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
                                MINOR(pn->dev), pn->val.weight);
                break;
        case BLKIO_POLICY_THROTL:
                switch (pn->fileid) {
                case BLKIO_THROTL_read_bps_device:
                case BLKIO_THROTL_write_bps_device:
                        seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev),
                                MINOR(pn->dev), pn->val.bps);
                        break;
                case BLKIO_THROTL_read_iops_device:
                case BLKIO_THROTL_write_iops_device:
                        seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
                                MINOR(pn->dev), pn->val.iops);
                        break;
                }
                break;
        default:
                BUG();
        }
}

/* cgroup files which read their data from policy nodes end up here */
static void blkio_read_policy_node_files(struct cftype *cft,
                        struct blkio_cgroup *blkcg, struct seq_file *m)
{
        struct blkio_policy_node *pn;

        if (!list_empty(&blkcg->policy_list)) {
                spin_lock_irq(&blkcg->lock);
                list_for_each_entry(pn, &blkcg->policy_list, node) {
                        if (!pn_matches_cftype(cft, pn))
                                continue;
                        blkio_print_policy_node(m, pn);
                }
                spin_unlock_irq(&blkcg->lock);
        }
}

static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
                                struct seq_file *m)
{
        struct blkio_cgroup *blkcg;
        enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
        int name = BLKIOFILE_ATTR(cft->private);

        blkcg = cgroup_to_blkio_cgroup(cgrp);

        switch (plid) {
        case BLKIO_POLICY_PROP:
                switch (name) {
                case BLKIO_PROP_weight_device:
                        blkio_read_policy_node_files(cft, blkcg, m);
                        return 0;
                default:
                        BUG();
                }
                break;
        case BLKIO_POLICY_THROTL:
                switch (name) {
                case BLKIO_THROTL_read_bps_device:
                case BLKIO_THROTL_write_bps_device:
                case BLKIO_THROTL_read_iops_device:
                case BLKIO_THROTL_write_iops_device:
                        blkio_read_policy_node_files(cft, blkcg, m);
                        return 0;
                default:
                        BUG();
                }
                break;
        default:
                BUG();
        }

        return 0;
}

static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
                struct cftype *cft, struct cgroup_map_cb *cb, enum stat_type type,
                bool show_total)
{
        struct blkio_group *blkg;
        struct hlist_node *n;
        uint64_t cgroup_total = 0;

        rcu_read_lock();
        hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
                if (blkg->dev) {
                        if (!cftype_blkg_same_policy(cft, blkg))
                                continue;
                        spin_lock_irq(&blkg->stats_lock);
                        cgroup_total += blkio_get_stat(blkg, cb, blkg->dev,
                                                type);
                        spin_unlock_irq(&blkg->stats_lock);
                }
        }
        if (show_total)
                cb->fill(cb, "Total", cgroup_total);
        rcu_read_unlock();
        return 0;
}

/* All map-type cgroup files are serviced by this function */
static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
                                struct cgroup_map_cb *cb)
{
        struct blkio_cgroup *blkcg;
        enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
        int name = BLKIOFILE_ATTR(cft->private);

        blkcg = cgroup_to_blkio_cgroup(cgrp);

        switch (plid) {
        case BLKIO_POLICY_PROP:
                switch (name) {
                case BLKIO_PROP_time:
                        return blkio_read_blkg_stats(blkcg, cft, cb,
                                                BLKIO_STAT_TIME, 0);
                case BLKIO_PROP_sectors:
                        return blkio_read_blkg_stats(blkcg, cft, cb,
                                                BLKIO_STAT_SECTORS, 0);
                case BLKIO_PROP_io_service_bytes:
                        return blkio_read_blkg_stats(blkcg, cft, cb,
                                                BLKIO_STAT_SERVICE_BYTES, 1);
                case BLKIO_PROP_io_serviced:
                        return blkio_read_blkg_stats(blkcg, cft, cb,
                                                BLKIO_STAT_SERVICED, 1);
                case BLKIO_PROP_io_service_time:
                        return blkio_read_blkg_stats(blkcg, cft, cb,
                                                BLKIO_STAT_SERVICE_TIME, 1);
                case BLKIO_PROP_io_wait_time:
                        return blkio_read_blkg_stats(blkcg, cft, cb,
                                                BLKIO_STAT_WAIT_TIME, 1);
                case BLKIO_PROP_io_merged:
                        return blkio_read_blkg_stats(blkcg, cft, cb,
                                                BLKIO_STAT_MERGED, 1);
                case BLKIO_PROP_io_queued:
                        return blkio_read_blkg_stats(blkcg, cft, cb,
                                                BLKIO_STAT_QUEUED, 1);
#ifdef CONFIG_DEBUG_BLK_CGROUP
                case BLKIO_PROP_dequeue:
                        return blkio_read_blkg_stats(blkcg, cft, cb,
                                                BLKIO_STAT_DEQUEUE, 0);
                case BLKIO_PROP_avg_queue_size:
                        return blkio_read_blkg_stats(blkcg, cft, cb,
                                                BLKIO_STAT_AVG_QUEUE_SIZE, 0);
                case BLKIO_PROP_group_wait_time:
                        return blkio_read_blkg_stats(blkcg, cft, cb,
                                                BLKIO_STAT_GROUP_WAIT_TIME, 0);
                case BLKIO_PROP_idle_time:
                        return blkio_read_blkg_stats(blkcg, cft, cb,
                                                BLKIO_STAT_IDLE_TIME, 0);
                case BLKIO_PROP_empty_time:
                        return blkio_read_blkg_stats(blkcg, cft, cb,
                                                BLKIO_STAT_EMPTY_TIME, 0);
#endif
                default:
                        BUG();
                }
                break;
        case BLKIO_POLICY_THROTL:
                switch (name) {
                case BLKIO_THROTL_io_service_bytes:
                        return blkio_read_blkg_stats(blkcg, cft, cb,
                                                BLKIO_STAT_SERVICE_BYTES, 1);
                case BLKIO_THROTL_io_serviced:
                        return blkio_read_blkg_stats(blkcg, cft, cb,
                                                BLKIO_STAT_SERVICED, 1);
                default:
                        BUG();
                }
                break;
        default:
                BUG();
        }

        return 0;
}

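/*
 * Update the cgroup-wide default weight and propagate it to every group
 * that has no per-device weight rule overriding it.
 */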
static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val)
{
        struct blkio_group *blkg;
        struct hlist_node *n;
        struct blkio_policy_node *pn;

        if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
                return -EINVAL;

        spin_lock(&blkio_list_lock);
        spin_lock_irq(&blkcg->lock);
        blkcg->weight = (unsigned int)val;

        hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
                pn = blkio_policy_search_node(blkcg, blkg->dev,
                                BLKIO_POLICY_PROP, BLKIO_PROP_weight_device);
                if (pn)
                        continue;

                blkio_update_group_weight(blkg, blkcg->weight);
        }
        spin_unlock_irq(&blkcg->lock);
        spin_unlock(&blkio_list_lock);
        return 0;
}

static u64 blkiocg_file_read_u64(struct cgroup *cgrp, struct cftype *cft)
{
        struct blkio_cgroup *blkcg;
        enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
        int name = BLKIOFILE_ATTR(cft->private);

        blkcg = cgroup_to_blkio_cgroup(cgrp);

        switch (plid) {
        case BLKIO_POLICY_PROP:
                switch (name) {
                case BLKIO_PROP_weight:
                        return (u64)blkcg->weight;
                }
                break;
        default:
                BUG();
        }
        return 0;
}

static int
blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
{
        struct blkio_cgroup *blkcg;
        enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
        int name = BLKIOFILE_ATTR(cft->private);

        blkcg = cgroup_to_blkio_cgroup(cgrp);

        switch (plid) {
        case BLKIO_POLICY_PROP:
                switch (name) {
                case BLKIO_PROP_weight:
                        return blkio_weight_write(blkcg, val);
                }
                break;
        default:
                BUG();
        }

        return 0;
}

struct cftype blkio_files[] = {
        {
                .name = "weight_device",
                .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
                                BLKIO_PROP_weight_device),
                .read_seq_string = blkiocg_file_read,
                .write_string = blkiocg_file_write,
                .max_write_len = 256,
        },
        {
                .name = "weight",
                .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
                                BLKIO_PROP_weight),
                .read_u64 = blkiocg_file_read_u64,
                .write_u64 = blkiocg_file_write_u64,
        },
        {
                .name = "time",
                .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
                                BLKIO_PROP_time),
                .read_map = blkiocg_file_read_map,
        },
        {
                .name = "sectors",
                .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
                                BLKIO_PROP_sectors),
                .read_map = blkiocg_file_read_map,
        },
        {
                .name = "io_service_bytes",
                .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
                                BLKIO_PROP_io_service_bytes),
                .read_map = blkiocg_file_read_map,
        },
        {
                .name = "io_serviced",
                .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
                                BLKIO_PROP_io_serviced),
                .read_map = blkiocg_file_read_map,
        },
        {
                .name = "io_service_time",
                .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
                                BLKIO_PROP_io_service_time),
                .read_map = blkiocg_file_read_map,
        },
        {
                .name = "io_wait_time",
                .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
                                BLKIO_PROP_io_wait_time),
                .read_map = blkiocg_file_read_map,
        },
        {
                .name = "io_merged",
                .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
                                BLKIO_PROP_io_merged),
                .read_map = blkiocg_file_read_map,
        },
        {
                .name = "io_queued",
                .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
                                BLKIO_PROP_io_queued),
                .read_map = blkiocg_file_read_map,
        },
        {
                .name = "reset_stats",
                .write_u64 = blkiocg_reset_stats,
        },
#ifdef CONFIG_BLK_DEV_THROTTLING
        {
                .name = "throttle.read_bps_device",
                .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
                                BLKIO_THROTL_read_bps_device),
                .read_seq_string = blkiocg_file_read,
                .write_string = blkiocg_file_write,
                .max_write_len = 256,
        },
        {
                .name = "throttle.write_bps_device",
                .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
                                BLKIO_THROTL_write_bps_device),
                .read_seq_string = blkiocg_file_read,
                .write_string = blkiocg_file_write,
                .max_write_len = 256,
        },
        {
                .name = "throttle.read_iops_device",
                .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
                                BLKIO_THROTL_read_iops_device),
                .read_seq_string = blkiocg_file_read,
                .write_string = blkiocg_file_write,
                .max_write_len = 256,
        },
        {
                .name = "throttle.write_iops_device",
                .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
                                BLKIO_THROTL_write_iops_device),
                .read_seq_string = blkiocg_file_read,
                .write_string = blkiocg_file_write,
                .max_write_len = 256,
        },
        {
                .name = "throttle.io_service_bytes",
                .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
                                BLKIO_THROTL_io_service_bytes),
                .read_map = blkiocg_file_read_map,
        },
        {
                .name = "throttle.io_serviced",
                .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
                                BLKIO_THROTL_io_serviced),
                .read_map = blkiocg_file_read_map,
        },
#endif /* CONFIG_BLK_DEV_THROTTLING */

#ifdef CONFIG_DEBUG_BLK_CGROUP
        {
                .name = "avg_queue_size",
                .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
                                BLKIO_PROP_avg_queue_size),
                .read_map = blkiocg_file_read_map,
        },
        {
                .name = "group_wait_time",
                .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
                                BLKIO_PROP_group_wait_time),
                .read_map = blkiocg_file_read_map,
        },
        {
                .name = "idle_time",
                .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
                                BLKIO_PROP_idle_time),
                .read_map = blkiocg_file_read_map,
        },
        {
                .name = "empty_time",
                .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
                                BLKIO_PROP_empty_time),
                .read_map = blkiocg_file_read_map,
        },
        {
                .name = "dequeue",
                .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
                                BLKIO_PROP_dequeue),
                .read_map = blkiocg_file_read_map,
        },
#endif
};

static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
        return cgroup_add_files(cgroup, subsys, blkio_files,
                                ARRAY_SIZE(blkio_files));
}

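/*
 * Called when the cgroup goes away: unlink every remaining blkio_group,
 * tell the owning policy about each removal via blkio_unlink_group_fn,
 * then free all policy nodes and finally the blkio_cgroup itself (unless
 * it is the static root cgroup).
 */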
static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
        struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
        unsigned long flags;
        struct blkio_group *blkg;
        void *key;
        struct blkio_policy_type *blkiop;
        struct blkio_policy_node *pn, *pntmp;

        rcu_read_lock();
        do {
                spin_lock_irqsave(&blkcg->lock, flags);

                if (hlist_empty(&blkcg->blkg_list)) {
                        spin_unlock_irqrestore(&blkcg->lock, flags);
                        break;
                }

                blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
                                        blkcg_node);
                key = rcu_dereference(blkg->key);
                __blkiocg_del_blkio_group(blkg);

                spin_unlock_irqrestore(&blkcg->lock, flags);

                /*
                 * This blkio_group is being unlinked as the associated cgroup
                 * is going away. Let all the IO controlling policies know
                 * about this event.
                 */
                spin_lock(&blkio_list_lock);
                list_for_each_entry(blkiop, &blkio_list, list) {
                        if (blkiop->plid != blkg->plid)
                                continue;
                        blkiop->ops.blkio_unlink_group_fn(key, blkg);
                }
                spin_unlock(&blkio_list_lock);
        } while (1);

        list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) {
                blkio_policy_delete_node(pn);
                kfree(pn);
        }

        free_css_id(&blkio_subsys, &blkcg->css);
        rcu_read_unlock();
        if (blkcg != &blkio_root_cgroup)
                kfree(blkcg);
}

static struct cgroup_subsys_state *
blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
        struct blkio_cgroup *blkcg;
        struct cgroup *parent = cgroup->parent;

        if (!parent) {
                blkcg = &blkio_root_cgroup;
                goto done;
        }

        blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
        if (!blkcg)
                return ERR_PTR(-ENOMEM);

        blkcg->weight = BLKIO_WEIGHT_DEFAULT;
done:
        spin_lock_init(&blkcg->lock);
        INIT_HLIST_HEAD(&blkcg->blkg_list);

        INIT_LIST_HEAD(&blkcg->policy_list);
        return &blkcg->css;
}

/*
 * We cannot support shared io contexts, as we have no means to support
 * two tasks with the same ioc in two different groups without major rework
 * of the main cic data structures.  For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 */
static int blkiocg_can_attach(struct cgroup_subsys *subsys,
                                struct cgroup *cgroup, struct task_struct *tsk,
                                bool threadgroup)
{
        struct io_context *ioc;
        int ret = 0;

        /* task_lock() is needed to avoid races with exit_io_context() */
        task_lock(tsk);
        ioc = tsk->io_context;
        if (ioc && atomic_read(&ioc->nr_tasks) > 1)
                ret = -EINVAL;
        task_unlock(tsk);

        return ret;
}

static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
                                struct cgroup *prev, struct task_struct *tsk,
                                bool threadgroup)
{
        struct io_context *ioc;

        task_lock(tsk);
        ioc = tsk->io_context;
        if (ioc)
                ioc->cgroup_changed = 1;
        task_unlock(tsk);
}

void blkio_policy_register(struct blkio_policy_type *blkiop)
{
        spin_lock(&blkio_list_lock);
        list_add_tail(&blkiop->list, &blkio_list);
        spin_unlock(&blkio_list_lock);
}
EXPORT_SYMBOL_GPL(blkio_policy_register);

void blkio_policy_unregister(struct blkio_policy_type *blkiop)
{
        spin_lock(&blkio_list_lock);
        list_del_init(&blkiop->list);
        spin_unlock(&blkio_list_lock);
}
EXPORT_SYMBOL_GPL(blkio_policy_unregister);

static int __init init_cgroup_blkio(void)
{
        return cgroup_load_subsys(&blkio_subsys);
}

static void __exit exit_cgroup_blkio(void)
{
        cgroup_unload_subsys(&blkio_subsys);
}

module_init(init_cgroup_blkio);
module_exit(exit_cgroup_blkio);
MODULE_LICENSE("GPL");