linux/kernel/cgroup/rdma.c
<<
>>
Prefs
   1/*
   2 * RDMA resource limiting controller for cgroups.
   3 *
   4 * Used to allow a cgroup hierarchy to stop processes from consuming
   5 * additional RDMA resources after a certain limit is reached.
   6 *
   7 * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
   8 *
   9 * This file is subject to the terms and conditions of version 2 of the GNU
  10 * General Public License. See the file COPYING in the main directory of the
  11 * Linux distribution for more details.
  12 */
  13
  14#include <linux/bitops.h>
  15#include <linux/slab.h>
  16#include <linux/seq_file.h>
  17#include <linux/cgroup.h>
  18#include <linux/parser.h>
  19#include <linux/cgroup_rdma.h>
  20
  21#define RDMACG_MAX_STR "max"
  22
  23/*
  24 * Protects list of resource pools maintained on per cgroup basis
  25 * and rdma device list.
  26 */
  27static DEFINE_MUTEX(rdmacg_mutex);
  28static LIST_HEAD(rdmacg_devices);
  29
  30enum rdmacg_file_type {
  31        RDMACG_RESOURCE_TYPE_MAX,
  32        RDMACG_RESOURCE_TYPE_STAT,
  33};
  34
  35/*
  36 * resource table definition as to be seen by the user.
  37 * Need to add entries to it when more resources are
  38 * added/defined at IB verb/core layer.
  39 */
  40static char const *rdmacg_resource_names[] = {
  41        [RDMACG_RESOURCE_HCA_HANDLE]    = "hca_handle",
  42        [RDMACG_RESOURCE_HCA_OBJECT]    = "hca_object",
  43};
  44
  45/* resource tracker for each resource of rdma cgroup */
  46struct rdmacg_resource {
  47        int max;
  48        int usage;
  49};
  50
  51/*
  52 * resource pool object which represents per cgroup, per device
  53 * resources. There are multiple instances of this object per cgroup,
  54 * therefore it cannot be embedded within rdma_cgroup structure. It
  55 * is maintained as list.
  56 */
  57struct rdmacg_resource_pool {
  58        struct rdmacg_device    *device;
  59        struct rdmacg_resource  resources[RDMACG_RESOURCE_MAX];
  60
  61        struct list_head        cg_node;
  62        struct list_head        dev_node;
  63
  64        /* count active user tasks of this pool */
  65        u64                     usage_sum;
  66        /* total number counts which are set to max */
  67        int                     num_max_cnt;
  68};
  69
  70static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css)
  71{
  72        return container_of(css, struct rdma_cgroup, css);
  73}
  74
  75static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg)
  76{
  77        return css_rdmacg(cg->css.parent);
  78}
  79
  80static inline struct rdma_cgroup *get_current_rdmacg(void)
  81{
  82        return css_rdmacg(task_get_css(current, rdma_cgrp_id));
  83}
  84
  85static void set_resource_limit(struct rdmacg_resource_pool *rpool,
  86                               int index, int new_max)
  87{
  88        if (new_max == S32_MAX) {
  89                if (rpool->resources[index].max != S32_MAX)
  90                        rpool->num_max_cnt++;
  91        } else {
  92                if (rpool->resources[index].max == S32_MAX)
  93                        rpool->num_max_cnt--;
  94        }
  95        rpool->resources[index].max = new_max;
  96}
  97
  98static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool)
  99{
 100        int i;
 101
 102        for (i = 0; i < RDMACG_RESOURCE_MAX; i++)
 103                set_resource_limit(rpool, i, S32_MAX);
 104}
 105
 106static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool)
 107{
 108        lockdep_assert_held(&rdmacg_mutex);
 109
 110        list_del(&rpool->cg_node);
 111        list_del(&rpool->dev_node);
 112        kfree(rpool);
 113}
 114
 115static struct rdmacg_resource_pool *
 116find_cg_rpool_locked(struct rdma_cgroup *cg,
 117                     struct rdmacg_device *device)
 118
 119{
 120        struct rdmacg_resource_pool *pool;
 121
 122        lockdep_assert_held(&rdmacg_mutex);
 123
 124        list_for_each_entry(pool, &cg->rpools, cg_node)
 125                if (pool->device == device)
 126                        return pool;
 127
 128        return NULL;
 129}
 130
 131static struct rdmacg_resource_pool *
 132get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device)
 133{
 134        struct rdmacg_resource_pool *rpool;
 135
 136        rpool = find_cg_rpool_locked(cg, device);
 137        if (rpool)
 138                return rpool;
 139
 140        rpool = kzalloc(sizeof(*rpool), GFP_KERNEL);
 141        if (!rpool)
 142                return ERR_PTR(-ENOMEM);
 143
 144        rpool->device = device;
 145        set_all_resource_max_limit(rpool);
 146
 147        INIT_LIST_HEAD(&rpool->cg_node);
 148        INIT_LIST_HEAD(&rpool->dev_node);
 149        list_add_tail(&rpool->cg_node, &cg->rpools);
 150        list_add_tail(&rpool->dev_node, &device->rpools);
 151        return rpool;
 152}
 153
 154/**
 155 * uncharge_cg_locked - uncharge resource for rdma cgroup
 156 * @cg: pointer to cg to uncharge and all parents in hierarchy
 157 * @device: pointer to rdmacg device
 158 * @index: index of the resource to uncharge in cg (resource pool)
 159 *
 160 * It also frees the resource pool which was created as part of
 161 * charging operation when there are no resources attached to
 162 * resource pool.
 163 */
 164static void
 165uncharge_cg_locked(struct rdma_cgroup *cg,
 166                   struct rdmacg_device *device,
 167                   enum rdmacg_resource_type index)
 168{
 169        struct rdmacg_resource_pool *rpool;
 170
 171        rpool = find_cg_rpool_locked(cg, device);
 172
 173        /*
 174         * rpool cannot be null at this stage. Let kernel operate in case
 175         * if there a bug in IB stack or rdma controller, instead of crashing
 176         * the system.
 177         */
 178        if (unlikely(!rpool)) {
 179                pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device);
 180                return;
 181        }
 182
 183        rpool->resources[index].usage--;
 184
 185        /*
 186         * A negative count (or overflow) is invalid,
 187         * it indicates a bug in the rdma controller.
 188         */
 189        WARN_ON_ONCE(rpool->resources[index].usage < 0);
 190        rpool->usage_sum--;
 191        if (rpool->usage_sum == 0 &&
 192            rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
 193                /*
 194                 * No user of the rpool and all entries are set to max, so
 195                 * safe to delete this rpool.
 196                 */
 197                free_cg_rpool_locked(rpool);
 198        }
 199}
 200
 201/**
 202 * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count
 203 * @device: pointer to rdmacg device
 204 * @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup
 205 *           stop uncharging
 206 * @index: index of the resource to uncharge in cg in given resource pool
 207 */
 208static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg,
 209                                     struct rdmacg_device *device,
 210                                     struct rdma_cgroup *stop_cg,
 211                                     enum rdmacg_resource_type index)
 212{
 213        struct rdma_cgroup *p;
 214
 215        mutex_lock(&rdmacg_mutex);
 216
 217        for (p = cg; p != stop_cg; p = parent_rdmacg(p))
 218                uncharge_cg_locked(p, device, index);
 219
 220        mutex_unlock(&rdmacg_mutex);
 221
 222        css_put(&cg->css);
 223}
 224
 225/**
 226 * rdmacg_uncharge - hierarchically uncharge rdma resource count
 227 * @device: pointer to rdmacg device
 228 * @index: index of the resource to uncharge in cgroup in given resource pool
 229 */
 230void rdmacg_uncharge(struct rdma_cgroup *cg,
 231                     struct rdmacg_device *device,
 232                     enum rdmacg_resource_type index)
 233{
 234        if (index >= RDMACG_RESOURCE_MAX)
 235                return;
 236
 237        rdmacg_uncharge_hierarchy(cg, device, NULL, index);
 238}
 239EXPORT_SYMBOL(rdmacg_uncharge);
 240
 241/**
 242 * rdmacg_try_charge - hierarchically try to charge the rdma resource
 243 * @rdmacg: pointer to rdma cgroup which will own this resource
 244 * @device: pointer to rdmacg device
 245 * @index: index of the resource to charge in cgroup (resource pool)
 246 *
 247 * This function follows charging resource in hierarchical way.
 248 * It will fail if the charge would cause the new value to exceed the
 249 * hierarchical limit.
 250 * Returns 0 if the charge succeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
 251 * Returns pointer to rdmacg for this resource when charging is successful.
 252 *
 253 * Charger needs to account resources on two criteria.
 254 * (a) per cgroup & (b) per device resource usage.
 255 * Per cgroup resource usage ensures that tasks of cgroup doesn't cross
 256 * the configured limits. Per device provides granular configuration
 257 * in multi device usage. It allocates resource pool in the hierarchy
 258 * for each parent it come across for first resource. Later on resource
 259 * pool will be available. Therefore it will be much faster thereon
 260 * to charge/uncharge.
 261 */
 262int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
 263                      struct rdmacg_device *device,
 264                      enum rdmacg_resource_type index)
 265{
 266        struct rdma_cgroup *cg, *p;
 267        struct rdmacg_resource_pool *rpool;
 268        s64 new;
 269        int ret = 0;
 270
 271        if (index >= RDMACG_RESOURCE_MAX)
 272                return -EINVAL;
 273
 274        /*
 275         * hold on to css, as cgroup can be removed but resource
 276         * accounting happens on css.
 277         */
 278        cg = get_current_rdmacg();
 279
 280        mutex_lock(&rdmacg_mutex);
 281        for (p = cg; p; p = parent_rdmacg(p)) {
 282                rpool = get_cg_rpool_locked(p, device);
 283                if (IS_ERR(rpool)) {
 284                        ret = PTR_ERR(rpool);
 285                        goto err;
 286                } else {
 287                        new = rpool->resources[index].usage + 1;
 288                        if (new > rpool->resources[index].max) {
 289                                ret = -EAGAIN;
 290                                goto err;
 291                        } else {
 292                                rpool->resources[index].usage = new;
 293                                rpool->usage_sum++;
 294                        }
 295                }
 296        }
 297        mutex_unlock(&rdmacg_mutex);
 298
 299        *rdmacg = cg;
 300        return 0;
 301
 302err:
 303        mutex_unlock(&rdmacg_mutex);
 304        rdmacg_uncharge_hierarchy(cg, device, p, index);
 305        return ret;
 306}
 307EXPORT_SYMBOL(rdmacg_try_charge);
 308
 309/**
 310 * rdmacg_register_device - register rdmacg device to rdma controller.
 311 * @device: pointer to rdmacg device whose resources need to be accounted.
 312 *
 313 * If IB stack wish a device to participate in rdma cgroup resource
 314 * tracking, it must invoke this API to register with rdma cgroup before
 315 * any user space application can start using the RDMA resources.
 316 * Returns 0 on success or EINVAL when table length given is beyond
 317 * supported size.
 318 */
 319int rdmacg_register_device(struct rdmacg_device *device)
 320{
 321        INIT_LIST_HEAD(&device->dev_node);
 322        INIT_LIST_HEAD(&device->rpools);
 323
 324        mutex_lock(&rdmacg_mutex);
 325        list_add_tail(&device->dev_node, &rdmacg_devices);
 326        mutex_unlock(&rdmacg_mutex);
 327        return 0;
 328}
 329EXPORT_SYMBOL(rdmacg_register_device);
 330
 331/**
 332 * rdmacg_unregister_device - unregister rdmacg device from rdma controller.
 333 * @device: pointer to rdmacg device which was previously registered with rdma
 334 *          controller using rdmacg_register_device().
 335 *
 336 * IB stack must invoke this after all the resources of the IB device
 337 * are destroyed and after ensuring that no more resources will be created
 338 * when this API is invoked.
 339 */
 340void rdmacg_unregister_device(struct rdmacg_device *device)
 341{
 342        struct rdmacg_resource_pool *rpool, *tmp;
 343
 344        /*
 345         * Synchronize with any active resource settings,
 346         * usage query happening via configfs.
 347         */
 348        mutex_lock(&rdmacg_mutex);
 349        list_del_init(&device->dev_node);
 350
 351        /*
 352         * Now that this device is off the cgroup list, its safe to free
 353         * all the rpool resources.
 354         */
 355        list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node)
 356                free_cg_rpool_locked(rpool);
 357
 358        mutex_unlock(&rdmacg_mutex);
 359}
 360EXPORT_SYMBOL(rdmacg_unregister_device);
 361
 362static int parse_resource(char *c, int *intval)
 363{
 364        substring_t argstr;
 365        const char **table = &rdmacg_resource_names[0];
 366        char *name, *value = c;
 367        size_t len;
 368        int ret, i = 0;
 369
 370        name = strsep(&value, "=");
 371        if (!name || !value)
 372                return -EINVAL;
 373
 374        len = strlen(value);
 375
 376        for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
 377                if (strcmp(table[i], name))
 378                        continue;
 379
 380                argstr.from = value;
 381                argstr.to = value + len;
 382
 383                ret = match_int(&argstr, intval);
 384                if (ret >= 0) {
 385                        if (*intval < 0)
 386                                break;
 387                        return i;
 388                }
 389                if (strncmp(value, RDMACG_MAX_STR, len) == 0) {
 390                        *intval = S32_MAX;
 391                        return i;
 392                }
 393                break;
 394        }
 395        return -EINVAL;
 396}
 397
 398static int rdmacg_parse_limits(char *options,
 399                               int *new_limits, unsigned long *enables)
 400{
 401        char *c;
 402        int err = -EINVAL;
 403
 404        /* parse resource options */
 405        while ((c = strsep(&options, " ")) != NULL) {
 406                int index, intval;
 407
 408                index = parse_resource(c, &intval);
 409                if (index < 0)
 410                        goto err;
 411
 412                new_limits[index] = intval;
 413                *enables |= BIT(index);
 414        }
 415        return 0;
 416
 417err:
 418        return err;
 419}
 420
 421static struct rdmacg_device *rdmacg_get_device_locked(const char *name)
 422{
 423        struct rdmacg_device *device;
 424
 425        lockdep_assert_held(&rdmacg_mutex);
 426
 427        list_for_each_entry(device, &rdmacg_devices, dev_node)
 428                if (!strcmp(name, device->name))
 429                        return device;
 430
 431        return NULL;
 432}
 433
 434static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
 435                                       char *buf, size_t nbytes, loff_t off)
 436{
 437        struct rdma_cgroup *cg = css_rdmacg(of_css(of));
 438        const char *dev_name;
 439        struct rdmacg_resource_pool *rpool;
 440        struct rdmacg_device *device;
 441        char *options = strstrip(buf);
 442        int *new_limits;
 443        unsigned long enables = 0;
 444        int i = 0, ret = 0;
 445
 446        /* extract the device name first */
 447        dev_name = strsep(&options, " ");
 448        if (!dev_name) {
 449                ret = -EINVAL;
 450                goto err;
 451        }
 452
 453        new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL);
 454        if (!new_limits) {
 455                ret = -ENOMEM;
 456                goto err;
 457        }
 458
 459        ret = rdmacg_parse_limits(options, new_limits, &enables);
 460        if (ret)
 461                goto parse_err;
 462
 463        /* acquire lock to synchronize with hot plug devices */
 464        mutex_lock(&rdmacg_mutex);
 465
 466        device = rdmacg_get_device_locked(dev_name);
 467        if (!device) {
 468                ret = -ENODEV;
 469                goto dev_err;
 470        }
 471
 472        rpool = get_cg_rpool_locked(cg, device);
 473        if (IS_ERR(rpool)) {
 474                ret = PTR_ERR(rpool);
 475                goto dev_err;
 476        }
 477
 478        /* now set the new limits of the rpool */
 479        for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX)
 480                set_resource_limit(rpool, i, new_limits[i]);
 481
 482        if (rpool->usage_sum == 0 &&
 483            rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
 484                /*
 485                 * No user of the rpool and all entries are set to max, so
 486                 * safe to delete this rpool.
 487                 */
 488                free_cg_rpool_locked(rpool);
 489        }
 490
 491dev_err:
 492        mutex_unlock(&rdmacg_mutex);
 493
 494parse_err:
 495        kfree(new_limits);
 496
 497err:
 498        return ret ?: nbytes;
 499}
 500
 501static void print_rpool_values(struct seq_file *sf,
 502                               struct rdmacg_resource_pool *rpool)
 503{
 504        enum rdmacg_file_type sf_type;
 505        int i;
 506        u32 value;
 507
 508        sf_type = seq_cft(sf)->private;
 509
 510        for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
 511                seq_puts(sf, rdmacg_resource_names[i]);
 512                seq_putc(sf, '=');
 513                if (sf_type == RDMACG_RESOURCE_TYPE_MAX) {
 514                        if (rpool)
 515                                value = rpool->resources[i].max;
 516                        else
 517                                value = S32_MAX;
 518                } else {
 519                        if (rpool)
 520                                value = rpool->resources[i].usage;
 521                        else
 522                                value = 0;
 523                }
 524
 525                if (value == S32_MAX)
 526                        seq_puts(sf, RDMACG_MAX_STR);
 527                else
 528                        seq_printf(sf, "%d", value);
 529                seq_putc(sf, ' ');
 530        }
 531}
 532
 533static int rdmacg_resource_read(struct seq_file *sf, void *v)
 534{
 535        struct rdmacg_device *device;
 536        struct rdmacg_resource_pool *rpool;
 537        struct rdma_cgroup *cg = css_rdmacg(seq_css(sf));
 538
 539        mutex_lock(&rdmacg_mutex);
 540
 541        list_for_each_entry(device, &rdmacg_devices, dev_node) {
 542                seq_printf(sf, "%s ", device->name);
 543
 544                rpool = find_cg_rpool_locked(cg, device);
 545                print_rpool_values(sf, rpool);
 546
 547                seq_putc(sf, '\n');
 548        }
 549
 550        mutex_unlock(&rdmacg_mutex);
 551        return 0;
 552}
 553
 554static struct cftype rdmacg_files[] = {
 555        {
 556                .name = "max",
 557                .write = rdmacg_resource_set_max,
 558                .seq_show = rdmacg_resource_read,
 559                .private = RDMACG_RESOURCE_TYPE_MAX,
 560                .flags = CFTYPE_NOT_ON_ROOT,
 561        },
 562        {
 563                .name = "current",
 564                .seq_show = rdmacg_resource_read,
 565                .private = RDMACG_RESOURCE_TYPE_STAT,
 566                .flags = CFTYPE_NOT_ON_ROOT,
 567        },
 568        { }     /* terminate */
 569};
 570
 571static struct cgroup_subsys_state *
 572rdmacg_css_alloc(struct cgroup_subsys_state *parent)
 573{
 574        struct rdma_cgroup *cg;
 575
 576        cg = kzalloc(sizeof(*cg), GFP_KERNEL);
 577        if (!cg)
 578                return ERR_PTR(-ENOMEM);
 579
 580        INIT_LIST_HEAD(&cg->rpools);
 581        return &cg->css;
 582}
 583
 584static void rdmacg_css_free(struct cgroup_subsys_state *css)
 585{
 586        struct rdma_cgroup *cg = css_rdmacg(css);
 587
 588        kfree(cg);
 589}
 590
 591/**
 592 * rdmacg_css_offline - cgroup css_offline callback
 593 * @css: css of interest
 594 *
 595 * This function is called when @css is about to go away and responsible
 596 * for shooting down all rdmacg associated with @css. As part of that it
 597 * marks all the resource pool entries to max value, so that when resources are
 598 * uncharged, associated resource pool can be freed as well.
 599 */
 600static void rdmacg_css_offline(struct cgroup_subsys_state *css)
 601{
 602        struct rdma_cgroup *cg = css_rdmacg(css);
 603        struct rdmacg_resource_pool *rpool;
 604
 605        mutex_lock(&rdmacg_mutex);
 606
 607        list_for_each_entry(rpool, &cg->rpools, cg_node)
 608                set_all_resource_max_limit(rpool);
 609
 610        mutex_unlock(&rdmacg_mutex);
 611}
 612
 613struct cgroup_subsys rdma_cgrp_subsys = {
 614        .css_alloc      = rdmacg_css_alloc,
 615        .css_free       = rdmacg_css_free,
 616        .css_offline    = rdmacg_css_offline,
 617        .legacy_cftypes = rdmacg_files,
 618        .dfl_cftypes    = rdmacg_files,
 619};
 620