linux/kernel/cgroup/rdma.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * RDMA resource limiting controller for cgroups.
   4 *
   5 * Used to allow a cgroup hierarchy to stop processes from consuming
   6 * additional RDMA resources after a certain limit is reached.
   7 *
   8 * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
   9 */
  10
  11#include <linux/bitops.h>
  12#include <linux/slab.h>
  13#include <linux/seq_file.h>
  14#include <linux/cgroup.h>
  15#include <linux/parser.h>
  16#include <linux/cgroup_rdma.h>
  17
/*
 * Token accepted by parse_resource() and emitted by print_rpool_values()
 * for a limit equal to S32_MAX.
 */
#define RDMACG_MAX_STR "max"

/*
 * Protects list of resource pools maintained on per cgroup basis
 * and rdma device list.
 */
static DEFINE_MUTEX(rdmacg_mutex);
/* Registered rdmacg devices, linked through rdmacg_device::dev_node. */
static LIST_HEAD(rdmacg_devices);
  26
/*
 * Stored in cftype ->private; tells print_rpool_values() which value of
 * each resource to print.
 */
enum rdmacg_file_type {
        /* print the configured limit of each resource */
        RDMACG_RESOURCE_TYPE_MAX,
        /* print the current usage of each resource */
        RDMACG_RESOURCE_TYPE_STAT,
};
  31
  32/*
  33 * resource table definition as to be seen by the user.
  34 * Need to add entries to it when more resources are
  35 * added/defined at IB verb/core layer.
  36 */
  37static char const *rdmacg_resource_names[] = {
  38        [RDMACG_RESOURCE_HCA_HANDLE]    = "hca_handle",
  39        [RDMACG_RESOURCE_HCA_OBJECT]    = "hca_object",
  40};
  41
/* resource tracker for each resource of rdma cgroup */
struct rdmacg_resource {
        /* configured limit; S32_MAX is shown to the user as "max" */
        int max;
        /* number of charges currently accounted against this resource */
        int usage;
};
  47
/*
 * resource pool object which represents per cgroup, per device
 * resources. There are multiple instances of this object per cgroup,
 * therefore it cannot be embedded within rdma_cgroup structure. It
 * is maintained as list.
 */
struct rdmacg_resource_pool {
        /* device whose resources this pool accounts */
        struct rdmacg_device    *device;
        /* per-resource limit and usage */
        struct rdmacg_resource  resources[RDMACG_RESOURCE_MAX];

        /* link in the owning cgroup's rpools list */
        struct list_head        cg_node;
        /* link in the device's rpools list */
        struct list_head        dev_node;

        /*
         * total number of charges held in this pool, over all resources;
         * incremented on every charge and decremented on every uncharge
         */
        u64                     usage_sum;
        /* number of resources whose limit currently equals S32_MAX */
        int                     num_max_cnt;
};
  66
  67static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css)
  68{
  69        return container_of(css, struct rdma_cgroup, css);
  70}
  71
  72static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg)
  73{
  74        return css_rdmacg(cg->css.parent);
  75}
  76
  77static inline struct rdma_cgroup *get_current_rdmacg(void)
  78{
  79        return css_rdmacg(task_get_css(current, rdma_cgrp_id));
  80}
  81
  82static void set_resource_limit(struct rdmacg_resource_pool *rpool,
  83                               int index, int new_max)
  84{
  85        if (new_max == S32_MAX) {
  86                if (rpool->resources[index].max != S32_MAX)
  87                        rpool->num_max_cnt++;
  88        } else {
  89                if (rpool->resources[index].max == S32_MAX)
  90                        rpool->num_max_cnt--;
  91        }
  92        rpool->resources[index].max = new_max;
  93}
  94
  95static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool)
  96{
  97        int i;
  98
  99        for (i = 0; i < RDMACG_RESOURCE_MAX; i++)
 100                set_resource_limit(rpool, i, S32_MAX);
 101}
 102
 103static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool)
 104{
 105        lockdep_assert_held(&rdmacg_mutex);
 106
 107        list_del(&rpool->cg_node);
 108        list_del(&rpool->dev_node);
 109        kfree(rpool);
 110}
 111
 112static struct rdmacg_resource_pool *
 113find_cg_rpool_locked(struct rdma_cgroup *cg,
 114                     struct rdmacg_device *device)
 115
 116{
 117        struct rdmacg_resource_pool *pool;
 118
 119        lockdep_assert_held(&rdmacg_mutex);
 120
 121        list_for_each_entry(pool, &cg->rpools, cg_node)
 122                if (pool->device == device)
 123                        return pool;
 124
 125        return NULL;
 126}
 127
 128static struct rdmacg_resource_pool *
 129get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device)
 130{
 131        struct rdmacg_resource_pool *rpool;
 132
 133        rpool = find_cg_rpool_locked(cg, device);
 134        if (rpool)
 135                return rpool;
 136
 137        rpool = kzalloc(sizeof(*rpool), GFP_KERNEL);
 138        if (!rpool)
 139                return ERR_PTR(-ENOMEM);
 140
 141        rpool->device = device;
 142        set_all_resource_max_limit(rpool);
 143
 144        INIT_LIST_HEAD(&rpool->cg_node);
 145        INIT_LIST_HEAD(&rpool->dev_node);
 146        list_add_tail(&rpool->cg_node, &cg->rpools);
 147        list_add_tail(&rpool->dev_node, &device->rpools);
 148        return rpool;
 149}
 150
 151/**
 152 * uncharge_cg_locked - uncharge resource for rdma cgroup
 153 * @cg: pointer to cg to uncharge and all parents in hierarchy
 154 * @device: pointer to rdmacg device
 155 * @index: index of the resource to uncharge in cg (resource pool)
 156 *
 157 * It also frees the resource pool which was created as part of
 158 * charging operation when there are no resources attached to
 159 * resource pool.
 160 */
 161static void
 162uncharge_cg_locked(struct rdma_cgroup *cg,
 163                   struct rdmacg_device *device,
 164                   enum rdmacg_resource_type index)
 165{
 166        struct rdmacg_resource_pool *rpool;
 167
 168        rpool = find_cg_rpool_locked(cg, device);
 169
 170        /*
 171         * rpool cannot be null at this stage. Let kernel operate in case
 172         * if there a bug in IB stack or rdma controller, instead of crashing
 173         * the system.
 174         */
 175        if (unlikely(!rpool)) {
 176                pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device);
 177                return;
 178        }
 179
 180        rpool->resources[index].usage--;
 181
 182        /*
 183         * A negative count (or overflow) is invalid,
 184         * it indicates a bug in the rdma controller.
 185         */
 186        WARN_ON_ONCE(rpool->resources[index].usage < 0);
 187        rpool->usage_sum--;
 188        if (rpool->usage_sum == 0 &&
 189            rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
 190                /*
 191                 * No user of the rpool and all entries are set to max, so
 192                 * safe to delete this rpool.
 193                 */
 194                free_cg_rpool_locked(rpool);
 195        }
 196}
 197
 198/**
 199 * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count
 200 * @device: pointer to rdmacg device
 201 * @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup
 202 *           stop uncharging
 203 * @index: index of the resource to uncharge in cg in given resource pool
 204 */
 205static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg,
 206                                     struct rdmacg_device *device,
 207                                     struct rdma_cgroup *stop_cg,
 208                                     enum rdmacg_resource_type index)
 209{
 210        struct rdma_cgroup *p;
 211
 212        mutex_lock(&rdmacg_mutex);
 213
 214        for (p = cg; p != stop_cg; p = parent_rdmacg(p))
 215                uncharge_cg_locked(p, device, index);
 216
 217        mutex_unlock(&rdmacg_mutex);
 218
 219        css_put(&cg->css);
 220}
 221
 222/**
 223 * rdmacg_uncharge - hierarchically uncharge rdma resource count
 224 * @device: pointer to rdmacg device
 225 * @index: index of the resource to uncharge in cgroup in given resource pool
 226 */
 227void rdmacg_uncharge(struct rdma_cgroup *cg,
 228                     struct rdmacg_device *device,
 229                     enum rdmacg_resource_type index)
 230{
 231        if (index >= RDMACG_RESOURCE_MAX)
 232                return;
 233
 234        rdmacg_uncharge_hierarchy(cg, device, NULL, index);
 235}
 236EXPORT_SYMBOL(rdmacg_uncharge);
 237
 238/**
 239 * rdmacg_try_charge - hierarchically try to charge the rdma resource
 240 * @rdmacg: pointer to rdma cgroup which will own this resource
 241 * @device: pointer to rdmacg device
 242 * @index: index of the resource to charge in cgroup (resource pool)
 243 *
 244 * This function follows charging resource in hierarchical way.
 245 * It will fail if the charge would cause the new value to exceed the
 246 * hierarchical limit.
 247 * Returns 0 if the charge succeeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
 248 * Returns pointer to rdmacg for this resource when charging is successful.
 249 *
 250 * Charger needs to account resources on two criteria.
 251 * (a) per cgroup & (b) per device resource usage.
 252 * Per cgroup resource usage ensures that tasks of cgroup doesn't cross
 253 * the configured limits. Per device provides granular configuration
 254 * in multi device usage. It allocates resource pool in the hierarchy
 255 * for each parent it come across for first resource. Later on resource
 256 * pool will be available. Therefore it will be much faster thereon
 257 * to charge/uncharge.
 258 */
 259int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
 260                      struct rdmacg_device *device,
 261                      enum rdmacg_resource_type index)
 262{
 263        struct rdma_cgroup *cg, *p;
 264        struct rdmacg_resource_pool *rpool;
 265        s64 new;
 266        int ret = 0;
 267
 268        if (index >= RDMACG_RESOURCE_MAX)
 269                return -EINVAL;
 270
 271        /*
 272         * hold on to css, as cgroup can be removed but resource
 273         * accounting happens on css.
 274         */
 275        cg = get_current_rdmacg();
 276
 277        mutex_lock(&rdmacg_mutex);
 278        for (p = cg; p; p = parent_rdmacg(p)) {
 279                rpool = get_cg_rpool_locked(p, device);
 280                if (IS_ERR(rpool)) {
 281                        ret = PTR_ERR(rpool);
 282                        goto err;
 283                } else {
 284                        new = rpool->resources[index].usage + 1;
 285                        if (new > rpool->resources[index].max) {
 286                                ret = -EAGAIN;
 287                                goto err;
 288                        } else {
 289                                rpool->resources[index].usage = new;
 290                                rpool->usage_sum++;
 291                        }
 292                }
 293        }
 294        mutex_unlock(&rdmacg_mutex);
 295
 296        *rdmacg = cg;
 297        return 0;
 298
 299err:
 300        mutex_unlock(&rdmacg_mutex);
 301        rdmacg_uncharge_hierarchy(cg, device, p, index);
 302        return ret;
 303}
 304EXPORT_SYMBOL(rdmacg_try_charge);
 305
 306/**
 307 * rdmacg_register_device - register rdmacg device to rdma controller.
 308 * @device: pointer to rdmacg device whose resources need to be accounted.
 309 *
 310 * If IB stack wish a device to participate in rdma cgroup resource
 311 * tracking, it must invoke this API to register with rdma cgroup before
 312 * any user space application can start using the RDMA resources.
 313 */
 314void rdmacg_register_device(struct rdmacg_device *device)
 315{
 316        INIT_LIST_HEAD(&device->dev_node);
 317        INIT_LIST_HEAD(&device->rpools);
 318
 319        mutex_lock(&rdmacg_mutex);
 320        list_add_tail(&device->dev_node, &rdmacg_devices);
 321        mutex_unlock(&rdmacg_mutex);
 322}
 323EXPORT_SYMBOL(rdmacg_register_device);
 324
 325/**
 326 * rdmacg_unregister_device - unregister rdmacg device from rdma controller.
 327 * @device: pointer to rdmacg device which was previously registered with rdma
 328 *          controller using rdmacg_register_device().
 329 *
 330 * IB stack must invoke this after all the resources of the IB device
 331 * are destroyed and after ensuring that no more resources will be created
 332 * when this API is invoked.
 333 */
 334void rdmacg_unregister_device(struct rdmacg_device *device)
 335{
 336        struct rdmacg_resource_pool *rpool, *tmp;
 337
 338        /*
 339         * Synchronize with any active resource settings,
 340         * usage query happening via configfs.
 341         */
 342        mutex_lock(&rdmacg_mutex);
 343        list_del_init(&device->dev_node);
 344
 345        /*
 346         * Now that this device is off the cgroup list, its safe to free
 347         * all the rpool resources.
 348         */
 349        list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node)
 350                free_cg_rpool_locked(rpool);
 351
 352        mutex_unlock(&rdmacg_mutex);
 353}
 354EXPORT_SYMBOL(rdmacg_unregister_device);
 355
 356static int parse_resource(char *c, int *intval)
 357{
 358        substring_t argstr;
 359        char *name, *value = c;
 360        size_t len;
 361        int ret, i;
 362
 363        name = strsep(&value, "=");
 364        if (!name || !value)
 365                return -EINVAL;
 366
 367        i = match_string(rdmacg_resource_names, RDMACG_RESOURCE_MAX, name);
 368        if (i < 0)
 369                return i;
 370
 371        len = strlen(value);
 372
 373        argstr.from = value;
 374        argstr.to = value + len;
 375
 376        ret = match_int(&argstr, intval);
 377        if (ret >= 0) {
 378                if (*intval < 0)
 379                        return -EINVAL;
 380                return i;
 381        }
 382        if (strncmp(value, RDMACG_MAX_STR, len) == 0) {
 383                *intval = S32_MAX;
 384                return i;
 385        }
 386        return -EINVAL;
 387}
 388
 389static int rdmacg_parse_limits(char *options,
 390                               int *new_limits, unsigned long *enables)
 391{
 392        char *c;
 393        int err = -EINVAL;
 394
 395        /* parse resource options */
 396        while ((c = strsep(&options, " ")) != NULL) {
 397                int index, intval;
 398
 399                index = parse_resource(c, &intval);
 400                if (index < 0)
 401                        goto err;
 402
 403                new_limits[index] = intval;
 404                *enables |= BIT(index);
 405        }
 406        return 0;
 407
 408err:
 409        return err;
 410}
 411
 412static struct rdmacg_device *rdmacg_get_device_locked(const char *name)
 413{
 414        struct rdmacg_device *device;
 415
 416        lockdep_assert_held(&rdmacg_mutex);
 417
 418        list_for_each_entry(device, &rdmacg_devices, dev_node)
 419                if (!strcmp(name, device->name))
 420                        return device;
 421
 422        return NULL;
 423}
 424
 425static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
 426                                       char *buf, size_t nbytes, loff_t off)
 427{
 428        struct rdma_cgroup *cg = css_rdmacg(of_css(of));
 429        const char *dev_name;
 430        struct rdmacg_resource_pool *rpool;
 431        struct rdmacg_device *device;
 432        char *options = strstrip(buf);
 433        int *new_limits;
 434        unsigned long enables = 0;
 435        int i = 0, ret = 0;
 436
 437        /* extract the device name first */
 438        dev_name = strsep(&options, " ");
 439        if (!dev_name) {
 440                ret = -EINVAL;
 441                goto err;
 442        }
 443
 444        new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL);
 445        if (!new_limits) {
 446                ret = -ENOMEM;
 447                goto err;
 448        }
 449
 450        ret = rdmacg_parse_limits(options, new_limits, &enables);
 451        if (ret)
 452                goto parse_err;
 453
 454        /* acquire lock to synchronize with hot plug devices */
 455        mutex_lock(&rdmacg_mutex);
 456
 457        device = rdmacg_get_device_locked(dev_name);
 458        if (!device) {
 459                ret = -ENODEV;
 460                goto dev_err;
 461        }
 462
 463        rpool = get_cg_rpool_locked(cg, device);
 464        if (IS_ERR(rpool)) {
 465                ret = PTR_ERR(rpool);
 466                goto dev_err;
 467        }
 468
 469        /* now set the new limits of the rpool */
 470        for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX)
 471                set_resource_limit(rpool, i, new_limits[i]);
 472
 473        if (rpool->usage_sum == 0 &&
 474            rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
 475                /*
 476                 * No user of the rpool and all entries are set to max, so
 477                 * safe to delete this rpool.
 478                 */
 479                free_cg_rpool_locked(rpool);
 480        }
 481
 482dev_err:
 483        mutex_unlock(&rdmacg_mutex);
 484
 485parse_err:
 486        kfree(new_limits);
 487
 488err:
 489        return ret ?: nbytes;
 490}
 491
 492static void print_rpool_values(struct seq_file *sf,
 493                               struct rdmacg_resource_pool *rpool)
 494{
 495        enum rdmacg_file_type sf_type;
 496        int i;
 497        u32 value;
 498
 499        sf_type = seq_cft(sf)->private;
 500
 501        for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
 502                seq_puts(sf, rdmacg_resource_names[i]);
 503                seq_putc(sf, '=');
 504                if (sf_type == RDMACG_RESOURCE_TYPE_MAX) {
 505                        if (rpool)
 506                                value = rpool->resources[i].max;
 507                        else
 508                                value = S32_MAX;
 509                } else {
 510                        if (rpool)
 511                                value = rpool->resources[i].usage;
 512                        else
 513                                value = 0;
 514                }
 515
 516                if (value == S32_MAX)
 517                        seq_puts(sf, RDMACG_MAX_STR);
 518                else
 519                        seq_printf(sf, "%d", value);
 520                seq_putc(sf, ' ');
 521        }
 522}
 523
 524static int rdmacg_resource_read(struct seq_file *sf, void *v)
 525{
 526        struct rdmacg_device *device;
 527        struct rdmacg_resource_pool *rpool;
 528        struct rdma_cgroup *cg = css_rdmacg(seq_css(sf));
 529
 530        mutex_lock(&rdmacg_mutex);
 531
 532        list_for_each_entry(device, &rdmacg_devices, dev_node) {
 533                seq_printf(sf, "%s ", device->name);
 534
 535                rpool = find_cg_rpool_locked(cg, device);
 536                print_rpool_values(sf, rpool);
 537
 538                seq_putc(sf, '\n');
 539        }
 540
 541        mutex_unlock(&rdmacg_mutex);
 542        return 0;
 543}
 544
/*
 * Control files of the rdma controller. Both entries share the same read
 * handler; ->private tells it whether to print limits or usage.
 */
static struct cftype rdmacg_files[] = {
        {
                /* limits: readable and writable */
                .name = "max",
                .write = rdmacg_resource_set_max,
                .seq_show = rdmacg_resource_read,
                .private = RDMACG_RESOURCE_TYPE_MAX,
                .flags = CFTYPE_NOT_ON_ROOT,
        },
        {
                /* current usage: no write handler */
                .name = "current",
                .seq_show = rdmacg_resource_read,
                .private = RDMACG_RESOURCE_TYPE_STAT,
                .flags = CFTYPE_NOT_ON_ROOT,
        },
        { }     /* terminate */
};
 561
 562static struct cgroup_subsys_state *
 563rdmacg_css_alloc(struct cgroup_subsys_state *parent)
 564{
 565        struct rdma_cgroup *cg;
 566
 567        cg = kzalloc(sizeof(*cg), GFP_KERNEL);
 568        if (!cg)
 569                return ERR_PTR(-ENOMEM);
 570
 571        INIT_LIST_HEAD(&cg->rpools);
 572        return &cg->css;
 573}
 574
 575static void rdmacg_css_free(struct cgroup_subsys_state *css)
 576{
 577        struct rdma_cgroup *cg = css_rdmacg(css);
 578
 579        kfree(cg);
 580}
 581
 582/**
 583 * rdmacg_css_offline - cgroup css_offline callback
 584 * @css: css of interest
 585 *
 586 * This function is called when @css is about to go away and responsible
 587 * for shooting down all rdmacg associated with @css. As part of that it
 588 * marks all the resource pool entries to max value, so that when resources are
 589 * uncharged, associated resource pool can be freed as well.
 590 */
 591static void rdmacg_css_offline(struct cgroup_subsys_state *css)
 592{
 593        struct rdma_cgroup *cg = css_rdmacg(css);
 594        struct rdmacg_resource_pool *rpool;
 595
 596        mutex_lock(&rdmacg_mutex);
 597
 598        list_for_each_entry(rpool, &cg->rpools, cg_node)
 599                set_all_resource_max_limit(rpool);
 600
 601        mutex_unlock(&rdmacg_mutex);
 602}
 603
/*
 * Subsystem descriptor of the rdma controller. The same file table is
 * used for both the legacy and the default cftypes.
 */
struct cgroup_subsys rdma_cgrp_subsys = {
        .css_alloc      = rdmacg_css_alloc,
        .css_free       = rdmacg_css_free,
        .css_offline    = rdmacg_css_offline,
        .legacy_cftypes = rdmacg_files,
        .dfl_cftypes    = rdmacg_files,
};
 611