linux/arch/x86/kernel/cpu/resctrl/rdtgroup.c
   1/*
   2 * User interface for Resource Allocation in Resource Director Technology (RDT)
   3 *
   4 * Copyright (C) 2016 Intel Corporation
   5 *
   6 * Author: Fenghua Yu <fenghua.yu@intel.com>
   7 *
   8 * This program is free software; you can redistribute it and/or modify it
   9 * under the terms and conditions of the GNU General Public License,
  10 * version 2, as published by the Free Software Foundation.
  11 *
  12 * This program is distributed in the hope it will be useful, but WITHOUT
  13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  15 * more details.
  16 *
   17 * More information about RDT can be found in the Intel (R) x86 Architecture
  18 * Software Developer Manual.
  19 */
  20
  21#define pr_fmt(fmt)     KBUILD_MODNAME ": " fmt
  22
  23#include <linux/cacheinfo.h>
  24#include <linux/cpu.h>
  25#include <linux/debugfs.h>
  26#include <linux/fs.h>
  27#include <linux/fs_parser.h>
  28#include <linux/sysfs.h>
  29#include <linux/kernfs.h>
  30#include <linux/seq_buf.h>
  31#include <linux/seq_file.h>
  32#include <linux/sched/signal.h>
  33#include <linux/sched/task.h>
  34#include <linux/slab.h>
  35#include <linux/task_work.h>
  36#include <linux/user_namespace.h>
  37
  38#include <uapi/linux/magic.h>
  39
  40#include <asm/resctrl.h>
  41#include "internal.h"
  42
  43DEFINE_STATIC_KEY_FALSE(rdt_enable_key);
  44DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key);
  45DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
  46static struct kernfs_root *rdt_root;
  47struct rdtgroup rdtgroup_default;
  48LIST_HEAD(rdt_all_groups);
  49
  50/* Kernel fs node for "info" directory under root */
  51static struct kernfs_node *kn_info;
  52
  53/* Kernel fs node for "mon_groups" directory under root */
  54static struct kernfs_node *kn_mongrp;
  55
  56/* Kernel fs node for "mon_data" directory under root */
  57static struct kernfs_node *kn_mondata;
  58
  59static struct seq_buf last_cmd_status;
  60static char last_cmd_status_buf[512];
  61
  62struct dentry *debugfs_resctrl;
  63
  64void rdt_last_cmd_clear(void)
  65{
  66        lockdep_assert_held(&rdtgroup_mutex);
  67        seq_buf_clear(&last_cmd_status);
  68}
  69
  70void rdt_last_cmd_puts(const char *s)
  71{
  72        lockdep_assert_held(&rdtgroup_mutex);
  73        seq_buf_puts(&last_cmd_status, s);
  74}
  75
  76void rdt_last_cmd_printf(const char *fmt, ...)
  77{
  78        va_list ap;
  79
  80        va_start(ap, fmt);
  81        lockdep_assert_held(&rdtgroup_mutex);
  82        seq_buf_vprintf(&last_cmd_status, fmt, ap);
  83        va_end(ap);
  84}
  85
  86/*
  87 * Trivial allocator for CLOSIDs. Since h/w only supports a small number,
  88 * we can keep a bitmap of free CLOSIDs in a single integer.
  89 *
  90 * Using a global CLOSID across all resources has some advantages and
  91 * some drawbacks:
  92 * + We can simply set "current->closid" to assign a task to a resource
  93 *   group.
  94 * + Context switch code can avoid extra memory references deciding which
  95 *   CLOSID to load into the PQR_ASSOC MSR
  96 * - We give up some options in configuring resource groups across multi-socket
  97 *   systems.
  98 * - Our choices on how to configure each resource become progressively more
  99 *   limited as the number of resources grows.
 100 */
 101static int closid_free_map;
 102static int closid_free_map_len;
 103
 104int closids_supported(void)
 105{
 106        return closid_free_map_len;
 107}
 108
 109static void closid_init(void)
 110{
 111        struct rdt_resource *r;
 112        int rdt_min_closid = 32;
 113
 114        /* Compute rdt_min_closid across all resources */
 115        for_each_alloc_enabled_rdt_resource(r)
 116                rdt_min_closid = min(rdt_min_closid, r->num_closid);
 117
 118        closid_free_map = BIT_MASK(rdt_min_closid) - 1;
 119
 120        /* CLOSID 0 is always reserved for the default group */
 121        closid_free_map &= ~1;
 122        closid_free_map_len = rdt_min_closid;
 123}
 124
 125static int closid_alloc(void)
 126{
 127        u32 closid = ffs(closid_free_map);
 128
 129        if (closid == 0)
 130                return -ENOSPC;
 131        closid--;
 132        closid_free_map &= ~(1 << closid);
 133
 134        return closid;
 135}
 136
 137void closid_free(int closid)
 138{
 139        closid_free_map |= 1 << closid;
 140}
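/*
 * Worked example of the allocator above, assuming the smallest num_closid
 * across all alloc-enabled resources is 4: closid_init() computes
 * closid_free_map = 0b1111 and then clears bit 0, leaving 0b1110 (CLOSID 0
 * is kept for the default group). closid_alloc() then returns
 * ffs(0b1110) - 1 = 1 and clears that bit, leaving 0b1100; closid_free(1)
 * sets the bit again.
 */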
 141
 142/**
 143 * closid_allocated - test if provided closid is in use
 144 * @closid: closid to be tested
 145 *
 146 * Return: true if @closid is currently associated with a resource group,
 147 * false if @closid is free
 148 */
 149static bool closid_allocated(unsigned int closid)
 150{
 151        return (closid_free_map & (1 << closid)) == 0;
 152}
 153
 154/**
 155 * rdtgroup_mode_by_closid - Return mode of resource group with closid
  156 * @closid: closid of the resource group
 157 *
 158 * Each resource group is associated with a @closid. Here the mode
 159 * of a resource group can be queried by searching for it using its closid.
 160 *
 161 * Return: mode as &enum rdtgrp_mode of resource group with closid @closid
 162 */
 163enum rdtgrp_mode rdtgroup_mode_by_closid(int closid)
 164{
 165        struct rdtgroup *rdtgrp;
 166
 167        list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
 168                if (rdtgrp->closid == closid)
 169                        return rdtgrp->mode;
 170        }
 171
 172        return RDT_NUM_MODES;
 173}
 174
 175static const char * const rdt_mode_str[] = {
 176        [RDT_MODE_SHAREABLE]            = "shareable",
 177        [RDT_MODE_EXCLUSIVE]            = "exclusive",
 178        [RDT_MODE_PSEUDO_LOCKSETUP]     = "pseudo-locksetup",
 179        [RDT_MODE_PSEUDO_LOCKED]        = "pseudo-locked",
 180};
 181
 182/**
 183 * rdtgroup_mode_str - Return the string representation of mode
 184 * @mode: the resource group mode as &enum rdtgroup_mode
 185 *
 186 * Return: string representation of valid mode, "unknown" otherwise
 187 */
 188static const char *rdtgroup_mode_str(enum rdtgrp_mode mode)
 189{
 190        if (mode < RDT_MODE_SHAREABLE || mode >= RDT_NUM_MODES)
 191                return "unknown";
 192
 193        return rdt_mode_str[mode];
 194}
 195
 196/* set uid and gid of rdtgroup dirs and files to that of the creator */
 197static int rdtgroup_kn_set_ugid(struct kernfs_node *kn)
 198{
 199        struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
 200                                .ia_uid = current_fsuid(),
 201                                .ia_gid = current_fsgid(), };
 202
 203        if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
 204            gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
 205                return 0;
 206
 207        return kernfs_setattr(kn, &iattr);
 208}
 209
 210static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft)
 211{
 212        struct kernfs_node *kn;
 213        int ret;
 214
 215        kn = __kernfs_create_file(parent_kn, rft->name, rft->mode,
 216                                  GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
 217                                  0, rft->kf_ops, rft, NULL, NULL);
 218        if (IS_ERR(kn))
 219                return PTR_ERR(kn);
 220
 221        ret = rdtgroup_kn_set_ugid(kn);
 222        if (ret) {
 223                kernfs_remove(kn);
 224                return ret;
 225        }
 226
 227        return 0;
 228}
 229
 230static int rdtgroup_seqfile_show(struct seq_file *m, void *arg)
 231{
 232        struct kernfs_open_file *of = m->private;
 233        struct rftype *rft = of->kn->priv;
 234
 235        if (rft->seq_show)
 236                return rft->seq_show(of, m, arg);
 237        return 0;
 238}
 239
 240static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf,
 241                                   size_t nbytes, loff_t off)
 242{
 243        struct rftype *rft = of->kn->priv;
 244
 245        if (rft->write)
 246                return rft->write(of, buf, nbytes, off);
 247
 248        return -EINVAL;
 249}
 250
 251static struct kernfs_ops rdtgroup_kf_single_ops = {
 252        .atomic_write_len       = PAGE_SIZE,
 253        .write                  = rdtgroup_file_write,
 254        .seq_show               = rdtgroup_seqfile_show,
 255};
 256
 257static struct kernfs_ops kf_mondata_ops = {
 258        .atomic_write_len       = PAGE_SIZE,
 259        .seq_show               = rdtgroup_mondata_show,
 260};
 261
 262static bool is_cpu_list(struct kernfs_open_file *of)
 263{
 264        struct rftype *rft = of->kn->priv;
 265
 266        return rft->flags & RFTYPE_FLAGS_CPUS_LIST;
 267}
 268
 269static int rdtgroup_cpus_show(struct kernfs_open_file *of,
 270                              struct seq_file *s, void *v)
 271{
 272        struct rdtgroup *rdtgrp;
 273        struct cpumask *mask;
 274        int ret = 0;
 275
 276        rdtgrp = rdtgroup_kn_lock_live(of->kn);
 277
 278        if (rdtgrp) {
 279                if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
 280                        if (!rdtgrp->plr->d) {
 281                                rdt_last_cmd_clear();
 282                                rdt_last_cmd_puts("Cache domain offline\n");
 283                                ret = -ENODEV;
 284                        } else {
 285                                mask = &rdtgrp->plr->d->cpu_mask;
 286                                seq_printf(s, is_cpu_list(of) ?
 287                                           "%*pbl\n" : "%*pb\n",
 288                                           cpumask_pr_args(mask));
 289                        }
 290                } else {
 291                        seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n",
 292                                   cpumask_pr_args(&rdtgrp->cpu_mask));
 293                }
 294        } else {
 295                ret = -ENOENT;
 296        }
 297        rdtgroup_kn_unlock(of->kn);
 298
 299        return ret;
 300}
 301
 302/*
 303 * This is safe against resctrl_sched_in() called from __switch_to()
 304 * because __switch_to() is executed with interrupts disabled. A local call
  305 * from update_closid_rmid() is protected against __switch_to() because
 306 * preemption is disabled.
 307 */
 308static void update_cpu_closid_rmid(void *info)
 309{
 310        struct rdtgroup *r = info;
 311
 312        if (r) {
 313                this_cpu_write(pqr_state.default_closid, r->closid);
 314                this_cpu_write(pqr_state.default_rmid, r->mon.rmid);
 315        }
 316
 317        /*
  318         * We cannot unconditionally write the MSR because the currently
 319         * executing task might have its own closid selected. Just reuse
 320         * the context switch code.
 321         */
 322        resctrl_sched_in();
 323}
 324
 325/*
  326 * Update the PQR_ASSOC MSR on all cpus in @cpu_mask.
 327 *
 328 * Per task closids/rmids must have been set up before calling this function.
 329 */
 330static void
 331update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r)
 332{
 333        int cpu = get_cpu();
 334
 335        if (cpumask_test_cpu(cpu, cpu_mask))
 336                update_cpu_closid_rmid(r);
 337        smp_call_function_many(cpu_mask, update_cpu_closid_rmid, r, 1);
 338        put_cpu();
 339}
 340
 341static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
 342                          cpumask_var_t tmpmask)
 343{
 344        struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp;
 345        struct list_head *head;
 346
 347        /* Check whether cpus belong to parent ctrl group */
 348        cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask);
 349        if (cpumask_weight(tmpmask)) {
 350                rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to parent\n");
 351                return -EINVAL;
 352        }
 353
 354        /* Check whether cpus are dropped from this group */
 355        cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
 356        if (cpumask_weight(tmpmask)) {
 357                /* Give any dropped cpus to parent rdtgroup */
 358                cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask);
 359                update_closid_rmid(tmpmask, prgrp);
 360        }
 361
 362        /*
  363         * If we added cpus, remove them from the previous group that owned
  364         * them and update the per-cpu rmid.
 365         */
 366        cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
 367        if (cpumask_weight(tmpmask)) {
 368                head = &prgrp->mon.crdtgrp_list;
 369                list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
 370                        if (crgrp == rdtgrp)
 371                                continue;
 372                        cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask,
 373                                       tmpmask);
 374                }
 375                update_closid_rmid(tmpmask, rdtgrp);
 376        }
 377
 378        /* Done pushing/pulling - update this group with new mask */
 379        cpumask_copy(&rdtgrp->cpu_mask, newmask);
 380
 381        return 0;
 382}
 383
 384static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m)
 385{
 386        struct rdtgroup *crgrp;
 387
 388        cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m);
  389        /* Update the child mon group masks as well */
 390        list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list)
 391                cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask);
 392}
 393
 394static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
 395                           cpumask_var_t tmpmask, cpumask_var_t tmpmask1)
 396{
 397        struct rdtgroup *r, *crgrp;
 398        struct list_head *head;
 399
 400        /* Check whether cpus are dropped from this group */
 401        cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
 402        if (cpumask_weight(tmpmask)) {
 403                /* Can't drop from default group */
 404                if (rdtgrp == &rdtgroup_default) {
 405                        rdt_last_cmd_puts("Can't drop CPUs from default group\n");
 406                        return -EINVAL;
 407                }
 408
 409                /* Give any dropped cpus to rdtgroup_default */
 410                cpumask_or(&rdtgroup_default.cpu_mask,
 411                           &rdtgroup_default.cpu_mask, tmpmask);
 412                update_closid_rmid(tmpmask, &rdtgroup_default);
 413        }
 414
 415        /*
  416         * If we added cpus, remove them from the previous group and that
  417         * group's child groups that owned them, and update the per-cpu
  418         * closid/rmid.
 419         */
 420        cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
 421        if (cpumask_weight(tmpmask)) {
 422                list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) {
 423                        if (r == rdtgrp)
 424                                continue;
 425                        cpumask_and(tmpmask1, &r->cpu_mask, tmpmask);
 426                        if (cpumask_weight(tmpmask1))
 427                                cpumask_rdtgrp_clear(r, tmpmask1);
 428                }
 429                update_closid_rmid(tmpmask, rdtgrp);
 430        }
 431
 432        /* Done pushing/pulling - update this group with new mask */
 433        cpumask_copy(&rdtgrp->cpu_mask, newmask);
 434
 435        /*
 436         * Clear child mon group masks since there is a new parent mask
 437         * now and update the rmid for the cpus the child lost.
 438         */
 439        head = &rdtgrp->mon.crdtgrp_list;
 440        list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
 441                cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask);
 442                update_closid_rmid(tmpmask, rdtgrp);
 443                cpumask_clear(&crgrp->cpu_mask);
 444        }
 445
 446        return 0;
 447}
 448
 449static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
 450                                   char *buf, size_t nbytes, loff_t off)
 451{
 452        cpumask_var_t tmpmask, newmask, tmpmask1;
 453        struct rdtgroup *rdtgrp;
 454        int ret;
 455
 456        if (!buf)
 457                return -EINVAL;
 458
 459        if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
 460                return -ENOMEM;
 461        if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) {
 462                free_cpumask_var(tmpmask);
 463                return -ENOMEM;
 464        }
 465        if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) {
 466                free_cpumask_var(tmpmask);
 467                free_cpumask_var(newmask);
 468                return -ENOMEM;
 469        }
 470
 471        rdtgrp = rdtgroup_kn_lock_live(of->kn);
 472        if (!rdtgrp) {
 473                ret = -ENOENT;
 474                goto unlock;
 475        }
 476
 477        if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
 478            rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
 479                ret = -EINVAL;
 480                rdt_last_cmd_puts("Pseudo-locking in progress\n");
 481                goto unlock;
 482        }
 483
 484        if (is_cpu_list(of))
 485                ret = cpulist_parse(buf, newmask);
 486        else
 487                ret = cpumask_parse(buf, newmask);
 488
 489        if (ret) {
 490                rdt_last_cmd_puts("Bad CPU list/mask\n");
 491                goto unlock;
 492        }
 493
 494        /* check that user didn't specify any offline cpus */
 495        cpumask_andnot(tmpmask, newmask, cpu_online_mask);
 496        if (cpumask_weight(tmpmask)) {
 497                ret = -EINVAL;
 498                rdt_last_cmd_puts("Can only assign online CPUs\n");
 499                goto unlock;
 500        }
 501
 502        if (rdtgrp->type == RDTCTRL_GROUP)
 503                ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1);
 504        else if (rdtgrp->type == RDTMON_GROUP)
 505                ret = cpus_mon_write(rdtgrp, newmask, tmpmask);
 506        else
 507                ret = -EINVAL;
 508
 509unlock:
 510        rdtgroup_kn_unlock(of->kn);
 511        free_cpumask_var(tmpmask);
 512        free_cpumask_var(newmask);
 513        free_cpumask_var(tmpmask1);
 514
 515        return ret ?: nbytes;
 516}
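/*
 * Illustrative use from user space, assuming resctrl is mounted at
 * /sys/fs/resctrl and a control group "grp0" exists. "cpus_list" accepts
 * a CPU range list, "cpus" a hexadecimal mask:
 *
 *   echo 0-3 > /sys/fs/resctrl/grp0/cpus_list
 *   echo f > /sys/fs/resctrl/grp0/cpus
 */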
 517
 518struct task_move_callback {
 519        struct callback_head    work;
 520        struct rdtgroup         *rdtgrp;
 521};
 522
 523static void move_myself(struct callback_head *head)
 524{
 525        struct task_move_callback *callback;
 526        struct rdtgroup *rdtgrp;
 527
 528        callback = container_of(head, struct task_move_callback, work);
 529        rdtgrp = callback->rdtgrp;
 530
 531        /*
  532         * If the resource group was deleted before this task work callback
  533         * was invoked, then assign the task to the root group and free the
  534         * resource group.
 535         */
 536        if (atomic_dec_and_test(&rdtgrp->waitcount) &&
 537            (rdtgrp->flags & RDT_DELETED)) {
 538                current->closid = 0;
 539                current->rmid = 0;
 540                kfree(rdtgrp);
 541        }
 542
 543        if (unlikely(current->flags & PF_EXITING))
 544                goto out;
 545
 546        preempt_disable();
 547        /* update PQR_ASSOC MSR to make resource group go into effect */
 548        resctrl_sched_in();
 549        preempt_enable();
 550
 551out:
 552        kfree(callback);
 553}
 554
 555static int __rdtgroup_move_task(struct task_struct *tsk,
 556                                struct rdtgroup *rdtgrp)
 557{
 558        struct task_move_callback *callback;
 559        int ret;
 560
 561        callback = kzalloc(sizeof(*callback), GFP_KERNEL);
 562        if (!callback)
 563                return -ENOMEM;
 564        callback->work.func = move_myself;
 565        callback->rdtgrp = rdtgrp;
 566
 567        /*
 568         * Take a refcount, so rdtgrp cannot be freed before the
 569         * callback has been invoked.
 570         */
 571        atomic_inc(&rdtgrp->waitcount);
 572        ret = task_work_add(tsk, &callback->work, true);
 573        if (ret) {
 574                /*
 575                 * Task is exiting. Drop the refcount and free the callback.
 576                 * No need to check the refcount as the group cannot be
 577                 * deleted before the write function unlocks rdtgroup_mutex.
 578                 */
 579                atomic_dec(&rdtgrp->waitcount);
 580                kfree(callback);
 581                rdt_last_cmd_puts("Task exited\n");
 582        } else {
 583                /*
  584                 * For ctrl_mon groups, move both the closid and rmid.
  585                 * For monitor groups, tasks can only be moved from
  586                 * their parent CTRL group.
 587                 */
 588                if (rdtgrp->type == RDTCTRL_GROUP) {
 589                        tsk->closid = rdtgrp->closid;
 590                        tsk->rmid = rdtgrp->mon.rmid;
 591                } else if (rdtgrp->type == RDTMON_GROUP) {
 592                        if (rdtgrp->mon.parent->closid == tsk->closid) {
 593                                tsk->rmid = rdtgrp->mon.rmid;
 594                        } else {
 595                                rdt_last_cmd_puts("Can't move task to different control group\n");
 596                                ret = -EINVAL;
 597                        }
 598                }
 599        }
 600        return ret;
 601}
 602
 603static bool is_closid_match(struct task_struct *t, struct rdtgroup *r)
 604{
 605        return (rdt_alloc_capable &&
 606               (r->type == RDTCTRL_GROUP) && (t->closid == r->closid));
 607}
 608
 609static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r)
 610{
 611        return (rdt_mon_capable &&
 612               (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid));
 613}
 614
 615/**
 616 * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group
 617 * @r: Resource group
 618 *
 619 * Return: 1 if tasks have been assigned to @r, 0 otherwise
 620 */
 621int rdtgroup_tasks_assigned(struct rdtgroup *r)
 622{
 623        struct task_struct *p, *t;
 624        int ret = 0;
 625
 626        lockdep_assert_held(&rdtgroup_mutex);
 627
 628        rcu_read_lock();
 629        for_each_process_thread(p, t) {
 630                if (is_closid_match(t, r) || is_rmid_match(t, r)) {
 631                        ret = 1;
 632                        break;
 633                }
 634        }
 635        rcu_read_unlock();
 636
 637        return ret;
 638}
 639
 640static int rdtgroup_task_write_permission(struct task_struct *task,
 641                                          struct kernfs_open_file *of)
 642{
 643        const struct cred *tcred = get_task_cred(task);
 644        const struct cred *cred = current_cred();
 645        int ret = 0;
 646
 647        /*
 648         * Even if we're attaching all tasks in the thread group, we only
 649         * need to check permissions on one of them.
 650         */
 651        if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
 652            !uid_eq(cred->euid, tcred->uid) &&
 653            !uid_eq(cred->euid, tcred->suid)) {
 654                rdt_last_cmd_printf("No permission to move task %d\n", task->pid);
 655                ret = -EPERM;
 656        }
 657
 658        put_cred(tcred);
 659        return ret;
 660}
 661
 662static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp,
 663                              struct kernfs_open_file *of)
 664{
 665        struct task_struct *tsk;
 666        int ret;
 667
 668        rcu_read_lock();
 669        if (pid) {
 670                tsk = find_task_by_vpid(pid);
 671                if (!tsk) {
 672                        rcu_read_unlock();
 673                        rdt_last_cmd_printf("No task %d\n", pid);
 674                        return -ESRCH;
 675                }
 676        } else {
 677                tsk = current;
 678        }
 679
 680        get_task_struct(tsk);
 681        rcu_read_unlock();
 682
 683        ret = rdtgroup_task_write_permission(tsk, of);
 684        if (!ret)
 685                ret = __rdtgroup_move_task(tsk, rdtgrp);
 686
 687        put_task_struct(tsk);
 688        return ret;
 689}
 690
 691static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of,
 692                                    char *buf, size_t nbytes, loff_t off)
 693{
 694        struct rdtgroup *rdtgrp;
 695        int ret = 0;
 696        pid_t pid;
 697
 698        if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
 699                return -EINVAL;
 700        rdtgrp = rdtgroup_kn_lock_live(of->kn);
 701        if (!rdtgrp) {
 702                rdtgroup_kn_unlock(of->kn);
 703                return -ENOENT;
 704        }
 705        rdt_last_cmd_clear();
 706
 707        if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
 708            rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
 709                ret = -EINVAL;
 710                rdt_last_cmd_puts("Pseudo-locking in progress\n");
 711                goto unlock;
 712        }
 713
 714        ret = rdtgroup_move_task(pid, rdtgrp, of);
 715
 716unlock:
 717        rdtgroup_kn_unlock(of->kn);
 718
 719        return ret ?: nbytes;
 720}
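/*
 * Illustrative use from user space, assuming a group "grp0" under a
 * mounted resctrl filesystem. One PID is accepted per write; writing "0"
 * moves the writing task itself:
 *
 *   echo 1234 > /sys/fs/resctrl/grp0/tasks
 */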
 721
 722static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s)
 723{
 724        struct task_struct *p, *t;
 725
 726        rcu_read_lock();
 727        for_each_process_thread(p, t) {
 728                if (is_closid_match(t, r) || is_rmid_match(t, r))
 729                        seq_printf(s, "%d\n", t->pid);
 730        }
 731        rcu_read_unlock();
 732}
 733
 734static int rdtgroup_tasks_show(struct kernfs_open_file *of,
 735                               struct seq_file *s, void *v)
 736{
 737        struct rdtgroup *rdtgrp;
 738        int ret = 0;
 739
 740        rdtgrp = rdtgroup_kn_lock_live(of->kn);
 741        if (rdtgrp)
 742                show_rdt_tasks(rdtgrp, s);
 743        else
 744                ret = -ENOENT;
 745        rdtgroup_kn_unlock(of->kn);
 746
 747        return ret;
 748}
 749
 750#ifdef CONFIG_PROC_CPU_RESCTRL
 751
 752/*
 753 * A task can only be part of one resctrl control group and of one monitor
  754 * group that is associated with that control group.
 755 *
 756 * 1)   res:
 757 *      mon:
 758 *
 759 *    resctrl is not available.
 760 *
 761 * 2)   res:/
 762 *      mon:
 763 *
 764 *    Task is part of the root resctrl control group, and it is not associated
  765 *    with any monitor group.
 766 *
 767 * 3)  res:/
 768 *     mon:mon0
 769 *
 770 *    Task is part of the root resctrl control group and monitor group mon0.
 771 *
 772 * 4)  res:group0
 773 *     mon:
 774 *
 775 *    Task is part of resctrl control group group0, and it is not associated
  776 *    with any monitor group.
 777 *
 778 * 5) res:group0
 779 *    mon:mon1
 780 *
 781 *    Task is part of resctrl control group group0 and monitor group mon1.
 782 */
 783int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns,
 784                      struct pid *pid, struct task_struct *tsk)
 785{
 786        struct rdtgroup *rdtg;
 787        int ret = 0;
 788
 789        mutex_lock(&rdtgroup_mutex);
 790
 791        /* Return empty if resctrl has not been mounted. */
 792        if (!static_branch_unlikely(&rdt_enable_key)) {
 793                seq_puts(s, "res:\nmon:\n");
 794                goto unlock;
 795        }
 796
 797        list_for_each_entry(rdtg, &rdt_all_groups, rdtgroup_list) {
 798                struct rdtgroup *crg;
 799
 800                /*
 801                 * Task information is only relevant for shareable
 802                 * and exclusive groups.
 803                 */
 804                if (rdtg->mode != RDT_MODE_SHAREABLE &&
 805                    rdtg->mode != RDT_MODE_EXCLUSIVE)
 806                        continue;
 807
 808                if (rdtg->closid != tsk->closid)
 809                        continue;
 810
 811                seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "",
 812                           rdtg->kn->name);
 813                seq_puts(s, "mon:");
 814                list_for_each_entry(crg, &rdtg->mon.crdtgrp_list,
 815                                    mon.crdtgrp_list) {
 816                        if (tsk->rmid != crg->mon.rmid)
 817                                continue;
 818                        seq_printf(s, "%s", crg->kn->name);
 819                        break;
 820                }
 821                seq_putc(s, '\n');
 822                goto unlock;
 823        }
 824        /*
 825         * The above search should succeed. Otherwise return
 826         * with an error.
 827         */
 828        ret = -ENOENT;
 829unlock:
 830        mutex_unlock(&rdtgroup_mutex);
 831
 832        return ret;
 833}
 834#endif
 835
 836static int rdt_last_cmd_status_show(struct kernfs_open_file *of,
 837                                    struct seq_file *seq, void *v)
 838{
 839        int len;
 840
 841        mutex_lock(&rdtgroup_mutex);
 842        len = seq_buf_used(&last_cmd_status);
 843        if (len)
 844                seq_printf(seq, "%.*s", len, last_cmd_status_buf);
 845        else
 846                seq_puts(seq, "ok\n");
 847        mutex_unlock(&rdtgroup_mutex);
 848        return 0;
 849}
 850
 851static int rdt_num_closids_show(struct kernfs_open_file *of,
 852                                struct seq_file *seq, void *v)
 853{
 854        struct rdt_resource *r = of->kn->parent->priv;
 855
 856        seq_printf(seq, "%d\n", r->num_closid);
 857        return 0;
 858}
 859
 860static int rdt_default_ctrl_show(struct kernfs_open_file *of,
 861                             struct seq_file *seq, void *v)
 862{
 863        struct rdt_resource *r = of->kn->parent->priv;
 864
 865        seq_printf(seq, "%x\n", r->default_ctrl);
 866        return 0;
 867}
 868
 869static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
 870                             struct seq_file *seq, void *v)
 871{
 872        struct rdt_resource *r = of->kn->parent->priv;
 873
 874        seq_printf(seq, "%u\n", r->cache.min_cbm_bits);
 875        return 0;
 876}
 877
 878static int rdt_shareable_bits_show(struct kernfs_open_file *of,
 879                                   struct seq_file *seq, void *v)
 880{
 881        struct rdt_resource *r = of->kn->parent->priv;
 882
 883        seq_printf(seq, "%x\n", r->cache.shareable_bits);
 884        return 0;
 885}
 886
 887/**
 888 * rdt_bit_usage_show - Display current usage of resources
 889 *
 890 * A domain is a shared resource that can now be allocated differently. Here
 891 * we display the current regions of the domain as an annotated bitmask.
 892 * For each domain of this resource its allocation bitmask
 893 * is annotated as below to indicate the current usage of the corresponding bit:
 894 *   0 - currently unused
 895 *   X - currently available for sharing and used by software and hardware
 896 *   H - currently used by hardware only but available for software use
 897 *   S - currently used and shareable by software only
 898 *   E - currently used exclusively by one resource group
 899 *   P - currently pseudo-locked by one resource group
 900 */
 901static int rdt_bit_usage_show(struct kernfs_open_file *of,
 902                              struct seq_file *seq, void *v)
 903{
 904        struct rdt_resource *r = of->kn->parent->priv;
 905        /*
 906         * Use unsigned long even though only 32 bits are used to ensure
 907         * test_bit() is used safely.
 908         */
 909        unsigned long sw_shareable = 0, hw_shareable = 0;
 910        unsigned long exclusive = 0, pseudo_locked = 0;
 911        struct rdt_domain *dom;
 912        int i, hwb, swb, excl, psl;
 913        enum rdtgrp_mode mode;
 914        bool sep = false;
 915        u32 *ctrl;
 916
 917        mutex_lock(&rdtgroup_mutex);
 918        hw_shareable = r->cache.shareable_bits;
 919        list_for_each_entry(dom, &r->domains, list) {
 920                if (sep)
 921                        seq_putc(seq, ';');
 922                ctrl = dom->ctrl_val;
 923                sw_shareable = 0;
 924                exclusive = 0;
 925                seq_printf(seq, "%d=", dom->id);
 926                for (i = 0; i < closids_supported(); i++, ctrl++) {
 927                        if (!closid_allocated(i))
 928                                continue;
 929                        mode = rdtgroup_mode_by_closid(i);
 930                        switch (mode) {
 931                        case RDT_MODE_SHAREABLE:
 932                                sw_shareable |= *ctrl;
 933                                break;
 934                        case RDT_MODE_EXCLUSIVE:
 935                                exclusive |= *ctrl;
 936                                break;
 937                        case RDT_MODE_PSEUDO_LOCKSETUP:
  938                                /*
  939                                 * RDT_MODE_PSEUDO_LOCKSETUP is possible
  940                                 * here but not included since the CBM
  941                                 * associated with this CLOSID in this mode
  942                                 * is not initialized and no task or cpu can be
  943                                 * assigned this CLOSID.
  944                                 */
 945                                break;
 946                        case RDT_MODE_PSEUDO_LOCKED:
 947                        case RDT_NUM_MODES:
 948                                WARN(1,
 949                                     "invalid mode for closid %d\n", i);
 950                                break;
 951                        }
 952                }
 953                for (i = r->cache.cbm_len - 1; i >= 0; i--) {
 954                        pseudo_locked = dom->plr ? dom->plr->cbm : 0;
 955                        hwb = test_bit(i, &hw_shareable);
 956                        swb = test_bit(i, &sw_shareable);
 957                        excl = test_bit(i, &exclusive);
 958                        psl = test_bit(i, &pseudo_locked);
 959                        if (hwb && swb)
 960                                seq_putc(seq, 'X');
 961                        else if (hwb && !swb)
 962                                seq_putc(seq, 'H');
 963                        else if (!hwb && swb)
 964                                seq_putc(seq, 'S');
 965                        else if (excl)
 966                                seq_putc(seq, 'E');
 967                        else if (psl)
 968                                seq_putc(seq, 'P');
 969                        else /* Unused bits remain */
 970                                seq_putc(seq, '0');
 971                }
 972                sep = true;
 973        }
 974        seq_putc(seq, '\n');
 975        mutex_unlock(&rdtgroup_mutex);
 976        return 0;
 977}
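/*
 * Purely illustrative example of the resulting output, assuming a 16 bit
 * CBM, two cache domains, no hardware-shareable bits, the default
 * (shareable) group owning the upper 11 bits of domain 0, an exclusive
 * group owning its low 5 bits, and domain 1 fully shareable:
 *
 *   0=SSSSSSSSSSSEEEEE;1=SSSSSSSSSSSSSSSS
 */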
 978
 979static int rdt_min_bw_show(struct kernfs_open_file *of,
 980                             struct seq_file *seq, void *v)
 981{
 982        struct rdt_resource *r = of->kn->parent->priv;
 983
 984        seq_printf(seq, "%u\n", r->membw.min_bw);
 985        return 0;
 986}
 987
 988static int rdt_num_rmids_show(struct kernfs_open_file *of,
 989                              struct seq_file *seq, void *v)
 990{
 991        struct rdt_resource *r = of->kn->parent->priv;
 992
 993        seq_printf(seq, "%d\n", r->num_rmid);
 994
 995        return 0;
 996}
 997
 998static int rdt_mon_features_show(struct kernfs_open_file *of,
 999                                 struct seq_file *seq, void *v)
1000{
1001        struct rdt_resource *r = of->kn->parent->priv;
1002        struct mon_evt *mevt;
1003
1004        list_for_each_entry(mevt, &r->evt_list, list)
1005                seq_printf(seq, "%s\n", mevt->name);
1006
1007        return 0;
1008}
1009
1010static int rdt_bw_gran_show(struct kernfs_open_file *of,
1011                             struct seq_file *seq, void *v)
1012{
1013        struct rdt_resource *r = of->kn->parent->priv;
1014
1015        seq_printf(seq, "%u\n", r->membw.bw_gran);
1016        return 0;
1017}
1018
1019static int rdt_delay_linear_show(struct kernfs_open_file *of,
1020                             struct seq_file *seq, void *v)
1021{
1022        struct rdt_resource *r = of->kn->parent->priv;
1023
1024        seq_printf(seq, "%u\n", r->membw.delay_linear);
1025        return 0;
1026}
1027
1028static int max_threshold_occ_show(struct kernfs_open_file *of,
1029                                  struct seq_file *seq, void *v)
1030{
1031        struct rdt_resource *r = of->kn->parent->priv;
1032
1033        seq_printf(seq, "%u\n", resctrl_cqm_threshold * r->mon_scale);
1034
1035        return 0;
1036}
1037
1038static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of,
1039                                         struct seq_file *seq, void *v)
1040{
1041        struct rdt_resource *r = of->kn->parent->priv;
1042
1043        if (r->membw.throttle_mode == THREAD_THROTTLE_PER_THREAD)
1044                seq_puts(seq, "per-thread\n");
1045        else
1046                seq_puts(seq, "max\n");
1047
1048        return 0;
1049}
1050
1051static ssize_t max_threshold_occ_write(struct kernfs_open_file *of,
1052                                       char *buf, size_t nbytes, loff_t off)
1053{
1054        struct rdt_resource *r = of->kn->parent->priv;
1055        unsigned int bytes;
1056        int ret;
1057
1058        ret = kstrtouint(buf, 0, &bytes);
1059        if (ret)
1060                return ret;
1061
1062        if (bytes > (boot_cpu_data.x86_cache_size * 1024))
1063                return -EINVAL;
1064
1065        resctrl_cqm_threshold = bytes / r->mon_scale;
1066
1067        return nbytes;
1068}
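/*
 * For example, with a mon_scale of 64 (a typical counter upscale factor),
 * writing 131072 to max_threshold_occupancy stores 131072 / 64 = 2048 in
 * resctrl_cqm_threshold, and max_threshold_occ_show() reports it back as
 * 2048 * 64 = 131072 bytes.
 */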
1069
1070/*
1071 * rdtgroup_mode_show - Display mode of this resource group
1072 */
1073static int rdtgroup_mode_show(struct kernfs_open_file *of,
1074                              struct seq_file *s, void *v)
1075{
1076        struct rdtgroup *rdtgrp;
1077
1078        rdtgrp = rdtgroup_kn_lock_live(of->kn);
1079        if (!rdtgrp) {
1080                rdtgroup_kn_unlock(of->kn);
1081                return -ENOENT;
1082        }
1083
1084        seq_printf(s, "%s\n", rdtgroup_mode_str(rdtgrp->mode));
1085
1086        rdtgroup_kn_unlock(of->kn);
1087        return 0;
1088}
1089
1090/**
1091 * rdt_cdp_peer_get - Retrieve CDP peer if it exists
1092 * @r: RDT resource to which RDT domain @d belongs
1093 * @d: Cache instance for which a CDP peer is requested
1094 * @r_cdp: RDT resource that shares hardware with @r (RDT resource peer)
1095 *         Used to return the result.
1096 * @d_cdp: RDT domain that shares hardware with @d (RDT domain peer)
1097 *         Used to return the result.
1098 *
1099 * RDT resources are managed independently and by extension the RDT domains
1100 * (RDT resource instances) are managed independently also. The Code and
1101 * Data Prioritization (CDP) RDT resources, while managed independently,
1102 * could refer to the same underlying hardware. For example,
1103 * RDT_RESOURCE_L2CODE and RDT_RESOURCE_L2DATA both refer to the L2 cache.
1104 *
 1105 * When provided with an RDT resource @r and an instance @d of that
 1106 * resource, rdt_cdp_peer_get() determines whether there is a peer RDT
 1107 * resource and, if so, the exact peer instance that shares the same hardware.
1108 *
1109 * Return: 0 if a CDP peer was found, <0 on error or if no CDP peer exists.
1110 *         If a CDP peer was found, @r_cdp will point to the peer RDT resource
1111 *         and @d_cdp will point to the peer RDT domain.
1112 */
1113static int rdt_cdp_peer_get(struct rdt_resource *r, struct rdt_domain *d,
1114                            struct rdt_resource **r_cdp,
1115                            struct rdt_domain **d_cdp)
1116{
1117        struct rdt_resource *_r_cdp = NULL;
1118        struct rdt_domain *_d_cdp = NULL;
1119        int ret = 0;
1120
1121        switch (r->rid) {
1122        case RDT_RESOURCE_L3DATA:
1123                _r_cdp = &rdt_resources_all[RDT_RESOURCE_L3CODE];
1124                break;
1125        case RDT_RESOURCE_L3CODE:
1126                _r_cdp =  &rdt_resources_all[RDT_RESOURCE_L3DATA];
1127                break;
1128        case RDT_RESOURCE_L2DATA:
1129                _r_cdp =  &rdt_resources_all[RDT_RESOURCE_L2CODE];
1130                break;
1131        case RDT_RESOURCE_L2CODE:
1132                _r_cdp =  &rdt_resources_all[RDT_RESOURCE_L2DATA];
1133                break;
1134        default:
1135                ret = -ENOENT;
1136                goto out;
1137        }
1138
1139        /*
1140         * When a new CPU comes online and CDP is enabled then the new
1141         * RDT domains (if any) associated with both CDP RDT resources
1142         * are added in the same CPU online routine while the
1143         * rdtgroup_mutex is held. It should thus not happen for one
1144         * RDT domain to exist and be associated with its RDT CDP
1145         * resource but there is no RDT domain associated with the
1146         * peer RDT CDP resource. Hence the WARN.
1147         */
1148        _d_cdp = rdt_find_domain(_r_cdp, d->id, NULL);
1149        if (WARN_ON(IS_ERR_OR_NULL(_d_cdp))) {
1150                _r_cdp = NULL;
1151                _d_cdp = NULL;
1152                ret = -EINVAL;
1153        }
1154
1155out:
1156        *r_cdp = _r_cdp;
1157        *d_cdp = _d_cdp;
1158
1159        return ret;
1160}
1161
1162/**
1163 * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other
1164 * @r: Resource to which domain instance @d belongs.
1165 * @d: The domain instance for which @closid is being tested.
1166 * @cbm: Capacity bitmask being tested.
1167 * @closid: Intended closid for @cbm.
1168 * @exclusive: Only check if overlaps with exclusive resource groups
1169 *
1170 * Checks if provided @cbm intended to be used for @closid on domain
1171 * @d overlaps with any other closids or other hardware usage associated
1172 * with this domain. If @exclusive is true then only overlaps with
1173 * resource groups in exclusive mode will be considered. If @exclusive
1174 * is false then overlaps with any resource group or hardware entities
1175 * will be considered.
1176 *
1177 * @cbm is unsigned long, even if only 32 bits are used, to make the
1178 * bitmap functions work correctly.
1179 *
1180 * Return: false if CBM does not overlap, true if it does.
1181 */
1182static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d,
1183                                    unsigned long cbm, int closid, bool exclusive)
1184{
1185        enum rdtgrp_mode mode;
1186        unsigned long ctrl_b;
1187        u32 *ctrl;
1188        int i;
1189
1190        /* Check for any overlap with regions used by hardware directly */
1191        if (!exclusive) {
1192                ctrl_b = r->cache.shareable_bits;
1193                if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len))
1194                        return true;
1195        }
1196
1197        /* Check for overlap with other resource groups */
1198        ctrl = d->ctrl_val;
1199        for (i = 0; i < closids_supported(); i++, ctrl++) {
1200                ctrl_b = *ctrl;
1201                mode = rdtgroup_mode_by_closid(i);
1202                if (closid_allocated(i) && i != closid &&
1203                    mode != RDT_MODE_PSEUDO_LOCKSETUP) {
1204                        if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) {
1205                                if (exclusive) {
1206                                        if (mode == RDT_MODE_EXCLUSIVE)
1207                                                return true;
1208                                        continue;
1209                                }
1210                                return true;
1211                        }
1212                }
1213        }
1214
1215        return false;
1216}
1217
1218/**
1219 * rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware
1220 * @r: Resource to which domain instance @d belongs.
1221 * @d: The domain instance for which @closid is being tested.
1222 * @cbm: Capacity bitmask being tested.
1223 * @closid: Intended closid for @cbm.
1224 * @exclusive: Only check if overlaps with exclusive resource groups
1225 *
1226 * Resources that can be allocated using a CBM can use the CBM to control
 1227 * the overlap of these allocations. rdtgroup_cbm_overlaps() is the test
 1228 * for overlap. The overlap test is not limited to the specific resource for
 1229 * which the CBM is intended, though: when dealing with CDP resources that
 1230 * share the underlying hardware, the overlap check should also be performed
 1231 * on the CDP resource sharing that hardware.
1232 *
1233 * Refer to description of __rdtgroup_cbm_overlaps() for the details of the
1234 * overlap test.
1235 *
1236 * Return: true if CBM overlap detected, false if there is no overlap
1237 */
1238bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d,
1239                           unsigned long cbm, int closid, bool exclusive)
1240{
1241        struct rdt_resource *r_cdp;
1242        struct rdt_domain *d_cdp;
1243
1244        if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, exclusive))
1245                return true;
1246
1247        if (rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp) < 0)
1248                return false;
1249
1250        return  __rdtgroup_cbm_overlaps(r_cdp, d_cdp, cbm, closid, exclusive);
1251}
1252
1253/**
1254 * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive
1255 *
1256 * An exclusive resource group implies that there should be no sharing of
1257 * its allocated resources. At the time this group is considered to be
1258 * exclusive this test can determine if its current schemata supports this
1259 * setting by testing for overlap with all other resource groups.
1260 *
1261 * Return: true if resource group can be exclusive, false if there is overlap
1262 * with allocations of other resource groups and thus this resource group
1263 * cannot be exclusive.
1264 */
1265static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp)
1266{
1267        int closid = rdtgrp->closid;
1268        struct rdt_resource *r;
1269        bool has_cache = false;
1270        struct rdt_domain *d;
1271
1272        for_each_alloc_enabled_rdt_resource(r) {
1273                if (r->rid == RDT_RESOURCE_MBA)
1274                        continue;
1275                has_cache = true;
1276                list_for_each_entry(d, &r->domains, list) {
1277                        if (rdtgroup_cbm_overlaps(r, d, d->ctrl_val[closid],
1278                                                  rdtgrp->closid, false)) {
1279                                rdt_last_cmd_puts("Schemata overlaps\n");
1280                                return false;
1281                        }
1282                }
1283        }
1284
1285        if (!has_cache) {
1286                rdt_last_cmd_puts("Cannot be exclusive without CAT/CDP\n");
1287                return false;
1288        }
1289
1290        return true;
1291}
1292
1293/**
1294 * rdtgroup_mode_write - Modify the resource group's mode
1295 *
1296 */
1297static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of,
1298                                   char *buf, size_t nbytes, loff_t off)
1299{
1300        struct rdtgroup *rdtgrp;
1301        enum rdtgrp_mode mode;
1302        int ret = 0;
1303
1304        /* Valid input requires a trailing newline */
1305        if (nbytes == 0 || buf[nbytes - 1] != '\n')
1306                return -EINVAL;
1307        buf[nbytes - 1] = '\0';
1308
1309        rdtgrp = rdtgroup_kn_lock_live(of->kn);
1310        if (!rdtgrp) {
1311                rdtgroup_kn_unlock(of->kn);
1312                return -ENOENT;
1313        }
1314
1315        rdt_last_cmd_clear();
1316
1317        mode = rdtgrp->mode;
1318
1319        if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) ||
1320            (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE) ||
1321            (!strcmp(buf, "pseudo-locksetup") &&
1322             mode == RDT_MODE_PSEUDO_LOCKSETUP) ||
1323            (!strcmp(buf, "pseudo-locked") && mode == RDT_MODE_PSEUDO_LOCKED))
1324                goto out;
1325
1326        if (mode == RDT_MODE_PSEUDO_LOCKED) {
1327                rdt_last_cmd_puts("Cannot change pseudo-locked group\n");
1328                ret = -EINVAL;
1329                goto out;
1330        }
1331
1332        if (!strcmp(buf, "shareable")) {
1333                if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
1334                        ret = rdtgroup_locksetup_exit(rdtgrp);
1335                        if (ret)
1336                                goto out;
1337                }
1338                rdtgrp->mode = RDT_MODE_SHAREABLE;
1339        } else if (!strcmp(buf, "exclusive")) {
1340                if (!rdtgroup_mode_test_exclusive(rdtgrp)) {
1341                        ret = -EINVAL;
1342                        goto out;
1343                }
1344                if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
1345                        ret = rdtgroup_locksetup_exit(rdtgrp);
1346                        if (ret)
1347                                goto out;
1348                }
1349                rdtgrp->mode = RDT_MODE_EXCLUSIVE;
1350        } else if (!strcmp(buf, "pseudo-locksetup")) {
1351                ret = rdtgroup_locksetup_enter(rdtgrp);
1352                if (ret)
1353                        goto out;
1354                rdtgrp->mode = RDT_MODE_PSEUDO_LOCKSETUP;
1355        } else {
1356                rdt_last_cmd_puts("Unknown or unsupported mode\n");
1357                ret = -EINVAL;
1358        }
1359
1360out:
1361        rdtgroup_kn_unlock(of->kn);
1362        return ret ?: nbytes;
1363}
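/*
 * Illustrative use from user space, assuming a group "grp0" under a
 * mounted resctrl filesystem. Only "shareable", "exclusive" and
 * "pseudo-locksetup" can be written here; "pseudo-locked" is entered by
 * writing the schemata while in pseudo-locksetup mode:
 *
 *   echo exclusive > /sys/fs/resctrl/grp0/mode
 */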
1364
1365/**
1366 * rdtgroup_cbm_to_size - Translate CBM to size in bytes
1367 * @r: RDT resource to which @d belongs.
1368 * @d: RDT domain instance.
1369 * @cbm: bitmask for which the size should be computed.
1370 *
 1371 * The bitmask provided, associated with the RDT domain instance @d, will be
 1372 * translated into the number of bytes it represents. The size in bytes is
 1373 * computed by first dividing the total cache size by the CBM length to
 1374 * determine how many bytes each bit in the bitmask represents. The result
 1375 * is multiplied by the number of bits set in the bitmask.
1376 *
 1377 * @cbm is unsigned long, even if only 32 bits are used, to make the
1378 * bitmap functions work correctly.
1379 */
1380unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r,
1381                                  struct rdt_domain *d, unsigned long cbm)
1382{
1383        struct cpu_cacheinfo *ci;
1384        unsigned int size = 0;
1385        int num_b, i;
1386
1387        num_b = bitmap_weight(&cbm, r->cache.cbm_len);
1388        ci = get_cpu_cacheinfo(cpumask_any(&d->cpu_mask));
1389        for (i = 0; i < ci->num_leaves; i++) {
1390                if (ci->info_list[i].level == r->cache_level) {
1391                        size = ci->info_list[i].size / r->cache.cbm_len * num_b;
1392                        break;
1393                }
1394        }
1395
1396        return size;
1397}
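/*
 * For example, for a cache instance reported as 20480 KiB at the matching
 * cache level and a 20 bit CBM, each bit represents 20480 / 20 = 1024 KiB,
 * so a CBM with four bits set translates to 4 * 1024 = 4096 KiB.
 */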
1398
1399/**
1400 * rdtgroup_size_show - Display size in bytes of allocated regions
1401 *
1402 * The "size" file mirrors the layout of the "schemata" file, printing the
1403 * size in bytes of each region instead of the capacity bitmask.
1404 *
1405 */
1406static int rdtgroup_size_show(struct kernfs_open_file *of,
1407                              struct seq_file *s, void *v)
1408{
1409        struct rdtgroup *rdtgrp;
1410        struct rdt_resource *r;
1411        struct rdt_domain *d;
1412        unsigned int size;
1413        int ret = 0;
1414        bool sep;
1415        u32 ctrl;
1416
1417        rdtgrp = rdtgroup_kn_lock_live(of->kn);
1418        if (!rdtgrp) {
1419                rdtgroup_kn_unlock(of->kn);
1420                return -ENOENT;
1421        }
1422
1423        if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
1424                if (!rdtgrp->plr->d) {
1425                        rdt_last_cmd_clear();
1426                        rdt_last_cmd_puts("Cache domain offline\n");
1427                        ret = -ENODEV;
1428                } else {
1429                        seq_printf(s, "%*s:", max_name_width,
1430                                   rdtgrp->plr->r->name);
1431                        size = rdtgroup_cbm_to_size(rdtgrp->plr->r,
1432                                                    rdtgrp->plr->d,
1433                                                    rdtgrp->plr->cbm);
1434                        seq_printf(s, "%d=%u\n", rdtgrp->plr->d->id, size);
1435                }
1436                goto out;
1437        }
1438
1439        for_each_alloc_enabled_rdt_resource(r) {
1440                sep = false;
1441                seq_printf(s, "%*s:", max_name_width, r->name);
1442                list_for_each_entry(d, &r->domains, list) {
1443                        if (sep)
1444                                seq_putc(s, ';');
1445                        if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
1446                                size = 0;
1447                        } else {
1448                                ctrl = (!is_mba_sc(r) ?
1449                                                d->ctrl_val[rdtgrp->closid] :
1450                                                d->mbps_val[rdtgrp->closid]);
1451                                if (r->rid == RDT_RESOURCE_MBA)
1452                                        size = ctrl;
1453                                else
1454                                        size = rdtgroup_cbm_to_size(r, d, ctrl);
1455                        }
1456                        seq_printf(s, "%d=%u", d->id, size);
1457                        sep = true;
1458                }
1459                seq_putc(s, '\n');
1460        }
1461
1462out:
1463        rdtgroup_kn_unlock(of->kn);
1464
1465        return ret;
1466}
1467
1468/* rdtgroup information files for one cache resource. */
1469static struct rftype res_common_files[] = {
1470        {
1471                .name           = "last_cmd_status",
1472                .mode           = 0444,
1473                .kf_ops         = &rdtgroup_kf_single_ops,
1474                .seq_show       = rdt_last_cmd_status_show,
1475                .fflags         = RF_TOP_INFO,
1476        },
1477        {
1478                .name           = "num_closids",
1479                .mode           = 0444,
1480                .kf_ops         = &rdtgroup_kf_single_ops,
1481                .seq_show       = rdt_num_closids_show,
1482                .fflags         = RF_CTRL_INFO,
1483        },
1484        {
1485                .name           = "mon_features",
1486                .mode           = 0444,
1487                .kf_ops         = &rdtgroup_kf_single_ops,
1488                .seq_show       = rdt_mon_features_show,
1489                .fflags         = RF_MON_INFO,
1490        },
1491        {
1492                .name           = "num_rmids",
1493                .mode           = 0444,
1494                .kf_ops         = &rdtgroup_kf_single_ops,
1495                .seq_show       = rdt_num_rmids_show,
1496                .fflags         = RF_MON_INFO,
1497        },
1498        {
1499                .name           = "cbm_mask",
1500                .mode           = 0444,
1501                .kf_ops         = &rdtgroup_kf_single_ops,
1502                .seq_show       = rdt_default_ctrl_show,
1503                .fflags         = RF_CTRL_INFO | RFTYPE_RES_CACHE,
1504        },
1505        {
1506                .name           = "min_cbm_bits",
1507                .mode           = 0444,
1508                .kf_ops         = &rdtgroup_kf_single_ops,
1509                .seq_show       = rdt_min_cbm_bits_show,
1510                .fflags         = RF_CTRL_INFO | RFTYPE_RES_CACHE,
1511        },
1512        {
1513                .name           = "shareable_bits",
1514                .mode           = 0444,
1515                .kf_ops         = &rdtgroup_kf_single_ops,
1516                .seq_show       = rdt_shareable_bits_show,
1517                .fflags         = RF_CTRL_INFO | RFTYPE_RES_CACHE,
1518        },
1519        {
1520                .name           = "bit_usage",
1521                .mode           = 0444,
1522                .kf_ops         = &rdtgroup_kf_single_ops,
1523                .seq_show       = rdt_bit_usage_show,
1524                .fflags         = RF_CTRL_INFO | RFTYPE_RES_CACHE,
1525        },
1526        {
1527                .name           = "min_bandwidth",
1528                .mode           = 0444,
1529                .kf_ops         = &rdtgroup_kf_single_ops,
1530                .seq_show       = rdt_min_bw_show,
1531                .fflags         = RF_CTRL_INFO | RFTYPE_RES_MB,
1532        },
1533        {
1534                .name           = "bandwidth_gran",
1535                .mode           = 0444,
1536                .kf_ops         = &rdtgroup_kf_single_ops,
1537                .seq_show       = rdt_bw_gran_show,
1538                .fflags         = RF_CTRL_INFO | RFTYPE_RES_MB,
1539        },
1540        {
1541                .name           = "delay_linear",
1542                .mode           = 0444,
1543                .kf_ops         = &rdtgroup_kf_single_ops,
1544                .seq_show       = rdt_delay_linear_show,
1545                .fflags         = RF_CTRL_INFO | RFTYPE_RES_MB,
1546        },
1547        /*
1548         * Which (if any) capabilities are provided by thread_throttle_mode
1549         * is platform specific. Defer "fflags" initialization to platform
1550         * discovery.
1551         */
1552        {
1553                .name           = "thread_throttle_mode",
1554                .mode           = 0444,
1555                .kf_ops         = &rdtgroup_kf_single_ops,
1556                .seq_show       = rdt_thread_throttle_mode_show,
1557        },
1558        {
1559                .name           = "max_threshold_occupancy",
1560                .mode           = 0644,
1561                .kf_ops         = &rdtgroup_kf_single_ops,
1562                .write          = max_threshold_occ_write,
1563                .seq_show       = max_threshold_occ_show,
1564                .fflags         = RF_MON_INFO | RFTYPE_RES_CACHE,
1565        },
1566        {
1567                .name           = "cpus",
1568                .mode           = 0644,
1569                .kf_ops         = &rdtgroup_kf_single_ops,
1570                .write          = rdtgroup_cpus_write,
1571                .seq_show       = rdtgroup_cpus_show,
1572                .fflags         = RFTYPE_BASE,
1573        },
1574        {
1575                .name           = "cpus_list",
1576                .mode           = 0644,
1577                .kf_ops         = &rdtgroup_kf_single_ops,
1578                .write          = rdtgroup_cpus_write,
1579                .seq_show       = rdtgroup_cpus_show,
1580                .flags          = RFTYPE_FLAGS_CPUS_LIST,
1581                .fflags         = RFTYPE_BASE,
1582        },
1583        {
1584                .name           = "tasks",
1585                .mode           = 0644,
1586                .kf_ops         = &rdtgroup_kf_single_ops,
1587                .write          = rdtgroup_tasks_write,
1588                .seq_show       = rdtgroup_tasks_show,
1589                .fflags         = RFTYPE_BASE,
1590        },
1591        {
1592                .name           = "schemata",
1593                .mode           = 0644,
1594                .kf_ops         = &rdtgroup_kf_single_ops,
1595                .write          = rdtgroup_schemata_write,
1596                .seq_show       = rdtgroup_schemata_show,
1597                .fflags         = RF_CTRL_BASE,
1598        },
1599        {
1600                .name           = "mode",
1601                .mode           = 0644,
1602                .kf_ops         = &rdtgroup_kf_single_ops,
1603                .write          = rdtgroup_mode_write,
1604                .seq_show       = rdtgroup_mode_show,
1605                .fflags         = RF_CTRL_BASE,
1606        },
1607        {
1608                .name           = "size",
1609                .mode           = 0444,
1610                .kf_ops         = &rdtgroup_kf_single_ops,
1611                .seq_show       = rdtgroup_size_show,
1612                .fflags         = RF_CTRL_BASE,
1613        },
1614
1615};
1616
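/*
 * Add to @kn every file in res_common_files[] whose "fflags" bits are all
 * contained in @fflags. If adding one of them fails, remove the files that
 * were already added and return the error.
 */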
1617static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags)
1618{
1619        struct rftype *rfts, *rft;
1620        int ret, len;
1621
1622        rfts = res_common_files;
1623        len = ARRAY_SIZE(res_common_files);
1624
1625        lockdep_assert_held(&rdtgroup_mutex);
1626
1627        for (rft = rfts; rft < rfts + len; rft++) {
1628                if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) {
1629                        ret = rdtgroup_add_file(kn, rft);
1630                        if (ret)
1631                                goto error;
1632                }
1633        }
1634
1635        return 0;
1636error:
1637        pr_warn("Failed to add %s, err=%d\n", rft->name, ret);
1638        while (--rft >= rfts) {
1639                if ((fflags & rft->fflags) == rft->fflags)
1640                        kernfs_remove_by_name(kn, rft->name);
1641        }
1642        return ret;
1643}
1644
1645static struct rftype *rdtgroup_get_rftype_by_name(const char *name)
1646{
1647        struct rftype *rfts, *rft;
1648        int len;
1649
1650        rfts = res_common_files;
1651        len = ARRAY_SIZE(res_common_files);
1652
1653        for (rft = rfts; rft < rfts + len; rft++) {
1654                if (!strcmp(rft->name, name))
1655                        return rft;
1656        }
1657
1658        return NULL;
1659}
1660
1661void __init thread_throttle_mode_init(void)
1662{
1663        struct rftype *rft;
1664
1665        rft = rdtgroup_get_rftype_by_name("thread_throttle_mode");
1666        if (!rft)
1667                return;
1668
1669        rft->fflags = RF_CTRL_INFO | RFTYPE_RES_MB;
1670}
1671
1672/**
1673 * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file
1674 * @r: The resource group with which the file is associated.
1675 * @name: Name of the file
1676 *
1677 * The permissions of the named resctrl file, directory, or link are modified
1678 * to not allow read, write, or execute by any user.
1679 *
1680 * WARNING: This function is intended to communicate to the user that the
1681 * resctrl file has been locked down - that it is not relevant to the
1682 * particular state the system finds itself in. It should not be relied
1683 * on to protect from user access because after the file's permissions
1684 * are restricted the user can still change the permissions using chmod
1685 * from the command line.
1686 *
1687 * Return: 0 on success, <0 on failure.
1688 */
1689int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name)
1690{
1691        struct iattr iattr = {.ia_valid = ATTR_MODE,};
1692        struct kernfs_node *kn;
1693        int ret = 0;
1694
1695        kn = kernfs_find_and_get_ns(r->kn, name, NULL);
1696        if (!kn)
1697                return -ENOENT;
1698
1699        switch (kernfs_type(kn)) {
1700        case KERNFS_DIR:
1701                iattr.ia_mode = S_IFDIR;
1702                break;
1703        case KERNFS_FILE:
1704                iattr.ia_mode = S_IFREG;
1705                break;
1706        case KERNFS_LINK:
1707                iattr.ia_mode = S_IFLNK;
1708                break;
1709        }
1710
1711        ret = kernfs_setattr(kn, &iattr);
1712        kernfs_put(kn);
1713        return ret;
1714}
1715
1716/**
1717 * rdtgroup_kn_mode_restore - Restore user access to named resctrl file
1718 * @r: The resource group with which the file is associated.
1719 * @name: Name of the file
1720 * @mask: Mask of permissions that should be restored
1721 *
1722 * Restore the permissions of the named file. If @name is a directory the
1723 * permissions of its parent will be used.
1724 *
1725 * Return: 0 on success, <0 on failure.
1726 */
1727int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
1728                             umode_t mask)
1729{
1730        struct iattr iattr = {.ia_valid = ATTR_MODE,};
1731        struct kernfs_node *kn, *parent;
1732        struct rftype *rfts, *rft;
1733        int ret, len;
1734
1735        rfts = res_common_files;
1736        len = ARRAY_SIZE(res_common_files);
1737
1738        for (rft = rfts; rft < rfts + len; rft++) {
1739                if (!strcmp(rft->name, name))
1740                        iattr.ia_mode = rft->mode & mask;
1741        }
1742
1743        kn = kernfs_find_and_get_ns(r->kn, name, NULL);
1744        if (!kn)
1745                return -ENOENT;
1746
1747        switch (kernfs_type(kn)) {
1748        case KERNFS_DIR:
1749                parent = kernfs_get_parent(kn);
1750                if (parent) {
1751                        iattr.ia_mode |= parent->mode;
1752                        kernfs_put(parent);
1753                }
1754                iattr.ia_mode |= S_IFDIR;
1755                break;
1756        case KERNFS_FILE:
1757                iattr.ia_mode |= S_IFREG;
1758                break;
1759        case KERNFS_LINK:
1760                iattr.ia_mode |= S_IFLNK;
1761                break;
1762        }
1763
1764        ret = kernfs_setattr(kn, &iattr);
1765        kernfs_put(kn);
1766        return ret;
1767}
1768
1769static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name,
1770                                      unsigned long fflags)
1771{
1772        struct kernfs_node *kn_subdir;
1773        int ret;
1774
1775        kn_subdir = kernfs_create_dir(kn_info, name,
1776                                      kn_info->mode, r);
1777        if (IS_ERR(kn_subdir))
1778                return PTR_ERR(kn_subdir);
1779
1780        kernfs_get(kn_subdir);
1781        ret = rdtgroup_kn_set_ugid(kn_subdir);
1782        if (ret)
1783                return ret;
1784
1785        ret = rdtgroup_add_files(kn_subdir, fflags);
1786        if (!ret)
1787                kernfs_activate(kn_subdir);
1788
1789        return ret;
1790}
1791
1792static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
1793{
1794        struct rdt_resource *r;
1795        unsigned long fflags;
1796        char name[32];
1797        int ret;
1798
1799        /* create the directory */
1800        kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL);
1801        if (IS_ERR(kn_info))
1802                return PTR_ERR(kn_info);
1803        kernfs_get(kn_info);
1804
1805        ret = rdtgroup_add_files(kn_info, RF_TOP_INFO);
1806        if (ret)
1807                goto out_destroy;
1808
1809        for_each_alloc_enabled_rdt_resource(r) {
1810                fflags =  r->fflags | RF_CTRL_INFO;
1811                ret = rdtgroup_mkdir_info_resdir(r, r->name, fflags);
1812                if (ret)
1813                        goto out_destroy;
1814        }
1815
1816        for_each_mon_enabled_rdt_resource(r) {
1817                fflags =  r->fflags | RF_MON_INFO;
1818                sprintf(name, "%s_MON", r->name);
1819                ret = rdtgroup_mkdir_info_resdir(r, name, fflags);
1820                if (ret)
1821                        goto out_destroy;
1822        }
1823
1824        /*
1825         * This extra ref will be put in kernfs_remove() and guarantees
1826         * that kn_info is always accessible.
1827         */
1828        kernfs_get(kn_info);
1829
1830        ret = rdtgroup_kn_set_ugid(kn_info);
1831        if (ret)
1832                goto out_destroy;
1833
1834        kernfs_activate(kn_info);
1835
1836        return 0;
1837
1838out_destroy:
1839        kernfs_remove(kn_info);
1840        return ret;
1841}
1842
1843static int
1844mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp,
1845                    char *name, struct kernfs_node **dest_kn)
1846{
1847        struct kernfs_node *kn;
1848        int ret;
1849
1850        /* create the directory */
1851        kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
1852        if (IS_ERR(kn))
1853                return PTR_ERR(kn);
1854
1855        if (dest_kn)
1856                *dest_kn = kn;
1857
1858        /*
1859         * This extra ref will be put in kernfs_remove() and guarantees
1860         * that kn is always accessible.
1861         */
1862        kernfs_get(kn);
1863
1864        ret = rdtgroup_kn_set_ugid(kn);
1865        if (ret)
1866                goto out_destroy;
1867
1868        kernfs_activate(kn);
1869
1870        return 0;
1871
1872out_destroy:
1873        kernfs_remove(kn);
1874        return ret;
1875}
1876
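/*
 * The two helpers below run on a target CPU, either directly or via IPI
 * from set_cache_qos_cfg(), to set or clear the CDP enable bit in the
 * L3/L2 QOS_CFG MSR.
 */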
1877static void l3_qos_cfg_update(void *arg)
1878{
1879        bool *enable = arg;
1880
1881        wrmsrl(MSR_IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL);
1882}
1883
1884static void l2_qos_cfg_update(void *arg)
1885{
1886        bool *enable = arg;
1887
1888        wrmsrl(MSR_IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL);
1889}
1890
1891static inline bool is_mba_linear(void)
1892{
1893        return rdt_resources_all[RDT_RESOURCE_MBA].membw.delay_linear;
1894}
1895
1896static int set_cache_qos_cfg(int level, bool enable)
1897{
1898        void (*update)(void *arg);
1899        struct rdt_resource *r_l;
1900        cpumask_var_t cpu_mask;
1901        struct rdt_domain *d;
1902        int cpu;
1903
1904        if (level == RDT_RESOURCE_L3)
1905                update = l3_qos_cfg_update;
1906        else if (level == RDT_RESOURCE_L2)
1907                update = l2_qos_cfg_update;
1908        else
1909                return -EINVAL;
1910
1911        if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
1912                return -ENOMEM;
1913
1914        r_l = &rdt_resources_all[level];
1915        list_for_each_entry(d, &r_l->domains, list) {
1916                if (r_l->cache.arch_has_per_cpu_cfg)
1917                        /* Pick all the CPUs in the domain instance */
1918                        for_each_cpu(cpu, &d->cpu_mask)
1919                                cpumask_set_cpu(cpu, cpu_mask);
1920                else
1921                        /* Pick one CPU from each domain instance to update MSR */
1922                        cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
1923        }
1924        cpu = get_cpu();
1925        /* Update QOS_CFG MSR on this cpu if it's in cpu_mask. */
1926        if (cpumask_test_cpu(cpu, cpu_mask))
1927                update(&enable);
1928        /* Update QOS_CFG MSR on all other cpus in cpu_mask. */
1929        smp_call_function_many(cpu_mask, update, &enable, 1);
1930        put_cpu();
1931
1932        free_cpumask_var(cpu_mask);
1933
1934        return 0;
1935}
1936
1937/* Restore the qos cfg state when a domain comes online */
1938void rdt_domain_reconfigure_cdp(struct rdt_resource *r)
1939{
1940        if (!r->alloc_capable)
1941                return;
1942
1943        if (r == &rdt_resources_all[RDT_RESOURCE_L2DATA])
1944                l2_qos_cfg_update(&r->alloc_enabled);
1945
1946        if (r == &rdt_resources_all[RDT_RESOURCE_L3DATA])
1947                l3_qos_cfg_update(&r->alloc_enabled);
1948}
1949
1950/*
1951 * Enable or disable the MBA software controller
1952 * which lets the user specify bandwidth in MBps.
1953 * The MBA software controller is supported only if
1954 * MBM is supported and MBA is in linear scale.
1955 */
1956static int set_mba_sc(bool mba_sc)
1957{
1958        struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA];
1959        struct rdt_domain *d;
1960
1961        if (!is_mbm_enabled() || !is_mba_linear() ||
1962            mba_sc == is_mba_sc(r))
1963                return -EINVAL;
1964
1965        r->membw.mba_sc = mba_sc;
1966        list_for_each_entry(d, &r->domains, list)
1967                setup_default_ctrlval(r, d->ctrl_val, d->mbps_val);
1968
1969        return 0;
1970}
1971
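/*
 * Code/Data Prioritization (CDP) splits a cache level's allocation
 * control into separate code and data resources. Enabling it programs
 * QOS_CFG on the domains of that level and flips which of the unified
 * vs. the DATA/CODE resources are alloc_enabled.
 */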
1972static int cdp_enable(int level, int data_type, int code_type)
1973{
1974        struct rdt_resource *r_ldata = &rdt_resources_all[data_type];
1975        struct rdt_resource *r_lcode = &rdt_resources_all[code_type];
1976        struct rdt_resource *r_l = &rdt_resources_all[level];
1977        int ret;
1978
1979        if (!r_l->alloc_capable || !r_ldata->alloc_capable ||
1980            !r_lcode->alloc_capable)
1981                return -EINVAL;
1982
1983        ret = set_cache_qos_cfg(level, true);
1984        if (!ret) {
1985                r_l->alloc_enabled = false;
1986                r_ldata->alloc_enabled = true;
1987                r_lcode->alloc_enabled = true;
1988        }
1989        return ret;
1990}
1991
1992static int cdpl3_enable(void)
1993{
1994        return cdp_enable(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA,
1995                          RDT_RESOURCE_L3CODE);
1996}
1997
1998static int cdpl2_enable(void)
1999{
2000        return cdp_enable(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA,
2001                          RDT_RESOURCE_L2CODE);
2002}
2003
2004static void cdp_disable(int level, int data_type, int code_type)
2005{
2006        struct rdt_resource *r = &rdt_resources_all[level];
2007
2008        r->alloc_enabled = r->alloc_capable;
2009
2010        if (rdt_resources_all[data_type].alloc_enabled) {
2011                rdt_resources_all[data_type].alloc_enabled = false;
2012                rdt_resources_all[code_type].alloc_enabled = false;
2013                set_cache_qos_cfg(level, false);
2014        }
2015}
2016
2017static void cdpl3_disable(void)
2018{
2019        cdp_disable(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA, RDT_RESOURCE_L3CODE);
2020}
2021
2022static void cdpl2_disable(void)
2023{
2024        cdp_disable(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA, RDT_RESOURCE_L2CODE);
2025}
2026
2027static void cdp_disable_all(void)
2028{
2029        if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled)
2030                cdpl3_disable();
2031        if (rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled)
2032                cdpl2_disable();
2033}
2034
2035/*
2036 * We don't allow rdtgroup directories to be created anywhere
2037 * except the root directory. Thus when looking for the rdtgroup
2038 * structure for a kernfs node we are either looking at a directory,
2039 * in which case the rdtgroup structure is pointed at by the "priv"
2040 * field, or at a file, in which case we need only look to the parent
2041 * to find the rdtgroup.
2042 */
2043static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn)
2044{
2045        if (kernfs_type(kn) == KERNFS_DIR) {
2046                /*
2047                 * All the resource directories use "kn->priv"
2048                 * to point to the "struct rdtgroup" for the
2049                 * resource. "info" and its subdirectories don't
2050                 * have rdtgroup structures, so return NULL here.
2051                 */
2052                if (kn == kn_info || kn->parent == kn_info)
2053                        return NULL;
2054                else
2055                        return kn->priv;
2056        } else {
2057                return kn->parent->priv;
2058        }
2059}
2060
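/*
 * rdtgroup_kn_lock_live() and rdtgroup_kn_unlock() are used in pairs
 * around resctrl file operations. The lock side takes a "waitcount"
 * reference on the group, drops kernfs active protection and acquires
 * rdtgroup_mutex; the unlock side releases the mutex and, if the group
 * was deleted while it was in use, performs the deferred cleanup
 * (pseudo-lock removal, kernfs_put() and kfree()).
 */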
2061struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn)
2062{
2063        struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
2064
2065        if (!rdtgrp)
2066                return NULL;
2067
2068        atomic_inc(&rdtgrp->waitcount);
2069        kernfs_break_active_protection(kn);
2070
2071        mutex_lock(&rdtgroup_mutex);
2072
2073        /* Was this group deleted while we waited? */
2074        if (rdtgrp->flags & RDT_DELETED)
2075                return NULL;
2076
2077        return rdtgrp;
2078}
2079
2080void rdtgroup_kn_unlock(struct kernfs_node *kn)
2081{
2082        struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
2083
2084        if (!rdtgrp)
2085                return;
2086
2087        mutex_unlock(&rdtgroup_mutex);
2088
2089        if (atomic_dec_and_test(&rdtgrp->waitcount) &&
2090            (rdtgrp->flags & RDT_DELETED)) {
2091                if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
2092                    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
2093                        rdtgroup_pseudo_lock_remove(rdtgrp);
2094                kernfs_unbreak_active_protection(kn);
2095                kernfs_put(rdtgrp->kn);
2096                kfree(rdtgrp);
2097        } else {
2098                kernfs_unbreak_active_protection(kn);
2099        }
2100}
2101
2102static int mkdir_mondata_all(struct kernfs_node *parent_kn,
2103                             struct rdtgroup *prgrp,
2104                             struct kernfs_node **mon_data_kn);
2105
2106static int rdt_enable_ctx(struct rdt_fs_context *ctx)
2107{
2108        int ret = 0;
2109
2110        if (ctx->enable_cdpl2)
2111                ret = cdpl2_enable();
2112
2113        if (!ret && ctx->enable_cdpl3)
2114                ret = cdpl3_enable();
2115
2116        if (!ret && ctx->enable_mba_mbps)
2117                ret = set_mba_sc(true);
2118
2119        return ret;
2120}
2121
2122static int rdt_get_tree(struct fs_context *fc)
2123{
2124        struct rdt_fs_context *ctx = rdt_fc2context(fc);
2125        struct rdt_domain *dom;
2126        struct rdt_resource *r;
2127        int ret;
2128
2129        cpus_read_lock();
2130        mutex_lock(&rdtgroup_mutex);
2131        /*
2132         * The resctrl filesystem can only be mounted once.
2133         */
2134        if (static_branch_unlikely(&rdt_enable_key)) {
2135                ret = -EBUSY;
2136                goto out;
2137        }
2138
2139        ret = rdt_enable_ctx(ctx);
2140        if (ret < 0)
2141                goto out_cdp;
2142
2143        closid_init();
2144
2145        ret = rdtgroup_create_info_dir(rdtgroup_default.kn);
2146        if (ret < 0)
2147                goto out_mba;
2148
2149        if (rdt_mon_capable) {
2150                ret = mongroup_create_dir(rdtgroup_default.kn,
2151                                          &rdtgroup_default, "mon_groups",
2152                                          &kn_mongrp);
2153                if (ret < 0)
2154                        goto out_info;
2155                kernfs_get(kn_mongrp);
2156
2157                ret = mkdir_mondata_all(rdtgroup_default.kn,
2158                                        &rdtgroup_default, &kn_mondata);
2159                if (ret < 0)
2160                        goto out_mongrp;
2161                kernfs_get(kn_mondata);
2162                rdtgroup_default.mon.mon_data_kn = kn_mondata;
2163        }
2164
2165        ret = rdt_pseudo_lock_init();
2166        if (ret)
2167                goto out_mondata;
2168
2169        ret = kernfs_get_tree(fc);
2170        if (ret < 0)
2171                goto out_psl;
2172
2173        if (rdt_alloc_capable)
2174                static_branch_enable_cpuslocked(&rdt_alloc_enable_key);
2175        if (rdt_mon_capable)
2176                static_branch_enable_cpuslocked(&rdt_mon_enable_key);
2177
2178        if (rdt_alloc_capable || rdt_mon_capable)
2179                static_branch_enable_cpuslocked(&rdt_enable_key);
2180
2181        if (is_mbm_enabled()) {
2182                r = &rdt_resources_all[RDT_RESOURCE_L3];
2183                list_for_each_entry(dom, &r->domains, list)
2184                        mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL);
2185        }
2186
2187        goto out;
2188
2189out_psl:
2190        rdt_pseudo_lock_release();
2191out_mondata:
2192        if (rdt_mon_capable)
2193                kernfs_remove(kn_mondata);
2194out_mongrp:
2195        if (rdt_mon_capable)
2196                kernfs_remove(kn_mongrp);
2197out_info:
2198        kernfs_remove(kn_info);
2199out_mba:
2200        if (ctx->enable_mba_mbps)
2201                set_mba_sc(false);
2202out_cdp:
2203        cdp_disable_all();
2204out:
2205        rdt_last_cmd_clear();
2206        mutex_unlock(&rdtgroup_mutex);
2207        cpus_read_unlock();
2208        return ret;
2209}
2210
2211enum rdt_param {
2212        Opt_cdp,
2213        Opt_cdpl2,
2214        Opt_mba_mbps,
2215        nr__rdt_params
2216};
2217
2218static const struct fs_parameter_spec rdt_fs_parameters[] = {
2219        fsparam_flag("cdp",             Opt_cdp),
2220        fsparam_flag("cdpl2",           Opt_cdpl2),
2221        fsparam_flag("mba_MBps",        Opt_mba_mbps),
2222        {}
2223};
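/*
 * Illustrative mount invocation exercising the options above, following
 * the form documented for resctrl:
 *
 *   # mount -t resctrl resctrl [-o cdp[,cdpl2][,mba_MBps]] /sys/fs/resctrl
 */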
2224
2225static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param)
2226{
2227        struct rdt_fs_context *ctx = rdt_fc2context(fc);
2228        struct fs_parse_result result;
2229        int opt;
2230
2231        opt = fs_parse(fc, rdt_fs_parameters, param, &result);
2232        if (opt < 0)
2233                return opt;
2234
2235        switch (opt) {
2236        case Opt_cdp:
2237                ctx->enable_cdpl3 = true;
2238                return 0;
2239        case Opt_cdpl2:
2240                ctx->enable_cdpl2 = true;
2241                return 0;
2242        case Opt_mba_mbps:
2243                if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
2244                        return -EINVAL;
2245                ctx->enable_mba_mbps = true;
2246                return 0;
2247        }
2248
2249        return -EINVAL;
2250}
2251
2252static void rdt_fs_context_free(struct fs_context *fc)
2253{
2254        struct rdt_fs_context *ctx = rdt_fc2context(fc);
2255
2256        kernfs_free_fs_context(fc);
2257        kfree(ctx);
2258}
2259
2260static const struct fs_context_operations rdt_fs_context_ops = {
2261        .free           = rdt_fs_context_free,
2262        .parse_param    = rdt_parse_param,
2263        .get_tree       = rdt_get_tree,
2264};
2265
2266static int rdt_init_fs_context(struct fs_context *fc)
2267{
2268        struct rdt_fs_context *ctx;
2269
2270        ctx = kzalloc(sizeof(struct rdt_fs_context), GFP_KERNEL);
2271        if (!ctx)
2272                return -ENOMEM;
2273
2274        ctx->kfc.root = rdt_root;
2275        ctx->kfc.magic = RDTGROUP_SUPER_MAGIC;
2276        fc->fs_private = &ctx->kfc;
2277        fc->ops = &rdt_fs_context_ops;
2278        put_user_ns(fc->user_ns);
2279        fc->user_ns = get_user_ns(&init_user_ns);
2280        fc->global = true;
2281        return 0;
2282}
2283
2284static int reset_all_ctrls(struct rdt_resource *r)
2285{
2286        struct msr_param msr_param;
2287        cpumask_var_t cpu_mask;
2288        struct rdt_domain *d;
2289        int i, cpu;
2290
2291        if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
2292                return -ENOMEM;
2293
2294        msr_param.res = r;
2295        msr_param.low = 0;
2296        msr_param.high = r->num_closid;
2297
2298        /*
2299         * Disable resource control for this resource by setting all
2300         * CBMs in all domains to the maximum mask value. Pick one CPU
2301         * from each domain to update the MSRs below.
2302         */
2303        list_for_each_entry(d, &r->domains, list) {
2304                cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
2305
2306                for (i = 0; i < r->num_closid; i++)
2307                        d->ctrl_val[i] = r->default_ctrl;
2308        }
2309        cpu = get_cpu();
2310        /* Update CBM on this cpu if it's in cpu_mask. */
2311        if (cpumask_test_cpu(cpu, cpu_mask))
2312                rdt_ctrl_update(&msr_param);
2313        /* Update CBM on all other cpus in cpu_mask. */
2314        smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1);
2315        put_cpu();
2316
2317        free_cpumask_var(cpu_mask);
2318
2319        return 0;
2320}
2321
2322/*
2323 * Move tasks from one to the other group. If @from is NULL, then all tasks
2324 * in the system are moved unconditionally (used for teardown).
2325 *
2326 * If @mask is not NULL the cpus on which moved tasks are running are set
2327 * in that mask so the update smp function call is restricted to affected
2328 * cpus.
2329 */
2330static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
2331                                 struct cpumask *mask)
2332{
2333        struct task_struct *p, *t;
2334
2335        read_lock(&tasklist_lock);
2336        for_each_process_thread(p, t) {
2337                if (!from || is_closid_match(t, from) ||
2338                    is_rmid_match(t, from)) {
2339                        t->closid = to->closid;
2340                        t->rmid = to->mon.rmid;
2341
2342#ifdef CONFIG_SMP
2343                        /*
2344                         * This is safe on x86 w/o barriers as the ordering
2345                         * of writing to task_cpu() and t->on_cpu is
2346                         * reverse to the reading here. The detection is
2347                         * inaccurate as tasks might move or schedule
2348                         * before the smp function call takes place. In
2349                         * such a case the function call is pointless, but
2350                         * there is no other side effect.
2351                         */
2352                        if (mask && t->on_cpu)
2353                                cpumask_set_cpu(task_cpu(t), mask);
2354#endif
2355                }
2356        }
2357        read_unlock(&tasklist_lock);
2358}
2359
2360static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
2361{
2362        struct rdtgroup *sentry, *stmp;
2363        struct list_head *head;
2364
2365        head = &rdtgrp->mon.crdtgrp_list;
2366        list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) {
2367                free_rmid(sentry->mon.rmid);
2368                list_del(&sentry->mon.crdtgrp_list);
2369
2370                if (atomic_read(&sentry->waitcount) != 0)
2371                        sentry->flags = RDT_DELETED;
2372                else
2373                        kfree(sentry);
2374        }
2375}
2376
2377/*
2378 * Forcibly remove all subdirectories under root.
2379 */
2380static void rmdir_all_sub(void)
2381{
2382        struct rdtgroup *rdtgrp, *tmp;
2383
2384        /* Move all tasks to the default resource group */
2385        rdt_move_group_tasks(NULL, &rdtgroup_default, NULL);
2386
2387        list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) {
2388                /* Free any child rmids */
2389                free_all_child_rdtgrp(rdtgrp);
2390
2391                /* Remove each rdtgroup other than root */
2392                if (rdtgrp == &rdtgroup_default)
2393                        continue;
2394
2395                if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
2396                    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
2397                        rdtgroup_pseudo_lock_remove(rdtgrp);
2398
2399                /*
2400                 * Give any CPUs back to the default group. We cannot copy
2401                 * cpu_online_mask because a CPU might have executed the
2402                 * offline callback already, but is still marked online.
2403                 */
2404                cpumask_or(&rdtgroup_default.cpu_mask,
2405                           &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
2406
2407                free_rmid(rdtgrp->mon.rmid);
2408
2409                kernfs_remove(rdtgrp->kn);
2410                list_del(&rdtgrp->rdtgroup_list);
2411
2412                if (atomic_read(&rdtgrp->waitcount) != 0)
2413                        rdtgrp->flags = RDT_DELETED;
2414                else
2415                        kfree(rdtgrp);
2416        }
2417        /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
2418        update_closid_rmid(cpu_online_mask, &rdtgroup_default);
2419
2420        kernfs_remove(kn_info);
2421        kernfs_remove(kn_mongrp);
2422        kernfs_remove(kn_mondata);
2423}
2424
2425static void rdt_kill_sb(struct super_block *sb)
2426{
2427        struct rdt_resource *r;
2428
2429        cpus_read_lock();
2430        mutex_lock(&rdtgroup_mutex);
2431
2432        set_mba_sc(false);
2433
2434        /* Put everything back to default values. */
2435        for_each_alloc_enabled_rdt_resource(r)
2436                reset_all_ctrls(r);
2437        cdp_disable_all();
2438        rmdir_all_sub();
2439        rdt_pseudo_lock_release();
2440        rdtgroup_default.mode = RDT_MODE_SHAREABLE;
2441        static_branch_disable_cpuslocked(&rdt_alloc_enable_key);
2442        static_branch_disable_cpuslocked(&rdt_mon_enable_key);
2443        static_branch_disable_cpuslocked(&rdt_enable_key);
2444        kernfs_kill_sb(sb);
2445        mutex_unlock(&rdtgroup_mutex);
2446        cpus_read_unlock();
2447}
2448
2449static struct file_system_type rdt_fs_type = {
2450        .name                   = "resctrl",
2451        .init_fs_context        = rdt_init_fs_context,
2452        .parameters             = rdt_fs_parameters,
2453        .kill_sb                = rdt_kill_sb,
2454};
2455
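/*
 * Create a single read-only monitoring event file (e.g. "llc_occupancy")
 * under a mon_data domain directory; reads are served by kf_mondata_ops
 * with the event encoded in @priv.
 */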
2456static int mon_addfile(struct kernfs_node *parent_kn, const char *name,
2457                       void *priv)
2458{
2459        struct kernfs_node *kn;
2460        int ret = 0;
2461
2462        kn = __kernfs_create_file(parent_kn, name, 0444,
2463                                  GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0,
2464                                  &kf_mondata_ops, priv, NULL, NULL);
2465        if (IS_ERR(kn))
2466                return PTR_ERR(kn);
2467
2468        ret = rdtgroup_kn_set_ugid(kn);
2469        if (ret) {
2470                kernfs_remove(kn);
2471                return ret;
2472        }
2473
2474        return ret;
2475}
2476
2477/*
2478 * Remove all subdirectories of mon_data of ctrl_mon groups
2479 * and monitor groups with the given domain id.
2480 */
2481void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, unsigned int dom_id)
2482{
2483        struct rdtgroup *prgrp, *crgrp;
2484        char name[32];
2485
2486        if (!r->mon_enabled)
2487                return;
2488
2489        list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
2490                sprintf(name, "mon_%s_%02d", r->name, dom_id);
2491                kernfs_remove_by_name(prgrp->mon.mon_data_kn, name);
2492
2493                list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list)
2494                        kernfs_remove_by_name(crgrp->mon.mon_data_kn, name);
2495        }
2496}
2497
2498static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
2499                                struct rdt_domain *d,
2500                                struct rdt_resource *r, struct rdtgroup *prgrp)
2501{
2502        union mon_data_bits priv;
2503        struct kernfs_node *kn;
2504        struct mon_evt *mevt;
2505        struct rmid_read rr;
2506        char name[32];
2507        int ret;
2508
2509        sprintf(name, "mon_%s_%02d", r->name, d->id);
2510        /* create the directory */
2511        kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
2512        if (IS_ERR(kn))
2513                return PTR_ERR(kn);
2514
2515        /*
2516         * This extra ref will be put in kernfs_remove() and guarantees
2517         * that kn is always accessible.
2518         */
2519        kernfs_get(kn);
2520        ret = rdtgroup_kn_set_ugid(kn);
2521        if (ret)
2522                goto out_destroy;
2523
2524        if (WARN_ON(list_empty(&r->evt_list))) {
2525                ret = -EPERM;
2526                goto out_destroy;
2527        }
2528
2529        priv.u.rid = r->rid;
2530        priv.u.domid = d->id;
2531        list_for_each_entry(mevt, &r->evt_list, list) {
2532                priv.u.evtid = mevt->evtid;
2533                ret = mon_addfile(kn, mevt->name, priv.priv);
2534                if (ret)
2535                        goto out_destroy;
2536
2537                if (is_mbm_event(mevt->evtid))
2538                        mon_event_read(&rr, r, d, prgrp, mevt->evtid, true);
2539        }
2540        kernfs_activate(kn);
2541        return 0;
2542
2543out_destroy:
2544        kernfs_remove(kn);
2545        return ret;
2546}
2547
2548/*
2549 * Add all subdirectories of mon_data for "ctrl_mon" groups
2550 * and "monitor" groups with the given domain id.
2551 */
2552void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
2553                                    struct rdt_domain *d)
2554{
2555        struct kernfs_node *parent_kn;
2556        struct rdtgroup *prgrp, *crgrp;
2557        struct list_head *head;
2558
2559        if (!r->mon_enabled)
2560                return;
2561
2562        list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
2563                parent_kn = prgrp->mon.mon_data_kn;
2564                mkdir_mondata_subdir(parent_kn, d, r, prgrp);
2565
2566                head = &prgrp->mon.crdtgrp_list;
2567                list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
2568                        parent_kn = crgrp->mon.mon_data_kn;
2569                        mkdir_mondata_subdir(parent_kn, d, r, crgrp);
2570                }
2571        }
2572}
2573
2574static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn,
2575                                       struct rdt_resource *r,
2576                                       struct rdtgroup *prgrp)
2577{
2578        struct rdt_domain *dom;
2579        int ret;
2580
2581        list_for_each_entry(dom, &r->domains, list) {
2582                ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp);
2583                if (ret)
2584                        return ret;
2585        }
2586
2587        return 0;
2588}
2589
2590/*
2591 * This creates a directory mon_data which contains the monitored data.
2592 *
2593 * mon_data has one directory for each domain, named
2594 * in the format mon_<domain_name>_<domain_id>. For example, mon_data
2595 * with L3 domains looks as below:
2596 * ./mon_data:
2597 * mon_L3_00
2598 * mon_L3_01
2599 * mon_L3_02
2600 * ...
2601 *
2602 * Each domain directory has one file per event:
2603 * ./mon_L3_00/:
2604 * llc_occupancy
2605 *
2606 */
2607static int mkdir_mondata_all(struct kernfs_node *parent_kn,
2608                             struct rdtgroup *prgrp,
2609                             struct kernfs_node **dest_kn)
2610{
2611        struct rdt_resource *r;
2612        struct kernfs_node *kn;
2613        int ret;
2614
2615        /*
2616         * Create the mon_data directory first.
2617         */
2618        ret = mongroup_create_dir(parent_kn, prgrp, "mon_data", &kn);
2619        if (ret)
2620                return ret;
2621
2622        if (dest_kn)
2623                *dest_kn = kn;
2624
2625        /*
2626         * Create the subdirectories for each domain. Note that all events
2627         * in a domain like L3 are grouped into a resource whose domain is L3
2628         */
2629        for_each_mon_enabled_rdt_resource(r) {
2630                ret = mkdir_mondata_subdir_alldom(kn, r, prgrp);
2631                if (ret)
2632                        goto out_destroy;
2633        }
2634
2635        return 0;
2636
2637out_destroy:
2638        kernfs_remove(kn);
2639        return ret;
2640}
2641
2642/**
2643 * cbm_ensure_valid - Enforce validity on provided CBM
2644 * @_val:       Candidate CBM
2645 * @r:          RDT resource to which the CBM belongs
2646 *
2647 * The provided CBM represents all cache portions available for use. This
2648 * may be represented by a bitmap that does not consist of contiguous ones
2649 * and thus be an invalid CBM.
2650 * Here the provided CBM is forced to be a valid CBM by only considering
2651 * the first set of contiguous bits as valid and clearing all other bits.
2652 * The intention here is to provide a valid default CBM with which a new
2653 * resource group is initialized. The user can follow this with a
2654 * modification to the CBM if the default does not satisfy the
2655 * requirements.
2656 */
2657static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r)
2658{
2659        unsigned int cbm_len = r->cache.cbm_len;
2660        unsigned long first_bit, zero_bit;
2661        unsigned long val = _val;
2662
2663        if (!val)
2664                return 0;
2665
2666        first_bit = find_first_bit(&val, cbm_len);
2667        zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
2668
2669        /* Clear any remaining bits to ensure contiguous region */
2670        bitmap_clear(&val, zero_bit, cbm_len - zero_bit);
2671        return (u32)val;
2672}
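/*
 * Worked example for cbm_ensure_valid() above (illustrative values): with
 * cbm_len = 12 and _val = 0xf0f, the first set bit is bit 0 and the next
 * zero bit is bit 4, so bits 4..11 are cleared and 0x00f is returned.
 */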
2673
2674/*
2675 * Initialize cache resources per RDT domain
2676 *
2677 * Set the RDT domain up to start off with all usable allocations. That is,
2678 * all shareable and unused bits. All-zero CBM is invalid.
2679 */
2680static int __init_one_rdt_domain(struct rdt_domain *d, struct rdt_resource *r,
2681                                 u32 closid)
2682{
2683        struct rdt_resource *r_cdp = NULL;
2684        struct rdt_domain *d_cdp = NULL;
2685        u32 used_b = 0, unused_b = 0;
2686        unsigned long tmp_cbm;
2687        enum rdtgrp_mode mode;
2688        u32 peer_ctl, *ctrl;
2689        int i;
2690
2691        rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp);
2692        d->have_new_ctrl = false;
2693        d->new_ctrl = r->cache.shareable_bits;
2694        used_b = r->cache.shareable_bits;
2695        ctrl = d->ctrl_val;
2696        for (i = 0; i < closids_supported(); i++, ctrl++) {
2697                if (closid_allocated(i) && i != closid) {
2698                        mode = rdtgroup_mode_by_closid(i);
2699                        if (mode == RDT_MODE_PSEUDO_LOCKSETUP)
2700                                /*
2701                                 * ctrl values for locksetup aren't relevant
2702                                 * until the schemata is written, and the mode
2703                                 * becomes RDT_MODE_PSEUDO_LOCKED.
2704                                 */
2705                                continue;
2706                        /*
2707                         * If CDP is active include peer domain's
2708                         * usage to ensure there is no overlap
2709                         * with an exclusive group.
2710                         */
2711                        if (d_cdp)
2712                                peer_ctl = d_cdp->ctrl_val[i];
2713                        else
2714                                peer_ctl = 0;
2715                        used_b |= *ctrl | peer_ctl;
2716                        if (mode == RDT_MODE_SHAREABLE)
2717                                d->new_ctrl |= *ctrl | peer_ctl;
2718                }
2719        }
2720        if (d->plr && d->plr->cbm > 0)
2721                used_b |= d->plr->cbm;
2722        unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1);
2723        unused_b &= BIT_MASK(r->cache.cbm_len) - 1;
2724        d->new_ctrl |= unused_b;
2725        /*
2726         * Force the initial CBM to be valid; the user can
2727         * modify the CBM based on system availability.
2728         */
2729        d->new_ctrl = cbm_ensure_valid(d->new_ctrl, r);
2730        /*
2731         * Assign the u32 CBM to an unsigned long to ensure that
2732         * bitmap_weight() does not access out-of-bound memory.
2733         */
2734        tmp_cbm = d->new_ctrl;
2735        if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) {
2736                rdt_last_cmd_printf("No space on %s:%d\n", r->name, d->id);
2737                return -ENOSPC;
2738        }
2739        d->have_new_ctrl = true;
2740
2741        return 0;
2742}
2743
2744/*
2745 * Initialize cache resources with default values.
2746 *
2747 * A new RDT group is being created on an allocation capable (CAT)
2748 * supporting system. Set this group up to start off with all usable
2749 * allocations.
2750 *
2751 * If there are no more shareable bits available on any domain then
2752 * the entire allocation will fail.
2753 */
2754static int rdtgroup_init_cat(struct rdt_resource *r, u32 closid)
2755{
2756        struct rdt_domain *d;
2757        int ret;
2758
2759        list_for_each_entry(d, &r->domains, list) {
2760                ret = __init_one_rdt_domain(d, r, closid);
2761                if (ret < 0)
2762                        return ret;
2763        }
2764
2765        return 0;
2766}
2767
2768/* Initialize MBA resource with default values. */
2769static void rdtgroup_init_mba(struct rdt_resource *r)
2770{
2771        struct rdt_domain *d;
2772
2773        list_for_each_entry(d, &r->domains, list) {
2774                d->new_ctrl = is_mba_sc(r) ? MBA_MAX_MBPS : r->default_ctrl;
2775                d->have_new_ctrl = true;
2776        }
2777}
2778
2779/* Initialize the RDT group's allocations. */
2780static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
2781{
2782        struct rdt_resource *r;
2783        int ret;
2784
2785        for_each_alloc_enabled_rdt_resource(r) {
2786                if (r->rid == RDT_RESOURCE_MBA) {
2787                        rdtgroup_init_mba(r);
2788                } else {
2789                        ret = rdtgroup_init_cat(r, rdtgrp->closid);
2790                        if (ret < 0)
2791                                return ret;
2792                }
2793
2794                ret = update_domains(r, rdtgrp->closid);
2795                if (ret < 0) {
2796                        rdt_last_cmd_puts("Failed to initialize allocations\n");
2797                        return ret;
2798                }
2799
2800        }
2801
2802        rdtgrp->mode = RDT_MODE_SHAREABLE;
2803
2804        return 0;
2805}
2806
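/*
 * Common preparation for creating a new resource group directory:
 * allocate and initialize the rdtgroup, create its kernfs directory and
 * base files, and, if monitoring is available, allocate an RMID and
 * create the mon_data hierarchy.
 */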
2807static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
2808                             const char *name, umode_t mode,
2809                             enum rdt_group_type rtype, struct rdtgroup **r)
2810{
2811        struct rdtgroup *prdtgrp, *rdtgrp;
2812        struct kernfs_node *kn;
2813        uint files = 0;
2814        int ret;
2815
2816        prdtgrp = rdtgroup_kn_lock_live(parent_kn);
2817        if (!prdtgrp) {
2818                ret = -ENODEV;
2819                goto out_unlock;
2820        }
2821
2822        if (rtype == RDTMON_GROUP &&
2823            (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
2824             prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) {
2825                ret = -EINVAL;
2826                rdt_last_cmd_puts("Pseudo-locking in progress\n");
2827                goto out_unlock;
2828        }
2829
2830        /* allocate the rdtgroup. */
2831        rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL);
2832        if (!rdtgrp) {
2833                ret = -ENOSPC;
2834                rdt_last_cmd_puts("Kernel out of memory\n");
2835                goto out_unlock;
2836        }
2837        *r = rdtgrp;
2838        rdtgrp->mon.parent = prdtgrp;
2839        rdtgrp->type = rtype;
2840        INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list);
2841
2842        /* kernfs creates the directory for rdtgrp */
2843        kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp);
2844        if (IS_ERR(kn)) {
2845                ret = PTR_ERR(kn);
2846                rdt_last_cmd_puts("kernfs create error\n");
2847                goto out_free_rgrp;
2848        }
2849        rdtgrp->kn = kn;
2850
2851        /*
2852         * kernfs_remove() will drop the reference count on "kn" which
2853         * will free it. But we still need it to stick around for the
2854         * rdtgroup_kn_unlock(kn) call below. Take one extra reference
2855         * here, which will be dropped inside rdtgroup_kn_unlock().
2856         */
2857        kernfs_get(kn);
2858
2859        ret = rdtgroup_kn_set_ugid(kn);
2860        if (ret) {
2861                rdt_last_cmd_puts("kernfs perm error\n");
2862                goto out_destroy;
2863        }
2864
2865        files = RFTYPE_BASE | BIT(RF_CTRLSHIFT + rtype);
2866        ret = rdtgroup_add_files(kn, files);
2867        if (ret) {
2868                rdt_last_cmd_puts("kernfs fill error\n");
2869                goto out_destroy;
2870        }
2871
2872        if (rdt_mon_capable) {
2873                ret = alloc_rmid();
2874                if (ret < 0) {
2875                        rdt_last_cmd_puts("Out of RMIDs\n");
2876                        goto out_destroy;
2877                }
2878                rdtgrp->mon.rmid = ret;
2879
2880                ret = mkdir_mondata_all(kn, rdtgrp, &rdtgrp->mon.mon_data_kn);
2881                if (ret) {
2882                        rdt_last_cmd_puts("kernfs subdir error\n");
2883                        goto out_idfree;
2884                }
2885        }
2886        kernfs_activate(kn);
2887
2888        /*
2889         * The caller unlocks the parent_kn upon success.
2890         */
2891        return 0;
2892
2893out_idfree:
2894        free_rmid(rdtgrp->mon.rmid);
2895out_destroy:
2896        kernfs_remove(rdtgrp->kn);
2897out_free_rgrp:
2898        kfree(rdtgrp);
2899out_unlock:
2900        rdtgroup_kn_unlock(parent_kn);
2901        return ret;
2902}
2903
2904static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp)
2905{
2906        kernfs_remove(rgrp->kn);
2907        free_rmid(rgrp->mon.rmid);
2908        kfree(rgrp);
2909}
2910
2911/*
2912 * Create a monitor group under the "mon_groups" directory of a control
2913 * and monitor group (ctrl_mon). This is a resource group
2914 * to monitor a subset of tasks and cpus in its parent ctrl_mon group.
2915 */
2916static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn,
2917                              const char *name, umode_t mode)
2918{
2919        struct rdtgroup *rdtgrp, *prgrp;
2920        int ret;
2921
2922        ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTMON_GROUP, &rdtgrp);
2923        if (ret)
2924                return ret;
2925
2926        prgrp = rdtgrp->mon.parent;
2927        rdtgrp->closid = prgrp->closid;
2928
2929        /*
2930         * Add the rdtgrp to the list of rdtgrps the parent
2931         * ctrl_mon group has to track.
2932         */
2933        list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list);
2934
2935        rdtgroup_kn_unlock(parent_kn);
2936        return ret;
2937}
2938
2939/*
2940 * These are rdtgroups created under the root directory. They can be
2941 * used to allocate and monitor resources.
2942 */
2943static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
2944                                   const char *name, umode_t mode)
2945{
2946        struct rdtgroup *rdtgrp;
2947        struct kernfs_node *kn;
2948        u32 closid;
2949        int ret;
2950
2951        ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTCTRL_GROUP, &rdtgrp);
2952        if (ret)
2953                return ret;
2954
2955        kn = rdtgrp->kn;
2956        ret = closid_alloc();
2957        if (ret < 0) {
2958                rdt_last_cmd_puts("Out of CLOSIDs\n");
2959                goto out_common_fail;
2960        }
2961        closid = ret;
2962        ret = 0;
2963
2964        rdtgrp->closid = closid;
2965        ret = rdtgroup_init_alloc(rdtgrp);
2966        if (ret < 0)
2967                goto out_id_free;
2968
2969        list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups);
2970
2971        if (rdt_mon_capable) {
2972                /*
2973                 * Create an empty mon_groups directory to hold the subset
2974                 * of tasks and cpus to monitor.
2975                 */
2976                ret = mongroup_create_dir(kn, rdtgrp, "mon_groups", NULL);
2977                if (ret) {
2978                        rdt_last_cmd_puts("kernfs subdir error\n");
2979                        goto out_del_list;
2980                }
2981        }
2982
2983        goto out_unlock;
2984
2985out_del_list:
2986        list_del(&rdtgrp->rdtgroup_list);
2987out_id_free:
2988        closid_free(closid);
2989out_common_fail:
2990        mkdir_rdt_prepare_clean(rdtgrp);
2991out_unlock:
2992        rdtgroup_kn_unlock(parent_kn);
2993        return ret;
2994}
2995
2996/*
2997 * We allow creating mon groups only within a directory called "mon_groups"
2998 * which is present in every ctrl_mon group. Check if this is a valid
2999 * "mon_groups" directory.
3000 *
3001 * 1. The directory should be named "mon_groups".
3002 * 2. The mon group itself should "not" be named "mon_groups".
3003 *   This makes sure "mon_groups" directory always has a ctrl_mon group
3004 *   as parent.
3005 */
3006static bool is_mon_groups(struct kernfs_node *kn, const char *name)
3007{
3008        return (!strcmp(kn->name, "mon_groups") &&
3009                strcmp(name, "mon_groups"));
3010}
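/*
 * Illustrative example (with a hypothetical group name "g1"): mkdir of
 * /sys/fs/resctrl/g1/mon_groups/m1 passes this check, while mkdir of
 * /sys/fs/resctrl/g1/mon_groups/mon_groups does not.
 */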
3011
3012static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
3013                          umode_t mode)
3014{
3015        /* Do not accept '\n' to avoid an unparsable situation. */
3016        if (strchr(name, '\n'))
3017                return -EINVAL;
3018
3019        /*
3020         * If the parent directory is the root directory and RDT
3021         * allocation is supported, add a control and monitoring
3022         * subdirectory
3023         */
3024        if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn)
3025                return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode);
3026
3027        /*
3028         * If RDT monitoring is supported and the parent directory is a valid
3029         * "mon_groups" directory, add a monitoring subdirectory.
3030         */
3031        if (rdt_mon_capable && is_mon_groups(parent_kn, name))
3032                return rdtgroup_mkdir_mon(parent_kn, name, mode);
3033
3034        return -EPERM;
3035}
3036
3037static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
3038                              cpumask_var_t tmpmask)
3039{
3040        struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
3041        int cpu;
3042
3043        /* Give any tasks back to the parent group */
3044        rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask);
3045
3046        /* Update per cpu rmid of the moved CPUs first */
3047        for_each_cpu(cpu, &rdtgrp->cpu_mask)
3048                per_cpu(pqr_state.default_rmid, cpu) = prdtgrp->mon.rmid;
3049        /*
3050         * Update the MSR on moved CPUs and CPUs which have moved
3051         * task running on them.
3052         */
3053        cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
3054        update_closid_rmid(tmpmask, NULL);
3055
3056        rdtgrp->flags = RDT_DELETED;
3057        free_rmid(rdtgrp->mon.rmid);
3058
3059        /*
3060         * Remove the rdtgrp from the parent ctrl_mon group's list
3061         */
3062        WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
3063        list_del(&rdtgrp->mon.crdtgrp_list);
3064
3065        /*
3066         * one extra hold on this, will drop when we kfree(rdtgrp)
3067         * in rdtgroup_kn_unlock()
3068         */
3069        kernfs_get(kn);
3070        kernfs_remove(rdtgrp->kn);
3071
3072        return 0;
3073}
3074
3075static int rdtgroup_ctrl_remove(struct kernfs_node *kn,
3076                                struct rdtgroup *rdtgrp)
3077{
3078        rdtgrp->flags = RDT_DELETED;
3079        list_del(&rdtgrp->rdtgroup_list);
3080
3081        /*
3082         * one extra hold on this, will drop when we kfree(rdtgrp)
3083         * in rdtgroup_kn_unlock()
3084         */
3085        kernfs_get(kn);
3086        kernfs_remove(rdtgrp->kn);
3087        return 0;
3088}
3089
3090static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
3091                               cpumask_var_t tmpmask)
3092{
3093        int cpu;
3094
3095        /* Give any tasks back to the default group */
3096        rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask);
3097
3098        /* Give any CPUs back to the default group */
3099        cpumask_or(&rdtgroup_default.cpu_mask,
3100                   &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
3101
3102        /* Update per cpu closid and rmid of the moved CPUs first */
3103        for_each_cpu(cpu, &rdtgrp->cpu_mask) {
3104                per_cpu(pqr_state.default_closid, cpu) = rdtgroup_default.closid;
3105                per_cpu(pqr_state.default_rmid, cpu) = rdtgroup_default.mon.rmid;
3106        }
3107
3108        /*
3109         * Update the MSR on moved CPUs and CPUs which have moved
3110         * task running on them.
3111         */
3112        cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
3113        update_closid_rmid(tmpmask, NULL);
3114
3115        closid_free(rdtgrp->closid);
3116        free_rmid(rdtgrp->mon.rmid);
3117
3118        rdtgroup_ctrl_remove(kn, rdtgrp);
3119
3120        /*
3121         * Free all the child monitor group rmids.
3122         */
3123        free_all_child_rdtgrp(rdtgrp);
3124
3125        return 0;
3126}
3127
3128static int rdtgroup_rmdir(struct kernfs_node *kn)
3129{
3130        struct kernfs_node *parent_kn = kn->parent;
3131        struct rdtgroup *rdtgrp;
3132        cpumask_var_t tmpmask;
3133        int ret = 0;
3134
3135        if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
3136                return -ENOMEM;
3137
3138        rdtgrp = rdtgroup_kn_lock_live(kn);
3139        if (!rdtgrp) {
3140                ret = -EPERM;
3141                goto out;
3142        }
3143
3144        /*
3145         * If the rdtgroup is a ctrl_mon group and parent directory
3146         * is the root directory, remove the ctrl_mon group.
3147         *
3148         * If the rdtgroup is a mon group and parent directory
3149         * is a valid "mon_groups" directory, remove the mon group.
3150         */
3151        if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn &&
3152            rdtgrp != &rdtgroup_default) {
3153                if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
3154                    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
3155                        ret = rdtgroup_ctrl_remove(kn, rdtgrp);
3156                } else {
3157                        ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask);
3158                }
3159        } else if (rdtgrp->type == RDTMON_GROUP &&
3160                 is_mon_groups(parent_kn, kn->name)) {
3161                ret = rdtgroup_rmdir_mon(kn, rdtgrp, tmpmask);
3162        } else {
3163                ret = -EPERM;
3164        }
3165
3166out:
3167        rdtgroup_kn_unlock(kn);
3168        free_cpumask_var(tmpmask);
3169        return ret;
3170}
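
/*
 * Editor's note: the user-visible effect of the checks in rdtgroup_rmdir()
 * above, with illustrative paths (assuming resctrl is mounted at
 * /sys/fs/resctrl and the groups exist):
 *
 *   # rmdir /sys/fs/resctrl/grp0                  ctrl_mon group: removed
 *   # rmdir /sys/fs/resctrl/grp0/mon_groups/mon0  MON group: removed
 *   # rmdir /sys/fs/resctrl/grp0/mon_groups       infrastructure dir: EPERM
 */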
3171
3172static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
3173{
3174        if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled)
3175                seq_puts(seq, ",cdp");
3176
3177        if (rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled)
3178                seq_puts(seq, ",cdpl2");
3179
3180        if (is_mba_sc(&rdt_resources_all[RDT_RESOURCE_MBA]))
3181                seq_puts(seq, ",mba_MBps");
3182
3183        return 0;
3184}
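
/*
 * Editor's note: these options are reflected back to userspace via
 * /proc/self/mounts when the corresponding features were enabled at mount
 * time, e.g. (illustrative output, exact flags may vary):
 *
 *   # mount -t resctrl -o cdp,mba_MBps resctrl /sys/fs/resctrl
 *   # grep resctrl /proc/self/mounts
 *   resctrl /sys/fs/resctrl resctrl rw,relatime,cdp,mba_MBps 0 0
 */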
3185
3186static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = {
3187        .mkdir          = rdtgroup_mkdir,
3188        .rmdir          = rdtgroup_rmdir,
3189        .show_options   = rdtgroup_show_options,
3190};
3191
3192static int __init rdtgroup_setup_root(void)
3193{
3194        int ret;
3195
3196        rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops,
3197                                      KERNFS_ROOT_CREATE_DEACTIVATED |
3198                                      KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK,
3199                                      &rdtgroup_default);
3200        if (IS_ERR(rdt_root))
3201                return PTR_ERR(rdt_root);
3202
3203        mutex_lock(&rdtgroup_mutex);
3204
3205        rdtgroup_default.closid = 0;
3206        rdtgroup_default.mon.rmid = 0;
3207        rdtgroup_default.type = RDTCTRL_GROUP;
3208        INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list);
3209
3210        list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups);
3211
3212        ret = rdtgroup_add_files(rdt_root->kn, RF_CTRL_BASE);
3213        if (ret) {
3214                kernfs_destroy_root(rdt_root);
3215                goto out;
3216        }
3217
3218        rdtgroup_default.kn = rdt_root->kn;
3219        kernfs_activate(rdtgroup_default.kn);
3220
3221out:
3222        mutex_unlock(&rdtgroup_mutex);
3223
3224        return ret;
3225}
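
/*
 * Editor's note: the root is created with KERNFS_ROOT_CREATE_DEACTIVATED
 * so that the RF_CTRL_BASE files (e.g. "tasks", "cpus", "schemata"; the
 * exact list is defined elsewhere in this file) stay invisible until
 * kernfs_activate() is called, i.e. userspace can never observe a
 * partially populated root directory.
 */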
3226
3227/*
3228 * rdtgroup_init - rdtgroup initialization
3229 *
3230 * Setup resctrl file system including set up root, create mount point,
3231 * register rdtgroup filesystem, and initialize files under root directory.
3232 *
3233 * Return: 0 on success or -errno
3234 */
3235int __init rdtgroup_init(void)
3236{
3237        int ret = 0;
3238
3239        seq_buf_init(&last_cmd_status, last_cmd_status_buf,
3240                     sizeof(last_cmd_status_buf));
3241
3242        ret = rdtgroup_setup_root();
3243        if (ret)
3244                return ret;
3245
3246        ret = sysfs_create_mount_point(fs_kobj, "resctrl");
3247        if (ret)
3248                goto cleanup_root;
3249
3250        ret = register_filesystem(&rdt_fs_type);
3251        if (ret)
3252                goto cleanup_mountpoint;
3253
3254        /*
3255         * Creating the resctrl debugfs directory here, rather than at mount
3256         * time, means it appears on debugfs before the resctrl filesystem
3257         * is mounted. That is acceptable (it even allows early RDT debugging)
3258         * and it avoids a lock ordering problem.
3259         *
3260         * If the directory were created in rdt_get_tree(), the creation would
3261         * run with rdtgroup_mutex held and would also take
3262         * &sb->s_type->i_mutex_key (the lockdep class of inode->i_rwsem):
3263         *      rdtgroup_mutex --> &sb->s_type->i_mutex_key
3264         * Other filesystem interactions (e.g. sys_getdents) establish:
3265         *      &sb->s_type->i_mutex_key --> &mm->mmap_lock
3266         * and mmap(), which is called with &mm->mmap_lock held, takes
3267         * rdtgroup_mutex:
3268         *      &mm->mmap_lock --> rdtgroup_mutex
3269         * Those three dependencies form a cycle that lockdep would report as
3270         * a potential deadlock. Creating the debugfs directory here avoids
3271         * the first dependency. (File operations cannot actually occur until
3272         * the filesystem is mounted, but there is no way to tell lockdep
3273         * that.)
3274         */
3275        debugfs_resctrl = debugfs_create_dir("resctrl", NULL);
3276
3277        return 0;
3278
3279cleanup_mountpoint:
3280        sysfs_remove_mount_point(fs_kobj, "resctrl");
3281cleanup_root:
3282        kernfs_destroy_root(rdt_root);
3283
3284        return ret;
3285}
3286
3287void __exit rdtgroup_exit(void)
3288{
3289        debugfs_remove_recursive(debugfs_resctrl);
3290        unregister_filesystem(&rdt_fs_type);
3291        sysfs_remove_mount_point(fs_kobj, "resctrl");
3292        kernfs_destroy_root(rdt_root);
3293}
3294