linux/arch/x86/kernel/cpu/resctrl/rdtgroup.c
   1/*
   2 * User interface for Resource Allocation in Resource Director Technology (RDT)
   3 *
   4 * Copyright (C) 2016 Intel Corporation
   5 *
   6 * Author: Fenghua Yu <fenghua.yu@intel.com>
   7 *
   8 * This program is free software; you can redistribute it and/or modify it
   9 * under the terms and conditions of the GNU General Public License,
  10 * version 2, as published by the Free Software Foundation.
  11 *
  12 * This program is distributed in the hope it will be useful, but WITHOUT
  13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  15 * more details.
  16 *
   17 * More information about RDT can be found in the Intel (R) x86 Architecture
   18 * Software Developer Manual.
  19 */
  20
  21#define pr_fmt(fmt)     KBUILD_MODNAME ": " fmt
  22
  23#include <linux/cacheinfo.h>
  24#include <linux/cpu.h>
  25#include <linux/debugfs.h>
  26#include <linux/fs.h>
  27#include <linux/sysfs.h>
  28#include <linux/kernfs.h>
  29#include <linux/seq_buf.h>
  30#include <linux/seq_file.h>
  31#include <linux/sched/signal.h>
  32#include <linux/sched/task.h>
  33#include <linux/slab.h>
  34#include <linux/task_work.h>
  35
  36#include <uapi/linux/magic.h>
  37
  38#include <asm/resctrl_sched.h>
  39#include "internal.h"
  40
  41DEFINE_STATIC_KEY_FALSE(rdt_enable_key);
  42DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key);
  43DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
  44static struct kernfs_root *rdt_root;
  45struct rdtgroup rdtgroup_default;
  46LIST_HEAD(rdt_all_groups);
  47
  48/* Kernel fs node for "info" directory under root */
  49static struct kernfs_node *kn_info;
  50
  51/* Kernel fs node for "mon_groups" directory under root */
  52static struct kernfs_node *kn_mongrp;
  53
  54/* Kernel fs node for "mon_data" directory under root */
  55static struct kernfs_node *kn_mondata;
  56
  57static struct seq_buf last_cmd_status;
  58static char last_cmd_status_buf[512];
  59
  60struct dentry *debugfs_resctrl;
  61
  62void rdt_last_cmd_clear(void)
  63{
  64        lockdep_assert_held(&rdtgroup_mutex);
  65        seq_buf_clear(&last_cmd_status);
  66}
  67
  68void rdt_last_cmd_puts(const char *s)
  69{
  70        lockdep_assert_held(&rdtgroup_mutex);
  71        seq_buf_puts(&last_cmd_status, s);
  72}
  73
  74void rdt_last_cmd_printf(const char *fmt, ...)
  75{
  76        va_list ap;
  77
  78        va_start(ap, fmt);
  79        lockdep_assert_held(&rdtgroup_mutex);
  80        seq_buf_vprintf(&last_cmd_status, fmt, ap);
  81        va_end(ap);
  82}
  83
  84/*
  85 * Trivial allocator for CLOSIDs. Since h/w only supports a small number,
  86 * we can keep a bitmap of free CLOSIDs in a single integer.
  87 *
  88 * Using a global CLOSID across all resources has some advantages and
  89 * some drawbacks:
  90 * + We can simply set "current->closid" to assign a task to a resource
  91 *   group.
  92 * + Context switch code can avoid extra memory references deciding which
  93 *   CLOSID to load into the PQR_ASSOC MSR
  94 * - We give up some options in configuring resource groups across multi-socket
  95 *   systems.
  96 * - Our choices on how to configure each resource become progressively more
  97 *   limited as the number of resources grows.
  98 */
  99static int closid_free_map;
 100static int closid_free_map_len;
 101
 102int closids_supported(void)
 103{
 104        return closid_free_map_len;
 105}
 106
 107static void closid_init(void)
 108{
 109        struct rdt_resource *r;
 110        int rdt_min_closid = 32;
 111
 112        /* Compute rdt_min_closid across all resources */
 113        for_each_alloc_enabled_rdt_resource(r)
 114                rdt_min_closid = min(rdt_min_closid, r->num_closid);
 115
 116        closid_free_map = BIT_MASK(rdt_min_closid) - 1;
 117
 118        /* CLOSID 0 is always reserved for the default group */
 119        closid_free_map &= ~1;
 120        closid_free_map_len = rdt_min_closid;
 121}
 122
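     /*
      * Example (a sketch, assuming 4 supported CLOSIDs): closid_init() sets
      * closid_free_map to 0b1110, keeping CLOSID 0 reserved for the default
      * group. closid_alloc() then hands out the lowest free CLOSID (1 first)
      * and closid_free() marks it available again.
      */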
 123static int closid_alloc(void)
 124{
 125        u32 closid = ffs(closid_free_map);
 126
 127        if (closid == 0)
 128                return -ENOSPC;
 129        closid--;
 130        closid_free_map &= ~(1 << closid);
 131
 132        return closid;
 133}
 134
 135void closid_free(int closid)
 136{
 137        closid_free_map |= 1 << closid;
 138}
 139
 140/**
 141 * closid_allocated - test if provided closid is in use
 142 * @closid: closid to be tested
 143 *
 144 * Return: true if @closid is currently associated with a resource group,
 145 * false if @closid is free
 146 */
 147static bool closid_allocated(unsigned int closid)
 148{
 149        return (closid_free_map & (1 << closid)) == 0;
 150}
 151
 152/**
 153 * rdtgroup_mode_by_closid - Return mode of resource group with closid
  154 * @closid: closid of the resource group
 155 *
 156 * Each resource group is associated with a @closid. Here the mode
 157 * of a resource group can be queried by searching for it using its closid.
 158 *
 159 * Return: mode as &enum rdtgrp_mode of resource group with closid @closid
 160 */
 161enum rdtgrp_mode rdtgroup_mode_by_closid(int closid)
 162{
 163        struct rdtgroup *rdtgrp;
 164
 165        list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
 166                if (rdtgrp->closid == closid)
 167                        return rdtgrp->mode;
 168        }
 169
 170        return RDT_NUM_MODES;
 171}
 172
 173static const char * const rdt_mode_str[] = {
 174        [RDT_MODE_SHAREABLE]            = "shareable",
 175        [RDT_MODE_EXCLUSIVE]            = "exclusive",
 176        [RDT_MODE_PSEUDO_LOCKSETUP]     = "pseudo-locksetup",
 177        [RDT_MODE_PSEUDO_LOCKED]        = "pseudo-locked",
 178};
 179
 180/**
 181 * rdtgroup_mode_str - Return the string representation of mode
  182 * @mode: the resource group mode as &enum rdtgrp_mode
 183 *
 184 * Return: string representation of valid mode, "unknown" otherwise
 185 */
 186static const char *rdtgroup_mode_str(enum rdtgrp_mode mode)
 187{
 188        if (mode < RDT_MODE_SHAREABLE || mode >= RDT_NUM_MODES)
 189                return "unknown";
 190
 191        return rdt_mode_str[mode];
 192}
 193
 194/* set uid and gid of rdtgroup dirs and files to that of the creator */
 195static int rdtgroup_kn_set_ugid(struct kernfs_node *kn)
 196{
 197        struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
 198                                .ia_uid = current_fsuid(),
 199                                .ia_gid = current_fsgid(), };
 200
 201        if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
 202            gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
 203                return 0;
 204
 205        return kernfs_setattr(kn, &iattr);
 206}
 207
 208static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft)
 209{
 210        struct kernfs_node *kn;
 211        int ret;
 212
 213        kn = __kernfs_create_file(parent_kn, rft->name, rft->mode,
 214                                  GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
 215                                  0, rft->kf_ops, rft, NULL, NULL);
 216        if (IS_ERR(kn))
 217                return PTR_ERR(kn);
 218
 219        ret = rdtgroup_kn_set_ugid(kn);
 220        if (ret) {
 221                kernfs_remove(kn);
 222                return ret;
 223        }
 224
 225        return 0;
 226}
 227
 228static int rdtgroup_seqfile_show(struct seq_file *m, void *arg)
 229{
 230        struct kernfs_open_file *of = m->private;
 231        struct rftype *rft = of->kn->priv;
 232
 233        if (rft->seq_show)
 234                return rft->seq_show(of, m, arg);
 235        return 0;
 236}
 237
 238static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf,
 239                                   size_t nbytes, loff_t off)
 240{
 241        struct rftype *rft = of->kn->priv;
 242
 243        if (rft->write)
 244                return rft->write(of, buf, nbytes, off);
 245
 246        return -EINVAL;
 247}
 248
 249static struct kernfs_ops rdtgroup_kf_single_ops = {
 250        .atomic_write_len       = PAGE_SIZE,
 251        .write                  = rdtgroup_file_write,
 252        .seq_show               = rdtgroup_seqfile_show,
 253};
 254
 255static struct kernfs_ops kf_mondata_ops = {
 256        .atomic_write_len       = PAGE_SIZE,
 257        .seq_show               = rdtgroup_mondata_show,
 258};
 259
 260static bool is_cpu_list(struct kernfs_open_file *of)
 261{
 262        struct rftype *rft = of->kn->priv;
 263
 264        return rft->flags & RFTYPE_FLAGS_CPUS_LIST;
 265}
 266
 267static int rdtgroup_cpus_show(struct kernfs_open_file *of,
 268                              struct seq_file *s, void *v)
 269{
 270        struct rdtgroup *rdtgrp;
 271        struct cpumask *mask;
 272        int ret = 0;
 273
 274        rdtgrp = rdtgroup_kn_lock_live(of->kn);
 275
 276        if (rdtgrp) {
 277                if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
 278                        if (!rdtgrp->plr->d) {
 279                                rdt_last_cmd_clear();
 280                                rdt_last_cmd_puts("Cache domain offline\n");
 281                                ret = -ENODEV;
 282                        } else {
 283                                mask = &rdtgrp->plr->d->cpu_mask;
 284                                seq_printf(s, is_cpu_list(of) ?
 285                                           "%*pbl\n" : "%*pb\n",
 286                                           cpumask_pr_args(mask));
 287                        }
 288                } else {
 289                        seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n",
 290                                   cpumask_pr_args(&rdtgrp->cpu_mask));
 291                }
 292        } else {
 293                ret = -ENOENT;
 294        }
 295        rdtgroup_kn_unlock(of->kn);
 296
 297        return ret;
 298}
 299
 300/*
 301 * This is safe against resctrl_sched_in() called from __switch_to()
 302 * because __switch_to() is executed with interrupts disabled. A local call
  303 * from update_closid_rmid() is protected against __switch_to() because
 304 * preemption is disabled.
 305 */
 306static void update_cpu_closid_rmid(void *info)
 307{
 308        struct rdtgroup *r = info;
 309
 310        if (r) {
 311                this_cpu_write(pqr_state.default_closid, r->closid);
 312                this_cpu_write(pqr_state.default_rmid, r->mon.rmid);
 313        }
 314
 315        /*
 316         * We cannot unconditionally write the MSR because the current
 317         * executing task might have its own closid selected. Just reuse
 318         * the context switch code.
 319         */
 320        resctrl_sched_in();
 321}
 322
 323/*
  324 * Update the PQR_ASSOC MSR on all cpus in @cpu_mask.
 325 *
 326 * Per task closids/rmids must have been set up before calling this function.
 327 */
 328static void
 329update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r)
 330{
 331        int cpu = get_cpu();
 332
 333        if (cpumask_test_cpu(cpu, cpu_mask))
 334                update_cpu_closid_rmid(r);
 335        smp_call_function_many(cpu_mask, update_cpu_closid_rmid, r, 1);
 336        put_cpu();
 337}
 338
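     /*
      * Update the CPU mask of monitor group @rdtgrp to @newmask. New CPUs
      * must belong to the parent control group; CPUs dropped from @rdtgrp
      * revert to the parent, and CPUs added are pulled from sibling monitor
      * groups. @tmpmask is caller-provided scratch space.
      */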
 339static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
 340                          cpumask_var_t tmpmask)
 341{
 342        struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp;
 343        struct list_head *head;
 344
 345        /* Check whether cpus belong to parent ctrl group */
 346        cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask);
 347        if (cpumask_weight(tmpmask)) {
 348                rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to parent\n");
 349                return -EINVAL;
 350        }
 351
 352        /* Check whether cpus are dropped from this group */
 353        cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
 354        if (cpumask_weight(tmpmask)) {
 355                /* Give any dropped cpus to parent rdtgroup */
 356                cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask);
 357                update_closid_rmid(tmpmask, prgrp);
 358        }
 359
 360        /*
 361         * If we added cpus, remove them from previous group that owned them
 362         * and update per-cpu rmid
 363         */
 364        cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
 365        if (cpumask_weight(tmpmask)) {
 366                head = &prgrp->mon.crdtgrp_list;
 367                list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
 368                        if (crgrp == rdtgrp)
 369                                continue;
 370                        cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask,
 371                                       tmpmask);
 372                }
 373                update_closid_rmid(tmpmask, rdtgrp);
 374        }
 375
 376        /* Done pushing/pulling - update this group with new mask */
 377        cpumask_copy(&rdtgrp->cpu_mask, newmask);
 378
 379        return 0;
 380}
 381
 382static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m)
 383{
 384        struct rdtgroup *crgrp;
 385
 386        cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m);
  387        /* update the child mon group masks as well */
 388        list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list)
 389                cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask);
 390}
 391
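     /*
      * Update the CPU mask of control group @rdtgrp to @newmask. CPUs dropped
      * from @rdtgrp fall back to the default group; CPUs added are removed
      * from whichever group (and its monitor children) owned them before.
      * @tmpmask and @tmpmask1 are caller-provided scratch space.
      */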
 392static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
 393                           cpumask_var_t tmpmask, cpumask_var_t tmpmask1)
 394{
 395        struct rdtgroup *r, *crgrp;
 396        struct list_head *head;
 397
 398        /* Check whether cpus are dropped from this group */
 399        cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
 400        if (cpumask_weight(tmpmask)) {
 401                /* Can't drop from default group */
 402                if (rdtgrp == &rdtgroup_default) {
 403                        rdt_last_cmd_puts("Can't drop CPUs from default group\n");
 404                        return -EINVAL;
 405                }
 406
 407                /* Give any dropped cpus to rdtgroup_default */
 408                cpumask_or(&rdtgroup_default.cpu_mask,
 409                           &rdtgroup_default.cpu_mask, tmpmask);
 410                update_closid_rmid(tmpmask, &rdtgroup_default);
 411        }
 412
 413        /*
 414         * If we added cpus, remove them from previous group and
 415         * the prev group's child groups that owned them
 416         * and update per-cpu closid/rmid.
 417         */
 418        cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
 419        if (cpumask_weight(tmpmask)) {
 420                list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) {
 421                        if (r == rdtgrp)
 422                                continue;
 423                        cpumask_and(tmpmask1, &r->cpu_mask, tmpmask);
 424                        if (cpumask_weight(tmpmask1))
 425                                cpumask_rdtgrp_clear(r, tmpmask1);
 426                }
 427                update_closid_rmid(tmpmask, rdtgrp);
 428        }
 429
 430        /* Done pushing/pulling - update this group with new mask */
 431        cpumask_copy(&rdtgrp->cpu_mask, newmask);
 432
 433        /*
 434         * Clear child mon group masks since there is a new parent mask
 435         * now and update the rmid for the cpus the child lost.
 436         */
 437        head = &rdtgrp->mon.crdtgrp_list;
 438        list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
 439                cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask);
 440                update_closid_rmid(tmpmask, rdtgrp);
 441                cpumask_clear(&crgrp->cpu_mask);
 442        }
 443
 444        return 0;
 445}
 446
 447static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
 448                                   char *buf, size_t nbytes, loff_t off)
 449{
 450        cpumask_var_t tmpmask, newmask, tmpmask1;
 451        struct rdtgroup *rdtgrp;
 452        int ret;
 453
 454        if (!buf)
 455                return -EINVAL;
 456
 457        if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
 458                return -ENOMEM;
 459        if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) {
 460                free_cpumask_var(tmpmask);
 461                return -ENOMEM;
 462        }
 463        if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) {
 464                free_cpumask_var(tmpmask);
 465                free_cpumask_var(newmask);
 466                return -ENOMEM;
 467        }
 468
 469        rdtgrp = rdtgroup_kn_lock_live(of->kn);
 470        rdt_last_cmd_clear();
 471        if (!rdtgrp) {
 472                ret = -ENOENT;
 473                rdt_last_cmd_puts("Directory was removed\n");
 474                goto unlock;
 475        }
 476
 477        if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
 478            rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
 479                ret = -EINVAL;
 480                rdt_last_cmd_puts("Pseudo-locking in progress\n");
 481                goto unlock;
 482        }
 483
 484        if (is_cpu_list(of))
 485                ret = cpulist_parse(buf, newmask);
 486        else
 487                ret = cpumask_parse(buf, newmask);
 488
 489        if (ret) {
 490                rdt_last_cmd_puts("Bad CPU list/mask\n");
 491                goto unlock;
 492        }
 493
 494        /* check that user didn't specify any offline cpus */
 495        cpumask_andnot(tmpmask, newmask, cpu_online_mask);
 496        if (cpumask_weight(tmpmask)) {
 497                ret = -EINVAL;
 498                rdt_last_cmd_puts("Can only assign online CPUs\n");
 499                goto unlock;
 500        }
 501
 502        if (rdtgrp->type == RDTCTRL_GROUP)
 503                ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1);
 504        else if (rdtgrp->type == RDTMON_GROUP)
 505                ret = cpus_mon_write(rdtgrp, newmask, tmpmask);
 506        else
 507                ret = -EINVAL;
 508
 509unlock:
 510        rdtgroup_kn_unlock(of->kn);
 511        free_cpumask_var(tmpmask);
 512        free_cpumask_var(newmask);
 513        free_cpumask_var(tmpmask1);
 514
 515        return ret ?: nbytes;
 516}
 517
 518struct task_move_callback {
 519        struct callback_head    work;
 520        struct rdtgroup         *rdtgrp;
 521};
 522
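     /*
      * Task work callback: runs in the context of the task being moved and
      * re-loads the PQR_ASSOC MSR via resctrl_sched_in(). It also drops the
      * rdtgroup refcount taken in __rdtgroup_move_task() and frees the group
      * if it was deleted while the callback was pending.
      */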
 523static void move_myself(struct callback_head *head)
 524{
 525        struct task_move_callback *callback;
 526        struct rdtgroup *rdtgrp;
 527
 528        callback = container_of(head, struct task_move_callback, work);
 529        rdtgrp = callback->rdtgrp;
 530
 531        /*
 532         * If resource group was deleted before this task work callback
 533         * was invoked, then assign the task to root group and free the
 534         * resource group.
 535         */
 536        if (atomic_dec_and_test(&rdtgrp->waitcount) &&
 537            (rdtgrp->flags & RDT_DELETED)) {
 538                current->closid = 0;
 539                current->rmid = 0;
 540                kfree(rdtgrp);
 541        }
 542
 543        preempt_disable();
 544        /* update PQR_ASSOC MSR to make resource group go into effect */
 545        resctrl_sched_in();
 546        preempt_enable();
 547
 548        kfree(callback);
 549}
 550
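     /*
      * Queue move_myself() as task work on @tsk so that the PQR_ASSOC MSR is
      * updated in the task's own context, then assign the new closid/rmid.
      * A reference on @rdtgrp is held until the callback has run.
      */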
 551static int __rdtgroup_move_task(struct task_struct *tsk,
 552                                struct rdtgroup *rdtgrp)
 553{
 554        struct task_move_callback *callback;
 555        int ret;
 556
 557        callback = kzalloc(sizeof(*callback), GFP_KERNEL);
 558        if (!callback)
 559                return -ENOMEM;
 560        callback->work.func = move_myself;
 561        callback->rdtgrp = rdtgrp;
 562
 563        /*
 564         * Take a refcount, so rdtgrp cannot be freed before the
 565         * callback has been invoked.
 566         */
 567        atomic_inc(&rdtgrp->waitcount);
 568        ret = task_work_add(tsk, &callback->work, true);
 569        if (ret) {
 570                /*
 571                 * Task is exiting. Drop the refcount and free the callback.
 572                 * No need to check the refcount as the group cannot be
 573                 * deleted before the write function unlocks rdtgroup_mutex.
 574                 */
 575                atomic_dec(&rdtgrp->waitcount);
 576                kfree(callback);
 577                rdt_last_cmd_puts("Task exited\n");
 578        } else {
 579                /*
 580                 * For ctrl_mon groups move both closid and rmid.
  581                 * For monitor groups, tasks can only be moved from
 582                 * their parent CTRL group.
 583                 */
 584                if (rdtgrp->type == RDTCTRL_GROUP) {
 585                        tsk->closid = rdtgrp->closid;
 586                        tsk->rmid = rdtgrp->mon.rmid;
 587                } else if (rdtgrp->type == RDTMON_GROUP) {
 588                        if (rdtgrp->mon.parent->closid == tsk->closid) {
 589                                tsk->rmid = rdtgrp->mon.rmid;
 590                        } else {
 591                                rdt_last_cmd_puts("Can't move task to different control group\n");
 592                                ret = -EINVAL;
 593                        }
 594                }
 595        }
 596        return ret;
 597}
 598
 599/**
 600 * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group
 601 * @r: Resource group
 602 *
 603 * Return: 1 if tasks have been assigned to @r, 0 otherwise
 604 */
 605int rdtgroup_tasks_assigned(struct rdtgroup *r)
 606{
 607        struct task_struct *p, *t;
 608        int ret = 0;
 609
 610        lockdep_assert_held(&rdtgroup_mutex);
 611
 612        rcu_read_lock();
 613        for_each_process_thread(p, t) {
 614                if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) ||
 615                    (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid)) {
 616                        ret = 1;
 617                        break;
 618                }
 619        }
 620        rcu_read_unlock();
 621
 622        return ret;
 623}
 624
 625static int rdtgroup_task_write_permission(struct task_struct *task,
 626                                          struct kernfs_open_file *of)
 627{
 628        const struct cred *tcred = get_task_cred(task);
 629        const struct cred *cred = current_cred();
 630        int ret = 0;
 631
 632        /*
 633         * Even if we're attaching all tasks in the thread group, we only
 634         * need to check permissions on one of them.
 635         */
 636        if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
 637            !uid_eq(cred->euid, tcred->uid) &&
 638            !uid_eq(cred->euid, tcred->suid)) {
 639                rdt_last_cmd_printf("No permission to move task %d\n", task->pid);
 640                ret = -EPERM;
 641        }
 642
 643        put_cred(tcred);
 644        return ret;
 645}
 646
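     /*
      * Look up the task for @pid (0 means the caller itself), verify that the
      * caller is allowed to move it, and hand it to __rdtgroup_move_task().
      */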
 647static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp,
 648                              struct kernfs_open_file *of)
 649{
 650        struct task_struct *tsk;
 651        int ret;
 652
 653        rcu_read_lock();
 654        if (pid) {
 655                tsk = find_task_by_vpid(pid);
 656                if (!tsk) {
 657                        rcu_read_unlock();
 658                        rdt_last_cmd_printf("No task %d\n", pid);
 659                        return -ESRCH;
 660                }
 661        } else {
 662                tsk = current;
 663        }
 664
 665        get_task_struct(tsk);
 666        rcu_read_unlock();
 667
 668        ret = rdtgroup_task_write_permission(tsk, of);
 669        if (!ret)
 670                ret = __rdtgroup_move_task(tsk, rdtgrp);
 671
 672        put_task_struct(tsk);
 673        return ret;
 674}
 675
 676static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of,
 677                                    char *buf, size_t nbytes, loff_t off)
 678{
 679        struct rdtgroup *rdtgrp;
 680        int ret = 0;
 681        pid_t pid;
 682
 683        if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
 684                return -EINVAL;
 685        rdtgrp = rdtgroup_kn_lock_live(of->kn);
 686        if (!rdtgrp) {
 687                rdtgroup_kn_unlock(of->kn);
 688                return -ENOENT;
 689        }
 690        rdt_last_cmd_clear();
 691
 692        if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
 693            rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
 694                ret = -EINVAL;
 695                rdt_last_cmd_puts("Pseudo-locking in progress\n");
 696                goto unlock;
 697        }
 698
 699        ret = rdtgroup_move_task(pid, rdtgrp, of);
 700
 701unlock:
 702        rdtgroup_kn_unlock(of->kn);
 703
 704        return ret ?: nbytes;
 705}
 706
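     /* Print the pid of every task currently assigned to resource group @r. */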
 707static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s)
 708{
 709        struct task_struct *p, *t;
 710
 711        rcu_read_lock();
 712        for_each_process_thread(p, t) {
 713                if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) ||
 714                    (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid))
 715                        seq_printf(s, "%d\n", t->pid);
 716        }
 717        rcu_read_unlock();
 718}
 719
 720static int rdtgroup_tasks_show(struct kernfs_open_file *of,
 721                               struct seq_file *s, void *v)
 722{
 723        struct rdtgroup *rdtgrp;
 724        int ret = 0;
 725
 726        rdtgrp = rdtgroup_kn_lock_live(of->kn);
 727        if (rdtgrp)
 728                show_rdt_tasks(rdtgrp, s);
 729        else
 730                ret = -ENOENT;
 731        rdtgroup_kn_unlock(of->kn);
 732
 733        return ret;
 734}
 735
 736static int rdt_last_cmd_status_show(struct kernfs_open_file *of,
 737                                    struct seq_file *seq, void *v)
 738{
 739        int len;
 740
 741        mutex_lock(&rdtgroup_mutex);
 742        len = seq_buf_used(&last_cmd_status);
 743        if (len)
 744                seq_printf(seq, "%.*s", len, last_cmd_status_buf);
 745        else
 746                seq_puts(seq, "ok\n");
 747        mutex_unlock(&rdtgroup_mutex);
 748        return 0;
 749}
 750
 751static int rdt_num_closids_show(struct kernfs_open_file *of,
 752                                struct seq_file *seq, void *v)
 753{
 754        struct rdt_resource *r = of->kn->parent->priv;
 755
 756        seq_printf(seq, "%d\n", r->num_closid);
 757        return 0;
 758}
 759
 760static int rdt_default_ctrl_show(struct kernfs_open_file *of,
 761                             struct seq_file *seq, void *v)
 762{
 763        struct rdt_resource *r = of->kn->parent->priv;
 764
 765        seq_printf(seq, "%x\n", r->default_ctrl);
 766        return 0;
 767}
 768
 769static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
 770                             struct seq_file *seq, void *v)
 771{
 772        struct rdt_resource *r = of->kn->parent->priv;
 773
 774        seq_printf(seq, "%u\n", r->cache.min_cbm_bits);
 775        return 0;
 776}
 777
 778static int rdt_shareable_bits_show(struct kernfs_open_file *of,
 779                                   struct seq_file *seq, void *v)
 780{
 781        struct rdt_resource *r = of->kn->parent->priv;
 782
 783        seq_printf(seq, "%x\n", r->cache.shareable_bits);
 784        return 0;
 785}
 786
 787/**
 788 * rdt_bit_usage_show - Display current usage of resources
 789 *
  790 * A domain is a shared resource that can be allocated differently among
  791 * resource groups. Here we display the current regions of the domain as an
  792 * annotated bitmask. For each domain of this resource, its allocation bitmask
  793 * is annotated as below to indicate the current usage of the corresponding bit:
 794 *   0 - currently unused
 795 *   X - currently available for sharing and used by software and hardware
 796 *   H - currently used by hardware only but available for software use
 797 *   S - currently used and shareable by software only
 798 *   E - currently used exclusively by one resource group
 799 *   P - currently pseudo-locked by one resource group
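      *
      * A hypothetical two-domain output for an L3 resource with a 16-bit CBM
      * might look like "0=SSSSSSSSSSSSSSSS;1=EEEEEEEESSSSSSSS".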
 800 */
 801static int rdt_bit_usage_show(struct kernfs_open_file *of,
 802                              struct seq_file *seq, void *v)
 803{
 804        struct rdt_resource *r = of->kn->parent->priv;
 805        u32 sw_shareable = 0, hw_shareable = 0;
 806        u32 exclusive = 0, pseudo_locked = 0;
 807        struct rdt_domain *dom;
 808        int i, hwb, swb, excl, psl;
 809        enum rdtgrp_mode mode;
 810        bool sep = false;
 811        u32 *ctrl;
 812
 813        mutex_lock(&rdtgroup_mutex);
 814        hw_shareable = r->cache.shareable_bits;
 815        list_for_each_entry(dom, &r->domains, list) {
 816                if (sep)
 817                        seq_putc(seq, ';');
 818                ctrl = dom->ctrl_val;
 819                sw_shareable = 0;
 820                exclusive = 0;
 821                seq_printf(seq, "%d=", dom->id);
 822                for (i = 0; i < closids_supported(); i++, ctrl++) {
 823                        if (!closid_allocated(i))
 824                                continue;
 825                        mode = rdtgroup_mode_by_closid(i);
 826                        switch (mode) {
 827                        case RDT_MODE_SHAREABLE:
 828                                sw_shareable |= *ctrl;
 829                                break;
 830                        case RDT_MODE_EXCLUSIVE:
 831                                exclusive |= *ctrl;
 832                                break;
 833                        case RDT_MODE_PSEUDO_LOCKSETUP:
 834                        /*
 835                         * RDT_MODE_PSEUDO_LOCKSETUP is possible
 836                         * here but not included since the CBM
 837                         * associated with this CLOSID in this mode
 838                         * is not initialized and no task or cpu can be
 839                         * assigned this CLOSID.
 840                         */
 841                                break;
 842                        case RDT_MODE_PSEUDO_LOCKED:
 843                        case RDT_NUM_MODES:
 844                                WARN(1,
 845                                     "invalid mode for closid %d\n", i);
 846                                break;
 847                        }
 848                }
 849                for (i = r->cache.cbm_len - 1; i >= 0; i--) {
 850                        pseudo_locked = dom->plr ? dom->plr->cbm : 0;
 851                        hwb = test_bit(i, (unsigned long *)&hw_shareable);
 852                        swb = test_bit(i, (unsigned long *)&sw_shareable);
 853                        excl = test_bit(i, (unsigned long *)&exclusive);
 854                        psl = test_bit(i, (unsigned long *)&pseudo_locked);
 855                        if (hwb && swb)
 856                                seq_putc(seq, 'X');
 857                        else if (hwb && !swb)
 858                                seq_putc(seq, 'H');
 859                        else if (!hwb && swb)
 860                                seq_putc(seq, 'S');
 861                        else if (excl)
 862                                seq_putc(seq, 'E');
 863                        else if (psl)
 864                                seq_putc(seq, 'P');
 865                        else /* Unused bits remain */
 866                                seq_putc(seq, '0');
 867                }
 868                sep = true;
 869        }
 870        seq_putc(seq, '\n');
 871        mutex_unlock(&rdtgroup_mutex);
 872        return 0;
 873}
 874
 875static int rdt_min_bw_show(struct kernfs_open_file *of,
 876                             struct seq_file *seq, void *v)
 877{
 878        struct rdt_resource *r = of->kn->parent->priv;
 879
 880        seq_printf(seq, "%u\n", r->membw.min_bw);
 881        return 0;
 882}
 883
 884static int rdt_num_rmids_show(struct kernfs_open_file *of,
 885                              struct seq_file *seq, void *v)
 886{
 887        struct rdt_resource *r = of->kn->parent->priv;
 888
 889        seq_printf(seq, "%d\n", r->num_rmid);
 890
 891        return 0;
 892}
 893
 894static int rdt_mon_features_show(struct kernfs_open_file *of,
 895                                 struct seq_file *seq, void *v)
 896{
 897        struct rdt_resource *r = of->kn->parent->priv;
 898        struct mon_evt *mevt;
 899
 900        list_for_each_entry(mevt, &r->evt_list, list)
 901                seq_printf(seq, "%s\n", mevt->name);
 902
 903        return 0;
 904}
 905
 906static int rdt_bw_gran_show(struct kernfs_open_file *of,
 907                             struct seq_file *seq, void *v)
 908{
 909        struct rdt_resource *r = of->kn->parent->priv;
 910
 911        seq_printf(seq, "%u\n", r->membw.bw_gran);
 912        return 0;
 913}
 914
 915static int rdt_delay_linear_show(struct kernfs_open_file *of,
 916                             struct seq_file *seq, void *v)
 917{
 918        struct rdt_resource *r = of->kn->parent->priv;
 919
 920        seq_printf(seq, "%u\n", r->membw.delay_linear);
 921        return 0;
 922}
 923
 924static int max_threshold_occ_show(struct kernfs_open_file *of,
 925                                  struct seq_file *seq, void *v)
 926{
 927        struct rdt_resource *r = of->kn->parent->priv;
 928
 929        seq_printf(seq, "%u\n", resctrl_cqm_threshold * r->mon_scale);
 930
 931        return 0;
 932}
 933
 934static ssize_t max_threshold_occ_write(struct kernfs_open_file *of,
 935                                       char *buf, size_t nbytes, loff_t off)
 936{
 937        struct rdt_resource *r = of->kn->parent->priv;
 938        unsigned int bytes;
 939        int ret;
 940
 941        ret = kstrtouint(buf, 0, &bytes);
 942        if (ret)
 943                return ret;
 944
 945        if (bytes > (boot_cpu_data.x86_cache_size * 1024))
 946                return -EINVAL;
 947
 948        resctrl_cqm_threshold = bytes / r->mon_scale;
 949
 950        return nbytes;
 951}
 952
 953/*
 954 * rdtgroup_mode_show - Display mode of this resource group
 955 */
 956static int rdtgroup_mode_show(struct kernfs_open_file *of,
 957                              struct seq_file *s, void *v)
 958{
 959        struct rdtgroup *rdtgrp;
 960
 961        rdtgrp = rdtgroup_kn_lock_live(of->kn);
 962        if (!rdtgrp) {
 963                rdtgroup_kn_unlock(of->kn);
 964                return -ENOENT;
 965        }
 966
 967        seq_printf(s, "%s\n", rdtgroup_mode_str(rdtgrp->mode));
 968
 969        rdtgroup_kn_unlock(of->kn);
 970        return 0;
 971}
 972
 973/**
 974 * rdt_cdp_peer_get - Retrieve CDP peer if it exists
 975 * @r: RDT resource to which RDT domain @d belongs
 976 * @d: Cache instance for which a CDP peer is requested
 977 * @r_cdp: RDT resource that shares hardware with @r (RDT resource peer)
 978 *         Used to return the result.
 979 * @d_cdp: RDT domain that shares hardware with @d (RDT domain peer)
 980 *         Used to return the result.
 981 *
 982 * RDT resources are managed independently and by extension the RDT domains
 983 * (RDT resource instances) are managed independently also. The Code and
 984 * Data Prioritization (CDP) RDT resources, while managed independently,
 985 * could refer to the same underlying hardware. For example,
 986 * RDT_RESOURCE_L2CODE and RDT_RESOURCE_L2DATA both refer to the L2 cache.
 987 *
  988 * When provided with an RDT resource @r and an instance @d of that
  989 * resource, rdt_cdp_peer_get() will return whether there is a peer RDT
  990 * resource and, if so, the exact instance that shares the same hardware.
 991 *
 992 * Return: 0 if a CDP peer was found, <0 on error or if no CDP peer exists.
 993 *         If a CDP peer was found, @r_cdp will point to the peer RDT resource
 994 *         and @d_cdp will point to the peer RDT domain.
 995 */
 996static int rdt_cdp_peer_get(struct rdt_resource *r, struct rdt_domain *d,
 997                            struct rdt_resource **r_cdp,
 998                            struct rdt_domain **d_cdp)
 999{
1000        struct rdt_resource *_r_cdp = NULL;
1001        struct rdt_domain *_d_cdp = NULL;
1002        int ret = 0;
1003
1004        switch (r->rid) {
1005        case RDT_RESOURCE_L3DATA:
1006                _r_cdp = &rdt_resources_all[RDT_RESOURCE_L3CODE];
1007                break;
1008        case RDT_RESOURCE_L3CODE:
1009                _r_cdp =  &rdt_resources_all[RDT_RESOURCE_L3DATA];
1010                break;
1011        case RDT_RESOURCE_L2DATA:
1012                _r_cdp =  &rdt_resources_all[RDT_RESOURCE_L2CODE];
1013                break;
1014        case RDT_RESOURCE_L2CODE:
1015                _r_cdp =  &rdt_resources_all[RDT_RESOURCE_L2DATA];
1016                break;
1017        default:
1018                ret = -ENOENT;
1019                goto out;
1020        }
1021
1022        /*
1023         * When a new CPU comes online and CDP is enabled then the new
1024         * RDT domains (if any) associated with both CDP RDT resources
1025         * are added in the same CPU online routine while the
1026         * rdtgroup_mutex is held. It should thus not happen for one
1027         * RDT domain to exist and be associated with its RDT CDP
1028         * resource but there is no RDT domain associated with the
1029         * peer RDT CDP resource. Hence the WARN.
1030         */
1031        _d_cdp = rdt_find_domain(_r_cdp, d->id, NULL);
1032        if (WARN_ON(IS_ERR_OR_NULL(_d_cdp))) {
1033                _r_cdp = NULL;
1034                ret = -EINVAL;
1035        }
1036
1037out:
1038        *r_cdp = _r_cdp;
1039        *d_cdp = _d_cdp;
1040
1041        return ret;
1042}
1043
1044/**
1045 * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other
1046 * @r: Resource to which domain instance @d belongs.
1047 * @d: The domain instance for which @closid is being tested.
1048 * @cbm: Capacity bitmask being tested.
1049 * @closid: Intended closid for @cbm.
1050 * @exclusive: Only check if overlaps with exclusive resource groups
1051 *
1052 * Checks if provided @cbm intended to be used for @closid on domain
1053 * @d overlaps with any other closids or other hardware usage associated
1054 * with this domain. If @exclusive is true then only overlaps with
1055 * resource groups in exclusive mode will be considered. If @exclusive
1056 * is false then overlaps with any resource group or hardware entities
1057 * will be considered.
1058 *
1059 * @cbm is unsigned long, even if only 32 bits are used, to make the
1060 * bitmap functions work correctly.
1061 *
1062 * Return: false if CBM does not overlap, true if it does.
1063 */
1064static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d,
1065                                    unsigned long cbm, int closid, bool exclusive)
1066{
1067        enum rdtgrp_mode mode;
1068        unsigned long ctrl_b;
1069        u32 *ctrl;
1070        int i;
1071
1072        /* Check for any overlap with regions used by hardware directly */
1073        if (!exclusive) {
1074                ctrl_b = r->cache.shareable_bits;
1075                if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len))
1076                        return true;
1077        }
1078
1079        /* Check for overlap with other resource groups */
1080        ctrl = d->ctrl_val;
1081        for (i = 0; i < closids_supported(); i++, ctrl++) {
1082                ctrl_b = *ctrl;
1083                mode = rdtgroup_mode_by_closid(i);
1084                if (closid_allocated(i) && i != closid &&
1085                    mode != RDT_MODE_PSEUDO_LOCKSETUP) {
1086                        if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) {
1087                                if (exclusive) {
1088                                        if (mode == RDT_MODE_EXCLUSIVE)
1089                                                return true;
1090                                        continue;
1091                                }
1092                                return true;
1093                        }
1094                }
1095        }
1096
1097        return false;
1098}
1099
1100/**
1101 * rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware
1102 * @r: Resource to which domain instance @d belongs.
1103 * @d: The domain instance for which @closid is being tested.
1104 * @cbm: Capacity bitmask being tested.
1105 * @closid: Intended closid for @cbm.
1106 * @exclusive: Only check if overlaps with exclusive resource groups
1107 *
1108 * Resources that can be allocated using a CBM can use the CBM to control
 1109 * the overlap of these allocations. rdtgroup_cbm_overlaps() is the test
1110 * for overlap. Overlap test is not limited to the specific resource for
1111 * which the CBM is intended though - when dealing with CDP resources that
1112 * share the underlying hardware the overlap check should be performed on
1113 * the CDP resource sharing the hardware also.
1114 *
1115 * Refer to description of __rdtgroup_cbm_overlaps() for the details of the
1116 * overlap test.
1117 *
1118 * Return: true if CBM overlap detected, false if there is no overlap
1119 */
1120bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d,
1121                           unsigned long cbm, int closid, bool exclusive)
1122{
1123        struct rdt_resource *r_cdp;
1124        struct rdt_domain *d_cdp;
1125
1126        if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, exclusive))
1127                return true;
1128
1129        if (rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp) < 0)
1130                return false;
1131
1132        return  __rdtgroup_cbm_overlaps(r_cdp, d_cdp, cbm, closid, exclusive);
1133}
1134
1135/**
1136 * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive
1137 *
1138 * An exclusive resource group implies that there should be no sharing of
1139 * its allocated resources. At the time this group is considered to be
 1140 * exclusive, this test can determine if its current schemata supports this
1141 * setting by testing for overlap with all other resource groups.
1142 *
1143 * Return: true if resource group can be exclusive, false if there is overlap
1144 * with allocations of other resource groups and thus this resource group
1145 * cannot be exclusive.
1146 */
1147static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp)
1148{
1149        int closid = rdtgrp->closid;
1150        struct rdt_resource *r;
1151        bool has_cache = false;
1152        struct rdt_domain *d;
1153
1154        for_each_alloc_enabled_rdt_resource(r) {
1155                if (r->rid == RDT_RESOURCE_MBA)
1156                        continue;
1157                has_cache = true;
1158                list_for_each_entry(d, &r->domains, list) {
1159                        if (rdtgroup_cbm_overlaps(r, d, d->ctrl_val[closid],
1160                                                  rdtgrp->closid, false)) {
1161                                rdt_last_cmd_puts("Schemata overlaps\n");
1162                                return false;
1163                        }
1164                }
1165        }
1166
1167        if (!has_cache) {
1168                rdt_last_cmd_puts("Cannot be exclusive without CAT/CDP\n");
1169                return false;
1170        }
1171
1172        return true;
1173}
1174
1175/**
1176 * rdtgroup_mode_write - Modify the resource group's mode
1177 *
1178 */
1179static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of,
1180                                   char *buf, size_t nbytes, loff_t off)
1181{
1182        struct rdtgroup *rdtgrp;
1183        enum rdtgrp_mode mode;
1184        int ret = 0;
1185
1186        /* Valid input requires a trailing newline */
1187        if (nbytes == 0 || buf[nbytes - 1] != '\n')
1188                return -EINVAL;
1189        buf[nbytes - 1] = '\0';
1190
1191        rdtgrp = rdtgroup_kn_lock_live(of->kn);
1192        if (!rdtgrp) {
1193                rdtgroup_kn_unlock(of->kn);
1194                return -ENOENT;
1195        }
1196
1197        rdt_last_cmd_clear();
1198
1199        mode = rdtgrp->mode;
1200
1201        if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) ||
1202            (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE) ||
1203            (!strcmp(buf, "pseudo-locksetup") &&
1204             mode == RDT_MODE_PSEUDO_LOCKSETUP) ||
1205            (!strcmp(buf, "pseudo-locked") && mode == RDT_MODE_PSEUDO_LOCKED))
1206                goto out;
1207
1208        if (mode == RDT_MODE_PSEUDO_LOCKED) {
1209                rdt_last_cmd_puts("Cannot change pseudo-locked group\n");
1210                ret = -EINVAL;
1211                goto out;
1212        }
1213
1214        if (!strcmp(buf, "shareable")) {
1215                if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
1216                        ret = rdtgroup_locksetup_exit(rdtgrp);
1217                        if (ret)
1218                                goto out;
1219                }
1220                rdtgrp->mode = RDT_MODE_SHAREABLE;
1221        } else if (!strcmp(buf, "exclusive")) {
1222                if (!rdtgroup_mode_test_exclusive(rdtgrp)) {
1223                        ret = -EINVAL;
1224                        goto out;
1225                }
1226                if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
1227                        ret = rdtgroup_locksetup_exit(rdtgrp);
1228                        if (ret)
1229                                goto out;
1230                }
1231                rdtgrp->mode = RDT_MODE_EXCLUSIVE;
1232        } else if (!strcmp(buf, "pseudo-locksetup")) {
1233                ret = rdtgroup_locksetup_enter(rdtgrp);
1234                if (ret)
1235                        goto out;
1236                rdtgrp->mode = RDT_MODE_PSEUDO_LOCKSETUP;
1237        } else {
1238                rdt_last_cmd_puts("Unknown or unsupported mode\n");
1239                ret = -EINVAL;
1240        }
1241
1242out:
1243        rdtgroup_kn_unlock(of->kn);
1244        return ret ?: nbytes;
1245}
1246
1247/**
1248 * rdtgroup_cbm_to_size - Translate CBM to size in bytes
1249 * @r: RDT resource to which @d belongs.
1250 * @d: RDT domain instance.
1251 * @cbm: bitmask for which the size should be computed.
1252 *
 1253 * The provided bitmask, associated with the RDT domain instance @d, will be
1254 * translated into how many bytes it represents. The size in bytes is
1255 * computed by first dividing the total cache size by the CBM length to
1256 * determine how many bytes each bit in the bitmask represents. The result
1257 * is multiplied with the number of bits set in the bitmask.
1258 *
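      * For example (hypothetical numbers): with a 20-bit CBM on a 20 MB cache
      * each bit represents 1 MB, so a bitmask with 4 bits set translates to
      * 4 MB.
      *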
 1259 * @cbm is unsigned long, even if only 32 bits are used, to make the
1260 * bitmap functions work correctly.
1261 */
1262unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r,
1263                                  struct rdt_domain *d, unsigned long cbm)
1264{
1265        struct cpu_cacheinfo *ci;
1266        unsigned int size = 0;
1267        int num_b, i;
1268
1269        num_b = bitmap_weight(&cbm, r->cache.cbm_len);
1270        ci = get_cpu_cacheinfo(cpumask_any(&d->cpu_mask));
1271        for (i = 0; i < ci->num_leaves; i++) {
1272                if (ci->info_list[i].level == r->cache_level) {
1273                        size = ci->info_list[i].size / r->cache.cbm_len * num_b;
1274                        break;
1275                }
1276        }
1277
1278        return size;
1279}
1280
1281/**
1282 * rdtgroup_size_show - Display size in bytes of allocated regions
1283 *
1284 * The "size" file mirrors the layout of the "schemata" file, printing the
1285 * size in bytes of each region instead of the capacity bitmask.
1286 *
1287 */
1288static int rdtgroup_size_show(struct kernfs_open_file *of,
1289                              struct seq_file *s, void *v)
1290{
1291        struct rdtgroup *rdtgrp;
1292        struct rdt_resource *r;
1293        struct rdt_domain *d;
1294        unsigned int size;
1295        int ret = 0;
1296        bool sep;
1297        u32 ctrl;
1298
1299        rdtgrp = rdtgroup_kn_lock_live(of->kn);
1300        if (!rdtgrp) {
1301                rdtgroup_kn_unlock(of->kn);
1302                return -ENOENT;
1303        }
1304
1305        if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
1306                if (!rdtgrp->plr->d) {
1307                        rdt_last_cmd_clear();
1308                        rdt_last_cmd_puts("Cache domain offline\n");
1309                        ret = -ENODEV;
1310                } else {
1311                        seq_printf(s, "%*s:", max_name_width,
1312                                   rdtgrp->plr->r->name);
1313                        size = rdtgroup_cbm_to_size(rdtgrp->plr->r,
1314                                                    rdtgrp->plr->d,
1315                                                    rdtgrp->plr->cbm);
1316                        seq_printf(s, "%d=%u\n", rdtgrp->plr->d->id, size);
1317                }
1318                goto out;
1319        }
1320
1321        for_each_alloc_enabled_rdt_resource(r) {
1322                sep = false;
1323                seq_printf(s, "%*s:", max_name_width, r->name);
1324                list_for_each_entry(d, &r->domains, list) {
1325                        if (sep)
1326                                seq_putc(s, ';');
1327                        if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
1328                                size = 0;
1329                        } else {
1330                                ctrl = (!is_mba_sc(r) ?
1331                                                d->ctrl_val[rdtgrp->closid] :
1332                                                d->mbps_val[rdtgrp->closid]);
1333                                if (r->rid == RDT_RESOURCE_MBA)
1334                                        size = ctrl;
1335                                else
1336                                        size = rdtgroup_cbm_to_size(r, d, ctrl);
1337                        }
1338                        seq_printf(s, "%d=%u", d->id, size);
1339                        sep = true;
1340                }
1341                seq_putc(s, '\n');
1342        }
1343
1344out:
1345        rdtgroup_kn_unlock(of->kn);
1346
1347        return ret;
1348}
1349
1350/* rdtgroup information files for one cache resource. */
1351static struct rftype res_common_files[] = {
1352        {
1353                .name           = "last_cmd_status",
1354                .mode           = 0444,
1355                .kf_ops         = &rdtgroup_kf_single_ops,
1356                .seq_show       = rdt_last_cmd_status_show,
1357                .fflags         = RF_TOP_INFO,
1358        },
1359        {
1360                .name           = "num_closids",
1361                .mode           = 0444,
1362                .kf_ops         = &rdtgroup_kf_single_ops,
1363                .seq_show       = rdt_num_closids_show,
1364                .fflags         = RF_CTRL_INFO,
1365        },
1366        {
1367                .name           = "mon_features",
1368                .mode           = 0444,
1369                .kf_ops         = &rdtgroup_kf_single_ops,
1370                .seq_show       = rdt_mon_features_show,
1371                .fflags         = RF_MON_INFO,
1372        },
1373        {
1374                .name           = "num_rmids",
1375                .mode           = 0444,
1376                .kf_ops         = &rdtgroup_kf_single_ops,
1377                .seq_show       = rdt_num_rmids_show,
1378                .fflags         = RF_MON_INFO,
1379        },
1380        {
1381                .name           = "cbm_mask",
1382                .mode           = 0444,
1383                .kf_ops         = &rdtgroup_kf_single_ops,
1384                .seq_show       = rdt_default_ctrl_show,
1385                .fflags         = RF_CTRL_INFO | RFTYPE_RES_CACHE,
1386        },
1387        {
1388                .name           = "min_cbm_bits",
1389                .mode           = 0444,
1390                .kf_ops         = &rdtgroup_kf_single_ops,
1391                .seq_show       = rdt_min_cbm_bits_show,
1392                .fflags         = RF_CTRL_INFO | RFTYPE_RES_CACHE,
1393        },
1394        {
1395                .name           = "shareable_bits",
1396                .mode           = 0444,
1397                .kf_ops         = &rdtgroup_kf_single_ops,
1398                .seq_show       = rdt_shareable_bits_show,
1399                .fflags         = RF_CTRL_INFO | RFTYPE_RES_CACHE,
1400        },
1401        {
1402                .name           = "bit_usage",
1403                .mode           = 0444,
1404                .kf_ops         = &rdtgroup_kf_single_ops,
1405                .seq_show       = rdt_bit_usage_show,
1406                .fflags         = RF_CTRL_INFO | RFTYPE_RES_CACHE,
1407        },
1408        {
1409                .name           = "min_bandwidth",
1410                .mode           = 0444,
1411                .kf_ops         = &rdtgroup_kf_single_ops,
1412                .seq_show       = rdt_min_bw_show,
1413                .fflags         = RF_CTRL_INFO | RFTYPE_RES_MB,
1414        },
1415        {
1416                .name           = "bandwidth_gran",
1417                .mode           = 0444,
1418                .kf_ops         = &rdtgroup_kf_single_ops,
1419                .seq_show       = rdt_bw_gran_show,
1420                .fflags         = RF_CTRL_INFO | RFTYPE_RES_MB,
1421        },
1422        {
1423                .name           = "delay_linear",
1424                .mode           = 0444,
1425                .kf_ops         = &rdtgroup_kf_single_ops,
1426                .seq_show       = rdt_delay_linear_show,
1427                .fflags         = RF_CTRL_INFO | RFTYPE_RES_MB,
1428        },
1429        {
1430                .name           = "max_threshold_occupancy",
1431                .mode           = 0644,
1432                .kf_ops         = &rdtgroup_kf_single_ops,
1433                .write          = max_threshold_occ_write,
1434                .seq_show       = max_threshold_occ_show,
1435                .fflags         = RF_MON_INFO | RFTYPE_RES_CACHE,
1436        },
1437        {
1438                .name           = "cpus",
1439                .mode           = 0644,
1440                .kf_ops         = &rdtgroup_kf_single_ops,
1441                .write          = rdtgroup_cpus_write,
1442                .seq_show       = rdtgroup_cpus_show,
1443                .fflags         = RFTYPE_BASE,
1444        },
1445        {
1446                .name           = "cpus_list",
1447                .mode           = 0644,
1448                .kf_ops         = &rdtgroup_kf_single_ops,
1449                .write          = rdtgroup_cpus_write,
1450                .seq_show       = rdtgroup_cpus_show,
1451                .flags          = RFTYPE_FLAGS_CPUS_LIST,
1452                .fflags         = RFTYPE_BASE,
1453        },
1454        {
1455                .name           = "tasks",
1456                .mode           = 0644,
1457                .kf_ops         = &rdtgroup_kf_single_ops,
1458                .write          = rdtgroup_tasks_write,
1459                .seq_show       = rdtgroup_tasks_show,
1460                .fflags         = RFTYPE_BASE,
1461        },
1462        {
1463                .name           = "schemata",
1464                .mode           = 0644,
1465                .kf_ops         = &rdtgroup_kf_single_ops,
1466                .write          = rdtgroup_schemata_write,
1467                .seq_show       = rdtgroup_schemata_show,
1468                .fflags         = RF_CTRL_BASE,
1469        },
1470        {
1471                .name           = "mode",
1472                .mode           = 0644,
1473                .kf_ops         = &rdtgroup_kf_single_ops,
1474                .write          = rdtgroup_mode_write,
1475                .seq_show       = rdtgroup_mode_show,
1476                .fflags         = RF_CTRL_BASE,
1477        },
1478        {
1479                .name           = "size",
1480                .mode           = 0444,
1481                .kf_ops         = &rdtgroup_kf_single_ops,
1482                .seq_show       = rdtgroup_size_show,
1483                .fflags         = RF_CTRL_BASE,
1484        },
1485
1486};
1487
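     /*
      * Create every file in res_common_files whose flags are all present in
      * @fflags under directory @kn. On failure, remove the files added so far.
      */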
1488static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags)
1489{
1490        struct rftype *rfts, *rft;
1491        int ret, len;
1492
1493        rfts = res_common_files;
1494        len = ARRAY_SIZE(res_common_files);
1495
1496        lockdep_assert_held(&rdtgroup_mutex);
1497
1498        for (rft = rfts; rft < rfts + len; rft++) {
1499                if ((fflags & rft->fflags) == rft->fflags) {
1500                        ret = rdtgroup_add_file(kn, rft);
1501                        if (ret)
1502                                goto error;
1503                }
1504        }
1505
1506        return 0;
1507error:
1508        pr_warn("Failed to add %s, err=%d\n", rft->name, ret);
1509        while (--rft >= rfts) {
1510                if ((fflags & rft->fflags) == rft->fflags)
1511                        kernfs_remove_by_name(kn, rft->name);
1512        }
1513        return ret;
1514}
1515
1516/**
1517 * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file
1518 * @r: The resource group with which the file is associated.
1519 * @name: Name of the file
1520 *
1521 * The permissions of named resctrl file, directory, or link are modified
1522 * to not allow read, write, or execute by any user.
1523 *
1524 * WARNING: This function is intended to communicate to the user that the
1525 * resctrl file has been locked down - that it is not relevant to the
1526 * particular state the system finds itself in. It should not be relied
1527 * on to protect from user access because after the file's permissions
1528 * are restricted the user can still change the permissions using chmod
1529 * from the command line.
1530 *
1531 * Return: 0 on success, <0 on failure.
1532 */
1533int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name)
1534{
1535        struct iattr iattr = {.ia_valid = ATTR_MODE,};
1536        struct kernfs_node *kn;
1537        int ret = 0;
1538
1539        kn = kernfs_find_and_get_ns(r->kn, name, NULL);
1540        if (!kn)
1541                return -ENOENT;
1542
1543        switch (kernfs_type(kn)) {
1544        case KERNFS_DIR:
1545                iattr.ia_mode = S_IFDIR;
1546                break;
1547        case KERNFS_FILE:
1548                iattr.ia_mode = S_IFREG;
1549                break;
1550        case KERNFS_LINK:
1551                iattr.ia_mode = S_IFLNK;
1552                break;
1553        }
1554
1555        ret = kernfs_setattr(kn, &iattr);
1556        kernfs_put(kn);
1557        return ret;
1558}
1559
1560/**
1561 * rdtgroup_kn_mode_restore - Restore user access to named resctrl file
1562 * @r: The resource group with which the file is associated.
1563 * @name: Name of the file
1564 * @mask: Mask of permissions that should be restored
1565 *
1566 * Restore the permissions of the named file. If @name is a directory the
1567 * permissions of its parent will be used.
1568 *
1569 * Return: 0 on success, <0 on failure.
1570 */
1571int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
1572                             umode_t mask)
1573{
1574        struct iattr iattr = {.ia_valid = ATTR_MODE,};
1575        struct kernfs_node *kn, *parent;
1576        struct rftype *rfts, *rft;
1577        int ret, len;
1578
1579        rfts = res_common_files;
1580        len = ARRAY_SIZE(res_common_files);
1581
1582        for (rft = rfts; rft < rfts + len; rft++) {
1583                if (!strcmp(rft->name, name))
1584                        iattr.ia_mode = rft->mode & mask;
1585        }
1586
1587        kn = kernfs_find_and_get_ns(r->kn, name, NULL);
1588        if (!kn)
1589                return -ENOENT;
1590
1591        switch (kernfs_type(kn)) {
1592        case KERNFS_DIR:
1593                parent = kernfs_get_parent(kn);
1594                if (parent) {
1595                        iattr.ia_mode |= parent->mode;
1596                        kernfs_put(parent);
1597                }
1598                iattr.ia_mode |= S_IFDIR;
1599                break;
1600        case KERNFS_FILE:
1601                iattr.ia_mode |= S_IFREG;
1602                break;
1603        case KERNFS_LINK:
1604                iattr.ia_mode |= S_IFLNK;
1605                break;
1606        }
1607
1608        ret = kernfs_setattr(kn, &iattr);
1609        kernfs_put(kn);
1610        return ret;
1611}
1612
1613static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name,
1614                                      unsigned long fflags)
1615{
1616        struct kernfs_node *kn_subdir;
1617        int ret;
1618
1619        kn_subdir = kernfs_create_dir(kn_info, name,
1620                                      kn_info->mode, r);
1621        if (IS_ERR(kn_subdir))
1622                return PTR_ERR(kn_subdir);
1623
1624        kernfs_get(kn_subdir);
1625        ret = rdtgroup_kn_set_ugid(kn_subdir);
1626        if (ret)
1627                return ret;
1628
1629        ret = rdtgroup_add_files(kn_subdir, fflags);
1630        if (!ret)
1631                kernfs_activate(kn_subdir);
1632
1633        return ret;
1634}
1635
1636static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
1637{
1638        struct rdt_resource *r;
1639        unsigned long fflags;
1640        char name[32];
1641        int ret;
1642
1643        /* create the directory */
1644        kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL);
1645        if (IS_ERR(kn_info))
1646                return PTR_ERR(kn_info);
1647        kernfs_get(kn_info);
1648
1649        ret = rdtgroup_add_files(kn_info, RF_TOP_INFO);
1650        if (ret)
1651                goto out_destroy;
1652
1653        for_each_alloc_enabled_rdt_resource(r) {
1654                fflags =  r->fflags | RF_CTRL_INFO;
1655                ret = rdtgroup_mkdir_info_resdir(r, r->name, fflags);
1656                if (ret)
1657                        goto out_destroy;
1658        }
1659
1660        for_each_mon_enabled_rdt_resource(r) {
1661                fflags =  r->fflags | RF_MON_INFO;
1662                sprintf(name, "%s_MON", r->name);
1663                ret = rdtgroup_mkdir_info_resdir(r, name, fflags);
1664                if (ret)
1665                        goto out_destroy;
1666        }
1667
1668        /*
1669         * This extra ref will be put in kernfs_remove() and guarantees
1670         * that kn_info is always accessible.
1671         */
1672        kernfs_get(kn_info);
1673
1674        ret = rdtgroup_kn_set_ugid(kn_info);
1675        if (ret)
1676                goto out_destroy;
1677
1678        kernfs_activate(kn_info);
1679
1680        return 0;
1681
1682out_destroy:
1683        kernfs_remove(kn_info);
1684        return ret;
1685}
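/*
 * The resulting layout under /sys/fs/resctrl/info on a system with, for
 * example, L3 allocation, L3 monitoring and MBA would be (illustrative):
 *   info/L3/      cbm_mask, min_cbm_bits, num_closids, shareable_bits, ...
 *   info/L3_MON/  max_threshold_occupancy, mon_features, num_rmids
 *   info/MB/      bandwidth_gran, delay_linear, min_bandwidth, num_closids
 *   info/last_cmd_status
 */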
1686
1687static int
1688mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp,
1689                    char *name, struct kernfs_node **dest_kn)
1690{
1691        struct kernfs_node *kn;
1692        int ret;
1693
1694        /* create the directory */
1695        kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
1696        if (IS_ERR(kn))
1697                return PTR_ERR(kn);
1698
1699        if (dest_kn)
1700                *dest_kn = kn;
1701
1702        /*
1703         * This extra ref will be put in kernfs_remove() and guarantees
1704         * that @rdtgrp->kn is always accessible.
1705         */
1706        kernfs_get(kn);
1707
1708        ret = rdtgroup_kn_set_ugid(kn);
1709        if (ret)
1710                goto out_destroy;
1711
1712        kernfs_activate(kn);
1713
1714        return 0;
1715
1716out_destroy:
1717        kernfs_remove(kn);
1718        return ret;
1719}
1720
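/*
 * CDP (Code and Data Prioritization) gives each CLOSID separate code and
 * data capacity bitmasks, halving the number of usable CLOSIDs. The two
 * helpers below simply toggle the CDP enable bit in the L3/L2 QOS_CFG MSR
 * on the CPU they run on; set_cache_qos_cfg() below arranges for them to
 * run once per cache domain.
 */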
1721static void l3_qos_cfg_update(void *arg)
1722{
1723        bool *enable = arg;
1724
1725        wrmsrl(MSR_IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL);
1726}
1727
1728static void l2_qos_cfg_update(void *arg)
1729{
1730        bool *enable = arg;
1731
1732        wrmsrl(MSR_IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL);
1733}
1734
1735static inline bool is_mba_linear(void)
1736{
1737        return rdt_resources_all[RDT_RESOURCE_MBA].membw.delay_linear;
1738}
1739
1740static int set_cache_qos_cfg(int level, bool enable)
1741{
1742        void (*update)(void *arg);
1743        struct rdt_resource *r_l;
1744        cpumask_var_t cpu_mask;
1745        struct rdt_domain *d;
1746        int cpu;
1747
1748        if (level == RDT_RESOURCE_L3)
1749                update = l3_qos_cfg_update;
1750        else if (level == RDT_RESOURCE_L2)
1751                update = l2_qos_cfg_update;
1752        else
1753                return -EINVAL;
1754
1755        if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
1756                return -ENOMEM;
1757
1758        r_l = &rdt_resources_all[level];
1759        list_for_each_entry(d, &r_l->domains, list) {
1760                /* Pick one CPU from each domain instance to update MSR */
1761                cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
1762        }
1763        cpu = get_cpu();
1764        /* Update QOS_CFG MSR on this cpu if it's in cpu_mask. */
1765        if (cpumask_test_cpu(cpu, cpu_mask))
1766                update(&enable);
1767        /* Update QOS_CFG MSR on all other cpus in cpu_mask. */
1768        smp_call_function_many(cpu_mask, update, &enable, 1);
1769        put_cpu();
1770
1771        free_cpumask_var(cpu_mask);
1772
1773        return 0;
1774}
1775
1776/*
1777 * Enable or disable the MBA software controller
1778 * which lets the user specify memory bandwidth in MBps.
1779 * The MBA software controller is supported only if
1780 * MBM is supported and MBA is in linear scale.
1781 */
1782static int set_mba_sc(bool mba_sc)
1783{
1784        struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA];
1785        struct rdt_domain *d;
1786
1787        if (!is_mbm_enabled() || !is_mba_linear() ||
1788            mba_sc == is_mba_sc(r))
1789                return -EINVAL;
1790
1791        r->membw.mba_sc = mba_sc;
1792        list_for_each_entry(d, &r->domains, list)
1793                setup_default_ctrlval(r, d->ctrl_val, d->mbps_val);
1794
1795        return 0;
1796}
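/*
 * For example (values illustrative): with the "mba_MBps" mount option a
 * schemata line such as "MB:0=2048;1=4096" requests bandwidth in MBps per
 * domain, while without it the same line carries the hardware
 * percentage/delay values, e.g. "MB:0=50;1=100".
 */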
1797
1798static int cdp_enable(int level, int data_type, int code_type)
1799{
1800        struct rdt_resource *r_ldata = &rdt_resources_all[data_type];
1801        struct rdt_resource *r_lcode = &rdt_resources_all[code_type];
1802        struct rdt_resource *r_l = &rdt_resources_all[level];
1803        int ret;
1804
1805        if (!r_l->alloc_capable || !r_ldata->alloc_capable ||
1806            !r_lcode->alloc_capable)
1807                return -EINVAL;
1808
1809        ret = set_cache_qos_cfg(level, true);
1810        if (!ret) {
1811                r_l->alloc_enabled = false;
1812                r_ldata->alloc_enabled = true;
1813                r_lcode->alloc_enabled = true;
1814        }
1815        return ret;
1816}
1817
1818static int cdpl3_enable(void)
1819{
1820        return cdp_enable(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA,
1821                          RDT_RESOURCE_L3CODE);
1822}
1823
1824static int cdpl2_enable(void)
1825{
1826        return cdp_enable(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA,
1827                          RDT_RESOURCE_L2CODE);
1828}
1829
1830static void cdp_disable(int level, int data_type, int code_type)
1831{
1832        struct rdt_resource *r = &rdt_resources_all[level];
1833
1834        r->alloc_enabled = r->alloc_capable;
1835
1836        if (rdt_resources_all[data_type].alloc_enabled) {
1837                rdt_resources_all[data_type].alloc_enabled = false;
1838                rdt_resources_all[code_type].alloc_enabled = false;
1839                set_cache_qos_cfg(level, false);
1840        }
1841}
1842
1843static void cdpl3_disable(void)
1844{
1845        cdp_disable(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA, RDT_RESOURCE_L3CODE);
1846}
1847
1848static void cdpl2_disable(void)
1849{
1850        cdp_disable(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA, RDT_RESOURCE_L2CODE);
1851}
1852
1853static void cdp_disable_all(void)
1854{
1855        if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled)
1856                cdpl3_disable();
1857        if (rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled)
1858                cdpl2_disable();
1859}
1860
1861static int parse_rdtgroupfs_options(char *data)
1862{
1863        char *token, *o = data;
1864        int ret = 0;
1865
1866        while ((token = strsep(&o, ",")) != NULL) {
1867                if (!*token) {
1868                        ret = -EINVAL;
1869                        goto out;
1870                }
1871
1872                if (!strcmp(token, "cdp")) {
1873                        ret = cdpl3_enable();
1874                        if (ret)
1875                                goto out;
1876                } else if (!strcmp(token, "cdpl2")) {
1877                        ret = cdpl2_enable();
1878                        if (ret)
1879                                goto out;
1880                } else if (!strcmp(token, "mba_MBps")) {
1881                        if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
1882                                ret = set_mba_sc(true);
1883                        else
1884                                ret = -EINVAL;
1885                        if (ret)
1886                                goto out;
1887                } else {
1888                        ret = -EINVAL;
1889                        goto out;
1890                }
1891        }
1892
1893        return 0;
1894
1895out:
1896        pr_err("Invalid mount option \"%s\"\n", token);
1897
1898        return ret;
1899}
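/*
 * Example (illustrative, on a system supporting all the features used):
 *   # mount -t resctrl resctrl -o cdp,mba_MBps /sys/fs/resctrl
 * enables L3 CDP and the MBA software controller at mount time; an empty
 * or unrecognized option makes the mount fail with -EINVAL.
 */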
1900
1901/*
1902 * We don't allow rdtgroup directories to be created anywhere
1903 * except the root directory. Thus when looking for the rdtgroup
1904 * structure for a kernfs node we are either looking at a directory,
1905 * in which case the rdtgroup structure is pointed at by the "priv"
1906 * field, otherwise we have a file, and need only look to the parent
1907 * to find the rdtgroup.
1908 */
1909static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn)
1910{
1911        if (kernfs_type(kn) == KERNFS_DIR) {
1912                /*
1913                 * All the resource directories use "kn->priv"
1914                 * to point to the "struct rdtgroup" for the
1915                 * resource. "info" and its subdirectories don't
1916                 * have rdtgroup structures, so return NULL here.
1917                 */
1918                if (kn == kn_info || kn->parent == kn_info)
1919                        return NULL;
1920                else
1921                        return kn->priv;
1922        } else {
1923                return kn->parent->priv;
1924        }
1925}
1926
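/*
 * rdtgroup_kn_lock_live() and rdtgroup_kn_unlock() bracket every operation
 * on a resource group: the waitcount reference together with dropping
 * kernfs active protection lets rmdir run concurrently, and a NULL return
 * (with rdtgroup_mutex still held in the RDT_DELETED case) tells the
 * caller that the group went away while it waited for the mutex. Callers
 * must always pair the two calls.
 */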
1927struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn)
1928{
1929        struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
1930
1931        if (!rdtgrp)
1932                return NULL;
1933
1934        atomic_inc(&rdtgrp->waitcount);
1935        kernfs_break_active_protection(kn);
1936
1937        mutex_lock(&rdtgroup_mutex);
1938
1939        /* Was this group deleted while we waited? */
1940        if (rdtgrp->flags & RDT_DELETED)
1941                return NULL;
1942
1943        return rdtgrp;
1944}
1945
1946void rdtgroup_kn_unlock(struct kernfs_node *kn)
1947{
1948        struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
1949
1950        if (!rdtgrp)
1951                return;
1952
1953        mutex_unlock(&rdtgroup_mutex);
1954
1955        if (atomic_dec_and_test(&rdtgrp->waitcount) &&
1956            (rdtgrp->flags & RDT_DELETED)) {
1957                if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
1958                    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
1959                        rdtgroup_pseudo_lock_remove(rdtgrp);
1960                kernfs_unbreak_active_protection(kn);
1961                kernfs_put(rdtgrp->kn);
1962                kfree(rdtgrp);
1963        } else {
1964                kernfs_unbreak_active_protection(kn);
1965        }
1966}
1967
1968static int mkdir_mondata_all(struct kernfs_node *parent_kn,
1969                             struct rdtgroup *prgrp,
1970                             struct kernfs_node **mon_data_kn);
1971
1972static struct dentry *rdt_mount(struct file_system_type *fs_type,
1973                                int flags, const char *unused_dev_name,
1974                                void *data)
1975{
1976        struct rdt_domain *dom;
1977        struct rdt_resource *r;
1978        struct dentry *dentry;
1979        int ret;
1980
1981        cpus_read_lock();
1982        mutex_lock(&rdtgroup_mutex);
1983        /*
1984         * resctrl file system can only be mounted once.
1985         */
1986        if (static_branch_unlikely(&rdt_enable_key)) {
1987                dentry = ERR_PTR(-EBUSY);
1988                goto out;
1989        }
1990
1991        ret = parse_rdtgroupfs_options(data);
1992        if (ret) {
1993                dentry = ERR_PTR(ret);
1994                goto out_cdp;
1995        }
1996
1997        closid_init();
1998
1999        ret = rdtgroup_create_info_dir(rdtgroup_default.kn);
2000        if (ret) {
2001                dentry = ERR_PTR(ret);
2002                goto out_cdp;
2003        }
2004
2005        if (rdt_mon_capable) {
2006                ret = mongroup_create_dir(rdtgroup_default.kn,
2007                                          NULL, "mon_groups",
2008                                          &kn_mongrp);
2009                if (ret) {
2010                        dentry = ERR_PTR(ret);
2011                        goto out_info;
2012                }
2013                kernfs_get(kn_mongrp);
2014
2015                ret = mkdir_mondata_all(rdtgroup_default.kn,
2016                                        &rdtgroup_default, &kn_mondata);
2017                if (ret) {
2018                        dentry = ERR_PTR(ret);
2019                        goto out_mongrp;
2020                }
2021                kernfs_get(kn_mondata);
2022                rdtgroup_default.mon.mon_data_kn = kn_mondata;
2023        }
2024
2025        ret = rdt_pseudo_lock_init();
2026        if (ret) {
2027                dentry = ERR_PTR(ret);
2028                goto out_mondata;
2029        }
2030
2031        dentry = kernfs_mount(fs_type, flags, rdt_root,
2032                              RDTGROUP_SUPER_MAGIC, NULL);
2033        if (IS_ERR(dentry))
2034                goto out_psl;
2035
2036        if (rdt_alloc_capable)
2037                static_branch_enable_cpuslocked(&rdt_alloc_enable_key);
2038        if (rdt_mon_capable)
2039                static_branch_enable_cpuslocked(&rdt_mon_enable_key);
2040
2041        if (rdt_alloc_capable || rdt_mon_capable)
2042                static_branch_enable_cpuslocked(&rdt_enable_key);
2043
2044        if (is_mbm_enabled()) {
2045                r = &rdt_resources_all[RDT_RESOURCE_L3];
2046                list_for_each_entry(dom, &r->domains, list)
2047                        mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL);
2048        }
2049
2050        goto out;
2051
2052out_psl:
2053        rdt_pseudo_lock_release();
2054out_mondata:
2055        if (rdt_mon_capable)
2056                kernfs_remove(kn_mondata);
2057out_mongrp:
2058        if (rdt_mon_capable)
2059                kernfs_remove(kn_mongrp);
2060out_info:
2061        kernfs_remove(kn_info);
2062out_cdp:
2063        cdp_disable_all();
2064out:
2065        rdt_last_cmd_clear();
2066        mutex_unlock(&rdtgroup_mutex);
2067        cpus_read_unlock();
2068
2069        return dentry;
2070}
2071
2072static int reset_all_ctrls(struct rdt_resource *r)
2073{
2074        struct msr_param msr_param;
2075        cpumask_var_t cpu_mask;
2076        struct rdt_domain *d;
2077        int i, cpu;
2078
2079        if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
2080                return -ENOMEM;
2081
2082        msr_param.res = r;
2083        msr_param.low = 0;
2084        msr_param.high = r->num_closid;
2085
2086        /*
2087         * Disable resource control for this resource by setting all
2088         * CBMs in all domains to the maximum mask value. Pick one CPU
2089         * from each domain to update the MSRs below.
2090         */
2091        list_for_each_entry(d, &r->domains, list) {
2092                cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
2093
2094                for (i = 0; i < r->num_closid; i++)
2095                        d->ctrl_val[i] = r->default_ctrl;
2096        }
2097        cpu = get_cpu();
2098        /* Update CBM on this cpu if it's in cpu_mask. */
2099        if (cpumask_test_cpu(cpu, cpu_mask))
2100                rdt_ctrl_update(&msr_param);
2101        /* Update CBM on all other cpus in cpu_mask. */
2102        smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1);
2103        put_cpu();
2104
2105        free_cpumask_var(cpu_mask);
2106
2107        return 0;
2108}
2109
2110static bool is_closid_match(struct task_struct *t, struct rdtgroup *r)
2111{
2112        return (rdt_alloc_capable &&
2113                (r->type == RDTCTRL_GROUP) && (t->closid == r->closid));
2114}
2115
2116static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r)
2117{
2118        return (rdt_mon_capable &&
2119                (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid));
2120}
2121
2122/*
2123 * Move tasks from one to the other group. If @from is NULL, then all tasks
2124 * in the system are moved unconditionally (used for teardown).
2125 *
2126 * If @mask is not NULL the cpus on which moved tasks are running are set
2127 * in that mask so the update smp function call is restricted to affected
2128 * cpus.
2129 */
2130static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
2131                                 struct cpumask *mask)
2132{
2133        struct task_struct *p, *t;
2134
2135        read_lock(&tasklist_lock);
2136        for_each_process_thread(p, t) {
2137                if (!from || is_closid_match(t, from) ||
2138                    is_rmid_match(t, from)) {
2139                        t->closid = to->closid;
2140                        t->rmid = to->mon.rmid;
2141
2142#ifdef CONFIG_SMP
2143                        /*
2144                         * This is safe on x86 w/o barriers as the ordering
2145                         * of writing to task_cpu() and t->on_cpu is
2146                         * reverse to the reading here. The detection is
2147                         * inaccurate as tasks might move or schedule
2148                         * before the smp function call takes place. In
2149                         * such a case the function call is pointless, but
2150                         * there is no other side effect.
2151                         */
2152                        if (mask && t->on_cpu)
2153                                cpumask_set_cpu(task_cpu(t), mask);
2154#endif
2155                }
2156        }
2157        read_unlock(&tasklist_lock);
2158}
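/*
 * The closid/rmid written above take effect the next time each task is
 * scheduled in; for tasks currently running, callers use the cpumask
 * filled in here with update_closid_rmid() to reload PQR_ASSOC on the
 * affected CPUs.
 */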
2159
2160static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
2161{
2162        struct rdtgroup *sentry, *stmp;
2163        struct list_head *head;
2164
2165        head = &rdtgrp->mon.crdtgrp_list;
2166        list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) {
2167                free_rmid(sentry->mon.rmid);
2168                list_del(&sentry->mon.crdtgrp_list);
2169                kfree(sentry);
2170        }
2171}
2172
2173/*
2174 * Forcibly remove all of subdirectories under root.
2175 */
2176static void rmdir_all_sub(void)
2177{
2178        struct rdtgroup *rdtgrp, *tmp;
2179
2180        /* Move all tasks to the default resource group */
2181        rdt_move_group_tasks(NULL, &rdtgroup_default, NULL);
2182
2183        list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) {
2184                /* Free any child rmids */
2185                free_all_child_rdtgrp(rdtgrp);
2186
2187                /* Remove each rdtgroup other than root */
2188                if (rdtgrp == &rdtgroup_default)
2189                        continue;
2190
2191                if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
2192                    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
2193                        rdtgroup_pseudo_lock_remove(rdtgrp);
2194
2195                /*
2196                 * Give any CPUs back to the default group. We cannot copy
2197                 * cpu_online_mask because a CPU might have executed the
2198                 * offline callback already, but is still marked online.
2199                 */
2200                cpumask_or(&rdtgroup_default.cpu_mask,
2201                           &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
2202
2203                free_rmid(rdtgrp->mon.rmid);
2204
2205                kernfs_remove(rdtgrp->kn);
2206                list_del(&rdtgrp->rdtgroup_list);
2207                kfree(rdtgrp);
2208        }
2209        /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
2210        update_closid_rmid(cpu_online_mask, &rdtgroup_default);
2211
2212        kernfs_remove(kn_info);
2213        kernfs_remove(kn_mongrp);
2214        kernfs_remove(kn_mondata);
2215}
2216
2217static void rdt_kill_sb(struct super_block *sb)
2218{
2219        struct rdt_resource *r;
2220
2221        cpus_read_lock();
2222        mutex_lock(&rdtgroup_mutex);
2223
2224        set_mba_sc(false);
2225
2226        /* Put everything back to default values. */
2227        for_each_alloc_enabled_rdt_resource(r)
2228                reset_all_ctrls(r);
2229        cdp_disable_all();
2230        rmdir_all_sub();
2231        rdt_pseudo_lock_release();
2232        rdtgroup_default.mode = RDT_MODE_SHAREABLE;
2233        static_branch_disable_cpuslocked(&rdt_alloc_enable_key);
2234        static_branch_disable_cpuslocked(&rdt_mon_enable_key);
2235        static_branch_disable_cpuslocked(&rdt_enable_key);
2236        kernfs_kill_sb(sb);
2237        mutex_unlock(&rdtgroup_mutex);
2238        cpus_read_unlock();
2239}
2240
2241static struct file_system_type rdt_fs_type = {
2242        .name    = "resctrl",
2243        .mount   = rdt_mount,
2244        .kill_sb = rdt_kill_sb,
2245};
2246
2247static int mon_addfile(struct kernfs_node *parent_kn, const char *name,
2248                       void *priv)
2249{
2250        struct kernfs_node *kn;
2251        int ret = 0;
2252
2253        kn = __kernfs_create_file(parent_kn, name, 0444,
2254                                  GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0,
2255                                  &kf_mondata_ops, priv, NULL, NULL);
2256        if (IS_ERR(kn))
2257                return PTR_ERR(kn);
2258
2259        ret = rdtgroup_kn_set_ugid(kn);
2260        if (ret) {
2261                kernfs_remove(kn);
2262                return ret;
2263        }
2264
2265        return ret;
2266}
2267
2268/*
2269 * Remove all subdirectories of mon_data of ctrl_mon groups
2270 * and monitor groups with given domain id.
2271 */
2272void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, unsigned int dom_id)
2273{
2274        struct rdtgroup *prgrp, *crgrp;
2275        char name[32];
2276
2277        if (!r->mon_enabled)
2278                return;
2279
2280        list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
2281                sprintf(name, "mon_%s_%02d", r->name, dom_id);
2282                kernfs_remove_by_name(prgrp->mon.mon_data_kn, name);
2283
2284                list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list)
2285                        kernfs_remove_by_name(crgrp->mon.mon_data_kn, name);
2286        }
2287}
2288
2289static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
2290                                struct rdt_domain *d,
2291                                struct rdt_resource *r, struct rdtgroup *prgrp)
2292{
2293        union mon_data_bits priv;
2294        struct kernfs_node *kn;
2295        struct mon_evt *mevt;
2296        struct rmid_read rr;
2297        char name[32];
2298        int ret;
2299
2300        sprintf(name, "mon_%s_%02d", r->name, d->id);
2301        /* create the directory */
2302        kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
2303        if (IS_ERR(kn))
2304                return PTR_ERR(kn);
2305
2306        /*
2307         * This extra ref will be put in kernfs_remove() and guarantees
2308         * that kn is always accessible.
2309         */
2310        kernfs_get(kn);
2311        ret = rdtgroup_kn_set_ugid(kn);
2312        if (ret)
2313                goto out_destroy;
2314
2315        if (WARN_ON(list_empty(&r->evt_list))) {
2316                ret = -EPERM;
2317                goto out_destroy;
2318        }
2319
2320        priv.u.rid = r->rid;
2321        priv.u.domid = d->id;
2322        list_for_each_entry(mevt, &r->evt_list, list) {
2323                priv.u.evtid = mevt->evtid;
2324                ret = mon_addfile(kn, mevt->name, priv.priv);
2325                if (ret)
2326                        goto out_destroy;
2327
2328                if (is_mbm_event(mevt->evtid))
2329                        mon_event_read(&rr, d, prgrp, mevt->evtid, true);
2330        }
2331        kernfs_activate(kn);
2332        return 0;
2333
2334out_destroy:
2335        kernfs_remove(kn);
2336        return ret;
2337}
2338
2339/*
2340 * Add all subdirectories of mon_data for "ctrl_mon" groups
2341 * and "monitor" groups with given domain id.
2342 */
2343void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
2344                                    struct rdt_domain *d)
2345{
2346        struct kernfs_node *parent_kn;
2347        struct rdtgroup *prgrp, *crgrp;
2348        struct list_head *head;
2349
2350        if (!r->mon_enabled)
2351                return;
2352
2353        list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
2354                parent_kn = prgrp->mon.mon_data_kn;
2355                mkdir_mondata_subdir(parent_kn, d, r, prgrp);
2356
2357                head = &prgrp->mon.crdtgrp_list;
2358                list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
2359                        parent_kn = crgrp->mon.mon_data_kn;
2360                        mkdir_mondata_subdir(parent_kn, d, r, crgrp);
2361                }
2362        }
2363}
2364
2365static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn,
2366                                       struct rdt_resource *r,
2367                                       struct rdtgroup *prgrp)
2368{
2369        struct rdt_domain *dom;
2370        int ret;
2371
2372        list_for_each_entry(dom, &r->domains, list) {
2373                ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp);
2374                if (ret)
2375                        return ret;
2376        }
2377
2378        return 0;
2379}
2380
2381/*
2382 * This creates a directory mon_data which contains the monitored data.
2383 *
2384 * mon_data has one directory for each domain, named
2385 * in the format mon_<domain_name>_<domain_id>. For example, mon_data
2386 * for an L3 domain looks as below:
2387 * ./mon_data:
2388 * mon_L3_00
2389 * mon_L3_01
2390 * mon_L3_02
2391 * ...
2392 *
2393 * Each domain directory has one file per event:
2394 * ./mon_L3_00/:
2395 * llc_occupancy
2396 *
2397 */
2398static int mkdir_mondata_all(struct kernfs_node *parent_kn,
2399                             struct rdtgroup *prgrp,
2400                             struct kernfs_node **dest_kn)
2401{
2402        struct rdt_resource *r;
2403        struct kernfs_node *kn;
2404        int ret;
2405
2406        /*
2407         * Create the mon_data directory first.
2408         */
2409        ret = mongroup_create_dir(parent_kn, NULL, "mon_data", &kn);
2410        if (ret)
2411                return ret;
2412
2413        if (dest_kn)
2414                *dest_kn = kn;
2415
2416        /*
2417         * Create the subdirectories for each domain. Note that all events
2418         * in a domain like L3 are grouped into a resource whose domain is L3
2419         */
2420        for_each_mon_enabled_rdt_resource(r) {
2421                ret = mkdir_mondata_subdir_alldom(kn, r, prgrp);
2422                if (ret)
2423                        goto out_destroy;
2424        }
2425
2426        return 0;
2427
2428out_destroy:
2429        kernfs_remove(kn);
2430        return ret;
2431}
2432
2433/**
2434 * cbm_ensure_valid - Enforce validity on provided CBM
2435 * @_val:       Candidate CBM
2436 * @r:          RDT resource to which the CBM belongs
2437 *
2438 * The provided CBM represents all cache portions available for use. This
2439 * may be represented by a bitmap that does not consist of contiguous ones
2440 * and thus be an invalid CBM.
2441 * Here the provided CBM is forced to be a valid CBM by only considering
2442 * the first set of contiguous bits as valid and clearing all other bits.
2443 * The intention here is to provide a valid default CBM with which a new
2444 * resource group is initialized. The user can follow this with a
2445 * modification to the CBM if the default does not satisfy the
2446 * requirements.
2447 */
2448static void cbm_ensure_valid(u32 *_val, struct rdt_resource *r)
2449{
2450        /*
2451         * Convert the u32 _val to an unsigned long required by all the bit
2452         * operations within this function. No more than 32 bits of this
2453         * converted value can be accessed because all bit operations are
2454         * additionally provided with cbm_len that is initialized during
2455         * hardware enumeration using five bits from the EAX register and
2456         * thus never can exceed 32 bits.
2457         */
2458        unsigned long *val = (unsigned long *)_val;
2459        unsigned int cbm_len = r->cache.cbm_len;
2460        unsigned long first_bit, zero_bit;
2461
2462        if (*val == 0)
2463                return;
2464
2465        first_bit = find_first_bit(val, cbm_len);
2466        zero_bit = find_next_zero_bit(val, cbm_len, first_bit);
2467
2468        /* Clear any remaining bits to ensure contiguous region */
2469        bitmap_clear(val, zero_bit, cbm_len - zero_bit);
2470}
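/*
 * Worked example (illustrative values): with cbm_len = 8 and
 * *_val = 0xb3 (1011_0011b), the first contiguous run of ones covers
 * bits 0-1, so bits 2-7 are cleared and the resulting CBM is 0x3.
 */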
2471
2472/*
2473 * Initialize cache resources per RDT domain
2474 *
2475 * Set the RDT domain up to start off with all usable allocations. That is,
2476 * all shareable and unused bits. All-zero CBM is invalid.
2477 */
2478static int __init_one_rdt_domain(struct rdt_domain *d, struct rdt_resource *r,
2479                                 u32 closid)
2480{
2481        struct rdt_resource *r_cdp = NULL;
2482        struct rdt_domain *d_cdp = NULL;
2483        u32 used_b = 0, unused_b = 0;
2484        unsigned long tmp_cbm;
2485        enum rdtgrp_mode mode;
2486        u32 peer_ctl, *ctrl;
2487        int i;
2488
2489        rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp);
2490        d->have_new_ctrl = false;
2491        d->new_ctrl = r->cache.shareable_bits;
2492        used_b = r->cache.shareable_bits;
2493        ctrl = d->ctrl_val;
2494        for (i = 0; i < closids_supported(); i++, ctrl++) {
2495                if (closid_allocated(i) && i != closid) {
2496                        mode = rdtgroup_mode_by_closid(i);
2497                        if (mode == RDT_MODE_PSEUDO_LOCKSETUP)
2498                                break;
2499                        /*
2500                         * If CDP is active include peer domain's
2501                         * usage to ensure there is no overlap
2502                         * with an exclusive group.
2503                         */
2504                        if (d_cdp)
2505                                peer_ctl = d_cdp->ctrl_val[i];
2506                        else
2507                                peer_ctl = 0;
2508                        used_b |= *ctrl | peer_ctl;
2509                        if (mode == RDT_MODE_SHAREABLE)
2510                                d->new_ctrl |= *ctrl | peer_ctl;
2511                }
2512        }
2513        if (d->plr && d->plr->cbm > 0)
2514                used_b |= d->plr->cbm;
2515        unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1);
2516        unused_b &= BIT_MASK(r->cache.cbm_len) - 1;
2517        d->new_ctrl |= unused_b;
2518        /*
2519         * Force the initial CBM to be valid, user can
2520         * modify the CBM based on system availability.
2521         */
2522        cbm_ensure_valid(&d->new_ctrl, r);
2523        /*
2524         * Assign the u32 CBM to an unsigned long to ensure that
2525         * bitmap_weight() does not access out-of-bound memory.
2526         */
2527        tmp_cbm = d->new_ctrl;
2528        if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) {
2529                rdt_last_cmd_printf("No space on %s:%d\n", r->name, d->id);
2530                return -ENOSPC;
2531        }
2532        d->have_new_ctrl = true;
2533
2534        return 0;
2535}
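/*
 * Worked example (illustrative values): with cbm_len = 12,
 * shareable_bits = 0x00f and one other allocated, exclusive-mode CLOSID
 * using 0x0f0: used_b becomes 0x0ff, unused_b = 0xf00 and new_ctrl 0xf0f,
 * which cbm_ensure_valid() then trims to the first contiguous run, 0x00f.
 */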
2536
2537/*
2538 * Initialize cache resources with default values.
2539 *
2540 * A new RDT group is being created on an allocation capable (CAT)
2541 * supporting system. Set this group up to start off with all usable
2542 * allocations.
2543 *
2544 * If there are no more shareable bits available on any domain then
2545 * the entire allocation will fail.
2546 */
2547static int rdtgroup_init_cat(struct rdt_resource *r, u32 closid)
2548{
2549        struct rdt_domain *d;
2550        int ret;
2551
2552        list_for_each_entry(d, &r->domains, list) {
2553                ret = __init_one_rdt_domain(d, r, closid);
2554                if (ret < 0)
2555                        return ret;
2556        }
2557
2558        return 0;
2559}
2560
2561/* Initialize MBA resource with default values. */
2562static void rdtgroup_init_mba(struct rdt_resource *r)
2563{
2564        struct rdt_domain *d;
2565
2566        list_for_each_entry(d, &r->domains, list) {
2567                d->new_ctrl = is_mba_sc(r) ? MBA_MAX_MBPS : r->default_ctrl;
2568                d->have_new_ctrl = true;
2569        }
2570}
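/*
 * A new group therefore starts with memory bandwidth unthrottled:
 * MBA_MAX_MBPS when the software controller is in MBps mode, otherwise
 * the resource's default (maximum) control value.
 */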
2571
2572/* Initialize the RDT group's allocations. */
2573static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
2574{
2575        struct rdt_resource *r;
2576        int ret;
2577
2578        for_each_alloc_enabled_rdt_resource(r) {
2579                if (r->rid == RDT_RESOURCE_MBA) {
2580                        rdtgroup_init_mba(r);
2581                } else {
2582                        ret = rdtgroup_init_cat(r, rdtgrp->closid);
2583                        if (ret < 0)
2584                                return ret;
2585                }
2586
2587                ret = update_domains(r, rdtgrp->closid);
2588                if (ret < 0) {
2589                        rdt_last_cmd_puts("Failed to initialize allocations\n");
2590                        return ret;
2591                }
2592
2593        }
2594
2595        rdtgrp->mode = RDT_MODE_SHAREABLE;
2596
2597        return 0;
2598}
2599
2600static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
2601                             struct kernfs_node *prgrp_kn,
2602                             const char *name, umode_t mode,
2603                             enum rdt_group_type rtype, struct rdtgroup **r)
2604{
2605        struct rdtgroup *prdtgrp, *rdtgrp;
2606        struct kernfs_node *kn;
2607        uint files = 0;
2608        int ret;
2609
2610        prdtgrp = rdtgroup_kn_lock_live(prgrp_kn);
2611        rdt_last_cmd_clear();
2612        if (!prdtgrp) {
2613                ret = -ENODEV;
2614                rdt_last_cmd_puts("Directory was removed\n");
2615                goto out_unlock;
2616        }
2617
2618        if (rtype == RDTMON_GROUP &&
2619            (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
2620             prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) {
2621                ret = -EINVAL;
2622                rdt_last_cmd_puts("Pseudo-locking in progress\n");
2623                goto out_unlock;
2624        }
2625
2626        /* allocate the rdtgroup. */
2627        rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL);
2628        if (!rdtgrp) {
2629                ret = -ENOSPC;
2630                rdt_last_cmd_puts("Kernel out of memory\n");
2631                goto out_unlock;
2632        }
2633        *r = rdtgrp;
2634        rdtgrp->mon.parent = prdtgrp;
2635        rdtgrp->type = rtype;
2636        INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list);
2637
2638        /* kernfs creates the directory for rdtgrp */
2639        kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp);
2640        if (IS_ERR(kn)) {
2641                ret = PTR_ERR(kn);
2642                rdt_last_cmd_puts("kernfs create error\n");
2643                goto out_free_rgrp;
2644        }
2645        rdtgrp->kn = kn;
2646
2647        /*
2648         * kernfs_remove() will drop the reference count on "kn" which
2649         * will free it. But we still need it to stick around for the
2650         * rdtgroup_kn_unlock(kn) call below. Take one extra reference
2651         * here, which will be dropped inside rdtgroup_kn_unlock().
2652         */
2653        kernfs_get(kn);
2654
2655        ret = rdtgroup_kn_set_ugid(kn);
2656        if (ret) {
2657                rdt_last_cmd_puts("kernfs perm error\n");
2658                goto out_destroy;
2659        }
2660
2661        files = RFTYPE_BASE | BIT(RF_CTRLSHIFT + rtype);
2662        ret = rdtgroup_add_files(kn, files);
2663        if (ret) {
2664                rdt_last_cmd_puts("kernfs fill error\n");
2665                goto out_destroy;
2666        }
2667
2668        if (rdt_mon_capable) {
2669                ret = alloc_rmid();
2670                if (ret < 0) {
2671                        rdt_last_cmd_puts("Out of RMIDs\n");
2672                        goto out_destroy;
2673                }
2674                rdtgrp->mon.rmid = ret;
2675
2676                ret = mkdir_mondata_all(kn, rdtgrp, &rdtgrp->mon.mon_data_kn);
2677                if (ret) {
2678                        rdt_last_cmd_puts("kernfs subdir error\n");
2679                        goto out_idfree;
2680                }
2681        }
2682        kernfs_activate(kn);
2683
2684        /*
2685         * The caller unlocks the prgrp_kn upon success.
2686         */
2687        return 0;
2688
2689out_idfree:
2690        free_rmid(rdtgrp->mon.rmid);
2691out_destroy:
2692        kernfs_remove(rdtgrp->kn);
2693out_free_rgrp:
2694        kfree(rdtgrp);
2695out_unlock:
2696        rdtgroup_kn_unlock(prgrp_kn);
2697        return ret;
2698}
2699
2700static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp)
2701{
2702        kernfs_remove(rgrp->kn);
2703        free_rmid(rgrp->mon.rmid);
2704        kfree(rgrp);
2705}
2706
2707/*
2708 * Create a monitor group under "mon_groups" directory of a control
2709 * and monitor group (ctrl_mon). This is a resource group
2710 * to monitor a subset of tasks and cpus in its parent ctrl_mon group.
2711 */
2712static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn,
2713                              struct kernfs_node *prgrp_kn,
2714                              const char *name,
2715                              umode_t mode)
2716{
2717        struct rdtgroup *rdtgrp, *prgrp;
2718        int ret;
2719
2720        ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTMON_GROUP,
2721                                &rdtgrp);
2722        if (ret)
2723                return ret;
2724
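        /*
         * A monitor group shares its parent ctrl_mon group's CLOSID; it
         * has no allocations of its own. Only the RMID, allocated in
         * mkdir_rdt_prepare(), is private to it.
         */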
2725        prgrp = rdtgrp->mon.parent;
2726        rdtgrp->closid = prgrp->closid;
2727
2728        /*
2729         * Add the rdtgrp to the list of rdtgrps the parent
2730         * ctrl_mon group has to track.
2731         */
2732        list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list);
2733
2734        rdtgroup_kn_unlock(prgrp_kn);
2735        return ret;
2736}
2737
2738/*
2739 * These are rdtgroups created under the root directory. Can be used
2740 * to allocate and monitor resources.
2741 */
2742static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
2743                                   struct kernfs_node *prgrp_kn,
2744                                   const char *name, umode_t mode)
2745{
2746        struct rdtgroup *rdtgrp;
2747        struct kernfs_node *kn;
2748        u32 closid;
2749        int ret;
2750
2751        ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTCTRL_GROUP,
2752                                &rdtgrp);
2753        if (ret)
2754                return ret;
2755
2756        kn = rdtgrp->kn;
2757        ret = closid_alloc();
2758        if (ret < 0) {
2759                rdt_last_cmd_puts("Out of CLOSIDs\n");
2760                goto out_common_fail;
2761        }
2762        closid = ret;
2763        ret = 0;
2764
2765        rdtgrp->closid = closid;
2766        ret = rdtgroup_init_alloc(rdtgrp);
2767        if (ret < 0)
2768                goto out_id_free;
2769
2770        list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups);
2771
2772        if (rdt_mon_capable) {
2773                /*
2774                 * Create an empty mon_groups directory to hold the subset
2775                 * of tasks and cpus to monitor.
2776                 */
2777                ret = mongroup_create_dir(kn, NULL, "mon_groups", NULL);
2778                if (ret) {
2779                        rdt_last_cmd_puts("kernfs subdir error\n");
2780                        goto out_del_list;
2781                }
2782        }
2783
2784        goto out_unlock;
2785
2786out_del_list:
2787        list_del(&rdtgrp->rdtgroup_list);
2788out_id_free:
2789        closid_free(closid);
2790out_common_fail:
2791        mkdir_rdt_prepare_clean(rdtgrp);
2792out_unlock:
2793        rdtgroup_kn_unlock(prgrp_kn);
2794        return ret;
2795}
2796
2797/*
2798 * We allow creating mon groups only within a directory called "mon_groups"
2799 * which is present in every ctrl_mon group. Check if this is a valid
2800 * "mon_groups" directory.
2801 *
2802 * 1. The directory should be named "mon_groups".
2803 * 2. The mon group itself should "not" be named "mon_groups".
2804 *   This makes sure "mon_groups" directory always has a ctrl_mon group
2805 *   as parent.
2806 */
2807static bool is_mon_groups(struct kernfs_node *kn, const char *name)
2808{
2809        return (!strcmp(kn->name, "mon_groups") &&
2810                strcmp(name, "mon_groups"));
2811}
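/*
 * Example (illustrative paths):
 *   mkdir /sys/fs/resctrl/grp0/mon_groups/mon0       -> allowed
 *   mkdir /sys/fs/resctrl/grp0/mon_groups/mon_groups -> rejected
 * In the first case the parent is a "mon_groups" directory and the new
 * name is not itself "mon_groups"; the second case fails the name check.
 */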
2812
2813static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
2814                          umode_t mode)
2815{
2816        /* Do not accept '\n' to avoid unparsable situation. */
2817        if (strchr(name, '\n'))
2818                return -EINVAL;
2819
2820        /*
2821         * If the parent directory is the root directory and RDT
2822         * allocation is supported, add a control and monitoring
2823         * subdirectory
2824         */
2825        if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn)
2826                return rdtgroup_mkdir_ctrl_mon(parent_kn, parent_kn, name, mode);
2827
2828        /*
2829         * If RDT monitoring is supported and the parent directory is a valid
2830         * "mon_groups" directory, add a monitoring subdirectory.
2831         */
2832        if (rdt_mon_capable && is_mon_groups(parent_kn, name))
2833                return rdtgroup_mkdir_mon(parent_kn, parent_kn->parent, name, mode);
2834
2835        return -EPERM;
2836}
2837
2838static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
2839                              cpumask_var_t tmpmask)
2840{
2841        struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
2842        int cpu;
2843
2844        /* Give any tasks back to the parent group */
2845        rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask);
2846
2847        /* Update per cpu rmid of the moved CPUs first */
2848        for_each_cpu(cpu, &rdtgrp->cpu_mask)
2849                per_cpu(pqr_state.default_rmid, cpu) = prdtgrp->mon.rmid;
2850        /*
2851         * Update the MSR on moved CPUs and CPUs which have moved
2852         * task running on them.
2853         */
2854        cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
2855        update_closid_rmid(tmpmask, NULL);
2856
2857        rdtgrp->flags = RDT_DELETED;
2858        free_rmid(rdtgrp->mon.rmid);
2859
2860        /*
2861         * Remove the rdtgrp from the parent ctrl_mon group's list
2862         */
2863        WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
2864        list_del(&rdtgrp->mon.crdtgrp_list);
2865
2866        /*
2867         * one extra hold on this, will drop when we kfree(rdtgrp)
2868         * in rdtgroup_kn_unlock()
2869         */
2870        kernfs_get(kn);
2871        kernfs_remove(rdtgrp->kn);
2872
2873        return 0;
2874}
2875
2876static int rdtgroup_ctrl_remove(struct kernfs_node *kn,
2877                                struct rdtgroup *rdtgrp)
2878{
2879        rdtgrp->flags = RDT_DELETED;
2880        list_del(&rdtgrp->rdtgroup_list);
2881
2882        /*
2883         * one extra hold on this, will drop when we kfree(rdtgrp)
2884         * in rdtgroup_kn_unlock()
2885         */
2886        kernfs_get(kn);
2887        kernfs_remove(rdtgrp->kn);
2888        return 0;
2889}
2890
2891static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
2892                               cpumask_var_t tmpmask)
2893{
2894        int cpu;
2895
2896        /* Give any tasks back to the default group */
2897        rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask);
2898
2899        /* Give any CPUs back to the default group */
2900        cpumask_or(&rdtgroup_default.cpu_mask,
2901                   &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
2902
2903        /* Update per cpu closid and rmid of the moved CPUs first */
2904        for_each_cpu(cpu, &rdtgrp->cpu_mask) {
2905                per_cpu(pqr_state.default_closid, cpu) = rdtgroup_default.closid;
2906                per_cpu(pqr_state.default_rmid, cpu) = rdtgroup_default.mon.rmid;
2907        }
2908
2909        /*
2910         * Update the MSR on moved CPUs and CPUs which have moved
2911         * task running on them.
2912         */
2913        cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
2914        update_closid_rmid(tmpmask, NULL);
2915
2916        closid_free(rdtgrp->closid);
2917        free_rmid(rdtgrp->mon.rmid);
2918
2919        /*
2920         * Free all the child monitor group rmids.
2921         */
2922        free_all_child_rdtgrp(rdtgrp);
2923
2924        rdtgroup_ctrl_remove(kn, rdtgrp);
2925
2926        return 0;
2927}
2928
2929static int rdtgroup_rmdir(struct kernfs_node *kn)
2930{
2931        struct kernfs_node *parent_kn = kn->parent;
2932        struct rdtgroup *rdtgrp;
2933        cpumask_var_t tmpmask;
2934        int ret = 0;
2935
2936        if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
2937                return -ENOMEM;
2938
2939        rdtgrp = rdtgroup_kn_lock_live(kn);
2940        if (!rdtgrp) {
2941                ret = -EPERM;
2942                goto out;
2943        }
2944
2945        /*
2946         * If the rdtgroup is a ctrl_mon group and parent directory
2947         * is the root directory, remove the ctrl_mon group.
2948         *
2949         * If the rdtgroup is a mon group and parent directory
2950         * is a valid "mon_groups" directory, remove the mon group.
2951         */
2952        if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn) {
2953                if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
2954                    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
2955                        ret = rdtgroup_ctrl_remove(kn, rdtgrp);
2956                } else {
2957                        ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask);
2958                }
2959        } else if (rdtgrp->type == RDTMON_GROUP &&
2960                 is_mon_groups(parent_kn, kn->name)) {
2961                ret = rdtgroup_rmdir_mon(kn, rdtgrp, tmpmask);
2962        } else {
2963                ret = -EPERM;
2964        }
2965
2966out:
2967        rdtgroup_kn_unlock(kn);
2968        free_cpumask_var(tmpmask);
2969        return ret;
2970}
2971
2972static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
2973{
2974        if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled)
2975                seq_puts(seq, ",cdp");
2976
2977        if (rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled)
2978                seq_puts(seq, ",cdpl2");
2979
2980        if (is_mba_sc(&rdt_resources_all[RDT_RESOURCE_MBA]))
2981                seq_puts(seq, ",mba_MBps");
2982
2983        return 0;
2984}
2985
2986static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = {
2987        .mkdir          = rdtgroup_mkdir,
2988        .rmdir          = rdtgroup_rmdir,
2989        .show_options   = rdtgroup_show_options,
2990};
2991
2992static int __init rdtgroup_setup_root(void)
2993{
2994        int ret;
2995
2996        rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops,
2997                                      KERNFS_ROOT_CREATE_DEACTIVATED |
2998                                      KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK,
2999                                      &rdtgroup_default);
3000        if (IS_ERR(rdt_root))
3001                return PTR_ERR(rdt_root);
3002
3003        mutex_lock(&rdtgroup_mutex);
3004
3005        rdtgroup_default.closid = 0;
3006        rdtgroup_default.mon.rmid = 0;
3007        rdtgroup_default.type = RDTCTRL_GROUP;
3008        INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list);
3009
3010        list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups);
3011
3012        ret = rdtgroup_add_files(rdt_root->kn, RF_CTRL_BASE);
3013        if (ret) {
3014                kernfs_destroy_root(rdt_root);
3015                goto out;
3016        }
3017
3018        rdtgroup_default.kn = rdt_root->kn;
3019        kernfs_activate(rdtgroup_default.kn);
3020
3021out:
3022        mutex_unlock(&rdtgroup_mutex);
3023
3024        return ret;
3025}
3026
3027/*
3028 * rdtgroup_init - rdtgroup initialization
3029 *
3030 * Setup resctrl file system including set up root, create mount point,
3031 * register rdtgroup filesystem, and initialize files under root directory.
3032 *
3033 * Return: 0 on success or -errno
3034 */
3035int __init rdtgroup_init(void)
3036{
3037        int ret = 0;
3038
3039        seq_buf_init(&last_cmd_status, last_cmd_status_buf,
3040                     sizeof(last_cmd_status_buf));
3041
3042        ret = rdtgroup_setup_root();
3043        if (ret)
3044                return ret;
3045
3046        ret = sysfs_create_mount_point(fs_kobj, "resctrl");
3047        if (ret)
3048                goto cleanup_root;
3049
3050        ret = register_filesystem(&rdt_fs_type);
3051        if (ret)
3052                goto cleanup_mountpoint;
3053
3054        /*
3055         * Adding the resctrl debugfs directory here may not be ideal since
3056         * it would let the resctrl debugfs directory appear on the debugfs
3057         * filesystem before the resctrl filesystem is mounted.
3058         * It may also be ok since that would enable debugging of RDT before
3059         * resctrl is mounted.
3060         * The reason why the debugfs directory is created here and not in
3061         * rdt_mount() is because rdt_mount() takes rdtgroup_mutex and
3062         * during the debugfs directory creation also &sb->s_type->i_mutex_key
3063         * (the lockdep class of inode->i_rwsem). Other filesystem
3064         * interactions (e.g. SyS_getdents) have the lock ordering:
3065         * &sb->s_type->i_mutex_key --> &mm->mmap_sem
3066         * During mmap(), called with &mm->mmap_sem held, rdtgroup_mutex
3067         * is taken, thus creating the dependency:
3068         * &mm->mmap_sem --> rdtgroup_mutex, which together with the
3069         * other two dependencies would complete a deadlock cycle.
3070         * By creating the debugfs directory here we avoid a dependency
3071         * that may cause deadlock (even though file operations cannot
3072         * occur until the filesystem is mounted, I do not know how to
3073         * tell lockdep that).
3074         */
3075        debugfs_resctrl = debugfs_create_dir("resctrl", NULL);
3076
3077        return 0;
3078
3079cleanup_mountpoint:
3080        sysfs_remove_mount_point(fs_kobj, "resctrl");
3081cleanup_root:
3082        kernfs_destroy_root(rdt_root);
3083
3084        return ret;
3085}
3086
3087void __exit rdtgroup_exit(void)
3088{
3089        debugfs_remove_recursive(debugfs_resctrl);
3090        unregister_filesystem(&rdt_fs_type);
3091        sysfs_remove_mount_point(fs_kobj, "resctrl");
3092        kernfs_destroy_root(rdt_root);
3093}
3094