linux/drivers/virt/nitro_enclaves/ne_misc_dev.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
   4 */
   5
   6/**
   7 * DOC: Enclave lifetime management driver for Nitro Enclaves (NE).
   8 * Nitro is a hypervisor that has been developed by Amazon.
   9 */
  10
  11#include <linux/anon_inodes.h>
  12#include <linux/capability.h>
  13#include <linux/cpu.h>
  14#include <linux/device.h>
  15#include <linux/file.h>
  16#include <linux/hugetlb.h>
  17#include <linux/limits.h>
  18#include <linux/list.h>
  19#include <linux/miscdevice.h>
  20#include <linux/mm.h>
  21#include <linux/mman.h>
  22#include <linux/module.h>
  23#include <linux/mutex.h>
  24#include <linux/nitro_enclaves.h>
  25#include <linux/pci.h>
  26#include <linux/poll.h>
  27#include <linux/slab.h>
  28#include <linux/types.h>
  29#include <uapi/linux/vm_sockets.h>
  30
  31#include "ne_misc_dev.h"
  32#include "ne_pci_dev.h"
  33
  34/**
  35 * NE_CPUS_SIZE - Size for max 128 CPUs, for now, in a cpu-list string, comma
  36 *                separated. The NE CPU pool includes CPUs from a single NUMA
  37 *                node.
  38 */
  39#define NE_CPUS_SIZE            (512)
  40
  41/**
  42 * NE_EIF_LOAD_OFFSET - The offset where to copy the Enclave Image Format (EIF)
  43 *                      image in enclave memory.
  44 */
  45#define NE_EIF_LOAD_OFFSET      (8 * 1024UL * 1024UL)
  46
  47/**
  48 * NE_MIN_ENCLAVE_MEM_SIZE - The minimum memory size an enclave can be launched
  49 *                           with.
  50 */
  51#define NE_MIN_ENCLAVE_MEM_SIZE (64 * 1024UL * 1024UL)
  52
  53/**
  54 * NE_MIN_MEM_REGION_SIZE - The minimum size of an enclave memory region.
  55 */
  56#define NE_MIN_MEM_REGION_SIZE  (2 * 1024UL * 1024UL)
  57
  58/**
  59 * NE_PARENT_VM_CID - The CID for the vsock device of the primary / parent VM.
  60 */
  61#define NE_PARENT_VM_CID        (3)
  62
  63static long ne_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
  64
  65static const struct file_operations ne_fops = {
  66        .owner          = THIS_MODULE,
  67        .llseek         = noop_llseek,
  68        .unlocked_ioctl = ne_ioctl,
  69};
  70
  71static struct miscdevice ne_misc_dev = {
  72        .minor  = MISC_DYNAMIC_MINOR,
  73        .name   = "nitro_enclaves",
  74        .fops   = &ne_fops,
  75        .mode   = 0660,
  76};
  77
  78struct ne_devs ne_devs = {
  79        .ne_misc_dev    = &ne_misc_dev,
  80};
  81
  82/*
  83 * TODO: Update logic to create new sysfs entries instead of using
   84 * a kernel parameter, e.g. if multiple sysfs files are needed.
  85 */
  86static int ne_set_kernel_param(const char *val, const struct kernel_param *kp);
  87
  88static const struct kernel_param_ops ne_cpu_pool_ops = {
  89        .get    = param_get_string,
  90        .set    = ne_set_kernel_param,
  91};
  92
  93static char ne_cpus[NE_CPUS_SIZE];
  94static struct kparam_string ne_cpus_arg = {
  95        .maxlen = sizeof(ne_cpus),
  96        .string = ne_cpus,
  97};
  98
  99module_param_cb(ne_cpus, &ne_cpu_pool_ops, &ne_cpus_arg, 0644);
 100/* https://www.kernel.org/doc/html/latest/admin-guide/kernel-parameters.html#cpu-lists */
 101MODULE_PARM_DESC(ne_cpus, "<cpu-list> - CPU pool used for Nitro Enclaves");
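/*
 * Illustrative usage, not part of the source: assuming the chosen CPUs form
 * full cores from a single NUMA node (see the checks in ne_setup_cpu_pool()
 * below), the pool could be set on the kernel command line, e.g.
 * "nitro_enclaves.ne_cpus=2-3,6-7", or at runtime via
 * "echo 2-3,6-7 > /sys/module/nitro_enclaves/parameters/ne_cpus"; the exact
 * CPU ids depend on the parent VM topology.
 */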
 102
 103/**
 104 * struct ne_cpu_pool - CPU pool used for Nitro Enclaves.
 105 * @avail_threads_per_core:     Available full CPU cores to be dedicated to
 106 *                              enclave(s). The cpumasks from the array, indexed
 107 *                              by core id, contain all the threads from the
 108 *                              available cores, that are not set for created
 109 *                              enclave(s). The full CPU cores are part of the
 110 *                              NE CPU pool.
 111 * @mutex:                      Mutex for the access to the NE CPU pool.
 112 * @nr_parent_vm_cores :        The size of the available threads per core array.
 113 *                              The total number of CPU cores available on the
 114 *                              primary / parent VM.
 115 * @nr_threads_per_core:        The number of threads that a full CPU core has.
 116 * @numa_node:                  NUMA node of the CPUs in the pool.
 117 */
 118struct ne_cpu_pool {
 119        cpumask_var_t   *avail_threads_per_core;
 120        struct mutex    mutex;
 121        unsigned int    nr_parent_vm_cores;
 122        unsigned int    nr_threads_per_core;
 123        int             numa_node;
 124};
 125
 126static struct ne_cpu_pool ne_cpu_pool;
 127
 128/**
 129 * ne_check_enclaves_created() - Verify if at least one enclave has been created.
 130 * @void:       No parameters provided.
 131 *
 132 * Context: Process context.
 133 * Return:
 134 * * True if at least one enclave is created.
 135 * * False otherwise.
 136 */
 137static bool ne_check_enclaves_created(void)
 138{
 139        struct ne_pci_dev *ne_pci_dev = ne_devs.ne_pci_dev;
 140        bool ret = false;
 141
 142        if (!ne_pci_dev)
 143                return ret;
 144
 145        mutex_lock(&ne_pci_dev->enclaves_list_mutex);
 146
 147        if (!list_empty(&ne_pci_dev->enclaves_list))
 148                ret = true;
 149
 150        mutex_unlock(&ne_pci_dev->enclaves_list_mutex);
 151
 152        return ret;
 153}
 154
 155/**
 156 * ne_setup_cpu_pool() - Set the NE CPU pool after handling sanity checks such
 157 *                       as not sharing CPU cores with the primary / parent VM
 158 *                       or not using CPU 0, which should remain available for
 159 *                       the primary / parent VM. Offline the CPUs from the
  160 *                       pool after the checks pass.
 161 * @ne_cpu_list:        The CPU list used for setting NE CPU pool.
 162 *
 163 * Context: Process context.
 164 * Return:
 165 * * 0 on success.
 166 * * Negative return value on failure.
 167 */
 168static int ne_setup_cpu_pool(const char *ne_cpu_list)
 169{
 170        int core_id = -1;
 171        unsigned int cpu = 0;
 172        cpumask_var_t cpu_pool;
 173        unsigned int cpu_sibling = 0;
 174        unsigned int i = 0;
 175        int numa_node = -1;
 176        int rc = -EINVAL;
 177
 178        if (!zalloc_cpumask_var(&cpu_pool, GFP_KERNEL))
 179                return -ENOMEM;
 180
 181        mutex_lock(&ne_cpu_pool.mutex);
 182
 183        rc = cpulist_parse(ne_cpu_list, cpu_pool);
 184        if (rc < 0) {
 185                pr_err("%s: Error in cpulist parse [rc=%d]\n", ne_misc_dev.name, rc);
 186
 187                goto free_pool_cpumask;
 188        }
 189
 190        cpu = cpumask_any(cpu_pool);
 191        if (cpu >= nr_cpu_ids) {
 192                pr_err("%s: No CPUs available in CPU pool\n", ne_misc_dev.name);
 193
 194                rc = -EINVAL;
 195
 196                goto free_pool_cpumask;
 197        }
 198
 199        /*
 200         * Check if the CPUs are online, to further get info about them
 201         * e.g. numa node, core id, siblings.
 202         */
 203        for_each_cpu(cpu, cpu_pool)
 204                if (cpu_is_offline(cpu)) {
 205                        pr_err("%s: CPU %d is offline, has to be online to get its metadata\n",
 206                               ne_misc_dev.name, cpu);
 207
 208                        rc = -EINVAL;
 209
 210                        goto free_pool_cpumask;
 211                }
 212
 213        /*
 214         * Check if the CPUs from the NE CPU pool are from the same NUMA node.
 215         */
 216        for_each_cpu(cpu, cpu_pool)
 217                if (numa_node < 0) {
 218                        numa_node = cpu_to_node(cpu);
 219                        if (numa_node < 0) {
 220                                pr_err("%s: Invalid NUMA node %d\n",
 221                                       ne_misc_dev.name, numa_node);
 222
 223                                rc = -EINVAL;
 224
 225                                goto free_pool_cpumask;
 226                        }
 227                } else {
 228                        if (numa_node != cpu_to_node(cpu)) {
 229                                pr_err("%s: CPUs with different NUMA nodes\n",
 230                                       ne_misc_dev.name);
 231
 232                                rc = -EINVAL;
 233
 234                                goto free_pool_cpumask;
 235                        }
 236                }
 237
 238        /*
  239         * Check if CPU 0 and its siblings are included in the provided CPU pool.
 240         * They should remain available for the primary / parent VM.
 241         */
 242        if (cpumask_test_cpu(0, cpu_pool)) {
 243                pr_err("%s: CPU 0 has to remain available\n", ne_misc_dev.name);
 244
 245                rc = -EINVAL;
 246
 247                goto free_pool_cpumask;
 248        }
 249
 250        for_each_cpu(cpu_sibling, topology_sibling_cpumask(0)) {
 251                if (cpumask_test_cpu(cpu_sibling, cpu_pool)) {
 252                        pr_err("%s: CPU sibling %d for CPU 0 is in CPU pool\n",
 253                               ne_misc_dev.name, cpu_sibling);
 254
 255                        rc = -EINVAL;
 256
 257                        goto free_pool_cpumask;
 258                }
 259        }
 260
 261        /*
 262         * Check if CPU siblings are included in the provided CPU pool. The
 263         * expectation is that full CPU cores are made available in the CPU pool
 264         * for enclaves.
 265         */
 266        for_each_cpu(cpu, cpu_pool) {
 267                for_each_cpu(cpu_sibling, topology_sibling_cpumask(cpu)) {
 268                        if (!cpumask_test_cpu(cpu_sibling, cpu_pool)) {
 269                                pr_err("%s: CPU %d is not in CPU pool\n",
 270                                       ne_misc_dev.name, cpu_sibling);
 271
 272                                rc = -EINVAL;
 273
 274                                goto free_pool_cpumask;
 275                        }
 276                }
 277        }
 278
 279        /* Calculate the number of threads from a full CPU core. */
 280        cpu = cpumask_any(cpu_pool);
 281        for_each_cpu(cpu_sibling, topology_sibling_cpumask(cpu))
 282                ne_cpu_pool.nr_threads_per_core++;
 283
 284        ne_cpu_pool.nr_parent_vm_cores = nr_cpu_ids / ne_cpu_pool.nr_threads_per_core;
 285
 286        ne_cpu_pool.avail_threads_per_core = kcalloc(ne_cpu_pool.nr_parent_vm_cores,
 287                                             sizeof(*ne_cpu_pool.avail_threads_per_core),
 288                                             GFP_KERNEL);
 289        if (!ne_cpu_pool.avail_threads_per_core) {
 290                rc = -ENOMEM;
 291
 292                goto free_pool_cpumask;
 293        }
 294
 295        for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++)
 296                if (!zalloc_cpumask_var(&ne_cpu_pool.avail_threads_per_core[i], GFP_KERNEL)) {
 297                        rc = -ENOMEM;
 298
 299                        goto free_cores_cpumask;
 300                }
 301
 302        /*
 303         * Split the NE CPU pool in threads per core to keep the CPU topology
 304         * after offlining the CPUs.
 305         */
 306        for_each_cpu(cpu, cpu_pool) {
 307                core_id = topology_core_id(cpu);
 308                if (core_id < 0 || core_id >= ne_cpu_pool.nr_parent_vm_cores) {
  309                        pr_err("%s: Invalid core id %d for CPU %d\n",
 310                               ne_misc_dev.name, core_id, cpu);
 311
 312                        rc = -EINVAL;
 313
 314                        goto clear_cpumask;
 315                }
 316
 317                cpumask_set_cpu(cpu, ne_cpu_pool.avail_threads_per_core[core_id]);
 318        }
 319
 320        /*
 321         * CPUs that are given to enclave(s) should not be considered online
 322         * by Linux anymore, as the hypervisor will degrade them to floating.
 323         * The physical CPUs (full cores) are carved out of the primary / parent
 324         * VM and given to the enclave VM. The same number of vCPUs would run
 325         * on less pCPUs for the primary / parent VM.
 326         *
 327         * We offline them here, to not degrade performance and expose correct
 328         * topology to Linux and user space.
 329         */
 330        for_each_cpu(cpu, cpu_pool) {
 331                rc = remove_cpu(cpu);
 332                if (rc != 0) {
 333                        pr_err("%s: CPU %d is not offlined [rc=%d]\n",
 334                               ne_misc_dev.name, cpu, rc);
 335
 336                        goto online_cpus;
 337                }
 338        }
 339
 340        free_cpumask_var(cpu_pool);
 341
 342        ne_cpu_pool.numa_node = numa_node;
 343
 344        mutex_unlock(&ne_cpu_pool.mutex);
 345
 346        return 0;
 347
 348online_cpus:
 349        for_each_cpu(cpu, cpu_pool)
 350                add_cpu(cpu);
 351clear_cpumask:
 352        for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++)
 353                cpumask_clear(ne_cpu_pool.avail_threads_per_core[i]);
 354free_cores_cpumask:
 355        for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++)
 356                free_cpumask_var(ne_cpu_pool.avail_threads_per_core[i]);
 357        kfree(ne_cpu_pool.avail_threads_per_core);
 358free_pool_cpumask:
 359        free_cpumask_var(cpu_pool);
 360        ne_cpu_pool.nr_parent_vm_cores = 0;
 361        ne_cpu_pool.nr_threads_per_core = 0;
 362        ne_cpu_pool.numa_node = -1;
 363        mutex_unlock(&ne_cpu_pool.mutex);
 364
 365        return rc;
 366}
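/*
 * Worked example for the checks above (hypothetical topology, for
 * illustration only): on a parent VM with 8 vCPUs, 2 threads per core and
 * sibling pairs (0,4), (1,5), (2,6), (3,7), a pool of "1-3,5-7" passes - full
 * cores, one NUMA node, CPU 0 and its sibling 4 left to the parent VM - and
 * CPUs 1-3 and 5-7 are then offlined. A pool of "1,2" would be rejected
 * because siblings 5 and 6 are missing, and "0,4" because CPU 0 has to remain
 * available.
 */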
 367
 368/**
 369 * ne_teardown_cpu_pool() - Online the CPUs from the NE CPU pool and cleanup the
 370 *                          CPU pool.
 371 * @void:       No parameters provided.
 372 *
 373 * Context: Process context.
 374 */
 375static void ne_teardown_cpu_pool(void)
 376{
 377        unsigned int cpu = 0;
 378        unsigned int i = 0;
 379        int rc = -EINVAL;
 380
 381        mutex_lock(&ne_cpu_pool.mutex);
 382
 383        if (!ne_cpu_pool.nr_parent_vm_cores) {
 384                mutex_unlock(&ne_cpu_pool.mutex);
 385
 386                return;
 387        }
 388
 389        for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++) {
 390                for_each_cpu(cpu, ne_cpu_pool.avail_threads_per_core[i]) {
 391                        rc = add_cpu(cpu);
 392                        if (rc != 0)
 393                                pr_err("%s: CPU %d is not onlined [rc=%d]\n",
 394                                       ne_misc_dev.name, cpu, rc);
 395                }
 396
 397                cpumask_clear(ne_cpu_pool.avail_threads_per_core[i]);
 398
 399                free_cpumask_var(ne_cpu_pool.avail_threads_per_core[i]);
 400        }
 401
 402        kfree(ne_cpu_pool.avail_threads_per_core);
 403        ne_cpu_pool.nr_parent_vm_cores = 0;
 404        ne_cpu_pool.nr_threads_per_core = 0;
 405        ne_cpu_pool.numa_node = -1;
 406
 407        mutex_unlock(&ne_cpu_pool.mutex);
 408}
 409
 410/**
 411 * ne_set_kernel_param() - Set the NE CPU pool value via the NE kernel parameter.
 412 * @val:        NE CPU pool string value.
 413 * @kp :        NE kernel parameter associated with the NE CPU pool.
 414 *
 415 * Context: Process context.
 416 * Return:
 417 * * 0 on success.
 418 * * Negative return value on failure.
 419 */
 420static int ne_set_kernel_param(const char *val, const struct kernel_param *kp)
 421{
 422        char error_val[] = "";
 423        int rc = -EINVAL;
 424
 425        if (!capable(CAP_SYS_ADMIN))
 426                return -EPERM;
 427
 428        if (ne_check_enclaves_created()) {
 429                pr_err("%s: The CPU pool is used by enclave(s)\n", ne_misc_dev.name);
 430
 431                return -EPERM;
 432        }
 433
 434        ne_teardown_cpu_pool();
 435
 436        rc = ne_setup_cpu_pool(val);
 437        if (rc < 0) {
 438                pr_err("%s: Error in setup CPU pool [rc=%d]\n", ne_misc_dev.name, rc);
 439
 440                param_set_copystring(error_val, kp);
 441
 442                return rc;
 443        }
 444
 445        rc = param_set_copystring(val, kp);
 446        if (rc < 0) {
 447                pr_err("%s: Error in param set copystring [rc=%d]\n", ne_misc_dev.name, rc);
 448
 449                ne_teardown_cpu_pool();
 450
 451                param_set_copystring(error_val, kp);
 452
 453                return rc;
 454        }
 455
 456        return 0;
 457}
 458
 459/**
 460 * ne_donated_cpu() - Check if the provided CPU is already used by the enclave.
 461 * @ne_enclave :        Private data associated with the current enclave.
 462 * @cpu:                CPU to check if already used.
 463 *
 464 * Context: Process context. This function is called with the ne_enclave mutex held.
 465 * Return:
 466 * * True if the provided CPU is already used by the enclave.
 467 * * False otherwise.
 468 */
 469static bool ne_donated_cpu(struct ne_enclave *ne_enclave, unsigned int cpu)
 470{
 471        if (cpumask_test_cpu(cpu, ne_enclave->vcpu_ids))
 472                return true;
 473
 474        return false;
 475}
 476
 477/**
 478 * ne_get_unused_core_from_cpu_pool() - Get the id of a full core from the
 479 *                                      NE CPU pool.
 480 * @void:       No parameters provided.
 481 *
 482 * Context: Process context. This function is called with the ne_enclave and
 483 *          ne_cpu_pool mutexes held.
 484 * Return:
 485 * * Core id.
 486 * * -1 if no CPU core available in the pool.
 487 */
 488static int ne_get_unused_core_from_cpu_pool(void)
 489{
 490        int core_id = -1;
 491        unsigned int i = 0;
 492
 493        for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++)
 494                if (!cpumask_empty(ne_cpu_pool.avail_threads_per_core[i])) {
 495                        core_id = i;
 496
 497                        break;
 498                }
 499
 500        return core_id;
 501}
 502
 503/**
 504 * ne_set_enclave_threads_per_core() - Set the threads of the provided core in
 505 *                                     the enclave data structure.
 506 * @ne_enclave :        Private data associated with the current enclave.
 507 * @core_id:            Core id to get its threads from the NE CPU pool.
 508 * @vcpu_id:            vCPU id part of the provided core.
 509 *
 510 * Context: Process context. This function is called with the ne_enclave and
 511 *          ne_cpu_pool mutexes held.
 512 * Return:
 513 * * 0 on success.
 514 * * Negative return value on failure.
 515 */
 516static int ne_set_enclave_threads_per_core(struct ne_enclave *ne_enclave,
 517                                           int core_id, u32 vcpu_id)
 518{
 519        unsigned int cpu = 0;
 520
 521        if (core_id < 0 && vcpu_id == 0) {
 522                dev_err_ratelimited(ne_misc_dev.this_device,
 523                                    "No CPUs available in NE CPU pool\n");
 524
 525                return -NE_ERR_NO_CPUS_AVAIL_IN_POOL;
 526        }
 527
 528        if (core_id < 0) {
 529                dev_err_ratelimited(ne_misc_dev.this_device,
 530                                    "CPU %d is not in NE CPU pool\n", vcpu_id);
 531
 532                return -NE_ERR_VCPU_NOT_IN_CPU_POOL;
 533        }
 534
 535        if (core_id >= ne_enclave->nr_parent_vm_cores) {
 536                dev_err_ratelimited(ne_misc_dev.this_device,
 537                                    "Invalid core id %d - ne_enclave\n", core_id);
 538
 539                return -NE_ERR_VCPU_INVALID_CPU_CORE;
 540        }
 541
 542        for_each_cpu(cpu, ne_cpu_pool.avail_threads_per_core[core_id])
 543                cpumask_set_cpu(cpu, ne_enclave->threads_per_core[core_id]);
 544
 545        cpumask_clear(ne_cpu_pool.avail_threads_per_core[core_id]);
 546
 547        return 0;
 548}
 549
 550/**
 551 * ne_get_cpu_from_cpu_pool() - Get a CPU from the NE CPU pool, either from the
 552 *                              remaining sibling(s) of a CPU core or the first
 553 *                              sibling of a new CPU core.
 554 * @ne_enclave :        Private data associated with the current enclave.
 555 * @vcpu_id:            vCPU to get from the NE CPU pool.
 556 *
 557 * Context: Process context. This function is called with the ne_enclave mutex held.
 558 * Return:
 559 * * 0 on success.
 560 * * Negative return value on failure.
 561 */
 562static int ne_get_cpu_from_cpu_pool(struct ne_enclave *ne_enclave, u32 *vcpu_id)
 563{
 564        int core_id = -1;
 565        unsigned int cpu = 0;
 566        unsigned int i = 0;
 567        int rc = -EINVAL;
 568
 569        /*
  570         * If a thread of a core was previously allocated to this enclave,
  571         * first check the remaining sibling(s) for new CPU allocations, so
  572         * that full CPU cores are used for the enclave.
 573         */
 574        for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++)
 575                for_each_cpu(cpu, ne_enclave->threads_per_core[i])
 576                        if (!ne_donated_cpu(ne_enclave, cpu)) {
 577                                *vcpu_id = cpu;
 578
 579                                return 0;
 580                        }
 581
 582        mutex_lock(&ne_cpu_pool.mutex);
 583
 584        /*
 585         * If no remaining siblings, get a core from the NE CPU pool and keep
 586         * track of all the threads in the enclave threads per core data structure.
 587         */
 588        core_id = ne_get_unused_core_from_cpu_pool();
 589
 590        rc = ne_set_enclave_threads_per_core(ne_enclave, core_id, *vcpu_id);
 591        if (rc < 0)
 592                goto unlock_mutex;
 593
 594        *vcpu_id = cpumask_any(ne_enclave->threads_per_core[core_id]);
 595
 596        rc = 0;
 597
 598unlock_mutex:
 599        mutex_unlock(&ne_cpu_pool.mutex);
 600
 601        return rc;
 602}
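/*
 * Illustrative allocation order, assuming a hypothetical topology where CPUs
 * 2 and 6 are siblings of the same core: once CPU 2 has been given to the
 * enclave, the next auto allocation (vcpu_id 0 in the NE_ADD_VCPU ioctl)
 * returns the not yet donated sibling 6; only after the core is fully used is
 * a new core taken from the NE CPU pool.
 */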
 603
 604/**
 605 * ne_get_vcpu_core_from_cpu_pool() - Get from the NE CPU pool the id of the
 606 *                                    core associated with the provided vCPU.
 607 * @vcpu_id:    Provided vCPU id to get its associated core id.
 608 *
 609 * Context: Process context. This function is called with the ne_enclave and
 610 *          ne_cpu_pool mutexes held.
 611 * Return:
 612 * * Core id.
 613 * * -1 if the provided vCPU is not in the pool.
 614 */
 615static int ne_get_vcpu_core_from_cpu_pool(u32 vcpu_id)
 616{
 617        int core_id = -1;
 618        unsigned int i = 0;
 619
 620        for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++)
 621                if (cpumask_test_cpu(vcpu_id, ne_cpu_pool.avail_threads_per_core[i])) {
 622                        core_id = i;
 623
 624                        break;
  625                }
 626
 627        return core_id;
 628}
 629
 630/**
 631 * ne_check_cpu_in_cpu_pool() - Check if the given vCPU is in the available CPUs
 632 *                              from the pool.
 633 * @ne_enclave :        Private data associated with the current enclave.
 634 * @vcpu_id:            ID of the vCPU to check if available in the NE CPU pool.
 635 *
 636 * Context: Process context. This function is called with the ne_enclave mutex held.
 637 * Return:
 638 * * 0 on success.
 639 * * Negative return value on failure.
 640 */
 641static int ne_check_cpu_in_cpu_pool(struct ne_enclave *ne_enclave, u32 vcpu_id)
 642{
 643        int core_id = -1;
 644        unsigned int i = 0;
 645        int rc = -EINVAL;
 646
 647        if (ne_donated_cpu(ne_enclave, vcpu_id)) {
 648                dev_err_ratelimited(ne_misc_dev.this_device,
 649                                    "CPU %d already used\n", vcpu_id);
 650
 651                return -NE_ERR_VCPU_ALREADY_USED;
 652        }
 653
 654        /*
  655         * If a thread of a core was previously allocated to this enclave,
  656         * but not the full core, first check the remaining sibling(s).
 657         */
 658        for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++)
 659                if (cpumask_test_cpu(vcpu_id, ne_enclave->threads_per_core[i]))
 660                        return 0;
 661
 662        mutex_lock(&ne_cpu_pool.mutex);
 663
 664        /*
 665         * If no remaining siblings, get from the NE CPU pool the core
 666         * associated with the vCPU and keep track of all the threads in the
 667         * enclave threads per core data structure.
 668         */
 669        core_id = ne_get_vcpu_core_from_cpu_pool(vcpu_id);
 670
 671        rc = ne_set_enclave_threads_per_core(ne_enclave, core_id, vcpu_id);
 672        if (rc < 0)
 673                goto unlock_mutex;
 674
 675        rc = 0;
 676
 677unlock_mutex:
 678        mutex_unlock(&ne_cpu_pool.mutex);
 679
 680        return rc;
 681}
 682
 683/**
 684 * ne_add_vcpu_ioctl() - Add a vCPU to the slot associated with the current
 685 *                       enclave.
 686 * @ne_enclave :        Private data associated with the current enclave.
 687 * @vcpu_id:            ID of the CPU to be associated with the given slot,
 688 *                      apic id on x86.
 689 *
 690 * Context: Process context. This function is called with the ne_enclave mutex held.
 691 * Return:
 692 * * 0 on success.
 693 * * Negative return value on failure.
 694 */
 695static int ne_add_vcpu_ioctl(struct ne_enclave *ne_enclave, u32 vcpu_id)
 696{
 697        struct ne_pci_dev_cmd_reply cmd_reply = {};
 698        struct pci_dev *pdev = ne_devs.ne_pci_dev->pdev;
 699        int rc = -EINVAL;
 700        struct slot_add_vcpu_req slot_add_vcpu_req = {};
 701
 702        if (ne_enclave->mm != current->mm)
 703                return -EIO;
 704
 705        slot_add_vcpu_req.slot_uid = ne_enclave->slot_uid;
 706        slot_add_vcpu_req.vcpu_id = vcpu_id;
 707
 708        rc = ne_do_request(pdev, SLOT_ADD_VCPU,
 709                           &slot_add_vcpu_req, sizeof(slot_add_vcpu_req),
 710                           &cmd_reply, sizeof(cmd_reply));
 711        if (rc < 0) {
 712                dev_err_ratelimited(ne_misc_dev.this_device,
 713                                    "Error in slot add vCPU [rc=%d]\n", rc);
 714
 715                return rc;
 716        }
 717
 718        cpumask_set_cpu(vcpu_id, ne_enclave->vcpu_ids);
 719
 720        ne_enclave->nr_vcpus++;
 721
 722        return 0;
 723}
 724
 725/**
 726 * ne_sanity_check_user_mem_region() - Sanity check the user space memory
 727 *                                     region received during the set user
 728 *                                     memory region ioctl call.
 729 * @ne_enclave :        Private data associated with the current enclave.
 730 * @mem_region :        User space memory region to be sanity checked.
 731 *
 732 * Context: Process context. This function is called with the ne_enclave mutex held.
 733 * Return:
 734 * * 0 on success.
 735 * * Negative return value on failure.
 736 */
 737static int ne_sanity_check_user_mem_region(struct ne_enclave *ne_enclave,
 738        struct ne_user_memory_region mem_region)
 739{
 740        struct ne_mem_region *ne_mem_region = NULL;
 741
 742        if (ne_enclave->mm != current->mm)
 743                return -EIO;
 744
 745        if (mem_region.memory_size & (NE_MIN_MEM_REGION_SIZE - 1)) {
 746                dev_err_ratelimited(ne_misc_dev.this_device,
 747                                    "User space memory size is not multiple of 2 MiB\n");
 748
 749                return -NE_ERR_INVALID_MEM_REGION_SIZE;
 750        }
 751
 752        if (!IS_ALIGNED(mem_region.userspace_addr, NE_MIN_MEM_REGION_SIZE)) {
 753                dev_err_ratelimited(ne_misc_dev.this_device,
 754                                    "User space address is not 2 MiB aligned\n");
 755
 756                return -NE_ERR_UNALIGNED_MEM_REGION_ADDR;
 757        }
 758
 759        if ((mem_region.userspace_addr & (NE_MIN_MEM_REGION_SIZE - 1)) ||
 760            !access_ok((void __user *)(unsigned long)mem_region.userspace_addr,
 761                       mem_region.memory_size)) {
 762                dev_err_ratelimited(ne_misc_dev.this_device,
 763                                    "Invalid user space address range\n");
 764
 765                return -NE_ERR_INVALID_MEM_REGION_ADDR;
 766        }
 767
 768        list_for_each_entry(ne_mem_region, &ne_enclave->mem_regions_list,
 769                            mem_region_list_entry) {
 770                u64 memory_size = ne_mem_region->memory_size;
 771                u64 userspace_addr = ne_mem_region->userspace_addr;
 772
 773                if ((userspace_addr <= mem_region.userspace_addr &&
 774                    mem_region.userspace_addr < (userspace_addr + memory_size)) ||
 775                    (mem_region.userspace_addr <= userspace_addr &&
 776                    (mem_region.userspace_addr + mem_region.memory_size) > userspace_addr)) {
 777                        dev_err_ratelimited(ne_misc_dev.this_device,
 778                                            "User space memory region already used\n");
 779
 780                        return -NE_ERR_MEM_REGION_ALREADY_USED;
 781                }
 782        }
 783
 784        return 0;
 785}
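/*
 * User space sketch, for illustration only and assuming the
 * struct ne_user_memory_region layout from the uapi header: a region that can
 * pass these checks is typically backed by huge pages, e.g.
 *
 *   void *addr = mmap(NULL, 2 * 1024 * 1024, PROT_READ | PROT_WRITE,
 *                     MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
 *
 * with mem_region.userspace_addr set to (__u64)(unsigned long)addr and
 * mem_region.memory_size set to the mapped size. The pages also have to come
 * from the enclave NUMA node, which is checked per page below.
 */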
 786
 787/**
 788 * ne_sanity_check_user_mem_region_page() - Sanity check a page from the user space
 789 *                                          memory region received during the set
 790 *                                          user memory region ioctl call.
 791 * @ne_enclave :        Private data associated with the current enclave.
 792 * @mem_region_page:    Page from the user space memory region to be sanity checked.
 793 *
 794 * Context: Process context. This function is called with the ne_enclave mutex held.
 795 * Return:
 796 * * 0 on success.
 797 * * Negative return value on failure.
 798 */
 799static int ne_sanity_check_user_mem_region_page(struct ne_enclave *ne_enclave,
 800                                                struct page *mem_region_page)
 801{
 802        if (!PageHuge(mem_region_page)) {
 803                dev_err_ratelimited(ne_misc_dev.this_device,
 804                                    "Not a hugetlbfs page\n");
 805
 806                return -NE_ERR_MEM_NOT_HUGE_PAGE;
 807        }
 808
 809        if (page_size(mem_region_page) & (NE_MIN_MEM_REGION_SIZE - 1)) {
 810                dev_err_ratelimited(ne_misc_dev.this_device,
 811                                    "Page size not multiple of 2 MiB\n");
 812
 813                return -NE_ERR_INVALID_PAGE_SIZE;
 814        }
 815
 816        if (ne_enclave->numa_node != page_to_nid(mem_region_page)) {
 817                dev_err_ratelimited(ne_misc_dev.this_device,
 818                                    "Page is not from NUMA node %d\n",
 819                                    ne_enclave->numa_node);
 820
 821                return -NE_ERR_MEM_DIFFERENT_NUMA_NODE;
 822        }
 823
 824        return 0;
 825}
 826
 827/**
 828 * ne_set_user_memory_region_ioctl() - Add user space memory region to the slot
 829 *                                     associated with the current enclave.
 830 * @ne_enclave :        Private data associated with the current enclave.
 831 * @mem_region :        User space memory region to be associated with the given slot.
 832 *
 833 * Context: Process context. This function is called with the ne_enclave mutex held.
 834 * Return:
 835 * * 0 on success.
 836 * * Negative return value on failure.
 837 */
 838static int ne_set_user_memory_region_ioctl(struct ne_enclave *ne_enclave,
 839        struct ne_user_memory_region mem_region)
 840{
 841        long gup_rc = 0;
 842        unsigned long i = 0;
 843        unsigned long max_nr_pages = 0;
 844        unsigned long memory_size = 0;
 845        struct ne_mem_region *ne_mem_region = NULL;
 846        unsigned long nr_phys_contig_mem_regions = 0;
 847        struct pci_dev *pdev = ne_devs.ne_pci_dev->pdev;
 848        struct page **phys_contig_mem_regions = NULL;
 849        int rc = -EINVAL;
 850
 851        rc = ne_sanity_check_user_mem_region(ne_enclave, mem_region);
 852        if (rc < 0)
 853                return rc;
 854
 855        ne_mem_region = kzalloc(sizeof(*ne_mem_region), GFP_KERNEL);
 856        if (!ne_mem_region)
 857                return -ENOMEM;
 858
 859        max_nr_pages = mem_region.memory_size / NE_MIN_MEM_REGION_SIZE;
 860
 861        ne_mem_region->pages = kcalloc(max_nr_pages, sizeof(*ne_mem_region->pages),
 862                                       GFP_KERNEL);
 863        if (!ne_mem_region->pages) {
 864                rc = -ENOMEM;
 865
 866                goto free_mem_region;
 867        }
 868
 869        phys_contig_mem_regions = kcalloc(max_nr_pages, sizeof(*phys_contig_mem_regions),
 870                                          GFP_KERNEL);
 871        if (!phys_contig_mem_regions) {
 872                rc = -ENOMEM;
 873
 874                goto free_mem_region;
 875        }
 876
 877        do {
 878                i = ne_mem_region->nr_pages;
 879
 880                if (i == max_nr_pages) {
 881                        dev_err_ratelimited(ne_misc_dev.this_device,
 882                                            "Reached max nr of pages in the pages data struct\n");
 883
 884                        rc = -ENOMEM;
 885
 886                        goto put_pages;
 887                }
 888
 889                gup_rc = get_user_pages(mem_region.userspace_addr + memory_size, 1, FOLL_GET,
 890                                        ne_mem_region->pages + i, NULL);
 891                if (gup_rc < 0) {
 892                        rc = gup_rc;
 893
 894                        dev_err_ratelimited(ne_misc_dev.this_device,
 895                                            "Error in get user pages [rc=%d]\n", rc);
 896
 897                        goto put_pages;
 898                }
 899
 900                rc = ne_sanity_check_user_mem_region_page(ne_enclave, ne_mem_region->pages[i]);
 901                if (rc < 0)
 902                        goto put_pages;
 903
 904                /*
  905                 * TODO: Update once non-contiguous memory regions received
  906                 * from user space, or contiguous physical memory regions
  907                 * larger than 2 MiB (e.g. 8 MiB), are handled.
 908                 */
 909                phys_contig_mem_regions[i] = ne_mem_region->pages[i];
 910
 911                memory_size += page_size(ne_mem_region->pages[i]);
 912
 913                ne_mem_region->nr_pages++;
 914        } while (memory_size < mem_region.memory_size);
 915
 916        /*
  917         * TODO: Update once non-contiguous memory regions received from
  918         * user space, or contiguous physical memory regions larger than
  919         * 2 MiB (e.g. 8 MiB), are handled.
 920         */
 921        nr_phys_contig_mem_regions = ne_mem_region->nr_pages;
 922
 923        if ((ne_enclave->nr_mem_regions + nr_phys_contig_mem_regions) >
 924            ne_enclave->max_mem_regions) {
 925                dev_err_ratelimited(ne_misc_dev.this_device,
 926                                    "Reached max memory regions %lld\n",
 927                                    ne_enclave->max_mem_regions);
 928
 929                rc = -NE_ERR_MEM_MAX_REGIONS;
 930
 931                goto put_pages;
 932        }
 933
 934        for (i = 0; i < nr_phys_contig_mem_regions; i++) {
 935                u64 phys_region_addr = page_to_phys(phys_contig_mem_regions[i]);
 936                u64 phys_region_size = page_size(phys_contig_mem_regions[i]);
 937
 938                if (phys_region_size & (NE_MIN_MEM_REGION_SIZE - 1)) {
 939                        dev_err_ratelimited(ne_misc_dev.this_device,
 940                                            "Physical mem region size is not multiple of 2 MiB\n");
 941
 942                        rc = -EINVAL;
 943
 944                        goto put_pages;
 945                }
 946
 947                if (!IS_ALIGNED(phys_region_addr, NE_MIN_MEM_REGION_SIZE)) {
 948                        dev_err_ratelimited(ne_misc_dev.this_device,
 949                                            "Physical mem region address is not 2 MiB aligned\n");
 950
 951                        rc = -EINVAL;
 952
 953                        goto put_pages;
 954                }
 955        }
 956
 957        ne_mem_region->memory_size = mem_region.memory_size;
 958        ne_mem_region->userspace_addr = mem_region.userspace_addr;
 959
 960        list_add(&ne_mem_region->mem_region_list_entry, &ne_enclave->mem_regions_list);
 961
 962        for (i = 0; i < nr_phys_contig_mem_regions; i++) {
 963                struct ne_pci_dev_cmd_reply cmd_reply = {};
 964                struct slot_add_mem_req slot_add_mem_req = {};
 965
 966                slot_add_mem_req.slot_uid = ne_enclave->slot_uid;
 967                slot_add_mem_req.paddr = page_to_phys(phys_contig_mem_regions[i]);
 968                slot_add_mem_req.size = page_size(phys_contig_mem_regions[i]);
 969
 970                rc = ne_do_request(pdev, SLOT_ADD_MEM,
 971                                   &slot_add_mem_req, sizeof(slot_add_mem_req),
 972                                   &cmd_reply, sizeof(cmd_reply));
 973                if (rc < 0) {
 974                        dev_err_ratelimited(ne_misc_dev.this_device,
 975                                            "Error in slot add mem [rc=%d]\n", rc);
 976
 977                        kfree(phys_contig_mem_regions);
 978
 979                        /*
  980                         * Exit here without putting the pages, as memory
  981                         * regions may already have been added.
 982                         */
 983                        return rc;
 984                }
 985
 986                ne_enclave->mem_size += slot_add_mem_req.size;
 987                ne_enclave->nr_mem_regions++;
 988        }
 989
 990        kfree(phys_contig_mem_regions);
 991
 992        return 0;
 993
 994put_pages:
 995        for (i = 0; i < ne_mem_region->nr_pages; i++)
 996                put_page(ne_mem_region->pages[i]);
 997free_mem_region:
 998        kfree(phys_contig_mem_regions);
 999        kfree(ne_mem_region->pages);
1000        kfree(ne_mem_region);
1001
1002        return rc;
1003}
1004
1005/**
1006 * ne_start_enclave_ioctl() - Trigger enclave start after the enclave resources,
1007 *                            such as memory and CPU, have been set.
1008 * @ne_enclave :                Private data associated with the current enclave.
1009 * @enclave_start_info :        Enclave info that includes enclave cid and flags.
1010 *
1011 * Context: Process context. This function is called with the ne_enclave mutex held.
1012 * Return:
1013 * * 0 on success.
1014 * * Negative return value on failure.
1015 */
1016static int ne_start_enclave_ioctl(struct ne_enclave *ne_enclave,
1017        struct ne_enclave_start_info *enclave_start_info)
1018{
1019        struct ne_pci_dev_cmd_reply cmd_reply = {};
1020        unsigned int cpu = 0;
1021        struct enclave_start_req enclave_start_req = {};
1022        unsigned int i = 0;
1023        struct pci_dev *pdev = ne_devs.ne_pci_dev->pdev;
1024        int rc = -EINVAL;
1025
1026        if (!ne_enclave->nr_mem_regions) {
1027                dev_err_ratelimited(ne_misc_dev.this_device,
1028                                    "Enclave has no mem regions\n");
1029
1030                return -NE_ERR_NO_MEM_REGIONS_ADDED;
1031        }
1032
1033        if (ne_enclave->mem_size < NE_MIN_ENCLAVE_MEM_SIZE) {
1034                dev_err_ratelimited(ne_misc_dev.this_device,
1035                                    "Enclave memory is less than %ld\n",
1036                                    NE_MIN_ENCLAVE_MEM_SIZE);
1037
1038                return -NE_ERR_ENCLAVE_MEM_MIN_SIZE;
1039        }
1040
1041        if (!ne_enclave->nr_vcpus) {
1042                dev_err_ratelimited(ne_misc_dev.this_device,
1043                                    "Enclave has no vCPUs\n");
1044
1045                return -NE_ERR_NO_VCPUS_ADDED;
1046        }
1047
1048        for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++)
1049                for_each_cpu(cpu, ne_enclave->threads_per_core[i])
1050                        if (!cpumask_test_cpu(cpu, ne_enclave->vcpu_ids)) {
1051                                dev_err_ratelimited(ne_misc_dev.this_device,
1052                                                    "Full CPU cores not used\n");
1053
1054                                return -NE_ERR_FULL_CORES_NOT_USED;
1055                        }
1056
1057        enclave_start_req.enclave_cid = enclave_start_info->enclave_cid;
1058        enclave_start_req.flags = enclave_start_info->flags;
1059        enclave_start_req.slot_uid = ne_enclave->slot_uid;
1060
1061        rc = ne_do_request(pdev, ENCLAVE_START,
1062                           &enclave_start_req, sizeof(enclave_start_req),
1063                           &cmd_reply, sizeof(cmd_reply));
1064        if (rc < 0) {
1065                dev_err_ratelimited(ne_misc_dev.this_device,
1066                                    "Error in enclave start [rc=%d]\n", rc);
1067
1068                return rc;
1069        }
1070
1071        ne_enclave->state = NE_STATE_RUNNING;
1072
1073        enclave_start_info->enclave_cid = cmd_reply.enclave_cid;
1074
1075        return 0;
1076}
1077
1078/**
1079 * ne_enclave_ioctl() - Ioctl function provided by the enclave file.
1080 * @file:       File associated with this ioctl function.
1081 * @cmd:        The command that is set for the ioctl call.
1082 * @arg:        The argument that is provided for the ioctl call.
1083 *
1084 * Context: Process context.
1085 * Return:
1086 * * 0 on success.
1087 * * Negative return value on failure.
1088 */
1089static long ne_enclave_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1090{
1091        struct ne_enclave *ne_enclave = file->private_data;
1092
1093        switch (cmd) {
1094        case NE_ADD_VCPU: {
1095                int rc = -EINVAL;
1096                u32 vcpu_id = 0;
1097
1098                if (copy_from_user(&vcpu_id, (void __user *)arg, sizeof(vcpu_id)))
1099                        return -EFAULT;
1100
1101                mutex_lock(&ne_enclave->enclave_info_mutex);
1102
1103                if (ne_enclave->state != NE_STATE_INIT) {
1104                        dev_err_ratelimited(ne_misc_dev.this_device,
1105                                            "Enclave is not in init state\n");
1106
1107                        mutex_unlock(&ne_enclave->enclave_info_mutex);
1108
1109                        return -NE_ERR_NOT_IN_INIT_STATE;
1110                }
1111
1112                if (vcpu_id >= (ne_enclave->nr_parent_vm_cores *
1113                    ne_enclave->nr_threads_per_core)) {
1114                        dev_err_ratelimited(ne_misc_dev.this_device,
1115                                            "vCPU id higher than max CPU id\n");
1116
1117                        mutex_unlock(&ne_enclave->enclave_info_mutex);
1118
1119                        return -NE_ERR_INVALID_VCPU;
1120                }
1121
1122                if (!vcpu_id) {
1123                        /* Use the CPU pool for choosing a CPU for the enclave. */
1124                        rc = ne_get_cpu_from_cpu_pool(ne_enclave, &vcpu_id);
1125                        if (rc < 0) {
1126                                dev_err_ratelimited(ne_misc_dev.this_device,
1127                                                    "Error in get CPU from pool [rc=%d]\n",
1128                                                    rc);
1129
1130                                mutex_unlock(&ne_enclave->enclave_info_mutex);
1131
1132                                return rc;
1133                        }
1134                } else {
1135                        /* Check if the provided vCPU is available in the NE CPU pool. */
1136                        rc = ne_check_cpu_in_cpu_pool(ne_enclave, vcpu_id);
1137                        if (rc < 0) {
1138                                dev_err_ratelimited(ne_misc_dev.this_device,
1139                                                    "Error in check CPU %d in pool [rc=%d]\n",
1140                                                    vcpu_id, rc);
1141
1142                                mutex_unlock(&ne_enclave->enclave_info_mutex);
1143
1144                                return rc;
1145                        }
1146                }
1147
1148                rc = ne_add_vcpu_ioctl(ne_enclave, vcpu_id);
1149                if (rc < 0) {
1150                        mutex_unlock(&ne_enclave->enclave_info_mutex);
1151
1152                        return rc;
1153                }
1154
1155                mutex_unlock(&ne_enclave->enclave_info_mutex);
1156
1157                if (copy_to_user((void __user *)arg, &vcpu_id, sizeof(vcpu_id)))
1158                        return -EFAULT;
1159
1160                return 0;
1161        }
1162
1163        case NE_GET_IMAGE_LOAD_INFO: {
1164                struct ne_image_load_info image_load_info = {};
1165
1166                if (copy_from_user(&image_load_info, (void __user *)arg, sizeof(image_load_info)))
1167                        return -EFAULT;
1168
1169                mutex_lock(&ne_enclave->enclave_info_mutex);
1170
1171                if (ne_enclave->state != NE_STATE_INIT) {
1172                        dev_err_ratelimited(ne_misc_dev.this_device,
1173                                            "Enclave is not in init state\n");
1174
1175                        mutex_unlock(&ne_enclave->enclave_info_mutex);
1176
1177                        return -NE_ERR_NOT_IN_INIT_STATE;
1178                }
1179
1180                mutex_unlock(&ne_enclave->enclave_info_mutex);
1181
1182                if (!image_load_info.flags ||
1183                    image_load_info.flags >= NE_IMAGE_LOAD_MAX_FLAG_VAL) {
1184                        dev_err_ratelimited(ne_misc_dev.this_device,
1185                                            "Incorrect flag in enclave image load info\n");
1186
1187                        return -NE_ERR_INVALID_FLAG_VALUE;
1188                }
1189
1190                if (image_load_info.flags == NE_EIF_IMAGE)
1191                        image_load_info.memory_offset = NE_EIF_LOAD_OFFSET;
1192
1193                if (copy_to_user((void __user *)arg, &image_load_info, sizeof(image_load_info)))
1194                        return -EFAULT;
1195
1196                return 0;
1197        }
1198
1199        case NE_SET_USER_MEMORY_REGION: {
1200                struct ne_user_memory_region mem_region = {};
1201                int rc = -EINVAL;
1202
1203                if (copy_from_user(&mem_region, (void __user *)arg, sizeof(mem_region)))
1204                        return -EFAULT;
1205
1206                if (mem_region.flags >= NE_MEMORY_REGION_MAX_FLAG_VAL) {
1207                        dev_err_ratelimited(ne_misc_dev.this_device,
1208                                            "Incorrect flag for user memory region\n");
1209
1210                        return -NE_ERR_INVALID_FLAG_VALUE;
1211                }
1212
1213                mutex_lock(&ne_enclave->enclave_info_mutex);
1214
1215                if (ne_enclave->state != NE_STATE_INIT) {
1216                        dev_err_ratelimited(ne_misc_dev.this_device,
1217                                            "Enclave is not in init state\n");
1218
1219                        mutex_unlock(&ne_enclave->enclave_info_mutex);
1220
1221                        return -NE_ERR_NOT_IN_INIT_STATE;
1222                }
1223
1224                rc = ne_set_user_memory_region_ioctl(ne_enclave, mem_region);
1225                if (rc < 0) {
1226                        mutex_unlock(&ne_enclave->enclave_info_mutex);
1227
1228                        return rc;
1229                }
1230
1231                mutex_unlock(&ne_enclave->enclave_info_mutex);
1232
1233                return 0;
1234        }
1235
1236        case NE_START_ENCLAVE: {
1237                struct ne_enclave_start_info enclave_start_info = {};
1238                int rc = -EINVAL;
1239
1240                if (copy_from_user(&enclave_start_info, (void __user *)arg,
1241                                   sizeof(enclave_start_info)))
1242                        return -EFAULT;
1243
1244                if (enclave_start_info.flags >= NE_ENCLAVE_START_MAX_FLAG_VAL) {
1245                        dev_err_ratelimited(ne_misc_dev.this_device,
1246                                            "Incorrect flag in enclave start info\n");
1247
1248                        return -NE_ERR_INVALID_FLAG_VALUE;
1249                }
1250
1251                /*
1252                 * Do not use well-known CIDs - 0, 1, 2 - for enclaves.
1253                 * VMADDR_CID_ANY = -1U
1254                 * VMADDR_CID_HYPERVISOR = 0
1255                 * VMADDR_CID_LOCAL = 1
1256                 * VMADDR_CID_HOST = 2
1257                 * Note: 0 is used as a placeholder to auto-generate an enclave CID.
1258                 * http://man7.org/linux/man-pages/man7/vsock.7.html
1259                 */
1260                if (enclave_start_info.enclave_cid > 0 &&
1261                    enclave_start_info.enclave_cid <= VMADDR_CID_HOST) {
1262                        dev_err_ratelimited(ne_misc_dev.this_device,
1263                                            "Well-known CID value, not to be used for enclaves\n");
1264
1265                        return -NE_ERR_INVALID_ENCLAVE_CID;
1266                }
1267
1268                if (enclave_start_info.enclave_cid == U32_MAX) {
1269                        dev_err_ratelimited(ne_misc_dev.this_device,
1270                                            "Well-known CID value, not to be used for enclaves\n");
1271
1272                        return -NE_ERR_INVALID_ENCLAVE_CID;
1273                }
1274
1275                /*
1276                 * Do not use the CID of the primary / parent VM for enclaves.
1277                 */
1278                if (enclave_start_info.enclave_cid == NE_PARENT_VM_CID) {
1279                        dev_err_ratelimited(ne_misc_dev.this_device,
1280                                            "CID of the parent VM, not to be used for enclaves\n");
1281
1282                        return -NE_ERR_INVALID_ENCLAVE_CID;
1283                }
1284
1285                /* 64-bit CIDs are not yet supported for the vsock device. */
1286                if (enclave_start_info.enclave_cid > U32_MAX) {
1287                        dev_err_ratelimited(ne_misc_dev.this_device,
1288                                            "64-bit CIDs not yet supported for the vsock device\n");
1289
1290                        return -NE_ERR_INVALID_ENCLAVE_CID;
1291                }
1292
1293                mutex_lock(&ne_enclave->enclave_info_mutex);
1294
1295                if (ne_enclave->state != NE_STATE_INIT) {
1296                        dev_err_ratelimited(ne_misc_dev.this_device,
1297                                            "Enclave is not in init state\n");
1298
1299                        mutex_unlock(&ne_enclave->enclave_info_mutex);
1300
1301                        return -NE_ERR_NOT_IN_INIT_STATE;
1302                }
1303
1304                rc = ne_start_enclave_ioctl(ne_enclave, &enclave_start_info);
1305                if (rc < 0) {
1306                        mutex_unlock(&ne_enclave->enclave_info_mutex);
1307
1308                        return rc;
1309                }
1310
1311                mutex_unlock(&ne_enclave->enclave_info_mutex);
1312
1313                if (copy_to_user((void __user *)arg, &enclave_start_info,
1314                                 sizeof(enclave_start_info)))
1315                        return -EFAULT;
1316
1317                return 0;
1318        }
1319
1320        default:
1321                return -ENOTTY;
1322        }
1323
1324        return 0;
1325}
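/*
 * End-to-end user space flow, as an illustrative sketch assuming the uapi in
 * <linux/nitro_enclaves.h> (see also samples/nitro_enclaves in the kernel
 * tree): open /dev/nitro_enclaves and issue NE_CREATE_VM to obtain an enclave
 * fd, load the EIF image at the offset reported by NE_GET_IMAGE_LOAD_INFO,
 * add hugepage-backed regions with NE_SET_USER_MEMORY_REGION, add vCPUs with
 * NE_ADD_VCPU (vcpu_id 0 for auto-allocation from the NE CPU pool), then call
 * NE_START_ENCLAVE with a struct ne_enclave_start_info carrying the desired
 * CID (0 to have one auto-generated).
 */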
1326
1327/**
1328 * ne_enclave_remove_all_mem_region_entries() - Remove all memory region entries
1329 *                                              from the enclave data structure.
1330 * @ne_enclave :        Private data associated with the current enclave.
1331 *
1332 * Context: Process context. This function is called with the ne_enclave mutex held.
1333 */
1334static void ne_enclave_remove_all_mem_region_entries(struct ne_enclave *ne_enclave)
1335{
1336        unsigned long i = 0;
1337        struct ne_mem_region *ne_mem_region = NULL;
1338        struct ne_mem_region *ne_mem_region_tmp = NULL;
1339
1340        list_for_each_entry_safe(ne_mem_region, ne_mem_region_tmp,
1341                                 &ne_enclave->mem_regions_list,
1342                                 mem_region_list_entry) {
1343                list_del(&ne_mem_region->mem_region_list_entry);
1344
1345                for (i = 0; i < ne_mem_region->nr_pages; i++)
1346                        put_page(ne_mem_region->pages[i]);
1347
1348                kfree(ne_mem_region->pages);
1349
1350                kfree(ne_mem_region);
1351        }
1352}
1353
1354/**
1355 * ne_enclave_remove_all_vcpu_id_entries() - Remove all vCPU id entries from
1356 *                                           the enclave data structure.
1357 * @ne_enclave :        Private data associated with the current enclave.
1358 *
1359 * Context: Process context. This function is called with the ne_enclave mutex held.
1360 */
1361static void ne_enclave_remove_all_vcpu_id_entries(struct ne_enclave *ne_enclave)
1362{
1363        unsigned int cpu = 0;
1364        unsigned int i = 0;
1365
1366        mutex_lock(&ne_cpu_pool.mutex);
1367
1368        for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++) {
1369                for_each_cpu(cpu, ne_enclave->threads_per_core[i])
1370                        /* Update the available NE CPU pool. */
1371                        cpumask_set_cpu(cpu, ne_cpu_pool.avail_threads_per_core[i]);
1372
1373                free_cpumask_var(ne_enclave->threads_per_core[i]);
1374        }
1375
1376        mutex_unlock(&ne_cpu_pool.mutex);
1377
1378        kfree(ne_enclave->threads_per_core);
1379
1380        free_cpumask_var(ne_enclave->vcpu_ids);
1381}
1382
1383/**
1384 * ne_pci_dev_remove_enclave_entry() - Remove the enclave entry from the data
1385 *                                     structure that is part of the NE PCI
1386 *                                     device private data.
1387 * @ne_enclave :        Private data associated with the current enclave.
1388 * @ne_pci_dev :        Private data associated with the PCI device.
1389 *
1390 * Context: Process context. This function is called with the ne_pci_dev enclave
1391 *          mutex held.
1392 */
1393static void ne_pci_dev_remove_enclave_entry(struct ne_enclave *ne_enclave,
1394                                            struct ne_pci_dev *ne_pci_dev)
1395{
1396        struct ne_enclave *ne_enclave_entry = NULL;
1397        struct ne_enclave *ne_enclave_entry_tmp = NULL;
1398
1399        list_for_each_entry_safe(ne_enclave_entry, ne_enclave_entry_tmp,
1400                                 &ne_pci_dev->enclaves_list, enclave_list_entry) {
1401                if (ne_enclave_entry->slot_uid == ne_enclave->slot_uid) {
1402                        list_del(&ne_enclave_entry->enclave_list_entry);
1403
1404                        break;
1405                }
1406        }
1407}
1408
1409/**
1410 * ne_enclave_release() - Release function provided by the enclave file.
1411 * @inode:      Inode associated with this file release function.
1412 * @file:       File associated with this release function.
1413 *
1414 * Context: Process context.
1415 * Return:
1416 * * 0 on success.
1417 * * Negative return value on failure.
1418 */
1419static int ne_enclave_release(struct inode *inode, struct file *file)
1420{
1421        struct ne_pci_dev_cmd_reply cmd_reply = {};
1422        struct enclave_stop_req enclave_stop_request = {};
1423        struct ne_enclave *ne_enclave = file->private_data;
1424        struct ne_pci_dev *ne_pci_dev = ne_devs.ne_pci_dev;
1425        struct pci_dev *pdev = ne_pci_dev->pdev;
1426        int rc = -EINVAL;
1427        struct slot_free_req slot_free_req = {};
1428
1429        if (!ne_enclave)
1430                return 0;
1431
1432        /*
1433         * Early exit in case there is an error in the enclave creation logic
1434         * and fput() is called on the cleanup path.
1435         */
1436        if (!ne_enclave->slot_uid)
1437                return 0;
1438
1439        /*
1440         * Acquire the enclave list mutex before the enclave mutex
1441         * in order to avoid deadlocks with @ref ne_event_work_handler.
1442         */
1443        mutex_lock(&ne_pci_dev->enclaves_list_mutex);
1444        mutex_lock(&ne_enclave->enclave_info_mutex);
1445
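            /* Stop the enclave first in case it is still running. */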
1446        if (ne_enclave->state != NE_STATE_INIT && ne_enclave->state != NE_STATE_STOPPED) {
1447                enclave_stop_request.slot_uid = ne_enclave->slot_uid;
1448
1449                rc = ne_do_request(pdev, ENCLAVE_STOP,
1450                                   &enclave_stop_request, sizeof(enclave_stop_request),
1451                                   &cmd_reply, sizeof(cmd_reply));
1452                if (rc < 0) {
1453                        dev_err_ratelimited(ne_misc_dev.this_device,
1454                                            "Error in enclave stop [rc=%d]\n", rc);
1455
1456                        goto unlock_mutex;
1457                }
1458
1459                memset(&cmd_reply, 0, sizeof(cmd_reply));
1460        }
1461
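            /* Free the enclave slot at the level of the NE PCI device. */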
1462        slot_free_req.slot_uid = ne_enclave->slot_uid;
1463
1464        rc = ne_do_request(pdev, SLOT_FREE,
1465                           &slot_free_req, sizeof(slot_free_req),
1466                           &cmd_reply, sizeof(cmd_reply));
1467        if (rc < 0) {
1468                dev_err_ratelimited(ne_misc_dev.this_device,
1469                                    "Error in slot free [rc=%d]\n", rc);
1470
1471                goto unlock_mutex;
1472        }
1473
1474        ne_pci_dev_remove_enclave_entry(ne_enclave, ne_pci_dev);
1475        ne_enclave_remove_all_mem_region_entries(ne_enclave);
1476        ne_enclave_remove_all_vcpu_id_entries(ne_enclave);
1477
1478        mutex_unlock(&ne_enclave->enclave_info_mutex);
1479        mutex_unlock(&ne_pci_dev->enclaves_list_mutex);
1480
1481        kfree(ne_enclave);
1482
1483        return 0;
1484
1485unlock_mutex:
1486        mutex_unlock(&ne_enclave->enclave_info_mutex);
1487        mutex_unlock(&ne_pci_dev->enclaves_list_mutex);
1488
1489        return rc;
1490}
1491
1492/**
1493 * ne_enclave_poll() - Poll functionality used for enclave out-of-band events.
1494 * @file:       File associated with this poll function.
1495 * @wait:       Poll table data structure.
1496 *
1497 * Context: Process context.
1498 * Return:
1499 * * Poll mask.
1500 */
1501static __poll_t ne_enclave_poll(struct file *file, poll_table *wait)
1502{
1503        __poll_t mask = 0;
1504        struct ne_enclave *ne_enclave = file->private_data;
1505
1506        poll_wait(file, &ne_enclave->eventq, wait);
1507
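            /*
             * An out-of-band event e.g. the enclave exiting or crashing is
             * reported to user space as a hang-up (EPOLLHUP) on the enclave fd.
             */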
1508        if (ne_enclave->has_event)
1509                mask |= EPOLLHUP;
1510
1511        return mask;
1512}
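    /*
     * Illustrative user space wait for an enclave out-of-band event; a
     * minimal sketch, assuming enclave_fd was returned by the NE_CREATE_VM
     * ioctl and handle_enclave_exit() is a hypothetical application callback:
     *
     *	struct pollfd pfd = { .fd = enclave_fd, .events = 0 };
     *
     *	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLHUP))
     *		handle_enclave_exit();
     */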
1513
1514static const struct file_operations ne_enclave_fops = {
1515        .owner          = THIS_MODULE,
1516        .llseek         = noop_llseek,
1517        .poll           = ne_enclave_poll,
1518        .unlocked_ioctl = ne_enclave_ioctl,
1519        .release        = ne_enclave_release,
1520};
1521
1522/**
1523 * ne_create_vm_ioctl() - Allocate a slot to be associated with an enclave. Create
1524 *                        an enclave file descriptor to be further used for enclave
1525 *                        resource handling e.g. memory regions and CPUs.
1526 * @ne_pci_dev:         Private data associated with the PCI device.
1527 * @slot_uid:           User pointer where the generated unique slot id
1528 *                      associated with the enclave is stored.
1529 *
1530 * Context: Process context. This function is called with the ne_pci_dev enclave
1531 *          mutex held.
1532 * Return:
1533 * * Enclave fd on success.
1534 * * Negative return value on failure.
1535 */
1536static int ne_create_vm_ioctl(struct ne_pci_dev *ne_pci_dev, u64 __user *slot_uid)
1537{
1538        struct ne_pci_dev_cmd_reply cmd_reply = {};
1539        int enclave_fd = -1;
1540        struct file *enclave_file = NULL;
1541        unsigned int i = 0;
1542        struct ne_enclave *ne_enclave = NULL;
1543        struct pci_dev *pdev = ne_pci_dev->pdev;
1544        int rc = -EINVAL;
1545        struct slot_alloc_req slot_alloc_req = {};
1546
1547        mutex_lock(&ne_cpu_pool.mutex);
1548
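            /*
             * Check that the NE CPU pool still has at least one core with
             * available CPUs before setting up a new enclave.
             */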
1549        for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++)
1550                if (!cpumask_empty(ne_cpu_pool.avail_threads_per_core[i]))
1551                        break;
1552
1553        if (i == ne_cpu_pool.nr_parent_vm_cores) {
1554                dev_err_ratelimited(ne_misc_dev.this_device,
1555                                    "No CPUs available in CPU pool\n");
1556
1557                mutex_unlock(&ne_cpu_pool.mutex);
1558
1559                return -NE_ERR_NO_CPUS_AVAIL_IN_POOL;
1560        }
1561
1562        mutex_unlock(&ne_cpu_pool.mutex);
1563
1564        ne_enclave = kzalloc(sizeof(*ne_enclave), GFP_KERNEL);
1565        if (!ne_enclave)
1566                return -ENOMEM;
1567
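            /* Snapshot the NE CPU pool topology for this enclave's bookkeeping. */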
1568        mutex_lock(&ne_cpu_pool.mutex);
1569
1570        ne_enclave->nr_parent_vm_cores = ne_cpu_pool.nr_parent_vm_cores;
1571        ne_enclave->nr_threads_per_core = ne_cpu_pool.nr_threads_per_core;
1572        ne_enclave->numa_node = ne_cpu_pool.numa_node;
1573
1574        mutex_unlock(&ne_cpu_pool.mutex);
1575
1576        ne_enclave->threads_per_core = kcalloc(ne_enclave->nr_parent_vm_cores,
1577                sizeof(*ne_enclave->threads_per_core), GFP_KERNEL);
1578        if (!ne_enclave->threads_per_core) {
1579                rc = -ENOMEM;
1580
1581                goto free_ne_enclave;
1582        }
1583
1584        for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++)
1585                if (!zalloc_cpumask_var(&ne_enclave->threads_per_core[i], GFP_KERNEL)) {
1586                        rc = -ENOMEM;
1587
1588                        goto free_cpumask;
1589                }
1590
1591        if (!zalloc_cpumask_var(&ne_enclave->vcpu_ids, GFP_KERNEL)) {
1592                rc = -ENOMEM;
1593
1594                goto free_cpumask;
1595        }
1596
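            /*
             * Reserve a file descriptor now; it is only installed after the
             * enclave slot setup below completes successfully.
             */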
1597        enclave_fd = get_unused_fd_flags(O_CLOEXEC);
1598        if (enclave_fd < 0) {
1599                rc = enclave_fd;
1600
1601                dev_err_ratelimited(ne_misc_dev.this_device,
1602                                    "Error in getting unused fd [rc=%d]\n", rc);
1603
1604                goto free_cpumask;
1605        }
1606
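            /*
             * Back the enclave fd with an anonymous inode file so that the
             * enclave fops e.g. ioctl, poll and release apply to it.
             */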
1607        enclave_file = anon_inode_getfile("ne-vm", &ne_enclave_fops, ne_enclave, O_RDWR);
1608        if (IS_ERR(enclave_file)) {
1609                rc = PTR_ERR(enclave_file);
1610
1611                dev_err_ratelimited(ne_misc_dev.this_device,
1612                                    "Error in anon inode get file [rc=%d]\n", rc);
1613
1614                goto put_fd;
1615        }
1616
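            /* Request a new enclave slot from the NE PCI device. */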
1617        rc = ne_do_request(pdev, SLOT_ALLOC,
1618                           &slot_alloc_req, sizeof(slot_alloc_req),
1619                           &cmd_reply, sizeof(cmd_reply));
1620        if (rc < 0) {
1621                dev_err_ratelimited(ne_misc_dev.this_device,
1622                                    "Error in slot alloc [rc=%d]\n", rc);
1623
1624                goto put_file;
1625        }
1626
1627        init_waitqueue_head(&ne_enclave->eventq);
1628        ne_enclave->has_event = false;
1629        mutex_init(&ne_enclave->enclave_info_mutex);
1630        ne_enclave->max_mem_regions = cmd_reply.mem_regions;
1631        INIT_LIST_HEAD(&ne_enclave->mem_regions_list);
1632        ne_enclave->mm = current->mm;
1633        ne_enclave->slot_uid = cmd_reply.slot_uid;
1634        ne_enclave->state = NE_STATE_INIT;
1635
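            /* Track the new enclave in the per-device enclaves list. */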
1636        list_add(&ne_enclave->enclave_list_entry, &ne_pci_dev->enclaves_list);
1637
1638        if (copy_to_user(slot_uid, &ne_enclave->slot_uid, sizeof(ne_enclave->slot_uid))) {
1639                /*
1640                 * As we're holding the only reference to 'enclave_file', fput()
1641                 * will call ne_enclave_release() which will do a proper cleanup
1642                 * of all so far allocated resources, leaving only the unused fd
1643                 * for us to free.
1644                 */
1645                fput(enclave_file);
1646                put_unused_fd(enclave_fd);
1647
1648                return -EFAULT;
1649        }
1650
1651        fd_install(enclave_fd, enclave_file);
1652
1653        return enclave_fd;
1654
1655put_file:
1656        fput(enclave_file);
1657put_fd:
1658        put_unused_fd(enclave_fd);
1659free_cpumask:
1660        free_cpumask_var(ne_enclave->vcpu_ids);
1661        for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++)
1662                free_cpumask_var(ne_enclave->threads_per_core[i]);
1663        kfree(ne_enclave->threads_per_core);
1664free_ne_enclave:
1665        kfree(ne_enclave);
1666
1667        return rc;
1668}
1669
1670/**
1671 * ne_ioctl() - Ioctl function provided by the NE misc device.
1672 * @file:       File associated with this ioctl function.
1673 * @cmd:        The command that is set for the ioctl call.
1674 * @arg:        The argument that is provided for the ioctl call.
1675 *
1676 * Context: Process context.
1677 * Return:
1678 * * Ioctl result (e.g. enclave file descriptor) on success.
1679 * * Negative return value on failure.
1680 */
1681static long ne_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1682{
1683        switch (cmd) {
1684        case NE_CREATE_VM: {
1685                int enclave_fd = -1;
1686                struct ne_pci_dev *ne_pci_dev = ne_devs.ne_pci_dev;
1687                u64 __user *slot_uid = (void __user *)arg;
1688
1689                mutex_lock(&ne_pci_dev->enclaves_list_mutex);
1690                enclave_fd = ne_create_vm_ioctl(ne_pci_dev, slot_uid);
1691                mutex_unlock(&ne_pci_dev->enclaves_list_mutex);
1692
1693                return enclave_fd;
1694        }
1695
1696        default:
1697                return -ENOTTY;
1698        }
1699
1700        return 0;
1701}
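    /*
     * Illustrative user space usage of the NE_CREATE_VM ioctl; a minimal
     * sketch, assuming the UAPI header <linux/nitro_enclaves.h> and the
     * /dev/nitro_enclaves node created by this misc device:
     *
     *	int ne_dev_fd = open("/dev/nitro_enclaves", O_RDWR | O_CLOEXEC);
     *	__u64 slot_uid = 0;
     *	int enclave_fd = ioctl(ne_dev_fd, NE_CREATE_VM, &slot_uid);
     *
     * On success, enclave_fd refers to the anonymous inode file set up in
     * ne_create_vm_ioctl() and is further used for enclave resource handling
     * and for polling enclave out-of-band events; on failure, a negative
     * value is returned.
     */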
1702
1703static int __init ne_init(void)
1704{
1705        mutex_init(&ne_cpu_pool.mutex);
1706
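            /*
             * The NE misc device is not registered here; it is registered
             * once the NE PCI device is found and set up by the NE PCI
             * device driver registered below.
             */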
1707        return pci_register_driver(&ne_pci_driver);
1708}
1709
1710static void __exit ne_exit(void)
1711{
1712        pci_unregister_driver(&ne_pci_driver);
1713
1714        ne_teardown_cpu_pool();
1715}
1716
1717module_init(ne_init);
1718module_exit(ne_exit);
1719
1720MODULE_AUTHOR("Amazon.com, Inc. or its affiliates");
1721MODULE_DESCRIPTION("Nitro Enclaves Driver");
1722MODULE_LICENSE("GPL v2");
1723