linux/drivers/virt/nitro_enclaves/ne_misc_dev.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright 2020-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
   4 */
   5
   6/**
   7 * DOC: Enclave lifetime management driver for Nitro Enclaves (NE).
   8 * Nitro is a hypervisor that has been developed by Amazon.
   9 */
  10
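    /*
     * A rough sketch of how user space typically drives this driver, for
     * orientation only. Error handling is omitted and the region / start
     * info fields are elided; see samples/nitro_enclaves/ne_ioctl_sample.c
     * in the kernel tree for the complete flow.
     *
     *   int ne_fd = open("/dev/nitro_enclaves", O_RDWR | O_CLOEXEC);
     *   __u64 slot_uid = 0;
     *   int enclave_fd = ioctl(ne_fd, NE_CREATE_VM, &slot_uid);
     *
     *   struct ne_image_load_info load_info = { .flags = NE_EIF_IMAGE };
     *   ioctl(enclave_fd, NE_GET_IMAGE_LOAD_INFO, &load_info);
     *   (copy the EIF image at load_info.memory_offset in enclave memory)
     *
     *   struct ne_user_memory_region region = { ... };
     *   ioctl(enclave_fd, NE_SET_USER_MEMORY_REGION, &region);
     *
     *   __u32 vcpu_id = 0;
     *   ioctl(enclave_fd, NE_ADD_VCPU, &vcpu_id);
     *
     *   struct ne_enclave_start_info start_info = { ... };
     *   ioctl(enclave_fd, NE_START_ENCLAVE, &start_info);
     */
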
  11#include <linux/anon_inodes.h>
  12#include <linux/capability.h>
  13#include <linux/cpu.h>
  14#include <linux/device.h>
  15#include <linux/file.h>
  16#include <linux/hugetlb.h>
  17#include <linux/limits.h>
  18#include <linux/list.h>
  19#include <linux/miscdevice.h>
  20#include <linux/mm.h>
  21#include <linux/mman.h>
  22#include <linux/module.h>
  23#include <linux/mutex.h>
  24#include <linux/nitro_enclaves.h>
  25#include <linux/pci.h>
  26#include <linux/poll.h>
  27#include <linux/slab.h>
  28#include <linux/types.h>
  29#include <uapi/linux/vm_sockets.h>
  30
  31#include "ne_misc_dev.h"
  32#include "ne_pci_dev.h"
  33
  34/**
  35 * NE_CPUS_SIZE - Size for max 128 CPUs, for now, in a cpu-list string, comma
  36 *                separated. The NE CPU pool includes CPUs from a single NUMA
  37 *                node.
  38 */
  39#define NE_CPUS_SIZE            (512)
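    /* Roughly 128 entries of at most "ddd," (4 bytes) each: 128 * 4 = 512 bytes. */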
  40
  41/**
  42 * NE_EIF_LOAD_OFFSET - The offset where to copy the Enclave Image Format (EIF)
  43 *                      image in enclave memory.
  44 */
  45#define NE_EIF_LOAD_OFFSET      (8 * 1024UL * 1024UL)
  46
  47/**
  48 * NE_MIN_ENCLAVE_MEM_SIZE - The minimum memory size an enclave can be launched
  49 *                           with.
  50 */
  51#define NE_MIN_ENCLAVE_MEM_SIZE (64 * 1024UL * 1024UL)
  52
  53/**
  54 * NE_MIN_MEM_REGION_SIZE - The minimum size of an enclave memory region.
  55 */
  56#define NE_MIN_MEM_REGION_SIZE  (2 * 1024UL * 1024UL)
  57
  58/**
  59 * NE_PARENT_VM_CID - The CID for the vsock device of the primary / parent VM.
  60 */
  61#define NE_PARENT_VM_CID        (3)
  62
  63static long ne_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
  64
  65static const struct file_operations ne_fops = {
  66        .owner          = THIS_MODULE,
  67        .llseek         = noop_llseek,
  68        .unlocked_ioctl = ne_ioctl,
  69};
  70
  71static struct miscdevice ne_misc_dev = {
  72        .minor  = MISC_DYNAMIC_MINOR,
  73        .name   = "nitro_enclaves",
  74        .fops   = &ne_fops,
  75        .mode   = 0660,
  76};
  77
  78struct ne_devs ne_devs = {
  79        .ne_misc_dev    = &ne_misc_dev,
  80};
  81
  82/*
  83 * TODO: Update logic to create new sysfs entries instead of using
  84 * a kernel parameter e.g. if multiple sysfs files needed.
  85 */
  86static int ne_set_kernel_param(const char *val, const struct kernel_param *kp);
  87
  88static const struct kernel_param_ops ne_cpu_pool_ops = {
  89        .get    = param_get_string,
  90        .set    = ne_set_kernel_param,
  91};
  92
  93static char ne_cpus[NE_CPUS_SIZE];
  94static struct kparam_string ne_cpus_arg = {
  95        .maxlen = sizeof(ne_cpus),
  96        .string = ne_cpus,
  97};
  98
  99module_param_cb(ne_cpus, &ne_cpu_pool_ops, &ne_cpus_arg, 0644);
 100/* https://www.kernel.org/doc/html/latest/admin-guide/kernel-parameters.html#cpu-lists */
 101MODULE_PARM_DESC(ne_cpus, "<cpu-list> - CPU pool used for Nitro Enclaves");
 102
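    /*
     * A minimal usage sketch, assuming CPUs 2 and 3 are the two hyperthreads
     * of one full core on the running instance (CPU 0 and its siblings must
     * stay with the primary / parent VM):
     *
     *   - at boot time:  nitro_enclaves.ne_cpus=2-3
     *   - at run time:   echo 2-3 > /sys/module/nitro_enclaves/parameters/ne_cpus
     *
     * Both forms use the standard cpu-list syntax linked above.
     */
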
 103/**
 104 * struct ne_cpu_pool - CPU pool used for Nitro Enclaves.
 105 * @avail_threads_per_core:     Available full CPU cores to be dedicated to
 106 *                              enclave(s). The cpumasks from the array, indexed
 107 *                              by core id, contain all the threads from the
 108 *                              available cores, that are not set for created
 109 *                              enclave(s). The full CPU cores are part of the
 110 *                              NE CPU pool.
 111 * @mutex:                      Mutex for the access to the NE CPU pool.
 112 * @nr_parent_vm_cores :        The size of the available threads per core array.
 113 *                              The total number of CPU cores available on the
 114 *                              primary / parent VM.
 115 * @nr_threads_per_core:        The number of threads that a full CPU core has.
 116 * @numa_node:                  NUMA node of the CPUs in the pool.
 117 */
 118struct ne_cpu_pool {
 119        cpumask_var_t   *avail_threads_per_core;
 120        struct mutex    mutex;
 121        unsigned int    nr_parent_vm_cores;
 122        unsigned int    nr_threads_per_core;
 123        int             numa_node;
 124};
 125
 126static struct ne_cpu_pool ne_cpu_pool;
 127
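    /*
     * Layout sketch with hypothetical numbers: given ne_cpus=2-3 on an
     * instance with 2 threads per core, where CPUs 2 and 3 are the siblings
     * of core id 1, ne_setup_cpu_pool() ends up with nr_threads_per_core = 2
     * and avail_threads_per_core[1] = { 2, 3 }. The per-core mask is emptied
     * when that core is given to an enclave and refilled when the enclave
     * file descriptor is released.
     */
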
 128/**
 129 * ne_check_enclaves_created() - Verify if at least one enclave has been created.
 130 * @void:       No parameters provided.
 131 *
 132 * Context: Process context.
 133 * Return:
 134 * * True if at least one enclave is created.
 135 * * False otherwise.
 136 */
 137static bool ne_check_enclaves_created(void)
 138{
 139        struct ne_pci_dev *ne_pci_dev = ne_devs.ne_pci_dev;
 140        bool ret = false;
 141
 142        if (!ne_pci_dev)
 143                return ret;
 144
 145        mutex_lock(&ne_pci_dev->enclaves_list_mutex);
 146
 147        if (!list_empty(&ne_pci_dev->enclaves_list))
 148                ret = true;
 149
 150        mutex_unlock(&ne_pci_dev->enclaves_list_mutex);
 151
 152        return ret;
 153}
 154
 155/**
 156 * ne_setup_cpu_pool() - Set the NE CPU pool after handling sanity checks such
 157 *                       as not sharing CPU cores with the primary / parent VM
 158 *                       or not using CPU 0, which should remain available for
 159 *                       the primary / parent VM. Offline the CPUs from the
 160 *                       pool after the checks passed.
 161 * @ne_cpu_list:        The CPU list used for setting NE CPU pool.
 162 *
 163 * Context: Process context.
 164 * Return:
 165 * * 0 on success.
 166 * * Negative return value on failure.
 167 */
 168static int ne_setup_cpu_pool(const char *ne_cpu_list)
 169{
 170        int core_id = -1;
 171        unsigned int cpu = 0;
 172        cpumask_var_t cpu_pool;
 173        unsigned int cpu_sibling = 0;
 174        unsigned int i = 0;
 175        int numa_node = -1;
 176        int rc = -EINVAL;
 177
 178        if (!zalloc_cpumask_var(&cpu_pool, GFP_KERNEL))
 179                return -ENOMEM;
 180
 181        mutex_lock(&ne_cpu_pool.mutex);
 182
 183        rc = cpulist_parse(ne_cpu_list, cpu_pool);
 184        if (rc < 0) {
 185                pr_err("%s: Error in cpulist parse [rc=%d]\n", ne_misc_dev.name, rc);
 186
 187                goto free_pool_cpumask;
 188        }
 189
 190        cpu = cpumask_any(cpu_pool);
 191        if (cpu >= nr_cpu_ids) {
 192                pr_err("%s: No CPUs available in CPU pool\n", ne_misc_dev.name);
 193
 194                rc = -EINVAL;
 195
 196                goto free_pool_cpumask;
 197        }
 198
 199        /*
 200         * Check if the CPUs are online, to further get info about them
 201         * e.g. numa node, core id, siblings.
 202         */
 203        for_each_cpu(cpu, cpu_pool)
 204                if (cpu_is_offline(cpu)) {
 205                        pr_err("%s: CPU %d is offline, has to be online to get its metadata\n",
 206                               ne_misc_dev.name, cpu);
 207
 208                        rc = -EINVAL;
 209
 210                        goto free_pool_cpumask;
 211                }
 212
 213        /*
 214         * Check if the CPUs from the NE CPU pool are from the same NUMA node.
 215         */
 216        for_each_cpu(cpu, cpu_pool)
 217                if (numa_node < 0) {
 218                        numa_node = cpu_to_node(cpu);
 219                        if (numa_node < 0) {
 220                                pr_err("%s: Invalid NUMA node %d\n",
 221                                       ne_misc_dev.name, numa_node);
 222
 223                                rc = -EINVAL;
 224
 225                                goto free_pool_cpumask;
 226                        }
 227                } else {
 228                        if (numa_node != cpu_to_node(cpu)) {
 229                                pr_err("%s: CPUs with different NUMA nodes\n",
 230                                       ne_misc_dev.name);
 231
 232                                rc = -EINVAL;
 233
 234                                goto free_pool_cpumask;
 235                        }
 236                }
 237
 238        /*
 239         * Check if CPU 0 and its siblings are included in the provided CPU pool.
 240         * They should remain available for the primary / parent VM.
 241         */
 242        if (cpumask_test_cpu(0, cpu_pool)) {
 243                pr_err("%s: CPU 0 has to remain available\n", ne_misc_dev.name);
 244
 245                rc = -EINVAL;
 246
 247                goto free_pool_cpumask;
 248        }
 249
 250        for_each_cpu(cpu_sibling, topology_sibling_cpumask(0)) {
 251                if (cpumask_test_cpu(cpu_sibling, cpu_pool)) {
 252                        pr_err("%s: CPU sibling %d for CPU 0 is in CPU pool\n",
 253                               ne_misc_dev.name, cpu_sibling);
 254
 255                        rc = -EINVAL;
 256
 257                        goto free_pool_cpumask;
 258                }
 259        }
 260
 261        /*
 262         * Check if CPU siblings are included in the provided CPU pool. The
 263         * expectation is that full CPU cores are made available in the CPU pool
 264         * for enclaves.
 265         */
 266        for_each_cpu(cpu, cpu_pool) {
 267                for_each_cpu(cpu_sibling, topology_sibling_cpumask(cpu)) {
 268                        if (!cpumask_test_cpu(cpu_sibling, cpu_pool)) {
 269                                pr_err("%s: CPU %d is not in CPU pool\n",
 270                                       ne_misc_dev.name, cpu_sibling);
 271
 272                                rc = -EINVAL;
 273
 274                                goto free_pool_cpumask;
 275                        }
 276                }
 277        }
 278
 279        /* Calculate the number of threads from a full CPU core. */
 280        cpu = cpumask_any(cpu_pool);
 281        for_each_cpu(cpu_sibling, topology_sibling_cpumask(cpu))
 282                ne_cpu_pool.nr_threads_per_core++;
 283
 284        ne_cpu_pool.nr_parent_vm_cores = nr_cpu_ids / ne_cpu_pool.nr_threads_per_core;
 285
 286        ne_cpu_pool.avail_threads_per_core = kcalloc(ne_cpu_pool.nr_parent_vm_cores,
 287                                                     sizeof(*ne_cpu_pool.avail_threads_per_core),
 288                                                     GFP_KERNEL);
 289        if (!ne_cpu_pool.avail_threads_per_core) {
 290                rc = -ENOMEM;
 291
 292                goto free_pool_cpumask;
 293        }
 294
 295        for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++)
 296                if (!zalloc_cpumask_var(&ne_cpu_pool.avail_threads_per_core[i], GFP_KERNEL)) {
 297                        rc = -ENOMEM;
 298
 299                        goto free_cores_cpumask;
 300                }
 301
 302        /*
 303         * Split the NE CPU pool in threads per core to keep the CPU topology
 304         * after offlining the CPUs.
 305         */
 306        for_each_cpu(cpu, cpu_pool) {
 307                core_id = topology_core_id(cpu);
 308                if (core_id < 0 || core_id >= ne_cpu_pool.nr_parent_vm_cores) {
 309                        pr_err("%s: Invalid core id %d for CPU %d\n",
 310                               ne_misc_dev.name, core_id, cpu);
 311
 312                        rc = -EINVAL;
 313
 314                        goto clear_cpumask;
 315                }
 316
 317                cpumask_set_cpu(cpu, ne_cpu_pool.avail_threads_per_core[core_id]);
 318        }
 319
 320        /*
 321         * CPUs that are given to enclave(s) should not be considered online
 322         * by Linux anymore, as the hypervisor will degrade them to floating.
 323         * The physical CPUs (full cores) are carved out of the primary / parent
 324         * VM and given to the enclave VM. The same number of vCPUs would run
 325         * on less pCPUs for the primary / parent VM.
 326         *
 327         * We offline them here, to not degrade performance and expose correct
 328         * topology to Linux and user space.
 329         */
 330        for_each_cpu(cpu, cpu_pool) {
 331                rc = remove_cpu(cpu);
 332                if (rc != 0) {
 333                        pr_err("%s: CPU %d is not offlined [rc=%d]\n",
 334                               ne_misc_dev.name, cpu, rc);
 335
 336                        goto online_cpus;
 337                }
 338        }
 339
 340        free_cpumask_var(cpu_pool);
 341
 342        ne_cpu_pool.numa_node = numa_node;
 343
 344        mutex_unlock(&ne_cpu_pool.mutex);
 345
 346        return 0;
 347
 348online_cpus:
 349        for_each_cpu(cpu, cpu_pool)
 350                add_cpu(cpu);
 351clear_cpumask:
 352        for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++)
 353                cpumask_clear(ne_cpu_pool.avail_threads_per_core[i]);
 354free_cores_cpumask:
 355        for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++)
 356                free_cpumask_var(ne_cpu_pool.avail_threads_per_core[i]);
 357        kfree(ne_cpu_pool.avail_threads_per_core);
 358free_pool_cpumask:
 359        free_cpumask_var(cpu_pool);
 360        ne_cpu_pool.nr_parent_vm_cores = 0;
 361        ne_cpu_pool.nr_threads_per_core = 0;
 362        ne_cpu_pool.numa_node = -1;
 363        mutex_unlock(&ne_cpu_pool.mutex);
 364
 365        return rc;
 366}
 367
 368/**
 369 * ne_teardown_cpu_pool() - Online the CPUs from the NE CPU pool and cleanup the
 370 *                          CPU pool.
 371 * @void:       No parameters provided.
 372 *
 373 * Context: Process context.
 374 */
 375static void ne_teardown_cpu_pool(void)
 376{
 377        unsigned int cpu = 0;
 378        unsigned int i = 0;
 379        int rc = -EINVAL;
 380
 381        mutex_lock(&ne_cpu_pool.mutex);
 382
 383        if (!ne_cpu_pool.nr_parent_vm_cores) {
 384                mutex_unlock(&ne_cpu_pool.mutex);
 385
 386                return;
 387        }
 388
 389        for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++) {
 390                for_each_cpu(cpu, ne_cpu_pool.avail_threads_per_core[i]) {
 391                        rc = add_cpu(cpu);
 392                        if (rc != 0)
 393                                pr_err("%s: CPU %d is not onlined [rc=%d]\n",
 394                                       ne_misc_dev.name, cpu, rc);
 395                }
 396
 397                cpumask_clear(ne_cpu_pool.avail_threads_per_core[i]);
 398
 399                free_cpumask_var(ne_cpu_pool.avail_threads_per_core[i]);
 400        }
 401
 402        kfree(ne_cpu_pool.avail_threads_per_core);
 403        ne_cpu_pool.nr_parent_vm_cores = 0;
 404        ne_cpu_pool.nr_threads_per_core = 0;
 405        ne_cpu_pool.numa_node = -1;
 406
 407        mutex_unlock(&ne_cpu_pool.mutex);
 408}
 409
 410/**
 411 * ne_set_kernel_param() - Set the NE CPU pool value via the NE kernel parameter.
 412 * @val:        NE CPU pool string value.
 413 * @kp :        NE kernel parameter associated with the NE CPU pool.
 414 *
 415 * Context: Process context.
 416 * Return:
 417 * * 0 on success.
 418 * * Negative return value on failure.
 419 */
 420static int ne_set_kernel_param(const char *val, const struct kernel_param *kp)
 421{
 422        char error_val[] = "";
 423        int rc = -EINVAL;
 424
 425        if (!capable(CAP_SYS_ADMIN))
 426                return -EPERM;
 427
 428        if (ne_check_enclaves_created()) {
 429                pr_err("%s: The CPU pool is used by enclave(s)\n", ne_misc_dev.name);
 430
 431                return -EPERM;
 432        }
 433
 434        ne_teardown_cpu_pool();
 435
 436        rc = ne_setup_cpu_pool(val);
 437        if (rc < 0) {
 438                pr_err("%s: Error in setup CPU pool [rc=%d]\n", ne_misc_dev.name, rc);
 439
 440                param_set_copystring(error_val, kp);
 441
 442                return rc;
 443        }
 444
 445        rc = param_set_copystring(val, kp);
 446        if (rc < 0) {
 447                pr_err("%s: Error in param set copystring [rc=%d]\n", ne_misc_dev.name, rc);
 448
 449                ne_teardown_cpu_pool();
 450
 451                param_set_copystring(error_val, kp);
 452
 453                return rc;
 454        }
 455
 456        return 0;
 457}
 458
 459/**
 460 * ne_donated_cpu() - Check if the provided CPU is already used by the enclave.
 461 * @ne_enclave :        Private data associated with the current enclave.
 462 * @cpu:                CPU to check if already used.
 463 *
 464 * Context: Process context. This function is called with the ne_enclave mutex held.
 465 * Return:
 466 * * True if the provided CPU is already used by the enclave.
 467 * * False otherwise.
 468 */
 469static bool ne_donated_cpu(struct ne_enclave *ne_enclave, unsigned int cpu)
 470{
 471        if (cpumask_test_cpu(cpu, ne_enclave->vcpu_ids))
 472                return true;
 473
 474        return false;
 475}
 476
 477/**
 478 * ne_get_unused_core_from_cpu_pool() - Get the id of a full core from the
 479 *                                      NE CPU pool.
 480 * @void:       No parameters provided.
 481 *
 482 * Context: Process context. This function is called with the ne_enclave and
 483 *          ne_cpu_pool mutexes held.
 484 * Return:
 485 * * Core id.
 486 * * -1 if no CPU core available in the pool.
 487 */
 488static int ne_get_unused_core_from_cpu_pool(void)
 489{
 490        int core_id = -1;
 491        unsigned int i = 0;
 492
 493        for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++)
 494                if (!cpumask_empty(ne_cpu_pool.avail_threads_per_core[i])) {
 495                        core_id = i;
 496
 497                        break;
 498                }
 499
 500        return core_id;
 501}
 502
 503/**
 504 * ne_set_enclave_threads_per_core() - Set the threads of the provided core in
 505 *                                     the enclave data structure.
 506 * @ne_enclave :        Private data associated with the current enclave.
 507 * @core_id:            Core id to get its threads from the NE CPU pool.
 508 * @vcpu_id:            vCPU id part of the provided core.
 509 *
 510 * Context: Process context. This function is called with the ne_enclave and
 511 *          ne_cpu_pool mutexes held.
 512 * Return:
 513 * * 0 on success.
 514 * * Negative return value on failure.
 515 */
 516static int ne_set_enclave_threads_per_core(struct ne_enclave *ne_enclave,
 517                                           int core_id, u32 vcpu_id)
 518{
 519        unsigned int cpu = 0;
 520
 521        if (core_id < 0 && vcpu_id == 0) {
 522                dev_err_ratelimited(ne_misc_dev.this_device,
 523                                    "No CPUs available in NE CPU pool\n");
 524
 525                return -NE_ERR_NO_CPUS_AVAIL_IN_POOL;
 526        }
 527
 528        if (core_id < 0) {
 529                dev_err_ratelimited(ne_misc_dev.this_device,
 530                                    "CPU %d is not in NE CPU pool\n", vcpu_id);
 531
 532                return -NE_ERR_VCPU_NOT_IN_CPU_POOL;
 533        }
 534
 535        if (core_id >= ne_enclave->nr_parent_vm_cores) {
 536                dev_err_ratelimited(ne_misc_dev.this_device,
 537                                    "Invalid core id %d - ne_enclave\n", core_id);
 538
 539                return -NE_ERR_VCPU_INVALID_CPU_CORE;
 540        }
 541
 542        for_each_cpu(cpu, ne_cpu_pool.avail_threads_per_core[core_id])
 543                cpumask_set_cpu(cpu, ne_enclave->threads_per_core[core_id]);
 544
 545        cpumask_clear(ne_cpu_pool.avail_threads_per_core[core_id]);
 546
 547        return 0;
 548}
 549
 550/**
 551 * ne_get_cpu_from_cpu_pool() - Get a CPU from the NE CPU pool, either from the
 552 *                              remaining sibling(s) of a CPU core or the first
 553 *                              sibling of a new CPU core.
 554 * @ne_enclave :        Private data associated with the current enclave.
 555 * @vcpu_id:            vCPU to get from the NE CPU pool.
 556 *
 557 * Context: Process context. This function is called with the ne_enclave mutex held.
 558 * Return:
 559 * * 0 on success.
 560 * * Negative return value on failure.
 561 */
 562static int ne_get_cpu_from_cpu_pool(struct ne_enclave *ne_enclave, u32 *vcpu_id)
 563{
 564        int core_id = -1;
 565        unsigned int cpu = 0;
 566        unsigned int i = 0;
 567        int rc = -EINVAL;
 568
 569        /*
 570         * If previously allocated a thread of a core to this enclave, first
 571         * check remaining sibling(s) for new CPU allocations, so that full
 572         * CPU cores are used for the enclave.
 573         */
 574        for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++)
 575                for_each_cpu(cpu, ne_enclave->threads_per_core[i])
 576                        if (!ne_donated_cpu(ne_enclave, cpu)) {
 577                                *vcpu_id = cpu;
 578
 579                                return 0;
 580                        }
 581
 582        mutex_lock(&ne_cpu_pool.mutex);
 583
 584        /*
 585         * If no remaining siblings, get a core from the NE CPU pool and keep
 586         * track of all the threads in the enclave threads per core data structure.
 587         */
 588        core_id = ne_get_unused_core_from_cpu_pool();
 589
 590        rc = ne_set_enclave_threads_per_core(ne_enclave, core_id, *vcpu_id);
 591        if (rc < 0)
 592                goto unlock_mutex;
 593
 594        *vcpu_id = cpumask_any(ne_enclave->threads_per_core[core_id]);
 595
 596        rc = 0;
 597
 598unlock_mutex:
 599        mutex_unlock(&ne_cpu_pool.mutex);
 600
 601        return rc;
 602}
 603
 604/**
 605 * ne_get_vcpu_core_from_cpu_pool() - Get from the NE CPU pool the id of the
 606 *                                    core associated with the provided vCPU.
 607 * @vcpu_id:    Provided vCPU id to get its associated core id.
 608 *
 609 * Context: Process context. This function is called with the ne_enclave and
 610 *          ne_cpu_pool mutexes held.
 611 * Return:
 612 * * Core id.
 613 * * -1 if the provided vCPU is not in the pool.
 614 */
 615static int ne_get_vcpu_core_from_cpu_pool(u32 vcpu_id)
 616{
 617        int core_id = -1;
 618        unsigned int i = 0;
 619
 620        for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++)
 621                if (cpumask_test_cpu(vcpu_id, ne_cpu_pool.avail_threads_per_core[i])) {
 622                        core_id = i;
 623
 624                        break;
 625                }
 626
 627        return core_id;
 628}
 629
 630/**
 631 * ne_check_cpu_in_cpu_pool() - Check if the given vCPU is in the available CPUs
 632 *                              from the pool.
 633 * @ne_enclave :        Private data associated with the current enclave.
 634 * @vcpu_id:            ID of the vCPU to check if available in the NE CPU pool.
 635 *
 636 * Context: Process context. This function is called with the ne_enclave mutex held.
 637 * Return:
 638 * * 0 on success.
 639 * * Negative return value on failure.
 640 */
 641static int ne_check_cpu_in_cpu_pool(struct ne_enclave *ne_enclave, u32 vcpu_id)
 642{
 643        int core_id = -1;
 644        unsigned int i = 0;
 645        int rc = -EINVAL;
 646
 647        if (ne_donated_cpu(ne_enclave, vcpu_id)) {
 648                dev_err_ratelimited(ne_misc_dev.this_device,
 649                                    "CPU %d already used\n", vcpu_id);
 650
 651                return -NE_ERR_VCPU_ALREADY_USED;
 652        }
 653
 654        /*
 655         * If previously allocated a thread of a core to this enclave, but not
 656         * the full core, first check remaining sibling(s).
 657         */
 658        for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++)
 659                if (cpumask_test_cpu(vcpu_id, ne_enclave->threads_per_core[i]))
 660                        return 0;
 661
 662        mutex_lock(&ne_cpu_pool.mutex);
 663
 664        /*
 665         * If no remaining siblings, get from the NE CPU pool the core
 666         * associated with the vCPU and keep track of all the threads in the
 667         * enclave threads per core data structure.
 668         */
 669        core_id = ne_get_vcpu_core_from_cpu_pool(vcpu_id);
 670
 671        rc = ne_set_enclave_threads_per_core(ne_enclave, core_id, vcpu_id);
 672        if (rc < 0)
 673                goto unlock_mutex;
 674
 675        rc = 0;
 676
 677unlock_mutex:
 678        mutex_unlock(&ne_cpu_pool.mutex);
 679
 680        return rc;
 681}
 682
 683/**
 684 * ne_add_vcpu_ioctl() - Add a vCPU to the slot associated with the current
 685 *                       enclave.
 686 * @ne_enclave :        Private data associated with the current enclave.
 687 * @vcpu_id:            ID of the CPU to be associated with the given slot,
 688 *                      apic id on x86.
 689 *
 690 * Context: Process context. This function is called with the ne_enclave mutex held.
 691 * Return:
 692 * * 0 on success.
 693 * * Negative return value on failure.
 694 */
 695static int ne_add_vcpu_ioctl(struct ne_enclave *ne_enclave, u32 vcpu_id)
 696{
 697        struct ne_pci_dev_cmd_reply cmd_reply = {};
 698        struct pci_dev *pdev = ne_devs.ne_pci_dev->pdev;
 699        int rc = -EINVAL;
 700        struct slot_add_vcpu_req slot_add_vcpu_req = {};
 701
 702        if (ne_enclave->mm != current->mm)
 703                return -EIO;
 704
 705        slot_add_vcpu_req.slot_uid = ne_enclave->slot_uid;
 706        slot_add_vcpu_req.vcpu_id = vcpu_id;
 707
 708        rc = ne_do_request(pdev, SLOT_ADD_VCPU,
 709                           &slot_add_vcpu_req, sizeof(slot_add_vcpu_req),
 710                           &cmd_reply, sizeof(cmd_reply));
 711        if (rc < 0) {
 712                dev_err_ratelimited(ne_misc_dev.this_device,
 713                                    "Error in slot add vCPU [rc=%d]\n", rc);
 714
 715                return rc;
 716        }
 717
 718        cpumask_set_cpu(vcpu_id, ne_enclave->vcpu_ids);
 719
 720        ne_enclave->nr_vcpus++;
 721
 722        return 0;
 723}
 724
 725/**
 726 * ne_sanity_check_user_mem_region() - Sanity check the user space memory
 727 *                                     region received during the set user
 728 *                                     memory region ioctl call.
 729 * @ne_enclave :        Private data associated with the current enclave.
 730 * @mem_region :        User space memory region to be sanity checked.
 731 *
 732 * Context: Process context. This function is called with the ne_enclave mutex held.
 733 * Return:
 734 * * 0 on success.
 735 * * Negative return value on failure.
 736 */
 737static int ne_sanity_check_user_mem_region(struct ne_enclave *ne_enclave,
 738                                           struct ne_user_memory_region mem_region)
 739{
 740        struct ne_mem_region *ne_mem_region = NULL;
 741
 742        if (ne_enclave->mm != current->mm)
 743                return -EIO;
 744
 745        if (mem_region.memory_size & (NE_MIN_MEM_REGION_SIZE - 1)) {
 746                dev_err_ratelimited(ne_misc_dev.this_device,
 747                                    "User space memory size is not multiple of 2 MiB\n");
 748
 749                return -NE_ERR_INVALID_MEM_REGION_SIZE;
 750        }
 751
 752        if (!IS_ALIGNED(mem_region.userspace_addr, NE_MIN_MEM_REGION_SIZE)) {
 753                dev_err_ratelimited(ne_misc_dev.this_device,
 754                                    "User space address is not 2 MiB aligned\n");
 755
 756                return -NE_ERR_UNALIGNED_MEM_REGION_ADDR;
 757        }
 758
 759        if ((mem_region.userspace_addr & (NE_MIN_MEM_REGION_SIZE - 1)) ||
 760            !access_ok((void __user *)(unsigned long)mem_region.userspace_addr,
 761                       mem_region.memory_size)) {
 762                dev_err_ratelimited(ne_misc_dev.this_device,
 763                                    "Invalid user space address range\n");
 764
 765                return -NE_ERR_INVALID_MEM_REGION_ADDR;
 766        }
 767
 768        list_for_each_entry(ne_mem_region, &ne_enclave->mem_regions_list,
 769                            mem_region_list_entry) {
 770                u64 memory_size = ne_mem_region->memory_size;
 771                u64 userspace_addr = ne_mem_region->userspace_addr;
 772
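                    /*
                     * Reject the new region if it overlaps an already
                     * registered one: either its start falls inside an
                     * existing region or an existing region's start falls
                     * inside the new one.
                     */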
 773                if ((userspace_addr <= mem_region.userspace_addr &&
 774                     mem_region.userspace_addr < (userspace_addr + memory_size)) ||
 775                    (mem_region.userspace_addr <= userspace_addr &&
 776                    (mem_region.userspace_addr + mem_region.memory_size) > userspace_addr)) {
 777                        dev_err_ratelimited(ne_misc_dev.this_device,
 778                                            "User space memory region already used\n");
 779
 780                        return -NE_ERR_MEM_REGION_ALREADY_USED;
 781                }
 782        }
 783
 784        return 0;
 785}
 786
 787/**
 788 * ne_sanity_check_user_mem_region_page() - Sanity check a page from the user space
 789 *                                          memory region received during the set
 790 *                                          user memory region ioctl call.
 791 * @ne_enclave :        Private data associated with the current enclave.
 792 * @mem_region_page:    Page from the user space memory region to be sanity checked.
 793 *
 794 * Context: Process context. This function is called with the ne_enclave mutex held.
 795 * Return:
 796 * * 0 on success.
 797 * * Negative return value on failure.
 798 */
 799static int ne_sanity_check_user_mem_region_page(struct ne_enclave *ne_enclave,
 800                                                struct page *mem_region_page)
 801{
 802        if (!PageHuge(mem_region_page)) {
 803                dev_err_ratelimited(ne_misc_dev.this_device,
 804                                    "Not a hugetlbfs page\n");
 805
 806                return -NE_ERR_MEM_NOT_HUGE_PAGE;
 807        }
 808
 809        if (page_size(mem_region_page) & (NE_MIN_MEM_REGION_SIZE - 1)) {
 810                dev_err_ratelimited(ne_misc_dev.this_device,
 811                                    "Page size not multiple of 2 MiB\n");
 812
 813                return -NE_ERR_INVALID_PAGE_SIZE;
 814        }
 815
 816        if (ne_enclave->numa_node != page_to_nid(mem_region_page)) {
 817                dev_err_ratelimited(ne_misc_dev.this_device,
 818                                    "Page is not from NUMA node %d\n",
 819                                    ne_enclave->numa_node);
 820
 821                return -NE_ERR_MEM_DIFFERENT_NUMA_NODE;
 822        }
 823
 824        return 0;
 825}
 826
 827/**
 828 * ne_set_user_memory_region_ioctl() - Add user space memory region to the slot
 829 *                                     associated with the current enclave.
 830 * @ne_enclave :        Private data associated with the current enclave.
 831 * @mem_region :        User space memory region to be associated with the given slot.
 832 *
 833 * Context: Process context. This function is called with the ne_enclave mutex held.
 834 * Return:
 835 * * 0 on success.
 836 * * Negative return value on failure.
 837 */
 838static int ne_set_user_memory_region_ioctl(struct ne_enclave *ne_enclave,
 839                                           struct ne_user_memory_region mem_region)
 840{
 841        long gup_rc = 0;
 842        unsigned long i = 0;
 843        unsigned long max_nr_pages = 0;
 844        unsigned long memory_size = 0;
 845        struct ne_mem_region *ne_mem_region = NULL;
 846        unsigned long nr_phys_contig_mem_regions = 0;
 847        struct pci_dev *pdev = ne_devs.ne_pci_dev->pdev;
 848        struct page **phys_contig_mem_regions = NULL;
 849        int rc = -EINVAL;
 850
 851        rc = ne_sanity_check_user_mem_region(ne_enclave, mem_region);
 852        if (rc < 0)
 853                return rc;
 854
 855        ne_mem_region = kzalloc(sizeof(*ne_mem_region), GFP_KERNEL);
 856        if (!ne_mem_region)
 857                return -ENOMEM;
 858
 859        max_nr_pages = mem_region.memory_size / NE_MIN_MEM_REGION_SIZE;
 860
 861        ne_mem_region->pages = kcalloc(max_nr_pages, sizeof(*ne_mem_region->pages),
 862                                       GFP_KERNEL);
 863        if (!ne_mem_region->pages) {
 864                rc = -ENOMEM;
 865
 866                goto free_mem_region;
 867        }
 868
 869        phys_contig_mem_regions = kcalloc(max_nr_pages, sizeof(*phys_contig_mem_regions),
 870                                          GFP_KERNEL);
 871        if (!phys_contig_mem_regions) {
 872                rc = -ENOMEM;
 873
 874                goto free_mem_region;
 875        }
 876
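            /*
             * Pin the user region in huge-page-sized steps: each iteration
             * below gets one hugetlbfs page and memory_size advances by that
             * page's size, until the whole user-provided size is covered.
             */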
 877        do {
 878                i = ne_mem_region->nr_pages;
 879
 880                if (i == max_nr_pages) {
 881                        dev_err_ratelimited(ne_misc_dev.this_device,
 882                                            "Reached max nr of pages in the pages data struct\n");
 883
 884                        rc = -ENOMEM;
 885
 886                        goto put_pages;
 887                }
 888
 889                gup_rc = get_user_pages_unlocked(mem_region.userspace_addr + memory_size, 1,
 890                                                 ne_mem_region->pages + i, FOLL_GET);
 891
 892                if (gup_rc < 0) {
 893                        rc = gup_rc;
 894
 895                        dev_err_ratelimited(ne_misc_dev.this_device,
 896                                            "Error in get user pages [rc=%d]\n", rc);
 897
 898                        goto put_pages;
 899                }
 900
 901                rc = ne_sanity_check_user_mem_region_page(ne_enclave, ne_mem_region->pages[i]);
 902                if (rc < 0)
 903                        goto put_pages;
 904
 905                /*
 906                 * TODO: Update once handled non-contiguous memory regions
 907                 * received from user space or contiguous physical memory regions
 908                 * larger than 2 MiB e.g. 8 MiB.
 909                 */
 910                phys_contig_mem_regions[i] = ne_mem_region->pages[i];
 911
 912                memory_size += page_size(ne_mem_region->pages[i]);
 913
 914                ne_mem_region->nr_pages++;
 915        } while (memory_size < mem_region.memory_size);
 916
 917        /*
 918         * TODO: Update once handled non-contiguous memory regions received
 919         * from user space or contiguous physical memory regions larger than
 920         * 2 MiB e.g. 8 MiB.
 921         */
 922        nr_phys_contig_mem_regions = ne_mem_region->nr_pages;
 923
 924        if ((ne_enclave->nr_mem_regions + nr_phys_contig_mem_regions) >
 925            ne_enclave->max_mem_regions) {
 926                dev_err_ratelimited(ne_misc_dev.this_device,
 927                                    "Reached max memory regions %lld\n",
 928                                    ne_enclave->max_mem_regions);
 929
 930                rc = -NE_ERR_MEM_MAX_REGIONS;
 931
 932                goto put_pages;
 933        }
 934
 935        for (i = 0; i < nr_phys_contig_mem_regions; i++) {
 936                u64 phys_region_addr = page_to_phys(phys_contig_mem_regions[i]);
 937                u64 phys_region_size = page_size(phys_contig_mem_regions[i]);
 938
 939                if (phys_region_size & (NE_MIN_MEM_REGION_SIZE - 1)) {
 940                        dev_err_ratelimited(ne_misc_dev.this_device,
 941                                            "Physical mem region size is not multiple of 2 MiB\n");
 942
 943                        rc = -EINVAL;
 944
 945                        goto put_pages;
 946                }
 947
 948                if (!IS_ALIGNED(phys_region_addr, NE_MIN_MEM_REGION_SIZE)) {
 949                        dev_err_ratelimited(ne_misc_dev.this_device,
 950                                            "Physical mem region address is not 2 MiB aligned\n");
 951
 952                        rc = -EINVAL;
 953
 954                        goto put_pages;
 955                }
 956        }
 957
 958        ne_mem_region->memory_size = mem_region.memory_size;
 959        ne_mem_region->userspace_addr = mem_region.userspace_addr;
 960
 961        list_add(&ne_mem_region->mem_region_list_entry, &ne_enclave->mem_regions_list);
 962
 963        for (i = 0; i < nr_phys_contig_mem_regions; i++) {
 964                struct ne_pci_dev_cmd_reply cmd_reply = {};
 965                struct slot_add_mem_req slot_add_mem_req = {};
 966
 967                slot_add_mem_req.slot_uid = ne_enclave->slot_uid;
 968                slot_add_mem_req.paddr = page_to_phys(phys_contig_mem_regions[i]);
 969                slot_add_mem_req.size = page_size(phys_contig_mem_regions[i]);
 970
 971                rc = ne_do_request(pdev, SLOT_ADD_MEM,
 972                                   &slot_add_mem_req, sizeof(slot_add_mem_req),
 973                                   &cmd_reply, sizeof(cmd_reply));
 974                if (rc < 0) {
 975                        dev_err_ratelimited(ne_misc_dev.this_device,
 976                                            "Error in slot add mem [rc=%d]\n", rc);
 977
 978                        kfree(phys_contig_mem_regions);
 979
 980                        /*
 981                         * Exit here without putting the pages, as memory
 982                         * regions may have already been added.
 983                         */
 984                        return rc;
 985                }
 986
 987                ne_enclave->mem_size += slot_add_mem_req.size;
 988                ne_enclave->nr_mem_regions++;
 989        }
 990
 991        kfree(phys_contig_mem_regions);
 992
 993        return 0;
 994
 995put_pages:
 996        for (i = 0; i < ne_mem_region->nr_pages; i++)
 997                put_page(ne_mem_region->pages[i]);
 998free_mem_region:
 999        kfree(phys_contig_mem_regions);
1000        kfree(ne_mem_region->pages);
1001        kfree(ne_mem_region);
1002
1003        return rc;
1004}
1005
1006/**
1007 * ne_start_enclave_ioctl() - Trigger enclave start after the enclave resources,
1008 *                            such as memory and CPU, have been set.
1009 * @ne_enclave :                Private data associated with the current enclave.
1010 * @enclave_start_info :        Enclave info that includes enclave cid and flags.
1011 *
1012 * Context: Process context. This function is called with the ne_enclave mutex held.
1013 * Return:
1014 * * 0 on success.
1015 * * Negative return value on failure.
1016 */
1017static int ne_start_enclave_ioctl(struct ne_enclave *ne_enclave,
1018                                  struct ne_enclave_start_info *enclave_start_info)
1019{
1020        struct ne_pci_dev_cmd_reply cmd_reply = {};
1021        unsigned int cpu = 0;
1022        struct enclave_start_req enclave_start_req = {};
1023        unsigned int i = 0;
1024        struct pci_dev *pdev = ne_devs.ne_pci_dev->pdev;
1025        int rc = -EINVAL;
1026
1027        if (!ne_enclave->nr_mem_regions) {
1028                dev_err_ratelimited(ne_misc_dev.this_device,
1029                                    "Enclave has no mem regions\n");
1030
1031                return -NE_ERR_NO_MEM_REGIONS_ADDED;
1032        }
1033
1034        if (ne_enclave->mem_size < NE_MIN_ENCLAVE_MEM_SIZE) {
1035                dev_err_ratelimited(ne_misc_dev.this_device,
1036                                    "Enclave memory is less than %ld\n",
1037                                    NE_MIN_ENCLAVE_MEM_SIZE);
1038
1039                return -NE_ERR_ENCLAVE_MEM_MIN_SIZE;
1040        }
1041
1042        if (!ne_enclave->nr_vcpus) {
1043                dev_err_ratelimited(ne_misc_dev.this_device,
1044                                    "Enclave has no vCPUs\n");
1045
1046                return -NE_ERR_NO_VCPUS_ADDED;
1047        }
1048
1049        for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++)
1050                for_each_cpu(cpu, ne_enclave->threads_per_core[i])
1051                        if (!cpumask_test_cpu(cpu, ne_enclave->vcpu_ids)) {
1052                                dev_err_ratelimited(ne_misc_dev.this_device,
1053                                                    "Full CPU cores not used\n");
1054
1055                                return -NE_ERR_FULL_CORES_NOT_USED;
1056                        }
1057
1058        enclave_start_req.enclave_cid = enclave_start_info->enclave_cid;
1059        enclave_start_req.flags = enclave_start_info->flags;
1060        enclave_start_req.slot_uid = ne_enclave->slot_uid;
1061
1062        rc = ne_do_request(pdev, ENCLAVE_START,
1063                           &enclave_start_req, sizeof(enclave_start_req),
1064                           &cmd_reply, sizeof(cmd_reply));
1065        if (rc < 0) {
1066                dev_err_ratelimited(ne_misc_dev.this_device,
1067                                    "Error in enclave start [rc=%d]\n", rc);
1068
1069                return rc;
1070        }
1071
1072        ne_enclave->state = NE_STATE_RUNNING;
1073
1074        enclave_start_info->enclave_cid = cmd_reply.enclave_cid;
1075
1076        return 0;
1077}
1078
1079/**
1080 * ne_enclave_ioctl() - Ioctl function provided by the enclave file.
1081 * @file:       File associated with this ioctl function.
1082 * @cmd:        The command that is set for the ioctl call.
1083 * @arg:        The argument that is provided for the ioctl call.
1084 *
1085 * Context: Process context.
1086 * Return:
1087 * * 0 on success.
1088 * * Negative return value on failure.
1089 */
1090static long ne_enclave_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1091{
1092        struct ne_enclave *ne_enclave = file->private_data;
1093
1094        switch (cmd) {
1095        case NE_ADD_VCPU: {
1096                int rc = -EINVAL;
1097                u32 vcpu_id = 0;
1098
1099                if (copy_from_user(&vcpu_id, (void __user *)arg, sizeof(vcpu_id)))
1100                        return -EFAULT;
1101
1102                mutex_lock(&ne_enclave->enclave_info_mutex);
1103
1104                if (ne_enclave->state != NE_STATE_INIT) {
1105                        dev_err_ratelimited(ne_misc_dev.this_device,
1106                                            "Enclave is not in init state\n");
1107
1108                        mutex_unlock(&ne_enclave->enclave_info_mutex);
1109
1110                        return -NE_ERR_NOT_IN_INIT_STATE;
1111                }
1112
1113                if (vcpu_id >= (ne_enclave->nr_parent_vm_cores *
1114                    ne_enclave->nr_threads_per_core)) {
1115                        dev_err_ratelimited(ne_misc_dev.this_device,
1116                                            "vCPU id higher than max CPU id\n");
1117
1118                        mutex_unlock(&ne_enclave->enclave_info_mutex);
1119
1120                        return -NE_ERR_INVALID_VCPU;
1121                }
1122
1123                if (!vcpu_id) {
1124                        /* Use the CPU pool for choosing a CPU for the enclave. */
1125                        rc = ne_get_cpu_from_cpu_pool(ne_enclave, &vcpu_id);
1126                        if (rc < 0) {
1127                                dev_err_ratelimited(ne_misc_dev.this_device,
1128                                                    "Error in get CPU from pool [rc=%d]\n",
1129                                                    rc);
1130
1131                                mutex_unlock(&ne_enclave->enclave_info_mutex);
1132
1133                                return rc;
1134                        }
1135                } else {
1136                        /* Check if the provided vCPU is available in the NE CPU pool. */
1137                        rc = ne_check_cpu_in_cpu_pool(ne_enclave, vcpu_id);
1138                        if (rc < 0) {
1139                                dev_err_ratelimited(ne_misc_dev.this_device,
1140                                                    "Error in check CPU %d in pool [rc=%d]\n",
1141                                                    vcpu_id, rc);
1142
1143                                mutex_unlock(&ne_enclave->enclave_info_mutex);
1144
1145                                return rc;
1146                        }
1147                }
1148
1149                rc = ne_add_vcpu_ioctl(ne_enclave, vcpu_id);
1150                if (rc < 0) {
1151                        mutex_unlock(&ne_enclave->enclave_info_mutex);
1152
1153                        return rc;
1154                }
1155
1156                mutex_unlock(&ne_enclave->enclave_info_mutex);
1157
1158                if (copy_to_user((void __user *)arg, &vcpu_id, sizeof(vcpu_id)))
1159                        return -EFAULT;
1160
1161                return 0;
1162        }
1163
1164        case NE_GET_IMAGE_LOAD_INFO: {
1165                struct ne_image_load_info image_load_info = {};
1166
1167                if (copy_from_user(&image_load_info, (void __user *)arg, sizeof(image_load_info)))
1168                        return -EFAULT;
1169
1170                mutex_lock(&ne_enclave->enclave_info_mutex);
1171
1172                if (ne_enclave->state != NE_STATE_INIT) {
1173                        dev_err_ratelimited(ne_misc_dev.this_device,
1174                                            "Enclave is not in init state\n");
1175
1176                        mutex_unlock(&ne_enclave->enclave_info_mutex);
1177
1178                        return -NE_ERR_NOT_IN_INIT_STATE;
1179                }
1180
1181                mutex_unlock(&ne_enclave->enclave_info_mutex);
1182
1183                if (!image_load_info.flags ||
1184                    image_load_info.flags >= NE_IMAGE_LOAD_MAX_FLAG_VAL) {
1185                        dev_err_ratelimited(ne_misc_dev.this_device,
1186                                            "Incorrect flag in enclave image load info\n");
1187
1188                        return -NE_ERR_INVALID_FLAG_VALUE;
1189                }
1190
1191                if (image_load_info.flags == NE_EIF_IMAGE)
1192                        image_load_info.memory_offset = NE_EIF_LOAD_OFFSET;
1193
1194                if (copy_to_user((void __user *)arg, &image_load_info, sizeof(image_load_info)))
1195                        return -EFAULT;
1196
1197                return 0;
1198        }
1199
1200        case NE_SET_USER_MEMORY_REGION: {
1201                struct ne_user_memory_region mem_region = {};
1202                int rc = -EINVAL;
1203
1204                if (copy_from_user(&mem_region, (void __user *)arg, sizeof(mem_region)))
1205                        return -EFAULT;
1206
1207                if (mem_region.flags >= NE_MEMORY_REGION_MAX_FLAG_VAL) {
1208                        dev_err_ratelimited(ne_misc_dev.this_device,
1209                                            "Incorrect flag for user memory region\n");
1210
1211                        return -NE_ERR_INVALID_FLAG_VALUE;
1212                }
1213
1214                mutex_lock(&ne_enclave->enclave_info_mutex);
1215
1216                if (ne_enclave->state != NE_STATE_INIT) {
1217                        dev_err_ratelimited(ne_misc_dev.this_device,
1218                                            "Enclave is not in init state\n");
1219
1220                        mutex_unlock(&ne_enclave->enclave_info_mutex);
1221
1222                        return -NE_ERR_NOT_IN_INIT_STATE;
1223                }
1224
1225                rc = ne_set_user_memory_region_ioctl(ne_enclave, mem_region);
1226                if (rc < 0) {
1227                        mutex_unlock(&ne_enclave->enclave_info_mutex);
1228
1229                        return rc;
1230                }
1231
1232                mutex_unlock(&ne_enclave->enclave_info_mutex);
1233
1234                return 0;
1235        }
1236
1237        case NE_START_ENCLAVE: {
1238                struct ne_enclave_start_info enclave_start_info = {};
1239                int rc = -EINVAL;
1240
1241                if (copy_from_user(&enclave_start_info, (void __user *)arg,
1242                                   sizeof(enclave_start_info)))
1243                        return -EFAULT;
1244
1245                if (enclave_start_info.flags >= NE_ENCLAVE_START_MAX_FLAG_VAL) {
1246                        dev_err_ratelimited(ne_misc_dev.this_device,
1247                                            "Incorrect flag in enclave start info\n");
1248
1249                        return -NE_ERR_INVALID_FLAG_VALUE;
1250                }
1251
1252                /*
1253                 * Do not use well-known CIDs - 0, 1, 2 - for enclaves.
1254                 * VMADDR_CID_ANY = -1U
1255                 * VMADDR_CID_HYPERVISOR = 0
1256                 * VMADDR_CID_LOCAL = 1
1257                 * VMADDR_CID_HOST = 2
1258                 * Note: 0 is used as a placeholder to auto-generate an enclave CID.
1259                 * http://man7.org/linux/man-pages/man7/vsock.7.html
1260                 */
1261                if (enclave_start_info.enclave_cid > 0 &&
1262                    enclave_start_info.enclave_cid <= VMADDR_CID_HOST) {
1263                        dev_err_ratelimited(ne_misc_dev.this_device,
1264                                            "Well-known CID value, not to be used for enclaves\n");
1265
1266                        return -NE_ERR_INVALID_ENCLAVE_CID;
1267                }
1268
1269                if (enclave_start_info.enclave_cid == U32_MAX) {
1270                        dev_err_ratelimited(ne_misc_dev.this_device,
1271                                            "Well-known CID value, not to be used for enclaves\n");
1272
1273                        return -NE_ERR_INVALID_ENCLAVE_CID;
1274                }
1275
1276                /*
1277                 * Do not use the CID of the primary / parent VM for enclaves.
1278                 */
1279                if (enclave_start_info.enclave_cid == NE_PARENT_VM_CID) {
1280                        dev_err_ratelimited(ne_misc_dev.this_device,
1281                                            "CID of the parent VM, not to be used for enclaves\n");
1282
1283                        return -NE_ERR_INVALID_ENCLAVE_CID;
1284                }
1285
1286                /* 64-bit CIDs are not yet supported for the vsock device. */
1287                if (enclave_start_info.enclave_cid > U32_MAX) {
1288                        dev_err_ratelimited(ne_misc_dev.this_device,
1289                                            "64-bit CIDs not yet supported for the vsock device\n");
1290
1291                        return -NE_ERR_INVALID_ENCLAVE_CID;
1292                }
1293
1294                mutex_lock(&ne_enclave->enclave_info_mutex);
1295
1296                if (ne_enclave->state != NE_STATE_INIT) {
1297                        dev_err_ratelimited(ne_misc_dev.this_device,
1298                                            "Enclave is not in init state\n");
1299
1300                        mutex_unlock(&ne_enclave->enclave_info_mutex);
1301
1302                        return -NE_ERR_NOT_IN_INIT_STATE;
1303                }
1304
1305                rc = ne_start_enclave_ioctl(ne_enclave, &enclave_start_info);
1306                if (rc < 0) {
1307                        mutex_unlock(&ne_enclave->enclave_info_mutex);
1308
1309                        return rc;
1310                }
1311
1312                mutex_unlock(&ne_enclave->enclave_info_mutex);
1313
1314                if (copy_to_user((void __user *)arg, &enclave_start_info,
1315                                 sizeof(enclave_start_info)))
1316                        return -EFAULT;
1317
1318                return 0;
1319        }
1320
1321        default:
1322                return -ENOTTY;
1323        }
1324
1325        return 0;
1326}
1327
1328/**
1329 * ne_enclave_remove_all_mem_region_entries() - Remove all memory region entries
1330 *                                              from the enclave data structure.
1331 * @ne_enclave :        Private data associated with the current enclave.
1332 *
1333 * Context: Process context. This function is called with the ne_enclave mutex held.
1334 */
1335static void ne_enclave_remove_all_mem_region_entries(struct ne_enclave *ne_enclave)
1336{
1337        unsigned long i = 0;
1338        struct ne_mem_region *ne_mem_region = NULL;
1339        struct ne_mem_region *ne_mem_region_tmp = NULL;
1340
1341        list_for_each_entry_safe(ne_mem_region, ne_mem_region_tmp,
1342                                 &ne_enclave->mem_regions_list,
1343                                 mem_region_list_entry) {
1344                list_del(&ne_mem_region->mem_region_list_entry);
1345
1346                for (i = 0; i < ne_mem_region->nr_pages; i++)
1347                        put_page(ne_mem_region->pages[i]);
1348
1349                kfree(ne_mem_region->pages);
1350
1351                kfree(ne_mem_region);
1352        }
1353}
1354
1355/**
1356 * ne_enclave_remove_all_vcpu_id_entries() - Remove all vCPU id entries from
1357 *                                           the enclave data structure.
1358 * @ne_enclave :        Private data associated with the current enclave.
1359 *
1360 * Context: Process context. This function is called with the ne_enclave mutex held.
1361 */
1362static void ne_enclave_remove_all_vcpu_id_entries(struct ne_enclave *ne_enclave)
1363{
1364        unsigned int cpu = 0;
1365        unsigned int i = 0;
1366
1367        mutex_lock(&ne_cpu_pool.mutex);
1368
1369        for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++) {
1370                for_each_cpu(cpu, ne_enclave->threads_per_core[i])
1371                        /* Update the available NE CPU pool. */
1372                        cpumask_set_cpu(cpu, ne_cpu_pool.avail_threads_per_core[i]);
1373
1374                free_cpumask_var(ne_enclave->threads_per_core[i]);
1375        }
1376
1377        mutex_unlock(&ne_cpu_pool.mutex);
1378
1379        kfree(ne_enclave->threads_per_core);
1380
1381        free_cpumask_var(ne_enclave->vcpu_ids);
1382}
1383
1384/**
1385 * ne_pci_dev_remove_enclave_entry() - Remove the enclave entry from the data
1386 *                                     structure that is part of the NE PCI
1387 *                                     device private data.
1388 * @ne_enclave :        Private data associated with the current enclave.
1389 * @ne_pci_dev :        Private data associated with the PCI device.
1390 *
1391 * Context: Process context. This function is called with the ne_pci_dev enclave
1392 *          mutex held.
1393 */
1394static void ne_pci_dev_remove_enclave_entry(struct ne_enclave *ne_enclave,
1395                                            struct ne_pci_dev *ne_pci_dev)
1396{
1397        struct ne_enclave *ne_enclave_entry = NULL;
1398        struct ne_enclave *ne_enclave_entry_tmp = NULL;
1399
1400        list_for_each_entry_safe(ne_enclave_entry, ne_enclave_entry_tmp,
1401                                 &ne_pci_dev->enclaves_list, enclave_list_entry) {
1402                if (ne_enclave_entry->slot_uid == ne_enclave->slot_uid) {
1403                        list_del(&ne_enclave_entry->enclave_list_entry);
1404
1405                        break;
1406                }
1407        }
1408}
1409
1410/**
1411 * ne_enclave_release() - Release function provided by the enclave file.
1412 * @inode:      Inode associated with this file release function.
1413 * @file:       File associated with this release function.
1414 *
1415 * Context: Process context.
1416 * Return:
1417 * * 0 on success.
1418 * * Negative return value on failure.
1419 */
1420static int ne_enclave_release(struct inode *inode, struct file *file)
1421{
1422        struct ne_pci_dev_cmd_reply cmd_reply = {};
1423        struct enclave_stop_req enclave_stop_request = {};
1424        struct ne_enclave *ne_enclave = file->private_data;
1425        struct ne_pci_dev *ne_pci_dev = ne_devs.ne_pci_dev;
1426        struct pci_dev *pdev = ne_pci_dev->pdev;
1427        int rc = -EINVAL;
1428        struct slot_free_req slot_free_req = {};
1429
1430        if (!ne_enclave)
1431                return 0;
1432
1433        /*
1434         * Early exit in case there is an error in the enclave creation logic
1435         * and fput() is called on the cleanup path.
1436         */
1437        if (!ne_enclave->slot_uid)
1438                return 0;
1439
1440        /*
1441         * Acquire the enclave list mutex before the enclave mutex
1442         * in order to avoid deadlocks with @ref ne_event_work_handler.
1443         */
1444        mutex_lock(&ne_pci_dev->enclaves_list_mutex);
1445        mutex_lock(&ne_enclave->enclave_info_mutex);
1446
1447        if (ne_enclave->state != NE_STATE_INIT && ne_enclave->state != NE_STATE_STOPPED) {
1448                enclave_stop_request.slot_uid = ne_enclave->slot_uid;
1449
1450                rc = ne_do_request(pdev, ENCLAVE_STOP,
1451                                   &enclave_stop_request, sizeof(enclave_stop_request),
1452                                   &cmd_reply, sizeof(cmd_reply));
1453                if (rc < 0) {
1454                        dev_err_ratelimited(ne_misc_dev.this_device,
1455                                            "Error in enclave stop [rc=%d]\n", rc);
1456
1457                        goto unlock_mutex;
1458                }
1459
1460                memset(&cmd_reply, 0, sizeof(cmd_reply));
1461        }
1462
1463        slot_free_req.slot_uid = ne_enclave->slot_uid;
1464
1465        rc = ne_do_request(pdev, SLOT_FREE,
1466                           &slot_free_req, sizeof(slot_free_req),
1467                           &cmd_reply, sizeof(cmd_reply));
1468        if (rc < 0) {
1469                dev_err_ratelimited(ne_misc_dev.this_device,
1470                                    "Error in slot free [rc=%d]\n", rc);
1471
1472                goto unlock_mutex;
1473        }
1474
1475        ne_pci_dev_remove_enclave_entry(ne_enclave, ne_pci_dev);
1476        ne_enclave_remove_all_mem_region_entries(ne_enclave);
1477        ne_enclave_remove_all_vcpu_id_entries(ne_enclave);
1478
1479        mutex_unlock(&ne_enclave->enclave_info_mutex);
1480        mutex_unlock(&ne_pci_dev->enclaves_list_mutex);
1481
1482        kfree(ne_enclave);
1483
1484        return 0;
1485
1486unlock_mutex:
1487        mutex_unlock(&ne_enclave->enclave_info_mutex);
1488        mutex_unlock(&ne_pci_dev->enclaves_list_mutex);
1489
1490        return rc;
1491}
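
/*
 * Example: a minimal user space sketch of how this release path is reached,
 * with illustrative variable names. Closing the enclave fd (or exiting the
 * process that owns it) drops the last reference to the enclave file, which
 * stops the enclave if it is still running and frees its slot.
 *
 *        #include <unistd.h>
 *
 *        // enclave_fd is assumed to be the fd previously returned by the
 *        // NE_CREATE_VM ioctl on the misc device.
 *        close(enclave_fd);
 */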
1492
1493/**
1494 * ne_enclave_poll() - Poll functionality used for enclave out-of-band events.
1495 * @file:       File associated with this poll function.
1496 * @wait:       Poll table data structure.
1497 *
1498 * Context: Process context.
1499 * Return:
1500 * * Poll mask.
1501 */
1502static __poll_t ne_enclave_poll(struct file *file, poll_table *wait)
1503{
1504        __poll_t mask = 0;
1505        struct ne_enclave *ne_enclave = file->private_data;
1506
1507        poll_wait(file, &ne_enclave->eventq, wait);
1508
1509        if (ne_enclave->has_event)
1510                mask |= EPOLLHUP;
1511
1512        return mask;
1513}
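
/*
 * Example: a minimal user space sketch of waiting for the out-of-band
 * enclave event reported above, with illustrative names. EPOLLHUP reaches
 * poll(2) callers as POLLHUP in the revents field, and poll(2) reports it
 * regardless of the requested events. enclave_fd is assumed to be the fd
 * returned by the NE_CREATE_VM ioctl.
 *
 *        #include <poll.h>
 *
 *        struct pollfd fds = {
 *                .fd     = enclave_fd,
 *                .events = 0,    // POLLHUP is reported even when not requested
 *        };
 *
 *        if (poll(&fds, 1, -1) > 0 && (fds.revents & POLLHUP)) {
 *                // An out-of-band enclave event occurred, e.g. the
 *                // enclave has exited.
 *        }
 */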
1514
1515static const struct file_operations ne_enclave_fops = {
1516        .owner          = THIS_MODULE,
1517        .llseek         = noop_llseek,
1518        .poll           = ne_enclave_poll,
1519        .unlocked_ioctl = ne_enclave_ioctl,
1520        .release        = ne_enclave_release,
1521};
1522
1523/**
1524 * ne_create_vm_ioctl() - Allocate a slot to be associated with an enclave.
1525 *                        Create an enclave file descriptor to be further used
1526 *                        for enclave resource handling e.g. memory regions and CPUs.
1527 * @ne_pci_dev:          Private data associated with the PCI device.
1528 * @slot_uid:            User space pointer where the generated unique slot id
1529 *                       associated with the enclave is stored.
1530 *
1531 * Context: Process context. This function is called with the ne_pci_dev enclave
1532 *          mutex held.
1533 * Return:
1534 * * Enclave fd on success.
1535 * * Negative return value on failure.
1536 */
1537static int ne_create_vm_ioctl(struct ne_pci_dev *ne_pci_dev, u64 __user *slot_uid)
1538{
1539        struct ne_pci_dev_cmd_reply cmd_reply = {};
1540        int enclave_fd = -1;
1541        struct file *enclave_file = NULL;
1542        unsigned int i = 0;
1543        struct ne_enclave *ne_enclave = NULL;
1544        struct pci_dev *pdev = ne_pci_dev->pdev;
1545        int rc = -EINVAL;
1546        struct slot_alloc_req slot_alloc_req = {};
1547
1548        mutex_lock(&ne_cpu_pool.mutex);
1549
1550        for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++)
1551                if (!cpumask_empty(ne_cpu_pool.avail_threads_per_core[i]))
1552                        break;
1553
1554        if (i == ne_cpu_pool.nr_parent_vm_cores) {
1555                dev_err_ratelimited(ne_misc_dev.this_device,
1556                                    "No CPUs available in CPU pool\n");
1557
1558                mutex_unlock(&ne_cpu_pool.mutex);
1559
1560                return -NE_ERR_NO_CPUS_AVAIL_IN_POOL;
1561        }
1562
1563        mutex_unlock(&ne_cpu_pool.mutex);
1564
1565        ne_enclave = kzalloc(sizeof(*ne_enclave), GFP_KERNEL);
1566        if (!ne_enclave)
1567                return -ENOMEM;
1568
1569        mutex_lock(&ne_cpu_pool.mutex);
1570
1571        ne_enclave->nr_parent_vm_cores = ne_cpu_pool.nr_parent_vm_cores;
1572        ne_enclave->nr_threads_per_core = ne_cpu_pool.nr_threads_per_core;
1573        ne_enclave->numa_node = ne_cpu_pool.numa_node;
1574
1575        mutex_unlock(&ne_cpu_pool.mutex);
1576
1577        ne_enclave->threads_per_core = kcalloc(ne_enclave->nr_parent_vm_cores,
1578                                               sizeof(*ne_enclave->threads_per_core),
1579                                               GFP_KERNEL);
1580        if (!ne_enclave->threads_per_core) {
1581                rc = -ENOMEM;
1582
1583                goto free_ne_enclave;
1584        }
1585
1586        for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++)
1587                if (!zalloc_cpumask_var(&ne_enclave->threads_per_core[i], GFP_KERNEL)) {
1588                        rc = -ENOMEM;
1589
1590                        goto free_cpumask;
1591                }
1592
1593        if (!zalloc_cpumask_var(&ne_enclave->vcpu_ids, GFP_KERNEL)) {
1594                rc = -ENOMEM;
1595
1596                goto free_cpumask;
1597        }
1598
1599        enclave_fd = get_unused_fd_flags(O_CLOEXEC);
1600        if (enclave_fd < 0) {
1601                rc = enclave_fd;
1602
1603                dev_err_ratelimited(ne_misc_dev.this_device,
1604                                    "Error in getting unused fd [rc=%d]\n", rc);
1605
1606                goto free_cpumask;
1607        }
1608
1609        enclave_file = anon_inode_getfile("ne-vm", &ne_enclave_fops, ne_enclave, O_RDWR);
1610        if (IS_ERR(enclave_file)) {
1611                rc = PTR_ERR(enclave_file);
1612
1613                dev_err_ratelimited(ne_misc_dev.this_device,
1614                                    "Error in anon inode get file [rc=%d]\n", rc);
1615
1616                goto put_fd;
1617        }
1618
1619        rc = ne_do_request(pdev, SLOT_ALLOC,
1620                           &slot_alloc_req, sizeof(slot_alloc_req),
1621                           &cmd_reply, sizeof(cmd_reply));
1622        if (rc < 0) {
1623                dev_err_ratelimited(ne_misc_dev.this_device,
1624                                    "Error in slot alloc [rc=%d]\n", rc);
1625
1626                goto put_file;
1627        }
1628
1629        init_waitqueue_head(&ne_enclave->eventq);
1630        ne_enclave->has_event = false;
1631        mutex_init(&ne_enclave->enclave_info_mutex);
1632        ne_enclave->max_mem_regions = cmd_reply.mem_regions;
1633        INIT_LIST_HEAD(&ne_enclave->mem_regions_list);
1634        ne_enclave->mm = current->mm;
1635        ne_enclave->slot_uid = cmd_reply.slot_uid;
1636        ne_enclave->state = NE_STATE_INIT;
1637
1638        list_add(&ne_enclave->enclave_list_entry, &ne_pci_dev->enclaves_list);
1639
1640        if (copy_to_user(slot_uid, &ne_enclave->slot_uid, sizeof(ne_enclave->slot_uid))) {
1641                /*
1642                 * As we're holding the only reference to 'enclave_file', fput()
1643                 * will call ne_enclave_release() which will do a proper cleanup
1644                 * of all so far allocated resources, leaving only the unused fd
1645                 * for us to free.
1646                 */
1647                fput(enclave_file);
1648                put_unused_fd(enclave_fd);
1649
1650                return -EFAULT;
1651        }
1652
1653        fd_install(enclave_fd, enclave_file);
1654
1655        return enclave_fd;
1656
1657put_file:
1658        fput(enclave_file);
1659put_fd:
1660        put_unused_fd(enclave_fd);
1661free_cpumask:
1662        free_cpumask_var(ne_enclave->vcpu_ids);
1663        for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++)
1664                free_cpumask_var(ne_enclave->threads_per_core[i]);
1665        kfree(ne_enclave->threads_per_core);
1666free_ne_enclave:
1667        kfree(ne_enclave);
1668
1669        return rc;
1670}
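
/*
 * Example: a minimal sketch of the anonymous inode fd pattern used above,
 * with illustrative names. The fd number is only published to the process's
 * fd table by fd_install() at the very end; until then, an error path
 * releases the file with fput() (which runs the file's release callback and
 * the cleanup tied to it) and hands the still-unused fd number back with
 * put_unused_fd().
 *
 *        struct file *file;
 *        int fd;
 *
 *        fd = get_unused_fd_flags(O_CLOEXEC);
 *        if (fd < 0)
 *                return fd;
 *
 *        file = anon_inode_getfile("ne-vm", &ne_enclave_fops, priv, O_RDWR);
 *        if (IS_ERR(file)) {
 *                put_unused_fd(fd);
 *                return PTR_ERR(file);
 *        }
 *
 *        // ... further setup; on failure: fput(file); put_unused_fd(fd);
 *
 *        fd_install(fd, file);
 *        return fd;
 */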
1671
1672/**
1673 * ne_ioctl() - Ioctl function provided by the NE misc device.
1674 * @file:       File associated with this ioctl function.
1675 * @cmd:        The command that is set for the ioctl call.
1676 * @arg:        The argument that is provided for the ioctl call.
1677 *
1678 * Context: Process context.
1679 * Return:
1680 * * Ioctl result (e.g. enclave file descriptor) on success.
1681 * * Negative return value on failure.
1682 */
1683static long ne_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1684{
1685        switch (cmd) {
1686        case NE_CREATE_VM: {
1687                int enclave_fd = -1;
1688                struct ne_pci_dev *ne_pci_dev = ne_devs.ne_pci_dev;
1689                u64 __user *slot_uid = (void __user *)arg;
1690
1691                mutex_lock(&ne_pci_dev->enclaves_list_mutex);
1692                enclave_fd = ne_create_vm_ioctl(ne_pci_dev, slot_uid);
1693                mutex_unlock(&ne_pci_dev->enclaves_list_mutex);
1694
1695                return enclave_fd;
1696        }
1697
1698        default:
1699                return -ENOTTY;
1700        }
1701
1702        return 0;
1703}
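
/*
 * Example: a minimal user space sketch of the NE_CREATE_VM call handled
 * above, with error handling trimmed. The misc device is typically exposed
 * as /dev/nitro_enclaves; on success the ioctl stores the slot unique id in
 * the provided u64 and returns the enclave fd used for the enclave resource
 * ioctls (vCPUs, memory regions) and for polling enclave events.
 *
 *        #include <fcntl.h>
 *        #include <sys/ioctl.h>
 *        #include <linux/nitro_enclaves.h>
 *
 *        int ne_dev_fd = open("/dev/nitro_enclaves", O_RDWR | O_CLOEXEC);
 *        __u64 slot_uid = 0;
 *        int enclave_fd = ioctl(ne_dev_fd, NE_CREATE_VM, &slot_uid);
 */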
1704
1705static int __init ne_init(void)
1706{
1707        mutex_init(&ne_cpu_pool.mutex);
1708
1709        return pci_register_driver(&ne_pci_driver);
1710}
1711
1712static void __exit ne_exit(void)
1713{
1714        pci_unregister_driver(&ne_pci_driver);
1715
1716        ne_teardown_cpu_pool();
1717}
1718
1719module_init(ne_init);
1720module_exit(ne_exit);
1721
1722MODULE_AUTHOR("Amazon.com, Inc. or its affiliates");
1723MODULE_DESCRIPTION("Nitro Enclaves Driver");
1724MODULE_LICENSE("GPL v2");
1725