LXR linux/arch/x86/kernel/cpu/sgx/virt.c

   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Device driver to expose SGX enclave memory to KVM guests.
   4 *
   5 * Copyright(c) 2021 Intel Corporation.
   6 */
   7
   8#include <linux/miscdevice.h>
   9#include <linux/mm.h>
  10#include <linux/mman.h>
  11#include <linux/sched/mm.h>
  12#include <linux/sched/signal.h>
  13#include <linux/slab.h>
  14#include <linux/xarray.h>
  15#include <asm/sgx.h>
  16#include <uapi/asm/sgx.h>
  17
  18#include "encls.h"
  19#include "sgx.h"
  20
  21struct sgx_vepc {
  22        struct xarray page_array;
  23        struct mutex lock;
  24};
  25
  26/*
  27 * Temporary SECS pages that cannot be EREMOVE'd due to having child in other
  28 * virtual EPC instances, and the lock to protect it.
  29 */
  30static struct mutex zombie_secs_pages_lock;
  31static struct list_head zombie_secs_pages;
  32
  33static int __sgx_vepc_fault(struct sgx_vepc *vepc,
  34                            struct vm_area_struct *vma, unsigned long addr)
  35{
  36        struct sgx_epc_page *epc_page;
  37        unsigned long index, pfn;
  38        int ret;
  39
  40        WARN_ON(!mutex_is_locked(&vepc->lock));
  41
  42        /* Calculate index of EPC page in virtual EPC's page_array */
  43        index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start);
  44
  45        epc_page = xa_load(&vepc->page_array, index);
  46        if (epc_page)
  47                return 0;
  48
  49        epc_page = sgx_alloc_epc_page(vepc, false);
  50        if (IS_ERR(epc_page))
  51                return PTR_ERR(epc_page);
  52
  53        ret = xa_err(xa_store(&vepc->page_array, index, epc_page, GFP_KERNEL));
  54        if (ret)
  55                goto err_free;
  56
  57        pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page));
  58
  59        ret = vmf_insert_pfn(vma, addr, pfn);
  60        if (ret != VM_FAULT_NOPAGE) {
  61                ret = -EFAULT;
  62                goto err_delete;
  63        }
  64
  65        return 0;
  66
  67err_delete:
  68        xa_erase(&vepc->page_array, index);
  69err_free:
  70        sgx_free_epc_page(epc_page);
  71        return ret;
  72}
  73
  74static vm_fault_t sgx_vepc_fault(struct vm_fault *vmf)
  75{
  76        struct vm_area_struct *vma = vmf->vma;
  77        struct sgx_vepc *vepc = vma->vm_private_data;
  78        int ret;
  79
  80        mutex_lock(&vepc->lock);
  81        ret = __sgx_vepc_fault(vepc, vma, vmf->address);
  82        mutex_unlock(&vepc->lock);
  83
  84        if (!ret)
  85                return VM_FAULT_NOPAGE;
  86
  87        if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) {
  88                mmap_read_unlock(vma->vm_mm);
  89                return VM_FAULT_RETRY;
  90        }
  91
  92        return VM_FAULT_SIGBUS;
  93}
  94
  95static const struct vm_operations_struct sgx_vepc_vm_ops = {
  96        .fault = sgx_vepc_fault,
  97};
  98
  99static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma)
 100{
 101        struct sgx_vepc *vepc = file->private_data;
 102
 103        if (!(vma->vm_flags & VM_SHARED))
 104                return -EINVAL;
 105
 106        vma->vm_ops = &sgx_vepc_vm_ops;
 107        /* Don't copy VMA in fork() */
 108        vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY;
 109        vma->vm_private_data = vepc;
 110
 111        return 0;
 112}
 113
 114static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
 115{
 116        int ret;
 117
 118        /*
 119         * Take a previously guest-owned EPC page and return it to the
 120         * general EPC page pool.
 121         *
 122         * Guests can not be trusted to have left this page in a good
 123         * state, so run EREMOVE on the page unconditionally.  In the
 124         * case that a guest properly EREMOVE'd this page, a superfluous
 125         * EREMOVE is harmless.
 126         */
 127        ret = __eremove(sgx_get_epc_virt_addr(epc_page));
 128        if (ret) {
 129                /*
 130                 * Only SGX_CHILD_PRESENT is expected, which is because of
 131                 * EREMOVE'ing an SECS still with child, in which case it can
 132                 * be handled by EREMOVE'ing the SECS again after all pages in
 133                 * virtual EPC have been EREMOVE'd. See comments in below in
 134                 * sgx_vepc_release().
 135                 *
 136                 * The user of virtual EPC (KVM) needs to guarantee there's no
 137                 * logical processor is still running in the enclave in guest,
 138                 * otherwise EREMOVE will get SGX_ENCLAVE_ACT which cannot be
 139                 * handled here.
 140                 */
 141                WARN_ONCE(ret != SGX_CHILD_PRESENT, EREMOVE_ERROR_MESSAGE,
 142                          ret, ret);
 143                return ret;
 144        }
 145
 146        sgx_free_epc_page(epc_page);
 147
 148        return 0;
 149}
 150
 151static int sgx_vepc_release(struct inode *inode, struct file *file)
 152{
 153        struct sgx_vepc *vepc = file->private_data;
 154        struct sgx_epc_page *epc_page, *tmp, *entry;
 155        unsigned long index;
 156
 157        LIST_HEAD(secs_pages);
 158
 159        xa_for_each(&vepc->page_array, index, entry) {
 160                /*
 161                 * Remove all normal, child pages.  sgx_vepc_free_page()
 162                 * will fail if EREMOVE fails, but this is OK and expected on
 163                 * SECS pages.  Those can only be EREMOVE'd *after* all their
 164                 * child pages. Retries below will clean them up.
 165                 */
 166                if (sgx_vepc_free_page(entry))
 167                        continue;
 168
 169                xa_erase(&vepc->page_array, index);
 170        }
 171
 172        /*
 173         * Retry EREMOVE'ing pages.  This will clean up any SECS pages that
 174         * only had children in this 'epc' area.
 175         */
 176        xa_for_each(&vepc->page_array, index, entry) {
 177                epc_page = entry;
 178                /*
 179                 * An EREMOVE failure here means that the SECS page still
 180                 * has children.  But, since all children in this 'sgx_vepc'
 181                 * have been removed, the SECS page must have a child on
 182                 * another instance.
 183                 */
 184                if (sgx_vepc_free_page(epc_page))
 185                        list_add_tail(&epc_page->list, &secs_pages);
 186
 187                xa_erase(&vepc->page_array, index);
 188        }
 189
 190        /*
 191         * SECS pages are "pinned" by child pages, and "unpinned" once all
 192         * children have been EREMOVE'd.  A child page in this instance
 193         * may have pinned an SECS page encountered in an earlier release(),
 194         * creating a zombie.  Since some children were EREMOVE'd above,
 195         * try to EREMOVE all zombies in the hopes that one was unpinned.
 196         */
 197        mutex_lock(&zombie_secs_pages_lock);
 198        list_for_each_entry_safe(epc_page, tmp, &zombie_secs_pages, list) {
 199                /*
 200                 * Speculatively remove the page from the list of zombies,
 201                 * if the page is successfully EREMOVE'd it will be added to
 202                 * the list of free pages.  If EREMOVE fails, throw the page
 203                 * on the local list, which will be spliced on at the end.
 204                 */
 205                list_del(&epc_page->list);
 206
 207                if (sgx_vepc_free_page(epc_page))
 208                        list_add_tail(&epc_page->list, &secs_pages);
 209        }
 210
 211        if (!list_empty(&secs_pages))
 212                list_splice_tail(&secs_pages, &zombie_secs_pages);
 213        mutex_unlock(&zombie_secs_pages_lock);
 214
 215        xa_destroy(&vepc->page_array);
 216        kfree(vepc);
 217
 218        return 0;
 219}
 220
 221static int sgx_vepc_open(struct inode *inode, struct file *file)
 222{
 223        struct sgx_vepc *vepc;
 224
 225        vepc = kzalloc(sizeof(struct sgx_vepc), GFP_KERNEL);
 226        if (!vepc)
 227                return -ENOMEM;
 228        mutex_init(&vepc->lock);
 229        xa_init(&vepc->page_array);
 230
 231        file->private_data = vepc;
 232
 233        return 0;
 234}
 235
 236static const struct file_operations sgx_vepc_fops = {
 237        .owner          = THIS_MODULE,
 238        .open           = sgx_vepc_open,
 239        .release        = sgx_vepc_release,
 240        .mmap           = sgx_vepc_mmap,
 241};
 242
 243static struct miscdevice sgx_vepc_dev = {
 244        .minor          = MISC_DYNAMIC_MINOR,
 245        .name           = "sgx_vepc",
 246        .nodename       = "sgx_vepc",
 247        .fops           = &sgx_vepc_fops,
 248};
 249
 250int __init sgx_vepc_init(void)
 251{
 252        /* SGX virtualization requires KVM to work */
 253        if (!cpu_feature_enabled(X86_FEATURE_VMX))
 254                return -ENODEV;
 255
 256        INIT_LIST_HEAD(&zombie_secs_pages);
 257        mutex_init(&zombie_secs_pages_lock);
 258
 259        return misc_register(&sgx_vepc_dev);
 260}
 261
 262/**
 263 * sgx_virt_ecreate() - Run ECREATE on behalf of guest
 264 * @pageinfo:   Pointer to PAGEINFO structure
 265 * @secs:       Userspace pointer to SECS page
 266 * @trapnr:     trap number injected to guest in case of ECREATE error
 267 *
 268 * Run ECREATE on behalf of guest after KVM traps ECREATE for the purpose
 269 * of enforcing policies of guest's enclaves, and return the trap number
 270 * which should be injected to guest in case of any ECREATE error.
 271 *
 272 * Return:
 273 * -  0:        ECREATE was successful.
 274 * - <0:        on error.
 275 */
 276int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
 277                     int *trapnr)
 278{
 279        int ret;
 280
 281        /*
 282         * @secs is an untrusted, userspace-provided address.  It comes from
 283         * KVM and is assumed to be a valid pointer which points somewhere in
 284         * userspace.  This can fault and call SGX or other fault handlers when
 285         * userspace mapping @secs doesn't exist.
 286         *
 287         * Add a WARN() to make sure @secs is already valid userspace pointer
 288         * from caller (KVM), who should already have handled invalid pointer
 289         * case (for instance, made by malicious guest).  All other checks,
 290         * such as alignment of @secs, are deferred to ENCLS itself.
 291         */
 292        if (WARN_ON_ONCE(!access_ok(secs, PAGE_SIZE)))
 293                return -EINVAL;
 294
 295        __uaccess_begin();
 296        ret = __ecreate(pageinfo, (void *)secs);
 297        __uaccess_end();
 298
 299        if (encls_faulted(ret)) {
 300                *trapnr = ENCLS_TRAPNR(ret);
 301                return -EFAULT;
 302        }
 303
 304        /* ECREATE doesn't return an error code, it faults or succeeds. */
 305        WARN_ON_ONCE(ret);
 306        return 0;
 307}
 308EXPORT_SYMBOL_GPL(sgx_virt_ecreate);
 309
 310static int __sgx_virt_einit(void __user *sigstruct, void __user *token,
 311                            void __user *secs)
 312{
 313        int ret;
 314
 315        /*
 316         * Make sure all userspace pointers from caller (KVM) are valid.
 317         * All other checks deferred to ENCLS itself.  Also see comment
 318         * for @secs in sgx_virt_ecreate().
 319         */
 320#define SGX_EINITTOKEN_SIZE     304
 321        if (WARN_ON_ONCE(!access_ok(sigstruct, sizeof(struct sgx_sigstruct)) ||
 322                         !access_ok(token, SGX_EINITTOKEN_SIZE) ||
 323                         !access_ok(secs, PAGE_SIZE)))
 324                return -EINVAL;
 325
 326        __uaccess_begin();
 327        ret = __einit((void *)sigstruct, (void *)token, (void *)secs);
 328        __uaccess_end();
 329
 330        return ret;
 331}
 332
 333/**
 334 * sgx_virt_einit() - Run EINIT on behalf of guest
 335 * @sigstruct:          Userspace pointer to SIGSTRUCT structure
 336 * @token:              Userspace pointer to EINITTOKEN structure
 337 * @secs:               Userspace pointer to SECS page
 338 * @lepubkeyhash:       Pointer to guest's *virtual* SGX_LEPUBKEYHASH MSR values
 339 * @trapnr:             trap number injected to guest in case of EINIT error
 340 *
 341 * Run EINIT on behalf of guest after KVM traps EINIT. If SGX_LC is available
 342 * in host, SGX driver may rewrite the hardware values at wish, therefore KVM
 343 * needs to update hardware values to guest's virtual MSR values in order to
 344 * ensure EINIT is executed with expected hardware values.
 345 *
 346 * Return:
 347 * -  0:        EINIT was successful.
 348 * - <0:        on error.
 349 */
 350int sgx_virt_einit(void __user *sigstruct, void __user *token,
 351                   void __user *secs, u64 *lepubkeyhash, int *trapnr)
 352{
 353        int ret;
 354
 355        if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) {
 356                ret = __sgx_virt_einit(sigstruct, token, secs);
 357        } else {
 358                preempt_disable();
 359
 360                sgx_update_lepubkeyhash(lepubkeyhash);
 361
 362                ret = __sgx_virt_einit(sigstruct, token, secs);
 363                preempt_enable();
 364        }
 365
 366        /* Propagate up the error from the WARN_ON_ONCE in __sgx_virt_einit() */
 367        if (ret == -EINVAL)
 368                return ret;
 369
 370        if (encls_faulted(ret)) {
 371                *trapnr = ENCLS_TRAPNR(ret);
 372                return -EFAULT;
 373        }
 374
 375        return ret;
 376}
 377EXPORT_SYMBOL_GPL(sgx_virt_einit);
 378