linux/arch/x86/mm/mem_encrypt.c
/*
 * AMD Memory Encryption Support
 *
 * Copyright (C) 2016 Advanced Micro Devices, Inc.
 *
 * Author: Tom Lendacky <thomas.lendacky@amd.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#define DISABLE_BRANCH_PROFILING

#include <linux/linkage.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/dma-direct.h>
#include <linux/swiotlb.h>
#include <linux/mem_encrypt.h>

#include <asm/tlbflush.h>
#include <asm/fixmap.h>
#include <asm/setup.h>
#include <asm/bootparam.h>
#include <asm/set_memory.h>
#include <asm/cacheflush.h>
#include <asm/sections.h>
#include <asm/processor-flags.h>
#include <asm/msr.h>
#include <asm/cmdline.h>

#include "mm_internal.h"

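/*
 * Strings used to parse the "mem_encrypt=" kernel command line option.
 * They are matched against the boot command line in sme_enable() below
 * and are kept in __initdata since they are only needed during early boot.
 */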
static char sme_cmdline_arg[] __initdata = "mem_encrypt";
static char sme_cmdline_on[]  __initdata = "on";
static char sme_cmdline_off[] __initdata = "off";

/*
 * Since SME-related variables are set early in the boot process, they must
 * reside in the .data section so as not to be zeroed out when the .bss
 * section is later cleared.
 */
u64 sme_me_mask __section(.data) = 0;
EXPORT_SYMBOL(sme_me_mask);
DEFINE_STATIC_KEY_FALSE(sev_enable_key);
EXPORT_SYMBOL_GPL(sev_enable_key);

static bool sev_enabled __section(.data);

/* Buffer used for early in-place encryption by BSP, no locking needed */
static char sme_early_buffer[PAGE_SIZE] __aligned(PAGE_SIZE);

/*
 * This routine does not change the underlying encryption setting of the
 * page(s) that map this memory. It assumes that eventually the memory is
 * meant to be accessed as either encrypted or decrypted but the contents
 * are currently not in the desired state.
 *
 * This routine follows the steps outlined in the AMD64 Architecture
 * Programmer's Manual Volume 2, Section 7.10.8 Encrypt-in-Place.
 */
static void __init __sme_early_enc_dec(resource_size_t paddr,
                                       unsigned long size, bool enc)
{
        void *src, *dst;
        size_t len;

        if (!sme_me_mask)
                return;

        wbinvd();

        /*
         * There are a limited number of early mapping slots, so map (at
         * most) one page at a time.
         */
        while (size) {
                len = min_t(size_t, sizeof(sme_early_buffer), size);

                /*
                 * Create mappings for the current and desired format of
                 * the memory. Use a write-protected mapping for the source.
                 */
                src = enc ? early_memremap_decrypted_wp(paddr, len) :
                            early_memremap_encrypted_wp(paddr, len);

                dst = enc ? early_memremap_encrypted(paddr, len) :
                            early_memremap_decrypted(paddr, len);

                /*
                 * If a mapping can't be obtained to perform the operation,
                 * then eventual access of that area in the desired mode
                 * will cause a crash.
                 */
                BUG_ON(!src || !dst);

                /*
                 * Use a temporary buffer, of cache-line multiple size, to
                 * avoid data corruption as documented in the APM.
                 */
                memcpy(sme_early_buffer, src, len);
                memcpy(dst, sme_early_buffer, len);

                early_memunmap(dst, len);
                early_memunmap(src, len);

                paddr += len;
                size -= len;
        }
}

void __init sme_early_encrypt(resource_size_t paddr, unsigned long size)
{
        __sme_early_enc_dec(paddr, size, true);
}

void __init sme_early_decrypt(resource_size_t paddr, unsigned long size)
{
        __sme_early_enc_dec(paddr, size, false);
}

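/*
 * Map or unmap a range of boot data through the early page tables. The
 * mapping is created with the encryption mask cleared (decrypted) and is
 * built in PMD-sized chunks via __early_make_pgtable(); passing map=false
 * simply clears the corresponding early PMD entries.
 */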
static void __init __sme_early_map_unmap_mem(void *vaddr, unsigned long size,
                                             bool map)
{
        unsigned long paddr = (unsigned long)vaddr - __PAGE_OFFSET;
        pmdval_t pmd_flags, pmd;

        /* Use early_pmd_flags but remove the encryption mask */
        pmd_flags = __sme_clr(early_pmd_flags);

        do {
                pmd = map ? (paddr & PMD_MASK) + pmd_flags : 0;
                __early_make_pgtable((unsigned long)vaddr, pmd);

                vaddr += PMD_SIZE;
                paddr += PMD_SIZE;
                size = (size <= PMD_SIZE) ? 0 : size - PMD_SIZE;
        } while (size);

        __native_flush_tlb();
}

void __init sme_unmap_bootdata(char *real_mode_data)
{
        struct boot_params *boot_data;
        unsigned long cmdline_paddr;

        if (!sme_active())
                return;

        /* Get the command line address before unmapping the real_mode_data */
        boot_data = (struct boot_params *)real_mode_data;
        cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32);

        __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), false);

        if (!cmdline_paddr)
                return;

        __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, false);
}

void __init sme_map_bootdata(char *real_mode_data)
{
        struct boot_params *boot_data;
        unsigned long cmdline_paddr;

        if (!sme_active())
                return;

        __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), true);

        /* Get the command line address after mapping the real_mode_data */
        boot_data = (struct boot_params *)real_mode_data;
        cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32);

        if (!cmdline_paddr)
                return;

        __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true);
}

void __init sme_early_init(void)
{
        unsigned int i;

        if (!sme_me_mask)
                return;

        early_pmd_flags = __sme_set(early_pmd_flags);

        __supported_pte_mask = __sme_set(__supported_pte_mask);

        /* Update the protection map with memory encryption mask */
        for (i = 0; i < ARRAY_SIZE(protection_map); i++)
                protection_map[i] = pgprot_encrypted(protection_map[i]);

        if (sev_active())
                swiotlb_force = SWIOTLB_FORCE;
}

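/*
 * DMA coherent allocation for SEV guests: try the page allocator first and,
 * if the resulting address fits the device's DMA mask, mark the memory
 * decrypted (clear the C-bit) so the device and hypervisor can access it.
 * Otherwise fall back to the swiotlb coherent allocator, whose buffer has
 * already been made decrypted.
 */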
static void *sev_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
                       gfp_t gfp, unsigned long attrs)
{
        unsigned long dma_mask;
        unsigned int order;
        struct page *page;
        void *vaddr = NULL;

        dma_mask = dma_alloc_coherent_mask(dev, gfp);
        order = get_order(size);

        /*
         * Memory will be memset to zero after marking decrypted, so don't
         * bother clearing it before.
         */
        gfp &= ~__GFP_ZERO;

        page = alloc_pages_node(dev_to_node(dev), gfp, order);
        if (page) {
                dma_addr_t addr;

                /*
                 * Since we will be clearing the encryption bit, check the
                 * mask with it already cleared.
                 */
                addr = __sme_clr(phys_to_dma(dev, page_to_phys(page)));
                if ((addr + size) > dma_mask) {
                        __free_pages(page, get_order(size));
                } else {
                        vaddr = page_address(page);
                        *dma_handle = addr;
                }
        }

        if (!vaddr)
                vaddr = swiotlb_alloc_coherent(dev, size, dma_handle, gfp);

        if (!vaddr)
                return NULL;

        /* Clear the SME encryption bit for DMA use if not swiotlb area */
        if (!is_swiotlb_buffer(dma_to_phys(dev, *dma_handle))) {
                set_memory_decrypted((unsigned long)vaddr, 1 << order);
                memset(vaddr, 0, PAGE_SIZE << order);
                *dma_handle = __sme_clr(*dma_handle);
        }

        return vaddr;
}

static void sev_free(struct device *dev, size_t size, void *vaddr,
                     dma_addr_t dma_handle, unsigned long attrs)
{
        /* Set the SME encryption bit for re-use if not swiotlb area */
        if (!is_swiotlb_buffer(dma_to_phys(dev, dma_handle)))
                set_memory_encrypted((unsigned long)vaddr,
                                     1 << get_order(size));

        swiotlb_free_coherent(dev, size, vaddr, dma_handle);
}

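/*
 * Switch a single page table entry (4K, 2M or 1G) between encrypted and
 * decrypted: flush the cached contents, encrypt/decrypt the data in place
 * and then rewrite the entry with the C-bit set or cleared.
 */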
static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc)
{
        pgprot_t old_prot, new_prot;
        unsigned long pfn, pa, size;
        pte_t new_pte;

        switch (level) {
        case PG_LEVEL_4K:
                pfn = pte_pfn(*kpte);
                old_prot = pte_pgprot(*kpte);
                break;
        case PG_LEVEL_2M:
                pfn = pmd_pfn(*(pmd_t *)kpte);
                old_prot = pmd_pgprot(*(pmd_t *)kpte);
                break;
        case PG_LEVEL_1G:
                pfn = pud_pfn(*(pud_t *)kpte);
                old_prot = pud_pgprot(*(pud_t *)kpte);
                break;
        default:
                return;
        }

        new_prot = old_prot;
        if (enc)
                pgprot_val(new_prot) |= _PAGE_ENC;
        else
                pgprot_val(new_prot) &= ~_PAGE_ENC;

        /* If prot is same then do nothing. */
        if (pgprot_val(old_prot) == pgprot_val(new_prot))
                return;

        pa = pfn << page_level_shift(level);
        size = page_level_size(level);

        /*
         * We are going to perform in-place en-/decryption and change the
         * physical page attribute from C=1 to C=0 or vice versa. Flush the
         * caches to ensure that data gets accessed with the correct C-bit.
         */
        clflush_cache_range(__va(pa), size);

        /* Encrypt/decrypt the contents in-place */
        if (enc)
                sme_early_encrypt(pa, size);
        else
                sme_early_decrypt(pa, size);

        /* Change the page encryption mask. */
        new_pte = pfn_pte(pfn, new_prot);
        set_pte_atomic(kpte, new_pte);
}

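/*
 * Early (boot-time) helper to change the encryption attribute of a virtual
 * address range. Large pages are changed in one go when the range covers
 * them entirely; otherwise the range is split down to 2M/4K mappings first
 * via kernel_physical_mapping_init().
 */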
static int __init early_set_memory_enc_dec(unsigned long vaddr,
                                           unsigned long size, bool enc)
{
        unsigned long vaddr_end, vaddr_next;
        unsigned long psize, pmask;
        int split_page_size_mask;
        int level, ret;
        pte_t *kpte;

        vaddr_next = vaddr;
        vaddr_end = vaddr + size;

        for (; vaddr < vaddr_end; vaddr = vaddr_next) {
                kpte = lookup_address(vaddr, &level);
                if (!kpte || pte_none(*kpte)) {
                        ret = 1;
                        goto out;
                }

                if (level == PG_LEVEL_4K) {
                        __set_clr_pte_enc(kpte, level, enc);
                        vaddr_next = (vaddr & PAGE_MASK) + PAGE_SIZE;
                        continue;
                }

                psize = page_level_size(level);
                pmask = page_level_mask(level);

                /*
                 * Check whether we can change the large page in one go.
                 * We request a split when the address is not aligned and
                 * the number of pages on which to set/clear the encryption
                 * bit is smaller than the number of pages in the large page.
                 */
                if (vaddr == (vaddr & pmask) &&
                    ((vaddr_end - vaddr) >= psize)) {
                        __set_clr_pte_enc(kpte, level, enc);
                        vaddr_next = (vaddr & pmask) + psize;
                        continue;
                }

                /*
                 * The virtual address is part of a larger page, so create
                 * the next-level page table mapping (4K or 2M). If it is
                 * part of a 2M page then we request a split of the large
                 * page into 4K chunks; a 1GB large page is split into 2M
                 * pages instead.
                 */
                if (level == PG_LEVEL_2M)
                        split_page_size_mask = 0;
                else
                        split_page_size_mask = 1 << PG_LEVEL_2M;

                kernel_physical_mapping_init(__pa(vaddr & pmask),
                                             __pa((vaddr_end & pmask) + psize),
                                             split_page_size_mask);
        }

        ret = 0;

out:
        __flush_tlb_all();
        return ret;
}

int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size)
{
        return early_set_memory_enc_dec(vaddr, size, false);
}

int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size)
{
        return early_set_memory_enc_dec(vaddr, size, true);
}

/*
 * SME and SEV are very similar but they are not the same, so there are
 * times that the kernel will need to distinguish between SME and SEV. The
 * sme_active() and sev_active() functions are used for this.  When a
 * distinction isn't needed, the mem_encrypt_active() function can be used.
 *
 * The trampoline code is a good example for this requirement.  Before
 * paging is activated, SME will access all memory as decrypted, but SEV
 * will access all memory as encrypted.  So, when APs are being brought
 * up under SME the trampoline area cannot be encrypted, whereas under SEV
 * the trampoline area must be encrypted.
 */
bool sme_active(void)
{
        return sme_me_mask && !sev_enabled;
}
EXPORT_SYMBOL(sme_active);

bool sev_active(void)
{
        return sme_me_mask && sev_enabled;
}
EXPORT_SYMBOL(sev_active);

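/*
 * DMA operations used when SEV is active: coherent memory comes from
 * sev_alloc()/sev_free() above (allocated decrypted), while streaming DMA
 * is bounced through the (decrypted) swiotlb buffer.
 */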
static const struct dma_map_ops sev_dma_ops = {
        .alloc                  = sev_alloc,
        .free                   = sev_free,
        .map_page               = swiotlb_map_page,
        .unmap_page             = swiotlb_unmap_page,
        .map_sg                 = swiotlb_map_sg_attrs,
        .unmap_sg               = swiotlb_unmap_sg_attrs,
        .sync_single_for_cpu    = swiotlb_sync_single_for_cpu,
        .sync_single_for_device = swiotlb_sync_single_for_device,
        .sync_sg_for_cpu        = swiotlb_sync_sg_for_cpu,
        .sync_sg_for_device     = swiotlb_sync_sg_for_device,
        .mapping_error          = swiotlb_dma_mapping_error,
};

/* Architecture __weak replacement functions */
void __init mem_encrypt_init(void)
{
        if (!sme_me_mask)
                return;

        /* Call into SWIOTLB to update the SWIOTLB DMA buffers */
        swiotlb_update_mem_attributes();

        /*
         * With SEV, DMA operations cannot use encryption. New DMA ops
         * are required in order to mark the DMA areas as decrypted or
         * to use bounce buffers.
         */
        if (sev_active())
                dma_ops = &sev_dma_ops;

        /*
         * With SEV, we need to unroll the rep string I/O instructions.
         */
        if (sev_active())
                static_branch_enable(&sev_enable_key);

        pr_info("AMD %s active\n",
                sev_active() ? "Secure Encrypted Virtualization (SEV)"
                             : "Secure Memory Encryption (SME)");
}

void swiotlb_set_mem_attributes(void *vaddr, unsigned long size)
{
        WARN(PAGE_ALIGN(size) != size,
             "size is not page-aligned (%#lx)\n", size);

        /* Make the SWIOTLB buffer area decrypted */
        set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT);
}

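/*
 * State used while building the temporary page tables for in-place kernel
 * encryption: the next free page table memory, the PGD being populated,
 * the PMD/PTE flags to apply and the physical/virtual range being mapped.
 */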
struct sme_populate_pgd_data {
        void    *pgtable_area;
        pgd_t   *pgd;

        pmdval_t pmd_flags;
        pteval_t pte_flags;
        unsigned long paddr;

        unsigned long vaddr;
        unsigned long vaddr_end;
};

static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd)
{
        unsigned long pgd_start, pgd_end, pgd_size;
        pgd_t *pgd_p;

        pgd_start = ppd->vaddr & PGDIR_MASK;
        pgd_end = ppd->vaddr_end & PGDIR_MASK;

        pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1) * sizeof(pgd_t);

        pgd_p = ppd->pgd + pgd_index(ppd->vaddr);

        memset(pgd_p, 0, pgd_size);
}

#define PGD_FLAGS               _KERNPG_TABLE_NOENC
#define P4D_FLAGS               _KERNPG_TABLE_NOENC
#define PUD_FLAGS               _KERNPG_TABLE_NOENC
#define PMD_FLAGS               _KERNPG_TABLE_NOENC

#define PMD_FLAGS_LARGE         (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL)

#define PMD_FLAGS_DEC           PMD_FLAGS_LARGE
#define PMD_FLAGS_DEC_WP        ((PMD_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \
                                 (_PAGE_PAT | _PAGE_PWT))

#define PMD_FLAGS_ENC           (PMD_FLAGS_LARGE | _PAGE_ENC)

#define PTE_FLAGS               (__PAGE_KERNEL_EXEC & ~_PAGE_GLOBAL)

#define PTE_FLAGS_DEC           PTE_FLAGS
#define PTE_FLAGS_DEC_WP        ((PTE_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \
                                 (_PAGE_PAT | _PAGE_PWT))

#define PTE_FLAGS_ENC           (PTE_FLAGS | _PAGE_ENC)

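/*
 * Walk (and, if necessary, create from ppd->pgtable_area) the PGD, P4D and
 * PUD levels for ppd->vaddr and return the PMD page to populate. Returns
 * NULL if a 1GB (PSE) mapping is already in place at the PUD level.
 */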
static pmd_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd)
{
        pgd_t *pgd_p;
        p4d_t *p4d_p;
        pud_t *pud_p;
        pmd_t *pmd_p;

        pgd_p = ppd->pgd + pgd_index(ppd->vaddr);
        if (native_pgd_val(*pgd_p)) {
                if (IS_ENABLED(CONFIG_X86_5LEVEL))
                        p4d_p = (p4d_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK);
                else
                        pud_p = (pud_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK);
        } else {
                pgd_t pgd;

                if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
                        p4d_p = ppd->pgtable_area;
                        memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D);
                        ppd->pgtable_area += sizeof(*p4d_p) * PTRS_PER_P4D;

                        pgd = native_make_pgd((pgdval_t)p4d_p + PGD_FLAGS);
                } else {
                        pud_p = ppd->pgtable_area;
                        memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
                        ppd->pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;

                        pgd = native_make_pgd((pgdval_t)pud_p + PGD_FLAGS);
                }
                native_set_pgd(pgd_p, pgd);
        }

        if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
                p4d_p += p4d_index(ppd->vaddr);
                if (native_p4d_val(*p4d_p)) {
                        pud_p = (pud_t *)(native_p4d_val(*p4d_p) & ~PTE_FLAGS_MASK);
                } else {
                        p4d_t p4d;

                        pud_p = ppd->pgtable_area;
                        memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
                        ppd->pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;

                        p4d = native_make_p4d((pudval_t)pud_p + P4D_FLAGS);
                        native_set_p4d(p4d_p, p4d);
                }
        }

        pud_p += pud_index(ppd->vaddr);
        if (native_pud_val(*pud_p)) {
                if (native_pud_val(*pud_p) & _PAGE_PSE)
                        return NULL;

                pmd_p = (pmd_t *)(native_pud_val(*pud_p) & ~PTE_FLAGS_MASK);
        } else {
                pud_t pud;

                pmd_p = ppd->pgtable_area;
                memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
                ppd->pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD;

                pud = native_make_pud((pmdval_t)pmd_p + PUD_FLAGS);
                native_set_pud(pud_p, pud);
        }

        return pmd_p;
}

static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd)
{
        pmd_t *pmd_p;

        pmd_p = sme_prepare_pgd(ppd);
        if (!pmd_p)
                return;

        pmd_p += pmd_index(ppd->vaddr);
        if (!native_pmd_val(*pmd_p) || !(native_pmd_val(*pmd_p) & _PAGE_PSE))
                native_set_pmd(pmd_p, native_make_pmd(ppd->paddr | ppd->pmd_flags));
}

static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd)
{
        pmd_t *pmd_p;
        pte_t *pte_p;

        pmd_p = sme_prepare_pgd(ppd);
        if (!pmd_p)
                return;

        pmd_p += pmd_index(ppd->vaddr);
        if (native_pmd_val(*pmd_p)) {
                if (native_pmd_val(*pmd_p) & _PAGE_PSE)
                        return;

                pte_p = (pte_t *)(native_pmd_val(*pmd_p) & ~PTE_FLAGS_MASK);
        } else {
                pmd_t pmd;

                pte_p = ppd->pgtable_area;
                memset(pte_p, 0, sizeof(*pte_p) * PTRS_PER_PTE);
                ppd->pgtable_area += sizeof(*pte_p) * PTRS_PER_PTE;

                pmd = native_make_pmd((pteval_t)pte_p + PMD_FLAGS);
                native_set_pmd(pmd_p, pmd);
        }

        pte_p += pte_index(ppd->vaddr);
        if (!native_pte_val(*pte_p))
                native_set_pte(pte_p, native_make_pte(ppd->paddr | ppd->pte_flags));
}

static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd)
{
        while (ppd->vaddr < ppd->vaddr_end) {
                sme_populate_pgd_large(ppd);

                ppd->vaddr += PMD_PAGE_SIZE;
                ppd->paddr += PMD_PAGE_SIZE;
        }
}

static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd)
{
        while (ppd->vaddr < ppd->vaddr_end) {
                sme_populate_pgd(ppd);

                ppd->vaddr += PAGE_SIZE;
                ppd->paddr += PAGE_SIZE;
        }
}

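/*
 * Map ppd->vaddr..ppd->vaddr_end with the given flags: 4K PTE entries for
 * any unaligned start/end portions and 2MB PMD entries for the remainder.
 */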
static void __init __sme_map_range(struct sme_populate_pgd_data *ppd,
                                   pmdval_t pmd_flags, pteval_t pte_flags)
{
        unsigned long vaddr_end;

        ppd->pmd_flags = pmd_flags;
        ppd->pte_flags = pte_flags;

        /* Save original end value since we modify the struct value */
        vaddr_end = ppd->vaddr_end;

        /* If start is not 2MB aligned, create PTE entries */
        ppd->vaddr_end = ALIGN(ppd->vaddr, PMD_PAGE_SIZE);
        __sme_map_range_pte(ppd);

        /* Create PMD entries */
        ppd->vaddr_end = vaddr_end & PMD_PAGE_MASK;
        __sme_map_range_pmd(ppd);

        /* If end is not 2MB aligned, create PTE entries */
        ppd->vaddr_end = vaddr_end;
        __sme_map_range_pte(ppd);
}

static void __init sme_map_range_encrypted(struct sme_populate_pgd_data *ppd)
{
        __sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC);
}

static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd)
{
        __sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC);
}

static void __init sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd)
{
        __sme_map_range(ppd, PMD_FLAGS_DEC_WP, PTE_FLAGS_DEC_WP);
}

static unsigned long __init sme_pgtable_calc(unsigned long len)
{
        unsigned long p4d_size, pud_size, pmd_size, pte_size;
        unsigned long total;

        /*
         * Perform a relatively simplistic calculation of the pagetable
         * entries that are needed. Those mappings will be covered mostly
         * by 2MB PMD entries so we can conservatively calculate the number
         * of P4D, PUD and PMD structures needed to perform the mappings.
         * For mappings that are not 2MB aligned, PTE mappings would be
         * needed for the start and end portions of the address range that
         * fall outside of the 2MB alignment.  This results in, at most,
         * two extra pages to hold PTE entries for each range that is mapped.
         * Incrementing the count for each covers the case where the
         * addresses cross entries.
         */
        if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
                p4d_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1;
                p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D;
                pud_size = (ALIGN(len, P4D_SIZE) / P4D_SIZE) + 1;
                pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
        } else {
                p4d_size = 0;
                pud_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1;
                pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
        }
        pmd_size = (ALIGN(len, PUD_SIZE) / PUD_SIZE) + 1;
        pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD;
        pte_size = 2 * sizeof(pte_t) * PTRS_PER_PTE;

        total = p4d_size + pud_size + pmd_size + pte_size;

        /*
         * Now calculate the added pagetable structures needed to populate
         * the new pagetables.
         */
        if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
                p4d_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE;
                p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D;
                pud_size = ALIGN(total, P4D_SIZE) / P4D_SIZE;
                pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
        } else {
                p4d_size = 0;
                pud_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE;
                pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
        }
        pmd_size = ALIGN(total, PUD_SIZE) / PUD_SIZE;
        pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD;

        total += p4d_size + pud_size + pmd_size;

        return total;
}

void __init __nostackprotector sme_encrypt_kernel(struct boot_params *bp)
{
        unsigned long workarea_start, workarea_end, workarea_len;
        unsigned long execute_start, execute_end, execute_len;
        unsigned long kernel_start, kernel_end, kernel_len;
        unsigned long initrd_start, initrd_end, initrd_len;
        struct sme_populate_pgd_data ppd;
        unsigned long pgtable_area_len;
        unsigned long decrypted_base;

        if (!sme_active())
                return;

        /*
         * Prepare for encrypting the kernel and initrd by building new
         * pagetables with the attributes needed to encrypt the kernel
         * in place.
         *
         *   One range of virtual addresses will map the memory occupied
         *   by the kernel and initrd as encrypted.
         *
         *   Another range of virtual addresses will map the memory occupied
         *   by the kernel and initrd as decrypted and write-protected.
         *
         *     The use of the write-protect attribute will prevent any of
         *     the memory from being cached.
         */

        /* Physical addresses give us the identity-mapped virtual addresses */
        kernel_start = __pa_symbol(_text);
        kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE);
        kernel_len = kernel_end - kernel_start;

        initrd_start = 0;
        initrd_end = 0;
        initrd_len = 0;
#ifdef CONFIG_BLK_DEV_INITRD
        initrd_len = (unsigned long)bp->hdr.ramdisk_size |
                     ((unsigned long)bp->ext_ramdisk_size << 32);
        if (initrd_len) {
                initrd_start = (unsigned long)bp->hdr.ramdisk_image |
                               ((unsigned long)bp->ext_ramdisk_image << 32);
                initrd_end = PAGE_ALIGN(initrd_start + initrd_len);
                initrd_len = initrd_end - initrd_start;
        }
#endif

        /* Set the encryption workarea to be immediately after the kernel */
        workarea_start = kernel_end;

        /*
         * Calculate the number of workarea bytes needed:
         *   executable encryption area size:
         *     stack page (PAGE_SIZE)
         *     encryption routine page (PAGE_SIZE)
         *     intermediate copy buffer (PMD_PAGE_SIZE)
         *   pagetable structures for the encryption of the kernel
         *   pagetable structures for workarea (in case not currently mapped)
         */
        execute_start = workarea_start;
        execute_end = execute_start + (PAGE_SIZE * 2) + PMD_PAGE_SIZE;
        execute_len = execute_end - execute_start;

        /*
         * One PGD for both encrypted and decrypted mappings and a set of
         * PUDs and PMDs for each of the encrypted and decrypted mappings.
         */
        pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD;
        pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2;
        if (initrd_len)
                pgtable_area_len += sme_pgtable_calc(initrd_len) * 2;

        /* PUDs and PMDs needed in the current pagetables for the workarea */
        pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len);

        /*
         * The total workarea includes the executable encryption area and
         * the pagetable area. The start of the workarea is already 2MB
         * aligned, align the end of the workarea on a 2MB boundary so that
         * we don't try to create/allocate PTE entries from the workarea
         * before it is mapped.
         */
        workarea_len = execute_len + pgtable_area_len;
        workarea_end = ALIGN(workarea_start + workarea_len, PMD_PAGE_SIZE);

        /*
         * Set the address to the start of where newly created pagetable
         * structures (PGDs, PUDs and PMDs) will be allocated. New pagetable
         * structures are created when the workarea is added to the current
         * pagetables and when the new encrypted and decrypted kernel
         * mappings are populated.
         */
        ppd.pgtable_area = (void *)execute_end;

        /*
         * Make sure the current pagetable structure has entries for
         * addressing the workarea.
         */
        ppd.pgd = (pgd_t *)native_read_cr3_pa();
        ppd.paddr = workarea_start;
        ppd.vaddr = workarea_start;
        ppd.vaddr_end = workarea_end;
        sme_map_range_decrypted(&ppd);

        /* Flush the TLB - no globals so cr3 is enough */
        native_write_cr3(__native_read_cr3());

        /*
         * A new pagetable structure is being built to allow for the kernel
         * and initrd to be encrypted. It starts with an empty PGD that will
         * then be populated with new PUDs and PMDs as the encrypted and
         * decrypted kernel mappings are created.
         */
        ppd.pgd = ppd.pgtable_area;
        memset(ppd.pgd, 0, sizeof(pgd_t) * PTRS_PER_PGD);
        ppd.pgtable_area += sizeof(pgd_t) * PTRS_PER_PGD;

        /*
         * A different PGD index/entry must be used to get different
         * pagetable entries for the decrypted mapping. Choose the next
         * PGD index and convert it to a virtual address to be used as
         * the base of the mapping.
         */
        decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1);
        if (initrd_len) {
                unsigned long check_base;

                check_base = (pgd_index(initrd_end) + 1) & (PTRS_PER_PGD - 1);
                decrypted_base = max(decrypted_base, check_base);
        }
        decrypted_base <<= PGDIR_SHIFT;

        /* Add encrypted kernel (identity) mappings */
        ppd.paddr = kernel_start;
        ppd.vaddr = kernel_start;
        ppd.vaddr_end = kernel_end;
        sme_map_range_encrypted(&ppd);

        /* Add decrypted, write-protected kernel (non-identity) mappings */
        ppd.paddr = kernel_start;
        ppd.vaddr = kernel_start + decrypted_base;
        ppd.vaddr_end = kernel_end + decrypted_base;
        sme_map_range_decrypted_wp(&ppd);

        if (initrd_len) {
                /* Add encrypted initrd (identity) mappings */
                ppd.paddr = initrd_start;
                ppd.vaddr = initrd_start;
                ppd.vaddr_end = initrd_end;
                sme_map_range_encrypted(&ppd);
                /*
                 * Add decrypted, write-protected initrd (non-identity) mappings
                 */
                ppd.paddr = initrd_start;
                ppd.vaddr = initrd_start + decrypted_base;
                ppd.vaddr_end = initrd_end + decrypted_base;
                sme_map_range_decrypted_wp(&ppd);
        }

        /* Add decrypted workarea mappings to both kernel mappings */
        ppd.paddr = workarea_start;
        ppd.vaddr = workarea_start;
        ppd.vaddr_end = workarea_end;
        sme_map_range_decrypted(&ppd);

        ppd.paddr = workarea_start;
        ppd.vaddr = workarea_start + decrypted_base;
        ppd.vaddr_end = workarea_end + decrypted_base;
        sme_map_range_decrypted(&ppd);

        /* Perform the encryption */
        sme_encrypt_execute(kernel_start, kernel_start + decrypted_base,
                            kernel_len, workarea_start, (unsigned long)ppd.pgd);

        if (initrd_len)
                sme_encrypt_execute(initrd_start, initrd_start + decrypted_base,
                                    initrd_len, workarea_start,
                                    (unsigned long)ppd.pgd);

        /*
         * At this point we are running encrypted.  Remove the mappings for
         * the decrypted areas - all that is needed for this is to remove
         * the PGD entry/entries.
         */
        ppd.vaddr = kernel_start + decrypted_base;
        ppd.vaddr_end = kernel_end + decrypted_base;
        sme_clear_pgd(&ppd);

        if (initrd_len) {
                ppd.vaddr = initrd_start + decrypted_base;
                ppd.vaddr_end = initrd_end + decrypted_base;
                sme_clear_pgd(&ppd);
        }

        ppd.vaddr = workarea_start + decrypted_base;
        ppd.vaddr_end = workarea_end + decrypted_base;
        sme_clear_pgd(&ppd);

        /* Flush the TLB - no globals so cr3 is enough */
        native_write_cr3(__native_read_cr3());
}

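/*
 * Determine whether SME or SEV is supported and enabled. This runs very
 * early, while the kernel is still running on the identity mapping, so the
 * command line strings above must be referenced rip-relative. For SME the
 * "mem_encrypt=" option (and CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT)
 * decides whether sme_me_mask is set; for SEV the MSR state alone decides.
 */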
void __init __nostackprotector sme_enable(struct boot_params *bp)
{
        const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off;
        unsigned int eax, ebx, ecx, edx;
        unsigned long feature_mask;
        bool active_by_default;
        unsigned long me_mask;
        char buffer[16];
        u64 msr;

        /* Check for the SME/SEV support leaf */
        eax = 0x80000000;
        ecx = 0;
        native_cpuid(&eax, &ebx, &ecx, &edx);
        if (eax < 0x8000001f)
                return;

#define AMD_SME_BIT     BIT(0)
#define AMD_SEV_BIT     BIT(1)
        /*
         * Set the feature mask (SME or SEV) based on whether we are
         * running under a hypervisor.
         */
        eax = 1;
        ecx = 0;
        native_cpuid(&eax, &ebx, &ecx, &edx);
        feature_mask = (ecx & BIT(31)) ? AMD_SEV_BIT : AMD_SME_BIT;

        /*
         * Check for the SME/SEV feature:
         *   CPUID Fn8000_001F[EAX]
         *   - Bit 0 - Secure Memory Encryption support
         *   - Bit 1 - Secure Encrypted Virtualization support
         *   CPUID Fn8000_001F[EBX]
         *   - Bits 5:0 - Pagetable bit position used to indicate encryption
         */
        eax = 0x8000001f;
        ecx = 0;
        native_cpuid(&eax, &ebx, &ecx, &edx);
        if (!(eax & feature_mask))
                return;

        me_mask = 1UL << (ebx & 0x3f);

        /* Check if memory encryption is enabled */
        if (feature_mask == AMD_SME_BIT) {
                /* For SME, check the SYSCFG MSR */
                msr = __rdmsr(MSR_K8_SYSCFG);
                if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
                        return;
        } else {
                /* For SEV, check the SEV MSR */
                msr = __rdmsr(MSR_AMD64_SEV);
                if (!(msr & MSR_AMD64_SEV_ENABLED))
                        return;

                /* SEV state cannot be controlled by a command line option */
                sme_me_mask = me_mask;
                sev_enabled = true;
                return;
        }

        /*
         * Fixups have not been applied to phys_base yet and we're running
         * identity mapped, so we must obtain the address to the SME command
         * line argument data using rip-relative addressing.
         */
        asm ("lea sme_cmdline_arg(%%rip), %0"
             : "=r" (cmdline_arg)
             : "p" (sme_cmdline_arg));
        asm ("lea sme_cmdline_on(%%rip), %0"
             : "=r" (cmdline_on)
             : "p" (sme_cmdline_on));
        asm ("lea sme_cmdline_off(%%rip), %0"
             : "=r" (cmdline_off)
             : "p" (sme_cmdline_off));

        if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT))
                active_by_default = true;
        else
                active_by_default = false;

        cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr |
                                     ((u64)bp->ext_cmd_line_ptr << 32));

        cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer));

        if (!strncmp(buffer, cmdline_on, sizeof(buffer)))
                sme_me_mask = me_mask;
        else if (!strncmp(buffer, cmdline_off, sizeof(buffer)))
                sme_me_mask = 0;
        else
                sme_me_mask = active_by_default ? me_mask : 0;
}