   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * tools/testing/selftests/kvm/lib/x86_64/processor.c
   4 *
   5 * Copyright (C) 2018, Google LLC.
   6 */
   7
   8#include "test_util.h"
   9#include "kvm_util.h"
  10#include "../kvm_util_internal.h"
  11#include "processor.h"
  12
  13#ifndef NUM_INTERRUPTS
  14#define NUM_INTERRUPTS 256
  15#endif
  16
  17#define DEFAULT_CODE_SELECTOR 0x8
  18#define DEFAULT_DATA_SELECTOR 0x10
  19
  20vm_vaddr_t exception_handlers;
  21
  22void regs_dump(FILE *stream, struct kvm_regs *regs,
  23               uint8_t indent)
  24{
  25        fprintf(stream, "%*srax: 0x%.16llx rbx: 0x%.16llx "
  26                "rcx: 0x%.16llx rdx: 0x%.16llx\n",
  27                indent, "",
  28                regs->rax, regs->rbx, regs->rcx, regs->rdx);
  29        fprintf(stream, "%*srsi: 0x%.16llx rdi: 0x%.16llx "
  30                "rsp: 0x%.16llx rbp: 0x%.16llx\n",
  31                indent, "",
  32                regs->rsi, regs->rdi, regs->rsp, regs->rbp);
  33        fprintf(stream, "%*sr8:  0x%.16llx r9:  0x%.16llx "
  34                "r10: 0x%.16llx r11: 0x%.16llx\n",
  35                indent, "",
  36                regs->r8, regs->r9, regs->r10, regs->r11);
  37        fprintf(stream, "%*sr12: 0x%.16llx r13: 0x%.16llx "
  38                "r14: 0x%.16llx r15: 0x%.16llx\n",
  39                indent, "",
  40                regs->r12, regs->r13, regs->r14, regs->r15);
  41        fprintf(stream, "%*srip: 0x%.16llx rfl: 0x%.16llx\n",
  42                indent, "",
  43                regs->rip, regs->rflags);
  44}
  45
  46/*
  47 * Segment Dump
  48 *
  49 * Input Args:
  50 *   stream  - Output FILE stream
  51 *   segment - KVM segment
  52 *   indent  - Left margin indent amount
  53 *
  54 * Output Args: None
  55 *
  56 * Return: None
  57 *
  58 * Dumps the state of the KVM segment given by @segment, to the FILE stream
  59 * given by @stream.
  60 */
  61static void segment_dump(FILE *stream, struct kvm_segment *segment,
  62                         uint8_t indent)
  63{
  64        fprintf(stream, "%*sbase: 0x%.16llx limit: 0x%.8x "
  65                "selector: 0x%.4x type: 0x%.2x\n",
  66                indent, "", segment->base, segment->limit,
  67                segment->selector, segment->type);
  68        fprintf(stream, "%*spresent: 0x%.2x dpl: 0x%.2x "
  69                "db: 0x%.2x s: 0x%.2x l: 0x%.2x\n",
  70                indent, "", segment->present, segment->dpl,
  71                segment->db, segment->s, segment->l);
  72        fprintf(stream, "%*sg: 0x%.2x avl: 0x%.2x "
  73                "unusable: 0x%.2x padding: 0x%.2x\n",
  74                indent, "", segment->g, segment->avl,
  75                segment->unusable, segment->padding);
  76}
  77
  78/*
  79 * dtable Dump
  80 *
  81 * Input Args:
  82 *   stream - Output FILE stream
  83 *   dtable - KVM dtable
  84 *   indent - Left margin indent amount
  85 *
  86 * Output Args: None
  87 *
  88 * Return: None
  89 *
  90 * Dumps the state of the KVM dtable given by @dtable, to the FILE stream
  91 * given by @stream.
  92 */
  93static void dtable_dump(FILE *stream, struct kvm_dtable *dtable,
  94                        uint8_t indent)
  95{
  96        fprintf(stream, "%*sbase: 0x%.16llx limit: 0x%.4x "
  97                "padding: 0x%.4x 0x%.4x 0x%.4x\n",
  98                indent, "", dtable->base, dtable->limit,
  99                dtable->padding[0], dtable->padding[1], dtable->padding[2]);
 100}
 101
 102void sregs_dump(FILE *stream, struct kvm_sregs *sregs,
 103                uint8_t indent)
 104{
 105        unsigned int i;
 106
 107        fprintf(stream, "%*scs:\n", indent, "");
 108        segment_dump(stream, &sregs->cs, indent + 2);
 109        fprintf(stream, "%*sds:\n", indent, "");
 110        segment_dump(stream, &sregs->ds, indent + 2);
 111        fprintf(stream, "%*ses:\n", indent, "");
 112        segment_dump(stream, &sregs->es, indent + 2);
 113        fprintf(stream, "%*sfs:\n", indent, "");
 114        segment_dump(stream, &sregs->fs, indent + 2);
 115        fprintf(stream, "%*sgs:\n", indent, "");
 116        segment_dump(stream, &sregs->gs, indent + 2);
 117        fprintf(stream, "%*sss:\n", indent, "");
 118        segment_dump(stream, &sregs->ss, indent + 2);
 119        fprintf(stream, "%*str:\n", indent, "");
 120        segment_dump(stream, &sregs->tr, indent + 2);
 121        fprintf(stream, "%*sldt:\n", indent, "");
 122        segment_dump(stream, &sregs->ldt, indent + 2);
 123
 124        fprintf(stream, "%*sgdt:\n", indent, "");
 125        dtable_dump(stream, &sregs->gdt, indent + 2);
 126        fprintf(stream, "%*sidt:\n", indent, "");
 127        dtable_dump(stream, &sregs->idt, indent + 2);
 128
 129        fprintf(stream, "%*scr0: 0x%.16llx cr2: 0x%.16llx "
 130                "cr3: 0x%.16llx cr4: 0x%.16llx\n",
 131                indent, "",
 132                sregs->cr0, sregs->cr2, sregs->cr3, sregs->cr4);
 133        fprintf(stream, "%*scr8: 0x%.16llx efer: 0x%.16llx "
 134                "apic_base: 0x%.16llx\n",
 135                indent, "",
 136                sregs->cr8, sregs->efer, sregs->apic_base);
 137
 138        fprintf(stream, "%*sinterrupt_bitmap:\n", indent, "");
 139        for (i = 0; i < (KVM_NR_INTERRUPTS + 63) / 64; i++) {
 140                fprintf(stream, "%*s%.16llx\n", indent + 2, "",
 141                        sregs->interrupt_bitmap[i]);
 142        }
 143}
 144
 145void virt_pgd_alloc(struct kvm_vm *vm)
 146{
 147        TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
 148                "unknown or unsupported guest mode, mode: 0x%x", vm->mode);
 149
 150        /* If needed, create page map l4 table. */
 151        if (!vm->pgd_created) {
 152                vm->pgd = vm_alloc_page_table(vm);
 153                vm->pgd_created = true;
 154        }
 155}
 156
 157static void *virt_get_pte(struct kvm_vm *vm, uint64_t pt_pfn, uint64_t vaddr,
 158                          int level)
 159{
 160        uint64_t *page_table = addr_gpa2hva(vm, pt_pfn << vm->page_shift);
 161        int index = (vaddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu;
 162
 163        return &page_table[index];
 164}
 165
 166static uint64_t *virt_create_upper_pte(struct kvm_vm *vm,
 167                                       uint64_t pt_pfn,
 168                                       uint64_t vaddr,
 169                                       uint64_t paddr,
 170                                       int current_level,
 171                                       int target_level)
 172{
 173        uint64_t *pte = virt_get_pte(vm, pt_pfn, vaddr, current_level);
 174
 175        if (!(*pte & PTE_PRESENT_MASK)) {
 176                *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK;
 177                if (current_level == target_level)
 178                        *pte |= PTE_LARGE_MASK | (paddr & PHYSICAL_PAGE_MASK);
 179                else
 180                        *pte |= vm_alloc_page_table(vm) & PHYSICAL_PAGE_MASK;
 181        } else {
 182                /*
 183                 * Entry already present.  Assert that the caller doesn't want
 184                 * a hugepage at this level, and that there isn't a hugepage at
 185                 * this level.
 186                 */
 187                TEST_ASSERT(current_level != target_level,
 188                            "Cannot create hugepage at level: %u, vaddr: 0x%lx\n",
 189                            current_level, vaddr);
 190                TEST_ASSERT(!(*pte & PTE_LARGE_MASK),
 191                            "Cannot create page table at level: %u, vaddr: 0x%lx\n",
 192                            current_level, vaddr);
 193        }
 194        return pte;
 195}
 196
 197void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level)
 198{
 199        const uint64_t pg_size = PG_LEVEL_SIZE(level);
 200        uint64_t *pml4e, *pdpe, *pde;
 201        uint64_t *pte;
 202
 203        TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K,
 204                    "Unknown or unsupported guest mode, mode: 0x%x", vm->mode);
 205
 206        TEST_ASSERT((vaddr % pg_size) == 0,
 207                    "Virtual address not aligned,\n"
 208                    "vaddr: 0x%lx page size: 0x%lx", vaddr, pg_size);
 209        TEST_ASSERT(sparsebit_is_set(vm->vpages_valid, (vaddr >> vm->page_shift)),
 210                    "Invalid virtual address, vaddr: 0x%lx", vaddr);
 211        TEST_ASSERT((paddr % pg_size) == 0,
 212                    "Physical address not aligned,\n"
 213                    "  paddr: 0x%lx page size: 0x%lx", paddr, pg_size);
 214        TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn,
 215                    "Physical address beyond maximum supported,\n"
 216                    "  paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x",
 217                    paddr, vm->max_gfn, vm->page_size);
 218
 219        /*
 220         * Allocate upper level page tables, if not already present.  Return
 221         * early if a hugepage was created.
 222         */
 223        pml4e = virt_create_upper_pte(vm, vm->pgd >> vm->page_shift,
 224                                      vaddr, paddr, PG_LEVEL_512G, level);
 225        if (*pml4e & PTE_LARGE_MASK)
 226                return;
 227
 228        pdpe = virt_create_upper_pte(vm, PTE_GET_PFN(*pml4e), vaddr, paddr, PG_LEVEL_1G, level);
 229        if (*pdpe & PTE_LARGE_MASK)
 230                return;
 231
 232        pde = virt_create_upper_pte(vm, PTE_GET_PFN(*pdpe), vaddr, paddr, PG_LEVEL_2M, level);
 233        if (*pde & PTE_LARGE_MASK)
 234                return;
 235
 236        /* Fill in page table entry. */
 237        pte = virt_get_pte(vm, PTE_GET_PFN(*pde), vaddr, PG_LEVEL_4K);
 238        TEST_ASSERT(!(*pte & PTE_PRESENT_MASK),
 239                    "PTE already present for 4k page at vaddr: 0x%lx\n", vaddr);
 240        *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK);
 241}
 242
 243void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr)
 244{
 245        __virt_pg_map(vm, vaddr, paddr, PG_LEVEL_4K);
 246}
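
/*
 * Illustrative usage, not part of this library: virt_pg_map() is the 4K
 * shorthand used by most tests, while __virt_pg_map() takes an explicit
 * level for hugepage mappings.  Per the asserts above, both the virtual and
 * physical address must be aligned to the requested page size.  A sketch,
 * assuming the test already holds a 2M-aligned guest physical range @paddr
 * and a 2M-aligned guest virtual address @vaddr:
 *
 *        __virt_pg_map(vm, vaddr, paddr, PG_LEVEL_2M);
 */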
 247
 248static uint64_t *_vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid,
 249                                                       uint64_t vaddr)
 250{
 251        uint16_t index[4];
 252        uint64_t *pml4e, *pdpe, *pde;
 253        uint64_t *pte;
 254        struct kvm_cpuid_entry2 *entry;
 255        struct kvm_sregs sregs;
 256        int max_phy_addr;
 257        uint64_t rsvd_mask = 0;
 258
 259        entry = kvm_get_supported_cpuid_index(0x80000008, 0);
 260        max_phy_addr = entry->eax & 0x000000ff;
 261        /* Set the high bits in the reserved mask. */
 262        if (max_phy_addr < 52)
 263                rsvd_mask = GENMASK_ULL(51, max_phy_addr);
 264
 265        /*
 266         * SDM vol 3, fig 4-11 "Formats of CR3 and Paging-Structure Entries
 267         * with 4-Level Paging and 5-Level Paging".
 268         * If IA32_EFER.NXE = 0 and the P flag of a paging-structure entry is 1,
 269         * the XD flag (bit 63) is reserved.
 270         */
 271        vcpu_sregs_get(vm, vcpuid, &sregs);
 272        if ((sregs.efer & EFER_NX) == 0) {
 273                rsvd_mask |= PTE_NX_MASK;
 274        }
 275
 276        TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
 277                "unknown or unsupported guest mode, mode: 0x%x", vm->mode);
 278        TEST_ASSERT(sparsebit_is_set(vm->vpages_valid,
 279                (vaddr >> vm->page_shift)),
 280                "Invalid virtual address, vaddr: 0x%lx",
 281                vaddr);
        /*
         * Based on the mode check above there are 48 bits in the vaddr, so
         * shift left and then right by 16 to sign extend bit 47 (the highest
         * implemented virtual address bit).
         */
 286        TEST_ASSERT(vaddr == (((int64_t)vaddr << 16) >> 16),
 287                "Canonical check failed.  The virtual address is invalid.");
 288
 289        index[0] = (vaddr >> 12) & 0x1ffu;
 290        index[1] = (vaddr >> 21) & 0x1ffu;
 291        index[2] = (vaddr >> 30) & 0x1ffu;
 292        index[3] = (vaddr >> 39) & 0x1ffu;
 293
 294        pml4e = addr_gpa2hva(vm, vm->pgd);
 295        TEST_ASSERT(pml4e[index[3]] & PTE_PRESENT_MASK,
 296                "Expected pml4e to be present for gva: 0x%08lx", vaddr);
 297        TEST_ASSERT((pml4e[index[3]] & (rsvd_mask | PTE_LARGE_MASK)) == 0,
 298                "Unexpected reserved bits set.");
 299
 300        pdpe = addr_gpa2hva(vm, PTE_GET_PFN(pml4e[index[3]]) * vm->page_size);
 301        TEST_ASSERT(pdpe[index[2]] & PTE_PRESENT_MASK,
 302                "Expected pdpe to be present for gva: 0x%08lx", vaddr);
 303        TEST_ASSERT(!(pdpe[index[2]] & PTE_LARGE_MASK),
 304                "Expected pdpe to map a pde not a 1-GByte page.");
 305        TEST_ASSERT((pdpe[index[2]] & rsvd_mask) == 0,
 306                "Unexpected reserved bits set.");
 307
 308        pde = addr_gpa2hva(vm, PTE_GET_PFN(pdpe[index[2]]) * vm->page_size);
 309        TEST_ASSERT(pde[index[1]] & PTE_PRESENT_MASK,
 310                "Expected pde to be present for gva: 0x%08lx", vaddr);
 311        TEST_ASSERT(!(pde[index[1]] & PTE_LARGE_MASK),
 312                "Expected pde to map a pte not a 2-MByte page.");
 313        TEST_ASSERT((pde[index[1]] & rsvd_mask) == 0,
 314                "Unexpected reserved bits set.");
 315
 316        pte = addr_gpa2hva(vm, PTE_GET_PFN(pde[index[1]]) * vm->page_size);
 317        TEST_ASSERT(pte[index[0]] & PTE_PRESENT_MASK,
 318                "Expected pte to be present for gva: 0x%08lx", vaddr);
 319
 320        return &pte[index[0]];
 321}
 322
 323uint64_t vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr)
 324{
 325        uint64_t *pte = _vm_get_page_table_entry(vm, vcpuid, vaddr);
 326
 327        return *(uint64_t *)pte;
 328}
 329
 330void vm_set_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr,
 331                             uint64_t pte)
 332{
 333        uint64_t *new_pte = _vm_get_page_table_entry(vm, vcpuid, vaddr);
 334
 335        *(uint64_t *)new_pte = pte;
 336}
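
/*
 * Illustrative usage, not part of this library: the two helpers above let a
 * test read and rewrite a guest PTE from host userspace, e.g. to clear the
 * NX bit of a final 4K mapping; flushing any stale guest TLB entry is the
 * test's responsibility.  VCPU_ID and guest_vaddr are hypothetical
 * test-defined names:
 *
 *        uint64_t pte = vm_get_page_table_entry(vm, VCPU_ID, guest_vaddr);
 *
 *        vm_set_page_table_entry(vm, VCPU_ID, guest_vaddr, pte & ~PTE_NX_MASK);
 */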
 337
 338void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
 339{
 340        uint64_t *pml4e, *pml4e_start;
 341        uint64_t *pdpe, *pdpe_start;
 342        uint64_t *pde, *pde_start;
 343        uint64_t *pte, *pte_start;
 344
 345        if (!vm->pgd_created)
 346                return;
 347
 348        fprintf(stream, "%*s                                          "
 349                "                no\n", indent, "");
 350        fprintf(stream, "%*s      index hvaddr         gpaddr         "
 351                "addr         w exec dirty\n",
 352                indent, "");
 353        pml4e_start = (uint64_t *) addr_gpa2hva(vm, vm->pgd);
 354        for (uint16_t n1 = 0; n1 <= 0x1ffu; n1++) {
 355                pml4e = &pml4e_start[n1];
 356                if (!(*pml4e & PTE_PRESENT_MASK))
 357                        continue;
 358                fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10llx %u "
 359                        " %u\n",
 360                        indent, "",
 361                        pml4e - pml4e_start, pml4e,
 362                        addr_hva2gpa(vm, pml4e), PTE_GET_PFN(*pml4e),
 363                        !!(*pml4e & PTE_WRITABLE_MASK), !!(*pml4e & PTE_NX_MASK));
 364
 365                pdpe_start = addr_gpa2hva(vm, *pml4e & PHYSICAL_PAGE_MASK);
 366                for (uint16_t n2 = 0; n2 <= 0x1ffu; n2++) {
 367                        pdpe = &pdpe_start[n2];
 368                        if (!(*pdpe & PTE_PRESENT_MASK))
 369                                continue;
 370                        fprintf(stream, "%*spdpe  0x%-3zx %p 0x%-12lx 0x%-10llx "
 371                                "%u  %u\n",
 372                                indent, "",
 373                                pdpe - pdpe_start, pdpe,
 374                                addr_hva2gpa(vm, pdpe),
 375                                PTE_GET_PFN(*pdpe), !!(*pdpe & PTE_WRITABLE_MASK),
 376                                !!(*pdpe & PTE_NX_MASK));
 377
 378                        pde_start = addr_gpa2hva(vm, *pdpe & PHYSICAL_PAGE_MASK);
 379                        for (uint16_t n3 = 0; n3 <= 0x1ffu; n3++) {
 380                                pde = &pde_start[n3];
 381                                if (!(*pde & PTE_PRESENT_MASK))
 382                                        continue;
 383                                fprintf(stream, "%*spde   0x%-3zx %p "
 384                                        "0x%-12lx 0x%-10llx %u  %u\n",
 385                                        indent, "", pde - pde_start, pde,
 386                                        addr_hva2gpa(vm, pde),
 387                                        PTE_GET_PFN(*pde), !!(*pde & PTE_WRITABLE_MASK),
 388                                        !!(*pde & PTE_NX_MASK));
 389
 390                                pte_start = addr_gpa2hva(vm, *pde & PHYSICAL_PAGE_MASK);
 391                                for (uint16_t n4 = 0; n4 <= 0x1ffu; n4++) {
 392                                        pte = &pte_start[n4];
 393                                        if (!(*pte & PTE_PRESENT_MASK))
 394                                                continue;
 395                                        fprintf(stream, "%*spte   0x%-3zx %p "
 396                                                "0x%-12lx 0x%-10llx %u  %u "
 397                                                "    %u    0x%-10lx\n",
 398                                                indent, "",
 399                                                pte - pte_start, pte,
 400                                                addr_hva2gpa(vm, pte),
 401                                                PTE_GET_PFN(*pte),
 402                                                !!(*pte & PTE_WRITABLE_MASK),
 403                                                !!(*pte & PTE_NX_MASK),
 404                                                !!(*pte & PTE_DIRTY_MASK),
 405                                                ((uint64_t) n1 << 27)
 406                                                        | ((uint64_t) n2 << 18)
 407                                                        | ((uint64_t) n3 << 9)
 408                                                        | ((uint64_t) n4));
 409                                }
 410                        }
 411                }
 412        }
 413}
 414
 415/*
 416 * Set Unusable Segment
 417 *
 418 * Input Args: None
 419 *
 420 * Output Args:
 421 *   segp - Pointer to segment register
 422 *
 423 * Return: None
 424 *
 425 * Sets the segment register pointed to by @segp to an unusable state.
 426 */
 427static void kvm_seg_set_unusable(struct kvm_segment *segp)
 428{
 429        memset(segp, 0, sizeof(*segp));
 430        segp->unusable = true;
 431}
 432
 433static void kvm_seg_fill_gdt_64bit(struct kvm_vm *vm, struct kvm_segment *segp)
 434{
 435        void *gdt = addr_gva2hva(vm, vm->gdt);
 436        struct desc64 *desc = gdt + (segp->selector >> 3) * 8;
 437
 438        desc->limit0 = segp->limit & 0xFFFF;
 439        desc->base0 = segp->base & 0xFFFF;
 440        desc->base1 = segp->base >> 16;
 441        desc->type = segp->type;
 442        desc->s = segp->s;
 443        desc->dpl = segp->dpl;
 444        desc->p = segp->present;
 445        desc->limit1 = segp->limit >> 16;
 446        desc->avl = segp->avl;
 447        desc->l = segp->l;
 448        desc->db = segp->db;
 449        desc->g = segp->g;
 450        desc->base2 = segp->base >> 24;
 451        if (!segp->s)
 452                desc->base3 = segp->base >> 32;
 453}
 454
 455
 456/*
 457 * Set Long Mode Flat Kernel Code Segment
 458 *
 459 * Input Args:
 460 *   vm - VM whose GDT is being filled, or NULL to only write segp
 461 *   selector - selector value
 462 *
 463 * Output Args:
 464 *   segp - Pointer to KVM segment
 465 *
 466 * Return: None
 467 *
 468 * Sets up the KVM segment pointed to by @segp, to be a code segment
 469 * with the selector value given by @selector.
 470 */
 471static void kvm_seg_set_kernel_code_64bit(struct kvm_vm *vm, uint16_t selector,
 472        struct kvm_segment *segp)
 473{
 474        memset(segp, 0, sizeof(*segp));
 475        segp->selector = selector;
 476        segp->limit = 0xFFFFFFFFu;
 477        segp->s = 0x1; /* kTypeCodeData */
 478        segp->type = 0x08 | 0x01 | 0x02; /* kFlagCode | kFlagCodeAccessed
 479                                          * | kFlagCodeReadable
 480                                          */
 481        segp->g = true;
 482        segp->l = true;
 483        segp->present = 1;
 484        if (vm)
 485                kvm_seg_fill_gdt_64bit(vm, segp);
 486}
 487
 488/*
 489 * Set Long Mode Flat Kernel Data Segment
 490 *
 491 * Input Args:
 492 *   vm - VM whose GDT is being filled, or NULL to only write segp
 493 *   selector - selector value
 494 *
 495 * Output Args:
 496 *   segp - Pointer to KVM segment
 497 *
 498 * Return: None
 499 *
 500 * Sets up the KVM segment pointed to by @segp, to be a data segment
 501 * with the selector value given by @selector.
 502 */
 503static void kvm_seg_set_kernel_data_64bit(struct kvm_vm *vm, uint16_t selector,
 504        struct kvm_segment *segp)
 505{
 506        memset(segp, 0, sizeof(*segp));
 507        segp->selector = selector;
 508        segp->limit = 0xFFFFFFFFu;
 509        segp->s = 0x1; /* kTypeCodeData */
 510        segp->type = 0x00 | 0x01 | 0x02; /* kFlagData | kFlagDataAccessed
 511                                          * | kFlagDataWritable
 512                                          */
 513        segp->g = true;
 514        segp->present = true;
 515        if (vm)
 516                kvm_seg_fill_gdt_64bit(vm, segp);
 517}
 518
 519vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
 520{
 521        uint16_t index[4];
 522        uint64_t *pml4e, *pdpe, *pde;
 523        uint64_t *pte;
 524
 525        TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
 526                "unknown or unsupported guest mode, mode: 0x%x", vm->mode);
 527
 528        index[0] = (gva >> 12) & 0x1ffu;
 529        index[1] = (gva >> 21) & 0x1ffu;
 530        index[2] = (gva >> 30) & 0x1ffu;
 531        index[3] = (gva >> 39) & 0x1ffu;
 532
 533        if (!vm->pgd_created)
 534                goto unmapped_gva;
 535        pml4e = addr_gpa2hva(vm, vm->pgd);
 536        if (!(pml4e[index[3]] & PTE_PRESENT_MASK))
 537                goto unmapped_gva;
 538
 539        pdpe = addr_gpa2hva(vm, PTE_GET_PFN(pml4e[index[3]]) * vm->page_size);
 540        if (!(pdpe[index[2]] & PTE_PRESENT_MASK))
 541                goto unmapped_gva;
 542
 543        pde = addr_gpa2hva(vm, PTE_GET_PFN(pdpe[index[2]]) * vm->page_size);
 544        if (!(pde[index[1]] & PTE_PRESENT_MASK))
 545                goto unmapped_gva;
 546
 547        pte = addr_gpa2hva(vm, PTE_GET_PFN(pde[index[1]]) * vm->page_size);
 548        if (!(pte[index[0]] & PTE_PRESENT_MASK))
 549                goto unmapped_gva;
 550
 551        return (PTE_GET_PFN(pte[index[0]]) * vm->page_size) + (gva & ~PAGE_MASK);
 552
 553unmapped_gva:
 554        TEST_FAIL("No mapping for vm virtual address, gva: 0x%lx", gva);
 555        exit(EXIT_FAILURE);
 556}
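
/*
 * Illustrative usage, not part of this library: combined with
 * addr_gpa2hva(), the translation above lets host code access guest memory
 * through a guest virtual address (essentially what addr_gva2hva() does).
 * guest_vaddr is a hypothetical, already-mapped guest virtual address:
 *
 *        vm_paddr_t gpa = addr_gva2gpa(vm, guest_vaddr);
 *        uint64_t *hva = addr_gpa2hva(vm, gpa);
 *        uint64_t val = *hva;
 */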
 557
 558static void kvm_setup_gdt(struct kvm_vm *vm, struct kvm_dtable *dt)
 559{
 560        if (!vm->gdt)
 561                vm->gdt = vm_vaddr_alloc_page(vm);
 562
 563        dt->base = vm->gdt;
 564        dt->limit = getpagesize();
 565}
 566
 567static void kvm_setup_tss_64bit(struct kvm_vm *vm, struct kvm_segment *segp,
 568                                int selector)
 569{
 570        if (!vm->tss)
 571                vm->tss = vm_vaddr_alloc_page(vm);
 572
 573        memset(segp, 0, sizeof(*segp));
 574        segp->base = vm->tss;
 575        segp->limit = 0x67;
 576        segp->selector = selector;
 577        segp->type = 0xb;
 578        segp->present = 1;
 579        kvm_seg_fill_gdt_64bit(vm, segp);
 580}
 581
 582static void vcpu_setup(struct kvm_vm *vm, int vcpuid)
 583{
 584        struct kvm_sregs sregs;
 585
 586        /* Set mode specific system register values. */
 587        vcpu_sregs_get(vm, vcpuid, &sregs);
 588
 589        sregs.idt.limit = 0;
 590
 591        kvm_setup_gdt(vm, &sregs.gdt);
 592
 593        switch (vm->mode) {
 594        case VM_MODE_PXXV48_4K:
 595                sregs.cr0 = X86_CR0_PE | X86_CR0_NE | X86_CR0_PG;
 596                sregs.cr4 |= X86_CR4_PAE | X86_CR4_OSFXSR;
 597                sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX);
 598
 599                kvm_seg_set_unusable(&sregs.ldt);
 600                kvm_seg_set_kernel_code_64bit(vm, DEFAULT_CODE_SELECTOR, &sregs.cs);
 601                kvm_seg_set_kernel_data_64bit(vm, DEFAULT_DATA_SELECTOR, &sregs.ds);
 602                kvm_seg_set_kernel_data_64bit(vm, DEFAULT_DATA_SELECTOR, &sregs.es);
 603                kvm_setup_tss_64bit(vm, &sregs.tr, 0x18);
 604                break;
 605
 606        default:
 607                TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode);
 608        }
 609
 610        sregs.cr3 = vm->pgd;
 611        vcpu_sregs_set(vm, vcpuid, &sregs);
 612}
 613
 614#define CPUID_XFD_BIT (1 << 4)
 615static bool is_xfd_supported(void)
 616{
 617        int eax, ebx, ecx, edx;
 618        const int leaf = 0xd, subleaf = 0x1;
 619
 620        __asm__ __volatile__(
 621                "cpuid"
 622                : /* output */ "=a"(eax), "=b"(ebx),
 623                  "=c"(ecx), "=d"(edx)
 624                : /* input */ "0"(leaf), "2"(subleaf));
 625
 626        return !!(eax & CPUID_XFD_BIT);
 627}
 628
 629void vm_xsave_req_perm(int bit)
 630{
 631        int kvm_fd;
 632        u64 bitmask;
 633        long rc;
 634        struct kvm_device_attr attr = {
 635                .group = 0,
 636                .attr = KVM_X86_XCOMP_GUEST_SUPP,
 637                .addr = (unsigned long) &bitmask
 638        };
 639
 640        kvm_fd = open_kvm_dev_path_or_exit();
 641        rc = ioctl(kvm_fd, KVM_GET_DEVICE_ATTR, &attr);
 642        close(kvm_fd);
 643        if (rc == -1 && (errno == ENXIO || errno == EINVAL))
 644                exit(KSFT_SKIP);
 645        TEST_ASSERT(rc == 0, "KVM_GET_DEVICE_ATTR(0, KVM_X86_XCOMP_GUEST_SUPP) error: %ld", rc);
 646        if (!(bitmask & (1ULL << bit)))
 647                exit(KSFT_SKIP);
 648
 649        if (!is_xfd_supported())
 650                exit(KSFT_SKIP);
 651
 652        rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, bit);
 653
        /*
         * Kernels older than 5.15 do not support ARCH_REQ_XCOMP_GUEST_PERM;
         * on such kernels the prctl fails, so simply return.
         */
 658        if (rc)
 659                return;
 660
 661        rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &bitmask);
 662        TEST_ASSERT(rc == 0, "prctl(ARCH_GET_XCOMP_GUEST_PERM) error: %ld", rc);
 663        TEST_ASSERT(bitmask & (1ULL << bit),
 664                    "prctl(ARCH_REQ_XCOMP_GUEST_PERM) failure bitmask=0x%lx",
 665                    bitmask);
 666}
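
/*
 * Illustrative usage, not part of this library: a test for a dynamically
 * enabled XSAVE feature calls this before creating its vCPUs, passing the
 * xfeature bit it needs.  XFEATURE_XTILEDATA_BIT below is a hypothetical
 * name for the AMX tile data bit (assumed to be bit 18); use the test's own
 * definition:
 *
 *        vm_xsave_req_perm(XFEATURE_XTILEDATA_BIT);
 */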
 667
 668void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code)
 669{
 670        struct kvm_mp_state mp_state;
 671        struct kvm_regs regs;
 672        vm_vaddr_t stack_vaddr;
 673        stack_vaddr = vm_vaddr_alloc(vm, DEFAULT_STACK_PGS * getpagesize(),
 674                                     DEFAULT_GUEST_STACK_VADDR_MIN);
 675
 676        /* Create VCPU */
 677        vm_vcpu_add(vm, vcpuid);
 678        vcpu_set_cpuid(vm, vcpuid, kvm_get_supported_cpuid());
 679        vcpu_setup(vm, vcpuid);
 680
 681        /* Setup guest general purpose registers */
 682        vcpu_regs_get(vm, vcpuid, &regs);
 683        regs.rflags = regs.rflags | 0x2;
 684        regs.rsp = stack_vaddr + (DEFAULT_STACK_PGS * getpagesize());
 685        regs.rip = (unsigned long) guest_code;
 686        vcpu_regs_set(vm, vcpuid, &regs);
 687
 688        /* Setup the MP state */
 689        mp_state.mp_state = 0;
 690        vcpu_set_mp_state(vm, vcpuid, &mp_state);
 691}
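
/*
 * Illustrative usage, not part of this library: tests typically reach this
 * helper via vm_create_default(), but a test that builds its VM manually
 * can add a runnable vCPU directly.  VCPU_ID and guest_main are
 * hypothetical test-defined names:
 *
 *        vm_vcpu_add_default(vm, VCPU_ID, guest_main);
 *        vcpu_run(vm, VCPU_ID);
 */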
 692
 693/*
 694 * Allocate an instance of struct kvm_cpuid2
 695 *
 696 * Input Args: None
 697 *
 698 * Output Args: None
 699 *
 700 * Return: A pointer to the allocated struct. The caller is responsible
 701 * for freeing this struct.
 702 *
 * Since kvm_cpuid2 uses a 0-length array to allow the size of the
 704 * array to be decided at allocation time, allocation is slightly
 705 * complicated. This function uses a reasonable default length for
 706 * the array and performs the appropriate allocation.
 707 */
 708static struct kvm_cpuid2 *allocate_kvm_cpuid2(void)
 709{
 710        struct kvm_cpuid2 *cpuid;
 711        int nent = 100;
 712        size_t size;
 713
 714        size = sizeof(*cpuid);
 715        size += nent * sizeof(struct kvm_cpuid_entry2);
 716        cpuid = malloc(size);
 717        if (!cpuid) {
 718                perror("malloc");
 719                abort();
 720        }
 721
 722        cpuid->nent = nent;
 723
 724        return cpuid;
 725}
 726
 727/*
 728 * KVM Supported CPUID Get
 729 *
 730 * Input Args: None
 731 *
 732 * Output Args:
 * Output Args: None
 734 * Return: The supported KVM CPUID
 735 *
 736 * Get the guest CPUID supported by KVM.
 737 */
 738struct kvm_cpuid2 *kvm_get_supported_cpuid(void)
 739{
 740        static struct kvm_cpuid2 *cpuid;
 741        int ret;
 742        int kvm_fd;
 743
 744        if (cpuid)
 745                return cpuid;
 746
 747        cpuid = allocate_kvm_cpuid2();
 748        kvm_fd = open_kvm_dev_path_or_exit();
 749
 750        ret = ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid);
 751        TEST_ASSERT(ret == 0, "KVM_GET_SUPPORTED_CPUID failed %d %d\n",
 752                    ret, errno);
 753
 754        close(kvm_fd);
 755        return cpuid;
 756}
 757
/*
 * KVM Get Feature MSR
 *
 * Input Args:
 *   msr_index - Index of the feature MSR
 *
 * Output Args: None
 *
 * Return: On success, value of the MSR. On failure a TEST_ASSERT is produced.
 *
 * Get the value of a KVM feature MSR.  The MSR is read via KVM_GET_MSRS on
 * the /dev/kvm device, i.e. this is system-scoped state, not VCPU state.
 */
 770uint64_t kvm_get_feature_msr(uint64_t msr_index)
 771{
 772        struct {
 773                struct kvm_msrs header;
 774                struct kvm_msr_entry entry;
 775        } buffer = {};
 776        int r, kvm_fd;
 777
 778        buffer.header.nmsrs = 1;
 779        buffer.entry.index = msr_index;
 780        kvm_fd = open_kvm_dev_path_or_exit();
 781
 782        r = ioctl(kvm_fd, KVM_GET_MSRS, &buffer.header);
 783        TEST_ASSERT(r == 1, "KVM_GET_MSRS IOCTL failed,\n"
 784                "  rc: %i errno: %i", r, errno);
 785
 786        close(kvm_fd);
 787        return buffer.entry.data;
 788}
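
/*
 * Illustrative usage, not part of this library, assuming the MSR constant
 * comes from the tools copy of msr-index.h:
 *
 *        uint64_t arch_caps = kvm_get_feature_msr(MSR_IA32_ARCH_CAPABILITIES);
 */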
 789
/*
 * VM VCPU CPUID Get
 *
 * Input Args:
 *   vm - Virtual Machine
 *   vcpuid - VCPU id
 *
 * Output Args: None
 *
 * Return: The VCPU's CPUID as reported by KVM_GET_CPUID2.  The caller is
 * responsible for freeing the returned struct.
 *
 * Get the VCPU's current CPUID.
 */
 803struct kvm_cpuid2 *vcpu_get_cpuid(struct kvm_vm *vm, uint32_t vcpuid)
 804{
 805        struct vcpu *vcpu = vcpu_find(vm, vcpuid);
 806        struct kvm_cpuid2 *cpuid;
 807        int max_ent;
 808        int rc = -1;
 809
 810        TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
 811
 812        cpuid = allocate_kvm_cpuid2();
 813        max_ent = cpuid->nent;
 814
 815        for (cpuid->nent = 1; cpuid->nent <= max_ent; cpuid->nent++) {
 816                rc = ioctl(vcpu->fd, KVM_GET_CPUID2, cpuid);
 817                if (!rc)
 818                        break;
 819
 820                TEST_ASSERT(rc == -1 && errno == E2BIG,
 821                            "KVM_GET_CPUID2 should either succeed or give E2BIG: %d %d",
 822                            rc, errno);
 823        }
 824
 825        TEST_ASSERT(rc == 0, "KVM_GET_CPUID2 failed, rc: %i errno: %i",
 826                    rc, errno);
 827
 828        return cpuid;
 829}
 830
 831
 832
 833/*
 834 * Locate a cpuid entry.
 835 *
 836 * Input Args:
 837 *   function: The function of the cpuid entry to find.
 838 *   index: The index of the cpuid entry.
 839 *
 840 * Output Args: None
 841 *
 842 * Return: A pointer to the cpuid entry. Never returns NULL.
 843 */
 844struct kvm_cpuid_entry2 *
 845kvm_get_supported_cpuid_index(uint32_t function, uint32_t index)
 846{
 847        struct kvm_cpuid2 *cpuid;
 848        struct kvm_cpuid_entry2 *entry = NULL;
 849        int i;
 850
 851        cpuid = kvm_get_supported_cpuid();
 852        for (i = 0; i < cpuid->nent; i++) {
 853                if (cpuid->entries[i].function == function &&
 854                    cpuid->entries[i].index == index) {
 855                        entry = &cpuid->entries[i];
 856                        break;
 857                }
 858        }
 859
 860        TEST_ASSERT(entry, "Guest CPUID entry not found: (EAX=%x, ECX=%x).",
 861                    function, index);
 862        return entry;
 863}
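
/*
 * Illustrative usage, not part of this library: a test can gate itself on a
 * CPUID feature reported by KVM, e.g. in leaf 7, subleaf 0.
 * SOME_FEATURE_BIT is a placeholder, not a real feature definition:
 *
 *        struct kvm_cpuid_entry2 *e = kvm_get_supported_cpuid_index(7, 0);
 *
 *        if (!(e->ecx & (1u << SOME_FEATURE_BIT))) {
 *                print_skip("feature not supported");
 *                exit(KSFT_SKIP);
 *        }
 */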
 864
 865
 866int __vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid,
 867                     struct kvm_cpuid2 *cpuid)
 868{
 869        struct vcpu *vcpu = vcpu_find(vm, vcpuid);
 870
 871        TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
 872
 873        return ioctl(vcpu->fd, KVM_SET_CPUID2, cpuid);
 874}
 875
 876/*
 877 * VM VCPU CPUID Set
 878 *
 879 * Input Args:
 880 *   vm - Virtual Machine
 881 *   vcpuid - VCPU id
 882 *   cpuid - The CPUID values to set.
 883 *
 884 * Output Args: None
 885 *
 886 * Return: void
 887 *
 888 * Set the VCPU's CPUID.
 889 */
 890void vcpu_set_cpuid(struct kvm_vm *vm,
 891                uint32_t vcpuid, struct kvm_cpuid2 *cpuid)
 892{
 893        int rc;
 894
 895        rc = __vcpu_set_cpuid(vm, vcpuid, cpuid);
 896        TEST_ASSERT(rc == 0, "KVM_SET_CPUID2 failed, rc: %i errno: %i",
 897                    rc, errno);
 898
 899}
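
/*
 * Illustrative usage, not part of this library: a test usually starts from
 * kvm_get_supported_cpuid(), tweaks one entry, and pushes the result back
 * with vcpu_set_cpuid().  SOME_FEATURE_BIT and VCPU_ID are placeholders:
 *
 *        struct kvm_cpuid2 *cpuid = kvm_get_supported_cpuid();
 *        struct kvm_cpuid_entry2 *e = kvm_get_supported_cpuid_index(1, 0);
 *
 *        e->ecx &= ~(1u << SOME_FEATURE_BIT);
 *        vcpu_set_cpuid(vm, VCPU_ID, cpuid);
 */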
 900
 901/*
 902 * VCPU Get MSR
 903 *
 904 * Input Args:
 905 *   vm - Virtual Machine
 906 *   vcpuid - VCPU ID
 907 *   msr_index - Index of MSR
 908 *
 909 * Output Args: None
 910 *
 911 * Return: On success, value of the MSR. On failure a TEST_ASSERT is produced.
 912 *
 913 * Get value of MSR for VCPU.
 914 */
 915uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index)
 916{
 917        struct vcpu *vcpu = vcpu_find(vm, vcpuid);
 918        struct {
 919                struct kvm_msrs header;
 920                struct kvm_msr_entry entry;
 921        } buffer = {};
 922        int r;
 923
 924        TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
 925        buffer.header.nmsrs = 1;
 926        buffer.entry.index = msr_index;
 927        r = ioctl(vcpu->fd, KVM_GET_MSRS, &buffer.header);
 928        TEST_ASSERT(r == 1, "KVM_GET_MSRS IOCTL failed,\n"
 929                "  rc: %i errno: %i", r, errno);
 930
 931        return buffer.entry.data;
 932}
 933
 934/*
 935 * _VCPU Set MSR
 936 *
 937 * Input Args:
 938 *   vm - Virtual Machine
 939 *   vcpuid - VCPU ID
 940 *   msr_index - Index of MSR
 941 *   msr_value - New value of MSR
 942 *
 943 * Output Args: None
 944 *
 945 * Return: The result of KVM_SET_MSRS.
 946 *
 947 * Sets the value of an MSR for the given VCPU.
 948 */
 949int _vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index,
 950                  uint64_t msr_value)
 951{
 952        struct vcpu *vcpu = vcpu_find(vm, vcpuid);
 953        struct {
 954                struct kvm_msrs header;
 955                struct kvm_msr_entry entry;
 956        } buffer = {};
 957        int r;
 958
 959        TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
 960        memset(&buffer, 0, sizeof(buffer));
 961        buffer.header.nmsrs = 1;
 962        buffer.entry.index = msr_index;
 963        buffer.entry.data = msr_value;
 964        r = ioctl(vcpu->fd, KVM_SET_MSRS, &buffer.header);
 965        return r;
 966}
 967
 968/*
 969 * VCPU Set MSR
 970 *
 971 * Input Args:
 972 *   vm - Virtual Machine
 973 *   vcpuid - VCPU ID
 974 *   msr_index - Index of MSR
 975 *   msr_value - New value of MSR
 976 *
 977 * Output Args: None
 978 *
 979 * Return: On success, nothing. On failure a TEST_ASSERT is produced.
 980 *
 981 * Set value of MSR for VCPU.
 982 */
 983void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index,
 984        uint64_t msr_value)
 985{
 986        int r;
 987
 988        r = _vcpu_set_msr(vm, vcpuid, msr_index, msr_value);
 989        TEST_ASSERT(r == 1, "KVM_SET_MSRS IOCTL failed,\n"
 990                "  rc: %i errno: %i", r, errno);
 991}
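
/*
 * Illustrative usage, not part of this library: a harmless read/write round
 * trip of a guest MSR.  VCPU_ID is a hypothetical test-defined id and
 * MSR_IA32_TSC is assumed to come from the tools copy of msr-index.h:
 *
 *        uint64_t tsc = vcpu_get_msr(vm, VCPU_ID, MSR_IA32_TSC);
 *
 *        vcpu_set_msr(vm, VCPU_ID, MSR_IA32_TSC, tsc);
 */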
 992
 993void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...)
 994{
 995        va_list ap;
 996        struct kvm_regs regs;
 997
 998        TEST_ASSERT(num >= 1 && num <= 6, "Unsupported number of args,\n"
 999                    "  num: %u\n",
1000                    num);
1001
1002        va_start(ap, num);
1003        vcpu_regs_get(vm, vcpuid, &regs);
1004
1005        if (num >= 1)
1006                regs.rdi = va_arg(ap, uint64_t);
1007
1008        if (num >= 2)
1009                regs.rsi = va_arg(ap, uint64_t);
1010
1011        if (num >= 3)
1012                regs.rdx = va_arg(ap, uint64_t);
1013
1014        if (num >= 4)
1015                regs.rcx = va_arg(ap, uint64_t);
1016
1017        if (num >= 5)
1018                regs.r8 = va_arg(ap, uint64_t);
1019
1020        if (num >= 6)
1021                regs.r9 = va_arg(ap, uint64_t);
1022
1023        vcpu_regs_set(vm, vcpuid, &regs);
1024        va_end(ap);
1025}
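
/*
 * Illustrative usage, not part of this library: arguments are loaded in the
 * System V AMD64 register order (RDI, RSI, RDX, RCX, R8, R9), so they
 * arrive as ordinary parameters of the guest entry point.  guest_main and
 * VCPU_ID are hypothetical test-defined names:
 *
 *        static void guest_main(uint64_t token, uint64_t gva)
 *        {
 *                ...
 *        }
 *
 *        vcpu_args_set(vm, VCPU_ID, 2, 0xdeadbeefull, some_gva);
 */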
1026
1027void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent)
1028{
1029        struct kvm_regs regs;
1030        struct kvm_sregs sregs;
1031
1032        fprintf(stream, "%*scpuid: %u\n", indent, "", vcpuid);
1033
1034        fprintf(stream, "%*sregs:\n", indent + 2, "");
1035        vcpu_regs_get(vm, vcpuid, &regs);
1036        regs_dump(stream, &regs, indent + 4);
1037
1038        fprintf(stream, "%*ssregs:\n", indent + 2, "");
1039        vcpu_sregs_get(vm, vcpuid, &sregs);
1040        sregs_dump(stream, &sregs, indent + 4);
1041}
1042
1043static int kvm_get_num_msrs_fd(int kvm_fd)
1044{
1045        struct kvm_msr_list nmsrs;
1046        int r;
1047
1048        nmsrs.nmsrs = 0;
1049        r = ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &nmsrs);
1050        TEST_ASSERT(r == -1 && errno == E2BIG, "Unexpected result from KVM_GET_MSR_INDEX_LIST probe, r: %i",
1051                r);
1052
1053        return nmsrs.nmsrs;
1054}
1055
1056static int kvm_get_num_msrs(struct kvm_vm *vm)
1057{
1058        return kvm_get_num_msrs_fd(vm->kvm_fd);
1059}
1060
1061struct kvm_msr_list *kvm_get_msr_index_list(void)
1062{
1063        struct kvm_msr_list *list;
1064        int nmsrs, r, kvm_fd;
1065
1066        kvm_fd = open_kvm_dev_path_or_exit();
1067
1068        nmsrs = kvm_get_num_msrs_fd(kvm_fd);
1069        list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0]));
1070        list->nmsrs = nmsrs;
1071        r = ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list);
1072        close(kvm_fd);
1073
1074        TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i",
1075                r);
1076
1077        return list;
1078}
1079
1080static int vcpu_save_xsave_state(struct kvm_vm *vm, struct vcpu *vcpu,
1081                                 struct kvm_x86_state *state)
1082{
1083        int size;
1084
1085        size = vm_check_cap(vm, KVM_CAP_XSAVE2);
1086        if (!size)
1087                size = sizeof(struct kvm_xsave);
1088
1089        state->xsave = malloc(size);
1090        if (size == sizeof(struct kvm_xsave))
1091                return ioctl(vcpu->fd, KVM_GET_XSAVE, state->xsave);
1092        else
1093                return ioctl(vcpu->fd, KVM_GET_XSAVE2, state->xsave);
1094}
1095
1096struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid)
1097{
1098        struct vcpu *vcpu = vcpu_find(vm, vcpuid);
1099        struct kvm_msr_list *list;
1100        struct kvm_x86_state *state;
1101        int nmsrs, r, i;
1102        static int nested_size = -1;
1103
1104        if (nested_size == -1) {
1105                nested_size = kvm_check_cap(KVM_CAP_NESTED_STATE);
1106                TEST_ASSERT(nested_size <= sizeof(state->nested_),
1107                            "Nested state size too big, %i > %zi",
1108                            nested_size, sizeof(state->nested_));
1109        }
1110
1111        /*
1112         * When KVM exits to userspace with KVM_EXIT_IO, KVM guarantees
1113         * guest state is consistent only after userspace re-enters the
1114         * kernel with KVM_RUN.  Complete IO prior to migrating state
1115         * to a new VM.
1116         */
1117        vcpu_run_complete_io(vm, vcpuid);
1118
1119        nmsrs = kvm_get_num_msrs(vm);
1120        list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0]));
1121        list->nmsrs = nmsrs;
1122        r = ioctl(vm->kvm_fd, KVM_GET_MSR_INDEX_LIST, list);
1123        TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i",
1124                    r);
1125
1126        state = malloc(sizeof(*state) + nmsrs * sizeof(state->msrs.entries[0]));
1127        r = ioctl(vcpu->fd, KVM_GET_VCPU_EVENTS, &state->events);
1128        TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_VCPU_EVENTS, r: %i",
1129                    r);
1130
1131        r = ioctl(vcpu->fd, KVM_GET_MP_STATE, &state->mp_state);
1132        TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MP_STATE, r: %i",
1133                    r);
1134
1135        r = ioctl(vcpu->fd, KVM_GET_REGS, &state->regs);
1136        TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_REGS, r: %i",
1137                    r);
1138
1139        r = vcpu_save_xsave_state(vm, vcpu, state);
1140        TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XSAVE, r: %i",
1141                    r);
1142
1143        if (kvm_check_cap(KVM_CAP_XCRS)) {
1144                r = ioctl(vcpu->fd, KVM_GET_XCRS, &state->xcrs);
1145                TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XCRS, r: %i",
1146                            r);
1147        }
1148
1149        r = ioctl(vcpu->fd, KVM_GET_SREGS, &state->sregs);
1150        TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_SREGS, r: %i",
1151                    r);
1152
1153        if (nested_size) {
1154                state->nested.size = sizeof(state->nested_);
1155                r = ioctl(vcpu->fd, KVM_GET_NESTED_STATE, &state->nested);
1156                TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_NESTED_STATE, r: %i",
1157                            r);
1158                TEST_ASSERT(state->nested.size <= nested_size,
1159                            "Nested state size too big, %i (KVM_CHECK_CAP gave %i)",
1160                            state->nested.size, nested_size);
1161        } else
1162                state->nested.size = 0;
1163
1164        state->msrs.nmsrs = nmsrs;
1165        for (i = 0; i < nmsrs; i++)
1166                state->msrs.entries[i].index = list->indices[i];
1167        r = ioctl(vcpu->fd, KVM_GET_MSRS, &state->msrs);
1168        TEST_ASSERT(r == nmsrs, "Unexpected result from KVM_GET_MSRS, r: %i (failed MSR was 0x%x)",
1169                    r, r == nmsrs ? -1 : list->indices[r]);
1170
1171        r = ioctl(vcpu->fd, KVM_GET_DEBUGREGS, &state->debugregs);
1172        TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_DEBUGREGS, r: %i",
1173                    r);
1174
1175        free(list);
1176        return state;
1177}
1178
1179void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *state)
1180{
1181        struct vcpu *vcpu = vcpu_find(vm, vcpuid);
1182        int r;
1183
1184        r = ioctl(vcpu->fd, KVM_SET_SREGS, &state->sregs);
1185        TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_SREGS, r: %i",
1186                    r);
1187
1188        r = ioctl(vcpu->fd, KVM_SET_MSRS, &state->msrs);
1189        TEST_ASSERT(r == state->msrs.nmsrs,
1190                "Unexpected result from KVM_SET_MSRS, r: %i (failed at %x)",
1191                r, r == state->msrs.nmsrs ? -1 : state->msrs.entries[r].index);
1192
1193        if (kvm_check_cap(KVM_CAP_XCRS)) {
1194                r = ioctl(vcpu->fd, KVM_SET_XCRS, &state->xcrs);
1195                TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XCRS, r: %i",
1196                            r);
1197        }
1198
1199        r = ioctl(vcpu->fd, KVM_SET_XSAVE, state->xsave);
1200        TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XSAVE, r: %i",
1201                    r);
1202
1203        r = ioctl(vcpu->fd, KVM_SET_VCPU_EVENTS, &state->events);
1204        TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_VCPU_EVENTS, r: %i",
1205                    r);
1206
1207        r = ioctl(vcpu->fd, KVM_SET_MP_STATE, &state->mp_state);
1208        TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_MP_STATE, r: %i",
1209                    r);
1210
1211        r = ioctl(vcpu->fd, KVM_SET_DEBUGREGS, &state->debugregs);
1212        TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_DEBUGREGS, r: %i",
1213                    r);
1214
1215        r = ioctl(vcpu->fd, KVM_SET_REGS, &state->regs);
1216        TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_REGS, r: %i",
1217                    r);
1218
1219        if (state->nested.size) {
1220                r = ioctl(vcpu->fd, KVM_SET_NESTED_STATE, &state->nested);
1221                TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_NESTED_STATE, r: %i",
1222                            r);
1223        }
1224}
1225
1226void kvm_x86_state_cleanup(struct kvm_x86_state *state)
1227{
1228        free(state->xsave);
1229        free(state);
1230}
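
/*
 * Illustrative usage, not part of this library: the save/load pair above is
 * how migration-style tests move a vCPU to a re-created VM.  A sketch of
 * the usual shape, with VCPU_ID a hypothetical test-defined id:
 *
 *        struct kvm_x86_state *state = vcpu_save_state(vm, VCPU_ID);
 *
 *        kvm_vm_release(vm);
 *        kvm_vm_restart(vm, O_RDWR);
 *        vm_vcpu_add(vm, VCPU_ID);
 *        vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
 *        vcpu_load_state(vm, VCPU_ID, state);
 *        kvm_x86_state_cleanup(state);
 */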
1231
1232static bool cpu_vendor_string_is(const char *vendor)
1233{
1234        const uint32_t *chunk = (const uint32_t *)vendor;
1235        int eax, ebx, ecx, edx;
1236        const int leaf = 0;
1237
1238        __asm__ __volatile__(
1239                "cpuid"
1240                : /* output */ "=a"(eax), "=b"(ebx),
1241                  "=c"(ecx), "=d"(edx)
1242                : /* input */ "0"(leaf), "2"(0));
1243
1244        return (ebx == chunk[0] && edx == chunk[1] && ecx == chunk[2]);
1245}
1246
1247bool is_intel_cpu(void)
1248{
1249        return cpu_vendor_string_is("GenuineIntel");
1250}
1251
1252/*
1253 * Exclude early K5 samples with a vendor string of "AMDisbetter!"
1254 */
1255bool is_amd_cpu(void)
1256{
1257        return cpu_vendor_string_is("AuthenticAMD");
1258}
1259
1260uint32_t kvm_get_cpuid_max_basic(void)
1261{
1262        return kvm_get_supported_cpuid_entry(0)->eax;
1263}
1264
1265uint32_t kvm_get_cpuid_max_extended(void)
1266{
1267        return kvm_get_supported_cpuid_entry(0x80000000)->eax;
1268}
1269
1270void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits)
1271{
1272        struct kvm_cpuid_entry2 *entry;
1273        bool pae;
1274
1275        /* SDM 4.1.4 */
1276        if (kvm_get_cpuid_max_extended() < 0x80000008) {
1277                pae = kvm_get_supported_cpuid_entry(1)->edx & (1 << 6);
1278                *pa_bits = pae ? 36 : 32;
1279                *va_bits = 32;
1280        } else {
1281                entry = kvm_get_supported_cpuid_entry(0x80000008);
1282                *pa_bits = entry->eax & 0xff;
1283                *va_bits = (entry->eax >> 8) & 0xff;
1284        }
1285}
1286
1287struct idt_entry {
1288        uint16_t offset0;
1289        uint16_t selector;
1290        uint16_t ist : 3;
1291        uint16_t : 5;
1292        uint16_t type : 4;
1293        uint16_t : 1;
1294        uint16_t dpl : 2;
1295        uint16_t p : 1;
1296        uint16_t offset1;
        uint32_t offset2;
        uint32_t reserved;
1298};
1299
1300static void set_idt_entry(struct kvm_vm *vm, int vector, unsigned long addr,
1301                          int dpl, unsigned short selector)
1302{
1303        struct idt_entry *base =
1304                (struct idt_entry *)addr_gva2hva(vm, vm->idt);
1305        struct idt_entry *e = &base[vector];
1306
1307        memset(e, 0, sizeof(*e));
1308        e->offset0 = addr;
1309        e->selector = selector;
1310        e->ist = 0;
1311        e->type = 14;
1312        e->dpl = dpl;
1313        e->p = 1;
1314        e->offset1 = addr >> 16;
1315        e->offset2 = addr >> 32;
1316}
1317
1318void kvm_exit_unexpected_vector(uint32_t value)
1319{
1320        ucall(UCALL_UNHANDLED, 1, value);
1321}
1322
1323void route_exception(struct ex_regs *regs)
1324{
1325        typedef void(*handler)(struct ex_regs *);
1326        handler *handlers = (handler *)exception_handlers;
1327
1328        if (handlers && handlers[regs->vector]) {
1329                handlers[regs->vector](regs);
1330                return;
1331        }
1332
1333        kvm_exit_unexpected_vector(regs->vector);
1334}
1335
1336void vm_init_descriptor_tables(struct kvm_vm *vm)
1337{
1338        extern void *idt_handlers;
1339        int i;
1340
1341        vm->idt = vm_vaddr_alloc_page(vm);
1342        vm->handlers = vm_vaddr_alloc_page(vm);
        /* Handlers have the same address in both address spaces. */
1344        for (i = 0; i < NUM_INTERRUPTS; i++)
1345                set_idt_entry(vm, i, (unsigned long)(&idt_handlers)[i], 0,
1346                        DEFAULT_CODE_SELECTOR);
1347}
1348
1349void vcpu_init_descriptor_tables(struct kvm_vm *vm, uint32_t vcpuid)
1350{
1351        struct kvm_sregs sregs;
1352
1353        vcpu_sregs_get(vm, vcpuid, &sregs);
1354        sregs.idt.base = vm->idt;
1355        sregs.idt.limit = NUM_INTERRUPTS * sizeof(struct idt_entry) - 1;
1356        sregs.gdt.base = vm->gdt;
1357        sregs.gdt.limit = getpagesize() - 1;
1358        kvm_seg_set_kernel_data_64bit(NULL, DEFAULT_DATA_SELECTOR, &sregs.gs);
1359        vcpu_sregs_set(vm, vcpuid, &sregs);
1360        *(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers;
1361}
1362
1363void vm_install_exception_handler(struct kvm_vm *vm, int vector,
1364                               void (*handler)(struct ex_regs *))
1365{
1366        vm_vaddr_t *handlers = (vm_vaddr_t *)addr_gva2hva(vm, vm->handlers);
1367
1368        handlers[vector] = (vm_vaddr_t)handler;
1369}
1370
1371void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid)
1372{
1373        struct ucall uc;
1374
1375        if (get_ucall(vm, vcpuid, &uc) == UCALL_UNHANDLED) {
1376                uint64_t vector = uc.args[0];
1377
1378                TEST_FAIL("Unexpected vectored event in guest (vector:0x%lx)",
1379                          vector);
1380        }
1381}
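
/*
 * Illustrative usage, not part of this library: the descriptor table and
 * handler plumbing above is typically wired up before the guest runs.
 * VCPU_ID and guest_gp_handler are hypothetical test-defined names, and
 * GP_VECTOR is assumed to be provided by the selftests processor.h:
 *
 *        vm_init_descriptor_tables(vm);
 *        vcpu_init_descriptor_tables(vm, VCPU_ID);
 *        vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler);
 *
 *        vcpu_run(vm, VCPU_ID);
 *        assert_on_unhandled_exception(vm, VCPU_ID);
 */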
1382
1383struct kvm_cpuid_entry2 *get_cpuid(struct kvm_cpuid2 *cpuid, uint32_t function,
1384                                   uint32_t index)
1385{
1386        int i;
1387
1388        for (i = 0; i < cpuid->nent; i++) {
1389                struct kvm_cpuid_entry2 *cur = &cpuid->entries[i];
1390
1391                if (cur->function == function && cur->index == index)
1392                        return cur;
1393        }
1394
1395        TEST_FAIL("CPUID function 0x%x index 0x%x not found ", function, index);
1396
1397        return NULL;
1398}
1399
1400bool set_cpuid(struct kvm_cpuid2 *cpuid,
1401               struct kvm_cpuid_entry2 *ent)
1402{
1403        int i;
1404
1405        for (i = 0; i < cpuid->nent; i++) {
1406                struct kvm_cpuid_entry2 *cur = &cpuid->entries[i];
1407
1408                if (cur->function != ent->function || cur->index != ent->index)
1409                        continue;
1410
1411                memcpy(cur, ent, sizeof(struct kvm_cpuid_entry2));
1412                return true;
1413        }
1414
1415        return false;
1416}
1417
1418uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2,
1419                       uint64_t a3)
1420{
1421        uint64_t r;
1422
1423        asm volatile("vmcall"
1424                     : "=a"(r)
1425                     : "b"(a0), "c"(a1), "d"(a2), "S"(a3));
1426        return r;
1427}
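
/*
 * Illustrative usage, not part of this library: kvm_hypercall() is meant to
 * be called from guest code.  HC_NR and the arguments below are
 * placeholders, not a real KVM hypercall definition:
 *
 *        uint64_t ret = kvm_hypercall(HC_NR, a0, a1, a2, a3);
 */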
1428
1429struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void)
1430{
1431        static struct kvm_cpuid2 *cpuid;
1432        int ret;
1433        int kvm_fd;
1434
1435        if (cpuid)
1436                return cpuid;
1437
1438        cpuid = allocate_kvm_cpuid2();
1439        kvm_fd = open_kvm_dev_path_or_exit();
1440
1441        ret = ioctl(kvm_fd, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
1442        TEST_ASSERT(ret == 0, "KVM_GET_SUPPORTED_HV_CPUID failed %d %d\n",
1443                    ret, errno);
1444
1445        close(kvm_fd);
1446        return cpuid;
1447}
1448
1449void vcpu_set_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid)
1450{
1451        static struct kvm_cpuid2 *cpuid_full;
1452        struct kvm_cpuid2 *cpuid_sys, *cpuid_hv;
1453        int i, nent = 0;
1454
1455        if (!cpuid_full) {
1456                cpuid_sys = kvm_get_supported_cpuid();
1457                cpuid_hv = kvm_get_supported_hv_cpuid();
1458
1459                cpuid_full = malloc(sizeof(*cpuid_full) +
1460                                    (cpuid_sys->nent + cpuid_hv->nent) *
1461                                    sizeof(struct kvm_cpuid_entry2));
1462                if (!cpuid_full) {
1463                        perror("malloc");
1464                        abort();
1465                }
1466
1467                /* Need to skip KVM CPUID leaves 0x400000xx */
1468                for (i = 0; i < cpuid_sys->nent; i++) {
1469                        if (cpuid_sys->entries[i].function >= 0x40000000 &&
1470                            cpuid_sys->entries[i].function < 0x40000100)
1471                                continue;
1472                        cpuid_full->entries[nent] = cpuid_sys->entries[i];
1473                        nent++;
1474                }
1475
1476                memcpy(&cpuid_full->entries[nent], cpuid_hv->entries,
1477                       cpuid_hv->nent * sizeof(struct kvm_cpuid_entry2));
1478                cpuid_full->nent = nent + cpuid_hv->nent;
1479        }
1480
1481        vcpu_set_cpuid(vm, vcpuid, cpuid_full);
1482}
1483
1484struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid)
1485{
1486        static struct kvm_cpuid2 *cpuid;
1487
1488        cpuid = allocate_kvm_cpuid2();
1489
1490        vcpu_ioctl(vm, vcpuid, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
1491
1492        return cpuid;
1493}
1494
1495unsigned long vm_compute_max_gfn(struct kvm_vm *vm)
1496{
1497        const unsigned long num_ht_pages = 12 << (30 - vm->page_shift); /* 12 GiB */
1498        unsigned long ht_gfn, max_gfn, max_pfn;
1499        uint32_t eax, ebx, ecx, edx, max_ext_leaf;
1500
1501        max_gfn = (1ULL << (vm->pa_bits - vm->page_shift)) - 1;
1502
1503        /* Avoid reserved HyperTransport region on AMD processors.  */
1504        if (!is_amd_cpu())
1505                return max_gfn;
1506
1507        /* On parts with <40 physical address bits, the area is fully hidden */
1508        if (vm->pa_bits < 40)
1509                return max_gfn;
1510
1511        /* Before family 17h, the HyperTransport area is just below 1T.  */
1512        ht_gfn = (1 << 28) - num_ht_pages;
1513        eax = 1;
1514        ecx = 0;
1515        cpuid(&eax, &ebx, &ecx, &edx);
1516        if (x86_family(eax) < 0x17)
1517                goto done;
1518
1519        /*
1520         * Otherwise it's at the top of the physical address space, possibly
1521         * reduced due to SME by bits 11:6 of CPUID[0x8000001f].EBX.  Use
1522         * the old conservative value if MAXPHYADDR is not enumerated.
1523         */
1524        eax = 0x80000000;
1525        cpuid(&eax, &ebx, &ecx, &edx);
1526        max_ext_leaf = eax;
1527        if (max_ext_leaf < 0x80000008)
1528                goto done;
1529
1530        eax = 0x80000008;
1531        cpuid(&eax, &ebx, &ecx, &edx);
1532        max_pfn = (1ULL << ((eax & 0xff) - vm->page_shift)) - 1;
1533        if (max_ext_leaf >= 0x8000001f) {
1534                eax = 0x8000001f;
1535                cpuid(&eax, &ebx, &ecx, &edx);
1536                max_pfn >>= (ebx >> 6) & 0x3f;
1537        }
1538
1539        ht_gfn = max_pfn - num_ht_pages;
1540done:
1541        return min(max_gfn, ht_gfn - 1);
1542}
1543