/*P:700
 * The pagetable code, on the other hand, still shows the scars of
 * previous encounters.  It's functional, and as neat as it can be in the
 * circumstances, but be wary, for these things are subtle and break easily.
 * The Guest provides a virtual to physical mapping, but we can neither trust
 * it nor use it: we verify and convert it here then point the CPU to the
 * converted Guest pages when running the Guest.
:*/

/* Copyright (C) Rusty Russell IBM Corporation 2006.
 * GPL v2 and any later version */
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/random.h>
#include <linux/percpu.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
#include "lg.h"

/*M:008
 * We hold a reference to pages, which prevents them from being swapped.
 * It'd be nice to have a callback in the "struct mm_struct" when Linux wants
 * to swap out.  If we had this, and a shrinker callback to trim PTE pages, we
 * could probably consider launching Guests as non-root.
:*/

/*H:300
 * The Page Table Code
 *
 * We use two-level page tables for the Guest, or three-level with PAE.  If
 * you're not entirely comfortable with virtual addresses, physical addresses
 * and page tables then I recommend you review arch/x86/lguest/boot.c's "Page
 * Table Handling" (with diagrams!).
 *
 * The Guest keeps page tables, but we maintain the actual ones here: these are
 * called "shadow" page tables.  Which is a very Guest-centric name: these are
 * the real page tables the CPU uses, although we keep them up to date to
 * reflect the Guest's.  (See what I mean about weird naming?  Since when do
 * shadows reflect anything?)
 *
 * Anyway, this is the most complicated part of the Host code.  There are seven
 * parts to this:
 *  (i) Looking up a page table entry when the Guest faults,
 *  (ii) Making sure the Guest stack is mapped,
 *  (iii) Setting up a page table entry when the Guest tells us one has changed,
 *  (iv) Switching page tables,
 *  (v) Flushing (throwing away) page tables,
 *  (vi) Mapping the Switcher when the Guest is about to run,
 *  (vii) Setting up the page tables initially.
:*/

/*
 * The Switcher uses the complete top PTE page.  That's 1024 PTE entries (4MB)
 * or 512 PTE entries with PAE (2MB).
 */
#define SWITCHER_PGD_INDEX	(PTRS_PER_PGD - 1)

/*
 * For PAE we need the PMD index as well.  We use the last 2MB, so we
 * will need the last pmd entry of the last pmd page.
 */
#ifdef CONFIG_X86_PAE
#define SWITCHER_PMD_INDEX	(PTRS_PER_PMD - 1)
#define RESERVE_MEM		2U
#define CHECK_GPGD_MASK		_PAGE_PRESENT
#else
#define RESERVE_MEM		4U
#define CHECK_GPGD_MASK		_PAGE_TABLE
#endif
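
/*
 * To put rough numbers on that reservation (a sketch assuming the usual
 * 32-bit layout; the code doesn't depend on this paragraph): without PAE a
 * single PGD entry covers 1024 PTEs * 4096 bytes = 4MB, so the top slot
 * spans 0xFFC00000-0xFFFFFFFF and the Guest is told to keep RESERVE_MEM
 * (4) megabytes clear.  With PAE a PMD entry covers 512 * 4096 = 2MB, so
 * only the top 2MB (from 0xFFE00000 up) is reserved.
 */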

/*
 * We actually need a separate PTE page for each CPU.  Remember that after the
 * Switcher code itself come two pages for each CPU, and we don't want this
 * CPU's guest to see the pages of any other CPU.
 */
static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
#define switcher_pte_page(cpu)	per_cpu(switcher_pte_pages, cpu)

/*H:320
 * The page table code is curly enough to need helper functions to keep it
 * clear and clean.  The kernel itself provides many of them; that's one
 * advantage of insisting that the Guest and Host use the same CONFIG_PAE
 * setting.
 *
 * There are two functions which return pointers to the shadow (aka "real")
 * page tables.
 *
 * spgd_addr() takes the virtual address and returns a pointer to the top-level
 * page directory entry (PGD) for that address.  Since we keep track of several
 * page tables, the "i" argument tells us which one we're interested in (it's
 * usually the current one).
 */
static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr)
{
	unsigned int index = pgd_index(vaddr);

#ifndef CONFIG_X86_PAE
	/* We kill any Guest trying to touch the Switcher addresses. */
	if (index >= SWITCHER_PGD_INDEX) {
		kill_guest(cpu, "attempt to access switcher pages");
		index = 0;
	}
#endif
	/* Return a pointer to the index'th pgd entry for the i'th page table. */
	return &cpu->lg->pgdirs[i].pgdir[index];
}

#ifdef CONFIG_X86_PAE
/*
 * This routine then takes the PGD entry given above, which contains the
 * address of the PMD page.  It then returns a pointer to the PMD entry for the
 * given address.
 */
static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
{
	unsigned int index = pmd_index(vaddr);
	pmd_t *page;

	/* We kill any Guest trying to touch the Switcher addresses. */
	if (pgd_index(vaddr) == SWITCHER_PGD_INDEX &&
					index >= SWITCHER_PMD_INDEX) {
		kill_guest(cpu, "attempt to access switcher pages");
		index = 0;
	}

	/* You should never call this if the PGD entry wasn't valid */
	BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
	page = __va(pgd_pfn(spgd) << PAGE_SHIFT);

	return &page[index];
}
#endif

/*
 * This routine then takes the page directory entry returned above, which
 * contains the address of the page table entry (PTE) page.  It then returns a
 * pointer to the PTE entry for the given address.
 */
static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
{
#ifdef CONFIG_X86_PAE
	pmd_t *pmd = spmd_addr(cpu, spgd, vaddr);
	pte_t *page = __va(pmd_pfn(*pmd) << PAGE_SHIFT);

	/* You should never call this if the PMD entry wasn't valid */
	BUG_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT));
#else
	pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
	/* You should never call this if the PGD entry wasn't valid */
	BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
#endif

	return &page[pte_index(vaddr)];
}
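
/*
 * A worked example of that lookup (non-PAE, purely illustrative numbers):
 * for vaddr 0xC0123456,
 *
 *	pgd_index(vaddr) = 0xC0123456 >> 22          = 768
 *	pte_index(vaddr) = (0xC0123456 >> 12) & 1023 = 0x123
 *
 * so spgd_addr() hands back entry 768 of the shadow top level, spte_addr()
 * hands back entry 0x123 of the PTE page that entry points to, and the low
 * 12 bits (0x456) are simply the offset within the final 4096-byte page.
 */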

/*
 * These functions are just like the above, except they access the Guest
 * page tables.  Hence they return a Guest address.
 */
static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr)
{
	unsigned int index = vaddr >> (PGDIR_SHIFT);
	return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t);
}

#ifdef CONFIG_X86_PAE
/* Follow the PGD to the PMD. */
static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr)
{
	unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
	BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
	return gpage + pmd_index(vaddr) * sizeof(pmd_t);
}

/* Follow the PMD to the PTE. */
static unsigned long gpte_addr(struct lg_cpu *cpu,
			       pmd_t gpmd, unsigned long vaddr)
{
	unsigned long gpage = pmd_pfn(gpmd) << PAGE_SHIFT;

	BUG_ON(!(pmd_flags(gpmd) & _PAGE_PRESENT));
	return gpage + pte_index(vaddr) * sizeof(pte_t);
}
#else
/* Follow the PGD to the PTE (no mid-level for !PAE). */
static unsigned long gpte_addr(struct lg_cpu *cpu,
			       pgd_t gpgd, unsigned long vaddr)
{
	unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;

	BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
	return gpage + pte_index(vaddr) * sizeof(pte_t);
}
#endif
/*:*/

/*M:007
 * get_pfn is slow: we could probably try to grab batches of pages here as
 * an optimization (ie. pre-faulting).
:*/

/*H:350
 * This routine takes a page number given by the Guest and converts it to
 * an actual, physical page number.  It can fail for several reasons: the
 * virtual address might not be mapped by the Launcher, the write flag is set
 * and the page is read-only, or the write flag was set and the page was
 * shared so had to be copied, but we ran out of memory.
 *
 * This holds a reference to the page, so release_pte() is careful to put that
 * back.
 */
static unsigned long get_pfn(unsigned long virtpfn, int write)
{
	struct page *page;

	/* gup me one page at this address please! */
	if (get_user_pages_fast(virtpfn << PAGE_SHIFT, 1, write, &page) == 1)
		return page_to_pfn(page);

	/* This value indicates failure. */
	return -1UL;
}

/*H:340
 * Converting a Guest page table entry to a shadow (ie. real) page table
 * entry can be a little tricky.  The flags are (almost) the same, but the
 * Guest PTE contains a virtual page number: the CPU needs the real page
 * number.
 */
static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write)
{
	unsigned long pfn, base, flags;

	/*
	 * The Guest sets the global flag, because it thinks that it is using
	 * PGE.  We only told it to use PGE so it would tell us whether it was
	 * flushing a kernel mapping or a userspace mapping.  We don't actually
	 * use the global bit, so throw it away.
	 */
	flags = (pte_flags(gpte) & ~_PAGE_GLOBAL);

	/* The Guest's pages are offset inside the Launcher. */
	base = (unsigned long)cpu->lg->mem_base / PAGE_SIZE;

	/*
	 * We need a temporary "unsigned long" variable to hold the answer from
	 * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't
	 * fit in spte.pfn.  get_pfn() finds the real physical number of the
	 * page, given the virtual number.
	 */
	pfn = get_pfn(base + pte_pfn(gpte), write);
	if (pfn == -1UL) {
		kill_guest(cpu, "failed to get page %lu", pte_pfn(gpte));
		/*
		 * When we destroy the Guest, we'll go through the shadow page
		 * tables and release_pte() them.  Make sure we don't think
		 * this one is valid!
		 */
		flags = 0;
	}
	/* Now we assemble our shadow PTE from the page number and flags. */
	return pfn_pte(pfn, __pgprot(flags));
}
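
/*
 * A worked example of that conversion (the mem_base value here is made up
 * for illustration): suppose the Launcher mapped the Guest's memory at
 * mem_base 0x10000000 and the Guest PTE names page 5 with
 * _PAGE_PRESENT|_PAGE_RW.  Then base = 0x10000000 / 4096 = 0x10000, and we
 * call get_pfn(0x10000 + 5, 1): ie. we ask which Host page backs the
 * Launcher's virtual page 0x10005.  That real pfn, combined with the
 * Guest's flags (minus _PAGE_GLOBAL), becomes the shadow PTE.
 */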

/*H:460 And to complete the chain, release_pte() looks like this: */
static void release_pte(pte_t pte)
{
	/*
	 * Remember that get_user_pages_fast() took a reference to the page, in
	 * get_pfn()?  We have to put it back now.
	 */
	if (pte_flags(pte) & _PAGE_PRESENT)
		put_page(pte_page(pte));
}
/*:*/

static void check_gpte(struct lg_cpu *cpu, pte_t gpte)
{
	if ((pte_flags(gpte) & _PAGE_PSE) ||
	    pte_pfn(gpte) >= cpu->lg->pfn_limit)
		kill_guest(cpu, "bad page table entry");
}

static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
{
	if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) ||
	    (pgd_pfn(gpgd) >= cpu->lg->pfn_limit))
		kill_guest(cpu, "bad page directory entry");
}

#ifdef CONFIG_X86_PAE
static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)
{
	if ((pmd_flags(gpmd) & ~_PAGE_TABLE) ||
	    (pmd_pfn(gpmd) >= cpu->lg->pfn_limit))
		kill_guest(cpu, "bad page middle directory entry");
}
#endif
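
/*
 * To make the limit concrete (illustrative numbers, not taken from this
 * file): a Guest whose memory ends at the 256MB mark has a pfn_limit of
 * 0x10000 (65536 pages), so a Guest PTE naming pfn 0x12345 would point
 * past the Guest's own memory into the Launcher, and check_gpte() kills
 * the Guest rather than map it.
 */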

/*H:330
 * (i) Looking up a page table entry when the Guest faults.
 *
 * We saw this call in run_guest(): when we see a page fault in the Guest, we
 * come here.  That's because we only set up the shadow page tables lazily as
 * they're needed, so we get page faults all the time and quietly fix them up
 * and return to the Guest without it knowing.
 *
 * If we fixed up the fault (ie. we mapped the address), this routine returns
 * true.  Otherwise, it was a real fault and we need to tell the Guest.
 */
bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
{
	pgd_t gpgd;
	pgd_t *spgd;
	unsigned long gpte_ptr;
	pte_t gpte;
	pte_t *spte;

	/* Mid level for PAE. */
#ifdef CONFIG_X86_PAE
	pmd_t *spmd;
	pmd_t gpmd;
#endif

	/* First step: get the top-level Guest page table entry. */
	if (unlikely(cpu->linear_pages)) {
		/* Faking up a linear mapping. */
		gpgd = __pgd(CHECK_GPGD_MASK);
	} else {
		gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
		/* Toplevel not present?  We can't map it in. */
		if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
			return false;
	}

	/* Now look at the matching shadow entry. */
	spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
	if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {
		/* No shadow entry: allocate a new shadow PTE page. */
		unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
		/*
		 * This is not really the Guest's fault, but killing it is
		 * simple for this corner case.
		 */
		if (!ptepage) {
			kill_guest(cpu, "out of memory allocating pte page");
			return false;
		}
		/* We check that the Guest pgd is OK. */
		check_gpgd(cpu, gpgd);
		/*
		 * And we copy the flags to the shadow PGD entry.  The page
		 * number in the shadow PGD is the page we just allocated.
		 */
		set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd)));
	}

#ifdef CONFIG_X86_PAE
	if (unlikely(cpu->linear_pages)) {
		/* Faking up a linear mapping. */
		gpmd = __pmd(_PAGE_TABLE);
	} else {
		gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
		/* Middle level not present?  We can't map it in. */
		if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
			return false;
	}

	/* Now look at the matching shadow entry. */
	spmd = spmd_addr(cpu, *spgd, vaddr);

	if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) {
		/* No shadow entry: allocate a new shadow PTE page. */
		unsigned long ptepage = get_zeroed_page(GFP_KERNEL);

		/*
		 * This is not really the Guest's fault, but killing it is
		 * simple for this corner case.
		 */
		if (!ptepage) {
			kill_guest(cpu, "out of memory allocating pte page");
			return false;
		}

		/* We check that the Guest pmd is OK. */
		check_gpmd(cpu, gpmd);

		/*
		 * And we copy the flags to the shadow PMD entry.  The page
		 * number in the shadow PMD is the page we just allocated.
		 */
		set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd)));
	}

	/*
	 * OK, now we look at the lower level in the Guest page table: keep its
	 * address, because we might update it later.
	 */
	gpte_ptr = gpte_addr(cpu, gpmd, vaddr);
#else
	/*
	 * OK, now we look at the lower level in the Guest page table: keep its
	 * address, because we might update it later.
	 */
	gpte_ptr = gpte_addr(cpu, gpgd, vaddr);
#endif

	if (unlikely(cpu->linear_pages)) {
		/* Linear?  Make up a PTE which points to same page. */
		gpte = __pte((vaddr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT);
	} else {
		/* Read the actual PTE value. */
		gpte = lgread(cpu, gpte_ptr, pte_t);
	}

	/* If this page isn't in the Guest page tables, we can't page it in. */
	if (!(pte_flags(gpte) & _PAGE_PRESENT))
		return false;

	/*
	 * Check they're not trying to write to a page the Guest wants
	 * read-only (bit 2 of errcode == write).
	 */
	if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW))
		return false;

	/* User access to a kernel-only page? (bit 3 == user access) */
	if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER))
		return false;

	/*
	 * Check that the Guest PTE flags are OK, and the page number is below
	 * the pfn_limit (ie. not mapping the Launcher binary).
	 */
	check_gpte(cpu, gpte);

	/* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
	gpte = pte_mkyoung(gpte);
	if (errcode & 2)
		gpte = pte_mkdirty(gpte);

	/* Get the pointer to the shadow PTE entry we're going to set. */
	spte = spte_addr(cpu, *spgd, vaddr);

	/*
	 * If there was a valid shadow PTE entry here before, we release it.
	 * This can happen with a write to a previously read-only entry.
	 */
	release_pte(*spte);

	/*
	 * If this is a write, we insist that the Guest page is writable (the
	 * final arg to gpte_to_spte()).
	 */
	if (pte_dirty(gpte))
		*spte = gpte_to_spte(cpu, gpte, 1);
	else
		/*
		 * If this is a read, don't set the "writable" bit in the page
		 * table entry, even if the Guest says it's writable.  That way
		 * we will come back here when a write does actually occur, so
		 * we can update the Guest's _PAGE_DIRTY flag.
		 */
		set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0));

	/*
	 * Finally, we write the Guest PTE entry back: we've set the
	 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags.
	 */
	if (likely(!cpu->linear_pages))
		lgwrite(cpu, gpte_ptr, pte_t, gpte);

	/*
	 * The fault is fixed, the page table is populated, the mapping
	 * manipulated, the result returned and the code complete.  A small
	 * delay and a trace of alliteration are the only indications the Guest
	 * has that a page fault occurred at all.
	 */
	return true;
}
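
/*
 * The errcode tests above use the x86 page-fault error code layout, so as a
 * concrete (illustrative) reading: a Guest userspace write to a page we
 * haven't shadowed yet arrives with errcode 6 (write | user); demand_page()
 * then insists the Guest PTE has both _PAGE_RW and _PAGE_USER before it
 * builds the shadow entry, otherwise the fault is reflected to the Guest.
 */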

/*H:360
 * (ii) Making sure the Guest stack is mapped.
 *
 * Remember that direct traps into the Guest need a mapped Guest kernel stack.
 * pin_stack_pages() calls us here: we could simply call demand_page(), but as
 * we've seen that logic is quite long, and usually the stack pages are already
 * mapped, so it's overkill.
 *
 * This is a quick version which answers the question: is this virtual address
 * mapped by the shadow page tables, and is it writable?
 */
static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr)
{
	pgd_t *spgd;
	unsigned long flags;

#ifdef CONFIG_X86_PAE
	pmd_t *spmd;
#endif
	/* Look at the current top level entry: is it present? */
	spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
	if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
		return false;

#ifdef CONFIG_X86_PAE
	spmd = spmd_addr(cpu, *spgd, vaddr);
	if (!(pmd_flags(*spmd) & _PAGE_PRESENT))
		return false;
#endif

	/*
	 * Check the flags on the pte entry itself: it must be present and
	 * writable.
	 */
	flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr)));

	return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
}

/*
 * So, when pin_stack_pages() asks us to pin a page, we check if it's already
 * in the page tables, and if not, we call demand_page() with error code 2
 * (meaning "write").
 */
void pin_page(struct lg_cpu *cpu, unsigned long vaddr)
{
	if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2))
		kill_guest(cpu, "bad stack page %#lx", vaddr);
}
/*:*/

#ifdef CONFIG_X86_PAE
static void release_pmd(pmd_t *spmd)
{
	/* If the entry's not present, there's nothing to release. */
	if (pmd_flags(*spmd) & _PAGE_PRESENT) {
		unsigned int i;
		pte_t *ptepage = __va(pmd_pfn(*spmd) << PAGE_SHIFT);
		/* For each entry in the page, we might need to release it. */
		for (i = 0; i < PTRS_PER_PTE; i++)
			release_pte(ptepage[i]);
		/* Now we can free the page of PTEs */
		free_page((long)ptepage);
		/* And zero out the PMD entry so we never release it twice. */
		set_pmd(spmd, __pmd(0));
	}
}

static void release_pgd(pgd_t *spgd)
{
	/* If the entry's not present, there's nothing to release. */
	if (pgd_flags(*spgd) & _PAGE_PRESENT) {
		unsigned int i;
		pmd_t *pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);

		for (i = 0; i < PTRS_PER_PMD; i++)
			release_pmd(&pmdpage[i]);

		/* Now we can free the page of PMDs */
		free_page((long)pmdpage);
		/* And zero out the PGD entry so we never release it twice. */
		set_pgd(spgd, __pgd(0));
	}
}

#else /* !CONFIG_X86_PAE */
/*H:450
 * If we chase down the release_pgd() code, the non-PAE version looks like
 * this.  The PAE version is almost identical, but instead of calling
 * release_pte it calls release_pmd(), which looks much like this.
 */
static void release_pgd(pgd_t *spgd)
{
	/* If the entry's not present, there's nothing to release. */
	if (pgd_flags(*spgd) & _PAGE_PRESENT) {
		unsigned int i;
		/*
		 * Converting the pfn to find the actual PTE page is easy: turn
		 * the page number into a physical address, then convert to a
		 * virtual address (easy for kernel pages like this one).
		 */
		pte_t *ptepage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
		/* For each entry in the page, we might need to release it. */
		for (i = 0; i < PTRS_PER_PTE; i++)
			release_pte(ptepage[i]);
		/* Now we can free the page of PTEs */
		free_page((long)ptepage);
		/* And zero out the PGD entry so we never release it twice. */
		*spgd = __pgd(0);
	}
}
#endif

/*H:445
 * We saw flush_user_mappings() twice: once from guest_pagetable_flush_user()
 * (the Guest's hypercall to throw away its page tables) and once in
 * new_pgdir() when we re-used a top-level pgdir page.  It simply releases
 * every PTE page from 0 up to the Guest's kernel address.
 */
static void flush_user_mappings(struct lguest *lg, int idx)
{
	unsigned int i;
	/* Release every pgd entry up to the kernel's address. */
	for (i = 0; i < pgd_index(lg->kernel_address); i++)
		release_pgd(lg->pgdirs[idx].pgdir + i);
}

/*H:440
 * (v) Flushing (throwing away) page tables,
 *
 * The Guest has a hypercall to throw away the page tables: it's used when a
 * large number of mappings have been changed.
 */
void guest_pagetable_flush_user(struct lg_cpu *cpu)
{
	/* Drop the userspace part of the current page table. */
	flush_user_mappings(cpu->lg, cpu->cpu_pgd);
}
/*:*/

/* We walk down the guest page tables to get a guest-physical address */
unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
{
	pgd_t gpgd;
	pte_t gpte;
#ifdef CONFIG_X86_PAE
	pmd_t gpmd;
#endif

	/* Still not set up?  Just map 1:1. */
	if (unlikely(cpu->linear_pages))
		return vaddr;

	/* First step: get the top-level Guest page table entry. */
	gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
	/* Toplevel not present?  We can't map it in. */
	if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) {
		kill_guest(cpu, "Bad address %#lx", vaddr);
		return -1UL;
	}

#ifdef CONFIG_X86_PAE
	gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
	if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
		kill_guest(cpu, "Bad address %#lx", vaddr);
	gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t);
#else
	gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t);
#endif
	if (!(pte_flags(gpte) & _PAGE_PRESENT))
		kill_guest(cpu, "Bad address %#lx", vaddr);

	return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK);
}

/*
 * We keep several page tables.  This is a simple routine to find the page
 * table (if any) corresponding to this top-level address the Guest has given
 * us.
 */
static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable)
{
	unsigned int i;
	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
		if (lg->pgdirs[i].pgdir && lg->pgdirs[i].gpgdir == pgtable)
			break;
	return i;
}

/*H:435
 * And this is us, creating the new page directory.  If we really do
 * allocate a new one (and so the kernel parts are not there), we set
 * blank_pgdir.
 */
static unsigned int new_pgdir(struct lg_cpu *cpu,
			      unsigned long gpgdir,
			      int *blank_pgdir)
{
	unsigned int next;
#ifdef CONFIG_X86_PAE
	pmd_t *pmd_table;
#endif

	/*
	 * We pick one entry at random to throw out.  Choosing the Least
	 * Recently Used might be better, but this is easy.
	 */
	next = random32() % ARRAY_SIZE(cpu->lg->pgdirs);
	/* If it's never been allocated at all before, try now. */
	if (!cpu->lg->pgdirs[next].pgdir) {
		cpu->lg->pgdirs[next].pgdir =
					(pgd_t *)get_zeroed_page(GFP_KERNEL);
		/* If the allocation fails, just keep using the one we have */
		if (!cpu->lg->pgdirs[next].pgdir)
			next = cpu->cpu_pgd;
		else {
#ifdef CONFIG_X86_PAE
			/*
			 * In PAE mode, allocate a pmd page and populate the
			 * last pgd entry.
			 */
			pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL);
			if (!pmd_table) {
				/*
				 * No pmd page: give the pgd page back, and
				 * clear the pointer so nothing ever touches
				 * the freed page again.
				 */
				free_page((long)cpu->lg->pgdirs[next].pgdir);
				cpu->lg->pgdirs[next].pgdir = NULL;
				next = cpu->cpu_pgd;
			} else {
				set_pgd(cpu->lg->pgdirs[next].pgdir +
					SWITCHER_PGD_INDEX,
					__pgd(__pa(pmd_table) | _PAGE_PRESENT));
				/*
				 * This is a blank page, so there are no kernel
				 * mappings: caller must map the stack!
				 */
				*blank_pgdir = 1;
			}
#else
			*blank_pgdir = 1;
#endif
		}
	}
	/* Record which Guest toplevel this shadows. */
	cpu->lg->pgdirs[next].gpgdir = gpgdir;
	/* Release all the non-kernel mappings. */
	flush_user_mappings(cpu->lg, next);

	return next;
}

/*H:470
 * Finally, a routine which throws away everything: all PGD entries in all
 * the shadow page tables, including the Guest's kernel mappings.  This is used
 * when we destroy the Guest.
 */
static void release_all_pagetables(struct lguest *lg)
{
	unsigned int i, j;

	/* Every shadow pagetable this Guest has */
	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
		if (lg->pgdirs[i].pgdir) {
#ifdef CONFIG_X86_PAE
			pgd_t *spgd;
			pmd_t *pmdpage;
			unsigned int k;

			/* Get the last pmd page. */
			spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX;
			pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);

			/*
			 * And release the pmd entries of that pmd page,
			 * except for the switcher pmd.
			 */
			for (k = 0; k < SWITCHER_PMD_INDEX; k++)
				release_pmd(&pmdpage[k]);
#endif
			/* Every PGD entry except the Switcher at the top */
			for (j = 0; j < SWITCHER_PGD_INDEX; j++)
				release_pgd(lg->pgdirs[i].pgdir + j);
		}
}

/*
 * We also throw away everything when a Guest tells us it's changed a kernel
 * mapping.  Since kernel mappings are in every page table, it's easiest to
 * throw them all away.  This traps the Guest in amber for a while as
 * everything faults back in, but it's rare.
 */
void guest_pagetable_clear_all(struct lg_cpu *cpu)
{
	release_all_pagetables(cpu->lg);
	/* We need the Guest kernel stack mapped again. */
	pin_stack_pages(cpu);
}

/*H:430
 * (iv) Switching page tables
 *
 * Now we've seen all the page table setting and manipulation, let's see
 * what happens when the Guest changes page tables (ie. changes the top-level
 * pgdir).  This occurs on almost every context switch.
 */
void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
{
	int newpgdir, repin = 0;

	/*
	 * The very first time they call this, we're actually running without
	 * any page tables; we've been making it up.  Throw them away now.
	 */
	if (unlikely(cpu->linear_pages)) {
		release_all_pagetables(cpu->lg);
		cpu->linear_pages = false;
		/* Force allocation of a new pgdir. */
		newpgdir = ARRAY_SIZE(cpu->lg->pgdirs);
	} else {
		/* Look to see if we have this one already. */
		newpgdir = find_pgdir(cpu->lg, pgtable);
	}

	/*
	 * If not, we allocate or mug an existing one: if it's a fresh one,
	 * repin gets set to 1.
	 */
	if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs))
		newpgdir = new_pgdir(cpu, pgtable, &repin);
	/* Change the current pgd index to the new one. */
	cpu->cpu_pgd = newpgdir;
	/* If it was completely blank, we map in the Guest kernel stack */
	if (repin)
		pin_stack_pages(cpu);
}
/*:*/

/*M:009
 * Since we throw away all mappings when a kernel mapping changes, our
 * performance sucks for guests using highmem.  In fact, a guest with
 * PAGE_OFFSET 0xc0000000 (the default) and more than about 700MB of RAM is
 * usually slower than a Guest with less memory.
 *
 * This, of course, cannot be fixed.  It would take some kind of... well, I
 * don't know, but the term "puissant code-fu" comes to mind.
:*/

/*H:420
 * This is the routine which actually sets the page table entry for the
 * "idx"'th shadow page table.
 *
 * Normally, we can just throw out the old entry and replace it with 0: if they
 * use it demand_page() will put the new entry in.  We need to do this anyway:
 * The Guest expects _PAGE_ACCESSED to be set on its PTE the first time a page
 * is read from, and _PAGE_DIRTY when it's written to.
 *
 * But Avi Kivity pointed out that most Operating Systems (Linux included) set
 * these bits on PTEs immediately anyway.  This is done to save the CPU from
 * having to update them, but it helps us the same way: if they set
 * _PAGE_ACCESSED then we can put a read-only PTE entry in immediately, and if
 * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately.
 */
static void do_set_pte(struct lg_cpu *cpu, int idx,
		       unsigned long vaddr, pte_t gpte)
{
	/* Look up the matching shadow page directory entry. */
	pgd_t *spgd = spgd_addr(cpu, idx, vaddr);
#ifdef CONFIG_X86_PAE
	pmd_t *spmd;
#endif

	/* If the top level isn't present, there's no entry to update. */
	if (pgd_flags(*spgd) & _PAGE_PRESENT) {
#ifdef CONFIG_X86_PAE
		spmd = spmd_addr(cpu, *spgd, vaddr);
		if (pmd_flags(*spmd) & _PAGE_PRESENT) {
#endif
			/* Otherwise, start by releasing the existing entry. */
			pte_t *spte = spte_addr(cpu, *spgd, vaddr);
			release_pte(*spte);

			/*
			 * If they're setting this entry as dirty or accessed,
			 * we might as well put that entry they've given us in
			 * now.  This shaves 10% off a copy-on-write
			 * micro-benchmark.
			 */
			if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
				check_gpte(cpu, gpte);
				set_pte(spte,
					gpte_to_spte(cpu, gpte,
						pte_flags(gpte) & _PAGE_DIRTY));
			} else {
				/*
				 * Otherwise kill it and we can demand_page()
				 * it in later.
				 */
				set_pte(spte, __pte(0));
			}
#ifdef CONFIG_X86_PAE
		}
#endif
	}
}

/*H:410
 * Updating a PTE entry is a little trickier.
 *
 * We keep track of several different page tables (the Guest uses one for each
 * process, so it makes sense to cache at least a few).  Each of these has
 * identical kernel parts: ie. every mapping above PAGE_OFFSET is the same for
 * all processes.  So when the page table above that address changes, we update
 * all the page tables, not just the current one.  This is rare.
 *
 * The benefit is that when we have to track a new page table, we can keep all
 * the kernel mappings.  This speeds up context switch immensely.
 */
void guest_set_pte(struct lg_cpu *cpu,
		   unsigned long gpgdir, unsigned long vaddr, pte_t gpte)
{
	/*
	 * Kernel mappings must be changed on all top levels.  Slow, but doesn't
	 * happen often.
	 */
	if (vaddr >= cpu->lg->kernel_address) {
		unsigned int i;
		for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++)
			if (cpu->lg->pgdirs[i].pgdir)
				do_set_pte(cpu, i, vaddr, gpte);
	} else {
		/* Is this page table one we have a shadow for? */
		int pgdir = find_pgdir(cpu->lg, gpgdir);
		if (pgdir != ARRAY_SIZE(cpu->lg->pgdirs))
			/* If so, do the update. */
			do_set_pte(cpu, pgdir, vaddr, gpte);
	}
}
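
/*
 * For instance (illustrative addresses): with the usual PAGE_OFFSET of
 * 0xc0000000, a Guest PTE update for 0xc0100000 lands in the kernel part
 * shared by every process, so we push it into every shadow pgdir we're
 * caching; an update for a userspace address such as 0x08048000 only
 * touches the one shadow pgdir whose gpgdir matches (if we have it at all).
 */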

/*H:400
 * (iii) Setting up a page table entry when the Guest tells us one has changed.
 *
 * Just like we did in interrupts_and_traps.c, it makes sense for us to deal
 * with the other side of page tables while we're here: what happens when the
 * Guest asks for a page table to be updated?
 *
 * We already saw that demand_page() will fill in the shadow page tables when
 * needed, so we can simply remove shadow page table entries whenever the Guest
 * tells us they've changed.  When the Guest tries to use the new entry it will
 * fault and demand_page() will fix it up.
 *
 * So with that in mind here's our code to update a (top-level) PGD entry:
 */
void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx)
{
	int pgdir;

	if (idx >= SWITCHER_PGD_INDEX)
		return;

	/* If they're talking about a page table we have a shadow for... */
	pgdir = find_pgdir(lg, gpgdir);
	if (pgdir < ARRAY_SIZE(lg->pgdirs))
		/* ... throw it away. */
		release_pgd(lg->pgdirs[pgdir].pgdir + idx);
}

#ifdef CONFIG_X86_PAE
/* For setting a mid-level, we just throw everything away.  It's easy. */
void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
{
	guest_pagetable_clear_all(&lg->cpus[0]);
}
#endif

/*H:500
 * (vii) Setting up the page tables initially.
 *
 * When a Guest is first created, we initialize a shadow page table which
 * we will populate on future faults.  The Guest doesn't have any actual
 * pagetables yet, so we set linear_pages to tell demand_page() to fake it
 * for the moment.
 */
int init_guest_pagetable(struct lguest *lg)
{
	struct lg_cpu *cpu = &lg->cpus[0];
	int allocated = 0;

	/* lg (and lg->cpus[]) starts zeroed: this allocates a new pgdir */
	cpu->cpu_pgd = new_pgdir(cpu, 0, &allocated);
	if (!allocated)
		return -ENOMEM;

	/* We start with a linear mapping until the Guest sets up its own. */
	cpu->linear_pages = true;
	return 0;
}

/*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
void page_table_guest_data_init(struct lg_cpu *cpu)
{
	/* We get the kernel address: above this is all kernel memory. */
	if (get_user(cpu->lg->kernel_address,
		     &cpu->lg->lguest_data->kernel_address)
		/*
		 * We tell the Guest that it can't use the top 2 or 4 MB
		 * of virtual addresses used by the Switcher.
		 */
	    || put_user(RESERVE_MEM * 1024 * 1024,
			&cpu->lg->lguest_data->reserve_mem)) {
		kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
		return;
	}

	/*
	 * In flush_user_mappings() we loop from 0 to
	 * "pgd_index(lg->kernel_address)".  This assumes it won't hit the
	 * Switcher mappings, so check that now.
	 */
#ifdef CONFIG_X86_PAE
	if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX &&
		pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX)
#else
	if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX)
#endif
		kill_guest(cpu, "bad kernel address %#lx",
				 cpu->lg->kernel_address);
}

/* When a Guest dies, our cleanup is fairly simple. */
void free_guest_pagetable(struct lguest *lg)
{
	unsigned int i;

	/* Throw away all page table pages. */
	release_all_pagetables(lg);
	/* Now free the top levels: free_page() can handle 0 just fine. */
	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
		free_page((long)lg->pgdirs[i].pgdir);
}

/*H:480
 * (vi) Mapping the Switcher when the Guest is about to run.
 *
 * The Switcher and the two pages for this CPU need to be visible in the
 * Guest (and not the pages for other CPUs).  We have the appropriate PTE pages
 * for each CPU already set up, we just need to hook them in now that we know
 * which Guest is about to run on this CPU.
 */
void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
{
	pte_t *switcher_pte_page = __this_cpu_read(switcher_pte_pages);
	pte_t regs_pte;

#ifdef CONFIG_X86_PAE
	pmd_t switcher_pmd;
	pmd_t *pmd_table;

	switcher_pmd = pfn_pmd(__pa(switcher_pte_page) >> PAGE_SHIFT,
			       PAGE_KERNEL_EXEC);

	/*
	 * Figure out where the pmd page is, by reading the PGD, and converting
	 * it to a virtual address.
	 */
	pmd_table = __va(pgd_pfn(cpu->lg->
			pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX])
								<< PAGE_SHIFT);
	/* Now write it into the shadow page table. */
	set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd);
#else
	pgd_t switcher_pgd;

	/*
	 * Make the last PGD entry for this Guest point to the Switcher's PTE
	 * page for this CPU (with appropriate flags).
	 */
	switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC);

	cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;

#endif
	/*
	 * We also change the Switcher PTE page.  When we're running the Guest,
	 * we want the Guest's "regs" page to appear where the first Switcher
	 * page for this CPU is.  This is an optimization: when the Switcher
	 * saves the Guest registers, it saves them into the first page of this
	 * CPU's "struct lguest_pages": if we make sure the Guest's register
	 * page is already mapped there, we don't have to copy them out
	 * again.
	 */
	regs_pte = pfn_pte(__pa(cpu->regs_page) >> PAGE_SHIFT, PAGE_KERNEL);
	set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], regs_pte);
}
/*:*/

static void free_switcher_pte_pages(void)
{
	unsigned int i;

	for_each_possible_cpu(i)
		free_page((long)switcher_pte_page(i));
}

/*H:520
 * Setting up the Switcher PTE page for a given CPU is fairly easy, given
 * the CPU number and the "struct page"s for the Switcher code itself.
 *
 * Currently the Switcher is less than a page long, so "pages" is always 1.
 */
static __init void populate_switcher_pte_page(unsigned int cpu,
					      struct page *switcher_page[],
					      unsigned int pages)
{
	unsigned int i;
	pte_t *pte = switcher_pte_page(cpu);

	/* The first entries are easy: they map the Switcher code. */
	for (i = 0; i < pages; i++) {
		set_pte(&pte[i], mk_pte(switcher_page[i],
				__pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
	}

	/* The only other thing we map is this CPU's pair of pages. */
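	/*
	 * A quick check on that arithmetic (illustrative): with the Switcher
	 * taking a single page (pages == 1), CPU 3 gets i = 1 + 3*2 = 7, so
	 * its read-write regs page lands in PTE slot 7 and its read-only
	 * "struct lguest_ro_state" page in slot 8.
	 */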
	i = pages + cpu*2;

	/* First page (Guest registers) is writable from the Guest */
	set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]),
			 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW)));

	/*
	 * The second page contains the "struct lguest_ro_state", and is
	 * read-only.
	 */
	set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]),
			   __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
}

/*
 * We've made it through the page table code.  Perhaps our tired brains are
 * still processing the details, or perhaps we're simply glad it's over.
 *
 * If nothing else, note that all this complexity in juggling shadow page tables
 * in sync with the Guest's page tables is for one reason: for most Guests this
 * page table dance determines how bad performance will be.  This is why Xen
 * uses exotic direct Guest pagetable manipulation, and why both Intel and AMD
 * have implemented shadow page table support directly into hardware.
 *
 * There is just one file remaining in the Host.
 */

/*H:510
 * At boot or module load time, init_pagetables() allocates and populates
 * the Switcher PTE page for each CPU.
 */
__init int init_pagetables(struct page **switcher_page, unsigned int pages)
{
	unsigned int i;

	for_each_possible_cpu(i) {
		switcher_pte_page(i) = (pte_t *)get_zeroed_page(GFP_KERNEL);
		if (!switcher_pte_page(i)) {
			free_switcher_pte_pages();
			return -ENOMEM;
		}
		populate_switcher_pte_page(i, switcher_page, pages);
	}
	return 0;
}
/*:*/

/* Cleaning up simply involves freeing the PTE page for each CPU. */
void free_pagetables(void)
{
	free_switcher_pte_pages();
}