/*P:700
 * The pagetable code, on the other hand, still shows the scars of
 * previous encounters.  It's functional, and as neat as it can be in the
 * circumstances, but be wary, for these things are subtle and break easily.
 * The Guest provides a virtual to physical mapping, but we can neither trust
 * it nor use it: we verify and convert it here then point the CPU to the
 * converted Guest pages when running the Guest.
:*/

/* Copyright (C) Rusty Russell IBM Corporation 2006.
 * GPL v2 and any later version */
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/random.h>
#include <linux/percpu.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
#include <asm/bootparam.h>
#include "lg.h"

/*M:008
 * We hold a reference to pages, which prevents them from being swapped.
 * It'd be nice to have a callback in the "struct mm_struct" when Linux wants
 * to swap out.  If we had this, and a shrinker callback to trim PTE pages, we
 * could probably consider launching Guests as non-root.
:*/

/*H:300
 * The Page Table Code
 *
 * We use two-level page tables for the Guest, or three-level with PAE.  If
 * you're not entirely comfortable with virtual addresses, physical addresses
 * and page tables then I recommend you review arch/x86/lguest/boot.c's "Page
 * Table Handling" (with diagrams!).
 *
 * The Guest keeps page tables, but we maintain the actual ones here: these are
 * called "shadow" page tables.  Which is a very Guest-centric name: these are
 * the real page tables the CPU uses, although we keep them up to date to
 * reflect the Guest's.  (See what I mean about weird naming?  Since when do
 * shadows reflect anything?)
 *
 * Anyway, this is the most complicated part of the Host code.  There are seven
 * parts to this:
 *  (i) Looking up a page table entry when the Guest faults,
 *  (ii) Making sure the Guest stack is mapped,
 *  (iii) Setting up a page table entry when the Guest tells us one has changed,
 *  (iv) Switching page tables,
 *  (v) Flushing (throwing away) page tables,
 *  (vi) Mapping the Switcher when the Guest is about to run,
 *  (vii) Setting up the page tables initially.
:*/

/*
 * The Switcher uses the complete top PTE page.  That's 1024 PTE entries (4MB)
 * or 512 PTE entries with PAE (2MB).
 */
#define SWITCHER_PGD_INDEX	(PTRS_PER_PGD - 1)

/*
 * For PAE we need the PMD index as well.  We use the last 2MB, so we
 * will need the last pmd entry of the last pmd page.
 */
#ifdef CONFIG_X86_PAE
#define SWITCHER_PMD_INDEX	(PTRS_PER_PMD - 1)
#define RESERVE_MEM		2U
#define CHECK_GPGD_MASK		_PAGE_PRESENT
#else
#define RESERVE_MEM		4U
#define CHECK_GPGD_MASK		_PAGE_TABLE
#endif

/*
 * We actually need a separate PTE page for each CPU.  Remember that after the
 * Switcher code itself comes two pages for each CPU, and we don't want this
 * CPU's guest to see the pages of any other CPU.
 */
static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu)
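/*
 * A worked example may help here (illustrative only, and assuming the
 * usual non-PAE layout where PTRS_PER_PGD is 1024 and PGDIR_SHIFT is 22):
 * each top-level entry covers 4MB, so the last entry covers the top 4MB
 * of virtual address space, which is where the Switcher lives.
 */
#if 0	/* Not compiled: a sketch of the arithmetic described above. */
static unsigned long switcher_base_example(void)
{
	/* Each of the 1024 top-level entries maps 4MB of virtual space. */
	unsigned long bytes_per_pgd_entry = 1UL << PGDIR_SHIFT;

	/* With SWITCHER_PGD_INDEX == 1023, this works out to 0xFFC00000. */
	return SWITCHER_PGD_INDEX * bytes_per_pgd_entry;
}
#endif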
/*H:320
 * The page table code is curly enough to need helper functions to keep it
 * clear and clean.  The kernel itself provides many of them; one advantage
 * of insisting that the Guest and Host use the same CONFIG_X86_PAE setting.
 *
 * There are two functions which return pointers to the shadow (aka "real")
 * page tables.
 *
 * spgd_addr() takes the virtual address and returns a pointer to the top-level
 * page directory entry (PGD) for that address.  Since we keep track of several
 * page tables, the "i" argument tells us which one we're interested in (it's
 * usually the current one).
 */
static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr)
{
	unsigned int index = pgd_index(vaddr);

#ifndef CONFIG_X86_PAE
	/* We kill any Guest trying to touch the Switcher addresses. */
	if (index >= SWITCHER_PGD_INDEX) {
		kill_guest(cpu, "attempt to access switcher pages");
		index = 0;
	}
#endif
	/* Return a pointer to the index'th pgd entry for the i'th page table. */
	return &cpu->lg->pgdirs[i].pgdir[index];
}

#ifdef CONFIG_X86_PAE
/*
 * This routine then takes the PGD entry given above, which contains the
 * address of the PMD page.  It then returns a pointer to the PMD entry for the
 * given address.
 */
static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
{
	unsigned int index = pmd_index(vaddr);
	pmd_t *page;

	/* We kill any Guest trying to touch the Switcher addresses. */
	if (pgd_index(vaddr) == SWITCHER_PGD_INDEX &&
	    index >= SWITCHER_PMD_INDEX) {
		kill_guest(cpu, "attempt to access switcher pages");
		index = 0;
	}

	/* You should never call this if the PGD entry wasn't valid. */
	BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
	page = __va(pgd_pfn(spgd) << PAGE_SHIFT);

	return &page[index];
}
#endif

/*
 * This routine then takes the page directory entry returned above, which
 * contains the address of the page table entry (PTE) page.  It then returns a
 * pointer to the PTE entry for the given address.
 */
static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
{
#ifdef CONFIG_X86_PAE
	pmd_t *pmd = spmd_addr(cpu, spgd, vaddr);
	pte_t *page = __va(pmd_pfn(*pmd) << PAGE_SHIFT);

	/* You should never call this if the PMD entry wasn't valid. */
	BUG_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT));
#else
	pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
	/* You should never call this if the PGD entry wasn't valid. */
	BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
#endif

	return &page[pte_index(vaddr)];
}

/*
 * These functions are just like the above two, except they access the Guest
 * page tables.  Hence they return a Guest address.
 */
static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr)
{
	unsigned int index = vaddr >> PGDIR_SHIFT;
	return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t);
}
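/*
 * If the index arithmetic above seems opaque, here's a sketch of how an
 * address is carved up (illustrative only, again assuming the non-PAE
 * two-level layout): 10 bits of PGD index, 10 bits of PTE index, and a
 * 12-bit offset within the page.
 */
#if 0	/* Not compiled: how pgd_index()/pte_index() split an address. */
static void address_split_example(void)
{
	unsigned long vaddr = 0xC0001234UL;	/* a made-up address */

	/* Top 10 bits: 0xC0001234 >> 22 == 768. */
	unsigned int pgd_idx = pgd_index(vaddr);

	/* Next 10 bits: (0xC0001234 >> 12) & 1023 == 1. */
	unsigned int pte_idx = pte_index(vaddr);

	/* Bottom 12 bits: the offset within the page, 0x234. */
	unsigned long offset = vaddr & ~PAGE_MASK;
}
#endif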
#ifdef CONFIG_X86_PAE
/* Follow the PGD to the PMD. */
static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr)
{
	unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
	BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
	return gpage + pmd_index(vaddr) * sizeof(pmd_t);
}

/* Follow the PMD to the PTE. */
static unsigned long gpte_addr(struct lg_cpu *cpu,
			       pmd_t gpmd, unsigned long vaddr)
{
	unsigned long gpage = pmd_pfn(gpmd) << PAGE_SHIFT;

	BUG_ON(!(pmd_flags(gpmd) & _PAGE_PRESENT));
	return gpage + pte_index(vaddr) * sizeof(pte_t);
}
#else
/* Follow the PGD to the PTE (no mid-level for !PAE). */
static unsigned long gpte_addr(struct lg_cpu *cpu,
			       pgd_t gpgd, unsigned long vaddr)
{
	unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;

	BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
	return gpage + pte_index(vaddr) * sizeof(pte_t);
}
#endif
/*:*/

/*M:014
 * get_pfn is slow: we could probably try to grab batches of pages here as
 * an optimization (ie. pre-faulting).
:*/

/*H:350
 * This routine takes a page number given by the Guest and converts it to
 * an actual, physical page number.  It can fail for several reasons: the
 * virtual address might not be mapped by the Launcher, the write flag is set
 * and the page is read-only, or the write flag was set and the page was
 * shared so had to be copied, but we ran out of memory.
 *
 * This holds a reference to the page, so release_pte() is careful to put that
 * back.
 */
static unsigned long get_pfn(unsigned long virtpfn, int write)
{
	struct page *page;

	/* gup me one page at this address please! */
	if (get_user_pages_fast(virtpfn << PAGE_SHIFT, 1, write, &page) == 1)
		return page_to_pfn(page);

	/* This value indicates failure. */
	return -1UL;
}

/*H:340
 * Converting a Guest page table entry to a shadow (ie. real) page table
 * entry can be a little tricky.  The flags are (almost) the same, but the
 * Guest PTE contains a virtual page number: the CPU needs the real page
 * number.
 */
static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write)
{
	unsigned long pfn, base, flags;

	/*
	 * The Guest sets the global flag, because it thinks that it is using
	 * PGE.  We only told it to use PGE so it would tell us whether it was
	 * flushing a kernel mapping or a userspace mapping.  We don't actually
	 * use the global bit, so throw it away.
	 */
	flags = (pte_flags(gpte) & ~_PAGE_GLOBAL);

	/* The Guest's pages are offset inside the Launcher. */
	base = (unsigned long)cpu->lg->mem_base / PAGE_SIZE;

	/*
	 * We need a temporary "unsigned long" variable to hold the answer from
	 * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't
	 * fit in spte.pfn.  get_pfn() finds the real physical number of the
	 * page, given the virtual number.
	 */
	pfn = get_pfn(base + pte_pfn(gpte), write);
	if (pfn == -1UL) {
		kill_guest(cpu, "failed to get page %lu", pte_pfn(gpte));
		/*
		 * When we destroy the Guest, we'll go through the shadow page
		 * tables and release_pte() them.  Make sure we don't think
		 * this one is valid!
		 */
		flags = 0;
	}
	/* Now we assemble our shadow PTE from the page number and flags. */
	return pfn_pte(pfn, __pgprot(flags));
}

/*H:460 And to complete the chain, release_pte() looks like this: */
static void release_pte(pte_t pte)
{
	/*
	 * Remember that get_user_pages_fast() took a reference to the page, in
	 * get_pfn()?  We have to put it back now.
	 */
	if (pte_flags(pte) & _PAGE_PRESENT)
		put_page(pte_page(pte));
}
/*:*/
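/*
 * The page number arithmetic in gpte_to_spte() is worth a concrete walk
 * through (illustrative only; the numbers are invented).  Say the Launcher
 * mapped Guest memory at its virtual address 0x10000000, and the Guest PTE
 * names page 5:
 */
#if 0	/* Not compiled: the Guest-page to real-page conversion. */
static pte_t conversion_example(struct lg_cpu *cpu, pte_t gpte)
{
	/* mem_base 0x10000000 means the Guest starts at virtual page 0x10000. */
	unsigned long base = 0x10000000UL / PAGE_SIZE;

	/* Guest page 5 is therefore the Launcher's virtual page 0x10005... */
	unsigned long pfn = get_pfn(base + 5, 0);

	/* ...and get_pfn() tells us which physical page backs that one. */
	return pfn_pte(pfn, __pgprot(pte_flags(gpte) & ~_PAGE_GLOBAL));
}
#endif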
static void check_gpte(struct lg_cpu *cpu, pte_t gpte)
{
	if ((pte_flags(gpte) & _PAGE_PSE) ||
	    pte_pfn(gpte) >= cpu->lg->pfn_limit)
		kill_guest(cpu, "bad page table entry");
}

static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
{
	if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) ||
	    (pgd_pfn(gpgd) >= cpu->lg->pfn_limit))
		kill_guest(cpu, "bad page directory entry");
}

#ifdef CONFIG_X86_PAE
static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)
{
	if ((pmd_flags(gpmd) & ~_PAGE_TABLE) ||
	    (pmd_pfn(gpmd) >= cpu->lg->pfn_limit))
		kill_guest(cpu, "bad page middle directory entry");
}
#endif
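/*
 * demand_page() below tests individual bits of the x86 page fault error
 * code.  As a reminder (this is architectural, not lguest-specific): the
 * low bit means the fault was a protection violation on a present page,
 * the next means it was a write, and the one after that means it came
 * from userspace.
 */
#if 0	/* Not compiled: the error code tests demand_page() relies on. */
static void errcode_example(int errcode)
{
	bool present = errcode & 1;	/* page was there, access denied */
	bool write = errcode & 2;	/* it was a write, not a read */
	bool user = errcode & 4;	/* userspace access, not kernel */
}
#endif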
/*H:330
 * (i) Looking up a page table entry when the Guest faults.
 *
 * We saw this call in run_guest(): when we see a page fault in the Guest, we
 * come here.  That's because we only set up the shadow page tables lazily as
 * they're needed, so we get page faults all the time and quietly fix them up
 * and return to the Guest without it knowing.
 *
 * If we fixed up the fault (ie. we mapped the address), this routine returns
 * true.  Otherwise, it was a real fault and we need to tell the Guest.
 */
bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
{
	pgd_t gpgd;
	pgd_t *spgd;
	unsigned long gpte_ptr;
	pte_t gpte;
	pte_t *spte;

	/* Mid level for PAE. */
#ifdef CONFIG_X86_PAE
	pmd_t *spmd;
	pmd_t gpmd;
#endif

	/* First step: get the top-level Guest page table entry. */
	gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
	/* Toplevel not present?  We can't map it in. */
	if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
		return false;

	/* Now look at the matching shadow entry. */
	spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
	if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {
		/* No shadow entry: allocate a new shadow PTE page. */
		unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
		/*
		 * This is not really the Guest's fault, but killing it is
		 * simple for this corner case.
		 */
		if (!ptepage) {
			kill_guest(cpu, "out of memory allocating pte page");
			return false;
		}
		/* We check that the Guest pgd is OK. */
		check_gpgd(cpu, gpgd);
		/*
		 * And we copy the flags to the shadow PGD entry.  The page
		 * number in the shadow PGD is the page we just allocated.
		 */
		set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd)));
	}

#ifdef CONFIG_X86_PAE
	gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
	/* Middle level not present?  We can't map it in. */
	if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
		return false;

	/* Now look at the matching shadow entry. */
	spmd = spmd_addr(cpu, *spgd, vaddr);

	if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) {
		/* No shadow entry: allocate a new shadow PTE page. */
		unsigned long ptepage = get_zeroed_page(GFP_KERNEL);

		/*
		 * This is not really the Guest's fault, but killing it is
		 * simple for this corner case.
		 */
		if (!ptepage) {
			kill_guest(cpu, "out of memory allocating pte page");
			return false;
		}

		/* We check that the Guest pmd is OK. */
		check_gpmd(cpu, gpmd);

		/*
		 * And we copy the flags to the shadow PMD entry.  The page
		 * number in the shadow PMD is the page we just allocated.
		 */
		set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd)));
	}

	/*
	 * OK, now we look at the lower level in the Guest page table: keep its
	 * address, because we might update it later.
	 */
	gpte_ptr = gpte_addr(cpu, gpmd, vaddr);
#else
	/*
	 * OK, now we look at the lower level in the Guest page table: keep its
	 * address, because we might update it later.
	 */
	gpte_ptr = gpte_addr(cpu, gpgd, vaddr);
#endif

	/* Read the actual PTE value. */
	gpte = lgread(cpu, gpte_ptr, pte_t);

	/* If this page isn't in the Guest page tables, we can't page it in. */
	if (!(pte_flags(gpte) & _PAGE_PRESENT))
		return false;

	/*
	 * Check they're not trying to write to a page the Guest wants
	 * read-only (bit 2 of errcode == write).
	 */
	if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW))
		return false;

	/* User access to a kernel-only page? (bit 3 == user access) */
	if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER))
		return false;

	/*
	 * Check that the Guest PTE flags are OK, and the page number is below
	 * the pfn_limit (ie. not mapping the Launcher binary).
	 */
	check_gpte(cpu, gpte);

	/* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag. */
	gpte = pte_mkyoung(gpte);
	if (errcode & 2)
		gpte = pte_mkdirty(gpte);

	/* Get the pointer to the shadow PTE entry we're going to set. */
	spte = spte_addr(cpu, *spgd, vaddr);

	/*
	 * If there was a valid shadow PTE entry here before, we release it.
	 * This can happen with a write to a previously read-only entry.
	 */
	release_pte(*spte);

	/*
	 * If this is a write, we insist that the Guest page is writable (the
	 * final arg to gpte_to_spte()).
	 */
	if (pte_dirty(gpte))
		set_pte(spte, gpte_to_spte(cpu, gpte, 1));
	else
		/*
		 * If this is a read, don't set the "writable" bit in the page
		 * table entry, even if the Guest says it's writable.  That way
		 * we will come back here when a write does actually occur, so
		 * we can update the Guest's _PAGE_DIRTY flag.
		 */
		set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0));

	/*
	 * Finally, we write the Guest PTE entry back: we've set the
	 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags.
	 */
	lgwrite(cpu, gpte_ptr, pte_t, gpte);

	/*
	 * The fault is fixed, the page table is populated, the mapping
	 * manipulated, the result returned and the code complete.  A small
	 * delay and a trace of alliteration are the only indications the Guest
	 * has that a page fault occurred at all.
	 */
	return true;
}
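/*
 * For the curious: the run_guest() side of this lives in x86/core.c.
 * Roughly (a sketch from memory, not the exact code), the trap handler
 * for trap 14 does this:
 */
#if 0	/* Not compiled: approximately how the page fault trap uses us. */
	case 14:	/* The Guest touched an unmapped or protected page. */
		if (demand_page(cpu, cpu->arch.last_pagefault,
				cpu->regs->errcode))
			return;	/* We fixed it up: resume the Guest. */
		/* Otherwise it's a real fault, reflected into the Guest. */
#endif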
/*H:360
 * (ii) Making sure the Guest stack is mapped.
 *
 * Remember that direct traps into the Guest need a mapped Guest kernel stack.
 * pin_stack_pages() calls us here: we could simply call demand_page(), but as
 * we've seen that logic is quite long, and usually the stack pages are already
 * mapped, so it's overkill.
 *
 * This is a quick version which answers the question: is this virtual address
 * mapped by the shadow page tables, and is it writable?
 */
static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr)
{
	pgd_t *spgd;
	unsigned long flags;

#ifdef CONFIG_X86_PAE
	pmd_t *spmd;
#endif
	/* Look at the current top level entry: is it present? */
	spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
	if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
		return false;

#ifdef CONFIG_X86_PAE
	spmd = spmd_addr(cpu, *spgd, vaddr);
	if (!(pmd_flags(*spmd) & _PAGE_PRESENT))
		return false;
#endif

	/*
	 * Check the flags on the pte entry itself: it must be present and
	 * writable.
	 */
	flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr)));

	return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
}

/*
 * So, when pin_stack_pages() asks us to pin a page, we check if it's already
 * in the page tables, and if not, we call demand_page() with error code 2
 * (meaning "write").
 */
void pin_page(struct lg_cpu *cpu, unsigned long vaddr)
{
	if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2))
		kill_guest(cpu, "bad stack page %#lx", vaddr);
}
/*:*/

#ifdef CONFIG_X86_PAE
static void release_pmd(pmd_t *spmd)
{
	/* If the entry's not present, there's nothing to release. */
	if (pmd_flags(*spmd) & _PAGE_PRESENT) {
		unsigned int i;
		pte_t *ptepage = __va(pmd_pfn(*spmd) << PAGE_SHIFT);
		/* For each entry in the page, we might need to release it. */
		for (i = 0; i < PTRS_PER_PTE; i++)
			release_pte(ptepage[i]);
		/* Now we can free the page of PTEs. */
		free_page((long)ptepage);
		/* And zero out the PMD entry so we never release it twice. */
		set_pmd(spmd, __pmd(0));
	}
}

static void release_pgd(pgd_t *spgd)
{
	/* If the entry's not present, there's nothing to release. */
	if (pgd_flags(*spgd) & _PAGE_PRESENT) {
		unsigned int i;
		pmd_t *pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);

		for (i = 0; i < PTRS_PER_PMD; i++)
			release_pmd(&pmdpage[i]);

		/* Now we can free the page of PMDs. */
		free_page((long)pmdpage);
		/* And zero out the PGD entry so we never release it twice. */
		set_pgd(spgd, __pgd(0));
	}
}

#else /* !CONFIG_X86_PAE */
/*H:450
 * If we chase down the release_pgd() code, the non-PAE version looks like
 * this.  The PAE version is almost identical, but instead of calling
 * release_pte() it calls release_pmd(), which looks much like this.
 */
static void release_pgd(pgd_t *spgd)
{
	/* If the entry's not present, there's nothing to release. */
	if (pgd_flags(*spgd) & _PAGE_PRESENT) {
		unsigned int i;
		/*
		 * Converting the pfn to find the actual PTE page is easy: turn
		 * the page number into a physical address, then convert to a
		 * virtual address (easy for kernel pages like this one).
		 */
		pte_t *ptepage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
		/* For each entry in the page, we might need to release it. */
		for (i = 0; i < PTRS_PER_PTE; i++)
			release_pte(ptepage[i]);
		/* Now we can free the page of PTEs. */
		free_page((long)ptepage);
		/* And zero out the PGD entry so we never release it twice. */
		*spgd = __pgd(0);
	}
}
#endif
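/*
 * Note the design choice above: every release zeroes the entry it has
 * just freed, so releasing the same entry twice is harmless.  That lets
 * flush_user_mappings() below sweep over a range of entries without
 * caring which of them were ever populated.  Illustrative only:
 */
#if 0	/* Not compiled: a second release finds nothing to do. */
static void double_release_example(pgd_t *spgd)
{
	release_pgd(spgd);	/* frees the PTE page and zeroes *spgd */
	release_pgd(spgd);	/* sees !_PAGE_PRESENT, does nothing */
}
#endif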
/*H:445
 * We saw flush_user_mappings() twice: once from the flush_user_mappings()
 * hypercall and once in new_pgdir() when we re-used a top-level pgdir page.
 * It simply releases every PTE page from 0 up to the Guest's kernel address.
 */
static void flush_user_mappings(struct lguest *lg, int idx)
{
	unsigned int i;
	/* Release every pgd entry up to the kernel's address. */
	for (i = 0; i < pgd_index(lg->kernel_address); i++)
		release_pgd(lg->pgdirs[idx].pgdir + i);
}

/*H:440
 * (v) Flushing (throwing away) page tables,
 *
 * The Guest has a hypercall to throw away the page tables: it's used when a
 * large number of mappings have been changed.
 */
void guest_pagetable_flush_user(struct lg_cpu *cpu)
{
	/* Drop the userspace part of the current page table. */
	flush_user_mappings(cpu->lg, cpu->cpu_pgd);
}
/*:*/

/* We walk down the Guest page tables to get a guest-physical address. */
unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
{
	pgd_t gpgd;
	pte_t gpte;
#ifdef CONFIG_X86_PAE
	pmd_t gpmd;
#endif
	/* First step: get the top-level Guest page table entry. */
	gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
	/* Toplevel not present?  We can't map it in. */
	if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) {
		kill_guest(cpu, "Bad address %#lx", vaddr);
		return -1UL;
	}

#ifdef CONFIG_X86_PAE
	gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
	if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
		kill_guest(cpu, "Bad address %#lx", vaddr);
	gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t);
#else
	gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t);
#endif
	if (!(pte_flags(gpte) & _PAGE_PRESENT))
		kill_guest(cpu, "Bad address %#lx", vaddr);

	return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK);
}

/*
 * We keep several page tables.  This is a simple routine to find the page
 * table (if any) corresponding to this top-level address the Guest has given
 * us.
 */
static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable)
{
	unsigned int i;
	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
		if (lg->pgdirs[i].pgdir && lg->pgdirs[i].gpgdir == pgtable)
			break;
	return i;
}
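/*
 * A worked example for guest_pa() above (illustrative only, with made-up
 * non-PAE numbers): asked about 0xC0001234, we read Guest PGD entry 768,
 * follow it to a PTE page, and read PTE entry 1.  If that entry holds
 * page number 0x1000, the final line computes:
 */
#if 0	/* Not compiled: the arithmetic of guest_pa()'s return value. */
static unsigned long guest_pa_example(void)
{
	unsigned long pfn = 0x1000;		/* made-up pte_pfn() result */
	unsigned long vaddr = 0xC0001234UL;	/* made-up query address */

	/* Page number times page size, plus the offset within the page. */
	return pfn * PAGE_SIZE | (vaddr & ~PAGE_MASK);	/* 0x1000234 */
}
#endif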
/*H:435
 * And this is us, creating the new page directory.  If we really do
 * allocate a new one (and so the kernel parts are not there), we set
 * blank_pgdir.
 */
static unsigned int new_pgdir(struct lg_cpu *cpu,
			      unsigned long gpgdir,
			      int *blank_pgdir)
{
	unsigned int next;
#ifdef CONFIG_X86_PAE
	pmd_t *pmd_table;
#endif

	/*
	 * We pick one entry at random to throw out.  Choosing the Least
	 * Recently Used might be better, but this is easy.
	 */
	next = random32() % ARRAY_SIZE(cpu->lg->pgdirs);
	/* If it's never been allocated at all before, try now. */
	if (!cpu->lg->pgdirs[next].pgdir) {
		cpu->lg->pgdirs[next].pgdir =
					(pgd_t *)get_zeroed_page(GFP_KERNEL);
		/* If the allocation fails, just keep using the one we have. */
		if (!cpu->lg->pgdirs[next].pgdir)
			next = cpu->cpu_pgd;
		else {
#ifdef CONFIG_X86_PAE
			/*
			 * In PAE mode, allocate a pmd page and populate the
			 * last pgd entry.
			 */
			pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL);
			if (!pmd_table) {
				/*
				 * No pmd page: give the pgd page back and
				 * forget the pointer, so we never mistake
				 * this slot for an allocated one.
				 */
				free_page((long)cpu->lg->pgdirs[next].pgdir);
				cpu->lg->pgdirs[next].pgdir = NULL;
				next = cpu->cpu_pgd;
			} else {
				set_pgd(cpu->lg->pgdirs[next].pgdir +
					SWITCHER_PGD_INDEX,
					__pgd(__pa(pmd_table) | _PAGE_PRESENT));
				/*
				 * This is a blank page, so there are no kernel
				 * mappings: caller must map the stack!
				 */
				*blank_pgdir = 1;
			}
#else
			*blank_pgdir = 1;
#endif
		}
	}
	/* Record which Guest toplevel this shadows. */
	cpu->lg->pgdirs[next].gpgdir = gpgdir;
	/* Release all the non-kernel mappings. */
	flush_user_mappings(cpu->lg, next);

	return next;
}

/*H:430
 * (iv) Switching page tables
 *
 * Now we've seen all the page table setting and manipulation, let's see
 * what happens when the Guest changes page tables (ie. changes the top-level
 * pgdir).  This occurs on almost every context switch.
 */
void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
{
	int newpgdir, repin = 0;

	/* Look to see if we have this one already. */
	newpgdir = find_pgdir(cpu->lg, pgtable);
	/*
	 * If not, we allocate or mug an existing one: if it's a fresh one,
	 * repin gets set to 1.
	 */
	if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs))
		newpgdir = new_pgdir(cpu, pgtable, &repin);
	/* Change the current pgd index to the new one. */
	cpu->cpu_pgd = newpgdir;
	/* If it was completely blank, we map in the Guest kernel stack. */
	if (repin)
		pin_stack_pages(cpu);
}

/*H:470
 * Finally, a routine which throws away everything: all PGD entries in all
 * the shadow page tables, including the Guest's kernel mappings.  This is used
 * when we destroy the Guest.
 */
static void release_all_pagetables(struct lguest *lg)
{
	unsigned int i, j;

	/* Every shadow pagetable this Guest has */
	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
		if (lg->pgdirs[i].pgdir) {
#ifdef CONFIG_X86_PAE
			pgd_t *spgd;
			pmd_t *pmdpage;
			unsigned int k;

			/* Get the last pmd page. */
			spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX;
			pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);

			/*
			 * And release the pmd entries of that pmd page,
			 * except for the switcher pmd.
			 */
			for (k = 0; k < SWITCHER_PMD_INDEX; k++)
				release_pmd(&pmdpage[k]);
#endif
			/* Every PGD entry except the Switcher at the top */
			for (j = 0; j < SWITCHER_PGD_INDEX; j++)
				release_pgd(lg->pgdirs[i].pgdir + j);
		}
}

/*
 * We also throw away everything when a Guest tells us it's changed a kernel
 * mapping.  Since kernel mappings are in every page table, it's easiest to
 * throw them all away.  This traps the Guest in amber for a while as
 * everything faults back in, but it's rare.
 */
void guest_pagetable_clear_all(struct lg_cpu *cpu)
{
	release_all_pagetables(cpu->lg);
	/* We need the Guest kernel stack mapped again. */
	pin_stack_pages(cpu);
}
/*:*/

/*M:009
 * Since we throw away all mappings when a kernel mapping changes, our
 * performance sucks for guests using highmem.  In fact, a guest with
 * PAGE_OFFSET 0xc0000000 (the default) and more than about 700MB of RAM is
 * usually slower than a Guest with less memory.
 *
 * This, of course, cannot be fixed.  It would take some kind of... well, I
 * don't know, but the term "puissant code-fu" comes to mind.
:*/
/*H:420
 * This is the routine which actually sets the page table entry for the
 * "idx"'th shadow page table.
 *
 * Normally, we can just throw out the old entry and replace it with 0: if they
 * use it demand_page() will put the new entry in.  We need to do this anyway:
 * The Guest expects _PAGE_ACCESSED to be set on its PTE the first time a page
 * is read from, and _PAGE_DIRTY when it's written to.
 *
 * But Avi Kivity pointed out that most Operating Systems (Linux included) set
 * these bits on PTEs immediately anyway.  This is done to save the CPU from
 * having to update them, but it helps us the same way: if they set
 * _PAGE_ACCESSED then we can put a read-only PTE entry in immediately, and if
 * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately.
 */
static void do_set_pte(struct lg_cpu *cpu, int idx,
		       unsigned long vaddr, pte_t gpte)
{
	/* Look up the matching shadow page directory entry. */
	pgd_t *spgd = spgd_addr(cpu, idx, vaddr);
#ifdef CONFIG_X86_PAE
	pmd_t *spmd;
#endif

	/* If the top level isn't present, there's no entry to update. */
	if (pgd_flags(*spgd) & _PAGE_PRESENT) {
#ifdef CONFIG_X86_PAE
		spmd = spmd_addr(cpu, *spgd, vaddr);
		if (pmd_flags(*spmd) & _PAGE_PRESENT) {
#endif
			/* Otherwise, start by releasing the existing entry. */
			pte_t *spte = spte_addr(cpu, *spgd, vaddr);
			release_pte(*spte);

			/*
			 * If they're setting this entry as dirty or accessed,
			 * we might as well put that entry they've given us in
			 * now.  This shaves 10% off a copy-on-write
			 * micro-benchmark.
			 */
			if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
				check_gpte(cpu, gpte);
				set_pte(spte,
					gpte_to_spte(cpu, gpte,
						pte_flags(gpte) & _PAGE_DIRTY));
			} else {
				/*
				 * Otherwise kill it and we can demand_page()
				 * it in later.
				 */
				set_pte(spte, __pte(0));
			}
#ifdef CONFIG_X86_PAE
		}
#endif
	}
}

/*H:410
 * Updating a PTE entry is a little trickier.
 *
 * We keep track of several different page tables (the Guest uses one for each
 * process, so it makes sense to cache at least a few).  Each of these has
 * identical kernel parts: ie. every mapping above PAGE_OFFSET is the same for
 * all processes.  So when the page table above that address changes, we update
 * all the page tables, not just the current one.  This is rare.
 *
 * The benefit is that when we have to track a new page table, we can keep all
 * the kernel mappings.  This speeds up context switches immensely.
 */
void guest_set_pte(struct lg_cpu *cpu,
		   unsigned long gpgdir, unsigned long vaddr, pte_t gpte)
{
	/*
	 * Kernel mappings must be changed on all top levels.  Slow, but doesn't
	 * happen often.
	 */
	if (vaddr >= cpu->lg->kernel_address) {
		unsigned int i;
		for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++)
			if (cpu->lg->pgdirs[i].pgdir)
				do_set_pte(cpu, i, vaddr, gpte);
	} else {
		/* Is this page table one we have a shadow for? */
		int pgdir = find_pgdir(cpu->lg, gpgdir);
		if (pgdir != ARRAY_SIZE(cpu->lg->pgdirs))
			/* If so, do the update. */
			do_set_pte(cpu, pgdir, vaddr, gpte);
	}
}

/*H:400
 * (iii) Setting up a page table entry when the Guest tells us one has changed.
 *
 * Just like we did in interrupts_and_traps.c, it makes sense for us to deal
 * with the other side of page tables while we're here: what happens when the
 * Guest asks for a page table to be updated?
 *
 * We already saw that demand_page() will fill in the shadow page tables when
 * needed, so we can simply remove shadow page table entries whenever the Guest
 * tells us they've changed.  When the Guest tries to use the new entry it will
 * fault and demand_page() will fix it up.
 *
 * So with that in mind here's our code to update a (top-level) PGD entry:
 */
void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx)
{
	int pgdir;

	/* The top entry belongs to the Switcher, not the Guest. */
	if (idx >= SWITCHER_PGD_INDEX)
		return;

	/* If they're talking about a page table we have a shadow for... */
	pgdir = find_pgdir(lg, gpgdir);
	if (pgdir < ARRAY_SIZE(lg->pgdirs))
		/* ... throw it away. */
		release_pgd(lg->pgdirs[pgdir].pgdir + idx);
}

#ifdef CONFIG_X86_PAE
/* For setting a mid-level, we just throw everything away.  It's easy. */
void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
{
	guest_pagetable_clear_all(&lg->cpus[0]);
}
#endif
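/*
 * Before wading into setup_pagetables() below, it helps to see the layout
 * it builds with concrete numbers (illustrative only: non-PAE, 64MB of
 * Guest memory, no initrd).  We must map 16384 pages, which takes 16 PTE
 * pages; the PGD page sits in the Guest's very top page of memory, with
 * those 16 PTE pages immediately below it.
 */
#if 0	/* Not compiled: the layout arithmetic for mem == 64MB. */
static void layout_example(unsigned long mem_top)
{
	unsigned long mapped_pages, linear_pages, pgdir, linear;

	mapped_pages = (64UL << 20) / PAGE_SIZE;		 /* 16384 */
	linear_pages = DIV_ROUND_UP(mapped_pages, PTRS_PER_PTE); /* 16 */
	pgdir = mem_top - PAGE_SIZE;		/* the very last page */
	linear = pgdir - linear_pages * PAGE_SIZE;	/* 16 pages below */
}
#endif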
/*H:505
 * To get through boot, we construct simple identity page mappings (which
 * set virtual == physical) and linear mappings which will get the Guest far
 * enough into the boot to create its own.  The linear mapping means we
 * simplify the Guest boot, but it makes assumptions about their PAGE_OFFSET,
 * as you'll see.
 *
 * We lay them out of the way, just below the initrd (which is why we need to
 * know its size here).
 */
static unsigned long setup_pagetables(struct lguest *lg,
				      unsigned long mem,
				      unsigned long initrd_size)
{
	pgd_t __user *pgdir;
	pte_t __user *linear;
	unsigned long mem_base = (unsigned long)lg->mem_base;
	unsigned int mapped_pages, i, linear_pages;
#ifdef CONFIG_X86_PAE
	pmd_t __user *pmds;
	unsigned int j;
	pgd_t pgd;
	pmd_t pmd;
#else
	unsigned int phys_linear;
#endif

	/*
	 * We have mapped_pages frames to map, so we need linear_pages page
	 * tables to map them.
	 */
	mapped_pages = mem / PAGE_SIZE;
	linear_pages = (mapped_pages + PTRS_PER_PTE - 1) / PTRS_PER_PTE;

	/* We put the toplevel page directory page at the top of memory. */
	pgdir = (pgd_t *)(mem + mem_base - initrd_size - PAGE_SIZE);

	/* Now we use the next linear_pages pages as pte pages. */
	linear = (void *)pgdir - linear_pages * PAGE_SIZE;

#ifdef CONFIG_X86_PAE
	/*
	 * And the single mid page goes below that.  We only use one, but
	 * that's enough to map 1G, which definitely gets us through boot.
	 */
	pmds = (void *)linear - PAGE_SIZE;
#endif
	/*
	 * Linear mapping is easy: put every page's address into the
	 * mapping in order.
	 */
	for (i = 0; i < mapped_pages; i++) {
		pte_t pte;
		pte = pfn_pte(i, __pgprot(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER));
		if (copy_to_user(&linear[i], &pte, sizeof(pte)) != 0)
			return -EFAULT;
	}

#ifdef CONFIG_X86_PAE
	/*
	 * Make the Guest PMD entries point to the corresponding place in the
	 * linear mapping (up to one page worth of PMD).
	 */
	for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD;
	     i += PTRS_PER_PTE, j++) {
		pmd = pfn_pmd(((unsigned long)&linear[i] - mem_base)/PAGE_SIZE,
			      __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER));

		if (copy_to_user(&pmds[j], &pmd, sizeof(pmd)) != 0)
			return -EFAULT;
	}

	/* One PGD entry, pointing to that PMD page. */
	pgd = __pgd(((unsigned long)pmds - mem_base) | _PAGE_PRESENT);
	/* Copy it in as the first PGD entry (ie. addresses 0-1G). */
	if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0)
		return -EFAULT;
	/*
	 * And the other PGD entry to make the linear mapping at PAGE_OFFSET.
	 */
	if (copy_to_user(&pgdir[KERNEL_PGD_BOUNDARY], &pgd, sizeof(pgd)))
		return -EFAULT;
#else
	/*
	 * The top level points to the linear page table pages above.
	 * We setup the identity and linear mappings here.
	 */
	phys_linear = (unsigned long)linear - mem_base;
	for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) {
		pgd_t pgd;
		/*
		 * Create a PGD entry which points to the right part of the
		 * linear PTE pages.
		 */
		pgd = __pgd((phys_linear + i * sizeof(pte_t)) |
			    (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER));

		/*
		 * Copy it into the PGD page at 0 and PAGE_OFFSET.
		 */
		if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd))
		    || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET)
					   + i / PTRS_PER_PTE],
				    &pgd, sizeof(pgd)))
			return -EFAULT;
	}
#endif

	/*
	 * We return the top level (guest-physical) address: we remember where
	 * this is to write it into lguest_data when the Guest initializes.
	 */
	return (unsigned long)pgdir - mem_base;
}

/*H:500
 * (vii) Setting up the page tables initially.
 *
 * When a Guest is first created, the Launcher tells us where the toplevel of
 * its first page table is.  We set some things up here:
 */
int init_guest_pagetable(struct lguest *lg)
{
	u64 mem;
	u32 initrd_size;
	struct boot_params __user *boot = (struct boot_params *)lg->mem_base;
#ifdef CONFIG_X86_PAE
	pgd_t *pgd;
	pmd_t *pmd_table;
#endif
	/*
	 * Get the Guest memory size and the ramdisk size from the boot header
	 * located at lg->mem_base (Guest address 0).
	 */
	if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem))
	    || get_user(initrd_size, &boot->hdr.ramdisk_size))
		return -EFAULT;

	/*
	 * We start on the first shadow page table, and give it a blank PGD
	 * page.
	 */
	lg->pgdirs[0].gpgdir = setup_pagetables(lg, mem, initrd_size);
	if (IS_ERR_VALUE(lg->pgdirs[0].gpgdir))
		return lg->pgdirs[0].gpgdir;
	lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
	if (!lg->pgdirs[0].pgdir)
		return -ENOMEM;

#ifdef CONFIG_X86_PAE
	/* For PAE, we also create the initial mid-level. */
	pgd = lg->pgdirs[0].pgdir;
	pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL);
	if (!pmd_table)
		return -ENOMEM;

	set_pgd(pgd + SWITCHER_PGD_INDEX,
		__pgd(__pa(pmd_table) | _PAGE_PRESENT));
#endif

	/* This is the current page table. */
	lg->cpus[0].cpu_pgd = 0;
	return 0;
}
/*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
void page_table_guest_data_init(struct lg_cpu *cpu)
{
	/* We get the kernel address: above this is all kernel memory. */
	if (get_user(cpu->lg->kernel_address,
		     &cpu->lg->lguest_data->kernel_address)
	    /*
	     * We tell the Guest that it can't use the top 2 or 4 MB
	     * of virtual addresses used by the Switcher.
	     */
	    || put_user(RESERVE_MEM * 1024 * 1024,
			&cpu->lg->lguest_data->reserve_mem)
	    || put_user(cpu->lg->pgdirs[0].gpgdir,
			&cpu->lg->lguest_data->pgdir))
		kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);

	/*
	 * In flush_user_mappings() we loop from 0 to
	 * "pgd_index(lg->kernel_address)".  This assumes it won't hit the
	 * Switcher mappings, so check that now.
	 */
#ifdef CONFIG_X86_PAE
	if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX &&
	    pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX)
#else
	if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX)
#endif
		kill_guest(cpu, "bad kernel address %#lx",
			   cpu->lg->kernel_address);
}

/* When a Guest dies, our cleanup is fairly simple. */
void free_guest_pagetable(struct lguest *lg)
{
	unsigned int i;

	/* Throw away all page table pages. */
	release_all_pagetables(lg);
	/* Now free the top levels: free_page() can handle 0 just fine. */
	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
		free_page((long)lg->pgdirs[i].pgdir);
}

/*H:480
 * (vi) Mapping the Switcher when the Guest is about to run.
 *
 * The Switcher and the two pages for this CPU need to be visible in the
 * Guest (and not the pages for other CPUs).  We have the appropriate PTE pages
 * for each CPU already set up, we just need to hook them in now we know which
 * Guest is about to run on this CPU.
 */
void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
{
	pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages);
	pte_t regs_pte;

#ifdef CONFIG_X86_PAE
	pmd_t switcher_pmd;
	pmd_t *pmd_table;

	switcher_pmd = pfn_pmd(__pa(switcher_pte_page) >> PAGE_SHIFT,
			       PAGE_KERNEL_EXEC);

	/*
	 * Figure out where the pmd page is, by reading the PGD, and converting
	 * it to a virtual address.
	 */
	pmd_table = __va(pgd_pfn(cpu->lg->
			pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX])
								<< PAGE_SHIFT);
	/* Now write it into the shadow page table. */
	set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd);
#else
	pgd_t switcher_pgd;

	/*
	 * Make the last PGD entry for this Guest point to the Switcher's PTE
	 * page for this CPU (with appropriate flags).
	 */
	switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC);

	cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;
#endif
	/*
	 * We also change the Switcher PTE page.  When we're running the Guest,
	 * we want the Guest's "regs" page to appear where the first Switcher
	 * page for this CPU is.  This is an optimization: when the Switcher
	 * saves the Guest registers, it saves them into the first page of this
	 * CPU's "struct lguest_pages": if we make sure the Guest's register
	 * page is already mapped there, we don't have to copy them out
	 * again.
	 */
	regs_pte = pfn_pte(__pa(cpu->regs_page) >> PAGE_SHIFT, PAGE_KERNEL);
	set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], regs_pte);
}
/*:*/

static void free_switcher_pte_pages(void)
{
	unsigned int i;

	for_each_possible_cpu(i)
		free_page((long)switcher_pte_page(i));
}
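/*
 * The index arithmetic in populate_switcher_pte_page() below deserves a
 * concrete example (illustrative only).  With the Switcher code taking a
 * single page ("pages" == 1), CPU 3's pair lands at entries 7 and 8 of
 * its PTE page: one past the code page, plus two entries for each of the
 * three preceding CPUs.
 */
#if 0	/* Not compiled: where CPU 3's two pages land when pages == 1. */
static unsigned int pair_index_example(unsigned int pages, unsigned int cpu)
{
	/*
	 * For pages == 1 and cpu == 3 this is 7: pte[7] is the writable
	 * regs page and pte[8] the read-only state page.
	 */
	return pages + cpu * 2;
}
#endif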
/*H:520
 * Setting up the Switcher PTE page for given CPU is fairly easy, given
 * the CPU number and the "struct page"s for the Switcher code itself.
 *
 * Currently the Switcher is less than a page long, so "pages" is always 1.
 */
static __init void populate_switcher_pte_page(unsigned int cpu,
					      struct page *switcher_page[],
					      unsigned int pages)
{
	unsigned int i;
	pte_t *pte = switcher_pte_page(cpu);

	/* The first entries are easy: they map the Switcher code. */
	for (i = 0; i < pages; i++) {
		set_pte(&pte[i], mk_pte(switcher_page[i],
				__pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
	}

	/* The only other thing we map is this CPU's pair of pages. */
	i = pages + cpu*2;

	/* First page (Guest registers) is writable from the Guest. */
	set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]),
			 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW)));

	/*
	 * The second page contains the "struct lguest_ro_state", and is
	 * read-only.
	 */
	set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]),
			   __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
}

/*
 * We've made it through the page table code.  Perhaps our tired brains are
 * still processing the details, or perhaps we're simply glad it's over.
 *
 * If nothing else, note that all the complexity of keeping shadow page tables
 * in sync with the Guest's page tables is there for one reason: for most
 * Guests this page table dance determines how bad performance will be.  This
 * is why Xen uses exotic direct Guest pagetable manipulation, and why both
 * Intel and AMD have implemented shadow page table support directly in
 * hardware.
 *
 * There is just one file remaining in the Host.
 */

/*H:510
 * At boot or module load time, init_pagetables() allocates and populates
 * the Switcher PTE page for each CPU.
 */
__init int init_pagetables(struct page **switcher_page, unsigned int pages)
{
	unsigned int i;

	for_each_possible_cpu(i) {
		switcher_pte_page(i) = (pte_t *)get_zeroed_page(GFP_KERNEL);
		if (!switcher_pte_page(i)) {
			free_switcher_pte_pages();
			return -ENOMEM;
		}
		populate_switcher_pte_page(i, switcher_page, pages);
	}
	return 0;
}
/*:*/

/* Cleaning up simply involves freeing the PTE page for each CPU. */
void free_pagetables(void)
{
	free_switcher_pte_pages();
}
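/*
 * And for completeness, a sketch (not the exact code: see core.c for the
 * real thing) of how the Host uses the two entry points above when the
 * lg module is loaded and unloaded:
 */
#if 0	/* Not compiled: the approximate module-lifetime callers. */
	/* At load time, once the Switcher's pages are set up: */
	err = init_pagetables(switcher_page, SHARED_SWITCHER_PAGES);
	if (err)
		goto unmap_switcher;

	/* ... and at unload time: */
	free_pagetables();
#endif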