linux/arch/arm/kvm/mmu.c
   1/*
   2 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
   3 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
   4 *
   5 * This program is free software; you can redistribute it and/or modify
   6 * it under the terms of the GNU General Public License, version 2, as
   7 * published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope that it will be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
  14 * You should have received a copy of the GNU General Public License
  15 * along with this program; if not, write to the Free Software
  16 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  17 */
  18
  19#include <linux/mman.h>
  20#include <linux/kvm_host.h>
  21#include <linux/io.h>
  22#include <linux/hugetlb.h>
  23#include <trace/events/kvm.h>
  24#include <asm/pgalloc.h>
  25#include <asm/cacheflush.h>
  26#include <asm/kvm_arm.h>
  27#include <asm/kvm_mmu.h>
  28#include <asm/kvm_mmio.h>
  29#include <asm/kvm_asm.h>
  30#include <asm/kvm_emulate.h>
  31#include <asm/virt.h>
  32
  33#include "trace.h"
  34
   35extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[];
  36
  37static pgd_t *boot_hyp_pgd;
  38static pgd_t *hyp_pgd;
  39static pgd_t *merged_hyp_pgd;
  40static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
  41
  42static unsigned long hyp_idmap_start;
  43static unsigned long hyp_idmap_end;
  44static phys_addr_t hyp_idmap_vector;
  45
  46#define S2_PGD_SIZE     (PTRS_PER_S2_PGD * sizeof(pgd_t))
  47#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))
  48
  49#define KVM_S2PTE_FLAG_IS_IOMAP         (1UL << 0)
  50#define KVM_S2_FLAG_LOGGING_ACTIVE      (1UL << 1)
  51
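     /*
      * Dirty page logging is considered active for a memslot when it has a
      * dirty bitmap attached and is not read-only; read-only slots never need
      * write-protection based logging.
      */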
  52static bool memslot_is_logging(struct kvm_memory_slot *memslot)
  53{
  54        return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
  55}
  56
  57/**
  58 * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
  59 * @kvm:        pointer to kvm structure.
  60 *
  61 * Interface to HYP function to flush all VM TLB entries
  62 */
  63void kvm_flush_remote_tlbs(struct kvm *kvm)
  64{
  65        kvm_call_hyp(__kvm_tlb_flush_vmid, kvm);
  66}
  67
  68static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
  69{
  70        kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
  71}
  72
  73/*
  74 * D-Cache management functions. They take the page table entries by
  75 * value, as they are flushing the cache using the kernel mapping (or
  76 * kmap on 32bit).
  77 */
  78static void kvm_flush_dcache_pte(pte_t pte)
  79{
  80        __kvm_flush_dcache_pte(pte);
  81}
  82
  83static void kvm_flush_dcache_pmd(pmd_t pmd)
  84{
  85        __kvm_flush_dcache_pmd(pmd);
  86}
  87
  88static void kvm_flush_dcache_pud(pud_t pud)
  89{
  90        __kvm_flush_dcache_pud(pud);
  91}
  92
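     /*
      * A pfn with no struct page behind it (!pfn_valid) is treated as device
      * memory: such mappings get device attributes and skip the D-cache
      * maintenance that is done for normal RAM.
      */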
  93static bool kvm_is_device_pfn(unsigned long pfn)
  94{
  95        return !pfn_valid(pfn);
  96}
  97
  98/**
  99 * stage2_dissolve_pmd() - clear and flush huge PMD entry
 100 * @kvm:        pointer to kvm structure.
 101 * @addr:       IPA
 102 * @pmd:        pmd pointer for IPA
 103 *
  104 * Clears the PMD entry and flushes the 1st and 2nd stage TLBs for @addr.
  105 * Marks all pages in the range dirty.
 106 */
 107static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
 108{
 109        if (!pmd_thp_or_huge(*pmd))
 110                return;
 111
 112        pmd_clear(pmd);
 113        kvm_tlb_flush_vmid_ipa(kvm, addr);
 114        put_page(virt_to_page(pmd));
 115}
 116
 117static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
 118                                  int min, int max)
 119{
 120        void *page;
 121
 122        BUG_ON(max > KVM_NR_MEM_OBJS);
 123        if (cache->nobjs >= min)
 124                return 0;
 125        while (cache->nobjs < max) {
 126                page = (void *)__get_free_page(PGALLOC_GFP);
 127                if (!page)
 128                        return -ENOMEM;
 129                cache->objects[cache->nobjs++] = page;
 130        }
 131        return 0;
 132}
 133
 134static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
 135{
 136        while (mc->nobjs)
 137                free_page((unsigned long)mc->objects[--mc->nobjs]);
 138}
 139
 140static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
 141{
 142        void *p;
 143
 144        BUG_ON(!mc || !mc->nobjs);
 145        p = mc->objects[--mc->nobjs];
 146        return p;
 147}
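     /*
      * The three cache helpers above implement the usual KVM pattern: callers
      * top up the cache *before* taking kvm->mmu_lock (see user_mem_abort()
      * and kvm_phys_addr_ioremap()), so that page table allocations performed
      * while the lock is held never have to sleep.
      */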
 148
 149static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr)
 150{
 151        pud_t *pud_table __maybe_unused = stage2_pud_offset(pgd, 0UL);
 152        stage2_pgd_clear(pgd);
 153        kvm_tlb_flush_vmid_ipa(kvm, addr);
 154        stage2_pud_free(pud_table);
 155        put_page(virt_to_page(pgd));
 156}
 157
 158static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
 159{
 160        pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(pud, 0);
 161        VM_BUG_ON(stage2_pud_huge(*pud));
 162        stage2_pud_clear(pud);
 163        kvm_tlb_flush_vmid_ipa(kvm, addr);
 164        stage2_pmd_free(pmd_table);
 165        put_page(virt_to_page(pud));
 166}
 167
 168static void clear_stage2_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
 169{
 170        pte_t *pte_table = pte_offset_kernel(pmd, 0);
 171        VM_BUG_ON(pmd_thp_or_huge(*pmd));
 172        pmd_clear(pmd);
 173        kvm_tlb_flush_vmid_ipa(kvm, addr);
 174        pte_free_kernel(NULL, pte_table);
 175        put_page(virt_to_page(pmd));
 176}
 177
 178/*
 179 * Unmapping vs dcache management:
 180 *
 181 * If a guest maps certain memory pages as uncached, all writes will
 182 * bypass the data cache and go directly to RAM.  However, the CPUs
 183 * can still speculate reads (not writes) and fill cache lines with
 184 * data.
 185 *
 186 * Those cache lines will be *clean* cache lines though, so a
 187 * clean+invalidate operation is equivalent to an invalidate
 188 * operation, because no cache lines are marked dirty.
 189 *
 190 * Those clean cache lines could be filled prior to an uncached write
 191 * by the guest, and the cache coherent IO subsystem would therefore
 192 * end up writing old data to disk.
 193 *
 194 * This is why right after unmapping a page/section and invalidating
 195 * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure
 196 * the IO subsystem will never hit in the cache.
 197 */
 198static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd,
 199                       phys_addr_t addr, phys_addr_t end)
 200{
 201        phys_addr_t start_addr = addr;
 202        pte_t *pte, *start_pte;
 203
 204        start_pte = pte = pte_offset_kernel(pmd, addr);
 205        do {
 206                if (!pte_none(*pte)) {
 207                        pte_t old_pte = *pte;
 208
 209                        kvm_set_pte(pte, __pte(0));
 210                        kvm_tlb_flush_vmid_ipa(kvm, addr);
 211
 212                        /* No need to invalidate the cache for device mappings */
 213                        if (!kvm_is_device_pfn(pte_pfn(old_pte)))
 214                                kvm_flush_dcache_pte(old_pte);
 215
 216                        put_page(virt_to_page(pte));
 217                }
 218        } while (pte++, addr += PAGE_SIZE, addr != end);
 219
 220        if (stage2_pte_table_empty(start_pte))
 221                clear_stage2_pmd_entry(kvm, pmd, start_addr);
 222}
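     /*
      * The unmap_stage2_* walkers all follow the same addr/next do-while
      * pattern: drop one page reference per cleared entry and free the table
      * page once it becomes empty.
      */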
 223
 224static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud,
 225                       phys_addr_t addr, phys_addr_t end)
 226{
 227        phys_addr_t next, start_addr = addr;
 228        pmd_t *pmd, *start_pmd;
 229
 230        start_pmd = pmd = stage2_pmd_offset(pud, addr);
 231        do {
 232                next = stage2_pmd_addr_end(addr, end);
 233                if (!pmd_none(*pmd)) {
 234                        if (pmd_thp_or_huge(*pmd)) {
 235                                pmd_t old_pmd = *pmd;
 236
 237                                pmd_clear(pmd);
 238                                kvm_tlb_flush_vmid_ipa(kvm, addr);
 239
 240                                kvm_flush_dcache_pmd(old_pmd);
 241
 242                                put_page(virt_to_page(pmd));
 243                        } else {
 244                                unmap_stage2_ptes(kvm, pmd, addr, next);
 245                        }
 246                }
 247        } while (pmd++, addr = next, addr != end);
 248
 249        if (stage2_pmd_table_empty(start_pmd))
 250                clear_stage2_pud_entry(kvm, pud, start_addr);
 251}
 252
 253static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd,
 254                       phys_addr_t addr, phys_addr_t end)
 255{
 256        phys_addr_t next, start_addr = addr;
 257        pud_t *pud, *start_pud;
 258
 259        start_pud = pud = stage2_pud_offset(pgd, addr);
 260        do {
 261                next = stage2_pud_addr_end(addr, end);
 262                if (!stage2_pud_none(*pud)) {
 263                        if (stage2_pud_huge(*pud)) {
 264                                pud_t old_pud = *pud;
 265
 266                                stage2_pud_clear(pud);
 267                                kvm_tlb_flush_vmid_ipa(kvm, addr);
 268                                kvm_flush_dcache_pud(old_pud);
 269                                put_page(virt_to_page(pud));
 270                        } else {
 271                                unmap_stage2_pmds(kvm, pud, addr, next);
 272                        }
 273                }
 274        } while (pud++, addr = next, addr != end);
 275
 276        if (stage2_pud_table_empty(start_pud))
 277                clear_stage2_pgd_entry(kvm, pgd, start_addr);
 278}
 279
 280/**
 281 * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
 282 * @kvm:   The VM pointer
 283 * @start: The intermediate physical base address of the range to unmap
 284 * @size:  The size of the area to unmap
 285 *
 286 * Clear a range of stage-2 mappings, lowering the various ref-counts.  Must
 287 * be called while holding mmu_lock (unless for freeing the stage2 pgd before
 288 * destroying the VM), otherwise another faulting VCPU may come in and mess
 289 * with things behind our backs.
 290 */
 291static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
 292{
 293        pgd_t *pgd;
 294        phys_addr_t addr = start, end = start + size;
 295        phys_addr_t next;
 296
 297        pgd = kvm->arch.pgd + stage2_pgd_index(addr);
 298        do {
 299                next = stage2_pgd_addr_end(addr, end);
 300                if (!stage2_pgd_none(*pgd))
 301                        unmap_stage2_puds(kvm, pgd, addr, next);
 302        } while (pgd++, addr = next, addr != end);
 303}
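     /*
      * As noted above, callers hold kvm->mmu_lock; for example,
      * kvm_unmap_hva_handler() further down unmaps a single guest page with
      * unmap_stage2_range(kvm, gpa, PAGE_SIZE).
      */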
 304
 305static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd,
 306                              phys_addr_t addr, phys_addr_t end)
 307{
 308        pte_t *pte;
 309
 310        pte = pte_offset_kernel(pmd, addr);
 311        do {
 312                if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte)))
 313                        kvm_flush_dcache_pte(*pte);
 314        } while (pte++, addr += PAGE_SIZE, addr != end);
 315}
 316
 317static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud,
 318                              phys_addr_t addr, phys_addr_t end)
 319{
 320        pmd_t *pmd;
 321        phys_addr_t next;
 322
 323        pmd = stage2_pmd_offset(pud, addr);
 324        do {
 325                next = stage2_pmd_addr_end(addr, end);
 326                if (!pmd_none(*pmd)) {
 327                        if (pmd_thp_or_huge(*pmd))
 328                                kvm_flush_dcache_pmd(*pmd);
 329                        else
 330                                stage2_flush_ptes(kvm, pmd, addr, next);
 331                }
 332        } while (pmd++, addr = next, addr != end);
 333}
 334
 335static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd,
 336                              phys_addr_t addr, phys_addr_t end)
 337{
 338        pud_t *pud;
 339        phys_addr_t next;
 340
 341        pud = stage2_pud_offset(pgd, addr);
 342        do {
 343                next = stage2_pud_addr_end(addr, end);
 344                if (!stage2_pud_none(*pud)) {
 345                        if (stage2_pud_huge(*pud))
 346                                kvm_flush_dcache_pud(*pud);
 347                        else
 348                                stage2_flush_pmds(kvm, pud, addr, next);
 349                }
 350        } while (pud++, addr = next, addr != end);
 351}
 352
 353static void stage2_flush_memslot(struct kvm *kvm,
 354                                 struct kvm_memory_slot *memslot)
 355{
 356        phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
 357        phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
 358        phys_addr_t next;
 359        pgd_t *pgd;
 360
 361        pgd = kvm->arch.pgd + stage2_pgd_index(addr);
 362        do {
 363                next = stage2_pgd_addr_end(addr, end);
 364                stage2_flush_puds(kvm, pgd, addr, next);
 365        } while (pgd++, addr = next, addr != end);
 366}
 367
 368/**
 369 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
 370 * @kvm: The struct kvm pointer
 371 *
 372 * Go through the stage 2 page tables and invalidate any cache lines
 373 * backing memory already mapped to the VM.
 374 */
 375static void stage2_flush_vm(struct kvm *kvm)
 376{
 377        struct kvm_memslots *slots;
 378        struct kvm_memory_slot *memslot;
 379        int idx;
 380
 381        idx = srcu_read_lock(&kvm->srcu);
 382        spin_lock(&kvm->mmu_lock);
 383
 384        slots = kvm_memslots(kvm);
 385        kvm_for_each_memslot(memslot, slots)
 386                stage2_flush_memslot(kvm, memslot);
 387
 388        spin_unlock(&kvm->mmu_lock);
 389        srcu_read_unlock(&kvm->srcu, idx);
 390}
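     /*
      * stage2_flush_vm() is typically used when the guest toggles its caches
      * (see kvm_toggle_cache() later in this file), so that stale lines backing
      * already-mapped guest memory cannot survive the switch.
      */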
 391
 392static void clear_hyp_pgd_entry(pgd_t *pgd)
 393{
 394        pud_t *pud_table __maybe_unused = pud_offset(pgd, 0UL);
 395        pgd_clear(pgd);
 396        pud_free(NULL, pud_table);
 397        put_page(virt_to_page(pgd));
 398}
 399
 400static void clear_hyp_pud_entry(pud_t *pud)
 401{
 402        pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0);
 403        VM_BUG_ON(pud_huge(*pud));
 404        pud_clear(pud);
 405        pmd_free(NULL, pmd_table);
 406        put_page(virt_to_page(pud));
 407}
 408
 409static void clear_hyp_pmd_entry(pmd_t *pmd)
 410{
 411        pte_t *pte_table = pte_offset_kernel(pmd, 0);
 412        VM_BUG_ON(pmd_thp_or_huge(*pmd));
 413        pmd_clear(pmd);
 414        pte_free_kernel(NULL, pte_table);
 415        put_page(virt_to_page(pmd));
 416}
 417
 418static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
 419{
 420        pte_t *pte, *start_pte;
 421
 422        start_pte = pte = pte_offset_kernel(pmd, addr);
 423        do {
 424                if (!pte_none(*pte)) {
 425                        kvm_set_pte(pte, __pte(0));
 426                        put_page(virt_to_page(pte));
 427                }
 428        } while (pte++, addr += PAGE_SIZE, addr != end);
 429
 430        if (hyp_pte_table_empty(start_pte))
 431                clear_hyp_pmd_entry(pmd);
 432}
 433
 434static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
 435{
 436        phys_addr_t next;
 437        pmd_t *pmd, *start_pmd;
 438
 439        start_pmd = pmd = pmd_offset(pud, addr);
 440        do {
 441                next = pmd_addr_end(addr, end);
 442                /* Hyp doesn't use huge pmds */
 443                if (!pmd_none(*pmd))
 444                        unmap_hyp_ptes(pmd, addr, next);
 445        } while (pmd++, addr = next, addr != end);
 446
 447        if (hyp_pmd_table_empty(start_pmd))
 448                clear_hyp_pud_entry(pud);
 449}
 450
 451static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
 452{
 453        phys_addr_t next;
 454        pud_t *pud, *start_pud;
 455
 456        start_pud = pud = pud_offset(pgd, addr);
 457        do {
 458                next = pud_addr_end(addr, end);
 459                /* Hyp doesn't use huge puds */
 460                if (!pud_none(*pud))
 461                        unmap_hyp_pmds(pud, addr, next);
 462        } while (pud++, addr = next, addr != end);
 463
 464        if (hyp_pud_table_empty(start_pud))
 465                clear_hyp_pgd_entry(pgd);
 466}
 467
 468static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
 469{
 470        pgd_t *pgd;
 471        phys_addr_t addr = start, end = start + size;
 472        phys_addr_t next;
 473
 474        /*
 475         * We don't unmap anything from HYP, except at the hyp tear down.
 476         * Hence, we don't have to invalidate the TLBs here.
 477         */
 478        pgd = pgdp + pgd_index(addr);
 479        do {
 480                next = pgd_addr_end(addr, end);
 481                if (!pgd_none(*pgd))
 482                        unmap_hyp_puds(pgd, addr, next);
 483        } while (pgd++, addr = next, addr != end);
 484}
 485
 486/**
 487 * free_boot_hyp_pgd - free HYP boot page tables
 488 *
  489 * Free the HYP boot page tables.
 490 */
 491void free_boot_hyp_pgd(void)
 492{
 493        mutex_lock(&kvm_hyp_pgd_mutex);
 494
 495        if (boot_hyp_pgd) {
 496                unmap_hyp_range(boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
 497                unmap_hyp_range(boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
 498                free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
 499                boot_hyp_pgd = NULL;
 500        }
 501
 502        if (hyp_pgd)
 503                unmap_hyp_range(hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
 504
 505        mutex_unlock(&kvm_hyp_pgd_mutex);
 506}
 507
 508/**
 509 * free_hyp_pgds - free Hyp-mode page tables
 510 *
 511 * Assumes hyp_pgd is a page table used strictly in Hyp-mode and
 512 * therefore contains either mappings in the kernel memory area (above
 513 * PAGE_OFFSET), or device mappings in the vmalloc range (from
 514 * VMALLOC_START to VMALLOC_END).
 515 *
 516 * boot_hyp_pgd should only map two pages for the init code.
 517 */
 518void free_hyp_pgds(void)
 519{
 520        unsigned long addr;
 521
 522        free_boot_hyp_pgd();
 523
 524        mutex_lock(&kvm_hyp_pgd_mutex);
 525
 526        if (hyp_pgd) {
 527                for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE)
 528                        unmap_hyp_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
 529                for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE)
 530                        unmap_hyp_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
 531
 532                free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
 533                hyp_pgd = NULL;
 534        }
 535        if (merged_hyp_pgd) {
 536                clear_page(merged_hyp_pgd);
 537                free_page((unsigned long)merged_hyp_pgd);
 538                merged_hyp_pgd = NULL;
 539        }
 540
 541        mutex_unlock(&kvm_hyp_pgd_mutex);
 542}
 543
 544static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
 545                                    unsigned long end, unsigned long pfn,
 546                                    pgprot_t prot)
 547{
 548        pte_t *pte;
 549        unsigned long addr;
 550
 551        addr = start;
 552        do {
 553                pte = pte_offset_kernel(pmd, addr);
 554                kvm_set_pte(pte, pfn_pte(pfn, prot));
 555                get_page(virt_to_page(pte));
 556                kvm_flush_dcache_to_poc(pte, sizeof(*pte));
 557                pfn++;
 558        } while (addr += PAGE_SIZE, addr != end);
 559}
 560
 561static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start,
 562                                   unsigned long end, unsigned long pfn,
 563                                   pgprot_t prot)
 564{
 565        pmd_t *pmd;
 566        pte_t *pte;
 567        unsigned long addr, next;
 568
 569        addr = start;
 570        do {
 571                pmd = pmd_offset(pud, addr);
 572
 573                BUG_ON(pmd_sect(*pmd));
 574
 575                if (pmd_none(*pmd)) {
 576                        pte = pte_alloc_one_kernel(NULL, addr);
 577                        if (!pte) {
 578                                kvm_err("Cannot allocate Hyp pte\n");
 579                                return -ENOMEM;
 580                        }
 581                        pmd_populate_kernel(NULL, pmd, pte);
 582                        get_page(virt_to_page(pmd));
 583                        kvm_flush_dcache_to_poc(pmd, sizeof(*pmd));
 584                }
 585
 586                next = pmd_addr_end(addr, end);
 587
 588                create_hyp_pte_mappings(pmd, addr, next, pfn, prot);
 589                pfn += (next - addr) >> PAGE_SHIFT;
 590        } while (addr = next, addr != end);
 591
 592        return 0;
 593}
 594
 595static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start,
 596                                   unsigned long end, unsigned long pfn,
 597                                   pgprot_t prot)
 598{
 599        pud_t *pud;
 600        pmd_t *pmd;
 601        unsigned long addr, next;
 602        int ret;
 603
 604        addr = start;
 605        do {
 606                pud = pud_offset(pgd, addr);
 607
 608                if (pud_none_or_clear_bad(pud)) {
 609                        pmd = pmd_alloc_one(NULL, addr);
 610                        if (!pmd) {
 611                                kvm_err("Cannot allocate Hyp pmd\n");
 612                                return -ENOMEM;
 613                        }
 614                        pud_populate(NULL, pud, pmd);
 615                        get_page(virt_to_page(pud));
 616                        kvm_flush_dcache_to_poc(pud, sizeof(*pud));
 617                }
 618
 619                next = pud_addr_end(addr, end);
 620                ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot);
 621                if (ret)
 622                        return ret;
 623                pfn += (next - addr) >> PAGE_SHIFT;
 624        } while (addr = next, addr != end);
 625
 626        return 0;
 627}
 628
 629static int __create_hyp_mappings(pgd_t *pgdp,
 630                                 unsigned long start, unsigned long end,
 631                                 unsigned long pfn, pgprot_t prot)
 632{
 633        pgd_t *pgd;
 634        pud_t *pud;
 635        unsigned long addr, next;
 636        int err = 0;
 637
 638        mutex_lock(&kvm_hyp_pgd_mutex);
 639        addr = start & PAGE_MASK;
 640        end = PAGE_ALIGN(end);
 641        do {
 642                pgd = pgdp + pgd_index(addr);
 643
 644                if (pgd_none(*pgd)) {
 645                        pud = pud_alloc_one(NULL, addr);
 646                        if (!pud) {
 647                                kvm_err("Cannot allocate Hyp pud\n");
 648                                err = -ENOMEM;
 649                                goto out;
 650                        }
 651                        pgd_populate(NULL, pgd, pud);
 652                        get_page(virt_to_page(pgd));
 653                        kvm_flush_dcache_to_poc(pgd, sizeof(*pgd));
 654                }
 655
 656                next = pgd_addr_end(addr, end);
 657                err = create_hyp_pud_mappings(pgd, addr, next, pfn, prot);
 658                if (err)
 659                        goto out;
 660                pfn += (next - addr) >> PAGE_SHIFT;
 661        } while (addr = next, addr != end);
 662out:
 663        mutex_unlock(&kvm_hyp_pgd_mutex);
 664        return err;
 665}
 666
 667static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
 668{
 669        if (!is_vmalloc_addr(kaddr)) {
 670                BUG_ON(!virt_addr_valid(kaddr));
 671                return __pa(kaddr);
 672        } else {
 673                return page_to_phys(vmalloc_to_page(kaddr)) +
 674                       offset_in_page(kaddr);
 675        }
 676}
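     /*
      * Lowmem addresses are covered by the kernel linear map and translate
      * directly with __pa(); vmalloc/ioremap addresses are not, so the backing
      * page is looked up with vmalloc_to_page() instead.
      */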
 677
 678/**
 679 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
 680 * @from:       The virtual kernel start address of the range
 681 * @to:         The virtual kernel end address of the range (exclusive)
 682 *
 683 * The same virtual address as the kernel virtual address is also used
 684 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
 685 * physical pages.
 686 */
 687int create_hyp_mappings(void *from, void *to)
 688{
 689        phys_addr_t phys_addr;
 690        unsigned long virt_addr;
 691        unsigned long start = KERN_TO_HYP((unsigned long)from);
 692        unsigned long end = KERN_TO_HYP((unsigned long)to);
 693
 694        if (is_kernel_in_hyp_mode())
 695                return 0;
 696
 697        start = start & PAGE_MASK;
 698        end = PAGE_ALIGN(end);
 699
 700        for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
 701                int err;
 702
 703                phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
 704                err = __create_hyp_mappings(hyp_pgd, virt_addr,
 705                                            virt_addr + PAGE_SIZE,
 706                                            __phys_to_pfn(phys_addr),
 707                                            PAGE_HYP);
 708                if (err)
 709                        return err;
 710        }
 711
 712        return 0;
 713}
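     /*
      * Illustrative example (see the arch init code for the exact calls): the
      * HYP text section is typically mapped with something like
      * create_hyp_mappings(__hyp_text_start, __hyp_text_end).
      */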
 714
 715/**
 716 * create_hyp_io_mappings - duplicate a kernel IO mapping into Hyp mode
 717 * @from:       The kernel start VA of the range
 718 * @to:         The kernel end VA of the range (exclusive)
 719 * @phys_addr:  The physical start address which gets mapped
 720 *
 721 * The resulting HYP VA is the same as the kernel VA, modulo
 722 * HYP_PAGE_OFFSET.
 723 */
 724int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr)
 725{
 726        unsigned long start = KERN_TO_HYP((unsigned long)from);
 727        unsigned long end = KERN_TO_HYP((unsigned long)to);
 728
 729        if (is_kernel_in_hyp_mode())
 730                return 0;
 731
 732        /* Check for a valid kernel IO mapping */
 733        if (!is_vmalloc_addr(from) || !is_vmalloc_addr(to - 1))
 734                return -EINVAL;
 735
 736        return __create_hyp_mappings(hyp_pgd, start, end,
 737                                     __phys_to_pfn(phys_addr), PAGE_HYP_DEVICE);
 738}
 739
 740/**
 741 * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
 742 * @kvm:        The KVM struct pointer for the VM.
 743 *
 744 * Allocates only the stage-2 HW PGD level table(s) (can support either full
 745 * 40-bit input addresses or limited to 32-bit input addresses). Clears the
 746 * allocated pages.
 747 *
 748 * Note we don't need locking here as this is only called when the VM is
 749 * created, which can only be done once.
 750 */
 751int kvm_alloc_stage2_pgd(struct kvm *kvm)
 752{
 753        pgd_t *pgd;
 754
 755        if (kvm->arch.pgd != NULL) {
 756                kvm_err("kvm_arch already initialized?\n");
 757                return -EINVAL;
 758        }
 759
 760        /* Allocate the HW PGD, making sure that each page gets its own refcount */
 761        pgd = alloc_pages_exact(S2_PGD_SIZE, GFP_KERNEL | __GFP_ZERO);
 762        if (!pgd)
 763                return -ENOMEM;
 764
 765        kvm_clean_pgd(pgd);
 766        kvm->arch.pgd = pgd;
 767        return 0;
 768}
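     /*
      * Rough sizing sketch (the exact values come from kvm_mmu.h): with a
      * 40-bit IPA space and a 30-bit first-level shift, PTRS_PER_S2_PGD would
      * be 1 << 10 = 1024 entries, i.e. an 8KB S2_PGD_SIZE spanning more than
      * one page, which is why alloc_pages_exact() is used here rather than a
      * single higher-order allocation.
      */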
 769
 770static void stage2_unmap_memslot(struct kvm *kvm,
 771                                 struct kvm_memory_slot *memslot)
 772{
 773        hva_t hva = memslot->userspace_addr;
 774        phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
 775        phys_addr_t size = PAGE_SIZE * memslot->npages;
 776        hva_t reg_end = hva + size;
 777
 778        /*
 779         * A memory region could potentially cover multiple VMAs, and any holes
 780         * between them, so iterate over all of them to find out if we should
 781         * unmap any of them.
 782         *
 783         *     +--------------------------------------------+
 784         * +---------------+----------------+   +----------------+
 785         * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
 786         * +---------------+----------------+   +----------------+
 787         *     |               memory region                |
 788         *     +--------------------------------------------+
 789         */
 790        do {
 791                struct vm_area_struct *vma = find_vma(current->mm, hva);
 792                hva_t vm_start, vm_end;
 793
 794                if (!vma || vma->vm_start >= reg_end)
 795                        break;
 796
 797                /*
 798                 * Take the intersection of this VMA with the memory region
 799                 */
 800                vm_start = max(hva, vma->vm_start);
 801                vm_end = min(reg_end, vma->vm_end);
 802
 803                if (!(vma->vm_flags & VM_PFNMAP)) {
 804                        gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
 805                        unmap_stage2_range(kvm, gpa, vm_end - vm_start);
 806                }
 807                hva = vm_end;
 808        } while (hva < reg_end);
 809}
 810
 811/**
 812 * stage2_unmap_vm - Unmap Stage-2 RAM mappings
 813 * @kvm: The struct kvm pointer
 814 *
  815 * Go through the memory regions and unmap any regular RAM
 816 * backing memory already mapped to the VM.
 817 */
 818void stage2_unmap_vm(struct kvm *kvm)
 819{
 820        struct kvm_memslots *slots;
 821        struct kvm_memory_slot *memslot;
 822        int idx;
 823
 824        idx = srcu_read_lock(&kvm->srcu);
 825        spin_lock(&kvm->mmu_lock);
 826
 827        slots = kvm_memslots(kvm);
 828        kvm_for_each_memslot(memslot, slots)
 829                stage2_unmap_memslot(kvm, memslot);
 830
 831        spin_unlock(&kvm->mmu_lock);
 832        srcu_read_unlock(&kvm->srcu, idx);
 833}
 834
 835/**
 836 * kvm_free_stage2_pgd - free all stage-2 tables
 837 * @kvm:        The KVM struct pointer for the VM.
 838 *
 839 * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all
 840 * underlying level-2 and level-3 tables before freeing the actual level-1 table
 841 * and setting the struct pointer to NULL.
 842 *
 843 * Note we don't need locking here as this is only called when the VM is
 844 * destroyed, which can only be done once.
 845 */
 846void kvm_free_stage2_pgd(struct kvm *kvm)
 847{
 848        if (kvm->arch.pgd == NULL)
 849                return;
 850
 851        unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE);
 852        /* Free the HW pgd, one page at a time */
 853        free_pages_exact(kvm->arch.pgd, S2_PGD_SIZE);
 854        kvm->arch.pgd = NULL;
 855}
 856
 857static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
 858                             phys_addr_t addr)
 859{
 860        pgd_t *pgd;
 861        pud_t *pud;
 862
 863        pgd = kvm->arch.pgd + stage2_pgd_index(addr);
 864        if (WARN_ON(stage2_pgd_none(*pgd))) {
 865                if (!cache)
 866                        return NULL;
 867                pud = mmu_memory_cache_alloc(cache);
 868                stage2_pgd_populate(pgd, pud);
 869                get_page(virt_to_page(pgd));
 870        }
 871
 872        return stage2_pud_offset(pgd, addr);
 873}
 874
 875static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
 876                             phys_addr_t addr)
 877{
 878        pud_t *pud;
 879        pmd_t *pmd;
 880
 881        pud = stage2_get_pud(kvm, cache, addr);
 882        if (stage2_pud_none(*pud)) {
 883                if (!cache)
 884                        return NULL;
 885                pmd = mmu_memory_cache_alloc(cache);
 886                stage2_pud_populate(pud, pmd);
 887                get_page(virt_to_page(pud));
 888        }
 889
 890        return stage2_pmd_offset(pud, addr);
 891}
 892
 893static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
 894                               *cache, phys_addr_t addr, const pmd_t *new_pmd)
 895{
 896        pmd_t *pmd, old_pmd;
 897
 898        pmd = stage2_get_pmd(kvm, cache, addr);
 899        VM_BUG_ON(!pmd);
 900
 901        /*
 902         * Mapping in huge pages should only happen through a fault.  If a
 903         * page is merged into a transparent huge page, the individual
 904         * subpages of that huge page should be unmapped through MMU
 905         * notifiers before we get here.
 906         *
  907         * Merging of CompoundPages is not supported; they should be split
  908         * first, then unmapped, merged, and mapped back in on demand.
 909         */
 910        VM_BUG_ON(pmd_present(*pmd) && pmd_pfn(*pmd) != pmd_pfn(*new_pmd));
 911
 912        old_pmd = *pmd;
 913        if (pmd_present(old_pmd)) {
 914                pmd_clear(pmd);
 915                kvm_tlb_flush_vmid_ipa(kvm, addr);
 916        } else {
 917                get_page(virt_to_page(pmd));
 918        }
 919
 920        kvm_set_pmd(pmd, *new_pmd);
 921        return 0;
 922}
 923
 924static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
 925                          phys_addr_t addr, const pte_t *new_pte,
 926                          unsigned long flags)
 927{
 928        pmd_t *pmd;
 929        pte_t *pte, old_pte;
 930        bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
 931        bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE;
 932
 933        VM_BUG_ON(logging_active && !cache);
 934
 935        /* Create stage-2 page table mapping - Levels 0 and 1 */
 936        pmd = stage2_get_pmd(kvm, cache, addr);
 937        if (!pmd) {
 938                /*
 939                 * Ignore calls from kvm_set_spte_hva for unallocated
 940                 * address ranges.
 941                 */
 942                return 0;
 943        }
 944
 945        /*
  946         * While dirty page logging is active, dissolve any huge PMD first,
  947         * then continue on to allocate a page table for regular PTE mappings.
 948         */
 949        if (logging_active)
 950                stage2_dissolve_pmd(kvm, addr, pmd);
 951
 952        /* Create stage-2 page mappings - Level 2 */
 953        if (pmd_none(*pmd)) {
 954                if (!cache)
 955                        return 0; /* ignore calls from kvm_set_spte_hva */
 956                pte = mmu_memory_cache_alloc(cache);
 957                kvm_clean_pte(pte);
 958                pmd_populate_kernel(NULL, pmd, pte);
 959                get_page(virt_to_page(pmd));
 960        }
 961
 962        pte = pte_offset_kernel(pmd, addr);
 963
 964        if (iomap && pte_present(*pte))
 965                return -EFAULT;
 966
 967        /* Create 2nd stage page table mapping - Level 3 */
 968        old_pte = *pte;
 969        if (pte_present(old_pte)) {
 970                kvm_set_pte(pte, __pte(0));
 971                kvm_tlb_flush_vmid_ipa(kvm, addr);
 972        } else {
 973                get_page(virt_to_page(pte));
 974        }
 975
 976        kvm_set_pte(pte, *new_pte);
 977        return 0;
 978}
 979
 980#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
 981static int stage2_ptep_test_and_clear_young(pte_t *pte)
 982{
 983        if (pte_young(*pte)) {
 984                *pte = pte_mkold(*pte);
 985                return 1;
 986        }
 987        return 0;
 988}
 989#else
 990static int stage2_ptep_test_and_clear_young(pte_t *pte)
 991{
 992        return __ptep_test_and_clear_young(pte);
 993}
 994#endif
 995
 996static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
 997{
 998        return stage2_ptep_test_and_clear_young((pte_t *)pmd);
 999}
1000
1001/**
1002 * kvm_phys_addr_ioremap - map a device range to guest IPA
1003 *
1004 * @kvm:        The KVM pointer
1005 * @guest_ipa:  The IPA at which to insert the mapping
1006 * @pa:         The physical address of the device
1007 * @size:       The size of the mapping
1008 */
1009int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
1010                          phys_addr_t pa, unsigned long size, bool writable)
1011{
1012        phys_addr_t addr, end;
1013        int ret = 0;
1014        unsigned long pfn;
1015        struct kvm_mmu_memory_cache cache = { 0, };
1016
1017        end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK;
1018        pfn = __phys_to_pfn(pa);
1019
1020        for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
1021                pte_t pte = pfn_pte(pfn, PAGE_S2_DEVICE);
1022
1023                if (writable)
1024                        pte = kvm_s2pte_mkwrite(pte);
1025
1026                ret = mmu_topup_memory_cache(&cache, KVM_MMU_CACHE_MIN_PAGES,
1027                                                KVM_NR_MEM_OBJS);
1028                if (ret)
1029                        goto out;
1030                spin_lock(&kvm->mmu_lock);
1031                ret = stage2_set_pte(kvm, &cache, addr, &pte,
1032                                                KVM_S2PTE_FLAG_IS_IOMAP);
1033                spin_unlock(&kvm->mmu_lock);
1034                if (ret)
1035                        goto out;
1036
1037                pfn++;
1038        }
1039
1040out:
1041        mmu_free_memory_cache(&cache);
1042        return ret;
1043}
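     /*
      * Used, for example, by the VGIC code to map the GIC virtual CPU
      * interface into the guest, with a call roughly of the form
      * kvm_phys_addr_ioremap(kvm, vgic_cpu_base, vcpu_base, KVM_VGIC_V2_CPU_SIZE, true)
      * (shown for illustration only; see the vgic code for the exact call).
      */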
1044
1045static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap)
1046{
1047        kvm_pfn_t pfn = *pfnp;
1048        gfn_t gfn = *ipap >> PAGE_SHIFT;
1049
1050        if (PageTransCompoundMap(pfn_to_page(pfn))) {
1051                unsigned long mask;
1052                /*
1053                 * The address we faulted on is backed by a transparent huge
1054                 * page.  However, because we map the compound huge page and
1055                 * not the individual tail page, we need to transfer the
1056                 * refcount to the head page.  We have to be careful that the
1057                 * THP doesn't start to split while we are adjusting the
1058                 * refcounts.
1059                 *
1060                 * We are sure this doesn't happen, because mmu_notifier_retry
1061                 * was successful and we are holding the mmu_lock, so if this
1062                 * THP is trying to split, it will be blocked in the mmu
1063                 * notifier before touching any of the pages, specifically
1064                 * before being able to call __split_huge_page_refcount().
1065                 *
1066                 * We can therefore safely transfer the refcount from PG_tail
1067                 * to PG_head and switch the pfn from a tail page to the head
1068                 * page accordingly.
1069                 */
1070                mask = PTRS_PER_PMD - 1;
1071                VM_BUG_ON((gfn & mask) != (pfn & mask));
1072                if (pfn & mask) {
1073                        *ipap &= PMD_MASK;
1074                        kvm_release_pfn_clean(pfn);
1075                        pfn &= ~mask;
1076                        kvm_get_pfn(pfn);
1077                        *pfnp = pfn;
1078                }
1079
1080                return true;
1081        }
1082
1083        return false;
1084}
1085
1086static bool kvm_is_write_fault(struct kvm_vcpu *vcpu)
1087{
1088        if (kvm_vcpu_trap_is_iabt(vcpu))
1089                return false;
1090
1091        return kvm_vcpu_dabt_iswrite(vcpu);
1092}
1093
1094/**
1095 * stage2_wp_ptes - write protect PMD range
1096 * @pmd:        pointer to pmd entry
1097 * @addr:       range start address
1098 * @end:        range end address
1099 */
1100static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
1101{
1102        pte_t *pte;
1103
1104        pte = pte_offset_kernel(pmd, addr);
1105        do {
1106                if (!pte_none(*pte)) {
1107                        if (!kvm_s2pte_readonly(pte))
1108                                kvm_set_s2pte_readonly(pte);
1109                }
1110        } while (pte++, addr += PAGE_SIZE, addr != end);
1111}
1112
1113/**
1114 * stage2_wp_pmds - write protect PUD range
1115 * @pud:        pointer to pud entry
1116 * @addr:       range start address
1117 * @end:        range end address
1118 */
1119static void stage2_wp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
1120{
1121        pmd_t *pmd;
1122        phys_addr_t next;
1123
1124        pmd = stage2_pmd_offset(pud, addr);
1125
1126        do {
1127                next = stage2_pmd_addr_end(addr, end);
1128                if (!pmd_none(*pmd)) {
1129                        if (pmd_thp_or_huge(*pmd)) {
1130                                if (!kvm_s2pmd_readonly(pmd))
1131                                        kvm_set_s2pmd_readonly(pmd);
1132                        } else {
1133                                stage2_wp_ptes(pmd, addr, next);
1134                        }
1135                }
1136        } while (pmd++, addr = next, addr != end);
1137}
1138
1139/**
 1140 * stage2_wp_puds - write protect PGD range
 1141 * @pgd:        pointer to pgd entry
 1142 * @addr:       range start address
 1143 * @end:        range end address
 1144 *
 1145 * Process PUD entries; huge PUDs are not supported and trigger a BUG().
 1146 */
 1147static void stage2_wp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
1148{
1149        pud_t *pud;
1150        phys_addr_t next;
1151
1152        pud = stage2_pud_offset(pgd, addr);
1153        do {
1154                next = stage2_pud_addr_end(addr, end);
1155                if (!stage2_pud_none(*pud)) {
1156                        /* TODO:PUD not supported, revisit later if supported */
1157                        BUG_ON(stage2_pud_huge(*pud));
1158                        stage2_wp_pmds(pud, addr, next);
1159                }
1160        } while (pud++, addr = next, addr != end);
1161}
1162
1163/**
1164 * stage2_wp_range() - write protect stage2 memory region range
1165 * @kvm:        The KVM pointer
1166 * @addr:       Start address of range
1167 * @end:        End address of range
1168 */
1169static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
1170{
1171        pgd_t *pgd;
1172        phys_addr_t next;
1173
1174        pgd = kvm->arch.pgd + stage2_pgd_index(addr);
1175        do {
1176                /*
1177                 * Release kvm_mmu_lock periodically if the memory region is
1178                 * large. Otherwise, we may see kernel panics with
1179                 * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR,
1180                 * CONFIG_LOCKDEP. Additionally, holding the lock too long
1181                 * will also starve other vCPUs.
1182                 */
1183                if (need_resched() || spin_needbreak(&kvm->mmu_lock))
1184                        cond_resched_lock(&kvm->mmu_lock);
1185
1186                next = stage2_pgd_addr_end(addr, end);
1187                if (stage2_pgd_present(*pgd))
1188                        stage2_wp_puds(pgd, addr, next);
1189        } while (pgd++, addr = next, addr != end);
1190}
1191
1192/**
1193 * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
1194 * @kvm:        The KVM pointer
1195 * @slot:       The memory slot to write protect
1196 *
 1197 * Called to start logging dirty pages after the KVM_MEM_LOG_DIRTY_PAGES
 1198 * memory region operation. After this function returns, all present PMDs
 1199 * and PTEs in the memory region are write protected, and the dirty page
 1200 * log can then be read.
1201 *
1202 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
1203 * serializing operations for VM memory regions.
1204 */
1205void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
1206{
1207        struct kvm_memslots *slots = kvm_memslots(kvm);
1208        struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
1209        phys_addr_t start = memslot->base_gfn << PAGE_SHIFT;
1210        phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
1211
1212        spin_lock(&kvm->mmu_lock);
1213        stage2_wp_range(kvm, start, end);
1214        spin_unlock(&kvm->mmu_lock);
1215        kvm_flush_remote_tlbs(kvm);
1216}
1217
1218/**
1219 * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
1220 * @kvm:        The KVM pointer
1221 * @slot:       The memory slot associated with mask
1222 * @gfn_offset: The gfn offset in memory slot
1223 * @mask:       The mask of dirty pages at offset 'gfn_offset' in this memory
1224 *              slot to be write protected
1225 *
 1226 * Walks the bits set in @mask and write protects the associated PTEs. The
 1227 * caller must hold kvm_mmu_lock.
1228 */
1229static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1230                struct kvm_memory_slot *slot,
1231                gfn_t gfn_offset, unsigned long mask)
1232{
1233        phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
1234        phys_addr_t start = (base_gfn +  __ffs(mask)) << PAGE_SHIFT;
1235        phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
1236
1237        stage2_wp_range(kvm, start, end);
1238}
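     /*
      * Worked example: with gfn_offset == 0 and mask == 0x6 (bits 1 and 2 set),
      * __ffs(mask) == 1 and __fls(mask) == 2, so the write protected range is
      * [(base_gfn + 1) << PAGE_SHIFT, (base_gfn + 3) << PAGE_SHIFT), i.e. the
      * two pages whose dirty bits were set in the mask.
      */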
1239
1240/*
1241 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
1242 * dirty pages.
1243 *
1244 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
1245 * enable dirty logging for them.
1246 */
1247void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1248                struct kvm_memory_slot *slot,
1249                gfn_t gfn_offset, unsigned long mask)
1250{
1251        kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1252}
1253
1254static void coherent_cache_guest_page(struct kvm_vcpu *vcpu, kvm_pfn_t pfn,
1255                                      unsigned long size, bool uncached)
1256{
1257        __coherent_cache_guest_page(vcpu, pfn, size, uncached);
1258}
1259
1260static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
1261                          struct kvm_memory_slot *memslot, unsigned long hva,
1262                          unsigned long fault_status)
1263{
1264        int ret;
1265        bool write_fault, writable, hugetlb = false, force_pte = false;
1266        unsigned long mmu_seq;
1267        gfn_t gfn = fault_ipa >> PAGE_SHIFT;
1268        struct kvm *kvm = vcpu->kvm;
1269        struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
1270        struct vm_area_struct *vma;
1271        kvm_pfn_t pfn;
1272        pgprot_t mem_type = PAGE_S2;
1273        bool fault_ipa_uncached;
1274        bool logging_active = memslot_is_logging(memslot);
1275        unsigned long flags = 0;
1276
1277        write_fault = kvm_is_write_fault(vcpu);
1278        if (fault_status == FSC_PERM && !write_fault) {
1279                kvm_err("Unexpected L2 read permission error\n");
1280                return -EFAULT;
1281        }
1282
1283        /* Let's check if we will get back a huge page backed by hugetlbfs */
1284        down_read(&current->mm->mmap_sem);
1285        vma = find_vma_intersection(current->mm, hva, hva + 1);
1286        if (unlikely(!vma)) {
1287                kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
1288                up_read(&current->mm->mmap_sem);
1289                return -EFAULT;
1290        }
1291
1292        if (is_vm_hugetlb_page(vma) && !logging_active) {
1293                hugetlb = true;
1294                gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
1295        } else {
1296                /*
1297                 * Pages belonging to memslots that don't have the same
1298                 * alignment for userspace and IPA cannot be mapped using
1299                 * block descriptors even if the pages belong to a THP for
1300                 * the process, because the stage-2 block descriptor will
 1301                 * cover more than a single THP and we lose atomicity for
1302                 * unmapping, updates, and splits of the THP or other pages
1303                 * in the stage-2 block range.
1304                 */
1305                if ((memslot->userspace_addr & ~PMD_MASK) !=
1306                    ((memslot->base_gfn << PAGE_SHIFT) & ~PMD_MASK))
1307                        force_pte = true;
1308        }
1309        up_read(&current->mm->mmap_sem);
1310
1311        /* We need minimum second+third level pages */
1312        ret = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES,
1313                                     KVM_NR_MEM_OBJS);
1314        if (ret)
1315                return ret;
1316
1317        mmu_seq = vcpu->kvm->mmu_notifier_seq;
1318        /*
1319         * Ensure the read of mmu_notifier_seq happens before we call
1320         * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
1321         * the page we just got a reference to gets unmapped before we have a
 1322         * chance to grab the mmu_lock, which ensures that if the page gets
1323         * unmapped afterwards, the call to kvm_unmap_hva will take it away
1324         * from us again properly. This smp_rmb() interacts with the smp_wmb()
1325         * in kvm_mmu_notifier_invalidate_<page|range_end>.
1326         */
1327        smp_rmb();
1328
1329        pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
1330        if (is_error_pfn(pfn))
1331                return -EFAULT;
1332
1333        if (kvm_is_device_pfn(pfn)) {
1334                mem_type = PAGE_S2_DEVICE;
1335                flags |= KVM_S2PTE_FLAG_IS_IOMAP;
1336        } else if (logging_active) {
1337                /*
1338                 * Faults on pages in a memslot with logging enabled
1339                 * should not be mapped with huge pages (it introduces churn
1340                 * and performance degradation), so force a pte mapping.
1341                 */
1342                force_pte = true;
1343                flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
1344
1345                /*
1346                 * Only actually map the page as writable if this was a write
1347                 * fault.
1348                 */
1349                if (!write_fault)
1350                        writable = false;
1351        }
1352
1353        spin_lock(&kvm->mmu_lock);
1354        if (mmu_notifier_retry(kvm, mmu_seq))
1355                goto out_unlock;
1356
1357        if (!hugetlb && !force_pte)
1358                hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
1359
1360        fault_ipa_uncached = memslot->flags & KVM_MEMSLOT_INCOHERENT;
1361
1362        if (hugetlb) {
1363                pmd_t new_pmd = pfn_pmd(pfn, mem_type);
1364                new_pmd = pmd_mkhuge(new_pmd);
1365                if (writable) {
1366                        new_pmd = kvm_s2pmd_mkwrite(new_pmd);
1367                        kvm_set_pfn_dirty(pfn);
1368                }
1369                coherent_cache_guest_page(vcpu, pfn, PMD_SIZE, fault_ipa_uncached);
1370                ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
1371        } else {
1372                pte_t new_pte = pfn_pte(pfn, mem_type);
1373
1374                if (writable) {
1375                        new_pte = kvm_s2pte_mkwrite(new_pte);
1376                        kvm_set_pfn_dirty(pfn);
1377                        mark_page_dirty(kvm, gfn);
1378                }
1379                coherent_cache_guest_page(vcpu, pfn, PAGE_SIZE, fault_ipa_uncached);
1380                ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
1381        }
1382
1383out_unlock:
1384        spin_unlock(&kvm->mmu_lock);
1385        kvm_set_pfn_accessed(pfn);
1386        kvm_release_pfn_clean(pfn);
1387        return ret;
1388}
1389
1390/*
1391 * Resolve the access fault by making the page young again.
1392 * Note that because the faulting entry is guaranteed not to be
1393 * cached in the TLB, we don't need to invalidate anything.
1394 * Only the HW Access Flag updates are supported for Stage 2 (no DBM),
1395 * so there is no need for atomic (pte|pmd)_mkyoung operations.
1396 */
1397static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
1398{
1399        pmd_t *pmd;
1400        pte_t *pte;
1401        kvm_pfn_t pfn;
1402        bool pfn_valid = false;
1403
1404        trace_kvm_access_fault(fault_ipa);
1405
1406        spin_lock(&vcpu->kvm->mmu_lock);
1407
1408        pmd = stage2_get_pmd(vcpu->kvm, NULL, fault_ipa);
1409        if (!pmd || pmd_none(*pmd))     /* Nothing there */
1410                goto out;
1411
1412        if (pmd_thp_or_huge(*pmd)) {    /* THP, HugeTLB */
1413                *pmd = pmd_mkyoung(*pmd);
1414                pfn = pmd_pfn(*pmd);
1415                pfn_valid = true;
1416                goto out;
1417        }
1418
1419        pte = pte_offset_kernel(pmd, fault_ipa);
1420        if (pte_none(*pte))             /* Nothing there either */
1421                goto out;
1422
1423        *pte = pte_mkyoung(*pte);       /* Just a page... */
1424        pfn = pte_pfn(*pte);
1425        pfn_valid = true;
1426out:
1427        spin_unlock(&vcpu->kvm->mmu_lock);
1428        if (pfn_valid)
1429                kvm_set_pfn_accessed(pfn);
1430}
1431
1432/**
1433 * kvm_handle_guest_abort - handles all 2nd stage aborts
1434 * @vcpu:       the VCPU pointer
1435 * @run:        the kvm_run structure
1436 *
1437 * Any abort that gets to the host is almost guaranteed to be caused by a
 1438 * missing second stage translation table entry, which means either that the
 1439 * guest simply needs more memory and we must allocate an appropriate page, or
 1440 * that the guest tried to access I/O memory, which is emulated by user
1441 * space. The distinction is based on the IPA causing the fault and whether this
1442 * memory region has been registered as standard RAM by user space.
1443 */
1444int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
1445{
1446        unsigned long fault_status;
1447        phys_addr_t fault_ipa;
1448        struct kvm_memory_slot *memslot;
1449        unsigned long hva;
1450        bool is_iabt, write_fault, writable;
1451        gfn_t gfn;
1452        int ret, idx;
1453
1454        is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
1455        fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
1456
1457        trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu),
1458                              kvm_vcpu_get_hfar(vcpu), fault_ipa);
1459
 1460        /* Check that the stage-2 fault is a translation, permission or access fault */
1461        fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
1462        if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
1463            fault_status != FSC_ACCESS) {
1464                kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
1465                        kvm_vcpu_trap_get_class(vcpu),
1466                        (unsigned long)kvm_vcpu_trap_get_fault(vcpu),
1467                        (unsigned long)kvm_vcpu_get_hsr(vcpu));
1468                return -EFAULT;
1469        }
1470
1471        idx = srcu_read_lock(&vcpu->kvm->srcu);
1472
1473        gfn = fault_ipa >> PAGE_SHIFT;
1474        memslot = gfn_to_memslot(vcpu->kvm, gfn);
1475        hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
1476        write_fault = kvm_is_write_fault(vcpu);
1477        if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
1478                if (is_iabt) {
1479                        /* Prefetch Abort on I/O address */
1480                        kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
1481                        ret = 1;
1482                        goto out_unlock;
1483                }
1484
1485                /*
1486                 * Check for a cache maintenance operation. Since we
1487                 * ended-up here, we know it is outside of any memory
1488                 * slot. But we can't find out if that is for a device,
1489                 * or if the guest is just being stupid. The only thing
1490                 * we know for sure is that this range cannot be cached.
1491                 *
1492                 * So let's assume that the guest is just being
1493                 * cautious, and skip the instruction.
1494                 */
1495                if (kvm_vcpu_dabt_is_cm(vcpu)) {
1496                        kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
1497                        ret = 1;
1498                        goto out_unlock;
1499                }
1500
1501                /*
1502                 * The IPA is reported as [MAX:12], so we need to
1503                 * complement it with the bottom 12 bits from the
1504                 * faulting VA. This is always 12 bits, irrespective
1505                 * of the page size.
1506                 */
1507                fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
1508                ret = io_mem_abort(vcpu, run, fault_ipa);
1509                goto out_unlock;
1510        }
1511
1512        /* Userspace should not be able to register out-of-bounds IPAs */
1513        VM_BUG_ON(fault_ipa >= KVM_PHYS_SIZE);
1514
1515        if (fault_status == FSC_ACCESS) {
1516                handle_access_fault(vcpu, fault_ipa);
1517                ret = 1;
1518                goto out_unlock;
1519        }
1520
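            /*
             * user_mem_abort() returns 0 once the fault has been handled;
             * turn that into 1 so that we resume the guest.
             */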
1521        ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
1522        if (ret == 0)
1523                ret = 1;
1524out_unlock:
1525        srcu_read_unlock(&vcpu->kvm->srcu, idx);
1526        return ret;
1527}
1528
1529static int handle_hva_to_gpa(struct kvm *kvm,
1530                             unsigned long start,
1531                             unsigned long end,
1532                             int (*handler)(struct kvm *kvm,
1533                                            gpa_t gpa, void *data),
1534                             void *data)
1535{
1536        struct kvm_memslots *slots;
1537        struct kvm_memory_slot *memslot;
1538        int ret = 0;
1539
1540        slots = kvm_memslots(kvm);
1541
1542        /* we only care about the pages that the guest sees */
1543        kvm_for_each_memslot(memslot, slots) {
1544                unsigned long hva_start, hva_end;
1545                gfn_t gfn, gfn_end;
1546
1547                hva_start = max(start, memslot->userspace_addr);
1548                hva_end = min(end, memslot->userspace_addr +
1549                                        (memslot->npages << PAGE_SHIFT));
1550                if (hva_start >= hva_end)
1551                        continue;
1552
1553                /*
1554                 * {gfn(page) | page intersects with [hva_start, hva_end)} =
1555                 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
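                     * gfn_end is exclusive; adding PAGE_SIZE - 1 to hva_end
                     * before the conversion rounds up so that a partially
                     * covered last page is still included.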
1556                 */
1557                gfn = hva_to_gfn_memslot(hva_start, memslot);
1558                gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
1559
1560                for (; gfn < gfn_end; ++gfn) {
1561                        gpa_t gpa = gfn << PAGE_SHIFT;
1562                        ret |= handler(kvm, gpa, data);
1563                }
1564        }
1565
1566        return ret;
1567}
1568
1569static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, void *data)
1570{
1571        unmap_stage2_range(kvm, gpa, PAGE_SIZE);
1572        return 0;
1573}
1574
1575int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
1576{
1577        unsigned long end = hva + PAGE_SIZE;
1578
1579        if (!kvm->arch.pgd)
1580                return 0;
1581
1582        trace_kvm_unmap_hva(hva);
1583        handle_hva_to_gpa(kvm, hva, end, &kvm_unmap_hva_handler, NULL);
1584        return 0;
1585}
1586
1587int kvm_unmap_hva_range(struct kvm *kvm,
1588                        unsigned long start, unsigned long end)
1589{
1590        if (!kvm->arch.pgd)
1591                return 0;
1592
1593        trace_kvm_unmap_hva_range(start, end);
1594        handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL);
1595        return 0;
1596}
1597
1598static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
1599{
1600        pte_t *pte = (pte_t *)data;
1601
1602        /*
1603         * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
1604         * flag clear because MMU notifiers will have unmapped a huge PMD before
1605         * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
1606         * therefore stage2_set_pte() never needs to clear out a huge PMD
1607         * through this calling path.
1608         */
1609        stage2_set_pte(kvm, NULL, gpa, pte, 0);
1610        return 0;
1611}
1612
1613
1614void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
1615{
1616        unsigned long end = hva + PAGE_SIZE;
1617        pte_t stage2_pte;
1618
1619        if (!kvm->arch.pgd)
1620                return;
1621
1622        trace_kvm_set_spte_hva(hva);
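            /*
             * Only the pfn is taken from the host pte; the entry is rebuilt
             * with stage-2 attributes (PAGE_S2).
             */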
1623        stage2_pte = pfn_pte(pte_pfn(pte), PAGE_S2);
1624        handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);
1625}
1626
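    /*
     * Test and clear the "young" state of the stage-2 entry mapping @gpa,
     * at either PMD (THP/HugeTLB) or PTE level. Used by the MMU notifier
     * aging callbacks below.
     */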
1627static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data)
1628{
1629        pmd_t *pmd;
1630        pte_t *pte;
1631
1632        pmd = stage2_get_pmd(kvm, NULL, gpa);
1633        if (!pmd || pmd_none(*pmd))     /* Nothing there */
1634                return 0;
1635
1636        if (pmd_thp_or_huge(*pmd))      /* THP, HugeTLB */
1637                return stage2_pmdp_test_and_clear_young(pmd);
1638
1639        pte = pte_offset_kernel(pmd, gpa);
1640        if (pte_none(*pte))
1641                return 0;
1642
1643        return stage2_ptep_test_and_clear_young(pte);
1644}
1645
1646static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data)
1647{
1648        pmd_t *pmd;
1649        pte_t *pte;
1650
1651        pmd = stage2_get_pmd(kvm, NULL, gpa);
1652        if (!pmd || pmd_none(*pmd))     /* Nothing there */
1653                return 0;
1654
1655        if (pmd_thp_or_huge(*pmd))              /* THP, HugeTLB */
1656                return pmd_young(*pmd);
1657
1658        pte = pte_offset_kernel(pmd, gpa);
1659        if (!pte_none(*pte))            /* Just a page... */
1660                return pte_young(*pte);
1661
1662        return 0;
1663}
1664
1665int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
1666{
1667        trace_kvm_age_hva(start, end);
1668        return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
1669}
1670
1671int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
1672{
1673        trace_kvm_test_age_hva(hva);
1674        return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE, kvm_test_age_hva_handler, NULL);
1675}
1676
1677void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
1678{
1679        mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
1680}
1681
1682phys_addr_t kvm_mmu_get_httbr(void)
1683{
1684        if (__kvm_cpu_uses_extended_idmap())
1685                return virt_to_phys(merged_hyp_pgd);
1686        else
1687                return virt_to_phys(hyp_pgd);
1688}
1689
1690phys_addr_t kvm_mmu_get_boot_httbr(void)
1691{
1692        if (__kvm_cpu_uses_extended_idmap())
1693                return virt_to_phys(merged_hyp_pgd);
1694        else
1695                return virt_to_phys(boot_hyp_pgd);
1696}
1697
1698phys_addr_t kvm_get_idmap_vector(void)
1699{
1700        return hyp_idmap_vector;
1701}
1702
1703phys_addr_t kvm_get_idmap_start(void)
1704{
1705        return hyp_idmap_start;
1706}
1707
1708int kvm_mmu_init(void)
1709{
1710        int err;
1711
1712        hyp_idmap_start = kvm_virt_to_phys(__hyp_idmap_text_start);
1713        hyp_idmap_end = kvm_virt_to_phys(__hyp_idmap_text_end);
1714        hyp_idmap_vector = kvm_virt_to_phys(__kvm_hyp_init);
1715
1716        /*
1717         * We rely on the linker script to ensure at build time that the HYP
1718         * init code does not cross a page boundary.
1719         */
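            /*
             * (start ^ (end - 1)) & PAGE_MASK is non-zero iff start and the
             * last byte live in different pages, i.e. the idmap text crosses
             * a page boundary.
             */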
1720        BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
1721
1722        hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
1723        boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
1724
1725        if (!hyp_pgd || !boot_hyp_pgd) {
1726                kvm_err("Hyp mode PGD not allocated\n");
1727                err = -ENOMEM;
1728                goto out;
1729        }
1730
1731        /* Create the idmap in the boot page tables */
1732        err =   __create_hyp_mappings(boot_hyp_pgd,
1733                                      hyp_idmap_start, hyp_idmap_end,
1734                                      __phys_to_pfn(hyp_idmap_start),
1735                                      PAGE_HYP);
1736
1737        if (err) {
1738                kvm_err("Failed to idmap %lx-%lx\n",
1739                        hyp_idmap_start, hyp_idmap_end);
1740                goto out;
1741        }
1742
1743        if (__kvm_cpu_uses_extended_idmap()) {
1744                merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1745                if (!merged_hyp_pgd) {
1746                        kvm_err("Failed to allocate extra HYP pgd\n");
                            err = -ENOMEM;
1747                        goto out;
1748                }
1749                __kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd,
1750                                    hyp_idmap_start);
1751                return 0;
1752        }
1753
1754        /* Map the very same page at the trampoline VA */
1755        err =   __create_hyp_mappings(boot_hyp_pgd,
1756                                      TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE,
1757                                      __phys_to_pfn(hyp_idmap_start),
1758                                      PAGE_HYP);
1759        if (err) {
1760                kvm_err("Failed to map trampoline @%lx into boot HYP pgd\n",
1761                        TRAMPOLINE_VA);
1762                goto out;
1763        }
1764
1765        /* Map the same page again into the runtime page tables */
1766        err =   __create_hyp_mappings(hyp_pgd,
1767                                      TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE,
1768                                      __phys_to_pfn(hyp_idmap_start),
1769                                      PAGE_HYP);
1770        if (err) {
1771                kvm_err("Failed to map trampoline @%lx into runtime HYP pgd\n",
1772                        TRAMPOLINE_VA);
1773                goto out;
1774        }
1775
1776        return 0;
1777out:
1778        free_hyp_pgds();
1779        return err;
1780}
1781
1782void kvm_arch_commit_memory_region(struct kvm *kvm,
1783                                   const struct kvm_userspace_memory_region *mem,
1784                                   const struct kvm_memory_slot *old,
1785                                   const struct kvm_memory_slot *new,
1786                                   enum kvm_mr_change change)
1787{
1788        /*
1789         * At this point the memslot has been committed and there is an
1790         * allocated dirty_bitmap[]; dirty pages will be tracked while the
1791         * memory slot is write protected.
1792         */
1793        if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES)
1794                kvm_mmu_wp_memory_region(kvm, mem->slot);
1795}
1796
1797int kvm_arch_prepare_memory_region(struct kvm *kvm,
1798                                   struct kvm_memory_slot *memslot,
1799                                   const struct kvm_userspace_memory_region *mem,
1800                                   enum kvm_mr_change change)
1801{
1802        hva_t hva = mem->userspace_addr;
1803        hva_t reg_end = hva + mem->memory_size;
1804        bool writable = !(mem->flags & KVM_MEM_READONLY);
1805        int ret = 0;
1806
1807        if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
1808                        change != KVM_MR_FLAGS_ONLY)
1809                return 0;
1810
1811        /*
1812         * Prevent userspace from creating a memory region outside of the
1813         * IPA space addressable by the KVM guest.
1814         */
1815        if (memslot->base_gfn + memslot->npages >=
1816            (KVM_PHYS_SIZE >> PAGE_SHIFT))
1817                return -EFAULT;
1818
1819        /*
1820         * A memory region could potentially cover multiple VMAs, and any holes
1821         * between them, so iterate over all of them to find out if we can map
1822         * any of them right now.
1823         *
1824         *     +--------------------------------------------+
1825         * +---------------+----------------+   +----------------+
1826         * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
1827         * +---------------+----------------+   +----------------+
1828         *     |               memory region                |
1829         *     +--------------------------------------------+
1830         */
1831        do {
1832                struct vm_area_struct *vma = find_vma(current->mm, hva);
1833                hva_t vm_start, vm_end;
1834
1835                if (!vma || vma->vm_start >= reg_end)
1836                        break;
1837
1838                /*
1839                 * Mapping a read-only VMA is only allowed if the
1840                 * memory region is configured as read-only.
1841                 */
1842                if (writable && !(vma->vm_flags & VM_WRITE)) {
1843                        ret = -EPERM;
1844                        break;
1845                }
1846
1847                /*
1848                 * Take the intersection of this VMA with the memory region
1849                 */
1850                vm_start = max(hva, vma->vm_start);
1851                vm_end = min(reg_end, vma->vm_end);
1852
1853                if (vma->vm_flags & VM_PFNMAP) {
1854                        gpa_t gpa = mem->guest_phys_addr +
1855                                    (vm_start - mem->userspace_addr);
1856                        phys_addr_t pa;
1857
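                            /*
                             * For a VM_PFNMAP VMA, vm_pgoff is assumed to
                             * hold the base pfn of the mapping (as set up by,
                             * e.g., remap_pfn_range()); add the offset of the
                             * intersection into the VMA to get the physical
                             * base of this chunk.
                             */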
1858                        pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
1859                        pa += vm_start - vma->vm_start;
1860
1861                        /* IO region dirty page logging not allowed */
1862                        if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES)
1863                                return -EINVAL;
1864
1865                        ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
1866                                                    vm_end - vm_start,
1867                                                    writable);
1868                        if (ret)
1869                                break;
1870                }
1871                hva = vm_end;
1872        } while (hva < reg_end);
1873
1874        if (change == KVM_MR_FLAGS_ONLY)
1875                return ret;
1876
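            /*
             * On failure, tear down anything that was mapped above; otherwise
             * flush the data cache for the memslot's existing stage-2 mappings.
             */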
1877        spin_lock(&kvm->mmu_lock);
1878        if (ret)
1879                unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
1880        else
1881                stage2_flush_memslot(kvm, memslot);
1882        spin_unlock(&kvm->mmu_lock);
1883        return ret;
1884}
1885
1886void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
1887                           struct kvm_memory_slot *dont)
1888{
1889}
1890
1891int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
1892                            unsigned long npages)
1893{
1894        /*
1895         * Readonly memslots are not incoherent with the caches by definition,
1896         * but in practice, they are used mostly to emulate ROMs or NOR flashes
1897         * that the guest may consider devices and hence map as uncached.
1898         * To prevent incoherency issues in these cases, tag all readonly
1899         * regions as incoherent.
1900         */
1901        if (slot->flags & KVM_MEM_READONLY)
1902                slot->flags |= KVM_MEMSLOT_INCOHERENT;
1903        return 0;
1904}
1905
1906void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots)
1907{
1908}
1909
1910void kvm_arch_flush_shadow_all(struct kvm *kvm)
1911{
1912}
1913
1914void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
1915                                   struct kvm_memory_slot *slot)
1916{
1917        gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
1918        phys_addr_t size = slot->npages << PAGE_SHIFT;
1919
1920        spin_lock(&kvm->mmu_lock);
1921        unmap_stage2_range(kvm, gpa, size);
1922        spin_unlock(&kvm->mmu_lock);
1923}
1924
1925/*
1926 * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
1927 *
1928 * Main problems:
1929 * - S/W ops are local to a CPU (not broadcast)
1930 * - We have line migration behind our back (speculation)
1931 * - System caches don't support S/W at all (damn!)
1932 *
1933 * In the face of the above, the best we can do is to try and convert
1934 * S/W ops to VA ops. Because the guest is not allowed to infer the
1935 * S/W to PA mapping, it can only use S/W to nuke the whole cache,
1936 * which is a rather good thing for us.
1937 *
1938 * Also, it is only used when turning caches on/off ("The expected
1939 * usage of the cache maintenance instructions that operate by set/way
1940 * is associated with the cache maintenance instructions associated
1941 * with the powerdown and powerup of caches, if this is required by
1942 * the implementation.").
1943 *
1944 * We use the following policy:
1945 *
1946 * - If we trap a S/W operation, we enable VM trapping to detect
1947 *   caches being turned on/off, and do a full clean.
1948 *
1949 * - We flush the caches whenever the caches are turned on or off.
1950 *
1951 * - Once the caches are enabled, we stop trapping VM ops.
1952 */
1953void kvm_set_way_flush(struct kvm_vcpu *vcpu)
1954{
1955        unsigned long hcr = vcpu_get_hcr(vcpu);
1956
1957        /*
1958         * If this is the first time we do a S/W operation
1959         * (i.e. HCR_TVM not set), flush the whole of guest memory and
1960         * enable VM trapping.
1961         *
1962         * Otherwise, rely on the VM trapping to wait for the MMU and
1963         * caches to be turned off. At that point, we'll be able to
1964         * clean the caches again.
1965         */
1966        if (!(hcr & HCR_TVM)) {
1967                trace_kvm_set_way_flush(*vcpu_pc(vcpu),
1968                                        vcpu_has_cache_enabled(vcpu));
1969                stage2_flush_vm(vcpu->kvm);
1970                vcpu_set_hcr(vcpu, hcr | HCR_TVM);
1971        }
1972}
1973
1974void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
1975{
1976        bool now_enabled = vcpu_has_cache_enabled(vcpu);
1977
1978        /*
1979         * If switching the MMU+caches on, we need to invalidate the caches.
1980         * If switching them off, we need to clean the caches.
1981         * Clean + invalidate does the trick in both cases.
1982         */
1983        if (now_enabled != was_enabled)
1984                stage2_flush_vm(vcpu->kvm);
1985
1986        /* Caches are now on, stop trapping VM ops (until a S/W op) */
1987        if (now_enabled)
1988                vcpu_set_hcr(vcpu, vcpu_get_hcr(vcpu) & ~HCR_TVM);
1989
1990        trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
1991}
1992