linux/arch/s390/mm/pgtable.c
/*
 *    Copyright IBM Corp. 2007, 2011
 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/sysctl.h>
#include <linux/ksm.h>
#include <linux/mman.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

unsigned long *crst_table_alloc(struct mm_struct *mm)
{
        struct page *page = alloc_pages(GFP_KERNEL, 2);

        if (!page)
                return NULL;
        return (unsigned long *) page_to_phys(page);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
        free_pages((unsigned long) table, 2);
}

static void __crst_table_upgrade(void *arg)
{
        struct mm_struct *mm = arg;

        if (current->active_mm == mm) {
                clear_user_asce();
                set_user_asce(mm);
        }
        __tlb_flush_local();
}

int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
        unsigned long *table, *pgd;
        unsigned long entry;
        int flush;

        BUG_ON(limit > (1UL << 53));
        flush = 0;
repeat:
        table = crst_table_alloc(mm);
        if (!table)
                return -ENOMEM;
        spin_lock_bh(&mm->page_table_lock);
        if (mm->context.asce_limit < limit) {
                pgd = (unsigned long *) mm->pgd;
                if (mm->context.asce_limit <= (1UL << 31)) {
                        entry = _REGION3_ENTRY_EMPTY;
                        mm->context.asce_limit = 1UL << 42;
                        mm->context.asce_bits = _ASCE_TABLE_LENGTH |
                                                _ASCE_USER_BITS |
                                                _ASCE_TYPE_REGION3;
                } else {
                        entry = _REGION2_ENTRY_EMPTY;
                        mm->context.asce_limit = 1UL << 53;
                        mm->context.asce_bits = _ASCE_TABLE_LENGTH |
                                                _ASCE_USER_BITS |
                                                _ASCE_TYPE_REGION2;
                }
                crst_table_init(table, entry);
                pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
                mm->pgd = (pgd_t *) table;
                mm->task_size = mm->context.asce_limit;
                table = NULL;
                flush = 1;
        }
        spin_unlock_bh(&mm->page_table_lock);
        if (table)
                crst_table_free(mm, table);
        if (mm->context.asce_limit < limit)
                goto repeat;
        if (flush)
                on_each_cpu(__crst_table_upgrade, mm, 0);
        return 0;
}

void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
        pgd_t *pgd;

        if (current->active_mm == mm) {
                clear_user_asce();
                __tlb_flush_mm(mm);
        }
        while (mm->context.asce_limit > limit) {
                pgd = mm->pgd;
                switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
                case _REGION_ENTRY_TYPE_R2:
                        mm->context.asce_limit = 1UL << 42;
                        mm->context.asce_bits = _ASCE_TABLE_LENGTH |
                                                _ASCE_USER_BITS |
                                                _ASCE_TYPE_REGION3;
                        break;
                case _REGION_ENTRY_TYPE_R3:
                        mm->context.asce_limit = 1UL << 31;
                        mm->context.asce_bits = _ASCE_TABLE_LENGTH |
                                                _ASCE_USER_BITS |
                                                _ASCE_TYPE_SEGMENT;
                        break;
                default:
                        BUG();
                }
                mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
                mm->task_size = mm->context.asce_limit;
                crst_table_free(mm, (unsigned long *) pgd);
        }
        if (current->active_mm == mm)
                set_user_asce(mm);
}
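
/*
 * Usage note (illustrative sketch only, not code from this file): callers in
 * the arch mm code typically grow the address space before placing a mapping
 * above the current ASCE limit, roughly along these lines; the variable
 * names below are hypothetical.
 *
 *	if (addr + len > mm->context.asce_limit &&
 *	    crst_table_upgrade(mm, 1UL << 42))
 *		return -ENOMEM;
 *
 * crst_table_downgrade() is the counterpart used, for example, to shrink an
 * address space back to a 31-bit (segment table) ASCE.
 */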

#ifdef CONFIG_PGSTE

/**
 * gmap_alloc - allocate a guest address space
 * @mm: pointer to the parent mm_struct
 * @limit: maximum size of the gmap address space
 *
 * Returns a guest address space structure.
 */
struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
{
        struct gmap *gmap;
        struct page *page;
        unsigned long *table;
        unsigned long etype, atype;

        if (limit < (1UL << 31)) {
                limit = (1UL << 31) - 1;
                atype = _ASCE_TYPE_SEGMENT;
                etype = _SEGMENT_ENTRY_EMPTY;
        } else if (limit < (1UL << 42)) {
                limit = (1UL << 42) - 1;
                atype = _ASCE_TYPE_REGION3;
                etype = _REGION3_ENTRY_EMPTY;
        } else if (limit < (1UL << 53)) {
                limit = (1UL << 53) - 1;
                atype = _ASCE_TYPE_REGION2;
                etype = _REGION2_ENTRY_EMPTY;
        } else {
                limit = -1UL;
                atype = _ASCE_TYPE_REGION1;
                etype = _REGION1_ENTRY_EMPTY;
        }
        gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
        if (!gmap)
                goto out;
        INIT_LIST_HEAD(&gmap->crst_list);
        INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL);
        INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC);
        spin_lock_init(&gmap->guest_table_lock);
        gmap->mm = mm;
        page = alloc_pages(GFP_KERNEL, 2);
        if (!page)
                goto out_free;
        page->index = 0;
        list_add(&page->lru, &gmap->crst_list);
        table = (unsigned long *) page_to_phys(page);
        crst_table_init(table, etype);
        gmap->table = table;
        gmap->asce = atype | _ASCE_TABLE_LENGTH |
                _ASCE_USER_BITS | __pa(table);
        gmap->asce_end = limit;
        down_write(&mm->mmap_sem);
        list_add(&gmap->list, &mm->context.gmap_list);
        up_write(&mm->mmap_sem);
        return gmap;

out_free:
        kfree(gmap);
out:
        return NULL;
}
EXPORT_SYMBOL_GPL(gmap_alloc);

static void gmap_flush_tlb(struct gmap *gmap)
{
        if (MACHINE_HAS_IDTE)
                __tlb_flush_asce(gmap->mm, gmap->asce);
        else
                __tlb_flush_global();
}

static void gmap_radix_tree_free(struct radix_tree_root *root)
{
        struct radix_tree_iter iter;
        unsigned long indices[16];
        unsigned long index;
        void **slot;
        int i, nr;

        /* A radix tree is freed by deleting all of its entries */
        index = 0;
        do {
                nr = 0;
                radix_tree_for_each_slot(slot, root, &iter, index) {
                        indices[nr] = iter.index;
                        if (++nr == 16)
                                break;
                }
                for (i = 0; i < nr; i++) {
                        index = indices[i];
                        radix_tree_delete(root, index);
                }
        } while (nr > 0);
}

/**
 * gmap_free - free a guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_free(struct gmap *gmap)
{
        struct page *page, *next;

        /* Flush tlb. */
        if (MACHINE_HAS_IDTE)
                __tlb_flush_asce(gmap->mm, gmap->asce);
        else
                __tlb_flush_global();

        /* Free all segment & region tables. */
        list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
                __free_pages(page, 2);
        gmap_radix_tree_free(&gmap->guest_to_host);
        gmap_radix_tree_free(&gmap->host_to_guest);
        down_write(&gmap->mm->mmap_sem);
        list_del(&gmap->list);
        up_write(&gmap->mm->mmap_sem);
        kfree(gmap);
}
EXPORT_SYMBOL_GPL(gmap_free);

/**
 * gmap_enable - switch primary space to the guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_enable(struct gmap *gmap)
{
        S390_lowcore.gmap = (unsigned long) gmap;
}
EXPORT_SYMBOL_GPL(gmap_enable);

/**
 * gmap_disable - switch back to the standard primary address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_disable(struct gmap *gmap)
{
        S390_lowcore.gmap = 0UL;
}
EXPORT_SYMBOL_GPL(gmap_disable);
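
/*
 * Example (illustrative sketch only, not code from this file): a KVM-like
 * user of the gmap API allocates a guest address space for an mm, switches
 * to it around guest execution and releases it again. The 1UL << 44 limit
 * is an arbitrary example value.
 *
 *	struct gmap *gmap;
 *
 *	gmap = gmap_alloc(current->mm, 1UL << 44);
 *	if (!gmap)
 *		return -ENOMEM;
 *	gmap_enable(gmap);	// switch primary space to the guest ASCE
 *	// ... run the guest, handle faults ...
 *	gmap_disable(gmap);	// back to the standard primary space
 *	gmap_free(gmap);
 */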

/*
 * gmap_alloc_table is assumed to be called with mmap_sem held
 */
static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
                            unsigned long init, unsigned long gaddr)
{
        struct page *page;
        unsigned long *new;

        /* since we don't free the gmap table until gmap_free we can unlock */
        page = alloc_pages(GFP_KERNEL, 2);
        if (!page)
                return -ENOMEM;
        new = (unsigned long *) page_to_phys(page);
        crst_table_init(new, init);
        spin_lock(&gmap->mm->page_table_lock);
        if (*table & _REGION_ENTRY_INVALID) {
                list_add(&page->lru, &gmap->crst_list);
                *table = (unsigned long) new | _REGION_ENTRY_LENGTH |
                        (*table & _REGION_ENTRY_TYPE_MASK);
                page->index = gaddr;
                page = NULL;
        }
        spin_unlock(&gmap->mm->page_table_lock);
        if (page)
                __free_pages(page, 2);
        return 0;
}

/**
 * __gmap_segment_gaddr - find virtual address from segment pointer
 * @entry: pointer to a segment table entry in the guest address space
 *
 * Returns the virtual address in the guest address space for the segment
 */
static unsigned long __gmap_segment_gaddr(unsigned long *entry)
{
        struct page *page;
        unsigned long offset, mask;

        offset = (unsigned long) entry / sizeof(unsigned long);
        offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE;
        mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);
        page = virt_to_page((void *)((unsigned long) entry & mask));
        return page->index + offset;
}

/**
 * __gmap_unlink_by_vmaddr - unlink a single segment via a host address
 * @gmap: pointer to the guest address space structure
 * @vmaddr: address in the host process address space
 *
 * Returns 1 if a TLB flush is required
 */
static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
{
        unsigned long *entry;
        int flush = 0;

        spin_lock(&gmap->guest_table_lock);
        entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
        if (entry) {
                flush = (*entry != _SEGMENT_ENTRY_INVALID);
                *entry = _SEGMENT_ENTRY_INVALID;
        }
        spin_unlock(&gmap->guest_table_lock);
        return flush;
}

/**
 * __gmap_unmap_by_gaddr - unmap a single segment via a guest address
 * @gmap: pointer to the guest address space structure
 * @gaddr: address in the guest address space
 *
 * Returns 1 if a TLB flush is required
 */
static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr)
{
        unsigned long vmaddr;

        vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host,
                                                   gaddr >> PMD_SHIFT);
        return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0;
}

/**
 * gmap_unmap_segment - unmap segment from the guest address space
 * @gmap: pointer to the guest address space structure
 * @to: address in the guest address space
 * @len: length of the memory area to unmap
 *
 * Returns 0 if the unmap succeeded, -EINVAL if not.
 */
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
{
        unsigned long off;
        int flush;

        if ((to | len) & (PMD_SIZE - 1))
                return -EINVAL;
        if (len == 0 || to + len < to)
                return -EINVAL;

        flush = 0;
        down_write(&gmap->mm->mmap_sem);
        for (off = 0; off < len; off += PMD_SIZE)
                flush |= __gmap_unmap_by_gaddr(gmap, to + off);
        up_write(&gmap->mm->mmap_sem);
        if (flush)
                gmap_flush_tlb(gmap);
        return 0;
}
EXPORT_SYMBOL_GPL(gmap_unmap_segment);

/**
 * gmap_map_segment - map a segment to the guest address space
 * @gmap: pointer to the guest address space structure
 * @from: source address in the parent address space
 * @to: target address in the guest address space
 * @len: length of the memory area to map
 *
 * Returns 0 if the map succeeded, -EINVAL or -ENOMEM if not.
 */
int gmap_map_segment(struct gmap *gmap, unsigned long from,
                     unsigned long to, unsigned long len)
{
        unsigned long off;
        int flush;

        if ((from | to | len) & (PMD_SIZE - 1))
                return -EINVAL;
        if (len == 0 || from + len < from || to + len < to ||
            from + len > TASK_MAX_SIZE || to + len > gmap->asce_end)
                return -EINVAL;

        flush = 0;
        down_write(&gmap->mm->mmap_sem);
        for (off = 0; off < len; off += PMD_SIZE) {
                /* Remove old translation */
                flush |= __gmap_unmap_by_gaddr(gmap, to + off);
                /* Store new translation */
                if (radix_tree_insert(&gmap->guest_to_host,
                                      (to + off) >> PMD_SHIFT,
                                      (void *) from + off))
                        break;
        }
        up_write(&gmap->mm->mmap_sem);
        if (flush)
                gmap_flush_tlb(gmap);
        if (off >= len)
                return 0;
        gmap_unmap_segment(gmap, to, len);
        return -ENOMEM;
}
EXPORT_SYMBOL_GPL(gmap_map_segment);
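
/*
 * Example (illustrative sketch only): mirror one chunk of the parent address
 * space into the guest address space and drop it again. The addresses below
 * are made-up example values; from, to and len must all be segment (PMD_SIZE)
 * aligned, as enforced by the checks above.
 *
 *	int rc;
 *
 *	rc = gmap_map_segment(gmap, 0x10000000UL,	// host (parent) address
 *				    0x00000000UL,	// guest address
 *				    0x00100000UL);	// one segment
 *	if (rc)
 *		return rc;
 *	...
 *	gmap_unmap_segment(gmap, 0x00000000UL, 0x00100000UL);
 */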

/**
 * __gmap_translate - translate a guest address to a user space address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 * The mmap_sem of the mm that belongs to the address space must be held
 * when this function gets called.
 */
unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
{
        unsigned long vmaddr;

        vmaddr = (unsigned long)
                radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
        return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
}
EXPORT_SYMBOL_GPL(__gmap_translate);

/**
 * gmap_translate - translate a guest address to a user space address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 */
unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr)
{
        unsigned long rc;

        down_read(&gmap->mm->mmap_sem);
        rc = __gmap_translate(gmap, gaddr);
        up_read(&gmap->mm->mmap_sem);
        return rc;
}
EXPORT_SYMBOL_GPL(gmap_translate);
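
/*
 * Example (illustrative sketch only): the translation is returned in an
 * unsigned long, so a caller detects the -EFAULT case with IS_ERR_VALUE(),
 * just as the code further down in this file does.
 *
 *	unsigned long vmaddr;
 *
 *	vmaddr = gmap_translate(gmap, gaddr);
 *	if (IS_ERR_VALUE(vmaddr))
 *		return (long) vmaddr;	// -EFAULT, no mapping
 */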

/**
 * gmap_unlink - disconnect a page table from the gmap shadow tables
 * @mm: pointer to the parent mm_struct
 * @table: pointer to the host page table
 * @vmaddr: vm address associated with the host page table
 */
static void gmap_unlink(struct mm_struct *mm, unsigned long *table,
                        unsigned long vmaddr)
{
        struct gmap *gmap;
        int flush;

        list_for_each_entry(gmap, &mm->context.gmap_list, list) {
                flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
                if (flush)
                        gmap_flush_tlb(gmap);
        }
}

/**
 * __gmap_link - set up shadow page tables to connect a host to a guest address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 * @vmaddr: vm address
 *
 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
 * if the vm address is already mapped to a different guest segment.
 * The mmap_sem of the mm that belongs to the address space must be held
 * when this function gets called.
 */
int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
{
        struct mm_struct *mm;
        unsigned long *table;
        spinlock_t *ptl;
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        int rc;

        /* Create higher level tables in the gmap page table */
        table = gmap->table;
        if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
                table += (gaddr >> 53) & 0x7ff;
                if ((*table & _REGION_ENTRY_INVALID) &&
                    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY,
                                     gaddr & 0xffe0000000000000UL))
                        return -ENOMEM;
                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
        }
        if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) {
                table += (gaddr >> 42) & 0x7ff;
                if ((*table & _REGION_ENTRY_INVALID) &&
                    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY,
                                     gaddr & 0xfffffc0000000000UL))
                        return -ENOMEM;
                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
        }
        if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) {
                table += (gaddr >> 31) & 0x7ff;
                if ((*table & _REGION_ENTRY_INVALID) &&
                    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY,
                                     gaddr & 0xffffffff80000000UL))
                        return -ENOMEM;
                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
        }
        table += (gaddr >> 20) & 0x7ff;
        /* Walk the parent mm page table */
        mm = gmap->mm;
        pgd = pgd_offset(mm, vmaddr);
        VM_BUG_ON(pgd_none(*pgd));
        pud = pud_offset(pgd, vmaddr);
        VM_BUG_ON(pud_none(*pud));
        pmd = pmd_offset(pud, vmaddr);
        VM_BUG_ON(pmd_none(*pmd));
        /* large pmds cannot yet be handled */
        if (pmd_large(*pmd))
                return -EFAULT;
        /* Link gmap segment table entry location to page table. */
        rc = radix_tree_preload(GFP_KERNEL);
        if (rc)
                return rc;
        ptl = pmd_lock(mm, pmd);
        spin_lock(&gmap->guest_table_lock);
        if (*table == _SEGMENT_ENTRY_INVALID) {
                rc = radix_tree_insert(&gmap->host_to_guest,
                                       vmaddr >> PMD_SHIFT, table);
                if (!rc)
                        *table = pmd_val(*pmd);
        } else
                rc = 0;
        spin_unlock(&gmap->guest_table_lock);
        spin_unlock(ptl);
        radix_tree_preload_end();
        return rc;
}

/**
 * gmap_fault - resolve a fault on a guest address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 * @fault_flags: flags to pass down to handle_mm_fault()
 *
 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
 * if the vm address is already mapped to a different guest segment.
 */
int gmap_fault(struct gmap *gmap, unsigned long gaddr,
               unsigned int fault_flags)
{
        unsigned long vmaddr;
        int rc;

        down_read(&gmap->mm->mmap_sem);
        vmaddr = __gmap_translate(gmap, gaddr);
        if (IS_ERR_VALUE(vmaddr)) {
                rc = vmaddr;
                goto out_up;
        }
        if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags)) {
                rc = -EFAULT;
                goto out_up;
        }
        rc = __gmap_link(gmap, gaddr, vmaddr);
out_up:
        up_read(&gmap->mm->mmap_sem);
        return rc;
}
EXPORT_SYMBOL_GPL(gmap_fault);
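
/*
 * Example (illustrative sketch only): a guest access fault handler would
 * typically resolve a guest write fault like this before re-entering the
 * guest.
 *
 *	rc = gmap_fault(gmap, gaddr, FAULT_FLAG_WRITE);
 *	if (rc == -EFAULT) {
 *		... no usable host mapping, report the error to the guest ...
 *	} else if (rc) {
 *		return rc;	// e.g. -ENOMEM
 *	}
 */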

static void gmap_zap_swap_entry(swp_entry_t entry, struct mm_struct *mm)
{
        if (!non_swap_entry(entry))
                dec_mm_counter(mm, MM_SWAPENTS);
        else if (is_migration_entry(entry)) {
                struct page *page = migration_entry_to_page(entry);

                if (PageAnon(page))
                        dec_mm_counter(mm, MM_ANONPAGES);
                else
                        dec_mm_counter(mm, MM_FILEPAGES);
        }
        free_swap_and_cache(entry);
}

/*
 * this function is assumed to be called with mmap_sem held
 */
void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
{
        unsigned long vmaddr, ptev, pgstev;
        pte_t *ptep, pte;
        spinlock_t *ptl;
        pgste_t pgste;

        /* Find the vm address for the guest address */
        vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host,
                                                   gaddr >> PMD_SHIFT);
        if (!vmaddr)
                return;
        vmaddr |= gaddr & ~PMD_MASK;
        /* Get pointer to the page table entry */
        ptep = get_locked_pte(gmap->mm, vmaddr, &ptl);
        if (unlikely(!ptep))
                return;
        pte = *ptep;
        if (!pte_swap(pte))
                goto out_pte;
        /* Zap unused and logically-zero pages */
        pgste = pgste_get_lock(ptep);
        pgstev = pgste_val(pgste);
        ptev = pte_val(pte);
        if (((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) ||
            ((pgstev & _PGSTE_GPS_ZERO) && (ptev & _PAGE_INVALID))) {
                gmap_zap_swap_entry(pte_to_swp_entry(pte), gmap->mm);
                pte_clear(gmap->mm, vmaddr, ptep);
        }
        pgste_set_unlock(ptep, pgste);
out_pte:
        pte_unmap_unlock(ptep, ptl);
}
EXPORT_SYMBOL_GPL(__gmap_zap);

void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
{
        unsigned long gaddr, vmaddr, size;
        struct vm_area_struct *vma;

        down_read(&gmap->mm->mmap_sem);
        for (gaddr = from; gaddr < to;
             gaddr = (gaddr + PMD_SIZE) & PMD_MASK) {
                /* Find the vm address for the guest address */
                vmaddr = (unsigned long)
                        radix_tree_lookup(&gmap->guest_to_host,
                                          gaddr >> PMD_SHIFT);
                if (!vmaddr)
                        continue;
                vmaddr |= gaddr & ~PMD_MASK;
                /* Find vma in the parent mm */
                vma = find_vma(gmap->mm, vmaddr);
                size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
                zap_page_range(vma, vmaddr, size, NULL);
        }
        up_read(&gmap->mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(gmap_discard);

static LIST_HEAD(gmap_notifier_list);
static DEFINE_SPINLOCK(gmap_notifier_lock);

/**
 * gmap_register_ipte_notifier - register a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_register_ipte_notifier(struct gmap_notifier *nb)
{
        spin_lock(&gmap_notifier_lock);
        list_add(&nb->list, &gmap_notifier_list);
        spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier);

/**
 * gmap_unregister_ipte_notifier - remove a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_unregister_ipte_notifier(struct gmap_notifier *nb)
{
        spin_lock(&gmap_notifier_lock);
        list_del_init(&nb->list);
        spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier);

/**
 * gmap_ipte_notify - mark a range of ptes for invalidation notification
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 * @len: size of area
 *
 * Returns 0 if for each page in the given range a gmap mapping exists and
 * the invalidation notification could be set. If the gmap mapping is missing
 * for one or more pages -EFAULT is returned. If no memory could be allocated
 * -ENOMEM is returned. This function establishes missing page table entries.
 */
int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
{
        unsigned long addr;
        spinlock_t *ptl;
        pte_t *ptep, entry;
        pgste_t pgste;
        int rc = 0;

        if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK))
                return -EINVAL;
        down_read(&gmap->mm->mmap_sem);
        while (len) {
                /* Convert gmap address and connect the page tables */
                addr = __gmap_translate(gmap, gaddr);
                if (IS_ERR_VALUE(addr)) {
                        rc = addr;
                        break;
                }
                /* Get the page mapped */
                if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE)) {
                        rc = -EFAULT;
                        break;
                }
                rc = __gmap_link(gmap, gaddr, addr);
                if (rc)
                        break;
                /* Walk the process page table, lock and get pte pointer */
                ptep = get_locked_pte(gmap->mm, addr, &ptl);
                VM_BUG_ON(!ptep);
                /* Set notification bit in the pgste of the pte */
                entry = *ptep;
                if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) {
                        pgste = pgste_get_lock(ptep);
                        pgste_val(pgste) |= PGSTE_IN_BIT;
                        pgste_set_unlock(ptep, pgste);
                        gaddr += PAGE_SIZE;
                        len -= PAGE_SIZE;
                }
                pte_unmap_unlock(ptep, ptl);
        }
        up_read(&gmap->mm->mmap_sem);
        return rc;
}
EXPORT_SYMBOL_GPL(gmap_ipte_notify);
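
/*
 * Example (illustrative sketch only): a consumer registers a notifier block
 * and asks for invalidation notification on one guest page. The callback
 * name below is hypothetical.
 *
 *	static void my_gmap_notifier(struct gmap *gmap, unsigned long gaddr)
 *	{
 *		// react to the invalidated guest page at gaddr
 *	}
 *
 *	static struct gmap_notifier nb = {
 *		.notifier_call = my_gmap_notifier,
 *	};
 *
 *	gmap_register_ipte_notifier(&nb);
 *	rc = gmap_ipte_notify(gmap, gaddr, PAGE_SIZE);
 *	...
 *	gmap_unregister_ipte_notifier(&nb);
 */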

/**
 * gmap_do_ipte_notify - call all invalidation callbacks for a specific pte.
 * @mm: pointer to the process mm_struct
 * @vmaddr: virtual address in the process address space
 * @pte: pointer to the page table entry
 *
 * This function is assumed to be called with the page table lock held
 * for the pte to notify.
 */
void gmap_do_ipte_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte)
{
        unsigned long offset, gaddr;
        unsigned long *table;
        struct gmap_notifier *nb;
        struct gmap *gmap;

        offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
        offset = offset * (4096 / sizeof(pte_t));
        spin_lock(&gmap_notifier_lock);
        list_for_each_entry(gmap, &mm->context.gmap_list, list) {
                table = radix_tree_lookup(&gmap->host_to_guest,
                                          vmaddr >> PMD_SHIFT);
                if (!table)
                        continue;
                gaddr = __gmap_segment_gaddr(table) + offset;
                list_for_each_entry(nb, &gmap_notifier_list, list)
                        nb->notifier_call(gmap, gaddr);
        }
        spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_do_ipte_notify);

int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
                          unsigned long key, bool nq)
{
        spinlock_t *ptl;
        pgste_t old, new;
        pte_t *ptep;

        down_read(&mm->mmap_sem);
retry:
        ptep = get_locked_pte(mm, addr, &ptl);
        if (unlikely(!ptep)) {
                up_read(&mm->mmap_sem);
                return -EFAULT;
        }
        if (!(pte_val(*ptep) & _PAGE_INVALID) &&
             (pte_val(*ptep) & _PAGE_PROTECT)) {
                pte_unmap_unlock(ptep, ptl);
                if (fixup_user_fault(current, mm, addr, FAULT_FLAG_WRITE)) {
                        up_read(&mm->mmap_sem);
                        return -EFAULT;
                }
                goto retry;
        }

        new = old = pgste_get_lock(ptep);
        pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
                            PGSTE_ACC_BITS | PGSTE_FP_BIT);
        pgste_val(new) |= (key & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48;
        pgste_val(new) |= (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
        if (!(pte_val(*ptep) & _PAGE_INVALID)) {
                unsigned long address, bits, skey;

                address = pte_val(*ptep) & PAGE_MASK;
                skey = (unsigned long) page_get_storage_key(address);
                bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
                skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT);
                /* Set storage key ACC and FP */
                page_set_storage_key(address, skey, !nq);
                /* Merge host changed & referenced into pgste */
                pgste_val(new) |= bits << 52;
        }
        /* changing the guest storage key is considered a change of the page */
        if ((pgste_val(new) ^ pgste_val(old)) &
            (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
                pgste_val(new) |= PGSTE_UC_BIT;

        pgste_set_unlock(ptep, new);
        pte_unmap_unlock(ptep, ptl);
        up_read(&mm->mmap_sem);
        return 0;
}
EXPORT_SYMBOL(set_guest_storage_key);

unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr)
{
        spinlock_t *ptl;
        pgste_t pgste;
        pte_t *ptep;
        uint64_t physaddr;
        unsigned long key = 0;

        down_read(&mm->mmap_sem);
        ptep = get_locked_pte(mm, addr, &ptl);
        if (unlikely(!ptep)) {
                up_read(&mm->mmap_sem);
                return -EFAULT;
        }
        pgste = pgste_get_lock(ptep);

        if (pte_val(*ptep) & _PAGE_INVALID) {
                key |= (pgste_val(pgste) & PGSTE_ACC_BITS) >> 56;
                key |= (pgste_val(pgste) & PGSTE_FP_BIT) >> 56;
                key |= (pgste_val(pgste) & PGSTE_GR_BIT) >> 48;
                key |= (pgste_val(pgste) & PGSTE_GC_BIT) >> 48;
        } else {
                physaddr = pte_val(*ptep) & PAGE_MASK;
                key = page_get_storage_key(physaddr);

                /* Reflect guest's logical view, not physical */
                if (pgste_val(pgste) & PGSTE_GR_BIT)
                        key |= _PAGE_REFERENCED;
                if (pgste_val(pgste) & PGSTE_GC_BIT)
                        key |= _PAGE_CHANGED;
        }

        pgste_set_unlock(ptep, pgste);
        pte_unmap_unlock(ptep, ptl);
        up_read(&mm->mmap_sem);
        return key;
}
EXPORT_SYMBOL(get_guest_storage_key);
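
/*
 * Example (illustrative sketch only): KVM-style code that sets the guest
 * storage key for one page and reads it back. The key value carries the
 * ACC/FP bits plus the reference and change bits; 0x30 (ACC=3) is an
 * arbitrary example, and nq requests the nonquiescing variant of the key
 * update.
 *
 *	unsigned long key;
 *
 *	if (set_guest_storage_key(current->mm, vmaddr, 0x30, false))
 *		return -EFAULT;
 *	key = get_guest_storage_key(current->mm, vmaddr);
 *	if (IS_ERR_VALUE(key))
 *		return -EFAULT;
 */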

static int page_table_allocate_pgste_min = 0;
static int page_table_allocate_pgste_max = 1;
int page_table_allocate_pgste = 0;
EXPORT_SYMBOL(page_table_allocate_pgste);

static struct ctl_table page_table_sysctl[] = {
        {
                .procname       = "allocate_pgste",
                .data           = &page_table_allocate_pgste,
                .maxlen         = sizeof(int),
                .mode           = S_IRUGO | S_IWUSR,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = &page_table_allocate_pgste_min,
                .extra2         = &page_table_allocate_pgste_max,
        },
        { }
};

static struct ctl_table page_table_sysctl_dir[] = {
        {
                .procname       = "vm",
                .maxlen         = 0,
                .mode           = 0555,
                .child          = page_table_sysctl,
        },
        { }
};

static int __init page_table_register_sysctl(void)
{
        return register_sysctl_table(page_table_sysctl_dir) ? 0 : -ENOMEM;
}
__initcall(page_table_register_sysctl);

#else /* CONFIG_PGSTE */

static inline void gmap_unlink(struct mm_struct *mm, unsigned long *table,
                        unsigned long vmaddr)
{
}

#endif /* CONFIG_PGSTE */

static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
        unsigned int old, new;

        do {
                old = atomic_read(v);
                new = old ^ bits;
        } while (atomic_cmpxchg(v, old, new) != old);
        return new;
}

/*
 * page table entry allocation/free routines.
 */
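/*
 * Note (editorial summary of the code below): for non-PGSTE mms a 4K page
 * holds two independent 2K page tables. Their state is tracked in the page's
 * _mapcount: bits 0 and 1 say which 2K half is in use, and
 * page_table_free_rcu() additionally sets bit 4 or 5 to mark a half as
 * pending RCU removal until __tlb_remove_table() clears it again. Page
 * tables with PGSTEs always occupy the full 4K page (_mapcount value 3).
 */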
unsigned long *page_table_alloc(struct mm_struct *mm)
{
        unsigned long *table;
        struct page *page;
        unsigned int mask, bit;

        /* Try to get a fragment of a 4K page as a 2K page table */
        if (!mm_alloc_pgste(mm)) {
                table = NULL;
                spin_lock_bh(&mm->context.list_lock);
                if (!list_empty(&mm->context.pgtable_list)) {
                        page = list_first_entry(&mm->context.pgtable_list,
                                                struct page, lru);
                        mask = atomic_read(&page->_mapcount);
                        mask = (mask | (mask >> 4)) & 3;
                        if (mask != 3) {
                                table = (unsigned long *) page_to_phys(page);
                                bit = mask & 1;         /* =1 -> second 2K */
                                if (bit)
                                        table += PTRS_PER_PTE;
                                atomic_xor_bits(&page->_mapcount, 1U << bit);
                                list_del(&page->lru);
                        }
                }
                spin_unlock_bh(&mm->context.list_lock);
                if (table)
                        return table;
        }
        /* Allocate a fresh page */
        page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
        if (!page)
                return NULL;
        if (!pgtable_page_ctor(page)) {
                __free_page(page);
                return NULL;
        }
        /* Initialize page table */
        table = (unsigned long *) page_to_phys(page);
        if (mm_alloc_pgste(mm)) {
                /* Return 4K page table with PGSTEs */
                atomic_set(&page->_mapcount, 3);
                clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
                clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
        } else {
                /* Return the first 2K fragment of the page */
                atomic_set(&page->_mapcount, 1);
                clear_table(table, _PAGE_INVALID, PAGE_SIZE);
                spin_lock_bh(&mm->context.list_lock);
                list_add(&page->lru, &mm->context.pgtable_list);
                spin_unlock_bh(&mm->context.list_lock);
        }
        return table;
}

void page_table_free(struct mm_struct *mm, unsigned long *table)
{
        struct page *page;
        unsigned int bit, mask;

        page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
        if (!mm_alloc_pgste(mm)) {
                /* Free 2K page table fragment of a 4K page */
                bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
                spin_lock_bh(&mm->context.list_lock);
                mask = atomic_xor_bits(&page->_mapcount, 1U << bit);
                if (mask & 3)
                        list_add(&page->lru, &mm->context.pgtable_list);
                else
                        list_del(&page->lru);
                spin_unlock_bh(&mm->context.list_lock);
                if (mask != 0)
                        return;
        }

        pgtable_page_dtor(page);
        atomic_set(&page->_mapcount, -1);
        __free_page(page);
}

void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
                         unsigned long vmaddr)
{
        struct mm_struct *mm;
        struct page *page;
        unsigned int bit, mask;

        mm = tlb->mm;
        page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
        if (mm_alloc_pgste(mm)) {
                gmap_unlink(mm, table, vmaddr);
                table = (unsigned long *) (__pa(table) | 3);
                tlb_remove_table(tlb, table);
                return;
        }
        bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
        spin_lock_bh(&mm->context.list_lock);
        mask = atomic_xor_bits(&page->_mapcount, 0x11U << bit);
        if (mask & 3)
                list_add_tail(&page->lru, &mm->context.pgtable_list);
        else
                list_del(&page->lru);
        spin_unlock_bh(&mm->context.list_lock);
        table = (unsigned long *) (__pa(table) | (1U << bit));
        tlb_remove_table(tlb, table);
}

static void __tlb_remove_table(void *_table)
{
        unsigned int mask = (unsigned long) _table & 3;
        void *table = (void *)((unsigned long) _table ^ mask);
        struct page *page = pfn_to_page(__pa(table) >> PAGE_SHIFT);

        switch (mask) {
        case 0:         /* pmd or pud */
                free_pages((unsigned long) table, 2);
                break;
        case 1:         /* lower 2K of a 4K page table */
        case 2:         /* higher 2K of a 4K page table */
                if (atomic_xor_bits(&page->_mapcount, mask << 4) != 0)
                        break;
                /* fallthrough */
        case 3:         /* 4K page table with pgstes */
                pgtable_page_dtor(page);
                atomic_set(&page->_mapcount, -1);
                __free_page(page);
                break;
        }
}

static void tlb_remove_table_smp_sync(void *arg)
{
        /* Simply deliver the interrupt */
}

static void tlb_remove_table_one(void *table)
{
        /*
         * This isn't an RCU grace period and hence the page-tables cannot be
         * assumed to be actually RCU-freed.
         *
         * It is however sufficient for software page-table walkers that rely
         * on IRQ disabling. See the comment near struct mmu_table_batch.
         */
        smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
        __tlb_remove_table(table);
}

static void tlb_remove_table_rcu(struct rcu_head *head)
{
        struct mmu_table_batch *batch;
        int i;

        batch = container_of(head, struct mmu_table_batch, rcu);

        for (i = 0; i < batch->nr; i++)
                __tlb_remove_table(batch->tables[i]);

        free_page((unsigned long)batch);
}

void tlb_table_flush(struct mmu_gather *tlb)
{
        struct mmu_table_batch **batch = &tlb->batch;

        if (*batch) {
                call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
                *batch = NULL;
        }
}

void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
        struct mmu_table_batch **batch = &tlb->batch;

        tlb->mm->context.flush_mm = 1;
        if (*batch == NULL) {
                *batch = (struct mmu_table_batch *)
                        __get_free_page(GFP_NOWAIT | __GFP_NOWARN);
                if (*batch == NULL) {
                        __tlb_flush_mm_lazy(tlb->mm);
                        tlb_remove_table_one(table);
                        return;
                }
                (*batch)->nr = 0;
        }
        (*batch)->tables[(*batch)->nr++] = table;
        if ((*batch)->nr == MAX_TABLE_BATCH)
                tlb_flush_mmu(tlb);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline void thp_split_vma(struct vm_area_struct *vma)
{
        unsigned long addr;

        for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE)
                follow_page(vma, addr, FOLL_SPLIT);
}

static inline void thp_split_mm(struct mm_struct *mm)
{
        struct vm_area_struct *vma;

        for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
                thp_split_vma(vma);
                vma->vm_flags &= ~VM_HUGEPAGE;
                vma->vm_flags |= VM_NOHUGEPAGE;
        }
        mm->def_flags |= VM_NOHUGEPAGE;
}
#else
static inline void thp_split_mm(struct mm_struct *mm)
{
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * Switch on pgstes for the current userspace process (for KVM).
 */
int s390_enable_sie(void)
{
        struct mm_struct *mm = current->mm;

        /* Do we have pgstes? If yes, we are done */
        if (mm_has_pgste(mm))
                return 0;
        /* Fail if the page tables are 2K */
        if (!mm_alloc_pgste(mm))
                return -EINVAL;
        down_write(&mm->mmap_sem);
        mm->context.has_pgste = 1;
        /* split thp mappings and disable thp for future mappings */
        thp_split_mm(mm);
        up_write(&mm->mmap_sem);
        return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);
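
/*
 * Usage note (illustrative sketch only): KVM calls s390_enable_sie() once
 * before it starts setting up guest mappings for a process. At this code
 * level the call only succeeds if the mm was created with full 4K page
 * tables including PGSTEs, i.e. with the vm.allocate_pgste sysctl above set
 * to 1 when the process was started.
 *
 *	if (s390_enable_sie())
 *		return -EINVAL;	// page tables were allocated without PGSTEs
 */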

/*
 * Enable storage key handling from now on and initialize the storage
 * keys with the default key.
 */
static int __s390_enable_skey(pte_t *pte, unsigned long addr,
                              unsigned long next, struct mm_walk *walk)
{
        unsigned long ptev;
        pgste_t pgste;

        pgste = pgste_get_lock(pte);
        /*
         * Remove all zero page mappings; after establishing a policy that
         * forbids zero page mappings, subsequent faults for such pages will
         * get fresh anonymous pages.
         */
        if (is_zero_pfn(pte_pfn(*pte))) {
                ptep_flush_direct(walk->mm, addr, pte);
                pte_val(*pte) = _PAGE_INVALID;
        }
        /* Clear storage key */
        pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT |
                              PGSTE_GR_BIT | PGSTE_GC_BIT);
        ptev = pte_val(*pte);
        if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE))
                page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 1);
        pgste_set_unlock(pte, pgste);
        return 0;
}

int s390_enable_skey(void)
{
        struct mm_walk walk = { .pte_entry = __s390_enable_skey };
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        int rc = 0;

        down_write(&mm->mmap_sem);
        if (mm_use_skey(mm))
                goto out_up;

        mm->context.use_skey = 1;
        for (vma = mm->mmap; vma; vma = vma->vm_next) {
                if (ksm_madvise(vma, vma->vm_start, vma->vm_end,
                                MADV_UNMERGEABLE, &vma->vm_flags)) {
                        mm->context.use_skey = 0;
                        rc = -ENOMEM;
                        goto out_up;
                }
        }
        mm->def_flags &= ~VM_MERGEABLE;

        walk.mm = mm;
        walk_page_range(0, TASK_SIZE, &walk);

out_up:
        up_write(&mm->mmap_sem);
        return rc;
}
EXPORT_SYMBOL_GPL(s390_enable_skey);

/*
 * Reset CMMA state, make all pages stable again.
 */
static int __s390_reset_cmma(pte_t *pte, unsigned long addr,
                             unsigned long next, struct mm_walk *walk)
{
        pgste_t pgste;

        pgste = pgste_get_lock(pte);
        pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK;
        pgste_set_unlock(pte, pgste);
        return 0;
}

void s390_reset_cmma(struct mm_struct *mm)
{
        struct mm_walk walk = { .pte_entry = __s390_reset_cmma };

        down_write(&mm->mmap_sem);
        walk.mm = mm;
        walk_page_range(0, TASK_SIZE, &walk);
        up_write(&mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(s390_reset_cmma);

/*
 * Test and reset if a guest page is dirty
 */
bool gmap_test_and_clear_dirty(unsigned long address, struct gmap *gmap)
{
        pte_t *pte;
        spinlock_t *ptl;
        bool dirty = false;

        pte = get_locked_pte(gmap->mm, address, &ptl);
        if (unlikely(!pte))
                return false;

        if (ptep_test_and_clear_user_dirty(gmap->mm, address, pte))
                dirty = true;

        spin_unlock(ptl);
        return dirty;
}
EXPORT_SYMBOL_GPL(gmap_test_and_clear_dirty);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address,
                           pmd_t *pmdp)
{
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
        /*
         * No need to flush the TLB; on s390 the reference bits are kept in
         * the storage key and never in the TLB.
         */
        return pmdp_test_and_clear_young(vma, address, pmdp);
}

int pmdp_set_access_flags(struct vm_area_struct *vma,
                          unsigned long address, pmd_t *pmdp,
                          pmd_t entry, int dirty)
{
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);

        entry = pmd_mkyoung(entry);
        if (dirty)
                entry = pmd_mkdirty(entry);
        if (pmd_same(*pmdp, entry))
                return 0;
        pmdp_invalidate(vma, address, pmdp);
        set_pmd_at(vma->vm_mm, address, pmdp, entry);
        return 1;
}

static void pmdp_splitting_flush_sync(void *arg)
{
        /* Simply deliver the interrupt */
}

void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
                          pmd_t *pmdp)
{
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
        if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT,
                              (unsigned long *) pmdp)) {
                /* need to serialize against gup-fast (IRQ disabled) */
                smp_call_function(pmdp_splitting_flush_sync, NULL, 1);
        }
}

void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
                                pgtable_t pgtable)
{
        struct list_head *lh = (struct list_head *) pgtable;

        assert_spin_locked(pmd_lockptr(mm, pmdp));

        /* FIFO */
        if (!pmd_huge_pte(mm, pmdp))
                INIT_LIST_HEAD(lh);
        else
                list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
        pmd_huge_pte(mm, pmdp) = pgtable;
}

pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
        struct list_head *lh;
        pgtable_t pgtable;
        pte_t *ptep;

        assert_spin_locked(pmd_lockptr(mm, pmdp));

        /* FIFO */
        pgtable = pmd_huge_pte(mm, pmdp);
        lh = (struct list_head *) pgtable;
        if (list_empty(lh))
                pmd_huge_pte(mm, pmdp) = NULL;
        else {
                pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
                list_del(lh);
        }
        ptep = (pte_t *) pgtable;
        pte_val(*ptep) = _PAGE_INVALID;
        ptep++;
        pte_val(*ptep) = _PAGE_INVALID;
        return pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */