linux/arch/s390/mm/pgtable.c
/*
 *    Copyright IBM Corp. 2007,2009
 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

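/*
 * Layout note: a 4K page holds several page tables. On 31-bit a page
 * table is 256 four-byte entries (1K), so four tables fit per page; on
 * 64-bit it is 256 eight-byte entries (2K), so two fit. FRAG_MASK has
 * one bit per table fragment. For mms with pgstes (or noexec shadow
 * tables) each pte table is paired with the fragment that follows it;
 * SECOND_HALVES marks those companion fragments (bits 1 and 3 on
 * 31-bit, bit 1 on 64-bit), and clear_table_pgstes() below initializes
 * the page as alternating pte/pgste halves accordingly.
 */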
#ifndef CONFIG_64BIT
#define ALLOC_ORDER     1
#define TABLES_PER_PAGE 4
#define FRAG_MASK       15UL
#define SECOND_HALVES   10UL

void clear_table_pgstes(unsigned long *table)
{
        clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/4);
        memset(table + 256, 0, PAGE_SIZE/4);
        clear_table(table + 512, _PAGE_TYPE_EMPTY, PAGE_SIZE/4);
        memset(table + 768, 0, PAGE_SIZE/4);
}

#else
#define ALLOC_ORDER     2
#define TABLES_PER_PAGE 2
#define FRAG_MASK       3UL
#define SECOND_HALVES   2UL

void clear_table_pgstes(unsigned long *table)
{
        clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2);
        memset(table + 256, 0, PAGE_SIZE/2);
}

#endif

unsigned long VMALLOC_START = VMALLOC_END - VMALLOC_SIZE;
EXPORT_SYMBOL(VMALLOC_START);

static int __init parse_vmalloc(char *arg)
{
        if (!arg)
                return -EINVAL;
        VMALLOC_START = (VMALLOC_END - memparse(arg, &arg)) & PAGE_MASK;
        return 0;
}
early_param("vmalloc", parse_vmalloc);
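
/*
 * Illustrative usage note (not part of the original source): the size
 * is parsed with memparse(), so the usual suffixes are accepted on the
 * kernel command line, e.g.
 *
 *         vmalloc=512M
 *
 * which moves VMALLOC_START down to leave 512MB of vmalloc space below
 * the fixed VMALLOC_END.
 */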

unsigned long *crst_table_alloc(struct mm_struct *mm, int noexec)
{
        struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

        if (!page)
                return NULL;
        page->index = 0;
        if (noexec) {
                struct page *shadow = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
                if (!shadow) {
                        __free_pages(page, ALLOC_ORDER);
                        return NULL;
                }
                page->index = page_to_phys(shadow);
        }
        spin_lock(&mm->context.list_lock);
        list_add(&page->lru, &mm->context.crst_list);
        spin_unlock(&mm->context.list_lock);
        return (unsigned long *) page_to_phys(page);
}
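
/*
 * Geometry note: a crst (region/segment) table is always 2048 entries.
 * With 8-byte entries on 64-bit that is 16K (ALLOC_ORDER = 2); with
 * 4-byte entries on 31-bit it is 8K (ALLOC_ORDER = 1). For noexec mms
 * the physical address of the shadow table is stashed in page->index
 * of the primary table's struct page, which is what get_shadow_table()
 * in crst_table_free() reads back.
 */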

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
        unsigned long *shadow = get_shadow_table(table);
        struct page *page = virt_to_page(table);

        spin_lock(&mm->context.list_lock);
        list_del(&page->lru);
        spin_unlock(&mm->context.list_lock);
        if (shadow)
                free_pages((unsigned long) shadow, ALLOC_ORDER);
        free_pages((unsigned long) table, ALLOC_ORDER);
}

#ifdef CONFIG_64BIT
int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
        unsigned long *table, *pgd;
        unsigned long entry;

        BUG_ON(limit > (1UL << 53));
repeat:
        table = crst_table_alloc(mm, mm->context.noexec);
        if (!table)
                return -ENOMEM;
        spin_lock(&mm->page_table_lock);
        if (mm->context.asce_limit < limit) {
                pgd = (unsigned long *) mm->pgd;
                if (mm->context.asce_limit <= (1UL << 31)) {
                        entry = _REGION3_ENTRY_EMPTY;
                        mm->context.asce_limit = 1UL << 42;
                        mm->context.asce_bits = _ASCE_TABLE_LENGTH |
                                                _ASCE_USER_BITS |
                                                _ASCE_TYPE_REGION3;
                } else {
                        entry = _REGION2_ENTRY_EMPTY;
                        mm->context.asce_limit = 1UL << 53;
                        mm->context.asce_bits = _ASCE_TABLE_LENGTH |
                                                _ASCE_USER_BITS |
                                                _ASCE_TYPE_REGION2;
                }
                crst_table_init(table, entry);
                pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
                mm->pgd = (pgd_t *) table;
                mm->task_size = mm->context.asce_limit;
                table = NULL;
        }
        spin_unlock(&mm->page_table_lock);
        if (table)
                crst_table_free(mm, table);
        if (mm->context.asce_limit < limit)
                goto repeat;
        update_mm(mm, current);
        return 0;
}
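
/*
 * Worked example: for an mm whose asce_limit is still 2G (1UL << 31),
 * a request for, say, limit = 1UL << 40 stacks a region-3 table on top
 * of the existing segment table (new limit 4T, 1UL << 42), which
 * already satisfies the request. A request above 4T would loop once
 * more and stack a region-2 table (limit 8P, 1UL << 53). The new table
 * is allocated before taking page_table_lock, so a concurrent upgrade
 * may win the race; the unused table is then freed and the loop
 * re-checks the limit.
 */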

void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
        pgd_t *pgd;

        if (mm->context.asce_limit <= limit)
                return;
        __tlb_flush_mm(mm);
        while (mm->context.asce_limit > limit) {
                pgd = mm->pgd;
                switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
                case _REGION_ENTRY_TYPE_R2:
                        mm->context.asce_limit = 1UL << 42;
                        mm->context.asce_bits = _ASCE_TABLE_LENGTH |
                                                _ASCE_USER_BITS |
                                                _ASCE_TYPE_REGION3;
                        break;
                case _REGION_ENTRY_TYPE_R3:
                        mm->context.asce_limit = 1UL << 31;
                        mm->context.asce_bits = _ASCE_TABLE_LENGTH |
                                                _ASCE_USER_BITS |
                                                _ASCE_TYPE_SEGMENT;
                        break;
                default:
                        BUG();
                }
                mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
                mm->task_size = mm->context.asce_limit;
                crst_table_free(mm, (unsigned long *) pgd);
        }
        update_mm(mm, current);
}
#endif
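
/*
 * The downgrade walks the same ladder in reverse: each iteration reads
 * the first entry of the current top-level table, which points to the
 * next lower table (the upgrade path only ever populated that first
 * entry, since user addresses start at 0), makes that table the new
 * pgd and frees the old top level. The TLB is flushed up front because
 * entries derived from the taller ASCE must not survive the switch.
 */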

/*
 * page table allocation/free routines.
 */
unsigned long *page_table_alloc(struct mm_struct *mm)
{
        struct page *page;
        unsigned long *table;
        unsigned long bits;

        bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
        spin_lock(&mm->context.list_lock);
        page = NULL;
        if (!list_empty(&mm->context.pgtable_list)) {
                page = list_first_entry(&mm->context.pgtable_list,
                                        struct page, lru);
                if ((page->flags & FRAG_MASK) == ((1UL << TABLES_PER_PAGE) - 1))
                        page = NULL;
        }
        if (!page) {
                spin_unlock(&mm->context.list_lock);
                page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
                if (!page)
                        return NULL;
                pgtable_page_ctor(page);
                page->flags &= ~FRAG_MASK;
                table = (unsigned long *) page_to_phys(page);
                if (mm->context.has_pgste)
                        clear_table_pgstes(table);
                else
                        clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
                spin_lock(&mm->context.list_lock);
                list_add(&page->lru, &mm->context.pgtable_list);
        }
        table = (unsigned long *) page_to_phys(page);
        while (page->flags & bits) {
                table += 256;
                bits <<= 1;
        }
        page->flags |= bits;
        if ((page->flags & FRAG_MASK) == ((1UL << TABLES_PER_PAGE) - 1))
                list_move_tail(&page->lru, &mm->context.pgtable_list);
        spin_unlock(&mm->context.list_lock);
        return table;
}
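
/*
 * Worked example (64-bit, no pgstes/noexec): bits starts as 1UL. If
 * fragment 0 of the first list page is taken (page->flags & 1), the
 * loop advances table by 256 entries (2K) and shifts bits to 2; the
 * second half is then claimed by setting bit 1. With pgstes or noexec,
 * bits starts as 3UL so one allocation claims both halves: the pte
 * table plus its pgste/shadow companion. Once all FRAG_MASK bits are
 * set the page moves to the list tail, so only the list head ever
 * needs to be examined.
 */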

void page_table_free(struct mm_struct *mm, unsigned long *table)
{
        struct page *page;
        unsigned long bits;

        bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
        bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long);
        page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
        spin_lock(&mm->context.list_lock);
        page->flags ^= bits;
        if (page->flags & FRAG_MASK) {
                /* Page now has some free pgtable fragments. */
                list_move(&page->lru, &mm->context.pgtable_list);
                page = NULL;
        } else
                /* All fragments of the 4K page have been freed. */
                list_del(&page->lru);
        spin_unlock(&mm->context.list_lock);
        if (page) {
                pgtable_page_dtor(page);
                __free_page(page);
        }
}
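
/*
 * Worked example: on 64-bit a table at page offset 0x800 gives
 * (0x800 / 256 / 8) = 1, so bits is shifted to cover fragment 1 and
 * the XOR clears exactly that fragment's flag. If any FRAG_MASK bit is
 * still set, other tables in the page remain live and the page stays
 * on the list; otherwise it is torn down and returned to the page
 * allocator.
 */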

void disable_noexec(struct mm_struct *mm, struct task_struct *tsk)
{
        struct page *page;

        spin_lock(&mm->context.list_lock);
        /* Free shadow region and segment tables. */
        list_for_each_entry(page, &mm->context.crst_list, lru)
                if (page->index) {
                        free_pages((unsigned long) page->index, ALLOC_ORDER);
                        page->index = 0;
                }
        /* "Free" second halves of page tables. */
        list_for_each_entry(page, &mm->context.pgtable_list, lru)
                page->flags &= ~SECOND_HALVES;
        spin_unlock(&mm->context.list_lock);
        mm->context.noexec = 0;
        update_mm(mm, tsk);
}
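
/*
 * Note: page->index holds the shadow table's physical address, while
 * free_pages() expects a virtual one; this relies on the s390 kernel's
 * identity mapping of these pages. Clearing SECOND_HALVES drops the
 * fragment bits of the shadow/pgste halves so page_table_alloc() can
 * hand them out as ordinary pte tables again.
 */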

/*
 * Switch on pgstes for the calling userspace process (needed by kvm).
 */
int s390_enable_sie(void)
{
        struct task_struct *tsk = current;
        struct mm_struct *mm, *old_mm;

        /* Do we have a switched amode? If not, we cannot do sie. */
        if (!switch_amode)
                return -EINVAL;

        /* Do we have pgstes? If yes, we are done. */
        if (tsk->mm->context.has_pgste)
                return 0;

        /* Let's check whether we are allowed to replace the mm. */
        task_lock(tsk);
        if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
            !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
            tsk->mm != tsk->active_mm) {
                task_unlock(tsk);
                return -EINVAL;
        }
        task_unlock(tsk);

        /* Copy the mm and let dup_mm create the page tables with pgstes. */
        tsk->mm->context.alloc_pgste = 1;
        mm = dup_mm(tsk);
        tsk->mm->context.alloc_pgste = 0;
        if (!mm)
                return -ENOMEM;

        /* Now let's check again whether something happened in the meantime. */
        task_lock(tsk);
        if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
            !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
            tsk->mm != tsk->active_mm) {
                mmput(mm);
                task_unlock(tsk);
                return -EINVAL;
        }

        /* OK, we are alone. No ptrace, no threads, etc. */
        old_mm = tsk->mm;
        tsk->mm = tsk->active_mm = mm;
        preempt_disable();
        update_mm(mm, tsk);
        cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
        preempt_enable();
        task_unlock(tsk);
        mmput(old_mm);
        return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);
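
/*
 * Usage note (an assumption about the caller, not stated in this
 * file): the kvm module is the expected user, calling s390_enable_sie()
 * once while a VM is being set up, before any vcpu can enter SIE. The
 * check/copy/recheck dance above is what makes the mm swap safe
 * without holding task_lock() across dup_mm(), which allocates memory
 * and may sleep.
 */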

#if defined(CONFIG_DEBUG_PAGEALLOC) && defined(CONFIG_HIBERNATION)
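/*
 * lra (load real address) translates the given address through the
 * current DAT tables and sets condition code 0 only if the translation
 * succeeds; ipm/srl then extract the condition code into cc. This
 * probes whether DEBUG_PAGEALLOC has unmapped the page without
 * touching it, which is exactly what the hibernation code needs to
 * know before saving the page.
 */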
bool kernel_page_present(struct page *page)
{
        unsigned long addr;
        int cc;

        addr = page_to_phys(page);
        asm volatile(
                "       lra     %1,0(%1)\n"
                "       ipm     %0\n"
                "       srl     %0,28"
                : "=d" (cc), "+a" (addr) : : "cc");
        return cc == 0;
}
#endif /* CONFIG_DEBUG_PAGEALLOC && CONFIG_HIBERNATION */