linux/arch/s390/mm/gup.c
/*
 *  Lockless get_user_pages_fast for s390
 *
 *  Copyright IBM Corp. 2010
 *  Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/vmstat.h>
#include <linux/pagemap.h>
#include <linux/rwsem.h>
#include <asm/pgtable.h>

/*
 * The performance-critical leaf functions are made noinline, otherwise gcc
 * inlines everything into a single function, which results in too much
 * register pressure.
 */
static inline int gup_pte_range(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
                unsigned long end, int write, struct page **pages, int *nr)
{
        unsigned long mask;
        pte_t *ptep, pte;
        struct page *page;

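        /*
         * Reject any pte that is invalid, maps a special page that must not
         * be refcounted here, or is read-only when write access is wanted.
         */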
        mask = (write ? _PAGE_RO : 0) | _PAGE_INVALID | _PAGE_SPECIAL;

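        /*
         * Lockless walk of the pte table: snapshot each entry, take a
         * speculative reference on its page and then re-check that the pte
         * did not change underneath us. If it did, drop the reference and
         * return 0 so the fast path bails out.
         */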
        ptep = ((pte_t *) pmd_deref(pmd)) + pte_index(addr);
        do {
                pte = *ptep;
                barrier();
                if ((pte_val(pte) & mask) != 0)
                        return 0;
                VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
                page = pte_page(pte);
                if (!page_cache_get_speculative(page))
                        return 0;
                if (unlikely(pte_val(pte) != pte_val(*ptep))) {
                        put_page(page);
                        return 0;
                }
                pages[*nr] = page;
                (*nr)++;

        } while (ptep++, addr += PAGE_SIZE, addr != end);

        return 1;
}

static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
                unsigned long end, int write, struct page **pages, int *nr)
{
        unsigned long mask, result;
        struct page *head, *page, *tail;
        int refs;

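        /*
         * The segment (pmd) entry covers the whole large page. result/mask
         * encode the entry state that is acceptable for the requested
         * access; anything else sends the caller back to the slow path.
         */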
        result = write ? 0 : _SEGMENT_ENTRY_RO;
        mask = result | _SEGMENT_ENTRY_INV;
        if ((pmd_val(pmd) & mask) != result)
                return 0;
        VM_BUG_ON(!pfn_valid(pmd_val(pmd) >> PAGE_SHIFT));

        refs = 0;
        head = pmd_page(pmd);
        page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
        tail = page;
        do {
                VM_BUG_ON(compound_head(page) != head);
                pages[*nr] = page;
                (*nr)++;
                page++;
                refs++;
        } while (addr += PAGE_SIZE, addr != end);

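        /*
         * Take all refs references on the head page in one go; this fails
         * if the compound page is already being torn down. Afterwards
         * re-check the pmd: if it changed while the pages were counted, the
         * references have to be dropped again and the fast path gives up.
         */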
        if (!page_cache_add_speculative(head, refs)) {
                *nr -= refs;
                return 0;
        }

        if (unlikely(pmd_val(pmd) != pmd_val(*pmdp))) {
                *nr -= refs;
                while (refs--)
                        put_page(head);
                return 0;
        }

        /*
         * Any tail pages need their mapcount reference taken before we
         * return.
         */
        while (refs--) {
                if (PageTail(tail))
                        get_huge_page_tail(tail);
                tail++;
        }

        return 1;
}

static inline int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr,
                unsigned long end, int write, struct page **pages, int *nr)
{
        unsigned long next;
        pmd_t *pmdp, pmd;

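        /*
         * The upper table levels can be folded on s390: only if the pud
         * entry really is a region-third-table entry does it point to a
         * separate segment (pmd) table that has to be dereferenced and
         * indexed; otherwise pudp already refers to the segment level.
         */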
        pmdp = (pmd_t *) pudp;
#ifdef CONFIG_64BIT
        if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
                pmdp = (pmd_t *) pud_deref(pud);
        pmdp += pmd_index(addr);
#endif
        do {
                pmd = *pmdp;
                barrier();
                next = pmd_addr_end(addr, end);
                /*
                 * The pmd_trans_splitting() check below explains why
                 * pmdp_splitting_flush() has to serialize with
                 * smp_call_function() against our disabled IRQs, to stop
                 * this gup-fast code from running while we set the
                 * splitting bit in the pmd. Returning zero will take
                 * the slow path that will call wait_split_huge_page()
                 * if the pmd is still in splitting state.
                 */
                if (pmd_none(pmd) || pmd_trans_splitting(pmd))
                        return 0;
                if (unlikely(pmd_large(pmd))) {
                        if (!gup_huge_pmd(pmdp, pmd, addr, next,
                                          write, pages, nr))
                                return 0;
                } else if (!gup_pte_range(pmdp, pmd, addr, next,
                                          write, pages, nr))
                        return 0;
        } while (pmdp++, addr = next, addr != end);

        return 1;
}

static inline int gup_pud_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr,
                unsigned long end, int write, struct page **pages, int *nr)
{
        unsigned long next;
        pud_t *pudp, pud;

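        /*
         * Same folding logic one level further up: only a region-second-
         * table entry points to a separate region-third (pud) table,
         * otherwise pgdp already refers to the pud level.
         */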
        pudp = (pud_t *) pgdp;
#ifdef CONFIG_64BIT
        if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R2)
                pudp = (pud_t *) pgd_deref(pgd);
        pudp += pud_index(addr);
#endif
        do {
                pud = *pudp;
                barrier();
                next = pud_addr_end(addr, end);
                if (pud_none(pud))
                        return 0;
                if (!gup_pmd_range(pudp, pud, addr, next, write, pages, nr))
                        return 0;
        } while (pudp++, addr = next, addr != end);

        return 1;
}

/*
 * Like get_user_pages_fast() except it is IRQ-safe in that it won't fall
 * back to the regular GUP.
 */
int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
                          struct page **pages)
{
        struct mm_struct *mm = current->mm;
        unsigned long addr, len, end;
        unsigned long next, flags;
        pgd_t *pgdp, pgd;
        int nr = 0;

        start &= PAGE_MASK;
        addr = start;
        len = (unsigned long) nr_pages << PAGE_SHIFT;
        end = start + len;
        if ((end < start) || (end > TASK_SIZE))
                return 0;

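        /*
         * Keeping interrupts disabled across the walk prevents the page
         * tables from being freed underneath us; see the comment in
         * get_user_pages_fast() below.
         */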
        local_irq_save(flags);
        pgdp = pgd_offset(mm, addr);
        do {
                pgd = *pgdp;
                barrier();
                next = pgd_addr_end(addr, end);
                if (pgd_none(pgd))
                        break;
                if (!gup_pud_range(pgdp, pgd, addr, next, write, pages, &nr))
                        break;
        } while (pgdp++, addr = next, addr != end);
        local_irq_restore(flags);

        return nr;
}

/**
 * get_user_pages_fast() - pin user pages in memory
 * @start:      starting user address
 * @nr_pages:   number of pages from start to pin
 * @write:      whether pages will be written to
 * @pages:      array that receives pointers to the pages pinned.
 *              Should be at least nr_pages long.
 *
 * Attempt to pin user pages in memory without taking mm->mmap_sem.
 * If not successful, it will fall back to taking the lock and
 * calling get_user_pages().
 *
 * Returns number of pages pinned. This may be fewer than the number
 * requested. If nr_pages is 0 or negative, returns 0. If no pages
 * were pinned, returns -errno.
 */
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
                        struct page **pages)
{
        struct mm_struct *mm = current->mm;
        unsigned long addr, len, end;
        unsigned long next;
        pgd_t *pgdp, pgd;
        int nr = 0;

        start &= PAGE_MASK;
        addr = start;
        len = (unsigned long) nr_pages << PAGE_SHIFT;
        end = start + len;
        if ((end < start) || (end > TASK_SIZE))
                goto slow_irqon;

        /*
         * local_irq_disable() doesn't prevent pagetable teardown, but does
         * prevent the pagetables from being freed on s390.
         *
         * So long as we atomically load page table pointers versus teardown,
         * we can follow the address down to the page and take a ref on it.
         */
        local_irq_disable();
        pgdp = pgd_offset(mm, addr);
        do {
                pgd = *pgdp;
                barrier();
                next = pgd_addr_end(addr, end);
                if (pgd_none(pgd))
                        goto slow;
                if (!gup_pud_range(pgdp, pgd, addr, next, write, pages, &nr))
                        goto slow;
        } while (pgdp++, addr = next, addr != end);
        local_irq_enable();

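        /*
         * The lockless walk either pins the complete range or jumps to the
         * slow path, so nr must match the request at this point.
         */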
        VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
        return nr;

        {
                int ret;
slow:
                local_irq_enable();
slow_irqon:
                /* Try to get the remaining pages with get_user_pages */
                start += nr << PAGE_SHIFT;
                pages += nr;

                down_read(&mm->mmap_sem);
                ret = get_user_pages(current, mm, start,
                        (end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
                up_read(&mm->mmap_sem);

                /*
                 * Have to be a bit careful with return values: pages that
                 * the fast path already pinned must be reported even if
                 * get_user_pages() failed for the rest.
                 */
                if (nr > 0) {
                        if (ret < 0)
                                ret = nr;
                        else
                                ret += nr;
                }

                return ret;
        }
}