linux/fs/proc/task_mmu.c
   1#include <linux/mm.h>
   2#include <linux/hugetlb.h>
   3#include <linux/huge_mm.h>
   4#include <linux/mount.h>
   5#include <linux/seq_file.h>
   6#include <linux/highmem.h>
   7#include <linux/ptrace.h>
   8#include <linux/slab.h>
   9#include <linux/pagemap.h>
  10#include <linux/mempolicy.h>
  11#include <linux/rmap.h>
  12#include <linux/swap.h>
  13#include <linux/swapops.h>
  14#include <linux/shmem_fs.h>
  15#include <linux/mmu_notifier.h>
  16#include <linux/page_idle.h>
  17#include <linux/sched/mm.h>
  18
  19#include <asm/elf.h>
  20#include <asm/uaccess.h>
  21#include <asm/tlbflush.h>
  22#include "internal.h"
  23
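/* Report the Vm* and Rss* counters of @mm, as shown in /proc/<pid>/status. */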
  24void task_mem(struct seq_file *m, struct mm_struct *mm)
  25{
  26        unsigned long data, text, lib, swap, anon, file, shmem;
  27        unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
  28
  29        anon = get_mm_counter(mm, MM_ANONPAGES);
  30        file = get_mm_counter(mm, MM_FILEPAGES);
  31        shmem = get_mm_counter(mm, MM_SHMEMPAGES);
  32
  33        /*
  34         * Note: to minimize their overhead, mm maintains hiwater_vm and
  35         * hiwater_rss only when about to *lower* total_vm or rss.  Any
  36         * collector of these hiwater stats must therefore get total_vm
  37         * and rss too, which will usually be the higher.  Barriers? not
  38         * worth the effort, such snapshots can always be inconsistent.
  39         */
  40        hiwater_vm = total_vm = mm->total_vm;
  41        if (hiwater_vm < mm->hiwater_vm)
  42                hiwater_vm = mm->hiwater_vm;
  43        hiwater_rss = total_rss = anon + file + shmem;
  44        if (hiwater_rss < mm->hiwater_rss)
  45                hiwater_rss = mm->hiwater_rss;
  46
  47        data = mm->total_vm - mm->shared_vm - mm->stack_vm;
  48        /* split executable areas between text and lib */
  49        text = PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK);
  50        text = min(text, mm->exec_vm << PAGE_SHIFT);
  51        lib = (mm->exec_vm << PAGE_SHIFT) - text;
  52
  53        swap = get_mm_counter(mm, MM_SWAPENTS);
  54        seq_printf(m,
  55                "VmPeak:\t%8lu kB\n"
  56                "VmSize:\t%8lu kB\n"
  57                "VmLck:\t%8lu kB\n"
  58                "VmPin:\t%8lu kB\n"
  59                "VmHWM:\t%8lu kB\n"
  60                "VmRSS:\t%8lu kB\n"
  61                "RssAnon:\t%8lu kB\n"
  62                "RssFile:\t%8lu kB\n"
  63                "RssShmem:\t%8lu kB\n"
  64                "VmData:\t%8lu kB\n"
  65                "VmStk:\t%8lu kB\n"
  66                "VmExe:\t%8lu kB\n"
  67                "VmLib:\t%8lu kB\n"
  68                "VmPTE:\t%8lu kB\n"
  69                "VmSwap:\t%8lu kB\n",
  70                hiwater_vm << (PAGE_SHIFT-10),
  71                total_vm << (PAGE_SHIFT-10),
  72                mm->locked_vm << (PAGE_SHIFT-10),
  73                mm->pinned_vm << (PAGE_SHIFT-10),
  74                hiwater_rss << (PAGE_SHIFT-10),
  75                total_rss << (PAGE_SHIFT-10),
  76                anon << (PAGE_SHIFT-10),
  77                file << (PAGE_SHIFT-10),
  78                shmem << (PAGE_SHIFT-10),
  79                data << (PAGE_SHIFT-10),
  80                mm->stack_vm << (PAGE_SHIFT-10),
  81                text >> 10,
  82                lib >> 10,
  83                (PTRS_PER_PTE * sizeof(pte_t) *
  84                 atomic_long_read(&mm->nr_ptes)) >> 10,
  85                swap << (PAGE_SHIFT-10));
  86}
  87
  88unsigned long task_vsize(struct mm_struct *mm)
  89{
  90        return PAGE_SIZE * mm->total_vm;
  91}
  92
  93unsigned long task_statm(struct mm_struct *mm,
  94                         unsigned long *shared, unsigned long *text,
  95                         unsigned long *data, unsigned long *resident)
  96{
  97        *shared = get_mm_counter(mm, MM_FILEPAGES) +
  98                        get_mm_counter(mm, MM_SHMEMPAGES);
  99        *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
 100                                                                >> PAGE_SHIFT;
 101        *data = mm->total_vm - mm->shared_vm;
 102        *resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
 103        return mm->total_vm;
 104}
 105
 106static void pad_len_spaces(struct seq_file *m, int len)
 107{
 108        len = 25 + sizeof(void*) * 6 - len;
 109        if (len < 1)
 110                len = 1;
 111        seq_printf(m, "%*c", len, ' ');
 112}
 113
 114#ifdef CONFIG_NUMA
 115/*
 116 * Save get_task_policy() for show_numa_map().
 117 */
 118static void hold_task_mempolicy(struct proc_maps_private *priv)
 119{
 120        struct task_struct *task = priv->task;
 121
 122        task_lock(task);
 123        priv->task_mempolicy = get_task_policy(task);
 124        mpol_get(priv->task_mempolicy);
 125        task_unlock(task);
 126}
 127static void release_task_mempolicy(struct proc_maps_private *priv)
 128{
 129        mpol_put(priv->task_mempolicy);
 130}
 131#else
 132static void hold_task_mempolicy(struct proc_maps_private *priv)
 133{
 134}
 135static void release_task_mempolicy(struct proc_maps_private *priv)
 136{
 137}
 138#endif
 139
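/* Drop the mempolicy, mmap_sem and mm references taken in m_start(). */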
 140static void vma_stop(struct proc_maps_private *priv)
 141{
 142        struct mm_struct *mm = priv->mm;
 143
 144        release_task_mempolicy(priv);
 145        up_read(&mm->mmap_sem);
 146        mmput(mm);
 147}
 148
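/*
 * Return the VMA following @vma, falling back to the gate VMA (tail_vma)
 * after the last "real" one; NULL once the tail has been shown as well.
 */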
 149static struct vm_area_struct *
 150m_next_vma(struct proc_maps_private *priv, struct vm_area_struct *vma)
 151{
 152        if (vma == priv->tail_vma)
 153                return NULL;
 154        return vma->vm_next ?: priv->tail_vma;
 155}
 156
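/*
 * Cache the start of the VMA just shown in m->version so that the next
 * read can resume after it via find_vma() in m_start(); -1UL marks the
 * end of the walk.
 */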
 157static void m_cache_vma(struct seq_file *m, struct vm_area_struct *vma)
 158{
 159        if (m->count < m->size) /* vma is copied successfully */
 160                m->version = m_next_vma(m->private, vma) ? vma->vm_start : -1UL;
 161}
 162
 163static void *m_start(struct seq_file *m, loff_t *ppos)
 164{
 165        struct proc_maps_private *priv = m->private;
 166        unsigned long last_addr = m->version;
 167        struct mm_struct *mm;
 168        struct vm_area_struct *vma;
 169        unsigned int pos = *ppos;
 170
 171        /* See m_cache_vma(). Zero at the start or after lseek. */
 172        if (last_addr == -1UL)
 173                return NULL;
 174
 175        priv->task = get_proc_task(priv->inode);
 176        if (!priv->task)
 177                return ERR_PTR(-ESRCH);
 178
 179        mm = priv->mm;
 180        if (!mm || !atomic_inc_not_zero(&mm->mm_users))
 181                return NULL;
 182
 183        down_read(&mm->mmap_sem);
 184        hold_task_mempolicy(priv);
 185        priv->tail_vma = get_gate_vma(mm);
 186
 187        if (last_addr) {
 188                vma = find_vma(mm, last_addr);
 189                if (vma && (vma = m_next_vma(priv, vma)))
 190                        return vma;
 191        }
 192
 193        m->version = 0;
 194        if (pos < mm->map_count) {
 195                for (vma = mm->mmap; pos; pos--) {
 196                        m->version = vma->vm_start;
 197                        vma = vma->vm_next;
 198                }
 199                return vma;
 200        }
 201
 202        /* we do not bother to update m->version in this case */
 203        if (pos == mm->map_count && priv->tail_vma)
 204                return priv->tail_vma;
 205
 206        vma_stop(priv);
 207        return NULL;
 208}
 209
 210static void *m_next(struct seq_file *m, void *v, loff_t *pos)
 211{
 212        struct proc_maps_private *priv = m->private;
 213        struct vm_area_struct *next;
 214
 215        (*pos)++;
 216        next = m_next_vma(priv, v);
 217        if (!next)
 218                vma_stop(priv);
 219        return next;
 220}
 221
 222static void m_stop(struct seq_file *m, void *v)
 223{
 224        struct proc_maps_private *priv = m->private;
 225
 226        if (!IS_ERR_OR_NULL(v))
 227                vma_stop(priv);
 228        if (priv->task) {
 229                put_task_struct(priv->task);
 230                priv->task = NULL;
 231        }
 232}
 233
 234static int proc_maps_open(struct inode *inode, struct file *file,
 235                        const struct seq_operations *ops, int psize)
 236{
 237        struct proc_maps_private *priv = __seq_open_private(file, ops, psize);
 238
 239        if (!priv)
 240                return -ENOMEM;
 241
 242        priv->inode = inode;
 243        priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
 244        if (IS_ERR(priv->mm)) {
 245                int err = PTR_ERR(priv->mm);
 246
 247                seq_release_private(inode, file);
 248                return err;
 249        }
 250
 251        return 0;
 252}
 253
 254static int proc_map_release(struct inode *inode, struct file *file)
 255{
 256        struct seq_file *seq = file->private_data;
 257        struct proc_maps_private *priv = seq->private;
 258
 259        if (priv->mm)
 260                mmdrop(priv->mm);
 261
 262        return seq_release_private(inode, file);
 263}
 264
 265static int do_maps_open(struct inode *inode, struct file *file,
 266                        const struct seq_operations *ops)
 267{
 268        return proc_maps_open(inode, file, ops,
 269                                sizeof(struct proc_maps_private));
 270}
 271
 272/*
 273 * Indicate if the VMA is a stack for the given task; for
 274 * /proc/PID/maps that is the stack of the main task.
 275 */
 276static int is_stack(struct proc_maps_private *priv,
 277                    struct vm_area_struct *vma, int is_pid)
 278{
 279        int stack = 0;
 280
 281        if (is_pid) {
 282                stack = vma->vm_start <= vma->vm_mm->start_stack &&
 283                        vma->vm_end >= vma->vm_mm->start_stack;
 284        } else {
 285                struct inode *inode = priv->inode;
 286                struct task_struct *task;
 287
 288                rcu_read_lock();
 289                task = pid_task(proc_pid(inode), PIDTYPE_PID);
 290                if (task)
 291                        stack = vma_is_stack_for_task(vma, task);
 292                rcu_read_unlock();
 293        }
 294        return stack;
 295}
 296
 297static void
 298show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
 299{
 300        struct mm_struct *mm = vma->vm_mm;
 301        struct file *file = vma->vm_file;
 302        struct proc_maps_private *priv = m->private;
 303        vm_flags_t flags = vma->vm_flags;
 304        unsigned long ino = 0;
 305        unsigned long long pgoff = 0;
 306        unsigned long start, end;
 307        dev_t dev = 0;
 308        int len;
 309        const char *name = NULL;
 310
 311        if (file) {
 312                struct inode *inode = file_inode(vma->vm_file);
 313                dev = inode->i_sb->s_dev;
 314                ino = inode->i_ino;
 315                pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
 316        }
 317
 319        start = vma->vm_start;
 320        end = vma->vm_end;
 321
 322        seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
 323                        start,
 324                        end,
 325                        flags & VM_READ ? 'r' : '-',
 326                        flags & VM_WRITE ? 'w' : '-',
 327                        flags & VM_EXEC ? 'x' : '-',
 328                        flags & VM_MAYSHARE ? 's' : 'p',
 329                        pgoff,
 330                        MAJOR(dev), MINOR(dev), ino, &len);
 331
 332        /*
 333         * Print the dentry name for named mappings, and a
 334         * special [heap] marker for the heap:
 335         */
 336        if (file) {
 337                pad_len_spaces(m, len);
 338                seq_path(m, &file->f_path, "\n");
 339                goto done;
 340        }
 341
 342        name = arch_vma_name(vma);
 343        if (!name) {
 344                if (!mm) {
 345                        name = "[vdso]";
 346                        goto done;
 347                }
 348
 349                if (vma->vm_start <= mm->brk &&
 350                    vma->vm_end >= mm->start_brk) {
 351                        name = "[heap]";
 352                        goto done;
 353                }
 354
 355                if (is_stack(priv, vma, is_pid))
 356                        name = "[stack]";
 357        }
 358
 359done:
 360        if (name) {
 361                pad_len_spaces(m, len);
 362                seq_puts(m, name);
 363        }
 364        seq_putc(m, '\n');
 365}
 366
 367static int show_map(struct seq_file *m, void *v, int is_pid)
 368{
 369        show_map_vma(m, v, is_pid);
 370        m_cache_vma(m, v);
 371        return 0;
 372}
 373
 374static int show_pid_map(struct seq_file *m, void *v)
 375{
 376        return show_map(m, v, 1);
 377}
 378
 379static int show_tid_map(struct seq_file *m, void *v)
 380{
 381        return show_map(m, v, 0);
 382}
 383
 384static const struct seq_operations proc_pid_maps_op = {
 385        .start  = m_start,
 386        .next   = m_next,
 387        .stop   = m_stop,
 388        .show   = show_pid_map
 389};
 390
 391static const struct seq_operations proc_tid_maps_op = {
 392        .start  = m_start,
 393        .next   = m_next,
 394        .stop   = m_stop,
 395        .show   = show_tid_map
 396};
 397
 398static int pid_maps_open(struct inode *inode, struct file *file)
 399{
 400        return do_maps_open(inode, file, &proc_pid_maps_op);
 401}
 402
 403static int tid_maps_open(struct inode *inode, struct file *file)
 404{
 405        return do_maps_open(inode, file, &proc_tid_maps_op);
 406}
 407
 408const struct file_operations proc_pid_maps_operations = {
 409        .open           = pid_maps_open,
 410        .read           = seq_read,
 411        .llseek         = seq_lseek,
 412        .release        = proc_map_release,
 413};
 414
 415const struct file_operations proc_tid_maps_operations = {
 416        .open           = tid_maps_open,
 417        .read           = seq_read,
 418        .llseek         = seq_lseek,
 419        .release        = proc_map_release,
 420};
 421
 422/*
  423 * Proportional Set Size (PSS): my share of RSS.
 424 *
 425 * PSS of a process is the count of pages it has in memory, where each
 426 * page is divided by the number of processes sharing it.  So if a
 427 * process has 1000 pages all to itself, and 1000 shared with one other
 428 * process, its PSS will be 1500.
 429 *
  430 * To keep (accumulated) division errors low, we use a 64-bit
  431 * fixed-point pss counter: (pss >> PSS_SHIFT) is then the real
  432 * byte count.
 433 *
 434 * A shift of 12 before division means (assuming 4K page size):
 435 *      - 1M 3-user-pages add up to 8KB errors;
 436 *      - supports mapcount up to 2^24, or 16M;
 437 *      - supports PSS up to 2^52 bytes, or 4PB.
 438 */
 439#define PSS_SHIFT 12
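/*
 * Worked example (a sketch, using the 4K page size assumed above): a page
 * mapped by three processes adds (4096 << PSS_SHIFT) / 3 to each of their
 * pss counters; the value reported to userspace is pss >> PSS_SHIFT in
 * bytes, or pss >> (PSS_SHIFT + 10) in kB, as done in show_smap() below.
 */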
 440
 441#ifdef CONFIG_PROC_PAGE_MONITOR
 442struct mem_size_stats {
 443        struct vm_area_struct *vma;
 444        unsigned long resident;
 445        unsigned long shared_clean;
 446        unsigned long shared_dirty;
 447        unsigned long private_clean;
 448        unsigned long private_dirty;
 449        unsigned long referenced;
 450        unsigned long anonymous;
 451        unsigned long anonymous_thp;
 452        unsigned long swap;
 453        unsigned long nonlinear;
 454        u64 pss;
 455        bool check_shmem_swap;
 456};
 457
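/*
 * Fold one mapped page (or huge page) of @size bytes into @mss: split
 * shared vs private and clean vs dirty by mapcount and dirty state, and
 * accumulate the fixed-point PSS share.
 */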
 458static void smaps_account(struct mem_size_stats *mss, struct page *page,
 459                unsigned long size, bool young, bool dirty)
 460{
 461        int mapcount;
 462
 463        if (PageAnon(page))
 464                mss->anonymous += size;
 465
 466        mss->resident += size;
 467        /* Accumulate the size in pages that have been accessed. */
 468        if (young || page_is_young(page) || PageReferenced(page))
 469                mss->referenced += size;
 470        mapcount = page_mapcount(page);
 471        if (mapcount >= 2) {
 472                u64 pss_delta;
 473
 474                if (dirty || PageDirty(page))
 475                        mss->shared_dirty += size;
 476                else
 477                        mss->shared_clean += size;
 478                pss_delta = (u64)size << PSS_SHIFT;
 479                do_div(pss_delta, mapcount);
 480                mss->pss += pss_delta;
 481        } else {
 482                if (dirty || PageDirty(page))
 483                        mss->private_dirty += size;
 484                else
 485                        mss->private_clean += size;
 486                mss->pss += (u64)size << PSS_SHIFT;
 487        }
 488}
 489
 490#ifdef CONFIG_SHMEM
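/*
 * A hole in a shmem mapping may still be backed by swap: account it from
 * the mapping's radix tree rather than from the (absent) page tables.
 */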
 491static int smaps_pte_hole(unsigned long addr, unsigned long end,
 492                struct mm_walk *walk)
 493{
 494        struct mem_size_stats *mss = walk->private;
 495
 496        mss->swap += shmem_partial_swap_usage(
 497                        mss->vma->vm_file->f_mapping, addr, end);
 498
 499        return 0;
 500}
 501#endif
 502
 503static void smaps_pte_entry(pte_t *pte, unsigned long addr,
 504                struct mm_walk *walk)
 505{
 506        struct mem_size_stats *mss = walk->private;
 507        struct vm_area_struct *vma = mss->vma;
 508        pgoff_t pgoff = linear_page_index(vma, addr);
 509        struct page *page = NULL;
 510
 511        if (pte_present(*pte)) {
 512                page = vm_normal_page(vma, addr, *pte);
 513        } else if (is_swap_pte(*pte)) {
 514                swp_entry_t swpent = pte_to_swp_entry(*pte);
 515
 516                if (!non_swap_entry(swpent))
 517                        mss->swap += PAGE_SIZE;
 518                else if (is_migration_entry(swpent))
 519                        page = migration_entry_to_page(swpent);
 520                else if (is_hmm_entry(swpent))
 521                        page = hmm_entry_to_page(swpent);
 522        } else if (pte_file(*pte)) {
 523                if (pte_to_pgoff(*pte) != pgoff)
 524                        mss->nonlinear += PAGE_SIZE;
 525        } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
 526                            && pte_none(*pte))) {
 527                page = find_get_page(vma->vm_file->f_mapping,
 528                                                linear_page_index(vma, addr));
 529                if (!page)
 530                        return;
 531
 532                if (radix_tree_exceptional_entry(page))
 533                        mss->swap += PAGE_SIZE;
 534                else
 535                        page_cache_release(page);
 536
 537                return;
 538        }
 539
 540        if (!page)
 541                return;
 542
 543        if (page->index != pgoff)
 544                mss->nonlinear += PAGE_SIZE;
 545
 546        smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte));
 547}
 548
 549#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 550static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
 551                struct mm_walk *walk)
 552{
 553        struct mem_size_stats *mss = walk->private;
 554        struct vm_area_struct *vma = mss->vma;
 555        struct page *page;
 556
 557        /* FOLL_DUMP will return -EFAULT on huge zero page */
 558        page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP);
 559        if (IS_ERR_OR_NULL(page))
 560                return;
 561        mss->anonymous_thp += HPAGE_PMD_SIZE;
 562        smaps_account(mss, page, HPAGE_PMD_SIZE,
 563                        pmd_young(*pmd), pmd_dirty(*pmd));
 564}
 565#else
 566static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
 567                struct mm_walk *walk)
 568{
 569}
 570#endif
 571
 572static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 573                           struct mm_walk *walk)
 574{
 575        struct mem_size_stats *mss = walk->private;
 576        struct vm_area_struct *vma = mss->vma;
 577        pte_t *pte;
 578        spinlock_t *ptl;
 579
 580        if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
 581                smaps_pmd_entry(pmd, addr, walk);
 582                spin_unlock(ptl);
 583                return 0;
 584        }
 585
 586        if (pmd_trans_unstable(pmd))
 587                return 0;
 588        /*
 589         * The mmap_sem held all the way back in m_start() is what
 590         * keeps khugepaged out of here and from collapsing things
 591         * in here.
 592         */
 593        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 594        for (; addr != end; pte++, addr += PAGE_SIZE)
 595                smaps_pte_entry(pte, addr, walk);
 596        pte_unmap_unlock(pte - 1, ptl);
 597        cond_resched();
 598        return 0;
 599}
 600
 601static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
 602{
 603        /*
 604         * Don't forget to update Documentation/ on changes.
 605         */
 606        static const char mnemonics[BITS_PER_LONG][2] = {
 607                /*
  608                 * In case we meet a flag we don't know about.
 609                 */
 610                [0 ... (BITS_PER_LONG-1)] = "??",
 611
 612                [ilog2(VM_READ)]        = "rd",
 613                [ilog2(VM_WRITE)]       = "wr",
 614                [ilog2(VM_EXEC)]        = "ex",
 615                [ilog2(VM_SHARED)]      = "sh",
 616                [ilog2(VM_MAYREAD)]     = "mr",
 617                [ilog2(VM_MAYWRITE)]    = "mw",
 618                [ilog2(VM_MAYEXEC)]     = "me",
 619                [ilog2(VM_MAYSHARE)]    = "ms",
 620                [ilog2(VM_GROWSDOWN)]   = "gd",
 621                [ilog2(VM_PFNMAP)]      = "pf",
 622                [ilog2(VM_DENYWRITE)]   = "dw",
 623#ifdef CONFIG_X86_INTEL_MPX
 624                [ilog2(VM_MPX)]         = "mp",
 625#endif
 626                [ilog2(VM_LOCKONFAULT)] = "lf",
 627                [ilog2(VM_LOCKED)]      = "lo",
 628                [ilog2(VM_IO)]          = "io",
 629                [ilog2(VM_SEQ_READ)]    = "sr",
 630                [ilog2(VM_RAND_READ)]   = "rr",
 631                [ilog2(VM_DONTCOPY)]    = "dc",
 632                [ilog2(VM_DONTEXPAND)]  = "de",
 633                [ilog2(VM_SYNC)]        = "sf",
 634                [ilog2(VM_ACCOUNT)]     = "ac",
 635                [ilog2(VM_NORESERVE)]   = "nr",
 636                [ilog2(VM_HUGETLB)]     = "ht",
 637                [ilog2(VM_NONLINEAR)]   = "nl",
 638                [ilog2(VM_ARCH_1)]      = "ar",
 639                [ilog2(VM_WIPEONFORK)]  = "wf",
 640                [ilog2(VM_DONTDUMP)]    = "dd",
 641#ifdef CONFIG_MEM_SOFT_DIRTY
 642                [ilog2(VM_SOFTDIRTY)]   = "sd",
 643#endif
 644                [ilog2(VM_MIXEDMAP)]    = "mm",
 645                [ilog2(VM_HUGEPAGE)]    = "hg",
 646                [ilog2(VM_NOHUGEPAGE)]  = "nh",
 647                [ilog2(VM_MERGEABLE)]   = "mg",
 648                [ilog2(VM_UFFD_MISSING)]= "um",
 649                [ilog2(VM_UFFD_WP)]     = "uw",
 650#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
 651                /* These come out via ProtectionKey: */
 652                [ilog2(VM_PKEY_BIT0)]   = "",
 653                [ilog2(VM_PKEY_BIT1)]   = "",
 654                [ilog2(VM_PKEY_BIT2)]   = "",
 655                [ilog2(VM_PKEY_BIT3)]   = "",
 656#endif
 657        };
 658        size_t i;
 659
 660        seq_puts(m, "VmFlags: ");
 661        for (i = 0; i < BITS_PER_LONG; i++) {
 662                if (!mnemonics[i][0])
 663                        continue;
 664                if (vma->vm_flags & (1UL << i)) {
 665                        seq_printf(m, "%c%c ",
 666                                   mnemonics[i][0], mnemonics[i][1]);
 667                }
 668        }
 669        seq_putc(m, '\n');
 670}
 671
 672void __weak arch_show_smap(struct seq_file *m, struct vm_area_struct *vma)
 673{
 674}
 675
 676static int show_smap(struct seq_file *m, void *v, int is_pid)
 677{
 678        struct vm_area_struct *vma = v;
 679        struct mem_size_stats mss;
 680        struct mm_walk smaps_walk = {
 681                .pmd_entry = smaps_pte_range,
 682                .mm = vma->vm_mm,
 683                .private = &mss,
 684        };
 685
 686        memset(&mss, 0, sizeof mss);
 687        mss.vma = vma;
 688        /* mmap_sem is held in m_start */
 689        if (vma->vm_mm && !is_vm_hugetlb_page(vma)) {
 690#ifdef CONFIG_SHMEM
 691                if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
 692                        /*
 693                         * For shared or readonly shmem mappings we know that
 694                         * all swapped out pages belong to the shmem object,
 695                         * and we can obtain the swap value much more
 696                         * efficiently. For private writable mappings, we might
 697                         * have COW pages that are not affected by the parent
 698                         * swapped out pages of the shmem object, so we have to
 699                         * distinguish them during the page walk. Unless we
 700                         * know that the shmem object (or the part mapped by
 701                         * our VMA) has no swapped out pages at all.
 702                         */
 703                        unsigned long shmem_swapped = shmem_swap_usage(vma);
 704
 705                        if (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
 706                            !(vma->vm_flags & VM_WRITE)) {
 707                                mss.swap = shmem_swapped;
 708                        } else {
 709                                mss.check_shmem_swap = true;
 710                                smaps_walk.pte_hole = smaps_pte_hole;
 711                        }
 712                }
 713#endif
 714                walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
 715        }
 716
 717        show_map_vma(m, vma, is_pid);
 718
 719        seq_printf(m,
 720                   "Size:           %8lu kB\n"
 721                   "Rss:            %8lu kB\n"
 722                   "Pss:            %8lu kB\n"
 723                   "Shared_Clean:   %8lu kB\n"
 724                   "Shared_Dirty:   %8lu kB\n"
 725                   "Private_Clean:  %8lu kB\n"
 726                   "Private_Dirty:  %8lu kB\n"
 727                   "Referenced:     %8lu kB\n"
 728                   "Anonymous:      %8lu kB\n"
 729                   "AnonHugePages:  %8lu kB\n"
 730                   "Swap:           %8lu kB\n"
 731                   "KernelPageSize: %8lu kB\n"
 732                   "MMUPageSize:    %8lu kB\n"
 733                   "Locked:         %8lu kB\n",
 734                   (vma->vm_end - vma->vm_start) >> 10,
 735                   mss.resident >> 10,
 736                   (unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
 737                   mss.shared_clean  >> 10,
 738                   mss.shared_dirty  >> 10,
 739                   mss.private_clean >> 10,
 740                   mss.private_dirty >> 10,
 741                   mss.referenced >> 10,
 742                   mss.anonymous >> 10,
 743                   mss.anonymous_thp >> 10,
 744                   mss.swap >> 10,
 745                   vma_kernel_pagesize(vma) >> 10,
 746                   vma_mmu_pagesize(vma) >> 10,
 747                   (vma->vm_flags & VM_LOCKED) ?
 748                        (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
 749
 750        if (vma->vm_flags & VM_NONLINEAR)
 751                seq_printf(m, "Nonlinear:      %8lu kB\n",
 752                                mss.nonlinear >> 10);
 753
 754        arch_show_smap(m, vma);
 755        show_smap_vma_flags(m, vma);
 756        m_cache_vma(m, vma);
 757        return 0;
 758}
 759
 760static int show_pid_smap(struct seq_file *m, void *v)
 761{
 762        return show_smap(m, v, 1);
 763}
 764
 765static int show_tid_smap(struct seq_file *m, void *v)
 766{
 767        return show_smap(m, v, 0);
 768}
 769
 770static const struct seq_operations proc_pid_smaps_op = {
 771        .start  = m_start,
 772        .next   = m_next,
 773        .stop   = m_stop,
 774        .show   = show_pid_smap
 775};
 776
 777static const struct seq_operations proc_tid_smaps_op = {
 778        .start  = m_start,
 779        .next   = m_next,
 780        .stop   = m_stop,
 781        .show   = show_tid_smap
 782};
 783
 784static int pid_smaps_open(struct inode *inode, struct file *file)
 785{
 786        return do_maps_open(inode, file, &proc_pid_smaps_op);
 787}
 788
 789static int tid_smaps_open(struct inode *inode, struct file *file)
 790{
 791        return do_maps_open(inode, file, &proc_tid_smaps_op);
 792}
 793
 794const struct file_operations proc_pid_smaps_operations = {
 795        .open           = pid_smaps_open,
 796        .read           = seq_read,
 797        .llseek         = seq_lseek,
 798        .release        = proc_map_release,
 799};
 800
 801const struct file_operations proc_tid_smaps_operations = {
 802        .open           = tid_smaps_open,
 803        .read           = seq_read,
 804        .llseek         = seq_lseek,
 805        .release        = proc_map_release,
 806};
 807
 808enum clear_refs_types {
 809        CLEAR_REFS_ALL = 1,
 810        CLEAR_REFS_ANON,
 811        CLEAR_REFS_MAPPED,
 812        CLEAR_REFS_SOFT_DIRTY,
 813        CLEAR_REFS_LAST,
 814};
 815
 816struct clear_refs_private {
 817        struct vm_area_struct *vma;
 818        enum clear_refs_types type;
 819};
 820
 821static inline void clear_soft_dirty(struct vm_area_struct *vma,
 822                unsigned long addr, pte_t *pte)
 823{
 824#ifdef CONFIG_MEM_SOFT_DIRTY
 825        /*
 826         * The soft-dirty tracker uses #PF-s to catch writes
 827         * to pages, so write-protect the pte as well. See the
 828         * Documentation/vm/soft-dirty.txt for full description
 829         * of how soft-dirty works.
 830         */
 831        pte_t ptent = *pte;
 832
 833        if (pte_present(ptent)) {
 834                ptent = pte_wrprotect(ptent);
 835                ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
 836        } else if (is_swap_pte(ptent)) {
 837                ptent = pte_swp_clear_soft_dirty(ptent);
 838        } else if (pte_file(ptent)) {
 839                ptent = pte_file_clear_soft_dirty(ptent);
 840        }
 841
 842        set_pte_at(vma->vm_mm, addr, pte, ptent);
 843#endif
 844}
 845
 846static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 847                                unsigned long end, struct mm_walk *walk)
 848{
 849        struct clear_refs_private *cp = walk->private;
 850        struct vm_area_struct *vma = cp->vma;
 851        pte_t *pte, ptent;
 852        spinlock_t *ptl;
 853        struct page *page;
 854
 855        split_huge_page_pmd(vma, addr, pmd);
 856        if (pmd_trans_unstable(pmd))
 857                return 0;
 858
 859        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 860        for (; addr != end; pte++, addr += PAGE_SIZE) {
 861                ptent = *pte;
 862
 863                if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
 864                        clear_soft_dirty(vma, addr, pte);
 865                        continue;
 866                }
 867
 868                if (!pte_present(ptent))
 869                        continue;
 870
 871                page = vm_normal_page(vma, addr, ptent);
 872                if (!page)
 873                        continue;
 874
 875                /* Clear accessed and referenced bits. */
 876                ptep_test_and_clear_young(vma, addr, pte);
 877                test_and_clear_page_young(page);
 878                ClearPageReferenced(page);
 879        }
 880        pte_unmap_unlock(pte - 1, ptl);
 881        cond_resched();
 882        return 0;
 883}
 884
 885static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 886                                size_t count, loff_t *ppos)
 887{
 888        struct task_struct *task;
 889        char buffer[PROC_NUMBUF];
 890        struct mm_struct *mm;
 891        struct vm_area_struct *vma;
 892        enum clear_refs_types type;
 893        int itype;
 894        int rv;
 895
 896        memset(buffer, 0, sizeof(buffer));
 897        if (count > sizeof(buffer) - 1)
 898                count = sizeof(buffer) - 1;
 899        if (copy_from_user(buffer, buf, count))
 900                return -EFAULT;
 901        rv = kstrtoint(strstrip(buffer), 10, &itype);
 902        if (rv < 0)
 903                return rv;
 904        type = (enum clear_refs_types)itype;
 905        if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
 906                return -EINVAL;
 907
 908        task = get_proc_task(file_inode(file));
 909        if (!task)
 910                return -ESRCH;
 911        mm = get_task_mm(task);
 912        if (mm) {
 913                struct clear_refs_private cp = {
 914                        .type = type,
 915                };
 916                struct mm_walk clear_refs_walk = {
 917                        .pmd_entry = clear_refs_pte_range,
 918                        .mm = mm,
 919                        .private = &cp,
 920                };
 921                down_read(&mm->mmap_sem);
 922                if (type == CLEAR_REFS_SOFT_DIRTY) {
 923                        for (vma = mm->mmap; vma; vma = vma->vm_next) {
 924                                if (!(vma->vm_flags & VM_SOFTDIRTY))
 925                                        continue;
 926                                up_read(&mm->mmap_sem);
 927                                down_write(&mm->mmap_sem);
 928                                /*
  929                                 * Avoid modifying vma->vm_flags
  930                                 * without the write lock while a
  931                                 * coredump is reading vm_flags.
 932                                 */
 933                                if (!mmget_still_valid(mm)) {
 934                                        /*
  935                                         * Silently return "count",
  936                                         * as if get_task_mm() had
  937                                         * failed. FIXME: should this
  938                                         * function return -ESRCH when
  939                                         * get_task_mm() fails, as it
  940                                         * does when get_proc_task()
  941                                         * fails?
 942                                         */
 943                                        up_write(&mm->mmap_sem);
 944                                        goto out_mm;
 945                                }
 946                                for (vma = mm->mmap; vma; vma = vma->vm_next) {
 947                                        vma->vm_flags &= ~VM_SOFTDIRTY;
 948                                        vma_set_page_prot(vma);
 949                                }
 950                                downgrade_write(&mm->mmap_sem);
 951                                break;
 952                        }
 953                        mmu_notifier_invalidate_range_start(mm, 0, -1);
 954                }
 955                for (vma = mm->mmap; vma; vma = vma->vm_next) {
 956                        cp.vma = vma;
 957                        if (is_vm_hugetlb_page(vma))
 958                                continue;
 959
 960                        if (vma->vm_flags & VM_PFNMAP)
 961                                continue;
 962
 963                        /*
 964                         * Writing 1 to /proc/pid/clear_refs affects all pages.
 965                         *
 966                         * Writing 2 to /proc/pid/clear_refs only affects
 967                         * Anonymous pages.
 968                         *
 969                         * Writing 3 to /proc/pid/clear_refs only affects file
 970                         * mapped pages.
 971                         *
 972                         * Writing 4 to /proc/pid/clear_refs affects all pages.
 973                         */
 974                        if (type == CLEAR_REFS_ANON && vma->vm_file)
 975                                continue;
 976                        if (type == CLEAR_REFS_MAPPED && !vma->vm_file)
 977                                continue;
 978                        walk_page_range(vma->vm_start, vma->vm_end,
 979                                        &clear_refs_walk);
 980                }
 981                if (type == CLEAR_REFS_SOFT_DIRTY)
 982                        mmu_notifier_invalidate_range_end(mm, 0, -1);
 983                flush_tlb_mm(mm);
 984                up_read(&mm->mmap_sem);
 985out_mm:
 986                mmput(mm);
 987        }
 988        put_task_struct(task);
 989
 990        return count;
 991}
 992
 993const struct file_operations proc_clear_refs_operations = {
 994        .write          = clear_refs_write,
 995        .llseek         = noop_llseek,
 996};
 997
 998typedef struct {
 999        u64 pme;
1000} pagemap_entry_t;
1001
1002struct pagemapread {
1003        int pos, len;           /* units: PM_ENTRY_BYTES, not bytes */
1004        pagemap_entry_t *buffer;
1005        bool show_pfn;
1006};
1007
1008#define PAGEMAP_WALK_SIZE       (PMD_SIZE)
1009#define PAGEMAP_WALK_MASK       (PMD_MASK)
1010
1011#define PM_ENTRY_BYTES          sizeof(pagemap_entry_t)
1012#define PM_PFRAME_BITS          55
1013#define PM_PFRAME_MASK          GENMASK_ULL(PM_PFRAME_BITS - 1, 0)
1014#define PM_SOFT_DIRTY           BIT_ULL(55)
1015#define PM_MMAP_EXCLUSIVE       BIT_ULL(56)
1016#define PM_FILE                 BIT_ULL(61)
1017#define PM_SWAP                 BIT_ULL(62)
1018#define PM_PRESENT              BIT_ULL(63)
1019
1020#define PM_END_OF_BUFFER    1
1021
1022static inline pagemap_entry_t make_pme(u64 frame, u64 flags)
1023{
1024        return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags };
1025}
1026
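/*
 * Append one entry to the temporary buffer; returning PM_END_OF_BUFFER
 * stops the walk once the buffer is full.
 */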
1027static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
1028                          struct pagemapread *pm)
1029{
1030        pm->buffer[pm->pos++] = *pme;
1031        if (pm->pos >= pm->len)
1032                return PM_END_OF_BUFFER;
1033        return 0;
1034}
1035
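/*
 * Fill entries for unmapped ranges: plain holes become empty entries,
 * while holes inside a VM_SOFTDIRTY vma keep the PM_SOFT_DIRTY bit.
 */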
1036static int pagemap_pte_hole(unsigned long start, unsigned long end,
1037                                struct mm_walk *walk)
1038{
1039        struct pagemapread *pm = walk->private;
1040        unsigned long addr = start;
1041        int err = 0;
1042
1043        while (addr < end) {
1044                struct vm_area_struct *vma = find_vma(walk->mm, addr);
1045                pagemap_entry_t pme = make_pme(0, 0);
1046                /* End of address space hole, which we mark as non-present. */
1047                unsigned long hole_end;
1048
1049                if (vma)
1050                        hole_end = min(end, vma->vm_start);
1051                else
1052                        hole_end = end;
1053
1054                for (; addr < hole_end; addr += PAGE_SIZE) {
1055                        err = add_to_pagemap(addr, &pme, pm);
1056                        if (err)
1057                                goto out;
1058                }
1059
1060                if (!vma)
1061                        break;
1062
1063                /* Addresses in the VMA. */
1064                if (vma->vm_flags & VM_SOFTDIRTY)
1065                        pme = make_pme(0, PM_SOFT_DIRTY);
1066                for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
1067                        err = add_to_pagemap(addr, &pme, pm);
1068                        if (err)
1069                                goto out;
1070                }
1071        }
1072out:
1073        return err;
1074}
1075
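/*
 * Encode a single pte into the 64-bit pagemap format described in the
 * comment above pagemap_read().
 */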
1076static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
1077                struct vm_area_struct *vma, unsigned long addr, pte_t pte)
1078{
1079        u64 frame = 0, flags = 0;
1080        struct page *page = NULL;
1081
1082        if (pte_present(pte)) {
1083                if (pm->show_pfn)
1084                        frame = pte_pfn(pte);
1085                flags |= PM_PRESENT;
1086                page = vm_normal_page(vma, addr, pte);
1087                if (pte_soft_dirty(pte))
1088                        flags |= PM_SOFT_DIRTY;
1089        } else if (is_swap_pte(pte)) {
1090                swp_entry_t entry;
1091                if (pte_swp_soft_dirty(pte))
1092                        flags |= PM_SOFT_DIRTY;
1093                entry = pte_to_swp_entry(pte);
1094                if (pm->show_pfn)
1095                        frame = swp_type(entry) |
1096                                (swp_offset(entry) << MAX_SWAPFILES_SHIFT);
1097                flags |= PM_SWAP;
1098                if (is_migration_entry(entry))
1099                        page = migration_entry_to_page(entry);
1100                else if (is_hmm_entry(entry))
1101                        page = hmm_entry_to_page(entry);
1102        }
1103
1104        if (page && !PageAnon(page))
1105                flags |= PM_FILE;
1106        if (page && page_mapcount(page) == 1)
1107                flags |= PM_MMAP_EXCLUSIVE;
1108        if (vma->vm_flags & VM_SOFTDIRTY)
1109                flags |= PM_SOFT_DIRTY;
1110
1111        return make_pme(frame, flags);
1112}
1113
1114static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
1115                             struct mm_walk *walk)
1116{
1117        struct vm_area_struct *vma;
1118        struct pagemapread *pm = walk->private;
1119        spinlock_t *ptl;
1120        pte_t *pte;
1121        int err = 0;
1122
1123        /* find the first VMA at or above 'addr' */
1124        vma = find_vma(walk->mm, addr);
1125#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1126        if (vma && pmd_trans_huge_lock(pmdp, vma, &ptl) == 1) {
1127                u64 flags = 0, frame = 0;
1128                pmd_t pmd = *pmdp;
1129
1130                if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(pmd))
1131                        flags |= PM_SOFT_DIRTY;
1132
1133                /*
 1134                 * Currently a THP pmd is always present, because THPs
 1135                 * cannot be swapped out, migrated or HWPOISONed (they are
 1136                 * split in such cases instead).
 1137                 * This check only prepares for a future implementation.
1138                 */
1139                if (pmd_present(pmd)) {
1140                        struct page *page = pmd_page(pmd);
1141
1142                        if (page_mapcount(page) == 1)
1143                                flags |= PM_MMAP_EXCLUSIVE;
1144
1145                        flags |= PM_PRESENT;
1146                        if (pm->show_pfn)
1147                                frame = pmd_pfn(pmd) +
1148                                        ((addr & ~PMD_MASK) >> PAGE_SHIFT);
1149                }
1150
1151                for (; addr != end; addr += PAGE_SIZE) {
1152                        pagemap_entry_t pme = make_pme(frame, flags);
1153                        err = add_to_pagemap(addr, &pme, pm);
1154                        if (err)
1155                                break;
1156                        if (pm->show_pfn) {
1157                                if (flags & PM_PRESENT)
1158                                        frame++;
1159                                else if (flags & PM_SWAP)
1160                                        frame += (1 << MAX_SWAPFILES_SHIFT);
1161                        }
1162                }
1163                spin_unlock(ptl);
1164                return err;
1165        }
1166
1167        if (pmd_trans_unstable(pmdp))
1168                return 0;
1169#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1170
1171        while (1) {
1172                /* End of address space hole, which we mark as non-present. */
1173                unsigned long hole_end;
1174
1175                if (vma)
1176                        hole_end = min(end, vma->vm_start);
1177                else
1178                        hole_end = end;
1179
1180                for (; addr < hole_end; addr += PAGE_SIZE) {
1181                        pagemap_entry_t pme = make_pme(0, 0);
1182
1183                        err = add_to_pagemap(addr, &pme, pm);
1184                        if (err)
1185                                return err;
1186                }
1187
1188                if (!vma || vma->vm_start >= end)
1189                        break;
1190                /*
1191                 * We can't possibly be in a hugetlb VMA. In general,
1192                 * for a mm_walk with a pmd_entry and a hugetlb_entry,
1193                 * the pmd_entry can only be called on addresses in a
1194                 * hugetlb if the walk starts in a non-hugetlb VMA and
1195                 * spans a hugepage VMA. Since pagemap_read walks are
1196                 * PMD-sized and PMD-aligned, this will never be true.
1197                 */
1198                BUG_ON(is_vm_hugetlb_page(vma));
1199
1200                /* Addresses in the VMA. */
1201                for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
1202                        pagemap_entry_t pme;
1203                        pte = pte_offset_map(pmdp, addr);
1204                        pme = pte_to_pagemap_entry(pm, vma, addr, *pte);
1205                        pte_unmap(pte);
1206                        err = add_to_pagemap(addr, &pme, pm);
1207                        if (err)
1208                                return err;
1209                }
1210
1211                if (addr == end)
1212                        break;
1213
1214                vma = find_vma(walk->mm, addr);
1215        }
1216
1217        cond_resched();
1218
1219        return err;
1220}
1221
1222#ifdef CONFIG_HUGETLB_PAGE
 1223/* This function walks a single hugetlb entry per call */
1224static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
1225                                 unsigned long addr, unsigned long end,
1226                                 struct mm_walk *walk)
1227{
1228        struct pagemapread *pm = walk->private;
1229        struct vm_area_struct *vma;
1230        u64 flags = 0, frame = 0;
1231        int err = 0;
1232        pte_t pte;
1233
1234        vma = find_vma(walk->mm, addr);
1235        WARN_ON_ONCE(!vma);
1236
1237        if (vma && (vma->vm_flags & VM_SOFTDIRTY))
1238                flags |= PM_SOFT_DIRTY;
1239
1240        pte = huge_ptep_get(ptep);
1241        if (pte_present(pte)) {
1242                struct page *page = pte_page(pte);
1243
1244                if (!PageAnon(page))
1245                        flags |= PM_FILE;
1246
1247                if (page_mapcount(page) == 1)
1248                        flags |= PM_MMAP_EXCLUSIVE;
1249
1250                flags |= PM_PRESENT;
1251                if (pm->show_pfn)
1252                        frame = pte_pfn(pte) +
1253                                ((addr & ~hmask) >> PAGE_SHIFT);
1254        }
1255
1256        for (; addr != end; addr += PAGE_SIZE) {
1257                pagemap_entry_t pme = make_pme(frame, flags);
1258
1259                err = add_to_pagemap(addr, &pme, pm);
1260                if (err)
1261                        return err;
1262                if (pm->show_pfn && (flags & PM_PRESENT))
1263                        frame++;
1264        }
1265
1266        cond_resched();
1267
1268        return err;
1269}
 1270#endif /* CONFIG_HUGETLB_PAGE */
1271
1272/*
1273 * /proc/pid/pagemap - an array mapping virtual pages to pfns
1274 *
1275 * For each page in the address space, this file contains one 64-bit entry
1276 * consisting of the following:
1277 *
1278 * Bits 0-54  page frame number (PFN) if present
1279 * Bits 0-4   swap type if swapped
1280 * Bits 5-54  swap offset if swapped
1281 * Bit  55    pte is soft-dirty (see Documentation/vm/soft-dirty.txt)
1282 * Bit  56    page exclusively mapped
1283 * Bits 57-60 zero
1284 * Bit  61    page is file-page or shared-anon
1285 * Bit  62    page swapped
1286 * Bit  63    page present
1287 *
1288 * If the page is not present but in swap, then the PFN contains an
1289 * encoding of the swap file number and the page's offset into the
1290 * swap. Unmapped pages return a null PFN. This allows determining
1291 * precisely which pages are mapped (or in swap) and comparing mapped
1292 * pages between processes.
1293 *
1294 * Efficient users of this interface will use /proc/pid/maps to
1295 * determine which areas of memory are actually mapped and llseek to
1296 * skip over unmapped regions.
1297 */
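/*
 * Illustrative userspace sketch (not part of this file; it merely applies
 * the bit layout above): seek to (vaddr / page_size) * 8 bytes and decode
 * one 64-bit entry.
 *
 *	int fd = open("/proc/self/pagemap", O_RDONLY);
 *	uint64_t ent;
 *	off_t off = (vaddr / sysconf(_SC_PAGESIZE)) * sizeof(ent);
 *
 *	if (pread(fd, &ent, sizeof(ent), off) == sizeof(ent)) {
 *		int present = (ent >> 63) & 1;
 *		int swapped = (ent >> 62) & 1;
 *		uint64_t pfn = ent & ((1ULL << 55) - 1);
 *	}
 *	close(fd);
 *
 * The PFN field is only filled in when the opener has CAP_SYS_ADMIN; see
 * pm.show_pfn in pagemap_read() below.
 */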
1298static ssize_t pagemap_read(struct file *file, char __user *buf,
1299                            size_t count, loff_t *ppos)
1300{
1301        struct mm_struct *mm = file->private_data;
1302        struct pagemapread pm;
1303        struct mm_walk pagemap_walk = {};
1304        unsigned long src;
1305        unsigned long svpfn;
1306        unsigned long start_vaddr;
1307        unsigned long end_vaddr;
1308        int ret = 0, copied = 0;
1309
1310        if (!mm || !atomic_inc_not_zero(&mm->mm_users))
1311                goto out;
1312
1313        ret = -EINVAL;
1314        /* file position must be aligned */
1315        if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES))
1316                goto out_mm;
1317
1318        ret = 0;
1319        if (!count)
1320                goto out_mm;
1321
1322        /* do not disclose physical addresses: attack vector */
1323        pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN);
1324
1325        pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
1326        pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY);
1327        ret = -ENOMEM;
1328        if (!pm.buffer)
1329                goto out_mm;
1330
1331        pagemap_walk.pmd_entry = pagemap_pmd_range;
1332        pagemap_walk.pte_hole = pagemap_pte_hole;
1333#ifdef CONFIG_HUGETLB_PAGE
1334        pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
1335#endif
1336        pagemap_walk.mm = mm;
1337        pagemap_walk.private = &pm;
1338
1339        src = *ppos;
1340        svpfn = src / PM_ENTRY_BYTES;
1341        start_vaddr = svpfn << PAGE_SHIFT;
1342        end_vaddr = mm->task_size;
1343
1344        /* watch out for wraparound */
1345        if (svpfn > mm->task_size >> PAGE_SHIFT)
1346                start_vaddr = end_vaddr;
1347
1348        /*
1349         * The odds are that this will stop walking way
1350         * before end_vaddr, because the length of the
1351         * user buffer is tracked in "pm", and the walk
1352         * will stop when we hit the end of the buffer.
1353         */
1354        ret = 0;
1355        while (count && (start_vaddr < end_vaddr)) {
1356                int len;
1357                unsigned long end;
1358
1359                pm.pos = 0;
1360                end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK;
1361                /* overflow ? */
1362                if (end < start_vaddr || end > end_vaddr)
1363                        end = end_vaddr;
1364                down_read(&mm->mmap_sem);
1365                ret = walk_page_range(start_vaddr, end, &pagemap_walk);
1366                up_read(&mm->mmap_sem);
1367                start_vaddr = end;
1368
1369                len = min(count, PM_ENTRY_BYTES * pm.pos);
1370                if (copy_to_user(buf, pm.buffer, len)) {
1371                        ret = -EFAULT;
1372                        goto out_free;
1373                }
1374                copied += len;
1375                buf += len;
1376                count -= len;
1377        }
1378        *ppos += copied;
1379        if (!ret || ret == PM_END_OF_BUFFER)
1380                ret = copied;
1381
1382out_free:
1383        kfree(pm.buffer);
1384out_mm:
1385        mmput(mm);
1386out:
1387        return ret;
1388}
1389
1390static int pagemap_open(struct inode *inode, struct file *file)
1391{
1392        struct mm_struct *mm;
1393
1394        mm = proc_mem_open(inode, PTRACE_MODE_READ);
1395        if (IS_ERR(mm))
1396                return PTR_ERR(mm);
1397        file->private_data = mm;
1398        return 0;
1399}
1400
1401static int pagemap_release(struct inode *inode, struct file *file)
1402{
1403        struct mm_struct *mm = file->private_data;
1404
1405        if (mm)
1406                mmdrop(mm);
1407        return 0;
1408}
1409
1410const struct file_operations proc_pagemap_operations = {
1411        .llseek         = mem_lseek, /* borrow this */
1412        .read           = pagemap_read,
1413        .open           = pagemap_open,
1414        .release        = pagemap_release,
1415};
1416#endif /* CONFIG_PROC_PAGE_MONITOR */
1417
1418#ifdef CONFIG_NUMA
1419
1420struct numa_maps {
1421        struct vm_area_struct *vma;
1422        unsigned long pages;
1423        unsigned long anon;
1424        unsigned long active;
1425        unsigned long writeback;
1426        unsigned long mapcount_max;
1427        unsigned long dirty;
1428        unsigned long swapcache;
1429        unsigned long node[MAX_NUMNODES];
1430};
1431
1432struct numa_maps_private {
1433        struct proc_maps_private proc_maps;
1434        struct numa_maps md;
1435};
1436
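/* Fold @nr_pages worth of @page into the per-vma NUMA statistics in @md. */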
1437static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty,
1438                        unsigned long nr_pages)
1439{
1440        int count = page_mapcount(page);
1441
1442        md->pages += nr_pages;
1443        if (pte_dirty || PageDirty(page))
1444                md->dirty += nr_pages;
1445
1446        if (PageSwapCache(page))
1447                md->swapcache += nr_pages;
1448
1449        if (PageActive(page) || PageUnevictable(page))
1450                md->active += nr_pages;
1451
1452        if (PageWriteback(page))
1453                md->writeback += nr_pages;
1454
1455        if (PageAnon(page))
1456                md->anon += nr_pages;
1457
1458        if (count > md->mapcount_max)
1459                md->mapcount_max = count;
1460
1461        md->node[page_to_nid(page)] += nr_pages;
1462}
1463
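/*
 * Return the page behind @pte if it should be counted: present, a normal
 * page, not reserved, and resident on a node that has memory.
 */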
1464static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
1465                unsigned long addr)
1466{
1467        struct page *page;
1468        int nid;
1469
1470        if (!pte_present(pte))
1471                return NULL;
1472
1473        page = vm_normal_page(vma, addr, pte);
1474        if (!page)
1475                return NULL;
1476
1477        if (PageReserved(page))
1478                return NULL;
1479
1480        nid = page_to_nid(page);
1481        if (!node_isset(nid, node_states[N_MEMORY]))
1482                return NULL;
1483
1484        return page;
1485}
1486
1487static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
1488                unsigned long end, struct mm_walk *walk)
1489{
1490        struct numa_maps *md;
1491        spinlock_t *ptl;
1492        pte_t *orig_pte;
1493        pte_t *pte;
1494
1495        md = walk->private;
1496
1497        if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) {
1498                pte_t huge_pte = *(pte_t *)pmd;
1499                struct page *page;
1500
1501                page = can_gather_numa_stats(huge_pte, md->vma, addr);
1502                if (page)
1503                        gather_stats(page, md, pte_dirty(huge_pte),
1504                                     HPAGE_PMD_SIZE/PAGE_SIZE);
1505                spin_unlock(ptl);
1506                return 0;
1507        }
1508
1509        if (pmd_trans_unstable(pmd))
1510                return 0;
1511        orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
1512        do {
1513                struct page *page = can_gather_numa_stats(*pte, md->vma, addr);
1514                if (!page)
1515                        continue;
1516                gather_stats(page, md, pte_dirty(*pte), 1);
1517
1518        } while (pte++, addr += PAGE_SIZE, addr != end);
1519        pte_unmap_unlock(orig_pte, ptl);
1520        return 0;
1521}
1522#ifdef CONFIG_HUGETLB_PAGE
 1523static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
1524                unsigned long addr, unsigned long end, struct mm_walk *walk)
1525{
1526        struct numa_maps *md;
1527        struct page *page;
1528
1529        if (!pte_present(*pte))
1530                return 0;
1531
1532        page = pte_page(*pte);
1533        if (!page)
1534                return 0;
1535
1536        md = walk->private;
1537        gather_stats(page, md, pte_dirty(*pte), 1);
1538        return 0;
1539}
1540
1541#else
 1542static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
1543                unsigned long addr, unsigned long end, struct mm_walk *walk)
1544{
1545        return 0;
1546}
1547#endif
1548
1549/*
1550 * Display pages allocated per node and memory policy via /proc.
1551 */
1552static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1553{
1554        struct numa_maps_private *numa_priv = m->private;
1555        struct proc_maps_private *proc_priv = &numa_priv->proc_maps;
1556        struct vm_area_struct *vma = v;
1557        struct numa_maps *md = &numa_priv->md;
1558        struct file *file = vma->vm_file;
1559        struct mm_struct *mm = vma->vm_mm;
1560        struct mm_walk walk = {};
1561        struct mempolicy *pol;
1562        int n;
1563        char buffer[50];
1564
1565        if (!mm)
1566                return 0;
1567
1568        /* Ensure we start with an empty set of numa_maps statistics. */
1569        memset(md, 0, sizeof(*md));
1570
1571        md->vma = vma;
1572
 1573        walk.hugetlb_entry = gather_hugetlb_stats;
1574        walk.pmd_entry = gather_pte_stats;
1575        walk.private = md;
1576        walk.mm = mm;
1577
1578        pol = __get_vma_policy(vma, vma->vm_start);
1579        if (pol) {
1580                mpol_to_str(buffer, sizeof(buffer), pol);
1581                mpol_cond_put(pol);
1582        } else {
1583                mpol_to_str(buffer, sizeof(buffer), proc_priv->task_mempolicy);
1584        }
1585
1586        seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1587
1588        if (file) {
 1589                seq_puts(m, " file=");
1590                seq_path(m, &file->f_path, "\n\t= ");
1591        } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
 1592                seq_puts(m, " heap");
1593        } else if (is_stack(proc_priv, vma, is_pid)) {
1594                seq_puts(m, " stack");
1595        }
1596
1597        if (is_vm_hugetlb_page(vma))
 1598                seq_puts(m, " huge");
1599
1600        walk_page_range(vma->vm_start, vma->vm_end, &walk);
1601
1602        if (!md->pages)
1603                goto out;
1604
1605        if (md->anon)
1606                seq_printf(m, " anon=%lu", md->anon);
1607
1608        if (md->dirty)
1609                seq_printf(m, " dirty=%lu", md->dirty);
1610
1611        if (md->pages != md->anon && md->pages != md->dirty)
1612                seq_printf(m, " mapped=%lu", md->pages);
1613
1614        if (md->mapcount_max > 1)
1615                seq_printf(m, " mapmax=%lu", md->mapcount_max);
1616
1617        if (md->swapcache)
1618                seq_printf(m, " swapcache=%lu", md->swapcache);
1619
1620        if (md->active < md->pages && !is_vm_hugetlb_page(vma))
1621                seq_printf(m, " active=%lu", md->active);
1622
1623        if (md->writeback)
1624                seq_printf(m, " writeback=%lu", md->writeback);
1625
1626        for_each_node_state(n, N_MEMORY)
1627                if (md->node[n])
1628                        seq_printf(m, " N%d=%lu", n, md->node[n]);
1629
1630        seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10);
1631out:
1632        seq_putc(m, '\n');
1633        m_cache_vma(m, vma);
1634        return 0;
1635}
1636
1637static int show_pid_numa_map(struct seq_file *m, void *v)
1638{
1639        return show_numa_map(m, v, 1);
1640}
1641
1642static int show_tid_numa_map(struct seq_file *m, void *v)
1643{
1644        return show_numa_map(m, v, 0);
1645}
1646
1647static const struct seq_operations proc_pid_numa_maps_op = {
1648        .start  = m_start,
1649        .next   = m_next,
1650        .stop   = m_stop,
1651        .show   = show_pid_numa_map,
1652};
1653
1654static const struct seq_operations proc_tid_numa_maps_op = {
1655        .start  = m_start,
1656        .next   = m_next,
1657        .stop   = m_stop,
1658        .show   = show_tid_numa_map,
1659};
1660
1661static int numa_maps_open(struct inode *inode, struct file *file,
1662                          const struct seq_operations *ops)
1663{
1664        return proc_maps_open(inode, file, ops,
1665                                sizeof(struct numa_maps_private));
1666}
1667
1668static int pid_numa_maps_open(struct inode *inode, struct file *file)
1669{
1670        return numa_maps_open(inode, file, &proc_pid_numa_maps_op);
1671}
1672
1673static int tid_numa_maps_open(struct inode *inode, struct file *file)
1674{
1675        return numa_maps_open(inode, file, &proc_tid_numa_maps_op);
1676}
1677
1678const struct file_operations proc_pid_numa_maps_operations = {
1679        .open           = pid_numa_maps_open,
1680        .read           = seq_read,
1681        .llseek         = seq_lseek,
1682        .release        = proc_map_release,
1683};
1684
1685const struct file_operations proc_tid_numa_maps_operations = {
1686        .open           = tid_numa_maps_open,
1687        .read           = seq_read,
1688        .llseek         = seq_lseek,
1689        .release        = proc_map_release,
1690};
1691#endif /* CONFIG_NUMA */
1692