linux/fs/proc/base.c
<<
>>
Prefs
   1/*
   2 *  linux/fs/proc/base.c
   3 *
   4 *  Copyright (C) 1991, 1992 Linus Torvalds
   5 *
   6 *  proc base directory handling functions
   7 *
   8 *  1999, Al Viro. Rewritten. Now it covers the whole per-process part.
   9 *  Instead of using magical inumbers to determine the kind of object
  10 *  we allocate and fill in-core inodes upon lookup. They don't even
  11 *  go into icache. We cache the reference to task_struct upon lookup too.
  12 *  Eventually it should become a filesystem in its own right. We don't use the
  13 *  rest of procfs anymore.
  14 *
  15 *
  16 *  Changelog:
  17 *  17-Jan-2005
  18 *  Allan Bezerra
  19 *  Bruna Moreira <bruna.moreira@indt.org.br>
  20 *  Edjard Mota <edjard.mota@indt.org.br>
  21 *  Ilias Biris <ilias.biris@indt.org.br>
  22 *  Mauricio Lin <mauricio.lin@indt.org.br>
  23 *
  24 *  Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
  25 *
  26 *  A new process specific entry (smaps) included in /proc. It shows the
  27 *  size of rss for each memory area. The maps entry lacks information
  28 *  about physical memory size (rss) for each mapped file, i.e.,
  29 *  rss information for executables and library files.
  30 *  This additional information is useful for any tools that need to know
  31 *  about physical memory consumption for a process specific library.
  32 *
  33 *  Changelog:
  34 *  21-Feb-2005
  35 *  Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
  36 *  Pud inclusion in the page table walking.
  37 *
  38 *  ChangeLog:
  39 *  10-Mar-2005
  40 *  10LE Instituto Nokia de Tecnologia - INdT:
  41 *  A better way to walk through the page table as suggested by Hugh Dickins.
  42 *
  43 *  Simo Piiroinen <simo.piiroinen@nokia.com>:
  44 *  Smaps information related to shared, private, clean and dirty pages.
  45 *
  46 *  Paul Mundt <paul.mundt@nokia.com>:
  47 *  Overall revision about smaps.
  48 */
  49
  50#include <asm/uaccess.h>
  51
  52#include <linux/errno.h>
  53#include <linux/time.h>
  54#include <linux/proc_fs.h>
  55#include <linux/stat.h>
  56#include <linux/init.h>
  57#include <linux/capability.h>
  58#include <linux/file.h>
  59#include <linux/string.h>
  60#include <linux/seq_file.h>
  61#include <linux/namei.h>
  62#include <linux/mnt_namespace.h>
  63#include <linux/mm.h>
  64#include <linux/rcupdate.h>
  65#include <linux/kallsyms.h>
  66#include <linux/resource.h>
  67#include <linux/module.h>
  68#include <linux/mount.h>
  69#include <linux/security.h>
  70#include <linux/ptrace.h>
  71#include <linux/cgroup.h>
  72#include <linux/cpuset.h>
  73#include <linux/audit.h>
  74#include <linux/poll.h>
  75#include <linux/nsproxy.h>
  76#include <linux/oom.h>
  77#include <linux/elf.h>
  78#include <linux/pid_namespace.h>
  79#include "internal.h"
  80
  81/* NOTE:
  82 *      Implementing inode permission operations in /proc is almost
  83 *      certainly an error.  Permission checks need to happen during
  84 *      each system call not at open time.  The reason is that most of
  85 *      what we wish to check for permissions in /proc varies at runtime.
  86 *
  87 *      The classic example of a problem is opening file descriptors
  88 *      in /proc for a task before it execs a suid executable.
  89 */
  90
  91
  92/* Worst case buffer size needed for holding an integer. */
  93#define PROC_NUMBUF 13
  94
  /*
   * Static description of one entry in a per-process /proc table.
   * Instances are built with the NOD() family of macros below:
   *   name/len - entry name and its length (NOD() uses sizeof(NAME) - 1)
   *   mode     - file type and permission bits
   *   iop/fop  - inode and file operations for the entry (either may be NULL)
   *   op       - per-entry callback; the LNK() and INF() macros fill its
   *              proc_get_link and proc_read members respectively
   */
  95struct pid_entry {
  96        char *name;
  97        int len;
  98        mode_t mode;
  99        const struct inode_operations *iop;
 100        const struct file_operations *fop;
 101        union proc_op op;
 102};
 103
 /*
  * Generic initializer for a struct pid_entry.  NAME must be a string
  * literal (or array): its length is computed with sizeof(NAME) - 1,
  * i.e. without the terminating NUL.
  */
 104#define NOD(NAME, MODE, IOP, FOP, OP) {                 \
 105        .name = (NAME),                                 \
 106        .len  = sizeof(NAME) - 1,                       \
 107        .mode = MODE,                                   \
 108        .iop  = IOP,                                    \
 109        .fop  = FOP,                                    \
 110        .op   = OP,                                     \
 111}
 112
 /*
  * DIR: directory entry.  Token-pastes OTYPE into the names
  * proc_<OTYPE>_inode_operations and proc_<OTYPE>_operations; the op
  * union is left empty.
  */
 113#define DIR(NAME, MODE, OTYPE)                                                  \
 114        NOD(NAME, (S_IFDIR|(MODE)),                                             \
 115                &proc_##OTYPE##_inode_operations, &proc_##OTYPE##_operations,   \
 116                {} )
 /*
  * LNK: symlink entry with mode S_IRWXUGO.  Uses the shared
  * proc_pid_link_inode_operations, no file operations, and stores
  * proc_<OTYPE>_link as the proc_get_link callback.
  */
 117#define LNK(NAME, OTYPE)                                        \
 118        NOD(NAME, (S_IFLNK|S_IRWXUGO),                          \
 119                &proc_pid_link_inode_operations, NULL,          \
 120                { .proc_get_link = &proc_##OTYPE##_link } )
 /*
  * REG: regular file entry served by its own proc_<OTYPE>_operations
  * file operations table; no inode operations and no op callback.
  */
 121#define REG(NAME, MODE, OTYPE)                          \
 122        NOD(NAME, (S_IFREG|(MODE)), NULL,               \
 123                &proc_##OTYPE##_operations, {})
 /*
  * INF: regular file entry served by the shared proc_info_file_operations;
  * proc_<OTYPE> is stored as the proc_read callback that fills the page.
  */
 124#define INF(NAME, MODE, OTYPE)                          \
 125        NOD(NAME, (S_IFREG|(MODE)),                     \
 126                NULL, &proc_info_file_operations,       \
 127                { .proc_read = &proc_##OTYPE } )
 128
 /*
  * Global flag, exported to modules.  It is not read anywhere in this
  * part of the file.
  * NOTE(review): presumably gates access to the per-process maps files;
  * confirm against its users.
  */
 129int maps_protect;
 130EXPORT_SYMBOL(maps_protect);
 131
 132static struct fs_struct *get_fs_struct(struct task_struct *task)
 133{
 134        struct fs_struct *fs;
 135        task_lock(task);
 136        fs = task->fs;
 137        if(fs)
 138                atomic_inc(&fs->count);
 139        task_unlock(task);
 140        return fs;
 141}
 142
 143static int get_nr_threads(struct task_struct *tsk)
 144{
 145        /* Must be called with the rcu_read_lock held */
 146        unsigned long flags;
 147        int count = 0;
 148
 149        if (lock_task_sighand(tsk, &flags)) {
 150                count = atomic_read(&tsk->signal->count);
 151                unlock_task_sighand(tsk, &flags);
 152        }
 153        return count;
 154}
 155
 156static int proc_cwd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
 157{
 158        struct task_struct *task = get_proc_task(inode);
 159        struct fs_struct *fs = NULL;
 160        int result = -ENOENT;
 161
 162        if (task) {
 163                fs = get_fs_struct(task);
 164                put_task_struct(task);
 165        }
 166        if (fs) {
 167                read_lock(&fs->lock);
 168                *mnt = mntget(fs->pwdmnt);
 169                *dentry = dget(fs->pwd);
 170                read_unlock(&fs->lock);
 171                result = 0;
 172                put_fs_struct(fs);
 173        }
 174        return result;
 175}
 176
 177static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
 178{
 179        struct task_struct *task = get_proc_task(inode);
 180        struct fs_struct *fs = NULL;
 181        int result = -ENOENT;
 182
 183        if (task) {
 184                fs = get_fs_struct(task);
 185                put_task_struct(task);
 186        }
 187        if (fs) {
 188                read_lock(&fs->lock);
 189                *mnt = mntget(fs->rootmnt);
 190                *dentry = dget(fs->root);
 191                read_unlock(&fs->lock);
 192                result = 0;
 193                put_fs_struct(fs);
 194        }
 195        return result;
 196}
 197
 /*
  * True when @task is the current task, or when it is a child of the
  * current task that is being ptraced, is stopped or traced, and the
  * security module permits the trace.
  *
  * The argument is parenthesized at every use so that an expression
  * argument expands correctly.  It is still evaluated more than once,
  * so do not pass expressions with side effects.
  */
 #define MAY_PTRACE(task) \
         ((task) == current || \
         ((task)->parent == current && \
         ((task)->ptrace & PT_PTRACED) && \
          ((task)->state == TASK_STOPPED || (task)->state == TASK_TRACED) && \
          security_ptrace(current, (task)) == 0))
 204
 /*
  * Get @task's mm for reading its mappings.
  *
  * On success returns the mm with a reference held (from get_task_mm())
  * and with mm->mmap_sem held for reading; the caller must release both.
  * Returns NULL, with nothing held, if the task has no mm, if task->mm
  * changed after the reference was taken, or if the mm is not the
  * caller's own and __ptrace_may_attach() refuses access.
  */
 205struct mm_struct *mm_for_maps(struct task_struct *task)
 206{
 207        struct mm_struct *mm = get_task_mm(task);
 208        if (!mm)
 209                return NULL;
 210        down_read(&mm->mmap_sem);
 211        task_lock(task);
            /* task->mm is re-checked under task_lock() against the mm we pinned */
 212        if (task->mm != mm)
 213                goto out;
 214        if (task->mm != current->mm && __ptrace_may_attach(task) < 0)
 215                goto out;
 216        task_unlock(task);
 217        return mm;
 218out:
 219        task_unlock(task);
 220        up_read(&mm->mmap_sem);
 221        mmput(mm);
 222        return NULL;
 223}
 224
 /*
  * Copy @task's command line into @buffer.
  *
  * Reads at most PAGE_SIZE bytes of the [arg_start, arg_end) area through
  * access_process_vm().  If the last byte read is not NUL and the area was
  * shorter than a page, the result is cut at the first NUL if there is one;
  * otherwise the environment area is appended (bounded so the total stays
  * within PAGE_SIZE) and the result is cut at the first NUL of the combined
  * data.
  *
  * Returns the number of bytes placed in @buffer, or 0 when the task has
  * no mm or mm->arg_end is still zero.
  */
 225static int proc_pid_cmdline(struct task_struct *task, char * buffer)
 226{
 227        int res = 0;
 228        unsigned int len;
 229        struct mm_struct *mm = get_task_mm(task);
 230        if (!mm)
 231                goto out;
 232        if (!mm->arg_end)
 233                goto out_mm;    /* Shh! No looking before we're done */
 234
 235        len = mm->arg_end - mm->arg_start;
 236 
 237        if (len > PAGE_SIZE)
 238                len = PAGE_SIZE;
 239 
 240        res = access_process_vm(task, mm->arg_start, buffer, len, 0);
 241
 242        // If the nul at the end of args has been overwritten, then
 243        // assume application is using setproctitle(3).
 244        if (res > 0 && buffer[res-1] != '\0' && len < PAGE_SIZE) {
 245                len = strnlen(buffer, res);
 246                if (len < res) {
                        /* a NUL exists inside the data: stop there */
 247                    res = len;
 248                } else {
                            /* no NUL at all: continue into the environment area */
 249                        len = mm->env_end - mm->env_start;
 250                        if (len > PAGE_SIZE - res)
 251                                len = PAGE_SIZE - res;
 252                        res += access_process_vm(task, mm->env_start, buffer+res, len, 0);
 253                        res = strnlen(buffer, res);
 254                }
 255        }
 256out_mm:
 257        mmput(mm);
 258out:
 259        return res;
 260}
 261
 262static int proc_pid_auxv(struct task_struct *task, char *buffer)
 263{
 264        int res = 0;
 265        struct mm_struct *mm = get_task_mm(task);
 266        if (mm) {
 267                unsigned int nwords = 0;
 268                do
 269                        nwords += 2;
 270                while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */
 271                res = nwords * sizeof(mm->saved_auxv[0]);
 272                if (res > PAGE_SIZE)
 273                        res = PAGE_SIZE;
 274                memcpy(buffer, mm->saved_auxv, res);
 275                mmput(mm);
 276        }
 277        return res;
 278}
 279
 280
 281#ifdef CONFIG_KALLSYMS
 282/*
 283 * Provides a wchan file via kallsyms in a proper one-value-per-file format.
 284 * Returns the resolved symbol.  If that fails, simply return the address.
 285 */
 286static int proc_pid_wchan(struct task_struct *task, char *buffer)
 287{
 288        unsigned long wchan;
 289        char symname[KSYM_NAME_LEN];
 290
 291        wchan = get_wchan(task);
 292
 293        if (lookup_symbol_name(wchan, symname) < 0)
 294                return sprintf(buffer, "%lu", wchan);
 295        else
 296                return sprintf(buffer, "%s", symname);
 297}
 298#endif /* CONFIG_KALLSYMS */
 299
 300#ifdef CONFIG_SCHEDSTATS
 301/*
 302 * Provides /proc/PID/schedstat
 303 */
 304static int proc_pid_schedstat(struct task_struct *task, char *buffer)
 305{
            /*
             * One line with three space-separated counters from
             * task->sched_info: cpu_time, run_delay and pcount.
             * Returns the number of characters written to @buffer.
             */
 306        return sprintf(buffer, "%llu %llu %lu\n",
 307                        task->sched_info.cpu_time,
 308                        task->sched_info.run_delay,
 309                        task->sched_info.pcount);
 310}
 311#endif
 312
 313/* The badness from the OOM killer */
 314unsigned long badness(struct task_struct *p, unsigned long uptime);
 315static int proc_oom_score(struct task_struct *task, char *buffer)
 316{
 317        unsigned long points;
 318        struct timespec uptime;
 319
 320        do_posix_clock_monotonic_gettime(&uptime);
 321        read_lock(&tasklist_lock);
 322        points = badness(task, uptime.tv_sec);
 323        read_unlock(&tasklist_lock);
 324        return sprintf(buffer, "%lu\n", points);
 325}
 326
 /*
  * Display strings for one resource limit: a human-readable name and the
  * unit its value is expressed in (NULL when the value has no unit).
  */
 struct limit_names {
         const char *name;
         const char *unit;
 };

 /*
  * Name/unit table indexed by RLIMIT_* resource number.
  *
  * RLIMIT_CPU is expressed in seconds, not milliseconds, so its unit
  * is reported as "seconds".
  */
 static const struct limit_names lnames[RLIM_NLIMITS] = {
         [RLIMIT_CPU] = {"Max cpu time", "seconds"},
         [RLIMIT_FSIZE] = {"Max file size", "bytes"},
         [RLIMIT_DATA] = {"Max data size", "bytes"},
         [RLIMIT_STACK] = {"Max stack size", "bytes"},
         [RLIMIT_CORE] = {"Max core file size", "bytes"},
         [RLIMIT_RSS] = {"Max resident set", "bytes"},
         [RLIMIT_NPROC] = {"Max processes", "processes"},
         [RLIMIT_NOFILE] = {"Max open files", "files"},
         [RLIMIT_MEMLOCK] = {"Max locked memory", "bytes"},
         [RLIMIT_AS] = {"Max address space", "bytes"},
         [RLIMIT_LOCKS] = {"Max file locks", "locks"},
         [RLIMIT_SIGPENDING] = {"Max pending signals", "signals"},
         [RLIMIT_MSGQUEUE] = {"Max msgqueue size", "bytes"},
         [RLIMIT_NICE] = {"Max nice priority", NULL},
         [RLIMIT_RTPRIO] = {"Max realtime priority", NULL},
 };
 349
 350/* Display limits for a process */
 351static int proc_pid_limits(struct task_struct *task, char *buffer)
 352{
 353        unsigned int i;
 354        int count = 0;
 355        unsigned long flags;
 356        char *bufptr = buffer;
 357
 358        struct rlimit rlim[RLIM_NLIMITS];
 359
 360        rcu_read_lock();
 361        if (!lock_task_sighand(task,&flags)) {
 362                rcu_read_unlock();
 363                return 0;
 364        }
 365        memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS);
 366        unlock_task_sighand(task, &flags);
 367        rcu_read_unlock();
 368
 369        /*
 370         * print the file header
 371         */
 372        count += sprintf(&bufptr[count], "%-25s %-20s %-20s %-10s\n",
 373                        "Limit", "Soft Limit", "Hard Limit", "Units");
 374
 375        for (i = 0; i < RLIM_NLIMITS; i++) {
 376                if (rlim[i].rlim_cur == RLIM_INFINITY)
 377                        count += sprintf(&bufptr[count], "%-25s %-20s ",
 378                                         lnames[i].name, "unlimited");
 379                else
 380                        count += sprintf(&bufptr[count], "%-25s %-20lu ",
 381                                         lnames[i].name, rlim[i].rlim_cur);
 382
 383                if (rlim[i].rlim_max == RLIM_INFINITY)
 384                        count += sprintf(&bufptr[count], "%-20s ", "unlimited");
 385                else
 386                        count += sprintf(&bufptr[count], "%-20lu ",
 387                                         rlim[i].rlim_max);
 388
 389                if (lnames[i].unit)
 390                        count += sprintf(&bufptr[count], "%-10s\n",
 391                                         lnames[i].unit);
 392                else
 393                        count += sprintf(&bufptr[count], "\n");
 394        }
 395
 396        return count;
 397}
 398
 399/************************************************************************/
 400/*                       Here the fs part begins                        */
 401/************************************************************************/
 402
 403/* permission checks */
 /*
  * Decide whether the caller may look at the file descriptors of the
  * task behind @inode.  Access is allowed when it is us, or when we may
  * use ptrace attach to the process and find out that information.
  *
  * Returns the result of ptrace_may_attach(), or 0 if the task is gone.
  */
 static int proc_fd_access_allowed(struct inode *inode)
 {
         struct task_struct *task = get_proc_task(inode);
         int allowed;

         if (!task)
                 return 0;

         allowed = ptrace_may_attach(task);
         put_task_struct(task);
         return allowed;
 }
 419
 420static int proc_setattr(struct dentry *dentry, struct iattr *attr)
 421{
 422        int error;
 423        struct inode *inode = dentry->d_inode;
 424
 425        if (attr->ia_valid & ATTR_MODE)
 426                return -EPERM;
 427
 428        error = inode_change_ok(inode, attr);
 429        if (!error)
 430                error = inode_setattr(inode, attr);
 431        return error;
 432}
 433
 /* Default inode operations: only setattr is provided (proc_setattr). */
 434static const struct inode_operations proc_def_inode_operations = {
 435        .setattr        = proc_setattr,
 436};
 437
 /* seq_file iterator for the mounts file; defined outside this file. */
 438extern struct seq_operations mounts_op;
 /*
  * Per-open state of the mounts file.  The seq_file is embedded as the
  * first member, so file->private_data (set to &p->m in mounts_open())
  * can be used both as a seq_file and as a proc_mounts pointer.
  * 'event' holds the namespace event value last seen by this opener
  * (compared against ns->event in mounts_poll()).
  */
 439struct proc_mounts {
 440        struct seq_file m;
 441        int event;
 442};
 443
 444static int mounts_open(struct inode *inode, struct file *file)
 445{
 446        struct task_struct *task = get_proc_task(inode);
 447        struct nsproxy *nsp;
 448        struct mnt_namespace *ns = NULL;
 449        struct proc_mounts *p;
 450        int ret = -EINVAL;
 451
 452        if (task) {
 453                rcu_read_lock();
 454                nsp = task_nsproxy(task);
 455                if (nsp) {
 456                        ns = nsp->mnt_ns;
 457                        if (ns)
 458                                get_mnt_ns(ns);
 459                }
 460                rcu_read_unlock();
 461
 462                put_task_struct(task);
 463        }
 464
 465        if (ns) {
 466                ret = -ENOMEM;
 467                p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL);
 468                if (p) {
 469                        file->private_data = &p->m;
 470                        ret = seq_open(file, &mounts_op);
 471                        if (!ret) {
 472                                p->m.private = ns;
 473                                p->event = ns->event;
 474                                return 0;
 475                        }
 476                        kfree(p);
 477                }
 478                put_mnt_ns(ns);
 479        }
 480        return ret;
 481}
 482
 483static int mounts_release(struct inode *inode, struct file *file)
 484{
 485        struct seq_file *m = file->private_data;
 486        struct mnt_namespace *ns = m->private;
 487        put_mnt_ns(ns);
 488        return seq_release(inode, file);
 489}
 490
 491static unsigned mounts_poll(struct file *file, poll_table *wait)
 492{
 493        struct proc_mounts *p = file->private_data;
 494        struct mnt_namespace *ns = p->m.private;
 495        unsigned res = 0;
 496
 497        poll_wait(file, &ns->poll, wait);
 498
 499        spin_lock(&vfsmount_lock);
 500        if (p->event != ns->event) {
 501                p->event = ns->event;
 502                res = POLLERR;
 503        }
 504        spin_unlock(&vfsmount_lock);
 505
 506        return res;
 507}
 508
 /*
  * File operations for the mounts file: a seq_file (seq_read/seq_lseek)
  * with custom open/release that pin and drop the mount namespace, plus
  * poll support for change notification.
  */
 509static const struct file_operations proc_mounts_operations = {
 510        .open           = mounts_open,
 511        .read           = seq_read,
 512        .llseek         = seq_lseek,
 513        .release        = mounts_release,
 514        .poll           = mounts_poll,
 515};
 516
 517extern struct seq_operations mountstats_op;
 518static int mountstats_open(struct inode *inode, struct file *file)
 519{
 520        int ret = seq_open(file, &mountstats_op);
 521
 522        if (!ret) {
 523                struct seq_file *m = file->private_data;
 524                struct nsproxy *nsp;
 525                struct mnt_namespace *mnt_ns = NULL;
 526                struct task_struct *task = get_proc_task(inode);
 527
 528                if (task) {
 529                        rcu_read_lock();
 530                        nsp = task_nsproxy(task);
 531                        if (nsp) {
 532                                mnt_ns = nsp->mnt_ns;
 533                                if (mnt_ns)
 534                                        get_mnt_ns(mnt_ns);
 535                        }
 536                        rcu_read_unlock();
 537
 538                        put_task_struct(task);
 539                }
 540
 541                if (mnt_ns)
 542                        m->private = mnt_ns;
 543                else {
 544                        seq_release(inode, file);
 545                        ret = -EINVAL;
 546                }
 547        }
 548        return ret;
 549}
 550
 /*
  * File operations for the mountstats file: a seq_file opened by
  * mountstats_open(); release is shared with the mounts file.
  */
 551static const struct file_operations proc_mountstats_operations = {
 552        .open           = mountstats_open,
 553        .read           = seq_read,
 554        .llseek         = seq_lseek,
 555        .release        = mounts_release,
 556};
 557
 558#define PROC_BLOCK_SIZE (3*1024)                /* 4K page size but our output routines use some slack for overruns */
 559
 560static ssize_t proc_info_read(struct file * file, char __user * buf,
 561                          size_t count, loff_t *ppos)
 562{
 563        struct inode * inode = file->f_path.dentry->d_inode;
 564        unsigned long page;
 565        ssize_t length;
 566        struct task_struct *task = get_proc_task(inode);
 567
 568        length = -ESRCH;
 569        if (!task)
 570                goto out_no_task;
 571
 572        if (count > PROC_BLOCK_SIZE)
 573                count = PROC_BLOCK_SIZE;
 574
 575        length = -ENOMEM;
 576        if (!(page = __get_free_page(GFP_TEMPORARY)))
 577                goto out;
 578
 579        length = PROC_I(inode)->op.proc_read(task, (char*)page);
 580
 581        if (length >= 0)
 582                length = simple_read_from_buffer(buf, count, ppos, (char *)page, length);
 583        free_page(page);
 584out:
 585        put_task_struct(task);
 586out_no_task:
 587        return length;
 588}
 589
 /* Read-only file operations shared by every INF() entry. */
 590static const struct file_operations proc_info_file_operations = {
 591        .read           = proc_info_read,
 592};
 593
 594static int mem_open(struct inode* inode, struct file* file)
 595{
 596        file->private_data = (void*)((long)current->self_exec_id);
 597        return 0;
 598}
 599
 /*
  * Read handler for the mem file: copy memory of the target task,
  * starting at address *ppos, to user space.
  *
  * Return values as the code stands:
  *   -ESRCH   the task is gone, or the MAY_PTRACE()/ptrace_may_attach()
  *            check before the copy fails (ret still holds its initial
  *            value at that point)
  *   -ENOMEM  no bounce page could be allocated
  *   0        the task has no mm
  *   -EIO     the reader's self_exec_id differs from the one saved by
  *            mem_open(), or the very first chunk cannot be read or fails
  *            the permission re-check
  *   -EFAULT  copy_to_user() failed (this replaces any partial count)
  *   > 0      number of bytes copied
  *
  * *ppos is advanced by the bytes successfully copied.
  */
 600static ssize_t mem_read(struct file * file, char __user * buf,
 601                        size_t count, loff_t *ppos)
 602{
 603        struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
 604        char *page;
 605        unsigned long src = *ppos;
 606        int ret = -ESRCH;
 607        struct mm_struct *mm;
 608
 609        if (!task)
 610                goto out_no_task;
 611
 612        if (!MAY_PTRACE(task) || !ptrace_may_attach(task))
 613                goto out;
 614
 615        ret = -ENOMEM;
 616        page = (char *)__get_free_page(GFP_TEMPORARY);
 617        if (!page)
 618                goto out;
 619
 620        ret = 0;
 621 
 622        mm = get_task_mm(task);
 623        if (!mm)
 624                goto out_free;
 625
 626        ret = -EIO;
 627 
 628        if (file->private_data != (void*)((long)current->self_exec_id))
 629                goto out_put;
 630
 631        ret = 0;
 632 
            /* transfer through the bounce page, at most PAGE_SIZE per round */
 633        while (count > 0) {
 634                int this_len, retval;
 635
 636                this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
 637                retval = access_process_vm(task, src, page, this_len, 0);
                    /* permissions are re-checked after every chunk, before it is handed out */
 638                if (!retval || !MAY_PTRACE(task) || !ptrace_may_attach(task)) {
 639                        if (!ret)
 640                                ret = -EIO;
 641                        break;
 642                }
 643
 644                if (copy_to_user(buf, page, retval)) {
 645                        ret = -EFAULT;
 646                        break;
 647                }
 648 
 649                ret += retval;
 650                src += retval;
 651                buf += retval;
 652                count -= retval;
 653        }
 654        *ppos = src;
 655
 656out_put:
 657        mmput(mm);
 658out_free:
 659        free_page((unsigned long) page);
 660out:
 661        put_task_struct(task);
 662out_no_task:
 663        return ret;
 664}
 665
 /*
  * Writing to the mem file is disabled: mem_write is defined as NULL, so
  * the #ifndef below removes the implementation from the build and the
  * .write slot of proc_mem_operations is NULL.
  *
  * The retained implementation copies user data, at most PAGE_SIZE per
  * round, through a bounce page into the target task with
  * access_process_vm(..., 1), starting at address *ppos.
  */
 666#define mem_write NULL
 667
 668#ifndef mem_write
 669/* This is a security hazard */
 670static ssize_t mem_write(struct file * file, const char __user *buf,
 671                         size_t count, loff_t *ppos)
 672{
 673        int copied;
 674        char *page;
 675        struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
 676        unsigned long dst = *ppos;
 677
 678        copied = -ESRCH;
 679        if (!task)
 680                goto out_no_task;
 681
 682        if (!MAY_PTRACE(task) || !ptrace_may_attach(task))
 683                goto out;
 684
 685        copied = -ENOMEM;
 686        page = (char *)__get_free_page(GFP_TEMPORARY);
 687        if (!page)
 688                goto out;
 689
 690        copied = 0;
 691        while (count > 0) {
 692                int this_len, retval;
 693
 694                this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
 695                if (copy_from_user(page, buf, this_len)) {
 696                        copied = -EFAULT;
 697                        break;
 698                }
 699                retval = access_process_vm(task, dst, page, this_len, 1);
 700                if (!retval) {
                            /* report -EIO only if nothing at all was written */
 701                        if (!copied)
 702                                copied = -EIO;
 703                        break;
 704                }
 705                copied += retval;
 706                buf += retval;
 707                dst += retval;
 708                count -= retval;                        
 709        }
 710        *ppos = dst;
 711        free_page((unsigned long) page);
 712out:
 713        put_task_struct(task);
 714out_no_task:
 715        return copied;
 716}
 717#endif
 718
 719static loff_t mem_lseek(struct file * file, loff_t offset, int orig)
 720{
 721        switch (orig) {
 722        case 0:
 723                file->f_pos = offset;
 724                break;
 725        case 1:
 726                file->f_pos += offset;
 727                break;
 728        default:
 729                return -EINVAL;
 730        }
 731        force_successful_syscall_return();
 732        return file->f_pos;
 733}
 734
 /*
  * File operations for the mem file.  .write is mem_write, which is
  * #defined to NULL above, so the file has no write handler.
  */
 735static const struct file_operations proc_mem_operations = {
 736        .llseek         = mem_lseek,
 737        .read           = mem_read,
 738        .write          = mem_write,
 739        .open           = mem_open,
 740};
 741
 742static ssize_t environ_read(struct file *file, char __user *buf,
 743                        size_t count, loff_t *ppos)
 744{
 745        struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
 746        char *page;
 747        unsigned long src = *ppos;
 748        int ret = -ESRCH;
 749        struct mm_struct *mm;
 750
 751        if (!task)
 752                goto out_no_task;
 753
 754        if (!ptrace_may_attach(task))
 755                goto out;
 756
 757        ret = -ENOMEM;
 758        page = (char *)__get_free_page(GFP_TEMPORARY);
 759        if (!page)
 760                goto out;
 761
 762        ret = 0;
 763
 764        mm = get_task_mm(task);
 765        if (!mm)
 766                goto out_free;
 767
 768        while (count > 0) {
 769                int this_len, retval, max_len;
 770
 771                this_len = mm->env_end - (mm->env_start + src);
 772
 773                if (this_len <= 0)
 774                        break;
 775
 776                max_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
 777                this_len = (this_len > max_len) ? max_len : this_len;
 778
 779                retval = access_process_vm(task, (mm->env_start + src),
 780                        page, this_len, 0);
 781
 782                if (retval <= 0) {
 783                        ret = retval;
 784                        break;
 785                }
 786
 787                if (copy_to_user(buf, page, retval)) {
 788                        ret = -EFAULT;
 789                        break;
 790                }
 791
 792                ret += retval;
 793                src += retval;
 794                buf += retval;
 795                count -= retval;
 796        }
 797        *ppos = src;
 798
 799        mmput(mm);
 800out_free:
 801        free_page((unsigned long) page);
 802out:
 803        put_task_struct(task);
 804out_no_task:
 805        return ret;
 806}
 807
 /* Read-only file operations for the environ file. */
 808static const struct file_operations proc_environ_operations = {
 809        .read           = environ_read,
 810};
 811
 812static ssize_t oom_adjust_read(struct file *file, char __user *buf,
 813                                size_t count, loff_t *ppos)
 814{
 815        struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
 816        char buffer[PROC_NUMBUF];
 817        size_t len;
 818        int oom_adjust;
 819
 820        if (!task)
 821                return -ESRCH;
 822        oom_adjust = task->oomkilladj;
 823        put_task_struct(task);
 824
 825        len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
 826
 827        return simple_read_from_buffer(buf, count, ppos, buffer, len);
 828}
 829
 830static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 831                                size_t count, loff_t *ppos)
 832{
 833        struct task_struct *task;
 834        char buffer[PROC_NUMBUF], *end;
 835        int oom_adjust;
 836
 837        memset(buffer, 0, sizeof(buffer));
 838        if (count > sizeof(buffer) - 1)
 839                count = sizeof(buffer) - 1;
 840        if (copy_from_user(buffer, buf, count))
 841                return -EFAULT;
 842        oom_adjust = simple_strtol(buffer, &end, 0);
 843        if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) &&
 844             oom_adjust != OOM_DISABLE)
 845                return -EINVAL;
 846        if (*end == '\n')
 847                end++;
 848        task = get_proc_task(file->f_path.dentry->d_inode);
 849        if (!task)
 850                return -ESRCH;
 851        if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) {
 852                put_task_struct(task);
 853                return -EACCES;
 854        }
 855        task->oomkilladj = oom_adjust;
 856        put_task_struct(task);
 857        if (end - buffer == 0)
 858                return -EIO;
 859        return end - buffer;
 860}
 861
 /* File operations for the OOM adjustment file (read and write). */
 862static const struct file_operations proc_oom_adjust_operations = {
 863        .read           = oom_adjust_read,
 864        .write          = oom_adjust_write,
 865};
 866
 867#ifdef CONFIG_MMU
 /*
  * Write handler for the clear_refs file.
  *
  * Parses an integer from at most PROC_NUMBUF - 1 bytes of user input; a
  * value of zero is rejected with -EINVAL.  For any other value
  * clear_refs_smap() is called on the target task's mm, if it has one.
  *
  * Returns the number of characters consumed (including one trailing
  * newline if present), -EFAULT if the user buffer cannot be read,
  * -EINVAL for a zero value, or -ESRCH if the task is gone.
  *
  * NOTE(review): the final "end - buffer == 0" check looks unreachable,
  * because input without digits presumably parses as 0 and is rejected
  * earlier - confirm against simple_strtol().
  */
 868static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 869                                size_t count, loff_t *ppos)
 870{
 871        struct task_struct *task;
 872        char buffer[PROC_NUMBUF], *end;
 873        struct mm_struct *mm;
 874
 875        memset(buffer, 0, sizeof(buffer));
 876        if (count > sizeof(buffer) - 1)
 877                count = sizeof(buffer) - 1;
 878        if (copy_from_user(buffer, buf, count))
 879                return -EFAULT;
 880        if (!simple_strtol(buffer, &end, 0))
 881                return -EINVAL;
 882        if (*end == '\n')
 883                end++;
 884        task = get_proc_task(file->f_path.dentry->d_inode);
 885        if (!task)
 886                return -ESRCH;
 887        mm = get_task_mm(task);
 888        if (mm) {
 889                clear_refs_smap(mm);
 890                mmput(mm);
 891        }
 892        put_task_struct(task);
 893        if (end - buffer == 0)
 894                return -EIO;
 895        return end - buffer;
 896}
 897
 898static struct file_operations proc_clear_refs_operations = {
 899        .write          = clear_refs_write,
 900};
 901#endif
 902
 903#ifdef CONFIG_AUDITSYSCALL
 904#define TMPBUFLEN 21
 905static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
 906                                  size_t count, loff_t *ppos)
 907{
 908        struct inode * inode = file->f_path.dentry->d_inode;
 909        struct task_struct *task = get_proc_task(inode);
 910        ssize_t length;
 911        char tmpbuf[TMPBUFLEN];
 912
 913        if (!task)
 914                return -ESRCH;
 915        length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
 916                                audit_get_loginuid(task->audit_context));
 917        put_task_struct(task);
 918        return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
 919}
 920
 /*
  * Write handler for the loginuid file: set the audit login uid of the
  * current task.
  *
  * The caller needs CAP_AUDIT_CONTROL and must be the task the file
  * belongs to; otherwise -EPERM is returned.  Only writes at offset 0
  * are accepted (-EINVAL otherwise).  At most PAGE_SIZE - 1 bytes of
  * input are parsed, as a base-10 unsigned number.
  *
  * Returns @count (after clamping) on success, -ENOMEM if no page is
  * available, -EFAULT if the user buffer cannot be read, -EINVAL if the
  * input does not start with a number, or the error returned by
  * audit_set_loginuid().
  */
 921static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
 922                                   size_t count, loff_t *ppos)
 923{
 924        struct inode * inode = file->f_path.dentry->d_inode;
 925        char *page, *tmp;
 926        ssize_t length;
 927        uid_t loginuid;
 928
 929        if (!capable(CAP_AUDIT_CONTROL))
 930                return -EPERM;
 931
 932        if (current != pid_task(proc_pid(inode), PIDTYPE_PID))
 933                return -EPERM;
 934
            /* leave room for the terminating NUL written below */
 935        if (count >= PAGE_SIZE)
 936                count = PAGE_SIZE - 1;
 937
 938        if (*ppos != 0) {
 939                /* No partial writes. */
 940                return -EINVAL;
 941        }
 942        page = (char*)__get_free_page(GFP_TEMPORARY);
 943        if (!page)
 944                return -ENOMEM;
 945        length = -EFAULT;
 946        if (copy_from_user(page, buf, count))
 947                goto out_free_page;
 948
 949        page[count] = '\0';
 950        loginuid = simple_strtoul(page, &tmp, 10);
            /* tmp == page means no digits were consumed */
 951        if (tmp == page) {
 952                length = -EINVAL;
 953                goto out_free_page;
 954
 955        }
 956        length = audit_set_loginuid(current, loginuid);
 957        if (likely(length == 0))
 958                length = count;
 959
 960out_free_page:
 961        free_page((unsigned long) page);
 962        return length;
 963}
 964
 965static const struct file_operations proc_loginuid_operations = {
 966        .read           = proc_loginuid_read,
 967        .write          = proc_loginuid_write,
 968};
 969#endif
 970
 971#ifdef CONFIG_FAULT_INJECTION
 972static ssize_t proc_fault_inject_read(struct file * file, char __user * buf,
 973                                      size_t count, loff_t *ppos)
 974{
 975        struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
 976        char buffer[PROC_NUMBUF];
 977        size_t len;
 978        int make_it_fail;
 979
 980        if (!task)
 981                return -ESRCH;
 982        make_it_fail = task->make_it_fail;
 983        put_task_struct(task);
 984
 985        len = snprintf(buffer, sizeof(buffer), "%i\n", make_it_fail);
 986
 987        return simple_read_from_buffer(buf, count, ppos, buffer, len);
 988}
 989
 990static ssize_t proc_fault_inject_write(struct file * file,
 991                        const char __user * buf, size_t count, loff_t *ppos)
 992{
 993        struct task_struct *task;
 994        char buffer[PROC_NUMBUF], *end;
 995        int make_it_fail;
 996
 997        if (!capable(CAP_SYS_RESOURCE))
 998                return -EPERM;
 999        memset(buffer, 0, sizeof(buffer));
1000        if (count > sizeof(buffer) - 1)
1001                count = sizeof(buffer) - 1;
1002        if (copy_from_user(buffer, buf, count))
1003                return -EFAULT;
1004        make_it_fail = simple_strtol(buffer, &end, 0);
1005        if (*end == '\n')
1006                end++;
1007        task = get_proc_task(file->f_dentry->d_inode);
1008        if (!task)
1009                return -ESRCH;
1010        task->make_it_fail = make_it_fail;
1011        put_task_struct(task);
1012        if (end - buffer == 0)
1013                return -EIO;
1014        return end - buffer;
1015}
1016
1017static const struct file_operations proc_fault_inject_operations = {
1018        .read           = proc_fault_inject_read,
1019        .write          = proc_fault_inject_write,
1020};
1021#endif
1022
1023#ifdef CONFIG_SCHED_DEBUG
1024/*
1025 * Print out various scheduling related per-task fields:
1026 */
1027static int sched_show(struct seq_file *m, void *v)
1028{
1029        struct inode *inode = m->private;
1030        struct task_struct *p;
1031
1032        WARN_ON(!inode);
1033
1034        p = get_proc_task(inode);
1035        if (!p)
1036                return -ESRCH;
1037        proc_sched_show_task(p, m);
1038
1039        put_task_struct(p);
1040
1041        return 0;
1042}
1043
1044static ssize_t
1045sched_write(struct file *file, const char __user *buf,
1046            size_t count, loff_t *offset)
1047{
1048        struct inode *inode = file->f_path.dentry->d_inode;
1049        struct task_struct *p;
1050
1051        WARN_ON(!inode);
1052
1053        p = get_proc_task(inode);
1054        if (!p)
1055                return -ESRCH;
1056        proc_sched_set_task(p);
1057
1058        put_task_struct(p);
1059
1060        return count;
1061}
1062
1063static int sched_open(struct inode *inode, struct file *filp)
1064{
1065        int ret;
1066
1067        ret = single_open(filp, sched_show, NULL);
1068        if (!ret) {
1069                struct seq_file *m = filp->private_data;
1070
1071                m->private = inode;
1072        }
1073        return ret;
1074}
1075
1076static const struct file_operations proc_pid_sched_operations = {
1077        .open           = sched_open,
1078        .read           = seq_read,
1079        .write          = sched_write,
1080        .llseek         = seq_lseek,
1081        .release        = single_release,
1082};
1083
1084#endif
1085
1086static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
1087{
1088        struct inode *inode = dentry->d_inode;
1089        int error = -EACCES;
1090
1091        /* We don't need a base pointer in the /proc filesystem */
1092        path_release(nd);
1093
1094        /* Are we allowed to snoop on the tasks file descriptors? */
1095        if (!proc_fd_access_allowed(inode))
1096                goto out;
1097
1098        error = PROC_I(inode)->op.proc_get_link(inode, &nd->dentry, &nd->mnt);
1099        nd->last_type = LAST_BIND;
1100out:
1101        return ERR_PTR(error);
1102}
1103
1104static int do_proc_readlink(struct dentry *dentry, struct vfsmount *mnt,
1105                            char __user *buffer, int buflen)
1106{
1107        struct inode * inode;
1108        char *tmp = (char*)__get_free_page(GFP_TEMPORARY);
1109        char *path;
1110        int len;
1111
1112        if (!tmp)
1113                return -ENOMEM;
1114
1115        inode = dentry->d_inode;
1116        path = d_path(dentry, mnt, tmp, PAGE_SIZE);
1117        len = PTR_ERR(path);
1118        if (IS_ERR(path))
1119                goto out;
1120        len = tmp + PAGE_SIZE - 1 - path;
1121
1122        if (len > buflen)
1123                len = buflen;
1124        if (copy_to_user(buffer, path, len))
1125                len = -EFAULT;
1126 out:
1127        free_page((unsigned long)tmp);
1128        return len;
1129}
1130
1131static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen)
1132{
1133        int error = -EACCES;
1134        struct inode *inode = dentry->d_inode;
1135        struct dentry *de;
1136        struct vfsmount *mnt = NULL;
1137
1138        /* Are we allowed to snoop on the tasks file descriptors? */
1139        if (!proc_fd_access_allowed(inode))
1140                goto out;
1141
1142        error = PROC_I(inode)->op.proc_get_link(inode, &de, &mnt);
1143        if (error)
1144                goto out;
1145
1146        error = do_proc_readlink(de, mnt, buffer, buflen);
1147        dput(de);
1148        mntput(mnt);
1149out:
1150        return error;
1151}
1152
1153static const struct inode_operations proc_pid_link_inode_operations = {
1154        .readlink       = proc_pid_readlink,
1155        .follow_link    = proc_pid_follow_link,
1156        .setattr        = proc_setattr,
1157};
1158
1159
1160/* building an inode */
1161
1162static int task_dumpable(struct task_struct *task)
1163{
1164        int dumpable = 0;
1165        struct mm_struct *mm;
1166
1167        task_lock(task);
1168        mm = task->mm;
1169        if (mm)
1170                dumpable = get_dumpable(mm);
1171        task_unlock(task);
1172        if(dumpable == 1)
1173                return 1;
1174        return 0;
1175}
1176
1177
1178static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task)
1179{
1180        struct inode * inode;
1181        struct proc_inode *ei;
1182
1183        /* We need a new inode */
1184
1185        inode = new_inode(sb);
1186        if (!inode)
1187                goto out;
1188
1189        /* Common stuff */
1190        ei = PROC_I(inode);
1191        inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
1192        inode->i_op = &proc_def_inode_operations;
1193
1194        /*
1195         * grab the reference to task.
1196         */
1197        ei->pid = get_task_pid(task, PIDTYPE_PID);
1198        if (!ei->pid)
1199                goto out_unlock;
1200
1201        inode->i_uid = 0;
1202        inode->i_gid = 0;
1203        if (task_dumpable(task)) {
1204                inode->i_uid = task->euid;
1205                inode->i_gid = task->egid;
1206        }
1207        security_task_to_inode(task, inode);
1208
1209out:
1210        return inode;
1211
1212out_unlock:
1213        iput(inode);
1214        return NULL;
1215}
1216
1217static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
1218{
1219        struct inode *inode = dentry->d_inode;
1220        struct task_struct *task;
1221        generic_fillattr(inode, stat);
1222
1223        rcu_read_lock();
1224        stat->uid = 0;
1225        stat->gid = 0;
1226        task = pid_task(proc_pid(inode), PIDTYPE_PID);
1227        if (task) {
1228                if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
1229                    task_dumpable(task)) {
1230                        stat->uid = task->euid;
1231                        stat->gid = task->egid;
1232                }
1233        }
1234        rcu_read_unlock();
1235        return 0;
1236}
1237
1238/* dentry stuff */
1239
1240/*
1241 *      Exceptional case: normally we are not allowed to unhash a busy
1242 * directory. In this case, however, we can do it - no aliasing problems
1243 * due to the way we treat inodes.
1244 *
1245 * Rewrite the inode's ownerships here because the owning task may have
1246 * performed a setuid(), etc.
1247 *
1248 * Before the /proc/pid/status file was created the only way to read
1249 * the effective uid of a /process was to stat /proc/pid.  Reading
1250 * /proc/pid/status is slow enough that procps and other packages
1251 * kept stating /proc/pid.  To keep the rules in /proc simple I have
1252 * made this apply to all per process world readable and executable
1253 * directories.
1254 */
1255static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
1256{
1257        struct inode *inode = dentry->d_inode;
1258        struct task_struct *task = get_proc_task(inode);
1259        if (task) {
1260                if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
1261                    task_dumpable(task)) {
1262                        inode->i_uid = task->euid;
1263                        inode->i_gid = task->egid;
1264                } else {
1265                        inode->i_uid = 0;
1266                        inode->i_gid = 0;
1267                }
1268                inode->i_mode &= ~(S_ISUID | S_ISGID);
1269                security_task_to_inode(task, inode);
1270                put_task_struct(task);
1271                return 1;
1272        }
1273        d_drop(dentry);
1274        return 0;
1275}
1276
1277static int pid_delete_dentry(struct dentry * dentry)
1278{
1279        /* Is the task we represent dead?
1280         * If so, then don't put the dentry on the lru list,
1281         * kill it immediately.
1282         */
1283        return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;
1284}
1285
1286static struct dentry_operations pid_dentry_operations =
1287{
1288        .d_revalidate   = pid_revalidate,
1289        .d_delete       = pid_delete_dentry,
1290};
1291
1292/* Lookups */
1293
/*
 * Callback type used to create the inode behind a freshly allocated
 * dentry.  The implementations in this file return NULL on success and
 * an ERR_PTR() value on failure; ptr carries entry-specific data.
 */
typedef struct dentry *instantiate_t(struct inode *dir, struct dentry *dentry,
				struct task_struct *task, const void *ptr);
1296
1297/*
1298 * Fill a directory entry.
1299 *
1300 * If possible create the dcache entry and derive our inode number and
1301 * file type from dcache entry.
1302 *
1303 * Since all of the proc inode numbers are dynamically generated, the inode
1304 * numbers do not exist until the inode is cache.  This means creating the
1305 * the dcache entry in readdir is necessary to keep the inode numbers
1306 * reported by readdir in sync with the inode numbers reported
1307 * by stat.
1308 */
1309static int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
1310        char *name, int len,
1311        instantiate_t instantiate, struct task_struct *task, const void *ptr)
1312{
1313        struct dentry *child, *dir = filp->f_path.dentry;
1314        struct inode *inode;
1315        struct qstr qname;
1316        ino_t ino = 0;
1317        unsigned type = DT_UNKNOWN;
1318
1319        qname.name = name;
1320        qname.len  = len;
1321        qname.hash = full_name_hash(name, len);
1322
1323        child = d_lookup(dir, &qname);
1324        if (!child) {
1325                struct dentry *new;
1326                new = d_alloc(dir, &qname);
1327                if (new) {
1328                        child = instantiate(dir->d_inode, new, task, ptr);
1329                        if (child)
1330                                dput(new);
1331                        else
1332                                child = new;
1333                }
1334        }
1335        if (!child || IS_ERR(child) || !child->d_inode)
1336                goto end_instantiate;
1337        inode = child->d_inode;
1338        if (inode) {
1339                ino = inode->i_ino;
1340                type = inode->i_mode >> 12;
1341        }
1342        dput(child);
1343end_instantiate:
1344        if (!ino)
1345                ino = find_inode_number(dir, &qname);
1346        if (!ino)
1347                ino = 1;
1348        return filldir(dirent, name, len, filp->f_pos, ino, type);
1349}
1350
1351static unsigned name_to_int(struct dentry *dentry)
1352{
1353        const char *name = dentry->d_name.name;
1354        int len = dentry->d_name.len;
1355        unsigned n = 0;
1356
1357        if (len > 1 && *name == '0')
1358                goto out;
1359        while (len-- > 0) {
1360                unsigned c = *name++ - '0';
1361                if (c > 9)
1362                        goto out;
1363                if (n >= (~0U-9)/10)
1364                        goto out;
1365                n *= 10;
1366                n += c;
1367        }
1368        return n;
1369out:
1370        return ~0U;
1371}
1372
1373#define PROC_FDINFO_MAX 64
1374
1375static int proc_fd_info(struct inode *inode, struct dentry **dentry,
1376                        struct vfsmount **mnt, char *info)
1377{
1378        struct task_struct *task = get_proc_task(inode);
1379        struct files_struct *files = NULL;
1380        struct file *file;
1381        int fd = proc_fd(inode);
1382
1383        if (task) {
1384                files = get_files_struct(task);
1385                put_task_struct(task);
1386        }
1387        if (files) {
1388                /*
1389                 * We are not taking a ref to the file structure, so we must
1390                 * hold ->file_lock.
1391                 */
1392                spin_lock(&files->file_lock);
1393                file = fcheck_files(files, fd);
1394                if (file) {
1395                        if (mnt)
1396                                *mnt = mntget(file->f_path.mnt);
1397                        if (dentry)
1398                                *dentry = dget(file->f_path.dentry);
1399                        if (info)
1400                                snprintf(info, PROC_FDINFO_MAX,
1401                                         "pos:\t%lli\n"
1402                                         "flags:\t0%o\n",
1403                                         (long long) file->f_pos,
1404                                         file->f_flags);
1405                        spin_unlock(&files->file_lock);
1406                        put_files_struct(files);
1407                        return 0;
1408                }
1409                spin_unlock(&files->file_lock);
1410                put_files_struct(files);
1411        }
1412        return -ENOENT;
1413}
1414
1415static int proc_fd_link(struct inode *inode, struct dentry **dentry,
1416                        struct vfsmount **mnt)
1417{
1418        return proc_fd_info(inode, dentry, mnt, NULL);
1419}
1420
1421static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
1422{
1423        struct inode *inode = dentry->d_inode;
1424        struct task_struct *task = get_proc_task(inode);
1425        int fd = proc_fd(inode);
1426        struct files_struct *files;
1427
1428        if (task) {
1429                files = get_files_struct(task);
1430                if (files) {
1431                        rcu_read_lock();
1432                        if (fcheck_files(files, fd)) {
1433                                rcu_read_unlock();
1434                                put_files_struct(files);
1435                                if (task_dumpable(task)) {
1436                                        inode->i_uid = task->euid;
1437                                        inode->i_gid = task->egid;
1438                                } else {
1439                                        inode->i_uid = 0;
1440                                        inode->i_gid = 0;
1441                                }
1442                                inode->i_mode &= ~(S_ISUID | S_ISGID);
1443                                security_task_to_inode(task, inode);
1444                                put_task_struct(task);
1445                                return 1;
1446                        }
1447                        rcu_read_unlock();
1448                        put_files_struct(files);
1449                }
1450                put_task_struct(task);
1451        }
1452        d_drop(dentry);
1453        return 0;
1454}
1455
1456static struct dentry_operations tid_fd_dentry_operations =
1457{
1458        .d_revalidate   = tid_fd_revalidate,
1459        .d_delete       = pid_delete_dentry,
1460};
1461
1462static struct dentry *proc_fd_instantiate(struct inode *dir,
1463        struct dentry *dentry, struct task_struct *task, const void *ptr)
1464{
1465        unsigned fd = *(const unsigned *)ptr;
1466        struct file *file;
1467        struct files_struct *files;
1468        struct inode *inode;
1469        struct proc_inode *ei;
1470        struct dentry *error = ERR_PTR(-ENOENT);
1471
1472        inode = proc_pid_make_inode(dir->i_sb, task);
1473        if (!inode)
1474                goto out;
1475        ei = PROC_I(inode);
1476        ei->fd = fd;
1477        files = get_files_struct(task);
1478        if (!files)
1479                goto out_iput;
1480        inode->i_mode = S_IFLNK;
1481
1482        /*
1483         * We are not taking a ref to the file structure, so we must
1484         * hold ->file_lock.
1485         */
1486        spin_lock(&files->file_lock);
1487        file = fcheck_files(files, fd);
1488        if (!file)
1489                goto out_unlock;
1490        if (file->f_mode & 1)
1491                inode->i_mode |= S_IRUSR | S_IXUSR;
1492        if (file->f_mode & 2)
1493                inode->i_mode |= S_IWUSR | S_IXUSR;
1494        spin_unlock(&files->file_lock);
1495        put_files_struct(files);
1496
1497        inode->i_op = &proc_pid_link_inode_operations;
1498        inode->i_size = 64;
1499        ei->op.proc_get_link = proc_fd_link;
1500        dentry->d_op = &tid_fd_dentry_operations;
1501        d_add(dentry, inode);
1502        /* Close the race of the process dying before we return the dentry */
1503        if (tid_fd_revalidate(dentry, NULL))
1504                error = NULL;
1505
1506 out:
1507        return error;
1508out_unlock:
1509        spin_unlock(&files->file_lock);
1510        put_files_struct(files);
1511out_iput:
1512        iput(inode);
1513        goto out;
1514}
1515
1516static struct dentry *proc_lookupfd_common(struct inode *dir,
1517                                           struct dentry *dentry,
1518                                           instantiate_t instantiate)
1519{
1520        struct task_struct *task = get_proc_task(dir);
1521        unsigned fd = name_to_int(dentry);
1522        struct dentry *result = ERR_PTR(-ENOENT);
1523
1524        if (!task)
1525                goto out_no_task;
1526        if (fd == ~0U)
1527                goto out;
1528
1529        result = instantiate(dir, dentry, task, &fd);
1530out:
1531        put_task_struct(task);
1532out_no_task:
1533        return result;
1534}
1535
1536static int proc_readfd_common(struct file * filp, void * dirent,
1537                              filldir_t filldir, instantiate_t instantiate)
1538{
1539        struct dentry *dentry = filp->f_path.dentry;
1540        struct inode *inode = dentry->d_inode;
1541        struct task_struct *p = get_proc_task(inode);
1542        unsigned int fd, ino;
1543        int retval;
1544        struct files_struct * files;
1545        struct fdtable *fdt;
1546
1547        retval = -ENOENT;
1548        if (!p)
1549                goto out_no_task;
1550        retval = 0;
1551
1552        fd = filp->f_pos;
1553        switch (fd) {
1554                case 0:
1555                        if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
1556                                goto out;
1557                        filp->f_pos++;
1558                case 1:
1559                        ino = parent_ino(dentry);
1560                        if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
1561                                goto out;
1562                        filp->f_pos++;
1563                default:
1564                        files = get_files_struct(p);
1565                        if (!files)
1566                                goto out;
1567                        rcu_read_lock();
1568                        fdt = files_fdtable(files);
1569                        for (fd = filp->f_pos-2;
1570                             fd < fdt->max_fds;
1571                             fd++, filp->f_pos++) {
1572                                char name[PROC_NUMBUF];
1573                                int len;
1574
1575                                if (!fcheck_files(files, fd))
1576                                        continue;
1577                                rcu_read_unlock();
1578
1579                                len = snprintf(name, sizeof(name), "%d", fd);
1580                                if (proc_fill_cache(filp, dirent, filldir,
1581                                                    name, len, instantiate,
1582                                                    p, &fd) < 0) {
1583                                        rcu_read_lock();
1584                                        break;
1585                                }
1586                                rcu_read_lock();
1587                        }
1588                        rcu_read_unlock();
1589                        put_files_struct(files);
1590        }
1591out:
1592        put_task_struct(p);
1593out_no_task:
1594        return retval;
1595}
1596
1597static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry,
1598                                    struct nameidata *nd)
1599{
1600        return proc_lookupfd_common(dir, dentry, proc_fd_instantiate);
1601}
1602
1603static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir)
1604{
1605        return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate);
1606}
1607
1608static ssize_t proc_fdinfo_read(struct file *file, char __user *buf,
1609                                      size_t len, loff_t *ppos)
1610{
1611        char tmp[PROC_FDINFO_MAX];
1612        int err = proc_fd_info(file->f_path.dentry->d_inode, NULL, NULL, tmp);
1613        if (!err)
1614                err = simple_read_from_buffer(buf, len, ppos, tmp, strlen(tmp));
1615        return err;
1616}
1617
1618static const struct file_operations proc_fdinfo_file_operations = {
1619        .open           = nonseekable_open,
1620        .read           = proc_fdinfo_read,
1621};
1622
1623static const struct file_operations proc_fd_operations = {
1624        .read           = generic_read_dir,
1625        .readdir        = proc_readfd,
1626};
1627
1628/*
1629 * /proc/pid/fd needs a special permission handler so that a process can still
1630 * access /proc/self/fd after it has executed a setuid().
1631 */
1632static int proc_fd_permission(struct inode *inode, int mask,
1633                                struct nameidata *nd)
1634{
1635        int rv;
1636
1637        rv = generic_permission(inode, mask, NULL);
1638        if (rv == 0)
1639                return 0;
1640        if (task_pid(current) == proc_pid(inode))
1641                rv = 0;
1642        return rv;
1643}
1644
1645/*
1646 * proc directories can do almost nothing..
1647 */
1648static const struct inode_operations proc_fd_inode_operations = {
1649        .lookup         = proc_lookupfd,
1650        .permission     = proc_fd_permission,
1651        .setattr        = proc_setattr,
1652};
1653
1654static struct dentry *proc_fdinfo_instantiate(struct inode *dir,
1655        struct dentry *dentry, struct task_struct *task, const void *ptr)
1656{
1657        unsigned fd = *(unsigned *)ptr;
1658        struct inode *inode;
1659        struct proc_inode *ei;
1660        struct dentry *error = ERR_PTR(-ENOENT);
1661
1662        inode = proc_pid_make_inode(dir->i_sb, task);
1663        if (!inode)
1664                goto out;
1665        ei = PROC_I(inode);
1666        ei->fd = fd;
1667        inode->i_mode = S_IFREG | S_IRUSR;
1668        inode->i_fop = &proc_fdinfo_file_operations;
1669        dentry->d_op = &tid_fd_dentry_operations;
1670        d_add(dentry, inode);
1671        /* Close the race of the process dying before we return the dentry */
1672        if (tid_fd_revalidate(dentry, NULL))
1673                error = NULL;
1674
1675 out:
1676        return error;
1677}
1678
1679static struct dentry *proc_lookupfdinfo(struct inode *dir,
1680                                        struct dentry *dentry,
1681                                        struct nameidata *nd)
1682{
1683        return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate);
1684}
1685
1686static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir)
1687{
1688        return proc_readfd_common(filp, dirent, filldir,
1689                                  proc_fdinfo_instantiate);
1690}
1691
1692static const struct file_operations proc_fdinfo_operations = {
1693        .read           = generic_read_dir,
1694        .readdir        = proc_readfdinfo,
1695};
1696
1697/*
1698 * proc directories can do almost nothing..
1699 */
1700static const struct inode_operations proc_fdinfo_inode_operations = {
1701        .lookup         = proc_lookupfdinfo,
1702        .setattr        = proc_setattr,
1703};
1704
1705
1706static struct dentry *proc_pident_instantiate(struct inode *dir,
1707        struct dentry *dentry, struct task_struct *task, const void *ptr)
1708{
1709        const struct pid_entry *p = ptr;
1710        struct inode *inode;
1711        struct proc_inode *ei;
1712        struct dentry *error = ERR_PTR(-EINVAL);
1713
1714        inode = proc_pid_make_inode(dir->i_sb, task);
1715        if (!inode)
1716                goto out;
1717
1718        ei = PROC_I(inode);
1719        inode->i_mode = p->mode;
1720        if (S_ISDIR(inode->i_mode))
1721                inode->i_nlink = 2;     /* Use getattr to fix if necessary */
1722        if (p->iop)
1723                inode->i_op = p->iop;
1724        if (p->fop)
1725                inode->i_fop = p->fop;
1726        ei->op = p->op;
1727        dentry->d_op = &pid_dentry_operations;
1728        d_add(dentry, inode);
1729        /* Close the race of the process dying before we return the dentry */
1730        if (pid_revalidate(dentry, NULL))
1731                error = NULL;
1732out:
1733        return error;
1734}
1735
1736static struct dentry *proc_pident_lookup(struct inode *dir, 
1737                                         struct dentry *dentry,
1738                                         const struct pid_entry *ents,
1739                                         unsigned int nents)
1740{
1741        struct inode *inode;
1742        struct dentry *error;
1743        struct task_struct *task = get_proc_task(dir);
1744        const struct pid_entry *p, *last;
1745
1746        error = ERR_PTR(-ENOENT);
1747        inode = NULL;
1748
1749        if (!task)
1750                goto out_no_task;
1751
1752        /*
1753         * Yes, it does not scale. And it should not. Don't add
1754         * new entries into /proc/<tgid>/ without very good reasons.
1755         */
1756        last = &ents[nents - 1];
1757        for (p = ents; p <= last; p++) {
1758                if (p->len != dentry->d_name.len)
1759                        continue;
1760                if (!memcmp(dentry->d_name.name, p->name, p->len))
1761                        break;
1762        }
1763        if (p > last)
1764                goto out;
1765
1766        error = proc_pident_instantiate(dir, dentry, task, p);
1767out:
1768        put_task_struct(task);
1769out_no_task:
1770        return error;
1771}
1772
1773static int proc_pident_fill_cache(struct file *filp, void *dirent,
1774        filldir_t filldir, struct task_struct *task, const struct pid_entry *p)
1775{
1776        return proc_fill_cache(filp, dirent, filldir, p->name, p->len,
1777                                proc_pident_instantiate, task, p);
1778}
1779
1780static int proc_pident_readdir(struct file *filp,
1781                void *dirent, filldir_t filldir,
1782                const struct pid_entry *ents, unsigned int nents)
1783{
1784        int i;
1785        struct dentry *dentry = filp->f_path.dentry;
1786        struct inode *inode = dentry->d_inode;
1787        struct task_struct *task = get_proc_task(inode);
1788        const struct pid_entry *p, *last;
1789        ino_t ino;
1790        int ret;
1791
1792        ret = -ENOENT;
1793        if (!task)
1794                goto out_no_task;
1795
1796        ret = 0;
1797        i = filp->f_pos;
1798        switch (i) {
1799        case 0:
1800                ino = inode->i_ino;
1801                if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
1802                        goto out;
1803                i++;
1804                filp->f_pos++;
1805                /* fall through */
1806        case 1:
1807                ino = parent_ino(dentry);
1808                if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
1809                        goto out;
1810                i++;
1811                filp->f_pos++;
1812                /* fall through */
1813        default:
1814                i -= 2;
1815                if (i >= nents) {
1816                        ret = 1;
1817                        goto out;
1818                }
1819                p = ents + i;
1820                last = &ents[nents - 1];
1821                while (p <= last) {
1822                        if (proc_pident_fill_cache(filp, dirent, filldir, task, p) < 0)
1823                                goto out;
1824                        filp->f_pos++;
1825                        p++;
1826                }
1827        }
1828
1829        ret = 1;
1830out:
1831        put_task_struct(task);
1832out_no_task:
1833        return ret;
1834}
1835
1836#ifdef CONFIG_SECURITY
1837static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
1838                                  size_t count, loff_t *ppos)
1839{
1840        struct inode * inode = file->f_path.dentry->d_inode;
1841        char *p = NULL;
1842        ssize_t length;
1843        struct task_struct *task = get_proc_task(inode);
1844
1845        if (!task)
1846                return -ESRCH;
1847
1848        length = security_getprocattr(task,
1849                                      (char*)file->f_path.dentry->d_name.name,
1850                                      &p);
1851        put_task_struct(task);
1852        if (length > 0)
1853                length = simple_read_from_buffer(buf, count, ppos, p, length);
1854        kfree(p);
1855        return length;
1856}
1857
1858static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
1859                                   size_t count, loff_t *ppos)
1860{
1861        struct inode * inode = file->f_path.dentry->d_inode;
1862        char *page;
1863        ssize_t length;
1864        struct task_struct *task = get_proc_task(inode);
1865
1866        length = -ESRCH;
1867        if (!task)
1868                goto out_no_task;
1869        if (count > PAGE_SIZE)
1870                count = PAGE_SIZE;
1871
1872        /* No partial writes. */
1873        length = -EINVAL;
1874        if (*ppos != 0)
1875                goto out;
1876
1877        length = -ENOMEM;
1878        page = (char*)__get_free_page(GFP_TEMPORARY);
1879        if (!page)
1880                goto out;
1881
1882        length = -EFAULT;
1883        if (copy_from_user(page, buf, count))
1884                goto out_free;
1885
1886        length = security_setprocattr(task,
1887                                      (char*)file->f_path.dentry->d_name.name,
1888                                      (void*)page, count);
1889out_free:
1890        free_page((unsigned long) page);
1891out:
1892        put_task_struct(task);
1893out_no_task:
1894        return length;
1895}
1896
1897static const struct file_operations proc_pid_attr_operations = {
1898        .read           = proc_pid_attr_read,
1899        .write          = proc_pid_attr_write,
1900};
1901
1902static const struct pid_entry attr_dir_stuff[] = {
1903        REG("current",    S_IRUGO|S_IWUGO, pid_attr),
1904        REG("prev",       S_IRUGO,         pid_attr),
1905        REG("exec",       S_IRUGO|S_IWUGO, pid_attr),
1906        REG("fscreate",   S_IRUGO|S_IWUGO, pid_attr),
1907        REG("keycreate",  S_IRUGO|S_IWUGO, pid_attr),
1908        REG("sockcreate", S_IRUGO|S_IWUGO, pid_attr),
1909};
1910
1911static int proc_attr_dir_readdir(struct file * filp,
1912                             void * dirent, filldir_t filldir)
1913{
1914        return proc_pident_readdir(filp,dirent,filldir,
1915                                   attr_dir_stuff,ARRAY_SIZE(attr_dir_stuff));
1916}
1917
1918static const struct file_operations proc_attr_dir_operations = {
1919        .read           = generic_read_dir,
1920        .readdir        = proc_attr_dir_readdir,
1921};
1922
1923static struct dentry *proc_attr_dir_lookup(struct inode *dir,
1924                                struct dentry *dentry, struct nameidata *nd)
1925{
1926        return proc_pident_lookup(dir, dentry,
1927                                  attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff));
1928}
1929
1930static const struct inode_operations proc_attr_dir_inode_operations = {
1931        .lookup         = proc_attr_dir_lookup,
1932        .getattr        = pid_getattr,
1933        .setattr        = proc_setattr,
1934};
1935
1936#endif
1937
1938#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
1939static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf,
1940                                         size_t count, loff_t *ppos)
1941{
1942        struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
1943        struct mm_struct *mm;
1944        char buffer[PROC_NUMBUF];
1945        size_t len;
1946        int ret;
1947
1948        if (!task)
1949                return -ESRCH;
1950
1951        ret = 0;
1952        mm = get_task_mm(task);
1953        if (mm) {
1954                len = snprintf(buffer, sizeof(buffer), "%08lx\n",
1955                               ((mm->flags & MMF_DUMP_FILTER_MASK) >>
1956                                MMF_DUMP_FILTER_SHIFT));
1957                mmput(mm);
1958                ret = simple_read_from_buffer(buf, count, ppos, buffer, len);
1959        }
1960
1961        put_task_struct(task);
1962
1963        return ret;
1964}
1965
1966static ssize_t proc_coredump_filter_write(struct file *file,
1967                                          const char __user *buf,
1968                                          size_t count,
1969                                          loff_t *ppos)
1970{
1971        struct task_struct *task;
1972        struct mm_struct *mm;
1973        char buffer[PROC_NUMBUF], *end;
1974        unsigned int val;
1975        int ret;
1976        int i;
1977        unsigned long mask;
1978
1979        ret = -EFAULT;
1980        memset(buffer, 0, sizeof(buffer));
1981        if (count > sizeof(buffer) - 1)
1982                count = sizeof(buffer) - 1;
1983        if (copy_from_user(buffer, buf, count))
1984                goto out_no_task;
1985
1986        ret = -EINVAL;
1987        val = (unsigned int)simple_strtoul(buffer, &end, 0);
1988        if (*end == '\n')
1989                end++;
1990        if (end - buffer == 0)
1991                goto out_no_task;
1992
1993        ret = -ESRCH;
1994        task = get_proc_task(file->f_dentry->d_inode);
1995        if (!task)
1996                goto out_no_task;
1997
1998        ret = end - buffer;
1999        mm = get_task_mm(task);
2000        if (!mm)
2001                goto out_no_mm;
2002
2003        for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) {
2004                if (val & mask)
2005                        set_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags);
2006                else
2007                        clear_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags);
2008        }
2009
2010        mmput(mm);
2011 out_no_mm:
2012        put_task_struct(task);
2013 out_no_task:
2014        return ret;
2015}
2016
2017static const struct file_operations proc_coredump_filter_operations = {
2018        .read           = proc_coredump_filter_read,
2019        .write          = proc_coredump_filter_write,
2020};
2021#endif
2022
2023/*
2024 * /proc/self:
2025 */
2026static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
2027                              int buflen)
2028{
2029        char tmp[PROC_NUMBUF];
2030        sprintf(tmp, "%d", task_tgid_vnr(current));
2031        return vfs_readlink(dentry,buffer,buflen,tmp);
2032}
2033
2034static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
2035{
2036        char tmp[PROC_NUMBUF];
2037        sprintf(tmp, "%d", task_tgid_vnr(current));
2038        return ERR_PTR(vfs_follow_link(nd,tmp));
2039}
2040
2041static const struct inode_operations proc_self_inode_operations = {
2042        .readlink       = proc_self_readlink,
2043        .follow_link    = proc_self_follow_link,
2044};
2045
2046/*
2047 * proc base
2048 *
2049 * These are the directory entries in the root directory of /proc
2050 * that properly belong to the /proc filesystem, as they describe
2051 * describe something that is process related.
2052 */
2053static const struct pid_entry proc_base_stuff[] = {
2054        NOD("self", S_IFLNK|S_IRWXUGO,
2055                &proc_self_inode_operations, NULL, {}),
2056};
2057
2058/*
2059 *      Exceptional case: normally we are not allowed to unhash a busy
2060 * directory. In this case, however, we can do it - no aliasing problems
2061 * due to the way we treat inodes.
2062 */
2063static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd)
2064{
2065        struct inode *inode = dentry->d_inode;
2066        struct task_struct *task = get_proc_task(inode);
2067        if (task) {
2068                put_task_struct(task);
2069                return 1;
2070        }
2071        d_drop(dentry);
2072        return 0;
2073}
2074
2075static struct dentry_operations proc_base_dentry_operations =
2076{
2077        .d_revalidate   = proc_base_revalidate,
2078        .d_delete       = pid_delete_dentry,
2079};
2080
2081static struct dentry *proc_base_instantiate(struct inode *dir,
2082        struct dentry *dentry, struct task_struct *task, const void *ptr)
2083{
2084        const struct pid_entry *p = ptr;
2085        struct inode *inode;
2086        struct proc_inode *ei;
2087        struct dentry *error = ERR_PTR(-EINVAL);
2088
2089        /* Allocate the inode */
2090        error = ERR_PTR(-ENOMEM);
2091        inode = new_inode(dir->i_sb);
2092        if (!inode)
2093                goto out;
2094
2095        /* Initialize the inode */
2096        ei = PROC_I(inode);
2097        inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
2098
2099        /*
2100         * grab the reference to the task.
2101         */
2102        ei->pid = get_task_pid(task, PIDTYPE_PID);
2103        if (!ei->pid)
2104                goto out_iput;
2105
2106        inode->i_uid = 0;
2107        inode->i_gid = 0;
2108        inode->i_mode = p->mode;
2109        if (S_ISDIR(inode->i_mode))
2110                inode->i_nlink = 2;
2111        if (S_ISLNK(inode->i_mode))
2112                inode->i_size = 64;
2113        if (p->iop)
2114                inode->i_op = p->iop;
2115        if (p->fop)
2116                inode->i_fop = p->fop;
2117        ei->op = p->op;
2118        dentry->d_op = &proc_base_dentry_operations;
2119        d_add(dentry, inode);
2120        error = NULL;
2121out:
2122        return error;
2123out_iput:
2124        iput(inode);
2125        goto out;
2126}
2127
2128static struct dentry *proc_base_lookup(struct inode *dir, struct dentry *dentry)
2129{
2130        struct dentry *error;
2131        struct task_struct *task = get_proc_task(dir);
2132        const struct pid_entry *p, *last;
2133
2134        error = ERR_PTR(-ENOENT);
2135
2136        if (!task)
2137                goto out_no_task;
2138
2139        /* Lookup the directory entry */
2140        last = &proc_base_stuff[ARRAY_SIZE(proc_base_stuff) - 1];
2141        for (p = proc_base_stuff; p <= last; p++) {
2142                if (p->len != dentry->d_name.len)
2143                        continue;
2144                if (!memcmp(dentry->d_name.name, p->name, p->len))
2145                        break;
2146        }
2147        if (p > last)
2148                goto out;
2149
2150        error = proc_base_instantiate(dir, dentry, task, p);
2151
2152out:
2153        put_task_struct(task);
2154out_no_task:
2155        return error;
2156}
2157
2158static int proc_base_fill_cache(struct file *filp, void *dirent,
2159        filldir_t filldir, struct task_struct *task, const struct pid_entry *p)
2160{
2161        return proc_fill_cache(filp, dirent, filldir, p->name, p->len,
2162                                proc_base_instantiate, task, p);
2163}
2164
2165#ifdef CONFIG_TASK_IO_ACCOUNTING
2166static int proc_pid_io_accounting(struct task_struct *task, char *buffer)
2167{
2168        return sprintf(buffer,
2169#ifdef CONFIG_TASK_XACCT
2170                        "rchar: %llu\n"
2171                        "wchar: %llu\n"
2172                        "syscr: %llu\n"
2173                        "syscw: %llu\n"
2174#endif
2175                        "read_bytes: %llu\n"
2176                        "write_bytes: %llu\n"
2177                        "cancelled_write_bytes: %llu\n",
2178#ifdef CONFIG_TASK_XACCT
2179                        (unsigned long long)task->rchar,
2180                        (unsigned long long)task->wchar,
2181                        (unsigned long long)task->syscr,
2182                        (unsigned long long)task->syscw,
2183#endif
2184                        (unsigned long long)task->ioac.read_bytes,
2185                        (unsigned long long)task->ioac.write_bytes,
2186                        (unsigned long long)task->ioac.cancelled_write_bytes);
2187}
2188#endif
2189
2190/*
2191 * Thread groups
2192 */
2193static const struct file_operations proc_task_operations;
2194static const struct inode_operations proc_task_inode_operations;
2195
2196static const struct pid_entry tgid_base_stuff[] = {
2197        DIR("task",       S_IRUGO|S_IXUGO, task),
2198        DIR("fd",         S_IRUSR|S_IXUSR, fd),
2199        DIR("fdinfo",     S_IRUSR|S_IXUSR, fdinfo),
2200        REG("environ",    S_IRUSR, environ),
2201        INF("auxv",       S_IRUSR, pid_auxv),
2202        INF("status",     S_IRUGO, pid_status),
2203        INF("limits",     S_IRUSR, pid_limits),
2204#ifdef CONFIG_SCHED_DEBUG
2205        REG("sched",      S_IRUGO|S_IWUSR, pid_sched),
2206#endif
2207        INF("cmdline",    S_IRUGO, pid_cmdline),
2208        INF("stat",       S_IRUGO, tgid_stat),
2209        INF("statm",      S_IRUGO, pid_statm),
2210        REG("maps",       S_IRUGO, maps),
2211#ifdef CONFIG_NUMA
2212        REG("numa_maps",  S_IRUGO, numa_maps),
2213#endif
2214        REG("mem",        S_IRUSR|S_IWUSR, mem),
2215        LNK("cwd",        cwd),
2216        LNK("root",       root),
2217        LNK("exe",        exe),
2218        REG("mounts",     S_IRUGO, mounts),
2219        REG("mountstats", S_IRUSR, mountstats),
2220#ifdef CONFIG_MMU
2221        REG("clear_refs", S_IWUSR, clear_refs),
2222        REG("smaps",      S_IRUGO, smaps),
2223#endif
2224#ifdef CONFIG_SECURITY
2225        DIR("attr",       S_IRUGO|S_IXUGO, attr_dir),
2226#endif
2227#ifdef CONFIG_KALLSYMS
2228        INF("wchan",      S_IRUGO, pid_wchan),
2229#endif
2230#ifdef CONFIG_SCHEDSTATS
2231        INF("schedstat",  S_IRUGO, pid_schedstat),
2232#endif
2233#ifdef CONFIG_PROC_PID_CPUSET
2234        REG("cpuset",     S_IRUGO, cpuset),
2235#endif
2236#ifdef CONFIG_CGROUPS
2237        REG("cgroup",  S_IRUGO, cgroup),
2238#endif
2239        INF("oom_score",  S_IRUGO, oom_score),
2240        REG("oom_adj",    S_IRUGO|S_IWUSR, oom_adjust),
2241#ifdef CONFIG_AUDITSYSCALL
2242        REG("loginuid",   S_IWUSR|S_IRUGO, loginuid),
2243#endif
2244#ifdef CONFIG_FAULT_INJECTION
2245        REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject),
2246#endif
2247#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
2248        REG("coredump_filter", S_IRUGO|S_IWUSR, coredump_filter),
2249#endif
2250#ifdef CONFIG_TASK_IO_ACCOUNTING
2251        INF("io",       S_IRUGO, pid_io_accounting),
2252#endif
2253};
2254
2255static int proc_tgid_base_readdir(struct file * filp,
2256                             void * dirent, filldir_t filldir)
2257{
2258        return proc_pident_readdir(filp,dirent,filldir,
2259                                   tgid_base_stuff,ARRAY_SIZE(tgid_base_stuff));
2260}
2261
2262static const struct file_operations proc_tgid_base_operations = {
2263        .read           = generic_read_dir,
2264        .readdir        = proc_tgid_base_readdir,
2265};
2266
2267static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){
2268        return proc_pident_lookup(dir, dentry,
2269                                  tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
2270}
2271
2272static const struct inode_operations proc_tgid_base_inode_operations = {
2273        .lookup         = proc_tgid_base_lookup,
2274        .getattr        = pid_getattr,
2275        .setattr        = proc_setattr,
2276};
2277
2278static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
2279{
2280        struct dentry *dentry, *leader, *dir;
2281        char buf[PROC_NUMBUF];
2282        struct qstr name;
2283
2284        name.name = buf;
2285        name.len = snprintf(buf, sizeof(buf), "%d", pid);
2286        dentry = d_hash_and_lookup(mnt->mnt_root, &name);
2287        if (dentry) {
2288                shrink_dcache_parent(dentry);
2289                d_drop(dentry);
2290                dput(dentry);
2291        }
2292
2293        if (tgid == 0)
2294                goto out;
2295
2296        name.name = buf;
2297        name.len = snprintf(buf, sizeof(buf), "%d", tgid);
2298        leader = d_hash_and_lookup(mnt->mnt_root, &name);
2299        if (!leader)
2300                goto out;
2301
2302        name.name = "task";
2303        name.len = strlen(name.name);
2304        dir = d_hash_and_lookup(leader, &name);
2305        if (!dir)
2306                goto out_put_leader;
2307
2308        name.name = buf;
2309        name.len = snprintf(buf, sizeof(buf), "%d", pid);
2310        dentry = d_hash_and_lookup(dir, &name);
2311        if (dentry) {
2312                shrink_dcache_parent(dentry);
2313                d_drop(dentry);
2314                dput(dentry);
2315        }
2316
2317        dput(dir);
2318out_put_leader:
2319        dput(leader);
2320out:
2321        return;
2322}
2323
2324/**
2325 * proc_flush_task -  Remove dcache entries for @task from the /proc dcache.
2326 * @task: task that should be flushed.
2327 *
2328 * When flushing dentries from proc, one needs to flush them from global
2329 * proc (proc_mnt) and from all the namespaces' procs this task was seen
2330 * in. This call is supposed to do all of this job.
2331 *
2332 * Looks in the dcache for
2333 * /proc/@pid
2334 * /proc/@tgid/task/@pid
2335 * if either directory is present flushes it and all of it'ts children
2336 * from the dcache.
2337 *
2338 * It is safe and reasonable to cache /proc entries for a task until
2339 * that task exits.  After that they just clog up the dcache with
2340 * useless entries, possibly causing useful dcache entries to be
2341 * flushed instead.  This routine is proved to flush those useless
2342 * dcache entries at process exit time.
2343 *
2344 * NOTE: This routine is just an optimization so it does not guarantee
2345 *       that no dcache entries will exist at process exit time it
2346 *       just makes it very unlikely that any will persist.
2347 */
2348
2349void proc_flush_task(struct task_struct *task)
2350{
2351        int i;
2352        struct pid *pid, *tgid = NULL;
2353        struct upid *upid;
2354
2355        pid = task_pid(task);
2356        if (thread_group_leader(task))
2357                tgid = task_tgid(task);
2358
2359        for (i = 0; i <= pid->level; i++) {
2360                upid = &pid->numbers[i];
2361                proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
2362                        tgid ? tgid->numbers[i].nr : 0);
2363        }
2364
2365        upid = &pid->numbers[pid->level];
2366        if (upid->nr == 1)
2367                pid_ns_release_proc(upid->ns);
2368}
2369
2370static struct dentry *proc_pid_instantiate(struct inode *dir,
2371                                           struct dentry * dentry,
2372                                           struct task_struct *task, const void *ptr)
2373{
2374        struct dentry *error = ERR_PTR(-ENOENT);
2375        struct inode *inode;
2376
2377        inode = proc_pid_make_inode(dir->i_sb, task);
2378        if (!inode)
2379                goto out;
2380
2381        inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO;
2382        inode->i_op = &proc_tgid_base_inode_operations;
2383        inode->i_fop = &proc_tgid_base_operations;
2384        inode->i_flags|=S_IMMUTABLE;
2385        inode->i_nlink = 5;
2386#ifdef CONFIG_SECURITY
2387        inode->i_nlink += 1;
2388#endif
2389
2390        dentry->d_op = &pid_dentry_operations;
2391
2392        d_add(dentry, inode);
2393        /* Close the race of the process dying before we return the dentry */
2394        if (pid_revalidate(dentry, NULL))
2395                error = NULL;
2396out:
2397        return error;
2398}
2399
2400struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
2401{
2402        struct dentry *result = ERR_PTR(-ENOENT);
2403        struct task_struct *task;
2404        unsigned tgid;
2405        struct pid_namespace *ns;
2406
2407        result = proc_base_lookup(dir, dentry);
2408        if (!IS_ERR(result) || PTR_ERR(result) != -ENOENT)
2409                goto out;
2410
2411        tgid = name_to_int(dentry);
2412        if (tgid == ~0U)
2413                goto out;
2414
2415        ns = dentry->d_sb->s_fs_info;
2416        rcu_read_lock();
2417        task = find_task_by_pid_ns(tgid, ns);
2418        if (task)
2419                get_task_struct(task);
2420        rcu_read_unlock();
2421        if (!task)
2422                goto out;
2423
2424        result = proc_pid_instantiate(dir, dentry, task, NULL);
2425        put_task_struct(task);
2426out:
2427        return result;
2428}
2429
2430/*
2431 * Find the first task with tgid >= tgid
2432 *
2433 */
/* Cursor passed to and returned by next_tgid(). */
struct tgid_iter {
        unsigned int tgid;              /* id to search from / id that was found */
        struct task_struct *task;       /* task found for tgid, or NULL */
};
2438static struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter iter)
2439{
2440        struct pid *pid;
2441
2442        if (iter.task)
2443                put_task_struct(iter.task);
2444        rcu_read_lock();
2445retry:
2446        iter.task = NULL;
2447        pid = find_ge_pid(iter.tgid, ns);
2448        if (pid) {
2449                iter.tgid = pid_nr_ns(pid, ns);
2450                iter.task = pid_task(pid, PIDTYPE_PID);
2451                /* What we to know is if the pid we have find is the
2452                 * pid of a thread_group_leader.  Testing for task
2453                 * being a thread_group_leader is the obvious thing
2454                 * todo but there is a window when it fails, due to
2455                 * the pid transfer logic in de_thread.
2456                 *
2457                 * So we perform the straight forward test of seeing
2458                 * if the pid we have found is the pid of a thread
2459                 * group leader, and don't worry if the task we have
2460                 * found doesn't happen to be a thread group leader.
2461                 * As we don't care in the case of readdir.
2462                 */
2463                if (!iter.task || !has_group_leader_pid(iter.task)) {
2464                        iter.tgid += 1;
2465                        goto retry;
2466                }
2467                get_task_struct(iter.task);
2468        }
2469        rcu_read_unlock();
2470        return iter;
2471}
2472
/*
 * Distance between a tgid and its directory position in the proc root:
 * proc_pid_readdir() places the proc_base_stuff[] entries at
 * FIRST_PROCESS_ENTRY and the numbered entries right after them.
 */
#define TGID_OFFSET (FIRST_PROCESS_ENTRY + ARRAY_SIZE(proc_base_stuff))
2474
2475static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
2476        struct tgid_iter iter)
2477{
2478        char name[PROC_NUMBUF];
2479        int len = snprintf(name, sizeof(name), "%d", iter.tgid);
2480        return proc_fill_cache(filp, dirent, filldir, name, len,
2481                                proc_pid_instantiate, iter.task, NULL);
2482}
2483
2484/* for the /proc/ directory itself, after non-process stuff has been done */
2485int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
2486{
2487        unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY;
2488        struct task_struct *reaper = get_proc_task(filp->f_path.dentry->d_inode);
2489        struct tgid_iter iter;
2490        struct pid_namespace *ns;
2491
2492        if (!reaper)
2493                goto out_no_task;
2494
2495        for (; nr < ARRAY_SIZE(proc_base_stuff); filp->f_pos++, nr++) {
2496                const struct pid_entry *p = &proc_base_stuff[nr];
2497                if (proc_base_fill_cache(filp, dirent, filldir, reaper, p) < 0)
2498                        goto out;
2499        }
2500
2501        ns = filp->f_dentry->d_sb->s_fs_info;
2502        iter.task = NULL;
2503        iter.tgid = filp->f_pos - TGID_OFFSET;
2504        for (iter = next_tgid(ns, iter);
2505             iter.task;
2506             iter.tgid += 1, iter = next_tgid(ns, iter)) {
2507                filp->f_pos = iter.tgid + TGID_OFFSET;
2508                if (proc_pid_fill_cache(filp, dirent, filldir, iter) < 0) {
2509                        put_task_struct(iter.task);
2510                        goto out;
2511                }
2512        }
2513        filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET;
2514out:
2515        put_task_struct(reaper);
2516out_no_task:
2517        return 0;
2518}
2519
2520/*
2521 * Tasks
2522 */
2523static const struct pid_entry tid_base_stuff[] = {
2524        DIR("fd",        S_IRUSR|S_IXUSR, fd),
2525        DIR("fdinfo",    S_IRUSR|S_IXUSR, fdinfo),
2526        REG("environ",   S_IRUSR, environ),
2527        INF("auxv",      S_IRUSR, pid_auxv),
2528        INF("status",    S_IRUGO, pid_status),
2529        INF("limits",    S_IRUSR, pid_limits),
2530#ifdef CONFIG_SCHED_DEBUG
2531        REG("sched",     S_IRUGO|S_IWUSR, pid_sched),
2532#endif
2533        INF("cmdline",   S_IRUGO, pid_cmdline),
2534        INF("stat",      S_IRUGO, tid_stat),
2535        INF("statm",     S_IRUGO, pid_statm),
2536        REG("maps",      S_IRUGO, maps),
2537#ifdef CONFIG_NUMA
2538        REG("numa_maps", S_IRUGO, numa_maps),
2539#endif
2540        REG("mem",       S_IRUSR|S_IWUSR, mem),
2541        LNK("cwd",       cwd),
2542        LNK("root",      root),
2543        LNK("exe",       exe),
2544        REG("mounts",    S_IRUGO, mounts),
2545#ifdef CONFIG_MMU
2546        REG("clear_refs", S_IWUSR, clear_refs),
2547        REG("smaps",     S_IRUGO, smaps),
2548#endif
2549#ifdef CONFIG_SECURITY
2550        DIR("attr",      S_IRUGO|S_IXUGO, attr_dir),
2551#endif
2552#ifdef CONFIG_KALLSYMS
2553        INF("wchan",     S_IRUGO, pid_wchan),
2554#endif
2555#ifdef CONFIG_SCHEDSTATS
2556        INF("schedstat", S_IRUGO, pid_schedstat),
2557#endif
2558#ifdef CONFIG_PROC_PID_CPUSET
2559        REG("cpuset",    S_IRUGO, cpuset),
2560#endif
2561#ifdef CONFIG_CGROUPS
2562        REG("cgroup",  S_IRUGO, cgroup),
2563#endif
2564        INF("oom_score", S_IRUGO, oom_score),
2565        REG("oom_adj",   S_IRUGO|S_IWUSR, oom_adjust),
2566#ifdef CONFIG_AUDITSYSCALL
2567        REG("loginuid",  S_IWUSR|S_IRUGO, loginuid),
2568#endif
2569#ifdef CONFIG_FAULT_INJECTION
2570        REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject),
2571#endif
2572};
2573
2574static int proc_tid_base_readdir(struct file * filp,
2575                             void * dirent, filldir_t filldir)
2576{
2577        return proc_pident_readdir(filp,dirent,filldir,
2578                                   tid_base_stuff,ARRAY_SIZE(tid_base_stuff));
2579}
2580
2581static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){
2582        return proc_pident_lookup(dir, dentry,
2583                                  tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
2584}
2585
2586static const struct file_operations proc_tid_base_operations = {
2587        .read           = generic_read_dir,
2588        .readdir        = proc_tid_base_readdir,
2589};
2590
2591static const struct inode_operations proc_tid_base_inode_operations = {
2592        .lookup         = proc_tid_base_lookup,
2593        .getattr        = pid_getattr,
2594        .setattr        = proc_setattr,
2595};
2596
2597static struct dentry *proc_task_instantiate(struct inode *dir,
2598        struct dentry *dentry, struct task_struct *task, const void *ptr)
2599{
2600        struct dentry *error = ERR_PTR(-ENOENT);
2601        struct inode *inode;
2602        inode = proc_pid_make_inode(dir->i_sb, task);
2603
2604        if (!inode)
2605                goto out;
2606        inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO;
2607        inode->i_op = &proc_tid_base_inode_operations;
2608        inode->i_fop = &proc_tid_base_operations;
2609        inode->i_flags|=S_IMMUTABLE;
2610        inode->i_nlink = 4;
2611#ifdef CONFIG_SECURITY
2612        inode->i_nlink += 1;
2613#endif
2614
2615        dentry->d_op = &pid_dentry_operations;
2616
2617        d_add(dentry, inode);
2618        /* Close the race of the process dying before we return the dentry */
2619        if (pid_revalidate(dentry, NULL))
2620                error = NULL;
2621out:
2622        return error;
2623}
2624
2625static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
2626{
2627        struct dentry *result = ERR_PTR(-ENOENT);
2628        struct task_struct *task;
2629        struct task_struct *leader = get_proc_task(dir);
2630        unsigned tid;
2631        struct pid_namespace *ns;
2632
2633        if (!leader)
2634                goto out_no_task;
2635
2636        tid = name_to_int(dentry);
2637        if (tid == ~0U)
2638                goto out;
2639
2640        ns = dentry->d_sb->s_fs_info;
2641        rcu_read_lock();
2642        task = find_task_by_pid_ns(tid, ns);
2643        if (task)
2644                get_task_struct(task);
2645        rcu_read_unlock();
2646        if (!task)
2647                goto out;
2648        if (!same_thread_group(leader, task))
2649                goto out_drop_task;
2650
2651        result = proc_task_instantiate(dir, dentry, task, NULL);
2652out_drop_task:
2653        put_task_struct(task);
2654out:
2655        put_task_struct(leader);
2656out_no_task:
2657        return result;
2658}
2659
2660/*
2661 * Find the first tid of a thread group to return to user space.
2662 *
2663 * Usually this is just the thread group leader, but if the users
2664 * buffer was too small or there was a seek into the middle of the
2665 * directory we have more work todo.
2666 *
2667 * In the case of a short read we start with find_task_by_pid.
2668 *
2669 * In the case of a seek we start with the leader and walk nr
2670 * threads past it.
2671 */
2672static struct task_struct *first_tid(struct task_struct *leader,
2673                int tid, int nr, struct pid_namespace *ns)
2674{
2675        struct task_struct *pos;
2676
2677        rcu_read_lock();
2678        /* Attempt to start with the pid of a thread */
2679        if (tid && (nr > 0)) {
2680                pos = find_task_by_pid_ns(tid, ns);
2681                if (pos && (pos->group_leader == leader))
2682                        goto found;
2683        }
2684
2685        /* If nr exceeds the number of threads there is nothing todo */
2686        pos = NULL;
2687        if (nr && nr >= get_nr_threads(leader))
2688                goto out;
2689
2690        /* If we haven't found our starting place yet start
2691         * with the leader and walk nr threads forward.
2692         */
2693        for (pos = leader; nr > 0; --nr) {
2694                pos = next_thread(pos);
2695                if (pos == leader) {
2696                        pos = NULL;
2697                        goto out;
2698                }
2699        }
2700found:
2701        get_task_struct(pos);
2702out:
2703        rcu_read_unlock();
2704        return pos;
2705}
2706
2707/*
2708 * Find the next thread in the thread list.
2709 * Return NULL if there is an error or no next thread.
2710 *
2711 * The reference to the input task_struct is released.
2712 */
2713static struct task_struct *next_tid(struct task_struct *start)
2714{
2715        struct task_struct *pos = NULL;
2716        rcu_read_lock();
2717        if (pid_alive(start)) {
2718                pos = next_thread(start);
2719                if (thread_group_leader(pos))
2720                        pos = NULL;
2721                else
2722                        get_task_struct(pos);
2723        }
2724        rcu_read_unlock();
2725        put_task_struct(start);
2726        return pos;
2727}
2728
2729static int proc_task_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
2730        struct task_struct *task, int tid)
2731{
2732        char name[PROC_NUMBUF];
2733        int len = snprintf(name, sizeof(name), "%d", tid);
2734        return proc_fill_cache(filp, dirent, filldir, name, len,
2735                                proc_task_instantiate, task, NULL);
2736}
2737
2738/* for the /proc/TGID/task/ directories */
2739static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir)
2740{
2741        struct dentry *dentry = filp->f_path.dentry;
2742        struct inode *inode = dentry->d_inode;
2743        struct task_struct *leader = NULL;
2744        struct task_struct *task;
2745        int retval = -ENOENT;
2746        ino_t ino;
2747        int tid;
2748        unsigned long pos = filp->f_pos;  /* avoiding "long long" filp->f_pos */
2749        struct pid_namespace *ns;
2750
2751        task = get_proc_task(inode);
2752        if (!task)
2753                goto out_no_task;
2754        rcu_read_lock();
2755        if (pid_alive(task)) {
2756                leader = task->group_leader;
2757                get_task_struct(leader);
2758        }
2759        rcu_read_unlock();
2760        put_task_struct(task);
2761        if (!leader)
2762                goto out_no_task;
2763        retval = 0;
2764
2765        switch (pos) {
2766        case 0:
2767                ino = inode->i_ino;
2768                if (filldir(dirent, ".", 1, pos, ino, DT_DIR) < 0)
2769                        goto out;
2770                pos++;
2771                /* fall through */
2772        case 1:
2773                ino = parent_ino(dentry);
2774                if (filldir(dirent, "..", 2, pos, ino, DT_DIR) < 0)
2775                        goto out;
2776                pos++;
2777                /* fall through */
2778        }
2779
2780        /* f_version caches the tgid value that the last readdir call couldn't
2781         * return. lseek aka telldir automagically resets f_version to 0.
2782         */
2783        ns = filp->f_dentry->d_sb->s_fs_info;
2784        tid = (int)filp->f_version;
2785        filp->f_version = 0;
2786        for (task = first_tid(leader, tid, pos - 2, ns);
2787             task;
2788             task = next_tid(task), pos++) {
2789                tid = task_pid_nr_ns(task, ns);
2790                if (proc_task_fill_cache(filp, dirent, filldir, task, tid) < 0) {
2791                        /* returning this tgid failed, save it as the first
2792                         * pid for the next readir call */
2793                        filp->f_version = (u64)tid;
2794                        put_task_struct(task);
2795                        break;
2796                }
2797        }
2798out:
2799        filp->f_pos = pos;
2800        put_task_struct(leader);
2801out_no_task:
2802        return retval;
2803}
2804
2805static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
2806{
2807        struct inode *inode = dentry->d_inode;
2808        struct task_struct *p = get_proc_task(inode);
2809        generic_fillattr(inode, stat);
2810
2811        if (p) {
2812                rcu_read_lock();
2813                stat->nlink += get_nr_threads(p);
2814                rcu_read_unlock();
2815                put_task_struct(p);
2816        }
2817
2818        return 0;
2819}
2820
2821static const struct inode_operations proc_task_inode_operations = {
2822        .lookup         = proc_task_lookup,
2823        .getattr        = proc_task_getattr,
2824        .setattr        = proc_setattr,
2825};
2826
2827static const struct file_operations proc_task_operations = {
2828        .read           = generic_read_dir,
2829        .readdir        = proc_task_readdir,
2830};
2831