/*
 *  linux/fs/exec.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * #!-checking implemented by tytso.
 */
/*
 * Demand-loading implemented 01.12.91 - no need to read anything but
 * the header into memory. The inode of the executable is put into
 * "current->executable", and page faults do the actual loading. Clean.
 *
 * Once more I can proudly say that linux stood up to being changed: it
 * was less than 2 hours work to get demand-loading completely implemented.
 *
 * Demand loading changed July 1993 by Eric Youngdale.  Use mmap instead,
 * current->executable is only used by the procfs.  This allows a dispatch
 * table to check for several different types of binary formats.  We keep
 * trying until we recognize the file or we run out of supported binary
 * formats.
 */

#include <linux/slab.h>
#include <linux/file.h>
#include <linux/mman.h>
#include <linux/a.out.h>
#include <linux/stat.h>
#include <linux/fcntl.h>
#include <linux/smp_lock.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/spinlock.h>
#include <linux/key.h>
#include <linux/personality.h>
#include <linux/binfmts.h>
#include <linux/swap.h>
#include <linux/utsname.h>
#include <linux/pid_namespace.h>
#include <linux/module.h>
#include <linux/namei.h>
#include <linux/proc_fs.h>
#include <linux/ptrace.h>
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/rmap.h>
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/audit.h>

#include <asm/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/tlb.h>

#ifdef CONFIG_KMOD
#include <linux/kmod.h>
#endif

int core_uses_pid;
char core_pattern[CORENAME_MAX_SIZE] = "core";
int suid_dumpable = 0;

/* The maximal length of core_pattern is also specified in sysctl.c */

static LIST_HEAD(formats);
static DEFINE_RWLOCK(binfmt_lock);

int register_binfmt(struct linux_binfmt * fmt)
{
	if (!fmt)
		return -EINVAL;
	write_lock(&binfmt_lock);
	list_add(&fmt->lh, &formats);
	write_unlock(&binfmt_lock);
	return 0;
}

EXPORT_SYMBOL(register_binfmt);

void unregister_binfmt(struct linux_binfmt * fmt)
{
	write_lock(&binfmt_lock);
	list_del(&fmt->lh);
	write_unlock(&binfmt_lock);
}

EXPORT_SYMBOL(unregister_binfmt);
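
/*
 * Usage sketch (illustrative, not part of this file): a binary-format
 * handler is typically a module that fills in a struct linux_binfmt and
 * registers it at init time.  The names "hypothetical_format" and
 * "load_hypothetical_binary" below are made up for the example.
 *
 *	static int load_hypothetical_binary(struct linux_binprm *bprm,
 *					    struct pt_regs *regs);
 *
 *	static struct linux_binfmt hypothetical_format = {
 *		.module	     = THIS_MODULE,
 *		.load_binary = load_hypothetical_binary,
 *	};
 *
 *	static int __init init_hypothetical_binfmt(void)
 *	{
 *		return register_binfmt(&hypothetical_format);
 *	}
 *
 *	static void __exit exit_hypothetical_binfmt(void)
 *	{
 *		unregister_binfmt(&hypothetical_format);
 *	}
 *
 * search_binary_handler() below then walks the resulting "formats" list
 * until some handler's load_binary returns something other than -ENOEXEC.
 */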

static inline void put_binfmt(struct linux_binfmt * fmt)
{
	module_put(fmt->module);
}

/*
 * Note that a shared library must be both readable and executable due to
 * security reasons.
 *
 * Also note that the address to load at is taken from the file itself.
 */
asmlinkage long sys_uselib(const char __user * library)
{
	struct file * file;
	struct nameidata nd;
	int error;

	error = __user_path_lookup_open(library, LOOKUP_FOLLOW, &nd, FMODE_READ|FMODE_EXEC);
	if (error)
		goto out;

	error = -EINVAL;
	if (!S_ISREG(nd.dentry->d_inode->i_mode))
		goto exit;

	error = vfs_permission(&nd, MAY_READ | MAY_EXEC);
	if (error)
		goto exit;

	file = nameidata_to_filp(&nd, O_RDONLY);
	error = PTR_ERR(file);
	if (IS_ERR(file))
		goto out;

	error = -ENOEXEC;
	if (file->f_op) {
		struct linux_binfmt * fmt;

		read_lock(&binfmt_lock);
		list_for_each_entry(fmt, &formats, lh) {
			if (!fmt->load_shlib)
				continue;
			if (!try_module_get(fmt->module))
				continue;
			read_unlock(&binfmt_lock);
			error = fmt->load_shlib(file);
			read_lock(&binfmt_lock);
			put_binfmt(fmt);
			if (error != -ENOEXEC)
				break;
		}
		read_unlock(&binfmt_lock);
	}
	fput(file);
out:
	return error;
exit:
	release_open_intent(&nd);
	path_release(&nd);
	goto out;
}

#ifdef CONFIG_MMU

static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
		int write)
{
	struct page *page;
	int ret;

#ifdef CONFIG_STACK_GROWSUP
	if (write) {
		ret = expand_stack_downwards(bprm->vma, pos);
		if (ret < 0)
			return NULL;
	}
#endif
	ret = get_user_pages(current, bprm->mm, pos,
			1, write, 1, &page, NULL);
	if (ret <= 0)
		return NULL;

	if (write) {
		struct rlimit *rlim = current->signal->rlim;
		unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;

		/*
		 * Limit to 1/4-th the stack size for the argv+env strings.
		 * This ensures that:
		 *  - the remaining binfmt code will not run out of stack space,
		 *  - the program will have a reasonable amount of stack left
		 *    to work from.
		 */
		if (size > rlim[RLIMIT_STACK].rlim_cur / 4) {
			put_page(page);
			return NULL;
		}
	}

	return page;
}

static void put_arg_page(struct page *page)
{
	put_page(page);
}

static void free_arg_page(struct linux_binprm *bprm, int i)
{
}

static void free_arg_pages(struct linux_binprm *bprm)
{
}

static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
		struct page *page)
{
	flush_cache_page(bprm->vma, pos, page_to_pfn(page));
}

static int __bprm_mm_init(struct linux_binprm *bprm)
{
	int err = -ENOMEM;
	struct vm_area_struct *vma = NULL;
	struct mm_struct *mm = bprm->mm;

	bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
	if (!vma)
		goto err;

	down_write(&mm->mmap_sem);
	vma->vm_mm = mm;

	/*
	 * Place the stack at the largest stack address the architecture
	 * supports. Later, we'll move this to an appropriate place. We don't
	 * use STACK_TOP because that can depend on attributes which aren't
	 * configured yet.
	 */
	vma->vm_end = STACK_TOP_MAX;
	vma->vm_start = vma->vm_end - PAGE_SIZE;

	vma->vm_flags = VM_STACK_FLAGS;
	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
	err = insert_vm_struct(mm, vma);
	if (err) {
		up_write(&mm->mmap_sem);
		goto err;
	}

	mm->stack_vm = mm->total_vm = 1;
	up_write(&mm->mmap_sem);

	bprm->p = vma->vm_end - sizeof(void *);

	return 0;

err:
	if (vma) {
		bprm->vma = NULL;
		kmem_cache_free(vm_area_cachep, vma);
	}

	return err;
}

static bool valid_arg_len(struct linux_binprm *bprm, long len)
{
	return len <= MAX_ARG_STRLEN;
}

#else

static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
		int write)
{
	struct page *page;

	page = bprm->page[pos / PAGE_SIZE];
	if (!page && write) {
		page = alloc_page(GFP_HIGHUSER|__GFP_ZERO);
		if (!page)
			return NULL;
		bprm->page[pos / PAGE_SIZE] = page;
	}

	return page;
}

static void put_arg_page(struct page *page)
{
}

static void free_arg_page(struct linux_binprm *bprm, int i)
{
	if (bprm->page[i]) {
		__free_page(bprm->page[i]);
		bprm->page[i] = NULL;
	}
}

static void free_arg_pages(struct linux_binprm *bprm)
{
	int i;

	for (i = 0; i < MAX_ARG_PAGES; i++)
		free_arg_page(bprm, i);
}

static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
		struct page *page)
{
}

static int __bprm_mm_init(struct linux_binprm *bprm)
{
	bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
	return 0;
}

static bool valid_arg_len(struct linux_binprm *bprm, long len)
{
	return len <= bprm->p;
}

#endif /* CONFIG_MMU */

/*
 * Create a new mm_struct and populate it with a temporary stack
 * vm_area_struct.  We don't have enough context at this point to set the stack
 * flags, permissions, and offset, so we use temporary values.  We'll update
 * them later in setup_arg_pages().
 */
int bprm_mm_init(struct linux_binprm *bprm)
{
	int err;
	struct mm_struct *mm = NULL;

	bprm->mm = mm = mm_alloc();
	err = -ENOMEM;
	if (!mm)
		goto err;

	err = init_new_context(current, mm);
	if (err)
		goto err;

	err = __bprm_mm_init(bprm);
	if (err)
		goto err;

	return 0;

err:
	if (mm) {
		bprm->mm = NULL;
		mmdrop(mm);
	}

	return err;
}

/*
 * count() counts the number of strings in array ARGV.
 */
static int count(char __user * __user * argv, int max)
{
	int i = 0;

	if (argv != NULL) {
		for (;;) {
			char __user * p;

			if (get_user(p, argv))
				return -EFAULT;
			if (!p)
				break;
			argv++;
			if (++i > max)
				return -E2BIG;
			cond_resched();
		}
	}
	return i;
}

/*
 * 'copy_strings()' copies argument/environment strings from the old
 * process's memory to the new process's stack.  The call to get_user_pages()
 * ensures the destination page is created and not swapped out.
 */
static int copy_strings(int argc, char __user * __user * argv,
			struct linux_binprm *bprm)
{
	struct page *kmapped_page = NULL;
	char *kaddr = NULL;
	unsigned long kpos = 0;
	int ret;

	while (argc-- > 0) {
		char __user *str;
		int len;
		unsigned long pos;

		if (get_user(str, argv+argc) ||
				!(len = strnlen_user(str, MAX_ARG_STRLEN))) {
			ret = -EFAULT;
			goto out;
		}

		if (!valid_arg_len(bprm, len)) {
			ret = -E2BIG;
			goto out;
		}

		/* We're going to work our way backwards. */
		pos = bprm->p;
		str += len;
		bprm->p -= len;

		while (len > 0) {
			int offset, bytes_to_copy;

			offset = pos % PAGE_SIZE;
			if (offset == 0)
				offset = PAGE_SIZE;

			bytes_to_copy = offset;
			if (bytes_to_copy > len)
				bytes_to_copy = len;

			offset -= bytes_to_copy;
			pos -= bytes_to_copy;
			str -= bytes_to_copy;
			len -= bytes_to_copy;

			if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
				struct page *page;

				page = get_arg_page(bprm, pos, 1);
				if (!page) {
					ret = -E2BIG;
					goto out;
				}

				if (kmapped_page) {
					flush_kernel_dcache_page(kmapped_page);
					kunmap(kmapped_page);
					put_arg_page(kmapped_page);
				}
				kmapped_page = page;
				kaddr = kmap(kmapped_page);
				kpos = pos & PAGE_MASK;
				flush_arg_page(bprm, kpos, kmapped_page);
			}
			if (copy_from_user(kaddr+offset, str, bytes_to_copy)) {
				ret = -EFAULT;
				goto out;
			}
		}
	}
	ret = 0;
out:
	if (kmapped_page) {
		flush_kernel_dcache_page(kmapped_page);
		kunmap(kmapped_page);
		put_arg_page(kmapped_page);
	}
	return ret;
}
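
/*
 * A rough sketch (illustrative only) of the new stack while the strings
 * are being copied in on CONFIG_MMU.  bprm->p starts at
 * STACK_TOP_MAX - sizeof(void *) and moves downwards with every string:
 *
 *	high addresses:	bprm->filename	(copied first, see do_execve())
 *			environment strings
 *			argument strings
 *	bprm->p  --->	the next string is copied just below here
 *	low addresses:	pages materialized on demand by get_arg_page()
 *
 * setup_arg_pages() later shifts this whole VMA down to its final
 * location and adjusts bprm->p accordingly.
 */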

/*
 * Like copy_strings, but get argv and its values from kernel memory.
 */
int copy_strings_kernel(int argc, char **argv, struct linux_binprm *bprm)
{
	int r;
	mm_segment_t oldfs = get_fs();
	set_fs(KERNEL_DS);
	r = copy_strings(argc, (char __user * __user *)argv, bprm);
	set_fs(oldfs);
	return r;
}
EXPORT_SYMBOL(copy_strings_kernel);

#ifdef CONFIG_MMU

/*
 * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX.  Once
 * the binfmt code determines where the new stack should reside, we shift it to
 * its final location.  The process proceeds as follows:
 *
 * 1) Use shift to calculate the new vma endpoints.
 * 2) Extend vma to cover both the old and new ranges.  This ensures the
 *    arguments passed to subsequent functions are consistent.
 * 3) Move vma's page tables to the new range.
 * 4) Free up any cleared pgd range.
 * 5) Shrink the vma to cover only the new range.
 */
static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long old_start = vma->vm_start;
	unsigned long old_end = vma->vm_end;
	unsigned long length = old_end - old_start;
	unsigned long new_start = old_start - shift;
	unsigned long new_end = old_end - shift;
	struct mmu_gather *tlb;

	BUG_ON(new_start > new_end);

	/*
	 * ensure there are no vmas between where we want to go
	 * and where we are
	 */
	if (vma != find_vma(mm, new_start))
		return -EFAULT;

	/*
	 * cover the whole range: [new_start, old_end)
	 */
	vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL);

	/*
	 * move the page tables downwards, on failure we rely on
	 * process cleanup to remove whatever mess we made.
	 */
	if (length != move_page_tables(vma, old_start,
				       vma, new_start, length))
		return -ENOMEM;

	lru_add_drain();
	tlb = tlb_gather_mmu(mm, 0);
	if (new_end > old_start) {
		/*
		 * when the old and new regions overlap clear from new_end.
		 */
		free_pgd_range(&tlb, new_end, old_end, new_end,
			vma->vm_next ? vma->vm_next->vm_start : 0);
	} else {
		/*
		 * otherwise, clean from old_start; this is done to not touch
		 * the address space in [new_end, old_start), because some
		 * architectures have constraints on va-space that make this
		 * illegal (IA64) - for the others it's just a little faster.
		 */
		free_pgd_range(&tlb, old_start, old_end, new_end,
			vma->vm_next ? vma->vm_next->vm_start : 0);
	}
	tlb_finish_mmu(tlb, new_end, old_end);

	/*
	 * shrink the vma to just the new range.
	 */
	vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL);

	return 0;
}

#define EXTRA_STACK_VM_PAGES	20	/* random */

/*
 * Finalizes the stack vm_area_struct.  The flags and permissions are updated,
 * the stack is optionally relocated, and some extra space is added.
 */
int setup_arg_pages(struct linux_binprm *bprm,
		    unsigned long stack_top,
		    int executable_stack)
{
	unsigned long ret;
	unsigned long stack_shift;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = bprm->vma;
	struct vm_area_struct *prev = NULL;
	unsigned long vm_flags;
	unsigned long stack_base;

#ifdef CONFIG_STACK_GROWSUP
	/* Limit stack size to 1GB */
	stack_base = current->signal->rlim[RLIMIT_STACK].rlim_max;
	if (stack_base > (1 << 30))
		stack_base = 1 << 30;

	/* Make sure we didn't let the argument array grow too large. */
	if (vma->vm_end - vma->vm_start > stack_base)
		return -ENOMEM;

	stack_base = PAGE_ALIGN(stack_top - stack_base);

	stack_shift = vma->vm_start - stack_base;
	mm->arg_start = bprm->p - stack_shift;
	bprm->p = vma->vm_end - stack_shift;
#else
	stack_top = arch_align_stack(stack_top);
	stack_top = PAGE_ALIGN(stack_top);
	stack_shift = vma->vm_end - stack_top;

	bprm->p -= stack_shift;
	mm->arg_start = bprm->p;
#endif

	if (bprm->loader)
		bprm->loader -= stack_shift;
	bprm->exec -= stack_shift;

	down_write(&mm->mmap_sem);
	vm_flags = vma->vm_flags;

	/*
	 * Adjust stack execute permissions; explicitly enable for
	 * EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone
	 * (arch default) otherwise.
	 */
	if (unlikely(executable_stack == EXSTACK_ENABLE_X))
		vm_flags |= VM_EXEC;
	else if (executable_stack == EXSTACK_DISABLE_X)
		vm_flags &= ~VM_EXEC;
	vm_flags |= mm->def_flags;

	ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
			vm_flags);
	if (ret)
		goto out_unlock;
	BUG_ON(prev != vma);

	/* Move stack pages down in memory. */
	if (stack_shift) {
		ret = shift_arg_pages(vma, stack_shift);
		if (ret) {
			up_write(&mm->mmap_sem);
			return ret;
		}
	}

#ifdef CONFIG_STACK_GROWSUP
	stack_base = vma->vm_end + EXTRA_STACK_VM_PAGES * PAGE_SIZE;
#else
	stack_base = vma->vm_start - EXTRA_STACK_VM_PAGES * PAGE_SIZE;
#endif
	ret = expand_stack(vma, stack_base);
	if (ret)
		ret = -EFAULT;

out_unlock:
	up_write(&mm->mmap_sem);
	return ret;
}
EXPORT_SYMBOL(setup_arg_pages);

#endif /* CONFIG_MMU */

struct file *open_exec(const char *name)
{
	struct nameidata nd;
	int err;
	struct file *file;

	err = path_lookup_open(AT_FDCWD, name, LOOKUP_FOLLOW, &nd, FMODE_READ|FMODE_EXEC);
	file = ERR_PTR(err);

	if (!err) {
		struct inode *inode = nd.dentry->d_inode;
		file = ERR_PTR(-EACCES);
		if (S_ISREG(inode->i_mode)) {
			int err = vfs_permission(&nd, MAY_EXEC);
			file = ERR_PTR(err);
			if (!err) {
				file = nameidata_to_filp(&nd, O_RDONLY);
				if (!IS_ERR(file)) {
					err = deny_write_access(file);
					if (err) {
						fput(file);
						file = ERR_PTR(err);
					}
				}
out:
				return file;
			}
		}
		release_open_intent(&nd);
		path_release(&nd);
	}
	goto out;
}

EXPORT_SYMBOL(open_exec);

int kernel_read(struct file *file, unsigned long offset,
	char *addr, unsigned long count)
{
	mm_segment_t old_fs;
	loff_t pos = offset;
	int result;

	old_fs = get_fs();
	set_fs(get_ds());
	/* The cast to a user pointer is valid due to the set_fs() */
	result = vfs_read(file, (void __user *)addr, count, &pos);
	set_fs(old_fs);
	return result;
}

EXPORT_SYMBOL(kernel_read);
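
/*
 * Usage sketch (illustrative): binfmt loaders typically pull the bits
 * they need straight out of the executable with kernel_read().  An
 * ELF-style loader might fetch its program headers roughly like this
 * (error handling and surrounding context omitted; "elf_ex" and
 * "elf_phdata" are just example names):
 *
 *	size = elf_ex.e_phnum * sizeof(struct elf_phdr);
 *	elf_phdata = kmalloc(size, GFP_KERNEL);
 *	if (!elf_phdata)
 *		return -ENOMEM;
 *	retval = kernel_read(bprm->file, elf_ex.e_phoff,
 *			     (char *)elf_phdata, size);
 *	if (retval != size)
 *		goto out_free;
 */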

static int exec_mmap(struct mm_struct *mm)
{
	struct task_struct *tsk;
	struct mm_struct * old_mm, *active_mm;

	/* Notify parent that we're no longer interested in the old VM */
	tsk = current;
	old_mm = current->mm;
	mm_release(tsk, old_mm);

	if (old_mm) {
		/*
		 * Make sure that if there is a core dump in progress
		 * for the old mm, we get out and die instead of going
		 * through with the exec.  We must hold mmap_sem around
		 * checking core_waiters and changing tsk->mm.  The
		 * core-inducing thread will increment core_waiters for
		 * each thread whose ->mm == old_mm.
		 */
		down_read(&old_mm->mmap_sem);
		if (unlikely(old_mm->core_waiters)) {
			up_read(&old_mm->mmap_sem);
			return -EINTR;
		}
	}
	task_lock(tsk);
	active_mm = tsk->active_mm;
	tsk->mm = mm;
	tsk->active_mm = mm;
	activate_mm(active_mm, mm);
	task_unlock(tsk);
	arch_pick_mmap_layout(mm);
	if (old_mm) {
		up_read(&old_mm->mmap_sem);
		BUG_ON(active_mm != old_mm);
		mmput(old_mm);
		return 0;
	}
	mmdrop(active_mm);
	return 0;
}

/*
 * This function makes sure the current process has its own signal table,
 * so that flush_signal_handlers can later reset the handlers without
 * disturbing other processes.  (Other processes might share the signal
 * table via the CLONE_SIGHAND option to clone().)
 */
static int de_thread(struct task_struct *tsk)
{
	struct signal_struct *sig = tsk->signal;
	struct sighand_struct *oldsighand = tsk->sighand;
	spinlock_t *lock = &oldsighand->siglock;
	struct task_struct *leader = NULL;
	int count;

	if (thread_group_empty(tsk))
		goto no_thread_group;

	/*
	 * Kill all other threads in the thread group.
	 * We must hold tasklist_lock to call zap_other_threads.
	 */
	read_lock(&tasklist_lock);
	spin_lock_irq(lock);
	if (sig->flags & SIGNAL_GROUP_EXIT) {
		/*
		 * Another group action in progress, just
		 * return so that the signal is processed.
		 */
		spin_unlock_irq(lock);
		read_unlock(&tasklist_lock);
		return -EAGAIN;
	}

	/*
	 * child_reaper ignores SIGKILL, change it now.
	 * Reparenting needs write_lock on tasklist_lock,
	 * so it is safe to do it under read_lock.
	 */
	if (unlikely(tsk->group_leader == task_child_reaper(tsk)))
		task_active_pid_ns(tsk)->child_reaper = tsk;

	zap_other_threads(tsk);
	read_unlock(&tasklist_lock);

	/*
	 * Account for the thread group leader hanging around:
	 */
	count = 1;
	if (!thread_group_leader(tsk)) {
		count = 2;
		/*
		 * The SIGALRM timer survives the exec, but needs to point
		 * at us as the new group leader now.  We have a race with
		 * a timer firing now getting the old leader, so we need to
		 * synchronize with any firing (by calling hrtimer_cancel)
		 * before we can safely let the old group leader die.
		 */
		sig->tsk = tsk;
		spin_unlock_irq(lock);
		if (hrtimer_cancel(&sig->real_timer))
			hrtimer_restart(&sig->real_timer);
		spin_lock_irq(lock);
	}

	sig->notify_count = count;
	sig->group_exit_task = tsk;
	while (atomic_read(&sig->count) > count) {
		__set_current_state(TASK_UNINTERRUPTIBLE);
		spin_unlock_irq(lock);
		schedule();
		spin_lock_irq(lock);
	}
	spin_unlock_irq(lock);

	/*
	 * At this point all other threads have exited, all we have to
	 * do is to wait for the thread group leader to become inactive,
	 * and to assume its PID:
	 */
	if (!thread_group_leader(tsk)) {
		leader = tsk->group_leader;

		sig->notify_count = -1;
		for (;;) {
			write_lock_irq(&tasklist_lock);
			if (likely(leader->exit_state))
				break;
			__set_current_state(TASK_UNINTERRUPTIBLE);
			write_unlock_irq(&tasklist_lock);
			schedule();
		}

		/*
		 * The only record we have of the real-time age of a
		 * process, regardless of execs it's done, is start_time.
		 * All the past CPU time is accumulated in signal_struct
		 * from sister threads now dead.  But in this non-leader
		 * exec, nothing survives from the original leader thread,
		 * whose birth marks the true age of this process now.
		 * When we take on its identity by switching to its PID, we
		 * also take its birthdate (always earlier than our own).
		 */
		tsk->start_time = leader->start_time;

		BUG_ON(!same_thread_group(leader, tsk));
		BUG_ON(has_group_leader_pid(tsk));
		/*
		 * An exec() starts a new thread group with the
		 * TGID of the previous thread group. Rehash the
		 * two threads with a switched PID, and release
		 * the former thread group leader:
		 */

		/* Become a process group leader with the old leader's pid.
		 * The old leader becomes a thread of this thread group.
		 * Note: The old leader also uses this pid until release_task
		 *	 is called.  Odd but simple and correct.
		 */
		detach_pid(tsk, PIDTYPE_PID);
		tsk->pid = leader->pid;
		attach_pid(tsk, PIDTYPE_PID, task_pid(leader));
		transfer_pid(leader, tsk, PIDTYPE_PGID);
		transfer_pid(leader, tsk, PIDTYPE_SID);
		list_replace_rcu(&leader->tasks, &tsk->tasks);

		tsk->group_leader = tsk;
		leader->group_leader = tsk;

		tsk->exit_signal = SIGCHLD;

		BUG_ON(leader->exit_state != EXIT_ZOMBIE);
		leader->exit_state = EXIT_DEAD;

		write_unlock_irq(&tasklist_lock);
	}

	sig->group_exit_task = NULL;
	sig->notify_count = 0;
	/*
	 * There may be one thread left which is just exiting,
	 * but it's safe to stop telling the group to kill themselves.
	 */
	sig->flags = 0;

no_thread_group:
	exit_itimers(sig);
	if (leader)
		release_task(leader);

	if (atomic_read(&oldsighand->count) != 1) {
		struct sighand_struct *newsighand;
		/*
		 * This ->sighand is shared with the CLONE_SIGHAND
		 * but not CLONE_THREAD task, switch to the new one.
		 */
		newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
		if (!newsighand)
			return -ENOMEM;

		atomic_set(&newsighand->count, 1);
		memcpy(newsighand->action, oldsighand->action,
		       sizeof(newsighand->action));

		write_lock_irq(&tasklist_lock);
		spin_lock(&oldsighand->siglock);
		rcu_assign_pointer(tsk->sighand, newsighand);
		spin_unlock(&oldsighand->siglock);
		write_unlock_irq(&tasklist_lock);

		__cleanup_sighand(oldsighand);
	}

	BUG_ON(!thread_group_leader(tsk));
	return 0;
}

/*
 * These functions flush out all traces of the currently running executable
 * so that a new one can be started.
 */
static void flush_old_files(struct files_struct * files)
{
	long j = -1;
	struct fdtable *fdt;

	spin_lock(&files->file_lock);
	for (;;) {
		unsigned long set, i;

		j++;
		i = j * __NFDBITS;
		fdt = files_fdtable(files);
		if (i >= fdt->max_fds)
			break;
		set = fdt->close_on_exec->fds_bits[j];
		if (!set)
			continue;
		fdt->close_on_exec->fds_bits[j] = 0;
		spin_unlock(&files->file_lock);
		for ( ; set ; i++, set >>= 1) {
			if (set & 1) {
				sys_close(i);
			}
		}
		spin_lock(&files->file_lock);
	}
	spin_unlock(&files->file_lock);
}

void get_task_comm(char *buf, struct task_struct *tsk)
{
	/* buf must be at least sizeof(tsk->comm) in size */
	task_lock(tsk);
	strncpy(buf, tsk->comm, sizeof(tsk->comm));
	task_unlock(tsk);
}

void set_task_comm(struct task_struct *tsk, char *buf)
{
	task_lock(tsk);
	strlcpy(tsk->comm, buf, sizeof(tsk->comm));
	task_unlock(tsk);
}
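
/*
 * Usage sketch (illustrative): callers that want another task's comm
 * use a TASK_COMM_LEN buffer, e.g.
 *
 *	char comm[TASK_COMM_LEN];
 *	get_task_comm(comm, some_task);
 *
 * while flush_old_exec() below derives the new comm from the basename
 * of bprm->filename and installs it with set_task_comm().
 */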

int flush_old_exec(struct linux_binprm * bprm)
{
	char * name;
	int i, ch, retval;
	struct files_struct *files;
	char tcomm[sizeof(current->comm)];

	/*
	 * Make sure we have a private signal table and that
	 * we are unassociated from the previous thread group.
	 */
	retval = de_thread(current);
	if (retval)
		goto out;

	/*
	 * Make sure we have private file handles. Ask the
	 * fork helper to do the work for us and the exit
	 * helper to do the cleanup of the old one.
	 */
	files = current->files;		/* refcounted so safe to hold */
	retval = unshare_files();
	if (retval)
		goto out;
	/*
	 * Release all of the old mmap stuff
	 */
	retval = exec_mmap(bprm->mm);
	if (retval)
		goto mmap_failed;

	bprm->mm = NULL;		/* We're using it now */

	/* This is the point of no return */
	put_files_struct(files);

	current->sas_ss_sp = current->sas_ss_size = 0;

	if (current->euid == current->uid && current->egid == current->gid)
		set_dumpable(current->mm, 1);
	else
		set_dumpable(current->mm, suid_dumpable);

	name = bprm->filename;

	/* Copies the binary name from after the last slash */
	for (i = 0; (ch = *(name++)) != '\0';) {
		if (ch == '/')
			i = 0; /* overwrite what we wrote */
		else
			if (i < (sizeof(tcomm) - 1))
				tcomm[i++] = ch;
	}
	tcomm[i] = '\0';
	set_task_comm(current, tcomm);

	current->flags &= ~PF_RANDOMIZE;
	flush_thread();

	/* Set the new mm task size. We have to do that late because it may
	 * depend on TIF_32BIT which is only updated in flush_thread() on
	 * some architectures like powerpc
	 */
	current->mm->task_size = TASK_SIZE;

	if (bprm->e_uid != current->euid || bprm->e_gid != current->egid) {
		suid_keys(current);
		set_dumpable(current->mm, suid_dumpable);
		current->pdeath_signal = 0;
	} else if (file_permission(bprm->file, MAY_READ) ||
			(bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)) {
		suid_keys(current);
		set_dumpable(current->mm, suid_dumpable);
	}

	/* An exec changes our domain. We are no longer part of the thread
	   group */

	current->self_exec_id++;

	flush_signal_handlers(current, 0);
	flush_old_files(current->files);

	return 0;

mmap_failed:
	reset_files_struct(current, files);
out:
	return retval;
}

EXPORT_SYMBOL(flush_old_exec);

/*
 * Fill the binprm structure from the inode.
 * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes.
 */
int prepare_binprm(struct linux_binprm *bprm)
{
	int mode;
	struct inode * inode = bprm->file->f_path.dentry->d_inode;
	int retval;

	mode = inode->i_mode;
	if (bprm->file->f_op == NULL)
		return -EACCES;

	bprm->e_uid = current->euid;
	bprm->e_gid = current->egid;

	if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)) {
		/* Set-uid? */
		if (mode & S_ISUID) {
			current->personality &= ~PER_CLEAR_ON_SETID;
			bprm->e_uid = inode->i_uid;
		}

		/* Set-gid? */
		/*
		 * If setgid is set but no group execute bit then this
		 * is a candidate for mandatory locking, not a setgid
		 * executable.
		 */
		if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
			current->personality &= ~PER_CLEAR_ON_SETID;
			bprm->e_gid = inode->i_gid;
		}
	}

	/* fill in binprm security blob */
	retval = security_bprm_set(bprm);
	if (retval)
		return retval;

	memset(bprm->buf, 0, BINPRM_BUF_SIZE);
	return kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE);
}

EXPORT_SYMBOL(prepare_binprm);

static int unsafe_exec(struct task_struct *p)
{
	int unsafe = 0;
	if (p->ptrace & PT_PTRACED) {
		if (p->ptrace & PT_PTRACE_CAP)
			unsafe |= LSM_UNSAFE_PTRACE_CAP;
		else
			unsafe |= LSM_UNSAFE_PTRACE;
	}
	if (atomic_read(&p->fs->count) > 1 ||
	    atomic_read(&p->files->count) > 1 ||
	    atomic_read(&p->sighand->count) > 1)
		unsafe |= LSM_UNSAFE_SHARE;

	return unsafe;
}

void compute_creds(struct linux_binprm *bprm)
{
	int unsafe;

	if (bprm->e_uid != current->uid) {
		suid_keys(current);
		current->pdeath_signal = 0;
	}
	exec_keys(current);

	task_lock(current);
	unsafe = unsafe_exec(current);
	security_bprm_apply_creds(bprm, unsafe);
	task_unlock(current);
	security_bprm_post_apply_creds(bprm);
}
EXPORT_SYMBOL(compute_creds);

/*
 * Arguments are '\0' separated strings found at the location bprm->p
 * points to; chop off the first by relocating bprm->p to right after
 * the first '\0' encountered.
 */
int remove_arg_zero(struct linux_binprm *bprm)
{
	int ret = 0;
	unsigned long offset;
	char *kaddr;
	struct page *page;

	if (!bprm->argc)
		return 0;

	do {
		offset = bprm->p & ~PAGE_MASK;
		page = get_arg_page(bprm, bprm->p, 0);
		if (!page) {
			ret = -EFAULT;
			goto out;
		}
		kaddr = kmap_atomic(page, KM_USER0);

		for (; offset < PAGE_SIZE && kaddr[offset];
				offset++, bprm->p++)
			;

		kunmap_atomic(kaddr, KM_USER0);
		put_arg_page(page);

		if (offset == PAGE_SIZE)
			free_arg_page(bprm, (bprm->p >> PAGE_SHIFT) - 1);
	} while (offset == PAGE_SIZE);

	bprm->p++;
	bprm->argc--;
	ret = 0;

out:
	return ret;
}
EXPORT_SYMBOL(remove_arg_zero);
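
/*
 * Usage sketch (illustrative): the "#!" handler (binfmt_script) is the
 * main user of remove_arg_zero().  In outline, with details and error
 * handling omitted, it does roughly:
 *
 *	remove_arg_zero(bprm);
 *	retval = copy_strings_kernel(1, &bprm->interp, bprm);
 *	bprm->argc++;
 *	bprm->file = open_exec(interp);
 *	retval = prepare_binprm(bprm);
 *	return search_binary_handler(bprm, regs);
 *
 * i.e. argv[0] is replaced by the interpreter path and the whole exec
 * is retried against the interpreter binary.
 */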

/*
 * Cycle through the list of binary format handlers until one recognizes
 * the image.
 */
int search_binary_handler(struct linux_binprm *bprm, struct pt_regs *regs)
{
	int try, retval;
	struct linux_binfmt *fmt;
#ifdef __alpha__
	/* handle /sbin/loader.. */
	{
	    struct exec * eh = (struct exec *) bprm->buf;

	    if (!bprm->loader && eh->fh.f_magic == 0x183 &&
		(eh->fh.f_flags & 0x3000) == 0x3000)
	    {
		struct file * file;
		unsigned long loader;

		allow_write_access(bprm->file);
		fput(bprm->file);
		bprm->file = NULL;

		loader = bprm->vma->vm_end - sizeof(void *);

		file = open_exec("/sbin/loader");
		retval = PTR_ERR(file);
		if (IS_ERR(file))
			return retval;

		/* Remember if the application is TASO.  */
		bprm->sh_bang = eh->ah.entry < 0x100000000UL;

		bprm->file = file;
		bprm->loader = loader;
		retval = prepare_binprm(bprm);
		if (retval < 0)
			return retval;
		/* should call search_binary_handler recursively here,
		   but it does not matter */
	    }
	}
#endif
	retval = security_bprm_check(bprm);
	if (retval)
		return retval;

	/* kernel module loader fixup */
	/* so we don't try to run modprobe in kernel space. */
	set_fs(USER_DS);

	retval = audit_bprm(bprm);
	if (retval)
		return retval;

	retval = -ENOENT;
	for (try = 0; try < 2; try++) {
		read_lock(&binfmt_lock);
		list_for_each_entry(fmt, &formats, lh) {
			int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary;
			if (!fn)
				continue;
			if (!try_module_get(fmt->module))
				continue;
			read_unlock(&binfmt_lock);
			retval = fn(bprm, regs);
			if (retval >= 0) {
				put_binfmt(fmt);
				allow_write_access(bprm->file);
				if (bprm->file)
					fput(bprm->file);
				bprm->file = NULL;
				current->did_exec = 1;
				proc_exec_connector(current);
				return retval;
			}
			read_lock(&binfmt_lock);
			put_binfmt(fmt);
			if (retval != -ENOEXEC || bprm->mm == NULL)
				break;
			if (!bprm->file) {
				read_unlock(&binfmt_lock);
				return retval;
			}
		}
		read_unlock(&binfmt_lock);
		if (retval != -ENOEXEC || bprm->mm == NULL) {
			break;
#ifdef CONFIG_KMOD
		} else {
#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
			if (printable(bprm->buf[0]) &&
			    printable(bprm->buf[1]) &&
			    printable(bprm->buf[2]) &&
			    printable(bprm->buf[3]))
				break; /* -ENOEXEC */
			request_module("binfmt-%04x", *(unsigned short *)(&bprm->buf[2]));
#endif
		}
	}
	return retval;
}

EXPORT_SYMBOL(search_binary_handler);

/*
 * sys_execve() executes a new program.
 */
int do_execve(char * filename,
	char __user *__user *argv,
	char __user *__user *envp,
	struct pt_regs * regs)
{
	struct linux_binprm *bprm;
	struct file *file;
	unsigned long env_p;
	int retval;

	retval = -ENOMEM;
	bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
	if (!bprm)
		goto out_ret;

	file = open_exec(filename);
	retval = PTR_ERR(file);
	if (IS_ERR(file))
		goto out_kfree;

	sched_exec();

	bprm->file = file;
	bprm->filename = filename;
	bprm->interp = filename;

	retval = bprm_mm_init(bprm);
	if (retval)
		goto out_file;

	bprm->argc = count(argv, MAX_ARG_STRINGS);
	if ((retval = bprm->argc) < 0)
		goto out_mm;

	bprm->envc = count(envp, MAX_ARG_STRINGS);
	if ((retval = bprm->envc) < 0)
		goto out_mm;

	retval = security_bprm_alloc(bprm);
	if (retval)
		goto out;

	retval = prepare_binprm(bprm);
	if (retval < 0)
		goto out;

	retval = copy_strings_kernel(1, &bprm->filename, bprm);
	if (retval < 0)
		goto out;

	bprm->exec = bprm->p;
	retval = copy_strings(bprm->envc, envp, bprm);
	if (retval < 0)
		goto out;

	env_p = bprm->p;
	retval = copy_strings(bprm->argc, argv, bprm);
	if (retval < 0)
		goto out;
	bprm->argv_len = env_p - bprm->p;

	retval = search_binary_handler(bprm, regs);
	if (retval >= 0) {
		/* execve success */
		free_arg_pages(bprm);
		security_bprm_free(bprm);
		acct_update_integrals(current);
		kfree(bprm);
		return retval;
	}

out:
	free_arg_pages(bprm);
	if (bprm->security)
		security_bprm_free(bprm);

out_mm:
	if (bprm->mm)
		mmput(bprm->mm);

out_file:
	if (bprm->file) {
		allow_write_access(bprm->file);
		fput(bprm->file);
	}
out_kfree:
	kfree(bprm);

out_ret:
	return retval;
}
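
/*
 * Usage sketch (illustrative): do_execve() is called from each
 * architecture's sys_execve() glue, which copies the filename in from
 * user space.  On i386-era kernels that glue looks roughly like this
 * (register names are arch-specific; success-path details omitted):
 *
 *	asmlinkage int sys_execve(struct pt_regs regs)
 *	{
 *		int error;
 *		char *filename;
 *
 *		filename = getname((char __user *) regs.ebx);
 *		error = PTR_ERR(filename);
 *		if (IS_ERR(filename))
 *			goto out;
 *		error = do_execve(filename,
 *				  (char __user * __user *) regs.ecx,
 *				  (char __user * __user *) regs.edx,
 *				  &regs);
 *		putname(filename);
 *	out:
 *		return error;
 *	}
 */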

int set_binfmt(struct linux_binfmt *new)
{
	struct linux_binfmt *old = current->binfmt;

	if (new) {
		if (!try_module_get(new->module))
			return -1;
	}
	current->binfmt = new;
	if (old)
		module_put(old->module);
	return 0;
}

EXPORT_SYMBOL(set_binfmt);

/* format_corename will inspect the pattern parameter, and output a
 * name into corename, which must have space for at least
 * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
 */
static int format_corename(char *corename, const char *pattern, long signr)
{
	const char *pat_ptr = pattern;
	char *out_ptr = corename;
	char *const out_end = corename + CORENAME_MAX_SIZE;
	int rc;
	int pid_in_pattern = 0;
	int ispipe = 0;

	if (*pattern == '|')
		ispipe = 1;

	/* Repeat as long as we have more pattern to process and more output
	   space */
	while (*pat_ptr) {
		if (*pat_ptr != '%') {
			if (out_ptr == out_end)
				goto out;
			*out_ptr++ = *pat_ptr++;
		} else {
			switch (*++pat_ptr) {
			case 0:
				goto out;
			/* Double percent, output one percent */
			case '%':
				if (out_ptr == out_end)
					goto out;
				*out_ptr++ = '%';
				break;
			/* pid */
			case 'p':
				pid_in_pattern = 1;
				rc = snprintf(out_ptr, out_end - out_ptr,
					      "%d", task_tgid_vnr(current));
				if (rc > out_end - out_ptr)
					goto out;
				out_ptr += rc;
				break;
			/* uid */
			case 'u':
				rc = snprintf(out_ptr, out_end - out_ptr,
					      "%d", current->uid);
				if (rc > out_end - out_ptr)
					goto out;
				out_ptr += rc;
				break;
			/* gid */
			case 'g':
				rc = snprintf(out_ptr, out_end - out_ptr,
					      "%d", current->gid);
				if (rc > out_end - out_ptr)
					goto out;
				out_ptr += rc;
				break;
			/* signal that caused the coredump */
			case 's':
				rc = snprintf(out_ptr, out_end - out_ptr,
					      "%ld", signr);
				if (rc > out_end - out_ptr)
					goto out;
				out_ptr += rc;
				break;
			/* UNIX time of coredump */
			case 't': {
				struct timeval tv;
				do_gettimeofday(&tv);
				rc = snprintf(out_ptr, out_end - out_ptr,
					      "%lu", tv.tv_sec);
				if (rc > out_end - out_ptr)
					goto out;
				out_ptr += rc;
				break;
			}
			/* hostname */
			case 'h':
				down_read(&uts_sem);
				rc = snprintf(out_ptr, out_end - out_ptr,
					      "%s", utsname()->nodename);
				up_read(&uts_sem);
				if (rc > out_end - out_ptr)
					goto out;
				out_ptr += rc;
				break;
			/* executable */
			case 'e':
				rc = snprintf(out_ptr, out_end - out_ptr,
					      "%s", current->comm);
				if (rc > out_end - out_ptr)
					goto out;
				out_ptr += rc;
				break;
			/* core limit size */
			case 'c':
				rc = snprintf(out_ptr, out_end - out_ptr,
					      "%lu", current->signal->rlim[RLIMIT_CORE].rlim_cur);
				if (rc > out_end - out_ptr)
					goto out;
				out_ptr += rc;
				break;
			default:
				break;
			}
			++pat_ptr;
		}
	}
	/* Backward compatibility with core_uses_pid:
	 *
	 * If core_pattern does not include a %p (as is the default)
	 * and core_uses_pid is set, then .%pid will be appended to
	 * the filename. Do not do this for piped commands. */
	if (!ispipe && !pid_in_pattern
	    && (core_uses_pid || atomic_read(&current->mm->mm_users) != 1)) {
		rc = snprintf(out_ptr, out_end - out_ptr,
			      ".%d", task_tgid_vnr(current));
		if (rc > out_end - out_ptr)
			goto out;
		out_ptr += rc;
	}
out:
	*out_ptr = 0;
	return ispipe;
}
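
/*
 * Examples (illustrative) of core_pattern values, as set via
 * /proc/sys/kernel/core_pattern, and what format_corename() makes of
 * them for a crash of PID 4242 running "myprog":
 *
 *	"core"		-> "core" (plus ".4242" if core_uses_pid is set)
 *	"core.%p"	-> "core.4242"
 *	"/tmp/%e.%p.%s"	-> "/tmp/myprog.4242.11" for a SIGSEGV
 *	"|/usr/local/bin/corehelper %p"
 *			-> ispipe is returned; the dump is fed to the helper
 */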
1525
1526static void zap_process(struct task_struct *start)
1527{
1528        struct task_struct *t;
1529
1530        start->signal->flags = SIGNAL_GROUP_EXIT;
1531        start->signal->group_stop_count = 0;
1532
1533        t = start;
1534        do {
1535                if (t != current && t->mm) {
1536                        t->mm->core_waiters++;
1537                        sigaddset(&t->pending.signal, SIGKILL);
1538                        signal_wake_up(t, 1);
1539                }
1540        } while ((t = next_thread(t)) != start);
1541}
1542
1543static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
1544                                int exit_code)
1545{
1546        struct task_struct *g, *p;
1547        unsigned long flags;
1548        int err = -EAGAIN;
1549
1550        spin_lock_irq(&tsk->sighand->siglock);
1551        if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) {
1552                tsk->signal->group_exit_code = exit_code;
1553                zap_process(tsk);
1554                err = 0;
1555        }
1556        spin_unlock_irq(&tsk->sighand->siglock);
1557        if (err)
1558                return err;
1559
1560        if (atomic_read(&mm->mm_users) == mm->core_waiters + 1)
1561                goto done;
1562
1563        rcu_read_lock();
1564        for_each_process(g) {
1565                if (g == tsk->group_leader)
1566                        continue;
1567
1568                p = g;
1569                do {
1570                        if (p->mm) {
1571                                if (p->mm == mm) {
1572                                        /*
1573                                         * p->sighand can't disappear, but
1574                                         * may be changed by de_thread()
1575                                         */
1576                                        lock_task_sighand(p, &flags);
1577                                        zap_process(p);
1578                                        unlock_task_sighand(p, &flags);
1579                                }
1580                                break;
1581                        }
1582                } while ((p = next_thread(p)) != g);
1583        }
1584        rcu_read_unlock();
1585done:
1586        return mm->core_waiters;
1587}
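
    /*
     * Editor's sketch (simplified shape; the real helper lives in
     * kernel/signal.c): why zap_threads() goes through lock_task_sighand()
     * instead of taking p->sighand->siglock directly.  de_thread() can swap
     * the sighand struct under us (p->sighand itself can't go NULL here,
     * since the task still has an mm), so the helper re-checks after
     * locking:
     */
    #if 0
            for (;;) {
                    sighand = rcu_dereference(p->sighand);
                    spin_lock_irqsave(&sighand->siglock, flags);
                    if (likely(sighand == p->sighand))
                            break;          /* still the lock protecting p */
                    spin_unlock_irqrestore(&sighand->siglock, flags);
            }
    #endif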
1588
1589static int coredump_wait(int exit_code)
1590{
1591        struct task_struct *tsk = current;
1592        struct mm_struct *mm = tsk->mm;
1593        struct completion startup_done;
1594        struct completion *vfork_done;
1595        int core_waiters;
1596
1597        init_completion(&mm->core_done);
1598        init_completion(&startup_done);
1599        mm->core_startup_done = &startup_done;
1600
1601        core_waiters = zap_threads(tsk, mm, exit_code);
1602        up_write(&mm->mmap_sem);
1603
1604        if (unlikely(core_waiters < 0))
1605                goto fail;
1606
1607        /*
1608         * Make sure nobody is waiting for us to release the VM,
1609         * otherwise we can deadlock when we wait on each other
1610         */
1611        vfork_done = tsk->vfork_done;
1612        if (vfork_done) {
1613                tsk->vfork_done = NULL;
1614                complete(vfork_done);
1615        }
1616
1617        if (core_waiters)
1618                wait_for_completion(&startup_done);
1619fail:
1620        BUG_ON(mm->core_waiters);
1621        return core_waiters;
1622}
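
    /*
     * Editor's note (protocol sketch, locking elided): coredump_wait()
     * pairs with exit_mm() in kernel/exit.c.  Every thread zapped above
     * bumped mm->core_waiters; as each reaches exit_mm() it decrements the
     * count, the thread that brings it to zero completes
     * mm->core_startup_done, and all of them then sleep until do_coredump()
     * fires complete_all(&mm->core_done).  Roughly:
     */
    #if 0
            if (mm->core_waiters) {
                    if (!--mm->core_waiters)        /* last one in */
                            complete(mm->core_startup_done);
                    wait_for_completion(&mm->core_done);
            }
    #endif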
1623
1624/*
1625 * set_dumpable converts traditional three-value dumpable to two flags and
1626 * stores them into mm->flags.  It modifies the lower two bits of mm->flags,
1627 * but these bits are not changed atomically.  So get_dumpable can observe
1628 * an intermediate state.  To avoid unexpected behavior, get_dumpable
1629 * returns either the old dumpable value or the new one by paying attention
1630 * to the order in which the bits are modified.
1631 *
1632 * dumpable |   mm->flags (binary)
1633 * old  new | initial interim  final
1634 * ---------+-----------------------
1635 *  0    1  |   00      01      01
1636 *  0    2  |   00      10(*)   11
1637 *  1    0  |   01      00      00
1638 *  1    2  |   01      11      11
1639 *  2    0  |   11      10(*)   00
1640 *  2    1  |   11      11      01
1641 *
1642 * (*) get_dumpable regards interim value of 10 as 11.
1643 */
1644void set_dumpable(struct mm_struct *mm, int value)
1645{
1646        switch (value) {
1647        case 0:
1648                clear_bit(MMF_DUMPABLE, &mm->flags);
1649                smp_wmb();
1650                clear_bit(MMF_DUMP_SECURELY, &mm->flags);
1651                break;
1652        case 1:
1653                set_bit(MMF_DUMPABLE, &mm->flags);
1654                smp_wmb();
1655                clear_bit(MMF_DUMP_SECURELY, &mm->flags);
1656                break;
1657        case 2:
1658                set_bit(MMF_DUMP_SECURELY, &mm->flags);
1659                smp_wmb();
1660                set_bit(MMF_DUMPABLE, &mm->flags);
1661                break;
1662        }
1663}
1664
1665int get_dumpable(struct mm_struct *mm)
1666{
1667        int ret;
1668
1669        ret = mm->flags & 0x3;
1670        return (ret >= 2) ? 2 : ret;
1671}
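
    /*
     * Editor's sketch (illustrative): the smp_wmb() ordering above is what
     * keeps a racing get_dumpable() inside the rows of the table.  Take the
     * 2 -> 0 transition, flags going 11 -> 10 -> 00 in binary:
     */
    #if 0
            /* Writer, set_dumpable(mm, 0): */
            clear_bit(MMF_DUMPABLE, &mm->flags);            /* 11 -> 10 */
            smp_wmb();
            clear_bit(MMF_DUMP_SECURELY, &mm->flags);       /* 10 -> 00 */

            /*
             * A concurrent get_dumpable(mm) observes 11, 10 or 00 and
             * reports 2, 2 or 0: the "ret >= 2" clamp maps the interim 10
             * to 2, so a formerly-secure mm is never misread as plain
             * dumpable (1).
             */
    #endif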
1672
1673int do_coredump(long signr, int exit_code, struct pt_regs * regs)
1674{
1675        char corename[CORENAME_MAX_SIZE + 1];
1676        struct mm_struct *mm = current->mm;
1677        struct linux_binfmt * binfmt;
1678        struct inode * inode;
1679        struct file * file;
1680        int retval = 0;
1681        int fsuid = current->fsuid;
1682        int flag = 0;
1683        int ispipe = 0;
1684        unsigned long core_limit = current->signal->rlim[RLIMIT_CORE].rlim_cur;
1685        char **helper_argv = NULL;
1686        int helper_argc = 0;
1687        char *delimit;
1688
1689        audit_core_dumps(signr);
1690
1691        binfmt = current->binfmt;
1692        if (!binfmt || !binfmt->core_dump)
1693                goto fail;
1694        down_write(&mm->mmap_sem);
1695        /*
1696         * If another thread got here first, or we are not dumpable, bail out.
1697         */
1698        if (mm->core_waiters || !get_dumpable(mm)) {
1699                up_write(&mm->mmap_sem);
1700                goto fail;
1701        }
1702
1703        /*
1704         *      We cannot trust fsuid as being the "true" uid of the
1705         *      process nor do we know its entire history. We only know it
1706         *      was tainted so we dump it as root in mode 2.
1707         */
1708        if (get_dumpable(mm) == 2) {    /* Setuid core dump mode */
1709                flag = O_EXCL;          /* Stop rewrite attacks */
1710                current->fsuid = 0;     /* Dump root private */
1711        }
1712
1713        retval = coredump_wait(exit_code);
1714        if (retval < 0)
1715                goto fail;
1716
1717        /*
1718         * Clear any false indication of pending signals that might
1719         * be seen by the filesystem code called to write the core file.
1720         */
1721        clear_thread_flag(TIF_SIGPENDING);
1722
1723        /*
1724         * We take lock_kernel() because format_corename() is controlled
1725         * by a sysctl, and the sysctl code itself uses lock_kernel().
1726         */
1727        lock_kernel();
1728        ispipe = format_corename(corename, core_pattern, signr);
1729        unlock_kernel();
1730        /*
1731         * Don't bother to check the RLIMIT_CORE value if core_pattern points
1732         * to a pipe.  Since we're not writing directly to the filesystem,
1733         * RLIMIT_CORE doesn't really apply: no actual core file will be
1734         * created unless the pipe reader chooses to write one out, at which
1735         * point file size limits and permissions will be imposed, as with
1736         * any other process.
1737         */
1738        if ((!ispipe) && (core_limit < binfmt->min_coredump))
1739                goto fail_unlock;
1740
1741        if (ispipe) {
1742                helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc);
                    if (!helper_argv) {
                            /* Don't dereference a failed allocation below. */
                            printk(KERN_WARNING "%s failed to allocate memory\n",
                                   __func__);
                            goto fail_unlock;
                    }
1743                /* Terminate the string before the first option */
1744                delimit = strchr(corename, ' ');
1745                if (delimit)
1746                        *delimit = '\0';
1747                delimit = strrchr(helper_argv[0], '/');
1748                if (delimit)
1749                        delimit++;
1750                else
1751                        delimit = helper_argv[0];
1752                if (!strcmp(delimit, current->comm)) {
1753                        printk(KERN_NOTICE "Recursive core dump detected, "
1754                                        "aborting\n");
1755                        goto fail_unlock;
1756                }
1757
1758                core_limit = RLIM_INFINITY;
1759
1760                /* SIGPIPE can happen, but the dying task never processes it */
1761                if (call_usermodehelper_pipe(corename+1, helper_argv, NULL,
1762                                &file)) {
1763                        printk(KERN_INFO "Core dump to %s pipe failed\n",
1764                               corename);
1765                        goto fail_unlock;
1766                }
1767        } else
1768                file = filp_open(corename,
1769                                 O_CREAT | O_RDWR | O_NOFOLLOW | O_LARGEFILE | flag,
1770                                 0600);
1771        if (IS_ERR(file))
1772                goto fail_unlock;
1773        inode = file->f_path.dentry->d_inode;
1774        if (inode->i_nlink > 1)
1775                goto close_fail;        /* multiple links - don't dump */
1776        if (!ispipe && d_unhashed(file->f_path.dentry))
1777                goto close_fail;
1778
1779        /* AK: actually I see no reason not to allow this for named pipes
1780           etc., but keep the previous behaviour for now. */
1781        if (!ispipe && !S_ISREG(inode->i_mode))
1782                goto close_fail;
1783        /*
1784         * Don't allow local users to get cute and trick others into
1785         * dumping core into their pre-created files:
1786         */
1787        if (inode->i_uid != current->fsuid)
1788                goto close_fail;
1789        if (!file->f_op)
1790                goto close_fail;
1791        if (!file->f_op->write)
1792                goto close_fail;
1793        if (!ispipe && do_truncate(file->f_path.dentry, 0, 0, file) != 0)
1794                goto close_fail;
1795
1796        retval = binfmt->core_dump(signr, regs, file, core_limit);
1797
1798        if (retval)
1799                current->signal->group_exit_code |= 0x80;
1800close_fail:
1801        filp_close(file, NULL);
1802fail_unlock:
1803        if (helper_argv)
1804                argv_free(helper_argv);
1805
1806        current->fsuid = fsuid;
1807        complete_all(&mm->core_done);
1808fail:
1809        return retval;
1810}
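
    /*
     * Editor's usage note (the helper path below is a made-up example):
     * the pipe branch above is driven entirely from userspace through
     * /proc/sys/kernel/core_pattern, e.g.
     *
     *      echo '|/usr/local/bin/corehelper %p %s' > /proc/sys/kernel/core_pattern
     *
     * call_usermodehelper_pipe() then runs the helper with the dump written
     * to its stdin, and RLIMIT_CORE is deliberately bypassed (core_limit is
     * forced to RLIM_INFINITY above): any size policy belongs to the helper.
     */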
1811