linux/kernel/acct.c
<<
>>
Prefs
   1/*
   2 *  linux/kernel/acct.c
   3 *
   4 *  BSD Process Accounting for Linux
   5 *
   6 *  Author: Marco van Wieringen <mvw@planets.elm.net>
   7 *
   8 *  Some code based on ideas and code from:
   9 *  Thomas K. Dyas <tdyas@eden.rutgers.edu>
  10 *
  11 *  This file implements BSD-style process accounting. Whenever any
  12 *  process exits, an accounting record of type "struct acct" is
  13 *  written to the file specified with the acct() system call. It is
  14 *  up to user-level programs to do useful things with the accounting
  15 *  log. The kernel just provides the raw accounting information.
  16 *
  17 * (C) Copyright 1995 - 1997 Marco van Wieringen - ELM Consultancy B.V.
  18 *
  19 *  Plugged two leaks. 1) It didn't return acct_file into the free_filps if
  20 *  the file happened to be read-only. 2) If the accounting was suspended
  21 *  due to the lack of space it happily allowed to reopen it and completely
  22 *  lost the old acct_file. 3/10/98, Al Viro.
  23 *
  24 *  Now we silently close acct_file on attempt to reopen. Cleaned sys_acct().
  25 *  XTerms and EMACS are manifestations of pure evil. 21/10/98, AV.
  26 *
   27 *  Fixed a nasty interaction with sys_umount(). If the accounting
   28 *  was suspended we failed to stop it on umount(). Messy.
  29 *  Another one: remount to readonly didn't stop accounting.
  30 *      Question: what should we do if we have CAP_SYS_ADMIN but not
  31 *  CAP_SYS_PACCT? Current code does the following: umount returns -EBUSY
  32 *  unless we are messing with the root. In that case we are getting a
  33 *  real mess with do_remount_sb(). 9/11/98, AV.
  34 *
  35 *  Fixed a bunch of races (and pair of leaks). Probably not the best way,
  36 *  but this one obviously doesn't introduce deadlocks. Later. BTW, found
  37 *  one race (and leak) in BSD implementation.
  38 *  OK, that's better. ANOTHER race and leak in BSD variant. There always
  39 *  is one more bug... 10/11/98, AV.
  40 *
  41 *      Oh, fsck... Oopsable SMP race in do_process_acct() - we must hold
  42 * ->mmap_sem to walk the vma list of current->mm. Nasty, since it leaks
  43 * a struct file opened for write. Fixed. 2/6/2000, AV.
  44 */
  45
  46#include <linux/mm.h>
  47#include <linux/slab.h>
  48#include <linux/acct.h>
  49#include <linux/capability.h>
  50#include <linux/file.h>
  51#include <linux/tty.h>
  52#include <linux/security.h>
  53#include <linux/vfs.h>
  54#include <linux/jiffies.h>
  55#include <linux/times.h>
  56#include <linux/syscalls.h>
  57#include <linux/mount.h>
  58#include <linux/uaccess.h>
  59#include <asm/div64.h>
  60#include <linux/blkdev.h> /* sector_div */
  61#include <linux/pid_namespace.h>
  62#include <linux/fs_pin.h>
  63
  64/*
  65 * These constants control the amount of freespace that suspend and
  66 * resume the process accounting system, and the time delay between
  67 * each check.
  68 * Turned into sysctl-controllable parameters. AV, 12/11/98
  69 */
  70
/*
 * acct_parm[0] and acct_parm[1] are percentages of the filesystem's total
 * block count (see check_free_space()); acct_parm[2] is in seconds and is
 * multiplied by HZ when the next check time is computed.
 */
int acct_parm[3] = {4, 2, 30};
#define RESUME          (acct_parm[0])  /* >foo% free space - resume */
#define SUSPEND         (acct_parm[1])  /* <foo% free space - suspend */
#define ACCT_TIMEOUT    (acct_parm[2])  /* foo second timeout between checks */
  75
  76/*
  77 * External references and all of the globals.
  78 */
  79
/*
 * Per-pid-namespace accounting state. A pointer to the embedded pin is what
 * gets stored in ns->bacct; to_acct() converts it back to this structure.
 */
struct bsd_acct_struct {
        struct fs_pin           pin;            /* stored in ns->bacct; killed via acct_pin_kill() */
        atomic_long_t           count;          /* reference count; object is freed when it hits zero */
        struct rcu_head         rcu;            /* used by kfree_rcu() in acct_put() */
        struct mutex            lock;           /* held while writing a record and during teardown */
        int                     active;         /* non-zero while accounting is not paused */
        unsigned long           needcheck;      /* jiffies value of the next free-space check */
        struct file             *file;          /* accounting file records are appended to */
        struct pid_namespace    *ns;            /* namespace this accounting state belongs to */
        struct work_struct      work;           /* runs close_work() to close the file */
        struct completion       done;           /* completed by close_work(), awaited by acct_pin_kill() */
};

/* Defined further down; needed early by acct_pin_kill(). */
static void do_acct_process(struct bsd_acct_struct *acct);
  94
  95/*
  96 * Check the amount of free space and suspend/resume accordingly.
  97 */
  98static int check_free_space(struct bsd_acct_struct *acct)
  99{
 100        struct kstatfs sbuf;
 101
 102        if (time_is_before_jiffies(acct->needcheck))
 103                goto out;
 104
 105        /* May block */
 106        if (vfs_statfs(&acct->file->f_path, &sbuf))
 107                goto out;
 108
 109        if (acct->active) {
 110                u64 suspend = sbuf.f_blocks * SUSPEND;
 111                do_div(suspend, 100);
 112                if (sbuf.f_bavail <= suspend) {
 113                        acct->active = 0;
 114                        pr_info("Process accounting paused\n");
 115                }
 116        } else {
 117                u64 resume = sbuf.f_blocks * RESUME;
 118                do_div(resume, 100);
 119                if (sbuf.f_bavail >= resume) {
 120                        acct->active = 1;
 121                        pr_info("Process accounting resumed\n");
 122                }
 123        }
 124
 125        acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
 126out:
 127        return acct->active;
 128}
 129
 130static void acct_put(struct bsd_acct_struct *p)
 131{
 132        if (atomic_long_dec_and_test(&p->count))
 133                kfree_rcu(p, rcu);
 134}
 135
 136static inline struct bsd_acct_struct *to_acct(struct fs_pin *p)
 137{
 138        return p ? container_of(p, struct bsd_acct_struct, pin) : NULL;
 139}
 140
/*
 * Look up the accounting state currently installed in @ns.
 *
 * Returns NULL if ns->bacct is NULL. Otherwise returns the structure with
 * one extra reference taken and with its ->lock held; the caller has to
 * mutex_unlock(&res->lock) and acct_put(res) when done (as
 * slow_acct_process() does).
 */
static struct bsd_acct_struct *acct_get(struct pid_namespace *ns)
{
        struct bsd_acct_struct *res;
again:
        /* NOTE(review): presumably pairs with a barrier on the update side - confirm */
        smp_rmb();
        rcu_read_lock();
        res = to_acct(ACCESS_ONCE(ns->bacct));
        if (!res) {
                rcu_read_unlock();
                return NULL;
        }
        /* The count already dropped to zero: this object is going away, retry. */
        if (!atomic_long_inc_not_zero(&res->count)) {
                rcu_read_unlock();
                cpu_relax();
                goto again;
        }
        rcu_read_unlock();
        mutex_lock(&res->lock);
        /*
         * ns->bacct may have been replaced or cleared while we slept on the
         * mutex; if so, drop what we got and start over.
         */
        if (res != to_acct(ACCESS_ONCE(ns->bacct))) {
                mutex_unlock(&res->lock);
                acct_put(res);
                goto again;
        }
        return res;
}
 166
/*
 * Kill callback registered for the pin by init_fs_pin() in acct_on().
 *
 * With acct->lock held: writes one more record via do_acct_process(),
 * queues close_work() to close the accounting file and waits for it to
 * finish, then clears ns->bacct if it still points at this pin. After
 * unlocking, the pin is removed and one reference on @acct is dropped.
 */
static void acct_pin_kill(struct fs_pin *pin)
{
        struct bsd_acct_struct *acct = to_acct(pin);
        mutex_lock(&acct->lock);
        do_acct_process(acct);
        /* The file is closed from the workqueue; wait until that is done. */
        schedule_work(&acct->work);
        wait_for_completion(&acct->done);
        /* Only clear the namespace pointer if nobody installed a new one. */
        cmpxchg(&acct->ns->bacct, pin, NULL);
        mutex_unlock(&acct->lock);
        pin_remove(pin);
        acct_put(acct);
}
 179
 180static void close_work(struct work_struct *work)
 181{
 182        struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work);
 183        struct file *file = acct->file;
 184        if (file->f_op->flush)
 185                file->f_op->flush(file, NULL);
 186        __fput_sync(file);
 187        complete(&acct->done);
 188}
 189
 190static int acct_on(struct filename *pathname)
 191{
 192        struct file *file;
 193        struct vfsmount *mnt, *internal;
 194        struct pid_namespace *ns = task_active_pid_ns(current);
 195        struct bsd_acct_struct *acct;
 196        struct fs_pin *old;
 197        int err;
 198
 199        acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
 200        if (!acct)
 201                return -ENOMEM;
 202
 203        /* Difference from BSD - they don't do O_APPEND */
 204        file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
 205        if (IS_ERR(file)) {
 206                kfree(acct);
 207                return PTR_ERR(file);
 208        }
 209
 210        if (!S_ISREG(file_inode(file)->i_mode)) {
 211                kfree(acct);
 212                filp_close(file, NULL);
 213                return -EACCES;
 214        }
 215
 216        if (!(file->f_mode & FMODE_CAN_WRITE)) {
 217                kfree(acct);
 218                filp_close(file, NULL);
 219                return -EIO;
 220        }
 221        internal = mnt_clone_internal(&file->f_path);
 222        if (IS_ERR(internal)) {
 223                kfree(acct);
 224                filp_close(file, NULL);
 225                return PTR_ERR(internal);
 226        }
 227        err = mnt_want_write(internal);
 228        if (err) {
 229                mntput(internal);
 230                kfree(acct);
 231                filp_close(file, NULL);
 232                return err;
 233        }
 234        mnt = file->f_path.mnt;
 235        file->f_path.mnt = internal;
 236
 237        atomic_long_set(&acct->count, 1);
 238        init_fs_pin(&acct->pin, acct_pin_kill);
 239        acct->file = file;
 240        acct->needcheck = jiffies;
 241        acct->ns = ns;
 242        mutex_init(&acct->lock);
 243        INIT_WORK(&acct->work, close_work);
 244        init_completion(&acct->done);
 245        mutex_lock_nested(&acct->lock, 1);      /* nobody has seen it yet */
 246        pin_insert(&acct->pin, mnt);
 247
 248        rcu_read_lock();
 249        old = xchg(&ns->bacct, &acct->pin);
 250        mutex_unlock(&acct->lock);
 251        pin_kill(old);
 252        mnt_drop_write(mnt);
 253        mntput(mnt);
 254        return 0;
 255}
 256
 257static DEFINE_MUTEX(acct_on_mutex);
 258
 259/**
 260 * sys_acct - enable/disable process accounting
 261 * @name: file name for accounting records or NULL to shutdown accounting
 262 *
 263 * Returns 0 for success or negative errno values for failure.
 264 *
 265 * sys_acct() is the only system call needed to implement process
 266 * accounting. It takes the name of the file where accounting records
 267 * should be written. If the filename is NULL, accounting will be
 268 * shutdown.
 269 */
 270SYSCALL_DEFINE1(acct, const char __user *, name)
 271{
 272        int error = 0;
 273
 274        if (!capable(CAP_SYS_PACCT))
 275                return -EPERM;
 276
 277        if (name) {
 278                struct filename *tmp = getname(name);
 279
 280                if (IS_ERR(tmp))
 281                        return PTR_ERR(tmp);
 282                mutex_lock(&acct_on_mutex);
 283                error = acct_on(tmp);
 284                mutex_unlock(&acct_on_mutex);
 285                putname(tmp);
 286        } else {
 287                rcu_read_lock();
 288                pin_kill(task_active_pid_ns(current)->bacct);
 289        }
 290
 291        return error;
 292}
 293
 294void acct_exit_ns(struct pid_namespace *ns)
 295{
 296        rcu_read_lock();
 297        pin_kill(ns->bacct);
 298}
 299
 300/*
 301 *  encode an unsigned long into a comp_t
 302 *
 303 *  This routine has been adopted from the encode_comp_t() function in
 304 *  the kern_acct.c file of the FreeBSD operating system. The encoding
 305 *  is a 13-bit fraction with a 3-bit (base 8) exponent.
 306 */
 307
 308#define MANTSIZE        13                      /* 13 bit mantissa. */
 309#define EXPSIZE         3                       /* Base 8 (3 bit) exponent. */
 310#define MAXFRACT        ((1 << MANTSIZE) - 1)   /* Maximum fractional value. */
 311
 312static comp_t encode_comp_t(unsigned long value)
 313{
 314        int exp, rnd;
 315
 316        exp = rnd = 0;
 317        while (value > MAXFRACT) {
 318                rnd = value & (1 << (EXPSIZE - 1));     /* Round up? */
 319                value >>= EXPSIZE;      /* Base 8 exponent == 3 bit shift. */
 320                exp++;
 321        }
 322
 323        /*
 324         * If we need to round up, do it (and handle overflow correctly).
 325         */
 326        if (rnd && (++value > MAXFRACT)) {
 327                value >>= EXPSIZE;
 328                exp++;
 329        }
 330
 331        /*
 332         * Clean it up and polish it off.
 333         */
 334        exp <<= MANTSIZE;               /* Shift the exponent into place */
 335        exp += value;                   /* and add on the mantissa. */
 336        return exp;
 337}
 338
 339#if ACCT_VERSION == 1 || ACCT_VERSION == 2
 340/*
 341 * encode an u64 into a comp2_t (24 bits)
 342 *
 343 * Format: 5 bit base 2 exponent, 20 bits mantissa.
 344 * The leading bit of the mantissa is not stored, but implied for
 345 * non-zero exponents.
 346 * Largest encodable value is 50 bits.
 347 */
 348
 349#define MANTSIZE2       20                      /* 20 bit mantissa. */
 350#define EXPSIZE2        5                       /* 5 bit base 2 exponent. */
 351#define MAXFRACT2       ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */
 352#define MAXEXP2         ((1 << EXPSIZE2) - 1)    /* Maximum exponent. */
 353
 354static comp2_t encode_comp2_t(u64 value)
 355{
 356        int exp, rnd;
 357
 358        exp = (value > (MAXFRACT2>>1));
 359        rnd = 0;
 360        while (value > MAXFRACT2) {
 361                rnd = value & 1;
 362                value >>= 1;
 363                exp++;
 364        }
 365
 366        /*
 367         * If we need to round up, do it (and handle overflow correctly).
 368         */
 369        if (rnd && (++value > MAXFRACT2)) {
 370                value >>= 1;
 371                exp++;
 372        }
 373
 374        if (exp > MAXEXP2) {
 375                /* Overflow. Return largest representable number instead. */
 376                return (1ul << (MANTSIZE2+EXPSIZE2-1)) - 1;
 377        } else {
 378                return (value & (MAXFRACT2>>1)) | (exp << (MANTSIZE2-1));
 379        }
 380}
 381#endif
 382
 383#if ACCT_VERSION == 3
 384/*
 385 * encode an u64 into a 32 bit IEEE float
 386 */
 387static u32 encode_float(u64 value)
 388{
 389        unsigned exp = 190;
 390        unsigned u;
 391
 392        if (value == 0)
 393                return 0;
 394        while ((s64)value > 0) {
 395                value <<= 1;
 396                exp--;
 397        }
 398        u = (u32)(value >> 40) & 0x7fffffu;
 399        return u | (exp << 23);
 400}
 401#endif
 402
 403/*
 404 *  Write an accounting entry for an exiting process
 405 *
 406 *  The acct_process() call is the workhorse of the process
 407 *  accounting system. The struct acct is built here and then written
 408 *  into the accounting file. This function should only be called from
 409 *  do_exit() or when switching to a different output file.
 410 */
 411
 412static void fill_ac(acct_t *ac)
 413{
 414        struct pacct_struct *pacct = &current->signal->pacct;
 415        u64 elapsed, run_time;
 416        struct tty_struct *tty;
 417
 418        /*
 419         * Fill the accounting struct with the needed info as recorded
 420         * by the different kernel functions.
 421         */
 422        memset(ac, 0, sizeof(acct_t));
 423
 424        ac->ac_version = ACCT_VERSION | ACCT_BYTEORDER;
 425        strlcpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm));
 426
 427        /* calculate run_time in nsec*/
 428        run_time = ktime_get_ns();
 429        run_time -= current->group_leader->start_time;
 430        /* convert nsec -> AHZ */
 431        elapsed = nsec_to_AHZ(run_time);
 432#if ACCT_VERSION == 3
 433        ac->ac_etime = encode_float(elapsed);
 434#else
 435        ac->ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
 436                                (unsigned long) elapsed : (unsigned long) -1l);
 437#endif
 438#if ACCT_VERSION == 1 || ACCT_VERSION == 2
 439        {
 440                /* new enlarged etime field */
 441                comp2_t etime = encode_comp2_t(elapsed);
 442
 443                ac->ac_etime_hi = etime >> 16;
 444                ac->ac_etime_lo = (u16) etime;
 445        }
 446#endif
 447        do_div(elapsed, AHZ);
 448        ac->ac_btime = get_seconds() - elapsed;
 449#if ACCT_VERSION==2
 450        ac->ac_ahz = AHZ;
 451#endif
 452
 453        spin_lock_irq(&current->sighand->siglock);
 454        tty = current->signal->tty;     /* Safe as we hold the siglock */
 455        ac->ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
 456        ac->ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
 457        ac->ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
 458        ac->ac_flag = pacct->ac_flag;
 459        ac->ac_mem = encode_comp_t(pacct->ac_mem);
 460        ac->ac_minflt = encode_comp_t(pacct->ac_minflt);
 461        ac->ac_majflt = encode_comp_t(pacct->ac_majflt);
 462        ac->ac_exitcode = pacct->ac_exitcode;
 463        spin_unlock_irq(&current->sighand->siglock);
 464}
 465/*
 466 *  do_acct_process does all actual work. Caller holds the reference to file.
 467 */
 468static void do_acct_process(struct bsd_acct_struct *acct)
 469{
 470        acct_t ac;
 471        unsigned long flim;
 472        const struct cred *orig_cred;
 473        struct file *file = acct->file;
 474
 475        /*
 476         * Accounting records are not subject to resource limits.
 477         */
 478        flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
 479        current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
 480        /* Perform file operations on behalf of whoever enabled accounting */
 481        orig_cred = override_creds(file->f_cred);
 482
 483        /*
 484         * First check to see if there is enough free_space to continue
 485         * the process accounting system.
 486         */
 487        if (!check_free_space(acct))
 488                goto out;
 489
 490        fill_ac(&ac);
 491        /* we really need to bite the bullet and change layout */
 492        ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
 493        ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
 494#if ACCT_VERSION == 1 || ACCT_VERSION == 2
 495        /* backward-compatible 16 bit fields */
 496        ac.ac_uid16 = ac.ac_uid;
 497        ac.ac_gid16 = ac.ac_gid;
 498#endif
 499#if ACCT_VERSION == 3
 500        {
 501                struct pid_namespace *ns = acct->ns;
 502
 503                ac.ac_pid = task_tgid_nr_ns(current, ns);
 504                rcu_read_lock();
 505                ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent),
 506                                             ns);
 507                rcu_read_unlock();
 508        }
 509#endif
 510        /*
 511         * Get freeze protection. If the fs is frozen, just skip the write
 512         * as we could deadlock the system otherwise.
 513         */
 514        if (file_start_write_trylock(file)) {
 515                /* it's been opened O_APPEND, so position is irrelevant */
 516                loff_t pos = 0;
 517                __kernel_write(file, (char *)&ac, sizeof(acct_t), &pos);
 518                file_end_write(file);
 519        }
 520out:
 521        current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
 522        revert_creds(orig_cred);
 523}
 524
 525/**
 526 * acct_collect - collect accounting information into pacct_struct
 527 * @exitcode: task exit code
 528 * @group_dead: not 0, if this thread is the last one in the process.
 529 */
 530void acct_collect(long exitcode, int group_dead)
 531{
 532        struct pacct_struct *pacct = &current->signal->pacct;
 533        cputime_t utime, stime;
 534        unsigned long vsize = 0;
 535
 536        if (group_dead && current->mm) {
 537                struct vm_area_struct *vma;
 538
 539                down_read(&current->mm->mmap_sem);
 540                vma = current->mm->mmap;
 541                while (vma) {
 542                        vsize += vma->vm_end - vma->vm_start;
 543                        vma = vma->vm_next;
 544                }
 545                up_read(&current->mm->mmap_sem);
 546        }
 547
 548        spin_lock_irq(&current->sighand->siglock);
 549        if (group_dead)
 550                pacct->ac_mem = vsize / 1024;
 551        if (thread_group_leader(current)) {
 552                pacct->ac_exitcode = exitcode;
 553                if (current->flags & PF_FORKNOEXEC)
 554                        pacct->ac_flag |= AFORK;
 555        }
 556        if (current->flags & PF_SUPERPRIV)
 557                pacct->ac_flag |= ASU;
 558        if (current->flags & PF_DUMPCORE)
 559                pacct->ac_flag |= ACORE;
 560        if (current->flags & PF_SIGNALED)
 561                pacct->ac_flag |= AXSIG;
 562        task_cputime(current, &utime, &stime);
 563        pacct->ac_utime += utime;
 564        pacct->ac_stime += stime;
 565        pacct->ac_minflt += current->min_flt;
 566        pacct->ac_majflt += current->maj_flt;
 567        spin_unlock_irq(&current->sighand->siglock);
 568}
 569
 570static void slow_acct_process(struct pid_namespace *ns)
 571{
 572        for ( ; ns; ns = ns->parent) {
 573                struct bsd_acct_struct *acct = acct_get(ns);
 574                if (acct) {
 575                        do_acct_process(acct);
 576                        mutex_unlock(&acct->lock);
 577                        acct_put(acct);
 578                }
 579        }
 580}
 581
 582/**
 583 * acct_process
 584 *
 585 * handles process accounting for an exiting task
 586 */
 587void acct_process(void)
 588{
 589        struct pid_namespace *ns;
 590
 591        /*
 592         * This loop is safe lockless, since current is still
 593         * alive and holds its namespace, which in turn holds
 594         * its parent.
 595         */
 596        for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) {
 597                if (ns->bacct)
 598                        break;
 599        }
 600        if (unlikely(ns))
 601                slow_acct_process(ns);
 602}
 603