linux/kernel/acct.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 *  linux/kernel/acct.c
   4 *
   5 *  BSD Process Accounting for Linux
   6 *
   7 *  Author: Marco van Wieringen <mvw@planets.elm.net>
   8 *
   9 *  Some code based on ideas and code from:
  10 *  Thomas K. Dyas <tdyas@eden.rutgers.edu>
  11 *
  12 *  This file implements BSD-style process accounting. Whenever any
  13 *  process exits, an accounting record of type "struct acct" is
  14 *  written to the file specified with the acct() system call. It is
  15 *  up to user-level programs to do useful things with the accounting
  16 *  log. The kernel just provides the raw accounting information.
  17 *
  18 * (C) Copyright 1995 - 1997 Marco van Wieringen - ELM Consultancy B.V.
  19 *
  20 *  Plugged two leaks. 1) It didn't return acct_file into the free_filps if
  21 *  the file happened to be read-only. 2) If the accounting was suspended
  22 *  due to the lack of space it happily allowed to reopen it and completely
  23 *  lost the old acct_file. 3/10/98, Al Viro.
  24 *
  25 *  Now we silently close acct_file on attempt to reopen. Cleaned sys_acct().
  26 *  XTerms and EMACS are manifestations of pure evil. 21/10/98, AV.
  27 *
  28 *  Fixed a nasty interaction with sys_umount(). If the accounting
  29 *  was suspended we failed to stop it on umount(). Messy.
  30 *  Another one: remount to readonly didn't stop accounting.
  31 *      Question: what should we do if we have CAP_SYS_ADMIN but not
  32 *  CAP_SYS_PACCT? Current code does the following: umount returns -EBUSY
  33 *  unless we are messing with the root. In that case we are getting a
  34 *  real mess with do_remount_sb(). 9/11/98, AV.
  35 *
  36 *  Fixed a bunch of races (and pair of leaks). Probably not the best way,
  37 *  but this one obviously doesn't introduce deadlocks. Later. BTW, found
  38 *  one race (and leak) in BSD implementation.
  39 *  OK, that's better. ANOTHER race and leak in BSD variant. There always
  40 *  is one more bug... 10/11/98, AV.
  41 *
  42 *      Oh, fsck... Oopsable SMP race in do_process_acct() - we must hold
  43 * ->mmap_lock to walk the vma list of current->mm. Nasty, since it leaks
  44 * a struct file opened for write. Fixed. 2/6/2000, AV.
  45 */
  46
  47#include <linux/mm.h>
  48#include <linux/slab.h>
  49#include <linux/acct.h>
  50#include <linux/capability.h>
  51#include <linux/file.h>
  52#include <linux/tty.h>
  53#include <linux/security.h>
  54#include <linux/vfs.h>
  55#include <linux/jiffies.h>
  56#include <linux/times.h>
  57#include <linux/syscalls.h>
  58#include <linux/mount.h>
  59#include <linux/uaccess.h>
  60#include <linux/sched/cputime.h>
  61
  62#include <asm/div64.h>
  63#include <linux/blkdev.h> /* sector_div */
  64#include <linux/pid_namespace.h>
  65#include <linux/fs_pin.h>
  66
/*
 * These constants control the amount of freespace that suspend and
 * resume the process accounting system, and the time delay between
 * each check.
 * Turned into sysctl-controllable parameters. AV, 12/11/98
 *
 * acct_parm[] is the backing store; the macros below name its three
 * slots. The percentages are taken of the total block count (f_blocks)
 * of the filesystem holding the accounting file and compared against
 * f_bavail in check_free_space().
 */

int acct_parm[3] = {4, 2, 30};
#define RESUME          (acct_parm[0])  /* resume once f_bavail >= this % of f_blocks */
#define SUSPEND         (acct_parm[1])  /* suspend once f_bavail <= this % of f_blocks */
#define ACCT_TIMEOUT    (acct_parm[2])  /* seconds between two free-space checks */
  78
  79/*
  80 * External references and all of the globals.
  81 */
  82
/*
 * State for one open accounting file. acct_on() allocates it and
 * publishes &pin in the pid namespace's ->bacct pointer.
 */
struct bsd_acct_struct {
        /* pin published in ns->bacct; its kill callback is acct_pin_kill() */
        struct fs_pin           pin;
        /* reference count; acct_put() frees the structure when it hits zero */
        atomic_long_t           count;
        /* RCU head handed to kfree_rcu() by acct_put() */
        struct rcu_head         rcu;
        /* held by acct_get() callers and acct_pin_kill() around record writes */
        struct mutex            lock;
        /* 0 while suspended for lack of space; toggled by check_free_space() */
        int                     active;
        /* jiffies value before which check_free_space() skips the statfs */
        unsigned long           needcheck;
        /* the accounting file records are appended to */
        struct file             *file;
        /* pid namespace this accounting state was enabled for */
        struct pid_namespace    *ns;
        /* work item running close_work() to flush and close ->file */
        struct work_struct      work;
        /* completed by close_work() once the file has been closed */
        struct completion       done;
};

/* Defined further down; acct_pin_kill() calls it before the definition. */
static void do_acct_process(struct bsd_acct_struct *acct);
  97
  98/*
  99 * Check the amount of free space and suspend/resume accordingly.
 100 */
 101static int check_free_space(struct bsd_acct_struct *acct)
 102{
 103        struct kstatfs sbuf;
 104
 105        if (time_is_after_jiffies(acct->needcheck))
 106                goto out;
 107
 108        /* May block */
 109        if (vfs_statfs(&acct->file->f_path, &sbuf))
 110                goto out;
 111
 112        if (acct->active) {
 113                u64 suspend = sbuf.f_blocks * SUSPEND;
 114                do_div(suspend, 100);
 115                if (sbuf.f_bavail <= suspend) {
 116                        acct->active = 0;
 117                        pr_info("Process accounting paused\n");
 118                }
 119        } else {
 120                u64 resume = sbuf.f_blocks * RESUME;
 121                do_div(resume, 100);
 122                if (sbuf.f_bavail >= resume) {
 123                        acct->active = 1;
 124                        pr_info("Process accounting resumed\n");
 125                }
 126        }
 127
 128        acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
 129out:
 130        return acct->active;
 131}
 132
 133static void acct_put(struct bsd_acct_struct *p)
 134{
 135        if (atomic_long_dec_and_test(&p->count))
 136                kfree_rcu(p, rcu);
 137}
 138
 139static inline struct bsd_acct_struct *to_acct(struct fs_pin *p)
 140{
 141        return p ? container_of(p, struct bsd_acct_struct, pin) : NULL;
 142}
 143
 144static struct bsd_acct_struct *acct_get(struct pid_namespace *ns)
 145{
 146        struct bsd_acct_struct *res;
 147again:
 148        smp_rmb();
 149        rcu_read_lock();
 150        res = to_acct(READ_ONCE(ns->bacct));
 151        if (!res) {
 152                rcu_read_unlock();
 153                return NULL;
 154        }
 155        if (!atomic_long_inc_not_zero(&res->count)) {
 156                rcu_read_unlock();
 157                cpu_relax();
 158                goto again;
 159        }
 160        rcu_read_unlock();
 161        mutex_lock(&res->lock);
 162        if (res != to_acct(READ_ONCE(ns->bacct))) {
 163                mutex_unlock(&res->lock);
 164                acct_put(res);
 165                goto again;
 166        }
 167        return res;
 168}
 169
/*
 * Kill callback for the fs_pin (registered via init_fs_pin() in acct_on()).
 *
 * Writes one more record using the current task's data, has the file
 * flushed and closed from the workqueue, unpublishes the state from its
 * pid namespace and drops a reference.
 */
static void acct_pin_kill(struct fs_pin *pin)
{
        struct bsd_acct_struct *acct = to_acct(pin);
        mutex_lock(&acct->lock);
        /* last record before the file goes away */
        do_acct_process(acct);
        /* close_work() flushes and closes ->file, then completes ->done */
        schedule_work(&acct->work);
        wait_for_completion(&acct->done);
        /* clear ns->bacct only if it still points at this pin */
        cmpxchg(&acct->ns->bacct, pin, NULL);
        mutex_unlock(&acct->lock);
        pin_remove(pin);
        /* presumably the initial reference set up by acct_on() -- TODO confirm */
        acct_put(acct);
}
 182
 183static void close_work(struct work_struct *work)
 184{
 185        struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work);
 186        struct file *file = acct->file;
 187        if (file->f_op->flush)
 188                file->f_op->flush(file, NULL);
 189        __fput_sync(file);
 190        complete(&acct->done);
 191}
 192
 193static int acct_on(struct filename *pathname)
 194{
 195        struct file *file;
 196        struct vfsmount *mnt, *internal;
 197        struct pid_namespace *ns = task_active_pid_ns(current);
 198        struct bsd_acct_struct *acct;
 199        struct fs_pin *old;
 200        int err;
 201
 202        acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
 203        if (!acct)
 204                return -ENOMEM;
 205
 206        /* Difference from BSD - they don't do O_APPEND */
 207        file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
 208        if (IS_ERR(file)) {
 209                kfree(acct);
 210                return PTR_ERR(file);
 211        }
 212
 213        if (!S_ISREG(file_inode(file)->i_mode)) {
 214                kfree(acct);
 215                filp_close(file, NULL);
 216                return -EACCES;
 217        }
 218
 219        if (!(file->f_mode & FMODE_CAN_WRITE)) {
 220                kfree(acct);
 221                filp_close(file, NULL);
 222                return -EIO;
 223        }
 224        internal = mnt_clone_internal(&file->f_path);
 225        if (IS_ERR(internal)) {
 226                kfree(acct);
 227                filp_close(file, NULL);
 228                return PTR_ERR(internal);
 229        }
 230        err = __mnt_want_write(internal);
 231        if (err) {
 232                mntput(internal);
 233                kfree(acct);
 234                filp_close(file, NULL);
 235                return err;
 236        }
 237        mnt = file->f_path.mnt;
 238        file->f_path.mnt = internal;
 239
 240        atomic_long_set(&acct->count, 1);
 241        init_fs_pin(&acct->pin, acct_pin_kill);
 242        acct->file = file;
 243        acct->needcheck = jiffies;
 244        acct->ns = ns;
 245        mutex_init(&acct->lock);
 246        INIT_WORK(&acct->work, close_work);
 247        init_completion(&acct->done);
 248        mutex_lock_nested(&acct->lock, 1);      /* nobody has seen it yet */
 249        pin_insert(&acct->pin, mnt);
 250
 251        rcu_read_lock();
 252        old = xchg(&ns->bacct, &acct->pin);
 253        mutex_unlock(&acct->lock);
 254        pin_kill(old);
 255        __mnt_drop_write(mnt);
 256        mntput(mnt);
 257        return 0;
 258}
 259
 260static DEFINE_MUTEX(acct_on_mutex);
 261
 262/**
 263 * sys_acct - enable/disable process accounting
 264 * @name: file name for accounting records or NULL to shutdown accounting
 265 *
 266 * sys_acct() is the only system call needed to implement process
 267 * accounting. It takes the name of the file where accounting records
 268 * should be written. If the filename is NULL, accounting will be
 269 * shutdown.
 270 *
 271 * Returns: 0 for success or negative errno values for failure.
 272 */
 273SYSCALL_DEFINE1(acct, const char __user *, name)
 274{
 275        int error = 0;
 276
 277        if (!capable(CAP_SYS_PACCT))
 278                return -EPERM;
 279
 280        if (name) {
 281                struct filename *tmp = getname(name);
 282
 283                if (IS_ERR(tmp))
 284                        return PTR_ERR(tmp);
 285                mutex_lock(&acct_on_mutex);
 286                error = acct_on(tmp);
 287                mutex_unlock(&acct_on_mutex);
 288                putname(tmp);
 289        } else {
 290                rcu_read_lock();
 291                pin_kill(task_active_pid_ns(current)->bacct);
 292        }
 293
 294        return error;
 295}
 296
/*
 * Stop accounting for @ns by killing whatever pin is currently installed
 * in ns->bacct.
 *
 * NOTE(review): the RCU read lock taken here is not released in this
 * function; presumably pin_kill() drops it -- confirm in fs/fs_pin.c.
 */
void acct_exit_ns(struct pid_namespace *ns)
{
        rcu_read_lock();
        pin_kill(ns->bacct);
}
 302
 303/*
 304 *  encode an unsigned long into a comp_t
 305 *
 306 *  This routine has been adopted from the encode_comp_t() function in
 307 *  the kern_acct.c file of the FreeBSD operating system. The encoding
 308 *  is a 13-bit fraction with a 3-bit (base 8) exponent.
 309 */
 310
 311#define MANTSIZE        13                      /* 13 bit mantissa. */
 312#define EXPSIZE         3                       /* Base 8 (3 bit) exponent. */
 313#define MAXFRACT        ((1 << MANTSIZE) - 1)   /* Maximum fractional value. */
 314
 315static comp_t encode_comp_t(unsigned long value)
 316{
 317        int exp, rnd;
 318
 319        exp = rnd = 0;
 320        while (value > MAXFRACT) {
 321                rnd = value & (1 << (EXPSIZE - 1));     /* Round up? */
 322                value >>= EXPSIZE;      /* Base 8 exponent == 3 bit shift. */
 323                exp++;
 324        }
 325
 326        /*
 327         * If we need to round up, do it (and handle overflow correctly).
 328         */
 329        if (rnd && (++value > MAXFRACT)) {
 330                value >>= EXPSIZE;
 331                exp++;
 332        }
 333
 334        /*
 335         * Clean it up and polish it off.
 336         */
 337        exp <<= MANTSIZE;               /* Shift the exponent into place */
 338        exp += value;                   /* and add on the mantissa. */
 339        return exp;
 340}
 341
 342#if ACCT_VERSION == 1 || ACCT_VERSION == 2
 343/*
 344 * encode an u64 into a comp2_t (24 bits)
 345 *
 346 * Format: 5 bit base 2 exponent, 20 bits mantissa.
 347 * The leading bit of the mantissa is not stored, but implied for
 348 * non-zero exponents.
 349 * Largest encodable value is 50 bits.
 350 */
 351
 352#define MANTSIZE2       20                      /* 20 bit mantissa. */
 353#define EXPSIZE2        5                       /* 5 bit base 2 exponent. */
 354#define MAXFRACT2       ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */
 355#define MAXEXP2         ((1 << EXPSIZE2) - 1)    /* Maximum exponent. */
 356
 357static comp2_t encode_comp2_t(u64 value)
 358{
 359        int exp, rnd;
 360
 361        exp = (value > (MAXFRACT2>>1));
 362        rnd = 0;
 363        while (value > MAXFRACT2) {
 364                rnd = value & 1;
 365                value >>= 1;
 366                exp++;
 367        }
 368
 369        /*
 370         * If we need to round up, do it (and handle overflow correctly).
 371         */
 372        if (rnd && (++value > MAXFRACT2)) {
 373                value >>= 1;
 374                exp++;
 375        }
 376
 377        if (exp > MAXEXP2) {
 378                /* Overflow. Return largest representable number instead. */
 379                return (1ul << (MANTSIZE2+EXPSIZE2-1)) - 1;
 380        } else {
 381                return (value & (MAXFRACT2>>1)) | (exp << (MANTSIZE2-1));
 382        }
 383}
 384#elif ACCT_VERSION == 3
 385/*
 386 * encode an u64 into a 32 bit IEEE float
 387 */
 388static u32 encode_float(u64 value)
 389{
 390        unsigned exp = 190;
 391        unsigned u;
 392
 393        if (value == 0)
 394                return 0;
 395        while ((s64)value > 0) {
 396                value <<= 1;
 397                exp--;
 398        }
 399        u = (u32)(value >> 40) & 0x7fffffu;
 400        return u | (exp << 23);
 401}
 402#endif
 403
 404/*
 405 *  Write an accounting entry for an exiting process
 406 *
 407 *  The acct_process() call is the workhorse of the process
 408 *  accounting system. The struct acct is built here and then written
 409 *  into the accounting file. This function should only be called from
 410 *  do_exit() or when switching to a different output file.
 411 */
 412
 413static void fill_ac(acct_t *ac)
 414{
 415        struct pacct_struct *pacct = &current->signal->pacct;
 416        u64 elapsed, run_time;
 417        time64_t btime;
 418        struct tty_struct *tty;
 419
 420        /*
 421         * Fill the accounting struct with the needed info as recorded
 422         * by the different kernel functions.
 423         */
 424        memset(ac, 0, sizeof(acct_t));
 425
 426        ac->ac_version = ACCT_VERSION | ACCT_BYTEORDER;
 427        strlcpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm));
 428
 429        /* calculate run_time in nsec*/
 430        run_time = ktime_get_ns();
 431        run_time -= current->group_leader->start_time;
 432        /* convert nsec -> AHZ */
 433        elapsed = nsec_to_AHZ(run_time);
 434#if ACCT_VERSION == 3
 435        ac->ac_etime = encode_float(elapsed);
 436#else
 437        ac->ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
 438                                (unsigned long) elapsed : (unsigned long) -1l);
 439#endif
 440#if ACCT_VERSION == 1 || ACCT_VERSION == 2
 441        {
 442                /* new enlarged etime field */
 443                comp2_t etime = encode_comp2_t(elapsed);
 444
 445                ac->ac_etime_hi = etime >> 16;
 446                ac->ac_etime_lo = (u16) etime;
 447        }
 448#endif
 449        do_div(elapsed, AHZ);
 450        btime = ktime_get_real_seconds() - elapsed;
 451        ac->ac_btime = clamp_t(time64_t, btime, 0, U32_MAX);
 452#if ACCT_VERSION==2
 453        ac->ac_ahz = AHZ;
 454#endif
 455
 456        spin_lock_irq(&current->sighand->siglock);
 457        tty = current->signal->tty;     /* Safe as we hold the siglock */
 458        ac->ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
 459        ac->ac_utime = encode_comp_t(nsec_to_AHZ(pacct->ac_utime));
 460        ac->ac_stime = encode_comp_t(nsec_to_AHZ(pacct->ac_stime));
 461        ac->ac_flag = pacct->ac_flag;
 462        ac->ac_mem = encode_comp_t(pacct->ac_mem);
 463        ac->ac_minflt = encode_comp_t(pacct->ac_minflt);
 464        ac->ac_majflt = encode_comp_t(pacct->ac_majflt);
 465        ac->ac_exitcode = pacct->ac_exitcode;
 466        spin_unlock_irq(&current->sighand->siglock);
 467}
/*
 *  do_acct_process does all actual work. Caller holds the reference to file.
 *
 *  Builds the record for the current task and appends it to acct->file.
 *  Nothing is written when check_free_space() reports accounting as
 *  suspended or when write access to the file cannot be obtained without
 *  blocking. The task's RLIMIT_FSIZE soft limit and credentials are
 *  overridden for the duration of the call and restored on exit.
 */
static void do_acct_process(struct bsd_acct_struct *acct)
{
        acct_t ac;
        unsigned long flim;
        const struct cred *orig_cred;
        struct file *file = acct->file;

        /*
         * Accounting records are not subject to resource limits.
         */
        flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
        current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
        /* Perform file operations on behalf of whoever enabled accounting */
        orig_cred = override_creds(file->f_cred);

        /*
         * First check to see if there is enough free_space to continue
         * the process accounting system.
         */
        if (!check_free_space(acct))
                goto out;

        fill_ac(&ac);
        /* we really need to bite the bullet and change layout */
        /* ids are the task's own (pre-override), mapped into the file opener's user ns */
        ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
        ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
#if ACCT_VERSION == 1 || ACCT_VERSION == 2
        /* backward-compatible 16 bit fields */
        ac.ac_uid16 = ac.ac_uid;
        ac.ac_gid16 = ac.ac_gid;
#elif ACCT_VERSION == 3
        {
                /* pids are reported as seen from the accounting namespace */
                struct pid_namespace *ns = acct->ns;

                ac.ac_pid = task_tgid_nr_ns(current, ns);
                rcu_read_lock();
                ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent),
                                             ns);
                rcu_read_unlock();
        }
#endif
        /*
         * Get freeze protection. If the fs is frozen, just skip the write
         * as we could deadlock the system otherwise.
         */
        if (file_start_write_trylock(file)) {
                /* it's been opened O_APPEND, so position is irrelevant */
                loff_t pos = 0;
                /* the result of the write is not checked */
                __kernel_write(file, &ac, sizeof(acct_t), &pos);
                file_end_write(file);
        }
out:
        /* undo the rlimit and credential overrides from the top */
        current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
        revert_creds(orig_cred);
}
 526
 527/**
 528 * acct_collect - collect accounting information into pacct_struct
 529 * @exitcode: task exit code
 530 * @group_dead: not 0, if this thread is the last one in the process.
 531 */
 532void acct_collect(long exitcode, int group_dead)
 533{
 534        struct pacct_struct *pacct = &current->signal->pacct;
 535        u64 utime, stime;
 536        unsigned long vsize = 0;
 537
 538        if (group_dead && current->mm) {
 539                struct vm_area_struct *vma;
 540
 541                mmap_read_lock(current->mm);
 542                vma = current->mm->mmap;
 543                while (vma) {
 544                        vsize += vma->vm_end - vma->vm_start;
 545                        vma = vma->vm_next;
 546                }
 547                mmap_read_unlock(current->mm);
 548        }
 549
 550        spin_lock_irq(&current->sighand->siglock);
 551        if (group_dead)
 552                pacct->ac_mem = vsize / 1024;
 553        if (thread_group_leader(current)) {
 554                pacct->ac_exitcode = exitcode;
 555                if (current->flags & PF_FORKNOEXEC)
 556                        pacct->ac_flag |= AFORK;
 557        }
 558        if (current->flags & PF_SUPERPRIV)
 559                pacct->ac_flag |= ASU;
 560        if (current->flags & PF_DUMPCORE)
 561                pacct->ac_flag |= ACORE;
 562        if (current->flags & PF_SIGNALED)
 563                pacct->ac_flag |= AXSIG;
 564
 565        task_cputime(current, &utime, &stime);
 566        pacct->ac_utime += utime;
 567        pacct->ac_stime += stime;
 568        pacct->ac_minflt += current->min_flt;
 569        pacct->ac_majflt += current->maj_flt;
 570        spin_unlock_irq(&current->sighand->siglock);
 571}
 572
 573static void slow_acct_process(struct pid_namespace *ns)
 574{
 575        for ( ; ns; ns = ns->parent) {
 576                struct bsd_acct_struct *acct = acct_get(ns);
 577                if (acct) {
 578                        do_acct_process(acct);
 579                        mutex_unlock(&acct->lock);
 580                        acct_put(acct);
 581                }
 582        }
 583}
 584
 585/**
 586 * acct_process - handles process accounting for an exiting task
 587 */
 588void acct_process(void)
 589{
 590        struct pid_namespace *ns;
 591
 592        /*
 593         * This loop is safe lockless, since current is still
 594         * alive and holds its namespace, which in turn holds
 595         * its parent.
 596         */
 597        for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) {
 598                if (ns->bacct)
 599                        break;
 600        }
 601        if (unlikely(ns))
 602                slow_acct_process(ns);
 603}
 604