linux/kernel/posix-cpu-timers.c
<<
>>
Prefs
   1/*
   2 * Implement CPU time clocks for the POSIX clock interface.
   3 */
   4
   5#include <linux/sched.h>
   6#include <linux/posix-timers.h>
   7#include <linux/errno.h>
   8#include <linux/math64.h>
   9#include <asm/uaccess.h>
  10#include <linux/kernel_stat.h>
  11#include <trace/events/timer.h>
  12#include <linux/random.h>
  13#include <linux/tick.h>
  14#include <linux/workqueue.h>
  15
  16/*
  17 * Called after updating RLIMIT_CPU to run cpu timer and update
  18 * tsk->signal->cputime_expires expiration cache if necessary. Needs
  19 * siglock protection since other code may update expiration cache as
  20 * well.
  21 */
  22void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
  23{
  24        cputime_t cputime = secs_to_cputime(rlim_new);
  25
  26        spin_lock_irq(&task->sighand->siglock);
  27        set_process_cpu_timer(task, CPUCLOCK_PROF, &cputime, NULL);
  28        spin_unlock_irq(&task->sighand->siglock);
  29}
  30
  31static int check_clock(const clockid_t which_clock)
  32{
  33        int error = 0;
  34        struct task_struct *p;
  35        const pid_t pid = CPUCLOCK_PID(which_clock);
  36
  37        if (CPUCLOCK_WHICH(which_clock) >= CPUCLOCK_MAX)
  38                return -EINVAL;
  39
  40        if (pid == 0)
  41                return 0;
  42
  43        rcu_read_lock();
  44        p = find_task_by_vpid(pid);
  45        if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ?
  46                   same_thread_group(p, current) : has_group_leader_pid(p))) {
  47                error = -EINVAL;
  48        }
  49        rcu_read_unlock();
  50
  51        return error;
  52}
  53
  54static inline union cpu_time_count
  55timespec_to_sample(const clockid_t which_clock, const struct timespec *tp)
  56{
  57        union cpu_time_count ret;
  58        ret.sched = 0;          /* high half always zero when .cpu used */
  59        if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
  60                ret.sched = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec;
  61        } else {
  62                ret.cpu = timespec_to_cputime(tp);
  63        }
  64        return ret;
  65}
  66
  67static void sample_to_timespec(const clockid_t which_clock,
  68                               union cpu_time_count cpu,
  69                               struct timespec *tp)
  70{
  71        if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED)
  72                *tp = ns_to_timespec(cpu.sched);
  73        else
  74                cputime_to_timespec(cpu.cpu, tp);
  75}
  76
  77static inline int cpu_time_before(const clockid_t which_clock,
  78                                  union cpu_time_count now,
  79                                  union cpu_time_count then)
  80{
  81        if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
  82                return now.sched < then.sched;
  83        }  else {
  84                return now.cpu < then.cpu;
  85        }
  86}
  87static inline void cpu_time_add(const clockid_t which_clock,
  88                                union cpu_time_count *acc,
  89                                union cpu_time_count val)
  90{
  91        if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
  92                acc->sched += val.sched;
  93        }  else {
  94                acc->cpu += val.cpu;
  95        }
  96}
  97static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
  98                                                union cpu_time_count a,
  99                                                union cpu_time_count b)
 100{
 101        if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
 102                a.sched -= b.sched;
 103        }  else {
 104                a.cpu -= b.cpu;
 105        }
 106        return a;
 107}
 108
 109/*
 110 * Update expiry time from increment, and increase overrun count,
 111 * given the current clock sample.
 112 */
 113static void bump_cpu_timer(struct k_itimer *timer,
 114                                  union cpu_time_count now)
 115{
 116        int i;
 117
 118        if (timer->it.cpu.incr.sched == 0)
 119                return;
 120
 121        if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) {
 122                unsigned long long delta, incr;
 123
 124                if (now.sched < timer->it.cpu.expires.sched)
 125                        return;
 126                incr = timer->it.cpu.incr.sched;
 127                delta = now.sched + incr - timer->it.cpu.expires.sched;
 128                /* Don't use (incr*2 < delta), incr*2 might overflow. */
 129                for (i = 0; incr < delta - incr; i++)
 130                        incr = incr << 1;
 131                for (; i >= 0; incr >>= 1, i--) {
 132                        if (delta < incr)
 133                                continue;
 134                        timer->it.cpu.expires.sched += incr;
 135                        timer->it_overrun += 1 << i;
 136                        delta -= incr;
 137                }
 138        } else {
 139                cputime_t delta, incr;
 140
 141                if (now.cpu < timer->it.cpu.expires.cpu)
 142                        return;
 143                incr = timer->it.cpu.incr.cpu;
 144                delta = now.cpu + incr - timer->it.cpu.expires.cpu;
 145                /* Don't use (incr*2 < delta), incr*2 might overflow. */
 146                for (i = 0; incr < delta - incr; i++)
 147                             incr += incr;
 148                for (; i >= 0; incr = incr >> 1, i--) {
 149                        if (delta < incr)
 150                                continue;
 151                        timer->it.cpu.expires.cpu += incr;
 152                        timer->it_overrun += 1 << i;
 153                        delta -= incr;
 154                }
 155        }
 156}
 157
 158/**
 159 * task_cputime_zero - Check a task_cputime struct for all zero fields.
 160 *
 161 * @cputime:    The struct to compare.
 162 *
 163 * Checks @cputime to see if all fields are zero.  Returns true if all fields
 164 * are zero, false if any field is nonzero.
 165 */
 166static inline int task_cputime_zero(const struct task_cputime *cputime)
 167{
 168        if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime)
 169                return 1;
 170        return 0;
 171}
 172
 173static inline cputime_t prof_ticks(struct task_struct *p)
 174{
 175        cputime_t utime, stime;
 176
 177        task_cputime(p, &utime, &stime);
 178
 179        return utime + stime;
 180}
 181static inline cputime_t virt_ticks(struct task_struct *p)
 182{
 183        cputime_t utime;
 184
 185        task_cputime(p, &utime, NULL);
 186
 187        return utime;
 188}
 189
 190static int
 191posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
 192{
 193        int error = check_clock(which_clock);
 194        if (!error) {
 195                tp->tv_sec = 0;
 196                tp->tv_nsec = ((NSEC_PER_SEC + HZ - 1) / HZ);
 197                if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
 198                        /*
 199                         * If sched_clock is using a cycle counter, we
 200                         * don't have any idea of its true resolution
 201                         * exported, but it is much more than 1s/HZ.
 202                         */
 203                        tp->tv_nsec = 1;
 204                }
 205        }
 206        return error;
 207}
 208
 209static int
 210posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
 211{
 212        /*
 213         * You can never reset a CPU clock, but we check for other errors
 214         * in the call before failing with EPERM.
 215         */
 216        int error = check_clock(which_clock);
 217        if (error == 0) {
 218                error = -EPERM;
 219        }
 220        return error;
 221}
 222
 223
 224/*
 225 * Sample a per-thread clock for the given task.
 226 */
 227static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
 228                            union cpu_time_count *cpu)
 229{
 230        switch (CPUCLOCK_WHICH(which_clock)) {
 231        default:
 232                return -EINVAL;
 233        case CPUCLOCK_PROF:
 234                cpu->cpu = prof_ticks(p);
 235                break;
 236        case CPUCLOCK_VIRT:
 237                cpu->cpu = virt_ticks(p);
 238                break;
 239        case CPUCLOCK_SCHED:
 240                cpu->sched = task_sched_runtime(p);
 241                break;
 242        }
 243        return 0;
 244}
 245
 246static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
 247{
 248        if (b->utime > a->utime)
 249                a->utime = b->utime;
 250
 251        if (b->stime > a->stime)
 252                a->stime = b->stime;
 253
 254        if (b->sum_exec_runtime > a->sum_exec_runtime)
 255                a->sum_exec_runtime = b->sum_exec_runtime;
 256}
 257
 258void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
 259{
 260        struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
 261        struct task_cputime sum;
 262        unsigned long flags;
 263
 264        if (!cputimer->running) {
 265                /*
 266                 * The POSIX timer interface allows for absolute time expiry
 267                 * values through the TIMER_ABSTIME flag, therefore we have
 268                 * to synchronize the timer to the clock every time we start
 269                 * it.
 270                 */
 271                thread_group_cputime(tsk, &sum);
 272                raw_spin_lock_irqsave(&cputimer->lock, flags);
 273                cputimer->running = 1;
 274                update_gt_cputime(&cputimer->cputime, &sum);
 275        } else
 276                raw_spin_lock_irqsave(&cputimer->lock, flags);
 277        *times = cputimer->cputime;
 278        raw_spin_unlock_irqrestore(&cputimer->lock, flags);
 279}
 280
 281/*
 282 * Sample a process (thread group) clock for the given group_leader task.
 283 * Must be called with tasklist_lock held for reading.
 284 */
 285static int cpu_clock_sample_group(const clockid_t which_clock,
 286                                  struct task_struct *p,
 287                                  union cpu_time_count *cpu)
 288{
 289        struct task_cputime cputime;
 290
 291        switch (CPUCLOCK_WHICH(which_clock)) {
 292        default:
 293                return -EINVAL;
 294        case CPUCLOCK_PROF:
 295                thread_group_cputime(p, &cputime);
 296                cpu->cpu = cputime.utime + cputime.stime;
 297                break;
 298        case CPUCLOCK_VIRT:
 299                thread_group_cputime(p, &cputime);
 300                cpu->cpu = cputime.utime;
 301                break;
 302        case CPUCLOCK_SCHED:
 303                thread_group_cputime(p, &cputime);
 304                cpu->sched = cputime.sum_exec_runtime;
 305                break;
 306        }
 307        return 0;
 308}
 309
 310
 311static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
 312{
 313        const pid_t pid = CPUCLOCK_PID(which_clock);
 314        int error = -EINVAL;
 315        union cpu_time_count rtn;
 316
 317        if (pid == 0) {
 318                /*
 319                 * Special case constant value for our own clocks.
 320                 * We don't have to do any lookup to find ourselves.
 321                 */
 322                if (CPUCLOCK_PERTHREAD(which_clock)) {
 323                        /*
 324                         * Sampling just ourselves we can do with no locking.
 325                         */
 326                        error = cpu_clock_sample(which_clock,
 327                                                 current, &rtn);
 328                } else {
 329                        read_lock(&tasklist_lock);
 330                        error = cpu_clock_sample_group(which_clock,
 331                                                       current, &rtn);
 332                        read_unlock(&tasklist_lock);
 333                }
 334        } else {
 335                /*
 336                 * Find the given PID, and validate that the caller
 337                 * should be able to see it.
 338                 */
 339                struct task_struct *p;
 340                rcu_read_lock();
 341                p = find_task_by_vpid(pid);
 342                if (p) {
 343                        if (CPUCLOCK_PERTHREAD(which_clock)) {
 344                                if (same_thread_group(p, current)) {
 345                                        error = cpu_clock_sample(which_clock,
 346                                                                 p, &rtn);
 347                                }
 348                        } else {
 349                                read_lock(&tasklist_lock);
 350                                if (thread_group_leader(p) && p->sighand) {
 351                                        error =
 352                                            cpu_clock_sample_group(which_clock,
 353                                                                   p, &rtn);
 354                                }
 355                                read_unlock(&tasklist_lock);
 356                        }
 357                }
 358                rcu_read_unlock();
 359        }
 360
 361        if (error)
 362                return error;
 363        sample_to_timespec(which_clock, rtn, tp);
 364        return 0;
 365}
 366
 367
 368/*
 369 * Validate the clockid_t for a new CPU-clock timer, and initialize the timer.
 370 * This is called from sys_timer_create() and do_cpu_nanosleep() with the
 371 * new timer already all-zeros initialized.
 372 */
 373static int posix_cpu_timer_create(struct k_itimer *new_timer)
 374{
 375        int ret = 0;
 376        const pid_t pid = CPUCLOCK_PID(new_timer->it_clock);
 377        struct task_struct *p;
 378
 379        if (CPUCLOCK_WHICH(new_timer->it_clock) >= CPUCLOCK_MAX)
 380                return -EINVAL;
 381
 382        INIT_LIST_HEAD(&new_timer->it.cpu.entry);
 383
 384        rcu_read_lock();
 385        if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) {
 386                if (pid == 0) {
 387                        p = current;
 388                } else {
 389                        p = find_task_by_vpid(pid);
 390                        if (p && !same_thread_group(p, current))
 391                                p = NULL;
 392                }
 393        } else {
 394                if (pid == 0) {
 395                        p = current->group_leader;
 396                } else {
 397                        p = find_task_by_vpid(pid);
 398                        if (p && !has_group_leader_pid(p))
 399                                p = NULL;
 400                }
 401        }
 402        new_timer->it.cpu.task = p;
 403        if (p) {
 404                get_task_struct(p);
 405        } else {
 406                ret = -EINVAL;
 407        }
 408        rcu_read_unlock();
 409
 410        return ret;
 411}
 412
 413/*
 414 * Clean up a CPU-clock timer that is about to be destroyed.
 415 * This is called from timer deletion with the timer already locked.
 416 * If we return TIMER_RETRY, it's necessary to release the timer's lock
 417 * and try again.  (This happens when the timer is in the middle of firing.)
 418 */
 419static int posix_cpu_timer_del(struct k_itimer *timer)
 420{
 421        struct task_struct *p = timer->it.cpu.task;
 422        int ret = 0;
 423
 424        if (likely(p != NULL)) {
 425                read_lock(&tasklist_lock);
 426                if (unlikely(p->sighand == NULL)) {
 427                        /*
 428                         * We raced with the reaping of the task.
 429                         * The deletion should have cleared us off the list.
 430                         */
 431                        BUG_ON(!list_empty(&timer->it.cpu.entry));
 432                } else {
 433                        spin_lock(&p->sighand->siglock);
 434                        if (timer->it.cpu.firing)
 435                                ret = TIMER_RETRY;
 436                        else
 437                                list_del(&timer->it.cpu.entry);
 438                        spin_unlock(&p->sighand->siglock);
 439                }
 440                read_unlock(&tasklist_lock);
 441
 442                if (!ret)
 443                        put_task_struct(p);
 444        }
 445
 446        return ret;
 447}
 448
 449/*
 450 * Clean out CPU timers still ticking when a thread exited.  The task
 451 * pointer is cleared, and the expiry time is replaced with the residual
 452 * time for later timer_gettime calls to return.
 453 * This must be called with the siglock held.
 454 */
 455static void cleanup_timers(struct list_head *head,
 456                           cputime_t utime, cputime_t stime,
 457                           unsigned long long sum_exec_runtime)
 458{
 459        struct cpu_timer_list *timer, *next;
 460        cputime_t ptime = utime + stime;
 461
 462        list_for_each_entry_safe(timer, next, head, entry) {
 463                list_del_init(&timer->entry);
 464                if (timer->expires.cpu < ptime) {
 465                        timer->expires.cpu = 0;
 466                } else {
 467                        timer->expires.cpu -= ptime;
 468                }
 469        }
 470
 471        ++head;
 472        list_for_each_entry_safe(timer, next, head, entry) {
 473                list_del_init(&timer->entry);
 474                if (timer->expires.cpu < utime) {
 475                        timer->expires.cpu = 0;
 476                } else {
 477                        timer->expires.cpu -= utime;
 478                }
 479        }
 480
 481        ++head;
 482        list_for_each_entry_safe(timer, next, head, entry) {
 483                list_del_init(&timer->entry);
 484                if (timer->expires.sched < sum_exec_runtime) {
 485                        timer->expires.sched = 0;
 486                } else {
 487                        timer->expires.sched -= sum_exec_runtime;
 488                }
 489        }
 490}
 491
 492/*
 493 * These are both called with the siglock held, when the current thread
 494 * is being reaped.  When the final (leader) thread in the group is reaped,
 495 * posix_cpu_timers_exit_group will be called after posix_cpu_timers_exit.
 496 */
 497void posix_cpu_timers_exit(struct task_struct *tsk)
 498{
 499        cputime_t utime, stime;
 500
 501        add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
 502                                                sizeof(unsigned long long));
 503        task_cputime(tsk, &utime, &stime);
 504        cleanup_timers(tsk->cpu_timers,
 505                       utime, stime, tsk->se.sum_exec_runtime);
 506
 507}
 508void posix_cpu_timers_exit_group(struct task_struct *tsk)
 509{
 510        struct signal_struct *const sig = tsk->signal;
 511        cputime_t utime, stime;
 512
 513        task_cputime(tsk, &utime, &stime);
 514        cleanup_timers(tsk->signal->cpu_timers,
 515                       utime + sig->utime, stime + sig->stime,
 516                       tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
 517}
 518
 519static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
 520{
 521        /*
 522         * That's all for this thread or process.
 523         * We leave our residual in expires to be reported.
 524         */
 525        put_task_struct(timer->it.cpu.task);
 526        timer->it.cpu.task = NULL;
 527        timer->it.cpu.expires = cpu_time_sub(timer->it_clock,
 528                                             timer->it.cpu.expires,
 529                                             now);
 530}
 531
 532static inline int expires_gt(cputime_t expires, cputime_t new_exp)
 533{
 534        return expires == 0 || expires > new_exp;
 535}
 536
 537/*
 538 * Insert the timer on the appropriate list before any timers that
 539 * expire later.  This must be called with the tasklist_lock held
 540 * for reading, interrupts disabled and p->sighand->siglock taken.
 541 */
 542static void arm_timer(struct k_itimer *timer)
 543{
 544        struct task_struct *p = timer->it.cpu.task;
 545        struct list_head *head, *listpos;
 546        struct task_cputime *cputime_expires;
 547        struct cpu_timer_list *const nt = &timer->it.cpu;
 548        struct cpu_timer_list *next;
 549
 550        if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
 551                head = p->cpu_timers;
 552                cputime_expires = &p->cputime_expires;
 553        } else {
 554                head = p->signal->cpu_timers;
 555                cputime_expires = &p->signal->cputime_expires;
 556        }
 557        head += CPUCLOCK_WHICH(timer->it_clock);
 558
 559        listpos = head;
 560        list_for_each_entry(next, head, entry) {
 561                if (cpu_time_before(timer->it_clock, nt->expires, next->expires))
 562                        break;
 563                listpos = &next->entry;
 564        }
 565        list_add(&nt->entry, listpos);
 566
 567        if (listpos == head) {
 568                union cpu_time_count *exp = &nt->expires;
 569
 570                /*
 571                 * We are the new earliest-expiring POSIX 1.b timer, hence
 572                 * need to update expiration cache. Take into account that
 573                 * for process timers we share expiration cache with itimers
 574                 * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME.
 575                 */
 576
 577                switch (CPUCLOCK_WHICH(timer->it_clock)) {
 578                case CPUCLOCK_PROF:
 579                        if (expires_gt(cputime_expires->prof_exp, exp->cpu))
 580                                cputime_expires->prof_exp = exp->cpu;
 581                        break;
 582                case CPUCLOCK_VIRT:
 583                        if (expires_gt(cputime_expires->virt_exp, exp->cpu))
 584                                cputime_expires->virt_exp = exp->cpu;
 585                        break;
 586                case CPUCLOCK_SCHED:
 587                        if (cputime_expires->sched_exp == 0 ||
 588                            cputime_expires->sched_exp > exp->sched)
 589                                cputime_expires->sched_exp = exp->sched;
 590                        break;
 591                }
 592        }
 593}
 594
 595/*
 596 * The timer is locked, fire it and arrange for its reload.
 597 */
 598static void cpu_timer_fire(struct k_itimer *timer)
 599{
 600        if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
 601                /*
 602                 * User don't want any signal.
 603                 */
 604                timer->it.cpu.expires.sched = 0;
 605        } else if (unlikely(timer->sigq == NULL)) {
 606                /*
 607                 * This a special case for clock_nanosleep,
 608                 * not a normal timer from sys_timer_create.
 609                 */
 610                wake_up_process(timer->it_process);
 611                timer->it.cpu.expires.sched = 0;
 612        } else if (timer->it.cpu.incr.sched == 0) {
 613                /*
 614                 * One-shot timer.  Clear it as soon as it's fired.
 615                 */
 616                posix_timer_event(timer, 0);
 617                timer->it.cpu.expires.sched = 0;
 618        } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) {
 619                /*
 620                 * The signal did not get queued because the signal
 621                 * was ignored, so we won't get any callback to
 622                 * reload the timer.  But we need to keep it
 623                 * ticking in case the signal is deliverable next time.
 624                 */
 625                posix_cpu_timer_schedule(timer);
 626        }
 627}
 628
 629/*
 630 * Sample a process (thread group) timer for the given group_leader task.
 631 * Must be called with tasklist_lock held for reading.
 632 */
 633static int cpu_timer_sample_group(const clockid_t which_clock,
 634                                  struct task_struct *p,
 635                                  union cpu_time_count *cpu)
 636{
 637        struct task_cputime cputime;
 638
 639        thread_group_cputimer(p, &cputime);
 640        switch (CPUCLOCK_WHICH(which_clock)) {
 641        default:
 642                return -EINVAL;
 643        case CPUCLOCK_PROF:
 644                cpu->cpu = cputime.utime + cputime.stime;
 645                break;
 646        case CPUCLOCK_VIRT:
 647                cpu->cpu = cputime.utime;
 648                break;
 649        case CPUCLOCK_SCHED:
 650                cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
 651                break;
 652        }
 653        return 0;
 654}
 655
 656#ifdef CONFIG_NO_HZ_FULL
 657static void nohz_kick_work_fn(struct work_struct *work)
 658{
 659        tick_nohz_full_kick_all();
 660}
 661
 662static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn);
 663
 664/*
 665 * We need the IPIs to be sent from sane process context.
 666 * The posix cpu timers are always set with irqs disabled.
 667 */
 668static void posix_cpu_timer_kick_nohz(void)
 669{
 670        schedule_work(&nohz_kick_work);
 671}
 672
 673bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
 674{
 675        if (!task_cputime_zero(&tsk->cputime_expires))
 676                return false;
 677
 678        if (tsk->signal->cputimer.running)
 679                return false;
 680
 681        return true;
 682}
 683#else
 684static inline void posix_cpu_timer_kick_nohz(void) { }
 685#endif
 686
 687/*
 688 * Guts of sys_timer_settime for CPU timers.
 689 * This is called with the timer locked and interrupts disabled.
 690 * If we return TIMER_RETRY, it's necessary to release the timer's lock
 691 * and try again.  (This happens when the timer is in the middle of firing.)
 692 */
 693static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
 694                               struct itimerspec *new, struct itimerspec *old)
 695{
 696        struct task_struct *p = timer->it.cpu.task;
 697        union cpu_time_count old_expires, new_expires, old_incr, val;
 698        int ret;
 699
 700        if (unlikely(p == NULL)) {
 701                /*
 702                 * Timer refers to a dead task's clock.
 703                 */
 704                return -ESRCH;
 705        }
 706
 707        new_expires = timespec_to_sample(timer->it_clock, &new->it_value);
 708
 709        read_lock(&tasklist_lock);
 710        /*
 711         * We need the tasklist_lock to protect against reaping that
 712         * clears p->sighand.  If p has just been reaped, we can no
 713         * longer get any information about it at all.
 714         */
 715        if (unlikely(p->sighand == NULL)) {
 716                read_unlock(&tasklist_lock);
 717                put_task_struct(p);
 718                timer->it.cpu.task = NULL;
 719                return -ESRCH;
 720        }
 721
 722        /*
 723         * Disarm any old timer after extracting its expiry time.
 724         */
 725        BUG_ON(!irqs_disabled());
 726
 727        ret = 0;
 728        old_incr = timer->it.cpu.incr;
 729        spin_lock(&p->sighand->siglock);
 730        old_expires = timer->it.cpu.expires;
 731        if (unlikely(timer->it.cpu.firing)) {
 732                timer->it.cpu.firing = -1;
 733                ret = TIMER_RETRY;
 734        } else
 735                list_del_init(&timer->it.cpu.entry);
 736
 737        /*
 738         * We need to sample the current value to convert the new
 739         * value from to relative and absolute, and to convert the
 740         * old value from absolute to relative.  To set a process
 741         * timer, we need a sample to balance the thread expiry
 742         * times (in arm_timer).  With an absolute time, we must
 743         * check if it's already passed.  In short, we need a sample.
 744         */
 745        if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
 746                cpu_clock_sample(timer->it_clock, p, &val);
 747        } else {
 748                cpu_timer_sample_group(timer->it_clock, p, &val);
 749        }
 750
 751        if (old) {
 752                if (old_expires.sched == 0) {
 753                        old->it_value.tv_sec = 0;
 754                        old->it_value.tv_nsec = 0;
 755                } else {
 756                        /*
 757                         * Update the timer in case it has
 758                         * overrun already.  If it has,
 759                         * we'll report it as having overrun
 760                         * and with the next reloaded timer
 761                         * already ticking, though we are
 762                         * swallowing that pending
 763                         * notification here to install the
 764                         * new setting.
 765                         */
 766                        bump_cpu_timer(timer, val);
 767                        if (cpu_time_before(timer->it_clock, val,
 768                                            timer->it.cpu.expires)) {
 769                                old_expires = cpu_time_sub(
 770                                        timer->it_clock,
 771                                        timer->it.cpu.expires, val);
 772                                sample_to_timespec(timer->it_clock,
 773                                                   old_expires,
 774                                                   &old->it_value);
 775                        } else {
 776                                old->it_value.tv_nsec = 1;
 777                                old->it_value.tv_sec = 0;
 778                        }
 779                }
 780        }
 781
 782        if (unlikely(ret)) {
 783                /*
 784                 * We are colliding with the timer actually firing.
 785                 * Punt after filling in the timer's old value, and
 786                 * disable this firing since we are already reporting
 787                 * it as an overrun (thanks to bump_cpu_timer above).
 788                 */
 789                spin_unlock(&p->sighand->siglock);
 790                read_unlock(&tasklist_lock);
 791                goto out;
 792        }
 793
 794        if (new_expires.sched != 0 && !(flags & TIMER_ABSTIME)) {
 795                cpu_time_add(timer->it_clock, &new_expires, val);
 796        }
 797
 798        /*
 799         * Install the new expiry time (or zero).
 800         * For a timer with no notification action, we don't actually
 801         * arm the timer (we'll just fake it for timer_gettime).
 802         */
 803        timer->it.cpu.expires = new_expires;
 804        if (new_expires.sched != 0 &&
 805            cpu_time_before(timer->it_clock, val, new_expires)) {
 806                arm_timer(timer);
 807        }
 808
 809        spin_unlock(&p->sighand->siglock);
 810        read_unlock(&tasklist_lock);
 811
 812        /*
 813         * Install the new reload setting, and
 814         * set up the signal and overrun bookkeeping.
 815         */
 816        timer->it.cpu.incr = timespec_to_sample(timer->it_clock,
 817                                                &new->it_interval);
 818
 819        /*
 820         * This acts as a modification timestamp for the timer,
 821         * so any automatic reload attempt will punt on seeing
 822         * that we have reset the timer manually.
 823         */
 824        timer->it_requeue_pending = (timer->it_requeue_pending + 2) &
 825                ~REQUEUE_PENDING;
 826        timer->it_overrun_last = 0;
 827        timer->it_overrun = -1;
 828
 829        if (new_expires.sched != 0 &&
 830            !cpu_time_before(timer->it_clock, val, new_expires)) {
 831                /*
 832                 * The designated time already passed, so we notify
 833                 * immediately, even if the thread never runs to
 834                 * accumulate more time on this clock.
 835                 */
 836                cpu_timer_fire(timer);
 837        }
 838
 839        ret = 0;
 840 out:
 841        if (old) {
 842                sample_to_timespec(timer->it_clock,
 843                                   old_incr, &old->it_interval);
 844        }
 845        if (!ret)
 846                posix_cpu_timer_kick_nohz();
 847        return ret;
 848}
 849
 850static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
 851{
 852        union cpu_time_count now;
 853        struct task_struct *p = timer->it.cpu.task;
 854        int clear_dead;
 855
 856        /*
 857         * Easy part: convert the reload time.
 858         */
 859        sample_to_timespec(timer->it_clock,
 860                           timer->it.cpu.incr, &itp->it_interval);
 861
 862        if (timer->it.cpu.expires.sched == 0) { /* Timer not armed at all.  */
 863                itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
 864                return;
 865        }
 866
 867        if (unlikely(p == NULL)) {
 868                /*
 869                 * This task already died and the timer will never fire.
 870                 * In this case, expires is actually the dead value.
 871                 */
 872        dead:
 873                sample_to_timespec(timer->it_clock, timer->it.cpu.expires,
 874                                   &itp->it_value);
 875                return;
 876        }
 877
 878        /*
 879         * Sample the clock to take the difference with the expiry time.
 880         */
 881        if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
 882                cpu_clock_sample(timer->it_clock, p, &now);
 883                clear_dead = p->exit_state;
 884        } else {
 885                read_lock(&tasklist_lock);
 886                if (unlikely(p->sighand == NULL)) {
 887                        /*
 888                         * The process has been reaped.
 889                         * We can't even collect a sample any more.
 890                         * Call the timer disarmed, nothing else to do.
 891                         */
 892                        put_task_struct(p);
 893                        timer->it.cpu.task = NULL;
 894                        timer->it.cpu.expires.sched = 0;
 895                        read_unlock(&tasklist_lock);
 896                        goto dead;
 897                } else {
 898                        cpu_timer_sample_group(timer->it_clock, p, &now);
 899                        clear_dead = (unlikely(p->exit_state) &&
 900                                      thread_group_empty(p));
 901                }
 902                read_unlock(&tasklist_lock);
 903        }
 904
 905        if (unlikely(clear_dead)) {
 906                /*
 907                 * We've noticed that the thread is dead, but
 908                 * not yet reaped.  Take this opportunity to
 909                 * drop our task ref.
 910                 */
 911                clear_dead_task(timer, now);
 912                goto dead;
 913        }
 914
 915        if (cpu_time_before(timer->it_clock, now, timer->it.cpu.expires)) {
 916                sample_to_timespec(timer->it_clock,
 917                                   cpu_time_sub(timer->it_clock,
 918                                                timer->it.cpu.expires, now),
 919                                   &itp->it_value);
 920        } else {
 921                /*
 922                 * The timer should have expired already, but the firing
 923                 * hasn't taken place yet.  Say it's just about to expire.
 924                 */
 925                itp->it_value.tv_nsec = 1;
 926                itp->it_value.tv_sec = 0;
 927        }
 928}
 929
 930/*
 931 * Check for any per-thread CPU timers that have fired and move them off
 932 * the tsk->cpu_timers[N] list onto the firing list.  Here we update the
 933 * tsk->it_*_expires values to reflect the remaining thread CPU timers.
 934 */
 935static void check_thread_timers(struct task_struct *tsk,
 936                                struct list_head *firing)
 937{
 938        int maxfire;
 939        struct list_head *timers = tsk->cpu_timers;
 940        struct signal_struct *const sig = tsk->signal;
 941        unsigned long soft;
 942
 943        maxfire = 20;
 944        tsk->cputime_expires.prof_exp = 0;
 945        while (!list_empty(timers)) {
 946                struct cpu_timer_list *t = list_first_entry(timers,
 947                                                      struct cpu_timer_list,
 948                                                      entry);
 949                if (!--maxfire || prof_ticks(tsk) < t->expires.cpu) {
 950                        tsk->cputime_expires.prof_exp = t->expires.cpu;
 951                        break;
 952                }
 953                t->firing = 1;
 954                list_move_tail(&t->entry, firing);
 955        }
 956
 957        ++timers;
 958        maxfire = 20;
 959        tsk->cputime_expires.virt_exp = 0;
 960        while (!list_empty(timers)) {
 961                struct cpu_timer_list *t = list_first_entry(timers,
 962                                                      struct cpu_timer_list,
 963                                                      entry);
 964                if (!--maxfire || virt_ticks(tsk) < t->expires.cpu) {
 965                        tsk->cputime_expires.virt_exp = t->expires.cpu;
 966                        break;
 967                }
 968                t->firing = 1;
 969                list_move_tail(&t->entry, firing);
 970        }
 971
 972        ++timers;
 973        maxfire = 20;
 974        tsk->cputime_expires.sched_exp = 0;
 975        while (!list_empty(timers)) {
 976                struct cpu_timer_list *t = list_first_entry(timers,
 977                                                      struct cpu_timer_list,
 978                                                      entry);
 979                if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) {
 980                        tsk->cputime_expires.sched_exp = t->expires.sched;
 981                        break;
 982                }
 983                t->firing = 1;
 984                list_move_tail(&t->entry, firing);
 985        }
 986
 987        /*
 988         * Check for the special case thread timers.
 989         */
 990        soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
 991        if (soft != RLIM_INFINITY) {
 992                unsigned long hard =
 993                        ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
 994
 995                if (hard != RLIM_INFINITY &&
 996                    tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
 997                        /*
 998                         * At the hard limit, we just die.
 999                         * No need to calculate anything else now.
1000                         */
1001                        __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
1002                        return;
1003                }
1004                if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
1005                        /*
1006                         * At the soft limit, send a SIGXCPU every second.
1007                         */
1008                        if (soft < hard) {
1009                                soft += USEC_PER_SEC;
1010                                sig->rlim[RLIMIT_RTTIME].rlim_cur = soft;
1011                        }
1012                        printk(KERN_INFO
1013                                "RT Watchdog Timeout: %s[%d]\n",
1014                                tsk->comm, task_pid_nr(tsk));
1015                        __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
1016                }
1017        }
1018}
1019
1020static void stop_process_timers(struct signal_struct *sig)
1021{
1022        struct thread_group_cputimer *cputimer = &sig->cputimer;
1023        unsigned long flags;
1024
1025        raw_spin_lock_irqsave(&cputimer->lock, flags);
1026        cputimer->running = 0;
1027        raw_spin_unlock_irqrestore(&cputimer->lock, flags);
1028}
1029
1030static u32 onecputick;
1031
1032static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1033                             cputime_t *expires, cputime_t cur_time, int signo)
1034{
1035        if (!it->expires)
1036                return;
1037
1038        if (cur_time >= it->expires) {
1039                if (it->incr) {
1040                        it->expires += it->incr;
1041                        it->error += it->incr_error;
1042                        if (it->error >= onecputick) {
1043                                it->expires -= cputime_one_jiffy;
1044                                it->error -= onecputick;
1045                        }
1046                } else {
1047                        it->expires = 0;
1048                }
1049
1050                trace_itimer_expire(signo == SIGPROF ?
1051                                    ITIMER_PROF : ITIMER_VIRTUAL,
1052                                    tsk->signal->leader_pid, cur_time);
1053                __group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
1054        }
1055
1056        if (it->expires && (!*expires || it->expires < *expires)) {
1057                *expires = it->expires;
1058        }
1059}
1060
1061/*
1062 * Check for any per-thread CPU timers that have fired and move them
1063 * off the tsk->*_timers list onto the firing list.  Per-thread timers
1064 * have already been taken off.
1065 */
1066static void check_process_timers(struct task_struct *tsk,
1067                                 struct list_head *firing)
1068{
1069        int maxfire;
1070        struct signal_struct *const sig = tsk->signal;
1071        cputime_t utime, ptime, virt_expires, prof_expires;
1072        unsigned long long sum_sched_runtime, sched_expires;
1073        struct list_head *timers = sig->cpu_timers;
1074        struct task_cputime cputime;
1075        unsigned long soft;
1076
1077        /*
1078         * Collect the current process totals.
1079         */
1080        thread_group_cputimer(tsk, &cputime);
1081        utime = cputime.utime;
1082        ptime = utime + cputime.stime;
1083        sum_sched_runtime = cputime.sum_exec_runtime;
1084        maxfire = 20;
1085        prof_expires = 0;
1086        while (!list_empty(timers)) {
1087                struct cpu_timer_list *tl = list_first_entry(timers,
1088                                                      struct cpu_timer_list,
1089                                                      entry);
1090                if (!--maxfire || ptime < tl->expires.cpu) {
1091                        prof_expires = tl->expires.cpu;
1092                        break;
1093                }
1094                tl->firing = 1;
1095                list_move_tail(&tl->entry, firing);
1096        }
1097
1098        ++timers;
1099        maxfire = 20;
1100        virt_expires = 0;
1101        while (!list_empty(timers)) {
1102                struct cpu_timer_list *tl = list_first_entry(timers,
1103                                                      struct cpu_timer_list,
1104                                                      entry);
1105                if (!--maxfire || utime < tl->expires.cpu) {
1106                        virt_expires = tl->expires.cpu;
1107                        break;
1108                }
1109                tl->firing = 1;
1110                list_move_tail(&tl->entry, firing);
1111        }
1112
1113        ++timers;
1114        maxfire = 20;
1115        sched_expires = 0;
1116        while (!list_empty(timers)) {
1117                struct cpu_timer_list *tl = list_first_entry(timers,
1118                                                      struct cpu_timer_list,
1119                                                      entry);
1120                if (!--maxfire || sum_sched_runtime < tl->expires.sched) {
1121                        sched_expires = tl->expires.sched;
1122                        break;
1123                }
1124                tl->firing = 1;
1125                list_move_tail(&tl->entry, firing);
1126        }
1127
1128        /*
1129         * Check for the special case process timers.
1130         */
1131        check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_expires, ptime,
1132                         SIGPROF);
1133        check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
1134                         SIGVTALRM);
1135        soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
1136        if (soft != RLIM_INFINITY) {
1137                unsigned long psecs = cputime_to_secs(ptime);
1138                unsigned long hard =
1139                        ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
1140                cputime_t x;
1141                if (psecs >= hard) {
1142                        /*
1143                         * At the hard limit, we just die.
1144                         * No need to calculate anything else now.
1145                         */
1146                        __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
1147                        return;
1148                }
1149                if (psecs >= soft) {
1150                        /*
1151                         * At the soft limit, send a SIGXCPU every second.
1152                         */
1153                        __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
1154                        if (soft < hard) {
1155                                soft++;
1156                                sig->rlim[RLIMIT_CPU].rlim_cur = soft;
1157                        }
1158                }
1159                x = secs_to_cputime(soft);
1160                if (!prof_expires || x < prof_expires) {
1161                        prof_expires = x;
1162                }
1163        }
1164
1165        sig->cputime_expires.prof_exp = prof_expires;
1166        sig->cputime_expires.virt_exp = virt_expires;
1167        sig->cputime_expires.sched_exp = sched_expires;
1168        if (task_cputime_zero(&sig->cputime_expires))
1169                stop_process_timers(sig);
1170}
1171
1172/*
1173 * This is called from the signal code (via do_schedule_next_timer)
1174 * when the last timer signal was delivered and we have to reload the timer.
1175 */
1176void posix_cpu_timer_schedule(struct k_itimer *timer)
1177{
1178        struct task_struct *p = timer->it.cpu.task;
1179        union cpu_time_count now;
1180
1181        if (unlikely(p == NULL))
1182                /*
1183                 * The task was cleaned up already, no future firings.
1184                 */
1185                goto out;
1186
1187        /*
1188         * Fetch the current sample and update the timer's expiry time.
1189         */
1190        if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
1191                cpu_clock_sample(timer->it_clock, p, &now);
1192                bump_cpu_timer(timer, now);
1193                if (unlikely(p->exit_state)) {
1194                        clear_dead_task(timer, now);
1195                        goto out;
1196                }
1197                read_lock(&tasklist_lock); /* arm_timer needs it.  */
1198                spin_lock(&p->sighand->siglock);
1199        } else {
1200                read_lock(&tasklist_lock);
1201                if (unlikely(p->sighand == NULL)) {
1202                        /*
1203                         * The process has been reaped.
1204                         * We can't even collect a sample any more.
1205                         */
1206                        put_task_struct(p);
1207                        timer->it.cpu.task = p = NULL;
1208                        timer->it.cpu.expires.sched = 0;
1209                        goto out_unlock;
1210                } else if (unlikely(p->exit_state) && thread_group_empty(p)) {
1211                        /*
1212                         * We've noticed that the thread is dead, but
1213                         * not yet reaped.  Take this opportunity to
1214                         * drop our task ref.
1215                         */
1216                        clear_dead_task(timer, now);
1217                        goto out_unlock;
1218                }
1219                spin_lock(&p->sighand->siglock);
1220                cpu_timer_sample_group(timer->it_clock, p, &now);
1221                bump_cpu_timer(timer, now);
1222                /* Leave the tasklist_lock locked for the call below.  */
1223        }
1224
1225        /*
1226         * Now re-arm for the new expiry time.
1227         */
1228        BUG_ON(!irqs_disabled());
1229        arm_timer(timer);
1230        spin_unlock(&p->sighand->siglock);
1231
1232out_unlock:
1233        read_unlock(&tasklist_lock);
1234
1235out:
1236        timer->it_overrun_last = timer->it_overrun;
1237        timer->it_overrun = -1;
1238        ++timer->it_requeue_pending;
1239}
1240
1241/**
1242 * task_cputime_expired - Compare two task_cputime entities.
1243 *
1244 * @sample:     The task_cputime structure to be checked for expiration.
1245 * @expires:    Expiration times, against which @sample will be checked.
1246 *
1247 * Checks @sample against @expires to see if any field of @sample has expired.
1248 * Returns true if any field of the former is greater than the corresponding
1249 * field of the latter if the latter field is set.  Otherwise returns false.
1250 */
1251static inline int task_cputime_expired(const struct task_cputime *sample,
1252                                        const struct task_cputime *expires)
1253{
1254        if (expires->utime && sample->utime >= expires->utime)
1255                return 1;
1256        if (expires->stime && sample->utime + sample->stime >= expires->stime)
1257                return 1;
1258        if (expires->sum_exec_runtime != 0 &&
1259            sample->sum_exec_runtime >= expires->sum_exec_runtime)
1260                return 1;
1261        return 0;
1262}
1263
1264/**
1265 * fastpath_timer_check - POSIX CPU timers fast path.
1266 *
1267 * @tsk:        The task (thread) being checked.
1268 *
1269 * Check the task and thread group timers.  If both are zero (there are no
1270 * timers set) return false.  Otherwise snapshot the task and thread group
1271 * timers and compare them with the corresponding expiration times.  Return
1272 * true if a timer has expired, else return false.
1273 */
1274static inline int fastpath_timer_check(struct task_struct *tsk)
1275{
1276        struct signal_struct *sig;
1277        cputime_t utime, stime;
1278
1279        task_cputime(tsk, &utime, &stime);
1280
1281        if (!task_cputime_zero(&tsk->cputime_expires)) {
1282                struct task_cputime task_sample = {
1283                        .utime = utime,
1284                        .stime = stime,
1285                        .sum_exec_runtime = tsk->se.sum_exec_runtime
1286                };
1287
1288                if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
1289                        return 1;
1290        }
1291
1292        sig = tsk->signal;
1293        if (sig->cputimer.running) {
1294                struct task_cputime group_sample;
1295
1296                raw_spin_lock(&sig->cputimer.lock);
1297                group_sample = sig->cputimer.cputime;
1298                raw_spin_unlock(&sig->cputimer.lock);
1299
1300                if (task_cputime_expired(&group_sample, &sig->cputime_expires))
1301                        return 1;
1302        }
1303
1304        return 0;
1305}
1306
1307/*
1308 * This is called from the timer interrupt handler.  The irq handler has
1309 * already updated our counts.  We need to check if any timers fire now.
1310 * Interrupts are disabled.
1311 */
1312void run_posix_cpu_timers(struct task_struct *tsk)
1313{
1314        LIST_HEAD(firing);
1315        struct k_itimer *timer, *next;
1316        unsigned long flags;
1317
1318        BUG_ON(!irqs_disabled());
1319
1320        /*
1321         * The fast path checks that there are no expired thread or thread
1322         * group timers.  If that's so, just return.
1323         */
1324        if (!fastpath_timer_check(tsk))
1325                return;
1326
1327        if (!lock_task_sighand(tsk, &flags))
1328                return;
1329        /*
1330         * Here we take off tsk->signal->cpu_timers[N] and
1331         * tsk->cpu_timers[N] all the timers that are firing, and
1332         * put them on the firing list.
1333         */
1334        check_thread_timers(tsk, &firing);
1335        /*
1336         * If there are any active process wide timers (POSIX 1.b, itimers,
1337         * RLIMIT_CPU) cputimer must be running.
1338         */
1339        if (tsk->signal->cputimer.running)
1340                check_process_timers(tsk, &firing);
1341
1342        /*
1343         * We must release these locks before taking any timer's lock.
1344         * There is a potential race with timer deletion here, as the
1345         * siglock now protects our private firing list.  We have set
1346         * the firing flag in each timer, so that a deletion attempt
1347         * that gets the timer lock before we do will give it up and
1348         * spin until we've taken care of that timer below.
1349         */
1350        unlock_task_sighand(tsk, &flags);
1351
1352        /*
1353         * Now that all the timers on our list have the firing flag,
1354         * no one will touch their list entries but us.  We'll take
1355         * each timer's lock before clearing its firing flag, so no
1356         * timer call will interfere.
1357         */
1358        list_for_each_entry_safe(timer, next, &firing, it.cpu.entry) {
1359                int cpu_firing;
1360
1361                spin_lock(&timer->it_lock);
1362                list_del_init(&timer->it.cpu.entry);
1363                cpu_firing = timer->it.cpu.firing;
1364                timer->it.cpu.firing = 0;
1365                /*
1366                 * The firing flag is -1 if we collided with a reset
1367                 * of the timer, which already reported this
1368                 * almost-firing as an overrun.  So don't generate an event.
1369                 */
1370                if (likely(cpu_firing >= 0))
1371                        cpu_timer_fire(timer);
1372                spin_unlock(&timer->it_lock);
1373        }
1374
1375        /*
1376         * In case some timers were rescheduled after the queue got emptied,
1377         * wake up full dynticks CPUs.
1378         */
1379        if (tsk->signal->cputimer.running)
1380                posix_cpu_timer_kick_nohz();
1381}
1382
1383/*
1384 * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
1385 * The tsk->sighand->siglock must be held by the caller.
1386 */
1387void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1388                           cputime_t *newval, cputime_t *oldval)
1389{
1390        union cpu_time_count now;
1391
1392        BUG_ON(clock_idx == CPUCLOCK_SCHED);
1393        cpu_timer_sample_group(clock_idx, tsk, &now);
1394
1395        if (oldval) {
1396                /*
1397                 * We are setting itimer. The *oldval is absolute and we update
1398                 * it to be relative, *newval argument is relative and we update
1399                 * it to be absolute.
1400                 */
1401                if (*oldval) {
1402                        if (*oldval <= now.cpu) {
1403                                /* Just about to fire. */
1404                                *oldval = cputime_one_jiffy;
1405                        } else {
1406                                *oldval -= now.cpu;
1407                        }
1408                }
1409
1410                if (!*newval)
1411                        goto out;
1412                *newval += now.cpu;
1413        }
1414
1415        /*
1416         * Update expiration cache if we are the earliest timer, or eventually
1417         * RLIMIT_CPU limit is earlier than prof_exp cpu timer expire.
1418         */
1419        switch (clock_idx) {
1420        case CPUCLOCK_PROF:
1421                if (expires_gt(tsk->signal->cputime_expires.prof_exp, *newval))
1422                        tsk->signal->cputime_expires.prof_exp = *newval;
1423                break;
1424        case CPUCLOCK_VIRT:
1425                if (expires_gt(tsk->signal->cputime_expires.virt_exp, *newval))
1426                        tsk->signal->cputime_expires.virt_exp = *newval;
1427                break;
1428        }
1429out:
1430        posix_cpu_timer_kick_nohz();
1431}
1432
1433static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1434                            struct timespec *rqtp, struct itimerspec *it)
1435{
1436        struct k_itimer timer;
1437        int error;
1438
1439        /*
1440         * Set up a temporary timer and then wait for it to go off.
1441         */
1442        memset(&timer, 0, sizeof timer);
1443        spin_lock_init(&timer.it_lock);
1444        timer.it_clock = which_clock;
1445        timer.it_overrun = -1;
1446        error = posix_cpu_timer_create(&timer);
1447        timer.it_process = current;
1448        if (!error) {
1449                static struct itimerspec zero_it;
1450
1451                memset(it, 0, sizeof *it);
1452                it->it_value = *rqtp;
1453
1454                spin_lock_irq(&timer.it_lock);
1455                error = posix_cpu_timer_set(&timer, flags, it, NULL);
1456                if (error) {
1457                        spin_unlock_irq(&timer.it_lock);
1458                        return error;
1459                }
1460
1461                while (!signal_pending(current)) {
1462                        if (timer.it.cpu.expires.sched == 0) {
1463                                /*
1464                                 * Our timer fired and was reset, below
1465                                 * deletion can not fail.
1466                                 */
1467                                posix_cpu_timer_del(&timer);
1468                                spin_unlock_irq(&timer.it_lock);
1469                                return 0;
1470                        }
1471
1472                        /*
1473                         * Block until cpu_timer_fire (or a signal) wakes us.
1474                         */
1475                        __set_current_state(TASK_INTERRUPTIBLE);
1476                        spin_unlock_irq(&timer.it_lock);
1477                        schedule();
1478                        spin_lock_irq(&timer.it_lock);
1479                }
1480
1481                /*
1482                 * We were interrupted by a signal.
1483                 */
1484                sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp);
1485                error = posix_cpu_timer_set(&timer, 0, &zero_it, it);
1486                if (!error) {
1487                        /*
1488                         * Timer is now unarmed, deletion can not fail.
1489                         */
1490                        posix_cpu_timer_del(&timer);
1491                }
1492                spin_unlock_irq(&timer.it_lock);
1493
1494                while (error == TIMER_RETRY) {
1495                        /*
1496                         * We need to handle case when timer was or is in the
1497                         * middle of firing. In other cases we already freed
1498                         * resources.
1499                         */
1500                        spin_lock_irq(&timer.it_lock);
1501                        error = posix_cpu_timer_del(&timer);
1502                        spin_unlock_irq(&timer.it_lock);
1503                }
1504
1505                if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) {
1506                        /*
1507                         * It actually did fire already.
1508                         */
1509                        return 0;
1510                }
1511
1512                error = -ERESTART_RESTARTBLOCK;
1513        }
1514
1515        return error;
1516}
1517
1518static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
1519
1520static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1521                            struct timespec *rqtp, struct timespec __user *rmtp)
1522{
1523        struct restart_block *restart_block =
1524                &current_thread_info()->restart_block;
1525        struct itimerspec it;
1526        int error;
1527
1528        /*
1529         * Diagnose required errors first.
1530         */
1531        if (CPUCLOCK_PERTHREAD(which_clock) &&
1532            (CPUCLOCK_PID(which_clock) == 0 ||
1533             CPUCLOCK_PID(which_clock) == current->pid))
1534                return -EINVAL;
1535
1536        error = do_cpu_nanosleep(which_clock, flags, rqtp, &it);
1537
1538        if (error == -ERESTART_RESTARTBLOCK) {
1539
1540                if (flags & TIMER_ABSTIME)
1541                        return -ERESTARTNOHAND;
1542                /*
1543                 * Report back to the user the time still remaining.
1544                 */
1545                if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
1546                        return -EFAULT;
1547
1548                restart_block->fn = posix_cpu_nsleep_restart;
1549                restart_block->nanosleep.clockid = which_clock;
1550                restart_block->nanosleep.rmtp = rmtp;
1551                restart_block->nanosleep.expires = timespec_to_ns(rqtp);
1552        }
1553        return error;
1554}
1555
1556static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
1557{
1558        clockid_t which_clock = restart_block->nanosleep.clockid;
1559        struct timespec t;
1560        struct itimerspec it;
1561        int error;
1562
1563        t = ns_to_timespec(restart_block->nanosleep.expires);
1564
1565        error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it);
1566
1567        if (error == -ERESTART_RESTARTBLOCK) {
1568                struct timespec __user *rmtp = restart_block->nanosleep.rmtp;
1569                /*
1570                 * Report back to the user the time still remaining.
1571                 */
1572                if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
1573                        return -EFAULT;
1574
1575                restart_block->nanosleep.expires = timespec_to_ns(&t);
1576        }
1577        return error;
1578
1579}
1580
1581#define PROCESS_CLOCK   MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED)
1582#define THREAD_CLOCK    MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED)
1583
1584static int process_cpu_clock_getres(const clockid_t which_clock,
1585                                    struct timespec *tp)
1586{
1587        return posix_cpu_clock_getres(PROCESS_CLOCK, tp);
1588}
1589static int process_cpu_clock_get(const clockid_t which_clock,
1590                                 struct timespec *tp)
1591{
1592        return posix_cpu_clock_get(PROCESS_CLOCK, tp);
1593}
1594static int process_cpu_timer_create(struct k_itimer *timer)
1595{
1596        timer->it_clock = PROCESS_CLOCK;
1597        return posix_cpu_timer_create(timer);
1598}
1599static int process_cpu_nsleep(const clockid_t which_clock, int flags,
1600                              struct timespec *rqtp,
1601                              struct timespec __user *rmtp)
1602{
1603        return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp);
1604}
1605static long process_cpu_nsleep_restart(struct restart_block *restart_block)
1606{
1607        return -EINVAL;
1608}
1609static int thread_cpu_clock_getres(const clockid_t which_clock,
1610                                   struct timespec *tp)
1611{
1612        return posix_cpu_clock_getres(THREAD_CLOCK, tp);
1613}
1614static int thread_cpu_clock_get(const clockid_t which_clock,
1615                                struct timespec *tp)
1616{
1617        return posix_cpu_clock_get(THREAD_CLOCK, tp);
1618}
1619static int thread_cpu_timer_create(struct k_itimer *timer)
1620{
1621        timer->it_clock = THREAD_CLOCK;
1622        return posix_cpu_timer_create(timer);
1623}
1624
1625struct k_clock clock_posix_cpu = {
1626        .clock_getres   = posix_cpu_clock_getres,
1627        .clock_set      = posix_cpu_clock_set,
1628        .clock_get      = posix_cpu_clock_get,
1629        .timer_create   = posix_cpu_timer_create,
1630        .nsleep         = posix_cpu_nsleep,
1631        .nsleep_restart = posix_cpu_nsleep_restart,
1632        .timer_set      = posix_cpu_timer_set,
1633        .timer_del      = posix_cpu_timer_del,
1634        .timer_get      = posix_cpu_timer_get,
1635};
1636
1637static __init int init_posix_cpu_timers(void)
1638{
1639        struct k_clock process = {
1640                .clock_getres   = process_cpu_clock_getres,
1641                .clock_get      = process_cpu_clock_get,
1642                .timer_create   = process_cpu_timer_create,
1643                .nsleep         = process_cpu_nsleep,
1644                .nsleep_restart = process_cpu_nsleep_restart,
1645        };
1646        struct k_clock thread = {
1647                .clock_getres   = thread_cpu_clock_getres,
1648                .clock_get      = thread_cpu_clock_get,
1649                .timer_create   = thread_cpu_timer_create,
1650        };
1651        struct timespec ts;
1652
1653        posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
1654        posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
1655
1656        cputime_to_timespec(cputime_one_jiffy, &ts);
1657        onecputick = ts.tv_nsec;
1658        WARN_ON(ts.tv_sec != 0);
1659
1660        return 0;
1661}
1662__initcall(init_posix_cpu_timers);
1663