linux/tools/perf/builtin-sched.c
#include "builtin.h"
#include "perf.h"

#include "util/util.h"
#include "util/evlist.h"
#include "util/cache.h"
#include "util/evsel.h"
#include "util/symbol.h"
#include "util/thread.h"
#include "util/header.h"
#include "util/session.h"
#include "util/tool.h"
#include "util/cloexec.h"
#include "util/thread_map.h"
#include "util/color.h"
#include "util/stat.h"
#include "util/callchain.h"
#include "util/time-utils.h"

#include <subcmd/parse-options.h>
#include "util/trace-event.h"

#include "util/debug.h"

#include <linux/kernel.h>
#include <linux/log2.h>
#include <sys/prctl.h>
#include <sys/resource.h>
#include <inttypes.h>

#include <errno.h>
#include <semaphore.h>
#include <pthread.h>
#include <math.h>
#include <api/fs/fs.h>
#include <linux/time64.h>

#include "sane_ctype.h"

#define PR_SET_NAME             15               /* Set process name */
#define MAX_CPUS                4096
#define COMM_LEN                20
#define SYM_LEN                 129
#define MAX_PID                 1024000

struct sched_atom;

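/*
 * Replay model: each recorded task becomes a task_desc holding an
 * ordered array of sched_atoms (run/sleep/wakeup/migration events),
 * plus the worker thread and semaphores used to re-enact them.
 */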
struct task_desc {
        unsigned long           nr;
        unsigned long           pid;
        char                    comm[COMM_LEN];

        unsigned long           nr_events;
        unsigned long           curr_event;
        struct sched_atom       **atoms;

        pthread_t               thread;
        sem_t                   sleep_sem;

        sem_t                   ready_for_work;
        sem_t                   work_done_sem;

        u64                     cpu_usage;
};

enum sched_event_type {
        SCHED_EVENT_RUN,
        SCHED_EVENT_SLEEP,
        SCHED_EVENT_WAKEUP,
        SCHED_EVENT_MIGRATION,
};

struct sched_atom {
        enum sched_event_type   type;
        int                     specific_wait;
        u64                     timestamp;
        u64                     duration;
        unsigned long           nr;
        sem_t                   *wait_sem;
        struct task_desc        *wakee;
};

#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP"

/* task state bitmask, copied from include/linux/sched.h */
#define TASK_RUNNING            0
#define TASK_INTERRUPTIBLE      1
#define TASK_UNINTERRUPTIBLE    2
#define __TASK_STOPPED          4
#define __TASK_TRACED           8
/* in tsk->exit_state */
#define EXIT_DEAD               16
#define EXIT_ZOMBIE             32
#define EXIT_TRACE              (EXIT_ZOMBIE | EXIT_DEAD)
/* in tsk->state again */
#define TASK_DEAD               64
#define TASK_WAKEKILL           128
#define TASK_WAKING             256
#define TASK_PARKED             512

enum thread_state {
        THREAD_SLEEPING = 0,
        THREAD_WAIT_CPU,
        THREAD_SCHED_IN,
        THREAD_IGNORE
};

struct work_atom {
        struct list_head        list;
        enum thread_state       state;
        u64                     sched_out_time;
        u64                     wake_up_time;
        u64                     sched_in_time;
        u64                     runtime;
};

struct work_atoms {
        struct list_head        work_list;
        struct thread           *thread;
        struct rb_node          node;
        u64                     max_lat;
        u64                     max_lat_at;
        u64                     total_lat;
        u64                     nb_atoms;
        u64                     total_runtime;
        int                     num_merged;
};

typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);

struct perf_sched;

struct trace_sched_handler {
        int (*switch_event)(struct perf_sched *sched, struct perf_evsel *evsel,
                            struct perf_sample *sample, struct machine *machine);

        int (*runtime_event)(struct perf_sched *sched, struct perf_evsel *evsel,
                             struct perf_sample *sample, struct machine *machine);

        int (*wakeup_event)(struct perf_sched *sched, struct perf_evsel *evsel,
                            struct perf_sample *sample, struct machine *machine);

        /* PERF_RECORD_FORK event, not sched_process_fork tracepoint */
        int (*fork_event)(struct perf_sched *sched, union perf_event *event,
                          struct machine *machine);

        int (*migrate_task_event)(struct perf_sched *sched,
                                  struct perf_evsel *evsel,
                                  struct perf_sample *sample,
                                  struct machine *machine);
};

#define COLOR_PIDS PERF_COLOR_BLUE
#define COLOR_CPUS PERF_COLOR_BG_RED

struct perf_sched_map {
        DECLARE_BITMAP(comp_cpus_mask, MAX_CPUS);
        int                     *comp_cpus;
        bool                     comp;
        struct thread_map       *color_pids;
        const char              *color_pids_str;
        struct cpu_map          *color_cpus;
        const char              *color_cpus_str;
        struct cpu_map          *cpus;
        const char              *cpus_str;
};

struct perf_sched {
        struct perf_tool tool;
        const char       *sort_order;
        unsigned long    nr_tasks;
        struct task_desc **pid_to_task;
        struct task_desc **tasks;
        const struct trace_sched_handler *tp_handler;
        pthread_mutex_t  start_work_mutex;
        pthread_mutex_t  work_done_wait_mutex;
        int              profile_cpu;
/*
 * Track the current task - that way we can know whether there are any
 * weird events, such as a task being switched away that is not current.
 */
        int              max_cpu;
        u32              curr_pid[MAX_CPUS];
        struct thread    *curr_thread[MAX_CPUS];
        char             next_shortname1;
        char             next_shortname2;
        unsigned int     replay_repeat;
        unsigned long    nr_run_events;
        unsigned long    nr_sleep_events;
        unsigned long    nr_wakeup_events;
        unsigned long    nr_sleep_corrections;
        unsigned long    nr_run_events_optimized;
        unsigned long    targetless_wakeups;
        unsigned long    multitarget_wakeups;
        unsigned long    nr_runs;
        unsigned long    nr_timestamps;
        unsigned long    nr_unordered_timestamps;
        unsigned long    nr_context_switch_bugs;
        unsigned long    nr_events;
        unsigned long    nr_lost_chunks;
        unsigned long    nr_lost_events;
        u64              run_measurement_overhead;
        u64              sleep_measurement_overhead;
        u64              start_time;
        u64              cpu_usage;
        u64              runavg_cpu_usage;
        u64              parent_cpu_usage;
        u64              runavg_parent_cpu_usage;
        u64              sum_runtime;
        u64              sum_fluct;
        u64              run_avg;
        u64              all_runtime;
        u64              all_count;
        u64              cpu_last_switched[MAX_CPUS];
        struct rb_root   atom_root, sorted_atom_root, merged_atom_root;
        struct list_head sort_list, cmp_pid;
        bool force;
        bool skip_merge;
        struct perf_sched_map map;

        /* options for timehist command */
        bool            summary;
        bool            summary_only;
        bool            idle_hist;
        bool            show_callchain;
        unsigned int    max_stack;
        bool            show_cpu_visual;
        bool            show_wakeups;
        bool            show_next;
        bool            show_migrations;
        bool            show_state;
        u64             skipped_samples;
        const char      *time_str;
        struct perf_time_interval ptime;
        struct perf_time_interval hist_time;
};

/* per thread run time data */
struct thread_runtime {
        u64 last_time;      /* time of previous sched in/out event */
        u64 dt_run;         /* run time */
        u64 dt_sleep;       /* time between CPU access by sleep (off cpu) */
        u64 dt_iowait;      /* time between CPU access by iowait (off cpu) */
        u64 dt_preempt;     /* time between CPU access by preempt (off cpu) */
        u64 dt_delay;       /* time between wakeup and sched-in */
        u64 ready_to_run;   /* time of wakeup */

        struct stats run_stats;
        u64 total_run_time;
        u64 total_sleep_time;
        u64 total_iowait_time;
        u64 total_preempt_time;
        u64 total_delay_time;

        int last_state;
        u64 migrations;
};

/* per event run time data */
struct evsel_runtime {
        u64 *last_time; /* time this event was last seen per cpu */
        u32 ncpu;       /* highest cpu slot allocated */
};

/* per cpu idle time data */
struct idle_thread_runtime {
        struct thread_runtime   tr;
        struct thread           *last_thread;
        struct rb_root          sorted_root;
        struct callchain_root   callchain;
        struct callchain_cursor cursor;
};

/* track idle times per cpu */
static struct thread **idle_threads;
static int idle_max_cpu;
static char idle_comm[] = "<idle>";

static u64 get_nsecs(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);

        return ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec;
}

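/*
 * Busy-loop for 'nsecs', compensating for the calibrated cost of the
 * get_nsecs() measurement itself (run_measurement_overhead).
 */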
static void burn_nsecs(struct perf_sched *sched, u64 nsecs)
{
        u64 T0 = get_nsecs(), T1;

        do {
                T1 = get_nsecs();
        } while (T1 + sched->run_measurement_overhead < T0 + nsecs);
}

static void sleep_nsecs(u64 nsecs)
{
        struct timespec ts;

        ts.tv_nsec = nsecs % 999999999;
        ts.tv_sec = nsecs / 999999999;

        nanosleep(&ts, NULL);
}

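/*
 * Estimate the fixed cost of the timing primitives by taking the
 * minimum over ten trials; replay subtracts these overheads so the
 * simulated run/sleep intervals track the recorded ones more closely.
 */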
static void calibrate_run_measurement_overhead(struct perf_sched *sched)
{
        u64 T0, T1, delta, min_delta = NSEC_PER_SEC;
        int i;

        for (i = 0; i < 10; i++) {
                T0 = get_nsecs();
                burn_nsecs(sched, 0);
                T1 = get_nsecs();
                delta = T1-T0;
                min_delta = min(min_delta, delta);
        }
        sched->run_measurement_overhead = min_delta;

        printf("run measurement overhead: %" PRIu64 " nsecs\n", min_delta);
}

static void calibrate_sleep_measurement_overhead(struct perf_sched *sched)
{
        u64 T0, T1, delta, min_delta = NSEC_PER_SEC;
        int i;

        for (i = 0; i < 10; i++) {
                T0 = get_nsecs();
                sleep_nsecs(10000);
                T1 = get_nsecs();
                delta = T1-T0;
                min_delta = min(min_delta, delta);
        }
        min_delta -= 10000;
        sched->sleep_measurement_overhead = min_delta;

        printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta);
}

static struct sched_atom *
get_new_event(struct task_desc *task, u64 timestamp)
{
        struct sched_atom *event = zalloc(sizeof(*event));
        unsigned long idx = task->nr_events;
        size_t size;

        event->timestamp = timestamp;
        event->nr = idx;

        task->nr_events++;
        size = sizeof(struct sched_atom *) * task->nr_events;
        task->atoms = realloc(task->atoms, size);
        BUG_ON(!task->atoms);

        task->atoms[idx] = event;

        return event;
}

static struct sched_atom *last_event(struct task_desc *task)
{
        if (!task->nr_events)
                return NULL;

        return task->atoms[task->nr_events - 1];
}

static void add_sched_event_run(struct perf_sched *sched, struct task_desc *task,
                                u64 timestamp, u64 duration)
{
        struct sched_atom *event, *curr_event = last_event(task);

        /*
         * optimize an existing RUN event by merging this one
         * into it:
         */
        if (curr_event && curr_event->type == SCHED_EVENT_RUN) {
                sched->nr_run_events_optimized++;
                curr_event->duration += duration;
                return;
        }

        event = get_new_event(task, timestamp);

        event->type = SCHED_EVENT_RUN;
        event->duration = duration;

        sched->nr_run_events++;
}

static void add_sched_event_wakeup(struct perf_sched *sched, struct task_desc *task,
                                   u64 timestamp, struct task_desc *wakee)
{
        struct sched_atom *event, *wakee_event;

        event = get_new_event(task, timestamp);
        event->type = SCHED_EVENT_WAKEUP;
        event->wakee = wakee;

        wakee_event = last_event(wakee);
        if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) {
                sched->targetless_wakeups++;
                return;
        }
        if (wakee_event->wait_sem) {
                sched->multitarget_wakeups++;
                return;
        }

        wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem));
        sem_init(wakee_event->wait_sem, 0, 0);
        wakee_event->specific_wait = 1;
        event->wait_sem = wakee_event->wait_sem;

        sched->nr_wakeup_events++;
}

static void add_sched_event_sleep(struct perf_sched *sched, struct task_desc *task,
                                  u64 timestamp, u64 task_state __maybe_unused)
{
        struct sched_atom *event = get_new_event(task, timestamp);

        event->type = SCHED_EVENT_SLEEP;

        sched->nr_sleep_events++;
}

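/*
 * Look up (or create) the task_desc for a recorded PID. The
 * pid_to_task array is sized from kernel/pid_max and grown on demand
 * when the trace contains a PID beyond the current limit.
 */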
static struct task_desc *register_pid(struct perf_sched *sched,
                                      unsigned long pid, const char *comm)
{
        struct task_desc *task;
        static int pid_max;

        if (sched->pid_to_task == NULL) {
                if (sysctl__read_int("kernel/pid_max", &pid_max) < 0)
                        pid_max = MAX_PID;
                BUG_ON((sched->pid_to_task = calloc(pid_max, sizeof(struct task_desc *))) == NULL);
        }
        if (pid >= (unsigned long)pid_max) {
                BUG_ON((sched->pid_to_task = realloc(sched->pid_to_task, (pid + 1) *
                        sizeof(struct task_desc *))) == NULL);
                while (pid >= (unsigned long)pid_max)
                        sched->pid_to_task[pid_max++] = NULL;
        }

        task = sched->pid_to_task[pid];

        if (task)
                return task;

        task = zalloc(sizeof(*task));
        task->pid = pid;
        task->nr = sched->nr_tasks;
        strcpy(task->comm, comm);
        /*
         * every task starts in sleeping state - this gets ignored
         * if there's no wakeup pointing to this sleep state:
         */
        add_sched_event_sleep(sched, task, 0, 0);

        sched->pid_to_task[pid] = task;
        sched->nr_tasks++;
        sched->tasks = realloc(sched->tasks, sched->nr_tasks * sizeof(struct task_desc *));
        BUG_ON(!sched->tasks);
        sched->tasks[task->nr] = task;

        if (verbose > 0)
                printf("registered task #%ld, PID %ld (%s)\n", sched->nr_tasks, pid, comm);

        return task;
}


static void print_task_traces(struct perf_sched *sched)
{
        struct task_desc *task;
        unsigned long i;

        for (i = 0; i < sched->nr_tasks; i++) {
                task = sched->tasks[i];
                printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
                        task->nr, task->comm, task->pid, task->nr_events);
        }
}

static void add_cross_task_wakeups(struct perf_sched *sched)
{
        struct task_desc *task1, *task2;
        unsigned long i, j;

        for (i = 0; i < sched->nr_tasks; i++) {
                task1 = sched->tasks[i];
                j = i + 1;
                if (j == sched->nr_tasks)
                        j = 0;
                task2 = sched->tasks[j];
                add_sched_event_wakeup(sched, task1, 0, task2);
        }
}

static void perf_sched__process_event(struct perf_sched *sched,
                                      struct sched_atom *atom)
{
        int ret = 0;

        switch (atom->type) {
                case SCHED_EVENT_RUN:
                        burn_nsecs(sched, atom->duration);
                        break;
                case SCHED_EVENT_SLEEP:
                        if (atom->wait_sem)
                                ret = sem_wait(atom->wait_sem);
                        BUG_ON(ret);
                        break;
                case SCHED_EVENT_WAKEUP:
                        if (atom->wait_sem)
                                ret = sem_post(atom->wait_sem);
                        BUG_ON(ret);
                        break;
                case SCHED_EVENT_MIGRATION:
                        break;
                default:
                        BUG_ON(1);
        }
}

static u64 get_cpu_usage_nsec_parent(void)
{
        struct rusage ru;
        u64 sum;
        int err;

        err = getrusage(RUSAGE_SELF, &ru);
        BUG_ON(err);

        sum =  ru.ru_utime.tv_sec * NSEC_PER_SEC + ru.ru_utime.tv_usec * NSEC_PER_USEC;
        sum += ru.ru_stime.tv_sec * NSEC_PER_SEC + ru.ru_stime.tv_usec * NSEC_PER_USEC;

        return sum;
}

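/*
 * Open a per-thread PERF_COUNT_SW_TASK_CLOCK counter so each replay
 * thread can measure its own CPU usage. On EMFILE, -f lets us try to
 * raise RLIMIT_NOFILE by the number of still-unopened counters.
 */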
static int self_open_counters(struct perf_sched *sched, unsigned long cur_task)
{
        struct perf_event_attr attr;
        char sbuf[STRERR_BUFSIZE], info[STRERR_BUFSIZE] = "";
        int fd;
        struct rlimit limit;
        bool need_privilege = false;

        memset(&attr, 0, sizeof(attr));

        attr.type = PERF_TYPE_SOFTWARE;
        attr.config = PERF_COUNT_SW_TASK_CLOCK;

force_again:
        fd = sys_perf_event_open(&attr, 0, -1, -1,
                                 perf_event_open_cloexec_flag());

        if (fd < 0) {
                if (errno == EMFILE) {
                        if (sched->force) {
                                BUG_ON(getrlimit(RLIMIT_NOFILE, &limit) == -1);
                                limit.rlim_cur += sched->nr_tasks - cur_task;
                                if (limit.rlim_cur > limit.rlim_max) {
                                        limit.rlim_max = limit.rlim_cur;
                                        need_privilege = true;
                                }
                                if (setrlimit(RLIMIT_NOFILE, &limit) == -1) {
                                        if (need_privilege && errno == EPERM)
                                                strcpy(info, "Need privilege\n");
                                } else
                                        goto force_again;
                        } else
                                strcpy(info, "Try again with the -f option\n");
                }
                pr_err("Error: sys_perf_event_open() syscall returned "
                       "with %d (%s)\n%s", fd,
                       str_error_r(errno, sbuf, sizeof(sbuf)), info);
                exit(EXIT_FAILURE);
        }
        return fd;
}

static u64 get_cpu_usage_nsec_self(int fd)
{
        u64 runtime;
        int ret;

        ret = read(fd, &runtime, sizeof(runtime));
        BUG_ON(ret != sizeof(runtime));

        return runtime;
}

struct sched_thread_parms {
        struct task_desc  *task;
        struct perf_sched *sched;
        int fd;
};

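/*
 * Body of each replay thread: signal readiness, wait for the parent to
 * release start_work_mutex, replay this task's atoms while measuring
 * CPU usage via the task-clock counter, then loop for the next run.
 */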
static void *thread_func(void *ctx)
{
        struct sched_thread_parms *parms = ctx;
        struct task_desc *this_task = parms->task;
        struct perf_sched *sched = parms->sched;
        u64 cpu_usage_0, cpu_usage_1;
        unsigned long i, ret;
        char comm2[22];
        int fd = parms->fd;

        zfree(&parms);

        sprintf(comm2, ":%s", this_task->comm);
        prctl(PR_SET_NAME, comm2);
        if (fd < 0)
                return NULL;
again:
        ret = sem_post(&this_task->ready_for_work);
        BUG_ON(ret);
        ret = pthread_mutex_lock(&sched->start_work_mutex);
        BUG_ON(ret);
        ret = pthread_mutex_unlock(&sched->start_work_mutex);
        BUG_ON(ret);

        cpu_usage_0 = get_cpu_usage_nsec_self(fd);

        for (i = 0; i < this_task->nr_events; i++) {
                this_task->curr_event = i;
                perf_sched__process_event(sched, this_task->atoms[i]);
        }

        cpu_usage_1 = get_cpu_usage_nsec_self(fd);
        this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
        ret = sem_post(&this_task->work_done_sem);
        BUG_ON(ret);

        ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
        BUG_ON(ret);
        ret = pthread_mutex_unlock(&sched->work_done_wait_mutex);
        BUG_ON(ret);

        goto again;
}

static void create_tasks(struct perf_sched *sched)
{
        struct task_desc *task;
        pthread_attr_t attr;
        unsigned long i;
        int err;

        err = pthread_attr_init(&attr);
        BUG_ON(err);
        err = pthread_attr_setstacksize(&attr,
                        (size_t) max(16 * 1024, PTHREAD_STACK_MIN));
        BUG_ON(err);
        err = pthread_mutex_lock(&sched->start_work_mutex);
        BUG_ON(err);
        err = pthread_mutex_lock(&sched->work_done_wait_mutex);
        BUG_ON(err);
        for (i = 0; i < sched->nr_tasks; i++) {
                struct sched_thread_parms *parms = malloc(sizeof(*parms));
                BUG_ON(parms == NULL);
                parms->task = task = sched->tasks[i];
                parms->sched = sched;
                parms->fd = self_open_counters(sched, i);
                sem_init(&task->sleep_sem, 0, 0);
                sem_init(&task->ready_for_work, 0, 0);
                sem_init(&task->work_done_sem, 0, 0);
                task->curr_event = 0;
                err = pthread_create(&task->thread, &attr, thread_func, parms);
                BUG_ON(err);
        }
}

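/*
 * Parent side of one replay iteration: wait until every worker has
 * posted ready_for_work, release them all, collect work_done_sem
 * posts, and maintain running averages of child and parent CPU usage.
 */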
static void wait_for_tasks(struct perf_sched *sched)
{
        u64 cpu_usage_0, cpu_usage_1;
        struct task_desc *task;
        unsigned long i, ret;

        sched->start_time = get_nsecs();
        sched->cpu_usage = 0;
        pthread_mutex_unlock(&sched->work_done_wait_mutex);

        for (i = 0; i < sched->nr_tasks; i++) {
                task = sched->tasks[i];
                ret = sem_wait(&task->ready_for_work);
                BUG_ON(ret);
                sem_init(&task->ready_for_work, 0, 0);
        }
        ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
        BUG_ON(ret);

        cpu_usage_0 = get_cpu_usage_nsec_parent();

        pthread_mutex_unlock(&sched->start_work_mutex);

        for (i = 0; i < sched->nr_tasks; i++) {
                task = sched->tasks[i];
                ret = sem_wait(&task->work_done_sem);
                BUG_ON(ret);
                sem_init(&task->work_done_sem, 0, 0);
                sched->cpu_usage += task->cpu_usage;
                task->cpu_usage = 0;
        }

        cpu_usage_1 = get_cpu_usage_nsec_parent();
        if (!sched->runavg_cpu_usage)
                sched->runavg_cpu_usage = sched->cpu_usage;
        sched->runavg_cpu_usage = (sched->runavg_cpu_usage * (sched->replay_repeat - 1) + sched->cpu_usage) / sched->replay_repeat;

        sched->parent_cpu_usage = cpu_usage_1 - cpu_usage_0;
        if (!sched->runavg_parent_cpu_usage)
                sched->runavg_parent_cpu_usage = sched->parent_cpu_usage;
        sched->runavg_parent_cpu_usage = (sched->runavg_parent_cpu_usage * (sched->replay_repeat - 1) +
                                          sched->parent_cpu_usage) / sched->replay_repeat;

        ret = pthread_mutex_lock(&sched->start_work_mutex);
        BUG_ON(ret);

        for (i = 0; i < sched->nr_tasks; i++) {
                task = sched->tasks[i];
                sem_init(&task->sleep_sem, 0, 0);
                task->curr_event = 0;
        }
}

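/*
 * Run a single replay iteration and print its wall-clock time, the
 * moving average over replay_repeat runs, and the measured CPU usage.
 */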
static void run_one_test(struct perf_sched *sched)
{
        u64 T0, T1, delta, avg_delta, fluct;

        T0 = get_nsecs();
        wait_for_tasks(sched);
        T1 = get_nsecs();

        delta = T1 - T0;
        sched->sum_runtime += delta;
        sched->nr_runs++;

        avg_delta = sched->sum_runtime / sched->nr_runs;
        if (delta < avg_delta)
                fluct = avg_delta - delta;
        else
                fluct = delta - avg_delta;
        sched->sum_fluct += fluct;
        if (!sched->run_avg)
                sched->run_avg = delta;
        sched->run_avg = (sched->run_avg * (sched->replay_repeat - 1) + delta) / sched->replay_repeat;

        printf("#%-3ld: %0.3f, ", sched->nr_runs, (double)delta / NSEC_PER_MSEC);

        printf("ravg: %0.2f, ", (double)sched->run_avg / NSEC_PER_MSEC);

        printf("cpu: %0.2f / %0.2f",
                (double)sched->cpu_usage / NSEC_PER_MSEC, (double)sched->runavg_cpu_usage / NSEC_PER_MSEC);

#if 0
        /*
         * rusage statistics done by the parent, these are less
         * accurate than the sched->sum_exec_runtime based statistics:
         */
        printf(" [%0.2f / %0.2f]",
                (double)sched->parent_cpu_usage / NSEC_PER_MSEC,
                (double)sched->runavg_parent_cpu_usage / NSEC_PER_MSEC);
#endif

        printf("\n");

        if (sched->nr_sleep_corrections)
                printf(" (%ld sleep corrections)\n", sched->nr_sleep_corrections);
        sched->nr_sleep_corrections = 0;
}

static void test_calibrations(struct perf_sched *sched)
{
        u64 T0, T1;

        T0 = get_nsecs();
        burn_nsecs(sched, NSEC_PER_MSEC);
        T1 = get_nsecs();

        printf("the run test took %" PRIu64 " nsecs\n", T1 - T0);

        T0 = get_nsecs();
        sleep_nsecs(NSEC_PER_MSEC);
        T1 = get_nsecs();

        printf("the sleep test took %" PRIu64 " nsecs\n", T1 - T0);
}

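/*
 * Tracepoint handlers for 'perf sched replay': translate recorded
 * sched_wakeup/sched_switch samples back into sched_atoms, e.g. after:
 *
 *   perf sched record -- sleep 1
 *   perf sched replay
 */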
static int
replay_wakeup_event(struct perf_sched *sched,
                    struct perf_evsel *evsel, struct perf_sample *sample,
                    struct machine *machine __maybe_unused)
{
        const char *comm = perf_evsel__strval(evsel, sample, "comm");
        const u32 pid    = perf_evsel__intval(evsel, sample, "pid");
        struct task_desc *waker, *wakee;

        if (verbose > 0) {
                printf("sched_wakeup event %p\n", evsel);

                printf(" ... pid %d woke up %s/%d\n", sample->tid, comm, pid);
        }

        waker = register_pid(sched, sample->tid, "<unknown>");
        wakee = register_pid(sched, pid, comm);

        add_sched_event_wakeup(sched, waker, sample->time, wakee);
        return 0;
}

static int replay_switch_event(struct perf_sched *sched,
                               struct perf_evsel *evsel,
                               struct perf_sample *sample,
                               struct machine *machine __maybe_unused)
{
        const char *prev_comm  = perf_evsel__strval(evsel, sample, "prev_comm"),
                   *next_comm  = perf_evsel__strval(evsel, sample, "next_comm");
        const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
                  next_pid = perf_evsel__intval(evsel, sample, "next_pid");
        const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
        struct task_desc *prev, __maybe_unused *next;
        u64 timestamp0, timestamp = sample->time;
        int cpu = sample->cpu;
        s64 delta;

        if (verbose > 0)
                printf("sched_switch event %p\n", evsel);

        if (cpu >= MAX_CPUS || cpu < 0)
                return 0;

        timestamp0 = sched->cpu_last_switched[cpu];
        if (timestamp0)
                delta = timestamp - timestamp0;
        else
                delta = 0;

        if (delta < 0) {
                pr_err("hm, delta: %" PRId64 " < 0 ?\n", delta);
                return -1;
        }

        pr_debug(" ... switch from %s/%d to %s/%d [ran %" PRIu64 " nsecs]\n",
                 prev_comm, prev_pid, next_comm, next_pid, delta);

        prev = register_pid(sched, prev_pid, prev_comm);
        next = register_pid(sched, next_pid, next_comm);

        sched->cpu_last_switched[cpu] = timestamp;

        add_sched_event_run(sched, prev, timestamp, delta);
        add_sched_event_sleep(sched, prev, timestamp, prev_state);

        return 0;
}

static int replay_fork_event(struct perf_sched *sched,
                             union perf_event *event,
                             struct machine *machine)
{
        struct thread *child, *parent;

        child = machine__findnew_thread(machine, event->fork.pid,
                                        event->fork.tid);
        parent = machine__findnew_thread(machine, event->fork.ppid,
                                         event->fork.ptid);

        if (child == NULL || parent == NULL) {
                pr_debug("thread does not exist on fork event: child %p, parent %p\n",
                                 child, parent);
                goto out_put;
        }

        if (verbose > 0) {
                printf("fork event\n");
                printf("... parent: %s/%d\n", thread__comm_str(parent), parent->tid);
                printf("...  child: %s/%d\n", thread__comm_str(child), child->tid);
        }

        register_pid(sched, parent->tid, thread__comm_str(parent));
        register_pid(sched, child->tid, thread__comm_str(child));
out_put:
        thread__put(child);
        thread__put(parent);
        return 0;
}

struct sort_dimension {
        const char              *name;
        sort_fn_t               cmp;
        struct list_head        list;
};

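/*
 * Compare two work_atoms under the active sort keys, in order; the
 * first non-zero comparison decides.
 */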
static int
thread_lat_cmp(struct list_head *list, struct work_atoms *l, struct work_atoms *r)
{
        struct sort_dimension *sort;
        int ret = 0;

        BUG_ON(list_empty(list));

        list_for_each_entry(sort, list, list) {
                ret = sort->cmp(l, r);
                if (ret)
                        return ret;
        }

        return ret;
}

static struct work_atoms *
thread_atoms_search(struct rb_root *root, struct thread *thread,
                         struct list_head *sort_list)
{
        struct rb_node *node = root->rb_node;
        struct work_atoms key = { .thread = thread };

        while (node) {
                struct work_atoms *atoms;
                int cmp;

                atoms = container_of(node, struct work_atoms, node);

                cmp = thread_lat_cmp(sort_list, &key, atoms);
                if (cmp > 0)
                        node = node->rb_left;
                else if (cmp < 0)
                        node = node->rb_right;
                else {
                        BUG_ON(thread != atoms->thread);
                        return atoms;
                }
        }
        return NULL;
}

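/*
 * Standard rbtree insertion, keyed by thread_lat_cmp() over whatever
 * sort dimensions are currently on 'sort_list'.
 */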
static void
__thread_latency_insert(struct rb_root *root, struct work_atoms *data,
                         struct list_head *sort_list)
{
        struct rb_node **new = &(root->rb_node), *parent = NULL;

        while (*new) {
                struct work_atoms *this;
                int cmp;

                this = container_of(*new, struct work_atoms, node);
                parent = *new;

                cmp = thread_lat_cmp(sort_list, data, this);

                if (cmp > 0)
                        new = &((*new)->rb_left);
                else
                        new = &((*new)->rb_right);
        }

        rb_link_node(&data->node, parent, new);
        rb_insert_color(&data->node, root);
}

static int thread_atoms_insert(struct perf_sched *sched, struct thread *thread)
{
        struct work_atoms *atoms = zalloc(sizeof(*atoms));
        if (!atoms) {
                pr_err("No memory at %s\n", __func__);
                return -1;
        }

        atoms->thread = thread__get(thread);
        INIT_LIST_HEAD(&atoms->work_list);
        __thread_latency_insert(&sched->atom_root, atoms, &sched->cmp_pid);
        return 0;
}

static char sched_out_state(u64 prev_state)
{
        const char *str = TASK_STATE_TO_CHAR_STR;

        return str[prev_state];
}

static int
add_sched_out_event(struct work_atoms *atoms,
                    char run_state,
                    u64 timestamp)
{
        struct work_atom *atom = zalloc(sizeof(*atom));
        if (!atom) {
                pr_err("No memory at %s", __func__);
                return -1;
        }

        atom->sched_out_time = timestamp;

        if (run_state == 'R') {
                atom->state = THREAD_WAIT_CPU;
                atom->wake_up_time = atom->sched_out_time;
        }

        list_add_tail(&atom->list, &atoms->work_list);
        return 0;
}

static void
add_runtime_event(struct work_atoms *atoms, u64 delta,
                  u64 timestamp __maybe_unused)
{
        struct work_atom *atom;

        BUG_ON(list_empty(&atoms->work_list));

        atom = list_entry(atoms->work_list.prev, struct work_atom, list);

        atom->runtime += delta;
        atoms->total_runtime += delta;
}

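/*
 * Complete the latest atom on a sched-in: scheduling latency is
 * sched_in_time - wake_up_time, accumulated into total_lat and
 * tracked as max_lat (with its timestamp) for the latency report.
 */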
static void
add_sched_in_event(struct work_atoms *atoms, u64 timestamp)
{
        struct work_atom *atom;
        u64 delta;

        if (list_empty(&atoms->work_list))
                return;

        atom = list_entry(atoms->work_list.prev, struct work_atom, list);

        if (atom->state != THREAD_WAIT_CPU)
                return;

        if (timestamp < atom->wake_up_time) {
                atom->state = THREAD_IGNORE;
                return;
        }

        atom->state = THREAD_SCHED_IN;
        atom->sched_in_time = timestamp;

        delta = atom->sched_in_time - atom->wake_up_time;
        atoms->total_lat += delta;
        if (delta > atoms->max_lat) {
                atoms->max_lat = delta;
                atoms->max_lat_at = timestamp;
        }
        atoms->nb_atoms++;
}

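/*
 * 'perf sched latency' switch handler: close the outgoing thread's
 * atom with its sched-out state and open/complete an atom for the
 * incoming thread, creating per-thread work_atoms nodes on demand.
 */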
static int latency_switch_event(struct perf_sched *sched,
                                struct perf_evsel *evsel,
                                struct perf_sample *sample,
                                struct machine *machine)
{
        const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
                  next_pid = perf_evsel__intval(evsel, sample, "next_pid");
        const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
        struct work_atoms *out_events, *in_events;
        struct thread *sched_out, *sched_in;
        u64 timestamp0, timestamp = sample->time;
        int cpu = sample->cpu, err = -1;
        s64 delta;

        BUG_ON(cpu >= MAX_CPUS || cpu < 0);

        timestamp0 = sched->cpu_last_switched[cpu];
        sched->cpu_last_switched[cpu] = timestamp;
        if (timestamp0)
                delta = timestamp - timestamp0;
        else
                delta = 0;

        if (delta < 0) {
                pr_err("hm, delta: %" PRId64 " < 0 ?\n", delta);
                return -1;
        }

        sched_out = machine__findnew_thread(machine, -1, prev_pid);
        sched_in = machine__findnew_thread(machine, -1, next_pid);
        if (sched_out == NULL || sched_in == NULL)
                goto out_put;

        out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
        if (!out_events) {
                if (thread_atoms_insert(sched, sched_out))
                        goto out_put;
                out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
                if (!out_events) {
                        pr_err("out-event: Internal tree error");
                        goto out_put;
                }
        }
        if (add_sched_out_event(out_events, sched_out_state(prev_state), timestamp))
                return -1;

        in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
        if (!in_events) {
                if (thread_atoms_insert(sched, sched_in))
                        goto out_put;
                in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
                if (!in_events) {
                        pr_err("in-event: Internal tree error");
                        goto out_put;
                }
                /*
                 * Task came in that we have not heard about yet,
                 * add an initial atom in runnable state:
                 */
                if (add_sched_out_event(in_events, 'R', timestamp))
                        goto out_put;
        }
        add_sched_in_event(in_events, timestamp);
        err = 0;
out_put:
        thread__put(sched_out);
        thread__put(sched_in);
        return err;
}

static int latency_runtime_event(struct perf_sched *sched,
                                 struct perf_evsel *evsel,
                                 struct perf_sample *sample,
                                 struct machine *machine)
{
        const u32 pid      = perf_evsel__intval(evsel, sample, "pid");
        const u64 runtime  = perf_evsel__intval(evsel, sample, "runtime");
        struct thread *thread = machine__findnew_thread(machine, -1, pid);
        struct work_atoms *atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
        u64 timestamp = sample->time;
        int cpu = sample->cpu, err = -1;

        if (thread == NULL)
                return -1;

        BUG_ON(cpu >= MAX_CPUS || cpu < 0);
        if (!atoms) {
                if (thread_atoms_insert(sched, thread))
                        goto out_put;
                atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
                if (!atoms) {
                        pr_err("in-event: Internal tree error");
                        goto out_put;
                }
                if (add_sched_out_event(atoms, 'R', timestamp))
                        goto out_put;
        }

        add_runtime_event(atoms, runtime, timestamp);
        err = 0;
out_put:
        thread__put(thread);
        return err;
}

static int latency_wakeup_event(struct perf_sched *sched,
                                struct perf_evsel *evsel,
                                struct perf_sample *sample,
                                struct machine *machine)
{
        const u32 pid     = perf_evsel__intval(evsel, sample, "pid");
        struct work_atoms *atoms;
        struct work_atom *atom;
        struct thread *wakee;
        u64 timestamp = sample->time;
        int err = -1;

        wakee = machine__findnew_thread(machine, -1, pid);
        if (wakee == NULL)
                return -1;
        atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
        if (!atoms) {
                if (thread_atoms_insert(sched, wakee))
                        goto out_put;
                atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
                if (!atoms) {
                        pr_err("wakeup-event: Internal tree error");
                        goto out_put;
                }
                if (add_sched_out_event(atoms, 'S', timestamp))
                        goto out_put;
        }

        BUG_ON(list_empty(&atoms->work_list));

        atom = list_entry(atoms->work_list.prev, struct work_atom, list);

        /*
         * A wakeup is not guaranteed to happen while the task is off the
         * run queue: it can also hit a task that is already on the run
         * queue and merely changes ->state to TASK_RUNNING. In that case
         * we must not update ->wake_up_time.
         *
         * However, you WILL be missing events if you've recorded only
         * one CPU, or are only looking at one, so don't skip in that
         * case.
         */
        if (sched->profile_cpu == -1 && atom->state != THREAD_SLEEPING)
                goto out_ok;

        sched->nr_timestamps++;
        if (atom->sched_out_time > timestamp) {
                sched->nr_unordered_timestamps++;
                goto out_ok;
        }

        atom->state = THREAD_WAIT_CPU;
        atom->wake_up_time = timestamp;
out_ok:
        err = 0;
out_put:
        thread__put(wakee);
        return err;
}

static int latency_migrate_task_event(struct perf_sched *sched,
                                      struct perf_evsel *evsel,
                                      struct perf_sample *sample,
                                      struct machine *machine)
{
        const u32 pid = perf_evsel__intval(evsel, sample, "pid");
        u64 timestamp = sample->time;
        struct work_atoms *atoms;
        struct work_atom *atom;
        struct thread *migrant;
        int err = -1;

        /*
         * Only need to worry about migration when profiling one CPU.
         */
        if (sched->profile_cpu == -1)
                return 0;

        migrant = machine__findnew_thread(machine, -1, pid);
        if (migrant == NULL)
                return -1;
        atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
        if (!atoms) {
                if (thread_atoms_insert(sched, migrant))
                        goto out_put;
                register_pid(sched, migrant->tid, thread__comm_str(migrant));
                atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
                if (!atoms) {
                        pr_err("migration-event: Internal tree error");
                        goto out_put;
                }
                if (add_sched_out_event(atoms, 'R', timestamp))
                        goto out_put;
        }

        BUG_ON(list_empty(&atoms->work_list));

        atom = list_entry(atoms->work_list.prev, struct work_atom, list);
        atom->sched_in_time = atom->sched_out_time = atom->wake_up_time = timestamp;

        sched->nr_timestamps++;

        if (atom->sched_out_time > timestamp)
                sched->nr_unordered_timestamps++;
        err = 0;
out_put:
        thread__put(migrant);
        return err;
}

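/*
 * Print one row of the 'perf sched latency' table, roughly
 * (illustrative):
 *
 *   gnome-shell:1478  |  5.123 ms |  42 | avg: 0.301 ms | max: 1.234 ms | max at: 123.456789 s
 *
 * Idle threads ("swapper") are skipped.
 */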
static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_list)
{
        int i;
        int ret;
        u64 avg;
        char max_lat_at[32];

        if (!work_list->nb_atoms)
                return;
        /*
         * Ignore idle threads:
         */
        if (!strcmp(thread__comm_str(work_list->thread), "swapper"))
                return;

        sched->all_runtime += work_list->total_runtime;
        sched->all_count   += work_list->nb_atoms;

        if (work_list->num_merged > 1)
                ret = printf("  %s:(%d) ", thread__comm_str(work_list->thread), work_list->num_merged);
        else
                ret = printf("  %s:%d ", thread__comm_str(work_list->thread), work_list->thread->tid);

        for (i = 0; i < 24 - ret; i++)
                printf(" ");

        avg = work_list->total_lat / work_list->nb_atoms;
        timestamp__scnprintf_usec(work_list->max_lat_at, max_lat_at, sizeof(max_lat_at));

        printf("|%11.3f ms |%9" PRIu64 " | avg:%9.3f ms | max:%9.3f ms | max at: %13s s\n",
               (double)work_list->total_runtime / NSEC_PER_MSEC,
               work_list->nb_atoms, (double)avg / NSEC_PER_MSEC,
               (double)work_list->max_lat / NSEC_PER_MSEC,
               max_lat_at);
}

static int pid_cmp(struct work_atoms *l, struct work_atoms *r)
{
        if (l->thread == r->thread)
                return 0;
        if (l->thread->tid < r->thread->tid)
                return -1;
        if (l->thread->tid > r->thread->tid)
                return 1;
        return (int)(l->thread - r->thread);
}

static int avg_cmp(struct work_atoms *l, struct work_atoms *r)
{
        u64 avgl, avgr;

        if (!l->nb_atoms)
                return -1;

        if (!r->nb_atoms)
                return 1;

        avgl = l->total_lat / l->nb_atoms;
        avgr = r->total_lat / r->nb_atoms;

        if (avgl < avgr)
                return -1;
        if (avgl > avgr)
                return 1;

        return 0;
}

static int max_cmp(struct work_atoms *l, struct work_atoms *r)
{
        if (l->max_lat < r->max_lat)
                return -1;
        if (l->max_lat > r->max_lat)
                return 1;

        return 0;
}

static int switch_cmp(struct work_atoms *l, struct work_atoms *r)
{
        if (l->nb_atoms < r->nb_atoms)
                return -1;
        if (l->nb_atoms > r->nb_atoms)
                return 1;

        return 0;
}

static int runtime_cmp(struct work_atoms *l, struct work_atoms *r)
{
        if (l->total_runtime < r->total_runtime)
                return -1;
        if (l->total_runtime > r->total_runtime)
                return 1;

        return 0;
}

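/*
 * Map one token of the -s/--sort option to its comparator, e.g.
 * 'perf sched latency -s max,runtime' adds max_cmp then runtime_cmp.
 */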
static int sort_dimension__add(const char *tok, struct list_head *list)
{
        size_t i;
        static struct sort_dimension avg_sort_dimension = {
                .name = "avg",
                .cmp  = avg_cmp,
        };
        static struct sort_dimension max_sort_dimension = {
                .name = "max",
                .cmp  = max_cmp,
        };
        static struct sort_dimension pid_sort_dimension = {
                .name = "pid",
                .cmp  = pid_cmp,
        };
        static struct sort_dimension runtime_sort_dimension = {
                .name = "runtime",
                .cmp  = runtime_cmp,
        };
        static struct sort_dimension switch_sort_dimension = {
                .name = "switch",
                .cmp  = switch_cmp,
        };
        struct sort_dimension *available_sorts[] = {
                &pid_sort_dimension,
                &avg_sort_dimension,
                &max_sort_dimension,
                &switch_sort_dimension,
                &runtime_sort_dimension,
        };

        for (i = 0; i < ARRAY_SIZE(available_sorts); i++) {
                if (!strcmp(available_sorts[i]->name, tok)) {
                        list_add_tail(&available_sorts[i]->list, list);

                        return 0;
                }
        }

        return -1;
}

static void perf_sched__sort_lat(struct perf_sched *sched)
{
        struct rb_node *node;
        struct rb_root *root = &sched->atom_root;
again:
        for (;;) {
                struct work_atoms *data;
                node = rb_first(root);
                if (!node)
                        break;

                rb_erase(node, root);
                data = rb_entry(node, struct work_atoms, node);
                __thread_latency_insert(&sched->sorted_atom_root, data, &sched->sort_list);
        }
        if (root == &sched->atom_root) {
                root = &sched->merged_atom_root;
                goto again;
        }
}

static int process_sched_wakeup_event(struct perf_tool *tool,
                                      struct perf_evsel *evsel,
                                      struct perf_sample *sample,
                                      struct machine *machine)
{
        struct perf_sched *sched = container_of(tool, struct perf_sched, tool);

        if (sched->tp_handler->wakeup_event)
                return sched->tp_handler->wakeup_event(sched, evsel, sample, machine);

        return 0;
}

union map_priv {
        void    *ptr;
        bool     color;
};

static bool thread__has_color(struct thread *thread)
{
        union map_priv priv = {
                .ptr = thread__priv(thread),
        };

        return priv.color;
}

static struct thread*
map__findnew_thread(struct perf_sched *sched, struct machine *machine, pid_t pid, pid_t tid)
{
        struct thread *thread = machine__findnew_thread(machine, pid, tid);
        union map_priv priv = {
                .color = false,
        };

        if (!sched->map.color_pids || !thread || thread__priv(thread))
                return thread;

        if (thread_map__has(sched->map.color_pids, tid))
                priv.color = true;

        thread__set_priv(thread, priv.ptr);
        return thread;
}

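/*
 * 'perf sched map' switch handler: prints one column per CPU with a
 * two-character shortname per task ('.' is idle) and '*' marking the
 * CPU that just switched, roughly (illustrative):
 *
 *   *A0 .  B0 .   451297.536934 secs A0 => perf:2345
 */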
static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel,
                            struct perf_sample *sample, struct machine *machine)
{
        const u32 next_pid = perf_evsel__intval(evsel, sample, "next_pid");
        struct thread *sched_in;
        int new_shortname;
        u64 timestamp0, timestamp = sample->time;
        s64 delta;
        int i, this_cpu = sample->cpu;
        int cpus_nr;
        bool new_cpu = false;
        const char *color = PERF_COLOR_NORMAL;
        char stimestamp[32];

        BUG_ON(this_cpu >= MAX_CPUS || this_cpu < 0);

        if (this_cpu > sched->max_cpu)
                sched->max_cpu = this_cpu;

        if (sched->map.comp) {
                cpus_nr = bitmap_weight(sched->map.comp_cpus_mask, MAX_CPUS);
                if (!test_and_set_bit(this_cpu, sched->map.comp_cpus_mask)) {
                        sched->map.comp_cpus[cpus_nr++] = this_cpu;
                        new_cpu = true;
                }
        } else
                cpus_nr = sched->max_cpu;

        timestamp0 = sched->cpu_last_switched[this_cpu];
        sched->cpu_last_switched[this_cpu] = timestamp;
        if (timestamp0)
                delta = timestamp - timestamp0;
        else
                delta = 0;

        if (delta < 0) {
                pr_err("hm, delta: %" PRId64 " < 0 ?\n", delta);
1514                return -1;
1515        }
1516
1517        sched_in = map__findnew_thread(sched, machine, -1, next_pid);
1518        if (sched_in == NULL)
1519                return -1;
1520
1521        sched->curr_thread[this_cpu] = thread__get(sched_in);
1522
1523        printf("  ");
1524
1525        new_shortname = 0;
1526        if (!sched_in->shortname[0]) {
1527                if (!strcmp(thread__comm_str(sched_in), "swapper")) {
1528                        /*
1529                         * Don't allocate a letter-number for swapper:0
1530                         * as a shortname. Instead, we use '.' for it.
1531                         */
1532                        sched_in->shortname[0] = '.';
1533                        sched_in->shortname[1] = ' ';
1534                } else {
1535                        sched_in->shortname[0] = sched->next_shortname1;
1536                        sched_in->shortname[1] = sched->next_shortname2;
1537
1538                        if (sched->next_shortname1 < 'Z') {
1539                                sched->next_shortname1++;
1540                        } else {
1541                                sched->next_shortname1 = 'A';
1542                                if (sched->next_shortname2 < '9')
1543                                        sched->next_shortname2++;
1544                                else
1545                                        sched->next_shortname2 = '0';
1546                        }
1547                }
1548                new_shortname = 1;
1549        }
1550
1551        for (i = 0; i < cpus_nr; i++) {
1552                int cpu = sched->map.comp ? sched->map.comp_cpus[i] : i;
1553                struct thread *curr_thread = sched->curr_thread[cpu];
1554                const char *pid_color = color;
1555                const char *cpu_color = color;
1556
1557                if (curr_thread && thread__has_color(curr_thread))
1558                        pid_color = COLOR_PIDS;
1559
1560                if (sched->map.cpus && !cpu_map__has(sched->map.cpus, cpu))
1561                        continue;
1562
1563                if (sched->map.color_cpus && cpu_map__has(sched->map.color_cpus, cpu))
1564                        cpu_color = COLOR_CPUS;
1565
1566                if (cpu != this_cpu)
1567                        color_fprintf(stdout, color, " ");
1568                else
1569                        color_fprintf(stdout, cpu_color, "*");
1570
1571                if (sched->curr_thread[cpu])
1572                        color_fprintf(stdout, pid_color, "%2s ", sched->curr_thread[cpu]->shortname);
1573                else
1574                        color_fprintf(stdout, color, "   ");
1575        }
1576
1577        if (sched->map.cpus && !cpu_map__has(sched->map.cpus, this_cpu))
1578                goto out;
1579
1580        timestamp__scnprintf_usec(timestamp, stimestamp, sizeof(stimestamp));
1581        color_fprintf(stdout, color, "  %12s secs ", stimestamp);
1582        if (new_shortname || (verbose > 0 && sched_in->tid)) {
1583                const char *pid_color = color;
1584
1585                if (thread__has_color(sched_in))
1586                        pid_color = COLOR_PIDS;
1587
1588                color_fprintf(stdout, pid_color, "%s => %s:%d",
1589                       sched_in->shortname, thread__comm_str(sched_in), sched_in->tid);
1590        }
1591
1592        if (sched->map.comp && new_cpu)
1593                color_fprintf(stdout, color, " (CPU %d)", this_cpu);
1594
1595out:
1596        color_fprintf(stdout, color, "\n");
1597
1598        thread__put(sched_in);
1599
1600        return 0;
1601}
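
/*
 * Added note: an illustrative sketch of the 'perf sched map' output the
 * code above produces (task names, times and CPU count are made up):
 *
 *   *A0  .   .   .    451.353533 secs A0 => perf:4165
 *    A0  .  *B0  .    451.353536 secs B0 => migration/2:25
 *
 * Each column is one CPU: '*' marks the CPU where this sched_switch
 * happened, '.' is the swapper (idle) task, and the letter-number pairs
 * are shortnames handed out in A0, B0, ... Z0, A1, ... order as tasks
 * are first seen.
 */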
1602
1603static int process_sched_switch_event(struct perf_tool *tool,
1604                                      struct perf_evsel *evsel,
1605                                      struct perf_sample *sample,
1606                                      struct machine *machine)
1607{
1608        struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1609        int this_cpu = sample->cpu, err = 0;
1610        u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
1611            next_pid = perf_evsel__intval(evsel, sample, "next_pid");
1612
1613        if (sched->curr_pid[this_cpu] != (u32)-1) {
1614                /*
1615                 * Are we trying to switch away a PID that is
1616                 * not current?
1617                 */
1618                if (sched->curr_pid[this_cpu] != prev_pid)
1619                        sched->nr_context_switch_bugs++;
1620        }
1621
1622        if (sched->tp_handler->switch_event)
1623                err = sched->tp_handler->switch_event(sched, evsel, sample, machine);
1624
1625        sched->curr_pid[this_cpu] = next_pid;
1626        return err;
1627}
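
/*
 * Added note: nr_context_switch_bugs flags inconsistent traces. For
 * example, if the previous sched_switch on cpu 2 switched in pid 100
 * but the next sched_switch on cpu 2 reports prev_pid == 200, the two
 * events cannot both be right (typically a symptom of lost events);
 * print_bad_events() reports the count at the end of the run.
 */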
1628
1629static int process_sched_runtime_event(struct perf_tool *tool,
1630                                       struct perf_evsel *evsel,
1631                                       struct perf_sample *sample,
1632                                       struct machine *machine)
1633{
1634        struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1635
1636        if (sched->tp_handler->runtime_event)
1637                return sched->tp_handler->runtime_event(sched, evsel, sample, machine);
1638
1639        return 0;
1640}
1641
1642static int perf_sched__process_fork_event(struct perf_tool *tool,
1643                                          union perf_event *event,
1644                                          struct perf_sample *sample,
1645                                          struct machine *machine)
1646{
1647        struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1648
1649        /* run the fork event through the perf machinery */
1650        perf_event__process_fork(tool, event, sample, machine);
1651
1652        /* and then run additional processing needed for this command */
1653        if (sched->tp_handler->fork_event)
1654                return sched->tp_handler->fork_event(sched, event, machine);
1655
1656        return 0;
1657}
1658
1659static int process_sched_migrate_task_event(struct perf_tool *tool,
1660                                            struct perf_evsel *evsel,
1661                                            struct perf_sample *sample,
1662                                            struct machine *machine)
1663{
1664        struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1665
1666        if (sched->tp_handler->migrate_task_event)
1667                return sched->tp_handler->migrate_task_event(sched, evsel, sample, machine);
1668
1669        return 0;
1670}
1671
1672typedef int (*tracepoint_handler)(struct perf_tool *tool,
1673                                  struct perf_evsel *evsel,
1674                                  struct perf_sample *sample,
1675                                  struct machine *machine);
1676
1677static int perf_sched__process_tracepoint_sample(struct perf_tool *tool __maybe_unused,
1678                                                 union perf_event *event __maybe_unused,
1679                                                 struct perf_sample *sample,
1680                                                 struct perf_evsel *evsel,
1681                                                 struct machine *machine)
1682{
1683        int err = 0;
1684
1685        if (evsel->handler != NULL) {
1686                tracepoint_handler f = evsel->handler;
1687                err = f(tool, evsel, sample, machine);
1688        }
1689
1690        return err;
1691}
1692
1693static int perf_sched__read_events(struct perf_sched *sched)
1694{
1695        const struct perf_evsel_str_handler handlers[] = {
1696                { "sched:sched_switch",       process_sched_switch_event, },
1697                { "sched:sched_stat_runtime", process_sched_runtime_event, },
1698                { "sched:sched_wakeup",       process_sched_wakeup_event, },
1699                { "sched:sched_wakeup_new",   process_sched_wakeup_event, },
1700                { "sched:sched_migrate_task", process_sched_migrate_task_event, },
1701        };
1702        struct perf_session *session;
1703        struct perf_data_file file = {
1704                .path = input_name,
1705                .mode = PERF_DATA_MODE_READ,
1706                .force = sched->force,
1707        };
1708        int rc = -1;
1709
1710        session = perf_session__new(&file, false, &sched->tool);
1711        if (session == NULL) {
1712                pr_debug("No memory for session\n");
1713                return -1;
1714        }
1715
1716        symbol__init(&session->header.env);
1717
1718        if (perf_session__set_tracepoints_handlers(session, handlers))
1719                goto out_delete;
1720
1721        if (perf_session__has_traces(session, "record -R")) {
1722                int err = perf_session__process_events(session);
1723                if (err) {
1724                        pr_err("Failed to process events, error %d\n", err);
1725                        goto out_delete;
1726                }
1727
1728                sched->nr_events      = session->evlist->stats.nr_events[0];
1729                sched->nr_lost_events = session->evlist->stats.total_lost;
1730                sched->nr_lost_chunks = session->evlist->stats.nr_events[PERF_RECORD_LOST];
1731        }
1732
1733        rc = 0;
1734out_delete:
1735        perf_session__delete(session);
1736        return rc;
1737}
1738
1739/*
1740 * scheduling times are printed as msec.usec
1741 */
1742static inline void print_sched_time(unsigned long long nsecs, int width)
1743{
1744        unsigned long msecs;
1745        unsigned long usecs;
1746
1747        msecs  = nsecs / NSEC_PER_MSEC;
1748        nsecs -= msecs * NSEC_PER_MSEC;
1749        usecs  = nsecs / NSEC_PER_USEC;
1750        printf("%*lu.%03lu ", width, msecs, usecs);
1751}
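
/*
 * Added example: print_sched_time(12345678, 6) emits "    12.345 ",
 * i.e. 12345678 ns is split into 12 msec and 345 usec, the msec part
 * right-aligned in a 6-character field (remaining nanoseconds are
 * truncated, not rounded).
 */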
1752
1753/*
1754 * returns runtime data for event, allocating memory for it the
1755 * first time it is used.
1756 */
1757static struct evsel_runtime *perf_evsel__get_runtime(struct perf_evsel *evsel)
1758{
1759        struct evsel_runtime *r = evsel->priv;
1760
1761        if (r == NULL) {
1762                r = zalloc(sizeof(struct evsel_runtime));
1763                evsel->priv = r;
1764        }
1765
1766        return r;
1767}
1768
1769/*
1770 * save last time event was seen per cpu
1771 */
1772static void perf_evsel__save_time(struct perf_evsel *evsel,
1773                                  u64 timestamp, u32 cpu)
1774{
1775        struct evsel_runtime *r = perf_evsel__get_runtime(evsel);
1776
1777        if (r == NULL)
1778                return;
1779
1780        if ((cpu >= r->ncpu) || (r->last_time == NULL)) {
1781                int i, n = __roundup_pow_of_two(cpu+1);
1782                void *p;
1783
1784                p = realloc(r->last_time, n * sizeof(u64));
1785                if (!p)
1786                        return;
1787
1788                r->last_time = p;
1789                for (i = r->ncpu; i < n; ++i)
1790                        r->last_time[i] = (u64) 0;
1791
1792                r->ncpu = n;
1793        }
1794
1795        r->last_time[cpu] = timestamp;
1796}
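
/*
 * Added note: the per-cpu timestamp array grows in power-of-two steps.
 * E.g. the first event seen on cpu 5 reallocates last_time[] to
 * __roundup_pow_of_two(5 + 1) == 8 entries and zeroes the new slots,
 * so later events on cpus 6 and 7 need no further allocation.
 */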
1797
1798/* returns last time this event was seen on the given cpu */
1799static u64 perf_evsel__get_time(struct perf_evsel *evsel, u32 cpu)
1800{
1801        struct evsel_runtime *r = perf_evsel__get_runtime(evsel);
1802
1803        if ((r == NULL) || (r->last_time == NULL) || (cpu >= r->ncpu))
1804                return 0;
1805
1806        return r->last_time[cpu];
1807}
1808
1809static int comm_width = 30;
1810
1811static char *timehist_get_commstr(struct thread *thread)
1812{
1813        static char str[32];
1814        const char *comm = thread__comm_str(thread);
1815        pid_t tid = thread->tid;
1816        pid_t pid = thread->pid_;
1817        int n;
1818
1819        if (pid == 0)
1820                n = scnprintf(str, sizeof(str), "%s", comm);
1821
1822        else if (tid != pid)
1823                n = scnprintf(str, sizeof(str), "%s[%d/%d]", comm, tid, pid);
1824
1825        else
1826                n = scnprintf(str, sizeof(str), "%s[%d]", comm, tid);
1827
1828        if (n > comm_width)
1829                comm_width = n;
1830
1831        return str;
1832}
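
/*
 * Added example: strings produced above (values illustrative):
 *   pid 0 (idle):            "swapper"
 *   thread of a process:     "gcc[4242/4240]"   i.e. comm[tid/pid]
 *   single-threaded process: "gcc[4242]"
 * Note the side effect: comm_width grows to the widest string seen so
 * that later rows stay aligned.
 */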
1833
1834static void timehist_header(struct perf_sched *sched)
1835{
1836        u32 ncpus = sched->max_cpu + 1;
1837        u32 i, j;
1838
1839        printf("%15s %6s ", "time", "cpu");
1840
1841        if (sched->show_cpu_visual) {
1842                printf(" ");
1843                for (i = 0, j = 0; i < ncpus; ++i) {
1844                        printf("%x", j++);
1845                        if (j > 15)
1846                                j = 0;
1847                }
1848                printf(" ");
1849        }
1850
1851        printf(" %-*s  %9s  %9s  %9s", comm_width,
1852                "task name", "wait time", "sch delay", "run time");
1853
1854        if (sched->show_state)
1855                printf("  %s", "state");
1856
1857        printf("\n");
1858
1859        /*
1860         * units row
1861         */
1862        printf("%15s %-6s ", "", "");
1863
1864        if (sched->show_cpu_visual)
1865                printf(" %*s ", ncpus, "");
1866
1867        printf(" %-*s  %9s  %9s  %9s", comm_width,
1868               "[tid/pid]", "(msec)", "(msec)", "(msec)");
1869
1870        if (sched->show_state)
1871                printf("  %5s", "");
1872
1873        printf("\n");
1874
1875        /*
1876         * separator
1877         */
1878        printf("%.15s %.6s ", graph_dotted_line, graph_dotted_line);
1879
1880        if (sched->show_cpu_visual)
1881                printf(" %.*s ", ncpus, graph_dotted_line);
1882
1883        printf(" %.*s  %.9s  %.9s  %.9s", comm_width,
1884                graph_dotted_line, graph_dotted_line, graph_dotted_line,
1885                graph_dotted_line);
1886
1887        if (sched->show_state)
1888                printf("  %.5s", graph_dotted_line);
1889
1890        printf("\n");
1891}
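
/*
 * Added note: with the cpu-visual and state columns disabled, the three
 * header rows printed above look roughly like (widths illustrative):
 *
 *            time    cpu  task name            wait time  sch delay   run time
 *                         [tid/pid]               (msec)     (msec)     (msec)
 * --------------- ------  -------------------  ---------  ---------  ---------
 */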
1892
1893static char task_state_char(struct thread *thread, int state)
1894{
1895        static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
1896        unsigned bit = state ? ffs(state) : 0;
1897
1898        /* 'I' for idle */
1899        if (thread->tid == 0)
1900                return 'I';
1901
1902        return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
1903}
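
/*
 * Added example: prev_state 0 (TASK_RUNNING) maps to 'R';
 * prev_state 1 (TASK_INTERRUPTIBLE) gives ffs(1) == 1, hence 'S';
 * prev_state 2 (TASK_UNINTERRUPTIBLE) gives ffs(2) == 2, hence 'D',
 * all indexing into the "RSDTtZXxKWP" table defined earlier.
 */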
1904
1905static void timehist_print_sample(struct perf_sched *sched,
1906                                  struct perf_evsel *evsel,
1907                                  struct perf_sample *sample,
1908                                  struct addr_location *al,
1909                                  struct thread *thread,
1910                                  u64 t, int state)
1911{
1912        struct thread_runtime *tr = thread__priv(thread);
1913        const char *next_comm = perf_evsel__strval(evsel, sample, "next_comm");
1914        const u32 next_pid = perf_evsel__intval(evsel, sample, "next_pid");
1915        u32 max_cpus = sched->max_cpu + 1;
1916        char tstr[64];
1917        char nstr[30];
1918        u64 wait_time;
1919
1920        timestamp__scnprintf_usec(t, tstr, sizeof(tstr));
1921        printf("%15s [%04d] ", tstr, sample->cpu);
1922
1923        if (sched->show_cpu_visual) {
1924                u32 i;
1925                char c;
1926
1927                printf(" ");
1928                for (i = 0; i < max_cpus; ++i) {
1929                        /* flag idle times with 'i'; others are sched events */
1930                        if (i == sample->cpu)
1931                                c = (thread->tid == 0) ? 'i' : 's';
1932                        else
1933                                c = ' ';
1934                        printf("%c", c);
1935                }
1936                printf(" ");
1937        }
1938
1939        printf(" %-*s ", comm_width, timehist_get_commstr(thread));
1940
1941        wait_time = tr->dt_sleep + tr->dt_iowait + tr->dt_preempt;
1942        print_sched_time(wait_time, 6);
1943
1944        print_sched_time(tr->dt_delay, 6);
1945        print_sched_time(tr->dt_run, 6);
1946
1947        if (sched->show_state)
1948                printf(" %5c ", task_state_char(thread, state));
1949
1950        if (sched->show_next) {
1951                snprintf(nstr, sizeof(nstr), "next: %s[%d]", next_comm, next_pid);
1952                printf(" %-*s", comm_width, nstr);
1953        }
1954
1955        if (sched->show_wakeups && !sched->show_next)
1956                printf("  %-*s", comm_width, "");
1957
1958        if (thread->tid == 0)
1959                goto out;
1960
1961        if (sched->show_callchain)
1962                printf("  ");
1963
1964        sample__fprintf_sym(sample, al, 0,
1965                            EVSEL__PRINT_SYM | EVSEL__PRINT_ONELINE |
1966                            EVSEL__PRINT_CALLCHAIN_ARROW |
1967                            EVSEL__PRINT_SKIP_IGNORED,
1968                            &callchain_cursor, stdout);
1969
1970out:
1971        printf("\n");
1972}
1973
1974/*
1975 * Explanation of delta-time stats:
1976 *
1977 *            t = time of current schedule out event
1978 *        tprev = time of previous sched out event
1979 *                also time of schedule-in event for current task
1980 *    last_time = time of last sched change event for current task
1981 *                (i.e, time process was last scheduled out)
1982 * ready_to_run = time of wakeup for current task
1983 *
1984 * -----|------------|------------|------------|------
1985 *    last         ready        tprev          t
1986 *    time         to run
1987 *
1988 *      |-------- dt_wait --------|
1989 *                   |- dt_delay -|-- dt_run --|
1990 *
1991 *   dt_run = run time of current task
1992 *  dt_wait = time between last schedule out event for task and tprev
1993 *            represents time spent off the cpu
1994 * dt_delay = time between wakeup and schedule-in of task
1995 */
1996
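/*
 * Added worked example with made-up timestamps (in ns):
 *   last_time = 1000, ready_to_run = 3000, tprev = 5000, t = 9000
 * gives
 *   dt_run   = t - tprev            = 4000
 *   dt_delay = tprev - ready_to_run = 2000
 *   dt_wait  = tprev - last_time    = 4000
 * and dt_wait is booked as dt_preempt, dt_iowait or dt_sleep depending
 * on whether last_state was TASK_RUNNING, TASK_UNINTERRUPTIBLE or
 * anything else.
 */
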
1997static void timehist_update_runtime_stats(struct thread_runtime *r,
1998                                         u64 t, u64 tprev)
1999{
2000        r->dt_delay   = 0;
2001        r->dt_sleep   = 0;
2002        r->dt_iowait  = 0;
2003        r->dt_preempt = 0;
2004        r->dt_run     = 0;
2005
2006        if (tprev) {
2007                r->dt_run = t - tprev;
2008                if (r->ready_to_run) {
2009                        if (r->ready_to_run > tprev)
2010                                pr_debug("time travel: wakeup time for task > previous sched_switch event\n");
2011                        else
2012                                r->dt_delay = tprev - r->ready_to_run;
2013                }
2014
2015                if (r->last_time > tprev)
2016                        pr_debug("time travel: last sched out time for task > previous sched_switch event\n");
2017                else if (r->last_time) {
2018                        u64 dt_wait = tprev - r->last_time;
2019
2020                        if (r->last_state == TASK_RUNNING)
2021                                r->dt_preempt = dt_wait;
2022                        else if (r->last_state == TASK_UNINTERRUPTIBLE)
2023                                r->dt_iowait = dt_wait;
2024                        else
2025                                r->dt_sleep = dt_wait;
2026                }
2027        }
2028
2029        update_stats(&r->run_stats, r->dt_run);
2030
2031        r->total_run_time     += r->dt_run;
2032        r->total_delay_time   += r->dt_delay;
2033        r->total_sleep_time   += r->dt_sleep;
2034        r->total_iowait_time  += r->dt_iowait;
2035        r->total_preempt_time += r->dt_preempt;
2036}
2037
2038static bool is_idle_sample(struct perf_sample *sample,
2039                           struct perf_evsel *evsel)
2040{
2041        /* pid 0 == swapper == idle task */
2042        if (strcmp(perf_evsel__name(evsel), "sched:sched_switch") == 0)
2043                return perf_evsel__intval(evsel, sample, "prev_pid") == 0;
2044
2045        return sample->pid == 0;
2046}
2047
2048static void save_task_callchain(struct perf_sched *sched,
2049                                struct perf_sample *sample,
2050                                struct perf_evsel *evsel,
2051                                struct machine *machine)
2052{
2053        struct callchain_cursor *cursor = &callchain_cursor;
2054        struct thread *thread;
2055
2056        /* want main thread for process - has maps */
2057        thread = machine__findnew_thread(machine, sample->pid, sample->pid);
2058        if (thread == NULL) {
2059                pr_debug("Failed to get thread for pid %d.\n", sample->pid);
2060                return;
2061        }
2062
2063        if (!symbol_conf.use_callchain || sample->callchain == NULL)
2064                return;
2065
2066        if (thread__resolve_callchain(thread, cursor, evsel, sample,
2067                                      NULL, NULL, sched->max_stack + 2) != 0) {
2068                if (verbose > 0)
2069                        pr_err("Failed to resolve callchain. Skipping\n");
2070
2071                return;
2072        }
2073
2074        callchain_cursor_commit(cursor);
2075
2076        while (true) {
2077                struct callchain_cursor_node *node;
2078                struct symbol *sym;
2079
2080                node = callchain_cursor_current(cursor);
2081                if (node == NULL)
2082                        break;
2083
2084                sym = node->sym;
2085                if (sym) {
2086                        if (!strcmp(sym->name, "schedule") ||
2087                            !strcmp(sym->name, "__schedule") ||
2088                            !strcmp(sym->name, "preempt_schedule"))
2089                                sym->ignore = 1;
2090                }
2091
2092                callchain_cursor_advance(cursor);
2093        }
2094}
2095
2096static int init_idle_thread(struct thread *thread)
2097{
2098        struct idle_thread_runtime *itr;
2099
2100        thread__set_comm(thread, idle_comm, 0);
2101
2102        itr = zalloc(sizeof(*itr));
2103        if (itr == NULL)
2104                return -ENOMEM;
2105
2106        init_stats(&itr->tr.run_stats);
2107        callchain_init(&itr->callchain);
2108        callchain_cursor_reset(&itr->cursor);
2109        thread__set_priv(thread, itr);
2110
2111        return 0;
2112}
2113
2114/*
2115 * Track idle stats per cpu by maintaining a local thread
2116 * struct for the idle task on each cpu.
2117 */
2118static int init_idle_threads(int ncpu)
2119{
2120        int i, ret;
2121
2122        idle_threads = zalloc(ncpu * sizeof(struct thread *));
2123        if (!idle_threads)
2124                return -ENOMEM;
2125
2126        idle_max_cpu = ncpu;
2127
2128        /* allocate and initialize a thread struct for each cpu */
2129        for (i = 0; i < ncpu; ++i) {
2130                idle_threads[i] = thread__new(0, 0);
2131                if (idle_threads[i] == NULL)
2132                        return -ENOMEM;
2133
2134                ret = init_idle_thread(idle_threads[i]);
2135                if (ret < 0)
2136                        return ret;
2137        }
2138
2139        return 0;
2140}
2141
2142static void free_idle_threads(void)
2143{
2144        int i;
2145
2146        if (idle_threads == NULL)
2147                return;
2148
2149        for (i = 0; i < idle_max_cpu; ++i) {
2150                if (idle_threads[i])
2151                        thread__delete(idle_threads[i]);
2152        }
2153
2154        free(idle_threads);
2155}
2156
2157static struct thread *get_idle_thread(int cpu)
2158{
2159        /*
2160         * expand/allocate array of pointers to local thread
2161         * structs if needed
2162         */
2163        if ((cpu >= idle_max_cpu) || (idle_threads == NULL)) {
2164                int i, j = __roundup_pow_of_two(cpu+1);
2165                void *p;
2166
2167                p = realloc(idle_threads, j * sizeof(struct thread *));
2168                if (!p)
2169                        return NULL;
2170
2171                idle_threads = (struct thread **) p;
2172                for (i = idle_max_cpu; i < j; ++i)
2173                        idle_threads[i] = NULL;
2174
2175                idle_max_cpu = j;
2176        }
2177
2178        /* allocate a new thread struct if needed */
2179        if (idle_threads[cpu] == NULL) {
2180                idle_threads[cpu] = thread__new(0, 0);
2181
2182                if (idle_threads[cpu]) {
2183                        if (init_idle_thread(idle_threads[cpu]) < 0)
2184                                return NULL;
2185                }
2186        }
2187
2188        return idle_threads[cpu];
2189}
2190
2191static void save_idle_callchain(struct idle_thread_runtime *itr,
2192                                struct perf_sample *sample)
2193{
2194        if (!symbol_conf.use_callchain || sample->callchain == NULL)
2195                return;
2196
2197        callchain_cursor__copy(&itr->cursor, &callchain_cursor);
2198}
2199
2200/*
2201 * handle runtime stats saved per thread
2202 */
2203static struct thread_runtime *thread__init_runtime(struct thread *thread)
2204{
2205        struct thread_runtime *r;
2206
2207        r = zalloc(sizeof(struct thread_runtime));
2208        if (!r)
2209                return NULL;
2210
2211        init_stats(&r->run_stats);
2212        thread__set_priv(thread, r);
2213
2214        return r;
2215}
2216
2217static struct thread_runtime *thread__get_runtime(struct thread *thread)
2218{
2219        struct thread_runtime *tr;
2220
2221        tr = thread__priv(thread);
2222        if (tr == NULL) {
2223                tr = thread__init_runtime(thread);
2224                if (tr == NULL)
2225                        pr_debug("Failed to allocate memory for runtime data.\n");
2226        }
2227
2228        return tr;
2229}
2230
2231static struct thread *timehist_get_thread(struct perf_sched *sched,
2232                                          struct perf_sample *sample,
2233                                          struct machine *machine,
2234                                          struct perf_evsel *evsel)
2235{
2236        struct thread *thread;
2237
2238        if (is_idle_sample(sample, evsel)) {
2239                thread = get_idle_thread(sample->cpu);
2240                if (thread == NULL)
2241                        pr_err("Failed to get idle thread for cpu %d.\n", sample->cpu);
2242
2243        } else {
2244                /* there were samples with tid 0 but non-zero pid */
2245                thread = machine__findnew_thread(machine, sample->pid,
2246                                                 sample->tid ?: sample->pid);
2247                if (thread == NULL) {
2248                        pr_debug("Failed to get thread for tid %d. skipping sample.\n",
2249                                 sample->tid);
2250                }
2251
2252                save_task_callchain(sched, sample, evsel, machine);
2253                if (sched->idle_hist) {
2254                        struct thread *idle;
2255                        struct idle_thread_runtime *itr;
2256
2257                        idle = get_idle_thread(sample->cpu);
2258                        if (idle == NULL) {
2259                                pr_err("Failed to get idle thread for cpu %d.\n", sample->cpu);
2260                                return NULL;
2261                        }
2262
2263                        itr = thread__priv(idle);
2264                        if (itr == NULL)
2265                                return NULL;
2266
2267                        itr->last_thread = thread;
2268
2269                        /* copy the task callchain when switching into idle */
2270                        if (perf_evsel__intval(evsel, sample, "next_pid") == 0)
2271                                save_idle_callchain(itr, sample);
2272                }
2273        }
2274
2275        return thread;
2276}
2277
2278static bool timehist_skip_sample(struct perf_sched *sched,
2279                                 struct thread *thread,
2280                                 struct perf_evsel *evsel,
2281                                 struct perf_sample *sample)
2282{
2283        bool rc = false;
2284
2285        if (thread__is_filtered(thread)) {
2286                rc = true;
2287                sched->skipped_samples++;
2288        }
2289
2290        if (sched->idle_hist) {
2291                if (strcmp(perf_evsel__name(evsel), "sched:sched_switch"))
2292                        rc = true;
2293                else if (perf_evsel__intval(evsel, sample, "prev_pid") != 0 &&
2294                         perf_evsel__intval(evsel, sample, "next_pid") != 0)
2295                        rc = true;
2296        }
2297
2298        return rc;
2299}
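
/*
 * Added note: in idle-hist mode the filter above keeps only
 * sched_switch records that enter or leave idle; any other event type,
 * or a switch where both prev_pid and next_pid are non-zero, is
 * skipped.
 */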
2300
2301static void timehist_print_wakeup_event(struct perf_sched *sched,
2302                                        struct perf_evsel *evsel,
2303                                        struct perf_sample *sample,
2304                                        struct machine *machine,
2305                                        struct thread *awakened)
2306{
2307        struct thread *thread;
2308        char tstr[64];
2309
2310        thread = machine__findnew_thread(machine, sample->pid, sample->tid);
2311        if (thread == NULL)
2312                return;
2313
2314        /* show the wakeup unless both the wakee and the waker are filtered */
2315        if (timehist_skip_sample(sched, thread, evsel, sample) &&
2316            timehist_skip_sample(sched, awakened, evsel, sample)) {
2317                return;
2318        }
2319
2320        timestamp__scnprintf_usec(sample->time, tstr, sizeof(tstr));
2321        printf("%15s [%04d] ", tstr, sample->cpu);
2322        if (sched->show_cpu_visual)
2323                printf(" %*s ", sched->max_cpu + 1, "");
2324
2325        printf(" %-*s ", comm_width, timehist_get_commstr(thread));
2326
2327        /* dt spacer */
2328        printf("  %9s  %9s  %9s ", "", "", "");
2329
2330        printf("awakened: %s", timehist_get_commstr(awakened));
2331
2332        printf("\n");
2333}
2334
2335static int timehist_sched_wakeup_event(struct perf_tool *tool,
2336                                       union perf_event *event __maybe_unused,
2337                                       struct perf_evsel *evsel,
2338                                       struct perf_sample *sample,
2339                                       struct machine *machine)
2340{
2341        struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
2342        struct thread *thread;
2343        struct thread_runtime *tr = NULL;
2344        /* want pid of awakened task not pid in sample */
2345        const u32 pid = perf_evsel__intval(evsel, sample, "pid");
2346
2347        thread = machine__findnew_thread(machine, 0, pid);
2348        if (thread == NULL)
2349                return -1;
2350
2351        tr = thread__get_runtime(thread);
2352        if (tr == NULL)
2353                return -1;
2354
2355        if (tr->ready_to_run == 0)
2356                tr->ready_to_run = sample->time;
2357
2358        /* show wakeups if requested */
2359        if (sched->show_wakeups &&
2360            !perf_time__skip_sample(&sched->ptime, sample->time))
2361                timehist_print_wakeup_event(sched, evsel, sample, machine, thread);
2362
2363        return 0;
2364}
2365
2366static void timehist_print_migration_event(struct perf_sched *sched,
2367                                        struct perf_evsel *evsel,
2368                                        struct perf_sample *sample,
2369                                        struct machine *machine,
2370                                        struct thread *migrated)
2371{
2372        struct thread *thread;
2373        char tstr[64];
2374        u32 max_cpus;
2375        u32 ocpu, dcpu;
2376
2377        if (sched->summary_only)
2378                return;
2379
2380        max_cpus = sched->max_cpu + 1;
2381        ocpu = perf_evsel__intval(evsel, sample, "orig_cpu");
2382        dcpu = perf_evsel__intval(evsel, sample, "dest_cpu");
2383
2384        thread = machine__findnew_thread(machine, sample->pid, sample->tid);
2385        if (thread == NULL)
2386                return;
2387
2388        if (timehist_skip_sample(sched, thread, evsel, sample) &&
2389            timehist_skip_sample(sched, migrated, evsel, sample)) {
2390                return;
2391        }
2392
2393        timestamp__scnprintf_usec(sample->time, tstr, sizeof(tstr));
2394        printf("%15s [%04d] ", tstr, sample->cpu);
2395
2396        if (sched->show_cpu_visual) {
2397                u32 i;
2398                char c;
2399
2400                printf("  ");
2401                for (i = 0; i < max_cpus; ++i) {
2402                        c = (i == sample->cpu) ? 'm' : ' ';
2403                        printf("%c", c);
2404                }
2405                printf("  ");
2406        }
2407
2408        printf(" %-*s ", comm_width, timehist_get_commstr(thread));
2409
2410        /* dt spacer */
2411        printf("  %9s  %9s  %9s ", "", "", "");
2412
2413        printf("migrated: %s", timehist_get_commstr(migrated));
2414        printf(" cpu %d => %d", ocpu, dcpu);
2415
2416        printf("\n");
2417}
2418
2419static int timehist_migrate_task_event(struct perf_tool *tool,
2420                                       union perf_event *event __maybe_unused,
2421                                       struct perf_evsel *evsel,
2422                                       struct perf_sample *sample,
2423                                       struct machine *machine)
2424{
2425        struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
2426        struct thread *thread;
2427        struct thread_runtime *tr = NULL;
2428        /* want pid of migrated task not pid in sample */
2429        const u32 pid = perf_evsel__intval(evsel, sample, "pid");
2430
2431        thread = machine__findnew_thread(machine, 0, pid);
2432        if (thread == NULL)
2433                return -1;
2434
2435        tr = thread__get_runtime(thread);
2436        if (tr == NULL)
2437                return -1;
2438
2439        tr->migrations++;
2440
2441        /* show migrations if requested */
2442        timehist_print_migration_event(sched, evsel, sample, machine, thread);
2443
2444        return 0;
2445}
2446
2447static int timehist_sched_change_event(struct perf_tool *tool,
2448                                       union perf_event *event,
2449                                       struct perf_evsel *evsel,
2450                                       struct perf_sample *sample,
2451                                       struct machine *machine)
2452{
2453        struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
2454        struct perf_time_interval *ptime = &sched->ptime;
2455        struct addr_location al;
2456        struct thread *thread;
2457        struct thread_runtime *tr = NULL;
2458        u64 tprev, t = sample->time;
2459        int rc = 0;
2460        int state = perf_evsel__intval(evsel, sample, "prev_state");
2461
2462
2463        if (machine__resolve(machine, &al, sample) < 0) {
2464                pr_err("problem processing %d event. skipping it\n",
2465                       event->header.type);
2466                rc = -1;
2467                goto out;
2468        }
2469
2470        thread = timehist_get_thread(sched, sample, machine, evsel);
2471        if (thread == NULL) {
2472                rc = -1;
2473                goto out;
2474        }
2475
2476        if (timehist_skip_sample(sched, thread, evsel, sample))
2477                goto out;
2478
2479        tr = thread__get_runtime(thread);
2480        if (tr == NULL) {
2481                rc = -1;
2482                goto out;
2483        }
2484
2485        tprev = perf_evsel__get_time(evsel, sample->cpu);
2486
2487        /*
2488         * If a start time was given:
2489         * - a sample time before the window of interest: skip the sample
2490         * - a tprev before the window: reset it to the start of the window
2491         */
2492        if (ptime->start && ptime->start > t)
2493                goto out;
2494
2495        if (tprev && ptime->start > tprev)
2496                tprev = ptime->start;
2497
2498        /*
2499         * If an end time was given:
2500         * - a previous sched event past the window means we are done
2501         * - a sample time past the window is clamped to the window end,
2502         *   closing out the stats for the range of interest
2503         */
2504        if (ptime->end) {
2505                if (tprev > ptime->end)
2506                        goto out;
2507
2508                if (t > ptime->end)
2509                        t = ptime->end;
2510        }
2511
2512        if (!sched->idle_hist || thread->tid == 0) {
2513                timehist_update_runtime_stats(tr, t, tprev);
2514
2515                if (sched->idle_hist) {
2516                        struct idle_thread_runtime *itr = (void *)tr;
2517                        struct thread_runtime *last_tr;
2518
2519                        BUG_ON(thread->tid != 0);
2520
2521                        if (itr->last_thread == NULL)
2522                                goto out;
2523
2524                        /* add current idle time as last thread's runtime */
2525                        last_tr = thread__get_runtime(itr->last_thread);
2526                        if (last_tr == NULL)
2527                                goto out;
2528
2529                        timehist_update_runtime_stats(last_tr, t, tprev);
2530                        /*
2531                         * Reset the last thread's delta times: they are not
2532                         * updated here and would otherwise show stale values
2533                         * next time. Only total run time and run stats matter.
2534                         */
2535                        last_tr->dt_run = 0;
2536                        last_tr->dt_delay = 0;
2537                        last_tr->dt_sleep = 0;
2538                        last_tr->dt_iowait = 0;
2539                        last_tr->dt_preempt = 0;
2540
2541                        if (itr->cursor.nr)
2542                                callchain_append(&itr->callchain, &itr->cursor, t - tprev);
2543
2544                        itr->last_thread = NULL;
2545                }
2546        }
2547
2548        if (!sched->summary_only)
2549                timehist_print_sample(sched, evsel, sample, &al, thread, t, state);
2550
2551out:
2552        if (sched->hist_time.start == 0 && t >= ptime->start)
2553                sched->hist_time.start = t;
2554        if (ptime->end == 0 || t <= ptime->end)
2555                sched->hist_time.end = t;
2556
2557        if (tr) {
2558                /* time of this sched_switch event becomes last time task seen */
2559                tr->last_time = sample->time;
2560
2561                /* last state is used to determine where to account wait time */
2562                tr->last_state = state;
2563
2564                /* sched out event for task so reset ready to run time */
2565                tr->ready_to_run = 0;
2566        }
2567
2568        perf_evsel__save_time(evsel, sample->time, sample->cpu);
2569
2570        return rc;
2571}
2572
2573static int timehist_sched_switch_event(struct perf_tool *tool,
2574                             union perf_event *event,
2575                             struct perf_evsel *evsel,
2576                             struct perf_sample *sample,
2577                             struct machine *machine __maybe_unused)
2578{
2579        return timehist_sched_change_event(tool, event, evsel, sample, machine);
2580}
2581
2582static int process_lost(struct perf_tool *tool __maybe_unused,
2583                        union perf_event *event,
2584                        struct perf_sample *sample,
2585                        struct machine *machine __maybe_unused)
2586{
2587        char tstr[64];
2588
2589        timestamp__scnprintf_usec(sample->time, tstr, sizeof(tstr));
2590        printf("%15s ", tstr);
2591        printf("lost %" PRIu64 " events on cpu %d\n", event->lost.lost, sample->cpu);
2592
2593        return 0;
2594}
2595
2596
2597static void print_thread_runtime(struct thread *t,
2598                                 struct thread_runtime *r)
2599{
2600        double mean = avg_stats(&r->run_stats);
2601        float stddev;
2602
2603        printf("%*s   %5d  %9" PRIu64 " ",
2604               comm_width, timehist_get_commstr(t), t->ppid,
2605               (u64) r->run_stats.n);
2606
2607        print_sched_time(r->total_run_time, 8);
2608        stddev = rel_stddev_stats(stddev_stats(&r->run_stats), mean);
2609        print_sched_time(r->run_stats.min, 6);
2610        printf(" ");
2611        print_sched_time((u64) mean, 6);
2612        printf(" ");
2613        print_sched_time(r->run_stats.max, 6);
2614        printf("  ");
2615        printf("%5.2f", stddev);
2616        printf("   %5" PRIu64, r->migrations);
2617        printf("\n");
2618}
2619
2620static void print_thread_waittime(struct thread *t,
2621                                  struct thread_runtime *r)
2622{
2623        printf("%*s   %5d  %9" PRIu64 " ",
2624               comm_width, timehist_get_commstr(t), t->ppid,
2625               (u64) r->run_stats.n);
2626
2627        print_sched_time(r->total_run_time, 8);
2628        print_sched_time(r->total_sleep_time, 6);
2629        printf(" ");
2630        print_sched_time(r->total_iowait_time, 6);
2631        printf(" ");
2632        print_sched_time(r->total_preempt_time, 6);
2633        printf(" ");
2634        print_sched_time(r->total_delay_time, 6);
2635        printf("\n");
2636}
2637
2638struct total_run_stats {
2639        struct perf_sched *sched;
2640        u64  sched_count;
2641        u64  task_count;
2642        u64  total_run_time;
2643};
2644
2645static int __show_thread_runtime(struct thread *t, void *priv)
2646{
2647        struct total_run_stats *stats = priv;
2648        struct thread_runtime *r;
2649
2650        if (thread__is_filtered(t))
2651                return 0;
2652
2653        r = thread__priv(t);
2654        if (r && r->run_stats.n) {
2655                stats->task_count++;
2656                stats->sched_count += r->run_stats.n;
2657                stats->total_run_time += r->total_run_time;
2658
2659                if (stats->sched->show_state)
2660                        print_thread_waittime(t, r);
2661                else
2662                        print_thread_runtime(t, r);
2663        }
2664
2665        return 0;
2666}
2667
2668static int show_thread_runtime(struct thread *t, void *priv)
2669{
2670        if (t->dead)
2671                return 0;
2672
2673        return __show_thread_runtime(t, priv);
2674}
2675
2676static int show_deadthread_runtime(struct thread *t, void *priv)
2677{
2678        if (!t->dead)
2679                return 0;
2680
2681        return __show_thread_runtime(t, priv);
2682}
2683
2684static size_t callchain__fprintf_folded(FILE *fp, struct callchain_node *node)
2685{
2686        const char *sep = " <- ";
2687        struct callchain_list *chain;
2688        size_t ret = 0;
2689        char bf[1024];
2690        bool first;
2691
2692        if (node == NULL)
2693                return 0;
2694
2695        ret = callchain__fprintf_folded(fp, node->parent);
2696        first = (ret == 0);
2697
2698        list_for_each_entry(chain, &node->val, list) {
2699                if (chain->ip >= PERF_CONTEXT_MAX)
2700                        continue;
2701                if (chain->ms.sym && chain->ms.sym->ignore)
2702                        continue;
2703                ret += fprintf(fp, "%s%s", first ? "" : sep,
2704                               callchain_list__sym_name(chain, bf, sizeof(bf),
2705                                                        false));
2706                first = false;
2707        }
2708
2709        return ret;
2710}
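
/*
 * Added note: this emits one folded line per chain, parent entries
 * first, e.g. (symbol names illustrative):
 *   secondary_startup <- cpu_startup_entry <- do_idle
 * Context markers (ip >= PERF_CONTEXT_MAX) and symbols flagged as
 * ignored are skipped.
 */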
2711
2712static size_t timehist_print_idlehist_callchain(struct rb_root *root)
2713{
2714        size_t ret = 0;
2715        FILE *fp = stdout;
2716        struct callchain_node *chain;
2717        struct rb_node *rb_node = rb_first(root);
2718
2719        printf("  %16s  %8s  %s\n", "Idle time (msec)", "Count", "Callchains");
2720        printf("  %.16s  %.8s  %.50s\n", graph_dotted_line, graph_dotted_line,
2721               graph_dotted_line);
2722
2723        while (rb_node) {
2724                chain = rb_entry(rb_node, struct callchain_node, rb_node);
2725                rb_node = rb_next(rb_node);
2726
2727                ret += fprintf(fp, "  ");
2728                print_sched_time(chain->hit, 12);
2729                ret += 16;  /* print_sched_time() prints 2nd arg + 4 chars */
2730                ret += fprintf(fp, " %8d  ", chain->count);
2731                ret += callchain__fprintf_folded(fp, chain);
2732                ret += fprintf(fp, "\n");
2733        }
2734
2735        return ret;
2736}
2737
2738static void timehist_print_summary(struct perf_sched *sched,
2739                                   struct perf_session *session)
2740{
2741        struct machine *m = &session->machines.host;
2742        struct total_run_stats totals;
2743        u64 task_count;
2744        struct thread *t;
2745        struct thread_runtime *r;
2746        int i;
2747        u64 hist_time = sched->hist_time.end - sched->hist_time.start;
2748
2749        memset(&totals, 0, sizeof(totals));
2750        totals.sched = sched;
2751
2752        if (sched->idle_hist) {
2753                printf("\nIdle-time summary\n");
2754                printf("%*s  parent  sched-out  ", comm_width, "comm");
2755                printf("  idle-time   min-idle    avg-idle    max-idle  stddev  migrations\n");
2756        } else if (sched->show_state) {
2757                printf("\nWait-time summary\n");
2758                printf("%*s  parent   sched-in  ", comm_width, "comm");
2759                printf("   run-time      sleep      iowait     preempt       delay\n");
2760        } else {
2761                printf("\nRuntime summary\n");
2762                printf("%*s  parent   sched-in  ", comm_width, "comm");
2763                printf("   run-time    min-run     avg-run     max-run  stddev  migrations\n");
2764        }
2765        printf("%*s            (count)  ", comm_width, "");
2766        printf("     (msec)     (msec)      (msec)      (msec)       %s\n",
2767               sched->show_state ? "(msec)" : "%");
2768        printf("%.117s\n", graph_dotted_line);
2769
2770        machine__for_each_thread(m, show_thread_runtime, &totals);
2771        task_count = totals.task_count;
2772        if (!task_count)
2773                printf("<no still running tasks>\n");
2774
2775        printf("\nTerminated tasks:\n");
2776        machine__for_each_thread(m, show_deadthread_runtime, &totals);
2777        if (task_count == totals.task_count)
2778                printf("<no terminated tasks>\n");
2779
2780        /* CPU idle stats not tracked when samples were skipped */
2781        if (sched->skipped_samples && !sched->idle_hist)
2782                return;
2783
2784        printf("\nIdle stats:\n");
2785        for (i = 0; i < idle_max_cpu; ++i) {
2786                t = idle_threads[i];
2787                if (!t)
2788                        continue;
2789
2790                r = thread__priv(t);
2791                if (r && r->run_stats.n) {
2792                        totals.sched_count += r->run_stats.n;
2793                        printf("    CPU %2d idle for ", i);
2794                        print_sched_time(r->total_run_time, 6);
2795                        printf(" msec  (%6.2f%%)\n", 100.0 * r->total_run_time / hist_time);
2796                } else
2797                        printf("    CPU %2d idle entire time window\n", i);
2798        }
2799
2800        if (sched->idle_hist && symbol_conf.use_callchain) {
2801                callchain_param.mode  = CHAIN_FOLDED;
2802                callchain_param.value = CCVAL_PERIOD;
2803
2804                callchain_register_param(&callchain_param);
2805
2806                printf("\nIdle stats by callchain:\n");
2807                for (i = 0; i < idle_max_cpu; ++i) {
2808                        struct idle_thread_runtime *itr;
2809
2810                        t = idle_threads[i];
2811                        if (!t)
2812                                continue;
2813
2814                        itr = thread__priv(t);
2815                        if (itr == NULL)
2816                                continue;
2817
2818                        callchain_param.sort(&itr->sorted_root, &itr->callchain,
2819                                             0, &callchain_param);
2820
2821                        printf("  CPU %2d:", i);
2822                        print_sched_time(itr->tr.total_run_time, 6);
2823                        printf(" msec\n");
2824                        timehist_print_idlehist_callchain(&itr->sorted_root);
2825                        printf("\n");
2826                }
2827        }
2828
2829        printf("\n"
2830               "    Total number of unique tasks: %" PRIu64 "\n"
2831               "Total number of context switches: %" PRIu64 "\n",
2832               totals.task_count, totals.sched_count);
2833
2834        printf("           Total run time (msec): ");
2835        print_sched_time(totals.total_run_time, 2);
2836        printf("\n");
2837
2838        printf("    Total scheduling time (msec): ");
2839        print_sched_time(hist_time, 2);
2840        printf(" (x %d)\n", sched->max_cpu);
2841}
2842
2843typedef int (*sched_handler)(struct perf_tool *tool,
2844                          union perf_event *event,
2845                          struct perf_evsel *evsel,
2846                          struct perf_sample *sample,
2847                          struct machine *machine);
2848
2849static int perf_timehist__process_sample(struct perf_tool *tool,
2850                                         union perf_event *event,
2851                                         struct perf_sample *sample,
2852                                         struct perf_evsel *evsel,
2853                                         struct machine *machine)
2854{
2855        struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
2856        int err = 0;
2857        int this_cpu = sample->cpu;
2858
2859        if (this_cpu > sched->max_cpu)
2860                sched->max_cpu = this_cpu;
2861
2862        if (evsel->handler != NULL) {
2863                sched_handler f = evsel->handler;
2864
2865                err = f(tool, event, evsel, sample, machine);
2866        }
2867
2868        return err;
2869}
2870
2871static int timehist_check_attr(struct perf_sched *sched,
2872                               struct perf_evlist *evlist)
2873{
2874        struct perf_evsel *evsel;
2875        struct evsel_runtime *er;
2876
2877        list_for_each_entry(evsel, &evlist->entries, node) {
2878                er = perf_evsel__get_runtime(evsel);
2879                if (er == NULL) {
2880                        pr_err("Failed to allocate memory for evsel runtime data\n");
2881                        return -1;
2882                }
2883
2884                if (sched->show_callchain &&
2885                    !(evsel->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) {
2886                        pr_info("Samples do not have callchains.\n");
2887                        sched->show_callchain = 0;
2888                        symbol_conf.use_callchain = 0;
2889                }
2890        }
2891
2892        return 0;
2893}
2894
2895static int perf_sched__timehist(struct perf_sched *sched)
2896{
2897        const struct perf_evsel_str_handler handlers[] = {
2898                { "sched:sched_switch",       timehist_sched_switch_event, },
2899                { "sched:sched_wakeup",       timehist_sched_wakeup_event, },
2900                { "sched:sched_wakeup_new",   timehist_sched_wakeup_event, },
2901        };
2902        const struct perf_evsel_str_handler migrate_handlers[] = {
2903                { "sched:sched_migrate_task", timehist_migrate_task_event, },
2904        };
2905        struct perf_data_file file = {
2906                .path = input_name,
2907                .mode = PERF_DATA_MODE_READ,
2908                .force = sched->force,
2909        };
2910
2911        struct perf_session *session;
2912        struct perf_evlist *evlist;
2913        int err = -1;
2914
2915        /*
2916         * event handlers for timehist option
2917         */
2918        sched->tool.sample       = perf_timehist__process_sample;
2919        sched->tool.mmap         = perf_event__process_mmap;
2920        sched->tool.comm         = perf_event__process_comm;
2921        sched->tool.exit         = perf_event__process_exit;
2922        sched->tool.fork         = perf_event__process_fork;
2923        sched->tool.lost         = process_lost;
2924        sched->tool.attr         = perf_event__process_attr;
2925        sched->tool.tracing_data = perf_event__process_tracing_data;
2926        sched->tool.build_id     = perf_event__process_build_id;
2927
2928        sched->tool.ordered_events = true;
2929        sched->tool.ordering_requires_timestamps = true;
2930
2931        symbol_conf.use_callchain = sched->show_callchain;
2932
2933        session = perf_session__new(&file, false, &sched->tool);
2934        if (session == NULL)
2935                return -ENOMEM;
2936
2937        evlist = session->evlist;
2938
2939        symbol__init(&session->header.env);
2940
2941        if (perf_time__parse_str(&sched->ptime, sched->time_str) != 0) {
2942                pr_err("Invalid time string\n");
2943                err = -EINVAL;
2944                goto out;
2945        }
2945
2946        if (timehist_check_attr(sched, evlist) != 0)
2947                goto out;
2948
2949        setup_pager();
2950
2951        /* setup per-evsel handlers */
2952        if (perf_session__set_tracepoints_handlers(session, handlers))
2953                goto out;
2954
2955        /* sched_switch event at a minimum needs to exist */
2956        if (!perf_evlist__find_tracepoint_by_name(session->evlist,
2957                                                  "sched:sched_switch")) {
2958                pr_err("No sched_switch events found. Have you run 'perf sched record'?\n");
2959                goto out;
2960        }
2961
2962        if (sched->show_migrations &&
2963            perf_session__set_tracepoints_handlers(session, migrate_handlers))
2964                goto out;
2965
2966        /* pre-allocate struct for per-CPU idle stats */
2967        sched->max_cpu = session->header.env.nr_cpus_online;
2968        if (sched->max_cpu == 0)
2969                sched->max_cpu = 4;
2970        if (init_idle_threads(sched->max_cpu))
2971                goto out;
2972
2973        /* summary_only implies summary option, but don't overwrite summary if set */
2974        if (sched->summary_only)
2975                sched->summary = sched->summary_only;
2976
2977        if (!sched->summary_only)
2978                timehist_header(sched);
2979
2980        err = perf_session__process_events(session);
2981        if (err) {
2982                pr_err("Failed to process events, error %d\n", err);
2983                goto out;
2984        }
2985
2986        sched->nr_events      = evlist->stats.nr_events[0];
2987        sched->nr_lost_events = evlist->stats.total_lost;
2988        sched->nr_lost_chunks = evlist->stats.nr_events[PERF_RECORD_LOST];
2989
2990        if (sched->summary)
2991                timehist_print_summary(sched, session);
2992
2993out:
2994        free_idle_threads();
2995        perf_session__delete(session);
2996
2997        return err;
2998}
2999
3000
3001static void print_bad_events(struct perf_sched *sched)
3002{
3003        if (sched->nr_unordered_timestamps && sched->nr_timestamps) {
3004                printf("  INFO: %.3f%% unordered timestamps (%ld out of %ld)\n",
3005                        (double)sched->nr_unordered_timestamps/(double)sched->nr_timestamps*100.0,
3006                        sched->nr_unordered_timestamps, sched->nr_timestamps);
3007        }
3008        if (sched->nr_lost_events && sched->nr_events) {
3009                printf("  INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n",
3010                        (double)sched->nr_lost_events/(double)sched->nr_events * 100.0,
3011                        sched->nr_lost_events, sched->nr_events, sched->nr_lost_chunks);
3012        }
3013        if (sched->nr_context_switch_bugs && sched->nr_timestamps) {
3014                printf("  INFO: %.3f%% context switch bugs (%ld out of %ld)",
3015                        (double)sched->nr_context_switch_bugs/(double)sched->nr_timestamps*100.0,
3016                        sched->nr_context_switch_bugs, sched->nr_timestamps);
3017                if (sched->nr_lost_events)
3018                        printf(" (due to lost events?)");
3019                printf("\n");
3020        }
3021}
3022
3023static void __merge_work_atoms(struct rb_root *root, struct work_atoms *data)
3024{
3025        struct rb_node **new = &(root->rb_node), *parent = NULL;
3026        struct work_atoms *this;
3027        const char *comm = thread__comm_str(data->thread), *this_comm;
3028
3029        while (*new) {
3030                int cmp;
3031
3032                this = container_of(*new, struct work_atoms, node);
3033                parent = *new;
3034
3035                this_comm = thread__comm_str(this->thread);
3036                cmp = strcmp(comm, this_comm);
3037                if (cmp > 0) {
3038                        new = &((*new)->rb_left);
3039                } else if (cmp < 0) {
3040                        new = &((*new)->rb_right);
3041                } else {
3042                        this->num_merged++;
3043                        this->total_runtime += data->total_runtime;
3044                        this->nb_atoms += data->nb_atoms;
3045                        this->total_lat += data->total_lat;
3046                        list_splice(&data->work_list, &this->work_list);
3047                        if (this->max_lat < data->max_lat) {
3048                                this->max_lat = data->max_lat;
3049                                this->max_lat_at = data->max_lat_at;
3050                        }
3051                        zfree(&data);
3052                        return;
3053                }
3054        }
3055
3056        data->num_merged++;
3057        rb_link_node(&data->node, parent, new);
3058        rb_insert_color(&data->node, root);
3059}
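
/*
 * Added note: atoms are merged purely by comm string, so e.g. the many
 * short-lived "cc1" processes of a parallel build collapse into one
 * row, and num_merged lets the latency report show how many threads
 * were folded together.
 */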
3060
3061static void perf_sched__merge_lat(struct perf_sched *sched)
3062{
3063        struct work_atoms *data;
3064        struct rb_node *node;
3065
3066        if (sched->skip_merge)
3067                return;
3068
3069        while ((node = rb_first(&sched->atom_root))) {
3070                rb_erase(node, &sched->atom_root);
3071                data = rb_entry(node, struct work_atoms, node);
3072                __merge_work_atoms(&sched->merged_atom_root, data);
3073        }
3074}
3075
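/*
 * 'perf sched latency': read the recorded events, merge and sort the
 * per-task work atoms, then print one row per task followed by the
 * totals and any bad-event diagnostics.
 */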
3076static int perf_sched__lat(struct perf_sched *sched)
3077{
3078        struct rb_node *next;
3079
3080        setup_pager();
3081
3082        if (perf_sched__read_events(sched))
3083                return -1;
3084
3085        perf_sched__merge_lat(sched);
3086        perf_sched__sort_lat(sched);
3087
3088        printf("\n -----------------------------------------------------------------------------------------------------------------\n");
3089        printf("  Task                  |   Runtime ms  | Switches | Average delay ms | Maximum delay ms | Maximum delay at       |\n");
3090        printf(" -----------------------------------------------------------------------------------------------------------------\n");
3091
3092        next = rb_first(&sched->sorted_atom_root);
3093
3094        while (next) {
3095                struct work_atoms *work_list;
3096
3097                work_list = rb_entry(next, struct work_atoms, node);
3098                output_lat_thread(sched, work_list);
3099                next = rb_next(next);
3100                thread__zput(work_list->thread);
3101        }
3102
3103        printf(" -----------------------------------------------------------------------------------------------------------------\n");
3104        printf("  TOTAL:                |%11.3f ms |%9" PRIu64 " |\n",
3105                (double)sched->all_runtime / NSEC_PER_MSEC, sched->all_count);
3106
3107        printf(" ---------------------------------------------------\n");
3108
3109        print_bad_events(sched);
3110        printf("\n");
3111
3112        return 0;
3113}
3114
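/*
 * Prepare CPU state for 'perf sched map': record the number of
 * configured CPUs, allocate the compact-mode CPU array if --compact
 * was given, and parse the optional --cpus filter list.
 */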
3115static int setup_map_cpus(struct perf_sched *sched)
3116{
3117        struct cpu_map *map;
3118
3119        sched->max_cpu  = sysconf(_SC_NPROCESSORS_CONF);
3120
3121        if (sched->map.comp) {
3122                sched->map.comp_cpus = zalloc(sched->max_cpu * sizeof(int));
3123                if (!sched->map.comp_cpus)
3124                        return -1;
3125        }
3126
3127        if (!sched->map.cpus_str)
3128                return 0;
3129
3130        map = cpu_map__new(sched->map.cpus_str);
3131        if (!map) {
3132                pr_err("failed to get cpus map from %s\n", sched->map.cpus_str);
3133                return -1;
3134        }
3135
3136        sched->map.cpus = map;
3137        return 0;
3138}
3139
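/*
 * setup_color_pids()/setup_color_cpus() parse the --color-pids and
 * --color-cpus arguments into thread/cpu maps used to highlight the
 * matching entries in the map output; both are optional.
 */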
3140static int setup_color_pids(struct perf_sched *sched)
3141{
3142        struct thread_map *map;
3143
3144        if (!sched->map.color_pids_str)
3145                return 0;
3146
3147        map = thread_map__new_by_tid_str(sched->map.color_pids_str);
3148        if (!map) {
3149                pr_err("failed to get thread map from %s\n", sched->map.color_pids_str);
3150                return -1;
3151        }
3152
3153        sched->map.color_pids = map;
3154        return 0;
3155}
3156
3157static int setup_color_cpus(struct perf_sched *sched)
3158{
3159        struct cpu_map *map;
3160
3161        if (!sched->map.color_cpus_str)
3162                return 0;
3163
3164        map = cpu_map__new(sched->map.color_cpus_str);
3165        if (!map) {
3166                pr_err("failed to get cpus map from %s\n", sched->map.color_cpus_str);
3167                return -1;
3168        }
3169
3170        sched->map.color_cpus = map;
3171        return 0;
3172}
3173
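/*
 * 'perf sched map': set up the CPU and coloring maps, then read the
 * events; the map itself is rendered from the switch-event handler
 * (map_ops, wired up in cmd_sched()) as events are processed.
 */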
3174static int perf_sched__map(struct perf_sched *sched)
3175{
3176        if (setup_map_cpus(sched))
3177                return -1;
3178
3179        if (setup_color_pids(sched))
3180                return -1;
3181
3182        if (setup_color_cpus(sched))
3183                return -1;
3184
3185        setup_pager();
3186        if (perf_sched__read_events(sched))
3187                return -1;
3188        print_bad_events(sched);
3189        return 0;
3190}
3191
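/*
 * 'perf sched replay': calibrate run/sleep measurement overhead,
 * rebuild the recorded tasks and their event sequences from the trace,
 * then re-execute the whole workload replay_repeat times.
 */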
3192static int perf_sched__replay(struct perf_sched *sched)
3193{
3194        unsigned long i;
3195
3196        calibrate_run_measurement_overhead(sched);
3197        calibrate_sleep_measurement_overhead(sched);
3198
3199        test_calibrations(sched);
3200
3201        if (perf_sched__read_events(sched))
3202                return -1;
3203
3204        printf("nr_run_events:        %lu\n", sched->nr_run_events);
3205        printf("nr_sleep_events:      %lu\n", sched->nr_sleep_events);
3206        printf("nr_wakeup_events:     %lu\n", sched->nr_wakeup_events);
3207
3208        if (sched->targetless_wakeups)
3209                printf("target-less wakeups:  %lu\n", sched->targetless_wakeups);
3210        if (sched->multitarget_wakeups)
3211                printf("multi-target wakeups: %lu\n", sched->multitarget_wakeups);
3212        if (sched->nr_run_events_optimized)
3213                printf("run atoms optimized: %lu\n",
3214                        sched->nr_run_events_optimized);
3215
3216        print_task_traces(sched);
3217        add_cross_task_wakeups(sched);
3218
3219        create_tasks(sched);
3220        printf("------------------------------------------------------------\n");
3221        for (i = 0; i < sched->replay_repeat; i++)
3222                run_one_test(sched);
3223
3224        return 0;
3225}
3226
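/*
 * Split the --sort argument on commas/spaces and register each key as
 * a sort dimension; an unknown key aborts with a usage message. "pid"
 * is always added to cmp_pid, the key list used when inserting atoms.
 */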
3227static void setup_sorting(struct perf_sched *sched, const struct option *options,
3228                          const char * const usage_msg[])
3229{
3230        char *tmp, *tok, *str = strdup(sched->sort_order);
3231
3232        for (tok = strtok_r(str, ", ", &tmp);
3233                        tok; tok = strtok_r(NULL, ", ", &tmp)) {
3234                if (sort_dimension__add(tok, &sched->sort_list) < 0) {
3235                        usage_with_options_msg(usage_msg, options,
3236                                        "Unknown --sort key: `%s'", tok);
3237                }
3238        }
3239
3240        free(str);
3241
3242        sort_dimension__add("pid", &sched->cmp_pid);
3243}
3244
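/*
 * 'perf sched record': prepend the fixed record options and scheduler
 * tracepoints in record_args[] to the user-supplied arguments and hand
 * the combined command line to 'perf record'.
 */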
3245static int __cmd_record(int argc, const char **argv)
3246{
3247        unsigned int rec_argc, i, j;
3248        const char **rec_argv;
3249        const char * const record_args[] = {
3250                "record",
3251                "-a",
3252                "-R",
3253                "-m", "1024",
3254                "-c", "1",
3255                "-e", "sched:sched_switch",
3256                "-e", "sched:sched_stat_wait",
3257                "-e", "sched:sched_stat_sleep",
3258                "-e", "sched:sched_stat_iowait",
3259                "-e", "sched:sched_stat_runtime",
3260                "-e", "sched:sched_process_fork",
3261                "-e", "sched:sched_wakeup",
3262                "-e", "sched:sched_wakeup_new",
3263                "-e", "sched:sched_migrate_task",
3264        };
3265
3266        rec_argc = ARRAY_SIZE(record_args) + argc - 1;
3267        rec_argv = calloc(rec_argc + 1, sizeof(char *));
3268
3269        if (rec_argv == NULL)
3270                return -ENOMEM;
3271
3272        for (i = 0; i < ARRAY_SIZE(record_args); i++)
3273                rec_argv[i] = strdup(record_args[i]);
3274
3275        for (j = 1; j < (unsigned int)argc; j++, i++)
3276                rec_argv[i] = argv[j];
3277
3278        BUG_ON(i != rec_argc);
3279
3280        return cmd_record(i, rec_argv);
3281}
3282
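/*
 * Entry point for 'perf sched': parse the common options, then
 * dispatch to the record/latency/map/replay/timehist sub-commands
 * ('script' is simply aliased to 'perf script').
 */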
3283int cmd_sched(int argc, const char **argv)
3284{
3285        const char default_sort_order[] = "avg, max, switch, runtime";
3286        struct perf_sched sched = {
3287                .tool = {
3288                        .sample          = perf_sched__process_tracepoint_sample,
3289                        .comm            = perf_event__process_comm,
3290                        .namespaces      = perf_event__process_namespaces,
3291                        .lost            = perf_event__process_lost,
3292                        .fork            = perf_sched__process_fork_event,
3293                        .ordered_events = true,
3294                },
3295                .cmp_pid              = LIST_HEAD_INIT(sched.cmp_pid),
3296                .sort_list            = LIST_HEAD_INIT(sched.sort_list),
3297                .start_work_mutex     = PTHREAD_MUTEX_INITIALIZER,
3298                .work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER,
3299                .sort_order           = default_sort_order,
3300                .replay_repeat        = 10,
3301                .profile_cpu          = -1,
3302                .next_shortname1      = 'A',
3303                .next_shortname2      = '0',
3304                .skip_merge           = 0,
3305                .show_callchain       = 1,
3306                .max_stack            = 5,
3307        };
3308        const struct option sched_options[] = {
3309        OPT_STRING('i', "input", &input_name, "file",
3310                    "input file name"),
3311        OPT_INCR('v', "verbose", &verbose,
3312                    "be more verbose (show symbol address, etc)"),
3313        OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
3314                    "dump raw trace in ASCII"),
3315        OPT_BOOLEAN('f', "force", &sched.force, "don't complain, do it"),
3316        OPT_END()
3317        };
3318        const struct option latency_options[] = {
3319        OPT_STRING('s', "sort", &sched.sort_order, "key[,key2...]",
3320                   "sort by key(s): runtime, switch, avg, max"),
3321        OPT_INTEGER('C', "CPU", &sched.profile_cpu,
3322                    "CPU to profile on"),
3323        OPT_BOOLEAN('p', "pids", &sched.skip_merge,
3324                    "latency stats per pid instead of per comm"),
3325        OPT_PARENT(sched_options)
3326        };
3327        const struct option replay_options[] = {
3328        OPT_UINTEGER('r', "repeat", &sched.replay_repeat,
3329                     "repeat the workload replay N times (-1: infinite)"),
3330        OPT_PARENT(sched_options)
3331        };
3332        const struct option map_options[] = {
3333        OPT_BOOLEAN(0, "compact", &sched.map.comp,
3334                    "map output in compact mode"),
3335        OPT_STRING(0, "color-pids", &sched.map.color_pids_str, "pids",
3336                   "highlight given pids in map"),
3337        OPT_STRING(0, "color-cpus", &sched.map.color_cpus_str, "cpus",
3338                    "highlight given CPUs in map"),
3339        OPT_STRING(0, "cpus", &sched.map.cpus_str, "cpus",
3340                    "display given CPUs in map"),
3341        OPT_PARENT(sched_options)
3342        };
3343        const struct option timehist_options[] = {
3344        OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
3345                   "file", "vmlinux pathname"),
3346        OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name,
3347                   "file", "kallsyms pathname"),
3348        OPT_BOOLEAN('g', "call-graph", &sched.show_callchain,
3349                    "Display call chains if present (default on)"),
3350        OPT_UINTEGER(0, "max-stack", &sched.max_stack,
3351                   "Maximum number of functions to display in a backtrace."),
3352        OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
3353                    "Look for files with symbols relative to this directory"),
3354        OPT_BOOLEAN('s', "summary", &sched.summary_only,
3355                    "Show only a summary of scheduling events with statistics"),
3356        OPT_BOOLEAN('S', "with-summary", &sched.summary,
3357                    "Show all scheduling events and a summary with statistics"),
3358        OPT_BOOLEAN('w', "wakeups", &sched.show_wakeups, "Show wakeup events"),
3359        OPT_BOOLEAN('n', "next", &sched.show_next, "Show next task"),
3360        OPT_BOOLEAN('M', "migrations", &sched.show_migrations, "Show migration events"),
3361        OPT_BOOLEAN('V', "cpu-visual", &sched.show_cpu_visual, "Add CPU visual"),
3362        OPT_BOOLEAN('I', "idle-hist", &sched.idle_hist, "Show idle events only"),
3363        OPT_STRING(0, "time", &sched.time_str, "str",
3364                   "Time span for analysis (start,stop)"),
3365        OPT_BOOLEAN(0, "state", &sched.show_state, "Show task state when sched-out"),
3366        OPT_PARENT(sched_options)
3367        };
3368
3369        const char * const latency_usage[] = {
3370                "perf sched latency [<options>]",
3371                NULL
3372        };
3373        const char * const replay_usage[] = {
3374                "perf sched replay [<options>]",
3375                NULL
3376        };
3377        const char * const map_usage[] = {
3378                "perf sched map [<options>]",
3379                NULL
3380        };
3381        const char * const timehist_usage[] = {
3382                "perf sched timehist [<options>]",
3383                NULL
3384        };
3385        const char *const sched_subcommands[] = { "record", "latency", "map",
3386                                                  "replay", "script",
3387                                                  "timehist", NULL };
3388        const char *sched_usage[] = {
3389                NULL,
3390                NULL
3391        };
3392        struct trace_sched_handler lat_ops  = {
3393                .wakeup_event       = latency_wakeup_event,
3394                .switch_event       = latency_switch_event,
3395                .runtime_event      = latency_runtime_event,
3396                .migrate_task_event = latency_migrate_task_event,
3397        };
3398        struct trace_sched_handler map_ops  = {
3399                .switch_event       = map_switch_event,
3400        };
3401        struct trace_sched_handler replay_ops  = {
3402                .wakeup_event       = replay_wakeup_event,
3403                .switch_event       = replay_switch_event,
3404                .fork_event         = replay_fork_event,
3405        };
3406        unsigned int i;
3407
3408        for (i = 0; i < ARRAY_SIZE(sched.curr_pid); i++)
3409                sched.curr_pid[i] = -1;
3410
3411        argc = parse_options_subcommand(argc, argv, sched_options, sched_subcommands,
3412                                        sched_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3413        if (!argc)
3414                usage_with_options(sched_usage, sched_options);
3415
3416        /*
3417         * Aliased to 'perf script' for now:
3418         */
3419        if (!strcmp(argv[0], "script"))
3420                return cmd_script(argc, argv);
3421
3422        if (!strncmp(argv[0], "rec", 3)) {
3423                return __cmd_record(argc, argv);
3424        } else if (!strncmp(argv[0], "lat", 3)) {
3425                sched.tp_handler = &lat_ops;
3426                if (argc > 1) {
3427                        argc = parse_options(argc, argv, latency_options, latency_usage, 0);
3428                        if (argc)
3429                                usage_with_options(latency_usage, latency_options);
3430                }
3431                setup_sorting(&sched, latency_options, latency_usage);
3432                return perf_sched__lat(&sched);
3433        } else if (!strcmp(argv[0], "map")) {
3434                if (argc) {
3435                        argc = parse_options(argc, argv, map_options, map_usage, 0);
3436                        if (argc)
3437                                usage_with_options(map_usage, map_options);
3438                }
3439                sched.tp_handler = &map_ops;
3440                setup_sorting(&sched, latency_options, latency_usage);
3441                return perf_sched__map(&sched);
3442        } else if (!strncmp(argv[0], "rep", 3)) {
3443                sched.tp_handler = &replay_ops;
3444                if (argc) {
3445                        argc = parse_options(argc, argv, replay_options, replay_usage, 0);
3446                        if (argc)
3447                                usage_with_options(replay_usage, replay_options);
3448                }
3449                return perf_sched__replay(&sched);
3450        } else if (!strcmp(argv[0], "timehist")) {
3451                if (argc) {
3452                        argc = parse_options(argc, argv, timehist_options,
3453                                             timehist_usage, 0);
3454                        if (argc)
3455                                usage_with_options(timehist_usage, timehist_options);
3456                }
3457                if ((sched.show_wakeups || sched.show_next) &&
3458                    sched.summary_only) {
3459                        pr_err(" Error: -s and -[n|w] are mutually exclusive.\n");
3460                        parse_options_usage(timehist_usage, timehist_options, "s", true);
3461                        if (sched.show_wakeups)
3462                                parse_options_usage(NULL, timehist_options, "w", true);
3463                        if (sched.show_next)
3464                                parse_options_usage(NULL, timehist_options, "n", true);
3465                        return -EINVAL;
3466                }
3467
3468                return perf_sched__timehist(&sched);
3469        } else {
3470                usage_with_options(sched_usage, sched_options);
3471        }
3472
3473        return 0;
3474}
3475