linux/tools/perf/util/s390-cpumsf.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright IBM Corp. 2018
   4 * Auxtrace support for s390 CPU-Measurement Sampling Facility
   5 *
   6 * Author(s):  Thomas Richter <tmricht@linux.ibm.com>
   7 *
   8 * Auxiliary traces are collected during 'perf record' using rbd000 event.
   9 * Several PERF_RECORD_XXX are generated during recording:
  10 *
  11 * PERF_RECORD_AUX:
  12 *      Records that new data landed in the AUX buffer part.
  13 * PERF_RECORD_AUXTRACE:
  14 *      Defines auxtrace data. Followed by the actual data. The contents of
  15 *      the auxtrace data is dependent on the event and the CPU.
  16 *      This record is generated by perf record command. For details
  17 *      see Documentation/perf.data-file-format.txt.
  18 * PERF_RECORD_AUXTRACE_INFO:
 *      Defines a table of contents for PERF_RECORD_AUXTRACE records. This
  20 *      record is generated during 'perf record' command. Each record contains
  21 *      up to 256 entries describing offset and size of the AUXTRACE data in the
  22 *      perf.data file.
  23 * PERF_RECORD_AUXTRACE_ERROR:
  24 *      Indicates an error during AUXTRACE collection such as buffer overflow.
  25 * PERF_RECORD_FINISHED_ROUND:
  26 *      Perf events are not necessarily in time stamp order, as they can be
  27 *      collected in parallel on different CPUs. If the events should be
  28 *      processed in time order they need to be sorted first.
  29 *      Perf report guarantees that there is no reordering over a
  30 *      PERF_RECORD_FINISHED_ROUND boundary event. All perf records with a
  31 *      time stamp lower than this record are processed (and displayed) before
  32 *      the succeeding perf record are processed.
  33 *
  34 * These records are evaluated during perf report command.
  35 *
  36 * 1. PERF_RECORD_AUXTRACE_INFO is used to set up the infrastructure for
  37 * auxiliary trace data processing. See s390_cpumsf_process_auxtrace_info()
  38 * below.
  39 * Auxiliary trace data is collected per CPU. To merge the data into the report
  40 * an auxtrace_queue is created for each CPU. It is assumed that the auxtrace
  41 * data is in ascending order.
  42 *
  43 * Each queue has a double linked list of auxtrace_buffers. This list contains
  44 * the offset and size of a CPU's auxtrace data. During auxtrace processing
  45 * the data portion is mmap()'ed.
  46 *
  47 * To sort the queues in chronological order, all queue access is controlled
  48 * by the auxtrace_heap. This is basically a stack, each stack element has two
  49 * entries, the queue number and a time stamp. However the stack is sorted by
 * the time stamps. The highest time stamp is at the bottom, the lowest
  51 * (nearest) time stamp is at the top. That sort order is maintained at all
  52 * times!
  53 *
  54 * After the auxtrace infrastructure has been setup, the auxtrace queues are
  55 * filled with data (offset/size pairs) and the auxtrace_heap is populated.
  56 *
  57 * 2. PERF_RECORD_XXX processing triggers access to the auxtrace_queues.
  58 * Each record is handled by s390_cpumsf_process_event(). The time stamp of
  59 * the perf record is compared with the time stamp located on the auxtrace_heap
  60 * top element. If that time stamp is lower than the time stamp from the
  61 * record sample, the auxtrace queues will be processed. As auxtrace queues
  62 * control many auxtrace_buffers and each buffer can be quite large, the
  63 * auxtrace buffer might be processed only partially. In this case the
  64 * position in the auxtrace_buffer of that queue is remembered and the time
  65 * stamp of the last processed entry of the auxtrace_buffer replaces the
  66 * current auxtrace_heap top.
  67 *
 * 3. Auxtrace_queues might run out of data and are fed by the
  69 * PERF_RECORD_AUXTRACE handling, see s390_cpumsf_process_auxtrace_event().
  70 *
  71 * Event Generation
  72 * Each sampling-data entry in the auxiliary trace data generates a perf sample.
  73 * This sample is filled
  74 * with data from the auxtrace such as PID/TID, instruction address, CPU state,
  75 * etc. This sample is processed with perf_session__deliver_synth_event() to
  76 * be included into the GUI.
  77 *
 * 4. PERF_RECORD_FINISHED_ROUND event is used to process all the remaining
 * auxiliary trace entries until the auxtrace_heap top reaches the time stamp
 * of this record. This is triggered by ordered_event->deliver().
  81 *
  82 *
  83 * Perf event processing.
  84 * Event processing of PERF_RECORD_XXX entries relies on time stamp entries.
  85 * This is the function call sequence:
  86 *
  87 * __cmd_report()
  88 * |
  89 * perf_session__process_events()
  90 * |
  91 * __perf_session__process_events()
  92 * |
  93 * perf_session__process_event()
 * |  This function splits the PERF_RECORD_XXX records.
 * |  - Those generated by perf record command (type number equal or higher
 * |    than PERF_RECORD_USER_TYPE_START) are handled by
 * |    perf_session__process_user_event() (see below)
  98 * |  - Those generated by the kernel are handled by
  99 * |    evlist__parse_sample_timestamp()
 100 * |
 101 * evlist__parse_sample_timestamp()
 102 * |  Extract time stamp from sample data.
 103 * |
 104 * perf_session__queue_event()
 105 * |  If timestamp is positive the sample is entered into an ordered_event
 106 * |  list, sort order is the timestamp. The event processing is deferred until
 107 * |  later (see perf_session__process_user_event()).
 108 * |  Other timestamps (0 or -1) are handled immediately by
 109 * |  perf_session__deliver_event(). These are events generated at start up
 110 * |  of command perf record. They create PERF_RECORD_COMM and PERF_RECORD_MMAP*
 111 * |  records. They are needed to create a list of running processes and its
 112 * |  memory mappings and layout. They are needed at the beginning to enable
 113 * |  command perf report to create process trees and memory mappings.
 114 * |
 115 * perf_session__deliver_event()
 116 * |  Delivers a PERF_RECORD_XXX entry for handling.
 117 * |
 118 * auxtrace__process_event()
 119 * |  The timestamp of the PERF_RECORD_XXX entry is taken to correlate with
 120 * |  time stamps from the auxiliary trace buffers. This enables
 121 * |  synchronization between auxiliary trace data and the events on the
 122 * |  perf.data file.
 123 * |
 124 * machine__deliver_event()
 125 * |  Handles the PERF_RECORD_XXX event. This depends on the record type.
 126 *    It might update the process tree, update a process memory map or enter
 127 *    a sample with IP and call back chain data into GUI data pool.
 128 *
 129 *
 130 * Deferred processing determined by perf_session__process_user_event() is
 131 * finally processed when a PERF_RECORD_FINISHED_ROUND is encountered. These
 132 * are generated during command perf record.
 133 * The timestamp of PERF_RECORD_FINISHED_ROUND event is taken to process all
 134 * PERF_RECORD_XXX entries stored in the ordered_event list. This list was
 135 * built up while reading the perf.data file.
 136 * Each event is now processed by calling perf_session__deliver_event().
 137 * This enables time synchronization between the data in the perf.data file and
 138 * the data in the auxiliary trace buffers.
 139 */
 140
 141#include <endian.h>
 142#include <errno.h>
 143#include <byteswap.h>
 144#include <inttypes.h>
 145#include <linux/kernel.h>
 146#include <linux/types.h>
 147#include <linux/bitops.h>
 148#include <linux/log2.h>
 149#include <linux/zalloc.h>
 150
 151#include <sys/stat.h>
 152#include <sys/types.h>
 153
 154#include "color.h"
 155#include "evsel.h"
 156#include "evlist.h"
 157#include "machine.h"
 158#include "session.h"
 159#include "tool.h"
 160#include "debug.h"
 161#include "auxtrace.h"
 162#include "s390-cpumsf.h"
 163#include "s390-cpumsf-kernel.h"
 164#include "s390-cpumcf-kernel.h"
 165#include "config.h"
 166
 167struct s390_cpumsf {
 168        struct auxtrace         auxtrace;
 169        struct auxtrace_queues  queues;
 170        struct auxtrace_heap    heap;
 171        struct perf_session     *session;
 172        struct machine          *machine;
 173        u32                     auxtrace_type;
 174        u32                     pmu_type;
 175        u16                     machine_type;
 176        bool                    data_queued;
 177        bool                    use_logfile;
 178        char                    *logdir;
 179};
 180
 181struct s390_cpumsf_queue {
 182        struct s390_cpumsf      *sf;
 183        unsigned int            queue_nr;
 184        struct auxtrace_buffer  *buffer;
 185        int                     cpu;
 186        FILE                    *logfile;
 187        FILE                    *logfile_ctr;
 188};
 189
 190/* Check if the raw data should be dumped to file. If this is the case and
 191 * the file to dump to has not been opened for writing, do so.
 192 *
 193 * Return 0 on success and greater zero on error so processing continues.
 194 */
 195static int s390_cpumcf_dumpctr(struct s390_cpumsf *sf,
 196                               struct perf_sample *sample)
 197{
 198        struct s390_cpumsf_queue *sfq;
 199        struct auxtrace_queue *q;
 200        int rc = 0;
 201
 202        if (!sf->use_logfile || sf->queues.nr_queues <= sample->cpu)
 203                return rc;
 204
 205        q = &sf->queues.queue_array[sample->cpu];
 206        sfq = q->priv;
 207        if (!sfq)               /* Queue not yet allocated */
 208                return rc;
 209
 210        if (!sfq->logfile_ctr) {
 211                char *name;
 212
 213                rc = (sf->logdir)
 214                        ? asprintf(&name, "%s/aux.ctr.%02x",
 215                                 sf->logdir, sample->cpu)
 216                        : asprintf(&name, "aux.ctr.%02x", sample->cpu);
 217                if (rc > 0)
 218                        sfq->logfile_ctr = fopen(name, "w");
 219                if (sfq->logfile_ctr == NULL) {
 220                        pr_err("Failed to open counter set log file %s, "
 221                               "continue...\n", name);
 222                        rc = 1;
 223                }
 224                free(name);
 225        }
 226
 227        if (sfq->logfile_ctr) {
 228                /* See comment above for -4 */
 229                size_t n = fwrite(sample->raw_data, sample->raw_size - 4, 1,
 230                                  sfq->logfile_ctr);
 231                if (n != 1) {
 232                        pr_err("Failed to write counter set data\n");
 233                        rc = 1;
 234                }
 235        }
 236        return rc;
 237}
 238
 239/* Display s390 CPU measurement facility basic-sampling data entry
 240 * Data written on s390 in big endian byte order and contains bit
 241 * fields across byte boundaries.
 242 */
 243static bool s390_cpumsf_basic_show(const char *color, size_t pos,
 244                                   struct hws_basic_entry *basicp)
 245{
 246        struct hws_basic_entry *basic = basicp;
 247#if __BYTE_ORDER == __LITTLE_ENDIAN
 248        struct hws_basic_entry local;
 249        unsigned long long word = be64toh(*(unsigned long long *)basicp);
 250
 251        memset(&local, 0, sizeof(local));
 252        local.def = be16toh(basicp->def);
 253        local.prim_asn = word & 0xffff;
 254        local.CL = word >> 30 & 0x3;
 255        local.I = word >> 32 & 0x1;
 256        local.AS = word >> 33 & 0x3;
 257        local.P = word >> 35 & 0x1;
 258        local.W = word >> 36 & 0x1;
 259        local.T = word >> 37 & 0x1;
 260        local.U = word >> 40 & 0xf;
 261        local.ia = be64toh(basicp->ia);
 262        local.gpp = be64toh(basicp->gpp);
 263        local.hpp = be64toh(basicp->hpp);
 264        basic = &local;
 265#endif
 266        if (basic->def != 1) {
 267                pr_err("Invalid AUX trace basic entry [%#08zx]\n", pos);
 268                return false;
 269        }
 270        color_fprintf(stdout, color, "    [%#08zx] Basic   Def:%04x Inst:%#04x"
 271                      " %c%c%c%c AS:%d ASN:%#04x IA:%#018llx\n"
 272                      "\t\tCL:%d HPP:%#018llx GPP:%#018llx\n",
 273                      pos, basic->def, basic->U,
 274                      basic->T ? 'T' : ' ',
 275                      basic->W ? 'W' : ' ',
 276                      basic->P ? 'P' : ' ',
 277                      basic->I ? 'I' : ' ',
 278                      basic->AS, basic->prim_asn, basic->ia, basic->CL,
 279                      basic->hpp, basic->gpp);
 280        return true;
 281}
 282
 283/* Display s390 CPU measurement facility diagnostic-sampling data entry.
 284 * Data written on s390 in big endian byte order and contains bit
 285 * fields across byte boundaries.
 286 */
 287static bool s390_cpumsf_diag_show(const char *color, size_t pos,
 288                                  struct hws_diag_entry *diagp)
 289{
 290        struct hws_diag_entry *diag = diagp;
 291#if __BYTE_ORDER == __LITTLE_ENDIAN
 292        struct hws_diag_entry local;
 293        unsigned long long word = be64toh(*(unsigned long long *)diagp);
 294
 295        local.def = be16toh(diagp->def);
 296        local.I = word >> 32 & 0x1;
 297        diag = &local;
 298#endif
 299        if (diag->def < S390_CPUMSF_DIAG_DEF_FIRST) {
 300                pr_err("Invalid AUX trace diagnostic entry [%#08zx]\n", pos);
 301                return false;
 302        }
 303        color_fprintf(stdout, color, "    [%#08zx] Diag    Def:%04x %c\n",
 304                      pos, diag->def, diag->I ? 'I' : ' ');
 305        return true;
 306}
 307
 308/* Return TOD timestamp contained in an trailer entry */
 309static unsigned long long trailer_timestamp(struct hws_trailer_entry *te,
 310                                            int idx)
 311{
 312        /* te->t set: TOD in STCKE format, bytes 8-15
 313         * to->t not set: TOD in STCK format, bytes 0-7
 314         */
 315        unsigned long long ts;
 316
 317        memcpy(&ts, &te->timestamp[idx], sizeof(ts));
 318        return be64toh(ts);
 319}
 320
 321/* Display s390 CPU measurement facility trailer entry */
 322static bool s390_cpumsf_trailer_show(const char *color, size_t pos,
 323                                     struct hws_trailer_entry *te)
 324{
 325#if __BYTE_ORDER == __LITTLE_ENDIAN
 326        struct hws_trailer_entry local;
 327        const unsigned long long flags = be64toh(te->flags);
 328
 329        memset(&local, 0, sizeof(local));
 330        local.f = flags >> 63 & 0x1;
 331        local.a = flags >> 62 & 0x1;
 332        local.t = flags >> 61 & 0x1;
 333        local.bsdes = be16toh((flags >> 16 & 0xffff));
 334        local.dsdes = be16toh((flags & 0xffff));
 335        memcpy(&local.timestamp, te->timestamp, sizeof(te->timestamp));
 336        local.overflow = be64toh(te->overflow);
 337        local.clock_base = be64toh(te->progusage[0]) >> 63 & 1;
 338        local.progusage2 = be64toh(te->progusage2);
 339        te = &local;
 340#endif
 341        if (te->bsdes != sizeof(struct hws_basic_entry)) {
 342                pr_err("Invalid AUX trace trailer entry [%#08zx]\n", pos);
 343                return false;
 344        }
 345        color_fprintf(stdout, color, "    [%#08zx] Trailer %c%c%c bsdes:%d"
 346                      " dsdes:%d Overflow:%lld Time:%#llx\n"
 347                      "\t\tC:%d TOD:%#lx\n",
 348                      pos,
 349                      te->f ? 'F' : ' ',
 350                      te->a ? 'A' : ' ',
 351                      te->t ? 'T' : ' ',
 352                      te->bsdes, te->dsdes, te->overflow,
 353                      trailer_timestamp(te, te->clock_base),
 354                      te->clock_base, te->progusage2);
 355        return true;
 356}
 357
 358/* Test a sample data block. It must be 4KB or a multiple thereof in size and
 359 * 4KB page aligned. Each sample data page has a trailer entry at the
 360 * end which contains the sample entry data sizes.
 361 *
 362 * Return true if the sample data block passes the checks and set the
 363 * basic set entry size and diagnostic set entry size.
 364 *
 365 * Return false on failure.
 366 *
 367 * Note: Old hardware does not set the basic or diagnostic entry sizes
 368 * in the trailer entry. Use the type number instead.
 369 */
 370static bool s390_cpumsf_validate(int machine_type,
 371                                 unsigned char *buf, size_t len,
 372                                 unsigned short *bsdes,
 373                                 unsigned short *dsdes)
 374{
 375        struct hws_basic_entry *basic = (struct hws_basic_entry *)buf;
 376        struct hws_trailer_entry *te;
 377
 378        *dsdes = *bsdes = 0;
 379        if (len & (S390_CPUMSF_PAGESZ - 1))     /* Illegal size */
 380                return false;
 381        if (be16toh(basic->def) != 1)   /* No basic set entry, must be first */
 382                return false;
 383        /* Check for trailer entry at end of SDB */
 384        te = (struct hws_trailer_entry *)(buf + S390_CPUMSF_PAGESZ
 385                                              - sizeof(*te));
 386        *bsdes = be16toh(te->bsdes);
 387        *dsdes = be16toh(te->dsdes);
 388        if (!te->bsdes && !te->dsdes) {
 389                /* Very old hardware, use CPUID */
 390                switch (machine_type) {
 391                case 2097:
 392                case 2098:
 393                        *dsdes = 64;
 394                        *bsdes = 32;
 395                        break;
 396                case 2817:
 397                case 2818:
 398                        *dsdes = 74;
 399                        *bsdes = 32;
 400                        break;
 401                case 2827:
 402                case 2828:
 403                        *dsdes = 85;
 404                        *bsdes = 32;
 405                        break;
 406                case 2964:
 407                case 2965:
 408                        *dsdes = 112;
 409                        *bsdes = 32;
 410                        break;
 411                default:
 412                        /* Illegal trailer entry */
 413                        return false;
 414                }
 415        }
 416        return true;
 417}
 418
 419/* Return true if there is room for another entry */
 420static bool s390_cpumsf_reached_trailer(size_t entry_sz, size_t pos)
 421{
 422        size_t payload = S390_CPUMSF_PAGESZ - sizeof(struct hws_trailer_entry);
 423
 424        if (payload - (pos & (S390_CPUMSF_PAGESZ - 1)) < entry_sz)
 425                return false;
 426        return true;
 427}
 428
 429/* Dump an auxiliary buffer. These buffers are multiple of
 430 * 4KB SDB pages.
 431 */
 432static void s390_cpumsf_dump(struct s390_cpumsf *sf,
 433                             unsigned char *buf, size_t len)
 434{
 435        const char *color = PERF_COLOR_BLUE;
 436        struct hws_basic_entry *basic;
 437        struct hws_diag_entry *diag;
 438        unsigned short bsdes, dsdes;
 439        size_t pos = 0;
 440
 441        color_fprintf(stdout, color,
 442                      ". ... s390 AUX data: size %zu bytes\n",
 443                      len);
 444
 445        if (!s390_cpumsf_validate(sf->machine_type, buf, len, &bsdes,
 446                                  &dsdes)) {
 447                pr_err("Invalid AUX trace data block size:%zu"
 448                       " (type:%d bsdes:%hd dsdes:%hd)\n",
 449                       len, sf->machine_type, bsdes, dsdes);
 450                return;
 451        }
 452
 453        /* s390 kernel always returns 4KB blocks fully occupied,
 454         * no partially filled SDBs.
 455         */
 456        while (pos < len) {
 457                /* Handle Basic entry */
 458                basic = (struct hws_basic_entry *)(buf + pos);
 459                if (s390_cpumsf_basic_show(color, pos, basic))
 460                        pos += bsdes;
 461                else
 462                        return;
 463
 464                /* Handle Diagnostic entry */
 465                diag = (struct hws_diag_entry *)(buf + pos);
 466                if (s390_cpumsf_diag_show(color, pos, diag))
 467                        pos += dsdes;
 468                else
 469                        return;
 470
 471                /* Check for trailer entry */
 472                if (!s390_cpumsf_reached_trailer(bsdes + dsdes, pos)) {
 473                        /* Show trailer entry */
 474                        struct hws_trailer_entry te;
 475
 476                        pos = (pos + S390_CPUMSF_PAGESZ)
 477                               & ~(S390_CPUMSF_PAGESZ - 1);
 478                        pos -= sizeof(te);
 479                        memcpy(&te, buf + pos, sizeof(te));
 480                        /* Set descriptor sizes in case of old hardware
 481                         * where these values are not set.
 482                         */
 483                        te.bsdes = bsdes;
 484                        te.dsdes = dsdes;
 485                        if (s390_cpumsf_trailer_show(color, pos, &te))
 486                                pos += sizeof(te);
 487                        else
 488                                return;
 489                }
 490        }
 491}
 492
 493static void s390_cpumsf_dump_event(struct s390_cpumsf *sf, unsigned char *buf,
 494                                   size_t len)
 495{
 496        printf(".\n");
 497        s390_cpumsf_dump(sf, buf, len);
 498}
 499
 500#define S390_LPP_PID_MASK       0xffffffff
 501
 502static bool s390_cpumsf_make_event(size_t pos,
 503                                   struct hws_basic_entry *basic,
 504                                   struct s390_cpumsf_queue *sfq)
 505{
 506        struct perf_sample sample = {
 507                                .ip = basic->ia,
 508                                .pid = basic->hpp & S390_LPP_PID_MASK,
 509                                .tid = basic->hpp & S390_LPP_PID_MASK,
 510                                .cpumode = PERF_RECORD_MISC_CPUMODE_UNKNOWN,
 511                                .cpu = sfq->cpu,
 512                                .period = 1
 513                            };
 514        union perf_event event;
 515
 516        memset(&event, 0, sizeof(event));
 517        if (basic->CL == 1)     /* Native LPAR mode */
 518                sample.cpumode = basic->P ? PERF_RECORD_MISC_USER
 519                                          : PERF_RECORD_MISC_KERNEL;
 520        else if (basic->CL == 2)        /* Guest kernel/user space */
 521                sample.cpumode = basic->P ? PERF_RECORD_MISC_GUEST_USER
 522                                          : PERF_RECORD_MISC_GUEST_KERNEL;
 523        else if (basic->gpp || basic->prim_asn != 0xffff)
 524                /* Use heuristics on old hardware */
 525                sample.cpumode = basic->P ? PERF_RECORD_MISC_GUEST_USER
 526                                          : PERF_RECORD_MISC_GUEST_KERNEL;
 527        else
 528                sample.cpumode = basic->P ? PERF_RECORD_MISC_USER
 529                                          : PERF_RECORD_MISC_KERNEL;
 530
 531        event.sample.header.type = PERF_RECORD_SAMPLE;
 532        event.sample.header.misc = sample.cpumode;
 533        event.sample.header.size = sizeof(struct perf_event_header);
 534
 535        pr_debug4("%s pos:%#zx ip:%#" PRIx64 " P:%d CL:%d pid:%d.%d cpumode:%d cpu:%d\n",
 536                 __func__, pos, sample.ip, basic->P, basic->CL, sample.pid,
 537                 sample.tid, sample.cpumode, sample.cpu);
 538        if (perf_session__deliver_synth_event(sfq->sf->session, &event,
 539                                              &sample)) {
 540                pr_err("s390 Auxiliary Trace: failed to deliver event\n");
 541                return false;
 542        }
 543        return true;
 544}
 545
 546static unsigned long long get_trailer_time(const unsigned char *buf)
 547{
 548        struct hws_trailer_entry *te;
 549        unsigned long long aux_time, progusage2;
 550        bool clock_base;
 551
 552        te = (struct hws_trailer_entry *)(buf + S390_CPUMSF_PAGESZ
 553                                              - sizeof(*te));
 554
 555#if __BYTE_ORDER == __LITTLE_ENDIAN
 556        clock_base = be64toh(te->progusage[0]) >> 63 & 0x1;
 557        progusage2 = be64toh(te->progusage[1]);
 558#else
 559        clock_base = te->clock_base;
 560        progusage2 = te->progusage2;
 561#endif
 562        if (!clock_base)        /* TOD_CLOCK_BASE value missing */
 563                return 0;
 564
 565        /* Correct calculation to convert time stamp in trailer entry to
 566         * nano seconds (taken from arch/s390 function tod_to_ns()).
 567         * TOD_CLOCK_BASE is stored in trailer entry member progusage2.
 568         */
 569        aux_time = trailer_timestamp(te, clock_base) - progusage2;
 570        aux_time = (aux_time >> 9) * 125 + (((aux_time & 0x1ff) * 125) >> 9);
 571        return aux_time;
 572}
 573
 574/* Process the data samples of a single queue. The first parameter is a
 575 * pointer to the queue, the second parameter is the time stamp. This
 576 * is the time stamp:
 577 * - of the event that triggered this processing.
 578 * - or the time stamp when the last processing of this queue stopped.
 579 *   In this case it stopped at a 4KB page boundary and record the
 580 *   position on where to continue processing on the next invocation
 581 *   (see buffer->use_data and buffer->use_size).
 582 *
 583 * When this function returns the second parameter is updated to
 584 * reflect the time stamp of the last processed auxiliary data entry
 585 * (taken from the trailer entry of that page). The caller uses this
 586 * returned time stamp to record the last processed entry in this
 587 * queue.
 588 *
 589 * The function returns:
 590 * 0:  Processing successful. The second parameter returns the
 591 *     time stamp from the trailer entry until which position
 592 *     processing took place. Subsequent calls resume from this
 593 *     position.
 594 * <0: An error occurred during processing. The second parameter
 595 *     returns the maximum time stamp.
 596 * >0: Done on this queue. The second parameter returns the
 597 *     maximum time stamp.
 598 */
 599static int s390_cpumsf_samples(struct s390_cpumsf_queue *sfq, u64 *ts)
 600{
 601        struct s390_cpumsf *sf = sfq->sf;
 602        unsigned char *buf = sfq->buffer->use_data;
 603        size_t len = sfq->buffer->use_size;
 604        struct hws_basic_entry *basic;
 605        unsigned short bsdes, dsdes;
 606        size_t pos = 0;
 607        int err = 1;
 608        u64 aux_ts;
 609
 610        if (!s390_cpumsf_validate(sf->machine_type, buf, len, &bsdes,
 611                                  &dsdes)) {
 612                *ts = ~0ULL;
 613                return -1;
 614        }
 615
 616        /* Get trailer entry time stamp and check if entries in
 617         * this auxiliary page are ready for processing. If the
 618         * time stamp of the first entry is too high, whole buffer
 619         * can be skipped. In this case return time stamp.
 620         */
 621        aux_ts = get_trailer_time(buf);
 622        if (!aux_ts) {
 623                pr_err("[%#08" PRIx64 "] Invalid AUX trailer entry TOD clock base\n",
 624                       (s64)sfq->buffer->data_offset);
 625                aux_ts = ~0ULL;
 626                goto out;
 627        }
 628        if (aux_ts > *ts) {
 629                *ts = aux_ts;
 630                return 0;
 631        }
 632
 633        while (pos < len) {
 634                /* Handle Basic entry */
 635                basic = (struct hws_basic_entry *)(buf + pos);
 636                if (s390_cpumsf_make_event(pos, basic, sfq))
 637                        pos += bsdes;
 638                else {
 639                        err = -EBADF;
 640                        goto out;
 641                }
 642
 643                pos += dsdes;   /* Skip diagnostic entry */
 644
 645                /* Check for trailer entry */
 646                if (!s390_cpumsf_reached_trailer(bsdes + dsdes, pos)) {
 647                        pos = (pos + S390_CPUMSF_PAGESZ)
 648                               & ~(S390_CPUMSF_PAGESZ - 1);
 649                        /* Check existence of next page */
 650                        if (pos >= len)
 651                                break;
 652                        aux_ts = get_trailer_time(buf + pos);
 653                        if (!aux_ts) {
 654                                aux_ts = ~0ULL;
 655                                goto out;
 656                        }
 657                        if (aux_ts > *ts) {
 658                                *ts = aux_ts;
 659                                sfq->buffer->use_data += pos;
 660                                sfq->buffer->use_size -= pos;
 661                                return 0;
 662                        }
 663                }
 664        }
 665out:
 666        *ts = aux_ts;
 667        sfq->buffer->use_size = 0;
 668        sfq->buffer->use_data = NULL;
 669        return err;     /* Buffer completely scanned or error */
 670}
 671
 672/* Run the s390 auxiliary trace decoder.
 673 * Select the queue buffer to operate on, the caller already selected
 674 * the proper queue, depending on second parameter 'ts'.
 675 * This is the time stamp until which the auxiliary entries should
 676 * be processed. This value is updated by called functions and
 677 * returned to the caller.
 678 *
 679 * Resume processing in the current buffer. If there is no buffer
 680 * get a new buffer from the queue and setup start position for
 681 * processing.
 682 * When a buffer is completely processed remove it from the queue
 683 * before returning.
 684 *
 685 * This function returns
 686 * 1: When the queue is empty. Second parameter will be set to
 687 *    maximum time stamp.
 688 * 0: Normal processing done.
 689 * <0: Error during queue buffer setup. This causes the caller
 690 *     to stop processing completely.
 691 */
 692static int s390_cpumsf_run_decoder(struct s390_cpumsf_queue *sfq,
 693                                   u64 *ts)
 694{
 695
 696        struct auxtrace_buffer *buffer;
 697        struct auxtrace_queue *queue;
 698        int err;
 699
 700        queue = &sfq->sf->queues.queue_array[sfq->queue_nr];
 701
 702        /* Get buffer and last position in buffer to resume
 703         * decoding the auxiliary entries. One buffer might be large
 704         * and decoding might stop in between. This depends on the time
 705         * stamp of the trailer entry in each page of the auxiliary
 706         * data and the time stamp of the event triggering the decoding.
 707         */
 708        if (sfq->buffer == NULL) {
 709                sfq->buffer = buffer = auxtrace_buffer__next(queue,
 710                                                             sfq->buffer);
 711                if (!buffer) {
 712                        *ts = ~0ULL;
 713                        return 1;       /* Processing done on this queue */
 714                }
 715                /* Start with a new buffer on this queue */
 716                if (buffer->data) {
 717                        buffer->use_size = buffer->size;
 718                        buffer->use_data = buffer->data;
 719                }
 720                if (sfq->logfile) {     /* Write into log file */
 721                        size_t rc = fwrite(buffer->data, buffer->size, 1,
 722                                           sfq->logfile);
 723                        if (rc != 1)
 724                                pr_err("Failed to write auxiliary data\n");
 725                }
 726        } else
 727                buffer = sfq->buffer;
 728
 729        if (!buffer->data) {
 730                int fd = perf_data__fd(sfq->sf->session->data);
 731
 732                buffer->data = auxtrace_buffer__get_data(buffer, fd);
 733                if (!buffer->data)
 734                        return -ENOMEM;
 735                buffer->use_size = buffer->size;
 736                buffer->use_data = buffer->data;
 737
 738                if (sfq->logfile) {     /* Write into log file */
 739                        size_t rc = fwrite(buffer->data, buffer->size, 1,
 740                                           sfq->logfile);
 741                        if (rc != 1)
 742                                pr_err("Failed to write auxiliary data\n");
 743                }
 744        }
 745        pr_debug4("%s queue_nr:%d buffer:%" PRId64 " offset:%#" PRIx64 " size:%#zx rest:%#zx\n",
 746                  __func__, sfq->queue_nr, buffer->buffer_nr, buffer->offset,
 747                  buffer->size, buffer->use_size);
 748        err = s390_cpumsf_samples(sfq, ts);
 749
 750        /* If non-zero, there is either an error (err < 0) or the buffer is
 751         * completely done (err > 0). The error is unrecoverable, usually
 752         * some descriptors could not be read successfully, so continue with
 753         * the next buffer.
 754         * In both cases the parameter 'ts' has been updated.
 755         */
 756        if (err) {
 757                sfq->buffer = NULL;
 758                list_del_init(&buffer->list);
 759                auxtrace_buffer__free(buffer);
 760                if (err > 0)            /* Buffer done, no error */
 761                        err = 0;
 762        }
 763        return err;
 764}
 765
 766static struct s390_cpumsf_queue *
 767s390_cpumsf_alloc_queue(struct s390_cpumsf *sf, unsigned int queue_nr)
 768{
 769        struct s390_cpumsf_queue *sfq;
 770
 771        sfq = zalloc(sizeof(struct s390_cpumsf_queue));
 772        if (sfq == NULL)
 773                return NULL;
 774
 775        sfq->sf = sf;
 776        sfq->queue_nr = queue_nr;
 777        sfq->cpu = -1;
 778        if (sf->use_logfile) {
 779                char *name;
 780                int rc;
 781
 782                rc = (sf->logdir)
 783                        ? asprintf(&name, "%s/aux.smp.%02x",
 784                                 sf->logdir, queue_nr)
 785                        : asprintf(&name, "aux.smp.%02x", queue_nr);
 786                if (rc > 0)
 787                        sfq->logfile = fopen(name, "w");
 788                if (sfq->logfile == NULL) {
 789                        pr_err("Failed to open auxiliary log file %s,"
 790                               "continue...\n", name);
 791                        sf->use_logfile = false;
 792                }
 793                free(name);
 794        }
 795        return sfq;
 796}
 797
 798static int s390_cpumsf_setup_queue(struct s390_cpumsf *sf,
 799                                   struct auxtrace_queue *queue,
 800                                   unsigned int queue_nr, u64 ts)
 801{
 802        struct s390_cpumsf_queue *sfq = queue->priv;
 803
 804        if (list_empty(&queue->head))
 805                return 0;
 806
 807        if (sfq == NULL) {
 808                sfq = s390_cpumsf_alloc_queue(sf, queue_nr);
 809                if (!sfq)
 810                        return -ENOMEM;
 811                queue->priv = sfq;
 812
 813                if (queue->cpu != -1)
 814                        sfq->cpu = queue->cpu;
 815        }
 816        return auxtrace_heap__add(&sf->heap, queue_nr, ts);
 817}
 818
 819static int s390_cpumsf_setup_queues(struct s390_cpumsf *sf, u64 ts)
 820{
 821        unsigned int i;
 822        int ret = 0;
 823
 824        for (i = 0; i < sf->queues.nr_queues; i++) {
 825                ret = s390_cpumsf_setup_queue(sf, &sf->queues.queue_array[i],
 826                                              i, ts);
 827                if (ret)
 828                        break;
 829        }
 830        return ret;
 831}
 832
 833static int s390_cpumsf_update_queues(struct s390_cpumsf *sf, u64 ts)
 834{
 835        if (!sf->queues.new_data)
 836                return 0;
 837
 838        sf->queues.new_data = false;
 839        return s390_cpumsf_setup_queues(sf, ts);
 840}
 841
 842static int s390_cpumsf_process_queues(struct s390_cpumsf *sf, u64 timestamp)
 843{
 844        unsigned int queue_nr;
 845        u64 ts;
 846        int ret;
 847
 848        while (1) {
 849                struct auxtrace_queue *queue;
 850                struct s390_cpumsf_queue *sfq;
 851
 852                if (!sf->heap.heap_cnt)
 853                        return 0;
 854
 855                if (sf->heap.heap_array[0].ordinal >= timestamp)
 856                        return 0;
 857
 858                queue_nr = sf->heap.heap_array[0].queue_nr;
 859                queue = &sf->queues.queue_array[queue_nr];
 860                sfq = queue->priv;
 861
 862                auxtrace_heap__pop(&sf->heap);
 863                if (sf->heap.heap_cnt) {
 864                        ts = sf->heap.heap_array[0].ordinal + 1;
 865                        if (ts > timestamp)
 866                                ts = timestamp;
 867                } else {
 868                        ts = timestamp;
 869                }
 870
 871                ret = s390_cpumsf_run_decoder(sfq, &ts);
 872                if (ret < 0) {
 873                        auxtrace_heap__add(&sf->heap, queue_nr, ts);
 874                        return ret;
 875                }
 876                if (!ret) {
 877                        ret = auxtrace_heap__add(&sf->heap, queue_nr, ts);
 878                        if (ret < 0)
 879                                return ret;
 880                }
 881        }
 882        return 0;
 883}
 884
 885static int s390_cpumsf_synth_error(struct s390_cpumsf *sf, int code, int cpu,
 886                                   pid_t pid, pid_t tid, u64 ip, u64 timestamp)
 887{
 888        char msg[MAX_AUXTRACE_ERROR_MSG];
 889        union perf_event event;
 890        int err;
 891
 892        strncpy(msg, "Lost Auxiliary Trace Buffer", sizeof(msg) - 1);
 893        auxtrace_synth_error(&event.auxtrace_error, PERF_AUXTRACE_ERROR_ITRACE,
 894                             code, cpu, pid, tid, ip, msg, timestamp);
 895
 896        err = perf_session__deliver_synth_event(sf->session, &event, NULL);
 897        if (err)
 898                pr_err("s390 Auxiliary Trace: failed to deliver error event,"
 899                        "error %d\n", err);
 900        return err;
 901}
 902
 903static int s390_cpumsf_lost(struct s390_cpumsf *sf, struct perf_sample *sample)
 904{
 905        return s390_cpumsf_synth_error(sf, 1, sample->cpu,
 906                                       sample->pid, sample->tid, 0,
 907                                       sample->time);
 908}
 909
 910static int
 911s390_cpumsf_process_event(struct perf_session *session,
 912                          union perf_event *event,
 913                          struct perf_sample *sample,
 914                          struct perf_tool *tool)
 915{
 916        struct s390_cpumsf *sf = container_of(session->auxtrace,
 917                                              struct s390_cpumsf,
 918                                              auxtrace);
 919        u64 timestamp = sample->time;
 920        struct evsel *ev_bc000;
 921
 922        int err = 0;
 923
 924        if (dump_trace)
 925                return 0;
 926
 927        if (!tool->ordered_events) {
 928                pr_err("s390 Auxiliary Trace requires ordered events\n");
 929                return -EINVAL;
 930        }
 931
 932        if (event->header.type == PERF_RECORD_SAMPLE &&
 933            sample->raw_size) {
 934                /* Handle event with raw data */
 935                ev_bc000 = evlist__event2evsel(session->evlist, event);
 936                if (ev_bc000 &&
 937                    ev_bc000->core.attr.config == PERF_EVENT_CPUM_CF_DIAG)
 938                        err = s390_cpumcf_dumpctr(sf, sample);
 939                return err;
 940        }
 941
 942        if (event->header.type == PERF_RECORD_AUX &&
 943            event->aux.flags & PERF_AUX_FLAG_TRUNCATED)
 944                return s390_cpumsf_lost(sf, sample);
 945
 946        if (timestamp) {
 947                err = s390_cpumsf_update_queues(sf, timestamp);
 948                if (!err)
 949                        err = s390_cpumsf_process_queues(sf, timestamp);
 950        }
 951        return err;
 952}
 953
 954struct s390_cpumsf_synth {
 955        struct perf_tool cpumsf_tool;
 956        struct perf_session *session;
 957};
 958
 959static int
 960s390_cpumsf_process_auxtrace_event(struct perf_session *session,
 961                                   union perf_event *event __maybe_unused,
 962                                   struct perf_tool *tool __maybe_unused)
 963{
 964        struct s390_cpumsf *sf = container_of(session->auxtrace,
 965                                              struct s390_cpumsf,
 966                                              auxtrace);
 967
 968        int fd = perf_data__fd(session->data);
 969        struct auxtrace_buffer *buffer;
 970        off_t data_offset;
 971        int err;
 972
 973        if (sf->data_queued)
 974                return 0;
 975
 976        if (perf_data__is_pipe(session->data)) {
 977                data_offset = 0;
 978        } else {
 979                data_offset = lseek(fd, 0, SEEK_CUR);
 980                if (data_offset == -1)
 981                        return -errno;
 982        }
 983
 984        err = auxtrace_queues__add_event(&sf->queues, session, event,
 985                                         data_offset, &buffer);
 986        if (err)
 987                return err;
 988
 989        /* Dump here after copying piped trace out of the pipe */
 990        if (dump_trace) {
 991                if (auxtrace_buffer__get_data(buffer, fd)) {
 992                        s390_cpumsf_dump_event(sf, buffer->data,
 993                                               buffer->size);
 994                        auxtrace_buffer__put_data(buffer);
 995                }
 996        }
 997        return 0;
 998}
 999
1000static void s390_cpumsf_free_events(struct perf_session *session __maybe_unused)
1001{
1002}
1003
1004static int s390_cpumsf_flush(struct perf_session *session __maybe_unused,
1005                             struct perf_tool *tool __maybe_unused)
1006{
1007        return 0;
1008}
1009
1010static void s390_cpumsf_free_queues(struct perf_session *session)
1011{
1012        struct s390_cpumsf *sf = container_of(session->auxtrace,
1013                                              struct s390_cpumsf,
1014                                              auxtrace);
1015        struct auxtrace_queues *queues = &sf->queues;
1016        unsigned int i;
1017
1018        for (i = 0; i < queues->nr_queues; i++) {
1019                struct s390_cpumsf_queue *sfq = (struct s390_cpumsf_queue *)
1020                                                queues->queue_array[i].priv;
1021
1022                if (sfq != NULL) {
1023                        if (sfq->logfile) {
1024                                fclose(sfq->logfile);
1025                                sfq->logfile = NULL;
1026                        }
1027                        if (sfq->logfile_ctr) {
1028                                fclose(sfq->logfile_ctr);
1029                                sfq->logfile_ctr = NULL;
1030                        }
1031                }
1032                zfree(&queues->queue_array[i].priv);
1033        }
1034        auxtrace_queues__free(queues);
1035}
1036
1037static void s390_cpumsf_free(struct perf_session *session)
1038{
1039        struct s390_cpumsf *sf = container_of(session->auxtrace,
1040                                              struct s390_cpumsf,
1041                                              auxtrace);
1042
1043        auxtrace_heap__free(&sf->heap);
1044        s390_cpumsf_free_queues(session);
1045        session->auxtrace = NULL;
1046        zfree(&sf->logdir);
1047        free(sf);
1048}
1049
1050static bool
1051s390_cpumsf_evsel_is_auxtrace(struct perf_session *session __maybe_unused,
1052                              struct evsel *evsel)
1053{
1054        return evsel->core.attr.type == PERF_TYPE_RAW &&
1055               evsel->core.attr.config == PERF_EVENT_CPUM_SF_DIAG;
1056}
1057
/* Extract the machine type from the CPU identification string @cpuid.
 *
 * The machine type is the unsigned decimal number which follows the
 * first comma of @cpuid. At least one character has to precede that
 * comma.
 *
 * Returns the number found, or 0 when @cpuid is NULL or does not
 * contain such a number.
 */
static int s390_cpumsf_get_type(const char *cpuid)
{
	unsigned int family = 0;	/* Type matching the %u conversion */

	if (!cpuid)
		return 0;

	if (sscanf(cpuid, "%*[^,],%u", &family) != 1)
		return 0;
	return family;
}
1065
1066/* Check itrace options set on perf report command.
1067 * Return true, if none are set or all options specified can be
1068 * handled on s390 (currently only option 'd' for logging.
1069 * Return false otherwise.
1070 */
1071static bool check_auxtrace_itrace(struct itrace_synth_opts *itops)
1072{
1073        bool ison = false;
1074
1075        if (!itops || !itops->set)
1076                return true;
1077        ison = itops->inject || itops->instructions || itops->branches ||
1078                itops->transactions || itops->ptwrites ||
1079                itops->pwr_events || itops->errors ||
1080                itops->dont_decode || itops->calls || itops->returns ||
1081                itops->callchain || itops->thread_stack ||
1082                itops->last_branch || itops->add_callchain ||
1083                itops->add_last_branch;
1084        if (!ison)
1085                return true;
1086        pr_err("Unsupported --itrace options specified\n");
1087        return false;
1088}
1089
1090/* Check for AUXTRACE dump directory if it is needed.
1091 * On failure print an error message but continue.
1092 * Return 0 on wrong keyword in config file and 1 otherwise.
1093 */
1094static int s390_cpumsf__config(const char *var, const char *value, void *cb)
1095{
1096        struct s390_cpumsf *sf = cb;
1097        struct stat stbuf;
1098        int rc;
1099
1100        if (strcmp(var, "auxtrace.dumpdir"))
1101                return 0;
1102        sf->logdir = strdup(value);
1103        if (sf->logdir == NULL) {
1104                pr_err("Failed to find auxtrace log directory %s,"
1105                       " continue with current directory...\n", value);
1106                return 1;
1107        }
1108        rc = stat(sf->logdir, &stbuf);
1109        if (rc == -1 || !S_ISDIR(stbuf.st_mode)) {
1110                pr_err("Missing auxtrace log directory %s,"
1111                       " continue with current directory...\n", value);
1112                zfree(&sf->logdir);
1113        }
1114        return 1;
1115}
1116
1117int s390_cpumsf_process_auxtrace_info(union perf_event *event,
1118                                      struct perf_session *session)
1119{
1120        struct perf_record_auxtrace_info *auxtrace_info = &event->auxtrace_info;
1121        struct s390_cpumsf *sf;
1122        int err;
1123
1124        if (auxtrace_info->header.size < sizeof(struct perf_record_auxtrace_info))
1125                return -EINVAL;
1126
1127        sf = zalloc(sizeof(struct s390_cpumsf));
1128        if (sf == NULL)
1129                return -ENOMEM;
1130
1131        if (!check_auxtrace_itrace(session->itrace_synth_opts)) {
1132                err = -EINVAL;
1133                goto err_free;
1134        }
1135        sf->use_logfile = session->itrace_synth_opts->log;
1136        if (sf->use_logfile)
1137                perf_config(s390_cpumsf__config, sf);
1138
1139        err = auxtrace_queues__init(&sf->queues);
1140        if (err)
1141                goto err_free;
1142
1143        sf->session = session;
1144        sf->machine = &session->machines.host; /* No kvm support */
1145        sf->auxtrace_type = auxtrace_info->type;
1146        sf->pmu_type = PERF_TYPE_RAW;
1147        sf->machine_type = s390_cpumsf_get_type(session->evlist->env->cpuid);
1148
1149        sf->auxtrace.process_event = s390_cpumsf_process_event;
1150        sf->auxtrace.process_auxtrace_event = s390_cpumsf_process_auxtrace_event;
1151        sf->auxtrace.flush_events = s390_cpumsf_flush;
1152        sf->auxtrace.free_events = s390_cpumsf_free_events;
1153        sf->auxtrace.free = s390_cpumsf_free;
1154        sf->auxtrace.evsel_is_auxtrace = s390_cpumsf_evsel_is_auxtrace;
1155        session->auxtrace = &sf->auxtrace;
1156
1157        if (dump_trace)
1158                return 0;
1159
1160        err = auxtrace_queues__process_index(&sf->queues, session);
1161        if (err)
1162                goto err_free_queues;
1163
1164        if (sf->queues.populated)
1165                sf->data_queued = true;
1166
1167        return 0;
1168
1169err_free_queues:
1170        auxtrace_queues__free(&sf->queues);
1171        session->auxtrace = NULL;
1172err_free:
1173        zfree(&sf->logdir);
1174        free(sf);
1175        return err;
1176}
1177