qemu/util/qsp.c
<<
>>
Prefs
   1/*
   2 * qsp.c - QEMU Synchronization Profiler
   3 *
   4 * Copyright (C) 2018, Emilio G. Cota <cota@braap.org>
   5 *
   6 * License: GNU GPL, version 2 or later.
   7 *   See the COPYING file in the top-level directory.
   8 *
   9 * QSP profiles the time spent in synchronization primitives, which can
  10 * help diagnose performance problems, e.g. scalability issues when
  11 * contention is high.
  12 *
  13 * The primitives currently supported are mutexes, recursive mutexes and
  14 * condition variables. Note that not all related functions are intercepted;
  15 * instead we profile only those functions that can have a performance impact,
  16 * either due to blocking (e.g. cond_wait, mutex_lock) or cache line
  17 * contention (e.g. mutex_lock, mutex_trylock).
  18 *
  19 * QSP's design focuses on speed and scalability. This is achieved
  20 * by having threads do their profiling entirely on thread-local data.
  21 * The appropriate thread-local data is found via a QHT, i.e. a concurrent hash
  22 * table. To aggregate data in order to generate a report, we iterate over
  23 * all entries in the hash table. Depending on the number of threads and
  24 * synchronization objects this might be expensive, but note that it is
  25 * very rarely called -- reports are generated only when requested by users.
  26 *
  27 * Reports are generated as a table where each row represents a call site. A
  28 * call site is the triplet formed by the __file__ and __LINE__ of the caller
  29 * as well as the address of the "object" (i.e. mutex, rec. mutex or condvar)
  30 * being operated on. Optionally, call sites that operate on different objects
  31 * of the same type can be coalesced, which can be particularly useful when
  32 * profiling dynamically-allocated objects.
  33 *
  34 * Alternative designs considered:
  35 *
  36 * - Use an off-the-shelf profiler such as mutrace. This is not a viable option
  37 *   for us because QEMU has __malloc_hook set (by one of the libraries it
  38 *   uses); leaving this hook unset is required to avoid deadlock in mutrace.
  39 *
  40 * - Use a glib HT for each thread, protecting each HT with its own lock.
  41 *   This isn't simpler than the current design, and is 10% slower in the
  42 *   atomic_add-bench microbenchmark (-m option).
  43 *
  44 * - For reports, just use a binary tree as we aggregate data, instead of having
  45 *   an intermediate hash table. This would simplify the code only slightly, but
  46 *   would perform badly if there were many threads and objects to track.
  47 *
  48 * - Wrap operations on qsp entries with RCU read-side critical sections, so
  49 *   that qsp_reset() can delete entries. Unfortunately, the overhead of calling
  50 *   rcu_read_lock/unlock slows down atomic_add-bench -m by 24%. Having
  51 *   a snapshot that is updated on qsp_reset() avoids this overhead.
  52 *
  53 * Related Work:
  54 * - Lennart Poettering's mutrace: http://0pointer.de/blog/projects/mutrace.html
  55 * - Lozi, David, Thomas, Lawall and Muller. "Remote Core Locking: Migrating
  56 *   Critical-Section Execution to Improve the Performance of Multithreaded
  57 *   Applications", USENIX ATC'12.
  58 */
  59#include "qemu/osdep.h"
  60#include "qemu/thread.h"
  61#include "qemu/timer.h"
  62#include "qemu/qht.h"
  63#include "qemu/rcu.h"
  64#include "qemu/xxhash.h"
  65
/* Kinds of synchronization primitives profiled by QSP. */
enum QSPType {
    QSP_MUTEX,
    QSP_BQL_MUTEX,
    QSP_REC_MUTEX,
    QSP_CONDVAR,
};
  72
/*
 * A profiled call site: the (file, line) of the caller plus the object
 * (mutex, rec. mutex or condvar) being operated on.  Interned in
 * qsp_callsite_ht so all threads share a single instance per site.
 */
struct QSPCallSite {
    const void *obj;
    const char *file; /* i.e. __FILE__; shortened later */
    int line;
    enum QSPType type;
};
typedef struct QSPCallSite QSPCallSite;
  80
/* Per-thread profiling counters for one call site. */
struct QSPEntry {
    void *thread_ptr;    /* address of the owning thread's qsp_thread TLS var */
    const QSPCallSite *callsite;
    uint64_t n_acqs;     /* number of successful acquisitions */
    uint64_t ns;         /* total nanoseconds spent in the profiled call */
    unsigned int n_objs; /* count of coalesced objs; only used for reporting */
};
typedef struct QSPEntry QSPEntry;
  89
/* Snapshot of aggregated stats taken at qsp_reset(); reclaimed via RCU. */
struct QSPSnapshot {
    struct rcu_head rcu;
    struct qht ht;
};
typedef struct QSPSnapshot QSPSnapshot;
  95
/* initial sizing for hash tables */
#define QSP_INITIAL_SIZE 64

/* If this file is moved, QSP_REL_PATH should be updated accordingly */
#define QSP_REL_PATH "util/qsp.c"

/* this file's full path. Used to present all call sites with relative paths */
static size_t qsp_qemu_path_len;

/* the address of qsp_thread gives us a unique 'thread ID' */
static __thread int qsp_thread;

/*
 * Call sites are the same for all threads, so we track them in a separate hash
 * table to save memory.
 */
static struct qht qsp_callsite_ht;

/* global table of QSPEntry's, keyed by (thread, call site) */
static struct qht qsp_ht;
/* latest snapshot published by qsp_reset(); read under RCU */
static QSPSnapshot *qsp_snapshot;
static bool qsp_initialized, qsp_initializing;

/* printable names for enum QSPType, indexed by type */
static const char * const qsp_typenames[] = {
    [QSP_MUTEX]     = "mutex",
    [QSP_BQL_MUTEX] = "BQL mutex",
    [QSP_REC_MUTEX] = "rec_mutex",
    [QSP_CONDVAR]   = "condvar",
};

/*
 * Hook pointers: qsp_enable() points these at the qsp_* wrappers below,
 * and qsp_disable() restores the raw *_impl functions.
 */
QemuMutexLockFunc qemu_bql_mutex_lock_func = qemu_mutex_lock_impl;
QemuMutexLockFunc qemu_mutex_lock_func = qemu_mutex_lock_impl;
QemuMutexTrylockFunc qemu_mutex_trylock_func = qemu_mutex_trylock_impl;
QemuRecMutexLockFunc qemu_rec_mutex_lock_func = qemu_rec_mutex_lock_impl;
QemuRecMutexTrylockFunc qemu_rec_mutex_trylock_func =
    qemu_rec_mutex_trylock_impl;
QemuCondWaitFunc qemu_cond_wait_func = qemu_cond_wait_impl;
 132
 133/*
 134 * It pays off to _not_ hash callsite->file; hashing a string is slow, and
 135 * without it we still get a pretty unique hash.
 136 */
 137static inline
 138uint32_t do_qsp_callsite_hash(const QSPCallSite *callsite, uint64_t ab)
 139{
 140    uint64_t cd = (uint64_t)(uintptr_t)callsite->obj;
 141    uint32_t e = callsite->line;
 142    uint32_t f = callsite->type;
 143
 144    return qemu_xxhash6(ab, cd, e, f);
 145}
 146
 147static inline
 148uint32_t qsp_callsite_hash(const QSPCallSite *callsite)
 149{
 150    return do_qsp_callsite_hash(callsite, 0);
 151}
 152
 153static inline uint32_t do_qsp_entry_hash(const QSPEntry *entry, uint64_t a)
 154{
 155    return do_qsp_callsite_hash(entry->callsite, a);
 156}
 157
 158static uint32_t qsp_entry_hash(const QSPEntry *entry)
 159{
 160    return do_qsp_entry_hash(entry, (uint64_t)(uintptr_t)entry->thread_ptr);
 161}
 162
 163static uint32_t qsp_entry_no_thread_hash(const QSPEntry *entry)
 164{
 165    return do_qsp_entry_hash(entry, 0);
 166}
 167
 168/* without the objects we need to hash the file name to get a decent hash */
 169static uint32_t qsp_entry_no_thread_obj_hash(const QSPEntry *entry)
 170{
 171    const QSPCallSite *callsite = entry->callsite;
 172    uint64_t ab = g_str_hash(callsite->file);
 173    uint64_t cd = callsite->line;
 174    uint32_t e = callsite->type;
 175
 176    return qemu_xxhash5(ab, cd, e);
 177}
 178
 179static bool qsp_callsite_cmp(const void *ap, const void *bp)
 180{
 181    const QSPCallSite *a = ap;
 182    const QSPCallSite *b = bp;
 183
 184    return a == b ||
 185        (a->obj == b->obj &&
 186         a->line == b->line &&
 187         a->type == b->type &&
 188         (a->file == b->file || !strcmp(a->file, b->file)));
 189}
 190
 191static bool qsp_callsite_no_obj_cmp(const void *ap, const void *bp)
 192{
 193    const QSPCallSite *a = ap;
 194    const QSPCallSite *b = bp;
 195
 196    return a == b ||
 197        (a->line == b->line &&
 198         a->type == b->type &&
 199         (a->file == b->file || !strcmp(a->file, b->file)));
 200}
 201
 202static bool qsp_entry_no_thread_cmp(const void *ap, const void *bp)
 203{
 204    const QSPEntry *a = ap;
 205    const QSPEntry *b = bp;
 206
 207    return qsp_callsite_cmp(a->callsite, b->callsite);
 208}
 209
 210static bool qsp_entry_no_thread_obj_cmp(const void *ap, const void *bp)
 211{
 212    const QSPEntry *a = ap;
 213    const QSPEntry *b = bp;
 214
 215    return qsp_callsite_no_obj_cmp(a->callsite, b->callsite);
 216}
 217
 218static bool qsp_entry_cmp(const void *ap, const void *bp)
 219{
 220    const QSPEntry *a = ap;
 221    const QSPEntry *b = bp;
 222
 223    return a->thread_ptr == b->thread_ptr &&
 224        qsp_callsite_cmp(a->callsite, b->callsite);
 225}
 226
/*
 * Normally we'd call this from a constructor function, but we want it to work
 * via libutil as well.
 */
static void qsp_do_init(void)
{
    /* make sure this file's path in the tree is up to date with QSP_REL_PATH */
    g_assert(strstr(__FILE__, QSP_REL_PATH));
    /* length of the tree's path prefix, stripped from call sites in reports */
    qsp_qemu_path_len = strlen(__FILE__) - strlen(QSP_REL_PATH);

    /*
     * NOTE(review): RAW_MUTEXES presumably keeps QHT's internal locking out
     * of the profiled hooks — confirm against qht documentation.
     */
    qht_init(&qsp_ht, qsp_entry_cmp, QSP_INITIAL_SIZE,
             QHT_MODE_AUTO_RESIZE | QHT_MODE_RAW_MUTEXES);
    qht_init(&qsp_callsite_ht, qsp_callsite_cmp, QSP_INITIAL_SIZE,
             QHT_MODE_AUTO_RESIZE | QHT_MODE_RAW_MUTEXES);
}
 242
static __attribute__((noinline)) void qsp_init__slowpath(void)
{
    /* only the thread that wins the cmpxchg performs the initialization */
    if (atomic_cmpxchg(&qsp_initializing, false, true) == false) {
        qsp_do_init();
        atomic_set(&qsp_initialized, true);
    } else {
        /* losers spin until the winner publishes qsp_initialized */
        while (!atomic_read(&qsp_initialized)) {
            cpu_relax();
        }
    }
}
 254
 255/* qsp_init() must be called from _all_ exported functions */
 256static inline void qsp_init(void)
 257{
 258    if (likely(atomic_read(&qsp_initialized))) {
 259        return;
 260    }
 261    qsp_init__slowpath();
 262}
 263
 264static QSPCallSite *qsp_callsite_find(const QSPCallSite *orig)
 265{
 266    QSPCallSite *callsite;
 267    uint32_t hash;
 268
 269    hash = qsp_callsite_hash(orig);
 270    callsite = qht_lookup(&qsp_callsite_ht, orig, hash);
 271    if (callsite == NULL) {
 272        void *existing = NULL;
 273
 274        callsite = g_new(QSPCallSite, 1);
 275        memcpy(callsite, orig, sizeof(*callsite));
 276        qht_insert(&qsp_callsite_ht, callsite, hash, &existing);
 277        if (unlikely(existing)) {
 278            g_free(callsite);
 279            callsite = existing;
 280        }
 281    }
 282    return callsite;
 283}
 284
 285static QSPEntry *
 286qsp_entry_create(struct qht *ht, const QSPEntry *entry, uint32_t hash)
 287{
 288    QSPEntry *e;
 289    void *existing = NULL;
 290
 291    e = g_new0(QSPEntry, 1);
 292    e->thread_ptr = entry->thread_ptr;
 293    e->callsite = qsp_callsite_find(entry->callsite);
 294
 295    qht_insert(ht, e, hash, &existing);
 296    if (unlikely(existing)) {
 297        g_free(e);
 298        e = existing;
 299    }
 300    return e;
 301}
 302
 303static QSPEntry *
 304qsp_entry_find(struct qht *ht, const QSPEntry *entry, uint32_t hash)
 305{
 306    QSPEntry *e;
 307
 308    e = qht_lookup(ht, entry, hash);
 309    if (e == NULL) {
 310        e = qsp_entry_create(ht, entry, hash);
 311    }
 312    return e;
 313}
 314
 315/*
 316 * Note: Entries are never removed, so callers do not have to be in an RCU
 317 * read-side critical section.
 318 */
 319static QSPEntry *qsp_entry_get(const void *obj, const char *file, int line,
 320                               enum QSPType type)
 321{
 322    QSPCallSite callsite = {
 323        .obj = obj,
 324        .file = file,
 325        .line = line,
 326        .type = type,
 327    };
 328    QSPEntry orig;
 329    uint32_t hash;
 330
 331    qsp_init();
 332
 333    orig.thread_ptr = &qsp_thread;
 334    orig.callsite = &callsite;
 335
 336    hash = qsp_entry_hash(&orig);
 337    return qsp_entry_find(&qsp_ht, &orig, hash);
 338}
 339
/*
 * @e is in the global hash table; it is only written to by the current thread,
 * so we write to it atomically (as in "write once") to prevent torn reads.
 */
static inline void do_qsp_entry_record(QSPEntry *e, int64_t delta, bool acq)
{
    /* plain reads of e->ns/e->n_acqs are fine: only this thread writes them */
    atomic_set_u64(&e->ns, e->ns + delta);
    if (acq) {
        atomic_set_u64(&e->n_acqs, e->n_acqs + 1);
    }
}
 351
 352static inline void qsp_entry_record(QSPEntry *e, int64_t delta)
 353{
 354    do_qsp_entry_record(e, delta, true);
 355}
 356
/*
 * Generate a profiled wrapper for a void-returning lock function: time the
 * call to impl_ and record the elapsed time under the caller's call site.
 */
#define QSP_GEN_VOID(type_, qsp_t_, func_, impl_)                       \
    static void func_(type_ *obj, const char *file, int line)           \
    {                                                                   \
        QSPEntry *e;                                                    \
        int64_t t0, t1;                                                 \
                                                                        \
        t0 = get_clock();                                               \
        impl_(obj, file, line);                                         \
        t1 = get_clock();                                               \
                                                                        \
        e = qsp_entry_get(obj, file, line, qsp_t_);                     \
        qsp_entry_record(e, t1 - t0);                                   \
    }

/*
 * Like QSP_GEN_VOID, but for trylock-style functions that return an int
 * error code: count an acquisition only when the call succeeded (!err).
 */
#define QSP_GEN_RET1(type_, qsp_t_, func_, impl_)                       \
    static int func_(type_ *obj, const char *file, int line)            \
    {                                                                   \
        QSPEntry *e;                                                    \
        int64_t t0, t1;                                                 \
        int err;                                                        \
                                                                        \
        t0 = get_clock();                                               \
        err = impl_(obj, file, line);                                   \
        t1 = get_clock();                                               \
                                                                        \
        e = qsp_entry_get(obj, file, line, qsp_t_);                     \
        do_qsp_entry_record(e, t1 - t0, !err);                          \
        return err;                                                     \
    }

QSP_GEN_VOID(QemuMutex, QSP_BQL_MUTEX, qsp_bql_mutex_lock, qemu_mutex_lock_impl)
QSP_GEN_VOID(QemuMutex, QSP_MUTEX, qsp_mutex_lock, qemu_mutex_lock_impl)
QSP_GEN_RET1(QemuMutex, QSP_MUTEX, qsp_mutex_trylock, qemu_mutex_trylock_impl)

QSP_GEN_VOID(QemuRecMutex, QSP_REC_MUTEX, qsp_rec_mutex_lock,
             qemu_rec_mutex_lock_impl)
QSP_GEN_RET1(QemuRecMutex, QSP_REC_MUTEX, qsp_rec_mutex_trylock,
             qemu_rec_mutex_trylock_impl)

#undef QSP_GEN_RET1
#undef QSP_GEN_VOID
 398
 399static void
 400qsp_cond_wait(QemuCond *cond, QemuMutex *mutex, const char *file, int line)
 401{
 402    QSPEntry *e;
 403    int64_t t0, t1;
 404
 405    t0 = get_clock();
 406    qemu_cond_wait_impl(cond, mutex, file, line);
 407    t1 = get_clock();
 408
 409    e = qsp_entry_get(cond, file, line, QSP_CONDVAR);
 410    qsp_entry_record(e, t1 - t0);
 411}
 412
 413bool qsp_is_enabled(void)
 414{
 415    return atomic_read(&qemu_mutex_lock_func) == qsp_mutex_lock;
 416}
 417
/* Install the profiled wrappers; takes effect for subsequent lock calls. */
void qsp_enable(void)
{
    atomic_set(&qemu_mutex_lock_func, qsp_mutex_lock);
    atomic_set(&qemu_mutex_trylock_func, qsp_mutex_trylock);
    atomic_set(&qemu_bql_mutex_lock_func, qsp_bql_mutex_lock);
    atomic_set(&qemu_rec_mutex_lock_func, qsp_rec_mutex_lock);
    atomic_set(&qemu_rec_mutex_trylock_func, qsp_rec_mutex_trylock);
    atomic_set(&qemu_cond_wait_func, qsp_cond_wait);
}
 427
/* Restore the raw implementations, disabling profiling of new calls. */
void qsp_disable(void)
{
    atomic_set(&qemu_mutex_lock_func, qemu_mutex_lock_impl);
    atomic_set(&qemu_mutex_trylock_func, qemu_mutex_trylock_impl);
    atomic_set(&qemu_bql_mutex_lock_func, qemu_mutex_lock_impl);
    atomic_set(&qemu_rec_mutex_lock_func, qemu_rec_mutex_lock_impl);
    atomic_set(&qemu_rec_mutex_trylock_func, qemu_rec_mutex_trylock_impl);
    atomic_set(&qemu_cond_wait_func, qemu_cond_wait_impl);
}
 437
 438static gint qsp_tree_cmp(gconstpointer ap, gconstpointer bp, gpointer up)
 439{
 440    const QSPEntry *a = ap;
 441    const QSPEntry *b = bp;
 442    enum QSPSortBy sort_by = *(enum QSPSortBy *)up;
 443    const QSPCallSite *ca;
 444    const QSPCallSite *cb;
 445
 446    switch (sort_by) {
 447    case QSP_SORT_BY_TOTAL_WAIT_TIME:
 448        if (a->ns > b->ns) {
 449            return -1;
 450        } else if (a->ns < b->ns) {
 451            return 1;
 452        }
 453        break;
 454    case QSP_SORT_BY_AVG_WAIT_TIME:
 455    {
 456        double avg_a = a->n_acqs ? a->ns / a->n_acqs : 0;
 457        double avg_b = b->n_acqs ? b->ns / b->n_acqs : 0;
 458
 459        if (avg_a > avg_b) {
 460            return -1;
 461        } else if (avg_a < avg_b) {
 462            return 1;
 463        }
 464        break;
 465    }
 466    default:
 467        g_assert_not_reached();
 468    }
 469
 470    ca = a->callsite;
 471    cb = b->callsite;
 472    /* Break the tie with the object's address */
 473    if (ca->obj < cb->obj) {
 474        return -1;
 475    } else if (ca->obj > cb->obj) {
 476        return 1;
 477    } else {
 478        int cmp;
 479
 480        /* same obj. Break the tie with the callsite's file */
 481        cmp = strcmp(ca->file, cb->file);
 482        if (cmp) {
 483            return cmp;
 484        }
 485        /* same callsite file. Break the tie with the callsite's line */
 486        g_assert(ca->line != cb->line);
 487        if (ca->line < cb->line) {
 488            return -1;
 489        } else if (ca->line > cb->line) {
 490            return 1;
 491        } else {
 492            /* break the tie with the callsite's type */
 493            return cb->type - ca->type;
 494        }
 495    }
 496}
 497
 498static void qsp_sort(void *p, uint32_t h, void *userp)
 499{
 500    QSPEntry *e = p;
 501    GTree *tree = userp;
 502
 503    g_tree_insert(tree, e, NULL);
 504}
 505
 506static void qsp_aggregate(void *p, uint32_t h, void *up)
 507{
 508    struct qht *ht = up;
 509    const QSPEntry *e = p;
 510    QSPEntry *agg;
 511    uint32_t hash;
 512
 513    hash = qsp_entry_no_thread_hash(e);
 514    agg = qsp_entry_find(ht, e, hash);
 515    /*
 516     * The entry is in the global hash table; read from it atomically (as in
 517     * "read once").
 518     */
 519    agg->ns += atomic_read_u64(&e->ns);
 520    agg->n_acqs += atomic_read_u64(&e->n_acqs);
 521}
 522
/*
 * qht_iter() callback over a snapshot: subtract the snapshot's counts
 * from the freshly-aggregated table @htp, leaving only post-snapshot
 * activity; entries that end up empty are dropped from the report.
 */
static void qsp_iter_diff(void *p, uint32_t hash, void *htp)
{
    struct qht *ht = htp;
    QSPEntry *old = p;
    QSPEntry *new;

    new = qht_lookup(ht, old, hash);
    /* entries are never deleted, so we must have this one */
    g_assert(new != NULL);
    /* our reading of the stats happened after the snapshot was taken */
    g_assert(new->n_acqs >= old->n_acqs);
    g_assert(new->ns >= old->ns);

    new->n_acqs -= old->n_acqs;
    new->ns -= old->ns;

    /* No point in reporting an empty entry */
    if (new->n_acqs == 0 && new->ns == 0) {
        bool removed = qht_remove(ht, new, hash);

        g_assert(removed);
        g_free(new);
    }
}
 547
 548static void qsp_diff(struct qht *orig, struct qht *new)
 549{
 550    qht_iter(orig, qsp_iter_diff, new);
 551}
 552
/*
 * qht_iter() callback: merge @p into @htp, coalescing entries that differ
 * only in the object operated on; n_objs tracks how many distinct objects
 * were folded into each coalesced row.
 */
static void qsp_iter_callsite_coalesce(void *p, uint32_t h, void *htp)
{
    struct qht *ht = htp;
    QSPEntry *old = p;
    QSPEntry *e;
    uint32_t hash;

    hash = qsp_entry_no_thread_obj_hash(old);
    e = qht_lookup(ht, old, hash);
    if (e == NULL) {
        e = qsp_entry_create(ht, old, hash);
        e->n_objs = 1;
    } else if (e->callsite->obj != old->callsite->obj) {
        /* source entries are unique per object, so this counts objects */
        e->n_objs++;
    }
    e->ns += old->ns;
    e->n_acqs += old->n_acqs;
}
 571
 572static void qsp_ht_delete(void *p, uint32_t h, void *htp)
 573{
 574    g_free(p);
 575}
 576
/*
 * Populate @tree with one node per report row: aggregate the global
 * stats, subtract the last qsp_reset() snapshot (if any), optionally
 * coalesce call sites that differ only by object, and sort via the
 * tree's comparator.  The tree takes ownership of the QSPEntry nodes.
 */
static void qsp_mktree(GTree *tree, bool callsite_coalesce)
{
    QSPSnapshot *snap;
    struct qht ht, coalesce_ht;
    struct qht *htp;

    /*
     * First, see if there's a prior snapshot, so that we read the global hash
     * table _after_ the snapshot has been created, which guarantees that
     * the entries we'll read will be a superset of the snapshot's entries.
     *
     * We must remain in an RCU read-side critical section until we're done
     * with the snapshot.
     */
    rcu_read_lock();
    snap = atomic_rcu_read(&qsp_snapshot);

    /* Aggregate all results from the global hash table into a local one */
    qht_init(&ht, qsp_entry_no_thread_cmp, QSP_INITIAL_SIZE,
             QHT_MODE_AUTO_RESIZE | QHT_MODE_RAW_MUTEXES);
    qht_iter(&qsp_ht, qsp_aggregate, &ht);

    /* compute the difference wrt the snapshot, if any */
    if (snap) {
        qsp_diff(&snap->ht, &ht);
    }
    /* done with the snapshot; RCU can reclaim it */
    rcu_read_unlock();

    htp = &ht;
    if (callsite_coalesce) {
        qht_init(&coalesce_ht, qsp_entry_no_thread_obj_cmp, QSP_INITIAL_SIZE,
                 QHT_MODE_AUTO_RESIZE | QHT_MODE_RAW_MUTEXES);
        qht_iter(&ht, qsp_iter_callsite_coalesce, &coalesce_ht);

        /* free the previous hash table, and point htp to coalesce_ht */
        qht_iter(&ht, qsp_ht_delete, NULL);
        qht_destroy(&ht);
        htp = &coalesce_ht;
    }

    /* sort the hash table elements by using a tree */
    qht_iter(htp, qsp_sort, tree);

    /* free the hash table, but keep the elements (those are in the tree now) */
    qht_destroy(htp);
}
 624
 625/* free string with g_free */
 626static char *qsp_at(const QSPCallSite *callsite)
 627{
 628    GString *s = g_string_new(NULL);
 629    const char *shortened;
 630
 631    /* remove the absolute path to qemu */
 632    if (unlikely(strlen(callsite->file) < qsp_qemu_path_len)) {
 633        shortened = callsite->file;
 634    } else {
 635        shortened = callsite->file + qsp_qemu_path_len;
 636    }
 637    g_string_append_printf(s, "%s:%u", shortened, callsite->line);
 638    return g_string_free(s, FALSE);
 639}
 640
/* One formatted row of the report table. */
struct QSPReportEntry {
    const void *obj;          /* object address; unused if n_objs > 1 */
    char *callsite_at;        /* "file:line" string, owned by the entry */
    const char *typename;     /* points into qsp_typenames[] */
    double time_s;            /* total wait time, in seconds */
    double ns_avg;            /* average wait per acquisition, in ns */
    uint64_t n_acqs;
    unsigned int n_objs;      /* number of coalesced objects */
};
typedef struct QSPReportEntry QSPReportEntry;

/* A bounded array of report rows filled from the sorted tree. */
struct QSPReport {
    QSPReportEntry *entries;
    size_t n_entries;
    size_t max_n_entries;     /* capacity of @entries */
};
typedef struct QSPReport QSPReport;
 658
 659static gboolean qsp_tree_report(gpointer key, gpointer value, gpointer udata)
 660{
 661    const QSPEntry *e = key;
 662    QSPReport *report = udata;
 663    QSPReportEntry *entry;
 664
 665    if (report->n_entries == report->max_n_entries) {
 666        return TRUE;
 667    }
 668    entry = &report->entries[report->n_entries];
 669    report->n_entries++;
 670
 671    entry->obj = e->callsite->obj;
 672    entry->n_objs = e->n_objs;
 673    entry->callsite_at = qsp_at(e->callsite);
 674    entry->typename = qsp_typenames[e->callsite->type];
 675    entry->time_s = e->ns * 1e-9;
 676    entry->n_acqs = e->n_acqs;
 677    entry->ns_avg = e->n_acqs ? e->ns / e->n_acqs : 0;
 678    return FALSE;
 679}
 680
/*
 * Print @rep as an aligned table via @pr.  The "Call site" column is
 * padded to the widest call-site string so the numeric columns line up.
 */
static void
pr_report(const QSPReport *rep, FILE *f, fprintf_function pr)
{
    char *dashes;
    size_t max_len = 0;
    int callsite_len = 0;
    int callsite_rspace;
    int n_dashes;
    size_t i;

    /* find out the maximum length of all 'callsite' fields */
    for (i = 0; i < rep->n_entries; i++) {
        const QSPReportEntry *e = &rep->entries[i];
        size_t len = strlen(e->callsite_at);

        if (len > max_len) {
            max_len = len;
        }
    }

    callsite_len = MAX(max_len, strlen("Call site"));
    /* white space to leave to the right of "Call site" */
    callsite_rspace = callsite_len - strlen("Call site");

    pr(f, "Type               Object  Call site%*s  Wait Time (s)  "
       "       Count  Average (us)\n", callsite_rspace, "");

    /* build a horizontal rule with dashes */
    n_dashes = 79 + callsite_rspace;
    dashes = g_malloc(n_dashes + 1);
    memset(dashes, '-', n_dashes);
    dashes[n_dashes] = '\0';
    pr(f, "%s\n", dashes);

    for (i = 0; i < rep->n_entries; i++) {
        const QSPReportEntry *e = &rep->entries[i];
        GString *s = g_string_new(NULL);

        g_string_append_printf(s, "%-9s  ", e->typename);
        if (e->n_objs > 1) {
            /* coalesced row: show the object count instead of an address */
            g_string_append_printf(s, "[%12u]", e->n_objs);
        } else {
            g_string_append_printf(s, "%14p", e->obj);
        }
        g_string_append_printf(s, "  %s%*s  %13.5f  %12" PRIu64 "  %12.2f\n",
                               e->callsite_at,
                               callsite_len - (int)strlen(e->callsite_at), "",
                               e->time_s, e->n_acqs, e->ns_avg * 1e-3);
        pr(f, "%s", s->str);
        g_string_free(s, TRUE);
    }

    pr(f, "%s\n", dashes);
    g_free(dashes);
}
 736
 737static void report_destroy(QSPReport *rep)
 738{
 739    size_t i;
 740
 741    for (i = 0; i < rep->n_entries; i++) {
 742        QSPReportEntry *e = &rep->entries[i];
 743
 744        g_free(e->callsite_at);
 745    }
 746    g_free(rep->entries);
 747}
 748
 749void qsp_report(FILE *f, fprintf_function cpu_fprintf, size_t max,
 750                enum QSPSortBy sort_by, bool callsite_coalesce)
 751{
 752    GTree *tree = g_tree_new_full(qsp_tree_cmp, &sort_by, g_free, NULL);
 753    QSPReport rep;
 754
 755    qsp_init();
 756
 757    rep.entries = g_new0(QSPReportEntry, max);
 758    rep.n_entries = 0;
 759    rep.max_n_entries = max;
 760
 761    qsp_mktree(tree, callsite_coalesce);
 762    g_tree_foreach(tree, qsp_tree_report, &rep);
 763    g_tree_destroy(tree);
 764
 765    pr_report(&rep, f, cpu_fprintf);
 766    report_destroy(&rep);
 767}
 768
/* call_rcu() callback: free a retired snapshot and all of its entries. */
static void qsp_snapshot_destroy(QSPSnapshot *snap)
{
    qht_iter(&snap->ht, qsp_ht_delete, NULL);
    qht_destroy(&snap->ht);
    g_free(snap);
}
 775
/*
 * Reset the profiler by publishing a snapshot of the current aggregate
 * totals; subsequent reports subtract this snapshot.  The previous
 * snapshot, if any, is reclaimed via RCU once concurrent readers
 * (qsp_mktree) are done with it.
 */
void qsp_reset(void)
{
    QSPSnapshot *new = g_new(QSPSnapshot, 1);
    QSPSnapshot *old;

    qsp_init();

    qht_init(&new->ht, qsp_entry_cmp, QSP_INITIAL_SIZE,
             QHT_MODE_AUTO_RESIZE | QHT_MODE_RAW_MUTEXES);

    /* take a snapshot of the current state */
    qht_iter(&qsp_ht, qsp_aggregate, &new->ht);

    /* replace the previous snapshot, if any */
    old = atomic_xchg(&qsp_snapshot, new);
    if (old) {
        call_rcu(old, qsp_snapshot_destroy, rcu);
    }
}
 795