qemu/tests/fp/fp-bench.c
<<
>>
Prefs
   1/*
   2 * fp-bench.c - A collection of simple floating point microbenchmarks.
   3 *
   4 * Copyright (C) 2018, Emilio G. Cota <cota@braap.org>
   5 *
   6 * License: GNU GPL, version 2 or later.
   7 *   See the COPYING file in the top-level directory.
   8 */
   9#ifndef HW_POISON_H
  10#error Must define HW_POISON_H to work around TARGET_* poisoning
  11#endif
  12
  13#include "qemu/osdep.h"
  14#include <math.h>
  15#include <fenv.h>
  16#include "qemu/timer.h"
  17#include "qemu/int128.h"
  18#include "fpu/softfloat.h"
  19
/*
 * Number of operations executed between two clock reads in bench().
 * A large batch amortizes the computation of random inputs.
 */
#define OPS_PER_ITER     50000

/* Size of the operand arrays; OP_FMA is benchmarked with 3 operands */
#define MAX_OPERANDS 3

/* Initial values of the per-operand random streams (see random_ops[]) */
#define SEED_A 0xdeadfacedeadface
#define SEED_B 0xbadc0feebadc0fee
#define SEED_C 0xbeefdeadbeefdead
  28
/*
 * Operations that can be benchmarked. OP_MAX_NR is not an operation; it
 * is the number of operations and sizes the first dimension of
 * bench_funcs[].
 */
enum op {
    OP_ADD,
    OP_SUB,
    OP_MUL,
    OP_DIV,
    OP_FMA,
    OP_SQRT,
    OP_CMP,
    OP_MAX_NR,
};

/*
 * Command-line name of each operation, indexed by enum op.
 * The NULL entry at OP_MAX_NR terminates the array, which is required by
 * find_name() and by the g_strjoinv() call in usage_complete().
 */
static const char * const op_names[] = {
    [OP_ADD] = "add",
    [OP_SUB] = "sub",
    [OP_MUL] = "mul",
    [OP_DIV] = "div",
    [OP_FMA] = "mulAdd",
    [OP_SQRT] = "sqrt",
    [OP_CMP] = "cmp",
    [OP_MAX_NR] = NULL,
};
  50
/*
 * Operand formats.
 *
 * PREC_SINGLE and PREC_DOUBLE are benchmarked with the host's float and
 * double arithmetic; PREC_FLOAT32, PREC_FLOAT64 and PREC_FLOAT128 are
 * benchmarked with the softfloat routines. PREC_QUAD is what "-p quad"
 * selects on the command line; parse_args() turns it into PREC_FLOAT128
 * for the soft tester, and bench_funcs[] has no entry for it.
 * PREC_MAX_NR sizes the second dimension of bench_funcs[].
 */
enum precision {
    PREC_SINGLE,
    PREC_DOUBLE,
    PREC_QUAD,
    PREC_FLOAT32,
    PREC_FLOAT64,
    PREC_FLOAT128,
    PREC_MAX_NR,
};
  60
/*
 * Rounding modes selectable with -r. N_ROUND_MODES is the number of
 * modes, not a mode itself.
 */
enum rounding {
    ROUND_EVEN,
    ROUND_ZERO,
    ROUND_DOWN,
    ROUND_UP,
    ROUND_TIEAWAY,
    N_ROUND_MODES,
};

/*
 * Command-line name of each rounding mode, indexed by enum rounding.
 * Unlike op_names[] and tester_names[], this array is not
 * NULL-terminated; round_name_to_mode() bounds its scan with
 * N_ROUND_MODES instead.
 */
static const char * const round_names[] = {
    [ROUND_EVEN] = "even",
    [ROUND_ZERO] = "zero",
    [ROUND_DOWN] = "down",
    [ROUND_UP] = "up",
    [ROUND_TIEAWAY] = "tieaway",
};
  77
/*
 * Implementation under test: the softfloat library (TESTER_SOFT) or the
 * host's native floating point (TESTER_HOST).
 */
enum tester {
    TESTER_SOFT,
    TESTER_HOST,
    TESTER_MAX_NR,
};

/*
 * Command-line name of each tester, indexed by enum tester.
 * NULL-terminated for find_name() and g_strjoinv().
 */
static const char * const tester_names[] = {
    [TESTER_SOFT] = "soft",
    [TESTER_HOST] = "host",
    [TESTER_MAX_NR] = NULL,
};
  89
/*
 * One operand or result, accessible as any of the benchmarked types.
 * fill_random() writes operands through the softfloat members and the
 * host cases of bench() read the same storage through f/d. u64 receives
 * the result of the comparison benchmarks.
 */
union fp {
    float f;
    double d;
    float32 f32;
    float64 f64;
    float128 f128;
    uint64_t u64;
};
  98
/*
 * NOTE(review): struct op_state, float_func_t, double_func_t,
 * union fp_func and struct op_desc are not referenced anywhere else in
 * the visible part of this file; they look like leftovers. Confirm
 * before removing.
 */
struct op_state;

typedef float (*float_func_t)(const struct op_state *s);
typedef double (*double_func_t)(const struct op_state *s);

union fp_func {
    float_func_t float_func;
    double_func_t double_func;
};

/* Signature of the generated bench_<op>_<type>() entry points */
typedef void (*bench_func_t)(void);

struct op_desc {
    const char * const name;
};
 114
/* Benchmark run time used when -d is not given */
#define DEFAULT_DURATION_SECS 1

/*
 * Current state of the random stream of each operand for the 32- and
 * 64-bit formats; advanced by update_random_ops().
 */
static uint64_t random_ops[MAX_OPERANDS] = {
    SEED_A, SEED_B, SEED_C,
};

/* Same as random_ops[], for the 128-bit format */
static float128 random_quad_ops[MAX_OPERANDS] = {
    {SEED_A, SEED_B}, {SEED_B, SEED_C}, {SEED_C, SEED_A},
};
/* Status passed to every softfloat call; configured by parse_args() */
static float_status soft_status;
/*
 * Selections made on the command line. Being zero-initialized, they
 * default to the first enumerator of their type until parse_args() runs.
 */
static enum precision precision;
static enum op operation;
static enum tester tester;
/* Totals accumulated by bench() and reported by pr_stats() */
static uint64_t n_completed_ops;
/* Requested run time in seconds (-d) */
static unsigned int duration = DEFAULT_DURATION_SECS;
static int64_t ns_elapsed;
/* disable optimizations with volatile */
static volatile union fp res;
 133
 134/*
 135 * From: https://en.wikipedia.org/wiki/Xorshift
 136 * This is faster than rand_r(), and gives us a wider range (RAND_MAX is only
 137 * guaranteed to be >= INT_MAX).
 138 */
 139static uint64_t xorshift64star(uint64_t x)
 140{
 141    x ^= x >> 12; /* a */
 142    x ^= x << 25; /* b */
 143    x ^= x >> 27; /* c */
 144    return x * UINT64_C(2685821657736338717);
 145}
 146
 147static void update_random_ops(int n_ops, enum precision prec)
 148{
 149    int i;
 150
 151    for (i = 0; i < n_ops; i++) {
 152
 153        switch (prec) {
 154        case PREC_SINGLE:
 155        case PREC_FLOAT32:
 156        {
 157            uint64_t r = random_ops[i];
 158            do {
 159                r = xorshift64star(r);
 160            } while (!float32_is_normal(r));
 161            random_ops[i] = r;
 162            break;
 163        }
 164        case PREC_DOUBLE:
 165        case PREC_FLOAT64:
 166        {
 167            uint64_t r = random_ops[i];
 168            do {
 169                r = xorshift64star(r);
 170            } while (!float64_is_normal(r));
 171            random_ops[i] = r;
 172            break;
 173        }
 174        case PREC_QUAD:
 175        case PREC_FLOAT128:
 176        {
 177            float128 r = random_quad_ops[i];
 178            uint64_t hi = r.high;
 179            uint64_t lo = r.low;
 180            do {
 181                hi = xorshift64star(hi);
 182                lo = xorshift64star(lo);
 183                r = make_float128(hi, lo);
 184            } while (!float128_is_normal(r));
 185            random_quad_ops[i] = r;
 186            break;
 187        }
 188        default:
 189            g_assert_not_reached();
 190        }
 191    }
 192}
 193
 194static void fill_random(union fp *ops, int n_ops, enum precision prec,
 195                        bool no_neg)
 196{
 197    int i;
 198
 199    for (i = 0; i < n_ops; i++) {
 200        switch (prec) {
 201        case PREC_SINGLE:
 202        case PREC_FLOAT32:
 203            ops[i].f32 = make_float32(random_ops[i]);
 204            if (no_neg && float32_is_neg(ops[i].f32)) {
 205                ops[i].f32 = float32_chs(ops[i].f32);
 206            }
 207            break;
 208        case PREC_DOUBLE:
 209        case PREC_FLOAT64:
 210            ops[i].f64 = make_float64(random_ops[i]);
 211            if (no_neg && float64_is_neg(ops[i].f64)) {
 212                ops[i].f64 = float64_chs(ops[i].f64);
 213            }
 214            break;
 215        case PREC_QUAD:
 216        case PREC_FLOAT128:
 217            ops[i].f128 = random_quad_ops[i];
 218            if (no_neg && float128_is_neg(ops[i].f128)) {
 219                ops[i].f128 = float128_chs(ops[i].f128);
 220            }
 221            break;
 222        default:
 223            g_assert_not_reached();
 224        }
 225    }
 226}
 227
 228/*
 229 * The main benchmark function. Instead of (ab)using macros, we rely
 230 * on the compiler to unfold this at compile-time.
 231 */
 232static void bench(enum precision prec, enum op op, int n_ops, bool no_neg)
 233{
 234    int64_t tf = get_clock() + duration * 1000000000LL;
 235
 236    while (get_clock() < tf) {
 237        union fp ops[MAX_OPERANDS];
 238        int64_t t0;
 239        int i;
 240
 241        update_random_ops(n_ops, prec);
 242        switch (prec) {
 243        case PREC_SINGLE:
 244            fill_random(ops, n_ops, prec, no_neg);
 245            t0 = get_clock();
 246            for (i = 0; i < OPS_PER_ITER; i++) {
 247                float a = ops[0].f;
 248                float b = ops[1].f;
 249                float c = ops[2].f;
 250
 251                switch (op) {
 252                case OP_ADD:
 253                    res.f = a + b;
 254                    break;
 255                case OP_SUB:
 256                    res.f = a - b;
 257                    break;
 258                case OP_MUL:
 259                    res.f = a * b;
 260                    break;
 261                case OP_DIV:
 262                    res.f = a / b;
 263                    break;
 264                case OP_FMA:
 265                    res.f = fmaf(a, b, c);
 266                    break;
 267                case OP_SQRT:
 268                    res.f = sqrtf(a);
 269                    break;
 270                case OP_CMP:
 271                    res.u64 = isgreater(a, b);
 272                    break;
 273                default:
 274                    g_assert_not_reached();
 275                }
 276            }
 277            break;
 278        case PREC_DOUBLE:
 279            fill_random(ops, n_ops, prec, no_neg);
 280            t0 = get_clock();
 281            for (i = 0; i < OPS_PER_ITER; i++) {
 282                double a = ops[0].d;
 283                double b = ops[1].d;
 284                double c = ops[2].d;
 285
 286                switch (op) {
 287                case OP_ADD:
 288                    res.d = a + b;
 289                    break;
 290                case OP_SUB:
 291                    res.d = a - b;
 292                    break;
 293                case OP_MUL:
 294                    res.d = a * b;
 295                    break;
 296                case OP_DIV:
 297                    res.d = a / b;
 298                    break;
 299                case OP_FMA:
 300                    res.d = fma(a, b, c);
 301                    break;
 302                case OP_SQRT:
 303                    res.d = sqrt(a);
 304                    break;
 305                case OP_CMP:
 306                    res.u64 = isgreater(a, b);
 307                    break;
 308                default:
 309                    g_assert_not_reached();
 310                }
 311            }
 312            break;
 313        case PREC_FLOAT32:
 314            fill_random(ops, n_ops, prec, no_neg);
 315            t0 = get_clock();
 316            for (i = 0; i < OPS_PER_ITER; i++) {
 317                float32 a = ops[0].f32;
 318                float32 b = ops[1].f32;
 319                float32 c = ops[2].f32;
 320
 321                switch (op) {
 322                case OP_ADD:
 323                    res.f32 = float32_add(a, b, &soft_status);
 324                    break;
 325                case OP_SUB:
 326                    res.f32 = float32_sub(a, b, &soft_status);
 327                    break;
 328                case OP_MUL:
 329                    res.f = float32_mul(a, b, &soft_status);
 330                    break;
 331                case OP_DIV:
 332                    res.f32 = float32_div(a, b, &soft_status);
 333                    break;
 334                case OP_FMA:
 335                    res.f32 = float32_muladd(a, b, c, 0, &soft_status);
 336                    break;
 337                case OP_SQRT:
 338                    res.f32 = float32_sqrt(a, &soft_status);
 339                    break;
 340                case OP_CMP:
 341                    res.u64 = float32_compare_quiet(a, b, &soft_status);
 342                    break;
 343                default:
 344                    g_assert_not_reached();
 345                }
 346            }
 347            break;
 348        case PREC_FLOAT64:
 349            fill_random(ops, n_ops, prec, no_neg);
 350            t0 = get_clock();
 351            for (i = 0; i < OPS_PER_ITER; i++) {
 352                float64 a = ops[0].f64;
 353                float64 b = ops[1].f64;
 354                float64 c = ops[2].f64;
 355
 356                switch (op) {
 357                case OP_ADD:
 358                    res.f64 = float64_add(a, b, &soft_status);
 359                    break;
 360                case OP_SUB:
 361                    res.f64 = float64_sub(a, b, &soft_status);
 362                    break;
 363                case OP_MUL:
 364                    res.f = float64_mul(a, b, &soft_status);
 365                    break;
 366                case OP_DIV:
 367                    res.f64 = float64_div(a, b, &soft_status);
 368                    break;
 369                case OP_FMA:
 370                    res.f64 = float64_muladd(a, b, c, 0, &soft_status);
 371                    break;
 372                case OP_SQRT:
 373                    res.f64 = float64_sqrt(a, &soft_status);
 374                    break;
 375                case OP_CMP:
 376                    res.u64 = float64_compare_quiet(a, b, &soft_status);
 377                    break;
 378                default:
 379                    g_assert_not_reached();
 380                }
 381            }
 382            break;
 383        case PREC_FLOAT128:
 384            fill_random(ops, n_ops, prec, no_neg);
 385            t0 = get_clock();
 386            for (i = 0; i < OPS_PER_ITER; i++) {
 387                float128 a = ops[0].f128;
 388                float128 b = ops[1].f128;
 389                float128 c = ops[2].f128;
 390
 391                switch (op) {
 392                case OP_ADD:
 393                    res.f128 = float128_add(a, b, &soft_status);
 394                    break;
 395                case OP_SUB:
 396                    res.f128 = float128_sub(a, b, &soft_status);
 397                    break;
 398                case OP_MUL:
 399                    res.f128 = float128_mul(a, b, &soft_status);
 400                    break;
 401                case OP_DIV:
 402                    res.f128 = float128_div(a, b, &soft_status);
 403                    break;
 404                case OP_FMA:
 405                    res.f128 = float128_muladd(a, b, c, 0, &soft_status);
 406                    break;
 407                case OP_SQRT:
 408                    res.f128 = float128_sqrt(a, &soft_status);
 409                    break;
 410                case OP_CMP:
 411                    res.u64 = float128_compare_quiet(a, b, &soft_status);
 412                    break;
 413                default:
 414                    g_assert_not_reached();
 415                }
 416            }
 417            break;
 418        default:
 419            g_assert_not_reached();
 420        }
 421        ns_elapsed += get_clock() - t0;
 422        n_completed_ops += OPS_PER_ITER;
 423    }
 424}
 425
/*
 * Generate "static void name(void)", a bench_func_t that calls bench()
 * with constant arguments and no_neg == false. The "flatten" attribute
 * asks the compiler to inline bench() into it so the switches on the
 * constant arguments can be resolved at compile time.
 * The @type argument is not used by the expansion.
 */
#define GEN_BENCH(name, type, prec, op, n_ops)          \
    static void __attribute__((flatten)) name(void)     \
    {                                                   \
        bench(prec, op, n_ops, false);                  \
    }

/* Same as GEN_BENCH, but with no_neg == true */
#define GEN_BENCH_NO_NEG(name, type, prec, op, n_ops)   \
    static void __attribute__((flatten)) name(void)     \
    {                                                   \
        bench(prec, op, n_ops, true);                   \
    }

/*
 * Generate bench_<opname>_{float,double,float32,float64,float128}().
 * No function is generated for PREC_QUAD.
 */
#define GEN_BENCH_ALL_TYPES(opname, op, n_ops)                          \
    GEN_BENCH(bench_ ## opname ## _float, float, PREC_SINGLE, op, n_ops) \
    GEN_BENCH(bench_ ## opname ## _double, double, PREC_DOUBLE, op, n_ops) \
    GEN_BENCH(bench_ ## opname ## _float32, float32, PREC_FLOAT32, op, n_ops) \
    GEN_BENCH(bench_ ## opname ## _float64, float64, PREC_FLOAT64, op, n_ops) \
    GEN_BENCH(bench_ ## opname ## _float128, float128, PREC_FLOAT128, op, n_ops)

/* The last argument is the number of random operands the operation takes */
GEN_BENCH_ALL_TYPES(add, OP_ADD, 2)
GEN_BENCH_ALL_TYPES(sub, OP_SUB, 2)
GEN_BENCH_ALL_TYPES(mul, OP_MUL, 2)
GEN_BENCH_ALL_TYPES(div, OP_DIV, 2)
GEN_BENCH_ALL_TYPES(fma, OP_FMA, 3)
GEN_BENCH_ALL_TYPES(cmp, OP_CMP, 2)
#undef GEN_BENCH_ALL_TYPES

/* Variant of GEN_BENCH_ALL_TYPES built on GEN_BENCH_NO_NEG */
#define GEN_BENCH_ALL_TYPES_NO_NEG(name, op, n)                         \
    GEN_BENCH_NO_NEG(bench_ ## name ## _float, float, PREC_SINGLE, op, n) \
    GEN_BENCH_NO_NEG(bench_ ## name ## _double, double, PREC_DOUBLE, op, n) \
    GEN_BENCH_NO_NEG(bench_ ## name ## _float32, float32, PREC_FLOAT32, op, n) \
    GEN_BENCH_NO_NEG(bench_ ## name ## _float64, float64, PREC_FLOAT64, op, n) \
    GEN_BENCH_NO_NEG(bench_ ## name ## _float128, float128, PREC_FLOAT128, op, n)

/* sqrt is the only operation benchmarked with non-negative operands */
GEN_BENCH_ALL_TYPES_NO_NEG(sqrt, OP_SQRT, 1)
#undef GEN_BENCH_ALL_TYPES_NO_NEG

#undef GEN_BENCH_NO_NEG
#undef GEN_BENCH
 465
/*
 * Initializer for one row of bench_funcs[]: maps each precision to the
 * generated bench_<opname>_<type>() function. PREC_QUAD gets no
 * initializer, so that slot stays NULL.
 */
#define GEN_BENCH_FUNCS(opname, op)                             \
    [op] = {                                                    \
        [PREC_SINGLE]    = bench_ ## opname ## _float,          \
        [PREC_DOUBLE]    = bench_ ## opname ## _double,         \
        [PREC_FLOAT32]   = bench_ ## opname ## _float32,        \
        [PREC_FLOAT64]   = bench_ ## opname ## _float64,        \
        [PREC_FLOAT128]   = bench_ ## opname ## _float128,      \
    }

/* Dispatch table used by run_bench(), indexed by [operation][precision] */
static const bench_func_t bench_funcs[OP_MAX_NR][PREC_MAX_NR] = {
    GEN_BENCH_FUNCS(add, OP_ADD),
    GEN_BENCH_FUNCS(sub, OP_SUB),
    GEN_BENCH_FUNCS(mul, OP_MUL),
    GEN_BENCH_FUNCS(div, OP_DIV),
    GEN_BENCH_FUNCS(fma, OP_FMA),
    GEN_BENCH_FUNCS(sqrt, OP_SQRT),
    GEN_BENCH_FUNCS(cmp, OP_CMP),
};

#undef GEN_BENCH_FUNCS
 486
 487static void run_bench(void)
 488{
 489    bench_func_t f;
 490
 491    f = bench_funcs[operation][precision];
 492    g_assert(f);
 493    f();
 494}
 495
 496/* @arr must be NULL-terminated */
 497static int find_name(const char * const *arr, const char *name)
 498{
 499    int i;
 500
 501    for (i = 0; arr[i] != NULL; i++) {
 502        if (strcmp(name, arr[i]) == 0) {
 503            return i;
 504        }
 505    }
 506    return -1;
 507}
 508
 509static void usage_complete(int argc, char *argv[])
 510{
 511    gchar *op_list = g_strjoinv(", ", (gchar **)op_names);
 512    gchar *tester_list = g_strjoinv(", ", (gchar **)tester_names);
 513
 514    fprintf(stderr, "Usage: %s [options]\n", argv[0]);
 515    fprintf(stderr, "options:\n");
 516    fprintf(stderr, " -d = duration, in seconds. Default: %d\n",
 517            DEFAULT_DURATION_SECS);
 518    fprintf(stderr, " -h = show this help message.\n");
 519    fprintf(stderr, " -o = floating point operation (%s). Default: %s\n",
 520            op_list, op_names[0]);
 521    fprintf(stderr, " -p = floating point precision (single, double, quad[soft only]). "
 522            "Default: single\n");
 523    fprintf(stderr, " -r = rounding mode (even, zero, down, up, tieaway). "
 524            "Default: even\n");
 525    fprintf(stderr, " -t = tester (%s). Default: %s\n",
 526            tester_list, tester_names[0]);
 527    fprintf(stderr, " -z = flush inputs to zero (soft tester only). "
 528            "Default: disabled\n");
 529    fprintf(stderr, " -Z = flush output to zero (soft tester only). "
 530            "Default: disabled\n");
 531
 532    g_free(tester_list);
 533    g_free(op_list);
 534}
 535
 536static int round_name_to_mode(const char *name)
 537{
 538    int i;
 539
 540    for (i = 0; i < N_ROUND_MODES; i++) {
 541        if (!strcmp(round_names[i], name)) {
 542            return i;
 543        }
 544    }
 545    return -1;
 546}
 547
 548static void QEMU_NORETURN die_host_rounding(enum rounding rounding)
 549{
 550    fprintf(stderr, "fatal: '%s' rounding not supported on this host\n",
 551            round_names[rounding]);
 552    exit(EXIT_FAILURE);
 553}
 554
 555static void set_host_precision(enum rounding rounding)
 556{
 557    int rhost;
 558
 559    switch (rounding) {
 560    case ROUND_EVEN:
 561        rhost = FE_TONEAREST;
 562        break;
 563    case ROUND_ZERO:
 564        rhost = FE_TOWARDZERO;
 565        break;
 566    case ROUND_DOWN:
 567        rhost = FE_DOWNWARD;
 568        break;
 569    case ROUND_UP:
 570        rhost = FE_UPWARD;
 571        break;
 572    case ROUND_TIEAWAY:
 573        die_host_rounding(rounding);
 574        return;
 575    default:
 576        g_assert_not_reached();
 577    }
 578
 579    if (fesetround(rhost)) {
 580        die_host_rounding(rounding);
 581    }
 582}
 583
 584static void set_soft_precision(enum rounding rounding)
 585{
 586    signed char mode;
 587
 588    switch (rounding) {
 589    case ROUND_EVEN:
 590        mode = float_round_nearest_even;
 591        break;
 592    case ROUND_ZERO:
 593        mode = float_round_to_zero;
 594        break;
 595    case ROUND_DOWN:
 596        mode = float_round_down;
 597        break;
 598    case ROUND_UP:
 599        mode = float_round_up;
 600        break;
 601    case ROUND_TIEAWAY:
 602        mode = float_round_ties_away;
 603        break;
 604    default:
 605        g_assert_not_reached();
 606    }
 607    soft_status.float_rounding_mode = mode;
 608}
 609
 610static void parse_args(int argc, char *argv[])
 611{
 612    int c;
 613    int val;
 614    int rounding = ROUND_EVEN;
 615
 616    for (;;) {
 617        c = getopt(argc, argv, "d:ho:p:r:t:zZ");
 618        if (c < 0) {
 619            break;
 620        }
 621        switch (c) {
 622        case 'd':
 623            duration = atoi(optarg);
 624            break;
 625        case 'h':
 626            usage_complete(argc, argv);
 627            exit(EXIT_SUCCESS);
 628        case 'o':
 629            val = find_name(op_names, optarg);
 630            if (val < 0) {
 631                fprintf(stderr, "Unsupported op '%s'\n", optarg);
 632                exit(EXIT_FAILURE);
 633            }
 634            operation = val;
 635            break;
 636        case 'p':
 637            if (!strcmp(optarg, "single")) {
 638                precision = PREC_SINGLE;
 639            } else if (!strcmp(optarg, "double")) {
 640                precision = PREC_DOUBLE;
 641            } else if (!strcmp(optarg, "quad")) {
 642                precision = PREC_QUAD;
 643            } else {
 644                fprintf(stderr, "Unsupported precision '%s'\n", optarg);
 645                exit(EXIT_FAILURE);
 646            }
 647            break;
 648        case 'r':
 649            rounding = round_name_to_mode(optarg);
 650            if (rounding < 0) {
 651                fprintf(stderr, "fatal: invalid rounding mode '%s'\n", optarg);
 652                exit(EXIT_FAILURE);
 653            }
 654            break;
 655        case 't':
 656            val = find_name(tester_names, optarg);
 657            if (val < 0) {
 658                fprintf(stderr, "Unsupported tester '%s'\n", optarg);
 659                exit(EXIT_FAILURE);
 660            }
 661            tester = val;
 662            break;
 663        case 'z':
 664            soft_status.flush_inputs_to_zero = 1;
 665            break;
 666        case 'Z':
 667            soft_status.flush_to_zero = 1;
 668            break;
 669        }
 670    }
 671
 672    /* set precision and rounding mode based on the tester */
 673    switch (tester) {
 674    case TESTER_HOST:
 675        set_host_precision(rounding);
 676        break;
 677    case TESTER_SOFT:
 678        set_soft_precision(rounding);
 679        switch (precision) {
 680        case PREC_SINGLE:
 681            precision = PREC_FLOAT32;
 682            break;
 683        case PREC_DOUBLE:
 684            precision = PREC_FLOAT64;
 685            break;
 686        case PREC_QUAD:
 687            precision = PREC_FLOAT128;
 688            break;
 689        default:
 690            g_assert_not_reached();
 691        }
 692        break;
 693    default:
 694        g_assert_not_reached();
 695    }
 696}
 697
 698static void pr_stats(void)
 699{
 700    printf("%.2f MFlops\n", (double)n_completed_ops / ns_elapsed * 1e3);
 701}
 702
 703int main(int argc, char *argv[])
 704{
 705    parse_args(argc, argv);
 706    run_bench();
 707    pr_stats();
 708    return 0;
 709}
 710