linux/tools/perf/bench/numa.c
// SPDX-License-Identifier: GPL-2.0
/*
 * numa.c
 *
 * numa: Simulate NUMA-sensitive workloads and measure their NUMA performance
 */

#include <inttypes.h>
/* For the CLR_() macros */
#include <pthread.h>

#include <subcmd/parse-options.h>
#include "../util/cloexec.h"

#include "bench.h"

#include <errno.h>
#include <sched.h>
#include <stdio.h>
#include <assert.h>
#include <malloc.h>
#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/wait.h>
#include <sys/prctl.h>
#include <sys/types.h>
#include <linux/kernel.h>
#include <linux/time64.h>
#include <linux/numa.h>
#include <linux/zalloc.h>

#include <numa.h>
#include <numaif.h>

#ifndef RUSAGE_THREAD
# define RUSAGE_THREAD 1
#endif

/*
 * Regular printout to the terminal, suppressed if -q is specified:
 */
#define tprintf(x...) do { if (g && g->p.show_details >= 0) printf(x); } while (0)

/*
 * Debug printf:
 */
#undef dprintf
#define dprintf(x...) do { if (g && g->p.show_details >= 1) printf(x); } while (0)

struct thread_data {
        int                     curr_cpu;
        cpu_set_t               bind_cpumask;
        int                     bind_node;
        u8                      *process_data;
        int                     process_nr;
        int                     thread_nr;
        int                     task_nr;
        unsigned int            loops_done;
        u64                     val;
        u64                     runtime_ns;
        u64                     system_time_ns;
        u64                     user_time_ns;
        double                  speed_gbs;
        pthread_mutex_t         *process_lock;
};

/* Parameters set by options: */

struct params {
        /* Startup synchronization: */
        bool                    serialize_startup;

        /* Task hierarchy: */
        int                     nr_proc;
        int                     nr_threads;

        /* Working set sizes: */
        const char              *mb_global_str;
        const char              *mb_proc_str;
        const char              *mb_proc_locked_str;
        const char              *mb_thread_str;

        double                  mb_global;
        double                  mb_proc;
        double                  mb_proc_locked;
        double                  mb_thread;

        /* Access patterns to the working set: */
        bool                    data_reads;
        bool                    data_writes;
        bool                    data_backwards;
        bool                    data_zero_memset;
        bool                    data_rand_walk;
        u32                     nr_loops;
        u32                     nr_secs;
        u32                     sleep_usecs;

        /* Working set initialization: */
        bool                    init_zero;
        bool                    init_random;
        bool                    init_cpu0;

        /* Misc options: */
        int                     show_details;
        int                     run_all;
        int                     thp;

        long                    bytes_global;
        long                    bytes_process;
        long                    bytes_process_locked;
        long                    bytes_thread;

        int                     nr_tasks;
        bool                    show_quiet;

        bool                    show_convergence;
        bool                    measure_convergence;

        int                     perturb_secs;
        int                     nr_cpus;
        int                     nr_nodes;

        /* Affinity options -C and -M: */
        char                    *cpu_list_str;
        char                    *node_list_str;
};


/* Global, read-writable area, accessible to all processes and threads: */

struct global_info {
        u8                      *data;

        pthread_mutex_t         startup_mutex;
        pthread_cond_t          startup_cond;
        int                     nr_tasks_started;

        pthread_mutex_t         start_work_mutex;
        pthread_cond_t          start_work_cond;
        int                     nr_tasks_working;
        bool                    start_work;

        pthread_mutex_t         stop_work_mutex;
        u64                     bytes_done;

        struct thread_data      *threads;

        /* Convergence latency measurement: */
        bool                    all_converged;
        bool                    stop_work;

        int                     print_once;

        struct params           p;
};

static struct global_info       *g = NULL;

static int parse_cpus_opt(const struct option *opt, const char *arg, int unset);
static int parse_nodes_opt(const struct option *opt, const char *arg, int unset);

struct params p0;

static const struct option options[] = {
        OPT_INTEGER('p', "nr_proc"      , &p0.nr_proc,          "number of processes"),
        OPT_INTEGER('t', "nr_threads"   , &p0.nr_threads,       "number of threads per process"),

        OPT_STRING('G', "mb_global"     , &p0.mb_global_str,    "MB", "global  memory (MBs)"),
        OPT_STRING('P', "mb_proc"       , &p0.mb_proc_str,      "MB", "process memory (MBs)"),
        OPT_STRING('L', "mb_proc_locked", &p0.mb_proc_locked_str,"MB", "process serialized/locked memory access (MBs), <= process_memory"),
        OPT_STRING('T', "mb_thread"     , &p0.mb_thread_str,    "MB", "thread  memory (MBs)"),

        OPT_UINTEGER('l', "nr_loops"    , &p0.nr_loops,         "max number of loops to run (default: unlimited)"),
        OPT_UINTEGER('s', "nr_secs"     , &p0.nr_secs,          "max number of seconds to run (default: 5 secs)"),
        OPT_UINTEGER('u', "usleep"      , &p0.sleep_usecs,      "usecs to sleep per loop iteration"),

        OPT_BOOLEAN('R', "data_reads"   , &p0.data_reads,       "access the data via reads (can be mixed with -W)"),
        OPT_BOOLEAN('W', "data_writes"  , &p0.data_writes,      "access the data via writes (can be mixed with -R)"),
        OPT_BOOLEAN('B', "data_backwards", &p0.data_backwards,  "access the data backwards as well"),
        OPT_BOOLEAN('Z', "data_zero_memset", &p0.data_zero_memset,"access the data via glibc bzero only"),
        OPT_BOOLEAN('r', "data_rand_walk", &p0.data_rand_walk,  "access the data with random (32bit LFSR) walk"),

        OPT_BOOLEAN('z', "init_zero"    , &p0.init_zero,        "bzero the initial allocations"),
        OPT_BOOLEAN('I', "init_random"  , &p0.init_random,      "randomize the contents of the initial allocations"),
        OPT_BOOLEAN('0', "init_cpu0"    , &p0.init_cpu0,        "do the initial allocations on CPU#0"),
        OPT_INTEGER('x', "perturb_secs", &p0.perturb_secs,      "perturb thread 0/0 every X secs, to test convergence stability"),

        OPT_INCR   ('d', "show_details" , &p0.show_details,     "Show details"),
        OPT_INCR   ('a', "all"          , &p0.run_all,          "Run all tests in the suite"),
        OPT_INTEGER('H', "thp"          , &p0.thp,              "MADV_NOHUGEPAGE < 0 < MADV_HUGEPAGE"),
        OPT_BOOLEAN('c', "show_convergence", &p0.show_convergence, "show convergence details, "
                    "convergence is reached when each process (all its threads) is running on a single NUMA node."),
        OPT_BOOLEAN('m', "measure_convergence", &p0.measure_convergence, "measure convergence latency"),
        OPT_BOOLEAN('q', "quiet"        , &p0.show_quiet,       "quiet mode"),
        OPT_BOOLEAN('S', "serialize-startup", &p0.serialize_startup,"serialize thread startup"),

        /* Special option string parsing callbacks: */
        OPT_CALLBACK('C', "cpus", NULL, "cpu[,cpu2,...cpuN]",
                        "bind the first N tasks to these specific cpus (the rest is unbound)",
                        parse_cpus_opt),
        OPT_CALLBACK('M', "memnodes", NULL, "node[,node2,...nodeN]",
                        "bind the first N tasks to these specific memory nodes (the rest is unbound)",
                        parse_nodes_opt),
        OPT_END()
};
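
/*
 * Example invocation built from the options above (the sizes here are
 * illustrative, not from the original source): two processes with four
 * threads each, 512MB of process memory, measuring convergence latency:
 *
 *   perf bench numa mem -p 2 -t 4 -P 512 -m
 */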

static const char * const bench_numa_usage[] = {
        "perf bench numa <options>",
        NULL
};

static const char * const numa_usage[] = {
        "perf bench numa mem [<options>]",
        NULL
};

/*
 * Return the number of NUMA nodes present:
 */
static int nr_numa_nodes(void)
{
        int i, nr_nodes = 0;

        for (i = 0; i < g->p.nr_nodes; i++) {
                if (numa_bitmask_isbitset(numa_nodes_ptr, i))
                        nr_nodes++;
        }

        return nr_nodes;
}

/*
 * Check whether a given NUMA node is present:
 */
static int is_node_present(int node)
{
        return numa_bitmask_isbitset(numa_nodes_ptr, node);
}

/*
 * Check whether a given NUMA node has CPUs:
 */
static bool node_has_cpus(int node)
{
        struct bitmask *cpumask = numa_allocate_cpumask();
        bool ret = false; /* assume "no CPUs" on lookup failure */
        int cpu;

        BUG_ON(!cpumask);
        if (!numa_node_to_cpus(node, cpumask)) {
                for (cpu = 0; cpu < (int)cpumask->size; cpu++) {
                        if (numa_bitmask_isbitset(cpumask, cpu)) {
                                ret = true;
                                break;
                        }
                }
        }
        numa_free_cpumask(cpumask);

        return ret;
}

static cpu_set_t bind_to_cpu(int target_cpu)
{
        cpu_set_t orig_mask, mask;
        int ret;

        ret = sched_getaffinity(0, sizeof(orig_mask), &orig_mask);
        BUG_ON(ret);

        CPU_ZERO(&mask);

        if (target_cpu == -1) {
                int cpu;

                for (cpu = 0; cpu < g->p.nr_cpus; cpu++)
                        CPU_SET(cpu, &mask);
        } else {
                BUG_ON(target_cpu < 0 || target_cpu >= g->p.nr_cpus);
                CPU_SET(target_cpu, &mask);
        }

        ret = sched_setaffinity(0, sizeof(mask), &mask);
        BUG_ON(ret);

        return orig_mask;
}

static cpu_set_t bind_to_node(int target_node)
{
        cpu_set_t orig_mask, mask;
        int cpu;
        int ret;

        ret = sched_getaffinity(0, sizeof(orig_mask), &orig_mask);
        BUG_ON(ret);

        CPU_ZERO(&mask);

        if (target_node == NUMA_NO_NODE) {
                for (cpu = 0; cpu < g->p.nr_cpus; cpu++)
                        CPU_SET(cpu, &mask);
        } else {
                struct bitmask *cpumask = numa_allocate_cpumask();

                BUG_ON(!cpumask);
                if (!numa_node_to_cpus(target_node, cpumask)) {
                        for (cpu = 0; cpu < (int)cpumask->size; cpu++) {
                                if (numa_bitmask_isbitset(cpumask, cpu))
                                        CPU_SET(cpu, &mask);
                        }
                }
                numa_free_cpumask(cpumask);
        }

        ret = sched_setaffinity(0, sizeof(mask), &mask);
        BUG_ON(ret);

        return orig_mask;
}

static void bind_to_cpumask(cpu_set_t mask)
{
        int ret;

        ret = sched_setaffinity(0, sizeof(mask), &mask);
        BUG_ON(ret);
}

static void mempol_restore(void)
{
        int ret;

        ret = set_mempolicy(MPOL_DEFAULT, NULL, g->p.nr_nodes-1);

        BUG_ON(ret);
}

static void bind_to_memnode(int node)
{
        struct bitmask *node_mask;
        int ret;

        if (node == NUMA_NO_NODE)
                return;

        node_mask = numa_allocate_nodemask();
        BUG_ON(!node_mask);

        numa_bitmask_clearall(node_mask);
        numa_bitmask_setbit(node_mask, node);

        ret = set_mempolicy(MPOL_BIND, node_mask->maskp, node_mask->size + 1);
        dprintf("binding to node %d, mask: %016lx => %d\n", node, *node_mask->maskp, ret);

        numa_bitmask_free(node_mask);
        BUG_ON(ret);
}

#define HPSIZE (2*1024*1024)

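/*
 * Note: PR_SET_NAME truncates the name to TASK_COMM_LEN (16 bytes,
 * including the terminating NUL), so longer formatted names are
 * silently cut short.
 */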
#define set_taskname(fmt...)                            \
do {                                                    \
        char name[20];                                  \
                                                        \
        snprintf(name, 20, fmt);                        \
        prctl(PR_SET_NAME, name);                       \
} while (0)

static u8 *alloc_data(ssize_t bytes0, int map_flags,
                      int init_zero, int init_cpu0, int thp, int init_random)
{
        cpu_set_t orig_mask;
        ssize_t bytes;
        u8 *buf;
        int ret;

        if (!bytes0)
                return NULL;

        /* Allocate and initialize all memory on CPU#0: */
        if (init_cpu0) {
                int node = numa_node_of_cpu(0);

                orig_mask = bind_to_node(node);
                bind_to_memnode(node);
        }

        bytes = bytes0 + HPSIZE;

        buf = (void *)mmap(0, bytes, PROT_READ|PROT_WRITE, MAP_ANON|map_flags, -1, 0);
        BUG_ON(buf == (void *)-1);

        if (map_flags == MAP_PRIVATE) {
                if (thp > 0) {
                        ret = madvise(buf, bytes, MADV_HUGEPAGE);
                        if (ret && !g->print_once) {
                                g->print_once = 1;
                                printf("WARNING: Could not enable THP - do: 'echo madvise > /sys/kernel/mm/transparent_hugepage/enabled'\n");
                        }
                }
                if (thp < 0) {
                        ret = madvise(buf, bytes, MADV_NOHUGEPAGE);
                        if (ret && !g->print_once) {
                                g->print_once = 1;
                                printf("WARNING: Could not disable THP: run a CONFIG_TRANSPARENT_HUGEPAGE kernel?\n");
                        }
                }
        }

        if (init_zero) {
                bzero(buf, bytes);
        } else {
                /* Initialize random contents, different in each word: */
                if (init_random) {
                        u64 *wbuf = (void *)buf;
                        long off = rand();
                        long i;

                        for (i = 0; i < bytes/8; i++)
                                wbuf[i] = i + off;
                }
        }

        /* Align to 2MB boundary: */
        buf = (void *)(((unsigned long)buf + HPSIZE-1) & ~(HPSIZE-1));

        /* Restore affinity: */
        if (init_cpu0) {
                bind_to_cpumask(orig_mask);
                mempol_restore();
        }

        return buf;
}

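/*
 * Note (observation, not from the original source): alloc_data() may
 * round the returned pointer up by almost HPSIZE, so the munmap() below
 * can leave the unaligned head (and a bit of tail) of the original
 * mapping in place; for a benchmark this small leak is harmless.
 */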
static void free_data(void *data, ssize_t bytes)
{
        int ret;

        if (!data)
                return;

        ret = munmap(data, bytes);
        BUG_ON(ret);
}

/*
 * Create a shared memory buffer that can be shared between processes, zeroed:
 */
static void * zalloc_shared_data(ssize_t bytes)
{
        return alloc_data(bytes, MAP_SHARED, 1, g->p.init_cpu0,  g->p.thp, g->p.init_random);
}

/*
 * Create a shared memory buffer that can be shared between processes:
 */
static void * setup_shared_data(ssize_t bytes)
{
        return alloc_data(bytes, MAP_SHARED, 0, g->p.init_cpu0,  g->p.thp, g->p.init_random);
}

/*
 * Allocate process-local memory - this will either be shared between
 * threads of this process, or only be accessed by this thread:
 */
static void * setup_private_data(ssize_t bytes)
{
        return alloc_data(bytes, MAP_PRIVATE, 0, g->p.init_cpu0,  g->p.thp, g->p.init_random);
}

/*
 * Initialize a process-shared (global) mutex:
 */
static void init_global_mutex(pthread_mutex_t *mutex)
{
        pthread_mutexattr_t attr;

        pthread_mutexattr_init(&attr);
        pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
        pthread_mutex_init(mutex, &attr);
}

/*
 * Initialize a process-shared (global) condition variable:
 */
static void init_global_cond(pthread_cond_t *cond)
{
        pthread_condattr_t attr;

        pthread_condattr_init(&attr);
        pthread_condattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
        pthread_cond_init(cond, &attr);
}
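
/*
 * Note: PTHREAD_PROCESS_SHARED only works here because these objects
 * live inside the MAP_SHARED 'g' area, which stays visible to the
 * worker processes forked later.
 */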

static int parse_cpu_list(const char *arg)
{
        p0.cpu_list_str = strdup(arg);

        dprintf("got CPU list: {%s}\n", p0.cpu_list_str);

        return 0;
}

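/*
 * Set up the -C/--cpus bindings. Token grammar, assembled from the
 * parsing below (e.g. "8_4-16#4x2"):
 *
 *   <cpu>[-<end_cpu>][_<bind_len>][#<step>][x<mul>][,<next token>...]
 */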
static int parse_setup_cpu_list(void)
{
        struct thread_data *td;
        char *str0, *str;
        int t;

        if (!g->p.cpu_list_str)
                return 0;

        dprintf("g->p.nr_tasks: %d\n", g->p.nr_tasks);

        str0 = str = strdup(g->p.cpu_list_str);
        t = 0;

        BUG_ON(!str);

        tprintf("# binding tasks to CPUs:\n");
        tprintf("#  ");

        while (true) {
                int bind_cpu, bind_cpu_0, bind_cpu_1;
                char *tok, *tok_end, *tok_step, *tok_len, *tok_mul;
                int bind_len;
                int step;
                int mul;

                tok = strsep(&str, ",");
                if (!tok)
                        break;

                tok_end = strstr(tok, "-");

                dprintf("\ntoken: {%s}, end: {%s}\n", tok, tok_end);
                if (!tok_end) {
                        /* Single CPU specified: */
                        bind_cpu_0 = bind_cpu_1 = atol(tok);
                } else {
                        /* CPU range specified (for example: "5-11"): */
                        bind_cpu_0 = atol(tok);
                        bind_cpu_1 = atol(tok_end + 1);
                }

                step = 1;
                tok_step = strstr(tok, "#");
                if (tok_step) {
                        step = atol(tok_step + 1);
                        BUG_ON(step <= 0 || step >= g->p.nr_cpus);
                }

                /*
                 * Mask length.
                 * Eg: "--cpus 8_4-16#4" means: '--cpus 8_4,12_4,16_4',
                 * where the _4 means the next 4 CPUs are allowed.
                 */
                bind_len = 1;
                tok_len = strstr(tok, "_");
                if (tok_len) {
                        bind_len = atol(tok_len + 1);
                        BUG_ON(bind_len <= 0 || bind_len > g->p.nr_cpus);
                }

                /* Multiplier shortcut, "0x8" is a shortcut for: "0,0,0,0,0,0,0,0" */
                mul = 1;
                tok_mul = strstr(tok, "x");
                if (tok_mul) {
                        mul = atol(tok_mul + 1);
                        BUG_ON(mul <= 0);
                }

                dprintf("CPUs: %d_%d-%d#%dx%d\n", bind_cpu_0, bind_len, bind_cpu_1, step, mul);

                if (bind_cpu_0 >= g->p.nr_cpus || bind_cpu_1 >= g->p.nr_cpus) {
                        printf("\nTest not applicable, system has only %d CPUs.\n", g->p.nr_cpus);
                        return -1;
                }

                BUG_ON(bind_cpu_0 < 0 || bind_cpu_1 < 0);
                BUG_ON(bind_cpu_0 > bind_cpu_1);

                for (bind_cpu = bind_cpu_0; bind_cpu <= bind_cpu_1; bind_cpu += step) {
                        int i;

                        for (i = 0; i < mul; i++) {
                                int cpu;

                                if (t >= g->p.nr_tasks) {
                                        printf("\n# NOTE: ignoring bind CPUs starting at CPU#%d\n #", bind_cpu);
                                        goto out;
                                }
                                td = g->threads + t;

                                if (t)
                                        tprintf(",");
                                if (bind_len > 1) {
                                        tprintf("%2d/%d", bind_cpu, bind_len);
                                } else {
                                        tprintf("%2d", bind_cpu);
                                }

                                CPU_ZERO(&td->bind_cpumask);
                                for (cpu = bind_cpu; cpu < bind_cpu+bind_len; cpu++) {
                                        BUG_ON(cpu < 0 || cpu >= g->p.nr_cpus);
                                        CPU_SET(cpu, &td->bind_cpumask);
                                }
                                t++;
                        }
                }
        }
out:

        tprintf("\n");

        if (t < g->p.nr_tasks)
                printf("# NOTE: %d tasks bound, %d tasks unbound\n", t, g->p.nr_tasks - t);

        free(str0);
        return 0;
}

static int parse_cpus_opt(const struct option *opt __maybe_unused,
                          const char *arg, int unset __maybe_unused)
{
        if (!arg)
                return -1;

        return parse_cpu_list(arg);
}

static int parse_node_list(const char *arg)
{
        p0.node_list_str = strdup(arg);

        dprintf("got NODE list: {%s}\n", p0.node_list_str);

        return 0;
}

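/*
 * Set up the -M/--memnodes bindings. Same token grammar as the CPU
 * list above, minus the _<bind_len> mask-length suffix:
 *
 *   <node>[-<end_node>][#<step>][x<mul>][,<next token>...]
 */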
static int parse_setup_node_list(void)
{
        struct thread_data *td;
        char *str0, *str;
        int t;

        if (!g->p.node_list_str)
                return 0;

        dprintf("g->p.nr_tasks: %d\n", g->p.nr_tasks);

        str0 = str = strdup(g->p.node_list_str);
        t = 0;

        BUG_ON(!str);

        tprintf("# binding tasks to NODEs:\n");
        tprintf("# ");

        while (true) {
                int bind_node, bind_node_0, bind_node_1;
                char *tok, *tok_end, *tok_step, *tok_mul;
                int step;
                int mul;

                tok = strsep(&str, ",");
                if (!tok)
                        break;

                tok_end = strstr(tok, "-");

                dprintf("\ntoken: {%s}, end: {%s}\n", tok, tok_end);
                if (!tok_end) {
                        /* Single NODE specified: */
                        bind_node_0 = bind_node_1 = atol(tok);
                } else {
                        /* NODE range specified (for example: "5-11"): */
                        bind_node_0 = atol(tok);
                        bind_node_1 = atol(tok_end + 1);
                }

                step = 1;
                tok_step = strstr(tok, "#");
                if (tok_step) {
                        step = atol(tok_step + 1);
                        BUG_ON(step <= 0 || step >= g->p.nr_nodes);
                }

                /* Multiplier shortcut, "0x8" is a shortcut for: "0,0,0,0,0,0,0,0" */
                mul = 1;
                tok_mul = strstr(tok, "x");
                if (tok_mul) {
                        mul = atol(tok_mul + 1);
                        BUG_ON(mul <= 0);
                }

                dprintf("NODEs: %d-%d #%d\n", bind_node_0, bind_node_1, step);

                if (bind_node_0 >= g->p.nr_nodes || bind_node_1 >= g->p.nr_nodes) {
                        printf("\nTest not applicable, system has only %d nodes.\n", g->p.nr_nodes);
                        return -1;
                }

                BUG_ON(bind_node_0 < 0 || bind_node_1 < 0);
                BUG_ON(bind_node_0 > bind_node_1);

                for (bind_node = bind_node_0; bind_node <= bind_node_1; bind_node += step) {
                        int i;

                        for (i = 0; i < mul; i++) {
                                if (t >= g->p.nr_tasks || !node_has_cpus(bind_node)) {
                                        printf("\n# NOTE: ignoring bind NODEs starting at NODE#%d\n", bind_node);
                                        goto out;
                                }
                                td = g->threads + t;

                                if (!t)
                                        tprintf(" %2d", bind_node);
                                else
                                        tprintf(",%2d", bind_node);

                                td->bind_node = bind_node;
                                t++;
                        }
                }
        }
out:

        tprintf("\n");

        if (t < g->p.nr_tasks)
                printf("# NOTE: %d tasks mem-bound, %d tasks unbound\n", t, g->p.nr_tasks - t);

        free(str0);
        return 0;
}

static int parse_nodes_opt(const struct option *opt __maybe_unused,
                          const char *arg, int unset __maybe_unused)
{
        if (!arg)
                return -1;

        return parse_node_list(arg);
}

#define BIT(x) (1ul << x)

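/*
 * Galois-style right-shift 32-bit LFSR (taps at bit positions 31, 6, 5
 * and 1): after the shift, the tap bits are XORed in whenever the low
 * bit was set. Used as a cheap deterministic pseudo-random sequence
 * for the --data_rand_walk access pattern below.
 */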
static inline uint32_t lfsr_32(uint32_t lfsr)
{
        const uint32_t taps = BIT(1) | BIT(5) | BIT(6) | BIT(31);
        return (lfsr>>1) ^ ((0x0u - (lfsr & 0x1u)) & taps);
}

/*
 * Make sure there's a real data dependency on RAM (when read
 * accesses are enabled), so the compiler, the CPU and the
 * kernel (KSM, zero page, etc.) cannot optimize away RAM
 * accesses:
 */
static inline u64 access_data(u64 *data, u64 val)
{
        if (g->p.data_reads)
                val += *data;
        if (g->p.data_writes)
                *data = val + 1;
        return val;
}

/*
 * The worker process does two types of work, a forwards going
 * loop and a backwards going loop.
 *
 * We do this so that on multiprocessor systems we do not create
 * a 'train' of processing, with highly synchronized processes,
 * skewing the whole benchmark.
 */
static u64 do_work(u8 *__data, long bytes, int nr, int nr_max, int loop, u64 val)
{
        long words = bytes/sizeof(u64);
        u64 *data = (void *)__data;
        long chunk_0, chunk_1;
        u64 *d0, *d, *d1;
        long off;
        long i;

        BUG_ON(!data && words);
        BUG_ON(data && !words);

        if (!data)
                return val;

        /* Very simple memset() work variant: */
        if (g->p.data_zero_memset && !g->p.data_rand_walk) {
                bzero(data, bytes);
                return val;
        }

        /* Spread out by PID/TID nr and by loop nr: */
        chunk_0 = words/nr_max;
        chunk_1 = words/g->p.nr_loops;
        off = nr*chunk_0 + loop*chunk_1;

        while (off >= words)
                off -= words;

        if (g->p.data_rand_walk) {
                u32 lfsr = nr + loop + val;
                int j;

                for (i = 0; i < words/1024; i++) {
                        long start, end;

                        lfsr = lfsr_32(lfsr);

                        start = lfsr % words;
                        end = min(start + 1024, words-1);

                        if (g->p.data_zero_memset) {
                                bzero(data + start, (end-start) * sizeof(u64));
                        } else {
                                for (j = start; j < end; j++)
                                        val = access_data(data + j, val);
                        }
                }
        } else if (!g->p.data_backwards || (nr + loop) & 1) {
                /* Process data forwards: */

                d0 = data + off;
                d  = data + off + 1;
                d1 = data + words;

                for (;;) {
                        if (unlikely(d >= d1))
                                d = data;
                        if (unlikely(d == d0))
                                break;

                        val = access_data(d, val);

                        d++;
                }
        } else {
                /* Process data backwards: */

                d0 = data + off;
                d  = data + off - 1;
                d1 = data + words;

                for (;;) {
                        if (unlikely(d < data))
                                d = data + words-1;
                        if (unlikely(d == d0))
                                break;

                        val = access_data(d, val);

                        d--;
                }
        }

        return val;
}

static void update_curr_cpu(int task_nr, unsigned long bytes_worked)
{
        unsigned int cpu;

        cpu = sched_getcpu();

        g->threads[task_nr].curr_cpu = cpu;
        prctl(0, bytes_worked);
}
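
/*
 * Note: prctl() option 0 above is not a valid prctl() command (it fails
 * with -EINVAL); presumably it just serves as a cheap, trace-visible
 * syscall carrying the bytes_worked value.
 */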

/*
 * Count the number of nodes a process's threads
 * are spread out on.
 *
 * A count of 1 means that the process is compressed
 * to a single node. A count of g->p.nr_nodes means it's
 * spread out on the whole system.
 */
static int count_process_nodes(int process_nr)
{
        char *node_present;
        int nodes;
        int n, t;

        node_present = (char *)malloc(g->p.nr_nodes * sizeof(char));
        BUG_ON(!node_present);
        for (nodes = 0; nodes < g->p.nr_nodes; nodes++)
                node_present[nodes] = 0;

        for (t = 0; t < g->p.nr_threads; t++) {
                struct thread_data *td;
                int task_nr;
                int node;

                task_nr = process_nr*g->p.nr_threads + t;
                td = g->threads + task_nr;

                node = numa_node_of_cpu(td->curr_cpu);
                if (node < 0) /* curr_cpu was likely still -1 */ {
                        free(node_present);
                        return 0;
                }

                node_present[node] = 1;
        }

        nodes = 0;

        for (n = 0; n < g->p.nr_nodes; n++)
                nodes += node_present[n];

        free(node_present);
        return nodes;
}

/*
 * Count the number of distinct process-threads a node contains.
 *
 * A count of 1 means that the node contains only a single
 * process. If all nodes on the system contain at most one
 * process then we are well-converged.
 */
static int count_node_processes(int node)
{
        int processes = 0;
        int t, p;

        for (p = 0; p < g->p.nr_proc; p++) {
                for (t = 0; t < g->p.nr_threads; t++) {
                        struct thread_data *td;
                        int task_nr;
                        int n;

                        task_nr = p*g->p.nr_threads + t;
                        td = g->threads + task_nr;

                        n = numa_node_of_cpu(td->curr_cpu);
                        if (n == node) {
                                processes++;
                                break;
                        }
                }
        }

        return processes;
}

static void calc_convergence_compression(int *strong)
{
        unsigned int nodes_min, nodes_max;
        int p;

        nodes_min = -1;
        nodes_max =  0;

        for (p = 0; p < g->p.nr_proc; p++) {
                unsigned int nodes = count_process_nodes(p);

                if (!nodes) {
                        *strong = 0;
                        return;
                }

                nodes_min = min(nodes, nodes_min);
                nodes_max = max(nodes, nodes_max);
        }

        /* Strong convergence: all threads compress on a single node: */
        if (nodes_min == 1 && nodes_max == 1) {
                *strong = 1;
        } else {
                *strong = 0;
                tprintf(" {%d-%d}", nodes_min, nodes_max);
        }
}

static void calc_convergence(double runtime_ns_max, double *convergence)
{
        unsigned int loops_done_min, loops_done_max;
        int process_groups;
        int *nodes;
        int distance;
        int nr_min;
        int nr_max;
        int strong;
        int sum;
        int nr;
        int node;
        int cpu;
        int t;

        if (!g->p.show_convergence && !g->p.measure_convergence)
                return;

        nodes = (int *)malloc(g->p.nr_nodes * sizeof(int));
        BUG_ON(!nodes);
        for (node = 0; node < g->p.nr_nodes; node++)
                nodes[node] = 0;

        loops_done_min = -1;
        loops_done_max = 0;

        for (t = 0; t < g->p.nr_tasks; t++) {
                struct thread_data *td = g->threads + t;
                unsigned int loops_done;

                cpu = td->curr_cpu;

                /* Not all threads have written it yet: */
                if (cpu < 0)
                        continue;

                node = numa_node_of_cpu(cpu);

                nodes[node]++;

                loops_done = td->loops_done;
                loops_done_min = min(loops_done, loops_done_min);
                loops_done_max = max(loops_done, loops_done_max);
        }

        nr_max = 0;
        nr_min = g->p.nr_tasks;
        sum = 0;

        for (node = 0; node < g->p.nr_nodes; node++) {
                if (!is_node_present(node))
                        continue;
                nr = nodes[node];
                nr_min = min(nr, nr_min);
                nr_max = max(nr, nr_max);
                sum += nr;
        }
        BUG_ON(nr_min > nr_max);

        BUG_ON(sum > g->p.nr_tasks);

        if (0 && (sum < g->p.nr_tasks)) {
                free(nodes);
                return;
        }

        /*
         * Count the number of distinct process groups present
         * on nodes - when we are converged this will decrease
         * to g->p.nr_proc:
         */
        process_groups = 0;

        for (node = 0; node < g->p.nr_nodes; node++) {
                int processes;

                if (!is_node_present(node))
                        continue;
                processes = count_node_processes(node);
                nr = nodes[node];
                tprintf(" %2d/%-2d", nr, processes);

                process_groups += processes;
        }

        distance = nr_max - nr_min;

        tprintf(" [%2d/%-2d]", distance, process_groups);

        tprintf(" l:%3d-%-3d (%3d)",
                loops_done_min, loops_done_max, loops_done_max-loops_done_min);

        if (loops_done_min && loops_done_max) {
                double skew = 1.0 - (double)loops_done_min/loops_done_max;

                tprintf(" [%4.1f%%]", skew * 100.0);
        }

        calc_convergence_compression(&strong);

        if (strong && process_groups == g->p.nr_proc) {
                if (!*convergence) {
                        *convergence = runtime_ns_max;
                        tprintf(" (%6.1fs converged)\n", *convergence / NSEC_PER_SEC);
                        if (g->p.measure_convergence) {
                                g->all_converged = true;
                                g->stop_work = true;
                        }
                }
        } else {
                if (*convergence) {
                        tprintf(" (%6.1fs de-converged)", runtime_ns_max / NSEC_PER_SEC);
                        *convergence = 0;
                }
                tprintf("\n");
        }

        free(nodes);
}

static void show_summary(double runtime_ns_max, int l, double *convergence)
{
        tprintf("\r #  %5.1f%%  [%.1f mins]",
                (double)(l+1)/g->p.nr_loops*100.0, runtime_ns_max / NSEC_PER_SEC / 60.0);

        calc_convergence(runtime_ns_max, convergence);

        if (g->p.show_details >= 0)
                fflush(stdout);
}

static void *worker_thread(void *__tdata)
{
        struct thread_data *td = __tdata;
        struct timeval start0, start, stop, diff;
        int process_nr = td->process_nr;
        int thread_nr = td->thread_nr;
        unsigned long last_perturbance;
        int task_nr = td->task_nr;
        int details = g->p.show_details;
        int first_task, last_task;
        double convergence = 0;
        u64 val = td->val;
        double runtime_ns_max;
        u8 *global_data;
        u8 *process_data;
        u8 *thread_data;
        u64 bytes_done, secs;
        long work_done;
        u32 l;
        struct rusage rusage;

        bind_to_cpumask(td->bind_cpumask);
        bind_to_memnode(td->bind_node);

        set_taskname("thread %d/%d", process_nr, thread_nr);

        global_data = g->data;
        process_data = td->process_data;
        thread_data = setup_private_data(g->p.bytes_thread);

        bytes_done = 0;

        last_task = 0;
        if (process_nr == g->p.nr_proc-1 && thread_nr == g->p.nr_threads-1)
                last_task = 1;

        first_task = 0;
        if (process_nr == 0 && thread_nr == 0)
                first_task = 1;

        if (details >= 2) {
                printf("#  thread %2d / %2d global mem: %p, process mem: %p, thread mem: %p\n",
                        process_nr, thread_nr, global_data, process_data, thread_data);
        }

        if (g->p.serialize_startup) {
                pthread_mutex_lock(&g->startup_mutex);
                g->nr_tasks_started++;
                /* The last thread wakes the main process. */
                if (g->nr_tasks_started == g->p.nr_tasks)
                        pthread_cond_signal(&g->startup_cond);

                pthread_mutex_unlock(&g->startup_mutex);

                /* Here we will wait for the main process to start us all at once: */
                pthread_mutex_lock(&g->start_work_mutex);
                g->start_work = false;
                g->nr_tasks_working++;
                while (!g->start_work)
                        pthread_cond_wait(&g->start_work_cond, &g->start_work_mutex);

                pthread_mutex_unlock(&g->start_work_mutex);
        }

        gettimeofday(&start0, NULL);

        start = stop = start0;
        last_perturbance = start.tv_sec;

        for (l = 0; l < g->p.nr_loops; l++) {
                start = stop;

                if (g->stop_work)
                        break;

                val += do_work(global_data,  g->p.bytes_global,  process_nr, g->p.nr_proc,      l, val);
                val += do_work(process_data, g->p.bytes_process, thread_nr,  g->p.nr_threads,   l, val);
                val += do_work(thread_data,  g->p.bytes_thread,  0,          1,         l, val);

                if (g->p.sleep_usecs) {
                        pthread_mutex_lock(td->process_lock);
                        usleep(g->p.sleep_usecs);
                        pthread_mutex_unlock(td->process_lock);
                }
                /*
                 * Amount of work to be done under a process-global lock:
                 */
                if (g->p.bytes_process_locked) {
                        pthread_mutex_lock(td->process_lock);
                        val += do_work(process_data, g->p.bytes_process_locked, thread_nr,  g->p.nr_threads,    l, val);
                        pthread_mutex_unlock(td->process_lock);
                }

                work_done = g->p.bytes_global + g->p.bytes_process +
                            g->p.bytes_process_locked + g->p.bytes_thread;

                update_curr_cpu(task_nr, work_done);
                bytes_done += work_done;

                if (details < 0 && !g->p.perturb_secs && !g->p.measure_convergence && !g->p.nr_secs)
                        continue;

                td->loops_done = l;

                gettimeofday(&stop, NULL);

                /* Check whether our max runtime timed out: */
                if (g->p.nr_secs) {
                        timersub(&stop, &start0, &diff);
                        if ((u32)diff.tv_sec >= g->p.nr_secs) {
                                g->stop_work = true;
                                break;
                        }
                }

                /* Update the summary at most once per second: */
                if (start.tv_sec == stop.tv_sec)
                        continue;

                /*
                 * Perturb the first task's equilibrium every g->p.perturb_secs seconds,
                 * by temporarily migrating it to a far-away CPU:
                 */
                if (first_task && g->p.perturb_secs && (int)(stop.tv_sec - last_perturbance) >= g->p.perturb_secs) {
                        cpu_set_t orig_mask;
                        int target_cpu;
                        int this_cpu;

                        last_perturbance = stop.tv_sec;

                        /*
                         * Depending on where we are running, move into
                         * the other half of the system, to create some
                         * real disturbance:
                         */
                        this_cpu = g->threads[task_nr].curr_cpu;
                        if (this_cpu < g->p.nr_cpus/2)
                                target_cpu = g->p.nr_cpus-1;
                        else
                                target_cpu = 0;

                        orig_mask = bind_to_cpu(target_cpu);

                        /* Here we are running on the target CPU already */
                        if (details >= 1)
                                printf(" (injecting perturbation, moved to CPU#%d)\n", target_cpu);

                        bind_to_cpumask(orig_mask);
                }

                if (details >= 3) {
                        timersub(&stop, &start, &diff);
                        runtime_ns_max = diff.tv_sec * NSEC_PER_SEC;
                        runtime_ns_max += diff.tv_usec * NSEC_PER_USEC;

                        if (details >= 0) {
                                printf(" #%2d / %2d: %14.2lf nsecs/op [val: %016"PRIx64"]\n",
                                        process_nr, thread_nr, runtime_ns_max / bytes_done, val);
                        }
                        fflush(stdout);
                }
                if (!last_task)
                        continue;

                timersub(&stop, &start0, &diff);
                runtime_ns_max = diff.tv_sec * NSEC_PER_SEC;
                runtime_ns_max += diff.tv_usec * NSEC_PER_USEC;

                show_summary(runtime_ns_max, l, &convergence);
        }

        gettimeofday(&stop, NULL);
        timersub(&stop, &start0, &diff);
        td->runtime_ns = diff.tv_sec * NSEC_PER_SEC;
        td->runtime_ns += diff.tv_usec * NSEC_PER_USEC;
        secs = td->runtime_ns / NSEC_PER_SEC;
        td->speed_gbs = secs ? bytes_done / secs / 1e9 : 0;

        getrusage(RUSAGE_THREAD, &rusage);
        td->system_time_ns = rusage.ru_stime.tv_sec * NSEC_PER_SEC;
        td->system_time_ns += rusage.ru_stime.tv_usec * NSEC_PER_USEC;
        td->user_time_ns = rusage.ru_utime.tv_sec * NSEC_PER_SEC;
        td->user_time_ns += rusage.ru_utime.tv_usec * NSEC_PER_USEC;

        free_data(thread_data, g->p.bytes_thread);

        pthread_mutex_lock(&g->stop_work_mutex);
        g->bytes_done += bytes_done;
        pthread_mutex_unlock(&g->stop_work_mutex);

        return NULL;
}

/*
 * A worker process starts a couple of threads:
 */
static void worker_process(int process_nr)
{
        pthread_mutex_t process_lock;
        struct thread_data *td;
        pthread_t *pthreads;
        u8 *process_data;
        int task_nr;
        int ret;
        int t;

        pthread_mutex_init(&process_lock, NULL);
        set_taskname("process %d", process_nr);

        /*
         * Pick up the memory policy and the CPU binding of our first thread,
         * so that we initialize memory accordingly:
         */
        task_nr = process_nr*g->p.nr_threads;
        td = g->threads + task_nr;

        bind_to_memnode(td->bind_node);
        bind_to_cpumask(td->bind_cpumask);

        pthreads = zalloc(g->p.nr_threads * sizeof(pthread_t));
        process_data = setup_private_data(g->p.bytes_process);

        if (g->p.show_details >= 3) {
                printf(" # process %2d global mem: %p, process mem: %p\n",
                        process_nr, g->data, process_data);
        }

        for (t = 0; t < g->p.nr_threads; t++) {
                task_nr = process_nr*g->p.nr_threads + t;
                td = g->threads + task_nr;

                td->process_data = process_data;
                td->process_nr   = process_nr;
                td->thread_nr    = t;
                td->task_nr      = task_nr;
                td->val          = rand();
                td->curr_cpu     = -1;
                td->process_lock = &process_lock;

                ret = pthread_create(pthreads + t, NULL, worker_thread, td);
                BUG_ON(ret);
        }

        for (t = 0; t < g->p.nr_threads; t++) {
                ret = pthread_join(pthreads[t], NULL);
                BUG_ON(ret);
        }

        free_data(process_data, g->p.bytes_process);
        free(pthreads);
}

static void print_summary(void)
{
        if (g->p.show_details < 0)
                return;

        printf("\n ###\n");
        printf(" # %d %s will execute (on %d nodes, %d CPUs):\n",
                g->p.nr_tasks, g->p.nr_tasks == 1 ? "task" : "tasks", nr_numa_nodes(), g->p.nr_cpus);
        printf(" #      %5dx %5ldMB global  shared mem operations\n",
                        g->p.nr_loops, g->p.bytes_global/1024/1024);
        printf(" #      %5dx %5ldMB process shared mem operations\n",
                        g->p.nr_loops, g->p.bytes_process/1024/1024);
        printf(" #      %5dx %5ldMB thread  local  mem operations\n",
                        g->p.nr_loops, g->p.bytes_thread/1024/1024);

        printf(" ###\n");

        printf("\n ###\n"); fflush(stdout);
}

static void init_thread_data(void)
{
        ssize_t size = sizeof(*g->threads)*g->p.nr_tasks;
        int t;

        g->threads = zalloc_shared_data(size);

        for (t = 0; t < g->p.nr_tasks; t++) {
                struct thread_data *td = g->threads + t;
                int cpu;

                /* Allow all nodes by default: */
                td->bind_node = NUMA_NO_NODE;

                /* Allow all CPUs by default: */
                CPU_ZERO(&td->bind_cpumask);
                for (cpu = 0; cpu < g->p.nr_cpus; cpu++)
                        CPU_SET(cpu, &td->bind_cpumask);
        }
}

static void deinit_thread_data(void)
{
        ssize_t size = sizeof(*g->threads)*g->p.nr_tasks;

        free_data(g->threads, size);
}

static int init(void)
{
        g = (void *)alloc_data(sizeof(*g), MAP_SHARED, 1, 0, 0 /* THP */, 0);

        /* Copy over options: */
        g->p = p0;

        g->p.nr_cpus = numa_num_configured_cpus();

        g->p.nr_nodes = numa_max_node() + 1;

        /* char array in count_process_nodes(): */
        BUG_ON(g->p.nr_nodes < 0);

        if (g->p.show_quiet && !g->p.show_details)
                g->p.show_details = -1;

        /* Some memory should be specified: */
        if (!g->p.mb_global_str && !g->p.mb_proc_str && !g->p.mb_thread_str)
                return -1;

        if (g->p.mb_global_str) {
                g->p.mb_global = atof(g->p.mb_global_str);
                BUG_ON(g->p.mb_global < 0);
        }

        if (g->p.mb_proc_str) {
                g->p.mb_proc = atof(g->p.mb_proc_str);
                BUG_ON(g->p.mb_proc < 0);
        }

        if (g->p.mb_proc_locked_str) {
                g->p.mb_proc_locked = atof(g->p.mb_proc_locked_str);
                BUG_ON(g->p.mb_proc_locked < 0);
                BUG_ON(g->p.mb_proc_locked > g->p.mb_proc);
        }

        if (g->p.mb_thread_str) {
                g->p.mb_thread = atof(g->p.mb_thread_str);
                BUG_ON(g->p.mb_thread < 0);
        }

        BUG_ON(g->p.nr_threads <= 0);
        BUG_ON(g->p.nr_proc <= 0);

        g->p.nr_tasks = g->p.nr_proc*g->p.nr_threads;

        g->p.bytes_global               = g->p.mb_global        *1024L*1024L;
        g->p.bytes_process              = g->p.mb_proc          *1024L*1024L;
        g->p.bytes_process_locked       = g->p.mb_proc_locked   *1024L*1024L;
        g->p.bytes_thread               = g->p.mb_thread        *1024L*1024L;

        g->data = setup_shared_data(g->p.bytes_global);

        /* Startup serialization: */
        init_global_mutex(&g->start_work_mutex);
        init_global_cond(&g->start_work_cond);
        init_global_mutex(&g->startup_mutex);
        init_global_cond(&g->startup_cond);
        init_global_mutex(&g->stop_work_mutex);

        init_thread_data();

        tprintf("#\n");
        if (parse_setup_cpu_list() || parse_setup_node_list())
                return -1;
        tprintf("#\n");

        print_summary();

        return 0;
}

static void deinit(void)
{
        free_data(g->data, g->p.bytes_global);
        g->data = NULL;

        deinit_thread_data();

        free_data(g, sizeof(*g));
        g = NULL;
}
1503
1504/*
1505 * Print a short or long result, depending on the verbosity setting:
1506 */
1507static void print_res(const char *name, double val,
1508                      const char *txt_unit, const char *txt_short, const char *txt_long)
1509{
1510        if (!name)
1511                name = "main,";
1512
1513        if (!g->p.show_quiet)
1514                printf(" %-30s %15.3f, %-15s %s\n", name, val, txt_unit, txt_short);
1515        else
1516                printf(" %14.3f %s\n", val, txt_long);
1517}
1518
1519static int __bench_numa(const char *name)
1520{
1521        struct timeval start, stop, diff;
1522        u64 runtime_ns_min, runtime_ns_sum;
1523        pid_t *pids, pid, wpid;
1524        double delta_runtime;
1525        double runtime_avg;
1526        double runtime_sec_max;
1527        double runtime_sec_min;
1528        int wait_stat;
1529        double bytes;
1530        int i, t, p;
1531
1532        if (init())
1533                return -1;
1534
1535        pids = zalloc(g->p.nr_proc * sizeof(*pids));
1536        pid = -1;
1537
1538        if (g->p.serialize_startup) {
1539                tprintf(" #\n");
1540                tprintf(" # Startup synchronization: ..."); fflush(stdout);
1541        }
1542
1543        gettimeofday(&start, NULL);
1544
1545        for (i = 0; i < g->p.nr_proc; i++) {
1546                pid = fork();
1547                dprintf(" # process %2d: PID %d\n", i, pid);
1548
1549                BUG_ON(pid < 0);
1550                if (!pid) {
1551                        /* Child process: */
1552                        worker_process(i);
1553
1554                        exit(0);
1555                }
1556                pids[i] = pid;
1557
1558        }
1559
1560        if (g->p.serialize_startup) {
1561                bool threads_ready = false;
1562                double startup_sec;
1563
1564                /*
1565                 * Wait for all the threads to start up. The last thread will
1566                 * signal this process.
1567                 */
1568                pthread_mutex_lock(&g->startup_mutex);
1569                while (g->nr_tasks_started != g->p.nr_tasks)
1570                        pthread_cond_wait(&g->startup_cond, &g->startup_mutex);
1571
1572                pthread_mutex_unlock(&g->startup_mutex);
1573
1574                /* Wait for all threads to be at the start_work_cond. */
1575                while (!threads_ready) {
1576                        pthread_mutex_lock(&g->start_work_mutex);
1577                        threads_ready = (g->nr_tasks_working == g->p.nr_tasks);
1578                        pthread_mutex_unlock(&g->start_work_mutex);
1579                        if (!threads_ready)
1580                                usleep(1);
1581                }
1582
1583                gettimeofday(&stop, NULL);
1584
1585                timersub(&stop, &start, &diff);
1586
1587                startup_sec = diff.tv_sec * NSEC_PER_SEC;
1588                startup_sec += diff.tv_usec * NSEC_PER_USEC;
1589                startup_sec /= NSEC_PER_SEC;
1590
1591                tprintf(" threads initialized in %.6f seconds.\n", startup_sec);
1592                tprintf(" #\n");
1593
1594                start = stop;
1595                /* Start all threads running. */
1596                pthread_mutex_lock(&g->start_work_mutex);
1597                g->start_work = true;
1598                pthread_mutex_unlock(&g->start_work_mutex);
1599                pthread_cond_broadcast(&g->start_work_cond);
1600        } else {
1601                gettimeofday(&start, NULL);
1602        }
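        /*
         * The worker-side counterpart of the handshake above (see the
         * worker-thread code earlier in this file) is roughly - a
         * sketch, not the verbatim code:
         *
         *      pthread_mutex_lock(&g->startup_mutex);
         *      g->nr_tasks_started++;
         *      if (g->nr_tasks_started == g->p.nr_tasks)
         *              pthread_cond_signal(&g->startup_cond);
         *      pthread_mutex_unlock(&g->startup_mutex);
         *
         *      pthread_mutex_lock(&g->start_work_mutex);
         *      g->nr_tasks_working++;
         *      while (!g->start_work)
         *              pthread_cond_wait(&g->start_work_cond,
         *                                &g->start_work_mutex);
         *      pthread_mutex_unlock(&g->start_work_mutex);
         */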
1603
1604        /* Parent process: */
1605
1606
1607        for (i = 0; i < g->p.nr_proc; i++) {
1608                wpid = waitpid(pids[i], &wait_stat, 0);
1609                BUG_ON(wpid < 0);
1610                BUG_ON(!WIFEXITED(wait_stat));
1611
1612        }
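        /*
         * The per-thread results read below come from g->threads[];
         * g was placed in shared (MAP_SHARED) memory during init(),
         * which is how the parent can see values that the now-exited
         * worker processes wrote.
         */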
1613
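        /* runtime_ns_min is a u64, so -1 wraps to U64_MAX and min() picks up the first real runtime: */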
1614        runtime_ns_sum = 0;
1615        runtime_ns_min = -1LL;
1616
1617        for (t = 0; t < g->p.nr_tasks; t++) {
1618                u64 thread_runtime_ns = g->threads[t].runtime_ns;
1619
1620                runtime_ns_sum += thread_runtime_ns;
1621                runtime_ns_min = min(thread_runtime_ns, runtime_ns_min);
1622        }
1623
1624        gettimeofday(&stop, NULL);
1625        timersub(&stop, &start, &diff);
1626
1627        BUG_ON(bench_format != BENCH_FORMAT_DEFAULT);
1628
1629        tprintf("\n ###\n");
1630        tprintf("\n");
1631
1632        runtime_sec_max = diff.tv_sec * NSEC_PER_SEC;
1633        runtime_sec_max += diff.tv_usec * NSEC_PER_USEC;
1634        runtime_sec_max /= NSEC_PER_SEC;
1635
1636        runtime_sec_min = (double)runtime_ns_min / NSEC_PER_SEC;
1637
1638        bytes = g->bytes_done;
1639        runtime_avg = (double)runtime_ns_sum / g->p.nr_tasks / NSEC_PER_SEC;
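        /*
         * Illustrative example: for 4 tasks with per-thread runtimes of
         * { 2.0, 2.1, 2.2, 2.5 } seconds:
         *
         *      runtime_sec_min  = 2.0             (fastest thread)
         *      runtime_avg      = 8.8 / 4 = 2.2   (mean thread runtime)
         *      runtime_sec_max >= 2.5             (wall-clock time, bounded
         *                                          below by the slowest thread)
         */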
1640
1641        if (g->p.measure_convergence) {
1642                print_res(name, runtime_sec_max,
1643                        "secs,", "NUMA-convergence-latency", "secs latency to NUMA-converge");
1644        }
1645
1646        print_res(name, runtime_sec_max,
1647                "secs,", "runtime-max/thread",  "secs slowest (max) thread-runtime");
1648
1649        print_res(name, runtime_sec_min,
1650                "secs,", "runtime-min/thread",  "secs fastest (min) thread-runtime");
1651
1652        print_res(name, runtime_avg,
1653                "secs,", "runtime-avg/thread",  "secs average thread-runtime");
1654
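        /*
         * Half the max-min spread approximates how far the slowest
         * thread was from the average, hence the "max/avg" wording below:
         */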
1655        delta_runtime = (runtime_sec_max - runtime_sec_min)/2.0;
1656        print_res(name, delta_runtime / runtime_sec_max * 100.0,
1657                "%,", "spread-runtime/thread",  "% difference between max/avg runtime");
1658
1659        print_res(name, bytes / g->p.nr_tasks / 1e9,
1660                "GB,", "data/thread",           "GB data processed, per thread");
1661
1662        print_res(name, bytes / 1e9,
1663                "GB,", "data-total",            "GB data processed, total");
1664
1665        print_res(name, runtime_sec_max * NSEC_PER_SEC / (bytes / g->p.nr_tasks),
1666                "nsecs,", "runtime/byte/thread","nsecs/byte/thread runtime");
1667
1668        print_res(name, bytes / g->p.nr_tasks / 1e9 / runtime_sec_max,
1669                "GB/sec,", "thread-speed",      "GB/sec/thread speed");
1670
1671        print_res(name, bytes / runtime_sec_max / 1e9,
1672                "GB/sec,", "total-speed",       "GB/sec total speed");
1673
1674        if (g->p.show_details >= 2) {
1675                char tname[14 + 2 * 10 + 1];
1676                struct thread_data *td;
1677                for (p = 0; p < g->p.nr_proc; p++) {
1678                        for (t = 0; t < g->p.nr_threads; t++) {
1679                                memset(tname, 0, sizeof(tname));
1680                                td = g->threads + p*g->p.nr_threads + t;
1681                                snprintf(tname, sizeof(tname), "process%d:thread%d", p, t);
1682                                print_res(tname, td->speed_gbs,
1683                                        "GB/sec",       "thread-speed", "GB/sec/thread speed");
1684                                print_res(tname, (double)td->system_time_ns / NSEC_PER_SEC,
1685                                        "secs", "thread-system-time", "system CPU time/thread");
1686                                print_res(tname, (double)td->user_time_ns / NSEC_PER_SEC,
1687                                        "secs", "thread-user-time", "user CPU time/thread");
1688                        }
1689                }
1690        }
1691
1692        free(pids);
1693
1694        deinit();
1695
1696        return 0;
1697}
1698
1699#define MAX_ARGS 50
1700
1701static int command_size(const char **argv)
1702{
1703        int size = 0;
1704
1705        while (*argv) {
1706                size++;
1707                argv++;
1708        }
1709
1710        BUG_ON(size >= MAX_ARGS);
1711
1712        return size;
1713}
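/*
 * E.g. command_size() of the NULL-terminated vector
 * { "mem", "-p", "2", NULL } is 3 - a main()-style argc for the rows
 * of the tests[] table below.
 */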
1714
1715static void init_params(struct params *p, const char *name, int argc, const char **argv)
1716{
1717        int i;
1718
1719        printf("\n # Running %s \"perf bench numa", name);
1720
1721        for (i = 0; i < argc; i++)
1722                printf(" %s", argv[i]);
1723
1724        printf("\"\n");
1725
1726        memset(p, 0, sizeof(*p));
1727
1728        /* Initialize nonzero defaults: */
1729
1730        p->serialize_startup            = 1;
1731        p->data_reads                   = true;
1732        p->data_writes                  = true;
1733        p->data_backwards               = true;
1734        p->data_rand_walk               = true;
1735        p->nr_loops                     = -1;
1736        p->init_random                  = true;
1737        p->mb_global_str                = "1";
1738        p->nr_proc                      = 1;
1739        p->nr_threads                   = 1;
1740        p->nr_secs                      = 5;
1741        p->run_all                      = argc == 1;
1742}
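/*
 * Note: nr_loops is a u32, so the -1 above wraps to UINT32_MAX -
 * effectively "no loop limit"; the nr_secs = 5 wall-clock limit is
 * what normally ends a run. run_all is set only when the benchmark
 * was invoked with no further arguments.
 */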
1743
1744static int run_bench_numa(const char *name, const char **argv)
1745{
1746        int argc = command_size(argv);
1747
1748        init_params(&p0, name, argc, argv);
1749        argc = parse_options(argc, argv, options, bench_numa_usage, 0);
1750        if (argc)
1751                goto err;
1752
1753        if (__bench_numa(name))
1754                goto err;
1755
1756        return 0;
1757
1758err:
1759        return -1;
1760}
1761
1762#define OPT_BW_RAM              "-s",  "20", "-zZq",    "--thp", " 1", "--no-data_rand_walk"
1763#define OPT_BW_RAM_NOTHP        OPT_BW_RAM,             "--thp", "-1"
1764
1765#define OPT_CONV                "-s", "100", "-zZ0qcm", "--thp", " 1"
1766#define OPT_CONV_NOTHP          OPT_CONV,               "--thp", "-1"
1767
1768#define OPT_BW                  "-s",  "20", "-zZ0q",   "--thp", " 1"
1769#define OPT_BW_NOTHP            OPT_BW,                 "--thp", "-1"
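/*
 * These macros expand to comma-separated argv fragments for the tests[]
 * table below. E.g. OPT_BW_NOTHP expands to:
 *
 *      "-s", "20", "-zZ0q", "--thp", " 1", "--thp", "-1"
 *
 * parse_options() processes options left to right, so the trailing
 * "--thp -1" of the NOTHP variants overrides the inherited "--thp 1".
 */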
1770
1771/*
1772 * The built-in test-suite executed by "perf bench numa -a".
1773 *
1774 * (A minimum of 4 nodes and 16 GB of RAM is recommended.)
1775 */
1776static const char *tests[][MAX_ARGS] = {
1777   /* Basic single-stream NUMA bandwidth measurements: */
1778   { "RAM-bw-local,",     "mem",  "-p",  "1",  "-t",  "1", "-P", "1024",
1779                          "-C" ,   "0", "-M",   "0", OPT_BW_RAM },
1780   { "RAM-bw-local-NOTHP,",
1781                          "mem",  "-p",  "1",  "-t",  "1", "-P", "1024",
1782                          "-C" ,   "0", "-M",   "0", OPT_BW_RAM_NOTHP },
1783   { "RAM-bw-remote,",    "mem",  "-p",  "1",  "-t",  "1", "-P", "1024",
1784                          "-C" ,   "0", "-M",   "1", OPT_BW_RAM },
1785
1786   /* 2-stream NUMA bandwidth measurements: */
1787   { "RAM-bw-local-2x,",  "mem",  "-p",  "2",  "-t",  "1", "-P", "1024",
1788                           "-C", "0,2", "-M", "0x2", OPT_BW_RAM },
1789   { "RAM-bw-remote-2x,", "mem",  "-p",  "2",  "-t",  "1", "-P", "1024",
1790                           "-C", "0,2", "-M", "1x2", OPT_BW_RAM },
1791
1792   /* Cross-stream NUMA bandwidth measurement: */
1793   { "RAM-bw-cross,",     "mem",  "-p",  "2",  "-t",  "1", "-P", "1024",
1794                           "-C", "0,8", "-M", "1,0", OPT_BW_RAM },
1795
1796   /* Convergence latency measurements: */
1797   { " 1x3-convergence,", "mem",  "-p",  "1", "-t",  "3", "-P",  "512", OPT_CONV },
1798   { " 1x4-convergence,", "mem",  "-p",  "1", "-t",  "4", "-P",  "512", OPT_CONV },
1799   { " 1x6-convergence,", "mem",  "-p",  "1", "-t",  "6", "-P", "1020", OPT_CONV },
1800   { " 2x3-convergence,", "mem",  "-p",  "2", "-t",  "3", "-P", "1020", OPT_CONV },
1801   { " 3x3-convergence,", "mem",  "-p",  "3", "-t",  "3", "-P", "1020", OPT_CONV },
1802   { " 4x4-convergence,", "mem",  "-p",  "4", "-t",  "4", "-P",  "512", OPT_CONV },
1803   { " 4x4-convergence-NOTHP,",
1804                          "mem",  "-p",  "4", "-t",  "4", "-P",  "512", OPT_CONV_NOTHP },
1805   { " 4x6-convergence,", "mem",  "-p",  "4", "-t",  "6", "-P", "1020", OPT_CONV },
1806   { " 4x8-convergence,", "mem",  "-p",  "4", "-t",  "8", "-P",  "512", OPT_CONV },
1807   { " 8x4-convergence,", "mem",  "-p",  "8", "-t",  "4", "-P",  "512", OPT_CONV },
1808   { " 8x4-convergence-NOTHP,",
1809                          "mem",  "-p",  "8", "-t",  "4", "-P",  "512", OPT_CONV_NOTHP },
1810   { " 3x1-convergence,", "mem",  "-p",  "3", "-t",  "1", "-P",  "512", OPT_CONV },
1811   { " 4x1-convergence,", "mem",  "-p",  "4", "-t",  "1", "-P",  "512", OPT_CONV },
1812   { " 8x1-convergence,", "mem",  "-p",  "8", "-t",  "1", "-P",  "512", OPT_CONV },
1813   { "16x1-convergence,", "mem",  "-p", "16", "-t",  "1", "-P",  "256", OPT_CONV },
1814   { "32x1-convergence,", "mem",  "-p", "32", "-t",  "1", "-P",  "128", OPT_CONV },
1815
1816   /* Various NUMA process/thread layout bandwidth measurements: */
1817   { " 2x1-bw-process,",  "mem",  "-p",  "2", "-t",  "1", "-P", "1024", OPT_BW },
1818   { " 3x1-bw-process,",  "mem",  "-p",  "3", "-t",  "1", "-P", "1024", OPT_BW },
1819   { " 4x1-bw-process,",  "mem",  "-p",  "4", "-t",  "1", "-P", "1024", OPT_BW },
1820   { " 8x1-bw-process,",  "mem",  "-p",  "8", "-t",  "1", "-P", " 512", OPT_BW },
1821   { " 8x1-bw-process-NOTHP,",
1822                          "mem",  "-p",  "8", "-t",  "1", "-P", " 512", OPT_BW_NOTHP },
1823   { "16x1-bw-process,",  "mem",  "-p", "16", "-t",  "1", "-P",  "256", OPT_BW },
1824
1825   { " 1x4-bw-thread,",   "mem",  "-p",  "1", "-t",  "4", "-T",  "256", OPT_BW },
1826   { " 1x8-bw-thread,",   "mem",  "-p",  "1", "-t",  "8", "-T",  "256", OPT_BW },
1827   { "1x16-bw-thread,",   "mem",  "-p",  "1", "-t", "16", "-T",  "128", OPT_BW },
1828   { "1x32-bw-thread,",   "mem",  "-p",  "1", "-t", "32", "-T",   "64", OPT_BW },
1829
1830   { " 2x3-bw-process,",  "mem",  "-p",  "2", "-t",  "3", "-P",  "512", OPT_BW },
1831   { " 4x4-bw-process,",  "mem",  "-p",  "4", "-t",  "4", "-P",  "512", OPT_BW },
1832   { " 4x6-bw-process,",  "mem",  "-p",  "4", "-t",  "6", "-P",  "512", OPT_BW },
1833   { " 4x8-bw-process,",  "mem",  "-p",  "4", "-t",  "8", "-P",  "512", OPT_BW },
1834   { " 4x8-bw-process-NOTHP,",
1835                          "mem",  "-p",  "4", "-t",  "8", "-P",  "512", OPT_BW_NOTHP },
1836   { " 3x3-bw-process,",  "mem",  "-p",  "3", "-t",  "3", "-P",  "512", OPT_BW },
1837   { " 5x5-bw-process,",  "mem",  "-p",  "5", "-t",  "5", "-P",  "512", OPT_BW },
1838
1839   { "2x16-bw-process,",  "mem",  "-p",  "2", "-t", "16", "-P",  "512", OPT_BW },
1840   { "1x32-bw-process,",  "mem",  "-p",  "1", "-t", "32", "-P", "2048", OPT_BW },
1841
1842   { "numa02-bw,",        "mem",  "-p",  "1", "-t", "32", "-T",   "32", OPT_BW },
1843   { "numa02-bw-NOTHP,",  "mem",  "-p",  "1", "-t", "32", "-T",   "32", OPT_BW_NOTHP },
1844   { "numa01-bw-thread,", "mem",  "-p",  "2", "-t", "16", "-T",  "192", OPT_BW },
1845   { "numa01-bw-thread-NOTHP,",
1846                          "mem",  "-p",  "2", "-t", "16", "-T",  "192", OPT_BW_NOTHP },
1847};
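/*
 * Each row above is { test-name, argv... }. E.g. the "RAM-bw-local" row
 * is equivalent to running:
 *
 *      perf bench numa mem -p 1 -t 1 -P 1024 -C 0 -M 0 \
 *              -s 20 -zZq --thp 1 --no-data_rand_walk
 */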
1848
1849static int bench_all(void)
1850{
1851        int nr = ARRAY_SIZE(tests);
1852        int ret;
1853        int i;
1854
1855        ret = system("echo ' #'; echo ' # Running test on: '$(uname -a); echo ' #'");
1856        BUG_ON(ret < 0);
1857
1858        for (i = 0; i < nr; i++) {
1859                run_bench_numa(tests[i][0], tests[i] + 1);
1860        }
1861
1862        printf("\n");
1863
1864        return 0;
1865}
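/*
 * Note: run_bench_numa()'s return value is ignored in the loop above,
 * so a single failing configuration (e.g. a -C/-M binding the local
 * NUMA topology cannot satisfy) does not abort the rest of the suite.
 */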
1866
1867int bench_numa(int argc, const char **argv)
1868{
1869        init_params(&p0, "main,", argc, argv);
1870        argc = parse_options(argc, argv, options, bench_numa_usage, 0);
1871        if (argc)
1872                goto err;
1873
1874        if (p0.run_all)
1875                return bench_all();
1876
1877        if (__bench_numa(NULL))
1878                goto err;
1879
1880        return 0;
1881
1882err:
1883        usage_with_options(numa_usage, options);
1884        return -1;
1885}
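/*
 * Illustrative invocations:
 *
 *      perf bench numa mem                     # no options: p0.run_all is
 *                                              # set, so the tests[] suite
 *                                              # above is executed
 *      perf bench numa mem -p 2 -t 4 -P 512 -s 20
 *                                              # 2 processes x 4 threads,
 *                                              # 512MB working set per
 *                                              # process, 20 secs runtime
 */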
1886