linux/drivers/infiniband/hw/hfi1/affinity.c
/*
 * Copyright(c) 2015, 2016 Intel Corporation.
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *  - Neither the name of Intel Corporation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <linux/topology.h>
#include <linux/cpumask.h>
#include <linux/module.h>

#include "hfi.h"
#include "affinity.h"
#include "sdma.h"
#include "trace.h"

struct hfi1_affinity_node_list node_affinity = {
        .list = LIST_HEAD_INIT(node_affinity.list),
        .lock = __SPIN_LOCK_UNLOCKED(&node_affinity.lock),
};

/* Name of IRQ types, indexed by enum irq_type */
static const char * const irq_type_names[] = {
        "SDMA",
        "RCVCTXT",
        "GENERAL",
        "OTHER",
};

/* Per NUMA node count of HFI devices */
static unsigned int *hfi1_per_node_cntr;

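/*
 * Note on struct cpu_mask_set bookkeeping (a summary of how the masks
 * are used below): 'mask' holds the candidate CPUs, 'used' tracks the
 * CPUs already handed out, and 'gen' counts how many times the set has
 * been fully consumed and recycled for overcommit.
 */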
static inline void init_cpu_mask_set(struct cpu_mask_set *set)
{
        cpumask_clear(&set->mask);
        cpumask_clear(&set->used);
        set->gen = 0;
}

/* Initialize non-HT cpu cores mask */
void init_real_cpu_mask(void)
{
        int possible, curr_cpu, i, ht;

        cpumask_clear(&node_affinity.real_cpu_mask);

        /* Start with cpu online mask as the real cpu mask */
        cpumask_copy(&node_affinity.real_cpu_mask, cpu_online_mask);

        /*
         * Remove HT cores from the real cpu mask.  Do this in two steps below.
         */
        possible = cpumask_weight(&node_affinity.real_cpu_mask);
        ht = cpumask_weight(topology_sibling_cpumask(
                                cpumask_first(&node_affinity.real_cpu_mask)));
        /*
         * Step 1.  Skip over the first N HT siblings and use them as the
         * "real" cores.  Assumes that HT cores are not enumerated in
         * succession (except in the single core case).
         */
        curr_cpu = cpumask_first(&node_affinity.real_cpu_mask);
        for (i = 0; i < possible / ht; i++)
                curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
        /*
         * Step 2.  Remove the remaining HT siblings.  Use cpumask_next() to
         * skip any gaps.
         */
        for (; i < possible; i++) {
                cpumask_clear_cpu(curr_cpu, &node_affinity.real_cpu_mask);
                curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
        }
}
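/*
 * Worked example, assuming a hypothetical topology: with 16 online CPUs
 * and two HW threads per core (possible = 16, ht = 2), where CPUs 0-7
 * are the first thread of each core and CPUs 8-15 are their HT
 * siblings, step 1 above walks past CPUs 0-7 and step 2 clears CPUs
 * 8-15, leaving real_cpu_mask = 0-7.
 */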

int node_affinity_init(void)
{
        int node;
        struct pci_dev *dev = NULL;
        const struct pci_device_id *ids = hfi1_pci_tbl;

        cpumask_clear(&node_affinity.proc.used);
        cpumask_copy(&node_affinity.proc.mask, cpu_online_mask);

        node_affinity.proc.gen = 0;
        node_affinity.num_core_siblings =
                                cpumask_weight(topology_sibling_cpumask(
                                        cpumask_first(&node_affinity.proc.mask)
                                        ));
        node_affinity.num_online_nodes = num_online_nodes();
        node_affinity.num_online_cpus = num_online_cpus();

        /*
         * The real cpu mask is part of the affinity struct but it has to be
         * initialized early. It is needed to calculate the number of user
         * contexts in set_up_context_variables().
         */
        init_real_cpu_mask();

        hfi1_per_node_cntr = kcalloc(num_possible_nodes(),
                                     sizeof(*hfi1_per_node_cntr), GFP_KERNEL);
        if (!hfi1_per_node_cntr)
                return -ENOMEM;

        while (ids->vendor) {
                dev = NULL;
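                /*
                 * pci_get_device() drops the reference on the device
                 * passed in and takes one on the device returned, so no
                 * explicit pci_dev_put() is needed in this loop.
                 */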
                while ((dev = pci_get_device(ids->vendor, ids->device, dev))) {
                        node = pcibus_to_node(dev->bus);
                        if (node < 0)
                                node = numa_node_id();

                        hfi1_per_node_cntr[node]++;
                }
                ids++;
        }

        return 0;
}

void node_affinity_destroy(void)
{
        struct list_head *pos, *q;
        struct hfi1_affinity_node *entry;

        spin_lock(&node_affinity.lock);
        list_for_each_safe(pos, q, &node_affinity.list) {
                entry = list_entry(pos, struct hfi1_affinity_node,
                                   list);
                list_del(pos);
                kfree(entry);
        }
        spin_unlock(&node_affinity.lock);
        kfree(hfi1_per_node_cntr);
}

static struct hfi1_affinity_node *node_affinity_allocate(int node)
{
        struct hfi1_affinity_node *entry;

        entry = kzalloc(sizeof(*entry), GFP_KERNEL);
        if (!entry)
                return NULL;
        entry->node = node;
        INIT_LIST_HEAD(&entry->list);

        return entry;
}

/*
 * Append an entry to the global affinity node list.
 * Must be called with node_affinity.lock held.
 */
static void node_affinity_add_tail(struct hfi1_affinity_node *entry)
{
        list_add_tail(&entry->list, &node_affinity.list);
}

/* It must be called with node_affinity.lock held */
static struct hfi1_affinity_node *node_affinity_lookup(int node)
{
        struct list_head *pos;
        struct hfi1_affinity_node *entry;

        list_for_each(pos, &node_affinity.list) {
                entry = list_entry(pos, struct hfi1_affinity_node, list);
                if (entry->node == node)
                        return entry;
        }

        return NULL;
}

/*
 * Interrupt affinity.
 *
 * Non-receive interrupts (e.g. the SDMA engines) get a default mask
 * that starts as the node's non-HT CPUs with the CPUs reserved for the
 * receive contexts removed.
 *
 * Receive contexts get node-relative CPUs starting at 1 (node-relative
 * CPU 0 is reserved for the general/control context), wrapping back to
 * node-relative 1 as necessary.
 */
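/*
 * Example, assuming a hypothetical layout: if the node's non-HT CPU
 * mask is 0-13, one HFI sits on the node, and dd->n_krcv_queues is 8
 * (1 control + 7 data contexts), then hfi1_dev_affinity_init() below
 * reserves CPU 0 for the general/control context, CPUs 1-7 for the
 * kernel receive contexts, and leaves CPUs 8-13 for the SDMA engines.
 */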
int hfi1_dev_affinity_init(struct hfi1_devdata *dd)
{
        int node = pcibus_to_node(dd->pcidev->bus);
        struct hfi1_affinity_node *entry;
        const struct cpumask *local_mask;
        int curr_cpu, possible, i;

        if (node < 0)
                node = numa_node_id();
        dd->node = node;

        local_mask = cpumask_of_node(dd->node);
        if (cpumask_first(local_mask) >= nr_cpu_ids)
                local_mask = topology_core_cpumask(0);

        spin_lock(&node_affinity.lock);
        entry = node_affinity_lookup(dd->node);
        spin_unlock(&node_affinity.lock);

        /*
         * If this is the first time this NUMA node's affinity is used,
         * create an entry in the global affinity structure and initialize it.
         */
        if (!entry) {
                entry = node_affinity_allocate(node);
                if (!entry) {
                        dd_dev_err(dd,
                                   "Unable to allocate global affinity node\n");
                        return -ENOMEM;
                }
                init_cpu_mask_set(&entry->def_intr);
                init_cpu_mask_set(&entry->rcv_intr);
                cpumask_clear(&entry->general_intr_mask);
                /* Use the "real" cpu mask of this node as the default */
                cpumask_and(&entry->def_intr.mask, &node_affinity.real_cpu_mask,
                            local_mask);

                /* fill in the receive list */
                possible = cpumask_weight(&entry->def_intr.mask);
                curr_cpu = cpumask_first(&entry->def_intr.mask);

                if (possible == 1) {
                        /* only one CPU, everyone will use it */
                        cpumask_set_cpu(curr_cpu, &entry->rcv_intr.mask);
                        cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
                } else {
                        /*
                         * The general/control context will be the first CPU in
                         * the default list, so it is removed from the default
                         * list and added to the general interrupt list.
                         */
                        cpumask_clear_cpu(curr_cpu, &entry->def_intr.mask);
                        cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
                        curr_cpu = cpumask_next(curr_cpu,
                                                &entry->def_intr.mask);

                        /*
                         * Remove the remaining kernel receive queues from
                         * the default list and add them to the receive list.
                         */
                        for (i = 0;
                             i < (dd->n_krcv_queues - 1) *
                                  hfi1_per_node_cntr[dd->node];
                             i++) {
                                cpumask_clear_cpu(curr_cpu,
                                                  &entry->def_intr.mask);
                                cpumask_set_cpu(curr_cpu,
                                                &entry->rcv_intr.mask);
                                curr_cpu = cpumask_next(curr_cpu,
                                                        &entry->def_intr.mask);
                                if (curr_cpu >= nr_cpu_ids)
                                        break;
                        }

                        /*
                         * If no CPU cores are left over for the SDMA
                         * engines, use the same CPU cores as the
                         * general/control context.
                         */
                        if (cpumask_weight(&entry->def_intr.mask) == 0)
                                cpumask_copy(&entry->def_intr.mask,
                                             &entry->general_intr_mask);
                }

                spin_lock(&node_affinity.lock);
                node_affinity_add_tail(entry);
                spin_unlock(&node_affinity.lock);
        }

        return 0;
}

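/*
 * Example, assuming the hypothetical layout above: with def_intr.mask =
 * 8-13, successive SDMA vectors passed to hfi1_get_irq_affinity() are
 * pinned to CPUs 8, 9, ..., 13; once all six are in the 'used' mask,
 * the generation count is bumped, 'used' is cleared, and assignment
 * wraps back to CPU 8.
 */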
int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
{
        int ret;
        cpumask_var_t diff;
        struct hfi1_affinity_node *entry;
        struct cpu_mask_set *set = NULL;
        struct sdma_engine *sde = NULL;
        struct hfi1_ctxtdata *rcd = NULL;
        char extra[64];
        int cpu = -1;

        extra[0] = '\0';
        cpumask_clear(&msix->mask);

        ret = zalloc_cpumask_var(&diff, GFP_KERNEL);
        if (!ret)
                return -ENOMEM;

        spin_lock(&node_affinity.lock);
        entry = node_affinity_lookup(dd->node);
        spin_unlock(&node_affinity.lock);

        switch (msix->type) {
        case IRQ_SDMA:
                sde = (struct sdma_engine *)msix->arg;
                scnprintf(extra, 64, "engine %u", sde->this_idx);
                set = &entry->def_intr;
                break;
        case IRQ_GENERAL:
                cpu = cpumask_first(&entry->general_intr_mask);
                break;
        case IRQ_RCVCTXT:
                rcd = (struct hfi1_ctxtdata *)msix->arg;
                if (rcd->ctxt == HFI1_CTRL_CTXT)
                        cpu = cpumask_first(&entry->general_intr_mask);
                else
                        set = &entry->rcv_intr;
                scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
                break;
        default:
                dd_dev_err(dd, "Invalid IRQ type %d\n", msix->type);
                free_cpumask_var(diff);
                return -EINVAL;
        }

        /*
         * The general and control contexts are placed on a particular
         * CPU, which is set above. Skip accounting for it. Everything else
         * finds its CPU here.
         */
        if (cpu == -1 && set) {
                spin_lock(&node_affinity.lock);
                if (cpumask_equal(&set->mask, &set->used)) {
                        /*
                         * We've used up all the CPUs, bump up the generation
                         * and reset the 'used' map
                         */
                        set->gen++;
                        cpumask_clear(&set->used);
                }
                cpumask_andnot(diff, &set->mask, &set->used);
                cpu = cpumask_first(diff);
                cpumask_set_cpu(cpu, &set->used);
                spin_unlock(&node_affinity.lock);
        }

        switch (msix->type) {
        case IRQ_SDMA:
                sde->cpu = cpu;
                break;
        case IRQ_GENERAL:
        case IRQ_RCVCTXT:
        case IRQ_OTHER:
                break;
        }

        cpumask_set_cpu(cpu, &msix->mask);
        dd_dev_info(dd, "IRQ vector: %u, type %s %s -> cpu: %d\n",
                    msix->msix.vector, irq_type_names[msix->type],
                    extra, cpu);
        irq_set_affinity_hint(msix->msix.vector, &msix->mask);

        free_cpumask_var(diff);
        return 0;
}

void hfi1_put_irq_affinity(struct hfi1_devdata *dd,
                           struct hfi1_msix_entry *msix)
{
        struct cpu_mask_set *set = NULL;
        struct hfi1_ctxtdata *rcd;
        struct hfi1_affinity_node *entry;

        spin_lock(&node_affinity.lock);
        entry = node_affinity_lookup(dd->node);
        spin_unlock(&node_affinity.lock);

        switch (msix->type) {
        case IRQ_SDMA:
                set = &entry->def_intr;
                break;
        case IRQ_GENERAL:
                /* Don't do accounting for general contexts */
                break;
        case IRQ_RCVCTXT:
                rcd = (struct hfi1_ctxtdata *)msix->arg;
                /* Don't do accounting for control contexts */
                if (rcd->ctxt != HFI1_CTRL_CTXT)
                        set = &entry->rcv_intr;
                break;
        default:
                return;
        }

        if (set) {
                spin_lock(&node_affinity.lock);
                cpumask_andnot(&set->used, &set->used, &msix->mask);
                if (cpumask_empty(&set->used) && set->gen) {
                        set->gen--;
                        cpumask_copy(&set->used, &set->mask);
                }
                spin_unlock(&node_affinity.lock);
        }

        irq_set_affinity_hint(msix->msix.vector, NULL);
        cpumask_clear(&msix->mask);
}

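/*
 * Note: find_hw_thread_mask() is only called when num_core_siblings is
 * non-zero, so the division below is safe. Example, assuming a
 * hypothetical 2-socket, 28-cores-per-socket, 2-threads-per-core system
 * (112 online CPUs): num_cores_per_socket = 112 / 2 / 2 = 28,
 * hw_thread_no 0 keeps the first 56 CPUs (assumed to be the first HW
 * thread of every core in this enumeration), and hw_thread_no 1 shifts
 * the mask by 56 to select the second HW thread of every core.
 */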
/* This should be called with node_affinity.lock held */
static void find_hw_thread_mask(uint hw_thread_no, cpumask_var_t hw_thread_mask,
                                struct hfi1_affinity_node_list *affinity)
{
        int possible, curr_cpu, i;
        uint num_cores_per_socket = node_affinity.num_online_cpus /
                                        affinity->num_core_siblings /
                                                node_affinity.num_online_nodes;

        cpumask_copy(hw_thread_mask, &affinity->proc.mask);
        if (affinity->num_core_siblings > 0) {
                /* Removing other siblings not needed for now */
                possible = cpumask_weight(hw_thread_mask);
                curr_cpu = cpumask_first(hw_thread_mask);
                for (i = 0;
                     i < num_cores_per_socket * node_affinity.num_online_nodes;
                     i++)
                        curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);

                for (; i < possible; i++) {
                        cpumask_clear_cpu(curr_cpu, hw_thread_mask);
                        curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);
                }

                /* Identifying correct HW threads within physical cores */
                cpumask_shift_left(hw_thread_mask, hw_thread_mask,
                                   num_cores_per_socket *
                                   node_affinity.num_online_nodes *
                                   hw_thread_no);
        }
}

int hfi1_get_proc_affinity(int node)
{
        int cpu = -1, ret, i;
        struct hfi1_affinity_node *entry;
        cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask;
        const struct cpumask *node_mask,
                *proc_mask = tsk_cpus_allowed(current);
        struct hfi1_affinity_node_list *affinity = &node_affinity;
        struct cpu_mask_set *set = &affinity->proc;

        /*
         * check whether process/context affinity has already
         * been set
         */
        if (cpumask_weight(proc_mask) == 1) {
                hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl",
                          current->pid, current->comm,
                          cpumask_pr_args(proc_mask));
                /*
                 * Mark the pre-set CPU as used. This is atomic so we don't
                 * need the lock
                 */
                cpu = cpumask_first(proc_mask);
                cpumask_set_cpu(cpu, &set->used);
                goto done;
        } else if (cpumask_weight(proc_mask) < cpumask_weight(&set->mask)) {
                hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl",
                          current->pid, current->comm,
                          cpumask_pr_args(proc_mask));
                goto done;
        }

        /*
         * The process does not have a preset CPU affinity, so find one to
         * recommend using the following algorithm:
         *
         * For each user process that is opening a context on HFI Y:
         *  a) If all cores are filled, reinitialize the bitmask
         *  b) Fill real cores first, then HT cores (first set of HT
         *     threads on all physical cores, then second set of HT
         *     threads, and so on) in the following order:
         *
         *     1. Same NUMA node as HFI Y and not running an IRQ
         *        handler
         *     2. Same NUMA node as HFI Y and running an IRQ handler
         *     3. Different NUMA node to HFI Y and not running an IRQ
         *        handler
         *     4. Different NUMA node to HFI Y and running an IRQ
         *        handler
         *  c) Mark core as filled in the bitmask. As user processes are
         *     done, clear cores from the bitmask.
         */
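        /*
         * Illustration, assuming a hypothetical two-node system with the
         * HFI on node 0: a new process is first given a free node-0 CPU
         * that is not in the interrupt masks; once those are exhausted,
         * node-0 CPUs that do run interrupt handlers are used; only then
         * does assignment spill over to node 1, applying the same
         * preference order there.
         */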

        ret = zalloc_cpumask_var(&diff, GFP_KERNEL);
        if (!ret)
                goto done;
        ret = zalloc_cpumask_var(&hw_thread_mask, GFP_KERNEL);
        if (!ret)
                goto free_diff;
        ret = zalloc_cpumask_var(&available_mask, GFP_KERNEL);
        if (!ret)
                goto free_hw_thread_mask;
        ret = zalloc_cpumask_var(&intrs_mask, GFP_KERNEL);
        if (!ret)
                goto free_available_mask;

        spin_lock(&affinity->lock);
        /*
         * If we've used all available HW threads, clear the mask and start
         * overloading.
         */
        if (cpumask_equal(&set->mask, &set->used)) {
                set->gen++;
                cpumask_clear(&set->used);
        }

        /*
         * If NUMA node has CPUs used by interrupt handlers, include them in the
         * interrupt handler mask.
         */
        entry = node_affinity_lookup(node);
        if (entry) {
                cpumask_copy(intrs_mask, (entry->def_intr.gen ?
                                          &entry->def_intr.mask :
                                          &entry->def_intr.used));
                cpumask_or(intrs_mask, intrs_mask, (entry->rcv_intr.gen ?
                                                    &entry->rcv_intr.mask :
                                                    &entry->rcv_intr.used));
                cpumask_or(intrs_mask, intrs_mask, &entry->general_intr_mask);
        }
        hfi1_cdbg(PROC, "CPUs used by interrupts: %*pbl",
                  cpumask_pr_args(intrs_mask));

        cpumask_copy(hw_thread_mask, &set->mask);

        /*
         * If HT cores are enabled, identify which HW threads within the
         * physical cores should be used.
         */
        if (affinity->num_core_siblings > 0) {
                for (i = 0; i < affinity->num_core_siblings; i++) {
                        find_hw_thread_mask(i, hw_thread_mask, affinity);

                        /*
                         * If there's at least one available core for this HW
                         * thread number, stop looking for a core.
                         *
                         * diff will be non-empty at least once in this loop
                         * because the 'used' mask is reset above when
                         * (set->mask == set->used).
                         */
                        cpumask_andnot(diff, hw_thread_mask, &set->used);
                        if (!cpumask_empty(diff))
                                break;
                }
        }
        hfi1_cdbg(PROC, "Same available HW thread on all physical CPUs: %*pbl",
                  cpumask_pr_args(hw_thread_mask));

        node_mask = cpumask_of_node(node);
        hfi1_cdbg(PROC, "Device on NUMA %u, CPUs %*pbl", node,
                  cpumask_pr_args(node_mask));

        /* Get cpumask of available CPUs on preferred NUMA */
        cpumask_and(available_mask, hw_thread_mask, node_mask);
        cpumask_andnot(available_mask, available_mask, &set->used);
        hfi1_cdbg(PROC, "Available CPUs on NUMA %u: %*pbl", node,
                  cpumask_pr_args(available_mask));

        /*
         * At first, we don't want to place processes on the same
         * CPUs as interrupt handlers. Then, CPUs running interrupt
         * handlers are used.
         *
         * 1) If diff is not empty, then there are CPUs not running
         *    interrupt handlers available, so diff gets copied
         *    over to available_mask.
         * 2) If diff is empty, then all CPUs not running interrupt
         *    handlers are taken, so available_mask contains all
         *    available CPUs running interrupt handlers.
         * 3) If available_mask is empty, then all CPUs on the
         *    preferred NUMA node are taken, so other NUMA nodes are
         *    used for process assignments using the same method as
         *    the preferred NUMA node.
         */
        cpumask_andnot(diff, available_mask, intrs_mask);
        if (!cpumask_empty(diff))
                cpumask_copy(available_mask, diff);

        /* If we don't have CPUs on the preferred node, use other NUMA nodes */
        if (cpumask_empty(available_mask)) {
                cpumask_andnot(available_mask, hw_thread_mask, &set->used);
                /* Excluding preferred NUMA cores */
                cpumask_andnot(available_mask, available_mask, node_mask);
                hfi1_cdbg(PROC,
                          "Preferred NUMA node cores are taken, cores available in other NUMA nodes: %*pbl",
                          cpumask_pr_args(available_mask));

                /*
                 * At first, we don't want to place processes on the same
                 * CPUs as interrupt handlers.
                 */
                cpumask_andnot(diff, available_mask, intrs_mask);
                if (!cpumask_empty(diff))
                        cpumask_copy(available_mask, diff);
        }
        hfi1_cdbg(PROC, "Possible CPUs for process: %*pbl",
                  cpumask_pr_args(available_mask));

        cpu = cpumask_first(available_mask);
        if (cpu >= nr_cpu_ids) /* empty */
                cpu = -1;
        else
                cpumask_set_cpu(cpu, &set->used);
        spin_unlock(&affinity->lock);
        hfi1_cdbg(PROC, "Process assigned to CPU %d", cpu);

        free_cpumask_var(intrs_mask);
free_available_mask:
        free_cpumask_var(available_mask);
free_hw_thread_mask:
        free_cpumask_var(hw_thread_mask);
free_diff:
        free_cpumask_var(diff);
done:
        return cpu;
}

void hfi1_put_proc_affinity(int cpu)
{
        struct hfi1_affinity_node_list *affinity = &node_affinity;
        struct cpu_mask_set *set = &affinity->proc;

        if (cpu < 0)
                return;
        spin_lock(&affinity->lock);
        cpumask_clear_cpu(cpu, &set->used);
        hfi1_cdbg(PROC, "Returning CPU %d for future process assignment", cpu);
        if (cpumask_empty(&set->used) && set->gen) {
                set->gen--;
                cpumask_copy(&set->used, &set->mask);
        }
        spin_unlock(&affinity->lock);
}

/* Prevents concurrent reads and writes of the sdma_affinity attribute */
static DEFINE_MUTEX(sdma_affinity_mutex);

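/*
 * Example (assuming the attribute is written from user space): a CPU
 * list such as "0-3,8" in the format accepted by cpulist_parse()
 * redistributes all SDMA engine interrupts of this device across
 * CPUs 0-3 and 8.
 */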
int hfi1_set_sdma_affinity(struct hfi1_devdata *dd, const char *buf,
                           size_t count)
{
        struct hfi1_affinity_node *entry;
        cpumask_var_t mask;
        int ret, i;

        spin_lock(&node_affinity.lock);
        entry = node_affinity_lookup(dd->node);
        spin_unlock(&node_affinity.lock);

        if (!entry)
                return -EINVAL;

        ret = zalloc_cpumask_var(&mask, GFP_KERNEL);
        if (!ret)
                return -ENOMEM;

        ret = cpulist_parse(buf, mask);
        if (ret)
                goto out;

        if (!cpumask_subset(mask, cpu_online_mask) || cpumask_empty(mask)) {
                dd_dev_warn(dd, "Invalid CPU mask\n");
                ret = -EINVAL;
                goto out;
        }

        mutex_lock(&sdma_affinity_mutex);
        /* reset the SDMA interrupt affinity details */
        init_cpu_mask_set(&entry->def_intr);
        cpumask_copy(&entry->def_intr.mask, mask);
        /*
         * Reassign the affinity for each SDMA interrupt.
         */
        for (i = 0; i < dd->num_msix_entries; i++) {
                struct hfi1_msix_entry *msix;

                msix = &dd->msix_entries[i];
                if (msix->type != IRQ_SDMA)
                        continue;

                ret = hfi1_get_irq_affinity(dd, msix);

                if (ret)
                        break;
        }
        mutex_unlock(&sdma_affinity_mutex);
out:
        free_cpumask_var(mask);
        return ret ? ret : strnlen(buf, PAGE_SIZE);
}

int hfi1_get_sdma_affinity(struct hfi1_devdata *dd, char *buf)
{
        struct hfi1_affinity_node *entry;

        spin_lock(&node_affinity.lock);
        entry = node_affinity_lookup(dd->node);
        spin_unlock(&node_affinity.lock);

        if (!entry)
                return -EINVAL;

        mutex_lock(&sdma_affinity_mutex);
        cpumap_print_to_pagebuf(true, buf, &entry->def_intr.mask);
        mutex_unlock(&sdma_affinity_mutex);
        return strnlen(buf, PAGE_SIZE);
}