linux/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c
<<
>>
Prefs
   1/*
   2 * GPL HEADER START
   3 *
   4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5 *
   6 * This program is free software; you can redistribute it and/or modify
   7 * it under the terms of the GNU General Public License version 2 only,
   8 * as published by the Free Software Foundation.
   9 *
  10 * This program is distributed in the hope that it will be useful, but
  11 * WITHOUT ANY WARRANTY; without even the implied warranty of
  12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13 * General Public License version 2 for more details (a copy is included
  14 * in the LICENSE file that accompanied this code).
  15 *
  16 * You should have received a copy of the GNU General Public License
  17 * version 2 along with this program; if not, write to the
  18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19 * Boston, MA 021110-1307, USA
  20 *
  21 * GPL HEADER END
  22 */
  23/*
  24 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  25 * Copyright (c) 2012, Intel Corporation.
  26 */
  27/*
  28 * This file is part of Lustre, http://www.lustre.org/
  29 * Lustre is a trademark of Sun Microsystems, Inc.
  30 *
  31 * Author: liang@whamcloud.com
  32 */
  33
  34#define DEBUG_SUBSYSTEM S_LNET
  35
  36#include <linux/cpu.h>
  37#include <linux/sched.h>
  38#include "../../../include/linux/libcfs/libcfs.h"
  39
  40#ifdef CONFIG_SMP
  41
  42/**
  43 * modparam for setting number of partitions
  44 *
  45 *  0 : estimate best value based on cores or NUMA nodes
  46 *  1 : disable multiple partitions
  47 * >1 : specify number of partitions
  48 */
  49static int      cpu_npartitions;
  50module_param(cpu_npartitions, int, 0444);
  51MODULE_PARM_DESC(cpu_npartitions, "# of CPU partitions");
  52
  53/**
  54 * modparam for setting CPU partitions patterns:
  55 *
  56 * i.e: "0[0,1,2,3] 1[4,5,6,7]", number before bracket is CPU partition ID,
  57 *      number in bracket is processor ID (core or HT)
  58 *
  59 * i.e: "N 0[0,1] 1[2,3]" the first character 'N' means numbers in bracket
  60 *       are NUMA node ID, number before bracket is CPU partition ID.
  61 *
  62 * NB: If user specified cpu_pattern, cpu_npartitions will be ignored
  63 */
  64static char     *cpu_pattern = "";
  65module_param(cpu_pattern, charp, 0444);
  66MODULE_PARM_DESC(cpu_pattern, "CPU partitions pattern");
  67
  68struct cfs_cpt_data {
  69        /* serialize hotplug etc */
  70        spinlock_t              cpt_lock;
  71        /* reserved for hotplug */
  72        unsigned long           cpt_version;
  73        /* mutex to protect cpt_cpumask */
  74        struct mutex            cpt_mutex;
  75        /* scratch buffer for set/unset_node */
  76        cpumask_t               *cpt_cpumask;
  77};
  78
  79static struct cfs_cpt_data      cpt_data;
  80
  81static void cfs_cpu_core_siblings(int cpu, cpumask_t *mask)
  82{
  83        /* return cpumask of cores in the same socket */
  84        cpumask_copy(mask, topology_core_cpumask(cpu));
  85}
  86
  87/* return cpumask of HTs in the same core */
  88static void cfs_cpu_ht_siblings(int cpu, cpumask_t *mask)
  89{
  90        cpumask_copy(mask, topology_sibling_cpumask(cpu));
  91}
  92
  93static void cfs_node_to_cpumask(int node, cpumask_t *mask)
  94{
  95        cpumask_copy(mask, cpumask_of_node(node));
  96}
  97
  98void
  99cfs_cpt_table_free(struct cfs_cpt_table *cptab)
 100{
 101        int     i;
 102
 103        if (cptab->ctb_cpu2cpt != NULL) {
 104                LIBCFS_FREE(cptab->ctb_cpu2cpt,
 105                            num_possible_cpus() *
 106                            sizeof(cptab->ctb_cpu2cpt[0]));
 107        }
 108
 109        for (i = 0; cptab->ctb_parts != NULL && i < cptab->ctb_nparts; i++) {
 110                struct cfs_cpu_partition *part = &cptab->ctb_parts[i];
 111
 112                if (part->cpt_nodemask != NULL) {
 113                        LIBCFS_FREE(part->cpt_nodemask,
 114                                    sizeof(*part->cpt_nodemask));
 115                }
 116
 117                if (part->cpt_cpumask != NULL)
 118                        LIBCFS_FREE(part->cpt_cpumask, cpumask_size());
 119        }
 120
 121        if (cptab->ctb_parts != NULL) {
 122                LIBCFS_FREE(cptab->ctb_parts,
 123                            cptab->ctb_nparts * sizeof(cptab->ctb_parts[0]));
 124        }
 125
 126        if (cptab->ctb_nodemask != NULL)
 127                LIBCFS_FREE(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask));
 128        if (cptab->ctb_cpumask != NULL)
 129                LIBCFS_FREE(cptab->ctb_cpumask, cpumask_size());
 130
 131        LIBCFS_FREE(cptab, sizeof(*cptab));
 132}
 133EXPORT_SYMBOL(cfs_cpt_table_free);
 134
 135struct cfs_cpt_table *
 136cfs_cpt_table_alloc(unsigned int ncpt)
 137{
 138        struct cfs_cpt_table *cptab;
 139        int     i;
 140
 141        LIBCFS_ALLOC(cptab, sizeof(*cptab));
 142        if (cptab == NULL)
 143                return NULL;
 144
 145        cptab->ctb_nparts = ncpt;
 146
 147        LIBCFS_ALLOC(cptab->ctb_cpumask, cpumask_size());
 148        LIBCFS_ALLOC(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask));
 149
 150        if (cptab->ctb_cpumask == NULL || cptab->ctb_nodemask == NULL)
 151                goto failed;
 152
 153        LIBCFS_ALLOC(cptab->ctb_cpu2cpt,
 154                     num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0]));
 155        if (cptab->ctb_cpu2cpt == NULL)
 156                goto failed;
 157
 158        memset(cptab->ctb_cpu2cpt, -1,
 159               num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0]));
 160
 161        LIBCFS_ALLOC(cptab->ctb_parts, ncpt * sizeof(cptab->ctb_parts[0]));
 162        if (cptab->ctb_parts == NULL)
 163                goto failed;
 164
 165        for (i = 0; i < ncpt; i++) {
 166                struct cfs_cpu_partition *part = &cptab->ctb_parts[i];
 167
 168                LIBCFS_ALLOC(part->cpt_cpumask, cpumask_size());
 169                LIBCFS_ALLOC(part->cpt_nodemask, sizeof(*part->cpt_nodemask));
 170                if (part->cpt_cpumask == NULL || part->cpt_nodemask == NULL)
 171                        goto failed;
 172        }
 173
 174        spin_lock(&cpt_data.cpt_lock);
 175        /* Reserved for hotplug */
 176        cptab->ctb_version = cpt_data.cpt_version;
 177        spin_unlock(&cpt_data.cpt_lock);
 178
 179        return cptab;
 180
 181 failed:
 182        cfs_cpt_table_free(cptab);
 183        return NULL;
 184}
 185EXPORT_SYMBOL(cfs_cpt_table_alloc);
 186
 187int
 188cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len)
 189{
 190        char    *tmp = buf;
 191        int     rc = 0;
 192        int     i;
 193        int     j;
 194
 195        for (i = 0; i < cptab->ctb_nparts; i++) {
 196                if (len > 0) {
 197                        rc = snprintf(tmp, len, "%d\t: ", i);
 198                        len -= rc;
 199                }
 200
 201                if (len <= 0) {
 202                        rc = -EFBIG;
 203                        goto out;
 204                }
 205
 206                tmp += rc;
 207                for_each_cpu(j, cptab->ctb_parts[i].cpt_cpumask) {
 208                        rc = snprintf(tmp, len, "%d ", j);
 209                        len -= rc;
 210                        if (len <= 0) {
 211                                rc = -EFBIG;
 212                                goto out;
 213                        }
 214                        tmp += rc;
 215                }
 216
 217                *tmp = '\n';
 218                tmp++;
 219                len--;
 220        }
 221
 222 out:
 223        if (rc < 0)
 224                return rc;
 225
 226        return tmp - buf;
 227}
 228EXPORT_SYMBOL(cfs_cpt_table_print);
 229
 230int
 231cfs_cpt_number(struct cfs_cpt_table *cptab)
 232{
 233        return cptab->ctb_nparts;
 234}
 235EXPORT_SYMBOL(cfs_cpt_number);
 236
 237int
 238cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt)
 239{
 240        LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
 241
 242        return cpt == CFS_CPT_ANY ?
 243               cpumask_weight(cptab->ctb_cpumask) :
 244               cpumask_weight(cptab->ctb_parts[cpt].cpt_cpumask);
 245}
 246EXPORT_SYMBOL(cfs_cpt_weight);
 247
 248int
 249cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt)
 250{
 251        LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
 252
 253        return cpt == CFS_CPT_ANY ?
 254               cpumask_any_and(cptab->ctb_cpumask,
 255                               cpu_online_mask) < nr_cpu_ids :
 256               cpumask_any_and(cptab->ctb_parts[cpt].cpt_cpumask,
 257                               cpu_online_mask) < nr_cpu_ids;
 258}
 259EXPORT_SYMBOL(cfs_cpt_online);
 260
 261cpumask_t *
 262cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt)
 263{
 264        LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
 265
 266        return cpt == CFS_CPT_ANY ?
 267               cptab->ctb_cpumask : cptab->ctb_parts[cpt].cpt_cpumask;
 268}
 269EXPORT_SYMBOL(cfs_cpt_cpumask);
 270
 271nodemask_t *
 272cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt)
 273{
 274        LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
 275
 276        return cpt == CFS_CPT_ANY ?
 277               cptab->ctb_nodemask : cptab->ctb_parts[cpt].cpt_nodemask;
 278}
 279EXPORT_SYMBOL(cfs_cpt_nodemask);
 280
 281int
 282cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
 283{
 284        int     node;
 285
 286        LASSERT(cpt >= 0 && cpt < cptab->ctb_nparts);
 287
 288        if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_online(cpu)) {
 289                CDEBUG(D_INFO, "CPU %d is invalid or it's offline\n", cpu);
 290                return 0;
 291        }
 292
 293        if (cptab->ctb_cpu2cpt[cpu] != -1) {
 294                CDEBUG(D_INFO, "CPU %d is already in partition %d\n",
 295                       cpu, cptab->ctb_cpu2cpt[cpu]);
 296                return 0;
 297        }
 298
 299        cptab->ctb_cpu2cpt[cpu] = cpt;
 300
 301        LASSERT(!cpumask_test_cpu(cpu, cptab->ctb_cpumask));
 302        LASSERT(!cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask));
 303
 304        cpumask_set_cpu(cpu, cptab->ctb_cpumask);
 305        cpumask_set_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask);
 306
 307        node = cpu_to_node(cpu);
 308
 309        /* first CPU of @node in this CPT table */
 310        if (!node_isset(node, *cptab->ctb_nodemask))
 311                node_set(node, *cptab->ctb_nodemask);
 312
 313        /* first CPU of @node in this partition */
 314        if (!node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask))
 315                node_set(node, *cptab->ctb_parts[cpt].cpt_nodemask);
 316
 317        return 1;
 318}
 319EXPORT_SYMBOL(cfs_cpt_set_cpu);
 320
 321void
 322cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
 323{
 324        int     node;
 325        int     i;
 326
 327        LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
 328
 329        if (cpu < 0 || cpu >= nr_cpu_ids) {
 330                CDEBUG(D_INFO, "Invalid CPU id %d\n", cpu);
 331                return;
 332        }
 333
 334        if (cpt == CFS_CPT_ANY) {
 335                /* caller doesn't know the partition ID */
 336                cpt = cptab->ctb_cpu2cpt[cpu];
 337                if (cpt < 0) { /* not set in this CPT-table */
 338                        CDEBUG(D_INFO, "Try to unset cpu %d which is not in CPT-table %p\n",
 339                               cpt, cptab);
 340                        return;
 341                }
 342
 343        } else if (cpt != cptab->ctb_cpu2cpt[cpu]) {
 344                CDEBUG(D_INFO,
 345                       "CPU %d is not in cpu-partition %d\n", cpu, cpt);
 346                return;
 347        }
 348
 349        LASSERT(cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask));
 350        LASSERT(cpumask_test_cpu(cpu, cptab->ctb_cpumask));
 351
 352        cpumask_clear_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask);
 353        cpumask_clear_cpu(cpu, cptab->ctb_cpumask);
 354        cptab->ctb_cpu2cpt[cpu] = -1;
 355
 356        node = cpu_to_node(cpu);
 357
 358        LASSERT(node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask));
 359        LASSERT(node_isset(node, *cptab->ctb_nodemask));
 360
 361        for_each_cpu(i, cptab->ctb_parts[cpt].cpt_cpumask) {
 362                /* this CPT has other CPU belonging to this node? */
 363                if (cpu_to_node(i) == node)
 364                        break;
 365        }
 366
 367        if (i >= nr_cpu_ids)
 368                node_clear(node, *cptab->ctb_parts[cpt].cpt_nodemask);
 369
 370        for_each_cpu(i, cptab->ctb_cpumask) {
 371                /* this CPT-table has other CPU belonging to this node? */
 372                if (cpu_to_node(i) == node)
 373                        break;
 374        }
 375
 376        if (i >= nr_cpu_ids)
 377                node_clear(node, *cptab->ctb_nodemask);
 378
 379        return;
 380}
 381EXPORT_SYMBOL(cfs_cpt_unset_cpu);
 382
 383int
 384cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask)
 385{
 386        int     i;
 387
 388        if (cpumask_weight(mask) == 0 ||
 389            cpumask_any_and(mask, cpu_online_mask) >= nr_cpu_ids) {
 390                CDEBUG(D_INFO, "No online CPU is found in the CPU mask for CPU partition %d\n",
 391                       cpt);
 392                return 0;
 393        }
 394
 395        for_each_cpu(i, mask) {
 396                if (!cfs_cpt_set_cpu(cptab, cpt, i))
 397                        return 0;
 398        }
 399
 400        return 1;
 401}
 402EXPORT_SYMBOL(cfs_cpt_set_cpumask);
 403
 404void
 405cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask)
 406{
 407        int     i;
 408
 409        for_each_cpu(i, mask)
 410                cfs_cpt_unset_cpu(cptab, cpt, i);
 411}
 412EXPORT_SYMBOL(cfs_cpt_unset_cpumask);
 413
 414int
 415cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node)
 416{
 417        cpumask_t       *mask;
 418        int             rc;
 419
 420        if (node < 0 || node >= MAX_NUMNODES) {
 421                CDEBUG(D_INFO,
 422                       "Invalid NUMA id %d for CPU partition %d\n", node, cpt);
 423                return 0;
 424        }
 425
 426        mutex_lock(&cpt_data.cpt_mutex);
 427
 428        mask = cpt_data.cpt_cpumask;
 429        cfs_node_to_cpumask(node, mask);
 430
 431        rc = cfs_cpt_set_cpumask(cptab, cpt, mask);
 432
 433        mutex_unlock(&cpt_data.cpt_mutex);
 434
 435        return rc;
 436}
 437EXPORT_SYMBOL(cfs_cpt_set_node);
 438
 439void
 440cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node)
 441{
 442        cpumask_t *mask;
 443
 444        if (node < 0 || node >= MAX_NUMNODES) {
 445                CDEBUG(D_INFO,
 446                       "Invalid NUMA id %d for CPU partition %d\n", node, cpt);
 447                return;
 448        }
 449
 450        mutex_lock(&cpt_data.cpt_mutex);
 451
 452        mask = cpt_data.cpt_cpumask;
 453        cfs_node_to_cpumask(node, mask);
 454
 455        cfs_cpt_unset_cpumask(cptab, cpt, mask);
 456
 457        mutex_unlock(&cpt_data.cpt_mutex);
 458}
 459EXPORT_SYMBOL(cfs_cpt_unset_node);
 460
 461int
 462cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask)
 463{
 464        int     i;
 465
 466        for_each_node_mask(i, *mask) {
 467                if (!cfs_cpt_set_node(cptab, cpt, i))
 468                        return 0;
 469        }
 470
 471        return 1;
 472}
 473EXPORT_SYMBOL(cfs_cpt_set_nodemask);
 474
 475void
 476cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask)
 477{
 478        int     i;
 479
 480        for_each_node_mask(i, *mask)
 481                cfs_cpt_unset_node(cptab, cpt, i);
 482}
 483EXPORT_SYMBOL(cfs_cpt_unset_nodemask);
 484
 485void
 486cfs_cpt_clear(struct cfs_cpt_table *cptab, int cpt)
 487{
 488        int     last;
 489        int     i;
 490
 491        if (cpt == CFS_CPT_ANY) {
 492                last = cptab->ctb_nparts - 1;
 493                cpt = 0;
 494        } else {
 495                last = cpt;
 496        }
 497
 498        for (; cpt <= last; cpt++) {
 499                for_each_cpu(i, cptab->ctb_parts[cpt].cpt_cpumask)
 500                        cfs_cpt_unset_cpu(cptab, cpt, i);
 501        }
 502}
 503EXPORT_SYMBOL(cfs_cpt_clear);
 504
 505int
 506cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt)
 507{
 508        nodemask_t      *mask;
 509        int             weight;
 510        int             rotor;
 511        int             node;
 512
 513        /* convert CPU partition ID to HW node id */
 514
 515        if (cpt < 0 || cpt >= cptab->ctb_nparts) {
 516                mask = cptab->ctb_nodemask;
 517                rotor = cptab->ctb_spread_rotor++;
 518        } else {
 519                mask = cptab->ctb_parts[cpt].cpt_nodemask;
 520                rotor = cptab->ctb_parts[cpt].cpt_spread_rotor++;
 521        }
 522
 523        weight = nodes_weight(*mask);
 524        LASSERT(weight > 0);
 525
 526        rotor %= weight;
 527
 528        for_each_node_mask(node, *mask) {
 529                if (rotor-- == 0)
 530                        return node;
 531        }
 532
 533        LBUG();
 534        return 0;
 535}
 536EXPORT_SYMBOL(cfs_cpt_spread_node);
 537
 538int
 539cfs_cpt_current(struct cfs_cpt_table *cptab, int remap)
 540{
 541        int     cpu = smp_processor_id();
 542        int     cpt = cptab->ctb_cpu2cpt[cpu];
 543
 544        if (cpt < 0) {
 545                if (!remap)
 546                        return cpt;
 547
 548                /* don't return negative value for safety of upper layer,
 549                 * instead we shadow the unknown cpu to a valid partition ID */
 550                cpt = cpu % cptab->ctb_nparts;
 551        }
 552
 553        return cpt;
 554}
 555EXPORT_SYMBOL(cfs_cpt_current);
 556
 557int
 558cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu)
 559{
 560        LASSERT(cpu >= 0 && cpu < nr_cpu_ids);
 561
 562        return cptab->ctb_cpu2cpt[cpu];
 563}
 564EXPORT_SYMBOL(cfs_cpt_of_cpu);
 565
 566int
 567cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt)
 568{
 569        cpumask_t       *cpumask;
 570        nodemask_t      *nodemask;
 571        int             rc;
 572        int             i;
 573
 574        LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
 575
 576        if (cpt == CFS_CPT_ANY) {
 577                cpumask = cptab->ctb_cpumask;
 578                nodemask = cptab->ctb_nodemask;
 579        } else {
 580                cpumask = cptab->ctb_parts[cpt].cpt_cpumask;
 581                nodemask = cptab->ctb_parts[cpt].cpt_nodemask;
 582        }
 583
 584        if (cpumask_any_and(cpumask, cpu_online_mask) >= nr_cpu_ids) {
 585                CERROR("No online CPU found in CPU partition %d, did someone do CPU hotplug on system? You might need to reload Lustre modules to keep system working well.\n",
 586                       cpt);
 587                return -EINVAL;
 588        }
 589
 590        for_each_online_cpu(i) {
 591                if (cpumask_test_cpu(i, cpumask))
 592                        continue;
 593
 594                rc = set_cpus_allowed_ptr(current, cpumask);
 595                set_mems_allowed(*nodemask);
 596                if (rc == 0)
 597                        schedule(); /* switch to allowed CPU */
 598
 599                return rc;
 600        }
 601
 602        /* don't need to set affinity because all online CPUs are covered */
 603        return 0;
 604}
 605EXPORT_SYMBOL(cfs_cpt_bind);
 606
 607/**
 608 * Choose max to \a number CPUs from \a node and set them in \a cpt.
 609 * We always prefer to choose CPU in the same core/socket.
 610 */
 611static int
 612cfs_cpt_choose_ncpus(struct cfs_cpt_table *cptab, int cpt,
 613                     cpumask_t *node, int number)
 614{
 615        cpumask_t       *socket = NULL;
 616        cpumask_t       *core = NULL;
 617        int             rc = 0;
 618        int             cpu;
 619
 620        LASSERT(number > 0);
 621
 622        if (number >= cpumask_weight(node)) {
 623                while (!cpumask_empty(node)) {
 624                        cpu = cpumask_first(node);
 625
 626                        rc = cfs_cpt_set_cpu(cptab, cpt, cpu);
 627                        if (!rc)
 628                                return -EINVAL;
 629                        cpumask_clear_cpu(cpu, node);
 630                }
 631                return 0;
 632        }
 633
 634        /* allocate scratch buffer */
 635        LIBCFS_ALLOC(socket, cpumask_size());
 636        LIBCFS_ALLOC(core, cpumask_size());
 637        if (socket == NULL || core == NULL) {
 638                rc = -ENOMEM;
 639                goto out;
 640        }
 641
 642        while (!cpumask_empty(node)) {
 643                cpu = cpumask_first(node);
 644
 645                /* get cpumask for cores in the same socket */
 646                cfs_cpu_core_siblings(cpu, socket);
 647                cpumask_and(socket, socket, node);
 648
 649                LASSERT(!cpumask_empty(socket));
 650
 651                while (!cpumask_empty(socket)) {
 652                        int     i;
 653
 654                        /* get cpumask for hts in the same core */
 655                        cfs_cpu_ht_siblings(cpu, core);
 656                        cpumask_and(core, core, node);
 657
 658                        LASSERT(!cpumask_empty(core));
 659
 660                        for_each_cpu(i, core) {
 661                                cpumask_clear_cpu(i, socket);
 662                                cpumask_clear_cpu(i, node);
 663
 664                                rc = cfs_cpt_set_cpu(cptab, cpt, i);
 665                                if (!rc) {
 666                                        rc = -EINVAL;
 667                                        goto out;
 668                                }
 669
 670                                if (--number == 0)
 671                                        goto out;
 672                        }
 673                        cpu = cpumask_first(socket);
 674                }
 675        }
 676
 677 out:
 678        if (socket != NULL)
 679                LIBCFS_FREE(socket, cpumask_size());
 680        if (core != NULL)
 681                LIBCFS_FREE(core, cpumask_size());
 682        return rc;
 683}
 684
 685#define CPT_WEIGHT_MIN  4u
 686
 687static unsigned int
 688cfs_cpt_num_estimate(void)
 689{
 690        unsigned nnode = num_online_nodes();
 691        unsigned ncpu  = num_online_cpus();
 692        unsigned ncpt;
 693
 694        if (ncpu <= CPT_WEIGHT_MIN) {
 695                ncpt = 1;
 696                goto out;
 697        }
 698
 699        /* generate reasonable number of CPU partitions based on total number
 700         * of CPUs, Preferred N should be power2 and match this condition:
 701         * 2 * (N - 1)^2 < NCPUS <= 2 * N^2 */
 702        for (ncpt = 2; ncpu > 2 * ncpt * ncpt; ncpt <<= 1)
 703                ;
 704
 705        if (ncpt <= nnode) { /* fat numa system */
 706                while (nnode > ncpt)
 707                        nnode >>= 1;
 708
 709        } else { /* ncpt > nnode */
 710                while ((nnode << 1) <= ncpt)
 711                        nnode <<= 1;
 712        }
 713
 714        ncpt = nnode;
 715
 716 out:
 717#if (BITS_PER_LONG == 32)
 718        /* config many CPU partitions on 32-bit system could consume
 719         * too much memory */
 720        ncpt = min(2U, ncpt);
 721#endif
 722        while (ncpu % ncpt != 0)
 723                ncpt--; /* worst case is 1 */
 724
 725        return ncpt;
 726}
 727
 728static struct cfs_cpt_table *
 729cfs_cpt_table_create(int ncpt)
 730{
 731        struct cfs_cpt_table *cptab = NULL;
 732        cpumask_t       *mask = NULL;
 733        int             cpt = 0;
 734        int             num;
 735        int             rc;
 736        int             i;
 737
 738        rc = cfs_cpt_num_estimate();
 739        if (ncpt <= 0)
 740                ncpt = rc;
 741
 742        if (ncpt > num_online_cpus() || ncpt > 4 * rc) {
 743                CWARN("CPU partition number %d is larger than suggested value (%d), your system may have performance issue or run out of memory while under pressure\n",
 744                      ncpt, rc);
 745        }
 746
 747        if (num_online_cpus() % ncpt != 0) {
 748                CERROR("CPU number %d is not multiple of cpu_npartition %d, please try different cpu_npartitions value or set pattern string by cpu_pattern=STRING\n",
 749                       (int)num_online_cpus(), ncpt);
 750                goto failed;
 751        }
 752
 753        cptab = cfs_cpt_table_alloc(ncpt);
 754        if (cptab == NULL) {
 755                CERROR("Failed to allocate CPU map(%d)\n", ncpt);
 756                goto failed;
 757        }
 758
 759        num = num_online_cpus() / ncpt;
 760        if (num == 0) {
 761                CERROR("CPU changed while setting CPU partition\n");
 762                goto failed;
 763        }
 764
 765        LIBCFS_ALLOC(mask, cpumask_size());
 766        if (mask == NULL) {
 767                CERROR("Failed to allocate scratch cpumask\n");
 768                goto failed;
 769        }
 770
 771        for_each_online_node(i) {
 772                cfs_node_to_cpumask(i, mask);
 773
 774                while (!cpumask_empty(mask)) {
 775                        struct cfs_cpu_partition *part;
 776                        int    n;
 777
 778                        if (cpt >= ncpt)
 779                                goto failed;
 780
 781                        part = &cptab->ctb_parts[cpt];
 782
 783                        n = num - cpumask_weight(part->cpt_cpumask);
 784                        LASSERT(n > 0);
 785
 786                        rc = cfs_cpt_choose_ncpus(cptab, cpt, mask, n);
 787                        if (rc < 0)
 788                                goto failed;
 789
 790                        LASSERT(num >= cpumask_weight(part->cpt_cpumask));
 791                        if (num == cpumask_weight(part->cpt_cpumask))
 792                                cpt++;
 793                }
 794        }
 795
 796        if (cpt != ncpt ||
 797            num != cpumask_weight(cptab->ctb_parts[ncpt - 1].cpt_cpumask)) {
 798                CERROR("Expect %d(%d) CPU partitions but got %d(%d), CPU hotplug/unplug while setting?\n",
 799                       cptab->ctb_nparts, num, cpt,
 800                       cpumask_weight(cptab->ctb_parts[ncpt - 1].cpt_cpumask));
 801                goto failed;
 802        }
 803
 804        LIBCFS_FREE(mask, cpumask_size());
 805
 806        return cptab;
 807
 808 failed:
 809        CERROR("Failed to setup CPU-partition-table with %d CPU-partitions, online HW nodes: %d, HW cpus: %d.\n",
 810               ncpt, num_online_nodes(), num_online_cpus());
 811
 812        if (mask != NULL)
 813                LIBCFS_FREE(mask, cpumask_size());
 814
 815        if (cptab != NULL)
 816                cfs_cpt_table_free(cptab);
 817
 818        return NULL;
 819}
 820
 821static struct cfs_cpt_table *
 822cfs_cpt_table_create_pattern(char *pattern)
 823{
 824        struct cfs_cpt_table    *cptab;
 825        char                    *str    = pattern;
 826        int                     node    = 0;
 827        int                     high;
 828        int                     ncpt;
 829        int                     c;
 830
 831        for (ncpt = 0;; ncpt++) { /* quick scan bracket */
 832                str = strchr(str, '[');
 833                if (str == NULL)
 834                        break;
 835                str++;
 836        }
 837
 838        str = cfs_trimwhite(pattern);
 839        if (*str == 'n' || *str == 'N') {
 840                pattern = str + 1;
 841                node = 1;
 842        }
 843
 844        if (ncpt == 0 ||
 845            (node && ncpt > num_online_nodes()) ||
 846            (!node && ncpt > num_online_cpus())) {
 847                CERROR("Invalid pattern %s, or too many partitions %d\n",
 848                       pattern, ncpt);
 849                return NULL;
 850        }
 851
 852        high = node ? MAX_NUMNODES - 1 : nr_cpu_ids - 1;
 853
 854        cptab = cfs_cpt_table_alloc(ncpt);
 855        if (cptab == NULL) {
 856                CERROR("Failed to allocate cpu partition table\n");
 857                return NULL;
 858        }
 859
 860        for (str = cfs_trimwhite(pattern), c = 0;; c++) {
 861                struct cfs_range_expr   *range;
 862                struct cfs_expr_list    *el;
 863                char                    *bracket = strchr(str, '[');
 864                int                     cpt;
 865                int                     rc;
 866                int                     i;
 867                int                     n;
 868
 869                if (bracket == NULL) {
 870                        if (*str != 0) {
 871                                CERROR("Invalid pattern %s\n", str);
 872                                goto failed;
 873                        } else if (c != ncpt) {
 874                                CERROR("expect %d partitions but found %d\n",
 875                                       ncpt, c);
 876                                goto failed;
 877                        }
 878                        break;
 879                }
 880
 881                if (sscanf(str, "%d%n", &cpt, &n) < 1) {
 882                        CERROR("Invalid cpu pattern %s\n", str);
 883                        goto failed;
 884                }
 885
 886                if (cpt < 0 || cpt >= ncpt) {
 887                        CERROR("Invalid partition id %d, total partitions %d\n",
 888                               cpt, ncpt);
 889                        goto failed;
 890                }
 891
 892                if (cfs_cpt_weight(cptab, cpt) != 0) {
 893                        CERROR("Partition %d has already been set.\n", cpt);
 894                        goto failed;
 895                }
 896
 897                str = cfs_trimwhite(str + n);
 898                if (str != bracket) {
 899                        CERROR("Invalid pattern %s\n", str);
 900                        goto failed;
 901                }
 902
 903                bracket = strchr(str, ']');
 904                if (bracket == NULL) {
 905                        CERROR("missing right bracket for cpt %d, %s\n",
 906                               cpt, str);
 907                        goto failed;
 908                }
 909
 910                if (cfs_expr_list_parse(str, (bracket - str) + 1,
 911                                        0, high, &el) != 0) {
 912                        CERROR("Can't parse number range: %s\n", str);
 913                        goto failed;
 914                }
 915
 916                list_for_each_entry(range, &el->el_exprs, re_link) {
 917                        for (i = range->re_lo; i <= range->re_hi; i++) {
 918                                if ((i - range->re_lo) % range->re_stride != 0)
 919                                        continue;
 920
 921                                rc = node ? cfs_cpt_set_node(cptab, cpt, i) :
 922                                            cfs_cpt_set_cpu(cptab, cpt, i);
 923                                if (!rc) {
 924                                        cfs_expr_list_free(el);
 925                                        goto failed;
 926                                }
 927                        }
 928                }
 929
 930                cfs_expr_list_free(el);
 931
 932                if (!cfs_cpt_online(cptab, cpt)) {
 933                        CERROR("No online CPU is found on partition %d\n", cpt);
 934                        goto failed;
 935                }
 936
 937                str = cfs_trimwhite(bracket + 1);
 938        }
 939
 940        return cptab;
 941
 942 failed:
 943        cfs_cpt_table_free(cptab);
 944        return NULL;
 945}
 946
 947#ifdef CONFIG_HOTPLUG_CPU
 948static int
 949cfs_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
 950{
 951        unsigned int  cpu = (unsigned long)hcpu;
 952        bool         warn;
 953
 954        switch (action) {
 955        case CPU_DEAD:
 956        case CPU_DEAD_FROZEN:
 957        case CPU_ONLINE:
 958        case CPU_ONLINE_FROZEN:
 959                spin_lock(&cpt_data.cpt_lock);
 960                cpt_data.cpt_version++;
 961                spin_unlock(&cpt_data.cpt_lock);
 962        default:
 963                if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) {
 964                        CDEBUG(D_INFO, "CPU changed [cpu %u action %lx]\n",
 965                               cpu, action);
 966                        break;
 967                }
 968
 969                mutex_lock(&cpt_data.cpt_mutex);
 970                /* if all HTs in a core are offline, it may break affinity */
 971                cfs_cpu_ht_siblings(cpu, cpt_data.cpt_cpumask);
 972                warn = cpumask_any_and(cpt_data.cpt_cpumask,
 973                                       cpu_online_mask) >= nr_cpu_ids;
 974                mutex_unlock(&cpt_data.cpt_mutex);
 975                CDEBUG(warn ? D_WARNING : D_INFO,
 976                       "Lustre: can't support CPU plug-out well now, performance and stability could be impacted [CPU %u action: %lx]\n",
 977                       cpu, action);
 978        }
 979
 980        return NOTIFY_OK;
 981}
 982
 983static struct notifier_block cfs_cpu_notifier = {
 984        .notifier_call  = cfs_cpu_notify,
 985        .priority       = 0
 986};
 987
 988#endif
 989
 990void
 991cfs_cpu_fini(void)
 992{
 993        if (cfs_cpt_table != NULL)
 994                cfs_cpt_table_free(cfs_cpt_table);
 995
 996#ifdef CONFIG_HOTPLUG_CPU
 997        unregister_hotcpu_notifier(&cfs_cpu_notifier);
 998#endif
 999        if (cpt_data.cpt_cpumask != NULL)
1000                LIBCFS_FREE(cpt_data.cpt_cpumask, cpumask_size());
1001}
1002
1003int
1004cfs_cpu_init(void)
1005{
1006        LASSERT(cfs_cpt_table == NULL);
1007
1008        memset(&cpt_data, 0, sizeof(cpt_data));
1009
1010        LIBCFS_ALLOC(cpt_data.cpt_cpumask, cpumask_size());
1011        if (cpt_data.cpt_cpumask == NULL) {
1012                CERROR("Failed to allocate scratch buffer\n");
1013                return -1;
1014        }
1015
1016        spin_lock_init(&cpt_data.cpt_lock);
1017        mutex_init(&cpt_data.cpt_mutex);
1018
1019#ifdef CONFIG_HOTPLUG_CPU
1020        register_hotcpu_notifier(&cfs_cpu_notifier);
1021#endif
1022
1023        if (*cpu_pattern != 0) {
1024                cfs_cpt_table = cfs_cpt_table_create_pattern(cpu_pattern);
1025                if (cfs_cpt_table == NULL) {
1026                        CERROR("Failed to create cptab from pattern %s\n",
1027                               cpu_pattern);
1028                        goto failed;
1029                }
1030
1031        } else {
1032                cfs_cpt_table = cfs_cpt_table_create(cpu_npartitions);
1033                if (cfs_cpt_table == NULL) {
1034                        CERROR("Failed to create ptable with npartitions %d\n",
1035                               cpu_npartitions);
1036                        goto failed;
1037                }
1038        }
1039
1040        spin_lock(&cpt_data.cpt_lock);
1041        if (cfs_cpt_table->ctb_version != cpt_data.cpt_version) {
1042                spin_unlock(&cpt_data.cpt_lock);
1043                CERROR("CPU hotplug/unplug during setup\n");
1044                goto failed;
1045        }
1046        spin_unlock(&cpt_data.cpt_lock);
1047
1048        LCONSOLE(0, "HW CPU cores: %d, npartitions: %d\n",
1049                 num_online_cpus(), cfs_cpt_number(cfs_cpt_table));
1050        return 0;
1051
1052 failed:
1053        cfs_cpu_fini();
1054        return -1;
1055}
1056
1057#endif
1058