linux/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c
/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 *
 * Author: liang@whamcloud.com
 */

#define DEBUG_SUBSYSTEM S_LNET

#include <linux/cpu.h>
#include <linux/sched.h>
#include "../../../include/linux/libcfs/libcfs.h"

#ifdef CONFIG_SMP

/**
 * modparam for setting number of partitions
 *
 *  0 : estimate best value based on cores or NUMA nodes
 *  1 : disable multiple partitions
 * >1 : specify number of partitions
 */
static int      cpu_npartitions;
module_param(cpu_npartitions, int, 0444);
MODULE_PARM_DESC(cpu_npartitions, "# of CPU partitions");

/**
 * modparam for setting CPU partition patterns:
 *
 * e.g. "0[0,1,2,3] 1[4,5,6,7]": the number before each bracket is a CPU
 *      partition ID, the numbers inside the brackets are processor IDs
 *      (cores or HTs)
 *
 * e.g. "N 0[0,1] 1[2,3]": the leading 'N' means the numbers inside the
 *      brackets are NUMA node IDs, while the number before each bracket is
 *      still a CPU partition ID.
 *
 * NB: if cpu_pattern is specified, cpu_npartitions is ignored.
 */
static char     *cpu_pattern = "";
module_param(cpu_pattern, charp, 0444);
MODULE_PARM_DESC(cpu_pattern, "CPU partitions pattern");
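
/*
 * Example (illustrative only): both parameters are normally set from
 * modprobe configuration for the module this file is built into
 * (typically "libcfs"), e.g.:
 *
 *   options libcfs cpu_npartitions=4
 *   options libcfs cpu_pattern="0[0,2,4,6] 1[1,3,5,7]"
 *
 * The pattern above puts even-numbered processors in partition 0 and
 * odd-numbered processors in partition 1; "N 0[0] 1[1]" would instead
 * split the table by NUMA node.
 */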

struct cfs_cpt_data {
        /* serialize hotplug etc */
        spinlock_t              cpt_lock;
        /* reserved for hotplug */
        unsigned long           cpt_version;
        /* mutex to protect cpt_cpumask */
        struct mutex            cpt_mutex;
        /* scratch buffer for set/unset_node */
        cpumask_t               *cpt_cpumask;
};

static struct cfs_cpt_data      cpt_data;

static void cfs_cpu_core_siblings(int cpu, cpumask_t *mask)
{
        /* return cpumask of cores in the same socket */
        cpumask_copy(mask, topology_core_cpumask(cpu));
}

/* return cpumask of HTs in the same core */
static void cfs_cpu_ht_siblings(int cpu, cpumask_t *mask)
{
        cpumask_copy(mask, topology_thread_cpumask(cpu));
}

static void cfs_node_to_cpumask(int node, cpumask_t *mask)
{
        cpumask_copy(mask, cpumask_of_node(node));
}

void
cfs_cpt_table_free(struct cfs_cpt_table *cptab)
{
        int     i;

        if (cptab->ctb_cpu2cpt != NULL) {
                LIBCFS_FREE(cptab->ctb_cpu2cpt,
                            num_possible_cpus() *
                            sizeof(cptab->ctb_cpu2cpt[0]));
        }

        for (i = 0; cptab->ctb_parts != NULL && i < cptab->ctb_nparts; i++) {
                struct cfs_cpu_partition *part = &cptab->ctb_parts[i];

                if (part->cpt_nodemask != NULL) {
                        LIBCFS_FREE(part->cpt_nodemask,
                                    sizeof(*part->cpt_nodemask));
                }

                if (part->cpt_cpumask != NULL)
                        LIBCFS_FREE(part->cpt_cpumask, cpumask_size());
        }

        if (cptab->ctb_parts != NULL) {
                LIBCFS_FREE(cptab->ctb_parts,
                            cptab->ctb_nparts * sizeof(cptab->ctb_parts[0]));
        }

        if (cptab->ctb_nodemask != NULL)
                LIBCFS_FREE(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask));
        if (cptab->ctb_cpumask != NULL)
                LIBCFS_FREE(cptab->ctb_cpumask, cpumask_size());

        LIBCFS_FREE(cptab, sizeof(*cptab));
}
EXPORT_SYMBOL(cfs_cpt_table_free);

struct cfs_cpt_table *
cfs_cpt_table_alloc(unsigned int ncpt)
{
        struct cfs_cpt_table *cptab;
        int     i;

        LIBCFS_ALLOC(cptab, sizeof(*cptab));
        if (cptab == NULL)
                return NULL;

        cptab->ctb_nparts = ncpt;

        LIBCFS_ALLOC(cptab->ctb_cpumask, cpumask_size());
        LIBCFS_ALLOC(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask));

        if (cptab->ctb_cpumask == NULL || cptab->ctb_nodemask == NULL)
                goto failed;

        LIBCFS_ALLOC(cptab->ctb_cpu2cpt,
                     num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0]));
        if (cptab->ctb_cpu2cpt == NULL)
                goto failed;

        memset(cptab->ctb_cpu2cpt, -1,
               num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0]));

        LIBCFS_ALLOC(cptab->ctb_parts, ncpt * sizeof(cptab->ctb_parts[0]));
        if (cptab->ctb_parts == NULL)
                goto failed;

        for (i = 0; i < ncpt; i++) {
                struct cfs_cpu_partition *part = &cptab->ctb_parts[i];

                LIBCFS_ALLOC(part->cpt_cpumask, cpumask_size());
                LIBCFS_ALLOC(part->cpt_nodemask, sizeof(*part->cpt_nodemask));
                if (part->cpt_cpumask == NULL || part->cpt_nodemask == NULL)
                        goto failed;
        }

        spin_lock(&cpt_data.cpt_lock);
        /* Reserved for hotplug */
        cptab->ctb_version = cpt_data.cpt_version;
        spin_unlock(&cpt_data.cpt_lock);

        return cptab;

 failed:
        cfs_cpt_table_free(cptab);
        return NULL;
}
EXPORT_SYMBOL(cfs_cpt_table_alloc);

int
cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len)
{
        char    *tmp = buf;
        int     rc = 0;
        int     i;
        int     j;

        for (i = 0; i < cptab->ctb_nparts; i++) {
                if (len > 0) {
                        rc = snprintf(tmp, len, "%d\t: ", i);
                        len -= rc;
                }

                if (len <= 0) {
                        rc = -EFBIG;
                        goto out;
                }

                tmp += rc;
                for_each_cpu_mask(j, *cptab->ctb_parts[i].cpt_cpumask) {
                        rc = snprintf(tmp, len, "%d ", j);
                        len -= rc;
                        if (len <= 0) {
                                rc = -EFBIG;
                                goto out;
                        }
                        tmp += rc;
                }

                *tmp = '\n';
                tmp++;
                len--;
        }

 out:
        if (rc < 0)
                return rc;

        return tmp - buf;
}
EXPORT_SYMBOL(cfs_cpt_table_print);

int
cfs_cpt_number(struct cfs_cpt_table *cptab)
{
        return cptab->ctb_nparts;
}
EXPORT_SYMBOL(cfs_cpt_number);

int
cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt)
{
        LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));

        return cpt == CFS_CPT_ANY ?
               cpus_weight(*cptab->ctb_cpumask) :
               cpus_weight(*cptab->ctb_parts[cpt].cpt_cpumask);
}
EXPORT_SYMBOL(cfs_cpt_weight);

int
cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt)
{
        LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));

        return cpt == CFS_CPT_ANY ?
               any_online_cpu(*cptab->ctb_cpumask) != NR_CPUS :
               any_online_cpu(*cptab->ctb_parts[cpt].cpt_cpumask) != NR_CPUS;
}
EXPORT_SYMBOL(cfs_cpt_online);

cpumask_t *
cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt)
{
        LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));

        return cpt == CFS_CPT_ANY ?
               cptab->ctb_cpumask : cptab->ctb_parts[cpt].cpt_cpumask;
}
EXPORT_SYMBOL(cfs_cpt_cpumask);

nodemask_t *
cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt)
{
        LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));

        return cpt == CFS_CPT_ANY ?
               cptab->ctb_nodemask : cptab->ctb_parts[cpt].cpt_nodemask;
}
EXPORT_SYMBOL(cfs_cpt_nodemask);

int
cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
{
        int     node;

        LASSERT(cpt >= 0 && cpt < cptab->ctb_nparts);

        if (cpu < 0 || cpu >= NR_CPUS || !cpu_online(cpu)) {
                CDEBUG(D_INFO, "CPU %d is invalid or it's offline\n", cpu);
                return 0;
        }

        if (cptab->ctb_cpu2cpt[cpu] != -1) {
                CDEBUG(D_INFO, "CPU %d is already in partition %d\n",
                       cpu, cptab->ctb_cpu2cpt[cpu]);
                return 0;
        }

        cptab->ctb_cpu2cpt[cpu] = cpt;

        LASSERT(!cpu_isset(cpu, *cptab->ctb_cpumask));
        LASSERT(!cpu_isset(cpu, *cptab->ctb_parts[cpt].cpt_cpumask));

        cpu_set(cpu, *cptab->ctb_cpumask);
        cpu_set(cpu, *cptab->ctb_parts[cpt].cpt_cpumask);

        node = cpu_to_node(cpu);

        /* first CPU of @node in this CPT table */
        if (!node_isset(node, *cptab->ctb_nodemask))
                node_set(node, *cptab->ctb_nodemask);

        /* first CPU of @node in this partition */
        if (!node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask))
                node_set(node, *cptab->ctb_parts[cpt].cpt_nodemask);

        return 1;
}
EXPORT_SYMBOL(cfs_cpt_set_cpu);

void
cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
{
        int     node;
        int     i;

        LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));

        if (cpu < 0 || cpu >= NR_CPUS) {
                CDEBUG(D_INFO, "Invalid CPU id %d\n", cpu);
                return;
        }

        if (cpt == CFS_CPT_ANY) {
                /* caller doesn't know the partition ID */
                cpt = cptab->ctb_cpu2cpt[cpu];
                if (cpt < 0) { /* not set in this CPT-table */
                        CDEBUG(D_INFO, "Try to unset cpu %d which is not in CPT-table %p\n",
                               cpu, cptab);
                        return;
                }
        } else if (cpt != cptab->ctb_cpu2cpt[cpu]) {
                CDEBUG(D_INFO,
                       "CPU %d is not in cpu-partition %d\n", cpu, cpt);
                return;
        }

        LASSERT(cpu_isset(cpu, *cptab->ctb_parts[cpt].cpt_cpumask));
        LASSERT(cpu_isset(cpu, *cptab->ctb_cpumask));

        cpu_clear(cpu, *cptab->ctb_parts[cpt].cpt_cpumask);
        cpu_clear(cpu, *cptab->ctb_cpumask);
        cptab->ctb_cpu2cpt[cpu] = -1;

        node = cpu_to_node(cpu);

        LASSERT(node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask));
        LASSERT(node_isset(node, *cptab->ctb_nodemask));

        for_each_cpu_mask(i, *cptab->ctb_parts[cpt].cpt_cpumask) {
                /* does this CPT have another CPU belonging to this node? */
                if (cpu_to_node(i) == node)
                        break;
        }

        if (i == NR_CPUS)
                node_clear(node, *cptab->ctb_parts[cpt].cpt_nodemask);

        for_each_cpu_mask(i, *cptab->ctb_cpumask) {
                /* does this CPT-table have another CPU on this node? */
                if (cpu_to_node(i) == node)
                        break;
        }

        if (i == NR_CPUS)
                node_clear(node, *cptab->ctb_nodemask);
}
EXPORT_SYMBOL(cfs_cpt_unset_cpu);

int
cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask)
{
        int     i;

        if (cpus_weight(*mask) == 0 || any_online_cpu(*mask) == NR_CPUS) {
                CDEBUG(D_INFO, "No online CPU is found in the CPU mask for CPU partition %d\n",
                       cpt);
                return 0;
        }

        for_each_cpu_mask(i, *mask) {
                if (!cfs_cpt_set_cpu(cptab, cpt, i))
                        return 0;
        }

        return 1;
}
EXPORT_SYMBOL(cfs_cpt_set_cpumask);

void
cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask)
{
        int     i;

        for_each_cpu_mask(i, *mask)
                cfs_cpt_unset_cpu(cptab, cpt, i);
}
EXPORT_SYMBOL(cfs_cpt_unset_cpumask);

int
cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node)
{
        cpumask_t       *mask;
        int             rc;

        if (node < 0 || node >= MAX_NUMNODES) {
                CDEBUG(D_INFO,
                       "Invalid NUMA id %d for CPU partition %d\n", node, cpt);
                return 0;
        }

        mutex_lock(&cpt_data.cpt_mutex);

        mask = cpt_data.cpt_cpumask;
        cfs_node_to_cpumask(node, mask);

        rc = cfs_cpt_set_cpumask(cptab, cpt, mask);

        mutex_unlock(&cpt_data.cpt_mutex);

        return rc;
}
EXPORT_SYMBOL(cfs_cpt_set_node);

void
cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node)
{
        cpumask_t *mask;

        if (node < 0 || node >= MAX_NUMNODES) {
                CDEBUG(D_INFO,
                       "Invalid NUMA id %d for CPU partition %d\n", node, cpt);
                return;
        }

        mutex_lock(&cpt_data.cpt_mutex);

        mask = cpt_data.cpt_cpumask;
        cfs_node_to_cpumask(node, mask);

        cfs_cpt_unset_cpumask(cptab, cpt, mask);

        mutex_unlock(&cpt_data.cpt_mutex);
}
EXPORT_SYMBOL(cfs_cpt_unset_node);

int
cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask)
{
        int     i;

        for_each_node_mask(i, *mask) {
                if (!cfs_cpt_set_node(cptab, cpt, i))
                        return 0;
        }

        return 1;
}
EXPORT_SYMBOL(cfs_cpt_set_nodemask);

void
cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask)
{
        int     i;

        for_each_node_mask(i, *mask)
                cfs_cpt_unset_node(cptab, cpt, i);
}
EXPORT_SYMBOL(cfs_cpt_unset_nodemask);

void
cfs_cpt_clear(struct cfs_cpt_table *cptab, int cpt)
{
        int     last;
        int     i;

        if (cpt == CFS_CPT_ANY) {
                last = cptab->ctb_nparts - 1;
                cpt = 0;
        } else {
                last = cpt;
        }

        for (; cpt <= last; cpt++) {
                for_each_cpu_mask(i, *cptab->ctb_parts[cpt].cpt_cpumask)
                        cfs_cpt_unset_cpu(cptab, cpt, i);
        }
}
EXPORT_SYMBOL(cfs_cpt_clear);

int
cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt)
{
        nodemask_t      *mask;
        int             weight;
        int             rotor;
        int             node;

        /* convert CPU partition ID to HW node id */
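        /*
         * The spread rotor is a monotonically increasing counter; taking it
         * modulo the weight of the chosen node mask below round-robins
         * successive calls across the nodes in that mask.
         */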

        if (cpt < 0 || cpt >= cptab->ctb_nparts) {
                mask = cptab->ctb_nodemask;
                rotor = cptab->ctb_spread_rotor++;
        } else {
                mask = cptab->ctb_parts[cpt].cpt_nodemask;
                rotor = cptab->ctb_parts[cpt].cpt_spread_rotor++;
        }

        weight = nodes_weight(*mask);
        LASSERT(weight > 0);

        rotor %= weight;

        for_each_node_mask(node, *mask) {
                if (rotor-- == 0)
                        return node;
        }

        LBUG();
        return 0;
}
EXPORT_SYMBOL(cfs_cpt_spread_node);

int
cfs_cpt_current(struct cfs_cpt_table *cptab, int remap)
{
        int     cpu = smp_processor_id();
        int     cpt = cptab->ctb_cpu2cpt[cpu];

        if (cpt < 0) {
                if (!remap)
                        return cpt;

                /* don't return a negative value for the safety of the upper
                 * layer; instead map the unknown CPU to a valid partition ID */
                cpt = cpu % cptab->ctb_nparts;
        }

        return cpt;
}
EXPORT_SYMBOL(cfs_cpt_current);

int
cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu)
{
        LASSERT(cpu >= 0 && cpu < NR_CPUS);

        return cptab->ctb_cpu2cpt[cpu];
}
EXPORT_SYMBOL(cfs_cpt_of_cpu);

int
cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt)
{
        cpumask_t       *cpumask;
        nodemask_t      *nodemask;
        int             rc;
        int             i;

        LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));

        if (cpt == CFS_CPT_ANY) {
                cpumask = cptab->ctb_cpumask;
                nodemask = cptab->ctb_nodemask;
        } else {
                cpumask = cptab->ctb_parts[cpt].cpt_cpumask;
                nodemask = cptab->ctb_parts[cpt].cpt_nodemask;
        }

        if (any_online_cpu(*cpumask) == NR_CPUS) {
                CERROR("No online CPU found in CPU partition %d, did someone do CPU hotplug on the system? You might need to reload the Lustre modules to keep the system working well.\n",
                       cpt);
                return -EINVAL;
        }

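        /*
         * Rebind only if at least one online CPU falls outside the chosen
         * cpumask; if the mask already covers every online CPU, restricting
         * the affinity of the current task would be a no-op.
         */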
        for_each_online_cpu(i) {
                if (cpu_isset(i, *cpumask))
                        continue;

                rc = set_cpus_allowed_ptr(current, cpumask);
                set_mems_allowed(*nodemask);
                if (rc == 0)
                        schedule(); /* switch to allowed CPU */

                return rc;
        }

        /* don't need to set affinity because all online CPUs are covered */
        return 0;
}
EXPORT_SYMBOL(cfs_cpt_bind);

/**
 * Choose at most \a number CPUs from \a node and set them in \a cpt.
 * We always prefer CPUs in the same core/socket.
 */
static int
cfs_cpt_choose_ncpus(struct cfs_cpt_table *cptab, int cpt,
                     cpumask_t *node, int number)
{
        cpumask_t       *socket = NULL;
        cpumask_t       *core = NULL;
        int             rc = 0;
        int             cpu;

        LASSERT(number > 0);

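        /*
         * If the request covers the whole node mask, simply take every CPU.
         * Otherwise walk the node socket by socket and, within each socket,
         * core by core, consuming all HT siblings of a core before moving
         * on, so that the selected CPUs stay topologically close together.
         */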
        if (number >= cpus_weight(*node)) {
                while (!cpus_empty(*node)) {
                        cpu = first_cpu(*node);

                        rc = cfs_cpt_set_cpu(cptab, cpt, cpu);
                        if (!rc)
                                return -EINVAL;
                        cpu_clear(cpu, *node);
                }
                return 0;
        }

        /* allocate scratch buffer */
        LIBCFS_ALLOC(socket, cpumask_size());
        LIBCFS_ALLOC(core, cpumask_size());
        if (socket == NULL || core == NULL) {
                rc = -ENOMEM;
                goto out;
        }

        while (!cpus_empty(*node)) {
                cpu = first_cpu(*node);

                /* get cpumask for cores in the same socket */
                cfs_cpu_core_siblings(cpu, socket);
                cpus_and(*socket, *socket, *node);

                LASSERT(!cpus_empty(*socket));

                while (!cpus_empty(*socket)) {
                        int     i;

                        /* get cpumask for HTs in the same core */
                        cfs_cpu_ht_siblings(cpu, core);
                        cpus_and(*core, *core, *node);

                        LASSERT(!cpus_empty(*core));

                        for_each_cpu_mask(i, *core) {
                                cpu_clear(i, *socket);
                                cpu_clear(i, *node);

                                rc = cfs_cpt_set_cpu(cptab, cpt, i);
                                if (!rc) {
                                        rc = -EINVAL;
                                        goto out;
                                }

                                if (--number == 0)
                                        goto out;
                        }
                        cpu = first_cpu(*socket);
                }
        }

 out:
        if (socket != NULL)
                LIBCFS_FREE(socket, cpumask_size());
        if (core != NULL)
                LIBCFS_FREE(core, cpumask_size());
        return rc;
}

#define CPT_WEIGHT_MIN  4u

static unsigned int
cfs_cpt_num_estimate(void)
{
        unsigned nnode = num_online_nodes();
        unsigned ncpu  = num_online_cpus();
        unsigned ncpt;

        if (ncpu <= CPT_WEIGHT_MIN) {
                ncpt = 1;
                goto out;
        }

        /* generate a reasonable number of CPU partitions based on the total
         * number of CPUs; the preferred N is a power of two that satisfies:
         * 2 * (N - 1)^2 < NCPUS <= 2 * N^2 */
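        /* for example, with 24 online CPUs the loop below stops at N = 4:
         * 24 > 2 * 2^2 = 8 but 24 <= 2 * 4^2 = 32 */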
        for (ncpt = 2; ncpu > 2 * ncpt * ncpt; ncpt <<= 1) {}

        if (ncpt <= nnode) { /* fat numa system */
                while (nnode > ncpt)
                        nnode >>= 1;

        } else { /* ncpt > nnode */
                while ((nnode << 1) <= ncpt)
                        nnode <<= 1;
        }

        ncpt = nnode;

 out:
#if (BITS_PER_LONG == 32)
        /* configuring many CPU partitions on a 32-bit system could consume
         * too much memory */
        ncpt = min(2U, ncpt);
#endif
        while (ncpu % ncpt != 0)
                ncpt--; /* worst case is 1 */

        return ncpt;
}

static struct cfs_cpt_table *
cfs_cpt_table_create(int ncpt)
{
        struct cfs_cpt_table *cptab = NULL;
        cpumask_t       *mask = NULL;
        int             cpt = 0;
        int             num;
        int             rc;
        int             i;

        rc = cfs_cpt_num_estimate();
        if (ncpt <= 0)
                ncpt = rc;

        if (ncpt > num_online_cpus() || ncpt > 4 * rc) {
                CWARN("CPU partition number %d is larger than suggested value (%d), your system may have performance issues or run out of memory while under pressure\n",
                      ncpt, rc);
        }

        if (num_online_cpus() % ncpt != 0) {
                CERROR("CPU number %d is not a multiple of cpu_npartitions %d, please try a different cpu_npartitions value or set a pattern string with cpu_pattern=STRING\n",
                       (int)num_online_cpus(), ncpt);
                goto failed;
        }

        cptab = cfs_cpt_table_alloc(ncpt);
        if (cptab == NULL) {
                CERROR("Failed to allocate CPU map(%d)\n", ncpt);
                goto failed;
        }

        num = num_online_cpus() / ncpt;
        if (num == 0) {
                CERROR("CPU changed while setting CPU partition\n");
                goto failed;
        }

        LIBCFS_ALLOC(mask, cpumask_size());
        if (mask == NULL) {
                CERROR("Failed to allocate scratch cpumask\n");
                goto failed;
        }

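        /*
         * Walk the online NUMA nodes and carve their CPUs into partitions of
         * "num" CPUs each: cfs_cpt_choose_ncpus() tops up the current
         * partition from the node's remaining CPUs, and the partition index
         * only advances once that partition holds exactly "num" CPUs.
         */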
        for_each_online_node(i) {
                cfs_node_to_cpumask(i, mask);

                while (!cpus_empty(*mask)) {
                        struct cfs_cpu_partition *part;
                        int    n;

                        if (cpt >= ncpt)
                                goto failed;

                        part = &cptab->ctb_parts[cpt];

                        n = num - cpus_weight(*part->cpt_cpumask);
                        LASSERT(n > 0);

                        rc = cfs_cpt_choose_ncpus(cptab, cpt, mask, n);
                        if (rc < 0)
                                goto failed;

                        LASSERT(num >= cpus_weight(*part->cpt_cpumask));
                        if (num == cpus_weight(*part->cpt_cpumask))
                                cpt++;
                }
        }

        if (cpt != ncpt ||
            num != cpus_weight(*cptab->ctb_parts[ncpt - 1].cpt_cpumask)) {
                CERROR("Expect %d(%d) CPU partitions but got %d(%d), CPU hotplug/unplug while setting?\n",
                       cptab->ctb_nparts, num, cpt,
                       cpus_weight(*cptab->ctb_parts[ncpt - 1].cpt_cpumask));
                goto failed;
        }

        LIBCFS_FREE(mask, cpumask_size());

        return cptab;

 failed:
        CERROR("Failed to setup CPU-partition-table with %d CPU-partitions, online HW nodes: %d, HW cpus: %d.\n",
               ncpt, num_online_nodes(), num_online_cpus());

        if (mask != NULL)
                LIBCFS_FREE(mask, cpumask_size());

        if (cptab != NULL)
                cfs_cpt_table_free(cptab);

        return NULL;
}

static struct cfs_cpt_table *
cfs_cpt_table_create_pattern(char *pattern)
{
        struct cfs_cpt_table    *cptab;
        char                    *str    = pattern;
        int                     node    = 0;
        int                     high;
        int                     ncpt;
        int                     c;

        /* quick scan: count '[' to get the number of partitions */
        for (ncpt = 0;; ncpt++) {
                str = strchr(str, '[');
                if (str == NULL)
                        break;
                str++;
        }

        str = cfs_trimwhite(pattern);
        if (*str == 'n' || *str == 'N') {
                pattern = str + 1;
                node = 1;
        }

        if (ncpt == 0 ||
            (node && ncpt > num_online_nodes()) ||
            (!node && ncpt > num_online_cpus())) {
                CERROR("Invalid pattern %s, or too many partitions %d\n",
                       pattern, ncpt);
                return NULL;
        }

        high = node ? MAX_NUMNODES - 1 : NR_CPUS - 1;

        cptab = cfs_cpt_table_alloc(ncpt);
        if (cptab == NULL) {
                CERROR("Failed to allocate cpu partition table\n");
                return NULL;
        }

        for (str = cfs_trimwhite(pattern), c = 0;; c++) {
                struct cfs_range_expr   *range;
                struct cfs_expr_list    *el;
                char                    *bracket = strchr(str, '[');
                int                     cpt;
                int                     rc;
                int                     i;
                int                     n;

                if (bracket == NULL) {
                        if (*str != 0) {
                                CERROR("Invalid pattern %s\n", str);
                                goto failed;
                        } else if (c != ncpt) {
                                CERROR("expect %d partitions but found %d\n",
                                       ncpt, c);
                                goto failed;
                        }
                        break;
                }

                if (sscanf(str, "%d%n", &cpt, &n) < 1) {
                        CERROR("Invalid cpu pattern %s\n", str);
                        goto failed;
                }

                if (cpt < 0 || cpt >= ncpt) {
                        CERROR("Invalid partition id %d, total partitions %d\n",
                               cpt, ncpt);
                        goto failed;
                }

                if (cfs_cpt_weight(cptab, cpt) != 0) {
                        CERROR("Partition %d has already been set.\n", cpt);
                        goto failed;
                }

                str = cfs_trimwhite(str + n);
                if (str != bracket) {
                        CERROR("Invalid pattern %s\n", str);
                        goto failed;
                }

                bracket = strchr(str, ']');
                if (bracket == NULL) {
                        CERROR("missing right bracket for cpt %d, %s\n",
                               cpt, str);
                        goto failed;
                }

                if (cfs_expr_list_parse(str, (bracket - str) + 1,
                                        0, high, &el) != 0) {
                        CERROR("Can't parse number range: %s\n", str);
                        goto failed;
                }

                list_for_each_entry(range, &el->el_exprs, re_link) {
                        for (i = range->re_lo; i <= range->re_hi; i++) {
                                if ((i - range->re_lo) % range->re_stride != 0)
                                        continue;

                                rc = node ? cfs_cpt_set_node(cptab, cpt, i) :
                                            cfs_cpt_set_cpu(cptab, cpt, i);
                                if (!rc) {
                                        cfs_expr_list_free(el);
                                        goto failed;
                                }
                        }
                }

                cfs_expr_list_free(el);

                if (!cfs_cpt_online(cptab, cpt)) {
                        CERROR("No online CPU is found on partition %d\n", cpt);
                        goto failed;
                }

                str = cfs_trimwhite(bracket + 1);
        }

        return cptab;

 failed:
        cfs_cpt_table_free(cptab);
        return NULL;
}

#ifdef CONFIG_HOTPLUG_CPU
static int
cfs_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
{
        unsigned int  cpu = (unsigned long)hcpu;
        bool         warn;

        switch (action) {
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
        case CPU_ONLINE:
        case CPU_ONLINE_FROZEN:
                spin_lock(&cpt_data.cpt_lock);
                cpt_data.cpt_version++;
                spin_unlock(&cpt_data.cpt_lock);
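                /* fall through */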
        default:
                if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) {
                        CDEBUG(D_INFO, "CPU changed [cpu %u action %lx]\n",
                               cpu, action);
                        break;
                }

                mutex_lock(&cpt_data.cpt_mutex);
                /* if all HTs in a core are offline, it may break affinity */
                cfs_cpu_ht_siblings(cpu, cpt_data.cpt_cpumask);
                warn = any_online_cpu(*cpt_data.cpt_cpumask) >= nr_cpu_ids;
                mutex_unlock(&cpt_data.cpt_mutex);
                CDEBUG(warn ? D_WARNING : D_INFO,
                       "Lustre: can't support CPU plug-out well now, performance and stability could be impacted [CPU %u action: %lx]\n",
                       cpu, action);
        }

        return NOTIFY_OK;
}

static struct notifier_block cfs_cpu_notifier = {
        .notifier_call  = cfs_cpu_notify,
        .priority       = 0
};

#endif

void
cfs_cpu_fini(void)
{
        if (cfs_cpt_table != NULL)
                cfs_cpt_table_free(cfs_cpt_table);

#ifdef CONFIG_HOTPLUG_CPU
        unregister_hotcpu_notifier(&cfs_cpu_notifier);
#endif
        if (cpt_data.cpt_cpumask != NULL)
                LIBCFS_FREE(cpt_data.cpt_cpumask, cpumask_size());
}

int
cfs_cpu_init(void)
{
        LASSERT(cfs_cpt_table == NULL);

        memset(&cpt_data, 0, sizeof(cpt_data));

        LIBCFS_ALLOC(cpt_data.cpt_cpumask, cpumask_size());
        if (cpt_data.cpt_cpumask == NULL) {
                CERROR("Failed to allocate scratch buffer\n");
                return -1;
        }

        spin_lock_init(&cpt_data.cpt_lock);
        mutex_init(&cpt_data.cpt_mutex);

#ifdef CONFIG_HOTPLUG_CPU
        register_hotcpu_notifier(&cfs_cpu_notifier);
#endif

        if (*cpu_pattern != 0) {
                cfs_cpt_table = cfs_cpt_table_create_pattern(cpu_pattern);
                if (cfs_cpt_table == NULL) {
                        CERROR("Failed to create cptab from pattern %s\n",
                               cpu_pattern);
                        goto failed;
                }

        } else {
                cfs_cpt_table = cfs_cpt_table_create(cpu_npartitions);
                if (cfs_cpt_table == NULL) {
                        CERROR("Failed to create ptable with npartitions %d\n",
                               cpu_npartitions);
                        goto failed;
                }
        }

        spin_lock(&cpt_data.cpt_lock);
        if (cfs_cpt_table->ctb_version != cpt_data.cpt_version) {
                spin_unlock(&cpt_data.cpt_lock);
                CERROR("CPU hotplug/unplug during setup\n");
                goto failed;
        }
        spin_unlock(&cpt_data.cpt_lock);

        LCONSOLE(0, "HW CPU cores: %d, npartitions: %d\n",
                 num_online_cpus(), cfs_cpt_number(cfs_cpt_table));
        return 0;

 failed:
        cfs_cpu_fini();
        return -1;
}

#endif
