linux/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c
<<
>>
Prefs
   1/*
   2 * GPL HEADER START
   3 *
   4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5 *
   6 * This program is free software; you can redistribute it and/or modify
   7 * it under the terms of the GNU General Public License version 2 only,
   8 * as published by the Free Software Foundation.
   9 *
  10 * This program is distributed in the hope that it will be useful, but
  11 * WITHOUT ANY WARRANTY; without even the implied warranty of
  12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13 * General Public License version 2 for more details (a copy is included
  14 * in the LICENSE file that accompanied this code).
  15 *
  16 * You should have received a copy of the GNU General Public License
  17 * version 2 along with this program; if not, write to the
  18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19 * Boston, MA 021110-1307, USA
  20 *
  21 * GPL HEADER END
  22 */
  23/*
  24 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  25 * Copyright (c) 2012, Intel Corporation.
  26 */
  27/*
  28 * This file is part of Lustre, http://www.lustre.org/
  29 * Lustre is a trademark of Sun Microsystems, Inc.
  30 *
  31 * Author: liang@whamcloud.com
  32 */
  33
  34#define DEBUG_SUBSYSTEM S_LNET
  35
  36#include <linux/cpu.h>
  37#include <linux/sched.h>
  38#include <linux/libcfs/libcfs.h>
  39
  40#ifdef CONFIG_SMP
  41
  42/**
  43 * modparam for setting number of partitions
  44 *
  45 *  0 : estimate best value based on cores or NUMA nodes
  46 *  1 : disable multiple partitions
  47 * >1 : specify number of partitions
  48 */
  49static int      cpu_npartitions;
  50CFS_MODULE_PARM(cpu_npartitions, "i", int, 0444, "# of CPU partitions");
  51
  52/**
  53 * modparam for setting CPU partitions patterns:
  54 *
  55 * i.e: "0[0,1,2,3] 1[4,5,6,7]", number before bracket is CPU partition ID,
  56 *      number in bracket is processor ID (core or HT)
  57 *
  58 * i.e: "N 0[0,1] 1[2,3]" the first character 'N' means numbers in bracket
  59 *       are NUMA node ID, number before bracket is CPU partition ID.
  60 *
  61 * NB: If user specified cpu_pattern, cpu_npartitions will be ignored
  62 */
  63static char     *cpu_pattern = "";
  64CFS_MODULE_PARM(cpu_pattern, "s", charp, 0444, "CPU partitions pattern");
  65
  66struct cfs_cpt_data {
  67        /* serialize hotplug etc */
  68        spinlock_t              cpt_lock;
  69        /* reserved for hotplug */
  70        unsigned long           cpt_version;
  71        /* mutex to protect cpt_cpumask */
  72        struct semaphore        cpt_mutex;
  73        /* scratch buffer for set/unset_node */
  74        cpumask_t               *cpt_cpumask;
  75};
  76
  77static struct cfs_cpt_data      cpt_data;
  78
  79void
  80cfs_cpu_core_siblings(int cpu, cpumask_t *mask)
  81{
  82        /* return cpumask of cores in the same socket */
  83        cpumask_copy(mask, topology_core_cpumask(cpu));
  84}
  85EXPORT_SYMBOL(cfs_cpu_core_siblings);
  86
  87/* return number of cores in the same socket of \a cpu */
  88int
  89cfs_cpu_core_nsiblings(int cpu)
  90{
  91        int     num;
  92
  93        down(&cpt_data.cpt_mutex);
  94
  95        cfs_cpu_core_siblings(cpu, cpt_data.cpt_cpumask);
  96        num = cpus_weight(*cpt_data.cpt_cpumask);
  97
  98        up(&cpt_data.cpt_mutex);
  99
 100        return num;
 101}
 102EXPORT_SYMBOL(cfs_cpu_core_nsiblings);
 103
 104/* return cpumask of HTs in the same core */
 105void
 106cfs_cpu_ht_siblings(int cpu, cpumask_t *mask)
 107{
 108        cpumask_copy(mask, topology_thread_cpumask(cpu));
 109}
 110EXPORT_SYMBOL(cfs_cpu_ht_siblings);
 111
 112/* return number of HTs in the same core of \a cpu */
 113int
 114cfs_cpu_ht_nsiblings(int cpu)
 115{
 116        int     num;
 117
 118        down(&cpt_data.cpt_mutex);
 119
 120        cfs_cpu_ht_siblings(cpu, cpt_data.cpt_cpumask);
 121        num = cpus_weight(*cpt_data.cpt_cpumask);
 122
 123        up(&cpt_data.cpt_mutex);
 124
 125        return num;
 126}
 127EXPORT_SYMBOL(cfs_cpu_ht_nsiblings);
 128
 129void
 130cfs_node_to_cpumask(int node, cpumask_t *mask)
 131{
 132        cpumask_copy(mask, cpumask_of_node(node));
 133}
 134EXPORT_SYMBOL(cfs_node_to_cpumask);
 135
 136void
 137cfs_cpt_table_free(struct cfs_cpt_table *cptab)
 138{
 139        int     i;
 140
 141        if (cptab->ctb_cpu2cpt != NULL) {
 142                LIBCFS_FREE(cptab->ctb_cpu2cpt,
 143                            num_possible_cpus() *
 144                            sizeof(cptab->ctb_cpu2cpt[0]));
 145        }
 146
 147        for (i = 0; cptab->ctb_parts != NULL && i < cptab->ctb_nparts; i++) {
 148                struct cfs_cpu_partition *part = &cptab->ctb_parts[i];
 149
 150                if (part->cpt_nodemask != NULL) {
 151                        LIBCFS_FREE(part->cpt_nodemask,
 152                                    sizeof(*part->cpt_nodemask));
 153                }
 154
 155                if (part->cpt_cpumask != NULL)
 156                        LIBCFS_FREE(part->cpt_cpumask, cpumask_size());
 157        }
 158
 159        if (cptab->ctb_parts != NULL) {
 160                LIBCFS_FREE(cptab->ctb_parts,
 161                            cptab->ctb_nparts * sizeof(cptab->ctb_parts[0]));
 162        }
 163
 164        if (cptab->ctb_nodemask != NULL)
 165                LIBCFS_FREE(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask));
 166        if (cptab->ctb_cpumask != NULL)
 167                LIBCFS_FREE(cptab->ctb_cpumask, cpumask_size());
 168
 169        LIBCFS_FREE(cptab, sizeof(*cptab));
 170}
 171EXPORT_SYMBOL(cfs_cpt_table_free);
 172
 173struct cfs_cpt_table *
 174cfs_cpt_table_alloc(unsigned int ncpt)
 175{
 176        struct cfs_cpt_table *cptab;
 177        int     i;
 178
 179        LIBCFS_ALLOC(cptab, sizeof(*cptab));
 180        if (cptab == NULL)
 181                return NULL;
 182
 183        cptab->ctb_nparts = ncpt;
 184
 185        LIBCFS_ALLOC(cptab->ctb_cpumask, cpumask_size());
 186        LIBCFS_ALLOC(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask));
 187
 188        if (cptab->ctb_cpumask == NULL || cptab->ctb_nodemask == NULL)
 189                goto failed;
 190
 191        LIBCFS_ALLOC(cptab->ctb_cpu2cpt,
 192                     num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0]));
 193        if (cptab->ctb_cpu2cpt == NULL)
 194                goto failed;
 195
 196        memset(cptab->ctb_cpu2cpt, -1,
 197               num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0]));
 198
 199        LIBCFS_ALLOC(cptab->ctb_parts, ncpt * sizeof(cptab->ctb_parts[0]));
 200        if (cptab->ctb_parts == NULL)
 201                goto failed;
 202
 203        for (i = 0; i < ncpt; i++) {
 204                struct cfs_cpu_partition *part = &cptab->ctb_parts[i];
 205
 206                LIBCFS_ALLOC(part->cpt_cpumask, cpumask_size());
 207                LIBCFS_ALLOC(part->cpt_nodemask, sizeof(*part->cpt_nodemask));
 208                if (part->cpt_cpumask == NULL || part->cpt_nodemask == NULL)
 209                        goto failed;
 210        }
 211
 212        spin_lock(&cpt_data.cpt_lock);
 213        /* Reserved for hotplug */
 214        cptab->ctb_version = cpt_data.cpt_version;
 215        spin_unlock(&cpt_data.cpt_lock);
 216
 217        return cptab;
 218
 219 failed:
 220        cfs_cpt_table_free(cptab);
 221        return NULL;
 222}
 223EXPORT_SYMBOL(cfs_cpt_table_alloc);
 224
 225int
 226cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len)
 227{
 228        char    *tmp = buf;
 229        int     rc = 0;
 230        int     i;
 231        int     j;
 232
 233        for (i = 0; i < cptab->ctb_nparts; i++) {
 234                if (len > 0) {
 235                        rc = snprintf(tmp, len, "%d\t: ", i);
 236                        len -= rc;
 237                }
 238
 239                if (len <= 0) {
 240                        rc = -EFBIG;
 241                        goto out;
 242                }
 243
 244                tmp += rc;
 245                for_each_cpu_mask(j, *cptab->ctb_parts[i].cpt_cpumask) {
 246                        rc = snprintf(tmp, len, "%d ", j);
 247                        len -= rc;
 248                        if (len <= 0) {
 249                                rc = -EFBIG;
 250                                goto out;
 251                        }
 252                        tmp += rc;
 253                }
 254
 255                *tmp = '\n';
 256                tmp++;
 257                len--;
 258        }
 259
 260 out:
 261        if (rc < 0)
 262                return rc;
 263
 264        return tmp - buf;
 265}
 266EXPORT_SYMBOL(cfs_cpt_table_print);
 267
 268int
 269cfs_cpt_number(struct cfs_cpt_table *cptab)
 270{
 271        return cptab->ctb_nparts;
 272}
 273EXPORT_SYMBOL(cfs_cpt_number);
 274
 275int
 276cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt)
 277{
 278        LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
 279
 280        return cpt == CFS_CPT_ANY ?
 281               cpus_weight(*cptab->ctb_cpumask) :
 282               cpus_weight(*cptab->ctb_parts[cpt].cpt_cpumask);
 283}
 284EXPORT_SYMBOL(cfs_cpt_weight);
 285
 286int
 287cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt)
 288{
 289        LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
 290
 291        return cpt == CFS_CPT_ANY ?
 292               any_online_cpu(*cptab->ctb_cpumask) != NR_CPUS :
 293               any_online_cpu(*cptab->ctb_parts[cpt].cpt_cpumask) != NR_CPUS;
 294}
 295EXPORT_SYMBOL(cfs_cpt_online);
 296
 297cpumask_t *
 298cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt)
 299{
 300        LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
 301
 302        return cpt == CFS_CPT_ANY ?
 303               cptab->ctb_cpumask : cptab->ctb_parts[cpt].cpt_cpumask;
 304}
 305EXPORT_SYMBOL(cfs_cpt_cpumask);
 306
 307nodemask_t *
 308cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt)
 309{
 310        LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
 311
 312        return cpt == CFS_CPT_ANY ?
 313               cptab->ctb_nodemask : cptab->ctb_parts[cpt].cpt_nodemask;
 314}
 315EXPORT_SYMBOL(cfs_cpt_nodemask);
 316
 317int
 318cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
 319{
 320        int     node;
 321
 322        LASSERT(cpt >= 0 && cpt < cptab->ctb_nparts);
 323
 324        if (cpu < 0 || cpu >= NR_CPUS || !cpu_online(cpu)) {
 325                CDEBUG(D_INFO, "CPU %d is invalid or it's offline\n", cpu);
 326                return 0;
 327        }
 328
 329        if (cptab->ctb_cpu2cpt[cpu] != -1) {
 330                CDEBUG(D_INFO, "CPU %d is already in partition %d\n",
 331                       cpu, cptab->ctb_cpu2cpt[cpu]);
 332                return 0;
 333        }
 334
 335        cptab->ctb_cpu2cpt[cpu] = cpt;
 336
 337        LASSERT(!cpu_isset(cpu, *cptab->ctb_cpumask));
 338        LASSERT(!cpu_isset(cpu, *cptab->ctb_parts[cpt].cpt_cpumask));
 339
 340        cpu_set(cpu, *cptab->ctb_cpumask);
 341        cpu_set(cpu, *cptab->ctb_parts[cpt].cpt_cpumask);
 342
 343        node = cpu_to_node(cpu);
 344
 345        /* first CPU of @node in this CPT table */
 346        if (!node_isset(node, *cptab->ctb_nodemask))
 347                node_set(node, *cptab->ctb_nodemask);
 348
 349        /* first CPU of @node in this partition */
 350        if (!node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask))
 351                node_set(node, *cptab->ctb_parts[cpt].cpt_nodemask);
 352
 353        return 1;
 354}
 355EXPORT_SYMBOL(cfs_cpt_set_cpu);
 356
 357void
 358cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
 359{
 360        int     node;
 361        int     i;
 362
 363        LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
 364
 365        if (cpu < 0 || cpu >= NR_CPUS) {
 366                CDEBUG(D_INFO, "Invalid CPU id %d\n", cpu);
 367                return;
 368        }
 369
 370        if (cpt == CFS_CPT_ANY) {
 371                /* caller doesn't know the partition ID */
 372                cpt = cptab->ctb_cpu2cpt[cpu];
 373                if (cpt < 0) { /* not set in this CPT-table */
 374                        CDEBUG(D_INFO, "Try to unset cpu %d which is "
 375                                       "not in CPT-table %p\n", cpt, cptab);
 376                        return;
 377                }
 378
 379        } else if (cpt != cptab->ctb_cpu2cpt[cpu]) {
 380                CDEBUG(D_INFO,
 381                       "CPU %d is not in cpu-partition %d\n", cpu, cpt);
 382                return;
 383        }
 384
 385        LASSERT(cpu_isset(cpu, *cptab->ctb_parts[cpt].cpt_cpumask));
 386        LASSERT(cpu_isset(cpu, *cptab->ctb_cpumask));
 387
 388        cpu_clear(cpu, *cptab->ctb_parts[cpt].cpt_cpumask);
 389        cpu_clear(cpu, *cptab->ctb_cpumask);
 390        cptab->ctb_cpu2cpt[cpu] = -1;
 391
 392        node = cpu_to_node(cpu);
 393
 394        LASSERT(node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask));
 395        LASSERT(node_isset(node, *cptab->ctb_nodemask));
 396
 397        for_each_cpu_mask(i, *cptab->ctb_parts[cpt].cpt_cpumask) {
 398                /* this CPT has other CPU belonging to this node? */
 399                if (cpu_to_node(i) == node)
 400                        break;
 401        }
 402
 403        if (i == NR_CPUS)
 404                node_clear(node, *cptab->ctb_parts[cpt].cpt_nodemask);
 405
 406        for_each_cpu_mask(i, *cptab->ctb_cpumask) {
 407                /* this CPT-table has other CPU belonging to this node? */
 408                if (cpu_to_node(i) == node)
 409                        break;
 410        }
 411
 412        if (i == NR_CPUS)
 413                node_clear(node, *cptab->ctb_nodemask);
 414
 415        return;
 416}
 417EXPORT_SYMBOL(cfs_cpt_unset_cpu);
 418
 419int
 420cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask)
 421{
 422        int     i;
 423
 424        if (cpus_weight(*mask) == 0 || any_online_cpu(*mask) == NR_CPUS) {
 425                CDEBUG(D_INFO, "No online CPU is found in the CPU mask "
 426                               "for CPU partition %d\n", cpt);
 427                return 0;
 428        }
 429
 430        for_each_cpu_mask(i, *mask) {
 431                if (!cfs_cpt_set_cpu(cptab, cpt, i))
 432                        return 0;
 433        }
 434
 435        return 1;
 436}
 437EXPORT_SYMBOL(cfs_cpt_set_cpumask);
 438
 439void
 440cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask)
 441{
 442        int     i;
 443
 444        for_each_cpu_mask(i, *mask)
 445                cfs_cpt_unset_cpu(cptab, cpt, i);
 446}
 447EXPORT_SYMBOL(cfs_cpt_unset_cpumask);
 448
 449int
 450cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node)
 451{
 452        cpumask_t       *mask;
 453        int             rc;
 454
 455        if (node < 0 || node >= MAX_NUMNODES) {
 456                CDEBUG(D_INFO,
 457                       "Invalid NUMA id %d for CPU partition %d\n", node, cpt);
 458                return 0;
 459        }
 460
 461        down(&cpt_data.cpt_mutex);
 462
 463        mask = cpt_data.cpt_cpumask;
 464        cfs_node_to_cpumask(node, mask);
 465
 466        rc = cfs_cpt_set_cpumask(cptab, cpt, mask);
 467
 468        up(&cpt_data.cpt_mutex);
 469
 470        return rc;
 471}
 472EXPORT_SYMBOL(cfs_cpt_set_node);
 473
 474void
 475cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node)
 476{
 477        cpumask_t *mask;
 478
 479        if (node < 0 || node >= MAX_NUMNODES) {
 480                CDEBUG(D_INFO,
 481                       "Invalid NUMA id %d for CPU partition %d\n", node, cpt);
 482                return;
 483        }
 484
 485        down(&cpt_data.cpt_mutex);
 486
 487        mask = cpt_data.cpt_cpumask;
 488        cfs_node_to_cpumask(node, mask);
 489
 490        cfs_cpt_unset_cpumask(cptab, cpt, mask);
 491
 492        up(&cpt_data.cpt_mutex);
 493}
 494EXPORT_SYMBOL(cfs_cpt_unset_node);
 495
 496int
 497cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask)
 498{
 499        int     i;
 500
 501        for_each_node_mask(i, *mask) {
 502                if (!cfs_cpt_set_node(cptab, cpt, i))
 503                        return 0;
 504        }
 505
 506        return 1;
 507}
 508EXPORT_SYMBOL(cfs_cpt_set_nodemask);
 509
 510void
 511cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask)
 512{
 513        int     i;
 514
 515        for_each_node_mask(i, *mask)
 516                cfs_cpt_unset_node(cptab, cpt, i);
 517}
 518EXPORT_SYMBOL(cfs_cpt_unset_nodemask);
 519
 520void
 521cfs_cpt_clear(struct cfs_cpt_table *cptab, int cpt)
 522{
 523        int     last;
 524        int     i;
 525
 526        if (cpt == CFS_CPT_ANY) {
 527                last = cptab->ctb_nparts - 1;
 528                cpt = 0;
 529        } else {
 530                last = cpt;
 531        }
 532
 533        for (; cpt <= last; cpt++) {
 534                for_each_cpu_mask(i, *cptab->ctb_parts[cpt].cpt_cpumask)
 535                        cfs_cpt_unset_cpu(cptab, cpt, i);
 536        }
 537}
 538EXPORT_SYMBOL(cfs_cpt_clear);
 539
 540int
 541cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt)
 542{
 543        nodemask_t      *mask;
 544        int             weight;
 545        int             rotor;
 546        int             node;
 547
 548        /* convert CPU partition ID to HW node id */
 549
 550        if (cpt < 0 || cpt >= cptab->ctb_nparts) {
 551                mask = cptab->ctb_nodemask;
 552                rotor = cptab->ctb_spread_rotor++;
 553        } else {
 554                mask = cptab->ctb_parts[cpt].cpt_nodemask;
 555                rotor = cptab->ctb_parts[cpt].cpt_spread_rotor++;
 556        }
 557
 558        weight = nodes_weight(*mask);
 559        LASSERT(weight > 0);
 560
 561        rotor %= weight;
 562
 563        for_each_node_mask(node, *mask) {
 564                if (rotor-- == 0)
 565                        return node;
 566        }
 567
 568        LBUG();
 569        return 0;
 570}
 571EXPORT_SYMBOL(cfs_cpt_spread_node);
 572
 573int
 574cfs_cpt_current(struct cfs_cpt_table *cptab, int remap)
 575{
 576        int     cpu = smp_processor_id();
 577        int     cpt = cptab->ctb_cpu2cpt[cpu];
 578
 579        if (cpt < 0) {
 580                if (!remap)
 581                        return cpt;
 582
 583                /* don't return negative value for safety of upper layer,
 584                 * instead we shadow the unknown cpu to a valid partition ID */
 585                cpt = cpu % cptab->ctb_nparts;
 586        }
 587
 588        return cpt;
 589}
 590EXPORT_SYMBOL(cfs_cpt_current);
 591
 592int
 593cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu)
 594{
 595        LASSERT(cpu >= 0 && cpu < NR_CPUS);
 596
 597        return cptab->ctb_cpu2cpt[cpu];
 598}
 599EXPORT_SYMBOL(cfs_cpt_of_cpu);
 600
 601int
 602cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt)
 603{
 604        cpumask_t       *cpumask;
 605        nodemask_t      *nodemask;
 606        int             rc;
 607        int             i;
 608
 609        LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
 610
 611        if (cpt == CFS_CPT_ANY) {
 612                cpumask = cptab->ctb_cpumask;
 613                nodemask = cptab->ctb_nodemask;
 614        } else {
 615                cpumask = cptab->ctb_parts[cpt].cpt_cpumask;
 616                nodemask = cptab->ctb_parts[cpt].cpt_nodemask;
 617        }
 618
 619        if (any_online_cpu(*cpumask) == NR_CPUS) {
 620                CERROR("No online CPU found in CPU partition %d, did someone "
 621                       "do CPU hotplug on system? You might need to reload "
 622                       "Lustre modules to keep system working well.\n", cpt);
 623                return -EINVAL;
 624        }
 625
 626        for_each_online_cpu(i) {
 627                if (cpu_isset(i, *cpumask))
 628                        continue;
 629
 630                rc = set_cpus_allowed_ptr(current, cpumask);
 631                set_mems_allowed(*nodemask);
 632                if (rc == 0)
 633                        schedule(); /* switch to allowed CPU */
 634
 635                return rc;
 636        }
 637
 638        /* don't need to set affinity because all online CPUs are covered */
 639        return 0;
 640}
 641EXPORT_SYMBOL(cfs_cpt_bind);
 642
 643/**
 644 * Choose max to \a number CPUs from \a node and set them in \a cpt.
 645 * We always prefer to choose CPU in the same core/socket.
 646 */
 647static int
 648cfs_cpt_choose_ncpus(struct cfs_cpt_table *cptab, int cpt,
 649                     cpumask_t *node, int number)
 650{
 651        cpumask_t       *socket = NULL;
 652        cpumask_t       *core = NULL;
 653        int             rc = 0;
 654        int             cpu;
 655
 656        LASSERT(number > 0);
 657
 658        if (number >= cpus_weight(*node)) {
 659                while (!cpus_empty(*node)) {
 660                        cpu = first_cpu(*node);
 661
 662                        rc = cfs_cpt_set_cpu(cptab, cpt, cpu);
 663                        if (!rc)
 664                                return -EINVAL;
 665                        cpu_clear(cpu, *node);
 666                }
 667                return 0;
 668        }
 669
 670        /* allocate scratch buffer */
 671        LIBCFS_ALLOC(socket, cpumask_size());
 672        LIBCFS_ALLOC(core, cpumask_size());
 673        if (socket == NULL || core == NULL) {
 674                rc = -ENOMEM;
 675                goto out;
 676        }
 677
 678        while (!cpus_empty(*node)) {
 679                cpu = first_cpu(*node);
 680
 681                /* get cpumask for cores in the same socket */
 682                cfs_cpu_core_siblings(cpu, socket);
 683                cpus_and(*socket, *socket, *node);
 684
 685                LASSERT(!cpus_empty(*socket));
 686
 687                while (!cpus_empty(*socket)) {
 688                        int     i;
 689
 690                        /* get cpumask for hts in the same core */
 691                        cfs_cpu_ht_siblings(cpu, core);
 692                        cpus_and(*core, *core, *node);
 693
 694                        LASSERT(!cpus_empty(*core));
 695
 696                        for_each_cpu_mask(i, *core) {
 697                                cpu_clear(i, *socket);
 698                                cpu_clear(i, *node);
 699
 700                                rc = cfs_cpt_set_cpu(cptab, cpt, i);
 701                                if (!rc) {
 702                                        rc = -EINVAL;
 703                                        goto out;
 704                                }
 705
 706                                if (--number == 0)
 707                                        goto out;
 708                        }
 709                        cpu = first_cpu(*socket);
 710                }
 711        }
 712
 713 out:
 714        if (socket != NULL)
 715                LIBCFS_FREE(socket, cpumask_size());
 716        if (core != NULL)
 717                LIBCFS_FREE(core, cpumask_size());
 718        return rc;
 719}
 720
 721#define CPT_WEIGHT_MIN  4u
 722
 723static unsigned int
 724cfs_cpt_num_estimate(void)
 725{
 726        unsigned nnode = num_online_nodes();
 727        unsigned ncpu  = num_online_cpus();
 728        unsigned ncpt;
 729
 730        if (ncpu <= CPT_WEIGHT_MIN) {
 731                ncpt = 1;
 732                goto out;
 733        }
 734
 735        /* generate reasonable number of CPU partitions based on total number
 736         * of CPUs, Preferred N should be power2 and match this condition:
 737         * 2 * (N - 1)^2 < NCPUS <= 2 * N^2 */
 738        for (ncpt = 2; ncpu > 2 * ncpt * ncpt; ncpt <<= 1) {}
 739
 740        if (ncpt <= nnode) { /* fat numa system */
 741                while (nnode > ncpt)
 742                        nnode >>= 1;
 743
 744        } else { /* ncpt > nnode */
 745                while ((nnode << 1) <= ncpt)
 746                        nnode <<= 1;
 747        }
 748
 749        ncpt = nnode;
 750
 751 out:
 752#if (BITS_PER_LONG == 32)
 753        /* config many CPU partitions on 32-bit system could consume
 754         * too much memory */
 755        ncpt = min(2U, ncpt);
 756#endif
 757        while (ncpu % ncpt != 0)
 758                ncpt--; /* worst case is 1 */
 759
 760        return ncpt;
 761}
 762
 763static struct cfs_cpt_table *
 764cfs_cpt_table_create(int ncpt)
 765{
 766        struct cfs_cpt_table *cptab = NULL;
 767        cpumask_t       *mask = NULL;
 768        int             cpt = 0;
 769        int             num;
 770        int             rc;
 771        int             i;
 772
 773        rc = cfs_cpt_num_estimate();
 774        if (ncpt <= 0)
 775                ncpt = rc;
 776
 777        if (ncpt > num_online_cpus() || ncpt > 4 * rc) {
 778                CWARN("CPU partition number %d is larger than suggested "
 779                      "value (%d), your system may have performance"
 780                      "issue or run out of memory while under pressure\n",
 781                      ncpt, rc);
 782        }
 783
 784        if (num_online_cpus() % ncpt != 0) {
 785                CERROR("CPU number %d is not multiple of cpu_npartition %d, "
 786                       "please try different cpu_npartitions value or"
 787                       "set pattern string by cpu_pattern=STRING\n",
 788                       (int)num_online_cpus(), ncpt);
 789                goto failed;
 790        }
 791
 792        cptab = cfs_cpt_table_alloc(ncpt);
 793        if (cptab == NULL) {
 794                CERROR("Failed to allocate CPU map(%d)\n", ncpt);
 795                goto failed;
 796        }
 797
 798        num = num_online_cpus() / ncpt;
 799        if (num == 0) {
 800                CERROR("CPU changed while setting CPU partition\n");
 801                goto failed;
 802        }
 803
 804        LIBCFS_ALLOC(mask, cpumask_size());
 805        if (mask == NULL) {
 806                CERROR("Failed to allocate scratch cpumask\n");
 807                goto failed;
 808        }
 809
 810        for_each_online_node(i) {
 811                cfs_node_to_cpumask(i, mask);
 812
 813                while (!cpus_empty(*mask)) {
 814                        struct cfs_cpu_partition *part;
 815                        int    n;
 816
 817                        if (cpt >= ncpt)
 818                                goto failed;
 819
 820                        part = &cptab->ctb_parts[cpt];
 821
 822                        n = num - cpus_weight(*part->cpt_cpumask);
 823                        LASSERT(n > 0);
 824
 825                        rc = cfs_cpt_choose_ncpus(cptab, cpt, mask, n);
 826                        if (rc < 0)
 827                                goto failed;
 828
 829                        LASSERT(num >= cpus_weight(*part->cpt_cpumask));
 830                        if (num == cpus_weight(*part->cpt_cpumask))
 831                                cpt++;
 832                }
 833        }
 834
 835        if (cpt != ncpt ||
 836            num != cpus_weight(*cptab->ctb_parts[ncpt - 1].cpt_cpumask)) {
 837                CERROR("Expect %d(%d) CPU partitions but got %d(%d), "
 838                       "CPU hotplug/unplug while setting?\n",
 839                       cptab->ctb_nparts, num, cpt,
 840                       cpus_weight(*cptab->ctb_parts[ncpt - 1].cpt_cpumask));
 841                goto failed;
 842        }
 843
 844        LIBCFS_FREE(mask, cpumask_size());
 845
 846        return cptab;
 847
 848 failed:
 849        CERROR("Failed to setup CPU-partition-table with %d "
 850               "CPU-partitions, online HW nodes: %d, HW cpus: %d.\n",
 851               ncpt, num_online_nodes(), num_online_cpus());
 852
 853        if (mask != NULL)
 854                LIBCFS_FREE(mask, cpumask_size());
 855
 856        if (cptab != NULL)
 857                cfs_cpt_table_free(cptab);
 858
 859        return NULL;
 860}
 861
 862static struct cfs_cpt_table *
 863cfs_cpt_table_create_pattern(char *pattern)
 864{
 865        struct cfs_cpt_table    *cptab;
 866        char                    *str    = pattern;
 867        int                     node    = 0;
 868        int                     high;
 869        int                     ncpt;
 870        int                     c;
 871
 872        for (ncpt = 0;; ncpt++) { /* quick scan bracket */
 873                str = strchr(str, '[');
 874                if (str == NULL)
 875                        break;
 876                str++;
 877        }
 878
 879        str = cfs_trimwhite(pattern);
 880        if (*str == 'n' || *str == 'N') {
 881                pattern = str + 1;
 882                node = 1;
 883        }
 884
 885        if (ncpt == 0 ||
 886            (node && ncpt > num_online_nodes()) ||
 887            (!node && ncpt > num_online_cpus())) {
 888                CERROR("Invalid pattern %s, or too many partitions %d\n",
 889                       pattern, ncpt);
 890                return NULL;
 891        }
 892
 893        high = node ? MAX_NUMNODES - 1 : NR_CPUS - 1;
 894
 895        cptab = cfs_cpt_table_alloc(ncpt);
 896        if (cptab == NULL) {
 897                CERROR("Failed to allocate cpu partition table\n");
 898                return NULL;
 899        }
 900
 901        for (str = cfs_trimwhite(pattern), c = 0;; c++) {
 902                struct cfs_range_expr   *range;
 903                struct cfs_expr_list    *el;
 904                char                    *bracket = strchr(str, '[');
 905                int                     cpt;
 906                int                     rc;
 907                int                     i;
 908                int                     n;
 909
 910                if (bracket == NULL) {
 911                        if (*str != 0) {
 912                                CERROR("Invalid pattern %s\n", str);
 913                                goto failed;
 914                        } else if (c != ncpt) {
 915                                CERROR("expect %d partitions but found %d\n",
 916                                       ncpt, c);
 917                                goto failed;
 918                        }
 919                        break;
 920                }
 921
 922                if (sscanf(str, "%u%n", &cpt, &n) < 1) {
 923                        CERROR("Invalid cpu pattern %s\n", str);
 924                        goto failed;
 925                }
 926
 927                if (cpt < 0 || cpt >= ncpt) {
 928                        CERROR("Invalid partition id %d, total partitions %d\n",
 929                               cpt, ncpt);
 930                        goto failed;
 931                }
 932
 933                if (cfs_cpt_weight(cptab, cpt) != 0) {
 934                        CERROR("Partition %d has already been set.\n", cpt);
 935                        goto failed;
 936                }
 937
 938                str = cfs_trimwhite(str + n);
 939                if (str != bracket) {
 940                        CERROR("Invalid pattern %s\n", str);
 941                        goto failed;
 942                }
 943
 944                bracket = strchr(str, ']');
 945                if (bracket == NULL) {
 946                        CERROR("missing right bracket for cpt %d, %s\n",
 947                               cpt, str);
 948                        goto failed;
 949                }
 950
 951                if (cfs_expr_list_parse(str, (bracket - str) + 1,
 952                                        0, high, &el) != 0) {
 953                        CERROR("Can't parse number range: %s\n", str);
 954                        goto failed;
 955                }
 956
 957                list_for_each_entry(range, &el->el_exprs, re_link) {
 958                        for (i = range->re_lo; i <= range->re_hi; i++) {
 959                                if ((i - range->re_lo) % range->re_stride != 0)
 960                                        continue;
 961
 962                                rc = node ? cfs_cpt_set_node(cptab, cpt, i) :
 963                                            cfs_cpt_set_cpu(cptab, cpt, i);
 964                                if (!rc) {
 965                                        cfs_expr_list_free(el);
 966                                        goto failed;
 967                                }
 968                        }
 969                }
 970
 971                cfs_expr_list_free(el);
 972
 973                if (!cfs_cpt_online(cptab, cpt)) {
 974                        CERROR("No online CPU is found on partition %d\n", cpt);
 975                        goto failed;
 976                }
 977
 978                str = cfs_trimwhite(bracket + 1);
 979        }
 980
 981        return cptab;
 982
 983 failed:
 984        cfs_cpt_table_free(cptab);
 985        return NULL;
 986}
 987
 988#ifdef CONFIG_HOTPLUG_CPU
 989static int
 990cfs_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
 991{
 992        unsigned int  cpu = (unsigned long)hcpu;
 993
 994        switch (action) {
 995        case CPU_DEAD:
 996        case CPU_DEAD_FROZEN:
 997        case CPU_ONLINE:
 998        case CPU_ONLINE_FROZEN:
 999                spin_lock(&cpt_data.cpt_lock);
1000                cpt_data.cpt_version++;
1001                spin_unlock(&cpt_data.cpt_lock);
1002        default:
1003                CWARN("Lustre: can't support CPU hotplug well now, "
1004                      "performance and stability could be impacted"
1005                      "[CPU %u notify: %lx]\n", cpu, action);
1006        }
1007
1008        return NOTIFY_OK;
1009}
1010
1011static struct notifier_block cfs_cpu_notifier = {
1012        .notifier_call  = cfs_cpu_notify,
1013        .priority       = 0
1014};
1015
1016#endif
1017
1018void
1019cfs_cpu_fini(void)
1020{
1021        if (cfs_cpt_table != NULL)
1022                cfs_cpt_table_free(cfs_cpt_table);
1023
1024#ifdef CONFIG_HOTPLUG_CPU
1025        unregister_hotcpu_notifier(&cfs_cpu_notifier);
1026#endif
1027        if (cpt_data.cpt_cpumask != NULL)
1028                LIBCFS_FREE(cpt_data.cpt_cpumask, cpumask_size());
1029}
1030
1031int
1032cfs_cpu_init(void)
1033{
1034        LASSERT(cfs_cpt_table == NULL);
1035
1036        memset(&cpt_data, 0, sizeof(cpt_data));
1037
1038        LIBCFS_ALLOC(cpt_data.cpt_cpumask, cpumask_size());
1039        if (cpt_data.cpt_cpumask == NULL) {
1040                CERROR("Failed to allocate scratch buffer\n");
1041                return -1;
1042        }
1043
1044        spin_lock_init(&cpt_data.cpt_lock);
1045        sema_init(&cpt_data.cpt_mutex, 1);
1046
1047#ifdef CONFIG_HOTPLUG_CPU
1048        register_hotcpu_notifier(&cfs_cpu_notifier);
1049#endif
1050
1051        if (*cpu_pattern != 0) {
1052                cfs_cpt_table = cfs_cpt_table_create_pattern(cpu_pattern);
1053                if (cfs_cpt_table == NULL) {
1054                        CERROR("Failed to create cptab from pattern %s\n",
1055                               cpu_pattern);
1056                        goto failed;
1057                }
1058
1059        } else {
1060                cfs_cpt_table = cfs_cpt_table_create(cpu_npartitions);
1061                if (cfs_cpt_table == NULL) {
1062                        CERROR("Failed to create ptable with npartitions %d\n",
1063                               cpu_npartitions);
1064                        goto failed;
1065                }
1066        }
1067
1068        spin_lock(&cpt_data.cpt_lock);
1069        if (cfs_cpt_table->ctb_version != cpt_data.cpt_version) {
1070                spin_unlock(&cpt_data.cpt_lock);
1071                CERROR("CPU hotplug/unplug during setup\n");
1072                goto failed;
1073        }
1074        spin_unlock(&cpt_data.cpt_lock);
1075
1076        LCONSOLE(0, "HW CPU cores: %d, npartitions: %d\n",
1077                 num_online_cpus(), cfs_cpt_number(cfs_cpt_table));
1078        return 0;
1079
1080 failed:
1081        cfs_cpu_fini();
1082        return -1;
1083}
1084
1085#endif
1086