linux/mm/slub.c
   1/*
   2 * SLUB: A slab allocator that limits cache line use instead of queuing
   3 * objects in per cpu and per node lists.
   4 *
   5 * The allocator synchronizes using per slab locks and only
   6 * uses a centralized lock to manage a pool of partial slabs.
   7 *
   8 * (C) 2007 SGI, Christoph Lameter
   9 */
  10
  11#include <linux/mm.h>
  12#include <linux/swap.h> /* struct reclaim_state */
  13#include <linux/module.h>
  14#include <linux/bit_spinlock.h>
  15#include <linux/interrupt.h>
  16#include <linux/bitops.h>
  17#include <linux/slab.h>
  18#include <linux/proc_fs.h>
  19#include <linux/seq_file.h>
  20#include <linux/kmemcheck.h>
  21#include <linux/cpu.h>
  22#include <linux/cpuset.h>
  23#include <linux/mempolicy.h>
  24#include <linux/ctype.h>
  25#include <linux/debugobjects.h>
  26#include <linux/kallsyms.h>
  27#include <linux/memory.h>
  28#include <linux/math64.h>
  29#include <linux/fault-inject.h>
  30
  31#include <trace/events/kmem.h>
  32
  33/*
  34 * Lock order:
  35 *   1. slab_lock(page)
  36 *   2. slab->list_lock
  37 *
   38 *   The slab_lock protects operations on the objects of a particular
   39 *   slab and its metadata in the page struct. If the slab lock
   40 *   has been taken then no allocations or frees can be performed
   41 *   on the objects in the slab nor can the slab be added to or removed
   42 *   from the partial or full lists since this would mean modifying
   43 *   the page struct of the slab.
  44 *
  45 *   The list_lock protects the partial and full list on each node and
   46 *   the partial slab counter. If it is taken then no new slabs may be added
   47 *   to or removed from the lists, nor may the number of partial slabs be modified.
  48 *   (Note that the total number of slabs is an atomic value that may be
  49 *   modified without taking the list lock).
  50 *
  51 *   The list_lock is a centralized lock and thus we avoid taking it as
  52 *   much as possible. As long as SLUB does not have to handle partial
  53 *   slabs, operations can continue without any centralized lock. F.e.
  54 *   allocating a long series of objects that fill up slabs does not require
  55 *   the list lock.
  56 *
  57 *   The lock order is sometimes inverted when we are trying to get a slab
  58 *   off a list. We take the list_lock and then look for a page on the list
  59 *   to use. While we do that objects in the slabs may be freed. We can
  60 *   only operate on the slab if we have also taken the slab_lock. So we use
  61 *   a slab_trylock() on the slab. If trylock was successful then no frees
  62 *   can occur anymore and we can use the slab for allocations etc. If the
  63 *   slab_trylock() does not succeed then frees are in progress in the slab and
  64 *   we must stay away from it for a while since we may cause a bouncing
  65 *   cacheline if we try to acquire the lock. So go onto the next slab.
  66 *   If all pages are busy then we may allocate a new slab instead of reusing
   67 *   a partial slab. A new slab has no one operating on it and thus there is
  68 *   no danger of cacheline contention.
  69 *
  70 *   Interrupts are disabled during allocation and deallocation in order to
  71 *   make the slab allocator safe to use in the context of an irq. In addition
  72 *   interrupts are disabled to ensure that the processor does not change
  73 *   while handling per_cpu slabs, due to kernel preemption.
  74 *
  75 * SLUB assigns one slab for allocation to each processor.
  76 * Allocations only occur from these slabs called cpu slabs.
  77 *
  78 * Slabs with free elements are kept on a partial list and during regular
  79 * operations no list for full slabs is used. If an object in a full slab is
  80 * freed then the slab will show up again on the partial lists.
  81 * We track full slabs for debugging purposes though because otherwise we
  82 * cannot scan all objects.
  83 *
  84 * Slabs are freed when they become empty. Teardown and setup is
  85 * minimal so we rely on the page allocators per cpu caches for
  86 * fast frees and allocs.
  87 *
  88 * Overloading of page flags that are otherwise used for LRU management.
  89 *
  90 * PageActive           The slab is frozen and exempt from list processing.
  91 *                      This means that the slab is dedicated to a purpose
  92 *                      such as satisfying allocations for a specific
  93 *                      processor. Objects may be freed in the slab while
  94 *                      it is frozen but slab_free will then skip the usual
  95 *                      list operations. It is up to the processor holding
  96 *                      the slab to integrate the slab into the slab lists
  97 *                      when the slab is no longer needed.
  98 *
  99 *                      One use of this flag is to mark slabs that are
 100 *                      used for allocations. Then such a slab becomes a cpu
 101 *                      slab. The cpu slab may be equipped with an additional
 102 *                      freelist that allows lockless access to
 103 *                      free objects in addition to the regular freelist
 104 *                      that requires the slab lock.
 105 *
 106 * PageError            Slab requires special handling due to debug
 107 *                      options set. This moves slab handling out of
 108 *                      the fast path and disables lockless freelists.
 109 */
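/*
 * An illustrative sketch of the inverted lock order described above --
 * this is roughly what get_partial_node() and lock_and_freeze_slab()
 * further down do; n is the kmem_cache_node being scanned:
 *
 *	spin_lock(&n->list_lock);
 *	list_for_each_entry(page, &n->partial, lru)
 *		if (slab_trylock(page)) {
 *			__remove_partial(n, page);
 *			__SetPageSlubFrozen(page);
 *			goto found;
 *		}
 *	page = NULL;
 * found:
 *	spin_unlock(&n->list_lock);
 *
 * A slab whose trylock fails is skipped rather than spun on, so the
 * cacheline of a busy slab is never bounced.
 */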
 110
 111#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
 112                SLAB_TRACE | SLAB_DEBUG_FREE)
 113
 114static inline int kmem_cache_debug(struct kmem_cache *s)
 115{
 116#ifdef CONFIG_SLUB_DEBUG
 117        return unlikely(s->flags & SLAB_DEBUG_FLAGS);
 118#else
 119        return 0;
 120#endif
 121}
 122
 123/*
 124 * Issues still to be resolved:
 125 *
 126 * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
 127 *
 128 * - Variable sizing of the per node arrays
 129 */
 130
 131/* Enable to test recovery from slab corruption on boot */
 132#undef SLUB_RESILIENCY_TEST
 133
 134/*
  135 * Minimum number of partial slabs. These will be left on the partial
 136 * lists even if they are empty. kmem_cache_shrink may reclaim them.
 137 */
 138#define MIN_PARTIAL 5
 139
 140/*
 141 * Maximum number of desirable partial slabs.
 142 * The existence of more partial slabs makes kmem_cache_shrink
  143 * sort the partial list by the number of objects in them.
 144 */
 145#define MAX_PARTIAL 10
 146
 147#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
 148                                SLAB_POISON | SLAB_STORE_USER)
 149
 150/*
  151 * Debugging flags that require metadata to be stored in the slab.  These get
  152 * disabled when slub_debug=O is used and a cache's minimum order would
  153 * increase because of the metadata.
  154 */
 155#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
 156
 157/*
 158 * Set of flags that will prevent slab merging
 159 */
 160#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
 161                SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
 162                SLAB_FAILSLAB)
 163
 164#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
 165                SLAB_CACHE_DMA | SLAB_NOTRACK)
 166
 167#define OO_SHIFT        16
 168#define OO_MASK         ((1 << OO_SHIFT) - 1)
 169#define MAX_OBJS_PER_PAGE       65535 /* since page.objects is u16 */
 170
 171/* Internal SLUB flags */
 172#define __OBJECT_POISON         0x80000000UL /* Poison object */
 173
 174static int kmem_size = sizeof(struct kmem_cache);
 175
 176#ifdef CONFIG_SMP
 177static struct notifier_block slab_notifier;
 178#endif
 179
 180static enum {
 181        DOWN,           /* No slab functionality available */
 182        PARTIAL,        /* Kmem_cache_node works */
 183        UP,             /* Everything works but does not show up in sysfs */
 184        SYSFS           /* Sysfs up */
 185} slab_state = DOWN;
 186
 187/* A list of all slab caches on the system */
 188static DECLARE_RWSEM(slub_lock);
 189static LIST_HEAD(slab_caches);
 190
 191/*
 192 * Tracking user of a slab.
 193 */
 194struct track {
 195        unsigned long addr;     /* Called from address */
 196        int cpu;                /* Was running on cpu */
 197        int pid;                /* Pid context */
 198        unsigned long when;     /* When did the operation occur */
 199};
 200
 201enum track_item { TRACK_ALLOC, TRACK_FREE };
 202
 203#ifdef CONFIG_SYSFS
 204static int sysfs_slab_add(struct kmem_cache *);
 205static int sysfs_slab_alias(struct kmem_cache *, const char *);
 206static void sysfs_slab_remove(struct kmem_cache *);
 207
 208#else
 209static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
 210static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
 211                                                        { return 0; }
 212static inline void sysfs_slab_remove(struct kmem_cache *s)
 213{
 214        kfree(s->name);
 215        kfree(s);
 216}
 217
 218#endif
 219
 220static inline void stat(struct kmem_cache *s, enum stat_item si)
 221{
 222#ifdef CONFIG_SLUB_STATS
 223        __this_cpu_inc(s->cpu_slab->stat[si]);
 224#endif
 225}
 226
 227/********************************************************************
 228 *                      Core slab cache functions
 229 *******************************************************************/
 230
 231int slab_is_available(void)
 232{
 233        return slab_state >= UP;
 234}
 235
 236static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
 237{
 238        return s->node[node];
 239}
 240
 241/* Verify that a pointer has an address that is valid within a slab page */
 242static inline int check_valid_pointer(struct kmem_cache *s,
 243                                struct page *page, const void *object)
 244{
 245        void *base;
 246
 247        if (!object)
 248                return 1;
 249
 250        base = page_address(page);
 251        if (object < base || object >= base + page->objects * s->size ||
 252                (object - base) % s->size) {
 253                return 0;
 254        }
 255
 256        return 1;
 257}
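/*
 * A rough worked example of the check above: for a cache with
 * s->size == 64 on a 4K page holding 64 objects, a pointer is accepted
 * only if it lies in [base, base + 64 * 64) and is a whole multiple of
 * 64 bytes from base:
 *
 *	base = page_address(page);
 *	check_valid_pointer(s, page, base + 3 * 64);	returns 1
 *	check_valid_pointer(s, page, base + 100);	returns 0 (misaligned)
 *	check_valid_pointer(s, page, base + 4096);	returns 0 (past the end)
 *
 * A NULL object is always accepted since it terminates a freelist.
 */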
 258
 259static inline void *get_freepointer(struct kmem_cache *s, void *object)
 260{
 261        return *(void **)(object + s->offset);
 262}
 263
 264static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
 265{
 266        *(void **)(object + s->offset) = fp;
 267}
 268
 269/* Loop over all objects in a slab */
 270#define for_each_object(__p, __s, __addr, __objects) \
 271        for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\
 272                        __p += (__s)->size)
 273
 274/* Scan freelist */
 275#define for_each_free_object(__p, __s, __free) \
 276        for (__p = (__free); __p; __p = get_freepointer((__s), __p))
 277
 278/* Determine object index from a given position */
 279static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
 280{
 281        return (p - addr) / s->size;
 282}
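/*
 * Sketch of how the helpers above fit together: the free pointer of an
 * object lives at object + s->offset, so with s->offset == 0 it simply
 * overlays the first word of the (unused) object.  Assuming
 * s->size == 64 and addr == page_address(page):
 *
 *	p = addr + 3 * 64;
 *	slab_index(p, s, addr);			evaluates to 3
 *	get_freepointer(s, p);			reads *(void **)(p + s->offset)
 *	set_freepointer(s, p, NULL);		terminates the chain at p
 *
 * for_each_free_object() walks such a chain from a given head until it
 * reaches a NULL link.
 */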
 283
 284static inline struct kmem_cache_order_objects oo_make(int order,
 285                                                unsigned long size)
 286{
 287        struct kmem_cache_order_objects x = {
 288                (order << OO_SHIFT) + (PAGE_SIZE << order) / size
 289        };
 290
 291        return x;
 292}
 293
 294static inline int oo_order(struct kmem_cache_order_objects x)
 295{
 296        return x.x >> OO_SHIFT;
 297}
 298
 299static inline int oo_objects(struct kmem_cache_order_objects x)
 300{
 301        return x.x & OO_MASK;
 302}
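/*
 * Worked example of the order/objects encoding, assuming PAGE_SIZE is
 * 4096 and an object size of 512 bytes at order 3:
 *
 *	struct kmem_cache_order_objects oo = oo_make(3, 512);
 *
 *	oo.x == (3 << OO_SHIFT) + (4096 << 3) / 512 == 0x30040
 *	oo_order(oo) == 3
 *	oo_objects(oo) == 64
 *
 * Both values are carried in the single word of
 * struct kmem_cache_order_objects.
 */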
 303
 304#ifdef CONFIG_SLUB_DEBUG
 305/*
 306 * Debug settings:
 307 */
 308#ifdef CONFIG_SLUB_DEBUG_ON
 309static int slub_debug = DEBUG_DEFAULT_FLAGS;
 310#else
 311static int slub_debug;
 312#endif
 313
 314static char *slub_debug_slabs;
 315static int disable_higher_order_debug;
 316
 317/*
 318 * Object debugging
 319 */
 320static void print_section(char *text, u8 *addr, unsigned int length)
 321{
 322        int i, offset;
 323        int newline = 1;
 324        char ascii[17];
 325
 326        ascii[16] = 0;
 327
 328        for (i = 0; i < length; i++) {
 329                if (newline) {
 330                        printk(KERN_ERR "%8s 0x%p: ", text, addr + i);
 331                        newline = 0;
 332                }
 333                printk(KERN_CONT " %02x", addr[i]);
 334                offset = i % 16;
 335                ascii[offset] = isgraph(addr[i]) ? addr[i] : '.';
 336                if (offset == 15) {
 337                        printk(KERN_CONT " %s\n", ascii);
 338                        newline = 1;
 339                }
 340        }
 341        if (!newline) {
 342                i %= 16;
 343                while (i < 16) {
 344                        printk(KERN_CONT "   ");
 345                        ascii[i] = ' ';
 346                        i++;
 347                }
 348                printk(KERN_CONT " %s\n", ascii);
 349        }
 350}
 351
 352static struct track *get_track(struct kmem_cache *s, void *object,
 353        enum track_item alloc)
 354{
 355        struct track *p;
 356
 357        if (s->offset)
 358                p = object + s->offset + sizeof(void *);
 359        else
 360                p = object + s->inuse;
 361
 362        return p + alloc;
 363}
 364
 365static void set_track(struct kmem_cache *s, void *object,
 366                        enum track_item alloc, unsigned long addr)
 367{
 368        struct track *p = get_track(s, object, alloc);
 369
 370        if (addr) {
 371                p->addr = addr;
 372                p->cpu = smp_processor_id();
 373                p->pid = current->pid;
 374                p->when = jiffies;
 375        } else
 376                memset(p, 0, sizeof(struct track));
 377}
 378
 379static void init_tracking(struct kmem_cache *s, void *object)
 380{
 381        if (!(s->flags & SLAB_STORE_USER))
 382                return;
 383
 384        set_track(s, object, TRACK_FREE, 0UL);
 385        set_track(s, object, TRACK_ALLOC, 0UL);
 386}
 387
 388static void print_track(const char *s, struct track *t)
 389{
 390        if (!t->addr)
 391                return;
 392
 393        printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
 394                s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid);
 395}
 396
 397static void print_tracking(struct kmem_cache *s, void *object)
 398{
 399        if (!(s->flags & SLAB_STORE_USER))
 400                return;
 401
 402        print_track("Allocated", get_track(s, object, TRACK_ALLOC));
 403        print_track("Freed", get_track(s, object, TRACK_FREE));
 404}
 405
 406static void print_page_info(struct page *page)
 407{
 408        printk(KERN_ERR "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n",
 409                page, page->objects, page->inuse, page->freelist, page->flags);
 410
 411}
 412
 413static void slab_bug(struct kmem_cache *s, char *fmt, ...)
 414{
 415        va_list args;
 416        char buf[100];
 417
 418        va_start(args, fmt);
 419        vsnprintf(buf, sizeof(buf), fmt, args);
 420        va_end(args);
 421        printk(KERN_ERR "========================================"
 422                        "=====================================\n");
 423        printk(KERN_ERR "BUG %s: %s\n", s->name, buf);
 424        printk(KERN_ERR "----------------------------------------"
 425                        "-------------------------------------\n\n");
 426}
 427
 428static void slab_fix(struct kmem_cache *s, char *fmt, ...)
 429{
 430        va_list args;
 431        char buf[100];
 432
 433        va_start(args, fmt);
 434        vsnprintf(buf, sizeof(buf), fmt, args);
 435        va_end(args);
 436        printk(KERN_ERR "FIX %s: %s\n", s->name, buf);
 437}
 438
 439static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
 440{
 441        unsigned int off;       /* Offset of last byte */
 442        u8 *addr = page_address(page);
 443
 444        print_tracking(s, p);
 445
 446        print_page_info(page);
 447
 448        printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
 449                        p, p - addr, get_freepointer(s, p));
 450
 451        if (p > addr + 16)
 452                print_section("Bytes b4", p - 16, 16);
 453
 454        print_section("Object", p, min_t(unsigned long, s->objsize, PAGE_SIZE));
 455
 456        if (s->flags & SLAB_RED_ZONE)
 457                print_section("Redzone", p + s->objsize,
 458                        s->inuse - s->objsize);
 459
 460        if (s->offset)
 461                off = s->offset + sizeof(void *);
 462        else
 463                off = s->inuse;
 464
 465        if (s->flags & SLAB_STORE_USER)
 466                off += 2 * sizeof(struct track);
 467
 468        if (off != s->size)
 469                /* Beginning of the filler is the free pointer */
 470                print_section("Padding", p + off, s->size - off);
 471
 472        dump_stack();
 473}
 474
 475static void object_err(struct kmem_cache *s, struct page *page,
 476                        u8 *object, char *reason)
 477{
 478        slab_bug(s, "%s", reason);
 479        print_trailer(s, page, object);
 480}
 481
 482static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...)
 483{
 484        va_list args;
 485        char buf[100];
 486
 487        va_start(args, fmt);
 488        vsnprintf(buf, sizeof(buf), fmt, args);
 489        va_end(args);
 490        slab_bug(s, "%s", buf);
 491        print_page_info(page);
 492        dump_stack();
 493}
 494
 495static void init_object(struct kmem_cache *s, void *object, u8 val)
 496{
 497        u8 *p = object;
 498
 499        if (s->flags & __OBJECT_POISON) {
 500                memset(p, POISON_FREE, s->objsize - 1);
 501                p[s->objsize - 1] = POISON_END;
 502        }
 503
 504        if (s->flags & SLAB_RED_ZONE)
 505                memset(p + s->objsize, val, s->inuse - s->objsize);
 506}
 507
 508static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
 509{
 510        while (bytes) {
 511                if (*start != (u8)value)
 512                        return start;
 513                start++;
 514                bytes--;
 515        }
 516        return NULL;
 517}
 518
 519static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
 520                                                void *from, void *to)
 521{
 522        slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data);
 523        memset(from, data, to - from);
 524}
 525
 526static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
 527                        u8 *object, char *what,
 528                        u8 *start, unsigned int value, unsigned int bytes)
 529{
 530        u8 *fault;
 531        u8 *end;
 532
 533        fault = check_bytes(start, value, bytes);
 534        if (!fault)
 535                return 1;
 536
 537        end = start + bytes;
 538        while (end > fault && end[-1] == value)
 539                end--;
 540
 541        slab_bug(s, "%s overwritten", what);
 542        printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",
 543                                        fault, end - 1, fault[0], value);
 544        print_trailer(s, page, object);
 545
 546        restore_bytes(s, what, value, fault, end);
 547        return 0;
 548}
 549
 550/*
 551 * Object layout:
 552 *
 553 * object address
 554 *      Bytes of the object to be managed.
 555 *      If the freepointer may overlay the object then the free
 556 *      pointer is the first word of the object.
 557 *
 558 *      Poisoning uses 0x6b (POISON_FREE) and the last byte is
 559 *      0xa5 (POISON_END)
 560 *
 561 * object + s->objsize
 562 *      Padding to reach word boundary. This is also used for Redzoning.
 563 *      Padding is extended by another word if Redzoning is enabled and
 564 *      objsize == inuse.
 565 *
 566 *      We fill with 0xbb (RED_INACTIVE) for inactive objects and with
 567 *      0xcc (RED_ACTIVE) for objects in use.
 568 *
 569 * object + s->inuse
 570 *      Meta data starts here.
 571 *
 572 *      A. Free pointer (if we cannot overwrite object on free)
 573 *      B. Tracking data for SLAB_STORE_USER
  574 *      C. Padding to reach required alignment boundary or at minimum
 575 *              one word if debugging is on to be able to detect writes
 576 *              before the word boundary.
 577 *
 578 *      Padding is done using 0x5a (POISON_INUSE)
 579 *
 580 * object + s->size
 581 *      Nothing is used beyond s->size.
 582 *
 583 * If slabcaches are merged then the objsize and inuse boundaries are mostly
 584 * ignored. And therefore no slab options that rely on these boundaries
 585 * may be used with merged slabcaches.
 586 */
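/*
 * A rough worked instance of the layout above, assuming a 64-bit build,
 * objsize == 44, SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER set, and
 * the free pointer stored after the object (s->offset == s->inuse):
 *
 *	bytes   0..43	the object itself (0x6b poison with the final
 *			byte 0xa5 while the object is free)
 *	bytes  44..47	red zone filling up to the word boundary
 *			(0xbb inactive, 0xcc active), so s->inuse == 48
 *	bytes  48..55	free pointer
 *	bytes  56..103	two struct track records (alloc, then free),
 *			each 24 bytes on a 64-bit build
 *	bytes 104..	0x5a padding, if any, up to s->size
 */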
 587
 588static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
 589{
 590        unsigned long off = s->inuse;   /* The end of info */
 591
 592        if (s->offset)
 593                /* Freepointer is placed after the object. */
 594                off += sizeof(void *);
 595
 596        if (s->flags & SLAB_STORE_USER)
 597                /* We also have user information there */
 598                off += 2 * sizeof(struct track);
 599
 600        if (s->size == off)
 601                return 1;
 602
 603        return check_bytes_and_report(s, page, p, "Object padding",
 604                                p + off, POISON_INUSE, s->size - off);
 605}
 606
 607/* Check the pad bytes at the end of a slab page */
 608static int slab_pad_check(struct kmem_cache *s, struct page *page)
 609{
 610        u8 *start;
 611        u8 *fault;
 612        u8 *end;
 613        int length;
 614        int remainder;
 615
 616        if (!(s->flags & SLAB_POISON))
 617                return 1;
 618
 619        start = page_address(page);
 620        length = (PAGE_SIZE << compound_order(page));
 621        end = start + length;
 622        remainder = length % s->size;
 623        if (!remainder)
 624                return 1;
 625
 626        fault = check_bytes(end - remainder, POISON_INUSE, remainder);
 627        if (!fault)
 628                return 1;
 629        while (end > fault && end[-1] == POISON_INUSE)
 630                end--;
 631
 632        slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
 633        print_section("Padding", end - remainder, remainder);
 634
 635        restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end);
 636        return 0;
 637}
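/*
 * Example of the remainder checked above: a 4K slab page holding 42
 * objects of s->size == 96 uses 4032 bytes, so the final
 * 4096 % 96 == 64 bytes are slack.  With SLAB_POISON they are filled
 * with 0x5a (POISON_INUSE) and any other value found there is reported
 * as "Padding overwritten".
 */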
 638
 639static int check_object(struct kmem_cache *s, struct page *page,
 640                                        void *object, u8 val)
 641{
 642        u8 *p = object;
 643        u8 *endobject = object + s->objsize;
 644
 645        if (s->flags & SLAB_RED_ZONE) {
 646                if (!check_bytes_and_report(s, page, object, "Redzone",
 647                        endobject, val, s->inuse - s->objsize))
 648                        return 0;
 649        } else {
 650                if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) {
 651                        check_bytes_and_report(s, page, p, "Alignment padding",
 652                                endobject, POISON_INUSE, s->inuse - s->objsize);
 653                }
 654        }
 655
 656        if (s->flags & SLAB_POISON) {
 657                if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
 658                        (!check_bytes_and_report(s, page, p, "Poison", p,
 659                                        POISON_FREE, s->objsize - 1) ||
 660                         !check_bytes_and_report(s, page, p, "Poison",
 661                                p + s->objsize - 1, POISON_END, 1)))
 662                        return 0;
 663                /*
 664                 * check_pad_bytes cleans up on its own.
 665                 */
 666                check_pad_bytes(s, page, p);
 667        }
 668
 669        if (!s->offset && val == SLUB_RED_ACTIVE)
 670                /*
 671                 * Object and freepointer overlap. Cannot check
 672                 * freepointer while object is allocated.
 673                 */
 674                return 1;
 675
 676        /* Check free pointer validity */
 677        if (!check_valid_pointer(s, page, get_freepointer(s, p))) {
 678                object_err(s, page, p, "Freepointer corrupt");
 679                /*
 680                 * No choice but to zap it and thus lose the remainder
 681                 * of the free objects in this slab. May cause
 682                 * another error because the object count is now wrong.
 683                 */
 684                set_freepointer(s, p, NULL);
 685                return 0;
 686        }
 687        return 1;
 688}
 689
 690static int check_slab(struct kmem_cache *s, struct page *page)
 691{
 692        int maxobj;
 693
 694        VM_BUG_ON(!irqs_disabled());
 695
 696        if (!PageSlab(page)) {
 697                slab_err(s, page, "Not a valid slab page");
 698                return 0;
 699        }
 700
 701        maxobj = (PAGE_SIZE << compound_order(page)) / s->size;
 702        if (page->objects > maxobj) {
 703                slab_err(s, page, "objects %u > max %u",
  704                        page->objects, maxobj);
 705                return 0;
 706        }
 707        if (page->inuse > page->objects) {
 708                slab_err(s, page, "inuse %u > max %u",
  709                        page->inuse, page->objects);
 710                return 0;
 711        }
  712        /* slab_pad_check fixes things up after itself */
 713        slab_pad_check(s, page);
 714        return 1;
 715}
 716
 717/*
 718 * Determine if a certain object on a page is on the freelist. Must hold the
 719 * slab lock to guarantee that the chains are in a consistent state.
 720 */
 721static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
 722{
 723        int nr = 0;
 724        void *fp = page->freelist;
 725        void *object = NULL;
 726        unsigned long max_objects;
 727
 728        while (fp && nr <= page->objects) {
 729                if (fp == search)
 730                        return 1;
 731                if (!check_valid_pointer(s, page, fp)) {
 732                        if (object) {
 733                                object_err(s, page, object,
 734                                        "Freechain corrupt");
 735                                set_freepointer(s, object, NULL);
 736                                break;
 737                        } else {
 738                                slab_err(s, page, "Freepointer corrupt");
 739                                page->freelist = NULL;
 740                                page->inuse = page->objects;
 741                                slab_fix(s, "Freelist cleared");
 742                                return 0;
 743                        }
 744                        break;
 745                }
 746                object = fp;
 747                fp = get_freepointer(s, object);
 748                nr++;
 749        }
 750
 751        max_objects = (PAGE_SIZE << compound_order(page)) / s->size;
 752        if (max_objects > MAX_OBJS_PER_PAGE)
 753                max_objects = MAX_OBJS_PER_PAGE;
 754
 755        if (page->objects != max_objects) {
 756                slab_err(s, page, "Wrong number of objects. Found %d but "
 757                        "should be %d", page->objects, max_objects);
 758                page->objects = max_objects;
 759                slab_fix(s, "Number of objects adjusted.");
 760        }
 761        if (page->inuse != page->objects - nr) {
 762                slab_err(s, page, "Wrong object count. Counter is %d but "
 763                        "counted were %d", page->inuse, page->objects - nr);
 764                page->inuse = page->objects - nr;
 765                slab_fix(s, "Object count adjusted.");
 766        }
 767        return search == NULL;
 768}
 769
 770static void trace(struct kmem_cache *s, struct page *page, void *object,
 771                                                                int alloc)
 772{
 773        if (s->flags & SLAB_TRACE) {
 774                printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
 775                        s->name,
 776                        alloc ? "alloc" : "free",
 777                        object, page->inuse,
 778                        page->freelist);
 779
 780                if (!alloc)
 781                        print_section("Object", (void *)object, s->objsize);
 782
 783                dump_stack();
 784        }
 785}
 786
 787/*
 788 * Hooks for other subsystems that check memory allocations. In a typical
 789 * production configuration these hooks all should produce no code at all.
 790 */
 791static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
 792{
 793        flags &= gfp_allowed_mask;
 794        lockdep_trace_alloc(flags);
 795        might_sleep_if(flags & __GFP_WAIT);
 796
 797        return should_failslab(s->objsize, flags, s->flags);
 798}
 799
 800static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object)
 801{
 802        flags &= gfp_allowed_mask;
 803        kmemcheck_slab_alloc(s, flags, object, s->objsize);
 804        kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags);
 805}
 806
 807static inline void slab_free_hook(struct kmem_cache *s, void *x)
 808{
 809        kmemleak_free_recursive(x, s->flags);
 810}
 811
 812static inline void slab_free_hook_irq(struct kmem_cache *s, void *object)
 813{
 814        kmemcheck_slab_free(s, object, s->objsize);
 815        debug_check_no_locks_freed(object, s->objsize);
 816        if (!(s->flags & SLAB_DEBUG_OBJECTS))
 817                debug_check_no_obj_freed(object, s->objsize);
 818}
 819
 820/*
 821 * Tracking of fully allocated slabs for debugging purposes.
 822 */
 823static void add_full(struct kmem_cache_node *n, struct page *page)
 824{
 825        spin_lock(&n->list_lock);
 826        list_add(&page->lru, &n->full);
 827        spin_unlock(&n->list_lock);
 828}
 829
 830static void remove_full(struct kmem_cache *s, struct page *page)
 831{
 832        struct kmem_cache_node *n;
 833
 834        if (!(s->flags & SLAB_STORE_USER))
 835                return;
 836
 837        n = get_node(s, page_to_nid(page));
 838
 839        spin_lock(&n->list_lock);
 840        list_del(&page->lru);
 841        spin_unlock(&n->list_lock);
 842}
 843
 844/* Tracking of the number of slabs for debugging purposes */
 845static inline unsigned long slabs_node(struct kmem_cache *s, int node)
 846{
 847        struct kmem_cache_node *n = get_node(s, node);
 848
 849        return atomic_long_read(&n->nr_slabs);
 850}
 851
 852static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
 853{
 854        return atomic_long_read(&n->nr_slabs);
 855}
 856
 857static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
 858{
 859        struct kmem_cache_node *n = get_node(s, node);
 860
 861        /*
 862         * May be called early in order to allocate a slab for the
 863         * kmem_cache_node structure. Solve the chicken-egg
 864         * dilemma by deferring the increment of the count during
 865         * bootstrap (see early_kmem_cache_node_alloc).
 866         */
 867        if (n) {
 868                atomic_long_inc(&n->nr_slabs);
 869                atomic_long_add(objects, &n->total_objects);
 870        }
 871}
 872static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
 873{
 874        struct kmem_cache_node *n = get_node(s, node);
 875
 876        atomic_long_dec(&n->nr_slabs);
 877        atomic_long_sub(objects, &n->total_objects);
 878}
 879
 880/* Object debug checks for alloc/free paths */
 881static void setup_object_debug(struct kmem_cache *s, struct page *page,
 882                                                                void *object)
 883{
 884        if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
 885                return;
 886
 887        init_object(s, object, SLUB_RED_INACTIVE);
 888        init_tracking(s, object);
 889}
 890
 891static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *page,
 892                                        void *object, unsigned long addr)
 893{
 894        if (!check_slab(s, page))
 895                goto bad;
 896
 897        if (!on_freelist(s, page, object)) {
 898                object_err(s, page, object, "Object already allocated");
 899                goto bad;
 900        }
 901
 902        if (!check_valid_pointer(s, page, object)) {
 903                object_err(s, page, object, "Freelist Pointer check fails");
 904                goto bad;
 905        }
 906
 907        if (!check_object(s, page, object, SLUB_RED_INACTIVE))
 908                goto bad;
 909
  910        /* Success. Perform special debug activities for allocs */
 911        if (s->flags & SLAB_STORE_USER)
 912                set_track(s, object, TRACK_ALLOC, addr);
 913        trace(s, page, object, 1);
 914        init_object(s, object, SLUB_RED_ACTIVE);
 915        return 1;
 916
 917bad:
 918        if (PageSlab(page)) {
 919                /*
  920                 * If this is a slab page then let's do the best we can
 921                 * to avoid issues in the future. Marking all objects
 922                 * as used avoids touching the remaining objects.
 923                 */
 924                slab_fix(s, "Marking all objects used");
 925                page->inuse = page->objects;
 926                page->freelist = NULL;
 927        }
 928        return 0;
 929}
 930
 931static noinline int free_debug_processing(struct kmem_cache *s,
 932                 struct page *page, void *object, unsigned long addr)
 933{
 934        if (!check_slab(s, page))
 935                goto fail;
 936
 937        if (!check_valid_pointer(s, page, object)) {
 938                slab_err(s, page, "Invalid object pointer 0x%p", object);
 939                goto fail;
 940        }
 941
 942        if (on_freelist(s, page, object)) {
 943                object_err(s, page, object, "Object already free");
 944                goto fail;
 945        }
 946
 947        if (!check_object(s, page, object, SLUB_RED_ACTIVE))
 948                return 0;
 949
 950        if (unlikely(s != page->slab)) {
 951                if (!PageSlab(page)) {
 952                        slab_err(s, page, "Attempt to free object(0x%p) "
 953                                "outside of slab", object);
 954                } else if (!page->slab) {
 955                        printk(KERN_ERR
 956                                "SLUB <none>: no slab for object 0x%p.\n",
 957                                                object);
 958                        dump_stack();
 959                } else
 960                        object_err(s, page, object,
 961                                        "page slab pointer corrupt.");
 962                goto fail;
 963        }
 964
 965        /* Special debug activities for freeing objects */
 966        if (!PageSlubFrozen(page) && !page->freelist)
 967                remove_full(s, page);
 968        if (s->flags & SLAB_STORE_USER)
 969                set_track(s, object, TRACK_FREE, addr);
 970        trace(s, page, object, 0);
 971        init_object(s, object, SLUB_RED_INACTIVE);
 972        return 1;
 973
 974fail:
 975        slab_fix(s, "Object at 0x%p not freed", object);
 976        return 0;
 977}
 978
 979static int __init setup_slub_debug(char *str)
 980{
 981        slub_debug = DEBUG_DEFAULT_FLAGS;
 982        if (*str++ != '=' || !*str)
 983                /*
 984                 * No options specified. Switch on full debugging.
 985                 */
 986                goto out;
 987
 988        if (*str == ',')
 989                /*
 990                 * No options but restriction on slabs. This means full
 991                 * debugging for slabs matching a pattern.
 992                 */
 993                goto check_slabs;
 994
 995        if (tolower(*str) == 'o') {
 996                /*
  997                 * Avoid enabling debugging on caches if their minimum order
 998                 * would increase as a result.
 999                 */
1000                disable_higher_order_debug = 1;
1001                goto out;
1002        }
1003
1004        slub_debug = 0;
1005        if (*str == '-')
1006                /*
1007                 * Switch off all debugging measures.
1008                 */
1009                goto out;
1010
1011        /*
1012         * Determine which debug features should be switched on
1013         */
1014        for (; *str && *str != ','; str++) {
1015                switch (tolower(*str)) {
1016                case 'f':
1017                        slub_debug |= SLAB_DEBUG_FREE;
1018                        break;
1019                case 'z':
1020                        slub_debug |= SLAB_RED_ZONE;
1021                        break;
1022                case 'p':
1023                        slub_debug |= SLAB_POISON;
1024                        break;
1025                case 'u':
1026                        slub_debug |= SLAB_STORE_USER;
1027                        break;
1028                case 't':
1029                        slub_debug |= SLAB_TRACE;
1030                        break;
1031                case 'a':
1032                        slub_debug |= SLAB_FAILSLAB;
1033                        break;
1034                default:
1035                        printk(KERN_ERR "slub_debug option '%c' "
1036                                "unknown. skipped\n", *str);
1037                }
1038        }
1039
1040check_slabs:
1041        if (*str == ',')
1042                slub_debug_slabs = str + 1;
1043out:
1044        return 1;
1045}
1046
1047__setup("slub_debug", setup_slub_debug);
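/*
 * The parser above accepts the usual slub_debug boot parameter forms; a
 * sketch of common invocations, matching the switch cases above and the
 * name matching in kmem_cache_flags() below:
 *
 *	slub_debug			enable all debug options for all caches
 *	slub_debug=FZ			sanity checks and red zoning, all caches
 *	slub_debug=,dentry		full debugging for caches whose name
 *					starts with "dentry"
 *	slub_debug=ZPU,kmalloc-64	red zoning, poisoning and user tracking
 *					for the kmalloc-64 cache
 *	slub_debug=O			full debugging, but skipped on caches
 *					whose minimum order would grow because
 *					of the metadata
 *	slub_debug=-			switch all debugging off
 */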
1048
1049static unsigned long kmem_cache_flags(unsigned long objsize,
1050        unsigned long flags, const char *name,
1051        void (*ctor)(void *))
1052{
1053        /*
1054         * Enable debugging if selected on the kernel commandline.
1055         */
1056        if (slub_debug && (!slub_debug_slabs ||
1057                !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs))))
1058                flags |= slub_debug;
1059
1060        return flags;
1061}
1062#else
1063static inline void setup_object_debug(struct kmem_cache *s,
1064                        struct page *page, void *object) {}
1065
1066static inline int alloc_debug_processing(struct kmem_cache *s,
1067        struct page *page, void *object, unsigned long addr) { return 0; }
1068
1069static inline int free_debug_processing(struct kmem_cache *s,
1070        struct page *page, void *object, unsigned long addr) { return 0; }
1071
1072static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
1073                        { return 1; }
1074static inline int check_object(struct kmem_cache *s, struct page *page,
1075                        void *object, u8 val) { return 1; }
1076static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
1077static inline unsigned long kmem_cache_flags(unsigned long objsize,
1078        unsigned long flags, const char *name,
1079        void (*ctor)(void *))
1080{
1081        return flags;
1082}
1083#define slub_debug 0
1084
1085#define disable_higher_order_debug 0
1086
1087static inline unsigned long slabs_node(struct kmem_cache *s, int node)
1088                                                        { return 0; }
1089static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
1090                                                        { return 0; }
1091static inline void inc_slabs_node(struct kmem_cache *s, int node,
1092                                                        int objects) {}
1093static inline void dec_slabs_node(struct kmem_cache *s, int node,
1094                                                        int objects) {}
1095
1096static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
1097                                                        { return 0; }
1098
1099static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
1100                void *object) {}
1101
1102static inline void slab_free_hook(struct kmem_cache *s, void *x) {}
1103
1104static inline void slab_free_hook_irq(struct kmem_cache *s,
1105                void *object) {}
1106
1107#endif /* CONFIG_SLUB_DEBUG */
1108
1109/*
1110 * Slab allocation and freeing
1111 */
1112static inline struct page *alloc_slab_page(gfp_t flags, int node,
1113                                        struct kmem_cache_order_objects oo)
1114{
1115        int order = oo_order(oo);
1116
1117        flags |= __GFP_NOTRACK;
1118
1119        if (node == NUMA_NO_NODE)
1120                return alloc_pages(flags, order);
1121        else
1122                return alloc_pages_exact_node(node, flags, order);
1123}
1124
1125static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1126{
1127        struct page *page;
1128        struct kmem_cache_order_objects oo = s->oo;
1129        gfp_t alloc_gfp;
1130
1131        flags |= s->allocflags;
1132
1133        /*
1134         * Let the initial higher-order allocation fail under memory pressure
 1135         * so we fall back to the minimum order allocation.
1136         */
1137        alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
1138
1139        page = alloc_slab_page(alloc_gfp, node, oo);
1140        if (unlikely(!page)) {
1141                oo = s->min;
1142                /*
1143                 * Allocation may have failed due to fragmentation.
1144                 * Try a lower order alloc if possible
1145                 */
1146                page = alloc_slab_page(flags, node, oo);
1147                if (!page)
1148                        return NULL;
1149
1150                stat(s, ORDER_FALLBACK);
1151        }
1152
1153        if (kmemcheck_enabled
1154                && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
1155                int pages = 1 << oo_order(oo);
1156
1157                kmemcheck_alloc_shadow(page, oo_order(oo), flags, node);
1158
1159                /*
1160                 * Objects from caches that have a constructor don't get
1161                 * cleared when they're allocated, so we need to do it here.
1162                 */
1163                if (s->ctor)
1164                        kmemcheck_mark_uninitialized_pages(page, pages);
1165                else
1166                        kmemcheck_mark_unallocated_pages(page, pages);
1167        }
1168
1169        page->objects = oo_objects(oo);
1170        mod_zone_page_state(page_zone(page),
1171                (s->flags & SLAB_RECLAIM_ACCOUNT) ?
1172                NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1173                1 << oo_order(oo));
1174
1175        return page;
1176}
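/*
 * Rough example of the fallback above for a hypothetical 512-byte cache
 * with s->oo == oo_make(3, 512) and s->min == oo_make(0, 512): the first
 * attempt asks for an order-3 compound page (64 objects) with
 * __GFP_NOWARN | __GFP_NORETRY and without __GFP_NOFAIL; if that fails
 * under memory pressure, an order-0 page holding 8 objects is requested
 * with the caller's original flags and ORDER_FALLBACK is counted.
 */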
1177
1178static void setup_object(struct kmem_cache *s, struct page *page,
1179                                void *object)
1180{
1181        setup_object_debug(s, page, object);
1182        if (unlikely(s->ctor))
1183                s->ctor(object);
1184}
1185
1186static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1187{
1188        struct page *page;
1189        void *start;
1190        void *last;
1191        void *p;
1192
1193        BUG_ON(flags & GFP_SLAB_BUG_MASK);
1194
1195        page = allocate_slab(s,
1196                flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
1197        if (!page)
1198                goto out;
1199
1200        inc_slabs_node(s, page_to_nid(page), page->objects);
1201        page->slab = s;
1202        page->flags |= 1 << PG_slab;
1203
1204        start = page_address(page);
1205
1206        if (unlikely(s->flags & SLAB_POISON))
1207                memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page));
1208
1209        last = start;
1210        for_each_object(p, s, start, page->objects) {
1211                setup_object(s, page, last);
1212                set_freepointer(s, last, p);
1213                last = p;
1214        }
1215        setup_object(s, page, last);
1216        set_freepointer(s, last, NULL);
1217
1218        page->freelist = start;
1219        page->inuse = 0;
1220out:
1221        return page;
1222}
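/*
 * The loop above leaves the slab with a fully linked freelist; for a
 * slab holding N objects of s->size bytes starting at start:
 *
 *	page->freelist -> start -> start + size -> ...
 *				-> start + (N - 1) * size -> NULL
 *
 * where each link is stored at object + s->offset, and page->inuse
 * starts out at 0.
 */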
1223
1224static void __free_slab(struct kmem_cache *s, struct page *page)
1225{
1226        int order = compound_order(page);
1227        int pages = 1 << order;
1228
1229        if (kmem_cache_debug(s)) {
1230                void *p;
1231
1232                slab_pad_check(s, page);
1233                for_each_object(p, s, page_address(page),
1234                                                page->objects)
1235                        check_object(s, page, p, SLUB_RED_INACTIVE);
1236        }
1237
1238        kmemcheck_free_shadow(page, compound_order(page));
1239
1240        mod_zone_page_state(page_zone(page),
1241                (s->flags & SLAB_RECLAIM_ACCOUNT) ?
1242                NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1243                -pages);
1244
1245        __ClearPageSlab(page);
1246        reset_page_mapcount(page);
1247        if (current->reclaim_state)
1248                current->reclaim_state->reclaimed_slab += pages;
1249        __free_pages(page, order);
1250}
1251
1252static void rcu_free_slab(struct rcu_head *h)
1253{
1254        struct page *page;
1255
1256        page = container_of((struct list_head *)h, struct page, lru);
1257        __free_slab(page->slab, page);
1258}
1259
1260static void free_slab(struct kmem_cache *s, struct page *page)
1261{
1262        if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
1263                /*
1264                 * RCU free overloads the RCU head over the LRU
1265                 */
1266                struct rcu_head *head = (void *)&page->lru;
1267
1268                call_rcu(head, rcu_free_slab);
1269        } else
1270                __free_slab(s, page);
1271}
1272
1273static void discard_slab(struct kmem_cache *s, struct page *page)
1274{
1275        dec_slabs_node(s, page_to_nid(page), page->objects);
1276        free_slab(s, page);
1277}
1278
1279/*
1280 * Per slab locking using the pagelock
1281 */
1282static __always_inline void slab_lock(struct page *page)
1283{
1284        bit_spin_lock(PG_locked, &page->flags);
1285}
1286
1287static __always_inline void slab_unlock(struct page *page)
1288{
1289        __bit_spin_unlock(PG_locked, &page->flags);
1290}
1291
1292static __always_inline int slab_trylock(struct page *page)
1293{
1294        int rc = 1;
1295
1296        rc = bit_spin_trylock(PG_locked, &page->flags);
1297        return rc;
1298}
1299
1300/*
1301 * Management of partially allocated slabs
1302 */
1303static void add_partial(struct kmem_cache_node *n,
1304                                struct page *page, int tail)
1305{
1306        spin_lock(&n->list_lock);
1307        n->nr_partial++;
1308        if (tail)
1309                list_add_tail(&page->lru, &n->partial);
1310        else
1311                list_add(&page->lru, &n->partial);
1312        spin_unlock(&n->list_lock);
1313}
1314
1315static inline void __remove_partial(struct kmem_cache_node *n,
1316                                        struct page *page)
1317{
1318        list_del(&page->lru);
1319        n->nr_partial--;
1320}
1321
1322static void remove_partial(struct kmem_cache *s, struct page *page)
1323{
1324        struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1325
1326        spin_lock(&n->list_lock);
1327        __remove_partial(n, page);
1328        spin_unlock(&n->list_lock);
1329}
1330
1331/*
1332 * Lock slab and remove from the partial list.
1333 *
1334 * Must hold list_lock.
1335 */
1336static inline int lock_and_freeze_slab(struct kmem_cache_node *n,
1337                                                        struct page *page)
1338{
1339        if (slab_trylock(page)) {
1340                __remove_partial(n, page);
1341                __SetPageSlubFrozen(page);
1342                return 1;
1343        }
1344        return 0;
1345}
1346
1347/*
1348 * Try to allocate a partial slab from a specific node.
1349 */
1350static struct page *get_partial_node(struct kmem_cache_node *n)
1351{
1352        struct page *page;
1353
1354        /*
1355         * Racy check. If we mistakenly see no partial slabs then we
1356         * just allocate an empty slab. If we mistakenly try to get a
 1357         * partial slab and there is none available then get_partial_node()
1358         * will return NULL.
1359         */
1360        if (!n || !n->nr_partial)
1361                return NULL;
1362
1363        spin_lock(&n->list_lock);
1364        list_for_each_entry(page, &n->partial, lru)
1365                if (lock_and_freeze_slab(n, page))
1366                        goto out;
1367        page = NULL;
1368out:
1369        spin_unlock(&n->list_lock);
1370        return page;
1371}
1372
1373/*
1374 * Get a page from somewhere. Search in increasing NUMA distances.
1375 */
1376static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1377{
1378#ifdef CONFIG_NUMA
1379        struct zonelist *zonelist;
1380        struct zoneref *z;
1381        struct zone *zone;
1382        enum zone_type high_zoneidx = gfp_zone(flags);
1383        struct page *page;
1384
1385        /*
1386         * The defrag ratio allows a configuration of the tradeoffs between
1387         * inter node defragmentation and node local allocations. A lower
1388         * defrag_ratio increases the tendency to do local allocations
1389         * instead of attempting to obtain partial slabs from other nodes.
1390         *
1391         * If the defrag_ratio is set to 0 then kmalloc() always
1392         * returns node local objects. If the ratio is higher then kmalloc()
1393         * may return off node objects because partial slabs are obtained
1394         * from other nodes and filled up.
1395         *
1396         * If /sys/kernel/slab/xx/defrag_ratio is set to 100 (which makes
1397         * defrag_ratio = 1000) then every (well almost) allocation will
1398         * first attempt to defrag slab caches on other nodes. This means
1399         * scanning over all nodes to look for partial slabs which may be
1400         * expensive if we do it every time we are trying to find a slab
1401         * with available objects.
1402         */
1403        if (!s->remote_node_defrag_ratio ||
1404                        get_cycles() % 1024 > s->remote_node_defrag_ratio)
1405                return NULL;
1406
1407        get_mems_allowed();
1408        zonelist = node_zonelist(slab_node(current->mempolicy), flags);
1409        for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1410                struct kmem_cache_node *n;
1411
1412                n = get_node(s, zone_to_nid(zone));
1413
1414                if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1415                                n->nr_partial > s->min_partial) {
1416                        page = get_partial_node(n);
1417                        if (page) {
1418                                put_mems_allowed();
1419                                return page;
1420                        }
1421                }
1422        }
1423        put_mems_allowed();
1424#endif
1425        return NULL;
1426}
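/*
 * Worked example of the ratio check above, assuming the sysfs store
 * handler (not shown in this excerpt) keeps the written percentage
 * multiplied by ten: writing 20 to
 * /sys/kernel/slab/<cache>/remote_node_defrag_ratio sets
 * remote_node_defrag_ratio to 200, so the remote scan only runs when
 * get_cycles() % 1024 <= 200, i.e. for roughly one in five attempts;
 * writing 0 disables the remote scan entirely.
 */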
1427
1428/*
1429 * Get a partial page, lock it and return it.
1430 */
1431static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
1432{
1433        struct page *page;
1434        int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node;
1435
1436        page = get_partial_node(get_node(s, searchnode));
 1437        if (page || node != NUMA_NO_NODE)
1438                return page;
1439
1440        return get_any_partial(s, flags);
1441}
1442
1443/*
1444 * Move a page back to the lists.
1445 *
1446 * Must be called with the slab lock held.
1447 *
1448 * On exit the slab lock will have been dropped.
1449 */
1450static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1451        __releases(bitlock)
1452{
1453        struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1454
1455        __ClearPageSlubFrozen(page);
1456        if (page->inuse) {
1457
1458                if (page->freelist) {
1459                        add_partial(n, page, tail);
1460                        stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
1461                } else {
1462                        stat(s, DEACTIVATE_FULL);
1463                        if (kmem_cache_debug(s) && (s->flags & SLAB_STORE_USER))
1464                                add_full(n, page);
1465                }
1466                slab_unlock(page);
1467        } else {
1468                stat(s, DEACTIVATE_EMPTY);
1469                if (n->nr_partial < s->min_partial) {
1470                        /*
1471                         * Adding an empty slab to the partial slabs in order
1472                         * to avoid page allocator overhead. This slab needs
 1473                         * to come after the other slabs with objects in them
1474                         * so that the others get filled first. That way the
1475                         * size of the partial list stays small.
1476                         *
1477                         * kmem_cache_shrink can reclaim any empty slabs from
1478                         * the partial list.
1479                         */
1480                        add_partial(n, page, 1);
1481                        slab_unlock(page);
1482                } else {
1483                        slab_unlock(page);
1484                        stat(s, FREE_SLAB);
1485                        discard_slab(s, page);
1486                }
1487        }
1488}
1489
1490/*
1491 * Remove the cpu slab
1492 */
1493static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1494        __releases(bitlock)
1495{
1496        struct page *page = c->page;
1497        int tail = 1;
1498
1499        if (page->freelist)
1500                stat(s, DEACTIVATE_REMOTE_FREES);
1501        /*
1502         * Merge cpu freelist into slab freelist. Typically we get here
1503         * because both freelists are empty. So this is unlikely
1504         * to occur.
1505         */
1506        while (unlikely(c->freelist)) {
1507                void **object;
1508
1509                tail = 0;       /* Hot objects. Put the slab first */
1510
1511                /* Retrieve object from cpu_freelist */
1512                object = c->freelist;
1513                c->freelist = get_freepointer(s, c->freelist);
1514
1515                /* And put onto the regular freelist */
1516                set_freepointer(s, object, page->freelist);
1517                page->freelist = object;
1518                page->inuse--;
1519        }
1520        c->page = NULL;
1521        unfreeze_slab(s, page, tail);
1522}
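/*
 * A small before/after picture of the merge performed above, for a slab
 * with five objects in use, two of them sitting on the cpu freelist:
 *
 *	before:	c->freelist: A -> B -> NULL
 *		page->freelist: X -> NULL		page->inuse == 5
 *
 *	after:	c->freelist: NULL, c->page == NULL
 *		page->freelist: B -> A -> X -> NULL	page->inuse == 3
 *
 * The objects from the cpu freelist are pushed onto the head of the slab
 * freelist, and because they are cache hot the slab is queued at the
 * head of the partial list (tail == 0).
 */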
1523
1524static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1525{
1526        stat(s, CPUSLAB_FLUSH);
1527        slab_lock(c->page);
1528        deactivate_slab(s, c);
1529}
1530
1531/*
1532 * Flush cpu slab.
1533 *
1534 * Called from IPI handler with interrupts disabled.
1535 */
1536static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
1537{
1538        struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
1539
1540        if (likely(c && c->page))
1541                flush_slab(s, c);
1542}
1543
1544static void flush_cpu_slab(void *d)
1545{
1546        struct kmem_cache *s = d;
1547
1548        __flush_cpu_slab(s, smp_processor_id());
1549}
1550
1551static void flush_all(struct kmem_cache *s)
1552{
1553        on_each_cpu(flush_cpu_slab, s, 1);
1554}
1555
1556/*
1557 * Check if the objects in a per cpu structure fit numa
1558 * locality expectations.
1559 */
1560static inline int node_match(struct kmem_cache_cpu *c, int node)
1561{
1562#ifdef CONFIG_NUMA
1563        if (node != NUMA_NO_NODE && c->node != node)
1564                return 0;
1565#endif
1566        return 1;
1567}
1568
1569static int count_free(struct page *page)
1570{
1571        return page->objects - page->inuse;
1572}
1573
1574static unsigned long count_partial(struct kmem_cache_node *n,
1575                                        int (*get_count)(struct page *))
1576{
1577        unsigned long flags;
1578        unsigned long x = 0;
1579        struct page *page;
1580
1581        spin_lock_irqsave(&n->list_lock, flags);
1582        list_for_each_entry(page, &n->partial, lru)
1583                x += get_count(page);
1584        spin_unlock_irqrestore(&n->list_lock, flags);
1585        return x;
1586}
1587
1588static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
1589{
1590#ifdef CONFIG_SLUB_DEBUG
1591        return atomic_long_read(&n->total_objects);
1592#else
1593        return 0;
1594#endif
1595}
1596
1597static noinline void
1598slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
1599{
1600        int node;
1601
1602        printk(KERN_WARNING
1603                "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n",
1604                nid, gfpflags);
1605        printk(KERN_WARNING "  cache: %s, object size: %d, buffer size: %d, "
1606                "default order: %d, min order: %d\n", s->name, s->objsize,
1607                s->size, oo_order(s->oo), oo_order(s->min));
1608
1609        if (oo_order(s->min) > get_order(s->objsize))
1610                printk(KERN_WARNING "  %s debugging increased min order, use "
1611                       "slub_debug=O to disable.\n", s->name);
1612
1613        for_each_online_node(node) {
1614                struct kmem_cache_node *n = get_node(s, node);
1615                unsigned long nr_slabs;
1616                unsigned long nr_objs;
1617                unsigned long nr_free;
1618
1619                if (!n)
1620                        continue;
1621
1622                nr_free  = count_partial(n, count_free);
1623                nr_slabs = node_nr_slabs(n);
1624                nr_objs  = node_nr_objs(n);
1625
1626                printk(KERN_WARNING
1627                        "  node %d: slabs: %ld, objs: %ld, free: %ld\n",
1628                        node, nr_slabs, nr_objs, nr_free);
1629        }
1630}
1631
1632/*
1633 * Slow path. The lockless freelist is empty or we need to perform
1634 * debugging duties.
1635 *
1636 * Interrupts are disabled.
1637 *
1638 * Processing is still very fast if new objects have been freed to the
1639 * regular freelist. In that case we simply take over the regular freelist
1640 * as the lockless freelist and zap the regular freelist.
1641 *
1642 * If that is not working then we fall back to the partial lists. We take the
1643 * first element of the freelist as the object to allocate now and move the
1644 * rest of the freelist to the lockless freelist.
1645 *
1646 * And if we were unable to get a new slab from the partial slab lists then
1647 * we need to allocate a new slab. This is the slowest path since it involves
1648 * a call to the page allocator and the setup of a new slab.
1649 */
1650static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
1651                          unsigned long addr, struct kmem_cache_cpu *c)
1652{
1653        void **object;
1654        struct page *new;
1655
1656        /* We handle __GFP_ZERO in the caller */
1657        gfpflags &= ~__GFP_ZERO;
1658
1659        if (!c->page)
1660                goto new_slab;
1661
1662        slab_lock(c->page);
1663        if (unlikely(!node_match(c, node)))
1664                goto another_slab;
1665
1666        stat(s, ALLOC_REFILL);
1667
1668load_freelist:
1669        object = c->page->freelist;
1670        if (unlikely(!object))
1671                goto another_slab;
1672        if (kmem_cache_debug(s))
1673                goto debug;
1674
1675        c->freelist = get_freepointer(s, object);
1676        c->page->inuse = c->page->objects;
1677        c->page->freelist = NULL;
1678        c->node = page_to_nid(c->page);
1679unlock_out:
1680        slab_unlock(c->page);
1681        stat(s, ALLOC_SLOWPATH);
1682        return object;
1683
1684another_slab:
1685        deactivate_slab(s, c);
1686
1687new_slab:
1688        new = get_partial(s, gfpflags, node);
1689        if (new) {
1690                c->page = new;
1691                stat(s, ALLOC_FROM_PARTIAL);
1692                goto load_freelist;
1693        }
1694
1695        gfpflags &= gfp_allowed_mask;
1696        if (gfpflags & __GFP_WAIT)
1697                local_irq_enable();
1698
1699        new = new_slab(s, gfpflags, node);
1700
1701        if (gfpflags & __GFP_WAIT)
1702                local_irq_disable();
1703
1704        if (new) {
1705                c = __this_cpu_ptr(s->cpu_slab);
1706                stat(s, ALLOC_SLAB);
1707                if (c->page)
1708                        flush_slab(s, c);
1709                slab_lock(new);
1710                __SetPageSlubFrozen(new);
1711                c->page = new;
1712                goto load_freelist;
1713        }
1714        if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
1715                slab_out_of_memory(s, gfpflags, node);
1716        return NULL;
1717debug:
1718        if (!alloc_debug_processing(s, c->page, object, addr))
1719                goto another_slab;
1720
1721        c->page->inuse++;
1722        c->page->freelist = get_freepointer(s, object);
1723        c->node = NUMA_NO_NODE;
1724        goto unlock_out;
1725}
1726
1727/*
1728 * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
1729 * have the fastpath folded into their functions. So no function call
1730 * overhead for requests that can be satisfied on the fastpath.
1731 *
1732 * The fastpath works by first checking if the lockless freelist can be used.
1733 * If not then __slab_alloc is called for slow processing.
1734 *
1735 * Otherwise we can simply pick the next object from the lockless free list.
1736 */
1737static __always_inline void *slab_alloc(struct kmem_cache *s,
1738                gfp_t gfpflags, int node, unsigned long addr)
1739{
1740        void **object;
1741        struct kmem_cache_cpu *c;
1742        unsigned long flags;
1743
1744        if (slab_pre_alloc_hook(s, gfpflags))
1745                return NULL;
1746
1747        local_irq_save(flags);
1748        c = __this_cpu_ptr(s->cpu_slab);
1749        object = c->freelist;
1750        if (unlikely(!object || !node_match(c, node)))
1752                object = __slab_alloc(s, gfpflags, node, addr, c);
1754        else {
1755                c->freelist = get_freepointer(s, object);
1756                stat(s, ALLOC_FASTPATH);
1757        }
1758        local_irq_restore(flags);
1759
1760        if (unlikely(gfpflags & __GFP_ZERO) && object)
1761                memset(object, 0, s->objsize);
1762
1763        slab_post_alloc_hook(s, gfpflags, object);
1764
1765        return object;
1766}
1767
1768void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
1769{
1770        void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
1771
1772        trace_kmem_cache_alloc(_RET_IP_, ret, s->objsize, s->size, gfpflags);
1773
1774        return ret;
1775}
1776EXPORT_SYMBOL(kmem_cache_alloc);
1777
1778#ifdef CONFIG_TRACING
1779void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
1780{
1781        void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
1782        trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
1783        return ret;
1784}
1785EXPORT_SYMBOL(kmem_cache_alloc_trace);
1786
1787void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
1788{
1789        void *ret = kmalloc_order(size, flags, order);
1790        trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags);
1791        return ret;
1792}
1793EXPORT_SYMBOL(kmalloc_order_trace);
1794#endif
1795
1796#ifdef CONFIG_NUMA
1797void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
1798{
1799        void *ret = slab_alloc(s, gfpflags, node, _RET_IP_);
1800
1801        trace_kmem_cache_alloc_node(_RET_IP_, ret,
1802                                    s->objsize, s->size, gfpflags, node);
1803
1804        return ret;
1805}
1806EXPORT_SYMBOL(kmem_cache_alloc_node);
1807
1808#ifdef CONFIG_TRACING
1809void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
1810                                    gfp_t gfpflags,
1811                                    int node, size_t size)
1812{
1813        void *ret = slab_alloc(s, gfpflags, node, _RET_IP_);
1814
1815        trace_kmalloc_node(_RET_IP_, ret,
1816                           size, s->size, gfpflags, node);
1817        return ret;
1818}
1819EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
1820#endif
1821#endif
1822
1823/*
1824 * Slow path handling. This may still be called frequently since objects
1825 * have a longer lifetime than the cpu slabs in most processing loads.
1826 *
1827 * So we still attempt to reduce cache line usage. Just take the slab
1828 * lock and free the item. If there is no additional partial page
1829 * handling required then we can return immediately.
1830 */
1831static void __slab_free(struct kmem_cache *s, struct page *page,
1832                        void *x, unsigned long addr)
1833{
1834        void *prior;
1835        void **object = (void *)x;
1836
1837        stat(s, FREE_SLOWPATH);
1838        slab_lock(page);
1839
1840        if (kmem_cache_debug(s))
1841                goto debug;
1842
1843checks_ok:
1844        prior = page->freelist;
1845        set_freepointer(s, object, prior);
1846        page->freelist = object;
1847        page->inuse--;
1848
1849        if (unlikely(PageSlubFrozen(page))) {
1850                stat(s, FREE_FROZEN);
1851                goto out_unlock;
1852        }
1853
1854        if (unlikely(!page->inuse))
1855                goto slab_empty;
1856
1857        /*
1858         * Objects left in the slab. If it was not on the partial list before
1859         * then add it.
1860         */
1861        if (unlikely(!prior)) {
1862                add_partial(get_node(s, page_to_nid(page)), page, 1);
1863                stat(s, FREE_ADD_PARTIAL);
1864        }
1865
1866out_unlock:
1867        slab_unlock(page);
1868        return;
1869
1870slab_empty:
1871        if (prior) {
1872                /*
1873                 * Slab still on the partial list.
1874                 */
1875                remove_partial(s, page);
1876                stat(s, FREE_REMOVE_PARTIAL);
1877        }
1878        slab_unlock(page);
1879        stat(s, FREE_SLAB);
1880        discard_slab(s, page);
1881        return;
1882
1883debug:
1884        if (!free_debug_processing(s, page, x, addr))
1885                goto out_unlock;
1886        goto checks_ok;
1887}
1888
1889/*
1890 * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
1891 * can perform fastpath freeing without additional function calls.
1892 *
1893 * The fastpath is only possible if we are freeing to the current cpu slab
1894 * of this processor. This is typically the case if we have just allocated
1895 * the item before.
1896 *
1897 * If fastpath is not possible then fall back to __slab_free where we deal
1898 * with all sorts of special processing.
1899 */
1900static __always_inline void slab_free(struct kmem_cache *s,
1901                        struct page *page, void *x, unsigned long addr)
1902{
1903        void **object = (void *)x;
1904        struct kmem_cache_cpu *c;
1905        unsigned long flags;
1906
1907        slab_free_hook(s, x);
1908
1909        local_irq_save(flags);
1910        c = __this_cpu_ptr(s->cpu_slab);
1911
1912        slab_free_hook_irq(s, x);
1913
1914        if (likely(page == c->page && c->node != NUMA_NO_NODE)) {
1915                set_freepointer(s, object, c->freelist);
1916                c->freelist = object;
1917                stat(s, FREE_FASTPATH);
1918        } else
1919                __slab_free(s, page, x, addr);
1920
1921        local_irq_restore(flags);
1922}
1923
1924void kmem_cache_free(struct kmem_cache *s, void *x)
1925{
1926        struct page *page;
1927
1928        page = virt_to_head_page(x);
1929
1930        slab_free(s, page, x, _RET_IP_);
1931
1932        trace_kmem_cache_free(_RET_IP_, x);
1933}
1934EXPORT_SYMBOL(kmem_cache_free);
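
/*
 * Illustrative sketch, not part of slub.c: how a typical caller drives the
 * alloc/free fast paths above.  The names are hypothetical and the cache is
 * assumed to have been set up elsewhere with kmem_cache_create().  Allocating
 * and soon freeing an object on the same cpu normally stays on the lockless
 * per cpu freelist and never takes the slab or list locks.
 */
#if 0
#include <linux/slab.h>

struct example_item {
	int id;
	char tag[16];
};

/* Assumed to be created with kmem_cache_create("example_item", ...) */
static struct kmem_cache *example_item_cache;

static int example_item_use(void)
{
	struct example_item *item;

	/* Fast path: pop the next object off the per cpu freelist */
	item = kmem_cache_alloc(example_item_cache, GFP_KERNEL);
	if (!item)
		return -ENOMEM;

	item->id = 1;

	/* Fast path: push the object back onto the per cpu freelist */
	kmem_cache_free(example_item_cache, item);
	return 0;
}
#endif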
1935
1936/*
1937 * Object placement in a slab is made very easy because we always start at
1938 * offset 0. If we tune the size of the object to the alignment then we can
1939 * get the required alignment by putting one properly sized object after
1940 * another.
1941 *
1942 * Notice that the allocation order determines the sizes of the per cpu
1943 * caches. Each processor always has one slab available for allocations.
1944 * Increasing the allocation order reduces the number of times that slabs
1945 * must be moved on and off the partial lists and is therefore a factor in
1946 * locking overhead.
1947 */
1948
1949/*
1950 * Minimum / Maximum order of slab pages. This influences locking overhead
1951 * and slab fragmentation. A higher order reduces the number of partial slabs
1952 * and increases the number of allocations possible without having to
1953 * take the list_lock.
1954 */
1955static int slub_min_order;
1956static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER;
1957static int slub_min_objects;
1958
1959/*
1960 * Merge control. If this is set then no merging of slab caches will occur.
1961 * (Could be removed. This was introduced to pacify the merge skeptics.)
1962 */
1963static int slub_nomerge;
1964
1965/*
1966 * Calculate the order of allocation given a slab object size.
1967 *
1968 * The order of allocation has significant impact on performance and other
1969 * system components. Generally order 0 allocations should be preferred since
1970 * order 0 does not cause fragmentation in the page allocator. Larger objects
1971 * can be problematic to put into order 0 slabs because there may be too much
1972 * unused space left. We go to a higher order if more than 1/16th of the slab
1973 * would be wasted.
1974 *
1975 * In order to reach satisfactory performance we must ensure that a minimum
1976 * number of objects is in one slab. Otherwise we may generate too much
1977 * activity on the partial lists which requires taking the list_lock. This is
1978 * less a concern for large slabs though which are rarely used.
1979 *
1980 * slub_max_order specifies the order where we begin to stop considering the
1981 * number of objects in a slab as critical. If we reach slub_max_order then
1982 * we try to keep the page order as low as possible. So we accept more waste
1983 * of space in favor of a small page order.
1984 *
1985 * Higher order allocations also allow the placement of more objects in a
1986 * slab and thereby reduce object handling overhead. If the user has
1987 * requested a higher minimum order then we start with that one instead of
1988 * the smallest order which will fit the object.
1989 */
1990static inline int slab_order(int size, int min_objects,
1991                                int max_order, int fract_leftover)
1992{
1993        int order;
1994        int rem;
1995        int min_order = slub_min_order;
1996
1997        if ((PAGE_SIZE << min_order) / size > MAX_OBJS_PER_PAGE)
1998                return get_order(size * MAX_OBJS_PER_PAGE) - 1;
1999
2000        for (order = max(min_order,
2001                                fls(min_objects * size - 1) - PAGE_SHIFT);
2002                        order <= max_order; order++) {
2003
2004                unsigned long slab_size = PAGE_SIZE << order;
2005
2006                if (slab_size < min_objects * size)
2007                        continue;
2008
2009                rem = slab_size % size;
2010
2011                if (rem <= slab_size / fract_leftover)
2012                        break;
2013
2014        }
2015
2016        return order;
2017}
2018
2019static inline int calculate_order(int size)
2020{
2021        int order;
2022        int min_objects;
2023        int fraction;
2024        int max_objects;
2025
2026        /*
2027         * Attempt to find best configuration for a slab. This
2028         * works by first attempting to generate a layout with
2029         * the best configuration and backing off gradually.
2030         *
2031         * First we reduce the acceptable waste in a slab. Then
2032         * we reduce the minimum objects required in a slab.
2033         */
2034        min_objects = slub_min_objects;
2035        if (!min_objects)
2036                min_objects = 4 * (fls(nr_cpu_ids) + 1);
2037        max_objects = (PAGE_SIZE << slub_max_order)/size;
2038        min_objects = min(min_objects, max_objects);
2039
2040        while (min_objects > 1) {
2041                fraction = 16;
2042                while (fraction >= 4) {
2043                        order = slab_order(size, min_objects,
2044                                                slub_max_order, fraction);
2045                        if (order <= slub_max_order)
2046                                return order;
2047                        fraction /= 2;
2048                }
2049                min_objects--;
2050        }
2051
2052        /*
2053         * We were unable to place multiple objects in a slab. Now
2054         * let's see if we can place a single object there.
2055         */
2056        order = slab_order(size, 1, slub_max_order, 1);
2057        if (order <= slub_max_order)
2058                return order;
2059
2060        /*
2061         * Doh, this slab cannot be placed within slub_max_order.
2062         */
2063        order = slab_order(size, 1, MAX_ORDER, 1);
2064        if (order < MAX_ORDER)
2065                return order;
2066        return -ENOSYS;
2067}
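
/*
 * Worked example, with assumed numbers (4K pages, 4 possible cpus, no
 * slub_min_order/slub_min_objects boot overrides): for a 704 byte object,
 * min_objects = 4 * (fls(4) + 1) = 16.  slab_order() starts probing at
 * order = fls(16 * 704 - 1) - PAGE_SHIFT = 14 - 12 = 2.  An order 2 slab
 * is 16384 bytes and holds 23 such objects with 16384 % 704 = 192 bytes
 * left over; 192 <= 16384 / 16, so the 1/16th waste limit is met and
 * calculate_order() settles on order 2.
 */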
2068
2069/*
2070 * Figure out what the alignment of the objects will be.
2071 */
2072static unsigned long calculate_alignment(unsigned long flags,
2073                unsigned long align, unsigned long size)
2074{
2075        /*
2076         * If the user wants hardware cache aligned objects then follow that
2077         * suggestion if the object is sufficiently large.
2078         *
2079         * The hardware cache alignment cannot override the specified
2080         * alignment though. If that is greater then use it.
2081         */
2082        if (flags & SLAB_HWCACHE_ALIGN) {
2083                unsigned long ralign = cache_line_size();
2084                while (size <= ralign / 2)
2085                        ralign /= 2;
2086                align = max(align, ralign);
2087        }
2088
2089        if (align < ARCH_SLAB_MINALIGN)
2090                align = ARCH_SLAB_MINALIGN;
2091
2092        return ALIGN(align, sizeof(void *));
2093}
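
/*
 * Example, assuming a 64 byte cache line: with SLAB_HWCACHE_ALIGN a
 * 100 byte object is given the full 64 byte alignment, while a 24 byte
 * object only gets 32 bytes (the line size is halved as long as the
 * object still fits in half of it), so small objects are not padded
 * out to a whole cache line.
 */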
2094
2095static void
2096init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
2097{
2098        n->nr_partial = 0;
2099        spin_lock_init(&n->list_lock);
2100        INIT_LIST_HEAD(&n->partial);
2101#ifdef CONFIG_SLUB_DEBUG
2102        atomic_long_set(&n->nr_slabs, 0);
2103        atomic_long_set(&n->total_objects, 0);
2104        INIT_LIST_HEAD(&n->full);
2105#endif
2106}
2107
2108static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
2109{
2110        BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
2111                        SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu));
2112
2113        s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
2114
2115        return s->cpu_slab != NULL;
2116}
2117
2118static struct kmem_cache *kmem_cache_node;
2119
2120/*
2121 * No kmalloc_node yet so do it by hand. We know that this is the first
2122 * slab on the node for this slabcache. There are no concurrent accesses
2123 * possible.
2124 *
2125 * Note that this function only works on the kmem_cache_node cache
2126 * when allocating for the kmem_cache_node cache. This is used for bootstrapping
2127 * memory on a fresh node that has no slab structures yet.
2128 */
2129static void early_kmem_cache_node_alloc(int node)
2130{
2131        struct page *page;
2132        struct kmem_cache_node *n;
2133        unsigned long flags;
2134
2135        BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
2136
2137        page = new_slab(kmem_cache_node, GFP_NOWAIT, node);
2138
2139        BUG_ON(!page);
2140        if (page_to_nid(page) != node) {
2141                printk(KERN_ERR "SLUB: Unable to allocate memory from "
2142                                "node %d\n", node);
2143                printk(KERN_ERR "SLUB: Allocating a useless per node structure "
2144                                "in order to be able to continue\n");
2145        }
2146
2147        n = page->freelist;
2148        BUG_ON(!n);
2149        page->freelist = get_freepointer(kmem_cache_node, n);
2150        page->inuse++;
2151        kmem_cache_node->node[node] = n;
2152#ifdef CONFIG_SLUB_DEBUG
2153        init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
2154        init_tracking(kmem_cache_node, n);
2155#endif
2156        init_kmem_cache_node(n, kmem_cache_node);
2157        inc_slabs_node(kmem_cache_node, node, page->objects);
2158
2159        /*
2160         * lockdep requires consistent irq usage for each lock
2161         * so even though there cannot be a race this early in
2162         * the boot sequence, we still disable irqs.
2163         */
2164        local_irq_save(flags);
2165        add_partial(n, page, 0);
2166        local_irq_restore(flags);
2167}
2168
2169static void free_kmem_cache_nodes(struct kmem_cache *s)
2170{
2171        int node;
2172
2173        for_each_node_state(node, N_NORMAL_MEMORY) {
2174                struct kmem_cache_node *n = s->node[node];
2175
2176                if (n)
2177                        kmem_cache_free(kmem_cache_node, n);
2178
2179                s->node[node] = NULL;
2180        }
2181}
2182
2183static int init_kmem_cache_nodes(struct kmem_cache *s)
2184{
2185        int node;
2186
2187        for_each_node_state(node, N_NORMAL_MEMORY) {
2188                struct kmem_cache_node *n;
2189
2190                if (slab_state == DOWN) {
2191                        early_kmem_cache_node_alloc(node);
2192                        continue;
2193                }
2194                n = kmem_cache_alloc_node(kmem_cache_node,
2195                                                GFP_KERNEL, node);
2196
2197                if (!n) {
2198                        free_kmem_cache_nodes(s);
2199                        return 0;
2200                }
2201
2202                s->node[node] = n;
2203                init_kmem_cache_node(n, s);
2204        }
2205        return 1;
2206}
2207
2208static void set_min_partial(struct kmem_cache *s, unsigned long min)
2209{
2210        if (min < MIN_PARTIAL)
2211                min = MIN_PARTIAL;
2212        else if (min > MAX_PARTIAL)
2213                min = MAX_PARTIAL;
2214        s->min_partial = min;
2215}
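
/*
 * Example, assuming the usual MIN_PARTIAL = 5 and MAX_PARTIAL = 10 bounds:
 * kmem_cache_open() below passes ilog2(s->size), so a 192 byte cache ends
 * up with min_partial = ilog2(192) = 7, a 4096 byte cache computes 12 and
 * is clamped down to 10, and very small caches are raised to the floor of 5.
 */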
2216
2217/*
2218 * calculate_sizes() determines the order and the distribution of data within
2219 * a slab object.
2220 */
2221static int calculate_sizes(struct kmem_cache *s, int forced_order)
2222{
2223        unsigned long flags = s->flags;
2224        unsigned long size = s->objsize;
2225        unsigned long align = s->align;
2226        int order;
2227
2228        /*
2229         * Round up object size to the next word boundary. We can only
2230         * place the free pointer at word boundaries and this determines
2231         * the possible location of the free pointer.
2232         */
2233        size = ALIGN(size, sizeof(void *));
2234
2235#ifdef CONFIG_SLUB_DEBUG
2236        /*
2237         * Determine if we can poison the object itself. If the user of
2238         * the slab may touch the object after free or before allocation
2239         * then we should never poison the object itself.
2240         */
2241        if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) &&
2242                        !s->ctor)
2243                s->flags |= __OBJECT_POISON;
2244        else
2245                s->flags &= ~__OBJECT_POISON;
2246
2247
2248        /*
2249         * If we are Redzoning then check if there is some space between the
2250         * end of the object and the free pointer. If not then add an
2251         * additional word to have some bytes to store Redzone information.
2252         */
2253        if ((flags & SLAB_RED_ZONE) && size == s->objsize)
2254                size += sizeof(void *);
2255#endif
2256
2257        /*
2258         * With that we have determined the number of bytes in actual use
2259         * by the object. This is the potential offset to the free pointer.
2260         */
2261        s->inuse = size;
2262
2263        if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) ||
2264                s->ctor)) {
2265                /*
2266                 * Relocate free pointer after the object if it is not
2267                 * permitted to overwrite the first word of the object on
2268                 * kmem_cache_free.
2269                 *
2270                 * This is the case if we do RCU, have a constructor or
2271                 * are poisoning the objects.
2272                 */
2273                s->offset = size;
2274                size += sizeof(void *);
2275        }
2276
2277#ifdef CONFIG_SLUB_DEBUG
2278        if (flags & SLAB_STORE_USER)
2279                /*
2280                 * Need to store information about allocs and frees after
2281                 * the object.
2282                 */
2283                size += 2 * sizeof(struct track);
2284
2285        if (flags & SLAB_RED_ZONE)
2286                /*
2287                 * Add some empty padding so that we can catch
2288                 * overwrites from earlier objects rather than let
2289                 * tracking information or the free pointer be
2290                 * corrupted if a user writes before the start
2291                 * of the object.
2292                 */
2293                size += sizeof(void *);
2294#endif
2295
2296        /*
2297         * Determine the alignment based on various parameters that the
2298         * user specified and the dynamic determination of cache line size
2299         * on bootup.
2300         */
2301        align = calculate_alignment(flags, align, s->objsize);
2302        s->align = align;
2303
2304        /*
2305         * SLUB stores one object immediately after another beginning from
2306         * offset 0. In order to align the objects we have to simply size
2307         * each object to conform to the alignment.
2308         */
2309        size = ALIGN(size, align);
2310        s->size = size;
2311        if (forced_order >= 0)
2312                order = forced_order;
2313        else
2314                order = calculate_order(size);
2315
2316        if (order < 0)
2317                return 0;
2318
2319        s->allocflags = 0;
2320        if (order)
2321                s->allocflags |= __GFP_COMP;
2322
2323        if (s->flags & SLAB_CACHE_DMA)
2324                s->allocflags |= SLUB_DMA;
2325
2326        if (s->flags & SLAB_RECLAIM_ACCOUNT)
2327                s->allocflags |= __GFP_RECLAIMABLE;
2328
2329        /*
2330         * Determine the number of objects per slab
2331         */
2332        s->oo = oo_make(order, size);
2333        s->min = oo_make(get_order(size), size);
2334        if (oo_objects(s->oo) > oo_objects(s->max))
2335                s->max = s->oo;
2336
2337        return !!oo_objects(s->oo);
2338
2339}
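
/*
 * Layout example, assuming a 64 bit build, 8 byte minimum alignment, no
 * constructor and only SLAB_RED_ZONE set: a 40 byte object first grows to
 * 48 so a red zone word fits behind it (s->inuse = 48).  Neither poisoning,
 * RCU nor a constructor forbids overwriting the first word on free, so the
 * free pointer stays at s->offset = 0 inside the object.  The trailing red
 * zone padding adds one more word, giving s->size = 56.
 */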
2340
2341static int kmem_cache_open(struct kmem_cache *s,
2342                const char *name, size_t size,
2343                size_t align, unsigned long flags,
2344                void (*ctor)(void *))
2345{
2346        memset(s, 0, kmem_size);
2347        s->name = name;
2348        s->ctor = ctor;
2349        s->objsize = size;
2350        s->align = align;
2351        s->flags = kmem_cache_flags(size, flags, name, ctor);
2352
2353        if (!calculate_sizes(s, -1))
2354                goto error;
2355        if (disable_higher_order_debug) {
2356                /*
2357                 * Disable debugging flags that store metadata if the min slab
2358                 * order increased.
2359                 */
2360                if (get_order(s->size) > get_order(s->objsize)) {
2361                        s->flags &= ~DEBUG_METADATA_FLAGS;
2362                        s->offset = 0;
2363                        if (!calculate_sizes(s, -1))
2364                                goto error;
2365                }
2366        }
2367
2368        /*
2369         * The larger the object size is, the more pages we want on the partial
2370         * list to avoid pounding the page allocator excessively.
2371         */
2372        set_min_partial(s, ilog2(s->size));
2373        s->refcount = 1;
2374#ifdef CONFIG_NUMA
2375        s->remote_node_defrag_ratio = 1000;
2376#endif
2377        if (!init_kmem_cache_nodes(s))
2378                goto error;
2379
2380        if (alloc_kmem_cache_cpus(s))
2381                return 1;
2382
2383        free_kmem_cache_nodes(s);
2384error:
2385        if (flags & SLAB_PANIC)
2386                panic("Cannot create slab %s size=%lu realsize=%u "
2387                        "order=%u offset=%u flags=%lx\n",
2388                        s->name, (unsigned long)size, s->size, oo_order(s->oo),
2389                        s->offset, flags);
2390        return 0;
2391}
2392
2393/*
2394 * Determine the size of a slab object
2395 */
2396unsigned int kmem_cache_size(struct kmem_cache *s)
2397{
2398        return s->objsize;
2399}
2400EXPORT_SYMBOL(kmem_cache_size);
2401
2402const char *kmem_cache_name(struct kmem_cache *s)
2403{
2404        return s->name;
2405}
2406EXPORT_SYMBOL(kmem_cache_name);
2407
2408static void list_slab_objects(struct kmem_cache *s, struct page *page,
2409                                                        const char *text)
2410{
2411#ifdef CONFIG_SLUB_DEBUG
2412        void *addr = page_address(page);
2413        void *p;
2414        unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) *
2415                                     sizeof(long), GFP_ATOMIC);
2416        if (!map)
2417                return;
2418        slab_err(s, page, "%s", text);
2419        slab_lock(page);
2420        for_each_free_object(p, s, page->freelist)
2421                set_bit(slab_index(p, s, addr), map);
2422
2423        for_each_object(p, s, addr, page->objects) {
2424
2425                if (!test_bit(slab_index(p, s, addr), map)) {
2426                        printk(KERN_ERR "INFO: Object 0x%p @offset=%tu\n",
2427                                                        p, p - addr);
2428                        print_tracking(s, p);
2429                }
2430        }
2431        slab_unlock(page);
2432        kfree(map);
2433#endif
2434}
2435
2436/*
2437 * Attempt to free all partial slabs on a node.
2438 */
2439static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
2440{
2441        unsigned long flags;
2442        struct page *page, *h;
2443
2444        spin_lock_irqsave(&n->list_lock, flags);
2445        list_for_each_entry_safe(page, h, &n->partial, lru) {
2446                if (!page->inuse) {
2447                        __remove_partial(n, page);
2448                        discard_slab(s, page);
2449                } else {
2450                        list_slab_objects(s, page,
2451                                "Objects remaining on kmem_cache_close()");
2452                }
2453        }
2454        spin_unlock_irqrestore(&n->list_lock, flags);
2455}
2456
2457/*
2458 * Release all resources used by a slab cache.
2459 */
2460static inline int kmem_cache_close(struct kmem_cache *s)
2461{
2462        int node;
2463
2464        flush_all(s);
2465        free_percpu(s->cpu_slab);
2466        /* Attempt to free all objects */
2467        for_each_node_state(node, N_NORMAL_MEMORY) {
2468                struct kmem_cache_node *n = get_node(s, node);
2469
2470                free_partial(s, n);
2471                if (n->nr_partial || slabs_node(s, node))
2472                        return 1;
2473        }
2474        free_kmem_cache_nodes(s);
2475        return 0;
2476}
2477
2478/*
2479 * Close a cache and release the kmem_cache structure
2480 * (must be used for caches created using kmem_cache_create)
2481 */
2482void kmem_cache_destroy(struct kmem_cache *s)
2483{
2484        down_write(&slub_lock);
2485        s->refcount--;
2486        if (!s->refcount) {
2487                list_del(&s->list);
2488                if (kmem_cache_close(s)) {
2489                        printk(KERN_ERR "SLUB %s: %s called for cache that "
2490                                "still has objects.\n", s->name, __func__);
2491                        dump_stack();
2492                }
2493                if (s->flags & SLAB_DESTROY_BY_RCU)
2494                        rcu_barrier();
2495                sysfs_slab_remove(s);
2496        }
2497        up_write(&slub_lock);
2498}
2499EXPORT_SYMBOL(kmem_cache_destroy);
2500
2501/********************************************************************
2502 *              Kmalloc subsystem
2503 *******************************************************************/
2504
2505struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT];
2506EXPORT_SYMBOL(kmalloc_caches);
2507
2508static struct kmem_cache *kmem_cache;
2509
2510#ifdef CONFIG_ZONE_DMA
2511static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT];
2512#endif
2513
2514static int __init setup_slub_min_order(char *str)
2515{
2516        get_option(&str, &slub_min_order);
2517
2518        return 1;
2519}
2520
2521__setup("slub_min_order=", setup_slub_min_order);
2522
2523static int __init setup_slub_max_order(char *str)
2524{
2525        get_option(&str, &slub_max_order);
2526        slub_max_order = min(slub_max_order, MAX_ORDER - 1);
2527
2528        return 1;
2529}
2530
2531__setup("slub_max_order=", setup_slub_max_order);
2532
2533static int __init setup_slub_min_objects(char *str)
2534{
2535        get_option(&str, &slub_min_objects);
2536
2537        return 1;
2538}
2539
2540__setup("slub_min_objects=", setup_slub_min_objects);
2541
2542static int __init setup_slub_nomerge(char *str)
2543{
2544        slub_nomerge = 1;
2545        return 1;
2546}
2547
2548__setup("slub_nomerge", setup_slub_nomerge);
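
/*
 * These knobs are boot parameters; a hypothetical command line such as
 *
 *	slub_min_order=1 slub_max_order=3 slub_min_objects=16 slub_nomerge
 *
 * forces at least order 1 slabs, caps the slab order at 3, asks for at
 * least 16 objects per slab and disables cache merging.
 */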
2549
2550static struct kmem_cache *__init create_kmalloc_cache(const char *name,
2551                                                int size, unsigned int flags)
2552{
2553        struct kmem_cache *s;
2554
2555        s = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
2556
2557        /*
2558         * This function is called with IRQs disabled during early-boot on
2559         * single CPU so there's no need to take slub_lock here.
2560         */
2561        if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN,
2562                                                                flags, NULL))
2563                goto panic;
2564
2565        list_add(&s->list, &slab_caches);
2566        return s;
2567
2568panic:
2569        panic("Creation of kmalloc slab %s size=%d failed.\n", name, size);
2570        return NULL;
2571}
2572
2573/*
2574 * Conversion table for small slab sizes / 8 to the index in the
2575 * kmalloc array. This is necessary for slabs < 192 since we have non power
2576 * of two cache sizes there. The size of larger slabs can be determined using
2577 * fls.
2578 */
2579static s8 size_index[24] = {
2580        3,      /* 8 */
2581        4,      /* 16 */
2582        5,      /* 24 */
2583        5,      /* 32 */
2584        6,      /* 40 */
2585        6,      /* 48 */
2586        6,      /* 56 */
2587        6,      /* 64 */
2588        1,      /* 72 */
2589        1,      /* 80 */
2590        1,      /* 88 */
2591        1,      /* 96 */
2592        7,      /* 104 */
2593        7,      /* 112 */
2594        7,      /* 120 */
2595        7,      /* 128 */
2596        2,      /* 136 */
2597        2,      /* 144 */
2598        2,      /* 152 */
2599        2,      /* 160 */
2600        2,      /* 168 */
2601        2,      /* 176 */
2602        2,      /* 184 */
2603        2       /* 192 */
2604};
2605
2606static inline int size_index_elem(size_t bytes)
2607{
2608        return (bytes - 1) / 8;
2609}
2610
2611static struct kmem_cache *get_slab(size_t size, gfp_t flags)
2612{
2613        int index;
2614
2615        if (size <= 192) {
2616                if (!size)
2617                        return ZERO_SIZE_PTR;
2618
2619                index = size_index[size_index_elem(size)];
2620        } else
2621                index = fls(size - 1);
2622
2623#ifdef CONFIG_ZONE_DMA
2624        if (unlikely((flags & SLUB_DMA)))
2625                return kmalloc_dma_caches[index];
2626
2627#endif
2628        return kmalloc_caches[index];
2629}
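
/*
 * Example lookups with the default tables above (and a small
 * KMALLOC_MIN_SIZE): a 100 byte request maps to
 * size_index[(100 - 1) / 8] = size_index[12] = 7, the kmalloc-128 cache;
 * an 80 byte request maps to index 1, the kmalloc-96 cache.  Requests
 * above 192 bytes use fls() instead, so 1000 bytes gives fls(999) = 10
 * and the kmalloc-1024 cache.
 */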
2630
2631void *__kmalloc(size_t size, gfp_t flags)
2632{
2633        struct kmem_cache *s;
2634        void *ret;
2635
2636        if (unlikely(size > SLUB_MAX_SIZE))
2637                return kmalloc_large(size, flags);
2638
2639        s = get_slab(size, flags);
2640
2641        if (unlikely(ZERO_OR_NULL_PTR(s)))
2642                return s;
2643
2644        ret = slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_);
2645
2646        trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
2647
2648        return ret;
2649}
2650EXPORT_SYMBOL(__kmalloc);
2651
2652#ifdef CONFIG_NUMA
2653static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
2654{
2655        struct page *page;
2656        void *ptr = NULL;
2657
2658        flags |= __GFP_COMP | __GFP_NOTRACK;
2659        page = alloc_pages_node(node, flags, get_order(size));
2660        if (page)
2661                ptr = page_address(page);
2662
2663        kmemleak_alloc(ptr, size, 1, flags);
2664        return ptr;
2665}
2666
2667void *__kmalloc_node(size_t size, gfp_t flags, int node)
2668{
2669        struct kmem_cache *s;
2670        void *ret;
2671
2672        if (unlikely(size > SLUB_MAX_SIZE)) {
2673                ret = kmalloc_large_node(size, flags, node);
2674
2675                trace_kmalloc_node(_RET_IP_, ret,
2676                                   size, PAGE_SIZE << get_order(size),
2677                                   flags, node);
2678
2679                return ret;
2680        }
2681
2682        s = get_slab(size, flags);
2683
2684        if (unlikely(ZERO_OR_NULL_PTR(s)))
2685                return s;
2686
2687        ret = slab_alloc(s, flags, node, _RET_IP_);
2688
2689        trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
2690
2691        return ret;
2692}
2693EXPORT_SYMBOL(__kmalloc_node);
2694#endif
2695
2696size_t ksize(const void *object)
2697{
2698        struct page *page;
2699        struct kmem_cache *s;
2700
2701        if (unlikely(object == ZERO_SIZE_PTR))
2702                return 0;
2703
2704        page = virt_to_head_page(object);
2705
2706        if (unlikely(!PageSlab(page))) {
2707                WARN_ON(!PageCompound(page));
2708                return PAGE_SIZE << compound_order(page);
2709        }
2710        s = page->slab;
2711
2712#ifdef CONFIG_SLUB_DEBUG
2713        /*
2714         * Debugging requires use of the padding between object
2715         * and whatever may come after it.
2716         */
2717        if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
2718                return s->objsize;
2719
2720#endif
2721        /*
2722         * If we have the need to store the freelist pointer
2723         * back there or track user information then we can
2724         * only use the space before that information.
2725         */
2726        if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
2727                return s->inuse;
2728        /*
2729         * Else we can use all the padding etc for the allocation
2730         */
2731        return s->size;
2732}
2733EXPORT_SYMBOL(ksize);
2734
2735void kfree(const void *x)
2736{
2737        struct page *page;
2738        void *object = (void *)x;
2739
2740        trace_kfree(_RET_IP_, x);
2741
2742        if (unlikely(ZERO_OR_NULL_PTR(x)))
2743                return;
2744
2745        page = virt_to_head_page(x);
2746        if (unlikely(!PageSlab(page))) {
2747                BUG_ON(!PageCompound(page));
2748                kmemleak_free(x);
2749                put_page(page);
2750                return;
2751        }
2752        slab_free(page->slab, page, object, _RET_IP_);
2753}
2754EXPORT_SYMBOL(kfree);
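
/*
 * Illustrative sketch, not part of slub.c: a kmalloc() request is rounded
 * up to one of the kmalloc caches, so ksize() may report more usable space
 * than was asked for.  The function name is hypothetical.
 */
#if 0
#include <linux/slab.h>
#include <linux/string.h>

static int example_kmalloc_use(void)
{
	char *buf = kmalloc(100, GFP_KERNEL);	/* served from kmalloc-128 */

	if (!buf)
		return -ENOMEM;

	/* ksize(buf) is at least 100; the whole usable area may be used */
	memset(buf, 0, ksize(buf));
	kfree(buf);
	return 0;
}
#endif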
2755
2756/*
2757 * kmem_cache_shrink removes empty slabs from the partial lists and sorts
2758 * the remaining slabs by the number of items in use. The slabs with the
2759 * most items in use come first. New allocations will then fill those up
2760 * and thus they can be removed from the partial lists.
2761 *
2762 * The slabs with the least items in use are placed last. This results in them
2763 * being allocated from last, increasing the chance that their remaining objects
2764 * get freed and the slabs can eventually be discarded.
2765 */
2766int kmem_cache_shrink(struct kmem_cache *s)
2767{
2768        int node;
2769        int i;
2770        struct kmem_cache_node *n;
2771        struct page *page;
2772        struct page *t;
2773        int objects = oo_objects(s->max);
2774        struct list_head *slabs_by_inuse =
2775                kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL);
2776        unsigned long flags;
2777
2778        if (!slabs_by_inuse)
2779                return -ENOMEM;
2780
2781        flush_all(s);
2782        for_each_node_state(node, N_NORMAL_MEMORY) {
2783                n = get_node(s, node);
2784
2785                if (!n->nr_partial)
2786                        continue;
2787
2788                for (i = 0; i < objects; i++)
2789                        INIT_LIST_HEAD(slabs_by_inuse + i);
2790
2791                spin_lock_irqsave(&n->list_lock, flags);
2792
2793                /*
2794                 * Build lists indexed by the items in use in each slab.
2795                 *
2796                 * Note that concurrent frees may occur while we hold the
2797                 * list_lock. page->inuse here is the upper limit.
2798                 */
2799                list_for_each_entry_safe(page, t, &n->partial, lru) {
2800                        if (!page->inuse && slab_trylock(page)) {
2801                                /*
2802                                 * Must hold slab lock here because slab_free
2803                                 * may have freed the last object and be
2804                                 * waiting to release the slab.
2805                                 */
2806                                __remove_partial(n, page);
2807                                slab_unlock(page);
2808                                discard_slab(s, page);
2809                        } else {
2810                                list_move(&page->lru,
2811                                          slabs_by_inuse + page->inuse);
2812                        }
2813                }
2814
2815                /*
2816                 * Rebuild the partial list with the slabs filled up most
2817                 * first and the least used slabs at the end.
2818                 */
2819                for (i = objects - 1; i >= 0; i--)
2820                        list_splice(slabs_by_inuse + i, n->partial.prev);
2821
2822                spin_unlock_irqrestore(&n->list_lock, flags);
2823        }
2824
2825        kfree(slabs_by_inuse);
2826        return 0;
2827}
2828EXPORT_SYMBOL(kmem_cache_shrink);
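
/*
 * Illustrative sketch, not part of slub.c: a subsystem that has just
 * released a large batch of objects can hand fully empty slabs back to
 * the page allocator by shrinking its cache.  The cache pointer is
 * assumed to be a cache that the caller owns.
 */
#if 0
#include <linux/slab.h>

static void example_after_bulk_free(struct kmem_cache *cachep)
{
	/* Discards empty partial slabs and sorts the rest by fill level */
	kmem_cache_shrink(cachep);
}
#endif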
2829
2830#if defined(CONFIG_MEMORY_HOTPLUG)
2831static int slab_mem_going_offline_callback(void *arg)
2832{
2833        struct kmem_cache *s;
2834
2835        down_read(&slub_lock);
2836        list_for_each_entry(s, &slab_caches, list)
2837                kmem_cache_shrink(s);
2838        up_read(&slub_lock);
2839
2840        return 0;
2841}
2842
2843static void slab_mem_offline_callback(void *arg)
2844{
2845        struct kmem_cache_node *n;
2846        struct kmem_cache *s;
2847        struct memory_notify *marg = arg;
2848        int offline_node;
2849
2850        offline_node = marg->status_change_nid;
2851
2852        /*
2853         * If the node still has available memory then we still need its
2854         * kmem_cache_node structure, so there is nothing to do here.
2855         */
2856        if (offline_node < 0)
2857                return;
2858
2859        down_read(&slub_lock);
2860        list_for_each_entry(s, &slab_caches, list) {
2861                n = get_node(s, offline_node);
2862                if (n) {
2863                        /*
2864                         * if n->nr_slabs > 0, slabs still exist on the node
2865                         * that is going down. We were unable to free them,
2866                         * and the offline_pages() function should not have called
2867                         * this callback. So, we must fail.
2868                         */
2869                        BUG_ON(slabs_node(s, offline_node));
2870
2871                        s->node[offline_node] = NULL;
2872                        kmem_cache_free(kmem_cache_node, n);
2873                }
2874        }
2875        up_read(&slub_lock);
2876}
2877
2878static int slab_mem_going_online_callback(void *arg)
2879{
2880        struct kmem_cache_node *n;
2881        struct kmem_cache *s;
2882        struct memory_notify *marg = arg;
2883        int nid = marg->status_change_nid;
2884        int ret = 0;
2885
2886        /*
2887         * If the node's memory is already available, then kmem_cache_node is
2888         * already created. Nothing to do.
2889         */
2890        if (nid < 0)
2891                return 0;
2892
2893        /*
2894         * We are bringing a node online. No memory is available yet. We must
2895         * allocate a kmem_cache_node structure in order to bring the node
2896         * online.
2897         */
2898        down_read(&slub_lock);
2899        list_for_each_entry(s, &slab_caches, list) {
2900                /*
2901                 * XXX: kmem_cache_alloc_node will fall back to other nodes
2902                 *      since memory is not yet available from the node that
2903                 *      is being brought up.
2904                 */
2905                n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL);
2906                if (!n) {
2907                        ret = -ENOMEM;
2908                        goto out;
2909                }
2910                init_kmem_cache_node(n, s);
2911                s->node[nid] = n;
2912        }
2913out:
2914        up_read(&slub_lock);
2915        return ret;
2916}
2917
2918static int slab_memory_callback(struct notifier_block *self,
2919                                unsigned long action, void *arg)
2920{
2921        int ret = 0;
2922
2923        switch (action) {
2924        case MEM_GOING_ONLINE:
2925                ret = slab_mem_going_online_callback(arg);
2926                break;
2927        case MEM_GOING_OFFLINE:
2928                ret = slab_mem_going_offline_callback(arg);
2929                break;
2930        case MEM_OFFLINE:
2931        case MEM_CANCEL_ONLINE:
2932                slab_mem_offline_callback(arg);
2933                break;
2934        case MEM_ONLINE:
2935        case MEM_CANCEL_OFFLINE:
2936                break;
2937        }
2938        if (ret)
2939                ret = notifier_from_errno(ret);
2940        else
2941                ret = NOTIFY_OK;
2942        return ret;
2943}
2944
2945#endif /* CONFIG_MEMORY_HOTPLUG */
2946
2947/********************************************************************
2948 *                      Basic setup of slabs
2949 *******************************************************************/
2950
2951/*
2952 * Used for early kmem_cache structures that were allocated using
2953 * the page allocator
2954 */
2955
2956static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s)
2957{
2958        int node;
2959
2960        list_add(&s->list, &slab_caches);
2961        s->refcount = -1;
2962
2963        for_each_node_state(node, N_NORMAL_MEMORY) {
2964                struct kmem_cache_node *n = get_node(s, node);
2965                struct page *p;
2966
2967                if (n) {
2968                        list_for_each_entry(p, &n->partial, lru)
2969                                p->slab = s;
2970
2971#ifdef CONFIG_SLUB_DEBUG
2972                        list_for_each_entry(p, &n->full, lru)
2973                                p->slab = s;
2974#endif
2975                }
2976        }
2977}
2978
2979void __init kmem_cache_init(void)
2980{
2981        int i;
2982        int caches = 0;
2983        struct kmem_cache *temp_kmem_cache;
2984        int order;
2985        struct kmem_cache *temp_kmem_cache_node;
2986        unsigned long kmalloc_size;
2987
2988        kmem_size = offsetof(struct kmem_cache, node) +
2989                                nr_node_ids * sizeof(struct kmem_cache_node *);
2990
2991        /* Allocate two kmem_caches from the page allocator */
2992        kmalloc_size = ALIGN(kmem_size, cache_line_size());
2993        order = get_order(2 * kmalloc_size);
2994        kmem_cache = (void *)__get_free_pages(GFP_NOWAIT, order);
2995
2996        /*
2997         * Must first have the slab cache available for the allocations of the
2998         * struct kmem_cache_node's. There is special bootstrap code in
2999         * kmem_cache_open for slab_state == DOWN.
3000         */
3001        kmem_cache_node = (void *)kmem_cache + kmalloc_size;
3002
3003        kmem_cache_open(kmem_cache_node, "kmem_cache_node",
3004                sizeof(struct kmem_cache_node),
3005                0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
3006
3007        hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
3008
3009        /* Able to allocate the per node structures */
3010        slab_state = PARTIAL;
3011
3012        temp_kmem_cache = kmem_cache;
3013        kmem_cache_open(kmem_cache, "kmem_cache", kmem_size,
3014                0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
3015        kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
3016        memcpy(kmem_cache, temp_kmem_cache, kmem_size);
3017
3018        /*
3019         * Allocate kmem_cache_node properly from the kmem_cache slab.
3020         * kmem_cache_node is separately allocated so no need to
3021         * update any list pointers.
3022         */
3023        temp_kmem_cache_node = kmem_cache_node;
3024
3025        kmem_cache_node = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
3026        memcpy(kmem_cache_node, temp_kmem_cache_node, kmem_size);
3027
3028        kmem_cache_bootstrap_fixup(kmem_cache_node);
3029
3030        caches++;
3031        kmem_cache_bootstrap_fixup(kmem_cache);
3032        caches++;
3033        /* Free temporary boot structure */
3034        free_pages((unsigned long)temp_kmem_cache, order);
3035
3036        /* Now we can use the kmem_cache to allocate kmalloc slabs */
3037
3038        /*
3039         * Patch up the size_index table if we have strange large alignment
3040         * requirements for the kmalloc array. This is only the case for
3041         * MIPS it seems. The standard arches will not generate any code here.
3042         *
3043         * Largest permitted alignment is 256 bytes due to the way we
3044         * handle the index determination for the smaller caches.
3045         *
3046         * Make sure that nothing crazy happens if someone starts tinkering
3047         * around with ARCH_KMALLOC_MINALIGN
3048         */
3049        BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
3050                (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
3051
3052        for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
3053                int elem = size_index_elem(i);
3054                if (elem >= ARRAY_SIZE(size_index))
3055                        break;
3056                size_index[elem] = KMALLOC_SHIFT_LOW;
3057        }
3058
3059        if (KMALLOC_MIN_SIZE == 64) {
3060                /*
3061                 * The 96 byte size cache is not used if the alignment
3062                 * is 64 byte.
3063                 */
3064                for (i = 64 + 8; i <= 96; i += 8)
3065                        size_index[size_index_elem(i)] = 7;
3066        } else if (KMALLOC_MIN_SIZE == 128) {
3067                /*
3068                 * The 192 byte sized cache is not used if the alignment
3069                 * is 128 byte. Redirect kmalloc to use the 256 byte cache
3070                 * instead.
3071                 */
3072                for (i = 128 + 8; i <= 192; i += 8)
3073                        size_index[size_index_elem(i)] = 8;
3074        }
3075
3076        /* Caches that are not of the two-to-the-power-of size */
3077        if (KMALLOC_MIN_SIZE <= 32) {
3078                kmalloc_caches[1] = create_kmalloc_cache("kmalloc-96", 96, 0);
3079                caches++;
3080        }
3081
3082        if (KMALLOC_MIN_SIZE <= 64) {
3083                kmalloc_caches[2] = create_kmalloc_cache("kmalloc-192", 192, 0);
3084                caches++;
3085        }
3086
3087        for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
3088                kmalloc_caches[i] = create_kmalloc_cache("kmalloc", 1 << i, 0);
3089                caches++;
3090        }
3091
3092        slab_state = UP;
3093
3094        /* Provide the correct kmalloc names now that the caches are up */
3095        if (KMALLOC_MIN_SIZE <= 32) {
3096                kmalloc_caches[1]->name = kstrdup(kmalloc_caches[1]->name, GFP_NOWAIT);
3097                BUG_ON(!kmalloc_caches[1]->name);
3098        }
3099
3100        if (KMALLOC_MIN_SIZE <= 64) {
3101                kmalloc_caches[2]->name = kstrdup(kmalloc_caches[2]->name, GFP_NOWAIT);
3102                BUG_ON(!kmalloc_caches[2]->name);
3103        }
3104
3105        for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
3106                char *s = kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i);
3107
3108                BUG_ON(!s);
3109                kmalloc_caches[i]->name = s;
3110        }
3111
3112#ifdef CONFIG_SMP
3113        register_cpu_notifier(&slab_notifier);
3114#endif
3115
3116#ifdef CONFIG_ZONE_DMA
3117        for (i = 0; i < SLUB_PAGE_SHIFT; i++) {
3118                struct kmem_cache *s = kmalloc_caches[i];
3119
3120                if (s && s->size) {
3121                        char *name = kasprintf(GFP_NOWAIT,
3122                                 "dma-kmalloc-%d", s->objsize);
3123
3124                        BUG_ON(!name);
3125                        kmalloc_dma_caches[i] = create_kmalloc_cache(name,
3126                                s->objsize, SLAB_CACHE_DMA);
3127                }
3128        }
3129#endif
3130        printk(KERN_INFO
3131                "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
3132                " CPUs=%d, Nodes=%d\n",
3133                caches, cache_line_size(),
3134                slub_min_order, slub_max_order, slub_min_objects,
3135                nr_cpu_ids, nr_node_ids);
3136}
3137
3138void __init kmem_cache_init_late(void)
3139{
3140}
3141
3142/*
3143 * Find a mergeable slab cache
3144 */
3145static int slab_unmergeable(struct kmem_cache *s)
3146{
3147        if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE))
3148                return 1;
3149
3150        if (s->ctor)
3151                return 1;
3152
3153        /*
3154         * We may have set a slab to be unmergeable during bootstrap.
3155         */
3156        if (s->refcount < 0)
3157                return 1;
3158
3159        return 0;
3160}
3161
3162static struct kmem_cache *find_mergeable(size_t size,
3163                size_t align, unsigned long flags, const char *name,
3164                void (*ctor)(void *))
3165{
3166        struct kmem_cache *s;
3167
3168        if (slub_nomerge || (flags & SLUB_NEVER_MERGE))
3169                return NULL;
3170
3171        if (ctor)
3172                return NULL;
3173
3174        size = ALIGN(size, sizeof(void *));
3175        align = calculate_alignment(flags, align, size);
3176        size = ALIGN(size, align);
3177        flags = kmem_cache_flags(size, flags, name, NULL);
3178
3179        list_for_each_entry(s, &slab_caches, list) {
3180                if (slab_unmergeable(s))
3181                        continue;
3182
3183                if (size > s->size)
3184                        continue;
3185
3186                if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME))
3187                                continue;
3188                /*
3189                 * Check if alignment is compatible.
3190                 * Courtesy of Adrian Drzewiecki
3191                 */
3192                if ((s->size & ~(align - 1)) != s->size)
3193                        continue;
3194
3195                if (s->size - size >= sizeof(void *))
3196                        continue;
3197
3198                return s;
3199        }
3200        return NULL;
3201}
3202
3203struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3204                size_t align, unsigned long flags, void (*ctor)(void *))
3205{
3206        struct kmem_cache *s;
3207        char *n;
3208
3209        if (WARN_ON(!name))
3210                return NULL;
3211
3212        down_write(&slub_lock);
3213        s = find_mergeable(size, align, flags, name, ctor);
3214        if (s) {
3215                s->refcount++;
3216                /*
3217                 * Adjust the object sizes so that we clear
3218                 * the complete object on kzalloc.
3219                 */
3220                s->objsize = max(s->objsize, (int)size);
3221                s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
3222
3223                if (sysfs_slab_alias(s, name)) {
3224                        s->refcount--;
3225                        goto err;
3226                }
3227                up_write(&slub_lock);
3228                return s;
3229        }
3230
3231        n = kstrdup(name, GFP_KERNEL);
3232        if (!n)
3233                goto err;
3234
3235        s = kmalloc(kmem_size, GFP_KERNEL);
3236        if (s) {
3237                if (kmem_cache_open(s, n,
3238                                size, align, flags, ctor)) {
3239                        list_add(&s->list, &slab_caches);
3240                        if (sysfs_slab_add(s)) {
3241                                list_del(&s->list);
3242                                kfree(n);
3243                                kfree(s);
3244                                goto err;
3245                        }
3246                        up_write(&slub_lock);
3247                        return s;
3248                }
3249                kfree(n);
3250                kfree(s);
3251        }
3252err:
3253        up_write(&slub_lock);
3254
3255        if (flags & SLAB_PANIC)
3256                panic("Cannot create slabcache %s\n", name);
3257        else
3258                s = NULL;
3259        return s;
3260}
3261EXPORT_SYMBOL(kmem_cache_create);
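
/*
 * Typical client usage (illustrative sketch; the "foo" cache and its
 * constructor are made up):
 *
 *	struct foo {
 *		int refcnt;
 *		struct list_head list;
 *	};
 *	static struct kmem_cache *foo_cache;
 *
 *	static void foo_ctor(void *obj)
 *	{
 *		struct foo *f = obj;
 *
 *		INIT_LIST_HEAD(&f->list);
 *	}
 *
 *	foo_cache = kmem_cache_create("foo", sizeof(struct foo), 0,
 *					SLAB_HWCACHE_ALIGN, foo_ctor);
 *
 * Passing a constructor makes the cache unmergeable (see
 * slab_unmergeable() above). A ctor-less cache of similar size and
 * flags may instead be aliased to an existing cache, in which case
 * only a sysfs symlink is created for the new name.
 */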
3262
3263#ifdef CONFIG_SMP
3264/*
3265 * Use the cpu notifier to ensure that the cpu slabs are flushed when
3266 * necessary.
3267 */
3268static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
3269                unsigned long action, void *hcpu)
3270{
3271        long cpu = (long)hcpu;
3272        struct kmem_cache *s;
3273        unsigned long flags;
3274
3275        switch (action) {
3276        case CPU_UP_CANCELED:
3277        case CPU_UP_CANCELED_FROZEN:
3278        case CPU_DEAD:
3279        case CPU_DEAD_FROZEN:
3280                down_read(&slub_lock);
3281                list_for_each_entry(s, &slab_caches, list) {
3282                        local_irq_save(flags);
3283                        __flush_cpu_slab(s, cpu);
3284                        local_irq_restore(flags);
3285                }
3286                up_read(&slub_lock);
3287                break;
3288        default:
3289                break;
3290        }
3291        return NOTIFY_OK;
3292}
3293
3294static struct notifier_block __cpuinitdata slab_notifier = {
3295        .notifier_call = slab_cpuup_callback
3296};
3297
3298#endif
3299
3300void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
3301{
3302        struct kmem_cache *s;
3303        void *ret;
3304
3305        if (unlikely(size > SLUB_MAX_SIZE))
3306                return kmalloc_large(size, gfpflags);
3307
3308        s = get_slab(size, gfpflags);
3309
3310        if (unlikely(ZERO_OR_NULL_PTR(s)))
3311                return s;
3312
3313        ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, caller);
3314
3315        /* Honor the call site pointer we received. */
3316        trace_kmalloc(caller, ret, size, s->size, gfpflags);
3317
3318        return ret;
3319}
3320
3321#ifdef CONFIG_NUMA
3322void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
3323                                        int node, unsigned long caller)
3324{
3325        struct kmem_cache *s;
3326        void *ret;
3327
3328        if (unlikely(size > SLUB_MAX_SIZE)) {
3329                ret = kmalloc_large_node(size, gfpflags, node);
3330
3331                trace_kmalloc_node(caller, ret,
3332                                   size, PAGE_SIZE << get_order(size),
3333                                   gfpflags, node);
3334
3335                return ret;
3336        }
3337
3338        s = get_slab(size, gfpflags);
3339
3340        if (unlikely(ZERO_OR_NULL_PTR(s)))
3341                return s;
3342
3343        ret = slab_alloc(s, gfpflags, node, caller);
3344
3345        /* Honor the call site pointer we received. */
3346        trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
3347
3348        return ret;
3349}
3350#endif
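
/*
 * The two functions above back the kmalloc_track_caller() and
 * kmalloc_node_track_caller() wrappers in <linux/slab.h>, which pass
 * _RET_IP_ so that thin helpers show up under their caller in the
 * alloc_calls statistics instead of under the helper itself. Minimal
 * sketch of such a helper (kstrdup() in mm/util.c follows this
 * pattern):
 *
 *	char *my_strdup(const char *s, gfp_t gfp)
 *	{
 *		size_t len = strlen(s) + 1;
 *		char *buf = kmalloc_track_caller(len, gfp);
 *
 *		if (buf)
 *			memcpy(buf, s, len);
 *		return buf;
 *	}
 */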
3351
3352#ifdef CONFIG_SYSFS
3353static int count_inuse(struct page *page)
3354{
3355        return page->inuse;
3356}
3357
3358static int count_total(struct page *page)
3359{
3360        return page->objects;
3361}
3362#endif
3363
3364#ifdef CONFIG_SLUB_DEBUG
3365static int validate_slab(struct kmem_cache *s, struct page *page,
3366                                                unsigned long *map)
3367{
3368        void *p;
3369        void *addr = page_address(page);
3370
3371        if (!check_slab(s, page) ||
3372                        !on_freelist(s, page, NULL))
3373                return 0;
3374
3375        /* Now we know that a valid freelist exists */
3376        bitmap_zero(map, page->objects);
3377
3378        for_each_free_object(p, s, page->freelist) {
3379                set_bit(slab_index(p, s, addr), map);
3380                if (!check_object(s, page, p, SLUB_RED_INACTIVE))
3381                        return 0;
3382        }
3383
3384        for_each_object(p, s, addr, page->objects)
3385                if (!test_bit(slab_index(p, s, addr), map))
3386                        if (!check_object(s, page, p, SLUB_RED_ACTIVE))
3387                                return 0;
3388        return 1;
3389}
3390
3391static void validate_slab_slab(struct kmem_cache *s, struct page *page,
3392                                                unsigned long *map)
3393{
3394        if (slab_trylock(page)) {
3395                validate_slab(s, page, map);
3396                slab_unlock(page);
3397        } else
3398                printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n",
3399                        s->name, page);
3400}
3401
3402static int validate_slab_node(struct kmem_cache *s,
3403                struct kmem_cache_node *n, unsigned long *map)
3404{
3405        unsigned long count = 0;
3406        struct page *page;
3407        unsigned long flags;
3408
3409        spin_lock_irqsave(&n->list_lock, flags);
3410
3411        list_for_each_entry(page, &n->partial, lru) {
3412                validate_slab_slab(s, page, map);
3413                count++;
3414        }
3415        if (count != n->nr_partial)
3416                printk(KERN_ERR "SLUB %s: %ld partial slabs counted but "
3417                        "counter=%ld\n", s->name, count, n->nr_partial);
3418
3419        if (!(s->flags & SLAB_STORE_USER))
3420                goto out;
3421
3422        list_for_each_entry(page, &n->full, lru) {
3423                validate_slab_slab(s, page, map);
3424                count++;
3425        }
3426        if (count != atomic_long_read(&n->nr_slabs))
3427                printk(KERN_ERR "SLUB: %s %ld slabs counted but "
3428                        "counter=%ld\n", s->name, count,
3429                        atomic_long_read(&n->nr_slabs));
3430
3431out:
3432        spin_unlock_irqrestore(&n->list_lock, flags);
3433        return count;
3434}
3435
3436static long validate_slab_cache(struct kmem_cache *s)
3437{
3438        int node;
3439        unsigned long count = 0;
3440        unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) *
3441                                sizeof(unsigned long), GFP_KERNEL);
3442
3443        if (!map)
3444                return -ENOMEM;
3445
3446        flush_all(s);
3447        for_each_node_state(node, N_NORMAL_MEMORY) {
3448                struct kmem_cache_node *n = get_node(s, node);
3449
3450                count += validate_slab_node(s, n, map);
3451        }
3452        kfree(map);
3453        return count;
3454}
3455/*
3456 * Generate lists of code addresses where slabcache objects are allocated
3457 * and freed.
3458 */
3459
3460struct location {
3461        unsigned long count;
3462        unsigned long addr;
3463        long long sum_time;
3464        long min_time;
3465        long max_time;
3466        long min_pid;
3467        long max_pid;
3468        DECLARE_BITMAP(cpus, NR_CPUS);
3469        nodemask_t nodes;
3470};
3471
3472struct loc_track {
3473        unsigned long max;
3474        unsigned long count;
3475        struct location *loc;
3476};
3477
3478static void free_loc_track(struct loc_track *t)
3479{
3480        if (t->max)
3481                free_pages((unsigned long)t->loc,
3482                        get_order(sizeof(struct location) * t->max));
3483}
3484
3485static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
3486{
3487        struct location *l;
3488        int order;
3489
3490        order = get_order(sizeof(struct location) * max);
3491
3492        l = (void *)__get_free_pages(flags, order);
3493        if (!l)
3494                return 0;
3495
3496        if (t->count) {
3497                memcpy(l, t->loc, sizeof(struct location) * t->count);
3498                free_loc_track(t);
3499        }
3500        t->max = max;
3501        t->loc = l;
3502        return 1;
3503}
3504
3505static int add_location(struct loc_track *t, struct kmem_cache *s,
3506                                const struct track *track)
3507{
3508        long start, end, pos;
3509        struct location *l;
3510        unsigned long caddr;
3511        unsigned long age = jiffies - track->when;
3512
3513        start = -1;
3514        end = t->count;
3515
3516        for ( ; ; ) {
3517                pos = start + (end - start + 1) / 2;
3518
3519                /*
3520                 * There is nothing at "end". If we end up there
3521                 * we need to insert before "end".
3522                 */
3523                if (pos == end)
3524                        break;
3525
3526                caddr = t->loc[pos].addr;
3527                if (track->addr == caddr) {
3528
3529                        l = &t->loc[pos];
3530                        l->count++;
3531                        if (track->when) {
3532                                l->sum_time += age;
3533                                if (age < l->min_time)
3534                                        l->min_time = age;
3535                                if (age > l->max_time)
3536                                        l->max_time = age;
3537
3538                                if (track->pid < l->min_pid)
3539                                        l->min_pid = track->pid;
3540                                if (track->pid > l->max_pid)
3541                                        l->max_pid = track->pid;
3542
3543                                cpumask_set_cpu(track->cpu,
3544                                                to_cpumask(l->cpus));
3545                        }
3546                        node_set(page_to_nid(virt_to_page(track)), l->nodes);
3547                        return 1;
3548                }
3549
3550                if (track->addr < caddr)
3551                        end = pos;
3552                else
3553                        start = pos;
3554        }
3555
3556        /*
3557         * Not found. Insert new tracking element.
3558         */
3559        if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC))
3560                return 0;
3561
3562        l = t->loc + pos;
3563        if (pos < t->count)
3564                memmove(l + 1, l,
3565                        (t->count - pos) * sizeof(struct location));
3566        t->count++;
3567        l->count = 1;
3568        l->addr = track->addr;
3569        l->sum_time = age;
3570        l->min_time = age;
3571        l->max_time = age;
3572        l->min_pid = track->pid;
3573        l->max_pid = track->pid;
3574        cpumask_clear(to_cpumask(l->cpus));
3575        cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
3576        nodes_clear(l->nodes);
3577        node_set(page_to_nid(virt_to_page(track)), l->nodes);
3578        return 1;
3579}
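
/*
 * The loop above is a binary search over an array kept sorted by
 * caller address. Simplified model (illustrative only) of how the
 * insertion slot for addr is found in a sorted array a[0..count-1]:
 *
 *	long start = -1, end = count, pos;
 *
 *	for ( ; ; ) {
 *		pos = start + (end - start + 1) / 2;
 *		if (pos == end)
 *			break;			// insert new entry at pos
 *		if (addr == a[pos])
 *			return pos;		// merge into existing entry
 *		if (addr < a[pos])
 *			end = pos;
 *		else
 *			start = pos;
 *	}
 *
 * add_location() additionally folds the age, pid, cpu and node of the
 * new track into whichever entry the search lands on.
 */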
3580
3581static void process_slab(struct loc_track *t, struct kmem_cache *s,
3582                struct page *page, enum track_item alloc,
3583                unsigned long *map)
3584{
3585        void *addr = page_address(page);
3586        void *p;
3587
3588        bitmap_zero(map, page->objects);
3589        for_each_free_object(p, s, page->freelist)
3590                set_bit(slab_index(p, s, addr), map);
3591
3592        for_each_object(p, s, addr, page->objects)
3593                if (!test_bit(slab_index(p, s, addr), map))
3594                        add_location(t, s, get_track(s, p, alloc));
3595}
3596
3597static int list_locations(struct kmem_cache *s, char *buf,
3598                                        enum track_item alloc)
3599{
3600        int len = 0;
3601        unsigned long i;
3602        struct loc_track t = { 0, 0, NULL };
3603        int node;
3604        unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) *
3605                                     sizeof(unsigned long), GFP_KERNEL);
3606
3607        if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
3608                                     GFP_TEMPORARY)) {
3609                kfree(map);
3610                return sprintf(buf, "Out of memory\n");
3611        }
3612        /* Push back cpu slabs */
3613        flush_all(s);
3614
3615        for_each_node_state(node, N_NORMAL_MEMORY) {
3616                struct kmem_cache_node *n = get_node(s, node);
3617                unsigned long flags;
3618                struct page *page;
3619
3620                if (!atomic_long_read(&n->nr_slabs))
3621                        continue;
3622
3623                spin_lock_irqsave(&n->list_lock, flags);
3624                list_for_each_entry(page, &n->partial, lru)
3625                        process_slab(&t, s, page, alloc, map);
3626                list_for_each_entry(page, &n->full, lru)
3627                        process_slab(&t, s, page, alloc, map);
3628                spin_unlock_irqrestore(&n->list_lock, flags);
3629        }
3630
3631        for (i = 0; i < t.count; i++) {
3632                struct location *l = &t.loc[i];
3633
3634                if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100)
3635                        break;
3636                len += sprintf(buf + len, "%7ld ", l->count);
3637
3638                if (l->addr)
3639                        len += sprintf(buf + len, "%pS", (void *)l->addr);
3640                else
3641                        len += sprintf(buf + len, "<not-available>");
3642
3643                if (l->sum_time != l->min_time) {
3644                        len += sprintf(buf + len, " age=%ld/%ld/%ld",
3645                                l->min_time,
3646                                (long)div_u64(l->sum_time, l->count),
3647                                l->max_time);
3648                } else
3649                        len += sprintf(buf + len, " age=%ld",
3650                                l->min_time);
3651
3652                if (l->min_pid != l->max_pid)
3653                        len += sprintf(buf + len, " pid=%ld-%ld",
3654                                l->min_pid, l->max_pid);
3655                else
3656                        len += sprintf(buf + len, " pid=%ld",
3657                                l->min_pid);
3658
3659                if (num_online_cpus() > 1 &&
3660                                !cpumask_empty(to_cpumask(l->cpus)) &&
3661                                len < PAGE_SIZE - 60) {
3662                        len += sprintf(buf + len, " cpus=");
3663                        len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50,
3664                                                 to_cpumask(l->cpus));
3665                }
3666
3667                if (nr_online_nodes > 1 && !nodes_empty(l->nodes) &&
3668                                len < PAGE_SIZE - 60) {
3669                        len += sprintf(buf + len, " nodes=");
3670                        len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50,
3671                                        l->nodes);
3672                }
3673
3674                len += sprintf(buf + len, "\n");
3675        }
3676
3677        free_loc_track(&t);
3678        kfree(map);
3679        if (!t.count)
3680                len += sprintf(buf, "No data\n");
3681        return len;
3682}
3683#endif
3684
3685#ifdef SLUB_RESILIENCY_TEST
3686static void resiliency_test(void)
3687{
3688        u8 *p;
3689
3690        BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || SLUB_PAGE_SHIFT < 10);
3691
3692        printk(KERN_ERR "SLUB resiliency testing\n");
3693        printk(KERN_ERR "-----------------------\n");
3694        printk(KERN_ERR "A. Corruption after allocation\n");
3695
3696        p = kzalloc(16, GFP_KERNEL);
3697        p[16] = 0x12;
3698        printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
3699                        " 0x12->0x%p\n\n", p + 16);
3700
3701        validate_slab_cache(kmalloc_caches[4]);
3702
3703        /* Hmmm... The next two are dangerous */
3704        p = kzalloc(32, GFP_KERNEL);
3705        p[32 + sizeof(void *)] = 0x34;
3706        printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
3707                        " 0x34 -> 0x%p\n", p);
3708        printk(KERN_ERR
3709                "If allocated object is overwritten then not detectable\n\n");
3710
3711        validate_slab_cache(kmalloc_caches[5]);
3712        p = kzalloc(64, GFP_KERNEL);
3713        p += 64 + (get_cycles() & 0xff) * sizeof(void *);
3714        *p = 0x56;
3715        printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
3716                                                                        p);
3717        printk(KERN_ERR
3718                "If allocated object is overwritten then not detectable\n\n");
3719        validate_slab_cache(kmalloc_caches[6]);
3720
3721        printk(KERN_ERR "\nB. Corruption after free\n");
3722        p = kzalloc(128, GFP_KERNEL);
3723        kfree(p);
3724        *p = 0x78;
3725        printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
3726        validate_slab_cache(kmalloc_caches[7]);
3727
3728        p = kzalloc(256, GFP_KERNEL);
3729        kfree(p);
3730        p[50] = 0x9a;
3731        printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n",
3732                        p);
3733        validate_slab_cache(kmalloc_caches[8]);
3734
3735        p = kzalloc(512, GFP_KERNEL);
3736        kfree(p);
3737        p[512] = 0xab;
3738        printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
3739        validate_slab_cache(kmalloc_caches[9]);
3740}
3741#else
3742#ifdef CONFIG_SYSFS
3743static void resiliency_test(void) {}
3744#endif
3745#endif
3746
3747#ifdef CONFIG_SYSFS
3748enum slab_stat_type {
3749        SL_ALL,                 /* All slabs */
3750        SL_PARTIAL,             /* Only partially allocated slabs */
3751        SL_CPU,                 /* Only slabs used for cpu caches */
3752        SL_OBJECTS,             /* Determine allocated objects not slabs */
3753        SL_TOTAL                /* Determine object capacity not slabs */
3754};
3755
3756#define SO_ALL          (1 << SL_ALL)
3757#define SO_PARTIAL      (1 << SL_PARTIAL)
3758#define SO_CPU          (1 << SL_CPU)
3759#define SO_OBJECTS      (1 << SL_OBJECTS)
3760#define SO_TOTAL        (1 << SL_TOTAL)
3761
3762static ssize_t show_slab_objects(struct kmem_cache *s,
3763                            char *buf, unsigned long flags)
3764{
3765        unsigned long total = 0;
3766        int node;
3767        int x;
3768        unsigned long *nodes;
3769        unsigned long *per_cpu;
3770
3771        nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL);
3772        if (!nodes)
3773                return -ENOMEM;
3774        per_cpu = nodes + nr_node_ids;
3775
3776        if (flags & SO_CPU) {
3777                int cpu;
3778
3779                for_each_possible_cpu(cpu) {
3780                        struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
3781
3782                        if (!c || c->node < 0)
3783                                continue;
3784
3785                        if (c->page) {
3786                                if (flags & SO_TOTAL)
3787                                        x = c->page->objects;
3788                                else if (flags & SO_OBJECTS)
3789                                        x = c->page->inuse;
3790                                else
3791                                        x = 1;
3792
3793                                total += x;
3794                                nodes[c->node] += x;
3795                        }
3796                        per_cpu[c->node]++;
3797                }
3798        }
3799
3800        lock_memory_hotplug();
3801#ifdef CONFIG_SLUB_DEBUG
3802        if (flags & SO_ALL) {
3803                for_each_node_state(node, N_NORMAL_MEMORY) {
3804                        struct kmem_cache_node *n = get_node(s, node);
3805
3806                        if (flags & SO_TOTAL)
3807                                x = atomic_long_read(&n->total_objects);
3808                        else if (flags & SO_OBJECTS)
3809                                x = atomic_long_read(&n->total_objects) -
3810                                        count_partial(n, count_free);
3811
3812                        else
3813                                x = atomic_long_read(&n->nr_slabs);
3814                        total += x;
3815                        nodes[node] += x;
3816                }
3817
3818        } else
3819#endif
3820        if (flags & SO_PARTIAL) {
3821                for_each_node_state(node, N_NORMAL_MEMORY) {
3822                        struct kmem_cache_node *n = get_node(s, node);
3823
3824                        if (flags & SO_TOTAL)
3825                                x = count_partial(n, count_total);
3826                        else if (flags & SO_OBJECTS)
3827                                x = count_partial(n, count_inuse);
3828                        else
3829                                x = n->nr_partial;
3830                        total += x;
3831                        nodes[node] += x;
3832                }
3833        }
3834        x = sprintf(buf, "%lu", total);
3835#ifdef CONFIG_NUMA
3836        for_each_node_state(node, N_NORMAL_MEMORY)
3837                if (nodes[node])
3838                        x += sprintf(buf + x, " N%d=%lu",
3839                                        node, nodes[node]);
3840#endif
3841        unlock_memory_hotplug();
3842        kfree(nodes);
3843        return x + sprintf(buf + x, "\n");
3844}
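
/*
 * The buffer produced above is a total followed by optional per node
 * counts, e.g. "128 N0=96 N1=32" on a two node machine (numbers
 * illustrative). The objects, slabs, partial and cpu_slabs attributes
 * below all share this format.
 */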
3845
3846#ifdef CONFIG_SLUB_DEBUG
3847static int any_slab_objects(struct kmem_cache *s)
3848{
3849        int node;
3850
3851        for_each_online_node(node) {
3852                struct kmem_cache_node *n = get_node(s, node);
3853
3854                if (!n)
3855                        continue;
3856
3857                if (atomic_long_read(&n->total_objects))
3858                        return 1;
3859        }
3860        return 0;
3861}
3862#endif
3863
3864#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
3865#define to_slab(n) container_of(n, struct kmem_cache, kobj)
3866
3867struct slab_attribute {
3868        struct attribute attr;
3869        ssize_t (*show)(struct kmem_cache *s, char *buf);
3870        ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
3871};
3872
3873#define SLAB_ATTR_RO(_name) \
3874        static struct slab_attribute _name##_attr = __ATTR_RO(_name)
3875
3876#define SLAB_ATTR(_name) \
3877        static struct slab_attribute _name##_attr =  \
3878        __ATTR(_name, 0644, _name##_show, _name##_store)
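
/*
 * For example, SLAB_ATTR_RO(slab_size) below expands to roughly
 *
 *	static struct slab_attribute slab_size_attr =
 *		__ATTR(slab_size, 0444, slab_size_show, NULL);
 *
 * i.e. a read-only sysfs attribute served by slab_size_show(), while
 * SLAB_ATTR(order) additionally wires up order_store() for writes.
 */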
3879
3880static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
3881{
3882        return sprintf(buf, "%d\n", s->size);
3883}
3884SLAB_ATTR_RO(slab_size);
3885
3886static ssize_t align_show(struct kmem_cache *s, char *buf)
3887{
3888        return sprintf(buf, "%d\n", s->align);
3889}
3890SLAB_ATTR_RO(align);
3891
3892static ssize_t object_size_show(struct kmem_cache *s, char *buf)
3893{
3894        return sprintf(buf, "%d\n", s->objsize);
3895}
3896SLAB_ATTR_RO(object_size);
3897
3898static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
3899{
3900        return sprintf(buf, "%d\n", oo_objects(s->oo));
3901}
3902SLAB_ATTR_RO(objs_per_slab);
3903
3904static ssize_t order_store(struct kmem_cache *s,
3905                                const char *buf, size_t length)
3906{
3907        unsigned long order;
3908        int err;
3909
3910        err = strict_strtoul(buf, 10, &order);
3911        if (err)
3912                return err;
3913
3914        if (order > slub_max_order || order < slub_min_order)
3915                return -EINVAL;
3916
3917        calculate_sizes(s, order);
3918        return length;
3919}
3920
3921static ssize_t order_show(struct kmem_cache *s, char *buf)
3922{
3923        return sprintf(buf, "%d\n", oo_order(s->oo));
3924}
3925SLAB_ATTR(order);
3926
3927static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
3928{
3929        return sprintf(buf, "%lu\n", s->min_partial);
3930}
3931
3932static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
3933                                 size_t length)
3934{
3935        unsigned long min;
3936        int err;
3937
3938        err = strict_strtoul(buf, 10, &min);
3939        if (err)
3940                return err;
3941
3942        set_min_partial(s, min);
3943        return length;
3944}
3945SLAB_ATTR(min_partial);
3946
3947static ssize_t ctor_show(struct kmem_cache *s, char *buf)
3948{
3949        if (!s->ctor)
3950                return 0;
3951        return sprintf(buf, "%pS\n", s->ctor);
3952}
3953SLAB_ATTR_RO(ctor);
3954
3955static ssize_t aliases_show(struct kmem_cache *s, char *buf)
3956{
3957        return sprintf(buf, "%d\n", s->refcount - 1);
3958}
3959SLAB_ATTR_RO(aliases);
3960
3961static ssize_t partial_show(struct kmem_cache *s, char *buf)
3962{
3963        return show_slab_objects(s, buf, SO_PARTIAL);
3964}
3965SLAB_ATTR_RO(partial);
3966
3967static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf)
3968{
3969        return show_slab_objects(s, buf, SO_CPU);
3970}
3971SLAB_ATTR_RO(cpu_slabs);
3972
3973static ssize_t objects_show(struct kmem_cache *s, char *buf)
3974{
3975        return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS);
3976}
3977SLAB_ATTR_RO(objects);
3978
3979static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
3980{
3981        return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS);
3982}
3983SLAB_ATTR_RO(objects_partial);
3984
3985static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
3986{
3987        return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
3988}
3989
3990static ssize_t reclaim_account_store(struct kmem_cache *s,
3991                                const char *buf, size_t length)
3992{
3993        s->flags &= ~SLAB_RECLAIM_ACCOUNT;
3994        if (buf[0] == '1')
3995                s->flags |= SLAB_RECLAIM_ACCOUNT;
3996        return length;
3997}
3998SLAB_ATTR(reclaim_account);
3999
4000static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
4001{
4002        return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
4003}
4004SLAB_ATTR_RO(hwcache_align);
4005
4006#ifdef CONFIG_ZONE_DMA
4007static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
4008{
4009        return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
4010}
4011SLAB_ATTR_RO(cache_dma);
4012#endif
4013
4014static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
4015{
4016        return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU));
4017}
4018SLAB_ATTR_RO(destroy_by_rcu);
4019
4020#ifdef CONFIG_SLUB_DEBUG
4021static ssize_t slabs_show(struct kmem_cache *s, char *buf)
4022{
4023        return show_slab_objects(s, buf, SO_ALL);
4024}
4025SLAB_ATTR_RO(slabs);
4026
4027static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
4028{
4029        return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
4030}
4031SLAB_ATTR_RO(total_objects);
4032
4033static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
4034{
4035        return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE));
4036}
4037
4038static ssize_t sanity_checks_store(struct kmem_cache *s,
4039                                const char *buf, size_t length)
4040{
4041        s->flags &= ~SLAB_DEBUG_FREE;
4042        if (buf[0] == '1')
4043                s->flags |= SLAB_DEBUG_FREE;
4044        return length;
4045}
4046SLAB_ATTR(sanity_checks);
4047
4048static ssize_t trace_show(struct kmem_cache *s, char *buf)
4049{
4050        return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
4051}
4052
4053static ssize_t trace_store(struct kmem_cache *s, const char *buf,
4054                                                        size_t length)
4055{
4056        s->flags &= ~SLAB_TRACE;
4057        if (buf[0] == '1')
4058                s->flags |= SLAB_TRACE;
4059        return length;
4060}
4061SLAB_ATTR(trace);
4062
4063static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
4064{
4065        return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
4066}
4067
4068static ssize_t red_zone_store(struct kmem_cache *s,
4069                                const char *buf, size_t length)
4070{
4071        if (any_slab_objects(s))
4072                return -EBUSY;
4073
4074        s->flags &= ~SLAB_RED_ZONE;
4075        if (buf[0] == '1')
4076                s->flags |= SLAB_RED_ZONE;
4077        calculate_sizes(s, -1);
4078        return length;
4079}
4080SLAB_ATTR(red_zone);
4081
4082static ssize_t poison_show(struct kmem_cache *s, char *buf)
4083{
4084        return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON));
4085}
4086
4087static ssize_t poison_store(struct kmem_cache *s,
4088                                const char *buf, size_t length)
4089{
4090        if (any_slab_objects(s))
4091                return -EBUSY;
4092
4093        s->flags &= ~SLAB_POISON;
4094        if (buf[0] == '1')
4095                s->flags |= SLAB_POISON;
4096        calculate_sizes(s, -1);
4097        return length;
4098}
4099SLAB_ATTR(poison);
4100
4101static ssize_t store_user_show(struct kmem_cache *s, char *buf)
4102{
4103        return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
4104}
4105
4106static ssize_t store_user_store(struct kmem_cache *s,
4107                                const char *buf, size_t length)
4108{
4109        if (any_slab_objects(s))
4110                return -EBUSY;
4111
4112        s->flags &= ~SLAB_STORE_USER;
4113        if (buf[0] == '1')
4114                s->flags |= SLAB_STORE_USER;
4115        calculate_sizes(s, -1);
4116        return length;
4117}
4118SLAB_ATTR(store_user);
4119
4120static ssize_t validate_show(struct kmem_cache *s, char *buf)
4121{
4122        return 0;
4123}
4124
4125static ssize_t validate_store(struct kmem_cache *s,
4126                        const char *buf, size_t length)
4127{
4128        int ret = -EINVAL;
4129
4130        if (buf[0] == '1') {
4131                ret = validate_slab_cache(s);
4132                if (ret >= 0)
4133                        ret = length;
4134        }
4135        return ret;
4136}
4137SLAB_ATTR(validate);
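
/*
 * Validation is normally kicked off from user space by writing '1' to
 * this attribute. Minimal sketch of such a trigger (the kmalloc-64
 * cache name is only an example; any directory under /sys/kernel/slab
 * works):
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		int fd = open("/sys/kernel/slab/kmalloc-64/validate",
 *				O_WRONLY);
 *
 *		if (fd < 0)
 *			return 1;
 *		write(fd, "1", 1);	// runs validate_slab_cache()
 *		close(fd);
 *		return 0;
 *	}
 *
 * Any inconsistencies are reported through printk, so check dmesg for
 * "SLUB" lines afterwards.
 */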
4138
4139static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
4140{
4141        if (!(s->flags & SLAB_STORE_USER))
4142                return -ENOSYS;
4143        return list_locations(s, buf, TRACK_ALLOC);
4144}
4145SLAB_ATTR_RO(alloc_calls);
4146
4147static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
4148{
4149        if (!(s->flags & SLAB_STORE_USER))
4150                return -ENOSYS;
4151        return list_locations(s, buf, TRACK_FREE);
4152}
4153SLAB_ATTR_RO(free_calls);
4154#endif /* CONFIG_SLUB_DEBUG */
4155
4156#ifdef CONFIG_FAILSLAB
4157static ssize_t failslab_show(struct kmem_cache *s, char *buf)
4158{
4159        return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
4160}
4161
4162static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
4163                                                        size_t length)
4164{
4165        s->flags &= ~SLAB_FAILSLAB;
4166        if (buf[0] == '1')
4167                s->flags |= SLAB_FAILSLAB;
4168        return length;
4169}
4170SLAB_ATTR(failslab);
4171#endif
4172
4173static ssize_t shrink_show(struct kmem_cache *s, char *buf)
4174{
4175        return 0;
4176}
4177
4178static ssize_t shrink_store(struct kmem_cache *s,
4179                        const char *buf, size_t length)
4180{
4181        if (buf[0] == '1') {
4182                int rc = kmem_cache_shrink(s);
4183
4184                if (rc)
4185                        return rc;
4186        } else
4187                return -EINVAL;
4188        return length;
4189}
4190SLAB_ATTR(shrink);
4191
4192#ifdef CONFIG_NUMA
4193static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
4194{
4195        return sprintf(buf, "%d\n", s->remote_node_defrag_ratio / 10);
4196}
4197
4198static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
4199                                const char *buf, size_t length)
4200{
4201        unsigned long ratio;
4202        int err;
4203
4204        err = strict_strtoul(buf, 10, &ratio);
4205        if (err)
4206                return err;
4207
4208        if (ratio <= 100)
4209                s->remote_node_defrag_ratio = ratio * 10;
4210
4211        return length;
4212}
4213SLAB_ATTR(remote_node_defrag_ratio);
4214#endif
4215
4216#ifdef CONFIG_SLUB_STATS
4217static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
4218{
4219        unsigned long sum  = 0;
4220        int cpu;
4221        int len;
4222        int *data = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL);
4223
4224        if (!data)
4225                return -ENOMEM;
4226
4227        for_each_online_cpu(cpu) {
4228                unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si];
4229
4230                data[cpu] = x;
4231                sum += x;
4232        }
4233
4234        len = sprintf(buf, "%lu", sum);
4235
4236#ifdef CONFIG_SMP
4237        for_each_online_cpu(cpu) {
4238                if (data[cpu] && len < PAGE_SIZE - 20)
4239                        len += sprintf(buf + len, " C%d=%u", cpu, data[cpu]);
4240        }
4241#endif
4242        kfree(data);
4243        return len + sprintf(buf + len, "\n");
4244}
4245
4246static void clear_stat(struct kmem_cache *s, enum stat_item si)
4247{
4248        int cpu;
4249
4250        for_each_online_cpu(cpu)
4251                per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0;
4252}
4253
4254#define STAT_ATTR(si, text)                                     \
4255static ssize_t text##_show(struct kmem_cache *s, char *buf)     \
4256{                                                               \
4257        return show_stat(s, buf, si);                           \
4258}                                                               \
4259static ssize_t text##_store(struct kmem_cache *s,               \
4260                                const char *buf, size_t length) \
4261{                                                               \
4262        if (buf[0] != '0')                                      \
4263                return -EINVAL;                                 \
4264        clear_stat(s, si);                                      \
4265        return length;                                          \
4266}                                                               \
4267SLAB_ATTR(text);
4268
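/*
 * STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath) below therefore expands to
 *
 *	static ssize_t alloc_fastpath_show(struct kmem_cache *s, char *buf)
 *	{
 *		return show_stat(s, buf, ALLOC_FASTPATH);
 *	}
 *	static ssize_t alloc_fastpath_store(struct kmem_cache *s,
 *					const char *buf, size_t length)
 *	{
 *		if (buf[0] != '0')
 *			return -EINVAL;
 *		clear_stat(s, ALLOC_FASTPATH);
 *		return length;
 *	}
 *	SLAB_ATTR(alloc_fastpath);
 *
 * i.e. one read/write sysfs file per statistics counter that shows the
 * per cpu breakdown and can be reset by writing '0'.
 */
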
4269STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
4270STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
4271STAT_ATTR(FREE_FASTPATH, free_fastpath);
4272STAT_ATTR(FREE_SLOWPATH, free_slowpath);
4273STAT_ATTR(FREE_FROZEN, free_frozen);
4274STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial);
4275STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
4276STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial);
4277STAT_ATTR(ALLOC_SLAB, alloc_slab);
4278STAT_ATTR(ALLOC_REFILL, alloc_refill);
4279STAT_ATTR(FREE_SLAB, free_slab);
4280STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush);
4281STAT_ATTR(DEACTIVATE_FULL, deactivate_full);
4282STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
4283STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
4284STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
4285STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
4286STAT_ATTR(ORDER_FALLBACK, order_fallback);
4287#endif
4288
4289static struct attribute *slab_attrs[] = {
4290        &slab_size_attr.attr,
4291        &object_size_attr.attr,
4292        &objs_per_slab_attr.attr,
4293        &order_attr.attr,
4294        &min_partial_attr.attr,
4295        &objects_attr.attr,
4296        &objects_partial_attr.attr,
4297        &partial_attr.attr,
4298        &cpu_slabs_attr.attr,
4299        &ctor_attr.attr,
4300        &aliases_attr.attr,
4301        &align_attr.attr,
4302        &hwcache_align_attr.attr,
4303        &reclaim_account_attr.attr,
4304        &destroy_by_rcu_attr.attr,
4305        &shrink_attr.attr,
4306#ifdef CONFIG_SLUB_DEBUG
4307        &total_objects_attr.attr,
4308        &slabs_attr.attr,
4309        &sanity_checks_attr.attr,
4310        &trace_attr.attr,
4311        &red_zone_attr.attr,
4312        &poison_attr.attr,
4313        &store_user_attr.attr,
4314        &validate_attr.attr,
4315        &alloc_calls_attr.attr,
4316        &free_calls_attr.attr,
4317#endif
4318#ifdef CONFIG_ZONE_DMA
4319        &cache_dma_attr.attr,
4320#endif
4321#ifdef CONFIG_NUMA
4322        &remote_node_defrag_ratio_attr.attr,
4323#endif
4324#ifdef CONFIG_SLUB_STATS
4325        &alloc_fastpath_attr.attr,
4326        &alloc_slowpath_attr.attr,
4327        &free_fastpath_attr.attr,
4328        &free_slowpath_attr.attr,
4329        &free_frozen_attr.attr,
4330        &free_add_partial_attr.attr,
4331        &free_remove_partial_attr.attr,
4332        &alloc_from_partial_attr.attr,
4333        &alloc_slab_attr.attr,
4334        &alloc_refill_attr.attr,
4335        &free_slab_attr.attr,
4336        &cpuslab_flush_attr.attr,
4337        &deactivate_full_attr.attr,
4338        &deactivate_empty_attr.attr,
4339        &deactivate_to_head_attr.attr,
4340        &deactivate_to_tail_attr.attr,
4341        &deactivate_remote_frees_attr.attr,
4342        &order_fallback_attr.attr,
4343#endif
4344#ifdef CONFIG_FAILSLAB
4345        &failslab_attr.attr,
4346#endif
4347
4348        NULL
4349};
4350
4351static struct attribute_group slab_attr_group = {
4352        .attrs = slab_attrs,
4353};
4354
4355static ssize_t slab_attr_show(struct kobject *kobj,
4356                                struct attribute *attr,
4357                                char *buf)
4358{
4359        struct slab_attribute *attribute;
4360        struct kmem_cache *s;
4361        int err;
4362
4363        attribute = to_slab_attr(attr);
4364        s = to_slab(kobj);
4365
4366        if (!attribute->show)
4367                return -EIO;
4368
4369        err = attribute->show(s, buf);
4370
4371        return err;
4372}
4373
4374static ssize_t slab_attr_store(struct kobject *kobj,
4375                                struct attribute *attr,
4376                                const char *buf, size_t len)
4377{
4378        struct slab_attribute *attribute;
4379        struct kmem_cache *s;
4380        int err;
4381
4382        attribute = to_slab_attr(attr);
4383        s = to_slab(kobj);
4384
4385        if (!attribute->store)
4386                return -EIO;
4387
4388        err = attribute->store(s, buf, len);
4389
4390        return err;
4391}
4392
4393static void kmem_cache_release(struct kobject *kobj)
4394{
4395        struct kmem_cache *s = to_slab(kobj);
4396
4397        kfree(s->name);
4398        kfree(s);
4399}
4400
4401static const struct sysfs_ops slab_sysfs_ops = {
4402        .show = slab_attr_show,
4403        .store = slab_attr_store,
4404};
4405
4406static struct kobj_type slab_ktype = {
4407        .sysfs_ops = &slab_sysfs_ops,
4408        .release = kmem_cache_release
4409};
4410
4411static int uevent_filter(struct kset *kset, struct kobject *kobj)
4412{
4413        struct kobj_type *ktype = get_ktype(kobj);
4414
4415        if (ktype == &slab_ktype)
4416                return 1;
4417        return 0;
4418}
4419
4420static const struct kset_uevent_ops slab_uevent_ops = {
4421        .filter = uevent_filter,
4422};
4423
4424static struct kset *slab_kset;
4425
4426#define ID_STR_LENGTH 64
4427
4428/* Create a unique string id for a slab cache:
4429 *
4430 * Format       :[flags-]size
4431 */
4432static char *create_unique_id(struct kmem_cache *s)
4433{
4434        char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
4435        char *p = name;
4436
4437        BUG_ON(!name);
4438
4439        *p++ = ':';
4440        /*
4441         * First flags affecting slabcache operations. We will only
4442         * get here for aliasable slabs so we do not need to support
4443         * too many flags. The flags here must cover all flags that
4444         * are matched during merging to guarantee that the id is
4445         * unique.
4446         */
4447        if (s->flags & SLAB_CACHE_DMA)
4448                *p++ = 'd';
4449        if (s->flags & SLAB_RECLAIM_ACCOUNT)
4450                *p++ = 'a';
4451        if (s->flags & SLAB_DEBUG_FREE)
4452                *p++ = 'F';
4453        if (!(s->flags & SLAB_NOTRACK))
4454                *p++ = 't';
4455        if (p != name + 1)
4456                *p++ = '-';
4457        p += sprintf(p, "%07d", s->size);
4458        BUG_ON(p > name + ID_STR_LENGTH - 1);
4459        return name;
4460}
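
/*
 * Examples of the generated ids (illustrative): a mergeable 192 byte
 * cache with no special flags becomes ":t-0000192" (the 't' because
 * SLAB_NOTRACK is not set), while a SLAB_CACHE_DMA cache of the same
 * size that also has SLAB_RECLAIM_ACCOUNT set becomes ":dat-0000192".
 */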
4461
4462static int sysfs_slab_add(struct kmem_cache *s)
4463{
4464        int err;
4465        const char *name;
4466        int unmergeable;
4467
4468        if (slab_state < SYSFS)
4469                /* Defer until later */
4470                return 0;
4471
4472        unmergeable = slab_unmergeable(s);
4473        if (unmergeable) {
4474                /*
4475                 * Slabcache can never be merged so we can use the name proper.
4476                 * This is typically the case for debug situations. In that
4477                 * case we can catch duplicate names easily.
4478                 */
4479                sysfs_remove_link(&slab_kset->kobj, s->name);
4480                name = s->name;
4481        } else {
4482                /*
4483                 * Create a unique name for the slab as a target
4484                 * for the symlinks.
4485                 */
4486                name = create_unique_id(s);
4487        }
4488
4489        s->kobj.kset = slab_kset;
4490        err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, name);
4491        if (err) {
4492                kobject_put(&s->kobj);
4493                return err;
4494        }
4495
4496        err = sysfs_create_group(&s->kobj, &slab_attr_group);
4497        if (err) {
4498                kobject_del(&s->kobj);
4499                kobject_put(&s->kobj);
4500                return err;
4501        }
4502        kobject_uevent(&s->kobj, KOBJ_ADD);
4503        if (!unmergeable) {
4504                /* Setup first alias */
4505                sysfs_slab_alias(s, s->name);
4506                kfree(name);
4507        }
4508        return 0;
4509}
4510
4511static void sysfs_slab_remove(struct kmem_cache *s)
4512{
4513        if (slab_state < SYSFS)
4514                /*
4515                 * Sysfs has not been setup yet so no need to remove the
4516                 * cache from sysfs.
4517                 */
4518                return;
4519
4520        kobject_uevent(&s->kobj, KOBJ_REMOVE);
4521        kobject_del(&s->kobj);
4522        kobject_put(&s->kobj);
4523}
4524
4525/*
4526 * Need to buffer aliases during bootup until sysfs becomes
4527 * available lest we lose that information.
4528 */
4529struct saved_alias {
4530        struct kmem_cache *s;
4531        const char *name;
4532        struct saved_alias *next;
4533};
4534
4535static struct saved_alias *alias_list;
4536
4537static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
4538{
4539        struct saved_alias *al;
4540
4541        if (slab_state == SYSFS) {
4542                /*
4543                 * If we have a leftover link then remove it.
4544                 */
4545                sysfs_remove_link(&slab_kset->kobj, name);
4546                return sysfs_create_link(&slab_kset->kobj, &s->kobj, name);
4547        }
4548
4549        al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
4550        if (!al)
4551                return -ENOMEM;
4552
4553        al->s = s;
4554        al->name = name;
4555        al->next = alias_list;
4556        alias_list = al;
4557        return 0;
4558}
4559
4560static int __init slab_sysfs_init(void)
4561{
4562        struct kmem_cache *s;
4563        int err;
4564
4565        down_write(&slub_lock);
4566
4567        slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
4568        if (!slab_kset) {
4569                up_write(&slub_lock);
4570                printk(KERN_ERR "Cannot register slab subsystem.\n");
4571                return -ENOSYS;
4572        }
4573
4574        slab_state = SYSFS;
4575
4576        list_for_each_entry(s, &slab_caches, list) {
4577                err = sysfs_slab_add(s);
4578                if (err)
4579                        printk(KERN_ERR "SLUB: Unable to add boot slab %s"
4580                                                " to sysfs\n", s->name);
4581        }
4582
4583        while (alias_list) {
4584                struct saved_alias *al = alias_list;
4585
4586                alias_list = alias_list->next;
4587                err = sysfs_slab_alias(al->s, al->name);
4588                if (err)
4589                        printk(KERN_ERR "SLUB: Unable to add boot slab alias"
4590                                        " %s to sysfs\n", al->name);
4591                kfree(al);
4592        }
4593
4594        up_write(&slub_lock);
4595        resiliency_test();
4596        return 0;
4597}
4598
4599__initcall(slab_sysfs_init);
4600#endif /* CONFIG_SYSFS */
4601
4602/*
4603 * The /proc/slabinfo ABI
4604 */
4605#ifdef CONFIG_SLABINFO
4606static void print_slabinfo_header(struct seq_file *m)
4607{
4608        seq_puts(m, "slabinfo - version: 2.1\n");
4609        seq_puts(m, "# name            <active_objs> <num_objs> <objsize> "
4610                 "<objperslab> <pagesperslab>");
4611        seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
4612        seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
4613        seq_putc(m, '\n');
4614}
4615
4616static void *s_start(struct seq_file *m, loff_t *pos)
4617{
4618        loff_t n = *pos;
4619
4620        down_read(&slub_lock);
4621        if (!n)
4622                print_slabinfo_header(m);
4623
4624        return seq_list_start(&slab_caches, *pos);
4625}
4626
4627static void *s_next(struct seq_file *m, void *p, loff_t *pos)
4628{
4629        return seq_list_next(p, &slab_caches, pos);
4630}
4631
4632static void s_stop(struct seq_file *m, void *p)
4633{
4634        up_read(&slub_lock);
4635}
4636
4637static int s_show(struct seq_file *m, void *p)
4638{
4639        unsigned long nr_partials = 0;
4640        unsigned long nr_slabs = 0;
4641        unsigned long nr_inuse = 0;
4642        unsigned long nr_objs = 0;
4643        unsigned long nr_free = 0;
4644        struct kmem_cache *s;
4645        int node;
4646
4647        s = list_entry(p, struct kmem_cache, list);
4648
4649        for_each_online_node(node) {
4650                struct kmem_cache_node *n = get_node(s, node);
4651
4652                if (!n)
4653                        continue;
4654
4655                nr_partials += n->nr_partial;
4656                nr_slabs += atomic_long_read(&n->nr_slabs);
4657                nr_objs += atomic_long_read(&n->total_objects);
4658                nr_free += count_partial(n, count_free);
4659        }
4660
4661        nr_inuse = nr_objs - nr_free;
4662
4663        seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse,
4664                   nr_objs, s->size, oo_objects(s->oo),
4665                   (1 << oo_order(s->oo)));
4666        seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0);
4667        seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs,
4668                   0UL);
4669        seq_putc(m, '\n');
4670        return 0;
4671}
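
/*
 * Each cache thus contributes one line in the traditional slabinfo
 * format, for example (numbers illustrative):
 *
 *	kmalloc-64          1216   1280     64   64    1 : tunables    0    0    0 : slabdata     20     20      0
 *
 * The tunables are reported as zero because SLUB has no per cpu queues
 * to tune, and <sharedavail> is likewise always 0.
 */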
4672
4673static const struct seq_operations slabinfo_op = {
4674        .start = s_start,
4675        .next = s_next,
4676        .stop = s_stop,
4677        .show = s_show,
4678};
4679
4680static int slabinfo_open(struct inode *inode, struct file *file)
4681{
4682        return seq_open(file, &slabinfo_op);
4683}
4684
4685static const struct file_operations proc_slabinfo_operations = {
4686        .open           = slabinfo_open,
4687        .read           = seq_read,
4688        .llseek         = seq_lseek,
4689        .release        = seq_release,
4690};
4691
4692static int __init slab_proc_init(void)
4693{
4694        proc_create("slabinfo", S_IRUGO, NULL, &proc_slabinfo_operations);
4695        return 0;
4696}
4697module_init(slab_proc_init);
4698#endif /* CONFIG_SLABINFO */
4699