linux/mm/slub.c
   1/*
   2 * SLUB: A slab allocator that limits cache line use instead of queuing
   3 * objects in per cpu and per node lists.
   4 *
   5 * The allocator synchronizes using per slab locks and only
   6 * uses a centralized lock to manage a pool of partial slabs.
   7 *
   8 * (C) 2007 SGI, Christoph Lameter
   9 */
  10
  11#include <linux/mm.h>
  12#include <linux/swap.h> /* struct reclaim_state */
  13#include <linux/module.h>
  14#include <linux/bit_spinlock.h>
  15#include <linux/interrupt.h>
  16#include <linux/bitops.h>
  17#include <linux/slab.h>
  18#include <linux/proc_fs.h>
  19#include <linux/seq_file.h>
  20#include <linux/kmemtrace.h>
  21#include <linux/kmemcheck.h>
  22#include <linux/cpu.h>
  23#include <linux/cpuset.h>
  24#include <linux/mempolicy.h>
  25#include <linux/ctype.h>
  26#include <linux/debugobjects.h>
  27#include <linux/kallsyms.h>
  28#include <linux/memory.h>
  29#include <linux/math64.h>
  30#include <linux/fault-inject.h>
  31
  32/*
  33 * Lock order:
  34 *   1. slab_lock(page)
  35 *   2. slab->list_lock
  36 *
   37 *   The slab_lock protects operations on the objects of a particular
  38 *   slab and its metadata in the page struct. If the slab lock
  39 *   has been taken then no allocations nor frees can be performed
  40 *   on the objects in the slab nor can the slab be added or removed
  41 *   from the partial or full lists since this would mean modifying
   42 *   the page struct of the slab.
  43 *
  44 *   The list_lock protects the partial and full list on each node and
   45 *   the partial slab counter. If taken then no slabs may be added to or
   46 *   removed from the lists nor may the number of partial slabs be modified.
  47 *   (Note that the total number of slabs is an atomic value that may be
  48 *   modified without taking the list lock).
  49 *
  50 *   The list_lock is a centralized lock and thus we avoid taking it as
  51 *   much as possible. As long as SLUB does not have to handle partial
  52 *   slabs, operations can continue without any centralized lock. F.e.
  53 *   allocating a long series of objects that fill up slabs does not require
  54 *   the list lock.
  55 *
  56 *   The lock order is sometimes inverted when we are trying to get a slab
  57 *   off a list. We take the list_lock and then look for a page on the list
  58 *   to use. While we do that objects in the slabs may be freed. We can
  59 *   only operate on the slab if we have also taken the slab_lock. So we use
  60 *   a slab_trylock() on the slab. If trylock was successful then no frees
  61 *   can occur anymore and we can use the slab for allocations etc. If the
  62 *   slab_trylock() does not succeed then frees are in progress in the slab and
  63 *   we must stay away from it for a while since we may cause a bouncing
  64 *   cacheline if we try to acquire the lock. So go onto the next slab.
  65 *   If all pages are busy then we may allocate a new slab instead of reusing
   66 *   a partial slab. A new slab has no one operating on it and thus there is
  67 *   no danger of cacheline contention.
  68 *
  69 *   Interrupts are disabled during allocation and deallocation in order to
  70 *   make the slab allocator safe to use in the context of an irq. In addition
  71 *   interrupts are disabled to ensure that the processor does not change
  72 *   while handling per_cpu slabs, due to kernel preemption.
  73 *
  74 * SLUB assigns one slab for allocation to each processor.
  75 * Allocations only occur from these slabs called cpu slabs.
  76 *
  77 * Slabs with free elements are kept on a partial list and during regular
  78 * operations no list for full slabs is used. If an object in a full slab is
  79 * freed then the slab will show up again on the partial lists.
  80 * We track full slabs for debugging purposes though because otherwise we
  81 * cannot scan all objects.
  82 *
  83 * Slabs are freed when they become empty. Teardown and setup is
   84 * minimal so we rely on the page allocator's per cpu caches for
  85 * fast frees and allocs.
  86 *
  87 * Overloading of page flags that are otherwise used for LRU management.
  88 *
  89 * PageActive           The slab is frozen and exempt from list processing.
  90 *                      This means that the slab is dedicated to a purpose
  91 *                      such as satisfying allocations for a specific
  92 *                      processor. Objects may be freed in the slab while
  93 *                      it is frozen but slab_free will then skip the usual
  94 *                      list operations. It is up to the processor holding
  95 *                      the slab to integrate the slab into the slab lists
  96 *                      when the slab is no longer needed.
  97 *
  98 *                      One use of this flag is to mark slabs that are
  99 *                      used for allocations. Then such a slab becomes a cpu
 100 *                      slab. The cpu slab may be equipped with an additional
 101 *                      freelist that allows lockless access to
 102 *                      free objects in addition to the regular freelist
 103 *                      that requires the slab lock.
 104 *
 105 * PageError            Slab requires special handling due to debug
 106 *                      options set. This moves slab handling out of
 107 *                      the fast path and disables lockless freelists.
 108 */
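/*
 * Illustrative sketch, not actual allocator code: the inverted lock order
 * described above is the pattern used further down in this file by
 * get_partial_node() and lock_and_freeze_slab(), roughly:
 *
 *      spin_lock(&n->list_lock);
 *      list_for_each_entry(page, &n->partial, lru)
 *              if (slab_trylock(page)) {
 *                      (take the slab off the partial list and use it)
 *              }
 *      spin_unlock(&n->list_lock);
 *
 * If slab_trylock() fails, the scan simply moves on to the next partial
 * slab instead of spinning on a contended cacheline.
 */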
 109
 110#ifdef CONFIG_SLUB_DEBUG
 111#define SLABDEBUG 1
 112#else
 113#define SLABDEBUG 0
 114#endif
 115
 116/*
 117 * Issues still to be resolved:
 118 *
 119 * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
 120 *
 121 * - Variable sizing of the per node arrays
 122 */
 123
 124/* Enable to test recovery from slab corruption on boot */
 125#undef SLUB_RESILIENCY_TEST
 126
 127/*
  128 * Minimum number of partial slabs. These will be left on the partial
 129 * lists even if they are empty. kmem_cache_shrink may reclaim them.
 130 */
 131#define MIN_PARTIAL 5
 132
 133/*
 134 * Maximum number of desirable partial slabs.
 135 * The existence of more partial slabs makes kmem_cache_shrink
  136 * sort the partial list by the number of objects in use.
 137 */
 138#define MAX_PARTIAL 10
 139
 140#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
 141                                SLAB_POISON | SLAB_STORE_USER)
 142
 143/*
 144 * Debugging flags that require metadata to be stored in the slab.  These get
 145 * disabled when slub_debug=O is used and a cache's min order increases with
 146 * metadata.
 147 */
 148#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
 149
 150/*
 151 * Set of flags that will prevent slab merging
 152 */
 153#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
 154                SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE)
 155
 156#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
 157                SLAB_CACHE_DMA | SLAB_NOTRACK)
 158
 159#ifndef ARCH_KMALLOC_MINALIGN
 160#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
 161#endif
 162
 163#ifndef ARCH_SLAB_MINALIGN
 164#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
 165#endif
 166
 167#define OO_SHIFT        16
 168#define OO_MASK         ((1 << OO_SHIFT) - 1)
 169#define MAX_OBJS_PER_PAGE       65535 /* since page.objects is u16 */
 170
 171/* Internal SLUB flags */
 172#define __OBJECT_POISON         0x80000000 /* Poison object */
 173#define __SYSFS_ADD_DEFERRED    0x40000000 /* Not yet visible via sysfs */
 174
 175static int kmem_size = sizeof(struct kmem_cache);
 176
 177#ifdef CONFIG_SMP
 178static struct notifier_block slab_notifier;
 179#endif
 180
 181static enum {
 182        DOWN,           /* No slab functionality available */
 183        PARTIAL,        /* kmem_cache_open() works but kmalloc does not */
 184        UP,             /* Everything works but does not show up in sysfs */
 185        SYSFS           /* Sysfs up */
 186} slab_state = DOWN;
 187
 188/* A list of all slab caches on the system */
 189static DECLARE_RWSEM(slub_lock);
 190static LIST_HEAD(slab_caches);
 191
 192/*
 193 * Tracking user of a slab.
 194 */
 195struct track {
 196        unsigned long addr;     /* Called from address */
 197        int cpu;                /* Was running on cpu */
 198        int pid;                /* Pid context */
 199        unsigned long when;     /* When did the operation occur */
 200};
 201
 202enum track_item { TRACK_ALLOC, TRACK_FREE };
 203
 204#ifdef CONFIG_SLUB_DEBUG
 205static int sysfs_slab_add(struct kmem_cache *);
 206static int sysfs_slab_alias(struct kmem_cache *, const char *);
 207static void sysfs_slab_remove(struct kmem_cache *);
 208
 209#else
 210static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
 211static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
 212                                                        { return 0; }
 213static inline void sysfs_slab_remove(struct kmem_cache *s)
 214{
 215        kfree(s);
 216}
 217
 218#endif
 219
 220static inline void stat(struct kmem_cache_cpu *c, enum stat_item si)
 221{
 222#ifdef CONFIG_SLUB_STATS
 223        c->stat[si]++;
 224#endif
 225}
 226
 227/********************************************************************
 228 *                      Core slab cache functions
 229 *******************************************************************/
 230
 231int slab_is_available(void)
 232{
 233        return slab_state >= UP;
 234}
 235
 236static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
 237{
 238#ifdef CONFIG_NUMA
 239        return s->node[node];
 240#else
 241        return &s->local_node;
 242#endif
 243}
 244
 245static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu)
 246{
 247#ifdef CONFIG_SMP
 248        return s->cpu_slab[cpu];
 249#else
 250        return &s->cpu_slab;
 251#endif
 252}
 253
 254/* Verify that a pointer has an address that is valid within a slab page */
 255static inline int check_valid_pointer(struct kmem_cache *s,
 256                                struct page *page, const void *object)
 257{
 258        void *base;
 259
 260        if (!object)
 261                return 1;
 262
 263        base = page_address(page);
 264        if (object < base || object >= base + page->objects * s->size ||
 265                (object - base) % s->size) {
 266                return 0;
 267        }
 268
 269        return 1;
 270}
 271
 272/*
 273 * Slow version of get and set free pointer.
 274 *
 275 * This version requires touching the cache lines of kmem_cache which
  276 * we avoid doing in the fast alloc/free paths. There we obtain the offset
 277 * from the page struct.
 278 */
 279static inline void *get_freepointer(struct kmem_cache *s, void *object)
 280{
 281        return *(void **)(object + s->offset);
 282}
 283
 284static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
 285{
 286        *(void **)(object + s->offset) = fp;
 287}
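/*
 * Illustrative note: the freelist is threaded through the free objects
 * themselves. With s->offset == 0 the link is stored in the first word of
 * each free object, so three free objects at A, B and C form the chain
 *
 *      page->freelist -> A -> B -> C -> NULL
 *
 * where each "->" is the word read by get_freepointer() and written by
 * set_freepointer().
 */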
 288
 289/* Loop over all objects in a slab */
 290#define for_each_object(__p, __s, __addr, __objects) \
 291        for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\
 292                        __p += (__s)->size)
 293
 294/* Scan freelist */
 295#define for_each_free_object(__p, __s, __free) \
 296        for (__p = (__free); __p; __p = get_freepointer((__s), __p))
 297
 298/* Determine object index from a given position */
 299static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
 300{
 301        return (p - addr) / s->size;
 302}
 303
 304static inline struct kmem_cache_order_objects oo_make(int order,
 305                                                unsigned long size)
 306{
 307        struct kmem_cache_order_objects x = {
 308                (order << OO_SHIFT) + (PAGE_SIZE << order) / size
 309        };
 310
 311        return x;
 312}
 313
 314static inline int oo_order(struct kmem_cache_order_objects x)
 315{
 316        return x.x >> OO_SHIFT;
 317}
 318
 319static inline int oo_objects(struct kmem_cache_order_objects x)
 320{
 321        return x.x & OO_MASK;
 322}
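/*
 * Worked example (assuming 4K pages): oo_make(1, 256) packs
 * (1 << OO_SHIFT) + 8192 / 256 = 0x10020, from which oo_order()
 * recovers order 1 and oo_objects() recovers 32 objects.
 */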
 323
 324#ifdef CONFIG_SLUB_DEBUG
 325/*
 326 * Debug settings:
 327 */
 328#ifdef CONFIG_SLUB_DEBUG_ON
 329static int slub_debug = DEBUG_DEFAULT_FLAGS;
 330#else
 331static int slub_debug;
 332#endif
 333
 334static char *slub_debug_slabs;
 335static int disable_higher_order_debug;
 336
 337/*
 338 * Object debugging
 339 */
 340static void print_section(char *text, u8 *addr, unsigned int length)
 341{
 342        int i, offset;
 343        int newline = 1;
 344        char ascii[17];
 345
 346        ascii[16] = 0;
 347
 348        for (i = 0; i < length; i++) {
 349                if (newline) {
 350                        printk(KERN_ERR "%8s 0x%p: ", text, addr + i);
 351                        newline = 0;
 352                }
 353                printk(KERN_CONT " %02x", addr[i]);
 354                offset = i % 16;
 355                ascii[offset] = isgraph(addr[i]) ? addr[i] : '.';
 356                if (offset == 15) {
 357                        printk(KERN_CONT " %s\n", ascii);
 358                        newline = 1;
 359                }
 360        }
 361        if (!newline) {
 362                i %= 16;
 363                while (i < 16) {
 364                        printk(KERN_CONT "   ");
 365                        ascii[i] = ' ';
 366                        i++;
 367                }
 368                printk(KERN_CONT " %s\n", ascii);
 369        }
 370}
 371
 372static struct track *get_track(struct kmem_cache *s, void *object,
 373        enum track_item alloc)
 374{
 375        struct track *p;
 376
 377        if (s->offset)
 378                p = object + s->offset + sizeof(void *);
 379        else
 380                p = object + s->inuse;
 381
 382        return p + alloc;
 383}
 384
 385static void set_track(struct kmem_cache *s, void *object,
 386                        enum track_item alloc, unsigned long addr)
 387{
 388        struct track *p = get_track(s, object, alloc);
 389
 390        if (addr) {
 391                p->addr = addr;
 392                p->cpu = smp_processor_id();
 393                p->pid = current->pid;
 394                p->when = jiffies;
 395        } else
 396                memset(p, 0, sizeof(struct track));
 397}
 398
 399static void init_tracking(struct kmem_cache *s, void *object)
 400{
 401        if (!(s->flags & SLAB_STORE_USER))
 402                return;
 403
 404        set_track(s, object, TRACK_FREE, 0UL);
 405        set_track(s, object, TRACK_ALLOC, 0UL);
 406}
 407
 408static void print_track(const char *s, struct track *t)
 409{
 410        if (!t->addr)
 411                return;
 412
 413        printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
 414                s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid);
 415}
 416
 417static void print_tracking(struct kmem_cache *s, void *object)
 418{
 419        if (!(s->flags & SLAB_STORE_USER))
 420                return;
 421
 422        print_track("Allocated", get_track(s, object, TRACK_ALLOC));
 423        print_track("Freed", get_track(s, object, TRACK_FREE));
 424}
 425
 426static void print_page_info(struct page *page)
 427{
 428        printk(KERN_ERR "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n",
 429                page, page->objects, page->inuse, page->freelist, page->flags);
 430
 431}
 432
 433static void slab_bug(struct kmem_cache *s, char *fmt, ...)
 434{
 435        va_list args;
 436        char buf[100];
 437
 438        va_start(args, fmt);
 439        vsnprintf(buf, sizeof(buf), fmt, args);
 440        va_end(args);
 441        printk(KERN_ERR "========================================"
 442                        "=====================================\n");
 443        printk(KERN_ERR "BUG %s: %s\n", s->name, buf);
 444        printk(KERN_ERR "----------------------------------------"
 445                        "-------------------------------------\n\n");
 446}
 447
 448static void slab_fix(struct kmem_cache *s, char *fmt, ...)
 449{
 450        va_list args;
 451        char buf[100];
 452
 453        va_start(args, fmt);
 454        vsnprintf(buf, sizeof(buf), fmt, args);
 455        va_end(args);
 456        printk(KERN_ERR "FIX %s: %s\n", s->name, buf);
 457}
 458
 459static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
 460{
 461        unsigned int off;       /* Offset of last byte */
 462        u8 *addr = page_address(page);
 463
 464        print_tracking(s, p);
 465
 466        print_page_info(page);
 467
 468        printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
 469                        p, p - addr, get_freepointer(s, p));
 470
 471        if (p > addr + 16)
 472                print_section("Bytes b4", p - 16, 16);
 473
 474        print_section("Object", p, min_t(unsigned long, s->objsize, PAGE_SIZE));
 475
 476        if (s->flags & SLAB_RED_ZONE)
 477                print_section("Redzone", p + s->objsize,
 478                        s->inuse - s->objsize);
 479
 480        if (s->offset)
 481                off = s->offset + sizeof(void *);
 482        else
 483                off = s->inuse;
 484
 485        if (s->flags & SLAB_STORE_USER)
 486                off += 2 * sizeof(struct track);
 487
 488        if (off != s->size)
 489                /* Beginning of the filler is the free pointer */
 490                print_section("Padding", p + off, s->size - off);
 491
 492        dump_stack();
 493}
 494
 495static void object_err(struct kmem_cache *s, struct page *page,
 496                        u8 *object, char *reason)
 497{
 498        slab_bug(s, "%s", reason);
 499        print_trailer(s, page, object);
 500}
 501
 502static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...)
 503{
 504        va_list args;
 505        char buf[100];
 506
 507        va_start(args, fmt);
 508        vsnprintf(buf, sizeof(buf), fmt, args);
 509        va_end(args);
 510        slab_bug(s, "%s", buf);
 511        print_page_info(page);
 512        dump_stack();
 513}
 514
 515static void init_object(struct kmem_cache *s, void *object, int active)
 516{
 517        u8 *p = object;
 518
 519        if (s->flags & __OBJECT_POISON) {
 520                memset(p, POISON_FREE, s->objsize - 1);
 521                p[s->objsize - 1] = POISON_END;
 522        }
 523
 524        if (s->flags & SLAB_RED_ZONE)
 525                memset(p + s->objsize,
 526                        active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE,
 527                        s->inuse - s->objsize);
 528}
 529
 530static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
 531{
 532        while (bytes) {
 533                if (*start != (u8)value)
 534                        return start;
 535                start++;
 536                bytes--;
 537        }
 538        return NULL;
 539}
 540
 541static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
 542                                                void *from, void *to)
 543{
 544        slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data);
 545        memset(from, data, to - from);
 546}
 547
 548static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
 549                        u8 *object, char *what,
 550                        u8 *start, unsigned int value, unsigned int bytes)
 551{
 552        u8 *fault;
 553        u8 *end;
 554
 555        fault = check_bytes(start, value, bytes);
 556        if (!fault)
 557                return 1;
 558
 559        end = start + bytes;
 560        while (end > fault && end[-1] == value)
 561                end--;
 562
 563        slab_bug(s, "%s overwritten", what);
 564        printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",
 565                                        fault, end - 1, fault[0], value);
 566        print_trailer(s, page, object);
 567
 568        restore_bytes(s, what, value, fault, end);
 569        return 0;
 570}
 571
 572/*
 573 * Object layout:
 574 *
 575 * object address
 576 *      Bytes of the object to be managed.
 577 *      If the freepointer may overlay the object then the free
 578 *      pointer is the first word of the object.
 579 *
 580 *      Poisoning uses 0x6b (POISON_FREE) and the last byte is
 581 *      0xa5 (POISON_END)
 582 *
 583 * object + s->objsize
 584 *      Padding to reach word boundary. This is also used for Redzoning.
 585 *      Padding is extended by another word if Redzoning is enabled and
 586 *      objsize == inuse.
 587 *
 588 *      We fill with 0xbb (RED_INACTIVE) for inactive objects and with
 589 *      0xcc (RED_ACTIVE) for objects in use.
 590 *
 591 * object + s->inuse
 592 *      Meta data starts here.
 593 *
 594 *      A. Free pointer (if we cannot overwrite object on free)
 595 *      B. Tracking data for SLAB_STORE_USER
  596 *      C. Padding to reach required alignment boundary or at minimum
 597 *              one word if debugging is on to be able to detect writes
 598 *              before the word boundary.
 599 *
 600 *      Padding is done using 0x5a (POISON_INUSE)
 601 *
 602 * object + s->size
 603 *      Nothing is used beyond s->size.
 604 *
 605 * If slabcaches are merged then the objsize and inuse boundaries are mostly
 606 * ignored. And therefore no slab options that rely on these boundaries
 607 * may be used with merged slabcaches.
 608 */
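/*
 * Hypothetical example of the layout above, assuming SLAB_POISON and
 * SLAB_STORE_USER and no constructor: while free, the object bytes are
 * filled with 0x6b and terminated with 0xa5, the out-of-object free
 * pointer sits at object + s->inuse, the two struct track records
 * (alloc and free) follow it, and any remaining bytes up to s->size are
 * filled with 0x5a.
 */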
 609
 610static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
 611{
 612        unsigned long off = s->inuse;   /* The end of info */
 613
 614        if (s->offset)
 615                /* Freepointer is placed after the object. */
 616                off += sizeof(void *);
 617
 618        if (s->flags & SLAB_STORE_USER)
 619                /* We also have user information there */
 620                off += 2 * sizeof(struct track);
 621
 622        if (s->size == off)
 623                return 1;
 624
 625        return check_bytes_and_report(s, page, p, "Object padding",
 626                                p + off, POISON_INUSE, s->size - off);
 627}
 628
 629/* Check the pad bytes at the end of a slab page */
 630static int slab_pad_check(struct kmem_cache *s, struct page *page)
 631{
 632        u8 *start;
 633        u8 *fault;
 634        u8 *end;
 635        int length;
 636        int remainder;
 637
 638        if (!(s->flags & SLAB_POISON))
 639                return 1;
 640
 641        start = page_address(page);
 642        length = (PAGE_SIZE << compound_order(page));
 643        end = start + length;
 644        remainder = length % s->size;
 645        if (!remainder)
 646                return 1;
 647
 648        fault = check_bytes(end - remainder, POISON_INUSE, remainder);
 649        if (!fault)
 650                return 1;
 651        while (end > fault && end[-1] == POISON_INUSE)
 652                end--;
 653
 654        slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
 655        print_section("Padding", end - remainder, remainder);
 656
 657        restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end);
 658        return 0;
 659}
 660
 661static int check_object(struct kmem_cache *s, struct page *page,
 662                                        void *object, int active)
 663{
 664        u8 *p = object;
 665        u8 *endobject = object + s->objsize;
 666
 667        if (s->flags & SLAB_RED_ZONE) {
 668                unsigned int red =
 669                        active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE;
 670
 671                if (!check_bytes_and_report(s, page, object, "Redzone",
 672                        endobject, red, s->inuse - s->objsize))
 673                        return 0;
 674        } else {
 675                if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) {
 676                        check_bytes_and_report(s, page, p, "Alignment padding",
 677                                endobject, POISON_INUSE, s->inuse - s->objsize);
 678                }
 679        }
 680
 681        if (s->flags & SLAB_POISON) {
 682                if (!active && (s->flags & __OBJECT_POISON) &&
 683                        (!check_bytes_and_report(s, page, p, "Poison", p,
 684                                        POISON_FREE, s->objsize - 1) ||
 685                         !check_bytes_and_report(s, page, p, "Poison",
 686                                p + s->objsize - 1, POISON_END, 1)))
 687                        return 0;
 688                /*
 689                 * check_pad_bytes cleans up on its own.
 690                 */
 691                check_pad_bytes(s, page, p);
 692        }
 693
 694        if (!s->offset && active)
 695                /*
 696                 * Object and freepointer overlap. Cannot check
 697                 * freepointer while object is allocated.
 698                 */
 699                return 1;
 700
 701        /* Check free pointer validity */
 702        if (!check_valid_pointer(s, page, get_freepointer(s, p))) {
 703                object_err(s, page, p, "Freepointer corrupt");
 704                /*
 705                 * No choice but to zap it and thus lose the remainder
 706                 * of the free objects in this slab. May cause
 707                 * another error because the object count is now wrong.
 708                 */
 709                set_freepointer(s, p, NULL);
 710                return 0;
 711        }
 712        return 1;
 713}
 714
 715static int check_slab(struct kmem_cache *s, struct page *page)
 716{
 717        int maxobj;
 718
 719        VM_BUG_ON(!irqs_disabled());
 720
 721        if (!PageSlab(page)) {
 722                slab_err(s, page, "Not a valid slab page");
 723                return 0;
 724        }
 725
 726        maxobj = (PAGE_SIZE << compound_order(page)) / s->size;
 727        if (page->objects > maxobj) {
 728                slab_err(s, page, "objects %u > max %u",
  729                        page->objects, maxobj);
 730                return 0;
 731        }
 732        if (page->inuse > page->objects) {
 733                slab_err(s, page, "inuse %u > max %u",
  734                        page->inuse, page->objects);
 735                return 0;
 736        }
 737        /* Slab_pad_check fixes things up after itself */
 738        slab_pad_check(s, page);
 739        return 1;
 740}
 741
 742/*
 743 * Determine if a certain object on a page is on the freelist. Must hold the
 744 * slab lock to guarantee that the chains are in a consistent state.
 745 */
 746static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
 747{
 748        int nr = 0;
 749        void *fp = page->freelist;
 750        void *object = NULL;
 751        unsigned long max_objects;
 752
 753        while (fp && nr <= page->objects) {
 754                if (fp == search)
 755                        return 1;
 756                if (!check_valid_pointer(s, page, fp)) {
 757                        if (object) {
 758                                object_err(s, page, object,
 759                                        "Freechain corrupt");
 760                                set_freepointer(s, object, NULL);
 761                                break;
 762                        } else {
 763                                slab_err(s, page, "Freepointer corrupt");
 764                                page->freelist = NULL;
 765                                page->inuse = page->objects;
 766                                slab_fix(s, "Freelist cleared");
 767                                return 0;
 768                        }
 769                        break;
 770                }
 771                object = fp;
 772                fp = get_freepointer(s, object);
 773                nr++;
 774        }
 775
 776        max_objects = (PAGE_SIZE << compound_order(page)) / s->size;
 777        if (max_objects > MAX_OBJS_PER_PAGE)
 778                max_objects = MAX_OBJS_PER_PAGE;
 779
 780        if (page->objects != max_objects) {
 781                slab_err(s, page, "Wrong number of objects. Found %d but "
 782                        "should be %d", page->objects, max_objects);
 783                page->objects = max_objects;
 784                slab_fix(s, "Number of objects adjusted.");
 785        }
 786        if (page->inuse != page->objects - nr) {
 787                slab_err(s, page, "Wrong object count. Counter is %d but "
 788                        "counted were %d", page->inuse, page->objects - nr);
 789                page->inuse = page->objects - nr;
 790                slab_fix(s, "Object count adjusted.");
 791        }
 792        return search == NULL;
 793}
 794
 795static void trace(struct kmem_cache *s, struct page *page, void *object,
 796                                                                int alloc)
 797{
 798        if (s->flags & SLAB_TRACE) {
 799                printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
 800                        s->name,
 801                        alloc ? "alloc" : "free",
 802                        object, page->inuse,
 803                        page->freelist);
 804
 805                if (!alloc)
 806                        print_section("Object", (void *)object, s->objsize);
 807
 808                dump_stack();
 809        }
 810}
 811
 812/*
 813 * Tracking of fully allocated slabs for debugging purposes.
 814 */
 815static void add_full(struct kmem_cache_node *n, struct page *page)
 816{
 817        spin_lock(&n->list_lock);
 818        list_add(&page->lru, &n->full);
 819        spin_unlock(&n->list_lock);
 820}
 821
 822static void remove_full(struct kmem_cache *s, struct page *page)
 823{
 824        struct kmem_cache_node *n;
 825
 826        if (!(s->flags & SLAB_STORE_USER))
 827                return;
 828
 829        n = get_node(s, page_to_nid(page));
 830
 831        spin_lock(&n->list_lock);
 832        list_del(&page->lru);
 833        spin_unlock(&n->list_lock);
 834}
 835
 836/* Tracking of the number of slabs for debugging purposes */
 837static inline unsigned long slabs_node(struct kmem_cache *s, int node)
 838{
 839        struct kmem_cache_node *n = get_node(s, node);
 840
 841        return atomic_long_read(&n->nr_slabs);
 842}
 843
 844static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
 845{
 846        return atomic_long_read(&n->nr_slabs);
 847}
 848
 849static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
 850{
 851        struct kmem_cache_node *n = get_node(s, node);
 852
 853        /*
 854         * May be called early in order to allocate a slab for the
 855         * kmem_cache_node structure. Solve the chicken-egg
 856         * dilemma by deferring the increment of the count during
 857         * bootstrap (see early_kmem_cache_node_alloc).
 858         */
 859        if (!NUMA_BUILD || n) {
 860                atomic_long_inc(&n->nr_slabs);
 861                atomic_long_add(objects, &n->total_objects);
 862        }
 863}
 864static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
 865{
 866        struct kmem_cache_node *n = get_node(s, node);
 867
 868        atomic_long_dec(&n->nr_slabs);
 869        atomic_long_sub(objects, &n->total_objects);
 870}
 871
 872/* Object debug checks for alloc/free paths */
 873static void setup_object_debug(struct kmem_cache *s, struct page *page,
 874                                                                void *object)
 875{
 876        if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
 877                return;
 878
 879        init_object(s, object, 0);
 880        init_tracking(s, object);
 881}
 882
 883static int alloc_debug_processing(struct kmem_cache *s, struct page *page,
 884                                        void *object, unsigned long addr)
 885{
 886        if (!check_slab(s, page))
 887                goto bad;
 888
 889        if (!on_freelist(s, page, object)) {
 890                object_err(s, page, object, "Object already allocated");
 891                goto bad;
 892        }
 893
 894        if (!check_valid_pointer(s, page, object)) {
 895                object_err(s, page, object, "Freelist Pointer check fails");
 896                goto bad;
 897        }
 898
 899        if (!check_object(s, page, object, 0))
 900                goto bad;
 901
  902        /* Success. Perform special debug activities for allocs */
 903        if (s->flags & SLAB_STORE_USER)
 904                set_track(s, object, TRACK_ALLOC, addr);
 905        trace(s, page, object, 1);
 906        init_object(s, object, 1);
 907        return 1;
 908
 909bad:
 910        if (PageSlab(page)) {
 911                /*
  912                 * If this is a slab page then let's do the best we can
 913                 * to avoid issues in the future. Marking all objects
 914                 * as used avoids touching the remaining objects.
 915                 */
 916                slab_fix(s, "Marking all objects used");
 917                page->inuse = page->objects;
 918                page->freelist = NULL;
 919        }
 920        return 0;
 921}
 922
 923static int free_debug_processing(struct kmem_cache *s, struct page *page,
 924                                        void *object, unsigned long addr)
 925{
 926        if (!check_slab(s, page))
 927                goto fail;
 928
 929        if (!check_valid_pointer(s, page, object)) {
 930                slab_err(s, page, "Invalid object pointer 0x%p", object);
 931                goto fail;
 932        }
 933
 934        if (on_freelist(s, page, object)) {
 935                object_err(s, page, object, "Object already free");
 936                goto fail;
 937        }
 938
 939        if (!check_object(s, page, object, 1))
 940                return 0;
 941
 942        if (unlikely(s != page->slab)) {
 943                if (!PageSlab(page)) {
 944                        slab_err(s, page, "Attempt to free object(0x%p) "
 945                                "outside of slab", object);
 946                } else if (!page->slab) {
 947                        printk(KERN_ERR
 948                                "SLUB <none>: no slab for object 0x%p.\n",
 949                                                object);
 950                        dump_stack();
 951                } else
 952                        object_err(s, page, object,
 953                                        "page slab pointer corrupt.");
 954                goto fail;
 955        }
 956
 957        /* Special debug activities for freeing objects */
 958        if (!PageSlubFrozen(page) && !page->freelist)
 959                remove_full(s, page);
 960        if (s->flags & SLAB_STORE_USER)
 961                set_track(s, object, TRACK_FREE, addr);
 962        trace(s, page, object, 0);
 963        init_object(s, object, 0);
 964        return 1;
 965
 966fail:
 967        slab_fix(s, "Object at 0x%p not freed", object);
 968        return 0;
 969}
 970
 971static int __init setup_slub_debug(char *str)
 972{
 973        slub_debug = DEBUG_DEFAULT_FLAGS;
 974        if (*str++ != '=' || !*str)
 975                /*
 976                 * No options specified. Switch on full debugging.
 977                 */
 978                goto out;
 979
 980        if (*str == ',')
 981                /*
 982                 * No options but restriction on slabs. This means full
 983                 * debugging for slabs matching a pattern.
 984                 */
 985                goto check_slabs;
 986
 987        if (tolower(*str) == 'o') {
 988                /*
  989                 * Avoid enabling debugging on caches if their minimum order
 990                 * would increase as a result.
 991                 */
 992                disable_higher_order_debug = 1;
 993                goto out;
 994        }
 995
 996        slub_debug = 0;
 997        if (*str == '-')
 998                /*
 999                 * Switch off all debugging measures.
1000                 */
1001                goto out;
1002
1003        /*
1004         * Determine which debug features should be switched on
1005         */
1006        for (; *str && *str != ','; str++) {
1007                switch (tolower(*str)) {
1008                case 'f':
1009                        slub_debug |= SLAB_DEBUG_FREE;
1010                        break;
1011                case 'z':
1012                        slub_debug |= SLAB_RED_ZONE;
1013                        break;
1014                case 'p':
1015                        slub_debug |= SLAB_POISON;
1016                        break;
1017                case 'u':
1018                        slub_debug |= SLAB_STORE_USER;
1019                        break;
1020                case 't':
1021                        slub_debug |= SLAB_TRACE;
1022                        break;
1023                default:
1024                        printk(KERN_ERR "slub_debug option '%c' "
1025                                "unknown. skipped\n", *str);
1026                }
1027        }
1028
1029check_slabs:
1030        if (*str == ',')
1031                slub_debug_slabs = str + 1;
1032out:
1033        return 1;
1034}
1035
1036__setup("slub_debug", setup_slub_debug);
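/*
 * Example boot-time usage (see Documentation/vm/slub.txt): "slub_debug=FZP"
 * enables sanity checks (F), red zoning (Z) and poisoning (P) for all
 * caches; "slub_debug=,dentry" enables full debugging only for caches whose
 * name starts with "dentry"; "slub_debug=O" enables full debugging except
 * on caches whose minimum order would grow because of the debug metadata.
 */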
1037
1038static unsigned long kmem_cache_flags(unsigned long objsize,
1039        unsigned long flags, const char *name,
1040        void (*ctor)(void *))
1041{
1042        /*
1043         * Enable debugging if selected on the kernel commandline.
1044         */
1045        if (slub_debug && (!slub_debug_slabs ||
1046                !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs))))
1047                flags |= slub_debug;
1048
1049        return flags;
1050}
1051#else
1052static inline void setup_object_debug(struct kmem_cache *s,
1053                        struct page *page, void *object) {}
1054
1055static inline int alloc_debug_processing(struct kmem_cache *s,
1056        struct page *page, void *object, unsigned long addr) { return 0; }
1057
1058static inline int free_debug_processing(struct kmem_cache *s,
1059        struct page *page, void *object, unsigned long addr) { return 0; }
1060
1061static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
1062                        { return 1; }
1063static inline int check_object(struct kmem_cache *s, struct page *page,
1064                        void *object, int active) { return 1; }
1065static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
1066static inline unsigned long kmem_cache_flags(unsigned long objsize,
1067        unsigned long flags, const char *name,
1068        void (*ctor)(void *))
1069{
1070        return flags;
1071}
1072#define slub_debug 0
1073
1074#define disable_higher_order_debug 0
1075
1076static inline unsigned long slabs_node(struct kmem_cache *s, int node)
1077                                                        { return 0; }
1078static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
1079                                                        { return 0; }
1080static inline void inc_slabs_node(struct kmem_cache *s, int node,
1081                                                        int objects) {}
1082static inline void dec_slabs_node(struct kmem_cache *s, int node,
1083                                                        int objects) {}
1084#endif
1085
1086/*
1087 * Slab allocation and freeing
1088 */
1089static inline struct page *alloc_slab_page(gfp_t flags, int node,
1090                                        struct kmem_cache_order_objects oo)
1091{
1092        int order = oo_order(oo);
1093
1094        flags |= __GFP_NOTRACK;
1095
1096        if (node == -1)
1097                return alloc_pages(flags, order);
1098        else
1099                return alloc_pages_node(node, flags, order);
1100}
1101
1102static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1103{
1104        struct page *page;
1105        struct kmem_cache_order_objects oo = s->oo;
1106        gfp_t alloc_gfp;
1107
1108        flags |= s->allocflags;
1109
1110        /*
1111         * Let the initial higher-order allocation fail under memory pressure
 1112         * so we fall back to the minimum order allocation.
1113         */
1114        alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
1115
1116        page = alloc_slab_page(alloc_gfp, node, oo);
1117        if (unlikely(!page)) {
1118                oo = s->min;
1119                /*
1120                 * Allocation may have failed due to fragmentation.
1121                 * Try a lower order alloc if possible
1122                 */
1123                page = alloc_slab_page(flags, node, oo);
1124                if (!page)
1125                        return NULL;
1126
1127                stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK);
1128        }
1129
1130        if (kmemcheck_enabled
1131                && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
1132                int pages = 1 << oo_order(oo);
1133
1134                kmemcheck_alloc_shadow(page, oo_order(oo), flags, node);
1135
1136                /*
1137                 * Objects from caches that have a constructor don't get
1138                 * cleared when they're allocated, so we need to do it here.
1139                 */
1140                if (s->ctor)
1141                        kmemcheck_mark_uninitialized_pages(page, pages);
1142                else
1143                        kmemcheck_mark_unallocated_pages(page, pages);
1144        }
1145
1146        page->objects = oo_objects(oo);
1147        mod_zone_page_state(page_zone(page),
1148                (s->flags & SLAB_RECLAIM_ACCOUNT) ?
1149                NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1150                1 << oo_order(oo));
1151
1152        return page;
1153}
1154
1155static void setup_object(struct kmem_cache *s, struct page *page,
1156                                void *object)
1157{
1158        setup_object_debug(s, page, object);
1159        if (unlikely(s->ctor))
1160                s->ctor(object);
1161}
1162
1163static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1164{
1165        struct page *page;
1166        void *start;
1167        void *last;
1168        void *p;
1169
1170        BUG_ON(flags & GFP_SLAB_BUG_MASK);
1171
1172        page = allocate_slab(s,
1173                flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
1174        if (!page)
1175                goto out;
1176
1177        inc_slabs_node(s, page_to_nid(page), page->objects);
1178        page->slab = s;
1179        page->flags |= 1 << PG_slab;
1180        if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
1181                        SLAB_STORE_USER | SLAB_TRACE))
1182                __SetPageSlubDebug(page);
1183
1184        start = page_address(page);
1185
1186        if (unlikely(s->flags & SLAB_POISON))
1187                memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page));
1188
1189        last = start;
1190        for_each_object(p, s, start, page->objects) {
1191                setup_object(s, page, last);
1192                set_freepointer(s, last, p);
1193                last = p;
1194        }
1195        setup_object(s, page, last);
1196        set_freepointer(s, last, NULL);
1197
1198        page->freelist = start;
1199        page->inuse = 0;
1200out:
1201        return page;
1202}
1203
1204static void __free_slab(struct kmem_cache *s, struct page *page)
1205{
1206        int order = compound_order(page);
1207        int pages = 1 << order;
1208
1209        if (unlikely(SLABDEBUG && PageSlubDebug(page))) {
1210                void *p;
1211
1212                slab_pad_check(s, page);
1213                for_each_object(p, s, page_address(page),
1214                                                page->objects)
1215                        check_object(s, page, p, 0);
1216                __ClearPageSlubDebug(page);
1217        }
1218
1219        kmemcheck_free_shadow(page, compound_order(page));
1220
1221        mod_zone_page_state(page_zone(page),
1222                (s->flags & SLAB_RECLAIM_ACCOUNT) ?
1223                NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1224                -pages);
1225
1226        __ClearPageSlab(page);
1227        reset_page_mapcount(page);
1228        if (current->reclaim_state)
1229                current->reclaim_state->reclaimed_slab += pages;
1230        __free_pages(page, order);
1231}
1232
1233static void rcu_free_slab(struct rcu_head *h)
1234{
1235        struct page *page;
1236
1237        page = container_of((struct list_head *)h, struct page, lru);
1238        __free_slab(page->slab, page);
1239}
1240
1241static void free_slab(struct kmem_cache *s, struct page *page)
1242{
1243        if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
1244                /*
1245                 * RCU free overloads the RCU head over the LRU
1246                 */
1247                struct rcu_head *head = (void *)&page->lru;
1248
1249                call_rcu(head, rcu_free_slab);
1250        } else
1251                __free_slab(s, page);
1252}
1253
1254static void discard_slab(struct kmem_cache *s, struct page *page)
1255{
1256        dec_slabs_node(s, page_to_nid(page), page->objects);
1257        free_slab(s, page);
1258}
1259
1260/*
1261 * Per slab locking using the pagelock
1262 */
1263static __always_inline void slab_lock(struct page *page)
1264{
1265        bit_spin_lock(PG_locked, &page->flags);
1266}
1267
1268static __always_inline void slab_unlock(struct page *page)
1269{
1270        __bit_spin_unlock(PG_locked, &page->flags);
1271}
1272
1273static __always_inline int slab_trylock(struct page *page)
1274{
1275        int rc = 1;
1276
1277        rc = bit_spin_trylock(PG_locked, &page->flags);
1278        return rc;
1279}
1280
1281/*
1282 * Management of partially allocated slabs
1283 */
1284static void add_partial(struct kmem_cache_node *n,
1285                                struct page *page, int tail)
1286{
1287        spin_lock(&n->list_lock);
1288        n->nr_partial++;
1289        if (tail)
1290                list_add_tail(&page->lru, &n->partial);
1291        else
1292                list_add(&page->lru, &n->partial);
1293        spin_unlock(&n->list_lock);
1294}
1295
1296static void remove_partial(struct kmem_cache *s, struct page *page)
1297{
1298        struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1299
1300        spin_lock(&n->list_lock);
1301        list_del(&page->lru);
1302        n->nr_partial--;
1303        spin_unlock(&n->list_lock);
1304}
1305
1306/*
1307 * Lock slab and remove from the partial list.
1308 *
1309 * Must hold list_lock.
1310 */
1311static inline int lock_and_freeze_slab(struct kmem_cache_node *n,
1312                                                        struct page *page)
1313{
1314        if (slab_trylock(page)) {
1315                list_del(&page->lru);
1316                n->nr_partial--;
1317                __SetPageSlubFrozen(page);
1318                return 1;
1319        }
1320        return 0;
1321}
1322
1323/*
1324 * Try to allocate a partial slab from a specific node.
1325 */
1326static struct page *get_partial_node(struct kmem_cache_node *n)
1327{
1328        struct page *page;
1329
1330        /*
1331         * Racy check. If we mistakenly see no partial slabs then we
1332         * just allocate an empty slab. If we mistakenly try to get a
 1333         * partial slab and there is none available then this function
1334         * will return NULL.
1335         */
1336        if (!n || !n->nr_partial)
1337                return NULL;
1338
1339        spin_lock(&n->list_lock);
1340        list_for_each_entry(page, &n->partial, lru)
1341                if (lock_and_freeze_slab(n, page))
1342                        goto out;
1343        page = NULL;
1344out:
1345        spin_unlock(&n->list_lock);
1346        return page;
1347}
1348
1349/*
1350 * Get a page from somewhere. Search in increasing NUMA distances.
1351 */
1352static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1353{
1354#ifdef CONFIG_NUMA
1355        struct zonelist *zonelist;
1356        struct zoneref *z;
1357        struct zone *zone;
1358        enum zone_type high_zoneidx = gfp_zone(flags);
1359        struct page *page;
1360
1361        /*
1362         * The defrag ratio allows a configuration of the tradeoffs between
1363         * inter node defragmentation and node local allocations. A lower
1364         * defrag_ratio increases the tendency to do local allocations
1365         * instead of attempting to obtain partial slabs from other nodes.
1366         *
1367         * If the defrag_ratio is set to 0 then kmalloc() always
1368         * returns node local objects. If the ratio is higher then kmalloc()
1369         * may return off node objects because partial slabs are obtained
1370         * from other nodes and filled up.
1371         *
1372         * If /sys/kernel/slab/xx/defrag_ratio is set to 100 (which makes
1373         * defrag_ratio = 1000) then every (well almost) allocation will
1374         * first attempt to defrag slab caches on other nodes. This means
1375         * scanning over all nodes to look for partial slabs which may be
1376         * expensive if we do it every time we are trying to find a slab
1377         * with available objects.
1378         */
1379        if (!s->remote_node_defrag_ratio ||
1380                        get_cycles() % 1024 > s->remote_node_defrag_ratio)
1381                return NULL;
1382
1383        zonelist = node_zonelist(slab_node(current->mempolicy), flags);
1384        for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1385                struct kmem_cache_node *n;
1386
1387                n = get_node(s, zone_to_nid(zone));
1388
1389                if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1390                                n->nr_partial > s->min_partial) {
1391                        page = get_partial_node(n);
1392                        if (page)
1393                                return page;
1394                }
1395        }
1396#endif
1397        return NULL;
1398}
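/*
 * Worked example for the defrag_ratio check above: with the sysfs file set
 * to 50, remote_node_defrag_ratio is 500 (given the factor-of-10 scaling
 * the comment above implies), so get_cycles() % 1024 > 500 sends roughly
 * half of these slow-path allocations back to a node-local allocation
 * instead of scanning remote nodes for partial slabs.
 */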
1399
1400/*
1401 * Get a partial page, lock it and return it.
1402 */
1403static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
1404{
1405        struct page *page;
1406        int searchnode = (node == -1) ? numa_node_id() : node;
1407
1408        page = get_partial_node(get_node(s, searchnode));
1409        if (page || (flags & __GFP_THISNODE))
1410                return page;
1411
1412        return get_any_partial(s, flags);
1413}
1414
1415/*
1416 * Move a page back to the lists.
1417 *
1418 * Must be called with the slab lock held.
1419 *
1420 * On exit the slab lock will have been dropped.
1421 */
1422static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1423{
1424        struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1425        struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id());
1426
1427        __ClearPageSlubFrozen(page);
1428        if (page->inuse) {
1429
1430                if (page->freelist) {
1431                        add_partial(n, page, tail);
1432                        stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
1433                } else {
1434                        stat(c, DEACTIVATE_FULL);
1435                        if (SLABDEBUG && PageSlubDebug(page) &&
1436                                                (s->flags & SLAB_STORE_USER))
1437                                add_full(n, page);
1438                }
1439                slab_unlock(page);
1440        } else {
1441                stat(c, DEACTIVATE_EMPTY);
1442                if (n->nr_partial < s->min_partial) {
1443                        /*
1444                         * Adding an empty slab to the partial slabs in order
1445                         * to avoid page allocator overhead. This slab needs
 1446                         * to come after the other slabs with objects in them
1447                         * so that the others get filled first. That way the
1448                         * size of the partial list stays small.
1449                         *
1450                         * kmem_cache_shrink can reclaim any empty slabs from
1451                         * the partial list.
1452                         */
1453                        add_partial(n, page, 1);
1454                        slab_unlock(page);
1455                } else {
1456                        slab_unlock(page);
1457                        stat(get_cpu_slab(s, raw_smp_processor_id()), FREE_SLAB);
1458                        discard_slab(s, page);
1459                }
1460        }
1461}
1462
1463/*
1464 * Remove the cpu slab
1465 */
1466static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1467{
1468        struct page *page = c->page;
1469        int tail = 1;
1470
1471        if (page->freelist)
1472                stat(c, DEACTIVATE_REMOTE_FREES);
1473        /*
1474         * Merge cpu freelist into slab freelist. Typically we get here
1475         * because both freelists are empty. So this is unlikely
1476         * to occur.
1477         */
1478        while (unlikely(c->freelist)) {
1479                void **object;
1480
1481                tail = 0;       /* Hot objects. Put the slab first */
1482
1483                /* Retrieve object from cpu_freelist */
1484                object = c->freelist;
1485                c->freelist = c->freelist[c->offset];
1486
1487                /* And put onto the regular freelist */
1488                object[c->offset] = page->freelist;
1489                page->freelist = object;
1490                page->inuse--;
1491        }
1492        c->page = NULL;
1493        unfreeze_slab(s, page, tail);
1494}
1495
1496static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1497{
1498        stat(c, CPUSLAB_FLUSH);
1499        slab_lock(c->page);
1500        deactivate_slab(s, c);
1501}
1502
1503/*
1504 * Flush cpu slab.
1505 *
1506 * Called from IPI handler with interrupts disabled.
1507 */
1508static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
1509{
1510        struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
1511
1512        if (likely(c && c->page))
1513                flush_slab(s, c);
1514}
1515
1516static void flush_cpu_slab(void *d)
1517{
1518        struct kmem_cache *s = d;
1519
1520        __flush_cpu_slab(s, smp_processor_id());
1521}
1522
1523static void flush_all(struct kmem_cache *s)
1524{
1525        on_each_cpu(flush_cpu_slab, s, 1);
1526}
1527
1528/*
1529 * Check if the objects in a per cpu structure fit numa
1530 * locality expectations.
1531 */
1532static inline int node_match(struct kmem_cache_cpu *c, int node)
1533{
1534#ifdef CONFIG_NUMA
1535        if (node != -1 && c->node != node)
1536                return 0;
1537#endif
1538        return 1;
1539}
1540
1541static int count_free(struct page *page)
1542{
1543        return page->objects - page->inuse;
1544}
1545
1546static unsigned long count_partial(struct kmem_cache_node *n,
1547                                        int (*get_count)(struct page *))
1548{
1549        unsigned long flags;
1550        unsigned long x = 0;
1551        struct page *page;
1552
1553        spin_lock_irqsave(&n->list_lock, flags);
1554        list_for_each_entry(page, &n->partial, lru)
1555                x += get_count(page);
1556        spin_unlock_irqrestore(&n->list_lock, flags);
1557        return x;
1558}
1559
1560static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
1561{
1562#ifdef CONFIG_SLUB_DEBUG
1563        return atomic_long_read(&n->total_objects);
1564#else
1565        return 0;
1566#endif
1567}
1568
1569static noinline void
1570slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
1571{
1572        int node;
1573
1574        printk(KERN_WARNING
1575                "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n",
1576                nid, gfpflags);
1577        printk(KERN_WARNING "  cache: %s, object size: %d, buffer size: %d, "
1578                "default order: %d, min order: %d\n", s->name, s->objsize,
1579                s->size, oo_order(s->oo), oo_order(s->min));
1580
1581        if (oo_order(s->min) > get_order(s->objsize))
1582                printk(KERN_WARNING "  %s debugging increased min order, use "
1583                       "slub_debug=O to disable.\n", s->name);
1584
1585        for_each_online_node(node) {
1586                struct kmem_cache_node *n = get_node(s, node);
1587                unsigned long nr_slabs;
1588                unsigned long nr_objs;
1589                unsigned long nr_free;
1590
1591                if (!n)
1592                        continue;
1593
1594                nr_free  = count_partial(n, count_free);
1595                nr_slabs = node_nr_slabs(n);
1596                nr_objs  = node_nr_objs(n);
1597
1598                printk(KERN_WARNING
1599                        "  node %d: slabs: %ld, objs: %ld, free: %ld\n",
1600                        node, nr_slabs, nr_objs, nr_free);
1601        }
1602}
1603
1604/*
1605 * Slow path. The lockless freelist is empty or we need to perform
1606 * debugging duties.
1607 *
1608 * Interrupts are disabled.
1609 *
1610 * Processing is still very fast if new objects have been freed to the
1611 * regular freelist. In that case we simply take over the regular freelist
1612 * as the lockless freelist and zap the regular freelist.
1613 *
1614 * If that does not work then we fall back to the partial lists. We take the
1615 * first element of the freelist as the object to allocate now and move the
1616 * rest of the freelist to the lockless freelist.
1617 *
1618 * And if we were unable to get a new slab from the partial slab lists then
1619 * we need to allocate a new slab. This is the slowest path since it involves
1620 * a call to the page allocator and the setup of a new slab.
1621 */
1622static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
1623                          unsigned long addr, struct kmem_cache_cpu *c)
1624{
1625        void **object;
1626        struct page *new;
1627
1628        /* We handle __GFP_ZERO in the caller */
1629        gfpflags &= ~__GFP_ZERO;
1630
1631        if (!c->page)
1632                goto new_slab;
1633
1634        slab_lock(c->page);
1635        if (unlikely(!node_match(c, node)))
1636                goto another_slab;
1637
1638        stat(c, ALLOC_REFILL);
1639
1640load_freelist:
1641        object = c->page->freelist;
1642        if (unlikely(!object))
1643                goto another_slab;
1644        if (unlikely(SLABDEBUG && PageSlubDebug(c->page)))
1645                goto debug;
1646
1647        c->freelist = object[c->offset];
1648        c->page->inuse = c->page->objects;
1649        c->page->freelist = NULL;
1650        c->node = page_to_nid(c->page);
1651unlock_out:
1652        slab_unlock(c->page);
1653        stat(c, ALLOC_SLOWPATH);
1654        return object;
1655
1656another_slab:
1657        deactivate_slab(s, c);
1658
1659new_slab:
1660        new = get_partial(s, gfpflags, node);
1661        if (new) {
1662                c->page = new;
1663                stat(c, ALLOC_FROM_PARTIAL);
1664                goto load_freelist;
1665        }
1666
1667        if (gfpflags & __GFP_WAIT)
1668                local_irq_enable();
1669
1670        new = new_slab(s, gfpflags, node);
1671
1672        if (gfpflags & __GFP_WAIT)
1673                local_irq_disable();
1674
1675        if (new) {
1676                c = get_cpu_slab(s, smp_processor_id());
1677                stat(c, ALLOC_SLAB);
1678                if (c->page)
1679                        flush_slab(s, c);
1680                slab_lock(new);
1681                __SetPageSlubFrozen(new);
1682                c->page = new;
1683                goto load_freelist;
1684        }
1685        if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
1686                slab_out_of_memory(s, gfpflags, node);
1687        return NULL;
1688debug:
1689        if (!alloc_debug_processing(s, c->page, object, addr))
1690                goto another_slab;
1691
1692        c->page->inuse++;
1693        c->page->freelist = object[c->offset];
1694        c->node = -1;
1695        goto unlock_out;
1696}
1697
1698/*
1699 * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
1700 * have the fastpath folded into their functions. So no function call
1701 * overhead for requests that can be satisfied on the fastpath.
1702 *
1703 * The fastpath works by first checking if the lockless freelist can be used.
1704 * If not then __slab_alloc is called for slow processing.
1705 *
1706 * Otherwise we can simply pick the next object from the lockless free list.
1707 */
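    /*
     * The lockless freelist is threaded through the free objects themselves:
     * c->freelist points at the first free object and the pointer-sized word
     * at index c->offset inside each free object points at the next one. The
     * fastpath below simply pops the head of that list with irqs disabled.
     */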
1708static __always_inline void *slab_alloc(struct kmem_cache *s,
1709                gfp_t gfpflags, int node, unsigned long addr)
1710{
1711        void **object;
1712        struct kmem_cache_cpu *c;
1713        unsigned long flags;
1714        unsigned int objsize;
1715
1716        gfpflags &= gfp_allowed_mask;
1717
1718        lockdep_trace_alloc(gfpflags);
1719        might_sleep_if(gfpflags & __GFP_WAIT);
1720
1721        if (should_failslab(s->objsize, gfpflags))
1722                return NULL;
1723
1724        local_irq_save(flags);
1725        c = get_cpu_slab(s, smp_processor_id());
1726        objsize = c->objsize;
1727        if (unlikely(!c->freelist || !node_match(c, node)))
1728
1729                object = __slab_alloc(s, gfpflags, node, addr, c);
1730
1731        else {
1732                object = c->freelist;
1733                c->freelist = object[c->offset];
1734                stat(c, ALLOC_FASTPATH);
1735        }
1736        local_irq_restore(flags);
1737
1738        if (unlikely((gfpflags & __GFP_ZERO) && object))
1739                memset(object, 0, objsize);
1740
1741        kmemcheck_slab_alloc(s, gfpflags, object, c->objsize);
1742        kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags);
1743
1744        return object;
1745}
1746
1747void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
1748{
1749        void *ret = slab_alloc(s, gfpflags, -1, _RET_IP_);
1750
1751        trace_kmem_cache_alloc(_RET_IP_, ret, s->objsize, s->size, gfpflags);
1752
1753        return ret;
1754}
1755EXPORT_SYMBOL(kmem_cache_alloc);
1756
1757#ifdef CONFIG_KMEMTRACE
1758void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags)
1759{
1760        return slab_alloc(s, gfpflags, -1, _RET_IP_);
1761}
1762EXPORT_SYMBOL(kmem_cache_alloc_notrace);
1763#endif
1764
1765#ifdef CONFIG_NUMA
1766void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
1767{
1768        void *ret = slab_alloc(s, gfpflags, node, _RET_IP_);
1769
1770        trace_kmem_cache_alloc_node(_RET_IP_, ret,
1771                                    s->objsize, s->size, gfpflags, node);
1772
1773        return ret;
1774}
1775EXPORT_SYMBOL(kmem_cache_alloc_node);
1776#endif
1777
1778#ifdef CONFIG_KMEMTRACE
1779void *kmem_cache_alloc_node_notrace(struct kmem_cache *s,
1780                                    gfp_t gfpflags,
1781                                    int node)
1782{
1783        return slab_alloc(s, gfpflags, node, _RET_IP_);
1784}
1785EXPORT_SYMBOL(kmem_cache_alloc_node_notrace);
1786#endif
1787
1788/*
1789 * Slow path handling. This may still be called frequently since objects
1790 * have a longer lifetime than the cpu slabs in most processing loads.
1791 *
1792 * So we still attempt to reduce cache line usage. Just take the slab
1793 * lock and free the item. If there is no additional partial page
1794 * handling required then we can return immediately.
1795 */
1796static void __slab_free(struct kmem_cache *s, struct page *page,
1797                        void *x, unsigned long addr, unsigned int offset)
1798{
1799        void *prior;
1800        void **object = (void *)x;
1801        struct kmem_cache_cpu *c;
1802
1803        c = get_cpu_slab(s, raw_smp_processor_id());
1804        stat(c, FREE_SLOWPATH);
1805        slab_lock(page);
1806
1807        if (unlikely(SLABDEBUG && PageSlubDebug(page)))
1808                goto debug;
1809
1810checks_ok:
1811        prior = object[offset] = page->freelist;
1812        page->freelist = object;
1813        page->inuse--;
1814
1815        if (unlikely(PageSlubFrozen(page))) {
1816                stat(c, FREE_FROZEN);
1817                goto out_unlock;
1818        }
1819
1820        if (unlikely(!page->inuse))
1821                goto slab_empty;
1822
1823        /*
1824         * Objects left in the slab. If it was not on the partial list before
1825         * then add it.
1826         */
1827        if (unlikely(!prior)) {
1828                add_partial(get_node(s, page_to_nid(page)), page, 1);
1829                stat(c, FREE_ADD_PARTIAL);
1830        }
1831
1832out_unlock:
1833        slab_unlock(page);
1834        return;
1835
1836slab_empty:
1837        if (prior) {
1838                /*
1839                 * Slab still on the partial list.
1840                 */
1841                remove_partial(s, page);
1842                stat(c, FREE_REMOVE_PARTIAL);
1843        }
1844        slab_unlock(page);
1845        stat(c, FREE_SLAB);
1846        discard_slab(s, page);
1847        return;
1848
1849debug:
1850        if (!free_debug_processing(s, page, x, addr))
1851                goto out_unlock;
1852        goto checks_ok;
1853}
1854
1855/*
1856 * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
1857 * can perform fastpath freeing without additional function calls.
1858 *
1859 * The fastpath is only possible if we are freeing to the current cpu slab
1860 * of this processor. This is typically the case if we have just allocated
1861 * the item before.
1862 *
1863 * If fastpath is not possible then fall back to __slab_free where we deal
1864 * with all sorts of special processing.
1865 */
1866static __always_inline void slab_free(struct kmem_cache *s,
1867                        struct page *page, void *x, unsigned long addr)
1868{
1869        void **object = (void *)x;
1870        struct kmem_cache_cpu *c;
1871        unsigned long flags;
1872
1873        kmemleak_free_recursive(x, s->flags);
1874        local_irq_save(flags);
1875        c = get_cpu_slab(s, smp_processor_id());
1876        kmemcheck_slab_free(s, object, c->objsize);
1877        debug_check_no_locks_freed(object, c->objsize);
1878        if (!(s->flags & SLAB_DEBUG_OBJECTS))
1879                debug_check_no_obj_freed(object, c->objsize);
1880        if (likely(page == c->page && c->node >= 0)) {
1881                object[c->offset] = c->freelist;
1882                c->freelist = object;
1883                stat(c, FREE_FASTPATH);
1884        } else
1885                __slab_free(s, page, x, addr, c->offset);
1886
1887        local_irq_restore(flags);
1888}
1889
1890void kmem_cache_free(struct kmem_cache *s, void *x)
1891{
1892        struct page *page;
1893
1894        page = virt_to_head_page(x);
1895
1896        slab_free(s, page, x, _RET_IP_);
1897
1898        trace_kmem_cache_free(_RET_IP_, x);
1899}
1900EXPORT_SYMBOL(kmem_cache_free);
1901
1902/* Figure out on which slab page the object resides */
1903static struct page *get_object_page(const void *x)
1904{
1905        struct page *page = virt_to_head_page(x);
1906
1907        if (!PageSlab(page))
1908                return NULL;
1909
1910        return page;
1911}
1912
1913/*
1914 * Object placement in a slab is made very easy because we always start at
1915 * offset 0. If we tune the size of the object to the alignment then we can
1916 * get the required alignment by putting one properly sized object after
1917 * another.
1918 *
1919 * Notice that the allocation order determines the sizes of the per cpu
1920 * caches. Each processor always has one slab available for allocations.
1921 * Increasing the allocation order reduces the number of times that slabs
1922 * must be moved on and off the partial lists and is therefore a factor in
1923 * locking overhead.
1924 */
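    /*
     * For example (illustrative numbers): a cache with 52 byte objects and
     * 8 byte alignment rounds every object up to 56 bytes, so objects sit at
     * offsets 0, 56, 112, ... from the start of the slab page and each one is
     * naturally 8 byte aligned.
     */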
1925
1926/*
1927 * Minimum / Maximum order of slab pages. This influences locking overhead
1928 * and slab fragmentation. A higher order reduces the number of partial slabs
1929 * and increases the number of allocations possible without having to
1930 * take the list_lock.
1931 */
1932static int slub_min_order;
1933static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER;
1934static int slub_min_objects;
1935
1936/*
1937 * Merge control. If this is set then no merging of slab caches will occur.
1938 * (Could be removed. This was introduced to pacify the merge skeptics.)
1939 */
1940static int slub_nomerge;
1941
1942/*
1943 * Calculate the order of allocation given a slab object size.
1944 *
1945 * The order of allocation has significant impact on performance and other
1946 * system components. Generally order 0 allocations should be preferred since
1947 * order 0 does not cause fragmentation in the page allocator. Larger objects
1948 * can be problematic to put into order 0 slabs because there may be too much
1949 * unused space left. We go to a higher order if more than 1/16th of the slab
1950 * would be wasted.
1951 *
1952 * In order to reach satisfactory performance we must ensure that a minimum
1953 * number of objects is in one slab. Otherwise we may generate too much
1954 * activity on the partial lists which requires taking the list_lock. This is
1955 * less of a concern for large slabs though, which are rarely used.
1956 *
1957 * slub_max_order specifies the order where we begin to stop considering the
1958 * number of objects in a slab as critical. If we reach slub_max_order then
1959 * we try to keep the page order as low as possible. So we accept more waste
1960 * of space in favor of a small page order.
1961 *
1962 * Higher order allocations also allow the placement of more objects in a
1963 * slab and thereby reduce object handling overhead. If the user has
1964 * requested a higher minimum order then we start with that one instead of
1965 * the smallest order which will fit the object.
1966 */
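    /*
     * Worked example (illustrative, assuming 4KiB pages, min_objects == 4 and
     * the default 1/16th waste limit): a 1000 byte object fits 4 times into an
     * order 0 slab with 96 bytes left over (96 <= 4096/16), so order 0 is
     * chosen. A 700 byte object would waste 596 bytes at order 0 (more than
     * 4096/16) but only 492 bytes at order 1 (less than 8192/16), so
     * slab_order() settles on order 1 with 11 objects per slab.
     */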
1967static inline int slab_order(int size, int min_objects,
1968                                int max_order, int fract_leftover)
1969{
1970        int order;
1971        int rem;
1972        int min_order = slub_min_order;
1973
1974        if ((PAGE_SIZE << min_order) / size > MAX_OBJS_PER_PAGE)
1975                return get_order(size * MAX_OBJS_PER_PAGE) - 1;
1976
1977        for (order = max(min_order,
1978                                fls(min_objects * size - 1) - PAGE_SHIFT);
1979                        order <= max_order; order++) {
1980
1981                unsigned long slab_size = PAGE_SIZE << order;
1982
1983                if (slab_size < min_objects * size)
1984                        continue;
1985
1986                rem = slab_size % size;
1987
1988                if (rem <= slab_size / fract_leftover)
1989                        break;
1990
1991        }
1992
1993        return order;
1994}
1995
1996static inline int calculate_order(int size)
1997{
1998        int order;
1999        int min_objects;
2000        int fraction;
2001        int max_objects;
2002
2003        /*
2004         * Attempt to find best configuration for a slab. This
2005         * works by first attempting to generate a layout with
2006         * the best configuration and backing off gradually.
2007         *
2008         * First we reduce the acceptable waste in a slab. Then
2009         * we reduce the minimum objects required in a slab.
2010         */
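            /*
             * E.g. on a machine with nr_cpu_ids == 4 and no slub_min_objects
             * override this starts at 4 * (fls(4) + 1) = 16 objects per slab
             * and backs off from there.
             */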
2011        min_objects = slub_min_objects;
2012        if (!min_objects)
2013                min_objects = 4 * (fls(nr_cpu_ids) + 1);
2014        max_objects = (PAGE_SIZE << slub_max_order) / size;
2015        min_objects = min(min_objects, max_objects);
2016
2017        while (min_objects > 1) {
2018                fraction = 16;
2019                while (fraction >= 4) {
2020                        order = slab_order(size, min_objects,
2021                                                slub_max_order, fraction);
2022                        if (order <= slub_max_order)
2023                                return order;
2024                        fraction /= 2;
2025                }
2026                min_objects--;
2027        }
2028
2029        /*
2030         * We were unable to place multiple objects in a slab. Now
2031         * let's see if we can place a single object there.
2032         */
2033        order = slab_order(size, 1, slub_max_order, 1);
2034        if (order <= slub_max_order)
2035                return order;
2036
2037        /*
2038         * Doh, this slab cannot be placed using slub_max_order.
2039         */
2040        order = slab_order(size, 1, MAX_ORDER, 1);
2041        if (order < MAX_ORDER)
2042                return order;
2043        return -ENOSYS;
2044}
2045
2046/*
2047 * Figure out what the alignment of the objects will be.
2048 */
2049static unsigned long calculate_alignment(unsigned long flags,
2050                unsigned long align, unsigned long size)
2051{
2052        /*
2053         * If the user wants hardware cache aligned objects then follow that
2054         * suggestion if the object is sufficiently large.
2055         *
2056         * The hardware cache alignment cannot override the specified
2057         * alignment though. If that is greater, then use it.
2058         */
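            /*
             * Illustrative example: with 64 byte cache lines a 20 byte object
             * only receives 32 byte alignment (64 is halved while the object
             * still fits into half of it), whereas a 40 byte object is aligned
             * to the full 64 byte cache line.
             */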
2059        if (flags & SLAB_HWCACHE_ALIGN) {
2060                unsigned long ralign = cache_line_size();
2061                while (size <= ralign / 2)
2062                        ralign /= 2;
2063                align = max(align, ralign);
2064        }
2065
2066        if (align < ARCH_SLAB_MINALIGN)
2067                align = ARCH_SLAB_MINALIGN;
2068
2069        return ALIGN(align, sizeof(void *));
2070}
2071
2072static void init_kmem_cache_cpu(struct kmem_cache *s,
2073                        struct kmem_cache_cpu *c)
2074{
2075        c->page = NULL;
2076        c->freelist = NULL;
2077        c->node = 0;
2078        c->offset = s->offset / sizeof(void *);
2079        c->objsize = s->objsize;
2080#ifdef CONFIG_SLUB_STATS
2081        memset(c->stat, 0, NR_SLUB_STAT_ITEMS * sizeof(unsigned));
2082#endif
2083}
2084
2085static void
2086init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
2087{
2088        n->nr_partial = 0;
2089        spin_lock_init(&n->list_lock);
2090        INIT_LIST_HEAD(&n->partial);
2091#ifdef CONFIG_SLUB_DEBUG
2092        atomic_long_set(&n->nr_slabs, 0);
2093        atomic_long_set(&n->total_objects, 0);
2094        INIT_LIST_HEAD(&n->full);
2095#endif
2096}
2097
2098#ifdef CONFIG_SMP
2099/*
2100 * Per cpu array for per cpu structures.
2101 *
2102 * The per cpu array places all kmem_cache_cpu structures from one processor
2103 * close together meaning that it becomes possible that multiple per cpu
2104 * structures are contained in one cacheline. This may be particularly
2105 * beneficial for the kmalloc caches.
2106 *
2107 * A desktop system typically has around 60-80 slabs. With 100 here we are
2108 * likely able to get per cpu structures for all caches from the array defined
2109 * here. We must be able to cover all kmalloc caches during bootstrap.
2110 *
2111 * If the per cpu array is exhausted then fall back to kmalloc
2112 * of individual cachelines. No sharing is possible then.
2113 */
2114#define NR_KMEM_CACHE_CPU 100
2115
2116static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU],
2117                      kmem_cache_cpu);
2118
2119static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free);
2120static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS);
2121
2122static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s,
2123                                                        int cpu, gfp_t flags)
2124{
2125        struct kmem_cache_cpu *c = per_cpu(kmem_cache_cpu_free, cpu);
2126
2127        if (c)
2128                per_cpu(kmem_cache_cpu_free, cpu) =
2129                                (void *)c->freelist;
2130        else {
2131                /* Table overflow: So allocate ourselves */
2132                c = kmalloc_node(
2133                        ALIGN(sizeof(struct kmem_cache_cpu), cache_line_size()),
2134                        flags, cpu_to_node(cpu));
2135                if (!c)
2136                        return NULL;
2137        }
2138
2139        init_kmem_cache_cpu(s, c);
2140        return c;
2141}
2142
2143static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu)
2144{
2145        if (c < per_cpu(kmem_cache_cpu, cpu) ||
2146                        c >= per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) {
2147                kfree(c);
2148                return;
2149        }
2150        c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu);
2151        per_cpu(kmem_cache_cpu_free, cpu) = c;
2152}
2153
2154static void free_kmem_cache_cpus(struct kmem_cache *s)
2155{
2156        int cpu;
2157
2158        for_each_online_cpu(cpu) {
2159                struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2160
2161                if (c) {
2162                        s->cpu_slab[cpu] = NULL;
2163                        free_kmem_cache_cpu(c, cpu);
2164                }
2165        }
2166}
2167
2168static int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
2169{
2170        int cpu;
2171
2172        for_each_online_cpu(cpu) {
2173                struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2174
2175                if (c)
2176                        continue;
2177
2178                c = alloc_kmem_cache_cpu(s, cpu, flags);
2179                if (!c) {
2180                        free_kmem_cache_cpus(s);
2181                        return 0;
2182                }
2183                s->cpu_slab[cpu] = c;
2184        }
2185        return 1;
2186}
2187
2188/*
2189 * Initialize the per cpu array.
2190 */
2191static void init_alloc_cpu_cpu(int cpu)
2192{
2193        int i;
2194
2195        if (cpumask_test_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once)))
2196                return;
2197
2198        for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--)
2199                free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu);
2200
2201        cpumask_set_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once));
2202}
2203
2204static void __init init_alloc_cpu(void)
2205{
2206        int cpu;
2207
2208        for_each_online_cpu(cpu)
2209                init_alloc_cpu_cpu(cpu);
2210}
2211
2212#else
2213static inline void free_kmem_cache_cpus(struct kmem_cache *s) {}
2214static inline void init_alloc_cpu(void) {}
2215
2216static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
2217{
2218        init_kmem_cache_cpu(s, &s->cpu_slab);
2219        return 1;
2220}
2221#endif
2222
2223#ifdef CONFIG_NUMA
2224/*
2225 * No kmalloc_node yet so do it by hand. We know that this is the first
2226 * slab on the node for this slabcache. There are no concurrent accesses
2227 * possible.
2228 *
2229 * Note that this function only works for the kmalloc_node_cache itself,
2230 * i.e. when allocating its own kmem_cache_node structures. This is used for
2231 * bootstrapping memory on a fresh node that has no slab structures yet.
2232 */
2233static void early_kmem_cache_node_alloc(gfp_t gfpflags, int node)
2234{
2235        struct page *page;
2236        struct kmem_cache_node *n;
2237        unsigned long flags;
2238
2239        BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));
2240
2241        page = new_slab(kmalloc_caches, gfpflags, node);
2242
2243        BUG_ON(!page);
2244        if (page_to_nid(page) != node) {
2245                printk(KERN_ERR "SLUB: Unable to allocate memory from "
2246                                "node %d\n", node);
2247                printk(KERN_ERR "SLUB: Allocating a useless per node structure "
2248                                "in order to be able to continue\n");
2249        }
2250
2251        n = page->freelist;
2252        BUG_ON(!n);
2253        page->freelist = get_freepointer(kmalloc_caches, n);
2254        page->inuse++;
2255        kmalloc_caches->node[node] = n;
2256#ifdef CONFIG_SLUB_DEBUG
2257        init_object(kmalloc_caches, n, 1);
2258        init_tracking(kmalloc_caches, n);
2259#endif
2260        init_kmem_cache_node(n, kmalloc_caches);
2261        inc_slabs_node(kmalloc_caches, node, page->objects);
2262
2263        /*
2264         * lockdep requires consistent irq usage for each lock
2265         * so even though there cannot be a race this early in
2266         * the boot sequence, we still disable irqs.
2267         */
2268        local_irq_save(flags);
2269        add_partial(n, page, 0);
2270        local_irq_restore(flags);
2271}
2272
2273static void free_kmem_cache_nodes(struct kmem_cache *s)
2274{
2275        int node;
2276
2277        for_each_node_state(node, N_NORMAL_MEMORY) {
2278                struct kmem_cache_node *n = s->node[node];
2279                if (n && n != &s->local_node)
2280                        kmem_cache_free(kmalloc_caches, n);
2281                s->node[node] = NULL;
2282        }
2283}
2284
2285static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2286{
2287        int node;
2288        int local_node;
2289
2290        if (slab_state >= UP)
2291                local_node = page_to_nid(virt_to_page(s));
2292        else
2293                local_node = 0;
2294
2295        for_each_node_state(node, N_NORMAL_MEMORY) {
2296                struct kmem_cache_node *n;
2297
2298                if (local_node == node)
2299                        n = &s->local_node;
2300                else {
2301                        if (slab_state == DOWN) {
2302                                early_kmem_cache_node_alloc(gfpflags, node);
2303                                continue;
2304                        }
2305                        n = kmem_cache_alloc_node(kmalloc_caches,
2306                                                        gfpflags, node);
2307
2308                        if (!n) {
2309                                free_kmem_cache_nodes(s);
2310                                return 0;
2311                        }
2312
2313                }
2314                s->node[node] = n;
2315                init_kmem_cache_node(n, s);
2316        }
2317        return 1;
2318}
2319#else
2320static void free_kmem_cache_nodes(struct kmem_cache *s)
2321{
2322}
2323
2324static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2325{
2326        init_kmem_cache_node(&s->local_node, s);
2327        return 1;
2328}
2329#endif
2330
2331static void set_min_partial(struct kmem_cache *s, unsigned long min)
2332{
2333        if (min < MIN_PARTIAL)
2334                min = MIN_PARTIAL;
2335        else if (min > MAX_PARTIAL)
2336                min = MAX_PARTIAL;
2337        s->min_partial = min;
2338}
2339
2340/*
2341 * calculate_sizes() determines the order and the distribution of data within
2342 * a slab object.
2343 */
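    /*
     * Layout example (illustrative, assuming 8 byte pointers and no debug
     * flags): a cache created with a 30 byte object size is first rounded up
     * to 32 bytes. Without a constructor, RCU or poisoning the free pointer
     * overlays the start of a free object, so s->offset stays 0 and s->size
     * ends up at 32. Adding a constructor relocates the free pointer behind
     * the object: s->offset becomes 32 and s->size grows to 40 bytes.
     */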
2344static int calculate_sizes(struct kmem_cache *s, int forced_order)
2345{
2346        unsigned long flags = s->flags;
2347        unsigned long size = s->objsize;
2348        unsigned long align = s->align;
2349        int order;
2350
2351        /*
2352         * Round up object size to the next word boundary. We can only
2353         * place the free pointer at word boundaries and this determines
2354         * the possible location of the free pointer.
2355         */
2356        size = ALIGN(size, sizeof(void *));
2357
2358#ifdef CONFIG_SLUB_DEBUG
2359        /*
2360         * Determine if we can poison the object itself. If the user of
2361         * the slab may touch the object after free or before allocation
2362         * then we should never poison the object itself.
2363         */
2364        if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) &&
2365                        !s->ctor)
2366                s->flags |= __OBJECT_POISON;
2367        else
2368                s->flags &= ~__OBJECT_POISON;
2369
2370
2371        /*
2372         * If we are Redzoning then check if there is some space between the
2373         * end of the object and the free pointer. If not then add an
2374         * additional word to have some bytes to store Redzone information.
2375         */
2376        if ((flags & SLAB_RED_ZONE) && size == s->objsize)
2377                size += sizeof(void *);
2378#endif
2379
2380        /*
2381         * With that we have determined the number of bytes in actual use
2382         * by the object. This is the potential offset to the free pointer.
2383         */
2384        s->inuse = size;
2385
2386        if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) ||
2387                s->ctor)) {
2388                /*
2389                 * Relocate free pointer after the object if it is not
2390                 * permitted to overwrite the first word of the object on
2391                 * kmem_cache_free.
2392                 *
2393                 * This is the case if we use RCU, have a constructor or
2394                 * are poisoning the objects.
2395                 */
2396                s->offset = size;
2397                size += sizeof(void *);
2398        }
2399
2400#ifdef CONFIG_SLUB_DEBUG
2401        if (flags & SLAB_STORE_USER)
2402                /*
2403                 * Need to store information about allocs and frees after
2404                 * the object.
2405                 */
2406                size += 2 * sizeof(struct track);
2407
2408        if (flags & SLAB_RED_ZONE)
2409                /*
2410                 * Add some empty padding so that we can catch
2411                 * overwrites from earlier objects rather than let
2412                 * tracking information or the free pointer be
2413                 * corrupted if a user writes before the start
2414                 * of the object.
2415                 */
2416                size += sizeof(void *);
2417#endif
2418
2419        /*
2420         * Determine the alignment based on various parameters that the
2421         * user specified and the dynamic determination of cache line size
2422         * on bootup.
2423         */
2424        align = calculate_alignment(flags, align, s->objsize);
2425        s->align = align;
2426
2427        /*
2428         * SLUB stores one object immediately after another beginning from
2429         * offset 0. In order to align the objects we have to simply size
2430         * each object to conform to the alignment.
2431         */
2432        size = ALIGN(size, align);
2433        s->size = size;
2434        if (forced_order >= 0)
2435                order = forced_order;
2436        else
2437                order = calculate_order(size);
2438
2439        if (order < 0)
2440                return 0;
2441
2442        s->allocflags = 0;
2443        if (order)
2444                s->allocflags |= __GFP_COMP;
2445
2446        if (s->flags & SLAB_CACHE_DMA)
2447                s->allocflags |= SLUB_DMA;
2448
2449        if (s->flags & SLAB_RECLAIM_ACCOUNT)
2450                s->allocflags |= __GFP_RECLAIMABLE;
2451
2452        /*
2453         * Determine the number of objects per slab
2454         */
2455        s->oo = oo_make(order, size);
2456        s->min = oo_make(get_order(size), size);
2457        if (oo_objects(s->oo) > oo_objects(s->max))
2458                s->max = s->oo;
2459
2460        return !!oo_objects(s->oo);
2461
2462}
2463
2464static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2465                const char *name, size_t size,
2466                size_t align, unsigned long flags,
2467                void (*ctor)(void *))
2468{
2469        memset(s, 0, kmem_size);
2470        s->name = name;
2471        s->ctor = ctor;
2472        s->objsize = size;
2473        s->align = align;
2474        s->flags = kmem_cache_flags(size, flags, name, ctor);
2475
2476        if (!calculate_sizes(s, -1))
2477                goto error;
2478        if (disable_higher_order_debug) {
2479                /*
2480                 * Disable debugging flags that store metadata if the min slab
2481                 * order increased.
2482                 */
2483                if (get_order(s->size) > get_order(s->objsize)) {
2484                        s->flags &= ~DEBUG_METADATA_FLAGS;
2485                        s->offset = 0;
2486                        if (!calculate_sizes(s, -1))
2487                                goto error;
2488                }
2489        }
2490
2491        /*
2492         * The larger the object size is, the more pages we want on the partial
2493         * list to avoid pounding the page allocator excessively.
2494         */
2495        set_min_partial(s, ilog2(s->size));
2496        s->refcount = 1;
2497#ifdef CONFIG_NUMA
2498        s->remote_node_defrag_ratio = 1000;
2499#endif
2500        if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
2501                goto error;
2502
2503        if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA))
2504                return 1;
2505        free_kmem_cache_nodes(s);
2506error:
2507        if (flags & SLAB_PANIC)
2508                panic("Cannot create slab %s size=%lu realsize=%u "
2509                        "order=%u offset=%u flags=%lx\n",
2510                        s->name, (unsigned long)size, s->size, oo_order(s->oo),
2511                        s->offset, flags);
2512        return 0;
2513}
2514
2515/*
2516 * Check if a given pointer is valid
2517 */
2518int kmem_ptr_validate(struct kmem_cache *s, const void *object)
2519{
2520        struct page *page;
2521
2522        page = get_object_page(object);
2523
2524        if (!page || s != page->slab)
2525                /* No slab or wrong slab */
2526                return 0;
2527
2528        if (!check_valid_pointer(s, page, object))
2529                return 0;
2530
2531        /*
2532         * We could also check if the object is on the slab's freelist.
2533         * But this would be too expensive and it seems that the main
2534         * purpose of kmem_ptr_validate() is to check if the object belongs
2535         * to a certain slab.
2536         */
2537        return 1;
2538}
2539EXPORT_SYMBOL(kmem_ptr_validate);
2540
2541/*
2542 * Determine the size of a slab object
2543 */
2544unsigned int kmem_cache_size(struct kmem_cache *s)
2545{
2546        return s->objsize;
2547}
2548EXPORT_SYMBOL(kmem_cache_size);
2549
2550const char *kmem_cache_name(struct kmem_cache *s)
2551{
2552        return s->name;
2553}
2554EXPORT_SYMBOL(kmem_cache_name);
2555
2556static void list_slab_objects(struct kmem_cache *s, struct page *page,
2557                                                        const char *text)
2558{
2559#ifdef CONFIG_SLUB_DEBUG
2560        void *addr = page_address(page);
2561        void *p;
2562        DECLARE_BITMAP(map, page->objects);
2563
2564        bitmap_zero(map, page->objects);
2565        slab_err(s, page, "%s", text);
2566        slab_lock(page);
2567        for_each_free_object(p, s, page->freelist)
2568                set_bit(slab_index(p, s, addr), map);
2569
2570        for_each_object(p, s, addr, page->objects) {
2571
2572                if (!test_bit(slab_index(p, s, addr), map)) {
2573                        printk(KERN_ERR "INFO: Object 0x%p @offset=%tu\n",
2574                                                        p, p - addr);
2575                        print_tracking(s, p);
2576                }
2577        }
2578        slab_unlock(page);
2579#endif
2580}
2581
2582/*
2583 * Attempt to free all partial slabs on a node.
2584 */
2585static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
2586{
2587        unsigned long flags;
2588        struct page *page, *h;
2589
2590        spin_lock_irqsave(&n->list_lock, flags);
2591        list_for_each_entry_safe(page, h, &n->partial, lru) {
2592                if (!page->inuse) {
2593                        list_del(&page->lru);
2594                        discard_slab(s, page);
2595                        n->nr_partial--;
2596                } else {
2597                        list_slab_objects(s, page,
2598                                "Objects remaining on kmem_cache_close()");
2599                }
2600        }
2601        spin_unlock_irqrestore(&n->list_lock, flags);
2602}
2603
2604/*
2605 * Release all resources used by a slab cache.
2606 */
2607static inline int kmem_cache_close(struct kmem_cache *s)
2608{
2609        int node;
2610
2611        flush_all(s);
2612
2613        /* Attempt to free all objects */
2614        free_kmem_cache_cpus(s);
2615        for_each_node_state(node, N_NORMAL_MEMORY) {
2616                struct kmem_cache_node *n = get_node(s, node);
2617
2618                free_partial(s, n);
2619                if (n->nr_partial || slabs_node(s, node))
2620                        return 1;
2621        }
2622        free_kmem_cache_nodes(s);
2623        return 0;
2624}
2625
2626/*
2627 * Close a cache and release the kmem_cache structure
2628 * (must be used for caches created using kmem_cache_create)
2629 */
2630void kmem_cache_destroy(struct kmem_cache *s)
2631{
2632        down_write(&slub_lock);
2633        s->refcount--;
2634        if (!s->refcount) {
2635                list_del(&s->list);
2636                up_write(&slub_lock);
2637                if (kmem_cache_close(s)) {
2638                        printk(KERN_ERR "SLUB %s: %s called for cache that "
2639                                "still has objects.\n", s->name, __func__);
2640                        dump_stack();
2641                }
2642                if (s->flags & SLAB_DESTROY_BY_RCU)
2643                        rcu_barrier();
2644                sysfs_slab_remove(s);
2645        } else
2646                up_write(&slub_lock);
2647}
2648EXPORT_SYMBOL(kmem_cache_destroy);
2649
2650/********************************************************************
2651 *              Kmalloc subsystem
2652 *******************************************************************/
2653
2654struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned;
2655EXPORT_SYMBOL(kmalloc_caches);
2656
2657static int __init setup_slub_min_order(char *str)
2658{
2659        get_option(&str, &slub_min_order);
2660
2661        return 1;
2662}
2663
2664__setup("slub_min_order=", setup_slub_min_order);
2665
2666static int __init setup_slub_max_order(char *str)
2667{
2668        get_option(&str, &slub_max_order);
2669        slub_max_order = min(slub_max_order, MAX_ORDER - 1);
2670
2671        return 1;
2672}
2673
2674__setup("slub_max_order=", setup_slub_max_order);
2675
2676static int __init setup_slub_min_objects(char *str)
2677{
2678        get_option(&str, &slub_min_objects);
2679
2680        return 1;
2681}
2682
2683__setup("slub_min_objects=", setup_slub_min_objects);
2684
2685static int __init setup_slub_nomerge(char *str)
2686{
2687        slub_nomerge = 1;
2688        return 1;
2689}
2690
2691__setup("slub_nomerge", setup_slub_nomerge);
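    /*
     * These are kernel command line parameters, e.g. booting with
     * "slub_max_order=1 slub_min_objects=16" caps slab pages at order 1 and
     * asks for at least 16 objects per slab; see Documentation/vm/slub.txt.
     */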
2692
2693static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
2694                const char *name, int size, gfp_t gfp_flags)
2695{
2696        unsigned int flags = 0;
2697
2698        if (gfp_flags & SLUB_DMA)
2699                flags = SLAB_CACHE_DMA;
2700
2701        /*
2702         * This function is called with IRQs disabled during early boot on a
2703         * single CPU, so there's no need to take slub_lock here.
2704         */
2705        if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,
2706                                                                flags, NULL))
2707                goto panic;
2708
2709        list_add(&s->list, &slab_caches);
2710
2711        if (sysfs_slab_add(s))
2712                goto panic;
2713        return s;
2714
2715panic:
2716        panic("Creation of kmalloc slab %s size=%d failed.\n", name, size);
2717}
2718
2719#ifdef CONFIG_ZONE_DMA
2720static struct kmem_cache *kmalloc_caches_dma[SLUB_PAGE_SHIFT];
2721
2722static void sysfs_add_func(struct work_struct *w)
2723{
2724        struct kmem_cache *s;
2725
2726        down_write(&slub_lock);
2727        list_for_each_entry(s, &slab_caches, list) {
2728                if (s->flags & __SYSFS_ADD_DEFERRED) {
2729                        s->flags &= ~__SYSFS_ADD_DEFERRED;
2730                        sysfs_slab_add(s);
2731                }
2732        }
2733        up_write(&slub_lock);
2734}
2735
2736static DECLARE_WORK(sysfs_add_work, sysfs_add_func);
2737
2738static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
2739{
2740        struct kmem_cache *s;
2741        char *text;
2742        size_t realsize;
2743        unsigned long slabflags;
2744
2745        s = kmalloc_caches_dma[index];
2746        if (s)
2747                return s;
2748
2749        /* Dynamically create dma cache */
2750        if (flags & __GFP_WAIT)
2751                down_write(&slub_lock);
2752        else {
2753                if (!down_write_trylock(&slub_lock))
2754                        goto out;
2755        }
2756
2757        if (kmalloc_caches_dma[index])
2758                goto unlock_out;
2759
2760        realsize = kmalloc_caches[index].objsize;
2761        text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d",
2762                         (unsigned int)realsize);
2763        s = kmalloc(kmem_size, flags & ~SLUB_DMA);
2764
2765        /*
2766         * Must defer sysfs creation to a workqueue because we don't know
2767         * what context we are called from. Before sysfs comes up, we don't
2768         * need to do anything because our sysfs initcall will start by
2769         * adding all existing slabs to sysfs.
2770         */
2771        slabflags = SLAB_CACHE_DMA|SLAB_NOTRACK;
2772        if (slab_state >= SYSFS)
2773                slabflags |= __SYSFS_ADD_DEFERRED;
2774
2775        if (!s || !text || !kmem_cache_open(s, flags, text,
2776                        realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) {
2777                kfree(s);
2778                kfree(text);
2779                goto unlock_out;
2780        }
2781
2782        list_add(&s->list, &slab_caches);
2783        kmalloc_caches_dma[index] = s;
2784
2785        if (slab_state >= SYSFS)
2786                schedule_work(&sysfs_add_work);
2787
2788unlock_out:
2789        up_write(&slub_lock);
2790out:
2791        return kmalloc_caches_dma[index];
2792}
2793#endif
2794
2795/*
2796 * Conversion table from small slab sizes (divided by 8) to the index in the
2797 * kmalloc array. This is necessary for slabs < 192 since we have non power
2798 * of two cache sizes there. The size of larger slabs can be determined using
2799 * fls.
2800 */
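    /*
     * For example (with the default, unpatched table): kmalloc(72) maps via
     * size_index[(72 - 1) / 8] == 1 to the 96 byte cache, kmalloc(100) via
     * size_index[12] == 7 to kmalloc-128, while kmalloc(300) bypasses the
     * table and uses fls(299) == 9, i.e. the 512 byte cache.
     */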
2801static s8 size_index[24] = {
2802        3,      /* 8 */
2803        4,      /* 16 */
2804        5,      /* 24 */
2805        5,      /* 32 */
2806        6,      /* 40 */
2807        6,      /* 48 */
2808        6,      /* 56 */
2809        6,      /* 64 */
2810        1,      /* 72 */
2811        1,      /* 80 */
2812        1,      /* 88 */
2813        1,      /* 96 */
2814        7,      /* 104 */
2815        7,      /* 112 */
2816        7,      /* 120 */
2817        7,      /* 128 */
2818        2,      /* 136 */
2819        2,      /* 144 */
2820        2,      /* 152 */
2821        2,      /* 160 */
2822        2,      /* 168 */
2823        2,      /* 176 */
2824        2,      /* 184 */
2825        2       /* 192 */
2826};
2827
2828static inline int size_index_elem(size_t bytes)
2829{
2830        return (bytes - 1) / 8;
2831}
2832
2833static struct kmem_cache *get_slab(size_t size, gfp_t flags)
2834{
2835        int index;
2836
2837        if (size <= 192) {
2838                if (!size)
2839                        return ZERO_SIZE_PTR;
2840
2841                index = size_index[size_index_elem(size)];
2842        } else
2843                index = fls(size - 1);
2844
2845#ifdef CONFIG_ZONE_DMA
2846        if (unlikely((flags & SLUB_DMA)))
2847                return dma_kmalloc_cache(index, flags);
2848
2849#endif
2850        return &kmalloc_caches[index];
2851}
2852
2853void *__kmalloc(size_t size, gfp_t flags)
2854{
2855        struct kmem_cache *s;
2856        void *ret;
2857
2858        if (unlikely(size > SLUB_MAX_SIZE))
2859                return kmalloc_large(size, flags);
2860
2861        s = get_slab(size, flags);
2862
2863        if (unlikely(ZERO_OR_NULL_PTR(s)))
2864                return s;
2865
2866        ret = slab_alloc(s, flags, -1, _RET_IP_);
2867
2868        trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
2869
2870        return ret;
2871}
2872EXPORT_SYMBOL(__kmalloc);
2873
2874static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
2875{
2876        struct page *page;
2877        void *ptr = NULL;
2878
2879        flags |= __GFP_COMP | __GFP_NOTRACK;
2880        page = alloc_pages_node(node, flags, get_order(size));
2881        if (page)
2882                ptr = page_address(page);
2883
2884        kmemleak_alloc(ptr, size, 1, flags);
2885        return ptr;
2886}
2887
2888#ifdef CONFIG_NUMA
2889void *__kmalloc_node(size_t size, gfp_t flags, int node)
2890{
2891        struct kmem_cache *s;
2892        void *ret;
2893
2894        if (unlikely(size > SLUB_MAX_SIZE)) {
2895                ret = kmalloc_large_node(size, flags, node);
2896
2897                trace_kmalloc_node(_RET_IP_, ret,
2898                                   size, PAGE_SIZE << get_order(size),
2899                                   flags, node);
2900
2901                return ret;
2902        }
2903
2904        s = get_slab(size, flags);
2905
2906        if (unlikely(ZERO_OR_NULL_PTR(s)))
2907                return s;
2908
2909        ret = slab_alloc(s, flags, node, _RET_IP_);
2910
2911        trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
2912
2913        return ret;
2914}
2915EXPORT_SYMBOL(__kmalloc_node);
2916#endif
2917
2918size_t ksize(const void *object)
2919{
2920        struct page *page;
2921        struct kmem_cache *s;
2922
2923        if (unlikely(object == ZERO_SIZE_PTR))
2924                return 0;
2925
2926        page = virt_to_head_page(object);
2927
2928        if (unlikely(!PageSlab(page))) {
2929                WARN_ON(!PageCompound(page));
2930                return PAGE_SIZE << compound_order(page);
2931        }
2932        s = page->slab;
2933
2934#ifdef CONFIG_SLUB_DEBUG
2935        /*
2936         * Debugging requires use of the padding between the object
2937         * and whatever may come after it.
2938         */
2939        if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
2940                return s->objsize;
2941
2942#endif
2943        /*
2944         * If we have the need to store the freelist pointer
2945         * back there or track user information then we can
2946         * only use the space before that information.
2947         */
2948        if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
2949                return s->inuse;
2950        /*
2951         * Else we can use all the padding etc for the allocation
2952         */
2953        return s->size;
2954}
2955EXPORT_SYMBOL(ksize);
2956
2957void kfree(const void *x)
2958{
2959        struct page *page;
2960        void *object = (void *)x;
2961
2962        trace_kfree(_RET_IP_, x);
2963
2964        if (unlikely(ZERO_OR_NULL_PTR(x)))
2965                return;
2966
2967        page = virt_to_head_page(x);
2968        if (unlikely(!PageSlab(page))) {
2969                BUG_ON(!PageCompound(page));
2970                kmemleak_free(x);
2971                put_page(page);
2972                return;
2973        }
2974        slab_free(page->slab, page, object, _RET_IP_);
2975}
2976EXPORT_SYMBOL(kfree);
2977
2978/*
2979 * kmem_cache_shrink removes empty slabs from the partial lists and sorts
2980 * the remaining slabs by the number of items in use. The slabs with the
2981 * most items in use come first. New allocations will then fill those up
2982 * and thus they can be removed from the partial lists.
2983 *
2984 * The slabs with the fewest items in use are placed last. As a result they
2985 * are allocated from last, which increases the chance that their remaining
2986 * objects are freed and the slabs can eventually be discarded.
2987 */
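    /*
     * E.g. a node whose partial list holds slabs with 7, 0 and 3 objects in
     * use is rebuilt in the order 7, 3, with the empty slab handed back to
     * the page allocator.
     */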
2988int kmem_cache_shrink(struct kmem_cache *s)
2989{
2990        int node;
2991        int i;
2992        struct kmem_cache_node *n;
2993        struct page *page;
2994        struct page *t;
2995        int objects = oo_objects(s->max);
2996        struct list_head *slabs_by_inuse =
2997                kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL);
2998        unsigned long flags;
2999
3000        if (!slabs_by_inuse)
3001                return -ENOMEM;
3002
3003        flush_all(s);
3004        for_each_node_state(node, N_NORMAL_MEMORY) {
3005                n = get_node(s, node);
3006
3007                if (!n->nr_partial)
3008                        continue;
3009
3010                for (i = 0; i < objects; i++)
3011                        INIT_LIST_HEAD(slabs_by_inuse + i);
3012
3013                spin_lock_irqsave(&n->list_lock, flags);
3014
3015                /*
3016                 * Build lists indexed by the items in use in each slab.
3017                 *
3018                 * Note that concurrent frees may occur while we hold the
3019                 * list_lock. page->inuse here is the upper limit.
3020                 */
3021                list_for_each_entry_safe(page, t, &n->partial, lru) {
3022                        if (!page->inuse && slab_trylock(page)) {
3023                                /*
3024                                 * Must hold slab lock here because slab_free
3025                                 * may have freed the last object and be
3026                                 * waiting to release the slab.
3027                                 */
3028                                list_del(&page->lru);
3029                                n->nr_partial--;
3030                                slab_unlock(page);
3031                                discard_slab(s, page);
3032                        } else {
3033                                list_move(&page->lru,
3034                                          slabs_by_inuse + page->inuse);
3035                        }
3036                }
3037
3038                /*
3039                 * Rebuild the partial list with the slabs filled up most
3040                 * first and the least used slabs at the end.
3041                 */
3042                for (i = objects - 1; i >= 0; i--)
3043                        list_splice(slabs_by_inuse + i, n->partial.prev);
3044
3045                spin_unlock_irqrestore(&n->list_lock, flags);
3046        }
3047
3048        kfree(slabs_by_inuse);
3049        return 0;
3050}
3051EXPORT_SYMBOL(kmem_cache_shrink);
3052
3053#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
3054static int slab_mem_going_offline_callback(void *arg)
3055{
3056        struct kmem_cache *s;
3057
3058        down_read(&slub_lock);
3059        list_for_each_entry(s, &slab_caches, list)
3060                kmem_cache_shrink(s);
3061        up_read(&slub_lock);
3062
3063        return 0;
3064}
3065
3066static void slab_mem_offline_callback(void *arg)
3067{
3068        struct kmem_cache_node *n;
3069        struct kmem_cache *s;
3070        struct memory_notify *marg = arg;
3071        int offline_node;
3072
3073        offline_node = marg->status_change_nid;
3074
3075        /*
3076         * If the node still has available memory then we still need its
3077         * kmem_cache_node structure, so there is nothing to tear down here.
3078         */
3079        if (offline_node < 0)
3080                return;
3081
3082        down_read(&slub_lock);
3083        list_for_each_entry(s, &slab_caches, list) {
3084                n = get_node(s, offline_node);
3085                if (n) {
3086                        /*
3087                         * if n->nr_slabs > 0, slabs still exist on the node
3088                         * that is going down. We were unable to free them,
3089                         * and the offline_pages() function shouldn't have called
3090                         * this callback. So, we must fail.
3091                         */
3092                        BUG_ON(slabs_node(s, offline_node));
3093
3094                        s->node[offline_node] = NULL;
3095                        kmem_cache_free(kmalloc_caches, n);
3096                }
3097        }
3098        up_read(&slub_lock);
3099}
3100
3101static int slab_mem_going_online_callback(void *arg)
3102{
3103        struct kmem_cache_node *n;
3104        struct kmem_cache *s;
3105        struct memory_notify *marg = arg;
3106        int nid = marg->status_change_nid;
3107        int ret = 0;
3108
3109        /*
3110         * If the node's memory is already available, then kmem_cache_node is
3111         * already created. Nothing to do.
3112         */
3113        if (nid < 0)
3114                return 0;
3115
3116        /*
3117         * We are bringing a node online. No memory is available yet. We must
3118         * allocate a kmem_cache_node structure in order to bring the node
3119         * online.
3120         */
3121        down_read(&slub_lock);
3122        list_for_each_entry(s, &slab_caches, list) {
3123                /*
3124                 * XXX: kmem_cache_alloc_node will fall back to other nodes
3125                 *      since memory is not yet available from the node that
3126                 *      is brought up.
3127                 */
3128                n = kmem_cache_alloc(kmalloc_caches, GFP_KERNEL);
3129                if (!n) {
3130                        ret = -ENOMEM;
3131                        goto out;
3132                }
3133                init_kmem_cache_node(n, s);
3134                s->node[nid] = n;
3135        }
3136out:
3137        up_read(&slub_lock);
3138        return ret;
3139}
3140
3141static int slab_memory_callback(struct notifier_block *self,
3142                                unsigned long action, void *arg)
3143{
3144        int ret = 0;
3145
3146        switch (action) {
3147        case MEM_GOING_ONLINE:
3148                ret = slab_mem_going_online_callback(arg);
3149                break;
3150        case MEM_GOING_OFFLINE:
3151                ret = slab_mem_going_offline_callback(arg);
3152                break;
3153        case MEM_OFFLINE:
3154        case MEM_CANCEL_ONLINE:
3155                slab_mem_offline_callback(arg);
3156                break;
3157        case MEM_ONLINE:
3158        case MEM_CANCEL_OFFLINE:
3159                break;
3160        }
3161        if (ret)
3162                ret = notifier_from_errno(ret);
3163        else
3164                ret = NOTIFY_OK;
3165        return ret;
3166}
3167
3168#endif /* CONFIG_MEMORY_HOTPLUG */
3169
3170/********************************************************************
3171 *                      Basic setup of slabs
3172 *******************************************************************/
3173
3174void __init kmem_cache_init(void)
3175{
3176        int i;
3177        int caches = 0;
3178
3179        init_alloc_cpu();
3180
3181#ifdef CONFIG_NUMA
3182        /*
3183         * Must first have the slab cache available for the allocations of the
3184         * struct kmem_cache_node's. There is special bootstrap code in
3185         * kmem_cache_open for slab_state == DOWN.
3186         */
3187        create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
3188                sizeof(struct kmem_cache_node), GFP_NOWAIT);
3189        kmalloc_caches[0].refcount = -1;
3190        caches++;
3191
3192        hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
3193#endif
3194
3195        /* Able to allocate the per node structures */
3196        slab_state = PARTIAL;
3197
3198        /* Caches that are not of a power-of-two size */
3199        if (KMALLOC_MIN_SIZE <= 32) {
3200                create_kmalloc_cache(&kmalloc_caches[1],
3201                                "kmalloc-96", 96, GFP_NOWAIT);
3202                caches++;
3203        }
3204        if (KMALLOC_MIN_SIZE <= 64) {
3205                create_kmalloc_cache(&kmalloc_caches[2],
3206                                "kmalloc-192", 192, GFP_NOWAIT);
3207                caches++;
3208        }
3209
3210        for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
3211                create_kmalloc_cache(&kmalloc_caches[i],
3212                        "kmalloc", 1 << i, GFP_NOWAIT);
3213                caches++;
3214        }
3215
3216
3217        /*
3218         * Patch up the size_index table if we have strange large alignment
3219         * requirements for the kmalloc array. This is only the case for
3220         * MIPS it seems. The standard arches will not generate any code here.
3221         *
3222         * Largest permitted alignment is 256 bytes due to the way we
3223         * handle the index determination for the smaller caches.
3224         *
3225         * Make sure that nothing crazy happens if someone starts tinkering
3226         * around with ARCH_KMALLOC_MINALIGN
3227         */
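            /*
             * For example, with a 64 byte KMALLOC_MIN_SIZE the loop below
             * redirects all requests of up to 64 bytes to the 64 byte cache,
             * and the branch after it sends 72-96 byte requests to kmalloc-128
             * since the kmalloc-96 cache is not created in that configuration.
             */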
3228        BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
3229                (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
3230
3231        for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
3232                int elem = size_index_elem(i);
3233                if (elem >= ARRAY_SIZE(size_index))
3234                        break;
3235                size_index[elem] = KMALLOC_SHIFT_LOW;
3236        }
3237
3238        if (KMALLOC_MIN_SIZE == 64) {
3239                /*
3240                 * The 96 byte sized cache is not used if the alignment
3241                 * is 64 bytes.
3242                 */
3243                for (i = 64 + 8; i <= 96; i += 8)
3244                        size_index[size_index_elem(i)] = 7;
3245        } else if (KMALLOC_MIN_SIZE == 128) {
3246                /*
3247                 * The 192 byte sized cache is not used if the alignment
3248                 * is 128 bytes. Redirect kmalloc to use the 256 byte cache
3249                 * instead.
3250                 */
3251                for (i = 128 + 8; i <= 192; i += 8)
3252                        size_index[size_index_elem(i)] = 8;
3253        }
3254
3255        slab_state = UP;
3256
3257        /* Provide the correct kmalloc names now that the caches are up */
3258        for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++)
3259                kmalloc_caches[i].name =
3260                        kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i);
3261
3262#ifdef CONFIG_SMP
3263        register_cpu_notifier(&slab_notifier);
3264        kmem_size = offsetof(struct kmem_cache, cpu_slab) +
3265                                nr_cpu_ids * sizeof(struct kmem_cache_cpu *);
3266#else
3267        kmem_size = sizeof(struct kmem_cache);
3268#endif
3269
3270        printk(KERN_INFO
3271                "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
3272                " CPUs=%d, Nodes=%d\n",
3273                caches, cache_line_size(),
3274                slub_min_order, slub_max_order, slub_min_objects,
3275                nr_cpu_ids, nr_node_ids);
3276}
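
/*
 * A worked example of the size_index fixups above (illustrative only; the
 * actual lookup is done by get_slab() elsewhere in this file): with
 * KMALLOC_MIN_SIZE == 64 no kmalloc-96 cache is created, so a request such as
 *
 *	buf = kmalloc(80, GFP_KERNEL);
 *
 * is redirected by size_index to index 7, i.e. the kmalloc-128 cache, instead
 * of the 96 byte cache that a KMALLOC_MIN_SIZE == 8 configuration would use.
 */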
3277
3278void __init kmem_cache_init_late(void)
3279{
3280}
3281
3282/*
3283 * Find a mergeable slab cache
3284 */
3285static int slab_unmergeable(struct kmem_cache *s)
3286{
3287        if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE))
3288                return 1;
3289
3290        if (s->ctor)
3291                return 1;
3292
3293        /*
3294         * We may have set a slab to be unmergeable during bootstrap.
3295         */
3296        if (s->refcount < 0)
3297                return 1;
3298
3299        return 0;
3300}
3301
3302static struct kmem_cache *find_mergeable(size_t size,
3303                size_t align, unsigned long flags, const char *name,
3304                void (*ctor)(void *))
3305{
3306        struct kmem_cache *s;
3307
3308        if (slub_nomerge || (flags & SLUB_NEVER_MERGE))
3309                return NULL;
3310
3311        if (ctor)
3312                return NULL;
3313
3314        size = ALIGN(size, sizeof(void *));
3315        align = calculate_alignment(flags, align, size);
3316        size = ALIGN(size, align);
3317        flags = kmem_cache_flags(size, flags, name, NULL);
3318
3319        list_for_each_entry(s, &slab_caches, list) {
3320                if (slab_unmergeable(s))
3321                        continue;
3322
3323                if (size > s->size)
3324                        continue;
3325
3326                if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME))
3327                        continue;
3328                /*
3329                 * Check if alignment is compatible.
3330                 * Courtesy of Adrian Drzewiecki
3331                 */
3332                if ((s->size & ~(align - 1)) != s->size)
3333                        continue;
3334
3335                if (s->size - size >= sizeof(void *))
3336                        continue;
3337
3338                return s;
3339        }
3340        return NULL;
3341}
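
/*
 * Merging example (a sketch; the exact outcome depends on the debug options
 * and alignment in effect): on a kernel without slab debugging, a request like
 *
 *	s = kmem_cache_create("foo", 192, 0, 0, NULL);
 *
 * has no constructor and none of the SLUB_NEVER_MERGE flags, so
 * find_mergeable() will typically return the existing kmalloc-192 cache and
 * "foo" merely becomes a sysfs alias of it instead of a separate slab cache.
 */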
3342
3343struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3344                size_t align, unsigned long flags, void (*ctor)(void *))
3345{
3346        struct kmem_cache *s;
3347
3348        if (WARN_ON(!name))
3349                return NULL;
3350
3351        down_write(&slub_lock);
3352        s = find_mergeable(size, align, flags, name, ctor);
3353        if (s) {
3354                int cpu;
3355
3356                s->refcount++;
3357                /*
3358                 * Adjust the object sizes so that we clear
3359                 * the complete object on kzalloc.
3360                 */
3361                s->objsize = max(s->objsize, (int)size);
3362
3363                /*
3364                 * And then we need to update the object size in the
3365                 * per cpu structures
3366                 */
3367                for_each_online_cpu(cpu)
3368                        get_cpu_slab(s, cpu)->objsize = s->objsize;
3369
3370                s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
3371                up_write(&slub_lock);
3372
3373                if (sysfs_slab_alias(s, name)) {
3374                        down_write(&slub_lock);
3375                        s->refcount--;
3376                        up_write(&slub_lock);
3377                        goto err;
3378                }
3379                return s;
3380        }
3381
3382        s = kmalloc(kmem_size, GFP_KERNEL);
3383        if (s) {
3384                if (kmem_cache_open(s, GFP_KERNEL, name,
3385                                size, align, flags, ctor)) {
3386                        list_add(&s->list, &slab_caches);
3387                        up_write(&slub_lock);
3388                        if (sysfs_slab_add(s)) {
3389                                down_write(&slub_lock);
3390                                list_del(&s->list);
3391                                up_write(&slub_lock);
3392                                kfree(s);
3393                                goto err;
3394                        }
3395                        return s;
3396                }
3397                kfree(s);
3398        }
3399        up_write(&slub_lock);
3400
3401err:
3402        if (flags & SLAB_PANIC)
3403                panic("Cannot create slabcache %s\n", name);
3404        else
3405                s = NULL;
3406        return s;
3407}
3408EXPORT_SYMBOL(kmem_cache_create);
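
/*
 * Typical use of the interface above by other kernel code (a minimal sketch;
 * struct foo and foo_cache are made-up names and error handling is reduced to
 * the bare minimum):
 *
 *	static struct kmem_cache *foo_cache;
 *
 *	foo_cache = kmem_cache_create("foo", sizeof(struct foo), 0,
 *					SLAB_HWCACHE_ALIGN, NULL);
 *	if (!foo_cache)
 *		return -ENOMEM;
 *
 *	struct foo *f = kmem_cache_alloc(foo_cache, GFP_KERNEL);
 *	...
 *	kmem_cache_free(foo_cache, f);
 *	kmem_cache_destroy(foo_cache);
 */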
3409
3410#ifdef CONFIG_SMP
3411/*
3412 * Use the cpu notifier to ensure that the cpu slabs are flushed when
3413 * necessary.
3414 */
3415static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
3416                unsigned long action, void *hcpu)
3417{
3418        long cpu = (long)hcpu;
3419        struct kmem_cache *s;
3420        unsigned long flags;
3421
3422        switch (action) {
3423        case CPU_UP_PREPARE:
3424        case CPU_UP_PREPARE_FROZEN:
3425                init_alloc_cpu_cpu(cpu);
3426                down_read(&slub_lock);
3427                list_for_each_entry(s, &slab_caches, list)
3428                        s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu,
3429                                                        GFP_KERNEL);
3430                up_read(&slub_lock);
3431                break;
3432
3433        case CPU_UP_CANCELED:
3434        case CPU_UP_CANCELED_FROZEN:
3435        case CPU_DEAD:
3436        case CPU_DEAD_FROZEN:
3437                down_read(&slub_lock);
3438                list_for_each_entry(s, &slab_caches, list) {
3439                        struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
3440
3441                        local_irq_save(flags);
3442                        __flush_cpu_slab(s, cpu);
3443                        local_irq_restore(flags);
3444                        free_kmem_cache_cpu(c, cpu);
3445                        s->cpu_slab[cpu] = NULL;
3446                }
3447                up_read(&slub_lock);
3448                break;
3449        default:
3450                break;
3451        }
3452        return NOTIFY_OK;
3453}
3454
3455static struct notifier_block __cpuinitdata slab_notifier = {
3456        .notifier_call = slab_cpuup_callback
3457};
3458
3459#endif
3460
3461void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
3462{
3463        struct kmem_cache *s;
3464        void *ret;
3465
3466        if (unlikely(size > SLUB_MAX_SIZE))
3467                return kmalloc_large(size, gfpflags);
3468
3469        s = get_slab(size, gfpflags);
3470
3471        if (unlikely(ZERO_OR_NULL_PTR(s)))
3472                return s;
3473
3474        ret = slab_alloc(s, gfpflags, -1, caller);
3475
3476        /* Honor the call site pointer we received. */
3477        trace_kmalloc(caller, ret, size, s->size, gfpflags);
3478
3479        return ret;
3480}
3481
3482void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
3483                                        int node, unsigned long caller)
3484{
3485        struct kmem_cache *s;
3486        void *ret;
3487
3488        if (unlikely(size > SLUB_MAX_SIZE))
3489                return kmalloc_large_node(size, gfpflags, node);
3490
3491        s = get_slab(size, gfpflags);
3492
3493        if (unlikely(ZERO_OR_NULL_PTR(s)))
3494                return s;
3495
3496        ret = slab_alloc(s, gfpflags, node, caller);
3497
3498        /* Honor the call site pointer we received. */
3499        trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
3500
3501        return ret;
3502}
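
/*
 * The *_track_caller() variants above exist so that wrappers around kmalloc()
 * can record their own caller instead of the wrapper itself. A sketch of such
 * a wrapper (my_kstrdup is a made-up name; kstrdup() in mm/util.c works along
 * these lines via the kmalloc_track_caller() macro):
 *
 *	char *my_kstrdup(const char *s, gfp_t gfp)
 *	{
 *		size_t len = strlen(s) + 1;
 *		char *buf = __kmalloc_track_caller(len, gfp, _RET_IP_);
 *
 *		if (buf)
 *			memcpy(buf, s, len);
 *		return buf;
 *	}
 */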
3503
3504#ifdef CONFIG_SLUB_DEBUG
3505static int count_inuse(struct page *page)
3506{
3507        return page->inuse;
3508}
3509
3510static int count_total(struct page *page)
3511{
3512        return page->objects;
3513}
3514
3515static int validate_slab(struct kmem_cache *s, struct page *page,
3516                                                unsigned long *map)
3517{
3518        void *p;
3519        void *addr = page_address(page);
3520
3521        if (!check_slab(s, page) ||
3522                        !on_freelist(s, page, NULL))
3523                return 0;
3524
3525        /* Now we know that a valid freelist exists */
3526        bitmap_zero(map, page->objects);
3527
3528        for_each_free_object(p, s, page->freelist) {
3529                set_bit(slab_index(p, s, addr), map);
3530                if (!check_object(s, page, p, 0))
3531                        return 0;
3532        }
3533
3534        for_each_object(p, s, addr, page->objects)
3535                if (!test_bit(slab_index(p, s, addr), map))
3536                        if (!check_object(s, page, p, 1))
3537                                return 0;
3538        return 1;
3539}
3540
3541static void validate_slab_slab(struct kmem_cache *s, struct page *page,
3542                                                unsigned long *map)
3543{
3544        if (slab_trylock(page)) {
3545                validate_slab(s, page, map);
3546                slab_unlock(page);
3547        } else
3548                printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n",
3549                        s->name, page);
3550
3551        if (s->flags & DEBUG_DEFAULT_FLAGS) {
3552                if (!PageSlubDebug(page))
3553                        printk(KERN_ERR "SLUB %s: SlubDebug not set "
3554                                "on slab 0x%p\n", s->name, page);
3555        } else {
3556                if (PageSlubDebug(page))
3557                        printk(KERN_ERR "SLUB %s: SlubDebug set on "
3558                                "slab 0x%p\n", s->name, page);
3559        }
3560}
3561
3562static int validate_slab_node(struct kmem_cache *s,
3563                struct kmem_cache_node *n, unsigned long *map)
3564{
3565        unsigned long count = 0;
3566        struct page *page;
3567        unsigned long flags;
3568
3569        spin_lock_irqsave(&n->list_lock, flags);
3570
3571        list_for_each_entry(page, &n->partial, lru) {
3572                validate_slab_slab(s, page, map);
3573                count++;
3574        }
3575        if (count != n->nr_partial)
3576                printk(KERN_ERR "SLUB %s: %ld partial slabs counted but "
3577                        "counter=%ld\n", s->name, count, n->nr_partial);
3578
3579        if (!(s->flags & SLAB_STORE_USER))
3580                goto out;
3581
3582        list_for_each_entry(page, &n->full, lru) {
3583                validate_slab_slab(s, page, map);
3584                count++;
3585        }
3586        if (count != atomic_long_read(&n->nr_slabs))
3587                printk(KERN_ERR "SLUB %s: %ld slabs counted but "
3588                        "counter=%ld\n", s->name, count,
3589                        atomic_long_read(&n->nr_slabs));
3590
3591out:
3592        spin_unlock_irqrestore(&n->list_lock, flags);
3593        return count;
3594}
3595
3596static long validate_slab_cache(struct kmem_cache *s)
3597{
3598        int node;
3599        unsigned long count = 0;
3600        unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) *
3601                                sizeof(unsigned long), GFP_KERNEL);
3602
3603        if (!map)
3604                return -ENOMEM;
3605
3606        flush_all(s);
3607        for_each_node_state(node, N_NORMAL_MEMORY) {
3608                struct kmem_cache_node *n = get_node(s, node);
3609
3610                count += validate_slab_node(s, n, map);
3611        }
3612        kfree(map);
3613        return count;
3614}
3615
3616#ifdef SLUB_RESILIENCY_TEST
3617static void resiliency_test(void)
3618{
3619        u8 *p;
3620
3621        printk(KERN_ERR "SLUB resiliency testing\n");
3622        printk(KERN_ERR "-----------------------\n");
3623        printk(KERN_ERR "A. Corruption after allocation\n");
3624
3625        p = kzalloc(16, GFP_KERNEL);
3626        p[16] = 0x12;
3627        printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
3628                        " 0x12->0x%p\n\n", p + 16);
3629
3630        validate_slab_cache(kmalloc_caches + 4);
3631
3632        /* Hmmm... The next two are dangerous */
3633        p = kzalloc(32, GFP_KERNEL);
3634        p[32 + sizeof(void *)] = 0x34;
3635        printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
3636                        " 0x34 -> 0x%p\n", p);
3637        printk(KERN_ERR
3638                "If allocated object is overwritten then not detectable\n\n");
3639
3640        validate_slab_cache(kmalloc_caches + 5);
3641        p = kzalloc(64, GFP_KERNEL);
3642        p += 64 + (get_cycles() & 0xff) * sizeof(void *);
3643        *p = 0x56;
3644        printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
3645                                                                        p);
3646        printk(KERN_ERR
3647                "If allocated object is overwritten then not detectable\n\n");
3648        validate_slab_cache(kmalloc_caches + 6);
3649
3650        printk(KERN_ERR "\nB. Corruption after free\n");
3651        p = kzalloc(128, GFP_KERNEL);
3652        kfree(p);
3653        *p = 0x78;
3654        printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
3655        validate_slab_cache(kmalloc_caches + 7);
3656
3657        p = kzalloc(256, GFP_KERNEL);
3658        kfree(p);
3659        p[50] = 0x9a;
3660        printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n",
3661                        p);
3662        validate_slab_cache(kmalloc_caches + 8);
3663
3664        p = kzalloc(512, GFP_KERNEL);
3665        kfree(p);
3666        p[512] = 0xab;
3667        printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
3668        validate_slab_cache(kmalloc_caches + 9);
3669}
3670#else
3671static void resiliency_test(void) {}
3672#endif
3673
3674/*
3675 * Generate lists of code addresses where slabcache objects are allocated
3676 * and freed.
3677 */
3678
3679struct location {
3680        unsigned long count;
3681        unsigned long addr;
3682        long long sum_time;
3683        long min_time;
3684        long max_time;
3685        long min_pid;
3686        long max_pid;
3687        DECLARE_BITMAP(cpus, NR_CPUS);
3688        nodemask_t nodes;
3689};
3690
3691struct loc_track {
3692        unsigned long max;
3693        unsigned long count;
3694        struct location *loc;
3695};
3696
3697static void free_loc_track(struct loc_track *t)
3698{
3699        if (t->max)
3700                free_pages((unsigned long)t->loc,
3701                        get_order(sizeof(struct location) * t->max));
3702}
3703
3704static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
3705{
3706        struct location *l;
3707        int order;
3708
3709        order = get_order(sizeof(struct location) * max);
3710
3711        l = (void *)__get_free_pages(flags, order);
3712        if (!l)
3713                return 0;
3714
3715        if (t->count) {
3716                memcpy(l, t->loc, sizeof(struct location) * t->count);
3717                free_loc_track(t);
3718        }
3719        t->max = max;
3720        t->loc = l;
3721        return 1;
3722}
3723
3724static int add_location(struct loc_track *t, struct kmem_cache *s,
3725                                const struct track *track)
3726{
3727        long start, end, pos;
3728        struct location *l;
3729        unsigned long caddr;
3730        unsigned long age = jiffies - track->when;
3731
3732        start = -1;
3733        end = t->count;
3734
3735        for ( ; ; ) {
3736                pos = start + (end - start + 1) / 2;
3737
3738                /*
3739                 * There is nothing at "end". If we end up there
3740                 * we need to insert the new element before end.
3741                 */
3742                if (pos == end)
3743                        break;
3744
3745                caddr = t->loc[pos].addr;
3746                if (track->addr == caddr) {
3747
3748                        l = &t->loc[pos];
3749                        l->count++;
3750                        if (track->when) {
3751                                l->sum_time += age;
3752                                if (age < l->min_time)
3753                                        l->min_time = age;
3754                                if (age > l->max_time)
3755                                        l->max_time = age;
3756
3757                                if (track->pid < l->min_pid)
3758                                        l->min_pid = track->pid;
3759                                if (track->pid > l->max_pid)
3760                                        l->max_pid = track->pid;
3761
3762                                cpumask_set_cpu(track->cpu,
3763                                                to_cpumask(l->cpus));
3764                        }
3765                        node_set(page_to_nid(virt_to_page(track)), l->nodes);
3766                        return 1;
3767                }
3768
3769                if (track->addr < caddr)
3770                        end = pos;
3771                else
3772                        start = pos;
3773        }
3774
3775        /*
3776         * Not found. Insert new tracking element.
3777         */
3778        if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC))
3779                return 0;
3780
3781        l = t->loc + pos;
3782        if (pos < t->count)
3783                memmove(l + 1, l,
3784                        (t->count - pos) * sizeof(struct location));
3785        t->count++;
3786        l->count = 1;
3787        l->addr = track->addr;
3788        l->sum_time = age;
3789        l->min_time = age;
3790        l->max_time = age;
3791        l->min_pid = track->pid;
3792        l->max_pid = track->pid;
3793        cpumask_clear(to_cpumask(l->cpus));
3794        cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
3795        nodes_clear(l->nodes);
3796        node_set(page_to_nid(virt_to_page(track)), l->nodes);
3797        return 1;
3798}
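
/*
 * add_location() keeps t->loc sorted by caller address and locates the slot
 * with a binary search. A small worked example (addresses made up): with
 * existing entries for 0x100, 0x200 and 0x400 and a new track at 0x300, the
 * loop narrows start/end until pos == end == 2, the memmove() shifts the tail
 * up by one element and the new location is written at t->loc[2], keeping the
 * array sorted.
 */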
3799
3800static void process_slab(struct loc_track *t, struct kmem_cache *s,
3801                struct page *page, enum track_item alloc)
3802{
3803        void *addr = page_address(page);
3804        DECLARE_BITMAP(map, page->objects);
3805        void *p;
3806
3807        bitmap_zero(map, page->objects);
3808        for_each_free_object(p, s, page->freelist)
3809                set_bit(slab_index(p, s, addr), map);
3810
3811        for_each_object(p, s, addr, page->objects)
3812                if (!test_bit(slab_index(p, s, addr), map))
3813                        add_location(t, s, get_track(s, p, alloc));
3814}
3815
3816static int list_locations(struct kmem_cache *s, char *buf,
3817                                        enum track_item alloc)
3818{
3819        int len = 0;
3820        unsigned long i;
3821        struct loc_track t = { 0, 0, NULL };
3822        int node;
3823
3824        if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
3825                        GFP_TEMPORARY))
3826                return sprintf(buf, "Out of memory\n");
3827
3828        /* Push back cpu slabs */
3829        flush_all(s);
3830
3831        for_each_node_state(node, N_NORMAL_MEMORY) {
3832                struct kmem_cache_node *n = get_node(s, node);
3833                unsigned long flags;
3834                struct page *page;
3835
3836                if (!atomic_long_read(&n->nr_slabs))
3837                        continue;
3838
3839                spin_lock_irqsave(&n->list_lock, flags);
3840                list_for_each_entry(page, &n->partial, lru)
3841                        process_slab(&t, s, page, alloc);
3842                list_for_each_entry(page, &n->full, lru)
3843                        process_slab(&t, s, page, alloc);
3844                spin_unlock_irqrestore(&n->list_lock, flags);
3845        }
3846
3847        for (i = 0; i < t.count; i++) {
3848                struct location *l = &t.loc[i];
3849
3850                if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100)
3851                        break;
3852                len += sprintf(buf + len, "%7ld ", l->count);
3853
3854                if (l->addr)
3855                        len += sprint_symbol(buf + len, (unsigned long)l->addr);
3856                else
3857                        len += sprintf(buf + len, "<not-available>");
3858
3859                if (l->sum_time != l->min_time) {
3860                        len += sprintf(buf + len, " age=%ld/%ld/%ld",
3861                                l->min_time,
3862                                (long)div_u64(l->sum_time, l->count),
3863                                l->max_time);
3864                } else
3865                        len += sprintf(buf + len, " age=%ld",
3866                                l->min_time);
3867
3868                if (l->min_pid != l->max_pid)
3869                        len += sprintf(buf + len, " pid=%ld-%ld",
3870                                l->min_pid, l->max_pid);
3871                else
3872                        len += sprintf(buf + len, " pid=%ld",
3873                                l->min_pid);
3874
3875                if (num_online_cpus() > 1 &&
3876                                !cpumask_empty(to_cpumask(l->cpus)) &&
3877                                len < PAGE_SIZE - 60) {
3878                        len += sprintf(buf + len, " cpus=");
3879                        len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50,
3880                                                 to_cpumask(l->cpus));
3881                }
3882
3883                if (nr_online_nodes > 1 && !nodes_empty(l->nodes) &&
3884                                len < PAGE_SIZE - 60) {
3885                        len += sprintf(buf + len, " nodes=");
3886                        len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50,
3887                                        l->nodes);
3888                }
3889
3890                len += sprintf(buf + len, "\n");
3891        }
3892
3893        free_loc_track(&t);
3894        if (!t.count)
3895                len += sprintf(buf, "No data\n");
3896        return len;
3897}
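
/*
 * The buffer filled in above is what appears in the alloc_calls and
 * free_calls sysfs files. An illustrative line (symbol and numbers made up):
 *
 *	4500 __alloc_skb+0x42/0x1a0 age=4/227/1054 pid=0-563 cpus=0-3 nodes=0
 *
 * i.e. 4500 of the objects currently in use were allocated at that call site,
 * with the given minimum/average/maximum object age in jiffies, the pid range
 * of the allocating tasks and the cpus and nodes involved.
 */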
3898
3899enum slab_stat_type {
3900        SL_ALL,                 /* All slabs */
3901        SL_PARTIAL,             /* Only partially allocated slabs */
3902        SL_CPU,                 /* Only slabs used for cpu caches */
3903        SL_OBJECTS,             /* Determine allocated objects not slabs */
3904        SL_TOTAL                /* Determine object capacity not slabs */
3905};
3906
3907#define SO_ALL          (1 << SL_ALL)
3908#define SO_PARTIAL      (1 << SL_PARTIAL)
3909#define SO_CPU          (1 << SL_CPU)
3910#define SO_OBJECTS      (1 << SL_OBJECTS)
3911#define SO_TOTAL        (1 << SL_TOTAL)
3912
3913static ssize_t show_slab_objects(struct kmem_cache *s,
3914                            char *buf, unsigned long flags)
3915{
3916        unsigned long total = 0;
3917        int node;
3918        int x;
3919        unsigned long *nodes;
3920        unsigned long *per_cpu;
3921
3922        nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL);
3923        if (!nodes)
3924                return -ENOMEM;
3925        per_cpu = nodes + nr_node_ids;
3926
3927        if (flags & SO_CPU) {
3928                int cpu;
3929
3930                for_each_possible_cpu(cpu) {
3931                        struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
3932
3933                        if (!c || c->node < 0)
3934                                continue;
3935
3936                        if (c->page) {
3937                                if (flags & SO_TOTAL)
3938                                        x = c->page->objects;
3939                                else if (flags & SO_OBJECTS)
3940                                        x = c->page->inuse;
3941                                else
3942                                        x = 1;
3943
3944                                total += x;
3945                                nodes[c->node] += x;
3946                        }
3947                        per_cpu[c->node]++;
3948                }
3949        }
3950
3951        if (flags & SO_ALL) {
3952                for_each_node_state(node, N_NORMAL_MEMORY) {
3953                        struct kmem_cache_node *n = get_node(s, node);
3954
3955                        if (flags & SO_TOTAL)
3956                                x = atomic_long_read(&n->total_objects);
3957                        else if (flags & SO_OBJECTS)
3958                                x = atomic_long_read(&n->total_objects) -
3959                                        count_partial(n, count_free);
3960                        else
3961                                x = atomic_long_read(&n->nr_slabs);
3962
3963                        total += x;
3964                        nodes[node] += x;
3965                }
3966
3967        } else if (flags & SO_PARTIAL) {
3968                for_each_node_state(node, N_NORMAL_MEMORY) {
3969                        struct kmem_cache_node *n = get_node(s, node);
3970
3971                        if (flags & SO_TOTAL)
3972                                x = count_partial(n, count_total);
3973                        else if (flags & SO_OBJECTS)
3974                                x = count_partial(n, count_inuse);
3975                        else
3976                                x = n->nr_partial;
3977                        total += x;
3978                        nodes[node] += x;
3979                }
3980        }
3981        x = sprintf(buf, "%lu", total);
3982#ifdef CONFIG_NUMA
3983        for_each_node_state(node, N_NORMAL_MEMORY)
3984                if (nodes[node])
3985                        x += sprintf(buf + x, " N%d=%lu",
3986                                        node, nodes[node]);
3987#endif
3988        kfree(nodes);
3989        return x + sprintf(buf + x, "\n");
3990}
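
/*
 * show_slab_objects() backs several of the sysfs attributes defined below
 * (objects, slabs, partial, cpu_slabs, ...). Each file contains a total
 * followed, on NUMA systems, by per node counts, e.g. (numbers made up):
 *
 *	4410 N0=2205 N1=2205
 */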
3991
3992static int any_slab_objects(struct kmem_cache *s)
3993{
3994        int node;
3995
3996        for_each_online_node(node) {
3997                struct kmem_cache_node *n = get_node(s, node);
3998
3999                if (!n)
4000                        continue;
4001
4002                if (atomic_long_read(&n->total_objects))
4003                        return 1;
4004        }
4005        return 0;
4006}
4007
4008#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
4009#define to_slab(n) container_of(n, struct kmem_cache, kobj)
4010
4011struct slab_attribute {
4012        struct attribute attr;
4013        ssize_t (*show)(struct kmem_cache *s, char *buf);
4014        ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
4015};
4016
4017#define SLAB_ATTR_RO(_name) \
4018        static struct slab_attribute _name##_attr = __ATTR_RO(_name)
4019
4020#define SLAB_ATTR(_name) \
4021        static struct slab_attribute _name##_attr =  \
4022        __ATTR(_name, 0644, _name##_show, _name##_store)
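
/*
 * For example, SLAB_ATTR(order) further below expands roughly to
 *
 *	static struct slab_attribute order_attr =
 *		__ATTR(order, 0644, order_show, order_store);
 *
 * tying the file /sys/kernel/slab/<cache>/order to order_show() and
 * order_store(), while SLAB_ATTR_RO() produces a read-only attribute with
 * only a show method.
 */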
4023
4024static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
4025{
4026        return sprintf(buf, "%d\n", s->size);
4027}
4028SLAB_ATTR_RO(slab_size);
4029
4030static ssize_t align_show(struct kmem_cache *s, char *buf)
4031{
4032        return sprintf(buf, "%d\n", s->align);
4033}
4034SLAB_ATTR_RO(align);
4035
4036static ssize_t object_size_show(struct kmem_cache *s, char *buf)
4037{
4038        return sprintf(buf, "%d\n", s->objsize);
4039}
4040SLAB_ATTR_RO(object_size);
4041
4042static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
4043{
4044        return sprintf(buf, "%d\n", oo_objects(s->oo));
4045}
4046SLAB_ATTR_RO(objs_per_slab);
4047
4048static ssize_t order_store(struct kmem_cache *s,
4049                                const char *buf, size_t length)
4050{
4051        unsigned long order;
4052        int err;
4053
4054        err = strict_strtoul(buf, 10, &order);
4055        if (err)
4056                return err;
4057
4058        if (order > slub_max_order || order < slub_min_order)
4059                return -EINVAL;
4060
4061        calculate_sizes(s, order);
4062        return length;
4063}
4064
4065static ssize_t order_show(struct kmem_cache *s, char *buf)
4066{
4067        return sprintf(buf, "%d\n", oo_order(s->oo));
4068}
4069SLAB_ATTR(order);
4070
4071static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
4072{
4073        return sprintf(buf, "%lu\n", s->min_partial);
4074}
4075
4076static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
4077                                 size_t length)
4078{
4079        unsigned long min;
4080        int err;
4081
4082        err = strict_strtoul(buf, 10, &min);
4083        if (err)
4084                return err;
4085
4086        set_min_partial(s, min);
4087        return length;
4088}
4089SLAB_ATTR(min_partial);
4090
4091static ssize_t ctor_show(struct kmem_cache *s, char *buf)
4092{
4093        if (s->ctor) {
4094                int n = sprint_symbol(buf, (unsigned long)s->ctor);
4095
4096                return n + sprintf(buf + n, "\n");
4097        }
4098        return 0;
4099}
4100SLAB_ATTR_RO(ctor);
4101
4102static ssize_t aliases_show(struct kmem_cache *s, char *buf)
4103{
4104        return sprintf(buf, "%d\n", s->refcount - 1);
4105}
4106SLAB_ATTR_RO(aliases);
4107
4108static ssize_t slabs_show(struct kmem_cache *s, char *buf)
4109{
4110        return show_slab_objects(s, buf, SO_ALL);
4111}
4112SLAB_ATTR_RO(slabs);
4113
4114static ssize_t partial_show(struct kmem_cache *s, char *buf)
4115{
4116        return show_slab_objects(s, buf, SO_PARTIAL);
4117}
4118SLAB_ATTR_RO(partial);
4119
4120static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf)
4121{
4122        return show_slab_objects(s, buf, SO_CPU);
4123}
4124SLAB_ATTR_RO(cpu_slabs);
4125
4126static ssize_t objects_show(struct kmem_cache *s, char *buf)
4127{
4128        return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS);
4129}
4130SLAB_ATTR_RO(objects);
4131
4132static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
4133{
4134        return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS);
4135}
4136SLAB_ATTR_RO(objects_partial);
4137
4138static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
4139{
4140        return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
4141}
4142SLAB_ATTR_RO(total_objects);
4143
4144static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
4145{
4146        return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE));
4147}
4148
4149static ssize_t sanity_checks_store(struct kmem_cache *s,
4150                                const char *buf, size_t length)
4151{
4152        s->flags &= ~SLAB_DEBUG_FREE;
4153        if (buf[0] == '1')
4154                s->flags |= SLAB_DEBUG_FREE;
4155        return length;
4156}
4157SLAB_ATTR(sanity_checks);
4158
4159static ssize_t trace_show(struct kmem_cache *s, char *buf)
4160{
4161        return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
4162}
4163
4164static ssize_t trace_store(struct kmem_cache *s, const char *buf,
4165                                                        size_t length)
4166{
4167        s->flags &= ~SLAB_TRACE;
4168        if (buf[0] == '1')
4169                s->flags |= SLAB_TRACE;
4170        return length;
4171}
4172SLAB_ATTR(trace);
4173
4174static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
4175{
4176        return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
4177}
4178
4179static ssize_t reclaim_account_store(struct kmem_cache *s,
4180                                const char *buf, size_t length)
4181{
4182        s->flags &= ~SLAB_RECLAIM_ACCOUNT;
4183        if (buf[0] == '1')
4184                s->flags |= SLAB_RECLAIM_ACCOUNT;
4185        return length;
4186}
4187SLAB_ATTR(reclaim_account);
4188
4189static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
4190{
4191        return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
4192}
4193SLAB_ATTR_RO(hwcache_align);
4194
4195#ifdef CONFIG_ZONE_DMA
4196static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
4197{
4198        return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
4199}
4200SLAB_ATTR_RO(cache_dma);
4201#endif
4202
4203static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
4204{
4205        return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU));
4206}
4207SLAB_ATTR_RO(destroy_by_rcu);
4208
4209static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
4210{
4211        return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
4212}
4213
4214static ssize_t red_zone_store(struct kmem_cache *s,
4215                                const char *buf, size_t length)
4216{
4217        if (any_slab_objects(s))
4218                return -EBUSY;
4219
4220        s->flags &= ~SLAB_RED_ZONE;
4221        if (buf[0] == '1')
4222                s->flags |= SLAB_RED_ZONE;
4223        calculate_sizes(s, -1);
4224        return length;
4225}
4226SLAB_ATTR(red_zone);
4227
4228static ssize_t poison_show(struct kmem_cache *s, char *buf)
4229{
4230        return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON));
4231}
4232
4233static ssize_t poison_store(struct kmem_cache *s,
4234                                const char *buf, size_t length)
4235{
4236        if (any_slab_objects(s))
4237                return -EBUSY;
4238
4239        s->flags &= ~SLAB_POISON;
4240        if (buf[0] == '1')
4241                s->flags |= SLAB_POISON;
4242        calculate_sizes(s, -1);
4243        return length;
4244}
4245SLAB_ATTR(poison);
4246
4247static ssize_t store_user_show(struct kmem_cache *s, char *buf)
4248{
4249        return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
4250}
4251
4252static ssize_t store_user_store(struct kmem_cache *s,
4253                                const char *buf, size_t length)
4254{
4255        if (any_slab_objects(s))
4256                return -EBUSY;
4257
4258        s->flags &= ~SLAB_STORE_USER;
4259        if (buf[0] == '1')
4260                s->flags |= SLAB_STORE_USER;
4261        calculate_sizes(s, -1);
4262        return length;
4263}
4264SLAB_ATTR(store_user);
4265
4266static ssize_t validate_show(struct kmem_cache *s, char *buf)
4267{
4268        return 0;
4269}
4270
4271static ssize_t validate_store(struct kmem_cache *s,
4272                        const char *buf, size_t length)
4273{
4274        int ret = -EINVAL;
4275
4276        if (buf[0] == '1') {
4277                ret = validate_slab_cache(s);
4278                if (ret >= 0)
4279                        ret = length;
4280        }
4281        return ret;
4282}
4283SLAB_ATTR(validate);
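
/*
 * Writing '1' to the validate attribute runs validate_slab_cache() on demand
 * and reports any inconsistencies via printk, e.g. (cache name chosen only
 * for illustration):
 *
 *	# echo 1 > /sys/kernel/slab/kmalloc-128/validate
 *	# dmesg | tail
 */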
4284
4285static ssize_t shrink_show(struct kmem_cache *s, char *buf)
4286{
4287        return 0;
4288}
4289
4290static ssize_t shrink_store(struct kmem_cache *s,
4291                        const char *buf, size_t length)
4292{
4293        if (buf[0] == '1') {
4294                int rc = kmem_cache_shrink(s);
4295
4296                if (rc)
4297                        return rc;
4298        } else
4299                return -EINVAL;
4300        return length;
4301}
4302SLAB_ATTR(shrink);
4303
4304static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
4305{
4306        if (!(s->flags & SLAB_STORE_USER))
4307                return -ENOSYS;
4308        return list_locations(s, buf, TRACK_ALLOC);
4309}
4310SLAB_ATTR_RO(alloc_calls);
4311
4312static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
4313{
4314        if (!(s->flags & SLAB_STORE_USER))
4315                return -ENOSYS;
4316        return list_locations(s, buf, TRACK_FREE);
4317}
4318SLAB_ATTR_RO(free_calls);
4319
4320#ifdef CONFIG_NUMA
4321static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
4322{
4323        return sprintf(buf, "%d\n", s->remote_node_defrag_ratio / 10);
4324}
4325
4326static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
4327                                const char *buf, size_t length)
4328{
4329        unsigned long ratio;
4330        int err;
4331
4332        err = strict_strtoul(buf, 10, &ratio);
4333        if (err)
4334                return err;
4335
4336        if (ratio <= 100)
4337                s->remote_node_defrag_ratio = ratio * 10;
4338
4339        return length;
4340}
4341SLAB_ATTR(remote_node_defrag_ratio);
4342#endif
4343
4344#ifdef CONFIG_SLUB_STATS
4345static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
4346{
4347        unsigned long sum  = 0;
4348        int cpu;
4349        int len;
4350        int *data = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL);
4351
4352        if (!data)
4353                return -ENOMEM;
4354
4355        for_each_online_cpu(cpu) {
4356                unsigned x = get_cpu_slab(s, cpu)->stat[si];
4357
4358                data[cpu] = x;
4359                sum += x;
4360        }
4361
4362        len = sprintf(buf, "%lu", sum);
4363
4364#ifdef CONFIG_SMP
4365        for_each_online_cpu(cpu) {
4366                if (data[cpu] && len < PAGE_SIZE - 20)
4367                        len += sprintf(buf + len, " C%d=%u", cpu, data[cpu]);
4368        }
4369#endif
4370        kfree(data);
4371        return len + sprintf(buf + len, "\n");
4372}
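
/*
 * Each statistics file below contains the sum over all cpus followed by the
 * non-zero per cpu counts, as produced by show_stat() above. An illustrative
 * read (numbers made up):
 *
 *	# cat /sys/kernel/slab/kmalloc-64/alloc_fastpath
 *	812345 C0=203110 C1=202997 C2=203561 C3=202677
 */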
4373
4374#define STAT_ATTR(si, text)                                     \
4375static ssize_t text##_show(struct kmem_cache *s, char *buf)     \
4376{                                                               \
4377        return show_stat(s, buf, si);                           \
4378}                                                               \
4379SLAB_ATTR_RO(text);
4380
4381STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
4382STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
4383STAT_ATTR(FREE_FASTPATH, free_fastpath);
4384STAT_ATTR(FREE_SLOWPATH, free_slowpath);
4385STAT_ATTR(FREE_FROZEN, free_frozen);
4386STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial);
4387STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
4388STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial);
4389STAT_ATTR(ALLOC_SLAB, alloc_slab);
4390STAT_ATTR(ALLOC_REFILL, alloc_refill);
4391STAT_ATTR(FREE_SLAB, free_slab);
4392STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush);
4393STAT_ATTR(DEACTIVATE_FULL, deactivate_full);
4394STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
4395STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
4396STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
4397STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
4398STAT_ATTR(ORDER_FALLBACK, order_fallback);
4399#endif
4400
4401static struct attribute *slab_attrs[] = {
4402        &slab_size_attr.attr,
4403        &object_size_attr.attr,
4404        &objs_per_slab_attr.attr,
4405        &order_attr.attr,
4406        &min_partial_attr.attr,
4407        &objects_attr.attr,
4408        &objects_partial_attr.attr,
4409        &total_objects_attr.attr,
4410        &slabs_attr.attr,
4411        &partial_attr.attr,
4412        &cpu_slabs_attr.attr,
4413        &ctor_attr.attr,
4414        &aliases_attr.attr,
4415        &align_attr.attr,
4416        &sanity_checks_attr.attr,
4417        &trace_attr.attr,
4418        &hwcache_align_attr.attr,
4419        &reclaim_account_attr.attr,
4420        &destroy_by_rcu_attr.attr,
4421        &red_zone_attr.attr,
4422        &poison_attr.attr,
4423        &store_user_attr.attr,
4424        &validate_attr.attr,
4425        &shrink_attr.attr,
4426        &alloc_calls_attr.attr,
4427        &free_calls_attr.attr,
4428#ifdef CONFIG_ZONE_DMA
4429        &cache_dma_attr.attr,
4430#endif
4431#ifdef CONFIG_NUMA
4432        &remote_node_defrag_ratio_attr.attr,
4433#endif
4434#ifdef CONFIG_SLUB_STATS
4435        &alloc_fastpath_attr.attr,
4436        &alloc_slowpath_attr.attr,
4437        &free_fastpath_attr.attr,
4438        &free_slowpath_attr.attr,
4439        &free_frozen_attr.attr,
4440        &free_add_partial_attr.attr,
4441        &free_remove_partial_attr.attr,
4442        &alloc_from_partial_attr.attr,
4443        &alloc_slab_attr.attr,
4444        &alloc_refill_attr.attr,
4445        &free_slab_attr.attr,
4446        &cpuslab_flush_attr.attr,
4447        &deactivate_full_attr.attr,
4448        &deactivate_empty_attr.attr,
4449        &deactivate_to_head_attr.attr,
4450        &deactivate_to_tail_attr.attr,
4451        &deactivate_remote_frees_attr.attr,
4452        &order_fallback_attr.attr,
4453#endif
4454        NULL
4455};
4456
4457static struct attribute_group slab_attr_group = {
4458        .attrs = slab_attrs,
4459};
4460
4461static ssize_t slab_attr_show(struct kobject *kobj,
4462                                struct attribute *attr,
4463                                char *buf)
4464{
4465        struct slab_attribute *attribute;
4466        struct kmem_cache *s;
4467        int err;
4468
4469        attribute = to_slab_attr(attr);
4470        s = to_slab(kobj);
4471
4472        if (!attribute->show)
4473                return -EIO;
4474
4475        err = attribute->show(s, buf);
4476
4477        return err;
4478}
4479
4480static ssize_t slab_attr_store(struct kobject *kobj,
4481                                struct attribute *attr,
4482                                const char *buf, size_t len)
4483{
4484        struct slab_attribute *attribute;
4485        struct kmem_cache *s;
4486        int err;
4487
4488        attribute = to_slab_attr(attr);
4489        s = to_slab(kobj);
4490
4491        if (!attribute->store)
4492                return -EIO;
4493
4494        err = attribute->store(s, buf, len);
4495
4496        return err;
4497}
4498
4499static void kmem_cache_release(struct kobject *kobj)
4500{
4501        struct kmem_cache *s = to_slab(kobj);
4502
4503        kfree(s);
4504}
4505
4506static struct sysfs_ops slab_sysfs_ops = {
4507        .show = slab_attr_show,
4508        .store = slab_attr_store,
4509};
4510
4511static struct kobj_type slab_ktype = {
4512        .sysfs_ops = &slab_sysfs_ops,
4513        .release = kmem_cache_release
4514};
4515
4516static int uevent_filter(struct kset *kset, struct kobject *kobj)
4517{
4518        struct kobj_type *ktype = get_ktype(kobj);
4519
4520        if (ktype == &slab_ktype)
4521                return 1;
4522        return 0;
4523}
4524
4525static struct kset_uevent_ops slab_uevent_ops = {
4526        .filter = uevent_filter,
4527};
4528
4529static struct kset *slab_kset;
4530
4531#define ID_STR_LENGTH 64
4532
4533/* Create a unique string id for a slab cache:
4534 *
4535 * Format       :[flags-]size
4536 */
4537static char *create_unique_id(struct kmem_cache *s)
4538{
4539        char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
4540        char *p = name;
4541
4542        BUG_ON(!name);
4543
4544        *p++ = ':';
4545        /*
4546         * First flags affecting slabcache operations. We will only
4547         * get here for aliasable slabs so we do not need to support
4548         * too many flags. The flags here must cover all flags that
4549         * are matched during merging to guarantee that the id is
4550         * unique.
4551         */
4552        if (s->flags & SLAB_CACHE_DMA)
4553                *p++ = 'd';
4554        if (s->flags & SLAB_RECLAIM_ACCOUNT)
4555                *p++ = 'a';
4556        if (s->flags & SLAB_DEBUG_FREE)
4557                *p++ = 'F';
4558        if (!(s->flags & SLAB_NOTRACK))
4559                *p++ = 't';
4560        if (p != name + 1)
4561                *p++ = '-';
4562        p += sprintf(p, "%07d", s->size);
4563        BUG_ON(p > name + ID_STR_LENGTH - 1);
4564        return name;
4565}
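
/*
 * A worked example of the id format (assuming kmemcheck tracking is active,
 * i.e. SLAB_NOTRACK is not set): a mergeable DMA cache of size 192 gets the
 * id ":dt-0000192", while a cache that sets none of the flag characters at
 * all would simply be ":0000192".
 */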
4566
4567static int sysfs_slab_add(struct kmem_cache *s)
4568{
4569        int err;
4570        const char *name;
4571        int unmergeable;
4572
4573        if (slab_state < SYSFS)
4574                /* Defer until later */
4575                return 0;
4576
4577        unmergeable = slab_unmergeable(s);
4578        if (unmergeable) {
4579                /*
4580                 * Slabcache can never be merged so we can use the name proper.
4581                 * This is typically the case for debug situations. In that
4582                 * case we can catch duplicate names easily.
4583                 */
4584                sysfs_remove_link(&slab_kset->kobj, s->name);
4585                name = s->name;
4586        } else {
4587                /*
4588                 * Create a unique name for the slab as a target
4589                 * for the symlinks.
4590                 */
4591                name = create_unique_id(s);
4592        }
4593
4594        s->kobj.kset = slab_kset;
4595        err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, name);
4596        if (err) {
4597                kobject_put(&s->kobj);
4598                return err;
4599        }
4600
4601        err = sysfs_create_group(&s->kobj, &slab_attr_group);
4602        if (err) {
4603                kobject_del(&s->kobj);
4604                kobject_put(&s->kobj);
4605                return err;
4606        }
4607        kobject_uevent(&s->kobj, KOBJ_ADD);
4608        if (!unmergeable) {
4609                /* Setup first alias */
4610                sysfs_slab_alias(s, s->name);
4611                kfree(name);
4612        }
4613        return 0;
4614}
4615
4616static void sysfs_slab_remove(struct kmem_cache *s)
4617{
4618        kobject_uevent(&s->kobj, KOBJ_REMOVE);
4619        kobject_del(&s->kobj);
4620        kobject_put(&s->kobj);
4621}
4622
4623/*
4624 * Need to buffer aliases during bootup until sysfs becomes
4625 * available lest we lose that information.
4626 */
4627struct saved_alias {
4628        struct kmem_cache *s;
4629        const char *name;
4630        struct saved_alias *next;
4631};
4632
4633static struct saved_alias *alias_list;
4634
4635static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
4636{
4637        struct saved_alias *al;
4638
4639        if (slab_state == SYSFS) {
4640                /*
4641                 * If we have a leftover link then remove it.
4642                 */
4643                sysfs_remove_link(&slab_kset->kobj, name);
4644                return sysfs_create_link(&slab_kset->kobj, &s->kobj, name);
4645        }
4646
4647        al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
4648        if (!al)
4649                return -ENOMEM;
4650
4651        al->s = s;
4652        al->name = name;
4653        al->next = alias_list;
4654        alias_list = al;
4655        return 0;
4656}
4657
4658static int __init slab_sysfs_init(void)
4659{
4660        struct kmem_cache *s;
4661        int err;
4662
4663        slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
4664        if (!slab_kset) {
4665                printk(KERN_ERR "Cannot register slab subsystem.\n");
4666                return -ENOSYS;
4667        }
4668
4669        slab_state = SYSFS;
4670
4671        list_for_each_entry(s, &slab_caches, list) {
4672                err = sysfs_slab_add(s);
4673                if (err)
4674                        printk(KERN_ERR "SLUB: Unable to add boot slab %s"
4675                                                " to sysfs\n", s->name);
4676        }
4677
4678        while (alias_list) {
4679                struct saved_alias *al = alias_list;
4680
4681                alias_list = alias_list->next;
4682                err = sysfs_slab_alias(al->s, al->name);
4683                if (err)
4684                        printk(KERN_ERR "SLUB: Unable to add boot slab alias"
4685                                        " %s to sysfs\n", al->name);
4686                kfree(al);
4687        }
4688
4689        resiliency_test();
4690        return 0;
4691}
4692
4693__initcall(slab_sysfs_init);
4694#endif
4695
4696/*
4697 * The /proc/slabinfo ABI
4698 */
4699#ifdef CONFIG_SLABINFO
4700static void print_slabinfo_header(struct seq_file *m)
4701{
4702        seq_puts(m, "slabinfo - version: 2.1\n");
4703        seq_puts(m, "# name            <active_objs> <num_objs> <objsize> "
4704                 "<objperslab> <pagesperslab>");
4705        seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
4706        seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
4707        seq_putc(m, '\n');
4708}
4709
4710static void *s_start(struct seq_file *m, loff_t *pos)
4711{
4712        loff_t n = *pos;
4713
4714        down_read(&slub_lock);
4715        if (!n)
4716                print_slabinfo_header(m);
4717
4718        return seq_list_start(&slab_caches, *pos);
4719}
4720
4721static void *s_next(struct seq_file *m, void *p, loff_t *pos)
4722{
4723        return seq_list_next(p, &slab_caches, pos);
4724}
4725
4726static void s_stop(struct seq_file *m, void *p)
4727{
4728        up_read(&slub_lock);
4729}
4730
4731static int s_show(struct seq_file *m, void *p)
4732{
4733        unsigned long nr_partials = 0;
4734        unsigned long nr_slabs = 0;
4735        unsigned long nr_inuse = 0;
4736        unsigned long nr_objs = 0;
4737        unsigned long nr_free = 0;
4738        struct kmem_cache *s;
4739        int node;
4740
4741        s = list_entry(p, struct kmem_cache, list);
4742
4743        for_each_online_node(node) {
4744                struct kmem_cache_node *n = get_node(s, node);
4745
4746                if (!n)
4747                        continue;
4748
4749                nr_partials += n->nr_partial;
4750                nr_slabs += atomic_long_read(&n->nr_slabs);
4751                nr_objs += atomic_long_read(&n->total_objects);
4752                nr_free += count_partial(n, count_free);
4753        }
4754
4755        nr_inuse = nr_objs - nr_free;
4756
4757        seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse,
4758                   nr_objs, s->size, oo_objects(s->oo),
4759                   (1 << oo_order(s->oo)));
4760        seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0);
4761        seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs,
4762                   0UL);
4763        seq_putc(m, '\n');
4764        return 0;
4765}
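
/*
 * A sample line as produced by s_show() above (numbers made up):
 *
 *  kmalloc-64          4352   4608     64   64    1 : tunables    0    0    0 : slabdata     72     72      0
 *
 * SLUB has no tunables and no per cpu shared array, so the zero columns exist
 * only to keep the SLAB-compatible /proc/slabinfo format.
 */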
4766
4767static const struct seq_operations slabinfo_op = {
4768        .start = s_start,
4769        .next = s_next,
4770        .stop = s_stop,
4771        .show = s_show,
4772};
4773
4774static int slabinfo_open(struct inode *inode, struct file *file)
4775{
4776        return seq_open(file, &slabinfo_op);
4777}
4778
4779static const struct file_operations proc_slabinfo_operations = {
4780        .open           = slabinfo_open,
4781        .read           = seq_read,
4782        .llseek         = seq_lseek,
4783        .release        = seq_release,
4784};
4785
4786static int __init slab_proc_init(void)
4787{
4788        proc_create("slabinfo", S_IRUGO, NULL, &proc_slabinfo_operations);
4789        return 0;
4790}
4791module_init(slab_proc_init);
4792#endif /* CONFIG_SLABINFO */
4793