linux/mm/slab.c
   1/*
   2 * linux/mm/slab.c
   3 * Written by Mark Hemment, 1996/97.
   4 * (markhe@nextd.demon.co.uk)
   5 *
   6 * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
   7 *
   8 * Major cleanup, different bufctl logic, per-cpu arrays
   9 *      (c) 2000 Manfred Spraul
  10 *
  11 * Cleanup, make the head arrays unconditional, preparation for NUMA
  12 *      (c) 2002 Manfred Spraul
  13 *
  14 * An implementation of the Slab Allocator as described in outline in;
  15 *      UNIX Internals: The New Frontiers by Uresh Vahalia
  16 *      Pub: Prentice Hall      ISBN 0-13-101908-2
  17 * or with a little more detail in;
  18 *      The Slab Allocator: An Object-Caching Kernel Memory Allocator
  19 *      Jeff Bonwick (Sun Microsystems).
  20 *      Presented at: USENIX Summer 1994 Technical Conference
  21 *
  22 * The memory is organized in caches, one cache for each object type.
  23 * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
   24 * Each cache consists of many slabs (they are small (usually one
  25 * page long) and always contiguous), and each slab contains multiple
  26 * initialized objects.
  27 *
   28 * This means that your constructor is used only for newly allocated
  29 * slabs and you must pass objects with the same initializations to
  30 * kmem_cache_free.
  31 *
  32 * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
   33 * normal). If you need a special memory type, then you must create a new
  34 * cache for that memory type.
  35 *
  36 * In order to reduce fragmentation, the slabs are sorted in 3 groups:
  37 *   full slabs with 0 free objects
  38 *   partial slabs
  39 *   empty slabs with no allocated objects
  40 *
  41 * If partial slabs exist, then new allocations come from these slabs,
  42 * otherwise from empty slabs or new slabs are allocated.
  43 *
  44 * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
  45 * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
  46 *
  47 * Each cache has a short per-cpu head array, most allocs
  48 * and frees go into that array, and if that array overflows, then 1/2
  49 * of the entries in the array are given back into the global cache.
  50 * The head array is strictly LIFO and should improve the cache hit rates.
  51 * On SMP, it additionally reduces the spinlock operations.
  52 *
   53 * The c_cpuarray may not be read with local interrupts enabled -
  54 * it's changed with a smp_call_function().
  55 *
  56 * SMP synchronization:
  57 *  constructors and destructors are called without any locking.
  58 *  Several members in struct kmem_cache and struct slab never change, they
  59 *      are accessed without any locking.
  60 *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
  61 *      and local interrupts are disabled so slab code is preempt-safe.
  62 *  The non-constant members are protected with a per-cache irq spinlock.
  63 *
  64 * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
  65 * in 2000 - many ideas in the current implementation are derived from
  66 * his patch.
  67 *
  68 * Further notes from the original documentation:
  69 *
  70 * 11 April '97.  Started multi-threading - markhe
  71 *      The global cache-chain is protected by the mutex 'cache_chain_mutex'.
   72 *      The mutex is only needed when accessing/extending the cache-chain, which
  73 *      can never happen inside an interrupt (kmem_cache_create(),
  74 *      kmem_cache_shrink() and kmem_cache_reap()).
  75 *
  76 *      At present, each engine can be growing a cache.  This should be blocked.
  77 *
  78 * 15 March 2005. NUMA slab allocator.
  79 *      Shai Fultheim <shai@scalex86.org>.
  80 *      Shobhit Dayal <shobhit@calsoftinc.com>
  81 *      Alok N Kataria <alokk@calsoftinc.com>
  82 *      Christoph Lameter <christoph@lameter.com>
  83 *
  84 *      Modified the slab allocator to be node aware on NUMA systems.
  85 *      Each node has its own list of partial, free and full slabs.
  86 *      All object allocations for a node occur from node specific slab lists.
  87 */
  88
  89#include        <linux/slab.h>
  90#include        <linux/mm.h>
  91#include        <linux/poison.h>
  92#include        <linux/swap.h>
  93#include        <linux/cache.h>
  94#include        <linux/interrupt.h>
  95#include        <linux/init.h>
  96#include        <linux/compiler.h>
  97#include        <linux/cpuset.h>
  98#include        <linux/proc_fs.h>
  99#include        <linux/seq_file.h>
 100#include        <linux/notifier.h>
 101#include        <linux/kallsyms.h>
 102#include        <linux/cpu.h>
 103#include        <linux/sysctl.h>
 104#include        <linux/module.h>
 105#include        <linux/rcupdate.h>
 106#include        <linux/string.h>
 107#include        <linux/uaccess.h>
 108#include        <linux/nodemask.h>
 109#include        <linux/kmemleak.h>
 110#include        <linux/mempolicy.h>
 111#include        <linux/mutex.h>
 112#include        <linux/fault-inject.h>
 113#include        <linux/rtmutex.h>
 114#include        <linux/reciprocal_div.h>
 115#include        <linux/debugobjects.h>
 116#include        <linux/kmemcheck.h>
 117#include        <linux/memory.h>
 118
 119#include        <asm/cacheflush.h>
 120#include        <asm/tlbflush.h>
 121#include        <asm/page.h>
 122
 123/*
  124 * DEBUG        - 1 for kmem_cache_create() to honour SLAB_RED_ZONE & SLAB_POISON.
 125 *                0 for faster, smaller code (especially in the critical paths).
 126 *
 127 * STATS        - 1 to collect stats for /proc/slabinfo.
 128 *                0 for faster, smaller code (especially in the critical paths).
 129 *
 130 * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
 131 */
 132
 133#ifdef CONFIG_DEBUG_SLAB
 134#define DEBUG           1
 135#define STATS           1
 136#define FORCED_DEBUG    1
 137#else
 138#define DEBUG           0
 139#define STATS           0
 140#define FORCED_DEBUG    0
 141#endif
 142
 143/* Shouldn't this be in a header file somewhere? */
 144#define BYTES_PER_WORD          sizeof(void *)
 145#define REDZONE_ALIGN           max(BYTES_PER_WORD, __alignof__(unsigned long long))
 146
 147#ifndef ARCH_KMALLOC_FLAGS
 148#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
 149#endif
 150
 151/* Legal flag mask for kmem_cache_create(). */
 152#if DEBUG
 153# define CREATE_MASK    (SLAB_RED_ZONE | \
 154                         SLAB_POISON | SLAB_HWCACHE_ALIGN | \
 155                         SLAB_CACHE_DMA | \
 156                         SLAB_STORE_USER | \
 157                         SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
 158                         SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
 159                         SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
 160#else
 161# define CREATE_MASK    (SLAB_HWCACHE_ALIGN | \
 162                         SLAB_CACHE_DMA | \
 163                         SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
 164                         SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
 165                         SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
 166#endif
 167
 168/*
 169 * kmem_bufctl_t:
 170 *
  171 * Bufctls are used for linking objs within a slab
  172 * via linked offsets.
 173 *
 174 * This implementation relies on "struct page" for locating the cache &
 175 * slab an object belongs to.
 176 * This allows the bufctl structure to be small (one int), but limits
 177 * the number of objects a slab (not a cache) can contain when off-slab
 178 * bufctls are used. The limit is the size of the largest general cache
 179 * that does not use off-slab slabs.
  180 * For 32-bit archs with 4 kB pages, this is 56.
 181 * This is not serious, as it is only for large objects, when it is unwise
 182 * to have too many per slab.
 183 * Note: This limit can be raised by introducing a general cache whose size
 184 * is less than 512 (PAGE_SIZE<<3), but greater than 256.
 185 */
 186
 187typedef unsigned int kmem_bufctl_t;
 188#define BUFCTL_END      (((kmem_bufctl_t)(~0U))-0)
 189#define BUFCTL_FREE     (((kmem_bufctl_t)(~0U))-1)
 190#define BUFCTL_ACTIVE   (((kmem_bufctl_t)(~0U))-2)
 191#define SLAB_LIMIT      (((kmem_bufctl_t)(~0U))-3)
 192
 193/*
 194 * struct slab
 195 *
 196 * Manages the objs in a slab. Placed either at the beginning of mem allocated
  197 * for a slab, or allocated from a general cache.
  198 * Slabs are chained into three lists: fully used, partial, fully free slabs.
 199 */
 200struct slab {
 201        struct list_head list;
 202        unsigned long colouroff;
 203        void *s_mem;            /* including colour offset */
 204        unsigned int inuse;     /* num of objs active in slab */
 205        kmem_bufctl_t free;
 206        unsigned short nodeid;
 207};
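/*
 * Editor's illustrative sketch (not part of the original file): how the
 * in-slab free list is threaded through the kmem_bufctl_t array.  For
 * on-slab management the bufctl array immediately follows struct slab
 * (the real code reaches it through a helper defined further down in this
 * file); slabp->free holds the index of the first free object and each
 * bufctl entry holds the index of the next free one, ending in BUFCTL_END.
 *
 *	static void *sketch_pop_free_obj(struct kmem_cache *cachep,
 *					 struct slab *slabp)
 *	{
 *		kmem_bufctl_t *bufctl = (kmem_bufctl_t *)(slabp + 1);
 *		void *objp;
 *
 *		if (slabp->free == BUFCTL_END)
 *			return NULL;
 *		objp = slabp->s_mem + cachep->buffer_size * slabp->free;
 *		slabp->inuse++;
 *		slabp->free = bufctl[slabp->free];
 *		return objp;
 *	}
 */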
 208
 209/*
 210 * struct slab_rcu
 211 *
 212 * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
 213 * arrange for kmem_freepages to be called via RCU.  This is useful if
 214 * we need to approach a kernel structure obliquely, from its address
 215 * obtained without the usual locking.  We can lock the structure to
 216 * stabilize it and check it's still at the given address, only if we
 217 * can be sure that the memory has not been meanwhile reused for some
 218 * other kind of object (which our subsystem's lock might corrupt).
 219 *
 220 * rcu_read_lock before reading the address, then rcu_read_unlock after
 221 * taking the spinlock within the structure expected at that address.
 222 *
 223 * We assume struct slab_rcu can overlay struct slab when destroying.
 224 */
 225struct slab_rcu {
 226        struct rcu_head head;
 227        struct kmem_cache *cachep;
 228        void *addr;
 229};
 230
 231/*
 232 * struct array_cache
 233 *
 234 * Purpose:
 235 * - LIFO ordering, to hand out cache-warm objects from _alloc
 236 * - reduce the number of linked list operations
 237 * - reduce spinlock operations
 238 *
 239 * The limit is stored in the per-cpu structure to reduce the data cache
 240 * footprint.
 241 *
 242 */
 243struct array_cache {
 244        unsigned int avail;
 245        unsigned int limit;
 246        unsigned int batchcount;
 247        unsigned int touched;
 248        spinlock_t lock;
 249        void *entry[];  /*
 250                         * Must have this definition in here for the proper
 251                         * alignment of array_cache. Also simplifies accessing
 252                         * the entries.
 253                         */
 254};
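/*
 * Editor's sketch (not in the original file): the head array is used as a
 * plain LIFO stack with ->avail as the stack pointer.  Allocation pops the
 * most recently freed, cache-warm object; freeing pushes until ->limit is
 * reached, at which point a batch is handed back to the shared/node lists.
 *
 *	static inline void *sketch_ac_pop(struct array_cache *ac)
 *	{
 *		if (!ac->avail)
 *			return NULL;
 *		ac->touched = 1;
 *		return ac->entry[--ac->avail];
 *	}
 *
 *	static inline int sketch_ac_push(struct array_cache *ac, void *objp)
 *	{
 *		if (ac->avail >= ac->limit)
 *			return 0;
 *		ac->entry[ac->avail++] = objp;
 *		return 1;
 *	}
 */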
 255
 256/*
 257 * bootstrap: The caches do not work without cpuarrays anymore, but the
 258 * cpuarrays are allocated from the generic caches...
 259 */
 260#define BOOT_CPUCACHE_ENTRIES   1
 261struct arraycache_init {
 262        struct array_cache cache;
 263        void *entries[BOOT_CPUCACHE_ENTRIES];
 264};
 265
 266/*
 267 * The slab lists for all objects.
 268 */
 269struct kmem_list3 {
 270        struct list_head slabs_partial; /* partial list first, better asm code */
 271        struct list_head slabs_full;
 272        struct list_head slabs_free;
 273        unsigned long free_objects;
 274        unsigned int free_limit;
 275        unsigned int colour_next;       /* Per-node cache coloring */
 276        spinlock_t list_lock;
 277        struct array_cache *shared;     /* shared per node */
 278        struct array_cache **alien;     /* on other nodes */
 279        unsigned long next_reap;        /* updated without locking */
 280        int free_touched;               /* updated without locking */
 281};
 282
 283/*
 284 * Need this for bootstrapping a per node allocator.
 285 */
 286#define NUM_INIT_LISTS (3 * MAX_NUMNODES)
 287static struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
 288#define CACHE_CACHE 0
 289#define SIZE_AC MAX_NUMNODES
 290#define SIZE_L3 (2 * MAX_NUMNODES)
 291
 292static int drain_freelist(struct kmem_cache *cache,
 293                        struct kmem_list3 *l3, int tofree);
 294static void free_block(struct kmem_cache *cachep, void **objpp, int len,
 295                        int node);
 296static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
 297static void cache_reap(struct work_struct *unused);
 298
 299/*
 300 * This function must be completely optimized away if a constant is passed to
 301 * it.  Mostly the same as what is in linux/slab.h except it returns an index.
 302 */
 303static __always_inline int index_of(const size_t size)
 304{
 305        extern void __bad_size(void);
 306
 307        if (__builtin_constant_p(size)) {
 308                int i = 0;
 309
 310#define CACHE(x) \
 311        if (size <=x) \
 312                return i; \
 313        else \
 314                i++;
 315#include <linux/kmalloc_sizes.h>
 316#undef CACHE
 317                __bad_size();
 318        } else
 319                __bad_size();
 320        return 0;
 321}
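/*
 * Editor's note with an illustrative expansion (assuming a typical
 * kmalloc_sizes.h table beginning 32, 64, 96, 128, ...): for a constant
 * size the #include above unrolls into a chain of comparisons,
 *
 *	if (size <= 32) return i; else i++;
 *	if (size <= 64) return i; else i++;
 *	if (size <= 96) return i; else i++;
 *	...
 *
 * with i counting up from 0, so index_of(sizeof(struct arraycache_init))
 * used by INDEX_AC below folds to a compile-time constant, while a
 * non-constant size ends up in the undefined __bad_size() and turns the
 * misuse into a link error.
 */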
 322
 323static int slab_early_init = 1;
 324
 325#define INDEX_AC index_of(sizeof(struct arraycache_init))
 326#define INDEX_L3 index_of(sizeof(struct kmem_list3))
 327
 328static void kmem_list3_init(struct kmem_list3 *parent)
 329{
 330        INIT_LIST_HEAD(&parent->slabs_full);
 331        INIT_LIST_HEAD(&parent->slabs_partial);
 332        INIT_LIST_HEAD(&parent->slabs_free);
 333        parent->shared = NULL;
 334        parent->alien = NULL;
 335        parent->colour_next = 0;
 336        spin_lock_init(&parent->list_lock);
 337        parent->free_objects = 0;
 338        parent->free_touched = 0;
 339}
 340
 341#define MAKE_LIST(cachep, listp, slab, nodeid)                          \
 342        do {                                                            \
 343                INIT_LIST_HEAD(listp);                                  \
 344                list_splice(&(cachep->nodelists[nodeid]->slab), listp); \
 345        } while (0)
 346
 347#define MAKE_ALL_LISTS(cachep, ptr, nodeid)                             \
 348        do {                                                            \
 349        MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid);  \
 350        MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
 351        MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid);  \
 352        } while (0)
 353
 354#define CFLGS_OFF_SLAB          (0x80000000UL)
 355#define OFF_SLAB(x)     ((x)->flags & CFLGS_OFF_SLAB)
 356
 357#define BATCHREFILL_LIMIT       16
 358/*
  359 * Optimization question: fewer reaps means less probability for unnecessary
 360 * cpucache drain/refill cycles.
 361 *
 362 * OTOH the cpuarrays can contain lots of objects,
 363 * which could lock up otherwise freeable slabs.
 364 */
 365#define REAPTIMEOUT_CPUC        (2*HZ)
 366#define REAPTIMEOUT_LIST3       (4*HZ)
 367
 368#if STATS
 369#define STATS_INC_ACTIVE(x)     ((x)->num_active++)
 370#define STATS_DEC_ACTIVE(x)     ((x)->num_active--)
 371#define STATS_INC_ALLOCED(x)    ((x)->num_allocations++)
 372#define STATS_INC_GROWN(x)      ((x)->grown++)
 373#define STATS_ADD_REAPED(x,y)   ((x)->reaped += (y))
 374#define STATS_SET_HIGH(x)                                               \
 375        do {                                                            \
 376                if ((x)->num_active > (x)->high_mark)                   \
 377                        (x)->high_mark = (x)->num_active;               \
 378        } while (0)
 379#define STATS_INC_ERR(x)        ((x)->errors++)
 380#define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++)
 381#define STATS_INC_NODEFREES(x)  ((x)->node_frees++)
 382#define STATS_INC_ACOVERFLOW(x)   ((x)->node_overflow++)
 383#define STATS_SET_FREEABLE(x, i)                                        \
 384        do {                                                            \
 385                if ((x)->max_freeable < i)                              \
 386                        (x)->max_freeable = i;                          \
 387        } while (0)
 388#define STATS_INC_ALLOCHIT(x)   atomic_inc(&(x)->allochit)
 389#define STATS_INC_ALLOCMISS(x)  atomic_inc(&(x)->allocmiss)
 390#define STATS_INC_FREEHIT(x)    atomic_inc(&(x)->freehit)
 391#define STATS_INC_FREEMISS(x)   atomic_inc(&(x)->freemiss)
 392#else
 393#define STATS_INC_ACTIVE(x)     do { } while (0)
 394#define STATS_DEC_ACTIVE(x)     do { } while (0)
 395#define STATS_INC_ALLOCED(x)    do { } while (0)
 396#define STATS_INC_GROWN(x)      do { } while (0)
 397#define STATS_ADD_REAPED(x,y)   do { (void)(y); } while (0)
 398#define STATS_SET_HIGH(x)       do { } while (0)
 399#define STATS_INC_ERR(x)        do { } while (0)
 400#define STATS_INC_NODEALLOCS(x) do { } while (0)
 401#define STATS_INC_NODEFREES(x)  do { } while (0)
 402#define STATS_INC_ACOVERFLOW(x)   do { } while (0)
 403#define STATS_SET_FREEABLE(x, i) do { } while (0)
 404#define STATS_INC_ALLOCHIT(x)   do { } while (0)
 405#define STATS_INC_ALLOCMISS(x)  do { } while (0)
 406#define STATS_INC_FREEHIT(x)    do { } while (0)
 407#define STATS_INC_FREEMISS(x)   do { } while (0)
 408#endif
 409
 410#if DEBUG
 411
 412/*
 413 * memory layout of objects:
 414 * 0            : objp
 415 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
 416 *              the end of an object is aligned with the end of the real
 417 *              allocation. Catches writes behind the end of the allocation.
 418 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
 419 *              redzone word.
 420 * cachep->obj_offset: The real object.
 421 * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
 422 * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address
 423 *                                      [BYTES_PER_WORD long]
 424 */
 425static int obj_offset(struct kmem_cache *cachep)
 426{
 427        return cachep->obj_offset;
 428}
 429
 430static int obj_size(struct kmem_cache *cachep)
 431{
 432        return cachep->obj_size;
 433}
 434
 435static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
 436{
 437        BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
 438        return (unsigned long long*) (objp + obj_offset(cachep) -
 439                                      sizeof(unsigned long long));
 440}
 441
 442static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
 443{
 444        BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
 445        if (cachep->flags & SLAB_STORE_USER)
 446                return (unsigned long long *)(objp + cachep->buffer_size -
 447                                              sizeof(unsigned long long) -
 448                                              REDZONE_ALIGN);
 449        return (unsigned long long *) (objp + cachep->buffer_size -
 450                                       sizeof(unsigned long long));
 451}
 452
 453static void **dbg_userword(struct kmem_cache *cachep, void *objp)
 454{
 455        BUG_ON(!(cachep->flags & SLAB_STORE_USER));
 456        return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD);
 457}
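/*
 * Editor's illustration (hypothetical offsets, assuming a 64-bit kernel with
 * BYTES_PER_WORD == 8 and both SLAB_RED_ZONE and SLAB_STORE_USER set): the
 * three accessors above only compute fixed offsets within the allocation,
 *
 *	objp + obj_offset - 8		first redzone word  (dbg_redzone1)
 *	objp + obj_offset		the object itself   (obj_size bytes)
 *	objp + buffer_size - 16		second redzone word (dbg_redzone2)
 *	objp + buffer_size - 8		last caller address (dbg_userword)
 *
 * so nothing is ever written outside [objp, objp + buffer_size).
 */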
 458
 459#else
 460
 461#define obj_offset(x)                   0
 462#define obj_size(cachep)                (cachep->buffer_size)
 463#define dbg_redzone1(cachep, objp)      ({BUG(); (unsigned long long *)NULL;})
 464#define dbg_redzone2(cachep, objp)      ({BUG(); (unsigned long long *)NULL;})
 465#define dbg_userword(cachep, objp)      ({BUG(); (void **)NULL;})
 466
 467#endif
 468
 469#ifdef CONFIG_TRACING
 470size_t slab_buffer_size(struct kmem_cache *cachep)
 471{
 472        return cachep->buffer_size;
 473}
 474EXPORT_SYMBOL(slab_buffer_size);
 475#endif
 476
 477/*
 478 * Do not go above this order unless 0 objects fit into the slab.
 479 */
 480#define BREAK_GFP_ORDER_HI      1
 481#define BREAK_GFP_ORDER_LO      0
 482static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
 483
 484/*
  485 * Functions for storing/retrieving the cachep and/or slab from the page
 486 * allocator.  These are used to find the slab an obj belongs to.  With kfree(),
 487 * these are used to find the cache which an obj belongs to.
 488 */
 489static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
 490{
 491        page->lru.next = (struct list_head *)cache;
 492}
 493
 494static inline struct kmem_cache *page_get_cache(struct page *page)
 495{
 496        page = compound_head(page);
 497        BUG_ON(!PageSlab(page));
 498        return (struct kmem_cache *)page->lru.next;
 499}
 500
 501static inline void page_set_slab(struct page *page, struct slab *slab)
 502{
 503        page->lru.prev = (struct list_head *)slab;
 504}
 505
 506static inline struct slab *page_get_slab(struct page *page)
 507{
 508        BUG_ON(!PageSlab(page));
 509        return (struct slab *)page->lru.prev;
 510}
 511
 512static inline struct kmem_cache *virt_to_cache(const void *obj)
 513{
 514        struct page *page = virt_to_head_page(obj);
 515        return page_get_cache(page);
 516}
 517
 518static inline struct slab *virt_to_slab(const void *obj)
 519{
 520        struct page *page = virt_to_head_page(obj);
 521        return page_get_slab(page);
 522}
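/*
 * Editor's sketch (illustrative, not part of the original file): the lookup
 * path used at free time.  The page->lru pointers are set on every page when
 * a slab is grown (see the slab growth code further down in the file), so any
 * object pointer can be mapped back to its cache and slab via struct page:
 *
 *	static void sketch_identify(const void *objp)
 *	{
 *		struct kmem_cache *cachep = virt_to_cache(objp);
 *		struct slab *slabp = virt_to_slab(objp);
 *
 *		printk(KERN_DEBUG "%p belongs to cache %s, slab %p\n",
 *		       objp, cachep->name, slabp);
 *	}
 */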
 523
 524static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
 525                                 unsigned int idx)
 526{
 527        return slab->s_mem + cache->buffer_size * idx;
 528}
 529
 530/*
  531 * We want to avoid an expensive divide: (offset / cache->buffer_size)
 532 *   Using the fact that buffer_size is a constant for a particular cache,
 533 *   we can replace (offset / cache->buffer_size) by
 534 *   reciprocal_divide(offset, cache->reciprocal_buffer_size)
 535 */
 536static inline unsigned int obj_to_index(const struct kmem_cache *cache,
 537                                        const struct slab *slab, void *obj)
 538{
 539        u32 offset = (obj - slab->s_mem);
 540        return reciprocal_divide(offset, cache->reciprocal_buffer_size);
 541}
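/*
 * Editor's worked example (illustrative): with buffer_size == 256 the cache
 * stores reciprocal_buffer_size = reciprocal_value(256) (set up in
 * kmem_cache_init() below), and for the sixth object of a slab
 *
 *	objp   = index_to_obj(cache, slab, 5);	     slab->s_mem + 5 * 256
 *	offset = objp - slab->s_mem;		     1280
 *	obj_to_index(cache, slab, objp);	     reciprocal_divide(1280, ...) == 5
 *
 * so index_to_obj() and obj_to_index() are exact inverses while the free
 * path avoids a runtime division.
 */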
 542
 543/*
 544 * These are the default caches for kmalloc. Custom caches can have other sizes.
 545 */
 546struct cache_sizes malloc_sizes[] = {
 547#define CACHE(x) { .cs_size = (x) },
 548#include <linux/kmalloc_sizes.h>
 549        CACHE(ULONG_MAX)
 550#undef CACHE
 551};
 552EXPORT_SYMBOL(malloc_sizes);
 553
 554/* Must match cache_sizes above. Out of line to keep cache footprint low. */
 555struct cache_names {
 556        char *name;
 557        char *name_dma;
 558};
 559
 560static struct cache_names __initdata cache_names[] = {
 561#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
 562#include <linux/kmalloc_sizes.h>
 563        {NULL,}
 564#undef CACHE
 565};
 566
 567static struct arraycache_init initarray_cache __initdata =
 568    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
 569static struct arraycache_init initarray_generic =
 570    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
 571
 572/* internal cache of cache description objs */
 573static struct kmem_cache cache_cache = {
 574        .batchcount = 1,
 575        .limit = BOOT_CPUCACHE_ENTRIES,
 576        .shared = 1,
 577        .buffer_size = sizeof(struct kmem_cache),
 578        .name = "kmem_cache",
 579};
 580
 581#define BAD_ALIEN_MAGIC 0x01020304ul
 582
 583/*
 584 * chicken and egg problem: delay the per-cpu array allocation
 585 * until the general caches are up.
 586 */
 587static enum {
 588        NONE,
 589        PARTIAL_AC,
 590        PARTIAL_L3,
 591        EARLY,
 592        FULL
 593} g_cpucache_up;
 594
 595/*
 596 * used by boot code to determine if it can use slab based allocator
 597 */
 598int slab_is_available(void)
 599{
 600        return g_cpucache_up >= EARLY;
 601}
 602
 603#ifdef CONFIG_LOCKDEP
 604
 605/*
 606 * Slab sometimes uses the kmalloc slabs to store the slab headers
 607 * for other slabs "off slab".
 608 * The locking for this is tricky in that it nests within the locks
 609 * of all other slabs in a few places; to deal with this special
 610 * locking we put on-slab caches into a separate lock-class.
 611 *
 612 * We set lock class for alien array caches which are up during init.
  613 * The lock annotation will be lost if all cpus of a node go down and
  614 * then come back up during hotplug.
 615 */
 616static struct lock_class_key on_slab_l3_key;
 617static struct lock_class_key on_slab_alc_key;
 618
 619static void init_node_lock_keys(int q)
 620{
 621        struct cache_sizes *s = malloc_sizes;
 622
 623        if (g_cpucache_up != FULL)
 624                return;
 625
 626        for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) {
 627                struct array_cache **alc;
 628                struct kmem_list3 *l3;
 629                int r;
 630
 631                l3 = s->cs_cachep->nodelists[q];
 632                if (!l3 || OFF_SLAB(s->cs_cachep))
 633                        continue;
 634                lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
 635                alc = l3->alien;
 636                /*
 637                 * FIXME: This check for BAD_ALIEN_MAGIC
 638                 * should go away when common slab code is taught to
 639                 * work even without alien caches.
  640                 * Currently, non-NUMA code returns BAD_ALIEN_MAGIC
  641                 * for alloc_alien_cache.
 642                 */
 643                if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
 644                        continue;
 645                for_each_node(r) {
 646                        if (alc[r])
 647                                lockdep_set_class(&alc[r]->lock,
 648                                        &on_slab_alc_key);
 649                }
 650        }
 651}
 652
 653static inline void init_lock_keys(void)
 654{
 655        int node;
 656
 657        for_each_node(node)
 658                init_node_lock_keys(node);
 659}
 660#else
 661static void init_node_lock_keys(int q)
 662{
 663}
 664
 665static inline void init_lock_keys(void)
 666{
 667}
 668#endif
 669
 670/*
 671 * Guard access to the cache-chain.
 672 */
 673static DEFINE_MUTEX(cache_chain_mutex);
 674static struct list_head cache_chain;
 675
 676static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
 677
 678static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
 679{
 680        return cachep->array[smp_processor_id()];
 681}
 682
 683static inline struct kmem_cache *__find_general_cachep(size_t size,
 684                                                        gfp_t gfpflags)
 685{
 686        struct cache_sizes *csizep = malloc_sizes;
 687
 688#if DEBUG
 689        /* This happens if someone tries to call
 690         * kmem_cache_create(), or __kmalloc(), before
 691         * the generic caches are initialized.
 692         */
 693        BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
 694#endif
 695        if (!size)
 696                return ZERO_SIZE_PTR;
 697
 698        while (size > csizep->cs_size)
 699                csizep++;
 700
 701        /*
 702         * Really subtle: The last entry with cs->cs_size==ULONG_MAX
 703         * has cs_{dma,}cachep==NULL. Thus no special case
 704         * for large kmalloc calls required.
 705         */
 706#ifdef CONFIG_ZONE_DMA
 707        if (unlikely(gfpflags & GFP_DMA))
 708                return csizep->cs_dmacachep;
 709#endif
 710        return csizep->cs_cachep;
 711}
 712
 713static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
 714{
 715        return __find_general_cachep(size, gfpflags);
 716}
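/*
 * Editor's example (illustrative; the actual sizes depend on
 * kmalloc_sizes.h and the architecture): the lookup above is a linear walk
 * of malloc_sizes[], so with a table of 32, 64, 96, 128, ... it behaves like
 *
 *	__find_general_cachep(0,   GFP_KERNEL)	-> ZERO_SIZE_PTR
 *	__find_general_cachep(40,  GFP_KERNEL)	-> the size-64 cache
 *	__find_general_cachep(100, GFP_DMA)	-> the size-128(DMA) cache
 *
 * An oversized request walks onto the ULONG_MAX sentinel entry, whose
 * cs_cachep/cs_dmacachep are NULL, so the caller sees NULL without needing
 * a special large-size check.
 */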
 717
 718static size_t slab_mgmt_size(size_t nr_objs, size_t align)
 719{
 720        return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
 721}
 722
 723/*
 724 * Calculate the number of objects and left-over bytes for a given buffer size.
 725 */
 726static void cache_estimate(unsigned long gfporder, size_t buffer_size,
 727                           size_t align, int flags, size_t *left_over,
 728                           unsigned int *num)
 729{
 730        int nr_objs;
 731        size_t mgmt_size;
 732        size_t slab_size = PAGE_SIZE << gfporder;
 733
 734        /*
 735         * The slab management structure can be either off the slab or
 736         * on it. For the latter case, the memory allocated for a
 737         * slab is used for:
 738         *
 739         * - The struct slab
 740         * - One kmem_bufctl_t for each object
 741         * - Padding to respect alignment of @align
 742         * - @buffer_size bytes for each object
 743         *
 744         * If the slab management structure is off the slab, then the
 745         * alignment will already be calculated into the size. Because
 746         * the slabs are all pages aligned, the objects will be at the
 747         * correct alignment when allocated.
 748         */
 749        if (flags & CFLGS_OFF_SLAB) {
 750                mgmt_size = 0;
 751                nr_objs = slab_size / buffer_size;
 752
 753                if (nr_objs > SLAB_LIMIT)
 754                        nr_objs = SLAB_LIMIT;
 755        } else {
 756                /*
 757                 * Ignore padding for the initial guess. The padding
 758                 * is at most @align-1 bytes, and @buffer_size is at
 759                 * least @align. In the worst case, this result will
 760                 * be one greater than the number of objects that fit
 761                 * into the memory allocation when taking the padding
 762                 * into account.
 763                 */
 764                nr_objs = (slab_size - sizeof(struct slab)) /
 765                          (buffer_size + sizeof(kmem_bufctl_t));
 766
 767                /*
 768                 * This calculated number will be either the right
 769                 * amount, or one greater than what we want.
 770                 */
 771                if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
 772                       > slab_size)
 773                        nr_objs--;
 774
 775                if (nr_objs > SLAB_LIMIT)
 776                        nr_objs = SLAB_LIMIT;
 777
 778                mgmt_size = slab_mgmt_size(nr_objs, align);
 779        }
 780        *num = nr_objs;
 781        *left_over = slab_size - nr_objs*buffer_size - mgmt_size;
 782}
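/*
 * Editor's worked example (hypothetical numbers; assumes a 64-bit kernel
 * with sizeof(struct slab) == 48, sizeof(kmem_bufctl_t) == 4 and 64-byte
 * cache lines): cache_estimate(0, 256, 64, 0, &left_over, &num) for a
 * 4096-byte slab with on-slab management works out as
 *
 *	initial guess:	(4096 - 48) / (256 + 4)		= 15 objects
 *	mgmt_size:	ALIGN(48 + 15 * 4, 64)		= 128 bytes
 *	check:		128 + 15 * 256 = 3968 <= 4096	-> guess stands
 *	left_over:	4096 - 15 * 256 - 128		= 128 bytes
 *
 * The left-over bytes are what the colouring code later spreads across
 * slabs as different colour offsets.
 */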
 783
 784#define slab_error(cachep, msg) __slab_error(__func__, cachep, msg)
 785
 786static void __slab_error(const char *function, struct kmem_cache *cachep,
 787                        char *msg)
 788{
 789        printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
 790               function, cachep->name, msg);
 791        dump_stack();
 792}
 793
 794/*
 795 * By default on NUMA we use alien caches to stage the freeing of
 796 * objects allocated from other nodes. This causes massive memory
 797 * inefficiencies when using fake NUMA setup to split memory into a
 798 * large number of small nodes, so it can be disabled on the command
  799 * line.
  800 */
 801
 802static int use_alien_caches __read_mostly = 1;
 803static int __init noaliencache_setup(char *s)
 804{
 805        use_alien_caches = 0;
 806        return 1;
 807}
 808__setup("noaliencache", noaliencache_setup);
 809
 810#ifdef CONFIG_NUMA
 811/*
 812 * Special reaping functions for NUMA systems called from cache_reap().
 813 * These take care of doing round robin flushing of alien caches (containing
 814 * objects freed on different nodes from which they were allocated) and the
 815 * flushing of remote pcps by calling drain_node_pages.
 816 */
 817static DEFINE_PER_CPU(unsigned long, slab_reap_node);
 818
 819static void init_reap_node(int cpu)
 820{
 821        int node;
 822
 823        node = next_node(cpu_to_mem(cpu), node_online_map);
 824        if (node == MAX_NUMNODES)
 825                node = first_node(node_online_map);
 826
 827        per_cpu(slab_reap_node, cpu) = node;
 828}
 829
 830static void next_reap_node(void)
 831{
 832        int node = __this_cpu_read(slab_reap_node);
 833
 834        node = next_node(node, node_online_map);
 835        if (unlikely(node >= MAX_NUMNODES))
 836                node = first_node(node_online_map);
 837        __this_cpu_write(slab_reap_node, node);
 838}
 839
 840#else
 841#define init_reap_node(cpu) do { } while (0)
 842#define next_reap_node(void) do { } while (0)
 843#endif
 844
 845/*
 846 * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
 847 * via the workqueue/eventd.
 848 * Add the CPU number into the expiration time to minimize the possibility of
 849 * the CPUs getting into lockstep and contending for the global cache chain
 850 * lock.
 851 */
 852static void __cpuinit start_cpu_timer(int cpu)
 853{
 854        struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu);
 855
 856        /*
 857         * When this gets called from do_initcalls via cpucache_init(),
 858         * init_workqueues() has already run, so keventd will be setup
 859         * at that time.
 860         */
 861        if (keventd_up() && reap_work->work.func == NULL) {
 862                init_reap_node(cpu);
 863                INIT_DELAYED_WORK_DEFERRABLE(reap_work, cache_reap);
 864                schedule_delayed_work_on(cpu, reap_work,
 865                                        __round_jiffies_relative(HZ, cpu));
 866        }
 867}
 868
 869static struct array_cache *alloc_arraycache(int node, int entries,
 870                                            int batchcount, gfp_t gfp)
 871{
 872        int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
 873        struct array_cache *nc = NULL;
 874
 875        nc = kmalloc_node(memsize, gfp, node);
 876        /*
  877         * The array_cache structures contain pointers to free objects.
  878         * However, when such objects are allocated or transferred to another
 879         * cache the pointers are not cleared and they could be counted as
 880         * valid references during a kmemleak scan. Therefore, kmemleak must
 881         * not scan such objects.
 882         */
 883        kmemleak_no_scan(nc);
 884        if (nc) {
 885                nc->avail = 0;
 886                nc->limit = entries;
 887                nc->batchcount = batchcount;
 888                nc->touched = 0;
 889                spin_lock_init(&nc->lock);
 890        }
 891        return nc;
 892}
 893
 894/*
 895 * Transfer objects in one arraycache to another.
 896 * Locking must be handled by the caller.
 897 *
 898 * Return the number of entries transferred.
 899 */
 900static int transfer_objects(struct array_cache *to,
 901                struct array_cache *from, unsigned int max)
 902{
 903        /* Figure out how many entries to transfer */
 904        int nr = min3(from->avail, max, to->limit - to->avail);
 905
 906        if (!nr)
 907                return 0;
 908
 909        memcpy(to->entry + to->avail, from->entry + from->avail -nr,
 910                        sizeof(void *) *nr);
 911
 912        from->avail -= nr;
 913        to->avail += nr;
 914        return nr;
 915}
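/*
 * Editor's note with an illustrative call (hypothetical values): the top
 * 'nr' entries of 'from' are appended on top of 'to', preserving LIFO order
 * of the most recently freed objects.  With from->avail == 10,
 * to->avail == 3, to->limit == 8 and max == 16:
 *
 *	nr = min3(10, 16, 8 - 3) = 5
 *	to->entry[3..7] <- from->entry[5..9]
 *	from->avail == 5, to->avail == 8
 *
 * This is how a per-node shared array absorbs objects from an overflowing
 * cpu or alien array before anything has to go back onto the slab lists.
 */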
 916
 917#ifndef CONFIG_NUMA
 918
 919#define drain_alien_cache(cachep, alien) do { } while (0)
 920#define reap_alien(cachep, l3) do { } while (0)
 921
 922static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
 923{
 924        return (struct array_cache **)BAD_ALIEN_MAGIC;
 925}
 926
 927static inline void free_alien_cache(struct array_cache **ac_ptr)
 928{
 929}
 930
 931static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
 932{
 933        return 0;
 934}
 935
 936static inline void *alternate_node_alloc(struct kmem_cache *cachep,
 937                gfp_t flags)
 938{
 939        return NULL;
 940}
 941
 942static inline void *____cache_alloc_node(struct kmem_cache *cachep,
 943                 gfp_t flags, int nodeid)
 944{
 945        return NULL;
 946}
 947
 948#else   /* CONFIG_NUMA */
 949
 950static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
 951static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
 952
 953static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
 954{
 955        struct array_cache **ac_ptr;
 956        int memsize = sizeof(void *) * nr_node_ids;
 957        int i;
 958
 959        if (limit > 1)
 960                limit = 12;
 961        ac_ptr = kzalloc_node(memsize, gfp, node);
 962        if (ac_ptr) {
 963                for_each_node(i) {
 964                        if (i == node || !node_online(i))
 965                                continue;
 966                        ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp);
 967                        if (!ac_ptr[i]) {
 968                                for (i--; i >= 0; i--)
 969                                        kfree(ac_ptr[i]);
 970                                kfree(ac_ptr);
 971                                return NULL;
 972                        }
 973                }
 974        }
 975        return ac_ptr;
 976}
 977
 978static void free_alien_cache(struct array_cache **ac_ptr)
 979{
 980        int i;
 981
 982        if (!ac_ptr)
 983                return;
 984        for_each_node(i)
 985            kfree(ac_ptr[i]);
 986        kfree(ac_ptr);
 987}
 988
 989static void __drain_alien_cache(struct kmem_cache *cachep,
 990                                struct array_cache *ac, int node)
 991{
 992        struct kmem_list3 *rl3 = cachep->nodelists[node];
 993
 994        if (ac->avail) {
 995                spin_lock(&rl3->list_lock);
 996                /*
  997                 * Stuff objects into the remote node's shared array first.
 998                 * That way we could avoid the overhead of putting the objects
 999                 * into the free lists and getting them back later.
1000                 */
1001                if (rl3->shared)
1002                        transfer_objects(rl3->shared, ac, ac->limit);
1003
1004                free_block(cachep, ac->entry, ac->avail, node);
1005                ac->avail = 0;
1006                spin_unlock(&rl3->list_lock);
1007        }
1008}
1009
1010/*
1011 * Called from cache_reap() to regularly drain alien caches round robin.
1012 */
1013static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
1014{
1015        int node = __this_cpu_read(slab_reap_node);
1016
1017        if (l3->alien) {
1018                struct array_cache *ac = l3->alien[node];
1019
1020                if (ac && ac->avail && spin_trylock_irq(&ac->lock)) {
1021                        __drain_alien_cache(cachep, ac, node);
1022                        spin_unlock_irq(&ac->lock);
1023                }
1024        }
1025}
1026
1027static void drain_alien_cache(struct kmem_cache *cachep,
1028                                struct array_cache **alien)
1029{
1030        int i = 0;
1031        struct array_cache *ac;
1032        unsigned long flags;
1033
1034        for_each_online_node(i) {
1035                ac = alien[i];
1036                if (ac) {
1037                        spin_lock_irqsave(&ac->lock, flags);
1038                        __drain_alien_cache(cachep, ac, i);
1039                        spin_unlock_irqrestore(&ac->lock, flags);
1040                }
1041        }
1042}
1043
1044static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1045{
1046        struct slab *slabp = virt_to_slab(objp);
1047        int nodeid = slabp->nodeid;
1048        struct kmem_list3 *l3;
1049        struct array_cache *alien = NULL;
1050        int node;
1051
1052        node = numa_mem_id();
1053
1054        /*
 1055         * Make sure we are not freeing an object from another node to the array
1056         * cache on this cpu.
1057         */
1058        if (likely(slabp->nodeid == node))
1059                return 0;
1060
1061        l3 = cachep->nodelists[node];
1062        STATS_INC_NODEFREES(cachep);
1063        if (l3->alien && l3->alien[nodeid]) {
1064                alien = l3->alien[nodeid];
1065                spin_lock(&alien->lock);
1066                if (unlikely(alien->avail == alien->limit)) {
1067                        STATS_INC_ACOVERFLOW(cachep);
1068                        __drain_alien_cache(cachep, alien, nodeid);
1069                }
1070                alien->entry[alien->avail++] = objp;
1071                spin_unlock(&alien->lock);
1072        } else {
1073                spin_lock(&(cachep->nodelists[nodeid])->list_lock);
1074                free_block(cachep, &objp, 1, nodeid);
1075                spin_unlock(&(cachep->nodelists[nodeid])->list_lock);
1076        }
1077        return 1;
1078}
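/*
 * Editor's sketch (illustrative, not the real caller): how the return value
 * of cache_free_alien() is meant to be used on the free path.  0 means the
 * object belongs to this node and should be pushed onto the per-cpu array
 * as usual; 1 means it has already been queued for its home node, either in
 * an alien array or directly on that node's free lists.
 *
 *	static void sketch_free(struct kmem_cache *cachep, void *objp)
 *	{
 *		struct array_cache *ac = cpu_cache_get(cachep);
 *
 *		if (cache_free_alien(cachep, objp))
 *			return;
 *		if (ac->avail < ac->limit)
 *			ac->entry[ac->avail++] = objp;
 *	}
 *
 * The real free path flushes a batch back to the lists when the array is
 * full, rather than leaving the object unhandled as this simplified sketch
 * would.
 */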
1079#endif
1080
1081/*
1082 * Allocates and initializes nodelists for a node on each slab cache, used for
1083 * either memory or cpu hotplug.  If memory is being hot-added, the kmem_list3
1084 * will be allocated off-node since memory is not yet online for the new node.
1085 * When hotplugging memory or a cpu, existing nodelists are not replaced if
1086 * already in use.
1087 *
1088 * Must hold cache_chain_mutex.
1089 */
1090static int init_cache_nodelists_node(int node)
1091{
1092        struct kmem_cache *cachep;
1093        struct kmem_list3 *l3;
1094        const int memsize = sizeof(struct kmem_list3);
1095
1096        list_for_each_entry(cachep, &cache_chain, next) {
1097                /*
 1098                 * Set up the kmem_list3 for this cpu's node before we
 1099                 * can begin anything. Make sure some other cpu on this
 1100                 * node has not already allocated it.
1101                 */
1102                if (!cachep->nodelists[node]) {
1103                        l3 = kmalloc_node(memsize, GFP_KERNEL, node);
1104                        if (!l3)
1105                                return -ENOMEM;
1106                        kmem_list3_init(l3);
1107                        l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
1108                            ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1109
1110                        /*
1111                         * The l3s don't come and go as CPUs come and
1112                         * go.  cache_chain_mutex is sufficient
1113                         * protection here.
1114                         */
1115                        cachep->nodelists[node] = l3;
1116                }
1117
1118                spin_lock_irq(&cachep->nodelists[node]->list_lock);
1119                cachep->nodelists[node]->free_limit =
1120                        (1 + nr_cpus_node(node)) *
1121                        cachep->batchcount + cachep->num;
1122                spin_unlock_irq(&cachep->nodelists[node]->list_lock);
1123        }
1124        return 0;
1125}
1126
1127static void __cpuinit cpuup_canceled(long cpu)
1128{
1129        struct kmem_cache *cachep;
1130        struct kmem_list3 *l3 = NULL;
1131        int node = cpu_to_mem(cpu);
1132        const struct cpumask *mask = cpumask_of_node(node);
1133
1134        list_for_each_entry(cachep, &cache_chain, next) {
1135                struct array_cache *nc;
1136                struct array_cache *shared;
1137                struct array_cache **alien;
1138
1139                /* cpu is dead; no one can alloc from it. */
1140                nc = cachep->array[cpu];
1141                cachep->array[cpu] = NULL;
1142                l3 = cachep->nodelists[node];
1143
1144                if (!l3)
1145                        goto free_array_cache;
1146
1147                spin_lock_irq(&l3->list_lock);
1148
1149                /* Free limit for this kmem_list3 */
1150                l3->free_limit -= cachep->batchcount;
1151                if (nc)
1152                        free_block(cachep, nc->entry, nc->avail, node);
1153
1154                if (!cpumask_empty(mask)) {
1155                        spin_unlock_irq(&l3->list_lock);
1156                        goto free_array_cache;
1157                }
1158
1159                shared = l3->shared;
1160                if (shared) {
1161                        free_block(cachep, shared->entry,
1162                                   shared->avail, node);
1163                        l3->shared = NULL;
1164                }
1165
1166                alien = l3->alien;
1167                l3->alien = NULL;
1168
1169                spin_unlock_irq(&l3->list_lock);
1170
1171                kfree(shared);
1172                if (alien) {
1173                        drain_alien_cache(cachep, alien);
1174                        free_alien_cache(alien);
1175                }
1176free_array_cache:
1177                kfree(nc);
1178        }
1179        /*
1180         * In the previous loop, all the objects were freed to
 1181         * the respective cache's slabs; now we can go ahead and
1182         * shrink each nodelist to its limit.
1183         */
1184        list_for_each_entry(cachep, &cache_chain, next) {
1185                l3 = cachep->nodelists[node];
1186                if (!l3)
1187                        continue;
1188                drain_freelist(cachep, l3, l3->free_objects);
1189        }
1190}
1191
1192static int __cpuinit cpuup_prepare(long cpu)
1193{
1194        struct kmem_cache *cachep;
1195        struct kmem_list3 *l3 = NULL;
1196        int node = cpu_to_mem(cpu);
1197        int err;
1198
1199        /*
1200         * We need to do this right in the beginning since
1201         * alloc_arraycache's are going to use this list.
1202         * kmalloc_node allows us to add the slab to the right
1203         * kmem_list3 and not this cpu's kmem_list3
1204         */
1205        err = init_cache_nodelists_node(node);
1206        if (err < 0)
1207                goto bad;
1208
1209        /*
1210         * Now we can go ahead with allocating the shared arrays and
1211         * array caches
1212         */
1213        list_for_each_entry(cachep, &cache_chain, next) {
1214                struct array_cache *nc;
1215                struct array_cache *shared = NULL;
1216                struct array_cache **alien = NULL;
1217
1218                nc = alloc_arraycache(node, cachep->limit,
1219                                        cachep->batchcount, GFP_KERNEL);
1220                if (!nc)
1221                        goto bad;
1222                if (cachep->shared) {
1223                        shared = alloc_arraycache(node,
1224                                cachep->shared * cachep->batchcount,
1225                                0xbaadf00d, GFP_KERNEL);
1226                        if (!shared) {
1227                                kfree(nc);
1228                                goto bad;
1229                        }
1230                }
1231                if (use_alien_caches) {
1232                        alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL);
1233                        if (!alien) {
1234                                kfree(shared);
1235                                kfree(nc);
1236                                goto bad;
1237                        }
1238                }
1239                cachep->array[cpu] = nc;
1240                l3 = cachep->nodelists[node];
1241                BUG_ON(!l3);
1242
1243                spin_lock_irq(&l3->list_lock);
1244                if (!l3->shared) {
1245                        /*
1246                         * We are serialised from CPU_DEAD or
1247                         * CPU_UP_CANCELLED by the cpucontrol lock
1248                         */
1249                        l3->shared = shared;
1250                        shared = NULL;
1251                }
1252#ifdef CONFIG_NUMA
1253                if (!l3->alien) {
1254                        l3->alien = alien;
1255                        alien = NULL;
1256                }
1257#endif
1258                spin_unlock_irq(&l3->list_lock);
1259                kfree(shared);
1260                free_alien_cache(alien);
1261        }
1262        init_node_lock_keys(node);
1263
1264        return 0;
1265bad:
1266        cpuup_canceled(cpu);
1267        return -ENOMEM;
1268}
1269
1270static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1271                                    unsigned long action, void *hcpu)
1272{
1273        long cpu = (long)hcpu;
1274        int err = 0;
1275
1276        switch (action) {
1277        case CPU_UP_PREPARE:
1278        case CPU_UP_PREPARE_FROZEN:
1279                mutex_lock(&cache_chain_mutex);
1280                err = cpuup_prepare(cpu);
1281                mutex_unlock(&cache_chain_mutex);
1282                break;
1283        case CPU_ONLINE:
1284        case CPU_ONLINE_FROZEN:
1285                start_cpu_timer(cpu);
1286                break;
1287#ifdef CONFIG_HOTPLUG_CPU
1288        case CPU_DOWN_PREPARE:
1289        case CPU_DOWN_PREPARE_FROZEN:
1290                /*
1291                 * Shutdown cache reaper. Note that the cache_chain_mutex is
1292                 * held so that if cache_reap() is invoked it cannot do
1293                 * anything expensive but will only modify reap_work
1294                 * and reschedule the timer.
1295                */
1296                cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu));
1297                /* Now the cache_reaper is guaranteed to be not running. */
1298                per_cpu(slab_reap_work, cpu).work.func = NULL;
1299                break;
1300        case CPU_DOWN_FAILED:
1301        case CPU_DOWN_FAILED_FROZEN:
1302                start_cpu_timer(cpu);
1303                break;
1304        case CPU_DEAD:
1305        case CPU_DEAD_FROZEN:
1306                /*
1307                 * Even if all the cpus of a node are down, we don't free the
 1308                 * kmem_list3 of any cache. This is to avoid a race between
1309                 * cpu_down, and a kmalloc allocation from another cpu for
1310                 * memory from the node of the cpu going down.  The list3
1311                 * structure is usually allocated from kmem_cache_create() and
1312                 * gets destroyed at kmem_cache_destroy().
1313                 */
1314                /* fall through */
1315#endif
1316        case CPU_UP_CANCELED:
1317        case CPU_UP_CANCELED_FROZEN:
1318                mutex_lock(&cache_chain_mutex);
1319                cpuup_canceled(cpu);
1320                mutex_unlock(&cache_chain_mutex);
1321                break;
1322        }
1323        return notifier_from_errno(err);
1324}
1325
1326static struct notifier_block __cpuinitdata cpucache_notifier = {
1327        &cpuup_callback, NULL, 0
1328};
1329
1330#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
1331/*
1332 * Drains freelist for a node on each slab cache, used for memory hot-remove.
 1333 * Returns -EBUSY if not all objects can be drained, so that the node is not
1334 * removed.
1335 *
1336 * Must hold cache_chain_mutex.
1337 */
1338static int __meminit drain_cache_nodelists_node(int node)
1339{
1340        struct kmem_cache *cachep;
1341        int ret = 0;
1342
1343        list_for_each_entry(cachep, &cache_chain, next) {
1344                struct kmem_list3 *l3;
1345
1346                l3 = cachep->nodelists[node];
1347                if (!l3)
1348                        continue;
1349
1350                drain_freelist(cachep, l3, l3->free_objects);
1351
1352                if (!list_empty(&l3->slabs_full) ||
1353                    !list_empty(&l3->slabs_partial)) {
1354                        ret = -EBUSY;
1355                        break;
1356                }
1357        }
1358        return ret;
1359}
1360
1361static int __meminit slab_memory_callback(struct notifier_block *self,
1362                                        unsigned long action, void *arg)
1363{
1364        struct memory_notify *mnb = arg;
1365        int ret = 0;
1366        int nid;
1367
1368        nid = mnb->status_change_nid;
1369        if (nid < 0)
1370                goto out;
1371
1372        switch (action) {
1373        case MEM_GOING_ONLINE:
1374                mutex_lock(&cache_chain_mutex);
1375                ret = init_cache_nodelists_node(nid);
1376                mutex_unlock(&cache_chain_mutex);
1377                break;
1378        case MEM_GOING_OFFLINE:
1379                mutex_lock(&cache_chain_mutex);
1380                ret = drain_cache_nodelists_node(nid);
1381                mutex_unlock(&cache_chain_mutex);
1382                break;
1383        case MEM_ONLINE:
1384        case MEM_OFFLINE:
1385        case MEM_CANCEL_ONLINE:
1386        case MEM_CANCEL_OFFLINE:
1387                break;
1388        }
1389out:
1390        return ret ? notifier_from_errno(ret) : NOTIFY_OK;
1391}
1392#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */
1393
1394/*
1395 * swap the static kmem_list3 with kmalloced memory
1396 */
1397static void __init init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
1398                                int nodeid)
1399{
1400        struct kmem_list3 *ptr;
1401
1402        ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_NOWAIT, nodeid);
1403        BUG_ON(!ptr);
1404
1405        memcpy(ptr, list, sizeof(struct kmem_list3));
1406        /*
1407         * Do not assume that spinlocks can be initialized via memcpy:
1408         */
1409        spin_lock_init(&ptr->list_lock);
1410
1411        MAKE_ALL_LISTS(cachep, ptr, nodeid);
1412        cachep->nodelists[nodeid] = ptr;
1413}
1414
1415/*
 1416 * For setting up all the kmem_list3s for caches whose buffer_size is the same
 1417 * as the size of kmem_list3.
1418 */
1419static void __init set_up_list3s(struct kmem_cache *cachep, int index)
1420{
1421        int node;
1422
1423        for_each_online_node(node) {
1424                cachep->nodelists[node] = &initkmem_list3[index + node];
1425                cachep->nodelists[node]->next_reap = jiffies +
1426                    REAPTIMEOUT_LIST3 +
1427                    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1428        }
1429}
1430
1431/*
 1432 * Initialisation.  Called after the page allocator has been initialised and
1433 * before smp_init().
1434 */
1435void __init kmem_cache_init(void)
1436{
1437        size_t left_over;
1438        struct cache_sizes *sizes;
1439        struct cache_names *names;
1440        int i;
1441        int order;
1442        int node;
1443
1444        if (num_possible_nodes() == 1)
1445                use_alien_caches = 0;
1446
1447        for (i = 0; i < NUM_INIT_LISTS; i++) {
1448                kmem_list3_init(&initkmem_list3[i]);
1449                if (i < MAX_NUMNODES)
1450                        cache_cache.nodelists[i] = NULL;
1451        }
1452        set_up_list3s(&cache_cache, CACHE_CACHE);
1453
1454        /*
1455         * Fragmentation resistance on low memory - only use bigger
1456         * page orders on machines with more than 32MB of memory.
1457         */
1458        if (totalram_pages > (32 << 20) >> PAGE_SHIFT)
1459                slab_break_gfp_order = BREAK_GFP_ORDER_HI;
1460
1461        /* Bootstrap is tricky, because several objects are allocated
1462         * from caches that do not exist yet:
1463         * 1) initialize the cache_cache cache: it contains the struct
1464         *    kmem_cache structures of all caches, except cache_cache itself:
1465         *    cache_cache is statically allocated.
1466         *    Initially an __init data area is used for the head array and the
1467         *    kmem_list3 structures, it's replaced with a kmalloc allocated
1468         *    array at the end of the bootstrap.
1469         * 2) Create the first kmalloc cache.
1470         *    The struct kmem_cache for the new cache is allocated normally.
1471         *    An __init data area is used for the head array.
1472         * 3) Create the remaining kmalloc caches, with minimally sized
1473         *    head arrays.
1474         * 4) Replace the __init data head arrays for cache_cache and the first
1475         *    kmalloc cache with kmalloc allocated arrays.
1476         * 5) Replace the __init data for kmem_list3 for cache_cache and
 1477         *    the other caches with kmalloc allocated memory.
1478         * 6) Resize the head arrays of the kmalloc caches to their final sizes.
1479         */
1480
1481        node = numa_mem_id();
1482
1483        /* 1) create the cache_cache */
1484        INIT_LIST_HEAD(&cache_chain);
1485        list_add(&cache_cache.next, &cache_chain);
1486        cache_cache.colour_off = cache_line_size();
1487        cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
1488        cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
1489
1490        /*
1491         * struct kmem_cache size depends on nr_node_ids, which
1492         * can be less than MAX_NUMNODES.
1493         */
1494        cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) +
1495                                 nr_node_ids * sizeof(struct kmem_list3 *);
1496#if DEBUG
1497        cache_cache.obj_size = cache_cache.buffer_size;
1498#endif
1499        cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
1500                                        cache_line_size());
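        /*
         * Cache the reciprocal of buffer_size so that obj_to_index() can
         * replace a division by buffer_size with a cheaper multiply
         * (see reciprocal_divide()).
         */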
1501        cache_cache.reciprocal_buffer_size =
1502                reciprocal_value(cache_cache.buffer_size);
1503
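        /*
         * Find the smallest page order that fits at least one cache_cache
         * object; cache_estimate() also reports the leftover space that is
         * used below for slab colouring.
         */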
1504        for (order = 0; order < MAX_ORDER; order++) {
1505                cache_estimate(order, cache_cache.buffer_size,
1506                        cache_line_size(), 0, &left_over, &cache_cache.num);
1507                if (cache_cache.num)
1508                        break;
1509        }
1510        BUG_ON(!cache_cache.num);
1511        cache_cache.gfporder = order;
1512        cache_cache.colour = left_over / cache_cache.colour_off;
1513        cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
1514                                      sizeof(struct slab), cache_line_size());
1515
1516        /* 2+3) create the kmalloc caches */
1517        sizes = malloc_sizes;
1518        names = cache_names;
1519
1520        /*
1521         * Initialize the caches that provide memory for the array cache and the
1522         * kmem_list3 structures first.  Without this, further allocations will
1523         * trigger a BUG().
1524         */
1525
1526        sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
1527                                        sizes[INDEX_AC].cs_size,
1528                                        ARCH_KMALLOC_MINALIGN,
1529                                        ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1530                                        NULL);
1531
1532        if (INDEX_AC != INDEX_L3) {
1533                sizes[INDEX_L3].cs_cachep =
1534                        kmem_cache_create(names[INDEX_L3].name,
1535                                sizes[INDEX_L3].cs_size,
1536                                ARCH_KMALLOC_MINALIGN,
1537                                ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1538                                NULL);
1539        }
1540
1541        slab_early_init = 0;
1542
1543        while (sizes->cs_size != ULONG_MAX) {
1544                /*
1545                 * For performance, all the general caches are L1 aligned.
1546                 * This should be particularly beneficial on SMP boxes, as it
1547                 * eliminates "false sharing".
1548                 * Note: for systems short on memory, removing the alignment will
1549                 * allow tighter packing of the smaller caches.
1550                 */
1551                if (!sizes->cs_cachep) {
1552                        sizes->cs_cachep = kmem_cache_create(names->name,
1553                                        sizes->cs_size,
1554                                        ARCH_KMALLOC_MINALIGN,
1555                                        ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1556                                        NULL);
1557                }
1558#ifdef CONFIG_ZONE_DMA
1559                sizes->cs_dmacachep = kmem_cache_create(
1560                                        names->name_dma,
1561                                        sizes->cs_size,
1562                                        ARCH_KMALLOC_MINALIGN,
1563                                        ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA|
1564                                                SLAB_PANIC,
1565                                        NULL);
1566#endif
1567                sizes++;
1568                names++;
1569        }
1570        /* 4) Replace the bootstrap head arrays */
1571        {
1572                struct array_cache *ptr;
1573
1574                ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1575
1576                BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
1577                memcpy(ptr, cpu_cache_get(&cache_cache),
1578                       sizeof(struct arraycache_init));
1579                /*
1580                 * Do not assume that spinlocks can be initialized via memcpy:
1581                 */
1582                spin_lock_init(&ptr->lock);
1583
1584                cache_cache.array[smp_processor_id()] = ptr;
1585
1586                ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1587
1588                BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
1589                       != &initarray_generic.cache);
1590                memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
1591                       sizeof(struct arraycache_init));
1592                /*
1593                 * Do not assume that spinlocks can be initialized via memcpy:
1594                 */
1595                spin_lock_init(&ptr->lock);
1596
1597                malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
1598                    ptr;
1599        }
1600        /* 5) Replace the bootstrap kmem_list3's */
1601        {
1602                int nid;
1603
1604                for_each_online_node(nid) {
1605                        init_list(&cache_cache, &initkmem_list3[CACHE_CACHE + nid], nid);
1606
1607                        init_list(malloc_sizes[INDEX_AC].cs_cachep,
1608                                  &initkmem_list3[SIZE_AC + nid], nid);
1609
1610                        if (INDEX_AC != INDEX_L3) {
1611                                init_list(malloc_sizes[INDEX_L3].cs_cachep,
1612                                          &initkmem_list3[SIZE_L3 + nid], nid);
1613                        }
1614                }
1615        }
1616
1617        g_cpucache_up = EARLY;
1618}
1619
1620void __init kmem_cache_init_late(void)
1621{
1622        struct kmem_cache *cachep;
1623
1624        /* 6) resize the head arrays to their final sizes */
1625        mutex_lock(&cache_chain_mutex);
1626        list_for_each_entry(cachep, &cache_chain, next)
1627                if (enable_cpucache(cachep, GFP_NOWAIT))
1628                        BUG();
1629        mutex_unlock(&cache_chain_mutex);
1630
1631        /* Done! */
1632        g_cpucache_up = FULL;
1633
1634        /* Annotate slab for lockdep -- annotate the malloc caches */
1635        init_lock_keys();
1636
1637        /*
1638         * Register a cpu startup notifier callback that initializes
1639         * cpu_cache_get for all new cpus
1640         */
1641        register_cpu_notifier(&cpucache_notifier);
1642
1643#ifdef CONFIG_NUMA
1644        /*
1645         * Register a memory hotplug callback that initializes and frees
1646         * nodelists.
1647         */
1648        hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
1649#endif
1650
1651        /*
1652         * The reap timers are started later, with a module init call: that part
1653         * of the kernel is not yet operational.
1654         */
1655}
1656
1657static int __init cpucache_init(void)
1658{
1659        int cpu;
1660
1661        /*
1662         * Register the timers that return unneeded pages to the page allocator
1663         */
1664        for_each_online_cpu(cpu)
1665                start_cpu_timer(cpu);
1666        return 0;
1667}
1668__initcall(cpucache_init);
1669
1670/*
1671 * Interface to system's page allocator. No need to hold the cache-lock.
1672 *
1673 * If we requested dmaable memory, we will get it. Even if we
1674 * did not request dmaable memory, we might get it, but that
1675 * would be relatively rare and ignorable.
1676 */
1677static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1678{
1679        struct page *page;
1680        int nr_pages;
1681        int i;
1682
1683#ifndef CONFIG_MMU
1684        /*
1685         * Nommu uses slabs for process anonymous memory allocations, and thus
1686         * requires __GFP_COMP to properly refcount higher order allocations
1687         */
1688        flags |= __GFP_COMP;
1689#endif
1690
1691        flags |= cachep->gfpflags;
1692        if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1693                flags |= __GFP_RECLAIMABLE;
1694
1695        page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
1696        if (!page)
1697                return NULL;
1698
1699        nr_pages = (1 << cachep->gfporder);
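        /* Account the new pages in the zone's slab page-state counters. */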
1700        if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1701                add_zone_page_state(page_zone(page),
1702                        NR_SLAB_RECLAIMABLE, nr_pages);
1703        else
1704                add_zone_page_state(page_zone(page),
1705                        NR_SLAB_UNRECLAIMABLE, nr_pages);
1706        for (i = 0; i < nr_pages; i++)
1707                __SetPageSlab(page + i);
1708
1709        if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
1710                kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
1711
1712                if (cachep->ctor)
1713                        kmemcheck_mark_uninitialized_pages(page, nr_pages);
1714                else
1715                        kmemcheck_mark_unallocated_pages(page, nr_pages);
1716        }
1717
1718        return page_address(page);
1719}
1720
1721/*
1722 * Interface to system's page release.
1723 */
1724static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1725{
1726        unsigned long i = (1 << cachep->gfporder);
1727        struct page *page = virt_to_page(addr);
1728        const unsigned long nr_freed = i;
1729
1730        kmemcheck_free_shadow(page, cachep->gfporder);
1731
1732        if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1733                sub_zone_page_state(page_zone(page),
1734                                NR_SLAB_RECLAIMABLE, nr_freed);
1735        else
1736                sub_zone_page_state(page_zone(page),
1737                                NR_SLAB_UNRECLAIMABLE, nr_freed);
1738        while (i--) {
1739                BUG_ON(!PageSlab(page));
1740                __ClearPageSlab(page);
1741                page++;
1742        }
1743        if (current->reclaim_state)
1744                current->reclaim_state->reclaimed_slab += nr_freed;
1745        free_pages((unsigned long)addr, cachep->gfporder);
1746}
1747
1748static void kmem_rcu_free(struct rcu_head *head)
1749{
1750        struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
1751        struct kmem_cache *cachep = slab_rcu->cachep;
1752
1753        kmem_freepages(cachep, slab_rcu->addr);
1754        if (OFF_SLAB(cachep))
1755                kmem_cache_free(cachep->slabp_cache, slab_rcu);
1756}
1757
1758#if DEBUG
1759
1760#ifdef CONFIG_DEBUG_PAGEALLOC
1761static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
1762                            unsigned long caller)
1763{
1764        int size = obj_size(cachep);
1765
1766        addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)];
1767
1768        if (size < 5 * sizeof(unsigned long))
1769                return;
1770
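        /*
         * Lay down a start marker, the caller and the current cpu, then fill
         * the remaining space with return addresses harvested from the
         * current stack, finishing with an end marker.
         */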
1771        *addr++ = 0x12345678;
1772        *addr++ = caller;
1773        *addr++ = smp_processor_id();
1774        size -= 3 * sizeof(unsigned long);
1775        {
1776                unsigned long *sptr = &caller;
1777                unsigned long svalue;
1778
1779                while (!kstack_end(sptr)) {
1780                        svalue = *sptr++;
1781                        if (kernel_text_address(svalue)) {
1782                                *addr++ = svalue;
1783                                size -= sizeof(unsigned long);
1784                                if (size <= sizeof(unsigned long))
1785                                        break;
1786                        }
1787                }
1788
1789        }
1790        *addr++ = 0x87654321;
1791}
1792#endif
1793
1794static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
1795{
1796        int size = obj_size(cachep);
1797        addr = &((char *)addr)[obj_offset(cachep)];
1798
1799        memset(addr, val, size);
1800        *(unsigned char *)(addr + size - 1) = POISON_END;
1801}
1802
1803static void dump_line(char *data, int offset, int limit)
1804{
1805        int i;
1806        unsigned char error = 0;
1807        int bad_count = 0;
1808
1809        printk(KERN_ERR "%03x:", offset);
1810        for (i = 0; i < limit; i++) {
1811                if (data[offset + i] != POISON_FREE) {
1812                        error = data[offset + i];
1813                        bad_count++;
1814                }
1815                printk(" %02x", (unsigned char)data[offset + i]);
1816        }
1817        printk("\n");
1818
1819        if (bad_count == 1) {
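                /*
                 * A single mismatching byte: XOR against the expected poison
                 * value leaves only the flipped bits set.  If that result is
                 * a power of two, exactly one bit differs, which points at a
                 * hardware (RAM) error rather than a software overwrite.
                 */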
1820                error ^= POISON_FREE;
1821                if (!(error & (error - 1))) {
1822                        printk(KERN_ERR "Single bit error detected. Probably "
1823                                        "bad RAM.\n");
1824#ifdef CONFIG_X86
1825                        printk(KERN_ERR "Run memtest86+ or a similar memory "
1826                                        "test tool.\n");
1827#else
1828                        printk(KERN_ERR "Run a memory test tool.\n");
1829#endif
1830                }
1831        }
1832}
1833#endif
1834
1835#if DEBUG
1836
1837static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
1838{
1839        int i, size;
1840        char *realobj;
1841
1842        if (cachep->flags & SLAB_RED_ZONE) {
1843                printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n",
1844                        *dbg_redzone1(cachep, objp),
1845                        *dbg_redzone2(cachep, objp));
1846        }
1847
1848        if (cachep->flags & SLAB_STORE_USER) {
1849                printk(KERN_ERR "Last user: [<%p>]",
1850                        *dbg_userword(cachep, objp));
1851                print_symbol("(%s)",
1852                                (unsigned long)*dbg_userword(cachep, objp));
1853                printk("\n");
1854        }
1855        realobj = (char *)objp + obj_offset(cachep);
1856        size = obj_size(cachep);
1857        for (i = 0; i < size && lines; i += 16, lines--) {
1858                int limit;
1859                limit = 16;
1860                if (i + limit > size)
1861                        limit = size - i;
1862                dump_line(realobj, i, limit);
1863        }
1864}
1865
1866static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1867{
1868        char *realobj;
1869        int size, i;
1870        int lines = 0;
1871
1872        realobj = (char *)objp + obj_offset(cachep);
1873        size = obj_size(cachep);
1874
1875        for (i = 0; i < size; i++) {
1876                char exp = POISON_FREE;
1877                if (i == size - 1)
1878                        exp = POISON_END;
1879                if (realobj[i] != exp) {
1880                        int limit;
1881                        /* Mismatch ! */
1882                        /* Print header */
1883                        if (lines == 0) {
1884                                printk(KERN_ERR
1885                                        "Slab corruption: %s start=%p, len=%d\n",
1886                                        cachep->name, realobj, size);
1887                                print_objinfo(cachep, objp, 0);
1888                        }
1889                        /* Hexdump the affected line */
1890                        i = (i / 16) * 16;
1891                        limit = 16;
1892                        if (i + limit > size)
1893                                limit = size - i;
1894                        dump_line(realobj, i, limit);
1895                        i += 16;
1896                        lines++;
1897                        /* Limit to 5 lines */
1898                        if (lines > 5)
1899                                break;
1900                }
1901        }
1902        if (lines != 0) {
1903                /* Print some data about the neighboring objects, if they
1904                 * exist:
1905                 */
1906                struct slab *slabp = virt_to_slab(objp);
1907                unsigned int objnr;
1908
1909                objnr = obj_to_index(cachep, slabp, objp);
1910                if (objnr) {
1911                        objp = index_to_obj(cachep, slabp, objnr - 1);
1912                        realobj = (char *)objp + obj_offset(cachep);
1913                        printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
1914                               realobj, size);
1915                        print_objinfo(cachep, objp, 2);
1916                }
1917                if (objnr + 1 < cachep->num) {
1918                        objp = index_to_obj(cachep, slabp, objnr + 1);
1919                        realobj = (char *)objp + obj_offset(cachep);
1920                        printk(KERN_ERR "Next obj: start=%p, len=%d\n",
1921                               realobj, size);
1922                        print_objinfo(cachep, objp, 2);
1923                }
1924        }
1925}
1926#endif
1927
1928#if DEBUG
1929static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp)
1930{
1931        int i;
1932        for (i = 0; i < cachep->num; i++) {
1933                void *objp = index_to_obj(cachep, slabp, i);
1934
1935                if (cachep->flags & SLAB_POISON) {
1936#ifdef CONFIG_DEBUG_PAGEALLOC
1937                        if (cachep->buffer_size % PAGE_SIZE == 0 &&
1938                                        OFF_SLAB(cachep))
1939                                kernel_map_pages(virt_to_page(objp),
1940                                        cachep->buffer_size / PAGE_SIZE, 1);
1941                        else
1942                                check_poison_obj(cachep, objp);
1943#else
1944                        check_poison_obj(cachep, objp);
1945#endif
1946                }
1947                if (cachep->flags & SLAB_RED_ZONE) {
1948                        if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
1949                                slab_error(cachep, "start of a freed object "
1950                                           "was overwritten");
1951                        if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
1952                                slab_error(cachep, "end of a freed object "
1953                                           "was overwritten");
1954                }
1955        }
1956}
1957#else
1958static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp)
1959{
1960}
1961#endif
1962
1963/**
1964 * slab_destroy - destroy and release all objects in a slab
1965 * @cachep: cache pointer being destroyed
1966 * @slabp: slab pointer being destroyed
1967 *
1968 * Destroy all the objs in a slab, and release the mem back to the system.
1969 * Before calling, the slab must have been unlinked from the cache.  The
1970 * cache-lock is not held/needed.
1971 */
1972static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
1973{
1974        void *addr = slabp->s_mem - slabp->colouroff;
1975
1976        slab_destroy_debugcheck(cachep, slabp);
1977        if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
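                /*
                 * Defer freeing the pages until after an RCU grace period.
                 * The rcu head lives in the slab descriptor itself, which is
                 * reinterpreted as a struct slab_rcu here.
                 */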
1978                struct slab_rcu *slab_rcu;
1979
1980                slab_rcu = (struct slab_rcu *)slabp;
1981                slab_rcu->cachep = cachep;
1982                slab_rcu->addr = addr;
1983                call_rcu(&slab_rcu->head, kmem_rcu_free);
1984        } else {
1985                kmem_freepages(cachep, addr);
1986                if (OFF_SLAB(cachep))
1987                        kmem_cache_free(cachep->slabp_cache, slabp);
1988        }
1989}
1990
1991static void __kmem_cache_destroy(struct kmem_cache *cachep)
1992{
1993        int i;
1994        struct kmem_list3 *l3;
1995
1996        for_each_online_cpu(i)
1997            kfree(cachep->array[i]);
1998
1999        /* NUMA: free the list3 structures */
2000        for_each_online_node(i) {
2001                l3 = cachep->nodelists[i];
2002                if (l3) {
2003                        kfree(l3->shared);
2004                        free_alien_cache(l3->alien);
2005                        kfree(l3);
2006                }
2007        }
2008        kmem_cache_free(&cache_cache, cachep);
2009}
2010
2011
2012/**
2013 * calculate_slab_order - calculate size (page order) of slabs
2014 * @cachep: pointer to the cache that is being created
2015 * @size: size of objects to be created in this cache.
2016 * @align: required alignment for the objects.
2017 * @flags: slab allocation flags
2018 *
2019 * Also calculates the number of objects per slab.
2020 *
2021 * This could be made much more intelligent.  For now, try to avoid using
2022 * high order pages for slabs.  When the gfp() functions are more friendly
2023 * towards high-order requests, this should be changed.
2024 */
2025static size_t calculate_slab_order(struct kmem_cache *cachep,
2026                        size_t size, size_t align, unsigned long flags)
2027{
2028        unsigned long offslab_limit;
2029        size_t left_over = 0;
2030        int gfporder;
2031
2032        for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
2033                unsigned int num;
2034                size_t remainder;
2035
2036                cache_estimate(gfporder, size, align, flags, &remainder, &num);
2037                if (!num)
2038                        continue;
2039
2040                if (flags & CFLGS_OFF_SLAB) {
2041                        /*
2042                         * Max number of objs-per-slab for caches which
2043                         * use off-slab slabs. Needed to avoid a possible
2044                         * looping condition in cache_grow().
2045                         */
2046                        offslab_limit = size - sizeof(struct slab);
2047                        offslab_limit /= sizeof(kmem_bufctl_t);
2048
2049                        if (num > offslab_limit)
2050                                break;
2051                }
2052
2053                /* Found something acceptable - save it away */
2054                cachep->num = num;
2055                cachep->gfporder = gfporder;
2056                left_over = remainder;
2057
2058                /*
2059                 * A VFS-reclaimable slab tends to have most allocations
2060                 * as GFP_NOFS and we really don't want to have to be allocating
2061                 * higher-order pages when we are unable to shrink dcache.
2062                 */
2063                if (flags & SLAB_RECLAIM_ACCOUNT)
2064                        break;
2065
2066                /*
2067                 * Large number of objects is good, but very large slabs are
2068                 * currently bad for the gfp()s.
2069                 */
2070                if (gfporder >= slab_break_gfp_order)
2071                        break;
2072
2073                /*
2074                 * Acceptable internal fragmentation (at most 1/8 of the slab wasted)?
2075                 */
2076                if (left_over * 8 <= (PAGE_SIZE << gfporder))
2077                        break;
2078        }
2079        return left_over;
2080}
2081
2082static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2083{
2084        if (g_cpucache_up == FULL)
2085                return enable_cpucache(cachep, gfp);
2086
2087        if (g_cpucache_up == NONE) {
2088                /*
2089                 * Note: the first kmem_cache_create must create the cache
2090                 * that's used by kmalloc(24), otherwise the creation of
2091                 * further caches will BUG().
2092                 */
2093                cachep->array[smp_processor_id()] = &initarray_generic.cache;
2094
2095                /*
2096                 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
2097                 * the first cache, then we need to set up all its list3s,
2098                 * otherwise the creation of further caches will BUG().
2099                 */
2100                set_up_list3s(cachep, SIZE_AC);
2101                if (INDEX_AC == INDEX_L3)
2102                        g_cpucache_up = PARTIAL_L3;
2103                else
2104                        g_cpucache_up = PARTIAL_AC;
2105        } else {
2106                cachep->array[smp_processor_id()] =
2107                        kmalloc(sizeof(struct arraycache_init), gfp);
2108
2109                if (g_cpucache_up == PARTIAL_AC) {
2110                        set_up_list3s(cachep, SIZE_L3);
2111                        g_cpucache_up = PARTIAL_L3;
2112                } else {
2113                        int node;
2114                        for_each_online_node(node) {
2115                                cachep->nodelists[node] =
2116                                    kmalloc_node(sizeof(struct kmem_list3),
2117                                                gfp, node);
2118                                BUG_ON(!cachep->nodelists[node]);
2119                                kmem_list3_init(cachep->nodelists[node]);
2120                        }
2121                }
2122        }
2123        cachep->nodelists[numa_mem_id()]->next_reap =
2124                        jiffies + REAPTIMEOUT_LIST3 +
2125                        ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
2126
2127        cpu_cache_get(cachep)->avail = 0;
2128        cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
2129        cpu_cache_get(cachep)->batchcount = 1;
2130        cpu_cache_get(cachep)->touched = 0;
2131        cachep->batchcount = 1;
2132        cachep->limit = BOOT_CPUCACHE_ENTRIES;
2133        return 0;
2134}
2135
2136/**
2137 * kmem_cache_create - Create a cache.
2138 * @name: A string which is used in /proc/slabinfo to identify this cache.
2139 * @size: The size of objects to be created in this cache.
2140 * @align: The required alignment for the objects.
2141 * @flags: SLAB flags
2142 * @ctor: A constructor for the objects.
2143 *
2144 * Returns a ptr to the cache on success, NULL on failure.
2145 * Cannot be called within an interrupt, but can be interrupted.
2146 * The @ctor is run when new pages are allocated by the cache.
2147 *
2148 * @name must be valid until the cache is destroyed. This implies that
2149 * the module calling this has to destroy the cache before getting unloaded.
2150 * Note that kmem_cache_name() is not guaranteed to return the same pointer,
2151 * therefore applications must manage it themselves.
2152 *
2153 * The flags are
2154 *
2155 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
2156 * to catch references to uninitialised memory.
2157 *
2158 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
2159 * for buffer overruns.
2160 *
2161 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
2162 * cacheline.  This can be beneficial if you're counting cycles as closely
2163 * as davem.
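 *
 * A minimal usage sketch (the cache and structure names here are only
 * illustrative, not part of this file):
 *
 *      static struct kmem_cache *foo_cache;
 *
 *      foo_cache = kmem_cache_create("foo_cache", sizeof(struct foo), 0,
 *                                    SLAB_HWCACHE_ALIGN, NULL);
 *      if (!foo_cache)
 *              return -ENOMEM;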
2164 */
2165struct kmem_cache *
2166kmem_cache_create (const char *name, size_t size, size_t align,
2167        unsigned long flags, void (*ctor)(void *))
2168{
2169        size_t left_over, slab_size, ralign;
2170        struct kmem_cache *cachep = NULL, *pc;
2171        gfp_t gfp;
2172
2173        /*
2174         * Sanity checks... these are all serious usage bugs.
2175         */
2176        if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
2177            size > KMALLOC_MAX_SIZE) {
2178                printk(KERN_ERR "%s: Early error in slab %s\n", __func__,
2179                                name);
2180                BUG();
2181        }
2182
2183        /*
2184         * We use cache_chain_mutex to ensure a consistent view of
2185         * cpu_online_mask as well.  Please see cpuup_callback
2186         */
2187        if (slab_is_available()) {
2188                get_online_cpus();
2189                mutex_lock(&cache_chain_mutex);
2190        }
2191
2192        list_for_each_entry(pc, &cache_chain, next) {
2193                char tmp;
2194                int res;
2195
2196                /*
2197                 * This happens when the module gets unloaded and doesn't
2198                 * destroy its slab cache and no-one else reuses the vmalloc
2199                 * area of the module.  Print a warning.
2200                 */
2201                res = probe_kernel_address(pc->name, tmp);
2202                if (res) {
2203                        printk(KERN_ERR
2204                               "SLAB: cache with size %d has lost its name\n",
2205                               pc->buffer_size);
2206                        continue;
2207                }
2208
2209                if (!strcmp(pc->name, name)) {
2210                        printk(KERN_ERR
2211                               "kmem_cache_create: duplicate cache %s\n", name);
2212                        dump_stack();
2213                        goto oops;
2214                }
2215        }
2216
2217#if DEBUG
2218        WARN_ON(strchr(name, ' '));     /* It confuses parsers */
2219#if FORCED_DEBUG
2220        /*
2221         * Enable redzoning and last user accounting, except for caches with
2222         * large objects, if the increased size would increase the object size
2223         * above the next power of two: caches with object sizes just above a
2224         * power of two have a significant amount of internal fragmentation.
2225         */
2226        if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN +
2227                                                2 * sizeof(unsigned long long)))
2228                flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
2229        if (!(flags & SLAB_DESTROY_BY_RCU))
2230                flags |= SLAB_POISON;
2231#endif
2232        if (flags & SLAB_DESTROY_BY_RCU)
2233                BUG_ON(flags & SLAB_POISON);
2234#endif
2235        /*
2236         * Always check flags; a caller might be expecting debug support which
2237         * isn't available.
2238         */
2239        BUG_ON(flags & ~CREATE_MASK);
2240
2241        /*
2242         * Check that size is in terms of words.  This is needed to avoid
2243         * unaligned accesses for some archs when redzoning is used, and makes
2244         * sure any on-slab bufctl's are also correctly aligned.
2245         */
2246        if (size & (BYTES_PER_WORD - 1)) {
2247                size += (BYTES_PER_WORD - 1);
2248                size &= ~(BYTES_PER_WORD - 1);
2249        }
2250
2251        /* calculate the final buffer alignment: */
2252
2253        /* 1) arch recommendation: can be overridden for debug */
2254        if (flags & SLAB_HWCACHE_ALIGN) {
2255                /*
2256                 * Default alignment: as specified by the arch code.  Except if
2257                 * an object is really small, then squeeze multiple objects into
2258                 * one cacheline.
2259                 */
2260                ralign = cache_line_size();
2261                while (size <= ralign / 2)
2262                        ralign /= 2;
2263        } else {
2264                ralign = BYTES_PER_WORD;
2265        }
2266
2267        /*
2268         * Redzoning and user store require word alignment or possibly larger.
2269         * Note this will be overridden by architecture or caller mandated
2270         * alignment if either is greater than BYTES_PER_WORD.
2271         */
2272        if (flags & SLAB_STORE_USER)
2273                ralign = BYTES_PER_WORD;
2274
2275        if (flags & SLAB_RED_ZONE) {
2276                ralign = REDZONE_ALIGN;
2277                /* If redzoning, ensure that the second redzone is suitably
2278                 * aligned, by adjusting the object size accordingly. */
2279                size += REDZONE_ALIGN - 1;
2280                size &= ~(REDZONE_ALIGN - 1);
2281        }
2282
2283        /* 2) arch mandated alignment */
2284        if (ralign < ARCH_SLAB_MINALIGN) {
2285                ralign = ARCH_SLAB_MINALIGN;
2286        }
2287        /* 3) caller mandated alignment */
2288        if (ralign < align) {
2289                ralign = align;
2290        }
2291        /* disable debug if not aligning with REDZONE_ALIGN */
2292        if (ralign & (__alignof__(unsigned long long) - 1))
2293                flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2294        /*
2295         * 4) Store it.
2296         */
2297        align = ralign;
2298
2299        if (slab_is_available())
2300                gfp = GFP_KERNEL;
2301        else
2302                gfp = GFP_NOWAIT;
2303
2304        /* Get cache's description obj. */
2305        cachep = kmem_cache_zalloc(&cache_cache, gfp);
2306        if (!cachep)
2307                goto oops;
2308
2309#if DEBUG
2310        cachep->obj_size = size;
2311
2312        /*
2313         * Both debugging options require word-alignment which is calculated
2314         * into align above.
2315         */
2316        if (flags & SLAB_RED_ZONE) {
2317                /* add space for red zone words */
2318                cachep->obj_offset += align;
2319                size += align + sizeof(unsigned long long);
2320        }
2321        if (flags & SLAB_STORE_USER) {
2322                /* user store requires one word storage behind the end of
2323                 * the real object. But if the second red zone needs to be
2324                 * aligned to 64 bits, we must allow that much space.
2325                 */
2326                if (flags & SLAB_RED_ZONE)
2327                        size += REDZONE_ALIGN;
2328                else
2329                        size += BYTES_PER_WORD;
2330        }
2331#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
2332        if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
2333            && cachep->obj_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) {
2334                cachep->obj_offset += PAGE_SIZE - ALIGN(size, align);
2335                size = PAGE_SIZE;
2336        }
2337#endif
2338#endif
2339
2340        /*
2341         * Determine if the slab management is 'on' or 'off' slab.
2342         * (Bootstrapping cannot cope with off-slab caches, so don't do
2343         * it too early on.  Always use on-slab management when
2344         * SLAB_NOLEAKTRACE is set, to avoid recursive calls into kmemleak.)
2345         */
2346        if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
2347            !(flags & SLAB_NOLEAKTRACE))
2348                /*
2349                 * Size is large, so it is best to place the slab management obj
2350                 * off-slab (this should allow better packing of objs).
2351                 */
2352                flags |= CFLGS_OFF_SLAB;
2353
2354        size = ALIGN(size, align);
2355
2356        left_over = calculate_slab_order(cachep, size, align, flags);
2357
2358        if (!cachep->num) {
2359                printk(KERN_ERR
2360                       "kmem_cache_create: couldn't create cache %s.\n", name);
2361                kmem_cache_free(&cache_cache, cachep);
2362                cachep = NULL;
2363                goto oops;
2364        }
2365        slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
2366                          + sizeof(struct slab), align);
2367
2368        /*
2369         * If the slab has been placed off-slab and we have enough space, then
2370         * move it on-slab.  This is at the expense of any extra colouring.
2371         */
2372        if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
2373                flags &= ~CFLGS_OFF_SLAB;
2374                left_over -= slab_size;
2375        }
2376
2377        if (flags & CFLGS_OFF_SLAB) {
2378                /* really off slab. No need for manual alignment */
2379                slab_size =
2380                    cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
2381
2382#ifdef CONFIG_PAGE_POISONING
2383                /* If we're going to use the generic kernel_map_pages()
2384                 * poisoning, then it's going to smash the contents of
2385                 * the redzone and userword anyhow, so switch them off.
2386                 */
2387                if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
2388                        flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2389#endif
2390        }
2391
2392        cachep->colour_off = cache_line_size();
2393        /* Offset must be a multiple of the alignment. */
2394        if (cachep->colour_off < align)
2395                cachep->colour_off = align;
2396        cachep->colour = left_over / cachep->colour_off;
2397        cachep->slab_size = slab_size;
2398        cachep->flags = flags;
2399        cachep->gfpflags = 0;
2400        if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
2401                cachep->gfpflags |= GFP_DMA;
2402        cachep->buffer_size = size;
2403        cachep->reciprocal_buffer_size = reciprocal_value(size);
2404
2405        if (flags & CFLGS_OFF_SLAB) {
2406                cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
2407                /*
2408                 * This is a possibility for one of the malloc_sizes caches.
2409                 * But since we go off slab only for object size greater than
2410                 * PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
2411                 * this should not happen at all.
2412                 * But leave a BUG_ON for some lucky dude.
2413                 */
2414                BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
2415        }
2416        cachep->ctor = ctor;
2417        cachep->name = name;
2418
2419        if (setup_cpu_cache(cachep, gfp)) {
2420                __kmem_cache_destroy(cachep);
2421                cachep = NULL;
2422                goto oops;
2423        }
2424
2425        /* cache setup completed, link it into the list */
2426        list_add(&cachep->next, &cache_chain);
2427oops:
2428        if (!cachep && (flags & SLAB_PANIC))
2429                panic("kmem_cache_create(): failed to create slab `%s'\n",
2430                      name);
2431        if (slab_is_available()) {
2432                mutex_unlock(&cache_chain_mutex);
2433                put_online_cpus();
2434        }
2435        return cachep;
2436}
2437EXPORT_SYMBOL(kmem_cache_create);
2438
2439#if DEBUG
2440static void check_irq_off(void)
2441{
2442        BUG_ON(!irqs_disabled());
2443}
2444
2445static void check_irq_on(void)
2446{
2447        BUG_ON(irqs_disabled());
2448}
2449
2450static void check_spinlock_acquired(struct kmem_cache *cachep)
2451{
2452#ifdef CONFIG_SMP
2453        check_irq_off();
2454        assert_spin_locked(&cachep->nodelists[numa_mem_id()]->list_lock);
2455#endif
2456}
2457
2458static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
2459{
2460#ifdef CONFIG_SMP
2461        check_irq_off();
2462        assert_spin_locked(&cachep->nodelists[node]->list_lock);
2463#endif
2464}
2465
2466#else
2467#define check_irq_off() do { } while(0)
2468#define check_irq_on()  do { } while(0)
2469#define check_spinlock_acquired(x) do { } while(0)
2470#define check_spinlock_acquired_node(x, y) do { } while(0)
2471#endif
2472
2473static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
2474                        struct array_cache *ac,
2475                        int force, int node);
2476
2477static void do_drain(void *arg)
2478{
2479        struct kmem_cache *cachep = arg;
2480        struct array_cache *ac;
2481        int node = numa_mem_id();
2482
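        /*
         * Runs on each cpu with interrupts off: give this cpu's array cache
         * entries back to the per-node lists and empty the array.
         */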
2483        check_irq_off();
2484        ac = cpu_cache_get(cachep);
2485        spin_lock(&cachep->nodelists[node]->list_lock);
2486        free_block(cachep, ac->entry, ac->avail, node);
2487        spin_unlock(&cachep->nodelists[node]->list_lock);
2488        ac->avail = 0;
2489}
2490
2491static void drain_cpu_caches(struct kmem_cache *cachep)
2492{
2493        struct kmem_list3 *l3;
2494        int node;
2495
2496        on_each_cpu(do_drain, cachep, 1);
2497        check_irq_on();
2498        for_each_online_node(node) {
2499                l3 = cachep->nodelists[node];
2500                if (l3 && l3->alien)
2501                        drain_alien_cache(cachep, l3->alien);
2502        }
2503
2504        for_each_online_node(node) {
2505                l3 = cachep->nodelists[node];
2506                if (l3)
2507                        drain_array(cachep, l3, l3->shared, 1, node);
2508        }
2509}
2510
2511/*
2512 * Remove slabs from the list of free slabs.
2513 * Specify the number of slabs to drain in tofree.
2514 *
2515 * Returns the actual number of slabs released.
2516 */
2517static int drain_freelist(struct kmem_cache *cache,
2518                        struct kmem_list3 *l3, int tofree)
2519{
2520        struct list_head *p;
2521        int nr_freed;
2522        struct slab *slabp;
2523
2524        nr_freed = 0;
2525        while (nr_freed < tofree && !list_empty(&l3->slabs_free)) {
2526
2527                spin_lock_irq(&l3->list_lock);
2528                p = l3->slabs_free.prev;
2529                if (p == &l3->slabs_free) {
2530                        spin_unlock_irq(&l3->list_lock);
2531                        goto out;
2532                }
2533
2534                slabp = list_entry(p, struct slab, list);
2535#if DEBUG
2536                BUG_ON(slabp->inuse);
2537#endif
2538                list_del(&slabp->list);
2539                /*
2540                 * Safe to drop the lock. The slab is no longer linked
2541                 * to the cache.
2542                 */
2543                l3->free_objects -= cache->num;
2544                spin_unlock_irq(&l3->list_lock);
2545                slab_destroy(cache, slabp);
2546                nr_freed++;
2547        }
2548out:
2549        return nr_freed;
2550}
2551
2552/* Called with cache_chain_mutex held to protect against cpu hotplug */
2553static int __cache_shrink(struct kmem_cache *cachep)
2554{
2555        int ret = 0, i = 0;
2556        struct kmem_list3 *l3;
2557
2558        drain_cpu_caches(cachep);
2559
2560        check_irq_on();
2561        for_each_online_node(i) {
2562                l3 = cachep->nodelists[i];
2563                if (!l3)
2564                        continue;
2565
2566                drain_freelist(cachep, l3, l3->free_objects);
2567
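                /* Count the nodes that still hold allocated objects. */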
2568                ret += !list_empty(&l3->slabs_full) ||
2569                        !list_empty(&l3->slabs_partial);
2570        }
2571        return (ret ? 1 : 0);
2572}
2573
2574/**
2575 * kmem_cache_shrink - Shrink a cache.
2576 * @cachep: The cache to shrink.
2577 *
2578 * Releases as many slabs as possible for a cache.
2579 * To help debugging, a zero exit status indicates all slabs were released.
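 *
 * Illustrative call (the cache pointer name is made up):
 *
 *      if (kmem_cache_shrink(foo_cache))
 *              pr_debug("foo_cache still holds active objects\n");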
2580 */
2581int kmem_cache_shrink(struct kmem_cache *cachep)
2582{
2583        int ret;
2584        BUG_ON(!cachep || in_interrupt());
2585
2586        get_online_cpus();
2587        mutex_lock(&cache_chain_mutex);
2588        ret = __cache_shrink(cachep);
2589        mutex_unlock(&cache_chain_mutex);
2590        put_online_cpus();
2591        return ret;
2592}
2593EXPORT_SYMBOL(kmem_cache_shrink);
2594
2595/**
2596 * kmem_cache_destroy - delete a cache
2597 * @cachep: the cache to destroy
2598 *
2599 * Remove a &struct kmem_cache object from the slab cache.
2600 *
2601 * It is expected this function will be called by a module when it is
2602 * unloaded.  This will remove the cache completely, and avoid a duplicate
2603 * cache being allocated each time a module is loaded and unloaded, if the
2604 * module doesn't have persistent in-kernel storage across loads and unloads.
2605 *
2606 * The cache must be empty before calling this function.
2607 *
2608 * The caller must guarantee that no one will allocate memory from the cache
2609 * during the kmem_cache_destroy().
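 *
 * Typical (illustrative) pairing with kmem_cache_create() in a module's
 * exit path; "foo_cache" is a made-up name:
 *
 *      static void __exit foo_exit(void)
 *      {
 *              kmem_cache_destroy(foo_cache);
 *      }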
2610 */
2611void kmem_cache_destroy(struct kmem_cache *cachep)
2612{
2613        BUG_ON(!cachep || in_interrupt());
2614
2615        /* Find the cache in the chain of caches. */
2616        get_online_cpus();
2617        mutex_lock(&cache_chain_mutex);
2618        /*
2619         * the chain is never empty, cache_cache is never destroyed
2620         */
2621        list_del(&cachep->next);
2622        if (__cache_shrink(cachep)) {
2623                slab_error(cachep, "Can't free all objects");
2624                list_add(&cachep->next, &cache_chain);
2625                mutex_unlock(&cache_chain_mutex);
2626                put_online_cpus();
2627                return;
2628        }
2629
2630        if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
2631                rcu_barrier();
2632
2633        __kmem_cache_destroy(cachep);
2634        mutex_unlock(&cache_chain_mutex);
2635        put_online_cpus();
2636}
2637EXPORT_SYMBOL(kmem_cache_destroy);
2638
2639/*
2640 * Get the memory for a slab management obj.
2641 * When the slab descriptor is off-slab, the descriptors always come from
2642 * the malloc_sizes caches.  The slab descriptor cannot come from the same
2643 * cache that is being created because, when we search for an appropriate
2644 * cache for these descriptors in kmem_cache_create(), we search through
2645 * the malloc_sizes array.  If we are creating a malloc_sizes cache here,
2646 * it would not be visible to kmem_find_general_cachep() until the
2647 * initialization is complete.  Hence slabp_cache can never be the same
2648 * as the cache being created.
2649 */
2650static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
2651                                   int colour_off, gfp_t local_flags,
2652                                   int nodeid)
2653{
2654        struct slab *slabp;
2655
2656        if (OFF_SLAB(cachep)) {
2657                /* Slab management obj is off-slab. */
2658                slabp = kmem_cache_alloc_node(cachep->slabp_cache,
2659                                              local_flags, nodeid);
2660                if (!slabp)
2661                        return NULL;
2662                /*
2663                 * If the first object in the slab is leaked (it's allocated
2664                 * but no one has a reference to it), we want to make sure
2665                 * kmemleak does not treat the ->s_mem pointer as a reference
2666                 * to the object. Otherwise we will not report the leak.
2667                 */
2668                kmemleak_scan_area(&slabp->list, sizeof(struct list_head),
2669                                   local_flags);
2670        } else {
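                /*
                 * On-slab: the descriptor lives inside the slab itself,
                 * just after the colour offset.
                 */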
2671                slabp = objp + colour_off;
2672                colour_off += cachep->slab_size;
2673        }
2674        slabp->inuse = 0;
2675        slabp->colouroff = colour_off;
2676        slabp->s_mem = objp + colour_off;
2677        slabp->nodeid = nodeid;
2678        slabp->free = 0;
2679        return slabp;
2680}
2681
2682static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
2683{
2684        return (kmem_bufctl_t *) (slabp + 1);
2685}
2686
2687static void cache_init_objs(struct kmem_cache *cachep,
2688                            struct slab *slabp)
2689{
2690        int i;
2691
2692        for (i = 0; i < cachep->num; i++) {
2693                void *objp = index_to_obj(cachep, slabp, i);
2694#if DEBUG
2695                /* need to poison the objs? */
2696                if (cachep->flags & SLAB_POISON)
2697                        poison_obj(cachep, objp, POISON_FREE);
2698                if (cachep->flags & SLAB_STORE_USER)
2699                        *dbg_userword(cachep, objp) = NULL;
2700
2701                if (cachep->flags & SLAB_RED_ZONE) {
2702                        *dbg_redzone1(cachep, objp) = RED_INACTIVE;
2703                        *dbg_redzone2(cachep, objp) = RED_INACTIVE;
2704                }
2705                /*
2706                 * Constructors are not allowed to allocate memory from the same
2707                 * cache which they are a constructor for.  Otherwise, deadlock.
2708                 * They must also be threaded.
2709                 */
2710                if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2711                        cachep->ctor(objp + obj_offset(cachep));
2712
2713                if (cachep->flags & SLAB_RED_ZONE) {
2714                        if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
2715                                slab_error(cachep, "constructor overwrote the"
2716                                           " end of an object");
2717                        if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
2718                                slab_error(cachep, "constructor overwrote the"
2719                                           " start of an object");
2720                }
2721                if ((cachep->buffer_size % PAGE_SIZE) == 0 &&
2722                            OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
2723                        kernel_map_pages(virt_to_page(objp),
2724                                         cachep->buffer_size / PAGE_SIZE, 0);
2725#else
2726                if (cachep->ctor)
2727                        cachep->ctor(objp);
2728#endif
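                /*
                 * Chain the objects into the initial free list: entry i
                 * points to object i + 1; the final entry is terminated
                 * with BUFCTL_END below.
                 */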
2729                slab_bufctl(slabp)[i] = i + 1;
2730        }
2731        slab_bufctl(slabp)[i - 1] = BUFCTL_END;
2732}
2733
2734static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
2735{
2736        if (CONFIG_ZONE_DMA_FLAG) {
2737                if (flags & GFP_DMA)
2738                        BUG_ON(!(cachep->gfpflags & GFP_DMA));
2739                else
2740                        BUG_ON(cachep->gfpflags & GFP_DMA);
2741        }
2742}
2743
2744static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
2745                                int nodeid)
2746{
2747        void *objp = index_to_obj(cachep, slabp, slabp->free);
2748        kmem_bufctl_t next;
2749
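        /* Pop the first free object off the slab's bufctl free list. */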
2750        slabp->inuse++;
2751        next = slab_bufctl(slabp)[slabp->free];
2752#if DEBUG
2753        slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2754        WARN_ON(slabp->nodeid != nodeid);
2755#endif
2756        slabp->free = next;
2757
2758        return objp;
2759}
2760
2761static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
2762                                void *objp, int nodeid)
2763{
2764        unsigned int objnr = obj_to_index(cachep, slabp, objp);
2765
2766#if DEBUG
2767        /* Verify that the slab belongs to the intended node */
2768        WARN_ON(slabp->nodeid != nodeid);
2769
2770        if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) {
2771                printk(KERN_ERR "slab: double free detected in cache "
2772                                "'%s', objp %p\n", cachep->name, objp);
2773                BUG();
2774        }
2775#endif
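        /* Push the object back onto the head of the slab's free list. */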
2776        slab_bufctl(slabp)[objnr] = slabp->free;
2777        slabp->free = objnr;
2778        slabp->inuse--;
2779}
2780
2781/*
2782 * Map pages beginning at addr to the given cache and slab. This is required
2783 * for the slab allocator to be able to lookup the cache and slab of a
2784 * virtual address for kfree, ksize, and slab debugging.
2785 */
2786static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
2787                           void *addr)
2788{
2789        int nr_pages;
2790        struct page *page;
2791
2792        page = virt_to_page(addr);
2793
2794        nr_pages = 1;
2795        if (likely(!PageCompound(page)))
2796                nr_pages <<= cache->gfporder;
2797
2798        do {
2799                page_set_cache(page, cache);
2800                page_set_slab(page, slab);
2801                page++;
2802        } while (--nr_pages);
2803}
2804
2805/*
2806 * Grow (by 1) the number of slabs within a cache.  This is called by
2807 * kmem_cache_alloc() when there are no active objs left in a cache.
2808 */
2809static int cache_grow(struct kmem_cache *cachep,
2810                gfp_t flags, int nodeid, void *objp)
2811{
2812        struct slab *slabp;
2813        size_t offset;
2814        gfp_t local_flags;
2815        struct kmem_list3 *l3;
2816
2817        /*
2818         * Be lazy and only check for valid flags here,  keeping it out of the
2819         * critical path in kmem_cache_alloc().
2820         */
2821        BUG_ON(flags & GFP_SLAB_BUG_MASK);
2822        local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
2823
2824        /* Take the l3 list lock to change the colour_next on this node */
2825        check_irq_off();
2826        l3 = cachep->nodelists[nodeid];
2827        spin_lock(&l3->list_lock);
2828
2829        /* Get colour for the slab, and calculate the next value. */
2830        offset = l3->colour_next;
2831        l3->colour_next++;
2832        if (l3->colour_next >= cachep->colour)
2833                l3->colour_next = 0;
2834        spin_unlock(&l3->list_lock);
2835
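        /*
         * The colour offset staggers the start of successive slabs so that
         * their objects do not all map to the same cache lines.
         */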
2836        offset *= cachep->colour_off;
2837
2838        if (local_flags & __GFP_WAIT)
2839                local_irq_enable();
2840
2841        /*
2842         * The test for missing atomic flag is performed here, rather than
2843         * the more obvious place, simply to reduce the critical path length
2844         * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
2845         * will eventually be caught here (where it matters).
2846         */
2847        kmem_flagcheck(cachep, flags);
2848
2849        /*
2850         * Get mem for the objs.  Attempt to allocate a physical page from
2851         * 'nodeid'.
2852         */
2853        if (!objp)
2854                objp = kmem_getpages(cachep, local_flags, nodeid);
2855        if (!objp)
2856                goto failed;
2857
2858        /* Get slab management. */
2859        slabp = alloc_slabmgmt(cachep, objp, offset,
2860                        local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
2861        if (!slabp)
2862                goto opps1;
2863
2864        slab_map_pages(cachep, slabp, objp);
2865
2866        cache_init_objs(cachep, slabp);
2867
2868        if (local_flags & __GFP_WAIT)
2869                local_irq_disable();
2870        check_irq_off();
2871        spin_lock(&l3->list_lock);
2872
2873        /* Make slab active. */
2874        list_add_tail(&slabp->list, &(l3->slabs_free));
2875        STATS_INC_GROWN(cachep);
2876        l3->free_objects += cachep->num;
2877        spin_unlock(&l3->list_lock);
2878        return 1;
2879opps1:
2880        kmem_freepages(cachep, objp);
2881failed:
2882        if (local_flags & __GFP_WAIT)
2883                local_irq_disable();
2884        return 0;
2885}
2886
2887#if DEBUG
2888
2889/*
2890 * Perform extra freeing checks:
2891 * - detect bad pointers.
2892 * - POISON/RED_ZONE checking
2893 */
2894static void kfree_debugcheck(const void *objp)
2895{
2896        if (!virt_addr_valid(objp)) {
2897                printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
2898                       (unsigned long)objp);
2899                BUG();
2900        }
2901}
2902
2903static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
2904{
2905        unsigned long long redzone1, redzone2;
2906
2907        redzone1 = *dbg_redzone1(cache, obj);
2908        redzone2 = *dbg_redzone2(cache, obj);
2909
2910        /*
2911         * Redzone is ok.
2912         */
2913        if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE)
2914                return;
2915
2916        if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE)
2917                slab_error(cache, "double free detected");
2918        else
2919                slab_error(cache, "memory outside object was overwritten");
2920
2921        printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n",
2922                        obj, redzone1, redzone2);
2923}
2924
2925static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2926                                   void *caller)
2927{
2928        struct page *page;
2929        unsigned int objnr;
2930        struct slab *slabp;
2931
2932        BUG_ON(virt_to_cache(objp) != cachep);
2933
2934        objp -= obj_offset(cachep);
2935        kfree_debugcheck(objp);
2936        page = virt_to_head_page(objp);
2937
2938        slabp = page_get_slab(page);
2939
2940        if (cachep->flags & SLAB_RED_ZONE) {
2941                verify_redzone_free(cachep, objp);
2942                *dbg_redzone1(cachep, objp) = RED_INACTIVE;
2943                *dbg_redzone2(cachep, objp) = RED_INACTIVE;
2944        }
2945        if (cachep->flags & SLAB_STORE_USER)
2946                *dbg_userword(cachep, objp) = caller;
2947
2948        objnr = obj_to_index(cachep, slabp, objp);
2949
2950        BUG_ON(objnr >= cachep->num);
2951        BUG_ON(objp != index_to_obj(cachep, slabp, objnr));
2952
2953#ifdef CONFIG_DEBUG_SLAB_LEAK
2954        slab_bufctl(slabp)[objnr] = BUFCTL_FREE;
2955#endif
2956        if (cachep->flags & SLAB_POISON) {
2957#ifdef CONFIG_DEBUG_PAGEALLOC
2958                if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
2959                        store_stackinfo(cachep, objp, (unsigned long)caller);
2960                        kernel_map_pages(virt_to_page(objp),
2961                                         cachep->buffer_size / PAGE_SIZE, 0);
2962                } else {
2963                        poison_obj(cachep, objp, POISON_FREE);
2964                }
2965#else
2966                poison_obj(cachep, objp, POISON_FREE);
2967#endif
2968        }
2969        return objp;
2970}
2971
2972static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
2973{
2974        kmem_bufctl_t i;
2975        int entries = 0;
2976
2977        /* Check slab's freelist to see if this obj is there. */
2978        for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
2979                entries++;
2980                if (entries > cachep->num || i >= cachep->num)
2981                        goto bad;
2982        }
2983        if (entries != cachep->num - slabp->inuse) {
2984bad:
2985                printk(KERN_ERR "slab: Internal list corruption detected in "
2986                                "cache '%s'(%d), slabp %p(%d). Hexdump:\n",
2987                        cachep->name, cachep->num, slabp, slabp->inuse);
2988                for (i = 0;
2989                     i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t);
2990                     i++) {
2991                        if (i % 16 == 0)
2992                                printk("\n%03x:", i);
2993                        printk(" %02x", ((unsigned char *)slabp)[i]);
2994                }
2995                printk("\n");
2996                BUG();
2997        }
2998}
2999#else
3000#define kfree_debugcheck(x) do { } while(0)
3001#define cache_free_debugcheck(x,objp,z) (objp)
3002#define check_slabp(x,y) do { } while(0)
3003#endif
3004
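/*
 * Refill the per-cpu array cache of @cachep and return one object from it.
 * Objects are pulled from the node's shared array if possible, otherwise
 * from partial/free slabs on the local node; if nothing is available the
 * cache is grown.  Called with local interrupts disabled.
 */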
3005static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
3006{
3007        int batchcount;
3008        struct kmem_list3 *l3;
3009        struct array_cache *ac;
3010        int node;
3011
3012retry:
3013        check_irq_off();
3014        node = numa_mem_id();
3015        ac = cpu_cache_get(cachep);
3016        batchcount = ac->batchcount;
3017        if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
3018                /*
3019                 * If there was little recent activity on this cache, then
3020                 * perform only a partial refill.  Otherwise we could generate
3021                 * refill bouncing.
3022                 */
3023                batchcount = BATCHREFILL_LIMIT;
3024        }
3025        l3 = cachep->nodelists[node];
3026
3027        BUG_ON(ac->avail > 0 || !l3);
3028        spin_lock(&l3->list_lock);
3029
3030        /* See if we can refill from the shared array */
3031        if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) {
3032                l3->shared->touched = 1;
3033                goto alloc_done;
3034        }
3035
3036        while (batchcount > 0) {
3037                struct list_head *entry;
3038                struct slab *slabp;
3039                /* Get the slab the allocation will come from. */
3040                entry = l3->slabs_partial.next;
3041                if (entry == &l3->slabs_partial) {
3042                        l3->free_touched = 1;
3043                        entry = l3->slabs_free.next;
3044                        if (entry == &l3->slabs_free)
3045                                goto must_grow;
3046                }
3047
3048                slabp = list_entry(entry, struct slab, list);
3049                check_slabp(cachep, slabp);
3050                check_spinlock_acquired(cachep);
3051
3052                /*
3053                 * The slab was on either the partial or the free list, so
3054                 * there must be at least one object available for
3055                 * allocation.
3056                 */
3057                BUG_ON(slabp->inuse >= cachep->num);
3058
3059                while (slabp->inuse < cachep->num && batchcount--) {
3060                        STATS_INC_ALLOCED(cachep);
3061                        STATS_INC_ACTIVE(cachep);
3062                        STATS_SET_HIGH(cachep);
3063
3064                        ac->entry[ac->avail++] = slab_get_obj(cachep, slabp,
3065                                                            node);
3066                }
3067                check_slabp(cachep, slabp);
3068
3069                /* move slabp to the correct list: */
3070                list_del(&slabp->list);
3071                if (slabp->free == BUFCTL_END)
3072                        list_add(&slabp->list, &l3->slabs_full);
3073                else
3074                        list_add(&slabp->list, &l3->slabs_partial);
3075        }
3076
3077must_grow:
3078        l3->free_objects -= ac->avail;
3079alloc_done:
3080        spin_unlock(&l3->list_lock);
3081
3082        if (unlikely(!ac->avail)) {
3083                int x;
3084                x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
3085
3086                /* cache_grow can re-enable interrupts, so ac could have changed. */
3087                ac = cpu_cache_get(cachep);
3088                if (!x && ac->avail == 0)       /* no objects in sight? abort */
3089                        return NULL;
3090
3091                if (!ac->avail)         /* objects refilled by interrupt? */
3092                        goto retry;
3093        }
3094        ac->touched = 1;
3095        return ac->entry[--ac->avail];
3096}
3097
3098static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
3099                                                gfp_t flags)
3100{
3101        might_sleep_if(flags & __GFP_WAIT);
3102#if DEBUG
3103        kmem_flagcheck(cachep, flags);
3104#endif
3105}
3106
3107#if DEBUG
3108static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3109                                gfp_t flags, void *objp, void *caller)
3110{
3111        if (!objp)
3112                return objp;
3113        if (cachep->flags & SLAB_POISON) {
3114#ifdef CONFIG_DEBUG_PAGEALLOC
3115                if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
3116                        kernel_map_pages(virt_to_page(objp),
3117                                         cachep->buffer_size / PAGE_SIZE, 1);
3118                else
3119                        check_poison_obj(cachep, objp);
3120#else
3121                check_poison_obj(cachep, objp);
3122#endif
3123                poison_obj(cachep, objp, POISON_INUSE);
3124        }
3125        if (cachep->flags & SLAB_STORE_USER)
3126                *dbg_userword(cachep, objp) = caller;
3127
3128        if (cachep->flags & SLAB_RED_ZONE) {
3129                if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
3130                                *dbg_redzone2(cachep, objp) != RED_INACTIVE) {
3131                        slab_error(cachep, "double free, or memory outside"
3132                                                " object was overwritten");
3133                        printk(KERN_ERR
3134                                "%p: redzone 1:0x%llx, redzone 2:0x%llx\n",
3135                                objp, *dbg_redzone1(cachep, objp),
3136                                *dbg_redzone2(cachep, objp));
3137                }
3138                *dbg_redzone1(cachep, objp) = RED_ACTIVE;
3139                *dbg_redzone2(cachep, objp) = RED_ACTIVE;
3140        }
3141#ifdef CONFIG_DEBUG_SLAB_LEAK
3142        {
3143                struct slab *slabp;
3144                unsigned objnr;
3145
3146                slabp = page_get_slab(virt_to_head_page(objp));
3147                objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
3148                slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE;
3149        }
3150#endif
3151        objp += obj_offset(cachep);
3152        if (cachep->ctor && cachep->flags & SLAB_POISON)
3153                cachep->ctor(objp);
3154#if ARCH_SLAB_MINALIGN
3155        if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
3156                printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
3157                       objp, ARCH_SLAB_MINALIGN);
3158        }
3159#endif
3160        return objp;
3161}
3162#else
3163#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
3164#endif
3165
3166static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
3167{
3168        if (cachep == &cache_cache)
3169                return false;
3170
3171        return should_failslab(obj_size(cachep), flags, cachep->flags);
3172}
3173
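/*
 * Fast path for allocation: take an object straight from the per-cpu
 * array cache when one is available, otherwise fall back to
 * cache_alloc_refill().  Called with local interrupts disabled.
 */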
3174static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3175{
3176        void *objp;
3177        struct array_cache *ac;
3178
3179        check_irq_off();
3180
3181        ac = cpu_cache_get(cachep);
3182        if (likely(ac->avail)) {
3183                STATS_INC_ALLOCHIT(cachep);
3184                ac->touched = 1;
3185                objp = ac->entry[--ac->avail];
3186        } else {
3187                STATS_INC_ALLOCMISS(cachep);
3188                objp = cache_alloc_refill(cachep, flags);
3189                /*
3190                 * the 'ac' may be updated by cache_alloc_refill(),
3191                 * and kmemleak_erase() requires its correct value.
3192                 */
3193                ac = cpu_cache_get(cachep);
3194        }
3195        /*
3196         * To avoid a false negative, if an object that is in one of the
3197         * per-CPU caches is leaked, we need to make sure kmemleak doesn't
3198         * treat the array pointers as a reference to the object.
3199         */
3200        if (objp)
3201                kmemleak_erase(&ac->entry[ac->avail]);
3202        return objp;
3203}
3204
3205#ifdef CONFIG_NUMA
3206/*
3207 * Try allocating on another node if the task has PF_SPREAD_SLAB|PF_MEMPOLICY set.
3208 *
3209 * If we are in_interrupt, process-context state such as cpusets and
3210 * mempolicy may not apply and should not be used for allocation policy.
3211 */
3212static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3213{
3214        int nid_alloc, nid_here;
3215
3216        if (in_interrupt() || (flags & __GFP_THISNODE))
3217                return NULL;
3218        nid_alloc = nid_here = numa_mem_id();
3219        get_mems_allowed();
3220        if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
3221                nid_alloc = cpuset_slab_spread_node();
3222        else if (current->mempolicy)
3223                nid_alloc = slab_node(current->mempolicy);
3224        put_mems_allowed();
3225        if (nid_alloc != nid_here)
3226                return ____cache_alloc_node(cachep, flags, nid_alloc);
3227        return NULL;
3228}
3229
3230/*
3231 * Fallback function if there was no memory available and no objects on a
3232 * certain node and fallback is permitted. First we scan all the
3233 * allowed nodelists for available objects. If that fails, we
3234 * perform an allocation without specifying a node. This allows the page
3235 * allocator to do its reclaim / fallback magic. We then insert the
3236 * slab into the proper nodelist and allocate from it.
3237 */
3238static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3239{
3240        struct zonelist *zonelist;
3241        gfp_t local_flags;
3242        struct zoneref *z;
3243        struct zone *zone;
3244        enum zone_type high_zoneidx = gfp_zone(flags);
3245        void *obj = NULL;
3246        int nid;
3247
3248        if (flags & __GFP_THISNODE)
3249                return NULL;
3250
3251        get_mems_allowed();
3252        zonelist = node_zonelist(slab_node(current->mempolicy), flags);
3253        local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
3254
3255retry:
3256        /*
3257         * Look through allowed nodes for objects available
3258         * from existing per node queues.
3259         */
3260        for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
3261                nid = zone_to_nid(zone);
3262
3263                if (cpuset_zone_allowed_hardwall(zone, flags) &&
3264                        cache->nodelists[nid] &&
3265                        cache->nodelists[nid]->free_objects) {
3266                                obj = ____cache_alloc_node(cache,
3267                                        flags | GFP_THISNODE, nid);
3268                                if (obj)
3269                                        break;
3270                }
3271        }
3272
3273        if (!obj) {
3274                /*
3275                 * This allocation will be performed within the constraints
3276                 * of the current cpuset / memory policy requirements.
3277                 * We may trigger various forms of reclaim on the allowed
3278                 * set and go into memory reserves if necessary.
3279                 */
3280                if (local_flags & __GFP_WAIT)
3281                        local_irq_enable();
3282                kmem_flagcheck(cache, flags);
3283                obj = kmem_getpages(cache, local_flags, numa_mem_id());
3284                if (local_flags & __GFP_WAIT)
3285                        local_irq_disable();
3286                if (obj) {
3287                        /*
3288                         * Insert into the appropriate per node queues
3289                         */
3290                        nid = page_to_nid(virt_to_page(obj));
3291                        if (cache_grow(cache, flags, nid, obj)) {
3292                                obj = ____cache_alloc_node(cache,
3293                                        flags | GFP_THISNODE, nid);
3294                                if (!obj)
3295                                        /*
3296                                         * Another processor may allocate the
3297                                         * objects in the slab since we are
3298                                         * not holding any locks.
3299                                         */
3300                                        goto retry;
3301                        } else {
3302                                /* cache_grow already freed obj */
3303                                obj = NULL;
3304                        }
3305                }
3306        }
3307        put_mems_allowed();
3308        return obj;
3309}
3310
3311/*
3312 * An interface to allocate an object from a specific node (nodeid)
3313 */
3314static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
3315                                int nodeid)
3316{
3317        struct list_head *entry;
3318        struct slab *slabp;
3319        struct kmem_list3 *l3;
3320        void *obj;
3321        int x;
3322
3323        l3 = cachep->nodelists[nodeid];
3324        BUG_ON(!l3);
3325
3326retry:
3327        check_irq_off();
3328        spin_lock(&l3->list_lock);
3329        entry = l3->slabs_partial.next;
3330        if (entry == &l3->slabs_partial) {
3331                l3->free_touched = 1;
3332                entry = l3->slabs_free.next;
3333                if (entry == &l3->slabs_free)
3334                        goto must_grow;
3335        }
3336
3337        slabp = list_entry(entry, struct slab, list);
3338        check_spinlock_acquired_node(cachep, nodeid);
3339        check_slabp(cachep, slabp);
3340
3341        STATS_INC_NODEALLOCS(cachep);
3342        STATS_INC_ACTIVE(cachep);
3343        STATS_SET_HIGH(cachep);
3344
3345        BUG_ON(slabp->inuse == cachep->num);
3346
3347        obj = slab_get_obj(cachep, slabp, nodeid);
3348        check_slabp(cachep, slabp);
3349        l3->free_objects--;
3350        /* move slabp to correct slabp list: */
3351        list_del(&slabp->list);
3352
3353        if (slabp->free == BUFCTL_END)
3354                list_add(&slabp->list, &l3->slabs_full);
3355        else
3356                list_add(&slabp->list, &l3->slabs_partial);
3357
3358        spin_unlock(&l3->list_lock);
3359        goto done;
3360
3361must_grow:
3362        spin_unlock(&l3->list_lock);
3363        x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
3364        if (x)
3365                goto retry;
3366
3367        return fallback_alloc(cachep, flags);
3368
3369done:
3370        return obj;
3371}
3372
3373/**
3374 * kmem_cache_alloc_node - Allocate an object on the specified node
3375 * @cachep: The cache to allocate from.
3376 * @flags: See kmalloc().
3377 * @nodeid: node number of the target node.
3378 * @caller: return address of caller, used for debug information
3379 *
3380 * Identical to kmem_cache_alloc but it will allocate memory on the given
3381 * node, which can improve the performance for cpu bound structures.
3382 *
3383 * Fallback to other node is possible if __GFP_THISNODE is not set.
3384 */
3385static __always_inline void *
3386__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3387                   void *caller)
3388{
3389        unsigned long save_flags;
3390        void *ptr;
3391        int slab_node = numa_mem_id();
3392
3393        flags &= gfp_allowed_mask;
3394
3395        lockdep_trace_alloc(flags);
3396
3397        if (slab_should_failslab(cachep, flags))
3398                return NULL;
3399
3400        cache_alloc_debugcheck_before(cachep, flags);
3401        local_irq_save(save_flags);
3402
3403        if (nodeid == -1)
3404                nodeid = slab_node;
3405
3406        if (unlikely(!cachep->nodelists[nodeid])) {
3407                /* Node not bootstrapped yet */
3408                ptr = fallback_alloc(cachep, flags);
3409                goto out;
3410        }
3411
3412        if (nodeid == slab_node) {
3413                /*
3414                 * Use the locally cached objects if possible.
3415                 * However ____cache_alloc does not allow fallback
3416                 * to other nodes. It may fail while we still have
3417                 * objects on other nodes available.
3418                 */
3419                ptr = ____cache_alloc(cachep, flags);
3420                if (ptr)
3421                        goto out;
3422        }
3423        /* ____cache_alloc_node can fall back to other nodes */
3424        ptr = ____cache_alloc_node(cachep, flags, nodeid);
3425  out:
3426        local_irq_restore(save_flags);
3427        ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
3428        kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags,
3429                                 flags);
3430
3431        if (likely(ptr))
3432                kmemcheck_slab_alloc(cachep, flags, ptr, obj_size(cachep));
3433
3434        if (unlikely((flags & __GFP_ZERO) && ptr))
3435                memset(ptr, 0, obj_size(cachep));
3436
3437        return ptr;
3438}
3439
3440static __always_inline void *
3441__do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
3442{
3443        void *objp;
3444
3445        if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
3446                objp = alternate_node_alloc(cache, flags);
3447                if (objp)
3448                        goto out;
3449        }
3450        objp = ____cache_alloc(cache, flags);
3451
3452        /*
3453         * We may just have run out of memory on the local node.
3454         * ____cache_alloc_node() knows how to locate memory on other nodes
3455         */
3456        if (!objp)
3457                objp = ____cache_alloc_node(cache, flags, numa_mem_id());
3458
3459  out:
3460        return objp;
3461}
3462#else
3463
3464static __always_inline void *
3465__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3466{
3467        return ____cache_alloc(cachep, flags);
3468}
3469
3470#endif /* CONFIG_NUMA */
3471
3472static __always_inline void *
3473__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
3474{
3475        unsigned long save_flags;
3476        void *objp;
3477
3478        flags &= gfp_allowed_mask;
3479
3480        lockdep_trace_alloc(flags);
3481
3482        if (slab_should_failslab(cachep, flags))
3483                return NULL;
3484
3485        cache_alloc_debugcheck_before(cachep, flags);
3486        local_irq_save(save_flags);
3487        objp = __do_cache_alloc(cachep, flags);
3488        local_irq_restore(save_flags);
3489        objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
3490        kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags,
3491                                 flags);
3492        prefetchw(objp);
3493
3494        if (likely(objp))
3495                kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep));
3496
3497        if (unlikely((flags & __GFP_ZERO) && objp))
3498                memset(objp, 0, obj_size(cachep));
3499
3500        return objp;
3501}
3502
3503/*
3504 * Caller must hold the correct kmem_list3's list_lock.
3505 */
3506static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3507                       int node)
3508{
3509        int i;
3510        struct kmem_list3 *l3;
3511
3512        for (i = 0; i < nr_objects; i++) {
3513                void *objp = objpp[i];
3514                struct slab *slabp;
3515
3516                slabp = virt_to_slab(objp);
3517                l3 = cachep->nodelists[node];
3518                list_del(&slabp->list);
3519                check_spinlock_acquired_node(cachep, node);
3520                check_slabp(cachep, slabp);
3521                slab_put_obj(cachep, slabp, objp, node);
3522                STATS_DEC_ACTIVE(cachep);
3523                l3->free_objects++;
3524                check_slabp(cachep, slabp);
3525
3526                /* fixup slab chains */
3527                if (slabp->inuse == 0) {
3528                        if (l3->free_objects > l3->free_limit) {
3529                                l3->free_objects -= cachep->num;
3530                                /* No need to drop any previously held
3531                                 * lock here; even if we have an off-slab slab
3532                                 * descriptor, it is guaranteed to come from
3533                                 * a different cache - refer to the comments
3534                                 * before alloc_slabmgmt.
3535                                 */
3536                                slab_destroy(cachep, slabp);
3537                        } else {
3538                                list_add(&slabp->list, &l3->slabs_free);
3539                        }
3540                } else {
3541                        /* Unconditionally move a slab to the end of the
3542                         * partial list on free - this gives the remaining
3543                         * objects the maximum time to be freed, too.
3544                         */
3545                        list_add_tail(&slabp->list, &l3->slabs_partial);
3546                }
3547        }
3548}
3549
3550static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
3551{
3552        int batchcount;
3553        struct kmem_list3 *l3;
3554        int node = numa_mem_id();
3555
3556        batchcount = ac->batchcount;
3557#if DEBUG
3558        BUG_ON(!batchcount || batchcount > ac->avail);
3559#endif
3560        check_irq_off();
3561        l3 = cachep->nodelists[node];
3562        spin_lock(&l3->list_lock);
3563        if (l3->shared) {
3564                struct array_cache *shared_array = l3->shared;
3565                int max = shared_array->limit - shared_array->avail;
3566                if (max) {
3567                        if (batchcount > max)
3568                                batchcount = max;
3569                        memcpy(&(shared_array->entry[shared_array->avail]),
3570                               ac->entry, sizeof(void *) * batchcount);
3571                        shared_array->avail += batchcount;
3572                        goto free_done;
3573                }
3574        }
3575
3576        free_block(cachep, ac->entry, batchcount, node);
3577free_done:
3578#if STATS
3579        {
3580                int i = 0;
3581                struct list_head *p;
3582
3583                p = l3->slabs_free.next;
3584                while (p != &(l3->slabs_free)) {
3585                        struct slab *slabp;
3586
3587                        slabp = list_entry(p, struct slab, list);
3588                        BUG_ON(slabp->inuse);
3589
3590                        i++;
3591                        p = p->next;
3592                }
3593                STATS_SET_FREEABLE(cachep, i);
3594        }
3595#endif
3596        spin_unlock(&l3->list_lock);
3597        ac->avail -= batchcount;
3598        memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
3599}
3600
3601/*
3602 * Release an obj back to its cache. If the obj has a constructed state, it must
3603 * be in this state _before_ it is released.  Called with interrupts disabled.
3604 */
3605static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3606{
3607        struct array_cache *ac = cpu_cache_get(cachep);
3608
3609        check_irq_off();
3610        kmemleak_free_recursive(objp, cachep->flags);
3611        objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
3612
3613        kmemcheck_slab_free(cachep, objp, obj_size(cachep));
3614
3615        /*
3616         * Skip calling cache_free_alien() when the platform is not NUMA.
3617         * This avoids cache misses that would happen while accessing slabp
3618         * (a per-page memory reference) to get the nodeid. Instead, use a
3619         * global variable to decide whether to skip the call; it is most
3620         * likely to already be present in the cache.
3621         */
3622        if (nr_online_nodes > 1 && cache_free_alien(cachep, objp))
3623                return;
3624
3625        if (likely(ac->avail < ac->limit)) {
3626                STATS_INC_FREEHIT(cachep);
3627                ac->entry[ac->avail++] = objp;
3628                return;
3629        } else {
3630                STATS_INC_FREEMISS(cachep);
3631                cache_flusharray(cachep, ac);
3632                ac->entry[ac->avail++] = objp;
3633        }
3634}
3635
3636/**
3637 * kmem_cache_alloc - Allocate an object
3638 * @cachep: The cache to allocate from.
3639 * @flags: See kmalloc().
3640 *
3641 * Allocate an object from this cache.  The flags are only relevant
3642 * if the cache has no available objects.
3643 */
3644void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3645{
3646        void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0));
3647
3648        trace_kmem_cache_alloc(_RET_IP_, ret,
3649                               obj_size(cachep), cachep->buffer_size, flags);
3650
3651        return ret;
3652}
3653EXPORT_SYMBOL(kmem_cache_alloc);
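
/*
 * Illustrative sketch of typical usage (the names foo, foo_cachep and the
 * flags below are assumptions, not part of this file): a subsystem creates
 * its cache once and then allocates and frees objects from it.
 *
 *	struct foo {
 *		int a;
 *		struct list_head list;
 *	};
 *	static struct kmem_cache *foo_cachep;
 *
 *	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
 *				       0, SLAB_HWCACHE_ALIGN, NULL);
 *	...
 *	struct foo *f = kmem_cache_alloc(foo_cachep, GFP_KERNEL);
 *	if (f) {
 *		...
 *		kmem_cache_free(foo_cachep, f);
 *	}
 */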
3654
3655#ifdef CONFIG_TRACING
3656void *
3657kmem_cache_alloc_trace(size_t size, struct kmem_cache *cachep, gfp_t flags)
3658{
3659        void *ret;
3660
3661        ret = __cache_alloc(cachep, flags, __builtin_return_address(0));
3662
3663        trace_kmalloc(_RET_IP_, ret,
3664                      size, slab_buffer_size(cachep), flags);
3665        return ret;
3666}
3667EXPORT_SYMBOL(kmem_cache_alloc_trace);
3668#endif
3669
3670#ifdef CONFIG_NUMA
3671void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3672{
3673        void *ret = __cache_alloc_node(cachep, flags, nodeid,
3674                                       __builtin_return_address(0));
3675
3676        trace_kmem_cache_alloc_node(_RET_IP_, ret,
3677                                    obj_size(cachep), cachep->buffer_size,
3678                                    flags, nodeid);
3679
3680        return ret;
3681}
3682EXPORT_SYMBOL(kmem_cache_alloc_node);
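
/*
 * Illustrative sketch (foo_cachep and nid are placeholders supplied by the
 * caller): per-node data is best allocated on the node it describes, e.g.
 *
 *	struct foo *f = kmem_cache_alloc_node(foo_cachep, GFP_KERNEL, nid);
 *
 * If __GFP_THISNODE is not set, the allocation may still fall back to
 * another node when nid has no free memory.
 */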
3683
3684#ifdef CONFIG_TRACING
3685void *kmem_cache_alloc_node_trace(size_t size,
3686                                  struct kmem_cache *cachep,
3687                                  gfp_t flags,
3688                                  int nodeid)
3689{
3690        void *ret;
3691
3692        ret = __cache_alloc_node(cachep, flags, nodeid,
3693                                  __builtin_return_address(0));
3694        trace_kmalloc_node(_RET_IP_, ret,
3695                           size, slab_buffer_size(cachep),
3696                           flags, nodeid);
3697        return ret;
3698}
3699EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
3700#endif
3701
3702static __always_inline void *
3703__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
3704{
3705        struct kmem_cache *cachep;
3706
3707        cachep = kmem_find_general_cachep(size, flags);
3708        if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3709                return cachep;
3710        return kmem_cache_alloc_node_trace(size, cachep, flags, node);
3711}
3712
3713#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
3714void *__kmalloc_node(size_t size, gfp_t flags, int node)
3715{
3716        return __do_kmalloc_node(size, flags, node,
3717                        __builtin_return_address(0));
3718}
3719EXPORT_SYMBOL(__kmalloc_node);
3720
3721void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
3722                int node, unsigned long caller)
3723{
3724        return __do_kmalloc_node(size, flags, node, (void *)caller);
3725}
3726EXPORT_SYMBOL(__kmalloc_node_track_caller);
3727#else
3728void *__kmalloc_node(size_t size, gfp_t flags, int node)
3729{
3730        return __do_kmalloc_node(size, flags, node, NULL);
3731}
3732EXPORT_SYMBOL(__kmalloc_node);
3733#endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */
3734#endif /* CONFIG_NUMA */
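
/*
 * Illustrative sketch: callers normally reach __kmalloc_node() through the
 * kmalloc_node() wrapper declared in the slab headers (size and nid below
 * are placeholders supplied by the caller):
 *
 *	void *data = kmalloc_node(size, GFP_KERNEL, nid);
 */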
3735
3736/**
3737 * __do_kmalloc - allocate memory
3738 * @size: how many bytes of memory are required.
3739 * @flags: the type of memory to allocate (see kmalloc).
3740 * @caller: return address of the caller, used for debug tracking
3741 */
3742static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3743                                          void *caller)
3744{
3745        struct kmem_cache *cachep;
3746        void *ret;
3747
3748        /* If you want to save a few bytes of .text space: replace
3749         * __ with kmem_.
3750         * Then kmalloc uses the uninlined functions instead of the inline
3751         * functions.
3752         */
3753        cachep = __find_general_cachep(size, flags);
3754        if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3755                return cachep;
3756        ret = __cache_alloc(cachep, flags, caller);
3757
3758        trace_kmalloc((unsigned long) caller, ret,
3759                      size, cachep->buffer_size, flags);
3760
3761        return ret;
3762}
3763
3764
3765#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
3766void *__kmalloc(size_t size, gfp_t flags)
3767{
3768        return __do_kmalloc(size, flags, __builtin_return_address(0));
3769}
3770EXPORT_SYMBOL(__kmalloc);
3771
3772void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
3773{
3774        return __do_kmalloc(size, flags, (void *)caller);
3775}
3776EXPORT_SYMBOL(__kmalloc_track_caller);
3777
3778#else
3779void *__kmalloc(size_t size, gfp_t flags)
3780{
3781        return __do_kmalloc(size, flags, NULL);
3782}
3783EXPORT_SYMBOL(__kmalloc);
3784#endif
3785
3786/**
3787 * kmem_cache_free - Deallocate an object
3788 * @cachep: The cache the allocation was from.
3789 * @objp: The previously allocated object.
3790 *
3791 * Free an object which was previously allocated from this
3792 * cache.
3793 */
3794void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3795{
3796        unsigned long flags;
3797
3798        local_irq_save(flags);
3799        debug_check_no_locks_freed(objp, obj_size(cachep));
3800        if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
3801                debug_check_no_obj_freed(objp, obj_size(cachep));
3802        __cache_free(cachep, objp);
3803        local_irq_restore(flags);
3804
3805        trace_kmem_cache_free(_RET_IP_, objp);
3806}
3807EXPORT_SYMBOL(kmem_cache_free);
3808
3809/**
3810 * kfree - free previously allocated memory
3811 * @objp: pointer returned by kmalloc.
3812 *
3813 * If @objp is NULL, no operation is performed.
3814 *
3815 * Don't free memory not originally allocated by kmalloc()
3816 * or you will run into trouble.
3817 */
3818void kfree(const void *objp)
3819{
3820        struct kmem_cache *c;
3821        unsigned long flags;
3822
3823        trace_kfree(_RET_IP_, objp);
3824
3825        if (unlikely(ZERO_OR_NULL_PTR(objp)))
3826                return;
3827        local_irq_save(flags);
3828        kfree_debugcheck(objp);
3829        c = virt_to_cache(objp);
3830        debug_check_no_locks_freed(objp, obj_size(c));
3831        debug_check_no_obj_freed(objp, obj_size(c));
3832        __cache_free(c, (void *)objp);
3833        local_irq_restore(flags);
3834}
3835EXPORT_SYMBOL(kfree);
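
/*
 * Illustrative sketch (len is a placeholder): memory obtained from
 * kmalloc() must be released with kfree() once it is no longer needed.
 *
 *	char *buf = kmalloc(len, GFP_KERNEL);
 *	if (!buf)
 *		return -ENOMEM;
 *	...
 *	kfree(buf);
 */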
3836
3837unsigned int kmem_cache_size(struct kmem_cache *cachep)
3838{
3839        return obj_size(cachep);
3840}
3841EXPORT_SYMBOL(kmem_cache_size);
3842
3843const char *kmem_cache_name(struct kmem_cache *cachep)
3844{
3845        return cachep->name;
3846}
3847EXPORT_SYMBOL_GPL(kmem_cache_name);
3848
3849/*
3850 * This initializes kmem_list3 and resizes the shared/alien caches for all online nodes.
3851 */
3852static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
3853{
3854        int node;
3855        struct kmem_list3 *l3;
3856        struct array_cache *new_shared;
3857        struct array_cache **new_alien = NULL;
3858
3859        for_each_online_node(node) {
3860
3861                if (use_alien_caches) {
3862                        new_alien = alloc_alien_cache(node, cachep->limit, gfp);
3863                        if (!new_alien)
3864                                goto fail;
3865                }
3866
3867                new_shared = NULL;
3868                if (cachep->shared) {
3869                        new_shared = alloc_arraycache(node,
3870                                cachep->shared*cachep->batchcount,
3871                                        0xbaadf00d, gfp);
3872                        if (!new_shared) {
3873                                free_alien_cache(new_alien);
3874                                goto fail;
3875                        }
3876                }
3877
3878                l3 = cachep->nodelists[node];
3879                if (l3) {
3880                        struct array_cache *shared = l3->shared;
3881
3882                        spin_lock_irq(&l3->list_lock);
3883
3884                        if (shared)
3885                                free_block(cachep, shared->entry,
3886                                                shared->avail, node);
3887
3888                        l3->shared = new_shared;
3889                        if (!l3->alien) {
3890                                l3->alien = new_alien;
3891                                new_alien = NULL;
3892                        }
3893                        l3->free_limit = (1 + nr_cpus_node(node)) *
3894                                        cachep->batchcount + cachep->num;
3895                        spin_unlock_irq(&l3->list_lock);
3896                        kfree(shared);
3897                        free_alien_cache(new_alien);
3898                        continue;
3899                }
3900                l3 = kmalloc_node(sizeof(struct kmem_list3), gfp, node);
3901                if (!l3) {
3902                        free_alien_cache(new_alien);
3903                        kfree(new_shared);
3904                        goto fail;
3905                }
3906
3907                kmem_list3_init(l3);
3908                l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
3909                                ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
3910                l3->shared = new_shared;
3911                l3->alien = new_alien;
3912                l3->free_limit = (1 + nr_cpus_node(node)) *
3913                                        cachep->batchcount + cachep->num;
3914                cachep->nodelists[node] = l3;
3915        }
3916        return 0;
3917
3918fail:
3919        if (!cachep->next.next) {
3920                /* Cache is not active yet. Roll back what we did */
3921                node--;
3922                while (node >= 0) {
3923                        if (cachep->nodelists[node]) {
3924                                l3 = cachep->nodelists[node];
3925
3926                                kfree(l3->shared);
3927                                free_alien_cache(l3->alien);
3928                                kfree(l3);
3929                                cachep->nodelists[node] = NULL;
3930                        }
3931                        node--;
3932                }
3933        }
3934        return -ENOMEM;
3935}
3936
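/*
 * The per-cpu head arrays are swapped by running do_ccupdate_local() on
 * every cpu via on_each_cpu(): each cpu exchanges its array_cache pointer
 * with the new one stored in the ccupdate_struct, and do_tune_cpucache()
 * then drains and frees the old arrays.
 */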
3937struct ccupdate_struct {
3938        struct kmem_cache *cachep;
3939        struct array_cache *new[NR_CPUS];
3940};
3941
3942static void do_ccupdate_local(void *info)
3943{
3944        struct ccupdate_struct *new = info;
3945        struct array_cache *old;
3946
3947        check_irq_off();
3948        old = cpu_cache_get(new->cachep);
3949
3950        new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
3951        new->new[smp_processor_id()] = old;
3952}
3953
3954/* Always called with the cache_chain_mutex held */
3955static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3956                                int batchcount, int shared, gfp_t gfp)
3957{
3958        struct ccupdate_struct *new;
3959        int i;
3960
3961        new = kzalloc(sizeof(*new), gfp);
3962        if (!new)
3963                return -ENOMEM;
3964
3965        for_each_online_cpu(i) {
3966                new->new[i] = alloc_arraycache(cpu_to_mem(i), limit,
3967                                                batchcount, gfp);
3968                if (!new->new[i]) {
3969                        for (i--; i >= 0; i--)
3970                                kfree(new->new[i]);
3971                        kfree(new);
3972                        return -ENOMEM;
3973                }
3974        }
3975        new->cachep = cachep;
3976
3977        on_each_cpu(do_ccupdate_local, (void *)new, 1);
3978
3979        check_irq_on();
3980        cachep->batchcount = batchcount;
3981        cachep->limit = limit;
3982        cachep->shared = shared;
3983
3984        for_each_online_cpu(i) {
3985                struct array_cache *ccold = new->new[i];
3986                if (!ccold)
3987                        continue;
3988                spin_lock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
3989                free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i));
3990                spin_unlock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
3991                kfree(ccold);
3992        }
3993        kfree(new);
3994        return alloc_kmemlist(cachep, gfp);
3995}
3996
3997/* Always called with cache_chain_mutex held */
3998static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
3999{
4000        int err;
4001        int limit, shared;
4002
4003        /*
4004         * The head array serves three purposes:
4005         * - create a LIFO ordering, i.e. return objects that are cache-warm.
4006         * - reduce the number of spinlock operations.
4007         * - reduce the number of linked list operations on the slab and
4008         *   bufctl chains: array operations are cheaper.
4009         * The limits below are guessed; we should auto-tune them as
4010         * described by Bonwick.  A worked example follows the ladder below.
4011         */
4012        if (cachep->buffer_size > 131072)
4013                limit = 1;
4014        else if (cachep->buffer_size > PAGE_SIZE)
4015                limit = 8;
4016        else if (cachep->buffer_size > 1024)
4017                limit = 24;
4018        else if (cachep->buffer_size > 256)
4019                limit = 54;
4020        else
4021                limit = 120;
4022
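        /*
         * Worked example of the heuristics above: a cache with 192 byte
         * objects takes the final branch (limit = 120); do_tune_cpucache()
         * below is then called with batchcount = (120 + 1) / 2 = 60 and,
         * on SMP, shared = 8.
         */
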
4023        /*
4024         * CPU bound tasks (e.g. network routing) can exhibit cpu bound
4025         * allocation behaviour: most allocs on one cpu, most free operations
4026         * on another cpu. For these cases, efficient object passing between
4027         * cpus is necessary. This is provided by a shared array. The array
4028         * replaces Bonwick's magazine layer.
4029         * On a uniprocessor, it is functionally equivalent (but less efficient)
4030         * to a larger limit, so it is disabled by default.
4031         */
4032        shared = 0;
4033        if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1)
4034                shared = 8;
4035
4036#if DEBUG
4037        /*
4038         * With debugging enabled, a large batchcount leads to excessively long
4039         * periods with local interrupts disabled. Limit the batchcount.
4040         */
4041        if (limit > 32)
4042                limit = 32;
4043#endif
4044        err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp);
4045        if (err)
4046                printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
4047                       cachep->name, -err);
4048        return err;
4049}
4050
4051/*
4052 * Drain an array if it contains any elements, taking the l3 lock only if
4053 * necessary. Note that the l3 listlock also protects the array_cache
4054 * if drain_array() is used on the shared array.
4055 */
4056static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
4057                         struct array_cache *ac, int force, int node)
4058{
4059        int tofree;
4060
4061        if (!ac || !ac->avail)
4062                return;
4063        if (ac->touched && !force) {
4064                ac->touched = 0;
4065        } else {
4066                spin_lock_irq(&l3->list_lock);
4067                if (ac->avail) {
4068                        tofree = force ? ac->avail : (ac->limit + 4) / 5;
4069                        if (tofree > ac->avail)
4070                                tofree = (ac->avail + 1) / 2;
4071                        free_block(cachep, ac->entry, tofree, node);
4072                        ac->avail -= tofree;
4073                        memmove(ac->entry, &(ac->entry[tofree]),
4074                                sizeof(void *) * ac->avail);
4075                }
4076                spin_unlock_irq(&l3->list_lock);
4077        }
4078}
4079
4080/**
4081 * cache_reap - Reclaim memory from caches.
4082 * @w: work descriptor
4083 *
4084 * Called from workqueue/eventd every few seconds.
4085 * Purpose:
4086 * - clear the per-cpu caches for this CPU.
4087 * - return freeable pages to the main free memory pool.
4088 *
4089 * If we cannot acquire the cache chain mutex then just give up - we'll try
4090 * again on the next iteration.
4091 */
4092static void cache_reap(struct work_struct *w)
4093{
4094        struct kmem_cache *searchp;
4095        struct kmem_list3 *l3;
4096        int node = numa_mem_id();
4097        struct delayed_work *work = to_delayed_work(w);
4098
4099        if (!mutex_trylock(&cache_chain_mutex))
4100                /* Give up. Set up the next iteration. */
4101                goto out;
4102
4103        list_for_each_entry(searchp, &cache_chain, next) {
4104                check_irq_on();
4105
4106                /*
4107                 * We only take the l3 lock if absolutely necessary and we
4108                 * have established with reasonable certainty that
4109                 * we can do some work if the lock was obtained.
4110                 */
4111                l3 = searchp->nodelists[node];
4112
4113                reap_alien(searchp, l3);
4114
4115                drain_array(searchp, l3, cpu_cache_get(searchp), 0, node);
4116
4117                /*
4118                 * These are racy checks but it does not matter
4119                 * if we skip one check or scan twice.
4120                 */
4121                if (time_after(l3->next_reap, jiffies))
4122                        goto next;
4123
4124                l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
4125
4126                drain_array(searchp, l3, l3->shared, 0, node);
4127
4128                if (l3->free_touched)
4129                        l3->free_touched = 0;
4130                else {
4131                        int freed;
4132
4133                        freed = drain_freelist(searchp, l3, (l3->free_limit +
4134                                5 * searchp->num - 1) / (5 * searchp->num));
4135                        STATS_ADD_REAPED(searchp, freed);
4136                }
4137next:
4138                cond_resched();
4139        }
4140        check_irq_on();
4141        mutex_unlock(&cache_chain_mutex);
4142        next_reap_node();
4143out:
4144        /* Set up the next iteration */
4145        schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC));
4146}
4147
4148#ifdef CONFIG_SLABINFO
4149
4150static void print_slabinfo_header(struct seq_file *m)
4151{
4152        /*
4153         * Output format version, so at least we can change it
4154         * without _too_ many complaints.
4155         */
4156#if STATS
4157        seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
4158#else
4159        seq_puts(m, "slabinfo - version: 2.1\n");
4160#endif
4161        seq_puts(m, "# name            <active_objs> <num_objs> <objsize> "
4162                 "<objperslab> <pagesperslab>");
4163        seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
4164        seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
4165#if STATS
4166        seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
4167                 "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
4168        seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
4169#endif
4170        seq_putc(m, '\n');
4171}
4172
4173static void *s_start(struct seq_file *m, loff_t *pos)
4174{
4175        loff_t n = *pos;
4176
4177        mutex_lock(&cache_chain_mutex);
4178        if (!n)
4179                print_slabinfo_header(m);
4180
4181        return seq_list_start(&cache_chain, *pos);
4182}
4183
4184static void *s_next(struct seq_file *m, void *p, loff_t *pos)
4185{
4186        return seq_list_next(p, &cache_chain, pos);
4187}
4188
4189static void s_stop(struct seq_file *m, void *p)
4190{
4191        mutex_unlock(&cache_chain_mutex);
4192}
4193
4194static int s_show(struct seq_file *m, void *p)
4195{
4196        struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next);
4197        struct slab *slabp;
4198        unsigned long active_objs;
4199        unsigned long num_objs;
4200        unsigned long active_slabs = 0;
4201        unsigned long num_slabs, free_objects = 0, shared_avail = 0;
4202        const char *name;
4203        char *error = NULL;
4204        int node;
4205        struct kmem_list3 *l3;
4206
4207        active_objs = 0;
4208        num_slabs = 0;
4209        for_each_online_node(node) {
4210                l3 = cachep->nodelists[node];
4211                if (!l3)
4212                        continue;
4213
4214                check_irq_on();
4215                spin_lock_irq(&l3->list_lock);
4216
4217                list_for_each_entry(slabp, &l3->slabs_full, list) {
4218                        if (slabp->inuse != cachep->num && !error)
4219                                error = "slabs_full accounting error";
4220                        active_objs += cachep->num;
4221                        active_slabs++;
4222                }
4223                list_for_each_entry(slabp, &l3->slabs_partial, list) {
4224                        if (slabp->inuse == cachep->num && !error)
4225                                error = "slabs_partial inuse accounting error";
4226                        if (!slabp->inuse && !error)
4227                                error = "slabs_partial/inuse accounting error";
4228                        active_objs += slabp->inuse;
4229                        active_slabs++;
4230                }
4231                list_for_each_entry(slabp, &l3->slabs_free, list) {
4232                        if (slabp->inuse && !error)
4233                                error = "slabs_free/inuse accounting error";
4234                        num_slabs++;
4235                }
4236                free_objects += l3->free_objects;
4237                if (l3->shared)
4238                        shared_avail += l3->shared->avail;
4239
4240                spin_unlock_irq(&l3->list_lock);
4241        }
4242        num_slabs += active_slabs;
4243        num_objs = num_slabs * cachep->num;
4244        if (num_objs - active_objs != free_objects && !error)
4245                error = "free_objects accounting error";
4246
4247        name = cachep->name;
4248        if (error)
4249                printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
4250
4251        seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
4252                   name, active_objs, num_objs, cachep->buffer_size,
4253                   cachep->num, (1 << cachep->gfporder));
4254        seq_printf(m, " : tunables %4u %4u %4u",
4255                   cachep->limit, cachep->batchcount, cachep->shared);
4256        seq_printf(m, " : slabdata %6lu %6lu %6lu",
4257                   active_slabs, num_slabs, shared_avail);
4258#if STATS
4259        {                       /* list3 stats */
4260                unsigned long high = cachep->high_mark;
4261                unsigned long allocs = cachep->num_allocations;
4262                unsigned long grown = cachep->grown;
4263                unsigned long reaped = cachep->reaped;
4264                unsigned long errors = cachep->errors;
4265                unsigned long max_freeable = cachep->max_freeable;
4266                unsigned long node_allocs = cachep->node_allocs;
4267                unsigned long node_frees = cachep->node_frees;
4268                unsigned long overflows = cachep->node_overflow;
4269
4270                seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu "
4271                           "%4lu %4lu %4lu %4lu %4lu",
4272                           allocs, high, grown,
4273                           reaped, errors, max_freeable, node_allocs,
4274                           node_frees, overflows);
4275        }
4276        /* cpu stats */
4277        {
4278                unsigned long allochit = atomic_read(&cachep->allochit);
4279                unsigned long allocmiss = atomic_read(&cachep->allocmiss);
4280                unsigned long freehit = atomic_read(&cachep->freehit);
4281                unsigned long freemiss = atomic_read(&cachep->freemiss);
4282
4283                seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
4284                           allochit, allocmiss, freehit, freemiss);
4285        }
4286#endif
4287        seq_putc(m, '\n');
4288        return 0;
4289}
4290
4291/*
4292 * slabinfo_op - iterator that generates /proc/slabinfo
4293 *
4294 * Output layout (see the illustrative line below):
4295 * cache-name
4296 * num-active-objs
4297 * total-objs
4298 * object-size
4299 * num-objs-per-slab
4300 * num-pages-per-slab
4301 * tunables and slabdata fields
4302 * + further values on SMP and with statistics enabled
4303 */
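
/*
 * An illustrative line of the resulting output (values invented, format as
 * produced by s_show() with the heuristics from enable_cpucache()):
 *
 * dentry  85323  85323    192   21    1 : tunables  120   60    8 : slabdata   4063   4063      0
 */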
4304
4305static const struct seq_operations slabinfo_op = {
4306        .start = s_start,
4307        .next = s_next,
4308        .stop = s_stop,
4309        .show = s_show,
4310};
4311
4312#define MAX_SLABINFO_WRITE 128
4313/**
4314 * slabinfo_write - Tuning for the slab allocator
4315 * @file: unused
4316 * @buffer: user buffer
4317 * @count: data length
4318 * @ppos: unused
4319 */
4320static ssize_t slabinfo_write(struct file *file, const char __user *buffer,
4321                       size_t count, loff_t *ppos)
4322{
4323        char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
4324        int limit, batchcount, shared, res;
4325        struct kmem_cache *cachep;
4326
4327        if (count > MAX_SLABINFO_WRITE)
4328                return -EINVAL;
4329        if (copy_from_user(&kbuf, buffer, count))
4330                return -EFAULT;
4331        kbuf[MAX_SLABINFO_WRITE] = '\0';
4332
4333        tmp = strchr(kbuf, ' ');
4334        if (!tmp)
4335                return -EINVAL;
4336        *tmp = '\0';
4337        tmp++;
4338        if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3)
4339                return -EINVAL;
4340
4341        /* Find the cache in the chain of caches. */
4342        mutex_lock(&cache_chain_mutex);
4343        res = -EINVAL;
4344        list_for_each_entry(cachep, &cache_chain, next) {
4345                if (!strcmp(cachep->name, kbuf)) {
4346                        if (limit < 1 || batchcount < 1 ||
4347                                        batchcount > limit || shared < 0) {
4348                                res = 0;
4349                        } else {
4350                                res = do_tune_cpucache(cachep, limit,
4351                                                       batchcount, shared,
4352                                                       GFP_KERNEL);
4353                        }
4354                        break;
4355                }
4356        }
4357        mutex_unlock(&cache_chain_mutex);
4358        if (res >= 0)
4359                res = count;
4360        return res;
4361}
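
/*
 * Illustrative example of tuning a cache through the interface above (cache
 * name and values invented; requires write permission on /proc/slabinfo):
 *
 *	# echo "dentry 120 60 8" > /proc/slabinfo
 *
 * which sets limit, batchcount and shared for the named cache via
 * do_tune_cpucache().
 */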
4362
4363static int slabinfo_open(struct inode *inode, struct file *file)
4364{
4365        return seq_open(file, &slabinfo_op);
4366}
4367
4368static const struct file_operations proc_slabinfo_operations = {
4369        .open           = slabinfo_open,
4370        .read           = seq_read,
4371        .write          = slabinfo_write,
4372        .llseek         = seq_lseek,
4373        .release        = seq_release,
4374};
4375
4376#ifdef CONFIG_DEBUG_SLAB_LEAK
4377
4378static void *leaks_start(struct seq_file *m, loff_t *pos)
4379{
4380        mutex_lock(&cache_chain_mutex);
4381        return seq_list_start(&cache_chain, *pos);
4382}
4383
4384static inline int add_caller(unsigned long *n, unsigned long v)
4385{
4386        unsigned long *p;
4387        int l;
4388        if (!v)
4389                return 1;
4390        l = n[1];
4391        p = n + 2;
4392        while (l) {
4393                int i = l/2;
4394                unsigned long *q = p + 2 * i;
4395                if (*q == v) {
4396                        q[1]++;
4397                        return 1;
4398                }
4399                if (*q > v) {
4400                        l = i;
4401                } else {
4402                        p = q + 2;
4403                        l -= i + 1;
4404                }
4405        }
4406        if (++n[1] == n[0])
4407                return 0;
4408        memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n));
4409        p[0] = v;
4410        p[1] = 1;
4411        return 1;
4412}
4413
4414static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s)
4415{
4416        void *p;
4417        int i;
4418        if (n[0] == n[1])
4419                return;
4420        for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) {
4421                if (slab_bufctl(s)[i] != BUFCTL_ACTIVE)
4422                        continue;
4423                if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
4424                        return;
4425        }
4426}
4427
4428static void show_symbol(struct seq_file *m, unsigned long address)
4429{
4430#ifdef CONFIG_KALLSYMS
4431        unsigned long offset, size;
4432        char modname[MODULE_NAME_LEN], name[KSYM_NAME_LEN];
4433
4434        if (lookup_symbol_attrs(address, &size, &offset, modname, name) == 0) {
4435                seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
4436                if (modname[0])
4437                        seq_printf(m, " [%s]", modname);
4438                return;
4439        }
4440#endif
4441        seq_printf(m, "%p", (void *)address);
4442}
4443
4444static int leaks_show(struct seq_file *m, void *p)
4445{
4446        struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next);
4447        struct slab *slabp;
4448        struct kmem_list3 *l3;
4449        const char *name;
4450        unsigned long *n = m->private;
4451        int node;
4452        int i;
4453
4454        if (!(cachep->flags & SLAB_STORE_USER))
4455                return 0;
4456        if (!(cachep->flags & SLAB_RED_ZONE))
4457                return 0;
4458
4459        /* OK, we can do it */
4460
4461        n[1] = 0;
4462
4463        for_each_online_node(node) {
4464                l3 = cachep->nodelists[node];
4465                if (!l3)
4466                        continue;
4467
4468                check_irq_on();
4469                spin_lock_irq(&l3->list_lock);
4470
4471                list_for_each_entry(slabp, &l3->slabs_full, list)
4472                        handle_slab(n, cachep, slabp);
4473                list_for_each_entry(slabp, &l3->slabs_partial, list)
4474                        handle_slab(n, cachep, slabp);
4475                spin_unlock_irq(&l3->list_lock);
4476        }
4477        name = cachep->name;
4478        if (n[0] == n[1]) {
4479                /* Increase the buffer size */
4480                mutex_unlock(&cache_chain_mutex);
4481                m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
4482                if (!m->private) {
4483                        /* Too bad, we are really out of memory */
4484                        m->private = n;
4485                        mutex_lock(&cache_chain_mutex);
4486                        return -ENOMEM;
4487                }
4488                *(unsigned long *)m->private = n[0] * 2;
4489                kfree(n);
4490                mutex_lock(&cache_chain_mutex);
4491                /* Now make sure this entry will be retried */
4492                m->count = m->size;
4493                return 0;
4494        }
4495        for (i = 0; i < n[1]; i++) {
4496                seq_printf(m, "%s: %lu ", name, n[2*i+3]);
4497                show_symbol(m, n[2*i+2]);
4498                seq_putc(m, '\n');
4499        }
4500
4501        return 0;
4502}
4503
4504static const struct seq_operations slabstats_op = {
4505        .start = leaks_start,
4506        .next = s_next,
4507        .stop = s_stop,
4508        .show = leaks_show,
4509};
4510
4511static int slabstats_open(struct inode *inode, struct file *file)
4512{
4513        unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL);
4514        int ret = -ENOMEM;
4515        if (n) {
4516                ret = seq_open(file, &slabstats_op);
4517                if (!ret) {
4518                        struct seq_file *m = file->private_data;
4519                        *n = PAGE_SIZE / (2 * sizeof(unsigned long));
4520                        m->private = n;
4521                        n = NULL;
4522                }
4523                kfree(n);
4524        }
4525        return ret;
4526}
4527
4528static const struct file_operations proc_slabstats_operations = {
4529        .open           = slabstats_open,
4530        .read           = seq_read,
4531        .llseek         = seq_lseek,
4532        .release        = seq_release_private,
4533};
4534#endif
4535
4536static int __init slab_proc_init(void)
4537{
4538        proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations);
4539#ifdef CONFIG_DEBUG_SLAB_LEAK
4540        proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);
4541#endif
4542        return 0;
4543}
4544module_init(slab_proc_init);
4545#endif
4546
4547/**
4548 * ksize - get the actual amount of memory allocated for a given object
4549 * @objp: Pointer to the object
4550 *
4551 * kmalloc may internally round up allocations and return more memory
4552 * than requested. ksize() can be used to determine the actual amount of
4553 * memory allocated. The caller may use this additional memory, even though
4554 * a smaller amount of memory was initially specified with the kmalloc call.
4555 * The caller must guarantee that objp points to a valid object previously
4556 * allocated with either kmalloc() or kmem_cache_alloc(). The object
4557 * must not be freed during the duration of the call.
4558 */
4559size_t ksize(const void *objp)
4560{
4561        BUG_ON(!objp);
4562        if (unlikely(objp == ZERO_SIZE_PTR))
4563                return 0;
4564
4565        return obj_size(virt_to_cache(objp));
4566}
4567EXPORT_SYMBOL(ksize);
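
/*
 * Illustrative sketch: kmalloc() may round a request up to the size of the
 * backing cache, and ksize() reports the size that is actually usable (the
 * numbers below depend on the configuration and are only an example):
 *
 *	void *p = kmalloc(30, GFP_KERNEL);
 *	size_t usable = ksize(p);
 *
 * Here usable would typically be 32 rather than 30 on a configuration with
 * a 32 byte kmalloc cache.
 */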
4568