   1/*
   2 * linux/mm/slab.c
   3 * Written by Mark Hemment, 1996/97.
   4 * (markhe@nextd.demon.co.uk)
   5 *
   6 * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
   7 *
   8 * Major cleanup, different bufctl logic, per-cpu arrays
   9 *      (c) 2000 Manfred Spraul
  10 *
  11 * Cleanup, make the head arrays unconditional, preparation for NUMA
  12 *      (c) 2002 Manfred Spraul
  13 *
  14 * An implementation of the Slab Allocator as described in outline in;
  15 *      UNIX Internals: The New Frontiers by Uresh Vahalia
  16 *      Pub: Prentice Hall      ISBN 0-13-101908-2
  17 * or with a little more detail in;
  18 *      The Slab Allocator: An Object-Caching Kernel Memory Allocator
  19 *      Jeff Bonwick (Sun Microsystems).
  20 *      Presented at: USENIX Summer 1994 Technical Conference
  21 *
  22 * The memory is organized in caches, one cache for each object type.
  23 * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
   24 * Each cache consists of many slabs (they are small (usually one
  25 * page long) and always contiguous), and each slab contains multiple
  26 * initialized objects.
  27 *
   28 * This means that your constructor is used only for newly allocated
  29 * slabs and you must pass objects with the same initializations to
  30 * kmem_cache_free.
  31 *
  32 * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
   33 * normal). If you need a special memory type, then you must create a new
  34 * cache for that memory type.
  35 *
  36 * In order to reduce fragmentation, the slabs are sorted in 3 groups:
  37 *   full slabs with 0 free objects
  38 *   partial slabs
  39 *   empty slabs with no allocated objects
  40 *
  41 * If partial slabs exist, then new allocations come from these slabs,
  42 * otherwise from empty slabs or new slabs are allocated.
  43 *
  44 * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
  45 * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
  46 *
  47 * Each cache has a short per-cpu head array, most allocs
  48 * and frees go into that array, and if that array overflows, then 1/2
  49 * of the entries in the array are given back into the global cache.
  50 * The head array is strictly LIFO and should improve the cache hit rates.
  51 * On SMP, it additionally reduces the spinlock operations.
  52 *
   53 * The c_cpuarray may not be read with local interrupts enabled -
  54 * it's changed with a smp_call_function().
  55 *
  56 * SMP synchronization:
  57 *  constructors and destructors are called without any locking.
   58 *  Several members in struct kmem_cache and struct slab never change; they
  59 *      are accessed without any locking.
  60 *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
  61 *      and local interrupts are disabled so slab code is preempt-safe.
  62 *  The non-constant members are protected with a per-cache irq spinlock.
  63 *
  64 * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
  65 * in 2000 - many ideas in the current implementation are derived from
  66 * his patch.
  67 *
  68 * Further notes from the original documentation:
  69 *
  70 * 11 April '97.  Started multi-threading - markhe
  71 *      The global cache-chain is protected by the mutex 'slab_mutex'.
   72 *      The mutex is only needed when accessing/extending the cache-chain, which
  73 *      can never happen inside an interrupt (kmem_cache_create(),
  74 *      kmem_cache_shrink() and kmem_cache_reap()).
  75 *
  76 *      At present, each engine can be growing a cache.  This should be blocked.
  77 *
  78 * 15 March 2005. NUMA slab allocator.
  79 *      Shai Fultheim <shai@scalex86.org>.
  80 *      Shobhit Dayal <shobhit@calsoftinc.com>
  81 *      Alok N Kataria <alokk@calsoftinc.com>
  82 *      Christoph Lameter <christoph@lameter.com>
  83 *
  84 *      Modified the slab allocator to be node aware on NUMA systems.
  85 *      Each node has its own list of partial, free and full slabs.
  86 *      All object allocations for a node occur from node specific slab lists.
  87 */
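/*
 * A minimal usage sketch of the cache API described above; the "foo"
 * cache, struct foo and foo_ctor() are hypothetical and only serve as
 * illustration:
 *
 *	struct foo {
 *		spinlock_t lock;
 *		struct list_head list;
 *	};
 *
 *	static struct kmem_cache *foo_cachep;
 *
 *	static void foo_ctor(void *obj)
 *	{
 *		struct foo *f = obj;
 *
 *		spin_lock_init(&f->lock);
 *		INIT_LIST_HEAD(&f->list);
 *	}
 *
 *	foo_cachep = kmem_cache_create("foo", sizeof(struct foo), 0,
 *				       SLAB_HWCACHE_ALIGN, foo_ctor);
 *	f = kmem_cache_alloc(foo_cachep, GFP_KERNEL);
 *	...
 *	kmem_cache_free(foo_cachep, f);
 *
 * The object must be handed back to kmem_cache_free() in its constructed
 * state, because foo_ctor() only runs when a new slab is allocated, not
 * on every kmem_cache_alloc().
 */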
  88
  89#include        <linux/slab.h>
  90#include        <linux/mm.h>
  91#include        <linux/poison.h>
  92#include        <linux/swap.h>
  93#include        <linux/cache.h>
  94#include        <linux/interrupt.h>
  95#include        <linux/init.h>
  96#include        <linux/compiler.h>
  97#include        <linux/cpuset.h>
  98#include        <linux/proc_fs.h>
  99#include        <linux/seq_file.h>
 100#include        <linux/notifier.h>
 101#include        <linux/kallsyms.h>
 102#include        <linux/cpu.h>
 103#include        <linux/sysctl.h>
 104#include        <linux/module.h>
 105#include        <linux/rcupdate.h>
 106#include        <linux/string.h>
 107#include        <linux/uaccess.h>
 108#include        <linux/nodemask.h>
 109#include        <linux/kmemleak.h>
 110#include        <linux/mempolicy.h>
 111#include        <linux/mutex.h>
 112#include        <linux/fault-inject.h>
 113#include        <linux/rtmutex.h>
 114#include        <linux/reciprocal_div.h>
 115#include        <linux/debugobjects.h>
 116#include        <linux/kmemcheck.h>
 117#include        <linux/memory.h>
 118#include        <linux/prefetch.h>
 119
 120#include        <net/sock.h>
 121
 122#include        <asm/cacheflush.h>
 123#include        <asm/tlbflush.h>
 124#include        <asm/page.h>
 125
 126#include <trace/events/kmem.h>
 127
 128#include        "internal.h"
 129
 130#include        "slab.h"
 131
 132/*
  133 * DEBUG        - 1 for kmem_cache_create() to honour SLAB_RED_ZONE & SLAB_POISON.
 134 *                0 for faster, smaller code (especially in the critical paths).
 135 *
 136 * STATS        - 1 to collect stats for /proc/slabinfo.
 137 *                0 for faster, smaller code (especially in the critical paths).
 138 *
 139 * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
 140 */
 141
 142#ifdef CONFIG_DEBUG_SLAB
 143#define DEBUG           1
 144#define STATS           1
 145#define FORCED_DEBUG    1
 146#else
 147#define DEBUG           0
 148#define STATS           0
 149#define FORCED_DEBUG    0
 150#endif
 151
 152/* Shouldn't this be in a header file somewhere? */
 153#define BYTES_PER_WORD          sizeof(void *)
 154#define REDZONE_ALIGN           max(BYTES_PER_WORD, __alignof__(unsigned long long))
 155
 156#ifndef ARCH_KMALLOC_FLAGS
 157#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
 158#endif
 159
 160/*
 161 * true if a page was allocated from pfmemalloc reserves for network-based
 162 * swap
 163 */
 164static bool pfmemalloc_active __read_mostly;
 165
 166/*
 167 * kmem_bufctl_t:
 168 *
  169 * Bufctls are used for linking objs within a slab: each entry holds
  170 * the index of the next free object.
 171 *
 172 * This implementation relies on "struct page" for locating the cache &
 173 * slab an object belongs to.
 174 * This allows the bufctl structure to be small (one int), but limits
 175 * the number of objects a slab (not a cache) can contain when off-slab
 176 * bufctls are used. The limit is the size of the largest general cache
 177 * that does not use off-slab slabs.
  178 * For 32bit archs with 4 kB pages, this is 56.
 179 * This is not serious, as it is only for large objects, when it is unwise
 180 * to have too many per slab.
 181 * Note: This limit can be raised by introducing a general cache whose size
  182 * is less than 512 (PAGE_SIZE>>3), but greater than 256.
 183 */
 184
 185typedef unsigned int kmem_bufctl_t;
 186#define BUFCTL_END      (((kmem_bufctl_t)(~0U))-0)
 187#define BUFCTL_FREE     (((kmem_bufctl_t)(~0U))-1)
 188#define BUFCTL_ACTIVE   (((kmem_bufctl_t)(~0U))-2)
 189#define SLAB_LIMIT      (((kmem_bufctl_t)(~0U))-3)
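/*
 * A sketch of how this free list is consumed on allocation, assuming the
 * usual slab_bufctl() helper that returns the kmem_bufctl_t array placed
 * directly behind struct slab:
 *
 *	void *objp;
 *	kmem_bufctl_t next;
 *
 *	objp = index_to_obj(cachep, slabp, slabp->free);
 *	slabp->inuse++;
 *	next = slab_bufctl(slabp)[slabp->free];
 *	slabp->free = next;
 *
 * Each bufctl entry holds the index of the next free object in the slab,
 * and BUFCTL_END terminates the chain (slabp->free == BUFCTL_END means
 * the slab is full).
 */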
 190
 191/*
 192 * struct slab_rcu
 193 *
 194 * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
 195 * arrange for kmem_freepages to be called via RCU.  This is useful if
 196 * we need to approach a kernel structure obliquely, from its address
 197 * obtained without the usual locking.  We can lock the structure to
 198 * stabilize it and check it's still at the given address, only if we
 199 * can be sure that the memory has not been meanwhile reused for some
 200 * other kind of object (which our subsystem's lock might corrupt).
 201 *
 202 * rcu_read_lock before reading the address, then rcu_read_unlock after
 203 * taking the spinlock within the structure expected at that address.
 204 */
 205struct slab_rcu {
 206        struct rcu_head head;
 207        struct kmem_cache *cachep;
 208        void *addr;
 209};
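/*
 * The lookup pattern described above looks roughly like this; the "foo"
 * structure, foo_tree and the key field are hypothetical:
 *
 *	rcu_read_lock();
 *	f = radix_tree_lookup(&foo_tree, key);     (any lockless lookup)
 *	if (f) {
 *		spin_lock(&f->lock);
 *		if (f->key != key) {
 *			(the object was freed and reused meanwhile)
 *			spin_unlock(&f->lock);
 *			f = NULL;
 *		}
 *	}
 *	rcu_read_unlock();
 *
 * This is only safe if the cache was created with SLAB_DESTROY_BY_RCU,
 * so the underlying pages cannot be returned to the page allocator (and
 * reused for a different kind of object) before a grace period elapses.
 */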
 210
 211/*
 212 * struct slab
 213 *
 214 * Manages the objs in a slab. Placed either at the beginning of mem allocated
  215 * for a slab, or allocated from a general cache.
  216 * Slabs are chained into three lists: fully used, partial, fully free slabs.
 217 */
 218struct slab {
 219        union {
 220                struct {
 221                        struct list_head list;
 222                        unsigned long colouroff;
 223                        void *s_mem;            /* including colour offset */
 224                        unsigned int inuse;     /* num of objs active in slab */
 225                        kmem_bufctl_t free;
 226                        unsigned short nodeid;
 227                };
 228                struct slab_rcu __slab_cover_slab_rcu;
 229        };
 230};
 231
 232/*
 233 * struct array_cache
 234 *
 235 * Purpose:
 236 * - LIFO ordering, to hand out cache-warm objects from _alloc
 237 * - reduce the number of linked list operations
 238 * - reduce spinlock operations
 239 *
 240 * The limit is stored in the per-cpu structure to reduce the data cache
 241 * footprint.
 242 *
 243 */
 244struct array_cache {
 245        unsigned int avail;
 246        unsigned int limit;
 247        unsigned int batchcount;
 248        unsigned int touched;
 249        spinlock_t lock;
 250        void *entry[];  /*
 251                         * Must have this definition in here for the proper
 252                         * alignment of array_cache. Also simplifies accessing
 253                         * the entries.
 254                         *
 255                         * Entries should not be directly dereferenced as
 256                         * entries belonging to slabs marked pfmemalloc will
  257                         * have their low bit set to SLAB_OBJ_PFMEMALLOC.
 258                         */
 259};
 260
 261#define SLAB_OBJ_PFMEMALLOC     1
 262static inline bool is_obj_pfmemalloc(void *objp)
 263{
 264        return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC;
 265}
 266
 267static inline void set_obj_pfmemalloc(void **objp)
 268{
 269        *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC);
 270        return;
 271}
 272
 273static inline void clear_obj_pfmemalloc(void **objp)
 274{
 275        *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC);
 276}
 277
 278/*
 279 * bootstrap: The caches do not work without cpuarrays anymore, but the
 280 * cpuarrays are allocated from the generic caches...
 281 */
 282#define BOOT_CPUCACHE_ENTRIES   1
 283struct arraycache_init {
 284        struct array_cache cache;
 285        void *entries[BOOT_CPUCACHE_ENTRIES];
 286};
 287
 288/*
 289 * Need this for bootstrapping a per node allocator.
 290 */
 291#define NUM_INIT_LISTS (3 * MAX_NUMNODES)
 292static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS];
 293#define CACHE_CACHE 0
 294#define SIZE_AC MAX_NUMNODES
 295#define SIZE_NODE (2 * MAX_NUMNODES)
 296
 297static int drain_freelist(struct kmem_cache *cache,
 298                        struct kmem_cache_node *n, int tofree);
 299static void free_block(struct kmem_cache *cachep, void **objpp, int len,
 300                        int node);
 301static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
 302static void cache_reap(struct work_struct *unused);
 303
 304static int slab_early_init = 1;
 305
 306#define INDEX_AC kmalloc_index(sizeof(struct arraycache_init))
 307#define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node))
 308
 309static void kmem_cache_node_init(struct kmem_cache_node *parent)
 310{
 311        INIT_LIST_HEAD(&parent->slabs_full);
 312        INIT_LIST_HEAD(&parent->slabs_partial);
 313        INIT_LIST_HEAD(&parent->slabs_free);
 314        parent->shared = NULL;
 315        parent->alien = NULL;
 316        parent->colour_next = 0;
 317        spin_lock_init(&parent->list_lock);
 318        parent->free_objects = 0;
 319        parent->free_touched = 0;
 320}
 321
 322#define MAKE_LIST(cachep, listp, slab, nodeid)                          \
 323        do {                                                            \
 324                INIT_LIST_HEAD(listp);                                  \
 325                list_splice(&(cachep->node[nodeid]->slab), listp);      \
 326        } while (0)
 327
 328#define MAKE_ALL_LISTS(cachep, ptr, nodeid)                             \
 329        do {                                                            \
 330        MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid);  \
 331        MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
 332        MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid);  \
 333        } while (0)
 334
 335#define CFLGS_OFF_SLAB          (0x80000000UL)
 336#define OFF_SLAB(x)     ((x)->flags & CFLGS_OFF_SLAB)
 337
 338#define BATCHREFILL_LIMIT       16
 339/*
  340 * Optimization question: fewer reaps mean a lower probability of unnecessary
 341 * cpucache drain/refill cycles.
 342 *
 343 * OTOH the cpuarrays can contain lots of objects,
 344 * which could lock up otherwise freeable slabs.
 345 */
 346#define REAPTIMEOUT_CPUC        (2*HZ)
 347#define REAPTIMEOUT_LIST3       (4*HZ)
 348
 349#if STATS
 350#define STATS_INC_ACTIVE(x)     ((x)->num_active++)
 351#define STATS_DEC_ACTIVE(x)     ((x)->num_active--)
 352#define STATS_INC_ALLOCED(x)    ((x)->num_allocations++)
 353#define STATS_INC_GROWN(x)      ((x)->grown++)
 354#define STATS_ADD_REAPED(x,y)   ((x)->reaped += (y))
 355#define STATS_SET_HIGH(x)                                               \
 356        do {                                                            \
 357                if ((x)->num_active > (x)->high_mark)                   \
 358                        (x)->high_mark = (x)->num_active;               \
 359        } while (0)
 360#define STATS_INC_ERR(x)        ((x)->errors++)
 361#define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++)
 362#define STATS_INC_NODEFREES(x)  ((x)->node_frees++)
 363#define STATS_INC_ACOVERFLOW(x)   ((x)->node_overflow++)
 364#define STATS_SET_FREEABLE(x, i)                                        \
 365        do {                                                            \
 366                if ((x)->max_freeable < i)                              \
 367                        (x)->max_freeable = i;                          \
 368        } while (0)
 369#define STATS_INC_ALLOCHIT(x)   atomic_inc(&(x)->allochit)
 370#define STATS_INC_ALLOCMISS(x)  atomic_inc(&(x)->allocmiss)
 371#define STATS_INC_FREEHIT(x)    atomic_inc(&(x)->freehit)
 372#define STATS_INC_FREEMISS(x)   atomic_inc(&(x)->freemiss)
 373#else
 374#define STATS_INC_ACTIVE(x)     do { } while (0)
 375#define STATS_DEC_ACTIVE(x)     do { } while (0)
 376#define STATS_INC_ALLOCED(x)    do { } while (0)
 377#define STATS_INC_GROWN(x)      do { } while (0)
 378#define STATS_ADD_REAPED(x,y)   do { (void)(y); } while (0)
 379#define STATS_SET_HIGH(x)       do { } while (0)
 380#define STATS_INC_ERR(x)        do { } while (0)
 381#define STATS_INC_NODEALLOCS(x) do { } while (0)
 382#define STATS_INC_NODEFREES(x)  do { } while (0)
 383#define STATS_INC_ACOVERFLOW(x)   do { } while (0)
 384#define STATS_SET_FREEABLE(x, i) do { } while (0)
 385#define STATS_INC_ALLOCHIT(x)   do { } while (0)
 386#define STATS_INC_ALLOCMISS(x)  do { } while (0)
 387#define STATS_INC_FREEHIT(x)    do { } while (0)
 388#define STATS_INC_FREEMISS(x)   do { } while (0)
 389#endif
 390
 391#if DEBUG
 392
 393/*
 394 * memory layout of objects:
 395 * 0            : objp
 396 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
 397 *              the end of an object is aligned with the end of the real
 398 *              allocation. Catches writes behind the end of the allocation.
 399 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
 400 *              redzone word.
 401 * cachep->obj_offset: The real object.
 402 * cachep->size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
 403 * cachep->size - 1* BYTES_PER_WORD: last caller address
 404 *                                      [BYTES_PER_WORD long]
 405 */
 406static int obj_offset(struct kmem_cache *cachep)
 407{
 408        return cachep->obj_offset;
 409}
 410
 411static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
 412{
 413        BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
 414        return (unsigned long long*) (objp + obj_offset(cachep) -
 415                                      sizeof(unsigned long long));
 416}
 417
 418static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
 419{
 420        BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
 421        if (cachep->flags & SLAB_STORE_USER)
 422                return (unsigned long long *)(objp + cachep->size -
 423                                              sizeof(unsigned long long) -
 424                                              REDZONE_ALIGN);
 425        return (unsigned long long *) (objp + cachep->size -
 426                                       sizeof(unsigned long long));
 427}
 428
 429static void **dbg_userword(struct kmem_cache *cachep, void *objp)
 430{
 431        BUG_ON(!(cachep->flags & SLAB_STORE_USER));
 432        return (void **)(objp + cachep->size - BYTES_PER_WORD);
 433}
 434
 435#else
 436
 437#define obj_offset(x)                   0
 438#define dbg_redzone1(cachep, objp)      ({BUG(); (unsigned long long *)NULL;})
 439#define dbg_redzone2(cachep, objp)      ({BUG(); (unsigned long long *)NULL;})
 440#define dbg_userword(cachep, objp)      ({BUG(); (void **)NULL;})
 441
 442#endif
 443
 444/*
  445 * Do not go above this order unless 0 objects fit into the slab, or the
  446 * limit is overridden on the command line.
 447 */
 448#define SLAB_MAX_ORDER_HI       1
 449#define SLAB_MAX_ORDER_LO       0
 450static int slab_max_order = SLAB_MAX_ORDER_LO;
 451static bool slab_max_order_set __initdata;
 452
 453static inline struct kmem_cache *virt_to_cache(const void *obj)
 454{
 455        struct page *page = virt_to_head_page(obj);
 456        return page->slab_cache;
 457}
 458
 459static inline struct slab *virt_to_slab(const void *obj)
 460{
 461        struct page *page = virt_to_head_page(obj);
 462
 463        VM_BUG_ON(!PageSlab(page));
 464        return page->slab_page;
 465}
 466
 467static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
 468                                 unsigned int idx)
 469{
 470        return slab->s_mem + cache->size * idx;
 471}
 472
 473/*
 474 * We want to avoid an expensive divide : (offset / cache->size)
 475 *   Using the fact that size is a constant for a particular cache,
 476 *   we can replace (offset / cache->size) by
 477 *   reciprocal_divide(offset, cache->reciprocal_buffer_size)
 478 */
 479static inline unsigned int obj_to_index(const struct kmem_cache *cache,
 480                                        const struct slab *slab, void *obj)
 481{
 482        u32 offset = (obj - slab->s_mem);
 483        return reciprocal_divide(offset, cache->reciprocal_buffer_size);
 484}
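/*
 * A worked example of the reciprocal trick, assuming the cache was set up
 * with cache->reciprocal_buffer_size = reciprocal_value(cache->size):
 *
 *	cache->size = 192;
 *	cache->reciprocal_buffer_size = reciprocal_value(192);
 *
 *	offset = obj - slab->s_mem;	(e.g. 960)
 *	index = reciprocal_divide(offset, cache->reciprocal_buffer_size);
 *	      = 960 / 192 = 5
 *
 * A multiply and a shift replace the integer division on the hot path.
 */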
 485
 486static struct arraycache_init initarray_generic =
 487    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
 488
 489/* internal cache of cache description objs */
 490static struct kmem_cache kmem_cache_boot = {
 491        .batchcount = 1,
 492        .limit = BOOT_CPUCACHE_ENTRIES,
 493        .shared = 1,
 494        .size = sizeof(struct kmem_cache),
 495        .name = "kmem_cache",
 496};
 497
 498#define BAD_ALIEN_MAGIC 0x01020304ul
 499
 500#ifdef CONFIG_LOCKDEP
 501
 502/*
 503 * Slab sometimes uses the kmalloc slabs to store the slab headers
 504 * for other slabs "off slab".
 505 * The locking for this is tricky in that it nests within the locks
 506 * of all other slabs in a few places; to deal with this special
 507 * locking we put on-slab caches into a separate lock-class.
 508 *
  509 * We set the lock class for alien array caches which are up during init.
  510 * The lock annotation will be lost if all cpus of a node go down and
  511 * then come back up during hotplug.
 512 */
 513static struct lock_class_key on_slab_l3_key;
 514static struct lock_class_key on_slab_alc_key;
 515
 516static struct lock_class_key debugobj_l3_key;
 517static struct lock_class_key debugobj_alc_key;
 518
 519static void slab_set_lock_classes(struct kmem_cache *cachep,
 520                struct lock_class_key *l3_key, struct lock_class_key *alc_key,
 521                int q)
 522{
 523        struct array_cache **alc;
 524        struct kmem_cache_node *n;
 525        int r;
 526
 527        n = cachep->node[q];
 528        if (!n)
 529                return;
 530
 531        lockdep_set_class(&n->list_lock, l3_key);
 532        alc = n->alien;
 533        /*
 534         * FIXME: This check for BAD_ALIEN_MAGIC
 535         * should go away when common slab code is taught to
 536         * work even without alien caches.
  537         * Currently, non-NUMA code returns BAD_ALIEN_MAGIC
  538         * from alloc_alien_cache().
 539         */
 540        if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
 541                return;
 542        for_each_node(r) {
 543                if (alc[r])
 544                        lockdep_set_class(&alc[r]->lock, alc_key);
 545        }
 546}
 547
 548static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
 549{
 550        slab_set_lock_classes(cachep, &debugobj_l3_key, &debugobj_alc_key, node);
 551}
 552
 553static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
 554{
 555        int node;
 556
 557        for_each_online_node(node)
 558                slab_set_debugobj_lock_classes_node(cachep, node);
 559}
 560
 561static void init_node_lock_keys(int q)
 562{
 563        int i;
 564
 565        if (slab_state < UP)
 566                return;
 567
 568        for (i = 1; i <= KMALLOC_SHIFT_HIGH; i++) {
 569                struct kmem_cache_node *n;
 570                struct kmem_cache *cache = kmalloc_caches[i];
 571
 572                if (!cache)
 573                        continue;
 574
 575                n = cache->node[q];
 576                if (!n || OFF_SLAB(cache))
 577                        continue;
 578
 579                slab_set_lock_classes(cache, &on_slab_l3_key,
 580                                &on_slab_alc_key, q);
 581        }
 582}
 583
 584static void on_slab_lock_classes_node(struct kmem_cache *cachep, int q)
 585{
 586        if (!cachep->node[q])
 587                return;
 588
 589        slab_set_lock_classes(cachep, &on_slab_l3_key,
 590                        &on_slab_alc_key, q);
 591}
 592
 593static inline void on_slab_lock_classes(struct kmem_cache *cachep)
 594{
 595        int node;
 596
 597        VM_BUG_ON(OFF_SLAB(cachep));
 598        for_each_node(node)
 599                on_slab_lock_classes_node(cachep, node);
 600}
 601
 602static inline void init_lock_keys(void)
 603{
 604        int node;
 605
 606        for_each_node(node)
 607                init_node_lock_keys(node);
 608}
 609#else
 610static void init_node_lock_keys(int q)
 611{
 612}
 613
 614static inline void init_lock_keys(void)
 615{
 616}
 617
 618static inline void on_slab_lock_classes(struct kmem_cache *cachep)
 619{
 620}
 621
 622static inline void on_slab_lock_classes_node(struct kmem_cache *cachep, int node)
 623{
 624}
 625
 626static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
 627{
 628}
 629
 630static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
 631{
 632}
 633#endif
 634
 635static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
 636
 637static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
 638{
 639        return cachep->array[smp_processor_id()];
 640}
 641
 642static size_t slab_mgmt_size(size_t nr_objs, size_t align)
 643{
 644        return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
 645}
 646
 647/*
 648 * Calculate the number of objects and left-over bytes for a given buffer size.
 649 */
 650static void cache_estimate(unsigned long gfporder, size_t buffer_size,
 651                           size_t align, int flags, size_t *left_over,
 652                           unsigned int *num)
 653{
 654        int nr_objs;
 655        size_t mgmt_size;
 656        size_t slab_size = PAGE_SIZE << gfporder;
 657
 658        /*
 659         * The slab management structure can be either off the slab or
 660         * on it. For the latter case, the memory allocated for a
 661         * slab is used for:
 662         *
 663         * - The struct slab
 664         * - One kmem_bufctl_t for each object
 665         * - Padding to respect alignment of @align
 666         * - @buffer_size bytes for each object
 667         *
 668         * If the slab management structure is off the slab, then the
 669         * alignment will already be calculated into the size. Because
 670         * the slabs are all pages aligned, the objects will be at the
 671         * correct alignment when allocated.
 672         */
 673        if (flags & CFLGS_OFF_SLAB) {
 674                mgmt_size = 0;
 675                nr_objs = slab_size / buffer_size;
 676
 677                if (nr_objs > SLAB_LIMIT)
 678                        nr_objs = SLAB_LIMIT;
 679        } else {
 680                /*
 681                 * Ignore padding for the initial guess. The padding
 682                 * is at most @align-1 bytes, and @buffer_size is at
 683                 * least @align. In the worst case, this result will
 684                 * be one greater than the number of objects that fit
 685                 * into the memory allocation when taking the padding
 686                 * into account.
 687                 */
 688                nr_objs = (slab_size - sizeof(struct slab)) /
 689                          (buffer_size + sizeof(kmem_bufctl_t));
 690
 691                /*
 692                 * This calculated number will be either the right
 693                 * amount, or one greater than what we want.
 694                 */
 695                if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
 696                       > slab_size)
 697                        nr_objs--;
 698
 699                if (nr_objs > SLAB_LIMIT)
 700                        nr_objs = SLAB_LIMIT;
 701
 702                mgmt_size = slab_mgmt_size(nr_objs, align);
 703        }
 704        *num = nr_objs;
 705        *left_over = slab_size - nr_objs*buffer_size - mgmt_size;
 706}
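/*
 * A worked example of the on-slab case, assuming a 4096 byte slab
 * (gfporder = 0), buffer_size = 256, align = 8, sizeof(struct slab) == 48
 * and sizeof(kmem_bufctl_t) == 4 (typical 64-bit values):
 *
 *	nr_objs = (4096 - 48) / (256 + 4) = 15
 *	slab_mgmt_size(15, 8) = ALIGN(48 + 15 * 4, 8) = 112
 *	112 + 15 * 256 = 3952 <= 4096, so nr_objs stays at 15
 *	*num = 15, *left_over = 4096 - 15 * 256 - 112 = 144
 *
 * The 144 left-over bytes are later spread across successive slabs as the
 * colour offset to reduce cache-line aliasing between slabs.
 */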
 707
 708#if DEBUG
 709#define slab_error(cachep, msg) __slab_error(__func__, cachep, msg)
 710
 711static void __slab_error(const char *function, struct kmem_cache *cachep,
 712                        char *msg)
 713{
 714        printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
 715               function, cachep->name, msg);
 716        dump_stack();
 717        add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
 718}
 719#endif
 720
 721/*
 722 * By default on NUMA we use alien caches to stage the freeing of
 723 * objects allocated from other nodes. This causes massive memory
  724 * inefficiencies when using a fake NUMA setup to split memory into a
  725 * large number of small nodes, so it can be disabled on the command
  726 * line.
  727 */
 728
 729static int use_alien_caches __read_mostly = 1;
 730static int __init noaliencache_setup(char *s)
 731{
 732        use_alien_caches = 0;
 733        return 1;
 734}
 735__setup("noaliencache", noaliencache_setup);
 736
 737static int __init slab_max_order_setup(char *str)
 738{
 739        get_option(&str, &slab_max_order);
 740        slab_max_order = slab_max_order < 0 ? 0 :
 741                                min(slab_max_order, MAX_ORDER - 1);
 742        slab_max_order_set = true;
 743
 744        return 1;
 745}
 746__setup("slab_max_order=", slab_max_order_setup);
 747
 748#ifdef CONFIG_NUMA
 749/*
 750 * Special reaping functions for NUMA systems called from cache_reap().
  751 * These take care of doing round-robin flushing of alien caches (containing
  752 * objects freed on a node different from the one they were allocated on) and
  753 * the flushing of remote pcps by calling drain_node_pages.
 754 */
 755static DEFINE_PER_CPU(unsigned long, slab_reap_node);
 756
 757static void init_reap_node(int cpu)
 758{
 759        int node;
 760
 761        node = next_node(cpu_to_mem(cpu), node_online_map);
 762        if (node == MAX_NUMNODES)
 763                node = first_node(node_online_map);
 764
 765        per_cpu(slab_reap_node, cpu) = node;
 766}
 767
 768static void next_reap_node(void)
 769{
 770        int node = __this_cpu_read(slab_reap_node);
 771
 772        node = next_node(node, node_online_map);
 773        if (unlikely(node >= MAX_NUMNODES))
 774                node = first_node(node_online_map);
 775        __this_cpu_write(slab_reap_node, node);
 776}
 777
 778#else
 779#define init_reap_node(cpu) do { } while (0)
 780#define next_reap_node(void) do { } while (0)
 781#endif
 782
 783/*
 784 * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
 785 * via the workqueue/eventd.
 786 * Add the CPU number into the expiration time to minimize the possibility of
 787 * the CPUs getting into lockstep and contending for the global cache chain
 788 * lock.
 789 */
 790static void start_cpu_timer(int cpu)
 791{
 792        struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu);
 793
 794        /*
 795         * When this gets called from do_initcalls via cpucache_init(),
 796         * init_workqueues() has already run, so keventd will be setup
 797         * at that time.
 798         */
 799        if (keventd_up() && reap_work->work.func == NULL) {
 800                init_reap_node(cpu);
 801                INIT_DEFERRABLE_WORK(reap_work, cache_reap);
 802                schedule_delayed_work_on(cpu, reap_work,
 803                                        __round_jiffies_relative(HZ, cpu));
 804        }
 805}
 806
 807static struct array_cache *alloc_arraycache(int node, int entries,
 808                                            int batchcount, gfp_t gfp)
 809{
 810        int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
 811        struct array_cache *nc = NULL;
 812
 813        nc = kmalloc_node(memsize, gfp, node);
 814        /*
  815         * The array_cache structures contain pointers to free objects.
 816         * However, when such objects are allocated or transferred to another
 817         * cache the pointers are not cleared and they could be counted as
 818         * valid references during a kmemleak scan. Therefore, kmemleak must
 819         * not scan such objects.
 820         */
 821        kmemleak_no_scan(nc);
 822        if (nc) {
 823                nc->avail = 0;
 824                nc->limit = entries;
 825                nc->batchcount = batchcount;
 826                nc->touched = 0;
 827                spin_lock_init(&nc->lock);
 828        }
 829        return nc;
 830}
 831
 832static inline bool is_slab_pfmemalloc(struct slab *slabp)
 833{
 834        struct page *page = virt_to_page(slabp->s_mem);
 835
 836        return PageSlabPfmemalloc(page);
 837}
 838
  839/* Clears pfmemalloc_active if no slabs have pfmemalloc set */
 840static void recheck_pfmemalloc_active(struct kmem_cache *cachep,
 841                                                struct array_cache *ac)
 842{
 843        struct kmem_cache_node *n = cachep->node[numa_mem_id()];
 844        struct slab *slabp;
 845        unsigned long flags;
 846
 847        if (!pfmemalloc_active)
 848                return;
 849
 850        spin_lock_irqsave(&n->list_lock, flags);
 851        list_for_each_entry(slabp, &n->slabs_full, list)
 852                if (is_slab_pfmemalloc(slabp))
 853                        goto out;
 854
 855        list_for_each_entry(slabp, &n->slabs_partial, list)
 856                if (is_slab_pfmemalloc(slabp))
 857                        goto out;
 858
 859        list_for_each_entry(slabp, &n->slabs_free, list)
 860                if (is_slab_pfmemalloc(slabp))
 861                        goto out;
 862
 863        pfmemalloc_active = false;
 864out:
 865        spin_unlock_irqrestore(&n->list_lock, flags);
 866}
 867
 868static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
 869                                                gfp_t flags, bool force_refill)
 870{
 871        int i;
 872        void *objp = ac->entry[--ac->avail];
 873
 874        /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */
 875        if (unlikely(is_obj_pfmemalloc(objp))) {
 876                struct kmem_cache_node *n;
 877
 878                if (gfp_pfmemalloc_allowed(flags)) {
 879                        clear_obj_pfmemalloc(&objp);
 880                        return objp;
 881                }
 882
 883                /* The caller cannot use PFMEMALLOC objects, find another one */
 884                for (i = 0; i < ac->avail; i++) {
 885                        /* If a !PFMEMALLOC object is found, swap them */
 886                        if (!is_obj_pfmemalloc(ac->entry[i])) {
 887                                objp = ac->entry[i];
 888                                ac->entry[i] = ac->entry[ac->avail];
 889                                ac->entry[ac->avail] = objp;
 890                                return objp;
 891                        }
 892                }
 893
 894                /*
 895                 * If there are empty slabs on the slabs_free list and we are
 896                 * being forced to refill the cache, mark this one !pfmemalloc.
 897                 */
 898                n = cachep->node[numa_mem_id()];
 899                if (!list_empty(&n->slabs_free) && force_refill) {
 900                        struct slab *slabp = virt_to_slab(objp);
 901                        ClearPageSlabPfmemalloc(virt_to_head_page(slabp->s_mem));
 902                        clear_obj_pfmemalloc(&objp);
 903                        recheck_pfmemalloc_active(cachep, ac);
 904                        return objp;
 905                }
 906
 907                /* No !PFMEMALLOC objects available */
 908                ac->avail++;
 909                objp = NULL;
 910        }
 911
 912        return objp;
 913}
 914
 915static inline void *ac_get_obj(struct kmem_cache *cachep,
 916                        struct array_cache *ac, gfp_t flags, bool force_refill)
 917{
 918        void *objp;
 919
 920        if (unlikely(sk_memalloc_socks()))
 921                objp = __ac_get_obj(cachep, ac, flags, force_refill);
 922        else
 923                objp = ac->entry[--ac->avail];
 924
 925        return objp;
 926}
 927
 928static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
 929                                                                void *objp)
 930{
 931        if (unlikely(pfmemalloc_active)) {
 932                /* Some pfmemalloc slabs exist, check if this is one */
 933                struct page *page = virt_to_head_page(objp);
 934                if (PageSlabPfmemalloc(page))
 935                        set_obj_pfmemalloc(&objp);
 936        }
 937
 938        return objp;
 939}
 940
 941static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
 942                                                                void *objp)
 943{
 944        if (unlikely(sk_memalloc_socks()))
 945                objp = __ac_put_obj(cachep, ac, objp);
 946
 947        ac->entry[ac->avail++] = objp;
 948}
 949
 950/*
 951 * Transfer objects in one arraycache to another.
 952 * Locking must be handled by the caller.
 953 *
 954 * Return the number of entries transferred.
 955 */
 956static int transfer_objects(struct array_cache *to,
 957                struct array_cache *from, unsigned int max)
 958{
 959        /* Figure out how many entries to transfer */
 960        int nr = min3(from->avail, max, to->limit - to->avail);
 961
 962        if (!nr)
 963                return 0;
 964
  965        memcpy(to->entry + to->avail, from->entry + from->avail - nr,
  966                        sizeof(void *) * nr);
 967
 968        from->avail -= nr;
 969        to->avail += nr;
 970        return nr;
 971}
 972
 973#ifndef CONFIG_NUMA
 974
 975#define drain_alien_cache(cachep, alien) do { } while (0)
 976#define reap_alien(cachep, n) do { } while (0)
 977
 978static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
 979{
 980        return (struct array_cache **)BAD_ALIEN_MAGIC;
 981}
 982
 983static inline void free_alien_cache(struct array_cache **ac_ptr)
 984{
 985}
 986
 987static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
 988{
 989        return 0;
 990}
 991
 992static inline void *alternate_node_alloc(struct kmem_cache *cachep,
 993                gfp_t flags)
 994{
 995        return NULL;
 996}
 997
 998static inline void *____cache_alloc_node(struct kmem_cache *cachep,
 999                 gfp_t flags, int nodeid)
1000{
1001        return NULL;
1002}
1003
1004#else   /* CONFIG_NUMA */
1005
1006static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
1007static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
1008
1009static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
1010{
1011        struct array_cache **ac_ptr;
1012        int memsize = sizeof(void *) * nr_node_ids;
1013        int i;
1014
1015        if (limit > 1)
1016                limit = 12;
1017        ac_ptr = kzalloc_node(memsize, gfp, node);
1018        if (ac_ptr) {
1019                for_each_node(i) {
1020                        if (i == node || !node_online(i))
1021                                continue;
1022                        ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp);
1023                        if (!ac_ptr[i]) {
1024                                for (i--; i >= 0; i--)
1025                                        kfree(ac_ptr[i]);
1026                                kfree(ac_ptr);
1027                                return NULL;
1028                        }
1029                }
1030        }
1031        return ac_ptr;
1032}
1033
1034static void free_alien_cache(struct array_cache **ac_ptr)
1035{
1036        int i;
1037
1038        if (!ac_ptr)
1039                return;
1040        for_each_node(i)
1041            kfree(ac_ptr[i]);
1042        kfree(ac_ptr);
1043}
1044
1045static void __drain_alien_cache(struct kmem_cache *cachep,
1046                                struct array_cache *ac, int node)
1047{
1048        struct kmem_cache_node *n = cachep->node[node];
1049
1050        if (ac->avail) {
1051                spin_lock(&n->list_lock);
1052                /*
 1053                 * Stuff objects into the remote node's shared array first.
1054                 * That way we could avoid the overhead of putting the objects
1055                 * into the free lists and getting them back later.
1056                 */
1057                if (n->shared)
1058                        transfer_objects(n->shared, ac, ac->limit);
1059
1060                free_block(cachep, ac->entry, ac->avail, node);
1061                ac->avail = 0;
1062                spin_unlock(&n->list_lock);
1063        }
1064}
1065
1066/*
1067 * Called from cache_reap() to regularly drain alien caches round robin.
1068 */
1069static void reap_alien(struct kmem_cache *cachep, struct kmem_cache_node *n)
1070{
1071        int node = __this_cpu_read(slab_reap_node);
1072
1073        if (n->alien) {
1074                struct array_cache *ac = n->alien[node];
1075
1076                if (ac && ac->avail && spin_trylock_irq(&ac->lock)) {
1077                        __drain_alien_cache(cachep, ac, node);
1078                        spin_unlock_irq(&ac->lock);
1079                }
1080        }
1081}
1082
1083static void drain_alien_cache(struct kmem_cache *cachep,
1084                                struct array_cache **alien)
1085{
1086        int i = 0;
1087        struct array_cache *ac;
1088        unsigned long flags;
1089
1090        for_each_online_node(i) {
1091                ac = alien[i];
1092                if (ac) {
1093                        spin_lock_irqsave(&ac->lock, flags);
1094                        __drain_alien_cache(cachep, ac, i);
1095                        spin_unlock_irqrestore(&ac->lock, flags);
1096                }
1097        }
1098}
1099
1100static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1101{
1102        struct slab *slabp = virt_to_slab(objp);
1103        int nodeid = slabp->nodeid;
1104        struct kmem_cache_node *n;
1105        struct array_cache *alien = NULL;
1106        int node;
1107
1108        node = numa_mem_id();
1109
1110        /*
 1111         * Make sure we are not freeing an object from another node to the array
1112         * cache on this cpu.
1113         */
1114        if (likely(slabp->nodeid == node))
1115                return 0;
1116
1117        n = cachep->node[node];
1118        STATS_INC_NODEFREES(cachep);
1119        if (n->alien && n->alien[nodeid]) {
1120                alien = n->alien[nodeid];
1121                spin_lock(&alien->lock);
1122                if (unlikely(alien->avail == alien->limit)) {
1123                        STATS_INC_ACOVERFLOW(cachep);
1124                        __drain_alien_cache(cachep, alien, nodeid);
1125                }
1126                ac_put_obj(cachep, alien, objp);
1127                spin_unlock(&alien->lock);
1128        } else {
1129                spin_lock(&(cachep->node[nodeid])->list_lock);
1130                free_block(cachep, &objp, 1, nodeid);
1131                spin_unlock(&(cachep->node[nodeid])->list_lock);
1132        }
1133        return 1;
1134}
1135#endif
1136
1137/*
 1138 * Allocates and initializes the kmem_cache_node for a node on each slab cache,
 1139 * used for either memory or cpu hotplug.  If memory is being hot-added, the
 1140 * kmem_cache_node will be allocated off-node since memory is not yet online for
 1141 * the new node.  When hotplugging memory or a cpu, existing nodes are not
 1142 * replaced if already in use.
1143 *
1144 * Must hold slab_mutex.
1145 */
1146static int init_cache_node_node(int node)
1147{
1148        struct kmem_cache *cachep;
1149        struct kmem_cache_node *n;
1150        const int memsize = sizeof(struct kmem_cache_node);
1151
1152        list_for_each_entry(cachep, &slab_caches, list) {
1153                /*
 1154                 * Set up the kmem_cache_node for this node before we can
 1155                 * begin anything. Make sure some other cpu on this
 1156                 * node has not already allocated it.
1157                 */
1158                if (!cachep->node[node]) {
1159                        n = kmalloc_node(memsize, GFP_KERNEL, node);
1160                        if (!n)
1161                                return -ENOMEM;
1162                        kmem_cache_node_init(n);
1163                        n->next_reap = jiffies + REAPTIMEOUT_LIST3 +
1164                            ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1165
1166                        /*
 1167                         * The kmem_cache_nodes don't come and go as CPUs come and
1168                         * go.  slab_mutex is sufficient
1169                         * protection here.
1170                         */
1171                        cachep->node[node] = n;
1172                }
1173
1174                spin_lock_irq(&cachep->node[node]->list_lock);
1175                cachep->node[node]->free_limit =
1176                        (1 + nr_cpus_node(node)) *
1177                        cachep->batchcount + cachep->num;
1178                spin_unlock_irq(&cachep->node[node]->list_lock);
1179        }
1180        return 0;
1181}
1182
1183static inline int slabs_tofree(struct kmem_cache *cachep,
1184                                                struct kmem_cache_node *n)
1185{
1186        return (n->free_objects + cachep->num - 1) / cachep->num;
1187}
1188
1189static void cpuup_canceled(long cpu)
1190{
1191        struct kmem_cache *cachep;
1192        struct kmem_cache_node *n = NULL;
1193        int node = cpu_to_mem(cpu);
1194        const struct cpumask *mask = cpumask_of_node(node);
1195
1196        list_for_each_entry(cachep, &slab_caches, list) {
1197                struct array_cache *nc;
1198                struct array_cache *shared;
1199                struct array_cache **alien;
1200
1201                /* cpu is dead; no one can alloc from it. */
1202                nc = cachep->array[cpu];
1203                cachep->array[cpu] = NULL;
1204                n = cachep->node[node];
1205
1206                if (!n)
1207                        goto free_array_cache;
1208
1209                spin_lock_irq(&n->list_lock);
1210
1211                /* Free limit for this kmem_cache_node */
1212                n->free_limit -= cachep->batchcount;
1213                if (nc)
1214                        free_block(cachep, nc->entry, nc->avail, node);
1215
1216                if (!cpumask_empty(mask)) {
1217                        spin_unlock_irq(&n->list_lock);
1218                        goto free_array_cache;
1219                }
1220
1221                shared = n->shared;
1222                if (shared) {
1223                        free_block(cachep, shared->entry,
1224                                   shared->avail, node);
1225                        n->shared = NULL;
1226                }
1227
1228                alien = n->alien;
1229                n->alien = NULL;
1230
1231                spin_unlock_irq(&n->list_lock);
1232
1233                kfree(shared);
1234                if (alien) {
1235                        drain_alien_cache(cachep, alien);
1236                        free_alien_cache(alien);
1237                }
1238free_array_cache:
1239                kfree(nc);
1240        }
1241        /*
1242         * In the previous loop, all the objects were freed to
1243         * the respective cache's slabs,  now we can go ahead and
1244         * shrink each nodelist to its limit.
1245         */
1246        list_for_each_entry(cachep, &slab_caches, list) {
1247                n = cachep->node[node];
1248                if (!n)
1249                        continue;
1250                drain_freelist(cachep, n, slabs_tofree(cachep, n));
1251        }
1252}
1253
1254static int cpuup_prepare(long cpu)
1255{
1256        struct kmem_cache *cachep;
1257        struct kmem_cache_node *n = NULL;
1258        int node = cpu_to_mem(cpu);
1259        int err;
1260
1261        /*
1262         * We need to do this right in the beginning since
 1263         * the alloc_arraycache() calls are going to use this list.
 1264         * kmalloc_node allows us to add the slab to the right
 1265         * kmem_cache_node and not this cpu's kmem_cache_node.
1266         */
1267        err = init_cache_node_node(node);
1268        if (err < 0)
1269                goto bad;
1270
1271        /*
1272         * Now we can go ahead with allocating the shared arrays and
1273         * array caches
1274         */
1275        list_for_each_entry(cachep, &slab_caches, list) {
1276                struct array_cache *nc;
1277                struct array_cache *shared = NULL;
1278                struct array_cache **alien = NULL;
1279
1280                nc = alloc_arraycache(node, cachep->limit,
1281                                        cachep->batchcount, GFP_KERNEL);
1282                if (!nc)
1283                        goto bad;
1284                if (cachep->shared) {
1285                        shared = alloc_arraycache(node,
1286                                cachep->shared * cachep->batchcount,
1287                                0xbaadf00d, GFP_KERNEL);
1288                        if (!shared) {
1289                                kfree(nc);
1290                                goto bad;
1291                        }
1292                }
1293                if (use_alien_caches) {
1294                        alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL);
1295                        if (!alien) {
1296                                kfree(shared);
1297                                kfree(nc);
1298                                goto bad;
1299                        }
1300                }
1301                cachep->array[cpu] = nc;
1302                n = cachep->node[node];
1303                BUG_ON(!n);
1304
1305                spin_lock_irq(&n->list_lock);
1306                if (!n->shared) {
1307                        /*
1308                         * We are serialised from CPU_DEAD or
1309                         * CPU_UP_CANCELLED by the cpucontrol lock
1310                         */
1311                        n->shared = shared;
1312                        shared = NULL;
1313                }
1314#ifdef CONFIG_NUMA
1315                if (!n->alien) {
1316                        n->alien = alien;
1317                        alien = NULL;
1318                }
1319#endif
1320                spin_unlock_irq(&n->list_lock);
1321                kfree(shared);
1322                free_alien_cache(alien);
1323                if (cachep->flags & SLAB_DEBUG_OBJECTS)
1324                        slab_set_debugobj_lock_classes_node(cachep, node);
1325                else if (!OFF_SLAB(cachep) &&
1326                         !(cachep->flags & SLAB_DESTROY_BY_RCU))
1327                        on_slab_lock_classes_node(cachep, node);
1328        }
1329        init_node_lock_keys(node);
1330
1331        return 0;
1332bad:
1333        cpuup_canceled(cpu);
1334        return -ENOMEM;
1335}
1336
1337static int cpuup_callback(struct notifier_block *nfb,
1338                                    unsigned long action, void *hcpu)
1339{
1340        long cpu = (long)hcpu;
1341        int err = 0;
1342
1343        switch (action) {
1344        case CPU_UP_PREPARE:
1345        case CPU_UP_PREPARE_FROZEN:
1346                mutex_lock(&slab_mutex);
1347                err = cpuup_prepare(cpu);
1348                mutex_unlock(&slab_mutex);
1349                break;
1350        case CPU_ONLINE:
1351        case CPU_ONLINE_FROZEN:
1352                start_cpu_timer(cpu);
1353                break;
1354#ifdef CONFIG_HOTPLUG_CPU
1355        case CPU_DOWN_PREPARE:
1356        case CPU_DOWN_PREPARE_FROZEN:
1357                /*
1358                 * Shutdown cache reaper. Note that the slab_mutex is
1359                 * held so that if cache_reap() is invoked it cannot do
1360                 * anything expensive but will only modify reap_work
1361                 * and reschedule the timer.
1362                */
1363                cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu));
1364                /* Now the cache_reaper is guaranteed to be not running. */
1365                per_cpu(slab_reap_work, cpu).work.func = NULL;
1366                break;
1367        case CPU_DOWN_FAILED:
1368        case CPU_DOWN_FAILED_FROZEN:
1369                start_cpu_timer(cpu);
1370                break;
1371        case CPU_DEAD:
1372        case CPU_DEAD_FROZEN:
1373                /*
1374                 * Even if all the cpus of a node are down, we don't free the
1375                 * kmem_cache_node of any cache. This to avoid a race between
1376                 * cpu_down, and a kmalloc allocation from another cpu for
1377                 * memory from the node of the cpu going down.  The node
1378                 * structure is usually allocated from kmem_cache_create() and
1379                 * gets destroyed at kmem_cache_destroy().
1380                 */
1381                /* fall through */
1382#endif
1383        case CPU_UP_CANCELED:
1384        case CPU_UP_CANCELED_FROZEN:
1385                mutex_lock(&slab_mutex);
1386                cpuup_canceled(cpu);
1387                mutex_unlock(&slab_mutex);
1388                break;
1389        }
1390        return notifier_from_errno(err);
1391}
1392
1393static struct notifier_block cpucache_notifier = {
1394        &cpuup_callback, NULL, 0
1395};
1396
1397#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
1398/*
1399 * Drains freelist for a node on each slab cache, used for memory hot-remove.
 1400 * Returns -EBUSY if not all objects can be drained, so that the node is not
1401 * removed.
1402 *
1403 * Must hold slab_mutex.
1404 */
1405static int __meminit drain_cache_node_node(int node)
1406{
1407        struct kmem_cache *cachep;
1408        int ret = 0;
1409
1410        list_for_each_entry(cachep, &slab_caches, list) {
1411                struct kmem_cache_node *n;
1412
1413                n = cachep->node[node];
1414                if (!n)
1415                        continue;
1416
1417                drain_freelist(cachep, n, slabs_tofree(cachep, n));
1418
1419                if (!list_empty(&n->slabs_full) ||
1420                    !list_empty(&n->slabs_partial)) {
1421                        ret = -EBUSY;
1422                        break;
1423                }
1424        }
1425        return ret;
1426}
1427
1428static int __meminit slab_memory_callback(struct notifier_block *self,
1429                                        unsigned long action, void *arg)
1430{
1431        struct memory_notify *mnb = arg;
1432        int ret = 0;
1433        int nid;
1434
1435        nid = mnb->status_change_nid;
1436        if (nid < 0)
1437                goto out;
1438
1439        switch (action) {
1440        case MEM_GOING_ONLINE:
1441                mutex_lock(&slab_mutex);
1442                ret = init_cache_node_node(nid);
1443                mutex_unlock(&slab_mutex);
1444                break;
1445        case MEM_GOING_OFFLINE:
1446                mutex_lock(&slab_mutex);
1447                ret = drain_cache_node_node(nid);
1448                mutex_unlock(&slab_mutex);
1449                break;
1450        case MEM_ONLINE:
1451        case MEM_OFFLINE:
1452        case MEM_CANCEL_ONLINE:
1453        case MEM_CANCEL_OFFLINE:
1454                break;
1455        }
1456out:
1457        return notifier_from_errno(ret);
1458}
1459#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */
1460
1461/*
1462 * swap the static kmem_cache_node with kmalloced memory
1463 */
1464static void __init init_list(struct kmem_cache *cachep, struct kmem_cache_node *list,
1465                                int nodeid)
1466{
1467        struct kmem_cache_node *ptr;
1468
1469        ptr = kmalloc_node(sizeof(struct kmem_cache_node), GFP_NOWAIT, nodeid);
1470        BUG_ON(!ptr);
1471
1472        memcpy(ptr, list, sizeof(struct kmem_cache_node));
1473        /*
1474         * Do not assume that spinlocks can be initialized via memcpy:
1475         */
1476        spin_lock_init(&ptr->list_lock);
1477
1478        MAKE_ALL_LISTS(cachep, ptr, nodeid);
1479        cachep->node[nodeid] = ptr;
1480}
1481
1482/*
 1483 * For setting up all the kmem_cache_nodes for a cache whose buffer_size is the
 1484 * same as the size of kmem_cache_node.
1485 */
1486static void __init set_up_node(struct kmem_cache *cachep, int index)
1487{
1488        int node;
1489
1490        for_each_online_node(node) {
1491                cachep->node[node] = &init_kmem_cache_node[index + node];
1492                cachep->node[node]->next_reap = jiffies +
1493                    REAPTIMEOUT_LIST3 +
1494                    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1495        }
1496}
1497
1498/*
 1499 * The memory after the last cpu cache pointer is used for
 1500 * the node pointers.
1501 */
1502static void setup_node_pointer(struct kmem_cache *cachep)
1503{
1504        cachep->node = (struct kmem_cache_node **)&cachep->array[nr_cpu_ids];
1505}
1506
1507/*
 1508 * Initialisation.  Called after the page allocator has been initialised and
1509 * before smp_init().
1510 */
1511void __init kmem_cache_init(void)
1512{
1513        int i;
1514
1515        kmem_cache = &kmem_cache_boot;
1516        setup_node_pointer(kmem_cache);
1517
1518        if (num_possible_nodes() == 1)
1519                use_alien_caches = 0;
1520
1521        for (i = 0; i < NUM_INIT_LISTS; i++)
1522                kmem_cache_node_init(&init_kmem_cache_node[i]);
1523
1524        set_up_node(kmem_cache, CACHE_CACHE);
1525
1526        /*
1527         * Fragmentation resistance on low memory - only use bigger
1528         * page orders on machines with more than 32MB of memory if
1529         * not overridden on the command line.
1530         */
1531        if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT)
1532                slab_max_order = SLAB_MAX_ORDER_HI;
1533
1534        /* Bootstrap is tricky, because several objects are allocated
1535         * from caches that do not exist yet:
1536         * 1) initialize the kmem_cache cache: it contains the struct
1537         *    kmem_cache structures of all caches, except kmem_cache itself:
1538         *    kmem_cache is statically allocated.
1539         *    Initially an __init data area is used for the head array and the
1540         *    kmem_cache_node structures; they are replaced with kmalloc'ed
1541         *    memory at the end of the bootstrap.
1542         * 2) Create the first kmalloc cache.
1543         *    The struct kmem_cache for the new cache is allocated normally.
1544         *    An __init data area is used for the head array.
1545         * 3) Create the remaining kmalloc caches, with minimally sized
1546         *    head arrays.
1547         * 4) Replace the __init data head arrays for kmem_cache and the first
1548         *    kmalloc cache with kmalloc allocated arrays.
1549         * 5) Replace the __init data for kmem_cache_node for kmem_cache and
1550         *    the other caches with kmalloc'ed memory.
1551         * 6) Resize the head arrays of the kmalloc caches to their final sizes.
1552         */
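
        /*
         * Concrete example of the chicken-and-egg problem above: the
         * kmem_cache_node structures for the first kmalloc caches cannot be
         * kmalloc'ed yet, so they start out in the static
         * init_kmem_cache_node[] array and are only migrated to kmalloc'ed
         * memory in step 5 (see init_list()).
         */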
1553
1554        /* 1) create the kmem_cache */
1555
1556        /*
1557         * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
1558         */
1559        create_boot_cache(kmem_cache, "kmem_cache",
1560                offsetof(struct kmem_cache, array[nr_cpu_ids]) +
1561                                  nr_node_ids * sizeof(struct kmem_cache_node *),
1562                                  SLAB_HWCACHE_ALIGN);
1563        list_add(&kmem_cache->list, &slab_caches);
1564
1565        /* 2+3) create the kmalloc caches */
1566
1567        /*
1568         * Initialize the caches that provide memory for the array cache and the
1569         * kmem_cache_node structures first.  Without them, further allocations
1570         * will BUG().
1571         */
1572
1573        kmalloc_caches[INDEX_AC] = create_kmalloc_cache("kmalloc-ac",
1574                                        kmalloc_size(INDEX_AC), ARCH_KMALLOC_FLAGS);
1575
1576        if (INDEX_AC != INDEX_NODE)
1577                kmalloc_caches[INDEX_NODE] =
1578                        create_kmalloc_cache("kmalloc-node",
1579                                kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS);
1580
1581        slab_early_init = 0;
1582
1583        /* 4) Replace the bootstrap head arrays */
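        /*
         * Only the boot cpu is running here (kmem_cache_init() is called
         * before smp_init()), so replacing smp_processor_id()'s entry is
         * enough; the other cpus get their head arrays later via the cpu
         * notifier registered in kmem_cache_init_late().
         */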
1584        {
1585                struct array_cache *ptr;
1586
1587                ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1588
1589                memcpy(ptr, cpu_cache_get(kmem_cache),
1590                       sizeof(struct arraycache_init));
1591                /*
1592                 * Do not assume that spinlocks can be initialized via memcpy:
1593                 */
1594                spin_lock_init(&ptr->lock);
1595
1596                kmem_cache->array[smp_processor_id()] = ptr;
1597
1598                ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1599
1600                BUG_ON(cpu_cache_get(kmalloc_caches[INDEX_AC])
1601                       != &initarray_generic.cache);
1602                memcpy(ptr, cpu_cache_get(kmalloc_caches[INDEX_AC]),
1603                       sizeof(struct arraycache_init));
1604                /*
1605                 * Do not assume that spinlocks can be initialized via memcpy:
1606                 */
1607                spin_lock_init(&ptr->lock);
1608
1609                kmalloc_caches[INDEX_AC]->array[smp_processor_id()] = ptr;
1610        }
1611        /* 5) Replace the bootstrap kmem_cache_node */
1612        {
1613                int nid;
1614
1615                for_each_online_node(nid) {
1616                        init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid);
1617
1618                        init_list(kmalloc_caches[INDEX_AC],
1619                                  &init_kmem_cache_node[SIZE_AC + nid], nid);
1620
1621                        if (INDEX_AC != INDEX_NODE) {
1622                                init_list(kmalloc_caches[INDEX_NODE],
1623                                          &init_kmem_cache_node[SIZE_NODE + nid], nid);
1624                        }
1625                }
1626        }
1627
1628        create_kmalloc_caches(ARCH_KMALLOC_FLAGS);
1629}
1630
1631void __init kmem_cache_init_late(void)
1632{
1633        struct kmem_cache *cachep;
1634
1635        slab_state = UP;
1636
1637        /* 6) resize the head arrays to their final sizes */
1638        mutex_lock(&slab_mutex);
1639        list_for_each_entry(cachep, &slab_caches, list)
1640                if (enable_cpucache(cachep, GFP_NOWAIT))
1641                        BUG();
1642        mutex_unlock(&slab_mutex);
1643
1644        /* Annotate slab for lockdep -- annotate the malloc caches */
1645        init_lock_keys();
1646
1647        /* Done! */
1648        slab_state = FULL;
1649
1650        /*
1651         * Register a cpu startup notifier callback that initializes
1652         * the per-cpu head arrays (cpu_cache_get()) for all new cpus.
1653         */
1654        register_cpu_notifier(&cpucache_notifier);
1655
1656#ifdef CONFIG_NUMA
1657        /*
1658         * Register a memory hotplug callback that initializes and frees
1659         * the per-node kmem_cache_node structures.
1660         */
1661        hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
1662#endif
1663
1664        /*
1665         * The reap timers are started later, with a module init call: That part
1666         * of the kernel is not yet operational.
1667         */
1668}
1669
1670static int __init cpucache_init(void)
1671{
1672        int cpu;
1673
1674        /*
1675         * Register the timers that return unneeded pages to the page allocator
1676         */
1677        for_each_online_cpu(cpu)
1678                start_cpu_timer(cpu);
1679
1680        /* Done! */
1681        slab_state = FULL;
1682        return 0;
1683}
1684__initcall(cpucache_init);
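
/*
 * start_cpu_timer() arms the periodic per-cpu reap work (see cache_reap()
 * elsewhere in this file), which trims idle head arrays and returns unused
 * slabs to the page allocator.
 */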
1685
1686static noinline void
1687slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
1688{
1689        struct kmem_cache_node *n;
1690        struct slab *slabp;
1691        unsigned long flags;
1692        int node;
1693
1694        printk(KERN_WARNING
1695                "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n",
1696                nodeid, gfpflags);
1697        printk(KERN_WARNING "  cache: %s, object size: %d, order: %d\n",
1698                cachep->name, cachep->size, cachep->gfporder);
1699
1700        for_each_online_node(node) {
1701                unsigned long active_objs = 0, num_objs = 0, free_objects = 0;
1702                unsigned long active_slabs = 0, num_slabs = 0;
1703
1704                n = cachep->node[node];
1705                if (!n)
1706                        continue;
1707
1708                spin_lock_irqsave(&n->list_lock, flags);
1709                list_for_each_entry(slabp, &n->slabs_full, list) {
1710                        active_objs += cachep->num;
1711                        active_slabs++;
1712                }
1713                list_for_each_entry(slabp, &n->slabs_partial, list) {
1714                        active_objs += slabp->inuse;
1715                        active_slabs++;
1716                }
1717                list_for_each_entry(slabp, &n->slabs_free, list)
1718                        num_slabs++;
1719
1720                free_objects += n->free_objects;
1721                spin_unlock_irqrestore(&n->list_lock, flags);
1722
1723                num_slabs += active_slabs;
1724                num_objs = num_slabs * cachep->num;
1725                printk(KERN_WARNING
1726                        "  node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
1727                        node, active_slabs, num_slabs, active_objs, num_objs,
1728                        free_objects);
1729        }
1730}
1731
1732/*
1733 * Interface to system's page allocator. No need to hold the cache-lock.
1734 *
1735 * If we requested dmaable memory, we will get it. Even if we
1736 * did not request dmaable memory, we might get it, but that
1737 * would be relatively rare and ignorable.
1738 */
1739static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1740{
1741        struct page *page;
1742        int nr_pages;
1743        int i;
1744
1745#ifndef CONFIG_MMU
1746        /*
1747         * Nommu uses slabs for process anonymous memory allocations, and thus
1748         * requires __GFP_COMP to properly refcount higher order allocations
1749         */
1750        flags |= __GFP_COMP;
1751#endif
1752
1753        flags |= cachep->allocflags;
1754        if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1755                flags |= __GFP_RECLAIMABLE;
1756
1757        page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
1758        if (!page) {
1759                if (!(flags & __GFP_NOWARN) && printk_ratelimit())
1760                        slab_out_of_memory(cachep, flags, nodeid);
1761                return NULL;
1762        }
1763
1764        /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
1765        if (unlikely(page->pfmemalloc))
1766                pfmemalloc_active = true;
1767
1768        nr_pages = (1 << cachep->gfporder);
1769        if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1770                add_zone_page_state(page_zone(page),
1771                        NR_SLAB_RECLAIMABLE, nr_pages);
1772        else
1773                add_zone_page_state(page_zone(page),
1774                        NR_SLAB_UNRECLAIMABLE, nr_pages);
1775        for (i = 0; i < nr_pages; i++) {
1776                __SetPageSlab(page + i);
1777
1778                if (page->pfmemalloc)
1779                        SetPageSlabPfmemalloc(page + i);
1780        }
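        /*
         * Objects carved from a pfmemalloc slab page are meant to be reserved
         * for callers that are themselves aiding memory reclaim; the per-page
         * SlabPfmemalloc bit set above is what the allocation paths
         * (e.g. ac_get_obj()/ac_put_obj()) consult for that.
         */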
1781        memcg_bind_pages(cachep, cachep->gfporder);
1782
1783        if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
1784                kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
1785
1786                if (cachep->ctor)
1787                        kmemcheck_mark_uninitialized_pages(page, nr_pages);
1788                else
1789                        kmemcheck_mark_unallocated_pages(page, nr_pages);
1790        }
1791
1792        return page_address(page);
1793}
1794
1795/*
1796 * Interface to system's page release.
1797 */
1798static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1799{
1800        unsigned long i = (1 << cachep->gfporder);
1801        struct page *page = virt_to_page(addr);
1802        const unsigned long nr_freed = i;
1803
1804        kmemcheck_free_shadow(page, cachep->gfporder);
1805
1806        if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1807                sub_zone_page_state(page_zone(page),
1808                                NR_SLAB_RECLAIMABLE, nr_freed);
1809        else
1810                sub_zone_page_state(page_zone(page),
1811                                NR_SLAB_UNRECLAIMABLE, nr_freed);
1812        while (i--) {
1813                BUG_ON(!PageSlab(page));
1814                __ClearPageSlabPfmemalloc(page);
1815                __ClearPageSlab(page);
1816                page++;
1817        }
1818
1819        memcg_release_pages(cachep, cachep->gfporder);
1820        if (current->reclaim_state)
1821                current->reclaim_state->reclaimed_slab += nr_freed;
1822        free_memcg_kmem_pages((unsigned long)addr, cachep->gfporder);
1823}
1824
1825static void kmem_rcu_free(struct rcu_head *head)
1826{
1827        struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
1828        struct kmem_cache *cachep = slab_rcu->cachep;
1829
1830        kmem_freepages(cachep, slab_rcu->addr);
1831        if (OFF_SLAB(cachep))
1832                kmem_cache_free(cachep->slabp_cache, slab_rcu);
1833}
1834
1835#if DEBUG
1836
1837#ifdef CONFIG_DEBUG_PAGEALLOC
1838static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
1839                            unsigned long caller)
1840{
1841        int size = cachep->object_size;
1842
1843        addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)];
1844
1845        if (size < 5 * sizeof(unsigned long))
1846                return;
1847
1848        *addr++ = 0x12345678;
1849        *addr++ = caller;
1850        *addr++ = smp_processor_id();
1851        size -= 3 * sizeof(unsigned long);
1852        {
1853                unsigned long *sptr = &caller;
1854                unsigned long svalue;
1855
1856                while (!kstack_end(sptr)) {
1857                        svalue = *sptr++;
1858                        if (kernel_text_address(svalue)) {
1859                                *addr++ = svalue;
1860                                size -= sizeof(unsigned long);
1861                                if (size <= sizeof(unsigned long))
1862                                        break;
1863                        }
1864                }
1865
1866        }
1867        *addr++ = 0x87654321;
1868}
1869#endif
1870
1871static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
1872{
1873        int size = cachep->object_size;
1874        addr = &((char *)addr)[obj_offset(cachep)];
1875
1876        memset(addr, val, size);
1877        *(unsigned char *)(addr + size - 1) = POISON_END;
1878}
1879
1880static void dump_line(char *data, int offset, int limit)
1881{
1882        int i;
1883        unsigned char error = 0;
1884        int bad_count = 0;
1885
1886        printk(KERN_ERR "%03x: ", offset);
1887        for (i = 0; i < limit; i++) {
1888                if (data[offset + i] != POISON_FREE) {
1889                        error = data[offset + i];
1890                        bad_count++;
1891                }
1892        }
1893        print_hex_dump(KERN_CONT, "", 0, 16, 1,
1894                        &data[offset], limit, 1);
1895
1896        if (bad_count == 1) {
1897                error ^= POISON_FREE;
1898                if (!(error & (error - 1))) {
1899                        printk(KERN_ERR "Single bit error detected. Probably "
1900                                        "bad RAM.\n");
1901#ifdef CONFIG_X86
1902                        printk(KERN_ERR "Run memtest86+ or a similar memory "
1903                                        "test tool.\n");
1904#else
1905                        printk(KERN_ERR "Run a memory test tool.\n");
1906#endif
1907                }
1908        }
1909}
1910#endif
1911
1912#if DEBUG
1913
1914static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
1915{
1916        int i, size;
1917        char *realobj;
1918
1919        if (cachep->flags & SLAB_RED_ZONE) {
1920                printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n",
1921                        *dbg_redzone1(cachep, objp),
1922                        *dbg_redzone2(cachep, objp));
1923        }
1924
1925        if (cachep->flags & SLAB_STORE_USER) {
1926                printk(KERN_ERR "Last user: [<%p>](%pSR)\n",
1927                       *dbg_userword(cachep, objp),
1928                       *dbg_userword(cachep, objp));
1929        }
1930        realobj = (char *)objp + obj_offset(cachep);
1931        size = cachep->object_size;
1932        for (i = 0; i < size && lines; i += 16, lines--) {
1933                int limit;
1934                limit = 16;
1935                if (i + limit > size)
1936                        limit = size - i;
1937                dump_line(realobj, i, limit);
1938        }
1939}
1940
1941static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1942{
1943        char *realobj;
1944        int size, i;
1945        int lines = 0;
1946
1947        realobj = (char *)objp + obj_offset(cachep);
1948        size = cachep->object_size;
1949
1950        for (i = 0; i < size; i++) {
1951                char exp = POISON_FREE;
1952                if (i == size - 1)
1953                        exp = POISON_END;
1954                if (realobj[i] != exp) {
1955                        int limit;
1956                        /* Mismatch ! */
1957                        /* Print header */
1958                        if (lines == 0) {
1959                                printk(KERN_ERR
1960                                        "Slab corruption (%s): %s start=%p, len=%d\n",
1961                                        print_tainted(), cachep->name, realobj, size);
1962                                print_objinfo(cachep, objp, 0);
1963                        }
1964                        /* Hexdump the affected line */
1965                        i = (i / 16) * 16;
1966                        limit = 16;
1967                        if (i + limit > size)
1968                                limit = size - i;
1969                        dump_line(realobj, i, limit);
1970                        i += 16;
1971                        lines++;
1972                        /* Limit to 5 lines */
1973                        if (lines > 5)
1974                                break;
1975                }
1976        }
1977        if (lines != 0) {
1978                /* Print some data about the neighboring objects, if they
1979                 * exist:
1980                 */
1981                struct slab *slabp = virt_to_slab(objp);
1982                unsigned int objnr;
1983
1984                objnr = obj_to_index(cachep, slabp, objp);
1985                if (objnr) {
1986                        objp = index_to_obj(cachep, slabp, objnr - 1);
1987                        realobj = (char *)objp + obj_offset(cachep);
1988                        printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
1989                               realobj, size);
1990                        print_objinfo(cachep, objp, 2);
1991                }
1992                if (objnr + 1 < cachep->num) {
1993                        objp = index_to_obj(cachep, slabp, objnr + 1);
1994                        realobj = (char *)objp + obj_offset(cachep);
1995                        printk(KERN_ERR "Next obj: start=%p, len=%d\n",
1996                               realobj, size);
1997                        print_objinfo(cachep, objp, 2);
1998                }
1999        }
2000}
2001#endif
2002
2003#if DEBUG
2004static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp)
2005{
2006        int i;
2007        for (i = 0; i < cachep->num; i++) {
2008                void *objp = index_to_obj(cachep, slabp, i);
2009
2010                if (cachep->flags & SLAB_POISON) {
2011#ifdef CONFIG_DEBUG_PAGEALLOC
2012                        if (cachep->size % PAGE_SIZE == 0 &&
2013                                        OFF_SLAB(cachep))
2014                                kernel_map_pages(virt_to_page(objp),
2015                                        cachep->size / PAGE_SIZE, 1);
2016                        else
2017                                check_poison_obj(cachep, objp);
2018#else
2019                        check_poison_obj(cachep, objp);
2020#endif
2021                }
2022                if (cachep->flags & SLAB_RED_ZONE) {
2023                        if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
2024                                slab_error(cachep, "start of a freed object "
2025                                           "was overwritten");
2026                        if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
2027                                slab_error(cachep, "end of a freed object "
2028                                           "was overwritten");
2029                }
2030        }
2031}
2032#else
2033static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp)
2034{
2035}
2036#endif
2037
2038/**
2039 * slab_destroy - destroy and release all objects in a slab
2040 * @cachep: cache pointer being destroyed
2041 * @slabp: slab pointer being destroyed
2042 *
2043 * Destroy all the objs in a slab, and release the mem back to the system.
2044 * Before calling, the slab must have been unlinked from the cache.  The
2045 * cache-lock is not held/needed.
2046 */
2047static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
2048{
2049        void *addr = slabp->s_mem - slabp->colouroff;
2050
2051        slab_destroy_debugcheck(cachep, slabp);
2052        if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
2053                struct slab_rcu *slab_rcu;
2054
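                /*
                 * Reuse the slab descriptor itself as the rcu bookkeeping:
                 * struct slab_rcu is overlaid on the now-unused struct slab,
                 * so no extra allocation is needed to defer the free until
                 * after a grace period (see kmem_rcu_free() above).
                 */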
2055                slab_rcu = (struct slab_rcu *)slabp;
2056                slab_rcu->cachep = cachep;
2057                slab_rcu->addr = addr;
2058                call_rcu(&slab_rcu->head, kmem_rcu_free);
2059        } else {
2060                kmem_freepages(cachep, addr);
2061                if (OFF_SLAB(cachep))
2062                        kmem_cache_free(cachep->slabp_cache, slabp);
2063        }
2064}
2065
2066/**
2067 * calculate_slab_order - calculate size (page order) of slabs
2068 * @cachep: pointer to the cache that is being created
2069 * @size: size of objects to be created in this cache.
2070 * @align: required alignment for the objects.
2071 * @flags: slab allocation flags
2072 *
2073 * Also calculates the number of objects per slab.
2074 *
2075 * This could be made much more intelligent.  For now, try to avoid using
2076 * high order pages for slabs.  When the gfp() functions are more friendly
2077 * towards high-order requests, this should be changed.
2078 */
2079static size_t calculate_slab_order(struct kmem_cache *cachep,
2080                        size_t size, size_t align, unsigned long flags)
2081{
2082        unsigned long offslab_limit;
2083        size_t left_over = 0;
2084        int gfporder;
2085
2086        for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
2087                unsigned int num;
2088                size_t remainder;
2089
2090                cache_estimate(gfporder, size, align, flags, &remainder, &num);
2091                if (!num)
2092                        continue;
2093
2094                if (flags & CFLGS_OFF_SLAB) {
2095                        /*
2096                         * Max number of objs-per-slab for caches which
2097                         * use off-slab slabs. Needed to avoid a possible
2098                         * looping condition in cache_grow().
2099                         */
2100                        offslab_limit = size - sizeof(struct slab);
2101                        offslab_limit /= sizeof(kmem_bufctl_t);
2102
2103                        if (num > offslab_limit)
2104                                break;
2105                }
2106
2107                /* Found something acceptable - save it away */
2108                cachep->num = num;
2109                cachep->gfporder = gfporder;
2110                left_over = remainder;
2111
2112                /*
2113                 * A VFS-reclaimable slab tends to have most allocations
2114                 * as GFP_NOFS and we really don't want to have to be allocating
2115                 * higher-order pages when we are unable to shrink dcache.
2116                 */
2117                if (flags & SLAB_RECLAIM_ACCOUNT)
2118                        break;
2119
2120                /*
2121                 * A large number of objects is good, but very large slabs are
2122                 * currently bad for the gfp()s.
2123                 */
2124                if (gfporder >= slab_max_order)
2125                        break;
2126
2127                /*
2128                 * Acceptable internal fragmentation? (at most 1/8 of the slab wasted)
2129                 */
2130                if (left_over * 8 <= (PAGE_SIZE << gfporder))
2131                        break;
2132        }
2133        return left_over;
2134}
2135
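/*
 * Rough slab_state progression during bootstrap, as used below and in
 * kmem_cache_init_late()/cpucache_init() above:
 *   DOWN -> PARTIAL -> PARTIAL_ARRAYCACHE or PARTIAL_NODE -> UP -> FULL
 */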
2136static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2137{
2138        if (slab_state >= FULL)
2139                return enable_cpucache(cachep, gfp);
2140
2141        if (slab_state == DOWN) {
2142                /*
2143                 * Note: Creation of first cache (kmem_cache).
2144                 * The setup_node is taken care
2145                 * of by the caller of __kmem_cache_create
2146                 */
2147                cachep->array[smp_processor_id()] = &initarray_generic.cache;
2148                slab_state = PARTIAL;
2149        } else if (slab_state == PARTIAL) {
2150                /*
2151                 * Note: the second kmem_cache_create must create the cache
2152                 * that's used by kmalloc(24), otherwise the creation of
2153                 * further caches will BUG().
2154                 */
2155                cachep->array[smp_processor_id()] = &initarray_generic.cache;
2156
2157                /*
2158                 * If the cache that's used by kmalloc(sizeof(kmem_cache_node)) is
2159                 * the second cache, then we need to set up all its kmem_cache_node structures,
2160                 * otherwise the creation of further caches will BUG().
2161                 */
2162                set_up_node(cachep, SIZE_AC);
2163                if (INDEX_AC == INDEX_NODE)
2164                        slab_state = PARTIAL_NODE;
2165                else
2166                        slab_state = PARTIAL_ARRAYCACHE;
2167        } else {
2168                /* Remaining boot caches */
2169                cachep->array[smp_processor_id()] =
2170                        kmalloc(sizeof(struct arraycache_init), gfp);
2171
2172                if (slab_state == PARTIAL_ARRAYCACHE) {
2173                        set_up_node(cachep, SIZE_NODE);
2174                        slab_state = PARTIAL_NODE;
2175                } else {
2176                        int node;
2177                        for_each_online_node(node) {
2178                                cachep->node[node] =
2179                                    kmalloc_node(sizeof(struct kmem_cache_node),
2180                                                gfp, node);
2181                                BUG_ON(!cachep->node[node]);
2182                                kmem_cache_node_init(cachep->node[node]);
2183                        }
2184                }
2185        }
2186        cachep->node[numa_mem_id()]->next_reap =
2187                        jiffies + REAPTIMEOUT_LIST3 +
2188                        ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
2189
2190        cpu_cache_get(cachep)->avail = 0;
2191        cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
2192        cpu_cache_get(cachep)->batchcount = 1;
2193        cpu_cache_get(cachep)->touched = 0;
2194        cachep->batchcount = 1;
2195        cachep->limit = BOOT_CPUCACHE_ENTRIES;
2196        return 0;
2197}
2198
2199/**
2200 * __kmem_cache_create - Create a cache.
2201 * @cachep: cache management descriptor
2202 * @flags: SLAB flags
2203 *
2204 * Returns zero on success, nonzero on failure.
2205 * Cannot be called within an interrupt, but can be interrupted.
2206 * The @ctor is run when new pages are allocated by the cache.
2207 *
2208 * The flags are
2209 *
2210 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
2211 * to catch references to uninitialised memory.
2212 *
2213 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
2214 * for buffer overruns.
2215 *
2216 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
2217 * cacheline.  This can be beneficial if you're counting cycles as closely
2218 * as davem.
2219 */
2220int
2221__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2222{
2223        size_t left_over, slab_size, ralign;
2224        gfp_t gfp;
2225        int err;
2226        size_t size = cachep->size;
2227
2228#if DEBUG
2229#if FORCED_DEBUG
2230        /*
2231         * Enable redzoning and last user accounting, except for caches with
2232         * large objects, if the increased size would increase the object size
2233         * above the next power of two: caches with object sizes just above a
2234         * power of two have a significant amount of internal fragmentation.
2235         */
2236        if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN +
2237                                                2 * sizeof(unsigned long long)))
2238                flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
2239        if (!(flags & SLAB_DESTROY_BY_RCU))
2240                flags |= SLAB_POISON;
2241#endif
2242        if (flags & SLAB_DESTROY_BY_RCU)
2243                BUG_ON(flags & SLAB_POISON);
2244#endif
2245
2246        /*
2247         * Check that size is in terms of words.  This is needed to avoid
2248         * unaligned accesses for some archs when redzoning is used, and makes
2249         * sure any on-slab bufctls are also correctly aligned.
2250         */
2251        if (size & (BYTES_PER_WORD - 1)) {
2252                size += (BYTES_PER_WORD - 1);
2253                size &= ~(BYTES_PER_WORD - 1);
2254        }
2255
2256        /*
2257         * Redzoning and user store require word alignment or possibly larger.
2258         * Note this will be overridden by architecture or caller mandated
2259         * alignment if either is greater than BYTES_PER_WORD.
2260         */
2261        if (flags & SLAB_STORE_USER)
2262                ralign = BYTES_PER_WORD;
2263
2264        if (flags & SLAB_RED_ZONE) {
2265                ralign = REDZONE_ALIGN;
2266                /* If redzoning, ensure that the second redzone is suitably
2267                 * aligned, by adjusting the object size accordingly. */
2268                size += REDZONE_ALIGN - 1;
2269                size &= ~(REDZONE_ALIGN - 1);
2270        }
2271
2272        /* 3) caller mandated alignment */
2273        if (ralign < cachep->align) {
2274                ralign = cachep->align;
2275        }
2276        /* disable debug if necessary */
2277        if (ralign > __alignof__(unsigned long long))
2278                flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2279        /*
2280         * 4) Store it.
2281         */
2282        cachep->align = ralign;
2283
2284        if (slab_is_available())
2285                gfp = GFP_KERNEL;
2286        else
2287                gfp = GFP_NOWAIT;
2288
2289        setup_node_pointer(cachep);
2290#if DEBUG
2291
2292        /*
2293         * Both debugging options require word-alignment which is calculated
2294         * into align above.
2295         */
2296        if (flags & SLAB_RED_ZONE) {
2297                /* add space for red zone words */
2298                cachep->obj_offset += sizeof(unsigned long long);
2299                size += 2 * sizeof(unsigned long long);
2300        }
2301        if (flags & SLAB_STORE_USER) {
2302                /* user store requires one word storage behind the end of
2303                 * the real object. But if the second red zone needs to be
2304                 * aligned to 64 bits, we must allow that much space.
2305                 */
2306                if (flags & SLAB_RED_ZONE)
2307                        size += REDZONE_ALIGN;
2308                else
2309                        size += BYTES_PER_WORD;
2310        }
2311#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
2312        if (size >= kmalloc_size(INDEX_NODE + 1)
2313            && cachep->object_size > cache_line_size()
2314            && ALIGN(size, cachep->align) < PAGE_SIZE) {
2315                cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align);
2316                size = PAGE_SIZE;
2317        }
2318#endif
2319#endif
2320
2321        /*
2322         * Determine if the slab management is 'on' or 'off' slab.
2323         * (bootstrapping cannot cope with offslab caches so don't do
2324         * it too early on. Always use on-slab management when
2325         * SLAB_NOLEAKTRACE is set, to avoid recursive calls into kmemleak.)
2326         */
2327        if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
2328            !(flags & SLAB_NOLEAKTRACE))
2329                /*
2330                 * Size is large, assume best to place the slab management obj
2331                 * off-slab (should allow better packing of objs).
2332                 */
2333                flags |= CFLGS_OFF_SLAB;
2334
2335        size = ALIGN(size, cachep->align);
2336
2337        left_over = calculate_slab_order(cachep, size, cachep->align, flags);
2338
2339        if (!cachep->num)
2340                return -E2BIG;
2341
2342        slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
2343                          + sizeof(struct slab), cachep->align);
2344
2345        /*
2346         * If the slab has been placed off-slab, and we have enough space then
2347         * move it on-slab. This is at the expense of any extra colouring.
2348         */
2349        if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
2350                flags &= ~CFLGS_OFF_SLAB;
2351                left_over -= slab_size;
2352        }
2353
2354        if (flags & CFLGS_OFF_SLAB) {
2355                /* really off slab. No need for manual alignment */
2356                slab_size =
2357                    cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
2358
2359#ifdef CONFIG_PAGE_POISONING
2360                /* If we're going to use the generic kernel_map_pages()
2361                 * poisoning, then it's going to smash the contents of
2362                 * the redzone and userword anyhow, so switch them off.
2363                 */
2364                if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
2365                        flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2366#endif
2367        }
2368
2369        cachep->colour_off = cache_line_size();
2370        /* Offset must be a multiple of the alignment. */
2371        if (cachep->colour_off < cachep->align)
2372                cachep->colour_off = cachep->align;
2373        cachep->colour = left_over / cachep->colour_off;
2374        cachep->slab_size = slab_size;
2375        cachep->flags = flags;
2376        cachep->allocflags = 0;
2377        if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
2378                cachep->allocflags |= GFP_DMA;
2379        cachep->size = size;
2380        cachep->reciprocal_buffer_size = reciprocal_value(size);
2381
2382        if (flags & CFLGS_OFF_SLAB) {
2383                cachep->slabp_cache = kmalloc_slab(slab_size, 0u);
2384                /*
2385                 * This is a possibility for one of the malloc_sizes caches.
2386                 * But since we go off slab only for object size greater than
2387                 * PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
2388                 * this should not happen at all.
2389                 * But leave a BUG_ON for some lucky dude.
2390                 */
2391                BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
2392        }
2393
2394        err = setup_cpu_cache(cachep, gfp);
2395        if (err) {
2396                __kmem_cache_shutdown(cachep);
2397                return err;
2398        }
2399
2400        if (flags & SLAB_DEBUG_OBJECTS) {
2401                /*
2402                 * Would deadlock through slab_destroy()->call_rcu()->
2403                 * debug_object_activate()->kmem_cache_alloc().
2404                 */
2405                WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU);
2406
2407                slab_set_debugobj_lock_classes(cachep);
2408        } else if (!OFF_SLAB(cachep) && !(flags & SLAB_DESTROY_BY_RCU))
2409                on_slab_lock_classes(cachep);
2410
2411        return 0;
2412}
2413
2414#if DEBUG
2415static void check_irq_off(void)
2416{
2417        BUG_ON(!irqs_disabled());
2418}
2419
2420static void check_irq_on(void)
2421{
2422        BUG_ON(irqs_disabled());
2423}
2424
2425static void check_spinlock_acquired(struct kmem_cache *cachep)
2426{
2427#ifdef CONFIG_SMP
2428        check_irq_off();
2429        assert_spin_locked(&cachep->node[numa_mem_id()]->list_lock);
2430#endif
2431}
2432
2433static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
2434{
2435#ifdef CONFIG_SMP
2436        check_irq_off();
2437        assert_spin_locked(&cachep->node[node]->list_lock);
2438#endif
2439}
2440
2441#else
2442#define check_irq_off() do { } while(0)
2443#define check_irq_on()  do { } while(0)
2444#define check_spinlock_acquired(x) do { } while(0)
2445#define check_spinlock_acquired_node(x, y) do { } while(0)
2446#endif
2447
2448static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
2449                        struct array_cache *ac,
2450                        int force, int node);
2451
2452static void do_drain(void *arg)
2453{
2454        struct kmem_cache *cachep = arg;
2455        struct array_cache *ac;
2456        int node = numa_mem_id();
2457
2458        check_irq_off();
2459        ac = cpu_cache_get(cachep);
2460        spin_lock(&cachep->node[node]->list_lock);
2461        free_block(cachep, ac->entry, ac->avail, node);
2462        spin_unlock(&cachep->node[node]->list_lock);
2463        ac->avail = 0;
2464}
2465
2466static void drain_cpu_caches(struct kmem_cache *cachep)
2467{
2468        struct kmem_cache_node *n;
2469        int node;
2470
2471        on_each_cpu(do_drain, cachep, 1);
2472        check_irq_on();
2473        for_each_online_node(node) {
2474                n = cachep->node[node];
2475                if (n && n->alien)
2476                        drain_alien_cache(cachep, n->alien);
2477        }
2478
2479        for_each_online_node(node) {
2480                n = cachep->node[node];
2481                if (n)
2482                        drain_array(cachep, n, n->shared, 1, node);
2483        }
2484}
2485
2486/*
2487 * Remove slabs from the list of free slabs.
2488 * Specify the number of slabs to drain in tofree.
2489 *
2490 * Returns the actual number of slabs released.
2491 */
2492static int drain_freelist(struct kmem_cache *cache,
2493                        struct kmem_cache_node *n, int tofree)
2494{
2495        struct list_head *p;
2496        int nr_freed;
2497        struct slab *slabp;
2498
2499        nr_freed = 0;
2500        while (nr_freed < tofree && !list_empty(&n->slabs_free)) {
2501
2502                spin_lock_irq(&n->list_lock);
2503                p = n->slabs_free.prev;
2504                if (p == &n->slabs_free) {
2505                        spin_unlock_irq(&n->list_lock);
2506                        goto out;
2507                }
2508
2509                slabp = list_entry(p, struct slab, list);
2510#if DEBUG
2511                BUG_ON(slabp->inuse);
2512#endif
2513                list_del(&slabp->list);
2514                /*
2515                 * Safe to drop the lock. The slab is no longer linked
2516                 * to the cache.
2517                 */
2518                n->free_objects -= cache->num;
2519                spin_unlock_irq(&n->list_lock);
2520                slab_destroy(cache, slabp);
2521                nr_freed++;
2522        }
2523out:
2524        return nr_freed;
2525}
2526
2527/* Called with slab_mutex held to protect against cpu hotplug */
2528static int __cache_shrink(struct kmem_cache *cachep)
2529{
2530        int ret = 0, i = 0;
2531        struct kmem_cache_node *n;
2532
2533        drain_cpu_caches(cachep);
2534
2535        check_irq_on();
2536        for_each_online_node(i) {
2537                n = cachep->node[i];
2538                if (!n)
2539                        continue;
2540
2541                drain_freelist(cachep, n, slabs_tofree(cachep, n));
2542
2543                ret += !list_empty(&n->slabs_full) ||
2544                        !list_empty(&n->slabs_partial);
2545        }
2546        return (ret ? 1 : 0);
2547}
2548
2549/**
2550 * kmem_cache_shrink - Shrink a cache.
2551 * @cachep: The cache to shrink.
2552 *
2553 * Releases as many slabs as possible for a cache.
2554 * To help debugging, a zero exit status indicates all slabs were released.
2555 */
2556int kmem_cache_shrink(struct kmem_cache *cachep)
2557{
2558        int ret;
2559        BUG_ON(!cachep || in_interrupt());
2560
2561        get_online_cpus();
2562        mutex_lock(&slab_mutex);
2563        ret = __cache_shrink(cachep);
2564        mutex_unlock(&slab_mutex);
2565        put_online_cpus();
2566        return ret;
2567}
2568EXPORT_SYMBOL(kmem_cache_shrink);
2569
2570int __kmem_cache_shutdown(struct kmem_cache *cachep)
2571{
2572        int i;
2573        struct kmem_cache_node *n;
2574        int rc = __cache_shrink(cachep);
2575
2576        if (rc)
2577                return rc;
2578
2579        for_each_online_cpu(i)
2580            kfree(cachep->array[i]);
2581
2582        /* NUMA: free the node structures */
2583        for_each_online_node(i) {
2584                n = cachep->node[i];
2585                if (n) {
2586                        kfree(n->shared);
2587                        free_alien_cache(n->alien);
2588                        kfree(n);
2589                }
2590        }
2591        return 0;
2592}
2593
2594/*
2595 * Get the memory for a slab management obj.
2596 * For a slab cache when the slab descriptor is off-slab, slab descriptors
2597 * always come from malloc_sizes caches.  The slab descriptor cannot
2598 * come from the same cache which is getting created because,
2599 * when we are searching for an appropriate cache for these
2600 * descriptors in kmem_cache_create, we search through the malloc_sizes array.
2601 * If we are creating a malloc_sizes cache here it would not be visible to
2602 * kmem_find_general_cachep till the initialization is complete.
2603 * Hence we cannot have slabp_cache same as the original cache.
2604 */
2605static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
2606                                   int colour_off, gfp_t local_flags,
2607                                   int nodeid)
2608{
2609        struct slab *slabp;
2610
2611        if (OFF_SLAB(cachep)) {
2612                /* Slab management obj is off-slab. */
2613                slabp = kmem_cache_alloc_node(cachep->slabp_cache,
2614                                              local_flags, nodeid);
2615                if (!slabp)
2616                        return NULL;
2617                /*
2618                 * If the first object in the slab is leaked (it's allocated
2619                 * but no one has a reference to it), we want to make sure
2620                 * kmemleak does not treat the ->s_mem pointer as a reference
2621                 * to the object. Otherwise we will not report the leak.
2622                 */
2623                kmemleak_scan_area(&slabp->list, sizeof(struct list_head),
2624                                   local_flags);
2625        } else {
2626                slabp = objp + colour_off;
2627                colour_off += cachep->slab_size;
2628        }
2629        slabp->inuse = 0;
2630        slabp->colouroff = colour_off;
2631        slabp->s_mem = objp + colour_off;
2632        slabp->nodeid = nodeid;
2633        slabp->free = 0;
2634        return slabp;
2635}
2636
2637static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
2638{
2639        return (kmem_bufctl_t *) (slabp + 1);
2640}
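
/*
 * The bufctl array holds an index-linked free list: cache_init_objs() sets
 * slab_bufctl(slabp)[i] = i + 1 with the last entry BUFCTL_END, and
 * slabp->free is the index of the first free object.  For example, with
 * cachep->num == 4 a fresh slab has bufctl = {1, 2, 3, BUFCTL_END} and
 * free == 0; slab_get_obj() pops from the head of that list and
 * slab_put_obj() pushes back onto it.
 */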
2641
2642static void cache_init_objs(struct kmem_cache *cachep,
2643                            struct slab *slabp)
2644{
2645        int i;
2646
2647        for (i = 0; i < cachep->num; i++) {
2648                void *objp = index_to_obj(cachep, slabp, i);
2649#if DEBUG
2650                /* need to poison the objs? */
2651                if (cachep->flags & SLAB_POISON)
2652                        poison_obj(cachep, objp, POISON_FREE);
2653                if (cachep->flags & SLAB_STORE_USER)
2654                        *dbg_userword(cachep, objp) = NULL;
2655
2656                if (cachep->flags & SLAB_RED_ZONE) {
2657                        *dbg_redzone1(cachep, objp) = RED_INACTIVE;
2658                        *dbg_redzone2(cachep, objp) = RED_INACTIVE;
2659                }
2660                /*
2661                 * Constructors are not allowed to allocate memory from the same
2662                 * cache which they are a constructor for.  Otherwise, deadlock.
2663                 * They must also be threaded.
2664                 */
2665                if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2666                        cachep->ctor(objp + obj_offset(cachep));
2667
2668                if (cachep->flags & SLAB_RED_ZONE) {
2669                        if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
2670                                slab_error(cachep, "constructor overwrote the"
2671                                           " end of an object");
2672                        if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
2673                                slab_error(cachep, "constructor overwrote the"
2674                                           " start of an object");
2675                }
2676                if ((cachep->size % PAGE_SIZE) == 0 &&
2677                            OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
2678                        kernel_map_pages(virt_to_page(objp),
2679                                         cachep->size / PAGE_SIZE, 0);
2680#else
2681                if (cachep->ctor)
2682                        cachep->ctor(objp);
2683#endif
2684                slab_bufctl(slabp)[i] = i + 1;
2685        }
2686        slab_bufctl(slabp)[i - 1] = BUFCTL_END;
2687}
2688
2689static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
2690{
2691        if (CONFIG_ZONE_DMA_FLAG) {
2692                if (flags & GFP_DMA)
2693                        BUG_ON(!(cachep->allocflags & GFP_DMA));
2694                else
2695                        BUG_ON(cachep->allocflags & GFP_DMA);
2696        }
2697}
2698
2699static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
2700                                int nodeid)
2701{
2702        void *objp = index_to_obj(cachep, slabp, slabp->free);
2703        kmem_bufctl_t next;
2704
2705        slabp->inuse++;
2706        next = slab_bufctl(slabp)[slabp->free];
2707#if DEBUG
2708        slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2709        WARN_ON(slabp->nodeid != nodeid);
2710#endif
2711        slabp->free = next;
2712
2713        return objp;
2714}
2715
2716static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
2717                                void *objp, int nodeid)
2718{
2719        unsigned int objnr = obj_to_index(cachep, slabp, objp);
2720
2721#if DEBUG
2722        /* Verify that the slab belongs to the intended node */
2723        WARN_ON(slabp->nodeid != nodeid);
2724
2725        if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) {
2726                printk(KERN_ERR "slab: double free detected in cache "
2727                                "'%s', objp %p\n", cachep->name, objp);
2728                BUG();
2729        }
2730#endif
2731        slab_bufctl(slabp)[objnr] = slabp->free;
2732        slabp->free = objnr;
2733        slabp->inuse--;
2734}
2735
2736/*
2737 * Map pages beginning at addr to the given cache and slab. This is required
2738 * for the slab allocator to be able to lookup the cache and slab of a
2739 * virtual address for kfree, ksize, and slab debugging.
2740 */
2741static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
2742                           void *addr)
2743{
2744        int nr_pages;
2745        struct page *page;
2746
2747        page = virt_to_page(addr);
2748
2749        nr_pages = 1;
2750        if (likely(!PageCompound(page)))
2751                nr_pages <<= cache->gfporder;
2752
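        /*
         * For a compound page nr_pages stays 1, i.e. only the head page is
         * marked; lookups on the free/debug paths go through
         * virt_to_head_page() (see cache_free_debugcheck()), so that is
         * sufficient.
         */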
2753        do {
2754                page->slab_cache = cache;
2755                page->slab_page = slab;
2756                page++;
2757        } while (--nr_pages);
2758}
2759
2760/*
2761 * Grow (by 1) the number of slabs within a cache.  This is called by
2762 * kmem_cache_alloc() when there are no active objs left in a cache.
2763 */
2764static int cache_grow(struct kmem_cache *cachep,
2765                gfp_t flags, int nodeid, void *objp)
2766{
2767        struct slab *slabp;
2768        size_t offset;
2769        gfp_t local_flags;
2770        struct kmem_cache_node *n;
2771
2772        /*
2773         * Be lazy and only check for valid flags here,  keeping it out of the
2774         * critical path in kmem_cache_alloc().
2775         */
2776        BUG_ON(flags & GFP_SLAB_BUG_MASK);
2777        local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
2778
2779        /* Take the node list lock to change the colour_next on this node */
2780        check_irq_off();
2781        n = cachep->node[nodeid];
2782        spin_lock(&n->list_lock);
2783
2784        /* Get the colour for the slab, and calculate the next value. */
2785        offset = n->colour_next;
2786        n->colour_next++;
2787        if (n->colour_next >= cachep->colour)
2788                n->colour_next = 0;
2789        spin_unlock(&n->list_lock);
2790
2791        offset *= cachep->colour_off;
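        /*
         * Example: with cachep->colour == 3 and colour_off == 64, successive
         * slabs place their objects at byte offsets 0, 64, 128, 0, ... so
         * that equal-index objects in different slabs do not all land on the
         * same cache lines.
         */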
2792
2793        if (local_flags & __GFP_WAIT)
2794                local_irq_enable();
2795
2796        /*
2797         * The test for missing atomic flag is performed here, rather than
2798         * the more obvious place, simply to reduce the critical path length
2799         * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
2800         * will eventually be caught here (where it matters).
2801         */
2802        kmem_flagcheck(cachep, flags);
2803
2804        /*
2805         * Get mem for the objs.  Attempt to allocate a physical page from
2806         * 'nodeid'.
2807         */
2808        if (!objp)
2809                objp = kmem_getpages(cachep, local_flags, nodeid);
2810        if (!objp)
2811                goto failed;
2812
2813        /* Get slab management. */
2814        slabp = alloc_slabmgmt(cachep, objp, offset,
2815                        local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
2816        if (!slabp)
2817                goto opps1;
2818
2819        slab_map_pages(cachep, slabp, objp);
2820
2821        cache_init_objs(cachep, slabp);
2822
2823        if (local_flags & __GFP_WAIT)
2824                local_irq_disable();
2825        check_irq_off();
2826        spin_lock(&n->list_lock);
2827
2828        /* Make slab active. */
2829        list_add_tail(&slabp->list, &(n->slabs_free));
2830        STATS_INC_GROWN(cachep);
2831        n->free_objects += cachep->num;
2832        spin_unlock(&n->list_lock);
2833        return 1;
2834opps1:
2835        kmem_freepages(cachep, objp);
2836failed:
2837        if (local_flags & __GFP_WAIT)
2838                local_irq_disable();
2839        return 0;
2840}
2841
2842#if DEBUG
2843
2844/*
2845 * Perform extra freeing checks:
2846 * - detect bad pointers.
2847 * - POISON/RED_ZONE checking
2848 */
2849static void kfree_debugcheck(const void *objp)
2850{
2851        if (!virt_addr_valid(objp)) {
2852                printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
2853                       (unsigned long)objp);
2854                BUG();
2855        }
2856}
2857
2858static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
2859{
2860        unsigned long long redzone1, redzone2;
2861
2862        redzone1 = *dbg_redzone1(cache, obj);
2863        redzone2 = *dbg_redzone2(cache, obj);
2864
2865        /*
2866         * Redzone is ok.
2867         */
2868        if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE)
2869                return;
2870
2871        if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE)
2872                slab_error(cache, "double free detected");
2873        else
2874                slab_error(cache, "memory outside object was overwritten");
2875
2876        printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n",
2877                        obj, redzone1, redzone2);
2878}
2879
2880static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2881                                   unsigned long caller)
2882{
2883        struct page *page;
2884        unsigned int objnr;
2885        struct slab *slabp;
2886
2887        BUG_ON(virt_to_cache(objp) != cachep);
2888
2889        objp -= obj_offset(cachep);
2890        kfree_debugcheck(objp);
2891        page = virt_to_head_page(objp);
2892
2893        slabp = page->slab_page;
2894
2895        if (cachep->flags & SLAB_RED_ZONE) {
2896                verify_redzone_free(cachep, objp);
2897                *dbg_redzone1(cachep, objp) = RED_INACTIVE;
2898                *dbg_redzone2(cachep, objp) = RED_INACTIVE;
2899        }
2900        if (cachep->flags & SLAB_STORE_USER)
2901                *dbg_userword(cachep, objp) = (void *)caller;
2902
2903        objnr = obj_to_index(cachep, slabp, objp);
2904
2905        BUG_ON(objnr >= cachep->num);
2906        BUG_ON(objp != index_to_obj(cachep, slabp, objnr));
2907
2908#ifdef CONFIG_DEBUG_SLAB_LEAK
2909        slab_bufctl(slabp)[objnr] = BUFCTL_FREE;
2910#endif
2911        if (cachep->flags & SLAB_POISON) {
2912#ifdef CONFIG_DEBUG_PAGEALLOC
2913                if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
2914                        store_stackinfo(cachep, objp, caller);
2915                        kernel_map_pages(virt_to_page(objp),
2916                                         cachep->size / PAGE_SIZE, 0);
2917                } else {
2918                        poison_obj(cachep, objp, POISON_FREE);
2919                }
2920#else
2921                poison_obj(cachep, objp, POISON_FREE);
2922#endif
2923        }
2924        return objp;
2925}
2926
2927static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
2928{
2929        kmem_bufctl_t i;
2930        int entries = 0;
2931
2932        /* Check slab's freelist to see if this obj is there. */
2933        for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
2934                entries++;
2935                if (entries > cachep->num || i >= cachep->num)
2936                        goto bad;
2937        }
2938        if (entries != cachep->num - slabp->inuse) {
2939bad:
2940                printk(KERN_ERR "slab: Internal list corruption detected in "
2941                        "cache '%s'(%d), slabp %p(%d). Tainted(%s). Hexdump:\n",
2942                        cachep->name, cachep->num, slabp, slabp->inuse,
2943                        print_tainted());
2944                print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, slabp,
2945                        sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t),
2946                        1);
2947                BUG();
2948        }
2949}
2950#else
2951#define kfree_debugcheck(x) do { } while(0)
2952#define cache_free_debugcheck(x,objp,z) (objp)
2953#define check_slabp(x,y) do { } while(0)
2954#endif
2955
2956static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
2957                                                        bool force_refill)
2958{
2959        int batchcount;
2960        struct kmem_cache_node *n;
2961        struct array_cache *ac;
2962        int node;
2963
2964        check_irq_off();
2965        node = numa_mem_id();
2966        if (unlikely(force_refill))
2967                goto force_grow;
2968retry:
2969        ac = cpu_cache_get(cachep);
2970        batchcount = ac->batchcount;
2971        if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
2972                /*
2973                 * If there was little recent activity on this cache, then
2974                 * perform only a partial refill.  Otherwise we could generate
2975                 * refill bouncing.
2976                 */
2977                batchcount = BATCHREFILL_LIMIT;
2978        }
2979        n = cachep->node[node];
2980
2981        BUG_ON(ac->avail > 0 || !n);
2982        spin_lock(&n->list_lock);
2983
2984        /* See if we can refill from the shared array */
2985        if (n->shared && transfer_objects(ac, n->shared, batchcount)) {
2986                n->shared->touched = 1;
2987                goto alloc_done;
2988        }
2989
2990        while (batchcount > 0) {
2991                struct list_head *entry;
2992                struct slab *slabp;
2993                /* Get the slab the allocation is to come from. */
2994                entry = n->slabs_partial.next;
2995                if (entry == &n->slabs_partial) {
2996                        n->free_touched = 1;
2997                        entry = n->slabs_free.next;
2998                        if (entry == &n->slabs_free)
2999                                goto must_grow;
3000                }
3001
3002                slabp = list_entry(entry, struct slab, list);
3003                check_slabp(cachep, slabp);
3004                check_spinlock_acquired(cachep);
3005
3006                /*
3007                 * The slab was either on partial or free list so
3008                 * there must be at least one object available for
3009                 * allocation.
3010                 */
3011                BUG_ON(slabp->inuse >= cachep->num);
3012
3013                while (slabp->inuse < cachep->num && batchcount--) {
3014                        STATS_INC_ALLOCED(cachep);
3015                        STATS_INC_ACTIVE(cachep);
3016                        STATS_SET_HIGH(cachep);
3017
3018                        ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp,
3019                                                                        node));
3020                }
3021                check_slabp(cachep, slabp);
3022
3023                /* move slabp to correct slabp list: */
3024                list_del(&slabp->list);
3025                if (slabp->free == BUFCTL_END)
3026                        list_add(&slabp->list, &n->slabs_full);
3027                else
3028                        list_add(&slabp->list, &n->slabs_partial);
3029        }
3030
3031must_grow:
3032        n->free_objects -= ac->avail;
3033alloc_done:
3034        spin_unlock(&n->list_lock);
3035
3036        if (unlikely(!ac->avail)) {
3037                int x;
3038force_grow:
3039                x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
3040
3041                /* cache_grow can reenable interrupts, then ac could change. */
3042                ac = cpu_cache_get(cachep);
3043                node = numa_mem_id();
3044
3045                /* no objects in sight? abort */
3046                if (!x && (ac->avail == 0 || force_refill))
3047                        return NULL;
3048
3049                if (!ac->avail)         /* objects refilled by interrupt? */
3050                        goto retry;
3051        }
3052        ac->touched = 1;
3053
3054        return ac_get_obj(cachep, ac, flags, force_refill);
3055}
3056
3057static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
3058                                                gfp_t flags)
3059{
3060        might_sleep_if(flags & __GFP_WAIT);
3061#if DEBUG
3062        kmem_flagcheck(cachep, flags);
3063#endif
3064}
3065
3066#if DEBUG
3067static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3068                                gfp_t flags, void *objp, unsigned long caller)
3069{
3070        if (!objp)
3071                return objp;
3072        if (cachep->flags & SLAB_POISON) {
3073#ifdef CONFIG_DEBUG_PAGEALLOC
3074                if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
3075                        kernel_map_pages(virt_to_page(objp),
3076                                         cachep->size / PAGE_SIZE, 1);
3077                else
3078                        check_poison_obj(cachep, objp);
3079#else
3080                check_poison_obj(cachep, objp);
3081#endif
3082                poison_obj(cachep, objp, POISON_INUSE);
3083        }
3084        if (cachep->flags & SLAB_STORE_USER)
3085                *dbg_userword(cachep, objp) = (void *)caller;
3086
3087        if (cachep->flags & SLAB_RED_ZONE) {
3088                if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
3089                                *dbg_redzone2(cachep, objp) != RED_INACTIVE) {
3090                        slab_error(cachep, "double free, or memory outside"
3091                                                " object was overwritten");
3092                        printk(KERN_ERR
3093                                "%p: redzone 1:0x%llx, redzone 2:0x%llx\n",
3094                                objp, *dbg_redzone1(cachep, objp),
3095                                *dbg_redzone2(cachep, objp));
3096                }
3097                *dbg_redzone1(cachep, objp) = RED_ACTIVE;
3098                *dbg_redzone2(cachep, objp) = RED_ACTIVE;
3099        }
3100#ifdef CONFIG_DEBUG_SLAB_LEAK
3101        {
3102                struct slab *slabp;
3103                unsigned objnr;
3104
3105                slabp = virt_to_head_page(objp)->slab_page;
3106                objnr = (unsigned)(objp - slabp->s_mem) / cachep->size;
3107                slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE;
3108        }
3109#endif
3110        objp += obj_offset(cachep);
3111        if (cachep->ctor && cachep->flags & SLAB_POISON)
3112                cachep->ctor(objp);
3113        if (ARCH_SLAB_MINALIGN &&
3114            ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) {
3115                printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
3116                       objp, (int)ARCH_SLAB_MINALIGN);
3117        }
3118        return objp;
3119}
3120#else
3121#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
3122#endif
3123
3124static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
3125{
3126        if (cachep == kmem_cache)
3127                return false;
3128
3129        return should_failslab(cachep->object_size, flags, cachep->flags);
3130}
3131
3132static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3133{
3134        void *objp;
3135        struct array_cache *ac;
3136        bool force_refill = false;
3137
3138        check_irq_off();
3139
3140        ac = cpu_cache_get(cachep);
3141        if (likely(ac->avail)) {
3142                ac->touched = 1;
3143                objp = ac_get_obj(cachep, ac, flags, false);
3144
3145                /*
3146                 * Allow for the possibility that all available objects are
3147                 * disallowed by the current flags (e.g. pfmemalloc objects).
3148                 */
3149                if (objp) {
3150                        STATS_INC_ALLOCHIT(cachep);
3151                        goto out;
3152                }
3153                force_refill = true;
3154        }
3155
3156        STATS_INC_ALLOCMISS(cachep);
3157        objp = cache_alloc_refill(cachep, flags, force_refill);
3158        /*
3159         * the 'ac' may be updated by cache_alloc_refill(),
3160         * and kmemleak_erase() requires its correct value.
3161         */
3162        ac = cpu_cache_get(cachep);
3163
3164out:
3165        /*
3166         * To avoid a false negative, if an object that is in one of the
3167         * per-CPU caches is leaked, we need to make sure kmemleak doesn't
3168         * treat the array pointers as a reference to the object.
3169         */
3170        if (objp)
3171                kmemleak_erase(&ac->entry[ac->avail]);
3172        return objp;
3173}
3174
3175#ifdef CONFIG_NUMA
3176/*
3177 * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY.
3178 *
3179 * If we are in_interrupt, then process context, including cpusets and
3180 * mempolicy, may not apply and should not be used for allocation policy.
3181 */
3182static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3183{
3184        int nid_alloc, nid_here;
3185
3186        if (in_interrupt() || (flags & __GFP_THISNODE))
3187                return NULL;
3188        nid_alloc = nid_here = numa_mem_id();
3189        if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
3190                nid_alloc = cpuset_slab_spread_node();
3191        else if (current->mempolicy)
3192                nid_alloc = slab_node();
3193        if (nid_alloc != nid_here)
3194                return ____cache_alloc_node(cachep, flags, nid_alloc);
3195        return NULL;
3196}
3197
3198/*
3199 * Fallback function used when there is no memory available and no objects on a
3200 * given node and falling back is permitted. First we scan all of the
3201 * available nodes for available objects. If that fails then we
3202 * perform an allocation without specifying a node. This allows the page
3203 * allocator to do its reclaim / fallback magic. We then insert the
3204 * slab into the proper nodelist and then allocate from it.
3205 */
3206static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3207{
3208        struct zonelist *zonelist;
3209        gfp_t local_flags;
3210        struct zoneref *z;
3211        struct zone *zone;
3212        enum zone_type high_zoneidx = gfp_zone(flags);
3213        void *obj = NULL;
3214        int nid;
3215        unsigned int cpuset_mems_cookie;
3216
3217        if (flags & __GFP_THISNODE)
3218                return NULL;
3219
3220        local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
3221
3222retry_cpuset:
3223        cpuset_mems_cookie = get_mems_allowed();
3224        zonelist = node_zonelist(slab_node(), flags);
3225
3226retry:
3227        /*
3228         * Look through allowed nodes for objects available
3229         * from existing per node queues.
3230         */
3231        for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
3232                nid = zone_to_nid(zone);
3233
3234                if (cpuset_zone_allowed_hardwall(zone, flags) &&
3235                        cache->node[nid] &&
3236                        cache->node[nid]->free_objects) {
3237                                obj = ____cache_alloc_node(cache,
3238                                        flags | GFP_THISNODE, nid);
3239                                if (obj)
3240                                        break;
3241                }
3242        }
3243
3244        if (!obj) {
3245                /*
3246                 * This allocation will be performed within the constraints
3247                 * of the current cpuset / memory policy requirements.
3248                 * We may trigger various forms of reclaim on the allowed
3249                 * set and go into memory reserves if necessary.
3250                 */
3251                if (local_flags & __GFP_WAIT)
3252                        local_irq_enable();
3253                kmem_flagcheck(cache, flags);
3254                obj = kmem_getpages(cache, local_flags, numa_mem_id());
3255                if (local_flags & __GFP_WAIT)
3256                        local_irq_disable();
3257                if (obj) {
3258                        /*
3259                         * Insert into the appropriate per node queues
3260                         */
3261                        nid = page_to_nid(virt_to_page(obj));
3262                        if (cache_grow(cache, flags, nid, obj)) {
3263                                obj = ____cache_alloc_node(cache,
3264                                        flags | GFP_THISNODE, nid);
3265                                if (!obj)
3266                                        /*
3267                                         * Another processor may allocate the
3268                                         * objects in the slab since we are
3269                                         * not holding any locks.
3270                                         */
3271                                        goto retry;
3272                        } else {
3273                                /* cache_grow already freed obj */
3274                                obj = NULL;
3275                        }
3276                }
3277        }
3278
3279        if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj))
3280                goto retry_cpuset;
3281        return obj;
3282}
3283
3284/*
3285 * An interface to allocate an object from a specific node (nodeid).
3286 */
3287static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
3288                                int nodeid)
3289{
3290        struct list_head *entry;
3291        struct slab *slabp;
3292        struct kmem_cache_node *n;
3293        void *obj;
3294        int x;
3295
3296        VM_BUG_ON(nodeid > num_online_nodes());
3297        n = cachep->node[nodeid];
3298        BUG_ON(!n);
3299
3300retry:
3301        check_irq_off();
3302        spin_lock(&n->list_lock);
3303        entry = n->slabs_partial.next;
3304        if (entry == &n->slabs_partial) {
3305                n->free_touched = 1;
3306                entry = n->slabs_free.next;
3307                if (entry == &n->slabs_free)
3308                        goto must_grow;
3309        }
3310
3311        slabp = list_entry(entry, struct slab, list);
3312        check_spinlock_acquired_node(cachep, nodeid);
3313        check_slabp(cachep, slabp);
3314
3315        STATS_INC_NODEALLOCS(cachep);
3316        STATS_INC_ACTIVE(cachep);
3317        STATS_SET_HIGH(cachep);
3318
3319        BUG_ON(slabp->inuse == cachep->num);
3320
3321        obj = slab_get_obj(cachep, slabp, nodeid);
3322        check_slabp(cachep, slabp);
3323        n->free_objects--;
3324        /* move slabp to correct slabp list: */
3325        list_del(&slabp->list);
3326
3327        if (slabp->free == BUFCTL_END)
3328                list_add(&slabp->list, &n->slabs_full);
3329        else
3330                list_add(&slabp->list, &n->slabs_partial);
3331
3332        spin_unlock(&n->list_lock);
3333        goto done;
3334
3335must_grow:
3336        spin_unlock(&n->list_lock);
3337        x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
3338        if (x)
3339                goto retry;
3340
3341        return fallback_alloc(cachep, flags);
3342
3343done:
3344        return obj;
3345}
3346
3347static __always_inline void *
3348slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3349                   unsigned long caller)
3350{
3351        unsigned long save_flags;
3352        void *ptr;
3353        int slab_node = numa_mem_id();
3354
3355        flags &= gfp_allowed_mask;
3356
3357        lockdep_trace_alloc(flags);
3358
3359        if (slab_should_failslab(cachep, flags))
3360                return NULL;
3361
3362        cachep = memcg_kmem_get_cache(cachep, flags);
3363
3364        cache_alloc_debugcheck_before(cachep, flags);
3365        local_irq_save(save_flags);
3366
3367        if (nodeid == NUMA_NO_NODE)
3368                nodeid = slab_node;
3369
3370        if (unlikely(!cachep->node[nodeid])) {
3371                /* Node not bootstrapped yet */
3372                ptr = fallback_alloc(cachep, flags);
3373                goto out;
3374        }
3375
3376        if (nodeid == slab_node) {
3377                /*
3378                 * Use the locally cached objects if possible.
3379                 * However ____cache_alloc does not allow fallback
3380                 * to other nodes. It may fail while we still have
3381                 * objects on other nodes available.
3382                 */
3383                ptr = ____cache_alloc(cachep, flags);
3384                if (ptr)
3385                        goto out;
3386        }
3387        /* ____cache_alloc_node can fall back to other nodes */
3388        ptr = ____cache_alloc_node(cachep, flags, nodeid);
3389  out:
3390        local_irq_restore(save_flags);
3391        ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
3392        kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags,
3393                                 flags);
3394
3395        if (likely(ptr))
3396                kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size);
3397
3398        if (unlikely((flags & __GFP_ZERO) && ptr))
3399                memset(ptr, 0, cachep->object_size);
3400
3401        return ptr;
3402}
3403
3404static __always_inline void *
3405__do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
3406{
3407        void *objp;
3408
3409        if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
3410                objp = alternate_node_alloc(cache, flags);
3411                if (objp)
3412                        goto out;
3413        }
3414        objp = ____cache_alloc(cache, flags);
3415
3416        /*
3417         * We may just have run out of memory on the local node.
3418         * ____cache_alloc_node() knows how to locate memory on other nodes
3419         */
3420        if (!objp)
3421                objp = ____cache_alloc_node(cache, flags, numa_mem_id());
3422
3423  out:
3424        return objp;
3425}
3426#else
3427
3428static __always_inline void *
3429__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3430{
3431        return ____cache_alloc(cachep, flags);
3432}
3433
3434#endif /* CONFIG_NUMA */
3435
3436static __always_inline void *
3437slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
3438{
3439        unsigned long save_flags;
3440        void *objp;
3441
3442        flags &= gfp_allowed_mask;
3443
3444        lockdep_trace_alloc(flags);
3445
3446        if (slab_should_failslab(cachep, flags))
3447                return NULL;
3448
3449        cachep = memcg_kmem_get_cache(cachep, flags);
3450
3451        cache_alloc_debugcheck_before(cachep, flags);
3452        local_irq_save(save_flags);
3453        objp = __do_cache_alloc(cachep, flags);
3454        local_irq_restore(save_flags);
3455        objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
3456        kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags,
3457                                 flags);
3458        prefetchw(objp);
3459
3460        if (likely(objp))
3461                kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size);
3462
3463        if (unlikely((flags & __GFP_ZERO) && objp))
3464                memset(objp, 0, cachep->object_size);
3465
3466        return objp;
3467}
3468
3469/*
3470 * Caller must hold the correct kmem_cache_node's list_lock.
3471 */
3472static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3473                       int node)
3474{
3475        int i;
3476        struct kmem_cache_node *n;
3477
3478        for (i = 0; i < nr_objects; i++) {
3479                void *objp;
3480                struct slab *slabp;
3481
3482                clear_obj_pfmemalloc(&objpp[i]);
3483                objp = objpp[i];
3484
3485                slabp = virt_to_slab(objp);
3486                n = cachep->node[node];
3487                list_del(&slabp->list);
3488                check_spinlock_acquired_node(cachep, node);
3489                check_slabp(cachep, slabp);
3490                slab_put_obj(cachep, slabp, objp, node);
3491                STATS_DEC_ACTIVE(cachep);
3492                n->free_objects++;
3493                check_slabp(cachep, slabp);
3494
3495                /* fixup slab chains */
3496                if (slabp->inuse == 0) {
3497                        if (n->free_objects > n->free_limit) {
3498                                n->free_objects -= cachep->num;
3499                                /* No need to drop any previously held
3500                                 * lock here; even if we have an off-slab slab
3501                                 * descriptor, it is guaranteed to come from
3502                                 * a different cache. Refer to the comments before
3503                                 * alloc_slabmgmt.
3504                                 */
3505                                slab_destroy(cachep, slabp);
3506                        } else {
3507                                list_add(&slabp->list, &n->slabs_free);
3508                        }
3509                } else {
3510                        /* Unconditionally move a slab to the end of the
3511                         * partial list on free - this gives the remaining
3512                         * objects the maximum time to be freed, too.
3513                         */
3514                        list_add_tail(&slabp->list, &n->slabs_partial);
3515                }
3516        }
3517}
3518
3519static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
3520{
3521        int batchcount;
3522        struct kmem_cache_node *n;
3523        int node = numa_mem_id();
3524
3525        batchcount = ac->batchcount;
3526#if DEBUG
3527        BUG_ON(!batchcount || batchcount > ac->avail);
3528#endif
3529        check_irq_off();
3530        n = cachep->node[node];
3531        spin_lock(&n->list_lock);
3532        if (n->shared) {
3533                struct array_cache *shared_array = n->shared;
3534                int max = shared_array->limit - shared_array->avail;
3535                if (max) {
3536                        if (batchcount > max)
3537                                batchcount = max;
3538                        memcpy(&(shared_array->entry[shared_array->avail]),
3539                               ac->entry, sizeof(void *) * batchcount);
3540                        shared_array->avail += batchcount;
3541                        goto free_done;
3542                }
3543        }
3544
3545        free_block(cachep, ac->entry, batchcount, node);
3546free_done:
3547#if STATS
3548        {
3549                int i = 0;
3550                struct list_head *p;
3551
3552                p = n->slabs_free.next;
3553                while (p != &(n->slabs_free)) {
3554                        struct slab *slabp;
3555
3556                        slabp = list_entry(p, struct slab, list);
3557                        BUG_ON(slabp->inuse);
3558
3559                        i++;
3560                        p = p->next;
3561                }
3562                STATS_SET_FREEABLE(cachep, i);
3563        }
3564#endif
3565        spin_unlock(&n->list_lock);
3566        ac->avail -= batchcount;
3567        memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
3568}
3569
3570/*
3571 * Release an obj back to its cache. If the obj has a constructed state, it must
3572 * be in this state _before_ it is released.  Called with local interrupts disabled.
3573 */
3574static inline void __cache_free(struct kmem_cache *cachep, void *objp,
3575                                unsigned long caller)
3576{
3577        struct array_cache *ac = cpu_cache_get(cachep);
3578
3579        check_irq_off();
3580        kmemleak_free_recursive(objp, cachep->flags);
3581        objp = cache_free_debugcheck(cachep, objp, caller);
3582
3583        kmemcheck_slab_free(cachep, objp, cachep->object_size);
3584
3585        /*
3586         * Skip calling cache_free_alien() when the platform is not NUMA.
3587         * This avoids the cache misses incurred by accessing slabp (a
3588         * per-page memory reference) just to get the nodeid. Instead, a
3589         * global variable, which is most likely already in the CPU cache,
3590         * is used to skip the call.
3591         */
3592        if (nr_online_nodes > 1 && cache_free_alien(cachep, objp))
3593                return;
3594
3595        if (likely(ac->avail < ac->limit)) {
3596                STATS_INC_FREEHIT(cachep);
3597        } else {
3598                STATS_INC_FREEMISS(cachep);
3599                cache_flusharray(cachep, ac);
3600        }
3601
3602        ac_put_obj(cachep, ac, objp);
3603}
3604
3605/**
3606 * kmem_cache_alloc - Allocate an object
3607 * @cachep: The cache to allocate from.
3608 * @flags: See kmalloc().
3609 *
3610 * Allocate an object from this cache.  The flags are only relevant
3611 * if the cache has no available objects.
3612 */
3613void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3614{
3615        void *ret = slab_alloc(cachep, flags, _RET_IP_);
3616
3617        trace_kmem_cache_alloc(_RET_IP_, ret,
3618                               cachep->object_size, cachep->size, flags);
3619
3620        return ret;
3621}
3622EXPORT_SYMBOL(kmem_cache_alloc);
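
/*
 * A minimal usage sketch: a typical caller creates a cache once and then
 * allocates objects from it with kmem_cache_alloc(). The cache name
 * "foo_cache" and struct foo below are hypothetical placeholders.
 *
 *	struct foo { int bar; };
 *	static struct kmem_cache *foo_cachep;
 *	struct foo *f;
 *
 *	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
 *				       0, SLAB_HWCACHE_ALIGN, NULL);
 *	if (!foo_cachep)
 *		return -ENOMEM;
 *	f = kmem_cache_alloc(foo_cachep, GFP_KERNEL);
 *	if (!f)
 *		return -ENOMEM;
 */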
3623
3624#ifdef CONFIG_TRACING
3625void *
3626kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
3627{
3628        void *ret;
3629
3630        ret = slab_alloc(cachep, flags, _RET_IP_);
3631
3632        trace_kmalloc(_RET_IP_, ret,
3633                      size, cachep->size, flags);
3634        return ret;
3635}
3636EXPORT_SYMBOL(kmem_cache_alloc_trace);
3637#endif
3638
3639#ifdef CONFIG_NUMA
3640/**
3641 * kmem_cache_alloc_node - Allocate an object on the specified node
3642 * @cachep: The cache to allocate from.
3643 * @flags: See kmalloc().
3644 * @nodeid: node number of the target node.
3645 *
3646 * Identical to kmem_cache_alloc but it will allocate memory on the given
3647 * node, which can improve the performance for cpu bound structures.
3648 *
3649 * Fallback to other node is possible if __GFP_THISNODE is not set.
3650 */
3651void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3652{
3653        void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
3654
3655        trace_kmem_cache_alloc_node(_RET_IP_, ret,
3656                                    cachep->object_size, cachep->size,
3657                                    flags, nodeid);
3658
3659        return ret;
3660}
3661EXPORT_SYMBOL(kmem_cache_alloc_node);
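
/*
 * A minimal sketch of node-aware allocation, reusing the hypothetical
 * foo_cachep from the kmem_cache_alloc() sketch above: one object is placed
 * on each online node so that later accesses stay node-local.
 *
 *	int nid;
 *	struct foo *f;
 *
 *	for_each_online_node(nid) {
 *		f = kmem_cache_alloc_node(foo_cachep, GFP_KERNEL, nid);
 *		if (!f)
 *			return -ENOMEM;
 *	}
 */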
3662
3663#ifdef CONFIG_TRACING
3664void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
3665                                  gfp_t flags,
3666                                  int nodeid,
3667                                  size_t size)
3668{
3669        void *ret;
3670
3671        ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
3672
3673        trace_kmalloc_node(_RET_IP_, ret,
3674                           size, cachep->size,
3675                           flags, nodeid);
3676        return ret;
3677}
3678EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
3679#endif
3680
3681static __always_inline void *
3682__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
3683{
3684        struct kmem_cache *cachep;
3685
3686        cachep = kmalloc_slab(size, flags);
3687        if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3688                return cachep;
3689        return kmem_cache_alloc_node_trace(cachep, flags, node, size);
3690}
3691
3692#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
3693void *__kmalloc_node(size_t size, gfp_t flags, int node)
3694{
3695        return __do_kmalloc_node(size, flags, node, _RET_IP_);
3696}
3697EXPORT_SYMBOL(__kmalloc_node);
3698
3699void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
3700                int node, unsigned long caller)
3701{
3702        return __do_kmalloc_node(size, flags, node, caller);
3703}
3704EXPORT_SYMBOL(__kmalloc_node_track_caller);
3705#else
3706void *__kmalloc_node(size_t size, gfp_t flags, int node)
3707{
3708        return __do_kmalloc_node(size, flags, node, 0);
3709}
3710EXPORT_SYMBOL(__kmalloc_node);
3711#endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */
3712#endif /* CONFIG_NUMA */
3713
3714/**
3715 * __do_kmalloc - allocate memory
3716 * @size: how many bytes of memory are required.
3717 * @flags: the type of memory to allocate (see kmalloc).
3718 * @caller: function caller for debug tracking of the caller
3719 */
3720static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3721                                          unsigned long caller)
3722{
3723        struct kmem_cache *cachep;
3724        void *ret;
3725
3726        /* If you want to save a few bytes of .text space: replace
3727         * __ with kmem_.
3728         * Then kmalloc uses the uninlined functions instead of the inline
3729         * functions.
3730         */
3731        cachep = kmalloc_slab(size, flags);
3732        if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3733                return cachep;
3734        ret = slab_alloc(cachep, flags, caller);
3735
3736        trace_kmalloc(caller, ret,
3737                      size, cachep->size, flags);
3738
3739        return ret;
3740}
3741
3742
3743#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
3744void *__kmalloc(size_t size, gfp_t flags)
3745{
3746        return __do_kmalloc(size, flags, _RET_IP_);
3747}
3748EXPORT_SYMBOL(__kmalloc);
3749
3750void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
3751{
3752        return __do_kmalloc(size, flags, caller);
3753}
3754EXPORT_SYMBOL(__kmalloc_track_caller);
3755
3756#else
3757void *__kmalloc(size_t size, gfp_t flags)
3758{
3759        return __do_kmalloc(size, flags, 0);
3760}
3761EXPORT_SYMBOL(__kmalloc);
3762#endif
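
/*
 * A minimal kmalloc() sketch: the request is mapped by kmalloc_slab() onto
 * the smallest kmalloc cache that fits, so the usable allocation may be
 * larger than requested (ksize() below reports the actual size). "nbuf" is
 * a hypothetical length.
 *
 *	char *buf = kmalloc(nbuf, GFP_KERNEL);
 *	if (!buf)
 *		return -ENOMEM;
 */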
3763
3764/**
3765 * kmem_cache_free - Deallocate an object
3766 * @cachep: The cache the allocation was from.
3767 * @objp: The previously allocated object.
3768 *
3769 * Free an object which was previously allocated from this
3770 * cache.
3771 */
3772void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3773{
3774        unsigned long flags;
3775        cachep = cache_from_obj(cachep, objp);
3776        if (!cachep)
3777                return;
3778
3779        local_irq_save(flags);
3780        debug_check_no_locks_freed(objp, cachep->object_size);
3781        if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
3782                debug_check_no_obj_freed(objp, cachep->object_size);
3783        __cache_free(cachep, objp, _RET_IP_);
3784        local_irq_restore(flags);
3785
3786        trace_kmem_cache_free(_RET_IP_, objp);
3787}
3788EXPORT_SYMBOL(kmem_cache_free);
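
/*
 * Completing the earlier kmem_cache_alloc() sketch with the hypothetical
 * foo_cachep: each object is returned to the cache it came from, and the
 * cache itself is destroyed only after all objects have been freed.
 *
 *	kmem_cache_free(foo_cachep, f);
 *	...
 *	kmem_cache_destroy(foo_cachep);
 */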
3789
3790/**
3791 * kfree - free previously allocated memory
3792 * @objp: pointer returned by kmalloc.
3793 *
3794 * If @objp is NULL, no operation is performed.
3795 *
3796 * Don't free memory not originally allocated by kmalloc()
3797 * or you will run into trouble.
3798 */
3799void kfree(const void *objp)
3800{
3801        struct kmem_cache *c;
3802        unsigned long flags;
3803
3804        trace_kfree(_RET_IP_, objp);
3805
3806        if (unlikely(ZERO_OR_NULL_PTR(objp)))
3807                return;
3808        local_irq_save(flags);
3809        kfree_debugcheck(objp);
3810        c = virt_to_cache(objp);
3811        debug_check_no_locks_freed(objp, c->object_size);
3812
3813        debug_check_no_obj_freed(objp, c->object_size);
3814        __cache_free(c, (void *)objp, _RET_IP_);
3815        local_irq_restore(flags);
3816}
3817EXPORT_SYMBOL(kfree);
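
/*
 * A small sketch exploiting the NULL/ZERO_SIZE_PTR check above: since
 * kfree(NULL) is a no-op, error paths may free unconditionally. "a" and "b"
 * are hypothetical buffers.
 *
 *	a = kmalloc(sizeof(*a), GFP_KERNEL);
 *	b = kmalloc(sizeof(*b), GFP_KERNEL);
 *	if (!a || !b) {
 *		kfree(a);
 *		kfree(b);
 *		return -ENOMEM;
 *	}
 */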
3818
3819/*
3820 * This initializes kmem_cache_node or resizes the per-node caches for all online nodes.
3821 */
3822static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
3823{
3824        int node;
3825        struct kmem_cache_node *n;
3826        struct array_cache *new_shared;
3827        struct array_cache **new_alien = NULL;
3828
3829        for_each_online_node(node) {
3830
3831                if (use_alien_caches) {
3832                        new_alien = alloc_alien_cache(node, cachep->limit, gfp);
3833                        if (!new_alien)
3834                                goto fail;
3835                }
3836
3837                new_shared = NULL;
3838                if (cachep->shared) {
3839                        new_shared = alloc_arraycache(node,
3840                                cachep->shared*cachep->batchcount,
3841                                        0xbaadf00d, gfp);
3842                        if (!new_shared) {
3843                                free_alien_cache(new_alien);
3844                                goto fail;
3845                        }
3846                }
3847
3848                n = cachep->node[node];
3849                if (n) {
3850                        struct array_cache *shared = n->shared;
3851
3852                        spin_lock_irq(&n->list_lock);
3853
3854                        if (shared)
3855                                free_block(cachep, shared->entry,
3856                                                shared->avail, node);
3857
3858                        n->shared = new_shared;
3859                        if (!n->alien) {
3860                                n->alien = new_alien;
3861                                new_alien = NULL;
3862                        }
3863                        n->free_limit = (1 + nr_cpus_node(node)) *
3864                                        cachep->batchcount + cachep->num;
3865                        spin_unlock_irq(&n->list_lock);
3866                        kfree(shared);
3867                        free_alien_cache(new_alien);
3868                        continue;
3869                }
3870                n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node);
3871                if (!n) {
3872                        free_alien_cache(new_alien);
3873                        kfree(new_shared);
3874                        goto fail;
3875                }
3876
3877                kmem_cache_node_init(n);
3878                n->next_reap = jiffies + REAPTIMEOUT_LIST3 +
3879                                ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
3880                n->shared = new_shared;
3881                n->alien = new_alien;
3882                n->free_limit = (1 + nr_cpus_node(node)) *
3883                                        cachep->batchcount + cachep->num;
3884                cachep->node[node] = n;
3885        }
3886        return 0;
3887
3888fail:
3889        if (!cachep->list.next) {
3890                /* Cache is not active yet. Roll back what we did */
3891                node--;
3892                while (node >= 0) {
3893                        if (cachep->node[node]) {
3894                                n = cachep->node[node];
3895
3896                                kfree(n->shared);
3897                                free_alien_cache(n->alien);
3898                                kfree(n);
3899                                cachep->node[node] = NULL;
3900                        }
3901                        node--;
3902                }
3903        }
3904        return -ENOMEM;
3905}
3906
3907struct ccupdate_struct {
3908        struct kmem_cache *cachep;
3909        struct array_cache *new[0];
3910};
3911
3912static void do_ccupdate_local(void *info)
3913{
3914        struct ccupdate_struct *new = info;
3915        struct array_cache *old;
3916
3917        check_irq_off();
3918        old = cpu_cache_get(new->cachep);
3919
3920        new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
3921        new->new[smp_processor_id()] = old;
3922}
3923
3924/* Always called with the slab_mutex held */
3925static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
3926                                int batchcount, int shared, gfp_t gfp)
3927{
3928        struct ccupdate_struct *new;
3929        int i;
3930
3931        new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *),
3932                      gfp);
3933        if (!new)
3934                return -ENOMEM;
3935
3936        for_each_online_cpu(i) {
3937                new->new[i] = alloc_arraycache(cpu_to_mem(i), limit,
3938                                                batchcount, gfp);
3939                if (!new->new[i]) {
3940                        for (i--; i >= 0; i--)
3941                                kfree(new->new[i]);
3942                        kfree(new);
3943                        return -ENOMEM;
3944                }
3945        }
3946        new->cachep = cachep;
3947
3948        on_each_cpu(do_ccupdate_local, (void *)new, 1);
3949
3950        check_irq_on();
3951        cachep->batchcount = batchcount;
3952        cachep->limit = limit;
3953        cachep->shared = shared;
3954
3955        for_each_online_cpu(i) {
3956                struct array_cache *ccold = new->new[i];
3957                if (!ccold)
3958                        continue;
3959                spin_lock_irq(&cachep->node[cpu_to_mem(i)]->list_lock);
3960                free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i));
3961                spin_unlock_irq(&cachep->node[cpu_to_mem(i)]->list_lock);
3962                kfree(ccold);
3963        }
3964        kfree(new);
3965        return alloc_kmemlist(cachep, gfp);
3966}
3967
3968static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3969                                int batchcount, int shared, gfp_t gfp)
3970{
3971        int ret;
3972        struct kmem_cache *c = NULL;
3973        int i = 0;
3974
3975        ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
3976
3977        if (slab_state < FULL)
3978                return ret;
3979
3980        if ((ret < 0) || !is_root_cache(cachep))
3981                return ret;
3982
3983        VM_BUG_ON(!mutex_is_locked(&slab_mutex));
3984        for_each_memcg_cache_index(i) {
3985                c = cache_from_memcg(cachep, i);
3986                if (c)
3987                        /* return value determined by the parent cache only */
3988                        __do_tune_cpucache(c, limit, batchcount, shared, gfp);
3989        }
3990
3991        return ret;
3992}
3993
3994/* Always called with the slab_mutex held */
3995static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
3996{
3997        int err;
3998        int limit = 0;
3999        int shared = 0;
4000        int batchcount = 0;
4001
4002        if (!is_root_cache(cachep)) {
4003                struct kmem_cache *root = memcg_root_cache(cachep);
4004                limit = root->limit;
4005                shared = root->shared;
4006                batchcount = root->batchcount;
4007        }
4008
4009        if (limit && shared && batchcount)
4010                goto skip_setup;
4011        /*
4012         * The head array serves three purposes:
4013         * - create a LIFO ordering, i.e. return objects that are cache-warm
4014         * - reduce the number of spinlock operations.
4015         * - reduce the number of linked list operations on the slab and
4016         *   bufctl chains: array operations are cheaper.
4017         * The numbers are guessed, we should auto-tune as described by
4018         * Bonwick.
4019         */
4020        if (cachep->size > 131072)
4021                limit = 1;
4022        else if (cachep->size > PAGE_SIZE)
4023                limit = 8;
4024        else if (cachep->size > 1024)
4025                limit = 24;
4026        else if (cachep->size > 256)
4027                limit = 54;
4028        else
4029                limit = 120;
4030
4031        /*
4032         * CPU bound tasks (e.g. network routing) can exhibit lopsided
4033         * allocation behaviour: most allocs on one cpu, most free operations
4034         * on another cpu. For these cases, an efficient object passing between
4035         * cpus is necessary. This is provided by a shared array. The array
4036         * replaces Bonwick's magazine layer.
4037         * On uniprocessor, it's functionally equivalent (but less efficient)
4038         * to a larger limit. Thus disabled by default.
4039         */
4040        shared = 0;
4041        if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1)
4042                shared = 8;
4043
4044#if DEBUG
4045        /*
4046         * With debugging enabled, a large batchcount leads to excessively long
4047         * periods with local interrupts disabled. Limit the batchcount.
4048         */
4049        if (limit > 32)
4050                limit = 32;
4051#endif
4052        batchcount = (limit + 1) / 2;
4053skip_setup:
4054        err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
4055        if (err)
4056                printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
4057                       cachep->name, -err);
4058        return err;
4059}
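
/*
 * Worked example of the sizing heuristic above (assuming a 4K PAGE_SIZE and
 * an SMP machine): for 512-byte objects, cachep->size falls in the "> 256"
 * bracket, so limit = 54, shared = 8 (size <= PAGE_SIZE), and
 * batchcount = (54 + 1) / 2 = 27. With DEBUG, limit is first capped to 32,
 * giving batchcount = 16.
 */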
4060
4061/*
4062 * Drain an array if it contains any elements, taking the node lock only if
4063 * necessary. Note that the node's list_lock also protects the array_cache
4064 * if drain_array() is used on the shared array.
4065 */
4066static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
4067                         struct array_cache *ac, int force, int node)
4068{
4069        int tofree;
4070
4071        if (!ac || !ac->avail)
4072                return;
4073        if (ac->touched && !force) {
4074                ac->touched = 0;
4075        } else {
4076                spin_lock_irq(&n->list_lock);
4077                if (ac->avail) {
4078                        tofree = force ? ac->avail : (ac->limit + 4) / 5;
4079                        if (tofree > ac->avail)
4080                                tofree = (ac->avail + 1) / 2;
4081                        free_block(cachep, ac->entry, tofree, node);
4082                        ac->avail -= tofree;
4083                        memmove(ac->entry, &(ac->entry[tofree]),
4084                                sizeof(void *) * ac->avail);
4085                }
4086                spin_unlock_irq(&n->list_lock);
4087        }
4088}
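
/*
 * Worked example for drain_array() above: with ac->limit == 120 and
 * ac->avail == 30, a periodic (force == 0) pass over an untouched array frees
 * tofree = (120 + 4) / 5 = 24 objects, while a forced drain frees all 30.
 * If only 10 objects were available, tofree would be clamped to
 * (10 + 1) / 2 = 5.
 */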
4089
4090/**
4091 * cache_reap - Reclaim memory from caches.
4092 * @w: work descriptor
4093 *
4094 * Called from workqueue/eventd every few seconds.
4095 * Purpose:
4096 * - clear the per-cpu caches for this CPU.
4097 * - return freeable pages to the main free memory pool.
4098 *
4099 * If we cannot acquire the cache chain mutex then just give up - we'll try
4100 * again on the next iteration.
4101 */
4102static void cache_reap(struct work_struct *w)
4103{
4104        struct kmem_cache *searchp;
4105        struct kmem_cache_node *n;
4106        int node = numa_mem_id();
4107        struct delayed_work *work = to_delayed_work(w);
4108
4109        if (!mutex_trylock(&slab_mutex))
4110                /* Give up. Set up the next iteration. */
4111                goto out;
4112
4113        list_for_each_entry(searchp, &slab_caches, list) {
4114                check_irq_on();
4115
4116                /*
4117                 * We only take the node lock if absolutely necessary, and only
4118                 * once we have established with reasonable certainty that
4119                 * there is work to do if the lock is obtained.
4120                 */
4121                n = searchp->node[node];
4122
4123                reap_alien(searchp, n);
4124
4125                drain_array(searchp, n, cpu_cache_get(searchp), 0, node);
4126
4127                /*
4128                 * These are racy checks but it does not matter
4129                 * if we skip one check or scan twice.
4130                 */
4131                if (time_after(n->next_reap, jiffies))
4132                        goto next;
4133
4134                n->next_reap = jiffies + REAPTIMEOUT_LIST3;
4135
4136                drain_array(searchp, n, n->shared, 0, node);
4137
4138                if (n->free_touched)
4139                        n->free_touched = 0;
4140                else {
4141                        int freed;
4142
4143                        freed = drain_freelist(searchp, n, (n->free_limit +
4144                                5 * searchp->num - 1) / (5 * searchp->num));
4145                        STATS_ADD_REAPED(searchp, freed);
4146                }
4147next:
4148                cond_resched();
4149        }
4150        check_irq_on();
4151        mutex_unlock(&slab_mutex);
4152        next_reap_node();
4153out:
4154        /* Set up the next iteration */
4155        schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC));
4156}
4157
4158#ifdef CONFIG_SLABINFO
4159void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
4160{
4161        struct slab *slabp;
4162        unsigned long active_objs;
4163        unsigned long num_objs;
4164        unsigned long active_slabs = 0;
4165        unsigned long num_slabs, free_objects = 0, shared_avail = 0;
4166        const char *name;
4167        char *error = NULL;
4168        int node;
4169        struct kmem_cache_node *n;
4170
4171        active_objs = 0;
4172        num_slabs = 0;
4173        for_each_online_node(node) {
4174                n = cachep->node[node];
4175                if (!n)
4176                        continue;
4177
4178                check_irq_on();
4179                spin_lock_irq(&n->list_lock);
4180
4181                list_for_each_entry(slabp, &n->slabs_full, list) {
4182                        if (slabp->inuse != cachep->num && !error)
4183                                error = "slabs_full accounting error";
4184                        active_objs += cachep->num;
4185                        active_slabs++;
4186                }
4187                list_for_each_entry(slabp, &n->slabs_partial, list) {
4188                        if (slabp->inuse == cachep->num && !error)
4189                                error = "slabs_partial inuse accounting error";
4190                        if (!slabp->inuse && !error)
4191                                error = "slabs_partial/inuse accounting error";
4192                        active_objs += slabp->inuse;
4193                        active_slabs++;
4194                }
4195                list_for_each_entry(slabp, &n->slabs_free, list) {
4196                        if (slabp->inuse && !error)
4197                                error = "slabs_free/inuse accounting error";
4198                        num_slabs++;
4199                }
4200                free_objects += n->free_objects;
4201                if (n->shared)
4202                        shared_avail += n->shared->avail;
4203
4204                spin_unlock_irq(&n->list_lock);
4205        }
4206        num_slabs += active_slabs;
4207        num_objs = num_slabs * cachep->num;
4208        if (num_objs - active_objs != free_objects && !error)
4209                error = "free_objects accounting error";
4210
4211        name = cachep->name;
4212        if (error)
4213                printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
4214
4215        sinfo->active_objs = active_objs;
4216        sinfo->num_objs = num_objs;
4217        sinfo->active_slabs = active_slabs;
4218        sinfo->num_slabs = num_slabs;
4219        sinfo->shared_avail = shared_avail;
4220        sinfo->limit = cachep->limit;
4221        sinfo->batchcount = cachep->batchcount;
4222        sinfo->shared = cachep->shared;
4223        sinfo->objects_per_slab = cachep->num;
4224        sinfo->cache_order = cachep->gfporder;
4225}
4226
4227void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep)
4228{
4229#if STATS
4230        {                       /* node stats */
4231                unsigned long high = cachep->high_mark;
4232                unsigned long allocs = cachep->num_allocations;
4233                unsigned long grown = cachep->grown;
4234                unsigned long reaped = cachep->reaped;
4235                unsigned long errors = cachep->errors;
4236                unsigned long max_freeable = cachep->max_freeable;
4237                unsigned long node_allocs = cachep->node_allocs;
4238                unsigned long node_frees = cachep->node_frees;
4239                unsigned long overflows = cachep->node_overflow;
4240
4241                seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu "
4242                           "%4lu %4lu %4lu %4lu %4lu",
4243                           allocs, high, grown,
4244                           reaped, errors, max_freeable, node_allocs,
4245                           node_frees, overflows);
4246        }
4247        /* cpu stats */
4248        {
4249                unsigned long allochit = atomic_read(&cachep->allochit);
4250                unsigned long allocmiss = atomic_read(&cachep->allocmiss);
4251                unsigned long freehit = atomic_read(&cachep->freehit);
4252                unsigned long freemiss = atomic_read(&cachep->freemiss);
4253
4254                seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
4255                           allochit, allocmiss, freehit, freemiss);
4256        }
4257#endif
4258}
4259
4260#define MAX_SLABINFO_WRITE 128
4261/**
4262 * slabinfo_write - Tuning for the slab allocator
4263 * @file: unused
4264 * @buffer: user buffer
4265 * @count: data length
4266 * @ppos: unused
4267 */
4268ssize_t slabinfo_write(struct file *file, const char __user *buffer,
4269                       size_t count, loff_t *ppos)
4270{
4271        char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
4272        int limit, batchcount, shared, res;
4273        struct kmem_cache *cachep;
4274
4275        if (count > MAX_SLABINFO_WRITE)
4276                return -EINVAL;
4277        if (copy_from_user(&kbuf, buffer, count))
4278                return -EFAULT;
4279        kbuf[MAX_SLABINFO_WRITE] = '\0';
4280
4281        tmp = strchr(kbuf, ' ');
4282        if (!tmp)
4283                return -EINVAL;
4284        *tmp = '\0';
4285        tmp++;
4286        if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3)
4287                return -EINVAL;
4288
4289        /* Find the cache in the chain of caches. */
4290        mutex_lock(&slab_mutex);
4291        res = -EINVAL;
4292        list_for_each_entry(cachep, &slab_caches, list) {
4293                if (!strcmp(cachep->name, kbuf)) {
4294                        if (limit < 1 || batchcount < 1 ||
4295                                        batchcount > limit || shared < 0) {
4296                                res = 0;
4297                        } else {
4298                                res = do_tune_cpucache(cachep, limit,
4299                                                       batchcount, shared,
4300                                                       GFP_KERNEL);
4301                        }
4302                        break;
4303                }
4304        }
4305        mutex_unlock(&slab_mutex);
4306        if (res >= 0)
4307                res = count;
4308        return res;
4309}
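
/*
 * Usage sketch for the tuning interface above: the expected input is
 * "<cache name> <limit> <batchcount> <shared>", so e.g. writing
 * "dentry 128 64 8" to /proc/slabinfo would retune the dentry cache with a
 * per-cpu limit of 128, a batchcount of 64 and shared set to 8 (the numbers
 * here are only illustrative; batchcount must not exceed limit).
 */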
4310
4311#ifdef CONFIG_DEBUG_SLAB_LEAK
4312
4313static void *leaks_start(struct seq_file *m, loff_t *pos)
4314{
4315        mutex_lock(&slab_mutex);
4316        return seq_list_start(&slab_caches, *pos);
4317}
4318
4319static inline int add_caller(unsigned long *n, unsigned long v)
4320{
4321        unsigned long *p;
4322        int l;
4323        if (!v)
4324                return 1;
4325        l = n[1];
4326        p = n + 2;
4327        while (l) {
4328                int i = l/2;
4329                unsigned long *q = p + 2 * i;
4330                if (*q == v) {
4331                        q[1]++;
4332                        return 1;
4333                }
4334                if (*q > v) {
4335                        l = i;
4336                } else {
4337                        p = q + 2;
4338                        l -= i + 1;
4339                }
4340        }
4341        if (++n[1] == n[0])
4342                return 0;
4343        memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n));
4344        p[0] = v;
4345        p[1] = 1;
4346        return 1;
4347}
4348
4349static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s)
4350{
4351        void *p;
4352        int i;
4353        if (n[0] == n[1])
4354                return;
4355        for (i = 0, p = s->s_mem; i < c->num; i++, p += c->size) {
4356                if (slab_bufctl(s)[i] != BUFCTL_ACTIVE)
4357                        continue;
4358                if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
4359                        return;
4360        }
4361}
4362
4363static void show_symbol(struct seq_file *m, unsigned long address)
4364{
4365#ifdef CONFIG_KALLSYMS
4366        unsigned long offset, size;
4367        char modname[MODULE_NAME_LEN], name[KSYM_NAME_LEN];
4368
4369        if (lookup_symbol_attrs(address, &size, &offset, modname, name) == 0) {
4370                seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
4371                if (modname[0])
4372                        seq_printf(m, " [%s]", modname);
4373                return;
4374        }
4375#endif
4376        seq_printf(m, "%p", (void *)address);
4377}
4378
4379static int leaks_show(struct seq_file *m, void *p)
4380{
4381        struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list);
4382        struct slab *slabp;
4383        struct kmem_cache_node *n;
4384        const char *name;
4385        unsigned long *x = m->private;
4386        int node;
4387        int i;
4388
4389        if (!(cachep->flags & SLAB_STORE_USER))
4390                return 0;
4391        if (!(cachep->flags & SLAB_RED_ZONE))
4392                return 0;
4393
4394        /* OK, we can do it */
4395
4396        x[1] = 0;
4397
4398        for_each_online_node(node) {
4399                n = cachep->node[node];
4400                if (!n)
4401                        continue;
4402
4403                check_irq_on();
4404                spin_lock_irq(&n->list_lock);
4405
4406                list_for_each_entry(slabp, &n->slabs_full, list)
4407                        handle_slab(x, cachep, slabp);
4408                list_for_each_entry(slabp, &n->slabs_partial, list)
4409                        handle_slab(x, cachep, slabp);
4410                spin_unlock_irq(&n->list_lock);
4411        }
4412        name = cachep->name;
4413        if (x[0] == x[1]) {
4414                /* Increase the buffer size */
4415                mutex_unlock(&slab_mutex);
4416                m->private = kzalloc(x[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
4417                if (!m->private) {
4418                        /* Too bad, we are really out */
4419                        m->private = x;
4420                        mutex_lock(&slab_mutex);
4421                        return -ENOMEM;
4422                }
4423                *(unsigned long *)m->private = x[0] * 2;
4424                kfree(x);
4425                mutex_lock(&slab_mutex);
4426                /* Now make sure this entry will be retried */
4427                m->count = m->size;
4428                return 0;
4429        }
4430        for (i = 0; i < x[1]; i++) {
4431                seq_printf(m, "%s: %lu ", name, x[2*i+3]);
4432                show_symbol(m, x[2*i+2]);
4433                seq_putc(m, '\n');
4434        }
4435
4436        return 0;
4437}
4438
4439static const struct seq_operations slabstats_op = {
4440        .start = leaks_start,
4441        .next = slab_next,
4442        .stop = slab_stop,
4443        .show = leaks_show,
4444};
4445
4446static int slabstats_open(struct inode *inode, struct file *file)
4447{
4448        unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL);
4449        int ret = -ENOMEM;
4450        if (n) {
4451                ret = seq_open(file, &slabstats_op);
4452                if (!ret) {
4453                        struct seq_file *m = file->private_data;
4454                        *n = PAGE_SIZE / (2 * sizeof(unsigned long));
4455                        m->private = n;
4456                        n = NULL;
4457                }
4458                kfree(n);
4459        }
4460        return ret;
4461}
4462
4463static const struct file_operations proc_slabstats_operations = {
4464        .open           = slabstats_open,
4465        .read           = seq_read,
4466        .llseek         = seq_lseek,
4467        .release        = seq_release_private,
4468};
4469#endif
4470
4471static int __init slab_proc_init(void)
4472{
4473#ifdef CONFIG_DEBUG_SLAB_LEAK
4474        proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);
4475#endif
4476        return 0;
4477}
4478module_init(slab_proc_init);
4479#endif
4480
4481/**
4482 * ksize - get the actual amount of memory allocated for a given object
4483 * @objp: Pointer to the object
4484 *
4485 * kmalloc may internally round up allocations and return more memory
4486 * than requested. ksize() can be used to determine the actual amount of
4487 * memory allocated. The caller may use this additional memory, even though
4488 * a smaller amount of memory was initially specified with the kmalloc call.
4489 * The caller must guarantee that objp points to a valid object previously
4490 * allocated with either kmalloc() or kmem_cache_alloc(). The object
4491 * must not be freed during the duration of the call.
4492 */
4493size_t ksize(const void *objp)
4494{
4495        BUG_ON(!objp);
4496        if (unlikely(objp == ZERO_SIZE_PTR))
4497                return 0;
4498
4499        return virt_to_cache(objp)->object_size;
4500}
4501EXPORT_SYMBOL(ksize);
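
/*
 * A small sketch: callers may grow into the slack that ksize() reports, e.g.
 * to use the full allocation behind a hypothetical "buf" of requested
 * length "len".
 *
 *	buf = kmalloc(len, GFP_KERNEL);
 *	if (buf)
 *		len = ksize(buf);
 */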
4502