linux/mm/slab.c
   1/*
   2 * linux/mm/slab.c
   3 * Written by Mark Hemment, 1996/97.
   4 * (markhe@nextd.demon.co.uk)
   5 *
   6 * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
   7 *
   8 * Major cleanup, different bufctl logic, per-cpu arrays
   9 *      (c) 2000 Manfred Spraul
  10 *
  11 * Cleanup, make the head arrays unconditional, preparation for NUMA
  12 *      (c) 2002 Manfred Spraul
  13 *
  14 * An implementation of the Slab Allocator as described in outline in;
  15 *      UNIX Internals: The New Frontiers by Uresh Vahalia
  16 *      Pub: Prentice Hall      ISBN 0-13-101908-2
  17 * or with a little more detail in;
  18 *      The Slab Allocator: An Object-Caching Kernel Memory Allocator
  19 *      Jeff Bonwick (Sun Microsystems).
  20 *      Presented at: USENIX Summer 1994 Technical Conference
  21 *
  22 * The memory is organized in caches, one cache for each object type.
  23 * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
   24 * Each cache consists of many slabs (they are small (usually one
  25 * page long) and always contiguous), and each slab contains multiple
  26 * initialized objects.
  27 *
   28 * This means that your constructor is used only for newly allocated
  29 * slabs and you must pass objects with the same initializations to
  30 * kmem_cache_free.
  31 *
  32 * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
   33 * normal). If you need a special memory type, then you must create a new
  34 * cache for that memory type.
  35 *
  36 * In order to reduce fragmentation, the slabs are sorted in 3 groups:
  37 *   full slabs with 0 free objects
  38 *   partial slabs
  39 *   empty slabs with no allocated objects
  40 *
   41 * If partial slabs exist, then new allocations come from these slabs;
   42 * otherwise they come from empty slabs, or new slabs are allocated.
  43 *
  44 * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
  45 * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
  46 *
   47 * Each cache has a short per-cpu head array; most allocs
  48 * and frees go into that array, and if that array overflows, then 1/2
  49 * of the entries in the array are given back into the global cache.
  50 * The head array is strictly LIFO and should improve the cache hit rates.
  51 * On SMP, it additionally reduces the spinlock operations.
  52 *
  53 * The c_cpuarray may not be read with enabled local interrupts -
  54 * it's changed with a smp_call_function().
  55 *
  56 * SMP synchronization:
  57 *  constructors and destructors are called without any locking.
   58 *  Several members in struct kmem_cache and struct slab never change; they
  59 *      are accessed without any locking.
  60 *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
  61 *      and local interrupts are disabled so slab code is preempt-safe.
  62 *  The non-constant members are protected with a per-cache irq spinlock.
  63 *
  64 * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
  65 * in 2000 - many ideas in the current implementation are derived from
  66 * his patch.
  67 *
  68 * Further notes from the original documentation:
  69 *
  70 * 11 April '97.  Started multi-threading - markhe
  71 *      The global cache-chain is protected by the mutex 'slab_mutex'.
  72 *      The sem is only needed when accessing/extending the cache-chain, which
  73 *      can never happen inside an interrupt (kmem_cache_create(),
  74 *      kmem_cache_shrink() and kmem_cache_reap()).
  75 *
  76 *      At present, each engine can be growing a cache.  This should be blocked.
  77 *
  78 * 15 March 2005. NUMA slab allocator.
  79 *      Shai Fultheim <shai@scalex86.org>.
  80 *      Shobhit Dayal <shobhit@calsoftinc.com>
  81 *      Alok N Kataria <alokk@calsoftinc.com>
  82 *      Christoph Lameter <christoph@lameter.com>
  83 *
  84 *      Modified the slab allocator to be node aware on NUMA systems.
  85 *      Each node has its own list of partial, free and full slabs.
  86 *      All object allocations for a node occur from node specific slab lists.
  87 */
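/*
 * Illustrative usage sketch for the cache API described above.  This is a
 * hedged example, not part of this file or of the kernel build; "my_obj" and
 * my_obj_ctor() are hypothetical names, while kmem_cache_create(),
 * kmem_cache_alloc(), kmem_cache_free() and kmem_cache_destroy() are the
 * real interfaces declared in <linux/slab.h>:
 *
 *	struct my_obj { int a; spinlock_t lock; };
 *
 *	static void my_obj_ctor(void *p)
 *	{
 *		struct my_obj *obj = p;
 *
 *		obj->a = 0;
 *		spin_lock_init(&obj->lock);
 *	}
 *
 *	cache = kmem_cache_create("my_obj", sizeof(struct my_obj), 0,
 *				  SLAB_HWCACHE_ALIGN, my_obj_ctor);
 *	obj = kmem_cache_alloc(cache, GFP_KERNEL);
 *	...
 *	kmem_cache_free(cache, obj);
 *	kmem_cache_destroy(cache);
 *
 * The constructor runs only when a new slab is grown, so objects must be
 * handed back to kmem_cache_free() in the initialized state described above.
 */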
  88
  89#include        <linux/slab.h>
  90#include        <linux/mm.h>
  91#include        <linux/poison.h>
  92#include        <linux/swap.h>
  93#include        <linux/cache.h>
  94#include        <linux/interrupt.h>
  95#include        <linux/init.h>
  96#include        <linux/compiler.h>
  97#include        <linux/cpuset.h>
  98#include        <linux/proc_fs.h>
  99#include        <linux/seq_file.h>
 100#include        <linux/notifier.h>
 101#include        <linux/kallsyms.h>
 102#include        <linux/cpu.h>
 103#include        <linux/sysctl.h>
 104#include        <linux/module.h>
 105#include        <linux/rcupdate.h>
 106#include        <linux/string.h>
 107#include        <linux/uaccess.h>
 108#include        <linux/nodemask.h>
 109#include        <linux/kmemleak.h>
 110#include        <linux/mempolicy.h>
 111#include        <linux/mutex.h>
 112#include        <linux/fault-inject.h>
 113#include        <linux/rtmutex.h>
 114#include        <linux/reciprocal_div.h>
 115#include        <linux/debugobjects.h>
 116#include        <linux/kmemcheck.h>
 117#include        <linux/memory.h>
 118#include        <linux/prefetch.h>
 119
 120#include        <net/sock.h>
 121
 122#include        <asm/cacheflush.h>
 123#include        <asm/tlbflush.h>
 124#include        <asm/page.h>
 125
 126#include <trace/events/kmem.h>
 127
 128#include        "internal.h"
 129
 130#include        "slab.h"
 131
 132/*
  133 * DEBUG        - 1 for kmem_cache_create() to honour SLAB_RED_ZONE & SLAB_POISON.
 134 *                0 for faster, smaller code (especially in the critical paths).
 135 *
 136 * STATS        - 1 to collect stats for /proc/slabinfo.
 137 *                0 for faster, smaller code (especially in the critical paths).
 138 *
 139 * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
 140 */
 141
 142#ifdef CONFIG_DEBUG_SLAB
 143#define DEBUG           1
 144#define STATS           1
 145#define FORCED_DEBUG    1
 146#else
 147#define DEBUG           0
 148#define STATS           0
 149#define FORCED_DEBUG    0
 150#endif
 151
 152/* Shouldn't this be in a header file somewhere? */
 153#define BYTES_PER_WORD          sizeof(void *)
 154#define REDZONE_ALIGN           max(BYTES_PER_WORD, __alignof__(unsigned long long))
 155
 156#ifndef ARCH_KMALLOC_FLAGS
 157#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
 158#endif
 159
 160/*
 161 * true if a page was allocated from pfmemalloc reserves for network-based
 162 * swap
 163 */
 164static bool pfmemalloc_active __read_mostly;
 165
 166/*
 167 * struct array_cache
 168 *
 169 * Purpose:
 170 * - LIFO ordering, to hand out cache-warm objects from _alloc
 171 * - reduce the number of linked list operations
 172 * - reduce spinlock operations
 173 *
 174 * The limit is stored in the per-cpu structure to reduce the data cache
 175 * footprint.
 176 *
 177 */
 178struct array_cache {
 179        unsigned int avail;
 180        unsigned int limit;
 181        unsigned int batchcount;
 182        unsigned int touched;
 183        spinlock_t lock;
 184        void *entry[];  /*
 185                         * Must have this definition in here for the proper
 186                         * alignment of array_cache. Also simplifies accessing
 187                         * the entries.
 188                         *
 189                         * Entries should not be directly dereferenced as
 190                         * entries belonging to slabs marked pfmemalloc will
  191                         * have the low bit set to SLAB_OBJ_PFMEMALLOC
 192                         */
 193};
 194
 195#define SLAB_OBJ_PFMEMALLOC     1
 196static inline bool is_obj_pfmemalloc(void *objp)
 197{
 198        return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC;
 199}
 200
 201static inline void set_obj_pfmemalloc(void **objp)
 202{
 203        *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC);
 204        return;
 205}
 206
 207static inline void clear_obj_pfmemalloc(void **objp)
 208{
 209        *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC);
 210}
 211
 212/*
 213 * bootstrap: The caches do not work without cpuarrays anymore, but the
 214 * cpuarrays are allocated from the generic caches...
 215 */
 216#define BOOT_CPUCACHE_ENTRIES   1
 217struct arraycache_init {
 218        struct array_cache cache;
 219        void *entries[BOOT_CPUCACHE_ENTRIES];
 220};
 221
 222/*
 223 * Need this for bootstrapping a per node allocator.
 224 */
 225#define NUM_INIT_LISTS (3 * MAX_NUMNODES)
 226static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS];
 227#define CACHE_CACHE 0
 228#define SIZE_AC MAX_NUMNODES
 229#define SIZE_NODE (2 * MAX_NUMNODES)
 230
 231static int drain_freelist(struct kmem_cache *cache,
 232                        struct kmem_cache_node *n, int tofree);
 233static void free_block(struct kmem_cache *cachep, void **objpp, int len,
 234                        int node);
 235static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
 236static void cache_reap(struct work_struct *unused);
 237
 238static int slab_early_init = 1;
 239
 240#define INDEX_AC kmalloc_index(sizeof(struct arraycache_init))
 241#define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node))
 242
 243static void kmem_cache_node_init(struct kmem_cache_node *parent)
 244{
 245        INIT_LIST_HEAD(&parent->slabs_full);
 246        INIT_LIST_HEAD(&parent->slabs_partial);
 247        INIT_LIST_HEAD(&parent->slabs_free);
 248        parent->shared = NULL;
 249        parent->alien = NULL;
 250        parent->colour_next = 0;
 251        spin_lock_init(&parent->list_lock);
 252        parent->free_objects = 0;
 253        parent->free_touched = 0;
 254}
 255
 256#define MAKE_LIST(cachep, listp, slab, nodeid)                          \
 257        do {                                                            \
 258                INIT_LIST_HEAD(listp);                                  \
 259                list_splice(&(cachep->node[nodeid]->slab), listp);      \
 260        } while (0)
 261
 262#define MAKE_ALL_LISTS(cachep, ptr, nodeid)                             \
 263        do {                                                            \
 264        MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid);  \
 265        MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
 266        MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid);  \
 267        } while (0)
 268
 269#define CFLGS_OFF_SLAB          (0x80000000UL)
 270#define OFF_SLAB(x)     ((x)->flags & CFLGS_OFF_SLAB)
 271
 272#define BATCHREFILL_LIMIT       16
 273/*
  274 * Optimization question: fewer reaps means less probability for unnecessary
 275 * cpucache drain/refill cycles.
 276 *
 277 * OTOH the cpuarrays can contain lots of objects,
 278 * which could lock up otherwise freeable slabs.
 279 */
 280#define REAPTIMEOUT_CPUC        (2*HZ)
 281#define REAPTIMEOUT_LIST3       (4*HZ)
 282
 283#if STATS
 284#define STATS_INC_ACTIVE(x)     ((x)->num_active++)
 285#define STATS_DEC_ACTIVE(x)     ((x)->num_active--)
 286#define STATS_INC_ALLOCED(x)    ((x)->num_allocations++)
 287#define STATS_INC_GROWN(x)      ((x)->grown++)
 288#define STATS_ADD_REAPED(x,y)   ((x)->reaped += (y))
 289#define STATS_SET_HIGH(x)                                               \
 290        do {                                                            \
 291                if ((x)->num_active > (x)->high_mark)                   \
 292                        (x)->high_mark = (x)->num_active;               \
 293        } while (0)
 294#define STATS_INC_ERR(x)        ((x)->errors++)
 295#define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++)
 296#define STATS_INC_NODEFREES(x)  ((x)->node_frees++)
 297#define STATS_INC_ACOVERFLOW(x)   ((x)->node_overflow++)
 298#define STATS_SET_FREEABLE(x, i)                                        \
 299        do {                                                            \
 300                if ((x)->max_freeable < i)                              \
 301                        (x)->max_freeable = i;                          \
 302        } while (0)
 303#define STATS_INC_ALLOCHIT(x)   atomic_inc(&(x)->allochit)
 304#define STATS_INC_ALLOCMISS(x)  atomic_inc(&(x)->allocmiss)
 305#define STATS_INC_FREEHIT(x)    atomic_inc(&(x)->freehit)
 306#define STATS_INC_FREEMISS(x)   atomic_inc(&(x)->freemiss)
 307#else
 308#define STATS_INC_ACTIVE(x)     do { } while (0)
 309#define STATS_DEC_ACTIVE(x)     do { } while (0)
 310#define STATS_INC_ALLOCED(x)    do { } while (0)
 311#define STATS_INC_GROWN(x)      do { } while (0)
 312#define STATS_ADD_REAPED(x,y)   do { (void)(y); } while (0)
 313#define STATS_SET_HIGH(x)       do { } while (0)
 314#define STATS_INC_ERR(x)        do { } while (0)
 315#define STATS_INC_NODEALLOCS(x) do { } while (0)
 316#define STATS_INC_NODEFREES(x)  do { } while (0)
 317#define STATS_INC_ACOVERFLOW(x)   do { } while (0)
 318#define STATS_SET_FREEABLE(x, i) do { } while (0)
 319#define STATS_INC_ALLOCHIT(x)   do { } while (0)
 320#define STATS_INC_ALLOCMISS(x)  do { } while (0)
 321#define STATS_INC_FREEHIT(x)    do { } while (0)
 322#define STATS_INC_FREEMISS(x)   do { } while (0)
 323#endif
 324
 325#if DEBUG
 326
 327/*
 328 * memory layout of objects:
 329 * 0            : objp
 330 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
 331 *              the end of an object is aligned with the end of the real
 332 *              allocation. Catches writes behind the end of the allocation.
 333 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
 334 *              redzone word.
 335 * cachep->obj_offset: The real object.
 336 * cachep->size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
 337 * cachep->size - 1* BYTES_PER_WORD: last caller address
 338 *                                      [BYTES_PER_WORD long]
 339 */
 340static int obj_offset(struct kmem_cache *cachep)
 341{
 342        return cachep->obj_offset;
 343}
 344
 345static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
 346{
 347        BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
 348        return (unsigned long long*) (objp + obj_offset(cachep) -
 349                                      sizeof(unsigned long long));
 350}
 351
 352static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
 353{
 354        BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
 355        if (cachep->flags & SLAB_STORE_USER)
 356                return (unsigned long long *)(objp + cachep->size -
 357                                              sizeof(unsigned long long) -
 358                                              REDZONE_ALIGN);
 359        return (unsigned long long *) (objp + cachep->size -
 360                                       sizeof(unsigned long long));
 361}
 362
 363static void **dbg_userword(struct kmem_cache *cachep, void *objp)
 364{
 365        BUG_ON(!(cachep->flags & SLAB_STORE_USER));
 366        return (void **)(objp + cachep->size - BYTES_PER_WORD);
 367}
 368
 369#else
 370
 371#define obj_offset(x)                   0
 372#define dbg_redzone1(cachep, objp)      ({BUG(); (unsigned long long *)NULL;})
 373#define dbg_redzone2(cachep, objp)      ({BUG(); (unsigned long long *)NULL;})
 374#define dbg_userword(cachep, objp)      ({BUG(); (void **)NULL;})
 375
 376#endif
 377
 378/*
 379 * Do not go above this order unless 0 objects fit into the slab or
 380 * overridden on the command line.
 381 */
 382#define SLAB_MAX_ORDER_HI       1
 383#define SLAB_MAX_ORDER_LO       0
 384static int slab_max_order = SLAB_MAX_ORDER_LO;
 385static bool slab_max_order_set __initdata;
 386
 387static inline struct kmem_cache *virt_to_cache(const void *obj)
 388{
 389        struct page *page = virt_to_head_page(obj);
 390        return page->slab_cache;
 391}
 392
 393static inline void *index_to_obj(struct kmem_cache *cache, struct page *page,
 394                                 unsigned int idx)
 395{
 396        return page->s_mem + cache->size * idx;
 397}
 398
 399/*
  400 * We want to avoid an expensive divide: (offset / cache->size)
 401 *   Using the fact that size is a constant for a particular cache,
 402 *   we can replace (offset / cache->size) by
 403 *   reciprocal_divide(offset, cache->reciprocal_buffer_size)
 404 */
 405static inline unsigned int obj_to_index(const struct kmem_cache *cache,
 406                                        const struct page *page, void *obj)
 407{
 408        u32 offset = (obj - page->s_mem);
 409        return reciprocal_divide(offset, cache->reciprocal_buffer_size);
 410}
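/*
 * Worked example of the reciprocal trick above (an illustrative sketch; it
 * assumes the classic reciprocal_value()/reciprocal_divide() helpers used by
 * this file, i.e. R = (2^32 + size - 1) / size and
 * reciprocal_divide(x, R) = ((u64)x * R) >> 32):
 *
 *	size = 192, reciprocal_buffer_size = reciprocal_value(192) = 22369622
 *
 *	object at offset 960 from page->s_mem (the 6th object):
 *	(960 * 22369622) >> 32 = 21474837120 >> 32 = 5  ==  960 / 192
 */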
 411
 412static struct arraycache_init initarray_generic =
 413    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
 414
 415/* internal cache of cache description objs */
 416static struct kmem_cache kmem_cache_boot = {
 417        .batchcount = 1,
 418        .limit = BOOT_CPUCACHE_ENTRIES,
 419        .shared = 1,
 420        .size = sizeof(struct kmem_cache),
 421        .name = "kmem_cache",
 422};
 423
 424#define BAD_ALIEN_MAGIC 0x01020304ul
 425
 426#ifdef CONFIG_LOCKDEP
 427
 428/*
 429 * Slab sometimes uses the kmalloc slabs to store the slab headers
 430 * for other slabs "off slab".
 431 * The locking for this is tricky in that it nests within the locks
 432 * of all other slabs in a few places; to deal with this special
 433 * locking we put on-slab caches into a separate lock-class.
 434 *
 435 * We set lock class for alien array caches which are up during init.
  436 * The lock annotation will be lost if all cpus of a node go down and
  437 * then come back up during hotplug.
 438 */
 439static struct lock_class_key on_slab_l3_key;
 440static struct lock_class_key on_slab_alc_key;
 441
 442static struct lock_class_key debugobj_l3_key;
 443static struct lock_class_key debugobj_alc_key;
 444
 445static void slab_set_lock_classes(struct kmem_cache *cachep,
 446                struct lock_class_key *l3_key, struct lock_class_key *alc_key,
 447                int q)
 448{
 449        struct array_cache **alc;
 450        struct kmem_cache_node *n;
 451        int r;
 452
 453        n = cachep->node[q];
 454        if (!n)
 455                return;
 456
 457        lockdep_set_class(&n->list_lock, l3_key);
 458        alc = n->alien;
 459        /*
 460         * FIXME: This check for BAD_ALIEN_MAGIC
 461         * should go away when common slab code is taught to
 462         * work even without alien caches.
  463         * Currently, non-NUMA code returns BAD_ALIEN_MAGIC
  464         * for alloc_alien_cache.
 465         */
 466        if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
 467                return;
 468        for_each_node(r) {
 469                if (alc[r])
 470                        lockdep_set_class(&alc[r]->lock, alc_key);
 471        }
 472}
 473
 474static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
 475{
 476        slab_set_lock_classes(cachep, &debugobj_l3_key, &debugobj_alc_key, node);
 477}
 478
 479static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
 480{
 481        int node;
 482
 483        for_each_online_node(node)
 484                slab_set_debugobj_lock_classes_node(cachep, node);
 485}
 486
 487static void init_node_lock_keys(int q)
 488{
 489        int i;
 490
 491        if (slab_state < UP)
 492                return;
 493
 494        for (i = 1; i <= KMALLOC_SHIFT_HIGH; i++) {
 495                struct kmem_cache_node *n;
 496                struct kmem_cache *cache = kmalloc_caches[i];
 497
 498                if (!cache)
 499                        continue;
 500
 501                n = cache->node[q];
 502                if (!n || OFF_SLAB(cache))
 503                        continue;
 504
 505                slab_set_lock_classes(cache, &on_slab_l3_key,
 506                                &on_slab_alc_key, q);
 507        }
 508}
 509
 510static void on_slab_lock_classes_node(struct kmem_cache *cachep, int q)
 511{
 512        if (!cachep->node[q])
 513                return;
 514
 515        slab_set_lock_classes(cachep, &on_slab_l3_key,
 516                        &on_slab_alc_key, q);
 517}
 518
 519static inline void on_slab_lock_classes(struct kmem_cache *cachep)
 520{
 521        int node;
 522
 523        VM_BUG_ON(OFF_SLAB(cachep));
 524        for_each_node(node)
 525                on_slab_lock_classes_node(cachep, node);
 526}
 527
 528static inline void init_lock_keys(void)
 529{
 530        int node;
 531
 532        for_each_node(node)
 533                init_node_lock_keys(node);
 534}
 535#else
 536static void init_node_lock_keys(int q)
 537{
 538}
 539
 540static inline void init_lock_keys(void)
 541{
 542}
 543
 544static inline void on_slab_lock_classes(struct kmem_cache *cachep)
 545{
 546}
 547
 548static inline void on_slab_lock_classes_node(struct kmem_cache *cachep, int node)
 549{
 550}
 551
 552static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
 553{
 554}
 555
 556static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
 557{
 558}
 559#endif
 560
 561static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
 562
 563static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
 564{
 565        return cachep->array[smp_processor_id()];
 566}
 567
 568static size_t slab_mgmt_size(size_t nr_objs, size_t align)
 569{
 570        return ALIGN(nr_objs * sizeof(unsigned int), align);
 571}
 572
 573/*
 574 * Calculate the number of objects and left-over bytes for a given buffer size.
 575 */
 576static void cache_estimate(unsigned long gfporder, size_t buffer_size,
 577                           size_t align, int flags, size_t *left_over,
 578                           unsigned int *num)
 579{
 580        int nr_objs;
 581        size_t mgmt_size;
 582        size_t slab_size = PAGE_SIZE << gfporder;
 583
 584        /*
 585         * The slab management structure can be either off the slab or
 586         * on it. For the latter case, the memory allocated for a
 587         * slab is used for:
 588         *
 589         * - One unsigned int for each object
 590         * - Padding to respect alignment of @align
 591         * - @buffer_size bytes for each object
 592         *
 593         * If the slab management structure is off the slab, then the
 594         * alignment will already be calculated into the size. Because
 595         * the slabs are all pages aligned, the objects will be at the
 596         * correct alignment when allocated.
 597         */
 598        if (flags & CFLGS_OFF_SLAB) {
 599                mgmt_size = 0;
 600                nr_objs = slab_size / buffer_size;
 601
 602        } else {
 603                /*
 604                 * Ignore padding for the initial guess. The padding
 605                 * is at most @align-1 bytes, and @buffer_size is at
 606                 * least @align. In the worst case, this result will
 607                 * be one greater than the number of objects that fit
 608                 * into the memory allocation when taking the padding
 609                 * into account.
 610                 */
 611                nr_objs = (slab_size) / (buffer_size + sizeof(unsigned int));
 612
 613                /*
 614                 * This calculated number will be either the right
 615                 * amount, or one greater than what we want.
 616                 */
 617                if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
 618                       > slab_size)
 619                        nr_objs--;
 620
 621                mgmt_size = slab_mgmt_size(nr_objs, align);
 622        }
 623        *num = nr_objs;
 624        *left_over = slab_size - nr_objs*buffer_size - mgmt_size;
 625}
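/*
 * Worked example for cache_estimate() (an illustrative sketch, assuming
 * on-slab management and sizeof(unsigned int) == 4):
 *
 *	gfporder = 0  ->  slab_size = 4096
 *	buffer_size = 256, align = 8
 *
 *	initial guess: nr_objs = 4096 / (256 + 4) = 15
 *	check:         slab_mgmt_size(15, 8) + 15 * 256 = 64 + 3840 = 3904 <= 4096
 *	result:        *num = 15, *left_over = 4096 - 3840 - 64 = 192
 */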
 626
 627#if DEBUG
 628#define slab_error(cachep, msg) __slab_error(__func__, cachep, msg)
 629
 630static void __slab_error(const char *function, struct kmem_cache *cachep,
 631                        char *msg)
 632{
 633        printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
 634               function, cachep->name, msg);
 635        dump_stack();
 636        add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
 637}
 638#endif
 639
 640/*
 641 * By default on NUMA we use alien caches to stage the freeing of
 642 * objects allocated from other nodes. This causes massive memory
  643 * inefficiencies when using a fake NUMA setup to split memory into a
  644 * large number of small nodes, so it can be disabled on the command
  645 * line.
  646 */
 647
 648static int use_alien_caches __read_mostly = 1;
 649static int __init noaliencache_setup(char *s)
 650{
 651        use_alien_caches = 0;
 652        return 1;
 653}
 654__setup("noaliencache", noaliencache_setup);
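/*
 * Example (illustrative): booting with "noaliencache" on the kernel command
 * line sets use_alien_caches to 0, so remote frees bypass the alien arrays
 * and are returned directly to the owning node's lists (see
 * cache_free_alien()).
 */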
 655
 656static int __init slab_max_order_setup(char *str)
 657{
 658        get_option(&str, &slab_max_order);
 659        slab_max_order = slab_max_order < 0 ? 0 :
 660                                min(slab_max_order, MAX_ORDER - 1);
 661        slab_max_order_set = true;
 662
 663        return 1;
 664}
 665__setup("slab_max_order=", slab_max_order_setup);
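/*
 * Example (illustrative): booting with "slab_max_order=1" caps slab page
 * orders at 1 (two pages per slab); negative values are clamped to 0 and
 * large values to MAX_ORDER - 1, as done above.
 */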
 666
 667#ifdef CONFIG_NUMA
 668/*
 669 * Special reaping functions for NUMA systems called from cache_reap().
  670 * These take care of doing round-robin flushing of alien caches (containing
  671 * objects freed on a node other than the one they were allocated from) and the
 672 * flushing of remote pcps by calling drain_node_pages.
 673 */
 674static DEFINE_PER_CPU(unsigned long, slab_reap_node);
 675
 676static void init_reap_node(int cpu)
 677{
 678        int node;
 679
 680        node = next_node(cpu_to_mem(cpu), node_online_map);
 681        if (node == MAX_NUMNODES)
 682                node = first_node(node_online_map);
 683
 684        per_cpu(slab_reap_node, cpu) = node;
 685}
 686
 687static void next_reap_node(void)
 688{
 689        int node = __this_cpu_read(slab_reap_node);
 690
 691        node = next_node(node, node_online_map);
 692        if (unlikely(node >= MAX_NUMNODES))
 693                node = first_node(node_online_map);
 694        __this_cpu_write(slab_reap_node, node);
 695}
 696
 697#else
 698#define init_reap_node(cpu) do { } while (0)
 699#define next_reap_node(void) do { } while (0)
 700#endif
 701
 702/*
 703 * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
 704 * via the workqueue/eventd.
 705 * Add the CPU number into the expiration time to minimize the possibility of
 706 * the CPUs getting into lockstep and contending for the global cache chain
 707 * lock.
 708 */
 709static void start_cpu_timer(int cpu)
 710{
 711        struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu);
 712
 713        /*
 714         * When this gets called from do_initcalls via cpucache_init(),
  715         * init_workqueues() has already run, so keventd will be set up
 716         * at that time.
 717         */
 718        if (keventd_up() && reap_work->work.func == NULL) {
 719                init_reap_node(cpu);
 720                INIT_DEFERRABLE_WORK(reap_work, cache_reap);
 721                schedule_delayed_work_on(cpu, reap_work,
 722                                        __round_jiffies_relative(HZ, cpu));
 723        }
 724}
 725
 726static struct array_cache *alloc_arraycache(int node, int entries,
 727                                            int batchcount, gfp_t gfp)
 728{
 729        int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
 730        struct array_cache *nc = NULL;
 731
 732        nc = kmalloc_node(memsize, gfp, node);
 733        /*
  734         * The array_cache structures contain pointers to free objects.
 735         * However, when such objects are allocated or transferred to another
 736         * cache the pointers are not cleared and they could be counted as
 737         * valid references during a kmemleak scan. Therefore, kmemleak must
 738         * not scan such objects.
 739         */
 740        kmemleak_no_scan(nc);
 741        if (nc) {
 742                nc->avail = 0;
 743                nc->limit = entries;
 744                nc->batchcount = batchcount;
 745                nc->touched = 0;
 746                spin_lock_init(&nc->lock);
 747        }
 748        return nc;
 749}
 750
 751static inline bool is_slab_pfmemalloc(struct page *page)
 752{
 753        return PageSlabPfmemalloc(page);
 754}
 755
  756/* Clears pfmemalloc_active if no slabs have pfmemalloc set */
 757static void recheck_pfmemalloc_active(struct kmem_cache *cachep,
 758                                                struct array_cache *ac)
 759{
 760        struct kmem_cache_node *n = cachep->node[numa_mem_id()];
 761        struct page *page;
 762        unsigned long flags;
 763
 764        if (!pfmemalloc_active)
 765                return;
 766
 767        spin_lock_irqsave(&n->list_lock, flags);
 768        list_for_each_entry(page, &n->slabs_full, lru)
 769                if (is_slab_pfmemalloc(page))
 770                        goto out;
 771
 772        list_for_each_entry(page, &n->slabs_partial, lru)
 773                if (is_slab_pfmemalloc(page))
 774                        goto out;
 775
 776        list_for_each_entry(page, &n->slabs_free, lru)
 777                if (is_slab_pfmemalloc(page))
 778                        goto out;
 779
 780        pfmemalloc_active = false;
 781out:
 782        spin_unlock_irqrestore(&n->list_lock, flags);
 783}
 784
 785static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
 786                                                gfp_t flags, bool force_refill)
 787{
 788        int i;
 789        void *objp = ac->entry[--ac->avail];
 790
 791        /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */
 792        if (unlikely(is_obj_pfmemalloc(objp))) {
 793                struct kmem_cache_node *n;
 794
 795                if (gfp_pfmemalloc_allowed(flags)) {
 796                        clear_obj_pfmemalloc(&objp);
 797                        return objp;
 798                }
 799
 800                /* The caller cannot use PFMEMALLOC objects, find another one */
 801                for (i = 0; i < ac->avail; i++) {
 802                        /* If a !PFMEMALLOC object is found, swap them */
 803                        if (!is_obj_pfmemalloc(ac->entry[i])) {
 804                                objp = ac->entry[i];
 805                                ac->entry[i] = ac->entry[ac->avail];
 806                                ac->entry[ac->avail] = objp;
 807                                return objp;
 808                        }
 809                }
 810
 811                /*
 812                 * If there are empty slabs on the slabs_free list and we are
 813                 * being forced to refill the cache, mark this one !pfmemalloc.
 814                 */
 815                n = cachep->node[numa_mem_id()];
 816                if (!list_empty(&n->slabs_free) && force_refill) {
 817                        struct page *page = virt_to_head_page(objp);
 818                        ClearPageSlabPfmemalloc(page);
 819                        clear_obj_pfmemalloc(&objp);
 820                        recheck_pfmemalloc_active(cachep, ac);
 821                        return objp;
 822                }
 823
 824                /* No !PFMEMALLOC objects available */
 825                ac->avail++;
 826                objp = NULL;
 827        }
 828
 829        return objp;
 830}
 831
 832static inline void *ac_get_obj(struct kmem_cache *cachep,
 833                        struct array_cache *ac, gfp_t flags, bool force_refill)
 834{
 835        void *objp;
 836
 837        if (unlikely(sk_memalloc_socks()))
 838                objp = __ac_get_obj(cachep, ac, flags, force_refill);
 839        else
 840                objp = ac->entry[--ac->avail];
 841
 842        return objp;
 843}
 844
 845static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
 846                                                                void *objp)
 847{
 848        if (unlikely(pfmemalloc_active)) {
 849                /* Some pfmemalloc slabs exist, check if this is one */
 850                struct page *page = virt_to_head_page(objp);
 851                if (PageSlabPfmemalloc(page))
 852                        set_obj_pfmemalloc(&objp);
 853        }
 854
 855        return objp;
 856}
 857
 858static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
 859                                                                void *objp)
 860{
 861        if (unlikely(sk_memalloc_socks()))
 862                objp = __ac_put_obj(cachep, ac, objp);
 863
 864        ac->entry[ac->avail++] = objp;
 865}
 866
 867/*
 868 * Transfer objects in one arraycache to another.
 869 * Locking must be handled by the caller.
 870 *
 871 * Return the number of entries transferred.
 872 */
 873static int transfer_objects(struct array_cache *to,
 874                struct array_cache *from, unsigned int max)
 875{
 876        /* Figure out how many entries to transfer */
 877        int nr = min3(from->avail, max, to->limit - to->avail);
 878
 879        if (!nr)
 880                return 0;
 881
  882        memcpy(to->entry + to->avail, from->entry + from->avail - nr,
  883                        sizeof(void *) * nr);
 884
 885        from->avail -= nr;
 886        to->avail += nr;
 887        return nr;
 888}
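/*
 * Worked example for transfer_objects() (illustrative numbers):
 *
 *	from->avail = 8, max = 12, to->limit = 16, to->avail = 10
 *	nr = min3(8, 12, 16 - 10) = 6
 *
 * The 6 most recently added entries of @from (the tail of its LIFO array) are
 * copied into @to, leaving from->avail = 2 and to->avail = 16.
 */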
 889
 890#ifndef CONFIG_NUMA
 891
 892#define drain_alien_cache(cachep, alien) do { } while (0)
 893#define reap_alien(cachep, n) do { } while (0)
 894
 895static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
 896{
 897        return (struct array_cache **)BAD_ALIEN_MAGIC;
 898}
 899
 900static inline void free_alien_cache(struct array_cache **ac_ptr)
 901{
 902}
 903
 904static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
 905{
 906        return 0;
 907}
 908
 909static inline void *alternate_node_alloc(struct kmem_cache *cachep,
 910                gfp_t flags)
 911{
 912        return NULL;
 913}
 914
 915static inline void *____cache_alloc_node(struct kmem_cache *cachep,
 916                 gfp_t flags, int nodeid)
 917{
 918        return NULL;
 919}
 920
 921#else   /* CONFIG_NUMA */
 922
 923static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
 924static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
 925
 926static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
 927{
 928        struct array_cache **ac_ptr;
 929        int memsize = sizeof(void *) * nr_node_ids;
 930        int i;
 931
 932        if (limit > 1)
 933                limit = 12;
 934        ac_ptr = kzalloc_node(memsize, gfp, node);
 935        if (ac_ptr) {
 936                for_each_node(i) {
 937                        if (i == node || !node_online(i))
 938                                continue;
 939                        ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp);
 940                        if (!ac_ptr[i]) {
 941                                for (i--; i >= 0; i--)
 942                                        kfree(ac_ptr[i]);
 943                                kfree(ac_ptr);
 944                                return NULL;
 945                        }
 946                }
 947        }
 948        return ac_ptr;
 949}
 950
 951static void free_alien_cache(struct array_cache **ac_ptr)
 952{
 953        int i;
 954
 955        if (!ac_ptr)
 956                return;
 957        for_each_node(i)
 958            kfree(ac_ptr[i]);
 959        kfree(ac_ptr);
 960}
 961
 962static void __drain_alien_cache(struct kmem_cache *cachep,
 963                                struct array_cache *ac, int node)
 964{
 965        struct kmem_cache_node *n = cachep->node[node];
 966
 967        if (ac->avail) {
 968                spin_lock(&n->list_lock);
 969                /*
  970                 * Stuff objects into the remote node's shared array first.
 971                 * That way we could avoid the overhead of putting the objects
 972                 * into the free lists and getting them back later.
 973                 */
 974                if (n->shared)
 975                        transfer_objects(n->shared, ac, ac->limit);
 976
 977                free_block(cachep, ac->entry, ac->avail, node);
 978                ac->avail = 0;
 979                spin_unlock(&n->list_lock);
 980        }
 981}
 982
 983/*
 984 * Called from cache_reap() to regularly drain alien caches round robin.
 985 */
 986static void reap_alien(struct kmem_cache *cachep, struct kmem_cache_node *n)
 987{
 988        int node = __this_cpu_read(slab_reap_node);
 989
 990        if (n->alien) {
 991                struct array_cache *ac = n->alien[node];
 992
 993                if (ac && ac->avail && spin_trylock_irq(&ac->lock)) {
 994                        __drain_alien_cache(cachep, ac, node);
 995                        spin_unlock_irq(&ac->lock);
 996                }
 997        }
 998}
 999
1000static void drain_alien_cache(struct kmem_cache *cachep,
1001                                struct array_cache **alien)
1002{
1003        int i = 0;
1004        struct array_cache *ac;
1005        unsigned long flags;
1006
1007        for_each_online_node(i) {
1008                ac = alien[i];
1009                if (ac) {
1010                        spin_lock_irqsave(&ac->lock, flags);
1011                        __drain_alien_cache(cachep, ac, i);
1012                        spin_unlock_irqrestore(&ac->lock, flags);
1013                }
1014        }
1015}
1016
1017static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1018{
1019        int nodeid = page_to_nid(virt_to_page(objp));
1020        struct kmem_cache_node *n;
1021        struct array_cache *alien = NULL;
1022        int node;
1023
1024        node = numa_mem_id();
1025
1026        /*
 1027         * Make sure we are not freeing an object from another node to the array
1028         * cache on this cpu.
1029         */
1030        if (likely(nodeid == node))
1031                return 0;
1032
1033        n = cachep->node[node];
1034        STATS_INC_NODEFREES(cachep);
1035        if (n->alien && n->alien[nodeid]) {
1036                alien = n->alien[nodeid];
1037                spin_lock(&alien->lock);
1038                if (unlikely(alien->avail == alien->limit)) {
1039                        STATS_INC_ACOVERFLOW(cachep);
1040                        __drain_alien_cache(cachep, alien, nodeid);
1041                }
1042                ac_put_obj(cachep, alien, objp);
1043                spin_unlock(&alien->lock);
1044        } else {
1045                spin_lock(&(cachep->node[nodeid])->list_lock);
1046                free_block(cachep, &objp, 1, nodeid);
1047                spin_unlock(&(cachep->node[nodeid])->list_lock);
1048        }
1049        return 1;
1050}
1051#endif
1052
1053/*
 1054 * Allocates and initializes a kmem_cache_node for the given node on each slab
 1055 * cache, used for either memory or cpu hotplug.  If memory is being hot-added,
 1056 * the kmem_cache_node will be allocated off-node since memory is not yet online
 1057 * for the new node.  When hotplugging memory or a cpu, an existing
 1058 * kmem_cache_node is not replaced if it is already in use.
1059 *
1060 * Must hold slab_mutex.
1061 */
1062static int init_cache_node_node(int node)
1063{
1064        struct kmem_cache *cachep;
1065        struct kmem_cache_node *n;
1066        const int memsize = sizeof(struct kmem_cache_node);
1067
1068        list_for_each_entry(cachep, &slab_caches, list) {
1069                /*
 1070                 * Set up the kmem_cache_node for this node before we can
 1071                 * begin anything. Make sure some other cpu on this
 1072                 * node has not already allocated it.
1073                 */
1074                if (!cachep->node[node]) {
1075                        n = kmalloc_node(memsize, GFP_KERNEL, node);
1076                        if (!n)
1077                                return -ENOMEM;
1078                        kmem_cache_node_init(n);
1079                        n->next_reap = jiffies + REAPTIMEOUT_LIST3 +
1080                            ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1081
1082                        /*
 1083                         * The kmem_cache_nodes don't come and go as CPUs come and
1084                         * go.  slab_mutex is sufficient
1085                         * protection here.
1086                         */
1087                        cachep->node[node] = n;
1088                }
1089
1090                spin_lock_irq(&cachep->node[node]->list_lock);
1091                cachep->node[node]->free_limit =
1092                        (1 + nr_cpus_node(node)) *
1093                        cachep->batchcount + cachep->num;
1094                spin_unlock_irq(&cachep->node[node]->list_lock);
1095        }
1096        return 0;
1097}
1098
1099static inline int slabs_tofree(struct kmem_cache *cachep,
1100                                                struct kmem_cache_node *n)
1101{
1102        return (n->free_objects + cachep->num - 1) / cachep->num;
1103}
1104
1105static void cpuup_canceled(long cpu)
1106{
1107        struct kmem_cache *cachep;
1108        struct kmem_cache_node *n = NULL;
1109        int node = cpu_to_mem(cpu);
1110        const struct cpumask *mask = cpumask_of_node(node);
1111
1112        list_for_each_entry(cachep, &slab_caches, list) {
1113                struct array_cache *nc;
1114                struct array_cache *shared;
1115                struct array_cache **alien;
1116
1117                /* cpu is dead; no one can alloc from it. */
1118                nc = cachep->array[cpu];
1119                cachep->array[cpu] = NULL;
1120                n = cachep->node[node];
1121
1122                if (!n)
1123                        goto free_array_cache;
1124
1125                spin_lock_irq(&n->list_lock);
1126
1127                /* Free limit for this kmem_cache_node */
1128                n->free_limit -= cachep->batchcount;
1129                if (nc)
1130                        free_block(cachep, nc->entry, nc->avail, node);
1131
1132                if (!cpumask_empty(mask)) {
1133                        spin_unlock_irq(&n->list_lock);
1134                        goto free_array_cache;
1135                }
1136
1137                shared = n->shared;
1138                if (shared) {
1139                        free_block(cachep, shared->entry,
1140                                   shared->avail, node);
1141                        n->shared = NULL;
1142                }
1143
1144                alien = n->alien;
1145                n->alien = NULL;
1146
1147                spin_unlock_irq(&n->list_lock);
1148
1149                kfree(shared);
1150                if (alien) {
1151                        drain_alien_cache(cachep, alien);
1152                        free_alien_cache(alien);
1153                }
1154free_array_cache:
1155                kfree(nc);
1156        }
1157        /*
1158         * In the previous loop, all the objects were freed to
 1159         * the respective cache's slabs; now we can go ahead and
1160         * shrink each nodelist to its limit.
1161         */
1162        list_for_each_entry(cachep, &slab_caches, list) {
1163                n = cachep->node[node];
1164                if (!n)
1165                        continue;
1166                drain_freelist(cachep, n, slabs_tofree(cachep, n));
1167        }
1168}
1169
1170static int cpuup_prepare(long cpu)
1171{
1172        struct kmem_cache *cachep;
1173        struct kmem_cache_node *n = NULL;
1174        int node = cpu_to_mem(cpu);
1175        int err;
1176
1177        /*
1178         * We need to do this right in the beginning since
 1179         * the alloc_arraycache() calls are going to use this list.
 1180         * kmalloc_node allows us to add the slab to the right
 1181         * kmem_cache_node and not this cpu's kmem_cache_node.
1182         */
1183        err = init_cache_node_node(node);
1184        if (err < 0)
1185                goto bad;
1186
1187        /*
1188         * Now we can go ahead with allocating the shared arrays and
1189         * array caches
1190         */
1191        list_for_each_entry(cachep, &slab_caches, list) {
1192                struct array_cache *nc;
1193                struct array_cache *shared = NULL;
1194                struct array_cache **alien = NULL;
1195
1196                nc = alloc_arraycache(node, cachep->limit,
1197                                        cachep->batchcount, GFP_KERNEL);
1198                if (!nc)
1199                        goto bad;
1200                if (cachep->shared) {
1201                        shared = alloc_arraycache(node,
1202                                cachep->shared * cachep->batchcount,
1203                                0xbaadf00d, GFP_KERNEL);
1204                        if (!shared) {
1205                                kfree(nc);
1206                                goto bad;
1207                        }
1208                }
1209                if (use_alien_caches) {
1210                        alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL);
1211                        if (!alien) {
1212                                kfree(shared);
1213                                kfree(nc);
1214                                goto bad;
1215                        }
1216                }
1217                cachep->array[cpu] = nc;
1218                n = cachep->node[node];
1219                BUG_ON(!n);
1220
1221                spin_lock_irq(&n->list_lock);
1222                if (!n->shared) {
1223                        /*
1224                         * We are serialised from CPU_DEAD or
1225                         * CPU_UP_CANCELLED by the cpucontrol lock
1226                         */
1227                        n->shared = shared;
1228                        shared = NULL;
1229                }
1230#ifdef CONFIG_NUMA
1231                if (!n->alien) {
1232                        n->alien = alien;
1233                        alien = NULL;
1234                }
1235#endif
1236                spin_unlock_irq(&n->list_lock);
1237                kfree(shared);
1238                free_alien_cache(alien);
1239                if (cachep->flags & SLAB_DEBUG_OBJECTS)
1240                        slab_set_debugobj_lock_classes_node(cachep, node);
1241                else if (!OFF_SLAB(cachep) &&
1242                         !(cachep->flags & SLAB_DESTROY_BY_RCU))
1243                        on_slab_lock_classes_node(cachep, node);
1244        }
1245        init_node_lock_keys(node);
1246
1247        return 0;
1248bad:
1249        cpuup_canceled(cpu);
1250        return -ENOMEM;
1251}
1252
1253static int cpuup_callback(struct notifier_block *nfb,
1254                                    unsigned long action, void *hcpu)
1255{
1256        long cpu = (long)hcpu;
1257        int err = 0;
1258
1259        switch (action) {
1260        case CPU_UP_PREPARE:
1261        case CPU_UP_PREPARE_FROZEN:
1262                mutex_lock(&slab_mutex);
1263                err = cpuup_prepare(cpu);
1264                mutex_unlock(&slab_mutex);
1265                break;
1266        case CPU_ONLINE:
1267        case CPU_ONLINE_FROZEN:
1268                start_cpu_timer(cpu);
1269                break;
1270#ifdef CONFIG_HOTPLUG_CPU
1271        case CPU_DOWN_PREPARE:
1272        case CPU_DOWN_PREPARE_FROZEN:
1273                /*
1274                 * Shutdown cache reaper. Note that the slab_mutex is
1275                 * held so that if cache_reap() is invoked it cannot do
1276                 * anything expensive but will only modify reap_work
1277                 * and reschedule the timer.
1278                */
1279                cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu));
1280                /* Now the cache_reaper is guaranteed to be not running. */
1281                per_cpu(slab_reap_work, cpu).work.func = NULL;
1282                break;
1283        case CPU_DOWN_FAILED:
1284        case CPU_DOWN_FAILED_FROZEN:
1285                start_cpu_timer(cpu);
1286                break;
1287        case CPU_DEAD:
1288        case CPU_DEAD_FROZEN:
1289                /*
1290                 * Even if all the cpus of a node are down, we don't free the
 1291                 * kmem_cache_node of any cache. This is to avoid a race between
1292                 * cpu_down, and a kmalloc allocation from another cpu for
1293                 * memory from the node of the cpu going down.  The node
1294                 * structure is usually allocated from kmem_cache_create() and
1295                 * gets destroyed at kmem_cache_destroy().
1296                 */
1297                /* fall through */
1298#endif
1299        case CPU_UP_CANCELED:
1300        case CPU_UP_CANCELED_FROZEN:
1301                mutex_lock(&slab_mutex);
1302                cpuup_canceled(cpu);
1303                mutex_unlock(&slab_mutex);
1304                break;
1305        }
1306        return notifier_from_errno(err);
1307}
1308
1309static struct notifier_block cpucache_notifier = {
1310        &cpuup_callback, NULL, 0
1311};
1312
1313#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
1314/*
1315 * Drains freelist for a node on each slab cache, used for memory hot-remove.
1316 * Returns -EBUSY if all objects cannot be drained so that the node is not
1317 * removed.
1318 *
1319 * Must hold slab_mutex.
1320 */
1321static int __meminit drain_cache_node_node(int node)
1322{
1323        struct kmem_cache *cachep;
1324        int ret = 0;
1325
1326        list_for_each_entry(cachep, &slab_caches, list) {
1327                struct kmem_cache_node *n;
1328
1329                n = cachep->node[node];
1330                if (!n)
1331                        continue;
1332
1333                drain_freelist(cachep, n, slabs_tofree(cachep, n));
1334
1335                if (!list_empty(&n->slabs_full) ||
1336                    !list_empty(&n->slabs_partial)) {
1337                        ret = -EBUSY;
1338                        break;
1339                }
1340        }
1341        return ret;
1342}
1343
1344static int __meminit slab_memory_callback(struct notifier_block *self,
1345                                        unsigned long action, void *arg)
1346{
1347        struct memory_notify *mnb = arg;
1348        int ret = 0;
1349        int nid;
1350
1351        nid = mnb->status_change_nid;
1352        if (nid < 0)
1353                goto out;
1354
1355        switch (action) {
1356        case MEM_GOING_ONLINE:
1357                mutex_lock(&slab_mutex);
1358                ret = init_cache_node_node(nid);
1359                mutex_unlock(&slab_mutex);
1360                break;
1361        case MEM_GOING_OFFLINE:
1362                mutex_lock(&slab_mutex);
1363                ret = drain_cache_node_node(nid);
1364                mutex_unlock(&slab_mutex);
1365                break;
1366        case MEM_ONLINE:
1367        case MEM_OFFLINE:
1368        case MEM_CANCEL_ONLINE:
1369        case MEM_CANCEL_OFFLINE:
1370                break;
1371        }
1372out:
1373        return notifier_from_errno(ret);
1374}
1375#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */
1376
1377/*
1378 * swap the static kmem_cache_node with kmalloced memory
1379 */
1380static void __init init_list(struct kmem_cache *cachep, struct kmem_cache_node *list,
1381                                int nodeid)
1382{
1383        struct kmem_cache_node *ptr;
1384
1385        ptr = kmalloc_node(sizeof(struct kmem_cache_node), GFP_NOWAIT, nodeid);
1386        BUG_ON(!ptr);
1387
1388        memcpy(ptr, list, sizeof(struct kmem_cache_node));
1389        /*
1390         * Do not assume that spinlocks can be initialized via memcpy:
1391         */
1392        spin_lock_init(&ptr->list_lock);
1393
1394        MAKE_ALL_LISTS(cachep, ptr, nodeid);
1395        cachep->node[nodeid] = ptr;
1396}
1397
1398/*
 1399 * For setting up all the kmem_cache_node structures for a cache whose
 1400 * buffer_size is the same as the size of kmem_cache_node.
1401 */
1402static void __init set_up_node(struct kmem_cache *cachep, int index)
1403{
1404        int node;
1405
1406        for_each_online_node(node) {
1407                cachep->node[node] = &init_kmem_cache_node[index + node];
1408                cachep->node[node]->next_reap = jiffies +
1409                    REAPTIMEOUT_LIST3 +
1410                    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1411        }
1412}
1413
1414/*
 1415 * The memory after the last cpu cache pointer is used for
 1416 * the node pointer.
1417 */
1418static void setup_node_pointer(struct kmem_cache *cachep)
1419{
1420        cachep->node = (struct kmem_cache_node **)&cachep->array[nr_cpu_ids];
1421}
1422
1423/*
 1424 * Initialisation.  Called after the page allocator has been initialised and
1425 * before smp_init().
1426 */
1427void __init kmem_cache_init(void)
1428{
1429        int i;
1430
1431        BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) <
1432                                        sizeof(struct rcu_head));
1433        kmem_cache = &kmem_cache_boot;
1434        setup_node_pointer(kmem_cache);
1435
1436        if (num_possible_nodes() == 1)
1437                use_alien_caches = 0;
1438
1439        for (i = 0; i < NUM_INIT_LISTS; i++)
1440                kmem_cache_node_init(&init_kmem_cache_node[i]);
1441
1442        set_up_node(kmem_cache, CACHE_CACHE);
1443
1444        /*
1445         * Fragmentation resistance on low memory - only use bigger
1446         * page orders on machines with more than 32MB of memory if
1447         * not overridden on the command line.
1448         */
1449        if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT)
1450                slab_max_order = SLAB_MAX_ORDER_HI;
1451
1452        /* Bootstrap is tricky, because several objects are allocated
1453         * from caches that do not exist yet:
1454         * 1) initialize the kmem_cache cache: it contains the struct
1455         *    kmem_cache structures of all caches, except kmem_cache itself:
1456         *    kmem_cache is statically allocated.
1457         *    Initially an __init data area is used for the head array and the
1458         *    kmem_cache_node structures, it's replaced with a kmalloc allocated
1459         *    array at the end of the bootstrap.
1460         * 2) Create the first kmalloc cache.
1461         *    The struct kmem_cache for the new cache is allocated normally.
1462         *    An __init data area is used for the head array.
1463         * 3) Create the remaining kmalloc caches, with minimally sized
1464         *    head arrays.
1465         * 4) Replace the __init data head arrays for kmem_cache and the first
1466         *    kmalloc cache with kmalloc allocated arrays.
1467         * 5) Replace the __init data for kmem_cache_node for kmem_cache and
 1468 *    the other caches with kmalloc allocated memory.
1469         * 6) Resize the head arrays of the kmalloc caches to their final sizes.
1470         */
1471
1472        /* 1) create the kmem_cache */
1473
1474        /*
1475         * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
1476         */
1477        create_boot_cache(kmem_cache, "kmem_cache",
1478                offsetof(struct kmem_cache, array[nr_cpu_ids]) +
1479                                  nr_node_ids * sizeof(struct kmem_cache_node *),
1480                                  SLAB_HWCACHE_ALIGN);
1481        list_add(&kmem_cache->list, &slab_caches);
1482
1483        /* 2+3) create the kmalloc caches */
1484
1485        /*
1486         * Initialize the caches that provide memory for the array cache and the
1487         * kmem_cache_node structures first.  Without this, further allocations will
1488         * bug.
1489         */
1490
1491        kmalloc_caches[INDEX_AC] = create_kmalloc_cache("kmalloc-ac",
1492                                        kmalloc_size(INDEX_AC), ARCH_KMALLOC_FLAGS);
1493
1494        if (INDEX_AC != INDEX_NODE)
1495                kmalloc_caches[INDEX_NODE] =
1496                        create_kmalloc_cache("kmalloc-node",
1497                                kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS);
1498
1499        slab_early_init = 0;
1500
1501        /* 4) Replace the bootstrap head arrays */
1502        {
1503                struct array_cache *ptr;
1504
1505                ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1506
1507                memcpy(ptr, cpu_cache_get(kmem_cache),
1508                       sizeof(struct arraycache_init));
1509                /*
1510                 * Do not assume that spinlocks can be initialized via memcpy:
1511                 */
1512                spin_lock_init(&ptr->lock);
1513
1514                kmem_cache->array[smp_processor_id()] = ptr;
1515
1516                ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1517
1518                BUG_ON(cpu_cache_get(kmalloc_caches[INDEX_AC])
1519                       != &initarray_generic.cache);
1520                memcpy(ptr, cpu_cache_get(kmalloc_caches[INDEX_AC]),
1521                       sizeof(struct arraycache_init));
1522                /*
1523                 * Do not assume that spinlocks can be initialized via memcpy:
1524                 */
1525                spin_lock_init(&ptr->lock);
1526
1527                kmalloc_caches[INDEX_AC]->array[smp_processor_id()] = ptr;
1528        }
1529        /* 5) Replace the bootstrap kmem_cache_node */
1530        {
1531                int nid;
1532
1533                for_each_online_node(nid) {
1534                        init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid);
1535
1536                        init_list(kmalloc_caches[INDEX_AC],
1537                                  &init_kmem_cache_node[SIZE_AC + nid], nid);
1538
1539                        if (INDEX_AC != INDEX_NODE) {
1540                                init_list(kmalloc_caches[INDEX_NODE],
1541                                          &init_kmem_cache_node[SIZE_NODE + nid], nid);
1542                        }
1543                }
1544        }
1545
1546        create_kmalloc_caches(ARCH_KMALLOC_FLAGS);
1547}
1548
1549void __init kmem_cache_init_late(void)
1550{
1551        struct kmem_cache *cachep;
1552
1553        slab_state = UP;
1554
1555        /* 6) resize the head arrays to their final sizes */
1556        mutex_lock(&slab_mutex);
1557        list_for_each_entry(cachep, &slab_caches, list)
1558                if (enable_cpucache(cachep, GFP_NOWAIT))
1559                        BUG();
1560        mutex_unlock(&slab_mutex);
1561
1562        /* Annotate slab for lockdep -- annotate the malloc caches */
1563        init_lock_keys();
1564
1565        /* Done! */
1566        slab_state = FULL;
1567
1568        /*
1569         * Register a cpu startup notifier callback that initializes
1570         * cpu_cache_get for all new cpus
1571         */
1572        register_cpu_notifier(&cpucache_notifier);
1573
1574#ifdef CONFIG_NUMA
1575        /*
1576         * Register a memory hotplug callback that initializes and frees
1577         * per-node kmem_cache_node structures.
1578         */
1579        hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
1580#endif
1581
1582        /*
1583         * The reap timers are started later, with a module init call: That part
1584         * of the kernel is not yet operational.
1585         */
1586}
1587
1588static int __init cpucache_init(void)
1589{
1590        int cpu;
1591
1592        /*
1593         * Register the timers that return unneeded pages to the page allocator
1594         */
1595        for_each_online_cpu(cpu)
1596                start_cpu_timer(cpu);
1597
1598        /* Done! */
1599        slab_state = FULL;
1600        return 0;
1601}
1602__initcall(cpucache_init);
1603
1604static noinline void
1605slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
1606{
1607        struct kmem_cache_node *n;
1608        struct page *page;
1609        unsigned long flags;
1610        int node;
1611
1612        printk(KERN_WARNING
1613                "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n",
1614                nodeid, gfpflags);
1615        printk(KERN_WARNING "  cache: %s, object size: %d, order: %d\n",
1616                cachep->name, cachep->size, cachep->gfporder);
1617
1618        for_each_online_node(node) {
1619                unsigned long active_objs = 0, num_objs = 0, free_objects = 0;
1620                unsigned long active_slabs = 0, num_slabs = 0;
1621
1622                n = cachep->node[node];
1623                if (!n)
1624                        continue;
1625
1626                spin_lock_irqsave(&n->list_lock, flags);
1627                list_for_each_entry(page, &n->slabs_full, lru) {
1628                        active_objs += cachep->num;
1629                        active_slabs++;
1630                }
1631                list_for_each_entry(page, &n->slabs_partial, lru) {
1632                        active_objs += page->active;
1633                        active_slabs++;
1634                }
1635                list_for_each_entry(page, &n->slabs_free, lru)
1636                        num_slabs++;
1637
1638                free_objects += n->free_objects;
1639                spin_unlock_irqrestore(&n->list_lock, flags);
1640
1641                num_slabs += active_slabs;
1642                num_objs = num_slabs * cachep->num;
1643                printk(KERN_WARNING
1644                        "  node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
1645                        node, active_slabs, num_slabs, active_objs, num_objs,
1646                        free_objects);
1647        }
1648}
1649
1650/*
1651 * Interface to system's page allocator. No need to hold the cache-lock.
1652 *
1653 * If we requested dmaable memory, we will get it. Even if we
1654 * did not request dmaable memory, we might get it, but that
1655 * would be relatively rare and ignorable.
1656 */
1657static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
1658                                                                int nodeid)
1659{
1660        struct page *page;
1661        int nr_pages;
1662
1663        flags |= cachep->allocflags;
1664        if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1665                flags |= __GFP_RECLAIMABLE;
1666
1667        page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
1668        if (!page) {
1669                if (!(flags & __GFP_NOWARN) && printk_ratelimit())
1670                        slab_out_of_memory(cachep, flags, nodeid);
1671                return NULL;
1672        }
1673
1674        /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
1675        if (unlikely(page->pfmemalloc))
1676                pfmemalloc_active = true;
1677
1678        nr_pages = (1 << cachep->gfporder);
1679        /* Get colour for the slab, and calculate the next value. */
1680                add_zone_page_state(page_zone(page),
1681                        NR_SLAB_RECLAIMABLE, nr_pages);
1682        else
1683                add_zone_page_state(page_zone(page),
1684                        NR_SLAB_UNRECLAIMABLE, nr_pages);
1685        __SetPageSlab(page);
1686        if (page->pfmemalloc)
1687                SetPageSlabPfmemalloc(page);
1688        memcg_bind_pages(cachep, cachep->gfporder);
1689
1690        if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
1691                kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
1692
1693                if (cachep->ctor)
1694                        kmemcheck_mark_uninitialized_pages(page, nr_pages);
1695                else
1696                        kmemcheck_mark_unallocated_pages(page, nr_pages);
1697        }
1698
1699        return page;
1700}
1701
1702/*
1703 * Interface to system's page release.
1704 */
1705static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
1706{
1707        const unsigned long nr_freed = (1 << cachep->gfporder);
1708
1709        kmemcheck_free_shadow(page, cachep->gfporder);
1710
1711        if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1712                sub_zone_page_state(page_zone(page),
1713                                NR_SLAB_RECLAIMABLE, nr_freed);
1714        else
1715                sub_zone_page_state(page_zone(page),
1716                                NR_SLAB_UNRECLAIMABLE, nr_freed);
1717
1718        BUG_ON(!PageSlab(page));
1719        __ClearPageSlabPfmemalloc(page);
1720        __ClearPageSlab(page);
1721        page_mapcount_reset(page);
1722        page->mapping = NULL;
1723
1724        memcg_release_pages(cachep, cachep->gfporder);
1725        if (current->reclaim_state)
1726                current->reclaim_state->reclaimed_slab += nr_freed;
1727        __free_memcg_kmem_pages(page, cachep->gfporder);
1728}
1729
1730static void kmem_rcu_free(struct rcu_head *head)
1731{
1732        struct kmem_cache *cachep;
1733        struct page *page;
1734
1735        page = container_of(head, struct page, rcu_head);
1736        cachep = page->slab_cache;
1737
1738        kmem_freepages(cachep, page);
1739}
1740
1741#if DEBUG
1742
1743#ifdef CONFIG_DEBUG_PAGEALLOC
1744static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
1745                            unsigned long caller)
1746{
1747        int size = cachep->object_size;
1748
1749        addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)];
1750
1751        if (size < 5 * sizeof(unsigned long))
1752                return;
1753
1754        *addr++ = 0x12345678;
1755        *addr++ = caller;
1756        *addr++ = smp_processor_id();
1757        size -= 3 * sizeof(unsigned long);
1758        {
1759                unsigned long *sptr = &caller;
1760                unsigned long svalue;
1761
1762                while (!kstack_end(sptr)) {
1763                        svalue = *sptr++;
1764                        if (kernel_text_address(svalue)) {
1765                                *addr++ = svalue;
1766                                size -= sizeof(unsigned long);
1767                                if (size <= sizeof(unsigned long))
1768                                        break;
1769                        }
1770                }
1771
1772        }
1773        *addr++ = 0x87654321;
1774}
1775#endif
1776
1777static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
1778{
1779        int size = cachep->object_size;
1780        addr = &((char *)addr)[obj_offset(cachep)];
1781
1782        memset(addr, val, size);
1783        *(unsigned char *)(addr + size - 1) = POISON_END;
1784}
1785
1786static void dump_line(char *data, int offset, int limit)
1787{
1788        int i;
1789        unsigned char error = 0;
1790        int bad_count = 0;
1791
1792        printk(KERN_ERR "%03x: ", offset);
1793        for (i = 0; i < limit; i++) {
1794                if (data[offset + i] != POISON_FREE) {
1795                        error = data[offset + i];
1796                        bad_count++;
1797                }
1798        }
1799        print_hex_dump(KERN_CONT, "", 0, 16, 1,
1800                        &data[offset], limit, 1);
1801
1802        if (bad_count == 1) {
1803                error ^= POISON_FREE;
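                /*
                 * error now holds only the bits that differ from the
                 * expected poison byte; a power of two, i.e.
                 * (x & (x - 1)) == 0, means exactly one bit flipped.
                 */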
1804                if (!(error & (error - 1))) {
1805                        printk(KERN_ERR "Single bit error detected. Probably "
1806                                        "bad RAM.\n");
1807#ifdef CONFIG_X86
1808                        printk(KERN_ERR "Run memtest86+ or a similar memory "
1809                                        "test tool.\n");
1810#else
1811                        printk(KERN_ERR "Run a memory test tool.\n");
1812#endif
1813                }
1814        }
1815}
1816#endif
1817
1818#if DEBUG
1819
1820static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
1821{
1822        int i, size;
1823        char *realobj;
1824
1825        if (cachep->flags & SLAB_RED_ZONE) {
1826                printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n",
1827                        *dbg_redzone1(cachep, objp),
1828                        *dbg_redzone2(cachep, objp));
1829        }
1830
1831        if (cachep->flags & SLAB_STORE_USER) {
1832                printk(KERN_ERR "Last user: [<%p>](%pSR)\n",
1833                       *dbg_userword(cachep, objp),
1834                       *dbg_userword(cachep, objp));
1835        }
1836        realobj = (char *)objp + obj_offset(cachep);
1837        size = cachep->object_size;
1838        for (i = 0; i < size && lines; i += 16, lines--) {
1839                int limit;
1840                limit = 16;
1841                if (i + limit > size)
1842                        limit = size - i;
1843                dump_line(realobj, i, limit);
1844        }
1845}
1846
1847static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1848{
1849        char *realobj;
1850        int size, i;
1851        int lines = 0;
1852
1853        realobj = (char *)objp + obj_offset(cachep);
1854        size = cachep->object_size;
1855
1856        for (i = 0; i < size; i++) {
1857                char exp = POISON_FREE;
1858                if (i == size - 1)
1859                        exp = POISON_END;
1860                if (realobj[i] != exp) {
1861                        int limit;
1862                        /* Mismatch ! */
1863                        /* Print header */
1864                        if (lines == 0) {
1865                                printk(KERN_ERR
1866                                        "Slab corruption (%s): %s start=%p, len=%d\n",
1867                                        print_tainted(), cachep->name, realobj, size);
1868                                print_objinfo(cachep, objp, 0);
1869                        }
1870                        /* Hexdump the affected line */
1871                        i = (i / 16) * 16;
1872                        limit = 16;
1873                        if (i + limit > size)
1874                                limit = size - i;
1875                        dump_line(realobj, i, limit);
1876                        i += 16;
1877                        lines++;
1878                        /* Limit to 5 lines */
1879                        if (lines > 5)
1880                                break;
1881                }
1882        }
1883        if (lines != 0) {
1884                /* Print some data about the neighboring objects, if they
1885                 * exist:
1886                 */
1887                struct page *page = virt_to_head_page(objp);
1888                unsigned int objnr;
1889
1890                objnr = obj_to_index(cachep, page, objp);
1891                if (objnr) {
1892                        objp = index_to_obj(cachep, page, objnr - 1);
1893                        realobj = (char *)objp + obj_offset(cachep);
1894                        printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
1895                               realobj, size);
1896                        print_objinfo(cachep, objp, 2);
1897                }
1898                if (objnr + 1 < cachep->num) {
1899                        objp = index_to_obj(cachep, page, objnr + 1);
1900                        realobj = (char *)objp + obj_offset(cachep);
1901                        printk(KERN_ERR "Next obj: start=%p, len=%d\n",
1902                               realobj, size);
1903                        print_objinfo(cachep, objp, 2);
1904                }
1905        }
1906}
1907#endif
1908
1909#if DEBUG
1910static void slab_destroy_debugcheck(struct kmem_cache *cachep,
1911                                                struct page *page)
1912{
1913        int i;
1914        for (i = 0; i < cachep->num; i++) {
1915                void *objp = index_to_obj(cachep, page, i);
1916
1917                if (cachep->flags & SLAB_POISON) {
1918#ifdef CONFIG_DEBUG_PAGEALLOC
1919                        if (cachep->size % PAGE_SIZE == 0 &&
1920                                        OFF_SLAB(cachep))
1921                                kernel_map_pages(virt_to_page(objp),
1922                                        cachep->size / PAGE_SIZE, 1);
1923                        else
1924                                check_poison_obj(cachep, objp);
1925#else
1926                        check_poison_obj(cachep, objp);
1927#endif
1928                }
1929                if (cachep->flags & SLAB_RED_ZONE) {
1930                        if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
1931                                slab_error(cachep, "start of a freed object "
1932                                           "was overwritten");
1933                        if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
1934                                slab_error(cachep, "end of a freed object "
1935                                           "was overwritten");
1936                }
1937        }
1938}
1939#else
1940static void slab_destroy_debugcheck(struct kmem_cache *cachep,
1941                                                struct page *page)
1942{
1943}
1944#endif
1945
1946/**
1947 * slab_destroy - destroy and release all objects in a slab
1948 * @cachep: cache pointer being destroyed
1949 * @page: page struct of the slab being destroyed
1950 *
1951 * Destroy all the objs in a slab, and release the mem back to the system.
1952 * Before calling the slab must have been unlinked from the cache.  The
1953 * cache-lock is not held/needed.
1954 */
1955static void slab_destroy(struct kmem_cache *cachep, struct page *page)
1956{
1957        void *freelist;
1958
1959        freelist = page->freelist;
1960        slab_destroy_debugcheck(cachep, page);
1961        if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
1962                struct rcu_head *head;
1963
1964                /*
1965                 * RCU free overloads the RCU head over the LRU.
1966                 * slab_page has been overloaded over the LRU,
1967                 * but it is no longer used from this point on,
1968                 * so we can use it safely.
1969                 */
1970                head = (void *)&page->rcu_head;
1971                call_rcu(head, kmem_rcu_free);
1972
1973        } else {
1974                kmem_freepages(cachep, page);
1975        }
1976
1977        /*
1978         * From now on, we don't use freelist
1979         * although actual page can be freed in rcu context
1980         */
1981        if (OFF_SLAB(cachep))
1982                kmem_cache_free(cachep->freelist_cache, freelist);
1983}
1984
1985/**
1986 * calculate_slab_order - calculate size (page order) of slabs
1987 * @cachep: pointer to the cache that is being created
1988 * @size: size of objects to be created in this cache.
1989 * @align: required alignment for the objects.
1990 * @flags: slab allocation flags
1991 *
1992 * Also calculates the number of objects per slab.
1993 *
1994 * This could be made much more intelligent.  For now, try to avoid using
1995 * high order pages for slabs.  When the gfp() functions are more friendly
1996 * towards high-order requests, this should be changed.
1997 */
1998static size_t calculate_slab_order(struct kmem_cache *cachep,
1999                        size_t size, size_t align, unsigned long flags)
2000{
2001        unsigned long offslab_limit;
2002        size_t left_over = 0;
2003        int gfporder;
2004
2005        for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
2006                unsigned int num;
2007                size_t remainder;
2008
2009                cache_estimate(gfporder, size, align, flags, &remainder, &num);
2010                if (!num)
2011                        continue;
2012
2013                if (flags & CFLGS_OFF_SLAB) {
2014                        /*
2015                         * Max number of objs-per-slab for caches which
2016                         * use off-slab slabs. Needed to avoid a possible
2017                         * looping condition in cache_grow().
2018                         */
2019                        offslab_limit = size;
2020                        offslab_limit /= sizeof(unsigned int);
2021
2022                        if (num > offslab_limit)
2023                                break;
2024                }
2025
2026                /* Found something acceptable - save it away */
2027                cachep->num = num;
2028                cachep->gfporder = gfporder;
2029                left_over = remainder;
2030
2031                /*
2032                 * A VFS-reclaimable slab tends to have most allocations
2033                 * as GFP_NOFS and we really don't want to have to be allocating
2034                 * higher-order pages when we are unable to shrink dcache.
2035                 */
2036                if (flags & SLAB_RECLAIM_ACCOUNT)
2037                        break;
2038
2039                /*
2040                 * Large number of objects is good, but very large slabs are
2041                 * currently bad for the gfp()s.
2042                 */
2043                if (gfporder >= slab_max_order)
2044                        break;
2045
2046                /*
2047                 * Acceptable internal fragmentation (at most 1/8th of the slab wasted)?
2048                 */
2049                if (left_over * 8 <= (PAGE_SIZE << gfporder))
2050                        break;
2051        }
2052        return left_over;
2053}
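
/*
 * Rough worked example (illustrative only, ignoring alignment of the
 * on-slab freelist): with order-0 (4K) slabs and 256-byte objects, each
 * object also needs a freelist index of sizeof(unsigned int), so about
 * 4096 / 260 = 15 objects fit, leaving roughly 196 bytes unused.  Since
 * 196 * 8 <= 4096, the fragmentation check above is already satisfied at
 * gfporder 0 and no higher order is tried.
 */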
2054
2055static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2056{
2057        if (slab_state >= FULL)
2058                return enable_cpucache(cachep, gfp);
2059
2060        if (slab_state == DOWN) {
2061                /*
2062                 * Note: Creation of the first cache (kmem_cache).
2063                 * The node setup is taken care of by the caller
2064                 * of __kmem_cache_create().
2065                 */
2066                cachep->array[smp_processor_id()] = &initarray_generic.cache;
2067                slab_state = PARTIAL;
2068        } else if (slab_state == PARTIAL) {
2069                /*
2070                 * Note: the second kmem_cache_create must create the cache
2071                 * that's used by kmalloc(24), otherwise the creation of
2072                 * further caches will BUG().
2073                 */
2074                cachep->array[smp_processor_id()] = &initarray_generic.cache;
2075
2076                /*
2077                 * If the cache that's used by kmalloc(sizeof(kmem_cache_node)) is
2078                 * the second cache, then we need to set up all of its kmem_cache_node
2079                 * structures, otherwise the creation of further caches will BUG().
2080                 */
2081                set_up_node(cachep, SIZE_AC);
2082                if (INDEX_AC == INDEX_NODE)
2083                        slab_state = PARTIAL_NODE;
2084                else
2085                        slab_state = PARTIAL_ARRAYCACHE;
2086        } else {
2087                /* Remaining boot caches */
2088                cachep->array[smp_processor_id()] =
2089                        kmalloc(sizeof(struct arraycache_init), gfp);
2090
2091                if (slab_state == PARTIAL_ARRAYCACHE) {
2092                        set_up_node(cachep, SIZE_NODE);
2093                        slab_state = PARTIAL_NODE;
2094                } else {
2095                        int node;
2096                        for_each_online_node(node) {
2097                                cachep->node[node] =
2098                                    kmalloc_node(sizeof(struct kmem_cache_node),
2099                                                gfp, node);
2100                                BUG_ON(!cachep->node[node]);
2101                                kmem_cache_node_init(cachep->node[node]);
2102                        }
2103                }
2104        }
2105        cachep->node[numa_mem_id()]->next_reap =
2106                        jiffies + REAPTIMEOUT_LIST3 +
2107                        ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
2108
2109        cpu_cache_get(cachep)->avail = 0;
2110        cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
2111        cpu_cache_get(cachep)->batchcount = 1;
2112        cpu_cache_get(cachep)->touched = 0;
2113        cachep->batchcount = 1;
2114        cachep->limit = BOOT_CPUCACHE_ENTRIES;
2115        return 0;
2116}
2117
2118/**
2119 * __kmem_cache_create - Create a cache.
2120 * @cachep: cache management descriptor
2121 * @flags: SLAB flags
2122 *
2123 * Returns zero on success, nonzero on failure.
2124 * Cannot be called within an interrupt, but can be interrupted.
2125 * The @ctor is run when new pages are allocated by the cache.
2126 *
2127 * The flags are
2128 *
2129 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
2130 * to catch references to uninitialised memory.
2131 *
2132 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
2133 * for buffer overruns.
2134 *
2135 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
2136 * cacheline.  This can be beneficial if you're counting cycles as closely
2137 * as davem.
2138 */
2139int
2140__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2141{
2142        size_t left_over, freelist_size, ralign;
2143        gfp_t gfp;
2144        int err;
2145        size_t size = cachep->size;
2146
2147#if DEBUG
2148#if FORCED_DEBUG
2149        /*
2150         * Enable redzoning and last user accounting, except for caches with
2151         * large objects, if the increased size would increase the object size
2152         * above the next power of two: caches with object sizes just above a
2153         * power of two have a significant amount of internal fragmentation.
2154         */
2155        if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN +
2156                                                2 * sizeof(unsigned long long)))
2157                flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
2158        if (!(flags & SLAB_DESTROY_BY_RCU))
2159                flags |= SLAB_POISON;
2160#endif
2161        if (flags & SLAB_DESTROY_BY_RCU)
2162                BUG_ON(flags & SLAB_POISON);
2163#endif
2164
2165        /*
2166         * Check that size is in terms of words.  This is needed to avoid
2167         * unaligned accesses for some archs when redzoning is used, and makes
2168         * sure any on-slab bufctl's are also correctly aligned.
2169         */
2170        if (size & (BYTES_PER_WORD - 1)) {
2171                size += (BYTES_PER_WORD - 1);
2172                size &= ~(BYTES_PER_WORD - 1);
2173        }
2174
2175        /*
2176         * Redzoning and user store require word alignment or possibly larger.
2177         * Note this will be overridden by architecture or caller mandated
2178         * alignment if either is greater than BYTES_PER_WORD.
2179         */
2180        if (flags & SLAB_STORE_USER)
2181                ralign = BYTES_PER_WORD;
2182
2183        if (flags & SLAB_RED_ZONE) {
2184                ralign = REDZONE_ALIGN;
2185                /* If redzoning, ensure that the second redzone is suitably
2186                 * aligned, by adjusting the object size accordingly. */
2187                size += REDZONE_ALIGN - 1;
2188                size &= ~(REDZONE_ALIGN - 1);
2189        }
2190
2191        /* 3) caller mandated alignment */
2192        if (ralign < cachep->align) {
2193                ralign = cachep->align;
2194        }
2195        /* disable debug if necessary */
2196        if (ralign > __alignof__(unsigned long long))
2197                flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2198        /*
2199         * 4) Store it.
2200         */
2201        cachep->align = ralign;
2202
2203        if (slab_is_available())
2204                gfp = GFP_KERNEL;
2205        else
2206                gfp = GFP_NOWAIT;
2207
2208        setup_node_pointer(cachep);
2209#if DEBUG
2210
2211        /*
2212         * Both debugging options require word-alignment which is calculated
2213         * into align above.
2214         */
2215        if (flags & SLAB_RED_ZONE) {
2216                /* add space for red zone words */
2217                cachep->obj_offset += sizeof(unsigned long long);
2218                size += 2 * sizeof(unsigned long long);
2219        }
2220        if (flags & SLAB_STORE_USER) {
2221                /* user store requires one word storage behind the end of
2222                 * the real object. But if the second red zone needs to be
2223                 * aligned to 64 bits, we must allow that much space.
2224                 */
2225                if (flags & SLAB_RED_ZONE)
2226                        size += REDZONE_ALIGN;
2227                else
2228                        size += BYTES_PER_WORD;
2229        }
2230#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
2231        if (size >= kmalloc_size(INDEX_NODE + 1)
2232            && cachep->object_size > cache_line_size()
2233            && ALIGN(size, cachep->align) < PAGE_SIZE) {
2234                cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align);
2235                size = PAGE_SIZE;
2236        }
2237#endif
2238#endif
2239
2240        /*
2241         * Determine if the slab management is 'on' or 'off' slab.
2242         * (bootstrapping cannot cope with offslab caches so don't do
2243         * it too early on. Always use on-slab management when
2244         * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
2245         */
2246        if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
2247            !(flags & SLAB_NOLEAKTRACE))
2248                /*
2249                 * Size is large, assume best to place the slab management obj
2250                 * off-slab (should allow better packing of objs).
2251                 */
2252                flags |= CFLGS_OFF_SLAB;
2253
2254        size = ALIGN(size, cachep->align);
2255
2256        left_over = calculate_slab_order(cachep, size, cachep->align, flags);
2257
2258        if (!cachep->num)
2259                return -E2BIG;
2260
2261        freelist_size =
2262                ALIGN(cachep->num * sizeof(unsigned int), cachep->align);
2263
2264        /*
2265         * If the slab has been placed off-slab, and we have enough space then
2266         * move it on-slab. This is at the expense of any extra colouring.
2267         */
2268        if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) {
2269                flags &= ~CFLGS_OFF_SLAB;
2270                left_over -= freelist_size;
2271        }
2272
2273        if (flags & CFLGS_OFF_SLAB) {
2274                /* really off slab. No need for manual alignment */
2275                freelist_size = cachep->num * sizeof(unsigned int);
2276
2277#ifdef CONFIG_PAGE_POISONING
2278                /* If we're going to use the generic kernel_map_pages()
2279                 * poisoning, then it's going to smash the contents of
2280                 * the redzone and userword anyhow, so switch them off.
2281                 */
2282                if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
2283                        flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2284#endif
2285        }
2286
2287        cachep->colour_off = cache_line_size();
2288        /* Offset must be a multiple of the alignment. */
2289        if (cachep->colour_off < cachep->align)
2290                cachep->colour_off = cachep->align;
2291        cachep->colour = left_over / cachep->colour_off;
2292        cachep->freelist_size = freelist_size;
2293        cachep->flags = flags;
2294        cachep->allocflags = __GFP_COMP;
2295        if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
2296                cachep->allocflags |= GFP_DMA;
2297        cachep->size = size;
2298        cachep->reciprocal_buffer_size = reciprocal_value(size);
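        /*
         * reciprocal_buffer_size lets obj_to_index() turn the division by
         * the object size into a multiply and shift via reciprocal_divide().
         */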
2299
2300        if (flags & CFLGS_OFF_SLAB) {
2301                cachep->freelist_cache = kmalloc_slab(freelist_size, 0u);
2302                /*
2303                 * This is a possibility for one of the malloc_sizes caches.
2304                 * But since we go off slab only for object size greater than
2305                 * PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
2306                 * this should not happen at all.
2307                 * But leave a BUG_ON for some lucky dude.
2308                 */
2309                BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache));
2310        }
2311
2312        err = setup_cpu_cache(cachep, gfp);
2313        if (err) {
2314                __kmem_cache_shutdown(cachep);
2315                return err;
2316        }
2317
2318        if (flags & SLAB_DEBUG_OBJECTS) {
2319                /*
2320                 * Would deadlock through slab_destroy()->call_rcu()->
2321                 * debug_object_activate()->kmem_cache_alloc().
2322                 */
2323                WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU);
2324
2325                slab_set_debugobj_lock_classes(cachep);
2326        } else if (!OFF_SLAB(cachep) && !(flags & SLAB_DESTROY_BY_RCU))
2327                on_slab_lock_classes(cachep);
2328
2329        return 0;
2330}
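
/*
 * Illustrative use of the flags documented above (a sketch, not part of
 * this file; "struct foo" and "foo_cache" are made-up names).  Callers go
 * through the public kmem_cache_create() wrapper, which ends up here in
 * __kmem_cache_create():
 *
 *	struct kmem_cache *foo_cache;
 *
 *	foo_cache = kmem_cache_create("foo_cache", sizeof(struct foo), 0,
 *				      SLAB_HWCACHE_ALIGN | SLAB_RED_ZONE, NULL);
 *	if (!foo_cache)
 *		return -ENOMEM;
 */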
2331
2332#if DEBUG
2333static void check_irq_off(void)
2334{
2335        BUG_ON(!irqs_disabled());
2336}
2337
2338static void check_irq_on(void)
2339{
2340        BUG_ON(irqs_disabled());
2341}
2342
2343static void check_spinlock_acquired(struct kmem_cache *cachep)
2344{
2345#ifdef CONFIG_SMP
2346        check_irq_off();
2347        assert_spin_locked(&cachep->node[numa_mem_id()]->list_lock);
2348#endif
2349}
2350
2351static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
2352{
2353#ifdef CONFIG_SMP
2354        check_irq_off();
2355        assert_spin_locked(&cachep->node[node]->list_lock);
2356#endif
2357}
2358
2359#else
2360#define check_irq_off() do { } while(0)
2361#define check_irq_on()  do { } while(0)
2362#define check_spinlock_acquired(x) do { } while(0)
2363#define check_spinlock_acquired_node(x, y) do { } while(0)
2364#endif
2365
2366static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
2367                        struct array_cache *ac,
2368                        int force, int node);
2369
2370static void do_drain(void *arg)
2371{
2372        struct kmem_cache *cachep = arg;
2373        struct array_cache *ac;
2374        int node = numa_mem_id();
2375
2376        check_irq_off();
2377        ac = cpu_cache_get(cachep);
2378        spin_lock(&cachep->node[node]->list_lock);
2379        free_block(cachep, ac->entry, ac->avail, node);
2380        spin_unlock(&cachep->node[node]->list_lock);
2381        ac->avail = 0;
2382}
2383
2384static void drain_cpu_caches(struct kmem_cache *cachep)
2385{
2386        struct kmem_cache_node *n;
2387        int node;
2388
2389        on_each_cpu(do_drain, cachep, 1);
2390        check_irq_on();
2391        for_each_online_node(node) {
2392                n = cachep->node[node];
2393                if (n && n->alien)
2394                        drain_alien_cache(cachep, n->alien);
2395        }
2396
2397        for_each_online_node(node) {
2398                n = cachep->node[node];
2399                if (n)
2400                        drain_array(cachep, n, n->shared, 1, node);
2401        }
2402}
2403
2404/*
2405 * Remove slabs from the list of free slabs.
2406 * Specify the number of slabs to drain in tofree.
2407 *
2408 * Returns the actual number of slabs released.
2409 */
2410static int drain_freelist(struct kmem_cache *cache,
2411                        struct kmem_cache_node *n, int tofree)
2412{
2413        struct list_head *p;
2414        int nr_freed;
2415        struct page *page;
2416
2417        nr_freed = 0;
2418        while (nr_freed < tofree && !list_empty(&n->slabs_free)) {
2419
2420                spin_lock_irq(&n->list_lock);
2421                p = n->slabs_free.prev;
2422                if (p == &n->slabs_free) {
2423                        spin_unlock_irq(&n->list_lock);
2424                        goto out;
2425                }
2426
2427                page = list_entry(p, struct page, lru);
2428#if DEBUG
2429                BUG_ON(page->active);
2430#endif
2431                list_del(&page->lru);
2432                /*
2433                 * Safe to drop the lock. The slab is no longer linked
2434                 * to the cache.
2435                 */
2436                n->free_objects -= cache->num;
2437                spin_unlock_irq(&n->list_lock);
2438                slab_destroy(cache, page);
2439                nr_freed++;
2440        }
2441out:
2442        return nr_freed;
2443}
2444
2445/* Called with slab_mutex held to protect against cpu hotplug */
2446static int __cache_shrink(struct kmem_cache *cachep)
2447{
2448        int ret = 0, i = 0;
2449        struct kmem_cache_node *n;
2450
2451        drain_cpu_caches(cachep);
2452
2453        check_irq_on();
2454        for_each_online_node(i) {
2455                n = cachep->node[i];
2456                if (!n)
2457                        continue;
2458
2459                drain_freelist(cachep, n, slabs_tofree(cachep, n));
2460
2461                ret += !list_empty(&n->slabs_full) ||
2462                        !list_empty(&n->slabs_partial);
2463        }
2464        return (ret ? 1 : 0);
2465}
2466
2467/**
2468 * kmem_cache_shrink - Shrink a cache.
2469 * @cachep: The cache to shrink.
2470 *
2471 * Releases as many slabs as possible for a cache.
2472 * To help debugging, a zero exit status indicates all slabs were released.
2473 */
2474int kmem_cache_shrink(struct kmem_cache *cachep)
2475{
2476        int ret;
2477        BUG_ON(!cachep || in_interrupt());
2478
2479        get_online_cpus();
2480        mutex_lock(&slab_mutex);
2481        ret = __cache_shrink(cachep);
2482        mutex_unlock(&slab_mutex);
2483        put_online_cpus();
2484        return ret;
2485}
2486EXPORT_SYMBOL(kmem_cache_shrink);
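
/*
 * Illustrative usage (a sketch; "foo_cache" is a made-up cache created
 * elsewhere): after releasing a burst of objects, a subsystem can return
 * the now-empty slabs to the page allocator:
 *
 *	if (kmem_cache_shrink(foo_cache))
 *		pr_debug("foo_cache still holds allocated objects\n");
 */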
2487
2488int __kmem_cache_shutdown(struct kmem_cache *cachep)
2489{
2490        int i;
2491        struct kmem_cache_node *n;
2492        int rc = __cache_shrink(cachep);
2493
2494        if (rc)
2495                return rc;
2496
2497        for_each_online_cpu(i)
2498            kfree(cachep->array[i]);
2499
2500        /* NUMA: free the node structures */
2501        for_each_online_node(i) {
2502                n = cachep->node[i];
2503                if (n) {
2504                        kfree(n->shared);
2505                        free_alien_cache(n->alien);
2506                        kfree(n);
2507                }
2508        }
2509        return 0;
2510}
2511
2512/*
2513 * Get the memory for a slab management obj.
2514 * For a slab cache when the slab descriptor is off-slab, slab descriptors
2515 * always come from malloc_sizes caches.  The slab descriptor cannot
2516 * come from the same cache which is getting created because,
2517 * when we are searching for an appropriate cache for these
2518 * descriptors in kmem_cache_create, we search through the malloc_sizes array.
2519 * If we are creating a malloc_sizes cache here it would not be visible to
2520 * kmem_find_general_cachep till the initialization is complete.
2521 * Hence we cannot have freelist_cache same as the original cache.
2522 */
2523static void *alloc_slabmgmt(struct kmem_cache *cachep,
2524                                   struct page *page, int colour_off,
2525                                   gfp_t local_flags, int nodeid)
2526{
2527        void *freelist;
2528        void *addr = page_address(page);
2529
2530        if (OFF_SLAB(cachep)) {
2531                /* Slab management obj is off-slab. */
2532                freelist = kmem_cache_alloc_node(cachep->freelist_cache,
2533                                              local_flags, nodeid);
2534                if (!freelist)
2535                        return NULL;
2536        } else {
2537                freelist = addr + colour_off;
2538                colour_off += cachep->freelist_size;
2539        }
2540        page->active = 0;
2541        page->s_mem = addr + colour_off;
2542        return freelist;
2543}
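
/*
 * Resulting slab layout with on-slab management:
 *
 *	| colour padding | freelist (num * sizeof(unsigned int)) | objects ... |
 *	^ page_address(page)                                      ^ page->s_mem
 *
 * With CFLGS_OFF_SLAB the freelist is allocated from freelist_cache instead
 * and the objects start right after the colour padding.
 */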
2544
2545static inline unsigned int *slab_freelist(struct page *page)
2546{
2547        return (unsigned int *)(page->freelist);
2548}
2549
2550static void cache_init_objs(struct kmem_cache *cachep,
2551                            struct page *page)
2552{
2553        int i;
2554
2555        for (i = 0; i < cachep->num; i++) {
2556                void *objp = index_to_obj(cachep, page, i);
2557#if DEBUG
2558                /* need to poison the objs? */
2559                if (cachep->flags & SLAB_POISON)
2560                        poison_obj(cachep, objp, POISON_FREE);
2561                if (cachep->flags & SLAB_STORE_USER)
2562                        *dbg_userword(cachep, objp) = NULL;
2563
2564                if (cachep->flags & SLAB_RED_ZONE) {
2565                        *dbg_redzone1(cachep, objp) = RED_INACTIVE;
2566                        *dbg_redzone2(cachep, objp) = RED_INACTIVE;
2567                }
2568                /*
2569                 * Constructors are not allowed to allocate memory from the same
2570                 * cache which they are a constructor for.  Otherwise, deadlock.
2571                 * They must also be threaded.
2572                 */
2573                if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2574                        cachep->ctor(objp + obj_offset(cachep));
2575
2576                if (cachep->flags & SLAB_RED_ZONE) {
2577                        if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
2578                                slab_error(cachep, "constructor overwrote the"
2579                                           " end of an object");
2580                        if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
2581                                slab_error(cachep, "constructor overwrote the"
2582                                           " start of an object");
2583                }
2584                if ((cachep->size % PAGE_SIZE) == 0 &&
2585                            OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
2586                        kernel_map_pages(virt_to_page(objp),
2587                                         cachep->size / PAGE_SIZE, 0);
2588#else
2589                if (cachep->ctor)
2590                        cachep->ctor(objp);
2591#endif
2592                slab_freelist(page)[i] = i;
2593        }
2594}
2595
2596static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
2597{
2598        if (CONFIG_ZONE_DMA_FLAG) {
2599                if (flags & GFP_DMA)
2600                        BUG_ON(!(cachep->allocflags & GFP_DMA));
2601                else
2602                        BUG_ON(cachep->allocflags & GFP_DMA);
2603        }
2604}
2605
2606static void *slab_get_obj(struct kmem_cache *cachep, struct page *page,
2607                                int nodeid)
2608{
2609        void *objp;
2610
2611        objp = index_to_obj(cachep, page, slab_freelist(page)[page->active]);
2612        page->active++;
2613#if DEBUG
2614        WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid);
2615#endif
2616
2617        return objp;
2618}
2619
2620static void slab_put_obj(struct kmem_cache *cachep, struct page *page,
2621                                void *objp, int nodeid)
2622{
2623        unsigned int objnr = obj_to_index(cachep, page, objp);
2624#if DEBUG
2625        unsigned int i;
2626
2627        /* Verify that the slab belongs to the intended node */
2628        WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid);
2629
2630        /* Verify double free bug */
2631        for (i = page->active; i < cachep->num; i++) {
2632                if (slab_freelist(page)[i] == objnr) {
2633                        printk(KERN_ERR "slab: double free detected in cache "
2634                                        "'%s', objp %p\n", cachep->name, objp);
2635                        BUG();
2636                }
2637        }
2638#endif
2639        page->active--;
2640        slab_freelist(page)[page->active] = objnr;
2641}
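
/*
 * The freelist acts as a stack of free object indices: entries at
 * positions >= page->active are the indices of the currently free objects.
 * slab_get_obj() pops the entry at page->active and increments it;
 * slab_put_obj() decrements page->active and stores the freed object's
 * index back at that position.
 */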
2642
2643/*
2644 * Map pages beginning at addr to the given cache and slab. This is required
2645 * for the slab allocator to be able to lookup the cache and slab of a
2646 * virtual address for kfree, ksize, and slab debugging.
2647 */
2648static void slab_map_pages(struct kmem_cache *cache, struct page *page,
2649                           void *freelist)
2650{
2651        page->slab_cache = cache;
2652        page->freelist = freelist;
2653}
2654
2655/*
2656 * Grow (by 1) the number of slabs within a cache.  This is called by
2657 * kmem_cache_alloc() when there are no active objs left in a cache.
2658 */
2659static int cache_grow(struct kmem_cache *cachep,
2660                gfp_t flags, int nodeid, struct page *page)
2661{
2662        void *freelist;
2663        size_t offset;
2664        gfp_t local_flags;
2665        struct kmem_cache_node *n;
2666
2667        /*
2668         * Be lazy and only check for valid flags here,  keeping it out of the
2669         * critical path in kmem_cache_alloc().
2670         */
2671        BUG_ON(flags & GFP_SLAB_BUG_MASK);
2672        local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
2673
2674        /* Take the node list lock to change the colour_next on this node */
2675        check_irq_off();
2676        n = cachep->node[nodeid];
2677        spin_lock(&n->list_lock);
2678
2679        /* Get colour for the slab, and cal the next value. */
2680        offset = n->colour_next;
2681        n->colour_next++;
2682        if (n->colour_next >= cachep->colour)
2683                n->colour_next = 0;
2684        spin_unlock(&n->list_lock);
2685
2686        offset *= cachep->colour_off;
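        /*
         * Successive slabs start their objects at different multiples of
         * colour_off (one cache line), so the first objects of different
         * slabs do not all compete for the same hardware cache lines.
         */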
2687
2688        if (local_flags & __GFP_WAIT)
2689                local_irq_enable();
2690
2691        /*
2692         * The test for missing atomic flag is performed here, rather than
2693         * the more obvious place, simply to reduce the critical path length
2694         * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
2695         * will eventually be caught here (where it matters).
2696         */
2697        kmem_flagcheck(cachep, flags);
2698
2699        /*
2700         * Get mem for the objs.  Attempt to allocate a physical page from
2701         * 'nodeid'.
2702         */
2703        if (!page)
2704                page = kmem_getpages(cachep, local_flags, nodeid);
2705        if (!page)
2706                goto failed;
2707
2708        /* Get slab management. */
2709        freelist = alloc_slabmgmt(cachep, page, offset,
2710                        local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
2711        if (!freelist)
2712                goto opps1;
2713
2714        slab_map_pages(cachep, page, freelist);
2715
2716        cache_init_objs(cachep, page);
2717
2718        if (local_flags & __GFP_WAIT)
2719                local_irq_disable();
2720        check_irq_off();
2721        spin_lock(&n->list_lock);
2722
2723        /* Make slab active. */
2724        list_add_tail(&page->lru, &(n->slabs_free));
2725        STATS_INC_GROWN(cachep);
2726        n->free_objects += cachep->num;
2727        spin_unlock(&n->list_lock);
2728        return 1;
2729opps1:
2730        kmem_freepages(cachep, page);
2731failed:
2732        if (local_flags & __GFP_WAIT)
2733                local_irq_disable();
2734        return 0;
2735}
2736
2737#if DEBUG
2738
2739/*
2740 * Perform extra freeing checks:
2741 * - detect bad pointers.
2742 * - POISON/RED_ZONE checking
2743 */
2744static void kfree_debugcheck(const void *objp)
2745{
2746        if (!virt_addr_valid(objp)) {
2747                printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
2748                       (unsigned long)objp);
2749                BUG();
2750        }
2751}
2752
2753static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
2754{
2755        unsigned long long redzone1, redzone2;
2756
2757        redzone1 = *dbg_redzone1(cache, obj);
2758        redzone2 = *dbg_redzone2(cache, obj);
2759
2760        /*
2761         * Redzone is ok.
2762         */
2763        if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE)
2764                return;
2765
2766        if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE)
2767                slab_error(cache, "double free detected");
2768        else
2769                slab_error(cache, "memory outside object was overwritten");
2770
2771        printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n",
2772                        obj, redzone1, redzone2);
2773}
2774
2775static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2776                                   unsigned long caller)
2777{
2778        unsigned int objnr;
2779        struct page *page;
2780
2781        BUG_ON(virt_to_cache(objp) != cachep);
2782
2783        objp -= obj_offset(cachep);
2784        kfree_debugcheck(objp);
2785        page = virt_to_head_page(objp);
2786
2787        if (cachep->flags & SLAB_RED_ZONE) {
2788                verify_redzone_free(cachep, objp);
2789                *dbg_redzone1(cachep, objp) = RED_INACTIVE;
2790                *dbg_redzone2(cachep, objp) = RED_INACTIVE;
2791        }
2792        if (cachep->flags & SLAB_STORE_USER)
2793                *dbg_userword(cachep, objp) = (void *)caller;
2794
2795        objnr = obj_to_index(cachep, page, objp);
2796
2797        BUG_ON(objnr >= cachep->num);
2798        BUG_ON(objp != index_to_obj(cachep, page, objnr));
2799
2800        if (cachep->flags & SLAB_POISON) {
2801#ifdef CONFIG_DEBUG_PAGEALLOC
2802                if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
2803                        store_stackinfo(cachep, objp, caller);
2804                        kernel_map_pages(virt_to_page(objp),
2805                                         cachep->size / PAGE_SIZE, 0);
2806                } else {
2807                        poison_obj(cachep, objp, POISON_FREE);
2808                }
2809#else
2810                poison_obj(cachep, objp, POISON_FREE);
2811#endif
2812        }
2813        return objp;
2814}
2815
2816#else
2817#define kfree_debugcheck(x) do { } while(0)
2818#define cache_free_debugcheck(x,objp,z) (objp)
2819#endif
2820
2821static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
2822                                                        bool force_refill)
2823{
2824        int batchcount;
2825        struct kmem_cache_node *n;
2826        struct array_cache *ac;
2827        int node;
2828
2829        check_irq_off();
2830        node = numa_mem_id();
2831        if (unlikely(force_refill))
2832                goto force_grow;
2833retry:
2834        ac = cpu_cache_get(cachep);
2835        batchcount = ac->batchcount;
2836        if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
2837                /*
2838                 * If there was little recent activity on this cache, then
2839                 * perform only a partial refill.  Otherwise we could generate
2840                 * refill bouncing.
2841                 */
2842                batchcount = BATCHREFILL_LIMIT;
2843        }
2844        n = cachep->node[node];
2845
2846        BUG_ON(ac->avail > 0 || !n);
2847        spin_lock(&n->list_lock);
2848
2849        /* See if we can refill from the shared array */
2850        if (n->shared && transfer_objects(ac, n->shared, batchcount)) {
2851                n->shared->touched = 1;
2852                goto alloc_done;
2853        }
2854
2855        while (batchcount > 0) {
2856                struct list_head *entry;
2857                struct page *page;
2858                /* Get the slab the allocation is to come from. */
2859                entry = n->slabs_partial.next;
2860                if (entry == &n->slabs_partial) {
2861                        n->free_touched = 1;
2862                        entry = n->slabs_free.next;
2863                        if (entry == &n->slabs_free)
2864                                goto must_grow;
2865                }
2866
2867                page = list_entry(entry, struct page, lru);
2868                check_spinlock_acquired(cachep);
2869
2870                /*
2871                 * The slab was either on partial or free list so
2872                 * there must be at least one object available for
2873                 * allocation.
2874                 */
2875                BUG_ON(page->active >= cachep->num);
2876
2877                while (page->active < cachep->num && batchcount--) {
2878                        STATS_INC_ALLOCED(cachep);
2879                        STATS_INC_ACTIVE(cachep);
2880                        STATS_SET_HIGH(cachep);
2881
2882                        ac_put_obj(cachep, ac, slab_get_obj(cachep, page,
2883                                                                        node));
2884                }
2885
2886                /* Move the page to the correct slab list: */
2887                list_del(&page->lru);
2888                if (page->active == cachep->num)
2889                        list_add(&page->lru, &n->slabs_full);
2890                else
2891                        list_add(&page->lru, &n->slabs_partial);
2892        }
2893
2894must_grow:
2895        n->free_objects -= ac->avail;
2896alloc_done:
2897        spin_unlock(&n->list_lock);
2898
2899        if (unlikely(!ac->avail)) {
2900                int x;
2901force_grow:
2902                x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
2903
2904                /* cache_grow can reenable interrupts, then ac could change. */
2905                ac = cpu_cache_get(cachep);
2906                node = numa_mem_id();
2907
2908                /* no objects in sight? abort */
2909                if (!x && (ac->avail == 0 || force_refill))
2910                        return NULL;
2911
2912                if (!ac->avail)         /* objects refilled by interrupt? */
2913                        goto retry;
2914        }
2915        ac->touched = 1;
2916
2917        return ac_get_obj(cachep, ac, flags, force_refill);
2918}
2919
2920static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
2921                                                gfp_t flags)
2922{
2923        might_sleep_if(flags & __GFP_WAIT);
2924#if DEBUG
2925        kmem_flagcheck(cachep, flags);
2926#endif
2927}
2928
2929#if DEBUG
2930static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
2931                                gfp_t flags, void *objp, unsigned long caller)
2932{
2933        if (!objp)
2934                return objp;
2935        if (cachep->flags & SLAB_POISON) {
2936#ifdef CONFIG_DEBUG_PAGEALLOC
2937                if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
2938                        kernel_map_pages(virt_to_page(objp),
2939                                         cachep->size / PAGE_SIZE, 1);
2940                else
2941                        check_poison_obj(cachep, objp);
2942#else
2943                check_poison_obj(cachep, objp);
2944#endif
2945                poison_obj(cachep, objp, POISON_INUSE);
2946        }
2947        if (cachep->flags & SLAB_STORE_USER)
2948                *dbg_userword(cachep, objp) = (void *)caller;
2949
2950        if (cachep->flags & SLAB_RED_ZONE) {
2951                if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
2952                                *dbg_redzone2(cachep, objp) != RED_INACTIVE) {
2953                        slab_error(cachep, "double free, or memory outside"
2954                                                " object was overwritten");
2955                        printk(KERN_ERR
2956                                "%p: redzone 1:0x%llx, redzone 2:0x%llx\n",
2957                                objp, *dbg_redzone1(cachep, objp),
2958                                *dbg_redzone2(cachep, objp));
2959                }
2960                *dbg_redzone1(cachep, objp) = RED_ACTIVE;
2961                *dbg_redzone2(cachep, objp) = RED_ACTIVE;
2962        }
2963        objp += obj_offset(cachep);
2964        if (cachep->ctor && cachep->flags & SLAB_POISON)
2965                cachep->ctor(objp);
2966        if (ARCH_SLAB_MINALIGN &&
2967            ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) {
2968                printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
2969                       objp, (int)ARCH_SLAB_MINALIGN);
2970        }
2971        return objp;
2972}
2973#else
2974#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
2975#endif
2976
2977static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
2978{
2979        if (cachep == kmem_cache)
2980                return false;
2981
2982        return should_failslab(cachep->object_size, flags, cachep->flags);
2983}
2984
2985static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
2986{
2987        void *objp;
2988        struct array_cache *ac;
2989        bool force_refill = false;
2990
2991        check_irq_off();
2992
2993        ac = cpu_cache_get(cachep);
2994        if (likely(ac->avail)) {
2995                ac->touched = 1;
2996                objp = ac_get_obj(cachep, ac, flags, false);
2997
2998                /*
2999                 * Allow for the possibility that none of the available
3000                 * objects are allowed by the current flags.
3001                 */
3002                if (objp) {
3003                        STATS_INC_ALLOCHIT(cachep);
3004                        goto out;
3005                }
3006                force_refill = true;
3007        }
3008
3009        STATS_INC_ALLOCMISS(cachep);
3010        objp = cache_alloc_refill(cachep, flags, force_refill);
3011        /*
3012         * the 'ac' may be updated by cache_alloc_refill(),
3013         * and kmemleak_erase() requires its correct value.
3014         */
3015        ac = cpu_cache_get(cachep);
3016
3017out:
3018        /*
3019         * To avoid a false negative, if an object that is in one of the
3020         * per-CPU caches is leaked, we need to make sure kmemleak doesn't
3021         * treat the array pointers as a reference to the object.
3022         */
3023        if (objp)
3024                kmemleak_erase(&ac->entry[ac->avail]);
3025        return objp;
3026}
3027
3028#ifdef CONFIG_NUMA
3029/*
3030 * Try allocating on another node if PF_SPREAD_SLAB or PF_MEMPOLICY is set.
3031 *
3032 * If we are in_interrupt, then process context, including cpusets and
3033 * mempolicy, may not apply and should not be used for allocation policy.
3034 */
3035static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3036{
3037        int nid_alloc, nid_here;
3038
3039        if (in_interrupt() || (flags & __GFP_THISNODE))
3040                return NULL;
3041        nid_alloc = nid_here = numa_mem_id();
3042        if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
3043                nid_alloc = cpuset_slab_spread_node();
3044        else if (current->mempolicy)
3045                nid_alloc = slab_node();
3046        if (nid_alloc != nid_here)
3047                return ____cache_alloc_node(cachep, flags, nid_alloc);
3048        return NULL;
3049}
3050
3051/*
3052 * Fallback function used when there was no memory available and no objects
3053 * on a certain node and fallback is permitted. First we scan all of the
3054 * allowed nodes for available objects. If that fails then we
3055 * perform an allocation without specifying a node. This allows the page
3056 * allocator to do its reclaim / fallback magic. We then insert the
3057 * slab into the proper nodelist and then allocate from it.
3058 */
3059static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3060{
3061        struct zonelist *zonelist;
3062        gfp_t local_flags;
3063        struct zoneref *z;
3064        struct zone *zone;
3065        enum zone_type high_zoneidx = gfp_zone(flags);
3066        void *obj = NULL;
3067        int nid;
3068        unsigned int cpuset_mems_cookie;
3069
3070        if (flags & __GFP_THISNODE)
3071                return NULL;
3072
3073        local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
3074
3075retry_cpuset:
3076        cpuset_mems_cookie = get_mems_allowed();
3077        zonelist = node_zonelist(slab_node(), flags);
3078
3079retry:
3080        /*
3081         * Look through allowed nodes for objects available
3082         * from existing per node queues.
3083         */
3084        for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
3085                nid = zone_to_nid(zone);
3086
3087                if (cpuset_zone_allowed_hardwall(zone, flags) &&
3088                        cache->node[nid] &&
3089                        cache->node[nid]->free_objects) {
3090                                obj = ____cache_alloc_node(cache,
3091                                        flags | GFP_THISNODE, nid);
3092                                if (obj)
3093                                        break;
3094                }
3095        }
3096
3097        if (!obj) {
3098                /*
3099                 * This allocation will be performed within the constraints
3100                 * of the current cpuset / memory policy requirements.
3101                 * We may trigger various forms of reclaim on the allowed
3102                 * set and go into memory reserves if necessary.
3103                 */
3104                struct page *page;
3105
3106                if (local_flags & __GFP_WAIT)
3107                        local_irq_enable();
3108                kmem_flagcheck(cache, flags);
3109                page = kmem_getpages(cache, local_flags, numa_mem_id());
3110                if (local_flags & __GFP_WAIT)
3111                        local_irq_disable();
3112                if (page) {
3113                        /*
3114                         * Insert into the appropriate per node queues
3115                         */
3116                        nid = page_to_nid(page);
3117                        if (cache_grow(cache, flags, nid, page)) {
3118                                obj = ____cache_alloc_node(cache,
3119                                        flags | GFP_THISNODE, nid);
3120                                if (!obj)
3121                                        /*
3122                                         * Another processor may allocate the
3123                                         * objects in the slab since we are
3124                                         * not holding any locks.
3125                                         */
3126                                        goto retry;
3127                        } else {
3128                                /* cache_grow already freed obj */
3129                                obj = NULL;
3130                        }
3131                }
3132        }
3133
3134        if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj))
3135                goto retry_cpuset;
3136        return obj;
3137}
3138
3139/*
3140 * An interface to enable slab creation on a given nodeid.
3141 */
3142static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
3143                                int nodeid)
3144{
3145        struct list_head *entry;
3146        struct page *page;
3147        struct kmem_cache_node *n;
3148        void *obj;
3149        int x;
3150
3151        VM_BUG_ON(nodeid > num_online_nodes());
3152        n = cachep->node[nodeid];
3153        BUG_ON(!n);
3154
3155retry:
3156        check_irq_off();
3157        spin_lock(&n->list_lock);
3158        entry = n->slabs_partial.next;
3159        if (entry == &n->slabs_partial) {
3160                n->free_touched = 1;
3161                entry = n->slabs_free.next;
3162                if (entry == &n->slabs_free)
3163                        goto must_grow;
3164        }
3165
3166        page = list_entry(entry, struct page, lru);
3167        check_spinlock_acquired_node(cachep, nodeid);
3168
3169        STATS_INC_NODEALLOCS(cachep);
3170        STATS_INC_ACTIVE(cachep);
3171        STATS_SET_HIGH(cachep);
3172
3173        BUG_ON(page->active == cachep->num);
3174
3175        obj = slab_get_obj(cachep, page, nodeid);
3176        n->free_objects--;
3177        /* move the slab page to the correct list: */
3178        list_del(&page->lru);
3179
3180        if (page->active == cachep->num)
3181                list_add(&page->lru, &n->slabs_full);
3182        else
3183                list_add(&page->lru, &n->slabs_partial);
3184
3185        spin_unlock(&n->list_lock);
3186        goto done;
3187
3188must_grow:
3189        spin_unlock(&n->list_lock);
3190        x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
3191        if (x)
3192                goto retry;
3193
3194        return fallback_alloc(cachep, flags);
3195
3196done:
3197        return obj;
3198}
3199
3200static __always_inline void *
3201slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3202                   unsigned long caller)
3203{
3204        unsigned long save_flags;
3205        void *ptr;
3206        int slab_node = numa_mem_id();
3207
3208        flags &= gfp_allowed_mask;
3209
3210        lockdep_trace_alloc(flags);
3211
3212        if (slab_should_failslab(cachep, flags))
3213                return NULL;
3214
3215        cachep = memcg_kmem_get_cache(cachep, flags);
3216
3217        cache_alloc_debugcheck_before(cachep, flags);
3218        local_irq_save(save_flags);
3219
3220        if (nodeid == NUMA_NO_NODE)
3221                nodeid = slab_node;
3222
3223        if (unlikely(!cachep->node[nodeid])) {
3224                /* Node not bootstrapped yet */
3225                ptr = fallback_alloc(cachep, flags);
3226                goto out;
3227        }
3228
3229        if (nodeid == slab_node) {
3230                /*
3231                 * Use the locally cached objects if possible.
3232                 * However, ____cache_alloc() does not allow fallback
3233                 * to other nodes. It may fail while we still have
3234                 * objects on other nodes available.
3235                 */
3236                ptr = ____cache_alloc(cachep, flags);
3237                if (ptr)
3238                        goto out;
3239        }
3240        /* ____cache_alloc_node() can fall back to other nodes */
3241        ptr = ____cache_alloc_node(cachep, flags, nodeid);
3242  out:
3243        local_irq_restore(save_flags);
3244        ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
3245        kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags,
3246                                 flags);
3247
3248        if (likely(ptr))
3249                kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size);
3250
3251        if (unlikely((flags & __GFP_ZERO) && ptr))
3252                memset(ptr, 0, cachep->object_size);
3253
3254        return ptr;
3255}
3256
3257static __always_inline void *
3258__do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
3259{
3260        void *objp;
3261
3262        if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
3263                objp = alternate_node_alloc(cache, flags);
3264                if (objp)
3265                        goto out;
3266        }
3267        objp = ____cache_alloc(cache, flags);
3268
3269        /*
3270         * We may just have run out of memory on the local node.
3271         * ____cache_alloc_node() knows how to locate memory on other nodes
3272         */
3273        if (!objp)
3274                objp = ____cache_alloc_node(cache, flags, numa_mem_id());
3275
3276  out:
3277        return objp;
3278}
3279#else
3280
3281static __always_inline void *
3282__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3283{
3284        return ____cache_alloc(cachep, flags);
3285}
3286
3287#endif /* CONFIG_NUMA */
3288
3289static __always_inline void *
3290slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
3291{
3292        unsigned long save_flags;
3293        void *objp;
3294
3295        flags &= gfp_allowed_mask;
3296
3297        lockdep_trace_alloc(flags);
3298
3299        if (slab_should_failslab(cachep, flags))
3300                return NULL;
3301
3302        cachep = memcg_kmem_get_cache(cachep, flags);
3303
3304        cache_alloc_debugcheck_before(cachep, flags);
3305        local_irq_save(save_flags);
3306        objp = __do_cache_alloc(cachep, flags);
3307        local_irq_restore(save_flags);
3308        objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
3309        kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags,
3310                                 flags);
3311        prefetchw(objp);
3312
3313        if (likely(objp))
3314                kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size);
3315
3316        if (unlikely((flags & __GFP_ZERO) && objp))
3317                memset(objp, 0, cachep->object_size);
3318
3319        return objp;
3320}
3321
3322/*
3323 * Caller needs to acquire the correct kmem_cache_node's list_lock.
3324 */
3325static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3326                       int node)
3327{
3328        int i;
3329        struct kmem_cache_node *n;
3330
3331        for (i = 0; i < nr_objects; i++) {
3332                void *objp;
3333                struct page *page;
3334
3335                clear_obj_pfmemalloc(&objpp[i]);
3336                objp = objpp[i];
3337
3338                page = virt_to_head_page(objp);
3339                n = cachep->node[node];
3340                list_del(&page->lru);
3341                check_spinlock_acquired_node(cachep, node);
3342                slab_put_obj(cachep, page, objp, node);
3343                STATS_DEC_ACTIVE(cachep);
3344                n->free_objects++;
3345
3346                /* fixup slab chains */
3347                if (page->active == 0) {
3348                        if (n->free_objects > n->free_limit) {
3349                                n->free_objects -= cachep->num;
3350                                /* No need to drop any previously held
3351                                 * lock here; even if we have an off-slab
3352                                 * slab descriptor, it is guaranteed to come
3353                                 * from a different cache (see the comments
3354                                 * before alloc_slabmgmt()).
3355                                 */
3356                                slab_destroy(cachep, page);
3357                        } else {
3358                                list_add(&page->lru, &n->slabs_free);
3359                        }
3360                } else {
3361                        /* Unconditionally move a slab to the end of the
3362                         * partial list on free - this gives the remaining
3363                         * objects the maximum time to be freed, too.
3364                         */
3365                        list_add_tail(&page->lru, &n->slabs_partial);
3366                }
3367        }
3368}
3369
3370static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
3371{
3372        int batchcount;
3373        struct kmem_cache_node *n;
3374        int node = numa_mem_id();
3375
3376        batchcount = ac->batchcount;
3377#if DEBUG
3378        BUG_ON(!batchcount || batchcount > ac->avail);
3379#endif
3380        check_irq_off();
3381        n = cachep->node[node];
3382        spin_lock(&n->list_lock);
3383        if (n->shared) {
3384                struct array_cache *shared_array = n->shared;
3385                int max = shared_array->limit - shared_array->avail;
3386                if (max) {
3387                        if (batchcount > max)
3388                                batchcount = max;
3389                        memcpy(&(shared_array->entry[shared_array->avail]),
3390                               ac->entry, sizeof(void *) * batchcount);
3391                        shared_array->avail += batchcount;
3392                        goto free_done;
3393                }
3394        }
3395
3396        free_block(cachep, ac->entry, batchcount, node);
3397free_done:
3398#if STATS
3399        {
3400                int i = 0;
3401                struct list_head *p;
3402
3403                p = n->slabs_free.next;
3404                while (p != &(n->slabs_free)) {
3405                        struct page *page;
3406
3407                        page = list_entry(p, struct page, lru);
3408                        BUG_ON(page->active);
3409
3410                        i++;
3411                        p = p->next;
3412                }
3413                STATS_SET_FREEABLE(cachep, i);
3414        }
3415#endif
3416        spin_unlock(&n->list_lock);
3417        ac->avail -= batchcount;
3418        memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
3419}
3420
3421/*
3422 * Release an obj back to its cache. If the obj has a constructed state, it must
3423 * be in this state _before_ it is released.  Called with local interrupts disabled.
3424 */
3425static inline void __cache_free(struct kmem_cache *cachep, void *objp,
3426                                unsigned long caller)
3427{
3428        struct array_cache *ac = cpu_cache_get(cachep);
3429
3430        check_irq_off();
3431        kmemleak_free_recursive(objp, cachep->flags);
3432        objp = cache_free_debugcheck(cachep, objp, caller);
3433
3434        kmemcheck_slab_free(cachep, objp, cachep->object_size);
3435
3436        /*
3437         * Skip calling cache_free_alien() when the platform is not NUMA.
3438         * This avoids the cache misses that happen while accessing slabp
3439         * (a per-page memory reference) to get the nodeid. Instead use a
3440         * global variable to skip the call, which is most likely to be
3441         * present in the cache.
3442         */
3443        if (nr_online_nodes > 1 && cache_free_alien(cachep, objp))
3444                return;
3445
3446        if (likely(ac->avail < ac->limit)) {
3447                STATS_INC_FREEHIT(cachep);
3448        } else {
3449                STATS_INC_FREEMISS(cachep);
3450                cache_flusharray(cachep, ac);
3451        }
3452
3453        ac_put_obj(cachep, ac, objp);
3454}
3455
3456/**
3457 * kmem_cache_alloc - Allocate an object
3458 * @cachep: The cache to allocate from.
3459 * @flags: See kmalloc().
3460 *
3461 * Allocate an object from this cache.  The flags are only relevant
3462 * if the cache has no available objects.
3463 */
3464void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3465{
3466        void *ret = slab_alloc(cachep, flags, _RET_IP_);
3467
3468        trace_kmem_cache_alloc(_RET_IP_, ret,
3469                               cachep->object_size, cachep->size, flags);
3470
3471        return ret;
3472}
3473EXPORT_SYMBOL(kmem_cache_alloc);
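/*
 * Illustrative usage sketch (not part of slab.c): a typical caller creates a
 * dedicated cache once and then allocates/frees objects from it. The struct,
 * cache name and variable names below are hypothetical.
 *
 *	struct foo {
 *		int id;
 *		struct list_head list;
 *	};
 *	static struct kmem_cache *foo_cachep;
 *
 *	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
 *				       0, SLAB_HWCACHE_ALIGN, NULL);
 *	...
 *	struct foo *f = kmem_cache_alloc(foo_cachep, GFP_KERNEL);
 *	if (!f)
 *		return -ENOMEM;
 *	...
 *	kmem_cache_free(foo_cachep, f);
 */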
3474
3475#ifdef CONFIG_TRACING
3476void *
3477kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
3478{
3479        void *ret;
3480
3481        ret = slab_alloc(cachep, flags, _RET_IP_);
3482
3483        trace_kmalloc(_RET_IP_, ret,
3484                      size, cachep->size, flags);
3485        return ret;
3486}
3487EXPORT_SYMBOL(kmem_cache_alloc_trace);
3488#endif
3489
3490#ifdef CONFIG_NUMA
3491/**
3492 * kmem_cache_alloc_node - Allocate an object on the specified node
3493 * @cachep: The cache to allocate from.
3494 * @flags: See kmalloc().
3495 * @nodeid: node number of the target node.
3496 *
3497 * Identical to kmem_cache_alloc but it will allocate memory on the given
3498 * node, which can improve the performance for cpu bound structures.
3499 *
3500 * Fallback to other nodes is possible if __GFP_THISNODE is not set.
3501 */
3502void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3503{
3504        void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
3505
3506        trace_kmem_cache_alloc_node(_RET_IP_, ret,
3507                                    cachep->object_size, cachep->size,
3508                                    flags, nodeid);
3509
3510        return ret;
3511}
3512EXPORT_SYMBOL(kmem_cache_alloc_node);
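/*
 * Illustrative usage sketch (not part of slab.c): allocate per-device state
 * close to the device's memory controller. foo_cachep and dev are
 * hypothetical; dev_to_node() is the usual way to obtain the node id.
 *
 *	struct foo *f = kmem_cache_alloc_node(foo_cachep, GFP_KERNEL,
 *					      dev_to_node(dev));
 */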
3513
3514#ifdef CONFIG_TRACING
3515void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
3516                                  gfp_t flags,
3517                                  int nodeid,
3518                                  size_t size)
3519{
3520        void *ret;
3521
3522        ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
3523
3524        trace_kmalloc_node(_RET_IP_, ret,
3525                           size, cachep->size,
3526                           flags, nodeid);
3527        return ret;
3528}
3529EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
3530#endif
3531
3532static __always_inline void *
3533__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
3534{
3535        struct kmem_cache *cachep;
3536
3537        cachep = kmalloc_slab(size, flags);
3538        if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3539                return cachep;
3540        return kmem_cache_alloc_node_trace(cachep, flags, node, size);
3541}
3542
3543#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
3544void *__kmalloc_node(size_t size, gfp_t flags, int node)
3545{
3546        return __do_kmalloc_node(size, flags, node, _RET_IP_);
3547}
3548EXPORT_SYMBOL(__kmalloc_node);
3549
3550void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
3551                int node, unsigned long caller)
3552{
3553        return __do_kmalloc_node(size, flags, node, caller);
3554}
3555EXPORT_SYMBOL(__kmalloc_node_track_caller);
3556#else
3557void *__kmalloc_node(size_t size, gfp_t flags, int node)
3558{
3559        return __do_kmalloc_node(size, flags, node, 0);
3560}
3561EXPORT_SYMBOL(__kmalloc_node);
3562#endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */
3563#endif /* CONFIG_NUMA */
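/*
 * Illustrative usage sketch (not part of slab.c): __kmalloc_node() is normally
 * reached via kmalloc_node(). The struct and variable names are hypothetical.
 *
 *	struct ring *r = kmalloc_node(sizeof(*r), GFP_KERNEL | __GFP_ZERO,
 *				      cpu_to_node(cpu));
 */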
3564
3565/**
3566 * __do_kmalloc - allocate memory
3567 * @size: how many bytes of memory are required.
3568 * @flags: the type of memory to allocate (see kmalloc).
3569 * @caller: function caller for debug tracking of the caller
3570 */
3571static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3572                                          unsigned long caller)
3573{
3574        struct kmem_cache *cachep;
3575        void *ret;
3576
3577        /* If you want to save a few bytes of .text space: replace
3578         * __ with kmem_.
3579         * Then kmalloc uses the uninlined functions instead of the inline
3580         * functions.
3581         */
3582        cachep = kmalloc_slab(size, flags);
3583        if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3584                return cachep;
3585        ret = slab_alloc(cachep, flags, caller);
3586
3587        trace_kmalloc(caller, ret,
3588                      size, cachep->size, flags);
3589
3590        return ret;
3591}
3592
3593
3594#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
3595void *__kmalloc(size_t size, gfp_t flags)
3596{
3597        return __do_kmalloc(size, flags, _RET_IP_);
3598}
3599EXPORT_SYMBOL(__kmalloc);
3600
3601void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
3602{
3603        return __do_kmalloc(size, flags, caller);
3604}
3605EXPORT_SYMBOL(__kmalloc_track_caller);
3606
3607#else
3608void *__kmalloc(size_t size, gfp_t flags)
3609{
3610        return __do_kmalloc(size, flags, 0);
3611}
3612EXPORT_SYMBOL(__kmalloc);
3613#endif
3614
3615/**
3616 * kmem_cache_free - Deallocate an object
3617 * @cachep: The cache the allocation was from.
3618 * @objp: The previously allocated object.
3619 *
3620 * Free an object which was previously allocated from this
3621 * cache.
3622 */
3623void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3624{
3625        unsigned long flags;
3626        cachep = cache_from_obj(cachep, objp);
3627        if (!cachep)
3628                return;
3629
3630        local_irq_save(flags);
3631        debug_check_no_locks_freed(objp, cachep->object_size);
3632        if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
3633                debug_check_no_obj_freed(objp, cachep->object_size);
3634        __cache_free(cachep, objp, _RET_IP_);
3635        local_irq_restore(flags);
3636
3637        trace_kmem_cache_free(_RET_IP_, objp);
3638}
3639EXPORT_SYMBOL(kmem_cache_free);
3640
3641/**
3642 * kfree - free previously allocated memory
3643 * @objp: pointer returned by kmalloc.
3644 *
3645 * If @objp is NULL, no operation is performed.
3646 *
3647 * Don't free memory not originally allocated by kmalloc()
3648 * or you will run into trouble.
3649 */
3650void kfree(const void *objp)
3651{
3652        struct kmem_cache *c;
3653        unsigned long flags;
3654
3655        trace_kfree(_RET_IP_, objp);
3656
3657        if (unlikely(ZERO_OR_NULL_PTR(objp)))
3658                return;
3659        local_irq_save(flags);
3660        kfree_debugcheck(objp);
3661        c = virt_to_cache(objp);
3662        debug_check_no_locks_freed(objp, c->object_size);
3663
3664        debug_check_no_obj_freed(objp, c->object_size);
3665        __cache_free(c, (void *)objp, _RET_IP_);
3666        local_irq_restore(flags);
3667}
3668EXPORT_SYMBOL(kfree);
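/*
 * Illustrative usage sketch (not part of slab.c): the kmalloc()/kfree() pair
 * backed by the code above; the buffer size is an arbitrary example. Note
 * that kfree(NULL) simply returns, as the ZERO_OR_NULL_PTR() check shows.
 *
 *	char *buf = kmalloc(128, GFP_KERNEL);
 *	if (!buf)
 *		return -ENOMEM;
 *	...
 *	kfree(buf);
 */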
3669
3670/*
3671 * This initializes kmem_cache_node or resizes various caches for all nodes.
3672 */
3673static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
3674{
3675        int node;
3676        struct kmem_cache_node *n;
3677        struct array_cache *new_shared;
3678        struct array_cache **new_alien = NULL;
3679
3680        for_each_online_node(node) {
3681
3682                if (use_alien_caches) {
3683                        new_alien = alloc_alien_cache(node, cachep->limit, gfp);
3684                        if (!new_alien)
3685                                goto fail;
3686                }
3687
3688                new_shared = NULL;
3689                if (cachep->shared) {
3690                        new_shared = alloc_arraycache(node,
3691                                cachep->shared*cachep->batchcount,
3692                                        0xbaadf00d, gfp);
3693                        if (!new_shared) {
3694                                free_alien_cache(new_alien);
3695                                goto fail;
3696                        }
3697                }
3698
3699                n = cachep->node[node];
3700                if (n) {
3701                        struct array_cache *shared = n->shared;
3702
3703                        spin_lock_irq(&n->list_lock);
3704
3705                        if (shared)
3706                                free_block(cachep, shared->entry,
3707                                                shared->avail, node);
3708
3709                        n->shared = new_shared;
3710                        if (!n->alien) {
3711                                n->alien = new_alien;
3712                                new_alien = NULL;
3713                        }
3714                        n->free_limit = (1 + nr_cpus_node(node)) *
3715                                        cachep->batchcount + cachep->num;
3716                        spin_unlock_irq(&n->list_lock);
3717                        kfree(shared);
3718                        free_alien_cache(new_alien);
3719                        continue;
3720                }
3721                n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node);
3722                if (!n) {
3723                        free_alien_cache(new_alien);
3724                        kfree(new_shared);
3725                        goto fail;
3726                }
3727
3728                kmem_cache_node_init(n);
3729                n->next_reap = jiffies + REAPTIMEOUT_LIST3 +
3730                                ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
3731                n->shared = new_shared;
3732                n->alien = new_alien;
3733                n->free_limit = (1 + nr_cpus_node(node)) *
3734                                        cachep->batchcount + cachep->num;
3735                cachep->node[node] = n;
3736        }
3737        return 0;
3738
3739fail:
3740        if (!cachep->list.next) {
3741                /* Cache is not active yet. Roll back what we did */
3742                node--;
3743                while (node >= 0) {
3744                        if (cachep->node[node]) {
3745                                n = cachep->node[node];
3746
3747                                kfree(n->shared);
3748                                free_alien_cache(n->alien);
3749                                kfree(n);
3750                                cachep->node[node] = NULL;
3751                        }
3752                        node--;
3753                }
3754        }
3755        return -ENOMEM;
3756}
3757
3758struct ccupdate_struct {
3759        struct kmem_cache *cachep;
3760        struct array_cache *new[0];
3761};
3762
3763static void do_ccupdate_local(void *info)
3764{
3765        struct ccupdate_struct *new = info;
3766        struct array_cache *old;
3767
3768        check_irq_off();
3769        old = cpu_cache_get(new->cachep);
3770
3771        new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
3772        new->new[smp_processor_id()] = old;
3773}
3774
3775/* Always called with the slab_mutex held */
3776static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
3777                                int batchcount, int shared, gfp_t gfp)
3778{
3779        struct ccupdate_struct *new;
3780        int i;
3781
3782        new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *),
3783                      gfp);
3784        if (!new)
3785                return -ENOMEM;
3786
3787        for_each_online_cpu(i) {
3788                new->new[i] = alloc_arraycache(cpu_to_mem(i), limit,
3789                                                batchcount, gfp);
3790                if (!new->new[i]) {
3791                        for (i--; i >= 0; i--)
3792                                kfree(new->new[i]);
3793                        kfree(new);
3794                        return -ENOMEM;
3795                }
3796        }
3797        new->cachep = cachep;
3798
3799        on_each_cpu(do_ccupdate_local, (void *)new, 1);
3800
3801        check_irq_on();
3802        cachep->batchcount = batchcount;
3803        cachep->limit = limit;
3804        cachep->shared = shared;
3805
3806        for_each_online_cpu(i) {
3807                struct array_cache *ccold = new->new[i];
3808                if (!ccold)
3809                        continue;
3810                spin_lock_irq(&cachep->node[cpu_to_mem(i)]->list_lock);
3811                free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i));
3812                spin_unlock_irq(&cachep->node[cpu_to_mem(i)]->list_lock);
3813                kfree(ccold);
3814        }
3815        kfree(new);
3816        return alloc_kmemlist(cachep, gfp);
3817}
3818
3819static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3820                                int batchcount, int shared, gfp_t gfp)
3821{
3822        int ret;
3823        struct kmem_cache *c = NULL;
3824        int i = 0;
3825
3826        ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
3827
3828        if (slab_state < FULL)
3829                return ret;
3830
3831        if ((ret < 0) || !is_root_cache(cachep))
3832                return ret;
3833
3834        VM_BUG_ON(!mutex_is_locked(&slab_mutex));
3835        for_each_memcg_cache_index(i) {
3836                c = cache_from_memcg_idx(cachep, i);
3837                if (c)
3838                        /* return value determined by the parent cache only */
3839                        __do_tune_cpucache(c, limit, batchcount, shared, gfp);
3840        }
3841
3842        return ret;
3843}
3844
3845/* Always called with the slab_mutex held */
3846static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
3847{
3848        int err;
3849        int limit = 0;
3850        int shared = 0;
3851        int batchcount = 0;
3852
3853        if (!is_root_cache(cachep)) {
3854                struct kmem_cache *root = memcg_root_cache(cachep);
3855                limit = root->limit;
3856                shared = root->shared;
3857                batchcount = root->batchcount;
3858        }
3859
3860        if (limit && shared && batchcount)
3861                goto skip_setup;
3862        /*
3863         * The head array serves three purposes:
3864         * - create a LIFO ordering, i.e. return objects that are cache-warm
3865         * - reduce the number of spinlock operations
3866         * - reduce the number of linked list operations on the slab and
3867         *   bufctl chains: array operations are cheaper
3868         * The numbers below are guessed; we should auto-tune them as
3869         * described by Bonwick.
3870         */
3871        if (cachep->size > 131072)
3872                limit = 1;
3873        else if (cachep->size > PAGE_SIZE)
3874                limit = 8;
3875        else if (cachep->size > 1024)
3876                limit = 24;
3877        else if (cachep->size > 256)
3878                limit = 54;
3879        else
3880                limit = 120;
3881
3882        /*
3883         * CPU bound tasks (e.g. network routing) can exhibit asymmetric
3884         * allocation behaviour: most allocs on one cpu, most free operations
3885         * on another cpu. For these cases, efficient object passing between
3886         * cpus is necessary. This is provided by a shared array. The array
3887         * replaces Bonwick's magazine layer.
3888         * On uniprocessor, it's functionally equivalent (but less efficient)
3889         * to a larger limit. Thus disabled by default.
3890         */
3891        shared = 0;
3892        if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1)
3893                shared = 8;
3894
3895#if DEBUG
3896        /*
3897         * With debugging enabled, a large batchcount leads to excessively
3898         * long periods with local interrupts disabled. Limit the batchcount.
3899         */
3900        if (limit > 32)
3901                limit = 32;
3902#endif
3903        batchcount = (limit + 1) / 2;
3904skip_setup:
3905        err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
3906        if (err)
3907                printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
3908                       cachep->name, -err);
3909        return err;
3910}
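/*
 * Worked example of the heuristics above (illustrative, with debugging off):
 * a cache whose objects occupy 512 bytes on an SMP machine with 4K pages gets
 * limit = 54 (size > 256), shared = 8 (size <= PAGE_SIZE), and
 * batchcount = (54 + 1) / 2 = 27, i.e. roughly half of the per-cpu array is
 * transferred when it over- or underflows.
 */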
3911
3912/*
3913 * Drain an array if it contains any elements, taking the node lock only if
3914 * necessary. Note that the node list_lock also protects the array_cache
3915 * if drain_array() is used on the shared array.
3916 */
3917static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
3918                         struct array_cache *ac, int force, int node)
3919{
3920        int tofree;
3921
3922        if (!ac || !ac->avail)
3923                return;
3924        if (ac->touched && !force) {
3925                ac->touched = 0;
3926        } else {
3927                spin_lock_irq(&n->list_lock);
3928                if (ac->avail) {
3929                        tofree = force ? ac->avail : (ac->limit + 4) / 5;
3930                        if (tofree > ac->avail)
3931                                tofree = (ac->avail + 1) / 2;
3932                        free_block(cachep, ac->entry, tofree, node);
3933                        ac->avail -= tofree;
3934                        memmove(ac->entry, &(ac->entry[tofree]),
3935                                sizeof(void *) * ac->avail);
3936                }
3937                spin_unlock_irq(&n->list_lock);
3938        }
3939}
3940
3941/**
3942 * cache_reap - Reclaim memory from caches.
3943 * @w: work descriptor
3944 *
3945 * Called from workqueue/eventd every few seconds.
3946 * Purpose:
3947 * - clear the per-cpu caches for this CPU.
3948 * - return freeable pages to the main free memory pool.
3949 *
3950 * If we cannot acquire the cache chain mutex then just give up - we'll try
3951 * again on the next iteration.
3952 */
3953static void cache_reap(struct work_struct *w)
3954{
3955        struct kmem_cache *searchp;
3956        struct kmem_cache_node *n;
3957        int node = numa_mem_id();
3958        struct delayed_work *work = to_delayed_work(w);
3959
3960        if (!mutex_trylock(&slab_mutex))
3961                /* Give up. Set up the next iteration. */
3962                goto out;
3963
3964        list_for_each_entry(searchp, &slab_caches, list) {
3965                check_irq_on();
3966
3967                /*
3968                 * We only take the node lock if absolutely necessary and we
3969                 * have established with reasonable certainty that
3970                 * we can do some work once the lock is obtained.
3971                 */
3972                n = searchp->node[node];
3973
3974                reap_alien(searchp, n);
3975
3976                drain_array(searchp, n, cpu_cache_get(searchp), 0, node);
3977
3978                /*
3979                 * These are racy checks but it does not matter
3980                 * if we skip one check or scan twice.
3981                 */
3982                if (time_after(n->next_reap, jiffies))
3983                        goto next;
3984
3985                n->next_reap = jiffies + REAPTIMEOUT_LIST3;
3986
3987                drain_array(searchp, n, n->shared, 0, node);
3988
3989                if (n->free_touched)
3990                        n->free_touched = 0;
3991                else {
3992                        int freed;
3993
3994                        freed = drain_freelist(searchp, n, (n->free_limit +
3995                                5 * searchp->num - 1) / (5 * searchp->num));
3996                        STATS_ADD_REAPED(searchp, freed);
3997                }
3998next:
3999                cond_resched();
4000        }
4001        check_irq_on();
4002        mutex_unlock(&slab_mutex);
4003        next_reap_node();
4004out:
4005        /* Set up the next iteration */
4006        schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC));
4007}
4008
4009#ifdef CONFIG_SLABINFO
4010void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
4011{
4012        struct page *page;
4013        unsigned long active_objs;
4014        unsigned long num_objs;
4015        unsigned long active_slabs = 0;
4016        unsigned long num_slabs, free_objects = 0, shared_avail = 0;
4017        const char *name;
4018        char *error = NULL;
4019        int node;
4020        struct kmem_cache_node *n;
4021
4022        active_objs = 0;
4023        num_slabs = 0;
4024        for_each_online_node(node) {
4025                n = cachep->node[node];
4026                if (!n)
4027                        continue;
4028
4029                check_irq_on();
4030                spin_lock_irq(&n->list_lock);
4031
4032                list_for_each_entry(page, &n->slabs_full, lru) {
4033                        if (page->active != cachep->num && !error)
4034                                error = "slabs_full accounting error";
4035                        active_objs += cachep->num;
4036                        active_slabs++;
4037                }
4038                list_for_each_entry(page, &n->slabs_partial, lru) {
4039                        if (page->active == cachep->num && !error)
4040                                error = "slabs_partial accounting error";
4041                        if (!page->active && !error)
4042                                error = "slabs_partial accounting error";
4043                        active_objs += page->active;
4044                        active_slabs++;
4045                }
4046                list_for_each_entry(page, &n->slabs_free, lru) {
4047                        if (page->active && !error)
4048                                error = "slabs_free accounting error";
4049                        num_slabs++;
4050                }
4051                free_objects += n->free_objects;
4052                if (n->shared)
4053                        shared_avail += n->shared->avail;
4054
4055                spin_unlock_irq(&n->list_lock);
4056        }
4057        num_slabs += active_slabs;
4058        num_objs = num_slabs * cachep->num;
4059        if (num_objs - active_objs != free_objects && !error)
4060                error = "free_objects accounting error";
4061
4062        name = cachep->name;
4063        if (error)
4064                printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
4065
4066        sinfo->active_objs = active_objs;
4067        sinfo->num_objs = num_objs;
4068        sinfo->active_slabs = active_slabs;
4069        sinfo->num_slabs = num_slabs;
4070        sinfo->shared_avail = shared_avail;
4071        sinfo->limit = cachep->limit;
4072        sinfo->batchcount = cachep->batchcount;
4073        sinfo->shared = cachep->shared;
4074        sinfo->objects_per_slab = cachep->num;
4075        sinfo->cache_order = cachep->gfporder;
4076}
4077
4078void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep)
4079{
4080#if STATS
4081        {                       /* node stats */
4082                unsigned long high = cachep->high_mark;
4083                unsigned long allocs = cachep->num_allocations;
4084                unsigned long grown = cachep->grown;
4085                unsigned long reaped = cachep->reaped;
4086                unsigned long errors = cachep->errors;
4087                unsigned long max_freeable = cachep->max_freeable;
4088                unsigned long node_allocs = cachep->node_allocs;
4089                unsigned long node_frees = cachep->node_frees;
4090                unsigned long overflows = cachep->node_overflow;
4091
4092                seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu "
4093                           "%4lu %4lu %4lu %4lu %4lu",
4094                           allocs, high, grown,
4095                           reaped, errors, max_freeable, node_allocs,
4096                           node_frees, overflows);
4097        }
4098        /* cpu stats */
4099        {
4100                unsigned long allochit = atomic_read(&cachep->allochit);
4101                unsigned long allocmiss = atomic_read(&cachep->allocmiss);
4102                unsigned long freehit = atomic_read(&cachep->freehit);
4103                unsigned long freemiss = atomic_read(&cachep->freemiss);
4104
4105                seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
4106                           allochit, allocmiss, freehit, freemiss);
4107        }
4108#endif
4109}
4110
4111#define MAX_SLABINFO_WRITE 128
4112/**
4113 * slabinfo_write - Tuning for the slab allocator
4114 * @file: unused
4115 * @buffer: user buffer
4116 * @count: data length
4117 * @ppos: unused
4118 */
4119ssize_t slabinfo_write(struct file *file, const char __user *buffer,
4120                       size_t count, loff_t *ppos)
4121{
4122        char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
4123        int limit, batchcount, shared, res;
4124        struct kmem_cache *cachep;
4125
4126        if (count > MAX_SLABINFO_WRITE)
4127                return -EINVAL;
4128        if (copy_from_user(&kbuf, buffer, count))
4129                return -EFAULT;
4130        kbuf[MAX_SLABINFO_WRITE] = '\0';
4131
4132        tmp = strchr(kbuf, ' ');
4133        if (!tmp)
4134                return -EINVAL;
4135        *tmp = '\0';
4136        tmp++;
4137        if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3)
4138                return -EINVAL;
4139
4140        /* Find the cache in the chain of caches. */
4141        mutex_lock(&slab_mutex);
4142        res = -EINVAL;
4143        list_for_each_entry(cachep, &slab_caches, list) {
4144                if (!strcmp(cachep->name, kbuf)) {
4145                        if (limit < 1 || batchcount < 1 ||
4146                                        batchcount > limit || shared < 0) {
4147                                res = 0;
4148                        } else {
4149                                res = do_tune_cpucache(cachep, limit,
4150                                                       batchcount, shared,
4151                                                       GFP_KERNEL);
4152                        }
4153                        break;
4154                }
4155        }
4156        mutex_unlock(&slab_mutex);
4157        if (res >= 0)
4158                res = count;
4159        return res;
4160}
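/*
 * Illustrative tuning example (the cache name and values are arbitrary):
 * writing "<name> <limit> <batchcount> <shared>" to /proc/slabinfo retunes
 * the named cache, e.g.
 *
 *	echo "dentry 120 60 8" > /proc/slabinfo
 *
 * Out-of-range values (limit < 1, batchcount < 1, batchcount > limit or
 * shared < 0) are silently ignored: the write still returns count.
 */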
4161
4162#ifdef CONFIG_DEBUG_SLAB_LEAK
4163
4164static void *leaks_start(struct seq_file *m, loff_t *pos)
4165{
4166        mutex_lock(&slab_mutex);
4167        return seq_list_start(&slab_caches, *pos);
4168}
4169
4170static inline int add_caller(unsigned long *n, unsigned long v)
4171{
4172        unsigned long *p;
4173        int l;
4174        if (!v)
4175                return 1;
4176        l = n[1];
4177        p = n + 2;
4178        while (l) {
4179                int i = l/2;
4180                unsigned long *q = p + 2 * i;
4181                if (*q == v) {
4182                        q[1]++;
4183                        return 1;
4184                }
4185                if (*q > v) {
4186                        l = i;
4187                } else {
4188                        p = q + 2;
4189                        l -= i + 1;
4190                }
4191        }
4192        if (++n[1] == n[0])
4193                return 0;
4194        memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n));
4195        p[0] = v;
4196        p[1] = 1;
4197        return 1;
4198}
4199
4200static void handle_slab(unsigned long *n, struct kmem_cache *c,
4201                                                struct page *page)
4202{
4203        void *p;
4204        int i, j;
4205
4206        if (n[0] == n[1])
4207                return;
4208        for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) {
4209                bool active = true;
4210
4211                for (j = page->active; j < c->num; j++) {
4212                        /* Skip freed item */
4213                        if (slab_freelist(page)[j] == i) {
4214                                active = false;
4215                                break;
4216                        }
4217                }
4218                if (!active)
4219                        continue;
4220
4221                if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
4222                        return;
4223        }
4224}
4225
4226static void show_symbol(struct seq_file *m, unsigned long address)
4227{
4228#ifdef CONFIG_KALLSYMS
4229        unsigned long offset, size;
4230        char modname[MODULE_NAME_LEN], name[KSYM_NAME_LEN];
4231
4232        if (lookup_symbol_attrs(address, &size, &offset, modname, name) == 0) {
4233                seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
4234                if (modname[0])
4235                        seq_printf(m, " [%s]", modname);
4236                return;
4237        }
4238#endif
4239        seq_printf(m, "%p", (void *)address);
4240}
4241
4242static int leaks_show(struct seq_file *m, void *p)
4243{
4244        struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list);
4245        struct page *page;
4246        struct kmem_cache_node *n;
4247        const char *name;
4248        unsigned long *x = m->private;
4249        int node;
4250        int i;
4251
4252        if (!(cachep->flags & SLAB_STORE_USER))
4253                return 0;
4254        if (!(cachep->flags & SLAB_RED_ZONE))
4255                return 0;
4256
4257        /* OK, we can do it */
4258
4259        x[1] = 0;
4260
4261        for_each_online_node(node) {
4262                n = cachep->node[node];
4263                if (!n)
4264                        continue;
4265
4266                check_irq_on();
4267                spin_lock_irq(&n->list_lock);
4268
4269                list_for_each_entry(page, &n->slabs_full, lru)
4270                        handle_slab(x, cachep, page);
4271                list_for_each_entry(page, &n->slabs_partial, lru)
4272                        handle_slab(x, cachep, page);
4273                spin_unlock_irq(&n->list_lock);
4274        }
4275        name = cachep->name;
4276        if (x[0] == x[1]) {
4277                /* Increase the buffer size */
4278                mutex_unlock(&slab_mutex);
4279                m->private = kzalloc(x[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
4280                if (!m->private) {
4281                        /* Too bad, we are really out */
4282                        m->private = x;
4283                        mutex_lock(&slab_mutex);
4284                        return -ENOMEM;
4285                }
4286                *(unsigned long *)m->private = x[0] * 2;
4287                kfree(x);
4288                mutex_lock(&slab_mutex);
4289                /* Now make sure this entry will be retried */
4290                m->count = m->size;
4291                return 0;
4292        }
4293        for (i = 0; i < x[1]; i++) {
4294                seq_printf(m, "%s: %lu ", name, x[2*i+3]);
4295                show_symbol(m, x[2*i+2]);
4296                seq_putc(m, '\n');
4297        }
4298
4299        return 0;
4300}
4301
4302static const struct seq_operations slabstats_op = {
4303        .start = leaks_start,
4304        .next = slab_next,
4305        .stop = slab_stop,
4306        .show = leaks_show,
4307};
4308
4309static int slabstats_open(struct inode *inode, struct file *file)
4310{
4311        unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL);
4312        int ret = -ENOMEM;
4313        if (n) {
4314                ret = seq_open(file, &slabstats_op);
4315                if (!ret) {
4316                        struct seq_file *m = file->private_data;
4317                        *n = PAGE_SIZE / (2 * sizeof(unsigned long));
4318                        m->private = n;
4319                        n = NULL;
4320                }
4321                kfree(n);
4322        }
4323        return ret;
4324}
4325
4326static const struct file_operations proc_slabstats_operations = {
4327        .open           = slabstats_open,
4328        .read           = seq_read,
4329        .llseek         = seq_lseek,
4330        .release        = seq_release_private,
4331};
4332#endif
4333
4334static int __init slab_proc_init(void)
4335{
4336#ifdef CONFIG_DEBUG_SLAB_LEAK
4337        proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);
4338#endif
4339        return 0;
4340}
4341module_init(slab_proc_init);
4342#endif
4343
4344/**
4345 * ksize - get the actual amount of memory allocated for a given object
4346 * @objp: Pointer to the object
4347 *
4348 * kmalloc may internally round up allocations and return more memory
4349 * than requested. ksize() can be used to determine the actual amount of
4350 * memory allocated. The caller may use this additional memory, even though
4351 * a smaller amount of memory was initially specified with the kmalloc call.
4352 * The caller must guarantee that objp points to a valid object previously
4353 * allocated with either kmalloc() or kmem_cache_alloc(). The object
4354 * must not be freed during the duration of the call.
4355 */
4356size_t ksize(const void *objp)
4357{
4358        BUG_ON(!objp);
4359        if (unlikely(objp == ZERO_SIZE_PTR))
4360                return 0;
4361
4362        return virt_to_cache(objp)->object_size;
4363}
4364EXPORT_SYMBOL(ksize);
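/*
 * Illustrative usage sketch (not part of slab.c): a caller may use the slack
 * that ksize() reports. With the default kmalloc caches a 17-byte request is
 * served from the 32-byte cache, so ksize() would return 32 here.
 *
 *	char *buf = kmalloc(17, GFP_KERNEL);
 *	if (buf) {
 *		size_t really = ksize(buf);
 *		memset(buf, 0, really);
 *		kfree(buf);
 *	}
 */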
4365