linux/mm/slub.c
   1/*
   2 * SLUB: A slab allocator that limits cache line use instead of queuing
   3 * objects in per cpu and per node lists.
   4 *
   5 * The allocator synchronizes using per slab locks or atomic operations
   6 * and only uses a centralized lock to manage a pool of partial slabs.
   7 *
   8 * (C) 2007 SGI, Christoph Lameter
   9 * (C) 2011 Linux Foundation, Christoph Lameter
  10 */
  11
  12#include <linux/mm.h>
  13#include <linux/swap.h> /* struct reclaim_state */
  14#include <linux/module.h>
  15#include <linux/bit_spinlock.h>
  16#include <linux/interrupt.h>
  17#include <linux/bitops.h>
  18#include <linux/slab.h>
  19#include "slab.h"
  20#include <linux/proc_fs.h>
  21#include <linux/notifier.h>
  22#include <linux/seq_file.h>
  23#include <linux/kmemcheck.h>
  24#include <linux/cpu.h>
  25#include <linux/cpuset.h>
  26#include <linux/mempolicy.h>
  27#include <linux/ctype.h>
  28#include <linux/debugobjects.h>
  29#include <linux/kallsyms.h>
  30#include <linux/memory.h>
  31#include <linux/math64.h>
  32#include <linux/fault-inject.h>
  33#include <linux/stacktrace.h>
  34#include <linux/prefetch.h>
  35#include <linux/memcontrol.h>
  36
  37#include <trace/events/kmem.h>
  38
  39#include "internal.h"
  40
  41/*
  42 * Lock order:
  43 *   1. slab_mutex (Global Mutex)
  44 *   2. node->list_lock
  45 *   3. slab_lock(page) (Only on some arches and for debugging)
  46 *
  47 *   slab_mutex
  48 *
  49 *   The role of the slab_mutex is to protect the list of all the slabs
  50 *   and to synchronize major metadata changes to slab cache structures.
  51 *
  52 *   The slab_lock is only used for debugging and on arches that do not
  53 *   have the ability to do a cmpxchg_double. It only protects the second
  54 *   double word in the page struct. Meaning
  55 *      A. page->freelist       -> List of free objects in a page
  56 *      B. page->counters       -> Counters of objects
  57 *      C. page->frozen         -> frozen state
  58 *
  59 *   If a slab is frozen then it is exempt from list management. It is not
  60 *   on any list. The processor that froze the slab is the one who can
  61 *   perform list operations on the page. Other processors may put objects
  62 *   onto the freelist but the processor that froze the slab is the only
  63 *   one that can retrieve the objects from the page's freelist.
  64 *
  65 *   The list_lock protects the partial and full list on each node and
  66 *   the partial slab counter. If it is taken then no slabs may be added to
  67 *   or removed from the lists, nor may the number of partial slabs be modified.
  68 *   (Note that the total number of slabs is an atomic value that may be
  69 *   modified without taking the list lock).
  70 *
  71 *   The list_lock is a centralized lock and thus we avoid taking it as
  72 *   much as possible. As long as SLUB does not have to handle partial
  73 *   slabs, operations can continue without any centralized lock. F.e.
  74 *   allocating a long series of objects that fill up slabs does not require
  75 *   the list lock.
  76 *   Interrupts are disabled during allocation and deallocation in order to
  77 *   make the slab allocator safe to use in the context of an irq. They are
  78 *   also disabled to ensure that the current processor does not change
  79 *   while handling per cpu slabs, due to kernel preemption.
  80 *
  81 * SLUB assigns one slab for allocation to each processor.
  82 * Allocations only occur from these slabs called cpu slabs.
  83 *
  84 * Slabs with free elements are kept on a partial list and during regular
  85 * operations no list for full slabs is used. If an object in a full slab is
  86 * freed then the slab will show up again on the partial lists.
  87 * We track full slabs for debugging purposes though because otherwise we
  88 * cannot scan all objects.
  89 *
  90 * Slabs are freed when they become empty. Teardown and setup is
  91 * minimal so we rely on the page allocators per cpu caches for
  92 * fast frees and allocs.
  93 *
  94 * Overloading of page flags that are otherwise used for LRU management.
  95 *
  96 * PageActive           The slab is frozen and exempt from list processing.
  97 *                      This means that the slab is dedicated to a purpose
  98 *                      such as satisfying allocations for a specific
  99 *                      processor. Objects may be freed in the slab while
 100 *                      it is frozen but slab_free will then skip the usual
 101 *                      list operations. It is up to the processor holding
 102 *                      the slab to integrate the slab into the slab lists
 103 *                      when the slab is no longer needed.
 104 *
 105 *                      One use of this flag is to mark slabs that are
 106 *                      used for allocations. Then such a slab becomes a cpu
 107 *                      slab. The cpu slab may be equipped with an additional
 108 *                      freelist that allows lockless access to
 109 *                      free objects in addition to the regular freelist
 110 *                      that requires the slab lock.
 111 *
 112 * PageError            Slab requires special handling due to debug
 113 *                      options set. This moves slab handling out of
 114 *                      the fast path and disables lockless freelists.
 115 */
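/*
 * Illustrative sketch (not taken from this file; "foo_cache" is a
 * hypothetical cache): the fast path described above roughly plays out as
 *
 *	p = kmem_cache_alloc(foo_cache, GFP_KERNEL);
 *		- served from the frozen per cpu slab (PageActive set),
 *		  without taking node->list_lock
 *	kmem_cache_free(foo_cache, p);
 *		- if the slab is still frozen the object is pushed onto its
 *		  freelist with a cmpxchg; list handling is left to the cpu
 *		  that froze the slab
 */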
 116
 117static inline int kmem_cache_debug(struct kmem_cache *s)
 118{
 119#ifdef CONFIG_SLUB_DEBUG
 120        return unlikely(s->flags & SLAB_DEBUG_FLAGS);
 121#else
 122        return 0;
 123#endif
 124}
 125
 126static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
 127{
 128#ifdef CONFIG_SLUB_CPU_PARTIAL
 129        return !kmem_cache_debug(s);
 130#else
 131        return false;
 132#endif
 133}
 134
 135/*
 136 * Issues still to be resolved:
 137 *
 138 * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
 139 *
 140 * - Variable sizing of the per node arrays
 141 */
 142
 143/* Enable to test recovery from slab corruption on boot */
 144#undef SLUB_RESILIENCY_TEST
 145
 146/* Enable to log cmpxchg failures */
 147#undef SLUB_DEBUG_CMPXCHG
 148
 149/*
 150 * Minimum number of partial slabs. These will be left on the partial
 151 * lists even if they are empty. kmem_cache_shrink may reclaim them.
 152 */
 153#define MIN_PARTIAL 5
 154
 155/*
 156 * Maximum number of desirable partial slabs.
 157 * The existence of more partial slabs makes kmem_cache_shrink
 158 * sort the partial list by the number of objects in use.
 159 */
 160#define MAX_PARTIAL 10
 161
 162#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
 163                                SLAB_POISON | SLAB_STORE_USER)
 164
 165/*
 166 * Debugging flags that require metadata to be stored in the slab.  These get
 167 * disabled when slub_debug=O is used and a cache's min order increases with
 168 * metadata.
 169 */
 170#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
 171
 172/*
 173 * Set of flags that will prevent slab merging
 174 */
 175#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
 176                SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
 177                SLAB_FAILSLAB)
 178
 179#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
 180                SLAB_CACHE_DMA | SLAB_NOTRACK)
 181
 182#define OO_SHIFT        16
 183#define OO_MASK         ((1 << OO_SHIFT) - 1)
 184#define MAX_OBJS_PER_PAGE       32767 /* since page.objects is u15 */
 185
 186/* Internal SLUB flags */
 187#define __OBJECT_POISON         0x80000000UL /* Poison object */
 188#define __CMPXCHG_DOUBLE        0x40000000UL /* Use cmpxchg_double */
 189
 190#ifdef CONFIG_SMP
 191static struct notifier_block slab_notifier;
 192#endif
 193
 194/*
 195 * Tracking user of a slab.
 196 */
 197#define TRACK_ADDRS_COUNT 16
 198struct track {
 199        unsigned long addr;     /* Called from address */
 200#ifdef CONFIG_STACKTRACE
 201        unsigned long addrs[TRACK_ADDRS_COUNT]; /* Call stack of the caller */
 202#endif
 203        int cpu;                /* Was running on cpu */
 204        int pid;                /* Pid context */
 205        unsigned long when;     /* When did the operation occur */
 206};
 207
 208enum track_item { TRACK_ALLOC, TRACK_FREE };
 209
 210#ifdef CONFIG_SYSFS
 211static int sysfs_slab_add(struct kmem_cache *);
 212static int sysfs_slab_alias(struct kmem_cache *, const char *);
 213static void sysfs_slab_remove(struct kmem_cache *);
 214static void memcg_propagate_slab_attrs(struct kmem_cache *s);
 215#else
 216static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
 217static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
 218                                                        { return 0; }
 219static inline void sysfs_slab_remove(struct kmem_cache *s) { }
 220
 221static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { }
 222#endif
 223
 224static inline void stat(const struct kmem_cache *s, enum stat_item si)
 225{
 226#ifdef CONFIG_SLUB_STATS
 227        __this_cpu_inc(s->cpu_slab->stat[si]);
 228#endif
 229}
 230
 231/********************************************************************
 232 *                      Core slab cache functions
 233 *******************************************************************/
 234
 235static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
 236{
 237        return s->node[node];
 238}
 239
 240/* Verify that a pointer has an address that is valid within a slab page */
 241static inline int check_valid_pointer(struct kmem_cache *s,
 242                                struct page *page, const void *object)
 243{
 244        void *base;
 245
 246        if (!object)
 247                return 1;
 248
 249        base = page_address(page);
 250        if (object < base || object >= base + page->objects * s->size ||
 251                (object - base) % s->size) {
 252                return 0;
 253        }
 254
 255        return 1;
 256}
 257
 258static inline void *get_freepointer(struct kmem_cache *s, void *object)
 259{
 260        return *(void **)(object + s->offset);
 261}
 262
 263static void prefetch_freepointer(const struct kmem_cache *s, void *object)
 264{
 265        prefetch(object + s->offset);
 266}
 267
 268static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
 269{
 270        void *p;
 271
 272#ifdef CONFIG_DEBUG_PAGEALLOC
 273        probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p));
 274#else
 275        p = get_freepointer(s, object);
 276#endif
 277        return p;
 278}
 279
 280static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
 281{
 282        *(void **)(object + s->offset) = fp;
 283}
 284
 285/* Loop over all objects in a slab */
 286#define for_each_object(__p, __s, __addr, __objects) \
 287        for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\
 288                        __p += (__s)->size)
 289
 290/* Determine object index from a given position */
 291static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
 292{
 293        return (p - addr) / s->size;
 294}
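/*
 * Illustrative example (assuming s->size == 64 and addr is the start of the
 * slab): for_each_object() visits addr, addr + 64, addr + 128, ... and
 * slab_index() inverts that mapping, e.g. slab_index(addr + 128, s, addr)
 * returns 2.
 */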
 295
 296static inline size_t slab_ksize(const struct kmem_cache *s)
 297{
 298#ifdef CONFIG_SLUB_DEBUG
 299        /*
 300         * Debugging requires use of the padding between object
 301         * and whatever may come after it.
 302         */
 303        if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
 304                return s->object_size;
 305
 306#endif
 307        /*
 308         * If we have the need to store the freelist pointer
 309         * back there or track user information then we can
 310         * only use the space before that information.
 311         */
 312        if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
 313                return s->inuse;
 314        /*
 315         * Else we can use all the padding etc for the allocation
 316         */
 317        return s->size;
 318}
 319
 320static inline int order_objects(int order, unsigned long size, int reserved)
 321{
 322        return ((PAGE_SIZE << order) - reserved) / size;
 323}
 324
 325static inline struct kmem_cache_order_objects oo_make(int order,
 326                unsigned long size, int reserved)
 327{
 328        struct kmem_cache_order_objects x = {
 329                (order << OO_SHIFT) + order_objects(order, size, reserved)
 330        };
 331
 332        return x;
 333}
 334
 335static inline int oo_order(struct kmem_cache_order_objects x)
 336{
 337        return x.x >> OO_SHIFT;
 338}
 339
 340static inline int oo_objects(struct kmem_cache_order_objects x)
 341{
 342        return x.x & OO_MASK;
 343}
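/*
 * Worked example (hypothetical values, PAGE_SIZE == 4096): for order 3,
 * size 256 and no reserved bytes, order_objects() gives
 * (4096 << 3) / 256 == 128 objects, so oo_make(3, 256, 0) packs
 * (3 << OO_SHIFT) + 128 into a single word and oo_order()/oo_objects()
 * recover 3 and 128 from it.
 */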
 344
 345/*
 346 * Per slab locking using the pagelock
 347 */
 348static __always_inline void slab_lock(struct page *page)
 349{
 350        bit_spin_lock(PG_locked, &page->flags);
 351}
 352
 353static __always_inline void slab_unlock(struct page *page)
 354{
 355        __bit_spin_unlock(PG_locked, &page->flags);
 356}
 357
 358static inline void set_page_slub_counters(struct page *page, unsigned long counters_new)
 359{
 360        struct page tmp;
 361        tmp.counters = counters_new;
 362        /*
 363         * page->counters can cover frozen/inuse/objects as well
 364         * as page->_count.  If we assign to ->counters directly
 365         * we run the risk of losing updates to page->_count, so
 366         * be careful and only assign to the fields we need.
 367         */
 368        page->frozen  = tmp.frozen;
 369        page->inuse   = tmp.inuse;
 370        page->objects = tmp.objects;
 371}
 372
 373/* Interrupts must be disabled (for the fallback code to work right) */
 374static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
 375                void *freelist_old, unsigned long counters_old,
 376                void *freelist_new, unsigned long counters_new,
 377                const char *n)
 378{
 379        VM_BUG_ON(!irqs_disabled());
 380#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
 381    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
 382        if (s->flags & __CMPXCHG_DOUBLE) {
 383                if (cmpxchg_double(&page->freelist, &page->counters,
 384                        freelist_old, counters_old,
 385                        freelist_new, counters_new))
 386                return 1;
 387        } else
 388#endif
 389        {
 390                slab_lock(page);
 391                if (page->freelist == freelist_old &&
 392                                        page->counters == counters_old) {
 393                        page->freelist = freelist_new;
 394                        set_page_slub_counters(page, counters_new);
 395                        slab_unlock(page);
 396                        return 1;
 397                }
 398                slab_unlock(page);
 399        }
 400
 401        cpu_relax();
 402        stat(s, CMPXCHG_DOUBLE_FAIL);
 403
 404#ifdef SLUB_DEBUG_CMPXCHG
 405        printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name);
 406#endif
 407
 408        return 0;
 409}
 410
 411static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
 412                void *freelist_old, unsigned long counters_old,
 413                void *freelist_new, unsigned long counters_new,
 414                const char *n)
 415{
 416#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
 417    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
 418        if (s->flags & __CMPXCHG_DOUBLE) {
 419                if (cmpxchg_double(&page->freelist, &page->counters,
 420                        freelist_old, counters_old,
 421                        freelist_new, counters_new))
 422                return 1;
 423        } else
 424#endif
 425        {
 426                unsigned long flags;
 427
 428                local_irq_save(flags);
 429                slab_lock(page);
 430                if (page->freelist == freelist_old &&
 431                                        page->counters == counters_old) {
 432                        page->freelist = freelist_new;
 433                        set_page_slub_counters(page, counters_new);
 434                        slab_unlock(page);
 435                        local_irq_restore(flags);
 436                        return 1;
 437                }
 438                slab_unlock(page);
 439                local_irq_restore(flags);
 440        }
 441
 442        cpu_relax();
 443        stat(s, CMPXCHG_DOUBLE_FAIL);
 444
 445#ifdef SLUB_DEBUG_CMPXCHG
 446        printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name);
 447#endif
 448
 449        return 0;
 450}
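/*
 * Sketch of the usual caller pattern (compare acquire_slab() below; the
 * "derive ..." steps are placeholders):
 *
 *	do {
 *		old_freelist = page->freelist;
 *		old_counters = page->counters;
 *		new_counters = <derive new inuse/frozen from old_counters>;
 *		new_freelist = <derive new freelist head>;
 *	} while (!cmpxchg_double_slab(s, page,
 *			old_freelist, old_counters,
 *			new_freelist, new_counters, "caller"));
 */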
 451
 452#ifdef CONFIG_SLUB_DEBUG
 453/*
 454 * Determine a map of objects in use on a page.
 455 *
 456 * The node's list_lock must be held to guarantee that the page does
 457 * not vanish from under us.
 458 */
 459static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
 460{
 461        void *p;
 462        void *addr = page_address(page);
 463
 464        for (p = page->freelist; p; p = get_freepointer(s, p))
 465                set_bit(slab_index(p, s, addr), map);
 466}
 467
 468/*
 469 * Debug settings:
 470 */
 471#ifdef CONFIG_SLUB_DEBUG_ON
 472static int slub_debug = DEBUG_DEFAULT_FLAGS;
 473#else
 474static int slub_debug;
 475#endif
 476
 477static char *slub_debug_slabs;
 478static int disable_higher_order_debug;
 479
 480/*
 481 * Object debugging
 482 */
 483static void print_section(char *text, u8 *addr, unsigned int length)
 484{
 485        print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr,
 486                        length, 1);
 487}
 488
 489static struct track *get_track(struct kmem_cache *s, void *object,
 490        enum track_item alloc)
 491{
 492        struct track *p;
 493
 494        if (s->offset)
 495                p = object + s->offset + sizeof(void *);
 496        else
 497                p = object + s->inuse;
 498
 499        return p + alloc;
 500}
 501
 502static void set_track(struct kmem_cache *s, void *object,
 503                        enum track_item alloc, unsigned long addr)
 504{
 505        struct track *p = get_track(s, object, alloc);
 506
 507        if (addr) {
 508#ifdef CONFIG_STACKTRACE
 509                struct stack_trace trace;
 510                int i;
 511
 512                trace.nr_entries = 0;
 513                trace.max_entries = TRACK_ADDRS_COUNT;
 514                trace.entries = p->addrs;
 515                trace.skip = 3;
 516                save_stack_trace(&trace);
 517
 518                /* See rant in lockdep.c */
 519                if (trace.nr_entries != 0 &&
 520                    trace.entries[trace.nr_entries - 1] == ULONG_MAX)
 521                        trace.nr_entries--;
 522
 523                for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++)
 524                        p->addrs[i] = 0;
 525#endif
 526                p->addr = addr;
 527                p->cpu = smp_processor_id();
 528                p->pid = current->pid;
 529                p->when = jiffies;
 530        } else
 531                memset(p, 0, sizeof(struct track));
 532}
 533
 534static void init_tracking(struct kmem_cache *s, void *object)
 535{
 536        if (!(s->flags & SLAB_STORE_USER))
 537                return;
 538
 539        set_track(s, object, TRACK_FREE, 0UL);
 540        set_track(s, object, TRACK_ALLOC, 0UL);
 541}
 542
 543static void print_track(const char *s, struct track *t)
 544{
 545        if (!t->addr)
 546                return;
 547
 548        printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
 549                s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid);
 550#ifdef CONFIG_STACKTRACE
 551        {
 552                int i;
 553                for (i = 0; i < TRACK_ADDRS_COUNT; i++)
 554                        if (t->addrs[i])
 555                                printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]);
 556                        else
 557                                break;
 558        }
 559#endif
 560}
 561
 562static void print_tracking(struct kmem_cache *s, void *object)
 563{
 564        if (!(s->flags & SLAB_STORE_USER))
 565                return;
 566
 567        print_track("Allocated", get_track(s, object, TRACK_ALLOC));
 568        print_track("Freed", get_track(s, object, TRACK_FREE));
 569}
 570
 571static void print_page_info(struct page *page)
 572{
 573        printk(KERN_ERR
 574               "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n",
 575               page, page->objects, page->inuse, page->freelist, page->flags);
 576
 577}
 578
 579static void slab_bug(struct kmem_cache *s, char *fmt, ...)
 580{
 581        va_list args;
 582        char buf[100];
 583
 584        va_start(args, fmt);
 585        vsnprintf(buf, sizeof(buf), fmt, args);
 586        va_end(args);
 587        printk(KERN_ERR "========================================"
 588                        "=====================================\n");
 589        printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf);
 590        printk(KERN_ERR "----------------------------------------"
 591                        "-------------------------------------\n\n");
 592
 593        add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
 594}
 595
 596static void slab_fix(struct kmem_cache *s, char *fmt, ...)
 597{
 598        va_list args;
 599        char buf[100];
 600
 601        va_start(args, fmt);
 602        vsnprintf(buf, sizeof(buf), fmt, args);
 603        va_end(args);
 604        printk(KERN_ERR "FIX %s: %s\n", s->name, buf);
 605}
 606
 607static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
 608{
 609        unsigned int off;       /* Offset of last byte */
 610        u8 *addr = page_address(page);
 611
 612        print_tracking(s, p);
 613
 614        print_page_info(page);
 615
 616        printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
 617                        p, p - addr, get_freepointer(s, p));
 618
 619        if (p > addr + 16)
 620                print_section("Bytes b4 ", p - 16, 16);
 621
 622        print_section("Object ", p, min_t(unsigned long, s->object_size,
 623                                PAGE_SIZE));
 624        if (s->flags & SLAB_RED_ZONE)
 625                print_section("Redzone ", p + s->object_size,
 626                        s->inuse - s->object_size);
 627
 628        if (s->offset)
 629                off = s->offset + sizeof(void *);
 630        else
 631                off = s->inuse;
 632
 633        if (s->flags & SLAB_STORE_USER)
 634                off += 2 * sizeof(struct track);
 635
 636        if (off != s->size)
 637                /* Beginning of the filler is the free pointer */
 638                print_section("Padding ", p + off, s->size - off);
 639
 640        dump_stack();
 641}
 642
 643static void object_err(struct kmem_cache *s, struct page *page,
 644                        u8 *object, char *reason)
 645{
 646        slab_bug(s, "%s", reason);
 647        print_trailer(s, page, object);
 648}
 649
 650static void slab_err(struct kmem_cache *s, struct page *page,
 651                        const char *fmt, ...)
 652{
 653        va_list args;
 654        char buf[100];
 655
 656        va_start(args, fmt);
 657        vsnprintf(buf, sizeof(buf), fmt, args);
 658        va_end(args);
 659        slab_bug(s, "%s", buf);
 660        print_page_info(page);
 661        dump_stack();
 662}
 663
 664static void init_object(struct kmem_cache *s, void *object, u8 val)
 665{
 666        u8 *p = object;
 667
 668        if (s->flags & __OBJECT_POISON) {
 669                memset(p, POISON_FREE, s->object_size - 1);
 670                p[s->object_size - 1] = POISON_END;
 671        }
 672
 673        if (s->flags & SLAB_RED_ZONE)
 674                memset(p + s->object_size, val, s->inuse - s->object_size);
 675}
 676
 677static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
 678                                                void *from, void *to)
 679{
 680        slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data);
 681        memset(from, data, to - from);
 682}
 683
 684static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
 685                        u8 *object, char *what,
 686                        u8 *start, unsigned int value, unsigned int bytes)
 687{
 688        u8 *fault;
 689        u8 *end;
 690
 691        fault = memchr_inv(start, value, bytes);
 692        if (!fault)
 693                return 1;
 694
 695        end = start + bytes;
 696        while (end > fault && end[-1] == value)
 697                end--;
 698
 699        slab_bug(s, "%s overwritten", what);
 700        printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",
 701                                        fault, end - 1, fault[0], value);
 702        print_trailer(s, page, object);
 703
 704        restore_bytes(s, what, value, fault, end);
 705        return 0;
 706}
 707
 708/*
 709 * Object layout:
 710 *
 711 * object address
 712 *      Bytes of the object to be managed.
 713 *      If the freepointer may overlay the object then the free
 714 *      pointer is the first word of the object.
 715 *
 716 *      Poisoning uses 0x6b (POISON_FREE) and the last byte is
 717 *      0xa5 (POISON_END)
 718 *
 719 * object + s->object_size
 720 *      Padding to reach word boundary. This is also used for Redzoning.
 721 *      Padding is extended by another word if Redzoning is enabled and
 722 *      object_size == inuse.
 723 *
 724 *      We fill with 0xbb (RED_INACTIVE) for inactive objects and with
 725 *      0xcc (RED_ACTIVE) for objects in use.
 726 *
 727 * object + s->inuse
 728 *      Meta data starts here.
 729 *
 730 *      A. Free pointer (if we cannot overwrite object on free)
 731 *      B. Tracking data for SLAB_STORE_USER
 732 *      C. Padding to reach required alignment boundary or at minimum
 733 *              one word if debugging is on to be able to detect writes
 734 *              before the word boundary.
 735 *
 736 *      Padding is done using 0x5a (POISON_INUSE)
 737 *
 738 * object + s->size
 739 *      Nothing is used beyond s->size.
 740 *
 741 * If slabcaches are merged then the object_size and inuse boundaries are
 742 * mostly ignored, and therefore no slab options that rely on these
 743 * boundaries may be used with merged slabcaches.
 744 */
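/*
 * Worked example (hypothetical cache on 64 bit: object_size == 24, flags
 * SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER, no constructor): one extra
 * word after the object holds the red zone bytes (s->inuse == 32), the free
 * pointer is relocated to offset 32 because poisoning overwrites the object,
 * two struct track records for alloc/free tracking follow it, and the
 * remaining bytes up to s->size are POISON_INUSE padding.
 */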
 745
 746static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
 747{
 748        unsigned long off = s->inuse;   /* The end of info */
 749
 750        if (s->offset)
 751                /* Freepointer is placed after the object. */
 752                off += sizeof(void *);
 753
 754        if (s->flags & SLAB_STORE_USER)
 755                /* We also have user information there */
 756                off += 2 * sizeof(struct track);
 757
 758        if (s->size == off)
 759                return 1;
 760
 761        return check_bytes_and_report(s, page, p, "Object padding",
 762                                p + off, POISON_INUSE, s->size - off);
 763}
 764
 765/* Check the pad bytes at the end of a slab page */
 766static int slab_pad_check(struct kmem_cache *s, struct page *page)
 767{
 768        u8 *start;
 769        u8 *fault;
 770        u8 *end;
 771        int length;
 772        int remainder;
 773
 774        if (!(s->flags & SLAB_POISON))
 775                return 1;
 776
 777        start = page_address(page);
 778        length = (PAGE_SIZE << compound_order(page)) - s->reserved;
 779        end = start + length;
 780        remainder = length % s->size;
 781        if (!remainder)
 782                return 1;
 783
 784        fault = memchr_inv(end - remainder, POISON_INUSE, remainder);
 785        if (!fault)
 786                return 1;
 787        while (end > fault && end[-1] == POISON_INUSE)
 788                end--;
 789
 790        slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
 791        print_section("Padding ", end - remainder, remainder);
 792
 793        restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end);
 794        return 0;
 795}
 796
 797static int check_object(struct kmem_cache *s, struct page *page,
 798                                        void *object, u8 val)
 799{
 800        u8 *p = object;
 801        u8 *endobject = object + s->object_size;
 802
 803        if (s->flags & SLAB_RED_ZONE) {
 804                if (!check_bytes_and_report(s, page, object, "Redzone",
 805                        endobject, val, s->inuse - s->object_size))
 806                        return 0;
 807        } else {
 808                if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) {
 809                        check_bytes_and_report(s, page, p, "Alignment padding",
 810                                endobject, POISON_INUSE,
 811                                s->inuse - s->object_size);
 812                }
 813        }
 814
 815        if (s->flags & SLAB_POISON) {
 816                if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
 817                        (!check_bytes_and_report(s, page, p, "Poison", p,
 818                                        POISON_FREE, s->object_size - 1) ||
 819                         !check_bytes_and_report(s, page, p, "Poison",
 820                                p + s->object_size - 1, POISON_END, 1)))
 821                        return 0;
 822                /*
 823                 * check_pad_bytes cleans up on its own.
 824                 */
 825                check_pad_bytes(s, page, p);
 826        }
 827
 828        if (!s->offset && val == SLUB_RED_ACTIVE)
 829                /*
 830                 * Object and freepointer overlap. Cannot check
 831                 * freepointer while object is allocated.
 832                 */
 833                return 1;
 834
 835        /* Check free pointer validity */
 836        if (!check_valid_pointer(s, page, get_freepointer(s, p))) {
 837                object_err(s, page, p, "Freepointer corrupt");
 838                /*
 839                 * No choice but to zap it and thus lose the remainder
 840                 * of the free objects in this slab. May cause
 841                 * another error because the object count is now wrong.
 842                 */
 843                set_freepointer(s, p, NULL);
 844                return 0;
 845        }
 846        return 1;
 847}
 848
 849static int check_slab(struct kmem_cache *s, struct page *page)
 850{
 851        int maxobj;
 852
 853        VM_BUG_ON(!irqs_disabled());
 854
 855        if (!PageSlab(page)) {
 856                slab_err(s, page, "Not a valid slab page");
 857                return 0;
 858        }
 859
 860        maxobj = order_objects(compound_order(page), s->size, s->reserved);
 861        if (page->objects > maxobj) {
 862                slab_err(s, page, "objects %u > max %u",
 863                        page->objects, maxobj);
 864                return 0;
 865        }
 866        if (page->inuse > page->objects) {
 867                slab_err(s, page, "inuse %u > max %u",
 868                        page->inuse, page->objects);
 869                return 0;
 870        }
 871        /* Slab_pad_check fixes things up after itself */
 872        slab_pad_check(s, page);
 873        return 1;
 874}
 875
 876/*
 877 * Determine if a certain object on a page is on the freelist. Must hold the
 878 * slab lock to guarantee that the chains are in a consistent state.
 879 */
 880static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
 881{
 882        int nr = 0;
 883        void *fp;
 884        void *object = NULL;
 885        unsigned long max_objects;
 886
 887        fp = page->freelist;
 888        while (fp && nr <= page->objects) {
 889                if (fp == search)
 890                        return 1;
 891                if (!check_valid_pointer(s, page, fp)) {
 892                        if (object) {
 893                                object_err(s, page, object,
 894                                        "Freechain corrupt");
 895                                set_freepointer(s, object, NULL);
 896                        } else {
 897                                slab_err(s, page, "Freepointer corrupt");
 898                                page->freelist = NULL;
 899                                page->inuse = page->objects;
 900                                slab_fix(s, "Freelist cleared");
 901                                return 0;
 902                        }
 903                        break;
 904                }
 905                object = fp;
 906                fp = get_freepointer(s, object);
 907                nr++;
 908        }
 909
 910        max_objects = order_objects(compound_order(page), s->size, s->reserved);
 911        if (max_objects > MAX_OBJS_PER_PAGE)
 912                max_objects = MAX_OBJS_PER_PAGE;
 913
 914        if (page->objects != max_objects) {
 915                slab_err(s, page, "Wrong number of objects. Found %d but "
 916                        "should be %d", page->objects, max_objects);
 917                page->objects = max_objects;
 918                slab_fix(s, "Number of objects adjusted.");
 919        }
 920        if (page->inuse != page->objects - nr) {
 921                slab_err(s, page, "Wrong object count. Counter is %d but "
 922                        "counted were %d", page->inuse, page->objects - nr);
 923                page->inuse = page->objects - nr;
 924                slab_fix(s, "Object count adjusted.");
 925        }
 926        return search == NULL;
 927}
 928
 929static void trace(struct kmem_cache *s, struct page *page, void *object,
 930                                                                int alloc)
 931{
 932        if (s->flags & SLAB_TRACE) {
 933                printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
 934                        s->name,
 935                        alloc ? "alloc" : "free",
 936                        object, page->inuse,
 937                        page->freelist);
 938
 939                if (!alloc)
 940                        print_section("Object ", (void *)object,
 941                                        s->object_size);
 942
 943                dump_stack();
 944        }
 945}
 946
 947/*
 948 * Hooks for other subsystems that check memory allocations. In a typical
 949 * production configuration these hooks all should produce no code at all.
 950 */
 951static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
 952{
 953        kmemleak_alloc(ptr, size, 1, flags);
 954}
 955
 956static inline void kfree_hook(const void *x)
 957{
 958        kmemleak_free(x);
 959}
 960
 961static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
 962{
 963        flags &= gfp_allowed_mask;
 964        lockdep_trace_alloc(flags);
 965        might_sleep_if(flags & __GFP_WAIT);
 966
 967        return should_failslab(s->object_size, flags, s->flags);
 968}
 969
 970static inline void slab_post_alloc_hook(struct kmem_cache *s,
 971                                        gfp_t flags, void *object)
 972{
 973        flags &= gfp_allowed_mask;
 974        kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
 975        kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags);
 976}
 977
 978static inline void slab_free_hook(struct kmem_cache *s, void *x)
 979{
 980        kmemleak_free_recursive(x, s->flags);
 981
 982        /*
 983         * Trouble is that we may no longer disable interrupts in the fast path.
 984         * So in order to make the debug calls that expect irqs to be
 985         * disabled we need to disable interrupts temporarily.
 986         */
 987#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP)
 988        {
 989                unsigned long flags;
 990
 991                local_irq_save(flags);
 992                kmemcheck_slab_free(s, x, s->object_size);
 993                debug_check_no_locks_freed(x, s->object_size);
 994                local_irq_restore(flags);
 995        }
 996#endif
 997        if (!(s->flags & SLAB_DEBUG_OBJECTS))
 998                debug_check_no_obj_freed(x, s->object_size);
 999}
1000
1001/*
1002 * Tracking of fully allocated slabs for debugging purposes.
1003 */
1004static void add_full(struct kmem_cache *s,
1005        struct kmem_cache_node *n, struct page *page)
1006{
1007        if (!(s->flags & SLAB_STORE_USER))
1008                return;
1009
1010        lockdep_assert_held(&n->list_lock);
1011        list_add(&page->lru, &n->full);
1012}
1013
1014static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page)
1015{
1016        if (!(s->flags & SLAB_STORE_USER))
1017                return;
1018
1019        lockdep_assert_held(&n->list_lock);
1020        list_del(&page->lru);
1021}
1022
1023/* Tracking of the number of slabs for debugging purposes */
1024static inline unsigned long slabs_node(struct kmem_cache *s, int node)
1025{
1026        struct kmem_cache_node *n = get_node(s, node);
1027
1028        return atomic_long_read(&n->nr_slabs);
1029}
1030
1031static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
1032{
1033        return atomic_long_read(&n->nr_slabs);
1034}
1035
1036static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
1037{
1038        struct kmem_cache_node *n = get_node(s, node);
1039
1040        /*
1041         * May be called early in order to allocate a slab for the
1042         * kmem_cache_node structure. Solve the chicken-egg
1043         * dilemma by deferring the increment of the count during
1044         * bootstrap (see early_kmem_cache_node_alloc).
1045         */
1046        if (likely(n)) {
1047                atomic_long_inc(&n->nr_slabs);
1048                atomic_long_add(objects, &n->total_objects);
1049        }
1050}
1051static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
1052{
1053        struct kmem_cache_node *n = get_node(s, node);
1054
1055        atomic_long_dec(&n->nr_slabs);
1056        atomic_long_sub(objects, &n->total_objects);
1057}
1058
1059/* Object debug checks for alloc/free paths */
1060static void setup_object_debug(struct kmem_cache *s, struct page *page,
1061                                                                void *object)
1062{
1063        if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
1064                return;
1065
1066        init_object(s, object, SLUB_RED_INACTIVE);
1067        init_tracking(s, object);
1068}
1069
1070static noinline int alloc_debug_processing(struct kmem_cache *s,
1071                                        struct page *page,
1072                                        void *object, unsigned long addr)
1073{
1074        if (!check_slab(s, page))
1075                goto bad;
1076
1077        if (!check_valid_pointer(s, page, object)) {
1078                object_err(s, page, object, "Freelist Pointer check fails");
1079                goto bad;
1080        }
1081
1082        if (!check_object(s, page, object, SLUB_RED_INACTIVE))
1083                goto bad;
1084
1085        /* Success. Perform special debug activities for allocs */
1086        if (s->flags & SLAB_STORE_USER)
1087                set_track(s, object, TRACK_ALLOC, addr);
1088        trace(s, page, object, 1);
1089        init_object(s, object, SLUB_RED_ACTIVE);
1090        return 1;
1091
1092bad:
1093        if (PageSlab(page)) {
1094                /*
1095                 * If this is a slab page then let's do the best we can
1096                 * to avoid issues in the future. Marking all objects
1097                 * as used avoids touching the remaining objects.
1098                 */
1099                slab_fix(s, "Marking all objects used");
1100                page->inuse = page->objects;
1101                page->freelist = NULL;
1102        }
1103        return 0;
1104}
1105
1106static noinline struct kmem_cache_node *free_debug_processing(
1107        struct kmem_cache *s, struct page *page, void *object,
1108        unsigned long addr, unsigned long *flags)
1109{
1110        struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1111
1112        spin_lock_irqsave(&n->list_lock, *flags);
1113        slab_lock(page);
1114
1115        if (!check_slab(s, page))
1116                goto fail;
1117
1118        if (!check_valid_pointer(s, page, object)) {
1119                slab_err(s, page, "Invalid object pointer 0x%p", object);
1120                goto fail;
1121        }
1122
1123        if (on_freelist(s, page, object)) {
1124                object_err(s, page, object, "Object already free");
1125                goto fail;
1126        }
1127
1128        if (!check_object(s, page, object, SLUB_RED_ACTIVE))
1129                goto out;
1130
1131        if (unlikely(s != page->slab_cache)) {
1132                if (!PageSlab(page)) {
1133                        slab_err(s, page, "Attempt to free object(0x%p) "
1134                                "outside of slab", object);
1135                } else if (!page->slab_cache) {
1136                        printk(KERN_ERR
1137                                "SLUB <none>: no slab for object 0x%p.\n",
1138                                                object);
1139                        dump_stack();
1140                } else
1141                        object_err(s, page, object,
1142                                        "page slab pointer corrupt.");
1143                goto fail;
1144        }
1145
1146        if (s->flags & SLAB_STORE_USER)
1147                set_track(s, object, TRACK_FREE, addr);
1148        trace(s, page, object, 0);
1149        init_object(s, object, SLUB_RED_INACTIVE);
1150out:
1151        slab_unlock(page);
1152        /*
1153         * Keep the node's list_lock to preserve integrity
1154         * until the object is actually freed
1155         */
1156        return n;
1157
1158fail:
1159        slab_unlock(page);
1160        spin_unlock_irqrestore(&n->list_lock, *flags);
1161        slab_fix(s, "Object at 0x%p not freed", object);
1162        return NULL;
1163}
1164
1165static int __init setup_slub_debug(char *str)
1166{
1167        slub_debug = DEBUG_DEFAULT_FLAGS;
1168        if (*str++ != '=' || !*str)
1169                /*
1170                 * No options specified. Switch on full debugging.
1171                 */
1172                goto out;
1173
1174        if (*str == ',')
1175                /*
1176                 * No options but restriction on slabs. This means full
1177                 * debugging for slabs matching a pattern.
1178                 */
1179                goto check_slabs;
1180
1181        if (tolower(*str) == 'o') {
1182                /*
1183                 * Avoid enabling debugging on caches if its minimum order
1184                 * would increase as a result.
1185                 */
1186                disable_higher_order_debug = 1;
1187                goto out;
1188        }
1189
1190        slub_debug = 0;
1191        if (*str == '-')
1192                /*
1193                 * Switch off all debugging measures.
1194                 */
1195                goto out;
1196
1197        /*
1198         * Determine which debug features should be switched on
1199         */
1200        for (; *str && *str != ','; str++) {
1201                switch (tolower(*str)) {
1202                case 'f':
1203                        slub_debug |= SLAB_DEBUG_FREE;
1204                        break;
1205                case 'z':
1206                        slub_debug |= SLAB_RED_ZONE;
1207                        break;
1208                case 'p':
1209                        slub_debug |= SLAB_POISON;
1210                        break;
1211                case 'u':
1212                        slub_debug |= SLAB_STORE_USER;
1213                        break;
1214                case 't':
1215                        slub_debug |= SLAB_TRACE;
1216                        break;
1217                case 'a':
1218                        slub_debug |= SLAB_FAILSLAB;
1219                        break;
1220                default:
1221                        printk(KERN_ERR "slub_debug option '%c' "
1222                                "unknown. skipped\n", *str);
1223                }
1224        }
1225
1226check_slabs:
1227        if (*str == ',')
1228                slub_debug_slabs = str + 1;
1229out:
1230        return 1;
1231}
1232
1233__setup("slub_debug", setup_slub_debug);
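/*
 * Examples of boot parameters accepted by the parser above (see also
 * Documentation/vm/slub.txt):
 *
 *	slub_debug		enable all debugging for every slab
 *	slub_debug=,dentry	full debugging, but only for the dentry cache
 *	slub_debug=FZ		sanity checks and red zoning for all slabs
 *	slub_debug=O		debugging on, except where the metadata would
 *				raise the minimum slab order
 *	slub_debug=-		switch all debugging off
 */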
1234
1235static unsigned long kmem_cache_flags(unsigned long object_size,
1236        unsigned long flags, const char *name,
1237        void (*ctor)(void *))
1238{
1239        /*
1240         * Enable debugging if selected on the kernel commandline.
1241         */
1242        if (slub_debug && (!slub_debug_slabs || (name &&
1243                !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)))))
1244                flags |= slub_debug;
1245
1246        return flags;
1247}
1248#else
1249static inline void setup_object_debug(struct kmem_cache *s,
1250                        struct page *page, void *object) {}
1251
1252static inline int alloc_debug_processing(struct kmem_cache *s,
1253        struct page *page, void *object, unsigned long addr) { return 0; }
1254
1255static inline struct kmem_cache_node *free_debug_processing(
1256        struct kmem_cache *s, struct page *page, void *object,
1257        unsigned long addr, unsigned long *flags) { return NULL; }
1258
1259static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
1260                        { return 1; }
1261static inline int check_object(struct kmem_cache *s, struct page *page,
1262                        void *object, u8 val) { return 1; }
1263static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
1264                                        struct page *page) {}
1265static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
1266                                        struct page *page) {}
1267static inline unsigned long kmem_cache_flags(unsigned long object_size,
1268        unsigned long flags, const char *name,
1269        void (*ctor)(void *))
1270{
1271        return flags;
1272}
1273#define slub_debug 0
1274
1275#define disable_higher_order_debug 0
1276
1277static inline unsigned long slabs_node(struct kmem_cache *s, int node)
1278                                                        { return 0; }
1279static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
1280                                                        { return 0; }
1281static inline void inc_slabs_node(struct kmem_cache *s, int node,
1282                                                        int objects) {}
1283static inline void dec_slabs_node(struct kmem_cache *s, int node,
1284                                                        int objects) {}
1285
1286static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
1287{
1288        kmemleak_alloc(ptr, size, 1, flags);
1289}
1290
1291static inline void kfree_hook(const void *x)
1292{
1293        kmemleak_free(x);
1294}
1295
1296static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
1297                                                        { return 0; }
1298
1299static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
1300                void *object)
1301{
1302        kmemleak_alloc_recursive(object, s->object_size, 1, s->flags,
1303                flags & gfp_allowed_mask);
1304}
1305
1306static inline void slab_free_hook(struct kmem_cache *s, void *x)
1307{
1308        kmemleak_free_recursive(x, s->flags);
1309}
1310
1311#endif /* CONFIG_SLUB_DEBUG */
1312
1313/*
1314 * Slab allocation and freeing
1315 */
1316static inline struct page *alloc_slab_page(gfp_t flags, int node,
1317                                        struct kmem_cache_order_objects oo)
1318{
1319        int order = oo_order(oo);
1320
1321        flags |= __GFP_NOTRACK;
1322
1323        if (node == NUMA_NO_NODE)
1324                return alloc_pages(flags, order);
1325        else
1326                return alloc_pages_exact_node(node, flags, order);
1327}
1328
1329static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1330{
1331        struct page *page;
1332        struct kmem_cache_order_objects oo = s->oo;
1333        gfp_t alloc_gfp;
1334
1335        flags &= gfp_allowed_mask;
1336
1337        if (flags & __GFP_WAIT)
1338                local_irq_enable();
1339
1340        flags |= s->allocflags;
1341
1342        /*
1343         * Let the initial higher-order allocation fail under memory pressure
1344         * so we fall-back to the minimum order allocation.
1345         */
1346        alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
1347
1348        page = alloc_slab_page(alloc_gfp, node, oo);
1349        if (unlikely(!page)) {
1350                oo = s->min;
1351                /*
1352                 * Allocation may have failed due to fragmentation.
1353                 * Try a lower order alloc if possible
1354                 */
1355                page = alloc_slab_page(flags, node, oo);
1356
1357                if (page)
1358                        stat(s, ORDER_FALLBACK);
1359        }
1360
1361        if (kmemcheck_enabled && page
1362                && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
1363                int pages = 1 << oo_order(oo);
1364
1365                kmemcheck_alloc_shadow(page, oo_order(oo), flags, node);
1366
1367                /*
1368                 * Objects from caches that have a constructor don't get
1369                 * cleared when they're allocated, so we need to do it here.
1370                 */
1371                if (s->ctor)
1372                        kmemcheck_mark_uninitialized_pages(page, pages);
1373                else
1374                        kmemcheck_mark_unallocated_pages(page, pages);
1375        }
1376
1377        if (flags & __GFP_WAIT)
1378                local_irq_disable();
1379        if (!page)
1380                return NULL;
1381
1382        page->objects = oo_objects(oo);
1383        mod_zone_page_state(page_zone(page),
1384                (s->flags & SLAB_RECLAIM_ACCOUNT) ?
1385                NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1386                1 << oo_order(oo));
1387
1388        return page;
1389}
1390
1391static void setup_object(struct kmem_cache *s, struct page *page,
1392                                void *object)
1393{
1394        setup_object_debug(s, page, object);
1395        if (unlikely(s->ctor))
1396                s->ctor(object);
1397}
1398
1399static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1400{
1401        struct page *page;
1402        void *start;
1403        void *last;
1404        void *p;
1405        int order;
1406
1407        BUG_ON(flags & GFP_SLAB_BUG_MASK);
1408
1409        page = allocate_slab(s,
1410                flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
1411        if (!page)
1412                goto out;
1413
1414        order = compound_order(page);
1415        inc_slabs_node(s, page_to_nid(page), page->objects);
1416        memcg_bind_pages(s, order);
1417        page->slab_cache = s;
1418        __SetPageSlab(page);
1419        if (page->pfmemalloc)
1420                SetPageSlabPfmemalloc(page);
1421
1422        start = page_address(page);
1423
1424        if (unlikely(s->flags & SLAB_POISON))
1425                memset(start, POISON_INUSE, PAGE_SIZE << order);
1426
1427        last = start;
1428        for_each_object(p, s, start, page->objects) {
1429                setup_object(s, page, last);
1430                set_freepointer(s, last, p);
1431                last = p;
1432        }
1433        setup_object(s, page, last);
1434        set_freepointer(s, last, NULL);
1435
1436        page->freelist = start;
1437        page->inuse = page->objects;
1438        page->frozen = 1;
1439out:
1440        return page;
1441}
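/*
 * After the loop above a freshly built slab looks like this (sketch with
 * three objects):
 *
 *	page->freelist -> obj0 -> obj1 -> obj2 -> NULL
 *
 * with page->inuse == page->objects and page->frozen == 1: the caller owns
 * every object and will install the chain as a cpu freelist.
 */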
1442
1443static void __free_slab(struct kmem_cache *s, struct page *page)
1444{
1445        int order = compound_order(page);
1446        int pages = 1 << order;
1447
1448        if (kmem_cache_debug(s)) {
1449                void *p;
1450
1451                slab_pad_check(s, page);
1452                for_each_object(p, s, page_address(page),
1453                                                page->objects)
1454                        check_object(s, page, p, SLUB_RED_INACTIVE);
1455        }
1456
1457        kmemcheck_free_shadow(page, compound_order(page));
1458
1459        mod_zone_page_state(page_zone(page),
1460                (s->flags & SLAB_RECLAIM_ACCOUNT) ?
1461                NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1462                -pages);
1463
1464        __ClearPageSlabPfmemalloc(page);
1465        __ClearPageSlab(page);
1466
1467        memcg_release_pages(s, order);
1468        page_mapcount_reset(page);
1469        if (current->reclaim_state)
1470                current->reclaim_state->reclaimed_slab += pages;
1471        __free_memcg_kmem_pages(page, order);
1472}
1473
1474#define need_reserve_slab_rcu                                           \
1475        (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
1476
1477static void rcu_free_slab(struct rcu_head *h)
1478{
1479        struct page *page;
1480
1481        if (need_reserve_slab_rcu)
1482                page = virt_to_head_page(h);
1483        else
1484                page = container_of((struct list_head *)h, struct page, lru);
1485
1486        __free_slab(page->slab_cache, page);
1487}
1488
1489static void free_slab(struct kmem_cache *s, struct page *page)
1490{
1491        if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
1492                struct rcu_head *head;
1493
1494                if (need_reserve_slab_rcu) {
1495                        int order = compound_order(page);
1496                        int offset = (PAGE_SIZE << order) - s->reserved;
1497
1498                        VM_BUG_ON(s->reserved != sizeof(*head));
1499                        head = page_address(page) + offset;
1500                } else {
1501                        /*
1502                         * RCU free overloads the RCU head over the LRU
1503                         */
1504                        head = (void *)&page->lru;
1505                }
1506
1507                call_rcu(head, rcu_free_slab);
1508        } else
1509                __free_slab(s, page);
1510}
1511
1512static void discard_slab(struct kmem_cache *s, struct page *page)
1513{
1514        dec_slabs_node(s, page_to_nid(page), page->objects);
1515        free_slab(s, page);
1516}
1517
1518/*
1519 * Management of partially allocated slabs.
1520 */
1521static inline void
1522__add_partial(struct kmem_cache_node *n, struct page *page, int tail)
1523{
1524        n->nr_partial++;
1525        if (tail == DEACTIVATE_TO_TAIL)
1526                list_add_tail(&page->lru, &n->partial);
1527        else
1528                list_add(&page->lru, &n->partial);
1529}
1530
1531static inline void add_partial(struct kmem_cache_node *n,
1532                                struct page *page, int tail)
1533{
1534        lockdep_assert_held(&n->list_lock);
1535        __add_partial(n, page, tail);
1536}
1537
1538static inline void
1539__remove_partial(struct kmem_cache_node *n, struct page *page)
1540{
1541        list_del(&page->lru);
1542        n->nr_partial--;
1543}
1544
1545static inline void remove_partial(struct kmem_cache_node *n,
1546                                        struct page *page)
1547{
1548        lockdep_assert_held(&n->list_lock);
1549        __remove_partial(n, page);
1550}
1551
1552/*
1553 * Remove slab from the partial list, freeze it and
1554 * return the pointer to the freelist.
1555 *
1556 * Returns a list of objects or NULL if it fails.
1557 */
1558static inline void *acquire_slab(struct kmem_cache *s,
1559                struct kmem_cache_node *n, struct page *page,
1560                int mode, int *objects)
1561{
1562        void *freelist;
1563        unsigned long counters;
1564        struct page new;
1565
1566        lockdep_assert_held(&n->list_lock);
1567
1568        /*
1569         * Zap the freelist and set the frozen bit.
1570         * The old freelist is the list of objects for the
1571         * per cpu allocation list.
1572         */
1573        freelist = page->freelist;
1574        counters = page->counters;
1575        new.counters = counters;
1576        *objects = new.objects - new.inuse;
1577        if (mode) {
1578                new.inuse = page->objects;
1579                new.freelist = NULL;
1580        } else {
1581                new.freelist = freelist;
1582        }
1583
1584        VM_BUG_ON(new.frozen);
1585        new.frozen = 1;
1586
1587        if (!__cmpxchg_double_slab(s, page,
1588                        freelist, counters,
1589                        new.freelist, new.counters,
1590                        "acquire_slab"))
1591                return NULL;
1592
1593        remove_partial(n, page);
1594        WARN_ON(!freelist);
1595        return freelist;
1596}
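    /*
     * Illustrative note: with mode != 0 (the first slab acquired in
     * get_partial_node()) the entire freelist is taken for the new cpu slab,
     * so new.inuse is set to page->objects and page->freelist becomes NULL.
     * Subsequent slabs are acquired with mode == 0 and keep their freelist
     * so that they can be parked on the per cpu partial list.
     */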
1597
1598static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain);
1599static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags);
1600
1601/*
1602 * Try to allocate a partial slab from a specific node.
1603 */
1604static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
1605                                struct kmem_cache_cpu *c, gfp_t flags)
1606{
1607        struct page *page, *page2;
1608        void *object = NULL;
1609        int available = 0;
1610        int objects;
1611
1612        /*
1613         * Racy check. If we mistakenly see no partial slabs then we
1614         * just allocate an empty slab. If we mistakenly try to get a
1615         * partial slab and there is none available then get_partial_node()
1616         * will return NULL.
1617         */
1618        if (!n || !n->nr_partial)
1619                return NULL;
1620
1621        spin_lock(&n->list_lock);
1622        list_for_each_entry_safe(page, page2, &n->partial, lru) {
1623                void *t;
1624
1625                if (!pfmemalloc_match(page, flags))
1626                        continue;
1627
1628                t = acquire_slab(s, n, page, object == NULL, &objects);
1629                if (!t)
1630                        break;
1631
1632                available += objects;
1633                if (!object) {
1634                        c->page = page;
1635                        stat(s, ALLOC_FROM_PARTIAL);
1636                        object = t;
1637                } else {
1638                        put_cpu_partial(s, page, 0);
1639                        stat(s, CPU_PARTIAL_NODE);
1640                }
1641                if (!kmem_cache_has_cpu_partial(s)
1642                        || available > s->cpu_partial / 2)
1643                        break;
1644
1645        }
1646        spin_unlock(&n->list_lock);
1647        return object;
1648}
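    /*
     * Illustrative note: the first slab acquired above becomes the new cpu
     * slab (c->page); any further slabs are parked on the per cpu partial
     * list until roughly s->cpu_partial / 2 objects are available. Only
     * about half the limit is prefetched so that capacity remains for
     * objects freed back to these slabs later.
     */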
1649
1650/*
1651 * Get a page from somewhere. Search in increasing NUMA distances.
1652 */
1653static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
1654                struct kmem_cache_cpu *c)
1655{
1656#ifdef CONFIG_NUMA
1657        struct zonelist *zonelist;
1658        struct zoneref *z;
1659        struct zone *zone;
1660        enum zone_type high_zoneidx = gfp_zone(flags);
1661        void *object;
1662        unsigned int cpuset_mems_cookie;
1663
1664        /*
1665         * The defrag ratio allows a configuration of the tradeoffs between
1666         * inter node defragmentation and node local allocations. A lower
1667         * defrag_ratio increases the tendency to do local allocations
1668         * instead of attempting to obtain partial slabs from other nodes.
1669         *
1670         * If the defrag_ratio is set to 0 then kmalloc() always
1671         * returns node local objects. If the ratio is higher then kmalloc()
1672         * may return off node objects because partial slabs are obtained
1673         * from other nodes and filled up.
1674         *
1675         * If /sys/kernel/slab/xx/defrag_ratio is set to 100 (which makes
1676         * defrag_ratio = 1000) then every (well almost) allocation will
1677         * first attempt to defrag slab caches on other nodes. This means
1678         * scanning over all nodes to look for partial slabs which may be
1679         * expensive if we do it every time we are trying to find a slab
1680         * with available objects.
1681         */
1682        if (!s->remote_node_defrag_ratio ||
1683                        get_cycles() % 1024 > s->remote_node_defrag_ratio)
1684                return NULL;
1685
1686        do {
1687                cpuset_mems_cookie = get_mems_allowed();
1688                zonelist = node_zonelist(slab_node(), flags);
1689                for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1690                        struct kmem_cache_node *n;
1691
1692                        n = get_node(s, zone_to_nid(zone));
1693
1694                        if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1695                                        n->nr_partial > s->min_partial) {
1696                                object = get_partial_node(s, n, c, flags);
1697                                if (object) {
1698                                        /*
1699                                         * Return the object even if
1700                                         * put_mems_allowed indicated that
1701                                         * the cpuset mems_allowed was
1702                                         * updated in parallel. It's a
1703                                         * harmless race between the alloc
1704                                         * and the cpuset update.
1705                                         */
1706                                        put_mems_allowed(cpuset_mems_cookie);
1707                                        return object;
1708                                }
1709                        }
1710                }
1711        } while (!put_mems_allowed(cpuset_mems_cookie));
1712#endif
1713        return NULL;
1714}
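    /*
     * Worked example (illustrative): with the default
     * remote_node_defrag_ratio of 1000 set in kmem_cache_open(),
     * get_cycles() % 1024 exceeds the ratio only for the values 1001..1023,
     * so roughly 98% of slow path invocations go on to scan remote nodes.
     * A ratio of 0 disables the remote scan entirely.
     */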
1715
1716/*
1717 * Get a partial page, freeze it and return its freelist.
1718 */
1719static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
1720                struct kmem_cache_cpu *c)
1721{
1722        void *object;
1723        int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node;
1724
1725        object = get_partial_node(s, get_node(s, searchnode), c, flags);
1726        if (object || node != NUMA_NO_NODE)
1727                return object;
1728
1729        return get_any_partial(s, flags, c);
1730}
1731
1732#ifdef CONFIG_PREEMPT
1733/*
1734 * Calculate the next globally unique transaction for disambiguation
1735 * during cmpxchg. The transactions start with the cpu number and are then
1736 * incremented by TID_STEP (CONFIG_NR_CPUS rounded up to a power of two).
1737 */
1738#define TID_STEP  roundup_pow_of_two(CONFIG_NR_CPUS)
1739#else
1740/*
1741 * No preemption supported therefore also no need to check for
1742 * different cpus.
1743 */
1744#define TID_STEP 1
1745#endif
1746
1747static inline unsigned long next_tid(unsigned long tid)
1748{
1749        return tid + TID_STEP;
1750}
1751
1752static inline unsigned int tid_to_cpu(unsigned long tid)
1753{
1754        return tid % TID_STEP;
1755}
1756
1757static inline unsigned long tid_to_event(unsigned long tid)
1758{
1759        return tid / TID_STEP;
1760}
1761
1762static inline unsigned int init_tid(int cpu)
1763{
1764        return cpu;
1765}
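    /*
     * Worked example (illustrative, assuming CONFIG_PREEMPT and
     * CONFIG_NR_CPUS=4, hence TID_STEP=4): cpu 2 starts with tid 2 and
     * advances through 6, 10, 14, ...; tid_to_cpu(10) == 2 and
     * tid_to_event(10) == 2. A cmpxchg that observes a stale tid therefore
     * indicates either a migration to another cpu or an intervening
     * allocation/free on the same cpu.
     */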
1766
1767static inline void note_cmpxchg_failure(const char *n,
1768                const struct kmem_cache *s, unsigned long tid)
1769{
1770#ifdef SLUB_DEBUG_CMPXCHG
1771        unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
1772
1773        printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name);
1774
1775#ifdef CONFIG_PREEMPT
1776        if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
1777                printk("due to cpu change %d -> %d\n",
1778                        tid_to_cpu(tid), tid_to_cpu(actual_tid));
1779        else
1780#endif
1781        if (tid_to_event(tid) != tid_to_event(actual_tid))
1782                printk("due to cpu running other code. Event %ld->%ld\n",
1783                        tid_to_event(tid), tid_to_event(actual_tid));
1784        else
1785                printk("for unknown reason: actual=%lx was=%lx target=%lx\n",
1786                        actual_tid, tid, next_tid(tid));
1787#endif
1788        stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
1789}
1790
1791static void init_kmem_cache_cpus(struct kmem_cache *s)
1792{
1793        int cpu;
1794
1795        for_each_possible_cpu(cpu)
1796                per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
1797}
1798
1799/*
1800 * Remove the cpu slab
1801 */
1802static void deactivate_slab(struct kmem_cache *s, struct page *page,
1803                                void *freelist)
1804{
1805        enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
1806        struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1807        int lock = 0;
1808        enum slab_modes l = M_NONE, m = M_NONE;
1809        void *nextfree;
1810        int tail = DEACTIVATE_TO_HEAD;
1811        struct page new;
1812        struct page old;
1813
1814        if (page->freelist) {
1815                stat(s, DEACTIVATE_REMOTE_FREES);
1816                tail = DEACTIVATE_TO_TAIL;
1817        }
1818
1819        /*
1820         * Stage one: Free all available per cpu objects back
1821         * to the page freelist while it is still frozen. Leave the
1822         * last one.
1823         *
1824         * There is no need to take the list->lock because the page
1825         * is still frozen.
1826         */
1827        while (freelist && (nextfree = get_freepointer(s, freelist))) {
1828                void *prior;
1829                unsigned long counters;
1830
1831                do {
1832                        prior = page->freelist;
1833                        counters = page->counters;
1834                        set_freepointer(s, freelist, prior);
1835                        new.counters = counters;
1836                        new.inuse--;
1837                        VM_BUG_ON(!new.frozen);
1838
1839                } while (!__cmpxchg_double_slab(s, page,
1840                        prior, counters,
1841                        freelist, new.counters,
1842                        "drain percpu freelist"));
1843
1844                freelist = nextfree;
1845        }
1846
1847        /*
1848         * Stage two: Ensure that the page is unfrozen while the
1849         * list presence reflects the actual number of objects
1850         * during unfreeze.
1851         *
1852         * We setup the list membership and then perform a cmpxchg
1853         * with the count. If there is a mismatch then the page
1854         * is not unfrozen but the page is on the wrong list.
1855         *
1856         * Then we restart the process which may have to remove
1857         * the page from the list that we just put it on again
1858         * because the number of objects in the slab may have
1859         * changed.
1860         */
1861redo:
1862
1863        old.freelist = page->freelist;
1864        old.counters = page->counters;
1865        VM_BUG_ON(!old.frozen);
1866
1867        /* Determine target state of the slab */
1868        new.counters = old.counters;
1869        if (freelist) {
1870                new.inuse--;
1871                set_freepointer(s, freelist, old.freelist);
1872                new.freelist = freelist;
1873        } else
1874                new.freelist = old.freelist;
1875
1876        new.frozen = 0;
1877
1878        if (!new.inuse && n->nr_partial > s->min_partial)
1879                m = M_FREE;
1880        else if (new.freelist) {
1881                m = M_PARTIAL;
1882                if (!lock) {
1883                        lock = 1;
1884                        /*
1885                         * Taking the spinlock removes the possibility
1886                         * that acquire_slab() will see a slab page that
1887                         * is frozen
1888                         */
1889                        spin_lock(&n->list_lock);
1890                }
1891        } else {
1892                m = M_FULL;
1893                if (kmem_cache_debug(s) && !lock) {
1894                        lock = 1;
1895                        /*
1896                         * This also ensures that the scanning of full
1897                         * slabs from diagnostic functions will not see
1898                         * any frozen slabs.
1899                         */
1900                        spin_lock(&n->list_lock);
1901                }
1902        }
1903
1904        if (l != m) {
1905
1906                if (l == M_PARTIAL)
1907
1908                        remove_partial(n, page);
1909
1910                else if (l == M_FULL)
1911
1912                        remove_full(s, n, page);
1913
1914                if (m == M_PARTIAL) {
1915
1916                        add_partial(n, page, tail);
1917                        stat(s, tail);
1918
1919                } else if (m == M_FULL) {
1920
1921                        stat(s, DEACTIVATE_FULL);
1922                        add_full(s, n, page);
1923
1924                }
1925        }
1926
1927        l = m;
1928        if (!__cmpxchg_double_slab(s, page,
1929                                old.freelist, old.counters,
1930                                new.freelist, new.counters,
1931                                "unfreezing slab"))
1932                goto redo;
1933
1934        if (lock)
1935                spin_unlock(&n->list_lock);
1936
1937        if (m == M_FREE) {
1938                stat(s, DEACTIVATE_EMPTY);
1939                discard_slab(s, page);
1940                stat(s, FREE_SLAB);
1941        }
1942}
1943
1944/*
1945 * Unfreeze all the cpu partial slabs.
1946 *
1947 * This function must be called with interrupts disabled
1948 * for the cpu using c (or some other mechanism must be in place
1949 * to guarantee no concurrent accesses).
1950 */
1951static void unfreeze_partials(struct kmem_cache *s,
1952                struct kmem_cache_cpu *c)
1953{
1954#ifdef CONFIG_SLUB_CPU_PARTIAL
1955        struct kmem_cache_node *n = NULL, *n2 = NULL;
1956        struct page *page, *discard_page = NULL;
1957
1958        while ((page = c->partial)) {
1959                struct page new;
1960                struct page old;
1961
1962                c->partial = page->next;
1963
1964                n2 = get_node(s, page_to_nid(page));
1965                if (n != n2) {
1966                        if (n)
1967                                spin_unlock(&n->list_lock);
1968
1969                        n = n2;
1970                        spin_lock(&n->list_lock);
1971                }
1972
1973                do {
1974
1975                        old.freelist = page->freelist;
1976                        old.counters = page->counters;
1977                        VM_BUG_ON(!old.frozen);
1978
1979                        new.counters = old.counters;
1980                        new.freelist = old.freelist;
1981
1982                        new.frozen = 0;
1983
1984                } while (!__cmpxchg_double_slab(s, page,
1985                                old.freelist, old.counters,
1986                                new.freelist, new.counters,
1987                                "unfreezing slab"));
1988
1989                if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) {
1990                        page->next = discard_page;
1991                        discard_page = page;
1992                } else {
1993                        add_partial(n, page, DEACTIVATE_TO_TAIL);
1994                        stat(s, FREE_ADD_PARTIAL);
1995                }
1996        }
1997
1998        if (n)
1999                spin_unlock(&n->list_lock);
2000
2001        while (discard_page) {
2002                page = discard_page;
2003                discard_page = discard_page->next;
2004
2005                stat(s, DEACTIVATE_EMPTY);
2006                discard_slab(s, page);
2007                stat(s, FREE_SLAB);
2008        }
2009#endif
2010}
2011
2012/*
2013 * Put a page that was just frozen (in __slab_free) into a partial page
2014 * slot if available. This is done without disabling interrupts and without
2015 * disabling preemption. The cmpxchg is racy and may put the partial page
2016 * onto a random cpu's partial slot.
2017 *
2018 * If we did not find a slot then simply move all the partials to the
2019 * per node partial list.
2020 */
2021static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
2022{
2023#ifdef CONFIG_SLUB_CPU_PARTIAL
2024        struct page *oldpage;
2025        int pages;
2026        int pobjects;
2027
2028        do {
2029                pages = 0;
2030                pobjects = 0;
2031                oldpage = this_cpu_read(s->cpu_slab->partial);
2032
2033                if (oldpage) {
2034                        pobjects = oldpage->pobjects;
2035                        pages = oldpage->pages;
2036                        if (drain && pobjects > s->cpu_partial) {
2037                                unsigned long flags;
2038                                /*
2039                                 * partial array is full. Move the existing
2040                                 * set to the per node partial list.
2041                                 */
2042                                local_irq_save(flags);
2043                                unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
2044                                local_irq_restore(flags);
2045                                oldpage = NULL;
2046                                pobjects = 0;
2047                                pages = 0;
2048                                stat(s, CPU_PARTIAL_DRAIN);
2049                        }
2050                }
2051
2052                pages++;
2053                pobjects += page->objects - page->inuse;
2054
2055                page->pages = pages;
2056                page->pobjects = pobjects;
2057                page->next = oldpage;
2058
2059        } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
2060                                                                != oldpage);
2061#endif
2062}
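    /*
     * Illustrative note: pages and pobjects accumulate along the chain of
     * per cpu partial pages (each new head stores the running totals).
     * Once the accumulated pobjects exceeds s->cpu_partial and drain is
     * set, the whole chain is handed to the per node partial lists via
     * unfreeze_partials() and the new page starts a fresh chain.
     */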
2063
2064static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
2065{
2066        stat(s, CPUSLAB_FLUSH);
2067        deactivate_slab(s, c->page, c->freelist);
2068
2069        c->tid = next_tid(c->tid);
2070        c->page = NULL;
2071        c->freelist = NULL;
2072}
2073
2074/*
2075 * Flush cpu slab.
2076 *
2077 * Called from IPI handler with interrupts disabled.
2078 */
2079static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
2080{
2081        struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
2082
2083        if (likely(c)) {
2084                if (c->page)
2085                        flush_slab(s, c);
2086
2087                unfreeze_partials(s, c);
2088        }
2089}
2090
2091static void flush_cpu_slab(void *d)
2092{
2093        struct kmem_cache *s = d;
2094
2095        __flush_cpu_slab(s, smp_processor_id());
2096}
2097
2098static bool has_cpu_slab(int cpu, void *info)
2099{
2100        struct kmem_cache *s = info;
2101        struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
2102
2103        return c->page || c->partial;
2104}
2105
2106static void flush_all(struct kmem_cache *s)
2107{
2108        on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
2109}
2110
2111/*
2112 * Check if the objects in a per cpu structure fit numa
2113 * locality expectations.
2114 */
2115static inline int node_match(struct page *page, int node)
2116{
2117#ifdef CONFIG_NUMA
2118        if (!page || (node != NUMA_NO_NODE && page_to_nid(page) != node))
2119                return 0;
2120#endif
2121        return 1;
2122}
2123
2124static int count_free(struct page *page)
2125{
2126        return page->objects - page->inuse;
2127}
2128
2129static unsigned long count_partial(struct kmem_cache_node *n,
2130                                        int (*get_count)(struct page *))
2131{
2132        unsigned long flags;
2133        unsigned long x = 0;
2134        struct page *page;
2135
2136        spin_lock_irqsave(&n->list_lock, flags);
2137        list_for_each_entry(page, &n->partial, lru)
2138                x += get_count(page);
2139        spin_unlock_irqrestore(&n->list_lock, flags);
2140        return x;
2141}
2142
2143static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
2144{
2145#ifdef CONFIG_SLUB_DEBUG
2146        return atomic_long_read(&n->total_objects);
2147#else
2148        return 0;
2149#endif
2150}
2151
2152static noinline void
2153slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
2154{
2155        int node;
2156
2157        printk(KERN_WARNING
2158                "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n",
2159                nid, gfpflags);
2160        printk(KERN_WARNING "  cache: %s, object size: %d, buffer size: %d, "
2161                "default order: %d, min order: %d\n", s->name, s->object_size,
2162                s->size, oo_order(s->oo), oo_order(s->min));
2163
2164        if (oo_order(s->min) > get_order(s->object_size))
2165                printk(KERN_WARNING "  %s debugging increased min order, use "
2166                       "slub_debug=O to disable.\n", s->name);
2167
2168        for_each_online_node(node) {
2169                struct kmem_cache_node *n = get_node(s, node);
2170                unsigned long nr_slabs;
2171                unsigned long nr_objs;
2172                unsigned long nr_free;
2173
2174                if (!n)
2175                        continue;
2176
2177                nr_free  = count_partial(n, count_free);
2178                nr_slabs = node_nr_slabs(n);
2179                nr_objs  = node_nr_objs(n);
2180
2181                printk(KERN_WARNING
2182                        "  node %d: slabs: %ld, objs: %ld, free: %ld\n",
2183                        node, nr_slabs, nr_objs, nr_free);
2184        }
2185}
2186
2187static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
2188                        int node, struct kmem_cache_cpu **pc)
2189{
2190        void *freelist;
2191        struct kmem_cache_cpu *c = *pc;
2192        struct page *page;
2193
2194        freelist = get_partial(s, flags, node, c);
2195
2196        if (freelist)
2197                return freelist;
2198
2199        page = new_slab(s, flags, node);
2200        if (page) {
2201                c = __this_cpu_ptr(s->cpu_slab);
2202                if (c->page)
2203                        flush_slab(s, c);
2204
2205                /*
2206                 * No other reference to the page yet so we can
2207                 * muck around with it freely without cmpxchg
2208                 */
2209                freelist = page->freelist;
2210                page->freelist = NULL;
2211
2212                stat(s, ALLOC_SLAB);
2213                c->page = page;
2214                *pc = c;
2215        } else
2216                freelist = NULL;
2217
2218        return freelist;
2219}
2220
2221static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags)
2222{
2223        if (unlikely(PageSlabPfmemalloc(page)))
2224                return gfp_pfmemalloc_allowed(gfpflags);
2225
2226        return true;
2227}
2228
2229/*
2230 * Check the page->freelist of a page and either transfer the freelist to the
2231 * per cpu freelist or deactivate the page.
2232 *
2233 * The page is still frozen if the return value is not NULL.
2234 *
2235 * If this function returns NULL then the page has been unfrozen.
2236 *
2237 * This function must be called with interrupts disabled.
2238 */
2239static inline void *get_freelist(struct kmem_cache *s, struct page *page)
2240{
2241        struct page new;
2242        unsigned long counters;
2243        void *freelist;
2244
2245        do {
2246                freelist = page->freelist;
2247                counters = page->counters;
2248
2249                new.counters = counters;
2250                VM_BUG_ON(!new.frozen);
2251
2252                new.inuse = page->objects;
2253                new.frozen = freelist != NULL;
2254
2255        } while (!__cmpxchg_double_slab(s, page,
2256                freelist, counters,
2257                NULL, new.counters,
2258                "get_freelist"));
2259
2260        return freelist;
2261}
2262
2263/*
2264 * Slow path. The lockless freelist is empty or we need to perform
2265 * debugging duties.
2266 *
2267 * Processing is still very fast if new objects have been freed to the
2268 * regular freelist. In that case we simply take over the regular freelist
2269 * as the lockless freelist and zap the regular freelist.
2270 *
2271 * If that is not working then we fall back to the partial lists. We take the
2272 * first element of the freelist as the object to allocate now and move the
2273 * rest of the freelist to the lockless freelist.
2274 *
2275 * And if we were unable to get a new slab from the partial slab lists then
2276 * we need to allocate a new slab. This is the slowest path since it involves
2277 * a call to the page allocator and the setup of a new slab.
2278 */
2279static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
2280                          unsigned long addr, struct kmem_cache_cpu *c)
2281{
2282        void *freelist;
2283        struct page *page;
2284        unsigned long flags;
2285
2286        local_irq_save(flags);
2287#ifdef CONFIG_PREEMPT
2288        /*
2289         * We may have been preempted and rescheduled on a different
2290         * cpu before disabling interrupts. Need to reload cpu area
2291         * pointer.
2292         */
2293        c = this_cpu_ptr(s->cpu_slab);
2294#endif
2295
2296        page = c->page;
2297        if (!page)
2298                goto new_slab;
2299redo:
2300
2301        if (unlikely(!node_match(page, node))) {
2302                stat(s, ALLOC_NODE_MISMATCH);
2303                deactivate_slab(s, page, c->freelist);
2304                c->page = NULL;
2305                c->freelist = NULL;
2306                goto new_slab;
2307        }
2308
2309        /*
2310         * By rights, we should be searching for a slab page that was
2311         * PFMEMALLOC but right now, we are losing the pfmemalloc
2312         * information when the page leaves the per-cpu allocator
2313         */
2314        if (unlikely(!pfmemalloc_match(page, gfpflags))) {
2315                deactivate_slab(s, page, c->freelist);
2316                c->page = NULL;
2317                c->freelist = NULL;
2318                goto new_slab;
2319        }
2320
2321        /* must check again c->freelist in case of cpu migration or IRQ */
2322        freelist = c->freelist;
2323        if (freelist)
2324                goto load_freelist;
2325
2326        stat(s, ALLOC_SLOWPATH);
2327
2328        freelist = get_freelist(s, page);
2329
2330        if (!freelist) {
2331                c->page = NULL;
2332                stat(s, DEACTIVATE_BYPASS);
2333                goto new_slab;
2334        }
2335
2336        stat(s, ALLOC_REFILL);
2337
2338load_freelist:
2339        /*
2340         * freelist is pointing to the list of objects to be used.
2341         * page is pointing to the page from which the objects are obtained.
2342         * That page must be frozen for per cpu allocations to work.
2343         */
2344        VM_BUG_ON(!c->page->frozen);
2345        c->freelist = get_freepointer(s, freelist);
2346        c->tid = next_tid(c->tid);
2347        local_irq_restore(flags);
2348        return freelist;
2349
2350new_slab:
2351
2352        if (c->partial) {
2353                page = c->page = c->partial;
2354                c->partial = page->next;
2355                stat(s, CPU_PARTIAL_ALLOC);
2356                c->freelist = NULL;
2357                goto redo;
2358        }
2359
2360        freelist = new_slab_objects(s, gfpflags, node, &c);
2361
2362        if (unlikely(!freelist)) {
2363                if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
2364                        slab_out_of_memory(s, gfpflags, node);
2365
2366                local_irq_restore(flags);
2367                return NULL;
2368        }
2369
2370        page = c->page;
2371        if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
2372                goto load_freelist;
2373
2374        /* Only entered in the debug case */
2375        if (kmem_cache_debug(s) &&
2376                        !alloc_debug_processing(s, page, freelist, addr))
2377                goto new_slab;  /* Slab failed checks. Next slab needed */
2378
2379        deactivate_slab(s, page, get_freepointer(s, freelist));
2380        c->page = NULL;
2381        c->freelist = NULL;
2382        local_irq_restore(flags);
2383        return freelist;
2384}
2385
2386/*
2387 * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
2388 * have the fastpath folded into their functions. So no function call
2389 * overhead for requests that can be satisfied on the fastpath.
2390 *
2391 * The fastpath works by first checking if the lockless freelist can be used.
2392 * If not then __slab_alloc is called for slow processing.
2393 *
2394 * Otherwise we can simply pick the next object from the lockless free list.
2395 */
2396static __always_inline void *slab_alloc_node(struct kmem_cache *s,
2397                gfp_t gfpflags, int node, unsigned long addr)
2398{
2399        void **object;
2400        struct kmem_cache_cpu *c;
2401        struct page *page;
2402        unsigned long tid;
2403
2404        if (slab_pre_alloc_hook(s, gfpflags))
2405                return NULL;
2406
2407        s = memcg_kmem_get_cache(s, gfpflags);
2408redo:
2409        /*
2410         * Must read kmem_cache cpu data via this cpu ptr. Preemption is
2411         * enabled. We may switch back and forth between cpus while
2412         * reading from one cpu area. That does not matter as long
2413         * as we end up on the original cpu again when doing the cmpxchg.
2414         *
2415         * Preemption is disabled for the retrieval of the tid because that
2416         * must occur from the current processor. We cannot allow rescheduling
2417         * on a different processor between the determination of the pointer
2418         * and the retrieval of the tid.
2419         */
2420        preempt_disable();
2421        c = __this_cpu_ptr(s->cpu_slab);
2422
2423        /*
2424         * The transaction ids are globally unique per cpu and per operation on
2425         * a per cpu queue. Thus they can guarantee that the cmpxchg_double
2426         * occurs on the right processor and that there was no operation on the
2427         * linked list in between.
2428         */
2429        tid = c->tid;
2430        preempt_enable();
2431
2432        object = c->freelist;
2433        page = c->page;
2434        if (unlikely(!object || !node_match(page, node)))
2435                object = __slab_alloc(s, gfpflags, node, addr, c);
2436
2437        else {
2438                void *next_object = get_freepointer_safe(s, object);
2439
2440                /*
2441                 * The cmpxchg will only match if there was no additional
2442                 * operation and if we are on the right processor.
2443                 *
2444                 * The cmpxchg does the following atomically (without lock
2445                 * semantics!)
2446                 * 1. Relocate first pointer to the current per cpu area.
2447                 * 2. Verify that tid and freelist have not been changed
2448                 * 3. If they were not changed replace tid and freelist
2449                 *
2450                 * Since this is without lock semantics the protection is only
2451                 * against code executing on this cpu *not* from access by
2452                 * other cpus.
2453                 */
2454                if (unlikely(!this_cpu_cmpxchg_double(
2455                                s->cpu_slab->freelist, s->cpu_slab->tid,
2456                                object, tid,
2457                                next_object, next_tid(tid)))) {
2458
2459                        note_cmpxchg_failure("slab_alloc", s, tid);
2460                        goto redo;
2461                }
2462                prefetch_freepointer(s, next_object);
2463                stat(s, ALLOC_FASTPATH);
2464        }
2465
2466        if (unlikely(gfpflags & __GFP_ZERO) && object)
2467                memset(object, 0, s->object_size);
2468
2469        slab_post_alloc_hook(s, gfpflags, object);
2470
2471        return object;
2472}
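    /*
     * Illustrative note: on the fastpath the this_cpu_cmpxchg_double()
     * above atomically swaps the per cpu pair (freelist, tid) from
     * (object, tid) to (next_object, tid + TID_STEP). If another operation
     * on this cpu (e.g. from an interrupt) or a migration to a cpu with a
     * different tid intervened, the exchange fails and the sequence is
     * retried from the redo label.
     */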
2473
2474static __always_inline void *slab_alloc(struct kmem_cache *s,
2475                gfp_t gfpflags, unsigned long addr)
2476{
2477        return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr);
2478}
2479
2480void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
2481{
2482        void *ret = slab_alloc(s, gfpflags, _RET_IP_);
2483
2484        trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size,
2485                                s->size, gfpflags);
2486
2487        return ret;
2488}
2489EXPORT_SYMBOL(kmem_cache_alloc);
2490
2491#ifdef CONFIG_TRACING
2492void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
2493{
2494        void *ret = slab_alloc(s, gfpflags, _RET_IP_);
2495        trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
2496        return ret;
2497}
2498EXPORT_SYMBOL(kmem_cache_alloc_trace);
2499#endif
2500
2501#ifdef CONFIG_NUMA
2502void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
2503{
2504        void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
2505
2506        trace_kmem_cache_alloc_node(_RET_IP_, ret,
2507                                    s->object_size, s->size, gfpflags, node);
2508
2509        return ret;
2510}
2511EXPORT_SYMBOL(kmem_cache_alloc_node);
2512
2513#ifdef CONFIG_TRACING
2514void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
2515                                    gfp_t gfpflags,
2516                                    int node, size_t size)
2517{
2518        void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
2519
2520        trace_kmalloc_node(_RET_IP_, ret,
2521                           size, s->size, gfpflags, node);
2522        return ret;
2523}
2524EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
2525#endif
2526#endif
2527
2528/*
2529 * Slow path handling. This may still be called frequently since objects
2530 * have a longer lifetime than the cpu slabs in most processing loads.
2531 *
2532 * So we still attempt to reduce cache line usage. Just take the slab
2533 * lock and free the item. If there is no additional partial page
2534 * handling required then we can return immediately.
2535 */
2536static void __slab_free(struct kmem_cache *s, struct page *page,
2537                        void *x, unsigned long addr)
2538{
2539        void *prior;
2540        void **object = (void *)x;
2541        int was_frozen;
2542        struct page new;
2543        unsigned long counters;
2544        struct kmem_cache_node *n = NULL;
2545        unsigned long uninitialized_var(flags);
2546
2547        stat(s, FREE_SLOWPATH);
2548
2549        if (kmem_cache_debug(s) &&
2550                !(n = free_debug_processing(s, page, x, addr, &flags)))
2551                return;
2552
2553        do {
2554                if (unlikely(n)) {
2555                        spin_unlock_irqrestore(&n->list_lock, flags);
2556                        n = NULL;
2557                }
2558                prior = page->freelist;
2559                counters = page->counters;
2560                set_freepointer(s, object, prior);
2561                new.counters = counters;
2562                was_frozen = new.frozen;
2563                new.inuse--;
2564                if ((!new.inuse || !prior) && !was_frozen) {
2565
2566                        if (kmem_cache_has_cpu_partial(s) && !prior) {
2567
2568                                /*
2569                                 * Slab was on no list before and will be
2570                                 * partially empty.
2571                                 * We can defer the list move and instead
2572                                 * freeze it.
2573                                 */
2574                                new.frozen = 1;
2575
2576                        } else { /* Needs to be taken off a list */
2577
2578                                n = get_node(s, page_to_nid(page));
2579                                /*
2580                                 * Speculatively acquire the list_lock.
2581                                 * If the cmpxchg does not succeed then we may
2582                                 * drop the list_lock without any processing.
2583                                 *
2584                                 * Otherwise the list_lock will synchronize with
2585                                 * other processors updating the list of slabs.
2586                                 */
2587                                spin_lock_irqsave(&n->list_lock, flags);
2588
2589                        }
2590                }
2591
2592        } while (!cmpxchg_double_slab(s, page,
2593                prior, counters,
2594                object, new.counters,
2595                "__slab_free"));
2596
2597        if (likely(!n)) {
2598
2599                /*
2600                 * If we just froze the page then put it onto the
2601                 * per cpu partial list.
2602                 */
2603                if (new.frozen && !was_frozen) {
2604                        put_cpu_partial(s, page, 1);
2605                        stat(s, CPU_PARTIAL_FREE);
2606                }
2607                /*
2608                 * The list lock was not taken, therefore no list
2609                 * activity is necessary.
2610                 */
2611                if (was_frozen)
2612                        stat(s, FREE_FROZEN);
2613                return;
2614        }
2615
2616        if (unlikely(!new.inuse && n->nr_partial > s->min_partial))
2617                goto slab_empty;
2618
2619        /*
2620         * Objects left in the slab. If it was not on the partial list before
2621         * then add it.
2622         */
2623        if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
2624                if (kmem_cache_debug(s))
2625                        remove_full(s, n, page);
2626                add_partial(n, page, DEACTIVATE_TO_TAIL);
2627                stat(s, FREE_ADD_PARTIAL);
2628        }
2629        spin_unlock_irqrestore(&n->list_lock, flags);
2630        return;
2631
2632slab_empty:
2633        if (prior) {
2634                /*
2635                 * Slab on the partial list.
2636                 */
2637                remove_partial(n, page);
2638                stat(s, FREE_REMOVE_PARTIAL);
2639        } else {
2640                /* Slab must be on the full list */
2641                remove_full(s, n, page);
2642        }
2643
2644        spin_unlock_irqrestore(&n->list_lock, flags);
2645        stat(s, FREE_SLAB);
2646        discard_slab(s, page);
2647}
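    /*
     * Illustrative note: the slow free path above ends in one of three
     * ways: the slab was (or has just been) frozen and stays off the node
     * lists, possibly queued on a cpu partial list; the slab still holds
     * objects and may be added to the node partial list; or the slab became
     * empty while the node already has more than min_partial partial slabs
     * and is discarded back to the page allocator.
     */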
2648
2649/*
2650 * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
2651 * can perform fastpath freeing without additional function calls.
2652 *
2653 * The fastpath is only possible if we are freeing to the current cpu slab
2654 * of this processor. This is typically the case if we have just allocated
2655 * the item before.
2656 *
2657 * If fastpath is not possible then fall back to __slab_free where we deal
2658 * with all sorts of special processing.
2659 */
2660static __always_inline void slab_free(struct kmem_cache *s,
2661                        struct page *page, void *x, unsigned long addr)
2662{
2663        void **object = (void *)x;
2664        struct kmem_cache_cpu *c;
2665        unsigned long tid;
2666
2667        slab_free_hook(s, x);
2668
2669redo:
2670        /*
2671         * Determine the current cpu's per cpu slab.
2672         * The cpu may change afterward. However that does not matter since
2673         * data is retrieved via this pointer. If we are on the same cpu
2674         * during the cmpxchg then the free will succeed.
2675         */
2676        preempt_disable();
2677        c = __this_cpu_ptr(s->cpu_slab);
2678
2679        tid = c->tid;
2680        preempt_enable();
2681
2682        if (likely(page == c->page)) {
2683                set_freepointer(s, object, c->freelist);
2684
2685                if (unlikely(!this_cpu_cmpxchg_double(
2686                                s->cpu_slab->freelist, s->cpu_slab->tid,
2687                                c->freelist, tid,
2688                                object, next_tid(tid)))) {
2689
2690                        note_cmpxchg_failure("slab_free", s, tid);
2691                        goto redo;
2692                }
2693                stat(s, FREE_FASTPATH);
2694        } else
2695                __slab_free(s, page, x, addr);
2696
2697}
2698
2699void kmem_cache_free(struct kmem_cache *s, void *x)
2700{
2701        s = cache_from_obj(s, x);
2702        if (!s)
2703                return;
2704        slab_free(s, virt_to_head_page(x), x, _RET_IP_);
2705        trace_kmem_cache_free(_RET_IP_, x);
2706}
2707EXPORT_SYMBOL(kmem_cache_free);
2708
2709/*
2710 * Object placement in a slab is made very easy because we always start at
2711 * offset 0. If we tune the size of the object to the alignment then we can
2712 * get the required alignment by putting one properly sized object after
2713 * another.
2714 *
2715 * Notice that the allocation order determines the sizes of the per cpu
2716 * caches. Each processor always has one slab available for allocations.
2717 * Increasing the allocation order reduces the number of times that slabs
2718 * must be moved on and off the partial lists and is therefore a factor in
2719 * locking overhead.
2720 */
2721
2722/*
2723 * Minimum / Maximum order of slab pages. This influences locking overhead
2724 * and slab fragmentation. A higher order reduces the number of partial slabs
2725 * and increases the number of allocations possible without having to
2726 * take the list_lock.
2727 */
2728static int slub_min_order;
2729static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER;
2730static int slub_min_objects;
2731
2732/*
2733 * Merge control. If this is set then no merging of slab caches will occur.
2734 * (Could be removed. This was introduced to pacify the merge skeptics.)
2735 */
2736static int slub_nomerge;
2737
2738/*
2739 * Calculate the order of allocation given a slab object size.
2740 *
2741 * The order of allocation has significant impact on performance and other
2742 * system components. Generally order 0 allocations should be preferred since
2743 * order 0 does not cause fragmentation in the page allocator. Larger objects
2744 * can be problematic to put into order 0 slabs because there may be too much
2745 * unused space left. We go to a higher order if more than 1/16th of the slab
2746 * would be wasted.
2747 *
2748 * In order to reach satisfactory performance we must ensure that a minimum
2749 * number of objects is in one slab. Otherwise we may generate too much
2750 * activity on the partial lists which requires taking the list_lock. This is
2751 * less of a concern for large slabs though, which are rarely used.
2752 *
2753 * slub_max_order specifies the order where we begin to stop considering the
2754 * number of objects in a slab as critical. If we reach slub_max_order then
2755 * we try to keep the page order as low as possible. So we accept more waste
2756 * of space in favor of a small page order.
2757 *
2758 * Higher order allocations also allow the placement of more objects in a
2759 * slab and thereby reduce object handling overhead. If the user has
2760 * requested a higher minimum order then we start with that one instead of
2761 * the smallest order which will fit the object.
2762 */
2763static inline int slab_order(int size, int min_objects,
2764                                int max_order, int fract_leftover, int reserved)
2765{
2766        int order;
2767        int rem;
2768        int min_order = slub_min_order;
2769
2770        if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE)
2771                return get_order(size * MAX_OBJS_PER_PAGE) - 1;
2772
2773        for (order = max(min_order,
2774                                fls(min_objects * size - 1) - PAGE_SHIFT);
2775                        order <= max_order; order++) {
2776
2777                unsigned long slab_size = PAGE_SIZE << order;
2778
2779                if (slab_size < min_objects * size + reserved)
2780                        continue;
2781
2782                rem = (slab_size - reserved) % size;
2783
2784                if (rem <= slab_size / fract_leftover)
2785                        break;
2786
2787        }
2788
2789        return order;
2790}
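    /*
     * Worked example (illustrative, assuming PAGE_SIZE=4096, reserved=0,
     * slub_min_order=0 and min_objects=16): for size 256 the loop starts at
     * order 0, where a 4096 byte slab holds exactly 16 objects with no
     * leftover, so order 0 is returned. For size 700 the loop starts at
     * order 2, the smallest order that fits 16 objects; the leftover of
     * 16384 % 700 = 284 bytes is below the 1/16 waste limit of 1024, so
     * order 2 is returned.
     */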
2791
2792static inline int calculate_order(int size, int reserved)
2793{
2794        int order;
2795        int min_objects;
2796        int fraction;
2797        int max_objects;
2798
2799        /*
2800         * Attempt to find the best configuration for a slab. This
2801         * works by first attempting to generate a layout with
2802         * the best configuration and backing off gradually.
2803         *
2804         * First we reduce the acceptable waste in a slab. Then
2805         * we reduce the minimum objects required in a slab.
2806         */
2807        min_objects = slub_min_objects;
2808        if (!min_objects)
2809                min_objects = 4 * (fls(nr_cpu_ids) + 1);
2810        max_objects = order_objects(slub_max_order, size, reserved);
2811        min_objects = min(min_objects, max_objects);
2812
2813        while (min_objects > 1) {
2814                fraction = 16;
2815                while (fraction >= 4) {
2816                        order = slab_order(size, min_objects,
2817                                        slub_max_order, fraction, reserved);
2818                        if (order <= slub_max_order)
2819                                return order;
2820                        fraction /= 2;
2821                }
2822                min_objects--;
2823        }
2824
2825        /*
2826         * We were unable to place multiple objects in a slab. Now
2827         * let's see if we can place a single object there.
2828         */
2829        order = slab_order(size, 1, slub_max_order, 1, reserved);
2830        if (order <= slub_max_order)
2831                return order;
2832
2833        /*
2834         * Doh, this slab cannot be placed using slub_max_order.
2835         */
2836        order = slab_order(size, 1, MAX_ORDER, 1, reserved);
2837        if (order < MAX_ORDER)
2838                return order;
2839        return -ENOSYS;
2840}
2841
2842static void
2843init_kmem_cache_node(struct kmem_cache_node *n)
2844{
2845        n->nr_partial = 0;
2846        spin_lock_init(&n->list_lock);
2847        INIT_LIST_HEAD(&n->partial);
2848#ifdef CONFIG_SLUB_DEBUG
2849        atomic_long_set(&n->nr_slabs, 0);
2850        atomic_long_set(&n->total_objects, 0);
2851        INIT_LIST_HEAD(&n->full);
2852#endif
2853}
2854
2855static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
2856{
2857        BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
2858                        KMALLOC_SHIFT_HIGH * sizeof(struct kmem_cache_cpu));
2859
2860        /*
2861         * Must align to double word boundary for the double cmpxchg
2862         * instructions to work; see __pcpu_double_call_return_bool().
2863         */
2864        s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu),
2865                                     2 * sizeof(void *));
2866
2867        if (!s->cpu_slab)
2868                return 0;
2869
2870        init_kmem_cache_cpus(s);
2871
2872        return 1;
2873}
2874
2875static struct kmem_cache *kmem_cache_node;
2876
2877/*
2878 * No kmalloc_node yet so do it by hand. We know that this is the first
2879 * slab on the node for this slabcache. There are no concurrent accesses
2880 * possible.
2881 *
2882 * Note that this function only works on the kmem_cache_node
2883 * when allocating for the kmem_cache_node. This is used for bootstrapping
2884 * memory on a fresh node that has no slab structures yet.
2885 */
2886static void early_kmem_cache_node_alloc(int node)
2887{
2888        struct page *page;
2889        struct kmem_cache_node *n;
2890
2891        BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
2892
2893        page = new_slab(kmem_cache_node, GFP_NOWAIT, node);
2894
2895        BUG_ON(!page);
2896        if (page_to_nid(page) != node) {
2897                printk(KERN_ERR "SLUB: Unable to allocate memory from "
2898                                "node %d\n", node);
2899                printk(KERN_ERR "SLUB: Allocating a useless per node structure "
2900                                "in order to be able to continue\n");
2901        }
2902
2903        n = page->freelist;
2904        BUG_ON(!n);
2905        page->freelist = get_freepointer(kmem_cache_node, n);
2906        page->inuse = 1;
2907        page->frozen = 0;
2908        kmem_cache_node->node[node] = n;
2909#ifdef CONFIG_SLUB_DEBUG
2910        init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
2911        init_tracking(kmem_cache_node, n);
2912#endif
2913        init_kmem_cache_node(n);
2914        inc_slabs_node(kmem_cache_node, node, page->objects);
2915
2916        /*
2917         * No locks need to be taken here as it has just been
2918         * initialized and there is no concurrent access.
2919         */
2920        __add_partial(n, page, DEACTIVATE_TO_HEAD);
2921}
2922
2923static void free_kmem_cache_nodes(struct kmem_cache *s)
2924{
2925        int node;
2926
2927        for_each_node_state(node, N_NORMAL_MEMORY) {
2928                struct kmem_cache_node *n = s->node[node];
2929
2930                if (n)
2931                        kmem_cache_free(kmem_cache_node, n);
2932
2933                s->node[node] = NULL;
2934        }
2935}
2936
2937static int init_kmem_cache_nodes(struct kmem_cache *s)
2938{
2939        int node;
2940
2941        for_each_node_state(node, N_NORMAL_MEMORY) {
2942                struct kmem_cache_node *n;
2943
2944                if (slab_state == DOWN) {
2945                        early_kmem_cache_node_alloc(node);
2946                        continue;
2947                }
2948                n = kmem_cache_alloc_node(kmem_cache_node,
2949                                                GFP_KERNEL, node);
2950
2951                if (!n) {
2952                        free_kmem_cache_nodes(s);
2953                        return 0;
2954                }
2955
2956                s->node[node] = n;
2957                init_kmem_cache_node(n);
2958        }
2959        return 1;
2960}
2961
2962static void set_min_partial(struct kmem_cache *s, unsigned long min)
2963{
2964        if (min < MIN_PARTIAL)
2965                min = MIN_PARTIAL;
2966        else if (min > MAX_PARTIAL)
2967                min = MAX_PARTIAL;
2968        s->min_partial = min;
2969}
2970
2971/*
2972 * calculate_sizes() determines the order and the distribution of data within
2973 * a slab object.
2974 */
2975static int calculate_sizes(struct kmem_cache *s, int forced_order)
2976{
2977        unsigned long flags = s->flags;
2978        unsigned long size = s->object_size;
2979        int order;
2980
2981        /*
2982         * Round up object size to the next word boundary. We can only
2983         * place the free pointer at word boundaries and this determines
2984         * the possible location of the free pointer.
2985         */
2986        size = ALIGN(size, sizeof(void *));
2987
2988#ifdef CONFIG_SLUB_DEBUG
2989        /*
2990         * Determine if we can poison the object itself. If the user of
2991         * the slab may touch the object after free or before allocation
2992         * then we should never poison the object itself.
2993         */
2994        if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) &&
2995                        !s->ctor)
2996                s->flags |= __OBJECT_POISON;
2997        else
2998                s->flags &= ~__OBJECT_POISON;
2999
3000
3001        /*
3002         * If we are Redzoning then check if there is some space between the
3003         * end of the object and the free pointer. If not then add an
3004         * additional word to have some bytes to store Redzone information.
3005         */
3006        if ((flags & SLAB_RED_ZONE) && size == s->object_size)
3007                size += sizeof(void *);
3008#endif
3009
3010        /*
3011         * With that we have determined the number of bytes in actual use
3012         * by the object. This is the potential offset to the free pointer.
3013         */
3014        s->inuse = size;
3015
3016        if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) ||
3017                s->ctor)) {
3018                /*
3019                 * Relocate free pointer after the object if it is not
3020                 * permitted to overwrite the first word of the object on
3021                 * kmem_cache_free.
3022                 *
3023                 * This is the case if we do RCU, have a constructor or
3024                 * destructor or are poisoning the objects.
3025                 */
3026                s->offset = size;
3027                size += sizeof(void *);
3028        }
3029
3030#ifdef CONFIG_SLUB_DEBUG
3031        if (flags & SLAB_STORE_USER)
3032                /*
3033                 * Need to store information about allocs and frees after
3034                 * the object.
3035                 */
3036                size += 2 * sizeof(struct track);
3037
3038        if (flags & SLAB_RED_ZONE)
3039                /*
3040                 * Add some empty padding so that we can catch
3041                 * overwrites from earlier objects rather than let
3042                 * tracking information or the free pointer be
3043                 * corrupted if a user writes before the start
3044                 * of the object.
3045                 */
3046                size += sizeof(void *);
3047#endif
3048
3049        /*
3050         * SLUB stores one object immediately after another beginning from
3051         * offset 0. In order to align the objects we have to simply size
3052         * each object to conform to the alignment.
3053         */
3054        size = ALIGN(size, s->align);
3055        s->size = size;
3056        if (forced_order >= 0)
3057                order = forced_order;
3058        else
3059                order = calculate_order(size, s->reserved);
3060
3061        if (order < 0)
3062                return 0;
3063
3064        s->allocflags = 0;
3065        if (order)
3066                s->allocflags |= __GFP_COMP;
3067
3068        if (s->flags & SLAB_CACHE_DMA)
3069                s->allocflags |= GFP_DMA;
3070
3071        if (s->flags & SLAB_RECLAIM_ACCOUNT)
3072                s->allocflags |= __GFP_RECLAIMABLE;
3073
3074        /*
3075         * Determine the number of objects per slab
3076         */
3077        s->oo = oo_make(order, size, s->reserved);
3078        s->min = oo_make(get_order(size), size, s->reserved);
3079        if (oo_objects(s->oo) > oo_objects(s->max))
3080                s->max = s->oo;
3081
3082        return !!oo_objects(s->oo);
3083}
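    /*
     * Illustrative layout (assuming a 64 bit kernel, no constructor and
     * SLAB_POISON|SLAB_RED_ZONE|SLAB_STORE_USER debugging): a 24 byte
     * object stays 24 bytes after word alignment, gains one word of red
     * zone (inuse = 32), has its free pointer relocated behind the object
     * because poisoning would overwrite it (offset = 32, size = 40), then
     * 2 * sizeof(struct track) of alloc/free tracking data and one more
     * word of red zone padding are appended before the final
     * ALIGN(size, s->align).
     */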
3084
3085static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
3086{
3087        s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor);
3088        s->reserved = 0;
3089
3090        if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU))
3091                s->reserved = sizeof(struct rcu_head);
3092
3093        if (!calculate_sizes(s, -1))
3094                goto error;
3095        if (disable_higher_order_debug) {
3096                /*
3097                 * Disable debugging flags that store metadata if the min slab
3098                 * order increased.
3099                 */
3100                if (get_order(s->size) > get_order(s->object_size)) {
3101                        s->flags &= ~DEBUG_METADATA_FLAGS;
3102                        s->offset = 0;
3103                        if (!calculate_sizes(s, -1))
3104                                goto error;
3105                }
3106        }
3107
3108#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
3109    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
3110        if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0)
3111                /* Enable fast mode */
3112                s->flags |= __CMPXCHG_DOUBLE;
3113#endif
3114
3115        /*
3116         * The larger the object size is, the more pages we want on the partial
3117         * list to avoid pounding the page allocator excessively.
3118         */
3119        set_min_partial(s, ilog2(s->size) / 2);
3120
3121        /*
3122         * cpu_partial determines the maximum number of objects kept in the
3123         * per cpu partial lists of a processor.
3124         *
3125         * Per cpu partial lists mainly contain slabs that just have one
3126         * object freed. If they are used for allocation then they can be
3127         * filled up again with minimal effort. The slab will never hit the
3128         * per node partial lists and therefore no locking will be required.
3129         *
3130         * This setting also determines
3131         *
3132         * A) The number of objects from per cpu partial slabs dumped to the
3133         *    per node list when we reach the limit.
3134         * B) The number of objects in cpu partial slabs to extract from the
3135         *    per node list when we run out of per cpu objects. We only fetch
3136         *    50% to keep some capacity around for frees.
3137         */
3138        if (!kmem_cache_has_cpu_partial(s))
3139                s->cpu_partial = 0;
3140        else if (s->size >= PAGE_SIZE)
3141                s->cpu_partial = 2;
3142        else if (s->size >= 1024)
3143                s->cpu_partial = 6;
3144        else if (s->size >= 256)
3145                s->cpu_partial = 13;
3146        else
3147                s->cpu_partial = 30;
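        /*
         * Worked example (editor's note): with 4K pages, a cache whose
         * s->size is 2048 takes the "size >= 1024" branch above and gets
         * cpu_partial = 6, so roughly half a dozen objects' worth of
         * partial slabs may accumulate per cpu before being dumped to the
         * node list, and refills from the node list stop once about 50%
         * of that capacity is available again.
         */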
3148
3149#ifdef CONFIG_NUMA
3150        s->remote_node_defrag_ratio = 1000;
3151#endif
3152        if (!init_kmem_cache_nodes(s))
3153                goto error;
3154
3155        if (alloc_kmem_cache_cpus(s))
3156                return 0;
3157
3158        free_kmem_cache_nodes(s);
3159error:
3160        if (flags & SLAB_PANIC)
3161                panic("Cannot create slab %s size=%lu realsize=%u "
3162                        "order=%u offset=%u flags=%lx\n",
3163                        s->name, (unsigned long)s->size, s->size,
3164                        oo_order(s->oo), s->offset, flags);
3165        return -EINVAL;
3166}
3167
3168static void list_slab_objects(struct kmem_cache *s, struct page *page,
3169                                                        const char *text)
3170{
3171#ifdef CONFIG_SLUB_DEBUG
3172        void *addr = page_address(page);
3173        void *p;
3174        unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) *
3175                                     sizeof(long), GFP_ATOMIC);
3176        if (!map)
3177                return;
3178        slab_err(s, page, text, s->name);
3179        slab_lock(page);
3180
3181        get_map(s, page, map);
3182        for_each_object(p, s, addr, page->objects) {
3183
3184                if (!test_bit(slab_index(p, s, addr), map)) {
3185                        printk(KERN_ERR "INFO: Object 0x%p @offset=%tu\n",
3186                                                        p, p - addr);
3187                        print_tracking(s, p);
3188                }
3189        }
3190        slab_unlock(page);
3191        kfree(map);
3192#endif
3193}
3194
3195/*
3196 * Attempt to free all partial slabs on a node.
3197 * This is called from kmem_cache_close(). We must be the last thread
3198 * using the cache and therefore we do not need to lock anymore.
3199 */
3200static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
3201{
3202        struct page *page, *h;
3203
3204        list_for_each_entry_safe(page, h, &n->partial, lru) {
3205                if (!page->inuse) {
3206                        __remove_partial(n, page);
3207                        discard_slab(s, page);
3208                } else {
3209                        list_slab_objects(s, page,
3210                        "Objects remaining in %s on kmem_cache_close()");
3211                }
3212        }
3213}
3214
3215/*
3216 * Release all resources used by a slab cache.
3217 */
3218static inline int kmem_cache_close(struct kmem_cache *s)
3219{
3220        int node;
3221
3222        flush_all(s);
3223        /* Attempt to free all objects */
3224        for_each_node_state(node, N_NORMAL_MEMORY) {
3225                struct kmem_cache_node *n = get_node(s, node);
3226
3227                free_partial(s, n);
3228                if (n->nr_partial || slabs_node(s, node))
3229                        return 1;
3230        }
3231        free_percpu(s->cpu_slab);
3232        free_kmem_cache_nodes(s);
3233        return 0;
3234}
3235
3236int __kmem_cache_shutdown(struct kmem_cache *s)
3237{
3238        int rc = kmem_cache_close(s);
3239
3240        if (!rc) {
3241                /*
3242                 * We use the same locking strategy around sysfs_slab_add; see
3243                 * __kmem_cache_create. Because this is pretty much the last
3244                 * operation we do and the lock will be released shortly after
3245                 * that in slab_common.c, we could just move sysfs_slab_remove
3246                 * to a later point in common code. We should do that when we
3247                 * have a common sysfs framework for all allocators.
3248                 */
3249                mutex_unlock(&slab_mutex);
3250                sysfs_slab_remove(s);
3251                mutex_lock(&slab_mutex);
3252        }
3253
3254        return rc;
3255}
3256
3257/********************************************************************
3258 *              Kmalloc subsystem
3259 *******************************************************************/
3260
3261static int __init setup_slub_min_order(char *str)
3262{
3263        get_option(&str, &slub_min_order);
3264
3265        return 1;
3266}
3267
3268__setup("slub_min_order=", setup_slub_min_order);
3269
3270static int __init setup_slub_max_order(char *str)
3271{
3272        get_option(&str, &slub_max_order);
3273        slub_max_order = min(slub_max_order, MAX_ORDER - 1);
3274
3275        return 1;
3276}
3277
3278__setup("slub_max_order=", setup_slub_max_order);
3279
3280static int __init setup_slub_min_objects(char *str)
3281{
3282        get_option(&str, &slub_min_objects);
3283
3284        return 1;
3285}
3286
3287__setup("slub_min_objects=", setup_slub_min_objects);
3288
3289static int __init setup_slub_nomerge(char *str)
3290{
3291        slub_nomerge = 1;
3292        return 1;
3293}
3294
3295__setup("slub_nomerge", setup_slub_nomerge);
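/*
 * Editor's note (illustrative): the four boot parameters registered above
 * can be combined on the kernel command line, e.g.
 *
 *	slub_min_order=3 slub_max_order=4 slub_min_objects=16 slub_nomerge
 *
 * to force larger slab pages, cap the allocation order and disable the
 * cache merging implemented by find_mergeable() below.
 */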
3296
3297void *__kmalloc(size_t size, gfp_t flags)
3298{
3299        struct kmem_cache *s;
3300        void *ret;
3301
3302        if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
3303                return kmalloc_large(size, flags);
3304
3305        s = kmalloc_slab(size, flags);
3306
3307        if (unlikely(ZERO_OR_NULL_PTR(s)))
3308                return s;
3309
3310        ret = slab_alloc(s, flags, _RET_IP_);
3311
3312        trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
3313
3314        return ret;
3315}
3316EXPORT_SYMBOL(__kmalloc);
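/*
 * Editor's sketch (not part of SLUB): a minimal caller round trip through
 * the kmalloc path above. The example function name is made up; ksize(),
 * defined later in this file, reports the size of the slab object backing
 * the allocation, which may exceed the requested size.
 */
#if 0
static void kmalloc_roundtrip_example(void)
{
	/* Small request, served from one of the kmalloc-* caches. */
	char *buf = kmalloc(100, GFP_KERNEL);

	if (!buf)
		return;

	printk(KERN_INFO "asked for 100 bytes, object size is %zu\n",
		ksize(buf));
	kfree(buf);
}
#endif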
3317
3318#ifdef CONFIG_NUMA
3319static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
3320{
3321        struct page *page;
3322        void *ptr = NULL;
3323
3324        flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG;
3325        page = alloc_pages_node(node, flags, get_order(size));
3326        if (page)
3327                ptr = page_address(page);
3328
3329        kmalloc_large_node_hook(ptr, size, flags);
3330        return ptr;
3331}
3332
3333void *__kmalloc_node(size_t size, gfp_t flags, int node)
3334{
3335        struct kmem_cache *s;
3336        void *ret;
3337
3338        if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
3339                ret = kmalloc_large_node(size, flags, node);
3340
3341                trace_kmalloc_node(_RET_IP_, ret,
3342                                   size, PAGE_SIZE << get_order(size),
3343                                   flags, node);
3344
3345                return ret;
3346        }
3347
3348        s = kmalloc_slab(size, flags);
3349
3350        if (unlikely(ZERO_OR_NULL_PTR(s)))
3351                return s;
3352
3353        ret = slab_alloc_node(s, flags, node, _RET_IP_);
3354
3355        trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
3356
3357        return ret;
3358}
3359EXPORT_SYMBOL(__kmalloc_node);
3360#endif
3361
3362size_t ksize(const void *object)
3363{
3364        struct page *page;
3365
3366        if (unlikely(object == ZERO_SIZE_PTR))
3367                return 0;
3368
3369        page = virt_to_head_page(object);
3370
3371        if (unlikely(!PageSlab(page))) {
3372                WARN_ON(!PageCompound(page));
3373                return PAGE_SIZE << compound_order(page);
3374        }
3375
3376        return slab_ksize(page->slab_cache);
3377}
3378EXPORT_SYMBOL(ksize);
3379
3380void kfree(const void *x)
3381{
3382        struct page *page;
3383        void *object = (void *)x;
3384
3385        trace_kfree(_RET_IP_, x);
3386
3387        if (unlikely(ZERO_OR_NULL_PTR(x)))
3388                return;
3389
3390        page = virt_to_head_page(x);
3391        if (unlikely(!PageSlab(page))) {
3392                BUG_ON(!PageCompound(page));
3393                kfree_hook(x);
3394                __free_memcg_kmem_pages(page, compound_order(page));
3395                return;
3396        }
3397        slab_free(page->slab_cache, page, object, _RET_IP_);
3398}
3399EXPORT_SYMBOL(kfree);
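/*
 * Editor's note (illustrative): requests larger than KMALLOC_MAX_CACHE_SIZE
 * never touch a slab cache. __kmalloc() above hands them to kmalloc_large(),
 * kfree() detects the resulting !PageSlab compound page and returns it
 * straight to the page allocator, and ksize() reports the full compound
 * page size for such allocations.
 */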
3400
3401/*
3402 * kmem_cache_shrink removes empty slabs from the partial lists and sorts
3403 * the remaining slabs by the number of items in use. The slabs with the
3404 * most items in use come first. New allocations will then fill those up
3405 * and thus they can be removed from the partial lists.
3406 *
3407 * The slabs with the least items in use are placed last. This results in
3408 * them being allocated from last, increasing the chance that the remaining
3409 * objects in them are freed so that the slabs become empty.
3410 */
3411int kmem_cache_shrink(struct kmem_cache *s)
3412{
3413        int node;
3414        int i;
3415        struct kmem_cache_node *n;
3416        struct page *page;
3417        struct page *t;
3418        int objects = oo_objects(s->max);
3419        struct list_head *slabs_by_inuse =
3420                kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL);
3421        unsigned long flags;
3422
3423        if (!slabs_by_inuse)
3424                return -ENOMEM;
3425
3426        flush_all(s);
3427        for_each_node_state(node, N_NORMAL_MEMORY) {
3428                n = get_node(s, node);
3429
3430                if (!n->nr_partial)
3431                        continue;
3432
3433                for (i = 0; i < objects; i++)
3434                        INIT_LIST_HEAD(slabs_by_inuse + i);
3435
3436                spin_lock_irqsave(&n->list_lock, flags);
3437
3438                /*
3439                 * Build lists indexed by the items in use in each slab.
3440                 *
3441                 * Note that concurrent frees may occur while we hold the
3442                 * list_lock. page->inuse here is the upper limit.
3443                 */
3444                list_for_each_entry_safe(page, t, &n->partial, lru) {
3445                        list_move(&page->lru, slabs_by_inuse + page->inuse);
3446                        if (!page->inuse)
3447                                n->nr_partial--;
3448                }
3449
3450                /*
3451                 * Rebuild the partial list with the slabs filled up most
3452                 * first and the least used slabs at the end.
3453                 */
3454                for (i = objects - 1; i > 0; i--)
3455                        list_splice(slabs_by_inuse + i, n->partial.prev);
3456
3457                spin_unlock_irqrestore(&n->list_lock, flags);
3458
3459                /* Release empty slabs */
3460                list_for_each_entry_safe(page, t, slabs_by_inuse, lru)
3461                        discard_slab(s, page);
3462        }
3463
3464        kfree(slabs_by_inuse);
3465        return 0;
3466}
3467EXPORT_SYMBOL(kmem_cache_shrink);
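/*
 * Worked example (editor's note): if a node's partial list holds slabs
 * with inuse counts { 3, 1, 5, 0, 3 }, the loop above buckets them by
 * inuse, rebuilds the list as 5, 3, 3, 1 (most used first) and frees the
 * empty slab. The same pass can be triggered per cache from user space
 * through the "shrink" sysfs attribute defined further down in this file.
 */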
3468
3469static int slab_mem_going_offline_callback(void *arg)
3470{
3471        struct kmem_cache *s;
3472
3473        mutex_lock(&slab_mutex);
3474        list_for_each_entry(s, &slab_caches, list)
3475                kmem_cache_shrink(s);
3476        mutex_unlock(&slab_mutex);
3477
3478        return 0;
3479}
3480
3481static void slab_mem_offline_callback(void *arg)
3482{
3483        struct kmem_cache_node *n;
3484        struct kmem_cache *s;
3485        struct memory_notify *marg = arg;
3486        int offline_node;
3487
3488        offline_node = marg->status_change_nid_normal;
3489
3490        /*
3491         * If the node still has available memory, we still need the
3492         * kmem_cache_node for it, so there is nothing to tear down here.
3493         */
3494        if (offline_node < 0)
3495                return;
3496
3497        mutex_lock(&slab_mutex);
3498        list_for_each_entry(s, &slab_caches, list) {
3499                n = get_node(s, offline_node);
3500                if (n) {
3501                        /*
3502                         * if n->nr_slabs > 0, slabs still exist on the node
3503                         * that is going down. We were unable to free them,
3504                         * and the offline_pages() function should not have called
3505                         * this callback. So, we must fail.
3506                         */
3507                        BUG_ON(slabs_node(s, offline_node));
3508
3509                        s->node[offline_node] = NULL;
3510                        kmem_cache_free(kmem_cache_node, n);
3511                }
3512        }
3513        mutex_unlock(&slab_mutex);
3514}
3515
3516static int slab_mem_going_online_callback(void *arg)
3517{
3518        struct kmem_cache_node *n;
3519        struct kmem_cache *s;
3520        struct memory_notify *marg = arg;
3521        int nid = marg->status_change_nid_normal;
3522        int ret = 0;
3523
3524        /*
3525         * If the node's memory is already available, then kmem_cache_node is
3526         * already created. Nothing to do.
3527         */
3528        if (nid < 0)
3529                return 0;
3530
3531        /*
3532         * We are bringing a node online. No memory is available yet. We must
3533         * allocate a kmem_cache_node structure in order to bring the node
3534         * online.
3535         */
3536        mutex_lock(&slab_mutex);
3537        list_for_each_entry(s, &slab_caches, list) {
3538                /*
3539                 * XXX: kmem_cache_alloc_node will fall back to other nodes
3540                 *      since memory is not yet available from the node that
3541                 *      is brought up.
3542                 */
3543                n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL);
3544                if (!n) {
3545                        ret = -ENOMEM;
3546                        goto out;
3547                }
3548                init_kmem_cache_node(n);
3549                s->node[nid] = n;
3550        }
3551out:
3552        mutex_unlock(&slab_mutex);
3553        return ret;
3554}
3555
3556static int slab_memory_callback(struct notifier_block *self,
3557                                unsigned long action, void *arg)
3558{
3559        int ret = 0;
3560
3561        switch (action) {
3562        case MEM_GOING_ONLINE:
3563                ret = slab_mem_going_online_callback(arg);
3564                break;
3565        case MEM_GOING_OFFLINE:
3566                ret = slab_mem_going_offline_callback(arg);
3567                break;
3568        case MEM_OFFLINE:
3569        case MEM_CANCEL_ONLINE:
3570                slab_mem_offline_callback(arg);
3571                break;
3572        case MEM_ONLINE:
3573        case MEM_CANCEL_OFFLINE:
3574                break;
3575        }
3576        if (ret)
3577                ret = notifier_from_errno(ret);
3578        else
3579                ret = NOTIFY_OK;
3580        return ret;
3581}
3582
3583static struct notifier_block slab_memory_callback_nb = {
3584        .notifier_call = slab_memory_callback,
3585        .priority = SLAB_CALLBACK_PRI,
3586};
3587
3588/********************************************************************
3589 *                      Basic setup of slabs
3590 *******************************************************************/
3591
3592/*
3593 * Used for early kmem_cache structures that were allocated using
3594 * the page allocator. Allocate them properly then fix up the pointers
3595 * that may be pointing to the wrong kmem_cache structure.
3596 */
3597
3598static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
3599{
3600        int node;
3601        struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
3602
3603        memcpy(s, static_cache, kmem_cache->object_size);
3604
3605        /*
3606         * This runs very early, and only the boot processor is supposed to be
3607         * up. Even if that were not the case, IRQs are not yet enabled, so
3608         * we could not send IPIs anyway.
3609         */
3610        __flush_cpu_slab(s, smp_processor_id());
3611        for_each_node_state(node, N_NORMAL_MEMORY) {
3612                struct kmem_cache_node *n = get_node(s, node);
3613                struct page *p;
3614
3615                if (n) {
3616                        list_for_each_entry(p, &n->partial, lru)
3617                                p->slab_cache = s;
3618
3619#ifdef CONFIG_SLUB_DEBUG
3620                        list_for_each_entry(p, &n->full, lru)
3621                                p->slab_cache = s;
3622#endif
3623                }
3624        }
3625        list_add(&s->list, &slab_caches);
3626        return s;
3627}
3628
3629void __init kmem_cache_init(void)
3630{
3631        static __initdata struct kmem_cache boot_kmem_cache,
3632                boot_kmem_cache_node;
3633
3634        if (debug_guardpage_minorder())
3635                slub_max_order = 0;
3636
3637        kmem_cache_node = &boot_kmem_cache_node;
3638        kmem_cache = &boot_kmem_cache;
3639
3640        create_boot_cache(kmem_cache_node, "kmem_cache_node",
3641                sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN);
3642
3643        register_hotmemory_notifier(&slab_memory_callback_nb);
3644
3645        /* Able to allocate the per node structures */
3646        slab_state = PARTIAL;
3647
3648        create_boot_cache(kmem_cache, "kmem_cache",
3649                        offsetof(struct kmem_cache, node) +
3650                                nr_node_ids * sizeof(struct kmem_cache_node *),
3651                       SLAB_HWCACHE_ALIGN);
3652
3653        kmem_cache = bootstrap(&boot_kmem_cache);
3654
3655        /*
3656         * Allocate kmem_cache_node properly from the kmem_cache slab.
3657         * kmem_cache_node is separately allocated so no need to
3658         * update any list pointers.
3659         */
3660        kmem_cache_node = bootstrap(&boot_kmem_cache_node);
3661
3662        /* Now we can use the kmem_cache to allocate kmalloc slabs */
3663        create_kmalloc_caches(0);
3664
3665#ifdef CONFIG_SMP
3666        register_cpu_notifier(&slab_notifier);
3667#endif
3668
3669        printk(KERN_INFO
3670                "SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d,"
3671                " CPUs=%d, Nodes=%d\n",
3672                cache_line_size(),
3673                slub_min_order, slub_max_order, slub_min_objects,
3674                nr_cpu_ids, nr_node_ids);
3675}
3676
3677void __init kmem_cache_init_late(void)
3678{
3679}
3680
3681/*
3682 * Find a mergeable slab cache
3683 */
3684static int slab_unmergeable(struct kmem_cache *s)
3685{
3686        if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE))
3687                return 1;
3688
3689        if (s->ctor)
3690                return 1;
3691
3692        /*
3693         * We may have set a slab to be unmergeable during bootstrap.
3694         */
3695        if (s->refcount < 0)
3696                return 1;
3697
3698        return 0;
3699}
3700
3701static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size,
3702                size_t align, unsigned long flags, const char *name,
3703                void (*ctor)(void *))
3704{
3705        struct kmem_cache *s;
3706
3707        if (slub_nomerge || (flags & SLUB_NEVER_MERGE))
3708                return NULL;
3709
3710        if (ctor)
3711                return NULL;
3712
3713        size = ALIGN(size, sizeof(void *));
3714        align = calculate_alignment(flags, align, size);
3715        size = ALIGN(size, align);
3716        flags = kmem_cache_flags(size, flags, name, NULL);
3717
3718        list_for_each_entry(s, &slab_caches, list) {
3719                if (slab_unmergeable(s))
3720                        continue;
3721
3722                if (size > s->size)
3723                        continue;
3724
3725                if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME))
3726                        continue;
3727                /*
3728                 * Check if alignment is compatible.
3729                 * Courtesy of Adrian Drzewiecki
3730                 */
3731                if ((s->size & ~(align - 1)) != s->size)
3732                        continue;
3733
3734                if (s->size - size >= sizeof(void *))
3735                        continue;
3736
3737                if (!cache_match_memcg(s, memcg))
3738                        continue;
3739
3740                return s;
3741        }
3742        return NULL;
3743}
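/*
 * Editor's note (illustrative): given the checks above, a newly created
 * cache with no constructor, compatible flags and a size within one word
 * of an existing cache is typically merged into that cache (often one of
 * the kmalloc-* caches) and only appears as a sysfs alias; booting with
 * slub_nomerge disables this behaviour.
 */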
3744
3745struct kmem_cache *
3746__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
3747                   size_t align, unsigned long flags, void (*ctor)(void *))
3748{
3749        struct kmem_cache *s;
3750
3751        s = find_mergeable(memcg, size, align, flags, name, ctor);
3752        if (s) {
3753                s->refcount++;
3754                /*
3755                 * Adjust the object sizes so that we clear
3756                 * the complete object on kzalloc.
3757                 */
3758                s->object_size = max(s->object_size, (int)size);
3759                s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
3760
3761                if (sysfs_slab_alias(s, name)) {
3762                        s->refcount--;
3763                        s = NULL;
3764                }
3765        }
3766
3767        return s;
3768}
3769
3770int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
3771{
3772        int err;
3773
3774        err = kmem_cache_open(s, flags);
3775        if (err)
3776                return err;
3777
3778        /* Mutex is not taken during early boot */
3779        if (slab_state <= UP)
3780                return 0;
3781
3782        memcg_propagate_slab_attrs(s);
3783        mutex_unlock(&slab_mutex);
3784        err = sysfs_slab_add(s);
3785        mutex_lock(&slab_mutex);
3786
3787        if (err)
3788                kmem_cache_close(s);
3789
3790        return err;
3791}
3792
3793#ifdef CONFIG_SMP
3794/*
3795 * Use the cpu notifier to ensure that the cpu slabs are flushed when
3796 * necessary.
3797 */
3798static int slab_cpuup_callback(struct notifier_block *nfb,
3799                unsigned long action, void *hcpu)
3800{
3801        long cpu = (long)hcpu;
3802        struct kmem_cache *s;
3803        unsigned long flags;
3804
3805        switch (action) {
3806        case CPU_UP_CANCELED:
3807        case CPU_UP_CANCELED_FROZEN:
3808        case CPU_DEAD:
3809        case CPU_DEAD_FROZEN:
3810                mutex_lock(&slab_mutex);
3811                list_for_each_entry(s, &slab_caches, list) {
3812                        local_irq_save(flags);
3813                        __flush_cpu_slab(s, cpu);
3814                        local_irq_restore(flags);
3815                }
3816                mutex_unlock(&slab_mutex);
3817                break;
3818        default:
3819                break;
3820        }
3821        return NOTIFY_OK;
3822}
3823
3824static struct notifier_block slab_notifier = {
3825        .notifier_call = slab_cpuup_callback
3826};
3827
3828#endif
3829
3830void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
3831{
3832        struct kmem_cache *s;
3833        void *ret;
3834
3835        if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
3836                return kmalloc_large(size, gfpflags);
3837
3838        s = kmalloc_slab(size, gfpflags);
3839
3840        if (unlikely(ZERO_OR_NULL_PTR(s)))
3841                return s;
3842
3843        ret = slab_alloc(s, gfpflags, caller);
3844
3845        /* Honor the call site pointer we received. */
3846        trace_kmalloc(caller, ret, size, s->size, gfpflags);
3847
3848        return ret;
3849}
3850
3851#ifdef CONFIG_NUMA
3852void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
3853                                        int node, unsigned long caller)
3854{
3855        struct kmem_cache *s;
3856        void *ret;
3857
3858        if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
3859                ret = kmalloc_large_node(size, gfpflags, node);
3860
3861                trace_kmalloc_node(caller, ret,
3862                                   size, PAGE_SIZE << get_order(size),
3863                                   gfpflags, node);
3864
3865                return ret;
3866        }
3867
3868        s = kmalloc_slab(size, gfpflags);
3869
3870        if (unlikely(ZERO_OR_NULL_PTR(s)))
3871                return s;
3872
3873        ret = slab_alloc_node(s, gfpflags, node, caller);
3874
3875        /* Honor the call site pointer we received. */
3876        trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
3877
3878        return ret;
3879}
3880#endif
3881
3882#ifdef CONFIG_SYSFS
3883static int count_inuse(struct page *page)
3884{
3885        return page->inuse;
3886}
3887
3888static int count_total(struct page *page)
3889{
3890        return page->objects;
3891}
3892#endif
3893
3894#ifdef CONFIG_SLUB_DEBUG
3895static int validate_slab(struct kmem_cache *s, struct page *page,
3896                                                unsigned long *map)
3897{
3898        void *p;
3899        void *addr = page_address(page);
3900
3901        if (!check_slab(s, page) ||
3902                        !on_freelist(s, page, NULL))
3903                return 0;
3904
3905        /* Now we know that a valid freelist exists */
3906        bitmap_zero(map, page->objects);
3907
3908        get_map(s, page, map);
3909        for_each_object(p, s, addr, page->objects) {
3910                if (test_bit(slab_index(p, s, addr), map))
3911                        if (!check_object(s, page, p, SLUB_RED_INACTIVE))
3912                                return 0;
3913        }
3914
3915        for_each_object(p, s, addr, page->objects)
3916                if (!test_bit(slab_index(p, s, addr), map))
3917                        if (!check_object(s, page, p, SLUB_RED_ACTIVE))
3918                                return 0;
3919        return 1;
3920}
3921
3922static void validate_slab_slab(struct kmem_cache *s, struct page *page,
3923                                                unsigned long *map)
3924{
3925        slab_lock(page);
3926        validate_slab(s, page, map);
3927        slab_unlock(page);
3928}
3929
3930static int validate_slab_node(struct kmem_cache *s,
3931                struct kmem_cache_node *n, unsigned long *map)
3932{
3933        unsigned long count = 0;
3934        struct page *page;
3935        unsigned long flags;
3936
3937        spin_lock_irqsave(&n->list_lock, flags);
3938
3939        list_for_each_entry(page, &n->partial, lru) {
3940                validate_slab_slab(s, page, map);
3941                count++;
3942        }
3943        if (count != n->nr_partial)
3944                printk(KERN_ERR "SLUB %s: %ld partial slabs counted but "
3945                        "counter=%ld\n", s->name, count, n->nr_partial);
3946
3947        if (!(s->flags & SLAB_STORE_USER))
3948                goto out;
3949
3950        list_for_each_entry(page, &n->full, lru) {
3951                validate_slab_slab(s, page, map);
3952                count++;
3953        }
3954        if (count != atomic_long_read(&n->nr_slabs))
3955                printk(KERN_ERR "SLUB: %s %ld slabs counted but "
3956                        "counter=%ld\n", s->name, count,
3957                        atomic_long_read(&n->nr_slabs));
3958
3959out:
3960        spin_unlock_irqrestore(&n->list_lock, flags);
3961        return count;
3962}
3963
3964static long validate_slab_cache(struct kmem_cache *s)
3965{
3966        int node;
3967        unsigned long count = 0;
3968        unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) *
3969                                sizeof(unsigned long), GFP_KERNEL);
3970
3971        if (!map)
3972                return -ENOMEM;
3973
3974        flush_all(s);
3975        for_each_node_state(node, N_NORMAL_MEMORY) {
3976                struct kmem_cache_node *n = get_node(s, node);
3977
3978                count += validate_slab_node(s, n, map);
3979        }
3980        kfree(map);
3981        return count;
3982}
3983/*
3984 * Generate lists of code addresses where slabcache objects are allocated
3985 * and freed.
3986 */
3987
3988struct location {
3989        unsigned long count;
3990        unsigned long addr;
3991        long long sum_time;
3992        long min_time;
3993        long max_time;
3994        long min_pid;
3995        long max_pid;
3996        DECLARE_BITMAP(cpus, NR_CPUS);
3997        nodemask_t nodes;
3998};
3999
4000struct loc_track {
4001        unsigned long max;
4002        unsigned long count;
4003        struct location *loc;
4004};
4005
4006static void free_loc_track(struct loc_track *t)
4007{
4008        if (t->max)
4009                free_pages((unsigned long)t->loc,
4010                        get_order(sizeof(struct location) * t->max));
4011}
4012
4013static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
4014{
4015        struct location *l;
4016        int order;
4017
4018        order = get_order(sizeof(struct location) * max);
4019
4020        l = (void *)__get_free_pages(flags, order);
4021        if (!l)
4022                return 0;
4023
4024        if (t->count) {
4025                memcpy(l, t->loc, sizeof(struct location) * t->count);
4026                free_loc_track(t);
4027        }
4028        t->max = max;
4029        t->loc = l;
4030        return 1;
4031}
4032
4033static int add_location(struct loc_track *t, struct kmem_cache *s,
4034                                const struct track *track)
4035{
4036        long start, end, pos;
4037        struct location *l;
4038        unsigned long caddr;
4039        unsigned long age = jiffies - track->when;
4040
4041        start = -1;
4042        end = t->count;
4043
4044        for ( ; ; ) {
4045                pos = start + (end - start + 1) / 2;
4046
4047                /*
4048                 * There is nothing at "end". If we end up there
4049                 * we need to add something before end.
4050                 */
4051                if (pos == end)
4052                        break;
4053
4054                caddr = t->loc[pos].addr;
4055                if (track->addr == caddr) {
4056
4057                        l = &t->loc[pos];
4058                        l->count++;
4059                        if (track->when) {
4060                                l->sum_time += age;
4061                                if (age < l->min_time)
4062                                        l->min_time = age;
4063                                if (age > l->max_time)
4064                                        l->max_time = age;
4065
4066                                if (track->pid < l->min_pid)
4067                                        l->min_pid = track->pid;
4068                                if (track->pid > l->max_pid)
4069                                        l->max_pid = track->pid;
4070
4071                                cpumask_set_cpu(track->cpu,
4072                                                to_cpumask(l->cpus));
4073                        }
4074                        node_set(page_to_nid(virt_to_page(track)), l->nodes);
4075                        return 1;
4076                }
4077
4078                if (track->addr < caddr)
4079                        end = pos;
4080                else
4081                        start = pos;
4082        }
4083
4084        /*
4085         * Not found. Insert new tracking element.
4086         */
4087        if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC))
4088                return 0;
4089
4090        l = t->loc + pos;
4091        if (pos < t->count)
4092                memmove(l + 1, l,
4093                        (t->count - pos) * sizeof(struct location));
4094        t->count++;
4095        l->count = 1;
4096        l->addr = track->addr;
4097        l->sum_time = age;
4098        l->min_time = age;
4099        l->max_time = age;
4100        l->min_pid = track->pid;
4101        l->max_pid = track->pid;
4102        cpumask_clear(to_cpumask(l->cpus));
4103        cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
4104        nodes_clear(l->nodes);
4105        node_set(page_to_nid(virt_to_page(track)), l->nodes);
4106        return 1;
4107}
4108
4109static void process_slab(struct loc_track *t, struct kmem_cache *s,
4110                struct page *page, enum track_item alloc,
4111                unsigned long *map)
4112{
4113        void *addr = page_address(page);
4114        void *p;
4115
4116        bitmap_zero(map, page->objects);
4117        get_map(s, page, map);
4118
4119        for_each_object(p, s, addr, page->objects)
4120                if (!test_bit(slab_index(p, s, addr), map))
4121                        add_location(t, s, get_track(s, p, alloc));
4122}
4123
4124static int list_locations(struct kmem_cache *s, char *buf,
4125                                        enum track_item alloc)
4126{
4127        int len = 0;
4128        unsigned long i;
4129        struct loc_track t = { 0, 0, NULL };
4130        int node;
4131        unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) *
4132                                     sizeof(unsigned long), GFP_KERNEL);
4133
4134        if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
4135                                     GFP_TEMPORARY)) {
4136                kfree(map);
4137                return sprintf(buf, "Out of memory\n");
4138        }
4139        /* Push back cpu slabs */
4140        flush_all(s);
4141
4142        for_each_node_state(node, N_NORMAL_MEMORY) {
4143                struct kmem_cache_node *n = get_node(s, node);
4144                unsigned long flags;
4145                struct page *page;
4146
4147                if (!atomic_long_read(&n->nr_slabs))
4148                        continue;
4149
4150                spin_lock_irqsave(&n->list_lock, flags);
4151                list_for_each_entry(page, &n->partial, lru)
4152                        process_slab(&t, s, page, alloc, map);
4153                list_for_each_entry(page, &n->full, lru)
4154                        process_slab(&t, s, page, alloc, map);
4155                spin_unlock_irqrestore(&n->list_lock, flags);
4156        }
4157
4158        for (i = 0; i < t.count; i++) {
4159                struct location *l = &t.loc[i];
4160
4161                if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100)
4162                        break;
4163                len += sprintf(buf + len, "%7ld ", l->count);
4164
4165                if (l->addr)
4166                        len += sprintf(buf + len, "%pS", (void *)l->addr);
4167                else
4168                        len += sprintf(buf + len, "<not-available>");
4169
4170                if (l->sum_time != l->min_time) {
4171                        len += sprintf(buf + len, " age=%ld/%ld/%ld",
4172                                l->min_time,
4173                                (long)div_u64(l->sum_time, l->count),
4174                                l->max_time);
4175                } else
4176                        len += sprintf(buf + len, " age=%ld",
4177                                l->min_time);
4178
4179                if (l->min_pid != l->max_pid)
4180                        len += sprintf(buf + len, " pid=%ld-%ld",
4181                                l->min_pid, l->max_pid);
4182                else
4183                        len += sprintf(buf + len, " pid=%ld",
4184                                l->min_pid);
4185
4186                if (num_online_cpus() > 1 &&
4187                                !cpumask_empty(to_cpumask(l->cpus)) &&
4188                                len < PAGE_SIZE - 60) {
4189                        len += sprintf(buf + len, " cpus=");
4190                        len += cpulist_scnprintf(buf + len,
4191                                                 PAGE_SIZE - len - 50,
4192                                                 to_cpumask(l->cpus));
4193                }
4194
4195                if (nr_online_nodes > 1 && !nodes_empty(l->nodes) &&
4196                                len < PAGE_SIZE - 60) {
4197                        len += sprintf(buf + len, " nodes=");
4198                        len += nodelist_scnprintf(buf + len,
4199                                                  PAGE_SIZE - len - 50,
4200                                                  l->nodes);
4201                }
4202
4203                len += sprintf(buf + len, "\n");
4204        }
4205
4206        free_loc_track(&t);
4207        kfree(map);
4208        if (!t.count)
4209                len += sprintf(buf, "No data\n");
4210        return len;
4211}
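/*
 * Editor's note: list_locations() backs the "alloc_calls" and "free_calls"
 * sysfs attributes defined below. Each output line summarizes one call
 * site: an object count, the caller symbol, and optional age, pid, cpu
 * and node information, exactly as built by the sprintf() calls above.
 * The cache must have been created with SLAB_STORE_USER for any data to
 * be available.
 */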
4212#endif
4213
4214#ifdef SLUB_RESILIENCY_TEST
4215static void resiliency_test(void)
4216{
4217        u8 *p;
4218
4219        BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10);
4220
4221        printk(KERN_ERR "SLUB resiliency testing\n");
4222        printk(KERN_ERR "-----------------------\n");
4223        printk(KERN_ERR "A. Corruption after allocation\n");
4224
4225        p = kzalloc(16, GFP_KERNEL);
4226        p[16] = 0x12;
4227        printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
4228                        " 0x12->0x%p\n\n", p + 16);
4229
4230        validate_slab_cache(kmalloc_caches[4]);
4231
4232        /* Hmmm... The next two are dangerous */
4233        p = kzalloc(32, GFP_KERNEL);
4234        p[32 + sizeof(void *)] = 0x34;
4235        printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
4236                        " 0x34 -> 0x%p\n", p);
4237        printk(KERN_ERR
4238                "If allocated object is overwritten then not detectable\n\n");
4239
4240        validate_slab_cache(kmalloc_caches[5]);
4241        p = kzalloc(64, GFP_KERNEL);
4242        p += 64 + (get_cycles() & 0xff) * sizeof(void *);
4243        *p = 0x56;
4244        printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
4245                                                                        p);
4246        printk(KERN_ERR
4247                "If allocated object is overwritten then not detectable\n\n");
4248        validate_slab_cache(kmalloc_caches[6]);
4249
4250        printk(KERN_ERR "\nB. Corruption after free\n");
4251        p = kzalloc(128, GFP_KERNEL);
4252        kfree(p);
4253        *p = 0x78;
4254        printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
4255        validate_slab_cache(kmalloc_caches[7]);
4256
4257        p = kzalloc(256, GFP_KERNEL);
4258        kfree(p);
4259        p[50] = 0x9a;
4260        printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n",
4261                        p);
4262        validate_slab_cache(kmalloc_caches[8]);
4263
4264        p = kzalloc(512, GFP_KERNEL);
4265        kfree(p);
4266        p[512] = 0xab;
4267        printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
4268        validate_slab_cache(kmalloc_caches[9]);
4269}
4270#else
4271#ifdef CONFIG_SYSFS
4272static void resiliency_test(void) {}
4273#endif
4274#endif
4275
4276#ifdef CONFIG_SYSFS
4277enum slab_stat_type {
4278        SL_ALL,                 /* All slabs */
4279        SL_PARTIAL,             /* Only partially allocated slabs */
4280        SL_CPU,                 /* Only slabs used for cpu caches */
4281        SL_OBJECTS,             /* Determine allocated objects not slabs */
4282        SL_TOTAL                /* Determine object capacity not slabs */
4283};
4284
4285#define SO_ALL          (1 << SL_ALL)
4286#define SO_PARTIAL      (1 << SL_PARTIAL)
4287#define SO_CPU          (1 << SL_CPU)
4288#define SO_OBJECTS      (1 << SL_OBJECTS)
4289#define SO_TOTAL        (1 << SL_TOTAL)
4290
4291static ssize_t show_slab_objects(struct kmem_cache *s,
4292                            char *buf, unsigned long flags)
4293{
4294        unsigned long total = 0;
4295        int node;
4296        int x;
4297        unsigned long *nodes;
4298
4299        nodes = kzalloc(sizeof(unsigned long) * nr_node_ids, GFP_KERNEL);
4300        if (!nodes)
4301                return -ENOMEM;
4302
4303        if (flags & SO_CPU) {
4304                int cpu;
4305
4306                for_each_possible_cpu(cpu) {
4307                        struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab,
4308                                                               cpu);
4309                        int node;
4310                        struct page *page;
4311
4312                        page = ACCESS_ONCE(c->page);
4313                        if (!page)
4314                                continue;
4315
4316                        node = page_to_nid(page);
4317                        if (flags & SO_TOTAL)
4318                                x = page->objects;
4319                        else if (flags & SO_OBJECTS)
4320                                x = page->inuse;
4321                        else
4322                                x = 1;
4323
4324                        total += x;
4325                        nodes[node] += x;
4326
4327                        page = ACCESS_ONCE(c->partial);
4328                        if (page) {
4329                                node = page_to_nid(page);
4330                                if (flags & SO_TOTAL)
4331                                        WARN_ON_ONCE(1);
4332                                else if (flags & SO_OBJECTS)
4333                                        WARN_ON_ONCE(1);
4334                                else
4335                                        x = page->pages;
4336                                total += x;
4337                                nodes[node] += x;
4338                        }
4339                }
4340        }
4341
4342        lock_memory_hotplug();
4343#ifdef CONFIG_SLUB_DEBUG
4344        if (flags & SO_ALL) {
4345                for_each_node_state(node, N_NORMAL_MEMORY) {
4346                        struct kmem_cache_node *n = get_node(s, node);
4347
4348                        if (flags & SO_TOTAL)
4349                                x = atomic_long_read(&n->total_objects);
4350                        else if (flags & SO_OBJECTS)
4351                                x = atomic_long_read(&n->total_objects) -
4352                                        count_partial(n, count_free);
4353                        else
4354                                x = atomic_long_read(&n->nr_slabs);
4355                        total += x;
4356                        nodes[node] += x;
4357                }
4358
4359        } else
4360#endif
4361        if (flags & SO_PARTIAL) {
4362                for_each_node_state(node, N_NORMAL_MEMORY) {
4363                        struct kmem_cache_node *n = get_node(s, node);
4364
4365                        if (flags & SO_TOTAL)
4366                                x = count_partial(n, count_total);
4367                        else if (flags & SO_OBJECTS)
4368                                x = count_partial(n, count_inuse);
4369                        else
4370                                x = n->nr_partial;
4371                        total += x;
4372                        nodes[node] += x;
4373                }
4374        }
4375        x = sprintf(buf, "%lu", total);
4376#ifdef CONFIG_NUMA
4377        for_each_node_state(node, N_NORMAL_MEMORY)
4378                if (nodes[node])
4379                        x += sprintf(buf + x, " N%d=%lu",
4380                                        node, nodes[node]);
4381#endif
4382        unlock_memory_hotplug();
4383        kfree(nodes);
4384        return x + sprintf(buf + x, "\n");
4385}
4386
4387#ifdef CONFIG_SLUB_DEBUG
4388static int any_slab_objects(struct kmem_cache *s)
4389{
4390        int node;
4391
4392        for_each_online_node(node) {
4393                struct kmem_cache_node *n = get_node(s, node);
4394
4395                if (!n)
4396                        continue;
4397
4398                if (atomic_long_read(&n->total_objects))
4399                        return 1;
4400        }
4401        return 0;
4402}
4403#endif
4404
4405#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
4406#define to_slab(n) container_of(n, struct kmem_cache, kobj)
4407
4408struct slab_attribute {
4409        struct attribute attr;
4410        ssize_t (*show)(struct kmem_cache *s, char *buf);
4411        ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
4412};
4413
4414#define SLAB_ATTR_RO(_name) \
4415        static struct slab_attribute _name##_attr = \
4416        __ATTR(_name, 0400, _name##_show, NULL)
4417
4418#define SLAB_ATTR(_name) \
4419        static struct slab_attribute _name##_attr =  \
4420        __ATTR(_name, 0600, _name##_show, _name##_store)
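/*
 * Editor's note: for example, SLAB_ATTR_RO(slab_size) below expands to
 *
 *	static struct slab_attribute slab_size_attr =
 *		__ATTR(slab_size, 0400, slab_size_show, NULL);
 *
 * i.e. a read-only sysfs attribute wired to slab_size_show().
 */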
4421
4422static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
4423{
4424        return sprintf(buf, "%d\n", s->size);
4425}
4426SLAB_ATTR_RO(slab_size);
4427
4428static ssize_t align_show(struct kmem_cache *s, char *buf)
4429{
4430        return sprintf(buf, "%d\n", s->align);
4431}
4432SLAB_ATTR_RO(align);
4433
4434static ssize_t object_size_show(struct kmem_cache *s, char *buf)
4435{
4436        return sprintf(buf, "%d\n", s->object_size);
4437}
4438SLAB_ATTR_RO(object_size);
4439
4440static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
4441{
4442        return sprintf(buf, "%d\n", oo_objects(s->oo));
4443}
4444SLAB_ATTR_RO(objs_per_slab);
4445
4446static ssize_t order_store(struct kmem_cache *s,
4447                                const char *buf, size_t length)
4448{
4449        unsigned long order;
4450        int err;
4451
4452        err = kstrtoul(buf, 10, &order);
4453        if (err)
4454                return err;
4455
4456        if (order > slub_max_order || order < slub_min_order)
4457                return -EINVAL;
4458
4459        calculate_sizes(s, order);
4460        return length;
4461}
4462
4463static ssize_t order_show(struct kmem_cache *s, char *buf)
4464{
4465        return sprintf(buf, "%d\n", oo_order(s->oo));
4466}
4467SLAB_ATTR(order);
4468
4469static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
4470{
4471        return sprintf(buf, "%lu\n", s->min_partial);
4472}
4473
4474static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
4475                                 size_t length)
4476{
4477        unsigned long min;
4478        int err;
4479
4480        err = kstrtoul(buf, 10, &min);
4481        if (err)
4482                return err;
4483
4484        set_min_partial(s, min);
4485        return length;
4486}
4487SLAB_ATTR(min_partial);
4488
4489static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
4490{
4491        return sprintf(buf, "%u\n", s->cpu_partial);
4492}
4493
4494static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
4495                                 size_t length)
4496{
4497        unsigned long objects;
4498        int err;
4499
4500        err = kstrtoul(buf, 10, &objects);
4501        if (err)
4502                return err;
4503        if (objects && !kmem_cache_has_cpu_partial(s))
4504                return -EINVAL;
4505
4506        s->cpu_partial = objects;
4507        flush_all(s);
4508        return length;
4509}
4510SLAB_ATTR(cpu_partial);
4511
4512static ssize_t ctor_show(struct kmem_cache *s, char *buf)
4513{
4514        if (!s->ctor)
4515                return 0;
4516        return sprintf(buf, "%pS\n", s->ctor);
4517}
4518SLAB_ATTR_RO(ctor);
4519
4520static ssize_t aliases_show(struct kmem_cache *s, char *buf)
4521{
4522        return sprintf(buf, "%d\n", s->refcount - 1);
4523}
4524SLAB_ATTR_RO(aliases);
4525
4526static ssize_t partial_show(struct kmem_cache *s, char *buf)
4527{
4528        return show_slab_objects(s, buf, SO_PARTIAL);
4529}
4530SLAB_ATTR_RO(partial);
4531
4532static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf)
4533{
4534        return show_slab_objects(s, buf, SO_CPU);
4535}
4536SLAB_ATTR_RO(cpu_slabs);
4537
4538static ssize_t objects_show(struct kmem_cache *s, char *buf)
4539{
4540        return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS);
4541}
4542SLAB_ATTR_RO(objects);
4543
4544static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
4545{
4546        return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS);
4547}
4548SLAB_ATTR_RO(objects_partial);
4549
4550static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
4551{
4552        int objects = 0;
4553        int pages = 0;
4554        int cpu;
4555        int len;
4556
4557        for_each_online_cpu(cpu) {
4558                struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial;
4559
4560                if (page) {
4561                        pages += page->pages;
4562                        objects += page->pobjects;
4563                }
4564        }
4565
4566        len = sprintf(buf, "%d(%d)", objects, pages);
4567
4568#ifdef CONFIG_SMP
4569        for_each_online_cpu(cpu) {
4570                struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial;
4571
4572                if (page && len < PAGE_SIZE - 20)
4573                        len += sprintf(buf + len, " C%d=%d(%d)", cpu,
4574                                page->pobjects, page->pages);
4575        }
4576#endif
4577        return len + sprintf(buf + len, "\n");
4578}
4579SLAB_ATTR_RO(slabs_cpu_partial);
4580
4581static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
4582{
4583        return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
4584}
4585
4586static ssize_t reclaim_account_store(struct kmem_cache *s,
4587                                const char *buf, size_t length)
4588{
4589        s->flags &= ~SLAB_RECLAIM_ACCOUNT;
4590        if (buf[0] == '1')
4591                s->flags |= SLAB_RECLAIM_ACCOUNT;
4592        return length;
4593}
4594SLAB_ATTR(reclaim_account);
4595
4596static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
4597{
4598        return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
4599}
4600SLAB_ATTR_RO(hwcache_align);
4601
4602#ifdef CONFIG_ZONE_DMA
4603static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
4604{
4605        return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
4606}
4607SLAB_ATTR_RO(cache_dma);
4608#endif
4609
4610static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
4611{
4612        return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU));
4613}
4614SLAB_ATTR_RO(destroy_by_rcu);
4615
4616static ssize_t reserved_show(struct kmem_cache *s, char *buf)
4617{
4618        return sprintf(buf, "%d\n", s->reserved);
4619}
4620SLAB_ATTR_RO(reserved);
4621
4622#ifdef CONFIG_SLUB_DEBUG
4623static ssize_t slabs_show(struct kmem_cache *s, char *buf)
4624{
4625        return show_slab_objects(s, buf, SO_ALL);
4626}
4627SLAB_ATTR_RO(slabs);
4628
4629static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
4630{
4631        return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
4632}
4633SLAB_ATTR_RO(total_objects);
4634
4635static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
4636{
4637        return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE));
4638}
4639
4640static ssize_t sanity_checks_store(struct kmem_cache *s,
4641                                const char *buf, size_t length)
4642{
4643        s->flags &= ~SLAB_DEBUG_FREE;
4644        if (buf[0] == '1') {
4645                s->flags &= ~__CMPXCHG_DOUBLE;
4646                s->flags |= SLAB_DEBUG_FREE;
4647        }
4648        return length;
4649}
4650SLAB_ATTR(sanity_checks);
4651
4652static ssize_t trace_show(struct kmem_cache *s, char *buf)
4653{
4654        return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
4655}
4656
4657static ssize_t trace_store(struct kmem_cache *s, const char *buf,
4658                                                        size_t length)
4659{
4660        s->flags &= ~SLAB_TRACE;
4661        if (buf[0] == '1') {
4662                s->flags &= ~__CMPXCHG_DOUBLE;
4663                s->flags |= SLAB_TRACE;
4664        }
4665        return length;
4666}
4667SLAB_ATTR(trace);
4668
4669static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
4670{
4671        return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
4672}
4673
4674static ssize_t red_zone_store(struct kmem_cache *s,
4675                                const char *buf, size_t length)
4676{
4677        if (any_slab_objects(s))
4678                return -EBUSY;
4679
4680        s->flags &= ~SLAB_RED_ZONE;
4681        if (buf[0] == '1') {
4682                s->flags &= ~__CMPXCHG_DOUBLE;
4683                s->flags |= SLAB_RED_ZONE;
4684        }
4685        calculate_sizes(s, -1);
4686        return length;
4687}
4688SLAB_ATTR(red_zone);
4689
4690static ssize_t poison_show(struct kmem_cache *s, char *buf)
4691{
4692        return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON));
4693}
4694
4695static ssize_t poison_store(struct kmem_cache *s,
4696                                const char *buf, size_t length)
4697{
4698        if (any_slab_objects(s))
4699                return -EBUSY;
4700
4701        s->flags &= ~SLAB_POISON;
4702        if (buf[0] == '1') {
4703                s->flags &= ~__CMPXCHG_DOUBLE;
4704                s->flags |= SLAB_POISON;
4705        }
4706        calculate_sizes(s, -1);
4707        return length;
4708}
4709SLAB_ATTR(poison);
4710
4711static ssize_t store_user_show(struct kmem_cache *s, char *buf)
4712{
4713        return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
4714}
4715
4716static ssize_t store_user_store(struct kmem_cache *s,
4717                                const char *buf, size_t length)
4718{
4719        if (any_slab_objects(s))
4720                return -EBUSY;
4721
4722        s->flags &= ~SLAB_STORE_USER;
4723        if (buf[0] == '1') {
4724                s->flags &= ~__CMPXCHG_DOUBLE;
4725                s->flags |= SLAB_STORE_USER;
4726        }
4727        calculate_sizes(s, -1);
4728        return length;
4729}
4730SLAB_ATTR(store_user);
4731
4732static ssize_t validate_show(struct kmem_cache *s, char *buf)
4733{
4734        return 0;
4735}
4736
4737static ssize_t validate_store(struct kmem_cache *s,
4738                        const char *buf, size_t length)
4739{
4740        int ret = -EINVAL;
4741
4742        if (buf[0] == '1') {
4743                ret = validate_slab_cache(s);
4744                if (ret >= 0)
4745                        ret = length;
4746        }
4747        return ret;
4748}
4749SLAB_ATTR(validate);
4750
4751static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
4752{
4753        if (!(s->flags & SLAB_STORE_USER))
4754                return -ENOSYS;
4755        return list_locations(s, buf, TRACK_ALLOC);
4756}
4757SLAB_ATTR_RO(alloc_calls);
4758
4759static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
4760{
4761        if (!(s->flags & SLAB_STORE_USER))
4762                return -ENOSYS;
4763        return list_locations(s, buf, TRACK_FREE);
4764}
4765SLAB_ATTR_RO(free_calls);
4766#endif /* CONFIG_SLUB_DEBUG */
4767
4768#ifdef CONFIG_FAILSLAB
4769static ssize_t failslab_show(struct kmem_cache *s, char *buf)
4770{
4771        return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
4772}
4773
4774static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
4775                                                        size_t length)
4776{
4777        s->flags &= ~SLAB_FAILSLAB;
4778        if (buf[0] == '1')
4779                s->flags |= SLAB_FAILSLAB;
4780        return length;
4781}
4782SLAB_ATTR(failslab);
4783#endif
4784
4785static ssize_t shrink_show(struct kmem_cache *s, char *buf)
4786{
4787        return 0;
4788}
4789
4790static ssize_t shrink_store(struct kmem_cache *s,
4791                        const char *buf, size_t length)
4792{
4793        if (buf[0] == '1') {
4794                int rc = kmem_cache_shrink(s);
4795
4796                if (rc)
4797                        return rc;
4798        } else
4799                return -EINVAL;
4800        return length;
4801}
4802SLAB_ATTR(shrink);
4803
4804#ifdef CONFIG_NUMA
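/*
 * remote_node_defrag_ratio controls how eagerly SLUB reuses partial slabs
 * from remote NUMA nodes instead of allocating a fresh slab locally: 0
 * disables remote defragmentation, 100 is most aggressive. The value is
 * stored internally multiplied by ten, hence the conversions below, and
 * values above 100 are silently ignored.
 */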
4805static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
4806{
4807        return sprintf(buf, "%d\n", s->remote_node_defrag_ratio / 10);
4808}
4809
4810static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
4811                                const char *buf, size_t length)
4812{
4813        unsigned long ratio;
4814        int err;
4815
4816        err = kstrtoul(buf, 10, &ratio);
4817        if (err)
4818                return err;
4819
4820        if (ratio <= 100)
4821                s->remote_node_defrag_ratio = ratio * 10;
4822
4823        return length;
4824}
4825SLAB_ATTR(remote_node_defrag_ratio);
4826#endif
4827
4828#ifdef CONFIG_SLUB_STATS
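/*
 * Each stat_item gets its own sysfs file. Reading sums the per cpu counter
 * over all online cpus and, on SMP, appends the non-zero per cpu
 * contributions, e.g. (illustrative output) "103720 C0=51203 C1=52517".
 * Writing '0' clears the counter on every online cpu; any other value is
 * rejected (see STAT_ATTR below).
 */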
4829static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
4830{
4831        unsigned long sum  = 0;
4832        int cpu;
4833        int len;
4834        int *data = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL);
4835
4836        if (!data)
4837                return -ENOMEM;
4838
4839        for_each_online_cpu(cpu) {
4840                unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si];
4841
4842                data[cpu] = x;
4843                sum += x;
4844        }
4845
4846        len = sprintf(buf, "%lu", sum);
4847
4848#ifdef CONFIG_SMP
4849        for_each_online_cpu(cpu) {
4850                if (data[cpu] && len < PAGE_SIZE - 20)
4851                        len += sprintf(buf + len, " C%d=%u", cpu, data[cpu]);
4852        }
4853#endif
4854        kfree(data);
4855        return len + sprintf(buf + len, "\n");
4856}
4857
4858static void clear_stat(struct kmem_cache *s, enum stat_item si)
4859{
4860        int cpu;
4861
4862        for_each_online_cpu(cpu)
4863                per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0;
4864}
4865
4866#define STAT_ATTR(si, text)                                     \
4867static ssize_t text##_show(struct kmem_cache *s, char *buf)     \
4868{                                                               \
4869        return show_stat(s, buf, si);                           \
4870}                                                               \
4871static ssize_t text##_store(struct kmem_cache *s,               \
4872                                const char *buf, size_t length) \
4873{                                                               \
4874        if (buf[0] != '0')                                      \
4875                return -EINVAL;                                 \
4876        clear_stat(s, si);                                      \
4877        return length;                                          \
4878}                                                               \
4879SLAB_ATTR(text);                                                \
4880
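/*
 * STAT_ATTR(si, text) expands to a text##_show()/text##_store() pair that
 * wraps show_stat()/clear_stat() for one stat_item and registers it with
 * SLAB_ATTR(). STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath), for instance,
 * backs the alloc_fastpath file.
 */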
4881STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
4882STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
4883STAT_ATTR(FREE_FASTPATH, free_fastpath);
4884STAT_ATTR(FREE_SLOWPATH, free_slowpath);
4885STAT_ATTR(FREE_FROZEN, free_frozen);
4886STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial);
4887STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
4888STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial);
4889STAT_ATTR(ALLOC_SLAB, alloc_slab);
4890STAT_ATTR(ALLOC_REFILL, alloc_refill);
4891STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch);
4892STAT_ATTR(FREE_SLAB, free_slab);
4893STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush);
4894STAT_ATTR(DEACTIVATE_FULL, deactivate_full);
4895STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
4896STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
4897STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
4898STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
4899STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass);
4900STAT_ATTR(ORDER_FALLBACK, order_fallback);
4901STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);
4902STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
4903STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
4904STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
4905STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
4906STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
4907#endif
4908
4909static struct attribute *slab_attrs[] = {
4910        &slab_size_attr.attr,
4911        &object_size_attr.attr,
4912        &objs_per_slab_attr.attr,
4913        &order_attr.attr,
4914        &min_partial_attr.attr,
4915        &cpu_partial_attr.attr,
4916        &objects_attr.attr,
4917        &objects_partial_attr.attr,
4918        &partial_attr.attr,
4919        &cpu_slabs_attr.attr,
4920        &ctor_attr.attr,
4921        &aliases_attr.attr,
4922        &align_attr.attr,
4923        &hwcache_align_attr.attr,
4924        &reclaim_account_attr.attr,
4925        &destroy_by_rcu_attr.attr,
4926        &shrink_attr.attr,
4927        &reserved_attr.attr,
4928        &slabs_cpu_partial_attr.attr,
4929#ifdef CONFIG_SLUB_DEBUG
4930        &total_objects_attr.attr,
4931        &slabs_attr.attr,
4932        &sanity_checks_attr.attr,
4933        &trace_attr.attr,
4934        &red_zone_attr.attr,
4935        &poison_attr.attr,
4936        &store_user_attr.attr,
4937        &validate_attr.attr,
4938        &alloc_calls_attr.attr,
4939        &free_calls_attr.attr,
4940#endif
4941#ifdef CONFIG_ZONE_DMA
4942        &cache_dma_attr.attr,
4943#endif
4944#ifdef CONFIG_NUMA
4945        &remote_node_defrag_ratio_attr.attr,
4946#endif
4947#ifdef CONFIG_SLUB_STATS
4948        &alloc_fastpath_attr.attr,
4949        &alloc_slowpath_attr.attr,
4950        &free_fastpath_attr.attr,
4951        &free_slowpath_attr.attr,
4952        &free_frozen_attr.attr,
4953        &free_add_partial_attr.attr,
4954        &free_remove_partial_attr.attr,
4955        &alloc_from_partial_attr.attr,
4956        &alloc_slab_attr.attr,
4957        &alloc_refill_attr.attr,
4958        &alloc_node_mismatch_attr.attr,
4959        &free_slab_attr.attr,
4960        &cpuslab_flush_attr.attr,
4961        &deactivate_full_attr.attr,
4962        &deactivate_empty_attr.attr,
4963        &deactivate_to_head_attr.attr,
4964        &deactivate_to_tail_attr.attr,
4965        &deactivate_remote_frees_attr.attr,
4966        &deactivate_bypass_attr.attr,
4967        &order_fallback_attr.attr,
4968        &cmpxchg_double_fail_attr.attr,
4969        &cmpxchg_double_cpu_fail_attr.attr,
4970        &cpu_partial_alloc_attr.attr,
4971        &cpu_partial_free_attr.attr,
4972        &cpu_partial_node_attr.attr,
4973        &cpu_partial_drain_attr.attr,
4974#endif
4975#ifdef CONFIG_FAILSLAB
4976        &failslab_attr.attr,
4977#endif
4978
4979        NULL
4980};
4981
4982static struct attribute_group slab_attr_group = {
4983        .attrs = slab_attrs,
4984};
4985
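/*
 * sysfs_ops glue: map the generic kobject/attribute pair back to the
 * kmem_cache and slab_attribute (to_slab()/to_slab_attr()) and dispatch to
 * the per attribute handler, returning -EIO if that direction is not
 * implemented. A successful store to a root cache is additionally
 * propagated to its memcg children, see the comment in slab_attr_store().
 */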
4986static ssize_t slab_attr_show(struct kobject *kobj,
4987                                struct attribute *attr,
4988                                char *buf)
4989{
4990        struct slab_attribute *attribute;
4991        struct kmem_cache *s;
4992        int err;
4993
4994        attribute = to_slab_attr(attr);
4995        s = to_slab(kobj);
4996
4997        if (!attribute->show)
4998                return -EIO;
4999
5000        err = attribute->show(s, buf);
5001
5002        return err;
5003}
5004
5005static ssize_t slab_attr_store(struct kobject *kobj,
5006                                struct attribute *attr,
5007                                const char *buf, size_t len)
5008{
5009        struct slab_attribute *attribute;
5010        struct kmem_cache *s;
5011        int err;
5012
5013        attribute = to_slab_attr(attr);
5014        s = to_slab(kobj);
5015
5016        if (!attribute->store)
5017                return -EIO;
5018
5019        err = attribute->store(s, buf, len);
5020#ifdef CONFIG_MEMCG_KMEM
5021        if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
5022                int i;
5023
5024                mutex_lock(&slab_mutex);
5025                if (s->max_attr_size < len)
5026                        s->max_attr_size = len;
5027
5028                /*
5029                 * This is a best effort propagation, so this function's
5030                 * return value is determined by the parent cache only.
5031                 * That is mainly because not all attributes have
5032                 * well-defined semantics for rollbacks - most of the
5033                 * actions have permanent effects.
5034                 *
5035                 * Returning the error value of a failing child would be
5036                 * ambiguous: a user seeing that error code could not tell
5037                 * anything about the state of the cache that was actually
5038                 * written to.
5039                 *
5040                 * Only the error code of the parent cache has well-defined
5041                 * semantics: the cache written to directly either failed
5042                 * or succeeded, and only if it succeeded do we loop through
5043                 * the descendants with best-effort propagation.
5044                 */
5045                for_each_memcg_cache_index(i) {
5046                        struct kmem_cache *c = cache_from_memcg_idx(s, i);
5047                        if (c)
5048                                attribute->store(c, buf, len);
5049                }
5050                mutex_unlock(&slab_mutex);
5051        }
5052#endif
5053        return err;
5054}
5055
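/*
 * When a memcg child cache is created after attributes of its root cache
 * have already been modified (the root's max_attr_size is non-zero),
 * replay every attribute that has both a show and a store method: read the
 * current value from the root cache and write it into the new cache so the
 * child starts out with the same tuning. A small stack buffer is used when
 * it is large enough, otherwise one page is allocated and reused.
 */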
5056static void memcg_propagate_slab_attrs(struct kmem_cache *s)
5057{
5058#ifdef CONFIG_MEMCG_KMEM
5059        int i;
5060        char *buffer = NULL;
5061
5062        if (is_root_cache(s))
5063                return;
5064
5065        /*
5066         * This means no attribute of the root cache was ever written, so
5067         * there is no point in copying default values around.
5068         */
5069        if (!s->memcg_params->root_cache->max_attr_size)
5070                return;
5071
5072        for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) {
5073                char mbuf[64];
5074                char *buf;
5075                struct slab_attribute *attr = to_slab_attr(slab_attrs[i]);
5076
5077                if (!attr || !attr->store || !attr->show)
5078                        continue;
5079
5080                /*
5081                 * It is really bad that we have to allocate here, so we do
5082                 * it only as a fallback. If we do allocate, though, we can
5083                 * simply keep using the allocated buffer until the end.
5084                 *
5085                 * Most slub attribute values are very small, but sysfs
5086                 * allows buffers of up to a page, so page-sized values can
5087                 * theoretically occur.
5088                 */
5089                if (buffer)
5090                        buf = buffer;
5091                else if (s->memcg_params->root_cache->max_attr_size < ARRAY_SIZE(mbuf))
5092                        buf = mbuf;
5093                else {
5094                        buffer = (char *) get_zeroed_page(GFP_KERNEL);
5095                        if (WARN_ON(!buffer))
5096                                continue;
5097                        buf = buffer;
5098                }
5099
5100                attr->show(s->memcg_params->root_cache, buf);
5101                attr->store(s, buf, strlen(buf));
5102        }
5103
5104        if (buffer)
5105                free_page((unsigned long)buffer);
5106#endif
5107}
5108
5109static const struct sysfs_ops slab_sysfs_ops = {
5110        .show = slab_attr_show,
5111        .store = slab_attr_store,
5112};
5113
5114static struct kobj_type slab_ktype = {
5115        .sysfs_ops = &slab_sysfs_ops,
5116};
5117
5118static int uevent_filter(struct kset *kset, struct kobject *kobj)
5119{
5120        struct kobj_type *ktype = get_ktype(kobj);
5121
5122        if (ktype == &slab_ktype)
5123                return 1;
5124        return 0;
5125}
5126
5127static const struct kset_uevent_ops slab_uevent_ops = {
5128        .filter = uevent_filter,
5129};
5130
5131static struct kset *slab_kset;
5132
5133#define ID_STR_LENGTH 64
5134
5135/* Create a unique string id for a slab cache:
5136 *
5137 * Format       :[flags-]size
5138 */
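/*
 * For instance, an ordinary tracked 192 byte cache would typically end up
 * as ":t-0000192" and a DMA cache of the same size as ":dt-0000192"
 * (sizes are illustrative). Non-root memcg caches additionally get a
 * "-%08d" suffix carrying their memcg cache id, see below.
 */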
5139static char *create_unique_id(struct kmem_cache *s)
5140{
5141        char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
5142        char *p = name;
5143
5144        BUG_ON(!name);
5145
5146        *p++ = ':';
5147        /*
5148         * First come the flags that affect slabcache operations. We will
5149         * only get here for aliasable slabs, so we do not need to support
5150         * too many flags. The flags here must cover all flags that
5151         * are matched during merging to guarantee that the id is
5152         * unique.
5153         */
5154        if (s->flags & SLAB_CACHE_DMA)
5155                *p++ = 'd';
5156        if (s->flags & SLAB_RECLAIM_ACCOUNT)
5157                *p++ = 'a';
5158        if (s->flags & SLAB_DEBUG_FREE)
5159                *p++ = 'F';
5160        if (!(s->flags & SLAB_NOTRACK))
5161                *p++ = 't';
5162        if (p != name + 1)
5163                *p++ = '-';
5164        p += sprintf(p, "%07d", s->size);
5165
5166#ifdef CONFIG_MEMCG_KMEM
5167        if (!is_root_cache(s))
5168                p += sprintf(p, "-%08d",
5169                                memcg_cache_id(s->memcg_params->memcg));
5170#endif
5171
5172        BUG_ON(p > name + ID_STR_LENGTH - 1);
5173        return name;
5174}
5175
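/*
 * Register a cache under /sys/kernel/slab. Unmergeable caches (typically
 * when debugging is enabled) appear under their own name; mergeable caches
 * get a kobject named after the unique ":flags-size" id plus a symlink
 * carrying the human readable name, created through sysfs_slab_alias().
 */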
5176static int sysfs_slab_add(struct kmem_cache *s)
5177{
5178        int err;
5179        const char *name;
5180        int unmergeable = slab_unmergeable(s);
5181
5182        if (unmergeable) {
5183                /*
5184                 * The slab cache can never be merged, so we can use its
5185                 * name directly. This is typically the case when debugging
5186                 * is enabled, and it lets us catch duplicate names easily.
5187                 */
5188                sysfs_remove_link(&slab_kset->kobj, s->name);
5189                name = s->name;
5190        } else {
5191                /*
5192                 * Create a unique name for the slab as a target
5193                 * for the symlinks.
5194                 */
5195                name = create_unique_id(s);
5196        }
5197
5198        s->kobj.kset = slab_kset;
5199        err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
5200        if (err) {
5201                kobject_put(&s->kobj);
5202                return err;
5203        }
5204
5205        err = sysfs_create_group(&s->kobj, &slab_attr_group);
5206        if (err) {
5207                kobject_del(&s->kobj);
5208                kobject_put(&s->kobj);
5209                return err;
5210        }
5211        kobject_uevent(&s->kobj, KOBJ_ADD);
5212        if (!unmergeable) {
5213                /* Setup first alias */
5214                sysfs_slab_alias(s, s->name);
5215                kfree(name);
5216        }
5217        return 0;
5218}
5219
5220static void sysfs_slab_remove(struct kmem_cache *s)
5221{
5222        if (slab_state < FULL)
5223                /*
5224                 * Sysfs has not been set up yet, so there is no need to
5225                 * remove the cache from sysfs.
5226                 */
5227                return;
5228
5229        kobject_uevent(&s->kobj, KOBJ_REMOVE);
5230        kobject_del(&s->kobj);
5231        kobject_put(&s->kobj);
5232}
5233
5234/*
5235 * Need to buffer aliases during bootup until sysfs becomes
5236 * available lest we lose that information.
5237 */
5238struct saved_alias {
5239        struct kmem_cache *s;
5240        const char *name;
5241        struct saved_alias *next;
5242};
5243
5244static struct saved_alias *alias_list;
5245
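/*
 * Create (or queue) a symlink mapping a cache's human readable name to its
 * merged sysfs entry. Once sysfs is up (slab_state == FULL) the link is
 * (re)created immediately; during early boot the request is queued on
 * alias_list and replayed later by slab_sysfs_init().
 */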
5246static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
5247{
5248        struct saved_alias *al;
5249
5250        if (slab_state == FULL) {
5251                /*
5252                 * If we have a leftover link then remove it.
5253                 */
5254                sysfs_remove_link(&slab_kset->kobj, name);
5255                return sysfs_create_link(&slab_kset->kobj, &s->kobj, name);
5256        }
5257
5258        al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
5259        if (!al)
5260                return -ENOMEM;
5261
5262        al->s = s;
5263        al->name = name;
5264        al->next = alias_list;
5265        alias_list = al;
5266        return 0;
5267}
5268
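/*
 * Late initcall: create the /sys/kernel/slab kset, switch slab_state to
 * FULL, register every cache created during boot and replay the aliases
 * buffered on alias_list above, so later caches can be added directly via
 * sysfs_slab_add(). Finally, resiliency_test() runs; it is a no-op unless
 * the SLUB resiliency self-test is compiled in.
 */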
5269static int __init slab_sysfs_init(void)
5270{
5271        struct kmem_cache *s;
5272        int err;
5273
5274        mutex_lock(&slab_mutex);
5275
5276        slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
5277        if (!slab_kset) {
5278                mutex_unlock(&slab_mutex);
5279                printk(KERN_ERR "Cannot register slab subsystem.\n");
5280                return -ENOSYS;
5281        }
5282
5283        slab_state = FULL;
5284
5285        list_for_each_entry(s, &slab_caches, list) {
5286                err = sysfs_slab_add(s);
5287                if (err)
5288                        printk(KERN_ERR "SLUB: Unable to add boot slab %s"
5289                                                " to sysfs\n", s->name);
5290        }
5291
5292        while (alias_list) {
5293                struct saved_alias *al = alias_list;
5294
5295                alias_list = alias_list->next;
5296                err = sysfs_slab_alias(al->s, al->name);
5297                if (err)
5298                        printk(KERN_ERR "SLUB: Unable to add boot slab alias"
5299                                        " %s to sysfs\n", al->name);
5300                kfree(al);
5301        }
5302
5303        mutex_unlock(&slab_mutex);
5304        resiliency_test();
5305        return 0;
5306}
5307
5308__initcall(slab_sysfs_init);
5309#endif /* CONFIG_SYSFS */
5310
5311/*
5312 * The /proc/slabinfo ABI
5313 */
5314#ifdef CONFIG_SLABINFO
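/*
 * Fill in the fields shared with the other allocators for /proc/slabinfo.
 * SLUB does not track the number of objects in use directly, so the free
 * count is derived by walking each node's partial list (count_partial());
 * free objects sitting in per cpu slabs are therefore reported as active.
 */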
5315void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
5316{
5317        unsigned long nr_slabs = 0;
5318        unsigned long nr_objs = 0;
5319        unsigned long nr_free = 0;
5320        int node;
5321
5322        for_each_online_node(node) {
5323                struct kmem_cache_node *n = get_node(s, node);
5324
5325                if (!n)
5326                        continue;
5327
5328                nr_slabs += node_nr_slabs(n);
5329                nr_objs += node_nr_objs(n);
5330                nr_free += count_partial(n, count_free);
5331        }
5332
5333        sinfo->active_objs = nr_objs - nr_free;
5334        sinfo->num_objs = nr_objs;
5335        sinfo->active_slabs = nr_slabs;
5336        sinfo->num_slabs = nr_slabs;
5337        sinfo->objects_per_slab = oo_objects(s->oo);
5338        sinfo->cache_order = oo_order(s->oo);
5339}
5340
5341void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s)
5342{
5343}
5344
5345ssize_t slabinfo_write(struct file *file, const char __user *buffer,
5346                       size_t count, loff_t *ppos)
5347{
5348        return -EIO;
5349}
5350#endif /* CONFIG_SLABINFO */
5351