linux/mm/slub.c
   1/*
   2 * SLUB: A slab allocator that limits cache line use instead of queuing
   3 * objects in per cpu and per node lists.
   4 *
   5 * The allocator synchronizes using per slab locks or atomic operations
   6 * and only uses a centralized lock to manage a pool of partial slabs.
   7 *
   8 * (C) 2007 SGI, Christoph Lameter
   9 * (C) 2011 Linux Foundation, Christoph Lameter
  10 */
  11
  12#include <linux/mm.h>
  13#include <linux/swap.h> /* struct reclaim_state */
  14#include <linux/module.h>
  15#include <linux/bit_spinlock.h>
  16#include <linux/interrupt.h>
  17#include <linux/bitops.h>
  18#include <linux/slab.h>
  19#include "slab.h"
  20#include <linux/proc_fs.h>
  21#include <linux/notifier.h>
  22#include <linux/seq_file.h>
  23#include <linux/kmemcheck.h>
  24#include <linux/cpu.h>
  25#include <linux/cpuset.h>
  26#include <linux/mempolicy.h>
  27#include <linux/ctype.h>
  28#include <linux/debugobjects.h>
  29#include <linux/kallsyms.h>
  30#include <linux/memory.h>
  31#include <linux/math64.h>
  32#include <linux/fault-inject.h>
  33#include <linux/stacktrace.h>
  34#include <linux/prefetch.h>
  35#include <linux/memcontrol.h>
  36
  37#include <trace/events/kmem.h>
  38
  39#include "internal.h"
  40
  41/*
  42 * Lock order:
  43 *   1. slab_mutex (Global Mutex)
  44 *   2. node->list_lock
  45 *   3. slab_lock(page) (Only on some arches and for debugging)
  46 *
  47 *   slab_mutex
  48 *
  49 *   The role of the slab_mutex is to protect the list of all the slabs
  50 *   and to synchronize major metadata changes to slab cache structures.
  51 *
  52 *   The slab_lock is only used for debugging and on arches that do not
  53 *   have the ability to do a cmpxchg_double. It only protects the second
  54 * double word in the page struct. This covers:
  55 *      A. page->freelist       -> List of free objects in a page
  56 *      B. page->counters       -> Counters of objects
  57 *      C. page->frozen         -> frozen state
  58 *
  59 *   If a slab is frozen then it is exempt from list management. It is not
  60 *   on any list. The processor that froze the slab is the one who can
  61 *   perform list operations on the page. Other processors may put objects
  62 *   onto the freelist but the processor that froze the slab is the only
  63 *   one that can retrieve the objects from the page's freelist.
  64 *
  65 *   The list_lock protects the partial and full list on each node and
  66 *   the partial slab counter. While it is held no slabs may be added to or
  67 *   removed from the lists, nor may the number of partial slabs be modified.
  68 *   (Note that the total number of slabs is an atomic value that may be
  69 *   modified without taking the list lock).
  70 *
  71 *   The list_lock is a centralized lock and thus we avoid taking it as
  72 *   much as possible. As long as SLUB does not have to handle partial
  73 *   slabs, operations can continue without any centralized lock. F.e.
  74 *   allocating a long series of objects that fill up slabs does not require
  75 *   the list lock.
  76 *   Interrupts are disabled during allocation and deallocation in order to
  77 *   make the slab allocator safe to use in the context of an irq. In addition
  78 *   interrupts are disabled to ensure that the processor does not change
  79 *   while handling per_cpu slabs, due to kernel preemption.
  80 *
  81 * SLUB assigns one slab for allocation to each processor.
  82 * Allocations only occur from these slabs called cpu slabs.
  83 *
  84 * Slabs with free elements are kept on a partial list and during regular
  85 * operations no list for full slabs is used. If an object in a full slab is
  86 * freed then the slab will show up again on the partial lists.
  87 * We track full slabs for debugging purposes though because otherwise we
  88 * cannot scan all objects.
  89 *
  90 * Slabs are freed when they become empty. Teardown and setup is
  91 * minimal so we rely on the page allocators per cpu caches for
  92 * fast frees and allocs.
  93 *
  94 * Overloading of page flags that are otherwise used for LRU management.
  95 *
  96 * PageActive           The slab is frozen and exempt from list processing.
  97 *                      This means that the slab is dedicated to a purpose
  98 *                      such as satisfying allocations for a specific
  99 *                      processor. Objects may be freed in the slab while
 100 *                      it is frozen but slab_free will then skip the usual
 101 *                      list operations. It is up to the processor holding
 102 *                      the slab to integrate the slab into the slab lists
 103 *                      when the slab is no longer needed.
 104 *
 105 *                      One use of this flag is to mark slabs that are
 106 *                      used for allocations. Then such a slab becomes a cpu
 107 *                      slab. The cpu slab may be equipped with an additional
 108 *                      freelist that allows lockless access to
 109 *                      free objects in addition to the regular freelist
 110 *                      that requires the slab lock.
 111 *
 112 * PageError            Slab requires special handling due to debug
 113 *                      options set. This moves slab handling out of
 114 *                      the fast path and disables lockless freelists.
 115 */
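
/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * the nesting implied by the lock order above.  A hypothetical walker takes
 * the per node list_lock first and the per slab slab_lock() second;
 * slab_mutex sits above both and is only taken for cache wide operations
 * such as creation and destruction, never inside these two.
 */
#if 0
static void example_walk_partial(struct kmem_cache *s, struct kmem_cache_node *n)
{
        unsigned long flags;
        struct page *page;

        spin_lock_irqsave(&n->list_lock, flags);        /* level 2 */
        list_for_each_entry(page, &n->partial, lru) {
                slab_lock(page);                        /* level 3 */
                /* page->freelist and page->counters are stable here */
                slab_unlock(page);
        }
        spin_unlock_irqrestore(&n->list_lock, flags);
}
#endif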
 116
 117static inline int kmem_cache_debug(struct kmem_cache *s)
 118{
 119#ifdef CONFIG_SLUB_DEBUG
 120        return unlikely(s->flags & SLAB_DEBUG_FLAGS);
 121#else
 122        return 0;
 123#endif
 124}
 125
 126static inline void *fixup_red_left(struct kmem_cache *s, void *p)
 127{
 128        if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE)
 129                p += s->red_left_pad;
 130
 131        return p;
 132}
 133
 134/*
 135 * Issues still to be resolved:
 136 *
 137 * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
 138 *
 139 * - Variable sizing of the per node arrays
 140 */
 141
 142/* Enable to test recovery from slab corruption on boot */
 143#undef SLUB_RESILIENCY_TEST
 144
 145/* Enable to log cmpxchg failures */
 146#undef SLUB_DEBUG_CMPXCHG
 147
 148/*
 149 * Minimum number of partial slabs. These will be left on the partial
 150 * lists even if they are empty. kmem_cache_shrink may reclaim them.
 151 */
 152#define MIN_PARTIAL 5
 153
 154/*
 155 * Maximum number of desirable partial slabs.
 156 * The existence of more partial slabs makes kmem_cache_shrink
 157 * sort the partial list by the number of objects in use.
 158 */
 159#define MAX_PARTIAL 10
 160
 161#define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \
 162                                SLAB_POISON | SLAB_STORE_USER)
 163
 164/*
 165 * These debug flags cannot use CMPXCHG because there might be consistency
 166 * issues when checking or reading debug information
 167 */
 168#define SLAB_NO_CMPXCHG (SLAB_CONSISTENCY_CHECKS | SLAB_STORE_USER | \
 169                                SLAB_TRACE)
 170
 171
 172/*
 173 * Debugging flags that require metadata to be stored in the slab.  These get
 174 * disabled when slub_debug=O is used and a cache's min order increases with
 175 * metadata.
 176 */
 177#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
 178
 179/*
 180 * Set of flags that will prevent slab merging
 181 */
 182#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
 183                SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
 184                SLAB_FAILSLAB)
 185
 186#define SLUB_MERGE_SAME (SLAB_CONSISTENCY_CHECKS | SLAB_RECLAIM_ACCOUNT | \
 187                SLAB_CACHE_DMA | SLAB_NOTRACK | SLAB_ACCOUNT)
 188
 189#define OO_SHIFT        16
 190#define OO_MASK         ((1 << OO_SHIFT) - 1)
 191#define MAX_OBJS_PER_PAGE       32767 /* since page.objects is u15 */
 192
 193/* Internal SLUB flags */
 194#define __OBJECT_POISON         0x80000000UL /* Poison object */
 195#define __CMPXCHG_DOUBLE        0x40000000UL /* Use cmpxchg_double */
 196
 197#ifdef CONFIG_SMP
 198static struct notifier_block slab_notifier;
 199#endif
 200
 201/*
 202 * Tracking user of a slab.
 203 */
 204#define TRACK_ADDRS_COUNT 16
 205struct track {
 206        unsigned long addr;     /* Called from address */
 207#ifdef CONFIG_STACKTRACE
 208        unsigned long addrs[TRACK_ADDRS_COUNT]; /* Caller stack trace */
 209#endif
 210        int cpu;                /* Was running on cpu */
 211        int pid;                /* Pid context */
 212        unsigned long when;     /* When did the operation occur */
 213};
 214
 215enum track_item { TRACK_ALLOC, TRACK_FREE };
 216
 217#ifdef CONFIG_SYSFS
 218static int sysfs_slab_add(struct kmem_cache *);
 219static int sysfs_slab_alias(struct kmem_cache *, const char *);
 220static void sysfs_slab_remove(struct kmem_cache *);
 221static void memcg_propagate_slab_attrs(struct kmem_cache *s);
 222#else
 223static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
 224static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
 225                                                        { return 0; }
 226static inline void sysfs_slab_remove(struct kmem_cache *s) { }
 227
 228static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { }
 229#endif
 230
 231static inline void stat(const struct kmem_cache *s, enum stat_item si)
 232{
 233#ifdef CONFIG_SLUB_STATS
 234        __this_cpu_inc(s->cpu_slab->stat[si]);
 235#endif
 236}
 237
 238/********************************************************************
 239 *                      Core slab cache functions
 240 *******************************************************************/
 241
 242static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
 243{
 244        return s->node[node];
 245}
 246
 247static inline void *get_freepointer(struct kmem_cache *s, void *object)
 248{
 249        return *(void **)(object + s->offset);
 250}
 251
 252static void prefetch_freepointer(const struct kmem_cache *s, void *object)
 253{
 254        prefetch(object + s->offset);
 255}
 256
 257static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
 258{
 259        void *p;
 260
 261        if (!debug_pagealloc_enabled())
 262                return get_freepointer(s, object);
 263
 264        probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p));
 265        return p;
 266}
 267
 268static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
 269{
 270        *(void **)(object + s->offset) = fp;
 271}
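
/*
 * Illustrative sketch (editorial addition): the free pointer of an object
 * is stored inside the object itself, s->offset bytes in (offset 0 when
 * the first word of a free object may be reused).  Chaining two free
 * objects by hand with the accessors above would look like this; the
 * helper name is hypothetical.
 */
#if 0
static void example_chain_two(struct kmem_cache *s, void *obj_a, void *obj_b)
{
        set_freepointer(s, obj_a, obj_b);       /* obj_a now points at obj_b */
        set_freepointer(s, obj_b, NULL);        /* obj_b terminates the chain */
        BUG_ON(get_freepointer(s, obj_a) != obj_b);
}
#endif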
 272
 273/* Loop over all objects in a slab */
 274#define for_each_object(__p, __s, __addr, __objects) \
 275        for (__p = fixup_red_left(__s, __addr); \
 276                __p < (__addr) + (__objects) * (__s)->size; \
 277                __p += (__s)->size)
 278
 279#define for_each_object_idx(__p, __idx, __s, __addr, __objects) \
 280        for (__p = fixup_red_left(__s, __addr), __idx = 1; \
 281                __idx <= __objects; \
 282                __p += (__s)->size, __idx++)
 283
 284/* Determine object index from a given position */
 285static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
 286{
 287        return (p - addr) / s->size;
 288}
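
/*
 * Worked example (editorial note): with s->size == 256 and an object at
 * addr + 1024, slab_index() returns 1024 / 256 == 4, i.e. the object is
 * the fifth one in the slab.
 */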
 289
 290static inline size_t slab_ksize(const struct kmem_cache *s)
 291{
 292#ifdef CONFIG_SLUB_DEBUG
 293        /*
 294         * Debugging requires use of the padding between object
 295         * and whatever may come after it.
 296         */
 297        if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
 298                return s->object_size;
 299
 300#endif
 301        /*
 302         * If we have the need to store the freelist pointer
 303         * back there or track user information then we can
 304         * only use the space before that information.
 305         */
 306        if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
 307                return s->inuse;
 308        /*
 309         * Else we can use all the padding etc for the allocation
 310         */
 311        return s->size;
 312}
 313
 314static inline int order_objects(int order, unsigned long size, int reserved)
 315{
 316        return ((PAGE_SIZE << order) - reserved) / size;
 317}
 318
 319static inline struct kmem_cache_order_objects oo_make(int order,
 320                unsigned long size, int reserved)
 321{
 322        struct kmem_cache_order_objects x = {
 323                (order << OO_SHIFT) + order_objects(order, size, reserved)
 324        };
 325
 326        return x;
 327}
 328
 329static inline int oo_order(struct kmem_cache_order_objects x)
 330{
 331        return x.x >> OO_SHIFT;
 332}
 333
 334static inline int oo_objects(struct kmem_cache_order_objects x)
 335{
 336        return x.x & OO_MASK;
 337}
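
/*
 * Worked example (editorial note): with 4K pages, order 3, size 256 and
 * no reserved bytes, order_objects() yields (4096 << 3) / 256 == 128, so
 * oo_make() packs x.x == (3 << OO_SHIFT) + 128 == 0x30080.  oo_order()
 * then recovers 3 and oo_objects() recovers 128.
 */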
 338
 339/*
 340 * Per slab locking using the pagelock
 341 */
 342static __always_inline void slab_lock(struct page *page)
 343{
 344        bit_spin_lock(PG_locked, &page->flags);
 345}
 346
 347static __always_inline void slab_unlock(struct page *page)
 348{
 349        __bit_spin_unlock(PG_locked, &page->flags);
 350}
 351
 352static inline void set_page_slub_counters(struct page *page, unsigned long counters_new)
 353{
 354        struct page tmp;
 355        tmp.counters = counters_new;
 356        /*
 357         * page->counters can cover frozen/inuse/objects as well
 358         * as page->_count.  If we assign to ->counters directly
 359         * we run the risk of losing updates to page->_count, so
 360         * be careful and only assign to the fields we need.
 361         */
 362        page->frozen  = tmp.frozen;
 363        page->inuse   = tmp.inuse;
 364        page->objects = tmp.objects;
 365}
 366
 367/* Interrupts must be disabled (for the fallback code to work right) */
 368static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
 369                void *freelist_old, unsigned long counters_old,
 370                void *freelist_new, unsigned long counters_new,
 371                const char *n)
 372{
 373        VM_BUG_ON(!irqs_disabled());
 374#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
 375    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
 376        if (s->flags & __CMPXCHG_DOUBLE) {
 377                if (cmpxchg_double(&page->freelist, &page->counters,
 378                        freelist_old, counters_old,
 379                        freelist_new, counters_new))
 380                        return 1;
 381        } else
 382#endif
 383        {
 384                slab_lock(page);
 385                if (page->freelist == freelist_old && page->counters == counters_old) {
 386                        page->freelist = freelist_new;
 387                        set_page_slub_counters(page, counters_new);
 388                        slab_unlock(page);
 389                        return 1;
 390                }
 391                slab_unlock(page);
 392        }
 393
 394        cpu_relax();
 395        stat(s, CMPXCHG_DOUBLE_FAIL);
 396
 397#ifdef SLUB_DEBUG_CMPXCHG
 398        printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name);
 399#endif
 400
 401        return 0;
 402}
 403
 404static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
 405                void *freelist_old, unsigned long counters_old,
 406                void *freelist_new, unsigned long counters_new,
 407                const char *n)
 408{
 409#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
 410    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
 411        if (s->flags & __CMPXCHG_DOUBLE) {
 412                if (cmpxchg_double(&page->freelist, &page->counters,
 413                        freelist_old, counters_old,
 414                        freelist_new, counters_new))
 415                        return 1;
 416        } else
 417#endif
 418        {
 419                unsigned long flags;
 420
 421                local_irq_save(flags);
 422                slab_lock(page);
 423                if (page->freelist == freelist_old && page->counters == counters_old) {
 424                        page->freelist = freelist_new;
 425                        set_page_slub_counters(page, counters_new);
 426                        slab_unlock(page);
 427                        local_irq_restore(flags);
 428                        return 1;
 429                }
 430                slab_unlock(page);
 431                local_irq_restore(flags);
 432        }
 433
 434        cpu_relax();
 435        stat(s, CMPXCHG_DOUBLE_FAIL);
 436
 437#ifdef SLUB_DEBUG_CMPXCHG
 438        printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name);
 439#endif
 440
 441        return 0;
 442}
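
/*
 * Illustrative sketch (editorial addition, list management elided): the
 * usual calling pattern for cmpxchg_double_slab().  A snapshot of
 * freelist and counters is taken, the new values are prepared on a
 * private struct page copy, and the update is retried until both words
 * are swapped atomically.  The helper name is hypothetical.
 */
#if 0
static void example_push_free_object(struct kmem_cache *s, struct page *page,
                                        void *object)
{
        void *prior;
        unsigned long counters;
        struct page new;

        do {
                prior = page->freelist;
                counters = page->counters;
                set_freepointer(s, object, prior);
                new.counters = counters;
                new.inuse--;
        } while (!cmpxchg_double_slab(s, page,
                        prior, counters,
                        object, new.counters,
                        "example_push_free_object"));
}
#endif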
 443
 444#ifdef CONFIG_SLUB_DEBUG
 445/*
 446 * Build a bitmap of the free objects on a page; callers treat the bits
 447 * left clear as objects in use.
 448 * The node's list_lock must be held to guarantee that the page does
 449 * not vanish from under us.
 450 */
 451static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
 452{
 453        void *p;
 454        void *addr = page_address(page);
 455
 456        for (p = page->freelist; p; p = get_freepointer(s, p))
 457                set_bit(slab_index(p, s, addr), map);
 458}
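
/*
 * Illustrative sketch (editorial addition): how a debug walker would
 * typically consume the map built by get_map() - clear the bitmap, mark
 * the free objects, then treat every clear bit as an allocated object.
 * The node's list_lock must be held, as noted above; the helper name is
 * hypothetical.
 */
#if 0
static int example_count_allocated(struct kmem_cache *s, struct page *page,
                                        unsigned long *map)
{
        void *addr = page_address(page);
        void *p;
        int in_use = 0;

        bitmap_zero(map, page->objects);
        get_map(s, page, map);
        for_each_object(p, s, addr, page->objects)
                if (!test_bit(slab_index(p, s, addr), map))
                        in_use++;       /* p is currently allocated */
        return in_use;
}
#endif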
 459
 460static inline int size_from_object(struct kmem_cache *s)
 461{
 462        if (s->flags & SLAB_RED_ZONE)
 463                return s->size - s->red_left_pad;
 464
 465        return s->size;
 466}
 467
 468static inline void *restore_red_left(struct kmem_cache *s, void *p)
 469{
 470        if (s->flags & SLAB_RED_ZONE)
 471                p -= s->red_left_pad;
 472
 473        return p;
 474}
 475
 476/*
 477 * Debug settings:
 478 */
 479#ifdef CONFIG_SLUB_DEBUG_ON
 480static int slub_debug = DEBUG_DEFAULT_FLAGS;
 481#else
 482static int slub_debug;
 483#endif
 484
 485static char *slub_debug_slabs;
 486static int disable_higher_order_debug;
 487
 488/*
 489 * Object debugging
 490 */
 491
 492/* Verify that a pointer has an address that is valid within a slab page */
 493static inline int check_valid_pointer(struct kmem_cache *s,
 494                                struct page *page, void *object)
 495{
 496        void *base;
 497
 498        if (!object)
 499                return 1;
 500
 501        base = page_address(page);
 502        object = restore_red_left(s, object);
 503        if (object < base || object >= base + page->objects * s->size ||
 504                (object - base) % s->size) {
 505                return 0;
 506        }
 507
 508        return 1;
 509}
 510
 511static void print_section(char *text, u8 *addr, unsigned int length)
 512{
 513        print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr,
 514                        length, 1);
 515}
 516
 517static struct track *get_track(struct kmem_cache *s, void *object,
 518        enum track_item alloc)
 519{
 520        struct track *p;
 521
 522        if (s->offset)
 523                p = object + s->offset + sizeof(void *);
 524        else
 525                p = object + s->inuse;
 526
 527        return p + alloc;
 528}
 529
 530static void set_track(struct kmem_cache *s, void *object,
 531                        enum track_item alloc, unsigned long addr)
 532{
 533        struct track *p = get_track(s, object, alloc);
 534
 535        if (addr) {
 536#ifdef CONFIG_STACKTRACE
 537                struct stack_trace trace;
 538                int i;
 539
 540                trace.nr_entries = 0;
 541                trace.max_entries = TRACK_ADDRS_COUNT;
 542                trace.entries = p->addrs;
 543                trace.skip = 3;
 544                save_stack_trace(&trace);
 545
 546                /* See rant in lockdep.c */
 547                if (trace.nr_entries != 0 &&
 548                    trace.entries[trace.nr_entries - 1] == ULONG_MAX)
 549                        trace.nr_entries--;
 550
 551                for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++)
 552                        p->addrs[i] = 0;
 553#endif
 554                p->addr = addr;
 555                p->cpu = smp_processor_id();
 556                p->pid = current->pid;
 557                p->when = jiffies;
 558        } else
 559                memset(p, 0, sizeof(struct track));
 560}
 561
 562static void init_tracking(struct kmem_cache *s, void *object)
 563{
 564        if (!(s->flags & SLAB_STORE_USER))
 565                return;
 566
 567        set_track(s, object, TRACK_FREE, 0UL);
 568        set_track(s, object, TRACK_ALLOC, 0UL);
 569}
 570
 571static void print_track(const char *s, struct track *t)
 572{
 573        if (!t->addr)
 574                return;
 575
 576        printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
 577                s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid);
 578#ifdef CONFIG_STACKTRACE
 579        {
 580                int i;
 581                for (i = 0; i < TRACK_ADDRS_COUNT; i++)
 582                        if (t->addrs[i])
 583                                printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]);
 584                        else
 585                                break;
 586        }
 587#endif
 588}
 589
 590static void print_tracking(struct kmem_cache *s, void *object)
 591{
 592        if (!(s->flags & SLAB_STORE_USER))
 593                return;
 594
 595        print_track("Allocated", get_track(s, object, TRACK_ALLOC));
 596        print_track("Freed", get_track(s, object, TRACK_FREE));
 597}
 598
 599static void print_page_info(struct page *page)
 600{
 601        printk(KERN_ERR "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n",
 602                page, page->objects, page->inuse, page->freelist, page->flags);
 603
 604}
 605
 606static void slab_bug(struct kmem_cache *s, char *fmt, ...)
 607{
 608        va_list args;
 609        char buf[100];
 610
 611        va_start(args, fmt);
 612        vsnprintf(buf, sizeof(buf), fmt, args);
 613        va_end(args);
 614        printk(KERN_ERR "========================================"
 615                        "=====================================\n");
 616        printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf);
 617        printk(KERN_ERR "----------------------------------------"
 618                        "-------------------------------------\n\n");
 619
 620        add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
 621}
 622
 623static void slab_fix(struct kmem_cache *s, char *fmt, ...)
 624{
 625        va_list args;
 626        char buf[100];
 627
 628        va_start(args, fmt);
 629        vsnprintf(buf, sizeof(buf), fmt, args);
 630        va_end(args);
 631        printk(KERN_ERR "FIX %s: %s\n", s->name, buf);
 632}
 633
 634static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
 635{
 636        unsigned int off;       /* Offset of last byte */
 637        u8 *addr = page_address(page);
 638
 639        print_tracking(s, p);
 640
 641        print_page_info(page);
 642
 643        printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
 644                        p, p - addr, get_freepointer(s, p));
 645
 646        if (s->flags & SLAB_RED_ZONE)
 647                print_section("Redzone ", p - s->red_left_pad, s->red_left_pad);
 648        else if (p > addr + 16)
 649                print_section("Bytes b4 ", p - 16, 16);
 650
 651        print_section("Object ", p, min_t(unsigned long, s->object_size,
 652                                PAGE_SIZE));
 653        if (s->flags & SLAB_RED_ZONE)
 654                print_section("Redzone ", p + s->object_size,
 655                        s->inuse - s->object_size);
 656
 657        if (s->offset)
 658                off = s->offset + sizeof(void *);
 659        else
 660                off = s->inuse;
 661
 662        if (s->flags & SLAB_STORE_USER)
 663                off += 2 * sizeof(struct track);
 664
 665        if (off != size_from_object(s))
 666                /* Beginning of the filler is the free pointer */
 667                print_section("Padding ", p + off, size_from_object(s) - off);
 668
 669        dump_stack();
 670}
 671
 672static void object_err(struct kmem_cache *s, struct page *page,
 673                        u8 *object, char *reason)
 674{
 675        slab_bug(s, "%s", reason);
 676        print_trailer(s, page, object);
 677}
 678
 679static void slab_err(struct kmem_cache *s, struct page *page, const char *fmt, ...)
 680{
 681        va_list args;
 682        char buf[100];
 683
 684        va_start(args, fmt);
 685        vsnprintf(buf, sizeof(buf), fmt, args);
 686        va_end(args);
 687        slab_bug(s, "%s", buf);
 688        print_page_info(page);
 689        dump_stack();
 690}
 691
 692static void init_object(struct kmem_cache *s, void *object, u8 val)
 693{
 694        u8 *p = object;
 695
 696        if (s->flags & SLAB_RED_ZONE)
 697                memset(p - s->red_left_pad, val, s->red_left_pad);
 698
 699        if (s->flags & __OBJECT_POISON) {
 700                memset(p, POISON_FREE, s->object_size - 1);
 701                p[s->object_size - 1] = POISON_END;
 702        }
 703
 704        if (s->flags & SLAB_RED_ZONE)
 705                memset(p + s->object_size, val, s->inuse - s->object_size);
 706}
 707
 708static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
 709                                                void *from, void *to)
 710{
 711        slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data);
 712        memset(from, data, to - from);
 713}
 714
 715static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
 716                        u8 *object, char *what,
 717                        u8 *start, unsigned int value, unsigned int bytes)
 718{
 719        u8 *fault;
 720        u8 *end;
 721
 722        fault = memchr_inv(start, value, bytes);
 723        if (!fault)
 724                return 1;
 725
 726        end = start + bytes;
 727        while (end > fault && end[-1] == value)
 728                end--;
 729
 730        slab_bug(s, "%s overwritten", what);
 731        printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",
 732                                        fault, end - 1, fault[0], value);
 733        print_trailer(s, page, object);
 734
 735        restore_bytes(s, what, value, fault, end);
 736        return 0;
 737}
 738
 739/*
 740 * Object layout:
 741 *
 742 * object address
 743 *      Bytes of the object to be managed.
 744 *      If the freepointer may overlay the object then the free
 745 *      pointer is the first word of the object.
 746 *
 747 *      Poisoning uses 0x6b (POISON_FREE) and the last byte is
 748 *      0xa5 (POISON_END)
 749 *
 750 * object + s->object_size
 751 *      Padding to reach word boundary. This is also used for Redzoning.
 752 *      Padding is extended by another word if Redzoning is enabled and
 753 *      object_size == inuse.
 754 *
 755 *      We fill with 0xbb (RED_INACTIVE) for inactive objects and with
 756 *      0xcc (RED_ACTIVE) for objects in use.
 757 *
 758 * object + s->inuse
 759 *      Meta data starts here.
 760 *
 761 *      A. Free pointer (if we cannot overwrite object on free)
 762 *      B. Tracking data for SLAB_STORE_USER
 763 *      C. Padding to reach required alignment boundary or at minimum
 764 *              one word if debugging is on to be able to detect writes
 765 *              before the word boundary.
 766 *
 767 *      Padding is done using 0x5a (POISON_INUSE)
 768 *
 769 * object + s->size
 770 *      Nothing is used beyond s->size.
 771 *
 772 * If slabcaches are merged then the object_size and inuse boundaries are mostly
 773 * ignored; therefore no slab options that rely on these boundaries
 774 * may be used with merged slabcaches.
 775 */
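
/*
 * Worked example (editorial, approximate): on 64 bit with
 * SLAB_RED_ZONE | SLAB_STORE_USER, no poisoning, no constructor and an
 * object_size of 24, the free pointer may overlay the object (s->offset
 * == 0), the right red zone extends the object by one word so s->inuse
 * == 32, two struct track records (24 bytes each without
 * CONFIG_STACKTRACE) follow at offsets 32 and 56, and one word of left
 * red zone precedes the object, for an s->size of roughly 88 bytes.
 */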
 776
 777static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
 778{
 779        unsigned long off = s->inuse;   /* The end of info */
 780
 781        if (s->offset)
 782                /* Freepointer is placed after the object. */
 783                off += sizeof(void *);
 784
 785        if (s->flags & SLAB_STORE_USER)
 786                /* We also have user information there */
 787                off += 2 * sizeof(struct track);
 788
 789        if (size_from_object(s) == off)
 790                return 1;
 791
 792        return check_bytes_and_report(s, page, p, "Object padding",
 793                        p + off, POISON_INUSE, size_from_object(s) - off);
 794}
 795
 796/* Check the pad bytes at the end of a slab page */
 797static int slab_pad_check(struct kmem_cache *s, struct page *page)
 798{
 799        u8 *start;
 800        u8 *fault;
 801        u8 *end;
 802        int length;
 803        int remainder;
 804
 805        if (!(s->flags & SLAB_POISON))
 806                return 1;
 807
 808        start = page_address(page);
 809        length = (PAGE_SIZE << compound_order(page)) - s->reserved;
 810        end = start + length;
 811        remainder = length % s->size;
 812        if (!remainder)
 813                return 1;
 814
 815        fault = memchr_inv(end - remainder, POISON_INUSE, remainder);
 816        if (!fault)
 817                return 1;
 818        while (end > fault && end[-1] == POISON_INUSE)
 819                end--;
 820
 821        slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
 822        print_section("Padding ", end - remainder, remainder);
 823
 824        restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end);
 825        return 0;
 826}
 827
 828static int check_object(struct kmem_cache *s, struct page *page,
 829                                        void *object, u8 val)
 830{
 831        u8 *p = object;
 832        u8 *endobject = object + s->object_size;
 833
 834        if (s->flags & SLAB_RED_ZONE) {
 835                if (!check_bytes_and_report(s, page, object, "Redzone",
 836                        object - s->red_left_pad, val, s->red_left_pad))
 837                        return 0;
 838
 839                if (!check_bytes_and_report(s, page, object, "Redzone",
 840                        endobject, val, s->inuse - s->object_size))
 841                        return 0;
 842        } else {
 843                if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) {
 844                        check_bytes_and_report(s, page, p, "Alignment padding",
 845                                endobject, POISON_INUSE, s->inuse - s->object_size);
 846                }
 847        }
 848
 849        if (s->flags & SLAB_POISON) {
 850                if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
 851                        (!check_bytes_and_report(s, page, p, "Poison", p,
 852                                        POISON_FREE, s->object_size - 1) ||
 853                         !check_bytes_and_report(s, page, p, "Poison",
 854                                p + s->object_size - 1, POISON_END, 1)))
 855                        return 0;
 856                /*
 857                 * check_pad_bytes cleans up on its own.
 858                 */
 859                check_pad_bytes(s, page, p);
 860        }
 861
 862        if (!s->offset && val == SLUB_RED_ACTIVE)
 863                /*
 864                 * Object and freepointer overlap. Cannot check
 865                 * freepointer while object is allocated.
 866                 */
 867                return 1;
 868
 869        /* Check free pointer validity */
 870        if (!check_valid_pointer(s, page, get_freepointer(s, p))) {
 871                object_err(s, page, p, "Freepointer corrupt");
 872                /*
 873                 * No choice but to zap it and thus lose the remainder
 874                 * of the free objects in this slab. May cause
 875                 * another error because the object count is now wrong.
 876                 */
 877                set_freepointer(s, p, NULL);
 878                return 0;
 879        }
 880        return 1;
 881}
 882
 883static int check_slab(struct kmem_cache *s, struct page *page)
 884{
 885        int maxobj;
 886
 887        VM_BUG_ON(!irqs_disabled());
 888
 889        if (!PageSlab(page)) {
 890                slab_err(s, page, "Not a valid slab page");
 891                return 0;
 892        }
 893
 894        maxobj = order_objects(compound_order(page), s->size, s->reserved);
 895        if (page->objects > maxobj) {
 896                slab_err(s, page, "objects %u > max %u",
 897                        page->objects, maxobj);
 898                return 0;
 899        }
 900        if (page->inuse > page->objects) {
 901                slab_err(s, page, "inuse %u > max %u",
 902                        page->inuse, page->objects);
 903                return 0;
 904        }
 905        /* Slab_pad_check fixes things up after itself */
 906        slab_pad_check(s, page);
 907        return 1;
 908}
 909
 910/*
 911 * Determine if a certain object on a page is on the freelist. Must hold the
 912 * slab lock to guarantee that the chains are in a consistent state.
 913 */
 914static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
 915{
 916        int nr = 0;
 917        void *fp;
 918        void *object = NULL;
 919        unsigned long max_objects;
 920
 921        fp = page->freelist;
 922        while (fp && nr <= page->objects) {
 923                if (fp == search)
 924                        return 1;
 925                if (!check_valid_pointer(s, page, fp)) {
 926                        if (object) {
 927                                object_err(s, page, object,
 928                                        "Freechain corrupt");
 929                                set_freepointer(s, object, NULL);
 930                                break;
 931                        } else {
 932                                slab_err(s, page, "Freepointer corrupt");
 933                                page->freelist = NULL;
 934                                page->inuse = page->objects;
 935                                slab_fix(s, "Freelist cleared");
 936                                return 0;
 937                        }
 938                        break;
 939                }
 940                object = fp;
 941                fp = get_freepointer(s, object);
 942                nr++;
 943        }
 944
 945        max_objects = order_objects(compound_order(page), s->size, s->reserved);
 946        if (max_objects > MAX_OBJS_PER_PAGE)
 947                max_objects = MAX_OBJS_PER_PAGE;
 948
 949        if (page->objects != max_objects) {
 950                slab_err(s, page, "Wrong number of objects. Found %d but "
 951                        "should be %d", page->objects, max_objects);
 952                page->objects = max_objects;
 953                slab_fix(s, "Number of objects adjusted.");
 954        }
 955        if (page->inuse != page->objects - nr) {
 956                slab_err(s, page, "Wrong object count. Counter is %d but "
 957                        "counted were %d", page->inuse, page->objects - nr);
 958                page->inuse = page->objects - nr;
 959                slab_fix(s, "Object count adjusted.");
 960        }
 961        return search == NULL;
 962}
 963
 964static void trace(struct kmem_cache *s, struct page *page, void *object,
 965                                                                int alloc)
 966{
 967        if (s->flags & SLAB_TRACE) {
 968                printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
 969                        s->name,
 970                        alloc ? "alloc" : "free",
 971                        object, page->inuse,
 972                        page->freelist);
 973
 974                if (!alloc)
 975                        print_section("Object ", (void *)object, s->object_size);
 976
 977                dump_stack();
 978        }
 979}
 980
 981/*
 982 * Hooks for other subsystems that check memory allocations. In a typical
 983 * production configuration these hooks should all produce no code at all.
 984 */
 985static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
 986{
 987        flags &= gfp_allowed_mask;
 988        lockdep_trace_alloc(flags);
 989        might_sleep_if(flags & __GFP_WAIT);
 990
 991        return should_failslab(s->object_size, flags, s->flags);
 992}
 993
 994static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
 995                                        size_t size, void **p)
 996{
 997        size_t i;
 998
 999        flags &= gfp_allowed_mask;
1000        for (i = 0; i < size; i++) {
1001                void *object = p[i];
1002
1003                kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
1004                kmemleak_alloc_recursive(object, s->object_size, 1,
1005                                         s->flags, flags);
1006        }
1007}
1008
1009static inline void slab_free_hook(struct kmem_cache *s, void *x)
1010{
1011        kmemleak_free_recursive(x, s->flags);
1012
1013        /*
1014         * The trouble is that we may no longer disable interrupts in the fast
1015         * path, so in order to make the debug calls that expect irqs to be
1016         * disabled we need to disable interrupts temporarily.
1017         */
1018#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP)
1019        {
1020                unsigned long flags;
1021
1022                local_irq_save(flags);
1023                kmemcheck_slab_free(s, x, s->object_size);
1024                debug_check_no_locks_freed(x, s->object_size);
1025                local_irq_restore(flags);
1026        }
1027#endif
1028        if (!(s->flags & SLAB_DEBUG_OBJECTS))
1029                debug_check_no_obj_freed(x, s->object_size);
1030}
1031
1032/*
1033 * Tracking of fully allocated slabs for debugging purposes.
1034 *
1035 * list_lock must be held.
1036 */
1037static void add_full(struct kmem_cache *s,
1038        struct kmem_cache_node *n, struct page *page)
1039{
1040        if (!(s->flags & SLAB_STORE_USER))
1041                return;
1042
1043        list_add(&page->lru, &n->full);
1044}
1045
1046/*
1047 * list_lock must be held.
1048 */
1049static void remove_full(struct kmem_cache *s, struct page *page)
1050{
1051        if (!(s->flags & SLAB_STORE_USER))
1052                return;
1053
1054        list_del(&page->lru);
1055}
1056
1057/* Tracking of the number of slabs for debugging purposes */
1058static inline unsigned long slabs_node(struct kmem_cache *s, int node)
1059{
1060        struct kmem_cache_node *n = get_node(s, node);
1061
1062        return atomic_long_read(&n->nr_slabs);
1063}
1064
1065static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
1066{
1067        return atomic_long_read(&n->nr_slabs);
1068}
1069
1070static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
1071{
1072        struct kmem_cache_node *n = get_node(s, node);
1073
1074        /*
1075         * May be called early in order to allocate a slab for the
1076         * kmem_cache_node structure. Solve the chicken-egg
1077         * dilemma by deferring the increment of the count during
1078         * bootstrap (see early_kmem_cache_node_alloc).
1079         */
1080        if (likely(n)) {
1081                atomic_long_inc(&n->nr_slabs);
1082                atomic_long_add(objects, &n->total_objects);
1083        }
1084}
1085static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
1086{
1087        struct kmem_cache_node *n = get_node(s, node);
1088
1089        atomic_long_dec(&n->nr_slabs);
1090        atomic_long_sub(objects, &n->total_objects);
1091}
1092
1093/* Object debug checks for alloc/free paths */
1094static void setup_object_debug(struct kmem_cache *s, struct page *page,
1095                                                                void *object)
1096{
1097        if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
1098                return;
1099
1100        init_object(s, object, SLUB_RED_INACTIVE);
1101        init_tracking(s, object);
1102}
1103
1104static inline int alloc_consistency_checks(struct kmem_cache *s, struct page *page,
1105                                        void *object)
1106{
1107        if (!check_slab(s, page))
1108                return 0;
1109
1110        if (!check_valid_pointer(s, page, object)) {
1111                object_err(s, page, object, "Freelist Pointer check fails");
1112                return 0;
1113        }
1114
1115        if (!check_object(s, page, object, SLUB_RED_INACTIVE))
1116                return 0;
1117
1118        return 1;
1119}
1120
1121static noinline int alloc_debug_processing(struct kmem_cache *s,
1122                                        struct page *page,
1123                                        void *object, unsigned long addr)
1124{
1125        if (s->flags & SLAB_CONSISTENCY_CHECKS) {
1126                if (!alloc_consistency_checks(s, page, object))
1127                        goto bad;
1128        }
1129
1130        /* Success: perform special debug activities for allocs */
1131        if (s->flags & SLAB_STORE_USER)
1132                set_track(s, object, TRACK_ALLOC, addr);
1133        trace(s, page, object, 1);
1134        init_object(s, object, SLUB_RED_ACTIVE);
1135        return 1;
1136
1137bad:
1138        if (PageSlab(page)) {
1139                /*
1140                 * If this is a slab page then let's do the best we can
1141                 * to avoid issues in the future. Marking all objects
1142                 * as used avoids touching the remaining objects.
1143                 */
1144                slab_fix(s, "Marking all objects used");
1145                page->inuse = page->objects;
1146                page->freelist = NULL;
1147        }
1148        return 0;
1149}
1150
1151static inline int free_consistency_checks(struct kmem_cache *s,
1152                struct page *page, void *object, unsigned long addr)
1153{
1154        if (!check_valid_pointer(s, page, object)) {
1155                slab_err(s, page, "Invalid object pointer 0x%p", object);
1156                return 0;
1157        }
1158
1159        if (on_freelist(s, page, object)) {
1160                object_err(s, page, object, "Object already free");
1161                return 0;
1162        }
1163
1164        if (!check_object(s, page, object, SLUB_RED_ACTIVE))
1165                return 0;
1166
1167        if (unlikely(s != page->slab_cache)) {
1168                if (!PageSlab(page)) {
1169                        slab_err(s, page, "Attempt to free object(0x%p) "
1170                                "outside of slab", object);
1171                } else if (!page->slab_cache) {
1172                        printk(KERN_ERR
1173                                "SLUB <none>: no slab for object 0x%p.\n",
1174                                                object);
1175                        dump_stack();
1176                } else
1177                        object_err(s, page, object,
1178                                        "page slab pointer corrupt.");
1179                return 0;
1180        }
1181        return 1;
1182}
1183
1184/* Supports checking bulk free of a constructed freelist */
1185static noinline int free_debug_processing(
1186        struct kmem_cache *s, struct page *page,
1187        void *head, void *tail, int bulk_cnt,
1188        unsigned long addr)
1189{
1190        struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1191        void *object = head;
1192        int cnt = 0;
1193        unsigned long uninitialized_var(flags);
1194        int ret = 0;
1195
1196        spin_lock_irqsave(&n->list_lock, flags);
1197        slab_lock(page);
1198
1199        if (s->flags & SLAB_CONSISTENCY_CHECKS) {
1200                if (!check_slab(s, page))
1201                        goto out;
1202        }
1203
1204next_object:
1205        cnt++;
1206
1207        if (s->flags & SLAB_CONSISTENCY_CHECKS) {
1208                if (!free_consistency_checks(s, page, object, addr))
1209                        goto out;
1210        }
1211
1212        if (s->flags & SLAB_STORE_USER)
1213                set_track(s, object, TRACK_FREE, addr);
1214        trace(s, page, object, 0);
1215        /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
1216        init_object(s, object, SLUB_RED_INACTIVE);
1217
1218        /* Reached end of constructed freelist yet? */
1219        if (object != tail) {
1220                object = get_freepointer(s, object);
1221                goto next_object;
1222        }
1223        ret = 1;
1224
1225out:
1226        if (cnt != bulk_cnt)
1227                slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n",
1228                         bulk_cnt, cnt);
1229
1230        slab_unlock(page);
1231        spin_unlock_irqrestore(&n->list_lock, flags);
1232        if (!ret)
1233                slab_fix(s, "Object at 0x%p not freed", object);
1234        return ret;
1235}
1236
1237static int __init setup_slub_debug(char *str)
1238{
1239        slub_debug = DEBUG_DEFAULT_FLAGS;
1240        if (*str++ != '=' || !*str)
1241                /*
1242                 * No options specified. Switch on full debugging.
1243                 */
1244                goto out;
1245
1246        if (*str == ',')
1247                /*
1248                 * No options but restriction on slabs. This means full
1249                 * debugging for slabs matching a pattern.
1250                 */
1251                goto check_slabs;
1252
1253        if (tolower(*str) == 'o') {
1254                /*
1255                 * Avoid enabling debugging on caches if their minimum order
1256                 * would increase as a result.
1257                 */
1258                disable_higher_order_debug = 1;
1259                goto out;
1260        }
1261
1262        slub_debug = 0;
1263        if (*str == '-')
1264                /*
1265                 * Switch off all debugging measures.
1266                 */
1267                goto out;
1268
1269        /*
1270         * Determine which debug features should be switched on
1271         */
1272        for (; *str && *str != ','; str++) {
1273                switch (tolower(*str)) {
1274                case 'f':
1275                        slub_debug |= SLAB_CONSISTENCY_CHECKS;
1276                        break;
1277                case 'z':
1278                        slub_debug |= SLAB_RED_ZONE;
1279                        break;
1280                case 'p':
1281                        slub_debug |= SLAB_POISON;
1282                        break;
1283                case 'u':
1284                        slub_debug |= SLAB_STORE_USER;
1285                        break;
1286                case 't':
1287                        slub_debug |= SLAB_TRACE;
1288                        break;
1289                case 'a':
1290                        slub_debug |= SLAB_FAILSLAB;
1291                        break;
1292                default:
1293                        printk(KERN_ERR "slub_debug option '%c' "
1294                                "unknown. skipped\n", *str);
1295                }
1296        }
1297
1298check_slabs:
1299        if (*str == ',')
1300                slub_debug_slabs = str + 1;
1301out:
1302        return 1;
1303}
1304
1305__setup("slub_debug", setup_slub_debug);
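
/*
 * Usage example (editorial note): booting with
 *
 *      slub_debug=FZP,dentry
 *
 * enables sanity/consistency checks (F), red zoning (Z) and poisoning (P)
 * for the dentry cache only, a bare "slub_debug" selects
 * DEBUG_DEFAULT_FLAGS for all caches, and "slub_debug=-" switches all of
 * the debug options off.
 */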
1306
1307/*
1308 * kmem_cache_flags - apply debugging options to the cache
1309 * @object_size:        the size of an object without meta data
1310 * @flags:              flags to set
1311 * @name:               name of the cache
1312 * @ctor:               constructor function
1313 *
1314 * Debug option(s) are applied to @flags. In addition to the debug
1315 * option(s), if a slab name (or multiple) is specified i.e.
1316 * slub_debug=<Debug-Options>,<slab name1>,<slab name2> ...
1317 * then only the select slabs will receive the debug option(s).
1318 */
1319static unsigned long kmem_cache_flags(unsigned long object_size,
1320        unsigned long flags, const char *name,
1321        void (*ctor)(void *))
1322{
1323        char *iter;
1324        size_t len;
1325
1326        /* If slub_debug = 0, it folds into the if conditional. */
1327        if (!slub_debug_slabs)
1328                return flags | slub_debug;
1329
1330        len = strlen(name);
1331        iter = slub_debug_slabs;
1332        while (*iter) {
1333                char *end, *glob;
1334                size_t cmplen;
1335
1336                end = strchr(iter, ',');
1337                if (!end)
1338                        end = iter + strlen(iter);
1339
1340                glob = strnchr(iter, end - iter, '*');
1341                if (glob)
1342                        cmplen = glob - iter;
1343                else
1344                        cmplen = max_t(size_t, len, (end - iter));
1345
1346                if (!strncmp(name, iter, cmplen)) {
1347                        flags |= slub_debug;
1348                        break;
1349                }
1350
1351                if (!*end)
1352                        break;
1353                iter = end + 1;
1354        }
1355
1356        return flags;
1357}
1358#else /* !CONFIG_SLUB_DEBUG */
1359static inline void setup_object_debug(struct kmem_cache *s,
1360                        struct page *page, void *object) {}
1361
1362static inline int alloc_debug_processing(struct kmem_cache *s,
1363        struct page *page, void *object, unsigned long addr) { return 0; }
1364
1365static inline int free_debug_processing(
1366        struct kmem_cache *s, struct page *page,
1367        void *head, void *tail, int bulk_cnt,
1368        unsigned long addr) { return 0; }
1369
1370static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
1371                        { return 1; }
1372static inline int check_object(struct kmem_cache *s, struct page *page,
1373                        void *object, u8 val) { return 1; }
1374static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
1375                                        struct page *page) {}
1376static inline void remove_full(struct kmem_cache *s, struct page *page) {}
1377static inline unsigned long kmem_cache_flags(unsigned long object_size,
1378        unsigned long flags, const char *name,
1379        void (*ctor)(void *))
1380{
1381        return flags;
1382}
1383#define slub_debug 0
1384
1385#define disable_higher_order_debug 0
1386
1387static inline unsigned long slabs_node(struct kmem_cache *s, int node)
1388                                                        { return 0; }
1389static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
1390                                                        { return 0; }
1391static inline void inc_slabs_node(struct kmem_cache *s, int node,
1392                                                        int objects) {}
1393static inline void dec_slabs_node(struct kmem_cache *s, int node,
1394                                                        int objects) {}
1395
1396static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
1397                                                        { return 0; }
1398
1399static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
1400                size_t size, void **p) {}
1401
1402static inline void slab_free_hook(struct kmem_cache *s, void *x) {}
1403
1404#endif /* CONFIG_SLUB_DEBUG */
1405
1406static inline void slab_free_freelist_hook(struct kmem_cache *s,
1407                                           void *head, void *tail)
1408{
1409/*
1410 * Compiler cannot detect this function can be removed if slab_free_hook()
1411 * evaluates to nothing.  Thus, catch all relevant config debug options here.
1412 */
1413#if defined(CONFIG_KMEMCHECK) ||                \
1414        defined(CONFIG_LOCKDEP) ||              \
1415        defined(CONFIG_DEBUG_KMEMLEAK) ||       \
1416        defined(CONFIG_DEBUG_OBJECTS_FREE) ||   \
1417        defined(CONFIG_KASAN)
1418
1419        void *object = head;
1420        void *tail_obj = tail ? : head;
1421
1422        do {
1423                slab_free_hook(s, object);
1424        } while ((object != tail_obj) &&
1425                 (object = get_freepointer(s, object)));
1426#endif
1427}
1428
1429/*
1430 * Slab allocation and freeing
1431 */
1432static inline struct page *alloc_slab_page(struct kmem_cache *s,
1433                gfp_t flags, int node, struct kmem_cache_order_objects oo)
1434{
1435        struct page *page;
1436        int order = oo_order(oo);
1437
1438        flags |= __GFP_NOTRACK;
1439
1440        if (memcg_charge_slab(s, flags, order))
1441                return NULL;
1442
1443        if (node == NUMA_NO_NODE)
1444                page = alloc_pages(flags, order);
1445        else
1446                page = alloc_pages_exact_node(node, flags, order);
1447
1448        if (!page)
1449                memcg_uncharge_slab(s, order);
1450
1451        return page;
1452}
1453
1454static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1455{
1456        struct page *page;
1457        struct kmem_cache_order_objects oo = s->oo;
1458        gfp_t alloc_gfp;
1459
1460        flags &= gfp_allowed_mask;
1461
1462        if (flags & __GFP_WAIT)
1463                local_irq_enable();
1464
1465        flags |= s->allocflags;
1466
1467        /*
1468         * Let the initial higher-order allocation fail under memory pressure
1469         * so we fall back to the minimum order allocation.
1470         */
1471        alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
1472
1473        page = alloc_slab_page(s, alloc_gfp, node, oo);
1474        if (unlikely(!page)) {
1475                oo = s->min;
1476                /*
1477                 * Allocation may have failed due to fragmentation.
1478                 * Try a lower order alloc if possible
1479                 */
1480                page = alloc_slab_page(s, flags, node, oo);
1481
1482                if (page)
1483                        stat(s, ORDER_FALLBACK);
1484        }
1485
1486        if (kmemcheck_enabled && page
1487                && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
1488                int pages = 1 << oo_order(oo);
1489
1490                kmemcheck_alloc_shadow(page, oo_order(oo), flags, node);
1491
1492                /*
1493                 * Objects from caches that have a constructor don't get
1494                 * cleared when they're allocated, so we need to do it here.
1495                 */
1496                if (s->ctor)
1497                        kmemcheck_mark_uninitialized_pages(page, pages);
1498                else
1499                        kmemcheck_mark_unallocated_pages(page, pages);
1500        }
1501
1502        if (flags & __GFP_WAIT)
1503                local_irq_disable();
1504        if (!page)
1505                return NULL;
1506
1507        page->objects = oo_objects(oo);
1508        mod_zone_page_state(page_zone(page),
1509                (s->flags & SLAB_RECLAIM_ACCOUNT) ?
1510                NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1511                1 << oo_order(oo));
1512
1513        return page;
1514}
1515
1516static void setup_object(struct kmem_cache *s, struct page *page,
1517                                void *object)
1518{
1519        setup_object_debug(s, page, object);
1520        if (unlikely(s->ctor))
1521                s->ctor(object);
1522}
1523
1524static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1525{
1526        struct page *page;
1527        void *start;
1528        void *p;
1529        int order;
1530        int idx;
1531
1532        BUG_ON(flags & GFP_SLAB_BUG_MASK);
1533
1534        page = allocate_slab(s,
1535                flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
1536        if (!page)
1537                goto out;
1538
1539        order = compound_order(page);
1540        inc_slabs_node(s, page_to_nid(page), page->objects);
1541        memcg_bind_pages(s, order);
1542        page->slab_cache = s;
1543        __SetPageSlab(page);
1544        if (page_is_pfmemalloc(page))
1545                SetPageSlabPfmemalloc(page);
1546
1547        start = page_address(page);
1548
1549        if (unlikely(s->flags & SLAB_POISON))
1550                memset(start, POISON_INUSE, PAGE_SIZE << order);
1551
1552        for_each_object_idx(p, idx, s, start, page->objects) {
1553                setup_object(s, page, p);
1554                if (likely(idx < page->objects))
1555                        set_freepointer(s, p, p + s->size);
1556                else
1557                        set_freepointer(s, p, NULL);
1558        }
1559
1560        page->freelist = fixup_red_left(s, start);
1561        page->inuse = page->objects;
1562        page->frozen = 1;
1563out:
1564        return page;
1565}
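
/*
 * The initialization loop above threads the initial freelist through
 * the slab itself: each object's free pointer is simply the address of
 * the next object, s->size bytes further on, and the last one is NULL.
 * For example, with an order-0 slab (4096 bytes, assuming no red zoning
 * or reserved bytes) and s->size == 256, page->objects is 16 and the
 * chain is
 *
 *	start -> start + 256 -> start + 512 -> ... -> start + 3840 -> NULL
 *
 * so the free list costs no storage beyond the objects themselves.
 */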
1566
1567static void __free_slab(struct kmem_cache *s, struct page *page)
1568{
1569        int order = compound_order(page);
1570        int pages = 1 << order;
1571
1572        if (s->flags & SLAB_CONSISTENCY_CHECKS) {
1573                void *p;
1574
1575                slab_pad_check(s, page);
1576                for_each_object(p, s, page_address(page),
1577                                                page->objects)
1578                        check_object(s, page, p, SLUB_RED_INACTIVE);
1579        }
1580
1581        kmemcheck_free_shadow(page, compound_order(page));
1582
1583        mod_zone_page_state(page_zone(page),
1584                (s->flags & SLAB_RECLAIM_ACCOUNT) ?
1585                NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1586                -pages);
1587
1588        __ClearPageSlabPfmemalloc(page);
1589        __ClearPageSlab(page);
1590
1591        memcg_release_pages(s, order);
1592        page_mapcount_reset(page);
1593        if (current->reclaim_state)
1594                current->reclaim_state->reclaimed_slab += pages;
1595        __free_pages(page, order);
1596        memcg_uncharge_slab(s, order);
1597}
1598
1599#define need_reserve_slab_rcu                                           \
1600        (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
1601
1602static void rcu_free_slab(struct rcu_head *h)
1603{
1604        struct page *page;
1605
1606        if (need_reserve_slab_rcu)
1607                page = virt_to_head_page(h);
1608        else
1609                page = container_of((struct list_head *)h, struct page, lru);
1610
1611        __free_slab(page->slab_cache, page);
1612}
1613
1614static void free_slab(struct kmem_cache *s, struct page *page)
1615{
1616        if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
1617                struct rcu_head *head;
1618
1619                if (need_reserve_slab_rcu) {
1620                        int order = compound_order(page);
1621                        int offset = (PAGE_SIZE << order) - s->reserved;
1622
1623                        VM_BUG_ON(s->reserved != sizeof(*head));
1624                        head = page_address(page) + offset;
1625                } else {
1626                        /*
1627                         * RCU free overloads the RCU head over the LRU
1628                         */
1629                        head = (void *)&page->lru;
1630                }
1631
1632                call_rcu(head, rcu_free_slab);
1633        } else
1634                __free_slab(s, page);
1635}
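
/*
 * For SLAB_DESTROY_BY_RCU caches the rcu_head either overlays page->lru
 * (when it fits there) or is placed in the s->reserved bytes kept at
 * the very end of the slab.  As a worked example of the latter case,
 * with 4 KiB pages and a struct rcu_head of 16 bytes, an order-1 slab
 * gives
 *
 *	offset = (PAGE_SIZE << 1) - s->reserved = 8192 - 16 = 8176
 *
 * so the callback head is written at page_address(page) + 8176, space
 * that was excluded from object packing when the cache was set up.
 */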
1636
1637static void discard_slab(struct kmem_cache *s, struct page *page)
1638{
1639        dec_slabs_node(s, page_to_nid(page), page->objects);
1640        free_slab(s, page);
1641}
1642
1643/*
1644 * Management of partially allocated slabs.
1645 *
1646 * list_lock must be held.
1647 */
1648static inline void add_partial(struct kmem_cache_node *n,
1649                                struct page *page, int tail)
1650{
1651        n->nr_partial++;
1652        if (tail == DEACTIVATE_TO_TAIL)
1653                list_add_tail(&page->lru, &n->partial);
1654        else
1655                list_add(&page->lru, &n->partial);
1656}
1657
1658/*
1659 * list_lock must be held.
1660 */
1661static inline void remove_partial(struct kmem_cache_node *n,
1662                                        struct page *page)
1663{
1664        list_del(&page->lru);
1665        n->nr_partial--;
1666}
1667
1668/*
1669 * Remove slab from the partial list, freeze it and
1670 * return the pointer to the freelist.
1671 *
1672 * Returns a list of objects or NULL if it fails.
1673 *
1674 * Must hold list_lock since we modify the partial list.
1675 */
1676static inline void *acquire_slab(struct kmem_cache *s,
1677                struct kmem_cache_node *n, struct page *page,
1678                int mode, int *objects)
1679{
1680        void *freelist;
1681        unsigned long counters;
1682        struct page new;
1683
1684        /*
1685         * Zap the freelist and set the frozen bit.
1686         * The old freelist is the list of objects for the
1687         * per cpu allocation list.
1688         */
1689        freelist = page->freelist;
1690        counters = page->counters;
1691        new.counters = counters;
1692        *objects = new.objects - new.inuse;
1693        if (mode) {
1694                new.inuse = page->objects;
1695                new.freelist = NULL;
1696        } else {
1697                new.freelist = freelist;
1698        }
1699
1700        VM_BUG_ON(new.frozen);
1701        new.frozen = 1;
1702
1703        if (!__cmpxchg_double_slab(s, page,
1704                        freelist, counters,
1705                        new.freelist, new.counters,
1706                        "acquire_slab"))
1707                return NULL;
1708
1709        remove_partial(n, page);
1710        WARN_ON(!freelist);
1711        return freelist;
1712}
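
/*
 * acquire_slab() freezes the page with a single cmpxchg on the
 * (freelist, counters) pair.  The mode argument selects between two
 * target states; e.g. for a page holding 8 objects of which 3 are in
 * use:
 *
 *	mode != 0 (page becomes the cpu slab):
 *		new.freelist = NULL, new.inuse = 8, new.frozen = 1
 *	mode == 0 (page goes to the cpu partial list):
 *		new.freelist kept,  new.inuse = 3, new.frozen = 1
 *
 * In the first case the five free objects are handed back to the caller
 * as the per cpu allocation list; in the second they stay on the page
 * freelist until the page is unfrozen or taken over later.
 */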
1713
1714static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain);
1715static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags);
1716
1717/*
1718 * Try to allocate a partial slab from a specific node.
1719 */
1720static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
1721                                struct kmem_cache_cpu *c, gfp_t flags)
1722{
1723        struct page *page, *page2;
1724        void *object = NULL;
1725        int available = 0;
1726        int objects;
1727
1728        /*
1729         * Racy check. If we mistakenly see no partial slabs then we
1730         * just allocate an empty slab. If we mistakenly try to get a
1731         * partial slab and there is none available then this function
1732         * will return NULL.
1733         */
1734        if (!n || !n->nr_partial)
1735                return NULL;
1736
1737        spin_lock(&n->list_lock);
1738        list_for_each_entry_safe(page, page2, &n->partial, lru) {
1739                void *t;
1740
1741                if (!pfmemalloc_match(page, flags))
1742                        continue;
1743
1744                t = acquire_slab(s, n, page, object == NULL, &objects);
1745                if (!t)
1746                        break;
1747
1748                available += objects;
1749                if (!object) {
1750                        c->page = page;
1751                        stat(s, ALLOC_FROM_PARTIAL);
1752                        object = t;
1753                } else {
1754                        put_cpu_partial(s, page, 0);
1755                        stat(s, CPU_PARTIAL_NODE);
1756                }
1757                if (kmem_cache_debug(s) || available > s->cpu_partial / 2)
1758                        break;
1759
1760        }
1761        spin_unlock(&n->list_lock);
1762        return object;
1763}
1764
1765/*
1766 * Get a page from somewhere. Search in increasing NUMA distances.
1767 */
1768static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
1769                struct kmem_cache_cpu *c)
1770{
1771#ifdef CONFIG_NUMA
1772        struct zonelist *zonelist;
1773        struct zoneref *z;
1774        struct zone *zone;
1775        enum zone_type high_zoneidx = gfp_zone(flags);
1776        void *object;
1777        unsigned int cpuset_mems_cookie;
1778
1779        /*
1780         * The defrag ratio allows a configuration of the tradeoffs between
1781         * inter node defragmentation and node local allocations. A lower
1782         * defrag_ratio increases the tendency to do local allocations
1783         * instead of attempting to obtain partial slabs from other nodes.
1784         *
1785         * If the defrag_ratio is set to 0 then kmalloc() always
1786         * returns node local objects. If the ratio is higher then kmalloc()
1787         * may return off node objects because partial slabs are obtained
1788         * from other nodes and filled up.
1789         *
1790         * If /sys/kernel/slab/xx/defrag_ratio is set to 100 (which makes
1791         * defrag_ratio = 1000) then every (well almost) allocation will
1792         * first attempt to defrag slab caches on other nodes. This means
1793         * scanning over all nodes to look for partial slabs which may be
1794         * expensive if we do it every time we are trying to find a slab
1795         * with available objects.
1796         */
1797        if (!s->remote_node_defrag_ratio ||
1798                        get_cycles() % 1024 > s->remote_node_defrag_ratio)
1799                return NULL;
1800
1801        do {
1802                cpuset_mems_cookie = read_mems_allowed_begin();
1803                zonelist = node_zonelist(slab_node(), flags);
1804                for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1805                        struct kmem_cache_node *n;
1806
1807                        n = get_node(s, zone_to_nid(zone));
1808
1809                        if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1810                                        n->nr_partial > s->min_partial) {
1811                                object = get_partial_node(s, n, c, flags);
1812                                if (object) {
1813                                        /*
1814                                         * Don't check read_mems_allowed_retry()
1815                                         * here - if mems_allowed was updated in
1816                                         * parallel, that was a harmless race
1817                                         * between allocation and the cpuset
1818                                         * update
1819                                         */
1820                                        return object;
1821                                }
1822                        }
1823                }
1824        } while (read_mems_allowed_retry(cpuset_mems_cookie));
1825#endif
1826        return NULL;
1827}
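
/*
 * The remote_node_defrag_ratio test above acts as a cheap probabilistic
 * throttle on the cross-node search.  Since the sysfs value 0-100 is
 * stored multiplied by ten, a setting of 100 stores 1000 and the search
 * runs whenever get_cycles() % 1024 <= 1000, i.e. on roughly 98% of the
 * slow-path misses that reach this point; a setting of 10 (stored as
 * 100) permits it on only about 10% of them, and 0 disables it.
 */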
1828
1829/*
1830 * Get a partial page, lock it and return it.
1831 */
1832static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
1833                struct kmem_cache_cpu *c)
1834{
1835        void *object;
1836        int searchnode = node;
1837
1838        if (node == NUMA_NO_NODE)
1839                searchnode = numa_mem_id();
1840        else if (!node_present_pages(node))
1841                searchnode = node_to_mem_node(node);
1842
1843        object = get_partial_node(s, get_node(s, searchnode), c, flags);
1844        if (object || node != NUMA_NO_NODE)
1845                return object;
1846
1847        return get_any_partial(s, flags, c);
1848}
1849
1850#ifdef CONFIG_PREEMPT
1851/*
1852 * Calculate the next globally unique transaction for disambiguation
1853 * during cmpxchg. The transactions start with the cpu number and are then
1854 * incremented by CONFIG_NR_CPUS.
1855 */
1856#define TID_STEP  roundup_pow_of_two(CONFIG_NR_CPUS)
1857#else
1858/*
1859 * No preemption is supported, therefore there is also no need to check
1860 * for different cpus.
1861 */
1862#define TID_STEP 1
1863#endif
1864
1865static inline unsigned long next_tid(unsigned long tid)
1866{
1867        return tid + TID_STEP;
1868}
1869
1870static inline unsigned int tid_to_cpu(unsigned long tid)
1871{
1872        return tid % TID_STEP;
1873}
1874
1875static inline unsigned long tid_to_event(unsigned long tid)
1876{
1877        return tid / TID_STEP;
1878}
1879
1880static inline unsigned int init_tid(int cpu)
1881{
1882        return cpu;
1883}
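
/*
 * Worked example of the tid encoding with CONFIG_NR_CPUS = 4, so that
 * TID_STEP is 4: cpu 2 starts at tid 2 and then advances 2, 6, 10, 14,
 * ... with each operation, so for any tid t
 *
 *	tid_to_cpu(t)   = t % 4		(always 2 for this cpu)
 *	tid_to_event(t) = t / 4		(0, 1, 2, ... operations so far)
 *
 * A cmpxchg that captured tid 10 but finds 14 in the per cpu area knows
 * another allocation or free ran in between; a tid whose cpu part
 * differs means the thread migrated to another processor.  Without
 * preemption TID_STEP is 1 and only the event count matters.
 */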
1884
1885static inline void note_cmpxchg_failure(const char *n,
1886                const struct kmem_cache *s, unsigned long tid)
1887{
1888#ifdef SLUB_DEBUG_CMPXCHG
1889        unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
1890
1891        printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name);
1892
1893#ifdef CONFIG_PREEMPT
1894        if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
1895                printk("due to cpu change %d -> %d\n",
1896                        tid_to_cpu(tid), tid_to_cpu(actual_tid));
1897        else
1898#endif
1899        if (tid_to_event(tid) != tid_to_event(actual_tid))
1900                printk("due to cpu running other code. Event %ld->%ld\n",
1901                        tid_to_event(tid), tid_to_event(actual_tid));
1902        else
1903                printk("for unknown reason: actual=%lx was=%lx target=%lx\n",
1904                        actual_tid, tid, next_tid(tid));
1905#endif
1906        stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
1907}
1908
1909static void init_kmem_cache_cpus(struct kmem_cache *s)
1910{
1911        int cpu;
1912
1913        for_each_possible_cpu(cpu)
1914                per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
1915}
1916
1917/*
1918 * Remove the cpu slab
1919 */
1920static void deactivate_slab(struct kmem_cache *s, struct page *page, void *freelist)
1921{
1922        enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
1923        struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1924        int lock = 0;
1925        enum slab_modes l = M_NONE, m = M_NONE;
1926        void *nextfree;
1927        int tail = DEACTIVATE_TO_HEAD;
1928        struct page new;
1929        struct page old;
1930
1931        if (page->freelist) {
1932                stat(s, DEACTIVATE_REMOTE_FREES);
1933                tail = DEACTIVATE_TO_TAIL;
1934        }
1935
1936        /*
1937         * Stage one: Free all available per cpu objects back
1938         * to the page freelist while it is still frozen. Leave the
1939         * last one.
1940         *
1941         * There is no need to take the list_lock because the page
1942         * is still frozen.
1943         */
1944        while (freelist && (nextfree = get_freepointer(s, freelist))) {
1945                void *prior;
1946                unsigned long counters;
1947
1948                do {
1949                        prior = page->freelist;
1950                        counters = page->counters;
1951                        set_freepointer(s, freelist, prior);
1952                        new.counters = counters;
1953                        new.inuse--;
1954                        VM_BUG_ON(!new.frozen);
1955
1956                } while (!__cmpxchg_double_slab(s, page,
1957                        prior, counters,
1958                        freelist, new.counters,
1959                        "drain percpu freelist"));
1960
1961                freelist = nextfree;
1962        }
1963
1964        /*
1965         * Stage two: Ensure that the page is unfrozen while the
1966         * list presence reflects the actual number of objects
1967         * during unfreeze.
1968         *
1969         * We set up the list membership and then perform a cmpxchg
1970         * with the count. If there is a mismatch then the page
1971         * is not unfrozen but the page is on the wrong list.
1972         *
1973         * Then we restart the process which may have to remove
1974         * the page from the list that we just put it on again
1975         * because the number of objects in the slab may have
1976         * changed.
1977         */
1978redo:
1979
1980        old.freelist = page->freelist;
1981        old.counters = page->counters;
1982        VM_BUG_ON(!old.frozen);
1983
1984        /* Determine target state of the slab */
1985        new.counters = old.counters;
1986        if (freelist) {
1987                new.inuse--;
1988                set_freepointer(s, freelist, old.freelist);
1989                new.freelist = freelist;
1990        } else
1991                new.freelist = old.freelist;
1992
1993        new.frozen = 0;
1994
1995        if (!new.inuse && n->nr_partial > s->min_partial)
1996                m = M_FREE;
1997        else if (new.freelist) {
1998                m = M_PARTIAL;
1999                if (!lock) {
2000                        lock = 1;
2001                        /*
2002                         * Taking the spinlock removes the possibility
2003                         * that acquire_slab() will see a slab page that
2004                         * is frozen
2005                         */
2006                        spin_lock(&n->list_lock);
2007                }
2008        } else {
2009                m = M_FULL;
2010                if (kmem_cache_debug(s) && !lock) {
2011                        lock = 1;
2012                        /*
2013                         * This also ensures that the scanning of full
2014                         * slabs from diagnostic functions will not see
2015                         * any frozen slabs.
2016                         */
2017                        spin_lock(&n->list_lock);
2018                }
2019        }
2020
2021        if (l != m) {
2022
2023                if (l == M_PARTIAL)
2024
2025                        remove_partial(n, page);
2026
2027                else if (l == M_FULL)
2028
2029                        remove_full(s, page);
2030
2031                if (m == M_PARTIAL) {
2032
2033                        add_partial(n, page, tail);
2034                        stat(s, tail);
2035
2036                } else if (m == M_FULL) {
2037
2038                        stat(s, DEACTIVATE_FULL);
2039                        add_full(s, n, page);
2040
2041                }
2042        }
2043
2044        l = m;
2045        if (!__cmpxchg_double_slab(s, page,
2046                                old.freelist, old.counters,
2047                                new.freelist, new.counters,
2048                                "unfreezing slab"))
2049                goto redo;
2050
2051        if (lock)
2052                spin_unlock(&n->list_lock);
2053
2054        if (m == M_FREE) {
2055                stat(s, DEACTIVATE_EMPTY);
2056                discard_slab(s, page);
2057                stat(s, FREE_SLAB);
2058        }
2059}
2060
2061/*
2062 * Unfreeze all the cpu partial slabs.
2063 *
2064 * This function must be called with interrupts disabled
2065 * for the cpu using c (or some other guarantee must be in place
2066 * to prevent concurrent accesses).
2067 */
2068static void unfreeze_partials(struct kmem_cache *s,
2069                struct kmem_cache_cpu *c)
2070{
2071        struct kmem_cache_node *n = NULL, *n2 = NULL;
2072        struct page *page, *discard_page = NULL;
2073
2074        while ((page = c->partial)) {
2075                struct page new;
2076                struct page old;
2077
2078                c->partial = page->next;
2079
2080                n2 = get_node(s, page_to_nid(page));
2081                if (n != n2) {
2082                        if (n)
2083                                spin_unlock(&n->list_lock);
2084
2085                        n = n2;
2086                        spin_lock(&n->list_lock);
2087                }
2088
2089                do {
2090
2091                        old.freelist = page->freelist;
2092                        old.counters = page->counters;
2093                        VM_BUG_ON(!old.frozen);
2094
2095                        new.counters = old.counters;
2096                        new.freelist = old.freelist;
2097
2098                        new.frozen = 0;
2099
2100                } while (!__cmpxchg_double_slab(s, page,
2101                                old.freelist, old.counters,
2102                                new.freelist, new.counters,
2103                                "unfreezing slab"));
2104
2105                if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) {
2106                        page->next = discard_page;
2107                        discard_page = page;
2108                } else {
2109                        add_partial(n, page, DEACTIVATE_TO_TAIL);
2110                        stat(s, FREE_ADD_PARTIAL);
2111                }
2112        }
2113
2114        if (n)
2115                spin_unlock(&n->list_lock);
2116
2117        while (discard_page) {
2118                page = discard_page;
2119                discard_page = discard_page->next;
2120
2121                stat(s, DEACTIVATE_EMPTY);
2122                discard_slab(s, page);
2123                stat(s, FREE_SLAB);
2124        }
2125}
2126
2127/*
2128 * Put a page that was just frozen (in __slab_free) into a partial page
2129 * slot if available. This is done without interrupts disabled and without
2130 * preemption disabled. The cmpxchg is racy and may put the partial page
2131 * onto a random cpu's partial slot.
2132 *
2133 * If we did not find a slot then simply move all the partials to the
2134 * per node partial list.
2135 */
2136static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
2137{
2138        struct page *oldpage;
2139        int pages;
2140        int pobjects;
2141
2142        preempt_disable();
2143        do {
2144                pages = 0;
2145                pobjects = 0;
2146                oldpage = this_cpu_read(s->cpu_slab->partial);
2147
2148                if (oldpage) {
2149                        pobjects = oldpage->pobjects;
2150                        pages = oldpage->pages;
2151                        if (drain && pobjects > s->cpu_partial) {
2152                                unsigned long flags;
2153                                /*
2154                                 * partial array is full. Move the existing
2155                                 * set to the per node partial list.
2156                                 */
2157                                local_irq_save(flags);
2158                                unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
2159                                local_irq_restore(flags);
2160                                oldpage = NULL;
2161                                pobjects = 0;
2162                                pages = 0;
2163                                stat(s, CPU_PARTIAL_DRAIN);
2164                        }
2165                }
2166
2167                pages++;
2168                pobjects += page->objects - page->inuse;
2169
2170                page->pages = pages;
2171                page->pobjects = pobjects;
2172                page->next = oldpage;
2173
2174        } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage);
2175
2176        if (unlikely(!s->cpu_partial)) {
2177                unsigned long flags;
2178
2179                local_irq_save(flags);
2180                unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
2181                local_irq_restore(flags);
2182        }
2183        preempt_enable();
2184}
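
/*
 * The drain decision above is driven by the accumulated free object
 * count, not the number of pages.  As an illustration, with a
 * hypothetical s->cpu_partial of 30 and pages that each contribute
 * page->objects - page->inuse == 13 free objects: the first three puts
 * build up pobjects = 13, 26 and 39 without draining (the check uses
 * the count before the new page is added), and the fourth put with
 * drain set finds 39 > 30, moves the whole set back to the node partial
 * lists and starts a new per cpu list with the incoming page.
 */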
2185
2186static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
2187{
2188        stat(s, CPUSLAB_FLUSH);
2189        deactivate_slab(s, c->page, c->freelist);
2190
2191        c->tid = next_tid(c->tid);
2192        c->page = NULL;
2193        c->freelist = NULL;
2194}
2195
2196/*
2197 * Flush cpu slab.
2198 *
2199 * Called from IPI handler with interrupts disabled.
2200 */
2201static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
2202{
2203        struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
2204
2205        if (likely(c)) {
2206                if (c->page)
2207                        flush_slab(s, c);
2208
2209                unfreeze_partials(s, c);
2210        }
2211}
2212
2213static void flush_cpu_slab(void *d)
2214{
2215        struct kmem_cache *s = d;
2216
2217        __flush_cpu_slab(s, smp_processor_id());
2218}
2219
2220static bool has_cpu_slab(int cpu, void *info)
2221{
2222        struct kmem_cache *s = info;
2223        struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
2224
2225        return c->page || c->partial;
2226}
2227
2228static void flush_all(struct kmem_cache *s)
2229{
2230        on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
2231}
2232
2233/*
2234 * Check if the objects in a per cpu structure fit numa
2235 * locality expectations.
2236 */
2237static inline int node_match(struct page *page, int node)
2238{
2239#ifdef CONFIG_NUMA
2240        if (!page || (node != NUMA_NO_NODE && page_to_nid(page) != node))
2241                return 0;
2242#endif
2243        return 1;
2244}
2245
2246static int count_free(struct page *page)
2247{
2248        return page->objects - page->inuse;
2249}
2250
2251static unsigned long count_partial(struct kmem_cache_node *n,
2252                                        int (*get_count)(struct page *))
2253{
2254        unsigned long flags;
2255        unsigned long x = 0;
2256        struct page *page;
2257
2258        spin_lock_irqsave(&n->list_lock, flags);
2259        list_for_each_entry(page, &n->partial, lru)
2260                x += get_count(page);
2261        spin_unlock_irqrestore(&n->list_lock, flags);
2262        return x;
2263}
2264
2265static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
2266{
2267#ifdef CONFIG_SLUB_DEBUG
2268        return atomic_long_read(&n->total_objects);
2269#else
2270        return 0;
2271#endif
2272}
2273
2274static noinline void
2275slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
2276{
2277        int node;
2278
2279        printk(KERN_WARNING
2280                "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n",
2281                nid, gfpflags);
2282        printk(KERN_WARNING "  cache: %s, object size: %d, buffer size: %d, "
2283                "default order: %d, min order: %d\n", s->name, s->object_size,
2284                s->size, oo_order(s->oo), oo_order(s->min));
2285
2286        if (oo_order(s->min) > get_order(s->object_size))
2287                printk(KERN_WARNING "  %s debugging increased min order, use "
2288                       "slub_debug=O to disable.\n", s->name);
2289
2290        for_each_online_node(node) {
2291                struct kmem_cache_node *n = get_node(s, node);
2292                unsigned long nr_slabs;
2293                unsigned long nr_objs;
2294                unsigned long nr_free;
2295
2296                if (!n)
2297                        continue;
2298
2299                nr_free  = count_partial(n, count_free);
2300                nr_slabs = node_nr_slabs(n);
2301                nr_objs  = node_nr_objs(n);
2302
2303                printk(KERN_WARNING
2304                        "  node %d: slabs: %ld, objs: %ld, free: %ld\n",
2305                        node, nr_slabs, nr_objs, nr_free);
2306        }
2307}
2308
2309static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
2310                        int node, struct kmem_cache_cpu **pc)
2311{
2312        void *freelist;
2313        struct kmem_cache_cpu *c = *pc;
2314        struct page *page;
2315
2316        WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO));
2317
2318        freelist = get_partial(s, flags, node, c);
2319
2320        if (freelist)
2321                return freelist;
2322
2323        page = new_slab(s, flags, node);
2324        if (page) {
2325                c = this_cpu_ptr(s->cpu_slab);
2326                if (c->page)
2327                        flush_slab(s, c);
2328
2329                /*
2330                 * No other reference to the page yet so we can
2331                 * muck around with it freely without cmpxchg
2332                 */
2333                freelist = page->freelist;
2334                page->freelist = NULL;
2335
2336                stat(s, ALLOC_SLAB);
2337                c->page = page;
2338                *pc = c;
2339        } else
2340                freelist = NULL;
2341
2342        return freelist;
2343}
2344
2345static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags)
2346{
2347        if (unlikely(PageSlabPfmemalloc(page)))
2348                return gfp_pfmemalloc_allowed(gfpflags);
2349
2350        return true;
2351}
2352
2353/*
2354 * Check the page->freelist of a page and either transfer the freelist to the per cpu freelist
2355 * or deactivate the page.
2356 *
2357 * The page is still frozen if the return value is not NULL.
2358 *
2359 * If this function returns NULL then the page has been unfrozen.
2360 *
2361 * This function must be called with interrupts disabled.
2362 */
2363static inline void *get_freelist(struct kmem_cache *s, struct page *page)
2364{
2365        struct page new;
2366        unsigned long counters;
2367        void *freelist;
2368
2369        do {
2370                freelist = page->freelist;
2371                counters = page->counters;
2372
2373                new.counters = counters;
2374                VM_BUG_ON(!new.frozen);
2375
2376                new.inuse = page->objects;
2377                new.frozen = freelist != NULL;
2378
2379        } while (!__cmpxchg_double_slab(s, page,
2380                freelist, counters,
2381                NULL, new.counters,
2382                "get_freelist"));
2383
2384        return freelist;
2385}
2386
2387/*
2388 * Slow path. The lockless freelist is empty or we need to perform
2389 * debugging duties.
2390 *
2391 * Processing is still very fast if new objects have been freed to the
2392 * regular freelist. In that case we simply take over the regular freelist
2393 * as the lockless freelist and zap the regular freelist.
2394 *
2395 * If that is not working then we fall back to the partial lists. We take the
2396 * first element of the freelist as the object to allocate now and move the
2397 * rest of the freelist to the lockless freelist.
2398 *
2399 * And if we were unable to get a new slab from the partial slab lists then
2400 * we need to allocate a new slab. This is the slowest path since it involves
2401 * a call to the page allocator and the setup of a new slab.
2402 *
2403 * Version of __slab_alloc to use when we know that interrupts are
2404 * already disabled (which is the case for bulk allocation).
2405 */
2406static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
2407                          unsigned long addr, struct kmem_cache_cpu *c)
2408{
2409        void *freelist;
2410        struct page *page;
2411
2412        page = c->page;
2413        if (!page)
2414                goto new_slab;
2415redo:
2416
2417        if (unlikely(!node_match(page, node))) {
2418                int searchnode = node;
2419
2420                if (node != NUMA_NO_NODE && !node_present_pages(node))
2421                        searchnode = node_to_mem_node(node);
2422
2423                if (unlikely(!node_match(page, searchnode))) {
2424                        stat(s, ALLOC_NODE_MISMATCH);
2425                        deactivate_slab(s, page, c->freelist);
2426                        c->page = NULL;
2427                        c->freelist = NULL;
2428                        goto new_slab;
2429                }
2430        }
2431
2432        /*
2433         * By rights, we should be searching for a slab page that was
2434         * PFMEMALLOC but right now, we are losing the pfmemalloc
2435         * information when the page leaves the per-cpu allocator
2436         */
2437        if (unlikely(!pfmemalloc_match(page, gfpflags))) {
2438                deactivate_slab(s, page, c->freelist);
2439                c->page = NULL;
2440                c->freelist = NULL;
2441                goto new_slab;
2442        }
2443
2444        /* must check again c->freelist in case of cpu migration or IRQ */
2445        freelist = c->freelist;
2446        if (freelist)
2447                goto load_freelist;
2448
2449        stat(s, ALLOC_SLOWPATH);
2450
2451        freelist = get_freelist(s, page);
2452
2453        if (!freelist) {
2454                c->page = NULL;
2455                stat(s, DEACTIVATE_BYPASS);
2456                goto new_slab;
2457        }
2458
2459        stat(s, ALLOC_REFILL);
2460
2461load_freelist:
2462        /*
2463         * freelist is pointing to the list of objects to be used.
2464         * page is pointing to the page from which the objects are obtained.
2465         * That page must be frozen for per cpu allocations to work.
2466         */
2467        VM_BUG_ON(!c->page->frozen);
2468        c->freelist = get_freepointer(s, freelist);
2469        c->tid = next_tid(c->tid);
2470        return freelist;
2471
2472new_slab:
2473
2474        if (c->partial) {
2475                page = c->page = c->partial;
2476                c->partial = page->next;
2477                stat(s, CPU_PARTIAL_ALLOC);
2478                c->freelist = NULL;
2479                goto redo;
2480        }
2481
2482        freelist = new_slab_objects(s, gfpflags, node, &c);
2483
2484        if (unlikely(!freelist)) {
2485                if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
2486                        slab_out_of_memory(s, gfpflags, node);
2487                return NULL;
2488        }
2489
2490        page = c->page;
2491        if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
2492                goto load_freelist;
2493
2494        /* Only entered in the debug case */
2495        if (kmem_cache_debug(s) && !alloc_debug_processing(s, page, freelist, addr))
2496                goto new_slab;  /* Slab failed checks. Next slab needed */
2497
2498        deactivate_slab(s, page, get_freepointer(s, freelist));
2499        c->page = NULL;
2500        c->freelist = NULL;
2501        return freelist;
2502}
2503
2504/*
2505 * Another one that disables interrupts and compensates for possible
2506 * cpu changes by refetching the per cpu area pointer.
2507 */
2508static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
2509                          unsigned long addr, struct kmem_cache_cpu *c)
2510{
2511        void *p;
2512        unsigned long flags;
2513
2514        local_irq_save(flags);
2515#ifdef CONFIG_PREEMPT
2516        /*
2517         * We may have been preempted and rescheduled on a different
2518         * cpu before disabling interrupts. Need to reload cpu area
2519         * pointer.
2520         */
2521        c = this_cpu_ptr(s->cpu_slab);
2522#endif
2523
2524        p = ___slab_alloc(s, gfpflags, node, addr, c);
2525        local_irq_restore(flags);
2526        return p;
2527}
2528
2529/*
2530 * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
2531 * have the fastpath folded into their functions. So no function call
2532 * overhead for requests that can be satisfied on the fastpath.
2533 *
2534 * The fastpath works by first checking if the lockless freelist can be used.
2535 * If not then __slab_alloc is called for slow processing.
2536 *
2537 * Otherwise we can simply pick the next object from the lockless free list.
2538 */
2539static __always_inline void *slab_alloc_node(struct kmem_cache *s,
2540                gfp_t gfpflags, int node, unsigned long addr)
2541{
2542        void *object;
2543        struct kmem_cache_cpu *c;
2544        struct page *page;
2545        unsigned long tid;
2546
2547        if (slab_pre_alloc_hook(s, gfpflags))
2548                return NULL;
2549
2550        s = memcg_kmem_get_cache(s, gfpflags);
2551redo:
2552        /*
2553         * Must read kmem_cache cpu data via this cpu ptr. Preemption is
2554         * enabled. We may switch back and forth between cpus while
2555         * reading from one cpu area. That does not matter as long
2556         * as we end up on the original cpu again when doing the cmpxchg.
2557         *
2558         * Preemption is disabled for the retrieval of the tid because that
2559         * must occur from the current processor. We cannot allow rescheduling
2560         * on a different processor between the determination of the pointer
2561         * and the retrieval of the tid.
2562         */
2563        preempt_disable();
2564        c = this_cpu_ptr(s->cpu_slab);
2565
2566        /*
2567         * The transaction ids are globally unique per cpu and per operation on
2568         * a per cpu queue. Thus they guarantee that the cmpxchg_double
2569         * occurs on the right processor and that there was no operation on the
2570         * linked list in between.
2571         */
2572        tid = c->tid;
2573        preempt_enable();
2574
2575        object = c->freelist;
2576        page = c->page;
2577        if (unlikely(!object || !node_match(page, node)))
2578                object = __slab_alloc(s, gfpflags, node, addr, c);
2579
2580        else {
2581                void *next_object = get_freepointer_safe(s, object);
2582
2583                /*
2584                 * The cmpxchg will only match if there was no additional
2585                 * operation and if we are on the right processor.
2586                 *
2587                 * The cmpxchg does the following atomically (without lock semantics!)
2588                 * 1. Relocate first pointer to the current per cpu area.
2589                 * 2. Verify that tid and freelist have not been changed
2590                 * 3. If they were not changed replace tid and freelist
2591                 *
2592                 * Since this is without lock semantics the protection is only against
2593                 * code executing on this cpu *not* from access by other cpus.
2594                 */
2595                if (unlikely(!this_cpu_cmpxchg_double(
2596                                s->cpu_slab->freelist, s->cpu_slab->tid,
2597                                object, tid,
2598                                next_object, next_tid(tid)))) {
2599
2600                        note_cmpxchg_failure("slab_alloc", s, tid);
2601                        goto redo;
2602                }
2603                prefetch_freepointer(s, next_object);
2604                stat(s, ALLOC_FASTPATH);
2605        }
2606
2607        if (unlikely(gfpflags & __GFP_ZERO) && object)
2608                memset(object, 0, s->object_size);
2609
2610        slab_post_alloc_hook(s, gfpflags, 1, &object);
2611
2612        return object;
2613}
2614
2615static __always_inline void *slab_alloc(struct kmem_cache *s,
2616                gfp_t gfpflags, unsigned long addr)
2617{
2618        return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr);
2619}
2620
2621void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
2622{
2623        void *ret = slab_alloc(s, gfpflags, _RET_IP_);
2624
2625        trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, s->size, gfpflags);
2626
2627        return ret;
2628}
2629EXPORT_SYMBOL(kmem_cache_alloc);
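
/*
 * Typical use of the exported allocation API, for illustration only
 * (the cache name, object type and caller shown here are made up):
 *
 *	struct foo_ctx { spinlock_t lock; int refs; };
 *	static struct kmem_cache *foo_cachep;
 *
 *	foo_cachep = kmem_cache_create("foo_ctx", sizeof(struct foo_ctx),
 *				       0, SLAB_HWCACHE_ALIGN, NULL);
 *	if (!foo_cachep)
 *		return -ENOMEM;
 *
 *	struct foo_ctx *ctx = kmem_cache_alloc(foo_cachep, GFP_KERNEL);
 *	if (ctx)
 *		kmem_cache_free(foo_cachep, ctx);
 *
 * The allocation goes through slab_alloc_node() above and normally hits
 * the lockless cmpxchg fastpath; kmem_cache_free() is defined further
 * down in this file.
 */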
2630
2631#ifdef CONFIG_TRACING
2632void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
2633{
2634        void *ret = slab_alloc(s, gfpflags, _RET_IP_);
2635        trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
2636        return ret;
2637}
2638EXPORT_SYMBOL(kmem_cache_alloc_trace);
2639
2640void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
2641{
2642        void *ret = kmalloc_order(size, flags, order);
2643        trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags);
2644        return ret;
2645}
2646EXPORT_SYMBOL(kmalloc_order_trace);
2647#endif
2648
2649#ifdef CONFIG_NUMA
2650void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
2651{
2652        void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
2653
2654        trace_kmem_cache_alloc_node(_RET_IP_, ret,
2655                                    s->object_size, s->size, gfpflags, node);
2656
2657        return ret;
2658}
2659EXPORT_SYMBOL(kmem_cache_alloc_node);
2660
2661#ifdef CONFIG_TRACING
2662void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
2663                                    gfp_t gfpflags,
2664                                    int node, size_t size)
2665{
2666        void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
2667
2668        trace_kmalloc_node(_RET_IP_, ret,
2669                           size, s->size, gfpflags, node);
2670        return ret;
2671}
2672EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
2673#endif
2674#endif
2675
2676/*
2677 * Slow path handling. This may still be called frequently since objects
2678 * have a longer lifetime than the cpu slabs in most processing loads.
2679 *
2680 * So we still attempt to reduce cache line usage. Just take the slab
2681 * lock and free the item. If there is no additional partial page
2682 * handling required then we can return immediately.
2683 */
2684static void __slab_free(struct kmem_cache *s, struct page *page,
2685                        void *head, void *tail, int cnt,
2686                        unsigned long addr)
2687
2688{
2689        void *prior;
2690        int was_frozen;
2691        struct page new;
2692        unsigned long counters;
2693        struct kmem_cache_node *n = NULL;
2694        unsigned long uninitialized_var(flags);
2695
2696        stat(s, FREE_SLOWPATH);
2697
2698        if (kmem_cache_debug(s) &&
2699            !free_debug_processing(s, page, head, tail, cnt, addr))
2700                return;
2701
2702        do {
2703                if (unlikely(n)) {
2704                        spin_unlock_irqrestore(&n->list_lock, flags);
2705                        n = NULL;
2706                }
2707                prior = page->freelist;
2708                counters = page->counters;
2709                set_freepointer(s, tail, prior);
2710                new.counters = counters;
2711                was_frozen = new.frozen;
2712                new.inuse -= cnt;
2713                if ((!new.inuse || !prior) && !was_frozen) {
2714
2715                        if (!kmem_cache_debug(s) && !prior)
2716
2717                                /*
2718                                 * Slab was on no list before and will be partially empty.
2719                                 * We can defer the list move and instead freeze it.
2720                                 */
2721                                new.frozen = 1;
2722
2723                        else { /* Needs to be taken off a list */
2724
2725                                n = get_node(s, page_to_nid(page));
2726                                /*
2727                                 * Speculatively acquire the list_lock.
2728                                 * If the cmpxchg does not succeed then we may
2729                                 * drop the list_lock without any processing.
2730                                 *
2731                                 * Otherwise the list_lock will synchronize with
2732                                 * other processors updating the list of slabs.
2733                                 */
2734                                spin_lock_irqsave(&n->list_lock, flags);
2735
2736                        }
2737                }
2738
2739        } while (!cmpxchg_double_slab(s, page,
2740                prior, counters,
2741                head, new.counters,
2742                "__slab_free"));
2743
2744        if (likely(!n)) {
2745
2746                /*
2747                 * If we just froze the page then put it onto the
2748                 * per cpu partial list.
2749                 */
2750                if (new.frozen && !was_frozen) {
2751                        put_cpu_partial(s, page, 1);
2752                        stat(s, CPU_PARTIAL_FREE);
2753                }
2754                /*
2755                 * The list lock was not taken, therefore no list
2756                 * activity is necessary.
2757                 */
2758                if (was_frozen)
2759                        stat(s, FREE_FROZEN);
2760                return;
2761        }
2762
2763        if (unlikely(!new.inuse && n->nr_partial > s->min_partial))
2764                goto slab_empty;
2765
2766        /*
2767         * Objects left in the slab. If it was not on the partial list before
2768         * then add it.
2769         */
2770        if (kmem_cache_debug(s) && unlikely(!prior)) {
2771                remove_full(s, page);
2772                add_partial(n, page, DEACTIVATE_TO_TAIL);
2773                stat(s, FREE_ADD_PARTIAL);
2774        }
2775        spin_unlock_irqrestore(&n->list_lock, flags);
2776        return;
2777
2778slab_empty:
2779        if (prior) {
2780                /*
2781                 * Slab on the partial list.
2782                 */
2783                remove_partial(n, page);
2784                stat(s, FREE_REMOVE_PARTIAL);
2785        } else
2786                /* Slab must be on the full list */
2787                remove_full(s, page);
2788
2789        spin_unlock_irqrestore(&n->list_lock, flags);
2790        stat(s, FREE_SLAB);
2791        discard_slab(s, page);
2792}
2793
2794/*
2795 * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
2796 * can perform fastpath freeing without additional function calls.
2797 *
2798 * The fastpath is only possible if we are freeing to the current cpu slab
2799 * of this processor. This is typically the case if we have just allocated
2800 * the item before.
2801 *
2802 * If fastpath is not possible then fall back to __slab_free where we deal
2803 * with all sorts of special processing.
2804 *
2805 * Bulk free of a freelist with several objects (all pointing to the
2806 * same page) possible by specifying head and tail ptr, plus objects
2807 * count (cnt). Bulk free indicated by tail pointer being set.
2808 */
2809static __always_inline void slab_free(struct kmem_cache *s, struct page *page,
2810                                      void *head, void *tail, int cnt,
2811                                      unsigned long addr)
2812{
2813        void *tail_obj = tail ? : head;
2814        struct kmem_cache_cpu *c;
2815        unsigned long tid;
2816
2817        slab_free_freelist_hook(s, head, tail);
2818
2819redo:
2820        /*
2821         * Determine the current cpu's per cpu slab.
2822         * The cpu may change afterward. However that does not matter since
2823         * data is retrieved via this pointer. If we are on the same cpu
2824         * during the cmpxchg then the free will succeed.
2825         */
2826        preempt_disable();
2827        c = this_cpu_ptr(s->cpu_slab);
2828
2829        tid = c->tid;
2830        preempt_enable();
2831
2832        if (likely(page == c->page)) {
2833                set_freepointer(s, tail_obj, c->freelist);
2834
2835                if (unlikely(!this_cpu_cmpxchg_double(
2836                                s->cpu_slab->freelist, s->cpu_slab->tid,
2837                                c->freelist, tid,
2838                                head, next_tid(tid)))) {
2839
2840                        note_cmpxchg_failure("slab_free", s, tid);
2841                        goto redo;
2842                }
2843                stat(s, FREE_FASTPATH);
2844        } else
2845                __slab_free(s, page, head, tail_obj, cnt, addr);
2846
2847}
2848
2849void kmem_cache_free(struct kmem_cache *s, void *x)
2850{
2851        s = cache_from_obj(s, x);
2852        if (!s)
2853                return;
2854        slab_free(s, virt_to_head_page(x), x, NULL, 1, _RET_IP_);
2855        trace_kmem_cache_free(_RET_IP_, x);
2856}
2857EXPORT_SYMBOL(kmem_cache_free);
2858
2859struct detached_freelist {
2860        struct page *page;
2861        void *tail;
2862        void *freelist;
2863        int cnt;
2864        struct kmem_cache *s;
2865};
2866
2867/*
2868 * This function progressively scans the array with free objects (with
2869 * a limited look ahead) and extracts objects belonging to the same
2870 * page.  It builds a detached freelist directly within the given
2871 * page/objects.  This can happen without any need for
2872 * synchronization, because the objects are owned by the running process.
2873 * The freelist is built up as a single linked list in the objects.
2874 * The idea is that this detached freelist can then be bulk
2875 * transferred to the real freelist(s), requiring only a single
2876 * synchronization primitive.  Look ahead in the array is limited for
2877 * performance reasons.
2878 */
2879static inline
2880int build_detached_freelist(struct kmem_cache *s, size_t size,
2881                            void **p, struct detached_freelist *df)
2882{
2883        size_t first_skipped_index = 0;
2884        int lookahead = 3;
2885        void *object;
2886
2887        /* Always re-init detached_freelist */
2888        df->page = NULL;
2889
2890        do {
2891                object = p[--size];
2892        } while (!object && size);
2893
2894        if (!object)
2895                return 0;
2896
2897        /* Support for memcg, compiler can optimize this out */
2898        df->s = cache_from_obj(s, object);
2899
2900        /* Start new detached freelist */
2901        set_freepointer(df->s, object, NULL);
2902        df->page = virt_to_head_page(object);
2903        df->tail = object;
2904        df->freelist = object;
2905        p[size] = NULL; /* mark object processed */
2906        df->cnt = 1;
2907
2908        while (size) {
2909                object = p[--size];
2910                if (!object)
2911                        continue; /* Skip processed objects */
2912
2913                /* df->page is always set at this point */
2914                if (df->page == virt_to_head_page(object)) {
2915                        /* Opportunistically build the freelist */
2916                        set_freepointer(df->s, object, df->freelist);
2917                        df->freelist = object;
2918                        df->cnt++;
2919                        p[size] = NULL; /* mark object processed */
2920
2921                        continue;
2922                }
2923
2924                /* Limit look ahead search */
2925                if (!--lookahead)
2926                        break;
2927
2928                if (!first_skipped_index)
2929                        first_skipped_index = size + 1;
2930        }
2931
2932        return first_skipped_index;
2933}
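
/*
 * Example of how the limited lookahead groups objects: for an array
 * p[] = { a1, b1, a2, a3, b2 } where the a's come from one slab page
 * and the b's from another, the first call walks the array backwards
 * from b2, links b1 onto the same detached freelist (skipping a3 and a2
 * on the way and remembering the first skipped slot), and returns so
 * that slab_free() can give both b's back to their page with a single
 * cmpxchg.  kmem_cache_free_bulk() then calls in again and the
 * remaining a's are gathered into a second detached freelist.
 */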
2934
2935/* Note that interrupts must be enabled when calling this function. */
2936void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
2937{
2938        if (WARN_ON(!size))
2939                return;
2940
2941        do {
2942                struct detached_freelist df;
2943
2944                size = build_detached_freelist(s, size, p, &df);
2945                if (unlikely(!df.page))
2946                        continue;
2947
2948                slab_free(df.s, df.page, df.freelist, df.tail, df.cnt,_RET_IP_);
2949        } while (likely(size));
2950}
2951EXPORT_SYMBOL(kmem_cache_free_bulk);
2952
2953/* Note that interrupts must be enabled when calling this function. */
2954int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
2955                          void **p)
2956{
2957        struct kmem_cache_cpu *c;
2958        int i;
2959
2960        /* memcg and kmem_cache debug support */
2961        if (unlikely(slab_pre_alloc_hook(s, flags)))
2962                return false;
2963        /*
2964         * Drain objects in the per cpu slab, while disabling local
2965         * IRQs, which protects against PREEMPT and interrupts
2966         * handlers invoking normal fastpath.
2967         */
2968        local_irq_disable();
2969        c = this_cpu_ptr(s->cpu_slab);
2970
2971        for (i = 0; i < size; i++) {
2972                void *object = c->freelist;
2973
2974                if (unlikely(!object)) {
2975                        /*
2976                         * Invoking the slow path likely has the side-effect
2977                         * of re-populating the per CPU c->freelist.
2978                         */
2979                        p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
2980                                            _RET_IP_, c);
2981                        if (unlikely(!p[i]))
2982                                goto error;
2983
2984                        c = this_cpu_ptr(s->cpu_slab);
2985                        continue; /* goto for-loop */
2986                }
2987                c->freelist = get_freepointer(s, object);
2988                p[i] = object;
2989        }
2990        c->tid = next_tid(c->tid);
2991        local_irq_enable();
2992
2993        /* Clear memory outside IRQ disabled fastpath loop */
2994        if (unlikely(flags & __GFP_ZERO)) {
2995                int j;
2996
2997                for (j = 0; j < i; j++)
2998                        memset(p[j], 0, s->object_size);
2999        }
3000
3001        /* memcg and kmem_cache debug support */
3002        slab_post_alloc_hook(s, flags, size, p);
3003        return i;
3004error:
3005        local_irq_enable();
3006        slab_post_alloc_hook(s, flags, i, p);
3007        __kmem_cache_free_bulk(s, i, p);
3008        return 0;
3009}
3010EXPORT_SYMBOL(kmem_cache_alloc_bulk);
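
/*
 * A minimal bulk caller, for illustration (cachep and the batch size
 * are arbitrary):
 *
 *	void *objs[16];
 *
 *	if (!kmem_cache_alloc_bulk(cachep, GFP_KERNEL, ARRAY_SIZE(objs), objs))
 *		return -ENOMEM;
 *	kmem_cache_free_bulk(cachep, ARRAY_SIZE(objs), objs);
 *
 * kmem_cache_alloc_bulk() either fills the whole array and returns the
 * count or returns 0 after freeing any partial batch, so one test
 * covers the error path.  The free side tolerates NULL entries in the
 * array; build_detached_freelist() simply skips them.
 */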
3011
3012
3013/*
3014 * Object placement in a slab is made very easy because we always start at
3015 * offset 0. If we tune the size of the object to the alignment then we can
3016 * get the required alignment by putting one properly sized object after
3017 * another.
3018 *
3019 * Notice that the allocation order determines the sizes of the per cpu
3020 * caches. Each processor has always one slab available for allocations.
3021 * Increasing the allocation order reduces the number of times that slabs
3022 * must be moved on and off the partial lists and is therefore a factor in
3023 * locking overhead.
3024 */
3025
3026/*
3027 * Minimum / Maximum order of slab pages. This influences locking overhead
3028 * and slab fragmentation. A higher order reduces the number of partial slabs
3029 * and increases the number of allocations possible without having to
3030 * take the list_lock.
3031 */
3032static int slub_min_order;
3033static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER;
3034static int slub_min_objects;
3035
3036/*
3037 * Merge control. If this is set then no merging of slab caches will occur.
3038 * (Could be removed. This was introduced to pacify the merge skeptics.)
3039 */
3040static int slub_nomerge;
3041
3042/*
3043 * Calculate the order of allocation given a slab object size.
3044 *
3045 * The order of allocation has significant impact on performance and other
3046 * system components. Generally order 0 allocations should be preferred since
3047 * order 0 does not cause fragmentation in the page allocator. Larger objects
3048 * can be problematic to put into order 0 slabs because there may be too much
3049 * unused space left. We go to a higher order if more than 1/16th of the slab
3050 * would be wasted.
3051 *
3052 * In order to reach satisfactory performance we must ensure that a minimum
3053 * number of objects is in one slab. Otherwise we may generate too much
3054 * activity on the partial lists which requires taking the list_lock. This is
3055 * less a concern for large slabs though which are rarely used.
3056 *
3057 * slub_max_order specifies the order where we begin to stop considering the
3058 * number of objects in a slab as critical. If we reach slub_max_order then
3059 * we try to keep the page order as low as possible. So we accept more waste
3060 * of space in favor of a small page order.
3061 *
3062 * Higher order allocations also allow the placement of more objects in a
3063 * slab and thereby reduce object handling overhead. If the user has
3064 * requested a higher minimum order then we start with that one instead of
3065 * the smallest order which will fit the object.
3066 */
3067static inline int slab_order(int size, int min_objects,
3068                                int max_order, int fract_leftover, int reserved)
3069{
3070        int order;
3071        int rem;
3072        int min_order = slub_min_order;
3073
3074        if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE)
3075                return get_order(size * MAX_OBJS_PER_PAGE) - 1;
3076
3077        for (order = max(min_order,
3078                                fls(min_objects * size - 1) - PAGE_SHIFT);
3079                        order <= max_order; order++) {
3080
3081                unsigned long slab_size = PAGE_SIZE << order;
3082
3083                if (slab_size < min_objects * size + reserved)
3084                        continue;
3085
3086                rem = (slab_size - reserved) % size;
3087
3088                if (rem <= slab_size / fract_leftover)
3089                        break;
3090
3091        }
3092
3093        return order;
3094}
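
/*
 * Worked example (4 KiB pages): for size = 700, reserved = 0,
 * min_objects = 16 and fract_leftover = 16 the loop starts at order
 *
 *	fls(16 * 700 - 1) - PAGE_SHIFT = 14 - 12 = 2
 *
 * An order-2 slab is 16384 bytes, which holds the required
 * 16 * 700 = 11200 bytes of objects and wastes 16384 % 700 = 284 bytes,
 * below the 16384 / 16 = 1024 byte limit, so order 2 is returned.  If
 * the waste had been too large the loop would keep trying higher orders
 * up to max_order, and calculate_order() below progressively relaxes
 * fract_leftover and min_objects when nothing acceptable is found.
 */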
3095
3096static inline int calculate_order(int size, int reserved)
3097{
3098        int order;
3099        int min_objects;
3100        int fraction;
3101        int max_objects;
3102
3103        /*
3104         * Attempt to find the best configuration for a slab. This
3105         * works by first attempting to generate a layout with
3106         * the best configuration and backing off gradually.
3107         *
3108         * First we reduce the acceptable waste in a slab. Then
3109         * we reduce the minimum objects required in a slab.
3110         */
3111        min_objects = slub_min_objects;
3112        if (!min_objects)
3113                min_objects = 4 * (fls(nr_cpu_ids) + 1);
3114        max_objects = order_objects(slub_max_order, size, reserved);
3115        min_objects = min(min_objects, max_objects);
3116
3117        while (min_objects > 1) {
3118                fraction = 16;
3119                while (fraction >= 4) {
3120                        order = slab_order(size, min_objects,
3121                                        slub_max_order, fraction, reserved);
3122                        if (order <= slub_max_order)
3123                                return order;
3124                        fraction /= 2;
3125                }
3126                min_objects--;
3127        }
3128
3129        /*
3130         * We were unable to place multiple objects in a slab. Now
3131 * let's see if we can place a single object there.
3132         */
3133        order = slab_order(size, 1, slub_max_order, 1, reserved);
3134        if (order <= slub_max_order)
3135                return order;
3136
3137        /*
3138 * Doh, this slab cannot be placed using slub_max_order.
3139         */
3140        order = slab_order(size, 1, MAX_ORDER, 1, reserved);
3141        if (order < MAX_ORDER)
3142                return order;
3143        return -ENOSYS;
3144}
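
/*
 * Worked example for slab_order() above (illustrative only; assumes 4 KiB
 * pages, reserved == 0 and slub_min_order == 0). For a hypothetical 700 byte
 * object with min_objects == 8 and fract_leftover == 16:
 *
 *	start order = max(0, fls(8 * 700 - 1) - PAGE_SHIFT) = 13 - 12 = 1
 *	order 1: slab_size = 8192 >= 8 * 700 = 5600
 *	         rem = 8192 % 700 = 492 <= 8192 / 16 = 512, so order 1 is taken
 *
 * The resulting 8 KiB slab holds 11 such objects and wastes 492 bytes, which
 * is within the 1/16th waste budget, so no higher order is tried.
 */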
3145
3146static void
3147init_kmem_cache_node(struct kmem_cache_node *n)
3148{
3149        n->nr_partial = 0;
3150        spin_lock_init(&n->list_lock);
3151        INIT_LIST_HEAD(&n->partial);
3152#ifdef CONFIG_SLUB_DEBUG
3153        atomic_long_set(&n->nr_slabs, 0);
3154        atomic_long_set(&n->total_objects, 0);
3155        INIT_LIST_HEAD(&n->full);
3156#endif
3157}
3158
3159static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
3160{
3161        BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
3162                        KMALLOC_SHIFT_HIGH * sizeof(struct kmem_cache_cpu));
3163
3164        /*
3165         * Must align to double word boundary for the double cmpxchg
3166         * instructions to work; see __pcpu_double_call_return_bool().
3167         */
3168        s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu),
3169                                     2 * sizeof(void *));
3170
3171        if (!s->cpu_slab)
3172                return 0;
3173
3174        init_kmem_cache_cpus(s);
3175
3176        return 1;
3177}
3178
3179static struct kmem_cache *kmem_cache_node;
3180
3181/*
3182 * No kmalloc_node yet so do it by hand. We know that this is the first
3183 * slab on the node for this slabcache. There are no concurrent accesses
3184 * possible.
3185 *
3186 * Note that this function only works on the kmem_cache_node cache
3187 * when allocating for the kmem_cache_node cache. This is used for bootstrapping
3188 * memory on a fresh node that has no slab structures yet.
3189 */
3190static void early_kmem_cache_node_alloc(int node)
3191{
3192        struct page *page;
3193        struct kmem_cache_node *n;
3194
3195        BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
3196
3197        page = new_slab(kmem_cache_node, GFP_NOWAIT, node);
3198
3199        BUG_ON(!page);
3200        if (page_to_nid(page) != node) {
3201                printk(KERN_ERR "SLUB: Unable to allocate memory from "
3202                                "node %d\n", node);
3203                printk(KERN_ERR "SLUB: Allocating a useless per node structure "
3204                                "in order to be able to continue\n");
3205        }
3206
3207        n = page->freelist;
3208        BUG_ON(!n);
3209        page->freelist = get_freepointer(kmem_cache_node, n);
3210        page->inuse = 1;
3211        page->frozen = 0;
3212        kmem_cache_node->node[node] = n;
3213#ifdef CONFIG_SLUB_DEBUG
3214        init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
3215        init_tracking(kmem_cache_node, n);
3216#endif
3217        init_kmem_cache_node(n);
3218        inc_slabs_node(kmem_cache_node, node, page->objects);
3219
3220        add_partial(n, page, DEACTIVATE_TO_HEAD);
3221}
3222
3223static void free_kmem_cache_nodes(struct kmem_cache *s)
3224{
3225        int node;
3226
3227        for_each_node_state(node, N_NORMAL_MEMORY) {
3228                struct kmem_cache_node *n = s->node[node];
3229
3230                if (n)
3231                        kmem_cache_free(kmem_cache_node, n);
3232
3233                s->node[node] = NULL;
3234        }
3235}
3236
3237static int init_kmem_cache_nodes(struct kmem_cache *s)
3238{
3239        int node;
3240
3241        for_each_node_state(node, N_NORMAL_MEMORY) {
3242                struct kmem_cache_node *n;
3243
3244                if (slab_state == DOWN) {
3245                        early_kmem_cache_node_alloc(node);
3246                        continue;
3247                }
3248                n = kmem_cache_alloc_node(kmem_cache_node,
3249                                                GFP_KERNEL, node);
3250
3251                if (!n) {
3252                        free_kmem_cache_nodes(s);
3253                        return 0;
3254                }
3255
3256                s->node[node] = n;
3257                init_kmem_cache_node(n);
3258        }
3259        return 1;
3260}
3261
3262static void set_min_partial(struct kmem_cache *s, unsigned long min)
3263{
3264        if (min < MIN_PARTIAL)
3265                min = MIN_PARTIAL;
3266        else if (min > MAX_PARTIAL)
3267                min = MAX_PARTIAL;
3268        s->min_partial = min;
3269}
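
/*
 * For reference (illustrative only): kmem_cache_open() below passes
 * ilog2(s->size) / 2, so e.g. a cache with s->size == 4096 keeps
 * min_partial == 6 slabs per node, while very small caches are clamped up
 * to MIN_PARTIAL and very large ones down to MAX_PARTIAL.
 */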
3270
3271/*
3272 * calculate_sizes() determines the order and the distribution of data within
3273 * a slab object.
3274 */
3275static int calculate_sizes(struct kmem_cache *s, int forced_order)
3276{
3277        unsigned long flags = s->flags;
3278        unsigned long size = s->object_size;
3279        int order;
3280
3281        /*
3282         * Round up object size to the next word boundary. We can only
3283         * place the free pointer at word boundaries and this determines
3284         * the possible location of the free pointer.
3285         */
3286        size = ALIGN(size, sizeof(void *));
3287
3288#ifdef CONFIG_SLUB_DEBUG
3289        /*
3290         * Determine if we can poison the object itself. If the user of
3291         * the slab may touch the object after free or before allocation
3292         * then we should never poison the object itself.
3293         */
3294        if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) &&
3295                        !s->ctor)
3296                s->flags |= __OBJECT_POISON;
3297        else
3298                s->flags &= ~__OBJECT_POISON;
3299
3300
3301        /*
3302         * If we are Redzoning then check if there is some space between the
3303         * end of the object and the free pointer. If not then add an
3304         * additional word to have some bytes to store Redzone information.
3305         */
3306        if ((flags & SLAB_RED_ZONE) && size == s->object_size)
3307                size += sizeof(void *);
3308#endif
3309
3310        /*
3311         * With that we have determined the number of bytes in actual use
3312         * by the object. This is the potential offset to the free pointer.
3313         */
3314        s->inuse = size;
3315
3316        if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) ||
3317                s->ctor)) {
3318                /*
3319                 * Relocate free pointer after the object if it is not
3320                 * permitted to overwrite the first word of the object on
3321                 * kmem_cache_free.
3322                 *
3323                 * This is the case if we do RCU, have a constructor or
3324                 * destructor or are poisoning the objects.
3325                 */
3326                s->offset = size;
3327                size += sizeof(void *);
3328        }
3329
3330#ifdef CONFIG_SLUB_DEBUG
3331        if (flags & SLAB_STORE_USER)
3332                /*
3333                 * Need to store information about allocs and frees after
3334                 * the object.
3335                 */
3336                size += 2 * sizeof(struct track);
3337
3338        if (flags & SLAB_RED_ZONE) {
3339                /*
3340                 * Add some empty padding so that we can catch
3341                 * overwrites from earlier objects rather than let
3342                 * tracking information or the free pointer be
3343                 * corrupted if a user writes before the start
3344                 * of the object.
3345                 */
3346                size += sizeof(void *);
3347
3348                s->red_left_pad = sizeof(void *);
3349                s->red_left_pad = ALIGN(s->red_left_pad, s->align);
3350                size += s->red_left_pad;
3351        }
3352#endif
3353
3354        /*
3355         * SLUB stores one object immediately after another beginning from
3356         * offset 0. In order to align the objects we have to simply size
3357         * each object to conform to the alignment.
3358         */
3359        size = ALIGN(size, s->align);
3360        s->size = size;
3361        if (forced_order >= 0)
3362                order = forced_order;
3363        else
3364                order = calculate_order(size, s->reserved);
3365
3366        if (order < 0)
3367                return 0;
3368
3369        s->allocflags = 0;
3370        if (order)
3371                s->allocflags |= __GFP_COMP;
3372
3373        if (s->flags & SLAB_CACHE_DMA)
3374                s->allocflags |= GFP_DMA;
3375
3376        if (s->flags & SLAB_RECLAIM_ACCOUNT)
3377                s->allocflags |= __GFP_RECLAIMABLE;
3378
3379        /*
3380         * Determine the number of objects per slab
3381         */
3382        s->oo = oo_make(order, size, s->reserved);
3383        s->min = oo_make(get_order(size), size, s->reserved);
3384        if (oo_objects(s->oo) > oo_objects(s->max))
3385                s->max = s->oo;
3386
3387        return !!oo_objects(s->oo);
3388}
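
/*
 * Size accounting sketch for a hypothetical debug cache (illustrative only;
 * assumes object_size == 40, 64 bit pointers, align == 8 and the flags
 * SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER, no ctor, no RCU):
 *
 *	ALIGN(40, 8)			-> size = 40
 *	red zone word after the object	-> size = 48, s->inuse = 48
 *	relocated free pointer		-> s->offset = 48, size = 56
 *	alloc/free tracking		-> size = 56 + 2 * sizeof(struct track)
 *	left red zone + padding word	-> size += 2 * 8, s->red_left_pad = 8
 *	final				-> s->size = ALIGN(size, 8)
 *
 * s->oo then encodes how many such objects fit into a slab page of the
 * order chosen by calculate_order().
 */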
3389
3390static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
3391{
3392        s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor);
3393        s->reserved = 0;
3394
3395        if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU))
3396                s->reserved = sizeof(struct rcu_head);
3397
3398        if (!calculate_sizes(s, -1))
3399                goto error;
3400        if (disable_higher_order_debug) {
3401                /*
3402                 * Disable debugging flags that store metadata if the min slab
3403                 * order increased.
3404                 */
3405                if (get_order(s->size) > get_order(s->object_size)) {
3406                        s->flags &= ~DEBUG_METADATA_FLAGS;
3407                        s->offset = 0;
3408                        if (!calculate_sizes(s, -1))
3409                                goto error;
3410                }
3411        }
3412
3413#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
3414    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
3415        if (system_has_cmpxchg_double() && (s->flags & SLAB_NO_CMPXCHG) == 0)
3416                /* Enable fast mode */
3417                s->flags |= __CMPXCHG_DOUBLE;
3418#endif
3419
3420        /*
3421         * The larger the object size is, the more pages we want on the partial
3422         * list to avoid pounding the page allocator excessively.
3423         */
3424        set_min_partial(s, ilog2(s->size) / 2);
3425
3426        /*
3427         * cpu_partial determines the maximum number of objects kept in the
3428         * per cpu partial lists of a processor.
3429         *
3430         * Per cpu partial lists mainly contain slabs that just have one
3431         * object freed. If they are used for allocation then they can be
3432         * filled up again with minimal effort. The slab will never hit the
3433         * per node partial lists and therefore no locking will be required.
3434         *
3435         * This setting also determines
3436         *
3437         * A) The number of objects from per cpu partial slabs dumped to the
3438         *    per node list when we reach the limit.
3439         * B) The number of objects in cpu partial slabs to extract from the
3440         *    per node list when we run out of per cpu objects. We only fetch 50%
3441         *    to keep some capacity around for frees.
3442         */
3443        if (kmem_cache_debug(s))
3444                s->cpu_partial = 0;
3445        else if (s->size >= PAGE_SIZE)
3446                s->cpu_partial = 2;
3447        else if (s->size >= 1024)
3448                s->cpu_partial = 6;
3449        else if (s->size >= 256)
3450                s->cpu_partial = 13;
3451        else
3452                s->cpu_partial = 30;
3453
3454#ifdef CONFIG_NUMA
3455        s->remote_node_defrag_ratio = 1000;
3456#endif
3457        if (!init_kmem_cache_nodes(s))
3458                goto error;
3459
3460        if (alloc_kmem_cache_cpus(s))
3461                return 0;
3462
3463        free_kmem_cache_nodes(s);
3464error:
3465        if (flags & SLAB_PANIC)
3466                panic("Cannot create slab %s size=%lu realsize=%u "
3467                        "order=%u offset=%u flags=%lx\n",
3468                        s->name, (unsigned long)s->size, s->size, oo_order(s->oo),
3469                        s->offset, flags);
3470        return -EINVAL;
3471}
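
/*
 * Minimal usage sketch (illustrative only; "struct foo" and the cache name
 * are hypothetical). Callers reach kmem_cache_open() indirectly through
 * kmem_cache_create() in mm/slab_common.c:
 *
 *	struct kmem_cache *foo_cache;
 *
 *	foo_cache = kmem_cache_create("foo_cache", sizeof(struct foo),
 *				      0, SLAB_HWCACHE_ALIGN, NULL);
 *	if (!foo_cache)
 *		return -ENOMEM;
 *
 *	p = kmem_cache_alloc(foo_cache, GFP_KERNEL);
 *	...
 *	kmem_cache_free(foo_cache, p);
 *	kmem_cache_destroy(foo_cache);
 */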
3472
3473static void list_slab_objects(struct kmem_cache *s, struct page *page,
3474                                                        const char *text)
3475{
3476#ifdef CONFIG_SLUB_DEBUG
3477        void *addr = page_address(page);
3478        void *p;
3479        unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) *
3480                                     sizeof(long), GFP_ATOMIC);
3481        if (!map)
3482                return;
3483        slab_err(s, page, text, s->name);
3484        slab_lock(page);
3485
3486        get_map(s, page, map);
3487        for_each_object(p, s, addr, page->objects) {
3488
3489                if (!test_bit(slab_index(p, s, addr), map)) {
3490                        printk(KERN_ERR "INFO: Object 0x%p @offset=%tu\n",
3491                                                        p, p - addr);
3492                        print_tracking(s, p);
3493                }
3494        }
3495        slab_unlock(page);
3496        kfree(map);
3497#endif
3498}
3499
3500/*
3501 * Attempt to free all partial slabs on a node.
3502 * This is called from kmem_cache_close(). We must be the last thread
3503 * using the cache and therefore we do not need to lock anymore.
3504 */
3505static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
3506{
3507        struct page *page, *h;
3508
3509        list_for_each_entry_safe(page, h, &n->partial, lru) {
3510                if (!page->inuse) {
3511                        remove_partial(n, page);
3512                        discard_slab(s, page);
3513                } else {
3514                        list_slab_objects(s, page,
3515                        "Objects remaining in %s on kmem_cache_close()");
3516                }
3517        }
3518}
3519
3520/*
3521 * Release all resources used by a slab cache.
3522 */
3523static inline int kmem_cache_close(struct kmem_cache *s)
3524{
3525        int node;
3526
3527        flush_all(s);
3528        /* Attempt to free all objects */
3529        for_each_node_state(node, N_NORMAL_MEMORY) {
3530                struct kmem_cache_node *n = get_node(s, node);
3531
3532                free_partial(s, n);
3533                if (n->nr_partial || slabs_node(s, node))
3534                        return 1;
3535        }
3536        free_percpu(s->cpu_slab);
3537        free_kmem_cache_nodes(s);
3538        return 0;
3539}
3540
3541int __kmem_cache_shutdown(struct kmem_cache *s)
3542{
3543        int rc = kmem_cache_close(s);
3544
3545        if (!rc) {
3546                /*
3547                 * Since slab_attr_store may take the slab_mutex, we should
3548                 * release the lock while removing the sysfs entry in order to
3549                 * avoid a deadlock. Because this is pretty much the last
3550                 * operation we do and the lock will be released shortly after
3551                 * that in slab_common.c, we could just move sysfs_slab_remove
3552                 * to a later point in common code. We should do that when we
3553                 * have a common sysfs framework for all allocators.
3554                 */
3555                mutex_unlock(&slab_mutex);
3556                sysfs_slab_remove(s);
3557                mutex_lock(&slab_mutex);
3558        }
3559
3560        return rc;
3561}
3562
3563/********************************************************************
3564 *              Kmalloc subsystem
3565 *******************************************************************/
3566
3567static int __init setup_slub_min_order(char *str)
3568{
3569        get_option(&str, &slub_min_order);
3570
3571        return 1;
3572}
3573
3574__setup("slub_min_order=", setup_slub_min_order);
3575
3576static int __init setup_slub_max_order(char *str)
3577{
3578        get_option(&str, &slub_max_order);
3579        slub_max_order = min(slub_max_order, MAX_ORDER - 1);
3580
3581        return 1;
3582}
3583
3584__setup("slub_max_order=", setup_slub_max_order);
3585
3586static int __init setup_slub_min_objects(char *str)
3587{
3588        get_option(&str, &slub_min_objects);
3589
3590        return 1;
3591}
3592
3593__setup("slub_min_objects=", setup_slub_min_objects);
3594
3595static int __init setup_slub_nomerge(char *str)
3596{
3597        slub_nomerge = 1;
3598        return 1;
3599}
3600
3601__setup("slub_nomerge", setup_slub_nomerge);
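
/*
 * Example (illustrative) kernel command line using the parameters above:
 *
 *	slub_min_objects=16 slub_max_order=2 slub_nomerge
 *
 * This asks for at least 16 objects per slab where possible, caps slab pages
 * at order 2 and disables cache merging entirely.
 */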
3602
3603void *__kmalloc(size_t size, gfp_t flags)
3604{
3605        struct kmem_cache *s;
3606        void *ret;
3607
3608        if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
3609                return kmalloc_large(size, flags);
3610
3611        s = kmalloc_slab(size, flags);
3612
3613        if (unlikely(ZERO_OR_NULL_PTR(s)))
3614                return s;
3615
3616        ret = slab_alloc(s, flags, _RET_IP_);
3617
3618        trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
3619
3620        return ret;
3621}
3622EXPORT_SYMBOL(__kmalloc);
3623
3624#ifdef CONFIG_NUMA
3625static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
3626{
3627        struct page *page;
3628        void *ptr = NULL;
3629
3630        flags |= __GFP_COMP | __GFP_NOTRACK;
3631        page = alloc_pages_node(node, flags, get_order(size));
3632        if (page)
3633                ptr = page_address(page);
3634
3635        kmemleak_alloc(ptr, size, 1, flags);
3636        return ptr;
3637}
3638
3639void *__kmalloc_node(size_t size, gfp_t flags, int node)
3640{
3641        struct kmem_cache *s;
3642        void *ret;
3643
3644        if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
3645                ret = kmalloc_large_node(size, flags, node);
3646
3647                trace_kmalloc_node(_RET_IP_, ret,
3648                                   size, PAGE_SIZE << get_order(size),
3649                                   flags, node);
3650
3651                return ret;
3652        }
3653
3654        s = kmalloc_slab(size, flags);
3655
3656        if (unlikely(ZERO_OR_NULL_PTR(s)))
3657                return s;
3658
3659        ret = slab_alloc_node(s, flags, node, _RET_IP_);
3660
3661        trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
3662
3663        return ret;
3664}
3665EXPORT_SYMBOL(__kmalloc_node);
3666#endif
3667
3668#ifdef CONFIG_HARDENED_USERCOPY
3669/*
3670 * Rejects objects that are incorrectly sized.
3671 *
3672 * Returns NULL if check passes, otherwise const char * to name of cache
3673 * to indicate an error.
3674 */
3675const char *__check_heap_object(const void *ptr, unsigned long n,
3676                                struct page *page)
3677{
3678        struct kmem_cache *s;
3679        unsigned long offset;
3680        size_t object_size;
3681
3682        /* Find object and usable object size. */
3683        s = page->slab_cache;
3684        object_size = slab_ksize(s);
3685
3686        /* Reject impossible pointers. */
3687        if (ptr < page_address(page))
3688                return s->name;
3689
3690        /* Find offset within object. */
3691        offset = (ptr - page_address(page)) % s->size;
3692
3693        /* Adjust for redzone and reject if within the redzone. */
3694        if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) {
3695                if (offset < s->red_left_pad)
3696                        return s->name;
3697                offset -= s->red_left_pad;
3698        }
3699
3700        /* Allow address range falling entirely within object size. */
3701        if (offset <= object_size && n <= object_size - offset)
3702                return NULL;
3703
3704        return s->name;
3705}
3706#endif /* CONFIG_HARDENED_USERCOPY */
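
/*
 * Worked example for __check_heap_object() (illustrative only; assumes a
 * non-debug cache with s->size == slab_ksize(s) == 96 and no red zoning),
 * for a pointer that is 40 bytes into one of the objects:
 *
 *	offset = (ptr - page_address(page)) % 96 = 40
 *	n = 48: 40 <= 96 and 48 <= 96 - 40 = 56	-> copy allowed (NULL)
 *	n = 64: 64 > 96 - 40 = 56		-> rejected, cache name returned
 *
 * A copy that would spill into the following object is therefore refused.
 */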
3707
3708size_t ksize(const void *object)
3709{
3710        struct page *page;
3711
3712        if (unlikely(object == ZERO_SIZE_PTR))
3713                return 0;
3714
3715        page = virt_to_head_page(object);
3716
3717        if (unlikely(!PageSlab(page))) {
3718                WARN_ON(!PageCompound(page));
3719                return PAGE_SIZE << compound_order(page);
3720        }
3721
3722        return slab_ksize(page->slab_cache);
3723}
3724EXPORT_SYMBOL(ksize);
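
/*
 * Illustrative only: kmalloc() rounds a request up to the nearest kmalloc
 * cache and ksize() reports that usable size rather than the requested one:
 *
 *	char *buf = kmalloc(100, GFP_KERNEL);
 *	if (buf)
 *		pr_info("usable size %zu\n", ksize(buf));	/* e.g. 128 */
 *	kfree(buf);
 *
 * The exact value depends on the kmalloc cache geometry of the running
 * kernel.
 */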
3725
3726#ifdef CONFIG_SLUB_DEBUG
3727bool verify_mem_not_deleted(const void *x)
3728{
3729        struct page *page;
3730        void *object = (void *)x;
3731        unsigned long flags;
3732        bool rv;
3733
3734        if (unlikely(ZERO_OR_NULL_PTR(x)))
3735                return false;
3736
3737        local_irq_save(flags);
3738
3739        page = virt_to_head_page(x);
3740        if (unlikely(!PageSlab(page))) {
3741                /* maybe it was from stack? */
3742                rv = true;
3743                goto out_unlock;
3744        }
3745
3746        slab_lock(page);
3747        if (on_freelist(page->slab_cache, page, object)) {
3748                object_err(page->slab_cache, page, object, "Object is on free-list");
3749                rv = false;
3750        } else {
3751                rv = true;
3752        }
3753        slab_unlock(page);
3754
3755out_unlock:
3756        local_irq_restore(flags);
3757        return rv;
3758}
3759EXPORT_SYMBOL(verify_mem_not_deleted);
3760#endif
3761
3762void kfree(const void *x)
3763{
3764        struct page *page;
3765        void *object = (void *)x;
3766
3767        trace_kfree(_RET_IP_, x);
3768
3769        if (unlikely(ZERO_OR_NULL_PTR(x)))
3770                return;
3771
3772        page = virt_to_head_page(x);
3773        if (unlikely(!PageSlab(page))) {
3774                BUG_ON(!PageCompound(page));
3775                kmemleak_free(x);
3776                __free_pages(page, compound_order(page));
3777                return;
3778        }
3779        slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_);
3780}
3781EXPORT_SYMBOL(kfree);
3782
3783/*
3784 * kmem_cache_shrink removes empty slabs from the partial lists and sorts
3785 * the remaining slabs by the number of items in use. The slabs with the
3786 * most items in use come first. New allocations will then fill those up
3787 * and thus they can be removed from the partial lists.
3788 *
3789 * The slabs with the least items are placed last. This results in them
3790 * being allocated from last, increasing the chance that the last objects
3791 * are freed in them.
3792 */
3793int kmem_cache_shrink(struct kmem_cache *s)
3794{
3795        int node;
3796        int i;
3797        struct kmem_cache_node *n;
3798        struct page *page;
3799        struct page *t;
3800        int objects = oo_objects(s->max);
3801        struct list_head *slabs_by_inuse =
3802                kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL);
3803        unsigned long flags;
3804
3805        if (!slabs_by_inuse)
3806                return -ENOMEM;
3807
3808        flush_all(s);
3809        for_each_node_state(node, N_NORMAL_MEMORY) {
3810                n = get_node(s, node);
3811
3812                for (i = 0; i < objects; i++)
3813                        INIT_LIST_HEAD(slabs_by_inuse + i);
3814
3815                spin_lock_irqsave(&n->list_lock, flags);
3816
3817                /*
3818                 * Build lists indexed by the items in use in each slab.
3819                 *
3820                 * Note that concurrent frees may occur while we hold the
3821                 * list_lock. page->inuse here is the upper limit.
3822                 */
3823                list_for_each_entry_safe(page, t, &n->partial, lru) {
3824                        list_move(&page->lru, slabs_by_inuse + page->inuse);
3825                        if (!page->inuse)
3826                                n->nr_partial--;
3827                }
3828
3829                /*
3830                 * Rebuild the partial list with the slabs filled up most
3831                 * first and the least used slabs at the end.
3832                 */
3833                for (i = objects - 1; i > 0; i--)
3834                        list_splice(slabs_by_inuse + i, n->partial.prev);
3835
3836                spin_unlock_irqrestore(&n->list_lock, flags);
3837
3838                /* Release empty slabs */
3839                list_for_each_entry_safe(page, t, slabs_by_inuse, lru)
3840                        discard_slab(s, page);
3841        }
3842
3843        kfree(slabs_by_inuse);
3844        return 0;
3845}
3846EXPORT_SYMBOL(kmem_cache_shrink);
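
/*
 * Usage sketch (illustrative only; "foo_cache" is hypothetical). A subsystem
 * that has just released a large number of objects can hand the now empty
 * slabs back to the page allocator:
 *
 *	int ret = kmem_cache_shrink(foo_cache);
 *	if (ret)
 *		pr_warn("shrink of foo_cache failed: %d\n", ret);
 *
 * The memcg deactivation and memory hotplug callbacks below use it for the
 * same purpose.
 */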
3847
3848#ifdef CONFIG_MEMCG
3849void __kmemcg_cache_deactivate(struct kmem_cache *s)
3850{
3851        /*
3852         * Disable caching of empty slabs. Used to avoid pinning offline
3853         * memory cgroups by kmem pages that can be freed.
3854         */
3855        s->cpu_partial = 0;
3856        s->min_partial = 0;
3857
3858        /*
3859         * s->cpu_partial is checked locklessly (see put_cpu_partial), so
3860         * we have to make sure the change is visible.
3861         */
3862        synchronize_sched();
3863
3864        kmem_cache_shrink(s);
3865}
3866#endif
3867
3868static int slab_mem_going_offline_callback(void *arg)
3869{
3870        struct kmem_cache *s;
3871
3872        mutex_lock(&slab_mutex);
3873        list_for_each_entry(s, &slab_caches, list)
3874                kmem_cache_shrink(s);
3875        mutex_unlock(&slab_mutex);
3876
3877        return 0;
3878}
3879
3880static void slab_mem_offline_callback(void *arg)
3881{
3882        struct kmem_cache_node *n;
3883        struct kmem_cache *s;
3884        struct memory_notify *marg = arg;
3885        int offline_node;
3886
3887        offline_node = marg->status_change_nid_normal;
3888
3889        /*
3890         * If the node still has available memory, we still need its
3891         * kmem_cache_node, so there is nothing to tear down here.
3892         */
3893        if (offline_node < 0)
3894                return;
3895
3896        mutex_lock(&slab_mutex);
3897        list_for_each_entry(s, &slab_caches, list) {
3898                n = get_node(s, offline_node);
3899                if (n) {
3900                        /*
3901                         * if n->nr_slabs > 0, slabs still exist on the node
3902                         * that is going down. We were unable to free them,
3903                         * and the offline_pages() function should not have
3904                         * called this callback. So, we must fail.
3905                         */
3906                        BUG_ON(slabs_node(s, offline_node));
3907
3908                        s->node[offline_node] = NULL;
3909                        kmem_cache_free(kmem_cache_node, n);
3910                }
3911        }
3912        mutex_unlock(&slab_mutex);
3913}
3914
3915static int slab_mem_going_online_callback(void *arg)
3916{
3917        struct kmem_cache_node *n;
3918        struct kmem_cache *s;
3919        struct memory_notify *marg = arg;
3920        int nid = marg->status_change_nid_normal;
3921        int ret = 0;
3922
3923        /*
3924         * If the node's memory is already available, then kmem_cache_node is
3925         * already created. Nothing to do.
3926         */
3927        if (nid < 0)
3928                return 0;
3929
3930        /*
3931         * We are bringing a node online. No memory is available yet. We must
3932         * allocate a kmem_cache_node structure in order to bring the node
3933         * online.
3934         */
3935        mutex_lock(&slab_mutex);
3936        list_for_each_entry(s, &slab_caches, list) {
3937                /*
3938                 * XXX: kmem_cache_alloc_node() will fall back to other nodes
3939                 *      since memory is not yet available from the node that
3940                 *      is brought up.
3941                 */
3942                n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL);
3943                if (!n) {
3944                        ret = -ENOMEM;
3945                        goto out;
3946                }
3947                init_kmem_cache_node(n);
3948                s->node[nid] = n;
3949        }
3950out:
3951        mutex_unlock(&slab_mutex);
3952        return ret;
3953}
3954
3955static int slab_memory_callback(struct notifier_block *self,
3956                                unsigned long action, void *arg)
3957{
3958        int ret = 0;
3959
3960        switch (action) {
3961        case MEM_GOING_ONLINE:
3962                ret = slab_mem_going_online_callback(arg);
3963                break;
3964        case MEM_GOING_OFFLINE:
3965                ret = slab_mem_going_offline_callback(arg);
3966                break;
3967        case MEM_OFFLINE:
3968        case MEM_CANCEL_ONLINE:
3969                slab_mem_offline_callback(arg);
3970                break;
3971        case MEM_ONLINE:
3972        case MEM_CANCEL_OFFLINE:
3973                break;
3974        }
3975        if (ret)
3976                ret = notifier_from_errno(ret);
3977        else
3978                ret = NOTIFY_OK;
3979        return ret;
3980}
3981
3982static struct notifier_block slab_memory_callback_nb = {
3983        .notifier_call = slab_memory_callback,
3984        .priority = SLAB_CALLBACK_PRI,
3985};
3986
3987/********************************************************************
3988 *                      Basic setup of slabs
3989 *******************************************************************/
3990
3991/*
3992 * Used for early kmem_cache structures that were allocated using
3993 * the page allocator. Allocate them properly then fix up the pointers
3994 * that may be pointing to the wrong kmem_cache structure.
3995 */
3996
3997static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
3998{
3999        int node;
4000        struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
4001
4002        memcpy(s, static_cache, kmem_cache->object_size);
4003
4004        /*
4005         * This runs very early, and only the boot processor is supposed to be
4006         * up.  Even if it weren't true, IRQs are not up so we couldn't fire
4007         * IPIs around.
4008         */
4009        __flush_cpu_slab(s, smp_processor_id());
4010        for_each_node_state(node, N_NORMAL_MEMORY) {
4011                struct kmem_cache_node *n = get_node(s, node);
4012                struct page *p;
4013
4014                if (n) {
4015                        list_for_each_entry(p, &n->partial, lru)
4016                                p->slab_cache = s;
4017
4018#ifdef CONFIG_SLUB_DEBUG
4019                        list_for_each_entry(p, &n->full, lru)
4020                                p->slab_cache = s;
4021#endif
4022                }
4023        }
4024        list_add(&s->list, &slab_caches);
4025        return s;
4026}
4027
4028void __init kmem_cache_init(void)
4029{
4030        static __initdata struct kmem_cache boot_kmem_cache,
4031                boot_kmem_cache_node;
4032
4033        if (debug_guardpage_minorder())
4034                slub_max_order = 0;
4035
4036        kmem_cache_node = &boot_kmem_cache_node;
4037        kmem_cache = &boot_kmem_cache;
4038
4039        create_boot_cache(kmem_cache_node, "kmem_cache_node",
4040                sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN);
4041
4042        register_hotmemory_notifier(&slab_memory_callback_nb);
4043
4044        /* Able to allocate the per node structures */
4045        slab_state = PARTIAL;
4046
4047        create_boot_cache(kmem_cache, "kmem_cache",
4048                        offsetof(struct kmem_cache, node) +
4049                                nr_node_ids * sizeof(struct kmem_cache_node *),
4050                       SLAB_HWCACHE_ALIGN);
4051
4052        kmem_cache = bootstrap(&boot_kmem_cache);
4053
4054        /*
4055         * Allocate kmem_cache_node properly from the kmem_cache slab.
4056         * kmem_cache_node is separately allocated so no need to
4057         * update any list pointers.
4058         */
4059        kmem_cache_node = bootstrap(&boot_kmem_cache_node);
4060
4061        /* Now we can use the kmem_cache to allocate kmalloc slabs */
4062        create_kmalloc_caches(0);
4063
4064#ifdef CONFIG_SMP
4065        register_cpu_notifier(&slab_notifier);
4066#endif
4067
4068        printk(KERN_INFO
4069                "SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d,"
4070                " CPUs=%d, Nodes=%d\n",
4071                cache_line_size(),
4072                slub_min_order, slub_max_order, slub_min_objects,
4073                nr_cpu_ids, nr_node_ids);
4074}
4075
4076void __init kmem_cache_init_late(void)
4077{
4078}
4079
4080/*
4081 * Find a mergeable slab cache
4082 */
4083static int slab_unmergeable(struct kmem_cache *s)
4084{
4085        if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE))
4086                return 1;
4087
4088        if (!is_root_cache(s))
4089                return 1;
4090
4091        if (s->ctor)
4092                return 1;
4093
4094        /*
4095         * We may have set a slab to be unmergeable during bootstrap.
4096         */
4097        if (s->refcount < 0)
4098                return 1;
4099
4100        return 0;
4101}
4102
4103static struct kmem_cache *find_mergeable(size_t size, size_t align,
4104                unsigned long flags, const char *name, void (*ctor)(void *))
4105{
4106        struct kmem_cache *s;
4107
4108        if (slub_nomerge || (flags & SLUB_NEVER_MERGE))
4109                return NULL;
4110
4111        if (ctor)
4112                return NULL;
4113
4114        size = ALIGN(size, sizeof(void *));
4115        align = calculate_alignment(flags, align, size);
4116        size = ALIGN(size, align);
4117        flags = kmem_cache_flags(size, flags, name, NULL);
4118
4119        list_for_each_entry(s, &slab_caches, list) {
4120                if (slab_unmergeable(s))
4121                        continue;
4122
4123                if (size > s->size)
4124                        continue;
4125
4126                if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME))
4127                        continue;
4128                /*
4129                 * Check if alignment is compatible.
4130                 * Courtesy of Adrian Drzewiecki
4131                 */
4132                if ((s->size & ~(align - 1)) != s->size)
4133                        continue;
4134
4135                if (s->size - size >= sizeof(void *))
4136                        continue;
4137
4138                return s;
4139        }
4140        return NULL;
4141}
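
/*
 * Merging example (illustrative only; the name and sizes are hypothetical).
 * A request such as
 *
 *	kmem_cache_create("bar_cache", 90, 0, 0, NULL);
 *
 * rounds 90 up to a pointer aligned size that fits an existing compatible
 * cache (e.g. kmalloc-96 on a typical configuration), so find_mergeable()
 * returns that cache and __kmem_cache_alias() below merely bumps its
 * refcount and adds a sysfs alias instead of creating a new cache. Passing
 * a ctor or booting with slub_nomerge prevents this.
 */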
4142
4143struct kmem_cache *
4144__kmem_cache_alias(const char *name, size_t size, size_t align,
4145                   unsigned long flags, void (*ctor)(void *))
4146{
4147        struct kmem_cache *s;
4148
4149        s = find_mergeable(size, align, flags, name, ctor);
4150        if (s) {
4151                s->refcount++;
4152                /*
4153                 * Adjust the object sizes so that we clear
4154                 * the complete object on kzalloc.
4155                 */
4156                s->object_size = max(s->object_size, (int)size);
4157                s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
4158
4159                if (sysfs_slab_alias(s, name)) {
4160                        s->refcount--;
4161                        s = NULL;
4162                }
4163        }
4164
4165        return s;
4166}
4167
4168int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
4169{
4170        int err;
4171
4172        err = kmem_cache_open(s, flags);
4173        if (err)
4174                return err;
4175
4176        /* Mutex is not taken during early boot */
4177        if (slab_state <= UP)
4178                return 0;
4179
4180        memcg_propagate_slab_attrs(s);
4181        err = sysfs_slab_add(s);
4182        if (err)
4183                kmem_cache_close(s);
4184
4185        return err;
4186}
4187
4188#ifdef CONFIG_SMP
4189/*
4190 * Use the cpu notifier to ensure that the cpu slabs are flushed when
4191 * necessary.
4192 */
4193static int slab_cpuup_callback(struct notifier_block *nfb,
4194                unsigned long action, void *hcpu)
4195{
4196        long cpu = (long)hcpu;
4197        struct kmem_cache *s;
4198        unsigned long flags;
4199
4200        switch (action) {
4201        case CPU_UP_CANCELED:
4202        case CPU_UP_CANCELED_FROZEN:
4203        case CPU_DEAD:
4204        case CPU_DEAD_FROZEN:
4205                mutex_lock(&slab_mutex);
4206                list_for_each_entry(s, &slab_caches, list) {
4207                        local_irq_save(flags);
4208                        __flush_cpu_slab(s, cpu);
4209                        local_irq_restore(flags);
4210                }
4211                mutex_unlock(&slab_mutex);
4212                break;
4213        default:
4214                break;
4215        }
4216        return NOTIFY_OK;
4217}
4218
4219static struct notifier_block slab_notifier = {
4220        .notifier_call = slab_cpuup_callback
4221};
4222
4223#endif
4224
4225void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
4226{
4227        struct kmem_cache *s;
4228        void *ret;
4229
4230        if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
4231                return kmalloc_large(size, gfpflags);
4232
4233        s = kmalloc_slab(size, gfpflags);
4234
4235        if (unlikely(ZERO_OR_NULL_PTR(s)))
4236                return s;
4237
4238        ret = slab_alloc(s, gfpflags, caller);
4239
4240        /* Honor the call site pointer we received. */
4241        trace_kmalloc(caller, ret, size, s->size, gfpflags);
4242
4243        return ret;
4244}
4245
4246#ifdef CONFIG_NUMA
4247void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
4248                                        int node, unsigned long caller)
4249{
4250        struct kmem_cache *s;
4251        void *ret;
4252
4253        if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
4254                ret = kmalloc_large_node(size, gfpflags, node);
4255
4256                trace_kmalloc_node(caller, ret,
4257                                   size, PAGE_SIZE << get_order(size),
4258                                   gfpflags, node);
4259
4260                return ret;
4261        }
4262
4263        s = kmalloc_slab(size, gfpflags);
4264
4265        if (unlikely(ZERO_OR_NULL_PTR(s)))
4266                return s;
4267
4268        ret = slab_alloc_node(s, gfpflags, node, caller);
4269
4270        /* Honor the call site pointer we received. */
4271        trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
4272
4273        return ret;
4274}
4275#endif
4276
4277#ifdef CONFIG_SYSFS
4278static int count_inuse(struct page *page)
4279{
4280        return page->inuse;
4281}
4282
4283static int count_total(struct page *page)
4284{
4285        return page->objects;
4286}
4287#endif
4288
4289#ifdef CONFIG_SLUB_DEBUG
4290static int validate_slab(struct kmem_cache *s, struct page *page,
4291                                                unsigned long *map)
4292{
4293        void *p;
4294        void *addr = page_address(page);
4295
4296        if (!check_slab(s, page) ||
4297                        !on_freelist(s, page, NULL))
4298                return 0;
4299
4300        /* Now we know that a valid freelist exists */
4301        bitmap_zero(map, page->objects);
4302
4303        get_map(s, page, map);
4304        for_each_object(p, s, addr, page->objects) {
4305                if (test_bit(slab_index(p, s, addr), map))
4306                        if (!check_object(s, page, p, SLUB_RED_INACTIVE))
4307                                return 0;
4308        }
4309
4310        for_each_object(p, s, addr, page->objects)
4311                if (!test_bit(slab_index(p, s, addr), map))
4312                        if (!check_object(s, page, p, SLUB_RED_ACTIVE))
4313                                return 0;
4314        return 1;
4315}
4316
4317static void validate_slab_slab(struct kmem_cache *s, struct page *page,
4318                                                unsigned long *map)
4319{
4320        slab_lock(page);
4321        validate_slab(s, page, map);
4322        slab_unlock(page);
4323}
4324
4325static int validate_slab_node(struct kmem_cache *s,
4326                struct kmem_cache_node *n, unsigned long *map)
4327{
4328        unsigned long count = 0;
4329        struct page *page;
4330        unsigned long flags;
4331
4332        spin_lock_irqsave(&n->list_lock, flags);
4333
4334        list_for_each_entry(page, &n->partial, lru) {
4335                validate_slab_slab(s, page, map);
4336                count++;
4337        }
4338        if (count != n->nr_partial)
4339                printk(KERN_ERR "SLUB %s: %ld partial slabs counted but "
4340                        "counter=%ld\n", s->name, count, n->nr_partial);
4341
4342        if (!(s->flags & SLAB_STORE_USER))
4343                goto out;
4344
4345        list_for_each_entry(page, &n->full, lru) {
4346                validate_slab_slab(s, page, map);
4347                count++;
4348        }
4349        if (count != atomic_long_read(&n->nr_slabs))
4350                printk(KERN_ERR "SLUB: %s %ld slabs counted but "
4351                        "counter=%ld\n", s->name, count,
4352                        atomic_long_read(&n->nr_slabs));
4353
4354out:
4355        spin_unlock_irqrestore(&n->list_lock, flags);
4356        return count;
4357}
4358
4359static long validate_slab_cache(struct kmem_cache *s)
4360{
4361        int node;
4362        unsigned long count = 0;
4363        unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) *
4364                                sizeof(unsigned long), GFP_KERNEL);
4365
4366        if (!map)
4367                return -ENOMEM;
4368
4369        flush_all(s);
4370        for_each_node_state(node, N_NORMAL_MEMORY) {
4371                struct kmem_cache_node *n = get_node(s, node);
4372
4373                count += validate_slab_node(s, n, map);
4374        }
4375        kfree(map);
4376        return count;
4377}
4378/*
4379 * Generate lists of code addresses where slabcache objects are allocated
4380 * and freed.
4381 */
4382
4383struct location {
4384        unsigned long count;
4385        unsigned long addr;
4386        long long sum_time;
4387        long min_time;
4388        long max_time;
4389        long min_pid;
4390        long max_pid;
4391        DECLARE_BITMAP(cpus, NR_CPUS);
4392        nodemask_t nodes;
4393};
4394
4395struct loc_track {
4396        unsigned long max;
4397        unsigned long count;
4398        struct location *loc;
4399};
4400
4401static void free_loc_track(struct loc_track *t)
4402{
4403        if (t->max)
4404                free_pages((unsigned long)t->loc,
4405                        get_order(sizeof(struct location) * t->max));
4406}
4407
4408static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
4409{
4410        struct location *l;
4411        int order;
4412
4413        order = get_order(sizeof(struct location) * max);
4414
4415        l = (void *)__get_free_pages(flags, order);
4416        if (!l)
4417                return 0;
4418
4419        if (t->count) {
4420                memcpy(l, t->loc, sizeof(struct location) * t->count);
4421                free_loc_track(t);
4422        }
4423        t->max = max;
4424        t->loc = l;
4425        return 1;
4426}
4427
4428static int add_location(struct loc_track *t, struct kmem_cache *s,
4429                                const struct track *track)
4430{
4431        long start, end, pos;
4432        struct location *l;
4433        unsigned long caddr;
4434        unsigned long age = jiffies - track->when;
4435
4436        start = -1;
4437        end = t->count;
4438
4439        for ( ; ; ) {
4440                pos = start + (end - start + 1) / 2;
4441
4442                /*
4443                 * There is nothing at "end". If we end up there
4444                 * we need to add something before end.
4445                 */
4446                if (pos == end)
4447                        break;
4448
4449                caddr = t->loc[pos].addr;
4450                if (track->addr == caddr) {
4451
4452                        l = &t->loc[pos];
4453                        l->count++;
4454                        if (track->when) {
4455                                l->sum_time += age;
4456                                if (age < l->min_time)
4457                                        l->min_time = age;
4458                                if (age > l->max_time)
4459                                        l->max_time = age;
4460
4461                                if (track->pid < l->min_pid)
4462                                        l->min_pid = track->pid;
4463                                if (track->pid > l->max_pid)
4464                                        l->max_pid = track->pid;
4465
4466                                cpumask_set_cpu(track->cpu,
4467                                                to_cpumask(l->cpus));
4468                        }
4469                        node_set(page_to_nid(virt_to_page(track)), l->nodes);
4470                        return 1;
4471                }
4472
4473                if (track->addr < caddr)
4474                        end = pos;
4475                else
4476                        start = pos;
4477        }
4478
4479        /*
4480         * Not found. Insert new tracking element.
4481         */
4482        if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC))
4483                return 0;
4484
4485        l = t->loc + pos;
4486        if (pos < t->count)
4487                memmove(l + 1, l,
4488                        (t->count - pos) * sizeof(struct location));
4489        t->count++;
4490        l->count = 1;
4491        l->addr = track->addr;
4492        l->sum_time = age;
4493        l->min_time = age;
4494        l->max_time = age;
4495        l->min_pid = track->pid;
4496        l->max_pid = track->pid;
4497        cpumask_clear(to_cpumask(l->cpus));
4498        cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
4499        nodes_clear(l->nodes);
4500        node_set(page_to_nid(virt_to_page(track)), l->nodes);
4501        return 1;
4502}
4503
4504static void process_slab(struct loc_track *t, struct kmem_cache *s,
4505                struct page *page, enum track_item alloc,
4506                unsigned long *map)
4507{
4508        void *addr = page_address(page);
4509        void *p;
4510
4511        bitmap_zero(map, page->objects);
4512        get_map(s, page, map);
4513
4514        for_each_object(p, s, addr, page->objects)
4515                if (!test_bit(slab_index(p, s, addr), map))
4516                        add_location(t, s, get_track(s, p, alloc));
4517}
4518
4519static int list_locations(struct kmem_cache *s, char *buf,
4520                                        enum track_item alloc)
4521{
4522        int len = 0;
4523        unsigned long i;
4524        struct loc_track t = { 0, 0, NULL };
4525        int node;
4526        unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) *
4527                                     sizeof(unsigned long), GFP_KERNEL);
4528
4529        if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
4530                                     GFP_TEMPORARY)) {
4531                kfree(map);
4532                return sprintf(buf, "Out of memory\n");
4533        }
4534        /* Push back cpu slabs */
4535        flush_all(s);
4536
4537        for_each_node_state(node, N_NORMAL_MEMORY) {
4538                struct kmem_cache_node *n = get_node(s, node);
4539                unsigned long flags;
4540                struct page *page;
4541
4542                if (!atomic_long_read(&n->nr_slabs))
4543                        continue;
4544
4545                spin_lock_irqsave(&n->list_lock, flags);
4546                list_for_each_entry(page, &n->partial, lru)
4547                        process_slab(&t, s, page, alloc, map);
4548                list_for_each_entry(page, &n->full, lru)
4549                        process_slab(&t, s, page, alloc, map);
4550                spin_unlock_irqrestore(&n->list_lock, flags);
4551        }
4552
4553        for (i = 0; i < t.count; i++) {
4554                struct location *l = &t.loc[i];
4555
4556                if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100)
4557                        break;
4558                len += sprintf(buf + len, "%7ld ", l->count);
4559
4560                if (l->addr)
4561                        len += sprintf(buf + len, "%pS", (void *)l->addr);
4562                else
4563                        len += sprintf(buf + len, "<not-available>");
4564
4565                if (l->sum_time != l->min_time) {
4566                        len += sprintf(buf + len, " age=%ld/%ld/%ld",
4567                                l->min_time,
4568                                (long)div_u64(l->sum_time, l->count),
4569                                l->max_time);
4570                } else
4571                        len += sprintf(buf + len, " age=%ld",
4572                                l->min_time);
4573
4574                if (l->min_pid != l->max_pid)
4575                        len += sprintf(buf + len, " pid=%ld-%ld",
4576                                l->min_pid, l->max_pid);
4577                else
4578                        len += sprintf(buf + len, " pid=%ld",
4579                                l->min_pid);
4580
4581                if (num_online_cpus() > 1 &&
4582                                !cpumask_empty(to_cpumask(l->cpus)) &&
4583                                len < PAGE_SIZE - 60) {
4584                        len += sprintf(buf + len, " cpus=");
4585                        len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50,
4586                                                 to_cpumask(l->cpus));
4587                }
4588
4589                if (nr_online_nodes > 1 && !nodes_empty(l->nodes) &&
4590                                len < PAGE_SIZE - 60) {
4591                        len += sprintf(buf + len, " nodes=");
4592                        len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50,
4593                                        l->nodes);
4594                }
4595
4596                len += sprintf(buf + len, "\n");
4597        }
4598
4599        free_loc_track(&t);
4600        kfree(map);
4601        if (!t.count)
4602                len += sprintf(buf, "No data\n");
4603        return len;
4604}
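
/*
 * The output above is what userspace reads from
 * /sys/kernel/slab/<cache>/alloc_calls and free_calls when user tracking is
 * enabled (e.g. slub_debug=U). An illustrative line with hypothetical values:
 *
 *	   1523 alloc_buffer_head+0x22/0x80 age=10/1420/9510 pid=1-204 cpus=0-3 nodes=0
 *
 * i.e. 1523 tracked objects were allocated at that call site, with the given
 * age, pid, cpu and node spread.
 */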
4605#endif
4606
4607#ifdef SLUB_RESILIENCY_TEST
4608static void resiliency_test(void)
4609{
4610        u8 *p;
4611
4612        BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10);
4613
4614        printk(KERN_ERR "SLUB resiliency testing\n");
4615        printk(KERN_ERR "-----------------------\n");
4616        printk(KERN_ERR "A. Corruption after allocation\n");
4617
4618        p = kzalloc(16, GFP_KERNEL);
4619        p[16] = 0x12;
4620        printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
4621                        " 0x12->0x%p\n\n", p + 16);
4622
4623        validate_slab_cache(kmalloc_caches[4]);
4624
4625        /* Hmmm... The next two are dangerous */
4626        p = kzalloc(32, GFP_KERNEL);
4627        p[32 + sizeof(void *)] = 0x34;
4628        printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
4629                        " 0x34 -> 0x%p\n", p);
4630        printk(KERN_ERR
4631                "If allocated object is overwritten then not detectable\n\n");
4632
4633        validate_slab_cache(kmalloc_caches[5]);
4634        p = kzalloc(64, GFP_KERNEL);
4635        p += 64 + (get_cycles() & 0xff) * sizeof(void *);
4636        *p = 0x56;
4637        printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
4638                                                                        p);
4639        printk(KERN_ERR
4640                "If allocated object is overwritten then not detectable\n\n");
4641        validate_slab_cache(kmalloc_caches[6]);
4642
4643        printk(KERN_ERR "\nB. Corruption after free\n");
4644        p = kzalloc(128, GFP_KERNEL);
4645        kfree(p);
4646        *p = 0x78;
4647        printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
4648        validate_slab_cache(kmalloc_caches[7]);
4649
4650        p = kzalloc(256, GFP_KERNEL);
4651        kfree(p);
4652        p[50] = 0x9a;
4653        printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n",
4654                        p);
4655        validate_slab_cache(kmalloc_caches[8]);
4656
4657        p = kzalloc(512, GFP_KERNEL);
4658        kfree(p);
4659        p[512] = 0xab;
4660        printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
4661        validate_slab_cache(kmalloc_caches[9]);
4662}
4663#else
4664#ifdef CONFIG_SYSFS
4665static void resiliency_test(void) {};
4666#endif
4667#endif
4668
4669#ifdef CONFIG_SYSFS
4670enum slab_stat_type {
4671        SL_ALL,                 /* All slabs */
4672        SL_PARTIAL,             /* Only partially allocated slabs */
4673        SL_CPU,                 /* Only slabs used for cpu caches */
4674        SL_OBJECTS,             /* Determine allocated objects not slabs */
4675        SL_TOTAL                /* Determine object capacity not slabs */
4676};
4677
4678#define SO_ALL          (1 << SL_ALL)
4679#define SO_PARTIAL      (1 << SL_PARTIAL)
4680#define SO_CPU          (1 << SL_CPU)
4681#define SO_OBJECTS      (1 << SL_OBJECTS)
4682#define SO_TOTAL        (1 << SL_TOTAL)
4683
4684static ssize_t show_slab_objects(struct kmem_cache *s,
4685                            char *buf, unsigned long flags)
4686{
4687        unsigned long total = 0;
4688        int node;
4689        int x;
4690        unsigned long *nodes;
4691        unsigned long *per_cpu;
4692
4693        nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL);
4694        if (!nodes)
4695                return -ENOMEM;
4696        per_cpu = nodes + nr_node_ids;
4697
4698        if (flags & SO_CPU) {
4699                int cpu;
4700
4701                for_each_possible_cpu(cpu) {
4702                        struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
4703                        int node;
4704                        struct page *page;
4705
4706                        page = ACCESS_ONCE(c->page);
4707                        if (!page)
4708                                continue;
4709
4710                        node = page_to_nid(page);
4711                        if (flags & SO_TOTAL)
4712                                x = page->objects;
4713                        else if (flags & SO_OBJECTS)
4714                                x = page->inuse;
4715                        else
4716                                x = 1;
4717
4718                        total += x;
4719                        nodes[node] += x;
4720
4721                        page = ACCESS_ONCE(c->partial);
4722                        if (page) {
4723                                node = page_to_nid(page);
4724                                if (flags & SO_TOTAL)
4725                                        WARN_ON_ONCE(1);
4726                                else if (flags & SO_OBJECTS)
4727                                        WARN_ON_ONCE(1);
4728                                else
4729                                        x = page->pages;
4730                                total += x;
4731                                nodes[node] += x;
4732                        }
4733
4734                        per_cpu[node]++;
4735                }
4736        }
4737
4738        get_online_mems();
4739#ifdef CONFIG_SLUB_DEBUG
4740        if (flags & SO_ALL) {
4741                for_each_node_state(node, N_NORMAL_MEMORY) {
4742                        struct kmem_cache_node *n = get_node(s, node);
4743
4744                        if (flags & SO_TOTAL)
4745                                x = atomic_long_read(&n->total_objects);
4746                        else if (flags & SO_OBJECTS)
4747                                x = atomic_long_read(&n->total_objects) -
4748                                        count_partial(n, count_free);
4749                        else
4750                                x = atomic_long_read(&n->nr_slabs);
4751                        total += x;
4752                        nodes[node] += x;
4753                }
4754
4755        } else
4757#endif
4758        if (flags & SO_PARTIAL) {
4759                for_each_node_state(node, N_NORMAL_MEMORY) {
4760                        struct kmem_cache_node *n = get_node(s, node);
4761
4762                        if (flags & SO_TOTAL)
4763                                x = count_partial(n, count_total);
4764                        else if (flags & SO_OBJECTS)
4765                                x = count_partial(n, count_inuse);
4766                        else
4767                                x = n->nr_partial;
4768                        total += x;
4769                        nodes[node] += x;
4770                }
4771        }
4772        x = sprintf(buf, "%lu", total);
4773#ifdef CONFIG_NUMA
4774        for_each_node_state(node, N_NORMAL_MEMORY)
4775                if (nodes[node])
4776                        x += sprintf(buf + x, " N%d=%lu",
4777                                        node, nodes[node]);
4778#endif
4779        put_online_mems();
4780        kfree(nodes);
4781        return x + sprintf(buf + x, "\n");
4782}
4783
4784#ifdef CONFIG_SLUB_DEBUG
4785static int any_slab_objects(struct kmem_cache *s)
4786{
4787        int node;
4788
4789        for_each_online_node(node) {
4790                struct kmem_cache_node *n = get_node(s, node);
4791
4792                if (!n)
4793                        continue;
4794
4795                if (atomic_long_read(&n->total_objects))
4796                        return 1;
4797        }
4798        return 0;
4799}
4800#endif
4801
4802#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
4803#define to_slab(n) container_of(n, struct kmem_cache, kobj)
4804
4805struct slab_attribute {
4806        struct attribute attr;
4807        ssize_t (*show)(struct kmem_cache *s, char *buf);
4808        ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
4809};
4810
4811#define SLAB_ATTR_RO(_name) \
4812        static struct slab_attribute _name##_attr = \
4813        __ATTR(_name, 0400, _name##_show, NULL)
4814
4815#define SLAB_ATTR(_name) \
4816        static struct slab_attribute _name##_attr =  \
4817        __ATTR(_name, 0600, _name##_show, _name##_store)
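    /*
     * For illustration, SLAB_ATTR_RO(aliases); expands to
     *
     *        static struct slab_attribute aliases_attr =
     *                __ATTR(aliases, 0400, aliases_show, NULL);
     *
     * i.e. a read-only sysfs file backed by aliases_show(), while SLAB_ATTR()
     * additionally wires up the matching _store handler and makes the file
     * writable (mode 0600).
     */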
4818
4819static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
4820{
4821        return sprintf(buf, "%d\n", s->size);
4822}
4823SLAB_ATTR_RO(slab_size);
4824
4825static ssize_t align_show(struct kmem_cache *s, char *buf)
4826{
4827        return sprintf(buf, "%d\n", s->align);
4828}
4829SLAB_ATTR_RO(align);
4830
4831static ssize_t object_size_show(struct kmem_cache *s, char *buf)
4832{
4833        return sprintf(buf, "%d\n", s->object_size);
4834}
4835SLAB_ATTR_RO(object_size);
4836
4837static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
4838{
4839        return sprintf(buf, "%d\n", oo_objects(s->oo));
4840}
4841SLAB_ATTR_RO(objs_per_slab);
4842
4843static ssize_t order_store(struct kmem_cache *s,
4844                                const char *buf, size_t length)
4845{
4846        unsigned long order;
4847        int err;
4848
4849        err = strict_strtoul(buf, 10, &order);
4850        if (err)
4851                return err;
4852
4853        if (order > slub_max_order || order < slub_min_order)
4854                return -EINVAL;
4855
4856        calculate_sizes(s, order);
4857        return length;
4858}
4859
4860static ssize_t order_show(struct kmem_cache *s, char *buf)
4861{
4862        return sprintf(buf, "%d\n", oo_order(s->oo));
4863}
4864SLAB_ATTR(order);
4865
4866static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
4867{
4868        return sprintf(buf, "%lu\n", s->min_partial);
4869}
4870
4871static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
4872                                 size_t length)
4873{
4874        unsigned long min;
4875        int err;
4876
4877        err = strict_strtoul(buf, 10, &min);
4878        if (err)
4879                return err;
4880
4881        set_min_partial(s, min);
4882        return length;
4883}
4884SLAB_ATTR(min_partial);
4885
4886static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
4887{
4888        return sprintf(buf, "%u\n", s->cpu_partial);
4889}
4890
4891static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
4892                                 size_t length)
4893{
4894        unsigned long objects;
4895        int err;
4896
4897        err = strict_strtoul(buf, 10, &objects);
4898        if (err)
4899                return err;
4900        if (objects && kmem_cache_debug(s))
4901                return -EINVAL;
4902
4903        s->cpu_partial = objects;
4904        flush_all(s);
4905        return length;
4906}
4907SLAB_ATTR(cpu_partial);
4908
4909static ssize_t ctor_show(struct kmem_cache *s, char *buf)
4910{
4911        if (!s->ctor)
4912                return 0;
4913        return sprintf(buf, "%pS\n", s->ctor);
4914}
4915SLAB_ATTR_RO(ctor);
4916
4917static ssize_t aliases_show(struct kmem_cache *s, char *buf)
4918{
4919        return sprintf(buf, "%d\n", s->refcount - 1);
4920}
4921SLAB_ATTR_RO(aliases);
4922
4923static ssize_t partial_show(struct kmem_cache *s, char *buf)
4924{
4925        return show_slab_objects(s, buf, SO_PARTIAL);
4926}
4927SLAB_ATTR_RO(partial);
4928
4929static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf)
4930{
4931        return show_slab_objects(s, buf, SO_CPU);
4932}
4933SLAB_ATTR_RO(cpu_slabs);
4934
4935static ssize_t objects_show(struct kmem_cache *s, char *buf)
4936{
4937        return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS);
4938}
4939SLAB_ATTR_RO(objects);
4940
4941static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
4942{
4943        return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS);
4944}
4945SLAB_ATTR_RO(objects_partial);
4946
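    /*
     * Reports the per-cpu partial lists as "objects(pages)", followed on SMP
     * by one " C<cpu>=objects(pages)" entry for every cpu that currently
     * holds partial slabs, e.g. "120(4) C0=60(2) C3=60(2)".
     */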
4947static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
4948{
4949        int objects = 0;
4950        int pages = 0;
4951        int cpu;
4952        int len;
4953
4954        for_each_online_cpu(cpu) {
4955                struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial;
4956
4957                if (page) {
4958                        pages += page->pages;
4959                        objects += page->pobjects;
4960                }
4961        }
4962
4963        len = sprintf(buf, "%d(%d)", objects, pages);
4964
4965#ifdef CONFIG_SMP
4966        for_each_online_cpu(cpu) {
4967                struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial;
4968
4969                if (page && len < PAGE_SIZE - 20)
4970                        len += sprintf(buf + len, " C%d=%d(%d)", cpu,
4971                                page->pobjects, page->pages);
4972        }
4973#endif
4974        return len + sprintf(buf + len, "\n");
4975}
4976SLAB_ATTR_RO(slabs_cpu_partial);
4977
4978static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
4979{
4980        return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
4981}
4982
4983static ssize_t reclaim_account_store(struct kmem_cache *s,
4984                                const char *buf, size_t length)
4985{
4986        s->flags &= ~SLAB_RECLAIM_ACCOUNT;
4987        if (buf[0] == '1')
4988                s->flags |= SLAB_RECLAIM_ACCOUNT;
4989        return length;
4990}
4991SLAB_ATTR(reclaim_account);
4992
4993static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
4994{
4995        return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
4996}
4997SLAB_ATTR_RO(hwcache_align);
4998
4999#ifdef CONFIG_ZONE_DMA
5000static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
5001{
5002        return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
5003}
5004SLAB_ATTR_RO(cache_dma);
5005#endif
5006
5007static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
5008{
5009        return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU));
5010}
5011SLAB_ATTR_RO(destroy_by_rcu);
5012
5013static ssize_t reserved_show(struct kmem_cache *s, char *buf)
5014{
5015        return sprintf(buf, "%d\n", s->reserved);
5016}
5017SLAB_ATTR_RO(reserved);
5018
5019#ifdef CONFIG_SLUB_DEBUG
5020static ssize_t slabs_show(struct kmem_cache *s, char *buf)
5021{
5022        return show_slab_objects(s, buf, SO_ALL);
5023}
5024SLAB_ATTR_RO(slabs);
5025
5026static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
5027{
5028        return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
5029}
5030SLAB_ATTR_RO(total_objects);
5031
5032static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
5033{
5034        return sprintf(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS));
5035}
5036
5037static ssize_t sanity_checks_store(struct kmem_cache *s,
5038                                const char *buf, size_t length)
5039{
5040        s->flags &= ~SLAB_CONSISTENCY_CHECKS;
5041        if (buf[0] == '1') {
5042                s->flags &= ~__CMPXCHG_DOUBLE;
5043                s->flags |= SLAB_CONSISTENCY_CHECKS;
5044        }
5045        return length;
5046}
5047SLAB_ATTR(sanity_checks);
5048
5049static ssize_t trace_show(struct kmem_cache *s, char *buf)
5050{
5051        return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
5052}
5053
5054static ssize_t trace_store(struct kmem_cache *s, const char *buf,
5055                                                        size_t length)
5056{
5057        s->flags &= ~SLAB_TRACE;
5058        if (buf[0] == '1') {
5059                s->flags &= ~__CMPXCHG_DOUBLE;
5060                s->flags |= SLAB_TRACE;
5061        }
5062        return length;
5063}
5064SLAB_ATTR(trace);
5065
5066static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
5067{
5068        return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
5069}
5070
5071static ssize_t red_zone_store(struct kmem_cache *s,
5072                                const char *buf, size_t length)
5073{
5074        if (any_slab_objects(s))
5075                return -EBUSY;
5076
5077        s->flags &= ~SLAB_RED_ZONE;
5078        if (buf[0] == '1') {
5079                s->flags |= SLAB_RED_ZONE;
5080        }
5081        calculate_sizes(s, -1);
5082        return length;
5083}
5084SLAB_ATTR(red_zone);
5085
5086static ssize_t poison_show(struct kmem_cache *s, char *buf)
5087{
5088        return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON));
5089}
5090
5091static ssize_t poison_store(struct kmem_cache *s,
5092                                const char *buf, size_t length)
5093{
5094        if (any_slab_objects(s))
5095                return -EBUSY;
5096
5097        s->flags &= ~SLAB_POISON;
5098        if (buf[0] == '1') {
5099                s->flags |= SLAB_POISON;
5100        }
5101        calculate_sizes(s, -1);
5102        return length;
5103}
5104SLAB_ATTR(poison);
5105
5106static ssize_t store_user_show(struct kmem_cache *s, char *buf)
5107{
5108        return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
5109}
5110
5111static ssize_t store_user_store(struct kmem_cache *s,
5112                                const char *buf, size_t length)
5113{
5114        if (any_slab_objects(s))
5115                return -EBUSY;
5116
5117        s->flags &= ~SLAB_STORE_USER;
5118        if (buf[0] == '1') {
5119                s->flags &= ~__CMPXCHG_DOUBLE;
5120                s->flags |= SLAB_STORE_USER;
5121        }
5122        calculate_sizes(s, -1);
5123        return length;
5124}
5125SLAB_ATTR(store_user);
5126
5127static ssize_t validate_show(struct kmem_cache *s, char *buf)
5128{
5129        return 0;
5130}
5131
5132static ssize_t validate_store(struct kmem_cache *s,
5133                        const char *buf, size_t length)
5134{
5135        int ret = -EINVAL;
5136
5137        if (buf[0] == '1') {
5138                ret = validate_slab_cache(s);
5139                if (ret >= 0)
5140                        ret = length;
5141        }
5142        return ret;
5143}
5144SLAB_ATTR(validate);
5145
5146static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
5147{
5148        if (!(s->flags & SLAB_STORE_USER))
5149                return -ENOSYS;
5150        return list_locations(s, buf, TRACK_ALLOC);
5151}
5152SLAB_ATTR_RO(alloc_calls);
5153
5154static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
5155{
5156        if (!(s->flags & SLAB_STORE_USER))
5157                return -ENOSYS;
5158        return list_locations(s, buf, TRACK_FREE);
5159}
5160SLAB_ATTR_RO(free_calls);
5161#endif /* CONFIG_SLUB_DEBUG */
5162
5163#ifdef CONFIG_FAILSLAB
5164static ssize_t failslab_show(struct kmem_cache *s, char *buf)
5165{
5166        return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
5167}
5168
5169static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
5170                                                        size_t length)
5171{
5172        s->flags &= ~SLAB_FAILSLAB;
5173        if (buf[0] == '1')
5174                s->flags |= SLAB_FAILSLAB;
5175        return length;
5176}
5177SLAB_ATTR(failslab);
5178#endif
5179
5180static ssize_t shrink_show(struct kmem_cache *s, char *buf)
5181{
5182        return 0;
5183}
5184
5185static ssize_t shrink_store(struct kmem_cache *s,
5186                        const char *buf, size_t length)
5187{
5188        if (buf[0] == '1') {
5189                int rc = kmem_cache_shrink(s);
5190
5191                if (rc)
5192                        return rc;
5193        } else
5194                return -EINVAL;
5195        return length;
5196}
5197SLAB_ATTR(shrink);
5198
5199#ifdef CONFIG_NUMA
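    /*
     * remote_node_defrag_ratio is presented to userspace as a percentage
     * (0-100) but is stored internally scaled by ten, hence the division on
     * read and the multiplication on write below.
     */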
5200static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
5201{
5202        return sprintf(buf, "%d\n", s->remote_node_defrag_ratio / 10);
5203}
5204
5205static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
5206                                const char *buf, size_t length)
5207{
5208        unsigned long ratio;
5209        int err;
5210
5211        err = strict_strtoul(buf, 10, &ratio);
5212        if (err)
5213                return err;
5214
5215        if (ratio > 100)
5216                return -ERANGE;
5217        s->remote_node_defrag_ratio = ratio * 10;
5218        return length;
5219}
5220SLAB_ATTR(remote_node_defrag_ratio);
5221#endif
5222
5223#ifdef CONFIG_SLUB_STATS
5224static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
5225{
5226        unsigned long sum  = 0;
5227        int cpu;
5228        int len;
5229        int *data = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL);
5230
5231        if (!data)
5232                return -ENOMEM;
5233
5234        for_each_online_cpu(cpu) {
5235                unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si];
5236
5237                data[cpu] = x;
5238                sum += x;
5239        }
5240
5241        len = sprintf(buf, "%lu", sum);
5242
5243#ifdef CONFIG_SMP
5244        for_each_online_cpu(cpu) {
5245                if (data[cpu] && len < PAGE_SIZE - 20)
5246                        len += sprintf(buf + len, " C%d=%u", cpu, data[cpu]);
5247        }
5248#endif
5249        kfree(data);
5250        return len + sprintf(buf + len, "\n");
5251}
5252
5253static void clear_stat(struct kmem_cache *s, enum stat_item si)
5254{
5255        int cpu;
5256
5257        for_each_online_cpu(cpu)
5258                per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0;
5259}
5260
5261#define STAT_ATTR(si, text)                                     \
5262static ssize_t text##_show(struct kmem_cache *s, char *buf)     \
5263{                                                               \
5264        return show_stat(s, buf, si);                           \
5265}                                                               \
5266static ssize_t text##_store(struct kmem_cache *s,               \
5267                                const char *buf, size_t length) \
5268{                                                               \
5269        if (buf[0] != '0')                                      \
5270                return -EINVAL;                                 \
5271        clear_stat(s, si);                                      \
5272        return length;                                          \
5273}                                                               \
5274SLAB_ATTR(text);                                                \
5275
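    /*
     * Each STAT_ATTR() use below generates a pair of handlers: reading the
     * file reports the summed counter plus a per-cpu breakdown via
     * show_stat(), and writing '0' clears the counter on every cpu via
     * clear_stat().
     */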
5276STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
5277STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
5278STAT_ATTR(FREE_FASTPATH, free_fastpath);
5279STAT_ATTR(FREE_SLOWPATH, free_slowpath);
5280STAT_ATTR(FREE_FROZEN, free_frozen);
5281STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial);
5282STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
5283STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial);
5284STAT_ATTR(ALLOC_SLAB, alloc_slab);
5285STAT_ATTR(ALLOC_REFILL, alloc_refill);
5286STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch);
5287STAT_ATTR(FREE_SLAB, free_slab);
5288STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush);
5289STAT_ATTR(DEACTIVATE_FULL, deactivate_full);
5290STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
5291STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
5292STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
5293STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
5294STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass);
5295STAT_ATTR(ORDER_FALLBACK, order_fallback);
5296STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);
5297STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
5298STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
5299STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
5300STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
5301STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
5302#endif
5303
5304static struct attribute *slab_attrs[] = {
5305        &slab_size_attr.attr,
5306        &object_size_attr.attr,
5307        &objs_per_slab_attr.attr,
5308        &order_attr.attr,
5309        &min_partial_attr.attr,
5310        &cpu_partial_attr.attr,
5311        &objects_attr.attr,
5312        &objects_partial_attr.attr,
5313        &partial_attr.attr,
5314        &cpu_slabs_attr.attr,
5315        &ctor_attr.attr,
5316        &aliases_attr.attr,
5317        &align_attr.attr,
5318        &hwcache_align_attr.attr,
5319        &reclaim_account_attr.attr,
5320        &destroy_by_rcu_attr.attr,
5321        &shrink_attr.attr,
5322        &reserved_attr.attr,
5323        &slabs_cpu_partial_attr.attr,
5324#ifdef CONFIG_SLUB_DEBUG
5325        &total_objects_attr.attr,
5326        &slabs_attr.attr,
5327        &sanity_checks_attr.attr,
5328        &trace_attr.attr,
5329        &red_zone_attr.attr,
5330        &poison_attr.attr,
5331        &store_user_attr.attr,
5332        &validate_attr.attr,
5333        &alloc_calls_attr.attr,
5334        &free_calls_attr.attr,
5335#endif
5336#ifdef CONFIG_ZONE_DMA
5337        &cache_dma_attr.attr,
5338#endif
5339#ifdef CONFIG_NUMA
5340        &remote_node_defrag_ratio_attr.attr,
5341#endif
5342#ifdef CONFIG_SLUB_STATS
5343        &alloc_fastpath_attr.attr,
5344        &alloc_slowpath_attr.attr,
5345        &free_fastpath_attr.attr,
5346        &free_slowpath_attr.attr,
5347        &free_frozen_attr.attr,
5348        &free_add_partial_attr.attr,
5349        &free_remove_partial_attr.attr,
5350        &alloc_from_partial_attr.attr,
5351        &alloc_slab_attr.attr,
5352        &alloc_refill_attr.attr,
5353        &alloc_node_mismatch_attr.attr,
5354        &free_slab_attr.attr,
5355        &cpuslab_flush_attr.attr,
5356        &deactivate_full_attr.attr,
5357        &deactivate_empty_attr.attr,
5358        &deactivate_to_head_attr.attr,
5359        &deactivate_to_tail_attr.attr,
5360        &deactivate_remote_frees_attr.attr,
5361        &deactivate_bypass_attr.attr,
5362        &order_fallback_attr.attr,
5363        &cmpxchg_double_fail_attr.attr,
5364        &cmpxchg_double_cpu_fail_attr.attr,
5365        &cpu_partial_alloc_attr.attr,
5366        &cpu_partial_free_attr.attr,
5367        &cpu_partial_node_attr.attr,
5368        &cpu_partial_drain_attr.attr,
5369#endif
5370#ifdef CONFIG_FAILSLAB
5371        &failslab_attr.attr,
5372#endif
5373
5374        NULL
5375};
5376
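    /*
     * sysfs_create_group() in sysfs_slab_add() attaches this group to each
     * cache's kobject, so every file under /sys/kernel/slab/<cache>/
     * corresponds to one entry in slab_attrs[].
     */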
5377static struct attribute_group slab_attr_group = {
5378        .attrs = slab_attrs,
5379};
5380
5381static ssize_t slab_attr_show(struct kobject *kobj,
5382                                struct attribute *attr,
5383                                char *buf)
5384{
5385        struct slab_attribute *attribute;
5386        struct kmem_cache *s;
5387        int err;
5388
5389        attribute = to_slab_attr(attr);
5390        s = to_slab(kobj);
5391
5392        if (!attribute->show)
5393                return -EIO;
5394
5395        err = attribute->show(s, buf);
5396
5397        return err;
5398}
5399
5400static ssize_t slab_attr_store(struct kobject *kobj,
5401                                struct attribute *attr,
5402                                const char *buf, size_t len)
5403{
5404        struct slab_attribute *attribute;
5405        struct kmem_cache *s;
5406        int err;
5407
5408        attribute = to_slab_attr(attr);
5409        s = to_slab(kobj);
5410
5411        if (!attribute->store)
5412                return -EIO;
5413
5414        err = attribute->store(s, buf, len);
5415#ifdef CONFIG_MEMCG_KMEM
5416        if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
5417                int i;
5418
5419                mutex_lock(&slab_mutex);
5420                if (s->max_attr_size < len)
5421                        s->max_attr_size = len;
5422
5423                /*
5424                 * This is a best-effort propagation, so this function's
5425                 * return value is determined by the parent cache only. This
5426                 * is because not all attributes have well defined semantics
5427                 * for rollbacks - most of the actions have permanent
5428                 * effects.
5429                 *
5430                 * Returning the error value of a child cache that fails is
5431                 * not well defined either: a user seeing that error code
5432                 * would not be able to tell anything about the resulting
5433                 * state of the cache.
5434                 *
5435                 * Only returning the error code for the parent cache has
5436                 * well defined semantics: the cache written to directly
5437                 * either failed or succeeded, and either way we then loop
5438                 * through the descendants with best-effort propagation.
5439                 */
5440                for_each_memcg_cache_index(i) {
5441                        struct kmem_cache *c = cache_from_memcg_idx(s, i);
5442                        if (c)
5443                                attribute->store(c, buf, len);
5444                }
5445                mutex_unlock(&slab_mutex);
5446        }
5447#endif
5448        return err;
5449}
5450
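    /*
     * When a memcg child cache is created after boot, replay the sysfs
     * attributes that were explicitly written to its root cache so that the
     * child starts out with the same tuning.
     */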
5451static void memcg_propagate_slab_attrs(struct kmem_cache *s)
5452{
5453#ifdef CONFIG_MEMCG_KMEM
5454        int i;
5455        char *buffer = NULL;
5456
5457        if (is_root_cache(s))
5458                return;
5459
5460        /*
5461         * A zero max_attr_size on the root cache means no attribute was
5462         * ever written to it, so there is no point in copying defaults.
5463         */
5464        if (!s->memcg_params->root_cache->max_attr_size)
5465                return;
5466
5467        for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) {
5468                char mbuf[64];
5469                char *buf;
5470                struct slab_attribute *attr = to_slab_attr(slab_attrs[i]);
5471
5472                if (!attr || !attr->store || !attr->show)
5473                        continue;
5474
5475                /*
5476                 * It is really bad that we have to allocate here, so we will
5477                 * do it only as a fallback. If we actually allocate, though,
5478                 * we can just use the allocated buffer until the end.
5479                 *
5480                 * Most of the slub attributes will tend to be very small in
5481                 * size, but sysfs allows buffers up to a page, so larger
5482                 * values can theoretically occur.
5483                 */
5484                if (buffer)
5485                        buf = buffer;
5486                else if (s->memcg_params->root_cache->max_attr_size < ARRAY_SIZE(mbuf))
5487                        buf = mbuf;
5488                else {
5489                        buffer = (char *) get_zeroed_page(GFP_KERNEL);
5490                        if (WARN_ON(!buffer))
5491                                continue;
5492                        buf = buffer;
5493                }
5494
5495                attr->show(s->memcg_params->root_cache, buf);
5496                attr->store(s, buf, strlen(buf));
5497        }
5498
5499        if (buffer)
5500                free_page((unsigned long)buffer);
5501#endif
5502}
5503
5504static const struct sysfs_ops slab_sysfs_ops = {
5505        .show = slab_attr_show,
5506        .store = slab_attr_store,
5507};
5508
5509static struct kobj_type slab_ktype = {
5510        .sysfs_ops = &slab_sysfs_ops,
5511};
5512
5513static int uevent_filter(struct kset *kset, struct kobject *kobj)
5514{
5515        struct kobj_type *ktype = get_ktype(kobj);
5516
5517        if (ktype == &slab_ktype)
5518                return 1;
5519        return 0;
5520}
5521
5522static const struct kset_uevent_ops slab_uevent_ops = {
5523        .filter = uevent_filter,
5524};
5525
5526static struct kset *slab_kset;
5527
5528#define ID_STR_LENGTH 64
5529
5530/* Create a unique string id for a slab cache:
5531 *
5532 * Format       :[flags-]size
5533 */
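    /*
     * For example (illustrative sizes): a plain mergeable cache of object
     * size 192 would get the id ":t-0000192", while a DMA cache with
     * consistency checks enabled would get something like ":dFt-0000192".
     */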
5534static char *create_unique_id(struct kmem_cache *s)
5535{
5536        char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
5537        char *p = name;
5538
5539        BUG_ON(!name);
5540
5541        *p++ = ':';
5542        /*
5543         * First flags affecting slabcache operations. We will only
5544         * get here for aliasable slabs so we do not need to support
5545         * too many flags. The flags here must cover all flags that
5546         * are matched during merging to guarantee that the id is
5547         * unique.
5548         */
5549        if (s->flags & SLAB_CACHE_DMA)
5550                *p++ = 'd';
5551        if (s->flags & SLAB_RECLAIM_ACCOUNT)
5552                *p++ = 'a';
5553        if (s->flags & SLAB_CONSISTENCY_CHECKS)
5554                *p++ = 'F';
5555        if (!(s->flags & SLAB_NOTRACK))
5556                *p++ = 't';
5557        if (s->flags & SLAB_ACCOUNT)
5558                *p++ = 'A';
5559        if (p != name + 1)
5560                *p++ = '-';
5561        p += sprintf(p, "%07d", s->size);
5562
5563#ifdef CONFIG_MEMCG_KMEM
5564        if (!is_root_cache(s))
5565                p += sprintf(p, "-%08d", memcg_cache_id(s->memcg_params->memcg));
5566#endif
5567
5568        BUG_ON(p > name + ID_STR_LENGTH - 1);
5569        return name;
5570}
5571
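    /*
     * Register @s under /sys/kernel/slab. Unmergeable caches keep their own
     * name; mergeable caches are registered under the unique id created
     * above and additionally get a symlink bearing the cache name via
     * sysfs_slab_alias().
     */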
5572static int sysfs_slab_add(struct kmem_cache *s)
5573{
5574        int err;
5575        const char *name;
5576        int unmergeable = slab_unmergeable(s);
5577
5578        if (unmergeable) {
5579                /*
5580                 * Slabcache can never be merged so we can use the name proper.
5581                 * This is typically the case for debug situations. In that
5582                 * case we can catch duplicate names easily.
5583                 */
5584                sysfs_remove_link(&slab_kset->kobj, s->name);
5585                name = s->name;
5586        } else {
5587                /*
5588                 * Create a unique name for the slab as a target
5589                 * for the symlinks.
5590                 */
5591                name = create_unique_id(s);
5592        }
5593
5594        s->kobj.kset = slab_kset;
5595        err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, name);
5596        if (err) {
5597                kobject_put(&s->kobj);
5598                return err;
5599        }
5600
5601        err = sysfs_create_group(&s->kobj, &slab_attr_group);
5602        if (err) {
5603                kobject_del(&s->kobj);
5604                kobject_put(&s->kobj);
5605                return err;
5606        }
5607        kobject_uevent(&s->kobj, KOBJ_ADD);
5608        if (!unmergeable) {
5609                /* Setup first alias */
5610                sysfs_slab_alias(s, s->name);
5611                kfree(name);
5612        }
5613        return 0;
5614}
5615
5616static void sysfs_slab_remove(struct kmem_cache *s)
5617{
5618        if (slab_state < FULL)
5619                /*
5620                 * Sysfs has not been setup yet so no need to remove the
5621                 * cache from sysfs.
5622                 */
5623                return;
5624
5625        kobject_uevent(&s->kobj, KOBJ_REMOVE);
5626        kobject_del(&s->kobj);
5627        kobject_put(&s->kobj);
5628}
5629
5630/*
5631 * Need to buffer aliases during bootup until sysfs becomes
5632 * available lest we lose that information.
5633 */
5634struct saved_alias {
5635        struct kmem_cache *s;
5636        const char *name;
5637        struct saved_alias *next;
5638};
5639
5640static struct saved_alias *alias_list;
5641
5642static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
5643{
5644        struct saved_alias *al;
5645
5646        if (slab_state == FULL) {
5647                /*
5648                 * If we have a leftover link then remove it.
5649                 */
5650                sysfs_remove_link(&slab_kset->kobj, name);
5651                return sysfs_create_link(&slab_kset->kobj, &s->kobj, name);
5652        }
5653
5654        al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
5655        if (!al)
5656                return -ENOMEM;
5657
5658        al->s = s;
5659        al->name = name;
5660        al->next = alias_list;
5661        alias_list = al;
5662        return 0;
5663}
5664
5665static int __init slab_sysfs_init(void)
5666{
5667        struct kmem_cache *s;
5668        int err;
5669
5670        mutex_lock(&slab_mutex);
5671
5672        slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
5673        if (!slab_kset) {
5674                mutex_unlock(&slab_mutex);
5675                printk(KERN_ERR "Cannot register slab subsystem.\n");
5676                return -ENOSYS;
5677        }
5678
5679        slab_state = FULL;
5680
5681        list_for_each_entry(s, &slab_caches, list) {
5682                err = sysfs_slab_add(s);
5683                if (err)
5684                        printk(KERN_ERR "SLUB: Unable to add boot slab %s"
5685                                                " to sysfs\n", s->name);
5686        }
5687
5688        while (alias_list) {
5689                struct saved_alias *al = alias_list;
5690
5691                alias_list = alias_list->next;
5692                err = sysfs_slab_alias(al->s, al->name);
5693                if (err)
5694                        printk(KERN_ERR "SLUB: Unable to add boot slab alias"
5695                                        " %s to sysfs\n", al->name);
5696                kfree(al);
5697        }
5698
5699        mutex_unlock(&slab_mutex);
5700        resiliency_test();
5701        return 0;
5702}
5703
5704__initcall(slab_sysfs_init);
5705#endif /* CONFIG_SYSFS */
5706
5707/*
5708 * The /proc/slabinfo ABI
5709 */
5710#ifdef CONFIG_SLABINFO
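    /*
     * Fill in the fields reported through /proc/slabinfo. Slab and object
     * totals come from the per-node counters; the free count is obtained by
     * walking each node's partial list, so active_objs excludes objects
     * sitting free on partial slabs.
     */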
5711void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
5712{
5713        unsigned long nr_partials = 0;
5714        unsigned long nr_slabs = 0;
5715        unsigned long nr_objs = 0;
5716        unsigned long nr_free = 0;
5717        int node;
5718
5719        for_each_online_node(node) {
5720                struct kmem_cache_node *n = get_node(s, node);
5721
5722                if (!n)
5723                        continue;
5724
5725                nr_partials += n->nr_partial;
5726                nr_slabs += atomic_long_read(&n->nr_slabs);
5727                nr_objs += atomic_long_read(&n->total_objects);
5728                nr_free += count_partial(n, count_free);
5729        }
5730
5731        sinfo->active_objs = nr_objs - nr_free;
5732        sinfo->num_objs = nr_objs;
5733        sinfo->active_slabs = nr_slabs;
5734        sinfo->num_slabs = nr_slabs;
5735        sinfo->objects_per_slab = oo_objects(s->oo);
5736        sinfo->cache_order = oo_order(s->oo);
5737}
5738
5739void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s)
5740{
5741}
5742
5743ssize_t slabinfo_write(struct file *file, const char __user *buffer,
5744                       size_t count, loff_t *ppos)
5745{
5746        return -EIO;
5747}
5748#endif /* CONFIG_SLABINFO */
5749